Example #1
    def __init__(self, app, dispatcher):
        """Start the job manager"""
        self.app = app
        self.dispatcher = dispatcher

        self.sa_session = app.model.context
        self.track_jobs_in_database = self.app.config.track_jobs_in_database

        # Initialize structures for handling job limits
        self.__clear_user_job_count()

        # Keep track of the pid that started the job manager, only it
        # has valid threads
        self.parent_pid = os.getpid()
        # Contains new jobs. Note this is not used if track_jobs_in_database is True
        self.queue = Queue()
        # Contains jobs that are waiting (only use from monitor thread)
        self.waiting_jobs = []
        # Contains wrappers of jobs that are limited or ready (so they aren't created unnecessarily/multiple times)
        self.job_wrappers = {}
        # Helper for interruptable sleep
        self.sleeper = Sleeper()
        self.running = True
        self.monitor_thread = threading.Thread(
            name="JobHandlerQueue.monitor_thread", target=self.__monitor)
        self.monitor_thread.setDaemon(True)
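
The snippets on this page all depend on a Sleeper helper described only by the comment above ("Helper for interruptable sleep"). The class itself is not shown here; the following is a minimal sketch inferred from how the examples use it (sleep(seconds) called from the monitor thread, wake() called from shutdown()), so the real Galaxy implementation may differ in detail:

import threading

class Sleeper(object):
    """Sleep for up to `seconds`, but return early if another thread
    calls wake(). A condition variable is used so that a wake() issued
    during a sleep() interrupts it instead of being lost."""
    def __init__(self):
        self.condition = threading.Condition()

    def sleep(self, seconds):
        with self.condition:
            self.condition.wait(seconds)

    def wake(self):
        with self.condition:
            self.condition.notify()
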
Example #2
    def __init__( self, app, dispatcher ):
        """Start the job manager"""
        self.app = app
        self.dispatcher = dispatcher

        self.sa_session = app.model.context
        self.track_jobs_in_database = self.app.config.track_jobs_in_database

        # Initialize structures for handling job limits
        self.__clear_user_job_count()

        # Keep track of the pid that started the job manager, only it
        # has valid threads
        self.parent_pid = os.getpid()
        # Contains new jobs. Note this is not used if track_jobs_in_database is True
        self.queue = Queue()
        # Contains jobs that are waiting (only use from monitor thread)
        self.waiting_jobs = []
        # Contains wrappers of jobs that are limited or ready (so they aren't created unnecessarily/multiple times)
        self.job_wrappers = {}
        # Helper for interruptable sleep
        self.sleeper = Sleeper()
        self.running = True
        self.monitor_thread = threading.Thread( name="JobHandlerQueue.monitor_thread", target=self.__monitor )
        self.monitor_thread.setDaemon( True )
Example #3
 def __init__(self, config):
     assert sys.version_info >= (2, 6), 'S3 Object Store support requires Python >= 2.6'
     super(S3ObjectStore, self).__init__()
     self.config = config
     self.staging_path = self.config.file_path
     self.s3_conn = get_OS_connection(self.config)
     self.bucket = self._get_bucket(self.config.os_bucket_name)
     self.use_rr = self.config.os_use_reduced_redundancy
     self.cache_size = self.config.object_store_cache_size
     self.transfer_progress = 0
     # Clean cache only if value is set in universe_wsgi.ini
     if self.cache_size != -1:
         # Convert GBs to bytes for comparison
         self.cache_size = self.cache_size * 1073741824
         # Helper for interruptable sleep
         self.sleeper = Sleeper()
         self.cache_monitor_thread = threading.Thread(target=self.__cache_monitor)
         self.cache_monitor_thread.start()
         log.info("Cache cleaner manager started")
     # Test if 'axel' is available for parallel download and pull the key into cache
     try:
         subprocess.call('axel')
         self.use_axel = True
     except OSError:
         self.use_axel = False
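
The try/except OSError probe at the end of this example is a general way to detect whether an external binary is installed: subprocess raises OSError when the executable cannot be found on PATH. A standalone sketch of the same technique (the devnull redirection is an addition here, to keep the probe quiet; the original simply lets axel write to the console):

import os
import subprocess

def tool_available(executable):
    """Return True if `executable` can be launched, False if subprocess
    raises OSError because it is missing from PATH."""
    try:
        with open(os.devnull, 'w') as devnull:
            subprocess.call([executable], stdout=devnull, stderr=devnull)
        return True
    except OSError:
        return False
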
Example #4
    def __init__(self, app, job_handler):
        self.app = app
        self.job_handler = job_handler  # the (singular) handler if we are passing jobs in memory

        self.sa_session = app.model.context
        self.job_lock = False
        # Keep track of the pid that started the job manager, only it
        # has valid threads
        self.parent_pid = os.getpid()
        # Contains new jobs. Note this is not used if track_jobs_in_database is True
        self.queue = Queue()
        # Helper for interruptable sleep
        self.sleeper = Sleeper()
        self.running = True
        self.monitor_thread = threading.Thread(target=self.__monitor)
        # Recover jobs at startup
        self.__check_jobs_at_startup()
        # Start the queue
        self.monitor_thread.start()
        log.info("job manager queue started")
Example #5
    def __init__(self, app, job_handler):
        self.app = app
        self.job_handler = job_handler

        self.sa_session = app.model.context

        # Keep track of the pid that started the job manager, only it
        # has valid threads
        self.parent_pid = os.getpid()
        # Contains new jobs. Note this is not used if track_jobs_in_database is True
        self.queue = Queue()

        # Contains jobs that are waiting (only use from monitor thread)
        self.waiting = []

        # Helper for interruptable sleep
        self.sleeper = Sleeper()
        self.running = True
        self.monitor_thread = threading.Thread(target=self.monitor)
        self.monitor_thread.start()
        log.info("job manager stop queue started")
Example #6
    def __init__(self, app, dispatcher):
        """Start the job manager"""
        self.app = app
        self.dispatcher = dispatcher

        self.sa_session = app.model.context
        self.track_jobs_in_database = self.app.config.track_jobs_in_database

        # Keep track of the pid that started the job manager, only it
        # has valid threads
        self.parent_pid = os.getpid()
        # Contains new jobs. Note this is not used if track_jobs_in_database is True
        self.queue = Queue()
        # Contains jobs that are waiting (only use from monitor thread)
        self.waiting_jobs = []
        # Helper for interruptable sleep
        self.sleeper = Sleeper()
        self.running = True
        self.monitor_thread = threading.Thread(
            name="JobHandlerQueue.monitor_thread", target=self.__monitor)
        self.monitor_thread.setDaemon(True)
Example #7
    def __init__(self, config, fsmon=False):
        super(DistributedObjectStore, self).__init__()
        self.distributed_config = config.distributed_object_store_config_file
        assert self.distributed_config is not None, "distributed object store ('object_store = distributed') " \
                                                    "requires a config file, please set one in " \
                                                    "'distributed_object_store_config_file')"
        self.backends = {}
        self.weighted_backend_ids = []
        self.original_weighted_backend_ids = []
        self.max_percent_full = {}
        self.global_max_percent_full = 0.0

        random.seed()

        self.__parse_distributed_config(config)

        self.sleeper = None
        if fsmon and ( self.global_max_percent_full or filter( lambda x: x != 0.0, self.max_percent_full.values() ) ):
            self.sleeper = Sleeper()
            self.filesystem_monitor_thread = threading.Thread(target=self.__filesystem_monitor)
            self.filesystem_monitor_thread.setDaemon( True )
            self.filesystem_monitor_thread.start()
            log.info("Filesystem space monitor started")
Example #8
 def __init__(self, config):
     super(S3ObjectStore, self).__init__()
     self.config = config
     self.staging_path = self.config.file_path
     self.s3_conn = get_OS_connection(self.config)
     self.bucket = self._get_bucket(self.config.os_bucket_name)
     self.use_rr = self.config.os_use_reduced_redundancy
     self.cache_size = self.config.object_store_cache_size
     self.transfer_progress = 0
     # Clean cache only if value is set in universe_wsgi.ini
     if self.cache_size != -1:
         # Convert GBs to bytes for comparison
         self.cache_size = self.cache_size * 1073741824
         # Helper for interruptable sleep
         self.sleeper = Sleeper()
         self.cache_monitor_thread = threading.Thread(target=self.__cache_monitor)
         self.cache_monitor_thread.start()
         log.info("Cache cleaner manager started")
     # Test if 'axel' is available for parallel download and pull the key into cache
     try:
         subprocess.call('axel')
         self.use_axel = True
     except OSError:
         self.use_axel = False
Example #9
 def __init__(self, config):
     assert sys.version_info >= (2, 6), 'S3 Object Store support requires Python >= 2.6'
     super(S3ObjectStore, self).__init__()
     self.config = config
     self.staging_path = self.config.file_path
     self.s3_conn = S3Connection()
     self.bucket = self._get_bucket(self.config.s3_bucket)
     self.use_rr = self.config.use_reduced_redundancy
     self.cache_size = self.config.object_store_cache_size * 1073741824 # Convert GBs to bytes
     self.transfer_progress = 0
     # Clean cache only if value is set in universe_wsgi.ini
     if self.cache_size != -1:
         # Helper for interruptable sleep
         self.sleeper = Sleeper()
         self.cache_monitor_thread = threading.Thread(target=self.__cache_monitor)
         self.cache_monitor_thread.start()
         log.info("Cache cleaner manager started")
Example #10
    def __init__( self, app, dispatcher ):
        """Start the job manager"""
        self.app = app
        self.dispatcher = dispatcher

        self.sa_session = app.model.context
        self.track_jobs_in_database = self.app.config.track_jobs_in_database

        # Keep track of the pid that started the job manager, only it
        # has valid threads
        self.parent_pid = os.getpid()
        # Contains new jobs. Note this is not used if track_jobs_in_database is True
        self.queue = Queue()
        # Contains jobs that are waiting (only use from monitor thread)
        self.waiting_jobs = []
        # Helper for interruptable sleep
        self.sleeper = Sleeper()
        self.running = True
        self.monitor_thread = threading.Thread( target=self.__monitor )
Example #11
    def __init__( self, app, job_handler ):
        self.app = app
        self.job_handler = job_handler # the (singular) handler if we are passing jobs in memory

        self.sa_session = app.model.context
        self.job_lock = False
        # Keep track of the pid that started the job manager, only it
        # has valid threads
        self.parent_pid = os.getpid()
        # Contains new jobs. Note this is not used if track_jobs_in_database is True
        self.queue = Queue()
        # Helper for interruptable sleep
        self.sleeper = Sleeper()
        self.running = True
        self.monitor_thread = threading.Thread( target=self.__monitor )
        # Recover jobs at startup
        self.__check_jobs_at_startup()
        # Start the queue
        self.monitor_thread.start()
        log.info( "job manager queue started" )
Example #12
    def __init__( self, app, job_handler ):
        self.app = app
        self.job_handler = job_handler

        self.sa_session = app.model.context

        # Keep track of the pid that started the job manager, only it
        # has valid threads
        self.parent_pid = os.getpid()
        # Contains new jobs. Note this is not used if track_jobs_in_database is True
        self.queue = Queue()

        # Contains jobs that are waiting (only use from monitor thread)
        self.waiting = []

        # Helper for interruptable sleep
        self.sleeper = Sleeper()
        self.running = True
        self.monitor_thread = threading.Thread( target=self.monitor )
        self.monitor_thread.start()
        log.info( "job manager stop queue started" )
Example #13
    def __init__(self, config):
        super(DistributedObjectStore, self).__init__()
        self.distributed_config = config.distributed_object_store_config_file
        assert self.distributed_config is not None, "distributed object store ('object_store = distributed') " \
                                                    "requires a config file, please set one in " \
                                                    "'distributed_object_store_config_file')"
        self.backends = {}
        self.weighted_backend_ids = []
        self.original_weighted_backend_ids = []
        self.max_percent_full = {}
        self.global_max_percent_full = 0.0

        random.seed()

        self.__parse_distributed_config(config)

        if self.global_max_percent_full or filter(lambda x: x is not None, self.max_percent_full.values()):
            self.sleeper = Sleeper()
            self.filesystem_monitor_thread = threading.Thread(target=self.__filesystem_monitor)
            self.filesystem_monitor_thread.start()
            log.info("Filesystem space monitor started")
Example #14
class JobHandlerQueue(object):
    """
    Job manager, waits for jobs to be runnable and then dispatches to
    a JobRunner.
    """
    STOP_SIGNAL = object()

    def __init__(self, app, dispatcher):
        """Start the job manager"""
        self.app = app
        self.dispatcher = dispatcher

        self.sa_session = app.model.context
        self.track_jobs_in_database = self.app.config.track_jobs_in_database

        # Initialize structures for handling job limits
        self.__clear_user_job_count()

        # Keep track of the pid that started the job manager, only it
        # has valid threads
        self.parent_pid = os.getpid()
        # Contains new jobs. Note this is not used if track_jobs_in_database is True
        self.queue = Queue()
        # Contains jobs that are waiting (only use from monitor thread)
        self.waiting_jobs = []
        # Contains wrappers of jobs that are limited or ready (so they aren't created unnecessarily/multiple times)
        self.job_wrappers = {}
        # Helper for interruptable sleep
        self.sleeper = Sleeper()
        self.running = True
        self.monitor_thread = threading.Thread(
            name="JobHandlerQueue.monitor_thread", target=self.__monitor)
        self.monitor_thread.setDaemon(True)

    def start(self):
        """
        The JobManager should start, and then start its Handler, if it has one.
        """
        # Recover jobs at startup
        self.__check_jobs_at_startup()
        # Start the queue
        self.monitor_thread.start()
        log.info("job handler queue started")

    def __check_jobs_at_startup(self):
        """
        Checks all jobs that are in the 'new', 'queued' or 'running' state in
        the database and requeues or cleans up as necessary.  Only run as the
        job handler starts.
        """
        for job in self.sa_session.query( model.Job ).enable_eagerloads( False ) \
                                  .filter( ( ( model.Job.state == model.Job.states.NEW ) \
                                             | ( model.Job.state == model.Job.states.RUNNING ) \
                                             | ( model.Job.state == model.Job.states.QUEUED ) ) \
                                           & ( model.Job.handler == self.app.config.server_name ) ):
            if job.tool_id not in self.app.toolbox.tools_by_id:
                log.warning(
                    "(%s) Tool '%s' removed from tool config, unable to recover job"
                    % (job.id, job.tool_id))
                JobWrapper(job, self).fail(
                    'This tool was disabled before the job completed.  Please contact your Galaxy administrator.'
                )
            if job.job_runner_name is not None and job.job_runner_external_id is None:
                # This could happen during certain revisions of Galaxy where a runner URL was persisted before the job was dispatched to a runner.
                log.debug(
                    "(%s) Job runner assigned but no external ID recorded, adding to the job handler queue"
                    % job.id)
                job.job_runner_name = None
                if self.track_jobs_in_database:
                    job.state = model.Job.states.NEW
                else:
                    self.queue.put((job.id, job.tool_id))
            elif job.job_runner_name is not None and job.job_runner_external_id is not None and job.destination_id is None:
                # This is the first start after upgrading from URLs to destinations, convert the URL to a destination and persist
                # TODO: test me extensively
                job_wrapper = JobWrapper(job, self)
                job_destination = self.dispatcher.url_to_destination(
                    job.job_runner_name)
                if job_destination.id is None:
                    job_destination.id = 'legacy_url'
                job_wrapper.set_job_destination(job_destination,
                                                job.job_runner_external_id)
                self.dispatcher.recover(job, job_wrapper)
                log.info(
                    '(%s) Converted job from a URL to a destination and recovered'
                    % (job.id))
            elif job.job_runner_name is None:
                # Never (fully) dispatched
                log.debug(
                    "(%s) No job runner assigned and job still in '%s' state, adding to the job handler queue"
                    % (job.id, job.state))
                if self.track_jobs_in_database:
                    job.state = model.Job.states.NEW
                else:
                    self.queue.put((job.id, job.tool_id))
            else:
                # Already dispatched and running
                job_wrapper = JobWrapper(job, self)
                job_wrapper.job_runner_mapper.cached_job_destination = JobDestination(
                    id=job.destination_id,
                    runner=job.job_runner_name,
                    params=job.destination_params)
                self.dispatcher.recover(job, job_wrapper)
        if self.sa_session.dirty:
            self.sa_session.flush()

    def __monitor(self):
        """
        Continually iterate the waiting jobs, checking if each is ready to
        run and dispatching if so.
        """
        while self.running:
            try:
                self.__monitor_step()
            except:
                log.exception("Exception in monitor_step")
            # Sleep
            self.sleeper.sleep(1)

    def __monitor_step(self):
        """
        Called repeatedly by `monitor` to process waiting jobs. Gets any new
        jobs (either from the database or from its own queue), then iterates
        over all new and waiting jobs to check the state of the jobs each
        depends on. If the job has dependencies that have not finished, it
        goes to the waiting queue. If the job has dependencies with errors,
        it is marked as having errors and removed from the queue. Otherwise,
        the job is dispatched.
        """
        # Pull all new jobs from the queue at once
        jobs_to_check = []
        if self.track_jobs_in_database:
            # Clear the session so we get fresh states for job and all datasets
            self.sa_session.expunge_all()
            # Fetch all new jobs
            hda_not_ready = self.sa_session.query(model.Job.id).enable_eagerloads(False) \
                    .join(model.JobToInputDatasetAssociation) \
                    .join(model.HistoryDatasetAssociation) \
                    .join(model.Dataset) \
                    .filter(and_((model.Job.state == model.Job.states.NEW),
                                 or_((model.HistoryDatasetAssociation._state == model.HistoryDatasetAssociation.states.FAILED_METADATA),
                                     (model.HistoryDatasetAssociation.deleted == True ),
                                     (model.Dataset.state != model.Dataset.states.OK ),
                                     (model.Dataset.deleted == True)))).subquery()
            ldda_not_ready = self.sa_session.query(model.Job.id).enable_eagerloads(False) \
                    .join(model.JobToInputLibraryDatasetAssociation) \
                    .join(model.LibraryDatasetDatasetAssociation) \
                    .join(model.Dataset) \
                    .filter(and_((model.Job.state == model.Job.states.NEW),
                                 or_((model.LibraryDatasetDatasetAssociation._state != None),
                                     (model.LibraryDatasetDatasetAssociation.deleted == True),
                                     (model.Dataset.state != model.Dataset.states.OK),
                                     (model.Dataset.deleted == True)))).subquery()
            jobs_to_check = self.sa_session.query(model.Job).enable_eagerloads(False) \
                    .filter(and_((model.Job.state == model.Job.states.NEW),
                                 (model.Job.handler == self.app.config.server_name),
                                 ~model.Job.table.c.id.in_(hda_not_ready),
                                 ~model.Job.table.c.id.in_(ldda_not_ready))) \
                    .order_by(model.Job.id).all()
        else:
            # Get job objects and append to watch queue for any which were
            # previously waiting
            for job_id in self.waiting_jobs:
                jobs_to_check.append(
                    self.sa_session.query(model.Job).get(job_id))
            try:
                while 1:
                    message = self.queue.get_nowait()
                    if message is self.STOP_SIGNAL:
                        return
                    # Unpack the message
                    job_id, tool_id = message
                    # Get the job object and append to watch queue
                    jobs_to_check.append(
                        self.sa_session.query(model.Job).get(job_id))
            except Empty:
                pass
        # Ensure that we get new job counts on each iteration
        self.__clear_user_job_count()
        # Iterate over new and waiting jobs and look for any that are
        # ready to run
        new_waiting_jobs = []
        for job in jobs_to_check:
            try:
                # Check the job's dependencies, requeue if they're not done.
                # Some of these states will only happen when using the in-memory job queue
                job_state = self.__check_if_ready_to_run(job)
                if job_state == JOB_WAIT:
                    new_waiting_jobs.append(job.id)
                elif job_state == JOB_INPUT_ERROR:
                    log.info(
                        "(%d) Job unable to run: one or more inputs in error state"
                        % job.id)
                elif job_state == JOB_INPUT_DELETED:
                    log.info(
                        "(%d) Job unable to run: one or more inputs deleted" %
                        job.id)
                elif job_state == JOB_READY:
                    self.dispatcher.put(self.job_wrappers.pop(job.id))
                    log.info("(%d) Job dispatched" % job.id)
                elif job_state == JOB_DELETED:
                    log.info("(%d) Job deleted by user while still queued" %
                             job.id)
                elif job_state == JOB_ADMIN_DELETED:
                    log.info("(%d) Job deleted by admin while still queued" %
                             job.id)
                elif job_state == JOB_USER_OVER_QUOTA:
                    log.info("(%d) User (%s) is over quota: job paused" %
                             (job.id, job.user_id))
                    job.state = model.Job.states.PAUSED
                    for dataset_assoc in job.output_datasets + job.output_library_datasets:
                        dataset_assoc.dataset.dataset.state = model.Dataset.states.PAUSED
                        dataset_assoc.dataset.info = "Execution of this dataset's job is paused because you were over your disk quota at the time it was ready to run"
                        self.sa_session.add(dataset_assoc.dataset.dataset)
                    self.sa_session.add(job)
                elif job_state == JOB_ERROR:
                    log.error("(%d) Error checking job readiness" % job.id)
                else:
                    log.error("(%d) Job in unknown state '%s'" %
                              (job.id, job_state))
                    new_waiting_jobs.append(job.id)
            except Exception:
                log.exception("failure running job %d" % job.id)
        # Update the waiting list
        if not self.track_jobs_in_database:
            self.waiting_jobs = new_waiting_jobs
        # Remove cached wrappers for any jobs that are no longer being tracked
        for id in self.job_wrappers.keys():
            if id not in new_waiting_jobs:
                del self.job_wrappers[id]
        # Flush, if we updated the state
        self.sa_session.flush()
        # Done with the session
        self.sa_session.remove()

    def __check_if_ready_to_run(self, job):
        """
        Check if a job is ready to run by verifying that each of its input
        datasets is ready (specifically in the OK state). If any input dataset
        has an error, fail the job and return JOB_INPUT_ERROR. If any input
        dataset is deleted, fail the job and return JOB_INPUT_DELETED.  If all
        input datasets are in OK state, return JOB_READY indicating that the
        job can be dispatched. Otherwise, return JOB_WAIT indicating that input
        datasets are still being prepared.
        """
        # If tracking in the database, job.state is guaranteed to be NEW and the inputs are guaranteed to be OK
        if not self.track_jobs_in_database:
            if job.state == model.Job.states.DELETED:
                return JOB_DELETED
            elif job.state == model.Job.states.ERROR:
                return JOB_ADMIN_DELETED
            for dataset_assoc in job.input_datasets + job.input_library_datasets:
                idata = dataset_assoc.dataset
                if not idata:
                    continue
                # don't run jobs for which the input dataset was deleted
                if idata.deleted:
                    self.job_wrappers.pop(job.id, JobWrapper(job, self)).fail(
                        "input data %s (file: %s) was deleted before the job started"
                        % (idata.hid, idata.file_name))
                    return JOB_INPUT_DELETED
                # an error in the input data causes us to bail immediately
                elif idata.state == idata.states.ERROR:
                    self.job_wrappers.pop(job.id, JobWrapper(job, self)).fail(
                        "input data %s is in error state" % (idata.hid))
                    return JOB_INPUT_ERROR
                elif idata.state == idata.states.FAILED_METADATA:
                    self.job_wrappers.pop(job.id, JobWrapper(job, self)).fail(
                        "input data %s failed to properly set metadata" %
                        (idata.hid))
                    return JOB_INPUT_ERROR
                elif idata.state != idata.states.OK and not (
                        idata.state == idata.states.SETTING_METADATA
                        and job.tool_id is not None and job.tool_id == self.
                        app.datatypes_registry.set_external_metadata_tool.id):
                    # need to requeue
                    return JOB_WAIT
        # Create the job wrapper so that the destination can be set
        if job.id not in self.job_wrappers:
            self.job_wrappers[job.id] = JobWrapper(job, self)
        # Cause the job_destination to be set and cached by the mapper
        try:
            self.job_wrappers[job.id].job_destination
        except Exception, e:
            failure_message = getattr(e, 'failure_message',
                                      DEFAULT_JOB_PUT_FAILURE_MESSAGE)
            if failure_message == DEFAULT_JOB_PUT_FAILURE_MESSAGE:
                log.exception('Failed to generate job destination')
            else:
                log.debug("Intentionally failing job with message (%s)" %
                          failure_message)
            self.job_wrappers[job.id].fail(failure_message)
            return JOB_ERROR
        # job is ready to run, check limits
        state = self.__check_user_jobs(job, self.job_wrappers[job.id])
        if state == JOB_READY and self.app.config.enable_quotas:
            quota = self.app.quota_agent.get_quota(job.user)
            if quota is not None:
                try:
                    usage = self.app.quota_agent.get_usage(user=job.user,
                                                           history=job.history)
                    if usage > quota:
                        return JOB_USER_OVER_QUOTA
                except AssertionError, e:
                    pass  # No history, should not happen with an anon user
        return state
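
Example #14's __monitor method, and the equivalent methods in the job manager examples that follow, all share one worker-loop shape: a daemon thread repeatedly calls a step function, logs any exception rather than letting it kill the thread, and then sleeps interruptibly. Stripped of the Galaxy specifics, the pattern looks roughly like this (MonitorLoop and its names are illustrative, not Galaxy API):

import logging
import threading

log = logging.getLogger(__name__)

class MonitorLoop(object):
    """Generic monitor thread: call step() once per interval, surviving
    exceptions, until stop() is requested."""
    def __init__(self, step, sleeper, interval=1):
        self.step = step
        self.sleeper = sleeper
        self.interval = interval
        self.running = True
        self.thread = threading.Thread(target=self._run)
        self.thread.setDaemon(True)

    def start(self):
        self.thread.start()

    def _run(self):
        while self.running:
            try:
                self.step()
            except Exception:
                log.exception("Exception in monitor step")
            self.sleeper.sleep(self.interval)

    def stop(self):
        # Flip the flag first, then wake the sleeper so the loop
        # re-checks self.running immediately instead of finishing out
        # its current sleep.
        self.running = False
        self.sleeper.wake()
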
Example #15
class JobManagerQueue( object ):
    """
    Job manager, waits for jobs to be runnable and then dispatches to a
    JobHandler.
    """
    STOP_SIGNAL = object()
    def __init__( self, app, job_handler ):
        self.app = app
        self.job_handler = job_handler # the (singular) handler if we are passing jobs in memory

        self.sa_session = app.model.context
        self.job_lock = False
        # Keep track of the pid that started the job manager, only it
        # has valid threads
        self.parent_pid = os.getpid()
        # Contains new jobs. Note this is not used if track_jobs_in_database is True
        self.queue = Queue()
        # Helper for interruptable sleep
        self.sleeper = Sleeper()
        self.running = True
        self.monitor_thread = threading.Thread( name="JobManagerQueue.monitor_thread", target=self.__monitor )
        self.monitor_thread.setDaemon( True )
        # Recover jobs at startup
        self.__check_jobs_at_startup()
        # Start the queue
        self.monitor_thread.start()
        log.info( "job manager queue started" )

    def __check_jobs_at_startup( self ):
        """
        Checks all jobs that are in the 'new', 'queued' or 'running' state in
        the database and requeues or cleans up as necessary.  Only run as the
        job manager starts.
        """
        for job in self.sa_session.query( model.Job ).enable_eagerloads( False ) \
                                  .filter( ( ( model.Job.state == model.Job.states.NEW ) \
                                             | ( model.Job.state == model.Job.states.RUNNING ) \
                                             | ( model.Job.state == model.Job.states.QUEUED ) ) \
                                           & ( model.Job.handler == None ) ):
            if job.tool_id not in self.app.toolbox.tools_by_id:
                log.warning( "(%s) Tool '%s' removed from tool config, unable to recover job" % ( job.id, job.tool_id ) )
                JobWrapper( job, self ).fail( 'This tool was disabled before the job completed.  Please contact your Galaxy administrator.' )
            else:
                job.handler = self.__get_handler( job ) # handler's recovery method will take it from here
                log.info( "(%d) Job in '%s' state had no handler at job manager startup, assigned '%s' handler" % ( job.id, job.state, job.handler ) )
        if self.sa_session.dirty:
            self.sa_session.flush()

    def __monitor( self ):
        """
        Continually iterate the waiting jobs and dispatch to a handler
        """
        # HACK: Delay until after forking, we need a way to do post fork notification!!!
        time.sleep( 10 )
        while self.running:
            try:
                self.__monitor_step()
            except:
                log.exception( "Exception in monitor_step" )
            # Sleep
            self.sleeper.sleep( 1 )

    def __monitor_step( self ):
        """
        Called repeatedly by `monitor` to process waiting jobs. Gets any new
        jobs (either from the database or from its own queue), then assigns a
        handler.
        """
        # Do nothing if the queue is locked
        if self.job_lock:
            log.info( 'Job queue is administratively locked, sleeping...' )
            time.sleep( 10 )
            return
        # Pull all new jobs from the queue at once
        jobs_to_check = []
        if self.app.config.track_jobs_in_database:
            # Clear the session so we get fresh states for job and all datasets
            self.sa_session.expunge_all()
            # Fetch all new jobs
            jobs_to_check = self.sa_session.query( model.Job ).enable_eagerloads( False ) \
                                .filter( ( model.Job.state == model.Job.states.NEW ) \
                                         & ( model.Job.handler == None ) ).all()
        else:
            # Get job objects and append to watch queue for any which were
            # previously waiting
            try:
                while 1:
                    message = self.queue.get_nowait()
                    if message is self.STOP_SIGNAL:
                        return
                    # Unpack the message
                    job_id, tool_id = message
                    # Get the job object and append to watch queue
                    jobs_to_check.append( self.sa_session.query( model.Job ).get( job_id ) )
            except Empty:
                pass

        for job in jobs_to_check:
            job.handler = self.__get_handler( job )
            job.job_runner_name = self.__get_runner_url( job )
            log.debug( "(%s) Job assigned to handler '%s'" % ( job.id, job.handler ) )
            self.sa_session.add( job )

        # If tracking in the database, handlers will pick up the job now
        self.sa_session.flush()

        time.sleep( 5 )

        # This only does something in the case that there is only one handler and it is this Galaxy process
        for job in jobs_to_check:
            self.job_handler.job_queue.put( job.id, job.tool_id )

    def __get_handler( self, job ):
        try:
            params = None
            if job.params:
                params = from_json_string( job.params )
            return self.app.toolbox.tools_by_id.get( job.tool_id, None ).get_job_handler( params )
        except:
            log.exception( "(%s) Caught exception attempting to get tool-specific job handler for tool '%s', selecting at random from available handlers instead:" % ( job.id, job.tool_id ) )
            return random.choice( self.app.config.job_handlers )

    def __get_runner_url( self, job ):
        """This fetches the raw runner URL, and does not perform any computation e.g. for the dynamic runner"""
        try:
            return self.app.toolbox.tools_by_id.get( job.tool_id, None ).get_job_runner_url( job.params )
        except Exception, e:
            log.warning( 'Unable to determine job runner URL for job %s: %s' % (job.id, str(e)) )
        return None
Example #16
class JobManagerStopQueue( object ):
    """
    A queue for jobs which need to be terminated prematurely.
    """
    STOP_SIGNAL = object()
    def __init__( self, app, job_handler ):
        self.app = app
        self.job_handler = job_handler

        self.sa_session = app.model.context

        # Keep track of the pid that started the job manager, only it
        # has valid threads
        self.parent_pid = os.getpid()
        # Contains new jobs. Note this is not used if track_jobs_in_database is True
        self.queue = Queue()

        # Contains jobs that are waiting (only use from monitor thread)
        self.waiting = []

        # Helper for interruptable sleep
        self.sleeper = Sleeper()
        self.running = True
        self.monitor_thread = threading.Thread( name="JobManagerStopQueue.monitor_thread", target=self.monitor )
        self.monitor_thread.setDaemon( True )
        self.monitor_thread.start()
        log.info( "job manager stop queue started" )

    def monitor( self ):
        """
        Continually iterate the waiting jobs, stop any that are found.
        """
        # HACK: Delay until after forking, we need a way to do post fork notification!!!
        time.sleep( 10 )
        while self.running:
            try:
                self.monitor_step()
            except:
                log.exception( "Exception in monitor_step" )
            # Sleep
            self.sleeper.sleep( 1 )

    def monitor_step( self ):
        """
        Called repeatedly by `monitor` to stop jobs.
        """
        jobs_to_check = []
        # Pull from the queue even if tracking in the database (in the case of Administrative stopped jobs)
        try:
            while 1:
                message = self.queue.get_nowait()
                if message is self.STOP_SIGNAL:
                    return
                # Unpack the message
                job_id, error_msg = message
                # Get the job object and append to watch queue
                jobs_to_check.append( ( self.sa_session.query( model.Job ).get( job_id ), error_msg ) )
        except Empty:
            pass

        # If tracking in the database, the handler will pick up the stop itself.  Otherwise, notify the handler.
        for job, error_msg in jobs_to_check:
            self.job_handler.job_stop_queue.put( job.id, error_msg )

    def put( self, job_id, error_msg=None ):
        self.queue.put( ( job_id, error_msg ) )

    def shutdown( self ):
        """Attempts to gracefully shut down the worker thread"""
        if self.parent_pid != os.getpid():
            # We're not the real job queue, do nothing
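            # (A forked child inherits this queue object, but fork() does
            # not copy the monitor thread, so in the child there is no
            # worker to signal; hence the parent_pid check above.)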
            return
        else:
            log.info( "sending stop signal to worker thread" )
            self.running = False
            if not self.app.config.track_jobs_in_database:
                self.queue.put( self.STOP_SIGNAL )
            self.sleeper.wake()
            log.info( "job manager stop queue stopped" )
Example #17
class DistributedObjectStore(ObjectStore):
    """
    ObjectStore that defers to a list of backends, for getting objects the
    first store where the object exists is used, objects are created in a
    store selected randomly, but with weighting.
    """

    def __init__(self, config, fsmon=False):
        super(DistributedObjectStore, self).__init__()
        self.distributed_config = config.distributed_object_store_config_file
        assert self.distributed_config is not None, "distributed object store ('object_store = distributed') " \
                                                    "requires a config file, please set one in " \
                                                    "'distributed_object_store_config_file')"
        self.backends = {}
        self.weighted_backend_ids = []
        self.original_weighted_backend_ids = []
        self.max_percent_full = {}
        self.global_max_percent_full = 0.0

        random.seed()

        self.__parse_distributed_config(config)

        self.sleeper = None
        if fsmon and ( self.global_max_percent_full or filter( lambda x: x != 0.0, self.max_percent_full.values() ) ):
            self.sleeper = Sleeper()
            self.filesystem_monitor_thread = threading.Thread(target=self.__filesystem_monitor)
            self.filesystem_monitor_thread.setDaemon( True )
            self.filesystem_monitor_thread.start()
            log.info("Filesystem space monitor started")

    def __parse_distributed_config(self, config):
        tree = util.parse_xml(self.distributed_config)
        root = tree.getroot()
        log.debug('Loading backends for distributed object store from %s' % self.distributed_config)
        self.global_max_percent_full = float(root.get('maxpctfull', 0))
        for elem in [ e for e in root if e.tag == 'backend' ]:
            id = elem.get('id')
            weight = int(elem.get('weight', 1))
            maxpctfull = float(elem.get('maxpctfull', 0))
            if elem.get('type', 'disk'):
                path = None
                extra_dirs = {}
                for sub in elem:
                    if sub.tag == 'files_dir':
                        path = sub.get('path')
                    elif sub.tag == 'extra_dir':
                        type = sub.get('type')
                        extra_dirs[type] = sub.get('path')
                self.backends[id] = DiskObjectStore(config, file_path=path, extra_dirs=extra_dirs)
                self.max_percent_full[id] = maxpctfull
                log.debug("Loaded disk backend '%s' with weight %s and file_path: %s" % (id, weight, path))
                if extra_dirs:
                    log.debug("    Extra directories:")
                    for type, dir in extra_dirs.items():
                        log.debug("        %s: %s" % (type, dir))
            for i in range(0, weight):
                # The simplest way to do weighting: add backend ids to a
                # sequence the number of times equalling weight, then randomly
                # choose a backend from that sequence at creation
                self.weighted_backend_ids.append(id)
        self.original_weighted_backend_ids = self.weighted_backend_ids

    def __filesystem_monitor(self):
        while self.running:
            new_weighted_backend_ids = self.original_weighted_backend_ids
            for id, backend in self.backends.items():
                maxpct = self.max_percent_full[id] or self.global_max_percent_full
                pct = backend.get_store_usage_percent()
                if pct > maxpct:
                    new_weighted_backend_ids = filter(lambda x: x != id, new_weighted_backend_ids)
            self.weighted_backend_ids = new_weighted_backend_ids
            self.sleeper.sleep(120) # Test free space every 2 minutes

    def shutdown(self):
        super(DistributedObjectStore, self).shutdown()
        if self.sleeper is not None:
            self.sleeper.wake()

    def exists(self, obj, **kwargs):
        return self.__call_method('exists', obj, False, False, **kwargs)

    def file_ready(self, obj, **kwargs):
        return self.__call_method('file_ready', obj, False, False, **kwargs)

    def create(self, obj, **kwargs):
        """
        create() is the only method in which obj.object_store_id may be None
        """
        if obj.object_store_id is None or not self.exists(obj, **kwargs):
            if obj.object_store_id is None or obj.object_store_id not in self.weighted_backend_ids:
                try:
                    obj.object_store_id = random.choice(self.weighted_backend_ids)
                except IndexError:
                    raise ObjectInvalid( 'objectstore.create, could not generate obj.object_store_id: %s, kwargs: %s'
                        %( str( obj ), str( kwargs ) ) )
                object_session( obj ).add( obj )
                object_session( obj ).flush()
                log.debug("Selected backend '%s' for creation of %s %s" % (obj.object_store_id, obj.__class__.__name__, obj.id))
            else:
                log.debug("Using preferred backend '%s' for creation of %s %s" % (obj.object_store_id, obj.__class__.__name__, obj.id))
            self.backends[obj.object_store_id].create(obj, **kwargs)

    def empty(self, obj, **kwargs):
        return self.__call_method('empty', obj, True, False, **kwargs)

    def size(self, obj, **kwargs):
        return self.__call_method('size', obj, 0, False, **kwargs)

    def delete(self, obj, **kwargs):
        return self.__call_method('delete', obj, False, False, **kwargs)

    def get_data(self, obj, **kwargs):
        return self.__call_method('get_data', obj, ObjectNotFound, True, **kwargs)

    def get_filename(self, obj, **kwargs):
        return self.__call_method('get_filename', obj, ObjectNotFound, True, **kwargs)

    def update_from_file(self, obj, **kwargs):
        if kwargs.get('create', False):
            self.create(obj, **kwargs)
            kwargs['create'] = False
        return self.__call_method('update_from_file', obj, ObjectNotFound, True, **kwargs)

    def get_object_url(self, obj, **kwargs):
        return self.__call_method('get_object_url', obj, None, False, **kwargs)

    def __call_method(self, method, obj, default, default_is_exception, **kwargs):
        object_store_id = self.__get_store_id_for(obj, **kwargs)
        if object_store_id is not None:
            return self.backends[object_store_id].__getattribute__(method)(obj, **kwargs)
        if default_is_exception:
            raise default( 'objectstore, __call_method failed: %s on %s, kwargs: %s'
                %( method, str( obj ), str( kwargs ) ) )
        else:
            return default

    def __get_store_id_for(self, obj, **kwargs):
        if obj.object_store_id is not None and obj.object_store_id in self.backends:
            return obj.object_store_id
        else:
            # if this instance has been switched from a non-distributed to a
            # distributed object store, or if the object's store id is invalid,
            # try to locate the object
            log.warning('The backend object store ID (%s) for %s object with ID %s is invalid' % (obj.object_store_id, obj.__class__.__name__, obj.id))
            for id, store in self.backends.items():
                if store.exists(obj, **kwargs):
                    log.warning('%s object with ID %s found in backend object store with ID %s' % (obj.__class__.__name__, obj.id, id))
                    obj.object_store_id = id
                    object_session( obj ).add( obj )
                    object_session( obj ).flush()
                    return id
        return None
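
The weighting comment inside __parse_distributed_config describes the whole selection scheme: a backend with weight 3 appears three times in weighted_backend_ids, so random.choice() picks it three times as often as a weight-1 backend. The same idea in isolation (build_weighted_ids is an illustrative helper, not part of Galaxy):

import random

def build_weighted_ids(weights):
    """Expand a {backend_id: weight} mapping into a flat list in which
    each id occurs `weight` times; random.choice() over that list then
    selects backends in proportion to their weights."""
    weighted = []
    for backend_id, weight in weights.items():
        weighted.extend([backend_id] * weight)
    return weighted

weighted = build_weighted_ids({'files1': 3, 'files2': 1})
backend_id = random.choice(weighted)  # 'files1' ~75% of the time
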
Example #18
class DistributedObjectStore(ObjectStore):
    """
    ObjectStore that defers to a list of backends, for getting objects the
    first store where the object exists is used, objects are created in a
    store selected randomly, but with weighting.
    """
    
    def __init__(self, config):
        super(DistributedObjectStore, self).__init__()
        self.distributed_config = config.distributed_object_store_config_file
        assert self.distributed_config is not None, "distributed object store ('object_store = distributed') " \
                                                    "requires a config file, please set one in " \
                                                    "'distributed_object_store_config_file')"
        self.backends = {}
        self.weighted_backend_ids = []
        self.original_weighted_backend_ids = []
        self.max_percent_full = {}
        self.global_max_percent_full = 0.0

        random.seed()

        self.__parse_distributed_config(config)

        if self.global_max_percent_full or filter(lambda x: x is not None, self.max_percent_full.values()):
            self.sleeper = Sleeper()
            self.filesystem_monitor_thread = threading.Thread(target=self.__filesystem_monitor)
            self.filesystem_monitor_thread.start()
            log.info("Filesystem space monitor started")

    def __parse_distributed_config(self, config):
        tree = util.parse_xml(self.distributed_config)
        root = tree.getroot()
        log.debug('Loading backends for distributed object store from %s' % self.distributed_config)
        self.global_max_percent_full = float(root.get('maxpctfull', 0))
        for elem in [ e for e in root if e.tag == 'backend' ]:
            id = elem.get('id')
            weight = int(elem.get('weight', 1))
            maxpctfull = float(elem.get('maxpctfull', 0))
            if elem.get('type', 'disk'):
                path = None
                extra_dirs = {}
                for sub in elem:
                    if sub.tag == 'files_dir':
                        path = sub.get('path')
                    elif sub.tag == 'extra_dir':
                        type = sub.get('type')
                        extra_dirs[type] = sub.get('path')
                self.backends[id] = DiskObjectStore(config, file_path=path, extra_dirs=extra_dirs)
                self.max_percent_full[id] = maxpctfull
                log.debug("Loaded disk backend '%s' with weight %s and file_path: %s" % (id, weight, path))
                if extra_dirs:
                    log.debug("    Extra directories:")
                    for type, dir in extra_dirs.items():
                        log.debug("        %s: %s" % (type, dir))
            for i in range(0, weight):
                # The simplest way to do weighting: add backend ids to a
                # sequence the number of times equalling weight, then randomly
                # choose a backend from that sequence at creation
                self.weighted_backend_ids.append(id)
        self.original_weighted_backend_ids = self.weighted_backend_ids

    def __filesystem_monitor(self):
        while self.running:
            new_weighted_backend_ids = self.original_weighted_backend_ids
            for id, backend in self.backends.items():
                maxpct = self.max_percent_full[id] or self.global_max_percent_full
                pct = backend.get_store_usage_percent()
                if pct > maxpct:
                    new_weighted_backend_ids = filter(lambda x: x != id, new_weighted_backend_ids)
            self.weighted_backend_ids = new_weighted_backend_ids
            self.sleeper.sleep(120) # Test free space every 2 minutes

    def shutdown(self):
        super(DistributedObjectStore, self).shutdown()
        self.sleeper.wake()

    def exists(self, obj, **kwargs):
        return self.__call_method('exists', obj, False, False, **kwargs)

    def file_ready(self, obj, **kwargs):
        return self.__call_method('file_ready', obj, False, False, **kwargs)

    def create(self, obj, **kwargs):
        """
        create() is the only method in which obj.object_store_id may be None
        """
        if obj.object_store_id is None or not self.exists(obj, **kwargs):
            if obj.object_store_id is None or obj.object_store_id not in self.weighted_backend_ids:
                try:
                    obj.object_store_id = random.choice(self.weighted_backend_ids)
                except IndexError:
                    raise ObjectInvalid()
                object_session( obj ).add( obj )
                object_session( obj ).flush()
                log.debug("Selected backend '%s' for creation of %s %s" % (obj.object_store_id, obj.__class__.__name__, obj.id))
            else:
                log.debug("Using preferred backend '%s' for creation of %s %s" % (obj.object_store_id, obj.__class__.__name__, obj.id))
            self.backends[obj.object_store_id].create(obj, **kwargs)

    def empty(self, obj, **kwargs):
        return self.__call_method('empty', obj, True, False, **kwargs)

    def size(self, obj, **kwargs):
        return self.__call_method('size', obj, 0, False, **kwargs)

    def delete(self, obj, **kwargs):
        return self.__call_method('delete', obj, False, False, **kwargs)

    def get_data(self, obj, **kwargs):
        return self.__call_method('get_data', obj, ObjectNotFound, True, **kwargs)

    def get_filename(self, obj, **kwargs):
        return self.__call_method('get_filename', obj, ObjectNotFound, True, **kwargs)

    def update_from_file(self, obj, **kwargs):
        if kwargs.get('create', False):
            self.create(obj, **kwargs)
            kwargs['create'] = False
        return self.__call_method('update_from_file', obj, ObjectNotFound, True, **kwargs)

    def get_object_url(self, obj, **kwargs):
        return self.__call_method('get_object_url', obj, None, False, **kwargs)

    def __call_method(self, method, obj, default, default_is_exception, **kwargs):
        object_store_id = self.__get_store_id_for(obj, **kwargs)
        if object_store_id is not None:
            return self.backends[object_store_id].__getattribute__(method)(obj, **kwargs)
        if default_is_exception:
            raise default()
        else:
            return default

    def __get_store_id_for(self, obj, **kwargs):
        if obj.object_store_id is not None and obj.object_store_id in self.backends:
            return obj.object_store_id
        else:
            # if this instance has been switched from a non-distributed to a
            # distributed object store, or if the object's store id is invalid,
            # try to locate the object
            log.warning('The backend object store ID (%s) for %s object with ID %s is invalid' % (obj.object_store_id, obj.__class__.__name__, obj.id))
            for id, store in self.backends.items():
                if store.exists(obj, **kwargs):
                    log.warning('%s object with ID %s found in backend object store with ID %s' % (obj.__class__.__name__, obj.id, id))
                    obj.object_store_id = id
                    object_session( obj ).add( obj )
                    object_session( obj ).flush()
                    return id
        return None
Example #19
class JobHandlerQueue( object ):
    """
    Job manager, waits for jobs to be runnable and then dispatches to
    a JobRunner.
    """
    STOP_SIGNAL = object()

    def __init__( self, app, dispatcher ):
        """Start the job manager"""
        self.app = app
        self.dispatcher = dispatcher

        self.sa_session = app.model.context
        self.track_jobs_in_database = self.app.config.track_jobs_in_database

        # Initialize structures for handling job limits
        self.__clear_user_job_count()

        # Keep track of the pid that started the job manager, only it
        # has valid threads
        self.parent_pid = os.getpid()
        # Contains new jobs. Note this is not used if track_jobs_in_database is True
        self.queue = Queue()
        # Contains jobs that are waiting (only use from monitor thread)
        self.waiting_jobs = []
        # Contains wrappers of jobs that are limited or ready (so they aren't created unnecessarily/multiple times)
        self.job_wrappers = {}
        # Helper for interruptable sleep
        self.sleeper = Sleeper()
        self.running = True
        self.monitor_thread = threading.Thread( name="JobHandlerQueue.monitor_thread", target=self.__monitor )
        self.monitor_thread.setDaemon( True )

    def start( self ):
        """
        The JobManager should start, and then start its Handler, if it has one.
        """
        # Recover jobs at startup
        self.__check_jobs_at_startup()
        # Start the queue
        self.monitor_thread.start()
        log.info( "job handler queue started" )

    def __check_jobs_at_startup( self ):
        """
        Checks all jobs that are in the 'new', 'queued' or 'running' state in
        the database and requeues or cleans up as necessary.  Only run as the
        job handler starts.
        """
        for job in self.sa_session.query( model.Job ).enable_eagerloads( False ) \
                                  .filter( ( ( model.Job.state == model.Job.states.NEW ) \
                                             | ( model.Job.state == model.Job.states.RUNNING ) \
                                             | ( model.Job.state == model.Job.states.QUEUED ) ) \
                                           & ( model.Job.handler == self.app.config.server_name ) ):
            if job.tool_id not in self.app.toolbox.tools_by_id:
                log.warning( "(%s) Tool '%s' removed from tool config, unable to recover job" % ( job.id, job.tool_id ) )
                JobWrapper( job, self ).fail( 'This tool was disabled before the job completed.  Please contact your Galaxy administrator.' )
            if job.job_runner_name is not None and job.job_runner_external_id is None:
                # This could happen during certain revisions of Galaxy where a runner URL was persisted before the job was dispatched to a runner.
                log.debug( "(%s) Job runner assigned but no external ID recorded, adding to the job handler queue" % job.id )
                job.job_runner_name = None
                if self.track_jobs_in_database:
                    job.state = model.Job.states.NEW
                else:
                    self.queue.put( ( job.id, job.tool_id ) )
            elif job.job_runner_name is not None and job.job_runner_external_id is not None and job.destination_id is None:
                # This is the first start after upgrading from URLs to destinations, convert the URL to a destination and persist
                # TODO: test me extensively
                job_wrapper = JobWrapper( job, self )
                job_destination = self.dispatcher.url_to_destination(job.job_runner_name)
                if job_destination.id is None:
                    job_destination.id = 'legacy_url'
                job_wrapper.set_job_destination(job_destination, job.job_runner_external_id)
                self.dispatcher.recover( job, job_wrapper )
                log.info('(%s) Converted job from a URL to a destination and recovered' % (job.id))
            elif job.job_runner_name is None:
                # Never (fully) dispatched
                log.debug( "(%s) No job runner assigned and job still in '%s' state, adding to the job handler queue" % ( job.id, job.state ) )
                if self.track_jobs_in_database:
                    job.state = model.Job.states.NEW
                else:
                    self.queue.put( ( job.id, job.tool_id ) )
            else:
                # Already dispatched and running
                job_wrapper = JobWrapper( job, self )
                job_wrapper.job_runner_mapper.cached_job_destination = JobDestination(id=job.destination_id, runner=job.job_runner_name, params=job.destination_params)
                self.dispatcher.recover( job, job_wrapper )
        if self.sa_session.dirty:
            self.sa_session.flush()

    def __monitor( self ):
        """
        Continually iterate the waiting jobs, checking if each is ready to
        run and dispatching it if so.
        """
        while self.running:
            try:
                self.__monitor_step()
            except:
                log.exception( "Exception in monitor_step" )
            # Sleep
            self.sleeper.sleep( 1 )

    def __monitor_step( self ):
        """
        Called repeatedly by `monitor` to process waiting jobs. Gets any new
        jobs (either from the database or from its own queue), then iterates
        over all new and waiting jobs to check the state of the jobs each
        depends on. If the job has dependencies that have not finished, it
        goes to the waiting queue. If the job has dependencies with errors,
        it is marked as having errors and removed from the queue. Otherwise,
        the job is dispatched.
        """
        # Pull all new jobs from the queue at once
        jobs_to_check = []
        if self.track_jobs_in_database:
            # Clear the session so we get fresh states for job and all datasets
            self.sa_session.expunge_all()
            # Fetch all new jobs
            hda_not_ready = self.sa_session.query(model.Job.id).enable_eagerloads(False) \
                    .join(model.JobToInputDatasetAssociation) \
                    .join(model.HistoryDatasetAssociation) \
                    .join(model.Dataset) \
                    .filter(and_((model.Job.state == model.Job.states.NEW),
                                 or_((model.HistoryDatasetAssociation._state == model.HistoryDatasetAssociation.states.FAILED_METADATA),
                                     (model.HistoryDatasetAssociation.deleted == True ),
                                     (model.Dataset.state != model.Dataset.states.OK ),
                                     (model.Dataset.deleted == True)))).subquery()
            ldda_not_ready = self.sa_session.query(model.Job.id).enable_eagerloads(False) \
                    .join(model.JobToInputLibraryDatasetAssociation) \
                    .join(model.LibraryDatasetDatasetAssociation) \
                    .join(model.Dataset) \
                    .filter(and_((model.Job.state == model.Job.states.NEW),
                                 or_((model.LibraryDatasetDatasetAssociation._state != None),
                                     (model.LibraryDatasetDatasetAssociation.deleted == True),
                                     (model.Dataset.state != model.Dataset.states.OK),
                                     (model.Dataset.deleted == True)))).subquery()
            jobs_to_check = self.sa_session.query(model.Job).enable_eagerloads(False) \
                    .filter(and_((model.Job.state == model.Job.states.NEW),
                                 (model.Job.handler == self.app.config.server_name),
                                 ~model.Job.table.c.id.in_(hda_not_ready),
                                 ~model.Job.table.c.id.in_(ldda_not_ready))) \
                    .order_by(model.Job.id).all()
        else:
            # Get job objects and append to watch queue for any which were
            # previously waiting
            for job_id in self.waiting_jobs:
                jobs_to_check.append( self.sa_session.query( model.Job ).get( job_id ) )
            try:
                while 1:
                    message = self.queue.get_nowait()
                    if message is self.STOP_SIGNAL:
                        return
                    # Unpack the message
                    job_id, tool_id = message
                    # Get the job object and append to watch queue
                    jobs_to_check.append( self.sa_session.query( model.Job ).get( job_id ) )
            except Empty:
                pass
        # Ensure that we get new job counts on each iteration
        self.__clear_user_job_count()
        # Iterate over new and waiting jobs and look for any that are
        # ready to run
        new_waiting_jobs = []
        for job in jobs_to_check:
            try:
                # Check the job's dependencies, requeue if they're not done.
                # Some of these states will only happen when using the in-memory job queue
                job_state = self.__check_if_ready_to_run( job )
                if job_state == JOB_WAIT:
                    new_waiting_jobs.append( job.id )
                elif job_state == JOB_INPUT_ERROR:
                    log.info( "(%d) Job unable to run: one or more inputs in error state" % job.id )
                elif job_state == JOB_INPUT_DELETED:
                    log.info( "(%d) Job unable to run: one or more inputs deleted" % job.id )
                elif job_state == JOB_READY:
                    self.dispatcher.put( self.job_wrappers.pop( job.id ) )
                    log.info( "(%d) Job dispatched" % job.id )
                elif job_state == JOB_DELETED:
                    log.info( "(%d) Job deleted by user while still queued" % job.id )
                elif job_state == JOB_ADMIN_DELETED:
                    log.info( "(%d) Job deleted by admin while still queued" % job.id )
                elif job_state == JOB_USER_OVER_QUOTA:
                    log.info( "(%d) User (%s) is over quota: job paused" % ( job.id, job.user_id ) )
                    job.state = model.Job.states.PAUSED
                    for dataset_assoc in job.output_datasets + job.output_library_datasets:
                        dataset_assoc.dataset.dataset.state = model.Dataset.states.PAUSED
                        dataset_assoc.dataset.info = "Execution of this dataset's job is paused because you were over your disk quota at the time it was ready to run"
                        self.sa_session.add( dataset_assoc.dataset.dataset )
                    self.sa_session.add( job )
                elif job_state == JOB_ERROR:
                    log.error( "(%d) Error checking job readiness" % job.id )
                else:
                    log.error( "(%d) Job in unknown state '%s'" % ( job.id, job_state ) )
                    new_waiting_jobs.append( job.id )
            except Exception:
                log.exception( "failure running job %d" % job.id )
        # Update the waiting list
        if not self.track_jobs_in_database:
            self.waiting_jobs = new_waiting_jobs
        # Remove cached wrappers for any jobs that are no longer being tracked
        for id in self.job_wrappers.keys():
            if id not in new_waiting_jobs:
                del self.job_wrappers[id]
        # Flush, if we updated the state
        self.sa_session.flush()
        # Done with the session
        self.sa_session.remove()

    def __check_if_ready_to_run( self, job ):
        """
        Check if a job is ready to run by verifying that each of its input
        datasets is ready (specifically in the OK state). If any input dataset
        has an error, fail the job and return JOB_INPUT_ERROR. If any input
        dataset is deleted, fail the job and return JOB_INPUT_DELETED.  If all
        input datasets are in OK state, return JOB_READY indicating that the
        job can be dispatched. Otherwise, return JOB_WAIT indicating that input
        datasets are still being prepared.
        """
        # If tracking in the database, job.state is guaranteed to be NEW and the inputs are guaranteed to be OK
        if not self.track_jobs_in_database:
            if job.state == model.Job.states.DELETED:
                return JOB_DELETED
            elif job.state == model.Job.states.ERROR:
                return JOB_ADMIN_DELETED
            for dataset_assoc in job.input_datasets + job.input_library_datasets:
                idata = dataset_assoc.dataset
                if not idata:
                    continue
                # don't run jobs for which the input dataset was deleted
                if idata.deleted:
                    self.job_wrappers.pop(job.id, JobWrapper( job, self )).fail( "input data %s (file: %s) was deleted before the job started" % ( idata.hid, idata.file_name ) )
                    return JOB_INPUT_DELETED
                # an error in the input data causes us to bail immediately
                elif idata.state == idata.states.ERROR:
                    self.job_wrappers.pop(job.id, JobWrapper( job, self )).fail( "input data %s is in error state" % ( idata.hid ) )
                    return JOB_INPUT_ERROR
                elif idata.state == idata.states.FAILED_METADATA:
                    self.job_wrappers.pop(job.id, JobWrapper( job, self )).fail( "input data %s failed to properly set metadata" % ( idata.hid ) )
                    return JOB_INPUT_ERROR
                elif idata.state != idata.states.OK and not ( idata.state == idata.states.SETTING_METADATA and job.tool_id is not None and job.tool_id == self.app.datatypes_registry.set_external_metadata_tool.id ):
                    # need to requeue
                    return JOB_WAIT
        # Create the job wrapper so that the destination can be set
        if job.id not in self.job_wrappers:
            self.job_wrappers[job.id] = JobWrapper(job, self)
        # Cause the job_destination to be set and cached by the mapper
        try:
            self.job_wrappers[job.id].job_destination
        except Exception, e:
            failure_message = getattr(e, 'failure_message', DEFAULT_JOB_PUT_FAILURE_MESSAGE )
            if failure_message == DEFAULT_JOB_PUT_FAILURE_MESSAGE:
                log.exception( 'Failed to generate job destination' )
            else:
                log.debug( "Intentionally failing job with message (%s)" % failure_message )
            self.job_wrappers[job.id].fail( failure_message )
            return JOB_ERROR
        # job is ready to run, check limits
        state = self.__check_user_jobs( job, self.job_wrappers[job.id] )
        if state == JOB_READY and self.app.config.enable_quotas:
            quota = self.app.quota_agent.get_quota( job.user )
            if quota is not None:
                try:
                    usage = self.app.quota_agent.get_usage( user=job.user, history=job.history )
                    if usage > quota:
                        return JOB_USER_OVER_QUOTA
                except AssertionError, e:
                    pass # No history, should not happen with an anon user
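
Note: the `Sleeper` used throughout these examples is a small Galaxy helper for interruptible sleep: the monitor thread calls `sleeper.sleep(1)` between iterations, and shutdown code can call `sleeper.wake()` so the loop exits promptly instead of waiting out the full interval. The class itself is not shown in these listings; below is a minimal sketch of the same `sleep()`/`wake()` interface built on `threading.Event` (an assumption, not necessarily Galaxy's own implementation).

import threading

class Sleeper(object):
    """Interruptible sleep with the sleep()/wake() interface used above."""
    def __init__(self):
        self.event = threading.Event()

    def sleep(self, seconds):
        # Block for up to `seconds`, or return early if wake() is called.
        self.event.wait(seconds)
        self.event.clear()

    def wake(self):
        # Release any thread currently blocked in sleep().
        self.event.set()
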
Exemple #20
0
class JobHandlerStopQueue( object ):
    """
    A queue for jobs which need to be terminated prematurely.
    """
    STOP_SIGNAL = object()
    def __init__( self, app, dispatcher ):
        self.app = app
        self.dispatcher = dispatcher

        self.sa_session = app.model.context

        # Keep track of the pid that started the job manager, only it
        # has valid threads
        self.parent_pid = os.getpid()
        # Contains new jobs. Note this is not used if track_jobs_in_database is True
        self.queue = Queue()

        # Contains jobs that are waiting (only use from monitor thread)
        self.waiting = []

        # Helper for interruptable sleep
        self.sleeper = Sleeper()
        self.running = True
        self.monitor_thread = threading.Thread( target=self.monitor )
        self.monitor_thread.start()
        log.info( "job handler stop queue started" )

    def monitor( self ):
        """
        Continually iterate the waiting jobs, stopping any that are found.
        """
        # HACK: Delay until after forking, we need a way to do post fork notification!!!
        time.sleep( 10 )
        while self.running:
            try:
                self.monitor_step()
            except:
                log.exception( "Exception in monitor_step" )
            # Sleep
            self.sleeper.sleep( 1 )

    def monitor_step( self ):
        """
        Called repeatedly by `monitor` to stop jobs.
        """
        # Pull all new jobs from the queue at once
        jobs_to_check = []
        if self.app.config.track_jobs_in_database:
            # Clear the session so we get fresh states for job and all datasets
            self.sa_session.expunge_all()
            # Fetch all new jobs
            newly_deleted_jobs = self.sa_session.query( model.Job ).enable_eagerloads( False ) \
                                     .filter( ( model.Job.state == model.Job.states.DELETED_NEW ) \
                                              & ( model.Job.handler == self.app.config.server_name ) ).all()
            for job in newly_deleted_jobs:
                jobs_to_check.append( ( job, None ) )
        # Also pull from the queue (in the case of administratively stopped jobs)
        try:
            while 1:
                message = self.queue.get_nowait()
                if message is self.STOP_SIGNAL:
                    return
                # Unpack the message
                job_id, error_msg = message
                # Get the job object and append to watch queue
                jobs_to_check.append( ( self.sa_session.query( model.Job ).get( job_id ), error_msg ) )
        except Empty:
            pass
        for job, error_msg in jobs_to_check:
            if error_msg is not None:
                job.state = job.states.ERROR
                job.info = error_msg
            else:
                job.state = job.states.DELETED
            self.sa_session.add( job )
            self.sa_session.flush()
            if job.job_runner_name is not None:
                # tell the dispatcher to stop the job
                self.dispatcher.stop( job )

    def put( self, job_id, error_msg=None ):
        self.queue.put( ( job_id, error_msg ) )

    def shutdown( self ):
        """Attempts to gracefully shut down the worker thread"""
        if self.parent_pid != os.getpid():
            # We're not the real job queue, do nothing
            return
        else:
            log.info( "sending stop signal to worker thread" )
            self.running = False
            if not self.app.config.track_jobs_in_database:
                self.queue.put( self.STOP_SIGNAL )
            self.sleeper.wake()
            log.info( "job handler stop queue stopped" )
Exemple #21
0
class JobHandlerStopQueue(object):
    """
    A queue for jobs which need to be terminated prematurely.
    """
    STOP_SIGNAL = object()

    def __init__(self, app, dispatcher):
        self.app = app
        self.dispatcher = dispatcher

        self.sa_session = app.model.context

        # Keep track of the pid that started the job manager, only it
        # has valid threads
        self.parent_pid = os.getpid()
        # Contains new jobs. Note this is not used if track_jobs_in_database is True
        self.queue = Queue()

        # Contains jobs that are waiting (only use from monitor thread)
        self.waiting = []

        # Helper for interruptable sleep
        self.sleeper = Sleeper()
        self.running = True
        self.monitor_thread = threading.Thread(
            name="JobHandlerStopQueue.monitor_thread", target=self.monitor)
        self.monitor_thread.setDaemon(True)
        self.monitor_thread.start()
        log.info("job handler stop queue started")

    def monitor(self):
        """
        Continually iterate the waiting jobs, stopping any that are found.
        """
        # HACK: Delay until after forking, we need a way to do post fork notification!!!
        time.sleep(10)
        while self.running:
            try:
                self.monitor_step()
            except:
                log.exception("Exception in monitor_step")
            # Sleep
            self.sleeper.sleep(1)

    def monitor_step(self):
        """
        Called repeatedly by `monitor` to stop jobs.
        """
        # Pull all new jobs from the queue at once
        jobs_to_check = []
        if self.app.config.track_jobs_in_database:
            # Clear the session so we get fresh states for job and all datasets
            self.sa_session.expunge_all()
            # Fetch all new jobs
            newly_deleted_jobs = self.sa_session.query(model.Job).enable_eagerloads(False) \
                .filter((model.Job.state == model.Job.states.DELETED_NEW)
                        & (model.Job.handler == self.app.config.server_name)).all()
            for job in newly_deleted_jobs:
                jobs_to_check.append((job, job.stderr))
        # Also pull from the queue (in the case of administratively stopped jobs)
        try:
            while 1:
                message = self.queue.get_nowait()
                if message is self.STOP_SIGNAL:
                    return
                # Unpack the message
                job_id, error_msg = message
                # Get the job object and append to watch queue
                jobs_to_check.append(
                    (self.sa_session.query(model.Job).get(job_id), error_msg))
        except Empty:
            pass
        for job, error_msg in jobs_to_check:
            if error_msg is not None:
                job.state = job.states.ERROR
                job.info = error_msg
            else:
                job.state = job.states.DELETED
            self.sa_session.add(job)
            self.sa_session.flush()
            if job.job_runner_name is not None:
                # tell the dispatcher to stop the job
                self.dispatcher.stop(job)

    def put(self, job_id, error_msg=None):
        self.queue.put((job_id, error_msg))

    def shutdown(self):
        """Attempts to gracefully shut down the worker thread"""
        if self.parent_pid != os.getpid():
            # We're not the real job queue, do nothing
            return
        else:
            log.info("sending stop signal to worker thread")
            self.running = False
            if not self.app.config.track_jobs_in_database:
                self.queue.put(self.STOP_SIGNAL)
            self.sleeper.wake()
            log.info("job handler stop queue stopped")
Exemple #22
0
class S3ObjectStore(ObjectStore):
    """
    Object store that stores objects as items in an AWS S3 bucket. A local
    cache exists that is used as an intermediate location for files between
    Galaxy and S3.
    """
    def __init__(self, config):
        assert sys.version_info >= (2, 6), 'S3 Object Store support requires Python >= 2.6'
        super(S3ObjectStore, self).__init__()
        self.config = config
        self.staging_path = self.config.file_path
        self.s3_conn = S3Connection()
        self.bucket = self._get_bucket(self.config.s3_bucket)
        self.use_rr = self.config.use_reduced_redundancy
        self.cache_size = self.config.object_store_cache_size * 1073741824 # Convert GBs to bytes
        self.transfer_progress = 0
        # Clean cache only if value is set in universe_wsgi.ini
        if self.cache_size != -1:
            # Helper for interruptable sleep
            self.sleeper = Sleeper()
            self.cache_monitor_thread = threading.Thread(target=self.__cache_monitor)
            self.cache_monitor_thread.start()
            log.info("Cache cleaner manager started")
    
    def __cache_monitor(self):
        time.sleep(2) # Wait for things to load before starting the monitor
        while self.running:
            total_size = 0
            # Is this going to be too expensive of an operation to be done frequently?
            file_list = []
            for dirpath, dirnames, filenames in os.walk(self.staging_path):
                for f in filenames:
                    fp = os.path.join(dirpath, f)
                    file_size = os.path.getsize(fp)
                    total_size += file_size
                    # Get the time given file was last accessed
                    last_access_time = time.localtime(os.stat(fp)[7])
                    # Compose a tuple of the access time and the file path
                    file_tuple = last_access_time, fp, file_size
                    file_list.append(file_tuple)
            # Sort the file list (based on access time)
            file_list.sort()
            # Initiate cleaning once within 10% of the defined cache size?
            cache_limit = self.cache_size * 0.9
            if total_size > cache_limit:
                log.info("Initiating cache cleaning: current cache size: %s; clean until smaller than: %s" \
                    % (convert_bytes(total_size), convert_bytes(cache_limit)))
                # How much to delete? Deleting only down to the 90% limit is
                # likely to trigger cleaning frequently and may risk hitting
                # the limit again - maybe delete an additional percentage?
                # For now, delete enough to leave at least 10% of the total cache free
                delete_this_much = total_size - cache_limit
                self.__clean_cache(file_list, delete_this_much)
            self.sleeper.sleep(30) # Test cache size every 30 seconds?
    
    def __clean_cache(self, file_list, delete_this_much):
        """ Keep deleting files from the file_list until the size of the deleted
        files exceeds the value of the delete_this_much parameter.
        
        :type file_list: list
        :param file_list: List of candidate files that can be deleted. This method
            will start deleting files from the beginning of the list so the list
            should be sorted accordingly. The list must contain 3-element tuples,
            positioned as follows: position 0 holds the file's last accessed timestamp
            (as time.struct_time), position 1 holds the file path, and position 2 holds
            the file size (e.g., (<access time>, '/mnt/data/dataset_1.dat', 472394)).
        
        :type delete_this_much: int
        :param delete_this_much: Total size of files, in bytes, that should be deleted.
        """
        # Keep deleting datasets from file_list until deleted_amount does not
        # exceed delete_this_much; start deleting from the front of the file list,
        # which assumes the oldest files come first on the list.
        deleted_amount = 0
        for i, f in enumerate(file_list):
            if deleted_amount < delete_this_much:
                deleted_amount += f[2]
                os.remove(f[1])
                # Debugging code for printing deleted files' stats
                # folder, file_name = os.path.split(f[1])
                # file_date = time.strftime("%m/%d/%y %H:%M:%S", f[0])
                # log.debug("%s. %-25s %s, size %s (deleted %s/%s)" \
                #     % (i, file_name, convert_bytes(f[2]), file_date, \
                #     convert_bytes(deleted_amount), convert_bytes(delete_this_much)))
            else:
                log.debug("Cache cleaning done. Total space freed: %s" % convert_bytes(deleted_amount))
                return
    
    def _get_bucket(self, bucket_name):
        """ Sometimes a handle to a bucket is not established right away so try
        it a few times. Raise an error if a connection cannot be established. """
        for i in range(5):
            try:
                bucket = self.s3_conn.get_bucket(bucket_name)
                log.debug("Using S3 object store; got bucket '%s'" % bucket.name)
                return bucket
            except S3ResponseError: 
                log.debug("Could not get bucket '%s', attempt %s/5" % (bucket_name, i+1))
                time.sleep(2)
        # All the attempts have been exhausted and a connection was not
        # established; raise an error (boto's S3ResponseError requires a
        # status and a reason).
        raise S3ResponseError(404, "Could not get bucket '%s' after 5 attempts" % bucket_name)
    
    def _fix_permissions(self, rel_path):
        """ Set permissions on rel_path"""
        for basedir, dirs, files in os.walk(rel_path):
            util.umask_fix_perms(basedir, self.config.umask, 0777, self.config.gid)
            for f in files:
                path = os.path.join(basedir, f)
                # Ignore symlinks
                if os.path.islink(path):
                    continue 
                util.umask_fix_perms( path, self.config.umask, 0666, self.config.gid )
    
    def _construct_path(self, obj, dir_only=None, extra_dir=None, extra_dir_at_root=False, alt_name=None):
        rel_path = os.path.join(*directory_hash_id(obj.id))
        if extra_dir is not None:
            if extra_dir_at_root:
                rel_path = os.path.join(extra_dir, rel_path)
            else:
                rel_path = os.path.join(rel_path, extra_dir)
        # S3 folders are marked by having trailing '/' so add it now
        rel_path = '%s/' % rel_path
        if not dir_only:
            rel_path = os.path.join(rel_path, alt_name if alt_name else "dataset_%s.dat" % obj.id)
        return rel_path

    def _get_cache_path(self, rel_path):
        return os.path.abspath(os.path.join(self.staging_path, rel_path))
    
    def _get_transfer_progress(self):
        return self.transfer_progress
    
    def _get_size_in_s3(self, rel_path):
        try:
            key = self.bucket.get_key(rel_path)
            if key:
                return key.size
        except S3ResponseError, ex:
            log.error("Could not get size of key '%s' from S3: %s" % (rel_path, ex))
        except Exception, ex:
            log.error("Could not get reference to the key object '%s'; returning -1 for key size: %s" % (rel_path, ex))
        return -1
Exemple #23
0
class JobHandlerQueue( object ):
    """
    Job manager, waits for jobs to be runnable and then dispatches to
    a JobRunner.
    """
    STOP_SIGNAL = object()
    def __init__( self, app, dispatcher ):
        """Start the job manager"""
        self.app = app
        self.dispatcher = dispatcher

        self.sa_session = app.model.context
        self.track_jobs_in_database = self.app.config.track_jobs_in_database

        # Keep track of the pid that started the job manager, only it
        # has valid threads
        self.parent_pid = os.getpid()
        # Contains new jobs. Note this is not used if track_jobs_in_database is True
        self.queue = Queue()
        # Contains jobs that are waiting (only use from monitor thread)
        ## This and jobs_to_check[] are closest to a "Job Queue"
        self.waiting_jobs = []
        # Helper for interruptable sleep
        self.sleeper = Sleeper()
        self.running = True
        self.monitor_thread = threading.Thread( target=self.__monitor )

    def start( self ):
        """
        The JobManager should start, and then start its Handler, if it has one.
        """
        # Recover jobs at startup
        self.__check_jobs_at_startup()
        # Start the queue
        self.monitor_thread.start()
        log.info( "job handler queue started" )

    def __check_jobs_at_startup( self ):
        """
        Checks all jobs that are in the 'new', 'queued' or 'running' state in
        the database and requeues or cleans up as necessary.  Only run as the
        job manager starts.
        """
        for job in self.sa_session.query( model.Job ).enable_eagerloads( False ) \
                                  .filter( ( ( model.Job.state == model.Job.states.NEW ) \
                                             | ( model.Job.state == model.Job.states.RUNNING ) \
                                             | ( model.Job.state == model.Job.states.QUEUED ) ) \
                                           & ( model.Job.handler == self.app.config.server_name ) ):
            if job.tool_id not in self.app.toolbox.tools_by_id:
                log.warning( "(%s) Tool '%s' removed from tool config, unable to recover job" % ( job.id, job.tool_id ) )
                JobWrapper( job, self ).fail( 'This tool was disabled before the job completed.  Please contact your Galaxy administrator.' )
            elif job.job_runner_name is None:
                log.debug( "(%s) No job runner assigned and job still in '%s' state, adding to the job handler queue" % ( job.id, job.state ) )
                if self.track_jobs_in_database:
                    job.state = model.Job.states.NEW
                else:
                    self.queue.put( ( job.id, job.tool_id ) )
            else:
                job_wrapper = JobWrapper( job, self )
                self.dispatcher.recover( job, job_wrapper )
        if self.sa_session.dirty:
            self.sa_session.flush()

    def __monitor( self ):
        """
        Continually iterate the waiting jobs, checking if each is ready to
        run and dispatching it if so.
        """
        while self.running:
            try:
                self.__monitor_step()
            except:
                log.exception( "Exception in monitor_step" )
            # Sleep
            self.sleeper.sleep( 1 )

    def __monitor_step( self ):
        """
        Called repeatedly by `monitor` to process waiting jobs. Gets any new
        jobs (either from the database or from its own queue), then iterates
        over all new and waiting jobs to check the state of the jobs each
        depends on. If the job has dependencies that have not finished, it
        goes to the waiting queue. If the job has dependencies with errors,
        it is marked as having errors and removed from the queue. Otherwise,
        the job is dispatched.
        """
        # Pull all new jobs from the queue at once
        jobs_to_check = []
        if self.track_jobs_in_database:
            # Clear the session so we get fresh states for job and all datasets
            self.sa_session.expunge_all()
            # Fetch all new jobs
            jobs_to_check = self.sa_session.query( model.Job ).enable_eagerloads( False ) \
                                .filter( ( model.Job.state == model.Job.states.NEW ) \
                                         & ( model.Job.handler == self.app.config.server_name ) ).all()
        else:
            # Get job objects and append to watch queue for any which were
            # previously waiting
            for job_id in self.waiting_jobs:
                jobs_to_check.append( self.sa_session.query( model.Job ).get( job_id ) )
            try:
                while 1:
                    message = self.queue.get_nowait()
                    if message is self.STOP_SIGNAL:
                        return
                    # Unpack the message
                    job_id, tool_id = message
                    # Get the job object and append to watch queue
                    jobs_to_check.append( self.sa_session.query( model.Job ).get( job_id ) )
            except Empty:
                pass
        # Iterate over new and waiting jobs and look for any that are
        # ready to run
        new_waiting_jobs = []
        for job in jobs_to_check:
            try:
                # Check the job's dependencies, requeue if they're not done
                job_state = self.__check_if_ready_to_run( job )
                if job_state == JOB_WAIT:
                    if not self.track_jobs_in_database:
                        new_waiting_jobs.append( job.id )
                elif job_state == JOB_INPUT_ERROR:
                    log.info( "(%d) Job unable to run: one or more inputs in error state" % job.id )
                elif job_state == JOB_INPUT_DELETED:
                    log.info( "(%d) Job unable to run: one or more inputs deleted" % job.id )
                elif job_state == JOB_READY:
                    self.dispatcher.put( JobWrapper( job, self ) )
                    log.info( "(%d) Job dispatched" % job.id )
                elif job_state == JOB_DELETED:
                    log.info( "(%d) Job deleted by user while still queued" % job.id )
                elif job_state == JOB_ADMIN_DELETED:
                    log.info( "(%d) Job deleted by admin while still queued" % job.id )
                else:
                    log.error( "(%d) Job in unknown state '%s'" % ( job.id, job_state ) )
                    if not self.track_jobs_in_database:
                        new_waiting_jobs.append( job.id )
            except Exception:
                log.exception( "failure running job %d" % job.id )
        # Update the waiting list
        self.waiting_jobs = new_waiting_jobs
        # Done with the session
        self.sa_session.remove()

    def __check_if_ready_to_run( self, job ):
        """
        Check if a job is ready to run by verifying that each of its input
        datasets is ready (specifically in the OK state). If any input dataset
        has an error, fail the job and return JOB_INPUT_ERROR. If any input
        dataset is deleted, fail the job and return JOB_INPUT_DELETED.  If all
        input datasets are in OK state, return JOB_READY indicating that the
        job can be dispatched. Otherwise, return JOB_WAIT indicating that input
        datasets are still being prepared.
        """
        if job.state == model.Job.states.DELETED:
            return JOB_DELETED
        elif job.state == model.Job.states.ERROR:
            return JOB_ADMIN_DELETED
        elif self.app.config.enable_quotas:
            quota = self.app.quota_agent.get_quota( job.user )
            if quota is not None:
                try:
                    usage = self.app.quota_agent.get_usage( user=job.user, history=job.history )
                    if usage > quota:
                        return JOB_WAIT
                except AssertionError, e:
                    pass # No history, should not happen with an anon user
        for dataset_assoc in job.input_datasets + job.input_library_datasets:
            idata = dataset_assoc.dataset
            if not idata:
                continue
            # don't run jobs for which the input dataset was deleted
            if idata.deleted:
                JobWrapper( job, self ).fail( "input data %s (file: %s) was deleted before the job started" % ( idata.hid, idata.file_name ) )
                return JOB_INPUT_DELETED
            # an error in the input data causes us to bail immediately
            elif idata.state == idata.states.ERROR:
                JobWrapper( job, self ).fail( "input data %s is in error state" % ( idata.hid ) )
                return JOB_INPUT_ERROR
            elif idata.state == idata.states.FAILED_METADATA:
                JobWrapper( job, self ).fail( "input data %s failed to properly set metadata" % ( idata.hid ) )
                return JOB_INPUT_ERROR
            elif idata.state != idata.states.OK and not ( idata.state == idata.states.SETTING_METADATA and job.tool_id is not None and job.tool_id == self.app.datatypes_registry.set_external_metadata_tool.id ):
                # need to requeue
                return JOB_WAIT
        return self.__check_user_jobs( job )
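
`__check_if_ready_to_run` collapses the states of a job's inputs into a single verdict drawn from module-level `JOB_*` markers (`JOB_WAIT`, `JOB_READY`, `JOB_INPUT_ERROR`, ...), which the monitor loop then dispatches on. A simplified sketch of that readiness check follows; the `check_inputs` helper and the sentinel values are illustrative, since Galaxy defines its own `JOB_*` constants.

# Sentinel states, compared by identity in the monitor's dispatch chain
# (a sketch; Galaxy defines its own module-level JOB_* constants).
JOB_WAIT = object()
JOB_READY = object()
JOB_INPUT_ERROR = object()
JOB_INPUT_DELETED = object()

def check_inputs(input_states):
    """Collapse a job's input-dataset states into one readiness verdict.

    `input_states` is a list of (state, deleted) pairs, one per input;
    state is 'ok', 'error', 'failed_metadata', or anything else meaning
    the input is still being prepared.
    """
    for state, deleted in input_states:
        if deleted:
            return JOB_INPUT_DELETED     # never run against deleted inputs
        if state in ('error', 'failed_metadata'):
            return JOB_INPUT_ERROR       # bail immediately on broken inputs
        if state != 'ok':
            return JOB_WAIT              # still being prepared: requeue
    return JOB_READY

assert check_inputs([('ok', False), ('running', False)]) is JOB_WAIT
assert check_inputs([('ok', False), ('ok', False)]) is JOB_READY
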
Exemple #24
0
class S3ObjectStore(ObjectStore):
    """
    Object store that stores objects as items in an AWS S3 bucket. A local
    cache exists that is used as an intermediate location for files between
    Galaxy and S3.
    """
    def __init__(self, config):
        super(S3ObjectStore, self).__init__()
        self.config = config
        self.staging_path = self.config.file_path
        self.s3_conn = get_OS_connection(self.config)
        self.bucket = self._get_bucket(self.config.os_bucket_name)
        self.use_rr = self.config.os_use_reduced_redundancy
        self.cache_size = self.config.object_store_cache_size
        self.transfer_progress = 0
        # Clean cache only if value is set in universe_wsgi.ini
        if self.cache_size != -1:
            # Convert GBs to bytes for comparison
            self.cache_size = self.cache_size * 1073741824
            # Helper for interruptable sleep
            self.sleeper = Sleeper()
            self.cache_monitor_thread = threading.Thread(target=self.__cache_monitor)
            self.cache_monitor_thread.start()
            log.info("Cache cleaner manager started")
        # Test if 'axel' is available for parallel download and pull the key into cache
        try:
            subprocess.call('axel')
            self.use_axel = True
        except OSError:
            self.use_axel = False

    def __cache_monitor(self):
        time.sleep(2) # Wait for things to load before starting the monitor
        while self.running:
            total_size = 0
            # Is this going to be too expensive of an operation to be done frequently?
            file_list = []
            for dirpath, dirnames, filenames in os.walk(self.staging_path):
                for f in filenames:
                    fp = os.path.join(dirpath, f)
                    file_size = os.path.getsize(fp)
                    total_size += file_size
                    # Get the time given file was last accessed
                    last_access_time = time.localtime(os.stat(fp)[7])
                    # Compose a tuple of the access time and the file path
                    file_tuple = last_access_time, fp, file_size
                    file_list.append(file_tuple)
            # Sort the file list (based on access time)
            file_list.sort()
            # Initiate cleaning once within 10% of the defined cache size?
            cache_limit = self.cache_size * 0.9
            if total_size > cache_limit:
                log.info("Initiating cache cleaning: current cache size: %s; clean until smaller than: %s" \
                    % (convert_bytes(total_size), convert_bytes(cache_limit)))
                # How much to delete? Deleting only down to the 90% limit is
                # likely to trigger cleaning frequently and may risk hitting
                # the limit again - maybe delete an additional percentage?
                # For now, delete enough to leave at least 10% of the total cache free
                delete_this_much = total_size - cache_limit
                self.__clean_cache(file_list, delete_this_much)
            self.sleeper.sleep(30) # Test cache size every 30 seconds?

    def __clean_cache(self, file_list, delete_this_much):
        """ Keep deleting files from the file_list until the size of the deleted
        files exceeds the value of the delete_this_much parameter.

        :type file_list: list
        :param file_list: List of candidate files that can be deleted. This method
            will start deleting files from the beginning of the list so the list
            should be sorted accordingly. The list must contain 3-element tuples,
            positioned as follows: position 0 holds the file's last accessed timestamp
            (as time.struct_time), position 1 holds the file path, and position 2 holds
            the file size (e.g., (<access time>, '/mnt/data/dataset_1.dat', 472394)).

        :type delete_this_much: int
        :param delete_this_much: Total size of files, in bytes, that should be deleted.
        """
        # Keep deleting datasets from file_list until deleted_amount does not
        # exceed delete_this_much; start deleting from the front of the file list,
        # which assumes the oldest files come first on the list.
        deleted_amount = 0
        for i, f in enumerate(file_list):
            if deleted_amount < delete_this_much:
                deleted_amount += f[2]
                os.remove(f[1])
                # Debugging code for printing deleted files' stats
                # folder, file_name = os.path.split(f[1])
                # file_date = time.strftime("%m/%d/%y %H:%M:%S", f[0])
                # log.debug("%s. %-25s %s, size %s (deleted %s/%s)" \
                #     % (i, file_name, convert_bytes(f[2]), file_date, \
                #     convert_bytes(deleted_amount), convert_bytes(delete_this_much)))
            else:
                log.debug("Cache cleaning done. Total space freed: %s" % convert_bytes(deleted_amount))
                return

    def _get_bucket(self, bucket_name):
        """ Sometimes a handle to a bucket is not established right away so try
        it a few times. Raise an error if a connection cannot be established. """
        for i in range(5):
            try:
                bucket = self.s3_conn.get_bucket(bucket_name)
                log.debug("Using cloud object store with bucket '%s'" % bucket.name)
                return bucket
            except S3ResponseError:
                log.debug("Could not get bucket '%s', attempt %s/5" % (bucket_name, i+1))
                time.sleep(2)
        # All the attempts have been exhausted and a connection was not
        # established; raise an error (boto's S3ResponseError requires a
        # status and a reason).
        raise S3ResponseError(404, "Could not get bucket '%s' after 5 attempts" % bucket_name)

    def _fix_permissions(self, rel_path):
        """ Set permissions on rel_path"""
        for basedir, dirs, files in os.walk(rel_path):
            util.umask_fix_perms(basedir, self.config.umask, 0777, self.config.gid)
            for f in files:
                path = os.path.join(basedir, f)
                # Ignore symlinks
                if os.path.islink(path):
                    continue
                util.umask_fix_perms( path, self.config.umask, 0666, self.config.gid )

    def _construct_path(self, obj, dir_only=None, extra_dir=None, extra_dir_at_root=False, alt_name=None, **kwargs):
        rel_path = os.path.join(*directory_hash_id(obj.id))
        if extra_dir is not None:
            if extra_dir_at_root:
                rel_path = os.path.join(extra_dir, rel_path)
            else:
                rel_path = os.path.join(rel_path, extra_dir)
        # S3 folders are marked by having trailing '/' so add it now
        rel_path = '%s/' % rel_path
        if not dir_only:
            rel_path = os.path.join(rel_path, alt_name if alt_name else "dataset_%s.dat" % obj.id)
        return rel_path

    def _get_cache_path(self, rel_path):
        return os.path.abspath(os.path.join(self.staging_path, rel_path))

    def _get_transfer_progress(self):
        return self.transfer_progress

    def _get_size_in_s3(self, rel_path):
        try:
            key = self.bucket.get_key(rel_path)
            if key:
                return key.size
        except S3ResponseError, ex:
            log.error("Could not get size of key '%s' from S3: %s" % (rel_path, ex))
        except Exception, ex:
            log.error("Could not get reference to the key object '%s'; returning -1 for key size: %s" % (rel_path, ex))
        return -1