Example #1
    def __filesystem_monitor(self, sleeper: Sleeper):
        while self.running:
            new_weighted_backend_ids = self.original_weighted_backend_ids
            for id, backend in self.backends.items():
                maxpct = self.max_percent_full[id] or self.global_max_percent_full
                pct = backend.get_store_usage_percent()
                if pct > maxpct:
                    new_weighted_backend_ids = [_ for _ in new_weighted_backend_ids if _ != id]
            self.weighted_backend_ids = new_weighted_backend_ids
            sleeper.sleep(120)  # Test free space every 2 minutes
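The Sleeper object used by this monitor is not part of the excerpt; it provides an interruptible sleep so that a shutdown can wake the monitor thread immediately instead of waiting out the full two-minute interval. A minimal sketch of such a helper, assuming a threading.Condition-based design (the real implementation may differ):

import threading

class Sleeper(object):
    """Sleep for up to `seconds` unless wake() is called from another thread."""

    def __init__(self):
        self.condition = threading.Condition()

    def sleep(self, seconds):
        with self.condition:
            # wait() returns early if another thread calls notify()
            self.condition.wait(seconds)

    def wake(self):
        with self.condition:
            self.condition.notify()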
Example #2
class DistributedObjectStore(NestedObjectStore):

    """
    ObjectStore that defers to a list of backends.

    When getting objects the first store where the object exists is used.
    When creating objects they are created in a store selected randomly, but
    with weighting.
    """

    def __init__(self, config, config_xml=None, fsmon=False):
        """
        :type config: object
        :param config: An object, most likely populated from
            `galaxy/config.ini`, having the same attributes needed by
            :class:`NestedObjectStore` plus:

            * distributed_object_store_config_file

        :type config_xml: ElementTree

        :type fsmon: bool
        :param fsmon: If True, monitor the file system for free space,
            removing backends when they get too full.
        """
        super(DistributedObjectStore, self).__init__(config,
                                                     config_xml=config_xml)
        if config_xml is None:
            self.distributed_config = config.distributed_object_store_config_file
            assert self.distributed_config is not None, \
                "distributed object store ('object_store = distributed') " \
                "requires a config file, please set one in " \
                "'distributed_object_store_config_file')"
        self.backends = {}
        self.weighted_backend_ids = []
        self.original_weighted_backend_ids = []
        self.max_percent_full = {}
        self.global_max_percent_full = 0.0
        random.seed()
        self.__parse_distributed_config(config, config_xml)
        self.sleeper = None
        if fsmon and (self.global_max_percent_full or
                      [_ for _ in self.max_percent_full.values() if _ != 0.0]):
            self.sleeper = Sleeper()
            self.filesystem_monitor_thread = threading.Thread(target=self.__filesystem_monitor)
            self.filesystem_monitor_thread.setDaemon( True )
            self.filesystem_monitor_thread.start()
            log.info("Filesystem space monitor started")

    def __parse_distributed_config(self, config, config_xml=None):
        if config_xml is None:
            root = ElementTree.parse(self.distributed_config).getroot()
            log.debug('Loading backends for distributed object store from %s', self.distributed_config)
        else:
            root = config_xml.find('backends')
            log.debug('Loading backends for distributed object store from %s', config_xml.get('id'))
        self.global_max_percent_full = float(root.get('maxpctfull', 0))
        for elem in [ e for e in root if e.tag == 'backend' ]:
            id = elem.get('id')
            weight = int(elem.get('weight', 1))
            maxpctfull = float(elem.get('maxpctfull', 0))
            if elem.get('type', 'disk') == 'disk':
                path = None
                extra_dirs = {}
                for sub in elem:
                    if sub.tag == 'files_dir':
                        path = sub.get('path')
                    elif sub.tag == 'extra_dir':
                        type = sub.get('type')
                        extra_dirs[type] = sub.get('path')
                self.backends[id] = DiskObjectStore(config, file_path=path, extra_dirs=extra_dirs)
                self.max_percent_full[id] = maxpctfull
                log.debug("Loaded disk backend '%s' with weight %s and file_path: %s" % (id, weight, path))
                if extra_dirs:
                    log.debug("    Extra directories:")
                    for type, dir in extra_dirs.items():
                        log.debug("        %s: %s" % (type, dir))
            for _ in range(0, weight):
                # The simplest way to do weighting: add backend ids to a
                # sequence the number of times equalling weight, then randomly
                # choose a backend from that sequence at creation
                self.weighted_backend_ids.append(id)
        self.original_weighted_backend_ids = self.weighted_backend_ids

    def shutdown(self):
        """Shut down. Kill the free space monitor if there is one."""
        super(DistributedObjectStore, self).shutdown()
        if self.sleeper is not None:
            self.sleeper.wake()

    def __filesystem_monitor(self):
        while self.running:
            new_weighted_backend_ids = self.original_weighted_backend_ids
            for id, backend in self.backends.items():
                maxpct = self.max_percent_full[id] or self.global_max_percent_full
                pct = backend.get_store_usage_percent()
                if pct > maxpct:
                    new_weighted_backend_ids = [_ for _ in new_weighted_backend_ids if _ != id]
            self.weighted_backend_ids = new_weighted_backend_ids
            self.sleeper.sleep(120)  # Test free space every 2 minutes

    def create(self, obj, **kwargs):
        """The only method in which obj.object_store_id may be None."""
        if obj.object_store_id is None or not self.exists(obj, **kwargs):
            if obj.object_store_id is None or obj.object_store_id not in self.weighted_backend_ids:
                try:
                    obj.object_store_id = random.choice(self.weighted_backend_ids)
                except IndexError:
                    raise ObjectInvalid('objectstore.create, could not generate '
                                        'obj.object_store_id: %s, kwargs: %s'
                                        % ( str( obj ), str( kwargs ) ) )
                _create_object_in_session( obj )
                log.debug("Selected backend '%s' for creation of %s %s"
                          % (obj.object_store_id, obj.__class__.__name__, obj.id))
            else:
                log.debug("Using preferred backend '%s' for creation of %s %s"
                          % (obj.object_store_id, obj.__class__.__name__, obj.id))
            self.backends[obj.object_store_id].create(obj, **kwargs)

    def _call_method(self, method, obj, default, default_is_exception, **kwargs):
        object_store_id = self.__get_store_id_for(obj, **kwargs)
        if object_store_id is not None:
            return getattr(self.backends[object_store_id], method)(obj, **kwargs)
        if default_is_exception:
            raise default('objectstore, _call_method failed: %s on %s, kwargs: %s'
                          % ( method, str( obj ), str( kwargs ) ) )
        else:
            return default

    def __get_store_id_for(self, obj, **kwargs):
        if obj.object_store_id is not None and obj.object_store_id in self.backends:
            return obj.object_store_id
        else:
            # if this instance has been switched from a non-distributed to a
            # distributed object store, or if the object's store id is invalid,
            # try to locate the object
            log.warning('The backend object store ID (%s) for %s object with ID %s is invalid'
                        % (obj.object_store_id, obj.__class__.__name__, obj.id))
            for id, store in self.backends.items():
                if store.exists(obj, **kwargs):
                    log.warning('%s object with ID %s found in backend object store with ID %s'
                                % (obj.__class__.__name__, obj.id, id))
                    obj.object_store_id = id
                    _create_object_in_session( obj )
                    return id
        return None
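The weighting scheme used by __parse_distributed_config and create is deliberately simple: each backend id is appended to the candidate list as many times as its weight, so random.choice over that list yields weighted random selection. A self-contained sketch of the effect (ids and weights are illustrative):

import random
from collections import Counter

# Hypothetical backends: 'fast' should receive ~3x as many objects as 'slow'.
weights = {'fast': 3, 'slow': 1}
weighted_ids = [backend_id for backend_id, w in weights.items() for _ in range(w)]

picks = Counter(random.choice(weighted_ids) for _ in range(100000))
print(picks)  # roughly Counter({'fast': 75000, 'slow': 25000})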
Example #3
class JobHandlerStopQueue( object ):
    """
    A queue for jobs which need to be terminated prematurely.
    """
    STOP_SIGNAL = object()

    def __init__( self, app, dispatcher ):
        self.app = app
        self.dispatcher = dispatcher

        self.sa_session = app.model.context

        # Keep track of the pid that started the job manager, only it
        # has valid threads
        self.parent_pid = os.getpid()
        # Contains new jobs. Note this is not used if track_jobs_in_database is True
        self.queue = Queue()

        # Contains jobs that are waiting (only use from monitor thread)
        self.waiting = []

        # Helper for interruptable sleep
        self.sleeper = Sleeper()
        self.running = True
        self.monitor_thread = threading.Thread( name="JobHandlerStopQueue.monitor_thread", target=self.monitor )
        self.monitor_thread.setDaemon( True )
        self.monitor_thread.start()
        log.info( "job handler stop queue started" )

    def monitor( self ):
        """
        Continually iterate the waiting jobs, stopping any that are found.
        """
        # HACK: Delay until after forking, we need a way to do post fork notification!!!
        time.sleep( 10 )
        while self.running:
            try:
                self.monitor_step()
            except Exception:
                log.exception( "Exception in monitor_step" )
            # Sleep
            self.sleeper.sleep( 1 )

    def monitor_step( self ):
        """
        Called repeatedly by `monitor` to stop jobs.
        """
        # Pull all new jobs from the queue at once
        jobs_to_check = []
        if self.app.config.track_jobs_in_database:
            # Clear the session so we get fresh states for job and all datasets
            self.sa_session.expunge_all()
            # Fetch all new jobs
            newly_deleted_jobs = self.sa_session.query( model.Job ).enable_eagerloads( False ) \
                                     .filter( ( model.Job.state == model.Job.states.DELETED_NEW ) \
                                              & ( model.Job.handler == self.app.config.server_name ) ).all()
            for job in newly_deleted_jobs:
                jobs_to_check.append( ( job, job.stderr ) )
        # Also pull from the queue (in the case of Administrative stopped jobs)
        try:
            while 1:
                message = self.queue.get_nowait()
                if message is self.STOP_SIGNAL:
                    return
                # Unpack the message
                job_id, error_msg = message
                # Get the job object and append to watch queue
                jobs_to_check.append( ( self.sa_session.query( model.Job ).get( job_id ), error_msg ) )
        except Empty:
            pass
        for job, error_msg in jobs_to_check:
            if error_msg is not None:
                job.state = job.states.ERROR
                job.info = error_msg
            else:
                job.state = job.states.DELETED
            self.sa_session.add( job )
            self.sa_session.flush()
            if job.job_runner_name is not None:
                # tell the dispatcher to stop the job
                self.dispatcher.stop( job )

    def put( self, job_id, error_msg=None ):
        self.queue.put( ( job_id, error_msg ) )

    def shutdown( self ):
        """Attempts to gracefully shut down the worker thread"""
        if self.parent_pid != os.getpid():
            # We're not the real job queue, do nothing
            return
        else:
            log.info( "sending stop signal to worker thread" )
            self.running = False
            if not self.app.config.track_jobs_in_database:
                self.queue.put( self.STOP_SIGNAL )
            self.sleeper.wake()
            log.info( "job handler stop queue stopped" )
Example #4
class JobHandlerQueue( object ):
    """
    Job manager, waits for jobs to be runnable and then dispatches to
    a JobRunner.
    """
    STOP_SIGNAL = object()

    def __init__( self, app, dispatcher ):
        """Start the job manager"""
        self.app = app
        self.dispatcher = dispatcher

        self.sa_session = app.model.context
        self.track_jobs_in_database = self.app.config.track_jobs_in_database

        # Initialize structures for handling job limits
        self.__clear_user_job_count()

        # Keep track of the pid that started the job manager, only it
        # has valid threads
        self.parent_pid = os.getpid()
        # Contains new jobs. Note this is not used if track_jobs_in_database is True
        self.queue = Queue()
        # Contains jobs that are waiting (only use from monitor thread)
        self.waiting_jobs = []
        # Contains wrappers of jobs that are limited or ready (so they aren't created unnecessarily/multiple times)
        self.job_wrappers = {}
        # Helper for interruptable sleep
        self.sleeper = Sleeper()
        self.running = True
        self.monitor_thread = threading.Thread( name="JobHandlerQueue.monitor_thread", target=self.__monitor )
        self.monitor_thread.setDaemon( True )

    def start( self ):
        """
        The JobManager should start, and then start its Handler, if it has one.
        """
        # Recover jobs at startup
        self.__check_jobs_at_startup()
        # Start the queue
        self.monitor_thread.start()
        log.info( "job handler queue started" )

    def __check_jobs_at_startup( self ):
        """
        Checks all jobs that are in the 'new', 'queued' or 'running' state in
        the database and requeues or cleans up as necessary.  Only run as the
        job handler starts.
        If user activation is enforced, the jobs of inactive users are filtered out.
        """
        jobs_at_startup = []
        if self.app.config.user_activation_on:
            jobs_at_startup = self.sa_session.query( model.Job ).enable_eagerloads( False ) \
                              .outerjoin( model.User ) \
                              .filter( ( ( model.Job.state == model.Job.states.NEW ) \
                                         | ( model.Job.state == model.Job.states.RUNNING ) \
                                         | ( model.Job.state == model.Job.states.QUEUED ) ) \
                                       & ( model.Job.handler == self.app.config.server_name ) \
                                       & or_( ( model.Job.user_id == None ), ( model.User.active == True ) ) ).all()
        else:
            jobs_at_startup = self.sa_session.query( model.Job ).enable_eagerloads( False ) \
                              .filter( ( ( model.Job.state == model.Job.states.NEW ) \
                                         | ( model.Job.state == model.Job.states.RUNNING ) \
                                         | ( model.Job.state == model.Job.states.QUEUED ) ) \
                                       & ( model.Job.handler == self.app.config.server_name ) ).all()

        for job in jobs_at_startup:
            if job.tool_id not in self.app.toolbox.tools_by_id:
                log.warning( "(%s) Tool '%s' removed from tool config, unable to recover job" % ( job.id, job.tool_id ) )
                JobWrapper( job, self ).fail( 'This tool was disabled before the job completed.  Please contact your Galaxy administrator.' )
            if job.job_runner_name is not None and job.job_runner_external_id is None:
                # This could happen during certain revisions of Galaxy where a runner URL was persisted before the job was dispatched to a runner.
                log.debug( "(%s) Job runner assigned but no external ID recorded, adding to the job handler queue" % job.id )
                job.job_runner_name = None
                if self.track_jobs_in_database:
                    job.state = model.Job.states.NEW
                else:
                    self.queue.put( ( job.id, job.tool_id ) )
            elif job.job_runner_name is not None and job.job_runner_external_id is not None and job.destination_id is None:
                # This is the first start after upgrading from URLs to destinations, convert the URL to a destination and persist
                # TODO: test me extensively
                job_wrapper = JobWrapper( job, self )
                job_destination = self.dispatcher.url_to_destination(job.job_runner_name)
                if job_destination.id is None:
                    job_destination.id = 'legacy_url'
                job_wrapper.set_job_destination(job_destination, job.job_runner_external_id)
                self.dispatcher.recover( job, job_wrapper )
                log.info('(%s) Converted job from a URL to a destination and recovered' % (job.id))
            elif job.job_runner_name is None:
                # Never (fully) dispatched
                log.debug( "(%s) No job runner assigned and job still in '%s' state, adding to the job handler queue" % ( job.id, job.state ) )
                if self.track_jobs_in_database:
                    job.state = model.Job.states.NEW
                else:
                    self.queue.put( ( job.id, job.tool_id ) )
            else:
                # Already dispatched and running
                job_wrapper = JobWrapper( job, self )
                job_wrapper.job_runner_mapper.cached_job_destination = JobDestination(id=job.destination_id, runner=job.job_runner_name, params=job.destination_params)
                self.dispatcher.recover( job, job_wrapper )
        if self.sa_session.dirty:
            self.sa_session.flush()

    def __monitor( self ):
        """
        Continually iterate the waiting jobs, checking if each is ready to
        run and dispatching if so.
        """
        while self.running:
            try:
                self.__monitor_step()
            except Exception:
                log.exception( "Exception in monitor_step" )
            # Sleep
            self.sleeper.sleep( 1 )

    def __monitor_step( self ):
        """
        Called repeatedly by `monitor` to process waiting jobs. Gets any new
        jobs (either from the database or from its own queue), then iterates
        over all new and waiting jobs to check the state of the jobs each
        depends on. If the job has dependencies that have not finished, it
        goes to the waiting queue. If the job has dependencies with errors,
        it is marked as having errors and removed from the queue. If the job
        belongs to an inactive user it is ignored.
        Otherwise, the job is dispatched.
        """
        # Pull all new jobs from the queue at once
        jobs_to_check = []
        if self.track_jobs_in_database:
            # Clear the session so we get fresh states for job and all datasets
            self.sa_session.expunge_all()
            # Fetch all new jobs
            hda_not_ready = self.sa_session.query(model.Job.id).enable_eagerloads(False) \
                    .join(model.JobToInputDatasetAssociation) \
                    .join(model.HistoryDatasetAssociation) \
                    .join(model.Dataset) \
                    .filter(and_((model.Job.state == model.Job.states.NEW),
                                 or_((model.HistoryDatasetAssociation._state == model.HistoryDatasetAssociation.states.FAILED_METADATA),
                                     (model.HistoryDatasetAssociation.deleted == True ),
                                     (model.Dataset.state != model.Dataset.states.OK ),
                                     (model.Dataset.deleted == True)))).subquery()
            ldda_not_ready = self.sa_session.query(model.Job.id).enable_eagerloads(False) \
                    .join(model.JobToInputLibraryDatasetAssociation) \
                    .join(model.LibraryDatasetDatasetAssociation) \
                    .join(model.Dataset) \
                    .filter(and_((model.Job.state == model.Job.states.NEW),
                                 or_((model.LibraryDatasetDatasetAssociation._state != None),
                                     (model.LibraryDatasetDatasetAssociation.deleted == True),
                                     (model.Dataset.state != model.Dataset.states.OK),
                                     (model.Dataset.deleted == True)))).subquery()
            if self.app.config.user_activation_on:
                jobs_to_check = self.sa_session.query(model.Job).enable_eagerloads(False) \
                        .outerjoin( model.User ) \
                        .filter(and_((model.Job.state == model.Job.states.NEW),
                                    or_((model.Job.user_id == None),(model.User.active == True)),
                                     (model.Job.handler == self.app.config.server_name),
                                     ~model.Job.table.c.id.in_(hda_not_ready),
                                     ~model.Job.table.c.id.in_(ldda_not_ready))) \
                        .order_by(model.Job.id).all()
            else:
                jobs_to_check = self.sa_session.query(model.Job).enable_eagerloads(False) \
                    .filter(and_((model.Job.state == model.Job.states.NEW),
                                 (model.Job.handler == self.app.config.server_name),
                                 ~model.Job.table.c.id.in_(hda_not_ready),
                                 ~model.Job.table.c.id.in_(ldda_not_ready))) \
                    .order_by(model.Job.id).all()
        else:
            # Get job objects and append to watch queue for any which were
            # previously waiting
            for job_id in self.waiting_jobs:
                jobs_to_check.append( self.sa_session.query( model.Job ).get( job_id ) )
            try:
                while 1:
                    message = self.queue.get_nowait()
                    if message is self.STOP_SIGNAL:
                        return
                    # Unpack the message
                    job_id, tool_id = message
                    # Get the job object and append to watch queue
                    jobs_to_check.append( self.sa_session.query( model.Job ).get( job_id ) )
            except Empty:
                pass
        # Ensure that we get new job counts on each iteration
        self.__clear_user_job_count()
        # Iterate over new and waiting jobs and look for any that are
        # ready to run
        new_waiting_jobs = []
        for job in jobs_to_check:
            try:
                # Check the job's dependencies, requeue if they're not done.
                # Some of these states will only happen when using the in-memory job queue
                job_state = self.__check_if_ready_to_run( job )
                if job_state == JOB_WAIT:
                    new_waiting_jobs.append( job.id )
                elif job_state == JOB_INPUT_ERROR:
                    log.info( "(%d) Job unable to run: one or more inputs in error state" % job.id )
                elif job_state == JOB_INPUT_DELETED:
                    log.info( "(%d) Job unable to run: one or more inputs deleted" % job.id )
                elif job_state == JOB_READY:
                    self.dispatcher.put( self.job_wrappers.pop( job.id ) )
                    log.info( "(%d) Job dispatched" % job.id )
                elif job_state == JOB_DELETED:
                    log.info( "(%d) Job deleted by user while still queued" % job.id )
                elif job_state == JOB_ADMIN_DELETED:
                    log.info( "(%d) Job deleted by admin while still queued" % job.id )
                elif job_state == JOB_USER_OVER_QUOTA:
                    log.info( "(%d) User (%s) is over quota: job paused" % ( job.id, job.user_id ) )
                    job.state = model.Job.states.PAUSED
                    for dataset_assoc in job.output_datasets + job.output_library_datasets:
                        dataset_assoc.dataset.dataset.state = model.Dataset.states.PAUSED
                        dataset_assoc.dataset.info = "Execution of this dataset's job is paused because you were over your disk quota at the time it was ready to run"
                        self.sa_session.add( dataset_assoc.dataset.dataset )
                    self.sa_session.add( job )
                elif job_state == JOB_ERROR:
                    log.error( "(%d) Error checking job readiness" % job.id )
                else:
                    log.error( "(%d) Job in unknown state '%s'" % ( job.id, job_state ) )
                    new_waiting_jobs.append( job.id )
            except Exception:
                log.exception( "failure running job %d" % job.id )
        # Update the waiting list
        if not self.track_jobs_in_database:
            self.waiting_jobs = new_waiting_jobs
        # Remove cached wrappers for any jobs that are no longer being tracked
        for id in self.job_wrappers.keys():
            if id not in new_waiting_jobs:
                del self.job_wrappers[id]
        # Flush, if we updated the state
        self.sa_session.flush()
        # Done with the session
        self.sa_session.remove()

    def __check_if_ready_to_run( self, job ):
        """
        Check if a job is ready to run by verifying that each of its input
        datasets is ready (specifically in the OK state). If any input dataset
        has an error, fail the job and return JOB_INPUT_ERROR. If any input
        dataset is deleted, fail the job and return JOB_INPUT_DELETED.  If all
        input datasets are in OK state, return JOB_READY indicating that the
        job can be dispatched. Otherwise, return JOB_WAIT indicating that input
        datasets are still being prepared.
        """
        # If tracking in the database, job.state is guaranteed to be NEW and the inputs are guaranteed to be OK
        if not self.track_jobs_in_database:
            if job.state == model.Job.states.DELETED:
                return JOB_DELETED
            elif job.state == model.Job.states.ERROR:
                return JOB_ADMIN_DELETED
            for dataset_assoc in job.input_datasets + job.input_library_datasets:
                idata = dataset_assoc.dataset
                if not idata:
                    continue
                # don't run jobs for which the input dataset was deleted
                if idata.deleted:
                    self.job_wrappers.pop(job.id, JobWrapper( job, self )).fail( "input data %s (file: %s) was deleted before the job started" % ( idata.hid, idata.file_name ) )
                    return JOB_INPUT_DELETED
                # an error in the input data causes us to bail immediately
                elif idata.state == idata.states.ERROR:
                    self.job_wrappers.pop(job.id, JobWrapper( job, self )).fail( "input data %s is in error state" % ( idata.hid ) )
                    return JOB_INPUT_ERROR
                elif idata.state == idata.states.FAILED_METADATA:
                    self.job_wrappers.pop(job.id, JobWrapper( job, self )).fail( "input data %s failed to properly set metadata" % ( idata.hid ) )
                    return JOB_INPUT_ERROR
                elif idata.state != idata.states.OK and not ( idata.state == idata.states.SETTING_METADATA and job.tool_id is not None and job.tool_id == self.app.datatypes_registry.set_external_metadata_tool.id ):
                    # need to requeue
                    return JOB_WAIT
        # Create the job wrapper so that the destination can be set
        if job.id not in self.job_wrappers:
            self.job_wrappers[job.id] = JobWrapper(job, self)
        # Cause the job_destination to be set and cached by the mapper
        try:
            self.job_wrappers[job.id].job_destination
        except Exception, e:
            failure_message = getattr(e, 'failure_message', DEFAULT_JOB_PUT_FAILURE_MESSAGE )
            if failure_message == DEFAULT_JOB_PUT_FAILURE_MESSAGE:
                log.exception( 'Failed to generate job destination' )
            else:
                log.debug( "Intentionally failing job with message (%s)" % failure_message )
            self.job_wrappers[job.id].fail( failure_message )
            return JOB_ERROR
        # job is ready to run, check limits
        state = self.__check_user_jobs( job, self.job_wrappers[job.id] )
        if state == JOB_READY and self.app.config.enable_quotas:
            quota = self.app.quota_agent.get_quota( job.user )
            if quota is not None:
                try:
                    usage = self.app.quota_agent.get_usage( user=job.user, history=job.history )
                    if usage > quota:
                        return JOB_USER_OVER_QUOTA
                except AssertionError:
                    pass  # No history, should not happen with an anon user
        return state
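The JOB_* values returned by __check_if_ready_to_run and dispatched on in __monitor_step are module-level marker constants that are not part of this excerpt. Any set of distinct values works since they are only compared by equality; a sketch of plausible definitions (the exact strings are assumptions):

# Marker constants consumed by __monitor_step; only distinctness matters.
JOB_WAIT = 'wait'                        # requeue, inputs not ready yet
JOB_ERROR = 'error'                      # the readiness check itself failed
JOB_INPUT_ERROR = 'input_error'          # an input dataset is in error state
JOB_INPUT_DELETED = 'input_deleted'      # an input dataset was deleted
JOB_READY = 'ready'                      # dispatch the job to a runner
JOB_DELETED = 'deleted'                  # user deleted the job while queued
JOB_ADMIN_DELETED = 'admin_deleted'      # admin deleted the job while queued
JOB_USER_OVER_QUOTA = 'user_over_quota'  # pause the job until quota frees up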
Example #5
class S3ObjectStore(ObjectStore):
    """
    Object store that stores objects as items in an AWS S3 bucket. A local
    cache exists that is used as an intermediate location for files between
    Galaxy and S3.
    """
    def __init__(self, config, config_xml):
        if boto is None:
            raise Exception(NO_BOTO_ERROR_MESSAGE)
        super(S3ObjectStore, self).__init__(config)
        self.staging_path = self.config.file_path
        self.transfer_progress = 0
        self._parse_config_xml(config_xml)
        self._configure_connection()
        self.bucket = self._get_bucket(self.bucket)
        # Clean cache only if value is set in galaxy.ini
        if self.cache_size != -1:
            # Convert GBs to bytes for comparison
            self.cache_size = self.cache_size * 1073741824
            # Helper for interruptable sleep
            self.sleeper = Sleeper()
            self.cache_monitor_thread = threading.Thread(target=self.__cache_monitor)
            self.cache_monitor_thread.start()
            log.info("Cache cleaner manager started")
        # Test if 'axel' is available for parallel download and pull the key into cache
        try:
            subprocess.call('axel')
            self.use_axel = True
        except OSError:
            self.use_axel = False

    def _configure_connection(self):
        log.debug("Configuring S3 Connection")
        self.conn = S3Connection(self.access_key, self.secret_key)

    def _parse_config_xml(self, config_xml):
        try:
            a_xml = config_xml.findall('auth')[0]
            self.access_key = a_xml.get('access_key')
            self.secret_key = a_xml.get('secret_key')
            b_xml = config_xml.findall('bucket')[0]
            self.bucket = b_xml.get('name')
            self.use_rr = string_as_bool(b_xml.get('use_reduced_redundancy', "False"))
            self.max_chunk_size = int(b_xml.get('max_chunk_size', 250))
            cn_xml = config_xml.findall('connection')
            if not cn_xml:
                cn_xml = {}
            else:
                cn_xml = cn_xml[0]
            self.host = cn_xml.get('host', None)
            self.port = int(cn_xml.get('port', 6000))
            self.multipart = string_as_bool(cn_xml.get('multipart', 'True'))
            self.is_secure = string_as_bool(cn_xml.get('is_secure', 'True'))
            self.conn_path = cn_xml.get('conn_path', '/')
            c_xml = config_xml.findall('cache')[0]
            self.cache_size = float(c_xml.get('size', -1))
            self.staging_path = c_xml.get('path', self.config.object_store_cache_path)

            for d_xml in config_xml.findall('extra_dir'):
                self.extra_dirs[d_xml.get('type')] = d_xml.get('path')

            log.debug("Object cache dir:    %s", self.staging_path)
            log.debug("       job work dir: %s", self.extra_dirs['job_work'])

            # for multipart upload
            self.s3server = {'access_key': self.access_key,
                             'secret_key': self.secret_key,
                             'is_secure': self.is_secure,
                             'max_chunk_size': self.max_chunk_size,
                             'host': self.host,
                             'port': self.port,
                             'use_rr': self.use_rr,
                             'conn_path': self.conn_path}
        except Exception:
            # Toss it back up after logging, we can't continue loading at this point.
            log.exception("Malformed ObjectStore Configuration XML -- unable to continue")
            raise

    def __cache_monitor(self):
        time.sleep(2)  # Wait for things to load before starting the monitor
        while self.running:
            total_size = 0
            # Is this going to be too expensive of an operation to be done frequently?
            file_list = []
            for dirpath, _, filenames in os.walk(self.staging_path):
                for filename in filenames:
                    filepath = os.path.join(dirpath, filename)
                    file_size = os.path.getsize(filepath)
                    total_size += file_size
                    # Get the time given file was last accessed
                    last_access_time = time.localtime(os.stat(filepath)[7])
                    # Compose a tuple of the access time and the file path
                    file_tuple = last_access_time, filepath, file_size
                    file_list.append(file_tuple)
            # Sort the file list (based on access time)
            file_list.sort()
            # Initiate cleaning once within 10% of the defined cache size?
            cache_limit = self.cache_size * 0.9
            if total_size > cache_limit:
                log.info("Initiating cache cleaning: current cache size: %s; clean until smaller than: %s",
                         convert_bytes(total_size), convert_bytes(cache_limit))
                # How much to delete? If simply deleting up to the cache-10% limit,
                # is likely to be deleting frequently and may run the risk of hitting
                # the limit - maybe delete additional #%?
                # For now, delete enough to leave at least 10% of the total cache free
                delete_this_much = total_size - cache_limit
                self.__clean_cache(file_list, delete_this_much)
            self.sleeper.sleep(30)  # Test cache size every 30 seconds?

    def __clean_cache(self, file_list, delete_this_much):
        """ Keep deleting files from the file_list until the size of the deleted
        files is greater than the value in delete_this_much parameter.

        :type file_list: list
        :param file_list: List of candidate files that can be deleted. This method
            will start deleting files from the beginning of the list so the list
            should be sorted accordingly. The list must contains 3-element tuples,
            positioned as follows: position 0 holds file last accessed timestamp
            (as time.struct_time), position 1 holds file path, and position 2 has
            file size (e.g., (<access time>, /mnt/data/dataset_1.dat), 472394)

        :type delete_this_much: int
        :param delete_this_much: Total size of files, in bytes, that should be deleted.
        """
        # Keep deleting datasets from file_list until deleted_amount meets or
        # exceeds delete_this_much; start deleting from the front of the file
        # list, which assumes the oldest files come first on the list.
        deleted_amount = 0
        for i, f in enumerate(file_list):
            if deleted_amount < delete_this_much:
                deleted_amount += f[2]
                os.remove(f[1])
                # Debugging code for printing deleted files' stats
                # folder, file_name = os.path.split(f[1])
                # file_date = time.strftime("%m/%d/%y %H:%M:%S", f[0])
                # log.debug("%s. %-25s %s, size %s (deleted %s/%s)" \
                #     % (i, file_name, convert_bytes(f[2]), file_date, \
                #     convert_bytes(deleted_amount), convert_bytes(delete_this_much)))
            else:
                log.debug("Cache cleaning done. Total space freed: %s", convert_bytes(deleted_amount))
                return

    def _get_bucket(self, bucket_name):
        """ Sometimes a handle to a bucket is not established right away so try
        it a few times. Raise error is connection is not established. """
        for i in range(5):
            try:
                bucket = self.conn.get_bucket(bucket_name)
                log.debug("Using cloud object store with bucket '%s'", bucket.name)
                return bucket
            except S3ResponseError:
                try:
                    log.debug("Bucket not found, creating s3 bucket with handle '%s'", bucket_name)
                    self.conn.create_bucket(bucket_name)
                except S3ResponseError:
                    log.exception("Could not get bucket '%s', attempt %s/5", bucket_name, i + 1)
                    time.sleep(2)
        # All the attempts have been exhausted and connection was not established,
        # raise error (S3ResponseError requires a status and a reason)
        raise S3ResponseError(500, "Could not get or create bucket '%s'" % bucket_name)

    def _fix_permissions(self, rel_path):
        """ Set permissions on rel_path"""
        for basedir, _, files in os.walk(rel_path):
            umask_fix_perms(basedir, self.config.umask, 0o777, self.config.gid)
            for filename in files:
                path = os.path.join(basedir, filename)
                # Ignore symlinks
                if os.path.islink(path):
                    continue
                umask_fix_perms( path, self.config.umask, 0o666, self.config.gid )

    def _construct_path(self, obj, base_dir=None, dir_only=None, extra_dir=None, extra_dir_at_root=False, alt_name=None, obj_dir=False, **kwargs):
        # extra_dir should never be constructed from provided data but just
        # make sure there are no shenannigans afoot
        if extra_dir and extra_dir != os.path.normpath(extra_dir):
            log.warning('extra_dir is not normalized: %s', extra_dir)
            raise ObjectInvalid("The requested object is invalid")
        # ensure that any parent directory references in alt_name would not
        # result in a path not contained in the directory path constructed here
        if alt_name:
            if not safe_relpath(alt_name):
                log.warning('alt_name would locate path outside dir: %s', alt_name)
                raise ObjectInvalid("The requested object is invalid")
            # alt_name can contain parent directory references, but S3 will not
            # follow them, so if they are valid we normalize them out
            alt_name = os.path.normpath(alt_name)
        rel_path = os.path.join(*directory_hash_id(obj.id))
        if extra_dir is not None:
            if extra_dir_at_root:
                rel_path = os.path.join(extra_dir, rel_path)
            else:
                rel_path = os.path.join(rel_path, extra_dir)

        # for JOB_WORK directory
        if obj_dir:
            rel_path = os.path.join(rel_path, str(obj.id))
        if base_dir:
            base = self.extra_dirs.get(base_dir)
            return os.path.join(base, rel_path)

        # S3 folders are marked by having trailing '/' so add it now
        rel_path = '%s/' % rel_path

        if not dir_only:
            rel_path = os.path.join(rel_path, alt_name if alt_name else "dataset_%s.dat" % obj.id)
        return rel_path

    def _get_cache_path(self, rel_path):
        return os.path.abspath(os.path.join(self.staging_path, rel_path))

    def _get_transfer_progress(self):
        return self.transfer_progress

    def _get_size_in_s3(self, rel_path):
        try:
            key = self.bucket.get_key(rel_path)
            if key:
                return key.size
        except S3ResponseError:
            log.exception("Could not get size of key '%s' from S3", rel_path)
            return -1

    def _key_exists(self, rel_path):
        exists = False
        try:
            # A hackish way of testing if the rel_path is a folder vs a file
            is_dir = rel_path[-1] == '/'
            if is_dir:
                keyresult = self.bucket.get_all_keys(prefix=rel_path)
                exists = len(keyresult) > 0
            else:
                key = Key(self.bucket, rel_path)
                exists = key.exists()
        except S3ResponseError:
            log.exception("Trouble checking existence of S3 key '%s'", rel_path)
            return False
        if rel_path[0] == '/':
            raise ValueError("rel_path must be relative: %s" % rel_path)
        return exists

    def _in_cache(self, rel_path):
        """ Check if the given dataset is in the local cache and return True if so. """
        # log.debug("------ Checking cache for rel_path %s" % rel_path)
        cache_path = self._get_cache_path(rel_path)
        return os.path.exists(cache_path)
        # TODO: Part of checking if a file is in cache should be to ensure the
        # size of the cached file matches that on S3. Once the upload tool
        # explicitly creates the dataset, this check should be implemented; in
        # the meantime, it's not looking likely to be implementable reliably.
        # if os.path.exists(cache_path):
        #     # print "***1 %s exists" % cache_path
        #     if self._key_exists(rel_path):
        #         # print "***2 %s exists in S3" % rel_path
        #         # Make sure the size in cache is available in its entirety
        #         # print "File '%s' cache size: %s, S3 size: %s" % (cache_path, os.path.getsize(cache_path), self._get_size_in_s3(rel_path))
        #         if os.path.getsize(cache_path) == self._get_size_in_s3(rel_path):
        #             # print "***2.1 %s exists in S3 and the size is the same as in cache (in_cache=True)" % rel_path
        #             exists = True
        #         else:
        #             # print "***2.2 %s exists but differs in size from cache (in_cache=False)" % cache_path
        #             exists = False
        #     else:
        #         # Although not perfect decision making, this most likely means
        #         # that the file is currently being uploaded
        #         # print "***3 %s found in cache but not in S3 (in_cache=True)" % cache_path
        #         exists = True
        # else:
        #     return False

    def _pull_into_cache(self, rel_path):
        # Ensure the cache directory structure exists (e.g., dataset_#_files/)
        rel_path_dir = os.path.dirname(rel_path)
        if not os.path.exists(self._get_cache_path(rel_path_dir)):
            os.makedirs(self._get_cache_path(rel_path_dir))
        # Now pull in the file
        file_ok = self._download(rel_path)
        self._fix_permissions(self._get_cache_path(rel_path_dir))
        return file_ok

    def _transfer_cb(self, complete, total):
        self.transfer_progress += 10

    def _download(self, rel_path):
        try:
            log.debug("Pulling key '%s' into cache to %s", rel_path, self._get_cache_path(rel_path))
            key = self.bucket.get_key(rel_path)
            # Test if cache is large enough to hold the new file
            if self.cache_size > 0 and key.size > self.cache_size:
                log.critical("File %s is larger (%s) than the cache size (%s). Cannot download.",
                             rel_path, key.size, self.cache_size)
                return False
            if self.use_axel:
                log.debug("Parallel pulled key '%s' into cache to %s", rel_path, self._get_cache_path(rel_path))
                ncores = multiprocessing.cpu_count()
                url = key.generate_url(7200)
                ret_code = subprocess.call("axel -a -n %s '%s'" % (ncores, url))
                if ret_code == 0:
                    return True
            else:
                log.debug("Pulled key '%s' into cache to %s", rel_path, self._get_cache_path(rel_path))
                self.transfer_progress = 0  # Reset transfer progress counter
                key.get_contents_to_filename(self._get_cache_path(rel_path), cb=self._transfer_cb, num_cb=10)
                return True
        except S3ResponseError:
            log.exception("Problem downloading key '%s' from S3 bucket '%s'", rel_path, self.bucket.name)
        return False

    def _push_to_os(self, rel_path, source_file=None, from_string=None):
        """
        Push the file pointed to by ``rel_path`` to the object store naming the key
        ``rel_path``. If ``source_file`` is provided, push that file instead while
        still using ``rel_path`` as the key name.
        If ``from_string`` is provided, set contents of the file to the value of
        the string.
        """
        try:
            source_file = source_file if source_file else self._get_cache_path(rel_path)
            if os.path.exists(source_file):
                key = Key(self.bucket, rel_path)
                if os.path.getsize(source_file) == 0 and key.exists():
                    log.debug("Wanted to push file '%s' to S3 key '%s' but its size is 0; skipping.", source_file, rel_path)
                    return True
                if from_string:
                    key.set_contents_from_string(from_string, reduced_redundancy=self.use_rr)
                    log.debug("Pushed data from string '%s' to key '%s'", from_string, rel_path)
                else:
                    start_time = datetime.now()
                    log.debug("Pushing cache file '%s' of size %s bytes to key '%s'", source_file, os.path.getsize(source_file), rel_path)
                    mb_size = os.path.getsize(source_file) / 1e6
                    if mb_size < 10 or (not self.multipart):
                        self.transfer_progress = 0  # Reset transfer progress counter
                        key.set_contents_from_filename(source_file,
                                                       reduced_redundancy=self.use_rr,
                                                       cb=self._transfer_cb,
                                                       num_cb=10)
                    else:
                        multipart_upload(self.s3server, self.bucket, key.name, source_file, mb_size)
                    end_time = datetime.now()
                    log.debug("Pushed cache file '%s' to key '%s' (%s bytes transfered in %s sec)",
                              source_file, rel_path, os.path.getsize(source_file), end_time - start_time)
                return True
            else:
                log.error("Tried updating key '%s' from source file '%s', but source file does not exist.",
                          rel_path, source_file)
        except S3ResponseError:
            log.exception("Trouble pushing S3 key '%s' from file '%s'", rel_path, source_file)
        return False

    def file_ready(self, obj, **kwargs):
        """
        A helper method that checks if a file corresponding to a dataset is
        ready and available to be used. Return ``True`` if so, ``False`` otherwise.
        """
        rel_path = self._construct_path(obj, **kwargs)
        # Make sure the size in cache is available in its entirety
        if self._in_cache(rel_path):
            if os.path.getsize(self._get_cache_path(rel_path)) == self._get_size_in_s3(rel_path):
                return True
            log.debug("Waiting for dataset %s to transfer from OS: %s/%s", rel_path,
                      os.path.getsize(self._get_cache_path(rel_path)), self._get_size_in_s3(rel_path))
        return False

    def exists(self, obj, **kwargs):
        in_cache = in_s3 = False
        rel_path = self._construct_path(obj, **kwargs)

        # Check cache
        if self._in_cache(rel_path):
            in_cache = True
        # Check S3
        in_s3 = self._key_exists(rel_path)
        # log.debug("~~~~~~ File '%s' exists in cache: %s; in s3: %s" % (rel_path, in_cache, in_s3))
        # dir_only does not get synced so shortcut the decision
        dir_only = kwargs.get('dir_only', False)
        base_dir = kwargs.get('base_dir', None)
        if dir_only:
            if in_cache or in_s3:
                return True
            # for JOB_WORK directory
            elif base_dir:
                if not os.path.exists(rel_path):
                    os.makedirs(rel_path)
                return True
            else:
                return False

        # TODO: Sync should probably not be done here. Add this to an async upload stack?
        if in_cache and not in_s3:
            self._push_to_os(rel_path, source_file=self._get_cache_path(rel_path))
            return True
        elif in_s3:
            return True
        else:
            return False

    def create(self, obj, **kwargs):
        if not self.exists(obj, **kwargs):

            # Pull out locally used fields
            extra_dir = kwargs.get('extra_dir', None)
            extra_dir_at_root = kwargs.get('extra_dir_at_root', False)
            dir_only = kwargs.get('dir_only', False)
            alt_name = kwargs.get('alt_name', None)

            # Construct hashed path
            rel_path = os.path.join(*directory_hash_id(obj.id))

            # Optionally append extra_dir
            if extra_dir is not None:
                if extra_dir_at_root:
                    rel_path = os.path.join(extra_dir, rel_path)
                else:
                    rel_path = os.path.join(rel_path, extra_dir)

            # Create given directory in cache
            cache_dir = os.path.join(self.staging_path, rel_path)
            if not os.path.exists(cache_dir):
                os.makedirs(cache_dir)

            # Although not really necessary to create S3 folders (because S3 has
            # flat namespace), do so for consistency with the regular file system
            # S3 folders are marked by having trailing '/' so add it now
            # s3_dir = '%s/' % rel_path
            # self._push_to_os(s3_dir, from_string='')
            # If instructed, create the dataset in cache & in S3
            if not dir_only:
                rel_path = os.path.join(rel_path, alt_name if alt_name else "dataset_%s.dat" % obj.id)
                open(os.path.join(self.staging_path, rel_path), 'w').close()
                self._push_to_os(rel_path, from_string='')

    def empty(self, obj, **kwargs):
        if self.exists(obj, **kwargs):
            return bool(self.size(obj, **kwargs) == 0)
        else:
            raise ObjectNotFound( 'objectstore.empty, object does not exist: %s, kwargs: %s'
                                  % ( str( obj ), str( kwargs ) ) )

    def size(self, obj, **kwargs):
        rel_path = self._construct_path(obj, **kwargs)
        if self._in_cache(rel_path):
            try:
                return os.path.getsize(self._get_cache_path(rel_path))
            except OSError as ex:
                log.info("Could not get size of file '%s' in local cache, will try S3. Error: %s", rel_path, ex)
        if self.exists(obj, **kwargs):
            return self._get_size_in_s3(rel_path)
        log.warning("Did not find dataset '%s', returning 0 for size", rel_path)
        return 0

    def delete(self, obj, entire_dir=False, **kwargs):
        rel_path = self._construct_path(obj, **kwargs)
        extra_dir = kwargs.get('extra_dir', None)
        base_dir = kwargs.get('base_dir', None)
        dir_only = kwargs.get('dir_only', False)
        obj_dir = kwargs.get('obj_dir', False)
        try:
            # Remove temporary data in JOB_WORK directory
            if base_dir and dir_only and obj_dir:
                shutil.rmtree(os.path.abspath(rel_path))
                return True

            # For the case of extra_files, because we don't have a reference to
            # individual files/keys we need to remove the entire directory structure
            # with all the files in it. This is easy for the local file system,
            # but requires iterating through each individual key in S3 and deleting it.
            if entire_dir and extra_dir:
                shutil.rmtree(self._get_cache_path(rel_path))
                results = self.bucket.get_all_keys(prefix=rel_path)
                for key in results:
                    log.debug("Deleting key %s", key.name)
                    key.delete()
                return True
            else:
                # Delete from cache first
                os.unlink(self._get_cache_path(rel_path))
                # Delete from S3 as well
                if self._key_exists(rel_path):
                    key = Key(self.bucket, rel_path)
                    log.debug("Deleting key %s", key.name)
                    key.delete()
                    return True
        except S3ResponseError:
            log.exception("Could not delete key '%s' from S3", rel_path)
        except OSError:
            log.exception('%s delete error', self.get_filename(obj, **kwargs))
        return False

    def get_data(self, obj, start=0, count=-1, **kwargs):
        rel_path = self._construct_path(obj, **kwargs)
        # Check cache first and get file if not there
        if not self._in_cache(rel_path):
            self._pull_into_cache(rel_path)
        # Read the file content from cache
        data_file = open(self._get_cache_path(rel_path), 'r')
        data_file.seek(start)
        content = data_file.read(count)
        data_file.close()
        return content

    def get_filename(self, obj, **kwargs):
        base_dir = kwargs.get('base_dir', None)
        dir_only = kwargs.get('dir_only', False)
        obj_dir = kwargs.get('obj_dir', False)
        rel_path = self._construct_path(obj, **kwargs)

        # for JOB_WORK directory
        if base_dir and dir_only and obj_dir:
            return os.path.abspath(rel_path)

        cache_path = self._get_cache_path(rel_path)
        # S3 does not recognize directories as files so cannot check if those exist.
        # So, if checking dir only, ensure given dir exists in cache and return
        # the expected cache path.
        # dir_only = kwargs.get('dir_only', False)
        # if dir_only:
        #     if not os.path.exists(cache_path):
        #         os.makedirs(cache_path)
        #     return cache_path
        # Check if the file exists in the cache first
        if self._in_cache(rel_path):
            return cache_path
        # Check if the file exists in persistent storage and, if it does, pull it into cache
        elif self.exists(obj, **kwargs):
            if dir_only:  # Directories do not get pulled into cache
                return cache_path
            else:
                if self._pull_into_cache(rel_path):
                    return cache_path
        # For the case of retrieving a directory only, return the expected path
        # even if it does not exist.
        # if dir_only:
        #     return cache_path
        raise ObjectNotFound( 'objectstore.get_filename, no cache_path: %s, kwargs: %s'
                              % ( str( obj ), str( kwargs ) ) )
        # return cache_path # Until the upload tool does not explicitly create the dataset, return expected path

    def update_from_file(self, obj, file_name=None, create=False, **kwargs):
        if create:
            self.create(obj, **kwargs)
        if self.exists(obj, **kwargs):
            rel_path = self._construct_path(obj, **kwargs)
            # Choose whether to use the dataset file itself or an alternate file
            if file_name:
                source_file = os.path.abspath(file_name)
                # Copy into cache
                cache_file = self._get_cache_path(rel_path)
                try:
                    if source_file != cache_file:
                        # FIXME? Should this be a `move`?
                        shutil.copy2(source_file, cache_file)
                    self._fix_permissions(cache_file)
                except OSError:
                    log.exception("Trouble copying source file '%s' to cache '%s'", source_file, cache_file)
            else:
                source_file = self._get_cache_path(rel_path)
            # Update the file on S3
            self._push_to_os(rel_path, source_file)
        else:
            raise ObjectNotFound( 'objectstore.update_from_file, object does not exist: %s, kwargs: %s'
                                  % ( str( obj ), str( kwargs ) ) )

    def get_object_url(self, obj, **kwargs):
        if self.exists(obj, **kwargs):
            rel_path = self._construct_path(obj, **kwargs)
            try:
                key = Key(self.bucket, rel_path)
                return key.generate_url(expires_in=86400)  # 24hrs
            except S3ResponseError:
                log.exception("Trouble generating URL for dataset '%s'", rel_path)
        return None

    def get_store_usage_percent(self):
        return 0.0
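The cache monitor above implements a least-recently-accessed eviction policy: walk the cache directory, sort files by access time, and delete from the oldest end until enough space is freed. The same idea as a standalone sketch (the path and threshold in the usage comment are hypothetical):

import os

def evict_lru(cache_dir, bytes_to_free):
    """Delete least-recently-accessed files under cache_dir until at
    least bytes_to_free bytes have been removed; return bytes freed."""
    candidates = []
    for dirpath, _, filenames in os.walk(cache_dir):
        for filename in filenames:
            path = os.path.join(dirpath, filename)
            st = os.stat(path)
            candidates.append((st.st_atime, path, st.st_size))
    candidates.sort()  # oldest access time first
    freed = 0
    for _, path, size in candidates:
        if freed >= bytes_to_free:
            break
        os.remove(path)
        freed += size
    return freed

# e.g. evict_lru('/tmp/galaxy_cache', 500 * 1024 * 1024)  # free ~500 MB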
Example #6
class DistributedObjectStore(NestedObjectStore):
    """
    ObjectStore that defers to a list of backends.

    When getting objects the first store where the object exists is used.
    When creating objects they are created in a store selected randomly, but
    with weighting.
    """
    store_type = 'distributed'

    def __init__(self, config, config_dict, fsmon=False):
        """
        :type config: object
        :param config: An object, most likely populated from
            `galaxy/config.ini`, having the same attributes needed by
            :class:`NestedObjectStore` plus:

            * distributed_object_store_config_file

        :type config_dict: dict
        :param config_dict: Dictionary of backend definitions, as produced by
            :meth:`parse_xml`.

        :type fsmon: bool
        :param fsmon: If True, monitor the file system for free space,
            removing backends when they get too full.
        """
        super(DistributedObjectStore, self).__init__(config, config_dict)

        self.backends = {}
        self.weighted_backend_ids = []
        self.original_weighted_backend_ids = []
        self.max_percent_full = {}
        self.global_max_percent_full = config_dict.get(
            "global_max_percent_full", 0)
        random.seed()

        for backend_def in config_dict["backends"]:
            backend_id = backend_def["id"]
            maxpctfull = backend_def.get("max_percent_full", 0)
            weight = backend_def["weight"]

            backend = build_object_store_from_config(config,
                                                     config_dict=backend_def,
                                                     fsmon=fsmon)

            self.backends[backend_id] = backend
            self.max_percent_full[backend_id] = maxpctfull

            for _ in range(0, weight):
                # The simplest way to do weighting: add backend ids to a
                # sequence the number of times equalling weight, then randomly
                # choose a backend from that sequence at creation
                self.weighted_backend_ids.append(backend_id)

        self.original_weighted_backend_ids = self.weighted_backend_ids

        self.sleeper = None
        if fsmon and (self.global_max_percent_full or
                      [_ for _ in self.max_percent_full.values() if _ != 0.0]):
            self.sleeper = Sleeper()
            self.filesystem_monitor_thread = threading.Thread(
                target=self.__filesystem_monitor)
            self.filesystem_monitor_thread.setDaemon(True)
            self.filesystem_monitor_thread.start()
            log.info("Filesystem space monitor started")

    @classmethod
    def parse_xml(clazz, config_xml, legacy=False):
        if legacy:
            backends_root = config_xml
        else:
            backends_root = config_xml.find('backends')

        backends = []
        config_dict = {
            'global_max_percent_full':
            float(backends_root.get('maxpctfull', 0)),
            'backends': backends,
        }

        for b in [e for e in backends_root if e.tag == 'backend']:
            store_id = b.get("id")
            store_weight = int(b.get("weight", 1))
            store_maxpctfull = float(b.get('maxpctfull', 0))
            store_type = b.get("type", "disk")
            store_by = b.get('store_by', None)

            objectstore_class, _ = type_to_object_store_class(store_type)
            backend_config_dict = objectstore_class.parse_xml(b)
            backend_config_dict["id"] = store_id
            backend_config_dict["weight"] = store_weight
            backend_config_dict["max_percent_full"] = store_maxpctfull
            backend_config_dict["type"] = store_type
            if store_by is not None:
                backend_config_dict["store_by"] = store_by
            backends.append(backend_config_dict)

        return config_dict

    @classmethod
    def from_xml(clazz, config, config_xml, fsmon=False):
        legacy = False
        if config_xml is None:
            distributed_config = config.distributed_object_store_config_file
            assert distributed_config is not None, \
                "distributed object store ('object_store = distributed') " \
                "requires a config file, please set one in " \
                "'distributed_object_store_config_file')"

            log.debug('Loading backends for distributed object store from %s',
                      distributed_config)
            config_xml = parse_xml(distributed_config).getroot()
            legacy = True
        else:
            log.debug('Loading backends for distributed object store from %s',
                      config_xml.get('id'))

        config_dict = clazz.parse_xml(config_xml, legacy=legacy)
        return clazz(config, config_dict, fsmon=fsmon)

    def to_dict(self):
        as_dict = super(DistributedObjectStore, self).to_dict()
        as_dict["global_max_percent_full"] = self.global_max_percent_full
        backends = []
        for backend_id, backend in self.backends.items():
            backend_as_dict = backend.to_dict()
            backend_as_dict["id"] = backend_id
            backend_as_dict["max_percent_full"] = self.max_percent_full[
                backend_id]
            backend_as_dict["weight"] = len([
                i for i in self.original_weighted_backend_ids
                if i == backend_id
            ])
            backends.append(backend_as_dict)
        as_dict["backends"] = backends
        return as_dict

    def shutdown(self):
        """Shut down. Kill the free space monitor if there is one."""
        super(DistributedObjectStore, self).shutdown()
        if self.sleeper is not None:
            self.sleeper.wake()

    def __filesystem_monitor(self):
        while self.running:
            new_weighted_backend_ids = self.original_weighted_backend_ids
            for id, backend in self.backends.items():
                maxpct = self.max_percent_full[
                    id] or self.global_max_percent_full
                pct = backend.get_store_usage_percent()
                if pct > maxpct:
                    new_weighted_backend_ids = [
                        _ for _ in new_weighted_backend_ids if _ != id
                    ]
            self.weighted_backend_ids = new_weighted_backend_ids
            self.sleeper.sleep(120)  # Test free space every 2 minutes

    def _create(self, obj, **kwargs):
        """The only method in which obj.object_store_id may be None."""
        if obj.object_store_id is None or not self._exists(obj, **kwargs):
            if obj.object_store_id is None or obj.object_store_id not in self.backends:
                try:
                    obj.object_store_id = random.choice(
                        self.weighted_backend_ids)
                except IndexError:
                    raise ObjectInvalid(
                        'objectstore.create, could not generate '
                        'obj.object_store_id: %s, kwargs: %s' %
                        (str(obj), str(kwargs)))
                _create_object_in_session(obj)
                log.debug(
                    "Selected backend '%s' for creation of %s %s" %
                    (obj.object_store_id, obj.__class__.__name__, obj.id))
            else:
                log.debug(
                    "Using preferred backend '%s' for creation of %s %s" %
                    (obj.object_store_id, obj.__class__.__name__, obj.id))
            self.backends[obj.object_store_id].create(obj, **kwargs)

    def _call_method(self, method, obj, default, default_is_exception,
                     **kwargs):
        object_store_id = self.__get_store_id_for(obj, **kwargs)
        if object_store_id is not None:
            return getattr(self.backends[object_store_id], method)(
                obj, **kwargs)
        if default_is_exception:
            raise default(
                'objectstore, _call_method failed: %s on %s, kwargs: %s' %
                (method, self._repr_object_for_exception(obj), str(kwargs)))
        else:
            return default

    def __get_store_id_for(self, obj, **kwargs):
        if obj.object_store_id is not None:
            if obj.object_store_id in self.backends:
                return obj.object_store_id
            else:
                log.warning(
                    'The backend object store ID (%s) for %s object with ID %s is invalid'
                    % (obj.object_store_id, obj.__class__.__name__, obj.id))
        # if this instance has been switched from a non-distributed to a
        # distributed object store, or if the object's store id is invalid,
        # try to locate the object
        for id, store in self.backends.items():
            if store.exists(obj, **kwargs):
                log.warning(
                    '%s object with ID %s found in backend object store with ID %s'
                    % (obj.__class__.__name__, obj.id, id))
                obj.object_store_id = id
                _create_object_in_session(obj)
                return id
        return None
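
The weighting scheme in this example is simple: each backend id is appended to a list as many times as its weight, and each creation draws uniformly from that list. A small self-contained sketch of the effect, with hypothetical backend ids and weights:

import random

# Hypothetical weights: backend id -> relative share of new objects.
weights = {"files1": 3, "files2": 1}

# Expand ids into a sequence proportional to weight, as the store does...
weighted_ids = [bid for bid, w in weights.items() for _ in range(w)]

# ...then every creation picks uniformly from the expanded sequence, so
# "files1" receives roughly three quarters of new objects.
counts = {bid: 0 for bid in weights}
for _ in range(10000):
    counts[random.choice(weighted_ids)] += 1
print(counts)  # e.g. {'files1': 7521, 'files2': 2479}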
Example #7
class JobHandlerQueue( object ):
    """
    Job Handler's internal queue; this is what actually implements waiting for
    jobs to become runnable and dispatching them to a JobRunner.
    """
    STOP_SIGNAL = object()

    def __init__( self, app, dispatcher ):
        """Initializes the Job Handler Queue, creates (unstarted) monitoring thread"""
        self.app = app
        self.dispatcher = dispatcher

        self.sa_session = app.model.context
        self.track_jobs_in_database = self.app.config.track_jobs_in_database

        # Initialize structures for handling job limits
        self.__clear_job_count()

        # Keep track of the pid that started the job manager, only it
        # has valid threads
        self.parent_pid = os.getpid()
        # Contains new jobs. Note this is not used if track_jobs_in_database is True
        self.queue = Queue()
        # Contains jobs that are waiting (only use from monitor thread)
        self.waiting_jobs = []
        # Contains wrappers of jobs that are limited or ready (so they aren't created unnecessarily/multiple times)
        self.job_wrappers = {}
        # Helper for interruptable sleep
        self.sleeper = Sleeper()
        self.running = True
        self.monitor_thread = threading.Thread( name="JobHandlerQueue.monitor_thread", target=self.__monitor )
        self.monitor_thread.setDaemon( True )

    def start( self ):
        """
        Starts the JobHandler's thread after checking for any unhandled jobs.
        """
        # Recover jobs at startup
        self.__check_jobs_at_startup()
        # Start the queue
        self.monitor_thread.start()
        log.info( "job handler queue started" )

    def job_wrapper( self, job, use_persisted_destination=False ):
        return JobWrapper( job, self, use_persisted_destination=use_persisted_destination )

    def job_pair_for_id( self, id ):
        job = self.sa_session.query( model.Job ).get( id )
        return job, self.job_wrapper( job, use_persisted_destination=True )

    def __check_jobs_at_startup( self ):
        """
        Checks all jobs that are in the 'new', 'queued' or 'running' state in
        the database and requeues or cleans up as necessary.  Only run as the
        job handler starts.
        If user activation is enforced, jobs belonging to inactive users are
        filtered out.
        """
        jobs_at_startup = []
        if self.track_jobs_in_database:
            in_list = ( model.Job.states.QUEUED,
                        model.Job.states.RUNNING )
        else:
            in_list = ( model.Job.states.NEW,
                        model.Job.states.QUEUED,
                        model.Job.states.RUNNING )
        if self.app.config.user_activation_on:
            jobs_at_startup = self.sa_session.query( model.Job ).enable_eagerloads( False ) \
                .outerjoin( model.User ) \
                .filter( model.Job.state.in_( in_list ) &
                         ( model.Job.handler == self.app.config.server_name ) &
                         or_( ( model.Job.user_id == null() ), ( model.User.active == true() ) ) ).all()
        else:
            jobs_at_startup = self.sa_session.query( model.Job ).enable_eagerloads( False ) \
                .filter( model.Job.state.in_( in_list ) &
                         ( model.Job.handler == self.app.config.server_name ) ).all()

        for job in jobs_at_startup:
            if not self.app.toolbox.has_tool( job.tool_id, job.tool_version, exact=True ):
                log.warning( "(%s) Tool '%s' removed from tool config, unable to recover job" % ( job.id, job.tool_id ) )
                self.job_wrapper( job ).fail( 'This tool was disabled before the job completed.  Please contact your Galaxy administrator.' )
            elif job.job_runner_name is not None and job.job_runner_external_id is None:
                # This could happen during certain revisions of Galaxy where a runner URL was persisted before the job was dispatched to a runner.
                log.debug( "(%s) Job runner assigned but no external ID recorded, adding to the job handler queue" % job.id )
                job.job_runner_name = None
                if self.track_jobs_in_database:
                    job.set_state( model.Job.states.NEW )
                else:
                    self.queue.put( ( job.id, job.tool_id ) )
            elif job.job_runner_name is not None and job.job_runner_external_id is not None and job.destination_id is None:
                # This is the first start after upgrading from URLs to destinations, convert the URL to a destination and persist
                job_wrapper = self.job_wrapper( job )
                job_destination = self.dispatcher.url_to_destination(job.job_runner_name)
                if job_destination.id is None:
                    job_destination.id = 'legacy_url'
                job_wrapper.set_job_destination(job_destination, job.job_runner_external_id)
                self.dispatcher.recover( job, job_wrapper )
                log.info('(%s) Converted job from a URL to a destination and recovered' % (job.id))
            elif job.job_runner_name is None:
                # Never (fully) dispatched
                log.debug( "(%s) No job runner assigned and job still in '%s' state, adding to the job handler queue" % ( job.id, job.state ) )
                if self.track_jobs_in_database:
                    job.set_state( model.Job.states.NEW )
                else:
                    self.queue.put( ( job.id, job.tool_id ) )
            else:
                # Already dispatched and running
                job_wrapper = self.__recover_job_wrapper( job )
                self.dispatcher.recover( job, job_wrapper )
        if self.sa_session.dirty:
            self.sa_session.flush()

    def __recover_job_wrapper(self, job):
        # Already dispatched and running
        job_wrapper = self.job_wrapper( job )
        # Use the persisted destination as its params may differ from
        # what's in the job_conf xml
        job_destination = JobDestination(id=job.destination_id, runner=job.job_runner_name, params=job.destination_params)
        # resubmits are not persisted (it's a good thing) so they
        # should be added back to the in-memory destination on startup
        try:
            config_job_destination = self.app.job_config.get_destination( job.destination_id )
            job_destination.resubmit = config_job_destination.resubmit
        except KeyError:
            log.debug( '(%s) Recovered destination id (%s) does not exist in job config (but this may be normal in the case of a dynamically generated destination)', job.id, job.destination_id )
        job_wrapper.job_runner_mapper.cached_job_destination = job_destination
        return job_wrapper

    def __monitor( self ):
        """
        Continually iterate the waiting jobs, checking if each is ready to
        run and dispatching if so.
        """
        while self.running:
            try:
                # If jobs are locked, there's nothing to monitor and we skip
                # to the sleep.
                if not self.app.job_manager.job_lock:
                    self.__monitor_step()
            except Exception:
                log.exception( "Exception in monitor_step" )
            # Sleep
            self.sleeper.sleep( 1 )

    def __monitor_step( self ):
        """
        Called repeatedly by `monitor` to process waiting jobs. Gets any new
        jobs (either from the database or from its own queue), then iterates
        over all new and waiting jobs to check the state of the jobs each
        depends on. If the job has dependencies that have not finished, it
        goes to the waiting queue. If the job has dependencies with errors,
        it is marked as having errors and removed from the queue. If the job
        belongs to an inactive user it is ignored.
        Otherwise, the job is dispatched.
        """
        # Pull all new jobs from the queue at once
        jobs_to_check = []
        resubmit_jobs = []
        if self.track_jobs_in_database:
            # Clear the session so we get fresh states for job and all datasets
            self.sa_session.expunge_all()
            # Fetch all new jobs
            hda_not_ready = self.sa_session.query(model.Job.id).enable_eagerloads(False) \
                .join(model.JobToInputDatasetAssociation) \
                .join(model.HistoryDatasetAssociation) \
                .join(model.Dataset) \
                .filter(and_( (model.Job.state == model.Job.states.NEW ),
                              or_( ( model.HistoryDatasetAssociation._state == model.HistoryDatasetAssociation.states.FAILED_METADATA ),
                                   ( model.HistoryDatasetAssociation.deleted == true() ),
                                   ( model.Dataset.state != model.Dataset.states.OK ),
                                   ( model.Dataset.deleted == true() ) ) ) ).subquery()
            ldda_not_ready = self.sa_session.query(model.Job.id).enable_eagerloads(False) \
                .join(model.JobToInputLibraryDatasetAssociation) \
                .join(model.LibraryDatasetDatasetAssociation) \
                .join(model.Dataset) \
                .filter(and_((model.Job.state == model.Job.states.NEW),
                        or_((model.LibraryDatasetDatasetAssociation._state != null()),
                            (model.LibraryDatasetDatasetAssociation.deleted == true()),
                            (model.Dataset.state != model.Dataset.states.OK),
                            (model.Dataset.deleted == true())))).subquery()
            if self.app.config.user_activation_on:
                jobs_to_check = self.sa_session.query(model.Job).enable_eagerloads(False) \
                    .outerjoin( model.User ) \
                    .filter(and_((model.Job.state == model.Job.states.NEW),
                                 or_((model.Job.user_id == null()), (model.User.active == true())),
                                 (model.Job.handler == self.app.config.server_name),
                                 ~model.Job.table.c.id.in_(hda_not_ready),
                                 ~model.Job.table.c.id.in_(ldda_not_ready))) \
                    .order_by(model.Job.id).all()
            else:
                jobs_to_check = self.sa_session.query(model.Job).enable_eagerloads(False) \
                    .filter(and_((model.Job.state == model.Job.states.NEW),
                                 (model.Job.handler == self.app.config.server_name),
                                 ~model.Job.table.c.id.in_(hda_not_ready),
                                 ~model.Job.table.c.id.in_(ldda_not_ready))) \
                    .order_by(model.Job.id).all()
            # Fetch all "resubmit" jobs
            resubmit_jobs = self.sa_session.query(model.Job).enable_eagerloads(False) \
                .filter(and_((model.Job.state == model.Job.states.RESUBMITTED),
                             (model.Job.handler == self.app.config.server_name))) \
                .order_by(model.Job.id).all()
        else:
            # Get job objects and append to watch queue for any which were
            # previously waiting
            for job_id in self.waiting_jobs:
                jobs_to_check.append( self.sa_session.query( model.Job ).get( job_id ) )
            try:
                while 1:
                    message = self.queue.get_nowait()
                    if message is self.STOP_SIGNAL:
                        return
                    # Unpack the message
                    job_id, tool_id = message
                    # Get the job object and append to watch queue
                    jobs_to_check.append( self.sa_session.query( model.Job ).get( job_id ) )
            except Empty:
                pass
        # Ensure that we get new job counts on each iteration
        self.__clear_job_count()
        # Check resubmit jobs first so that limits of new jobs will still be enforced
        for job in resubmit_jobs:
            log.debug( '(%s) Job was resubmitted and is being dispatched immediately', job.id )
            # Reassemble resubmit job destination from persisted value
            jw = self.__recover_job_wrapper( job )
            if jw.is_ready_for_resubmission(job):
                self.increase_running_job_count(job.user_id, jw.job_destination.id)
                self.dispatcher.put( jw )
        # Iterate over new and waiting jobs and look for any that are
        # ready to run
        new_waiting_jobs = []
        for job in jobs_to_check:
            try:
                # Check the job's dependencies, requeue if they're not done.
                # Some of these states will only happen when using the in-memory job queue
                job_state = self.__check_job_state( job )
                if job_state == JOB_WAIT:
                    new_waiting_jobs.append( job.id )
                elif job_state == JOB_INPUT_ERROR:
                    log.info( "(%d) Job unable to run: one or more inputs in error state" % job.id )
                elif job_state == JOB_INPUT_DELETED:
                    log.info( "(%d) Job unable to run: one or more inputs deleted" % job.id )
                elif job_state == JOB_READY:
                    self.dispatcher.put( self.job_wrappers.pop( job.id ) )
                    log.info( "(%d) Job dispatched" % job.id )
                elif job_state == JOB_DELETED:
                    log.info( "(%d) Job deleted by user while still queued" % job.id )
                elif job_state == JOB_ADMIN_DELETED:
                    log.info( "(%d) Job deleted by admin while still queued" % job.id )
                elif job_state in ( JOB_USER_OVER_QUOTA,
                                    JOB_USER_OVER_TOTAL_WALLTIME ):
                    if job_state == JOB_USER_OVER_QUOTA:
                        log.info( "(%d) User (%s) is over quota: job paused" % ( job.id, job.user_id ) )
                    else:
                        log.info( "(%d) User (%s) is over total walltime limit: job paused" % ( job.id, job.user_id ) )

                    job.set_state( model.Job.states.PAUSED )
                    for dataset_assoc in job.output_datasets + job.output_library_datasets:
                        dataset_assoc.dataset.dataset.state = model.Dataset.states.PAUSED
                        dataset_assoc.dataset.info = "Execution of this dataset's job is paused because you were over your disk quota at the time it was ready to run"
                        self.sa_session.add( dataset_assoc.dataset.dataset )
                    self.sa_session.add( job )
                elif job_state == JOB_ERROR:
                    log.error( "(%d) Error checking job readiness" % job.id )
                else:
                    log.error( "(%d) Job in unknown state '%s'" % ( job.id, job_state ) )
                    new_waiting_jobs.append( job.id )
            except Exception:
                log.exception( "failure running job %d", job.id )
        # Update the waiting list
        if not self.track_jobs_in_database:
            self.waiting_jobs = new_waiting_jobs
        # Remove cached wrappers for any jobs that are no longer being tracked
        # Iterate over a copy of the keys since entries may be deleted
        for id in list(self.job_wrappers.keys()):
            if id not in new_waiting_jobs:
                del self.job_wrappers[id]
        # Flush, if we updated the state
        self.sa_session.flush()
        # Done with the session
        self.sa_session.remove()

    def __check_job_state( self, job ):
        """
        Check if a job is ready to run by verifying that each of its input
        datasets is ready (specifically in the OK state). If any input dataset
        has an error, fail the job and return JOB_INPUT_ERROR. If any input
        dataset is deleted, fail the job and return JOB_INPUT_DELETED.  If all
        input datasets are in OK state, return JOB_READY indicating that the
        job can be dispatched. Otherwise, return JOB_WAIT indicating that input
        datasets are still being prepared.
        """
        if not self.track_jobs_in_database:
            in_memory_not_ready_state = self.__verify_in_memory_job_inputs( job )
            if in_memory_not_ready_state:
                return in_memory_not_ready_state

        # Else, if tracking in the database, job.state is guaranteed to be NEW and
        # the inputs are guaranteed to be OK.

        # Create the job wrapper so that the destination can be set
        job_id = job.id
        job_wrapper = self.job_wrappers.get( job_id, None )
        if not job_wrapper:
            job_wrapper = self.job_wrapper( job )
            self.job_wrappers[ job_id ] = job_wrapper

        # If state == JOB_READY, assume job_destination also set - otherwise
        # in case of various error or cancelled states do not assume
        # destination has been set.
        state, job_destination = self.__verify_job_ready( job, job_wrapper )

        if state == JOB_READY:
            # PASS.  increase usage by one job (if caching) so that multiple jobs aren't dispatched on this queue iteration
            self.increase_running_job_count(job.user_id, job_destination.id )
        return state

    def __verify_job_ready( self, job, job_wrapper ):
        """ Compute job destination and verify job is ready at that
        destination by checking job limits and quota. If this method
        returns a job state of JOB_READY, it MUST also return a job
        destination.
        """
        job_destination = None
        try:
            assert job_wrapper.tool is not None, 'This tool was disabled before the job completed.  Please contact your Galaxy administrator.'
            # Cause the job_destination to be set and cached by the mapper
            job_destination = job_wrapper.job_destination
        except AssertionError as e:
            log.warning( "(%s) Tool '%s' removed from tool config, unable to run job" % ( job.id, job.tool_id ) )
            job_wrapper.fail( e )
            return JOB_ERROR, job_destination
        except JobNotReadyException as e:
            job_state = e.job_state or JOB_WAIT
            return job_state, None
        except Exception as e:
            failure_message = getattr( e, 'failure_message', DEFAULT_JOB_PUT_FAILURE_MESSAGE )
            if failure_message == DEFAULT_JOB_PUT_FAILURE_MESSAGE:
                log.exception( 'Failed to generate job destination' )
            else:
                log.debug( "Intentionally failing job with message (%s)" % failure_message )
            job_wrapper.fail( failure_message )
            return JOB_ERROR, job_destination
        # job is ready to run, check limits
        # TODO: these checks should be refactored to minimize duplication and made more modular/pluggable
        state = self.__check_destination_jobs( job, job_wrapper )

        if state == JOB_READY:
            state = self.__check_user_jobs( job, job_wrapper )
        if state == JOB_READY and self.app.config.enable_quotas:
            quota = self.app.quota_agent.get_quota( job.user )
            if quota is not None:
                try:
                    usage = self.app.quota_agent.get_usage( user=job.user, history=job.history )
                    if usage > quota:
                        return JOB_USER_OVER_QUOTA, job_destination
                except AssertionError:
                    pass  # No history, should not happen with an anon user
        # Check total walltime limits
        if ( state == JOB_READY and
             "delta" in self.app.job_config.limits.total_walltime ):
            jobs_to_check = self.sa_session.query( model.Job ).filter(
                model.Job.user_id == job.user.id,
                model.Job.update_time >= datetime.datetime.now() -
                datetime.timedelta(
                    self.app.job_config.limits.total_walltime["window"]
                ),
                model.Job.state == 'ok'
            ).all()
            time_spent = datetime.timedelta(0)
            # Use a separate loop variable so the `job` being checked is not
            # clobbered while summing past walltime
            for prior_job in jobs_to_check:
                # History is prior_job.state_history
                started = None
                finished = None
                for history in sorted(
                        prior_job.state_history,
                        key=lambda history: history.update_time ):
                    if history.state == "running":
                        started = history.create_time
                    elif history.state == "ok":
                        finished = history.create_time

                # Skip jobs with incomplete state histories
                if started is not None and finished is not None:
                    time_spent += finished - started

            if time_spent > self.app.job_config.limits.total_walltime["delta"]:
                return JOB_USER_OVER_TOTAL_WALLTIME, job_destination

        return state, job_destination

    def __verify_in_memory_job_inputs( self, job ):
        """ Perform the same checks that happen via SQL for in-memory managed
        jobs.
        """
        if job.state == model.Job.states.DELETED:
            return JOB_DELETED
        elif job.state == model.Job.states.ERROR:
            return JOB_ADMIN_DELETED
        for dataset_assoc in job.input_datasets + job.input_library_datasets:
            idata = dataset_assoc.dataset
            if not idata:
                continue
            # don't run jobs for which the input dataset was deleted
            if idata.deleted:
                self.job_wrappers.pop(job.id, self.job_wrapper( job )).fail( "input data %s (file: %s) was deleted before the job started" % ( idata.hid, idata.file_name ) )
                return JOB_INPUT_DELETED
            # an error in the input data causes us to bail immediately
            elif idata.state == idata.states.ERROR:
                self.job_wrappers.pop(job.id, self.job_wrapper( job )).fail( "input data %s is in error state" % ( idata.hid ) )
                return JOB_INPUT_ERROR
            elif idata.state == idata.states.FAILED_METADATA:
                self.job_wrappers.pop(job.id, self.job_wrapper( job )).fail( "input data %s failed to properly set metadata" % ( idata.hid ) )
                return JOB_INPUT_ERROR
            elif idata.state != idata.states.OK and not ( idata.state == idata.states.SETTING_METADATA and job.tool_id is not None and job.tool_id == self.app.datatypes_registry.set_external_metadata_tool.id ):
                # need to requeue
                return JOB_WAIT

        # All inputs ready to go.
        return None

    def __clear_job_count( self ):
        self.user_job_count = None
        self.user_job_count_per_destination = None
        self.total_job_count_per_destination = None

    def get_user_job_count(self, user_id):
        self.__cache_user_job_count()
        # This could have been incremented by a previous job dispatched on this iteration, even if we're not caching
        rval = self.user_job_count.get(user_id, 0)
        if not self.app.config.cache_user_job_count:
            result = self.sa_session.execute(select([func.count(model.Job.table.c.id)])
                                             .where(and_(model.Job.table.c.state.in_((model.Job.states.QUEUED,
                                                         model.Job.states.RUNNING,
                                                         model.Job.states.RESUBMITTED)),
                                                         (model.Job.table.c.user_id == user_id))))
            for row in result:
                # there should only be one row
                rval += row[0]
        return rval

    def __cache_user_job_count( self ):
        # Cache the job count if necessary
        if self.user_job_count is None and self.app.config.cache_user_job_count:
            self.user_job_count = {}
            query = self.sa_session.execute(select([model.Job.table.c.user_id, func.count(model.Job.table.c.user_id)])
                                            .where(and_(model.Job.table.c.state.in_((model.Job.states.QUEUED,
                                                                                     model.Job.states.RUNNING,
                                                                                     model.Job.states.RESUBMITTED)),
                                                        (model.Job.table.c.user_id != null())))
                                            .group_by(model.Job.table.c.user_id))
            for row in query:
                self.user_job_count[row[0]] = row[1]
        elif self.user_job_count is None:
            self.user_job_count = {}

    def get_user_job_count_per_destination(self, user_id):
        self.__cache_user_job_count_per_destination()
        cached = self.user_job_count_per_destination.get(user_id, {})
        if self.app.config.cache_user_job_count:
            rval = cached
        else:
            # The cached count is still used even when we're not caching, it is
            # incremented when a job is run by this handler to ensure that
            # multiple jobs can't get past the limits in one iteration of the
            # queue.
            rval = {}
            rval.update(cached)
            result = self.sa_session.execute(select([model.Job.table.c.destination_id, func.count(model.Job.table.c.destination_id).label('job_count')])
                                             .where(and_(model.Job.table.c.state.in_((model.Job.states.QUEUED, model.Job.states.RUNNING)), (model.Job.table.c.user_id == user_id)))
                                             .group_by(model.Job.table.c.destination_id))
            for row in result:
                # Add the count from the database to the cached count
                rval[row['destination_id']] = rval.get(row['destination_id'], 0) + row['job_count']
        return rval

    def __cache_user_job_count_per_destination(self):
        # Cache the job count if necessary
        if self.user_job_count_per_destination is None and self.app.config.cache_user_job_count:
            self.user_job_count_per_destination = {}
            result = self.sa_session.execute(select([model.Job.table.c.user_id, model.Job.table.c.destination_id, func.count(model.Job.table.c.user_id).label('job_count')])
                                             .where(and_(model.Job.table.c.state.in_((model.Job.states.QUEUED, model.Job.states.RUNNING))))
                                             .group_by(model.Job.table.c.user_id, model.Job.table.c.destination_id))
            for row in result:
                if row['user_id'] not in self.user_job_count_per_destination:
                    self.user_job_count_per_destination[row['user_id']] = {}
                self.user_job_count_per_destination[row['user_id']][row['destination_id']] = row['job_count']
        elif self.user_job_count_per_destination is None:
            self.user_job_count_per_destination = {}

    def increase_running_job_count(self, user_id, destination_id):
        if self.app.job_config.limits.registered_user_concurrent_jobs or \
           self.app.job_config.limits.anonymous_user_concurrent_jobs or \
           self.app.job_config.limits.destination_user_concurrent_jobs:
            if self.user_job_count is None:
                self.user_job_count = {}
            if self.user_job_count_per_destination is None:
                self.user_job_count_per_destination = {}
            self.user_job_count[user_id] = self.user_job_count.get(user_id, 0) + 1
            if user_id not in self.user_job_count_per_destination:
                self.user_job_count_per_destination[user_id] = {}
            self.user_job_count_per_destination[user_id][destination_id] = self.user_job_count_per_destination[user_id].get(destination_id, 0) + 1
        if self.app.job_config.limits.destination_total_concurrent_jobs:
            if self.total_job_count_per_destination is None:
                self.total_job_count_per_destination = {}
            self.total_job_count_per_destination[destination_id] = self.total_job_count_per_destination.get(destination_id, 0) + 1

    def __check_user_jobs( self, job, job_wrapper ):
        # TODO: Update output datasets' _state = LIMITED or some such new
        # state, so the UI can reflect what jobs are waiting due to concurrency
        # limits
        if job.user:
            # Check the hard limit first
            if self.app.job_config.limits.registered_user_concurrent_jobs:
                count = self.get_user_job_count(job.user_id)
                # Check the user's number of dispatched jobs against the overall limit
                if count >= self.app.job_config.limits.registered_user_concurrent_jobs:
                    return JOB_WAIT
            # If we pass the hard limit, also check the per-destination count
            id = job_wrapper.job_destination.id
            count_per_id = self.get_user_job_count_per_destination(job.user_id)
            if id in self.app.job_config.limits.destination_user_concurrent_jobs:
                count = count_per_id.get(id, 0)
                # Check the user's number of dispatched jobs in the assigned destination id against the limit for that id
                if count >= self.app.job_config.limits.destination_user_concurrent_jobs[id]:
                    return JOB_WAIT
            # If we pass the destination limit (if there is one), also check limits on any tags (if any)
            if job_wrapper.job_destination.tags:
                for tag in job_wrapper.job_destination.tags:
                    # Check each tag for this job's destination
                    if tag in self.app.job_config.limits.destination_user_concurrent_jobs:
                        # Only if there's a limit defined for this tag
                        count = 0
                        for id in [ d.id for d in self.app.job_config.get_destinations(tag) ]:
                            # Add up the aggregate job total for this tag
                            count += count_per_id.get(id, 0)
                        if count >= self.app.job_config.limits.destination_user_concurrent_jobs[tag]:
                            return JOB_WAIT
        elif job.galaxy_session:
            # Anonymous users only get the hard limit
            if self.app.job_config.limits.anonymous_user_concurrent_jobs:
                count = self.sa_session.query( model.Job ).enable_eagerloads( False ) \
                            .filter( and_( model.Job.session_id == job.galaxy_session.id,
                                           or_( model.Job.state == model.Job.states.RUNNING,
                                                model.Job.state == model.Job.states.QUEUED ) ) ).count()
                if count >= self.app.job_config.limits.anonymous_user_concurrent_jobs:
                    return JOB_WAIT
        else:
            log.warning( 'Job %s is not associated with a user or session so job concurrency limit cannot be checked.' % job.id )
        return JOB_READY

    def __cache_total_job_count_per_destination( self ):
        # Cache the job count if necessary
        if self.total_job_count_per_destination is None:
            self.total_job_count_per_destination = {}
            result = self.sa_session.execute(select([model.Job.table.c.destination_id, func.count(model.Job.table.c.destination_id).label('job_count')])
                                             .where(and_(model.Job.table.c.state.in_((model.Job.states.QUEUED, model.Job.states.RUNNING))))
                                             .group_by(model.Job.table.c.destination_id))
            for row in result:
                self.total_job_count_per_destination[row['destination_id']] = row['job_count']

    def get_total_job_count_per_destination(self):
        self.__cache_total_job_count_per_destination()
        # Always use caching (at worst a job will have to wait one iteration,
        # and this would be more fair anyway as it ensures FIFO scheduling,
        # insofar as FIFO would be fair...)
        return self.total_job_count_per_destination

    def __check_destination_jobs( self, job, job_wrapper ):
        if self.app.job_config.limits.destination_total_concurrent_jobs:
            id = job_wrapper.job_destination.id
            count_per_id = self.get_total_job_count_per_destination()
            if id in self.app.job_config.limits.destination_total_concurrent_jobs:
                count = count_per_id.get(id, 0)
                # Check the number of dispatched jobs in the assigned destination id against the limit for that id
                if count >= self.app.job_config.limits.destination_total_concurrent_jobs[id]:
                    return JOB_WAIT
            # If we pass the destination limit (if there is one), also check limits on any tags (if any)
            if job_wrapper.job_destination.tags:
                for tag in job_wrapper.job_destination.tags:
                    # Check each tag for this job's destination
                    if tag in self.app.job_config.limits.destination_total_concurrent_jobs:
                        # Only if there's a limit defined for this tag
                        count = 0
                        for id in [ d.id for d in self.app.job_config.get_destinations(tag) ]:
                            # Add up the aggregate job total for this tag
                            count += count_per_id.get(id, 0)
                        if count >= self.app.job_config.limits.destination_total_concurrent_jobs[tag]:
                            return JOB_WAIT
        return JOB_READY

    def put( self, job_id, tool_id ):
        """Add a job to the queue (by job identifier)"""
        if not self.track_jobs_in_database:
            self.queue.put( ( job_id, tool_id ) )
            self.sleeper.wake()

    def shutdown( self ):
        """Attempts to gracefully shut down the worker thread"""
        if self.parent_pid != os.getpid():
            # We're not the real job queue, do nothing
            return
        else:
            log.info( "sending stop signal to worker thread" )
            self.running = False
            if not self.app.config.track_jobs_in_database:
                self.queue.put( self.STOP_SIGNAL )
            self.sleeper.wake()
            log.info( "job handler queue stopped" )
            self.dispatcher.shutdown()
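
# The STOP_SIGNAL sentinel used by both of these queues is a common shutdown
# idiom: a fresh object() can never collide with a real message tuple, and the
# identity check "message is self.STOP_SIGNAL" matches only that exact
# instance. A minimal sketch of the idiom (hypothetical names, not Galaxy
# code):
#
#     from queue import Queue
#
#     STOP_SIGNAL = object()
#     q = Queue()
#     q.put(("job-1", "tool-a"))
#     q.put(STOP_SIGNAL)
#     while True:
#         message = q.get()
#         if message is STOP_SIGNAL:
#             break                  # shut down cleanly
#         job_id, tool_id = message  # otherwise unpack a real message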


class JobHandlerStopQueue(object):
    """
    A queue for jobs which need to be terminated prematurely.
    """
    STOP_SIGNAL = object()

    def __init__(self, app, dispatcher):
        self.app = app
        self.dispatcher = dispatcher

        self.sa_session = app.model.context

        # Keep track of the pid that started the job manager, only it
        # has valid threads
        self.parent_pid = os.getpid()
        # Contains new jobs. Note this is not used if track_jobs_in_database is True
        self.queue = Queue()

        # Contains jobs that are waiting (only use from monitor thread)
        self.waiting = []

        # Helper for interruptable sleep
        self.sleeper = Sleeper()
        self.running = True
        self.monitor_thread = threading.Thread(
            name="JobHandlerStopQueue.monitor_thread", target=self.monitor)
        self.monitor_thread.setDaemon(True)
        self.monitor_thread.start()
        log.info("job handler stop queue started")

    def monitor(self):
        """
        Continually iterate the waiting jobs, stop any that are found.
        """
        # HACK: Delay until after forking, we need a way to do post fork notification!!!
        time.sleep(10)
        while self.running:
            try:
                self.monitor_step()
            except Exception:
                log.exception("Exception in monitor_step")
            # Sleep
            self.sleeper.sleep(1)

    def monitor_step(self):
        """
        Called repeatedly by `monitor` to stop jobs.
        """
        # Pull all new jobs from the queue at once
        jobs_to_check = []
        if self.app.config.track_jobs_in_database:
            # Clear the session so we get fresh states for job and all datasets
            self.sa_session.expunge_all()
            # Fetch all new jobs
            newly_deleted_jobs = self.sa_session.query( model.Job ).enable_eagerloads( False ) \
                                     .filter( ( model.Job.state == model.Job.states.DELETED_NEW ) &
                                              ( model.Job.handler == self.app.config.server_name ) ).all()
            for job in newly_deleted_jobs:
                jobs_to_check.append((job, job.stderr))
        # Also pull from the queue (in the case of administratively stopped jobs)
        try:
            while 1:
                message = self.queue.get_nowait()
                if message is self.STOP_SIGNAL:
                    return
                # Unpack the message
                job_id, error_msg = message
                # Get the job object and append to watch queue
                jobs_to_check.append(
                    (self.sa_session.query(model.Job).get(job_id), error_msg))
        except Empty:
            pass
        for job, error_msg in jobs_to_check:
            if (job.state not in (job.states.DELETED_NEW, job.states.DELETED)
                    and job.finished):
                # terminated before it got here
                log.debug('Job %s already finished, not deleting or stopping',
                          job.id)
                continue
            final_state = job.states.DELETED
            if error_msg is not None:
                final_state = job.states.ERROR
                job.info = error_msg
            job.set_final_state(final_state)
            self.sa_session.add(job)
            self.sa_session.flush()
            if job.job_runner_name is not None:
                # tell the dispatcher to stop the job
                self.dispatcher.stop(job)

    def put(self, job_id, error_msg=None):
        if not self.app.config.track_jobs_in_database:
            self.queue.put((job_id, error_msg))

    def shutdown(self):
        """Attempts to gracefully shut down the worker thread"""
        if self.parent_pid != os.getpid():
            # We're not the real job queue, do nothing
            return
        else:
            log.info("sending stop signal to worker thread")
            self.running = False
            if not self.app.config.track_jobs_in_database:
                self.queue.put(self.STOP_SIGNAL)
            self.sleeper.wake()
            log.info("job handler stop queue stopped")
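
Both queues delegate their waiting to a Sleeper so that monitor threads can sleep in a loop yet be woken immediately at shutdown via sleeper.wake(). Galaxy's actual Sleeper implementation is not shown on this page; a minimal sketch of one way to build such a helper on top of threading.Event:

import threading


class Sleeper(object):
    """Sleep for a given number of seconds, but allow an early wake-up."""

    def __init__(self):
        self._event = threading.Event()

    def sleep(self, seconds):
        # Event.wait returns as soon as the event is set, or after the timeout.
        self._event.wait(seconds)
        self._event.clear()

    def wake(self):
        # Called from another thread (e.g. during shutdown) to interrupt sleep.
        self._event.set()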
Example #9
class JobHandlerQueue( object ):
    """
    Job Handler's internal queue; this is what actually implements waiting for
    jobs to become runnable and dispatching them to a JobRunner.
    """
    STOP_SIGNAL = object()

    def __init__( self, app, dispatcher ):
        """Initializes the Job Handler Queue, creates (unstarted) monitoring thread"""
        self.app = app
        self.dispatcher = dispatcher

        self.sa_session = app.model.context
        self.track_jobs_in_database = self.app.config.track_jobs_in_database

        # Initialize structures for handling job limits
        self.__clear_job_count()

        # Keep track of the pid that started the job manager, only it
        # has valid threads
        self.parent_pid = os.getpid()
        # Contains new jobs. Note this is not used if track_jobs_in_database is True
        self.queue = Queue()
        # Contains jobs that are waiting (only use from monitor thread)
        self.waiting_jobs = []
        # Contains wrappers of jobs that are limited or ready (so they aren't created unnecessarily/multiple times)
        self.job_wrappers = {}
        # Helper for interruptable sleep
        self.sleeper = Sleeper()
        self.running = True
        self.monitor_thread = threading.Thread( name="JobHandlerQueue.monitor_thread", target=self.__monitor )
        self.monitor_thread.setDaemon( True )

    def start( self ):
        """
        Starts the JobHandler's thread after checking for any unhandled jobs.
        """
        # Recover jobs at startup
        self.__check_jobs_at_startup()
        # Start the queue
        self.monitor_thread.start()
        log.info( "job handler queue started" )

    def job_wrapper( self, job, use_persisted_destination=False ):
        return JobWrapper( job, self, use_persisted_destination=use_persisted_destination )

    def job_pair_for_id( self, id ):
        job = self.sa_session.query( model.Job ).get( id )
        return job, self.job_wrapper( job, use_persisted_destination=True )

    def __check_jobs_at_startup( self ):
        """
        Checks all jobs that are in the 'new', 'queued' or 'running' state in
        the database and requeues or cleans up as necessary.  Only run as the
        job handler starts.
        If user activation is enforced, jobs belonging to inactive users are
        filtered out.
        """
        jobs_at_startup = []
        if self.track_jobs_in_database:
            in_list = ( model.Job.states.QUEUED,
                        model.Job.states.RUNNING )
        else:
            in_list = ( model.Job.states.NEW,
                        model.Job.states.QUEUED,
                        model.Job.states.RUNNING )
        if self.app.config.user_activation_on:
            jobs_at_startup = self.sa_session.query( model.Job ).enable_eagerloads( False ) \
                .outerjoin( model.User ) \
                .filter( model.Job.state.in_( in_list ) &
                         ( model.Job.handler == self.app.config.server_name ) &
                         or_( ( model.Job.user_id == None ), ( model.User.active == True ) ) ).all()
        else:
            jobs_at_startup = self.sa_session.query( model.Job ).enable_eagerloads( False ) \
                              .filter( model.Job.state.in_( in_list ) \
                                       & ( model.Job.handler == self.app.config.server_name ) ).all()

        for job in jobs_at_startup:
            if not self.app.toolbox.has_tool( job.tool_id, job.tool_version, exact=True ):
                log.warning( "(%s) Tool '%s' removed from tool config, unable to recover job" % ( job.id, job.tool_id ) )
                self.job_wrapper( job ).fail( 'This tool was disabled before the job completed.  Please contact your Galaxy administrator.' )
            elif job.job_runner_name is not None and job.job_runner_external_id is None:
                # This could happen during certain revisions of Galaxy where a runner URL was persisted before the job was dispatched to a runner.
                log.debug( "(%s) Job runner assigned but no external ID recorded, adding to the job handler queue" % job.id )
                job.job_runner_name = None
                if self.track_jobs_in_database:
                    job.set_state( model.Job.states.NEW )
                else:
                    self.queue.put( ( job.id, job.tool_id ) )
            elif job.job_runner_name is not None and job.job_runner_external_id is not None and job.destination_id is None:
                # This is the first start after upgrading from URLs to destinations, convert the URL to a destination and persist
                job_wrapper = self.job_wrapper( job )
                job_destination = self.dispatcher.url_to_destination(job.job_runner_name)
                if job_destination.id is None:
                    job_destination.id = 'legacy_url'
                job_wrapper.set_job_destination(job_destination, job.job_runner_external_id)
                self.dispatcher.recover( job, job_wrapper )
                log.info('(%s) Converted job from a URL to a destination and recovered' % (job.id))
            elif job.job_runner_name is None:
                # Never (fully) dispatched
                log.debug( "(%s) No job runner assigned and job still in '%s' state, adding to the job handler queue" % ( job.id, job.state ) )
                if self.track_jobs_in_database:
                    job.set_state( model.Job.states.NEW )
                else:
                    self.queue.put( ( job.id, job.tool_id ) )
            else:
                # Already dispatched and running
                job_wrapper = self.job_wrapper( job )
                # Use the persisted destination as its params may differ from
                # what's in the job_conf xml
                job_destination = JobDestination(id=job.destination_id, runner=job.job_runner_name, params=job.destination_params)
                # resubmits are not persisted (it's a good thing) so they
                # should be added back to the in-memory destination on startup
                try:
                    config_job_destination = self.app.job_config.get_destination( job.destination_id )
                    job_destination.resubmit = config_job_destination.resubmit
                except KeyError:
                    log.warning( '(%s) Recovered destination id (%s) does not exist in job config (but this may be normal in the case of a dynamically generated destination)', job.id, job.destination_id )
                job_wrapper.job_runner_mapper.cached_job_destination = job_destination
                self.dispatcher.recover( job, job_wrapper )
        if self.sa_session.dirty:
            self.sa_session.flush()

    def __monitor( self ):
        """
        Continually iterate the waiting jobs, checking if each is ready to
        run and dispatching if so.
        """
        while self.running:
            try:
                # If jobs are locked, there's nothing to monitor and we skip
                # to the sleep.
                if not self.app.job_manager.job_lock:
                    self.__monitor_step()
            except Exception:
                log.exception( "Exception in monitor_step" )
            # Sleep
            self.sleeper.sleep( 1 )

    def __monitor_step( self ):
        """
        Called repeatedly by `monitor` to process waiting jobs. Gets any new
        jobs (either from the database or from its own queue), then iterates
        over all new and waiting jobs to check the state of the jobs each
        depends on. If the job has dependencies that have not finished, it
        goes to the waiting queue. If the job has dependencies with errors,
        it is marked as having errors and removed from the queue. If the job
        belongs to an inactive user it is ignored.
        Otherwise, the job is dispatched.
        """
        # Pull all new jobs from the queue at once
        jobs_to_check = []
        resubmit_jobs = []
        if self.track_jobs_in_database:
            # Clear the session so we get fresh states for job and all datasets
            self.sa_session.expunge_all()
            # Fetch all new jobs
            hda_not_ready = self.sa_session.query(model.Job.id).enable_eagerloads(False) \
                    .join(model.JobToInputDatasetAssociation) \
                    .join(model.HistoryDatasetAssociation) \
                    .join(model.Dataset) \
                    .filter(and_( (model.Job.state == model.Job.states.NEW ),
                                 or_( ( model.HistoryDatasetAssociation._state == model.HistoryDatasetAssociation.states.FAILED_METADATA ),
                                      ( model.HistoryDatasetAssociation.deleted == True ),
                                      ( model.Dataset.state != model.Dataset.states.OK ),
                                      ( model.Dataset.deleted == True) ) ) ).subquery()
            ldda_not_ready = self.sa_session.query(model.Job.id).enable_eagerloads(False) \
                    .join(model.JobToInputLibraryDatasetAssociation) \
                    .join(model.LibraryDatasetDatasetAssociation) \
                    .join(model.Dataset) \
                    .filter(and_((model.Job.state == model.Job.states.NEW),
                                 or_((model.LibraryDatasetDatasetAssociation._state != None),
                                     (model.LibraryDatasetDatasetAssociation.deleted == True),
                                     (model.Dataset.state != model.Dataset.states.OK),
                                     (model.Dataset.deleted == True)))).subquery()
            if self.app.config.user_activation_on:
                jobs_to_check = self.sa_session.query(model.Job).enable_eagerloads(False) \
                        .outerjoin( model.User ) \
                        .filter(and_((model.Job.state == model.Job.states.NEW),
                                    or_((model.Job.user_id == None), (model.User.active == True)),
                                     (model.Job.handler == self.app.config.server_name),
                                     ~model.Job.table.c.id.in_(hda_not_ready),
                                     ~model.Job.table.c.id.in_(ldda_not_ready))) \
                        .order_by(model.Job.id).all()
            else:
                jobs_to_check = self.sa_session.query(model.Job).enable_eagerloads(False) \
                    .filter(and_((model.Job.state == model.Job.states.NEW),
                                 (model.Job.handler == self.app.config.server_name),
                                 ~model.Job.table.c.id.in_(hda_not_ready),
                                 ~model.Job.table.c.id.in_(ldda_not_ready))) \
                    .order_by(model.Job.id).all()
            # Fetch all "resubmit" jobs
            resubmit_jobs = self.sa_session.query(model.Job).enable_eagerloads(False) \
                    .filter(and_((model.Job.state == model.Job.states.RESUBMITTED),
                                (model.Job.handler == self.app.config.server_name))) \
                    .order_by(model.Job.id).all()
        else:
            # Get job objects and append to watch queue for any which were
            # previously waiting
            for job_id in self.waiting_jobs:
                jobs_to_check.append( self.sa_session.query( model.Job ).get( job_id ) )
            try:
                while 1:
                    message = self.queue.get_nowait()
                    if message is self.STOP_SIGNAL:
                        return
                    # Unpack the message
                    job_id, tool_id = message
                    # Get the job object and append to watch queue
                    jobs_to_check.append( self.sa_session.query( model.Job ).get( job_id ) )
            except Empty:
                pass
        # Ensure that we get new job counts on each iteration
        self.__clear_job_count()
        # Check resubmit jobs first so that limits of new jobs will still be enforced
        for job in resubmit_jobs:
            log.debug( '(%s) Job was resubmitted and is being dispatched immediately', job.id )
            # Reassemble resubmit job destination from persisted value
            jw = self.job_wrapper( job )
            jw.job_runner_mapper.cached_job_destination = JobDestination( id=job.destination_id, runner=job.job_runner_name, params=job.destination_params )
            self.increase_running_job_count(job.user_id, jw.job_destination.id)
            self.dispatcher.put( jw )
        # Iterate over new and waiting jobs and look for any that are
        # ready to run
        new_waiting_jobs = []
        for job in jobs_to_check:
            try:
                # Check the job's dependencies, requeue if they're not done.
                # Some of these states will only happen when using the in-memory job queue
                job_state = self.__check_job_state( job )
                if job_state == JOB_WAIT:
                    new_waiting_jobs.append( job.id )
                elif job_state == JOB_INPUT_ERROR:
                    log.info( "(%d) Job unable to run: one or more inputs in error state" % job.id )
                elif job_state == JOB_INPUT_DELETED:
                    log.info( "(%d) Job unable to run: one or more inputs deleted" % job.id )
                elif job_state == JOB_READY:
                    self.dispatcher.put( self.job_wrappers.pop( job.id ) )
                    log.info( "(%d) Job dispatched" % job.id )
                elif job_state == JOB_DELETED:
                    log.info( "(%d) Job deleted by user while still queued" % job.id )
                elif job_state == JOB_ADMIN_DELETED:
                    log.info( "(%d) Job deleted by admin while still queued" % job.id )
                elif job_state == JOB_USER_OVER_QUOTA:
                    log.info( "(%d) User (%s) is over quota: job paused" % ( job.id, job.user_id ) )
                    job.set_state( model.Job.states.PAUSED )
                    for dataset_assoc in job.output_datasets + job.output_library_datasets:
                        dataset_assoc.dataset.dataset.state = model.Dataset.states.PAUSED
                        dataset_assoc.dataset.info = "Execution of this dataset's job is paused because you were over your disk quota at the time it was ready to run"
                        self.sa_session.add( dataset_assoc.dataset.dataset )
                    self.sa_session.add( job )
                elif job_state == JOB_ERROR:
                    log.error( "(%d) Error checking job readiness" % job.id )
                else:
                    log.error( "(%d) Job in unknown state '%s'" % ( job.id, job_state ) )
                    new_waiting_jobs.append( job.id )
            except Exception:
                log.exception( "failure running job %d" % job.id )
        # Update the waiting list
        if not self.track_jobs_in_database:
            self.waiting_jobs = new_waiting_jobs
        # Remove cached wrappers for any jobs that are no longer being tracked
        for id in list(self.job_wrappers.keys()):
            if id not in new_waiting_jobs:
                del self.job_wrappers[id]
        # Flush, if we updated the state
        self.sa_session.flush()
        # Done with the session
        self.sa_session.remove()
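
The database branch above excludes jobs whose inputs are not ready by building subqueries of job ids and filtering with `~model.Job.table.c.id.in_(...)`. A self-contained sketch of that exclusion pattern, using a deliberately toy schema (the `Job`/`Input` names and states here are hypothetical, not Galaxy's models):

from sqlalchemy import Column, ForeignKey, Integer, String, create_engine, select
from sqlalchemy.orm import Session, declarative_base

Base = declarative_base()

class Job(Base):
    __tablename__ = 'job'
    id = Column(Integer, primary_key=True)
    state = Column(String)

class Input(Base):
    __tablename__ = 'input'
    id = Column(Integer, primary_key=True)
    job_id = Column(Integer, ForeignKey('job.id'))
    dataset_state = Column(String)

engine = create_engine('sqlite://')
Base.metadata.create_all(engine)

with Session(engine) as session:
    session.add_all([
        Job(id=1, state='new'), Job(id=2, state='new'),
        Input(job_id=1, dataset_state='ok'),
        Input(job_id=2, dataset_state='failed_metadata'),
    ])
    session.commit()
    # Subquery: ids of new jobs with at least one input that is not ready
    not_ready = (select(Job.id)
                 .join(Input, Input.job_id == Job.id)
                 .where(Job.state == 'new', Input.dataset_state != 'ok'))
    # Main query: new jobs, minus anything matched by the subquery
    ready = (session.query(Job)
             .filter(Job.state == 'new', ~Job.id.in_(not_ready))
             .order_by(Job.id)
             .all())
    print([job.id for job in ready])  # -> [1]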

    def __check_job_state( self, job ):
        """
        Check if a job is ready to run by verifying that each of its input
        datasets is ready (specifically in the OK state). If any input dataset
        has an error, fail the job and return JOB_INPUT_ERROR. If any input
        dataset is deleted, fail the job and return JOB_INPUT_DELETED.  If all
        input datasets are in OK state, return JOB_READY indicating that the
        job can be dispatched. Otherwise, return JOB_WAIT indicating that input
        datasets are still being prepared.
        """
        if not self.track_jobs_in_database:
            in_memory_not_ready_state = self.__verify_in_memory_job_inputs( job )
            if in_memory_not_ready_state:
                return in_memory_not_ready_state

        # Else, if tracking in the database, job.state is guaranteed to be NEW and
        # the inputs are guaranteed to be OK.

        # Create the job wrapper so that the destination can be set
        job_id = job.id
        job_wrapper = self.job_wrappers.get( job_id, None )
        if not job_wrapper:
            job_wrapper = self.job_wrapper( job )
            self.job_wrappers[ job_id ] = job_wrapper

        # If state == JOB_READY, assume job_destination also set - otherwise
        # in case of various error or cancelled states do not assume
        # destination has been set.
        state, job_destination = self.__verify_job_ready( job, job_wrapper )

        if state == JOB_READY:
            # PASS.  increase usage by one job (if caching) so that multiple jobs aren't dispatched on this queue iteration
            self.increase_running_job_count(job.user_id, job_destination.id )
        return state

    def __verify_job_ready( self, job, job_wrapper ):
        """ Compute job destination and verify job is ready at that
        destination by checking job limits and quota. If this method
        returns a job state of JOB_READY, it MUST also return a job
        destination.
        """
        job_destination = None
        try:
            assert job_wrapper.tool is not None, 'This tool was disabled before the job completed.  Please contact your Galaxy administrator.'
            # Cause the job_destination to be set and cached by the mapper
            job_destination = job_wrapper.job_destination
        except AssertionError as e:
            log.warning( "(%s) Tool '%s' removed from tool config, unable to run job" % ( job.id, job.tool_id ) )
            job_wrapper.fail( e )
            return JOB_ERROR, job_destination
        except JobNotReadyException as e:
            job_state = e.job_state or JOB_WAIT
            return job_state, None
        except Exception as e:
            failure_message = getattr( e, 'failure_message', DEFAULT_JOB_PUT_FAILURE_MESSAGE )
            if failure_message == DEFAULT_JOB_PUT_FAILURE_MESSAGE:
                log.exception( 'Failed to generate job destination' )
            else:
                log.debug( "Intentionally failing job with message (%s)" % failure_message )
            job_wrapper.fail( failure_message )
            return JOB_ERROR, job_destination
        # job is ready to run, check limits
        # TODO: these checks should be refactored to minimize duplication and made more modular/pluggable
        state = self.__check_destination_jobs( job, job_wrapper )
        if state == JOB_READY:
            state = self.__check_user_jobs( job, job_wrapper )
        if state == JOB_READY and self.app.config.enable_quotas:
            quota = self.app.quota_agent.get_quota( job.user )
            if quota is not None:
                try:
                    usage = self.app.quota_agent.get_usage( user=job.user, history=job.history )
                    if usage > quota:
                        return JOB_USER_OVER_QUOTA, job_destination
                except AssertionError:
                    pass  # No history, should not happen with an anon user
Beispiel #10
0
class S3ObjectStore(ObjectStore):
    """
    Object store that stores objects as items in an AWS S3 bucket. A local
    cache exists that is used as an intermediate location for files between
    Galaxy and S3.
    """
    def __init__(self, config, config_xml):
        if boto is None:
            raise Exception(NO_BOTO_ERROR_MESSAGE)
        super(S3ObjectStore, self).__init__(config, config_xml)
        self.config = config
        self.staging_path = self.config.file_path
        self.transfer_progress = 0
        self._parse_config_xml(config_xml)
        self._configure_connection()
        self.bucket = self._get_bucket(self.bucket)
        # Clean cache only if value is set in galaxy.ini
        if self.cache_size != -1:
            # Convert GBs to bytes for comparison
            self.cache_size = self.cache_size * 1073741824
            # Helper for interruptable sleep
            self.sleeper = Sleeper()
            self.cache_monitor_thread = threading.Thread(
                target=self.__cache_monitor)
            self.cache_monitor_thread.start()
            log.info("Cache cleaner manager started")
        # Test if 'axel' is available for parallel download and pull the key into cache
        try:
            subprocess.call('axel')
            self.use_axel = True
        except OSError:
            self.use_axel = False
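
Note that `subprocess.call('axel')` actually executes the program just to probe for it. On Python 3.3+, a sketch that only searches `PATH` (an alternative, not what this class does) would be:

import shutil

# True if an 'axel' executable is on PATH; nothing is executed
use_axel = shutil.which('axel') is not None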

    def _configure_connection(self):
        log.debug("Configuring S3 Connection")
        self.conn = S3Connection(self.access_key, self.secret_key)

    def _parse_config_xml(self, config_xml):
        try:
            a_xml = config_xml.findall('auth')[0]
            self.access_key = a_xml.get('access_key')
            self.secret_key = a_xml.get('secret_key')
            b_xml = config_xml.findall('bucket')[0]
            self.bucket = b_xml.get('name')
            self.use_rr = b_xml.get('use_reduced_redundancy', False)
            cn_xml = config_xml.findall('connection')[0]
            self.host = cn_xml.get('host', None)
            self.port = int(cn_xml.get('port', 6000))
            self.is_secure = cn_xml.get('is_secure', True)
            self.conn_path = cn_xml.get('conn_path', '/')
            c_xml = config_xml.findall('cache')[0]
            self.cache_size = float(c_xml.get('size', -1))
            self.cache_path = c_xml.get('path')
        except Exception:
            # Toss it back up after logging, we can't continue loading at this point.
            log.exception(
                "Malformed ObjectStore Configuration XML -- unable to continue"
            )
            raise
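
For reference, a configuration fragment that `_parse_config_xml` above would accept might look like the following; every attribute value is a placeholder, not a working credential:

from xml.etree import ElementTree

S3_CONFIG = """
<object_store type="s3">
    <auth access_key="AKIA..." secret_key="..." />
    <bucket name="galaxy-data" use_reduced_redundancy="False" />
    <connection host="s3.amazonaws.com" port="443" is_secure="True" conn_path="/" />
    <cache size="100" path="database/object_store_cache" />
</object_store>
"""

config_xml = ElementTree.fromstring(S3_CONFIG)
assert config_xml.findall('bucket')[0].get('name') == 'galaxy-data'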

    def __cache_monitor(self):
        time.sleep(2)  # Wait for things to load before starting the monitor
        while self.running:
            total_size = 0
            # Is this going to be too expensive of an operation to be done frequently?
            file_list = []
            for dirpath, dirnames, filenames in os.walk(self.staging_path):
                for f in filenames:
                    fp = os.path.join(dirpath, f)
                    file_size = os.path.getsize(fp)
                    total_size += file_size
                    # Get the time given file was last accessed
                    last_access_time = time.localtime(os.stat(fp)[7])
                    # Compose a tuple of the access time and the file path
                    file_tuple = last_access_time, fp, file_size
                    file_list.append(file_tuple)
            # Sort the file list (based on access time)
            file_list.sort()
            # Initiate cleaning once within 10% of the defined cache size?
            cache_limit = self.cache_size * 0.9
            if total_size > cache_limit:
                log.info(
                    "Initiating cache cleaning: current cache size: %s; clean until smaller than: %s"
                    % (convert_bytes(total_size), convert_bytes(cache_limit)))
                # How much to delete? If simply deleting up to the cache-10% limit,
                # is likely to be deleting frequently and may run the risk of hitting
                # the limit - maybe delete additional #%?
                # For now, delete enough to leave at least 10% of the total cache free
                delete_this_much = total_size - cache_limit
                self.__clean_cache(file_list, delete_this_much)
            self.sleeper.sleep(30)  # Test cache size every 30 seconds?
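
The eviction math above is easiest to see with concrete (illustrative) numbers:

GB = 1073741824
cache_size = 10 * GB            # configured cache size
cache_limit = cache_size * 0.9  # start cleaning within 10% of capacity
total_size = int(9.5 * GB)      # current cache usage found by the walk
delete_this_much = total_size - cache_limit
print(delete_this_much / GB)    # -> 0.5 GB to evict, oldest files first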

    def __clean_cache(self, file_list, delete_this_much):
        """ Keep deleting files from the file_list until the size of the deleted
        files is greater than the value in delete_this_much parameter.

        :type file_list: list
        :param file_list: List of candidate files that can be deleted. This method
            will start deleting files from the beginning of the list so the list
            should be sorted accordingly. The list must contain 3-element tuples,
            positioned as follows: position 0 holds the file's last-accessed timestamp
            (as time.struct_time), position 1 holds the file path, and position 2 holds
            the file size (e.g., (<access time>, '/mnt/data/dataset_1.dat', 472394))

        :type delete_this_much: int
        :param delete_this_much: Total size of files, in bytes, that should be deleted.
        """
        # Keep deleting datasets from file_list until deleted_amount does not
        # exceed delete_this_much; start deleting from the front of the file list,
        # which assumes the oldest files come first on the list.
        deleted_amount = 0
        for i, f in enumerate(file_list):
            if deleted_amount < delete_this_much:
                deleted_amount += f[2]
                os.remove(f[1])
                # Debugging code for printing deleted files' stats
                # folder, file_name = os.path.split(f[1])
                # file_date = time.strftime("%m/%d/%y %H:%M:%S", f[0])
                # log.debug("%s. %-25s %s, size %s (deleted %s/%s)" \
                #     % (i, file_name, convert_bytes(f[2]), file_date, \
                #     convert_bytes(deleted_amount), convert_bytes(delete_this_much)))
            else:
                log.debug("Cache cleaning done. Total space freed: %s" %
                          convert_bytes(deleted_amount))
                return

    def _get_bucket(self, bucket_name):
        """ Sometimes a handle to a bucket is not established right away so try
        it a few times. Raise error is connection is not established. """
        for i in range(5):
            try:
                bucket = self.conn.get_bucket(bucket_name)
                log.debug("Using cloud object store with bucket '%s'" %
                          bucket.name)
                return bucket
            except S3ResponseError:
                log.debug("Could not get bucket '%s', attempt %s/5" %
                          (bucket_name, i + 1))
                time.sleep(2)
        # All the attempts have been exhausted and connection was not established,
        # raise error
        # S3ResponseError takes (status, reason); raising the bare class would fail
        raise S3ResponseError(404, "Could not get bucket '%s'" % bucket_name)

    def _fix_permissions(self, rel_path):
        """ Set permissions on rel_path"""
        for basedir, dirs, files in os.walk(rel_path):
            umask_fix_perms(basedir, self.config.umask, 0o777, self.config.gid)
            for f in files:
                path = os.path.join(basedir, f)
                # Ignore symlinks
                if os.path.islink(path):
                    continue
                umask_fix_perms(path, self.config.umask, 0o666, self.config.gid)

    def _construct_path(self,
                        obj,
                        dir_only=None,
                        extra_dir=None,
                        extra_dir_at_root=False,
                        alt_name=None,
                        **kwargs):
        rel_path = os.path.join(*directory_hash_id(obj.id))
        if extra_dir is not None:
            if extra_dir_at_root:
                rel_path = os.path.join(extra_dir, rel_path)
            else:
                rel_path = os.path.join(rel_path, extra_dir)
        # S3 folders are marked by having trailing '/' so add it now
        rel_path = '%s/' % rel_path
        if not dir_only:
            rel_path = os.path.join(
                rel_path, alt_name if alt_name else "dataset_%s.dat" % obj.id)
        return rel_path
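
`_construct_path` delegates the id-to-directory mapping to Galaxy's `directory_hash_id`; the sketch below is an approximation of that helper for orientation, not its exact source:

def directory_hash_id(obj_id):
    """Approximate sketch: spread datasets over ~1000-per-leaf directories."""
    s = str(obj_id)
    if len(s) < 4:
        return ["000"]
    # Zero-pad to a multiple of three digits, then drop the last three
    padded = ((3 - len(s) % 3) % 3) * "0" + s
    padded = padded[:-3]
    return [padded[i * 3:(i + 1) * 3] for i in range(len(padded) // 3)]

# e.g. directory_hash_id(12345) -> ['012'], so the key above becomes
# '012/dataset_12345.dat'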

    def _get_cache_path(self, rel_path):
        return os.path.abspath(os.path.join(self.staging_path, rel_path))

    def _get_transfer_progress(self):
        return self.transfer_progress

    def _get_size_in_s3(self, rel_path):
        try:
            key = self.bucket.get_key(rel_path)
            if key:
                return key.size
        except S3ResponseError as ex:
            log.error("Could not get size of key '%s' from S3: %s" %
                      (rel_path, ex))
        except Exception as ex:
            log.error(
                "Could not get reference to the key object '%s'; returning -1 for key size: %s"
                % (rel_path, ex))
        return -1
Beispiel #11
0
class Cloud(ObjectStore, CloudConfigMixin):
    """
    Object store that stores objects as items in cloud storage. A local
    cache exists that is used as an intermediate location for files between
    Galaxy and the cloud storage.
    """
    store_type = 'cloud'

    def __init__(self, config, config_dict):
        super(Cloud, self).__init__(config, config_dict)
        self.transfer_progress = 0

        bucket_dict = config_dict['bucket']
        connection_dict = config_dict.get('connection', {})
        cache_dict = config_dict['cache']

        self.provider = config_dict["provider"]
        self.credentials = config_dict["auth"]
        self.bucket_name = bucket_dict.get('name')
        self.use_rr = bucket_dict.get('use_reduced_redundancy', False)
        self.max_chunk_size = bucket_dict.get('max_chunk_size', 250)

        self.host = connection_dict.get('host', None)
        self.port = connection_dict.get('port', 6000)
        self.multipart = connection_dict.get('multipart', True)
        self.is_secure = connection_dict.get('is_secure', True)
        self.conn_path = connection_dict.get('conn_path', '/')

        self.cache_size = cache_dict.get('size', -1)
        self.staging_path = cache_dict.get('path') or self.config.object_store_cache_path

        self._initialize()

    def _initialize(self):
        if CloudProviderFactory is None:
            raise Exception(NO_CLOUDBRIDGE_ERROR_MESSAGE)

        self.conn = self._get_connection(self.provider, self.credentials)
        self.bucket = self._get_bucket(self.bucket_name)
        # Clean cache only if value is set in galaxy.ini
        if self.cache_size != -1:
            # Convert GBs to bytes for comparison
            self.cache_size = self.cache_size * 1073741824
            # Helper for interruptable sleep
            self.sleeper = Sleeper()
            self.cache_monitor_thread = threading.Thread(target=self.__cache_monitor)
            self.cache_monitor_thread.start()
            log.info("Cache cleaner manager started")
        # Test if 'axel' is available for parallel download and pull the key into cache
        try:
            subprocess.call('axel')
            self.use_axel = True
        except OSError:
            self.use_axel = False

    @staticmethod
    def _get_connection(provider, credentials):
        log.debug("Configuring `{}` Connection".format(provider))
        if provider == "aws":
            config = {"aws_access_key": credentials["access_key"],
                      "aws_secret_key": credentials["secret_key"]}
            connection = CloudProviderFactory().create_provider(ProviderList.AWS, config)
        elif provider == "azure":
            config = {"azure_subscription_id": credentials["subscription_id"],
                      "azure_client_id": credentials["client_id"],
                      "azure_secret": credentials["secret"],
                      "azure_tenant": credentials["tenant"]}
            connection = CloudProviderFactory().create_provider(ProviderList.AZURE, config)
        elif provider == "google":
            config = {"gcp_service_creds_file": credentials["credentials_file"]}
            connection = CloudProviderFactory().create_provider(ProviderList.GCP, config)
        else:
            raise Exception("Unsupported provider `{}`.".format(provider))

        # Ideally it would be better to assert if the connection is
        # authorized to perform operations required by ObjectStore
        # before returning it (and initializing ObjectStore); hence
        # any related issues can be handled properly here, and ObjectStore
        # can "trust" the connection is established.
        #
        # However, the mechanism implemented in Cloudbridge to assert if
        # a user/service is authorized to perform an operation, assumes
        # the user/service is granted with an elevated privileges, such
        # as admin/owner-level access to all resources. For a detailed
        # discussion see:
        #
        # https://github.com/CloudVE/cloudbridge/issues/135
        #
        # Hence, if a resource owner wants to only authorize Galaxy to r/w
        # a bucket/container on the provider, but does not allow it to access
        # other resources, Cloudbridge may fail asserting credentials.
        # For instance, to r/w an Amazon S3 bucket, the resource owner
        # also needs to authorize full access to Amazon EC2, because Cloudbridge
        # leverages EC2-specific functions to assert the credentials.
        #
        # Therefore, to adhere with principle of least privilege, we do not
        # assert credentials; instead, we handle exceptions raised as a
        # result of signing API calls to cloud provider (e.g., GCP) using
        # incorrect, invalid, or unauthorized credentials.

        return connection
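
A usage sketch for the factory above; the credential values are placeholders and CloudBridge must be installed:

# Hypothetical credentials; the dict shape matches the 'aws' branch above
aws_credentials = {"access_key": "AKIA...", "secret_key": "..."}
conn = Cloud._get_connection("aws", aws_credentials)
# The returned provider exposes storage.buckets, as used elsewhere in this class
bucket = conn.storage.buckets.get("galaxy-data")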

    @classmethod
    def parse_xml(clazz, config_xml):
        # The following reads common cloud-based storage configuration
        # as implemented for the S3 backend. Hence, it also attempts to
        # parse S3-specific configuration (e.g., credentials); however,
        # such provider-specific configuration is overwritten in the
        # following.
        config = parse_config_xml(config_xml)

        try:
            provider = config_xml.attrib.get("provider")
            if provider is None:
                msg = "Missing `provider` attribute from the Cloud backend of the ObjectStore."
                log.error(msg)
                raise Exception(msg)
            provider = provider.lower()
            config["provider"] = provider

            # Read any provider-specific configuration.
            auth_element = config_xml.findall("auth")[0]
            missing_config = []
            if provider == "aws":
                akey = auth_element.get("access_key")
                if akey is None:
                    missing_config.append("access_key")
                skey = auth_element.get("secret_key")
                if skey is None:
                    missing_config.append("secret_key")

                config["auth"] = {
                    "access_key": akey,
                    "secret_key": skey}
            elif provider == "azure":
                sid = auth_element.get("subscription_id")
                if sid is None:
                    missing_config.append("subscription_id")
                cid = auth_element.get("client_id")
                if cid is None:
                    missing_config.append("client_id")
                sec = auth_element.get("secret")
                if sec is None:
                    missing_config.append("secret")
                ten = auth_element.get("tenant")
                if ten is None:
                    missing_config.append("tenant")
                config["auth"] = {
                    "subscription_id": sid,
                    "client_id": cid,
                    "secret": sec,
                    "tenant": ten}
            elif provider == "google":
                cre = auth_element.get("credentials_file")
                if not os.path.isfile(cre):
                    msg = "The following file specified for GCP credentials not found: {}".format(cre)
                    log.error(msg)
                    raise IOError(msg)
                if cre is None:
                    missing_config.append("credentials_file")
                config["auth"] = {
                    "credentials_file": cre}
            else:
                msg = "Unsupported provider `{}`.".format(provider)
                log.error(msg)
                raise Exception(msg)

            if len(missing_config) > 0:
                msg = "The following configuration required for {} cloud backend " \
                      "are missing: {}".format(provider, missing_config)
                log.error(msg)
                raise Exception(msg)
            else:
                return config
        except Exception:
            log.exception("Malformed ObjectStore Configuration XML -- unable to continue")
            raise
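
As with the S3 backend, a hypothetical XML fragment that this `parse_xml` is designed for could look like the following for the AWS provider (values are placeholders; the shared `parse_config_xml` helper may expect further elements such as `<connection>`):

CLOUD_CONFIG = """
<object_store type="cloud" provider="aws">
    <auth access_key="AKIA..." secret_key="..." />
    <bucket name="galaxy-data" use_reduced_redundancy="False" />
    <connection host="" port="6000" is_secure="True" conn_path="/" />
    <cache size="100" path="database/object_store_cache" />
</object_store>
"""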

    def to_dict(self):
        as_dict = super(Cloud, self).to_dict()
        as_dict.update(self._config_to_dict())
        return as_dict

    def __cache_monitor(self):
        time.sleep(2)  # Wait for things to load before starting the monitor
        while self.running:
            total_size = 0
            # Is this going to be too expensive of an operation to be done frequently?
            file_list = []
            for dirpath, _, filenames in os.walk(self.staging_path):
                for filename in filenames:
                    filepath = os.path.join(dirpath, filename)
                    file_size = os.path.getsize(filepath)
                    total_size += file_size
                    # Get the time given file was last accessed
                    last_access_time = time.localtime(os.stat(filepath)[7])
                    # Compose a tuple of the access time and the file path
                    file_tuple = last_access_time, filepath, file_size
                    file_list.append(file_tuple)
            # Sort the file list (based on access time)
            file_list.sort()
            # Initiate cleaning once within 10% of the defined cache size?
            cache_limit = self.cache_size * 0.9
            if total_size > cache_limit:
                log.info("Initiating cache cleaning: current cache size: %s; clean until smaller than: %s",
                         convert_bytes(total_size), convert_bytes(cache_limit))
                # How much to delete? If simply deleting up to the cache-10% limit,
                # is likely to be deleting frequently and may run the risk of hitting
                # the limit - maybe delete additional #%?
                # For now, delete enough to leave at least 10% of the total cache free
                delete_this_much = total_size - cache_limit
                self.__clean_cache(file_list, delete_this_much)
            self.sleeper.sleep(30)  # Test cache size every 30 seconds?

    def __clean_cache(self, file_list, delete_this_much):
        """ Keep deleting files from the file_list until the size of the deleted
        files is greater than the value in delete_this_much parameter.

        :type file_list: list
        :param file_list: List of candidate files that can be deleted. This method
            will start deleting files from the beginning of the list so the list
            should be sorted accordingly. The list must contain 3-element tuples,
            positioned as follows: position 0 holds the file's last-accessed timestamp
            (as time.struct_time), position 1 holds the file path, and position 2 holds
            the file size (e.g., (<access time>, '/mnt/data/dataset_1.dat', 472394))

        :type delete_this_much: int
        :param delete_this_much: Total size of files, in bytes, that should be deleted.
        """
        # Keep deleting datasets from file_list until deleted_amount does not
        # exceed delete_this_much; start deleting from the front of the file list,
        # which assumes the oldest files come first on the list.
        deleted_amount = 0
        for entry in file_list:
            if deleted_amount < delete_this_much:
                deleted_amount += entry[2]
                os.remove(entry[1])
                # Debugging code for printing deleted files' stats
                # folder, file_name = os.path.split(f[1])
                # file_date = time.strftime("%m/%d/%y %H:%M:%S", f[0])
                # log.debug("%s. %-25s %s, size %s (deleted %s/%s)" \
                #     % (i, file_name, convert_bytes(f[2]), file_date, \
                #     convert_bytes(deleted_amount), convert_bytes(delete_this_much)))
            else:
                log.debug("Cache cleaning done. Total space freed: %s", convert_bytes(deleted_amount))
                return

    def _get_bucket(self, bucket_name):
        try:
            bucket = self.conn.storage.buckets.get(bucket_name)
            if bucket is None:
                log.debug("Bucket not found, creating a bucket with handle '%s'", bucket_name)
                bucket = self.conn.storage.buckets.create(bucket_name)
            log.debug("Using cloud ObjectStore with bucket '%s'", bucket.name)
            return bucket
        except InvalidNameException:
            log.exception("Invalid bucket name -- unable to continue")
            raise
        except Exception:
            # These two generic exceptions will be replaced by specific exceptions
            # once proper exceptions are exposed by CloudBridge.
            log.exception("Could not get bucket '{}'".format(bucket_name))
        raise Exception

    def _fix_permissions(self, rel_path):
        """ Set permissions on rel_path"""
        for basedir, _, files in os.walk(rel_path):
            umask_fix_perms(basedir, self.config.umask, 0o777, self.config.gid)
            for filename in files:
                path = os.path.join(basedir, filename)
                # Ignore symlinks
                if os.path.islink(path):
                    continue
                umask_fix_perms(path, self.config.umask, 0o666, self.config.gid)

    def _construct_path(self, obj, base_dir=None, dir_only=None, extra_dir=None, extra_dir_at_root=False, alt_name=None,
                        obj_dir=False, **kwargs):
        # extra_dir should never be constructed from provided data but just
        # make sure there are no shenanigans afoot
        if extra_dir and extra_dir != os.path.normpath(extra_dir):
            log.warning('extra_dir is not normalized: %s', extra_dir)
            raise ObjectInvalid("The requested object is invalid")
        # ensure that any parent directory references in alt_name would not
        # result in a path not contained in the directory path constructed here
        if alt_name:
            if not safe_relpath(alt_name):
                log.warning('alt_name would locate path outside dir: %s', alt_name)
                raise ObjectInvalid("The requested object is invalid")
            # alt_name can contain parent directory references, but S3 will not
            # follow them, so if they are valid we normalize them out
            alt_name = os.path.normpath(alt_name)
        rel_path = os.path.join(*directory_hash_id(obj.id))
        if extra_dir is not None:
            if extra_dir_at_root:
                rel_path = os.path.join(extra_dir, rel_path)
            else:
                rel_path = os.path.join(rel_path, extra_dir)

        # for JOB_WORK directory
        if obj_dir:
            rel_path = os.path.join(rel_path, str(obj.id))
        if base_dir:
            base = self.extra_dirs.get(base_dir)
            return os.path.join(base, rel_path)

        # S3 folders are marked by having trailing '/' so add it now
        rel_path = '%s/' % rel_path

        if not dir_only:
            rel_path = os.path.join(rel_path, alt_name if alt_name else "dataset_%s.dat" % obj.id)
        return rel_path
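
The `safe_relpath` guard above comes from Galaxy's utilities; a minimal sketch of the assumed behavior (reject absolute paths and anything that escapes upward) is:

import os

def safe_relpath(path):
    """Sketch: True only if `path` stays inside its containing directory."""
    return not (os.path.isabs(path) or
                os.path.normpath(path).startswith(os.pardir))

assert safe_relpath('dataset_1_files/extra.dat')
assert not safe_relpath('../../etc/passwd')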

    def _get_cache_path(self, rel_path):
        return os.path.abspath(os.path.join(self.staging_path, rel_path))

    def _get_transfer_progress(self):
        return self.transfer_progress

    def _get_size_in_cloud(self, rel_path):
        try:
            obj = self.bucket.objects.get(rel_path)
            if obj:
                return obj.size
        except Exception:
            log.exception("Could not get size of key '%s' from S3", rel_path)
            return -1

    def _key_exists(self, rel_path):
        exists = False
        try:
            # A hackish way of testing if the rel_path is a folder vs a file
            is_dir = rel_path[-1] == '/'
            if is_dir:
                keyresult = self.bucket.objects.list(prefix=rel_path)
                exists = len(keyresult) > 0
            else:
                exists = self.bucket.objects.get(rel_path) is not None
        except Exception:
            log.exception("Trouble checking existence of S3 key '%s'", rel_path)
            return False
        # An absolute rel_path indicates a path-construction bug upstream
        if rel_path[0] == '/':
            raise ValueError("rel_path must be relative: %s" % rel_path)
        return exists

    def _in_cache(self, rel_path):
        """ Check if the given dataset is in the local cache and return True if so. """
        # log.debug("------ Checking cache for rel_path %s" % rel_path)
        cache_path = self._get_cache_path(rel_path)
        return os.path.exists(cache_path)

    def _pull_into_cache(self, rel_path):
        # Ensure the cache directory structure exists (e.g., dataset_#_files/)
        rel_path_dir = os.path.dirname(rel_path)
        if not os.path.exists(self._get_cache_path(rel_path_dir)):
            os.makedirs(self._get_cache_path(rel_path_dir))
        # Now pull in the file
        file_ok = self._download(rel_path)
        self._fix_permissions(self._get_cache_path(rel_path_dir))
        return file_ok

    def _transfer_cb(self, complete, total):
        self.transfer_progress += 10

    def _download(self, rel_path):
        try:
            log.debug("Pulling key '%s' into cache to %s", rel_path, self._get_cache_path(rel_path))
            key = self.bucket.objects.get(rel_path)
            # Test if cache is large enough to hold the new file
            if self.cache_size > 0 and key.size > self.cache_size:
                log.critical("File %s is larger (%s) than the cache size (%s). Cannot download.",
                             rel_path, key.size, self.cache_size)
                return False
            if self.use_axel:
                log.debug("Parallel pulling key '%s' into cache to %s", rel_path, self._get_cache_path(rel_path))
                ncores = multiprocessing.cpu_count()
                url = key.generate_url(7200)
                # Pass arguments as a list so the URL is not shell-parsed
                ret_code = subprocess.call(['axel', '-a', '-n', str(ncores), url])
                if ret_code == 0:
                    return True
            else:
                log.debug("Pulled key '%s' into cache to %s", rel_path, self._get_cache_path(rel_path))
                self.transfer_progress = 0  # Reset transfer progress counter
                with open(self._get_cache_path(rel_path), "w+") as downloaded_file_handle:
                    key.save_content(downloaded_file_handle)
                return True
        except Exception:
            log.exception("Problem downloading key '%s' from S3 bucket '%s'", rel_path, self.bucket.name)
        return False

    def _push_to_os(self, rel_path, source_file=None, from_string=None):
        """
        Push the file pointed to by ``rel_path`` to the object store naming the key
        ``rel_path``. If ``source_file`` is provided, push that file instead while
        still using ``rel_path`` as the key name.
        If ``from_string`` is provided, set contents of the file to the value of
        the string.
        """
        try:
            source_file = source_file if source_file else self._get_cache_path(rel_path)
            if os.path.exists(source_file):
                if os.path.getsize(source_file) == 0 and (self.bucket.objects.get(rel_path) is not None):
                    log.debug("Wanted to push file '%s' to S3 key '%s' but its size is 0; skipping.", source_file,
                              rel_path)
                    return True
                if from_string:
                    if not self.bucket.objects.get(rel_path):
                        created_obj = self.bucket.objects.create(rel_path)
                        created_obj.upload(source_file)
                    else:
                        self.bucket.objects.get(rel_path).upload(source_file)
                    log.debug("Pushed data from string '%s' to key '%s'", from_string, rel_path)
                else:
                    start_time = datetime.now()
                    log.debug("Pushing cache file '%s' of size %s bytes to key '%s'", source_file,
                              os.path.getsize(source_file), rel_path)
                    self.transfer_progress = 0  # Reset transfer progress counter
                    if not self.bucket.objects.get(rel_path):
                        created_obj = self.bucket.objects.create(rel_path)
                        created_obj.upload_from_file(source_file)
                    else:
                        self.bucket.objects.get(rel_path).upload_from_file(source_file)

                    end_time = datetime.now()
                    log.debug("Pushed cache file '%s' to key '%s' (%s bytes transfered in %s sec)",
                              source_file, rel_path, os.path.getsize(source_file), end_time - start_time)
                return True
            else:
                log.error("Tried updating key '%s' from source file '%s', but source file does not exist.",
                          rel_path, source_file)
        except Exception:
            log.exception("Trouble pushing S3 key '%s' from file '%s'", rel_path, source_file)
        return False

    def file_ready(self, obj, **kwargs):
        """
        A helper method that checks if a file corresponding to a dataset is
        ready and available to be used. Return ``True`` if so, ``False`` otherwise.
        """
        rel_path = self._construct_path(obj, **kwargs)
        # Make sure the size in cache is available in its entirety
        if self._in_cache(rel_path):
            if os.path.getsize(self._get_cache_path(rel_path)) == self._get_size_in_cloud(rel_path):
                return True
            log.debug("Waiting for dataset %s to transfer from OS: %s/%s", rel_path,
                      os.path.getsize(self._get_cache_path(rel_path)), self._get_size_in_cloud(rel_path))
        return False

    def exists(self, obj, **kwargs):
        in_cache = False
        rel_path = self._construct_path(obj, **kwargs)

        # Check cache
        if self._in_cache(rel_path):
            in_cache = True
        # Check cloud
        in_cloud = self._key_exists(rel_path)
        # log.debug("~~~~~~ File '%s' exists in cache: %s; in s3: %s" % (rel_path, in_cache, in_s3))
        # dir_only does not get synced so shortcut the decision
        dir_only = kwargs.get('dir_only', False)
        base_dir = kwargs.get('base_dir', None)
        if dir_only:
            if in_cache or in_cloud:
                return True
            # for JOB_WORK directory
            elif base_dir:
                if not os.path.exists(rel_path):
                    os.makedirs(rel_path)
                return True
            else:
                return False

        # TODO: Sync should probably not be done here. Add this to an async upload stack?
        if in_cache and not in_cloud:
            self._push_to_os(rel_path, source_file=self._get_cache_path(rel_path))
            return True
        elif in_cloud:
            return True
        else:
            return False

    def create(self, obj, **kwargs):
        if not self.exists(obj, **kwargs):

            # Pull out locally used fields
            extra_dir = kwargs.get('extra_dir', None)
            extra_dir_at_root = kwargs.get('extra_dir_at_root', False)
            dir_only = kwargs.get('dir_only', False)
            alt_name = kwargs.get('alt_name', None)

            # Construct hashed path
            rel_path = os.path.join(*directory_hash_id(obj.id))

            # Optionally append extra_dir
            if extra_dir is not None:
                if extra_dir_at_root:
                    rel_path = os.path.join(extra_dir, rel_path)
                else:
                    rel_path = os.path.join(rel_path, extra_dir)

            # Create given directory in cache
            cache_dir = os.path.join(self.staging_path, rel_path)
            if not os.path.exists(cache_dir):
                os.makedirs(cache_dir)

            if not dir_only:
                rel_path = os.path.join(rel_path, alt_name if alt_name else "dataset_%s.dat" % obj.id)
                open(os.path.join(self.staging_path, rel_path), 'w').close()
                self._push_to_os(rel_path, from_string='')

    def empty(self, obj, **kwargs):
        if self.exists(obj, **kwargs):
            # A dataset is empty when its size is zero
            return self.size(obj, **kwargs) == 0
        else:
            raise ObjectNotFound('objectstore.empty, object does not exist: %s, kwargs: %s'
                                 % (str(obj), str(kwargs)))

    def size(self, obj, **kwargs):
        rel_path = self._construct_path(obj, **kwargs)
        if self._in_cache(rel_path):
            try:
                return os.path.getsize(self._get_cache_path(rel_path))
            except OSError as ex:
                log.info("Could not get size of file '%s' in local cache, will try cloud. Error: %s", rel_path, ex)
        elif self.exists(obj, **kwargs):
            return self._get_size_in_cloud(rel_path)
        log.warning("Did not find dataset '%s', returning 0 for size", rel_path)
        return 0

    def delete(self, obj, entire_dir=False, **kwargs):
        rel_path = self._construct_path(obj, **kwargs)
        extra_dir = kwargs.get('extra_dir', None)
        base_dir = kwargs.get('base_dir', None)
        dir_only = kwargs.get('dir_only', False)
        obj_dir = kwargs.get('obj_dir', False)
        try:
            # Remove temporary data in the JOB_WORK directory
            if base_dir and dir_only and obj_dir:
                shutil.rmtree(os.path.abspath(rel_path))
                return True

            # For the case of extra_files, because we don't have a reference to
            # individual files/keys we need to remove the entire directory structure
            # with all the files in it. This is easy for the local file system,
            # but requires iterating through each individual key in S3 and deleting it.
            if entire_dir and extra_dir:
                shutil.rmtree(self._get_cache_path(rel_path))
                results = self.bucket.objects.list(prefix=rel_path)
                for key in results:
                    log.debug("Deleting key %s", key.name)
                    key.delete()
                return True
            else:
                # Delete from cache first
                os.unlink(self._get_cache_path(rel_path))
                # Delete from S3 as well
                if self._key_exists(rel_path):
                    key = self.bucket.objects.get(rel_path)
                    log.debug("Deleting key %s", key.name)
                    key.delete()
                    return True
        except OSError:
            log.exception('%s delete error', self.get_filename(obj, **kwargs))
        except Exception:
            log.exception("Could not delete key '%s' from cloud", rel_path)
        return False

    def get_data(self, obj, start=0, count=-1, **kwargs):
        rel_path = self._construct_path(obj, **kwargs)
        # Check cache first and get file if not there
        if not self._in_cache(rel_path):
            self._pull_into_cache(rel_path)
        # Read the file content from cache
        with open(self._get_cache_path(rel_path), 'r') as data_file:
            data_file.seek(start)
            content = data_file.read(count)
        return content

    def get_filename(self, obj, **kwargs):
        base_dir = kwargs.get('base_dir', None)
        dir_only = kwargs.get('dir_only', False)
        obj_dir = kwargs.get('obj_dir', False)
        rel_path = self._construct_path(obj, **kwargs)

        # for JOB_WORK directory
        if base_dir and dir_only and obj_dir:
            return os.path.abspath(rel_path)

        cache_path = self._get_cache_path(rel_path)
        # S3 does not recognize directories as files so cannot check if those exist.
        # So, if checking dir only, ensure given dir exists in cache and return
        # the expected cache path.
        # dir_only = kwargs.get('dir_only', False)
        # if dir_only:
        #     if not os.path.exists(cache_path):
        #         os.makedirs(cache_path)
        #     return cache_path
        # Check if the file exists in the cache first
        if self._in_cache(rel_path):
            return cache_path
        # Check if the file exists in persistent storage and, if it does, pull it into cache
        elif self.exists(obj, **kwargs):
            if dir_only:  # Directories do not get pulled into cache
                return cache_path
            else:
                if self._pull_into_cache(rel_path):
                    return cache_path
        # For the case of retrieving a directory only, return the expected path
        # even if it does not exist.
        # if dir_only:
        #     return cache_path
        raise ObjectNotFound('objectstore.get_filename, no cache_path: %s, kwargs: %s'
                             % (str(obj), str(kwargs)))
        # return cache_path # Until the upload tool does not explicitly create the dataset, return expected path

    def update_from_file(self, obj, file_name=None, create=False, **kwargs):
        if create:
            self.create(obj, **kwargs)
        if self.exists(obj, **kwargs):
            rel_path = self._construct_path(obj, **kwargs)
            # Choose whether to use the dataset file itself or an alternate file
            if file_name:
                source_file = os.path.abspath(file_name)
                # Copy into cache
                cache_file = self._get_cache_path(rel_path)
                try:
                    if source_file != cache_file:
                        # FIXME? Should this be a `move`?
                        shutil.copy2(source_file, cache_file)
                    self._fix_permissions(cache_file)
                except OSError:
                    log.exception("Trouble copying source file '%s' to cache '%s'", source_file, cache_file)
            else:
                source_file = self._get_cache_path(rel_path)
            # Update the file on cloud
            self._push_to_os(rel_path, source_file)
        else:
            raise ObjectNotFound('objectstore.update_from_file, object does not exist: %s, kwargs: %s'
                                 % (str(obj), str(kwargs)))

    def get_object_url(self, obj, **kwargs):
        if self.exists(obj, **kwargs):
            rel_path = self._construct_path(obj, **kwargs)
            try:
                key = self.bucket.objects.get(rel_path)
                return key.generate_url(expires_in=86400)  # 24hrs
            except Exception:
                log.exception("Trouble generating URL for dataset '%s'", rel_path)
        return None
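
A short usage sketch tying the two methods above together; `store` and `obj` are hypothetical stand-ins for a configured `Cloud` instance and a model object with an integer `id`:

# Push a local result file under the object's key, creating it if needed
store.update_from_file(obj, file_name='/tmp/result.dat', create=True)
# Hand out a pre-signed URL, valid for 24 hours per get_object_url above
print(store.get_object_url(obj))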

    def get_store_usage_percent(self):
        return 0.0
Beispiel #12
0
class S3ObjectStore(ObjectStore):
    """
    Object store that stores objects as items in an AWS S3 bucket. A local
    cache exists that is used as an intermediate location for files between
    Galaxy and S3.
    """
    def __init__(self, config, config_xml):
        if boto is None:
            raise Exception(NO_BOTO_ERROR_MESSAGE)
        super(S3ObjectStore, self).__init__(config, config_xml)
        self.config = config
        self.staging_path = self.config.file_path
        self.transfer_progress = 0
        self._parse_config_xml(config_xml)
        self._configure_connection()
        self.bucket = self._get_bucket(self.bucket)
        # Clean cache only if value is set in universe_wsgi.ini
        if self.cache_size != -1:
            # Convert GBs to bytes for comparison
            self.cache_size = self.cache_size * 1073741824
            # Helper for interruptable sleep
            self.sleeper = Sleeper()
            self.cache_monitor_thread = threading.Thread(target=self.__cache_monitor)
            self.cache_monitor_thread.start()
            log.info("Cache cleaner manager started")
        # Test if 'axel' is available for parallel download and pull the key into cache
        try:
            subprocess.call('axel')
            self.use_axel = True
        except OSError:
            self.use_axel = False

    def _configure_connection(self):
        log.debug("Configuring S3 Connection")
        self.conn = S3Connection(self.access_key, self.secret_key)

    def _parse_config_xml(self, config_xml):
        try:
            a_xml = config_xml.findall('auth')[0]
            self.access_key = a_xml.get('access_key')
            self.secret_key = a_xml.get('secret_key')
            b_xml = config_xml.findall('bucket')[0]
            self.bucket = b_xml.get('name')
            self.use_rr = b_xml.get('use_reduced_redundancy', False)
            cn_xml = config_xml.findall('connection')[0]
            self.host = cn_xml.get('host', None)
            self.port = int(cn_xml.get('port', 6000))
            self.is_secure = cn_xml.get('is_secure', True)
            self.conn_path = cn_xml.get('conn_path', '/')
            c_xml = config_xml.findall('cache')[0]
            self.cache_size = float(c_xml.get('size', -1))
            self.cache_path = c_xml.get('path')
        except Exception:
            # Toss it back up after logging, we can't continue loading at this point.
            log.exception("Malformed ObjectStore Configuration XML -- unable to continue")
            raise

    def __cache_monitor(self):
        time.sleep(2)  # Wait for things to load before starting the monitor
        while self.running:
            total_size = 0
            # Is this going to be too expensive of an operation to be done frequently?
            file_list = []
            for dirpath, dirnames, filenames in os.walk(self.staging_path):
                for f in filenames:
                    fp = os.path.join(dirpath, f)
                    file_size = os.path.getsize(fp)
                    total_size += file_size
                    # Get the time given file was last accessed
                    last_access_time = time.localtime(os.stat(fp)[7])
                    # Compose a tuple of the access time and the file path
                    file_tuple = last_access_time, fp, file_size
                    file_list.append(file_tuple)
            # Sort the file list (based on access time)
            file_list.sort()
            # Initiate cleaning once within 10% of the defined cache size?
            cache_limit = self.cache_size * 0.9
            if total_size > cache_limit:
                log.info("Initiating cache cleaning: current cache size: %s; clean until smaller than: %s"
                         % (convert_bytes(total_size), convert_bytes(cache_limit)))
                # How much to delete? If simply deleting up to the cache-10% limit,
                # is likely to be deleting frequently and may run the risk of hitting
                # the limit - maybe delete additional #%?
                # For now, delete enough to leave at least 10% of the total cache free
                delete_this_much = total_size - cache_limit
                self.__clean_cache(file_list, delete_this_much)
            self.sleeper.sleep(30)  # Test cache size every 30 seconds?

    def __clean_cache(self, file_list, delete_this_much):
        """ Keep deleting files from the file_list until the size of the deleted
        files is greater than the value in delete_this_much parameter.

        :type file_list: list
        :param file_list: List of candidate files that can be deleted. This method
            will start deleting files from the beginning of the list so the list
            should be sorted accordingly. The list must contain 3-element tuples,
            positioned as follows: position 0 holds the file's last-accessed timestamp
            (as time.struct_time), position 1 holds the file path, and position 2 holds
            the file size (e.g., (<access time>, '/mnt/data/dataset_1.dat', 472394))

        :type delete_this_much: int
        :param delete_this_much: Total size of files, in bytes, that should be deleted.
        """
        # Keep deleting datasets from file_list until deleted_amount does not
        # exceed delete_this_much; start deleting from the front of the file list,
        # which assumes the oldest files come first on the list.
        deleted_amount = 0
        for i, f in enumerate(file_list):
            if deleted_amount < delete_this_much:
                deleted_amount += f[2]
                os.remove(f[1])
                # Debugging code for printing deleted files' stats
                # folder, file_name = os.path.split(f[1])
                # file_date = time.strftime("%m/%d/%y %H:%M:%S", f[0])
                # log.debug("%s. %-25s %s, size %s (deleted %s/%s)" \
                #     % (i, file_name, convert_bytes(f[2]), file_date, \
                #     convert_bytes(deleted_amount), convert_bytes(delete_this_much)))
            else:
                log.debug("Cache cleaning done. Total space freed: %s" % convert_bytes(deleted_amount))
                return

    def _get_bucket(self, bucket_name):
        """ Sometimes a handle to a bucket is not established right away so try
        it a few times. Raise error is connection is not established. """
        for i in range(5):
            try:
                bucket = self.conn.get_bucket(bucket_name)
                log.debug("Using cloud object store with bucket '%s'" % bucket.name)
                return bucket
            except S3ResponseError:
                log.debug("Could not get bucket '%s', attempt %s/5" % (bucket_name, i + 1))
                time.sleep(2)
        # All the attempts have been exhausted and connection was not established,
        # raise error
        # S3ResponseError takes (status, reason); raising the bare class would fail
        raise S3ResponseError(404, "Could not get bucket '%s'" % bucket_name)

    def _fix_permissions(self, rel_path):
        """ Set permissions on rel_path"""
        for basedir, dirs, files in os.walk(rel_path):
            umask_fix_perms(basedir, self.config.umask, 0o777, self.config.gid)
            for f in files:
                path = os.path.join(basedir, f)
                # Ignore symlinks
                if os.path.islink(path):
                    continue
                umask_fix_perms(path, self.config.umask, 0o666, self.config.gid)

    def _construct_path(self, obj, dir_only=None, extra_dir=None, extra_dir_at_root=False, alt_name=None, **kwargs):
        rel_path = os.path.join(*directory_hash_id(obj.id))
        if extra_dir is not None:
            if extra_dir_at_root:
                rel_path = os.path.join(extra_dir, rel_path)
            else:
                rel_path = os.path.join(rel_path, extra_dir)
        # S3 folders are marked by having trailing '/' so add it now
        rel_path = '%s/' % rel_path
        if not dir_only:
            rel_path = os.path.join(rel_path, alt_name if alt_name else "dataset_%s.dat" % obj.id)
        return rel_path

    def _get_cache_path(self, rel_path):
        return os.path.abspath(os.path.join(self.staging_path, rel_path))

    def _get_transfer_progress(self):
        return self.transfer_progress

    def _get_size_in_s3(self, rel_path):
        try:
            key = self.bucket.get_key(rel_path)
            if key:
                return key.size
        except S3ResponseError as ex:
            log.error("Could not get size of key '%s' from S3: %s", rel_path, ex)
        except Exception as ex:
            log.error("Could not get reference to the key object '%s'; returning -1 for key size: %s", rel_path, ex)
        return -1
Beispiel #13
0
class AzureBlobObjectStore(ObjectStore):
    """
    Object store that stores objects as blobs in an Azure Blob Container. A local
    cache exists that is used as an intermediate location for files between
    Galaxy and Azure.
    """
    def __init__(self, config, config_xml):
        if BlockBlobService is None:
            raise Exception(NO_BLOBSERVICE_ERROR_MESSAGE)
        super(AzureBlobObjectStore, self).__init__(config)

        self.staging_path = self.config.file_path
        self.transfer_progress = 0
        self._parse_config_xml(config_xml)
        self._configure_connection()
        self.container_lease = self._get_container_lease()

        # Clean cache only if value is set in galaxy.ini
        if self.cache_size != -1:
            # Convert GBs to bytes for comparison
            self.cache_size = self.cache_size * 1073741824
            # Helper for interruptible sleep
            self.sleeper = Sleeper()
            self.cache_monitor_thread = threading.Thread(target=self.__cache_monitor)
            self.cache_monitor_thread.start()
            log.info("Cache cleaner manager started")

    ###################
    # Private Methods #
    ###################

    # config_xml is an ElementTree object.
    def _parse_config_xml(self, config_xml):
        try:
            auth_xml = config_xml.find('auth')
            self.account_name = auth_xml.get('account_name')
            self.account_key = auth_xml.get('account_key')
            container_xml = config_xml.find('container')
            self.container_name = container_xml.get('name')
            self.max_chunk_size = int(container_xml.get('max_chunk_size', 250))  # currently unused
            cache_xml = config_xml.find('cache')
            self.cache_size = float(cache_xml.get('size', -1))
            self.staging_path = cache_xml.get('path', self.config.object_store_cache_path)

            for d_xml in config_xml.findall('extra_dir'):
                self.extra_dirs[d_xml.get('type')] = d_xml.get('path')

            log.debug("Object cache dir:    %s", self.staging_path)
            log.debug("       job work dir: %s", self.extra_dirs['job_work'])

        except Exception:
            # Toss it back up after logging, we can't continue loading at this point.
            log.exception("Malformed ObjectStore Configuration XML -- unable to continue")
            raise

    def _configure_connection(self):
        log.debug("Configuring Connection")
        self.account = CloudStorageAccount(self.account_name, self.account_key)
        self.service = self.account.create_block_blob_service()

    def _get_container_lease(self):
        """ Sometimes a handle to a container is not established right away so try
        it a few times. Raise error is connection is not established. """
        for i in range(5):
            try:
                self.service.break_container_lease(self.container_name)
                container_lease = self.service.acquire_container_lease(self.container_name)
                log.debug("Using azure blob store with container '%s'", self.container_name)
                return container_lease
            except AzureHttpError:
                try:
                    log.debug("container not found, creating azure blob store container with name '%s'", self.container_name)
                    self.service.create_container(self.container_name)
                    container_lease = self.service.acquire_container_lease(self.container_name)
                    return container_lease
                except AzureHttpError:
                    log.exception("Could not get container '%s', attempt %s/5", self.container_name, i + 1)
                    time.sleep(2)
        # All the attempts have been exhausted and a connection was not
        # established, so raise an error (AzureHttpError takes a message and
        # a status code)
        raise AzureHttpError("Could not get container '%s' after 5 attempts" % self.container_name, 500)

    def _construct_path(self, obj, base_dir=None, dir_only=None, extra_dir=None, extra_dir_at_root=False, alt_name=None, obj_dir=False, **kwargs):
        # extra_dir should never be constructed from provided data but just
        # make sure there are no shenanigans afoot
        if extra_dir and extra_dir != os.path.normpath(extra_dir):
            log.warning('extra_dir is not normalized: %s', extra_dir)
            raise ObjectInvalid("The requested object is invalid")
        # ensure that any parent directory references in alt_name would not
        # result in a path not contained in the directory path constructed here
        if alt_name:
            if not safe_relpath(alt_name):
                log.warning('alt_name would locate path outside dir: %s', alt_name)
                raise ObjectInvalid("The requested object is invalid")
            # alt_name can contain parent directory references, but S3 will not
            # follow them, so if they are valid we normalize them out
            alt_name = os.path.normpath(alt_name)

        rel_path = os.path.join(*directory_hash_id(obj.id))

        if extra_dir is not None:
            if extra_dir_at_root:
                rel_path = os.path.join(extra_dir, rel_path)
            else:
                rel_path = os.path.join(rel_path, extra_dir)

        # for JOB_WORK directory
        if obj_dir:
            rel_path = os.path.join(rel_path, str(obj.id))
        if base_dir:
            base = self.extra_dirs.get(base_dir)
            return os.path.join(base, rel_path)

        # S3 folders are marked by having trailing '/' so add it now
        # rel_path = '%s/' % rel_path # assume for now we don't need this in Azure blob storage.

        if not dir_only:
            rel_path = os.path.join(rel_path, alt_name if alt_name else "dataset_%s.dat" % obj.id)

        return rel_path

    def _fix_permissions(self, rel_path):
        """ Set permissions on rel_path"""
        for basedir, _, files in os.walk(rel_path):
            umask_fix_perms(basedir, self.config.umask, 0o777, self.config.gid)
            for filename in files:
                path = os.path.join(basedir, filename)
                # Ignore symlinks
                if os.path.islink(path):
                    continue
                umask_fix_perms(path, self.config.umask, 0o666, self.config.gid)

    def _get_cache_path(self, rel_path):
        return os.path.abspath(os.path.join(self.staging_path, rel_path))

    def _get_transfer_progress(self):
        return self.transfer_progress

    def _get_size_in_azure(self, rel_path):
        try:
            properties = self.service.get_blob_properties(self.container_name, rel_path)
            # Currently this returns a blob and not a BlobProperties object
            # Similar issue for the ruby https://github.com/Azure/azure-storage-ruby/issues/13
            # The typecheck is an attempt at future-proofing this when/if the bug is fixed.
            if type(properties) is Blob:
                properties = properties.properties
            if properties:
                size_in_bytes = properties.content_length
                return size_in_bytes
        except AzureHttpError:
            log.exception("Could not get size of blob '%s' from Azure", rel_path)
            return -1

    def _in_azure(self, rel_path):
        try:
            exists = self.service.exists(self.container_name, rel_path)
        except AzureHttpError:
            log.exception("Trouble checking existence of Azure blob '%s'", rel_path)
            return False
        return exists

    def _in_cache(self, rel_path):
        """ Check if the given dataset is in the local cache. """
        cache_path = self._get_cache_path(rel_path)
        return os.path.exists(cache_path)

    def _pull_into_cache(self, rel_path):
        # Ensure the cache directory structure exists (e.g., dataset_#_files/)
        rel_path_dir = os.path.dirname(rel_path)
        if not os.path.exists(self._get_cache_path(rel_path_dir)):
            os.makedirs(self._get_cache_path(rel_path_dir))
        # Now pull in the file
        file_ok = self._download(rel_path)
        self._fix_permissions(self._get_cache_path(rel_path_dir))
        return file_ok

    def _transfer_cb(self, complete, total):
        self.transfer_progress = float(complete) / float(total) * 100  # in percent

    def _download(self, rel_path):
        local_destination = self._get_cache_path(rel_path)
        try:
            log.debug("Pulling '%s' into cache to %s", rel_path, local_destination)
            if self.cache_size > 0 and self._get_size_in_azure(rel_path) > self.cache_size:
                log.critical("File %s is larger (%s) than the cache size (%s). Cannot download.",
                             rel_path, self._get_size_in_azure(rel_path), self.cache_size)
                return False
            else:
                self.transfer_progress = 0  # Reset transfer progress counter
                self.service.get_blob_to_path(self.container_name, rel_path, local_destination, progress_callback=self._transfer_cb)
                return True
        except AzureHttpError:
            log.exception("Problem downloading '%s' from Azure", rel_path)
        return False

    def _push_to_os(self, rel_path, source_file=None, from_string=None):
        """
        Push the file pointed to by ``rel_path`` to the object store naming the blob
        ``rel_path``. If ``source_file`` is provided, push that file instead while
        still using ``rel_path`` as the blob name.
        If ``from_string`` is provided, set contents of the file to the value of
        the string.
        """
        try:
            source_file = source_file or self._get_cache_path(rel_path)

            if not os.path.exists(source_file):
                log.error("Tried updating blob '%s' from source file '%s', but source file does not exist.", rel_path, source_file)
                return False

            if os.path.getsize(source_file) == 0:
                log.debug("Wanted to push file '%s' to azure blob '%s' but its size is 0; skipping.", source_file, rel_path)
                return True

            if from_string:
                self.service.create_blob_from_text(self.container_name, rel_path, from_string, progress_callback=self._transfer_cb)
                log.debug("Pushed data from string '%s' to blob '%s'", from_string, rel_path)
            else:
                start_time = datetime.now()
                log.debug("Pushing cache file '%s' of size %s bytes to '%s'", source_file, os.path.getsize(source_file), rel_path)
                self.transfer_progress = 0  # Reset transfer progress counter
                self.service.create_blob_from_path(self.container_name, rel_path, source_file, progress_callback=self._transfer_cb)
                end_time = datetime.now()
                log.debug("Pushed cache file '%s' to blob '%s' (%s bytes transfered in %s sec)",
                          source_file, rel_path, os.path.getsize(source_file), end_time - start_time)
            return True

        except AzureHttpError:
            log.exception("Trouble pushing to Azure Blob '%s' from file '%s'", rel_path, source_file)
        return False

    ##################
    # Public Methods #
    ##################

    def exists(self, obj, **kwargs):
        in_cache = in_azure = False
        rel_path = self._construct_path(obj, **kwargs)

        in_cache = self._in_cache(rel_path)
        in_azure = self._in_azure(rel_path)
        # log.debug("~~~~~~ File '%s' exists in cache: %s; in azure: %s" % (rel_path, in_cache, in_azure))
        # dir_only does not get synced so shortcut the decision
        dir_only = kwargs.get('dir_only', False)
        base_dir = kwargs.get('base_dir', None)
        if dir_only:
            if in_cache or in_azure:
                return True
            # for JOB_WORK directory
            elif base_dir:
                if not os.path.exists(rel_path):
                    os.makedirs(rel_path)
                return True
            else:
                return False

        # TODO: Sync should probably not be done here. Add this to an async upload stack?
        if in_cache and not in_azure:
            self._push_to_os(rel_path, source_file=self._get_cache_path(rel_path))
            return True
        elif in_azure:
            return True
        else:
            return False

    def file_ready(self, obj, **kwargs):
        """
        A helper method that checks if a file corresponding to a dataset is
        ready and available to be used. Return ``True`` if so, ``False`` otherwise.
        """
        rel_path = self._construct_path(obj, **kwargs)
        # Make sure the size in cache is available in its entirety
        if self._in_cache(rel_path):
            local_size = os.path.getsize(self._get_cache_path(rel_path))
            remote_size = self._get_size_in_azure(rel_path)
            if local_size == remote_size:
                return True
            else:
                log.debug("Waiting for dataset %s to transfer from OS: %s/%s", rel_path, local_size, remote_size)

        return False

    def create(self, obj, **kwargs):

        if not self.exists(obj, **kwargs):

            # Pull out locally used fields
            extra_dir = kwargs.get('extra_dir', None)
            extra_dir_at_root = kwargs.get('extra_dir_at_root', False)
            dir_only = kwargs.get('dir_only', False)
            alt_name = kwargs.get('alt_name', None)

            # Construct hashed path
            rel_path = os.path.join(*directory_hash_id(obj.id))

            # Optionally append extra_dir
            if extra_dir is not None:
                if extra_dir_at_root:
                    rel_path = os.path.join(extra_dir, rel_path)
                else:
                    rel_path = os.path.join(rel_path, extra_dir)

            # Create given directory in cache
            cache_dir = os.path.join(self.staging_path, rel_path)
            if not os.path.exists(cache_dir):
                os.makedirs(cache_dir)

            # Although not really necessary to create "folders" (because Azure
            # blob storage has a flat namespace), do so for consistency with
            # the regular file system. S3 marks folders with a trailing '/',
            # but assume for now we don't need that in Azure:
            # s3_dir = '%s/' % rel_path
            # self._push_to_os(s3_dir, from_string='')
            # If instructed, create the dataset in cache & in Azure
            if not dir_only:
                rel_path = os.path.join(rel_path, alt_name if alt_name else "dataset_%s.dat" % obj.id)
                open(os.path.join(self.staging_path, rel_path), 'w').close()
                self._push_to_os(rel_path, from_string='')

    def empty(self, obj, **kwargs):
        if self.exists(obj, **kwargs):
            return self.size(obj, **kwargs) == 0
        else:
            raise ObjectNotFound('objectstore.empty, object does not exist: %s, kwargs: %s' % (str(obj), str(kwargs)))

    def size(self, obj, **kwargs):
        rel_path = self._construct_path(obj, **kwargs)
        if self._in_cache(rel_path):
            try:
                return os.path.getsize(self._get_cache_path(rel_path))
            except OSError as ex:
                log.info("Could not get size of file '%s' in local cache, will try Azure. Error: %s", rel_path, ex)
        elif self.exists(obj, **kwargs):
            return self._get_size_in_azure(rel_path)
        log.warning("Did not find dataset '%s', returning 0 for size", rel_path)
        return 0

    def delete(self, obj, entire_dir=False, **kwargs):
        rel_path = self._construct_path(obj, **kwargs)
        extra_dir = kwargs.get('extra_dir', None)
        base_dir = kwargs.get('base_dir', None)
        dir_only = kwargs.get('dir_only', False)
        obj_dir = kwargs.get('obj_dir', False)
        try:
            if base_dir and dir_only and obj_dir:
                # Remove temporary data in JOB_WORK directory
                shutil.rmtree(os.path.abspath(rel_path))
                return True

            # For the case of extra_files, because we don't have a reference to
            # individual files/blobs we need to remove the entire directory structure
            # with all the files in it. This is easy for the local file system,
            # but requires iterating through each individual blob in Azure and deleting it.
            if entire_dir and extra_dir:
                shutil.rmtree(self._get_cache_path(rel_path))
                blobs = self.service.list_blobs(self.container_name, prefix=rel_path)
                for blob in blobs:
                    log.debug("Deleting from Azure: %s", blob)
                    self.service.delete_blob(self.container_name, blob.name)
                return True
            else:
                # Delete from cache first
                os.unlink(self._get_cache_path(rel_path))
                # Delete from Azure as well
                if self._in_azure(rel_path):
                    log.debug("Deleting from Azure: %s", rel_path)
                    self.service.delete_blob(self.container_name, rel_path)
                    return True
        except AzureHttpError:
            log.exception("Could not delete blob '%s' from Azure", rel_path)
        except OSError:
            log.exception('%s delete error', self.get_filename(obj, **kwargs))
        return False

    def get_data(self, obj, start=0, count=-1, **kwargs):
        rel_path = self._construct_path(obj, **kwargs)
        # Check cache first and get file if not there
        if not self._in_cache(rel_path):
            self._pull_into_cache(rel_path)
        # Read the file content from cache
        data_file = open(self._get_cache_path(rel_path), 'r')
        data_file.seek(start)
        content = data_file.read(count)
        data_file.close()
        return content

    def get_filename(self, obj, **kwargs):
        rel_path = self._construct_path(obj, **kwargs)
        base_dir = kwargs.get('base_dir', None)
        dir_only = kwargs.get('dir_only', False)
        obj_dir = kwargs.get('obj_dir', False)

        # for JOB_WORK directory
        if base_dir and dir_only and obj_dir:
            return os.path.abspath(rel_path)

        cache_path = self._get_cache_path(rel_path)
        # Azure does not recognize directories as files so cannot check if those exist.
        # So, if checking dir only, ensure given dir exists in cache and return
        # the expected cache path.
        # dir_only = kwargs.get('dir_only', False)
        # if dir_only:
        #     if not os.path.exists(cache_path):
        #         os.makedirs(cache_path)
        #     return cache_path
        # Check if the file exists in the cache first
        if self._in_cache(rel_path):
            return cache_path
        # Check if the file exists in persistent storage and, if it does, pull it into cache
        elif self.exists(obj, **kwargs):
            if dir_only:  # Directories do not get pulled into cache
                return cache_path
            else:
                if self._pull_into_cache(rel_path):
                    return cache_path
        # For the case of retrieving a directory only, return the expected path
        # even if it does not exist.
        # if dir_only:
        #     return cache_path
        raise ObjectNotFound('objectstore.get_filename, no cache_path: %s, kwargs: %s' % (str(obj), str(kwargs)))

        return cache_path  # Until the upload tool explicitly creates the dataset, return the expected path

    def update_from_file(self, obj, file_name=None, create=False, **kwargs):
        if create is True:
            self.create(obj, **kwargs)
        elif self.exists(obj, **kwargs):
            rel_path = self._construct_path(obj, **kwargs)
            # Choose whether to use the dataset file itself or an alternate file
            if file_name:
                source_file = os.path.abspath(file_name)
                # Copy into cache
                cache_file = self._get_cache_path(rel_path)
                try:
                    if source_file != cache_file:
                        # FIXME? Should this be a `move`?
                        shutil.copy2(source_file, cache_file)
                    self._fix_permissions(cache_file)
                except OSError:
                    log.exception("Trouble copying source file '%s' to cache '%s'", source_file, cache_file)
            else:
                source_file = self._get_cache_path(rel_path)

            self._push_to_os(rel_path, source_file)

        else:
            raise ObjectNotFound('objectstore.update_from_file, object does not exist: %s, kwargs: %s' % (str(obj), str(kwargs)))

    def get_object_url(self, obj, **kwargs):
        if self.exists(obj, **kwargs):
            rel_path = self._construct_path(obj, **kwargs)
            try:
                url = self.service.make_blob_url(container_name=self.container_name, blob_name=rel_path)
                return url
            except AzureHttpError:
                log.exception("Trouble generating URL for dataset '%s'", rel_path)
        return None

    def get_store_usage_percent(self):
        return 0.0

    ##################
    # Secret Methods #
    ##################

    def __cache_monitor(self):
        time.sleep(2)  # Wait for things to load before starting the monitor
        while self.running:
            total_size = 0
            # Is this going to be too expensive an operation to perform frequently?
            file_list = []
            for dirpath, _, filenames in os.walk(self.staging_path):
                for filename in filenames:
                    filepath = os.path.join(dirpath, filename)
                    file_size = os.path.getsize(filepath)
                    total_size += file_size
                    # Get the time given file was last accessed
                    last_access_time = time.localtime(os.stat(filepath).st_atime)
                    # Compose a tuple of the access time and the file path
                    file_tuple = last_access_time, filepath, file_size
                    file_list.append(file_tuple)
            # Sort the file list (based on access time)
            file_list.sort()
            # Initiate cleaning once within 10% of the defined cache size?
            cache_limit = self.cache_size * 0.9
            if total_size > cache_limit:
                log.info("Initiating cache cleaning: current cache size: %s; clean until smaller than: %s",
                         convert_bytes(total_size), convert_bytes(cache_limit))
                # How much to delete? If we only delete down to the cache-10%
                # limit, cleaning is likely to run frequently and may risk
                # hitting the limit again; maybe delete an additional few percent?
                # For now, delete enough to leave at least 10% of the total cache free
                delete_this_much = total_size - cache_limit
                # Keep deleting datasets from file_list until deleted_amount
                # meets or exceeds delete_this_much; start deleting from the
                # front of the file list, which assumes the oldest files come
                # first on the list.
                deleted_amount = 0
                for entry in file_list:
                    if deleted_amount < delete_this_much:
                        deleted_amount += entry[2]
                        os.remove(entry[1])
                        # Debugging code for printing deleted files' stats
                        # folder, file_name = os.path.split(entry[1])
                        # file_date = time.strftime("%m/%d/%y %H:%M:%S", entry[0])
                        # log.debug("%-25s %s, size %s (deleted %s/%s)" \
                        #     % (file_name, convert_bytes(entry[2]), file_date, \
                        #     convert_bytes(deleted_amount), convert_bytes(delete_this_much)))
                    else:
                        log.debug("Cache cleaning done. Total space freed: %s", convert_bytes(deleted_amount))
                        break

            self.sleeper.sleep(30)  # Test cache size every 30 seconds?
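
The _parse_config_xml() method above implies a configuration fragment shaped as follows. A sketch using only the element and attribute names the parser actually reads; the account, container, and path values are hypothetical:

from xml.etree import ElementTree

# Hypothetical object store configuration matching what
# AzureBlobObjectStore._parse_config_xml() reads.
config_xml = ElementTree.fromstring("""
<object_store type="azure_blob">
    <auth account_name="myaccount" account_key="secret" />
    <container name="galaxy-data" max_chunk_size="250" />
    <cache size="100" path="database/object_store_cache" />
    <extra_dir type="job_work" path="database/job_working_directory_azure" />
</object_store>
""")
print(config_xml.find('auth').get('account_name'))  # myaccount
print(float(config_xml.find('cache').get('size')))  # 100.0 (GB; __init__ converts to bytes)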
Beispiel #14
0
class DeferredJobQueue(object):
    job_states = Bunch(READY='ready',
                       WAIT='wait',
                       INVALID='invalid')

    def __init__(self, app):
        self.app = app
        self.sa_session = app.model.context.current
        self.queue = Queue()
        self.plugins = {}
        self._load_plugins()
        self.sleeper = Sleeper()
        self.running = True
        self.waiting_jobs = []
        self.__check_jobs_at_startup()
        self.monitor_thread = threading.Thread(target=self.__monitor)
        self.monitor_thread.start()
        log.info('Deferred job queue started')

    def _load_plugins(self):
        for fname in os.listdir(os.path.dirname(__file__)):
            if not fname.startswith('_') and fname.endswith('.py'):
                name = fname[:-3]
                module_name = 'galaxy.jobs.deferred.' + name
                try:
                    module = __import__(module_name)
                except Exception:
                    log.exception('Deferred job plugin appears to exist but is not loadable: %s', module_name)
                    continue
                for comp in module_name.split(".")[1:]:
                    module = getattr(module, comp)
                if '__all__' not in dir(module):
                    log.error('Plugin "%s" does not contain a list of exported classes in __all__' % module_name)
                    continue
                for obj in module.__all__:
                    display_name = ':'.join((module_name, obj))
                    plugin = getattr(module, obj)
                    for name in ('check_job', 'run_job'):
                        if name not in dir(plugin):
                            log.error('Plugin "%s" does not contain required method "%s()"' % (display_name, name))
                            break
                    else:
                        self.plugins[obj] = plugin(self.app)
                        self.plugins[obj].job_states = self.job_states
                        log.debug('Loaded deferred job plugin: %s' % display_name)

    def __check_jobs_at_startup(self):
        waiting_jobs = self.sa_session.query(model.DeferredJob) \
                                      .filter(model.DeferredJob.state == model.DeferredJob.states.WAITING).all()
        for job in waiting_jobs:
            if not self.__check_job_plugin(job):
                continue
            if 'check_interval' in dir(self.plugins[job.plugin]):
                job.check_interval = self.plugins[job.plugin].check_interval
            log.info('Recovered deferred job (id: %s) at startup' % job.id)
            # Pass the job ID as opposed to the job, since the monitor thread
            # needs to load it in its own threadlocal scoped session.
            self.waiting_jobs.append(job.id)

    def __monitor(self):
        while self.running:
            try:
                self.__monitor_step()
            except Exception:
                log.exception('Exception in monitor_step')
            self.sleeper.sleep(1)
        log.info('job queue stopped')

    def __monitor_step(self):
        # TODO: Querying the database with this frequency is bad, we need message passing
        new_jobs = self.sa_session.query(model.DeferredJob) \
                                  .filter(model.DeferredJob.state == model.DeferredJob.states.NEW).all()
        for job in new_jobs:
            if not self.__check_job_plugin(job):
                continue
            job.state = model.DeferredJob.states.WAITING
            self.sa_session.add(job)
            self.sa_session.flush()
            if 'check_interval' in dir(self.plugins[job.plugin]):
                job.check_interval = self.plugins[job.plugin].check_interval
            self.waiting_jobs.append(job)
        new_waiting = []
        for job in self.waiting_jobs:
            try:
                # Recovered jobs are passed in by ID
                assert type(job) is int
                job = self.sa_session.query(model.DeferredJob).get(job)
            except Exception:
                pass
            if job.is_check_time:
                try:
                    job_state = self.plugins[job.plugin].check_job(job)
                except Exception:
                    self.__fail_job(job)
                    log.exception('Set deferred job %s to error because of an exception in check_job()' % job.id)
                    continue
                if job_state == self.job_states.READY:
                    try:
                        self.plugins[job.plugin].run_job(job)
                    except Exception:
                        self.__fail_job(job)
                        log.exception('Set deferred job %s to error because of an exception in run_job()' % job.id)
                        continue
                elif job_state == self.job_states.INVALID:
                    self.__fail_job(job)
                    log.error('Unable to run deferred job (id: %s): Plugin "%s" marked it as invalid' % (job.id, job.plugin))
                    continue
                else:
                    new_waiting.append(job)
                job.last_check = 'now'
            else:
                new_waiting.append(job)
        self.waiting_jobs = new_waiting

    def __check_job_plugin(self, job):
        if job.plugin not in self.plugins:
            log.error('Invalid deferred job plugin: %s', job.plugin)
            job.state = model.DeferredJob.states.ERROR
            self.sa_session.add(job)
            self.sa_session.flush()
            return False
        return True

    def __check_if_ready_to_run(self, job):
        return self.plugins[job.plugin].check_job(job)

    def __fail_job(self, job):
        job.state = model.DeferredJob.states.ERROR
        self.sa_session.add(job)
        self.sa_session.flush()

    def shutdown(self):
        self.running = False
        self.sleeper.wake()
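
A deferred job plugin that passes the checks in _load_plugins() above must export its class via __all__ and provide check_job() and run_job(); check_interval is optional. A minimal sketch with hypothetical module and class names:

# Hypothetical module galaxy/jobs/deferred/example.py
__all__ = ['ExamplePlugin']


class ExamplePlugin(object):
    check_interval = 60  # optional; copied onto jobs by the queue

    def __init__(self, app):
        self.app = app
        # The queue assigns self.job_states (READY/WAIT/INVALID) right
        # after instantiation, so check_job() can reference it.

    def check_job(self, job):
        # Return one of the queue's job_states values.
        return self.job_states.READY

    def run_job(self, job):
        # Perform the deferred work once check_job() reports READY.
        pass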
Beispiel #15
0
class AzureBlobObjectStore(ObjectStore):
    """
    Object store that stores objects as blobs in an Azure Blob Container. A local
    cache exists that is used as an intermediate location for files between
    Galaxy and Azure.
    """
    def __init__(self, config, config_xml):
        if BlockBlobService is None:
            raise Exception(NO_BLOBSERVICE_ERROR_MESSAGE)
        super(AzureBlobObjectStore, self).__init__(config)

        self.staging_path = self.config.file_path
        self.transfer_progress = 0
        self._parse_config_xml(config_xml)
        self._configure_connection()
        self.container_lease = self._get_container_lease()

        # Clean cache only if value is set in galaxy.ini
        if self.cache_size != -1:
            # Convert GBs to bytes for comparison
            self.cache_size = self.cache_size * 1073741824
            # Helper for interruptible sleep
            self.sleeper = Sleeper()
            self.cache_monitor_thread = threading.Thread(
                target=self.__cache_monitor)
            self.cache_monitor_thread.start()
            log.info("Cache cleaner manager started")

    ###################
    # Private Methods #
    ###################

    # config_xml is an ElementTree object.
    def _parse_config_xml(self, config_xml):
        try:
            auth_xml = config_xml.find('auth')
            self.account_name = auth_xml.get('account_name')
            self.account_key = auth_xml.get('account_key')
            container_xml = config_xml.find('container')
            self.container_name = container_xml.get('name')
            self.max_chunk_size = int(container_xml.get(
                'max_chunk_size', 250))  # currently unused
            cache_xml = config_xml.find('cache')
            self.cache_size = float(cache_xml.get('size', -1))
            self.staging_path = cache_xml.get(
                'path', self.config.object_store_cache_path)

            for d_xml in config_xml.findall('extra_dir'):
                self.extra_dirs[d_xml.get('type')] = d_xml.get('path')

            log.debug("Object cache dir:    %s", self.staging_path)
            log.debug("       job work dir: %s", self.extra_dirs['job_work'])

        except Exception:
            # Toss it back up after logging, we can't continue loading at this point.
            log.exception(
                "Malformed ObjectStore Configuration XML -- unable to continue"
            )
            raise

    def _configure_connection(self):
        log.debug("Configuring Connection")
        self.account = CloudStorageAccount(self.account_name, self.account_key)
        self.service = self.account.create_block_blob_service()

    def _get_container_lease(self):
        """ Sometimes a handle to a container is not established right away so try
        it a few times. Raise error is connection is not established. """
        for i in range(5):
            try:
                self.service.break_container_lease(self.container_name)
                container_lease = self.service.acquire_container_lease(
                    self.container_name)
                log.debug("Using azure blob store with container '%s'",
                          self.container_name)
                return container_lease
            except AzureHttpError:
                try:
                    log.debug(
                        "container not found, creating azure blob store container with name '%s'",
                        self.container_name)
                    self.service.create_container(self.container_name)
                    container_lease = self.service.acquire_container_lease(
                        self.container_name)
                    return container_lease
                except AzureHttpError:
                    log.exception("Could not get container '%s', attempt %s/5",
                                  self.container_name, i + 1)
                    time.sleep(2)
        # All the attempts have been exhausted and a connection was not
        # established, so raise an error (AzureHttpError takes a message and
        # a status code)
        raise AzureHttpError("Could not get container '%s' after 5 attempts" % self.container_name, 500)

    def _construct_path(self,
                        obj,
                        base_dir=None,
                        dir_only=None,
                        extra_dir=None,
                        extra_dir_at_root=False,
                        alt_name=None,
                        obj_dir=False,
                        **kwargs):
        # extra_dir should never be constructed from provided data but just
        # make sure there are no shenanigans afoot
        if extra_dir and extra_dir != os.path.normpath(extra_dir):
            log.warning('extra_dir is not normalized: %s', extra_dir)
            raise ObjectInvalid("The requested object is invalid")
        # ensure that any parent directory references in alt_name would not
        # result in a path not contained in the directory path constructed here
        if alt_name:
            if not safe_relpath(alt_name):
                log.warning('alt_name would locate path outside dir: %s',
                            alt_name)
                raise ObjectInvalid("The requested object is invalid")
            # alt_name can contain parent directory references, but S3 will not
            # follow them, so if they are valid we normalize them out
            alt_name = os.path.normpath(alt_name)

        rel_path = os.path.join(*directory_hash_id(obj.id))

        if extra_dir is not None:
            if extra_dir_at_root:
                rel_path = os.path.join(extra_dir, rel_path)
            else:
                rel_path = os.path.join(rel_path, extra_dir)

        # for JOB_WORK directory
        if obj_dir:
            rel_path = os.path.join(rel_path, str(obj.id))
        if base_dir:
            base = self.extra_dirs.get(base_dir)
            return os.path.join(base, rel_path)

        # S3 folders are marked by having trailing '/' so add it now
        # rel_path = '%s/' % rel_path # assume for now we don't need this in Azure blob storage.

        if not dir_only:
            rel_path = os.path.join(
                rel_path, alt_name if alt_name else "dataset_%s.dat" % obj.id)

        return rel_path

    def _fix_permissions(self, rel_path):
        """ Set permissions on rel_path"""
        for basedir, _, files in os.walk(rel_path):
            umask_fix_perms(basedir, self.config.umask, 0o777, self.config.gid)
            for filename in files:
                path = os.path.join(basedir, filename)
                # Ignore symlinks
                if os.path.islink(path):
                    continue
                umask_fix_perms(path, self.config.umask, 0o666,
                                self.config.gid)

    def _get_cache_path(self, rel_path):
        return os.path.abspath(os.path.join(self.staging_path, rel_path))

    def _get_transfer_progress(self):
        return self.transfer_progress

    def _get_size_in_azure(self, rel_path):
        try:
            properties = self.service.get_blob_properties(
                self.container_name, rel_path)
            # Currently this returns a blob and not a BlobProperties object
            # Similar issue for the ruby https://github.com/Azure/azure-storage-ruby/issues/13
            # The typecheck is an attempt at future-proofing this when/if the bug is fixed.
            if type(properties) is Blob:
                properties = properties.properties
            if properties:
                size_in_bytes = properties.content_length
                return size_in_bytes
        except AzureHttpError:
            log.exception("Could not get size of blob '%s' from Azure",
                          rel_path)
            return -1

    def _in_azure(self, rel_path):
        try:
            exists = self.service.exists(self.container_name, rel_path)
        except AzureHttpError:
            log.exception("Trouble checking existence of Azure blob '%s'",
                          rel_path)
            return False
        return exists

    def _in_cache(self, rel_path):
        """ Check if the given dataset is in the local cache. """
        cache_path = self._get_cache_path(rel_path)
        return os.path.exists(cache_path)

    def _pull_into_cache(self, rel_path):
        # Ensure the cache directory structure exists (e.g., dataset_#_files/)
        rel_path_dir = os.path.dirname(rel_path)
        if not os.path.exists(self._get_cache_path(rel_path_dir)):
            os.makedirs(self._get_cache_path(rel_path_dir))
        # Now pull in the file
        file_ok = self._download(rel_path)
        self._fix_permissions(self._get_cache_path(rel_path_dir))
        return file_ok

    def _transfer_cb(self, complete, total):
        self.transfer_progress = float(complete) / float(
            total) * 100  # in percent

    def _download(self, rel_path):
        local_destination = self._get_cache_path(rel_path)
        try:
            log.debug("Pulling '%s' into cache to %s", rel_path,
                      local_destination)
            if self.cache_size > 0 and self._get_size_in_azure(
                    rel_path) > self.cache_size:
                log.critical(
                    "File %s is larger (%s) than the cache size (%s). Cannot download.",
                    rel_path, self._get_size_in_azure(rel_path),
                    self.cache_size)
                return False
            else:
                self.transfer_progress = 0  # Reset transfer progress counter
                self.service.get_blob_to_path(
                    self.container_name,
                    rel_path,
                    local_destination,
                    progress_callback=self._transfer_cb)
                return True
        except AzureHttpError:
            log.exception("Problem downloading '%s' from Azure", rel_path)
        return False

    def _push_to_os(self, rel_path, source_file=None, from_string=None):
        """
        Push the file pointed to by ``rel_path`` to the object store naming the blob
        ``rel_path``. If ``source_file`` is provided, push that file instead while
        still using ``rel_path`` as the blob name.
        If ``from_string`` is provided, set contents of the file to the value of
        the string.
        """
        try:
            source_file = source_file or self._get_cache_path(rel_path)

            if not os.path.exists(source_file):
                log.error(
                    "Tried updating blob '%s' from source file '%s', but source file does not exist.",
                    rel_path, source_file)
                return False

            if os.path.getsize(source_file) == 0:
                log.debug(
                    "Wanted to push file '%s' to azure blob '%s' but its size is 0; skipping.",
                    source_file, rel_path)
                return True

            if from_string:
                self.service.create_blob_from_text(
                    self.container_name,
                    rel_path,
                    from_string,
                    progress_callback=self._transfer_cb)
                log.debug("Pushed data from string '%s' to blob '%s'",
                          from_string, rel_path)
            else:
                start_time = datetime.now()
                log.debug("Pushing cache file '%s' of size %s bytes to '%s'",
                          source_file, os.path.getsize(source_file), rel_path)
                self.transfer_progress = 0  # Reset transfer progress counter
                self.service.create_blob_from_path(
                    self.container_name,
                    rel_path,
                    source_file,
                    progress_callback=self._transfer_cb)
                end_time = datetime.now()
                log.debug(
                    "Pushed cache file '%s' to blob '%s' (%s bytes transfered in %s sec)",
                    source_file, rel_path, os.path.getsize(source_file),
                    end_time - start_time)
            return True

        except AzureHttpError:
            log.exception("Trouble pushing to Azure Blob '%s' from file '%s'",
                          rel_path, source_file)
        return False

    ##################
    # Public Methods #
    ##################

    def exists(self, obj, **kwargs):
        in_cache = in_azure = False
        rel_path = self._construct_path(obj, **kwargs)

        in_cache = self._in_cache(rel_path)
        in_azure = self._in_azure(rel_path)
        # log.debug("~~~~~~ File '%s' exists in cache: %s; in azure: %s" % (rel_path, in_cache, in_azure))
        # dir_only does not get synced so shortcut the decision
        dir_only = kwargs.get('dir_only', False)
        base_dir = kwargs.get('base_dir', None)
        if dir_only:
            if in_cache or in_azure:
                return True
            # for JOB_WORK directory
            elif base_dir:
                if not os.path.exists(rel_path):
                    os.makedirs(rel_path)
                return True
            else:
                return False

        # TODO: Sync should probably not be done here. Add this to an async upload stack?
        if in_cache and not in_azure:
            self._push_to_os(rel_path,
                             source_file=self._get_cache_path(rel_path))
            return True
        elif in_azure:
            return True
        else:
            return False

    def file_ready(self, obj, **kwargs):
        """
        A helper method that checks if a file corresponding to a dataset is
        ready and available to be used. Return ``True`` if so, ``False`` otherwise.
        """
        rel_path = self._construct_path(obj, **kwargs)
        # Make sure the size in cache is available in its entirety
        if self._in_cache(rel_path):
            local_size = os.path.getsize(self._get_cache_path(rel_path))
            remote_size = self._get_size_in_azure(rel_path)
            if local_size == remote_size:
                return True
            else:
                log.debug("Waiting for dataset %s to transfer from OS: %s/%s",
                          rel_path, local_size, remote_size)

        return False

    def create(self, obj, **kwargs):

        if not self.exists(obj, **kwargs):

            # Pull out locally used fields
            extra_dir = kwargs.get('extra_dir', None)
            extra_dir_at_root = kwargs.get('extra_dir_at_root', False)
            dir_only = kwargs.get('dir_only', False)
            alt_name = kwargs.get('alt_name', None)

            # Construct hashed path
            rel_path = os.path.join(*directory_hash_id(obj.id))

            # Optionally append extra_dir
            if extra_dir is not None:
                if extra_dir_at_root:
                    rel_path = os.path.join(extra_dir, rel_path)
                else:
                    rel_path = os.path.join(rel_path, extra_dir)

            # Create given directory in cache
            cache_dir = os.path.join(self.staging_path, rel_path)
            if not os.path.exists(cache_dir):
                os.makedirs(cache_dir)

            # Although not really necessary to create "folders" (because Azure
            # blob storage has a flat namespace), do so for consistency with
            # the regular file system. S3 marks folders with a trailing '/',
            # but assume for now we don't need that in Azure:
            # s3_dir = '%s/' % rel_path
            # self._push_to_os(s3_dir, from_string='')
            # If instructed, create the dataset in cache & in Azure
            if not dir_only:
                rel_path = os.path.join(
                    rel_path,
                    alt_name if alt_name else "dataset_%s.dat" % obj.id)
                open(os.path.join(self.staging_path, rel_path), 'w').close()
                self._push_to_os(rel_path, from_string='')

    def empty(self, obj, **kwargs):
        if self.exists(obj, **kwargs):
            return self.size(obj, **kwargs) == 0
        else:
            raise ObjectNotFound(
                'objectstore.empty, object does not exist: %s, kwargs: %s' %
                (str(obj), str(kwargs)))

    def size(self, obj, **kwargs):
        rel_path = self._construct_path(obj, **kwargs)
        if self._in_cache(rel_path):
            try:
                return os.path.getsize(self._get_cache_path(rel_path))
            except OSError as ex:
                log.info(
                    "Could not get size of file '%s' in local cache, will try Azure. Error: %s",
                    rel_path, ex)
        elif self.exists(obj, **kwargs):
            return self._get_size_in_azure(rel_path)
        log.warning("Did not find dataset '%s', returning 0 for size",
                    rel_path)
        return 0

    def delete(self, obj, entire_dir=False, **kwargs):
        rel_path = self._construct_path(obj, **kwargs)
        extra_dir = kwargs.get('extra_dir', None)
        base_dir = kwargs.get('base_dir', None)
        dir_only = kwargs.get('dir_only', False)
        obj_dir = kwargs.get('obj_dir', False)
        try:
            if base_dir and dir_only and obj_dir:
                # Remove temporary data in JOB_WORK directory
                shutil.rmtree(os.path.abspath(rel_path))
                return True

            # For the case of extra_files, because we don't have a reference to
            # individual files/blobs we need to remove the entire directory structure
            # with all the files in it. This is easy for the local file system,
            # but requires iterating through each individual blob in Azure and deleting it.
            if entire_dir and extra_dir:
                shutil.rmtree(self._get_cache_path(rel_path))
                blobs = self.service.list_blobs(self.container_name,
                                                prefix=rel_path)
                for blob in blobs:
                    log.debug("Deleting from Azure: %s", blob)
                    self.service.delete_blob(self.container_name, blob.name)
                return True
            else:
                # Delete from cache first
                os.unlink(self._get_cache_path(rel_path))
                # Delete from Azure as well
                if self._in_azure(rel_path):
                    log.debug("Deleting from Azure: %s", rel_path)
                    self.service.delete_blob(self.container_name, rel_path)
                    return True
        except AzureHttpError:
            log.exception("Could not delete blob '%s' from Azure", rel_path)
        except OSError:
            log.exception('%s delete error', self.get_filename(obj, **kwargs))
        return False

    def get_data(self, obj, start=0, count=-1, **kwargs):
        rel_path = self._construct_path(obj, **kwargs)
        # Check cache first and get file if not there
        if not self._in_cache(rel_path):
            self._pull_into_cache(rel_path)
        # Read the file content from cache
        data_file = open(self._get_cache_path(rel_path), 'r')
        data_file.seek(start)
        content = data_file.read(count)
        data_file.close()
        return content

    def get_filename(self, obj, **kwargs):
        rel_path = self._construct_path(obj, **kwargs)
        base_dir = kwargs.get('base_dir', None)
        dir_only = kwargs.get('dir_only', False)
        obj_dir = kwargs.get('obj_dir', False)

        # for JOB_WORK directory
        if base_dir and dir_only and obj_dir:
            return os.path.abspath(rel_path)

        cache_path = self._get_cache_path(rel_path)
        # Azure does not recognize directories as files so cannot check if those exist.
        # So, if checking dir only, ensure given dir exists in cache and return
        # the expected cache path.
        # dir_only = kwargs.get('dir_only', False)
        # if dir_only:
        #     if not os.path.exists(cache_path):
        #         os.makedirs(cache_path)
        #     return cache_path
        # Check if the file exists in the cache first
        if self._in_cache(rel_path):
            return cache_path
        # Check if the file exists in persistent storage and, if it does, pull it into cache
        elif self.exists(obj, **kwargs):
            if dir_only:  # Directories do not get pulled into cache
                return cache_path
            else:
                if self._pull_into_cache(rel_path):
                    return cache_path
        # For the case of retrieving a directory only, return the expected path
        # even if it does not exist.
        # if dir_only:
        #     return cache_path
        raise ObjectNotFound(
            'objectstore.get_filename, no cache_path: %s, kwargs: %s' %
            (str(obj), str(kwargs)))

        return cache_path  # Until the upload tool explicitly creates the dataset, return the expected path

    def update_from_file(self, obj, file_name=None, create=False, **kwargs):
        if create is True:
            self.create(obj, **kwargs)
        elif self.exists(obj, **kwargs):
            rel_path = self._construct_path(obj, **kwargs)
            # Choose whether to use the dataset file itself or an alternate file
            if file_name:
                source_file = os.path.abspath(file_name)
                # Copy into cache
                cache_file = self._get_cache_path(rel_path)
                try:
                    if source_file != cache_file:
                        # FIXME? Should this be a `move`?
                        shutil.copy2(source_file, cache_file)
                    self._fix_permissions(cache_file)
                except OSError:
                    log.exception(
                        "Trouble copying source file '%s' to cache '%s'",
                        source_file, cache_file)
            else:
                source_file = self._get_cache_path(rel_path)

            self._push_to_os(rel_path, source_file)

        else:
            raise ObjectNotFound(
                'objectstore.update_from_file, object does not exist: %s, kwargs: %s'
                % (str(obj), str(kwargs)))

    def get_object_url(self, obj, **kwargs):
        if self.exists(obj, **kwargs):
            rel_path = self._construct_path(obj, **kwargs)
            try:
                url = self.service.make_blob_url(
                    container_name=self.container_name, blob_name=rel_path)
                return url
            except AzureHttpError:
                log.exception("Trouble generating URL for dataset '%s'",
                              rel_path)
        return None

    def get_store_usage_percent(self):
        return 0.0

    ##################
    # Secret Methods #
    ##################

    def __cache_monitor(self):
        time.sleep(2)  # Wait for things to load before starting the monitor
        while self.running:
            total_size = 0
            # Is this going to be too expensive an operation to perform frequently?
            file_list = []
            for dirpath, _, filenames in os.walk(self.staging_path):
                for filename in filenames:
                    filepath = os.path.join(dirpath, filename)
                    file_size = os.path.getsize(filepath)
                    total_size += file_size
                    # Get the time given file was last accessed
                    last_access_time = time.localtime(os.stat(filepath).st_atime)
                    # Compose a tuple of the access time and the file path
                    file_tuple = last_access_time, filepath, file_size
                    file_list.append(file_tuple)
            # Sort the file list (based on access time)
            file_list.sort()
            # Initiate cleaning once within 10% of the defined cache size?
            cache_limit = self.cache_size * 0.9
            if total_size > cache_limit:
                log.info(
                    "Initiating cache cleaning: current cache size: %s; clean until smaller than: %s",
                    convert_bytes(total_size), convert_bytes(cache_limit))
                # How much to delete? If we only delete down to the cache-10%
                # limit, cleaning is likely to run frequently and may risk
                # hitting the limit again; maybe delete an additional few percent?
                # For now, delete enough to leave at least 10% of the total cache free
                delete_this_much = total_size - cache_limit
                # Keep deleting datasets from file_list until deleted_amount
                # meets or exceeds delete_this_much; start deleting from the
                # front of the file list, which assumes the oldest files come
                # first on the list.
                deleted_amount = 0
                for entry in file_list:
                    if deleted_amount < delete_this_much:
                        deleted_amount += entry[2]
                        os.remove(entry[1])
                        # Debugging code for printing deleted files' stats
                        # folder, file_name = os.path.split(entry[1])
                        # file_date = time.strftime("%m/%d/%y %H:%M:%S", entry[0])
                        # log.debug("%-25s %s, size %s (deleted %s/%s)" \
                        #     % (file_name, convert_bytes(entry[2]), file_date, \
                        #     convert_bytes(deleted_amount), convert_bytes(delete_this_much)))
                    else:
                        log.debug("Cache cleaning done. Total space freed: %s",
                                  convert_bytes(deleted_amount))
                        break

            self.sleeper.sleep(30)  # Test cache size every 30 seconds?
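
The cache monitor above is essentially an LRU eviction pass over the staging directory. Below is a minimal standalone sketch of the same logic; the `staging_path` and `cache_size` names mirror the attributes used above, and `evict_lru` itself is an illustrative helper, not part of the object store's API.

import os


def evict_lru(staging_path, cache_size, keep_free_fraction=0.1):
    """Delete least-recently-accessed files until the directory holds less
    than (1 - keep_free_fraction) * cache_size bytes. Sketch only."""
    entries = []
    total_size = 0
    for dirpath, _, filenames in os.walk(staging_path):
        for filename in filenames:
            filepath = os.path.join(dirpath, filename)
            size = os.path.getsize(filepath)
            total_size += size
            # Sort key is the access time, so the oldest files come first
            entries.append((os.stat(filepath).st_atime, filepath, size))
    limit = cache_size * (1 - keep_free_fraction)
    freed = 0
    if total_size > limit:
        entries.sort()
        for _, filepath, size in entries:
            if total_size - freed <= limit:
                break
            os.remove(filepath)
            freed += size
    return freed
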
Beispiel #16
0
class JobHandlerQueue(object):
    """
    Job Handler's Internal Queue; this is what actually implements waiting for
    jobs to be runnable and dispatching to a JobRunner.
    """
    STOP_SIGNAL = object()

    def __init__(self, app, dispatcher):
        """Initializes the Job Handler Queue, creates (unstarted) monitoring thread"""
        self.app = app
        self.dispatcher = dispatcher

        self.sa_session = app.model.context
        self.track_jobs_in_database = self.app.config.track_jobs_in_database

        # Initialize structures for handling job limits
        self.__clear_job_count()

        # Keep track of the pid that started the job manager, only it
        # has valid threads
        self.parent_pid = os.getpid()
        # Contains new jobs. Note this is not used if track_jobs_in_database is True
        self.queue = Queue()
        # Contains jobs that are waiting (only use from monitor thread)
        self.waiting_jobs = []
        # Contains wrappers of jobs that are limited or ready (so they aren't created unnecessarily/multiple times)
        self.job_wrappers = {}
        # Helper for interruptable sleep
        self.sleeper = Sleeper()
        self.running = True
        self.monitor_thread = threading.Thread(
            name="JobHandlerQueue.monitor_thread", target=self.__monitor)
        self.monitor_thread.setDaemon(True)

    def start(self):
        """
        Starts the JobHandler's thread after checking for any unhandled jobs.
        """
        # Recover jobs at startup
        self.__check_jobs_at_startup()
        # Start the queue
        self.monitor_thread.start()
        log.info("job handler queue started")

    def job_wrapper(self, job, use_persisted_destination=False):
        return JobWrapper(job,
                          self,
                          use_persisted_destination=use_persisted_destination)

    def job_pair_for_id(self, id):
        job = self.sa_session.query(model.Job).get(id)
        return job, self.job_wrapper(job, use_persisted_destination=True)

    def __write_registry_file_if_absent(self, job):
        # TODO: remove this and the one place it is called in late 2018; this
        # hack attempts to minimize job failures due to upgrades from Galaxy
        # 17.05 instances.
        job_wrapper = self.job_wrapper(job)
        cwd = job_wrapper.working_directory
        datatypes_config = os.path.join(cwd, "registry.xml")
        if not os.path.exists(datatypes_config):
            try:
                self.app.datatypes_registry.to_xml_file(path=datatypes_config)
            except OSError:
                pass

    def __check_jobs_at_startup(self):
        """
        Checks all jobs that are in the 'new', 'queued' or 'running' state in
        the database and requeues or cleans up as necessary.  Only run as the
        job handler starts.
        If user activation is enforced, jobs belonging to inactive users are filtered out.
        """
        jobs_at_startup = []
        if self.track_jobs_in_database:
            in_list = (model.Job.states.QUEUED, model.Job.states.RUNNING)
        else:
            in_list = (model.Job.states.NEW, model.Job.states.QUEUED,
                       model.Job.states.RUNNING)
        if self.app.config.user_activation_on:
            jobs_at_startup = self.sa_session.query( model.Job ).enable_eagerloads( False ) \
                .outerjoin( model.User ) \
                .filter( model.Job.state.in_( in_list ) &
                         ( model.Job.handler == self.app.config.server_name ) &
                         or_( ( model.Job.user_id == null() ), ( model.User.active == true() ) ) ).all()
        else:
            jobs_at_startup = self.sa_session.query( model.Job ).enable_eagerloads( False ) \
                .filter( model.Job.state.in_( in_list ) &
                         ( model.Job.handler == self.app.config.server_name ) ).all()

        for job in jobs_at_startup:
            self.__write_registry_file_if_absent(job)
            if not self.app.toolbox.has_tool(
                    job.tool_id, job.tool_version, exact=True):
                log.warning(
                    "(%s) Tool '%s' removed from tool config, unable to recover job"
                    % (job.id, job.tool_id))
                self.job_wrapper(job).fail(
                    'This tool was disabled before the job completed.  Please contact your Galaxy administrator.'
                )
            elif job.job_runner_name is not None and job.job_runner_external_id is None:
                # This could happen during certain revisions of Galaxy where a runner URL was persisted before the job was dispatched to a runner.
                log.debug(
                    "(%s) Job runner assigned but no external ID recorded, adding to the job handler queue"
                    % job.id)
                job.job_runner_name = None
                if self.track_jobs_in_database:
                    job.set_state(model.Job.states.NEW)
                else:
                    self.queue.put((job.id, job.tool_id))
            elif job.job_runner_name is not None and job.job_runner_external_id is not None and job.destination_id is None:
                # This is the first start after upgrading from URLs to destinations, convert the URL to a destination and persist
                job_wrapper = self.job_wrapper(job)
                job_destination = self.dispatcher.url_to_destination(
                    job.job_runner_name)
                if job_destination.id is None:
                    job_destination.id = 'legacy_url'
                job_wrapper.set_job_destination(job_destination,
                                                job.job_runner_external_id)
                self.dispatcher.recover(job, job_wrapper)
                log.info(
                    '(%s) Converted job from a URL to a destination and recovered'
                    % (job.id))
            elif job.job_runner_name is None:
                # Never (fully) dispatched
                log.debug(
                    "(%s) No job runner assigned and job still in '%s' state, adding to the job handler queue"
                    % (job.id, job.state))
                if self.track_jobs_in_database:
                    job.set_state(model.Job.states.NEW)
                else:
                    self.queue.put((job.id, job.tool_id))
            else:
                # Already dispatched and running
                job_wrapper = self.__recover_job_wrapper(job)
                self.dispatcher.recover(job, job_wrapper)
        if self.sa_session.dirty:
            self.sa_session.flush()

    def __recover_job_wrapper(self, job):
        # Already dispatched and running
        job_wrapper = self.job_wrapper(job)
        # Use the persisted destination as its params may differ from
        # what's in the job_conf xml
        job_destination = JobDestination(id=job.destination_id,
                                         runner=job.job_runner_name,
                                         params=job.destination_params)
        # resubmits are not persisted (it's a good thing) so they
        # should be added back to the in-memory destination on startup
        try:
            config_job_destination = self.app.job_config.get_destination(
                job.destination_id)
            job_destination.resubmit = config_job_destination.resubmit
        except KeyError:
            log.debug(
                '(%s) Recovered destination id (%s) does not exist in job config (but this may be normal in the case of a dynamically generated destination)',
                job.id, job.destination_id)
        job_wrapper.job_runner_mapper.cached_job_destination = job_destination
        return job_wrapper

    def __monitor(self):
        """
        Continually iterate the waiting jobs, checking if each is ready to
        run and dispatching if so.
        """
        while self.running:
            try:
                # If jobs are locked, there's nothing to monitor and we skip
                # to the sleep.
                if not self.app.job_manager.job_lock:
                    self.__monitor_step()
            except Exception:
                log.exception("Exception in monitor_step")
            # Sleep
            self.sleeper.sleep(1)

    def __monitor_step(self):
        """
        Called repeatedly by `monitor` to process waiting jobs. Gets any new
        jobs (either from the database or from its own queue), then iterates
        over all new and waiting jobs to check the state of the jobs each
        depends on. If the job has dependencies that have not finished, it
        goes to the waiting queue. If the job has dependencies with errors,
        it is marked as having errors and removed from the queue. If the job
        belongs to an inactive user it is ignored.
        Otherwise, the job is dispatched.
        """
        # Pull all new jobs from the queue at once
        jobs_to_check = []
        resubmit_jobs = []
        if self.track_jobs_in_database:
            # Clear the session so we get fresh states for job and all datasets
            self.sa_session.expunge_all()
            # Fetch all new jobs
            hda_not_ready = self.sa_session.query(model.Job.id).enable_eagerloads(False) \
                .join(model.JobToInputDatasetAssociation) \
                .join(model.HistoryDatasetAssociation) \
                .join(model.Dataset) \
                .filter(and_( (model.Job.state == model.Job.states.NEW ),
                              or_( ( model.HistoryDatasetAssociation._state == model.HistoryDatasetAssociation.states.FAILED_METADATA ),
                                   ( model.HistoryDatasetAssociation.deleted == true() ),
                                   ( model.Dataset.state != model.Dataset.states.OK ),
                                   ( model.Dataset.deleted == true() ) ) ) ).subquery()
            ldda_not_ready = self.sa_session.query(model.Job.id).enable_eagerloads(False) \
                .join(model.JobToInputLibraryDatasetAssociation) \
                .join(model.LibraryDatasetDatasetAssociation) \
                .join(model.Dataset) \
                .filter(and_((model.Job.state == model.Job.states.NEW),
                        or_((model.LibraryDatasetDatasetAssociation._state != null()),
                            (model.LibraryDatasetDatasetAssociation.deleted == true()),
                            (model.Dataset.state != model.Dataset.states.OK),
                            (model.Dataset.deleted == true())))).subquery()
            if self.app.config.user_activation_on:
                jobs_to_check = self.sa_session.query(model.Job).enable_eagerloads(False) \
                    .outerjoin( model.User ) \
                    .filter(and_((model.Job.state == model.Job.states.NEW),
                                 or_((model.Job.user_id == null()), (model.User.active == true())),
                                 (model.Job.handler == self.app.config.server_name),
                                 ~model.Job.table.c.id.in_(hda_not_ready),
                                 ~model.Job.table.c.id.in_(ldda_not_ready))) \
                    .order_by(model.Job.id).all()
            else:
                jobs_to_check = self.sa_session.query(model.Job).enable_eagerloads(False) \
                    .filter(and_((model.Job.state == model.Job.states.NEW),
                                 (model.Job.handler == self.app.config.server_name),
                                 ~model.Job.table.c.id.in_(hda_not_ready),
                                 ~model.Job.table.c.id.in_(ldda_not_ready))) \
                    .order_by(model.Job.id).all()
            # Fetch all "resubmit" jobs
            resubmit_jobs = self.sa_session.query(model.Job).enable_eagerloads(False) \
                .filter(and_((model.Job.state == model.Job.states.RESUBMITTED),
                             (model.Job.handler == self.app.config.server_name))) \
                .order_by(model.Job.id).all()
        else:
            # Get job objects and append to watch queue for any which were
            # previously waiting
            for job_id in self.waiting_jobs:
                jobs_to_check.append(
                    self.sa_session.query(model.Job).get(job_id))
            try:
                while True:
                    message = self.queue.get_nowait()
                    if message is self.STOP_SIGNAL:
                        return
                    # Unpack the message
                    job_id, tool_id = message
                    # Get the job object and append to watch queue
                    jobs_to_check.append(
                        self.sa_session.query(model.Job).get(job_id))
            except Empty:
                pass
        # Ensure that we get new job counts on each iteration
        self.__clear_job_count()
        # Check resubmit jobs first so that limits of new jobs will still be enforced
        for job in resubmit_jobs:
            log.debug(
                '(%s) Job was resubmitted and is being dispatched immediately',
                job.id)
            # Reassemble resubmit job destination from persisted value
            jw = self.__recover_job_wrapper(job)
            if jw.is_ready_for_resubmission(job):
                self.increase_running_job_count(job.user_id,
                                                jw.job_destination.id)
                self.dispatcher.put(jw)
        # Iterate over new and waiting jobs and look for any that are
        # ready to run
        new_waiting_jobs = []
        for job in jobs_to_check:
            try:
                # Check the job's dependencies, requeue if they're not done.
                # Some of these states will only happen when using the in-memory job queue
                job_state = self.__check_job_state(job)
                if job_state == JOB_WAIT:
                    new_waiting_jobs.append(job.id)
                elif job_state == JOB_INPUT_ERROR:
                    log.info(
                        "(%d) Job unable to run: one or more inputs in error state"
                        % job.id)
                elif job_state == JOB_INPUT_DELETED:
                    log.info(
                        "(%d) Job unable to run: one or more inputs deleted" %
                        job.id)
                elif job_state == JOB_READY:
                    self.dispatcher.put(self.job_wrappers.pop(job.id))
                    log.info("(%d) Job dispatched" % job.id)
                elif job_state == JOB_DELETED:
                    log.info("(%d) Job deleted by user while still queued" %
                             job.id)
                elif job_state == JOB_ADMIN_DELETED:
                    log.info("(%d) Job deleted by admin while still queued" %
                             job.id)
                elif job_state in (JOB_USER_OVER_QUOTA,
                                   JOB_USER_OVER_TOTAL_WALLTIME):
                    if job_state == JOB_USER_OVER_QUOTA:
                        log.info("(%d) User (%s) is over quota: job paused" %
                                 (job.id, job.user_id))
                    else:
                        log.info(
                            "(%d) User (%s) is over total walltime limit: job paused"
                            % (job.id, job.user_id))

                    job.set_state(model.Job.states.PAUSED)
                    for dataset_assoc in job.output_datasets + job.output_library_datasets:
                        dataset_assoc.dataset.dataset.state = model.Dataset.states.PAUSED
                        dataset_assoc.dataset.info = "Execution of this dataset's job is paused because you were over your disk quota at the time it was ready to run"
                        self.sa_session.add(dataset_assoc.dataset.dataset)
                    self.sa_session.add(job)
                elif job_state == JOB_ERROR:
                    log.error("(%d) Error checking job readiness" % job.id)
                else:
                    log.error("(%d) Job in unknown state '%s'" %
                              (job.id, job_state))
                    new_waiting_jobs.append(job.id)
            except Exception:
                log.exception("failure running job %d", job.id)
        # Update the waiting list
        if not self.track_jobs_in_database:
            self.waiting_jobs = new_waiting_jobs
        # Remove cached wrappers for any jobs that are no longer being tracked
        # Iterate over a copy of the keys since entries are deleted below
        for id in list(self.job_wrappers.keys()):
            if id not in new_waiting_jobs:
                del self.job_wrappers[id]
        # Flush, if we updated the state
        self.sa_session.flush()
        # Done with the session
        self.sa_session.remove()

    def __check_job_state(self, job):
        """
        Check if a job is ready to run by verifying that each of its input
        datasets is ready (specifically in the OK state). If any input dataset
        has an error, fail the job and return JOB_INPUT_ERROR. If any input
        dataset is deleted, fail the job and return JOB_INPUT_DELETED.  If all
        input datasets are in OK state, return JOB_READY indicating that the
        job can be dispatched. Otherwise, return JOB_WAIT indicating that input
        datasets are still being prepared.
        """
        if not self.track_jobs_in_database:
            in_memory_not_ready_state = self.__verify_in_memory_job_inputs(job)
            if in_memory_not_ready_state:
                return in_memory_not_ready_state

        # Else, if tracking in the database, job.state is guaranteed to be NEW and
        # the inputs are guaranteed to be OK.

        # Create the job wrapper so that the destination can be set
        job_id = job.id
        job_wrapper = self.job_wrappers.get(job_id, None)
        if not job_wrapper:
            job_wrapper = self.job_wrapper(job)
            self.job_wrappers[job_id] = job_wrapper

        # If state == JOB_READY, assume job_destination also set - otherwise
        # in case of various error or cancelled states do not assume
        # destination has been set.
        state, job_destination = self.__verify_job_ready(job, job_wrapper)

        if state == JOB_READY:
            # PASS.  increase usage by one job (if caching) so that multiple jobs aren't dispatched on this queue iteration
            self.increase_running_job_count(job.user_id, job_destination.id)
        return state

    def __verify_job_ready(self, job, job_wrapper):
        """ Compute job destination and verify job is ready at that
        destination by checking job limits and quota. If this method
        returns a job state of JOB_READY, it MUST also return a job
        destination.
        """
        job_destination = None
        try:
            assert job_wrapper.tool is not None, 'This tool was disabled before the job completed.  Please contact your Galaxy administrator.'
            # Cause the job_destination to be set and cached by the mapper
            job_destination = job_wrapper.job_destination
        except AssertionError as e:
            log.warning(
                "(%s) Tool '%s' removed from tool config, unable to run job" %
                (job.id, job.tool_id))
            job_wrapper.fail(e)
            return JOB_ERROR, job_destination
        except JobNotReadyException as e:
            job_state = e.job_state or JOB_WAIT
            return job_state, None
        except Exception as e:
            failure_message = getattr(e, 'failure_message',
                                      DEFAULT_JOB_PUT_FAILURE_MESSAGE)
            if failure_message == DEFAULT_JOB_PUT_FAILURE_MESSAGE:
                log.exception('Failed to generate job destination')
            else:
                log.debug("Intentionally failing job with message (%s)" %
                          failure_message)
            job_wrapper.fail(failure_message)
            return JOB_ERROR, job_destination
        # job is ready to run, check limits
        # TODO: these checks should be refactored to minimize duplication and made more modular/pluggable
        state = self.__check_destination_jobs(job, job_wrapper)

        if state == JOB_READY:
            state = self.__check_user_jobs(job, job_wrapper)
        if state == JOB_READY and self.app.config.enable_quotas:
            quota = self.app.quota_agent.get_quota(job.user)
            if quota is not None:
                try:
                    usage = self.app.quota_agent.get_usage(user=job.user,
                                                           history=job.history)
                    if usage > quota:
                        return JOB_USER_OVER_QUOTA, job_destination
                except AssertionError:
                    pass  # No history, should not happen with an anon user
        # Check total walltime limits
        if (state == JOB_READY
                and "delta" in self.app.job_config.limits.total_walltime):
            jobs_to_check = self.sa_session.query(model.Job).filter(
                model.Job.user_id == job.user.id, model.Job.update_time >=
                datetime.datetime.now() - datetime.timedelta(
                    self.app.job_config.limits.total_walltime["window"]),
                model.Job.state == 'ok').all()
            time_spent = datetime.timedelta(0)
            # Use a distinct loop variable so the `job` argument passed to
            # this method is not clobbered.
            for prior_job in jobs_to_check:
                started = None
                finished = None
                for history in sorted(prior_job.state_history,
                                      key=lambda history: history.update_time):
                    if history.state == "running":
                        started = history.create_time
                    elif history.state == "ok":
                        finished = history.create_time
                # Guard against incomplete state histories
                if started is not None and finished is not None:
                    time_spent += finished - started

            if time_spent > self.app.job_config.limits.total_walltime["delta"]:
                return JOB_USER_OVER_TOTAL_WALLTIME, job_destination

        return state, job_destination

    def __verify_in_memory_job_inputs(self, job):
        """ Perform the same checks that happen via SQL for in-memory managed
        jobs.
        """
        if job.state == model.Job.states.DELETED:
            return JOB_DELETED
        elif job.state == model.Job.states.ERROR:
            return JOB_ADMIN_DELETED
        for dataset_assoc in job.input_datasets + job.input_library_datasets:
            idata = dataset_assoc.dataset
            if not idata:
                continue
            # don't run jobs for which the input dataset was deleted
            if idata.deleted:
                self.job_wrappers.pop(job.id, self.job_wrapper(job)).fail(
                    "input data %s (file: %s) was deleted before the job started"
                    % (idata.hid, idata.file_name))
                return JOB_INPUT_DELETED
            # an error in the input data causes us to bail immediately
            elif idata.state == idata.states.ERROR:
                self.job_wrappers.pop(job.id, self.job_wrapper(job)).fail(
                    "input data %s is in error state" % (idata.hid))
                return JOB_INPUT_ERROR
            elif idata.state == idata.states.FAILED_METADATA:
                self.job_wrappers.pop(job.id, self.job_wrapper(job)).fail(
                    "input data %s failed to properly set metadata" %
                    (idata.hid))
                return JOB_INPUT_ERROR
            elif idata.state != idata.states.OK and not (
                    idata.state == idata.states.SETTING_METADATA
                    and job.tool_id is not None and job.tool_id ==
                    self.app.datatypes_registry.set_external_metadata_tool.id):
                # need to requeue
                return JOB_WAIT

        # All inputs ready to go.
        return None

    def __clear_job_count(self):
        self.user_job_count = None
        self.user_job_count_per_destination = None
        self.total_job_count_per_destination = None

    def get_user_job_count(self, user_id):
        self.__cache_user_job_count()
        # This could have been incremented by a previous job dispatched on this iteration, even if we're not caching
        rval = self.user_job_count.get(user_id, 0)
        if not self.app.config.cache_user_job_count:
            result = self.sa_session.execute(
                select([func.count(model.Job.table.c.id)]).where(
                    and_(
                        model.Job.table.c.state.in_(
                            (model.Job.states.QUEUED, model.Job.states.RUNNING,
                             model.Job.states.RESUBMITTED)),
                        (model.Job.table.c.user_id == user_id))))
            for row in result:
                # there should only be one row
                rval += row[0]
        return rval

    def __cache_user_job_count(self):
        # Cache the job count if necessary
        if self.user_job_count is None and self.app.config.cache_user_job_count:
            self.user_job_count = {}
            query = self.sa_session.execute(
                select([
                    model.Job.table.c.user_id,
                    func.count(model.Job.table.c.user_id)
                ]).where(
                    and_(
                        model.Job.table.c.state.in_(
                            (model.Job.states.QUEUED, model.Job.states.RUNNING,
                             model.Job.states.RESUBMITTED)),
                        (model.Job.table.c.user_id != null()))).group_by(
                            model.Job.table.c.user_id))
            for row in query:
                self.user_job_count[row[0]] = row[1]
        elif self.user_job_count is None:
            self.user_job_count = {}

    def get_user_job_count_per_destination(self, user_id):
        self.__cache_user_job_count_per_destination()
        cached = self.user_job_count_per_destination.get(user_id, {})
        if self.app.config.cache_user_job_count:
            rval = cached
        else:
            # The cached count is still used even when we're not caching, it is
            # incremented when a job is run by this handler to ensure that
            # multiple jobs can't get past the limits in one iteration of the
            # queue.
            rval = {}
            rval.update(cached)
            result = self.sa_session.execute(
                select([
                    model.Job.table.c.destination_id,
                    func.count(
                        model.Job.table.c.destination_id).label('job_count')
                ]).where(
                    and_(
                        model.Job.table.c.state.in_(
                            (model.Job.states.QUEUED,
                             model.Job.states.RUNNING)),
                        (model.Job.table.c.user_id == user_id))).group_by(
                            model.Job.table.c.destination_id))
            for row in result:
                # Add the count from the database to the cached count
                rval[row['destination_id']] = rval.get(row['destination_id'],
                                                       0) + row['job_count']
        return rval

    def __cache_user_job_count_per_destination(self):
        # Cache the job count if necessary
        if self.user_job_count_per_destination is None and self.app.config.cache_user_job_count:
            self.user_job_count_per_destination = {}
            result = self.sa_session.execute(
                select([
                    model.Job.table.c.user_id,
                    model.Job.table.c.destination_id,
                    func.count(model.Job.table.c.user_id).label('job_count')
                ]).where(
                    and_(
                        model.Job.table.c.state.in_(
                            (model.Job.states.QUEUED,
                             model.Job.states.RUNNING)))).group_by(
                                 model.Job.table.c.user_id,
                                 model.Job.table.c.destination_id))
            for row in result:
                if row['user_id'] not in self.user_job_count_per_destination:
                    self.user_job_count_per_destination[row['user_id']] = {}
                self.user_job_count_per_destination[row['user_id']][
                    row['destination_id']] = row['job_count']
        elif self.user_job_count_per_destination is None:
            self.user_job_count_per_destination = {}

    def increase_running_job_count(self, user_id, destination_id):
        if self.app.job_config.limits.registered_user_concurrent_jobs or \
           self.app.job_config.limits.anonymous_user_concurrent_jobs or \
           self.app.job_config.limits.destination_user_concurrent_jobs:
            if self.user_job_count is None:
                self.user_job_count = {}
            if self.user_job_count_per_destination is None:
                self.user_job_count_per_destination = {}
            self.user_job_count[user_id] = self.user_job_count.get(user_id,
                                                                   0) + 1
            if user_id not in self.user_job_count_per_destination:
                self.user_job_count_per_destination[user_id] = {}
            self.user_job_count_per_destination[user_id][
                destination_id] = self.user_job_count_per_destination[
                    user_id].get(destination_id, 0) + 1
        if self.app.job_config.limits.destination_total_concurrent_jobs:
            if self.total_job_count_per_destination is None:
                self.total_job_count_per_destination = {}
            self.total_job_count_per_destination[
                destination_id] = self.total_job_count_per_destination.get(
                    destination_id, 0) + 1

    def __check_user_jobs(self, job, job_wrapper):
        # TODO: Update output datasets' _state = LIMITED or some such new
        # state, so the UI can reflect what jobs are waiting due to concurrency
        # limits
        if job.user:
            # Check the hard limit first
            if self.app.job_config.limits.registered_user_concurrent_jobs:
                count = self.get_user_job_count(job.user_id)
                # Check the user's number of dispatched jobs against the overall limit
                if count >= self.app.job_config.limits.registered_user_concurrent_jobs:
                    return JOB_WAIT
            # If we pass the hard limit, also check the per-destination count
            id = job_wrapper.job_destination.id
            count_per_id = self.get_user_job_count_per_destination(job.user_id)
            if id in self.app.job_config.limits.destination_user_concurrent_jobs:
                count = count_per_id.get(id, 0)
                # Check the user's number of dispatched jobs in the assigned destination id against the limit for that id
                if count >= self.app.job_config.limits.destination_user_concurrent_jobs[
                        id]:
                    return JOB_WAIT
            # If we pass the destination limit (if there is one), also check limits on any tags (if any)
            if job_wrapper.job_destination.tags:
                for tag in job_wrapper.job_destination.tags:
                    # Check each tag for this job's destination
                    if tag in self.app.job_config.limits.destination_user_concurrent_jobs:
                        # Only if there's a limit defined for this tag
                        count = 0
                        for id in [
                                d.id for d in
                                self.app.job_config.get_destinations(tag)
                        ]:
                            # Add up the aggregate job total for this tag
                            count += count_per_id.get(id, 0)
                        if count >= self.app.job_config.limits.destination_user_concurrent_jobs[
                                tag]:
                            return JOB_WAIT
        elif job.galaxy_session:
            # Anonymous users only get the hard limit
            if self.app.job_config.limits.anonymous_user_concurrent_jobs:
                count = self.sa_session.query( model.Job ).enable_eagerloads( False ) \
                            .filter( and_( model.Job.session_id == job.galaxy_session.id,
                                           or_( model.Job.state == model.Job.states.RUNNING,
                                                model.Job.state == model.Job.states.QUEUED ) ) ).count()
                if count >= self.app.job_config.limits.anonymous_user_concurrent_jobs:
                    return JOB_WAIT
        else:
            log.warning(
                'Job %s is not associated with a user or session so job concurrency limit cannot be checked.'
                % job.id)
        return JOB_READY

    def __cache_total_job_count_per_destination(self):
        # Cache the job count if necessary
        if self.total_job_count_per_destination is None:
            self.total_job_count_per_destination = {}
            result = self.sa_session.execute(
                select([
                    model.Job.table.c.destination_id,
                    func.count(
                        model.Job.table.c.destination_id).label('job_count')
                ]).where(
                    and_(
                        model.Job.table.c.state.in_(
                            (model.Job.states.QUEUED,
                             model.Job.states.RUNNING)))).group_by(
                                 model.Job.table.c.destination_id))
            for row in result:
                self.total_job_count_per_destination[
                    row['destination_id']] = row['job_count']

    def get_total_job_count_per_destination(self):
        self.__cache_total_job_count_per_destination()
        # Always use caching (at worst a job will have to wait one iteration,
        # and this would be more fair anyway as it ensures FIFO scheduling,
        # insofar as FIFO would be fair...)
        return self.total_job_count_per_destination

    def __check_destination_jobs(self, job, job_wrapper):
        if self.app.job_config.limits.destination_total_concurrent_jobs:
            id = job_wrapper.job_destination.id
            count_per_id = self.get_total_job_count_per_destination()
            if id in self.app.job_config.limits.destination_total_concurrent_jobs:
                count = count_per_id.get(id, 0)
                # Check the number of dispatched jobs in the assigned destination id against the limit for that id
                if count >= self.app.job_config.limits.destination_total_concurrent_jobs[
                        id]:
                    return JOB_WAIT
            # If we pass the destination limit (if there is one), also check limits on any tags (if any)
            if job_wrapper.job_destination.tags:
                for tag in job_wrapper.job_destination.tags:
                    # Check each tag for this job's destination
                    if tag in self.app.job_config.limits.destination_total_concurrent_jobs:
                        # Only if there's a limit defined for this tag
                        count = 0
                        for id in [
                                d.id for d in
                                self.app.job_config.get_destinations(tag)
                        ]:
                            # Add up the aggregate job total for this tag
                            count += count_per_id.get(id, 0)
                        if count >= self.app.job_config.limits.destination_total_concurrent_jobs[
                                tag]:
                            return JOB_WAIT
        return JOB_READY

    def put(self, job_id, tool_id):
        """Add a job to the queue (by job identifier)"""
        if not self.track_jobs_in_database:
            self.queue.put((job_id, tool_id))
            self.sleeper.wake()

    def shutdown(self):
        """Attempts to gracefully shut down the worker thread"""
        if self.parent_pid != os.getpid():
            # We're not the real job queue, do nothing
            return
        else:
            log.info("sending stop signal to worker thread")
            self.running = False
            if not self.app.config.track_jobs_in_database:
                self.queue.put(self.STOP_SIGNAL)
            self.sleeper.wake()
            log.info("job handler queue stopped")
            self.dispatcher.shutdown()
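
A minimal lifecycle sketch for the queue above; `app`, `dispatcher`, and the job/tool identifiers below are hypothetical placeholders for a configured Galaxy application object, a job dispatcher, and real records.

# Lifecycle sketch, assuming `app` and `dispatcher` are already configured
# (both names are hypothetical placeholders, not defined in this example).
queue = JobHandlerQueue(app, dispatcher)
queue.start()  # recovers unhandled jobs, then starts the monitor thread
try:
    # Only used with track_jobs_in_database=False; otherwise the monitor
    # picks up NEW jobs directly from the database.
    queue.put(job_id=42, tool_id='cat1')  # hypothetical job and tool ids
finally:
    queue.shutdown()  # wakes the sleeper and stops the monitor thread
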
Beispiel #17
0
class JobHandlerQueue(object):
    """
    Job Handler's Internal Queue; this is what actually implements waiting for
    jobs to be runnable and dispatching to a JobRunner.
    """
    STOP_SIGNAL = object()

    def __init__(self, app, dispatcher):
        """Initializes the Job Handler Queue, creates (unstarted) monitoring thread"""
        self.app = app
        self.dispatcher = dispatcher

        self.sa_session = app.model.context
        self.track_jobs_in_database = self.app.config.track_jobs_in_database

        # Initialize structures for handling job limits
        self.__clear_job_count()

        # Keep track of the pid that started the job manager, only it
        # has valid threads
        self.parent_pid = os.getpid()
        # Contains new jobs. Note this is not used if track_jobs_in_database is True
        self.queue = Queue()
        # Contains jobs that are waiting (only use from monitor thread)
        self.waiting_jobs = []
        # Contains wrappers of jobs that are limited or ready (so they aren't created unnecessarily/multiple times)
        self.job_wrappers = {}
        # Helper for interruptable sleep
        self.sleeper = Sleeper()
        self.running = True
        self.monitor_thread = threading.Thread(
            name="JobHandlerQueue.monitor_thread", target=self.__monitor)
        self.monitor_thread.setDaemon(True)

    def start(self):
        """
        Starts the JobHandler's thread after checking for any unhandled jobs.
        """
        # Recover jobs at startup
        self.__check_jobs_at_startup()
        # Start the queue
        self.monitor_thread.start()
        log.info("job handler queue started")

    def job_wrapper(self, job, use_persisted_destination=False):
        return JobWrapper(job,
                          self,
                          use_persisted_destination=use_persisted_destination)

    def job_pair_for_id(self, id):
        job = self.sa_session.query(model.Job).get(id)
        return job, self.job_wrapper(job, use_persisted_destination=True)

    def __check_jobs_at_startup(self):
        """
        Checks all jobs that are in the 'new', 'queued' or 'running' state in
        the database and requeues or cleans up as necessary.  Only run as the
        job handler starts.
        If user activation is enforced, jobs belonging to inactive users are filtered out.
        """
        jobs_at_startup = []
        if self.track_jobs_in_database:
            in_list = (model.Job.states.QUEUED, model.Job.states.RUNNING)
        else:
            in_list = (model.Job.states.NEW, model.Job.states.QUEUED,
                       model.Job.states.RUNNING)
        if self.app.config.user_activation_on:
            jobs_at_startup = self.sa_session.query( model.Job ).enable_eagerloads( False ) \
                .outerjoin( model.User ) \
                .filter( model.Job.state.in_( in_list ) &
                         ( model.Job.handler == self.app.config.server_name ) &
                         or_( ( model.Job.user_id == null() ), ( model.User.active == true() ) ) ).all()
        else:
            jobs_at_startup = self.sa_session.query( model.Job ).enable_eagerloads( False ) \
                .filter( model.Job.state.in_( in_list ) &
                         ( model.Job.handler == self.app.config.server_name ) ).all()

        for job in jobs_at_startup:
            if not self.app.toolbox.has_tool(
                    job.tool_id, job.tool_version, exact=True):
                log.warning(
                    "(%s) Tool '%s' removed from tool config, unable to recover job"
                    % (job.id, job.tool_id))
                self.job_wrapper(job).fail(
                    'This tool was disabled before the job completed.  Please contact your Galaxy administrator.'
                )
            elif job.job_runner_name is not None and job.job_runner_external_id is None:
                # This could happen during certain revisions of Galaxy where a runner URL was persisted before the job was dispatched to a runner.
                log.debug(
                    "(%s) Job runner assigned but no external ID recorded, adding to the job handler queue"
                    % job.id)
                job.job_runner_name = None
                if self.track_jobs_in_database:
                    job.set_state(model.Job.states.NEW)
                else:
                    self.queue.put((job.id, job.tool_id))
            elif job.job_runner_name is not None and job.job_runner_external_id is not None and job.destination_id is None:
                # This is the first start after upgrading from URLs to destinations, convert the URL to a destination and persist
                job_wrapper = self.job_wrapper(job)
                job_destination = self.dispatcher.url_to_destination(
                    job.job_runner_name)
                if job_destination.id is None:
                    job_destination.id = 'legacy_url'
                job_wrapper.set_job_destination(job_destination,
                                                job.job_runner_external_id)
                self.dispatcher.recover(job, job_wrapper)
                log.info(
                    '(%s) Converted job from a URL to a destination and recovered'
                    % (job.id))
            elif job.job_runner_name is None:
                # Never (fully) dispatched
                log.debug(
                    "(%s) No job runner assigned and job still in '%s' state, adding to the job handler queue"
                    % (job.id, job.state))
                if self.track_jobs_in_database:
                    job.set_state(model.Job.states.NEW)
                else:
                    self.queue.put((job.id, job.tool_id))
            else:
                # Already dispatched and running
                job_wrapper = self.job_wrapper(job)
                # Use the persisted destination as its params may differ from
                # what's in the job_conf xml
                job_destination = JobDestination(id=job.destination_id,
                                                 runner=job.job_runner_name,
                                                 params=job.destination_params)
                # resubmits are not persisted (it's a good thing) so they
                # should be added back to the in-memory destination on startup
                try:
                    config_job_destination = self.app.job_config.get_destination(
                        job.destination_id)
                    job_destination.resubmit = config_job_destination.resubmit
                except KeyError:
                    log.warning(
                        '(%s) Recovered destination id (%s) does not exist in job config (but this may be normal in the case of a dynamically generated destination)',
                        job.id, job.destination_id)
                job_wrapper.job_runner_mapper.cached_job_destination = job_destination
                self.dispatcher.recover(job, job_wrapper)
        if self.sa_session.dirty:
            self.sa_session.flush()

    def __monitor(self):
        """
        Continually iterate the waiting jobs, checking if each is ready to
        run and dispatching if so.
        """
        while self.running:
            try:
                # If jobs are locked, there's nothing to monitor and we skip
                # to the sleep.
                if not self.app.job_manager.job_lock:
                    self.__monitor_step()
            except Exception:
                log.exception("Exception in monitor_step")
            # Sleep
            self.sleeper.sleep(1)

    def __monitor_step(self):
        """
        Called repeatedly by `monitor` to process waiting jobs. Gets any new
        jobs (either from the database or from its own queue), then iterates
        over all new and waiting jobs to check the state of the jobs each
        depends on. If the job has dependencies that have not finished, it
        goes to the waiting queue. If the job has dependencies with errors,
        it is marked as having errors and removed from the queue. If the job
        belongs to an inactive user it is ignored.
        Otherwise, the job is dispatched.
        """
        # Pull all new jobs from the queue at once
        jobs_to_check = []
        resubmit_jobs = []
        if self.track_jobs_in_database:
            # Clear the session so we get fresh states for job and all datasets
            self.sa_session.expunge_all()
            # Fetch all new jobs
            hda_not_ready = self.sa_session.query(model.Job.id).enable_eagerloads(False) \
                .join(model.JobToInputDatasetAssociation) \
                .join(model.HistoryDatasetAssociation) \
                .join(model.Dataset) \
                .filter(and_( (model.Job.state == model.Job.states.NEW ),
                              or_( ( model.HistoryDatasetAssociation._state == model.HistoryDatasetAssociation.states.FAILED_METADATA ),
                                   ( model.HistoryDatasetAssociation.deleted == true() ),
                                   ( model.Dataset.state != model.Dataset.states.OK ),
                                   ( model.Dataset.deleted == true() ) ) ) ).subquery()
            ldda_not_ready = self.sa_session.query(model.Job.id).enable_eagerloads(False) \
                .join(model.JobToInputLibraryDatasetAssociation) \
                .join(model.LibraryDatasetDatasetAssociation) \
                .join(model.Dataset) \
                .filter(and_((model.Job.state == model.Job.states.NEW),
                        or_((model.LibraryDatasetDatasetAssociation._state != null()),
                            (model.LibraryDatasetDatasetAssociation.deleted == true()),
                            (model.Dataset.state != model.Dataset.states.OK),
                            (model.Dataset.deleted == true())))).subquery()
            if self.app.config.user_activation_on:
                jobs_to_check = self.sa_session.query(model.Job).enable_eagerloads(False) \
                    .outerjoin( model.User ) \
                    .filter(and_((model.Job.state == model.Job.states.NEW),
                                 or_((model.Job.user_id == null()), (model.User.active == true())),
                                 (model.Job.handler == self.app.config.server_name),
                                 ~model.Job.table.c.id.in_(hda_not_ready),
                                 ~model.Job.table.c.id.in_(ldda_not_ready))) \
                    .order_by(model.Job.id).all()
            else:
                jobs_to_check = self.sa_session.query(model.Job).enable_eagerloads(False) \
                    .filter(and_((model.Job.state == model.Job.states.NEW),
                                 (model.Job.handler == self.app.config.server_name),
                                 ~model.Job.table.c.id.in_(hda_not_ready),
                                 ~model.Job.table.c.id.in_(ldda_not_ready))) \
                    .order_by(model.Job.id).all()
            # Fetch all "resubmit" jobs
            resubmit_jobs = self.sa_session.query(model.Job).enable_eagerloads(False) \
                .filter(and_((model.Job.state == model.Job.states.RESUBMITTED),
                             (model.Job.handler == self.app.config.server_name))) \
                .order_by(model.Job.id).all()
        else:
            # Get job objects and append to watch queue for any which were
            # previously waiting
            for job_id in self.waiting_jobs:
                jobs_to_check.append(
                    self.sa_session.query(model.Job).get(job_id))
            try:
                while True:
                    message = self.queue.get_nowait()
                    if message is self.STOP_SIGNAL:
                        return
                    # Unpack the message
                    job_id, tool_id = message
                    # Get the job object and append to watch queue
                    jobs_to_check.append(
                        self.sa_session.query(model.Job).get(job_id))
            except Empty:
                pass
        # Ensure that we get new job counts on each iteration
        self.__clear_job_count()
        # Check resubmit jobs first so that limits of new jobs will still be enforced
        for job in resubmit_jobs:
            log.debug(
                '(%s) Job was resubmitted and is being dispatched immediately',
                job.id)
            # Reassemble resubmit job destination from persisted value
            jw = self.job_wrapper(job)
            jw.job_runner_mapper.cached_job_destination = JobDestination(
                id=job.destination_id,
                runner=job.job_runner_name,
                params=job.destination_params)
            self.increase_running_job_count(job.user_id, jw.job_destination.id)
            self.dispatcher.put(jw)
        # Iterate over new and waiting jobs and look for any that are
        # ready to run
        new_waiting_jobs = []
        for job in jobs_to_check:
            try:
                # Check the job's dependencies, requeue if they're not done.
                # Some of these states will only happen when using the in-memory job queue
                job_state = self.__check_job_state(job)
                if job_state == JOB_WAIT:
                    new_waiting_jobs.append(job.id)
                elif job_state == JOB_INPUT_ERROR:
                    log.info(
                        "(%d) Job unable to run: one or more inputs in error state"
                        % job.id)
                elif job_state == JOB_INPUT_DELETED:
                    log.info(
                        "(%d) Job unable to run: one or more inputs deleted" %
                        job.id)
                elif job_state == JOB_READY:
                    self.dispatcher.put(self.job_wrappers.pop(job.id))
                    log.info("(%d) Job dispatched" % job.id)
                elif job_state == JOB_DELETED:
                    log.info("(%d) Job deleted by user while still queued" %
                             job.id)
                elif job_state == JOB_ADMIN_DELETED:
                    log.info("(%d) Job deleted by admin while still queued" %
                             job.id)
                elif job_state == JOB_USER_OVER_QUOTA:
                    log.info("(%d) User (%s) is over quota: job paused" %
                             (job.id, job.user_id))
                    job.set_state(model.Job.states.PAUSED)
                    for dataset_assoc in job.output_datasets + job.output_library_datasets:
                        dataset_assoc.dataset.dataset.state = model.Dataset.states.PAUSED
                        dataset_assoc.dataset.info = "Execution of this dataset's job is paused because you were over your disk quota at the time it was ready to run"
                        self.sa_session.add(dataset_assoc.dataset.dataset)
                    self.sa_session.add(job)
                elif job_state == JOB_ERROR:
                    log.error("(%d) Error checking job readiness" % job.id)
                else:
                    log.error("(%d) Job in unknown state '%s'" %
                              (job.id, job_state))
                    new_waiting_jobs.append(job.id)
            except Exception:
                log.exception("failure running job %d", job.id)
        # Update the waiting list
        if not self.track_jobs_in_database:
            self.waiting_jobs = new_waiting_jobs
        # Remove cached wrappers for any jobs that are no longer being tracked
        # Iterate over a copy of the keys since entries are deleted below
        for id in list(self.job_wrappers.keys()):
            if id not in new_waiting_jobs:
                del self.job_wrappers[id]
        # Flush, if we updated the state
        self.sa_session.flush()
        # Done with the session
        self.sa_session.remove()

    def __check_job_state(self, job):
        """
        Check if a job is ready to run by verifying that each of its input
        datasets is ready (specifically in the OK state). If any input dataset
        has an error, fail the job and return JOB_INPUT_ERROR. If any input
        dataset is deleted, fail the job and return JOB_INPUT_DELETED.  If all
        input datasets are in OK state, return JOB_READY indicating that the
        job can be dispatched. Otherwise, return JOB_WAIT indicating that input
        datasets are still being prepared.
        """
        if not self.track_jobs_in_database:
            in_memory_not_ready_state = self.__verify_in_memory_job_inputs(job)
            if in_memory_not_ready_state:
                return in_memory_not_ready_state

        # Else, if tracking in the database, job.state is guaranteed to be NEW and
        # the inputs are guaranteed to be OK.

        # Create the job wrapper so that the destination can be set
        job_id = job.id
        job_wrapper = self.job_wrappers.get(job_id, None)
        if not job_wrapper:
            job_wrapper = self.job_wrapper(job)
            self.job_wrappers[job_id] = job_wrapper

        # If state == JOB_READY, assume job_destination also set - otherwise
        # in case of various error or cancelled states do not assume
        # destination has been set.
        state, job_destination = self.__verify_job_ready(job, job_wrapper)

        if state == JOB_READY:
            # PASS.  increase usage by one job (if caching) so that multiple jobs aren't dispatched on this queue iteration
            self.increase_running_job_count(job.user_id, job_destination.id)
        return state

    def __verify_job_ready(self, job, job_wrapper):
        """ Compute job destination and verify job is ready at that
        destination by checking job limits and quota. If this method
        returns a job state of JOB_READY, it MUST also return a job
        destination.
        """
        job_destination = None
        try:
            assert job_wrapper.tool is not None, 'This tool was disabled before the job completed.  Please contact your Galaxy administrator.'
            # Cause the job_destination to be set and cached by the mapper
            job_destination = job_wrapper.job_destination
        except AssertionError as e:
            log.warning(
                "(%s) Tool '%s' removed from tool config, unable to run job" %
                (job.id, job.tool_id))
            job_wrapper.fail(e)
            return JOB_ERROR, job_destination
        except JobNotReadyException as e:
            job_state = e.job_state or JOB_WAIT
            return job_state, None
        except Exception as e:
            failure_message = getattr(e, 'failure_message',
                                      DEFAULT_JOB_PUT_FAILURE_MESSAGE)
            if failure_message == DEFAULT_JOB_PUT_FAILURE_MESSAGE:
                log.exception('Failed to generate job destination')
            else:
                log.debug("Intentionally failing job with message (%s)" %
                          failure_message)
            job_wrapper.fail(failure_message)
            return JOB_ERROR, job_destination
        # job is ready to run, check limits
        # TODO: these checks should be refactored to minimize duplication and made more modular/pluggable
        state = self.__check_destination_jobs(job, job_wrapper)
        if state == JOB_READY:
            state = self.__check_user_jobs(job, job_wrapper)
        if state == JOB_READY and self.app.config.enable_quotas:
            quota = self.app.quota_agent.get_quota(job.user)
            if quota is not None:
                try:
                    usage = self.app.quota_agent.get_usage(user=job.user,
                                                           history=job.history)
                    if usage > quota:
                        return JOB_USER_OVER_QUOTA, job_destination
                except AssertionError:
                    pass  # No history, should not happen with an anon user
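
In the fuller variant of this class shown earlier, the quota check above is followed by a total-walltime check that sums each job's running-to-ok interval from its state history. Below is a standalone sketch of that accumulation, assuming `jobs` is an iterable of objects whose `state_history` entries carry `state`, `create_time` and `update_time` attributes as above.

import datetime


def time_spent_running(jobs):
    """Sum the running -> ok interval of each job's state history."""
    total = datetime.timedelta(0)
    for job in jobs:
        started = finished = None
        for history in sorted(job.state_history,
                              key=lambda h: h.update_time):
            if history.state == 'running':
                started = history.create_time
            elif history.state == 'ok':
                finished = history.create_time
        # Skip jobs with incomplete state histories
        if started is not None and finished is not None:
            total += finished - started
    return total
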
Beispiel #18
0
class DeferredJobQueue(object):
    job_states = Bunch(READY='ready', WAIT='wait', INVALID='invalid')

    def __init__(self, app):
        self.app = app
        self.sa_session = app.model.context.current
        self.queue = Queue()
        self.plugins = {}
        self._load_plugins()
        self.sleeper = Sleeper()
        self.running = True
        self.waiting_jobs = []
        self.__check_jobs_at_startup()
        self.monitor_thread = threading.Thread(target=self.__monitor)
        self.monitor_thread.start()
        log.info('Deferred job queue started')

    def _load_plugins(self):
        for fname in os.listdir(os.path.dirname(__file__)):
            if not fname.startswith('_') and fname.endswith('.py'):
                name = fname[:-3]
                module_name = 'galaxy.jobs.deferred.' + name
                try:
                    module = __import__(module_name)
                except ImportError:
                    log.exception(
                        'Deferred job plugin appears to exist but is not loadable: %s',
                        module_name)
                    continue
                for comp in module_name.split(".")[1:]:
                    module = getattr(module, comp)
                if '__all__' not in dir(module):
                    log.error(
                        'Plugin "%s" does not contain a list of exported classes in __all__'
                        % module_name)
                    continue
                for obj in module.__all__:
                    display_name = ':'.join((module_name, obj))
                    plugin = getattr(module, obj)
                    for name in ('check_job', 'run_job'):
                        if name not in dir(plugin):
                            log.error(
                                'Plugin "%s" does not contain required method "%s()"'
                                % (display_name, name))
                            break
                    else:
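                        # This `else` belongs to the `for` loop above: it runs
                        # only when the loop completed without `break`, i.e.
                        # no required method was missing.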
                        self.plugins[obj] = plugin(self.app)
                        self.plugins[obj].job_states = self.job_states
                        log.debug('Loaded deferred job plugin: %s' %
                                  display_name)

    def __check_jobs_at_startup(self):
        waiting_jobs = self.sa_session.query(model.DeferredJob) \
                                      .filter(model.DeferredJob.state == model.DeferredJob.states.WAITING).all()
        for job in waiting_jobs:
            if not self.__check_job_plugin(job):
                continue
            if 'check_interval' in dir(self.plugins[job.plugin]):
                job.check_interval = self.plugins[job.plugin].check_interval
            log.info('Recovered deferred job (id: %s) at startup' % job.id)
            # Pass the job ID as opposed to the job, since the monitor thread
            # needs to load it in its own threadlocal scoped session.
            self.waiting_jobs.append(job.id)

    def __monitor(self):
        while self.running:
            try:
                self.__monitor_step()
            except Exception:
                log.exception('Exception in monitor_step')
            self.sleeper.sleep(1)
        log.info('job queue stopped')

    def __monitor_step(self):
        # TODO: Querying the database with this frequency is bad, we need message passing
        new_jobs = self.sa_session.query(model.DeferredJob) \
                                  .filter(model.DeferredJob.state == model.DeferredJob.states.NEW).all()
        for job in new_jobs:
            if not self.__check_job_plugin(job):
                continue
            job.state = model.DeferredJob.states.WAITING
            self.sa_session.add(job)
            self.sa_session.flush()
            if 'check_interval' in dir(self.plugins[job.plugin]):
                job.check_interval = self.plugins[job.plugin].check_interval
            self.waiting_jobs.append(job)
        new_waiting = []
        for job in self.waiting_jobs:
            # Recovered jobs are passed in by ID
            if isinstance(job, int):
                job = self.sa_session.query(model.DeferredJob).get(job)
            if job.is_check_time:
                try:
                    job_state = self.plugins[job.plugin].check_job(job)
                except Exception:
                    self.__fail_job(job)
                    log.exception(
                        'Set deferred job %s to error because of an exception in check_job()'
                        % job.id)
                    continue
                if job_state == self.job_states.READY:
                    try:
                        self.plugins[job.plugin].run_job(job)
                    except Exception:
                        self.__fail_job(job)
                        log.exception(
                            'Set deferred job %s to error because of an exception in run_job()'
                            % job.id)
                        continue
                elif job_state == self.job_states.INVALID:
                    self.__fail_job(job)
                    log.error(
                        'Unable to run deferred job (id: %s): Plugin "%s" marked it as invalid'
                        % (job.id, job.plugin))
                    continue
                else:
                    new_waiting.append(job)
                job.last_check = 'now'  # the DeferredJob model's last_check setter marshals 'now' to the current time
            else:
                new_waiting.append(job)
        self.waiting_jobs = new_waiting

    def __check_job_plugin(self, job):
        if job.plugin not in self.plugins:
            log.error('Invalid deferred job plugin: %s', job.plugin)
            job.state = model.DeferredJob.states.ERROR
            self.sa_session.add(job)
            self.sa_session.flush()
            return False
        return True

    def __check_if_ready_to_run(self, job):
        return self.plugins[job.plugin].check_job(job)

    def __fail_job(self, job):
        job.state = model.DeferredJob.states.ERROR
        self.sa_session.add(job)
        self.sa_session.flush()

    def shutdown(self):
        self.running = False
        self.sleeper.wake()
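The plugin loader above only accepts modules that list their exported classes in `__all__` and whose classes provide `check_job()` and `run_job()`; `check_interval` is optional and, when present, is copied onto waiting jobs. A minimal skeleton satisfying that contract (the module path and class name are illustrative):

# galaxy/jobs/deferred/example_plugin.py (hypothetical module)
__all__ = ('ExamplePlugin', )


class ExamplePlugin(object):
    # Optional: copied onto jobs to control how often check_job() runs
    check_interval = 60

    def __init__(self, app):
        self.app = app

    def check_job(self, job):
        # Must return one of the states the queue injects as
        # self.job_states: READY, WAIT, or INVALID
        return self.job_states.READY

    def run_job(self, job):
        # Perform the deferred work once check_job() reports READY
        pass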
Beispiel #19
0
class S3ObjectStore(ObjectStore):
    """
    Object store that stores objects as items in an AWS S3 bucket. A local
    cache exists that is used as an intermediate location for files between
    Galaxy and S3.
    """
    def __init__(self, config, config_xml):
        if boto is None:
            raise Exception(NO_BOTO_ERROR_MESSAGE)
        super(S3ObjectStore, self).__init__(config)
        self.staging_path = self.config.file_path
        self.transfer_progress = 0
        self._parse_config_xml(config_xml)
        self._configure_connection()
        self.bucket = self._get_bucket(self.bucket)
        # Clean cache only if value is set in galaxy.ini
        if self.cache_size != -1:
            # Convert GBs to bytes for comparison
            self.cache_size = self.cache_size * 1073741824
            # Helper for interruptable sleep
            self.sleeper = Sleeper()
            self.cache_monitor_thread = threading.Thread(
                target=self.__cache_monitor)
            self.cache_monitor_thread.start()
            log.info("Cache cleaner manager started")
        # Test if 'axel' is available for parallel download and pull the key into cache
        try:
            subprocess.call('axel')
            self.use_axel = True
        except OSError:
            self.use_axel = False

    def _configure_connection(self):
        log.debug("Configuring S3 Connection")
        self.conn = S3Connection(self.access_key, self.secret_key)

    def _parse_config_xml(self, config_xml):
        try:
            a_xml = config_xml.findall('auth')[0]
            self.access_key = a_xml.get('access_key')
            self.secret_key = a_xml.get('secret_key')
            b_xml = config_xml.findall('bucket')[0]
            self.bucket = b_xml.get('name')
            self.use_rr = string_as_bool(
                b_xml.get('use_reduced_redundancy', "False"))
            self.max_chunk_size = int(b_xml.get('max_chunk_size', 250))
            cn_xml = config_xml.findall('connection')
            if not cn_xml:
                cn_xml = {}
            else:
                cn_xml = cn_xml[0]
            self.host = cn_xml.get('host', None)
            self.port = int(cn_xml.get('port', 6000))
            self.multipart = string_as_bool(cn_xml.get('multipart', 'True'))
            self.is_secure = string_as_bool(cn_xml.get('is_secure', 'True'))
            self.conn_path = cn_xml.get('conn_path', '/')
            c_xml = config_xml.findall('cache')[0]
            self.cache_size = float(c_xml.get('size', -1))
            self.staging_path = c_xml.get('path',
                                          self.config.object_store_cache_path)

            for d_xml in config_xml.findall('extra_dir'):
                self.extra_dirs[d_xml.get('type')] = d_xml.get('path')

            log.debug("Object cache dir:    %s", self.staging_path)
            log.debug("       job work dir: %s", self.extra_dirs['job_work'])

            # for multipart upload
            self.s3server = {
                'access_key': self.access_key,
                'secret_key': self.secret_key,
                'is_secure': self.is_secure,
                'max_chunk_size': self.max_chunk_size,
                'host': self.host,
                'port': self.port,
                'use_rr': self.use_rr,
                'conn_path': self.conn_path
            }
        except Exception:
            # Toss it back up after logging, we can't continue loading at this point.
            log.exception(
                "Malformed ObjectStore Configuration XML -- unable to continue"
            )
            raise

    def __cache_monitor(self):
        time.sleep(2)  # Wait for things to load before starting the monitor
        while self.running:
            total_size = 0
            # Is this going to be too expensive of an operation to be done frequently?
            file_list = []
            for dirpath, _, filenames in os.walk(self.staging_path):
                for filename in filenames:
                    filepath = os.path.join(dirpath, filename)
                    file_size = os.path.getsize(filepath)
                    total_size += file_size
                    # Get the time the given file was last accessed
                    last_access_time = time.localtime(os.stat(filepath).st_atime)
                    # Compose a tuple of the access time and the file path
                    file_tuple = last_access_time, filepath, file_size
                    file_list.append(file_tuple)
            # Sort the file list (based on access time)
            file_list.sort()
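            # (struct_time tuples compare field-by-field, so this orders
            # entries from least to most recently accessed)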
            # Initiate cleaning once within 10% of the defined cache size?
            cache_limit = self.cache_size * 0.9
            if total_size > cache_limit:
                log.info(
                    "Initiating cache cleaning: current cache size: %s; clean until smaller than: %s",
                    convert_bytes(total_size), convert_bytes(cache_limit))
                # How much to delete? If simply deleting up to the cache-10% limit,
                # is likely to be deleting frequently and may run the risk of hitting
                # the limit - maybe delete additional #%?
                # For now, delete enough to leave at least 10% of the total cache free
                delete_this_much = total_size - cache_limit
                self.__clean_cache(file_list, delete_this_much)
            self.sleeper.sleep(30)  # Test cache size every 30 seconds?

    def __clean_cache(self, file_list, delete_this_much):
        """ Keep deleting files from the file_list until the size of the deleted
        files is greater than the value in delete_this_much parameter.

        :type file_list: list
        :param file_list: List of candidate files that can be deleted. This method
            will start deleting files from the beginning of the list so the list
            should be sorted accordingly. The list must contain 3-element tuples,
            positioned as follows: position 0 holds the file's last accessed
            timestamp (as time.struct_time), position 1 holds the file path, and
            position 2 holds the file size (e.g.,
            ``(<access time>, '/mnt/data/dataset_1.dat', 472394)``)

        :type delete_this_much: int
        :param delete_this_much: Total size of files, in bytes, that should be deleted.
        """
        # Keep deleting datasets from file_list until deleted_amount does not
        # exceed delete_this_much; start deleting from the front of the file list,
        # which assumes the oldest files come first on the list.
        deleted_amount = 0
        for entry in file_list:
            if deleted_amount < delete_this_much:
                deleted_amount += entry[2]
                os.remove(entry[1])
                # Debugging code for printing deleted files' stats
                # folder, file_name = os.path.split(f[1])
                # file_date = time.strftime("%m/%d/%y %H:%M:%S", f[0])
                # log.debug("%s. %-25s %s, size %s (deleted %s/%s)" \
                #     % (i, file_name, convert_bytes(f[2]), file_date, \
                #     convert_bytes(deleted_amount), convert_bytes(delete_this_much)))
            else:
                log.debug("Cache cleaning done. Total space freed: %s",
                          convert_bytes(deleted_amount))
                return

    def _get_bucket(self, bucket_name):
        """ Sometimes a handle to a bucket is not established right away so try
        it a few times. Raise an error if a connection cannot be established. """
        for i in range(5):
            try:
                bucket = self.conn.get_bucket(bucket_name)
                log.debug("Using cloud object store with bucket '%s'",
                          bucket.name)
                return bucket
            except S3ResponseError:
                try:
                    log.debug(
                        "Bucket not found, creating s3 bucket with handle '%s'",
                        bucket_name)
                    self.conn.create_bucket(bucket_name)
                except S3ResponseError:
                    log.exception("Could not get bucket '%s', attempt %s/5",
                                  bucket_name, i + 1)
                    time.sleep(2)
        # All the attempts have been exhausted and connection was not established,
        # raise error
        raise S3ResponseError(500, "Could not get bucket '%s'" % bucket_name)

    def _fix_permissions(self, rel_path):
        """ Set permissions on rel_path"""
        for basedir, _, files in os.walk(rel_path):
            umask_fix_perms(basedir, self.config.umask, 0o777, self.config.gid)
            for filename in files:
                path = os.path.join(basedir, filename)
                # Ignore symlinks
                if os.path.islink(path):
                    continue
                umask_fix_perms(path, self.config.umask, 0o666,
                                self.config.gid)

    def _construct_path(self,
                        obj,
                        base_dir=None,
                        dir_only=None,
                        extra_dir=None,
                        extra_dir_at_root=False,
                        alt_name=None,
                        obj_dir=False,
                        **kwargs):
        # extra_dir should never be constructed from provided data but just
        # make sure there are no shenanigans afoot
        if extra_dir and extra_dir != os.path.normpath(extra_dir):
            log.warning('extra_dir is not normalized: %s', extra_dir)
            raise ObjectInvalid("The requested object is invalid")
        # ensure that any parent directory references in alt_name would not
        # result in a path not contained in the directory path constructed here
        if alt_name:
            if not safe_relpath(alt_name):
                log.warning('alt_name would locate path outside dir: %s',
                            alt_name)
                raise ObjectInvalid("The requested object is invalid")
            # alt_name can contain parent directory references, but S3 will not
            # follow them, so if they are valid we normalize them out
            alt_name = os.path.normpath(alt_name)
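            # (e.g. os.path.normpath('foo/../bar') == 'bar')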
        rel_path = os.path.join(*directory_hash_id(obj.id))
        if extra_dir is not None:
            if extra_dir_at_root:
                rel_path = os.path.join(extra_dir, rel_path)
            else:
                rel_path = os.path.join(rel_path, extra_dir)

        # for JOB_WORK directory
        if obj_dir:
            rel_path = os.path.join(rel_path, str(obj.id))
        if base_dir:
            base = self.extra_dirs.get(base_dir)
            return os.path.join(base, rel_path)

        # S3 folders are marked by having trailing '/' so add it now
        rel_path = '%s/' % rel_path

        if not dir_only:
            rel_path = os.path.join(
                rel_path, alt_name if alt_name else "dataset_%s.dat" % obj.id)
        return rel_path

    def _get_cache_path(self, rel_path):
        return os.path.abspath(os.path.join(self.staging_path, rel_path))

    def _get_transfer_progress(self):
        return self.transfer_progress

    def _get_size_in_s3(self, rel_path):
        try:
            key = self.bucket.get_key(rel_path)
            if key:
                return key.size
        except S3ResponseError:
            log.exception("Could not get size of key '%s' from S3", rel_path)
        # Key is missing or the lookup failed
        return -1

    def _key_exists(self, rel_path):
        if rel_path.startswith('/'):
            # Keys are relative to the bucket root; an absolute path here
            # indicates a caller bug
            raise ValueError("rel_path must be relative, got: %s" % rel_path)
        try:
            # A hackish way of testing if the rel_path is a folder vs a file
            is_dir = rel_path[-1] == '/'
            if is_dir:
                # Folder keys do not exist as such, so look for keys that
                # share the prefix instead
                exists = len(self.bucket.get_all_keys(prefix=rel_path)) > 0
            else:
                key = Key(self.bucket, rel_path)
                exists = key.exists()
        except S3ResponseError:
            log.exception("Trouble checking existence of S3 key '%s'",
                          rel_path)
            return False
        return exists

    def _in_cache(self, rel_path):
        """ Check if the given dataset is in the local cache and return True if so. """
        # log.debug("------ Checking cache for rel_path %s" % rel_path)
        cache_path = self._get_cache_path(rel_path)
        return os.path.exists(cache_path)
        # TODO: Part of checking if a file is in cache should be to ensure the
        # size of the cached file matches that on S3. Once the upload tool
        # explicitly creates the dataset, this check should be implemented;
        # in the meantime, it's not looking likely to be implementable reliably.
        # if os.path.exists(cache_path):
        #     # print "***1 %s exists" % cache_path
        #     if self._key_exists(rel_path):
        #         # print "***2 %s exists in S3" % rel_path
        #         # Make sure the size in cache is available in its entirety
        #         # print "File '%s' cache size: %s, S3 size: %s" % (cache_path, os.path.getsize(cache_path), self._get_size_in_s3(rel_path))
        #         if os.path.getsize(cache_path) == self._get_size_in_s3(rel_path):
        #             # print "***2.1 %s exists in S3 and the size is the same as in cache (in_cache=True)" % rel_path
        #             exists = True
        #         else:
        #             # print "***2.2 %s exists but differs in size from cache (in_cache=False)" % cache_path
        #             exists = False
        #     else:
        #         # Although not perfect decision making, this most likely means
        #         # that the file is currently being uploaded
        #         # print "***3 %s found in cache but not in S3 (in_cache=True)" % cache_path
        #         exists = True
        # else:
        #     return False

    def _pull_into_cache(self, rel_path):
        # Ensure the cache directory structure exists (e.g., dataset_#_files/)
        rel_path_dir = os.path.dirname(rel_path)
        if not os.path.exists(self._get_cache_path(rel_path_dir)):
            os.makedirs(self._get_cache_path(rel_path_dir))
        # Now pull in the file
        file_ok = self._download(rel_path)
        self._fix_permissions(self._get_cache_path(rel_path_dir))
        return file_ok

    def _transfer_cb(self, complete, total):
        self.transfer_progress += 10

    def _download(self, rel_path):
        try:
            log.debug("Pulling key '%s' into cache to %s", rel_path,
                      self._get_cache_path(rel_path))
            key = self.bucket.get_key(rel_path)
            # Test if cache is large enough to hold the new file
            if self.cache_size > 0 and key.size > self.cache_size:
                log.critical(
                    "File %s is larger (%s) than the cache size (%s). Cannot download.",
                    rel_path, key.size, self.cache_size)
                return False
            if self.use_axel:
                log.debug("Parallel pulling key '%s' into cache to %s",
                          rel_path, self._get_cache_path(rel_path))
                ncores = multiprocessing.cpu_count()
                url = key.generate_url(7200)
                # Pass an argument list; a plain command string would require
                # shell=True to be interpreted correctly
                ret_code = subprocess.call(
                    ['axel', '-a', '-n', str(ncores), url])
                if ret_code == 0:
                    return True
            else:
                log.debug("Pulling key '%s' into cache to %s", rel_path,
                          self._get_cache_path(rel_path))
                self.transfer_progress = 0  # Reset transfer progress counter
                key.get_contents_to_filename(self._get_cache_path(rel_path),
                                             cb=self._transfer_cb,
                                             num_cb=10)
                return True
        except S3ResponseError:
            log.exception("Problem downloading key '%s' from S3 bucket '%s'",
                          rel_path, self.bucket.name)
        return False

    def _push_to_os(self, rel_path, source_file=None, from_string=None):
        """
        Push the file pointed to by ``rel_path`` to the object store naming the key
        ``rel_path``. If ``source_file`` is provided, push that file instead while
        still using ``rel_path`` as the key name.
        If ``from_string`` is provided, set contents of the file to the value of
        the string.
        """
        try:
            source_file = source_file if source_file else self._get_cache_path(
                rel_path)
            if os.path.exists(source_file):
                key = Key(self.bucket, rel_path)
                if os.path.getsize(source_file) == 0 and key.exists():
                    log.debug(
                        "Wanted to push file '%s' to S3 key '%s' but its size is 0; skipping.",
                        source_file, rel_path)
                    return True
                if from_string:
                    key.set_contents_from_string(
                        from_string, reduced_redundancy=self.use_rr)
                    log.debug("Pushed data from string '%s' to key '%s'",
                              from_string, rel_path)
                else:
                    start_time = datetime.now()
                    log.debug(
                        "Pushing cache file '%s' of size %s bytes to key '%s'",
                        source_file, os.path.getsize(source_file), rel_path)
                    mb_size = os.path.getsize(source_file) / 1e6
                    if mb_size < 10 or (not self.multipart):
                        self.transfer_progress = 0  # Reset transfer progress counter
                        key.set_contents_from_filename(
                            source_file,
                            reduced_redundancy=self.use_rr,
                            cb=self._transfer_cb,
                            num_cb=10)
                    else:
                        multipart_upload(self.s3server, self.bucket, key.name,
                                         source_file, mb_size)
                    end_time = datetime.now()
                    log.debug(
                        "Pushed cache file '%s' to key '%s' (%s bytes transfered in %s sec)",
                        source_file, rel_path, os.path.getsize(source_file),
                        end_time - start_time)
                return True
            else:
                log.error(
                    "Tried updating key '%s' from source file '%s', but source file does not exist.",
                    rel_path, source_file)
        except S3ResponseError:
            log.exception("Trouble pushing S3 key '%s' from file '%s'",
                          rel_path, source_file)
        return False

    def file_ready(self, obj, **kwargs):
        """
        A helper method that checks if a file corresponding to a dataset is
        ready and available to be used. Return ``True`` if so, ``False`` otherwise.
        """
        rel_path = self._construct_path(obj, **kwargs)
        # Make sure the size in cache is available in its entirety
        if self._in_cache(rel_path):
            if os.path.getsize(self._get_cache_path(
                    rel_path)) == self._get_size_in_s3(rel_path):
                return True
            log.debug("Waiting for dataset %s to transfer from OS: %s/%s",
                      rel_path,
                      os.path.getsize(self._get_cache_path(rel_path)),
                      self._get_size_in_s3(rel_path))
        return False

    def exists(self, obj, **kwargs):
        in_cache = in_s3 = False
        rel_path = self._construct_path(obj, **kwargs)

        # Check cache
        if self._in_cache(rel_path):
            in_cache = True
        # Check S3
        in_s3 = self._key_exists(rel_path)
        # log.debug("~~~~~~ File '%s' exists in cache: %s; in s3: %s" % (rel_path, in_cache, in_s3))
        # dir_only does not get synced so shortcut the decision
        dir_only = kwargs.get('dir_only', False)
        base_dir = kwargs.get('base_dir', None)
        if dir_only:
            if in_cache or in_s3:
                return True
            # for JOB_WORK directory
            elif base_dir:
                if not os.path.exists(rel_path):
                    os.makedirs(rel_path)
                return True
            else:
                return False

        # TODO: Sync should probably not be done here. Add this to an async upload stack?
        if in_cache and not in_s3:
            self._push_to_os(rel_path,
                             source_file=self._get_cache_path(rel_path))
            return True
        elif in_s3:
            return True
        else:
            return False

    def create(self, obj, **kwargs):
        if not self.exists(obj, **kwargs):

            # Pull out locally used fields
            extra_dir = kwargs.get('extra_dir', None)
            extra_dir_at_root = kwargs.get('extra_dir_at_root', False)
            dir_only = kwargs.get('dir_only', False)
            alt_name = kwargs.get('alt_name', None)

            # Construct hashed path
            rel_path = os.path.join(*directory_hash_id(obj.id))

            # Optionally append extra_dir
            if extra_dir is not None:
                if extra_dir_at_root:
                    rel_path = os.path.join(extra_dir, rel_path)
                else:
                    rel_path = os.path.join(rel_path, extra_dir)

            # Create given directory in cache
            cache_dir = os.path.join(self.staging_path, rel_path)
            if not os.path.exists(cache_dir):
                os.makedirs(cache_dir)

            # Although not really necessary to create S3 folders (because S3 has
            # flat namespace), do so for consistency with the regular file system
            # S3 folders are marked by having trailing '/' so add it now
            # s3_dir = '%s/' % rel_path
            # self._push_to_os(s3_dir, from_string='')
            # If instructed, create the dataset in cache & in S3
            if not dir_only:
                rel_path = os.path.join(
                    rel_path,
                    alt_name if alt_name else "dataset_%s.dat" % obj.id)
                open(os.path.join(self.staging_path, rel_path), 'w').close()
                self._push_to_os(rel_path, from_string='')

    def empty(self, obj, **kwargs):
        if self.exists(obj, **kwargs):
            # An object is empty when its size is zero
            return self.size(obj, **kwargs) == 0
        else:
            raise ObjectNotFound(
                'objectstore.empty, object does not exist: %s, kwargs: %s' %
                (str(obj), str(kwargs)))

    def size(self, obj, **kwargs):
        rel_path = self._construct_path(obj, **kwargs)
        if self._in_cache(rel_path):
            try:
                return os.path.getsize(self._get_cache_path(rel_path))
            except OSError as ex:
                log.info(
                    "Could not get size of file '%s' in local cache, will try S3. Error: %s",
                    rel_path, ex)
        elif self.exists(obj, **kwargs):
            return self._get_size_in_s3(rel_path)
        log.warning("Did not find dataset '%s', returning 0 for size",
                    rel_path)
        return 0

    def delete(self, obj, entire_dir=False, **kwargs):
        rel_path = self._construct_path(obj, **kwargs)
        extra_dir = kwargs.get('extra_dir', None)
        base_dir = kwargs.get('base_dir', None)
        dir_only = kwargs.get('dir_only', False)
        obj_dir = kwargs.get('obj_dir', False)
        try:
            # Remove temporary data in the JOB_WORK directory
            if base_dir and dir_only and obj_dir:
                shutil.rmtree(os.path.abspath(rel_path))
                return True

            # For the case of extra_files, because we don't have a reference to
            # individual files/keys we need to remove the entire directory structure
            # with all the files in it. This is easy for the local file system,
            # but requires iterating through each individual key in S3 and deleting it.
            if entire_dir and extra_dir:
                shutil.rmtree(self._get_cache_path(rel_path))
                results = self.bucket.get_all_keys(prefix=rel_path)
                for key in results:
                    log.debug("Deleting key %s", key.name)
                    key.delete()
                return True
            else:
                # Delete from cache first
                os.unlink(self._get_cache_path(rel_path))
                # Delete from S3 as well
                if self._key_exists(rel_path):
                    key = Key(self.bucket, rel_path)
                    log.debug("Deleting key %s", key.name)
                    key.delete()
                    return True
        except S3ResponseError:
            log.exception("Could not delete key '%s' from S3", rel_path)
        except OSError:
            log.exception('%s delete error', self.get_filename(obj, **kwargs))
        return False

    def get_data(self, obj, start=0, count=-1, **kwargs):
        rel_path = self._construct_path(obj, **kwargs)
        # Check cache first and get file if not there
        if not self._in_cache(rel_path):
            self._pull_into_cache(rel_path)
        # Read the file content from cache
        with open(self._get_cache_path(rel_path), 'r') as data_file:
            data_file.seek(start)
            content = data_file.read(count)
        return content
        return content

    def get_filename(self, obj, **kwargs):
        base_dir = kwargs.get('base_dir', None)
        dir_only = kwargs.get('dir_only', False)
        obj_dir = kwargs.get('obj_dir', False)
        rel_path = self._construct_path(obj, **kwargs)

        # for JOB_WORK directory
        if base_dir and dir_only and obj_dir:
            return os.path.abspath(rel_path)

        cache_path = self._get_cache_path(rel_path)
        # S3 does not recognize directories as files so cannot check if those exist.
        # So, if checking dir only, ensure given dir exists in cache and return
        # the expected cache path.
        # dir_only = kwargs.get('dir_only', False)
        # if dir_only:
        #     if not os.path.exists(cache_path):
        #         os.makedirs(cache_path)
        #     return cache_path
        # Check if the file exists in the cache first
        if self._in_cache(rel_path):
            return cache_path
        # Check if the file exists in persistent storage and, if it does, pull it into cache
        elif self.exists(obj, **kwargs):
            if dir_only:  # Directories do not get pulled into cache
                return cache_path
            else:
                if self._pull_into_cache(rel_path):
                    return cache_path
        # For the case of retrieving a directory only, return the expected path
        # even if it does not exist.
        # if dir_only:
        #     return cache_path
        raise ObjectNotFound(
            'objectstore.get_filename, no cache_path: %s, kwargs: %s' %
            (str(obj), str(kwargs)))
        # return cache_path # Until the upload tool does not explicitly create the dataset, return expected path

    def update_from_file(self, obj, file_name=None, create=False, **kwargs):
        if create:
            self.create(obj, **kwargs)
        if self.exists(obj, **kwargs):
            rel_path = self._construct_path(obj, **kwargs)
            # Choose whether to use the dataset file itself or an alternate file
            if file_name:
                source_file = os.path.abspath(file_name)
                # Copy into cache
                cache_file = self._get_cache_path(rel_path)
                try:
                    if source_file != cache_file:
                        # FIXME? Should this be a `move`?
                        shutil.copy2(source_file, cache_file)
                    self._fix_permissions(cache_file)
                except OSError:
                    log.exception(
                        "Trouble copying source file '%s' to cache '%s'",
                        source_file, cache_file)
            else:
                source_file = self._get_cache_path(rel_path)
            # Update the file on S3
            self._push_to_os(rel_path, source_file)
        else:
            raise ObjectNotFound(
                'objectstore.update_from_file, object does not exist: %s, kwargs: %s'
                % (str(obj), str(kwargs)))

    def get_object_url(self, obj, **kwargs):
        if self.exists(obj, **kwargs):
            rel_path = self._construct_path(obj, **kwargs)
            try:
                key = Key(self.bucket, rel_path)
                return key.generate_url(expires_in=86400)  # 24hrs
            except S3ResponseError:
                log.exception("Trouble generating URL for dataset '%s'",
                              rel_path)
        return None

    def get_store_usage_percent(self):
        return 0.0
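The cache monitor above amounts to LRU-by-access-time eviction: total up the cache, and once it exceeds 90% of the configured limit, delete the least recently accessed files until usage drops back under that threshold. A standalone sketch of the same idea (the function name and arguments are illustrative, not part of the class):

import os


def evict_lru(cache_dir, limit_bytes):
    # Collect (atime, path, size) tuples; sorting puts the oldest-accessed first
    entries = []
    total = 0
    for dirpath, _, filenames in os.walk(cache_dir):
        for name in filenames:
            path = os.path.join(dirpath, name)
            st = os.stat(path)
            entries.append((st.st_atime, path, st.st_size))
            total += st.st_size
    threshold = limit_bytes * 0.9
    freed = 0
    if total > threshold:
        entries.sort()
        for _, path, size in entries:
            if total - freed <= threshold:
                break
            os.remove(path)
            freed += size
    return freed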
Beispiel #20
0
class DistributedObjectStore(NestedObjectStore):

    """
    ObjectStore that defers to a list of backends.

    When getting objects the first store where the object exists is used.
    When creating objects they are created in a store selected randomly, but
    with weighting.
    """
    store_type = 'distributed'

    def __init__(self, config, config_dict, fsmon=False):
        """
        :type config: object
        :param config: An object, most likely populated from
            `galaxy/config.ini`, having the same attributes needed by
            :class:`NestedObjectStore`.

        :type config_dict: dict
        :param config_dict: A dictionary of backend definitions with optional
            ``global_max_percent_full``, as produced by :meth:`parse_xml`.

        :type fsmon: bool
        :param fsmon: If True, monitor the file system for free space,
            removing backends when they get too full.
        """
        super(DistributedObjectStore, self).__init__(config)

        self.backends = {}
        self.weighted_backend_ids = []
        self.original_weighted_backend_ids = []
        self.max_percent_full = {}
        self.global_max_percent_full = config_dict.get("global_max_percent_full", 0)
        random.seed()

        backends_def = config_dict["backends"]
        for backend_def in backends_def:
            backend_id = backend_def["id"]
            file_path = backend_def["files_dir"]
            extra_dirs = backend_def.get("extra_dirs", [])
            maxpctfull = backend_def.get("max_percent_full", 0)
            weight = backend_def["weight"]
            disk_config_dict = dict(files_dir=file_path, extra_dirs=extra_dirs)
            self.backends[backend_id] = DiskObjectStore(config, disk_config_dict)
            self.max_percent_full[backend_id] = maxpctfull
            log.debug("Loaded disk backend '%s' with weight %s and file_path: %s" % (backend_id, weight, file_path))

            for _ in range(weight):
                # The simplest way to do weighting: add backend ids to a
                # sequence the number of times equalling weight, then randomly
                # choose a backend from that sequence at creation
                self.weighted_backend_ids.append(backend_id)
        self.original_weighted_backend_ids = self.weighted_backend_ids

        self.sleeper = None
        if fsmon and (self.global_max_percent_full or [_ for _ in self.max_percent_full.values() if _ != 0.0]):
            self.sleeper = Sleeper()
            self.filesystem_monitor_thread = threading.Thread(target=self.__filesystem_monitor)
            self.filesystem_monitor_thread.setDaemon(True)
            self.filesystem_monitor_thread.start()
            log.info("Filesystem space monitor started")

    @classmethod
    def parse_xml(clazz, config_xml, legacy=False):
        if legacy:
            backends_root = config_xml
        else:
            backends_root = config_xml.find('backends')

        backends = []
        config_dict = {
            'global_max_percent_full': float(backends_root.get('maxpctfull', 0)),
            'backends': backends,
        }

        for elem in [e for e in backends_root if e.tag == 'backend']:
            id = elem.get('id')
            weight = int(elem.get('weight', 1))
            maxpctfull = float(elem.get('maxpctfull', 0))
            elem_type = elem.get('type', 'disk')

            if elem_type:
                path = None
                extra_dirs = []
                for sub in elem:
                    if sub.tag == 'files_dir':
                        path = sub.get('path')
                    elif sub.tag == 'extra_dir':
                        extra_dirs.append({"type": sub.get('type'), "path": sub.get('path')})

                backend_dict = {
                    'id': id,
                    'weight': weight,
                    'max_percent_full': maxpctfull,
                    'files_dir': path,
                    'extra_dirs': extra_dirs,
                    'type': elem_type,
                }
                backends.append(backend_dict)

        return config_dict

    @classmethod
    def from_xml(clazz, config, config_xml, fsmon=False):
        legacy = False
        if config_xml is None:
            distributed_config = config.distributed_object_store_config_file
            assert distributed_config is not None, \
                "distributed object store ('object_store = distributed') " \
                "requires a config file, please set one in " \
                "'distributed_object_store_config_file')"

            log.debug('Loading backends for distributed object store from %s', distributed_config)
            config_xml = ElementTree.parse(distributed_config).getroot()
            legacy = True
        else:
            log.debug('Loading backends for distributed object store from %s', config_xml.get('id'))

        config_dict = clazz.parse_xml(config_xml, legacy=legacy)
        return clazz(config, config_dict, fsmon=fsmon)

    def to_dict(self):
        as_dict = super(DistributedObjectStore, self).to_dict()
        as_dict["global_max_percent_full"] = self.global_max_percent_full
        backends = []
        for backend_id, backend in self.backends.items():
            backend_as_dict = backend.to_dict()
            backend_as_dict["id"] = backend_id
            backend_as_dict["max_percent_full"] = self.max_percent_full[backend_id]
            backend_as_dict["weight"] = len([i for i in self.original_weighted_backend_ids if i == backend_id])
            backends.append(backend_as_dict)
        as_dict["backends"] = backends
        return as_dict

    def shutdown(self):
        """Shut down. Kill the free space monitor if there is one."""
        super(DistributedObjectStore, self).shutdown()
        if self.sleeper is not None:
            self.sleeper.wake()

    def __filesystem_monitor(self):
        while self.running:
            new_weighted_backend_ids = self.original_weighted_backend_ids
            for id, backend in self.backends.items():
                maxpct = self.max_percent_full[id] or self.global_max_percent_full
                pct = backend.get_store_usage_percent()
                if pct > maxpct:
                    new_weighted_backend_ids = [_ for _ in new_weighted_backend_ids if _ != id]
            self.weighted_backend_ids = new_weighted_backend_ids
            self.sleeper.sleep(120)  # Test free space every 2 minutes

    def create(self, obj, **kwargs):
        """The only method in which obj.object_store_id may be None."""
        if obj.object_store_id is None or not self.exists(obj, **kwargs):
            if obj.object_store_id is None or obj.object_store_id not in self.backends:
                try:
                    obj.object_store_id = random.choice(self.weighted_backend_ids)
                except IndexError:
                    raise ObjectInvalid('objectstore.create, could not generate '
                                        'obj.object_store_id: %s, kwargs: %s'
                                        % (str(obj), str(kwargs)))
                _create_object_in_session(obj)
                log.debug("Selected backend '%s' for creation of %s %s"
                          % (obj.object_store_id, obj.__class__.__name__, obj.id))
            else:
                log.debug("Using preferred backend '%s' for creation of %s %s"
                          % (obj.object_store_id, obj.__class__.__name__, obj.id))
            self.backends[obj.object_store_id].create(obj, **kwargs)

    def _call_method(self, method, obj, default, default_is_exception, **kwargs):
        object_store_id = self.__get_store_id_for(obj, **kwargs)
        if object_store_id is not None:
            return self.backends[object_store_id].__getattribute__(method)(obj, **kwargs)
        if default_is_exception:
            raise default('objectstore, _call_method failed: %s on %s, kwargs: %s'
                          % (method, self._repr_object_for_exception(obj), str(kwargs)))
        else:
            return default

    def __get_store_id_for(self, obj, **kwargs):
        if obj.object_store_id is not None:
            if obj.object_store_id in self.backends:
                return obj.object_store_id
            else:
                log.warning('The backend object store ID (%s) for %s object with ID %s is invalid'
                            % (obj.object_store_id, obj.__class__.__name__, obj.id))
        # if this instance has been switched from a non-distributed to a
        # distributed object store, or if the object's store id is invalid,
        # try to locate the object
        for id, store in self.backends.items():
            if store.exists(obj, **kwargs):
                log.warning('%s object with ID %s found in backend object store with ID %s'
                            % (obj.__class__.__name__, obj.id, id))
                obj.object_store_id = id
                _create_object_in_session(obj)
                return id
        return None
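Backend selection in this store is weighted sampling by repetition: each backend id is appended to a list `weight` times and `random.choice` picks uniformly from it, so a backend with weight 3 is selected three times as often as one with weight 1; the filesystem monitor simply filters over-full ids out of that list. A minimal sketch of the mechanism (the backend ids and weights are illustrative):

import random

weights = {'files1': 3, 'files2': 1}  # backend id -> weight
weighted_ids = [bid for bid, w in weights.items() for _ in range(w)]
# ['files1', 'files1', 'files1', 'files2']: files1 is chosen ~75% of the time
chosen = random.choice(weighted_ids)

# When a backend fills past its threshold, the monitor drops its id:
weighted_ids = [bid for bid in weighted_ids if bid != 'files1']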