Example #1
def test_history_collection_copy(list_size=NUM_DATASETS):
    with _setup_mapping_and_user() as (test_config, object_store, model,
                                       old_history):
        for i in range(NUM_COLLECTIONS):
            hdas = []
            for i in range(list_size * 2):
                hda_path = test_config.write("moo",
                                             "test_metadata_original_%d" % i)
                hda = _create_hda(model,
                                  object_store,
                                  old_history,
                                  hda_path,
                                  visible=False,
                                  include_metadata_file=False)
                hdas.append(hda)

            list_elements = []
            list_collection = model.DatasetCollection(
                collection_type="list:paired")
            for j in range(list_size):
                paired_collection = model.DatasetCollection(
                    collection_type="paired")
                forward_dce = model.DatasetCollectionElement(
                    collection=paired_collection, element=hdas[j * 2])
                reverse_dce = model.DatasetCollectionElement(
                    collection=paired_collection, element=hdas[j * 2 + 1])
                paired_collection_element = model.DatasetCollectionElement(
                    collection=list_collection, element=paired_collection)
                list_elements.append(paired_collection_element)
                model.context.add_all(
                    [forward_dce, reverse_dce, paired_collection_element])
            history_dataset_collection = model.HistoryDatasetCollectionAssociation(
                collection=list_collection)
            history_dataset_collection.user = old_history.user
            model.context.add(history_dataset_collection)

            model.context.flush()
            old_history.add_dataset_collection(history_dataset_collection)
            history_dataset_collection.add_item_annotation(
                model.context, old_history.user, history_dataset_collection,
                "annotation #%d" % history_dataset_collection.hid)

        model.context.flush()
        annotation_str = history_dataset_collection.get_item_annotation_str(
            model.context, old_history.user, history_dataset_collection)

        # Saving magic SA invocations for detecting full flushes that may harm performance.
        # from sqlalchemy import event
        # @event.listens_for(model.context, "before_flush")
        # def track_instances_before_flush(session, context, instances):
        #     if not instances:
        #         print("FULL FLUSH...")
        #     else:
        #         print("Flushing just %s" % instances)

        history_copy_timer = ExecutionTimer()
        new_history = old_history.copy(target_user=old_history.user)
        print("history copied %s" % history_copy_timer)

        for hda in new_history.active_datasets:
            assert hda.get_size() == 3
            annotation_str = hda.get_item_annotation_str(
                model.context, old_history.user, hda)
            assert annotation_str == "annotation #%d" % hda.hid, annotation_str

        assert len(new_history.active_dataset_collections) == NUM_COLLECTIONS
        for hdca in new_history.active_dataset_collections:
            annotation_str = hdca.get_item_annotation_str(
                model.context, old_history.user, hdca)
            assert annotation_str == "annotation #%d" % hdca.hid, annotation_str
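
All of the examples in this collection share the same measure-then-log pattern: construct an ExecutionTimer right before a unit of work and interpolate the timer object into a log or print message once the work is done. The sketch below is a minimal stand-in written for illustration only (it assumes, but does not reproduce, Galaxy's own ExecutionTimer); all it needs is a recorded start time and a __str__ that renders the elapsed time lazily.

import logging
import time

logging.basicConfig(level=logging.DEBUG)
log = logging.getLogger(__name__)


class MiniExecutionTimer:
    """Toy timer mirroring how ExecutionTimer is used in the examples."""

    def __init__(self):
        # Record the start time at construction, exactly when the examples do.
        self.begin = time.time()

    def __str__(self):
        # Rendering happens only when the timer is interpolated into a message.
        return "(%0.3f ms)" % ((time.time() - self.begin) * 1000)


history_copy_timer = MiniExecutionTimer()
time.sleep(0.01)  # stand-in for the work being measured
log.debug("history copied %s", history_copy_timer)
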
Example #2
def execute( trans, tool, param_combinations, history, rerun_remap_job_id=None, collection_info=None, workflow_invocation_uuid=None ):
    """
    Execute a tool and return an object containing a summary (output data,
    number of failures, etc.).
    """
    all_jobs_timer = ExecutionTimer()
    execution_tracker = ToolExecutionTracker( tool, param_combinations, collection_info )
    app = trans.app
    execution_cache = ToolExecutionCache(trans)

    def execute_single_job(params):
        job_timer = ExecutionTimer()
        if workflow_invocation_uuid:
            params[ '__workflow_invocation_uuid__' ] = workflow_invocation_uuid
        elif '__workflow_invocation_uuid__' in params:
            # Only workflow invocation code gets to set this, ignore user supplied
            # values or rerun parameters.
            del params[ '__workflow_invocation_uuid__' ]
        job, result = tool.handle_single_execution( trans, rerun_remap_job_id, params, history, collection_info, execution_cache )
        if job:
            message = EXECUTION_SUCCESS_MESSAGE % (tool.id, job.id, job_timer)
            log.debug(message)
            execution_tracker.record_success( job, result )
        else:
            execution_tracker.record_error( result )

    config = app.config
    burst_at = getattr( config, 'tool_submission_burst_at', 10 )
    burst_threads = getattr( config, 'tool_submission_burst_threads', 1 )

    tool_action = tool.action
    if hasattr( tool_action, "check_inputs_ready" ):
        for params in execution_tracker.param_combinations:
            # This will throw an exception if the tool is not ready.
            tool_action.check_inputs_ready(
                tool,
                trans,
                params,
                history
            )

    job_count = len(execution_tracker.param_combinations)
    if job_count < burst_at or burst_threads < 2:
        for params in execution_tracker.param_combinations:
            execute_single_job(params)
    else:
        q = Queue()

        def worker():
            while True:
                params = q.get()
                execute_single_job(params)
                q.task_done()

        for i in range(burst_threads):
            t = Thread(target=worker)
            t.daemon = True
            t.start()

        for params in execution_tracker.param_combinations:
            q.put(params)

        q.join()

    log.debug("Executed %d job(s) for tool %s request: %s" % (job_count, tool.id, all_jobs_timer))
    if collection_info:
        history = history or tool.get_default_history_by_trans( trans )
        if len(param_combinations) == 0:
            template = "Attempting to map over an empty collection, this is not yet implemented. collection_info is [%s]"
            message = template % collection_info
            log.warn(message)
            raise Exception(message)
        params = param_combinations[0]
        execution_tracker.create_output_collections( trans, history, params )

    return execution_tracker
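
The burst branch above fans work out to short-lived daemon threads fed from a queue. Below is a self-contained sketch of that producer/consumer pattern with the Galaxy-specific pieces replaced by plain callables; the configuration names tool_submission_burst_at and tool_submission_burst_threads come from the example itself, everything else is illustrative.

from queue import Queue
from threading import Thread


def run_all(tasks, burst_at=10, burst_threads=4):
    # Small batches run inline; larger ones are fanned out to worker threads.
    if len(tasks) < burst_at or burst_threads < 2:
        for task in tasks:
            task()
        return

    q = Queue()

    def worker():
        while True:
            task = q.get()
            try:
                task()
            finally:
                q.task_done()

    for _ in range(burst_threads):
        Thread(target=worker, daemon=True).start()

    for task in tasks:
        q.put(task)
    # Wait for every queued task; the daemon workers are abandoned afterwards.
    q.join()


run_all([lambda i=i: print("job", i) for i in range(25)], burst_at=10, burst_threads=3)
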
Example #3
    def populate_collection_elements(self,
                                     collection,
                                     root_collection_builder,
                                     filenames,
                                     name=None,
                                     metadata_source_name=None,
                                     final_job_state='ok'):
        # TODO: allow configurable sorting.
        #    <sort by="lexical" /> <!-- default -->
        #    <sort by="reverse_lexical" />
        #    <sort regex="example.(\d+).fastq" by="1:numerical" />
        #    <sort regex="part_(\d+)_sample_([^_]+).fastq" by="2:lexical,1:numerical" />
        if name is None:
            name = "unnamed output"

        element_datasets = []
        for filename, discovered_file in filenames.items():
            create_dataset_timer = ExecutionTimer()
            fields_match = discovered_file.match
            if not fields_match:
                raise Exception("Problem parsing metadata fields for file %s" %
                                filename)
            element_identifiers = fields_match.element_identifiers
            designation = fields_match.designation
            visible = fields_match.visible
            ext = fields_match.ext
            dbkey = fields_match.dbkey
            # galaxy.tools.parser.output_collection_def.INPUT_DBKEY_TOKEN
            if dbkey == "__input__":
                dbkey = self.input_dbkey

            # Create new primary dataset
            dataset_name = fields_match.name or designation

            link_data = discovered_file.match.link_data
            tag_list = discovered_file.match.tag_list

            sources = discovered_file.match.sources
            hashes = discovered_file.match.hashes
            created_from_basename = discovered_file.match.created_from_basename

            dataset = self.create_dataset(
                ext=ext,
                designation=designation,
                visible=visible,
                dbkey=dbkey,
                name=dataset_name,
                filename=filename,
                metadata_source_name=metadata_source_name,
                link_data=link_data,
                tag_list=tag_list,
                sources=sources,
                hashes=hashes,
                created_from_basename=created_from_basename,
                final_job_state=final_job_state,
            )
            log.debug(
                "(%s) Created dynamic collection dataset for path [%s] with element identifier [%s] for output [%s] %s",
                self.job_id(),
                filename,
                designation,
                name,
                create_dataset_timer,
            )
            element_datasets.append((element_identifiers, dataset))

        add_datasets_timer = ExecutionTimer()
        self.add_datasets_to_history([d for (ei, d) in element_datasets])
        log.debug(
            "(%s) Add dynamic collection datasets to history for output [%s] %s",
            self.job_id(),
            name,
            add_datasets_timer,
        )

        for (element_identifiers, dataset) in element_datasets:
            current_builder = root_collection_builder
            for element_identifier in element_identifiers[:-1]:
                current_builder = current_builder.get_level(element_identifier)
            current_builder.add_dataset(element_identifiers[-1], dataset)

            # Associate new dataset with job
            element_identifier_str = ":".join(element_identifiers)
            association_name = '__new_primary_file_%s|%s__' % (
                name, element_identifier_str)
            self.add_output_dataset_association(association_name, dataset)

        self.flush()
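
The final loop walks each element identifier list through the collection builder, descending one level per intermediate identifier and attaching the dataset at the leaf. A toy builder (hypothetical, modelling only the get_level()/add_dataset() calls visible above) makes the resulting nesting concrete.

class ToyCollectionBuilder:
    """Hypothetical stand-in for the root_collection_builder used above."""

    def __init__(self):
        self.elements = {}

    def get_level(self, identifier):
        # Intermediate identifiers address (or create) a nested sub-builder.
        return self.elements.setdefault(identifier, ToyCollectionBuilder())

    def add_dataset(self, identifier, dataset):
        # The last identifier attaches the actual dataset at the leaf.
        self.elements[identifier] = dataset


root = ToyCollectionBuilder()
for element_identifiers, dataset in [(["sample1", "forward"], "hda_1"),
                                     (["sample1", "reverse"], "hda_2")]:
    current_builder = root
    for element_identifier in element_identifiers[:-1]:
        current_builder = current_builder.get_level(element_identifier)
    current_builder.add_dataset(element_identifiers[-1], dataset)

print(root.elements["sample1"].elements)  # {'forward': 'hda_1', 'reverse': 'hda_2'}
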
Example #4
    def execute(self, tool, trans, incoming={}, return_job=False, set_output_hid=True, set_output_history=True, history=None, job_params=None, rerun_remap_job_id=None, mapping_over_collection=False):
        """
        Executes a tool, creating job and tool outputs, associating them, and
        submitting the job to the job queue. If history is not specified, use
        trans.history as destination for tool's output datasets.
        """
        assert tool.allow_user_access( trans.user ), "User (%s) is not allowed to access this tool." % ( trans.user )
        # Set history.
        if not history:
            history = tool.get_default_history_by_trans( trans, create=True )

        out_data = odict()
        out_collections = {}
        out_collection_instances = {}
        # Track input dataset collections - but replace them with simple lists
        # so collect input datasets can process these normally.
        inp_dataset_collections = self.collect_input_dataset_collections( tool, incoming )
        # Collect any input datasets from the incoming parameters
        inp_data = self.collect_input_datasets( tool, incoming, trans )

        # Deal with input dataset names, 'dbkey' and types
        input_names = []
        input_ext = 'data'
        input_dbkey = incoming.get( "dbkey", "?" )
        inp_items = inp_data.items()
        inp_items.reverse()
        for name, data in inp_items:
            if not data:
                data = NoneDataset( datatypes_registry=trans.app.datatypes_registry )
                continue

            # Convert LDDA to an HDA.
            if isinstance(data, LibraryDatasetDatasetAssociation):
                data = data.to_history_dataset_association( None )
                inp_data[name] = data

            else:  # HDA
                if data.hid:
                    input_names.append( 'data %s' % data.hid )
            input_ext = data.ext

            if data.dbkey not in [None, '?']:
                input_dbkey = data.dbkey

            identifier = getattr( data, "element_identifier", None )
            if identifier is not None:
                incoming[ "%s|__identifier__" % name ] = identifier

        # Collect chromInfo dataset and add as parameters to incoming
        ( chrom_info, db_dataset ) = trans.app.genome_builds.get_chrom_info( input_dbkey, trans=trans, custom_build_hack_get_len_from_fasta_conversion=tool.id != 'CONVERTER_fasta_to_len' )
        if db_dataset:
            inp_data.update( { "chromInfo": db_dataset } )
        incoming[ "chromInfo" ] = chrom_info

        # Determine output dataset permission/roles list
        existing_datasets = [ inp for inp in inp_data.values() if inp ]
        if existing_datasets:
            output_permissions = trans.app.security_agent.guess_derived_permissions_for_datasets( existing_datasets )
        else:
            # No valid inputs, we will use history defaults
            output_permissions = trans.app.security_agent.history_get_default_permissions( history )

        # Build name for output datasets based on tool name and input names
        on_text = on_text_for_names( input_names )

        # Add the dbkey to the incoming parameters
        incoming[ "dbkey" ] = input_dbkey
        # wrapped params are used by change_format action and by output.label; only perform this wrapping once, as needed
        wrapped_params = WrappedParameters( trans, tool, incoming )
        # Keep track of parent / child relationships, we'll create all the
        # datasets first, then create the associations
        parent_to_child_pairs = []
        child_dataset_names = set()
        object_store_populator = ObjectStorePopulator( trans.app )

        def handle_output( name, output, hidden=None ):
            if output.parent:
                parent_to_child_pairs.append( ( output.parent, name ) )
                child_dataset_names.add( name )
            # What is the following hack for? Need to document under what
            # conditions the following can occur. ([email protected])
            # HACK: the output data has already been created
            #      this happens i.e. as a result of the async controller
            if name in incoming:
                dataid = incoming[name]
                data = trans.sa_session.query( trans.app.model.HistoryDatasetAssociation ).get( dataid )
                assert data is not None
                out_data[name] = data
            else:
                ext = determine_output_format( output, wrapped_params.params, inp_data, input_ext )
                data = trans.app.model.HistoryDatasetAssociation( extension=ext, create_dataset=True, sa_session=trans.sa_session )
                if hidden is None:
                    hidden = output.hidden
                if hidden:
                    data.visible = False
                # Commit the dataset immediately so it gets database assigned unique id
                trans.sa_session.add( data )
                trans.sa_session.flush()
                trans.app.security_agent.set_all_dataset_permissions( data.dataset, output_permissions )

            object_store_populator.set_object_store_id( data )

            # This may not be necessary with the new parent/child associations
            data.designation = name
            # Copy metadata from one of the inputs if requested.

            # metadata source can be either a string referencing an input
            # or an actual object to copy.
            metadata_source = output.metadata_source
            if metadata_source:
                if isinstance( metadata_source, basestring ):
                    metadata_source = inp_data[metadata_source]

            if metadata_source is not None:
                data.init_meta( copy_from=metadata_source )
            else:
                data.init_meta()
            # Take dbkey from LAST input
            data.dbkey = str(input_dbkey)
            # Set state
            # FIXME: shouldn't this be NEW until the job runner changes it?
            data.state = data.states.QUEUED
            data.blurb = "queued"
            # Set output label
            data.name = self.get_output_name( output, data, tool, on_text, trans, incoming, history, wrapped_params.params, job_params )
            # Store output
            out_data[ name ] = data
            if output.actions:
                # Apply pre-job tool-output-dataset actions; e.g. setting metadata, changing format
                output_action_params = dict( out_data )
                output_action_params.update( incoming )
                output.actions.apply_action( data, output_action_params )
            # Store all changes to database
            trans.sa_session.flush()
            return data

        for name, output in tool.outputs.items():
            if not filter_output(output, incoming):
                if output.collection:
                    collections_manager = trans.app.dataset_collections_service
                    # As far as I can tell - this is always true - but just verify
                    assert set_output_history, "Cannot create dataset collection for this kind of tool."

                    element_identifiers = []
                    input_collections = dict( [ (k, v[0]) for k, v in inp_dataset_collections.iteritems() ] )
                    known_outputs = output.known_outputs( input_collections, collections_manager.type_registry )
                    # Just to echo TODO elsewhere - this should be restructured to allow
                    # nested collections.
                    for output_part_def in known_outputs:
                        # Add elements to top-level collection, unless nested...
                        current_element_identifiers = element_identifiers
                        current_collection_type = output.structure.collection_type

                        for parent_id in (output_part_def.parent_ids or []):
                            # TODO: replace following line with formal abstractions for doing this.
                            current_collection_type = ":".join(current_collection_type.split(":")[1:])
                            name_to_index = dict(map(lambda (index, value): (value["name"], index), enumerate(current_element_identifiers)))
                            if parent_id not in name_to_index:
                                if parent_id not in current_element_identifiers:
                                    index = len(current_element_identifiers)
                                    current_element_identifiers.append(dict(
                                        name=parent_id,
                                        collection_type=current_collection_type,
                                        src="new_collection",
                                        element_identifiers=[],
                                    ))
                                else:
                                    index = name_to_index[parent_id]
                            current_element_identifiers = current_element_identifiers[ index ][ "element_identifiers" ]

                        effective_output_name = output_part_def.effective_output_name
                        element = handle_output( effective_output_name, output_part_def.output_def, hidden=True )
                        # TODO: this shouldn't exist in the top-level of the history at all
                        # but for now we are still working around that by hiding the contents
                        # there.
                        # Following hack causes dataset to not be added to history...
                        child_dataset_names.add( effective_output_name )

                        if set_output_history:
                            history.add_dataset( element, set_hid=set_output_hid )
                        trans.sa_session.add( element )
                        trans.sa_session.flush()

                        current_element_identifiers.append({
                            "__object__": element,
                            "name": output_part_def.element_identifier,
                        })
                        log.info(element_identifiers)

                    if output.dynamic_structure:
                        assert not element_identifiers  # known_outputs must have been empty
                        element_kwds = dict(elements=collections_manager.ELEMENTS_UNINITIALIZED)
                    else:
                        element_kwds = dict(element_identifiers=element_identifiers)

                    if mapping_over_collection:
                        dc = collections_manager.create_dataset_collection(
                            trans,
                            collection_type=output.structure.collection_type,
                            **element_kwds
                        )
                        out_collections[ name ] = dc
                    else:
                        hdca_name = self.get_output_name( output, None, tool, on_text, trans, incoming, history, wrapped_params.params, job_params )
                        hdca = collections_manager.create(
                            trans,
                            history,
                            name=hdca_name,
                            collection_type=output.structure.collection_type,
                            trusted_identifiers=True,
                            **element_kwds
                        )
                        # name here is name of the output element - not name
                        # of the hdca.
                        out_collection_instances[ name ] = hdca
                else:
                    handle_output_timer = ExecutionTimer()
                    handle_output( name, output )
                    log.info("Handled output %s" % handle_output_timer)
        # Add all the top-level (non-child) datasets to the history unless otherwise specified
        for name in out_data.keys():
            if name not in child_dataset_names and name not in incoming:  # don't add children or already existing datasets, i.e. those created via the async controller
                data = out_data[ name ]
                if set_output_history:
                    history.add_dataset( data, set_hid=set_output_hid )
                trans.sa_session.add( data )
                trans.sa_session.flush()
        # Add all the children to their parents
        for parent_name, child_name in parent_to_child_pairs:
            parent_dataset = out_data[ parent_name ]
            child_dataset = out_data[ child_name ]
            parent_dataset.children.append( child_dataset )
        # Store data after custom code runs
        trans.sa_session.flush()
        # Create the job object
        job = trans.app.model.Job()

        if hasattr( trans, "get_galaxy_session" ):
            galaxy_session = trans.get_galaxy_session()
            # If we're submitting from the API, there won't be a session.
            if type( galaxy_session ) == trans.model.GalaxySession:
                job.session_id = galaxy_session.id
        if trans.user is not None:
            job.user_id = trans.user.id
        job.history_id = history.id
        job.tool_id = tool.id
        try:
            # For backward compatibility, some tools may not have versions yet.
            job.tool_version = tool.version
        except:
            job.tool_version = "1.0.0"
        # FIXME: Don't need all of incoming here, just the defined parameters
        #        from the tool. We need to deal with tools that pass all post
        #        parameters to the command as a special case.
        for name, ( dataset_collection, reduced ) in inp_dataset_collections.iteritems():
            # TODO: Does this work if nested in repeat/conditional?
            if reduced:
                incoming[ name ] = "__collection_reduce__|%s" % dataset_collection.id
            # Should verify security? We check security of individual
            # datasets below?
            job.add_input_dataset_collection( name, dataset_collection )
        for name, value in tool.params_to_strings( incoming, trans.app ).iteritems():
            job.add_parameter( name, value )
        current_user_roles = trans.get_current_user_roles()
        access_timer = ExecutionTimer()
        for name, dataset in inp_data.iteritems():
            if dataset:
                if not trans.app.security_agent.can_access_dataset( current_user_roles, dataset.dataset ):
                    raise Exception("User does not have permission to use a dataset (%s) provided for input." % data.id)
                job.add_input_dataset( name, dataset )
            else:
                job.add_input_dataset( name, None )
        log.info("Verified access to datasets %s" % access_timer)
        for name, dataset in out_data.iteritems():
            job.add_output_dataset( name, dataset )
        for name, dataset_collection in out_collections.iteritems():
            job.add_implicit_output_dataset_collection( name, dataset_collection )
        for name, dataset_collection_instance in out_collection_instances.iteritems():
            job.add_output_dataset_collection( name, dataset_collection_instance )
        job.object_store_id = object_store_populator.object_store_id
        if job_params:
            job.params = dumps( job_params )
        job.set_handler(tool.get_job_handler(job_params))
        trans.sa_session.add( job )
        # Now that we have a job id, we can remap any outputs if this is a rerun and the user chose to continue dependent jobs
        # This functionality requires tracking jobs in the database.
        if trans.app.config.track_jobs_in_database and rerun_remap_job_id is not None:
            try:
                old_job = trans.sa_session.query( trans.app.model.Job ).get(rerun_remap_job_id)
                assert old_job is not None, '(%s/%s): Old job id is invalid' % (rerun_remap_job_id, job.id)
                assert old_job.tool_id == job.tool_id, '(%s/%s): Old tool id (%s) does not match rerun tool id (%s)' % (old_job.id, job.id, old_job.tool_id, job.tool_id)
                if trans.user is not None:
                    assert old_job.user_id == trans.user.id, '(%s/%s): Old user id (%s) does not match rerun user id (%s)' % (old_job.id, job.id, old_job.user_id, trans.user.id)
                elif trans.user is None and type( galaxy_session ) == trans.model.GalaxySession:
                    assert old_job.session_id == galaxy_session.id, '(%s/%s): Old session id (%s) does not match rerun session id (%s)' % (old_job.id, job.id, old_job.session_id, galaxy_session.id)
                else:
                    raise Exception('(%s/%s): Remapping via the API is not (yet) supported' % (old_job.id, job.id))
                # Duplicate PJAs before remap.
                for pjaa in old_job.post_job_actions:
                    job.add_post_job_action(pjaa.post_job_action)
                for jtod in old_job.output_datasets:
                    for (job_to_remap, jtid) in [(jtid.job, jtid) for jtid in jtod.dataset.dependent_jobs]:
                        if (trans.user is not None and job_to_remap.user_id == trans.user.id) or (trans.user is None and job_to_remap.session_id == galaxy_session.id):
                            if job_to_remap.state == job_to_remap.states.PAUSED:
                                job_to_remap.state = job_to_remap.states.NEW
                            for hda in [ dep_jtod.dataset for dep_jtod in job_to_remap.output_datasets ]:
                                if hda.state == hda.states.PAUSED:
                                    hda.state = hda.states.NEW
                                    hda.info = None
                            for p in job_to_remap.parameters:
                                if p.name == jtid.name and p.value == str(jtod.dataset.id):
                                    p.value = str(out_data[jtod.name].id)
                            jtid.dataset = out_data[jtod.name]
                            jtid.dataset.hid = jtod.dataset.hid
                            log.info('Job %s input HDA %s remapped to new HDA %s' % (job_to_remap.id, jtod.dataset.id, jtid.dataset.id))
                            trans.sa_session.add(job_to_remap)
                            trans.sa_session.add(jtid)
                    jtod.dataset.visible = False
                    trans.sa_session.add(jtod)
            except Exception:
                log.exception('Cannot remap rerun dependencies.')
        trans.sa_session.flush()
        # Some tools are not really executable, but jobs are still created for them ( for record keeping ).
        # Examples include tools that redirect to other applications ( epigraph ).  These special tools must
        # include something that can be retrieved from the params ( e.g., REDIRECT_URL ) to keep the job
        # from being queued.
        if 'REDIRECT_URL' in incoming:
            # Get the dataset - there should only be 1
            for name in inp_data.keys():
                dataset = inp_data[ name ]
            redirect_url = tool.parse_redirect_url( dataset, incoming )
            # GALAXY_URL should be included in the tool params so the external application
            # can send the user back to the current Galaxy instance
            GALAXY_URL = incoming.get( 'GALAXY_URL', None )
            assert GALAXY_URL is not None, "GALAXY_URL parameter missing in tool config."
            redirect_url += "&GALAXY_URL=%s" % GALAXY_URL
            # Job should not be queued, so set state to ok
            job.set_state( trans.app.model.Job.states.OK )
            job.info = "Redirected to: %s" % redirect_url
            trans.sa_session.add( job )
            trans.sa_session.flush()
            trans.response.send_redirect( url_for( controller='tool_runner', action='redirect', redirect_url=redirect_url ) )
        else:
            # Put the job in the queue if tracking in memory
            trans.app.job_queue.put( job.id, job.tool_id )
            trans.log_event( "Added job to the job queue, id: %s" % str(job.id), tool_id=job.tool_id )
            return job, out_data
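
For the collection-output branch of execute(), the parent_ids loop assembles a nested element_identifiers structure before handing it to the dataset collections manager. The literal below is a hypothetical illustration of that shape for a list:paired output; the keys mirror the dicts created in the loop, while the concrete names and placeholder strings stand in for the HDAs returned by handle_output().

# Hypothetical shape only - in the real flow "__object__" holds the
# HistoryDatasetAssociation created by handle_output(), not a string.
element_identifiers = [
    {
        "name": "sample1",                      # parent_id level
        "collection_type": "paired",            # "list:paired" with the first rank stripped
        "src": "new_collection",
        "element_identifiers": [
            {"__object__": "<HDA forward>", "name": "forward"},
            {"__object__": "<HDA reverse>", "name": "reverse"},
        ],
    },
]
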
Example #5
    def _delete(self, obj, entire_dir=False, **kwargs):
        ipt_timer = ExecutionTimer()
        rel_path = self._construct_path(obj, **kwargs)
        extra_dir = kwargs.get('extra_dir', None)
        base_dir = kwargs.get('base_dir', None)
        dir_only = kwargs.get('dir_only', False)
        obj_dir = kwargs.get('obj_dir', False)

        options = {kw.DEST_RESC_NAME_KW: self.resource}

        try:
            # Remove temporary data in JOB_WORK directory
            if base_dir and dir_only and obj_dir:
                shutil.rmtree(os.path.abspath(rel_path))
                return True

            # For the case of extra_files, because we don't have a reference to
            # individual files we need to remove the entire directory structure
            # with all the files in it. This is easy for the local file system,
            # but requires iterating through each individual key in irods and deleting it.
            if entire_dir and extra_dir:
                shutil.rmtree(self._get_cache_path(rel_path),
                              ignore_errors=True)

                col_path = f"{self.home}/{str(rel_path)}"
                col = None
                try:
                    col = self.session.collections.get(col_path)
                except CollectionDoesNotExist:
                    log.warning("Collection (%s) does not exist!", col_path)
                    return False

                cols = col.walk()
                # Traverse the tree only one level deep
                for _ in range(2):
                    # get next result
                    _, _, data_objects = next(cols)

                    # Delete data objects
                    for data_object in data_objects:
                        data_object.unlink(force=True)

                return True

            else:
                # Delete from cache first
                unlink(self._get_cache_path(rel_path), ignore_errors=True)
                # Delete from irods as well
                p = Path(rel_path)
                data_object_name = p.stem + p.suffix
                subcollection_name = p.parent

                collection_path = f"{self.home}/{str(subcollection_name)}"
                data_object_path = f"{collection_path}/{str(data_object_name)}"

                try:
                    data_obj = self.session.data_objects.get(
                        data_object_path, **options)
                    # remove object
                    data_obj.unlink(force=True)
                    return True
                except (DataObjectDoesNotExist, CollectionDoesNotExist):
                    log.info("Collection or data object (%s) does not exist",
                             data_object_path)
                    return True
        except OSError:
            log.exception('%s delete error', self._get_filename(obj, **kwargs))
        finally:
            log.debug("irods_pt _delete: %s", ipt_timer)
        return False
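
The single-object branch derives the iRODS collection and data object paths from the relative path with pathlib. Below is a small standalone sketch of that split; the zone, user, and path values are made up, and the home layout follows the f"/{zone}/home/{username}" convention used elsewhere in these examples.

from pathlib import Path

home = "/tempZone/home/rods"          # assumed iRODS home collection
rel_path = "abc/dataset_123.dat"      # illustrative relative path

p = Path(rel_path)
data_object_name = p.stem + p.suffix  # "dataset_123.dat"
subcollection_name = p.parent         # "abc"

collection_path = f"{home}/{str(subcollection_name)}"
data_object_path = f"{collection_path}/{str(data_object_name)}"
print(data_object_path)               # /tempZone/home/rods/abc/dataset_123.dat
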
Example #6
    def __init__(self, **kwargs):
        if not log.handlers:
            # Paste didn't handle it, so we need a temporary basic log
            # configured.  The handler added here gets dumped and replaced with
            # an appropriately configured logger in configure_logging below.
            logging.basicConfig(level=logging.DEBUG)
        log.debug("python path is: %s", ", ".join(sys.path))
        self.name = 'galaxy'
        self.startup_timer = ExecutionTimer()
        self.new_installation = False
        self.application_stack = application_stack_instance()
        # Read config file and check for errors
        self.config = config.Configuration(**kwargs)
        self.config.check()
        config.configure_logging(self.config)
        self.configure_fluent_log()
        self.config.reload_sanitize_whitelist(
            explicit='sanitize_whitelist_file' in kwargs)
        self.amqp_internal_connection_obj = galaxy.queues.connection_from_config(
            self.config)
        # control_worker *can* be initialized with a queue, but here we don't
        # want to and we'll allow postfork to bind and start it.
        self.control_worker = GalaxyQueueWorker(self)

        self._configure_tool_shed_registry()
        self._configure_object_store(fsmon=True)
        # Setup the database engine and ORM
        config_file = kwargs.get('global_conf', {}).get('__file__', None)
        if config_file:
            log.debug('Using "galaxy.ini" config file: %s', config_file)
        check_migrate_tools = self.config.check_migrate_tools
        self._configure_models(check_migrate_databases=True,
                               check_migrate_tools=check_migrate_tools,
                               config_file=config_file)

        # Manage installed tool shed repositories.
        from tool_shed.galaxy_install import installed_repository_manager
        self.installed_repository_manager = installed_repository_manager.InstalledRepositoryManager(
            self)

        self._configure_datatypes_registry(self.installed_repository_manager)
        galaxy.model.set_datatypes_registry(self.datatypes_registry)

        # Security helper
        self._configure_security()
        # Tag handler
        self.tag_handler = GalaxyTagManager(self.model.context)
        # Dataset Collection Plugins
        self.dataset_collections_service = DatasetCollectionManager(self)

        # Tool Data Tables
        self._configure_tool_data_tables(from_shed_config=False)
        # Load dbkey / genome build manager
        self._configure_genome_builds(data_table_name="__dbkeys__",
                                      load_old_style=True)

        # Genomes
        self.genomes = Genomes(self)
        # Data providers registry.
        self.data_provider_registry = DataProviderRegistry()

        # Initialize job metrics manager, needs to be in place before
        # config so per-destination modifications can be made.
        self.job_metrics = job_metrics.JobMetrics(
            self.config.job_metrics_config_file, app=self)

        # Initialize the job management configuration
        self.job_config = jobs.JobConfiguration(self)

        # Setup a Tool Cache
        self.tool_cache = ToolCache()
        self.tool_shed_repository_cache = ToolShedRepositoryCache(self)
        # Watch various config files for immediate reload
        self.watchers = ConfigWatchers(self)
        self._configure_toolbox()

        # Load Data Manager
        self.data_managers = DataManagers(self)
        # Load the update repository manager.
        self.update_repository_manager = update_repository_manager.UpdateRepositoryManager(
            self)
        # Load proprietary datatype converters and display applications.
        self.installed_repository_manager.load_proprietary_converters_and_display_applications(
        )
        # Load datatype display applications defined in local datatypes_conf.xml
        self.datatypes_registry.load_display_applications(self)
        # Load datatype converters defined in local datatypes_conf.xml
        self.datatypes_registry.load_datatype_converters(self.toolbox)
        # Load external metadata tool
        self.datatypes_registry.load_external_metadata_tool(self.toolbox)
        # Load history import/export tools.
        load_lib_tools(self.toolbox)
        # visualizations registry: associates resources with visualizations, controls how to render
        self.visualizations_registry = VisualizationsRegistry(
            self,
            directories_setting=self.config.visualization_plugins_directory,
            template_cache_dir=self.config.template_cache)
        # Tours registry
        self.tour_registry = ToursRegistry(self.config.tour_config_dir)
        # Webhooks registry
        self.webhooks_registry = WebhooksRegistry(self.config.webhooks_dirs)
        # Load security policy.
        self.security_agent = self.model.security_agent
        self.host_security_agent = galaxy.security.HostAgent(
            model=self.security_agent.model,
            permitted_actions=self.security_agent.permitted_actions)
        # Load quota management.
        if self.config.enable_quotas:
            self.quota_agent = galaxy.quota.QuotaAgent(self.model)
        else:
            self.quota_agent = galaxy.quota.NoQuotaAgent(self.model)
        # Heartbeat for thread profiling
        self.heartbeat = None
        # Container for OpenID authentication routines
        if self.config.enable_openid:
            from galaxy.web.framework import openid_manager
            self.openid_manager = openid_manager.OpenIDManager(
                self.config.openid_consumer_cache_path)
            self.openid_providers = OpenIDProviders.from_file(
                self.config.openid_config_file)
        else:
            self.openid_providers = OpenIDProviders()
        from galaxy import auth
        self.auth_manager = auth.AuthManager(self)
        # Start the heartbeat process if configured and available (wait until
        # postfork if using uWSGI)
        if self.config.use_heartbeat:
            if heartbeat.Heartbeat:
                self.heartbeat = heartbeat.Heartbeat(
                    self.config,
                    period=self.config.heartbeat_interval,
                    fname=self.config.heartbeat_log)
                self.heartbeat.daemon = True
                self.application_stack.register_postfork_function(
                    self.heartbeat.start)
        self.sentry_client = None
        if self.config.sentry_dsn:

            def postfork_sentry_client():
                import raven
                self.sentry_client = raven.Client(self.config.sentry_dsn)

            self.application_stack.register_postfork_function(
                postfork_sentry_client)

        # Transfer manager client
        if self.config.get_bool('enable_beta_job_managers', False):
            from galaxy.jobs import transfer_manager
            self.transfer_manager = transfer_manager.TransferManager(self)
        # Start the job manager
        from galaxy.jobs import manager
        self.job_manager = manager.JobManager(self)
        self.job_manager.start()
        # FIXME: These are exposed directly for backward compatibility
        self.job_queue = self.job_manager.job_queue
        self.job_stop_queue = self.job_manager.job_stop_queue
        self.proxy_manager = ProxyManager(self.config)
        # Initialize the external service types
        self.external_service_types = external_service_types.ExternalServiceTypesCollection(
            self.config.external_service_type_config_file,
            self.config.external_service_type_path, self)

        from galaxy.workflow import scheduling_manager
        # Must be initialized after job_config.
        self.workflow_scheduling_manager = scheduling_manager.WorkflowSchedulingManager(
            self)

        # Configure handling of signals
        handlers = {}
        if self.heartbeat:
            handlers[signal.SIGUSR1] = self.heartbeat.dump_signal_handler
        self._configure_signal_handlers(handlers)

        self.model.engine.dispose()
        self.server_starttime = int(time.time())  # used for cachebusting
        log.info("Galaxy app startup finished %s" % self.startup_timer)
Example #7
    def __search(self,
                 tool_id,
                 user,
                 input_data,
                 input_ids=None,
                 job_state=None,
                 param_dump=None,
                 is_workflow_step=False):
        search_timer = ExecutionTimer()
        query = self.sa_session.query(model.Job).filter(
            model.Job.tool_id == tool_id, model.Job.user == user)

        if job_state is None:
            query = query.filter(
                or_(
                    model.Job.state == 'running',
                    model.Job.state == 'queued',
                    model.Job.state == 'waiting',
                    model.Job.state == 'ok',
                ))
        else:
            if isinstance(job_state, string_types):
                query = query.filter(model.Job.state == job_state)
            elif isinstance(job_state, list):
                o = []
                for s in job_state:
                    o.append(model.Job.state == s)
                query = query.filter(or_(*o))

        for k, input_list in input_data.items():
            for type_values in input_list:
                t = type_values['src']
                v = type_values['id']
                if t == 'hda':
                    a = aliased(model.JobToInputDatasetAssociation)
                    b = aliased(model.HistoryDatasetAssociation)
                    c = aliased(model.HistoryDatasetAssociation)
                    query = query.filter(
                        and_(model.Job.id == a.job_id, a.name == k,
                             a.dataset_id == b.id,
                             c.dataset_id == b.dataset_id, c.id == v,
                             or_(b.deleted == false(), c.deleted == false())))
                elif t == 'ldda':
                    a = aliased(model.JobToInputLibraryDatasetAssociation)
                    query = query.filter(
                        and_(model.Job.id == a.job_id, a.name == k,
                             a.ldda_id == v))
                elif t == 'hdca':
                    a = aliased(model.JobToInputDatasetCollectionAssociation)
                    b = aliased(model.HistoryDatasetCollectionAssociation)
                    c = aliased(model.HistoryDatasetCollectionAssociation)
                    query = query.filter(
                        and_(
                            model.Job.id == a.job_id, a.name == k,
                            b.id == a.dataset_collection_id, c.id == v,
                            or_(
                                and_(b.deleted == false(), b.id == v),
                                and_(
                                    or_(
                                        c.copied_from_history_dataset_collection_association_id == b.id,
                                        b.copied_from_history_dataset_collection_association_id == c.id),
                                    c.deleted == false()))))
                else:
                    return []

        for job in query.all():
            # We found a job that is equal in terms of tool_id, user, state and input datasets,
            # but to be able to verify that the parameters match we need to modify all instances of
            # dataset_ids (HDA, LDDA, HDCA) in the incoming param_dump to point to those used by the
            # possibly equivalent job, which may have been run on copies of the original input data.
            replacement_timer = ExecutionTimer()
            job_input_ids = {}
            for src, items in input_ids.items():
                for dataset_id in items:
                    if src in job_input_ids and dataset_id in job_input_ids[src]:
                        continue
                    if src == 'hda':
                        a = aliased(model.JobToInputDatasetAssociation)
                        b = aliased(model.HistoryDatasetAssociation)
                        c = aliased(model.HistoryDatasetAssociation)

                        (job_dataset_id, ) = self.sa_session.query(
                            b.id).filter(
                                and_(a.job_id == job.id, b.id == a.dataset_id,
                                     c.dataset_id == b.dataset_id,
                                     c.id == dataset_id)).first()
                    elif src == 'hdca':
                        a = aliased(
                            model.JobToInputDatasetCollectionAssociation)
                        b = aliased(model.HistoryDatasetCollectionAssociation)
                        c = aliased(model.HistoryDatasetCollectionAssociation)

                        (job_dataset_id, ) = self.sa_session.query(b.id).filter(
                            and_(
                                a.job_id == job.id,
                                b.id == a.dataset_collection_id,
                                c.id == dataset_id,
                                or_(
                                    b.id == c.id,
                                    or_(
                                        c.copied_from_history_dataset_collection_association_id == b.id,
                                        b.copied_from_history_dataset_collection_association_id == c.id)))).first()
                    elif src == 'ldda':
                        job_dataset_id = dataset_id
                    else:
                        return []
                    if src not in job_input_ids:
                        job_input_ids[src] = {dataset_id: job_dataset_id}
                    else:
                        job_input_ids[src][dataset_id] = job_dataset_id

            def replace_dataset_ids(path, key, value):
                """Exchanges dataset_ids (HDA, LDA, HDCA, not Dataset) in param_dump with dataset ids used in job."""
                if key == 'id':
                    current_case = param_dump
                    for p in path:
                        current_case = current_case[p]
                    src = current_case['src']
                    value = job_input_ids[src][value]
                    return key, value
                return key, value

            new_param_dump = remap(param_dump, visit=replace_dataset_ids)
            log.info("Parameter replacement finished %s", replacement_timer)
            # new_param_dump has its dataset ids remapped to those used by the job.
            # We now ask if the remapped job parameters match the current job.
            query = self.sa_session.query(
                model.Job).filter(model.Job.id == job.id)
            for k, v in new_param_dump.items():
                a = aliased(model.JobParameter)
                query = query.filter(
                    and_(a.job_id == job.id, a.name == k,
                         a.value == json.dumps(v)))
            if query.first() is None:
                continue
            if is_workflow_step:
                add_n_parameters = 3
            else:
                add_n_parameters = 2
            if not len(job.parameters) == (len(new_param_dump) +
                                           add_n_parameters):
                # Verify that equivalent jobs had the same number of job parameters
                # We add 2 or 3 to new_param_dump because chrominfo and dbkey (and __workflow_invocation_uuid__) are not passed
                # as input parameters
                continue
            # check to make sure none of the output datasets or collections have been deleted
            # TODO: refactor this into the initial job query
            outputs_deleted = False
            for hda in job.output_datasets:
                if hda.dataset.deleted:
                    outputs_deleted = True
                    break
            if not outputs_deleted:
                for collection_instance in job.output_dataset_collection_instances:
                    if collection_instance.dataset_collection_instance.deleted:
                        outputs_deleted = True
                        break
            if not outputs_deleted:
                log.info("Searching jobs finished %s", search_timer)
                return job
        return None
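
replace_dataset_ids is applied through remap(), which is not imported in the excerpt; the sketch assumes boltons.iterutils.remap, whose visit callback receives (path, key, value) for every item and returns the (key, value) pair to keep. A cut-down demonstration of that contract, with an invented param_dump and id mapping:

from boltons.iterutils import remap

param_dump = {"input1": {"src": "hda", "id": 7}, "threshold": 0.5}
id_map = {"hda": {7: 42}}  # original dataset id -> id used by the candidate job


def replace_dataset_ids(path, key, value):
    if key == "id":
        # Walk the path to the dict holding this id so we can read its "src".
        container = param_dump
        for p in path:
            container = container[p]
        return key, id_map[container["src"]][value]
    return key, value


print(remap(param_dump, visit=replace_dataset_ids))
# {'input1': {'src': 'hda', 'id': 42}, 'threshold': 0.5}
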
Example #8
    def __init__(self, config, config_dict):
        ipt_timer = ExecutionTimer()
        super().__init__(config, config_dict)

        auth_dict = config_dict.get('auth')
        if auth_dict is None:
            _config_dict_error('auth')

        self.username = auth_dict.get('username')
        if self.username is None:
            _config_dict_error('auth->username')
        self.password = auth_dict.get('password')
        if self.password is None:
            _config_dict_error('auth->password')

        resource_dict = config_dict['resource']
        if resource_dict is None:
            _config_dict_error('resource')
        self.resource = resource_dict.get('name')
        if self.resource is None:
            _config_dict_error('resource->name')

        zone_dict = config_dict['zone']
        if zone_dict is None:
            _config_dict_error('zone')
        self.zone = zone_dict.get('name')
        if self.zone is None:
            _config_dict_error('zone->name')

        connection_dict = config_dict['connection']
        if connection_dict is None:
            _config_dict_error('connection')
        self.host = connection_dict.get('host')
        if self.host is None:
            _config_dict_error('connection->host')
        self.port = connection_dict.get('port')
        if self.port is None:
            _config_dict_error('connection->port')
        self.timeout = connection_dict.get('timeout')
        if self.timeout is None:
            _config_dict_error('connection->timeout')
        self.refresh_time = connection_dict.get('refresh_time')
        if self.refresh_time is None:
            _config_dict_error('connection->refresh_time')

        cache_dict = config_dict['cache']
        if cache_dict is None:
            _config_dict_error('cache')
        self.cache_size = cache_dict.get('size', -1)
        if self.cache_size is None:
            _config_dict_error('cache->size')
        self.staging_path = cache_dict.get(
            'path') or self.config.object_store_cache_path
        if self.staging_path is None:
            _config_dict_error('cache->path')

        extra_dirs = {
            e['type']: e['path']
            for e in config_dict.get('extra_dirs', [])
        }
        if not extra_dirs:
            _config_dict_error('extra_dirs')
        self.extra_dirs.update(extra_dirs)

        if irods is None:
            raise Exception(IRODS_IMPORT_MESSAGE)

        self.home = f"/{self.zone}/home/{self.username}"

        self.session = iRODSSession(host=self.host,
                                    port=self.port,
                                    user=self.username,
                                    password=self.password,
                                    zone=self.zone,
                                    refresh_time=self.refresh_time)
        # Set connection timeout
        self.session.connection_timeout = self.timeout
        log.debug("irods_pt __init__: %s", ipt_timer)
Example #9
def execute(trans,
            tool,
            mapping_params,
            history,
            rerun_remap_job_id=None,
            collection_info=None,
            workflow_invocation_uuid=None,
            invocation_step=None,
            max_num_jobs=None,
            job_callback=None,
            completed_jobs=None,
            workflow_resource_parameters=None):
    """
    Execute a tool and return an object containing a summary (output data,
    number of failures, etc.).
    """
    if max_num_jobs:
        assert invocation_step is not None
    if rerun_remap_job_id:
        assert invocation_step is None

    all_jobs_timer = ExecutionTimer()
    if invocation_step is None:
        execution_tracker = ToolExecutionTracker(trans, tool, mapping_params,
                                                 collection_info)
    else:
        execution_tracker = WorkflowStepExecutionTracker(
            trans,
            tool,
            mapping_params,
            collection_info,
            invocation_step,
            job_callback=job_callback)
    execution_cache = ToolExecutionCache(trans)

    def execute_single_job(execution_slice, completed_job):
        job_timer = ExecutionTimer()
        params = execution_slice.param_combination
        if workflow_invocation_uuid:
            params['__workflow_invocation_uuid__'] = workflow_invocation_uuid
        elif '__workflow_invocation_uuid__' in params:
            # Only workflow invocation code gets to set this, ignore user supplied
            # values or rerun parameters.
            del params['__workflow_invocation_uuid__']
        if workflow_resource_parameters:
            params['__workflow_resource_params__'] = workflow_resource_parameters
        elif '__workflow_resource_params__' in params:
            # Only workflow invocation code gets to set this, ignore user supplied
            # values or rerun parameters.
            del params['__workflow_resource_params__']
        job, result = tool.handle_single_execution(trans, rerun_remap_job_id,
                                                   execution_slice, history,
                                                   execution_cache,
                                                   completed_job,
                                                   collection_info)
        if job:
            message = EXECUTION_SUCCESS_MESSAGE % (tool.id, job.id, job_timer)
            log.debug(message)
            execution_tracker.record_success(execution_slice, job, result)
        else:
            execution_tracker.record_error(result)

    tool_action = tool.tool_action
    if hasattr(tool_action, "check_inputs_ready"):
        for params in execution_tracker.param_combinations:
            # This will throw an exception if the tool is not ready.
            tool_action.check_inputs_ready(
                tool,
                trans,
                params,
                history,
                execution_cache=execution_cache,
                collection_info=collection_info,
            )

    execution_tracker.ensure_implicit_collections_populated(
        history, mapping_params.param_template)
    job_count = len(execution_tracker.param_combinations)

    jobs_executed = 0
    has_remaining_jobs = False

    for i, execution_slice in enumerate(
            execution_tracker.new_execution_slices()):
        if max_num_jobs and jobs_executed >= max_num_jobs:
            has_remaining_jobs = True
            break
        else:
            execute_single_job(execution_slice, completed_jobs[i])
            jobs_executed += 1  # count toward the max_num_jobs cap checked above

    if has_remaining_jobs:
        raise PartialJobExecution(execution_tracker)
    else:
        execution_tracker.finalize_dataset_collections(trans)

    log.debug("Executed %d job(s) for tool %s request: %s" %
              (job_count, tool.id, all_jobs_timer))
    return execution_tracker
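
# Supplementary sketch: every example in this listing creates an ExecutionTimer
# before a unit of work and interpolates it into a log message afterwards. A
# minimal stand-in with that behaviour could look as follows (assumption: the
# real galaxy.util.ExecutionTimer may expose more than this).
import time


class SimpleExecutionTimer:
    """Start timing on creation; render elapsed time when formatted into a log."""

    def __init__(self):
        self._start = time.monotonic()

    @property
    def elapsed(self):
        return time.monotonic() - self._start

    def __str__(self):
        return f"({self.elapsed * 1000:0.3f} ms)"


# Usage mirrors the snippets: create the timer, do the work, log it.
timer = SimpleExecutionTimer()
sum(range(1_000_000))
print(f"work finished {timer}")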
Example #10
def build_index(whoosh_index_dir, file_path, hgweb_config_dir, dburi,
                **kwargs):
    """
    Build two search indexes simultaneously
    One is for repositories and the other for tools.
    """
    model = ts_mapping.init(file_path,
                            dburi,
                            engine_options={},
                            create_tables=False)
    sa_session = model.context.current

    #  Rare race condition exists here and below
    tool_index_dir = os.path.join(whoosh_index_dir, 'tools')
    if not os.path.exists(whoosh_index_dir):
        os.makedirs(whoosh_index_dir)
        os.makedirs(tool_index_dir)
        work_repo_dir = whoosh_index_dir
        work_tool_dir = tool_index_dir
    else:
        # Index exists, prevent in-place index regeneration
        work_repo_dir = tempfile.mkdtemp(prefix="tmp-whoosh-repo")
        work_tool_dir = tempfile.mkdtemp(prefix="tmp-whoosh-tool")

    repo_index_storage = FileStorage(work_repo_dir)
    tool_index_storage = FileStorage(work_tool_dir)
    repo_index = repo_index_storage.create_index(repo_schema)
    tool_index = tool_index_storage.create_index(tool_schema)
    repo_index_writer = repo_index.writer()
    tool_index_writer = tool_index.writer()
    repos_indexed = 0
    tools_indexed = 0

    execution_timer = ExecutionTimer()
    for repo in get_repos(sa_session, file_path, hgweb_config_dir, **kwargs):

        repo_index_writer.add_document(
            id=repo.get('id'),
            name=unicodify(repo.get('name')),
            description=unicodify(repo.get('description')),
            long_description=unicodify(repo.get('long_description')),
            homepage_url=unicodify(repo.get('homepage_url')),
            remote_repository_url=unicodify(repo.get('remote_repository_url')),
            repo_owner_username=unicodify(repo.get('repo_owner_username')),
            categories=unicodify(repo.get('categories')),
            times_downloaded=repo.get('times_downloaded'),
            approved=repo.get('approved'),
            last_updated=repo.get('last_updated'),
            full_last_updated=repo.get('full_last_updated'),
            repo_lineage=unicodify(repo.get('repo_lineage')))
        #  Tools get their own index
        for tool in repo.get('tools_list'):
            tool_index_writer.add_document(
                id=unicodify(tool.get('id')),
                name=unicodify(tool.get('name')),
                version=unicodify(tool.get('version')),
                description=unicodify(tool.get('description')),
                help=unicodify(tool.get('help')),
                repo_owner_username=unicodify(repo.get('repo_owner_username')),
                repo_name=unicodify(repo.get('name')),
                repo_id=repo.get('id'))
            tools_indexed += 1

        repos_indexed += 1

    tool_index_writer.commit()
    repo_index_writer.commit()

    log.info("Indexed repos: %s, tools: %s", repos_indexed, tools_indexed)
    log.info("Toolbox index finished %s", execution_timer)

    # Copy the built indexes if we were working in a tmp folder.
    if work_repo_dir != whoosh_index_dir:
        shutil.rmtree(whoosh_index_dir)
        os.makedirs(whoosh_index_dir)
        os.makedirs(tool_index_dir)
        copy_tree(work_repo_dir, whoosh_index_dir)
        copy_tree(work_tool_dir, tool_index_dir)
        shutil.rmtree(work_repo_dir)
        shutil.rmtree(work_tool_dir)
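
# Supplementary sketch: once build_index() has committed the Whoosh indexes,
# they can be opened and queried with plain Whoosh calls. The directory layout
# and the stored 'name' field are assumptions taken from the snippet above; the
# Tool Shed's real search code layers boosting and pagination on top.
from whoosh.index import open_dir
from whoosh.qparser import QueryParser


def search_repo_index(whoosh_index_dir, text, limit=10):
    """Return the names of repositories whose 'name' field matches ``text``."""
    ix = open_dir(whoosh_index_dir)
    with ix.searcher() as searcher:
        query = QueryParser("name", schema=ix.schema).parse(text)
        return [hit["name"] for hit in searcher.search(query, limit=limit)]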
Example #11
    def __search(self,
                 tool_id,
                 tool_version,
                 user,
                 input_data,
                 job_state=None,
                 param_dump=None,
                 wildcard_param_dump=None):
        search_timer = ExecutionTimer()

        def replace_dataset_ids(path, key, value):
            """Exchanges dataset_ids (HDA, LDA, HDCA, not Dataset) in param_dump with dataset ids used in job."""
            if key == 'id':
                current_case = param_dump
                for p in path:
                    current_case = current_case[p]
                src = current_case['src']
                value = job_input_ids[src][value]
                return key, value
            return key, value

        job_conditions = [
            and_(
                model.Job.tool_id == tool_id,
                model.Job.user == user,
                model.Job.copied_from_job_id.is_(
                    None)  # Always pick original job
            )
        ]

        if tool_version:
            job_conditions.append(model.Job.tool_version == str(tool_version))

        if job_state is None:
            job_conditions.append(
                model.Job.state.in_([
                    model.Job.states.NEW, model.Job.states.QUEUED,
                    model.Job.states.WAITING, model.Job.states.RUNNING,
                    model.Job.states.OK
                ]))
        else:
            if isinstance(job_state, str):
                job_conditions.append(model.Job.state == job_state)
            elif isinstance(job_state, list):
                o = []
                for s in job_state:
                    o.append(model.Job.state == s)
                job_conditions.append(or_(*o))

        for k, v in wildcard_param_dump.items():
            wildcard_value = None
            if v == {'__class__': 'RuntimeValue'}:
                # TODO: verify this is always None, e.g. run with a runtime input.
                v = None
            elif k.endswith('|__identifier__'):
                # We've taken care of this while constructing the conditions based on ``input_data`` above
                continue
            elif k == 'chromInfo' and '?.len' in v:
                # chromInfo with a '?.len' placeholder is skipped entirely.
                continue
            if not wildcard_value:
                value_dump = json.dumps(v, sort_keys=True)
                wildcard_value = value_dump.replace('"id": "__id_wildcard__"',
                                                    '"id": %')
            a = aliased(model.JobParameter)
            if value_dump == wildcard_value:
                job_conditions.append(
                    and_(
                        model.Job.id == a.job_id,
                        a.name == k,
                        a.value == value_dump,
                    ))
            else:
                job_conditions.append(
                    and_(model.Job.id == a.job_id, a.name == k,
                         a.value.like(wildcard_value)))

        job_conditions.append(
            and_(
                model.Job.any_output_dataset_collection_instances_deleted ==
                false(), model.Job.any_output_dataset_deleted == false()))

        subq = self.sa_session.query(
            model.Job.id).filter(*job_conditions).subquery()
        data_conditions = []

        # We now build the query filters that relate to the input datasets
        # that this job uses. We keep track of the requested dataset id in `requested_ids`,
        # the type (hda, hdca or lda) in `data_types`
        # and the ids that have been used in the job that has already been run in `used_ids`.
        requested_ids = []
        data_types = []
        used_ids = []
        for k, input_list in input_data.items():
            # k will be matched against the JobParameter.name column. This can be prefixed
            # depending on whether the input is in a repeat, or not (section and conditional)
            k = {k, k.split('|')[-1]}
            for type_values in input_list:
                t = type_values['src']
                v = type_values['id']
                requested_ids.append(v)
                data_types.append(t)
                identifier = type_values['identifier']
                if t == 'hda':
                    a = aliased(model.JobToInputDatasetAssociation)
                    b = aliased(model.HistoryDatasetAssociation)
                    c = aliased(model.HistoryDatasetAssociation)
                    d = aliased(model.JobParameter)
                    e = aliased(model.HistoryDatasetAssociationHistory)
                    stmt = select([
                        model.HistoryDatasetAssociation.id
                    ]).where(model.HistoryDatasetAssociation.id ==
                             e.history_dataset_association_id)
                    name_condition = []
                    if identifier:
                        data_conditions.append(
                            and_(
                                model.Job.id == d.job_id,
                                d.name.in_({f"{_}|__identifier__"
                                            for _ in k}),
                                d.value == json.dumps(identifier)))
                    else:
                        stmt = stmt.where(e.name == c.name)
                        name_condition.append(b.name == c.name)
                    stmt = stmt.where(e.extension == c.extension, ).where(
                        a.dataset_version == e.version, ).where(
                            e._metadata == c._metadata, )
                    data_conditions.append(
                        and_(
                            a.name.in_(k),
                            a.dataset_id ==
                            b.id,  # b is the HDA used for the job
                            c.dataset_id == b.dataset_id,
                            c.id == v,  # c is the requested job input HDA
                            # We need to make sure that the job we are looking for has been run with identical inputs.
                            # Here we deal with 3 requirements:
                            #  - the job's input dataset (=b) version is 0, meaning the job's input dataset is not yet ready
                            #  - b's update_time is older than the job create time, meaning no changes occurred
                            #  - the job has a dataset_version recorded, and that version's metadata matches c's metadata.
                            or_(
                                and_(
                                    or_(a.dataset_version.in_([0, b.version]),
                                        b.update_time < model.Job.create_time),
                                    b.extension == c.extension,
                                    b.metadata == c.metadata,
                                    *name_condition,
                                ), b.id.in_(stmt)),
                            or_(b.deleted == false(), c.deleted == false())))

                    used_ids.append(a.dataset_id)
                elif t == 'ldda':
                    a = aliased(model.JobToInputLibraryDatasetAssociation)
                    data_conditions.append(
                        and_(model.Job.id == a.job_id, a.name.in_(k),
                             a.ldda_id == v))
                    used_ids.append(a.ldda_id)
                elif t == 'hdca':
                    a = aliased(model.JobToInputDatasetCollectionAssociation)
                    b = aliased(model.HistoryDatasetCollectionAssociation)
                    c = aliased(model.HistoryDatasetCollectionAssociation)
                    data_conditions.append(
                        and_(
                            model.Job.id == a.job_id, a.name.in_(k),
                            b.id == a.dataset_collection_id, c.id == v,
                            b.name == c.name,
                            or_(
                                and_(b.deleted == false(), b.id == v),
                                and_(
                                    or_(
                                        c.copied_from_history_dataset_collection_association_id == b.id,
                                        b.copied_from_history_dataset_collection_association_id == c.id),
                                    c.deleted == false(),
                                ))))
                    used_ids.append(a.dataset_collection_id)
                elif t == 'dce':
                    a = aliased(
                        model.JobToInputDatasetCollectionElementAssociation)
                    b = aliased(model.DatasetCollectionElement)
                    c = aliased(model.DatasetCollectionElement)
                    data_conditions.append(
                        and_(
                            model.Job.id == a.job_id,
                            a.name.in_(k),
                            a.dataset_collection_element_id == b.id,
                            b.element_identifier == c.element_identifier,
                            c.child_collection_id == b.child_collection_id,
                            c.id == v,
                        ))
                    used_ids.append(a.dataset_collection_element_id)
                else:
                    return []

        query = self.sa_session.query(model.Job.id, *used_ids).join(
            subq, model.Job.id == subq.c.id).filter(*data_conditions).group_by(
                model.Job.id, *used_ids).order_by(model.Job.id.desc())
        for job in query:
            # We found a job that is equal in terms of tool_id, user, state and input datasets,
            # but to be able to verify that the parameters match we need to modify all instances of
            # dataset_ids (HDA, LDDA, HDCA) in the incoming param_dump to point to those used by the
            # possibly equivalent job, which may have been run on copies of the original input data.
            job_input_ids = {}
            if len(job) > 1:
                # We do have datasets to check
                job_id, current_jobs_data_ids = job[0], job[1:]
                job_parameter_conditions = [model.Job.id == job_id]
                for src, requested_id, used_id in zip(data_types,
                                                      requested_ids,
                                                      current_jobs_data_ids):
                    if src not in job_input_ids:
                        job_input_ids[src] = {requested_id: used_id}
                    else:
                        job_input_ids[src][requested_id] = used_id
                new_param_dump = remap(param_dump, visit=replace_dataset_ids)
                # new_param_dump has its dataset ids remapped to those used by the job.
                # We now ask if the remapped job parameters match the current job.
                for k, v in new_param_dump.items():
                    if v == {'__class__': 'RuntimeValue'}:
                        # TODO: verify this is always None, e.g. run with a runtime input.
                        v = None
                    elif k.endswith('|__identifier__'):
                        # We've taken care of this while constructing the conditions based on ``input_data`` above
                        continue
                    elif k == 'chromInfo' and '?.len' in v:
                        # chromInfo with a '?.len' placeholder is skipped entirely.
                        continue
                    a = aliased(model.JobParameter)
                    job_parameter_conditions.append(
                        and_(model.Job.id == a.job_id, a.name == k,
                             a.value == json.dumps(v, sort_keys=True)))
            else:
                job_parameter_conditions = [model.Job.id == job]
            query = self.sa_session.query(
                model.Job).filter(*job_parameter_conditions)
            job = query.first()
            if job is None:
                continue
            n_parameters = 0
            # Verify that equivalent jobs had the same number of job parameters
            # We skip chromInfo, dbkey, __workflow_invocation_uuid__ and identifier
            # parameter as these are not passed along when expanding tool parameters
            # and they can differ without affecting the resulting dataset.
            for parameter in job.parameters:
                if parameter.name.startswith("__"):
                    continue
                if parameter.name in {
                        'chromInfo', 'dbkey'
                } or parameter.name.endswith('|__identifier__'):
                    continue
                n_parameters += 1
            if not n_parameters == sum(1 for k in param_dump
                                       if not k.startswith('__')
                                       and not k.endswith('|__identifier__')
                                       and k not in {'chromInfo', 'dbkey'}):
                continue
            log.info("Found equivalent job %s", search_timer)
            return job
        log.info("No equivalent jobs found %s", search_timer)
        return None
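
# Supplementary sketch: the id rewriting above relies on remap() walking the
# nested param_dump and passing every (path, key, value) to the visit callback.
# Assuming remap is boltons.iterutils.remap (the signature used above), the
# mechanism in isolation looks like this; the dicts below are hypothetical.
from boltons.iterutils import remap

param_dump = {"input1": {"src": "hda", "id": 7}, "threshold": 0.5}
job_input_ids = {"hda": {7: 42}}  # requested id -> id used by the candidate job


def replace_dataset_ids(path, key, value):
    """Swap the 'id' of any {'src': ..., 'id': ...} leaf for the job's own id."""
    if key == "id":
        current = param_dump
        for p in path:
            current = current[p]
        return key, job_input_ids[current["src"]][value]
    return key, value


print(remap(param_dump, visit=replace_dataset_ids))
# -> {'input1': {'src': 'hda', 'id': 42}, 'threshold': 0.5}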
Example #12
    def apply_view(self, base_tool_panel: ToolPanelElements,
                   toolbox_registry: ToolBoxRegistry) -> ToolPanelElements:
        execution_timer = ExecutionTimer()

        # Find the children of the top level topics
        operations_list = [ROOT_OPERATION] + list(
            self._edam_children_of(ROOT_OPERATION))
        topics_list = [ROOT_TOPIC] + list(self._edam_children_of(ROOT_TOPIC))

        # Sort them (by English label)
        # operations = sorted(operations, key=lambda x: self.edam[x]['label'])
        # topics = sorted(topics, key=lambda x: self.edam[x]['label'])

        # Convert these to list of dicts, wherein we'll add our tools/etc.
        operations: Dict[str, Dict] = {x: {} for x in operations_list}
        topics: Dict[str, Dict] = {x: {} for x in topics_list}
        uncategorized: List[Tuple] = []

        for tool_id, key, val, val_name in walk_loaded_tools(
                base_tool_panel, toolbox_registry):
            for term in self._get_edam_sec(val):
                if term == 'uncategorized' or term not in self.edam:
                    uncategorized.append((tool_id, key, val, val_name))
                else:
                    for path in self.edam[term]['path']:
                        if len(path) == 1:
                            t = term
                        elif len(path) == 0:
                            continue
                        else:
                            t = path[0]

                        if path[0].startswith('operation_'):
                            operations[t][tool_id] = (term, tool_id, key, val,
                                                      val_name)
                        elif path[0].startswith('topic_'):
                            topics[t][tool_id] = (term, tool_id, key, val,
                                                  val_name)

        new_panel = ToolPanelElements()

        def populate_section(term, tool_id, key, val, val_name):
            edam_def = self.edam[term]
            description = edam_def['definition']
            label = edam_def['label']
            links = {
                'edam_browser':
                f"https://edamontology.github.io/edam-browser/#{term}",
            }
            section = new_panel.get_or_create_section(term,
                                                      label,
                                                      description=description,
                                                      links=links)
            toolbox_registry.add_tool_to_tool_panel_view(val, section)
            new_panel.record_section_for_tool_id(tool_id, key, val_name)

        for term in sorted(operations.keys(),
                           key=lambda x: self._sort_edam_key(x)):
            if len(operations[term].keys()) == 0:
                continue

            label_dict = {
                'type': 'label',
                'text': self.edam[term]['label'],
                'description': self.edam[term]['definition'],
                'links': {
                    'edam_browser':
                    f"https://edamontology.github.io/edam-browser/#{term}",
                },
                'id': term,
            }
            new_panel[f"label_{term}"] = ToolSectionLabel(label_dict)

            for (term, tool_id, key, val,
                 val_name) in operations[term].values():
                populate_section(term, tool_id, key, val, val_name)

        for term in sorted(topics.keys(),
                           key=lambda x: self._sort_edam_key(x)):
            if len(topics[term].keys()) == 0:
                continue

            label_dict = {
                'type': 'label',
                'text': self.edam[term]['label'],
                'description': self.edam[term]['definition'],
                'links': {
                    'edam_browser':
                    f"https://edamontology.github.io/edam-browser/#{term}",
                },
                'id': term,
            }
            new_panel[f"label_{term}"] = ToolSectionLabel(label_dict)

            for (term, tool_id, key, val, val_name) in topics[term].values():
                populate_section(term, tool_id, key, val, val_name)

        section = new_panel.get_or_create_section('uncategorized',
                                                  'Uncategorized')
        for (tool_id, key, val, val_name) in uncategorized:
            toolbox_registry.add_tool_to_tool_panel_view(val, section)
            new_panel.record_section_for_tool_id(tool_id, key, val_name)
        log.debug("Loading EDAM tool panel finished %s", execution_timer)
        return new_panel
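
# Supplementary sketch: stripped of the panel plumbing, apply_view() buckets
# each tool under the top-level term of every EDAM path attached to it, with an
# 'uncategorized' fallback. The edam mapping below is a hypothetical, minimal
# stand-in for the real ontology data.
edam = {
    "operation_0004": {"path": [["operation_0004"]]},
    "operation_3182": {"path": [["operation_0004", "operation_3182"]]},
}


def bucket_tools(tool_terms):
    """Group {tool_id: [terms]} under each top-level operation term."""
    operations, uncategorized = {}, []
    for tool_id, terms in tool_terms.items():
        for term in terms:
            if term not in edam:
                uncategorized.append(tool_id)
                continue
            for path in edam[term]["path"]:
                top = term if len(path) == 1 else path[0]
                operations.setdefault(top, {})[tool_id] = term
    return operations, uncategorized


print(bucket_tools({"bwa": ["operation_3182"], "misc": ["unknown_term"]}))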
Example #13
def execute(trans,
            tool,
            param_combinations,
            history,
            rerun_remap_job_id=None,
            collection_info=None,
            workflow_invocation_uuid=None):
    """
    Execute a tool and return object containing summary (output data, number of
    failures, etc...).
    """
    all_jobs_timer = ExecutionTimer()
    execution_tracker = ToolExecutionTracker(tool, param_combinations,
                                             collection_info)
    app = trans.app
    execution_cache = ToolExecutionCache(trans)

    def execute_single_job(params):
        job_timer = ExecutionTimer()
        if workflow_invocation_uuid:
            params['__workflow_invocation_uuid__'] = workflow_invocation_uuid
        elif '__workflow_invocation_uuid__' in params:
            # Only workflow invocation code gets to set this, ignore user supplied
            # values or rerun parameters.
            del params['__workflow_invocation_uuid__']

        # If this is a workflow, everything has now been connected so we should validate
        # the state we're about to execute one last time. Consider whether tool executions
        # should run this as well.
        if workflow_invocation_uuid:
            messages = tool.check_and_update_param_values(
                params,
                trans,
                update_values=False,
                allow_workflow_parameters=False)
            if messages:
                execution_tracker.record_error(messages)
                return

        job, result = tool.handle_single_execution(trans, rerun_remap_job_id,
                                                   params, history,
                                                   collection_info,
                                                   execution_cache)
        if job:
            message = EXECUTION_SUCCESS_MESSAGE % (tool.id, job.id, job_timer)
            log.debug(message)
            execution_tracker.record_success(job, result)
        else:
            execution_tracker.record_error(result)

    config = app.config
    burst_at = getattr(config, 'tool_submission_burst_at', 10)
    burst_threads = getattr(config, 'tool_submission_burst_threads', 1)

    if len(execution_tracker.param_combinations
           ) < burst_at or burst_threads < 2:
        for params in execution_tracker.param_combinations:
            execute_single_job(params)
    else:
        q = Queue()

        def worker():
            while True:
                params = q.get()
                execute_single_job(params)
                q.task_done()

        for i in range(burst_threads):
            t = Thread(target=worker)
            t.daemon = True
            t.start()

        for params in execution_tracker.param_combinations:
            q.put(params)

        q.join()

    log.debug("Executed all jobs for tool request: %s" % all_jobs_timer)
    if collection_info:
        history = history or tool.get_default_history_by_trans(trans)
        execution_tracker.create_output_collections(trans, history, params)

    return execution_tracker
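
# Supplementary sketch: the burst branch above is a plain producer/consumer
# pattern: daemon worker threads pull parameter sets off a queue and execute
# them. The same pattern in isolation, with a stand-in job function (the real
# one needs a Galaxy trans object):
from queue import Queue
from threading import Thread

BURST_THREADS = 4


def execute_single_job(params):
    print(f"executing {params}")  # stand-in for the real per-job execution


def run_burst(param_combinations):
    q = Queue()

    def worker():
        while True:
            params = q.get()
            try:
                execute_single_job(params)
            finally:
                q.task_done()

    for _ in range(BURST_THREADS):
        Thread(target=worker, daemon=True).start()

    for params in param_combinations:
        q.put(params)
    q.join()  # block until every queued parameter set has been processed


run_burst([{"n": i} for i in range(10)])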
Example #14
    def invoke(self):
        workflow_invocation = self.workflow_invocation
        config = self.trans.app.config
        maximum_duration = getattr(config,
                                   "maximum_workflow_invocation_duration", -1)
        if maximum_duration > 0 and workflow_invocation.seconds_since_created > maximum_duration:
            log.debug(
                "Workflow invocation [%s] exceeded maximum number of seconds allowed for scheduling [%s], failing."
                % (workflow_invocation.id, maximum_duration))
            workflow_invocation.state = model.WorkflowInvocation.states.FAILED
            # Persist the failed invocation state now.
            self.trans.sa_session.add(workflow_invocation)

            # Not flushing in here, because web controller may create multiple
            # invocations.
            return self.progress.outputs

        if workflow_invocation.history.deleted:
            log.info("Cancelled workflow evaluation due to deleted history")
            raise modules.CancelWorkflowEvaluation()

        remaining_steps = self.progress.remaining_steps()
        delayed_steps = False
        for (step, workflow_invocation_step) in remaining_steps:
            step_delayed = False
            step_timer = ExecutionTimer()
            try:
                self.__check_implicitly_dependent_steps(step)

                if not workflow_invocation_step:
                    workflow_invocation_step = model.WorkflowInvocationStep()
                    workflow_invocation_step.workflow_invocation = workflow_invocation
                    workflow_invocation_step.workflow_step = step
                    workflow_invocation_step.state = 'new'

                    workflow_invocation.steps.append(workflow_invocation_step)

                incomplete_or_none = self._invoke_step(
                    workflow_invocation_step)
                if incomplete_or_none is False:
                    step_delayed = delayed_steps = True
                    workflow_invocation_step.state = 'ready'
                    self.progress.mark_step_outputs_delayed(
                        step, why="Not all jobs scheduled for state.")
                else:
                    workflow_invocation_step.state = 'scheduled'
            except modules.DelayedWorkflowEvaluation as de:
                step_delayed = delayed_steps = True
                self.progress.mark_step_outputs_delayed(step, why=de.why)
            except Exception:
                log.exception(
                    "Failed to schedule %s, problem occurred on %s.",
                    self.workflow_invocation.workflow.log_str(),
                    step.log_str(),
                )
                raise

            if not step_delayed:
                log.debug("Workflow step %s of invocation %s invoked %s" %
                          (step.id, workflow_invocation.id, step_timer))

        if delayed_steps:
            state = model.WorkflowInvocation.states.READY
        else:
            state = model.WorkflowInvocation.states.SCHEDULED
        workflow_invocation.state = state

        # All jobs ran successfully, so we can save now
        self.trans.sa_session.add(workflow_invocation)

        # Not flushing in here, because web controller may create multiple
        # invocations.
        return self.progress.outputs
Example #15
    def __search(self, tool_id, tool_version, user, input_data, job_state=None, param_dump=None, wildcard_param_dump=None):
        search_timer = ExecutionTimer()

        def replace_dataset_ids(path, key, value):
            """Exchanges dataset_ids (HDA, LDA, HDCA, not Dataset) in param_dump with dataset ids used in job."""
            if key == 'id':
                current_case = param_dump
                for p in path:
                    current_case = current_case[p]
                src = current_case['src']
                value = job_input_ids[src][value]
                return key, value
            return key, value

        conditions = [and_(model.Job.tool_id == tool_id,
                           model.Job.user == user)]

        if tool_version:
            conditions.append(model.Job.tool_version == str(tool_version))

        if job_state is None:
            conditions.append(
                model.Job.state.in_([model.Job.states.NEW,
                                     model.Job.states.QUEUED,
                                     model.Job.states.WAITING,
                                     model.Job.states.RUNNING,
                                     model.Job.states.OK])
            )
        else:
            if isinstance(job_state, string_types):
                conditions.append(model.Job.state == job_state)
            elif isinstance(job_state, list):
                o = []
                for s in job_state:
                    o.append(model.Job.state == s)
                conditions.append(
                    or_(*o)
                )

        # We now build the query filters that relate to the input datasets
        # that this job uses. We keep track of the requested dataset id in `requested_ids`,
        # the type (hda, hdca or lda) in `data_types`
        # and the ids that have been used in the job that has already been run in `used_ids`.
        requested_ids = []
        data_types = []
        used_ids = []
        for k, input_list in input_data.items():
            for type_values in input_list:
                t = type_values['src']
                v = type_values['id']
                requested_ids.append(v)
                data_types.append(t)
                identifier = type_values['identifier']
                if t == 'hda':
                    a = aliased(model.JobToInputDatasetAssociation)
                    b = aliased(model.HistoryDatasetAssociation)
                    c = aliased(model.HistoryDatasetAssociation)
                    d = aliased(model.JobParameter)
                    e = aliased(model.HistoryDatasetAssociationHistory)
                    conditions.append(and_(
                        model.Job.id == a.job_id,
                        a.name == k,
                        a.dataset_id == b.id,  # b is the HDA used for the job
                        c.dataset_id == b.dataset_id,
                        c.id == v,  # c is the requested job input HDA
                        # We need to make sure that the job we are looking for has been run with identical inputs.
                        # Here we deal with 3 requirements:
                        #  - the job's input dataset (=b) version is 0, meaning the job's input dataset is not yet ready
                        #  - b's update_time is older than the job create time, meaning no changes occurred
                        #  - the job has a dataset_version recorded, and that version's metadata matches c's metadata.
                        or_(
                            and_(or_(a.dataset_version.in_([0, b.version]),
                                     b.update_time < model.Job.create_time),
                                 b.name == c.name,
                                 b.extension == c.extension,
                                 b.metadata == c.metadata,
                                 ),
                            and_(b.id == e.history_dataset_association_id,
                                 a.dataset_version == e.version,
                                 e.name == c.name,
                                 e.extension == c.extension,
                                 e._metadata == c._metadata,
                                 ),
                        ),
                        or_(b.deleted == false(), c.deleted == false())
                    ))
                    if identifier:
                        conditions.append(and_(model.Job.id == d.job_id,
                                             d.name == "%s|__identifier__" % k,
                                             d.value == json.dumps(identifier)))
                    used_ids.append(a.dataset_id)
                elif t == 'ldda':
                    a = aliased(model.JobToInputLibraryDatasetAssociation)
                    conditions.append(and_(
                        model.Job.id == a.job_id,
                        a.name == k,
                        a.ldda_id == v
                    ))
                    used_ids.append(a.ldda_id)
                elif t == 'hdca':
                    a = aliased(model.JobToInputDatasetCollectionAssociation)
                    b = aliased(model.HistoryDatasetCollectionAssociation)
                    c = aliased(model.HistoryDatasetCollectionAssociation)
                    conditions.append(and_(
                        model.Job.id == a.job_id,
                        a.name == k,
                        b.id == a.dataset_collection_id,
                        c.id == v,
                        b.name == c.name,
                        or_(and_(b.deleted == false(), b.id == v),
                            and_(or_(c.copied_from_history_dataset_collection_association_id == b.id,
                                     b.copied_from_history_dataset_collection_association_id == c.id),
                                 c.deleted == false(),
                                 )
                            )
                    ))
                    used_ids.append(a.dataset_collection_id)
                else:
                    return []

        for k, v in wildcard_param_dump.items():
            wildcard_value = json.dumps(v, sort_keys=True).replace('"id": "__id_wildcard__"', '"id": %')
            a = aliased(model.JobParameter)
            conditions.append(and_(
                model.Job.id == a.job_id,
                a.name == k,
                a.value.like(wildcard_value)
            ))

        conditions.append(and_(
            model.Job.any_output_dataset_collection_instances_deleted == false(),
            model.Job.any_output_dataset_deleted == false()
        ))

        query = self.sa_session.query(model.Job.id, *used_ids).filter(and_(*conditions))
        for job in query.all():
            # We found a job that is equal in terms of tool_id, user, state and input datasets,
            # but to be able to verify that the parameters match we need to modify all instances of
            # dataset_ids (HDA, LDDA, HDCA) in the incoming param_dump to point to those used by the
            # possibly equivalent job, which may have been run on copies of the original input data.
            job_input_ids = {}
            if len(job) > 1:
                # We do have datasets to check
                job_id, current_jobs_data_ids = job[0], job[1:]
                job_parameter_conditions = [model.Job.id == job_id]
                for src, requested_id, used_id in zip(data_types, requested_ids, current_jobs_data_ids):
                    if src not in job_input_ids:
                        job_input_ids[src] = {requested_id: used_id}
                    else:
                        job_input_ids[src][requested_id] = used_id
                new_param_dump = remap(param_dump, visit=replace_dataset_ids)
                # new_param_dump has its dataset ids remapped to those used by the job.
                # We now ask if the remapped job parameters match the current job.
                for k, v in new_param_dump.items():
                    a = aliased(model.JobParameter)
                    job_parameter_conditions.append(and_(
                        a.name == k,
                        a.value == json.dumps(v, sort_keys=True)
                    ))
            else:
                job_parameter_conditions = [model.Job.id == job]
            query = self.sa_session.query(model.Job).filter(*job_parameter_conditions)
            job = query.first()
            if job is None:
                continue
            n_parameters = 0
            # Verify that equivalent jobs had the same number of job parameters
            # We skip chromInfo, dbkey, __workflow_invocation_uuid__ and identifier
            # parameter as these are not passed along when expanding tool parameters
            # and they can differ without affecting the resulting dataset.
            for parameter in job.parameters:
                if parameter.name in {'__workflow_invocation_uuid__', 'chromInfo', 'dbkey'} or parameter.name.endswith('|__identifier__'):
                    continue
                n_parameters += 1
            if not n_parameters == len(param_dump):
                continue
            log.info("Found equivalent job %s", search_timer)
            return job
        log.info("No equivalent jobs found %s", search_timer)
        return None
Example #16
def prune_history_audit_table(sa_session: scoped_session):
    """Prune ever growing history_audit table."""
    timer = ExecutionTimer()
    model.HistoryAudit.prune(sa_session)
    log.debug(f"Successfully pruned history_audit table {timer}")
Example #17
    def _exec(self):
        if self.time_execution:
            timer = ExecutionTimer()
        self.func()
        if self.time_execution:
            log.debug(f"Executed periodic task {self.name} {timer}")
Example #18
    def _push_to_irods(self, rel_path, source_file=None, from_string=None):
        """
        Push the file pointed to by ``rel_path`` to the iRODS. Extract folder name
        from rel_path as iRODS collection name, and extract file name from rel_path
        as iRODS data object name.
        If ``source_file`` is provided, push that file instead while
        still using ``rel_path`` for collection and object store names.
        If ``from_string`` is provided, set contents of the file to the value of the string.
        """
        ipt_timer = ExecutionTimer()
        p = Path(rel_path)
        data_object_name = p.stem + p.suffix
        subcollection_name = p.parent

        source_file = source_file if source_file else self._get_cache_path(
            rel_path)
        options = {kw.FORCE_FLAG_KW: '', kw.DEST_RESC_NAME_KW: self.resource}

        if not os.path.exists(source_file):
            log.error(
                "Tried updating key '%s' from source file '%s', but source file does not exist.",
                rel_path, source_file)
            return False

        # Check if the data object exists in iRODS
        collection_path = f"{self.home}/{str(subcollection_name)}"
        data_object_path = f"{collection_path}/{str(data_object_name)}"
        exists = False

        try:
            exists = self.session.data_objects.exists(data_object_path)

            if os.path.getsize(source_file) == 0 and exists:
                log.debug(
                    "Wanted to push file '%s' to iRODS collection '%s' but its size is 0; skipping.",
                    source_file, rel_path)
                return True

            # Create sub-collection first
            self.session.collections.create(collection_path,
                                            recurse=True,
                                            **options)

            if from_string:
                # Create data object
                data_obj = self.session.data_objects.create(
                    data_object_path, self.resource, **options)

                # Save 'from_string' as a file
                with data_obj.open('w') as data_obj_fp:
                    data_obj_fp.write(from_string)

                # Add file containing 'from_string' to the irods collection, since
                # put() expects a file as input. Get file name from data object's 'desc' field
                self.session.data_objects.put(data_obj.desc,
                                              f"{collection_path}/", **options)

                log.debug("Pushed data from string '%s' to collection '%s'",
                          from_string, data_object_path)
            else:
                start_time = datetime.now()
                log.debug(
                    "Pushing cache file '%s' of size %s bytes to collection '%s'",
                    source_file, os.path.getsize(source_file), rel_path)

                # Add the source file to the irods collection
                self.session.data_objects.put(source_file, data_object_path,
                                              **options)

                end_time = datetime.now()
                log.debug(
                    "Pushed cache file '%s' to collection '%s' (%s bytes transfered in %s sec)",
                    source_file, rel_path, os.path.getsize(source_file),
                    (end_time - start_time).total_seconds())
            return True
        finally:
            log.debug("irods_pt _push_to_irods: %s", ipt_timer)
Example #19
    def __init__(self, **kwargs) -> None:
        startup_timer = ExecutionTimer()
        super().__init__(fsmon=True, **kwargs)
        self.haltables = [
            ("queue worker", self._shutdown_queue_worker),
            ("file watcher", self._shutdown_watcher),
            ("database heartbeat", self._shutdown_database_heartbeat),
            ("workflow scheduler", self._shutdown_scheduling_manager),
            ("object store", self._shutdown_object_store),
            ("job manager", self._shutdown_job_manager),
            ("application heartbeat", self._shutdown_heartbeat),
            ("repository manager", self._shutdown_repo_manager),
            ("database connection", self._shutdown_model),
            ("application stack", self._shutdown_application_stack),
        ]
        self._register_singleton(StructuredApp, self)
        # A lot of postfork initialization depends on the server name, ensure it is set immediately after forking before other postfork functions
        self.application_stack.register_postfork_function(
            self.application_stack.set_postfork_server_name, self)
        self.config.reload_sanitize_allowlist(
            explicit='sanitize_allowlist_file' in kwargs)
        self.amqp_internal_connection_obj = galaxy.queues.connection_from_config(
            self.config)
        # queue_worker *can* be initialized with a queue, but here we don't
        # want to and we'll allow postfork to bind and start it.
        self.queue_worker = self._register_singleton(GalaxyQueueWorker,
                                                     GalaxyQueueWorker(self))

        self._configure_tool_shed_registry()

        self.dependency_resolvers_view = self._register_singleton(
            DependencyResolversView, DependencyResolversView(self))
        self.test_data_resolver = self._register_singleton(
            TestDataResolver,
            TestDataResolver(file_dirs=self.config.tool_test_data_directories))
        self.dynamic_tool_manager = self._register_singleton(
            DynamicToolManager)
        self.api_keys_manager = self._register_singleton(ApiKeyManager)

        # Tool Data Tables
        self._configure_tool_data_tables(from_shed_config=False)
        # Load dbkey / genome build manager
        self._configure_genome_builds(data_table_name="__dbkeys__",
                                      load_old_style=True)

        # Genomes
        self.genomes = self._register_singleton(Genomes)
        # Data providers registry.
        self.data_provider_registry = self._register_singleton(
            DataProviderRegistry)

        # Initialize error report plugins.
        self.error_reports = self._register_singleton(
            ErrorReports, ErrorReports(self.config.error_report_file,
                                       app=self))

        # Setup a Tool Cache
        self.tool_cache = self._register_singleton(ToolCache)
        self.tool_shed_repository_cache = self._register_singleton(
            ToolShedRepositoryCache)
        # Watch various config files for immediate reload
        self.watchers = self._register_singleton(ConfigWatchers)
        self._configure_toolbox()
        # Load Data Manager
        self.data_managers = self._register_singleton(DataManagers)
        # Load the update repository manager.
        self.update_repository_manager = self._register_singleton(
            UpdateRepositoryManager, UpdateRepositoryManager(self))
        # Load proprietary datatype converters and display applications.
        self.installed_repository_manager.load_proprietary_converters_and_display_applications(
        )
        # Load datatype display applications defined in local datatypes_conf.xml
        self.datatypes_registry.load_display_applications(self)
        # Load datatype converters defined in local datatypes_conf.xml
        self.datatypes_registry.load_datatype_converters(self.toolbox)
        # Load external metadata tool
        self.datatypes_registry.load_external_metadata_tool(self.toolbox)
        # Load history import/export tools.
        load_lib_tools(self.toolbox)
        self.toolbox.persist_cache(register_postfork=True)
        # visualizations registry: associates resources with visualizations, controls how to render
        self.visualizations_registry = self._register_singleton(
            VisualizationsRegistry,
            VisualizationsRegistry(
                self,
                directories_setting=self.config.
                visualization_plugins_directory,
                template_cache_dir=self.config.template_cache_path))
        # Tours registry
        tour_registry = build_tours_registry(self.config.tour_config_dir)
        self.tour_registry = tour_registry
        self[ToursRegistry] = tour_registry  # type: ignore[misc]
        # Webhooks registry
        self.webhooks_registry = self._register_singleton(
            WebhooksRegistry, WebhooksRegistry(self.config.webhooks_dir))
        # Load security policy.
        self.security_agent = self.model.security_agent
        self.host_security_agent = galaxy.model.security.HostAgent(
            model=self.security_agent.model,
            permitted_actions=self.security_agent.permitted_actions)
        # Load quota management.
        self.quota_agent = self._register_singleton(
            QuotaAgent, get_quota_agent(self.config, self.model))
        # Heartbeat for thread profiling
        self.heartbeat = None
        self.auth_manager = self._register_singleton(
            auth.AuthManager, auth.AuthManager(self.config))
        # Start the heartbeat process if configured and available (wait until
        # postfork if using uWSGI)
        if self.config.use_heartbeat:
            if heartbeat.Heartbeat:
                self.heartbeat = heartbeat.Heartbeat(
                    self.config,
                    period=self.config.heartbeat_interval,
                    fname=self.config.heartbeat_log)
                self.heartbeat.daemon = True
                self.application_stack.register_postfork_function(
                    self.heartbeat.start)

        self.authnz_manager = None
        if self.config.enable_oidc:
            from galaxy.authnz import managers
            self.authnz_manager = managers.AuthnzManager(
                self, self.config.oidc_config_file,
                self.config.oidc_backends_config_file)

        self.containers = {}
        if self.config.enable_beta_containers_interface:
            self.containers = build_container_interfaces(
                self.config.containers_config_file,
                containers_conf=self.config.containers_conf)

        if not self.config.enable_celery_tasks and self.config.history_audit_table_prune_interval > 0:
            self.prune_history_audit_task = IntervalTask(
                func=lambda: galaxy.model.HistoryAudit.prune(self.model.session
                                                             ),
                name="HistoryAuditTablePruneTask",
                interval=self.config.history_audit_table_prune_interval,
                immediate_start=False,
                time_execution=True)
            self.application_stack.register_postfork_function(
                self.prune_history_audit_task.start)
            self.haltables.append(("HistoryAuditTablePruneTask",
                                   self.prune_history_audit_task.shutdown))
        # Start the job manager
        self.application_stack.register_postfork_function(
            self.job_manager.start)
        # If app is not job handler but uses mule messaging.
        # Can be removed when removing mule support.
        self.job_manager._check_jobs_at_startup()
        self.proxy_manager = ProxyManager(self.config)

        # Must be initialized after job_config.
        self.workflow_scheduling_manager = scheduling_manager.WorkflowSchedulingManager(
            self)

        self.trs_proxy = self._register_singleton(TrsProxy,
                                                  TrsProxy(self.config))
        # Must be initialized after any component that might make use of stack messaging is configured. Alternatively if
        # it becomes more commonly needed we could create a prefork function registration method like we do with
        # postfork functions.
        self.application_stack.init_late_prefork()

        self.interactivetool_manager = InteractiveToolManager(self)

        # Configure handling of signals
        handlers = {}
        if self.heartbeat:
            handlers[signal.SIGUSR1] = self.heartbeat.dump_signal_handler
        self._configure_signal_handlers(handlers)

        self.database_heartbeat = DatabaseHeartbeat(
            application_stack=self.application_stack)
        self.database_heartbeat.add_change_callback(self.watchers.change_state)
        self.application_stack.register_postfork_function(
            self.database_heartbeat.start)

        # Start web stack message handling
        self.application_stack.register_postfork_function(
            self.application_stack.start)
        self.application_stack.register_postfork_function(
            self.queue_worker.bind_and_start)
        # Delay toolbox index until after startup
        self.application_stack.register_postfork_function(
            lambda: send_local_control_task(self,
                                            'rebuild_toolbox_search_index'))

        # Inject url_for for components to more easily optionally depend
        # on url_for.
        self.url_for = url_for
        self.legacy_url_for = legacy_url_for

        self.server_starttime = int(time.time())  # used for cachebusting
        # Limit lifetime of tool shed repository cache to app startup
        self.tool_shed_repository_cache = None
        self.legacy_mapper = None
        log.info(f"Galaxy app startup finished {startup_timer}")
Example #20
    def _populate_elements(self, chunk, name, root_collection_builder,
                           metadata_source_name, final_job_state):
        element_datasets = {
            'element_identifiers': [],
            'datasets': [],
            'tag_lists': [],
            'paths': [],
            'extra_files': []
        }
        for filename, discovered_file in chunk:
            create_dataset_timer = ExecutionTimer()
            fields_match = discovered_file.match
            if not fields_match:
                raise Exception("Problem parsing metadata fields for file %s" %
                                filename)
            element_identifiers = fields_match.element_identifiers
            designation = fields_match.designation
            visible = fields_match.visible
            ext = fields_match.ext
            dbkey = fields_match.dbkey
            extra_files = fields_match.extra_files
            # galaxy.tools.parser.output_collection_def.INPUT_DBKEY_TOKEN
            if dbkey == "__input__":
                dbkey = self.input_dbkey

            # Create new primary dataset
            dataset_name = fields_match.name or designation

            link_data = discovered_file.match.link_data

            sources = discovered_file.match.sources
            hashes = discovered_file.match.hashes
            created_from_basename = discovered_file.match.created_from_basename

            dataset = self.create_dataset(
                ext=ext,
                designation=designation,
                visible=visible,
                dbkey=dbkey,
                name=dataset_name,
                metadata_source_name=metadata_source_name,
                link_data=link_data,
                sources=sources,
                hashes=hashes,
                created_from_basename=created_from_basename,
                final_job_state=final_job_state,
            )
            log.debug(
                "(%s) Created dynamic collection dataset for path [%s] with element identifier [%s] for output [%s] %s",
                self.job_id(),
                filename,
                designation,
                name,
                create_dataset_timer,
            )
            element_datasets['element_identifiers'].append(element_identifiers)
            element_datasets['extra_files'].append(extra_files)
            element_datasets['datasets'].append(dataset)
            element_datasets['tag_lists'].append(
                discovered_file.match.tag_list)
            element_datasets['paths'].append(filename)

        self.add_tags_to_datasets(datasets=element_datasets['datasets'],
                                  tag_lists=element_datasets['tag_lists'])
        for (element_identifiers,
             dataset) in zip(element_datasets['element_identifiers'],
                             element_datasets['datasets']):
            current_builder = root_collection_builder
            for element_identifier in element_identifiers[:-1]:
                current_builder = current_builder.get_level(element_identifier)
            current_builder.add_dataset(element_identifiers[-1], dataset)

            # Associate new dataset with job
            element_identifier_str = ":".join(element_identifiers)
            association_name = f'__new_primary_file_{name}|{element_identifier_str}__'
            self.add_output_dataset_association(association_name, dataset)

        self.update_object_store_with_datasets(
            datasets=element_datasets['datasets'],
            paths=element_datasets['paths'],
            extra_files=element_datasets['extra_files'])
        add_datasets_timer = ExecutionTimer()
        self.add_datasets_to_history(element_datasets['datasets'])
        log.debug(
            "(%s) Add dynamic collection datasets to history for output [%s] %s",
            self.job_id(),
            name,
            add_datasets_timer,
        )
        self.set_datasets_metadata(datasets=element_datasets['datasets'])
Example #21
    def populate_collection_elements(self,
                                     collection,
                                     root_collection_builder,
                                     filenames,
                                     name=None,
                                     metadata_source_name=None):
        # TODO: allow configurable sorting.
        #    <sort by="lexical" /> <!-- default -->
        #    <sort by="reverse_lexical" />
        #    <sort regex="example.(\d+).fastq" by="1:numerical" />
        #    <sort regex="part_(\d+)_sample_([^_]+).fastq" by="2:lexical,1:numerical" />
        if name is None:
            name = "unnamed output"

        element_datasets = []
        for filename, discovered_file in filenames.items():
            create_dataset_timer = ExecutionTimer()
            fields_match = discovered_file.match
            if not fields_match:
                raise Exception("Problem parsing metadata fields for file %s" %
                                filename)
            element_identifiers = fields_match.element_identifiers
            designation = fields_match.designation
            visible = fields_match.visible
            ext = fields_match.ext
            dbkey = fields_match.dbkey
            if dbkey == INPUT_DBKEY_TOKEN:
                dbkey = self.input_dbkey

            # Create new primary dataset
            name = fields_match.name or designation

            link_data = discovered_file.match.link_data

            dataset = self.create_dataset(
                ext=ext,
                designation=designation,
                visible=visible,
                dbkey=dbkey,
                name=name,
                filename=filename,
                metadata_source_name=metadata_source_name,
                link_data=link_data,
            )
            log.debug(
                "(%s) Created dynamic collection dataset for path [%s] with element identifier [%s] for output [%s] %s",
                self.job.id,
                filename,
                designation,
                name,
                create_dataset_timer,
            )
            element_datasets.append((element_identifiers, dataset))

        app = self.app
        sa_session = self.sa_session
        job = self.job

        if job:
            add_datasets_timer = ExecutionTimer()
            job.history.add_datasets(sa_session,
                                     [d for (ei, d) in element_datasets])
            log.debug(
                "(%s) Add dynamic collection datasets to history for output [%s] %s",
                self.job.id,
                name,
                add_datasets_timer,
            )

        for (element_identifiers, dataset) in element_datasets:
            current_builder = root_collection_builder
            for element_identifier in element_identifiers[:-1]:
                current_builder = current_builder.get_level(element_identifier)
            current_builder.add_dataset(element_identifiers[-1], dataset)

            # Associate new dataset with job
            if job:
                element_identifier_str = ":".join(element_identifiers)
                # Below was changed from '__new_primary_file_%s|%s__' % (name, designation )
                assoc = app.model.JobToOutputDatasetAssociation(
                    '__new_primary_file_%s|%s__' %
                    (name, element_identifier_str), dataset)
                assoc.job = self.job
                sa_session.add(assoc)

            dataset.raw_set_dataset_state('ok')

        sa_session.flush()
Example #22
def _timed_flush_obj(obj):
    obj_flush_timer = ExecutionTimer()
    sa_session = object_session(obj)
    sa_session.flush()
    log.info("Flushed transaction for %s %s" % (obj.log_str(), obj_flush_timer))
Example #23
    def execute(self,
                tool,
                trans,
                incoming={},
                return_job=False,
                set_output_hid=True,
                history=None,
                job_params=None,
                rerun_remap_job_id=None,
                mapping_over_collection=False,
                execution_cache=None):
        """
        Executes a tool, creating job and tool outputs, associating them, and
        submitting the job to the job queue. If history is not specified, use
        trans.history as destination for tool's output datasets.
        """
        self._check_access(tool, trans)
        app = trans.app
        if execution_cache is None:
            execution_cache = ToolExecutionCache(trans)
        current_user_roles = execution_cache.current_user_roles
        history, inp_data, inp_dataset_collections = self._collect_inputs(
            tool, trans, incoming, history, current_user_roles)

        # Build name for output datasets based on tool name and input names
        on_text = self._get_on_text(inp_data)

        # format='input' previously would give you a random extension from
        # the input extensions, now it should just give "input" as the output
        # format.
        input_ext = 'data' if tool.profile < 16.04 else "input"
        input_dbkey = incoming.get("dbkey", "?")
        for name, data in reversed(inp_data.items()):
            if not data:
                data = NoneDataset(datatypes_registry=app.datatypes_registry)
                continue

            # Convert LDDA to an HDA.
            if isinstance(data, LibraryDatasetDatasetAssociation):
                data = data.to_history_dataset_association(None)
                inp_data[name] = data

            if tool.profile < 16.04:
                input_ext = data.ext

            if data.dbkey not in [None, '?']:
                input_dbkey = data.dbkey

            identifier = getattr(data, "element_identifier", None)
            if identifier is not None:
                incoming["%s|__identifier__" % name] = identifier

        # Collect chromInfo dataset and add as parameters to incoming
        (chrom_info, db_dataset) = app.genome_builds.get_chrom_info(
            input_dbkey,
            trans=trans,
            custom_build_hack_get_len_from_fasta_conversion=tool.id !=
            'CONVERTER_fasta_to_len')
        if db_dataset:
            inp_data.update({"chromInfo": db_dataset})
        incoming["chromInfo"] = chrom_info

        # Determine output dataset permission/roles list
        existing_datasets = [inp for inp in inp_data.values() if inp]
        if existing_datasets:
            output_permissions = app.security_agent.guess_derived_permissions_for_datasets(
                existing_datasets)
        else:
            # No valid inputs, we will use history defaults
            output_permissions = app.security_agent.history_get_default_permissions(
                history)

        # Add the dbkey to the incoming parameters
        incoming["dbkey"] = input_dbkey
        # wrapped params are used by change_format action and by output.label; only perform this wrapping once, as needed
        wrapped_params = self._wrapped_params(trans, tool, incoming)

        out_data = odict()
        input_collections = dict(
            (k, v[0][0]) for k, v in inp_dataset_collections.items())
        output_collections = OutputCollections(
            trans,
            history,
            tool=tool,
            tool_action=self,
            input_collections=input_collections,
            mapping_over_collection=mapping_over_collection,
            on_text=on_text,
            incoming=incoming,
            params=wrapped_params.params,
            job_params=job_params,
        )

        # Keep track of parent / child relationships, we'll create all the
        # datasets first, then create the associations
        parent_to_child_pairs = []
        child_dataset_names = set()
        object_store_populator = ObjectStorePopulator(app)

        def handle_output(name, output, hidden=None):
            if output.parent:
                parent_to_child_pairs.append((output.parent, name))
                child_dataset_names.add(name)
            # What is the following hack for? Need to document under what
            # conditions the following can occur. ([email protected])
            # HACK: the output data has already been created;
            # this happens, e.g., as a result of the async controller.
            if name in incoming:
                dataid = incoming[name]
                data = trans.sa_session.query(
                    app.model.HistoryDatasetAssociation).get(dataid)
                assert data is not None
                out_data[name] = data
            else:
                ext = determine_output_format(output, wrapped_params.params,
                                              inp_data,
                                              inp_dataset_collections,
                                              input_ext)
                data = app.model.HistoryDatasetAssociation(extension=ext,
                                                           create_dataset=True,
                                                           flush=False)
                if hidden is None:
                    hidden = output.hidden
                if hidden:
                    data.visible = False
                trans.sa_session.add(data)
                trans.app.security_agent.set_all_dataset_permissions(
                    data.dataset, output_permissions, new=True)

            # Must flush before setting object store id currently.
            # TODO: optimize this.
            trans.sa_session.flush()
            object_store_populator.set_object_store_id(data)

            # This may not be necessary with the new parent/child associations
            data.designation = name
            # Copy metadata from one of the inputs if requested.

            # metadata source can be either a string referencing an input
            # or an actual object to copy.
            metadata_source = output.metadata_source
            if metadata_source:
                if isinstance(metadata_source, string_types):
                    metadata_source = inp_data.get(metadata_source)

            if metadata_source is not None:
                data.init_meta(copy_from=metadata_source)
            else:
                data.init_meta()
            # Take dbkey from LAST input
            data.dbkey = str(input_dbkey)
            # Set state
            data.blurb = "queued"
            # Set output label
            data.name = self.get_output_name(output, data, tool, on_text,
                                             trans, incoming, history,
                                             wrapped_params.params, job_params)
            # Store output
            out_data[name] = data
            if output.actions:
                # Apply pre-job tool-output-dataset actions; e.g. setting metadata, changing format
                output_action_params = dict(out_data)
                output_action_params.update(incoming)
                output.actions.apply_action(data, output_action_params)
            # Also set the default values of actions of type metadata
            self.set_metadata_defaults(output, data, tool, on_text, trans,
                                       incoming, history,
                                       wrapped_params.params, job_params)
            # Flush all datasets at once.
            return data

        for name, output in tool.outputs.items():
            if not filter_output(output, incoming):
                if output.collection:
                    collections_manager = app.dataset_collections_service
                    element_identifiers = []
                    known_outputs = output.known_outputs(
                        input_collections, collections_manager.type_registry)
                    # Just to echo TODO elsewhere - this should be restructured to allow
                    # nested collections.
                    for output_part_def in known_outputs:
                        # Add elements to top-level collection, unless nested...
                        current_element_identifiers = element_identifiers
                        current_collection_type = output.structure.collection_type

                        for parent_id in (output_part_def.parent_ids or []):
                            # TODO: replace following line with formal abstractions for doing this.
                            current_collection_type = ":".join(
                                current_collection_type.split(":")[1:])
                            name_to_index = dict(
                                (value["name"], index)
                                for (index, value) in enumerate(
                                    current_element_identifiers))
                            if parent_id not in name_to_index:
                                if parent_id not in current_element_identifiers:
                                    index = len(current_element_identifiers)
                                    current_element_identifiers.append(
                                        dict(
                                            name=parent_id,
                                            collection_type=current_collection_type,
                                            src="new_collection",
                                            element_identifiers=[],
                                        ))
                                else:
                                    index = name_to_index[parent_id]
                            current_element_identifiers = current_element_identifiers[
                                index]["element_identifiers"]

                        effective_output_name = output_part_def.effective_output_name
                        element = handle_output(effective_output_name,
                                                output_part_def.output_def,
                                                hidden=True)
                        # TODO: this shouldn't exist in the top-level of the history at all
                        # but for now we are still working around that by hiding the contents
                        # there.
                        # The following hack causes the dataset to not be added to the history...
                        child_dataset_names.add(effective_output_name)

                        history.add_dataset(element,
                                            set_hid=set_output_hid,
                                            quota=False)
                        trans.sa_session.add(element)
                        trans.sa_session.flush()

                        current_element_identifiers.append({
                            "__object__": element,
                            "name": output_part_def.element_identifier,
                        })
                        log.info(element_identifiers)

                    if output.dynamic_structure:
                        assert not element_identifiers  # known_outputs must have been empty
                        element_kwds = dict(
                            elements=collections_manager.ELEMENTS_UNINITIALIZED)
                    else:
                        element_kwds = dict(
                            element_identifiers=element_identifiers)

                    output_collections.create_collection(output=output,
                                                         name=name,
                                                         **element_kwds)
                else:
                    handle_output_timer = ExecutionTimer()
                    handle_output(name, output)
                    log.info("Handled output named %s for tool %s %s" %
                             (name, tool.id, handle_output_timer))

        add_datasets_timer = ExecutionTimer()
        # Add all the top-level (non-child) datasets to the history unless otherwise specified
        datasets_to_persist = []
        for name in out_data.keys():
            if name not in child_dataset_names and name not in incoming:  # don't add children; or already existing datasets, i.e. async created
                data = out_data[name]
                datasets_to_persist.append(data)
        # Set HID and add to history.
        # This is brand new and certainly empty so don't worry about quota.
        # TOOL OPTIMIZATION NOTE - from the loop above to the job creation below,
        # 99%+ of execution time happens within history.add_datasets.
        history.add_datasets(trans.sa_session,
                             datasets_to_persist,
                             set_hid=set_output_hid,
                             quota=False,
                             flush=False)

        # Add all the children to their parents
        for parent_name, child_name in parent_to_child_pairs:
            parent_dataset = out_data[parent_name]
            child_dataset = out_data[child_name]
            parent_dataset.children.append(child_dataset)

        log.info("Added output datasets to history %s" % add_datasets_timer)
        job_setup_timer = ExecutionTimer()
        # Create the job object
        job, galaxy_session = self._new_job_for_session(trans, tool, history)
        self._record_inputs(trans, tool, job, incoming, inp_data,
                            inp_dataset_collections, current_user_roles)
        self._record_outputs(job, out_data, output_collections)
        job.object_store_id = object_store_populator.object_store_id
        if job_params:
            job.params = dumps(job_params)
        job.set_handler(tool.get_job_handler(job_params))
        trans.sa_session.add(job)
        # Now that we have a job id, we can remap any outputs if this is a rerun and the user chose to continue dependent jobs
        # This functionality requires tracking jobs in the database.
        if app.config.track_jobs_in_database and rerun_remap_job_id is not None:
            try:
                old_job = trans.sa_session.query(
                    app.model.Job).get(rerun_remap_job_id)
                assert old_job is not None, '(%s/%s): Old job id is invalid' % (
                    rerun_remap_job_id, job.id)
                assert old_job.tool_id == job.tool_id, '(%s/%s): Old tool id (%s) does not match rerun tool id (%s)' % (
                    old_job.id, job.id, old_job.tool_id, job.tool_id)
                if trans.user is not None:
                    assert old_job.user_id == trans.user.id, '(%s/%s): Old user id (%s) does not match rerun user id (%s)' % (
                        old_job.id, job.id, old_job.user_id, trans.user.id)
                elif trans.user is None and type(
                        galaxy_session) == trans.model.GalaxySession:
                    assert old_job.session_id == galaxy_session.id, '(%s/%s): Old session id (%s) does not match rerun session id (%s)' % (
                        old_job.id, job.id, old_job.session_id,
                        galaxy_session.id)
                else:
                    raise Exception(
                        '(%s/%s): Remapping via the API is not (yet) supported'
                        % (old_job.id, job.id))
                # Duplicate PJAs before remap.
                for pjaa in old_job.post_job_actions:
                    job.add_post_job_action(pjaa.post_job_action)
                for jtod in old_job.output_datasets:
                    for (job_to_remap,
                         jtid) in [(jtid.job, jtid)
                                   for jtid in jtod.dataset.dependent_jobs]:
                        if (trans.user is not None and job_to_remap.user_id
                                == trans.user.id) or (trans.user is None and
                                                      job_to_remap.session_id
                                                      == galaxy_session.id):
                            if job_to_remap.state == job_to_remap.states.PAUSED:
                                job_to_remap.state = job_to_remap.states.NEW
                            for hda in [
                                    dep_jtod.dataset for dep_jtod in
                                    job_to_remap.output_datasets
                            ]:
                                if hda.state == hda.states.PAUSED:
                                    hda.state = hda.states.NEW
                                    hda.info = None
                            input_values = dict([
                                (p.name, json.loads(p.value))
                                for p in job_to_remap.parameters
                            ])
                            update_param(jtid.name, input_values,
                                         str(out_data[jtod.name].id))
                            for p in job_to_remap.parameters:
                                p.value = json.dumps(input_values[p.name])
                            jtid.dataset = out_data[jtod.name]
                            jtid.dataset.hid = jtod.dataset.hid
                            log.info(
                                'Job %s input HDA %s remapped to new HDA %s' %
                                (job_to_remap.id, jtod.dataset.id,
                                 jtid.dataset.id))
                            trans.sa_session.add(job_to_remap)
                            trans.sa_session.add(jtid)
                    jtod.dataset.visible = False
                    trans.sa_session.add(jtod)
            except Exception:
                log.exception('Cannot remap rerun dependencies.')

        log.info("Setup for job %s complete, ready to flush %s" %
                 (job.log_str(), job_setup_timer))

        job_flush_timer = ExecutionTimer()
        trans.sa_session.flush()
        log.info("Flushed transaction for job %s %s" %
                 (job.log_str(), job_flush_timer))
        # Some tools are not really executable, but jobs are still created for them ( for record keeping ).
        # Examples include tools that redirect to other applications ( epigraph ).  These special tools must
        # include something that can be retrieved from the params ( e.g., REDIRECT_URL ) to keep the job
        # from being queued.
        if 'REDIRECT_URL' in incoming:
            # Get the dataset - there should only be 1
            for name in inp_data.keys():
                dataset = inp_data[name]
            redirect_url = tool.parse_redirect_url(dataset, incoming)
            # GALAXY_URL should be included in the tool params so the external application
            # can send results back to the current Galaxy instance.
            GALAXY_URL = incoming.get('GALAXY_URL', None)
            assert GALAXY_URL is not None, "GALAXY_URL parameter missing in tool config."
            redirect_url += "&GALAXY_URL=%s" % GALAXY_URL
            # Job should not be queued, so set state to ok
            job.set_state(app.model.Job.states.OK)
            job.info = "Redirected to: %s" % redirect_url
            trans.sa_session.add(job)
            trans.sa_session.flush()
            trans.response.send_redirect(
                url_for(controller='tool_runner',
                        action='redirect',
                        redirect_url=redirect_url))
        else:
            # Put the job in the queue if tracking in memory
            app.job_queue.put(job.id, job.tool_id)
            trans.log_event("Added job to the job queue, id: %s" % str(job.id),
                            tool_id=job.tool_id)
            return job, out_data
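
The rerun-remap block above rewrites a dependent job's stored parameters by decoding each value from JSON, updating the reference so it points at the new output dataset, and encoding the result again. The sketch below illustrates only that decode/update/encode round trip; replace_param is a hypothetical stand-in for update_param, and the parameter rows are plain dicts rather than ORM objects.

import json


def replace_param(name, values, new_value):
    # Hypothetical stand-in for update_param: walk the decoded parameter
    # structure and replace every value stored under the given name.
    if isinstance(values, dict):
        for key in values:
            if key == name:
                values[key] = new_value
            else:
                replace_param(name, values[key], new_value)
    elif isinstance(values, list):
        for item in values:
            replace_param(name, item, new_value)


# Parameter rows as a dependent job might store them: name -> JSON text.
rows = {"input1": json.dumps({"values": [{"src": "hda", "id": 7}]}),
        "threshold": json.dumps(5)}

# Decode, remap the dataset reference, re-encode - same round trip as above.
input_values = {name: json.loads(value) for name, value in rows.items()}
replace_param("input1", input_values, "42")
rows = {name: json.dumps(value) for name, value in input_values.items()}
print(rows["input1"])  # '"42"'
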
Example #24
    def __init__(self, **kwargs):
        if not log.handlers:
            # Paste didn't handle it, so we need a temporary basic log
            # configured.  The handler added here gets dumped and replaced with
            # an appropriately configured logger in configure_logging below.
            logging.basicConfig(level=logging.DEBUG)
        log.debug("python path is: %s", ", ".join(sys.path))
        self.name = 'galaxy'
        # is_webapp will be set to true when building WSGI app
        self.is_webapp = False
        self.startup_timer = ExecutionTimer()
        self.new_installation = False
        # Read config file and check for errors
        self.config = config.Configuration(**kwargs)
        self.config.check()
        config.configure_logging(self.config)
        self.execution_timer_factory = ExecutionTimerFactory(self.config)
        self.configure_fluent_log()
        # A lot of postfork initialization depends on the server name, ensure it is set immediately after forking before other postfork functions
        self.application_stack = application_stack_instance(app=self)
        self.application_stack.register_postfork_function(
            self.application_stack.set_postfork_server_name, self)
        self.config.reload_sanitize_allowlist(
            explicit='sanitize_allowlist_file' in kwargs)
        self.amqp_internal_connection_obj = galaxy.queues.connection_from_config(
            self.config)
        # queue_worker *can* be initialized with a queue, but here we don't
        # want to and we'll allow postfork to bind and start it.
        self.queue_worker = GalaxyQueueWorker(self)

        self._configure_tool_shed_registry()
        self._configure_object_store(fsmon=True)
        # Setup the database engine and ORM
        config_file = kwargs.get('global_conf', {}).get('__file__', None)
        if config_file:
            log.debug('Using "galaxy.ini" config file: %s', config_file)
        check_migrate_tools = self.config.check_migrate_tools
        self._configure_models(
            check_migrate_databases=self.config.check_migrate_databases,
            check_migrate_tools=check_migrate_tools,
            config_file=config_file)

        # Security helper
        self._configure_security()
        # Tag handler
        self.tag_handler = GalaxyTagHandler(self.model.context)
        self.dataset_collections_service = DatasetCollectionManager(self)
        self.history_manager = HistoryManager(self)
        self.hda_manager = HDAManager(self)
        self.workflow_manager = WorkflowsManager(self)
        self.dependency_resolvers_view = DependencyResolversView(self)
        self.test_data_resolver = test_data.TestDataResolver(
            file_dirs=self.config.tool_test_data_directories)
        self.library_folder_manager = FolderManager()
        self.library_manager = LibraryManager()
        self.dynamic_tool_manager = DynamicToolManager(self)

        # ConfiguredFileSources
        self.file_sources = ConfiguredFileSources.from_app_config(self.config)

        # Tool Data Tables
        self._configure_tool_data_tables(from_shed_config=False)
        # Load dbkey / genome build manager
        self._configure_genome_builds(data_table_name="__dbkeys__",
                                      load_old_style=True)

        # Genomes
        self.genomes = Genomes(self)
        # Data providers registry.
        self.data_provider_registry = DataProviderRegistry()

        # Initialize job metrics manager, needs to be in place before
        # config so per-destination modifications can be made.
        self.job_metrics = job_metrics.JobMetrics(
            self.config.job_metrics_config_file, app=self)

        # Initialize error report plugins.
        self.error_reports = ErrorReports(self.config.error_report_file,
                                          app=self)

        # Initialize the job management configuration
        self.job_config = jobs.JobConfiguration(self)

        # Setup a Tool Cache
        self.tool_cache = ToolCache()
        self.tool_shed_repository_cache = ToolShedRepositoryCache(self)
        # Watch various config files for immediate reload
        self.watchers = ConfigWatchers(self)
        self._configure_tool_config_files()
        self.installed_repository_manager = InstalledRepositoryManager(self)
        self._configure_datatypes_registry(self.installed_repository_manager)
        galaxy.model.set_datatypes_registry(self.datatypes_registry)

        self._configure_toolbox()

        # Load Data Manager
        self.data_managers = DataManagers(self)
        # Load the update repository manager.
        self.update_repository_manager = UpdateRepositoryManager(self)
        # Load proprietary datatype converters and display applications.
        self.installed_repository_manager.load_proprietary_converters_and_display_applications(
        )
        # Load datatype display applications defined in local datatypes_conf.xml
        self.datatypes_registry.load_display_applications(self)
        # Load datatype converters defined in local datatypes_conf.xml
        self.datatypes_registry.load_datatype_converters(self.toolbox)
        # Load external metadata tool
        self.datatypes_registry.load_external_metadata_tool(self.toolbox)
        # Load history import/export tools.
        load_lib_tools(self.toolbox)
        self.toolbox.persist_cache(register_postfork=True)
        # visualizations registry: associates resources with visualizations, controls how to render
        self.visualizations_registry = VisualizationsRegistry(
            self,
            directories_setting=self.config.visualization_plugins_directory,
            template_cache_dir=self.config.template_cache_path)
        # Tours registry
        self.tour_registry = ToursRegistry(self.config.tour_config_dir)
        # Webhooks registry
        self.webhooks_registry = WebhooksRegistry(self.config.webhooks_dir)
        # Load security policy.
        self.security_agent = self.model.security_agent
        self.host_security_agent = galaxy.model.security.HostAgent(
            model=self.security_agent.model,
            permitted_actions=self.security_agent.permitted_actions)
        # Load quota management.
        if self.config.enable_quotas:
            self.quota_agent = galaxy.quota.QuotaAgent(self.model)
        else:
            self.quota_agent = galaxy.quota.NoQuotaAgent(self.model)
        # Heartbeat for thread profiling
        self.heartbeat = None
        from galaxy import auth
        self.auth_manager = auth.AuthManager(self)
        self.user_manager = UserManager(self)
        # Start the heartbeat process if configured and available (wait until
        # postfork if using uWSGI)
        if self.config.use_heartbeat:
            if heartbeat.Heartbeat:
                self.heartbeat = heartbeat.Heartbeat(
                    self.config,
                    period=self.config.heartbeat_interval,
                    fname=self.config.heartbeat_log)
                self.heartbeat.daemon = True
                self.application_stack.register_postfork_function(
                    self.heartbeat.start)

        self.authnz_manager = None
        if self.config.enable_oidc:
            from galaxy.authnz import managers
            self.authnz_manager = managers.AuthnzManager(
                self, self.config.oidc_config_file,
                self.config.oidc_backends_config_file)

        self.sentry_client = None
        if self.config.sentry_dsn:

            def postfork_sentry_client():
                import raven
                self.sentry_client = raven.Client(
                    self.config.sentry_dsn,
                    transport=raven.transport.HTTPTransport)

            self.application_stack.register_postfork_function(
                postfork_sentry_client)

        # Transfer manager client
        if self.config.get_bool('enable_beta_job_managers', False):
            from galaxy.jobs import transfer_manager
            self.transfer_manager = transfer_manager.TransferManager(self)
        # Start the job manager
        from galaxy.jobs import manager
        self.job_manager = manager.JobManager(self)
        self.application_stack.register_postfork_function(
            self.job_manager.start)
        self.proxy_manager = ProxyManager(self.config)

        from galaxy.workflow import scheduling_manager
        # Must be initialized after job_config.
        self.workflow_scheduling_manager = scheduling_manager.WorkflowSchedulingManager(
            self)

        self.trs_proxy = TrsProxy(self.config)
        # Must be initialized after any component that might make use of stack messaging is configured. Alternatively if
        # it becomes more commonly needed we could create a prefork function registration method like we do with
        # postfork functions.
        self.application_stack.init_late_prefork()

        self.containers = {}
        if self.config.enable_beta_containers_interface:
            self.containers = build_container_interfaces(
                self.config.containers_config_file,
                containers_conf=self.config.containers_conf)

        self.interactivetool_manager = InteractiveToolManager(self)

        # Configure handling of signals
        handlers = {}
        if self.heartbeat:
            handlers[signal.SIGUSR1] = self.heartbeat.dump_signal_handler
        self._configure_signal_handlers(handlers)

        self.database_heartbeat = DatabaseHeartbeat(
            application_stack=self.application_stack)
        self.database_heartbeat.add_change_callback(self.watchers.change_state)
        self.application_stack.register_postfork_function(
            self.database_heartbeat.start)

        # Start web stack message handling
        self.application_stack.register_postfork_function(
            self.application_stack.start)
        self.application_stack.register_postfork_function(
            self.queue_worker.bind_and_start)
        # Delay toolbox index until after startup
        self.application_stack.register_postfork_function(
            lambda: send_local_control_task(self,
                                            'rebuild_toolbox_search_index'))

        self.model.engine.dispose()

        # Inject url_for for components to more easily optionally depend
        # on url_for.
        self.url_for = url_for

        self.server_starttime = int(time.time())  # used for cachebusting
        log.info("Galaxy app startup finished %s" % self.startup_timer)
Example #25
    def _create_job(self, *args, **kwds):
        """Wrapper around upload_common.create_job with a timer."""
        create_job_timer = ExecutionTimer()
        rval = upload_common.create_job(*args, **kwds)
        log.debug("Created upload job %s" % create_job_timer)
        return rval
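
The wrapper above applies the same time-and-log pattern to a single call. Assuming nothing more than wall-clock timing is needed, it generalizes to a small decorator such as the illustrative sketch below (timed is not a Galaxy utility).

import functools
import logging
import time

log = logging.getLogger(__name__)


def timed(label):
    # Illustrative decorator: time a call and log it, generalizing the
    # wrap-one-function pattern shown above.
    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwds):
            begin = time.time()
            try:
                return func(*args, **kwds)
            finally:
                log.debug("%s (%0.3f ms)", label, (time.time() - begin) * 1000)
        return wrapper
    return decorator


@timed("Created upload job")
def create_job(*args, **kwds):
    ...  # stand-in for upload_common.create_job


create_job()  # logs "Created upload job (X.XXX ms)" at debug level
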