Example #1
def set_metadata():
    # locate galaxy_root for loading datatypes
    galaxy_root = os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir, os.pardir, os.pardir))
    galaxy.datatypes.metadata.MetadataTempFile.tmp_dir = tool_job_working_directory = os.path.abspath(os.getcwd())

    # Set up datatypes registry
    datatypes_config = sys.argv.pop( 1 )
    datatypes_registry = galaxy.datatypes.registry.Registry()
    datatypes_registry.load_datatypes( root_dir=galaxy_root, config=datatypes_config )
    galaxy.model.set_datatypes_registry( datatypes_registry )

    job_metadata = sys.argv.pop( 1 )
    existing_job_metadata_dict = {}
    new_job_metadata_dict = {}
    if job_metadata != "None" and os.path.exists( job_metadata ):
        for line in open( job_metadata, 'r' ):
            try:
                line = stringify_dictionary_keys( json.loads( line ) )
                if line['type'] == 'dataset':
                    existing_job_metadata_dict[ line['dataset_id'] ] = line
                elif line['type'] == 'new_primary_dataset':
                    new_job_metadata_dict[ line[ 'filename' ] ] = line
            except:
                continue

    for filenames in sys.argv[1:]:
        fields = filenames.split( ',' )
        filename_in = fields.pop( 0 )
        filename_kwds = fields.pop( 0 )
        filename_out = fields.pop( 0 )
        filename_results_code = fields.pop( 0 )
        dataset_filename_override = fields.pop( 0 )
        # Need to be careful with the way that these parameters are populated from the filename splitting,
        # because if a job is running when the server is updated, any existing external metadata command-lines
        # will not have info about the newly added override_metadata file
        if fields:
            override_metadata = fields.pop( 0 )
        else:
            override_metadata = None
        set_meta_kwds = stringify_dictionary_keys( json.load( open( filename_kwds ) ) )  # load kwds; need to ensure our keywords are not unicode
        try:
            dataset = cPickle.load( open( filename_in ) )  # load DatasetInstance
            dataset.dataset.external_filename = dataset_filename_override
            files_path = os.path.abspath(os.path.join( tool_job_working_directory, "dataset_%s_files" % (dataset.dataset.id) ))
            dataset.dataset.external_extra_files_path = files_path
            if dataset.dataset.id in existing_job_metadata_dict:
                dataset.extension = existing_job_metadata_dict[ dataset.dataset.id ].get( 'ext', dataset.extension )
            # Metadata FileParameter types may not be writable on a cluster node, and are therefore temporarily substituted with MetadataTempFiles
            if override_metadata:
                override_metadata = json.load( open( override_metadata ) )
                for metadata_name, metadata_file_override in override_metadata:
                    if galaxy.datatypes.metadata.MetadataTempFile.is_JSONified_value( metadata_file_override ):
                        metadata_file_override = galaxy.datatypes.metadata.MetadataTempFile.from_JSON( metadata_file_override )
                    setattr( dataset.metadata, metadata_name, metadata_file_override )
            file_dict = existing_job_metadata_dict.get( dataset.dataset.id, {} )
            set_meta_with_tool_provided( dataset, file_dict, set_meta_kwds, datatypes_registry )
            dataset.metadata.to_JSON_dict( filename_out )  # write out results of set_meta
            json.dump( ( True, 'Metadata has been set successfully' ), open( filename_results_code, 'wb+' ) )  # setting metadata has succeeded
        except Exception, e:
            json.dump( ( False, str( e ) ), open( filename_results_code, 'wb+' ) )  # setting metadata has failed somehow
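Every example on this page funnels JSON data through a stringify_dictionary_keys helper before using it as keyword arguments. A minimal sketch of what such a helper might look like, assuming its only job is to coerce (possibly unicode) keys to plain str so they survive a **kwargs call under Python 2; the real galaxy.util implementation may differ:

def stringify_dictionary_keys(in_dict):
    # json.loads() yields unicode keys under Python 2, which cannot be
    # passed as **kwargs, so coerce every key to a plain str.
    out_dict = {}
    for key, value in in_dict.items():
        out_dict[str(key)] = value
    return out_dict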
Example #2
def __main__():
    file_path = sys.argv.pop( 1 )
    tmp_dir = sys.argv.pop( 1 )
    galaxy.model.Dataset.file_path = file_path
    galaxy.datatypes.metadata.MetadataTempFile.tmp_dir = tmp_dir
    
    # Set up datatypes registry
    config_root = sys.argv.pop( 1 )
    datatypes_config = sys.argv.pop( 1 )
    galaxy.model.set_datatypes_registry( galaxy.datatypes.registry.Registry( config_root, datatypes_config ) )

    job_metadata = sys.argv.pop( 1 )
    ext_override = dict()
    if job_metadata != "None" and os.path.exists( job_metadata ):
        for line in open( job_metadata, 'r' ):
            try:
                line = stringify_dictionary_keys( from_json_string( line ) )
                assert line['type'] == 'dataset'
                ext_override[line['dataset_id']] = line['ext']
            except:
                continue
    for filenames in sys.argv[1:]:
        fields = filenames.split( ',' )
        filename_in = fields.pop( 0 )
        filename_kwds = fields.pop( 0 )
        filename_out = fields.pop( 0 )
        filename_results_code = fields.pop( 0 )
        dataset_filename_override = fields.pop( 0 )
        #Need to be careful with the way that these parameters are populated from the filename splitting, 
        #because if a job is running when the server is updated, any existing external metadata command-lines 
        #will not have info about the newly added override_metadata file
        if fields:
            override_metadata = fields.pop( 0 )
        else:
            override_metadata = None
        try:
            dataset = cPickle.load( open( filename_in ) ) #load DatasetInstance
            if dataset_filename_override:
                dataset.dataset.external_filename = dataset_filename_override
            if ext_override.get( dataset.dataset.id, None ):
                dataset.extension = ext_override[ dataset.dataset.id ]
            #Metadata FileParameter types may not be writable on a cluster node, and are therefore temporarily substituted with MetadataTempFiles
            if override_metadata:
                override_metadata = simplejson.load( open( override_metadata ) )
                for metadata_name, metadata_file_override in override_metadata:
                    if galaxy.datatypes.metadata.MetadataTempFile.is_JSONified_value( metadata_file_override ):
                        metadata_file_override = galaxy.datatypes.metadata.MetadataTempFile.from_JSON( metadata_file_override )
                    setattr( dataset.metadata, metadata_name, metadata_file_override )
            kwds = stringify_dictionary_keys( simplejson.load( open( filename_kwds ) ) )#load kwds; need to ensure our keywords are not unicode
            dataset.datatype.set_meta( dataset, **kwds )
            dataset.metadata.to_JSON_dict( filename_out ) # write out results of set_meta
            simplejson.dump( ( True, 'Metadata has been set successfully' ), open( filename_results_code, 'wb+' ) ) #setting metadata has succeeded
        except Exception, e:
            simplejson.dump( ( False, str( e ) ), open( filename_results_code, 'wb+' ) ) #setting metadata has failed somehow
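For reference, each remaining positional argument consumed by the loop above packs several paths into a single comma-separated string. A hypothetical command-line fragment (all filenames are illustrative, not taken from a real job):

import sys

# One per-output argument; the trailing override file is optional on older
# command lines, which is why the loops above sniff for a sixth field.
sys.argv.append(",".join([
    "metadata_in_HDA_1",          # pickled DatasetInstance
    "metadata_kwds_HDA_1",        # JSON dict of set_meta() keyword args
    "metadata_out_HDA_1",         # where set_meta() results get written
    "metadata_results_HDA_1",     # JSON (success, message) pair
    "/data/files/dataset_1.dat",  # external filename override
    "metadata_override_HDA_1",    # optional metadata override file
]))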
Example #3
def set_metadata_portable():
    import galaxy.model
    tool_job_working_directory = os.path.abspath(os.getcwd())
    metadata_tmp_files_dir = os.path.join(tool_job_working_directory, "metadata")
    galaxy.model.metadata.MetadataTempFile.tmp_dir = metadata_tmp_files_dir

    metadata_params_path = os.path.join("metadata", "params.json")
    try:
        with open(metadata_params_path, "r") as f:
            metadata_params = json.load(f)
    except IOError:
        raise Exception("Failed to find metadata/params.json from cwd [%s]" % tool_job_working_directory)
    datatypes_config = metadata_params["datatypes_config"]
    job_metadata = metadata_params["job_metadata"]
    max_metadata_value_size = metadata_params.get("max_metadata_value_size") or 0
    outputs = metadata_params["outputs"]

    datatypes_registry = validate_and_load_datatypes_config(datatypes_config)
    tool_provided_metadata = load_job_metadata(job_metadata)

    def set_meta(new_dataset_instance, file_dict):
        set_meta_with_tool_provided(new_dataset_instance, file_dict, set_meta_kwds, datatypes_registry, max_metadata_value_size)

    for output_name, output_dict in outputs.items():
        filename_in = os.path.join("metadata/metadata_in_%s" % output_name)
        filename_kwds = os.path.join("metadata/metadata_kwds_%s" % output_name)
        filename_out = os.path.join("metadata/metadata_out_%s" % output_name)
        filename_results_code = os.path.join("metadata/metadata_results_%s" % output_name)
        override_metadata = os.path.join("metadata/metadata_override_%s" % output_name)
        dataset_filename_override = output_dict["filename_override"]

        # Same block as below...
        set_meta_kwds = stringify_dictionary_keys(json.load(open(filename_kwds)))  # load kwds; need to ensure our keywords are not unicode
        try:
            dataset = cPickle.load(open(filename_in, 'rb'))  # load DatasetInstance
            dataset.dataset.external_filename = dataset_filename_override
            store_by = metadata_params.get("object_store_store_by", "id")
            extra_files_dir_name = "dataset_%s_files" % getattr(dataset.dataset, store_by)
            files_path = os.path.abspath(os.path.join(tool_job_working_directory, extra_files_dir_name))
            dataset.dataset.external_extra_files_path = files_path
            file_dict = tool_provided_metadata.get_dataset_meta(output_name, dataset.dataset.id)
            if 'ext' in file_dict:
                dataset.extension = file_dict['ext']
            # Metadata FileParameter types may not be writable on a cluster node, and are therefore temporarily substituted with MetadataTempFiles
            override_metadata = json.load(open(override_metadata))
            for metadata_name, metadata_file_override in override_metadata:
                if galaxy.datatypes.metadata.MetadataTempFile.is_JSONified_value(metadata_file_override):
                    metadata_file_override = galaxy.datatypes.metadata.MetadataTempFile.from_JSON(metadata_file_override)
                setattr(dataset.metadata, metadata_name, metadata_file_override)
            if output_dict.get("validate", False):
                set_validated_state(dataset)
            set_meta(dataset, file_dict)
            dataset.metadata.to_JSON_dict(filename_out)  # write out results of set_meta
            json.dump((True, 'Metadata has been set successfully'), open(filename_results_code, 'wt+'))  # setting metadata has succeeded
        except Exception:
            json.dump((False, traceback.format_exc()), open(filename_results_code, 'wt+'))  # setting metadata has failed somehow

    write_job_metadata(tool_job_working_directory, job_metadata, set_meta, tool_provided_metadata)
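The portable variant reads its configuration from metadata/params.json rather than the command line. A sketch of writing such a file, using only keys the function above actually reads; all values are invented for illustration:

import json
import os

params = {
    "datatypes_config": "registry.xml",     # hypothetical path
    "job_metadata": "outputs/galaxy.json",  # hypothetical path
    "max_metadata_value_size": 5242880,
    "object_store_store_by": "uuid",
    "outputs": {
        "out_file1": {
            "filename_override": "/data/files/dataset_1.dat",
            "validate": False,
        },
    },
}
if not os.path.isdir("metadata"):
    os.makedirs("metadata")
with open(os.path.join("metadata", "params.json"), "w") as f:
    json.dump(params, f)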
Example #4
def set_metadata_legacy():
    import galaxy.model
    galaxy.model.metadata.MetadataTempFile.tmp_dir = tool_job_working_directory = os.path.abspath(os.getcwd())

    # This is ugly, but to transition smoothly from existing jobs without this
    # parameter to ones with it, it has to be the last optional parameter and
    # we have to sniff it.
    try:
        max_metadata_value_size = int(sys.argv[-1])
        sys.argv = sys.argv[:-1]
    except ValueError:
        max_metadata_value_size = 0
        # max_metadata_value_size is unspecified and should be 0

    # Set up datatypes registry
    datatypes_config = sys.argv.pop(1)
    datatypes_registry = validate_and_load_datatypes_config(datatypes_config)

    job_metadata = sys.argv.pop(1)
    tool_provided_metadata = load_job_metadata(job_metadata, None)

    def set_meta(new_dataset_instance, file_dict):
        set_meta_with_tool_provided(new_dataset_instance, file_dict, set_meta_kwds, datatypes_registry, max_metadata_value_size)

    for filenames in sys.argv[1:]:
        fields = filenames.split(',')
        filename_in = fields.pop(0)
        filename_kwds = fields.pop(0)
        filename_out = fields.pop(0)
        filename_results_code = fields.pop(0)
        dataset_filename_override = fields.pop(0)
        override_metadata = fields.pop(0)
        set_meta_kwds = stringify_dictionary_keys(json.load(open(filename_kwds)))  # load kwds; need to ensure our keywords are not unicode
        try:
            dataset = cPickle.load(open(filename_in, 'rb'))  # load DatasetInstance
            dataset.dataset.external_filename = dataset_filename_override
            store_by = "id"
            extra_files_dir_name = "dataset_%s_files" % getattr(dataset.dataset, store_by)
            files_path = os.path.abspath(os.path.join(tool_job_working_directory, "working", extra_files_dir_name))
            dataset.dataset.external_extra_files_path = files_path
            file_dict = tool_provided_metadata.get_dataset_meta(None, dataset.dataset.id, dataset.dataset.uuid)
            if 'ext' in file_dict:
                dataset.extension = file_dict['ext']
            # Metadata FileParameter types may not be writable on a cluster node, and are therefore temporarily substituted with MetadataTempFiles
            override_metadata = json.load(open(override_metadata))
            for metadata_name, metadata_file_override in override_metadata:
                if galaxy.datatypes.metadata.MetadataTempFile.is_JSONified_value(metadata_file_override):
                    metadata_file_override = galaxy.datatypes.metadata.MetadataTempFile.from_JSON(metadata_file_override)
                setattr(dataset.metadata, metadata_name, metadata_file_override)
            set_meta(dataset, file_dict)
            dataset.metadata.to_JSON_dict(filename_out)  # write out results of set_meta
            json.dump((True, 'Metadata has been set successfully'), open(filename_results_code, 'wt+'))  # setting metadata has succeeded
        except Exception as e:
            json.dump((False, unicodify(e)), open(filename_results_code, 'wt+'))  # setting metadata has failed somehow

    write_job_metadata(tool_job_working_directory, job_metadata, set_meta, tool_provided_metadata)
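The trailing-argument sniff above deserves a quick demonstration: an integer final argument is consumed as max_metadata_value_size, while anything else leaves sys.argv untouched and the size cap at 0.

import sys

sys.argv = ["set_metadata.py", "registry.xml", "galaxy.json",
            "in,kwds,out,results,override.dat,meta", "5242880"]
try:
    max_metadata_value_size = int(sys.argv[-1])
    sys.argv = sys.argv[:-1]
except ValueError:
    max_metadata_value_size = 0  # old-style command line without the parameter
print(max_metadata_value_size)  # 5242880, and sys.argv no longer carries it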
Example #5
def main():
    parser = optparse.OptionParser()
    parser.add_option(
        "-b",
        "--buffer",
        dest="buffer",
        type="int",
        default=1000000,
        help="Number of lines to buffer at a time. Default: 1,000,000 lines. A buffer of 0 will attempt to use memory only.",
    )
    parser.add_option(
        "-d",
        "--index_depth",
        dest="index_depth",
        type="int",
        default=3,
        help="Depth to use on filebased offset indexing. Default: 3.",
    )
    parser.add_option(
        "-p",
        "--keep_partial",
        action="store_true",
        dest="keep_partial",
        default=False,
        help="Keep rows in first input which are missing identifiers.",
    )
    parser.add_option(
        "-u",
        "--keep_unmatched",
        action="store_true",
        dest="keep_unmatched",
        default=False,
        help="Keep rows in first input which are not joined with the second input.",
    )
    parser.add_option(
        "-f",
        "--fill_options_file",
        dest="fill_options_file",
        type="str",
        default=None,
        help="Fill empty columns with a values from a JSONified file.",
    )

    options, args = parser.parse_args()

    fill_options = None
    if options.fill_options_file is not None:
        try:
            fill_options = Bunch(
                **stringify_dictionary_keys(json.load(open(options.fill_options_file)))
            )  # json.load( open( options.fill_options_file ) )
        except Exception, e:
            print "Warning: Ignoring fill options due to json error (%s)." % e
Example #6
def main():
    parser = optparse.OptionParser()
    parser.add_option(
        '-b',
        '--buffer',
        dest='buffer',
        type='int',
        default=1000000,
        help=
        'Number of lines to buffer at a time. Default: 1,000,000 lines. A buffer of 0 will attempt to use memory only.'
    )
    parser.add_option(
        '-d',
        '--index_depth',
        dest='index_depth',
        type='int',
        default=3,
        help='Depth to use on filebased offset indexing. Default: 3.')
    parser.add_option(
        '-p',
        '--keep_partial',
        action='store_true',
        dest='keep_partial',
        default=False,
        help='Keep rows in first input which are missing identifiers.')
    parser.add_option(
        '-u',
        '--keep_unmatched',
        action='store_true',
        dest='keep_unmatched',
        default=False,
        help=
        'Keep rows in first input which are not joined with the second input.')
    parser.add_option(
        '-f',
        '--fill_options_file',
        dest='fill_options_file',
        type='str',
        default=None,
        help='Fill empty columns with values from a JSONified file.')

    options, args = parser.parse_args()

    fill_options = None
    if options.fill_options_file is not None:
        try:
            fill_options = Bunch(**stringify_dictionary_keys(
                json.load(open(options.fill_options_file))
            ))  # json.load( open( options.fill_options_file ) )
        except Exception, e:
            print "Warning: Ignoring fill options due to json error (%s)." % e
    def __init__(self, meta_file, job_wrapper=None):
        self.meta_file = meta_file
        self.tool_provided_job_metadata = []

        with open(meta_file, 'r') as f:
            for line in f:
                try:
                    line = stringify_dictionary_keys(json.loads(line))
                    assert 'type' in line
                except Exception:
                    log.exception(
                        '(%s) Got JSON data from tool, but data is improperly formatted or no "type" key in data'
                        % job_wrapper.job_id)
                    log.debug('Offending data was: %s' % line)
                    continue
                # Set the dataset id if it's a dataset entry and isn't set.
                # This isn't insecure.  We loop the job's output datasets in
                # the finish method, so if a tool writes out metadata for a
                # dataset id that it doesn't own, it'll just be ignored.
                dataset_id_not_specified = line['type'] == 'dataset' and 'dataset_id' not in line
                if dataset_id_not_specified:
                    dataset_basename = line['dataset']
                    if job_wrapper:
                        try:
                            line['dataset_id'] = job_wrapper.get_output_file_id(dataset_basename)
                        except KeyError:
                            log.warning(
                                '(%s) Tool provided job dataset-specific metadata without specifying a dataset'
                                % job_wrapper.job_id)
                            continue
                    else:
                        match = re.match(r'(galaxy_)?dataset_(.*)\.dat',
                                         dataset_basename)
                        if match is None:
                            raise Exception(
                                "processing tool_provided_metadata (e.g. galaxy.json) entry with invalid dataset name [%s]"
                                % dataset_basename)
                        dataset_id = match.group(2)
                        if dataset_id.isdigit():
                            line['dataset_id'] = dataset_id
                        else:
                            line['dataset_uuid'] = dataset_id

                self.tool_provided_job_metadata.append(line)
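A hypothetical galaxy.json line that this parser accepts: 'type' is mandatory, and when 'dataset_id' is missing it is recovered either from the job wrapper or from the dataset basename via the regex above.

import json

entry = {
    "type": "dataset",
    "dataset": "galaxy_dataset_42.dat",  # basename matched by the regex above
    "ext": "tabular",                    # illustrative extra metadata
}
print(json.dumps(entry))  # one line of the meta_file parsed in __init__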
Example #8
def __main__():
    file_path = sys.argv.pop( 1 )
    tmp_dir = sys.argv.pop( 1 )
    galaxy.model.Dataset.file_path = file_path
    galaxy.datatypes.metadata.MetadataTempFile.tmp_dir = tmp_dir
    for filenames in sys.argv[1:]:
        filename_in, filename_kwds, filename_out, filename_results_code, dataset_filename_override = filenames.split( ',' )
        try:
            dataset = cPickle.load( open( filename_in ) ) #load DatasetInstance
            if dataset_filename_override:
                dataset.dataset.external_filename = dataset_filename_override
            kwds = stringify_dictionary_keys( simplejson.load( open( filename_kwds ) ) )#load kwds; need to ensure our keywords are not unicode
            dataset.datatype.set_meta( dataset, **kwds )
            dataset.metadata.to_JSON_dict( filename_out ) # write out results of set_meta
        simplejson.dump( ( True, 'Metadata has been set successfully' ), open( filename_results_code, 'wb+' ) ) #setting metadata has succeeded
        except Exception, e:
            simplejson.dump( ( False, str( e ) ), open( filename_results_code, 'wb+' ) ) #setting metadata has failed somehow
Example #9
def main():
    parser = optparse.OptionParser()
    parser.add_option(
        '-b','--buffer',
        dest='buffer',
        type='int',default=1000000,
        help='Number of lines to buffer at a time. Default: 1,000,000 lines. A buffer of 0 will attempt to use memory only.'
    )
    parser.add_option(
        '-d','--index_depth',
        dest='index_depth',
        type='int',default=3,
        help='Depth to use on filebased offset indexing. Default: 3.'
    )
    parser.add_option(
        '-p','--keep_partial',
        action='store_true',
        dest='keep_partial',
        default=False,
        help='Keep rows in first input which are missing identifiers.')
    parser.add_option(
        '-u','--keep_unmatched',
        action='store_true',
        dest='keep_unmatched',
        default=False,
        help='Keep rows in first input which are not joined with the second input.')
    parser.add_option(
        '-f','--fill_options_file',
        dest='fill_options_file',
        type='str',default=None,
        help='Fill empty columns with values from a JSONified file.')
    
    
    options, args = parser.parse_args()
    
    fill_options = None
    if options.fill_options_file is not None:
        try:
            if simplejson is None:
                raise simplejson_exception
            fill_options = Bunch( **stringify_dictionary_keys( simplejson.load( open( options.fill_options_file ) ) ) ) #simplejson.load( open( options.fill_options_file ) )
        except Exception, e:
            print "Warning: Ignoring fill options due to simplejson error (%s)." % e
Example #10
def __main__():
    parser = optparse.OptionParser()
    parser.add_option( '-o', '--output', dest='output', help='The name of the output file' )
    parser.add_option( '-1', '--input1', dest='input1', help='The name of the first input file' )
    parser.add_option( '-2', '--input2', dest='input2', help='The name of the second input file' )
    parser.add_option( '-g', '--hinge', dest='hinge', help='The "hinge" to use (the value to compare)' )
    parser.add_option( '-c', '--columns', dest='columns', help='The columns to include in the output file' )
    parser.add_option( '-f', '--fill_options_file', dest='fill_options_file', default=None, help='The file specifying the fill value to use' )
    (options, args) = parser.parse_args()
    hinge = int( options.hinge )
    cols = [ int( c ) for c in str( options.columns ).split( ',' ) if int( c ) > hinge ]
    inputs = [ options.input1, options.input2 ]
    # Any extra positional arguments are additional inputs.
    inputs.extend( args )
    fill_options = None
    if options.fill_options_file != 'None' and options.fill_options_file is not None:
        try:
            if simplejson is None:
                raise simplejson_exception
            fill_options = Bunch( **stringify_dictionary_keys( simplejson.load( open( options.fill_options_file ) ) ) )
        except Exception, e:
            print 'Warning: Ignoring fill options due to simplejson error (%s).' % e
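The hinge logic above keeps only the column numbers strictly past the hinge; a quick demonstration of that filter:

hinge = 3
columns = "1,2,3,4,5"
cols = [int(c) for c in columns.split(",") if int(c) > hinge]
print(cols)  # [4, 5] -- columns at or below the hinge are dropped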
Example #11
def __main__():
    file_path = sys.argv.pop(1)
    tool_job_working_directory = tmp_dir = sys.argv.pop(1)  # this is also the job_working_directory now
    galaxy.model.Dataset.file_path = file_path
    galaxy.datatypes.metadata.MetadataTempFile.tmp_dir = tmp_dir

    config_root = sys.argv.pop(1)
    config_file_name = sys.argv.pop(1)
    if not os.path.isabs(config_file_name):
        config_file_name = os.path.join(config_root, config_file_name)

    # Set up reference to object store
    # First, read in the main config file for Galaxy; this is required because
    # the object store configuration is stored there
    conf_dict = load_app_properties(ini_file=config_file_name)
    # config object is required by ObjectStore class so create it now
    universe_config = config.Configuration(**conf_dict)
    universe_config.ensure_tempdir()
    object_store = build_object_store_from_config(universe_config)
    galaxy.model.Dataset.object_store = object_store

    # Set up datatypes registry
    datatypes_config = sys.argv.pop(1)
    datatypes_registry = galaxy.datatypes.registry.Registry()
    datatypes_registry.load_datatypes(root_dir=config_root,
                                      config=datatypes_config)
    galaxy.model.set_datatypes_registry(datatypes_registry)

    job_metadata = sys.argv.pop(1)
    existing_job_metadata_dict = {}
    new_job_metadata_dict = {}
    if job_metadata != "None" and os.path.exists(job_metadata):
        for line in open(job_metadata, 'r'):
            try:
                line = stringify_dictionary_keys(json.loads(line))
                if line['type'] == 'dataset':
                    existing_job_metadata_dict[line['dataset_id']] = line
                elif line['type'] == 'new_primary_dataset':
                    new_job_metadata_dict[line['filename']] = line
            except:
                continue

    for filenames in sys.argv[1:]:
        fields = filenames.split(',')
        filename_in = fields.pop(0)
        filename_kwds = fields.pop(0)
        filename_out = fields.pop(0)
        filename_results_code = fields.pop(0)
        dataset_filename_override = fields.pop(0)
        # Need to be careful with the way that these parameters are populated from the filename splitting,
        # because if a job is running when the server is updated, any existing external metadata command-lines
        # will not have info about the newly added override_metadata file
        if fields:
            override_metadata = fields.pop(0)
        else:
            override_metadata = None
        set_meta_kwds = stringify_dictionary_keys(
            json.load(open(filename_kwds))
        )  # load kwds; need to ensure our keywords are not unicode
        try:
            dataset = cPickle.load(open(filename_in))  # load DatasetInstance
            if dataset_filename_override:
                dataset.dataset.external_filename = dataset_filename_override
            files_path = os.path.abspath(
                os.path.join(tool_job_working_directory,
                             "dataset_%s_files" % (dataset.dataset.id)))
            dataset.dataset.external_extra_files_path = files_path
            if dataset.dataset.id in existing_job_metadata_dict:
                dataset.extension = existing_job_metadata_dict[dataset.dataset.id].get('ext', dataset.extension)
            # Metadata FileParameter types may not be writable on a cluster node, and are therefore temporarily substituted with MetadataTempFiles
            if override_metadata:
                override_metadata = json.load(open(override_metadata))
                for metadata_name, metadata_file_override in override_metadata:
                    if galaxy.datatypes.metadata.MetadataTempFile.is_JSONified_value(
                            metadata_file_override):
                        metadata_file_override = galaxy.datatypes.metadata.MetadataTempFile.from_JSON(
                            metadata_file_override)
                    setattr(dataset.metadata, metadata_name,
                            metadata_file_override)
            file_dict = existing_job_metadata_dict.get(dataset.dataset.id, {})
            set_meta_with_tool_provided(dataset, file_dict, set_meta_kwds)
            dataset.metadata.to_JSON_dict(
                filename_out)  # write out results of set_meta
            json.dump((True, 'Metadata has been set successfully'),
                      open(filename_results_code,
                           'wb+'))  # setting metadata has succeeded
        except Exception, e:
            json.dump((False, str(e)),
                      open(filename_results_code,
                           'wb+'))  # setting metadata has failed somehow
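Every variant of this script reports its outcome through the results-code file as a JSON (success, message) pair. A caller might consume it like this (the filename is illustrative):

import json

with open("metadata_results_HDA_1") as f:  # hypothetical results-code file
    success, message = json.load(f)
if not success:
    raise Exception("External metadata setting failed: %s" % message)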
Example #12
def main():
    parser = optparse.OptionParser()
    parser.add_option(
        '-b', '--buffer',
        dest='buffer',
        type='int', default=1000000,
        help='Number of lines to buffer at a time. Default: 1,000,000 lines. A buffer of 0 will attempt to use memory only.'
    )
    parser.add_option(
        '-d', '--index_depth',
        dest='index_depth',
        type='int', default=3,
        help='Depth to use on filebased offset indexing. Default: 3.'
    )
    parser.add_option(
        '-p', '--keep_partial',
        action='store_true',
        dest='keep_partial',
        default=False,
        help='Keep rows in first input which are missing identifiers.')
    parser.add_option(
        '-u', '--keep_unmatched',
        action='store_true',
        dest='keep_unmatched',
        default=False,
        help='Keep rows in first input which are not joined with the second input.')
    parser.add_option(
        '-f', '--fill_options_file',
        dest='fill_options_file',
        type='str', default=None,
        help='Fill empty columns with values from a JSONified file.')

    options, args = parser.parse_args()

    fill_options = None
    if options.fill_options_file is not None:
        try:
            fill_options = Bunch( **stringify_dictionary_keys( json.load( open( options.fill_options_file ) ) ) )  # json.load( open( options.fill_options_file ) )
        except Exception as e:
            print("Warning: Ignoring fill options due to json error (%s)." % e)
    if fill_options is None:
        fill_options = Bunch()
    if 'fill_unjoined_only' not in fill_options:
        fill_options.fill_unjoined_only = True
    if 'file1_columns' not in fill_options:
        fill_options.file1_columns = None
    if 'file2_columns' not in fill_options:
        fill_options.file2_columns = None

    try:
        filename1 = args[0]
        filename2 = args[1]
        column1 = int( args[2] ) - 1
        column2 = int( args[3] ) - 1
        out_filename = args[4]
    except:
        print("Error parsing command line.", file=sys.stderr)
        sys.exit()

    # Character for splitting fields and joining lines
    split = "\t"

    return join_files( filename1, column1, filename2, column2, out_filename, split, options.buffer, options.keep_unmatched, options.keep_partial, options.index_depth, fill_options=fill_options )
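The membership checks above ('fill_unjoined_only' not in fill_options, and so on) assume Bunch supports the in operator. A minimal sketch in the spirit of galaxy.util.bunch.Bunch; the real class may differ:

class Bunch(object):
    # Attribute-style access over keyword arguments, plus membership
    # tests so `'key' in bunch` works as the code above expects.
    def __init__(self, **kwargs):
        self.__dict__.update(kwargs)

    def __contains__(self, key):
        return key in self.__dict__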
Example #13
def set_metadata():
    # locate galaxy_root for loading datatypes
    galaxy_root = os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir, os.pardir, os.pardir))
    galaxy.datatypes.metadata.MetadataTempFile.tmp_dir = tool_job_working_directory = os.path.abspath(os.getcwd())

    # This is ugly, but to transition smoothly from existing jobs without this
    # parameter to ones with it, it has to be the last optional parameter and
    # we have to sniff it.
    try:
        max_metadata_value_size = int(sys.argv[-1])
        sys.argv = sys.argv[:-1]
    except ValueError:
        max_metadata_value_size = 0
        # max_metadata_value_size is unspecified and should be 0

    # Set up datatypes registry
    datatypes_config = sys.argv.pop(1)
    datatypes_registry = galaxy.datatypes.registry.Registry()
    datatypes_registry.load_datatypes(root_dir=galaxy_root, config=datatypes_config)
    galaxy.model.set_datatypes_registry(datatypes_registry)

    job_metadata = sys.argv.pop(1)
    existing_job_metadata_dict = {}
    new_job_metadata_dict = {}
    if job_metadata != "None" and os.path.exists(job_metadata):
        for line in open(job_metadata, "r"):
            try:
                line = stringify_dictionary_keys(json.loads(line))
                if line["type"] == "dataset":
                    existing_job_metadata_dict[line["dataset_id"]] = line
                elif line["type"] == "new_primary_dataset":
                    new_job_metadata_dict[line["filename"]] = line
            except:
                continue

    for filenames in sys.argv[1:]:
        fields = filenames.split(",")
        filename_in = fields.pop(0)
        filename_kwds = fields.pop(0)
        filename_out = fields.pop(0)
        filename_results_code = fields.pop(0)
        dataset_filename_override = fields.pop(0)
        # Need to be careful with the way that these parameters are populated from the filename splitting,
        # because if a job is running when the server is updated, any existing external metadata command-lines
        # will not have info about the newly added override_metadata file
        if fields:
            override_metadata = fields.pop(0)
        else:
            override_metadata = None
        set_meta_kwds = stringify_dictionary_keys(
            json.load(open(filename_kwds))
        )  # load kwds; need to ensure our keywords are not unicode
        try:
            dataset = cPickle.load(open(filename_in))  # load DatasetInstance
            dataset.dataset.external_filename = dataset_filename_override
            files_path = os.path.abspath(
                os.path.join(tool_job_working_directory, "dataset_%s_files" % (dataset.dataset.id))
            )
            dataset.dataset.external_extra_files_path = files_path
            if dataset.dataset.id in existing_job_metadata_dict:
                dataset.extension = existing_job_metadata_dict[dataset.dataset.id].get("ext", dataset.extension)
            # Metadata FileParameter types may not be writable on a cluster node, and are therefore temporarily substituted with MetadataTempFiles
            if override_metadata:
                override_metadata = json.load(open(override_metadata))
                for metadata_name, metadata_file_override in override_metadata:
                    if galaxy.datatypes.metadata.MetadataTempFile.is_JSONified_value(metadata_file_override):
                        metadata_file_override = galaxy.datatypes.metadata.MetadataTempFile.from_JSON(
                            metadata_file_override
                        )
                    setattr(dataset.metadata, metadata_name, metadata_file_override)
            file_dict = existing_job_metadata_dict.get(dataset.dataset.id, {})
            set_meta_with_tool_provided(dataset, file_dict, set_meta_kwds, datatypes_registry)
            if max_metadata_value_size:
                for k, v in dataset.metadata.items():
                    if total_size(v) > max_metadata_value_size:
                        log.info("Key %s too large for metadata, discarding" % k)
                        dataset.metadata.remove_key(k)
            dataset.metadata.to_JSON_dict(filename_out)  # write out results of set_meta
            json.dump(
                (True, "Metadata has been set successfully"), open(filename_results_code, "wb+")
            )  # setting metadata has succeeded
        except Exception, e:
            json.dump((False, str(e)), open(filename_results_code, "wb+"))  # setting metadata has failed somehow
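The size cap relies on a total_size helper to measure each metadata value. A rough sketch of a recursive sys.getsizeof, which is the usual shape of such a helper; the actual implementation may differ:

import sys

def total_size(obj, seen=None):
    # Recursively sum sys.getsizeof over containers, using the `seen` id
    # set to guard against cycles and double-counting shared objects.
    seen = seen if seen is not None else set()
    if id(obj) in seen:
        return 0
    seen.add(id(obj))
    size = sys.getsizeof(obj)
    if isinstance(obj, dict):
        size += sum(total_size(k, seen) + total_size(v, seen) for k, v in obj.items())
    elif isinstance(obj, (list, tuple, set, frozenset)):
        size += sum(total_size(item, seen) for item in obj)
    return size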
Example #14
def __main__():
    file_path = sys.argv.pop( 1 )
    tmp_dir = sys.argv.pop( 1 )
    galaxy.model.Dataset.file_path = file_path
    galaxy.datatypes.metadata.MetadataTempFile.tmp_dir = tmp_dir

    config_root = sys.argv.pop( 1 )
    config_file_name = sys.argv.pop( 1 )
    if not os.path.isabs( config_file_name ):
        config_file_name = os.path.join( config_root, config_file_name )
    
    # Set up reference to object store
    # First, read in the main config file for Galaxy; this is required because
    # the object store configuration is stored there
    conf = ConfigParser.ConfigParser()
    conf.read(config_file_name)
    conf_dict = {}
    for section in conf.sections():
        for option in conf.options(section):
            try:
                conf_dict[option] = conf.get(section, option)
            except ConfigParser.InterpolationMissingOptionError:
                # Because this is not called from Paste Script, %(here)s variable
                # is not initialized in the config file so skip those fields -
                # just need not to use any such fields for the object store conf...
                log.debug("Did not load option %s from %s" % (option, config_file_name))
    # config object is required by ObjectStore class so create it now
    universe_config = config.Configuration(**conf_dict)
    object_store = build_object_store_from_config(universe_config)
    galaxy.model.Dataset.object_store = object_store
    
    # Set up datatypes registry
    datatypes_config = sys.argv.pop( 1 )
    datatypes_registry = galaxy.datatypes.registry.Registry()
    datatypes_registry.load_datatypes( root_dir=config_root, config=datatypes_config )
    galaxy.model.set_datatypes_registry( datatypes_registry )

    job_metadata = sys.argv.pop( 1 )
    ext_override = dict()
    if job_metadata != "None" and os.path.exists( job_metadata ):
        for line in open( job_metadata, 'r' ):
            try:
                line = stringify_dictionary_keys( from_json_string( line ) )
                assert line['type'] == 'dataset'
                ext_override[line['dataset_id']] = line['ext']
            except:
                continue
    for filenames in sys.argv[1:]:
        fields = filenames.split( ',' )
        filename_in = fields.pop( 0 )
        filename_kwds = fields.pop( 0 )
        filename_out = fields.pop( 0 )
        filename_results_code = fields.pop( 0 )
        dataset_filename_override = fields.pop( 0 )
        #Need to be careful with the way that these parameters are populated from the filename splitting, 
        #because if a job is running when the server is updated, any existing external metadata command-lines 
        #will not have info about the newly added override_metadata file
        if fields:
            override_metadata = fields.pop( 0 )
        else:
            override_metadata = None
        try:
            dataset = cPickle.load( open( filename_in ) ) #load DatasetInstance
            if dataset_filename_override:
                dataset.dataset.external_filename = dataset_filename_override
            if ext_override.get( dataset.dataset.id, None ):
                dataset.extension = ext_override[ dataset.dataset.id ]
            #Metadata FileParameter types may not be writable on a cluster node, and are therefore temporarily substituted with MetadataTempFiles
            if override_metadata:
                override_metadata = simplejson.load( open( override_metadata ) )
                for metadata_name, metadata_file_override in override_metadata:
                    if galaxy.datatypes.metadata.MetadataTempFile.is_JSONified_value( metadata_file_override ):
                        metadata_file_override = galaxy.datatypes.metadata.MetadataTempFile.from_JSON( metadata_file_override )
                    setattr( dataset.metadata, metadata_name, metadata_file_override )
            kwds = stringify_dictionary_keys( simplejson.load( open( filename_kwds ) ) )#load kwds; need to ensure our keywords are not unicode
            dataset.datatype.set_meta( dataset, **kwds )
            dataset.metadata.to_JSON_dict( filename_out ) # write out results of set_meta
            simplejson.dump( ( True, 'Metadata has been set successfully' ), open( filename_results_code, 'wb+' ) ) #setting metadata has succeeded
        except Exception, e:
            simplejson.dump( ( False, str( e ) ), open( filename_results_code, 'wb+' ) ) #setting metadata has failed somehow
Example #15
def set_metadata():
    # locate galaxy_root for loading datatypes
    galaxy_root = os.path.abspath(
        os.path.join(os.path.dirname(__file__), os.pardir, os.pardir,
                     os.pardir))
    galaxy.datatypes.metadata.MetadataTempFile.tmp_dir = tool_job_working_directory = os.path.abspath(
        os.getcwd())

    # This is ugly, but to transition smoothly from existing jobs without this
    # parameter to ones with it, it has to be the last optional parameter and
    # we have to sniff it.
    try:
        max_metadata_value_size = int(sys.argv[-1])
        sys.argv = sys.argv[:-1]
    except ValueError:
        max_metadata_value_size = 0
        # max_metadata_value_size is unspecified and should be 0

    # Set up datatypes registry
    datatypes_config = sys.argv.pop(1)
    datatypes_registry = galaxy.datatypes.registry.Registry()
    datatypes_registry.load_datatypes(root_dir=galaxy_root,
                                      config=datatypes_config)
    galaxy.model.set_datatypes_registry(datatypes_registry)

    job_metadata = sys.argv.pop(1)
    existing_job_metadata_dict = {}
    new_job_metadata_dict = {}
    if job_metadata != "None" and os.path.exists(job_metadata):
        for line in open(job_metadata, 'r'):
            try:
                line = stringify_dictionary_keys(json.loads(line))
                if line['type'] == 'dataset':
                    existing_job_metadata_dict[line['dataset_id']] = line
                elif line['type'] == 'new_primary_dataset':
                    new_job_metadata_dict[line['filename']] = line
            except:
                continue

    for filenames in sys.argv[1:]:
        fields = filenames.split(',')
        filename_in = fields.pop(0)
        filename_kwds = fields.pop(0)
        filename_out = fields.pop(0)
        filename_results_code = fields.pop(0)
        dataset_filename_override = fields.pop(0)
        # Need to be careful with the way that these parameters are populated from the filename splitting,
        # because if a job is running when the server is updated, any existing external metadata command-lines
        # will not have info about the newly added override_metadata file
        if fields:
            override_metadata = fields.pop(0)
        else:
            override_metadata = None
        set_meta_kwds = stringify_dictionary_keys(
            json.load(open(filename_kwds))
        )  # load kwds; need to ensure our keywords are not unicode
        try:
            dataset = cPickle.load(open(filename_in))  # load DatasetInstance
            dataset.dataset.external_filename = dataset_filename_override
            files_path = os.path.abspath(
                os.path.join(tool_job_working_directory,
                             "dataset_%s_files" % (dataset.dataset.id)))
            dataset.dataset.external_extra_files_path = files_path
            if dataset.dataset.id in existing_job_metadata_dict:
                dataset.extension = existing_job_metadata_dict[dataset.dataset.id].get('ext', dataset.extension)
            # Metadata FileParameter types may not be writable on a cluster node, and are therefore temporarily substituted with MetadataTempFiles
            if override_metadata:
                override_metadata = json.load(open(override_metadata))
                for metadata_name, metadata_file_override in override_metadata:
                    if galaxy.datatypes.metadata.MetadataTempFile.is_JSONified_value(
                            metadata_file_override):
                        metadata_file_override = galaxy.datatypes.metadata.MetadataTempFile.from_JSON(
                            metadata_file_override)
                    setattr(dataset.metadata, metadata_name,
                            metadata_file_override)
            file_dict = existing_job_metadata_dict.get(dataset.dataset.id, {})
            set_meta_with_tool_provided(dataset, file_dict, set_meta_kwds,
                                        datatypes_registry)
            if max_metadata_value_size:
                for k, v in dataset.metadata.items():
                    if total_size(v) > max_metadata_value_size:
                        log.info("Key %s too large for metadata, discarding" %
                                 k)
                        dataset.metadata.remove_key(k)
            dataset.metadata.to_JSON_dict(
                filename_out)  # write out results of set_meta
            json.dump((True, 'Metadata has been set successfully'),
                      open(filename_results_code,
                           'wb+'))  # setting metadata has succeeded
        except Exception, e:
            json.dump((False, str(e)),
                      open(filename_results_code,
                           'wb+'))  # setting metadata has failed somehow
Example #16
def set_metadata():
    # locate galaxy_root for loading datatypes
    galaxy_root = os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir, os.pardir, os.pardir))
    galaxy.datatypes.metadata.MetadataTempFile.tmp_dir = tool_job_working_directory = os.path.abspath(os.getcwd())

    # This is ugly, but to transition smoothly from existing jobs without this
    # parameter to ones with it, it has to be the last optional parameter and
    # we have to sniff it.
    try:
        max_metadata_value_size = int(sys.argv[-1])
        sys.argv = sys.argv[:-1]
    except ValueError:
        max_metadata_value_size = 0
        # max_metadata_value_size is unspecified and should be 0

    # Set up datatypes registry
    datatypes_config = sys.argv.pop(1)
    if not os.path.exists(datatypes_config):
        # This path should exist, except for jobs that started running on release 17.05, where a global
        # datatypes_config (instead of a datatypes_config per job) was used. For a while release 17.05
        # would remove the global datatypes config on shutdown and toolbox reload, which would lead to
        # failed metadata jobs. To remedy this we scan jobs at startup for missing registry.xml files,
        # and if we detect such a job we write out the current registry.xml file.
        datatypes_config = os.path.join(tool_job_working_directory, "registry.xml")
        if not os.path.exists(datatypes_config):
            print("Metadata setting failed because registry.xml could not be found. You may retry setting metadata.")
            sys.exit(1)
    datatypes_registry = galaxy.datatypes.registry.Registry()
    datatypes_registry.load_datatypes(root_dir=galaxy_root, config=datatypes_config)
    galaxy.model.set_datatypes_registry(datatypes_registry)

    job_metadata = sys.argv.pop(1)
    existing_job_metadata_dict = {}
    new_job_metadata_dict = {}
    if job_metadata != "None" and os.path.exists(job_metadata):
        for line in open(job_metadata, 'r'):
            try:
                line = stringify_dictionary_keys(json.loads(line))
                if line['type'] == 'dataset':
                    existing_job_metadata_dict[line['dataset_id']] = line
                elif line['type'] == 'new_primary_dataset':
                    new_job_metadata_dict[line['filename']] = line
            except:
                continue

    for filenames in sys.argv[1:]:
        fields = filenames.split(',')
        filename_in = fields.pop(0)
        filename_kwds = fields.pop(0)
        filename_out = fields.pop(0)
        filename_results_code = fields.pop(0)
        dataset_filename_override = fields.pop(0)
        # Need to be careful with the way that these parameters are populated from the filename splitting,
        # because if a job is running when the server is updated, any existing external metadata command-lines
        # will not have info about the newly added override_metadata file
        if fields:
            override_metadata = fields.pop(0)
        else:
            override_metadata = None
        set_meta_kwds = stringify_dictionary_keys(json.load(open(filename_kwds)))  # load kwds; need to ensure our keywords are not unicode
        try:
            dataset = cPickle.load(open(filename_in))  # load DatasetInstance
            dataset.dataset.external_filename = dataset_filename_override
            files_path = os.path.abspath(os.path.join(tool_job_working_directory, "dataset_%s_files" % (dataset.dataset.id)))
            dataset.dataset.external_extra_files_path = files_path
            if dataset.dataset.id in existing_job_metadata_dict:
                dataset.extension = existing_job_metadata_dict[dataset.dataset.id].get('ext', dataset.extension)
            # Metadata FileParameter types may not be writable on a cluster node, and are therefore temporarily substituted with MetadataTempFiles
            if override_metadata:
                override_metadata = json.load(open(override_metadata))
                for metadata_name, metadata_file_override in override_metadata:
                    if galaxy.datatypes.metadata.MetadataTempFile.is_JSONified_value(metadata_file_override):
                        metadata_file_override = galaxy.datatypes.metadata.MetadataTempFile.from_JSON(metadata_file_override)
                    setattr(dataset.metadata, metadata_name, metadata_file_override)
            file_dict = existing_job_metadata_dict.get(dataset.dataset.id, {})
            set_meta_with_tool_provided(dataset, file_dict, set_meta_kwds, datatypes_registry)
            if max_metadata_value_size:
                for k, v in list(dataset.metadata.items()):
                    if total_size(v) > max_metadata_value_size:
                        log.info("Key %s too large for metadata, discarding" % k)
                        dataset.metadata.remove_key(k)
            dataset.metadata.to_JSON_dict(filename_out)  # write out results of set_meta
            json.dump((True, 'Metadata has been set successfully'), open(filename_results_code, 'wb+'))  # setting metadata has succeeded
        except Exception as e:
            json.dump((False, str(e)), open(filename_results_code, 'wb+'))  # setting metadata has failed somehow

    for i, (filename, file_dict) in enumerate(new_job_metadata_dict.items(), start=1):
        new_dataset_filename = os.path.join(tool_job_working_directory, "working", file_dict['filename'])
        new_dataset = galaxy.model.Dataset(id=-i, external_filename=new_dataset_filename)
        extra_files = file_dict.get('extra_files', None)
        if extra_files is not None:
            new_dataset._extra_files_path = os.path.join(tool_job_working_directory, "working", extra_files)
        new_dataset.state = new_dataset.states.OK
        new_dataset_instance = galaxy.model.HistoryDatasetAssociation(id=-i, dataset=new_dataset, extension=file_dict.get('ext', 'data'))
        set_meta_with_tool_provided(new_dataset_instance, file_dict, set_meta_kwds, datatypes_registry)
        file_dict['metadata'] = json.loads(new_dataset_instance.metadata.to_JSON_dict())  # storing metadata in external form, need to turn back into dict, then later jsonify
    if existing_job_metadata_dict or new_job_metadata_dict:
        with open(job_metadata, 'wb') as job_metadata_fh:
            for value in list(existing_job_metadata_dict.values()) + list(new_job_metadata_dict.values()):
                job_metadata_fh.write("%s\n" % (json.dumps(value)))

    clear_mappers()
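The trailing loop above materializes tool-discovered outputs from 'new_primary_dataset' entries in the job metadata file. A hypothetical galaxy.json line that would land in new_job_metadata_dict, using only keys the loop reads (filename, ext, extra_files):

import json

entry = {
    "type": "new_primary_dataset",
    "filename": "primary_1_report_visible_tabular",  # illustrative name
    "ext": "tabular",
    "extra_files": "primary_1_files",                # optional
}
print(json.dumps(entry))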
Example #17
def __main__():
    file_path = sys.argv.pop(1)
    tmp_dir = sys.argv.pop(1)
    galaxy.model.Dataset.file_path = file_path
    galaxy.datatypes.metadata.MetadataTempFile.tmp_dir = tmp_dir

    # Set up datatypes registry
    config_root = sys.argv.pop(1)
    datatypes_config = sys.argv.pop(1)
    galaxy.model.set_datatypes_registry(
        galaxy.datatypes.registry.Registry(config_root, datatypes_config))

    job_metadata = sys.argv.pop(1)
    ext_override = dict()
    if job_metadata != "None" and os.path.exists(job_metadata):
        for line in open(job_metadata, 'r'):
            try:
                line = stringify_dictionary_keys(from_json_string(line))
                assert line['type'] == 'dataset'
                ext_override[line['dataset_id']] = line['ext']
            except:
                continue
    for filenames in sys.argv[1:]:
        fields = filenames.split(',')
        filename_in = fields.pop(0)
        filename_kwds = fields.pop(0)
        filename_out = fields.pop(0)
        filename_results_code = fields.pop(0)
        dataset_filename_override = fields.pop(0)
        #Need to be careful with the way that these parameters are populated from the filename splitting,
        #because if a job is running when the server is updated, any existing external metadata command-lines
        #will not have info about the newly added override_metadata file
        if fields:
            override_metadata = fields.pop(0)
        else:
            override_metadata = None
        try:
            dataset = cPickle.load(open(filename_in))  #load DatasetInstance
            if dataset_filename_override:
                dataset.dataset.external_filename = dataset_filename_override
            if ext_override.get(dataset.dataset.id, None):
                dataset.extension = ext_override[dataset.dataset.id]
            #Metadata FileParameter types may not be writable on a cluster node, and are therefore temporarily substituted with MetadataTempFiles
            if override_metadata:
                override_metadata = simplejson.load(open(override_metadata))
                for metadata_name, metadata_file_override in override_metadata:
                    if galaxy.datatypes.metadata.MetadataTempFile.is_JSONified_value(
                            metadata_file_override):
                        metadata_file_override = galaxy.datatypes.metadata.MetadataTempFile.from_JSON(
                            metadata_file_override)
                    setattr(dataset.metadata, metadata_name,
                            metadata_file_override)
            kwds = stringify_dictionary_keys(
                simplejson.load(open(filename_kwds))
            )  #load kwds; need to ensure our keywords are not unicode
            dataset.datatype.set_meta(dataset, **kwds)
            dataset.metadata.to_JSON_dict(
                filename_out)  # write out results of set_meta
            simplejson.dump((True, 'Metadata has been set successfully'),
                            open(filename_results_code,
                                 'wb+'))  #setting metadata has succeeded
        except Exception, e:
            simplejson.dump((False, str(e)),
                            open(filename_results_code,
                                 'wb+'))  #setting metadata has failed somehow
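Across these examples the metadata_override_* files are JSON lists of (name, value) pairs, where file-backed values arrive as JSONified MetadataTempFile payloads. A sketch of writing a simple one; the names and values are illustrative only:

import json

override = [
    ["chromCol", 1],   # plain metadata value
    ["dbkey", "hg19"],
]
with open("metadata_override_HDA_1", "w") as f:  # hypothetical filename
    json.dump(override, f)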
Example #18
def set_metadata_portable():
    import galaxy.model
    tool_job_working_directory = os.path.abspath(os.getcwd())
    metadata_tmp_files_dir = os.path.join(tool_job_working_directory,
                                          "metadata")
    galaxy.model.metadata.MetadataTempFile.tmp_dir = metadata_tmp_files_dir

    metadata_params_path = os.path.join("metadata", "params.json")
    try:
        with open(metadata_params_path, "r") as f:
            metadata_params = json.load(f)
    except IOError:
        raise Exception("Failed to find metadata/params.json from cwd [%s]" %
                        tool_job_working_directory)
    datatypes_config = metadata_params["datatypes_config"]
    job_metadata = metadata_params["job_metadata"]
    provided_metadata_style = metadata_params.get("provided_metadata_style")
    max_metadata_value_size = metadata_params.get(
        "max_metadata_value_size") or 0
    outputs = metadata_params["outputs"]

    datatypes_registry = validate_and_load_datatypes_config(datatypes_config)
    tool_provided_metadata = load_job_metadata(job_metadata,
                                               provided_metadata_style)

    def set_meta(new_dataset_instance, file_dict):
        set_meta_with_tool_provided(new_dataset_instance, file_dict,
                                    set_meta_kwds, datatypes_registry,
                                    max_metadata_value_size)

    object_store_conf_path = os.path.join("metadata", "object_store_conf.json")
    extended_metadata_collection = os.path.exists(object_store_conf_path)

    object_store = None
    job_context = None
    version_string = ""

    export_store = None
    if extended_metadata_collection:
        from galaxy.tool_util.parser.stdio import ToolStdioRegex, ToolStdioExitCode
        tool_dict = metadata_params["tool"]
        stdio_exit_code_dicts, stdio_regex_dicts = tool_dict["stdio_exit_codes"], tool_dict["stdio_regexes"]
        stdio_exit_codes = list(map(ToolStdioExitCode, stdio_exit_code_dicts))
        stdio_regexes = list(map(ToolStdioRegex, stdio_regex_dicts))

        with open(object_store_conf_path, "r") as f:
            config_dict = json.load(f)
        from galaxy.objectstore import build_object_store_from_config
        assert config_dict is not None
        object_store = build_object_store_from_config(None,
                                                      config_dict=config_dict)
        galaxy.model.Dataset.object_store = object_store

        outputs_directory = os.path.join(tool_job_working_directory, "outputs")
        if not os.path.exists(outputs_directory):
            outputs_directory = tool_job_working_directory

        # TODO: constants...
        if os.path.exists(os.path.join(outputs_directory, "tool_stdout")):
            with open(os.path.join(outputs_directory, "tool_stdout"),
                      "rb") as f:
                tool_stdout = f.read()

            with open(os.path.join(outputs_directory, "tool_stderr"),
                      "rb") as f:
                tool_stderr = f.read()
        elif os.path.exists(os.path.join(outputs_directory, "stdout")):
            # Pulsar style working directory.
            with open(os.path.join(outputs_directory, "stdout"), "rb") as f:
                tool_stdout = f.read()

            with open(os.path.join(outputs_directory, "stderr"), "rb") as f:
                tool_stderr = f.read()

        job_id_tag = metadata_params["job_id_tag"]

        # TODO: this clearly needs to be refactored, nothing in runners should be imported here..
        from galaxy.job_execution.output_collect import default_exit_code_file, read_exit_code_from
        exit_code_file = default_exit_code_file(".", job_id_tag)
        tool_exit_code = read_exit_code_from(exit_code_file, job_id_tag)

        from galaxy.tool_util.output_checker import check_output, DETECTED_JOB_STATE
        check_output_detected_state, tool_stdout, tool_stderr, job_messages = check_output(
            stdio_regexes, stdio_exit_codes, tool_stdout, tool_stderr,
            tool_exit_code, job_id_tag)
        if check_output_detected_state == DETECTED_JOB_STATE.OK and not tool_provided_metadata.has_failed_outputs():
            final_job_state = galaxy.model.Job.states.OK
        else:
            final_job_state = galaxy.model.Job.states.ERROR

        from pulsar.client.staging import COMMAND_VERSION_FILENAME
        version_string = ""
        if os.path.exists(COMMAND_VERSION_FILENAME):
            version_string = open(COMMAND_VERSION_FILENAME).read()

        # TODO: handle outputs_to_working_directory?
        from galaxy.util.expressions import ExpressionContext
        job_context = ExpressionContext(
            dict(stdout=tool_stdout, stderr=tool_stderr))

        # Load outputs.
        import_model_store = store.imported_store_for_metadata(
            'metadata/outputs_new', object_store=object_store)
        export_store = store.DirectoryModelExportStore(
            'metadata/outputs_populated',
            serialize_dataset_objects=True,
            for_edit=True)

    for output_name, output_dict in outputs.items():
        if extended_metadata_collection:
            dataset_instance_id = output_dict["id"]
            dataset = import_model_store.sa_session.query(
                galaxy.model.HistoryDatasetAssociation).find(
                    dataset_instance_id)
            assert dataset is not None
        else:
            filename_in = os.path.join("metadata/metadata_in_%s" % output_name)
            dataset = cPickle.load(open(filename_in,
                                        'rb'))  # load DatasetInstance

        filename_kwds = os.path.join("metadata/metadata_kwds_%s" % output_name)
        filename_out = os.path.join("metadata/metadata_out_%s" % output_name)
        filename_results_code = os.path.join("metadata/metadata_results_%s" %
                                             output_name)
        override_metadata = os.path.join("metadata/metadata_override_%s" %
                                         output_name)
        dataset_filename_override = output_dict["filename_override"]

        # Same block as below...
        set_meta_kwds = stringify_dictionary_keys(
            json.load(open(filename_kwds))
        )  # load kwds; need to ensure our keywords are not unicode
        try:
            dataset.dataset.external_filename = dataset_filename_override
            store_by = metadata_params.get("object_store_store_by", "id")
            extra_files_dir_name = "dataset_%s_files" % getattr(
                dataset.dataset, store_by)
            files_path = os.path.abspath(
                os.path.join(tool_job_working_directory, "working",
                             extra_files_dir_name))
            dataset.dataset.external_extra_files_path = files_path
            file_dict = tool_provided_metadata.get_dataset_meta(
                output_name, dataset.dataset.id, dataset.dataset.uuid)
            if 'ext' in file_dict:
                dataset.extension = file_dict['ext']
            # Metadata FileParameter types may not be writable on a cluster node, and are therefore temporarily substituted with MetadataTempFiles
            override_metadata = json.load(open(override_metadata))
            for metadata_name, metadata_file_override in override_metadata:
                if galaxy.datatypes.metadata.MetadataTempFile.is_JSONified_value(
                        metadata_file_override):
                    metadata_file_override = galaxy.datatypes.metadata.MetadataTempFile.from_JSON(
                        metadata_file_override)
                setattr(dataset.metadata, metadata_name,
                        metadata_file_override)
            if output_dict.get("validate", False):
                set_validated_state(dataset)
            set_meta(dataset, file_dict)

            if extended_metadata_collection:
                meta = tool_provided_metadata.get_dataset_meta(
                    output_name, dataset.dataset.id, dataset.dataset.uuid)
                if meta:
                    context = ExpressionContext(meta, job_context)
                else:
                    context = job_context

                # Lazy and unattached
                # if getattr(dataset, "hidden_beneath_collection_instance", None):
                #    dataset.visible = False
                dataset.blurb = 'done'
                dataset.peek = 'no peek'
                dataset.info = (dataset.info or '')
                if context['stdout'].strip():
                    # Ensure white space between entries
                    dataset.info = dataset.info.rstrip() + "\n" + context['stdout'].strip()
                if context['stderr'].strip():
                    # Ensure white space between entries
                    dataset.info = dataset.info.rstrip() + "\n" + context['stderr'].strip()
                dataset.tool_version = version_string
                dataset.set_size()
                if 'uuid' in context:
                    dataset.dataset.uuid = context['uuid']
                object_store.update_from_file(dataset.dataset, create=True)
                from galaxy.job_execution.output_collect import collect_extra_files
                collect_extra_files(object_store, dataset, ".")
                if galaxy.model.Job.states.ERROR == final_job_state:
                    dataset.blurb = "error"
                    dataset.mark_unhidden()
                else:
                    # If the tool was expected to set the extension, attempt to retrieve it
                    if dataset.ext == 'auto':
                        dataset.extension = context.get('ext', 'data')
                        dataset.init_meta(copy_from=dataset)

                    # This has already been done:
                    # else:
                    #     self.external_output_metadata.load_metadata(dataset, output_name, self.sa_session, working_directory=self.working_directory, remote_metadata_directory=remote_metadata_directory)
                    line_count = context.get('line_count', None)
                    try:
                        # Certain datatype's set_peek methods contain a line_count argument
                        dataset.set_peek(line_count=line_count)
                    except TypeError:
                        # ... and others don't
                        dataset.set_peek()

                from galaxy.jobs import TOOL_PROVIDED_JOB_METADATA_KEYS
                for context_key in TOOL_PROVIDED_JOB_METADATA_KEYS:
                    if context_key in context:
                        context_value = context[context_key]
                        setattr(dataset, context_key, context_value)

                if extended_metadata_collection:
                    export_store.add_dataset(dataset)
                else:
                    cPickle.dump(dataset, open(filename_out, 'wb+'))
            else:
                dataset.metadata.to_JSON_dict(
                    filename_out)  # write out results of set_meta

            json.dump((True, 'Metadata has been set successfully'),
                      open(filename_results_code,
                           'wt+'))  # setting metadata has succeeded
        except Exception:
            json.dump((False, traceback.format_exc()),
                      open(filename_results_code,
                           'wt+'))  # setting metadata has failed somehow

    if extended_metadata_collection:
        # discover extra outputs...
        from galaxy.job_execution.output_collect import collect_dynamic_outputs, collect_primary_datasets, SessionlessJobContext

        job_context = SessionlessJobContext(
            metadata_params, tool_provided_metadata, object_store,
            export_store, import_model_store,
            os.path.join(tool_job_working_directory, "working"))

        output_collections = {}
        for name, output_collection in metadata_params[
                "output_collections"].items():
            output_collections[name] = import_model_store.sa_session.query(
                galaxy.model.HistoryDatasetCollectionAssociation).find(
                    output_collection["id"])
        outputs = {}
        for name, output in metadata_params["outputs"].items():
            outputs[name] = import_model_store.sa_session.query(
                galaxy.model.HistoryDatasetAssociation).find(output["id"])

        input_ext = json.loads(metadata_params["job_params"].get(
            "__input_ext", '"data"'))
        collect_primary_datasets(
            job_context,
            outputs,
            input_ext=input_ext,
        )
        collect_dynamic_outputs(job_context, output_collections)

    if export_store:
        export_store._finalize()
    write_job_metadata(tool_job_working_directory, job_metadata, set_meta,
                       tool_provided_metadata)
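Everything in `set_metadata_portable` above is driven by `metadata/params.json` staged into the job working directory. For orientation, a hedged sketch of writing a minimal params file; the field values are hypothetical, and the authoritative schema is whatever the Galaxy job runner serializes (extended metadata collection additionally expects keys such as "tool" with stdio regex/exit-code info):

import json
import os

os.makedirs("metadata", exist_ok=True)
params = {
    "datatypes_config": "registry.xml",      # datatypes registry to load
    "job_metadata": "metadata/galaxy.json",  # tool-provided metadata (assumed name)
    "provided_metadata_style": "default",
    "max_metadata_value_size": 0,
    "job_id_tag": "42",
    "outputs": {
        "output1": {"id": 1, "filename_override": "outputs/dataset_1.dat"},
    },
    "output_collections": {},
    "job_params": {},
}
with open(os.path.join("metadata", "params.json"), "w") as f:
    json.dump(params, f)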
Example #19
0
def set_metadata_portable():
    tool_job_working_directory = os.path.abspath(os.getcwd())
    metadata_tmp_files_dir = os.path.join(tool_job_working_directory, "metadata")
    MetadataTempFile.tmp_dir = metadata_tmp_files_dir

    metadata_params = get_metadata_params(tool_job_working_directory)
    datatypes_config = metadata_params["datatypes_config"]
    job_metadata = metadata_params["job_metadata"]
    provided_metadata_style = metadata_params.get("provided_metadata_style")
    max_metadata_value_size = metadata_params.get("max_metadata_value_size") or 0
    max_discovered_files = metadata_params.get("max_discovered_files")
    outputs = metadata_params["outputs"]

    datatypes_registry = validate_and_load_datatypes_config(datatypes_config)
    tool_provided_metadata = load_job_metadata(job_metadata, provided_metadata_style)

    def set_meta(new_dataset_instance, file_dict):
        set_meta_with_tool_provided(new_dataset_instance, file_dict, set_meta_kwds, datatypes_registry, max_metadata_value_size)

    try:
        object_store = get_object_store(tool_job_working_directory=tool_job_working_directory)
    except (FileNotFoundError, AssertionError):
        object_store = None
    extended_metadata_collection = bool(object_store)
    job_context = None
    version_string = None

    export_store = None
    final_job_state = Job.states.OK
    job_messages = []
    if extended_metadata_collection:
        tool_dict = metadata_params["tool"]
        stdio_exit_code_dicts, stdio_regex_dicts = tool_dict["stdio_exit_codes"], tool_dict["stdio_regexes"]
        stdio_exit_codes = list(map(ToolStdioExitCode, stdio_exit_code_dicts))
        stdio_regexes = list(map(ToolStdioRegex, stdio_regex_dicts))

        outputs_directory = os.path.join(tool_job_working_directory, "outputs")
        if not os.path.exists(outputs_directory):
            outputs_directory = tool_job_working_directory

        # TODO: constants...
        locations = [
            (outputs_directory, 'tool_'),
            (tool_job_working_directory, ''),
        (outputs_directory, ''),  # Pulsar style output directory? Was this ever used - did this ever work?
        ]
        for directory, prefix in locations:
            if os.path.exists(os.path.join(directory, f"{prefix}stdout")):
                with open(os.path.join(directory, f"{prefix}stdout"), 'rb') as f:
                    tool_stdout = f.read(MAX_STDIO_READ_BYTES)
                with open(os.path.join(directory, f"{prefix}stderr"), 'rb') as f:
                    tool_stderr = f.read(MAX_STDIO_READ_BYTES)
                break
        else:
            if os.path.exists(os.path.join(tool_job_working_directory, 'task_0')):
                # We have a task splitting job
                tool_stdout = b''
                tool_stderr = b''
                paths = Path(tool_job_working_directory).glob('task_*')
                for path in paths:
                    with open(path / 'outputs' / 'tool_stdout', 'rb') as f:
                        task_stdout = f.read(MAX_STDIO_READ_BYTES)
                        if task_stdout:
                            tool_stdout = b"%s[%s stdout]\n%s\n" % (tool_stdout, path.name.encode(), task_stdout)
                    with open(path / 'outputs' / 'tool_stderr', 'rb') as f:
                        task_stderr = f.read(MAX_STDIO_READ_BYTES)
                        if task_stderr:
                            tool_stderr = b"%s[%s stdout]\n%s\n" % (tool_stderr, path.name.encode(), task_stderr)
            else:
                wdc = os.listdir(tool_job_working_directory)
                odc = os.listdir(outputs_directory)
                error_desc = "Failed to find tool_stdout or tool_stderr for this job, cannot collect metadata"
                error_extra = f"Working dir contents [{wdc}], output directory contents [{odc}]"
                log.warn(f"{error_desc}. {error_extra}")
                raise Exception(error_desc)

        job_id_tag = metadata_params["job_id_tag"]

        exit_code_file = default_exit_code_file(".", job_id_tag)
        tool_exit_code = read_exit_code_from(exit_code_file, job_id_tag)

        check_output_detected_state, tool_stdout, tool_stderr, job_messages = check_output(stdio_regexes, stdio_exit_codes, tool_stdout, tool_stderr, tool_exit_code, job_id_tag)
        if check_output_detected_state == DETECTED_JOB_STATE.OK and not tool_provided_metadata.has_failed_outputs():
            final_job_state = Job.states.OK
        else:
            final_job_state = Job.states.ERROR

        version_string_path = os.path.join('outputs', COMMAND_VERSION_FILENAME)
        version_string = collect_shrinked_content_from_path(version_string_path)

        expression_context = ExpressionContext(dict(stdout=tool_stdout[:255], stderr=tool_stderr[:255]))

        # Load outputs.
        export_store = store.DirectoryModelExportStore('metadata/outputs_populated', serialize_dataset_objects=True, for_edit=True, strip_metadata_files=False, serialize_jobs=True)
    try:
        import_model_store = store.imported_store_for_metadata('metadata/outputs_new', object_store=object_store)
    except AssertionError:
        # Remove in 21.09, this should only happen for jobs that started on <= 20.09 and finish now
        import_model_store = None

    tool_script_file = os.path.join(tool_job_working_directory, 'tool_script.sh')
    job = None
    if import_model_store and export_store:
        job = next(iter(import_model_store.sa_session.objects[Job].values()))

    job_context = SessionlessJobContext(
        metadata_params,
        tool_provided_metadata,
        object_store,
        export_store,
        import_model_store,
        os.path.join(tool_job_working_directory, "working"),
        final_job_state=final_job_state,
        max_discovered_files=max_discovered_files,
    )

    if extended_metadata_collection:
        # discover extra outputs...
        output_collections = {}
        for name, output_collection in metadata_params["output_collections"].items():
            # TODO: remove HistoryDatasetCollectionAssociation fallback on 22.01, model_class used to not be serialized prior to 21.09
            model_class = output_collection.get('model_class', 'HistoryDatasetCollectionAssociation')
            collection = import_model_store.sa_session.query(getattr(galaxy.model, model_class)).find(output_collection["id"])
            output_collections[name] = collection
        output_instances = {}
        for name, output in metadata_params["outputs"].items():
            klass = getattr(galaxy.model, output.get('model_class', 'HistoryDatasetAssociation'))
            output_instances[name] = import_model_store.sa_session.query(klass).find(output["id"])

        input_ext = json.loads(metadata_params["job_params"].get("__input_ext") or '"data"')
        try:
            collect_primary_datasets(
                job_context,
                output_instances,
                input_ext=input_ext,
            )
            collect_dynamic_outputs(job_context, output_collections)
        except MaxDiscoveredFilesExceededError as e:
            final_job_state = Job.states.ERROR
            job_messages.append(str(e))
        if job:
            job.job_messages = job_messages
            job.state = final_job_state
        if os.path.exists(tool_script_file):
            with open(tool_script_file) as command_fh:
                command_line_lines = []
                for i, line in enumerate(command_fh):
                    if i == 0 and line.endswith('COMMAND_VERSION 2>&1;'):
                        # Don't record version command as part of command line
                        continue
                    command_line_lines.append(line)
                job.command_line = "".join(command_line_lines).strip()
                export_store.export_job(job, include_job_data=False)

    unnamed_id_to_path = {}
    for unnamed_output_dict in job_context.tool_provided_metadata.get_unnamed_outputs():
        destination = unnamed_output_dict["destination"]
        elements = unnamed_output_dict["elements"]
        destination_type = destination["type"]
        if destination_type == 'hdas':
            for element in elements:
                filename = element.get('filename')
                object_id = element.get('object_id')
                if filename and object_id:
                    unnamed_id_to_path[object_id] = os.path.join(job_context.job_working_directory, filename)

    for output_name, output_dict in outputs.items():
        dataset_instance_id = output_dict["id"]
        klass = getattr(galaxy.model, output_dict.get('model_class', 'HistoryDatasetAssociation'))
        dataset = None
        if import_model_store:
            dataset = import_model_store.sa_session.query(klass).find(dataset_instance_id)
        if dataset is None:
            # legacy check for jobs that started before 21.01, remove on 21.05
            filename_in = os.path.join(f"metadata/metadata_in_{output_name}")
            import pickle
            dataset = pickle.load(open(filename_in, 'rb'))  # load DatasetInstance
        assert dataset is not None

        filename_kwds = os.path.join(f"metadata/metadata_kwds_{output_name}")
        filename_out = os.path.join(f"metadata/metadata_out_{output_name}")
        filename_results_code = os.path.join(f"metadata/metadata_results_{output_name}")
        override_metadata = os.path.join(f"metadata/metadata_override_{output_name}")
        dataset_filename_override = output_dict["filename_override"]
        # pre-20.05 this was a per job parameter and not a per dataset parameter, drop in 21.XX
        legacy_object_store_store_by = metadata_params.get("object_store_store_by", "id")

        # Same block as below...
        set_meta_kwds = stringify_dictionary_keys(json.load(open(filename_kwds)))  # load kwds; need to ensure our keywords are not unicode
        try:
            external_filename = unnamed_id_to_path.get(dataset_instance_id, dataset_filename_override)
            if not os.path.exists(external_filename):
                matches = glob.glob(external_filename)
                assert len(matches) == 1, f"More than one file matched by output glob '{external_filename}'"
                external_filename = matches[0]
                assert safe_contains(tool_job_working_directory, external_filename), f"Cannot collect output '{external_filename}' from outside of working directory"
                created_from_basename = os.path.relpath(external_filename, os.path.join(tool_job_working_directory, 'working'))
                dataset.dataset.created_from_basename = created_from_basename
            # override filename if we're dealing with outputs to working directory and dataset is not linked to
            link_data_only = metadata_params.get("link_data_only")
            if not link_data_only:
                # Only set external filename if we're dealing with files in job working directory.
                # Fixes link_data_only uploads
                dataset.dataset.external_filename = external_filename
                store_by = output_dict.get("object_store_store_by", legacy_object_store_store_by)
                extra_files_dir_name = f"dataset_{getattr(dataset.dataset, store_by)}_files"
                files_path = os.path.abspath(os.path.join(tool_job_working_directory, "working", extra_files_dir_name))
                dataset.dataset.external_extra_files_path = files_path
            file_dict = tool_provided_metadata.get_dataset_meta(output_name, dataset.dataset.id, dataset.dataset.uuid)
            if 'ext' in file_dict:
                dataset.extension = file_dict['ext']
            # Metadata FileParameter types may not be writable on a cluster node, and are therefore temporarily substituted with MetadataTempFiles
            override_metadata = json.load(open(override_metadata))
            for metadata_name, metadata_file_override in override_metadata:
                if MetadataTempFile.is_JSONified_value(metadata_file_override):
                    metadata_file_override = MetadataTempFile.from_JSON(metadata_file_override)
                setattr(dataset.metadata, metadata_name, metadata_file_override)
            if output_dict.get("validate", False):
                set_validated_state(dataset)
            if dataset_instance_id not in unnamed_id_to_path:
                # We're going to run through set_metadata in collect_dynamic_outputs with more contextual metadata,
                # so skip set_meta here.
                set_meta(dataset, file_dict)
                if extended_metadata_collection:
                    collect_extra_files(object_store, dataset, ".")
                    dataset.state = dataset.dataset.state = final_job_state

            if extended_metadata_collection:
                if not link_data_only and os.path.getsize(external_filename):
                    # Here we might be updating a disk based objectstore when outputs_to_working_directory is used,
                    # or a remote object store from its cache path.
                    object_store.update_from_file(dataset.dataset, file_name=external_filename, create=True)
                # TODO: merge expression_context into tool_provided_metadata so we don't have to special case this (here and in _finish_dataset)
                meta = tool_provided_metadata.get_dataset_meta(output_name, dataset.dataset.id, dataset.dataset.uuid)
                if meta:
                    context = ExpressionContext(meta, expression_context)
                else:
                    context = expression_context
                dataset.blurb = 'done'
                dataset.peek = 'no peek'
                dataset.info = (dataset.info or '')
                if context['stdout'].strip():
                    # Ensure white space between entries
                    dataset.info = f"{dataset.info.rstrip()}\n{context['stdout'].strip()}"
                if context['stderr'].strip():
                    # Ensure white space between entries
                    dataset.info = f"{dataset.info.rstrip()}\n{context['stderr'].strip()}"
                dataset.tool_version = version_string
                if 'uuid' in context:
                    dataset.dataset.uuid = context['uuid']
                if not final_job_state == Job.states.ERROR:
                    line_count = context.get('line_count', None)
                    try:
                        # Certain datatype's set_peek methods contain a line_count argument
                        dataset.set_peek(line_count=line_count)
                    except TypeError:
                        # ... and others don't
                        dataset.set_peek()
                for context_key in TOOL_PROVIDED_JOB_METADATA_KEYS:
                    if context_key in context:
                        context_value = context[context_key]
                        setattr(dataset, context_key, context_value)
                # We only want to persist the external_filename if the dataset has been linked in.
                if not link_data_only:
                    dataset.dataset.external_filename = None
                    dataset.dataset.extra_files_path = None
                export_store.add_dataset(dataset)
            else:
                dataset.metadata.to_JSON_dict(filename_out)  # write out results of set_meta

            json.dump((True, 'Metadata has been set successfully'), open(filename_results_code, 'wt+'))  # setting metadata has succeeded
        except Exception:
            json.dump((False, traceback.format_exc()), open(filename_results_code, 'wt+'))  # setting metadata has failed somehow

    if export_store:
        export_store._finalize()
    write_job_metadata(tool_job_working_directory, job_metadata, set_meta, tool_provided_metadata)
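One detail of Example #19 worth isolating is the for/else scan over candidate stdout/stderr locations: the else branch runs only when the loop finishes without a break, i.e. when no location had the files. A stripped-down sketch of that pattern (directory names follow the example above):

import os

candidates = [("outputs", "tool_"), (".", ""), ("outputs", "")]
for directory, prefix in candidates:
    stdout_path = os.path.join(directory, f"{prefix}stdout")
    if os.path.exists(stdout_path):
        with open(stdout_path, "rb") as fh:
            tool_stdout = fh.read()
        with open(os.path.join(directory, f"{prefix}stderr"), "rb") as fh:
            tool_stderr = fh.read()
        break
else:
    # Reached only if no candidate matched -- mirrors the example's failure path.
    raise Exception("Failed to find tool_stdout or tool_stderr for this job")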
Example #20
0
def set_metadata_portable():
    tool_job_working_directory = os.path.abspath(os.getcwd())
    metadata_tmp_files_dir = os.path.join(tool_job_working_directory,
                                          "metadata")
    MetadataTempFile.tmp_dir = metadata_tmp_files_dir

    metadata_params_path = os.path.join("metadata", "params.json")
    try:
        with open(metadata_params_path) as f:
            metadata_params = json.load(f)
    except OSError:
        raise Exception(
            f"Failed to find metadata/params.json from cwd [{tool_job_working_directory}]"
        )
    datatypes_config = metadata_params["datatypes_config"]
    job_metadata = metadata_params["job_metadata"]
    provided_metadata_style = metadata_params.get("provided_metadata_style")
    max_metadata_value_size = metadata_params.get(
        "max_metadata_value_size") or 0
    outputs = metadata_params["outputs"]

    datatypes_registry = validate_and_load_datatypes_config(datatypes_config)
    tool_provided_metadata = load_job_metadata(job_metadata,
                                               provided_metadata_style)

    def set_meta(new_dataset_instance, file_dict):
        set_meta_with_tool_provided(new_dataset_instance, file_dict,
                                    set_meta_kwds, datatypes_registry,
                                    max_metadata_value_size)

    object_store_conf_path = os.path.join("metadata", "object_store_conf.json")
    extended_metadata_collection = os.path.exists(object_store_conf_path)

    object_store = None
    job_context = None
    version_string = ""

    export_store = None
    final_job_state = Job.states.OK
    if extended_metadata_collection:
        tool_dict = metadata_params["tool"]
        stdio_exit_code_dicts, stdio_regex_dicts = tool_dict[
            "stdio_exit_codes"], tool_dict["stdio_regexes"]
        stdio_exit_codes = list(map(ToolStdioExitCode, stdio_exit_code_dicts))
        stdio_regexes = list(map(ToolStdioRegex, stdio_regex_dicts))

        with open(object_store_conf_path) as f:
            config_dict = json.load(f)
        assert config_dict is not None
        object_store = build_object_store_from_config(None,
                                                      config_dict=config_dict)
        Dataset.object_store = object_store

        outputs_directory = os.path.join(tool_job_working_directory, "outputs")
        if not os.path.exists(outputs_directory):
            outputs_directory = tool_job_working_directory

        # TODO: constants...
        if os.path.exists(os.path.join(outputs_directory, "tool_stdout")):
            with open(os.path.join(outputs_directory, "tool_stdout"),
                      "rb") as f:
                tool_stdout = f.read()

            with open(os.path.join(outputs_directory, "tool_stderr"),
                      "rb") as f:
                tool_stderr = f.read()
        elif os.path.exists(os.path.join(tool_job_working_directory,
                                         "stdout")):
            with open(os.path.join(tool_job_working_directory, "stdout"),
                      "rb") as f:
                tool_stdout = f.read()

            with open(os.path.join(tool_job_working_directory, "stderr"),
                      "rb") as f:
                tool_stderr = f.read()
        elif os.path.exists(os.path.join(outputs_directory, "stdout")):
            # Pulsar style output directory? Was this ever used - did this ever work?
            with open(os.path.join(outputs_directory, "stdout"), "rb") as f:
                tool_stdout = f.read()

            with open(os.path.join(outputs_directory, "stderr"), "rb") as f:
                tool_stderr = f.read()
        else:
            wdc = os.listdir(tool_job_working_directory)
            odc = os.listdir(outputs_directory)
            error_desc = "Failed to find tool_stdout or tool_stderr for this job, cannot collect metadata"
            error_extra = f"Working dir contents [{wdc}], output directory contents [{odc}]"
            log.warn(f"{error_desc}. {error_extra}")
            raise Exception(error_desc)

        job_id_tag = metadata_params["job_id_tag"]

        exit_code_file = default_exit_code_file(".", job_id_tag)
        tool_exit_code = read_exit_code_from(exit_code_file, job_id_tag)

        check_output_detected_state, tool_stdout, tool_stderr, job_messages = check_output(
            stdio_regexes, stdio_exit_codes, tool_stdout, tool_stderr,
            tool_exit_code, job_id_tag)
        if check_output_detected_state == DETECTED_JOB_STATE.OK and not tool_provided_metadata.has_failed_outputs():
            final_job_state = Job.states.OK
        else:
            final_job_state = Job.states.ERROR

        version_string = ""
        if os.path.exists(COMMAND_VERSION_FILENAME):
            version_string = open(COMMAND_VERSION_FILENAME).read()

        expression_context = ExpressionContext(
            dict(stdout=tool_stdout, stderr=tool_stderr))

        # Load outputs.
        export_store = store.DirectoryModelExportStore(
            'metadata/outputs_populated',
            serialize_dataset_objects=True,
            for_edit=True,
            strip_metadata_files=False,
            serialize_jobs=False)
    try:
        import_model_store = store.imported_store_for_metadata(
            'metadata/outputs_new', object_store=object_store)
    except AssertionError:
        # Remove in 21.09, this should only happen for jobs that started on <= 20.09 and finish now
        import_model_store = None

    job_context = SessionlessJobContext(
        metadata_params,
        tool_provided_metadata,
        object_store,
        export_store,
        import_model_store,
        os.path.join(tool_job_working_directory, "working"),
        final_job_state=final_job_state,
    )

    unnamed_id_to_path = {}
    for unnamed_output_dict in job_context.tool_provided_metadata.get_unnamed_outputs():
        destination = unnamed_output_dict["destination"]
        elements = unnamed_output_dict["elements"]
        destination_type = destination["type"]
        if destination_type == 'hdas':
            for element in elements:
                filename = element.get('filename')
                if filename:
                    unnamed_id_to_path[element['object_id']] = os.path.join(
                        job_context.job_working_directory, filename)

    for output_name, output_dict in outputs.items():
        dataset_instance_id = output_dict["id"]
        klass = getattr(
            galaxy.model,
            output_dict.get('model_class', 'HistoryDatasetAssociation'))
        dataset = None
        if import_model_store:
            dataset = import_model_store.sa_session.query(klass).find(
                dataset_instance_id)
        if dataset is None:
            # legacy check for jobs that started before 21.01, remove on 21.05
            filename_in = os.path.join(f"metadata/metadata_in_{output_name}")
            import pickle
            dataset = pickle.load(open(filename_in,
                                       'rb'))  # load DatasetInstance
        assert dataset is not None

        filename_kwds = os.path.join(f"metadata/metadata_kwds_{output_name}")
        filename_out = os.path.join(f"metadata/metadata_out_{output_name}")
        filename_results_code = os.path.join(
            f"metadata/metadata_results_{output_name}")
        override_metadata = os.path.join(
            f"metadata/metadata_override_{output_name}")
        dataset_filename_override = output_dict["filename_override"]
        # pre-20.05 this was a per job parameter and not a per dataset parameter, drop in 21.XX
        legacy_object_store_store_by = metadata_params.get(
            "object_store_store_by", "id")

        # Same block as below...
        set_meta_kwds = stringify_dictionary_keys(
            json.load(open(filename_kwds))
        )  # load kwds; need to ensure our keywords are not unicode
        try:
            dataset.dataset.external_filename = unnamed_id_to_path.get(
                dataset_instance_id, dataset_filename_override)
            store_by = output_dict.get("object_store_store_by",
                                       legacy_object_store_store_by)
            extra_files_dir_name = f"dataset_{getattr(dataset.dataset, store_by)}_files"
            files_path = os.path.abspath(
                os.path.join(tool_job_working_directory, "working",
                             extra_files_dir_name))
            dataset.dataset.external_extra_files_path = files_path
            file_dict = tool_provided_metadata.get_dataset_meta(
                output_name, dataset.dataset.id, dataset.dataset.uuid)
            if 'ext' in file_dict:
                dataset.extension = file_dict['ext']
            # Metadata FileParameter types may not be writable on a cluster node, and are therefore temporarily substituted with MetadataTempFiles
            override_metadata = json.load(open(override_metadata))
            for metadata_name, metadata_file_override in override_metadata:
                if MetadataTempFile.is_JSONified_value(metadata_file_override):
                    metadata_file_override = MetadataTempFile.from_JSON(
                        metadata_file_override)
                setattr(dataset.metadata, metadata_name,
                        metadata_file_override)
            if output_dict.get("validate", False):
                set_validated_state(dataset)
            if dataset_instance_id not in unnamed_id_to_path:
                # We're going to run through set_metadata in collect_dynamic_outputs with more contextual metadata,
                # so skip set_meta here.
                set_meta(dataset, file_dict)

            if extended_metadata_collection:
                meta = tool_provided_metadata.get_dataset_meta(
                    output_name, dataset.dataset.id, dataset.dataset.uuid)
                if meta:
                    context = ExpressionContext(meta, expression_context)
                else:
                    context = expression_context

                # Lazy and unattached
                # if getattr(dataset, "hidden_beneath_collection_instance", None):
                #    dataset.visible = False
                dataset.blurb = 'done'
                dataset.peek = 'no peek'
                dataset.info = (dataset.info or '')
                if context['stdout'].strip():
                    # Ensure white space between entries
                    dataset.info = f"{dataset.info.rstrip()}\n{context['stdout'].strip()}"
                if context['stderr'].strip():
                    # Ensure white space between entries
                    dataset.info = f"{dataset.info.rstrip()}\n{context['stderr'].strip()}"
                dataset.tool_version = version_string
                dataset.set_size()
                if 'uuid' in context:
                    dataset.dataset.uuid = context['uuid']
                if dataset_filename_override and dataset_filename_override != dataset.file_name:
                    # This has to be a job with outputs_to_working_directory set.
                    # We update the object store with the created output file.
                    object_store.update_from_file(
                        dataset.dataset,
                        file_name=dataset_filename_override,
                        create=True)
                collect_extra_files(object_store, dataset, ".")
                if Job.states.ERROR == final_job_state:
                    dataset.blurb = "error"
                    dataset.mark_unhidden()
                else:
                    # If the tool was expected to set the extension, attempt to retrieve it
                    if dataset.ext == 'auto':
                        dataset.extension = context.get('ext', 'data')
                        dataset.init_meta(copy_from=dataset)

                    # This has already been done:
                    # else:
                    #     self.external_output_metadata.load_metadata(dataset, output_name, self.sa_session, working_directory=self.working_directory, remote_metadata_directory=remote_metadata_directory)
                    line_count = context.get('line_count', None)
                    try:
                        # Certain datatype's set_peek methods contain a line_count argument
                        dataset.set_peek(line_count=line_count)
                    except TypeError:
                        # ... and others don't
                        dataset.set_peek()

                for context_key in TOOL_PROVIDED_JOB_METADATA_KEYS:
                    if context_key in context:
                        context_value = context[context_key]
                        setattr(dataset, context_key, context_value)
                # We never want to persist the external_filename.
                dataset.dataset.external_filename = None
                export_store.add_dataset(dataset)
            else:
                dataset.metadata.to_JSON_dict(
                    filename_out)  # write out results of set_meta

            json.dump((True, 'Metadata has been set successfully'),
                      open(filename_results_code,
                           'wt+'))  # setting metadata has succeeded
        except Exception:
            json.dump((False, traceback.format_exc()),
                      open(filename_results_code,
                           'wt+'))  # setting metadata has failed somehow

    if extended_metadata_collection:
        # discover extra outputs...
        output_collections = {}
        for name, output_collection in metadata_params[
                "output_collections"].items():
            output_collections[name] = import_model_store.sa_session.query(
                HistoryDatasetCollectionAssociation).find(
                    output_collection["id"])
        outputs = {}
        for name, output in metadata_params["outputs"].items():
            klass = getattr(
                galaxy.model,
                output.get('model_class', 'HistoryDatasetAssociation'))
            outputs[name] = import_model_store.sa_session.query(klass).find(
                output["id"])

        input_ext = json.loads(metadata_params["job_params"].get(
            "__input_ext", '"data"'))
        collect_primary_datasets(
            job_context,
            outputs,
            input_ext=input_ext,
        )
        collect_dynamic_outputs(job_context, output_collections)

    if export_store:
        export_store._finalize()
    write_job_metadata(tool_job_working_directory, job_metadata, set_meta,
                       tool_provided_metadata)
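A small convention shared by Example #19 and Example #20: the directory holding an output's extra files is named after whatever attribute the object store keys datasets by (`id` or `uuid`), selected per output with a legacy per-job fallback. A sketch of that naming rule with a stand-in dataset object (everything here is illustrative):

import os

class StubDataset:
    # Stand-in for galaxy.model.Dataset, for illustration only.
    id = 7
    uuid = "0185a0cd-3c9a-7b7c-8000-000000000000"

store_by = "uuid"  # output_dict.get("object_store_store_by", legacy default "id")
extra_files_dir_name = f"dataset_{getattr(StubDataset, store_by)}_files"
files_path = os.path.abspath(os.path.join(os.getcwd(), "working", extra_files_dir_name))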
Example #21
0
def __main__():
    file_path = sys.argv.pop(1)
    tool_job_working_directory = tmp_dir = sys.argv.pop(1)  # this is also the job_working_directory now
    galaxy.model.Dataset.file_path = file_path
    galaxy.datatypes.metadata.MetadataTempFile.tmp_dir = tmp_dir

    config_root = sys.argv.pop(1)
    config_file_name = sys.argv.pop(1)
    if not os.path.isabs(config_file_name):
        config_file_name = os.path.join(config_root, config_file_name)

    # Set up reference to object store
    # First, read in the main config file for Galaxy; this is required because
    # the object store configuration is stored there
    conf_dict = load_app_properties(ini_file=config_file_name)
    # config object is required by ObjectStore class so create it now
    universe_config = config.Configuration(**conf_dict)
    universe_config.ensure_tempdir()
    object_store = build_object_store_from_config(universe_config)
    galaxy.model.Dataset.object_store = object_store

    # Set up datatypes registry
    datatypes_config = sys.argv.pop(1)
    datatypes_registry = galaxy.datatypes.registry.Registry()
    datatypes_registry.load_datatypes(root_dir=config_root, config=datatypes_config)
    galaxy.model.set_datatypes_registry(datatypes_registry)

    job_metadata = sys.argv.pop(1)
    existing_job_metadata_dict = {}
    new_job_metadata_dict = {}
    if job_metadata != "None" and os.path.exists(job_metadata):
        for line in open(job_metadata, "r"):
            try:
                line = stringify_dictionary_keys(json.loads(line))
                if line["type"] == "dataset":
                    existing_job_metadata_dict[line["dataset_id"]] = line
                elif line["type"] == "new_primary_dataset":
                    new_job_metadata_dict[line["filename"]] = line
            except:
                continue

    for filenames in sys.argv[1:]:
        fields = filenames.split(",")
        filename_in = fields.pop(0)
        filename_kwds = fields.pop(0)
        filename_out = fields.pop(0)
        filename_results_code = fields.pop(0)
        dataset_filename_override = fields.pop(0)
        # Need to be careful with the way that these parameters are populated from the filename splitting,
        # because if a job is running when the server is updated, any existing external metadata command-lines
        # will not have info about the newly added override_metadata file
        if fields:
            override_metadata = fields.pop(0)
        else:
            override_metadata = None
        set_meta_kwds = stringify_dictionary_keys(
            json.load(open(filename_kwds))
        )  # load kwds; need to ensure our keywords are not unicode
        try:
            dataset = cPickle.load(open(filename_in))  # load DatasetInstance
            if dataset_filename_override:
                dataset.dataset.external_filename = dataset_filename_override
            files_path = os.path.abspath(
                os.path.join(tool_job_working_directory, "dataset_%s_files" % (dataset.dataset.id))
            )
            dataset.dataset.external_extra_files_path = files_path
            if dataset.dataset.id in existing_job_metadata_dict:
                dataset.extension = existing_job_metadata_dict[dataset.dataset.id].get("ext", dataset.extension)
            # Metadata FileParameter types may not be writable on a cluster node, and are therefore temporarily substituted with MetadataTempFiles
            if override_metadata:
                override_metadata = json.load(open(override_metadata))
                for metadata_name, metadata_file_override in override_metadata:
                    if galaxy.datatypes.metadata.MetadataTempFile.is_JSONified_value(metadata_file_override):
                        metadata_file_override = galaxy.datatypes.metadata.MetadataTempFile.from_JSON(
                            metadata_file_override
                        )
                    setattr(dataset.metadata, metadata_name, metadata_file_override)
            file_dict = existing_job_metadata_dict.get(dataset.dataset.id, {})
            set_meta_with_tool_provided(dataset, file_dict, set_meta_kwds)
            dataset.metadata.to_JSON_dict(filename_out)  # write out results of set_meta
            json.dump(
                (True, "Metadata has been set successfully"), open(filename_results_code, "wb+")
            )  # setting metadata has succeeded
        except Exception as e:
            json.dump((False, str(e)), open(filename_results_code, "wb+"))  # setting metadata has failed somehow
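The legacy `__main__` variants take one comma-joined positional argument per output; `override_metadata` was appended later, so it is popped only if present, keeping command lines written by an older server parseable. An illustrative parse under that convention (paths hypothetical):

arg = "metadata_in_1,metadata_kwds_1,metadata_out_1,metadata_results_1,/data/ds_1.dat,metadata_override_1"
fields = arg.split(",")
filename_in = fields.pop(0)
filename_kwds = fields.pop(0)
filename_out = fields.pop(0)
filename_results_code = fields.pop(0)
dataset_filename_override = fields.pop(0)
override_metadata = fields.pop(0) if fields else None  # optional trailing field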
Example #22
0
def main():
    parser = optparse.OptionParser()
    parser.add_option(
        '-b', '--buffer',
        dest='buffer',
        type='int', default=1000000,
        help='Number of lines to buffer at a time. Default: 1,000,000 lines. A buffer of 0 will attempt to use memory only.'
    )
    parser.add_option(
        '-d', '--index_depth',
        dest='index_depth',
        type='int', default=3,
        help='Depth to use on filebased offset indexing. Default: 3.'
    )
    parser.add_option(
        '-p', '--keep_partial',
        action='store_true',
        dest='keep_partial',
        default=False,
        help='Keep rows in first input which are missing identifiers.')
    parser.add_option(
        '-u', '--keep_unmatched',
        action='store_true',
        dest='keep_unmatched',
        default=False,
        help='Keep rows in first input which are not joined with the second input.')
    parser.add_option(
        '-f', '--fill_options_file',
        dest='fill_options_file',
        type='str', default=None,
        help='Fill empty columns with a values from a JSONified file.')
    parser.add_option(
        '-H', '--keep_headers',
        action='store_true',
        dest='keep_headers',
        default=False,
        help='Keep the headers')

    options, args = parser.parse_args()

    fill_options = None
    if options.fill_options_file is not None:
        try:
            fill_options = Bunch(**stringify_dictionary_keys(json.load(open(options.fill_options_file))))
        except Exception as e:
            print("Warning: Ignoring fill options due to json error (%s)." % e)
    if fill_options is None:
        fill_options = Bunch()
    if 'fill_unjoined_only' not in fill_options:
        fill_options.fill_unjoined_only = True
    if 'file1_columns' not in fill_options:
        fill_options.file1_columns = None
    if 'file2_columns' not in fill_options:
        fill_options.file2_columns = None

    try:
        filename1 = args[0]
        filename2 = args[1]
        column1 = int(args[2]) - 1
        column2 = int(args[3]) - 1
        out_filename = args[4]
    except Exception:
        print("Error parsing command line.", file=sys.stderr)
        sys.exit()

    # Character for splitting fields and joining lines
    split = "\t"

    return join_files(filename1, column1, filename2, column2, out_filename, split, options.buffer, options.keep_unmatched, options.keep_partial, options.keep_headers, options.index_depth, fill_options=fill_options)
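`--fill_options_file` in Example #22 points at a JSON object whose keys are read as `Bunch` attributes; only `fill_unjoined_only`, `file1_columns`, and `file2_columns` are consulted above. A hedged example of writing such a file (the per-column fill values are assumptions):

import json

fill_options = {
    "fill_unjoined_only": True,         # only pad rows that failed to join
    "file1_columns": None,              # no fill values for the first input
    "file2_columns": ["0", "0", "NA"],  # assumed per-column fill values
}
with open("fill_options.json", "w") as f:
    json.dump(fill_options, f)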
Example #23
0
def __main__():
    file_path = sys.argv.pop( 1 )
    tmp_dir = sys.argv.pop( 1 )
    galaxy.model.Dataset.file_path = file_path
    galaxy.datatypes.metadata.MetadataTempFile.tmp_dir = tmp_dir

    config_root = sys.argv.pop( 1 )
    config_file_name = sys.argv.pop( 1 )
    if not os.path.isabs( config_file_name ):
        config_file_name = os.path.join( config_root, config_file_name )

    # Set up reference to object store
    # First, read in the main config file for Galaxy; this is required because
    # the object store configuration is stored there
    conf = ConfigParser.ConfigParser()
    conf.read(config_file_name)
    conf_dict = {}
    for section in conf.sections():
        for option in conf.options(section):
            try:
                conf_dict[option] = conf.get(section, option)
            except ConfigParser.InterpolationMissingOptionError:
                # Because this is not called from Paste Script, %(here)s variable
                # is not initialized in the config file so skip those fields -
                # just need not to use any such fields for the object store conf...
                log.debug("Did not load option %s from %s" % (option, config_file_name))
    # config object is required by ObjectStore class so create it now
    universe_config = config.Configuration(**conf_dict)
    object_store = build_object_store_from_config(universe_config)
    galaxy.model.Dataset.object_store = object_store

    # Set up datatypes registry
    datatypes_config = sys.argv.pop( 1 )
    datatypes_registry = galaxy.datatypes.registry.Registry()
    datatypes_registry.load_datatypes( root_dir=config_root, config=datatypes_config )
    galaxy.model.set_datatypes_registry( datatypes_registry )

    job_metadata = sys.argv.pop( 1 )
    ext_override = dict()
    if job_metadata != "None" and os.path.exists( job_metadata ):
        for line in open( job_metadata, 'r' ):
            try:
                line = stringify_dictionary_keys( from_json_string( line ) )
                assert line['type'] == 'dataset'
                ext_override[line['dataset_id']] = line['ext']
            except:
                continue
    for filenames in sys.argv[1:]:
        fields = filenames.split( ',' )
        filename_in = fields.pop( 0 )
        filename_kwds = fields.pop( 0 )
        filename_out = fields.pop( 0 )
        filename_results_code = fields.pop( 0 )
        dataset_filename_override = fields.pop( 0 )
        # Need to be careful with the way that these parameters are populated from the filename splitting,
        # because if a job is running when the server is updated, any existing external metadata command-lines
        # will not have info about the newly added override_metadata file
        if fields:
            override_metadata = fields.pop( 0 )
        else:
            override_metadata = None
        try:
            dataset = cPickle.load( open( filename_in ) )  # load DatasetInstance
            if dataset_filename_override:
                dataset.dataset.external_filename = dataset_filename_override
            if ext_override.get( dataset.dataset.id, None ):
                dataset.extension = ext_override[ dataset.dataset.id ]
            # Metadata FileParameter types may not be writable on a cluster node, and are therefore temporarily substituted with MetadataTempFiles
            if override_metadata:
                override_metadata = json.load( open( override_metadata ) )
                for metadata_name, metadata_file_override in override_metadata:
                    if galaxy.datatypes.metadata.MetadataTempFile.is_JSONified_value( metadata_file_override ):
                        metadata_file_override = galaxy.datatypes.metadata.MetadataTempFile.from_JSON( metadata_file_override )
                    setattr( dataset.metadata, metadata_name, metadata_file_override )
            kwds = stringify_dictionary_keys( json.load( open( filename_kwds ) ) )  # load kwds; need to ensure our keywords are not unicode
            dataset.datatype.set_meta( dataset, **kwds )
            dataset.metadata.to_JSON_dict( filename_out )  # write out results of set_meta
            json.dump( ( True, 'Metadata has been set successfully' ), open( filename_results_code, 'wb+' ) )  # setting metadata has succeeded
        except Exception as e:
            json.dump( ( False, str( e ) ), open( filename_results_code, 'wb+' ) )  # setting metadata has failed somehow
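Several variants above parse `job_metadata` as JSON lines dispatched on a `type` field, with `dataset` lines keyed by `dataset_id` and `new_primary_dataset` lines keyed by `filename`. A sketch of the two line shapes those loops recognize (field values illustrative):

import json

lines = [
    {"type": "dataset", "dataset_id": 7, "ext": "bam"},
    {"type": "new_primary_dataset", "filename": "primary_7_report_visible_txt", "ext": "txt"},
]
with open("galaxy.json", "w") as f:  # filename assumed; passed in as job_metadata
    for entry in lines:
        f.write(json.dumps(entry) + "\n")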
Example #24
0
 def from_JSON(cls, json_dict):
     # need to ensure our keywords are not unicode
     rval = cls(**stringify_dictionary_keys(json_dict['kwds']))
     rval._filename = json_dict['filename']
     return rval
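To make the `from_JSON` classmethod concrete, here is a self-contained round trip under an assumed `to_JSON` producing the `filename`/`kwds` dict the method consumes; the real MetadataTempFile also embeds a class marker that `is_JSONified_value` checks, omitted here:

class MetadataTempFile(object):
    def __init__(self, **kwds):
        self.kwds = kwds
        self._filename = None

    def to_JSON(self):
        # Assumed inverse of from_JSON, for illustration.
        return {'filename': self._filename, 'kwds': self.kwds}

    @classmethod
    def from_JSON(cls, json_dict):
        # need to ensure our keywords are not unicode
        rval = cls(**dict((str(k), v) for k, v in json_dict['kwds'].items()))
        rval._filename = json_dict['filename']
        return rval

original = MetadataTempFile(index=4)
original._filename = 'metadata_temp_1.dat'
restored = MetadataTempFile.from_JSON(original.to_JSON())
assert restored._filename == original._filename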
Example #25
0
 def from_JSON( cls, json_dict ):
     # need to ensure our keywords are not unicode
     rval = cls( **stringify_dictionary_keys( json_dict['kwds'] ) )
     rval._filename = json_dict['filename']
     return rval
Example #26
0
def set_metadata():
    # locate galaxy_root for loading datatypes
    galaxy_root = os.path.abspath(
        os.path.join(os.path.dirname(__file__), os.pardir, os.pardir,
                     os.pardir))
    import galaxy.model
    galaxy.model.metadata.MetadataTempFile.tmp_dir = tool_job_working_directory = os.path.abspath(
        os.getcwd())

    # This is ugly, but to transition from existing jobs without this parameter
    # to ones with, smoothly, it has to be the last optional parameter and we
    # have to sniff it.
    try:
        max_metadata_value_size = int(sys.argv[-1])
        sys.argv = sys.argv[:-1]
    except ValueError:
        max_metadata_value_size = 0
        # max_metadata_value_size is unspecified and should be 0

    # Set up datatypes registry
    datatypes_config = sys.argv.pop(1)
    if not os.path.exists(datatypes_config):
        # This path should exist, except for jobs that started running on release 17.05, where a global
        # datatypes_config (instead of a datatypes_config per job) was used. For a while release 17.05
        # would remove the global datatypes config on shutdown and toolbox reload, which would lead to
        # failed metadata jobs. To remedy this we scan jobs at startup for missing registry.xml files,
        # and if we detect such a job we write out the current registry.xml file.
        datatypes_config = os.path.join(tool_job_working_directory,
                                        "registry.xml")
        if not os.path.exists(datatypes_config):
            print(
                "Metadata setting failed because registry.xml could not be found. You may retry setting metadata."
            )
            sys.exit(1)
    import galaxy.datatypes.registry
    datatypes_registry = galaxy.datatypes.registry.Registry()
    datatypes_registry.load_datatypes(root_dir=galaxy_root,
                                      config=datatypes_config)
    galaxy.model.set_datatypes_registry(datatypes_registry)

    job_metadata = sys.argv.pop(1)
    existing_job_metadata_dict = {}
    new_job_metadata_dict = {}
    if job_metadata != "None" and os.path.exists(job_metadata):
        for line in open(job_metadata, 'r'):
            try:
                line = stringify_dictionary_keys(json.loads(line))
                if line['type'] == 'dataset':
                    existing_job_metadata_dict[line['dataset_id']] = line
                elif line['type'] == 'new_primary_dataset':
                    new_job_metadata_dict[line['filename']] = line
            except Exception:
                continue

    for filenames in sys.argv[1:]:
        fields = filenames.split(',')
        filename_in = fields.pop(0)
        filename_kwds = fields.pop(0)
        filename_out = fields.pop(0)
        filename_results_code = fields.pop(0)
        dataset_filename_override = fields.pop(0)
        # Need to be careful with the way that these parameters are populated from the filename splitting,
        # because if a job is running when the server is updated, any existing external metadata command-lines
        # will not have info about the newly added override_metadata file
        if fields:
            override_metadata = fields.pop(0)
        else:
            override_metadata = None
        # load kwds; need to ensure our keywords are not unicode
        set_meta_kwds = stringify_dictionary_keys(json.load(open(filename_kwds)))
        try:
            dataset = cPickle.load(open(filename_in, 'rb'))  # load DatasetInstance
            dataset.dataset.external_filename = dataset_filename_override
            files_path = os.path.abspath(
                os.path.join(tool_job_working_directory,
                             "dataset_%s_files" % (dataset.dataset.id)))
            dataset.dataset.external_extra_files_path = files_path
            if dataset.dataset.id in existing_job_metadata_dict:
                dataset.extension = existing_job_metadata_dict[dataset.dataset.id].get('ext', dataset.extension)
            # Metadata FileParameter types may not be writable on a cluster node, and are therefore temporarily substituted with MetadataTempFiles
            if override_metadata:
                override_metadata = json.load(open(override_metadata))
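                # override_metadata is a JSON list of (metadata_name, value)
                # pairs; a value may be a JSONified MetadataTempFile payload,
                # which is detected and rehydrated below.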
                for metadata_name, metadata_file_override in override_metadata:
                    if galaxy.datatypes.metadata.MetadataTempFile.is_JSONified_value(metadata_file_override):
                        metadata_file_override = galaxy.datatypes.metadata.MetadataTempFile.from_JSON(metadata_file_override)
                    setattr(dataset.metadata, metadata_name, metadata_file_override)
            file_dict = existing_job_metadata_dict.get(dataset.dataset.id, {})
            set_meta_with_tool_provided(dataset, file_dict, set_meta_kwds,
                                        datatypes_registry)
            if max_metadata_value_size:
                for k, v in list(dataset.metadata.items()):
                    if total_size(v) > max_metadata_value_size:
                        log.info("Key %s too large for metadata, discarding" % k)
                        dataset.metadata.remove_key(k)
            dataset.metadata.to_JSON_dict(filename_out)  # write out results of set_meta
            json.dump((True, 'Metadata has been set successfully'),
                      open(filename_results_code, 'wt+'))  # setting metadata has succeeded
        except Exception as e:
            json.dump((False, str(e)),
                      open(filename_results_code, 'wt+'))  # setting metadata has failed somehow

    for i, (filename, file_dict) in enumerate(new_job_metadata_dict.items(),
                                              start=1):
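        # Build placeholder model objects below with negative ids, marking them
        # as not backed by real database rows.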
        new_dataset_filename = os.path.join(tool_job_working_directory,
                                            "working", file_dict['filename'])
        new_dataset = galaxy.model.Dataset(
            id=-i, external_filename=new_dataset_filename)
        extra_files = file_dict.get('extra_files', None)
        if extra_files is not None:
            new_dataset._extra_files_path = os.path.join(
                tool_job_working_directory, "working", extra_files)
        new_dataset.state = new_dataset.states.OK
        new_dataset_instance = galaxy.model.HistoryDatasetAssociation(
            id=-i, dataset=new_dataset, extension=file_dict.get('ext', 'data'))
        set_meta_with_tool_provided(new_dataset_instance, file_dict,
                                    set_meta_kwds, datatypes_registry)
        # storing metadata in external form; turn it back into a dict here, jsonify again later
        file_dict['metadata'] = json.loads(new_dataset_instance.metadata.to_JSON_dict())
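    # Rewrite the job_metadata file so the consumer of this job sees the
    # metadata collected above for both existing and new datasets.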
    if existing_job_metadata_dict or new_job_metadata_dict:
        with open(job_metadata, 'wt') as job_metadata_fh:
            for value in list(existing_job_metadata_dict.values()) + list(new_job_metadata_dict.values()):
                job_metadata_fh.write("%s\n" % (json.dumps(value)))

    clear_mappers()
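For orientation, a sketch of how this script might be invoked; the script name and every file name below are illustrative assumptions. Only the positional layout (datatypes config, job metadata file, one comma-joined tuple per dataset, optional trailing size limit) is taken from the argument parsing above.

# Hypothetical invocation of the set_metadata script; all paths are made up.
import subprocess

subprocess.check_call([
    'python', 'set_metadata.py',
    'registry.xml',  # datatypes_config, popped first
    'galaxy.json',   # job_metadata, popped second
    # one comma-joined tuple per dataset to process:
    'metadata_in,metadata_kwds,metadata_out,metadata_results,/data/dataset_1.dat,metadata_override',
    '5242880',       # optional trailing max_metadata_value_size in bytes, sniffed via int()
])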