def set_metadata():
    # locate galaxy_root for loading datatypes
    galaxy_root = os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir, os.pardir, os.pardir))
    galaxy.datatypes.metadata.MetadataTempFile.tmp_dir = tool_job_working_directory = os.path.abspath(os.getcwd())

    # Set up datatypes registry
    datatypes_config = sys.argv.pop( 1 )
    datatypes_registry = galaxy.datatypes.registry.Registry()
    datatypes_registry.load_datatypes( root_dir=galaxy_root, config=datatypes_config )
    galaxy.model.set_datatypes_registry( datatypes_registry )

    job_metadata = sys.argv.pop( 1 )
    existing_job_metadata_dict = {}
    new_job_metadata_dict = {}
    if job_metadata != "None" and os.path.exists( job_metadata ):
        for line in open( job_metadata, 'r' ):
            try:
                line = stringify_dictionary_keys( json.loads( line ) )
                if line['type'] == 'dataset':
                    existing_job_metadata_dict[ line['dataset_id'] ] = line
                elif line['type'] == 'new_primary_dataset':
                    new_job_metadata_dict[ line[ 'filename' ] ] = line
            except:
                continue

    for filenames in sys.argv[1:]:
        fields = filenames.split( ',' )
        filename_in = fields.pop( 0 )
        filename_kwds = fields.pop( 0 )
        filename_out = fields.pop( 0 )
        filename_results_code = fields.pop( 0 )
        dataset_filename_override = fields.pop( 0 )
        # Need to be careful with the way that these parameters are populated from the filename splitting,
        # because if a job is running when the server is updated, any existing external metadata command-lines
        # will not have info about the newly added override_metadata file
        if fields:
            override_metadata = fields.pop( 0 )
        else:
            override_metadata = None
        set_meta_kwds = stringify_dictionary_keys( json.load( open( filename_kwds ) ) )  # load kwds; need to ensure our keywords are not unicode
        try:
            dataset = cPickle.load( open( filename_in ) )  # load DatasetInstance
            dataset.dataset.external_filename = dataset_filename_override
            files_path = os.path.abspath(os.path.join( tool_job_working_directory, "dataset_%s_files" % (dataset.dataset.id) ))
            dataset.dataset.external_extra_files_path = files_path
            if dataset.dataset.id in existing_job_metadata_dict:
                dataset.extension = existing_job_metadata_dict[ dataset.dataset.id ].get( 'ext', dataset.extension )
            # Metadata FileParameter types may not be writable on a cluster node, and are therefore temporarily substituted with MetadataTempFiles
            if override_metadata:
                override_metadata = json.load( open( override_metadata ) )
                for metadata_name, metadata_file_override in override_metadata:
                    if galaxy.datatypes.metadata.MetadataTempFile.is_JSONified_value( metadata_file_override ):
                        metadata_file_override = galaxy.datatypes.metadata.MetadataTempFile.from_JSON( metadata_file_override )
                    setattr( dataset.metadata, metadata_name, metadata_file_override )
            file_dict = existing_job_metadata_dict.get( dataset.dataset.id, {} )
            set_meta_with_tool_provided( dataset, file_dict, set_meta_kwds, datatypes_registry )
            dataset.metadata.to_JSON_dict( filename_out )  # write out results of set_meta
            json.dump( ( True, 'Metadata has been set successfully' ), open( filename_results_code, 'wb+' ) )  # setting metadata has succeeded
        except Exception, e:
            json.dump( ( False, str( e ) ), open( filename_results_code, 'wb+' ) )  # setting metadata has failed somehow
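# The loop above expects each remaining argv entry to be a comma-separated field list:
# pickled dataset, kwds JSON, metadata output JSON, results-code JSON, dataset filename
# override, and an optional trailing override-metadata path. A minimal sketch of how a
# caller might assemble one such entry; the file names are hypothetical, only the field
# order is taken from the parsing code above.
def build_metadata_arg(prefix, dataset_path, override_metadata_path=None):
    fields = [
        "%s_in" % prefix,        # pickled DatasetInstance (filename_in)
        "%s_kwds" % prefix,      # JSON kwds passed to set_meta (filename_kwds)
        "%s_out" % prefix,       # where set_meta results are written (filename_out)
        "%s_results" % prefix,   # (ok, message) JSON status file (filename_results_code)
        dataset_path,            # dataset_filename_override
    ]
    if override_metadata_path:   # optional trailing field, see the comment in the loop above
        fields.append(override_metadata_path)
    return ",".join(fields)

# e.g. build_metadata_arg("metadata_files_0", "/data/dataset_42.dat")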
def __main__():
    file_path = sys.argv.pop( 1 )
    tmp_dir = sys.argv.pop( 1 )
    galaxy.model.Dataset.file_path = file_path
    galaxy.datatypes.metadata.MetadataTempFile.tmp_dir = tmp_dir

    # Set up datatypes registry
    config_root = sys.argv.pop( 1 )
    datatypes_config = sys.argv.pop( 1 )
    galaxy.model.set_datatypes_registry( galaxy.datatypes.registry.Registry( config_root, datatypes_config ) )

    job_metadata = sys.argv.pop( 1 )
    ext_override = dict()
    if job_metadata != "None" and os.path.exists( job_metadata ):
        for line in open( job_metadata, 'r' ):
            try:
                line = stringify_dictionary_keys( from_json_string( line ) )
                assert line['type'] == 'dataset'
                ext_override[line['dataset_id']] = line['ext']
            except:
                continue
    for filenames in sys.argv[1:]:
        fields = filenames.split( ',' )
        filename_in = fields.pop( 0 )
        filename_kwds = fields.pop( 0 )
        filename_out = fields.pop( 0 )
        filename_results_code = fields.pop( 0 )
        dataset_filename_override = fields.pop( 0 )
        # Need to be careful with the way that these parameters are populated from the filename splitting,
        # because if a job is running when the server is updated, any existing external metadata command-lines
        # will not have info about the newly added override_metadata file
        if fields:
            override_metadata = fields.pop( 0 )
        else:
            override_metadata = None
        try:
            dataset = cPickle.load( open( filename_in ) )  # load DatasetInstance
            if dataset_filename_override:
                dataset.dataset.external_filename = dataset_filename_override
            if ext_override.get( dataset.dataset.id, None ):
                dataset.extension = ext_override[ dataset.dataset.id ]
            # Metadata FileParameter types may not be writable on a cluster node, and are therefore temporarily substituted with MetadataTempFiles
            if override_metadata:
                override_metadata = simplejson.load( open( override_metadata ) )
                for metadata_name, metadata_file_override in override_metadata:
                    if galaxy.datatypes.metadata.MetadataTempFile.is_JSONified_value( metadata_file_override ):
                        metadata_file_override = galaxy.datatypes.metadata.MetadataTempFile.from_JSON( metadata_file_override )
                    setattr( dataset.metadata, metadata_name, metadata_file_override )
            kwds = stringify_dictionary_keys( simplejson.load( open( filename_kwds ) ) )  # load kwds; need to ensure our keywords are not unicode
            dataset.datatype.set_meta( dataset, **kwds )
            dataset.metadata.to_JSON_dict( filename_out )  # write out results of set_meta
            simplejson.dump( ( True, 'Metadata has been set successfully' ), open( filename_results_code, 'wb+' ) )  # setting metadata has succeeded
        except Exception, e:
            simplejson.dump( ( False, str( e ) ), open( filename_results_code, 'wb+' ) )  # setting metadata has failed somehow
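# For this variant, the positional arguments popped above are, in order: file_path, tmp_dir,
# config_root, datatypes_config, job_metadata, followed by one comma-separated entry per
# dataset. A hedged sketch of such an invocation; the script name and all paths are
# illustrative only, the argument order comes from the pops above.
import subprocess
import sys

def example_invocation():
    argv = [
        sys.executable, "set_metadata.py",  # hypothetical script name
        "/galaxy/database/files",            # file_path
        "/galaxy/database/tmp",              # tmp_dir
        "/galaxy",                           # config_root
        "datatypes_conf.xml",                # datatypes_config
        "None",                              # job_metadata ("None" disables ext overrides)
        "md_in_1,md_kwds_1,md_out_1,md_results_1,/data/dataset_1.dat",
    ]
    return subprocess.list2cmdline(argv)     # render the command line for inspection only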
def set_metadata_portable():
    import galaxy.model
    tool_job_working_directory = os.path.abspath(os.getcwd())
    metadata_tmp_files_dir = os.path.join(tool_job_working_directory, "metadata")
    galaxy.model.metadata.MetadataTempFile.tmp_dir = metadata_tmp_files_dir

    metadata_params_path = os.path.join("metadata", "params.json")
    try:
        with open(metadata_params_path, "r") as f:
            metadata_params = json.load(f)
    except IOError:
        raise Exception("Failed to find metadata/params.json from cwd [%s]" % tool_job_working_directory)
    datatypes_config = metadata_params["datatypes_config"]
    job_metadata = metadata_params["job_metadata"]
    max_metadata_value_size = metadata_params.get("max_metadata_value_size") or 0
    outputs = metadata_params["outputs"]

    datatypes_registry = validate_and_load_datatypes_config(datatypes_config)
    tool_provided_metadata = load_job_metadata(job_metadata)

    def set_meta(new_dataset_instance, file_dict):
        set_meta_with_tool_provided(new_dataset_instance, file_dict, set_meta_kwds, datatypes_registry, max_metadata_value_size)

    for output_name, output_dict in outputs.items():
        filename_in = os.path.join("metadata/metadata_in_%s" % output_name)
        filename_kwds = os.path.join("metadata/metadata_kwds_%s" % output_name)
        filename_out = os.path.join("metadata/metadata_out_%s" % output_name)
        filename_results_code = os.path.join("metadata/metadata_results_%s" % output_name)
        override_metadata = os.path.join("metadata/metadata_override_%s" % output_name)
        dataset_filename_override = output_dict["filename_override"]

        # Same block as below...
        set_meta_kwds = stringify_dictionary_keys(json.load(open(filename_kwds)))  # load kwds; need to ensure our keywords are not unicode
        try:
            dataset = cPickle.load(open(filename_in, 'rb'))  # load DatasetInstance
            dataset.dataset.external_filename = dataset_filename_override
            store_by = metadata_params.get("object_store_store_by", "id")
            extra_files_dir_name = "dataset_%s_files" % getattr(dataset.dataset, store_by)
            files_path = os.path.abspath(os.path.join(tool_job_working_directory, extra_files_dir_name))
            dataset.dataset.external_extra_files_path = files_path
            file_dict = tool_provided_metadata.get_dataset_meta(output_name, dataset.dataset.id)
            if 'ext' in file_dict:
                dataset.extension = file_dict['ext']
            # Metadata FileParameter types may not be writable on a cluster node, and are therefore temporarily substituted with MetadataTempFiles
            override_metadata = json.load(open(override_metadata))
            for metadata_name, metadata_file_override in override_metadata:
                if galaxy.datatypes.metadata.MetadataTempFile.is_JSONified_value(metadata_file_override):
                    metadata_file_override = galaxy.datatypes.metadata.MetadataTempFile.from_JSON(metadata_file_override)
                setattr(dataset.metadata, metadata_name, metadata_file_override)
            if output_dict.get("validate", False):
                set_validated_state(dataset)
            set_meta(dataset, file_dict)
            dataset.metadata.to_JSON_dict(filename_out)  # write out results of set_meta
            json.dump((True, 'Metadata has been set successfully'), open(filename_results_code, 'wt+'))  # setting metadata has succeeded
        except Exception:
            json.dump((False, traceback.format_exc()), open(filename_results_code, 'wt+'))  # setting metadata has failed somehow

    write_job_metadata(tool_job_working_directory, job_metadata, set_meta, tool_provided_metadata)
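# set_metadata_portable() drives everything from metadata/params.json rather than argv.
# A minimal sketch of such a file, inferred from the keys read above; the values are
# illustrative and "object_store_store_by" simply defaults to "id" when absent.
import json
import os

def write_example_params(directory="metadata"):
    params = {
        "datatypes_config": "registry.xml",
        "job_metadata": "working/galaxy.json",
        "max_metadata_value_size": 0,
        "object_store_store_by": "id",
        "outputs": {
            "output1": {"filename_override": "/data/dataset_1.dat", "validate": False},
        },
    }
    os.makedirs(directory, exist_ok=True)
    with open(os.path.join(directory, "params.json"), "w") as f:
        json.dump(params, f, indent=2)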
def set_metadata_legacy():
    import galaxy.model
    galaxy.model.metadata.MetadataTempFile.tmp_dir = tool_job_working_directory = os.path.abspath(os.getcwd())

    # This is ugly, but to transition from existing jobs without this parameter
    # to ones with, smoothly, it has to be the last optional parameter and we
    # have to sniff it.
    try:
        max_metadata_value_size = int(sys.argv[-1])
        sys.argv = sys.argv[:-1]
    except ValueError:
        max_metadata_value_size = 0
        # max_metadata_value_size is unspecified and should be 0

    # Set up datatypes registry
    datatypes_config = sys.argv.pop(1)
    datatypes_registry = validate_and_load_datatypes_config(datatypes_config)
    job_metadata = sys.argv.pop(1)
    tool_provided_metadata = load_job_metadata(job_metadata, None)

    def set_meta(new_dataset_instance, file_dict):
        set_meta_with_tool_provided(new_dataset_instance, file_dict, set_meta_kwds, datatypes_registry, max_metadata_value_size)

    for filenames in sys.argv[1:]:
        fields = filenames.split(',')
        filename_in = fields.pop(0)
        filename_kwds = fields.pop(0)
        filename_out = fields.pop(0)
        filename_results_code = fields.pop(0)
        dataset_filename_override = fields.pop(0)
        override_metadata = fields.pop(0)
        set_meta_kwds = stringify_dictionary_keys(json.load(open(filename_kwds)))  # load kwds; need to ensure our keywords are not unicode
        try:
            dataset = cPickle.load(open(filename_in, 'rb'))  # load DatasetInstance
            dataset.dataset.external_filename = dataset_filename_override
            store_by = "id"
            extra_files_dir_name = "dataset_%s_files" % getattr(dataset.dataset, store_by)
            files_path = os.path.abspath(os.path.join(tool_job_working_directory, "working", extra_files_dir_name))
            dataset.dataset.external_extra_files_path = files_path
            file_dict = tool_provided_metadata.get_dataset_meta(None, dataset.dataset.id, dataset.dataset.uuid)
            if 'ext' in file_dict:
                dataset.extension = file_dict['ext']
            # Metadata FileParameter types may not be writable on a cluster node, and are therefore temporarily substituted with MetadataTempFiles
            override_metadata = json.load(open(override_metadata))
            for metadata_name, metadata_file_override in override_metadata:
                if galaxy.datatypes.metadata.MetadataTempFile.is_JSONified_value(metadata_file_override):
                    metadata_file_override = galaxy.datatypes.metadata.MetadataTempFile.from_JSON(metadata_file_override)
                setattr(dataset.metadata, metadata_name, metadata_file_override)
            set_meta(dataset, file_dict)
            dataset.metadata.to_JSON_dict(filename_out)  # write out results of set_meta
            json.dump((True, 'Metadata has been set successfully'), open(filename_results_code, 'wt+'))  # setting metadata has succeeded
        except Exception as e:
            json.dump((False, unicodify(e)), open(filename_results_code, 'wt+'))  # setting metadata has failed somehow

    write_job_metadata(tool_job_working_directory, job_metadata, set_meta, tool_provided_metadata)
def main():
    parser = optparse.OptionParser()
    parser.add_option(
        "-b", "--buffer",
        dest="buffer",
        type="int",
        default=1000000,
        help="Number of lines to buffer at a time. Default: 1,000,000 lines. A buffer of 0 will attempt to use memory only.",
    )
    parser.add_option(
        "-d", "--index_depth",
        dest="index_depth",
        type="int",
        default=3,
        help="Depth to use on filebased offset indexing. Default: 3.",
    )
    parser.add_option(
        "-p", "--keep_partial",
        action="store_true",
        dest="keep_partial",
        default=False,
        help="Keep rows in first input which are missing identifiers.",
    )
    parser.add_option(
        "-u", "--keep_unmatched",
        action="store_true",
        dest="keep_unmatched",
        default=False,
        help="Keep rows in first input which are not joined with the second input.",
    )
    parser.add_option(
        "-f", "--fill_options_file",
        dest="fill_options_file",
        type="str",
        default=None,
        help="Fill empty columns with a values from a JSONified file.",
    )
    options, args = parser.parse_args()
    fill_options = None
    if options.fill_options_file is not None:
        try:
            fill_options = Bunch(
                **stringify_dictionary_keys(json.load(open(options.fill_options_file)))
            )  # json.load( open( options.fill_options_file ) )
        except Exception, e:
            print "Warning: Ignoring fill options due to json error (%s)." % e
def main():
    parser = optparse.OptionParser()
    parser.add_option(
        '-b', '--buffer',
        dest='buffer',
        type='int',
        default=1000000,
        help='Number of lines to buffer at a time. Default: 1,000,000 lines. A buffer of 0 will attempt to use memory only.')
    parser.add_option(
        '-d', '--index_depth',
        dest='index_depth',
        type='int',
        default=3,
        help='Depth to use on filebased offset indexing. Default: 3.')
    parser.add_option(
        '-p', '--keep_partial',
        action='store_true',
        dest='keep_partial',
        default=False,
        help='Keep rows in first input which are missing identifiers.')
    parser.add_option(
        '-u', '--keep_unmatched',
        action='store_true',
        dest='keep_unmatched',
        default=False,
        help='Keep rows in first input which are not joined with the second input.')
    parser.add_option(
        '-f', '--fill_options_file',
        dest='fill_options_file',
        type='str',
        default=None,
        help='Fill empty columns with a values from a JSONified file.')
    options, args = parser.parse_args()
    fill_options = None
    if options.fill_options_file is not None:
        try:
            fill_options = Bunch(**stringify_dictionary_keys(
                json.load(open(options.fill_options_file))
            ))  # json.load( open( options.fill_options_file ) )
        except Exception, e:
            print "Warning: Ignoring fill options due to json error (%s)." % e
def __init__(self, meta_file, job_wrapper=None):
    self.meta_file = meta_file
    self.tool_provided_job_metadata = []
    with open(meta_file, 'r') as f:
        for line in f:
            try:
                line = stringify_dictionary_keys(json.loads(line))
                assert 'type' in line
            except Exception:
                log.exception('(%s) Got JSON data from tool, but data is improperly formatted or no "type" key in data' % job_wrapper.job_id)
                log.debug('Offending data was: %s' % line)
                continue
            # Set the dataset id if it's a dataset entry and isn't set.
            # This isn't insecure. We loop the job's output datasets in
            # the finish method, so if a tool writes out metadata for a
            # dataset id that it doesn't own, it'll just be ignored.
            dataset_id_not_specified = line['type'] == 'dataset' and 'dataset_id' not in line
            if dataset_id_not_specified:
                dataset_basename = line['dataset']
                if job_wrapper:
                    try:
                        line['dataset_id'] = job_wrapper.get_output_file_id(dataset_basename)
                    except KeyError:
                        log.warning('(%s) Tool provided job dataset-specific metadata without specifying a dataset' % job_wrapper.job_id)
                        continue
                else:
                    match = re.match(r'(galaxy_)?dataset_(.*)\.dat', dataset_basename)
                    if match is None:
                        raise Exception("processing tool_provided_metadata (e.g. galaxy.json) entry with invalid dataset name [%s]" % dataset_basename)
                    dataset_id = match.group(2)
                    if dataset_id.isdigit():
                        line['dataset_id'] = dataset_id
                    else:
                        line['dataset_uuid'] = dataset_id
            self.tool_provided_job_metadata.append(line)
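# The constructor above consumes one JSON object per line of the tool-provided metadata
# file (galaxy.json). A hedged sketch of the two entry shapes it handles (all values are
# illustrative), plus the basename fallback used when no job_wrapper is supplied:
import json
import re

example_lines = [
    json.dumps({"type": "dataset", "dataset_id": 42, "ext": "tabular"}),
    json.dumps({"type": "new_primary_dataset", "filename": "primary_1.dat", "ext": "txt"}),
    json.dumps({"type": "dataset", "dataset": "galaxy_dataset_7.dat"}),  # id parsed from the name
]

def parse_dataset_basename(dataset_basename):
    # Same pattern as above: numeric ids become dataset_id, anything else dataset_uuid.
    match = re.match(r'(galaxy_)?dataset_(.*)\.dat', dataset_basename)
    if match is None:
        raise ValueError("invalid dataset name [%s]" % dataset_basename)
    dataset_id = match.group(2)
    return ("dataset_id", dataset_id) if dataset_id.isdigit() else ("dataset_uuid", dataset_id)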
def __main__():
    file_path = sys.argv.pop( 1 )
    tmp_dir = sys.argv.pop( 1 )
    galaxy.model.Dataset.file_path = file_path
    galaxy.datatypes.metadata.MetadataTempFile.tmp_dir = tmp_dir
    for filenames in sys.argv[1:]:
        filename_in, filename_kwds, filename_out, filename_results_code, dataset_filename_override = filenames.split( ',' )
        try:
            dataset = cPickle.load( open( filename_in ) )  # load DatasetInstance
            if dataset_filename_override:
                dataset.dataset.external_filename = dataset_filename_override
            kwds = stringify_dictionary_keys( simplejson.load( open( filename_kwds ) ) )  # load kwds; need to ensure our keywords are not unicode
            dataset.datatype.set_meta( dataset, **kwds )
            dataset.metadata.to_JSON_dict( filename_out )  # write out results of set_meta
            simplejson.dump( ( True, 'Metadata has been set successfully' ), open( filename_results_code, 'wb+' ) )  # setting metadata has succeeded
        except Exception, e:
            simplejson.dump( ( False, str( e ) ), open( filename_results_code, 'wb+' ) )  # setting metadata has failed somehow
def main():
    parser = optparse.OptionParser()
    parser.add_option( '-b', '--buffer', dest='buffer', type='int', default=1000000,
                       help='Number of lines to buffer at a time. Default: 1,000,000 lines. A buffer of 0 will attempt to use memory only.' )
    parser.add_option( '-d', '--index_depth', dest='index_depth', type='int', default=3,
                       help='Depth to use on filebased offset indexing. Default: 3.' )
    parser.add_option( '-p', '--keep_partial', action='store_true', dest='keep_partial', default=False,
                       help='Keep rows in first input which are missing identifiers.' )
    parser.add_option( '-u', '--keep_unmatched', action='store_true', dest='keep_unmatched', default=False,
                       help='Keep rows in first input which are not joined with the second input.' )
    parser.add_option( '-f', '--fill_options_file', dest='fill_options_file', type='str', default=None,
                       help='Fill empty columns with a values from a JSONified file.' )
    options, args = parser.parse_args()
    fill_options = None
    if options.fill_options_file is not None:
        try:
            if simplejson is None:
                raise simplejson_exception
            fill_options = Bunch( **stringify_dictionary_keys( simplejson.load( open( options.fill_options_file ) ) ) )  # simplejson.load( open( options.fill_options_file ) )
        except Exception, e:
            print "Warning: Ignoring fill options due to simplejson error (%s)." % e
def __main__():
    parser = optparse.OptionParser()
    parser.add_option( '-o', '--output', dest='output', help='The name of the output file' )
    parser.add_option( '-1', '--input1', dest='input1', help='The name of the first input file' )
    parser.add_option( '-2', '--input2', dest='input2', help='The name of the second input file' )
    parser.add_option( '-g', '--hinge', dest='hinge', help='The "hinge" to use (the value to compare)' )
    parser.add_option( '-c', '--columns', dest='columns', help='The columns to include in the output file' )
    parser.add_option( '-f', '--fill_options_file', dest='fill_options_file', default=None, help='The file specifying the fill value to use' )
    (options, args) = parser.parse_args()
    hinge = int( options.hinge )
    cols = [ int( c ) for c in str( options.columns ).split( ',' ) if int( c ) > hinge ]
    inputs = [ options.input1, options.input2 ]
    if options.fill_options_file == 'None':
        inputs.extend( args )
    elif len( args ) > 0:
        inputs.extend( args )
    fill_options = None
    if options.fill_options_file != 'None' and options.fill_options_file is not None:
        try:
            if simplejson is None:
                raise simplejson_exception
            fill_options = Bunch( **stringify_dictionary_keys( simplejson.load( open( options.fill_options_file ) ) ) )
        except Exception, e:
            print 'Warning: Ignoring fill options due to simplejson error (%s).' % e
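# In the hinge-join variant above, "hinge" is the number of leading key columns and only
# requested columns past the hinge are kept. A small worked example of that filter
# (column numbers are 1-based, as passed on the command line):
def columns_past_hinge(columns_csv, hinge):
    return [int(c) for c in str(columns_csv).split(',') if int(c) > hinge]

# columns_past_hinge("1,2,4,6", 3) -> [4, 6]; columns 1 and 2 fall inside the hinge keys.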
def __main__():
    file_path = sys.argv.pop(1)
    tool_job_working_directory = tmp_dir = sys.argv.pop(1)  # this is also the job_working_directory now
    galaxy.model.Dataset.file_path = file_path
    galaxy.datatypes.metadata.MetadataTempFile.tmp_dir = tmp_dir

    config_root = sys.argv.pop(1)
    config_file_name = sys.argv.pop(1)
    if not os.path.isabs(config_file_name):
        config_file_name = os.path.join(config_root, config_file_name)

    # Set up reference to object store
    # First, read in the main config file for Galaxy; this is required because
    # the object store configuration is stored there
    conf_dict = load_app_properties(ini_file=config_file_name)
    # config object is required by ObjectStore class so create it now
    universe_config = config.Configuration(**conf_dict)
    universe_config.ensure_tempdir()
    object_store = build_object_store_from_config(universe_config)
    galaxy.model.Dataset.object_store = object_store

    # Set up datatypes registry
    datatypes_config = sys.argv.pop(1)
    datatypes_registry = galaxy.datatypes.registry.Registry()
    datatypes_registry.load_datatypes(root_dir=config_root, config=datatypes_config)
    galaxy.model.set_datatypes_registry(datatypes_registry)

    job_metadata = sys.argv.pop(1)
    existing_job_metadata_dict = {}
    new_job_metadata_dict = {}
    if job_metadata != "None" and os.path.exists(job_metadata):
        for line in open(job_metadata, 'r'):
            try:
                line = stringify_dictionary_keys(json.loads(line))
                if line['type'] == 'dataset':
                    existing_job_metadata_dict[line['dataset_id']] = line
                elif line['type'] == 'new_primary_dataset':
                    new_job_metadata_dict[line['filename']] = line
            except:
                continue

    for filenames in sys.argv[1:]:
        fields = filenames.split(',')
        filename_in = fields.pop(0)
        filename_kwds = fields.pop(0)
        filename_out = fields.pop(0)
        filename_results_code = fields.pop(0)
        dataset_filename_override = fields.pop(0)
        # Need to be careful with the way that these parameters are populated from the filename splitting,
        # because if a job is running when the server is updated, any existing external metadata command-lines
        # will not have info about the newly added override_metadata file
        if fields:
            override_metadata = fields.pop(0)
        else:
            override_metadata = None
        set_meta_kwds = stringify_dictionary_keys(json.load(open(filename_kwds)))  # load kwds; need to ensure our keywords are not unicode
        try:
            dataset = cPickle.load(open(filename_in))  # load DatasetInstance
            if dataset_filename_override:
                dataset.dataset.external_filename = dataset_filename_override
            files_path = os.path.abspath(os.path.join(tool_job_working_directory, "dataset_%s_files" % (dataset.dataset.id)))
            dataset.dataset.external_extra_files_path = files_path
            if dataset.dataset.id in existing_job_metadata_dict:
                dataset.extension = existing_job_metadata_dict[dataset.dataset.id].get('ext', dataset.extension)
            # Metadata FileParameter types may not be writable on a cluster node, and are therefore temporarily substituted with MetadataTempFiles
            if override_metadata:
                override_metadata = json.load(open(override_metadata))
                for metadata_name, metadata_file_override in override_metadata:
                    if galaxy.datatypes.metadata.MetadataTempFile.is_JSONified_value(metadata_file_override):
                        metadata_file_override = galaxy.datatypes.metadata.MetadataTempFile.from_JSON(metadata_file_override)
                    setattr(dataset.metadata, metadata_name, metadata_file_override)
            file_dict = existing_job_metadata_dict.get(dataset.dataset.id, {})
            set_meta_with_tool_provided(dataset, file_dict, set_meta_kwds)
            dataset.metadata.to_JSON_dict(filename_out)  # write out results of set_meta
            json.dump((True, 'Metadata has been set successfully'), open(filename_results_code, 'wb+'))  # setting metadata has succeeded
        except Exception, e:
            json.dump((False, str(e)), open(filename_results_code, 'wb+'))  # setting metadata has failed somehow
def main():
    parser = optparse.OptionParser()
    parser.add_option( '-b', '--buffer', dest='buffer', type='int', default=1000000,
                       help='Number of lines to buffer at a time. Default: 1,000,000 lines. A buffer of 0 will attempt to use memory only.' )
    parser.add_option( '-d', '--index_depth', dest='index_depth', type='int', default=3,
                       help='Depth to use on filebased offset indexing. Default: 3.' )
    parser.add_option( '-p', '--keep_partial', action='store_true', dest='keep_partial', default=False,
                       help='Keep rows in first input which are missing identifiers.' )
    parser.add_option( '-u', '--keep_unmatched', action='store_true', dest='keep_unmatched', default=False,
                       help='Keep rows in first input which are not joined with the second input.' )
    parser.add_option( '-f', '--fill_options_file', dest='fill_options_file', type='str', default=None,
                       help='Fill empty columns with a values from a JSONified file.' )
    options, args = parser.parse_args()
    fill_options = None
    if options.fill_options_file is not None:
        try:
            fill_options = Bunch( **stringify_dictionary_keys( json.load( open( options.fill_options_file ) ) ) )  # json.load( open( options.fill_options_file ) )
        except Exception as e:
            print("Warning: Ignoring fill options due to json error (%s)." % e)
    if fill_options is None:
        fill_options = Bunch()
    if 'fill_unjoined_only' not in fill_options:
        fill_options.fill_unjoined_only = True
    if 'file1_columns' not in fill_options:
        fill_options.file1_columns = None
    if 'file2_columns' not in fill_options:
        fill_options.file2_columns = None
    try:
        filename1 = args[0]
        filename2 = args[1]
        column1 = int( args[2] ) - 1
        column2 = int( args[3] ) - 1
        out_filename = args[4]
    except:
        print("Error parsing command line.", file=sys.stderr)
        sys.exit()
    # Character for splitting fields and joining lines
    split = "\t"
    return join_files( filename1, column1, filename2, column2, out_filename, split, options.buffer, options.keep_unmatched, options.keep_partial, options.index_depth, fill_options=fill_options )
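# The -f/--fill_options_file read above is a small JSON document; the keys checked right
# after loading (fill_unjoined_only, file1_columns, file2_columns) suggest its shape.
# A hedged sketch of writing such a file; the fill values themselves are illustrative:
import json

def write_example_fill_options(path="fill_options.json"):
    fill_options = {
        "fill_unjoined_only": True,   # only pad rows that found no join partner
        "file1_columns": None,        # no fill values for the first file's columns
        "file2_columns": ["N/A"],     # placeholder(s) written into empty second-file columns
    }
    with open(path, "w") as f:
        json.dump(fill_options, f)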
def set_metadata():
    # locate galaxy_root for loading datatypes
    galaxy_root = os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir, os.pardir, os.pardir))
    galaxy.datatypes.metadata.MetadataTempFile.tmp_dir = tool_job_working_directory = os.path.abspath(os.getcwd())

    # This is ugly, but to transition from existing jobs without this parameter
    # to ones with, smoothly, it has to be the last optional parameter and we
    # have to sniff it.
    try:
        max_metadata_value_size = int(sys.argv[-1])
        sys.argv = sys.argv[:-1]
    except ValueError:
        max_metadata_value_size = 0
        # max_metadata_value_size is unspecified and should be 0

    # Set up datatypes registry
    datatypes_config = sys.argv.pop(1)
    datatypes_registry = galaxy.datatypes.registry.Registry()
    datatypes_registry.load_datatypes(root_dir=galaxy_root, config=datatypes_config)
    galaxy.model.set_datatypes_registry(datatypes_registry)

    job_metadata = sys.argv.pop(1)
    existing_job_metadata_dict = {}
    new_job_metadata_dict = {}
    if job_metadata != "None" and os.path.exists(job_metadata):
        for line in open(job_metadata, "r"):
            try:
                line = stringify_dictionary_keys(json.loads(line))
                if line["type"] == "dataset":
                    existing_job_metadata_dict[line["dataset_id"]] = line
                elif line["type"] == "new_primary_dataset":
                    new_job_metadata_dict[line["filename"]] = line
            except:
                continue

    for filenames in sys.argv[1:]:
        fields = filenames.split(",")
        filename_in = fields.pop(0)
        filename_kwds = fields.pop(0)
        filename_out = fields.pop(0)
        filename_results_code = fields.pop(0)
        dataset_filename_override = fields.pop(0)
        # Need to be careful with the way that these parameters are populated from the filename splitting,
        # because if a job is running when the server is updated, any existing external metadata command-lines
        # will not have info about the newly added override_metadata file
        if fields:
            override_metadata = fields.pop(0)
        else:
            override_metadata = None
        set_meta_kwds = stringify_dictionary_keys(json.load(open(filename_kwds)))  # load kwds; need to ensure our keywords are not unicode
        try:
            dataset = cPickle.load(open(filename_in))  # load DatasetInstance
            dataset.dataset.external_filename = dataset_filename_override
            files_path = os.path.abspath(os.path.join(tool_job_working_directory, "dataset_%s_files" % (dataset.dataset.id)))
            dataset.dataset.external_extra_files_path = files_path
            if dataset.dataset.id in existing_job_metadata_dict:
                dataset.extension = existing_job_metadata_dict[dataset.dataset.id].get("ext", dataset.extension)
            # Metadata FileParameter types may not be writable on a cluster node, and are therefore temporarily substituted with MetadataTempFiles
            if override_metadata:
                override_metadata = json.load(open(override_metadata))
                for metadata_name, metadata_file_override in override_metadata:
                    if galaxy.datatypes.metadata.MetadataTempFile.is_JSONified_value(metadata_file_override):
                        metadata_file_override = galaxy.datatypes.metadata.MetadataTempFile.from_JSON(metadata_file_override)
                    setattr(dataset.metadata, metadata_name, metadata_file_override)
            file_dict = existing_job_metadata_dict.get(dataset.dataset.id, {})
            set_meta_with_tool_provided(dataset, file_dict, set_meta_kwds, datatypes_registry)
            if max_metadata_value_size:
                for k, v in dataset.metadata.items():
                    if total_size(v) > max_metadata_value_size:
                        log.info("Key %s too large for metadata, discarding" % k)
                        dataset.metadata.remove_key(k)
            dataset.metadata.to_JSON_dict(filename_out)  # write out results of set_meta
            json.dump((True, "Metadata has been set successfully"), open(filename_results_code, "wb+"))  # setting metadata has succeeded
        except Exception, e:
            json.dump((False, str(e)), open(filename_results_code, "wb+"))  # setting metadata has failed somehow
def __main__():
    file_path = sys.argv.pop( 1 )
    tmp_dir = sys.argv.pop( 1 )
    galaxy.model.Dataset.file_path = file_path
    galaxy.datatypes.metadata.MetadataTempFile.tmp_dir = tmp_dir

    config_root = sys.argv.pop( 1 )
    config_file_name = sys.argv.pop( 1 )
    if not os.path.isabs( config_file_name ):
        config_file_name = os.path.join( config_root, config_file_name )

    # Set up reference to object store
    # First, read in the main config file for Galaxy; this is required because
    # the object store configuration is stored there
    conf = ConfigParser.ConfigParser()
    conf.read(config_file_name)
    conf_dict = {}
    for section in conf.sections():
        for option in conf.options(section):
            try:
                conf_dict[option] = conf.get(section, option)
            except ConfigParser.InterpolationMissingOptionError:
                # Because this is not called from Paste Script, %(here)s variable
                # is not initialized in the config file so skip those fields -
                # just need not to use any such fields for the object store conf...
                log.debug("Did not load option %s from %s" % (option, config_file_name))
    # config object is required by ObjectStore class so create it now
    universe_config = config.Configuration(**conf_dict)
    object_store = build_object_store_from_config(universe_config)
    galaxy.model.Dataset.object_store = object_store

    # Set up datatypes registry
    datatypes_config = sys.argv.pop( 1 )
    datatypes_registry = galaxy.datatypes.registry.Registry()
    datatypes_registry.load_datatypes( root_dir=config_root, config=datatypes_config )
    galaxy.model.set_datatypes_registry( datatypes_registry )

    job_metadata = sys.argv.pop( 1 )
    ext_override = dict()
    if job_metadata != "None" and os.path.exists( job_metadata ):
        for line in open( job_metadata, 'r' ):
            try:
                line = stringify_dictionary_keys( from_json_string( line ) )
                assert line['type'] == 'dataset'
                ext_override[line['dataset_id']] = line['ext']
            except:
                continue
    for filenames in sys.argv[1:]:
        fields = filenames.split( ',' )
        filename_in = fields.pop( 0 )
        filename_kwds = fields.pop( 0 )
        filename_out = fields.pop( 0 )
        filename_results_code = fields.pop( 0 )
        dataset_filename_override = fields.pop( 0 )
        # Need to be careful with the way that these parameters are populated from the filename splitting,
        # because if a job is running when the server is updated, any existing external metadata command-lines
        # will not have info about the newly added override_metadata file
        if fields:
            override_metadata = fields.pop( 0 )
        else:
            override_metadata = None
        try:
            dataset = cPickle.load( open( filename_in ) )  # load DatasetInstance
            if dataset_filename_override:
                dataset.dataset.external_filename = dataset_filename_override
            if ext_override.get( dataset.dataset.id, None ):
                dataset.extension = ext_override[ dataset.dataset.id ]
            # Metadata FileParameter types may not be writable on a cluster node, and are therefore temporarily substituted with MetadataTempFiles
            if override_metadata:
                override_metadata = simplejson.load( open( override_metadata ) )
                for metadata_name, metadata_file_override in override_metadata:
                    if galaxy.datatypes.metadata.MetadataTempFile.is_JSONified_value( metadata_file_override ):
                        metadata_file_override = galaxy.datatypes.metadata.MetadataTempFile.from_JSON( metadata_file_override )
                    setattr( dataset.metadata, metadata_name, metadata_file_override )
            kwds = stringify_dictionary_keys( simplejson.load( open( filename_kwds ) ) )  # load kwds; need to ensure our keywords are not unicode
            dataset.datatype.set_meta( dataset, **kwds )
            dataset.metadata.to_JSON_dict( filename_out )  # write out results of set_meta
            simplejson.dump( ( True, 'Metadata has been set successfully' ), open( filename_results_code, 'wb+' ) )  # setting metadata has succeeded
        except Exception, e:
            simplejson.dump( ( False, str( e ) ), open( filename_results_code, 'wb+' ) )  # setting metadata has failed somehow
def set_metadata():
    # locate galaxy_root for loading datatypes
    galaxy_root = os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir, os.pardir, os.pardir))
    galaxy.datatypes.metadata.MetadataTempFile.tmp_dir = tool_job_working_directory = os.path.abspath(os.getcwd())

    # This is ugly, but to transition from existing jobs without this parameter
    # to ones with, smoothly, it has to be the last optional parameter and we
    # have to sniff it.
    try:
        max_metadata_value_size = int(sys.argv[-1])
        sys.argv = sys.argv[:-1]
    except ValueError:
        max_metadata_value_size = 0
        # max_metadata_value_size is unspecified and should be 0

    # Set up datatypes registry
    datatypes_config = sys.argv.pop(1)
    datatypes_registry = galaxy.datatypes.registry.Registry()
    datatypes_registry.load_datatypes(root_dir=galaxy_root, config=datatypes_config)
    galaxy.model.set_datatypes_registry(datatypes_registry)

    job_metadata = sys.argv.pop(1)
    existing_job_metadata_dict = {}
    new_job_metadata_dict = {}
    if job_metadata != "None" and os.path.exists(job_metadata):
        for line in open(job_metadata, 'r'):
            try:
                line = stringify_dictionary_keys(json.loads(line))
                if line['type'] == 'dataset':
                    existing_job_metadata_dict[line['dataset_id']] = line
                elif line['type'] == 'new_primary_dataset':
                    new_job_metadata_dict[line['filename']] = line
            except:
                continue

    for filenames in sys.argv[1:]:
        fields = filenames.split(',')
        filename_in = fields.pop(0)
        filename_kwds = fields.pop(0)
        filename_out = fields.pop(0)
        filename_results_code = fields.pop(0)
        dataset_filename_override = fields.pop(0)
        # Need to be careful with the way that these parameters are populated from the filename splitting,
        # because if a job is running when the server is updated, any existing external metadata command-lines
        # will not have info about the newly added override_metadata file
        if fields:
            override_metadata = fields.pop(0)
        else:
            override_metadata = None
        set_meta_kwds = stringify_dictionary_keys(json.load(open(filename_kwds)))  # load kwds; need to ensure our keywords are not unicode
        try:
            dataset = cPickle.load(open(filename_in))  # load DatasetInstance
            dataset.dataset.external_filename = dataset_filename_override
            files_path = os.path.abspath(os.path.join(tool_job_working_directory, "dataset_%s_files" % (dataset.dataset.id)))
            dataset.dataset.external_extra_files_path = files_path
            if dataset.dataset.id in existing_job_metadata_dict:
                dataset.extension = existing_job_metadata_dict[dataset.dataset.id].get('ext', dataset.extension)
            # Metadata FileParameter types may not be writable on a cluster node, and are therefore temporarily substituted with MetadataTempFiles
            if override_metadata:
                override_metadata = json.load(open(override_metadata))
                for metadata_name, metadata_file_override in override_metadata:
                    if galaxy.datatypes.metadata.MetadataTempFile.is_JSONified_value(metadata_file_override):
                        metadata_file_override = galaxy.datatypes.metadata.MetadataTempFile.from_JSON(metadata_file_override)
                    setattr(dataset.metadata, metadata_name, metadata_file_override)
            file_dict = existing_job_metadata_dict.get(dataset.dataset.id, {})
            set_meta_with_tool_provided(dataset, file_dict, set_meta_kwds, datatypes_registry)
            if max_metadata_value_size:
                for k, v in dataset.metadata.items():
                    if total_size(v) > max_metadata_value_size:
                        log.info("Key %s too large for metadata, discarding" % k)
                        dataset.metadata.remove_key(k)
            dataset.metadata.to_JSON_dict(filename_out)  # write out results of set_meta
            json.dump((True, 'Metadata has been set successfully'), open(filename_results_code, 'wb+'))  # setting metadata has succeeded
        except Exception, e:
            json.dump((False, str(e)), open(filename_results_code, 'wb+'))  # setting metadata has failed somehow
def set_metadata():
    # locate galaxy_root for loading datatypes
    galaxy_root = os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir, os.pardir, os.pardir))
    galaxy.datatypes.metadata.MetadataTempFile.tmp_dir = tool_job_working_directory = os.path.abspath(os.getcwd())

    # This is ugly, but to transition from existing jobs without this parameter
    # to ones with, smoothly, it has to be the last optional parameter and we
    # have to sniff it.
    try:
        max_metadata_value_size = int(sys.argv[-1])
        sys.argv = sys.argv[:-1]
    except ValueError:
        max_metadata_value_size = 0
        # max_metadata_value_size is unspecified and should be 0

    # Set up datatypes registry
    datatypes_config = sys.argv.pop(1)
    if not os.path.exists(datatypes_config):
        # This path should exist, except for jobs that started running on release 17.05, where a global
        # datatypes_config (instead of a datatypes_config per job) was used. For a while release 17.05
        # would remove the global datatypes config on shutdown and toolbox reload, which would lead to
        # failed metadata jobs. To remedy this we scan jobs at startup for missing registry.xml files,
        # and if we detect such a job we write out the current registry.xml file.
        datatypes_config = os.path.join(tool_job_working_directory, "registry.xml")
        if not os.path.exists(datatypes_config):
            print("Metadata setting failed because registry.xml could not be found. You may retry setting metadata.")
            sys.exit(1)
    datatypes_registry = galaxy.datatypes.registry.Registry()
    datatypes_registry.load_datatypes(root_dir=galaxy_root, config=datatypes_config)
    galaxy.model.set_datatypes_registry(datatypes_registry)

    job_metadata = sys.argv.pop(1)
    existing_job_metadata_dict = {}
    new_job_metadata_dict = {}
    if job_metadata != "None" and os.path.exists(job_metadata):
        for line in open(job_metadata, 'r'):
            try:
                line = stringify_dictionary_keys(json.loads(line))
                if line['type'] == 'dataset':
                    existing_job_metadata_dict[line['dataset_id']] = line
                elif line['type'] == 'new_primary_dataset':
                    new_job_metadata_dict[line['filename']] = line
            except:
                continue

    for filenames in sys.argv[1:]:
        fields = filenames.split(',')
        filename_in = fields.pop(0)
        filename_kwds = fields.pop(0)
        filename_out = fields.pop(0)
        filename_results_code = fields.pop(0)
        dataset_filename_override = fields.pop(0)
        # Need to be careful with the way that these parameters are populated from the filename splitting,
        # because if a job is running when the server is updated, any existing external metadata command-lines
        # will not have info about the newly added override_metadata file
        if fields:
            override_metadata = fields.pop(0)
        else:
            override_metadata = None
        set_meta_kwds = stringify_dictionary_keys(json.load(open(filename_kwds)))  # load kwds; need to ensure our keywords are not unicode
        try:
            dataset = cPickle.load(open(filename_in))  # load DatasetInstance
            dataset.dataset.external_filename = dataset_filename_override
            files_path = os.path.abspath(os.path.join(tool_job_working_directory, "dataset_%s_files" % (dataset.dataset.id)))
            dataset.dataset.external_extra_files_path = files_path
            if dataset.dataset.id in existing_job_metadata_dict:
                dataset.extension = existing_job_metadata_dict[dataset.dataset.id].get('ext', dataset.extension)
            # Metadata FileParameter types may not be writable on a cluster node, and are therefore temporarily substituted with MetadataTempFiles
            if override_metadata:
                override_metadata = json.load(open(override_metadata))
                for metadata_name, metadata_file_override in override_metadata:
                    if galaxy.datatypes.metadata.MetadataTempFile.is_JSONified_value(metadata_file_override):
                        metadata_file_override = galaxy.datatypes.metadata.MetadataTempFile.from_JSON(metadata_file_override)
                    setattr(dataset.metadata, metadata_name, metadata_file_override)
            file_dict = existing_job_metadata_dict.get(dataset.dataset.id, {})
            set_meta_with_tool_provided(dataset, file_dict, set_meta_kwds, datatypes_registry)
            if max_metadata_value_size:
                for k, v in list(dataset.metadata.items()):
                    if total_size(v) > max_metadata_value_size:
                        log.info("Key %s too large for metadata, discarding" % k)
                        dataset.metadata.remove_key(k)
            dataset.metadata.to_JSON_dict(filename_out)  # write out results of set_meta
            json.dump((True, 'Metadata has been set successfully'), open(filename_results_code, 'wb+'))  # setting metadata has succeeded
        except Exception as e:
            json.dump((False, str(e)), open(filename_results_code, 'wb+'))  # setting metadata has failed somehow

    for i, (filename, file_dict) in enumerate(new_job_metadata_dict.items(), start=1):
        new_dataset_filename = os.path.join(tool_job_working_directory, "working", file_dict['filename'])
        new_dataset = galaxy.model.Dataset(id=-i, external_filename=new_dataset_filename)
        extra_files = file_dict.get('extra_files', None)
        if extra_files is not None:
            new_dataset._extra_files_path = os.path.join(tool_job_working_directory, "working", extra_files)
        new_dataset.state = new_dataset.states.OK
        new_dataset_instance = galaxy.model.HistoryDatasetAssociation(id=-i, dataset=new_dataset, extension=file_dict.get('ext', 'data'))
        set_meta_with_tool_provided(new_dataset_instance, file_dict, set_meta_kwds, datatypes_registry)
        file_dict['metadata'] = json.loads(new_dataset_instance.metadata.to_JSON_dict())  # storing metadata in external form, need to turn back into dict, then later jsonify
    if existing_job_metadata_dict or new_job_metadata_dict:
        with open(job_metadata, 'wb') as job_metadata_fh:
            for value in list(existing_job_metadata_dict.values()) + list(new_job_metadata_dict.values()):
                job_metadata_fh.write("%s\n" % (json.dumps(value)))

    clear_mappers()
def __main__():
    file_path = sys.argv.pop(1)
    tmp_dir = sys.argv.pop(1)
    galaxy.model.Dataset.file_path = file_path
    galaxy.datatypes.metadata.MetadataTempFile.tmp_dir = tmp_dir

    # Set up datatypes registry
    config_root = sys.argv.pop(1)
    datatypes_config = sys.argv.pop(1)
    galaxy.model.set_datatypes_registry(galaxy.datatypes.registry.Registry(config_root, datatypes_config))

    job_metadata = sys.argv.pop(1)
    ext_override = dict()
    if job_metadata != "None" and os.path.exists(job_metadata):
        for line in open(job_metadata, 'r'):
            try:
                line = stringify_dictionary_keys(from_json_string(line))
                assert line['type'] == 'dataset'
                ext_override[line['dataset_id']] = line['ext']
            except:
                continue
    for filenames in sys.argv[1:]:
        fields = filenames.split(',')
        filename_in = fields.pop(0)
        filename_kwds = fields.pop(0)
        filename_out = fields.pop(0)
        filename_results_code = fields.pop(0)
        dataset_filename_override = fields.pop(0)
        # Need to be careful with the way that these parameters are populated from the filename splitting,
        # because if a job is running when the server is updated, any existing external metadata command-lines
        # will not have info about the newly added override_metadata file
        if fields:
            override_metadata = fields.pop(0)
        else:
            override_metadata = None
        try:
            dataset = cPickle.load(open(filename_in))  # load DatasetInstance
            if dataset_filename_override:
                dataset.dataset.external_filename = dataset_filename_override
            if ext_override.get(dataset.dataset.id, None):
                dataset.extension = ext_override[dataset.dataset.id]
            # Metadata FileParameter types may not be writable on a cluster node, and are therefore temporarily substituted with MetadataTempFiles
            if override_metadata:
                override_metadata = simplejson.load(open(override_metadata))
                for metadata_name, metadata_file_override in override_metadata:
                    if galaxy.datatypes.metadata.MetadataTempFile.is_JSONified_value(metadata_file_override):
                        metadata_file_override = galaxy.datatypes.metadata.MetadataTempFile.from_JSON(metadata_file_override)
                    setattr(dataset.metadata, metadata_name, metadata_file_override)
            kwds = stringify_dictionary_keys(simplejson.load(open(filename_kwds)))  # load kwds; need to ensure our keywords are not unicode
            dataset.datatype.set_meta(dataset, **kwds)
            dataset.metadata.to_JSON_dict(filename_out)  # write out results of set_meta
            simplejson.dump((True, 'Metadata has been set successfully'), open(filename_results_code, 'wb+'))  # setting metadata has succeeded
        except Exception, e:
            simplejson.dump((False, str(e)), open(filename_results_code, 'wb+'))  # setting metadata has failed somehow
def set_metadata_portable():
    import galaxy.model
    tool_job_working_directory = os.path.abspath(os.getcwd())
    metadata_tmp_files_dir = os.path.join(tool_job_working_directory, "metadata")
    galaxy.model.metadata.MetadataTempFile.tmp_dir = metadata_tmp_files_dir

    metadata_params_path = os.path.join("metadata", "params.json")
    try:
        with open(metadata_params_path, "r") as f:
            metadata_params = json.load(f)
    except IOError:
        raise Exception("Failed to find metadata/params.json from cwd [%s]" % tool_job_working_directory)
    datatypes_config = metadata_params["datatypes_config"]
    job_metadata = metadata_params["job_metadata"]
    provided_metadata_style = metadata_params.get("provided_metadata_style")
    max_metadata_value_size = metadata_params.get("max_metadata_value_size") or 0
    outputs = metadata_params["outputs"]

    datatypes_registry = validate_and_load_datatypes_config(datatypes_config)
    tool_provided_metadata = load_job_metadata(job_metadata, provided_metadata_style)

    def set_meta(new_dataset_instance, file_dict):
        set_meta_with_tool_provided(new_dataset_instance, file_dict, set_meta_kwds, datatypes_registry, max_metadata_value_size)

    object_store_conf_path = os.path.join("metadata", "object_store_conf.json")
    extended_metadata_collection = os.path.exists(object_store_conf_path)
    object_store = None
    job_context = None
    version_string = ""
    export_store = None
    if extended_metadata_collection:
        from galaxy.tool_util.parser.stdio import ToolStdioRegex, ToolStdioExitCode
        tool_dict = metadata_params["tool"]
        stdio_exit_code_dicts, stdio_regex_dicts = tool_dict["stdio_exit_codes"], tool_dict["stdio_regexes"]
        stdio_exit_codes = list(map(ToolStdioExitCode, stdio_exit_code_dicts))
        stdio_regexes = list(map(ToolStdioRegex, stdio_regex_dicts))

        with open(object_store_conf_path, "r") as f:
            config_dict = json.load(f)
        from galaxy.objectstore import build_object_store_from_config
        assert config_dict is not None
        object_store = build_object_store_from_config(None, config_dict=config_dict)
        galaxy.model.Dataset.object_store = object_store

        outputs_directory = os.path.join(tool_job_working_directory, "outputs")
        if not os.path.exists(outputs_directory):
            outputs_directory = tool_job_working_directory

        # TODO: constants...
        if os.path.exists(os.path.join(outputs_directory, "tool_stdout")):
            with open(os.path.join(outputs_directory, "tool_stdout"), "rb") as f:
                tool_stdout = f.read()
            with open(os.path.join(outputs_directory, "tool_stderr"), "rb") as f:
                tool_stderr = f.read()
        elif os.path.exists(os.path.join(outputs_directory, "stdout")):
            # Pulsar style working directory.
            with open(os.path.join(outputs_directory, "stdout"), "rb") as f:
                tool_stdout = f.read()
            with open(os.path.join(outputs_directory, "stderr"), "rb") as f:
                tool_stderr = f.read()

        job_id_tag = metadata_params["job_id_tag"]

        # TODO: this clearly needs to be refactored, nothing in runners should be imported here..
        from galaxy.job_execution.output_collect import default_exit_code_file, read_exit_code_from
        exit_code_file = default_exit_code_file(".", job_id_tag)
        tool_exit_code = read_exit_code_from(exit_code_file, job_id_tag)

        from galaxy.tool_util.output_checker import check_output, DETECTED_JOB_STATE
        check_output_detected_state, tool_stdout, tool_stderr, job_messages = check_output(stdio_regexes, stdio_exit_codes, tool_stdout, tool_stderr, tool_exit_code, job_id_tag)
        if check_output_detected_state == DETECTED_JOB_STATE.OK and not tool_provided_metadata.has_failed_outputs():
            final_job_state = galaxy.model.Job.states.OK
        else:
            final_job_state = galaxy.model.Job.states.ERROR

        from pulsar.client.staging import COMMAND_VERSION_FILENAME
        version_string = ""
        if os.path.exists(COMMAND_VERSION_FILENAME):
            version_string = open(COMMAND_VERSION_FILENAME).read()

        # TODO: handle outputs_to_working_directory?
        from galaxy.util.expressions import ExpressionContext
        job_context = ExpressionContext(dict(stdout=tool_stdout, stderr=tool_stderr))

        # Load outputs.
        import_model_store = store.imported_store_for_metadata('metadata/outputs_new', object_store=object_store)
        export_store = store.DirectoryModelExportStore('metadata/outputs_populated', serialize_dataset_objects=True, for_edit=True)

    for output_name, output_dict in outputs.items():
        if extended_metadata_collection:
            dataset_instance_id = output_dict["id"]
            dataset = import_model_store.sa_session.query(galaxy.model.HistoryDatasetAssociation).find(dataset_instance_id)
            assert dataset is not None
        else:
            filename_in = os.path.join("metadata/metadata_in_%s" % output_name)
            dataset = cPickle.load(open(filename_in, 'rb'))  # load DatasetInstance
        filename_kwds = os.path.join("metadata/metadata_kwds_%s" % output_name)
        filename_out = os.path.join("metadata/metadata_out_%s" % output_name)
        filename_results_code = os.path.join("metadata/metadata_results_%s" % output_name)
        override_metadata = os.path.join("metadata/metadata_override_%s" % output_name)
        dataset_filename_override = output_dict["filename_override"]

        # Same block as below...
        set_meta_kwds = stringify_dictionary_keys(json.load(open(filename_kwds)))  # load kwds; need to ensure our keywords are not unicode
        try:
            dataset.dataset.external_filename = dataset_filename_override
            store_by = metadata_params.get("object_store_store_by", "id")
            extra_files_dir_name = "dataset_%s_files" % getattr(dataset.dataset, store_by)
            files_path = os.path.abspath(os.path.join(tool_job_working_directory, "working", extra_files_dir_name))
            dataset.dataset.external_extra_files_path = files_path
            file_dict = tool_provided_metadata.get_dataset_meta(output_name, dataset.dataset.id, dataset.dataset.uuid)
            if 'ext' in file_dict:
                dataset.extension = file_dict['ext']
            # Metadata FileParameter types may not be writable on a cluster node, and are therefore temporarily substituted with MetadataTempFiles
            override_metadata = json.load(open(override_metadata))
            for metadata_name, metadata_file_override in override_metadata:
                if galaxy.datatypes.metadata.MetadataTempFile.is_JSONified_value(metadata_file_override):
                    metadata_file_override = galaxy.datatypes.metadata.MetadataTempFile.from_JSON(metadata_file_override)
                setattr(dataset.metadata, metadata_name, metadata_file_override)
            if output_dict.get("validate", False):
                set_validated_state(dataset)
            set_meta(dataset, file_dict)

            if extended_metadata_collection:
                meta = tool_provided_metadata.get_dataset_meta(output_name, dataset.dataset.id, dataset.dataset.uuid)
                if meta:
                    context = ExpressionContext(meta, job_context)
                else:
                    context = job_context

                # Lazy and unattached
                # if getattr(dataset, "hidden_beneath_collection_instance", None):
                #     dataset.visible = False
                dataset.blurb = 'done'
                dataset.peek = 'no peek'
                dataset.info = (dataset.info or '')
                if context['stdout'].strip():
                    # Ensure white space between entries
                    dataset.info = dataset.info.rstrip() + "\n" + context['stdout'].strip()
                if context['stderr'].strip():
                    # Ensure white space between entries
                    dataset.info = dataset.info.rstrip() + "\n" + context['stderr'].strip()
                dataset.tool_version = version_string
                dataset.set_size()
                if 'uuid' in context:
                    dataset.dataset.uuid = context['uuid']
                object_store.update_from_file(dataset.dataset, create=True)
                from galaxy.job_execution.output_collect import collect_extra_files
                collect_extra_files(object_store, dataset, ".")
                if galaxy.model.Job.states.ERROR == final_job_state:
                    dataset.blurb = "error"
                    dataset.mark_unhidden()
                else:
                    # If the tool was expected to set the extension, attempt to retrieve it
                    if dataset.ext == 'auto':
                        dataset.extension = context.get('ext', 'data')
                        dataset.init_meta(copy_from=dataset)
                # This has already been done:
                # else:
                #     self.external_output_metadata.load_metadata(dataset, output_name, self.sa_session, working_directory=self.working_directory, remote_metadata_directory=remote_metadata_directory)
                line_count = context.get('line_count', None)
                try:
                    # Certain datatype's set_peek methods contain a line_count argument
                    dataset.set_peek(line_count=line_count)
                except TypeError:
                    # ... and others don't
                    dataset.set_peek()
                from galaxy.jobs import TOOL_PROVIDED_JOB_METADATA_KEYS
                for context_key in TOOL_PROVIDED_JOB_METADATA_KEYS:
                    if context_key in context:
                        context_value = context[context_key]
                        setattr(dataset, context_key, context_value)
                if extended_metadata_collection:
                    export_store.add_dataset(dataset)
                else:
                    cPickle.dump(dataset, open(filename_out, 'wb+'))
            else:
                dataset.metadata.to_JSON_dict(filename_out)  # write out results of set_meta
            json.dump((True, 'Metadata has been set successfully'), open(filename_results_code, 'wt+'))  # setting metadata has succeeded
        except Exception:
            json.dump((False, traceback.format_exc()), open(filename_results_code, 'wt+'))  # setting metadata has failed somehow

    if extended_metadata_collection:
        # discover extra outputs...
        from galaxy.job_execution.output_collect import collect_dynamic_outputs, collect_primary_datasets, SessionlessJobContext

        job_context = SessionlessJobContext(
            metadata_params, tool_provided_metadata, object_store, export_store, import_model_store, os.path.join(tool_job_working_directory, "working")
        )

        output_collections = {}
        for name, output_collection in metadata_params["output_collections"].items():
            output_collections[name] = import_model_store.sa_session.query(galaxy.model.HistoryDatasetCollectionAssociation).find(output_collection["id"])
        outputs = {}
        for name, output in metadata_params["outputs"].items():
            outputs[name] = import_model_store.sa_session.query(galaxy.model.HistoryDatasetAssociation).find(output["id"])

        input_ext = json.loads(metadata_params["job_params"].get("__input_ext", '"data"'))
        collect_primary_datasets(
            job_context,
            outputs,
            input_ext=input_ext,
        )
        collect_dynamic_outputs(job_context, output_collections)

    if export_store:
        export_store._finalize()
    write_job_metadata(tool_job_working_directory, job_metadata, set_meta, tool_provided_metadata)
def set_metadata_portable():
    tool_job_working_directory = os.path.abspath(os.getcwd())
    metadata_tmp_files_dir = os.path.join(tool_job_working_directory, "metadata")
    MetadataTempFile.tmp_dir = metadata_tmp_files_dir

    metadata_params = get_metadata_params(tool_job_working_directory)
    datatypes_config = metadata_params["datatypes_config"]
    job_metadata = metadata_params["job_metadata"]
    provided_metadata_style = metadata_params.get("provided_metadata_style")
    max_metadata_value_size = metadata_params.get("max_metadata_value_size") or 0
    max_discovered_files = metadata_params.get("max_discovered_files")
    outputs = metadata_params["outputs"]

    datatypes_registry = validate_and_load_datatypes_config(datatypes_config)
    tool_provided_metadata = load_job_metadata(job_metadata, provided_metadata_style)

    def set_meta(new_dataset_instance, file_dict):
        set_meta_with_tool_provided(new_dataset_instance, file_dict, set_meta_kwds, datatypes_registry, max_metadata_value_size)

    try:
        object_store = get_object_store(tool_job_working_directory=tool_job_working_directory)
    except (FileNotFoundError, AssertionError):
        object_store = None
    extended_metadata_collection = bool(object_store)

    job_context = None
    version_string = None
    export_store = None
    final_job_state = Job.states.OK
    job_messages = []
    if extended_metadata_collection:
        tool_dict = metadata_params["tool"]
        stdio_exit_code_dicts, stdio_regex_dicts = tool_dict["stdio_exit_codes"], tool_dict["stdio_regexes"]
        stdio_exit_codes = list(map(ToolStdioExitCode, stdio_exit_code_dicts))
        stdio_regexes = list(map(ToolStdioRegex, stdio_regex_dicts))

        outputs_directory = os.path.join(tool_job_working_directory, "outputs")
        if not os.path.exists(outputs_directory):
            outputs_directory = tool_job_working_directory

        # TODO: constants...
        locations = [
            (outputs_directory, 'tool_'),
            (tool_job_working_directory, ''),
            (outputs_directory, ''),  # Pulsar style output directory? Was this ever used - did this ever work?
        ]
        for directory, prefix in locations:
            if os.path.exists(os.path.join(directory, f"{prefix}stdout")):
                with open(os.path.join(directory, f"{prefix}stdout"), 'rb') as f:
                    tool_stdout = f.read(MAX_STDIO_READ_BYTES)
                with open(os.path.join(directory, f"{prefix}stderr"), 'rb') as f:
                    tool_stderr = f.read(MAX_STDIO_READ_BYTES)
                break
        else:
            if os.path.exists(os.path.join(tool_job_working_directory, 'task_0')):
                # We have a task splitting job
                tool_stdout = b''
                tool_stderr = b''
                paths = Path(tool_job_working_directory).glob('task_*')
                for path in paths:
                    with open(path / 'outputs' / 'tool_stdout', 'rb') as f:
                        task_stdout = f.read(MAX_STDIO_READ_BYTES)
                        if task_stdout:
                            tool_stdout = b"%s[%s stdout]\n%s\n" % (tool_stdout, path.name.encode(), task_stdout)
                    with open(path / 'outputs' / 'tool_stderr', 'rb') as f:
                        task_stderr = f.read(MAX_STDIO_READ_BYTES)
                        if task_stderr:
                            tool_stderr = b"%s[%s stdout]\n%s\n" % (tool_stderr, path.name.encode(), task_stderr)
            else:
                wdc = os.listdir(tool_job_working_directory)
                odc = os.listdir(outputs_directory)
                error_desc = "Failed to find tool_stdout or tool_stderr for this job, cannot collect metadata"
                error_extra = f"Working dir contents [{wdc}], output directory contents [{odc}]"
                log.warn(f"{error_desc}. {error_extra}")
                raise Exception(error_desc)

        job_id_tag = metadata_params["job_id_tag"]

        exit_code_file = default_exit_code_file(".", job_id_tag)
        tool_exit_code = read_exit_code_from(exit_code_file, job_id_tag)

        check_output_detected_state, tool_stdout, tool_stderr, job_messages = check_output(stdio_regexes, stdio_exit_codes, tool_stdout, tool_stderr, tool_exit_code, job_id_tag)
        if check_output_detected_state == DETECTED_JOB_STATE.OK and not tool_provided_metadata.has_failed_outputs():
            final_job_state = Job.states.OK
        else:
            final_job_state = Job.states.ERROR

        version_string_path = os.path.join('outputs', COMMAND_VERSION_FILENAME)
        version_string = collect_shrinked_content_from_path(version_string_path)

        expression_context = ExpressionContext(dict(stdout=tool_stdout[:255], stderr=tool_stderr[:255]))

        # Load outputs.
        export_store = store.DirectoryModelExportStore('metadata/outputs_populated', serialize_dataset_objects=True, for_edit=True, strip_metadata_files=False, serialize_jobs=True)
    try:
        import_model_store = store.imported_store_for_metadata('metadata/outputs_new', object_store=object_store)
    except AssertionError:
        # Remove in 21.09, this should only happen for jobs that started on <= 20.09 and finish now
        import_model_store = None

    tool_script_file = os.path.join(tool_job_working_directory, 'tool_script.sh')
    job = None
    if import_model_store and export_store:
        job = next(iter(import_model_store.sa_session.objects[Job].values()))

    job_context = SessionlessJobContext(
        metadata_params,
        tool_provided_metadata,
        object_store,
        export_store,
        import_model_store,
        os.path.join(tool_job_working_directory, "working"),
        final_job_state=final_job_state,
        max_discovered_files=max_discovered_files,
    )

    if extended_metadata_collection:
        # discover extra outputs...
        output_collections = {}
        for name, output_collection in metadata_params["output_collections"].items():
            # TODO: remove HistoryDatasetCollectionAssociation fallback on 22.01, model_class used to not be serialized prior to 21.09
            model_class = output_collection.get('model_class', 'HistoryDatasetCollectionAssociation')
            collection = import_model_store.sa_session.query(getattr(galaxy.model, model_class)).find(output_collection["id"])
            output_collections[name] = collection
        output_instances = {}
        for name, output in metadata_params["outputs"].items():
            klass = getattr(galaxy.model, output.get('model_class', 'HistoryDatasetAssociation'))
            output_instances[name] = import_model_store.sa_session.query(klass).find(output["id"])

        input_ext = json.loads(metadata_params["job_params"].get("__input_ext") or '"data"')
        try:
            collect_primary_datasets(
                job_context,
                output_instances,
                input_ext=input_ext,
            )
            collect_dynamic_outputs(job_context, output_collections)
        except MaxDiscoveredFilesExceededError as e:
            final_job_state = Job.states.ERROR
            job_messages.append(str(e))
        if job:
            job.job_messages = job_messages
            job.state = final_job_state
            if os.path.exists(tool_script_file):
                with open(tool_script_file) as command_fh:
                    command_line_lines = []
                    for i, line in enumerate(command_fh):
                        if i == 0 and line.endswith('COMMAND_VERSION 2>&1;'):
                            # Don't record version command as part of command line
                            continue
                        command_line_lines.append(line)
                job.command_line = "".join(command_line_lines).strip()
                export_store.export_job(job, include_job_data=False)

    unnamed_id_to_path = {}
    for unnamed_output_dict in job_context.tool_provided_metadata.get_unnamed_outputs():
        destination = unnamed_output_dict["destination"]
        elements = unnamed_output_dict["elements"]
        destination_type = destination["type"]
        if destination_type == 'hdas':
for element in elements: filename = element.get('filename') object_id = element.get('object_id') if filename and object_id: unnamed_id_to_path[object_id] = os.path.join(job_context.job_working_directory, filename) for output_name, output_dict in outputs.items(): dataset_instance_id = output_dict["id"] klass = getattr(galaxy.model, output_dict.get('model_class', 'HistoryDatasetAssociation')) dataset = None if import_model_store: dataset = import_model_store.sa_session.query(klass).find(dataset_instance_id) if dataset is None: # legacy check for jobs that started before 21.01, remove on 21.05 filename_in = os.path.join(f"metadata/metadata_in_{output_name}") import pickle dataset = pickle.load(open(filename_in, 'rb')) # load DatasetInstance assert dataset is not None filename_kwds = os.path.join(f"metadata/metadata_kwds_{output_name}") filename_out = os.path.join(f"metadata/metadata_out_{output_name}") filename_results_code = os.path.join(f"metadata/metadata_results_{output_name}") override_metadata = os.path.join(f"metadata/metadata_override_{output_name}") dataset_filename_override = output_dict["filename_override"] # pre-20.05 this was a per job parameter and not a per dataset parameter, drop in 21.XX legacy_object_store_store_by = metadata_params.get("object_store_store_by", "id") # Same block as below... set_meta_kwds = stringify_dictionary_keys(json.load(open(filename_kwds))) # load kwds; need to ensure our keywords are not unicode try: external_filename = unnamed_id_to_path.get(dataset_instance_id, dataset_filename_override) if not os.path.exists(external_filename): matches = glob.glob(external_filename) assert len(matches) == 1, f"More than one file matched by output glob '{external_filename}'" external_filename = matches[0] assert safe_contains(tool_job_working_directory, external_filename), f"Cannot collect output '{external_filename}' from outside of working directory" created_from_basename = os.path.relpath(external_filename, os.path.join(tool_job_working_directory, 'working')) dataset.dataset.created_from_basename = created_from_basename # override filename if we're dealing with outputs to working directory and dataset is not linked to link_data_only = metadata_params.get("link_data_only") if not link_data_only: # Only set external filename if we're dealing with files in job working directory. 
# Fixes link_data_only uploads dataset.dataset.external_filename = external_filename store_by = output_dict.get("object_store_store_by", legacy_object_store_store_by) extra_files_dir_name = f"dataset_{getattr(dataset.dataset, store_by)}_files" files_path = os.path.abspath(os.path.join(tool_job_working_directory, "working", extra_files_dir_name)) dataset.dataset.external_extra_files_path = files_path file_dict = tool_provided_metadata.get_dataset_meta(output_name, dataset.dataset.id, dataset.dataset.uuid) if 'ext' in file_dict: dataset.extension = file_dict['ext'] # Metadata FileParameter types may not be writable on a cluster node, and are therefore temporarily substituted with MetadataTempFiles override_metadata = json.load(open(override_metadata)) for metadata_name, metadata_file_override in override_metadata: if MetadataTempFile.is_JSONified_value(metadata_file_override): metadata_file_override = MetadataTempFile.from_JSON(metadata_file_override) setattr(dataset.metadata, metadata_name, metadata_file_override) if output_dict.get("validate", False): set_validated_state(dataset) if dataset_instance_id not in unnamed_id_to_path: # We're going to run through set_metadata in collect_dynamic_outputs with more contextual metadata, # so skip set_meta here. set_meta(dataset, file_dict) if extended_metadata_collection: collect_extra_files(object_store, dataset, ".") dataset.state = dataset.dataset.state = final_job_state if extended_metadata_collection: if not link_data_only and os.path.getsize(external_filename): # Here we might be updating a disk based objectstore when outputs_to_working_directory is used, # or a remote object store from its cache path. object_store.update_from_file(dataset.dataset, file_name=external_filename, create=True) # TODO: merge expression_context into tool_provided_metadata so we don't have to special case this (here and in _finish_dataset) meta = tool_provided_metadata.get_dataset_meta(output_name, dataset.dataset.id, dataset.dataset.uuid) if meta: context = ExpressionContext(meta, expression_context) else: context = expression_context dataset.blurb = 'done' dataset.peek = 'no peek' dataset.info = (dataset.info or '') if context['stdout'].strip(): # Ensure white space between entries dataset.info = f"{dataset.info.rstrip()}\n{context['stdout'].strip()}" if context['stderr'].strip(): # Ensure white space between entries dataset.info = f"{dataset.info.rstrip()}\n{context['stderr'].strip()}" dataset.tool_version = version_string if 'uuid' in context: dataset.dataset.uuid = context['uuid'] if not final_job_state == Job.states.ERROR: line_count = context.get('line_count', None) try: # Certain datatype's set_peek methods contain a line_count argument dataset.set_peek(line_count=line_count) except TypeError: # ... and others don't dataset.set_peek() for context_key in TOOL_PROVIDED_JOB_METADATA_KEYS: if context_key in context: context_value = context[context_key] setattr(dataset, context_key, context_value) # We only want to persist the external_filename if the dataset has been linked in. 
if not link_data_only: dataset.dataset.external_filename = None dataset.dataset.extra_files_path = None export_store.add_dataset(dataset) else: dataset.metadata.to_JSON_dict(filename_out) # write out results of set_meta json.dump((True, 'Metadata has been set successfully'), open(filename_results_code, 'wt+')) # setting metadata has succeeded except Exception: json.dump((False, traceback.format_exc()), open(filename_results_code, 'wt+')) # setting metadata has failed somehow if export_store: export_store._finalize() write_job_metadata(tool_job_working_directory, job_metadata, set_meta, tool_provided_metadata)
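set_metadata_portable resolves everything relative to the job working directory using fixed per-output file names under metadata/. A sketch of that layout for a hypothetical output called "out_file1", mirroring the os.path.join calls above; none of these names are exported constants:

import os

output_name = "out_file1"  # hypothetical output name
metadata_files = {
    "in (legacy pickle)": os.path.join("metadata", f"metadata_in_{output_name}"),
    "kwds": os.path.join("metadata", f"metadata_kwds_{output_name}"),
    "out": os.path.join("metadata", f"metadata_out_{output_name}"),
    "results_code": os.path.join("metadata", f"metadata_results_{output_name}"),
    "override": os.path.join("metadata", f"metadata_override_{output_name}"),
}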
def set_metadata_portable(): tool_job_working_directory = os.path.abspath(os.getcwd()) metadata_tmp_files_dir = os.path.join(tool_job_working_directory, "metadata") MetadataTempFile.tmp_dir = metadata_tmp_files_dir metadata_params_path = os.path.join("metadata", "params.json") try: with open(metadata_params_path) as f: metadata_params = json.load(f) except OSError: raise Exception( f"Failed to find metadata/params.json from cwd [{tool_job_working_directory}]" ) datatypes_config = metadata_params["datatypes_config"] job_metadata = metadata_params["job_metadata"] provided_metadata_style = metadata_params.get("provided_metadata_style") max_metadata_value_size = metadata_params.get( "max_metadata_value_size") or 0 outputs = metadata_params["outputs"] datatypes_registry = validate_and_load_datatypes_config(datatypes_config) tool_provided_metadata = load_job_metadata(job_metadata, provided_metadata_style) def set_meta(new_dataset_instance, file_dict): set_meta_with_tool_provided(new_dataset_instance, file_dict, set_meta_kwds, datatypes_registry, max_metadata_value_size) object_store_conf_path = os.path.join("metadata", "object_store_conf.json") extended_metadata_collection = os.path.exists(object_store_conf_path) object_store = None job_context = None version_string = "" export_store = None final_job_state = Job.states.OK if extended_metadata_collection: tool_dict = metadata_params["tool"] stdio_exit_code_dicts, stdio_regex_dicts = tool_dict[ "stdio_exit_codes"], tool_dict["stdio_regexes"] stdio_exit_codes = list(map(ToolStdioExitCode, stdio_exit_code_dicts)) stdio_regexes = list(map(ToolStdioRegex, stdio_regex_dicts)) with open(object_store_conf_path) as f: config_dict = json.load(f) assert config_dict is not None object_store = build_object_store_from_config(None, config_dict=config_dict) Dataset.object_store = object_store outputs_directory = os.path.join(tool_job_working_directory, "outputs") if not os.path.exists(outputs_directory): outputs_directory = tool_job_working_directory # TODO: constants... if os.path.exists(os.path.join(outputs_directory, "tool_stdout")): with open(os.path.join(outputs_directory, "tool_stdout"), "rb") as f: tool_stdout = f.read() with open(os.path.join(outputs_directory, "tool_stderr"), "rb") as f: tool_stderr = f.read() elif os.path.exists(os.path.join(tool_job_working_directory, "stdout")): with open(os.path.join(tool_job_working_directory, "stdout"), "rb") as f: tool_stdout = f.read() with open(os.path.join(tool_job_working_directory, "stderr"), "rb") as f: tool_stderr = f.read() elif os.path.exists(os.path.join(outputs_directory, "stdout")): # Puslar style output directory? Was this ever used - did this ever work? with open(os.path.join(outputs_directory, "stdout"), "rb") as f: tool_stdout = f.read() with open(os.path.join(outputs_directory, "stderr"), "rb") as f: tool_stderr = f.read() else: wdc = os.listdir(tool_job_working_directory) odc = os.listdir(outputs_directory) error_desc = "Failed to find tool_stdout or tool_stderr for this job, cannot collect metadata" error_extra = f"Working dir contents [{wdc}], output directory contents [{odc}]" log.warn(f"{error_desc}. 
{error_extra}") raise Exception(error_desc) job_id_tag = metadata_params["job_id_tag"] exit_code_file = default_exit_code_file(".", job_id_tag) tool_exit_code = read_exit_code_from(exit_code_file, job_id_tag) check_output_detected_state, tool_stdout, tool_stderr, job_messages = check_output( stdio_regexes, stdio_exit_codes, tool_stdout, tool_stderr, tool_exit_code, job_id_tag) if check_output_detected_state == DETECTED_JOB_STATE.OK and not tool_provided_metadata.has_failed_outputs( ): final_job_state = Job.states.OK else: final_job_state = Job.states.ERROR version_string = "" if os.path.exists(COMMAND_VERSION_FILENAME): version_string = open(COMMAND_VERSION_FILENAME).read() expression_context = ExpressionContext( dict(stdout=tool_stdout, stderr=tool_stderr)) # Load outputs. export_store = store.DirectoryModelExportStore( 'metadata/outputs_populated', serialize_dataset_objects=True, for_edit=True, strip_metadata_files=False, serialize_jobs=False) try: import_model_store = store.imported_store_for_metadata( 'metadata/outputs_new', object_store=object_store) except AssertionError: # Remove in 21.09, this should only happen for jobs that started on <= 20.09 and finish now import_model_store = None job_context = SessionlessJobContext( metadata_params, tool_provided_metadata, object_store, export_store, import_model_store, os.path.join(tool_job_working_directory, "working"), final_job_state=final_job_state, ) unnamed_id_to_path = {} for unnamed_output_dict in job_context.tool_provided_metadata.get_unnamed_outputs( ): destination = unnamed_output_dict["destination"] elements = unnamed_output_dict["elements"] destination_type = destination["type"] if destination_type == 'hdas': for element in elements: filename = element.get('filename') if filename: unnamed_id_to_path[element['object_id']] = os.path.join( job_context.job_working_directory, filename) for output_name, output_dict in outputs.items(): dataset_instance_id = output_dict["id"] klass = getattr( galaxy.model, output_dict.get('model_class', 'HistoryDatasetAssociation')) dataset = None if import_model_store: dataset = import_model_store.sa_session.query(klass).find( dataset_instance_id) if dataset is None: # legacy check for jobs that started before 21.01, remove on 21.05 filename_in = os.path.join(f"metadata/metadata_in_{output_name}") import pickle dataset = pickle.load(open(filename_in, 'rb')) # load DatasetInstance assert dataset is not None filename_kwds = os.path.join(f"metadata/metadata_kwds_{output_name}") filename_out = os.path.join(f"metadata/metadata_out_{output_name}") filename_results_code = os.path.join( f"metadata/metadata_results_{output_name}") override_metadata = os.path.join( f"metadata/metadata_override_{output_name}") dataset_filename_override = output_dict["filename_override"] # pre-20.05 this was a per job parameter and not a per dataset parameter, drop in 21.XX legacy_object_store_store_by = metadata_params.get( "object_store_store_by", "id") # Same block as below... 
set_meta_kwds = stringify_dictionary_keys( json.load(open(filename_kwds)) ) # load kwds; need to ensure our keywords are not unicode try: dataset.dataset.external_filename = unnamed_id_to_path.get( dataset_instance_id, dataset_filename_override) store_by = output_dict.get("object_store_store_by", legacy_object_store_store_by) extra_files_dir_name = f"dataset_{getattr(dataset.dataset, store_by)}_files" files_path = os.path.abspath( os.path.join(tool_job_working_directory, "working", extra_files_dir_name)) dataset.dataset.external_extra_files_path = files_path file_dict = tool_provided_metadata.get_dataset_meta( output_name, dataset.dataset.id, dataset.dataset.uuid) if 'ext' in file_dict: dataset.extension = file_dict['ext'] # Metadata FileParameter types may not be writable on a cluster node, and are therefore temporarily substituted with MetadataTempFiles override_metadata = json.load(open(override_metadata)) for metadata_name, metadata_file_override in override_metadata: if MetadataTempFile.is_JSONified_value(metadata_file_override): metadata_file_override = MetadataTempFile.from_JSON( metadata_file_override) setattr(dataset.metadata, metadata_name, metadata_file_override) if output_dict.get("validate", False): set_validated_state(dataset) if dataset_instance_id not in unnamed_id_to_path: # We're going to run through set_metadata in collect_dynamic_outputs with more contextual metadata, # so skip set_meta here. set_meta(dataset, file_dict) if extended_metadata_collection: meta = tool_provided_metadata.get_dataset_meta( output_name, dataset.dataset.id, dataset.dataset.uuid) if meta: context = ExpressionContext(meta, expression_context) else: context = expression_context # Lazy and unattached # if getattr(dataset, "hidden_beneath_collection_instance", None): # dataset.visible = False dataset.blurb = 'done' dataset.peek = 'no peek' dataset.info = (dataset.info or '') if context['stdout'].strip(): # Ensure white space between entries dataset.info = f"{dataset.info.rstrip()}\n{context['stdout'].strip()}" if context['stderr'].strip(): # Ensure white space between entries dataset.info = f"{dataset.info.rstrip()}\n{context['stderr'].strip()}" dataset.tool_version = version_string dataset.set_size() if 'uuid' in context: dataset.dataset.uuid = context['uuid'] if dataset_filename_override and dataset_filename_override != dataset.file_name: # This has to be a job with outputs_to_working_directory set. # We update the object store with the created output file. object_store.update_from_file( dataset.dataset, file_name=dataset_filename_override, create=True) collect_extra_files(object_store, dataset, ".") if Job.states.ERROR == final_job_state: dataset.blurb = "error" dataset.mark_unhidden() else: # If the tool was expected to set the extension, attempt to retrieve it if dataset.ext == 'auto': dataset.extension = context.get('ext', 'data') dataset.init_meta(copy_from=dataset) # This has already been done: # else: # self.external_output_metadata.load_metadata(dataset, output_name, self.sa_session, working_directory=self.working_directory, remote_metadata_directory=remote_metadata_directory) line_count = context.get('line_count', None) try: # Certain datatype's set_peek methods contain a line_count argument dataset.set_peek(line_count=line_count) except TypeError: # ... 
and others don't dataset.set_peek() for context_key in TOOL_PROVIDED_JOB_METADATA_KEYS: if context_key in context: context_value = context[context_key] setattr(dataset, context_key, context_value) # We never want to persist the external_filename. dataset.dataset.external_filename = None export_store.add_dataset(dataset) else: dataset.metadata.to_JSON_dict( filename_out) # write out results of set_meta json.dump((True, 'Metadata has been set successfully'), open(filename_results_code, 'wt+')) # setting metadata has succeeded except Exception: json.dump((False, traceback.format_exc()), open(filename_results_code, 'wt+')) # setting metadata has failed somehow if extended_metadata_collection: # discover extra outputs... output_collections = {} for name, output_collection in metadata_params[ "output_collections"].items(): output_collections[name] = import_model_store.sa_session.query( HistoryDatasetCollectionAssociation).find( output_collection["id"]) outputs = {} for name, output in metadata_params["outputs"].items(): klass = getattr( galaxy.model, output.get('model_class', 'HistoryDatasetAssociation')) outputs[name] = import_model_store.sa_session.query(klass).find( output["id"]) input_ext = json.loads(metadata_params["job_params"].get( "__input_ext", '"data"')) collect_primary_datasets( job_context, outputs, input_ext=input_ext, ) collect_dynamic_outputs(job_context, output_collections) if export_store: export_store._finalize() write_job_metadata(tool_job_working_directory, job_metadata, set_meta, tool_provided_metadata)
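Both portable variants wrap the per-dataset meta dict in an ExpressionContext layered over a parent context that carries stdout/stderr, so lookups fall back to the parent when a key is absent from the dataset-level dict. A toy stand-in illustrating the chained-lookup behaviour assumed above; this class is not Galaxy's ExpressionContext, only a sketch of its semantics:

class ChainedContext(dict):
    # Toy stand-in: look up keys locally first, then in the parent context.
    def __init__(self, content, parent=None):
        super().__init__(content or {})
        self.parent = parent or {}

    def __getitem__(self, key):
        if dict.__contains__(self, key):
            return dict.__getitem__(self, key)
        return self.parent[key]

expression_context = ChainedContext({"stdout": "ran ok", "stderr": ""})
context = ChainedContext({"ext": "tabular"}, expression_context)
assert context["ext"] == "tabular"       # from the dataset-level meta
assert context["stdout"] == "ran ok"     # falls back to the parent context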
def __main__(): file_path = sys.argv.pop(1) tool_job_working_directory = tmp_dir = sys.argv.pop(1) # this is also the job_working_directory now galaxy.model.Dataset.file_path = file_path galaxy.datatypes.metadata.MetadataTempFile.tmp_dir = tmp_dir config_root = sys.argv.pop(1) config_file_name = sys.argv.pop(1) if not os.path.isabs(config_file_name): config_file_name = os.path.join(config_root, config_file_name) # Set up reference to object store # First, read in the main config file for Galaxy; this is required because # the object store configuration is stored there conf_dict = load_app_properties(ini_file=config_file_name) # config object is required by ObjectStore class so create it now universe_config = config.Configuration(**conf_dict) universe_config.ensure_tempdir() object_store = build_object_store_from_config(universe_config) galaxy.model.Dataset.object_store = object_store # Set up datatypes registry datatypes_config = sys.argv.pop(1) datatypes_registry = galaxy.datatypes.registry.Registry() datatypes_registry.load_datatypes(root_dir=config_root, config=datatypes_config) galaxy.model.set_datatypes_registry(datatypes_registry) job_metadata = sys.argv.pop(1) existing_job_metadata_dict = {} new_job_metadata_dict = {} if job_metadata != "None" and os.path.exists(job_metadata): for line in open(job_metadata, "r"): try: line = stringify_dictionary_keys(json.loads(line)) if line["type"] == "dataset": existing_job_metadata_dict[line["dataset_id"]] = line elif line["type"] == "new_primary_dataset": new_job_metadata_dict[line["filename"]] = line except: continue for filenames in sys.argv[1:]: fields = filenames.split(",") filename_in = fields.pop(0) filename_kwds = fields.pop(0) filename_out = fields.pop(0) filename_results_code = fields.pop(0) dataset_filename_override = fields.pop(0) # Need to be careful with the way that these parameters are populated from the filename splitting, # because if a job is running when the server is updated, any existing external metadata command-lines # will not have info about the newly added override_metadata file if fields: override_metadata = fields.pop(0) else: override_metadata = None set_meta_kwds = stringify_dictionary_keys( json.load(open(filename_kwds)) ) # load kwds; need to ensure our keywords are not unicode try: dataset = cPickle.load(open(filename_in)) # load DatasetInstance if dataset_filename_override: dataset.dataset.external_filename = dataset_filename_override files_path = os.path.abspath( os.path.join(tool_job_working_directory, "dataset_%s_files" % (dataset.dataset.id)) ) dataset.dataset.external_extra_files_path = files_path if dataset.dataset.id in existing_job_metadata_dict: dataset.extension = existing_job_metadata_dict[dataset.dataset.id].get("ext", dataset.extension) # Metadata FileParameter types may not be writable on a cluster node, and are therefore temporarily substituted with MetadataTempFiles if override_metadata: override_metadata = json.load(open(override_metadata)) for metadata_name, metadata_file_override in override_metadata: if galaxy.datatypes.metadata.MetadataTempFile.is_JSONified_value(metadata_file_override): metadata_file_override = galaxy.datatypes.metadata.MetadataTempFile.from_JSON( metadata_file_override ) setattr(dataset.metadata, metadata_name, metadata_file_override) file_dict = existing_job_metadata_dict.get(dataset.dataset.id, {}) set_meta_with_tool_provided(dataset, file_dict, set_meta_kwds) dataset.metadata.to_JSON_dict(filename_out) # write out results of set_meta json.dump( (True, "Metadata has 
been set successfully"), open(filename_results_code, "wb+") ) # setting metadata has succeeded except Exception, e: json.dump((False, str(e)), open(filename_results_code, "wb+")) # setting metadata has failed somehow
def main(): parser = optparse.OptionParser() parser.add_option( '-b', '--buffer', dest='buffer', type='int', default=1000000, help='Number of lines to buffer at a time. Default: 1,000,000 lines. A buffer of 0 will attempt to use memory only.' ) parser.add_option( '-d', '--index_depth', dest='index_depth', type='int', default=3, help='Depth to use on file-based offset indexing. Default: 3.' ) parser.add_option( '-p', '--keep_partial', action='store_true', dest='keep_partial', default=False, help='Keep rows in first input which are missing identifiers.') parser.add_option( '-u', '--keep_unmatched', action='store_true', dest='keep_unmatched', default=False, help='Keep rows in first input which are not joined with the second input.') parser.add_option( '-f', '--fill_options_file', dest='fill_options_file', type='str', default=None, help='Fill empty columns with values from a JSONified file.') parser.add_option( '-H', '--keep_headers', action='store_true', dest='keep_headers', default=False, help='Keep the headers') options, args = parser.parse_args() fill_options = None if options.fill_options_file is not None: try: fill_options = Bunch(**stringify_dictionary_keys(json.load(open(options.fill_options_file)))) # json.load( open( options.fill_options_file ) ) except Exception as e: print("Warning: Ignoring fill options due to json error (%s)." % e) if fill_options is None: fill_options = Bunch() if 'fill_unjoined_only' not in fill_options: fill_options.fill_unjoined_only = True if 'file1_columns' not in fill_options: fill_options.file1_columns = None if 'file2_columns' not in fill_options: fill_options.file2_columns = None try: filename1 = args[0] filename2 = args[1] column1 = int(args[2]) - 1 column2 = int(args[3]) - 1 out_filename = args[4] except Exception: print("Error parsing command line.", file=sys.stderr) sys.exit() # Character for splitting fields and joining lines split = "\t" return join_files(filename1, column1, filename2, column2, out_filename, split, options.buffer, options.keep_unmatched, options.keep_partial, options.keep_headers, options.index_depth, fill_options=fill_options)
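The -f/--fill_options_file argument above is parsed with json.load and wrapped in a Bunch, and the code shown only consults the keys fill_unjoined_only, file1_columns and file2_columns. An illustrative file under those assumptions; the per-column fill-value shape expected by join_files is not shown here, so both column keys are left at their defaults:

import json

# Hypothetical fill-options file matching the keys checked above.
fill_options_example = {
    "fill_unjoined_only": True,   # only fill rows that did not join
    "file1_columns": None,
    "file2_columns": None,
}
with open("fill_options.json", "w") as fh:   # illustrative path
    json.dump(fill_options_example, fh)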
def __main__(): file_path = sys.argv.pop( 1 ) tmp_dir = sys.argv.pop( 1 ) galaxy.model.Dataset.file_path = file_path galaxy.datatypes.metadata.MetadataTempFile.tmp_dir = tmp_dir config_root = sys.argv.pop( 1 ) config_file_name = sys.argv.pop( 1 ) if not os.path.isabs( config_file_name ): config_file_name = os.path.join( config_root, config_file_name ) # Set up reference to object store # First, read in the main config file for Galaxy; this is required because # the object store configuration is stored there conf = ConfigParser.ConfigParser() conf.read(config_file_name) conf_dict = {} for section in conf.sections(): for option in conf.options(section): try: conf_dict[option] = conf.get(section, option) except ConfigParser.InterpolationMissingOptionError: # Because this is not called from Paste Script, %(here)s variable # is not initialized in the config file so skip those fields - # just need not to use any such fields for the object store conf... log.debug("Did not load option %s from %s" % (option, config_file_name)) # config object is required by ObjectStore class so create it now universe_config = config.Configuration(**conf_dict) object_store = build_object_store_from_config(universe_config) galaxy.model.Dataset.object_store = object_store # Set up datatypes registry datatypes_config = sys.argv.pop( 1 ) datatypes_registry = galaxy.datatypes.registry.Registry() datatypes_registry.load_datatypes( root_dir=config_root, config=datatypes_config ) galaxy.model.set_datatypes_registry( datatypes_registry ) job_metadata = sys.argv.pop( 1 ) ext_override = dict() if job_metadata != "None" and os.path.exists( job_metadata ): for line in open( job_metadata, 'r' ): try: line = stringify_dictionary_keys( from_json_string( line ) ) assert line['type'] == 'dataset' ext_override[line['dataset_id']] = line['ext'] except: continue for filenames in sys.argv[1:]: fields = filenames.split( ',' ) filename_in = fields.pop( 0 ) filename_kwds = fields.pop( 0 ) filename_out = fields.pop( 0 ) filename_results_code = fields.pop( 0 ) dataset_filename_override = fields.pop( 0 ) # Need to be careful with the way that these parameters are populated from the filename splitting, # because if a job is running when the server is updated, any existing external metadata command-lines #will not have info about the newly added override_metadata file if fields: override_metadata = fields.pop( 0 ) else: override_metadata = None try: dataset = cPickle.load( open( filename_in ) ) # load DatasetInstance if dataset_filename_override: dataset.dataset.external_filename = dataset_filename_override if ext_override.get( dataset.dataset.id, None ): dataset.extension = ext_override[ dataset.dataset.id ] # Metadata FileParameter types may not be writable on a cluster node, and are therefore temporarily substituted with MetadataTempFiles if override_metadata: override_metadata = json.load( open( override_metadata ) ) for metadata_name, metadata_file_override in override_metadata: if galaxy.datatypes.metadata.MetadataTempFile.is_JSONified_value( metadata_file_override ): metadata_file_override = galaxy.datatypes.metadata.MetadataTempFile.from_JSON( metadata_file_override ) setattr( dataset.metadata, metadata_name, metadata_file_override ) kwds = stringify_dictionary_keys( json.load( open( filename_kwds ) ) ) # load kwds; need to ensure our keywords are not unicode dataset.datatype.set_meta( dataset, **kwds ) dataset.metadata.to_JSON_dict( filename_out ) # write out results of set_meta json.dump( ( True, 'Metadata has been set successfully' ), 
open( filename_results_code, 'wb+' ) ) # setting metadata has succeeded except Exception, e: json.dump( ( False, str( e ) ), open( filename_results_code, 'wb+' ) ) # setting metadata has failed somehow
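In this version job_metadata is read as JSON lines, each line required to describe a dataset, and only the ext value is used to override the dataset extension. A small sketch of such content and of the parsing loop it feeds; the dataset_id and ext values are placeholders:

import json

# Hypothetical job_metadata content, one JSON object per line:
job_metadata_lines = [
    '{"type": "dataset", "dataset_id": 17, "ext": "tabular"}',
    '{"type": "dataset", "dataset_id": 18, "ext": "vcf"}',
]
ext_override = {}
for line in job_metadata_lines:
    entry = json.loads(line)
    if entry.get("type") == "dataset":
        ext_override[entry["dataset_id"]] = entry["ext"]
assert ext_override[17] == "tabular"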
def from_JSON(cls, json_dict): #need to ensure our keywords are not unicode rval = cls(**stringify_dictionary_keys(json_dict['kwds'])) rval._filename = json_dict['filename'] return rval
def from_JSON( cls, json_dict ): #need to ensure our keywords are not unicode rval = cls( **stringify_dictionary_keys( json_dict['kwds'] ) ) rval._filename = json_dict['filename'] return rval
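Both from_JSON variants expect a dict with a 'kwds' mapping (constructor keyword arguments, forced to str keys) and a 'filename'. A small usage sketch under that assumption; the concrete values are invented:

# Shape assumed by from_JSON above; values are purely illustrative.
json_dict = {
    "kwds": {},                              # constructor keyword arguments
    "filename": "metadata_temp_file_0.dat",  # placeholder temp-file name
}
# temp_file = MetadataTempFile.from_JSON(json_dict)
# temp_file._filename -> "metadata_temp_file_0.dat"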
def set_metadata(): # locate galaxy_root for loading datatypes galaxy_root = os.path.abspath( os.path.join(os.path.dirname(__file__), os.pardir, os.pardir, os.pardir)) import galaxy.model galaxy.model.metadata.MetadataTempFile.tmp_dir = tool_job_working_directory = os.path.abspath( os.getcwd()) # This is ugly, but to transition from existing jobs without this parameter # to ones with, smoothly, it has to be the last optional parameter and we # have to sniff it. try: max_metadata_value_size = int(sys.argv[-1]) sys.argv = sys.argv[:-1] except ValueError: max_metadata_value_size = 0 # max_metadata_value_size is unspecified and should be 0 # Set up datatypes registry datatypes_config = sys.argv.pop(1) if not os.path.exists(datatypes_config): # This path should exist, except for jobs that started running on release 17.05, where a global # datatypes_config (instead of a datatypes_config per job) was used. For a while release 17.05 # would remove the global datatypes config on shutdown and toolbox reload, which would lead to # failed metadata jobs. To remedy this we scan jobs at startup for missing registry.xml files, # and if we detect such a job we write out the current registry.xml file. datatypes_config = os.path.join(tool_job_working_directory, "registry.xml") if not os.path.exists(datatypes_config): print( "Metadata setting failed because registry.xml could not be found. You may retry setting metadata." ) sys.exit(1) import galaxy.datatypes.registry datatypes_registry = galaxy.datatypes.registry.Registry() datatypes_registry.load_datatypes(root_dir=galaxy_root, config=datatypes_config) galaxy.model.set_datatypes_registry(datatypes_registry) job_metadata = sys.argv.pop(1) existing_job_metadata_dict = {} new_job_metadata_dict = {} if job_metadata != "None" and os.path.exists(job_metadata): for line in open(job_metadata, 'r'): try: line = stringify_dictionary_keys(json.loads(line)) if line['type'] == 'dataset': existing_job_metadata_dict[line['dataset_id']] = line elif line['type'] == 'new_primary_dataset': new_job_metadata_dict[line['filename']] = line except Exception: continue for filenames in sys.argv[1:]: fields = filenames.split(',') filename_in = fields.pop(0) filename_kwds = fields.pop(0) filename_out = fields.pop(0) filename_results_code = fields.pop(0) dataset_filename_override = fields.pop(0) # Need to be careful with the way that these parameters are populated from the filename splitting, # because if a job is running when the server is updated, any existing external metadata command-lines # will not have info about the newly added override_metadata file if fields: override_metadata = fields.pop(0) else: override_metadata = None set_meta_kwds = stringify_dictionary_keys( json.load(open(filename_kwds)) ) # load kwds; need to ensure our keywords are not unicode try: dataset = cPickle.load(open(filename_in, 'rb')) # load DatasetInstance dataset.dataset.external_filename = dataset_filename_override files_path = os.path.abspath( os.path.join(tool_job_working_directory, "dataset_%s_files" % (dataset.dataset.id))) dataset.dataset.external_extra_files_path = files_path if dataset.dataset.id in existing_job_metadata_dict: dataset.extension = existing_job_metadata_dict[ dataset.dataset.id].get('ext', dataset.extension) # Metadata FileParameter types may not be writable on a cluster node, and are therefore temporarily substituted with MetadataTempFiles if override_metadata: override_metadata = json.load(open(override_metadata)) for metadata_name, metadata_file_override in override_metadata: 
if galaxy.datatypes.metadata.MetadataTempFile.is_JSONified_value( metadata_file_override): metadata_file_override = galaxy.datatypes.metadata.MetadataTempFile.from_JSON( metadata_file_override) setattr(dataset.metadata, metadata_name, metadata_file_override) file_dict = existing_job_metadata_dict.get(dataset.dataset.id, {}) set_meta_with_tool_provided(dataset, file_dict, set_meta_kwds, datatypes_registry) if max_metadata_value_size: for k, v in list(dataset.metadata.items()): if total_size(v) > max_metadata_value_size: log.info("Key %s too large for metadata, discarding" % k) dataset.metadata.remove_key(k) dataset.metadata.to_JSON_dict( filename_out) # write out results of set_meta json.dump((True, 'Metadata has been set successfully'), open(filename_results_code, 'wt+')) # setting metadata has succeeded except Exception as e: json.dump((False, str(e)), open(filename_results_code, 'wt+')) # setting metadata has failed somehow for i, (filename, file_dict) in enumerate(new_job_metadata_dict.items(), start=1): new_dataset_filename = os.path.join(tool_job_working_directory, "working", file_dict['filename']) new_dataset = galaxy.model.Dataset( id=-i, external_filename=new_dataset_filename) extra_files = file_dict.get('extra_files', None) if extra_files is not None: new_dataset._extra_files_path = os.path.join( tool_job_working_directory, "working", extra_files) new_dataset.state = new_dataset.states.OK new_dataset_instance = galaxy.model.HistoryDatasetAssociation( id=-i, dataset=new_dataset, extension=file_dict.get('ext', 'data')) set_meta_with_tool_provided(new_dataset_instance, file_dict, set_meta_kwds, datatypes_registry) file_dict['metadata'] = json.loads( new_dataset_instance.metadata.to_JSON_dict() ) # storing metadata in external form, need to turn back into dict, then later jsonify if existing_job_metadata_dict or new_job_metadata_dict: with open(job_metadata, 'wt') as job_metadata_fh: for value in list(existing_job_metadata_dict.values()) + list( new_job_metadata_dict.values()): job_metadata_fh.write("%s\n" % (json.dumps(value))) clear_mappers()
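The final loop above handles 'new_primary_dataset' entries from job_metadata: each is keyed by filename, attached to a synthetic dataset with a negative id, run through set_meta, and then the computed metadata is written back into the same file. A hedged illustration of one such line before and after this pass; field values are placeholders:

# Before set_metadata runs, a tool-provided line might look like:
#   {"type": "new_primary_dataset", "filename": "primary_1_report_visible_tabular",
#    "ext": "tabular", "extra_files": null}
# After the loop above, the same entry is rewritten with a "metadata" key:
#   {"type": "new_primary_dataset", "filename": "primary_1_report_visible_tabular",
#    "ext": "tabular", "extra_files": null,
#    "metadata": {"...": "values produced by set_meta"}}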