def validate(self, job_directory, string):
    regex = "^%s$" % self._expression_to_regex(job_directory, self.xml_el)
    match = compile(regex).match(string)
    validated = match is not None
    if validated:
        for group in match.groups():
            if not in_directory(group, join(job_directory.path, "inputs")) and \
                    not in_directory(group, join(job_directory.path, "outputs")):
                validated = False
                break
    return validated
def discovered_file_for_element(dataset, job_working_directory, parent_identifiers=[], collector=None):
    target_directory = discover_target_directory(getattr(collector, "directory", None), job_working_directory)
    filename = dataset.get("filename")
    error_message = dataset.get("error_message")
    if error_message is None:
        # handle link_data_only here, verify filename is in directory if not linking...
        if not dataset.get("link_data_only"):
            path = os.path.join(target_directory, filename)
            if not util.in_directory(path, target_directory):
                raise Exception(
                    "Problem with tool configuration, attempting to pull in datasets from outside working directory."
                )
        else:
            path = filename
        return DiscoveredFile(
            path,
            collector,
            JsonCollectedDatasetMatch(dataset, collector, filename, path=path, parent_identifiers=parent_identifiers),
        )
    else:
        assert "error_message" in dataset
        return DiscoveredFileError(
            dataset['error_message'],
            collector,
            JsonCollectedDatasetMatch(dataset, collector, None, parent_identifiers=parent_identifiers),
        )
def get_work_dir_outputs( self, job_wrapper, job_working_directory=None ):
    """
    Returns list of pairs (source_file, destination) describing path
    to work_dir output file and ultimate destination.
    """
    if not job_working_directory:
        job_working_directory = os.path.abspath( job_wrapper.working_directory )

    # Set up dict of dataset id --> output path; output path can be real or
    # false depending on outputs_to_working_directory
    output_paths = {}
    for dataset_path in job_wrapper.get_output_fnames():
        path = dataset_path.real_path
        if self.app.config.outputs_to_working_directory:
            path = dataset_path.false_path
        output_paths[ dataset_path.dataset_id ] = path

    output_pairs = []
    # Walk job's output associations to find and use from_work_dir attributes.
    job = job_wrapper.get_job()
    job_tool = job_wrapper.tool
    for (joda, dataset) in self._walk_dataset_outputs( job ):
        if joda and job_tool:
            hda_tool_output = job_tool.find_output_def( joda.name )
            if hda_tool_output and hda_tool_output.from_work_dir:
                # Copy from working dir to HDA.
                # TODO: move instead of copy to save time?
                source_file = os.path.join( job_working_directory, hda_tool_output.from_work_dir )
                destination = job_wrapper.get_output_destination( output_paths[ dataset.dataset_id ] )
                if in_directory( source_file, job_working_directory ):
                    output_pairs.append( ( source_file, destination ) )
                else:
                    # Security violation.
                    log.exception( "from_work_dir specified a location not in the working directory: %s, %s" % ( source_file, job_wrapper.working_directory ) )
    return output_pairs
def __in_working_directory(self, job, path, app):
    working_directory = app.object_store.get_filename(job, base_dir='job_work', dir_only=True, extra_dir=str(job.id))
    return util.in_directory(path, working_directory)
def metadata_path_on_compute(path):
    compute_path = path
    log.info(compute_tmp_dir)
    if compute_tmp_dir and tmp_dir and in_directory(path, tmp_dir):
        path_relative = os.path.relpath(path, tmp_dir)
        compute_path = os.path.join(compute_tmp_dir, path_relative)
    return compute_path
def discover_target_directory(extra_file_collector, job_working_directory):
    directory = job_working_directory
    if extra_file_collector.directory:
        directory = os.path.join(directory, extra_file_collector.directory)
        if not util.in_directory(directory, job_working_directory):
            raise Exception("Problem with tool configuration, attempting to pull in datasets from outside working directory.")
    return directory
def preprocess_volumes(volumes_raw_str, container_type):
    """Process Galaxy volume specification string to either Docker or Singularity specification.

    Galaxy allows the mount type "default_ro", which translates to ro for Docker and
    ro for Singularity iff no subdirectories are rw (Singularity does not allow ro
    parent directories with rw subdirectories).

    >>> preprocess_volumes("/a/b", DOCKER_CONTAINER_TYPE)
    ['/a/b:rw']
    >>> preprocess_volumes("/a/b:ro,/a/b/c:rw", DOCKER_CONTAINER_TYPE)
    ['/a/b:ro', '/a/b/c:rw']
    >>> preprocess_volumes("/a/b:/a:ro,/a/b/c:/a/b:rw", DOCKER_CONTAINER_TYPE)
    ['/a/b:/a:ro', '/a/b/c:/a/b:rw']
    >>> preprocess_volumes("/a/b:default_ro,/a/b/c:rw", DOCKER_CONTAINER_TYPE)
    ['/a/b:ro', '/a/b/c:rw']
    >>> preprocess_volumes("/a/b:default_ro,/a/b/c:ro", SINGULARITY_CONTAINER_TYPE)
    ['/a/b:ro', '/a/b/c:ro']
    >>> preprocess_volumes("/a/b:default_ro,/a/b/c:rw", SINGULARITY_CONTAINER_TYPE)
    ['/a/b', '/a/b/c']
    """
    volumes_raw_strs = [v.strip() for v in volumes_raw_str.split(",")]
    volumes = []
    rw_paths = []

    for volume_raw_str in volumes_raw_strs:
        volume_parts = volume_raw_str.split(":")
        if len(volume_parts) > 3:
            raise Exception("Unparsable volumes string in configuration [%s]" % volumes_raw_str)
        if len(volume_parts) == 3:
            volume_parts = ["{}:{}".format(volume_parts[0], volume_parts[1]), volume_parts[2]]
        if len(volume_parts) == 2 and volume_parts[1] not in ("rw", "ro", "default_ro"):
            volume_parts = ["{}:{}".format(volume_parts[0], volume_parts[1]), "rw"]
        if len(volume_parts) == 1:
            volume_parts.append("rw")
        volumes.append(volume_parts)
        if volume_parts[1] == "rw":
            rw_paths.append(volume_parts[0])

    for volume in volumes:
        path = volume[0]
        how = volume[1]

        if how == "default_ro":
            how = "ro"
            if container_type == SINGULARITY_CONTAINER_TYPE:
                for rw_path in rw_paths:
                    if in_directory(rw_path, path):
                        how = "rw"

        volume[1] = how

        # for a while singularity did not allow to specify the bind type rw
        # (which is the default). so we omit this default
        # see https://github.com/hpcng/singularity/pull/5487
        if container_type == SINGULARITY_CONTAINER_TYPE and volume[1] == 'rw':
            del volume[1]

    return [":".join(v) for v in volumes]
def discover_target_directory(dir_name, job_working_directory):
    if dir_name:
        directory = os.path.join(job_working_directory, dir_name)
        if not util.in_directory(directory, job_working_directory):
            raise Exception("Problem with tool configuration, attempting to pull in datasets from outside working directory.")
        return directory
    else:
        return job_working_directory
def discover_target_directory(dir_name, job_working_directory):
    if dir_name:
        directory = os.path.join(job_working_directory, dir_name)
        if not util.in_directory(directory, job_working_directory):
            raise Exception(
                "Problem with tool configuration, attempting to pull in datasets from outside working directory."
            )
        return directory
    else:
        return job_working_directory
def discovered_file_for_unnamed_output(dataset, job_working_directory, parent_identifiers=[]):
    extra_file_collector = DEFAULT_TOOL_PROVIDED_DATASET_COLLECTOR
    target_directory = discover_target_directory(extra_file_collector.directory, job_working_directory)
    filename = dataset["filename"]
    # handle link_data_only here, verify filename is in directory if not linking...
    if not dataset.get("link_data_only"):
        path = os.path.join(target_directory, filename)
        if not util.in_directory(path, target_directory):
            raise Exception("Problem with tool configuration, attempting to pull in datasets from outside working directory.")
    else:
        path = filename
    return DiscoveredFile(path, extra_file_collector, JsonCollectedDatasetMatch(dataset, extra_file_collector, filename, path=path, parent_identifiers=parent_identifiers))
def get_work_dir_outputs(self, job_wrapper, job_working_directory=None):
    """
    Returns list of pairs (source_file, destination) describing path
    to work_dir output file and ultimate destination.
    """
    if not job_working_directory:
        job_working_directory = os.path.abspath(job_wrapper.working_directory)

    # Set up dict of dataset id --> output path; output path can be real or
    # false depending on outputs_to_working_directory
    output_paths = {}
    for dataset_path in job_wrapper.get_output_fnames():
        path = dataset_path.real_path
        if self.app.config.outputs_to_working_directory:
            path = dataset_path.false_path
        output_paths[dataset_path.dataset_id] = path

    output_pairs = []
    # Walk job's output associations to find and use from_work_dir attributes.
    job = job_wrapper.get_job()
    job_tool = job_wrapper.tool
    for dataset_assoc in job.output_datasets + job.output_library_datasets:
        for dataset in dataset_assoc.dataset.dataset.history_associations + dataset_assoc.dataset.dataset.library_associations:
            if isinstance(dataset, self.app.model.HistoryDatasetAssociation):
                joda = self.sa_session.query(self.app.model.JobToOutputDatasetAssociation).filter_by(job=job, dataset=dataset).first()
                if joda and job_tool:
                    hda_tool_output = job_tool.outputs.get(joda.name, None)
                    if hda_tool_output and hda_tool_output.from_work_dir:
                        # Copy from working dir to HDA.
                        # TODO: move instead of copy to save time?
                        source_file = os.path.join(job_working_directory, hda_tool_output.from_work_dir)
                        destination = job_wrapper.get_output_destination(output_paths[dataset.dataset_id])
                        if in_directory(source_file, job_working_directory):
                            output_pairs.append((source_file, destination))
                        else:
                            # Security violation.
                            log.exception("from_work_dir specified a location not in the working directory: %s, %s" % (source_file, job_wrapper.working_directory))
    return output_pairs
def __is_output_dataset_path(self, job, path):
    """ Check if the given path is an output path for this job or a file
    in an output's extra files path. """
    da_lists = [job.output_datasets, job.output_library_datasets]
    for da_list in da_lists:
        for job_dataset_association in da_list:
            dataset = job_dataset_association.dataset
            if not dataset:
                continue
            if os.path.abspath(dataset.file_name) == os.path.abspath(path):
                return True
            elif util.in_directory(path, dataset.extra_files_path):
                return True
    return False
def walk_over_extra_files( extra_file_collectors, job_working_directory, matchable ):
    for extra_file_collector in extra_file_collectors:
        directory = job_working_directory
        if extra_file_collector.directory:
            directory = os.path.join( directory, extra_file_collector.directory )
            if not util.in_directory( directory, job_working_directory ):
                raise Exception( "Problem with tool configuration, attempting to pull in datasets from outside working directory." )
        if not os.path.isdir( directory ):
            continue
        for filename in sorted( os.listdir( directory ) ):
            path = os.path.join( directory, filename )
            if not os.path.isfile( path ):
                continue
            if extra_file_collector.match( matchable, filename ):
                yield path, extra_file_collector
def __is_output_dataset_path( self, job, path ):
    """ Check if the given path is an output path for this job or a file
    in an output's extra files path. """
    da_lists = [ job.output_datasets, job.output_library_datasets ]
    for da_list in da_lists:
        for job_dataset_association in da_list:
            dataset = job_dataset_association.dataset
            if not dataset:
                continue
            if os.path.abspath( dataset.file_name ) == os.path.abspath( path ):
                return True
            elif util.in_directory( path, dataset.extra_files_path ):
                return True
    return False
def ensure_in_working_directory(self, path, purge_source, in_place):
    if in_directory(path, self.__workdir):
        return path

    new_path = self.__new_dataset_path()
    if purge_source:
        try:
            shutil.move(path, new_path)
        except OSError as e:
            # We may not have permission to remove converted_path
            if e.errno != errno.EACCES:
                raise
    else:
        shutil.copy(path, new_path)

    return new_path
def preprocess_volumes(volumes_raw_str, container_type):
    """Process Galaxy volume specification string to either Docker or Singularity specification.

    Galaxy allows the mount type "default_ro", which translates to ro for Docker and
    ro for Singularity iff no subdirectories are rw (Singularity does not allow ro
    parent directories with rw subdirectories).

    >>> preprocess_volumes("/a/b", DOCKER_CONTAINER_TYPE)
    ['/a/b:rw']
    >>> preprocess_volumes("/a/b:ro,/a/b/c:rw", DOCKER_CONTAINER_TYPE)
    ['/a/b:ro', '/a/b/c:rw']
    >>> preprocess_volumes("/a/b:default_ro,/a/b/c:rw", DOCKER_CONTAINER_TYPE)
    ['/a/b:ro', '/a/b/c:rw']
    >>> preprocess_volumes("/a/b:default_ro,/a/b/c:rw", SINGULARITY_CONTAINER_TYPE)
    ['/a/b:rw', '/a/b/c:rw']
    """
    volumes_raw_strs = [v.strip() for v in volumes_raw_str.split(",")]
    volumes = []
    rw_paths = []

    for volume_raw_str in volumes_raw_strs:
        volume_parts = volume_raw_str.split(":")
        if len(volume_parts) > 2:
            raise Exception("Unparsable volumes string in configuration [%s]" % volumes_raw_str)
        if len(volume_parts) == 1:
            volume_parts.append("rw")
        volumes.append(volume_parts)
        if volume_parts[1] == "rw":
            rw_paths.append(volume_parts[0])

    for volume in volumes:
        path = volume[0]
        how = volume[1]

        if how == "default_ro":
            how = "ro"
            if container_type == SINGULARITY_CONTAINER_TYPE:
                for rw_path in rw_paths:
                    if in_directory(rw_path, path):
                        how = "rw"

        volume[1] = how

    return [":".join(v) for v in volumes]
def preprocess_volumes(volumes_raw_str, container_type):
    """Process Galaxy volume specification string to either Docker or Singularity specification.

    Galaxy allows the mount type "default_ro", which translates to ro for Docker and
    ro for Singularity iff no subdirectories are rw (Singularity does not allow ro
    parent directories with rw subdirectories).

    >>> preprocess_volumes("/a/b", DOCKER_CONTAINER_TYPE)
    '/a/b:rw'
    >>> preprocess_volumes("/a/b:ro,/a/b/c:rw", DOCKER_CONTAINER_TYPE)
    '/a/b:ro,/a/b/c:rw'
    >>> preprocess_volumes("/a/b:default_ro,/a/b/c:rw", DOCKER_CONTAINER_TYPE)
    '/a/b:ro,/a/b/c:rw'
    >>> preprocess_volumes("/a/b:default_ro,/a/b/c:rw", SINGULARITY_CONTAINER_TYPE)
    '/a/b:rw,/a/b/c:rw'
    """
    volumes_raw_strs = [v.strip() for v in volumes_raw_str.split(",")]
    volumes = []
    rw_paths = []

    for volume_raw_str in volumes_raw_strs:
        volume_parts = volume_raw_str.split(":")
        if len(volume_parts) > 2:
            raise Exception("Unparsable volumes string in configuration [%s]" % volumes_raw_str)
        if len(volume_parts) == 1:
            volume_parts.append("rw")
        volumes.append(volume_parts)
        if volume_parts[1] == "rw":
            rw_paths.append(volume_parts[0])

    for volume in volumes:
        path = volume[0]
        how = volume[1]

        if how == "default_ro":
            how = "ro"
            if container_type == SINGULARITY_CONTAINER_TYPE:
                for rw_path in rw_paths:
                    if in_directory(rw_path, path):
                        how = "rw"

        volume[1] = how

    return ",".join([":".join(v) for v in volumes])
def walk_over_extra_files(extra_file_collectors, job_working_directory, matchable):
    for extra_file_collector in extra_file_collectors:
        directory = job_working_directory
        if extra_file_collector.directory:
            directory = os.path.join(directory, extra_file_collector.directory)
            if not util.in_directory(directory, job_working_directory):
                raise Exception(
                    "Problem with tool configuration, attempting to pull in datasets from outside working directory."
                )
        if not os.path.isdir(directory):
            continue
        for filename in sorted(os.listdir(directory)):
            path = os.path.join(directory, filename)
            if not os.path.isfile(path):
                continue
            if extra_file_collector.match(matchable, filename):
                yield path, extra_file_collector
def ensure_in_working_directory(self, path, purge_source, in_place):
    if in_directory(path, self.__workdir):
        return path

    new_path = self.__new_dataset_path()
    if purge_source:
        try:
            shutil.move(path, new_path)
            # Drop .info file if it exists
            try:
                os.remove(f"{path}.info")
            except FileNotFoundError:
                pass
        except OSError as e:
            # We may not have permission to remove converted_path
            if e.errno != errno.EACCES:
                raise
    else:
        shutil.copy(path, new_path)

    return new_path
def get_work_dir_outputs( self, job_wrapper, job_working_directory=None ):
    """
    Returns list of pairs (source_file, destination) describing path
    to work_dir output file and ultimate destination.
    """
    if not job_working_directory:
        job_working_directory = os.path.abspath( job_wrapper.working_directory )

    # Set up dict of dataset id --> output path; output path can be real or
    # false depending on outputs_to_working_directory
    output_paths = {}
    for dataset_path in job_wrapper.get_output_fnames():
        path = dataset_path.real_path
        if self.app.config.outputs_to_working_directory:
            path = dataset_path.false_path
        output_paths[ dataset_path.dataset_id ] = path

    output_pairs = []
    # Walk job's output associations to find and use from_work_dir attributes.
    job = job_wrapper.get_job()
    job_tool = self.app.toolbox.tools_by_id.get( job.tool_id, None )
    for dataset_assoc in job.output_datasets + job.output_library_datasets:
        for dataset in dataset_assoc.dataset.dataset.history_associations + dataset_assoc.dataset.dataset.library_associations:
            if isinstance( dataset, self.app.model.HistoryDatasetAssociation ):
                joda = self.sa_session.query( self.app.model.JobToOutputDatasetAssociation ).filter_by( job=job, dataset=dataset ).first()
                if joda and job_tool:
                    hda_tool_output = job_tool.outputs.get( joda.name, None )
                    if hda_tool_output and hda_tool_output.from_work_dir:
                        # Copy from working dir to HDA.
                        # TODO: move instead of copy to save time?
                        source_file = os.path.join( job_working_directory, hda_tool_output.from_work_dir )
                        destination = job_wrapper.get_output_destination( output_paths[ dataset.dataset_id ] )
                        if in_directory( source_file, job_working_directory ):
                            output_pairs.append( ( source_file, destination ) )
                            log.debug( "Copying %s to %s as directed by from_work_dir" % ( source_file, destination ) )
                        else:
                            # Security violation.
                            log.exception( "from_work_dir specified a location not in the working directory: %s, %s" % ( source_file, job_wrapper.working_directory ) )
    return output_pairs
def discovered_file_for_unnamed_output(dataset, job_working_directory, parent_identifiers=[]):
    extra_file_collector = DEFAULT_TOOL_PROVIDED_DATASET_COLLECTOR
    target_directory = discover_target_directory(extra_file_collector.directory, job_working_directory)
    filename = dataset["filename"]
    # handle link_data_only here, verify filename is in directory if not linking...
    if not dataset.get("link_data_only"):
        path = os.path.join(target_directory, filename)
        if not util.in_directory(path, target_directory):
            raise Exception(
                "Problem with tool configuration, attempting to pull in datasets from outside working directory."
            )
    else:
        path = filename
    return DiscoveredFile(
        path,
        extra_file_collector,
        JsonCollectedDatasetMatch(dataset, extra_file_collector, filename, path=path, parent_identifiers=parent_identifiers),
    )
def remote_output_path_rewrite(self, local_path):
    output_type = path_type.OUTPUT
    if in_directory(local_path, self.local_working_directory):
        output_type = path_type.OUTPUT_WORKDIR
    remote_path = self.__remote_path_rewrite(local_path, output_type)
    return remote_path
def exists(self, filename):
    path = os.path.abspath(self.path(filename))
    return os.path.exists(path) and in_directory(path, self.file_dir)
def __is_allowed_temp_dir_file(self, app, job, path):
    # grrr.. need to get away from new_file_path - these should be written
    # to job working directory like metadata files.
    in_temp_dir = util.in_directory(path, app.config.new_file_path)
    return in_temp_dir and os.path.split(path)[-1].startswith("GALAXY_VERSION_")
def metadata_path_on_compute(path):
    compute_path = path
    if compute_tmp_dir and tmp_dir and in_directory(path, tmp_dir):
        path_relative = os.path.relpath(path, tmp_dir)
        compute_path = os.path.join(compute_tmp_dir, path_relative)
    return compute_path
def __in_working_directory( self, job, path, app ):
    working_directory = app.object_store.get_filename(job, base_dir='job_work', dir_only=True, extra_dir=str(job.id))
    return util.in_directory( path, working_directory )
def __is_allowed_temp_dir_file( self, app, job, path ):
    # grrr.. need to get away from new_file_path - these should be written
    # to job working directory like metadata files.
    in_temp_dir = util.in_directory( path, app.config.new_file_path )
    return in_temp_dir and os.path.split( path )[ -1 ].startswith( "GALAXY_VERSION_" )
def verify_is_in_directory(path, directory, local_path_module=os.path):
    if not in_directory(path, directory, local_path_module):
        msg = "Attempt to read or write file outside an authorized directory."
        log.warn("%s Attempted path: %s, valid directory: %s" % (msg, path, directory))
        raise Exception(msg)
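# Every snippet in this collection delegates the actual containment test to
# in_directory / util.in_directory, which is not shown here. The function below
# is only a minimal stand-in sketch of such a check (resolve symlinks with
# realpath, then test that the candidate path sits under the directory); it is
# an illustrative assumption, not Galaxy's actual galaxy.util.in_directory.
import os


def in_directory_sketch(file, directory, local_path_module=os.path):
    """Return True if `file` resolves to a location inside `directory`."""
    # Resolve symlinks so a link pointing outside the directory is rejected.
    directory = local_path_module.realpath(directory)
    file = local_path_module.realpath(file)
    # Compare against the directory path with a trailing separator so that
    # "/jobs/1-evil" does not count as being inside "/jobs/1".
    return file == directory or file.startswith(directory.rstrip(os.sep) + os.sep)


# Example: a path that escapes via ".." is rejected.
# in_directory_sketch("/jobs/1/outputs/../../etc/passwd", "/jobs/1/outputs")  -> False
# in_directory_sketch("/jobs/1/outputs/data.tsv", "/jobs/1/outputs")          -> True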
def collect_primary_datasets( tool, output, job_working_directory, input_ext ):
    app = tool.app
    sa_session = tool.sa_session
    new_primary_datasets = {}
    try:
        json_file = open( os.path.join( job_working_directory, jobs.TOOL_PROVIDED_JOB_METADATA_FILE ), 'r' )
        for line in json_file:
            line = json.loads( line )
            if line.get( 'type' ) == 'new_primary_dataset':
                new_primary_datasets[ os.path.split( line.get( 'filename' ) )[-1] ] = line
    except Exception:
        # This should not be considered an error or warning condition, this file is optional
        pass
    # Loop through output file names, looking for generated primary
    # datasets in form of:
    #     'primary_associatedWithDatasetID_designation_visibility_extension(_DBKEY)'
    primary_datasets = {}
    for name, outdata in output.items():
        dataset_collectors = tool.outputs[ name ].dataset_collectors if name in tool.outputs else [ DEFAULT_DATASET_COLLECTOR ]
        filenames = odict.odict()
        if 'new_file_path' in app.config.collect_outputs_from:
            if DEFAULT_DATASET_COLLECTOR in dataset_collectors:
                # 'new_file_path' collection should be considered deprecated,
                # only use old-style matching (glob instead of regex and only
                # using default collector - if enabled).
                for filename in glob.glob( os.path.join( app.config.new_file_path, "primary_%i_*" % outdata.id ) ):
                    filenames[ filename ] = DEFAULT_DATASET_COLLECTOR
        if 'job_working_directory' in app.config.collect_outputs_from:
            for extra_file_collector in dataset_collectors:
                directory = job_working_directory
                if extra_file_collector.directory:
                    directory = os.path.join( directory, extra_file_collector.directory )
                    if not util.in_directory( directory, job_working_directory ):
                        raise Exception( "Problem with tool configuration, attempting to pull in datasets from outside working directory." )
                if not os.path.isdir( directory ):
                    continue
                for filename in os.listdir( directory ):
                    path = os.path.join( directory, filename )
                    if not os.path.isfile( path ):
                        continue
                    if extra_file_collector.match( outdata, filename ):
                        filenames[ path ] = extra_file_collector
        for filename, extra_file_collector in filenames.iteritems():
            if name not in primary_datasets:
                primary_datasets[ name ] = {}
            fields_match = extra_file_collector.match( outdata, os.path.basename( filename ) )
            if not fields_match:
                # Before I guess pop() would just have thrown an IndexError
                raise Exception( "Problem parsing metadata fields for file %s" % filename )
            designation = fields_match.designation
            visible = fields_match.visible
            ext = fields_match.ext
            if ext == "input":
                ext = input_ext
            dbkey = fields_match.dbkey
            # Create new primary dataset
            primary_data = app.model.HistoryDatasetAssociation( extension=ext,
                                                                designation=designation,
                                                                visible=visible,
                                                                dbkey=dbkey,
                                                                create_dataset=True,
                                                                sa_session=sa_session )
            app.security_agent.copy_dataset_permissions( outdata.dataset, primary_data.dataset )
            sa_session.add( primary_data )
            sa_session.flush()
            # Move data from temp location to dataset location
            app.object_store.update_from_file( primary_data.dataset, file_name=filename, create=True )
            primary_data.set_size()
            # If the match specified a name, use it; otherwise generate one
            # from the designation.
            primary_data.name = fields_match.name or "%s (%s)" % ( outdata.name, designation )
            primary_data.info = outdata.info
            primary_data.init_meta( copy_from=outdata )
            primary_data.dbkey = dbkey
            # Associate new dataset with job
            job = None
            for assoc in outdata.creating_job_associations:
                job = assoc.job
                break
            if job:
                assoc = app.model.JobToOutputDatasetAssociation( '__new_primary_file_%s|%s__' % ( name, designation ), primary_data )
                assoc.job = job
                sa_session.add( assoc )
                sa_session.flush()
            primary_data.state = outdata.state
            # add tool/metadata provided information
            new_primary_datasets_attributes = new_primary_datasets.get( os.path.split( filename )[-1], {} )
            if new_primary_datasets_attributes:
                dataset_att_by_name = dict( ext='extension' )
                for att_set in [ 'name', 'info', 'ext', 'dbkey' ]:
                    dataset_att_name = dataset_att_by_name.get( att_set, att_set )
                    setattr( primary_data, dataset_att_name, new_primary_datasets_attributes.get( att_set, getattr( primary_data, dataset_att_name ) ) )
                extra_files_path = new_primary_datasets_attributes.get( 'extra_files', None )
                if extra_files_path:
                    extra_files_path_joined = os.path.join( job_working_directory, extra_files_path )
                    for root, dirs, files in os.walk( extra_files_path_joined ):
                        extra_dir = os.path.join( primary_data.extra_files_path, root.replace( extra_files_path_joined, '', 1 ).lstrip( os.path.sep ) )
                        for f in files:
                            app.object_store.update_from_file( primary_data.dataset,
                                                               extra_dir=extra_dir,
                                                               alt_name=f,
                                                               file_name=os.path.join( root, f ),
                                                               create=True,
                                                               dir_only=True,
                                                               preserve_symlinks=True )
                    # FIXME:
                    # since these are placed into the job working dir, let the standard
                    # Galaxy cleanup methods handle this (for now?)
                    # there was an extra_files_path dir, attempt to remove it
                    # shutil.rmtree( extra_files_path_joined )
            metadata_dict = new_primary_datasets_attributes.get( 'metadata', None )
            if metadata_dict:
                primary_data.metadata.from_JSON_dict( json_dict=metadata_dict )
            else:
                primary_data.set_meta()
                primary_data.set_peek()
            sa_session.add( primary_data )
            sa_session.flush()
            outdata.history.add_dataset( primary_data )
            # Add dataset to return dict
            primary_datasets[ name ][ designation ] = primary_data
            # Need to update all associated output hdas, i.e. history was
            # shared with job running
            for dataset in outdata.dataset.history_associations:
                if outdata == dataset:
                    continue
                new_data = primary_data.copy()
                dataset.history.add_dataset( new_data )
                sa_session.add( new_data )
                sa_session.flush()
    return primary_datasets
def collect_primary_datasets(tool, output, job_working_directory, input_ext):
    app = tool.app
    sa_session = tool.sa_session
    new_primary_datasets = {}
    try:
        json_file = open(os.path.join(job_working_directory, jobs.TOOL_PROVIDED_JOB_METADATA_FILE), 'r')
        for line in json_file:
            line = json.loads(line)
            if line.get('type') == 'new_primary_dataset':
                new_primary_datasets[os.path.split(line.get('filename'))[-1]] = line
    except Exception:
        # This should not be considered an error or warning condition, this file is optional
        pass
    # Loop through output file names, looking for generated primary
    # datasets in form of:
    #     'primary_associatedWithDatasetID_designation_visibility_extension(_DBKEY)'
    primary_datasets = {}
    for name, outdata in output.items():
        dataset_collectors = tool.outputs[name].dataset_collectors if name in tool.outputs else [DEFAULT_DATASET_COLLECTOR]
        filenames = odict.odict()
        if 'new_file_path' in app.config.collect_outputs_from:
            if DEFAULT_DATASET_COLLECTOR in dataset_collectors:
                # 'new_file_path' collection should be considered deprecated,
                # only use old-style matching (glob instead of regex and only
                # using default collector - if enabled).
                for filename in glob.glob(os.path.join(app.config.new_file_path, "primary_%i_*" % outdata.id)):
                    filenames[filename] = DEFAULT_DATASET_COLLECTOR
        if 'job_working_directory' in app.config.collect_outputs_from:
            for extra_file_collector in dataset_collectors:
                directory = job_working_directory
                if extra_file_collector.directory:
                    directory = os.path.join(directory, extra_file_collector.directory)
                    if not util.in_directory(directory, job_working_directory):
                        raise Exception(
                            "Problem with tool configuration, attempting to pull in datasets from outside working directory."
                        )
                if not os.path.isdir(directory):
                    continue
                for filename in os.listdir(directory):
                    path = os.path.join(directory, filename)
                    if not os.path.isfile(path):
                        continue
                    if extra_file_collector.match(outdata, filename):
                        filenames[path] = extra_file_collector
        for filename, extra_file_collector in filenames.iteritems():
            if name not in primary_datasets:
                primary_datasets[name] = {}
            fields_match = extra_file_collector.match(outdata, os.path.basename(filename))
            if not fields_match:
                # Before I guess pop() would just have thrown an IndexError
                raise Exception("Problem parsing metadata fields for file %s" % filename)
            designation = fields_match.designation
            visible = fields_match.visible
            ext = fields_match.ext
            if ext == "input":
                ext = input_ext
            dbkey = fields_match.dbkey
            # Create new primary dataset
            primary_data = app.model.HistoryDatasetAssociation(extension=ext,
                                                               designation=designation,
                                                               visible=visible,
                                                               dbkey=dbkey,
                                                               create_dataset=True,
                                                               sa_session=sa_session)
            app.security_agent.copy_dataset_permissions(outdata.dataset, primary_data.dataset)
            sa_session.add(primary_data)
            sa_session.flush()
            # Move data from temp location to dataset location
            app.object_store.update_from_file(primary_data.dataset, file_name=filename, create=True)
            primary_data.set_size()
            # If the match specified a name, use it; otherwise generate one
            # from the designation.
            primary_data.name = fields_match.name or "%s (%s)" % (outdata.name, designation)
            primary_data.info = outdata.info
            primary_data.init_meta(copy_from=outdata)
            primary_data.dbkey = dbkey
            # Associate new dataset with job
            job = None
            for assoc in outdata.creating_job_associations:
                job = assoc.job
                break
            if job:
                assoc = app.model.JobToOutputDatasetAssociation('__new_primary_file_%s|%s__' % (name, designation), primary_data)
                assoc.job = job
                sa_session.add(assoc)
                sa_session.flush()
            primary_data.state = outdata.state
            # add tool/metadata provided information
            new_primary_datasets_attributes = new_primary_datasets.get(os.path.split(filename)[-1], {})
            if new_primary_datasets_attributes:
                dataset_att_by_name = dict(ext='extension')
                for att_set in ['name', 'info', 'ext', 'dbkey']:
                    dataset_att_name = dataset_att_by_name.get(att_set, att_set)
                    setattr(primary_data, dataset_att_name, new_primary_datasets_attributes.get(att_set, getattr(primary_data, dataset_att_name)))
                extra_files_path = new_primary_datasets_attributes.get('extra_files', None)
                if extra_files_path:
                    extra_files_path_joined = os.path.join(job_working_directory, extra_files_path)
                    for root, dirs, files in os.walk(extra_files_path_joined):
                        extra_dir = os.path.join(primary_data.extra_files_path, root.replace(extra_files_path_joined, '', 1).lstrip(os.path.sep))
                        for f in files:
                            app.object_store.update_from_file(primary_data.dataset,
                                                              extra_dir=extra_dir,
                                                              alt_name=f,
                                                              file_name=os.path.join(root, f),
                                                              create=True,
                                                              dir_only=True,
                                                              preserve_symlinks=True)
                    # FIXME:
                    # since these are placed into the job working dir, let the standard
                    # Galaxy cleanup methods handle this (for now?)
                    # there was an extra_files_path dir, attempt to remove it
                    # shutil.rmtree(extra_files_path_joined)
            metadata_dict = new_primary_datasets_attributes.get('metadata', None)
            if metadata_dict:
                primary_data.metadata.from_JSON_dict(json_dict=metadata_dict)
            else:
                primary_data.set_meta()
                primary_data.set_peek()
            sa_session.add(primary_data)
            sa_session.flush()
            outdata.history.add_dataset(primary_data)
            # Add dataset to return dict
            primary_datasets[name][designation] = primary_data
            # Need to update all associated output hdas, i.e. history was
            # shared with job running
            for dataset in outdata.dataset.history_associations:
                if outdata == dataset:
                    continue
                new_data = primary_data.copy()
                dataset.history.add_dataset(new_data)
                sa_session.add(new_data)
                sa_session.flush()
    return primary_datasets