def resolve_tool(
    tool: Union[str, j.CommandTool, Type[j.CommandTool], j.Workflow, Type[j.Workflow]],
    name=None,
    from_toolshed=False,
    force=False,
    only_toolbox=False,
):
    # Already-instantiated tools pass straight through; classes get instantiated
    if isinstance(tool, j.Tool):
        return tool
    elif isclass(tool) and issubclass(tool, (j.Workflow, j.Tool)):
        return tool()

    if not isinstance(tool, str):
        raise TypeError(
            f"Janis is not sure how to resolve a workflow of type: '{type(tool)}'"
        )

    if not only_toolbox:
        fileschemewherelocated = FileScheme.get_type_by_prefix(tool.lower())
        if fileschemewherelocated:
            Logger.info(
                f"Detected remote workflow to localise from '{fileschemewherelocated.__name__}'"
            )
            # Get some unique name for the workflow
            import hashlib

            fn = hashlib.md5(tool.lower().encode()).hexdigest() + ".py"
            outdir = os.path.join(JanisConfiguration.manager().configdir, "cached")
            os.makedirs(outdir, exist_ok=True)
            dest = os.path.join(outdir, fn)
            Logger.log(f"Localising '{tool}' to '{dest}'")

            fileschemewherelocated("internal").cp_from(
                source=tool.lower(),
                dest=dest,
                report_progress=lambda progress: print(f"Download progress: {progress}"),
                force=force,
            )
            tool = dest

        wf = get_janis_workflow_from_searchname(
            tool, ".", name=name, include_commandtools=True
        )
        if wf:
            return wf

    if from_toolshed:
        v = None
        if ":" in tool:
            # split "toolname:version" so the shed lookup receives the bare name
            tool, v = tool.split(":", 1)

        return j.JanisShed.get_tool(tool, v)
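# Usage sketch for resolve_tool (illustrative, not from the source above;
# assumes janis_core is imported as `j` and "hello" names a registered tool):
#
#     resolve_tool("hello", from_toolshed=True)          # toolbox / toolshed lookup
#     resolve_tool("hello:v1.0.0", from_toolshed=True)   # pinned "name:version"
#     resolve_tool("gs://bucket/wf.py")                  # remote file, localised first
#     resolve_tool(MyWorkflow)                           # a Workflow subclass is instantiated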
def _download_remote_files(self, test_logic: TTestExpectedOutput):
    """
    Download remote test files (only expected output files) to a cache directory

    :param test_logic: an object that holds information about an expected output
    :type test_logic: TTestExpectedOutput
    :return: None
    :rtype: None
    """
    file_attributes = ["expected_file", "file_diff_source"]

    for att in file_attributes:
        if not hasattr(test_logic, att):
            raise Exception(f"{test_logic.__class__} has no attribute {att}")

        source = getattr(test_logic, att)
        if not source:
            continue

        test_helpers.verify_janis_assistant_installed()
        from janis_assistant.management.filescheme import FileScheme

        if FileScheme.is_local_path(source):
            continue

        fs = FileScheme.get_filescheme_for_url(source)
        last_modified = fs.last_modified(source)
        local_file_path = os.path.join(
            self.cached_input_files_dir,
            f"{test_helpers.hash_filename(source, last_modified)}_{os.path.basename(source)}",
        )

        # Only download if the file does not already exist
        if not os.path.exists(local_file_path):
            Logger.info(f"Downloading remote file to {local_file_path}")
            os.makedirs(self.cached_input_files_dir, exist_ok=True)
            fs.cp_from(source, local_file_path)
        else:
            Logger.info(
                f"Skipping download; file {source} already exists at {local_file_path}"
            )

        # point the test at the local copy
        setattr(test_logic, att, local_file_path)
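# Sketch of the caching behaviour (hedged: `runner` stands for an instance of
# the enclosing test-runner class, and the URL/fields are illustrative):
#
#     t = TTestExpectedOutput(tag="out", ..., expected_file="gs://bucket/expected.txt")
#     runner._download_remote_files(t)
#     # t.expected_file now points at a hashed copy under runner.cached_input_files_dir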
@classmethod
def generate_file_path(cls, source: str, dest_dir: str):
    # Cache entries are named "<hash(source, mtime)>_<basename>", so a remote
    # file that changes upstream maps to a fresh cache entry
    fs = FileScheme.get_type_by_prefix(source)()
    date_modified = fs.last_modified(source)
    local_filename = (
        f"{cls.hash_filename(source, date_modified)}_{os.path.basename(source)}"
    )
    return os.path.join(dest_dir, local_filename)
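# Naming sketch (hedged; the hash prefix is illustrative):
#
#     generate_file_path("https://example.com/ref.fasta", "/tmp/cache")
#     # -> "/tmp/cache/<hash_of_source_and_mtime>_ref.fasta"
#
# Hashing the source URL together with its last-modified time means a changed
# remote file lands in a new cache slot instead of silently reusing a stale one.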
def insert_inputs_from_dict(
    self,
    inputs: dict,
    run_id: str = RunModel.DEFAULT_ID,
    file_input_ids: Set[str] = None,
):
    if file_input_ids is None:
        file_input_ids = set()

    return self.insert_or_update_many(
        [
            WorkflowInputModel(
                id_=k,
                submission_id=self.submission_id,
                run_id=run_id,
                value=v,
                # only look up a size for inputs flagged as files
                size=(
                    FileScheme.get_type_by_prefix(v).get_file_size(v)
                    if k in file_input_ids
                    else None
                ),
            )
            for k, v in inputs.items()
        ]
    )
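# Sketch (hedged: `db` stands for an instance of the enclosing database
# provider; the input names and URL are illustrative):
#
#     db.insert_inputs_from_dict(
#         {"sample_name": "NA12878", "reads": "gs://bucket/reads.bam"},
#         file_input_ids={"reads"},
#     )
#     # only "reads" gets a remote size lookup; "sample_name" is stored with size=None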
@staticmethod
def check_base_with_type(inp: TInput, intype: DataType, val, suffix=""):
    doesnt_exist = {}

    # Arrays recurse per element, appending "[idx]" to the reported input id
    if isinstance(intype, Array):
        subtype = intype.subtype()
        if not isinstance(val, list):
            raise Exception(
                f"Expected {inp.id()} to be a list, but {str(val)} was a {type(val)}"
            )
        for idx, innerval in enumerate(val):
            nsuffix = f"{suffix}[{idx}]"
            doesnt_exist.update(
                InputChecker.check_base_with_type(
                    inp, subtype, innerval, suffix=nsuffix
                )
            )
        return doesnt_exist

    inpid = inp.id() + suffix

    if isinstance(val, list):
        raise Exception(f"Expected singular item for {inp.id()}, received list.")

    fs = FileScheme.get_filescheme_for_url(val)
    if not fs.exists(val):
        doesnt_exist[inpid] = val

    if not isinstance(intype, File):
        return doesnt_exist

    InputChecker.check_extensions(inpid, intype, val)

    secs = intype.secondary_files() or []
    for sec in secs:
        sec_filename = apply_secondary_file_format_to_filename(val, sec)
        if not InputChecker.check_if_input_exists(fs, sec_filename):
            secsuffix = sec.replace("^", "").replace(".", "")
            doesnt_exist[inp.id() + "_" + secsuffix + suffix] = (
                "(SECONDARY) " + sec_filename
            )

    return doesnt_exist
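# Sketch of the returned mapping (hedged; the input id and paths are illustrative):
#
#     InputChecker.check_base_with_type(inp, Array(File()), ["/d/a.bam", "/d/b.bam"])
#     # -> {"bams[1]": "/d/b.bam"} if only the second file is missing
#     # -> {"bams_bai[0]": "(SECONDARY) /d/a.bam.bai"} for a missing ".bai" secondary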
def copy_output(
    self,
    fs: FileScheme,
    outputid,
    prefix,
    tag,
    secondaries,
    extension,
    iscopyable,
    engine_output: Union[WorkflowOutputModel, Any, List[Any]],
    shard=None,
):
    # the output_folder is an array of arrays, one for each shard
    if isinstance(engine_output, list):
        outs = []
        nshards = len(engine_output)
        prev_shards = shard or []

        # This is a little complicated: we want to explode the one set of tags
        # that has the same length as we have shards. We'll only let this work
        # if there's exactly one element in the array with the appropriate
        # number of shards.

        def find_element_where_length_is(iterable, n):
            if iterable is None:
                return None
            for i in range(len(iterable)):
                ii = iterable[i]
                if isinstance(ii, list) and len(ii) == n:
                    return i
            return None

        def explode_at_index(iterable, index_to_explode, index_to_select):
            ar = iterable[:index_to_explode] + [
                iterable[index_to_explode][index_to_select]
            ]
            if index_to_explode + 1 < len(iterable):
                ar.extend(iterable[index_to_explode + 1 :])
            return ar

        tag_index_to_explode = find_element_where_length_is(tag, nshards)

        for i in range(nshards):
            eout = engine_output[i]
            new_shard = [*prev_shards, i]

            # choose the prefix for this shard
            new_prefix = prefix
            if isinstance(new_prefix, list) and len(new_prefix) > 1:
                new_prefix = new_prefix[i]
                new_shard = new_shard[min(len(new_shard), 1) :]

            # choose tag
            new_tag = tag
            if tag_index_to_explode is not None:
                new_tag = explode_at_index(tag, tag_index_to_explode, i)
                new_shard = new_shard[min(len(new_shard), 1) :]

            outs.append(
                self.copy_output(
                    fs,
                    outputid=outputid,
                    tag=new_tag,
                    prefix=new_prefix,
                    engine_output=eout,
                    shard=new_shard,
                    secondaries=secondaries,
                    extension=extension,
                    iscopyable=iscopyable,
                )
            )

        return [o[0] for o in outs], [o[1] for o in outs]

    final_tags = tag
    outfn = outputid

    if final_tags and any(isinstance(t, list) for t in final_tags):
        Logger.critical(
            f"One of the final output tags {str(final_tags)} was still an array, "
            f"outputs will be written directly into the output directory"
        )
        final_tags = None

    if prefix:
        if isinstance(prefix, list):
            if len(prefix) > 1:
                Logger.critical(
                    f"Expected only one output_name for this copy, but found "
                    f"({', '.join(prefix)}) [{len(prefix)}], using the first output_name"
                )
            outfn = prefix[0]
        else:
            outfn = prefix

    if final_tags is None:
        final_tags = []

    outdir = os.path.join(self.path, "/".join(final_tags))
    fs.mkdirs(outdir)

    if shard is not None:
        for s in shard:
            outfn += f"_shard-{s}"

    # copy output
    original_filepath = None
    newoutputfilepath = os.path.join(outdir, outfn)

    if isinstance(engine_output, WorkflowOutputModel):
        original_filepath = engine_output.originalpath
        if original_filepath and iscopyable:
            ext = extension or get_extension(engine_output.originalpath)
            if ext:
                dot = "" if ext[0] == "." else "."
                outfn += dot + ext
                newoutputfilepath += dot + ext
            fs.cp_from(engine_output.originalpath, newoutputfilepath, force=True)
        elif engine_output.value:
            if isinstance(fs, LocalFileScheme):
                # Write engine_output to outpath
                with open(newoutputfilepath, "w+") as outfile:
                    outfile.write(str(engine_output.value))
    else:
        original_filepath = engine_output
        if isinstance(fs, LocalFileScheme):
            # Write engine_output to outpath
            with open(newoutputfilepath, "w+") as outfile:
                outfile.write(str(engine_output))

    for sec in secondaries or []:
        frompath = apply_secondary_file_format_to_filename(original_filepath, sec)
        tofn = apply_secondary_file_format_to_filename(outfn, sec)
        topath = os.path.join(outdir, tofn)
        fs.cp_from(frompath, topath, force=True)

    return [original_filepath, newoutputfilepath]
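# Shape sketch (hedged; names and paths are illustrative): for a 2-shard
# scattered output with prefix "sample" and tags ["variants"], the copies
# would land at
#
#     <self.path>/variants/sample_shard-0[.ext]
#     <self.path>/variants/sample_shard-1[.ext]
#
# and the list branch returns a pair of parallel lists:
# ([original_path_0, original_path_1], [new_path_0, new_path_1]).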
def localise_inputs(
    self,
    inpid: str,
    inptype: DataType,
    dest_dir: str,
    source: Union[str, List[str]],
    localise_secondary_files: bool = True,
):
    if isinstance(source, list):
        return [self.localise_inputs(inpid, inptype, dest_dir, s) for s in source]

    fs = FileScheme.get_type_by_prefix(source)()
    if isinstance(fs, LocalFileScheme):
        return source

    out_path = self.generate_file_path(source, dest_dir)
    if os.path.exists(out_path):
        Logger.info(
            f"A file already exists when localising '{inpid}' at '{out_path}'. "
            f"If this isn't the right file, you'll need to manually remove this "
            f"file before proceeding"
        )
    else:
        try:
            Logger.info(f"Downloading file from {source} -> {out_path}")
            fs.cp_from(source, out_path)
        except Exception as e:
            Logger.critical(
                f"Couldn't localise source from {source} -> {out_path}: {repr(e)}"
            )
            raise

    if localise_secondary_files:
        try:
            # Handle normal input type or array input type
            secondary_files = inptype.secondary_files()
            if inptype.is_array():
                secondary_files = inptype.subtype().secondary_files()

            for sec in secondary_files or []:
                sec_source = apply_secondary_file_format_to_filename(source, sec)
                out_sec_path = apply_secondary_file_format_to_filename(out_path, sec)

                if os.path.exists(out_sec_path):
                    Logger.info(
                        f"The secondary file for {inpid} ({sec}) already exists "
                        f"when localising '{inpid}' at '{out_sec_path}'. If this "
                        f"isn't the right file, you'll need to manually remove "
                        f"this file before proceeding"
                    )
                elif not fs.exists(sec_source):
                    Logger.warn(
                        f"Couldn't find the secondary file for {inpid}, expected "
                        f"at {sec_source}, skipping for now"
                    )
                else:
                    fs.cp_from(sec_source, out_sec_path)

        except Exception as e:
            Logger.critical(f"Couldn't localise secondary file due to: {e}")

    return out_path
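# Localisation sketch (hedged: `runner` stands for an instance of the enclosing
# class; BamBai and the gs:// path are illustrative — a BamBai-style datatype
# declares ".bai" as a secondary-file pattern, so the index is fetched too):
#
#     local = runner.localise_inputs("bam", BamBai(), "/tmp/inputs", "gs://bucket/s.bam")
#     # downloads s.bam -> /tmp/inputs/<hash>_s.bam, and s.bam.bai alongside it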
def guess_datatype_by_filename(filename: str):
    """
    We'll try to guess which datatype a file with name 'filename' is.
    Primarily, this will look at the extension, and whether the secondary files exist

    :param filename: the filename (or path) to guess a datatype for
    :return: the best-matching File datatype, or None if nothing matched
    """
    dts = JanisShed.get_all_datatypes()
    fs = FileScheme.get_type_by_prefix(filename)()
    if not isinstance(fs, LocalFileScheme):
        Logger.warn(
            f"The filescheme detected by Janis for '{filename}' was not LOCAL. "
            f"This guess datatype process may rely on polling the {fs.id()} file "
            f"system to check if related files exist. This might have some "
            f"financial cost involved."
        )

    file_exists_map = {}

    # each match has a score
    matches: List[Tuple[int, File]] = []

    for datatype in dts:
        if isclass(datatype):
            if not issubclass(datatype, File):
                continue
            datatype = get_instantiated_type(datatype)
        elif not isinstance(datatype, File):
            continue
        if not datatype.extension:
            continue
        datatype: File = datatype

        extensions = {datatype.extension, *(datatype.alternate_extensions or [])}

        matching_extension = None
        for ext in extensions:
            if filename.endswith(ext):
                matching_extension = ext
                break

        secondaries_match = True

        if datatype.secondary_files():
            for secondary in datatype.secondary_files():
                secondary_filename = apply_secondary_file_format_to_filename(
                    filename, secondary
                )
                if secondary not in file_exists_map:
                    file_exists_map[secondary] = fs.exists(secondary_filename)

                if not file_exists_map[secondary]:
                    secondaries_match = False
                    break

        if secondaries_match is False:
            continue

        # a matching extension plus confirmed secondaries makes this a candidate
        if matching_extension is not None and secondaries_match:
            extension_reward = len(matching_extension) * EXTENSION_REWARD_MULTIPLER
            secondaries_reward = (
                len(datatype.secondary_files() or []) * SECONDARIES_REWARD_MULTIPLER
            )
            score = extension_reward + secondaries_reward

            matches.append((score, datatype))

    if len(matches) == 0:
        return None
    elif len(matches) == 1:
        return matches[0][1]
    else:
        matches = sorted(matches, key=lambda a: a[0], reverse=True)
        matched_dt = matches[0][1]
        ranked = ", ".join(f"{match[1].name()} ({match[0]})" for match in matches[1:])
        Logger.debug(
            f"There were {len(matches)} matching datatypes. Using {matched_dt.name()} "
            f"({matches[0][0]}) as it was the best match from: {ranked}"
        )
        return matched_dt
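# Scoring sketch (hedged; the datatype names are illustrative): a longer
# matching extension and more confirmed secondary files both raise a
# candidate's score.
#
#     guess_datatype_by_filename("/data/sample.fastq.gz")
#     # ".fastq.gz" (9 chars) outscores a plain ".gz" (3 chars) match, so a
#     # FastqGz-style datatype would be ranked first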
@staticmethod
def check_if_input_exists(fs: FileScheme, path: str):
    return fs.exists(path)