def prepare_tool(
    tool: Tool,
    toolversions: List[str],
    isorphan: bool,
    is_published_pipeline: bool = False,
):
    # Stuff to list on the documentation page:
    #   - Versions of tools
    #   - Generated command
    #   - Cool if it grouped the tools by vendor
    if not tool:
        return None
    try:
        if is_published_pipeline:
            return ""
        if tool.type() == ToolType.CommandTool:
            return prepare_commandtool_page(tool, toolversions)
        elif tool.type() == ToolType.Workflow:
            return prepare_workflow_page(tool, toolversions)
        elif tool.type() == ToolType.CodeTool:
            return prepare_code_tool_page(tool, toolversions)
    except Exception as e:
        traceback.print_exc()
        Logger.critical(
            "Couldn't generate documentation for " + tool.id() + " " + str(e)
        )
def prepare_quickstart(tool: Tool):
    required_python_input_map = "\n".join(
        " " * 15 + i.id() + "=None,"
        for i in tool.tool_inputs()
        if not i.intype.optional
    )

    python_step_name = tool.id().lower() + "_step"
    output_python_code = "\n".join(
        " " * 7 + f'wf.output("{o.id()}", source={python_step_name}.{o.id()})'
        for o in tool.tool_outputs()
    )

    python_codeblock = f"""\
  .. code-block:: python

     from {tool.__module__} import {tool.__class__.__name__}

     wf = WorkflowBuilder("myworkflow")

     wf.step(
        "{python_step_name}",
        {tool.__class__.__name__}(
{required_python_input_map}
        )
     )
{output_python_code}
"""

    return f"""\
def prepare_run_instructions_input_file(
    tool: Tool, user_inps: dict, other_inps: dict, reference_information: str
):
    yaml_user_inps = CwlTranslator.stringify_translated_inputs(user_inps)
    yaml_other_inps = CwlTranslator.stringify_translated_inputs(other_inps)
    indented_user = "".join(" " * 7 + s for s in yaml_user_inps.splitlines(True))
    indented_other = "".join(" " * 7 + s for s in yaml_other_inps.splitlines(True))

    not_localising_secondary_warning = ""
    if isinstance(tool, WorkflowBase):
        inputs_that_arent_localising_secondary_files = [
            t.id()
            for t in tool.tool_inputs()
            if t.doc.skip_sourcing_secondary_files
        ]
        if len(inputs_that_arent_localising_secondary_files) > 0:
            not_localising_secondary_warning = f"""\
.. warning::
   The secondary files for the inputs '{"', '".join(inputs_that_arent_localising_secondary_files)}' will not automatically \
localise using janis prepare and are built just after download. Please note this can take a few hours to build \
before the pipeline runs.
"""

    has_static = len(other_inps) > 0

    tb = " " * 4
    run_args = ["janis run [...run options]", tb + "--inputs inputs.yaml"]

    static_generation = (
        ""
        if not has_static
        else f"""\
# static inputs
janis inputs --static {tool.id()} > static.yaml"""
    )
    static_yaml = (
        ""
        if not has_static
        else f"""\
**static.yaml**

.. code-block:: yaml

{indented_other}"""
    )
    if has_static:
        run_args.append(tb + "--inputs static.yaml")

    if isinstance(tool, CommandTool) and not tool.container():
        run_args.append(
            tb + f"--container-override '{tool.id()}=<organisation/container:version>'"
        )

    run_args.append(tb + tool.id())
    run_statement = " \\\n".join(" " * 3 + el for el in run_args)

    if reference_information:
        reference_information = f"The following inputs have a suggested source. Using janis prepare with the relevant \
``--source-hint`` will automatically download these files. See `below <#additional-configuration-inputs>`_ for \
more information about inputs for {tool.id()}.\n{reference_information}"

    return f"""\
def validate_inputs(tool: Tool, additional_inputs):
    errors = {}

    input_values_from_workflow = {}
    if isinstance(tool, Workflow):
        input_values_from_workflow = {
            inpkey: inp.value for inpkey, inp in tool.input_nodes.items() if inp.value
        }

    input_values_to_use = {**input_values_from_workflow, **additional_inputs}
    for inp in tool.tool_inputs():
        inpkey = inp.id()
        value = input_values_to_use.get(inpkey)

        if inp.intype.validate_value(value, allow_null_if_not_optional=False):
            continue

        errors[inpkey] = (
            inp.intype.invalid_value_hint(value)
            or f"An internal error occurred when validating {inpkey} from {inp.intype.id()}"
        )

    if len(errors) == 0:
        return True

    raise ValueError(f"There were errors in {len(errors)} inputs: " + str(errors))
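# Illustrative sketch (not janis API): the same collect-then-raise validation pattern
# used by validate_inputs above, with plain callables standing in for Janis data types.
# Names below are made up for the example.
def _validate_all(values: dict, validators: dict) -> None:
    errors = {}
    for key, check in validators.items():
        if not check(values.get(key)):
            errors[key] = f"invalid value for '{key}': {values.get(key)!r}"
    if errors:
        raise ValueError(f"There were errors in {len(errors)} inputs: {errors}")

# _validate_all({"bam": None}, {"bam": lambda v: v is not None})  # raises ValueError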
def evaluate_translation(tool: Tool) -> Union[str, bool]:
    """
    Evaluate if we can successfully translate to WDL and CWL

    # TODO: validate translations (will look into better way to ensure validation tool exists)

    :param tool: Janis tool
    :type tool: Tool
    :return: error message or True if we can successfully translate to WDL and CWL
    :rtype: Union[str, bool]
    """
    engines = test_helpers.get_available_engines()
    output_dir = os.path.join(os.getcwd(), "tests_output", tool.id())

    errors = []
    for engine in engines:
        try:
            translator = engines[engine]
            translator.translate(
                tool, export_path=output_dir, to_console=False, to_disk=True
            )
        except Exception as e:
            errors.append(f"{translator.name}: translation failed {str(e)}")

    if errors:
        return ", ".join(errors)

    return True
def cascade_batchrun_inputs(
    workflow: Tool, inputs: List[Dict], options: BatchRunRequirements
):
    fields_to_group = set(options.fields)
    fields_to_group.add(options.groupby)

    wfins = workflow.inputs_map()

    required_ar_depth_of_groupby_fields = {
        f: 1 + count_janisarray_depth(wfins[f].intype) for f in fields_to_group
    }

    ins = {}

    for inp in inputs:
        for k, v in inp.items():
            if k in fields_to_group:
                if k not in ins:
                    ins[k] = []

                # We'll look at the shape of the data, and decide whether
                # we can just use the value, or we need to wrap it in another array
                if count_array_depth(v) < required_ar_depth_of_groupby_fields[k]:
                    v = [v]

                ins[k].extend(v)
            else:
                # overwrite the previous value
                ins[k] = v

    return ins
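# Illustrative sketch (not janis API): how the depth check above decides between
# extending with a value and wrapping it in another list first. The helper below is
# a simplified stand-in for count_array_depth.
def _depth(value) -> int:
    d = 0
    while isinstance(value, list):
        d += 1
        value = value[0] if value else None
    return d

merged, required_depth = [], 2  # e.g. a groupby field that expects a list-of-lists
for incoming in [["a.bam"], [["b1.bam", "b2.bam"]]]:
    if _depth(incoming) < required_depth:
        incoming = [incoming]  # wrap a bare list so the shapes line up
    merged.extend(incoming)
# merged == [["a.bam"], ["b1.bam", "b2.bam"]]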
def check_existence_of_files(wf: Tool, inputs: Dict):
    doesnt_exist = {}

    for inp in wf.tool_inputs():
        intype = inp.intype
        is_path = isinstance(intype, (File, Directory))
        is_array_of_paths = isinstance(intype, Array) and isinstance(
            intype.fundamental_type(), (File, Directory)
        )

        if not (is_path or is_array_of_paths):
            continue

        val = inputs.get(inp.id())
        if val is None:
            if inp.intype.optional:
                continue
            raise Exception(f"Expected input '{inp.id()}' was not found or is null")

        doesnt_exist.update(InputChecker.check_base_with_type(inp, intype, val))

    if len(doesnt_exist) > 0:
        import ruamel.yaml

        stringified = ruamel.yaml.dump(doesnt_exist, default_flow_style=False)
        raise Exception("The following inputs were not found:\n" + stringified)
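# Illustrative sketch (not janis API): collecting missing paths for an input whose
# value may be a single filename or a list of filenames, mirroring the check above.
import os

def _missing_paths(tag: str, value) -> dict:
    values = value if isinstance(value, list) else [value]
    return {
        f"{tag}.{i}" if isinstance(value, list) else tag: v
        for i, v in enumerate(values)
        if not os.path.exists(v)
    }

# _missing_paths("reads", ["r1.fq", "r2.fq"]) -> {"reads.0": "r1.fq", ...} for any missing file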
def inputs_modifier(self, tool: Tool, inputs: Dict, hints: Dict[str, str]) -> Dict:
    new_inputs = {}
    for inp in tool.inputs_map().values():
        if inp.id() not in inputs:
            continue
        new_inputs[inp.id()] = self.process_single_input(
            inp.id(), inp.intype, inputs[inp.id()]
        )

    return {**inputs, **new_inputs}
def prepare_run_instructions(tool: Tool):
    metadata = tool.bind_metadata() or tool.metadata

    # NOTE: input-file based instructions are always generated at the moment; the
    # nested-array check below is computed but not currently used to pick the CLI form.
    has_array_of_arrays_inps = True
    _has_nested_array_inputs = any(
        (isinstance(i.intype, Array) and isinstance(i.intype.subtype(), Array))
        for i in tool.tool_inputs()
    )

    static_input_tuples = [
        [i.id(), i.intype.id(), prepare_source(i.doc.source)]
        for i in tool.tool_inputs()
        if i.doc.quality == InputQualityType.static and i.doc.source is not None
    ]

    reference_information = ""

    if len(static_input_tuples) > 0:
        static_input_headers = ["Name", "Type", "Source"]
        reference_information = tabulate(
            static_input_tuples, headers=static_input_headers, tablefmt="rst"
        )

    # overrides = metadata.sample_input_overrides or {}
    user_inps = {}
    other_inps = {}

    for i in tool.tool_inputs():
        if i.intype.optional or i.default is not None:
            continue
        val = i.doc.example or prepare_default_for_type(i.id(), i.intype)
        if i.doc and i.doc.quality and i.doc.quality != InputQualityType.user:
            other_inps[i.id()] = val
        else:
            # catch None and InputQualityType.user
            user_inps[i.id()] = val

    if has_array_of_arrays_inps:
        return prepare_run_instructions_input_file(
            tool, user_inps, other_inps, reference_information
        )
    else:
        return prepare_run_instructions_cli(
            tool, user_inps, other_inps, reference_information
        )
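# Illustrative sketch (not janis API): splitting required inputs into "user" and
# "static" buckets by an input-quality flag, as done above. The class and values
# here are made up for the example.
from dataclasses import dataclass

@dataclass
class _Inp:
    id: str
    quality: str  # "user", "static", or "configuration"

def _split_by_quality(inps):
    user, other = {}, {}
    for i in inps:
        (user if i.quality == "user" else other)[i.id] = "<value>"
    return user, other

# _split_by_quality([_Inp("bam", "user"), _Inp("reference", "static")])
# -> ({"bam": "<value>"}, {"reference": "<value>"})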
def evaluate(cls, tool: Tool) -> Union[str, bool]:
    """
    Evaluate whether a Janis tool satisfies the criteria required for it to be publishable

    :param tool: Janis tool
    :type tool: Tool
    :return: error message or True if valid
    :rtype: Union[str, bool]
    """
    if tool.skip_test():
        return cls.STATUS_SKIPPED

    if tool.type() == ToolType.Workflow:
        return cls.evaluate_workflow(tool)
    elif tool.type() == ToolType.CommandTool:
        return cls.evaluate_command_tool(tool)
    elif tool.type() == ToolType.CodeTool:
        return cls.evaluate_code_tool(tool)

    raise Exception("Unrecognised tool type: " + str(tool.type()))
def ensure_outputs_are_in_workflow_and_are_compatible(
    tool: Tool, outputs: List[str], compatible_type: DataType
):
    tool_outputs: Dict[str, TOutput] = tool.outputs_map()
    failed_outputs = []
    untyped_outputs = []

    for o in outputs:
        if o not in tool_outputs:
            failed_outputs.append(o)
        elif not compatible_type.can_receive_from(tool_outputs[o].outtype):
            untyped_outputs.append(o)

    return failed_outputs, untyped_outputs
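# Illustrative sketch (not janis API): the same "missing vs incompatible" split, with
# a plain compatibility predicate standing in for DataType.can_receive_from.
def _classify(requested, available: dict, is_compatible) -> tuple:
    missing = [o for o in requested if o not in available]
    incompatible = [
        o for o in requested if o in available and not is_compatible(available[o])
    ]
    return missing, incompatible

# _classify(["vcf", "bam"], {"vcf": "VCF"}, lambda t: t == "VCF") -> (["bam"], [])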
def evaluate_container(tool: Tool) -> Union[str, bool]:
    """
    Evaluate if the container specified for this tool exists in the remote registry

    :param tool: Janis tool
    :type tool: Tool
    :return: error message or True if the listed container for this tool exists in the remote registry
    :rtype: Union[str, bool]
    """
    # If there is no container, we don't need to check if the container exists in the registry
    if not tool.containers():
        return True

    # Some tools might not have a container; we only want to check that, if a container
    # is listed, its digest exists
    containers = [v for k, v in tool.containers().items()]
    containers = list(filter(None, containers))

    if not containers:
        return True

    test_helpers.verify_janis_assistant_installed()
    from janis_assistant.data.container import get_digests_from_containers

    cache_location = os.path.join(os.getcwd(), "tests_output", "containers")
    digests = get_digests_from_containers(containers, cache_location=cache_location)

    errors = []
    for c in containers:
        # if the digest is exactly the same, it means no digest was found (it's just the tag name)
        if c not in digests or digests[c] == c:
            # if the container name is already using a hash, we don't want to report any issue here
            if "@sha256:" not in c:
                errors.append(f"container {c} not found")

    if errors:
        return ", ".join(errors)

    return True
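# Illustrative sketch (not janis API): deciding which container references failed digest
# resolution. A resolved entry maps tag -> digest; an unresolved one maps to itself (or is absent).
def _unresolved_containers(containers, digests: dict):
    return [
        c
        for c in containers
        if digests.get(c, c) == c and "@sha256:" not in c
    ]

# _unresolved_containers(
#     ["ubuntu:20.04", "broken/image:1.0"],
#     {"ubuntu:20.04": "ubuntu@sha256:abc..."},
# ) -> ["broken/image:1.0"]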
def evaluate_unittest_exists(tool: Tool) -> Union[str, bool]:
    """
    Evaluate if a test suite for this tool is provided

    :param tool: Janis tool
    :type tool: Tool
    :return: error message or True if unit tests for this tool exist
    :rtype: Union[str, bool]
    """
    if tool.tests():
        return True

    return "Missing unit tests"
def evaluate_tool_module(tool: Tool) -> Union[str, bool]:
    """
    Evaluate if a tool module name for documentation is provided

    :param tool: Janis tool
    :type tool: Tool
    :return: error message or True if a tool module name for this tool exists
    :rtype: Union[str, bool]
    """
    if not tool.tool_module():
        return "Missing tool module"

    return True
def evaluate_friendly_name(tool: Tool) -> Union[str, bool]:
    """
    Evaluate if a friendly name for documentation is provided

    :param tool: Janis tool
    :type tool: Tool
    :return: error message or True if a friendly name for this tool exists
    :rtype: Union[str, bool]
    """
    if not tool.friendly_name():
        return "Missing friendly name"

    return True
def prepare_run_instructions_input_file(
    tool: Tool, user_inps: dict, other_inps: dict, reference_information: str
):
    yaml_user_inps = CwlTranslator.stringify_translated_inputs(user_inps)
    yaml_other_inps = CwlTranslator.stringify_translated_inputs(other_inps)
    indented_user = "".join(" " * 7 + s for s in yaml_user_inps.splitlines(True))
    indented_other = "".join(" " * 7 + s for s in yaml_other_inps.splitlines(True))

    has_static = len(other_inps) > 0

    tb = " " * 4
    run_args = ["janis run [...run options]", tb + "--inputs inputs.yaml"]

    static_generation = (
        ""
        if not has_static
        else f"""\
# static inputs
janis inputs --static {tool.id()} > static.yaml"""
    )
    static_yaml = (
        ""
        if not has_static
        else f"""\
**static.yaml**

.. code-block:: yaml

{indented_other}"""
    )
    if has_static:
        run_args.append(tb + "--inputs static.yaml")

    if isinstance(tool, CommandTool) and not tool.container():
        run_args.append(
            tb + f"--container-override '{tool.id()}=<organisation/container:version>'"
        )

    run_args.append(tb + tool.id())
    run_statement = " \\\n".join(" " * 3 + el for el in run_args)

    return f"""\
def inputs_modifier(self, tool: Tool, inputs: Dict, hints: Dict[str, str]) -> Dict:
    new_inputs = {}
    for inp in tool.tool_inputs():
        if not isinstance(inp.intype, File):
            continue
        value = inputs.get(inp.id())
        if value is not None:
            processed_value = self.check_input_for_correctness(
                inp.id(), inp.intype, value
            )
            if processed_value is not None:
                new_inputs[inp.id()] = processed_value

    return {**inputs, **new_inputs}
def evaluate_output_params(self, wf: Tool, additional_inputs: dict):
    mapped_inps = CwlTranslator().build_inputs_file(
        wf, recursive=False, additional_inputs=additional_inputs
    )
    output_names: Dict[str, any] = {}
    output_folders: Dict[str, any] = {}

    if isinstance(wf, Workflow):
        for o in wf.output_nodes.values():
            output_names[o.id()] = self.evaluate_output_selector(
                o.output_name, mapped_inps
            )
            output_folders[o.id()] = self.evaluate_output_selector(
                o.output_folder, mapped_inps
            )

    outputs: List[WorkflowOutputModel] = []

    for o in wf.tool_outputs():
        # Work out the file extension and whether the output can be copied,
        # unwrapping nested arrays down to their fundamental type
        ext = None
        innertype = o.outtype
        iscopyable = isinstance(o.outtype, (File, Directory)) or (
            isinstance(o.outtype, Array)
            and isinstance(o.outtype.fundamental_type(), (File, Directory))
        )
        while isinstance(innertype, Array):
            innertype = innertype.subtype()
        if isinstance(o.outtype, File):
            ext = o.outtype.extension
        outputs.append(
            WorkflowOutputModel(
                tag=o.id(),
                iscopyable=iscopyable,
                original_path=None,
                new_path=None,
                timestamp=None,
                output_name=output_names.get(o.id()),
                output_folder=output_folders.get(o.id()),
                secondaries=o.outtype.secondary_files(),
                extension=ext,
            )
        )
    return self.database.outputsDB.insert_many(outputs)
def inputs_modifier(self, wf: Tool, inputs: Dict, hints: Dict[str, str]):
    nin = {**inputs}

    inmap = wf.inputs_map()

    # Change the 'cwd' just for the scope of this block
    with Path(self.cwd):
        for tag, value in nin.items():
            if tag not in inmap:
                # Only localise tags within the inputsdict that are in the tool inputs
                # This might be a problem later for passthrough inputs (Janis doesn't support yet)
                continue

            intype = inmap[tag].intype

            # If the type of the input is:
            #   (a) a File / Directory
            isfiletype = isinstance(intype.received_type(), (File, Directory))
            #   (b) an Array whose fundamental type (searched recursively if nested) is File or Directory
            isbasefiletype = isinstance(intype.received_type(), Array) and isinstance(
                intype.fundamental_type(), (File, Directory)
            )

            # (Skip if not one of these conditions)
            if not (isfiletype or isbasefiletype):
                try:
                    nin[tag] = intype.coerce_value_if_possible(value)
                except Exception as e:
                    raise Exception(
                        f"Couldn't coerce the input for '{tag}' ({value}) to type '{intype}': {e}"
                    )
                continue

            # Fully qualify the filepath
            try:
                nin[tag] = self.fully_qualify_filename_array_or_single(value)
            except Exception as e:
                raise Exception(
                    f"Couldn't qualify the filename for the input '{tag}' ({value}) to type '{intype}': {e}"
                )

    return nin
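# Illustrative sketch (hypothetical helper, not the janis implementation): fully
# qualifying either a single path or a list of paths relative to a working directory,
# which is the shape of work fully_qualify_filename_array_or_single is used for above.
import os

def _fully_qualify(value, cwd: str):
    if isinstance(value, list):
        return [_fully_qualify(v, cwd) for v in value]
    return value if os.path.isabs(value) else os.path.normpath(os.path.join(cwd, value))

# _fully_qualify(["reads/r1.fq", "/data/ref.fa"], "/home/user/run")
# -> ["/home/user/run/reads/r1.fq", "/data/ref.fa"]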
def inputs_modifier(self, tool: Tool, inputs: Dict, hints: Dict[str, str]) -> Dict:
    """
    Download remote files and replace the input with the local files

    :param tool: an instance of janis tool
    :type tool: janis_core.Tool
    :param inputs: a dictionary of tool inputs
    :type inputs: dict
    :param hints:
    :type hints: dict

    :return: modified input
    :rtype: dict
    """
    new_inputs = {}
    for inp in tool.tool_inputs():
        modification_required = False

        if isinstance(inp.intype, File) or (
            isinstance(inp.intype, Array)
            and isinstance(inp.intype.fundamental_type(), File)
        ):
            if inp.id() in inputs and inputs[inp.id()] is not None:
                modification_required = True

        if modification_required:
            source = inputs[inp.id()]
            basedir = self.cache_dir
            os.makedirs(basedir, exist_ok=True)

            new_inputs[inp.id()] = self.localise_inputs(
                inp.id(),
                inp.intype,
                basedir,
                source,
                # mfranklin 2021-01-08:
                # if we specify a remote input, and we're localising files, we should localise secondary files
                localise_secondary_files=True,
            )

    return {**inputs, **new_inputs}
def evaluate_metadata(tool: Tool) -> Union[str, bool]:
    """
    Evaluate if important metadata for documentation is provided

    :param tool: Janis tool
    :type tool: Tool
    :return: error message or True if all required metadata for this tool exists
    :rtype: Union[str, bool]
    """
    METADATA_KEY_CONTRIBUTORS = "contributors"
    METADATA_KEY_CREATED_DATE = "created date"
    METADATA_KEY_INSTITUTION = "institution"

    if isinstance(tool.metadata, Metadata):
        required = {
            METADATA_KEY_CONTRIBUTORS: tool.metadata.contributors,
            METADATA_KEY_CREATED_DATE: tool.metadata.dateCreated,
            METADATA_KEY_INSTITUTION: tool.metadata.institution,
        }
        missing = []
        for key, field in required.items():
            if field is None or not field:
                missing.append(key)

        # special case: the tool_provider() value overrides institution in the documentation
        if METADATA_KEY_INSTITUTION in missing:
            if tool.tool_provider():
                missing.remove(METADATA_KEY_INSTITUTION)

        if missing:
            return f"Missing metadata: {', '.join(missing)}"
    # elif isinstance(self.metadata, ...):
    else:
        return "Incorrect metadata class"

    return True
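# Illustrative sketch (not janis API): collecting the names of empty metadata fields,
# with one field allowed to be satisfied by an alternative source, mirroring the
# tool_provider() special case above. All names here are made up.
def _missing_metadata(meta: dict, required: list, overrides: dict = None) -> list:
    overrides = overrides or {}
    missing = [k for k in required if not meta.get(k)]
    return [k for k in missing if not overrides.get(k)]

# _missing_metadata(
#     {"contributors": ["Jane"], "institution": None},
#     ["contributors", "created date", "institution"],
#     overrides={"institution": "ExampleOrg"},  # e.g. tool_provider()
# ) -> ["created date"]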
def tool_modifier(self, tool: Tool, inputs: Dict, hints: Dict[str, str]) -> Tool:
    # Build custom pipeline
    w = WorkflowBuilder(
        tool.id(), friendly_name=tool.friendly_name(), version=tool.version()
    )

    ins = tool.tool_inputs()
    insdict = {i.id(): i for i in ins}
    fields = set(self.batch.fields)

    inkeys = set(i.id() for i in ins)
    invalid_keys = fields - inkeys
    if len(invalid_keys) > 0:
        raise Exception(
            f"Couldn't create batchtool from fields {', '.join(invalid_keys)} "
            f"as they do not exist on '{tool.id()}'"
        )

    if self.batch.groupby not in inputs:
        raise Exception(
            f"the group_by field '{self.batch.groupby}' was not found in the inputs"
        )

    innode_base = {}

    for i in ins:
        if i.id() in fields:
            continue

        default = i.default
        if isinstance(default, Selector):
            default = None

        innode_base[i.id()] = w.input(i.id(), i.intype, default=default, doc=i.doc)

    raw_groupby_values = inputs[self.batch.groupby]

    duplicate_keys = find_duplicates(raw_groupby_values)
    if len(duplicate_keys) > 0:
        raise Exception(
            f"There are duplicate group_by ({self.batch.groupby}) keys in the input: "
            + ", ".join(duplicate_keys)
        )

    groupby_values = [
        Validators.transform_identifier_to_be_valid(ident)
        for ident in raw_groupby_values
    ]

    duplicate_keys = find_duplicates(groupby_values)
    if len(duplicate_keys) > 0:
        raise Exception(
            f"Janis transformed the values in the group_by field ({self.batch.groupby}) to be valid identifiers; "
            f"after this transformation, there were duplicate keys: "
            + ", ".join(duplicate_keys)
        )

    w.input(self.GROUPBY_FIELDNAME, Array(str), value=groupby_values)

    steps_created = []

    stepid_from_gb = lambda gb: f"{gb}_{tool.id()}"

    for gbvalue in groupby_values:
        extra_ins = {}
        for f in fields:
            newkey = f"{f}_{gbvalue}"
            extra_ins[f] = w.input(newkey, insdict[f].intype)

        steps_created.append(
            w.step(stepid_from_gb(gbvalue), tool(**innode_base, **extra_ins))
        )

    for out in tool.tool_outputs():
        output_folders = []
        output_name = out.id()
        if isinstance(tool, WorkflowBase):
            outnode = tool.output_nodes[out.id()]
            output_folders = outnode.output_folder or []

            if outnode.output_name is not None:
                output_name = outnode.output_name

        for idx, gbvalue, raw_gbvalue in zip(
            range(len(groupby_values)), groupby_values, raw_groupby_values
        ):
            transformed_inputs = {**inputs, **{f: inputs[f][idx] for f in fields}}

            output_folders_transformed = Operator.evaluate_arg(
                output_folders, transformed_inputs
            )
            output_name_transformed = Operator.evaluate_arg(
                output_name, transformed_inputs
            )

            w.output(
                f"{gbvalue}_{out.id()}",
                source=w[stepid_from_gb(gbvalue)][out.id()],
                output_name=output_name_transformed,
                output_folder=[raw_gbvalue, *(output_folders_transformed or [])],
            )

    return w
def from_janis(
    wid: str,
    outdir: str,
    tool: Tool,
    environment: Environment,
    hints: Dict[str, str],
    validation_requirements: Optional[ValidationRequirements],
    batchrun_requirements: Optional[BatchRunRequirements],
    inputs_dict: dict = None,
    dryrun=False,
    watch=True,
    max_cores=None,
    max_memory=None,
    keep_intermediate_files=False,
    run_in_background=True,
    dbconfig=None,
    allow_empty_container=False,
    container_override: dict = None,
    check_files=True,
):
    jc = JanisConfiguration.manager()

    # output directory has been created
    environment.identifier += "_" + wid

    tm = WorkflowManager(wid=wid, outdir=outdir, environment=environment)

    tm.database.runs.insert(wid)

    tm.database.workflowmetadata.wid = wid
    tm.database.workflowmetadata.engine = environment.engine
    tm.database.workflowmetadata.filescheme = environment.filescheme
    tm.database.workflowmetadata.environment = environment.id()
    tm.database.workflowmetadata.name = tool.id()
    tm.database.workflowmetadata.start = DateUtil.now()
    tm.database.workflowmetadata.executiondir = None
    tm.database.workflowmetadata.keepexecutiondir = keep_intermediate_files
    tm.database.workflowmetadata.configuration = jc
    tm.database.workflowmetadata.dbconfig = dbconfig

    # This is the only time we're allowed to skip the tm.set_status
    # This is a temporary stop gap until "notification on status" is implemented.
    # tm.set_status(TaskStatus.PROCESSING)
    tm.database.workflowmetadata.status = TaskStatus.PROCESSING

    tm.database.commit()

    spec = get_ideal_specification_for_engine(environment.engine)
    spec_translator = get_translator(spec)
    tool_evaluate = tm.prepare_and_output_workflow_to_evaluate_if_required(
        tool=tool,
        translator=spec_translator,
        validation=validation_requirements,
        batchrun=batchrun_requirements,
        hints=hints,
        additional_inputs=inputs_dict,
        max_cores=max_cores or jc.environment.max_cores,
        max_memory=max_memory or jc.environment.max_ram,
        allow_empty_container=allow_empty_container,
        container_override=container_override,
        check_files=check_files,
    )

    outdir_workflow = tm.get_path_for_component(
        WorkflowManager.WorkflowManagerPath.workflow
    )

    tm.database.workflowmetadata.submission_workflow = os.path.join(
        outdir_workflow, spec_translator.filename(tool_evaluate)
    )
    tm.database.workflowmetadata.submission_inputs = os.path.join(
        outdir_workflow, spec_translator.inputs_filename(tool_evaluate)
    )
    tm.database.workflowmetadata.submission_resources = os.path.join(
        outdir_workflow, spec_translator.dependencies_filename(tool_evaluate)
    )

    tm.database.commit()

    if not dryrun:
        if (
            not run_in_background
            and jc.template
            and jc.template.template
            and jc.template.template.can_run_in_foreground is False
        ):
            raise Exception(
                f"Your template '{jc.template.template.__class__.__name__}' is not allowed to run "
                f"in the foreground, try adding the '--background' argument"
            )
        tm.start_or_submit(run_in_background=run_in_background, watch=watch)
    else:
        tm.set_status(TaskStatus.DRY_RUN)

    tm.database.commit()
    return tm
def tool_modifier(self, tool: Tool, inputs: Dict, hints: Dict[str, str]) -> Tool:
    from janis_bioinformatics.data_types import FastaWithDict, Vcf, Bed
    from janis_bioinformatics.tools.illumina import HapPyValidator_0_3_9

    failed_outputs, untyped_outputs = ensure_outputs_are_in_workflow_and_are_compatible(
        tool, self.validation.fields, Vcf()
    )

    if len(failed_outputs) > 0:
        raise Exception(
            f"Some outputs for validation were not found in the tool '{tool.id()}': "
            f"{', '.join(failed_outputs)}"
        )

    if len(untyped_outputs) > 0:
        Logger.critical(
            f"Some outputs for validation from the tool '{tool.id()}' were not "
            f"compatible with VCF: {', '.join(untyped_outputs)}"
        )

    w = WorkflowBuilder(tool.id() + "_validated")

    w.input("validatorReference", FastaWithDict, value=self.validation.reference)
    w.input("validatorTruthVCF", Vcf, value=self.validation.truthVCF)
    w.input("validatorIntervals", Bed(optional=True), value=self.validation.intervals)

    inpdict = {i.id(): w.input(i.id(), i.intype) for i in tool.tool_inputs()}
    toolstp = w.step(tool.id(), tool(**inpdict))

    if isinstance(tool, Workflow):
        wf: Workflow = tool
        for o in wf.output_nodes.values():
            w.output(
                identifier=o.id(),
                source=toolstp[o.id()],
                output_folder=o.output_folder,
                output_name=o.output_name,
            )
    else:
        for o in tool.tool_outputs():
            w.output(identifier=o.id(), source=toolstp[o.id()])

    for o in self.validation.fields:
        sid = "validator_" + o
        valstp = w.step(
            sid,
            HapPyValidator_0_3_9(
                compareVCF=toolstp[o],
                reportPrefix=o,  # this will generate an input node with format validator_{o}_reportPrefix
                reference=w.validatorReference,
                truthVCF=w.validatorTruthVCF,
                intervals=w.validatorIntervals,
            ),
        )

        # Connect all the outputs of the validator to an output
        for vo in valstp.tool.outputs():
            w.output(
                f"validated_{o}_{vo.id()}",
                source=valstp[vo.id()],
                output_folder="validated",
            )

    return w
def do_bed_fasta_contig_check(tool: Tool, inputs: Dict[str, any]):
    from janis_bioinformatics.data_types import Fasta, Bed, BedTabix

    supported_bed_types = (Bed, BedTabix)

    beds_inputs = []
    refs = []

    for i in tool.tool_inputs():
        if isinstance(i.intype, supported_bed_types) or (
            isinstance(i.intype, Array)
            and isinstance(i.intype.subtype(), supported_bed_types)
        ):
            beds_inputs.append(i)

        if (
            isinstance(i.intype, Fasta)
            and i.intype.secondary_files()
            and ".fai" in i.intype.secondary_files()
        ):
            refs.append(i)

    if len(refs) == 0:
        return
    if len(refs) > 1:
        Logger.info(
            "Skipping bioinformatics FASTA-BED file checks as there was more than one reference"
        )
        return

    for inp_ref in refs:
        value_ref = inputs[inp_ref.id()]
        if not value_ref:
            Logger.warn(f"Skipping '{inp_ref.id()}' as no value was provided")
            continue

        ref_contigs = ContigChecker.get_list_of_contigs_from_fastafai(
            value_ref + ".fai"
        )

        if not ref_contigs:
            Logger.debug(
                f"Didn't get any contigs from ref {value_ref}.fai, skipping..."
            )
            continue

        for inp_bed in beds_inputs:
            value_bed = inputs[inp_bed.id()]
            is_array = isinstance(value_bed, list)
            beds = value_bed if is_array else [value_bed]

            for b_idx in range(len(beds)):
                bed = beds[b_idx]

                bed_contigs = ContigChecker.get_list_of_contigs_from_bed(bed)

                missing_contigs = bed_contigs - ref_contigs
                if missing_contigs:
                    inpname = f"{inp_bed.id()}.{b_idx}" if is_array else inp_bed.id()
                    contiglist = (
                        ", ".join(missing_contigs)
                        if len(missing_contigs) < 5
                        else (", ".join(list(missing_contigs)[:3]) + "...")
                    )
                    Logger.warn(
                        f"The BED file '{inpname}' contained {len(missing_contigs)} contigs ({contiglist}) "
                        f"that were missing from the reference: {value_ref}"
                    )
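# Illustrative sketch (hypothetical helpers, not the janis ContigChecker API):
# extracting contig names from a .fai index and a BED file, then reporting contigs
# present in the BED but absent from the reference, as the check above does.
def _contigs_from_fai(fai_text: str) -> set:
    # a .fai line is: <contig>\t<length>\t<offset>\t<linebases>\t<linewidth>
    return {line.split("\t")[0] for line in fai_text.splitlines() if line.strip()}

def _contigs_from_bed(bed_text: str) -> set:
    # a BED line starts with the contig name
    return {line.split("\t")[0] for line in bed_text.splitlines() if line.strip()}

_ref = _contigs_from_fai("chr1\t248956422\t112\t70\t71\nchr2\t242193529\t0\t70\t71")
_bed = _contigs_from_bed("chr1\t100\t200\nchrM\t0\t16569")
# _bed - _ref == {"chrM"}  -> would trigger the warning above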
def tool_modifier(self, tool: Tool, inputs: Dict, hints: Dict[str, str]) -> Tool:
    # Build custom pipeline
    w = WorkflowBuilder(
        tool.id(), friendly_name=tool.friendly_name(), version=tool.version()
    )

    ins = tool.tool_inputs()
    insdict = {i.id(): i for i in ins}
    fields = set(self.batch.fields)

    inkeys = set(i.id() for i in ins)
    invalid_keys = fields - inkeys
    if len(invalid_keys) > 0:
        raise Exception(
            f"Couldn't create batchtool from fields {', '.join(invalid_keys)} "
            f"as they do not exist on '{tool.id()}'"
        )

    if self.batch.groupby not in inputs:
        raise Exception(
            f"the group_by field '{self.batch.groupby}' was not found in the inputs"
        )

    innode_base = {}

    for i in ins:
        if i.id() in fields:
            continue

        innode_base[i.id()] = w.input(i.id(), i.intype, default=i.default, doc=i.doc)

    raw_groupby_values = inputs[self.batch.groupby]

    duplicate_keys = find_duplicates(raw_groupby_values)
    if len(duplicate_keys) > 0:
        raise Exception(
            f"There are duplicate group_by ({self.batch.groupby}) keys in the input: "
            + ", ".join(duplicate_keys)
        )

    groupby_values = [
        Validators.transform_identifier_to_be_valid(ident)
        for ident in raw_groupby_values
    ]
    duplicate_keys = find_duplicates(groupby_values)
    if len(duplicate_keys) > 0:
        raise Exception(
            f"Janis transformed the values in the group_by field ({self.batch.groupby}) to be valid identifiers; "
            f"after this transformation, there were duplicate keys: "
            + ", ".join(duplicate_keys)
        )

    w.input(self.GROUPBY_FIELDNAME, Array(str), value=groupby_values)

    steps_created = []

    stepid_from_gb = lambda gb: f"{gb}_{tool.id()}"

    for gbvalue in groupby_values:
        extra_ins = {}
        for f in fields:
            newkey = f"{f}_{gbvalue}"
            extra_ins[f] = w.input(newkey, insdict[f].intype)

        steps_created.append(
            w.step(stepid_from_gb(gbvalue), tool(**innode_base, **extra_ins))
        )

    def transform_token_in_output_namers(token, outputid):
        if token is None:
            return token
        if isinstance(token, list):
            return [transform_token_in_output_namers(t, outputid) for t in token]
        if isinstance(token, InputSelector):
            if token.input_to_select in fields:
                # need to transform it
                return InputSelector(f"{token.input_to_select}_{outputid}")
            else:
                return token
        elif isinstance(token, (str, int, float, bool)):
            return token
        else:
            raise Exception(
                f"Unsure how to translate token of type {token.__class__.__name__}"
            )

    for out in tool.tool_outputs():
        output_folders = []
        output_name = out.id()
        if isinstance(tool, Workflow):
            outnode = tool.output_nodes[out.id()]
            output_folders = outnode.output_folder or []

            if outnode.output_name:
                output_name = outnode.output_name

        for gbvalue, raw_gbvalue in zip(groupby_values, raw_groupby_values):
            # This is pretty hacky: we're relying on the output_folder and output_name to be InputSelectors
            # or a literal value, otherwise this will probably break (this will probably break for expressions)
            output_folders_transformed = transform_token_in_output_namers(
                output_folders, gbvalue
            )
            output_name_transformed = transform_token_in_output_namers(
                output_name, gbvalue
            )

            w.output(
                f"{gbvalue}_{out.id()}",
                source=w[stepid_from_gb(gbvalue)][out.id()],
                output_name=output_name_transformed,
                output_folder=[raw_gbvalue, *(output_folders_transformed or [])],
            )

    return w
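# Illustrative sketch (not janis API): the naming scheme the batch modifiers above
# produce — one step per group value, per-group input keys, and prefixed outputs.
# Tool and field names below are made up.
def _batch_names(tool_id, groupby_values, fields, output_ids):
    return {
        "steps": [f"{gb}_{tool_id}" for gb in groupby_values],
        "inputs": [f"{f}_{gb}" for gb in groupby_values for f in fields],
        "outputs": [f"{gb}_{o}" for gb in groupby_values for o in output_ids],
    }

# _batch_names("bwamem", ["sampleA", "sampleB"], ["fastq"], ["bam"])
# -> steps:   ["sampleA_bwamem", "sampleB_bwamem"]
#    inputs:  ["fastq_sampleA", "fastq_sampleB"]
#    outputs: ["sampleA_bam", "sampleB_bam"]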