def prepare_quickstart(tool: Tool):
    required_python_input_map = "\n".join(
        " " * 15 + i.id() + "=None,"
        for i in tool.tool_inputs()
        if not i.intype.optional
    )

    python_step_name = tool.id().lower() + "_step"
    output_python_code = "\n".join(
        " " * 7 + f'wf.output("{o.id()}", source={python_step_name}.{o.id()})'
        for o in tool.tool_outputs()
    )

    python_codeblock = f"""\
   .. code-block:: python

       from {tool.__module__} import {tool.__class__.__name__}

       wf = WorkflowBuilder("myworkflow")

       wf.step(
           "{python_step_name}",
           {tool.__class__.__name__}(
{required_python_input_map}
           )
       )
{output_python_code}
    """

    return f"""\

def validate_inputs(tool: Tool, additional_inputs):
    errors = {}

    input_values_from_workflow = {}
    if isinstance(tool, Workflow):
        input_values_from_workflow = {
            inpkey: inp.value
            for inpkey, inp in tool.input_nodes.items()
            if inp.value
        }

    input_values_to_use = {**input_values_from_workflow, **additional_inputs}
    for inp in tool.tool_inputs():
        inpkey = inp.id()
        value = input_values_to_use.get(inpkey)

        if inp.intype.validate_value(value, allow_null_if_not_optional=False):
            continue

        errors[inpkey] = (
            inp.intype.invalid_value_hint(value)
            or f"An internal error occurred when validating {inpkey} from {inp.intype.id()}"
        )

    if len(errors) == 0:
        return True

    raise ValueError(f"There were errors in {len(errors)} inputs: " + str(errors))

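# Illustrative only: a minimal sketch of how validate_inputs might be called, assuming
# `my_workflow` is a janis Tool/Workflow instance already in scope and the input ids
# ("sampleName", "fastqs") are hypothetical. It returns True when every non-optional
# input validates, and raises a ValueError listing the offending inputs otherwise.
#
#     try:
#         validate_inputs(my_workflow, {"sampleName": "NA12878", "fastqs": ["/data/R1.fastq.gz"]})
#     except ValueError as e:
#         Logger.critical(str(e))
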
def check_existence_of_files(wf: Tool, inputs: Dict):
    doesnt_exist = {}

    for inp in wf.tool_inputs():
        intype = inp.intype
        is_path = isinstance(intype, (File, Directory))
        is_array_of_paths = isinstance(intype, Array) and isinstance(
            intype.fundamental_type(), (File, Directory)
        )

        if not (is_path or is_array_of_paths):
            continue

        val = inputs.get(inp.id())
        if val is None:
            if inp.intype.optional:
                continue
            raise Exception(f"Expected input '{inp.id()}' was not found or is null")

        doesnt_exist.update(InputChecker.check_base_with_type(inp, intype, val))

    if len(doesnt_exist) > 0:
        import ruamel.yaml

        stringified = ruamel.yaml.dump(doesnt_exist, default_flow_style=False)
        raise Exception("The following inputs were not found:\n" + stringified)

def prepare_run_instructions_input_file(
    tool: Tool, user_inps: dict, other_inps: dict, reference_information: str
):
    yaml_user_inps = CwlTranslator.stringify_translated_inputs(user_inps)
    yaml_other_inps = CwlTranslator.stringify_translated_inputs(other_inps)
    indented_user = "".join(" " * 7 + s for s in yaml_user_inps.splitlines(True))
    indented_other = "".join(" " * 7 + s for s in yaml_other_inps.splitlines(True))

    not_localising_secondary_warning = ""
    if isinstance(tool, WorkflowBase):
        inputs_that_arent_localising_secondary_files = [
            t.id() for t in tool.tool_inputs() if t.doc.skip_sourcing_secondary_files
        ]
        if len(inputs_that_arent_localising_secondary_files) > 0:
            not_localising_secondary_warning = f"""\
.. warning::

   The secondary files for the inputs '{"', '".join(inputs_that_arent_localising_secondary_files)}' will not automatically \
localise using janis prepare and are built just after download. Please note this can take a few hours to build \
before the pipeline runs.
"""

    has_static = len(other_inps) > 0

    tb = " " * 4
    run_args = ["janis run [...run options]", tb + "--inputs inputs.yaml"]

    static_generation = (
        ""
        if not has_static
        else f"""\
   # static inputs
   janis inputs --static {tool.id()} > static.yaml"""
    )
    static_yaml = (
        ""
        if not has_static
        else f"""\
**static.yaml**

.. code-block:: yaml

{indented_other}"""
    )
    if has_static:
        run_args.append(tb + "--inputs static.yaml")

    if isinstance(tool, CommandTool) and not tool.container():
        run_args.append(
            tb + f"--container-override '{tool.id()}=<organisation/container:version>'"
        )

    run_args.append(tb + tool.id())
    run_statement = " \\\n".join(" " * 3 + el for el in run_args)

    if reference_information:
        reference_information = f"The following inputs have a suggested source. Using janis prepare with the relevant \
``--source-hint`` will automatically download these files. See `below <#additional-configuration-inputs>`_ for \
more information about inputs for {tool.id()}.\n{reference_information}"

    return f"""\

def prepare_run_instructions(tool: Tool):
    metadata = tool.bind_metadata() or tool.metadata
    has_array_of_arrays_inps = any(
        (isinstance(i.intype, Array) and isinstance(i.intype.subtype(), Array))
        for i in tool.tool_inputs()
    )

    static_input_tuples = [
        [i.id(), i.intype.id(), prepare_source(i.doc.source)]
        for i in tool.tool_inputs()
        if i.doc.quality == InputQualityType.static and i.doc.source is not None
    ]

    reference_information = ""

    if len(static_input_tuples) > 0:
        static_input_headers = ["Name", "Type", "Source"]
        reference_information = tabulate(
            static_input_tuples, headers=static_input_headers, tablefmt="rst"
        )

    # overrides = metadata.sample_input_overrides or {}
    user_inps = {}
    other_inps = {}

    for i in tool.tool_inputs():
        if i.intype.optional or i.default is not None:
            continue
        val = i.doc.example or prepare_default_for_type(i.id(), i.intype)
        if i.doc and i.doc.quality and i.doc.quality != InputQualityType.user:
            other_inps[i.id()] = val
        else:
            # catch None and InputQualityType.user
            user_inps[i.id()] = val

    if has_array_of_arrays_inps:
        return prepare_run_instructions_input_file(
            tool, user_inps, other_inps, reference_information
        )
    else:
        return prepare_run_instructions_cli(
            tool, user_inps, other_inps, reference_information
        )

def inputs_modifier(self, tool: Tool, inputs: Dict, hints: Dict[str, str]) -> Dict:
    new_inputs = {}
    for inp in tool.tool_inputs():
        if not isinstance(inp.intype, File):
            continue
        value = inputs.get(inp.id())
        if value is not None:
            processed_value = self.check_input_for_correctness(
                inp.id(), inp.intype, value
            )
            if processed_value is not None:
                new_inputs[inp.id()] = processed_value

    return {**inputs, **new_inputs}

def inputs_modifier(self, tool: Tool, inputs: Dict, hints: Dict[str, str]) -> Dict:
    """
    Download remote files and replace the input with the local files

    :param tool: an instance of janis tool
    :type tool: janis_core.Tool
    :param inputs: a dictionary of tool inputs
    :type inputs: dict
    :param hints:
    :type hints: dict
    :return: modified input
    :rtype: dict
    """
    new_inputs = {}
    for inp in tool.tool_inputs():
        modification_required = False

        if isinstance(inp.intype, File) or (
            isinstance(inp.intype, Array)
            and isinstance(inp.intype.fundamental_type(), File)
        ):
            if inp.id() in inputs and inputs[inp.id()] is not None:
                modification_required = True

        if modification_required:
            source = inputs[inp.id()]
            basedir = self.cache_dir
            os.makedirs(basedir, exist_ok=True)

            new_inputs[inp.id()] = self.localise_inputs(
                inp.id(),
                inp.intype,
                basedir,
                source,
                # mfranklin 2021-01-08:
                # if we specify a remote input, and we're localising files,
                # we should localise secondary files
                localise_secondary_files=True,
            )

    return {**inputs, **new_inputs}

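# Illustrative only: a rough sketch of using the localising modifier above, assuming
# `modifier` is an instance of the class that defines inputs_modifier (with cache_dir
# configured) and that "reads" is a hypothetical File input of `my_tool`. The remote
# value is downloaded into cache_dir and the returned dict points that input at the
# local copy; all other keys pass through unchanged.
#
#     new_inputs = modifier.inputs_modifier(
#         tool=my_tool,
#         inputs={"reads": "https://example.org/sample_R1.fastq.gz"},
#         hints={},
#     )
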
def do_bed_fasta_contig_check(tool: Tool, inputs: Dict[str, any]):
    from janis_bioinformatics.data_types import Fasta, Bed, BedTabix

    supported_bed_types = (Bed, BedTabix)

    beds_inputs = []
    refs = []

    for i in tool.tool_inputs():
        if isinstance(i.intype, supported_bed_types) or (
            isinstance(i.intype, Array)
            and isinstance(i.intype.subtype(), supported_bed_types)
        ):
            beds_inputs.append(i)

        if (
            isinstance(i.intype, Fasta)
            and i.intype.secondary_files()
            and ".fai" in i.intype.secondary_files()
        ):
            refs.append(i)

    if len(refs) == 0:
        return
    if len(refs) > 1:
        Logger.info(
            "Skipping bioinformatics FASTA-BED file checks as there was more than one reference"
        )
        return

    for inp_ref in refs:
        value_ref = inputs[inp_ref.id()]
        if not value_ref:
            Logger.warn(f"Skipping '{inp_ref.id()}' as no value was provided")
            continue

        ref_contigs = ContigChecker.get_list_of_contigs_from_fastafai(
            value_ref + ".fai"
        )

        if not ref_contigs:
            Logger.debug(
                f"Didn't get any contigs from ref {value_ref}.fai, skipping..."
            )
            continue

        for inp_bed in beds_inputs:
            value_bed = inputs[inp_bed.id()]
            is_array = isinstance(value_bed, list)
            beds = value_bed if is_array else [value_bed]

            for b_idx in range(len(beds)):
                bed = beds[b_idx]

                bed_contigs = ContigChecker.get_list_of_contigs_from_bed(bed)

                missing_contigs = bed_contigs - ref_contigs
                if missing_contigs:
                    inpname = f"{inp_bed.id()}.{b_idx}" if is_array else inp_bed.id()
                    contiglist = (
                        ", ".join(missing_contigs)
                        if len(missing_contigs) < 5
                        else (", ".join(list(missing_contigs)[:3]) + "...")
                    )
                    Logger.warn(
                        f"The BED file '{inpname}' contained {len(missing_contigs)} contigs "
                        f"({contiglist}) that were missing from the reference: {value_ref}"
                    )

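# Illustrative only: a hedged sketch of calling the FASTA/BED contig check, assuming
# `my_tool` exposes a Fasta input "reference" (with a .fai secondary file) and a Bed
# input "regions" (both names hypothetical). Any BED contig absent from
# hg38.fasta.fai is reported via Logger.warn; nothing is raised.
#
#     do_bed_fasta_contig_check(
#         my_tool,
#         {"reference": "/data/ref/hg38.fasta", "regions": "/data/panels/targets.bed"},
#     )
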
def tool_modifier(self, tool: Tool, inputs: Dict, hints: Dict[str, str]) -> Tool:
    from janis_bioinformatics.data_types import FastaWithDict, Vcf, Bed
    from janis_bioinformatics.tools.illumina import HapPyValidator_0_3_9

    failed_outputs, untyped_outputs = ensure_outputs_are_in_workflow_and_are_compatible(
        tool, self.validation.fields, Vcf()
    )

    if len(failed_outputs) > 0:
        raise Exception(
            f"Some outputs for validation were not found in the tool '{tool.id()}': "
            f"{', '.join(failed_outputs)}"
        )

    if len(untyped_outputs) > 0:
        Logger.critical(
            f"Some outputs for validation from the tool '{tool.id()}' were not "
            f"compatible with VCF: {', '.join(untyped_outputs)}"
        )

    w = WorkflowBuilder(tool.id() + "_validated")

    w.input("validatorReference", FastaWithDict, value=self.validation.reference)
    w.input("validatorTruthVCF", Vcf, value=self.validation.truthVCF)
    w.input("validatorIntervals", Bed(optional=True), value=self.validation.intervals)

    inpdict = {i.id(): w.input(i.id(), i.intype) for i in tool.tool_inputs()}
    toolstp = w.step(tool.id(), tool(**inpdict))

    if isinstance(tool, Workflow):
        wf: Workflow = tool
        for o in wf.output_nodes.values():
            w.output(
                identifier=o.id(),
                source=toolstp[o.id()],
                output_folder=o.output_folder,
                output_name=o.output_name,
            )
    else:
        for o in tool.tool_outputs():
            w.output(identifier=o.id(), source=toolstp[o.id()])

    for o in self.validation.fields:
        sid = "validator_" + o
        valstp = w.step(
            sid,
            HapPyValidator_0_3_9(
                compareVCF=toolstp[o],
                reportPrefix=o,  # this will generate an input node with format validator_{o}_reportPrefix
                reference=w.validatorReference,
                truthVCF=w.validatorTruthVCF,
                intervals=w.validatorIntervals,
            ),
        )

        # Connect all the outputs of the validator to an output
        for vo in valstp.tool.outputs():
            w.output(
                f"validated_{o}_{vo.id()}",
                source=valstp[vo.id()],
                output_folder="validated",
            )

    return w

def tool_modifier(self, tool: Tool, inputs: Dict, hints: Dict[str, str]) -> Tool:
    # Build custom pipeline
    w = WorkflowBuilder(
        tool.id(), friendly_name=tool.friendly_name(), version=tool.version()
    )

    ins = tool.tool_inputs()
    insdict = {i.id(): i for i in ins}
    fields = set(self.batch.fields)

    inkeys = set(i.id() for i in ins)
    invalid_keys = fields - inkeys
    if len(invalid_keys) > 0:
        raise Exception(
            f"Couldn't create batchtool from fields {', '.join(invalid_keys)} "
            f"as they do not exist on '{tool.id()}'"
        )

    if self.batch.groupby not in inputs:
        raise Exception(
            f"the group_by field '{self.batch.groupby}' was not found in the inputs"
        )

    innode_base = {}

    for i in ins:
        if i.id() in fields:
            continue

        default = i.default
        if isinstance(default, Selector):
            default = None

        innode_base[i.id()] = w.input(i.id(), i.intype, default=default, doc=i.doc)

    raw_groupby_values = inputs[self.batch.groupby]

    duplicate_keys = find_duplicates(raw_groupby_values)
    if len(duplicate_keys) > 0:
        raise Exception(
            f"There are duplicate group_by ({self.batch.groupby}) keys in the input: "
            + ", ".join(duplicate_keys)
        )

    groupby_values = [
        Validators.transform_identifier_to_be_valid(ident)
        for ident in raw_groupby_values
    ]

    duplicate_keys = find_duplicates(groupby_values)
    if len(duplicate_keys) > 0:
        raise Exception(
            f"Janis transformed values in the group_by field ({self.batch.groupby}) to be valid identifiers; "
            f"after this transformation, there were duplicate keys: "
            + ", ".join(duplicate_keys)
        )

    w.input(self.GROUPBY_FIELDNAME, Array(str), value=groupby_values)

    steps_created = []

    stepid_from_gb = lambda gb: f"{gb}_{tool.id()}"

    for gbvalue in groupby_values:

        extra_ins = {}
        for f in fields:
            newkey = f"{f}_{gbvalue}"
            extra_ins[f] = w.input(newkey, insdict[f].intype)

        steps_created.append(
            w.step(stepid_from_gb(gbvalue), tool(**innode_base, **extra_ins))
        )

    for out in tool.tool_outputs():
        output_folders = []
        output_name = out.id()
        if isinstance(tool, WorkflowBase):
            outnode = tool.output_nodes[out.id()]
            output_folders = outnode.output_folder or []
            if outnode.output_name is not None:
                output_name = outnode.output_name

        for idx, gbvalue, raw_gbvalue in zip(
            range(len(groupby_values)), groupby_values, raw_groupby_values
        ):
            transformed_inputs = {**inputs, **{f: inputs[f][idx] for f in fields}}

            output_folders_transformed = Operator.evaluate_arg(
                output_folders, transformed_inputs
            )
            output_name_transformed = Operator.evaluate_arg(
                output_name, transformed_inputs
            )

            w.output(
                f"{gbvalue}_{out.id()}",
                source=w[stepid_from_gb(gbvalue)][out.id()],
                output_name=output_name_transformed,
                output_folder=[raw_gbvalue, *(output_folders_transformed or [])],
            )

    return w

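# Illustrative only: the shape of inputs this batch modifier expects, using hypothetical
# names. If self.batch.groupby == "sample_name" and self.batch.fields == ["fastq"], then
# inputs[groupby] supplies one key per batch and each batched field is a parallel list
# indexed in the same order; one step is created per group-by value and its outputs are
# prefixed with that value.
#
#     inputs = {
#         "sample_name": ["NA12878", "NA12891"],
#         "fastq": ["/data/NA12878.fastq.gz", "/data/NA12891.fastq.gz"],
#         "reference": "/data/ref/hg38.fasta",  # not in fields: shared by every step
#     }
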
def tool_modifier(self, tool: Tool, inputs: Dict, hints: Dict[str, str]) -> Tool:
    # Build custom pipeline
    w = WorkflowBuilder(
        tool.id(), friendly_name=tool.friendly_name(), version=tool.version()
    )

    ins = tool.tool_inputs()
    insdict = {i.id(): i for i in ins}
    fields = set(self.batch.fields)

    inkeys = set(i.id() for i in ins)
    invalid_keys = fields - inkeys
    if len(invalid_keys) > 0:
        raise Exception(
            f"Couldn't create batchtool from fields {', '.join(invalid_keys)} "
            f"as they do not exist on '{tool.id()}'"
        )

    if self.batch.groupby not in inputs:
        raise Exception(
            f"the group_by field '{self.batch.groupby}' was not found in the inputs"
        )

    innode_base = {}

    for i in ins:
        if i.id() in fields:
            continue

        innode_base[i.id()] = w.input(i.id(), i.intype, default=i.default, doc=i.doc)

    raw_groupby_values = inputs[self.batch.groupby]

    duplicate_keys = find_duplicates(raw_groupby_values)
    if len(duplicate_keys) > 0:
        raise Exception(
            f"There are duplicate group_by ({self.batch.groupby}) keys in the input: "
            + ", ".join(duplicate_keys)
        )

    groupby_values = [
        Validators.transform_identifier_to_be_valid(ident)
        for ident in raw_groupby_values
    ]
    duplicate_keys = find_duplicates(groupby_values)
    if len(duplicate_keys) > 0:
        raise Exception(
            f"Janis transformed values in the group_by field ({self.batch.groupby}) to be valid identifiers; "
            f"after this transformation, there were duplicate keys: "
            + ", ".join(duplicate_keys)
        )

    w.input(self.GROUPBY_FIELDNAME, Array(str), value=groupby_values)

    steps_created = []

    stepid_from_gb = lambda gb: f"{gb}_{tool.id()}"

    for gbvalue in groupby_values:

        extra_ins = {}
        for f in fields:
            newkey = f"{f}_{gbvalue}"
            extra_ins[f] = w.input(newkey, insdict[f].intype)

        steps_created.append(
            w.step(stepid_from_gb(gbvalue), tool(**innode_base, **extra_ins))
        )

    def transform_token_in_output_namers(token, outputid):
        if token is None:
            return token
        if isinstance(token, list):
            return [transform_token_in_output_namers(t, outputid) for t in token]
        if isinstance(token, InputSelector):
            if token.input_to_select in fields:
                # need to transform it
                return InputSelector(f"{token.input_to_select}_{outputid}")
            else:
                return token
        elif isinstance(token, (str, int, float, bool)):
            return token
        else:
            raise Exception(
                f"Unsure how to translate token of type {token.__class__.__name__}"
            )

    for out in tool.tool_outputs():
        output_folders = []
        output_name = out.id()
        if isinstance(tool, Workflow):
            outnode = tool.output_nodes[out.id()]
            output_folders = outnode.output_folder or []
            if outnode.output_name:
                output_name = outnode.output_name

        for gbvalue, raw_gbvalue in zip(groupby_values, raw_groupby_values):
            # This is pretty hacky: we're relying on the output_folder and output_name
            # to be InputSelectors or literal values, otherwise this will probably break
            # (it will probably break for expressions)
            output_folders_transformed = transform_token_in_output_namers(
                output_folders, gbvalue
            )
            output_name_transformed = transform_token_in_output_namers(
                output_name, gbvalue
            )

            w.output(
                f"{gbvalue}_{out.id()}",
                source=w[stepid_from_gb(gbvalue)][out.id()],
                output_name=output_name_transformed,
                output_folder=[raw_gbvalue, *(output_folders_transformed or [])],
            )

    return w