def __init__(self, **connections):
    """
    Initialise an empty workflow graph and then invoke ``self.constructor()``.

    :param connections: Arbitrary keyword arguments, stored as-is on ``self.connections``.
    :raises Exception: When ``self.id()`` is rejected by ``Validators.validate_identifier``.
    """
    super().__init__(metadata_class=WorkflowMetadata)
    self.connections = connections

    identifier = self.id()
    Logger.log(f"Creating workflow with identifier: '{identifier}'")

    if not Validators.validate_identifier(identifier):
        reason = Validators.reason_for_failure(identifier)
        raise Exception(f"The identifier '{identifier}' was invalid because {reason}")

    # Lookup tables over the graph: every node by id, plus one table per node kind.
    self.nodes: Dict[str, Node] = {}
    self.input_nodes: Dict[str, InputNode] = {}
    self.step_nodes: Dict[str, StepNode] = {}
    self.output_nodes: Dict[str, OutputNode] = {}

    # Requirement flags; all start off and are expected to be switched on elsewhere.
    self.has_scatter = False
    self.has_subworkflow = False
    self.has_multiple_inputs = False

    # State is ready, so let the concrete class build its graph. Per the original
    # author's note, WorkflowBuilder's constructor() does nothing and nodes are added later.
    self.constructor()
def __init__(
    self,
    tag: str,
    output_type: ParseableType,
    glob: Optional[Union[Selector, str]] = None,
    doc: Optional[str] = None,
):
    """
    A ToolOutput tells the engine how to collect an output and how that output
    may be referenced in a workflow.

    :param tag: Identifier of the output; must be unique across the inputs and outputs.
    :param output_type: Type of the output being collected (instantiated via ``get_instantiated_type``).
    :param glob: How to collect this output; accepts any :class:`janis.Selector` or a string.
    :param doc: Documentation describing the output, used to generate docs.
    :raises Exception: If ``tag`` is not a valid identifier.
    """
    tag_is_valid = Validators.validate_identifier(tag)
    if not tag_is_valid:
        reason = Validators.reason_for_failure(tag)
        raise Exception(f"The identifier '{tag}' was invalid because {reason}")

    self.tag = tag
    self.output_type: ParseableType = get_instantiated_type(output_type)
    self.glob = glob
    self.doc = doc
def __init__(
    self,
    tag: str,
    output_type: ParseableType,
    glob: Optional[Union[Selector, str]] = None,
    presents_as: str = None,
    secondaries_present_as: Dict[str, str] = None,
    doc: Optional[Union[str, OutputDocumentation]] = None,
):
    """
    A ToolOutput tells the engine how to collect an output and how that output
    may be referenced in a workflow.

    :param tag: Identifier of the output; must be unique across the inputs and outputs.
    :param output_type: Type of the output being collected.
    :param glob: How to collect this output; accepts any :class:`janis.Selector`.
        Required unless the output type is Stdout or Stderr.
    :param presents_as: Stored on the instance; not otherwise used by this constructor.
    :param secondaries_present_as: Mapping keyed by secondary-file entries of the output
        type; every key must appear in ``output_type.secondary_files()``.
    :param doc: Documentation for the output; a plain value is wrapped in ``OutputDocumentation``.
    :raises Exception: On an invalid tag, a missing glob, or an invalid ``secondaries_present_as``.
    """
    if not Validators.validate_identifier(tag):
        reason = Validators.reason_for_failure(tag)
        raise Exception(f"The identifier '{tag}' was invalid because {reason}")

    self.tag = tag
    self.output_type: ParseableType = get_instantiated_type(output_type)

    # Stdout / Stderr outputs are the only ones allowed to omit a glob.
    is_stream_type = isinstance(self.output_type, (Stdout, Stderr))
    if not (glob or is_stream_type):
        raise Exception(
            "ToolOutput expects a glob when the output type is not Stdout / Stderr"
        )

    self.glob = glob
    self.presents_as = presents_as
    self.secondaries_present_as = secondaries_present_as

    if isinstance(doc, OutputDocumentation):
        self.doc = doc
    else:
        self.doc = OutputDocumentation(doc=doc)

    # Nothing further to validate when no secondary remapping was requested.
    if not self.secondaries_present_as:
        return

    if not self.output_type.secondary_files():
        raise Exception(
            f"The ToolOutput '{self.id()}' requested a rewrite of secondary file extension through "
            f"'secondaries_present_as', but the type {self.output_type.id()} not have any secondary files."
        )

    secs = set(self.output_type.secondary_files())
    invalid = set(self.secondaries_present_as.keys()) - secs
    if invalid:
        raise Exception(
            f"Error when constructing output '{self.id()}', the secondaries_present_as contained secondary "
            f"files ({', '.join(invalid)}) that were not found in the output "
            f"type '{self.output_type.id()}' ({', '.join(secs)})"
        )
def get_tool_tag_from_identifier(cls, identifier):
    # Derive a tag from the identifier, then keep prompting on stdin until the
    # candidate passes Validators.validate_identifier.
    candidate = cls.get_tag_from_identifier(identifier)
    while True:
        if Validators.validate_identifier(candidate):
            return candidate
        prompt = f"The tag for tool: '{candidate}' (fullID: {identifier}) was invalid, please choose another: "
        candidate = str(input(prompt))
def __init__(
    self,
    tag: str,
    input_type: ParseableType,
    position: Optional[int] = None,
    prefix: Optional[str] = None,
    separate_value_from_prefix: bool = None,
    prefix_applies_to_all_elements: bool = None,
    separator: str = None,
    shell_quote: bool = None,
    localise_file: bool = None,
    default: Any = None,
    doc: Optional[str] = None,
):
    """
    A ``ToolInput`` is an input to a tool together with the parameters that bind it
    onto the command line. It must have either a position or a prefix to be bound.

    :param tag: The identifier of the input (unique to inputs and outputs of a tool)
    :param input_type: The data type that this input accepts
    :type input_type: ``janis.ParseableType``
    :param position: The position of the input to be applied. (Default = 0, after the base_command).
    :param prefix: The prefix placed before the element. (By default a space is also applied,
        see ``separate_value_from_prefix``)
    :param separate_value_from_prefix: (Default: True) Add a space between the prefix and value when ``True``.
    :param prefix_applies_to_all_elements: Applies the prefix to each element of the array (Array inputs only)
    :param separator: The separator between each element of an array (defaults to ' ')
    :param shell_quote: Stops shell quotes from being applied in all circumstances, useful when
        joining multiple commands together.
    :param localise_file: Ensures that the file(s) are localised into the execution directory.
    :param default: The default value to be applied if the input is not defined.
    :param doc: Documentation string for the ToolInput, used to generate the tool documentation
        and provide hints to the user.
    :raises Exception: If ``tag`` is not a valid identifier.
    """
    # Command-line binding options are handled by the parent class.
    binding_options = dict(
        value=None,
        prefix=prefix,
        position=position,
        separate_value_from_prefix=separate_value_from_prefix,
        doc=doc,
        shell_quote=shell_quote,
    )
    super().__init__(**binding_options)

    # Note: supplying a default does not modify input_type here (no optional flag is set).
    if not Validators.validate_identifier(tag):
        reason = Validators.reason_for_failure(tag)
        raise Exception(f"The identifier '{tag}' was not validated because {reason}")

    self.tag: str = tag
    self.input_type: ParseableType = get_instantiated_type(input_type)
    self.default = default
    self.prefix_applies_to_all_elements = prefix_applies_to_all_elements
    self.separator = separator
    self.localise_file = localise_file
def verify_identifier(self, identifier: str, component: str):
    """
    Raise if ``identifier`` cannot be used for a new component on this workflow.

    Checks run in order: the name must not be an existing instance attribute, must not
    already be a key of ``self.nodes``, and must pass ``Validators.validate_identifier``.

    :param identifier: Proposed id for the component being added.
    :param component: Description of the component, used only in the clash message.
    :raises Exception: If any of the three checks fails.
    """
    reserved_names = self.__dict__
    if identifier in reserved_names:
        raise Exception(f"'{identifier}' is a protected keyword for a janis workflow")

    known_nodes = self.nodes
    if identifier in known_nodes:
        existing = known_nodes[identifier]
        raise Exception(
            f"There already exists a node (and component) with id '{identifier}'. The added "
            f"component ('{component}') clashes with '{existing!r}')."
        )

    if Validators.validate_identifier(identifier):
        return

    reason = Validators.reason_for_failure(identifier)
    raise Exception(f"The identifier '{identifier}' was invalid because {reason}")
def inputs_modifier(self, wf: Tool, inputs: Dict, hints: Dict[str, str]) -> Dict:
    """
    Re-key the batched input fields so there is one entry per group-by value.

    Every input named in ``self.batch.fields`` is removed and replaced by one entry
    per group-by value, keyed ``"<field>_<transformed group-by value>"`` and holding
    ``inputs[field][idx]``. The transformed group-by values themselves are stored
    under ``"groupby_field"``.

    :param wf: The tool being run (unused in this method).
    :param inputs: Input dictionary; must contain the ``self.batch.groupby`` key.
    :param hints: Hints dictionary (unused in this method).
    :return: A new dictionary; ``inputs`` itself is not mutated here.
    :raises Exception: If the group-by field is absent from ``inputs``.
    :raises ValueError: If the group-by value is not a ``list``.
    """
    groupby_key = self.batch.groupby
    if groupby_key not in inputs:
        # This message was previously a plain string, so the placeholder was printed literally.
        raise Exception(
            f"the group_by field '{groupby_key}' was not found in the inputs"
        )

    # batch_inputs is seen as the source of truth for the length operations
    raw_groupby_values = inputs[groupby_key]

    # Check the raw value: the list comprehension below always yields a list, so
    # checking its result could never fail (and a str would be split per character).
    if not isinstance(raw_groupby_values, list):
        raise ValueError(
            f"The value of the groupBy field '{groupby_key}' was not a 'list', got '{type(raw_groupby_values)}'"
        )

    groupby_values = [
        Validators.transform_identifier_to_be_valid(ident)
        for ident in raw_groupby_values
    ]

    self.validate_inputs(inputs, groupby_values)

    # Split up the inputs dict to be keyed by the groupBy field
    fields = set(self.batch.fields)
    retval = {k: v for k, v in inputs.items() if k not in fields}
    retval["groupby_field"] = groupby_values

    for f in fields:
        for idx, gb_value in enumerate(groupby_values):
            retval[f"{f}_{gb_value}"] = inputs[f][idx]

    return retval
def __init__(
    self,
    tag: str,
    output_type: ParseableType,
    selector: Optional[Union[Selector, str]] = None,
    presents_as: str = None,
    secondaries_present_as: Dict[str, str] = None,
    doc: Optional[Union[str, OutputDocumentation]] = None,
    glob: Optional[Union[Selector, str]] = None,
    _skip_output_quality_check=False,
):
    """
    A ToolOutput instructs the engine how to collect an output and how it may be
    referenced in a workflow.

    :param tag: The identifier of an output, must be unique in the inputs and outputs.
    :param output_type: The type of output that is being collected.
    :param selector: How to collect this output, can accept any :class:`janis.Selector`.
        Required unless the output type is Stdout / Stderr.
    :param presents_as: Stored on the instance; cannot be combined with an ``Operator`` selector.
    :param secondaries_present_as: Mapping keyed by secondary-file entries of the output type;
        every key must appear in ``output_type.secondary_files()``.
    :param doc: Documentation on what the output is, used to generate docs. A plain value is
        wrapped in ``OutputDocumentation``.
    :param glob: (DEPRECATED) An alias for ``selector``; supplying both raises ``TypeError``.
    :param _skip_output_quality_check: DO NOT USE THIS PARAMETER. Disables the
        "selector is required" check; per the original note it exists for parsing CWL
        ExpressionTools when a cwl.output.json is generated.
    :raises Exception: On an invalid tag, a missing selector, ``presents_as`` combined with an
        Operator selector, or an invalid ``secondaries_present_as``.
    :raises TypeError: If both ``selector`` and ``glob`` are supplied.
    """
    if not Validators.validate_identifier(tag):
        raise Exception(
            f"The identifier '{tag}' was invalid because {Validators.reason_for_failure(tag)}"
        )

    self.tag = tag
    self.output_type: ParseableType = get_instantiated_type(output_type)
    self._skip_output_quality_check = _skip_output_quality_check

    # 'glob' is the deprecated spelling of 'selector': accept either, never both.
    if glob is not None:
        if selector is not None:
            # The message used to name ToolInput and recommend the deprecated 'glob'.
            raise TypeError(
                f"ToolOutput({tag}) received inputs for both selector and glob. Please only use selector"
            )
        selector = glob

    if (
        not _skip_output_quality_check
        and selector is None
        and not isinstance(self.output_type, (Stdout, Stderr))
    ):
        raise Exception(
            "ToolOutput expects a 'selector=' param when the output type is not Stdout / Stderr"
        )

    self.selector = selector
    self.presents_as = presents_as
    self.secondaries_present_as = secondaries_present_as
    self.doc = (
        doc if isinstance(doc, OutputDocumentation) else OutputDocumentation(doc=doc)
    )

    if isinstance(selector, Operator) and self.presents_as:
        raise Exception(
            f"Error when constructing output '{self.id()}', Janis does not support 'presents_as' AND "
            "operators within a ToolOutput selector. Please raise an issue if you think this is in error."
        )

    if self.secondaries_present_as:
        if not self.output_type.secondary_files():
            raise Exception(
                f"The ToolOutput '{self.id()}' requested a rewrite of secondary file extension through "
                f"'secondaries_present_as', but the type {self.output_type.id()} does not have any secondary files."
            )

        secs = set(self.output_type.secondary_files())
        to_remap = set(self.secondaries_present_as.keys())
        invalid = to_remap - secs
        if invalid:
            # Sets have no stable order; sort so the message is reproducible.
            raise Exception(
                f"Error when constructing output '{self.id()}', the secondaries_present_as contained secondary "
                f"files ({', '.join(sorted(invalid))}) that were not found in the output "
                f"type '{self.output_type.id()}' ({', '.join(sorted(secs))})"
            )
def versioned_id(self) -> str:
    """
    Return the id, combined with the version when one is set.

    With no version (``self.version()`` is ``None``) this is just ``self.id()``.
    Otherwise ``"<id>/<version>"`` is passed through
    ``Validators.transform_identifier_to_be_valid`` with ``"_"`` as the second argument.
    """
    version = self.version()
    if version is None:
        return self.id()

    combined = f"{self.id()}/{version}"
    return Validators.transform_identifier_to_be_valid(combined, "_")
def test_invalid_identifiers(self):
    # "test-workflow" is expected to be rejected as an identifier.
    is_valid = Validators.validate_identifier("test-workflow")
    self.assertFalse(is_valid)
def test_valid_identifiers(self):
    # "test_workflow" is expected to be accepted as an identifier.
    is_valid = Validators.validate_identifier("test_workflow")
    self.assertTrue(is_valid)
def test_invalid_sample_name_error(self):
    # The failure reason for this sample name must be something other than "Undefined".
    self.assertNotEqual(
        "Undefined",
        Validators.reason_for_failure("fastqs_CDG-025-156R_PDX"),
    )
def test_invalid_sample_name(self):
    # This sample name is expected to be rejected as an identifier.
    sample_name = "fastqs_CDG-025-156R_PDX"
    self.assertFalse(Validators.validate_identifier(sample_name))
def tool_modifier(self, tool: Tool, inputs: Dict, hints: Dict[str, str]) -> Tool:
    """
    Wrap ``tool`` in a new workflow that runs it once per value of the batch group-by field.

    :param tool: The tool to batch; its id, friendly name and version are reused for the wrapper.
    :param inputs: Input dictionary; must contain the ``self.batch.groupby`` key, and every
        field in ``self.batch.fields`` is indexed positionally per group-by value.
    :param hints: Not used in this method.
    :return: The ``WorkflowBuilder`` that was constructed.
    :raises Exception: If a batch field is not an input of ``tool``, if the group-by field is
        missing from ``inputs``, or if ``find_duplicates`` reports duplicates among the
        group-by values (before or after they are transformed into valid identifiers).
    """
    # Build custom pipeline
    w = WorkflowBuilder(tool.id(), friendly_name=tool.friendly_name(), version=tool.version())

    ins = tool.tool_inputs()
    insdict = {i.id(): i for i in ins}
    fields = set(self.batch.fields)

    # Every batched field has to be an input of the tool.
    inkeys = set(i.id() for i in ins)
    invalid_keys = fields - inkeys
    if len(invalid_keys) > 0:
        raise Exception(
            f"Couldn't create batchtool from fields {', '.join(invalid_keys)} "
            f"as they do not exist on '{tool.id()}'")

    if self.batch.groupby not in inputs:
        raise Exception(
            f"the group_by field '{self.batch.groupby}' was not found in the inputs"
        )

    # Inputs that are NOT batched are declared once on the wrapper and shared by every step.
    innode_base = {}
    for i in ins:
        if i.id() in fields:
            continue

        # A Selector default is replaced with None rather than forwarded to the wrapper input.
        default = i.default
        if isinstance(default, Selector):
            default = None

        innode_base[i.id()] = w.input(i.id(), i.intype, default=default, doc=i.doc)

    raw_groupby_values = inputs[self.batch.groupby]

    # presumably find_duplicates returns the repeated values as strings (they are joined
    # into the message below) -- confirm against its definition.
    duplicate_keys = find_duplicates(raw_groupby_values)
    if len(duplicate_keys) > 0:
        raise Exception(
            f"There are duplicate group_by ({self.batch.groupby}) keys in the input: "
            + ", ".join(duplicate_keys))

    groupby_values = [
        Validators.transform_identifier_to_be_valid(ident)
        for ident in raw_groupby_values
    ]
    # Check again: distinct raw values may have been transformed into the same identifier.
    duplicate_keys = find_duplicates(groupby_values)
    if len(duplicate_keys) > 0:
        raise Exception(
            f"Janis transformed values in the group_by field ({self.batch.groupby}) to be a valid identifiers, "
            f"after this transformation, there were duplicates keys: "
            + ", ".join(duplicate_keys))

    w.input(self.GROUPBY_FIELDNAME, Array(str), value=groupby_values)

    # NOTE(review): steps_created is appended to but never read within this method.
    steps_created = []

    # Step ids take the form "<group-by value>_<tool id>".
    stepid_from_gb = lambda gb: f"{gb}_{tool.id()}"

    # One step per group-by value, each with its own "<field>_<group-by value>" inputs.
    for gbvalue in groupby_values:
        extra_ins = {}
        for f in fields:
            newkey = f"{f}_{gbvalue}"
            extra_ins[f] = w.input(newkey, insdict[f].intype)

        steps_created.append(
            w.step(stepid_from_gb(gbvalue), tool(**innode_base, **extra_ins)))

    # One wrapper output per (tool output, group-by value) pair.
    for out in tool.tool_outputs():
        output_folders = []
        output_name = out.id()
        if isinstance(tool, WorkflowBase):
            # For workflows, reuse the folder / name configured on the original output node.
            outnode = tool.output_nodes[out.id()]
            output_folders = outnode.output_folder or []

            if outnode.output_name is not None:
                output_name = outnode.output_name

        for idx, gbvalue, raw_gbvalue in zip(range(len(groupby_values)), groupby_values, raw_groupby_values):
            # Inputs as this step sees them: each batched field narrowed to its idx-th element.
            transformed_inputs = {
                **inputs,
                **{f: inputs[f][idx] for f in fields}
            }

            output_folders_transformed = Operator.evaluate_arg(
                output_folders, transformed_inputs)
            output_name_transformed = Operator.evaluate_arg(
                output_name, transformed_inputs)

            # The raw (untransformed) group-by value is the first output folder.
            w.output(
                f"{gbvalue}_{out.id()}",
                source=w[stepid_from_gb(gbvalue)][out.id()],
                output_name=output_name_transformed,
                output_folder=[
                    raw_gbvalue, *(output_folders_transformed or [])
                ],
            )

    return w
def __init__(
    self,
    tag: str,
    input_type: ParseableType,
    position: Optional[int] = None,
    prefix: Optional[str] = None,
    separate_value_from_prefix: bool = None,
    prefix_applies_to_all_elements: bool = None,
    presents_as: str = None,
    secondaries_present_as: Dict[str, str] = None,
    separator: str = None,
    shell_quote: bool = None,
    localise_file: bool = None,
    default: Any = None,
    doc: Optional[Union[str, InputDocumentation]] = None,
):
    """
    A ``ToolInput`` represents an input to a tool, with parameters that allow it to be bound
    on the command line. The ToolInput must have either a position or prefix set to be bound
    onto the command line.

    :param tag: The identifier of the input (unique to inputs and outputs of a tool)
    :param input_type: The data type that this input accepts
    :type input_type: ``janis.ParseableType``
    :param position: The position of the input to be applied. (Default = 0, after the base_command).
    :param prefix: The prefix placed before the element. (By default, a space will also be
        applied, see ``separate_value_from_prefix`` for more information)
    :param separate_value_from_prefix: (Default: True) Add a space between the prefix and value when ``True``.
    :param prefix_applies_to_all_elements: Applies the prefix to each element of the array (Array inputs only)
    :param presents_as: Stored on the instance; not otherwise used by this constructor.
    :param secondaries_present_as: Mapping keyed by secondary-file entries of the input type;
        every key must appear in ``input_type.secondary_files()``.
    :param separator: The separator between each element of an array (defaults to ' ')
    :param shell_quote: Stops shell quotes from being applied in all circumstances, useful when
        joining multiple commands together.
    :param localise_file: Ensures that the file(s) are localised into the execution directory.
    :param default: The default value to be applied if the input is not defined.
    :param doc: Documentation for the ToolInput, used to generate the tool documentation and
        provide hints to the user. Anything that is not a ``DocumentationMeta`` is wrapped in
        ``InputDocumentation``.
    :raises Exception: On an invalid tag or an invalid ``secondaries_present_as``.
    """
    # The parent receives doc=None; the documentation object is stored on this instance instead.
    super().__init__(
        value=None,
        prefix=prefix,
        position=position,
        separate_value_from_prefix=separate_value_from_prefix,
        doc=None,
        shell_quote=shell_quote,
    )

    self.doc: InputDocumentation = (
        doc if isinstance(doc, DocumentationMeta) else InputDocumentation(doc=doc)
    )

    if not Validators.validate_identifier(tag):
        raise Exception(
            f"The identifier '{tag}' was not validated because {Validators.reason_for_failure(tag)}"
        )

    self.tag: str = tag
    self.input_type: ParseableType = get_instantiated_type(input_type)
    self.default = default
    self.prefix_applies_to_all_elements = prefix_applies_to_all_elements
    self.separator = separator
    self.localise_file = localise_file
    self.presents_as = presents_as
    self.secondaries_present_as = secondaries_present_as

    if self.secondaries_present_as:
        # These messages previously said "ToolOutput" / "output", which misdirected
        # users to the wrong declaration.
        if not self.input_type.secondary_files():
            raise Exception(
                f"The ToolInput '{self.id()}' requested a rewrite of secondary file extension through "
                f"'secondaries_present_as', but the type {self.input_type.id()} does not have any secondary files."
            )

        secs = set(self.input_type.secondary_files())
        to_remap = set(self.secondaries_present_as.keys())
        invalid = to_remap - secs
        if invalid:
            # Sets have no stable order; sort so the message is reproducible.
            raise Exception(
                f"Error when constructing input '{self.id()}', the secondaries_present_as contained secondary "
                f"files ({', '.join(sorted(invalid))}) that were not found in the input "
                f"type '{self.input_type.id()}' ({', '.join(sorted(secs))})"
            )
def test_transform_sample_name(self):
    # Transforming this sample name is expected to yield "fastqs_CDG025156R_PDX".
    transformed = Validators.transform_identifier_to_be_valid("fastqs_CDG-025-156R_PDX")
    self.assertEqual("fastqs_CDG025156R_PDX", transformed)
def parse_str(helpstr, option_marker: str = None, requires_prev_line_blank_or_param=False):
    """
    Parse a command-line help string into a description and a list of ``ToolInput`` objects.

    Non-blank lines before the first line starting with an option marker become the
    description. After the marker, each line whose first token starts with ``-`` is treated
    as an argument definition; other lines are appended to the documentation of the
    preceding argument, when there is one.

    :param helpstr: Help text; literal ``\\n`` sequences are converted to newlines first.
    :param option_marker: Extra marker (matched lower-cased) added to ``option_markers``.
    :param requires_prev_line_blank_or_param: When True, a ``-`` line is only parsed as an
        argument while ``last_line_was_blank_or_param`` is set, i.e. until a line that is
        neither an argument nor a continuation is seen (a blank line sets it again).
    :return: ``(doc, args)`` -- the description text and the parsed ``ToolInput`` list.
    :raises Exception: If no line starts with any of the option markers.
    """
    doc = ""
    args = []
    lines = helpstr.replace("\\n", "\n").split("\n")
    options_idx = None

    markers = option_markers
    if option_marker:
        markers = markers.union({option_marker.lower()})

    # Everything before the options marker is the tool description.
    for il, line in enumerate(lines):
        if not line.lstrip():
            continue

        ll = line.strip().lower()
        if any(ll.startswith(m) for m in markers):
            options_idx = il
            break

        doc += line + "\n"

    if options_idx is None:
        raise Exception("Couldn't find the start of the inputs")

    prev_arg = None
    last_line_was_blank_or_param = True

    for line in lines[options_idx + 1:]:
        if not line.lstrip():
            # A blank line ends the current argument's documentation.
            prev_arg = None
            last_line_was_blank_or_param = True
            continue

        line_args = [l.strip() for l in line.lstrip().split(" ") if l]
        largs = len(line_args)

        if largs == 0:
            raise Exception("No args when should have been filtered by previous step")

        tool_doc = ""

        if (
            not requires_prev_line_blank_or_param or last_line_was_blank_or_param
        ) and line_args[0].startswith("-"):
            # sometimes this section has two items
            processed_tags = [
                get_tag_and_cleanup_prefix(p) for p in line_args[0].split(",")
            ]
            processed_tags = [t for t in processed_tags if t is not None]
            if len(processed_tags) < 1:
                continue

            # The entry with the longest tag becomes the identifier; the rest go in the doc.
            tags = sorted(processed_tags, key=lambda l: len(l[1]), reverse=True)
            potential_type = first_or_default([p[3] for p in processed_tags])

            if len(tags) > 1:
                tool_doc += "(" + ", ".join(t[0] for t in tags[1:]) + ") "

            if largs > 1:
                tool_doc += " ".join(line_args[1:])

            prefix, tag, has_equal, guessed_type = tags[0]
            eqifrequired = "=" if has_equal else ""

            if not potential_type:
                potential_type = Boolean

            # Only single-character tags are fixed interactively; longer invalid tags
            # fail in ToolInput below and are skipped.
            if len(tag) == 1:
                while not Validators.validate_identifier(tag):
                    print(
                        f"The tag for '{prefix}' was invalid, we need you to come up with a new identifier for:"
                    )
                    print("\t" + tool_doc if tool_doc else line)
                    tag = str(input("New identifier: "))

            try:
                new_arg = ToolInput(
                    tag,
                    potential_type(optional=True),
                    prefix=prefix + eqifrequired,
                    separate_value_from_prefix=not has_equal,
                    doc=tool_doc.replace('"', "'"),
                )
            except Exception:
                # Previously the stale prev_arg (possibly None, possibly a duplicate) was
                # appended here; a skipped argument must not contribute anything.
                print(f"Skipping '{tag}' as it wasn't validated correctly")
                prev_arg = None
                continue

            prev_arg = new_arg
            args.append(prev_arg)

        elif prev_arg:
            # Continuation line: extend the documentation of the current argument.
            prev_arg.doc.doc += " " + line.lstrip()
        else:
            last_line_was_blank_or_param = False

    return doc, args
def tool_modifier(self, tool: Tool, inputs: Dict, hints: Dict[str, str]) -> Tool:
    """
    Wrap ``tool`` in a new workflow that runs it once per value of the batch group-by field.

    :param tool: The tool to batch; its id, friendly name and version are reused for the wrapper.
    :param inputs: Input dictionary; must contain the ``self.batch.groupby`` key.
    :param hints: Not used in this method.
    :return: The ``WorkflowBuilder`` that was constructed.
    :raises Exception: If a batch field is not an input of ``tool``, if the group-by field is
        missing from ``inputs``, if ``find_duplicates`` reports duplicates among the group-by
        values (before or after transformation), or if an output folder / name token is of a
        type that cannot be translated.
    """
    # Build custom pipeline
    w = WorkflowBuilder(
        tool.id(), friendly_name=tool.friendly_name(), version=tool.version()
    )

    ins = tool.tool_inputs()
    insdict = {i.id(): i for i in ins}
    fields = set(self.batch.fields)

    # Every batched field has to be an input of the tool.
    inkeys = set(i.id() for i in ins)
    invalid_keys = fields - inkeys
    if len(invalid_keys) > 0:
        raise Exception(
            f"Couldn't create batchtool from fields {', '.join(invalid_keys)} "
            f"as they do not exist on '{tool.id()}'"
        )

    if self.batch.groupby not in inputs:
        raise Exception(
            f"the group_by field '{self.batch.groupby}' was not found in the inputs"
        )

    # Inputs that are NOT batched are declared once on the wrapper and shared by every step.
    innode_base = {}
    for i in ins:
        if i.id() in fields:
            continue

        innode_base[i.id()] = w.input(i.id(), i.intype, default=i.default, doc=i.doc)

    raw_groupby_values = inputs[self.batch.groupby]

    duplicate_keys = find_duplicates(raw_groupby_values)
    if len(duplicate_keys) > 0:
        raise Exception(
            f"There are duplicate group_by ({self.batch.groupby}) keys in the input: "
            + ", ".join(duplicate_keys)
        )

    groupby_values = [
        Validators.transform_identifier_to_be_valid(ident)
        for ident in raw_groupby_values
    ]
    # Check again: distinct raw values may have been transformed into the same identifier.
    duplicate_keys = find_duplicates(groupby_values)
    if len(duplicate_keys) > 0:
        raise Exception(
            f"Janis transformed values in the group_by field ({self.batch.groupby}) to be a valid identifiers, "
            f"after this transformation, there were duplicates keys: "
            + ", ".join(duplicate_keys)
        )

    w.input(self.GROUPBY_FIELDNAME, Array(str), value=groupby_values)

    def stepid_from_gb(gb):
        # Use the argument: the previous lambda ignored it and read the enclosing loop
        # variable, which only worked because every caller passed that same variable.
        return f"{gb}_{tool.id()}"

    # One step per group-by value, each with its own "<field>_<group-by value>" inputs.
    for gbvalue in groupby_values:
        extra_ins = {}
        for f in fields:
            newkey = f"{f}_{gbvalue}"
            extra_ins[f] = w.input(newkey, insdict[f].intype)

        w.step(stepid_from_gb(gbvalue), tool(**innode_base, **extra_ins))

    def transform_token_in_output_namers(token, outputid):
        # Rewrite InputSelectors that point at a batched field so that they point at the
        # per-group input ("<field>_<outputid>"); literals pass through unchanged.
        if token is None:
            return token
        if isinstance(token, list):
            return [transform_token_in_output_namers(t, outputid) for t in token]
        if isinstance(token, InputSelector):
            if token.input_to_select in fields:
                # need to transform it
                return InputSelector(f"{token.input_to_select}_{outputid}")
            else:
                return token
        elif isinstance(token, (str, int, float, bool)):
            return token
        else:
            raise Exception(
                f"Unsure how to translate token of type {token.__class__.__name__} "
            )

    # One wrapper output per (tool output, group-by value) pair.
    for out in tool.tool_outputs():
        output_folders = []
        output_name = out.id()
        if isinstance(tool, Workflow):
            # For workflows, reuse the folder / name configured on the original output node.
            outnode = tool.output_nodes[out.id()]
            output_folders = outnode.output_folder or []

            if outnode.output_name:
                output_name = outnode.output_name

        for gbvalue, raw_gbvalue in zip(groupby_values, raw_groupby_values):
            # This is pretty hacky, we're relying on the output_folder and output_name to be
            # InputSelectors or a literal value, otherwise this will probably break
            # (this will probably break for expressions)
            output_folders_transformed = transform_token_in_output_namers(
                output_folders, gbvalue
            )
            output_name_transformed = transform_token_in_output_namers(
                output_name, gbvalue
            )

            # The raw (untransformed) group-by value is the first output folder.
            w.output(
                f"{gbvalue}_{out.id()}",
                source=w[stepid_from_gb(gbvalue)][out.id()],
                output_name=output_name_transformed,
                output_folder=[raw_gbvalue, *(output_folders_transformed or [])],
            )

    return w