def embed_all_runs(workflow_tool, cwl_args=None, location=None):
    """
    Recursively replaces every string "run" field of a Workflow with the
    loaded tool it references. Inputs whose "class" is not "Workflow" are
    returned untouched. "cwl_args" can be used to update default arguments
    used by loading and runtime contexts. When "location" is provided, the
    resulting workflow is also saved to a json file.
    Returns the workflow tool with all "run" fields embedded.
    """

    def _inline_runs(node, load_args):
        # Walk nested lists and mappings; whenever a mapping carries a
        # string "run" reference, load it in place, then keep descending.
        if isinstance(node, MutableSequence):
            for element in node:
                _inline_runs(element, load_args)
        elif isinstance(node, MutableMapping):
            run_ref = node.get("run")
            if isinstance(run_ref, str):
                node["run"] = slow_cwl_load(
                    workflow=run_ref,
                    cwl_args=load_args,
                    only_tool=True)
            for value in node.values():
                _inline_runs(value, load_args)

    if workflow_tool["class"] == "Workflow":
        embedded_tool = deepcopy(workflow_tool)  # keep caller's copy intact
        _inline_runs(embedded_tool, cwl_args)
    else:
        embedded_tool = workflow_tool  # nothing to embed

    if location is not None:
        dump_json(embedded_tool, location)

    return embedded_tool
def execute_workflow_step(workflow, task_id, job_data, cwl_args=None, executor=None):
    """
    Constructs and executes a single step workflow based on the "workflow"
    and "task_id". "cwl_args" can be used to update default parameters used
    for loading and runtime contexts. Exports a json file with the execution
    results to the step report location.

    Returns:
        (step_outputs, step_report) tuple.

    Raises:
        ValueError: when the step finished with a status other than "success".
    """
    cwl_args = {} if cwl_args is None else cwl_args
    executor = SingleJobExecutor() if executor is None else executor

    step_tmp_folder, step_cache_folder, step_outputs_folder, step_report = get_temp_folders(
        task_id=task_id, job_data=job_data)

    default_cwl_args = get_default_cwl_args(cwl_args)
    default_cwl_args.update({  # add execution specific parameters
        "tmp_outdir_prefix": step_cache_folder + "/",
        "tmpdir_prefix": step_cache_folder + "/",
        "cidfile_dir": step_tmp_folder,
        "cidfile_prefix": task_id,
        # job should already have abs path for inputs, so this is useless
        "basedir": os.getcwd(),
        "outdir": step_outputs_folder
    })

    workflow_step_path = os.path.join(step_tmp_folder,
                                      task_id + "_step_workflow.cwl")

    fast_cwl_step_load(  # will save new workflow to "workflow_step_path"
        workflow=workflow,
        target_id=task_id,
        cwl_args=default_cwl_args,
        location=workflow_step_path)

    _stderr = sys.stderr  # to trick the logger
    sys.stderr = sys.__stderr__
    try:
        step_outputs, step_status = executor(
            slow_cwl_load(workflow=workflow_step_path,
                          cwl_args=default_cwl_args),
            job_data, RuntimeContext(default_cwl_args))
    finally:
        # restore stderr even when the executor raises, so logging isn't
        # left redirected for the rest of the process
        sys.stderr = _stderr

    if step_status != "success":
        # include a message so failures are easier to diagnose
        raise ValueError("Failed to run workflow step")

    # To remove "http://commonwl.org/cwltool#generation": 0 (copied from cwltool)
    visit_class(step_outputs, ("File", ), MutationManager().unset_generation)

    dump_json(step_outputs, step_report)

    return step_outputs, step_report
def relocate_outputs(workflow, job_data, cwl_args=None):
    """
    Moves or copies filtered outputs into "outputs_folder" depending on the
    "move_outputs" value of the runtime context; "tmp_folder" itself is not
    deleted here because that happens when the DAG finishes running. Maps
    step outputs from "job_data" back to workflow-level output IDs
    (collecting items into a list when "outputSource" was a list) and keeps
    only those items. Saves a report with relocated outputs as
    "workflow_report.json" into "outputs_folder". "cwl_args" can be used to
    update default parameters used for loading and runtime contexts.
    """
    cwl_args = {} if cwl_args is None else cwl_args

    default_cwl_args = get_default_cwl_args(cwl_args)
    workflow_tool = fast_cwl_load(workflow=workflow,
                                  cwl_args=default_cwl_args)

    job_data_copy = deepcopy(job_data)

    # Keep only job items referenced by workflow outputs, remapped to the
    # proper workflow output IDs (without the step id). A list
    # "outputSource" (even of length 1) yields a list of the same size.
    filtered_job_data = {}
    for output_id, output_data in get_items(workflow_tool["outputs"]):
        sources = output_data["outputSource"]
        gathered = [
            job_data_copy[source_id.replace("/", "_")]
            for source_id, _ in get_items(sources)
        ]
        filtered_job_data[output_id] = gathered if isinstance(sources, list) else gathered[0]

    runtime_context = RuntimeContext(default_cwl_args)
    relocated_job_data = relocateOutputs(
        outputObj=filtered_job_data,
        destination_path=job_data_copy["outputs_folder"],
        # need to set it to tmp_folder otherwise it won't be able to delete
        # tmp data if action is "move"
        source_directories=[job_data_copy["tmp_folder"]],
        action=runtime_context.move_outputs,
        fs_access=runtime_context.make_fs_access(""),
        compute_checksum=runtime_context.compute_checksum,
        path_mapper=runtime_context.path_mapper)

    # Dump report with relocated outputs
    workflow_report = os.path.join(job_data_copy["outputs_folder"],
                                   "workflow_report.json")
    dump_json(relocated_job_data, workflow_report)

    return relocated_job_data, workflow_report
def execute(self, context):
    """
    Loads the job object from the context. Sets "tmp_folder" and
    "outputs_folder" if they have not been set before in the job. In case
    "tmp_folder" and/or "outputs_folder" were read from the job and are
    relative, resolves them against "tmp_folder" and/or "outputs_folder"
    from "cwl_args". Dumps step outputs as a json file into "tmp_folder".
    Returns (writes to X-Com) the report file location.
    """
    setup_cwl_logger(context["ti"])
    post_status(context)

    # for easy access
    dag_id = context["dag"].dag_id
    workflow = context["dag"].workflow
    run_id = context["run_id"].replace(":", "_").replace(
        "+", "_")  # to make it dumpable by json
    cwl_args = context["dag"].default_args["cwl"]

    # Loads job from dag_run configuration. Sets defaults from "workflow".
    # Fails on missing input files
    job_data = load_job(workflow=workflow,
                        job=context["dag_run"].conf["job"],
                        cwl_args=cwl_args)

    # Create a scratch directory only when the job didn't provide one.
    # The previous implementation passed mkdtemp(...) as the default of
    # job_data.get(...), which evaluated it eagerly and left an unused
    # temp directory behind whenever "tmp_folder" was already set.
    if "tmp_folder" not in job_data:
        job_data["tmp_folder"] = mkdtemp(dir=cwl_args["tmp_folder"],
                                         prefix=dag_id + "_" + run_id + "_")
    job_data["tmp_folder"] = get_dir(
        get_absolute_path(job_data["tmp_folder"], cwl_args["tmp_folder"]))

    if "outputs_folder" not in job_data:
        job_data["outputs_folder"] = os.path.join(cwl_args["outputs_folder"],
                                                  dag_id, run_id)
    job_data["outputs_folder"] = get_dir(
        get_absolute_path(job_data["outputs_folder"],
                          cwl_args["outputs_folder"]))

    _, _, _, step_report = get_temp_folders(task_id=self.task_id,
                                            job_data=job_data)

    dump_json(job_data, step_report)

    return step_report
def fast_cwl_step_load(workflow, target_id, cwl_args=None, location=None):
    """
    Returns workflow (CommentedMap) that includes only the single step
    selected by "target_id" from the parsed "workflow". Other steps are
    removed. Workflow inputs and outputs are updated based on the "source"
    fields of "in" and "out" from the selected workflow step. If the
    selected step includes a "scatter" field, all output types are
    transformed to the nested/flat array of items of the same type. IDs of
    updated workflow inputs and outputs, as well as IDs of the
    correspondent "source" fields, also include the step id separated by
    underscore. All other fields remain unchanged.

    "cwl_args" can be used to update the default location of
    "pickle_folder" used by "fast_cwl_load" as well as other parameters
    used by "slow_cwl_load" for loading and runtime contexts.
    If "location" is not None, exports the modified workflow to that file.
    """
    cwl_args = {} if cwl_args is None else cwl_args

    default_cwl_args = get_default_cwl_args(cwl_args)

    # accumulators for the single-step workflow being assembled
    workflow_inputs = []
    workflow_outputs = []
    workflow_steps = []

    workflow_tool = fast_cwl_load(workflow=workflow,
                                  cwl_args=default_cwl_args)

    # get_items(..., target_id) yields matching (id, value) pairs;
    # [0][1] takes the value of the first (expected only) match
    selected_step = list(get_items(workflow_tool["steps"], target_id))[0][1]

    workflow_steps.append(selected_step)

    for _, step_in in get_items(selected_step.get(
            "in", [])):  # step might not have "in"

        updated_sources = []  # to keep track of updated sources

        for step_in_source, _ in get_items(step_in.get(
                "source", [])):  # "in" might not have "source"

            try:

                # try to find workflow input that corresponds to "source";
                # IndexError/KeyError below means it is not a workflow-level
                # input and must come from an upstream step instead
                workflow_input = list(
                    get_items(workflow_tool["inputs"], step_in_source))[0][1]

                updated_workflow_input = {
                    "id": step_in_source,
                    "type": workflow_input["type"]
                }

                # need to copy:
                # original inputBinding because it can include loadContents section
                # loadContents and loadListing sections if present outside of inputBinding
                # both "default" and "secondaryFiles" if present
                # TODO: Do I need to copy format?
                for key in [
                        "default", "secondaryFiles", "inputBinding",
                        "loadContents", "loadListing"
                ]:
                    if key in workflow_input:
                        updated_workflow_input[key] = workflow_input[key]

                # Check if we have already added input based on the same "source"
                # from another item from "in". Skip adding the same input twice.
                if len(list(get_items(workflow_inputs, step_in_source))) == 0:
                    workflow_inputs.append(updated_workflow_input)

                updated_sources.append(step_in_source)

            except (IndexError, KeyError):

                # Need to find upstream step that corresponds to "source"
                upstream_step = list(
                    get_items(
                        workflow_tool["steps"],
                        get_short_id(step_in_source,
                                     only_step_name=True)))[0][1]

                # Need to load tool from "run" of the found upstream step
                # and look for the output that corresponds to "source".
                # We look for correspondence only based on "id"
                upstream_step_tool = fast_cwl_load(
                    workflow=upstream_step["run"], cwl_args=default_cwl_args)

                upstream_step_output = list(
                    get_items(
                        {
                            get_short_id(k, only_id=True): v
                            for k, v in get_items(
                                upstream_step_tool["outputs"])
                        },  # trick
                        get_short_id(step_in_source, only_id=True)))[0][1]

                step_in_source_with_step_id = step_in_source.replace(
                    "/", "_")  # to include both step name and id

                # Check if it should be assumed optional (default field is present)
                # NOTE: consider also checking if upstream step had scatter, so the
                # output type should become array based on the scatter parameters
                if "default" in step_in:
                    upstream_step_output_type = [
                        "null", upstream_step_output["type"]
                    ]
                else:
                    upstream_step_output_type = upstream_step_output["type"]

                updated_workflow_input = {
                    "id": step_in_source_with_step_id,
                    "type": upstream_step_output_type
                }

                # No need to copy "secondaryFiles" for outputs from other steps
                # because they should be already included into the generated json
                # report file
                #
                # TODO: Do I need to copy format to "workflow_inputs"?

                # Check if we have already added input based on the same "source"
                # from another item from "in". Skip adding the same input twice.
                if len(
                        list(
                            get_items(workflow_inputs,
                                      step_in_source_with_step_id))) == 0:
                    workflow_inputs.append(updated_workflow_input)

                updated_sources.append(step_in_source_with_step_id)

        # replace "source" in step's "in" if anything was updated
        if len(updated_sources) > 0:
            if isinstance(step_in["source"], list):
                step_in["source"] = updated_sources
            else:
                step_in["source"] = updated_sources[0]

    # Need to load tool from the "run" field of the selected step
    # and look for the outputs that correspond to the items from "out".
    # We look for correspondence only based on "id"
    selected_step_tool = fast_cwl_load(workflow=selected_step["run"],
                                       cwl_args=default_cwl_args)

    for step_out, _ in get_items(selected_step["out"]):

        selected_step_output = list(
            get_items(
                {
                    get_short_id(k, only_id=True): v
                    for k, v in get_items(selected_step_tool["outputs"])
                },  # trick
                get_short_id(step_out, only_id=True)))[0][1]

        step_out_with_step_id = step_out.replace(
            "/", "_")  # to include both step name and id

        # update output type in case of scatter
        if "scatter" in selected_step:
            selected_step_output = deepcopy(
                selected_step_output
            )  # need to deepcopy, otherwise we change embedded tool's output
            # nested_crossproduct over a list of scatter inputs nests the
            # array type once per scattered input; every other scatter
            # configuration wraps the type in a single array level
            if isinstance(selected_step["scatter"], MutableSequence) \
                    and selected_step.get("scatterMethod") == "nested_crossproduct":
                nesting = len(selected_step["scatter"])
            else:
                nesting = 1
            for _ in range(0, nesting):
                selected_step_output["type"] = {
                    "type": "array",
                    "items": selected_step_output["type"]
                }

        workflow_outputs.append({
            "id": step_out_with_step_id,
            "type": selected_step_output["type"],
            "outputSource": step_out
        })

    # overwrite inputs/outputs/steps of the loaded workflow in place so all
    # other top-level fields (cwlVersion, requirements, etc.) are preserved
    workflow_tool.update({
        "inputs": workflow_inputs,
        "outputs": workflow_outputs,
        "steps": workflow_steps
    })

    if location is not None:
        dump_json(workflow_tool, location)

    return workflow_tool
def relocate_outputs(workflow, job_data, cwl_args=None, remove_tmp_folder=None):
    """
    Relocates filtered outputs into "outputs_folder" and, by default,
    removes "tmp_folder" afterwards (set "remove_tmp_folder" to False to
    keep it). Saves a report with the relocated outputs as
    "workflow_report.json" into "outputs_folder". Maps step outputs from
    "job_data" back to workflow-level output IDs (combining items into a
    list when "outputSource" was a list) and keeps only those items.
    "cwl_args" can be used to update default parameters used for loading
    and runtime contexts.
    """
    cwl_args = {} if cwl_args is None else cwl_args
    remove_tmp_folder = True if remove_tmp_folder is None else remove_tmp_folder

    default_cwl_args = get_default_cwl_args(cwl_args)
    workflow_tool = fast_cwl_load(workflow=workflow,
                                  cwl_args=default_cwl_args)

    job_data_copy = deepcopy(job_data)

    # Keep only job items required by workflow outputs, remapped to the
    # proper workflow output IDs (without the step id). A list
    # "outputSource" (even of length 1) yields a list of the same size.
    filtered_job_data = {}
    for out_id, out_data in get_items(workflow_tool["outputs"]):
        out_sources = out_data["outputSource"]
        picked_items = []
        for src_id, _ in get_items(out_sources):
            picked_items.append(job_data_copy[src_id.replace("/", "_")])
        if isinstance(out_sources, list):
            filtered_job_data[out_id] = picked_items
        else:
            filtered_job_data[out_id] = picked_items[0]

    # Outputs will be always copied, because source_directories=[]
    runtime_context = RuntimeContext(default_cwl_args)
    relocated_job_data = relocateOutputs(
        outputObj=filtered_job_data,
        destination_path=job_data_copy["outputs_folder"],
        # use it as a placeholder (shouldn't influence anything)
        source_directories=[],
        action=runtime_context.move_outputs,
        fs_access=runtime_context.make_fs_access(""),
        compute_checksum=runtime_context.compute_checksum,
        path_mapper=runtime_context.path_mapper)

    # Dump report with relocated outputs
    workflow_report = os.path.join(job_data_copy["outputs_folder"],
                                   "workflow_report.json")
    dump_json(relocated_job_data, workflow_report)

    # Clean "tmp_folder"; raise if the folder cannot be removed
    if remove_tmp_folder:
        shutil.rmtree(job_data_copy["tmp_folder"], ignore_errors=False)

    return relocated_job_data, workflow_report
def convert_to_workflow(command_line_tool, location=None):
    """
    Wraps "command_line_tool" into a single-step Workflow, keeping all
    important elements. A tool that is already a Workflow is returned
    unchanged. If "location" is not None, dumps the result to a json file.
    """
    if command_line_tool["class"] == "Workflow":
        workflow_tool = command_line_tool
    else:
        step_id = get_rootname(command_line_tool["id"])

        workflow_tool = {
            "class": "Workflow",
            "cwlVersion": command_line_tool["cwlVersion"],
            "inputs": [],
            "outputs": []
        }

        # carry over requirements when present
        for key in ["requirements"]:
            if key in command_line_tool:
                workflow_tool[key] = command_line_tool[key]

        for input_id, input_data in get_items(command_line_tool["inputs"]):
            # "type" in WorkflowInputParameter cannot have "inputBinding"
            converted_input = {
                "id": input_id,
                "type": remove_field_from_dict(input_data["type"],
                                               "inputBinding")
            }
            for key in ["secondaryFiles", "default"]:  # TODO: Do I need to copy format?
                if key in input_data:
                    converted_input[key] = input_data[key]
            workflow_tool["inputs"].append(converted_input)

        for output_id, output_data in get_items(
                command_line_tool["outputs"]):
            # TODO: not sure if "format" needs to be copied here as well
            workflow_tool["outputs"].append({
                "id": output_id,
                "type": output_data["type"],
                "outputSource": step_id + "/" + output_id
            })

        workflow_tool["steps"] = [{
            "id": step_id,
            "run": command_line_tool,
            "in": [
                {"id": input_id, "source": input_id}
                for input_id, _ in get_items(workflow_tool["inputs"])
            ],
            "out": [
                output_id
                for output_id, _ in get_items(workflow_tool["outputs"])
            ]
        }]

    if location is not None:
        dump_json(workflow_tool, location)

    return workflow_tool
def execute_workflow_step(workflow, task_id, job_data, cwl_args=None, executor=None):
    """
    Constructs and executes a single step workflow based on the "workflow"
    and "task_id". "cwl_args" can be used to update default parameters used
    for loading and runtime contexts. Exports a json file with the
    execution results. If the step was evaluated as one that needs to be
    skipped, the returned "skipped" flag is set to True and the step_report
    file will include "nulls". This function doesn't remove any temporary
    data in both success and failure scenarios.

    Returns (step_outputs, step_report, skipped).
    Raises ValueError when the executed step finishes with a status other
    than "success".
    """
    cwl_args = {} if cwl_args is None else cwl_args
    executor = SingleJobExecutor() if executor is None else executor

    step_tmp_folder, step_cache_folder, step_outputs_folder, step_report = get_temp_folders(
        task_id=task_id, job_data=job_data)

    default_cwl_args = get_default_cwl_args(cwl_args)

    default_cwl_args.update({  # add execution specific parameters
        "tmp_outdir_prefix": step_cache_folder + "/",
        "tmpdir_prefix": step_cache_folder + "/",
        "cidfile_dir": step_tmp_folder,
        "cidfile_prefix": task_id,
        "basedir": os.getcwd(
        ),  # job should already have abs path for inputs, so this is useless
        "outdir": step_outputs_folder
    })

    workflow_step_path = os.path.join(step_tmp_folder,
                                      task_id + "_step_workflow.cwl")

    fast_cwl_step_load(  # will save new workflow to "workflow_step_path"
        workflow=workflow,
        target_id=task_id,
        cwl_args=default_cwl_args,
        location=workflow_step_path)

    workflow_data = slow_cwl_load(workflow=workflow_step_path,
                                  cwl_args=default_cwl_args)

    # Pre-seed all outputs with None so a skipped step still produces a
    # complete (all-null) report
    skipped = True
    step_outputs = {
        output_id: None
        for output_id, _ in get_items(workflow_data.tool["outputs"])
    }

    if need_to_run(workflow_data, job_data, task_id):

        skipped = False

        _stderr = sys.stderr  # to trick the logger
        sys.stderr = sys.__stderr__
        step_outputs, step_status = executor(workflow_data, job_data,
                                             RuntimeContext(default_cwl_args))
        sys.stderr = _stderr

        if step_status != "success":
            raise ValueError("Failed to run workflow step")

        # To remove "http://commonwl.org/cwltool#generation": 0
        # (copied from cwltool)
        visit_class(step_outputs, ("File", ),
                    MutationManager().unset_generation)

    dump_json(step_outputs, step_report)

    return step_outputs, step_report, skipped
def fast_cwl_step_load(workflow, target_id, cwl_args=None, location=None):
    """
    Returns workflow (CommentedMap) that includes only the single step
    selected by "target_id" from the parsed "workflow". Other steps are
    removed. Workflow inputs and outputs are updated based on the "source"
    fields of "in" and "out" from the selected workflow step. IDs of
    updated workflow inputs and outputs, as well as IDs of the
    correspondent "source" fields, also include the step id separated by
    underscore. All other fields remain unchanged.

    "cwl_args" can be used to update the default location of
    "pickle_folder" used by "fast_cwl_load" as well as other parameters
    used by "slow_cwl_load" for loading and runtime contexts.
    If "location" is not None, exports the modified workflow to that file.
    """
    cwl_args = {} if cwl_args is None else cwl_args

    default_cwl_args = get_default_cwl_args(cwl_args)

    # accumulators for the single-step workflow being assembled
    workflow_inputs = []
    workflow_outputs = []
    workflow_steps = []

    workflow_tool = fast_cwl_load(workflow=workflow,
                                  cwl_args=default_cwl_args)

    # get_items(..., target_id) yields matching (id, value) pairs;
    # [0][1] takes the value of the first (expected only) match
    selected_step = list(get_items(workflow_tool["steps"], target_id))[0][1]

    workflow_steps.append(selected_step)

    for _, step_in in get_items(selected_step.get(
            "in", [])):  # step might not have "in"

        updated_sources = []  # to keep track of updated sources

        for step_in_source, _ in get_items(step_in.get(
                "source", [])):  # "in" might not have "source"

            try:

                # try to find workflow input that corresponds to "source";
                # IndexError/KeyError below means it is not a workflow-level
                # input and must come from an upstream step instead
                workflow_input = list(
                    get_items(workflow_tool["inputs"], step_in_source))[0][1]

                updated_workflow_input = {
                    "id": step_in_source,
                    "type": workflow_input["type"]
                }

                # need to copy both "default" and "secondaryFiles" if present
                for key in ["default", "secondaryFiles"
                            ]:  # TODO: Do I need to copy format?
                    if key in workflow_input:
                        updated_workflow_input[key] = workflow_input[key]

                # Check if we have already added input based on the same "source"
                # from another item from "in". Skip adding the same input twice.
                if len(list(get_items(workflow_inputs, step_in_source))) == 0:
                    workflow_inputs.append(updated_workflow_input)

                updated_sources.append(step_in_source)

            except (IndexError, KeyError):

                # Need to find upstream step that corresponds to "source"
                upstream_step = list(
                    get_items(
                        workflow_tool["steps"],
                        get_short_id(step_in_source,
                                     only_step_name=True)))[0][1]

                # Need to load tool from "run" of the found upstream step
                # and look for the output that corresponds to "source".
                # We look for correspondence only based on "id"
                upstream_step_tool = fast_cwl_load(
                    workflow=upstream_step["run"], cwl_args=default_cwl_args)

                upstream_step_output = list(
                    get_items(
                        {
                            get_short_id(k, only_id=True): v
                            for k, v in get_items(
                                upstream_step_tool["outputs"])
                        },  # trick
                        get_short_id(step_in_source, only_id=True)))[0][1]

                step_in_source_with_step_id = step_in_source.replace(
                    "/", "_")  # to include both step name and id

                updated_workflow_input = {
                    "id": step_in_source_with_step_id,
                    "type": upstream_step_output["type"]
                }

                # No need to copy "secondaryFiles" for outputs from other steps
                # because they should be already included into the generated json
                # report file
                #
                # TODO: Do I need to copy format to "workflow_inputs"?

                # Check if we have already added input based on the same "source"
                # from another item from "in". Skip adding the same input twice.
                if len(
                        list(
                            get_items(workflow_inputs,
                                      step_in_source_with_step_id))) == 0:
                    workflow_inputs.append(updated_workflow_input)

                updated_sources.append(step_in_source_with_step_id)

        # replace "source" in step's "in" if anything was updated
        if len(updated_sources) > 0:
            if isinstance(step_in["source"], list):
                step_in["source"] = updated_sources
            else:
                step_in["source"] = updated_sources[0]

    # Need to load tool from the "run" field of the selected step
    # and look for the outputs that correspond to the items from "out".
    # We look for correspondence only based on "id"
    selected_step_tool = fast_cwl_load(workflow=selected_step["run"],
                                       cwl_args=default_cwl_args)

    for step_out, _ in get_items(selected_step["out"]):

        selected_step_output = list(
            get_items(
                {
                    get_short_id(k, only_id=True): v
                    for k, v in get_items(selected_step_tool["outputs"])
                },  # trick
                get_short_id(step_out, only_id=True)))[0][1]

        step_out_with_step_id = step_out.replace(
            "/", "_")  # to include both step name and id

        workflow_outputs.append({
            "id": step_out_with_step_id,
            "type": selected_step_output["type"],
            "outputSource": step_out
        })

    # overwrite inputs/outputs/steps of the loaded workflow in place so all
    # other top-level fields (cwlVersion, requirements, etc.) are preserved
    workflow_tool.update({
        "inputs": workflow_inputs,
        "outputs": workflow_outputs,
        "steps": workflow_steps
    })

    if location is not None:
        dump_json(workflow_tool, location)

    return workflow_tool