def execute(self, context):
    collected_outputs = {}
    for task_outputs in self.xcom_pull(
            context=context,
            task_ids=[task.task_id for task in self.upstream_list]):
        collected_outputs = merge(collected_outputs, task_outputs["outputs"])
    logging.debug('Collected outputs: \n{}'.format(
        json.dumps(collected_outputs, indent=4)))

    tmp_folder = collected_outputs["tmp_folder"]
    output_folder = collected_outputs["output_folder"]

    relocated_outputs = relocateOutputs(
        outputObj={
            output_id: collected_outputs[output_src]
            for output_src, output_id in self.dag.get_output_list().items()
            if output_src in collected_outputs
        },
        outdir=output_folder,
        output_dirs=[output_folder],
        action="copy",
        fs_access=StdFsAccess(""))

    relocated_outputs = {
        key.split("/")[-1]: val
        for key, val in relocated_outputs.items()
    }

    shutil.rmtree(tmp_folder, ignore_errors=False)
    logging.debug(
        'Delete temporary output directory: \n{}'.format(tmp_folder))
    logging.info("WORKFLOW RESULTS\n" + json.dumps(relocated_outputs, indent=4))
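# A minimal, self-contained sketch of the gathering step above: upstream XCom
# payloads are assumed to look like {"outputs": {...}} (hypothetical sample data),
# and a plain recursive dict merge stands in for the imported merge() helper.
def merge_dicts(base, extra):
    """Recursively merge "extra" into a copy of "base"."""
    result = dict(base)
    for key, value in extra.items():
        if key in result and isinstance(result[key], dict) and isinstance(value, dict):
            result[key] = merge_dicts(result[key], value)
        else:
            result[key] = value
    return result


upstream_payloads = [                       # stands in for xcom_pull(...) results
    {"outputs": {"tmp_folder": "/tmp/run_1", "bam": "/tmp/run_1/sample.bam"}},
    {"outputs": {"output_folder": "/data/results"}},
]

collected_outputs = {}
for payload in upstream_payloads:
    collected_outputs = merge_dicts(collected_outputs, payload["outputs"])

print(collected_outputs)
# {'tmp_folder': '/tmp/run_1', 'bam': '/tmp/run_1/sample.bam', 'output_folder': '/data/results'}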
def relocate_outputs(workflow, job_data, cwl_args=None):
    """
    Moves or copies filtered outputs to "outputs_folder" depending on the
    "runtime_context.move_outputs" value; "tmp_folder", however, is not
    deleted here, because that happens when the DAG finishes running.
    Saves a report with the relocated outputs as "workflow_report.json"
    in "outputs_folder". Maps outputs from "workflow" back to normal
    (from step_id_step_out to workflow output) and filters "job_data"
    based on them (combining items from "job_data" into a list based on
    "outputSource" if it was a list). "cwl_args" can be used to update
    the default parameters used for the loading and runtime contexts.
    """

    cwl_args = {} if cwl_args is None else cwl_args

    default_cwl_args = get_default_cwl_args(cwl_args)

    workflow_tool = fast_cwl_load(workflow=workflow, cwl_args=default_cwl_args)

    # Filter "job_data" to include only items required by workflow outputs.
    # Remap keys to the proper workflow output IDs (without the step id).
    # If "outputSource" was a list, even of len=1, find all corresponding
    # items in "job_data" and assign them as a list of the same size.
    job_data_copy = deepcopy(job_data)
    filtered_job_data = {}
    for output_id, output_data in get_items(workflow_tool["outputs"]):
        collected_job_items = []
        for source_id, _ in get_items(output_data["outputSource"]):
            collected_job_items.append(job_data_copy[source_id.replace("/", "_")])
        if isinstance(output_data["outputSource"], list):
            filtered_job_data[output_id] = collected_job_items
        else:
            filtered_job_data[output_id] = collected_job_items[0]

    runtime_context = RuntimeContext(default_cwl_args)
    relocated_job_data = relocateOutputs(
        outputObj=filtered_job_data,
        destination_path=job_data_copy["outputs_folder"],
        # must point to tmp_folder, otherwise tmp data can't be deleted when action is "move"
        source_directories=[job_data_copy["tmp_folder"]],
        action=runtime_context.move_outputs,
        fs_access=runtime_context.make_fs_access(""),
        compute_checksum=runtime_context.compute_checksum,
        path_mapper=runtime_context.path_mapper)

    # Dump a report with the relocated outputs
    workflow_report = os.path.join(job_data_copy["outputs_folder"],
                                   "workflow_report.json")
    dump_json(relocated_job_data, workflow_report)

    return relocated_job_data, workflow_report
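# A minimal sketch of the remapping described in the docstring above, using
# hypothetical workflow outputs and job_data. Step-scoped sources such as
# "step_id/step_out" are flattened to "step_id_step_out" keys before lookup,
# and a list "outputSource" (even of length 1) keeps a list result.
workflow_outputs = {                         # hypothetical parsed workflow outputs
    "bam_file": {"outputSource": "align/sorted_bam"},
    "reports": {"outputSource": ["qc/html_report", "qc/text_report"]},
}
job_data = {                                 # hypothetical collected step results
    "align_sorted_bam": "/tmp/run/align/sample.bam",
    "qc_html_report": "/tmp/run/qc/report.html",
    "qc_text_report": "/tmp/run/qc/report.txt",
}

filtered_job_data = {}
for output_id, output_data in workflow_outputs.items():
    sources = output_data["outputSource"]
    source_list = sources if isinstance(sources, list) else [sources]
    collected = [job_data[s.replace("/", "_")] for s in source_list]
    filtered_job_data[output_id] = collected if isinstance(sources, list) else collected[0]

print(filtered_job_data)
# {'bam_file': '/tmp/run/align/sample.bam',
#  'reports': ['/tmp/run/qc/report.html', '/tmp/run/qc/report.txt']}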
def cwl_gather(self, context):
    upstream_task_ids = [t.task_id for t in self.dag.tasks
                         if isinstance(t, CWLStepOperator)] + \
                        ([self.reader_task_id] if self.reader_task_id else [])
    upstream_data = self.xcom_pull(context=context, task_ids=upstream_task_ids)
    _logger.debug('{0}: xcom_pull data: \n {1}'.format(
        self.task_id, dumps(upstream_data, indent=4)))

    promises = {}
    for data in upstream_data:
        promises = merge(promises, data["promises"])
        if "outdir" in data:
            self.outdir = data["outdir"]

    if "output_folder" in promises:
        self.output_folder = os.path.abspath(promises["output_folder"])
    else:
        return

    _move_job = {out: promises[out] for out in self.outputs}
    _logger.debug(
        '{0}: Final job: \n{1}\nMoving data: \n{2}\nMoving job:{3}'.format(
            self.task_id, dumps(promises, indent=4),
            dumps(self.outputs, indent=4), dumps(_move_job, indent=4)))

    _files_moved = relocateOutputs(_move_job, self.output_folder,
                                   [self.outdir],
                                   self.dag.default_args["move_outputs"],
                                   StdFsAccess(""))
    _job_result = {
        val.split("/")[-1]: _files_moved[out]  # TODO: is split required?
        for out, val in self.outputs.items()
        if out in _files_moved
    }

    try:
        if self.outdir:
            shutil.rmtree(self.outdir, ignore_errors=False)
            _logger.info('{0}: Delete temporary output directory {1}'.format(
                self.task_id, self.outdir))
    except Exception as e:
        _logger.error(
            "{0}: Temporary output directory hasn't been set {1}".format(
                self.task_id, e))

    _logger.info("Job done: {}".format(dumps(_job_result, indent=4)))
    return _job_result, promises
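# A minimal sketch of how "_move_job" and "_job_result" are derived above, with
# hypothetical "promises" and "outputs" mappings and a hand-written stand-in for
# the relocateOutputs() result.
promises = {                                          # merged upstream results (hypothetical)
    "output_folder": "/data/results",
    "aligned_reads": {"class": "File", "location": "/tmp/step/sample.bam"},
}
outputs = {"aligned_reads": "align/aligned_reads"}    # workflow output -> outputSource

move_job = {out: promises[out] for out in outputs}    # only declared outputs are moved
files_moved = {                                       # stands in for relocateOutputs()
    "aligned_reads": {"class": "File", "location": "/data/results/sample.bam"},
}

job_result = {
    val.split("/")[-1]: files_moved[out]
    for out, val in outputs.items()
    if out in files_moved
}
print(job_result)
# {'aligned_reads': {'class': 'File', 'location': '/data/results/sample.bam'}}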
def executor(self, tool, job_order, **kwargs):
    final_output = []
    final_status = []

    def output_callback(out, processStatus):
        final_status.append(processStatus)
        final_output.append(out)

    if "basedir" not in kwargs:
        raise WorkflowException("Must provide 'basedir' in kwargs")

    output_dirs = set()

    if kwargs.get("outdir"):
        finaloutdir = os.path.abspath(kwargs.get("outdir"))
    else:
        finaloutdir = None

    if kwargs.get("tmp_outdir_prefix"):
        kwargs["outdir"] = tempfile.mkdtemp(prefix=kwargs["tmp_outdir_prefix"])
    else:
        kwargs["outdir"] = tempfile.mkdtemp()

    output_dirs.add(kwargs["outdir"])
    kwargs["mutation_manager"] = MutationManager()

    jobReqs = None
    if "cwl:requirements" in job_order:
        jobReqs = job_order["cwl:requirements"]
    elif ("cwl:defaults" in tool.metadata
          and "cwl:requirements" in tool.metadata["cwl:defaults"]):
        jobReqs = tool.metadata["cwl:defaults"]["cwl:requirements"]
    if jobReqs:
        for req in jobReqs:
            tool.requirements.append(req)

    if kwargs.get("default_container"):
        tool.requirements.insert(0, {
            "class": "DockerRequirement",
            "dockerPull": kwargs["default_container"]
        })

    jobs = tool.job(job_order, output_callback, **kwargs)
    try:
        for runnable in jobs:
            if runnable:
                builder = kwargs.get("builder", None)
                if builder is not None:
                    runnable.builder = builder
                if runnable.outdir:
                    output_dirs.add(runnable.outdir)
                runnable.run(**kwargs)
            else:
                time.sleep(1)
    except WorkflowException as e:
        raise e
    except Exception as e:
        log.error("Got exception")
        raise WorkflowException(str(e))

    # wait for all processes to finish
    self.wait()

    if final_output and final_output[0] and finaloutdir:
        final_output[0] = relocateOutputs(final_output[0], finaloutdir,
                                          output_dirs,
                                          kwargs.get("move_outputs"),
                                          kwargs["make_fs_access"](""))

    if kwargs.get("rm_tmpdir"):
        cleanIntermediate(output_dirs)

    if final_output and final_status:
        return (final_output[0], final_status[0])
    else:
        return (None, "permanentFail")
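# A minimal sketch of the callback-driven result collection used by the executor
# above: each finished job reports through output_callback, and the first recorded
# output/status pair becomes the return value. All data here is hypothetical.
final_output = []
final_status = []

def output_callback(out, process_status):
    final_status.append(process_status)
    final_output.append(out)

# Stands in for tool.job(...) runnables eventually invoking the callback
output_callback({"report": {"class": "File", "location": "/outdir/report.html"}},
                "success")

if final_output and final_status:
    result = (final_output[0], final_status[0])
else:
    result = (None, "permanentFail")
print(result)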
def relocate_outputs(workflow, job_data, cwl_args=None, remove_tmp_folder=None):
    """
    Relocates filtered outputs to "outputs_folder" and, by default, removes
    "tmp_folder" unless "remove_tmp_folder" is explicitly set to False.
    Saves a report with the relocated outputs as "workflow_report.json" in
    "outputs_folder". Maps outputs from "workflow" back to normal (from
    step_id_step_out to workflow output) and filters "job_data" based on them
    (combining items from "job_data" into a list based on "outputSource" if it
    was a list). "cwl_args" can be used to update the default parameters used
    for the loading and runtime contexts.
    """

    cwl_args = {} if cwl_args is None else cwl_args
    remove_tmp_folder = True if remove_tmp_folder is None else remove_tmp_folder

    default_cwl_args = get_default_cwl_args(cwl_args)

    workflow_tool = fast_cwl_load(workflow=workflow, cwl_args=default_cwl_args)

    # Filter "job_data" to include only items required by workflow outputs.
    # Remap keys to the proper workflow output IDs (without the step id).
    # If "outputSource" was a list, even of len=1, find all corresponding
    # items in "job_data" and assign them as a list of the same size.
    job_data_copy = deepcopy(job_data)
    filtered_job_data = {}
    for output_id, output_data in get_items(workflow_tool["outputs"]):
        collected_job_items = []
        for source_id, _ in get_items(output_data["outputSource"]):
            collected_job_items.append(job_data_copy[source_id.replace("/", "_")])
        if isinstance(output_data["outputSource"], list):
            filtered_job_data[output_id] = collected_job_items
        else:
            filtered_job_data[output_id] = collected_job_items[0]

    # Outputs will always be copied, because source_directories=[]
    runtime_context = RuntimeContext(default_cwl_args)
    relocated_job_data = relocateOutputs(
        outputObj=filtered_job_data,
        destination_path=job_data_copy["outputs_folder"],
        source_directories=[],  # placeholder only; shouldn't influence anything
        action=runtime_context.move_outputs,
        fs_access=runtime_context.make_fs_access(""),
        compute_checksum=runtime_context.compute_checksum,
        path_mapper=runtime_context.path_mapper)

    # Dump a report with the relocated outputs
    workflow_report = os.path.join(job_data_copy["outputs_folder"],
                                   "workflow_report.json")
    dump_json(relocated_job_data, workflow_report)

    # Clean "tmp_folder"
    if remove_tmp_folder:
        shutil.rmtree(job_data_copy["tmp_folder"], ignore_errors=False)

    return relocated_job_data, workflow_report
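# A minimal sketch of the copy-then-cleanup behaviour of this variant, using
# shutil.copy2 as a stand-in for relocateOutputs; the helper name and all paths
# are hypothetical and handle only scalar file outputs.
import os
import shutil
import tempfile

def copy_outputs_and_cleanup(filtered_job_data, outputs_folder, tmp_folder,
                             remove_tmp_folder=True):
    os.makedirs(outputs_folder, exist_ok=True)
    relocated = {}
    for output_id, path in filtered_job_data.items():
        destination = os.path.join(outputs_folder, os.path.basename(path))
        shutil.copy2(path, destination)           # outputs are always copied here
        relocated[output_id] = destination
    if remove_tmp_folder:                         # tmp data removed only on request
        shutil.rmtree(tmp_folder, ignore_errors=False)
    return relocated

# Usage with throwaway directories
tmp_folder = tempfile.mkdtemp()
outputs_folder = tempfile.mkdtemp()
source_file = os.path.join(tmp_folder, "sample.bam")
open(source_file, "w").close()
print(copy_outputs_and_cleanup({"bam_file": source_file}, outputs_folder, tmp_folder))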
def executor(self, tool, job_order, runtimeContext, **kwargs):
    """Executor method."""
    final_output = []
    final_status = []

    def output_callback(out, processStatus):
        final_status.append(processStatus)
        final_output.append(out)

    if not runtimeContext.basedir:
        raise WorkflowException('`runtimeContext` should contain a `basedir`')

    output_dirs = set()

    if runtimeContext.outdir:
        finaloutdir = os.path.abspath(runtimeContext.outdir)
    else:
        finaloutdir = None

    if runtimeContext.tmp_outdir_prefix:
        runtimeContext.outdir = tempfile.mkdtemp(
            prefix=runtimeContext.tmp_outdir_prefix)
    else:
        runtimeContext.outdir = tempfile.mkdtemp()

    output_dirs.add(runtimeContext.outdir)
    runtimeContext.mutation_manager = MutationManager()

    jobReqs = None
    if "cwl:requirements" in job_order:
        jobReqs = job_order["cwl:requirements"]
    elif ("cwl:defaults" in tool.metadata
          and "cwl:requirements" in tool.metadata["cwl:defaults"]):
        jobReqs = tool.metadata["cwl:defaults"]["cwl:requirements"]
    if jobReqs:
        for req in jobReqs:
            tool.requirements.append(req)

    if not runtimeContext.default_container:
        runtimeContext.default_container = 'frolvlad/alpine-bash'

    runtimeContext.docker_outdir = os.path.join(runtimeContext.working_dir,
                                                "cwl/docker_outdir")
    runtimeContext.docker_tmpdir = os.path.join(runtimeContext.working_dir,
                                                "cwl/docker_tmpdir")
    runtimeContext.docker_stagedir = os.path.join(runtimeContext.working_dir,
                                                  "cwl/docker_stagedir")

    jobs = tool.job(job_order, output_callback, runtimeContext)
    try:
        for runnable in jobs:
            if runnable:
                if runtimeContext.builder:
                    runnable.builder = runtimeContext.builder
                if runnable.outdir:
                    output_dirs.add(runnable.outdir)
                runnable.run(runtimeContext)
            else:
                # log.error(
                #     "Workflow cannot make any more progress"
                # )
                # break
                time.sleep(1)
    except WorkflowException as e:
        traceback.print_exc()
        raise e
    except Exception as e:
        traceback.print_exc()
        raise WorkflowException(str(e))

    # wait for all processes to finish
    self.wait()

    if final_output and final_output[0] and finaloutdir:
        final_output[0] = relocateOutputs(final_output[0], finaloutdir,
                                          output_dirs,
                                          runtimeContext.move_outputs,
                                          runtimeContext.make_fs_access(""))

    if runtimeContext.rm_tmpdir:
        cleanIntermediate(output_dirs)

    if final_output and final_status:
        return str(final_output[0]), str(final_status[0])
    else:
        return None, "permanentFail"
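# A minimal sketch of the "fill in defaults" step above, using SimpleNamespace as
# a stand-in for cwltool's RuntimeContext; the container name and directory layout
# mirror the hard-coded values in the executor, while "working_dir" is assumed data.
import os
from types import SimpleNamespace

runtime_context = SimpleNamespace(default_container=None, working_dir="/data/run_1")

if not runtime_context.default_container:
    runtime_context.default_container = "frolvlad/alpine-bash"

runtime_context.docker_outdir = os.path.join(runtime_context.working_dir,
                                             "cwl/docker_outdir")
runtime_context.docker_tmpdir = os.path.join(runtime_context.working_dir,
                                             "cwl/docker_tmpdir")
runtime_context.docker_stagedir = os.path.join(runtime_context.working_dir,
                                               "cwl/docker_stagedir")

print(runtime_context.default_container, runtime_context.docker_outdir)
# frolvlad/alpine-bash /data/run_1/cwl/docker_outdir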