Example #1
    def execute(self, context):
        collected_outputs = {}
        for task_outputs in self.xcom_pull(
                context=context,
                task_ids=[task.task_id for task in self.upstream_list]):
            collected_outputs = merge(collected_outputs,
                                      task_outputs["outputs"])
        logging.debug('Collected outputs: \n{}'.format(
            json.dumps(collected_outputs, indent=4)))
        tmp_folder = collected_outputs["tmp_folder"]
        output_folder = collected_outputs["output_folder"]
        relocated_outputs = relocateOutputs(
            outputObj={
                output_id: collected_outputs[output_src]
                for output_src, output_id in self.dag.get_output_list().items()
                if output_src in collected_outputs
            },
            outdir=output_folder,
            output_dirs=[output_folder],
            action="copy",
            fs_access=StdFsAccess(""))

        relocated_outputs = {
            key.split("/")[-1]: val
            for key, val in relocated_outputs.items()
        }
        shutil.rmtree(tmp_folder, ignore_errors=False)
        logging.debug(
            'Deleted temporary output directory: \n{}'.format(tmp_folder))
        logging.info("WORKFLOW RESULTS\n" +
                     json.dumps(relocated_outputs, indent=4))
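
The dict comprehension passed as "outputObj" above remaps each collected upstream value from its source key to the workflow output id returned by self.dag.get_output_list(). A minimal, self-contained sketch of that remapping with made-up keys and paths (nothing below comes from a real DAG):

# Hypothetical data: get_output_list() is assumed to map output_src -> output_id
collected_outputs = {
    "step_one_result": {"class": "File", "location": "/tmp/run/result.txt"},
    "tmp_folder": "/tmp/run",
    "output_folder": "/data/outputs",
}
output_list = {"step_one_result": "result", "missing_source": "skipped"}

output_obj = {
    output_id: collected_outputs[output_src]
    for output_src, output_id in output_list.items()
    if output_src in collected_outputs  # sources absent from the collected data are dropped
}
# output_obj == {"result": {"class": "File", "location": "/tmp/run/result.txt"}}
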
Example #2
def relocate_outputs(workflow, job_data, cwl_args=None):
    """
    Moves or copies filtered outputs to "outputs_folder" depending on
    "runtime_context.move_outputs" value, however "tmp_folder" is not
    going to be deleted as it will be done when DAG finishes running.
    Saves report with relocated outputs as "workflow_report.json"
    to "outputs_folder". Maps outputs from "workflow" back to normal
    (from step_id_step_out to workflow output) and filters "job_data"
    based on them (combining items from "job_data" into a list based on
    "outputSource" if it was a list). "cwl_args" can be used to update
    default parameters used for loading and runtime contexts.
    """

    cwl_args = {} if cwl_args is None else cwl_args

    default_cwl_args = get_default_cwl_args(cwl_args)

    workflow_tool = fast_cwl_load(workflow=workflow, cwl_args=default_cwl_args)

    # Filter "job_data" to include only items required by workflow outputs.
    # Remap keys to the proper workflow outputs IDs (without step id).
    # If "outputSource" was a list even of len=1, find all correspondent items
    # from the "job_data" and assign them as list of the same size.
    job_data_copy = deepcopy(job_data)
    filtered_job_data = {}
    for output_id, output_data in get_items(workflow_tool["outputs"]):
        collected_job_items = []
        for source_id, _ in get_items(output_data["outputSource"]):
            collected_job_items.append(
                job_data_copy[source_id.replace("/", "_")])
        if isinstance(output_data["outputSource"], list):
            filtered_job_data[output_id] = collected_job_items
        else:
            filtered_job_data[output_id] = collected_job_items[0]

    runtime_context = RuntimeContext(default_cwl_args)
    relocated_job_data = relocateOutputs(
        outputObj=filtered_job_data,
        destination_path=job_data_copy["outputs_folder"],
        # "source_directories" must include "tmp_folder", otherwise the
        # temporary data can't be removed when the action is "move"
        source_directories=[job_data_copy["tmp_folder"]],
        action=runtime_context.move_outputs,
        fs_access=runtime_context.make_fs_access(""),
        compute_checksum=runtime_context.compute_checksum,
        path_mapper=runtime_context.path_mapper)

    # Dump report with relocated outputs
    workflow_report = os.path.join(job_data_copy["outputs_folder"],
                                   "workflow_report.json")

    dump_json(relocated_job_data, workflow_report)

    return relocated_job_data, workflow_report
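
The filtering loop above is the core of this function: it turns each "step/out" source into the flattened key used in "job_data" and decides whether a workflow output keeps a single value or a list. A small self-contained sketch of that logic with made-up data (the real get_items helper from this package handles more input shapes; a simplified stand-in is used here):

# Simplified stand-in for the get_items helper (illustration only)
def get_items(data):
    if isinstance(data, dict):
        return data.items()
    if isinstance(data, list):
        return [(item, item) for item in data]
    return [(data, data)]

workflow_outputs = {
    "sorted_file": {"outputSource": "sort_step/sorted_file"},
    "all_logs": {"outputSource": ["align_step/log", "sort_step/log"]},
}
job_data = {
    "sort_step_sorted_file": "/tmp/run/sorted.txt",
    "align_step_log": "/tmp/run/align.log",
    "sort_step_log": "/tmp/run/sort.log",
}

filtered_job_data = {}
for output_id, output_data in get_items(workflow_outputs):
    collected = [
        job_data[source_id.replace("/", "_")]
        for source_id, _ in get_items(output_data["outputSource"])
    ]
    if isinstance(output_data["outputSource"], list):
        filtered_job_data[output_id] = collected      # "outputSource" was a list, keep a list
    else:
        filtered_job_data[output_id] = collected[0]   # single source, single value
# filtered_job_data == {"sorted_file": "/tmp/run/sorted.txt",
#                       "all_logs": ["/tmp/run/align.log", "/tmp/run/sort.log"]}
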
Example #3
    def cwl_gather(self, context):
        upstream_task_ids = [t.task_id for t in self.dag.tasks if isinstance(t, CWLStepOperator)] + \
                            ([self.reader_task_id] if self.reader_task_id else [])
        upstream_data = self.xcom_pull(context=context,
                                       task_ids=upstream_task_ids)

        _logger.debug('{0}: xcom_pull data: \n {1}'.format(
            self.task_id, dumps(upstream_data, indent=4)))

        promises = {}
        for data in upstream_data:
            promises = merge(promises, data["promises"])
            if "outdir" in data:
                self.outdir = data["outdir"]

        if "output_folder" in promises:
            self.output_folder = os.path.abspath(promises["output_folder"])
        else:
            return

        _move_job = {out: promises[out] for out in self.outputs}
        _logger.debug(
            '{0}: Final job: \n{1}\nMoving data: \n{2}\nMoving job:{3}'.format(
                self.task_id, dumps(promises, indent=4),
                dumps(self.outputs, indent=4), dumps(_move_job, indent=4)))

        _files_moved = relocateOutputs(_move_job, self.output_folder,
                                       [self.outdir],
                                       self.dag.default_args["move_outputs"],
                                       StdFsAccess(""))
        _job_result = {
            val.split("/")[-1]: _files_moved[out]  # TODO: is split required?
            for out, val in self.outputs.items() if out in _files_moved
        }
        try:
            if self.outdir:
                shutil.rmtree(self.outdir, ignore_errors=False)
            _logger.info('{0}: Deleted temporary output directory {1}'.format(
                self.task_id, self.outdir))
        except Exception as e:
            _logger.error(
                "{0}: Failed to delete temporary output directory: {1}".format(
                    self.task_id, e))
        _logger.info("Job done: {}".format(dumps(_job_result, indent=4)))

        return _job_result, promises
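
The loop above folds the "promises" dictionaries reported by every upstream task into a single mapping and remembers the last reported "outdir". A toy, self-contained illustration of that accumulation (the real merge helper is not shown in this snippet, so a plain shallow dict update is assumed here):

upstream_data = [
    {"promises": {"step_one_out": "/tmp/a/out.txt"}, "outdir": "/tmp/a"},
    {"promises": {"step_two_out": "/tmp/b/out.txt",
                  "output_folder": "/data/out"}},
]

promises = {}
outdir = None
for data in upstream_data:
    promises = {**promises, **data["promises"]}   # stand-in for merge()
    if "outdir" in data:
        outdir = data["outdir"]
# promises now holds every upstream output plus "output_folder",
# and outdir points at the last temporary directory that was reported.
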
Example #4
    def executor(self, tool, job_order, **kwargs):
        final_output = []
        final_status = []

        def output_callback(out, processStatus):
            final_status.append(processStatus)
            final_output.append(out)

        if "basedir" not in kwargs:
            raise WorkflowException("Must provide 'basedir' in kwargs")

        output_dirs = set()

        if kwargs.get("outdir"):
            finaloutdir = os.path.abspath(kwargs.get("outdir"))
        else:
            finaloutdir = None

        if kwargs.get("tmp_outdir_prefix"):
            kwargs["outdir"] = tempfile.mkdtemp(
                prefix=kwargs["tmp_outdir_prefix"])
        else:
            kwargs["outdir"] = tempfile.mkdtemp()

        output_dirs.add(kwargs["outdir"])
        kwargs["mutation_manager"] = MutationManager()

        jobReqs = None
        if "cwl:requirements" in job_order:
            jobReqs = job_order["cwl:requirements"]
        elif ("cwl:defaults" in tool.metadata
              and "cwl:requirements" in tool.metadata["cwl:defaults"]):
            jobReqs = tool.metadata["cwl:defaults"]["cwl:requirements"]
        if jobReqs:
            for req in jobReqs:
                tool.requirements.append(req)

        if kwargs.get("default_container"):
            tool.requirements.insert(
                0, {
                    "class": "DockerRequirement",
                    "dockerPull": kwargs["default_container"]
                })

        jobs = tool.job(job_order, output_callback, **kwargs)
        try:
            for runnable in jobs:
                if runnable:
                    builder = kwargs.get("builder", None)
                    if builder is not None:
                        runnable.builder = builder
                    if runnable.outdir:
                        output_dirs.add(runnable.outdir)
                    runnable.run(**kwargs)
                else:
                    time.sleep(1)

        except WorkflowException:
            raise
        except Exception as e:
            log.error("Got exception: {}".format(e))
            raise WorkflowException(str(e))

        # wait for all processes to finish
        self.wait()

        if final_output and final_output[0] and finaloutdir:
            final_output[0] = relocateOutputs(final_output[0], finaloutdir,
                                              output_dirs,
                                              kwargs.get("move_outputs"),
                                              kwargs["make_fs_access"](""))

        if kwargs.get("rm_tmpdir"):
            cleanIntermediate(output_dirs)

        if final_output and final_status:
            return (final_output[0], final_status[0])
        else:
            return (None, "permanentFail")
Example #5
def relocate_outputs(workflow,
                     job_data,
                     cwl_args=None,
                     remove_tmp_folder=None):
    """
    Relocates filtered outputs to "outputs_folder" and, by default,
    removes tmp_folder, unless "remove_tmp_folder" is set to something
    else. Saves report with relocated outputs as "workflow_report.json"
    to "outputs_folder".
    Maps outputs from "workflow" back to normal (from step_id_step_out
    to workflow output) and filters "job_data" based on them (combining
    items from "job_data" into a list based on "outputSource" if it
    was a list). "cwl_args" can be used to update default parameters
    used for loading and runtime contexts.
    """

    cwl_args = {} if cwl_args is None else cwl_args
    remove_tmp_folder = True if remove_tmp_folder is None else remove_tmp_folder

    default_cwl_args = get_default_cwl_args(cwl_args)

    workflow_tool = fast_cwl_load(workflow=workflow, cwl_args=default_cwl_args)

    # Filter "job_data" to include only items required by workflow outputs.
    # Remap keys to the proper workflow outputs IDs (without step id).
    # If "outputSource" was a list even of len=1, find all correspondent items
    # from the "job_data" and assign them as list of the same size.
    job_data_copy = deepcopy(job_data)
    filtered_job_data = {}
    for output_id, output_data in get_items(workflow_tool["outputs"]):
        collected_job_items = []
        for source_id, _ in get_items(output_data["outputSource"]):
            collected_job_items.append(
                job_data_copy[source_id.replace("/", "_")])
        if isinstance(output_data["outputSource"], list):
            filtered_job_data[output_id] = collected_job_items
        else:
            filtered_job_data[output_id] = collected_job_items[0]

    # Outputs will always be copied, because "source_directories" is empty
    runtime_context = RuntimeContext(default_cwl_args)
    relocated_job_data = relocateOutputs(
        outputObj=filtered_job_data,
        destination_path=job_data_copy["outputs_folder"],
        source_directories=[],  # kept empty on purpose; shouldn't influence anything else
        action=runtime_context.move_outputs,
        fs_access=runtime_context.make_fs_access(""),
        compute_checksum=runtime_context.compute_checksum,
        path_mapper=runtime_context.path_mapper)

    # Dump report with relocated outputs
    workflow_report = os.path.join(job_data_copy["outputs_folder"],
                                   "workflow_report.json")

    dump_json(relocated_job_data, workflow_report)

    # Clean "tmp_folder"
    if remove_tmp_folder:
        shutil.rmtree(job_data_copy["tmp_folder"], ignore_errors=False)

    return relocated_job_data, workflow_report
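
Compared to the earlier variant, this version always copies the outputs (empty "source_directories") and cleans "tmp_folder" itself unless told not to. A hypothetical call, with placeholder paths and job keys that depend entirely on the actual workflow:

# Made-up job data: per-step results keyed as "step_id_step_out" plus the two folders
job_data = {
    "sort_step_sorted_file": {"class": "File", "location": "/tmp/run/sorted.txt"},
    "tmp_folder": "/tmp/run",
    "outputs_folder": "/data/outputs",
}

relocated, report = relocate_outputs(
    workflow="/data/cwl/workflow.cwl",   # assumed workflow location
    job_data=job_data,
    remove_tmp_folder=False,             # keep "tmp_folder" around, e.g. for debugging
)
print(report)                            # /data/outputs/workflow_report.json
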
Example #6
    def executor(self, tool, job_order, runtimeContext, **kwargs):
        """Executor method."""
        final_output = []
        final_status = []

        def output_callback(out, processStatus):
            final_status.append(processStatus)
            final_output.append(out)

        if not runtimeContext.basedir:
            raise WorkflowException('`runtimeContext` should contain a '
                                    '`basedir`')

        output_dirs = set()

        if runtimeContext.outdir:
            finaloutdir = os.path.abspath(runtimeContext.outdir)
        else:
            finaloutdir = None
        if runtimeContext.tmp_outdir_prefix:
            runtimeContext.outdir = tempfile.mkdtemp(
                prefix=runtimeContext.tmp_outdir_prefix)
        else:
            runtimeContext.outdir = tempfile.mkdtemp()

        output_dirs.add(runtimeContext.outdir)
        runtimeContext.mutation_manager = MutationManager()

        jobReqs = None
        if "cwl:requirements" in job_order:
            jobReqs = job_order["cwl:requirements"]
        elif ("cwl:defaults" in tool.metadata
              and "cwl:requirements" in tool.metadata["cwl:defaults"]):
            jobReqs = tool.metadata["cwl:defaults"]["cwl:requirements"]
        if jobReqs:
            for req in jobReqs:
                tool.requirements.append(req)

        if not runtimeContext.default_container:
            runtimeContext.default_container = 'frolvlad/alpine-bash'
        runtimeContext.docker_outdir = os.path.join(runtimeContext.working_dir,
                                                    "cwl/docker_outdir")
        runtimeContext.docker_tmpdir = os.path.join(runtimeContext.working_dir,
                                                    "cwl/docker_tmpdir")
        runtimeContext.docker_stagedir = os.path.join(
            runtimeContext.working_dir, "cwl/docker_stagedir")

        jobs = tool.job(job_order, output_callback, runtimeContext)
        try:
            for runnable in jobs:
                if runnable:
                    if runtimeContext.builder:
                        runnable.builder = runtimeContext.builder
                    if runnable.outdir:
                        output_dirs.add(runnable.outdir)
                    runnable.run(runtimeContext)
                else:
                    # log.error(
                    #     "Workflow cannot make any more progress"
                    # )
                    # break
                    time.sleep(1)

        except WorkflowException:
            traceback.print_exc()
            raise
        except Exception as e:
            traceback.print_exc()
            raise WorkflowException(str(e))

        # wait for all processes to finish
        self.wait()

        if final_output and final_output[0] and finaloutdir:
            final_output[0] = relocateOutputs(
                final_output[0], finaloutdir, output_dirs,
                runtimeContext.move_outputs, runtimeContext.make_fs_access(""))

        if runtimeContext.rm_tmpdir:
            cleanIntermediate(output_dirs)

        if final_output and final_status:
            return str(final_output[0]), str(final_status[0])
        else:
            return None, "permanentFail"