Example #1
 def assign_job_cleanup(self, job_cleanup_task):
     for current_task in self.tasks:
         if isinstance(current_task, JobDispatcher):
             job_cleanup_task.set_upstream(current_task)  # Always connect JobDispatcher -> JobCleanup
         elif isinstance(current_task, CWLStepOperator):
             current_task_outputs_id = [shortname(current_task_output["id"])
                                        for current_task_output in current_task.cwl_step.tool["outputs"]]
             workflow_outputs_outputsource = [shortname(workflow_output["outputSource"])
                                              for workflow_output in self.cwlwf.tool["outputs"]]
             if any(i in current_task_outputs_id for i in workflow_outputs_outputsource):
                 job_cleanup_task.set_upstream(current_task)
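
All of these snippets lean on a shortname helper that is not shown. Judging by the calls above, where shortname(...) is often followed by .split("/")[-1], it has to keep the step path of a CWL id, i.e. return the full URI fragment. A minimal sketch under that assumption (the real helper lives in the project's utilities):

    def shortname(cwl_id):
        # "file:///tmp/wf.cwl#main/align/bam" -> "main/align/bam"
        # Illustrative only; a hedged guess at the project helper's behavior.
        return cwl_id.split("#")[-1]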
Example #2
 def assign_job_dispatcher(self, task):
     for current_task in self.tasks:
         if isinstance(current_task, (JobDispatcher, JobCleanup)):
             continue
         current_task_input_sources = [shortname(source)
                                       for source in flatten([current_task_input["source"]
                                                              for current_task_input in current_task.cwl_step.tool["inputs"]
                                                              if "source" in current_task_input])]
         workflow_input_id = [shortname(workflow_input["id"]) for workflow_input in self.cwlwf.tool["inputs"]]
         # We should also check whether current_task is a root task: if all of its
         # parameters have defaults and none of its inputs are read from the job file,
         # but it still feeds the workflow output (directly or through other tasks
         # that have no connection to JobDispatcher either), it may lose 'outdir',
         # because only the JobDispatcher task sets that value
         if any(i in current_task_input_sources for i in workflow_input_id) or not current_task.upstream_list:
             current_task.set_upstream(task)
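
flatten is another assumed helper: since a step input's "source" may be a string or a list of strings, the comprehension above can yield nested lists. A minimal recursive sketch that would satisfy this call site:

    def flatten(items):
        # [["a"], "b", ["c", ["d"]]] -> ["a", "b", "c", "d"]
        result = []
        for item in items:
            if isinstance(item, list):
                result.extend(flatten(item))
            else:
                result.append(item)
        return result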
Example #3
 def create(self):
     outputs = {}
     for step in self.cwlwf.steps:
         cwl_task = CWLStepOperator(cwl_step=step, dag=self)
         outputs[shortname(step.tool["id"])] = cwl_task
         for out in step.tool["outputs"]:
             outputs[shortname(out["id"])] = cwl_task
     for step in self.cwlwf.steps:
         current_task = outputs[shortname(step.tool["id"])]
         for inp in step.tool["inputs"]:
             sources = inp.get("source", "")
             step_input_sources = sources if isinstance(sources, list) else [sources]
             for source in step_input_sources:
                 parent_task = outputs.get(shortname(source), None)
                 if parent_task and parent_task not in current_task.upstream_list:
                     current_task.set_upstream(parent_task)
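
The two-pass pattern here (first index every step and step output by id, then wire each consumer to the task that produces its source) is independent of Airflow. A self-contained toy version with made-up step names:

    # Pass 1: map every step name and output id to the producing step
    steps = {
        "align": {"inputs": [],                  "outputs": ["align/bam"]},
        "sort":  {"inputs": ["align/bam"],       "outputs": ["sort/sorted_bam"]},
        "index": {"inputs": ["sort/sorted_bam"], "outputs": ["index/bai"]},
    }
    producers = {}
    for name, step in steps.items():
        producers[name] = name
        for out in step["outputs"]:
            producers[out] = name

    # Pass 2: add one edge per consumer/producer pair, skipping duplicates
    edges = set()
    for name, step in steps.items():
        for source in step["inputs"]:
            parent = producers.get(source)
            if parent and (parent, name) not in edges:
                edges.add((parent, name))

    print(sorted(edges))  # [('align', 'sort'), ('sort', 'index')]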
Example #4
    def execute(self, context):
        initialized_job_order_object = init_job_order(self.dag.default_args["job_data"]["content"],
                                                      Namespace(),
                                                      self.dag.cwlwf,
                                                      Loader(jobloaderctx.copy()),
                                                      sys.stdout)

        updated_job_order_object = {}
        for inp in self.dag.cwlwf.tool["inputs"]:
            inp_id = shortname(inp["id"])
            job_key = inp_id.split("/")[-1]
            if job_key in initialized_job_order_object:
                updated_job_order_object[inp_id] = initialized_job_order_object[job_key]

        updated_job_order_object["tmp_folder"] = tempfile.mkdtemp(dir=self.dag.default_args["job_data"]["content"].get("tmp_folder", DEFAULT_TMP_FOLDER), prefix="dag_tmp_")
        updated_job_order_object["output_folder"] = self.dag.default_args["job_data"]["content"]["output_folder"]
        logging.info("Dispatch job\n{}".format(json.dumps(updated_job_order_object, indent=4)))
        return {"outputs": updated_job_order_object}
Example #5
    def execute(self, context):
        logging.info('Running tool: \n{}'.format(
            json.dumps(self.cwl_step.tool, indent=4)))
        collected_outputs = {}
        for task_outputs in self.xcom_pull(
                context=context,
                task_ids=[task.task_id for task in self.upstream_list]):
            collected_outputs = merge(collected_outputs,
                                      task_outputs["outputs"])
        logging.debug('Collected outputs:\n{}'.format(
            json.dumps(collected_outputs, indent=4)))

        jobobj = {}

        for inp in self.cwl_step.tool["inputs"]:
            jobobj_id = shortname(inp["id"]).split("/")[-1]
            source_ids = []
            promises_outputs = []
            try:
                source_ids = ([shortname(source) for source in inp["source"]]
                              if isinstance(inp["source"], list)
                              else [shortname(inp["source"])])
                promises_outputs = [
                    collected_outputs[source_id] for source_id in source_ids
                    if source_id in collected_outputs
                ]
            except KeyError:  # the step input has no "source" field
                logging.info(
                    "Couldn't find source field in the step input: \n{}".format(
                        json.dumps(inp, indent=4)))
            logging.info(
                'For input {} with sources: \n{} \nfound upstream outputs: \n{}'
                .format(jobobj_id, source_ids, promises_outputs))
            if len(promises_outputs) > 1:
                if inp.get("linkMerge", "merge_nested") == "merge_flattened":
                    jobobj[jobobj_id] = flatten(promises_outputs)
                else:
                    jobobj[jobobj_id] = promises_outputs
            elif len(promises_outputs) == 1 and (
                    promises_outputs[0] is not None
            ):  # a [None] result falls through here so that the default value below is used instead
                jobobj[jobobj_id] = promises_outputs[0]
            elif "valueFrom" in inp:
                jobobj[jobobj_id] = None
            elif "default" in inp:
                d = copy.copy(inp["default"])
                jobobj[jobobj_id] = d
            else:
                continue

        logging.info('Collected job object: \n{}'.format(
            json.dumps(jobobj, indent=4)))

        valueFrom = {
            shortname(i["id"]).split("/")[-1]: i["valueFrom"]
            for i in self.cwl_step.tool["inputs"] if "valueFrom" in i
        }

        logging.info('Inputs with valueFrom: \n{}'.format(
            json.dumps(valueFrom, indent=4)))

        def postScatterEval(shortio):
            def valueFromFunc(k, v):
                if k in valueFrom:
                    return cwltool.workflow.expression.do_eval(
                        valueFrom[k],
                        shortio,
                        self.dag.requirements,
                        None,
                        None, {},
                        context=v)
                else:
                    return v

            return {k: valueFromFunc(k, v) for k, v in shortio.items()}

        job = postScatterEval(jobobj)
        logging.info(
            'Collected job object after valueFrom evaluation: \n{}'.format(
                json.dumps(job, indent=4)))
        # TODO: scatter functionality may also need to be added here

        kwargs = self.dag.default_args.copy()  # shallow copy so the DAG's shared default_args are not mutated
        tmp_folder = collected_outputs["tmp_folder"]
        output_folder = collected_outputs["output_folder"]
        kwargs['outdir'] = tempfile.mkdtemp(dir=tmp_folder, prefix="step_tmp_")
        kwargs['tmpdir_prefix'] = os.path.join(tmp_folder, "cwl_tmp_")
        kwargs['tmp_outdir_prefix'] = os.path.join(tmp_folder,
                                                   "cwl_outdir_tmp_")
        kwargs['rm_tmpdir'] = False
        kwargs["basedir"] = os.path.abspath(
            os.path.dirname(self.dag.default_args["job_data"]["path"]))

        logger = logging.getLogger("cwltool")
        sys.stdout = StreamLogWriterUpdated(logger, logging.INFO)
        sys.stderr = StreamLogWriterUpdated(logger, logging.WARN)

        executor = cwltool.executors.SingleJobExecutor()
        runtimeContext = RuntimeContext(kwargs)
        runtimeContext.make_fs_access = getdefault(
            runtimeContext.make_fs_access, cwltool.stdfsaccess.StdFsAccess)

        for inp in self.cwl_step.tool["inputs"]:
            if inp.get("not_connected"):
                del job[shortname(inp["id"].split("/")[-1])]

        (output, status) = executor(self.cwl_step.embedded_tool,
                                    job,
                                    runtimeContext,
                                    logger=logger)

        if not output and status == "permanentFail":
            raise ValueError("Tool execution failed with permanentFail status")

        logging.debug('Embedded tool outputs: \n{}'.format(
            json.dumps(output, indent=4)))

        promises = {}
        for out in self.cwl_step.tool["outputs"]:
            out_id = shortname(out["id"])
            jobout_id = out_id.split("/")[-1]
            try:
                promises[out_id] = output[jobout_id]
            except KeyError:  # the tool produced no value for this output
                continue

        promises["tmp_folder"] = tmp_folder
        promises["output_folder"] = output_folder
        data = {"outputs": promises}

        logging.info('Outputs: \n{}'.format(json.dumps(data, indent=4)))

        return data
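
merge is the last undefined helper in this snippet. Given how it is folded over the upstream XCom payloads, a shallow dict merge matches the call site; whether the real helper also merges nested dicts recursively is an open assumption here:

    def merge(left, right):
        # Later upstream tasks win on key clashes (assumed behavior)
        merged = dict(left)
        merged.update(right)
        return merged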
Example #6
 def __init__(self, cwl_step, *args, **kwargs):
     self.cwl_step = cwl_step
     # Use the explicit class name: super(self.__class__, ...) would recurse
     # infinitely if this operator were ever subclassed
     super(CWLStepOperator, self).__init__(
         task_id=shortname(cwl_step.tool["id"]).split("/")[-1],
         *args,
         **kwargs)
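
Usage follows Example #3: the operator derives its task_id from the step id, so a hypothetical step whose id is "file:///tmp/wf.cwl#main/align" would produce an Airflow task named "align":

    cwl_task = CWLStepOperator(cwl_step=step, dag=dag)  # cwl_task.task_id == "align"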
Example #7
 def get_output_list(self):
     # Map each workflow output's outputSource onto its output id
     outputs = {}
     for out in self.cwlwf.tool["outputs"]:
         outputs[shortname(out["outputSource"])] = shortname(out["id"])
     return outputs
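
For a workflow with a single output whose id is "file:///wf.cwl#main/result" and whose outputSource is "file:///wf.cwl#main/align/bam", and assuming the shortname sketched under Example #1, the returned mapping would be:

    {"main/align/bam": "main/result"}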