Example #1
    def run(self, fileStore):
        cwljob = resolve_indirect(self.cwljob)

        if isinstance(self.step.tool["scatter"], string_types):
            scatter = [self.step.tool["scatter"]]
        else:
            scatter = self.step.tool["scatter"]

        scatterMethod = self.step.tool.get("scatterMethod", None)
        if len(scatter) == 1:
            scatterMethod = "dotproduct"
        outputs = []

        valueFrom = {shortname(i["id"]): i["valueFrom"] for i in self.step.tool["inputs"] if "valueFrom" in i}
        def postScatterEval(io):
            shortio = {shortname(k): v for k, v in iteritems(io)}
            for k in valueFrom:
                io.setdefault(k, None)
            def valueFromFunc(k, v):
                if k in valueFrom:
                    return cwltool.expression.do_eval(
                            valueFrom[k], shortio, self.step.requirements,
                            None, None, {}, context=v)
                else:
                    return v
            return {k: valueFromFunc(k, v) for k,v in list(io.items())}

        if scatterMethod == "dotproduct":
            for i in range(0, len(cwljob[shortname(scatter[0])])):
                copyjob = copy.copy(cwljob)
                for sc in [shortname(x) for x in scatter]:
                    copyjob[sc] = cwljob[sc][i]
                copyjob = postScatterEval(copyjob)
                (subjob, followOn) = makeJob(self.step.embedded_tool, copyjob, **self.executor_options)
                self.addChild(subjob)
                outputs.append(followOn.rv())
        elif scatterMethod == "nested_crossproduct":
            outputs = self.nested_crossproduct_scatter(cwljob, scatter, postScatterEval)
        elif scatterMethod == "flat_crossproduct":
            self.flat_crossproduct_scatter(cwljob, scatter, outputs, postScatterEval)
        else:
            if scatterMethod:
                raise validate.ValidationException(
                    "Unsupported complex scatter type '%s'" % scatterMethod)
            else:
                raise validate.ValidationException(
                    "Must provide scatterMethod to scatter over multiple inputs")

        return outputs
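
The three accepted scatterMethod values above differ only in how the per-input lists are paired into jobs. A minimal standalone sketch of that pairing, in plain Python for illustration (itertools and the sample lists are assumptions for the demo, not part of the original code):

    import itertools

    xs = ["a", "b"]
    ys = [1, 2]  # dotproduct requires the scattered lists to be the same length

    # dotproduct: pair the i-th element of every scattered input -> 2 jobs.
    dot = list(zip(xs, ys))                      # [('a', 1), ('b', 2)]

    # flat_crossproduct: every combination, flattened into one list -> 4 jobs.
    flat = list(itertools.product(xs, ys))       # [('a', 1), ('a', 2), ('b', 1), ('b', 2)]

    # nested_crossproduct: every combination, outputs nested per outer input.
    nested = [[(x, y) for y in ys] for x in xs]  # [[('a', 1), ('a', 2)], [('b', 1), ('b', 2)]]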
Example #2
    def job(self, joborder, output_callback, runtimeContext):
        runtimeContext = runtimeContext.copy()
        runtimeContext.toplevel = True  # Preserve behavior for #13365

        builder = make_builder({shortname(k): v for k,v in viewitems(joborder)}, self.hints, self.requirements, runtimeContext)
        runtimeContext = set_cluster_target(self.tool, self.arvrunner, builder, runtimeContext)
        return super(ArvadosWorkflowStep, self).job(joborder, output_callback, runtimeContext)
Example #3
def to_script(ctx, path, job_path, **kwds):
    if schema_salad is None:
        raise Exception("This functionality requires schema_salad and Python 2.7.")

    if cwltool is None:
        raise Exception("This functionality requires cwltool and Python 2.7.")

    uri = "file://" + os.path.abspath(job_path)

    loader = schema_salad.ref_resolver.Loader({
        "@base": uri,
        "path": {
            "@type": "@id"
        }
    })

    job, _ = loader.resolve_ref(uri)

    t = load_tool(path, False, False, cwltool.workflow.defaultMakeTool, True)

    if type(t) == int:
        return t

    process.checkRequirements(t.tool, cwl2script.supportedProcessRequirements)

    for inp in t.tool["inputs"]:
        if process.shortname(inp["id"]) in job:
            pass
        elif process.shortname(inp["id"]) not in job and "default" in inp:
            job[process.shortname(inp["id"])] = copy.copy(inp["default"])
        elif process.shortname(inp["id"]) not in job and inp["type"][0] == "null":
            pass
        else:
            raise Exception("Missing inputs `%s`" % process.shortname(inp["id"]))

    if not kwds.get("basedir", None):
        kwds["basedir"] = os.path.dirname(os.path.abspath(job_path))

    outdir = kwds.get("outdir")

    if t.tool["class"] == "Workflow":
        print(cwl2script.generateScriptForWorkflow(t, job, outdir))
    elif t.tool["class"] == "CommandLineTool":
        print(cwl2script.generateScriptForTool(t, job, outdir))

    return 0
Example #4
    def pipeline_component_spec(self):
        """Return a component that Workbench and a-r-p-i will understand.

        Specifically, translate CWL input specs to Arvados pipeline
        format, like {"dataclass":"File","value":"xyz"}.
        """
        spec = self.job.arvados_job_spec()

        # Most of the component spec is exactly the same as the job
        # spec (script, script_version, etc.).
        # spec['script_parameters'] isn't right, though. A component
        # spec's script_parameters hash is a translation of
        # self.tool.tool['inputs'] with defaults/overrides taken from
        # the job order. So we move the job parameters out of the way
        # and build a new spec['script_parameters'].
        job_params = spec['script_parameters']
        spec['script_parameters'] = {}

        for param in self.tool.tool['inputs']:
            param = copy.deepcopy(param)

            # Data type and "required" flag...
            types = param['type']
            if not isinstance(types, list):
                types = [types]
            param['required'] = 'null' not in types
            non_null_types = set(types) - set(['null'])
            if len(non_null_types) == 1:
                the_type = [c for c in non_null_types][0]
                dataclass = self.type_to_dataclass.get(the_type)
                if dataclass:
                    param['dataclass'] = dataclass
            # Note: If we didn't figure out a single appropriate
            # dataclass, we just left that attribute out.  We leave
            # the "type" attribute there in any case, which might help
            # downstream.

            # Title and description...
            title = param.pop('label', '')
            descr = param.pop('doc', '').rstrip('\n')
            if title:
                param['title'] = title
            if descr:
                param['description'] = descr

            # Fill in the value from the current job order, if any.
            param_id = shortname(param.pop('id'))
            value = job_params.get(param_id)
            if value is None:
                pass
            elif not isinstance(value, dict):
                param['value'] = value
            elif param.get('dataclass') == 'File' and value.get('location'):
                param['value'] = value['location']

            spec['script_parameters'][param_id] = param
        spec['script_parameters']['cwl:tool'] = job_params['cwl:tool']
        return spec
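
A hedged, self-contained sketch of the type handling in the loop above. The mapping below is a stand-in for the class's type_to_dataclass attribute (its real contents are not shown in this snippet), and the sample param is made up:

    type_to_dataclass = {"File": "File", "int": "number", "string": "text"}  # assumed mapping

    param = {"id": "#main/reads", "type": ["null", "File"]}
    types = param["type"] if isinstance(param["type"], list) else [param["type"]]
    param["required"] = "null" not in types          # False: a 'null' alternative makes it optional
    non_null_types = set(types) - {"null"}
    if len(non_null_types) == 1:
        dataclass = type_to_dataclass.get(next(iter(non_null_types)))
        if dataclass:
            param["dataclass"] = dataclass           # "File"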
Example #5
 def postScatterEval(io):
     shortio = {shortname(k): v for k, v in iteritems(io)}
     def valueFromFunc(k, v):
         if k in valueFrom:
             return cwltool.expression.do_eval(
                     valueFrom[k], shortio, self.step.requirements,
                     None, None, {}, context=v)
         else:
             return v
     return {k: valueFromFunc(k, v) for k,v in io.items()}
Example #6
    def run(self, fileStore):
        cwljob = resolve_indirect(self.cwljob)

        if isinstance(self.step.tool["scatter"], basestring):
            scatter = [self.step.tool["scatter"]]
        else:
            scatter = self.step.tool["scatter"]

        scatterMethod = self.step.tool.get("scatterMethod", None)
        if len(scatter) == 1:
            scatterMethod = "dotproduct"
        outputs = []

        self.vfinputs = cwljob

        shortscatter = [shortname(s) for s in scatter]
        cwljob = {k: self.valueFromFunc(k, v) if k not in shortscatter else v
                    for k,v in cwljob.items()}

        if scatterMethod == "dotproduct":
            for i in xrange(0, len(cwljob[shortname(scatter[0])])):
                copyjob = copy.copy(cwljob)
                for sc in scatter:
                    scatter_key = shortname(sc)
                    copyjob[scatter_key] = self.valueFromFunc(scatter_key, cwljob[scatter_key][i])
                (subjob, followOn) = makeJob(self.step.embedded_tool, copyjob, **self.executor_options)
                self.addChild(subjob)
                outputs.append(followOn.rv())
        elif scatterMethod == "nested_crossproduct":
            outputs = self.nested_crossproduct_scatter(cwljob, scatter)
        elif scatterMethod == "flat_crossproduct":
            self.flat_crossproduct_scatter(cwljob, scatter, outputs)
        else:
            if scatterMethod:
                raise validate.ValidationException(
                    "Unsupported complex scatter type '%s'" % scatterMethod)
            else:
                raise validate.ValidationException(
                    "Must provide scatterMethod to scatter over multiple inputs")

        return outputs
Example #7
 def flat_crossproduct_scatter(self, joborder, scatter_keys, outputs):
     scatter_key = shortname(scatter_keys[0])
     l = len(joborder[scatter_key])
     for n in xrange(0, l):
         jo = copy.copy(joborder)
         jo[scatter_key] = self.valueFromFunc(scatter_key, joborder[scatter_key][n])
         if len(scatter_keys) == 1:
             (subjob, followOn) = makeJob(self.step.embedded_tool, jo, **self.executor_options)
             self.addChild(subjob)
             outputs.append(followOn.rv())
         else:
             self.flat_crossproduct_scatter(jo, scatter_keys[1:], outputs)
Example #8
 def nested_crossproduct_scatter(self, joborder, scatter_keys):
     scatter_key = shortname(scatter_keys[0])
     l = len(joborder[scatter_key])
     outputs = []
     for n in xrange(0, l):
         jo = copy.copy(joborder)
         jo[scatter_key] = self.valueFromFunc(scatter_key, joborder[scatter_key][n])
         if len(scatter_keys) == 1:
             (subjob, followOn) = makeJob(self.step.embedded_tool, jo)
             self.addChild(subjob)
             outputs.append(followOn.rv())
         else:
             outputs.append(self.nested_crossproduct_scatter(jo, scatter_keys[1:]))
     return outputs
Example #9
 def nested_crossproduct_scatter(self, joborder, scatter_keys, postScatterEval):
     scatter_key = shortname(scatter_keys[0])
     l = len(joborder[scatter_key])
     outputs = []
     for n in xrange(0, l):
         jo = copy.copy(joborder)
         jo[scatter_key] = joborder[scatter_key][n]
         if len(scatter_keys) == 1:
             jo = postScatterEval(jo)
             (subjob, followOn) = makeJob(self.step.embedded_tool, jo, **self.executor_options)
             self.addChild(subjob)
             outputs.append(followOn.rv())
         else:
             outputs.append(self.nested_crossproduct_scatter(jo, scatter_keys[1:], postScatterEval))
     return outputs
Example #10
def upload_workflow(arvRunner, tool, job_order, project_uuid, uuid=None,
                    submit_runner_ram=0, name=None, merged_map=None):

    packed = packed_workflow(arvRunner, tool, merged_map)

    adjustDirObjs(job_order, trim_listing)
    adjustFileObjs(job_order, trim_anonymous_location)
    adjustDirObjs(job_order, trim_anonymous_location)

    main = [p for p in packed["$graph"] if p["id"] == "#main"][0]
    for inp in main["inputs"]:
        sn = shortname(inp["id"])
        if sn in job_order:
            inp["default"] = job_order[sn]

    if not name:
        name = tool.tool.get("label", os.path.basename(tool.tool["id"]))

    upload_dependencies(arvRunner, name, tool.doc_loader,
                        packed, tool.tool["id"], False)

    if submit_runner_ram:
        hints = main.get("hints", [])
        found = False
        for h in hints:
            if h["class"] == "http://arvados.org/cwl#WorkflowRunnerResources":
                h["ramMin"] = submit_runner_ram
                found = True
                break
        if not found:
            hints.append({"class": "http://arvados.org/cwl#WorkflowRunnerResources",
                          "ramMin": submit_runner_ram})
        main["hints"] = hints

    body = {
        "workflow": {
            "name": name,
            "description": tool.tool.get("doc", ""),
            "definition":json.dumps(packed, sort_keys=True, indent=4, separators=(',',': '))
        }}
    if project_uuid:
        body["workflow"]["owner_uuid"] = project_uuid

    if uuid:
        call = arvRunner.api.workflows().update(uuid=uuid, body=body)
    else:
        call = arvRunner.api.workflows().create(body=body)
    return call.execute(num_retries=arvRunner.num_retries)["uuid"]
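
A minimal sketch of the default-injection step near the top of upload_workflow, using a made-up packed graph and job order (shapes inferred from the code above, not real cwltool output):

    packed = {"$graph": [{"id": "#main",
                          "inputs": [{"id": "#main/threads", "type": "int"}]}]}
    job_order = {"threads": 8}

    main = [p for p in packed["$graph"] if p["id"] == "#main"][0]
    for inp in main["inputs"]:
        sn = inp["id"].split("#")[-1].split("/")[-1]   # same idea as shortname()
        if sn in job_order:
            inp["default"] = job_order[sn]             # "threads" now defaults to 8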
Example #11
 def upload_tool_deps(deptool):
     if "id" in deptool:
         discovered_secondaryfiles = {}
         pm = upload_dependencies(arvrunner,
                                  "%s dependencies" % (shortname(deptool["id"])),
                                  document_loader,
                                  deptool,
                                  deptool["id"],
                                  False,
                                  include_primary=False,
                                  discovered_secondaryfiles=discovered_secondaryfiles)
         document_loader.idx[deptool["id"]] = deptool
         toolmap = {}
         for k,v in pm.items():
             toolmap[k] = v.resolved
         merged_map[deptool["id"]] = FileUpdates(toolmap, discovered_secondaryfiles)
Example #12
        def postScatterEval(io):
            shortio = {shortname(k): v for k, v in iteritems(io)}
            for k in valueFrom:
                io.setdefault(k, None)

            def valueFromFunc(k, v):
                if k in valueFrom:
                    return cwltool.expression.do_eval(valueFrom[k],
                                                      shortio,
                                                      self.step.requirements,
                                                      None,
                                                      None, {},
                                                      context=v)
                else:
                    return v

            return {k: valueFromFunc(k, v) for k, v in list(io.items())}
Example #13
 def nested_crossproduct_scatter(self, joborder, scatter_keys):
     scatter_key = shortname(scatter_keys[0])
     l = len(joborder[scatter_key])
     outputs = []
     for n in xrange(0, l):
         jo = copy.copy(joborder)
         jo[scatter_key] = self.valueFromFunc(scatter_key,
                                              joborder[scatter_key][n])
         if len(scatter_keys) == 1:
             (subjob, followOn) = makeJob(self.step.embedded_tool, jo,
                                          **self.executor_options)
             self.addChild(subjob)
             outputs.append(followOn.rv())
         else:
             outputs.append(
                 self.nested_crossproduct_scatter(jo, scatter_keys[1:]))
     return outputs
Example #14
 def nested_crossproduct_scatter(self, joborder, scatter_keys,
                                 postScatterEval):
     scatter_key = shortname(scatter_keys[0])
     outputs = []
     for n in range(0, len(joborder[scatter_key])):
         jo = copy.copy(joborder)
         jo[scatter_key] = joborder[scatter_key][n]
         if len(scatter_keys) == 1:
             jo = postScatterEval(jo)
             (subjob, followOn) = makeJob(self.step.embedded_tool, jo,
                                          **self.executor_options)
             self.addChild(subjob)
             outputs.append(followOn.rv())
         else:
             outputs.append(
                 self.nested_crossproduct_scatter(jo, scatter_keys[1:],
                                                  postScatterEval))
     return outputs
Example #15
 def upload_tool_deps(deptool):
     if "id" in deptool:
         discovered_secondaryfiles = {}
         pm = upload_dependencies(
             arvrunner,
             "%s dependencies" % (shortname(deptool["id"])),
             document_loader,
             deptool,
             deptool["id"],
             False,
             include_primary=False,
             discovered_secondaryfiles=discovered_secondaryfiles)
         document_loader.idx[deptool["id"]] = deptool
         toolmap = {}
         for k, v in pm.items():
             toolmap[k] = v.resolved
         merged_map[deptool["id"]] = FileUpdates(toolmap,
                                                 discovered_secondaryfiles)
Example #16
def upload_workflow(arvRunner,
                    tool,
                    job_order,
                    project_uuid,
                    uuid=None,
                    submit_runner_ram=0,
                    name=None):

    packed = packed_workflow(arvRunner, tool)

    adjustDirObjs(job_order, trim_listing)
    adjustFileObjs(job_order, trim_anonymous_location)
    adjustDirObjs(job_order, trim_anonymous_location)

    main = [p for p in packed["$graph"] if p["id"] == "#main"][0]
    for inp in main["inputs"]:
        sn = shortname(inp["id"])
        if sn in job_order:
            inp["default"] = job_order[sn]

    if not name:
        name = tool.tool.get("label", os.path.basename(tool.tool["id"]))

    upload_dependencies(arvRunner, name, tool.doc_loader, packed,
                        tool.tool["id"], False)

    # TODO nowhere for submit_runner_ram to go.

    body = {
        "workflow": {
            "name": name,
            "description": tool.tool.get("doc", ""),
            "definition": yaml.round_trip_dump(packed)
        }
    }
    if project_uuid:
        body["workflow"]["owner_uuid"] = project_uuid

    if uuid:
        call = arvRunner.api.workflows().update(uuid=uuid, body=body)
    else:
        call = arvRunner.api.workflows().create(body=body)
    return call.execute(num_retries=arvRunner.num_retries)["uuid"]
Example #17
    def run(self, *args, **kwargs):
        job_spec = self.arvados_job_spec(*args, **kwargs)

        for k, v in job_spec["script_parameters"].items():
            if v is False or v is None or isinstance(v, dict):
                job_spec["script_parameters"][k] = {"value": v}

        self.arvrunner.pipeline = self.arvrunner.api.pipeline_instances(
        ).create(
            body={
                "owner_uuid": self.arvrunner.project_uuid,
                "name": shortname(self.tool.tool["id"]),
                "components": {
                    "cwl-runner": job_spec
                },
                "state": "RunningOnServer"
            }).execute(num_retries=self.arvrunner.num_retries)
        logger.info("Created pipeline %s", self.arvrunner.pipeline["uuid"])

        if kwargs.get("wait") is False:
            self.uuid = self.arvrunner.pipeline["uuid"]
            return

        job = None
        while not job:
            time.sleep(2)
            self.arvrunner.pipeline = self.arvrunner.api.pipeline_instances(
            ).get(uuid=self.arvrunner.pipeline["uuid"]).execute(
                num_retries=self.arvrunner.num_retries)
            job = self.arvrunner.pipeline["components"]["cwl-runner"].get(
                "job")
            if not job and self.arvrunner.pipeline[
                    "state"] != "RunningOnServer":
                raise WorkflowException("Submitted pipeline is %s" %
                                        (self.arvrunner.pipeline["state"]))

        self.uuid = job["uuid"]
        self.arvrunner.processes[self.uuid] = self

        if job["state"] in ("Complete", "Failed", "Cancelled"):
            self.done(job)
Example #18
 def __init__(self,
              builder,           # type: Builder
              joborder,          # type: Dict[Text, Union[Dict[Text, Any], List, Text, None]]
              requirements,      # type: List[Dict[Text, Text]]
              hints,             # type: List[Dict[Text, Text]]
              name,              # type: Text
              wps_process,       # type: WpsProcessInterface
              expected_outputs,  # type: List[CWL_ExpectedOutputs]
              ):                 # type: (...) -> None
     super(WpsWorkflowJob, self).__init__(builder, joborder, None, requirements, hints, name)
     self.wps_process = wps_process
     self.expected_outputs = {}  # type: Dict[str, str]  # {id: file-pattern}
     for output in expected_outputs:
         # TODO Should we support something else?
         if is_cwl_file_type(output):
             # Expecting output to look like this
             # output = {"id": "file:///tmp/random_path/process_name#output_id,
             #           "type": "File",
             #           "outputBinding": {"glob": output_name }
             #          }
             output_id = shortname(output["id"])
             self.expected_outputs[output_id] = output["outputBinding"]["glob"]
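
Every example on this page leans on shortname() from cwltool.process, which reduces a fully qualified CWL id (like the output id in the comment above) to its last fragment segment. A minimal approximation of it, for reference (the real implementation lives in cwltool.process and may differ in edge cases):

    from urllib.parse import urlparse

    def shortname(inputid):
        # Keep only the last path segment of the id's fragment
        # (or of the path, when there is no fragment).
        parsed = urlparse(inputid)
        fragment = parsed.fragment or parsed.path
        return fragment.split("/")[-1]

    shortname("file:///tmp/random_path/process_name#output_id")  # -> "output_id"
    shortname("#main/threads")                                   # -> "threads"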
Example #19
def upload_workflow(arvRunner, tool, job_order, project_uuid, uuid=None,
                    submit_runner_ram=0, name=None):

    packed = packed_workflow(arvRunner, tool)

    adjustDirObjs(job_order, trim_listing)
    adjustFileObjs(job_order, trim_anonymous_location)
    adjustDirObjs(job_order, trim_anonymous_location)

    main = [p for p in packed["$graph"] if p["id"] == "#main"][0]
    for inp in main["inputs"]:
        sn = shortname(inp["id"])
        if sn in job_order:
            inp["default"] = job_order[sn]

    if not name:
        name = tool.tool.get("label", os.path.basename(tool.tool["id"]))

    upload_dependencies(arvRunner, name, tool.doc_loader,
                        packed, tool.tool["id"], False)

    # TODO nowhere for submit_runner_ram to go.

    body = {
        "workflow": {
            "name": name,
            "description": tool.tool.get("doc", ""),
            "definition":yaml.round_trip_dump(packed)
        }}
    if project_uuid:
        body["workflow"]["owner_uuid"] = project_uuid

    if uuid:
        call = arvRunner.api.workflows().update(uuid=uuid, body=body)
    else:
        call = arvRunner.api.workflows().create(body=body)
    return call.execute(num_retries=arvRunner.num_retries)["uuid"]
Example #20
    def run(self, *args, **kwargs):
        job_spec = self.arvados_job_spec(*args, **kwargs)
        job_spec.setdefault("owner_uuid", self.arvrunner.project_uuid)

        response = self.arvrunner.api.jobs().create(
            body=job_spec,
            find_or_create=self.enable_reuse
        ).execute(num_retries=self.arvrunner.num_retries)

        self.uuid = response["uuid"]
        self.arvrunner.processes[self.uuid] = self

        logger.info("Submitted job %s", response["uuid"])

        if kwargs.get("submit"):
            self.pipeline = self.arvrunner.api.pipeline_instances().create(
                body={
                    "owner_uuid": self.arvrunner.project_uuid,
                    "name": shortname(self.tool.tool["id"]),
                    "components": {"cwl-runner": {"job": {"uuid": self.uuid, "state": response["state"]} } },
                    "state": "RunningOnClient"}).execute(num_retries=self.arvrunner.num_retries)

        if response["state"] in ("Complete", "Failed", "Cancelled"):
            self.done(response)
Example #21
    def arv_executor(self, updated_tool, job_order, runtimeContext, logger=None):
        self.debug = runtimeContext.debug

        updated_tool.visit(self.check_features)

        self.project_uuid = runtimeContext.project_uuid
        self.pipeline = None
        self.fs_access = runtimeContext.make_fs_access(runtimeContext.basedir)
        self.secret_store = runtimeContext.secret_store

        self.trash_intermediate = runtimeContext.trash_intermediate
        if self.trash_intermediate and self.work_api != "containers":
            raise Exception("--trash-intermediate is only supported with --api=containers.")

        self.intermediate_output_ttl = runtimeContext.intermediate_output_ttl
        if self.intermediate_output_ttl and self.work_api != "containers":
            raise Exception("--intermediate-output-ttl is only supported with --api=containers.")
        if self.intermediate_output_ttl < 0:
            raise Exception("Invalid value %d for --intermediate-output-ttl, cannot be less than zero" % self.intermediate_output_ttl)

        if runtimeContext.submit_request_uuid and self.work_api != "containers":
            raise Exception("--submit-request-uuid requires containers API, but using '{}' api".format(self.work_api))

        if not runtimeContext.name:
            runtimeContext.name = self.name = updated_tool.tool.get("label") or updated_tool.metadata.get("label") or os.path.basename(updated_tool.tool["id"])

        # Upload local file references in the job order.
        job_order = upload_job_order(self, "%s input" % runtimeContext.name,
                                     updated_tool, job_order)

        # the last clause means: if it is a command line tool, and we
        # are going to wait for the result, and always_submit_runner
        # is false, then we don't submit a runner process.

        submitting = (runtimeContext.update_workflow or
                      runtimeContext.create_workflow or
                      (runtimeContext.submit and not
                       (updated_tool.tool["class"] == "CommandLineTool" and
                        runtimeContext.wait and
                        not runtimeContext.always_submit_runner)))

        loadingContext = self.loadingContext.copy()
        loadingContext.do_validate = False
        loadingContext.do_update = False
        if submitting:
            # Document may have been auto-updated. Reload the original
            # document with updating disabled because we want to
            # submit the document with its original CWL version, not
            # the auto-updated one.
            tool = load_tool(updated_tool.tool["id"], loadingContext)
        else:
            tool = updated_tool

        # Upload direct dependencies of workflow steps, get back mapping of files to keep references.
        # Also uploads docker images.
        merged_map = upload_workflow_deps(self, tool)

        # Recreate process object (ArvadosWorkflow or
        # ArvadosCommandTool) because tool document may have been
        # updated by upload_workflow_deps in ways that modify
        # inheritance of hints or requirements.
        loadingContext.loader = tool.doc_loader
        loadingContext.avsc_names = tool.doc_schema
        loadingContext.metadata = tool.metadata
        tool = load_tool(tool.tool, loadingContext)

        existing_uuid = runtimeContext.update_workflow
        if existing_uuid or runtimeContext.create_workflow:
            # Create a pipeline template or workflow record and exit.
            if self.work_api == "containers":
                return (upload_workflow(self, tool, job_order,
                                        self.project_uuid,
                                        uuid=existing_uuid,
                                        submit_runner_ram=runtimeContext.submit_runner_ram,
                                        name=runtimeContext.name,
                                        merged_map=merged_map),
                        "success")

        self.apply_reqs(job_order, tool)

        self.ignore_docker_for_reuse = runtimeContext.ignore_docker_for_reuse
        self.eval_timeout = runtimeContext.eval_timeout

        runtimeContext = runtimeContext.copy()
        runtimeContext.use_container = True
        runtimeContext.tmpdir_prefix = "tmp"
        runtimeContext.work_api = self.work_api

        if self.work_api == "containers":
            if self.ignore_docker_for_reuse:
                raise Exception("--ignore-docker-for-reuse not supported with containers API.")
            runtimeContext.outdir = "/var/spool/cwl"
            runtimeContext.docker_outdir = "/var/spool/cwl"
            runtimeContext.tmpdir = "/tmp"
            runtimeContext.docker_tmpdir = "/tmp"

        if runtimeContext.priority < 1 or runtimeContext.priority > 1000:
            raise Exception("--priority must be in the range 1..1000.")

        if self.should_estimate_cache_size:
            visited = set()
            estimated_size = [0]
            def estimate_collection_cache(obj):
                if obj.get("location", "").startswith("keep:"):
                    m = pdh_size.match(obj["location"][5:])
                    if m and m.group(1) not in visited:
                        visited.add(m.group(1))
                        estimated_size[0] += int(m.group(2))
            visit_class(job_order, ("File", "Directory"), estimate_collection_cache)
            runtimeContext.collection_cache_size = max(((estimated_size[0]*192) // (1024*1024))+1, 256)
            self.collection_cache.set_cap(runtimeContext.collection_cache_size*1024*1024)

        logger.info("Using collection cache size %s MiB", runtimeContext.collection_cache_size)

        runnerjob = None
        if runtimeContext.submit:
            # Submit a runner job to run the workflow for us.
            if self.work_api == "containers":
                if submitting:
                    tool = RunnerContainer(self, updated_tool,
                                           tool, loadingContext, runtimeContext.enable_reuse,
                                           self.output_name,
                                           self.output_tags,
                                           submit_runner_ram=runtimeContext.submit_runner_ram,
                                           name=runtimeContext.name,
                                           on_error=runtimeContext.on_error,
                                           submit_runner_image=runtimeContext.submit_runner_image,
                                           intermediate_output_ttl=runtimeContext.intermediate_output_ttl,
                                           merged_map=merged_map,
                                           priority=runtimeContext.priority,
                                           secret_store=self.secret_store,
                                           collection_cache_size=runtimeContext.collection_cache_size,
                                           collection_cache_is_default=self.should_estimate_cache_size)
                else:
                    runtimeContext.runnerjob = tool.tool["id"]

        if runtimeContext.cwl_runner_job is not None:
            self.uuid = runtimeContext.cwl_runner_job.get('uuid')

        jobiter = tool.job(job_order,
                           self.output_callback,
                           runtimeContext)

        if runtimeContext.submit and not runtimeContext.wait:
            runnerjob = next(jobiter)
            runnerjob.run(runtimeContext)
            return (runnerjob.uuid, "success")

        current_container = arvados_cwl.util.get_current_container(self.api, self.num_retries, logger)
        if current_container:
            logger.info("Running inside container %s", current_container.get("uuid"))

        self.poll_api = arvados.api('v1', timeout=runtimeContext.http_timeout)
        self.polling_thread = threading.Thread(target=self.poll_states)
        self.polling_thread.start()

        self.task_queue = TaskQueue(self.workflow_eval_lock, self.thread_count)

        try:
            self.workflow_eval_lock.acquire()

            # Holds the lock while this code runs and releases it when
            # it is safe to do so in self.workflow_eval_lock.wait(),
            # at which point on_message can update job state and
            # process output callbacks.

            loopperf = Perf(metrics, "jobiter")
            loopperf.__enter__()
            for runnable in jobiter:
                loopperf.__exit__()

                if self.stop_polling.is_set():
                    break

                if self.task_queue.error is not None:
                    raise self.task_queue.error

                if runnable:
                    with Perf(metrics, "run"):
                        self.start_run(runnable, runtimeContext)
                else:
                    if (self.task_queue.in_flight + len(self.processes)) > 0:
                        self.workflow_eval_lock.wait(3)
                    else:
                        logger.error("Workflow is deadlocked, no runnable processes and not waiting on any pending processes.")
                        break

                if self.stop_polling.is_set():
                    break

                loopperf.__enter__()
            loopperf.__exit__()

            while (self.task_queue.in_flight + len(self.processes)) > 0:
                if self.task_queue.error is not None:
                    raise self.task_queue.error
                self.workflow_eval_lock.wait(3)

        except UnsupportedRequirement:
            raise
        except:
            if sys.exc_info()[0] is KeyboardInterrupt or sys.exc_info()[0] is SystemExit:
                logger.error("Interrupted, workflow will be cancelled")
            elif isinstance(sys.exc_info()[1], WorkflowException):
                logger.error("Workflow execution failed:\n%s", sys.exc_info()[1], exc_info=(sys.exc_info()[1] if self.debug else False))
            else:
                logger.exception("Workflow execution failed")

            if self.pipeline:
                self.api.pipeline_instances().update(uuid=self.pipeline["uuid"],
                                                     body={"state": "Failed"}).execute(num_retries=self.num_retries)

            if self.work_api == "containers" and not current_container:
                # Not running in a crunch container, so cancel any outstanding processes.
                for p in self.processes:
                    try:
                        self.api.container_requests().update(uuid=p,
                                                             body={"priority": "0"}
                        ).execute(num_retries=self.num_retries)
                    except Exception:
                        pass
        finally:
            self.workflow_eval_lock.release()
            self.task_queue.drain()
            self.stop_polling.set()
            self.polling_thread.join()
            self.task_queue.join()

        if self.final_status == "UnsupportedRequirement":
            raise UnsupportedRequirement("Check log for details.")

        if self.final_output is None:
            raise WorkflowException("Workflow did not return a result.")

        if runtimeContext.submit and isinstance(tool, Runner):
            logger.info("Final output collection %s", tool.final_output)
        else:
            if self.output_name is None:
                self.output_name = "Output of %s" % (shortname(tool.tool["id"]))
            if self.output_tags is None:
                self.output_tags = ""

            storage_classes = runtimeContext.storage_classes.strip().split(",")
            self.final_output, self.final_output_collection = self.make_output_collection(self.output_name, storage_classes, self.output_tags, self.final_output)
            self.set_crunch_output()

        if runtimeContext.compute_checksum:
            adjustDirObjs(self.final_output, partial(get_listing, self.fs_access))
            adjustFileObjs(self.final_output, partial(compute_checksums, self.fs_access))

        if self.trash_intermediate and self.final_status == "success":
            self.trash_intermediate_output()

        return (self.final_output, self.final_status)
Example #22
def main(args=None):
    parser = argparse.ArgumentParser()
    parser.add_argument("cwltool", type=str)
    parser.add_argument("cwljob", type=str)

    parser.add_argument("--conformance-test", action="store_true")
    parser.add_argument("--no-container", action="store_true")
    parser.add_argument("--basedir", type=str)
    parser.add_argument("--outdir", type=str, default=os.getcwd())

    options = parser.parse_args(args)

    uri = "file://" + os.path.abspath(options.cwljob)

    if options.conformance_test:
        loader = schema_salad.ref_resolver.Loader({})
    else:
        loader = schema_salad.ref_resolver.Loader({"@base": uri, "path": {"@type": "@id"}})

    job, _ = loader.resolve_ref(uri)
    print "> job looks like ", job

    print "> working with ", options.cwltool
    # returns a workflow/tool object
    t = cwltool.main.load_tool(options.cwltool, False, False, cwltool.workflow.defaultMakeTool, True)

    # print t

    # print "> Need ", supportedProcessRequirements, " for tool ", t

    # check the requirements of the tool, skip for now
    if type(t) == int:
        return t

    try:
        # checkRequirements(t.tool, supportedProcessRequirements)
        print "skipping requirement checking ..."
    except Exception as e:
        logging.error(e)
        return 33

    # ----------------- assign inputs to jobs ------------------------------ #
    for inp in t.tool["inputs"]:
        if shortname(inp["id"]) in job:
            pass
        elif shortname(inp["id"]) not in job and "default" in inp:
            job[shortname(inp["id"])] = copy.copy(inp["default"])
        elif shortname(inp["id"]) not in job and inp["type"][0] == "null":
            pass
        else:
            raise Exception("Missing inputs `%s`" % shortname(inp["id"]))
    print "> job looks like ", job

    if options.conformance_test:
        sys.stdout.write(
            json.dumps(
                cwltool.main.single_job_executor(t, job, options.basedir, options, conformance_test=True), indent=4
            )
        )
        return 0

    if not options.basedir:
        options.basedir = os.path.dirname(os.path.abspath(options.cwljob))

    outdir = options.outdir

    print "> the cwl file is for a ", t.tool["class"]
    if t.tool["class"] == "Workflow":
        print generateScriptForWorkflow(t, job, outdir)
    elif t.tool["class"] == "CommandLineTool":
        print generateScriptForTool(t, job, outdir)

    return 0
Example #23
    def pipeline_component_spec(self):
        """Return a component that Workbench and a-r-p-i will understand.

        Specifically, translate CWL input specs to Arvados pipeline
        format, like {"dataclass":"File","value":"xyz"}.
        """

        spec = self.job.arvados_job_spec()

        # Most of the component spec is exactly the same as the job
        # spec (script, script_version, etc.).
        # spec['script_parameters'] isn't right, though. A component
        # spec's script_parameters hash is a translation of
        # self.tool.tool['inputs'] with defaults/overrides taken from
        # the job order. So we move the job parameters out of the way
        # and build a new spec['script_parameters'].
        job_params = spec['script_parameters']
        spec['script_parameters'] = {}

        for param in self.embedded_tool.tool['inputs']:
            param = copy.deepcopy(param)

            # Data type and "required" flag...
            types = param['type']
            if not isinstance(types, list):
                types = [types]
            param['required'] = 'null' not in types
            non_null_types = [t for t in types if t != "null"]
            if len(non_null_types) == 1:
                the_type = [c for c in non_null_types][0]
                dataclass = None
                if isinstance(the_type, basestring):
                    dataclass = self.type_to_dataclass.get(the_type)
                if dataclass:
                    param['dataclass'] = dataclass
            # Note: If we didn't figure out a single appropriate
            # dataclass, we just left that attribute out.  We leave
            # the "type" attribute there in any case, which might help
            # downstream.

            # Title and description...
            title = param.pop('label', '')
            descr = param.pop('doc', '').rstrip('\n')
            if title:
                param['title'] = title
            if descr:
                param['description'] = descr

            # Fill in the value from the current job order, if any.
            param_id = shortname(param.pop('id'))
            value = job_params.get(param_id)
            if value is None:
                pass
            elif not isinstance(value, dict):
                param['value'] = value
            elif param.get('dataclass') in (
                    'File', 'Collection') and value.get('location'):
                param['value'] = value['location'][5:]

            spec['script_parameters'][param_id] = param
        spec['script_parameters']['cwl:tool'] = job_params['cwl:tool']
        return spec
Example #24
    def run(self, fileStore):
        cwljob = resolve_indirect(self.cwljob)

        if isinstance(self.step.tool["scatter"], string_types):
            scatter = [self.step.tool["scatter"]]
        else:
            scatter = self.step.tool["scatter"]

        scatterMethod = self.step.tool.get("scatterMethod", None)
        if len(scatter) == 1:
            scatterMethod = "dotproduct"
        outputs = []

        valueFrom = {
            shortname(i["id"]): i["valueFrom"]
            for i in self.step.tool["inputs"] if "valueFrom" in i
        }

        def postScatterEval(io):
            shortio = {shortname(k): v for k, v in iteritems(io)}
            for k in valueFrom:
                io.setdefault(k, None)

            def valueFromFunc(k, v):
                if k in valueFrom:
                    return cwltool.expression.do_eval(valueFrom[k],
                                                      shortio,
                                                      self.step.requirements,
                                                      None,
                                                      None, {},
                                                      context=v)
                else:
                    return v

            return {k: valueFromFunc(k, v) for k, v in list(io.items())}

        if scatterMethod == "dotproduct":
            for i in range(0, len(cwljob[shortname(scatter[0])])):
                copyjob = copy.copy(cwljob)
                for sc in [shortname(x) for x in scatter]:
                    copyjob[sc] = cwljob[sc][i]
                copyjob = postScatterEval(copyjob)
                (subjob, followOn) = makeJob(self.step.embedded_tool, copyjob,
                                             **self.executor_options)
                self.addChild(subjob)
                outputs.append(followOn.rv())
        elif scatterMethod == "nested_crossproduct":
            outputs = self.nested_crossproduct_scatter(cwljob, scatter,
                                                       postScatterEval)
        elif scatterMethod == "flat_crossproduct":
            self.flat_crossproduct_scatter(cwljob, scatter, outputs,
                                           postScatterEval)
        else:
            if scatterMethod:
                raise validate.ValidationException(
                    "Unsupported complex scatter type '%s'" % scatterMethod)
            else:
                raise validate.ValidationException(
                    "Must provide scatterMethod to scatter over multiple inputs"
                )

        return outputs
Example #25
    def job(self, joborder, output_callback, **kwargs):
        kwargs["work_api"] = self.work_api
        req, _ = self.get_requirement("http://arvados.org/cwl#RunInSingleContainer")
        if req:
            with SourceLine(self.tool, None, WorkflowException, logger.isEnabledFor(logging.DEBUG)):
                if "id" not in self.tool:
                    raise WorkflowException("%s object must have 'id'" % (self.tool["class"]))
            document_loader, workflowobj, uri = (self.doc_loader, self.doc_loader.fetch(self.tool["id"]), self.tool["id"])

            discover_secondary_files(self.tool["inputs"], joborder)

            with Perf(metrics, "subworkflow upload_deps"):
                upload_dependencies(self.arvrunner,
                                    os.path.basename(joborder.get("id", "#")),
                                    document_loader,
                                    joborder,
                                    joborder.get("id", "#"),
                                    False)

                if self.wf_pdh is None:
                    workflowobj["requirements"] = dedup_reqs(self.requirements)
                    workflowobj["hints"] = dedup_reqs(self.hints)

                    packed = pack(document_loader, workflowobj, uri, self.metadata)

                    upload_dependencies(self.arvrunner,
                                        kwargs.get("name", ""),
                                        document_loader,
                                        packed,
                                        uri,
                                        False)

            with Perf(metrics, "subworkflow adjust"):
                joborder_resolved = copy.deepcopy(joborder)
                joborder_keepmount = copy.deepcopy(joborder)

                reffiles = []
                visit_class(joborder_keepmount, ("File", "Directory"), lambda x: reffiles.append(x))

                mapper = ArvPathMapper(self.arvrunner, reffiles, kwargs["basedir"],
                                 "/keep/%s",
                                 "/keep/%s/%s",
                                 **kwargs)

                def keepmount(obj):
                    remove_redundant_fields(obj)
                    with SourceLine(obj, None, WorkflowException, logger.isEnabledFor(logging.DEBUG)):
                        if "location" not in obj:
                            raise WorkflowException("%s object is missing required 'location' field: %s" % (obj["class"], obj))
                    with SourceLine(obj, "location", WorkflowException, logger.isEnabledFor(logging.DEBUG)):
                        if obj["location"].startswith("keep:"):
                            obj["location"] = mapper.mapper(obj["location"]).target
                            if "listing" in obj:
                                del obj["listing"]
                        elif obj["location"].startswith("_:"):
                            del obj["location"]
                        else:
                            raise WorkflowException("Location is not a keep reference or a literal: '%s'" % obj["location"])

                visit_class(joborder_keepmount, ("File", "Directory"), keepmount)

                def resolved(obj):
                    if obj["location"].startswith("keep:"):
                        obj["location"] = mapper.mapper(obj["location"]).resolved

                visit_class(joborder_resolved, ("File", "Directory"), resolved)

                if self.wf_pdh is None:
                    adjustFileObjs(packed, keepmount)
                    adjustDirObjs(packed, keepmount)
                    self.wf_pdh = upload_workflow_collection(self.arvrunner, shortname(self.tool["id"]), packed)

            wf_runner = cmap({
                "class": "CommandLineTool",
                "baseCommand": "cwltool",
                "inputs": self.tool["inputs"],
                "outputs": self.tool["outputs"],
                "stdout": "cwl.output.json",
                "requirements": self.requirements+[
                    {
                    "class": "InitialWorkDirRequirement",
                    "listing": [{
                            "entryname": "workflow.cwl",
                            "entry": {
                                "class": "File",
                                "location": "keep:%s/workflow.cwl" % self.wf_pdh
                            }
                        }, {
                            "entryname": "cwl.input.yml",
                            "entry": json.dumps(joborder_keepmount, indent=2, sort_keys=True, separators=(',',': ')).replace("\\", "\\\\").replace('$(', '\$(').replace('${', '\${')
                        }]
                }],
                "hints": self.hints,
                "arguments": ["--no-container", "--move-outputs", "--preserve-entire-environment", "workflow.cwl#main", "cwl.input.yml"],
                "id": "#"
            })
            kwargs["loader"] = self.doc_loader
            kwargs["avsc_names"] = self.doc_schema
            return ArvadosCommandTool(self.arvrunner, wf_runner, **kwargs).job(joborder_resolved, output_callback, **kwargs)
        else:
            return super(ArvadosWorkflow, self).job(joborder, output_callback, **kwargs)
Example #26
def discover_secondary_files(inputs, job_order, discovered=None):
    for t in inputs:
        if shortname(t["id"]) in job_order and t.get("secondaryFiles"):
            setSecondary(t, job_order[shortname(t["id"])], discovered)
Example #27
    def arvExecutor(self, tool, job_order, input_basedir, args, **kwargs):
        events = arvados.events.subscribe(arvados.api('v1'), [["object_uuid", "is_a", "arvados#job"]], self.on_message)

        try:
            self.api.collections().get(uuid=crunchrunner_pdh).execute()
        except arvados.errors.ApiError as e:
            import httplib2
            h = httplib2.Http(ca_certs=arvados.util.ca_certs_path())
            resp, content = h.request(crunchrunner_download, "GET")
            resp2, content2 = h.request(certs_download, "GET")
            with arvados.collection.Collection() as col:
                with col.open("crunchrunner", "w") as f:
                    f.write(content)
                with col.open("ca-certificates.crt", "w") as f:
                    f.write(content2)

                col.save_new("crunchrunner binary", ensure_unique_name=True)

        self.fs_access = CollectionFsAccess(input_basedir)

        kwargs["fs_access"] = self.fs_access
        kwargs["enable_reuse"] = args.enable_reuse

        kwargs["outdir"] = "$(task.outdir)"
        kwargs["tmpdir"] = "$(task.tmpdir)"

        if kwargs.get("conformance_test"):
            return cwltool.main.single_job_executor(tool, job_order, input_basedir, args, **kwargs)
        else:
            self.pipeline = self.api.pipeline_instances().create(body={"name": shortname(tool.tool["id"]),
                                                                   "components": {},
                                                                   "state": "RunningOnClient"}).execute(num_retries=self.num_retries)

            jobiter = tool.job(job_order,
                               input_basedir,
                               self.output_callback,
                               docker_outdir="$(task.outdir)",
                               **kwargs)

            try:
                for runnable in jobiter:
                    if runnable:
                        with self.lock:
                            runnable.run(**kwargs)
                    else:
                        if self.jobs:
                            try:
                                self.cond.acquire()
                                self.cond.wait(1)
                            except RuntimeError:
                                pass
                            finally:
                                self.cond.release()
                        else:
                            logger.error("Workflow cannot make any more progress.")
                            break

                while self.jobs:
                    try:
                        self.cond.acquire()
                        self.cond.wait(1)
                    except RuntimeError:
                        pass
                    finally:
                        self.cond.release()

                events.close()

                if self.final_output is None:
                    raise cwltool.workflow.WorkflowException("Workflow did not return a result.")

            except:
                if sys.exc_info()[0] is not KeyboardInterrupt:
                    logger.exception("Caught unhandled exception, marking pipeline as failed")
                self.api.pipeline_instances().update(uuid=self.pipeline["uuid"],
                                                     body={"state": "Failed"}).execute(num_retries=self.num_retries)

            return self.final_output
Example #28
    def arv_executor(self, tool, job_order, runtimeContext, logger=None):
        self.debug = runtimeContext.debug

        tool.visit(self.check_features)

        self.project_uuid = runtimeContext.project_uuid
        self.pipeline = None
        self.fs_access = runtimeContext.make_fs_access(runtimeContext.basedir)
        self.secret_store = runtimeContext.secret_store

        self.trash_intermediate = runtimeContext.trash_intermediate
        if self.trash_intermediate and self.work_api != "containers":
            raise Exception("--trash-intermediate is only supported with --api=containers.")

        self.intermediate_output_ttl = runtimeContext.intermediate_output_ttl
        if self.intermediate_output_ttl and self.work_api != "containers":
            raise Exception("--intermediate-output-ttl is only supported with --api=containers.")
        if self.intermediate_output_ttl < 0:
            raise Exception("Invalid value %d for --intermediate-output-ttl, cannot be less than zero" % self.intermediate_output_ttl)

        if runtimeContext.submit_request_uuid and self.work_api != "containers":
            raise Exception("--submit-request-uuid requires containers API, but using '{}' api".format(self.work_api))

        if not runtimeContext.name:
            runtimeContext.name = self.name = tool.tool.get("label") or tool.metadata.get("label") or os.path.basename(tool.tool["id"])

        # Upload direct dependencies of workflow steps, get back mapping of files to keep references.
        # Also uploads docker images.
        merged_map = upload_workflow_deps(self, tool)

        # Reload tool object which may have been updated by
        # upload_workflow_deps
        # Don't validate this time because it will just print redundant errors.
        loadingContext = self.loadingContext.copy()
        loadingContext.loader = tool.doc_loader
        loadingContext.avsc_names = tool.doc_schema
        loadingContext.metadata = tool.metadata
        loadingContext.do_validate = False

        tool = self.arv_make_tool(tool.doc_loader.idx[tool.tool["id"]],
                                  loadingContext)

        # Upload local file references in the job order.
        job_order = upload_job_order(self, "%s input" % runtimeContext.name,
                                     tool, job_order)

        existing_uuid = runtimeContext.update_workflow
        if existing_uuid or runtimeContext.create_workflow:
            # Create a pipeline template or workflow record and exit.
            if self.work_api == "jobs":
                tmpl = RunnerTemplate(self, tool, job_order,
                                      runtimeContext.enable_reuse,
                                      uuid=existing_uuid,
                                      submit_runner_ram=runtimeContext.submit_runner_ram,
                                      name=runtimeContext.name,
                                      merged_map=merged_map,
                                      loadingContext=loadingContext)
                tmpl.save()
                # cwltool.main will write our return value to stdout.
                return (tmpl.uuid, "success")
            elif self.work_api == "containers":
                return (upload_workflow(self, tool, job_order,
                                        self.project_uuid,
                                        uuid=existing_uuid,
                                        submit_runner_ram=runtimeContext.submit_runner_ram,
                                        name=runtimeContext.name,
                                        merged_map=merged_map),
                        "success")

        self.ignore_docker_for_reuse = runtimeContext.ignore_docker_for_reuse
        self.eval_timeout = runtimeContext.eval_timeout

        runtimeContext = runtimeContext.copy()
        runtimeContext.use_container = True
        runtimeContext.tmpdir_prefix = "tmp"
        runtimeContext.work_api = self.work_api

        if self.work_api == "containers":
            if self.ignore_docker_for_reuse:
                raise Exception("--ignore-docker-for-reuse not supported with containers API.")
            runtimeContext.outdir = "/var/spool/cwl"
            runtimeContext.docker_outdir = "/var/spool/cwl"
            runtimeContext.tmpdir = "/tmp"
            runtimeContext.docker_tmpdir = "/tmp"
        elif self.work_api == "jobs":
            if runtimeContext.priority != DEFAULT_PRIORITY:
                raise Exception("--priority not implemented for jobs API.")
            runtimeContext.outdir = "$(task.outdir)"
            runtimeContext.docker_outdir = "$(task.outdir)"
            runtimeContext.tmpdir = "$(task.tmpdir)"

        if runtimeContext.priority < 1 or runtimeContext.priority > 1000:
            raise Exception("--priority must be in the range 1..1000.")

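        # Estimate how large a collection cache the workflow will need by
        # scanning keep: references in the job order and summing the manifest
        # sizes encoded in each portable data hash, then scale by 192 and
        # convert to MiB with a floor of 256 MiB.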
        if self.should_estimate_cache_size:
            visited = set()
            estimated_size = [0]
            def estimate_collection_cache(obj):
                if obj.get("location", "").startswith("keep:"):
                    m = pdh_size.match(obj["location"][5:])
                    if m and m.group(1) not in visited:
                        visited.add(m.group(1))
                        estimated_size[0] += int(m.group(2))
            visit_class(job_order, ("File", "Directory"), estimate_collection_cache)
            runtimeContext.collection_cache_size = max(((estimated_size[0]*192) // (1024*1024))+1, 256)
            self.collection_cache.set_cap(runtimeContext.collection_cache_size*1024*1024)

        logger.info("Using collection cache size %s MiB", runtimeContext.collection_cache_size)

        runnerjob = None
        if runtimeContext.submit:
            # Submit a runner job to run the workflow for us.
            if self.work_api == "containers":
                if tool.tool["class"] == "CommandLineTool" and runtimeContext.wait and (not runtimeContext.always_submit_runner):
                    runtimeContext.runnerjob = tool.tool["id"]
                else:
                    tool = RunnerContainer(self, tool, loadingContext, runtimeContext.enable_reuse,
                                                self.output_name,
                                                self.output_tags,
                                                submit_runner_ram=runtimeContext.submit_runner_ram,
                                                name=runtimeContext.name,
                                                on_error=runtimeContext.on_error,
                                                submit_runner_image=runtimeContext.submit_runner_image,
                                                intermediate_output_ttl=runtimeContext.intermediate_output_ttl,
                                                merged_map=merged_map,
                                                priority=runtimeContext.priority,
                                                secret_store=self.secret_store,
                                                collection_cache_size=runtimeContext.collection_cache_size,
                                                collection_cache_is_default=self.should_estimate_cache_size)
            elif self.work_api == "jobs":
                tool = RunnerJob(self, tool, loadingContext, runtimeContext.enable_reuse,
                                      self.output_name,
                                      self.output_tags,
                                      submit_runner_ram=runtimeContext.submit_runner_ram,
                                      name=runtimeContext.name,
                                      on_error=runtimeContext.on_error,
                                      submit_runner_image=runtimeContext.submit_runner_image,
                                      merged_map=merged_map)
        elif runtimeContext.cwl_runner_job is None and self.work_api == "jobs":
            # Create pipeline for local run
            self.pipeline = self.api.pipeline_instances().create(
                body={
                    "owner_uuid": self.project_uuid,
                    "name": runtimeContext.name if runtimeContext.name else shortname(tool.tool["id"]),
                    "components": {},
                    "state": "RunningOnClient"}).execute(num_retries=self.num_retries)
            logger.info("Pipeline instance %s", self.pipeline["uuid"])

        if runtimeContext.cwl_runner_job is not None:
            self.uuid = runtimeContext.cwl_runner_job.get('uuid')

        jobiter = tool.job(job_order,
                           self.output_callback,
                           runtimeContext)

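        # Fire and forget: in submit-without-wait mode, start the runner job
        # and return its UUID immediately instead of tracking it to completion.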
        if runtimeContext.submit and not runtimeContext.wait:
            runnerjob = next(jobiter)
            runnerjob.run(runtimeContext)
            return (runnerjob.uuid, "success")

        current_container = arvados_cwl.util.get_current_container(self.api, self.num_retries, logger)
        if current_container:
            logger.info("Running inside container %s", current_container.get("uuid"))

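        # Start a background thread that polls the API server for state
        # changes on submitted containers/jobs.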
        self.poll_api = arvados.api('v1', timeout=runtimeContext.http_timeout)
        self.polling_thread = threading.Thread(target=self.poll_states)
        self.polling_thread.start()

        self.task_queue = TaskQueue(self.workflow_eval_lock, self.thread_count)

        try:
            self.workflow_eval_lock.acquire()

            # Holds the lock while this code runs and releases it when
            # it is safe to do so in self.workflow_eval_lock.wait(),
            # at which point on_message can update job state and
            # process output callbacks.

            loopperf = Perf(metrics, "jobiter")
            loopperf.__enter__()
            for runnable in jobiter:
                loopperf.__exit__()

                if self.stop_polling.is_set():
                    break

                if self.task_queue.error is not None:
                    raise self.task_queue.error

                if runnable:
                    with Perf(metrics, "run"):
                        self.start_run(runnable, runtimeContext)
                else:
                    if (self.task_queue.in_flight + len(self.processes)) > 0:
                        self.workflow_eval_lock.wait(3)
                    else:
                        logger.error("Workflow is deadlocked, no runnable processes and not waiting on any pending processes.")
                        break

                if self.stop_polling.is_set():
                    break

                loopperf.__enter__()
            loopperf.__exit__()

            while (self.task_queue.in_flight + len(self.processes)) > 0:
                if self.task_queue.error is not None:
                    raise self.task_queue.error
                self.workflow_eval_lock.wait(3)

        except UnsupportedRequirement:
            raise
        except:
            if sys.exc_info()[0] is KeyboardInterrupt or sys.exc_info()[0] is SystemExit:
                logger.error("Interrupted, workflow will be cancelled")
            else:
                logger.error("Execution failed:\n%s", sys.exc_info()[1], exc_info=(sys.exc_info()[1] if self.debug else False))
            if self.pipeline:
                self.api.pipeline_instances().update(uuid=self.pipeline["uuid"],
                                                     body={"state": "Failed"}).execute(num_retries=self.num_retries)
            if runtimeContext.submit and isinstance(tool, Runner):
                runnerjob = tool
                if runnerjob.uuid and self.work_api == "containers":
                    self.api.container_requests().update(uuid=runnerjob.uuid,
                                                         body={"priority": "0"}).execute(num_retries=self.num_retries)
        finally:
            self.workflow_eval_lock.release()
            self.task_queue.drain()
            self.stop_polling.set()
            self.polling_thread.join()
            self.task_queue.join()

        if self.final_status == "UnsupportedRequirement":
            raise UnsupportedRequirement("Check log for details.")

        if self.final_output is None:
            raise WorkflowException("Workflow did not return a result.")

        if runtimeContext.submit and isinstance(tool, Runner):
            logger.info("Final output collection %s", tool.final_output)
        else:
            if self.output_name is None:
                self.output_name = "Output of %s" % (shortname(tool.tool["id"]))
            if self.output_tags is None:
                self.output_tags = ""

            storage_classes = runtimeContext.storage_classes.strip().split(",")
            self.final_output, self.final_output_collection = self.make_output_collection(self.output_name, storage_classes, self.output_tags, self.final_output)
            self.set_crunch_output()

        if runtimeContext.compute_checksum:
            adjustDirObjs(self.final_output, partial(get_listing, self.fs_access))
            adjustFileObjs(self.final_output, partial(compute_checksums, self.fs_access))

        if self.trash_intermediate and self.final_status == "success":
            self.trash_intermediate_output()

        return (self.final_output, self.final_status)
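
Note: the collection-cache heuristic above relies on a module-level pdh_size regex that this excerpt does not show. Below is a minimal sketch of what it might look like, assuming the standard Arvados portable data hash form (an MD5 hex digest, a "+", and the manifest size in bytes); the name and pattern are reconstructions, not verified project source:

import re

# Hypothetical reconstruction of pdh_size: group(1) is the MD5 digest,
# group(2) is the manifest size in bytes that the cache estimate sums.
pdh_size = re.compile(r"([0-9a-f]{32})\+(\d+)")

m = pdh_size.match("99999999999999999999999999999999+2557")
assert m is not None and m.group(2) == "2557"
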
Example #29
def main(args=None, stdout=sys.stdout):
    parser = argparse.ArgumentParser()
    Job.Runner.addToilOptions(parser)
    parser.add_argument("cwltool", type=str)
    parser.add_argument("cwljob", nargs=argparse.REMAINDER)

    # Will override the "jobStore" positional argument, enables
    # user to select jobStore or get a default from logic one below.
    parser.add_argument("--jobStore", type=str)
    parser.add_argument("--not-strict", action="store_true")
    parser.add_argument("--no-container", action="store_true")
    parser.add_argument("--quiet",
                        dest="logLevel",
                        action="store_const",
                        const="ERROR")
    parser.add_argument("--basedir", type=str)
    parser.add_argument("--outdir", type=str, default=os.getcwd())
    parser.add_argument("--version", action='version', version=baseVersion)
    parser.add_argument("--preserve-environment",
                        type=str,
                        nargs='+',
                        help="Preserve specified environment variables when running CommandLineTools",
                        metavar=("VAR1 VAR2"),
                        default=("PATH",),
                        dest="preserve_environment")

    # mkdtemp actually creates the directory, but
    # toil requires that the directory not exist,
    # so make it and delete it and allow
    # toil to create it again (!)
    workdir = tempfile.mkdtemp()
    os.rmdir(workdir)

    if args is None:
        args = sys.argv[1:]

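    # Prepend the scratch directory so it is consumed as the positional
    # jobStore argument expected by Toil.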
    options = parser.parse_args([workdir] + args)

    use_container = not options.no_container

    setLoggingFromOptions(options)
    if options.logLevel:
        cwllogger.setLevel(options.logLevel)

    outdir = os.path.abspath(options.outdir)
    fileindex = {}
    existing = {}

    with Toil(options) as toil:
        if options.restart:
            outobj = toil.restart()
        else:
            useStrict = not options.not_strict
            try:
                t = cwltool.load_tool.load_tool(
                    options.cwltool,
                    toilMakeTool,
                    kwargs={
                        "hints": [{
                            "class": "ResourceRequirement",
                            "coresMin": toil.config.defaultCores,
                            "ramMin": toil.config.defaultMemory / (2**20),
                            "outdirMin": toil.config.defaultDisk / (2**20),
                            "tmpdirMin": 0
                        }]
                    },
                    resolver=cwltool.resolver.tool_resolver,
                    strict=useStrict)
                unsupportedRequirementsCheck(t.requirements)
            except cwltool.process.UnsupportedRequirement as e:
                logging.error(e)
                return 33

            if type(t) == int:
                return t

            options.workflow = options.cwltool
            options.job_order = options.cwljob
            options.tool_help = None
            options.debug = options.logLevel == "DEBUG"
            job = cwltool.main.load_job_order(options, t, sys.stdin)

            if type(job) == int:
                return job

            job, options.basedir = job

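            # Fill in any input values the job order omitted, using the
            # defaults declared on the tool's inputs.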
            fillInDefaults(t.tool["inputs"], job)

            def pathToLoc(p):
                if "location" not in p and "path" in p:
                    p["location"] = p["path"]
                    del p["path"]

            def importFiles(tool):
                visit_class(tool, ("File", "Directory"), pathToLoc)
                normalizeFilesDirs(tool)
                adjustDirObjs(
                    tool,
                    functools.partial(get_listing,
                                      cwltool.stdfsaccess.StdFsAccess(""),
                                      recursive=True))
                adjustFileObjs(
                    tool,
                    functools.partial(uploadFile,
                                      toil.importFile,
                                      fileindex,
                                      existing,
                                      skip_broken=True))

            t.visit(importFiles)

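            # Synthesize secondaryFiles entries for input Files from the
            # patterns declared on each input parameter.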
            for inp in t.tool["inputs"]:

                def setSecondary(fileobj):
                    if isinstance(fileobj, dict) and fileobj.get("class") == "File":
                        if "secondaryFiles" not in fileobj:
                            fileobj["secondaryFiles"] = [{
                                "location": cwltool.builder.substitute(fileobj["location"], sf),
                                "class": "File"
                            } for sf in inp["secondaryFiles"]]

                    if isinstance(fileobj, list):
                        for e in fileobj:
                            setSecondary(e)

                if shortname(inp["id"]) in job and inp.get("secondaryFiles"):
                    setSecondary(job[shortname(inp["id"])])

            importFiles(job)
            visitSteps(t, importFiles)

            make_fs_access = functools.partial(ToilFsAccess, fileStore=toil)
            try:
                (wf1, wf2) = makeJob(
                    t, {},
                    use_container=use_container,
                    preserve_environment=options.preserve_environment,
                    tmpdir=os.path.realpath(outdir),
                    workdir=options.workDir)
            except cwltool.process.UnsupportedRequirement as e:
                logging.error(e)
                return 33

            wf1.cwljob = job
            outobj = toil.start(wf1)

        outobj = resolve_indirect(outobj)

        toilStageFiles(toil, outobj, outdir, fileindex, existing, True)

        visit_class(
            outobj, ("File", ),
            functools.partial(compute_checksums,
                              cwltool.stdfsaccess.StdFsAccess("")))

        stdout.write(json.dumps(outobj, indent=4))

    return 0
Example #30
    def arvExecutor(self, tool, job_order, **kwargs):
        self.debug = kwargs.get("debug")

        if kwargs.get("quiet"):
            logger.setLevel(logging.WARN)
            logging.getLogger('arvados.arv-run').setLevel(logging.WARN)

        useruuid = self.api.users().current().execute()["uuid"]
        self.project_uuid = kwargs.get("project_uuid") if kwargs.get("project_uuid") else useruuid
        self.pipeline = None
        self.fs_access = CollectionFsAccess(kwargs["basedir"], api_client=self.api)

        if kwargs.get("create_template"):
            tmpl = RunnerTemplate(self, tool, job_order, kwargs.get("enable_reuse"))
            tmpl.save()
            # cwltool.main will write our return value to stdout.
            return tmpl.uuid

        self.debug = kwargs.get("debug")
        self.ignore_docker_for_reuse = kwargs.get("ignore_docker_for_reuse")

        kwargs["fs_access"] = self.fs_access
        kwargs["enable_reuse"] = kwargs.get("enable_reuse")
        kwargs["use_container"] = True
        kwargs["tmpdir_prefix"] = "tmp"
        kwargs["on_error"] = "continue"
        kwargs["compute_checksum"] = kwargs.get("compute_checksum")

        if self.work_api == "containers":
            kwargs["outdir"] = "/var/spool/cwl"
            kwargs["docker_outdir"] = "/var/spool/cwl"
            kwargs["tmpdir"] = "/tmp"
        elif self.work_api == "jobs":
            kwargs["outdir"] = "$(task.outdir)"
            kwargs["docker_outdir"] = "$(task.outdir)"
            kwargs["tmpdir"] = "$(task.tmpdir)"

        runnerjob = None
        if kwargs.get("submit"):
            if self.work_api == "containers":
                if tool.tool["class"] == "CommandLineTool":
                    runnerjob = tool.job(job_order,
                                         self.output_callback,
                                         **kwargs).next()
                else:
                    runnerjob = RunnerContainer(self, tool, job_order, kwargs.get("enable_reuse"))
            else:
                runnerjob = RunnerJob(self, tool, job_order, kwargs.get("enable_reuse"))

        if not kwargs.get("submit") and "cwl_runner_job" not in kwargs and not self.work_api == "containers":
            # Create pipeline for local run
            self.pipeline = self.api.pipeline_instances().create(
                body={
                    "owner_uuid": self.project_uuid,
                    "name": shortname(tool.tool["id"]),
                    "components": {},
                    "state": "RunningOnClient"}).execute(num_retries=self.num_retries)
            logger.info("Pipeline instance %s", self.pipeline["uuid"])

        if runnerjob and not kwargs.get("wait"):
            runnerjob.run()
            return runnerjob.uuid

        arvados.config.settings()["ARVADOS_DISABLE_WEBSOCKETS"] = "1"

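        # Subscribe to websocket events on container/job objects so that
        # on_message can react to state changes.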
        if self.work_api == "containers":
            events = arvados.events.subscribe(arvados.api('v1'), [["object_uuid", "is_a", "arvados#container"]], self.on_message)
        if self.work_api == "jobs":
            events = arvados.events.subscribe(arvados.api('v1'), [["object_uuid", "is_a", "arvados#job"]], self.on_message)

        if runnerjob:
            jobiter = iter((runnerjob,))
        else:
            if "cwl_runner_job" in kwargs:
                self.uuid = kwargs.get("cwl_runner_job").get('uuid')
            jobiter = tool.job(job_order,
                               self.output_callback,
                               **kwargs)

        try:
            self.cond.acquire()
            # Will continue to hold the lock for the duration of this code
            # except when in cond.wait(), at which point on_message can update
            # job state and process output callbacks.

            for runnable in jobiter:
                if runnable:
                    runnable.run(**kwargs)
                else:
                    if self.processes:
                        self.cond.wait(1)
                    else:
                        logger.error("Workflow is deadlocked, no runnable jobs and not waiting on any pending jobs.")
                        break

            while self.processes:
                self.cond.wait(1)

            events.close()
        except UnsupportedRequirement:
            raise
        except:
            if sys.exc_info()[0] is KeyboardInterrupt:
                logger.error("Interrupted, marking pipeline as failed")
            else:
                logger.error("Caught unhandled exception, marking pipeline as failed.  Error was: %s", sys.exc_info()[1], exc_info=(sys.exc_info()[1] if self.debug else False))
            if self.pipeline:
                self.api.pipeline_instances().update(uuid=self.pipeline["uuid"],
                                                     body={"state": "Failed"}).execute(num_retries=self.num_retries)
            if runnerjob and runnerjob.uuid and self.work_api == "containers":
                self.api.container_requests().update(uuid=runnerjob.uuid,
                                                     body={"priority": "0"}).execute(num_retries=self.num_retries)
        finally:
            self.cond.release()

        if self.final_status == "UnsupportedRequirement":
            raise UnsupportedRequirement("Check log for details.")

        if self.final_status != "success":
            raise WorkflowException("Workflow failed.")

        if self.final_output is None:
            raise WorkflowException("Workflow did not return a result.")

        if kwargs.get("compute_checksum"):
            def compute_checksums(fileobj):
                if "checksum" not in fileobj:
                    checksum = hashlib.sha1()
                    with self.fs_access.open(fileobj["location"], "rb") as f:
                        contents = f.read(1024*1024)
                        while contents != "":
                            checksum.update(contents)
                            contents = f.read(1024*1024)
                    fileobj["checksum"] = "sha1$%s" % checksum.hexdigest()

            adjustFileObjs(self.final_output, compute_checksums)

        return self.final_output
Example #31
    def arvExecutor(self, tool, job_order, **kwargs):
        self.debug = kwargs.get("debug")

        if kwargs.get("quiet"):
            logger.setLevel(logging.WARN)
            logging.getLogger('arvados.arv-run').setLevel(logging.WARN)

        useruuid = self.api.users().current().execute()["uuid"]
        self.project_uuid = kwargs.get("project_uuid") if kwargs.get("project_uuid") else useruuid
        self.pipeline = None

        if kwargs.get("create_template"):
            tmpl = RunnerTemplate(self, tool, job_order, kwargs.get("enable_reuse"))
            tmpl.save()
            # cwltool.main will write our return value to stdout.
            return tmpl.uuid

        if kwargs.get("submit"):
            runnerjob = RunnerJob(self, tool, job_order, kwargs.get("enable_reuse"))
            if not kwargs.get("wait"):
                runnerjob.run()
                return runnerjob.uuid

        events = arvados.events.subscribe(arvados.api('v1'), [["object_uuid", "is_a", "arvados#job"]], self.on_message)

        self.debug = kwargs.get("debug")
        self.ignore_docker_for_reuse = kwargs.get("ignore_docker_for_reuse")
        self.fs_access = CollectionFsAccess(kwargs["basedir"])

        kwargs["fs_access"] = self.fs_access
        kwargs["enable_reuse"] = kwargs.get("enable_reuse")

        kwargs["outdir"] = "$(task.outdir)"
        kwargs["tmpdir"] = "$(task.tmpdir)"

        if kwargs.get("conformance_test"):
            return cwltool.main.single_job_executor(tool, job_order, **kwargs)
        else:
            if kwargs.get("submit"):
                jobiter = iter((runnerjob,))
            else:
                components = {}
                if "cwl_runner_job" in kwargs:
                    components[os.path.basename(tool.tool["id"])] = {"job": kwargs["cwl_runner_job"]}

                self.pipeline = self.api.pipeline_instances().create(
                    body={
                        "owner_uuid": self.project_uuid,
                        "name": shortname(tool.tool["id"]),
                        "components": components,
                        "state": "RunningOnClient"}).execute(num_retries=self.num_retries)

                logger.info("Pipeline instance %s", self.pipeline["uuid"])

                jobiter = tool.job(job_order,
                                   self.output_callback,
                                   docker_outdir="$(task.outdir)",
                                   **kwargs)

            try:
                self.cond.acquire()
                # Will continue to hold the lock for the duration of this code
                # except when in cond.wait(), at which point on_message can update
                # job state and process output callbacks.

                for runnable in jobiter:
                    if runnable:
                        runnable.run(**kwargs)
                    else:
                        if self.jobs:
                            self.cond.wait(1)
                        else:
                            logger.error("Workflow is deadlocked, no runnable jobs and not waiting on any pending jobs.")
                            break

                while self.jobs:
                    self.cond.wait(1)

                events.close()
            except:
                if sys.exc_info()[0] is KeyboardInterrupt:
                    logger.error("Interrupted, marking pipeline as failed")
                else:
                    logger.error("Caught unhandled exception, marking pipeline as failed.  Error was: %s", sys.exc_info()[0], exc_info=(sys.exc_info()[1] if self.debug else False))
                if self.pipeline:
                    self.api.pipeline_instances().update(uuid=self.pipeline["uuid"],
                                                         body={"state": "Failed"}).execute(num_retries=self.num_retries)
            finally:
                self.cond.release()

            if self.final_output is None:
                raise cwltool.workflow.WorkflowException("Workflow did not return a result.")

            return self.final_output
Example #32
    def arv_executor(self, tool, job_order, runtimeContext, logger=None):
        self.debug = runtimeContext.debug

        tool.visit(self.check_features)

        self.project_uuid = runtimeContext.project_uuid
        self.pipeline = None
        self.fs_access = runtimeContext.make_fs_access(runtimeContext.basedir)
        self.secret_store = runtimeContext.secret_store

        self.trash_intermediate = runtimeContext.trash_intermediate
        if self.trash_intermediate and self.work_api != "containers":
            raise Exception(
                "--trash-intermediate is only supported with --api=containers."
            )

        self.intermediate_output_ttl = runtimeContext.intermediate_output_ttl
        if self.intermediate_output_ttl and self.work_api != "containers":
            raise Exception(
                "--intermediate-output-ttl is only supported with --api=containers."
            )
        if self.intermediate_output_ttl < 0:
            raise Exception(
                "Invalid value %d for --intermediate-output-ttl, cannot be less than zero"
                % self.intermediate_output_ttl)

        if runtimeContext.submit_request_uuid and self.work_api != "containers":
            raise Exception(
                "--submit-request-uuid requires containers API, but using '{}' api"
                .format(self.work_api))

        if not runtimeContext.name:
            runtimeContext.name = self.name = tool.tool.get("label") or tool.metadata.get("label") or os.path.basename(tool.tool["id"])

        # Upload direct dependencies of workflow steps, get back mapping of files to keep references.
        # Also uploads docker images.
        merged_map = upload_workflow_deps(self, tool)

        # Reload tool object which may have been updated by
        # upload_workflow_deps
        # Don't validate this time because it will just print redundant errors.
        loadingContext = self.loadingContext.copy()
        loadingContext.loader = tool.doc_loader
        loadingContext.avsc_names = tool.doc_schema
        loadingContext.metadata = tool.metadata
        loadingContext.do_validate = False

        tool = self.arv_make_tool(tool.doc_loader.idx[tool.tool["id"]],
                                  loadingContext)

        # Upload local file references in the job order.
        job_order = upload_job_order(self, "%s input" % runtimeContext.name,
                                     tool, job_order)

        existing_uuid = runtimeContext.update_workflow
        if existing_uuid or runtimeContext.create_workflow:
            # Create a pipeline template or workflow record and exit.
            if self.work_api == "jobs":
                tmpl = RunnerTemplate(
                    self,
                    tool,
                    job_order,
                    runtimeContext.enable_reuse,
                    uuid=existing_uuid,
                    submit_runner_ram=runtimeContext.submit_runner_ram,
                    name=runtimeContext.name,
                    merged_map=merged_map)
                tmpl.save()
                # cwltool.main will write our return value to stdout.
                return (tmpl.uuid, "success")
            elif self.work_api == "containers":
                return (upload_workflow(
                    self,
                    tool,
                    job_order,
                    self.project_uuid,
                    uuid=existing_uuid,
                    submit_runner_ram=runtimeContext.submit_runner_ram,
                    name=runtimeContext.name,
                    merged_map=merged_map), "success")

        self.ignore_docker_for_reuse = runtimeContext.ignore_docker_for_reuse
        self.eval_timeout = runtimeContext.eval_timeout

        runtimeContext = runtimeContext.copy()
        runtimeContext.use_container = True
        runtimeContext.tmpdir_prefix = "tmp"
        runtimeContext.work_api = self.work_api

        if self.work_api == "containers":
            if self.ignore_docker_for_reuse:
                raise Exception(
                    "--ignore-docker-for-reuse not supported with containers API."
                )
            runtimeContext.outdir = "/var/spool/cwl"
            runtimeContext.docker_outdir = "/var/spool/cwl"
            runtimeContext.tmpdir = "/tmp"
            runtimeContext.docker_tmpdir = "/tmp"
        elif self.work_api == "jobs":
            if runtimeContext.priority != DEFAULT_PRIORITY:
                raise Exception("--priority not implemented for jobs API.")
            runtimeContext.outdir = "$(task.outdir)"
            runtimeContext.docker_outdir = "$(task.outdir)"
            runtimeContext.tmpdir = "$(task.tmpdir)"

        if runtimeContext.priority < 1 or runtimeContext.priority > 1000:
            raise Exception("--priority must be in the range 1..1000.")

        runnerjob = None
        if runtimeContext.submit:
            # Submit a runner job to run the workflow for us.
            if self.work_api == "containers":
                if tool.tool["class"] == "CommandLineTool" and runtimeContext.wait:
                    runtimeContext.runnerjob = tool.tool["id"]
                    runnerjob = tool.job(job_order, self.output_callback,
                                         runtimeContext).next()
                else:
                    runnerjob = RunnerContainer(
                        self,
                        tool,
                        job_order,
                        runtimeContext.enable_reuse,
                        self.output_name,
                        self.output_tags,
                        submit_runner_ram=runtimeContext.submit_runner_ram,
                        name=runtimeContext.name,
                        on_error=runtimeContext.on_error,
                        submit_runner_image=runtimeContext.submit_runner_image,
                        intermediate_output_ttl=runtimeContext.intermediate_output_ttl,
                        merged_map=merged_map,
                        priority=runtimeContext.priority,
                        secret_store=self.secret_store)
            elif self.work_api == "jobs":
                runnerjob = RunnerJob(
                    self,
                    tool,
                    job_order,
                    runtimeContext.enable_reuse,
                    self.output_name,
                    self.output_tags,
                    submit_runner_ram=runtimeContext.submit_runner_ram,
                    name=runtimeContext.name,
                    on_error=runtimeContext.on_error,
                    submit_runner_image=runtimeContext.submit_runner_image,
                    merged_map=merged_map)
        elif runtimeContext.cwl_runner_job is None and self.work_api == "jobs":
            # Create pipeline for local run
            self.pipeline = self.api.pipeline_instances().create(
                body={
                    "owner_uuid": self.project_uuid,
                    "name": runtimeContext.name if runtimeContext.name else shortname(tool.tool["id"]),
                    "components": {},
                    "state": "RunningOnClient"
                }).execute(num_retries=self.num_retries)
            logger.info("Pipeline instance %s", self.pipeline["uuid"])

        if runnerjob and not runtimeContext.wait:
            submitargs = runtimeContext.copy()
            submitargs.submit = False
            runnerjob.run(submitargs)
            return (runnerjob.uuid, "success")

        self.poll_api = arvados.api('v1')
        self.polling_thread = threading.Thread(target=self.poll_states)
        self.polling_thread.start()

        self.task_queue = TaskQueue(self.workflow_eval_lock, self.thread_count)

        if runnerjob:
            jobiter = iter((runnerjob, ))
        else:
            if runtimeContext.cwl_runner_job is not None:
                self.uuid = runtimeContext.cwl_runner_job.get('uuid')
            jobiter = tool.job(job_order, self.output_callback, runtimeContext)

        try:
            self.workflow_eval_lock.acquire()
            # Holds the lock while this code runs and releases it when
            # it is safe to do so in self.workflow_eval_lock.wait(),
            # at which point on_message can update job state and
            # process output callbacks.

            loopperf = Perf(metrics, "jobiter")
            loopperf.__enter__()
            for runnable in jobiter:
                loopperf.__exit__()

                if self.stop_polling.is_set():
                    break

                if self.task_queue.error is not None:
                    raise self.task_queue.error

                if runnable:
                    with Perf(metrics, "run"):
                        self.start_run(runnable, runtimeContext)
                else:
                    if (self.task_queue.in_flight + len(self.processes)) > 0:
                        self.workflow_eval_lock.wait(3)
                    else:
                        logger.error(
                            "Workflow is deadlocked, no runnable processes and not waiting on any pending processes."
                        )
                        break
                loopperf.__enter__()
            loopperf.__exit__()

            while (self.task_queue.in_flight + len(self.processes)) > 0:
                if self.task_queue.error is not None:
                    raise self.task_queue.error
                self.workflow_eval_lock.wait(3)

        except UnsupportedRequirement:
            raise
        except:
            if sys.exc_info()[0] is KeyboardInterrupt or sys.exc_info()[0] is SystemExit:
                logger.error("Interrupted, workflow will be cancelled")
            else:
                logger.error(
                    "Execution failed: %s",
                    sys.exc_info()[1],
                    exc_info=(sys.exc_info()[1] if self.debug else False))
            if self.pipeline:
                self.api.pipeline_instances().update(
                    uuid=self.pipeline["uuid"], body={
                        "state": "Failed"
                    }).execute(num_retries=self.num_retries)
            if runnerjob and runnerjob.uuid and self.work_api == "containers":
                self.api.container_requests().update(
                    uuid=runnerjob.uuid, body={
                        "priority": "0"
                    }).execute(num_retries=self.num_retries)
        finally:
            self.workflow_eval_lock.release()
            self.task_queue.drain()
            self.stop_polling.set()
            self.polling_thread.join()
            self.task_queue.join()

        if self.final_status == "UnsupportedRequirement":
            raise UnsupportedRequirement("Check log for details.")

        if self.final_output is None:
            raise WorkflowException("Workflow did not return a result.")

        if runtimeContext.submit and isinstance(runnerjob, Runner):
            logger.info("Final output collection %s", runnerjob.final_output)
        else:
            if self.output_name is None:
                self.output_name = "Output of %s" % (shortname(
                    tool.tool["id"]))
            if self.output_tags is None:
                self.output_tags = ""

            storage_classes = runtimeContext.storage_classes.strip().split(",")
            self.final_output, self.final_output_collection = self.make_output_collection(
                self.output_name, storage_classes, self.output_tags,
                self.final_output)
            self.set_crunch_output()

        if runtimeContext.compute_checksum:
            adjustDirObjs(self.final_output,
                          partial(get_listing, self.fs_access))
            adjustFileObjs(self.final_output,
                           partial(compute_checksums, self.fs_access))

        if self.trash_intermediate and self.final_status == "success":
            self.trash_intermediate_output()

        return (self.final_output, self.final_status)
Example #33
    def arvExecutor(self, tool, job_order, **kwargs):
        self.debug = kwargs.get("debug")

        tool.visit(self.check_writable)

        if kwargs.get("quiet"):
            logger.setLevel(logging.WARN)
            logging.getLogger('arvados.arv-run').setLevel(logging.WARN)

        useruuid = self.api.users().current().execute()["uuid"]
        self.project_uuid = kwargs.get("project_uuid") if kwargs.get(
            "project_uuid") else useruuid
        self.pipeline = None
        make_fs_access = kwargs.get("make_fs_access") or partial(
            CollectionFsAccess, api_client=self.api)
        self.fs_access = make_fs_access(kwargs["basedir"])

        if kwargs.get("create_template"):
            tmpl = RunnerTemplate(self, tool, job_order,
                                  kwargs.get("enable_reuse"))
            tmpl.save()
            # cwltool.main will write our return value to stdout.
            return tmpl.uuid

        self.debug = kwargs.get("debug")
        self.ignore_docker_for_reuse = kwargs.get("ignore_docker_for_reuse")

        kwargs["make_fs_access"] = make_fs_access
        kwargs["enable_reuse"] = kwargs.get("enable_reuse")
        kwargs["use_container"] = True
        kwargs["tmpdir_prefix"] = "tmp"
        kwargs["on_error"] = "continue"
        kwargs["compute_checksum"] = kwargs.get("compute_checksum")

        if self.work_api == "containers":
            kwargs["outdir"] = "/var/spool/cwl"
            kwargs["docker_outdir"] = "/var/spool/cwl"
            kwargs["tmpdir"] = "/tmp"
            kwargs["docker_tmpdir"] = "/tmp"
        elif self.work_api == "jobs":
            kwargs["outdir"] = "$(task.outdir)"
            kwargs["docker_outdir"] = "$(task.outdir)"
            kwargs["tmpdir"] = "$(task.tmpdir)"

        runnerjob = None
        if kwargs.get("submit"):
            if self.work_api == "containers":
                if tool.tool["class"] == "CommandLineTool":
                    runnerjob = tool.job(job_order, self.output_callback,
                                         **kwargs).next()
                else:
                    runnerjob = RunnerContainer(self, tool, job_order,
                                                kwargs.get("enable_reuse"))
            else:
                runnerjob = RunnerJob(self, tool, job_order,
                                      kwargs.get("enable_reuse"))

        if not kwargs.get("submit") and "cwl_runner_job" not in kwargs and self.work_api != "containers":
            # Create pipeline for local run
            self.pipeline = self.api.pipeline_instances().create(
                body={
                    "owner_uuid": self.project_uuid,
                    "name": shortname(tool.tool["id"]),
                    "components": {},
                    "state": "RunningOnClient"
                }).execute(num_retries=self.num_retries)
            logger.info("Pipeline instance %s", self.pipeline["uuid"])

        if runnerjob and not kwargs.get("wait"):
            runnerjob.run()
            return runnerjob.uuid

        self.poll_api = arvados.api('v1')
        self.polling_thread = threading.Thread(target=self.poll_states)
        self.polling_thread.start()

        if runnerjob:
            jobiter = iter((runnerjob, ))
        else:
            if "cwl_runner_job" in kwargs:
                self.uuid = kwargs.get("cwl_runner_job").get('uuid')
            jobiter = tool.job(job_order, self.output_callback, **kwargs)

        try:
            self.cond.acquire()
            # Will continue to hold the lock for the duration of this code
            # except when in cond.wait(), at which point on_message can update
            # job state and process output callbacks.

            for runnable in jobiter:
                if runnable:
                    runnable.run(**kwargs)
                else:
                    if self.processes:
                        self.cond.wait(1)
                    else:
                        logger.error(
                            "Workflow is deadlocked, no runnable jobs and not waiting on any pending jobs."
                        )
                        break

            while self.processes:
                self.cond.wait(1)

        except UnsupportedRequirement:
            raise
        except:
            if sys.exc_info()[0] is KeyboardInterrupt:
                logger.error("Interrupted, marking pipeline as failed")
            else:
                logger.error(
                    "Caught unhandled exception, marking pipeline as failed.  Error was: %s",
                    sys.exc_info()[1],
                    exc_info=(sys.exc_info()[1] if self.debug else False))
            if self.pipeline:
                self.api.pipeline_instances().update(
                    uuid=self.pipeline["uuid"], body={
                        "state": "Failed"
                    }).execute(num_retries=self.num_retries)
            if runnerjob and runnerjob.uuid and self.work_api == "containers":
                self.api.container_requests().update(
                    uuid=runnerjob.uuid, body={
                        "priority": "0"
                    }).execute(num_retries=self.num_retries)
        finally:
            self.cond.release()
            self.stop_polling.set()
            self.polling_thread.join()

        if self.final_status == "UnsupportedRequirement":
            raise UnsupportedRequirement("Check log for details.")

        if self.final_status != "success":
            raise WorkflowException("Workflow failed.")

        if self.final_output is None:
            raise WorkflowException("Workflow did not return a result.")

        if kwargs.get("compute_checksum"):
            adjustFileObjs(self.final_output,
                           partial(compute_checksums, self.fs_access))

        return self.final_output
Example #34
def sn(n):
    if isinstance(n, dict):
        return shortname(n["id"])
    if isinstance(n, string_types):
        return shortname(n)
Example #35
def main(args=None, stdout=sys.stdout):
    config = Config()
    config.cwl = True
    parser = argparse.ArgumentParser()
    addOptions(parser, config)
    parser.add_argument("cwltool", type=str)
    parser.add_argument("cwljob", nargs=argparse.REMAINDER)

    # Will override the "jobStore" positional argument, enables
    # user to select jobStore or get a default from logic one below.
    parser.add_argument("--jobStore", type=str)
    parser.add_argument("--not-strict", action="store_true")
    parser.add_argument("--no-container", action="store_true")
    parser.add_argument("--quiet", dest="logLevel", action="store_const", const="ERROR")
    parser.add_argument("--basedir", type=str)
    parser.add_argument("--outdir", type=str, default=os.getcwd())
    parser.add_argument("--version", action='version', version=baseVersion)
    parser.add_argument("--user-space-docker-cmd",
                        help="(Linux/OS X only) Specify a user space docker "
                        "command (like udocker or dx-docker) that will be "
                        "used to call 'pull' and 'run'")
    parser.add_argument("--preserve-environment", type=str, nargs='+',
                    help="Preserve specified environment variables when running CommandLineTools",
                    metavar=("VAR1 VAR2"),
                    default=("PATH",),
                    dest="preserve_environment")
    # help="Dependency resolver configuration file describing how to adapt 'SoftwareRequirement' packages to current system."
    parser.add_argument("--beta-dependency-resolvers-configuration", default=None)
    # help="Defaut root directory used by dependency resolvers configuration."
    parser.add_argument("--beta-dependencies-directory", default=None)
    # help="Use biocontainers for tools without an explicitly annotated Docker container."
    parser.add_argument("--beta-use-biocontainers", default=None, action="store_true")
    # help="Short cut to use Conda to resolve 'SoftwareRequirement' packages."
    parser.add_argument("--beta-conda-dependencies", default=None, action="store_true")
    parser.add_argument("--tmpdir-prefix", type=Text,
                        help="Path prefix for temporary directories",
                        default="tmp")
    parser.add_argument("--tmp-outdir-prefix", type=Text,
                        help="Path prefix for intermediate output directories",
                        default="tmp")

    # mkdtemp actually creates the directory, but
    # toil requires that the directory not exist,
    # so make it and delete it and allow
    # toil to create it again (!)
    workdir = tempfile.mkdtemp()
    os.rmdir(workdir)

    if args is None:
        args = sys.argv[1:]

    options = parser.parse_args([workdir] + args)

    use_container = not options.no_container

    if options.logLevel:
        cwllogger.setLevel(options.logLevel)

    outdir = os.path.abspath(options.outdir)
    fileindex = {}
    existing = {}
    make_tool_kwargs = {}
    conf_file = getattr(options, "beta_dependency_resolvers_configuration", None)  # Text
    use_conda_dependencies = getattr(options, "beta_conda_dependencies", None)  # Text
    job_script_provider = None
    if conf_file or use_conda_dependencies:
        dependencies_configuration = DependenciesConfiguration(options)  # type: DependenciesConfiguration
        job_script_provider = dependencies_configuration

    options.default_container = None
    make_tool_kwargs["find_default_container"] = functools.partial(find_default_container, options)

    with Toil(options) as toil:
        if options.restart:
            outobj = toil.restart()
        else:
            useStrict = not options.not_strict
            make_tool_kwargs["hints"] = [{
                "class": "ResourceRequirement",
                "coresMin": toil.config.defaultCores,
                "ramMin": toil.config.defaultMemory / (2**20),
                "outdirMin": toil.config.defaultDisk / (2**20),
                "tmpdirMin": 0
            }]
            try:
                t = cwltool.load_tool.load_tool(options.cwltool, toilMakeTool,
                                                kwargs=make_tool_kwargs,
                                                resolver=cwltool.resolver.tool_resolver,
                                                strict=useStrict)
                unsupportedRequirementsCheck(t.requirements)
            except cwltool.process.UnsupportedRequirement as e:
                logging.error(e)
                return 33

            if type(t) == int:
                return t

            options.workflow = options.cwltool
            options.job_order = options.cwljob
            options.tool_help = None
            options.debug = options.logLevel == "DEBUG"
            job, options.basedir, loader = cwltool.main.load_job_order(
                options, sys.stdin, None, [], options.job_order)
            job = cwltool.main.init_job_order(job, options, t, loader=loader)

            fillInDefaults(t.tool["inputs"], job)

            def pathToLoc(p):
                if "location" not in p and "path" in p:
                    p["location"] = p["path"]
                    del p["path"]

            def importFiles(tool):
                visit_class(tool, ("File", "Directory"), pathToLoc)
                normalizeFilesDirs(tool)
                adjustDirObjs(tool, functools.partial(get_listing,
                                                      cwltool.stdfsaccess.StdFsAccess(""),
                                                      recursive=True))
                adjustFileObjs(tool, functools.partial(uploadFile,
                                                       toil.importFile,
                                                       fileindex, existing, skip_broken=True))

            t.visit(importFiles)

            for inp in t.tool["inputs"]:
                def setSecondary(fileobj):
                    if isinstance(fileobj, dict) and fileobj.get("class") == "File":
                        if "secondaryFiles" not in fileobj:
                            fileobj["secondaryFiles"] = [{
                                "location": cwltool.builder.substitute(fileobj["location"], sf), "class": "File"}
                                                         for sf in inp["secondaryFiles"]]

                    if isinstance(fileobj, list):
                        for e in fileobj:
                            setSecondary(e)

                if shortname(inp["id"]) in job and inp.get("secondaryFiles"):
                    setSecondary(job[shortname(inp["id"])])

            importFiles(job)
            visitSteps(t, importFiles)

            try:
                make_opts = copy.deepcopy(vars(options))
                make_opts.update({'tool': t, 'jobobj': {},
                    'use_container': use_container,
                    'tmpdir': os.path.realpath(outdir),
                    'job_script_provider': job_script_provider})

                (wf1, wf2) = makeJob(**make_opts)
            except cwltool.process.UnsupportedRequirement as e:
                logging.error(e)
                return 33

            wf1.cwljob = job
            outobj = toil.start(wf1)

        outobj = resolve_indirect(outobj)

        toilStageFiles(toil, outobj, outdir, fileindex, existing, True)

        visit_class(outobj, ("File",), functools.partial(compute_checksums, cwltool.stdfsaccess.StdFsAccess("")))

        stdout.write(json.dumps(outobj, indent=4))

    return 0
Example #36
    def job(self, joborder, output_callback, runtimeContext):

        builder = make_builder(joborder, self.hints, self.requirements, runtimeContext)
        runtimeContext = set_cluster_target(self.tool, self.arvrunner, builder, runtimeContext)

        req, _ = self.get_requirement("http://arvados.org/cwl#RunInSingleContainer")
        if not req:
            return super(ArvadosWorkflow, self).job(joborder, output_callback, runtimeContext)

        # RunInSingleContainer is true

        with SourceLine(self.tool, None, WorkflowException, logger.isEnabledFor(logging.DEBUG)):
            if "id" not in self.tool:
                raise WorkflowException("%s object must have 'id'" % (self.tool["class"]))
        document_loader, workflowobj, uri = (self.doc_loader, self.doc_loader.fetch(self.tool["id"]), self.tool["id"])

        discover_secondary_files(self.tool["inputs"], joborder)

        with Perf(metrics, "subworkflow upload_deps"):
            upload_dependencies(self.arvrunner,
                                os.path.basename(joborder.get("id", "#")),
                                document_loader,
                                joborder,
                                joborder.get("id", "#"),
                                False)

            if self.wf_pdh is None:
                workflowobj["requirements"] = dedup_reqs(self.requirements)
                workflowobj["hints"] = dedup_reqs(self.hints)

                packed = pack(document_loader, workflowobj, uri, self.metadata)

                def visit(item):
                    for t in ("hints", "requirements"):
                        if t not in item:
                            continue
                        for req in item[t]:
                            if req["class"] == "ResourceRequirement":
                                dyn = False
                                for k in max_res_pars + sum_res_pars:
                                    if k in req:
                                        if isinstance(req[k], basestring):
                                            if item["id"] == "#main":
                                                # only the top-level requirements/hints may contain expressions
                                                self.dynamic_resource_req.append(req)
                                                dyn = True
                                                break
                                            else:
                                                with SourceLine(req, k, WorkflowException):
                                                    raise WorkflowException("Non-top-level ResourceRequirement in single container cannot have expressions")
                                if not dyn:
                                    self.static_resource_req.append(req)
                            if req["class"] == "DockerRequirement":
                                if "http://arvados.org/cwl#dockerCollectionPDH" in req:
                                    del req["http://arvados.org/cwl#dockerCollectionPDH"]

                visit_class(packed["$graph"], ("Workflow", "CommandLineTool"), visit)

                if self.static_resource_req:
                    self.static_resource_req = [get_overall_res_req(self.static_resource_req)]

                upload_dependencies(self.arvrunner,
                                    runtimeContext.name,
                                    document_loader,
                                    packed,
                                    uri,
                                    False)

                # Discover files/directories referenced by the
                # workflow (mainly "default" values)
                visit_class(packed, ("File", "Directory"), self.wf_reffiles.append)

        if self.dynamic_resource_req:
            # Evaluate dynamic resource requirements using current builder
            rs = copy.copy(self.static_resource_req)
            for dyn_rs in self.dynamic_resource_req:
                eval_req = {"class": "ResourceRequirement"}
                for a in max_res_pars + sum_res_pars:
                    if a in dyn_rs:
                        eval_req[a] = builder.do_eval(dyn_rs[a])
                rs.append(eval_req)
            job_res_reqs = [get_overall_res_req(rs)]
        else:
            job_res_reqs = self.static_resource_req

        with Perf(metrics, "subworkflow adjust"):
            joborder_resolved = copy.deepcopy(joborder)
            joborder_keepmount = copy.deepcopy(joborder)

            reffiles = []
            visit_class(joborder_keepmount, ("File", "Directory"), reffiles.append)

            mapper = ArvPathMapper(self.arvrunner, reffiles+self.wf_reffiles, runtimeContext.basedir,
                                   "/keep/%s",
                                   "/keep/%s/%s")

            # For containers API, we need to make sure any extra
            # referenced files (i.e. referenced by the workflow but
            # not in the inputs) are included in the mounts.
            if self.wf_reffiles:
                runtimeContext = runtimeContext.copy()
                runtimeContext.extra_reffiles = copy.deepcopy(self.wf_reffiles)

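            # Rewrite keep: locations to their in-container /keep/ mount
            # paths; literal ("_:") locations are dropped.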
            def keepmount(obj):
                remove_redundant_fields(obj)
                with SourceLine(obj, None, WorkflowException, logger.isEnabledFor(logging.DEBUG)):
                    if "location" not in obj:
                        raise WorkflowException("%s object is missing required 'location' field: %s" % (obj["class"], obj))
                with SourceLine(obj, "location", WorkflowException, logger.isEnabledFor(logging.DEBUG)):
                    if obj["location"].startswith("keep:"):
                        obj["location"] = mapper.mapper(obj["location"]).target
                        if "listing" in obj:
                            del obj["listing"]
                    elif obj["location"].startswith("_:"):
                        del obj["location"]
                    else:
                        raise WorkflowException("Location is not a keep reference or a literal: '%s'" % obj["location"])

            visit_class(joborder_keepmount, ("File", "Directory"), keepmount)

            def resolved(obj):
                if obj["location"].startswith("keep:"):
                    obj["location"] = mapper.mapper(obj["location"]).resolved

            visit_class(joborder_resolved, ("File", "Directory"), resolved)

            if self.wf_pdh is None:
                adjustFileObjs(packed, keepmount)
                adjustDirObjs(packed, keepmount)
                self.wf_pdh = upload_workflow_collection(self.arvrunner, shortname(self.tool["id"]), packed)

        wf_runner = cmap({
            "class": "CommandLineTool",
            "baseCommand": "cwltool",
            "inputs": self.tool["inputs"],
            "outputs": self.tool["outputs"],
            "stdout": "cwl.output.json",
            "requirements": self.requirements+job_res_reqs+[
                {"class": "InlineJavascriptRequirement"},
                {
                "class": "InitialWorkDirRequirement",
                "listing": [{
                        "entryname": "workflow.cwl",
                        "entry": '$({"class": "File", "location": "keep:%s/workflow.cwl"})' % self.wf_pdh
                    }, {
                        "entryname": "cwl.input.yml",
                        "entry": json.dumps(joborder_keepmount, indent=2, sort_keys=True, separators=(',',': ')).replace("\\", "\\\\").replace('$(', '\$(').replace('${', '\${')
                    }]
            }],
            "hints": self.hints,
            "arguments": ["--no-container", "--move-outputs", "--preserve-entire-environment", "workflow.cwl#main", "cwl.input.yml"],
            "id": "#"
        })
        return ArvadosCommandTool(self.arvrunner, wf_runner, self.loadingContext).job(joborder_resolved, output_callback, runtimeContext)
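The cwl.input.yml entry above escapes backslashes and the CWL expression markers so that cwltool treats the serialized job order as literal data rather than re-evaluating it. A minimal sketch of that escaping, with a hypothetical helper name:

import json

def escape_cwl_expressions(obj):
    # Serialize, then escape backslashes and the $( / ${ expression
    # markers so cwltool will not re-evaluate them (hypothetical helper).
    text = json.dumps(obj, indent=2, sort_keys=True, separators=(',', ': '))
    return text.replace("\\", "\\\\").replace("$(", "\\$(").replace("${", "\\${")

# A literal "$(inputs.x)" in a value survives as data, not as an expression.
print(escape_cwl_expressions({"msg": "$(inputs.x)"}))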
Example #37
def discover_secondary_files(inputs, job_order, discovered=None):
    for t in inputs:
        if shortname(t["id"]) in job_order and t.get("secondaryFiles"):
            setSecondary(t, job_order[shortname(t["id"])], discovered)
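A minimal usage sketch, assuming stub implementations of shortname and setSecondary (the real ones come from cwltool and arvados-cwl-runner): only inputs that are present in the job order and declare secondaryFiles trigger discovery.

def shortname(inputid):  # stub: keep only the last fragment segment
    return inputid.split("#")[-1].split("/")[-1]

def setSecondary(t, primary, discovered):  # stub: record what would be attached
    print("would attach %s to %s" % (t["secondaryFiles"], primary["location"]))

inputs = [{"id": "#main/reads", "secondaryFiles": [".fai"]},
          {"id": "#main/threads"}]
job_order = {"reads": {"class": "File", "location": "keep:abc+123/reads.fa"}}

discover_secondary_files(inputs, job_order)  # only "reads" qualifies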
Example #38
def sn(n):
    if isinstance(n, dict):
        return shortname(n["id"])
    if isinstance(n, string_types):
        return shortname(n)
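sn() simply normalizes either form (a dict carrying an "id", or a plain string) to a short name. For reference, cwltool's shortname keeps only the last path segment of the URI fragment; a simplified stand-in with the same behavior on these inputs:

def shortname(inputid):  # simplified stand-in for cwltool.process.shortname
    return inputid.split("#", 1)[-1].split("/")[-1]

print(shortname("file:///wf.cwl#step1/out"))  # -> out
print(shortname("file:///wf.cwl#x"))          # -> x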
Example #39
def discover_default_secondary_files(obj):
    discover_secondary_files(
        obj["inputs"], {
            shortname(t["id"]): t["default"]
            for t in obj["inputs"] if "default" in t
        }, discovered)
Example #40
    def arvExecutor(self, tool, job_order, input_basedir, args, **kwargs):
        events = arvados.events.subscribe(arvados.api('v1'), [["object_uuid", "is_a", "arvados#job"]], self.on_message)

        try:
            self.api.collections().get(uuid=crunchrunner_pdh).execute()
        except arvados.errors.ApiError as e:
            import httplib2
            h = httplib2.Http(ca_certs=arvados.util.ca_certs_path())
            resp, content = h.request(crunchrunner_download, "GET")
            resp2, content2 = h.request(certs_download, "GET")
            with arvados.collection.Collection() as col:
                with col.open("crunchrunner", "w") as f:
                    f.write(content)
                with col.open("ca-certificates.crt", "w") as f:
                    f.write(content2)

                col.save_new("crunchrunner binary", ensure_unique_name=True)

        self.fs_access = CollectionFsAccess(input_basedir)

        kwargs["fs_access"] = self.fs_access
        kwargs["enable_reuse"] = args.enable_reuse

        kwargs["outdir"] = "$(task.outdir)"
        kwargs["tmpdir"] = "$(task.tmpdir)"

        useruuid = self.api.users().current().execute()["uuid"]
        self.project_uuid = args.project_uuid if args.project_uuid else useruuid

        if kwargs.get("conformance_test"):
            return cwltool.main.single_job_executor(tool, job_order, input_basedir, args, **kwargs)
        else:
            self.pipeline = self.api.pipeline_instances().create(
                body={
                    "owner_uuid": self.project_uuid,
                    "name": shortname(tool.tool["id"]),
                    "components": {},
                    "state": "RunningOnClient"}).execute(num_retries=self.num_retries)

            logger.info("Pipeline instance %s", self.pipeline["uuid"])

            jobiter = tool.job(job_order,
                               input_basedir,
                               self.output_callback,
                               docker_outdir="$(task.outdir)",
                               **kwargs)

            try:
                self.cond.acquire()
                # Will continue to hold the lock for the duration of this code
                # except when in cond.wait(), at which point on_message can update
                # job state and process output callbacks.

                for runnable in jobiter:
                    if runnable:
                        runnable.run(**kwargs)
                    else:
                        if self.jobs:
                            self.cond.wait(1)
                        else:
                            logger.error("Workflow is deadlocked, no runnable jobs and not waiting on any pending jobs.")
                            break

                while self.jobs:
                    self.cond.wait(1)

                events.close()

                if self.final_output is None:
                    raise cwltool.workflow.WorkflowException("Workflow did not return a result.")

                # create final output collection
            except:
                if sys.exc_info()[0] is KeyboardInterrupt:
                    logger.error("Interrupted, marking pipeline as failed")
                else:
                    logger.exception("Caught unhandled exception, marking pipeline as failed")
                self.api.pipeline_instances().update(uuid=self.pipeline["uuid"],
                                                     body={"state": "Failed"}).execute(num_retries=self.num_retries)
            finally:
                self.cond.release()

            return self.final_output
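The acquire/wait loop above is a standard condition-variable pattern: the executor holds the lock except while in cond.wait(), during which the event callback may update job state and fire output callbacks. A stripped-down, self-contained sketch of the same pattern:

import threading
import time

cond = threading.Condition()
pending = {"n": 2}

def worker():
    time.sleep(0.1)           # simulate a job running
    with cond:
        pending["n"] -= 1     # like on_message updating job state
        cond.notify()

for _ in range(2):
    threading.Thread(target=worker).start()

with cond:
    while pending["n"]:       # like "while self.jobs: self.cond.wait(1)"
        cond.wait(1)          # releases the lock so workers can make progress
print("all jobs finished")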
Example #41
def make_workflow_exception(msg):
    return WorkflowException(
        u"Error collecting output for parameter '%s':\n%s"
        % (shortname(port["id"]), msg))
Example #42
    def __init__(self, step, cwljob, **kwargs):
        super(CWLScatter, self).__init__()
        self.step = step
        self.cwljob = cwljob
        self.valueFrom = {shortname(i["id"]): i["valueFrom"] for i in step.tool["inputs"] if "valueFrom" in i}
        self.executor_options = kwargs
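What the valueFrom map in this constructor captures, shown on a hypothetical step definition: only inputs carrying a valueFrom expression appear, keyed by their short name.

step_inputs = [
    {"id": "#main/step1/x", "valueFrom": "$(self + 1)"},
    {"id": "#main/step1/y"},
]
valueFrom = {i["id"].split("/")[-1]: i["valueFrom"]
             for i in step_inputs if "valueFrom" in i}
print(valueFrom)  # {'x': '$(self + 1)'}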
Example #43
    def arv_executor(self, tool, job_order, **kwargs):
        self.debug = kwargs.get("debug")

        tool.visit(self.check_features)

        self.project_uuid = kwargs.get("project_uuid")
        self.pipeline = None
        make_fs_access = kwargs.get("make_fs_access") or partial(
            CollectionFsAccess,
            api_client=self.api,
            keep_client=self.keep_client)
        self.fs_access = make_fs_access(kwargs["basedir"])

        if not kwargs.get("name"):
            kwargs["name"] = self.name = tool.tool.get(
                "label") or tool.metadata.get("label") or os.path.basename(
                    tool.tool["id"])

        # Upload direct dependencies of workflow steps, get back mapping of files to keep references.
        # Also uploads docker images.
        upload_workflow_deps(self, tool)

        # Reload tool object which may have been updated by
        # upload_workflow_deps
        tool = self.arv_make_tool(tool.doc_loader.idx[tool.tool["id"]],
                                  makeTool=self.arv_make_tool,
                                  loader=tool.doc_loader,
                                  avsc_names=tool.doc_schema,
                                  metadata=tool.metadata)

        # Upload local file references in the job order.
        job_order = upload_job_order(self, "%s input" % kwargs["name"], tool,
                                     job_order)

        existing_uuid = kwargs.get("update_workflow")
        if existing_uuid or kwargs.get("create_workflow"):
            # Create a pipeline template or workflow record and exit.
            if self.work_api == "jobs":
                tmpl = RunnerTemplate(
                    self,
                    tool,
                    job_order,
                    kwargs.get("enable_reuse"),
                    uuid=existing_uuid,
                    submit_runner_ram=kwargs.get("submit_runner_ram"),
                    name=kwargs["name"])
                tmpl.save()
                # cwltool.main will write our return value to stdout.
                return (tmpl.uuid, "success")
            elif self.work_api == "containers":
                return (upload_workflow(
                    self,
                    tool,
                    job_order,
                    self.project_uuid,
                    uuid=existing_uuid,
                    submit_runner_ram=kwargs.get("submit_runner_ram"),
                    name=kwargs["name"]), "success")

        self.ignore_docker_for_reuse = kwargs.get("ignore_docker_for_reuse")

        kwargs["make_fs_access"] = make_fs_access
        kwargs["enable_reuse"] = kwargs.get("enable_reuse")
        kwargs["use_container"] = True
        kwargs["tmpdir_prefix"] = "tmp"
        kwargs["compute_checksum"] = kwargs.get("compute_checksum")

        if self.work_api == "containers":
            kwargs["outdir"] = "/var/spool/cwl"
            kwargs["docker_outdir"] = "/var/spool/cwl"
            kwargs["tmpdir"] = "/tmp"
            kwargs["docker_tmpdir"] = "/tmp"
        elif self.work_api == "jobs":
            kwargs["outdir"] = "$(task.outdir)"
            kwargs["docker_outdir"] = "$(task.outdir)"
            kwargs["tmpdir"] = "$(task.tmpdir)"

        runnerjob = None
        if kwargs.get("submit"):
            # Submit a runner job to run the workflow for us.
            if self.work_api == "containers":
                if tool.tool["class"] == "CommandLineTool":
                    kwargs["runnerjob"] = tool.tool["id"]
                    upload_dependencies(self, kwargs["name"], tool.doc_loader,
                                        tool.tool, tool.tool["id"], False)
                    runnerjob = tool.job(job_order, self.output_callback,
                                         **kwargs).next()
                else:
                    runnerjob = RunnerContainer(
                        self,
                        tool,
                        job_order,
                        kwargs.get("enable_reuse"),
                        self.output_name,
                        self.output_tags,
                        submit_runner_ram=kwargs.get("submit_runner_ram"),
                        name=kwargs.get("name"),
                        on_error=kwargs.get("on_error"),
                        submit_runner_image=kwargs.get("submit_runner_image"))
            elif self.work_api == "jobs":
                runnerjob = RunnerJob(
                    self,
                    tool,
                    job_order,
                    kwargs.get("enable_reuse"),
                    self.output_name,
                    self.output_tags,
                    submit_runner_ram=kwargs.get("submit_runner_ram"),
                    name=kwargs.get("name"),
                    on_error=kwargs.get("on_error"),
                    submit_runner_image=kwargs.get("submit_runner_image"))

        if not kwargs.get("submit") and "cwl_runner_job" not in kwargs \
                and self.work_api == "jobs":
            # Create pipeline for local run
            self.pipeline = self.api.pipeline_instances().create(
                body={
                    "owner_uuid": self.project_uuid,
                    "name": kwargs["name"] if kwargs.get("name")
                            else shortname(tool.tool["id"]),
                    "components": {},
                    "state": "RunningOnClient"
                }).execute(num_retries=self.num_retries)
            logger.info("Pipeline instance %s", self.pipeline["uuid"])

        if runnerjob and not kwargs.get("wait"):
            runnerjob.run(wait=kwargs.get("wait"))
            return (runnerjob.uuid, "success")

        self.poll_api = arvados.api('v1')
        self.polling_thread = threading.Thread(target=self.poll_states)
        self.polling_thread.start()

        if runnerjob:
            jobiter = iter((runnerjob, ))
        else:
            if "cwl_runner_job" in kwargs:
                self.uuid = kwargs.get("cwl_runner_job").get('uuid')
            jobiter = tool.job(job_order, self.output_callback, **kwargs)

        try:
            self.cond.acquire()
            # Will continue to hold the lock for the duration of this code
            # except when in cond.wait(), at which point on_message can update
            # job state and process output callbacks.

            loopperf = Perf(metrics, "jobiter")
            loopperf.__enter__()
            for runnable in jobiter:
                loopperf.__exit__()

                if self.stop_polling.is_set():
                    break

                if runnable:
                    with Perf(metrics, "run"):
                        runnable.run(**kwargs)
                else:
                    if self.processes:
                        self.cond.wait(1)
                    else:
                        logger.error(
                            "Workflow is deadlocked, no runnable jobs and not waiting on any pending jobs."
                        )
                        break
                loopperf.__enter__()
            loopperf.__exit__()

            while self.processes:
                self.cond.wait(1)

        except UnsupportedRequirement:
            raise
        except:
            if sys.exc_info()[0] is KeyboardInterrupt:
                logger.error("Interrupted, marking pipeline as failed")
            else:
                logger.error(
                    "Execution failed: %s",
                    sys.exc_info()[1],
                    exc_info=(sys.exc_info()[1] if self.debug else False))
            if self.pipeline:
                self.api.pipeline_instances().update(
                    uuid=self.pipeline["uuid"], body={
                        "state": "Failed"
                    }).execute(num_retries=self.num_retries)
            if runnerjob and runnerjob.uuid and self.work_api == "containers":
                self.api.container_requests().update(
                    uuid=runnerjob.uuid, body={
                        "priority": "0"
                    }).execute(num_retries=self.num_retries)
        finally:
            self.cond.release()
            self.stop_polling.set()
            self.polling_thread.join()

        if self.final_status == "UnsupportedRequirement":
            raise UnsupportedRequirement("Check log for details.")

        if self.final_output is None:
            raise WorkflowException("Workflow did not return a result.")

        if kwargs.get("submit") and isinstance(runnerjob, Runner):
            logger.info("Final output collection %s", runnerjob.final_output)
        else:
            if self.output_name is None:
                self.output_name = "Output of %s" % (shortname(
                    tool.tool["id"]))
            if self.output_tags is None:
                self.output_tags = ""
            self.final_output, self.final_output_collection = self.make_output_collection(
                self.output_name, self.output_tags, self.final_output)
            self.set_crunch_output()

        if kwargs.get("compute_checksum"):
            adjustDirObjs(self.final_output, partial(getListing,
                                                     self.fs_access))
            adjustFileObjs(self.final_output,
                           partial(compute_checksums, self.fs_access))

        return (self.final_output, self.final_status)
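make_fs_access above is built with functools.partial so the long-lived API clients are bound once and only basedir is supplied per call. An illustration of the pattern, with DummyFsAccess standing in for CollectionFsAccess:

from functools import partial

class DummyFsAccess(object):  # stand-in for CollectionFsAccess
    def __init__(self, basedir, api_client=None, keep_client=None):
        self.basedir = basedir

make_fs_access = partial(DummyFsAccess, api_client="api", keep_client="keep")
fs = make_fs_access("/work")   # basedir supplied at call time
print(fs.basedir)              # -> /work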
Example #44
def main(args=None, stdout=sys.stdout):
    """Main method for toil-cwl-runner."""
    cwllogger.removeHandler(defaultStreamHandler)
    config = Config()
    config.cwl = True
    parser = argparse.ArgumentParser()
    addOptions(parser, config)
    parser.add_argument("cwltool", type=str)
    parser.add_argument("cwljob", nargs=argparse.REMAINDER)

    # Will override the "jobStore" positional argument, enables
    # user to select jobStore or get a default from logic one below.
    parser.add_argument("--jobStore", type=str)
    parser.add_argument("--not-strict", action="store_true")
    parser.add_argument("--quiet", dest="logLevel", action="store_const",
                        const="ERROR")
    parser.add_argument("--basedir", type=str)
    parser.add_argument("--outdir", type=str, default=os.getcwd())
    parser.add_argument("--version", action='version', version=baseVersion)
    dockergroup = parser.add_mutually_exclusive_group()
    dockergroup.add_argument(
        "--user-space-docker-cmd",
        help="(Linux/OS X only) Specify a user space docker command (like "
        "udocker or dx-docker) that will be used to call 'pull' and 'run'")
    dockergroup.add_argument(
        "--singularity", action="store_true", default=False,
        help="[experimental] Use Singularity runtime for running containers. "
        "Requires Singularity v2.3.2+ and Linux with kernel version v3.18+ or "
        "with overlayfs support backported.")
    dockergroup.add_argument(
        "--no-container", action="store_true", help="Do not execute jobs in a "
        "Docker container, even when `DockerRequirement` "
        "is specified under `hints`.")
    parser.add_argument(
        "--preserve-environment", type=str, nargs='+',
        help="Preserve specified environment variables when running"
        " CommandLineTools", metavar=("VAR1 VAR2"), default=("PATH",),
        dest="preserve_environment")
    parser.add_argument(
        "--destBucket", type=str,
        help="Specify a cloud bucket endpoint for output files.")
    parser.add_argument(
        "--beta-dependency-resolvers-configuration", default=None)
    parser.add_argument("--beta-dependencies-directory", default=None)
    parser.add_argument(
        "--beta-use-biocontainers", default=None, action="store_true")
    parser.add_argument(
        "--beta-conda-dependencies", default=None, action="store_true")
    parser.add_argument("--tmpdir-prefix", type=Text,
                        help="Path prefix for temporary directories",
                        default="tmp")
    parser.add_argument("--tmp-outdir-prefix", type=Text,
                        help="Path prefix for intermediate output directories",
                        default="tmp")
    parser.add_argument(
        "--force-docker-pull", action="store_true", default=False,
        dest="force_docker_pull",
        help="Pull latest docker image even if it is locally present")
    parser.add_argument(
        "--no-match-user", action="store_true", default=False,
        help="Disable passing the current uid to `docker run --user`")

    # mkdtemp actually creates the directory, but
    # toil requires that the directory not exist,
    # so make it and delete it and allow
    # toil to create it again (!)
    workdir = tempfile.mkdtemp()
    os.rmdir(workdir)

    if args is None:
        args = sys.argv[1:]

    # we use workdir as jobStore:
    options = parser.parse_args([workdir] + args)

    # if tmpdir_prefix is not the default value, set workDir too
    if options.tmpdir_prefix != 'tmp':
        options.workDir = options.tmpdir_prefix

    if options.provisioner and not options.jobStore:
        raise NoSuchJobStoreException(
            'Please specify a jobstore with the --jobStore option when specifying a provisioner.')

    use_container = not options.no_container

    if options.logLevel:
        cwllogger.setLevel(options.logLevel)

    outdir = os.path.abspath(options.outdir)
    tmp_outdir_prefix = os.path.abspath(options.tmp_outdir_prefix)
    tmpdir_prefix = os.path.abspath(options.tmpdir_prefix)

    fileindex = {}
    existing = {}
    conf_file = getattr(options,
                        "beta_dependency_resolvers_configuration", None)
    use_conda_dependencies = getattr(options, "beta_conda_dependencies", None)
    job_script_provider = None
    if conf_file or use_conda_dependencies:
        dependencies_configuration = DependenciesConfiguration(options)
        job_script_provider = dependencies_configuration

    options.default_container = None
    runtime_context = cwltool.context.RuntimeContext(vars(options))
    runtime_context.find_default_container = functools.partial(
        find_default_container, options)
    runtime_context.workdir = workdir
    runtime_context.move_outputs = "leave"
    runtime_context.rm_tmpdir = False
    loading_context = cwltool.context.LoadingContext(vars(options))

    with Toil(options) as toil:
        if options.restart:
            outobj = toil.restart()
        else:
            loading_context.hints = [{
                "class": "ResourceRequirement",
                "coresMin": toil.config.defaultCores,
                "ramMin": toil.config.defaultMemory / (2**20),
                "outdirMin": toil.config.defaultDisk / (2**20),
                "tmpdirMin": 0
            }]
            loading_context.construct_tool_object = toil_make_tool
            loading_context.resolver = cwltool.resolver.tool_resolver
            loading_context.strict = not options.not_strict
            options.workflow = options.cwltool
            options.job_order = options.cwljob
            uri, tool_file_uri = cwltool.load_tool.resolve_tool_uri(
                options.cwltool, loading_context.resolver,
                loading_context.fetcher_constructor)
            options.tool_help = None
            options.debug = options.logLevel == "DEBUG"
            job_order_object, options.basedir, jobloader = \
                cwltool.main.load_job_order(
                    options, sys.stdin, loading_context.fetcher_constructor,
                    loading_context.overrides_list, tool_file_uri)
            document_loader, workflowobj, uri = \
                cwltool.load_tool.fetch_document(
                    uri, loading_context.resolver,
                    loading_context.fetcher_constructor)
            document_loader, avsc_names, processobj, metadata, uri = \
                cwltool.load_tool.validate_document(
                    document_loader, workflowobj, uri,
                    loading_context.enable_dev, loading_context.strict, False,
                    loading_context.fetcher_constructor, False,
                    loading_context.overrides_list,
                    do_validate=loading_context.do_validate)
            loading_context.overrides_list.extend(
                metadata.get("cwltool:overrides", []))
            try:
                tool = cwltool.load_tool.make_tool(
                    document_loader, avsc_names, metadata, uri,
                    loading_context)
            except cwltool.process.UnsupportedRequirement as err:
                logging.error(err)
                return 33
            runtime_context.secret_store = SecretStore()
            initialized_job_order = cwltool.main.init_job_order(
                job_order_object, options, tool, jobloader, sys.stdout,
                secret_store=runtime_context.secret_store)
            fs_access = cwltool.stdfsaccess.StdFsAccess(options.basedir)
            fill_in_defaults(
                tool.tool["inputs"], initialized_job_order, fs_access)

            def path_to_loc(obj):
                if "location" not in obj and "path" in obj:
                    obj["location"] = obj["path"]
                    del obj["path"]

            def import_files(tool):
                visit_class(tool, ("File", "Directory"), path_to_loc)
                visit_class(tool, ("File", ), functools.partial(
                    add_sizes, fs_access))
                normalizeFilesDirs(tool)
                adjustDirObjs(tool, functools.partial(
                    get_listing, fs_access, recursive=True))
                adjustFileObjs(tool, functools.partial(
                    uploadFile, toil.importFile, fileindex, existing,
                    skip_broken=True))

            tool.visit(import_files)

            for inp in tool.tool["inputs"]:
                def set_secondary(fileobj):
                    if isinstance(fileobj, Mapping) \
                            and fileobj.get("class") == "File":
                        if "secondaryFiles" not in fileobj:
                            fileobj["secondaryFiles"] = [
                                {"location": cwltool.builder.substitute(
                                    fileobj["location"], sf), "class": "File"}
                                for sf in inp["secondaryFiles"]]

                    if isinstance(fileobj, MutableSequence):
                        for entry in fileobj:
                            set_secondary(entry)

                if shortname(inp["id"]) in initialized_job_order \
                        and inp.get("secondaryFiles"):
                    set_secondary(initialized_job_order[shortname(inp["id"])])

            import_files(initialized_job_order)
            visitSteps(tool, import_files)

            try:
                runtime_context.use_container = use_container
                runtime_context.tmpdir = os.path.realpath(tmpdir_prefix)
                runtime_context.tmp_outdir_prefix = os.path.realpath(
                    tmp_outdir_prefix)
                runtime_context.job_script_provider = job_script_provider
                runtime_context.force_docker_pull = options.force_docker_pull
                runtime_context.no_match_user = options.no_match_user
                (wf1, _) = makeJob(tool, {}, None, runtime_context)
            except cwltool.process.UnsupportedRequirement as err:
                logging.error(err)
                return 33

            wf1.cwljob = initialized_job_order
            if isinstance(wf1, CWLJob):  # Clean up temporary directories only created with CWLJobs.
                wf1.addFollowOnFn(cleanTempDirs, wf1)
            outobj = toil.start(wf1)

        outobj = resolve_indirect(outobj)

        # Stage files. Specify destination bucket if specified in CLI
        # options. If destination bucket not passed in,
        # options.destBucket's value will be None.
        toilStageFiles(
            toil,
            outobj,
            outdir,
            fileindex,
            existing,
            export=True,
            destBucket=options.destBucket)

        if not options.destBucket:
            visit_class(outobj, ("File",), functools.partial(
                compute_checksums, cwltool.stdfsaccess.StdFsAccess("")))

        visit_class(outobj, ("File", ), MutationManager().unset_generation)
        stdout.write(json.dumps(outobj, indent=4))

    return 0
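The jobStore dance near the top of main() is worth isolating: Toil insists that the job store directory not yet exist, so the code asks mkdtemp for a unique name and immediately removes the directory. A self-contained sketch:

import os
import tempfile

workdir = tempfile.mkdtemp()       # creates a unique directory
os.rmdir(workdir)                  # remove it again; only the name is needed
print(os.path.exists(workdir))     # False; Toil will recreate it as the jobStore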
Example #45
    def __init__(self, step, cwljob):
        Job.__init__(self)
        self.step = step
        self.cwljob = cwljob
        self.valueFrom = {shortname(i["id"]): i["valueFrom"] for i in step.tool["inputs"] if "valueFrom" in i}
Example #46
    def run(self, file_store):
        cwljob = resolve_indirect(self.cwljob)

        # `promises` dict
        # from: each parameter (workflow input or step output)
        #   that may be used as a "source" for a step input workflow output
        #   parameter
        # to: the job that will produce that value.
        promises = {}

        # `jobs` dict from step id to job that implements that step.
        jobs = {}

        for inp in self.cwlwf.tool["inputs"]:
            promises[inp["id"]] = SelfJob(self, cwljob)

        alloutputs_fufilled = False
        while not alloutputs_fufilled:
            # Iteratively go over the workflow steps, scheduling jobs as their
            # dependencies can be fulfilled by upstream workflow inputs or
            # step outputs.  Loop exits when the workflow outputs
            # are satisfied.

            alloutputs_fufilled = True

            for step in self.cwlwf.steps:
                if step.tool["id"] not in jobs:
                    stepinputs_fufilled = True
                    for inp in step.tool["inputs"]:
                        if "source" in inp:
                            for s in aslist(inp["source"]):
                                if s not in promises:
                                    stepinputs_fufilled = False
                    if stepinputs_fufilled:
                        jobobj = {}

                        for inp in step.tool["inputs"]:
                            key = shortname(inp["id"])
                            if "source" in inp:
                                if inp.get("linkMerge") \
                                        or len(aslist(inp["source"])) > 1:
                                    linkMerge = inp.get(
                                        "linkMerge", "merge_nested")
                                    if linkMerge == "merge_nested":
                                        jobobj[key] = (
                                            MergeInputsNested(
                                                [(shortname(s),
                                                  promises[s].rv())
                                                 for s in aslist(
                                                     inp["source"])]))
                                    elif linkMerge == "merge_flattened":
                                        jobobj[key] = (
                                            MergeInputsFlattened(
                                                [(shortname(s),
                                                  promises[s].rv())
                                                 for s in aslist(
                                                      inp["source"])]))
                                    else:
                                        raise validate.ValidationException(
                                            "Unsupported linkMerge '%s'" %
                                            linkMerge)
                                else:
                                    inpSource = inp["source"]
                                    if isinstance(inpSource, MutableSequence):
                                        # It seems that an input source with a
                                        # '#' in the name will be returned as a
                                        # CommentedSeq list by the yaml parser.
                                        inpSource = str(inpSource[0])
                                    jobobj[key] = (shortname(inpSource),
                                                   promises[inpSource].rv())
                            if "default" in inp:
                                if key in jobobj:
                                    if isinstance(jobobj[key][1], Promise):
                                        d = copy.copy(inp["default"])
                                        jobobj[key] = DefaultWithSource(
                                            d, jobobj[key])
                                    else:
                                        if jobobj[key][1][
                                                jobobj[key][0]] is None:
                                            d = copy.copy(inp["default"])
                                            jobobj[key] = (
                                                "default", {"default": d})
                                else:
                                    d = copy.copy(inp["default"])
                                    jobobj[key] = ("default", {"default": d})

                            if "valueFrom" in inp \
                                    and "scatter" not in step.tool:
                                if key in jobobj:
                                    jobobj[key] = StepValueFrom(
                                        inp["valueFrom"], jobobj[key],
                                        self.cwlwf.requirements)
                                else:
                                    jobobj[key] = StepValueFrom(
                                        inp["valueFrom"], (
                                            "None", {"None": None}),
                                        self.cwlwf.requirements)

                        if "scatter" in step.tool:
                            wfjob = CWLScatter(step, IndirectDict(jobobj),
                                               self.runtime_context)
                            followOn = CWLGather(step, wfjob.rv())
                            wfjob.addFollowOn(followOn)
                        else:
                            (wfjob, followOn) = makeJob(
                                step.embedded_tool, IndirectDict(jobobj),
                                step.tool["inputs"],
                                self.runtime_context)

                        jobs[step.tool["id"]] = followOn

                        connected = False
                        for inp in step.tool["inputs"]:
                            for s in aslist(inp.get("source", [])):
                                if (isinstance(
                                        promises[s], (CWLJobWrapper, CWLGather)
                                              ) and
                                        not promises[s].hasFollowOn(wfjob)):
                                    promises[s].addFollowOn(wfjob)
                                    connected = True
                                if (not isinstance(
                                        promises[s], (CWLJobWrapper, CWLGather)
                                                  ) and
                                        not promises[s].hasChild(wfjob)):
                                    promises[s].addChild(wfjob)
                                    connected = True
                        if not connected:
                            # the workflow step has default inputs only & isn't
                            # connected to other jobs, so add it as child of
                            # this workflow.
                            self.addChild(wfjob)

                        for out in step.tool["outputs"]:
                            promises[out["id"]] = followOn

                for inp in step.tool["inputs"]:
                    for source in aslist(inp.get("source", [])):
                        if source not in promises:
                            alloutputs_fufilled = False

            # may need a test
            for out in self.cwlwf.tool["outputs"]:
                if "source" in out:
                    if out["source"] not in promises:
                        alloutputs_fufilled = False

        outobj = {}
        for out in self.cwlwf.tool["outputs"]:
            key = shortname(out["id"])
            if out.get("linkMerge") or len(aslist(out["outputSource"])) > 1:
                link_merge = out.get("linkMerge", "merge_nested")
                if link_merge == "merge_nested":
                    outobj[key] = (
                        MergeInputsNested(
                            [(shortname(s), promises[s].rv())
                             for s in aslist(out["outputSource"])]))
                elif link_merge == "merge_flattened":
                    outobj[key] = (
                        MergeInputsFlattened([
                            (shortname(s), promises[s].rv())
                            for s in aslist(out["source"])]))
                else:
                    raise validate.ValidationException(
                        "Unsupported linkMerge '{}'".format(link_merge))

            else:
                # A CommentedSeq of length one still appears here on rare
                # occasions (it is not clear why, from the CWL code). When it
                # does, it breaks execution with a non-hashable type
                # exception, so we simplify the list to its first (and only)
                # element.
                src = simplify_list(out["outputSource"])
                outobj[key] = (shortname(src), promises[src].rv())

        return IndirectDict(outobj)
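A hedged sketch of the two linkMerge behaviors resolved above, using plain lists in place of Toil promises: merge_nested keeps one entry per source, merge_flattened concatenates the source values.

sources = {"a": [1, 2], "b": [3]}
order = ["a", "b"]

merge_nested = [sources[s] for s in order]                 # [[1, 2], [3]]
merge_flattened = [x for s in order for x in sources[s]]   # [1, 2, 3]
print(merge_nested, merge_flattened)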
Example #47
def generateScriptForWorkflow(cwlwf, cwljob, outdir):
    promises = {}
    jobs = {}
    script = []

    outdirs = []

    print "> inputs for this workflow are: ", cwlwf.tool["inputs"]
    print "> steps in this workflow are: ", cwlwf.steps

    # need to confirm why this is neccesary
    for inp in cwlwf.tool["inputs"]:
        promises[inp["id"]] = (cwlwf, cwljob[shortname(inp["id"])])

    alloutputs_fufilled = False
    while not alloutputs_fufilled:
        # Iteratively go over the workflow steps, adding jobs to the script as their
        # dependencies are fulfilled by upstream workflow inputs or
        # step outputs.  Loop exits when the workflow outputs
        # are satisfied.

        alloutputs_fufilled = True

        progress = False
        for step in cwlwf.steps:
            if step.tool["id"] not in jobs:
                stepinputs_fufilled = True
                for inp in step.tool["inputs"]:
                    if "source" in inp and inp["source"] not in promises:
                        stepinputs_fufilled = False
                if stepinputs_fufilled:
                    jobobj = {}

                    # TODO: Handle multiple inbound links
                    # TODO: Handle scatter/gather
                    # (both are discussed in section 5.1.2 in CWL spec draft-2)

                    # script.append("# Run step %s" % step.tool["id"])

                    for inp in step.tool["inputs"]:
                        if "source" in inp:
                            jobobj[shortname(inp["id"])] = promises[inp["source"]][1]
                            # script.append("# depends on step %s" % promises[inp["source"]][0].tool["id"])
                        elif "default" in inp:
                            d = copy.copy(inp["default"])
                            jobobj[shortname(inp["id"])] = d

                    (wfjob, joboutdir, jobtmpdir) = generateScriptForTool(step.embedded_tool, jobobj, None)
                    outdirs.append(joboutdir)

                    jobs[step.tool["id"]] = True
                    # This line is where it generates the command together with arguments
                    # (from the function generateScriptForTool)
                    script.append(wfjob)

                    for out in step.tool["outputs"]:
                        for toolout in step.embedded_tool.tool["outputs"]:
                            if shortname(toolout["id"]) == shortname(out["id"]):
                                if toolout["type"] != "File":
                                    raise Exception("Only supports file outputs")
                                if glob_metacharacters(toolout["outputBinding"]["glob"]):
                                    raise Exception("Only support glob with concrete filename.")
                                promises[out["id"]] = (
                                    step,
                                    {
                                        "class": "File",
                                        "path": os.path.join(joboutdir, toolout["outputBinding"]["glob"]),
                                    },
                                )
                    progress = True

        for out in cwlwf.tool["outputs"]:
            if "source" in out:
                if out["source"] not in promises:
                    alloutputs_fufilled = False

        if not alloutputs_fufilled and not progress:
            raise Exception("Not making progress")

    #     outobj = {}
    #     script.append("# Move output files to the current directory")
    #
    #     for out in cwlwf.tool["outputs"]:
    #         f = promises[out["source"]][1]
    #         script.append("mv %s ." % (maybe_quote(f["path"])))
    #         f["path"] = os.path.basename(f["path"])
    #
    #         if f.get("secondaryFiles"):
    #             script.append("mv %s ." % (' '.join([maybe_quote(sf["path"]) for sf in f["secondaryFiles"]])))
    #             for sf in f["secondaryFiles"]:
    #                 sf["path"] = os.path.basename(sf["path"])
    #
    #         outobj[shortname(out["id"])] = f

    #     script.append("")
    #     script.append("# Clean up staging output directories")
    #     script.append("rm -r %s" % (' '.join([maybe_quote(od) for od in outdirs])))
    #     script.append("")

    #     script.append("# Generate final output object")
    #     script.append("echo '%s'" % json.dumps(outobj, indent=4))

    return "\n".join(script)
Example #48
    def collect_output(
            self,
            schema,  # type: Dict[Text, Any]
            builder,  # type: Builder
            outdir,  # type: Text
            fs_access,  # type: StdFsAccess
            compute_checksum=True  # type: bool
    ):
        # type: (...) -> Optional[Union[Dict[Text, Any], List[Union[Dict[Text, Any], Text]]]]
        result = []  # type: List[Any]
        empty_and_optional = False
        debug = LOGGER.isEnabledFor(logging.DEBUG)
        if "outputBinding" in schema:
            binding = schema["outputBinding"]
            globpatterns = []  # type: List[Text]

            revmap = partial(command_line_tool.revmap_file, builder, outdir)

            if "glob" in binding:
                with SourceLine(binding, "glob", WorkflowException, debug):
                    for glob in aslist(binding["glob"]):
                        glob = builder.do_eval(glob)
                        if glob:
                            globpatterns.extend(aslist(glob))

                    for glob in globpatterns:
                        if glob.startswith(outdir):
                            glob = glob[len(outdir) + 1:]
                        elif glob == ".":
                            glob = outdir
                        elif glob.startswith("/"):
                            raise WorkflowException(
                                "glob patterns must not start with '/'")
                        try:
                            prefix = fs_access.glob(outdir)
                            key = cmp_to_key(
                                cast(Callable[[Text, Text], int],
                                     locale.strcoll))

                            # Create the declared stdout/stderr file if the
                            # tool run did not produce it
                            if "stdout" in self.tool and "stderr" in self.tool \
                                    and glob in (self.tool["stdout"], self.tool["stderr"]):
                                filepath = Path(fs_access.join(outdir, glob))
                                if not filepath.is_file():
                                    Path(filepath).touch()

                            result.extend([{
                                "location": g,
                                "path": fs_access.join(
                                    builder.outdir, g[len(prefix[0]) + 1:]),
                                "basename": os.path.basename(g),
                                "nameroot": os.path.splitext(
                                    os.path.basename(g))[0],
                                "nameext": os.path.splitext(
                                    os.path.basename(g))[1],
                                "class": "File" if fs_access.isfile(g)
                                         else "Directory"
                            } for g in sorted(
                                fs_access.glob(fs_access.join(outdir, glob)),
                                key=key)])
                        except (OSError, IOError) as exc:
                            LOGGER.warning(Text(exc))
                        except Exception:
                            LOGGER.exception("Unexpected error from fs_access")
                            raise

                for files in result:
                    rfile = files.copy()
                    # TODO: This function raises an exception and seems to be
                    # related to Docker (which is not used here)
                    # revmap(rfile)
                    if files["class"] == "Directory":
                        load_listing = builder.loadListing or (
                            binding and binding.get("loadListing"))
                        if load_listing and load_listing != "no_listing":
                            get_listing(fs_access, files,
                                        (load_listing == "deep_listing"))
                    else:
                        with fs_access.open(rfile["location"], "rb") as f:
                            contents = b""
                            if binding.get("loadContents") or compute_checksum:
                                contents = f.read(CONTENT_LIMIT)
                            if binding.get("loadContents"):
                                files["contents"] = contents.decode("utf-8")
                            if compute_checksum:
                                checksum = hashlib.sha1()  # nosec: B303
                                while contents != b"":
                                    checksum.update(contents)
                                    contents = f.read(1024 * 1024)
                                files["checksum"] = \
                                    "sha1$%s" % checksum.hexdigest()
                            f.seek(0, 2)
                            file_size = f.tell()
                        files["size"] = file_size

            optional = False
            single = False
            if isinstance(schema["type"], list):
                if "null" in schema["type"]:
                    optional = True
                if "File" in schema["type"] or "Directory" in schema["type"]:
                    single = True
            elif schema["type"] == "File" or schema["type"] == "Directory":
                single = True

            if "outputEval" in binding:
                with SourceLine(binding, "outputEval", WorkflowException,
                                debug):
                    result = builder.do_eval(binding["outputEval"],
                                             context=result)

            if single:
                if not result and not optional:
                    with SourceLine(binding, "glob", WorkflowException, debug):
                        raise WorkflowException(
                            "Did not find output file with glob pattern: '{}'".
                            format(globpatterns))
                elif not result and optional:
                    pass
                elif isinstance(result, list):
                    if len(result) > 1:
                        raise WorkflowException(
                            "Multiple matches for output item that is a single file."
                        )
                    result = result[0]

            if "secondaryFiles" in schema:
                with SourceLine(schema, "secondaryFiles", WorkflowException,
                                debug):
                    for primary in aslist(result):
                        if isinstance(primary, dict):
                            primary.setdefault("secondaryFiles", [])
                            pathprefix = primary["path"][0:primary["path"].
                                                         rindex("/") + 1]
                            for file in aslist(schema["secondaryFiles"]):
                                if isinstance(
                                        file,
                                        dict) or "$(" in file or "${" in file:
                                    sfpath = builder.do_eval(file,
                                                             context=primary)
                                    subst = False
                                else:
                                    sfpath = file
                                    subst = True
                                for sfitem in aslist(sfpath):
                                    if isinstance(sfitem, str):
                                        if subst:
                                            sfitem = {
                                                "path":
                                                substitute(
                                                    primary["path"], sfitem)
                                            }
                                        else:
                                            sfitem = {
                                                "path": pathprefix + sfitem
                                            }
                                    if "path" in sfitem and "location" not in sfitem:
                                        revmap(sfitem)
                                    if fs_access.isfile(sfitem["location"]):
                                        sfitem["class"] = "File"
                                        primary["secondaryFiles"].append(
                                            sfitem)
                                    elif fs_access.isdir(sfitem["location"]):
                                        sfitem["class"] = "Directory"
                                        primary["secondaryFiles"].append(
                                            sfitem)

            if "format" in schema:
                for primary in aslist(result):
                    primary["format"] = builder.do_eval(schema["format"],
                                                        context=primary)

            # Ensure files point to local references outside of the run environment
            # TODO: revmap is disabled here as well (see above)
            # adjustFileObjs(result, revmap)

            if not result and optional:
                return None

        if not empty_and_optional and isinstance(
                schema["type"], dict) and schema["type"]["type"] == "record":
            out = {}
            for f in schema["type"]["fields"]:
                out[shortname(
                    f["name"])] = self.collect_output(  # type: ignore
                        f,
                        builder,
                        outdir,
                        fs_access,
                        compute_checksum=compute_checksum)
            return out
        return result
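How the basename/nameroot/nameext fields above are derived from a globbed path g, as a quick self-contained sketch:

import os

g = "/outdir/sample.tar.gz"
basename = os.path.basename(g)                  # 'sample.tar.gz'
nameroot, nameext = os.path.splitext(basename)  # ('sample.tar', '.gz')
print(basename, nameroot, nameext)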
Example #49
    def arvExecutor(self, tool, job_order, **kwargs):
        self.debug = kwargs.get("debug")

        if kwargs.get("quiet"):
            logger.setLevel(logging.WARN)
            logging.getLogger('arvados.arv-run').setLevel(logging.WARN)

        useruuid = self.api.users().current().execute()["uuid"]
        self.project_uuid = kwargs.get("project_uuid") if kwargs.get(
            "project_uuid") else useruuid
        self.pipeline = None

        if kwargs.get("create_template"):
            tmpl = RunnerTemplate(self, tool, job_order,
                                  kwargs.get("enable_reuse"))
            tmpl.save()
            # cwltool.main will write our return value to stdout.
            return tmpl.uuid

        if kwargs.get("submit"):
            runnerjob = RunnerJob(self, tool, job_order,
                                  kwargs.get("enable_reuse"))

        if not kwargs.get("submit") and "cwl_runner_job" not in kwargs:
            # Create pipeline for local run
            self.pipeline = self.api.pipeline_instances().create(
                body={
                    "owner_uuid": self.project_uuid,
                    "name": shortname(tool.tool["id"]),
                    "components": {},
                    "state": "RunningOnClient"
                }).execute(num_retries=self.num_retries)
            logger.info("Pipeline instance %s", self.pipeline["uuid"])

        if kwargs.get("submit") and not kwargs.get("wait"):
            runnerjob.run()
            return runnerjob.uuid

        events = arvados.events.subscribe(
            arvados.api('v1'), [["object_uuid", "is_a", "arvados#job"]],
            self.on_message)

        self.debug = kwargs.get("debug")
        self.ignore_docker_for_reuse = kwargs.get("ignore_docker_for_reuse")
        self.fs_access = CollectionFsAccess(kwargs["basedir"])

        kwargs["fs_access"] = self.fs_access
        kwargs["enable_reuse"] = kwargs.get("enable_reuse")

        kwargs["outdir"] = "$(task.outdir)"
        kwargs["tmpdir"] = "$(task.tmpdir)"

        if kwargs.get("conformance_test"):
            return cwltool.main.single_job_executor(tool, job_order, **kwargs)
        else:
            if kwargs.get("submit"):
                jobiter = iter((runnerjob, ))
            else:
                if "cwl_runner_job" in kwargs:
                    self.uuid = kwargs.get("cwl_runner_job").get('uuid')
                jobiter = tool.job(job_order,
                                   self.output_callback,
                                   docker_outdir="$(task.outdir)",
                                   **kwargs)

            try:
                self.cond.acquire()
                # Will continue to hold the lock for the duration of this code
                # except when in cond.wait(), at which point on_message can update
                # job state and process output callbacks.

                for runnable in jobiter:
                    if runnable:
                        runnable.run(**kwargs)
                    else:
                        if self.jobs:
                            self.cond.wait(1)
                        else:
                            logger.error(
                                "Workflow is deadlocked, no runnable jobs and not waiting on any pending jobs."
                            )
                            break

                while self.jobs:
                    self.cond.wait(1)

                events.close()
            except:
                if sys.exc_info()[0] is KeyboardInterrupt:
                    logger.error("Interrupted, marking pipeline as failed")
                else:
                    logger.error(
                        "Caught unhandled exception, marking pipeline as failed.  Error was: %s",
                        sys.exc_info()[0],
                        exc_info=(sys.exc_info()[1] if self.debug else False))
                if self.pipeline:
                    self.api.pipeline_instances().update(
                        uuid=self.pipeline["uuid"], body={
                            "state": "Failed"
                        }).execute(num_retries=self.num_retries)
            finally:
                self.cond.release()

            if self.final_output is None:
                raise cwltool.workflow.WorkflowException(
                    "Workflow did not return a result.")

            return self.final_output
Example #50
def set_secondary(fsaccess, builder, inputschema, secondaryspec, primary,
                  discovered):
    if isinstance(inputschema,
                  Sequence) and not isinstance(inputschema, basestring):
        # union type, collect all possible secondaryFiles
        for i in inputschema:
            set_secondary(fsaccess, builder, i, secondaryspec, primary,
                          discovered)
        return

    if isinstance(inputschema, basestring):
        sd = search_schemadef(inputschema,
                              reversed(builder.hints + builder.requirements))
        if sd:
            inputschema = sd
        else:
            return

    if "secondaryFiles" in inputschema:
        # set secondaryFiles, may be inherited by compound types.
        secondaryspec = inputschema["secondaryFiles"]

    if (isinstance(inputschema["type"], (Mapping, Sequence))
            and not isinstance(inputschema["type"], basestring)):
        # compound type (union, array, record)
        set_secondary(fsaccess, builder, inputschema["type"], secondaryspec,
                      primary, discovered)

    elif (inputschema["type"] == "record" and isinstance(primary, Mapping)):
        #
        # record type, find secondary files associated with fields.
        #
        for f in inputschema["fields"]:
            p = primary.get(shortname(f["name"]))
            if p:
                set_secondary(fsaccess, builder, f, secondaryspec, p,
                              discovered)

    elif (inputschema["type"] == "array" and isinstance(primary, Sequence)):
        #
        # array type, find secondary files of elements
        #
        for p in primary:
            set_secondary(fsaccess, builder, {"type": inputschema["items"]},
                          secondaryspec, p, discovered)

    elif (inputschema["type"] == "File" and secondaryspec
          and isinstance(primary, Mapping) and primary.get("class") == "File"
          and "secondaryFiles" not in primary):
        #
        # Found a file, check for secondaryFiles
        #
        specs = []
        primary["secondaryFiles"] = secondaryspec
        for i, sf in enumerate(aslist(secondaryspec)):
            if builder.cwlVersion == "v1.0":
                pattern = builder.do_eval(sf, context=primary)
            else:
                pattern = builder.do_eval(sf["pattern"], context=primary)
            if pattern is None:
                continue
            if isinstance(pattern, list):
                specs.extend(pattern)
            elif isinstance(pattern, dict):
                specs.append(pattern)
            elif isinstance(pattern, str):
                if builder.cwlVersion == "v1.0":
                    specs.append({"pattern": pattern, "required": True})
                else:
                    specs.append({
                        "pattern": pattern,
                        "required": sf.get("required")
                    })
            else:
                raise SourceLine(
                    primary["secondaryFiles"], i,
                    validate.ValidationException).makeError(
                        "Expression must return list, object, string or null")

        found = []
        for i, sf in enumerate(specs):
            if isinstance(sf, dict):
                if sf.get("class") == "File":
                    pattern = None
                    if sf.get("location") is None:
                        raise SourceLine(
                            primary["secondaryFiles"], i,
                            validate.ValidationException).makeError(
                                "File object is missing 'location': %s" % sf)
                    sfpath = sf["location"]
                    required = True
                else:
                    pattern = sf["pattern"]
                    required = sf.get("required")
            elif isinstance(sf, str):
                pattern = sf
                required = True
            else:
                raise SourceLine(
                    primary["secondaryFiles"], i,
                    validate.ValidationException).makeError(
                        "Expression must return list, object, string or null")

            if pattern is not None:
                sfpath = substitute(primary["location"], pattern)

            required = builder.do_eval(required, context=primary)

            if fsaccess.exists(sfpath):
                if pattern is not None:
                    found.append({"location": sfpath, "class": "File"})
                else:
                    found.append(sf)
            elif required:
                raise SourceLine(
                    primary["secondaryFiles"], i,
                    validate.ValidationException).makeError(
                        "Required secondary file '%s' does not exist" % sfpath)

        primary["secondaryFiles"] = cmap(found)
        if discovered is not None:
            discovered[primary["location"]] = primary["secondaryFiles"]
    elif inputschema["type"] not in primitive_types_set:
        set_secondary(fsaccess, builder, inputschema["type"], secondaryspec,
                      primary, discovered)
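
The substitute call above expands each secondaryFiles pattern against the primary file's location. Under the usual CWL semantics a leading "^" strips one extension before the remainder is appended; a sketch of that rule (an assumption-level re-implementation, not cwltool's own substitute):

def substitute_sketch(value, replace):
    # Each leading "^" strips one extension from the primary path;
    # the rest of the pattern is appended verbatim.
    if replace.startswith("^") and "." in value:
        return substitute_sketch(value[:value.rindex(".")], replace[1:])
    return value + replace

assert substitute_sketch("reads.bam", ".bai") == "reads.bam.bai"
assert substitute_sketch("reads.fastq.gz", "^^.fa") == "reads.fa"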
Example #51
    def run(self, fileStore):
        cwljob = resolve_indirect(self.cwljob)

        # `promises` dict
        # from: each parameter (workflow input or step output)
        #   that may be used as a "source" for a step input or workflow
        #   output parameter
        # to: the job that will produce that value.
        promises = {}

        # `jobs` dict from step id to job that implements that step.
        jobs = {}

        for inp in self.cwlwf.tool["inputs"]:
            promises[inp["id"]] = SelfJob(self, cwljob)

        alloutputs_fulfilled = False
        while not alloutputs_fulfilled:
            # Iteratively go over the workflow steps, scheduling jobs as their
            # dependencies can be fulfilled by upstream workflow inputs or
            # step outputs.  The loop exits when the workflow outputs
            # are satisfied.

            alloutputs_fulfilled = True

            for step in self.cwlwf.steps:
                if step.tool["id"] not in jobs:
                    stepinputs_fulfilled = True
                    for inp in step.tool["inputs"]:
                        if "source" in inp:
                            for s in aslist(inp["source"]):
                                if s not in promises:
                                    stepinputs_fulfilled = False
                    if stepinputs_fulfilled:
                        jobobj = {}

                        for inp in step.tool["inputs"]:
                            key = shortname(inp["id"])
                            if "source" in inp:
                                if inp.get("linkMerge") or len(aslist(inp["source"])) > 1:
                                    linkMerge = inp.get("linkMerge", "merge_nested")
                                    if linkMerge == "merge_nested":
                                        jobobj[key] = (
                                            MergeInputsNested([(shortname(s), promises[s].rv())
                                                               for s in aslist(inp["source"])]))
                                    elif linkMerge == "merge_flattened":
                                        jobobj[key] = (
                                            MergeInputsFlattened([(shortname(s), promises[s].rv())
                                                                  for s in aslist(inp["source"])]))
                                    else:
                                        raise validate.ValidationException(
                                            "Unsupported linkMerge '%s'" % linkMerge)
                                else:
                                    jobobj[key] = (shortname(inp["source"]),
                                                   promises[inp["source"]].rv())
                            elif "default" in inp:
                                d = copy.copy(inp["default"])
                                jobobj[key] = ("default", {"default": d})

                            if "valueFrom" in inp and "scatter" not in step.tool:
                                if key in jobobj:
                                    jobobj[key] = StepValueFrom(inp["valueFrom"],
                                                                jobobj[key],
                                                                self.cwlwf.requirements)
                                else:
                                    jobobj[key] = StepValueFrom(inp["valueFrom"],
                                                                ("None", {"None": None}),
                                                                self.cwlwf.requirements)

                        if "scatter" in step.tool:
                            wfjob = CWLScatter(step, IndirectDict(jobobj), **self.executor_options)
                            followOn = CWLGather(step, wfjob.rv())
                            wfjob.addFollowOn(followOn)
                        else:
                            (wfjob, followOn) = makeJob(step.embedded_tool, IndirectDict(jobobj),
                                                        step_inputs=step.tool["inputs"],
                                                        **self.executor_options)

                        jobs[step.tool["id"]] = followOn

                        connected = False
                        for inp in step.tool["inputs"]:
                            for s in aslist(inp.get("source", [])):
                                if not promises[s].hasChild(wfjob):
                                    promises[s].addChild(wfjob)
                                    connected = True
                        if not connected:
                            # workflow step has default inputs only, isn't connected to other jobs,
                            # so add it as child of workflow.
                            self.addChild(wfjob)

                        for out in step.tool["outputs"]:
                            promises[out["id"]] = followOn

                for inp in step.tool["inputs"]:
                    for s in aslist(inp.get("source", [])):
                        if s not in promises:
                            alloutputs_fulfilled = False

            # Workflow-level outputs must have their sources fulfilled too (may need a test).
            for out in self.cwlwf.tool["outputs"]:
                if "source" in out:
                    if out["source"] not in promises:
                        alloutputs_fulfilled = False

        outobj = {}
        for out in self.cwlwf.tool["outputs"]:
            outobj[shortname(out["id"])] = (shortname(out["outputSource"]), promises[out["outputSource"]].rv())

        return IndirectDict(outobj)
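
The scheduling in run() converges by fixed-point iteration: keep rescanning the steps until every "source" has a promise behind it. A toy version of that loop over plain dicts (the shapes are hypothetical, not Toil's API):

def schedule(steps, workflow_inputs):
    # steps maps a step id to the source ids it consumes; a step's own
    # id stands in for its outputs.
    promises = set(workflow_inputs)
    scheduled = []
    progress = True
    while progress:
        progress = False
        for step_id, sources in steps.items():
            if step_id not in promises and all(s in promises for s in sources):
                scheduled.append(step_id)   # all inputs resolvable: schedule it
                promises.add(step_id)       # its outputs become sources
                progress = True
    return scheduled

print(schedule({"align": ["reads"], "sort": ["align"]}, ["reads"]))
# -> ['align', 'sort']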
Example #52
    def collect_output_ports(
        self,
        ports,  # type: Set[Dict[Text, Any]]
        builder,  # type: Builder
        outdir,  # type: Text
        compute_checksum=True,  # type: bool
        jobname="",  # type: Text
        readers=None  # type: Dict[Text, Any]
    ):  # type: (...) -> OutputPorts
        ret = {}  # type: OutputPorts
        debug = LOGGER.isEnabledFor(logging.DEBUG)
        try:
            fs_access = builder.make_fs_access(outdir)
            custom_output = fs_access.join(outdir, "cwl.output.json")
            if fs_access.exists(custom_output):
                with fs_access.open(custom_output, "r") as f:
                    ret = json.load(f)
                if debug:
                    LOGGER.debug(u"Raw output from %s: %s", custom_output,
                                 json.dumps(ret, indent=4))
            else:
                for i, port in enumerate(ports):

                    def make_workflow_exception(msg):
                        return WorkflowException(
                            u"Error collecting output for parameter '%s':\n%s"
                            % (shortname(port["id"]), msg))

                    with SourceLine(ports, i, make_workflow_exception, debug):
                        fragment = shortname(port["id"])
                        ret[fragment] = self.collect_output(
                            port,
                            builder,
                            outdir,
                            fs_access,
                            compute_checksum=compute_checksum)
            if ret:
                # revmap = partial(command_line_tool.revmap_file, builder, outdir)
                adjustDirObjs(ret, trim_listing)

                # TODO: Attempt to avoid a crash because the revmap function is not
                #       functional (intended for docker usage only?)
                # visit_class(ret, ("File", "Directory"), cast(Callable[[Any], Any], revmap))
                visit_class(ret, ("File", "Directory"),
                            command_line_tool.remove_path)
                normalizeFilesDirs(ret)
                visit_class(
                    ret, ("File", "Directory"),
                    partial(command_line_tool.check_valid_locations,
                            fs_access))

                if compute_checksum:
                    adjustFileObjs(ret, partial(compute_checksums, fs_access))

            validate.validate_ex(self.names.get_name("outputs_record_schema",
                                                     ""),
                                 ret,
                                 strict=False,
                                 logger=LOGGER)
            if ret is not None and builder.mutation_manager is not None:
                adjustFileObjs(ret, builder.mutation_manager.set_generation)
            return ret if ret is not None else {}
        except validate.ValidationException as exc:
            raise WorkflowException(
                "Error validating output record: {!s}\nIn:\n{}".format(
                    exc, json.dumps(ret, indent=4)))
        finally:
            if builder.mutation_manager and readers:
                for reader in readers.values():
                    builder.mutation_manager.release_reader(jobname, reader)
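
collect_output_ports first looks for a cwl.output.json written by the tool itself and, when present, short-circuits per-port collection by loading it directly. A self-contained illustration of that convention (the file content is only an example):

import json
import os
import tempfile

outdir = tempfile.mkdtemp()
# A tool may bypass per-port collection by emitting the whole output object.
with open(os.path.join(outdir, "cwl.output.json"), "w") as f:
    json.dump({"report": {"class": "File", "location": "report.txt"}}, f)

with open(os.path.join(outdir, "cwl.output.json")) as f:
    ret = json.load(f)
print(ret["report"]["location"])  # -> report.txt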
Example #53
def main(args=None):
    parser = ArgumentParser()
    Job.Runner.addToilOptions(parser)
    parser.add_argument("cwltool", type=str)
    parser.add_argument("cwljob", type=str)

    # Will override the "jobStore" positional argument, enabling the
    # user to select a jobStore or fall back to the default chosen below.
    parser.add_argument("--jobStore", type=str)
    parser.add_argument("--conformance-test", action="store_true")
    parser.add_argument("--no-container", action="store_true")
    parser.add_argument("--quiet", action="store_true")
    parser.add_argument("--basedir", type=str)
    parser.add_argument("--outdir", type=str, default=os.getcwd())
    parser.add_argument("--version", action='version', version=version)

    # mkdtemp actually creates the directory, but
    # toil requires that the directory not exist,
    # so create it, delete it, and let
    # toil create it again (!)
    workdir = tempfile.mkdtemp()
    os.rmdir(workdir)

    if args is None:
        args = sys.argv[1:]

    options = parser.parse_args([workdir] + args)

    if options.quiet:
        options.logLevel = "WARNING"

    uri = "file://" + os.path.abspath(options.cwljob)

    t = cwltool.main.load_tool(options.cwltool, False, True,
                               cwltool.workflow.defaultMakeTool,
                               True)

    if type(t) == int:
        # load_tool signalled an error with an exit code.
        return t

    if options.conformance_test:
        loader = schema_salad.ref_resolver.Loader({})
    else:
        jobloaderctx = {"path": {"@type": "@id"}, "format": {"@type": "@id"}}
        jobloaderctx.update(t.metadata.get("$namespaces", {}))
        loader = schema_salad.ref_resolver.Loader(jobloaderctx)

    job, _ = loader.resolve_ref(uri)

    try:
        checkRequirements(t.tool)
    except Exception as e:
        logging.error(e)
        return 33

    jobobj = {}
    for inp in t.tool["inputs"]:
        if shortname(inp["id"]) in job:
            pass
        elif shortname(inp["id"]) not in job and "default" in inp:
            job[shortname(inp["id"])] = copy.copy(inp["default"])
        elif shortname(inp["id"]) not in job and inp["type"][0] == "null":
            pass
        else:
            raise validate.ValidationException("Missing inputs `%s`" % shortname(inp["id"]))

    adjustFiles(job, lambda x: x.replace("file://", ""))

    if options.conformance_test:
        sys.stdout.write(json.dumps(
            cwltool.main.single_job_executor(t, job, options.basedir, options,
                                             conformance_test=True), indent=4))
        return 0

    if not options.basedir:
        options.basedir = os.path.dirname(os.path.abspath(options.cwljob))

    outdir = options.outdir

    staging = StageJob(t, job, os.path.dirname(os.path.abspath(options.cwljob)))

    (wf1, wf2) = makeJob(t, staging.rv())

    staging.addFollowOn(wf1)
    wf2.addFollowOn(FinalJob(wf2.rv(), outdir))

    Job.Runner.startToil(staging, options)

    with open(os.path.join(outdir, "cwl.output.json"), "r") as f:
        sys.stdout.write(f.read())

    return 0
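
Both this main() and the one in the next example reserve a unique job-store path with mkdtemp, delete it because toil insists on creating the directory itself, and prepend it to the argument list so it fills the jobStore positional. A stripped-down sketch of that wiring (wf.cwl and job.yml are placeholders):

import os
import tempfile
from argparse import ArgumentParser

parser = ArgumentParser()
parser.add_argument("jobStore", type=str)  # filled by the prepended workdir
parser.add_argument("cwltool", type=str)
parser.add_argument("cwljob", type=str)

workdir = tempfile.mkdtemp()  # reserve a unique name...
os.rmdir(workdir)             # ...but let toil create the directory itself

options = parser.parse_args([workdir, "wf.cwl", "job.yml"])
print(options.jobStore == workdir)  # -> True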
Example #54
def main(args=None, stdout=sys.stdout):
    parser = argparse.ArgumentParser()
    Job.Runner.addToilOptions(parser)
    parser.add_argument("cwltool", type=str)
    parser.add_argument("cwljob", nargs=argparse.REMAINDER)

    # Will override the "jobStore" positional argument, enabling the
    # user to select a jobStore or fall back to the default chosen below.
    parser.add_argument("--jobStore", type=str)
    parser.add_argument("--not-strict", action="store_true")
    parser.add_argument("--no-container", action="store_true")
    parser.add_argument("--quiet", dest="logLevel", action="store_const", const="ERROR")
    parser.add_argument("--basedir", type=str)
    parser.add_argument("--outdir", type=str, default=os.getcwd())
    parser.add_argument("--version", action='version', version=baseVersion)
    parser.add_argument("--preserve-environment", type=str, nargs='+',
                    help="Preserve specified environment variables when running CommandLineTools",
                    metavar=("VAR1 VAR2"),
                    default=("PATH",),
                    dest="preserve_environment")

    # mkdtemp actually creates the directory, but
    # toil requires that the directory not exist,
    # so create it, delete it, and let
    # toil create it again (!)
    workdir = tempfile.mkdtemp()
    os.rmdir(workdir)

    if args is None:
        args = sys.argv[1:]

    options = parser.parse_args([workdir] + args)

    use_container = not options.no_container

    setLoggingFromOptions(options)
    if options.logLevel:
        cwllogger.setLevel(options.logLevel)

    outdir = os.path.abspath(options.outdir)
    fileindex = {}
    existing = {}

    with Toil(options) as toil:
        if options.restart:
            outobj = toil.restart()
        else:
            useStrict = not options.not_strict
            try:
                t = cwltool.load_tool.load_tool(options.cwltool, toilMakeTool,
                                                kwargs={
                                                    "hints": [{
                                                        "class": "ResourceRequirement",
                                                        "coresMin": toil.config.defaultCores,
                                                        "ramMin": toil.config.defaultMemory / (2**20),
                                                        "outdirMin": toil.config.defaultDisk / (2**20),
                                                        "tmpdirMin": 0
                                                    }]},
                                                resolver=cwltool.resolver.tool_resolver,
                                                strict=useStrict)
                unsupportedRequirementsCheck(t.requirements)
            except cwltool.process.UnsupportedRequirement as e:
                logging.error(e)
                return 33

            if type(t) == int:
                return t

            options.workflow = options.cwltool
            options.job_order = options.cwljob
            options.tool_help = None
            options.debug = options.logLevel == "DEBUG"
            job = cwltool.main.load_job_order(options, t, sys.stdin)

            if type(job) == int:
                return job

            job, options.basedir = job

            fillInDefaults(t.tool["inputs"], job)

            def pathToLoc(p):
                if "location" not in p and "path" in p:
                    p["location"] = p["path"]
                    del p["path"]

            def importFiles(tool):
                visit_class(tool, ("File", "Directory"), pathToLoc)
                normalizeFilesDirs(tool)
                adjustDirObjs(tool, functools.partial(get_listing,
                                                      cwltool.stdfsaccess.StdFsAccess(""),
                                                      recursive=True))
                adjustFileObjs(tool, functools.partial(uploadFile,
                                                       toil.importFile,
                                                       fileindex, existing, skip_broken=True))

            t.visit(importFiles)

            for inp in t.tool["inputs"]:
                def setSecondary(fileobj):
                    if isinstance(fileobj, dict) and fileobj.get("class") == "File":
                        if "secondaryFiles" not in fileobj:
                            fileobj["secondaryFiles"] = [{
                                "location": cwltool.builder.substitute(fileobj["location"], sf), "class": "File"}
                                                         for sf in inp["secondaryFiles"]]

                    if isinstance(fileobj, list):
                        for e in fileobj:
                            setSecondary(e)

                if shortname(inp["id"]) in job and inp.get("secondaryFiles"):
                    setSecondary(job[shortname(inp["id"])])

            importFiles(job)
            visitSteps(t, importFiles)

            make_fs_access = functools.partial(ToilFsAccess, fileStore=toil)
            try:
                (wf1, wf2) = makeJob(t, {}, use_container=use_container,
                                     preserve_environment=options.preserve_environment,
                                     tmpdir=os.path.realpath(outdir), workdir=options.workDir)
            except cwltool.process.UnsupportedRequirement as e:
                logging.error(e)
                return 33

            wf1.cwljob = job
            outobj = toil.start(wf1)

        outobj = resolve_indirect(outobj)

        toilStageFiles(toil, outobj, outdir, fileindex, existing, True)

        visit_class(outobj, ("File",), functools.partial(compute_checksums, cwltool.stdfsaccess.StdFsAccess("")))

        stdout.write(json.dumps(outobj, indent=4))

    return 0
Example #55
def upload_workflow(arvRunner,
                    tool,
                    job_order,
                    project_uuid,
                    uuid=None,
                    submit_runner_ram=0,
                    name=None,
                    merged_map=None):

    packed = packed_workflow(arvRunner, tool, merged_map)

    adjustDirObjs(job_order, trim_listing)
    adjustFileObjs(job_order, trim_anonymous_location)
    adjustDirObjs(job_order, trim_anonymous_location)

    main = [p for p in packed["$graph"] if p["id"] == "#main"][0]
    for inp in main["inputs"]:
        sn = shortname(inp["id"])
        if sn in job_order:
            inp["default"] = job_order[sn]

    if not name:
        name = tool.tool.get("label", os.path.basename(tool.tool["id"]))

    upload_dependencies(arvRunner, name, tool.doc_loader, packed,
                        tool.tool["id"], False)

    if submit_runner_ram:
        hints = main.get("hints", [])
        found = False
        for h in hints:
            if h["class"] == "http://arvados.org/cwl#WorkflowRunnerResources":
                h["ramMin"] = submit_runner_ram
                found = True
                break
        if not found:
            hints.append({
                "class": "http://arvados.org/cwl#WorkflowRunnerResources",
                "ramMin": submit_runner_ram
            })
        main["hints"] = hints

    body = {
        "workflow": {
            "name": name,
            "description": tool.tool.get("doc", ""),
            "definition": json.dumps(packed,
                                     sort_keys=True,
                                     indent=4,
                                     separators=(',', ': '))
        }
    }
    if project_uuid:
        body["workflow"]["owner_uuid"] = project_uuid

    if uuid:
        call = arvRunner.api.workflows().update(uuid=uuid, body=body)
    else:
        call = arvRunner.api.workflows().create(body=body)
    return call.execute(num_retries=arvRunner.num_retries)["uuid"]
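
The submit_runner_ram branch is an upsert over the hints list: update an existing WorkflowRunnerResources hint if one is present, otherwise append a new one. The same logic isolated as a helper (the helper name is made up; the hint class URL is the one used above):

def set_runner_ram(hints, ram_min):
    # Update the Arvados runner-resources hint in place, or add it.
    for h in hints:
        if h["class"] == "http://arvados.org/cwl#WorkflowRunnerResources":
            h["ramMin"] = ram_min
            return hints
    hints.append({"class": "http://arvados.org/cwl#WorkflowRunnerResources",
                  "ramMin": ram_min})
    return hints

print(set_runner_ram([], 2048))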
Example #56
    def job(self, joborder, output_callback, **kwargs):
        kwargs["work_api"] = self.work_api
        req, _ = self.get_requirement("http://arvados.org/cwl#RunInSingleContainer")
        if req:
            with SourceLine(self.tool, None, WorkflowException):
                if "id" not in self.tool:
                    raise WorkflowException("%s object must have 'id'" % (self.tool["class"]))
            document_loader, workflowobj, uri = (self.doc_loader, self.doc_loader.fetch(self.tool["id"]), self.tool["id"])

            with Perf(metrics, "subworkflow upload_deps"):
                upload_dependencies(self.arvrunner,
                                    os.path.basename(joborder.get("id", "#")),
                                    document_loader,
                                    joborder,
                                    joborder.get("id", "#"),
                                    False)

                if self.wf_pdh is None:
                    workflowobj["requirements"] = dedup_reqs(self.requirements)
                    workflowobj["hints"] = dedup_reqs(self.hints)

                    packed = pack(document_loader, workflowobj, uri, self.metadata)

                    upload_dependencies(self.arvrunner,
                                        kwargs.get("name", ""),
                                        document_loader,
                                        packed,
                                        uri,
                                        False)

            with Perf(metrics, "subworkflow adjust"):
                joborder_keepmount = copy.deepcopy(joborder)

                def keepmount(obj):
                    with SourceLine(obj, None, WorkflowException):
                        if "location" not in obj:
                            raise WorkflowException("%s object is missing required 'location' field: %s" % (obj["class"], obj))
                    with SourceLine(obj, "location", WorkflowException):
                        if obj["location"].startswith("keep:"):
                            obj["location"] = "/keep/" + obj["location"][5:]
                            if "listing" in obj:
                                del obj["listing"]
                        elif obj["location"].startswith("_:"):
                            del obj["location"]
                        else:
                            raise WorkflowException("Location is not a keep reference or a literal: '%s'" % obj["location"])

                adjustFileObjs(joborder_keepmount, keepmount)
                adjustDirObjs(joborder_keepmount, keepmount)

                if self.wf_pdh is None:
                    adjustFileObjs(packed, keepmount)
                    adjustDirObjs(packed, keepmount)
                    self.wf_pdh = upload_workflow_collection(self.arvrunner, shortname(self.tool["id"]), packed)

            wf_runner = cmap({
                "class": "CommandLineTool",
                "baseCommand": "cwltool",
                "inputs": self.tool["inputs"],
                "outputs": self.tool["outputs"],
                "stdout": "cwl.output.json",
                "requirements": workflowobj["requirements"]+[
                    {
                    "class": "InitialWorkDirRequirement",
                    "listing": [{
                            "entryname": "workflow.cwl",
                            "entry": {
                                "class": "File",
                                "location": "keep:%s/workflow.cwl" % self.wf_pdh
                            }
                        }, {
                            "entryname": "cwl.input.yml",
                            "entry": json.dumps(joborder_keepmount, indent=2, sort_keys=True, separators=(',',': ')).replace("\\", "\\\\").replace('$(', '\$(').replace('${', '\${')
                        }]
                }],
                "hints": workflowobj["hints"],
                "arguments": ["--no-container", "--move-outputs", "--preserve-entire-environment", "workflow.cwl#main", "cwl.input.yml"]
            })
            kwargs["loader"] = self.doc_loader
            kwargs["avsc_names"] = self.doc_schema
            return ArvadosCommandTool(self.arvrunner, wf_runner, **kwargs).job(joborder, output_callback, **kwargs)
        else:
            return super(ArvadosWorkflow, self).job(joborder, output_callback, **kwargs)
Example #57
def discover_default_secondary_files(obj):
    discover_secondary_files(obj["inputs"],
                             {shortname(t["id"]): t["default"] for t in obj["inputs"] if "default" in t},
                             discovered)
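
The helper above only pairs each input's default with its short name before delegating to discover_secondary_files. For example, with a simplified stand-in for shortname and a hypothetical inputs list:

def shortname_sketch(inputid):
    # simplified stand-in for cwltool's shortname()
    return inputid.split("/")[-1].split("#")[-1]

inputs = [
    {"id": "#main/ref", "default": {"class": "File", "location": "ref.fa"}},
    {"id": "#main/threads"},
]
defaults = {shortname_sketch(t["id"]): t["default"] for t in inputs if "default" in t}
print(defaults)  # -> {'ref': {'class': 'File', 'location': 'ref.fa'}}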
Example #58
    def job(
        self,
        joborder,  # type: Dict[Text, AnyValue]
        output_callbacks,  # type: Callable[[Any, Any], Any]
        runtime_context,  # type: RuntimeContext
    ):  # type: (...) -> Generator[Union[JobBase, CallbackJob], None, None]
        """
        Workflow job generator.

        :param joborder: inputs of the job submission
        :param output_callbacks: method to fetch step outputs and corresponding step details
        :param runtime_context: configs about execution environment
        :return:
        """
        require_prefix = ""
        if self.metadata["cwlVersion"] == "v1.0":
            require_prefix = "http://commonwl.org/cwltool#"

        jobname = uniquename(runtime_context.name
                             or shortname(self.tool.get("id", "job")))

        # outdir must be served by the EMS because downstream steps need access to upstream steps' outputs
        weaver_out_dir = get_wps_output_dir(get_settings(app))
        runtime_context.outdir = tempfile.mkdtemp(prefix=getdefault(
            runtime_context.tmp_outdir_prefix, DEFAULT_TMP_PREFIX),
                                                  dir=weaver_out_dir)
        builder = self._init_job(joborder, runtime_context)

        # `jobname` is the step name and `joborder` is the actual step inputs
        wps_workflow_job = WpsWorkflowJob(
            builder, builder.job, self.requirements, self.hints, jobname,
            self.get_job_process_definition(jobname, joborder, self.tool),
            self.tool["outputs"])
        wps_workflow_job.prov_obj = self.prov_obj
        wps_workflow_job.successCodes = self.tool.get("successCodes")
        wps_workflow_job.temporaryFailCodes = self.tool.get(
            "temporaryFailCodes")
        wps_workflow_job.permanentFailCodes = self.tool.get(
            "permanentFailCodes")

        # TODO: taken from command_line_tool.py; maybe this could let us use the revmap if required at all
        # reffiles = copy.deepcopy(builder.files)
        # builder.pathmapper = self.make_path_mapper(
        #     reffiles, builder.stagedir, runtimeContext, True)
        # builder.requirements = wps_workflow_job.requirements

        wps_workflow_job.outdir = builder.outdir
        wps_workflow_job.tmpdir = builder.tmpdir
        wps_workflow_job.stagedir = builder.stagedir

        readers = {}  # type: Dict[Text, Any]
        timelimit = self.get_requirement(require_prefix + "TimeLimit")[0]
        if timelimit:
            with SourceLine(timelimit, "timelimit",
                            validate.ValidationException):
                wps_workflow_job.timelimit = builder.do_eval(
                    timelimit["timelimit"])
                if not isinstance(wps_workflow_job.timelimit,
                                  int) or wps_workflow_job.timelimit < 0:
                    raise Exception(
                        "timelimit must be an integer >= 0, got: %s" %
                        wps_workflow_job.timelimit)

        wps_workflow_job.collect_outputs = partial(
            self.collect_output_ports,
            self.tool["outputs"],
            builder,
            compute_checksum=getdefault(runtime_context.compute_checksum,
                                        True),
            jobname=jobname,
            readers=readers)
        wps_workflow_job.output_callback = output_callbacks

        yield wps_workflow_job
Example #59
    def job(self, joborder, output_callback, runtimeContext):

        builder = make_builder(joborder, self.hints, self.requirements, runtimeContext)
        runtimeContext = set_cluster_target(self.tool, self.arvrunner, builder, runtimeContext)

        req, _ = self.get_requirement("http://arvados.org/cwl#RunInSingleContainer")
        if not req:
            return super(ArvadosWorkflow, self).job(joborder, output_callback, runtimeContext)

        # RunInSingleContainer is true

        with SourceLine(self.tool, None, WorkflowException, logger.isEnabledFor(logging.DEBUG)):
            if "id" not in self.tool:
                raise WorkflowException("%s object must have 'id'" % (self.tool["class"]))

        discover_secondary_files(self.arvrunner.fs_access, builder,
                                 self.tool["inputs"], joborder)

        with Perf(metrics, "subworkflow upload_deps"):
            upload_dependencies(self.arvrunner,
                                os.path.basename(joborder.get("id", "#")),
                                self.doc_loader,
                                joborder,
                                joborder.get("id", "#"),
                                False)

            if self.wf_pdh is None:
                packed = pack(self.loadingContext, self.tool["id"], loader=self.doc_loader)

                for p in packed["$graph"]:
                    if p["id"] == "#main":
                        p["requirements"] = dedup_reqs(self.requirements)
                        p["hints"] = dedup_reqs(self.hints)

                def visit(item):
                    if "requirements" in item:
                        item["requirements"] = [i for i in item["requirements"] if i["class"] != "DockerRequirement"]
                    for t in ("hints", "requirements"):
                        if t not in item:
                            continue
                        for req in item[t]:
                            if req["class"] == "ResourceRequirement":
                                dyn = False
                                for k in max_res_pars + sum_res_pars:
                                    if k in req:
                                        if isinstance(req[k], basestring):
                                            if item["id"] == "#main":
                                                # only the top-level requirements/hints may contain expressions
                                                self.dynamic_resource_req.append(req)
                                                dyn = True
                                                break
                                            else:
                                                with SourceLine(req, k, WorkflowException):
                                                    raise WorkflowException("Non-top-level ResourceRequirement in single container cannot have expressions")
                                if not dyn:
                                    self.static_resource_req.append(req)

                visit_class(packed["$graph"], ("Workflow", "CommandLineTool"), visit)

                if self.static_resource_req:
                    self.static_resource_req = [get_overall_res_req(self.static_resource_req)]

                upload_dependencies(self.arvrunner,
                                    runtimeContext.name,
                                    self.doc_loader,
                                    packed,
                                    self.tool["id"],
                                    False)

                # Discover files/directories referenced by the
                # workflow (mainly "default" values)
                visit_class(packed, ("File", "Directory"), self.wf_reffiles.append)


        if self.dynamic_resource_req:
            # Evaluate dynamic resource requirements using current builder
            rs = copy.copy(self.static_resource_req)
            for dyn_rs in self.dynamic_resource_req:
                eval_req = {"class": "ResourceRequirement"}
                for a in max_res_pars + sum_res_pars:
                    if a in dyn_rs:
                        eval_req[a] = builder.do_eval(dyn_rs[a])
                rs.append(eval_req)
            job_res_reqs = [get_overall_res_req(rs)]
        else:
            job_res_reqs = self.static_resource_req

        with Perf(metrics, "subworkflow adjust"):
            joborder_resolved = copy.deepcopy(joborder)
            joborder_keepmount = copy.deepcopy(joborder)

            reffiles = []
            visit_class(joborder_keepmount, ("File", "Directory"), reffiles.append)

            mapper = ArvPathMapper(self.arvrunner, reffiles+self.wf_reffiles, runtimeContext.basedir,
                                   "/keep/%s",
                                   "/keep/%s/%s")

            # For containers API, we need to make sure any extra
            # referenced files (i.e. referenced by the workflow but
            # not in the inputs) are included in the mounts.
            if self.wf_reffiles:
                runtimeContext = runtimeContext.copy()
                runtimeContext.extra_reffiles = copy.deepcopy(self.wf_reffiles)

            def keepmount(obj):
                remove_redundant_fields(obj)
                with SourceLine(obj, None, WorkflowException, logger.isEnabledFor(logging.DEBUG)):
                    if "location" not in obj:
                        raise WorkflowException("%s object is missing required 'location' field: %s" % (obj["class"], obj))
                with SourceLine(obj, "location", WorkflowException, logger.isEnabledFor(logging.DEBUG)):
                    if obj["location"].startswith("keep:"):
                        obj["location"] = mapper.mapper(obj["location"]).target
                        if "listing" in obj:
                            del obj["listing"]
                    elif obj["location"].startswith("_:"):
                        del obj["location"]
                    else:
                        raise WorkflowException("Location is not a keep reference or a literal: '%s'" % obj["location"])

            visit_class(joborder_keepmount, ("File", "Directory"), keepmount)

            def resolved(obj):
                if obj["location"].startswith("keep:"):
                    obj["location"] = mapper.mapper(obj["location"]).resolved

            visit_class(joborder_resolved, ("File", "Directory"), resolved)

            if self.wf_pdh is None:
                adjustFileObjs(packed, keepmount)
                adjustDirObjs(packed, keepmount)
                self.wf_pdh = upload_workflow_collection(self.arvrunner, shortname(self.tool["id"]), packed)

        self.loadingContext = self.loadingContext.copy()
        self.loadingContext.metadata = self.loadingContext.metadata.copy()
        self.loadingContext.metadata["http://commonwl.org/cwltool#original_cwlVersion"] = "v1.0"

        if len(job_res_reqs) == 1:
            # RAM request needs to be at least 128 MiB or the workflow
            # runner itself won't run reliably.
            if job_res_reqs[0].get("ramMin", 1024) < 128:
                job_res_reqs[0]["ramMin"] = 128

        arguments = ["--no-container", "--move-outputs", "--preserve-entire-environment", "workflow.cwl", "cwl.input.yml"]
        if runtimeContext.debug:
            arguments.insert(0, '--debug')

        wf_runner = cmap({
            "class": "CommandLineTool",
            "baseCommand": "cwltool",
            "inputs": self.tool["inputs"],
            "outputs": self.tool["outputs"],
            "stdout": "cwl.output.json",
            "requirements": self.requirements+job_res_reqs+[
                {"class": "InlineJavascriptRequirement"},
                {
                "class": "InitialWorkDirRequirement",
                "listing": [{
                        "entryname": "workflow.cwl",
                        "entry": '$({"class": "File", "location": "keep:%s/workflow.cwl"})' % self.wf_pdh
                    }, {
                        "entryname": "cwl.input.yml",
                        "entry": json.dumps(joborder_keepmount, indent=2, sort_keys=True, separators=(',',': ')).replace("\\", "\\\\").replace('$(', '\$(').replace('${', '\${')
                    }]
            }],
            "hints": self.hints,
            "arguments": arguments,
            "id": "#"
        })
        return ArvadosCommandTool(self.arvrunner, wf_runner, self.loadingContext).job(joborder_resolved, output_callback, runtimeContext)
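
The cwl.input.yml entry above is embedded as a literal InitialWorkDir file, so backslashes are doubled and $( / ${ are escaped to keep the inner cwltool run from evaluating them as expressions. A short demonstration of that replace chain on a dummy job order:

import json

joborder = {"msg": "keep $(this) and ${that} literal"}
entry = (json.dumps(joborder, indent=2, sort_keys=True, separators=(',', ': '))
         .replace("\\", "\\\\")
         .replace("$(", "\\$(")
         .replace("${", "\\${"))
print(entry)
# prints the JSON with \$(this) and \${that}, which cwltool leaves alone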
Example #60
    def run(self, fileStore):
        cwljob = resolve_indirect(self.cwljob)

        # `promises` dict
        # from: each parameter (workflow input or step output)
        #   that may be used as a "source" for a step input or workflow
        #   output parameter
        # to: the job that will produce that value.
        promises = {}

        # `jobs` dict from step id to job that implements that step.
        jobs = {}

        for inp in self.cwlwf.tool["inputs"]:
            promises[inp["id"]] = SelfJob(self, cwljob)

        alloutputs_fulfilled = False
        while not alloutputs_fulfilled:
            # Iteratively go over the workflow steps, scheduling jobs as their
            # dependencies can be fulfilled by upstream workflow inputs or
            # step outputs.  The loop exits when the workflow outputs
            # are satisfied.

            alloutputs_fulfilled = True

            for step in self.cwlwf.steps:
                if step.tool["id"] not in jobs:
                    stepinputs_fulfilled = True
                    for inp in step.tool["inputs"]:
                        if "source" in inp:
                            for s in aslist(inp["source"]):
                                if s not in promises:
                                    stepinputs_fulfilled = False
                    if stepinputs_fulfilled:
                        jobobj = {}

                        for inp in step.tool["inputs"]:
                            key = shortname(inp["id"])
                            if "source" in inp:
                                if inp.get("linkMerge") or len(aslist(inp["source"])) > 1:
                                    linkMerge = inp.get("linkMerge", "merge_nested")
                                    if linkMerge == "merge_nested":
                                        jobobj[key] = (
                                            MergeInputsNested([(shortname(s), promises[s].rv())
                                                               for s in aslist(inp["source"])]))
                                    elif linkMerge == "merge_flattened":
                                        jobobj[key] = (
                                            MergeInputsFlattened([(shortname(s), promises[s].rv())
                                                                  for s in aslist(inp["source"])]))
                                    else:
                                        raise validate.ValidationException(
                                            "Unsupported linkMerge '%s'" % linkMerge)
                                else:
                                    jobobj[key] = (shortname(inp["source"]),
                                                   promises[inp["source"]].rv())
                            elif "default" in inp:
                                d = copy.copy(inp["default"])
                                jobobj[key] = ("default", {"default": d})

                            if "valueFrom" in inp and "scatter" not in step.tool:
                                if key in jobobj:
                                    jobobj[key] = StepValueFrom(inp["valueFrom"],
                                                                jobobj[key],
                                                                self.cwlwf.requirements)
                                else:
                                    jobobj[key] = StepValueFrom(inp["valueFrom"],
                                                                ("None", {"None": None}),
                                                                self.cwlwf.requirements)

                        if "scatter" in step.tool:
                            wfjob = CWLScatter(step, IndirectDict(jobobj), **self.executor_options)
                            followOn = CWLGather(step, wfjob.rv())
                            wfjob.addFollowOn(followOn)
                        else:
                            (wfjob, followOn) = makeJob(step.embedded_tool, IndirectDict(jobobj),
                                                        **self.executor_options)

                        jobs[step.tool["id"]] = followOn

                        connected = False
                        for inp in step.tool["inputs"]:
                            for s in aslist(inp.get("source", [])):
                                if not promises[s].hasChild(wfjob):
                                    promises[s].addChild(wfjob)
                                    connected = True
                        if not connected:
                            # workflow step has default inputs only, isn't connected to other jobs,
                            # so add it as child of workflow.
                            self.addChild(wfjob)

                        for out in step.tool["outputs"]:
                            promises[out["id"]] = followOn

                for inp in step.tool["inputs"]:
                    for s in aslist(inp.get("source", [])):
                        if s not in promises:
                            alloutputs_fulfilled = False

            # Workflow-level outputs must have their sources fulfilled too (may need a test).
            for out in self.cwlwf.tool["outputs"]:
                if "source" in out:
                    if out["source"] not in promises:
                        alloutputs_fulfilled = False

        outobj = {}
        for out in self.cwlwf.tool["outputs"]:
            outobj[shortname(out["id"])] = (shortname(out["outputSource"]), promises[out["outputSource"]].rv())

        return IndirectDict(outobj)