Example #1
def toilStageFiles(fileStore, cwljob, outdir, index, existing, export):
    """Copy input files out of the global file store and update location and
    path."""

    jobfiles = []  # type: List[Dict[Text, Any]]
    collectFilesAndDirs(cwljob, jobfiles)
    pm = ToilPathMapper(jobfiles, "", outdir, separateDirs=False, stage_listing=True)
    for f, p in pm.items():
        if not p.staged:
            continue
        if not os.path.exists(os.path.dirname(p.target)):
            os.makedirs(os.path.dirname(p.target), 0o0755)
        if p.type == "File":
            fileStore.exportFile(p.resolved[7:], "file://" + p.target)
        elif p.type == "Directory" and not os.path.exists(p.target):
            os.makedirs(p.target, 0o0755)
        elif p.type == "CreateFile":
            with open(p.target, "wb") as n:
                n.write(p.resolved.encode("utf-8"))

    def _check_adjust(f):
        f["location"] = schema_salad.ref_resolver.file_uri(pm.mapper(f["location"])[1])
        if "contents" in f:
            del f["contents"]
        return f

    visit_class(cwljob, ("File", "Directory"), _check_adjust)
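All of these examples lean on the same traversal primitive. The following is a minimal, hypothetical re-implementation of that pattern (an assumption for illustration, not the cwltool/schema-salad source) showing what callbacks such as _check_adjust are applied to:

# Illustrative sketch: walk a nested dict/list structure and apply `op` to
# every mapping whose "class" field is one of the requested class names.
def visit_class_sketch(rec, cls, op):
    if isinstance(rec, dict):
        if rec.get("class") in cls:
            op(rec)
        for value in rec.values():
            visit_class_sketch(value, cls, op)
    elif isinstance(rec, list):
        for entry in rec:
            visit_class_sketch(entry, cls, op)

# Mirroring _check_adjust above: drop inline "contents" from every File.
job = {"out": {"class": "File", "location": "file:///tmp/result.txt", "contents": "x"}}
visit_class_sketch(job, ("File", "Directory"), lambda f: f.pop("contents", None))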
Example #2
def returndeps(
        obj,  # type: Optional[Mapping[Text, Any]]
        document_loader,  # type: Loader
        stdout,  # type: Union[TextIO, StreamWriter]
        relative_deps,  # type: bool
        uri,  # type: Text
        basedir=None  # type: Text
):  # type: (...) -> Text
    """Return a JSON representation of the dependencies of the CWL document."""
    deps = {"class": "File", "location": uri}  # type: Dict[Text, Any]

    def loadref(base, uri):
        return document_loader.fetch(document_loader.fetcher.urljoin(
            base, uri))

    sfs = scandeps(basedir if basedir else uri, obj, {"$import", "run"},
                   {"$include", "$schemas", "location"}, loadref)
    if sfs:
        deps["secondaryFiles"] = sfs

    if relative_deps:
        if relative_deps == "primary":
            base = basedir if basedir else os.path.dirname(
                uri_file_path(str(uri)))
        elif relative_deps == "cwd":
            base = os.getcwd()
        else:
            raise Exception(u"Unknown relative_deps %s" % relative_deps)

        visit_class(deps, ("File", "Directory"),
                    functools.partial(make_relative, base))

    return json_dumps(deps, indent=4)
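A small, self-contained sketch of the functools.partial plus visit_class pattern used with make_relative above; make_relative_sketch is a hypothetical stand-in, not cwltool's helper:

import functools
import os

def make_relative_sketch(base, obj):
    # Rewrite file:// locations relative to the chosen base directory.
    loc = obj.get("location", "")
    if loc.startswith("file://"):
        obj["location"] = os.path.relpath(loc[len("file://"):], base)

dep = {"class": "File", "location": "file:///data/wf/tool.cwl"}
functools.partial(make_relative_sketch, "/data")(dep)
print(dep["location"])  # wf/tool.cwl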
Example #3
 def importFiles(tool):
     visit_class(tool, ("File", "Directory"), pathToLoc)
     normalizeFilesDirs(tool)
     adjustDirObjs(tool, functools.partial(get_listing,
                                           cwltool.stdfsaccess.StdFsAccess(""),
                                           recursive=True))
     adjustFileObjs(tool, functools.partial(uploadFile,
                                            toil.importFile,
                                            fileindex, existing, skip_broken=True))
Example #4
 def import_files(tool):
     visit_class(tool, ("File", "Directory"), path_to_loc)
     visit_class(tool, ("File", ), functools.partial(
         add_sizes, fs_access))
     normalizeFilesDirs(tool)
     adjustDirObjs(tool, functools.partial(
         get_listing, fs_access, recursive=True))
     adjustFileObjs(tool, functools.partial(
         uploadFile, toil.importFile, fileindex, existing,
         skip_broken=True))
Example #5
    def collect_output_ports(self,
                             ports,                  # type: Set[Dict[Text, Any]]
                             builder,                # type: Builder
                             outdir,                 # type: Text
                             compute_checksum=True,  # type: bool
                             jobname="",             # type: Text
                             readers=None            # type: Dict[Text, Any]
                             ):                      # type: (...) -> OutputPorts
        ret = {}  # type: OutputPorts
        debug = LOGGER.isEnabledFor(logging.DEBUG)
        try:
            fs_access = builder.make_fs_access(outdir)
            custom_output = fs_access.join(outdir, "cwl.output.json")
            if fs_access.exists(custom_output):
                with fs_access.open(custom_output, "r") as f:
                    ret = json.load(f)
                if debug:
                    LOGGER.debug(u"Raw output from %s: %s", custom_output, json.dumps(ret, indent=4))
            else:
                for i, port in enumerate(ports):
                    def make_workflow_exception(msg):
                        return WorkflowException(
                            u"Error collecting output for parameter '%s':\n%s"
                            % (shortname(port["id"]), msg))
                    with SourceLine(ports, i, make_workflow_exception, debug):
                        fragment = shortname(port["id"])
                        ret[fragment] = self.collect_output(port, builder, outdir, fs_access,
                                                            compute_checksum=compute_checksum)
            if ret:
                # revmap = partial(command_line_tool.revmap_file, builder, outdir)
                adjustDirObjs(ret, trim_listing)

                # TODO: Attempt to avoid a crash because the revmap function is not functional
                #       (intended for Docker usage only?)
                # visit_class(ret, ("File", "Directory"), cast(Callable[[Any], Any], revmap))
                visit_class(ret, ("File", "Directory"), command_line_tool.remove_path)
                normalizeFilesDirs(ret)
                visit_class(ret, ("File", "Directory"), partial(command_line_tool.check_valid_locations, fs_access))

                if compute_checksum:
                    adjustFileObjs(ret, partial(compute_checksums, fs_access))

            validate.validate_ex(
                self.names.get_name("outputs_record_schema", ""), ret,
                strict=False, logger=LOGGER)
            if ret is not None and builder.mutation_manager is not None:
                adjustFileObjs(ret, builder.mutation_manager.set_generation)
            return ret if ret is not None else {}
        except validate.ValidationException as exc:
            raise WorkflowException("Error validating output record: {!s}\nIn:\n{}"
                                    .format(exc, json.dumps(ret, indent=4)))
        finally:
            if builder.mutation_manager and readers:
                for reader in readers.values():
                    builder.mutation_manager.release_reader(jobname, reader)
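A hedged sketch of a compute_checksums-style callback, applied with adjustFileObjs as in the method above; add_checksum and open_fn are illustrative names rather than the real cwltool API:

import functools
import hashlib

def add_checksum(open_fn, fileobj):
    # Annotate a File object with a checksum and size, in the spirit of
    # cwltool's compute_checksums helper.
    with open_fn(fileobj["path"], "rb") as handle:
        data = handle.read()
    fileobj["checksum"] = "sha1$" + hashlib.sha1(data).hexdigest()
    fileobj["size"] = len(data)

# Applied the same way collect_output_ports applies compute_checksums:
#   adjustFileObjs(ret, functools.partial(add_checksum, fs_access.open))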
Example #6
def toilStageFiles(fileStore,
                   cwljob,
                   outdir,
                   index,
                   existing,
                   export,
                   destBucket=None):
    """Copy input files out of the global file store and update location and
    path."""

    jobfiles = []  # type: List[Dict[Text, Any]]
    collectFilesAndDirs(cwljob, jobfiles)
    pm = ToilPathMapper(jobfiles,
                        "",
                        outdir,
                        separateDirs=False,
                        stage_listing=True)
    for f, p in pm.items():
        if not p.staged:
            continue

        # Deal with bucket exports
        if destBucket:
            # Directories don't need to be created if we're exporting to
            # a bucket
            if p.type == "File":
                # Remove the staging directory from the filepath and
                # form the destination URL
                unstageTargetPath = p.target[len(outdir):]
                destUrl = '/'.join(
                    s.strip('/') for s in [destBucket, unstageTargetPath])

                fileStore.exportFile(p.resolved[7:], destUrl)

            continue

        if not os.path.exists(os.path.dirname(p.target)):
            os.makedirs(os.path.dirname(p.target), 0o0755)
        if p.type == "File":
            fileStore.exportFile(p.resolved[7:], "file://" + p.target)
        elif p.type == "Directory" and not os.path.exists(p.target):
            os.makedirs(p.target, 0o0755)
        elif p.type == "CreateFile":
            with open(p.target, "wb") as n:
                n.write(p.resolved.encode("utf-8"))

    def _check_adjust(f):
        f["location"] = schema_salad.ref_resolver.file_uri(
            pm.mapper(f["location"])[1])
        if "contents" in f:
            del f["contents"]
        return f

    visit_class(cwljob, ("File", "Directory"), _check_adjust)
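The bucket-export branch above builds the destination URL by trimming the local staging directory off the mapped target and joining the remainder onto destBucket. A standalone illustration with made-up values:

outdir = "/tmp/staging"
target = "/tmp/staging/results/out.txt"
destBucket = "s3://my-bucket/run-42/"  # hypothetical example value

unstageTargetPath = target[len(outdir):]
destUrl = '/'.join(s.strip('/') for s in [destBucket, unstageTargetPath])
print(destUrl)  # s3://my-bucket/run-42/results/out.txt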
Example #7
 def visit_default(obj):
     remove = [False]
     def ensure_default_location(f):
         if "location" not in f and "path" in f:
             f["location"] = f["path"]
             del f["path"]
         if "location" in f and not arvrunner.fs_access.exists(f["location"]):
             # Doesn't exist, remove from list of dependencies to upload
             sc[:] = [x for x in sc if x["location"] != f["location"]]
             # Delete "default" from workflowobj
             remove[0] = True
     visit_class(obj["default"], ("File", "Directory"), ensure_default_location)
     if remove[0]:
         del obj["default"]
Example #8
 def capture_default(obj):
     remove = [False]
     def add_default(f):
         if "location" not in f and "path" in f:
             f["location"] = f["path"]
             del f["path"]
         if "location" in f and not arvrunner.fs_access.exists(f["location"]):
             # Remove from sc
             sc[:] = [x for x in sc if x["location"] != f["location"]]
             # Delete "default" from workflowobj
             remove[0] = True
     visit_class(obj["default"], ("File", "Directory"), add_default)
     if remove[0]:
         del obj["default"]
Example #9
    def arvados_job_spec(self, debug=False):
        """Create an Arvados job specification for this workflow.

        The returned dict can be used to create a job (i.e., passed as
        the +body+ argument to jobs().create()), or as a component in
        a pipeline template or pipeline instance.
        """

        if self.embedded_tool.tool["id"].startswith("keep:"):
            self.job_order["cwl:tool"] = self.embedded_tool.tool["id"][5:]
        else:
            packed = packed_workflow(self.arvrunner, self.embedded_tool,
                                     self.merged_map)
            wf_pdh = upload_workflow_collection(self.arvrunner, self.name,
                                                packed)
            self.job_order["cwl:tool"] = "%s/workflow.cwl#main" % wf_pdh

        adjustDirObjs(self.job_order, trim_listing)
        visit_class(self.job_order, ("File", "Directory"),
                    trim_anonymous_location)
        visit_class(self.job_order, ("File", "Directory"),
                    remove_redundant_fields)

        if self.output_name:
            self.job_order["arv:output_name"] = self.output_name

        if self.output_tags:
            self.job_order["arv:output_tags"] = self.output_tags

        self.job_order["arv:enable_reuse"] = self.enable_reuse

        if self.on_error:
            self.job_order["arv:on_error"] = self.on_error

        if debug:
            self.job_order["arv:debug"] = True

        return {
            "script": "cwl-runner",
            "script_version": "master",
            "minimum_script_version":
            "570509ab4d2ef93d870fd2b1f2eab178afb1bad9",
            "repository": "arvados",
            "script_parameters": self.job_order,
            "runtime_constraints": {
                "docker_image": arvados_jobs_image(self.arvrunner,
                                                   self.jobs_image),
                "min_ram_mb_per_node": self.submit_runner_ram
            }
        }
Example #10
    def visit_default(obj):
        remove = [False]

        def ensure_default_location(fileobj):
            if "location" not in fileobj and "path" in fileobj:
                fileobj["location"] = fileobj["path"]
                del fileobj["path"]
            if "location" in fileobj \
                    and not ftp_access.exists(fileobj["location"]):
                # Delete "default" from workflowobj
                remove[0] = True
        visit_class(obj["default"], ("File", "Directory"),
                    ensure_default_location)
        if remove[0]:
            del obj["default"]
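The remove = [False] idiom in these visit_default variants is a closure workaround: in Python 2-compatible code the inner callback cannot rebind an outer local, so it mutates a one-element list instead. A compact, self-contained illustration (exists is a stand-in for the fs_access/ftp_access check):

def prune_missing_default(obj, exists):
    remove = [False]

    def check(fileobj):
        if "location" in fileobj and not exists(fileobj["location"]):
            remove[0] = True  # signal the outer scope through the list cell

    check(obj["default"])
    if remove[0]:
        del obj["default"]

step = {"default": {"class": "File", "location": "file:///missing.txt"}}
prune_missing_default(step, lambda loc: False)
print(step)  # {} -- the dangling default was dropped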
Example #11
    def arvados_job_spec(self, debug=False):
        """Create an Arvados job specification for this workflow.

        The returned dict can be used to create a job (i.e., passed as
        the +body+ argument to jobs().create()), or as a component in
        a pipeline template or pipeline instance.
        """

        if self.embedded_tool.tool["id"].startswith("keep:"):
            self.job_order["cwl:tool"] = self.embedded_tool.tool["id"][5:]
        else:
            packed = packed_workflow(self.arvrunner, self.embedded_tool, self.merged_map)
            wf_pdh = upload_workflow_collection(self.arvrunner, self.name, packed)
            self.job_order["cwl:tool"] = "%s/workflow.cwl#main" % wf_pdh

        adjustDirObjs(self.job_order, trim_listing)
        visit_class(self.job_order, ("File", "Directory"), trim_anonymous_location)
        visit_class(self.job_order, ("File", "Directory"), remove_redundant_fields)

        if self.output_name:
            self.job_order["arv:output_name"] = self.output_name

        if self.output_tags:
            self.job_order["arv:output_tags"] = self.output_tags

        self.job_order["arv:enable_reuse"] = self.enable_reuse

        if self.on_error:
            self.job_order["arv:on_error"] = self.on_error

        if debug:
            self.job_order["arv:debug"] = True

        return {
            "script": "cwl-runner",
            "script_version": "master",
            "minimum_script_version": "570509ab4d2ef93d870fd2b1f2eab178afb1bad9",
            "repository": "arvados",
            "script_parameters": self.job_order,
            "runtime_constraints": {
                "docker_image": arvados_jobs_image(self.arvrunner, self.jobs_image),
                "min_ram_mb_per_node": self.submit_runner_ram
            }
        }
Example #12
    def job(self, joborder, output_callback, **kwargs):
        kwargs["work_api"] = self.work_api
        req, _ = self.get_requirement("http://arvados.org/cwl#RunInSingleContainer")
        if req:
            with SourceLine(self.tool, None, WorkflowException, logger.isEnabledFor(logging.DEBUG)):
                if "id" not in self.tool:
                    raise WorkflowException("%s object must have 'id'" % (self.tool["class"]))
            document_loader, workflowobj, uri = (self.doc_loader, self.doc_loader.fetch(self.tool["id"]), self.tool["id"])

            discover_secondary_files(self.tool["inputs"], joborder)

            with Perf(metrics, "subworkflow upload_deps"):
                upload_dependencies(self.arvrunner,
                                    os.path.basename(joborder.get("id", "#")),
                                    document_loader,
                                    joborder,
                                    joborder.get("id", "#"),
                                    False)

                if self.wf_pdh is None:
                    workflowobj["requirements"] = dedup_reqs(self.requirements)
                    workflowobj["hints"] = dedup_reqs(self.hints)

                    packed = pack(document_loader, workflowobj, uri, self.metadata)

                    upload_dependencies(self.arvrunner,
                                        kwargs.get("name", ""),
                                        document_loader,
                                        packed,
                                        uri,
                                        False)

            with Perf(metrics, "subworkflow adjust"):
                joborder_resolved = copy.deepcopy(joborder)
                joborder_keepmount = copy.deepcopy(joborder)

                reffiles = []
                visit_class(joborder_keepmount, ("File", "Directory"), lambda x: reffiles.append(x))

                mapper = ArvPathMapper(self.arvrunner, reffiles, kwargs["basedir"],
                                 "/keep/%s",
                                 "/keep/%s/%s",
                                 **kwargs)

                def keepmount(obj):
                    remove_redundant_fields(obj)
                    with SourceLine(obj, None, WorkflowException, logger.isEnabledFor(logging.DEBUG)):
                        if "location" not in obj:
                            raise WorkflowException("%s object is missing required 'location' field: %s" % (obj["class"], obj))
                    with SourceLine(obj, "location", WorkflowException, logger.isEnabledFor(logging.DEBUG)):
                        if obj["location"].startswith("keep:"):
                            obj["location"] = mapper.mapper(obj["location"]).target
                            if "listing" in obj:
                                del obj["listing"]
                        elif obj["location"].startswith("_:"):
                            del obj["location"]
                        else:
                            raise WorkflowException("Location is not a keep reference or a literal: '%s'" % obj["location"])

                visit_class(joborder_keepmount, ("File", "Directory"), keepmount)

                def resolved(obj):
                    if obj["location"].startswith("keep:"):
                        obj["location"] = mapper.mapper(obj["location"]).resolved

                visit_class(joborder_resolved, ("File", "Directory"), resolved)

                if self.wf_pdh is None:
                    adjustFileObjs(packed, keepmount)
                    adjustDirObjs(packed, keepmount)
                    self.wf_pdh = upload_workflow_collection(self.arvrunner, shortname(self.tool["id"]), packed)

            wf_runner = cmap({
                "class": "CommandLineTool",
                "baseCommand": "cwltool",
                "inputs": self.tool["inputs"],
                "outputs": self.tool["outputs"],
                "stdout": "cwl.output.json",
                "requirements": self.requirements+[
                    {
                    "class": "InitialWorkDirRequirement",
                    "listing": [{
                            "entryname": "workflow.cwl",
                            "entry": {
                                "class": "File",
                                "location": "keep:%s/workflow.cwl" % self.wf_pdh
                            }
                        }, {
                            "entryname": "cwl.input.yml",
                            "entry": json.dumps(joborder_keepmount, indent=2, sort_keys=True, separators=(',',': ')).replace("\\", "\\\\").replace('$(', '\$(').replace('${', '\${')
                        }]
                }],
                "hints": self.hints,
                "arguments": ["--no-container", "--move-outputs", "--preserve-entire-environment", "workflow.cwl#main", "cwl.input.yml"],
                "id": "#"
            })
            kwargs["loader"] = self.doc_loader
            kwargs["avsc_names"] = self.doc_schema
            return ArvadosCommandTool(self.arvrunner, wf_runner, **kwargs).job(joborder_resolved, output_callback, **kwargs)
        else:
            return super(ArvadosWorkflow, self).job(joborder, output_callback, **kwargs)
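The cwl.input.yml entry above escapes the serialized job order so that CWL parameter references survive as literals inside the wrapper tool. A standalone sketch of that escaping chain (values are made up):

import json

joborder = {"pattern": "$(inputs.sample).txt", "dir": "C:\\data"}
serialized = (json.dumps(joborder, indent=2, sort_keys=True, separators=(',', ': '))
              .replace("\\", "\\\\")      # escape backslashes first
              .replace('$(', '\\$(')      # keep $(...) references literal
              .replace('${', '\\${'))     # keep ${...} expressions literal
print(serialized)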
Example #13
def upload_dependencies_ftp(document_loader, workflowobj, uri, loadref_run,
                            remote_storage_url, ftp_access):
    """
    Upload the dependencies of the workflowobj document to an FTP location.

    Does an in-place update of references in "workflowobj".
    Use scandeps to find $import, $include, $schemas, run, File and Directory
    fields that represent external references.
    If workflowobj has an "id" field, this will reload the document to ensure
    it is scanning the raw document prior to preprocessing.

    Adapted from:
    https://github.com/curoverse/arvados/blob/2b0b06579199967eca3d44d955ad64195d2db3c3/sdk/cwl/arvados_cwl/runner.py#L83
    """
    loaded = set()

    def loadref(base, ref):
        joined = document_loader.fetcher.urljoin(base, ref)
        defrg, _ = urllib.parse.urldefrag(joined)
        if defrg not in loaded:
            loaded.add(defrg)
            # Use fetch_text to get raw file (before preprocessing).
            text = document_loader.fetch_text(defrg)
            if isinstance(text, bytes):
                text_io = StringIO(text.decode('utf-8'))
            else:
                text_io = StringIO(text)
            return yaml.safe_load(text_io)
        else:
            return {}

    if loadref_run:
        loadref_fields = set(("$import", "run"))
    else:
        loadref_fields = set(("$import", ))
    scanobj = workflowobj
    if "id" in workflowobj:
        # Need raw file content (before preprocessing) to ensure
        # that external references in $include and $mixin are captured.
        scanobj = loadref("", workflowobj["id"])
    scandeps(uri,
             scanobj,
             loadref_fields,
             set(("$include", "$schemas", "location")),
             loadref,
             urljoin=document_loader.fetcher.urljoin)

    def visit_default(obj):
        remove = [False]

        def ensure_default_location(fileobj):
            if "location" not in fileobj and "path" in fileobj:
                fileobj["location"] = fileobj["path"]
                del fileobj["path"]
            if "location" in fileobj \
                    and not ftp_access.exists(fileobj["location"]):
                # Delete "default" from workflowobj
                remove[0] = True

        visit_class(obj["default"], ("File", "Directory"),
                    ensure_default_location)
        if remove[0]:
            del obj["default"]

    find_defaults(workflowobj, visit_default)

    discovered = {}

    def discover_default_secondary_files(obj):
        discover_secondary_files(
            obj["inputs"], {
                shortname(t["id"]): t["default"]
                for t in obj["inputs"] if "default" in t
            }, discovered)

    visit_class(workflowobj, ("CommandLineTool", "Workflow"),
                discover_default_secondary_files)
    for entry in list(discovered.keys()):
        # Only interested in discovered secondaryFiles which are local
        # files that need to be uploaded.
        if not entry.startswith("file:"):
            del discovered[entry]
    visit_class(workflowobj, ("Directory"),
                functools.partial(ftp_upload, remote_storage_url, ftp_access))
    visit_class(workflowobj, ("File"),
                functools.partial(ftp_upload, remote_storage_url, ftp_access))
    visit_class(discovered, ("Directory"),
                functools.partial(ftp_upload, remote_storage_url, ftp_access))
    visit_class(discovered, ("File"),
                functools.partial(ftp_upload, remote_storage_url, ftp_access))
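The loadref closure above deduplicates fetches by defragmented URI. A minimal sketch of the same caching pattern, with load_document standing in for document_loader.fetch_text plus YAML parsing:

import urllib.parse

def make_loadref(load_document):
    loaded = set()

    def loadref(base, ref):
        joined = urllib.parse.urljoin(base, ref)
        defrg, _ = urllib.parse.urldefrag(joined)
        if defrg in loaded:
            return {}          # already scanned, skip it
        loaded.add(defrg)
        return load_document(defrg)

    return loadref

loadref = make_loadref(lambda url: {"id": url})
print(loadref("file:///wf/main.cwl", "tool.cwl#step1"))  # fetched once
print(loadref("file:///wf/main.cwl", "tool.cwl#step2"))  # {} on the repeat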
Example #14
    def arvados_job_spec(self, runtimeContext):
        """Create an Arvados container request for this workflow.

        The returned dict can be used to create a container passed as
        the +body+ argument to container_requests().create().
        """

        adjustDirObjs(self.job_order, trim_listing)
        visit_class(self.job_order, ("File", "Directory"),
                    trim_anonymous_location)
        visit_class(self.job_order, ("File", "Directory"),
                    remove_redundant_fields)

        secret_mounts = {}
        for param in sorted(self.job_order.keys()):
            if self.secret_store.has_secret(self.job_order[param]):
                mnt = "/secrets/s%d" % len(secret_mounts)
                secret_mounts[mnt] = {
                    "kind": "text",
                    "content":
                    self.secret_store.retrieve(self.job_order[param])
                }
                self.job_order[param] = {"$include": mnt}

        container_req = {
            "name": self.name,
            "output_path": "/var/spool/cwl",
            "cwd": "/var/spool/cwl",
            "priority": self.priority,
            "state": "Committed",
            "container_image": arvados_jobs_image(self.arvrunner,
                                                  self.jobs_image),
            "mounts": {
                "/var/lib/cwl/cwl.input.json": {
                    "kind": "json",
                    "content": self.job_order
                },
                "stdout": {
                    "kind": "file",
                    "path": "/var/spool/cwl/cwl.output.json"
                },
                "/var/spool/cwl": {
                    "kind": "collection",
                    "writable": True
                }
            },
            "secret_mounts": secret_mounts,
            "runtime_constraints": {
                "vcpus":
                math.ceil(self.submit_runner_cores),
                "ram":
                1024 * 1024 * (math.ceil(self.submit_runner_ram) +
                               math.ceil(self.collection_cache_size)),
                "API":
                True
            },
            "use_existing":
            False,  # Never reuse the runner container - see #15497.
            "properties": {}
        }

        if self.embedded_tool.tool.get("id", "").startswith("keep:"):
            sp = self.embedded_tool.tool["id"].split('/')
            workflowcollection = sp[0][5:]
            workflowname = "/".join(sp[1:])
            workflowpath = "/var/lib/cwl/workflow/%s" % workflowname
            container_req["mounts"]["/var/lib/cwl/workflow"] = {
                "kind": "collection",
                "portable_data_hash": "%s" % workflowcollection
            }
        else:
            packed = packed_workflow(self.arvrunner, self.embedded_tool,
                                     self.merged_map)
            workflowpath = "/var/lib/cwl/workflow.json#main"
            container_req["mounts"]["/var/lib/cwl/workflow.json"] = {
                "kind": "json",
                "content": packed
            }
            if self.embedded_tool.tool.get("id", "").startswith("arvwf:"):
                container_req["properties"][
                    "template_uuid"] = self.embedded_tool.tool["id"][6:33]

        # --local means execute the workflow instead of submitting a container request
        # --api=containers means use the containers API
        # --no-log-timestamps means don't add timestamps (the logging infrastructure does this)
        # --disable-validate because we already validated so don't need to do it again
        # --eval-timeout is the timeout for javascript invocation
        # --parallel-task-count is the number of threads to use for job submission
        # --enable/disable-reuse sets desired job reuse
        # --collection-cache-size sets aside memory to store collections
        command = [
            "arvados-cwl-runner", "--local", "--api=containers",
            "--no-log-timestamps", "--disable-validate", "--disable-color",
            "--eval-timeout=%s" % self.arvrunner.eval_timeout,
            "--thread-count=%s" % self.arvrunner.thread_count,
            "--enable-reuse" if self.enable_reuse else "--disable-reuse",
            "--collection-cache-size=%s" % self.collection_cache_size
        ]

        if self.output_name:
            command.append("--output-name=" + self.output_name)
            container_req["output_name"] = self.output_name

        if self.output_tags:
            command.append("--output-tags=" + self.output_tags)

        if runtimeContext.debug:
            command.append("--debug")

        if runtimeContext.storage_classes != "default":
            command.append("--storage-classes=" +
                           runtimeContext.storage_classes)

        if self.on_error:
            command.append("--on-error=" + self.on_error)

        if self.intermediate_output_ttl:
            command.append("--intermediate-output-ttl=%d" %
                           self.intermediate_output_ttl)

        if self.arvrunner.trash_intermediate:
            command.append("--trash-intermediate")

        if self.arvrunner.project_uuid:
            command.append("--project-uuid=" + self.arvrunner.project_uuid)

        if self.enable_dev:
            command.append("--enable-dev")

        command.extend([workflowpath, "/var/lib/cwl/cwl.input.json"])

        container_req["command"] = command

        return container_req
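The secret-mount loop above replaces each secret value in the job order with a {"$include": mountpoint} reference and delivers the real content separately via secret_mounts. A self-contained sketch of that substitution, with is_secret standing in for secret_store.has_secret:

def is_secret(value):           # stand-in for secret_store.has_secret
    return value == "s3cr3t"

job_order = {"api_token": "s3cr3t", "sample": "A1"}
secret_mounts = {}
for param in sorted(job_order):
    if is_secret(job_order[param]):
        mnt = "/secrets/s%d" % len(secret_mounts)
        secret_mounts[mnt] = {"kind": "text", "content": job_order[param]}
        job_order[param] = {"$include": mnt}

print(job_order)      # {'api_token': {'$include': '/secrets/s0'}, 'sample': 'A1'}
print(secret_mounts)  # {'/secrets/s0': {'kind': 'text', 'content': 's3cr3t'}}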
Example #15
def upload_dependencies(arvrunner,
                        name,
                        document_loader,
                        workflowobj,
                        uri,
                        loadref_run,
                        include_primary=True,
                        discovered_secondaryfiles=None):
    """Upload the dependencies of the workflowobj document to Keep.

    Returns a pathmapper object mapping local paths to keep references.  Also
    does an in-place update of references in "workflowobj".

    Use scandeps to find $import, $include, $schemas, run, File and Directory
    fields that represent external references.

    If workflowobj has an "id" field, this will reload the document to ensure
    it is scanning the raw document prior to preprocessing.
    """

    loaded = set()

    def loadref(b, u):
        joined = document_loader.fetcher.urljoin(b, u)
        defrg, _ = urllib.parse.urldefrag(joined)
        if defrg not in loaded:
            loaded.add(defrg)
            # Use fetch_text to get raw file (before preprocessing).
            text = document_loader.fetch_text(defrg)
            if isinstance(text, bytes):
                textIO = StringIO(text.decode('utf-8'))
            else:
                textIO = StringIO(text)
            return yaml.safe_load(textIO)
        else:
            return {}

    if loadref_run:
        loadref_fields = set(("$import", "run"))
    else:
        loadref_fields = set(("$import", ))

    scanobj = workflowobj
    if "id" in workflowobj:
        # Need raw file content (before preprocessing) to ensure
        # that external references in $include and $mixin are captured.
        scanobj = loadref("", workflowobj["id"])

    sc_result = scandeps(uri,
                         scanobj,
                         loadref_fields,
                         set(("$include", "$schemas", "location")),
                         loadref,
                         urljoin=document_loader.fetcher.urljoin)

    sc = []
    uuids = {}

    def collect_uuids(obj):
        loc = obj.get("location", "")
        sp = loc.split(":")
        if sp[0] == "keep":
            # Collect collection uuids that need to be resolved to
            # portable data hashes
            gp = collection_uuid_pattern.match(loc)
            if gp:
                uuids[gp.groups()[0]] = obj
            if collectionUUID in obj:
                uuids[obj[collectionUUID]] = obj

    def collect_uploads(obj):
        loc = obj.get("location", "")
        sp = loc.split(":")
        if len(sp) < 1:
            return
        if sp[0] in ("file", "http", "https"):
            # Record local files that need to be uploaded;
            # don't include file literals, keep references, etc.
            sc.append(obj)
        collect_uuids(obj)

    visit_class(workflowobj, ("File", "Directory"), collect_uuids)
    visit_class(sc_result, ("File", "Directory"), collect_uploads)

    # Resolve any collection uuids we found to portable data hashes
    # and assign them to uuid_map
    uuid_map = {}
    fetch_uuids = list(uuids.keys())
    while fetch_uuids:
        # For a large number of fetch_uuids, the API server may limit the
        # response size, so keep fetching until the API server has nothing
        # more to give us.
        lookups = arvrunner.api.collections().list(
            filters=[["uuid", "in", fetch_uuids]],
            count="none",
            select=["uuid", "portable_data_hash"
                    ]).execute(num_retries=arvrunner.num_retries)

        if not lookups["items"]:
            break

        for l in lookups["items"]:
            uuid_map[l["uuid"]] = l["portable_data_hash"]

        fetch_uuids = [u for u in fetch_uuids if u not in uuid_map]

    normalizeFilesDirs(sc)

    if include_primary and "id" in workflowobj:
        sc.append({"class": "File", "location": workflowobj["id"]})

    if "$schemas" in workflowobj:
        for s in workflowobj["$schemas"]:
            sc.append({"class": "File", "location": s})

    def visit_default(obj):
        remove = [False]

        def ensure_default_location(f):
            if "location" not in f and "path" in f:
                f["location"] = f["path"]
                del f["path"]
            if "location" in f and not arvrunner.fs_access.exists(
                    f["location"]):
                # Doesn't exist, remove from list of dependencies to upload
                sc[:] = [x for x in sc if x["location"] != f["location"]]
                # Delete "default" from workflowobj
                remove[0] = True

        visit_class(obj["default"], ("File", "Directory"),
                    ensure_default_location)
        if remove[0]:
            del obj["default"]

    find_defaults(workflowobj, visit_default)

    discovered = {}

    def discover_default_secondary_files(obj):
        discover_secondary_files(
            obj["inputs"], {
                shortname(t["id"]): t["default"]
                for t in obj["inputs"] if "default" in t
            }, discovered)

    visit_class(workflowobj, ("CommandLineTool", "Workflow"),
                discover_default_secondary_files)

    for d in list(discovered):
        # Only interested in discovered secondaryFiles which are local
        # files that need to be uploaded.
        if d.startswith("file:"):
            sc.extend(discovered[d])
        else:
            del discovered[d]

    mapper = ArvPathMapper(arvrunner,
                           sc,
                           "",
                           "keep:%s",
                           "keep:%s/%s",
                           name=name,
                           single_collection=True)

    def setloc(p):
        loc = p.get("location")
        if loc and (not loc.startswith("_:")) and (
                not loc.startswith("keep:")):
            p["location"] = mapper.mapper(p["location"]).resolved
            return

        if not loc:
            return

        if collectionUUID in p:
            uuid = p[collectionUUID]
            if uuid not in uuid_map:
                raise SourceLine(p, collectionUUID,
                                 validate.ValidationException).makeError(
                                     "Collection uuid %s not found" % uuid)
            gp = collection_pdh_pattern.match(loc)
            if gp and uuid_map[uuid] != gp.groups()[0]:
                # This file entry has both a collectionUUID and a PDH
                # location. If the PDH doesn't match the one returned by
                # the API server, raise an error.
                raise SourceLine(
                    p, "location", validate.ValidationException
                ).makeError(
                    "Expected collection uuid %s to be %s but API server reported %s"
                    % (uuid, gp.groups()[0], uuid_map[p[collectionUUID]]))

        gp = collection_uuid_pattern.match(loc)
        if not gp:
            return
        uuid = gp.groups()[0]
        if uuid not in uuid_map:
            raise SourceLine(p, "location",
                             validate.ValidationException).makeError(
                                 "Collection uuid %s not found" % uuid)
        p["location"] = "keep:%s%s" % (uuid_map[uuid], gp.groups()[1]
                                       if gp.groups()[1] else "")
        p[collectionUUID] = uuid

    visit_class(workflowobj, ("File", "Directory"), setloc)
    visit_class(discovered, ("File", "Directory"), setloc)

    if discovered_secondaryfiles is not None:
        for d in discovered:
            discovered_secondaryfiles[mapper.mapper(
                d).resolved] = discovered[d]

    if "$schemas" in workflowobj:
        sch = []
        for s in workflowobj["$schemas"]:
            sch.append(mapper.mapper(s).resolved)
        workflowobj["$schemas"] = sch

    return mapper
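The batched uuid-to-portable-data-hash loop above keeps querying until the API returns nothing new, because a single list() call may be truncated by the server. A minimal sketch of that loop with a toy lookup backend (names are illustrative):

def resolve_uuids(lookup, fetch_uuids):
    # Repeatedly query until the lookup has nothing more to give us.
    uuid_map = {}
    while fetch_uuids:
        items = lookup(fetch_uuids)
        if not items:
            break
        for item in items:
            uuid_map[item["uuid"]] = item["portable_data_hash"]
        fetch_uuids = [u for u in fetch_uuids if u not in uuid_map]
    return uuid_map

def toy_lookup(uuids):  # stand-in for arvrunner.api.collections().list(...)
    return [{"uuid": u, "portable_data_hash": "pdh-" + u} for u in uuids]

print(resolve_uuids(toy_lookup, ["zzzzz-4zz18-aaa", "zzzzz-4zz18-bbb"]))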
Example #16
    def cwlmain(
        self,
        argsl=None,  # type: List[str]
        args=None,  # type: argparse.Namespace
        job_order_object=None,  # type: MutableMapping[Text, Any]
        stdin=sys.stdin,  # type: IO[Any]
        stdout=None,  # type: Union[TextIO, codecs.StreamWriter]
        stderr=sys.stderr,  # type: IO[Any]
        versionfunc=versionstring,  # type: Callable[[], Text]
        logger_handler=None,
        custom_schema_callback=None,  # type: Callable[[], None]
        executor=None,  # type: Callable[..., Tuple[Dict[Text, Any], Text]]
        loadingContext=None,  # type: LoadingContext
        runtimeContext=None  # type: RuntimeContext
    ):  # type: (...) -> int

        if not stdout:
            stdout = codecs.getwriter('utf-8')(sys.stdout)
        _logger.removeHandler(defaultStreamHandler)
        if logger_handler:
            stderr_handler = logger_handler
        else:
            stderr_handler = logging.StreamHandler(stderr)
        _logger.addHandler(stderr_handler)
        try:
            if args is None:
                args = arg_parser().parse_args(argsl)
                if args.workflow and "--outdir" not in argsl:
                    outputPath = args.workflow.split('/')[-1].split('.')[0]
                    setattr(
                        args, "outdir",
                        os.getcwd() + "/" + outputPath + "/" +
                        datetime.datetime.now().strftime('%Y-%m-%d-%H%M'))
            if runtimeContext is None:
                runtimeContext = RuntimeContext(vars(args))
            else:
                runtimeContext = runtimeContext.copy()

            rdflib_logger = logging.getLogger("rdflib.term")
            rdflib_logger.addHandler(stderr_handler)
            rdflib_logger.setLevel(logging.ERROR)
            if args.quiet:
                _logger.setLevel(logging.WARN)
            if runtimeContext.debug:
                _logger.setLevel(logging.DEBUG)
                rdflib_logger.setLevel(logging.DEBUG)
            if args.timestamps:
                formatter = logging.Formatter("[%(asctime)s] %(message)s",
                                              "%Y-%m-%d %H:%M:%S")
                stderr_handler.setFormatter(formatter)
            # version
            if args.version:
                return versionfunc(), 0
            else:
                _logger.info(versionfunc())

            if args.print_supported_versions:
                return "\n".join(supportedCWLversions(args.enable_dev)), 0

            if not args.workflow:
                if os.path.isfile("CWLFile"):
                    setattr(args, "workflow", "CWLFile")
                else:
                    _logger.error("")
                    _logger.error(
                        "CWL document required, no input file was provided")
                    arg_parser().print_help()
                    return "CWL document required, no input file was provided", 1
            if args.relax_path_checks:
                command_line_tool.ACCEPTLIST_RE = command_line_tool.ACCEPTLIST_EN_RELAXED_RE

            if args.ga4gh_tool_registries:
                ga4gh_tool_registries[:] = args.ga4gh_tool_registries
            if not args.enable_ga4gh_tool_registry:
                del ga4gh_tool_registries[:]

            if custom_schema_callback:
                custom_schema_callback()
            elif args.enable_ext:
                res = pkg_resources.resource_stream(__name__, 'extensions.yml')
                use_custom_schema("v1.0", "http://commonwl.org/cwltool",
                                  res.read())
                res.close()
            else:
                use_standard_schema("v1.0")

            if loadingContext is None:
                loadingContext = LoadingContext(vars(args))
            else:
                loadingContext = loadingContext.copy()

            loadingContext.disable_js_validation = \
                args.disable_js_validation or (not args.do_validate)
            loadingContext.construct_tool_object = getdefault(
                loadingContext.construct_tool_object,
                workflow.default_make_tool)
            loadingContext.resolver = getdefault(loadingContext.resolver,
                                                 tool_resolver)
            try:
                uri, tool_file_uri = resolve_tool_uri(
                    args.workflow,
                    resolver=loadingContext.resolver,
                    fetcher_constructor=loadingContext.fetcher_constructor)
            except:
                return "Can't find file " + args.workflow, 0

            try_again_msg = "" if args.debug else ", try again with --debug for more information"

            try:
                job_order_object, input_basedir, jobloader = load_job_order(
                    args, stdin, loadingContext.fetcher_constructor,
                    loadingContext.overrides_list, tool_file_uri)

                if args.overrides:
                    loadingContext.overrides_list.extend(
                        load_overrides(
                            file_uri(os.path.abspath(args.overrides)),
                            tool_file_uri))

                document_loader, workflowobj, uri = fetch_document(
                    uri,
                    resolver=loadingContext.resolver,
                    fetcher_constructor=loadingContext.fetcher_constructor)

                if args.print_deps:
                    # printdeps(workflowobj, document_loader, stdout, args.relative_deps, uri)
                    result = returndeps(workflowobj, document_loader, stdout,
                                        args.relative_deps, uri)
                    return result, 0

                document_loader, avsc_names, processobj, metadata, uri \
                    = validate_document(document_loader, workflowobj, uri,
                                        enable_dev=loadingContext.enable_dev,
                                        strict=loadingContext.strict,
                                        preprocess_only=(args.print_pre or args.pack),
                                        fetcher_constructor=loadingContext.fetcher_constructor,
                                        skip_schemas=args.skip_schemas,
                                        overrides=loadingContext.overrides_list,
                                        do_validate=loadingContext.do_validate)

                if args.print_pre:
                    # stdout.write(json_dumps(processobj, indent=4))
                    return json_dumps(processobj, indent=4), 0

                loadingContext.overrides_list.extend(
                    metadata.get("cwltool:overrides", []))

                tool = make_tool(document_loader, avsc_names, metadata, uri,
                                 loadingContext)
                if args.make_template:
                    yaml.safe_dump(generate_input_template(tool),
                                   sys.stdout,
                                   default_flow_style=False,
                                   indent=4,
                                   block_seq_indent=2)
                    return yaml.safe_dump(generate_input_template(tool),
                                          indent=4), 0

                if args.validate:
                    _logger.info("Tool definition is valid")
                    return "Tool definition is valid", 0

                if args.pack:
                    stdout.write(
                        print_pack(document_loader, processobj, uri, metadata))
                    return print_pack(document_loader, processobj, uri,
                                      metadata), 0

                if args.print_rdf:
                    stdout.write(
                        printrdf(tool, document_loader.ctx,
                                 args.rdf_serializer))
                    return printrdf(tool, document_loader.ctx,
                                    args.rdf_serializer), 0

                if args.print_dot:
                    printdot(tool, document_loader.ctx, stdout)
                    return "args.print_dot still not solved", 0

            except (validate.ValidationException) as exc:
                _logger.error(u"Tool definition failed validation:\n%s",
                              exc,
                              exc_info=args.debug)
                infor = "Tool definition failed validation:\n%s" + exc + args.debug
                return infor, 1
            except (RuntimeError, WorkflowException) as exc:
                _logger.error(u"Tool definition failed initialization:\n%s",
                              exc,
                              exc_info=args.debug)
                infor = "Tool definition failed initialization:\n%s" + exc + args.debug
                return infor, 1
            except Exception as exc:
                _logger.error(
                    u"I'm sorry, I couldn't load this CWL file%s.\nThe error was: %s",
                    try_again_msg,
                    exc if not args.debug else "",
                    exc_info=args.debug)
                return "I'm sorry, I couldn't load this CWL file", 1

            if isinstance(tool, int):
                return tool, 0

            # If on MacOS platform, TMPDIR must be set to be under one of the
            # shared volumes in Docker for Mac
            # More info: https://dockstore.org/docs/faq
            if sys.platform == "darwin":
                default_mac_path = "/private/tmp/docker_tmp"
                if runtimeContext.tmp_outdir_prefix == DEFAULT_TMP_PREFIX:
                    runtimeContext.tmp_outdir_prefix = default_mac_path

            for dirprefix in ("tmpdir_prefix", "tmp_outdir_prefix",
                              "cachedir"):
                if getattr(runtimeContext, dirprefix) and getattr(
                        runtimeContext, dirprefix) != DEFAULT_TMP_PREFIX:
                    sl = "/" if getattr(runtimeContext, dirprefix).endswith("/") or dirprefix == "cachedir" \
                        else ""
                    setattr(
                        runtimeContext, dirprefix,
                        os.path.abspath(getattr(runtimeContext, dirprefix)) +
                        sl)
                    if not os.path.exists(
                            os.path.dirname(getattr(runtimeContext,
                                                    dirprefix))):
                        try:
                            os.makedirs(
                                os.path.dirname(
                                    getattr(runtimeContext, dirprefix)))
                        except Exception as e:
                            _logger.error("Failed to create directory: %s", e)
                            infor = "Failed to create directory: %s" + e + ""
                            return infor, 1

            if args.cachedir:
                if args.move_outputs == "move":
                    runtimeContext.move_outputs = "copy"
                runtimeContext.tmp_outdir_prefix = args.cachedir

            runtimeContext.secret_store = getdefault(
                runtimeContext.secret_store, SecretStore())

            try:
                initialized_job_order_object = init_job_order(
                    job_order_object,
                    args,
                    tool,
                    jobloader,
                    stdout,
                    print_input_deps=args.print_input_deps,
                    relative_deps=args.relative_deps,
                    input_basedir=input_basedir,
                    secret_store=runtimeContext.secret_store)
            except SystemExit as err:
                return err.code
            if not executor:
                if args.parallel:
                    executor = MultithreadedJobExecutor()
                else:
                    executor = SingleJobExecutor()
            assert executor is not None

            if isinstance(initialized_job_order_object, int):
                return initialized_job_order_object

            try:
                runtimeContext.basedir = input_basedir
                del args.workflow
                del args.job_order

                conf_file = getattr(args,
                                    "beta_dependency_resolvers_configuration",
                                    None)  # Text
                use_conda_dependencies = getattr(args,
                                                 "beta_conda_dependencies",
                                                 None)  # Text

                job_script_provider = None  # type: Optional[DependenciesConfiguration]
                if conf_file or use_conda_dependencies:
                    runtimeContext.job_script_provider = DependenciesConfiguration(
                        args)

                runtimeContext.find_default_container = \
                    functools.partial(find_default_container, args)
                runtimeContext.make_fs_access = getdefault(
                    runtimeContext.make_fs_access, StdFsAccess)

                (out, status) = executor(tool,
                                         initialized_job_order_object,
                                         runtimeContext,
                                         logger=_logger)
                # This is the workflow output, it needs to be written
                if out is not None:

                    def loc_to_path(obj):
                        for field in ("path", "nameext", "nameroot",
                                      "dirname"):
                            if field in obj:
                                del obj[field]
                        if obj["location"].startswith("file://"):
                            obj["path"] = uri_file_path(obj["location"])

                    visit_class(out, ("File", "Directory"), loc_to_path)

                    # Unset the Generation field in the final output object
                    visit_class(out, ("File", ),
                                MutationManager().unset_generation)

                    if isinstance(out, string_types):
                        stdout.write(out)
                    else:
                        stdout.write(
                            json_dumps(
                                out,
                                indent=4,  # type: ignore
                                ensure_ascii=False))
                    stdout.write("\n")
                    if hasattr(stdout, "flush"):
                        stdout.flush()  # type: ignore

                if status != "success":
                    _logger.warning(u"Final process status is %s", status)
                    infor = "Final process status is %s" + status + ""
                    return infor, 1

                _logger.info(u"Final process status is %s", status)
                return out, status

            except (validate.ValidationException) as exc:
                _logger.error(u"Input object failed validation:\n%s",
                              exc,
                              exc_info=args.debug)
                infor = "Input object failed validation:\n%s" + exc + args.debug
                return infor, 1
            except UnsupportedRequirement as exc:
                _logger.error(
                    u"Workflow or tool uses unsupported feature:\n%s",
                    exc,
                    exc_info=args.debug)
                infor = "Workflow or tool uses unsupported feature:\n%s" + exc + args.debug
                return infor, 3
            except WorkflowException as exc:
                _logger.error(u"Workflow error%s:\n%s",
                              try_again_msg,
                              strip_dup_lineno(six.text_type(exc)),
                              exc_info=args.debug)
                infor = "Workflow error%s:\n%s" + try_again_msg + strip_dup_lineno(
                    six.text_type(exc)) + args.debug
                return infor, 1
            except Exception as exc:
                _logger.error(u"Unhandled error%s:\n  %s",
                              try_again_msg,
                              exc,
                              exc_info=args.debug)
                infor = "Unhandled error%s:\n  %s" + try_again_msg + exc + args.debug
                return infor, 1

        finally:
            _logger.removeHandler(stderr_handler)
            _logger.addHandler(defaultStreamHandler)
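The loc_to_path callback above strips derived name fields and converts file:// locations back into filesystem paths before the final output object is written. A self-contained sketch, with uri_file_path_sketch as a hypothetical stand-in for cwltool's uri_file_path:

from urllib.parse import unquote, urlparse

def uri_file_path_sketch(uri):  # hypothetical stand-in for uri_file_path
    return unquote(urlparse(uri).path)

out_file = {"class": "File", "location": "file:///tmp/out/result.txt",
            "nameroot": "result", "nameext": ".txt"}
for field in ("path", "nameext", "nameroot", "dirname"):
    out_file.pop(field, None)
if out_file["location"].startswith("file://"):
    out_file["path"] = uri_file_path_sketch(out_file["location"])
print(out_file["path"])  # /tmp/out/result.txt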
Example #17
def main(args=None, stdout=sys.stdout):
    parser = argparse.ArgumentParser()
    Job.Runner.addToilOptions(parser)
    parser.add_argument("cwltool", type=str)
    parser.add_argument("cwljob", nargs=argparse.REMAINDER)

    # Will override the "jobStore" positional argument, enabling the
    # user to select a jobStore or fall back to the default logic below.
    parser.add_argument("--jobStore", type=str)
    parser.add_argument("--not-strict", action="store_true")
    parser.add_argument("--no-container", action="store_true")
    parser.add_argument("--quiet", dest="logLevel", action="store_const", const="ERROR")
    parser.add_argument("--basedir", type=str)
    parser.add_argument("--outdir", type=str, default=os.getcwd())
    parser.add_argument("--version", action='version', version=baseVersion)
    parser.add_argument("--preserve-environment", type=str, nargs='+',
                    help="Preserve specified environment variables when running CommandLineTools",
                    metavar=("VAR1 VAR2"),
                    default=("PATH",),
                    dest="preserve_environment")

    # mkdtemp actually creates the directory, but
    # toil requires that the directory not exist,
    # so make it and delete it and allow
    # toil to create it again (!)
    workdir = tempfile.mkdtemp()
    os.rmdir(workdir)

    if args is None:
        args = sys.argv[1:]

    options = parser.parse_args([workdir] + args)

    use_container = not options.no_container

    setLoggingFromOptions(options)
    if options.logLevel:
        cwllogger.setLevel(options.logLevel)

    outdir = os.path.abspath(options.outdir)
    fileindex = {}
    existing = {}

    with Toil(options) as toil:
        if options.restart:
            outobj = toil.restart()
        else:
            useStrict = not options.not_strict
            try:
                t = cwltool.load_tool.load_tool(options.cwltool, toilMakeTool,
                                                kwargs={
                                                    "hints": [{
                                                        "class": "ResourceRequirement",
                                                        "coresMin": toil.config.defaultCores,
                                                        "ramMin": toil.config.defaultMemory / (2**20),
                                                        "outdirMin": toil.config.defaultDisk / (2**20),
                                                        "tmpdirMin": 0
                                                    }]},
                                                resolver=cwltool.resolver.tool_resolver,
                                                strict=useStrict)
                unsupportedRequirementsCheck(t.requirements)
            except cwltool.process.UnsupportedRequirement as e:
                logging.error(e)
                return 33

            if type(t) == int:
                return t

            options.workflow = options.cwltool
            options.job_order = options.cwljob
            options.tool_help = None
            options.debug = options.logLevel == "DEBUG"
            job = cwltool.main.load_job_order(options, t, sys.stdin)

            if type(job) == int:
                return job

            job, options.basedir = job

            fillInDefaults(t.tool["inputs"], job)

            def pathToLoc(p):
                if "location" not in p and "path" in p:
                    p["location"] = p["path"]
                    del p["path"]

            def importFiles(tool):
                visit_class(tool, ("File", "Directory"), pathToLoc)
                normalizeFilesDirs(tool)
                adjustDirObjs(tool, functools.partial(get_listing,
                                                      cwltool.stdfsaccess.StdFsAccess(""),
                                                      recursive=True))
                adjustFileObjs(tool, functools.partial(uploadFile,
                                                       toil.importFile,
                                                       fileindex, existing, skip_broken=True))

            t.visit(importFiles)

            for inp in t.tool["inputs"]:
                def setSecondary(fileobj):
                    if isinstance(fileobj, dict) and fileobj.get("class") == "File":
                        if "secondaryFiles" not in fileobj:
                            fileobj["secondaryFiles"] = [{
                                "location": cwltool.builder.substitute(fileobj["location"], sf), "class": "File"}
                                                         for sf in inp["secondaryFiles"]]

                    if isinstance(fileobj, list):
                        for e in fileobj:
                            setSecondary(e)

                if shortname(inp["id"]) in job and inp.get("secondaryFiles"):
                    setSecondary(job[shortname(inp["id"])])

            importFiles(job)
            visitSteps(t, importFiles)

            make_fs_access = functools.partial(ToilFsAccess, fileStore=toil)
            try:
                (wf1, wf2) = makeJob(t, {}, use_container=use_container,
                                     preserve_environment=options.preserve_environment,
                                     tmpdir=os.path.realpath(outdir), workdir=options.workDir)
            except cwltool.process.UnsupportedRequirement as e:
                logging.error(e)
                return 33

            wf1.cwljob = job
            outobj = toil.start(wf1)

        outobj = resolve_indirect(outobj)

        toilStageFiles(toil, outobj, outdir, fileindex, existing, True)

        visit_class(outobj, ("File",), functools.partial(compute_checksums, cwltool.stdfsaccess.StdFsAccess("")))

        stdout.write(json.dumps(outobj, indent=4))

    return 0
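
The setSecondary helper above expands each secondaryFiles pattern with cwltool.builder.substitute. As a rough guide to the CWL suffix rule it relies on, here is a simplified, self-contained sketch (illustration only, not the library code): a leading "^" strips one extension from the primary file name before the rest of the pattern is appended.

def substitute_sketch(value, pattern):
    # Simplified re-implementation of the secondaryFiles suffix rule.
    if pattern.startswith("^"):
        return substitute_sketch(value.rsplit(".", 1)[0], pattern[1:])
    return value + pattern

print(substitute_sketch("reads.bam", ".bai"))   # reads.bam.bai
print(substitute_sketch("reads.bam", "^.bai"))  # reads.bai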
Ejemplo n.º 21
0
def main(args=None, stdout=sys.stdout):
    """Main method for toil-cwl-runner."""
    cwllogger.removeHandler(defaultStreamHandler)
    config = Config()
    config.cwl = True
    parser = argparse.ArgumentParser()
    addOptions(parser, config)
    parser.add_argument("cwltool", type=str)
    parser.add_argument("cwljob", nargs=argparse.REMAINDER)

    # Will override the "jobStore" positional argument, enables
    # user to select jobStore or get a default from logic one below.
    parser.add_argument("--jobStore", type=str)
    parser.add_argument("--not-strict", action="store_true")
    parser.add_argument("--quiet", dest="logLevel", action="store_const",
                        const="ERROR")
    parser.add_argument("--basedir", type=str)
    parser.add_argument("--outdir", type=str, default=os.getcwd())
    parser.add_argument("--version", action='version', version=baseVersion)
    dockergroup = parser.add_mutually_exclusive_group()
    dockergroup.add_argument(
        "--user-space-docker-cmd",
        help="(Linux/OS X only) Specify a user space docker command (like "
        "udocker or dx-docker) that will be used to call 'pull' and 'run'")
    dockergroup.add_argument(
        "--singularity", action="store_true", default=False,
        help="[experimental] Use Singularity runtime for running containers. "
        "Requires Singularity v2.3.2+ and Linux with kernel version v3.18+ or "
        "with overlayfs support backported.")
    dockergroup.add_argument(
        "--no-container", action="store_true", help="Do not execute jobs in a "
        "Docker container, even when `DockerRequirement` "
        "is specified under `hints`.")
    parser.add_argument(
        "--preserve-environment", type=str, nargs='+',
        help="Preserve specified environment variables when running"
        " CommandLineTools", metavar=("VAR1 VAR2"), default=("PATH",),
        dest="preserve_environment")
    parser.add_argument(
        "--destBucket", type=str,
        help="Specify a cloud bucket endpoint for output files.")
    parser.add_argument(
        "--beta-dependency-resolvers-configuration", default=None)
    parser.add_argument("--beta-dependencies-directory", default=None)
    parser.add_argument(
        "--beta-use-biocontainers", default=None, action="store_true")
    parser.add_argument(
        "--beta-conda-dependencies", default=None, action="store_true")
    parser.add_argument("--tmpdir-prefix", type=Text,
                        help="Path prefix for temporary directories",
                        default="tmp")
    parser.add_argument("--tmp-outdir-prefix", type=Text,
                        help="Path prefix for intermediate output directories",
                        default="tmp")
    parser.add_argument(
        "--force-docker-pull", action="store_true", default=False,
        dest="force_docker_pull",
        help="Pull latest docker image even if it is locally present")
    parser.add_argument(
        "--no-match-user", action="store_true", default=False,
        help="Disable passing the current uid to `docker run --user`")

    # mkdtemp actually creates the directory, but
    # toil requires that the directory not exist,
    # so make it and delete it and allow
    # toil to create it again (!)
    workdir = tempfile.mkdtemp()
    os.rmdir(workdir)

    if args is None:
        args = sys.argv[1:]

    # we use workdir as jobStore:
    options = parser.parse_args([workdir] + args)

    # if tmpdir_prefix is not the default value, set workDir too
    if options.tmpdir_prefix != 'tmp':
        options.workDir = options.tmpdir_prefix

    if options.provisioner and not options.jobStore:
        raise NoSuchJobStoreException(
            'Please specify a jobstore with the --jobStore option when specifying a provisioner.')

    use_container = not options.no_container

    if options.logLevel:
        cwllogger.setLevel(options.logLevel)

    outdir = os.path.abspath(options.outdir)
    tmp_outdir_prefix = os.path.abspath(options.tmp_outdir_prefix)
    tmpdir_prefix = os.path.abspath(options.tmpdir_prefix)

    fileindex = {}
    existing = {}
    conf_file = getattr(options,
                        "beta_dependency_resolvers_configuration", None)
    use_conda_dependencies = getattr(options, "beta_conda_dependencies", None)
    job_script_provider = None
    if conf_file or use_conda_dependencies:
        dependencies_configuration = DependenciesConfiguration(options)
        job_script_provider = dependencies_configuration

    options.default_container = None
    runtime_context = cwltool.context.RuntimeContext(vars(options))
    runtime_context.find_default_container = functools.partial(
        find_default_container, options)
    runtime_context.workdir = workdir
    runtime_context.move_outputs = "leave"
    runtime_context.rm_tmpdir = False
    loading_context = cwltool.context.LoadingContext(vars(options))

    with Toil(options) as toil:
        if options.restart:
            outobj = toil.restart()
        else:
            loading_context.hints = [{
                "class": "ResourceRequirement",
                "coresMin": toil.config.defaultCores,
                "ramMin": toil.config.defaultMemory / (2**20),
                "outdirMin": toil.config.defaultDisk / (2**20),
                "tmpdirMin": 0
            }]
            loading_context.construct_tool_object = toil_make_tool
            loading_context.resolver = cwltool.resolver.tool_resolver
            loading_context.strict = not options.not_strict
            options.workflow = options.cwltool
            options.job_order = options.cwljob
            uri, tool_file_uri = cwltool.load_tool.resolve_tool_uri(
                options.cwltool, loading_context.resolver,
                loading_context.fetcher_constructor)
            options.tool_help = None
            options.debug = options.logLevel == "DEBUG"
            job_order_object, options.basedir, jobloader = \
                cwltool.main.load_job_order(
                    options, sys.stdin, loading_context.fetcher_constructor,
                    loading_context.overrides_list, tool_file_uri)
            document_loader, workflowobj, uri = \
                cwltool.load_tool.fetch_document(
                    uri, loading_context.resolver,
                    loading_context.fetcher_constructor)
            document_loader, avsc_names, processobj, metadata, uri = \
                cwltool.load_tool.validate_document(
                    document_loader, workflowobj, uri,
                    loading_context.enable_dev, loading_context.strict, False,
                    loading_context.fetcher_constructor, False,
                    loading_context.overrides_list,
                    do_validate=loading_context.do_validate)
            loading_context.overrides_list.extend(
                metadata.get("cwltool:overrides", []))
            try:
                tool = cwltool.load_tool.make_tool(
                    document_loader, avsc_names, metadata, uri,
                    loading_context)
            except cwltool.process.UnsupportedRequirement as err:
                logging.error(err)
                return 33
            runtime_context.secret_store = SecretStore()
            initialized_job_order = cwltool.main.init_job_order(
                job_order_object, options, tool, jobloader, sys.stdout,
                secret_store=runtime_context.secret_store)
            fs_access = cwltool.stdfsaccess.StdFsAccess(options.basedir)
            fill_in_defaults(
                tool.tool["inputs"], initialized_job_order, fs_access)

            def path_to_loc(obj):
                if "location" not in obj and "path" in obj:
                    obj["location"] = obj["path"]
                    del obj["path"]

            def import_files(tool):
                visit_class(tool, ("File", "Directory"), path_to_loc)
                visit_class(tool, ("File", ), functools.partial(
                    add_sizes, fs_access))
                normalizeFilesDirs(tool)
                adjustDirObjs(tool, functools.partial(
                    get_listing, fs_access, recursive=True))
                adjustFileObjs(tool, functools.partial(
                    uploadFile, toil.importFile, fileindex, existing,
                    skip_broken=True))

            tool.visit(import_files)

            for inp in tool.tool["inputs"]:
                def set_secondary(fileobj):
                    if isinstance(fileobj, Mapping) \
                            and fileobj.get("class") == "File":
                        if "secondaryFiles" not in fileobj:
                            fileobj["secondaryFiles"] = [
                                {"location": cwltool.builder.substitute(
                                    fileobj["location"], sf), "class": "File"}
                                for sf in inp["secondaryFiles"]]

                    if isinstance(fileobj, MutableSequence):
                        for entry in fileobj:
                            set_secondary(entry)

                if shortname(inp["id"]) in initialized_job_order \
                        and inp.get("secondaryFiles"):
                    set_secondary(initialized_job_order[shortname(inp["id"])])

            import_files(initialized_job_order)
            visitSteps(tool, import_files)

            try:
                runtime_context.use_container = use_container
                runtime_context.tmpdir = os.path.realpath(tmpdir_prefix)
                runtime_context.tmp_outdir_prefix = os.path.realpath(
                    tmp_outdir_prefix)
                runtime_context.job_script_provider = job_script_provider
                runtime_context.force_docker_pull = options.force_docker_pull
                runtime_context.no_match_user = options.no_match_user
                (wf1, _) = makeJob(tool, {}, None, runtime_context)
            except cwltool.process.UnsupportedRequirement as err:
                logging.error(err)
                return 33

            wf1.cwljob = initialized_job_order
            if isinstance(wf1, CWLJob):  # Clean up temporary directories only created with CWLJobs.
                wf1.addFollowOnFn(cleanTempDirs, wf1)
            outobj = toil.start(wf1)

        outobj = resolve_indirect(outobj)

        # Stage files. Specify destination bucket if specified in CLI
        # options. If destination bucket not passed in,
        # options.destBucket's value will be None.
        toilStageFiles(
            toil,
            outobj,
            outdir,
            fileindex,
            existing,
            export=True,
            destBucket=options.destBucket)

        if not options.destBucket:
            visit_class(outobj, ("File",), functools.partial(
                compute_checksums, cwltool.stdfsaccess.StdFsAccess("")))

        visit_class(outobj, ("File", ), MutationManager().unset_generation)
        stdout.write(json.dumps(outobj, indent=4))

    return 0
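
The ResourceRequirement hint built from toil.config above converts Toil's byte-denominated defaults into the mebibyte units that CWL expects for ramMin and outdirMin. A small worked example with assumed values (2 GiB memory and 8 GiB disk are placeholders, not Toil's real defaults):

default_memory = 2 * 1024 ** 3       # assumed default, in bytes
default_disk = 8 * 1024 ** 3         # assumed default, in bytes
print(default_memory // (2 ** 20))   # 2048  -> ramMin in MiB
print(default_disk // (2 ** 20))     # 8192  -> outdirMin in MiB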
Ejemplo n.º 22
0
def upload_dependencies(arvrunner, name, document_loader,
                        workflowobj, uri, loadref_run,
                        include_primary=True, discovered_secondaryfiles=None):
    """Upload the dependencies of the workflowobj document to Keep.

    Returns a pathmapper object mapping local paths to keep references.  Also
    does an in-place update of references in "workflowobj".

    Use scandeps to find $import, $include, $schemas, run, File and Directory
    fields that represent external references.

    If workflowobj has an "id" field, this will reload the document to ensure
    it is scanning the raw document prior to preprocessing.
    """

    loaded = set()
    def loadref(b, u):
        joined = document_loader.fetcher.urljoin(b, u)
        defrg, _ = urllib.parse.urldefrag(joined)
        if defrg not in loaded:
            loaded.add(defrg)
            # Use fetch_text to get raw file (before preprocessing).
            text = document_loader.fetch_text(defrg)
            if isinstance(text, bytes):
                textIO = StringIO(text.decode('utf-8'))
            else:
                textIO = StringIO(text)
            return yaml.safe_load(textIO)
        else:
            return {}

    if loadref_run:
        loadref_fields = set(("$import", "run"))
    else:
        loadref_fields = set(("$import",))

    scanobj = workflowobj
    if "id" in workflowobj:
        # Need raw file content (before preprocessing) to ensure
        # that external references in $include and $mixin are captured.
        scanobj = loadref("", workflowobj["id"])

    sc_result = scandeps(uri, scanobj,
                  loadref_fields,
                  set(("$include", "$schemas", "location")),
                  loadref, urljoin=document_loader.fetcher.urljoin)

    sc = []
    uuids = {}

    def collect_uuids(obj):
        loc = obj.get("location", "")
        sp = loc.split(":")
        if sp[0] == "keep":
            # Collect collection uuids that need to be resolved to
            # portable data hashes
            gp = collection_uuid_pattern.match(loc)
            if gp:
                uuids[gp.groups()[0]] = obj
            if collectionUUID in obj:
                uuids[obj[collectionUUID]] = obj

    def collect_uploads(obj):
        loc = obj.get("location", "")
        sp = loc.split(":")
        if len(sp) < 1:
            return
        if sp[0] in ("file", "http", "https"):
            # Record local files that need to be uploaded;
            # don't include file literals, keep references, etc.
            sc.append(obj)
        collect_uuids(obj)

    visit_class(workflowobj, ("File", "Directory"), collect_uuids)
    visit_class(sc_result, ("File", "Directory"), collect_uploads)

    # Resolve any collection uuids we found to portable data hashes
    # and assign them to uuid_map
    uuid_map = {}
    fetch_uuids = list(uuids.keys())
    while fetch_uuids:
        # For a large number of fetch_uuids, the API server may limit
        # response size, so keep fetching until the API server has
        # nothing more to give us.
        lookups = arvrunner.api.collections().list(
            filters=[["uuid", "in", fetch_uuids]],
            count="none",
            select=["uuid", "portable_data_hash"]).execute(
                num_retries=arvrunner.num_retries)

        if not lookups["items"]:
            break

        for l in lookups["items"]:
            uuid_map[l["uuid"]] = l["portable_data_hash"]

        fetch_uuids = [u for u in fetch_uuids if u not in uuid_map]

    normalizeFilesDirs(sc)

    if include_primary and "id" in workflowobj:
        sc.append({"class": "File", "location": workflowobj["id"]})

    if "$schemas" in workflowobj:
        for s in workflowobj["$schemas"]:
            sc.append({"class": "File", "location": s})

    def visit_default(obj):
        remove = [False]
        def ensure_default_location(f):
            if "location" not in f and "path" in f:
                f["location"] = f["path"]
                del f["path"]
            if "location" in f and not arvrunner.fs_access.exists(f["location"]):
                # Doesn't exist, remove from list of dependencies to upload
                sc[:] = [x for x in sc if x["location"] != f["location"]]
                # Delete "default" from workflowobj
                remove[0] = True
        visit_class(obj["default"], ("File", "Directory"), ensure_default_location)
        if remove[0]:
            del obj["default"]

    find_defaults(workflowobj, visit_default)

    discovered = {}
    def discover_default_secondary_files(obj):
        discover_secondary_files(obj["inputs"],
                                 {shortname(t["id"]): t["default"] for t in obj["inputs"] if "default" in t},
                                 discovered)

    visit_class(workflowobj, ("CommandLineTool", "Workflow"), discover_default_secondary_files)

    for d in list(discovered):
        # Only interested in discovered secondaryFiles which are local
        # files that need to be uploaded.
        if d.startswith("file:"):
            sc.extend(discovered[d])
        else:
            del discovered[d]

    mapper = ArvPathMapper(arvrunner, sc, "",
                           "keep:%s",
                           "keep:%s/%s",
                           name=name,
                           single_collection=True)

    def setloc(p):
        loc = p.get("location")
        if loc and (not loc.startswith("_:")) and (not loc.startswith("keep:")):
            p["location"] = mapper.mapper(p["location"]).resolved
            return

        if not loc:
            return

        if collectionUUID in p:
            uuid = p[collectionUUID]
            if uuid not in uuid_map:
                raise SourceLine(p, collectionUUID, validate.ValidationException).makeError(
                    "Collection uuid %s not found" % uuid)
            gp = collection_pdh_pattern.match(loc)
            if gp and uuid_map[uuid] != gp.groups()[0]:
                # This file entry has both collectionUUID and a PDH
                # location. If the PDH doesn't match the one returned
                # by the API server, raise an error.
                raise SourceLine(p, "location", validate.ValidationException).makeError(
                    "Expected collection uuid %s to be %s but API server reported %s" % (
                        uuid, gp.groups()[0], uuid_map[p[collectionUUID]]))

        gp = collection_uuid_pattern.match(loc)
        if not gp:
            return
        uuid = gp.groups()[0]
        if uuid not in uuid_map:
            raise SourceLine(p, "location", validate.ValidationException).makeError(
                "Collection uuid %s not found" % uuid)
        p["location"] = "keep:%s%s" % (uuid_map[uuid], gp.groups()[1] if gp.groups()[1] else "")
        p[collectionUUID] = uuid

    visit_class(workflowobj, ("File", "Directory"), setloc)
    visit_class(discovered, ("File", "Directory"), setloc)

    if discovered_secondaryfiles is not None:
        for d in discovered:
            discovered_secondaryfiles[mapper.mapper(d).resolved] = discovered[d]

    if "$schemas" in workflowobj:
        sch = []
        for s in workflowobj["$schemas"]:
            sch.append(mapper.mapper(s).resolved)
        workflowobj["$schemas"] = sch

    return mapper
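
The setloc pass at the end of upload_dependencies rewrites keep: locations that reference a collection UUID into portable data hashes via uuid_map. A self-contained sketch of that rewrite with placeholder identifiers (the regex below is a simplified stand-in for the module's collection_uuid_pattern, and the UUID and hash are made up):

import re

collection_uuid_pattern = re.compile(r"^keep:([a-z0-9]{5}-4zz18-[a-z0-9]{15})(/.*)?$")
uuid_map = {"zzzzz-4zz18-abcdefghijklmno": "99999999999999999999999999999999+99"}

p = {"class": "File",
     "location": "keep:zzzzz-4zz18-abcdefghijklmno/reads.fastq"}

gp = collection_uuid_pattern.match(p["location"])
if gp and gp.groups()[0] in uuid_map:
    # Substitute the portable data hash, keeping any path suffix.
    p["location"] = "keep:%s%s" % (uuid_map[gp.groups()[0]], gp.groups()[1] or "")
print(p["location"])   # keep:99999999999999999999999999999999+99/reads.fastq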
Ejemplo n.º 23
0
    def arv_executor(self, tool, job_order, runtimeContext, logger=None):
        self.debug = runtimeContext.debug

        tool.visit(self.check_features)

        self.project_uuid = runtimeContext.project_uuid
        self.pipeline = None
        self.fs_access = runtimeContext.make_fs_access(runtimeContext.basedir)
        self.secret_store = runtimeContext.secret_store

        self.trash_intermediate = runtimeContext.trash_intermediate
        if self.trash_intermediate and self.work_api != "containers":
            raise Exception("--trash-intermediate is only supported with --api=containers.")

        self.intermediate_output_ttl = runtimeContext.intermediate_output_ttl
        if self.intermediate_output_ttl and self.work_api != "containers":
            raise Exception("--intermediate-output-ttl is only supported with --api=containers.")
        if self.intermediate_output_ttl < 0:
            raise Exception("Invalid value %d for --intermediate-output-ttl, cannot be less than zero" % self.intermediate_output_ttl)

        if runtimeContext.submit_request_uuid and self.work_api != "containers":
            raise Exception("--submit-request-uuid requires containers API, but using '{}' api".format(self.work_api))

        if not runtimeContext.name:
            runtimeContext.name = self.name = tool.tool.get("label") or tool.metadata.get("label") or os.path.basename(tool.tool["id"])

        # Upload direct dependencies of workflow steps, get back mapping of files to keep references.
        # Also uploads docker images.
        merged_map = upload_workflow_deps(self, tool)

        # Reload tool object which may have been updated by
        # upload_workflow_deps
        # Don't validate this time because it will just print redundant errors.
        loadingContext = self.loadingContext.copy()
        loadingContext.loader = tool.doc_loader
        loadingContext.avsc_names = tool.doc_schema
        loadingContext.metadata = tool.metadata
        loadingContext.do_validate = False

        tool = self.arv_make_tool(tool.doc_loader.idx[tool.tool["id"]],
                                  loadingContext)

        # Upload local file references in the job order.
        job_order = upload_job_order(self, "%s input" % runtimeContext.name,
                                     tool, job_order)

        existing_uuid = runtimeContext.update_workflow
        if existing_uuid or runtimeContext.create_workflow:
            # Create a pipeline template or workflow record and exit.
            if self.work_api == "jobs":
                tmpl = RunnerTemplate(self, tool, job_order,
                                      runtimeContext.enable_reuse,
                                      uuid=existing_uuid,
                                      submit_runner_ram=runtimeContext.submit_runner_ram,
                                      name=runtimeContext.name,
                                      merged_map=merged_map,
                                      loadingContext=loadingContext)
                tmpl.save()
                # cwltool.main will write our return value to stdout.
                return (tmpl.uuid, "success")
            elif self.work_api == "containers":
                return (upload_workflow(self, tool, job_order,
                                        self.project_uuid,
                                        uuid=existing_uuid,
                                        submit_runner_ram=runtimeContext.submit_runner_ram,
                                        name=runtimeContext.name,
                                        merged_map=merged_map),
                        "success")

        self.ignore_docker_for_reuse = runtimeContext.ignore_docker_for_reuse
        self.eval_timeout = runtimeContext.eval_timeout

        runtimeContext = runtimeContext.copy()
        runtimeContext.use_container = True
        runtimeContext.tmpdir_prefix = "tmp"
        runtimeContext.work_api = self.work_api

        if self.work_api == "containers":
            if self.ignore_docker_for_reuse:
                raise Exception("--ignore-docker-for-reuse not supported with containers API.")
            runtimeContext.outdir = "/var/spool/cwl"
            runtimeContext.docker_outdir = "/var/spool/cwl"
            runtimeContext.tmpdir = "/tmp"
            runtimeContext.docker_tmpdir = "/tmp"
        elif self.work_api == "jobs":
            if runtimeContext.priority != DEFAULT_PRIORITY:
                raise Exception("--priority not implemented for jobs API.")
            runtimeContext.outdir = "$(task.outdir)"
            runtimeContext.docker_outdir = "$(task.outdir)"
            runtimeContext.tmpdir = "$(task.tmpdir)"

        if runtimeContext.priority < 1 or runtimeContext.priority > 1000:
            raise Exception("--priority must be in the range 1..1000.")

        if self.should_estimate_cache_size:
            visited = set()
            estimated_size = [0]
            def estimate_collection_cache(obj):
                if obj.get("location", "").startswith("keep:"):
                    m = pdh_size.match(obj["location"][5:])
                    if m and m.group(1) not in visited:
                        visited.add(m.group(1))
                        estimated_size[0] += int(m.group(2))
            visit_class(job_order, ("File", "Directory"), estimate_collection_cache)
            runtimeContext.collection_cache_size = max(((estimated_size[0]*192) // (1024*1024))+1, 256)
            self.collection_cache.set_cap(runtimeContext.collection_cache_size*1024*1024)

        logger.info("Using collection cache size %s MiB", runtimeContext.collection_cache_size)

        runnerjob = None
        if runtimeContext.submit:
            # Submit a runner job to run the workflow for us.
            if self.work_api == "containers":
                if tool.tool["class"] == "CommandLineTool" and runtimeContext.wait and (not runtimeContext.always_submit_runner):
                    runtimeContext.runnerjob = tool.tool["id"]
                else:
                    tool = RunnerContainer(self, tool, loadingContext, runtimeContext.enable_reuse,
                                                self.output_name,
                                                self.output_tags,
                                                submit_runner_ram=runtimeContext.submit_runner_ram,
                                                name=runtimeContext.name,
                                                on_error=runtimeContext.on_error,
                                                submit_runner_image=runtimeContext.submit_runner_image,
                                                intermediate_output_ttl=runtimeContext.intermediate_output_ttl,
                                                merged_map=merged_map,
                                                priority=runtimeContext.priority,
                                                secret_store=self.secret_store,
                                                collection_cache_size=runtimeContext.collection_cache_size,
                                                collection_cache_is_default=self.should_estimate_cache_size)
            elif self.work_api == "jobs":
                tool = RunnerJob(self, tool, loadingContext, runtimeContext.enable_reuse,
                                      self.output_name,
                                      self.output_tags,
                                      submit_runner_ram=runtimeContext.submit_runner_ram,
                                      name=runtimeContext.name,
                                      on_error=runtimeContext.on_error,
                                      submit_runner_image=runtimeContext.submit_runner_image,
                                      merged_map=merged_map)
        elif runtimeContext.cwl_runner_job is None and self.work_api == "jobs":
            # Create pipeline for local run
            self.pipeline = self.api.pipeline_instances().create(
                body={
                    "owner_uuid": self.project_uuid,
                    "name": runtimeContext.name if runtimeContext.name else shortname(tool.tool["id"]),
                    "components": {},
                    "state": "RunningOnClient"}).execute(num_retries=self.num_retries)
            logger.info("Pipeline instance %s", self.pipeline["uuid"])

        if runtimeContext.cwl_runner_job is not None:
            self.uuid = runtimeContext.cwl_runner_job.get('uuid')

        jobiter = tool.job(job_order,
                           self.output_callback,
                           runtimeContext)

        if runtimeContext.submit and not runtimeContext.wait:
            runnerjob = next(jobiter)
            runnerjob.run(runtimeContext)
            return (runnerjob.uuid, "success")

        current_container = arvados_cwl.util.get_current_container(self.api, self.num_retries, logger)
        if current_container:
            logger.info("Running inside container %s", current_container.get("uuid"))

        self.poll_api = arvados.api('v1', timeout=runtimeContext.http_timeout)
        self.polling_thread = threading.Thread(target=self.poll_states)
        self.polling_thread.start()

        self.task_queue = TaskQueue(self.workflow_eval_lock, self.thread_count)

        try:
            self.workflow_eval_lock.acquire()

            # Holds the lock while this code runs and releases it when
            # it is safe to do so in self.workflow_eval_lock.wait(),
            # at which point on_message can update job state and
            # process output callbacks.

            loopperf = Perf(metrics, "jobiter")
            loopperf.__enter__()
            for runnable in jobiter:
                loopperf.__exit__()

                if self.stop_polling.is_set():
                    break

                if self.task_queue.error is not None:
                    raise self.task_queue.error

                if runnable:
                    with Perf(metrics, "run"):
                        self.start_run(runnable, runtimeContext)
                else:
                    if (self.task_queue.in_flight + len(self.processes)) > 0:
                        self.workflow_eval_lock.wait(3)
                    else:
                        logger.error("Workflow is deadlocked, no runnable processes and not waiting on any pending processes.")
                        break

                if self.stop_polling.is_set():
                    break

                loopperf.__enter__()
            loopperf.__exit__()

            while (self.task_queue.in_flight + len(self.processes)) > 0:
                if self.task_queue.error is not None:
                    raise self.task_queue.error
                self.workflow_eval_lock.wait(3)

        except UnsupportedRequirement:
            raise
        except:
            if sys.exc_info()[0] is KeyboardInterrupt or sys.exc_info()[0] is SystemExit:
                logger.error("Interrupted, workflow will be cancelled")
            else:
                logger.error("Execution failed:\n%s", sys.exc_info()[1], exc_info=(sys.exc_info()[1] if self.debug else False))
            if self.pipeline:
                self.api.pipeline_instances().update(uuid=self.pipeline["uuid"],
                                                     body={"state": "Failed"}).execute(num_retries=self.num_retries)
            if runtimeContext.submit and isinstance(tool, Runner):
                runnerjob = tool
                if runnerjob.uuid and self.work_api == "containers":
                    self.api.container_requests().update(uuid=runnerjob.uuid,
                                                     body={"priority": "0"}).execute(num_retries=self.num_retries)
        finally:
            self.workflow_eval_lock.release()
            self.task_queue.drain()
            self.stop_polling.set()
            self.polling_thread.join()
            self.task_queue.join()

        if self.final_status == "UnsupportedRequirement":
            raise UnsupportedRequirement("Check log for details.")

        if self.final_output is None:
            raise WorkflowException("Workflow did not return a result.")

        if runtimeContext.submit and isinstance(tool, Runner):
            logger.info("Final output collection %s", tool.final_output)
        else:
            if self.output_name is None:
                self.output_name = "Output of %s" % (shortname(tool.tool["id"]))
            if self.output_tags is None:
                self.output_tags = ""

            storage_classes = runtimeContext.storage_classes.strip().split(",")
            self.final_output, self.final_output_collection = self.make_output_collection(self.output_name, storage_classes, self.output_tags, self.final_output)
            self.set_crunch_output()

        if runtimeContext.compute_checksum:
            adjustDirObjs(self.final_output, partial(get_listing, self.fs_access))
            adjustFileObjs(self.final_output, partial(compute_checksums, self.fs_access))

        if self.trash_intermediate and self.final_status == "success":
            self.trash_intermediate_output()

        return (self.final_output, self.final_status)
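
The loop in arv_executor above coordinates with the polling and on_message callbacks through workflow_eval_lock, waiting with a timeout while work is in flight. A minimal, generic sketch of that wait/notify pattern (standard library only, not the Arvados classes):

import threading

cond = threading.Condition()
in_flight = [3]                      # stand-in for task_queue.in_flight + processes

def worker_done():
    with cond:
        in_flight[0] -= 1
        cond.notify()                # wake the waiting loop so it re-checks state

def wait_for_completion():
    with cond:
        while in_flight[0] > 0:
            cond.wait(3)             # lock released while waiting, reacquired on wake

workers = [threading.Thread(target=worker_done) for _ in range(3)]
waiter = threading.Thread(target=wait_for_completion)
waiter.start()
for w in workers:
    w.start()
for w in workers:
    w.join()
waiter.join()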
Ejemplo n.º 24
0
def main(args=None, stdout=sys.stdout):
    parser = argparse.ArgumentParser()
    Job.Runner.addToilOptions(parser)
    parser.add_argument("cwltool", type=str)
    parser.add_argument("cwljob", nargs=argparse.REMAINDER)

    # Will override the "jobStore" positional argument, enables
    # user to select jobStore or get a default from logic one below.
    parser.add_argument("--jobStore", type=str)
    parser.add_argument("--not-strict", action="store_true")
    parser.add_argument("--no-container", action="store_true")
    parser.add_argument("--quiet",
                        dest="logLevel",
                        action="store_const",
                        const="ERROR")
    parser.add_argument("--basedir", type=str)
    parser.add_argument("--outdir", type=str, default=os.getcwd())
    parser.add_argument("--version", action='version', version=baseVersion)
    parser.add_argument(
        "--preserve-environment",
        type=str,
        nargs='+',
        help=
        "Preserve specified environment variables when running CommandLineTools",
        metavar=("VAR1 VAR2"),
        default=("PATH", ),
        dest="preserve_environment")

    # mkdtemp actually creates the directory, but
    # toil requires that the directory not exist,
    # so make it and delete it and allow
    # toil to create it again (!)
    workdir = tempfile.mkdtemp()
    os.rmdir(workdir)

    if args is None:
        args = sys.argv[1:]

    options = parser.parse_args([workdir] + args)

    use_container = not options.no_container

    setLoggingFromOptions(options)
    if options.logLevel:
        cwllogger.setLevel(options.logLevel)

    outdir = os.path.abspath(options.outdir)
    fileindex = {}
    existing = {}

    with Toil(options) as toil:
        if options.restart:
            outobj = toil.restart()
        else:
            useStrict = not options.not_strict
            try:
                t = cwltool.load_tool.load_tool(
                    options.cwltool,
                    toilMakeTool,
                    kwargs={
                        "hints": [{
                            "class":
                            "ResourceRequirement",
                            "coresMin":
                            toil.config.defaultCores,
                            "ramMin":
                            toil.config.defaultMemory / (2**20),
                            "outdirMin":
                            toil.config.defaultDisk / (2**20),
                            "tmpdirMin":
                            0
                        }]
                    },
                    resolver=cwltool.resolver.tool_resolver,
                    strict=useStrict)
                unsupportedRequirementsCheck(t.requirements)
            except cwltool.process.UnsupportedRequirement as e:
                logging.error(e)
                return 33

            if type(t) == int:
                return t

            options.workflow = options.cwltool
            options.job_order = options.cwljob
            options.tool_help = None
            options.debug = options.logLevel == "DEBUG"
            job = cwltool.main.load_job_order(options, t, sys.stdin)

            if type(job) == int:
                return job

            job, options.basedir = job

            fillInDefaults(t.tool["inputs"], job)

            def pathToLoc(p):
                if "location" not in p and "path" in p:
                    p["location"] = p["path"]
                    del p["path"]

            def importFiles(tool):
                visit_class(tool, ("File", "Directory"), pathToLoc)
                normalizeFilesDirs(tool)
                adjustDirObjs(
                    tool,
                    functools.partial(get_listing,
                                      cwltool.stdfsaccess.StdFsAccess(""),
                                      recursive=True))
                adjustFileObjs(
                    tool,
                    functools.partial(uploadFile,
                                      toil.importFile,
                                      fileindex,
                                      existing,
                                      skip_broken=True))

            t.visit(importFiles)

            for inp in t.tool["inputs"]:

                def setSecondary(fileobj):
                    if isinstance(fileobj,
                                  dict) and fileobj.get("class") == "File":
                        if "secondaryFiles" not in fileobj:
                            fileobj["secondaryFiles"] = [{
                                "location":
                                cwltool.builder.substitute(
                                    fileobj["location"], sf),
                                "class":
                                "File"
                            } for sf in inp["secondaryFiles"]]

                    if isinstance(fileobj, list):
                        for e in fileobj:
                            setSecondary(e)

                if shortname(inp["id"]) in job and inp.get("secondaryFiles"):
                    setSecondary(job[shortname(inp["id"])])

            importFiles(job)
            visitSteps(t, importFiles)

            make_fs_access = functools.partial(ToilFsAccess, fileStore=toil)
            try:
                (wf1, wf2) = makeJob(
                    t, {},
                    use_container=use_container,
                    preserve_environment=options.preserve_environment,
                    tmpdir=os.path.realpath(outdir),
                    workdir=options.workDir)
            except cwltool.process.UnsupportedRequirement as e:
                logging.error(e)
                return 33

            wf1.cwljob = job
            outobj = toil.start(wf1)

        outobj = resolve_indirect(outobj)

        toilStageFiles(toil, outobj, outdir, fileindex, existing, True)

        visit_class(
            outobj, ("File", ),
            functools.partial(compute_checksums,
                              cwltool.stdfsaccess.StdFsAccess("")))

        stdout.write(json.dumps(outobj, indent=4))

    return 0
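
Both this example and the earlier ones lean on cwltool's visit_class to walk nested File/Directory objects in the tool and job order. A simplified, self-contained stand-in for that traversal, applied to the same pathToLoc normalization (sketch only, not the cwltool implementation):

def visit_class_sketch(rec, classes, op):
    # Walk dicts and lists, applying op to every mapping whose "class" matches.
    if isinstance(rec, dict):
        if rec.get("class") in classes:
            op(rec)
        for value in rec.values():
            visit_class_sketch(value, classes, op)
    elif isinstance(rec, list):
        for value in rec:
            visit_class_sketch(value, classes, op)

def path_to_loc(p):
    if "location" not in p and "path" in p:
        p["location"] = p["path"]
        del p["path"]

job = {"reads": {"class": "File", "path": "/data/reads.fastq"},
       "refs": [{"class": "Directory", "path": "/data/ref"}]}
visit_class_sketch(job, ("File", "Directory"), path_to_loc)
print(job)   # both entries now carry "location" instead of "path"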
Ejemplo n.º 25
0
    def job(self, joborder, output_callback, runtimeContext):

        builder = make_builder(joborder, self.hints, self.requirements, runtimeContext)
        runtimeContext = set_cluster_target(self.tool, self.arvrunner, builder, runtimeContext)

        req, _ = self.get_requirement("http://arvados.org/cwl#RunInSingleContainer")
        if not req:
            return super(ArvadosWorkflow, self).job(joborder, output_callback, runtimeContext)

        # RunInSingleContainer is true

        with SourceLine(self.tool, None, WorkflowException, logger.isEnabledFor(logging.DEBUG)):
            if "id" not in self.tool:
                raise WorkflowException("%s object must have 'id'" % (self.tool["class"]))
        document_loader, workflowobj, uri = (self.doc_loader, self.doc_loader.fetch(self.tool["id"]), self.tool["id"])

        discover_secondary_files(self.tool["inputs"], joborder)

        with Perf(metrics, "subworkflow upload_deps"):
            upload_dependencies(self.arvrunner,
                                os.path.basename(joborder.get("id", "#")),
                                document_loader,
                                joborder,
                                joborder.get("id", "#"),
                                False)

            if self.wf_pdh is None:
                workflowobj["requirements"] = dedup_reqs(self.requirements)
                workflowobj["hints"] = dedup_reqs(self.hints)

                packed = pack(document_loader, workflowobj, uri, self.metadata)

                def visit(item):
                    for t in ("hints", "requirements"):
                        if t not in item:
                            continue
                        for req in item[t]:
                            if req["class"] == "ResourceRequirement":
                                dyn = False
                                for k in max_res_pars + sum_res_pars:
                                    if k in req:
                                        if isinstance(req[k], basestring):
                                            if item["id"] == "#main":
                                                # only the top-level requirements/hints may contain expressions
                                                self.dynamic_resource_req.append(req)
                                                dyn = True
                                                break
                                            else:
                                                with SourceLine(req, k, WorkflowException):
                                                    raise WorkflowException("Non-top-level ResourceRequirement in single container cannot have expressions")
                                if not dyn:
                                    self.static_resource_req.append(req)
                            if req["class"] == "DockerRequirement":
                                if "http://arvados.org/cwl#dockerCollectionPDH" in req:
                                    del req["http://arvados.org/cwl#dockerCollectionPDH"]

                visit_class(packed["$graph"], ("Workflow", "CommandLineTool"), visit)

                if self.static_resource_req:
                    self.static_resource_req = [get_overall_res_req(self.static_resource_req)]

                upload_dependencies(self.arvrunner,
                                    runtimeContext.name,
                                    document_loader,
                                    packed,
                                    uri,
                                    False)

                # Discover files/directories referenced by the
                # workflow (mainly "default" values)
                visit_class(packed, ("File", "Directory"), self.wf_reffiles.append)


        if self.dynamic_resource_req:
            # Evaluate dynamic resource requirements using current builder
            rs = copy.copy(self.static_resource_req)
            for dyn_rs in self.dynamic_resource_req:
                eval_req = {"class": "ResourceRequirement"}
                for a in max_res_pars + sum_res_pars:
                    if a in dyn_rs:
                        eval_req[a] = builder.do_eval(dyn_rs[a])
                rs.append(eval_req)
            job_res_reqs = [get_overall_res_req(rs)]
        else:
            job_res_reqs = self.static_resource_req

        with Perf(metrics, "subworkflow adjust"):
            joborder_resolved = copy.deepcopy(joborder)
            joborder_keepmount = copy.deepcopy(joborder)

            reffiles = []
            visit_class(joborder_keepmount, ("File", "Directory"), reffiles.append)

            mapper = ArvPathMapper(self.arvrunner, reffiles+self.wf_reffiles, runtimeContext.basedir,
                                   "/keep/%s",
                                   "/keep/%s/%s")

            # For containers API, we need to make sure any extra
            # referenced files (ie referenced by the workflow but
            # not in the inputs) are included in the mounts.
            if self.wf_reffiles:
                runtimeContext = runtimeContext.copy()
                runtimeContext.extra_reffiles = copy.deepcopy(self.wf_reffiles)

            def keepmount(obj):
                remove_redundant_fields(obj)
                with SourceLine(obj, None, WorkflowException, logger.isEnabledFor(logging.DEBUG)):
                    if "location" not in obj:
                        raise WorkflowException("%s object is missing required 'location' field: %s" % (obj["class"], obj))
                with SourceLine(obj, "location", WorkflowException, logger.isEnabledFor(logging.DEBUG)):
                    if obj["location"].startswith("keep:"):
                        obj["location"] = mapper.mapper(obj["location"]).target
                        if "listing" in obj:
                            del obj["listing"]
                    elif obj["location"].startswith("_:"):
                        del obj["location"]
                    else:
                        raise WorkflowException("Location is not a keep reference or a literal: '%s'" % obj["location"])

            visit_class(joborder_keepmount, ("File", "Directory"), keepmount)

            def resolved(obj):
                if obj["location"].startswith("keep:"):
                    obj["location"] = mapper.mapper(obj["location"]).resolved

            visit_class(joborder_resolved, ("File", "Directory"), resolved)

            if self.wf_pdh is None:
                adjustFileObjs(packed, keepmount)
                adjustDirObjs(packed, keepmount)
                self.wf_pdh = upload_workflow_collection(self.arvrunner, shortname(self.tool["id"]), packed)

        wf_runner = cmap({
            "class": "CommandLineTool",
            "baseCommand": "cwltool",
            "inputs": self.tool["inputs"],
            "outputs": self.tool["outputs"],
            "stdout": "cwl.output.json",
            "requirements": self.requirements+job_res_reqs+[
                {"class": "InlineJavascriptRequirement"},
                {
                "class": "InitialWorkDirRequirement",
                "listing": [{
                        "entryname": "workflow.cwl",
                        "entry": '$({"class": "File", "location": "keep:%s/workflow.cwl"})' % self.wf_pdh
                    }, {
                        "entryname": "cwl.input.yml",
                        "entry": json.dumps(joborder_keepmount, indent=2, sort_keys=True, separators=(',',': ')).replace("\\", "\\\\").replace('$(', '\$(').replace('${', '\${')
                    }]
            }],
            "hints": self.hints,
            "arguments": ["--no-container", "--move-outputs", "--preserve-entire-environment", "workflow.cwl#main", "cwl.input.yml"],
            "id": "#"
        })
        return ArvadosCommandTool(self.arvrunner, wf_runner, self.loadingContext).job(joborder_resolved, output_callback, runtimeContext)
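
The cwl.input.yml entry above escapes the embedded job order so the inner cwltool run treats it as literal text rather than as parameter references. A standalone illustration of that escaping with made-up values:

import json

joborder = {"msg": "literal $(inputs.x) and ${HOME}", "path": "a\\b"}
entry = (json.dumps(joborder, indent=2, sort_keys=True, separators=(',', ': '))
         .replace("\\", "\\\\").replace('$(', '\\$(').replace('${', '\\${'))
print(entry)   # backslashes doubled, "$(" and "${" escaped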
Ejemplo n.º 26
0
    def arv_executor(self, updated_tool, job_order, runtimeContext, logger=None):
        self.debug = runtimeContext.debug

        updated_tool.visit(self.check_features)

        self.project_uuid = runtimeContext.project_uuid
        self.pipeline = None
        self.fs_access = runtimeContext.make_fs_access(runtimeContext.basedir)
        self.secret_store = runtimeContext.secret_store

        self.trash_intermediate = runtimeContext.trash_intermediate
        if self.trash_intermediate and self.work_api != "containers":
            raise Exception("--trash-intermediate is only supported with --api=containers.")

        self.intermediate_output_ttl = runtimeContext.intermediate_output_ttl
        if self.intermediate_output_ttl and self.work_api != "containers":
            raise Exception("--intermediate-output-ttl is only supported with --api=containers.")
        if self.intermediate_output_ttl < 0:
            raise Exception("Invalid value %d for --intermediate-output-ttl, cannot be less than zero" % self.intermediate_output_ttl)

        if runtimeContext.submit_request_uuid and self.work_api != "containers":
            raise Exception("--submit-request-uuid requires containers API, but using '{}' api".format(self.work_api))

        if not runtimeContext.name:
            runtimeContext.name = self.name = updated_tool.tool.get("label") or updated_tool.metadata.get("label") or os.path.basename(updated_tool.tool["id"])

        # Upload local file references in the job order.
        job_order = upload_job_order(self, "%s input" % runtimeContext.name,
                                     updated_tool, job_order)

        # the last clause means: if it is a command line tool, and we
        # are going to wait for the result, and always_submit_runner
        # is false, then we don't submit a runner process.

        submitting = (runtimeContext.update_workflow or
                      runtimeContext.create_workflow or
                      (runtimeContext.submit and not
                       (updated_tool.tool["class"] == "CommandLineTool" and
                        runtimeContext.wait and
                        not runtimeContext.always_submit_runner)))

        loadingContext = self.loadingContext.copy()
        loadingContext.do_validate = False
        loadingContext.do_update = False
        if submitting:
            # Document may have been auto-updated. Reload the original
            # document with updating disabled because we want to
            # submit the document with its original CWL version, not
            # the auto-updated one.
            tool = load_tool(updated_tool.tool["id"], loadingContext)
        else:
            tool = updated_tool

        # Upload direct dependencies of workflow steps, get back mapping of files to keep references.
        # Also uploads docker images.
        merged_map = upload_workflow_deps(self, tool)

        # Recreate process object (ArvadosWorkflow or
        # ArvadosCommandTool) because tool document may have been
        # updated by upload_workflow_deps in ways that modify
        # inheritance of hints or requirements.
        loadingContext.loader = tool.doc_loader
        loadingContext.avsc_names = tool.doc_schema
        loadingContext.metadata = tool.metadata
        tool = load_tool(tool.tool, loadingContext)

        existing_uuid = runtimeContext.update_workflow
        if existing_uuid or runtimeContext.create_workflow:
            # Create a pipeline template or workflow record and exit.
            if self.work_api == "containers":
                return (upload_workflow(self, tool, job_order,
                                        self.project_uuid,
                                        uuid=existing_uuid,
                                        submit_runner_ram=runtimeContext.submit_runner_ram,
                                        name=runtimeContext.name,
                                        merged_map=merged_map),
                        "success")

        self.apply_reqs(job_order, tool)

        self.ignore_docker_for_reuse = runtimeContext.ignore_docker_for_reuse
        self.eval_timeout = runtimeContext.eval_timeout

        runtimeContext = runtimeContext.copy()
        runtimeContext.use_container = True
        runtimeContext.tmpdir_prefix = "tmp"
        runtimeContext.work_api = self.work_api

        if self.work_api == "containers":
            if self.ignore_docker_for_reuse:
                raise Exception("--ignore-docker-for-reuse not supported with containers API.")
            runtimeContext.outdir = "/var/spool/cwl"
            runtimeContext.docker_outdir = "/var/spool/cwl"
            runtimeContext.tmpdir = "/tmp"
            runtimeContext.docker_tmpdir = "/tmp"

        if runtimeContext.priority < 1 or runtimeContext.priority > 1000:
            raise Exception("--priority must be in the range 1..1000.")

        if self.should_estimate_cache_size:
            visited = set()
            estimated_size = [0]
            def estimate_collection_cache(obj):
                if obj.get("location", "").startswith("keep:"):
                    m = pdh_size.match(obj["location"][5:])
                    if m and m.group(1) not in visited:
                        visited.add(m.group(1))
                        estimated_size[0] += int(m.group(2))
            visit_class(job_order, ("File", "Directory"), estimate_collection_cache)
            runtimeContext.collection_cache_size = max(((estimated_size[0]*192) // (1024*1024))+1, 256)
            self.collection_cache.set_cap(runtimeContext.collection_cache_size*1024*1024)

        logger.info("Using collection cache size %s MiB", runtimeContext.collection_cache_size)

        runnerjob = None
        if runtimeContext.submit:
            # Submit a runner job to run the workflow for us.
            if self.work_api == "containers":
                if submitting:
                    tool = RunnerContainer(self, updated_tool,
                                           tool, loadingContext, runtimeContext.enable_reuse,
                                           self.output_name,
                                           self.output_tags,
                                           submit_runner_ram=runtimeContext.submit_runner_ram,
                                           name=runtimeContext.name,
                                           on_error=runtimeContext.on_error,
                                           submit_runner_image=runtimeContext.submit_runner_image,
                                           intermediate_output_ttl=runtimeContext.intermediate_output_ttl,
                                           merged_map=merged_map,
                                           priority=runtimeContext.priority,
                                           secret_store=self.secret_store,
                                           collection_cache_size=runtimeContext.collection_cache_size,
                                           collection_cache_is_default=self.should_estimate_cache_size)
                else:
                    runtimeContext.runnerjob = tool.tool["id"]

        if runtimeContext.cwl_runner_job is not None:
            self.uuid = runtimeContext.cwl_runner_job.get('uuid')

        jobiter = tool.job(job_order,
                           self.output_callback,
                           runtimeContext)

        if runtimeContext.submit and not runtimeContext.wait:
            runnerjob = next(jobiter)
            runnerjob.run(runtimeContext)
            return (runnerjob.uuid, "success")

        current_container = arvados_cwl.util.get_current_container(self.api, self.num_retries, logger)
        if current_container:
            logger.info("Running inside container %s", current_container.get("uuid"))

        self.poll_api = arvados.api('v1', timeout=runtimeContext.http_timeout)
        self.polling_thread = threading.Thread(target=self.poll_states)
        self.polling_thread.start()

        self.task_queue = TaskQueue(self.workflow_eval_lock, self.thread_count)

        try:
            self.workflow_eval_lock.acquire()

            # Holds the lock while this code runs and releases it when
            # it is safe to do so in self.workflow_eval_lock.wait(),
            # at which point on_message can update job state and
            # process output callbacks.

            loopperf = Perf(metrics, "jobiter")
            loopperf.__enter__()
            for runnable in jobiter:
                loopperf.__exit__()

                if self.stop_polling.is_set():
                    break

                if self.task_queue.error is not None:
                    raise self.task_queue.error

                if runnable:
                    with Perf(metrics, "run"):
                        self.start_run(runnable, runtimeContext)
                else:
                    if (self.task_queue.in_flight + len(self.processes)) > 0:
                        self.workflow_eval_lock.wait(3)
                    else:
                        logger.error("Workflow is deadlocked, no runnable processes and not waiting on any pending processes.")
                        break

                if self.stop_polling.is_set():
                    break

                loopperf.__enter__()
            loopperf.__exit__()

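            # All runnable jobs have been dispatched; wait for in-flight tasks and outstanding processes to drain, re-raising any task queue error.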
            while (self.task_queue.in_flight + len(self.processes)) > 0:
                if self.task_queue.error is not None:
                    raise self.task_queue.error
                self.workflow_eval_lock.wait(3)

        except UnsupportedRequirement:
            raise
        except:
            if sys.exc_info()[0] is KeyboardInterrupt or sys.exc_info()[0] is SystemExit:
                logger.error("Interrupted, workflow will be cancelled")
            elif isinstance(sys.exc_info()[1], WorkflowException):
                logger.error("Workflow execution failed:\n%s", sys.exc_info()[1], exc_info=(sys.exc_info()[1] if self.debug else False))
            else:
                logger.exception("Workflow execution failed")

            if self.pipeline:
                self.api.pipeline_instances().update(uuid=self.pipeline["uuid"],
                                                     body={"state": "Failed"}).execute(num_retries=self.num_retries)

            if self.work_api == "containers" and not current_container:
                # Not running in a crunch container, so cancel any outstanding processes.
                for p in self.processes:
                    try:
                        self.api.container_requests().update(uuid=p,
                                                             body={"priority": "0"}
                        ).execute(num_retries=self.num_retries)
                    except Exception:
                        pass
        finally:
            self.workflow_eval_lock.release()
            self.task_queue.drain()
            self.stop_polling.set()
            self.polling_thread.join()
            self.task_queue.join()

        if self.final_status == "UnsupportedRequirement":
            raise UnsupportedRequirement("Check log for details.")

        if self.final_output is None:
            raise WorkflowException("Workflow did not return a result.")

        if runtimeContext.submit and isinstance(tool, Runner):
            logger.info("Final output collection %s", tool.final_output)
        else:
            if self.output_name is None:
                self.output_name = "Output of %s" % (shortname(tool.tool["id"]))
            if self.output_tags is None:
                self.output_tags = ""

            storage_classes = runtimeContext.storage_classes.strip().split(",")
            self.final_output, self.final_output_collection = self.make_output_collection(self.output_name, storage_classes, self.output_tags, self.final_output)
            self.set_crunch_output()

        if runtimeContext.compute_checksum:
            adjustDirObjs(self.final_output, partial(get_listing, self.fs_access))
            adjustFileObjs(self.final_output, partial(compute_checksums, self.fs_access))

        if self.trash_intermediate and self.final_status == "success":
            self.trash_intermediate_output()

        return (self.final_output, self.final_status)
Example No. 27
def upload_dependencies(arvrunner, name, document_loader,
                        workflowobj, uri, loadref_run,
                        include_primary=True, discovered_secondaryfiles=None):
    """Upload the dependencies of the workflowobj document to Keep.

    Returns a pathmapper object mapping local paths to keep references.  Also
    does an in-place update of references in "workflowobj".

    Use scandeps to find $import, $include, $schemas, run, File and Directory
    fields that represent external references.

    If workflowobj has an "id" field, this will reload the document to ensure
    it is scanning the raw document prior to preprocessing.
    """

    loaded = set()
    def loadref(b, u):
        joined = document_loader.fetcher.urljoin(b, u)
        defrg, _ = urlparse.urldefrag(joined)
        if defrg not in loaded:
            loaded.add(defrg)
            # Use fetch_text to get raw file (before preprocessing).
            text = document_loader.fetch_text(defrg)
            if isinstance(text, bytes):
                textIO = StringIO(text.decode('utf-8'))
            else:
                textIO = StringIO(text)
            return yaml.safe_load(textIO)
        else:
            return {}

    if loadref_run:
        loadref_fields = set(("$import", "run"))
    else:
        loadref_fields = set(("$import",))

    scanobj = workflowobj
    if "id" in workflowobj:
        # Need raw file content (before preprocessing) to ensure
        # that external references in $include and $mixin are captured.
        scanobj = loadref("", workflowobj["id"])

    sc_result = scandeps(uri, scanobj,
                  loadref_fields,
                  set(("$include", "$schemas", "location")),
                  loadref, urljoin=document_loader.fetcher.urljoin)

    sc = []
    def only_real(obj):
        # Only interested in local files that need to be uploaded;
        # don't include file literals, keep references, etc.
        sp = obj.get("location", "").split(":")
        if len(sp) > 1 and sp[0] in ("file", "http", "https"):
            sc.append(obj)

    visit_class(sc_result, ("File", "Directory"), only_real)

    normalizeFilesDirs(sc)

    if include_primary and "id" in workflowobj:
        sc.append({"class": "File", "location": workflowobj["id"]})

    if "$schemas" in workflowobj:
        for s in workflowobj["$schemas"]:
            sc.append({"class": "File", "location": s})

    def visit_default(obj):
        remove = [False]
        def ensure_default_location(f):
            if "location" not in f and "path" in f:
                f["location"] = f["path"]
                del f["path"]
            if "location" in f and not arvrunner.fs_access.exists(f["location"]):
                # Doesn't exist, remove from list of dependencies to upload
                sc[:] = [x for x in sc if x["location"] != f["location"]]
                # Delete "default" from workflowobj
                remove[0] = True
        visit_class(obj["default"], ("File", "Directory"), ensure_default_location)
        if remove[0]:
            del obj["default"]

    find_defaults(workflowobj, visit_default)

    discovered = {}
    def discover_default_secondary_files(obj):
        discover_secondary_files(obj["inputs"],
                                 {shortname(t["id"]): t["default"] for t in obj["inputs"] if "default" in t},
                                 discovered)

    visit_class(workflowobj, ("CommandLineTool", "Workflow"), discover_default_secondary_files)

    for d in list(discovered.keys()):
        # Only interested in discovered secondaryFiles which are local
        # files that need to be uploaded.
        if d.startswith("file:"):
            sc.extend(discovered[d])
        else:
            del discovered[d]

    mapper = ArvPathMapper(arvrunner, sc, "",
                           "keep:%s",
                           "keep:%s/%s",
                           name=name,
                           single_collection=True)

    def setloc(p):
        if "location" in p and (not p["location"].startswith("_:")) and (not p["location"].startswith("keep:")):
            p["location"] = mapper.mapper(p["location"]).resolved

    visit_class(workflowobj, ("File", "Directory"), setloc)
    visit_class(discovered, ("File", "Directory"), setloc)

    if discovered_secondaryfiles is not None:
        for d in discovered:
            discovered_secondaryfiles[mapper.mapper(d).resolved] = discovered[d]

    if "$schemas" in workflowobj:
        sch = []
        for s in workflowobj["$schemas"]:
            sch.append(mapper.mapper(s).resolved)
        workflowobj["$schemas"] = sch

    return mapper
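
A minimal call sketch for the function above, assuming an initialized runner object
(arvrunner) and a loaded tool exposing doc_loader and tool["id"] as in the other
examples in this collection; the variable names are illustrative only:

    discovered_secondaryfiles = {}
    mapper = upload_dependencies(arvrunner,
                                 "workflow dependencies",
                                 tool.doc_loader,
                                 tool.tool,
                                 tool.tool["id"],
                                 loadref_run=True,
                                 include_primary=True,
                                 discovered_secondaryfiles=discovered_secondaryfiles)
    # Local file references inside tool.tool are now rewritten in place to keep: URIs;
    # mapper.mapper(original_location).resolved gives the keep reference for any of them.
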
Example No. 28
def main(args=None, stdout=sys.stdout):
    config = Config()
    config.cwl = True
    parser = argparse.ArgumentParser()
    addOptions(parser, config)
    parser.add_argument("cwltool", type=str)
    parser.add_argument("cwljob", nargs=argparse.REMAINDER)

    # Will override the "jobStore" positional argument, letting the user
    # select a jobStore or fall back to the default chosen by the logic below.
    parser.add_argument("--jobStore", type=str)
    parser.add_argument("--not-strict", action="store_true")
    parser.add_argument("--no-container", action="store_true")
    parser.add_argument("--quiet", dest="logLevel", action="store_const", const="ERROR")
    parser.add_argument("--basedir", type=str)
    parser.add_argument("--outdir", type=str, default=os.getcwd())
    parser.add_argument("--version", action='version', version=baseVersion)
    parser.add_argument("--user-space-docker-cmd",
                        help="(Linux/OS X only) Specify a user space docker "
                        "command (like udocker or dx-docker) that will be "
                        "used to call 'pull' and 'run'")
    parser.add_argument("--preserve-environment", type=str, nargs='+',
                    help="Preserve specified environment variables when running CommandLineTools",
                    metavar=("VAR1 VAR2"),
                    default=("PATH",),
                    dest="preserve_environment")
    # help="Dependency resolver configuration file describing how to adapt 'SoftwareRequirement' packages to current system."
    parser.add_argument("--beta-dependency-resolvers-configuration", default=None)
    # help="Defaut root directory used by dependency resolvers configuration."
    parser.add_argument("--beta-dependencies-directory", default=None)
    # help="Use biocontainers for tools without an explicitly annotated Docker container."
    parser.add_argument("--beta-use-biocontainers", default=None, action="store_true")
    # help="Short cut to use Conda to resolve 'SoftwareRequirement' packages."
    parser.add_argument("--beta-conda-dependencies", default=None, action="store_true")
    parser.add_argument("--tmpdir-prefix", type=Text,
                        help="Path prefix for temporary directories",
                        default="tmp")
    parser.add_argument("--tmp-outdir-prefix", type=Text,
                        help="Path prefix for intermediate output directories",
                        default="tmp")

    # mkdtemp actually creates the directory, but
    # toil requires that the directory not exist,
    # so make it and delete it and allow
    # toil to create it again (!)
    workdir = tempfile.mkdtemp()
    os.rmdir(workdir)

    if args is None:
        args = sys.argv[1:]

    options = parser.parse_args([workdir] + args)

    use_container = not options.no_container

    if options.logLevel:
        cwllogger.setLevel(options.logLevel)

    outdir = os.path.abspath(options.outdir)
    fileindex = {}
    existing = {}
    make_tool_kwargs = {}
    conf_file = getattr(options, "beta_dependency_resolvers_configuration", None)  # Text
    use_conda_dependencies = getattr(options, "beta_conda_dependencies", None)  # Text
    job_script_provider = None
    if conf_file or use_conda_dependencies:
        dependencies_configuration = DependenciesConfiguration(options)  # type: DependenciesConfiguration
        job_script_provider = dependencies_configuration

    options.default_container = None
    make_tool_kwargs["find_default_container"] = functools.partial(find_default_container, options)

    with Toil(options) as toil:
        if options.restart:
            outobj = toil.restart()
        else:
            useStrict = not options.not_strict
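            # Seed every tool with a default ResourceRequirement hint built from Toil's configured defaults (memory and disk converted from bytes to MiB).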
            make_tool_kwargs["hints"] = [{
                "class": "ResourceRequirement",
                "coresMin": toil.config.defaultCores,
                "ramMin": toil.config.defaultMemory / (2**20),
                "outdirMin": toil.config.defaultDisk / (2**20),
                "tmpdirMin": 0
            }]
            try:
                t = cwltool.load_tool.load_tool(options.cwltool, toilMakeTool,
                                                kwargs=make_tool_kwargs,
                                                resolver=cwltool.resolver.tool_resolver,
                                                strict=useStrict)
                unsupportedRequirementsCheck(t.requirements)
            except cwltool.process.UnsupportedRequirement as e:
                logging.error(e)
                return 33

            if isinstance(t, int):
                return t

            options.workflow = options.cwltool
            options.job_order = options.cwljob
            options.tool_help = None
            options.debug = options.logLevel == "DEBUG"
            job, options.basedir, loader = cwltool.main.load_job_order(
                options, sys.stdin, None, [], options.job_order)
            job = cwltool.main.init_job_order(job, options, t, loader=loader)

            fillInDefaults(t.tool["inputs"], job)

            def pathToLoc(p):
                if "location" not in p and "path" in p:
                    p["location"] = p["path"]
                    del p["path"]

            def importFiles(tool):
                visit_class(tool, ("File", "Directory"), pathToLoc)
                normalizeFilesDirs(tool)
                adjustDirObjs(tool, functools.partial(get_listing,
                                                      cwltool.stdfsaccess.StdFsAccess(""),
                                                      recursive=True))
                adjustFileObjs(tool, functools.partial(uploadFile,
                                                       toil.importFile,
                                                       fileindex, existing, skip_broken=True))

            t.visit(importFiles)

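            # For tool inputs that declare secondaryFiles, fill them in on the corresponding job values by substituting each suffix pattern into the primary file's location.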
            for inp in t.tool["inputs"]:
                def setSecondary(fileobj):
                    if isinstance(fileobj, dict) and fileobj.get("class") == "File":
                        if "secondaryFiles" not in fileobj:
                            fileobj["secondaryFiles"] = [{
                                "location": cwltool.builder.substitute(fileobj["location"], sf), "class": "File"}
                                                         for sf in inp["secondaryFiles"]]

                    if isinstance(fileobj, list):
                        for e in fileobj:
                            setSecondary(e)

                if shortname(inp["id"]) in job and inp.get("secondaryFiles"):
                    setSecondary(job[shortname(inp["id"])])

            importFiles(job)
            visitSteps(t, importFiles)

            try:
                make_opts = copy.deepcopy(vars(options))
                make_opts.update({'tool': t, 'jobobj': {},
                    'use_container': use_container,
                    'tmpdir': os.path.realpath(outdir),
                    'job_script_provider': job_script_provider})

                (wf1, wf2) = makeJob(**make_opts)
            except cwltool.process.UnsupportedRequirement as e:
                logging.error(e)
                return 33

            wf1.cwljob = job
            outobj = toil.start(wf1)

        outobj = resolve_indirect(outobj)

        toilStageFiles(toil, outobj, outdir, fileindex, existing, True)

        visit_class(outobj, ("File",), functools.partial(compute_checksums, cwltool.stdfsaccess.StdFsAccess("")))

        stdout.write(json.dumps(outobj, indent=4))

    return 0
Example No. 29
    def job(self, joborder, output_callback, runtimeContext):

        builder = make_builder(joborder, self.hints, self.requirements, runtimeContext)
        runtimeContext = set_cluster_target(self.tool, self.arvrunner, builder, runtimeContext)

        req, _ = self.get_requirement("http://arvados.org/cwl#RunInSingleContainer")
        if not req:
            return super(ArvadosWorkflow, self).job(joborder, output_callback, runtimeContext)

        # RunInSingleContainer is true

        with SourceLine(self.tool, None, WorkflowException, logger.isEnabledFor(logging.DEBUG)):
            if "id" not in self.tool:
                raise WorkflowException("%s object must have 'id'" % (self.tool["class"]))

        discover_secondary_files(self.arvrunner.fs_access, builder,
                                 self.tool["inputs"], joborder)

        with Perf(metrics, "subworkflow upload_deps"):
            upload_dependencies(self.arvrunner,
                                os.path.basename(joborder.get("id", "#")),
                                self.doc_loader,
                                joborder,
                                joborder.get("id", "#"),
                                False)

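            # First pass only: pack the workflow into a single document and normalize the top-level requirements and hints.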
            if self.wf_pdh is None:
                packed = pack(self.loadingContext, self.tool["id"], loader=self.doc_loader)

                for p in packed["$graph"]:
                    if p["id"] == "#main":
                        p["requirements"] = dedup_reqs(self.requirements)
                        p["hints"] = dedup_reqs(self.hints)

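                # Strip DockerRequirement (the whole subworkflow runs in one container) and partition ResourceRequirement entries: expression-valued ones are only allowed at #main (dynamic), the rest are collected as static.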
                def visit(item):
                    if "requirements" in item:
                        item["requirements"] = [i for i in item["requirements"] if i["class"] != "DockerRequirement"]
                    for t in ("hints", "requirements"):
                        if t not in item:
                            continue
                        for req in item[t]:
                            if req["class"] == "ResourceRequirement":
                                dyn = False
                                for k in max_res_pars + sum_res_pars:
                                    if k in req:
                                        if isinstance(req[k], basestring):
                                            if item["id"] == "#main":
                                                # only the top-level requirements/hints may contain expressions
                                                self.dynamic_resource_req.append(req)
                                                dyn = True
                                                break
                                            else:
                                                with SourceLine(req, k, WorkflowException):
                                                    raise WorkflowException("Non-top-level ResourceRequirement in single container cannot have expressions")
                                if not dyn:
                                    self.static_resource_req.append(req)

                visit_class(packed["$graph"], ("Workflow", "CommandLineTool"), visit)

                if self.static_resource_req:
                    self.static_resource_req = [get_overall_res_req(self.static_resource_req)]

                upload_dependencies(self.arvrunner,
                                    runtimeContext.name,
                                    self.doc_loader,
                                    packed,
                                    self.tool["id"],
                                    False)

                # Discover files/directories referenced by the
                # workflow (mainly "default" values)
                visit_class(packed, ("File", "Directory"), self.wf_reffiles.append)


        if self.dynamic_resource_req:
            # Evaluate dynamic resource requirements using current builder
            rs = copy.copy(self.static_resource_req)
            for dyn_rs in self.dynamic_resource_req:
                eval_req = {"class": "ResourceRequirement"}
                for a in max_res_pars + sum_res_pars:
                    if a in dyn_rs:
                        eval_req[a] = builder.do_eval(dyn_rs[a])
                rs.append(eval_req)
            job_res_reqs = [get_overall_res_req(rs)]
        else:
            job_res_reqs = self.static_resource_req

        with Perf(metrics, "subworkflow adjust"):
            joborder_resolved = copy.deepcopy(joborder)
            joborder_keepmount = copy.deepcopy(joborder)

            reffiles = []
            visit_class(joborder_keepmount, ("File", "Directory"), reffiles.append)

            mapper = ArvPathMapper(self.arvrunner, reffiles+self.wf_reffiles, runtimeContext.basedir,
                                   "/keep/%s",
                                   "/keep/%s/%s")

            # For the containers API, make sure any extra referenced files
            # (i.e. files referenced by the workflow but not present in the
            # inputs) are included in the mounts.
            if self.wf_reffiles:
                runtimeContext = runtimeContext.copy()
                runtimeContext.extra_reffiles = copy.deepcopy(self.wf_reffiles)

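            # Rewrite keep: references to their /keep/... mount targets inside the container, drop literal (_:) locations, and reject anything else.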
            def keepmount(obj):
                remove_redundant_fields(obj)
                with SourceLine(obj, None, WorkflowException, logger.isEnabledFor(logging.DEBUG)):
                    if "location" not in obj:
                        raise WorkflowException("%s object is missing required 'location' field: %s" % (obj["class"], obj))
                with SourceLine(obj, "location", WorkflowException, logger.isEnabledFor(logging.DEBUG)):
                    if obj["location"].startswith("keep:"):
                        obj["location"] = mapper.mapper(obj["location"]).target
                        if "listing" in obj:
                            del obj["listing"]
                    elif obj["location"].startswith("_:"):
                        del obj["location"]
                    else:
                        raise WorkflowException("Location is not a keep reference or a literal: '%s'" % obj["location"])

            visit_class(joborder_keepmount, ("File", "Directory"), keepmount)

            def resolved(obj):
                if obj["location"].startswith("keep:"):
                    obj["location"] = mapper.mapper(obj["location"]).resolved

            visit_class(joborder_resolved, ("File", "Directory"), resolved)

            if self.wf_pdh is None:
                adjustFileObjs(packed, keepmount)
                adjustDirObjs(packed, keepmount)
                self.wf_pdh = upload_workflow_collection(self.arvrunner, shortname(self.tool["id"]), packed)

        self.loadingContext = self.loadingContext.copy()
        self.loadingContext.metadata = self.loadingContext.metadata.copy()
        self.loadingContext.metadata["http://commonwl.org/cwltool#original_cwlVersion"] = "v1.0"

        if len(job_res_reqs) == 1:
            # RAM request needs to be at least 128 MiB or the workflow
            # runner itself won't run reliably.
            if job_res_reqs[0].get("ramMin", 1024) < 128:
                job_res_reqs[0]["ramMin"] = 128

        arguments = ["--no-container", "--move-outputs", "--preserve-entire-environment", "workflow.cwl", "cwl.input.yml"]
        if runtimeContext.debug:
            arguments.insert(0, '--debug')

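        # Wrap the packed workflow in a synthetic CommandLineTool that runs cwltool inside the single container: workflow.cwl comes from the uploaded collection and cwl.input.yml is the keep-mounted job order, with $( and ${ escaped so they are not re-evaluated as expressions.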
        wf_runner = cmap({
            "class": "CommandLineTool",
            "baseCommand": "cwltool",
            "inputs": self.tool["inputs"],
            "outputs": self.tool["outputs"],
            "stdout": "cwl.output.json",
            "requirements": self.requirements+job_res_reqs+[
                {"class": "InlineJavascriptRequirement"},
                {
                "class": "InitialWorkDirRequirement",
                "listing": [{
                        "entryname": "workflow.cwl",
                        "entry": '$({"class": "File", "location": "keep:%s/workflow.cwl"})' % self.wf_pdh
                    }, {
                        "entryname": "cwl.input.yml",
                        "entry": json.dumps(joborder_keepmount, indent=2, sort_keys=True, separators=(',',': ')).replace("\\", "\\\\").replace('$(', '\$(').replace('${', '\${')
                    }]
            }],
            "hints": self.hints,
            "arguments": arguments,
            "id": "#"
        })
        return ArvadosCommandTool(self.arvrunner, wf_runner, self.loadingContext).job(joborder_resolved, output_callback, runtimeContext)
Example No. 30
    def arvados_job_spec(self, runtimeContext):
        """Create an Arvados container request for this workflow.

        The returned dict can be used to create a container passed as
        the +body+ argument to container_requests().create().
        """

        adjustDirObjs(self.job_order, trim_listing)
        visit_class(self.job_order, ("File", "Directory"), trim_anonymous_location)
        visit_class(self.job_order, ("File", "Directory"), remove_redundant_fields)

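        # Move secret parameter values out of the stored job order: each becomes a text mount under /secrets and the parameter is replaced with a $include reference to that mount.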
        secret_mounts = {}
        for param in sorted(self.job_order.keys()):
            if self.secret_store.has_secret(self.job_order[param]):
                mnt = "/secrets/s%d" % len(secret_mounts)
                secret_mounts[mnt] = {
                    "kind": "text",
                    "content": self.secret_store.retrieve(self.job_order[param])
                }
                self.job_order[param] = {"$include": mnt}

        container_req = {
            "name": self.name,
            "output_path": "/var/spool/cwl",
            "cwd": "/var/spool/cwl",
            "priority": self.priority,
            "state": "Committed",
            "container_image": arvados_jobs_image(self.arvrunner, self.jobs_image),
            "mounts": {
                "/var/lib/cwl/cwl.input.json": {
                    "kind": "json",
                    "content": self.job_order
                },
                "stdout": {
                    "kind": "file",
                    "path": "/var/spool/cwl/cwl.output.json"
                },
                "/var/spool/cwl": {
                    "kind": "collection",
                    "writable": True
                }
            },
            "secret_mounts": secret_mounts,
            "runtime_constraints": {
                "vcpus": math.ceil(self.submit_runner_cores),
                "ram": 1024*1024 * (math.ceil(self.submit_runner_ram) + math.ceil(self.collection_cache_size)),
                "API": True
            },
            "use_existing": self.enable_reuse,
            "properties": {}
        }

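        # If the workflow already lives in Keep, mount that collection at /var/lib/cwl/workflow; otherwise pack it and pass it inline as a JSON mount.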
        if self.embedded_tool.tool.get("id", "").startswith("keep:"):
            sp = self.embedded_tool.tool["id"].split('/')
            workflowcollection = sp[0][5:]
            workflowname = "/".join(sp[1:])
            workflowpath = "/var/lib/cwl/workflow/%s" % workflowname
            container_req["mounts"]["/var/lib/cwl/workflow"] = {
                "kind": "collection",
                "portable_data_hash": "%s" % workflowcollection
            }
        else:
            packed = packed_workflow(self.arvrunner, self.embedded_tool, self.merged_map)
            workflowpath = "/var/lib/cwl/workflow.json#main"
            container_req["mounts"]["/var/lib/cwl/workflow.json"] = {
                "kind": "json",
                "content": packed
            }
            if self.embedded_tool.tool.get("id", "").startswith("arvwf:"):
                container_req["properties"]["template_uuid"] = self.embedded_tool.tool["id"][6:33]


        # --local means execute the workflow instead of submitting a container request
        # --api=containers means use the containers API
        # --no-log-timestamps means don't add timestamps (the logging infrastructure does this)
        # --disable-validate because we already validated so don't need to do it again
        # --eval-timeout is the timeout for javascript invocation
        # --parallel-task-count is the number of threads to use for job submission
        # --enable/disable-reuse sets desired job reuse
        # --collection-cache-size sets aside memory to store collections
        command = ["arvados-cwl-runner",
                   "--local",
                   "--api=containers",
                   "--no-log-timestamps",
                   "--disable-validate",
                   "--eval-timeout=%s" % self.arvrunner.eval_timeout,
                   "--thread-count=%s" % self.arvrunner.thread_count,
                   "--enable-reuse" if self.enable_reuse else "--disable-reuse",
                   "--collection-cache-size=%s" % self.collection_cache_size]

        if self.output_name:
            command.append("--output-name=" + self.output_name)
            container_req["output_name"] = self.output_name

        if self.output_tags:
            command.append("--output-tags=" + self.output_tags)

        if runtimeContext.debug:
            command.append("--debug")

        if runtimeContext.storage_classes != "default":
            command.append("--storage-classes=" + runtimeContext.storage_classes)

        if self.on_error:
            command.append("--on-error=" + self.on_error)

        if self.intermediate_output_ttl:
            command.append("--intermediate-output-ttl=%d" % self.intermediate_output_ttl)

        if self.arvrunner.trash_intermediate:
            command.append("--trash-intermediate")

        if self.arvrunner.project_uuid:
            command.append("--project-uuid="+self.arvrunner.project_uuid)

        command.extend([workflowpath, "/var/lib/cwl/cwl.input.json"])

        container_req["command"] = command

        return container_req
Example No. 31
    def arvados_job_spec(self, dry_run=False, pull_image=True, **kwargs):
        """Create an Arvados container request for this workflow.

        The returned dict can be used to create a container passed as
        the +body+ argument to container_requests().create().
        """

        adjustDirObjs(self.job_order, trim_listing)
        visit_class(self.job_order, ("File", "Directory"), trim_anonymous_location)
        visit_class(self.job_order, ("File", "Directory"), remove_redundant_fields)

        container_req = {
            "owner_uuid": self.arvrunner.project_uuid,
            "name": self.name,
            "output_path": "/var/spool/cwl",
            "cwd": "/var/spool/cwl",
            "priority": 1,
            "state": "Committed",
            "container_image": arvados_jobs_image(self.arvrunner, self.jobs_image),
            "mounts": {
                "/var/lib/cwl/cwl.input.json": {
                    "kind": "json",
                    "content": self.job_order
                },
                "stdout": {
                    "kind": "file",
                    "path": "/var/spool/cwl/cwl.output.json"
                },
                "/var/spool/cwl": {
                    "kind": "collection",
                    "writable": True
                }
            },
            "runtime_constraints": {
                "vcpus": 1,
                "ram": 1024*1024 * self.submit_runner_ram,
                "API": True
            },
            "properties": {}
        }

        if self.tool.tool.get("id", "").startswith("keep:"):
            sp = self.tool.tool["id"].split('/')
            workflowcollection = sp[0][5:]
            workflowname = "/".join(sp[1:])
            workflowpath = "/var/lib/cwl/workflow/%s" % workflowname
            container_req["mounts"]["/var/lib/cwl/workflow"] = {
                "kind": "collection",
                "portable_data_hash": "%s" % workflowcollection
            }
        else:
            packed = packed_workflow(self.arvrunner, self.tool)
            workflowpath = "/var/lib/cwl/workflow.json#main"
            container_req["mounts"]["/var/lib/cwl/workflow.json"] = {
                "kind": "json",
                "content": packed
            }
            if self.tool.tool.get("id", "").startswith("arvwf:"):
                container_req["properties"]["template_uuid"] = self.tool.tool["id"][6:33]

        command = ["arvados-cwl-runner", "--local", "--api=containers", "--no-log-timestamps"]
        if self.output_name:
            command.append("--output-name=" + self.output_name)
            container_req["output_name"] = self.output_name

        if self.output_tags:
            command.append("--output-tags=" + self.output_tags)

        if kwargs.get("debug"):
            command.append("--debug")

        if self.enable_reuse:
            command.append("--enable-reuse")
        else:
            command.append("--disable-reuse")

        if self.on_error:
            command.append("--on-error=" + self.on_error)

        if self.intermediate_output_ttl:
            command.append("--intermediate-output-ttl=%d" % self.intermediate_output_ttl)

        if self.arvrunner.trash_intermediate:
            command.append("--trash-intermediate")

        if self.arvrunner.project_uuid:
            command.append("--project-uuid="+self.arvrunner.project_uuid)

        command.extend([workflowpath, "/var/lib/cwl/cwl.input.json"])

        container_req["command"] = command

        return container_req
Example No. 32
    def execute(self, context):

        post_status(context)

        self.cwlwf, it_is_workflow = load_cwl(
            self.dag.default_args["cwl_workflow"], self.dag.default_args)
        self.cwl_step = [
            step for step in self.cwlwf.steps
            if self.task_id == step.id.split("#")[-1]
        ][0] if it_is_workflow else self.cwlwf

        _logger.info('{0}: Running!'.format(self.task_id))

        upstream_task_ids = [t.task_id for t in self.upstream_list] + \
                            ([self.reader_task_id] if self.reader_task_id else [])
        _logger.debug('{0}: Collecting outputs from: \n{1}'.format(
            self.task_id, json.dumps(upstream_task_ids, indent=4)))

        upstream_data = self.xcom_pull(context=context,
                                       task_ids=upstream_task_ids)
        _logger.info('{0}: Upstream data: \n {1}'.format(
            self.task_id, json.dumps(upstream_data, indent=4)))

        promises = {}
        for data in upstream_data:  # upstream_data is a list of dicts, each with "promises" and optionally "outdir"
            promises = merge(promises, data["promises"])
            if "outdir" in data:
                self.outdir = data["outdir"]

        _d_args = self.dag.default_args

        if not self.outdir:
            self.outdir = _d_args['tmp_folder']

        _logger.debug('{0}: Step inputs: {1}'.format(
            self.task_id, json.dumps(self.cwl_step.tool["inputs"], indent=4)))

        _logger.debug('{0}: Step outputs: {1}'.format(
            self.task_id, json.dumps(self.cwl_step.tool["outputs"], indent=4)))

        jobobj = {}

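        # Build the step's job object: prefer upstream outputs collected in promises, fall back to None for valueFrom-only inputs, then to the declared default.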
        for inp in self.cwl_step.tool["inputs"]:
            jobobj_id = shortname(inp["id"]).split("/")[-1]
            source_ids = []
            promises_outputs = []
            try:
                source_field = inp["source"] if it_is_workflow else inp.get(
                    "id")
                source_ids = [shortname(s)
                              for s in source_field] if isinstance(
                                  source_field,
                                  list) else [shortname(source_field)]
                promises_outputs = [
                    promises[source_id] for source_id in source_ids
                    if source_id in promises
                ]
            except Exception:
                _logger.warning(
                    "{0}: Couldn't find source field in step input: {1}".
                    format(self.task_id, json.dumps(inp, indent=4)))

            _logger.info(
                '{0}: For input {1} with source_ids: {2} found upstream outputs: \n{3}'
                .format(self.task_id, jobobj_id, source_ids, promises_outputs))

            if len(promises_outputs) > 1:
                if inp.get("linkMerge", "merge_nested") == "merge_flattened":
                    jobobj[jobobj_id] = flatten(promises_outputs)
                else:
                    jobobj[jobobj_id] = promises_outputs
            # Should also check for [None], because in that case we need to fall back to the default value
            elif len(promises_outputs) == 1 and (promises_outputs[0]
                                                 is not None):
                jobobj[jobobj_id] = promises_outputs[0]
            elif "valueFrom" in inp:
                jobobj[jobobj_id] = None
            elif "default" in inp:
                d = copy.copy(inp["default"])
                jobobj[jobobj_id] = d
            else:
                continue

        _logger.debug('{0}: Collected job object: \n {1}'.format(
            self.task_id, json.dumps(jobobj, indent=4)))

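        # Apply valueFrom expressions to the collected job object; inputs without valueFrom pass through unchanged.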
        def _post_scatter_eval(shortio, cwl_step):
            _value_from = {
                shortname(i["id"]).split("/")[-1]: i["valueFrom"]
                for i in cwl_step.tool["inputs"] if "valueFrom" in i
            }
            _logger.debug('{0}: Step inputs with valueFrom: \n{1}'.format(
                self.task_id, json.dumps(_value_from, indent=4)))

            def value_from_func(k, v):
                if k in _value_from:
                    return expression.do_eval(_value_from[k],
                                              shortio,
                                              self.cwlwf.tool.get(
                                                  "requirements", []),
                                              None,
                                              None, {},
                                              context=v)
                else:
                    return v

            return {k: value_from_func(k, v) for k, v in shortio.items()}

        job = _post_scatter_eval(jobobj, self.cwl_step)
        _logger.info('{0}: Final job data: \n {1}'.format(
            self.task_id, json.dumps(job, indent=4)))

        _d_args['outdir'] = tempfile.mkdtemp(
            prefix=os.path.join(self.outdir, "step_tmp"))
        _d_args['tmpdir_prefix'] = os.path.join(_d_args['outdir'], 'cwl_tmp_')
        _d_args['tmp_outdir_prefix'] = os.path.join(_d_args['outdir'],
                                                    'cwl_outdir_')

        _d_args["record_container_id"] = True
        _d_args["cidfile_dir"] = _d_args['outdir']
        _d_args["cidfile_prefix"] = self.task_id

        _logger.debug('{0}: Runtime context: \n {1}'.format(self, _d_args))

        executor = SingleJobExecutor()
        runtimeContext = RuntimeContext(_d_args)
        runtimeContext.make_fs_access = getdefault(
            runtimeContext.make_fs_access, StdFsAccess)

        for inp in self.cwl_step.tool["inputs"]:
            if inp.get("not_connected"):
                del job[shortname(inp["id"].split("/")[-1])]

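        # Run the embedded tool against the process's original stderr (the surrounding framework may have redirected sys.stderr), restoring it afterwards.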
        _stderr = sys.stderr
        sys.stderr = sys.__stderr__
        (output, status) = executor(
            self.cwl_step.embedded_tool if it_is_workflow else self.cwl_step,
            job,
            runtimeContext,
            logger=_logger)
        sys.stderr = _stderr

        if not output and status == "permanentFail":
            raise ValueError("{0}: step failed (permanentFail) without producing output".format(self.task_id))

        _logger.debug('{0}: Embedded tool outputs: \n {1}'.format(
            self.task_id, json.dumps(output, indent=4)))

        promises = {}

        for out in self.cwl_step.tool["outputs"]:

            out_id = shortname(out["id"])
            jobout_id = out_id.split("/")[-1]
            try:
                promises[out_id] = output[jobout_id]
            except Exception:
                continue

        # Unset the Generation marker on the final output File objects
        visit_class(promises, ("File", ), MutationManager().unset_generation)

        data = {"promises": promises, "outdir": self.outdir}

        _logger.info('{0}: Output: \n {1}'.format(self.task_id,
                                                  json.dumps(data, indent=4)))

        return data
Example No. 33
def upload_dependencies(arvrunner, name, document_loader,
                        workflowobj, uri, loadref_run,
                        include_primary=True, discovered_secondaryfiles=None):
    """Upload the dependencies of the workflowobj document to Keep.

    Returns a pathmapper object mapping local paths to keep references.  Also
    does an in-place update of references in "workflowobj".

    Use scandeps to find $import, $include, $schemas, run, File and Directory
    fields that represent external references.

    If workflowobj has an "id" field, this will reload the document to ensure
    it is scanning the raw document prior to preprocessing.
    """

    loaded = set()
    def loadref(b, u):
        joined = document_loader.fetcher.urljoin(b, u)
        defrg, _ = urlparse.urldefrag(joined)
        if defrg not in loaded:
            loaded.add(defrg)
            # Use fetch_text to get raw file (before preprocessing).
            text = document_loader.fetch_text(defrg)
            if isinstance(text, bytes):
                textIO = StringIO(text.decode('utf-8'))
            else:
                textIO = StringIO(text)
            return yaml.safe_load(textIO)
        else:
            return {}

    if loadref_run:
        loadref_fields = set(("$import", "run"))
    else:
        loadref_fields = set(("$import",))

    scanobj = workflowobj
    if "id" in workflowobj:
        # Need raw file content (before preprocessing) to ensure
        # that external references in $include and $mixin are captured.
        scanobj = loadref("", workflowobj["id"])

    sc_result = scandeps(uri, scanobj,
                  loadref_fields,
                  set(("$include", "$schemas", "location")),
                  loadref, urljoin=document_loader.fetcher.urljoin)

    sc = []
    def only_real(obj):
        if obj.get("location", "").startswith("file:"):
            sc.append(obj)

    visit_class(sc_result, ("File", "Directory"), only_real)

    normalizeFilesDirs(sc)

    if include_primary and "id" in workflowobj:
        sc.append({"class": "File", "location": workflowobj["id"]})

    if "$schemas" in workflowobj:
        for s in workflowobj["$schemas"]:
            sc.append({"class": "File", "location": s})

    def visit_default(obj):
        remove = [False]
        def ensure_default_location(f):
            if "location" not in f and "path" in f:
                f["location"] = f["path"]
                del f["path"]
            if "location" in f and not arvrunner.fs_access.exists(f["location"]):
                # Doesn't exist, remove from list of dependencies to upload
                sc[:] = [x for x in sc if x["location"] != f["location"]]
                # Delete "default" from workflowobj
                remove[0] = True
        visit_class(obj["default"], ("File", "Directory"), ensure_default_location)
        if remove[0]:
            del obj["default"]

    find_defaults(workflowobj, visit_default)

    discovered = {}
    def discover_default_secondary_files(obj):
        discover_secondary_files(obj["inputs"],
                                 {shortname(t["id"]): t["default"] for t in obj["inputs"] if "default" in t},
                                 discovered)

    visit_class(workflowobj, ("CommandLineTool", "Workflow"), discover_default_secondary_files)

    for d in discovered:
        sc.extend(discovered[d])

    mapper = ArvPathMapper(arvrunner, sc, "",
                           "keep:%s",
                           "keep:%s/%s",
                           name=name,
                           single_collection=True)

    def setloc(p):
        if "location" in p and (not p["location"].startswith("_:")) and (not p["location"].startswith("keep:")):
            p["location"] = mapper.mapper(p["location"]).resolved

    visit_class(workflowobj, ("File", "Directory"), setloc)
    visit_class(discovered, ("File", "Directory"), setloc)

    if discovered_secondaryfiles is not None:
        for d in discovered:
            discovered_secondaryfiles[mapper.mapper(d).resolved] = discovered[d]

    if "$schemas" in workflowobj:
        sch = []
        for s in workflowobj["$schemas"]:
            sch.append(mapper.mapper(s).resolved)
        workflowobj["$schemas"] = sch

    return mapper