Example 1
        # Nested helper: "t" (the parameter definition carrying the secondaryFiles
        # patterns) and "substitute" come from the enclosing scope.
        def setSecondary(fileobj):
            if isinstance(fileobj, dict) and fileobj.get("class") == "File":
                if "secondaryFiles" not in fileobj:
                    fileobj["secondaryFiles"] = [{"location": substitute(fileobj["location"], sf), "class": "File"} for sf in t["secondaryFiles"]]

            if isinstance(fileobj, list):
                for e in fileobj:
                    setSecondary(e)
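
Every example in this listing leans on a substitute(location, pattern) helper provided by the CWL tooling, which applies the CWL secondaryFiles naming rule: each leading "^" in the pattern strips one extension from the primary location before the remainder of the pattern is appended. The following is only a minimal, hypothetical sketch of that rule for running the snippets standalone; it is not cwltool's implementation.

def substitute(value, pattern):
    # Minimal sketch of the CWL secondaryFiles suffix rule (illustrative only):
    # every leading "^" drops one extension from the primary path, then the
    # rest of the pattern is appended.
    while pattern.startswith("^"):
        pattern = pattern[1:]
        dot = value.rfind(".")
        if dot > 0:
            value = value[:dot]
    return value + pattern

# substitute("file:///data/sample.bam", ".bai")  -> "file:///data/sample.bam.bai"
# substitute("file:///data/sample.bam", "^.bai") -> "file:///data/sample.bai"
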
Example 2
def setSecondary(t, fileobj, discovered):
    # "t" is the parameter definition carrying the secondaryFiles patterns;
    # "discovered", when not None, records location -> secondaryFiles mappings.
    if isinstance(fileobj, dict) and fileobj.get("class") == "File":
        if "secondaryFiles" not in fileobj:
            fileobj["secondaryFiles"] = cmap([{"location": substitute(fileobj["location"], sf), "class": "File"} for sf in t["secondaryFiles"]])
            if discovered is not None:
                discovered[fileobj["location"]] = fileobj["secondaryFiles"]
    elif isinstance(fileobj, list):
        for e in fileobj:
            setSecondary(t, e, discovered)
Example 3
def set_secondary(typedef, fileobj, discovered):
    """
    Pull over missing secondaryFiles to the job object entry.

    Adapted from:
    https://github.com/curoverse/arvados/blob/2b0b06579199967eca3d44d955ad64195d2db3c3/sdk/cwl/arvados_cwl/runner.py#L67
    """
    if isinstance(fileobj, MutableMapping) and fileobj.get("class") == "File":
        if "secondaryFiles" not in fileobj:
            fileobj["secondaryFiles"] = cmap(
                [{"location": substitute(fileobj["location"], sf["pattern"]),
                  "class": "File"} for sf in typedef["secondaryFiles"]])
            if discovered is not None:
                discovered[fileobj["location"]] = fileobj["secondaryFiles"]
    elif isinstance(fileobj, MutableSequence):
        for entry in fileobj:
            set_secondary(typedef, entry, discovered)
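
A hypothetical invocation of the function above (the typedef, job object, and file location are made up for illustration; cmap and substitute are assumed to be importable from the CWL tooling):

typedef = {"secondaryFiles": [{"pattern": ".bai"}]}
job_order = {"reads": {"class": "File", "location": "file:///data/sample.bam"}}
discovered = {}

set_secondary(typedef, job_order["reads"], discovered)

# job_order["reads"]["secondaryFiles"] is now
#   [{"location": "file:///data/sample.bam.bai", "class": "File"}]
# and the same list is recorded under discovered["file:///data/sample.bam"].
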
Example 4
def set_secondary(fsaccess, builder, inputschema, secondaryspec, primary,
                  discovered):
    # NOTE: "basestring", "Mapping", and "Sequence" are presumably supplied by the
    # module's imports (a Python 2/3 compat shim and collections.abc); under plain
    # Python 3, "basestring" would read "str".
    if isinstance(inputschema, Sequence) and not isinstance(inputschema, basestring):
        # union type, collect all possible secondaryFiles
        for i in inputschema:
            set_secondary(fsaccess, builder, i, secondaryspec, primary,
                          discovered)
        return

    if isinstance(inputschema, basestring):
        sd = search_schemadef(inputschema,
                              reversed(builder.hints + builder.requirements))
        if sd:
            inputschema = sd
        else:
            return

    if "secondaryFiles" in inputschema:
        # set secondaryFiles, may be inherited by compound types.
        secondaryspec = inputschema["secondaryFiles"]

    if (isinstance(inputschema["type"], (Mapping, Sequence))
            and not isinstance(inputschema["type"], basestring)):
        # compound type (union, array, record)
        set_secondary(fsaccess, builder, inputschema["type"], secondaryspec,
                      primary, discovered)

    elif (inputschema["type"] == "record" and isinstance(primary, Mapping)):
        #
        # record type, find secondary files associated with fields.
        #
        for f in inputschema["fields"]:
            p = primary.get(shortname(f["name"]))
            if p:
                set_secondary(fsaccess, builder, f, secondaryspec, p,
                              discovered)

    elif (inputschema["type"] == "array" and isinstance(primary, Sequence)):
        #
        # array type, find secondary files of elements
        #
        for p in primary:
            set_secondary(fsaccess, builder, {"type": inputschema["items"]},
                          secondaryspec, p, discovered)

    elif (inputschema["type"] == "File" and secondaryspec
          and isinstance(primary, Mapping) and primary.get("class") == "File"
          and "secondaryFiles" not in primary):
        #
        # Found a file, check for secondaryFiles
        #
        specs = []
        primary["secondaryFiles"] = secondaryspec
        for i, sf in enumerate(aslist(secondaryspec)):
            if builder.cwlVersion == "v1.0":
                pattern = builder.do_eval(sf, context=primary)
            else:
                pattern = builder.do_eval(sf["pattern"], context=primary)
            if pattern is None:
                continue
            if isinstance(pattern, list):
                specs.extend(pattern)
            elif isinstance(pattern, dict):
                specs.append(pattern)
            elif isinstance(pattern, str):
                if builder.cwlVersion == "v1.0":
                    specs.append({"pattern": pattern, "required": True})
                else:
                    specs.append({
                        "pattern": pattern,
                        "required": sf.get("required")
                    })
            else:
                raise SourceLine(
                    primary["secondaryFiles"], i,
                    validate.ValidationException).makeError(
                        "Expression must return list, object, string or null")

        found = []
        for i, sf in enumerate(specs):
            if isinstance(sf, dict):
                if sf.get("class") == "File":
                    pattern = None
                    if sf.get("location") is None:
                        raise SourceLine(
                            primary["secondaryFiles"], i,
                            validate.ValidationException).makeError(
                                "File object is missing 'location': %s" % sf)
                    sfpath = sf["location"]
                    required = True
                else:
                    pattern = sf["pattern"]
                    required = sf.get("required")
            elif isinstance(sf, str):
                pattern = sf
                required = True
            else:
                raise SourceLine(
                    primary["secondaryFiles"], i,
                    validate.ValidationException).makeError(
                        "Expression must return list, object, string or null")

            if pattern is not None:
                sfpath = substitute(primary["location"], pattern)

            required = builder.do_eval(required, context=primary)

            if fsaccess.exists(sfpath):
                if pattern is not None:
                    found.append({"location": sfpath, "class": "File"})
                else:
                    found.append(sf)
            elif required:
                raise SourceLine(
                    primary["secondaryFiles"], i,
                    validate.ValidationException).makeError(
                        "Required secondary file '%s' does not exist" % sfpath)

        primary["secondaryFiles"] = cmap(found)
        if discovered is not None:
            discovered[primary["location"]] = primary["secondaryFiles"]
    elif inputschema["type"] not in primitive_types_set:
        set_secondary(fsaccess, builder, inputschema["type"], secondaryspec,
                      primary, discovered)
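
The cwlVersion == "v1.0" branches above exist because the shape of a secondaryFiles specification changed between CWL versions: v1.0 entries are bare suffix strings or expressions, whereas v1.1 and later entries are objects with pattern and required fields. A made-up illustration of the two shapes this function handles:

# CWL v1.0: entries are plain suffix patterns or expressions.
secondaryspec_v1_0 = [".bai", "$(self.nameroot).dict"]

# CWL v1.1 and later: entries are objects with an optional "required" flag.
secondaryspec_v1_1 = [
    {"pattern": ".bai", "required": True},
    {"pattern": "$(self.nameroot).dict", "required": False},
]
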
Example 5
    def collect_output(
            self,
            schema,  # type: Dict[Text, Any]
            builder,  # type: Builder
            outdir,  # type: Text
            fs_access,  # type: StdFsAccess
            compute_checksum=True  # type: bool
    ):
        # type: (...) -> Optional[Union[Dict[Text, Any], List[Union[Dict[Text, Any], Text]]]]
        result = []  # type: List[Any]
        empty_and_optional = False
        debug = LOGGER.isEnabledFor(logging.DEBUG)
        if "outputBinding" in schema:
            binding = schema["outputBinding"]
            globpatterns = []  # type: List[Text]

            revmap = partial(command_line_tool.revmap_file, builder, outdir)

            if "glob" in binding:
                with SourceLine(binding, "glob", WorkflowException, debug):
                    for glob in aslist(binding["glob"]):
                        glob = builder.do_eval(glob)
                        if glob:
                            globpatterns.extend(aslist(glob))

                    for glob in globpatterns:
                        if glob.startswith(outdir):
                            glob = glob[len(outdir) + 1:]
                        elif glob == ".":
                            glob = outdir
                        elif glob.startswith("/"):
                            raise WorkflowException(
                                "glob patterns must not start with '/'")
                        try:
                            prefix = fs_access.glob(outdir)
                            key = cmp_to_key(
                                cast(Callable[[Text, Text], int],
                                     locale.strcoll))

                            # Create the stdout/stderr log file if it was declared but not produced
                            if "stdout" in self.tool and "stderr" in self.tool \
                                    and glob in (self.tool["stdout"], self.tool["stderr"]):
                                filepath = Path(fs_access.join(outdir, glob))
                                if not filepath.is_file():
                                    Path(filepath).touch()

                            result.extend([{
                                "location": g,
                                "path": fs_access.join(builder.outdir,
                                                       g[len(prefix[0]) + 1:]),
                                "basename": os.path.basename(g),
                                "nameroot": os.path.splitext(os.path.basename(g))[0],
                                "nameext": os.path.splitext(os.path.basename(g))[1],
                                "class": "File" if fs_access.isfile(g) else "Directory",
                            } for g in sorted(fs_access.glob(fs_access.join(outdir, glob)),
                                              key=key)])
                        except (OSError, IOError) as exc:
                            LOGGER.warning(Text(exc))
                        except Exception:
                            LOGGER.exception("Unexpected error from fs_access")
                            raise

                for files in result:
                    rfile = files.copy()
                    # TODO: this function raises an exception and seems to be related
                    # to Docker (which is not used here)
                    # revmap(rfile)
                    if files["class"] == "Directory":
                        load_listing = builder.loadListing or (
                            binding and binding.get("loadListing"))
                        if load_listing and load_listing != "no_listing":
                            get_listing(fs_access, files,
                                        (load_listing == "deep_listing"))
                    else:
                        with fs_access.open(rfile["location"], "rb") as f:
                            contents = b""
                            if binding.get("loadContents") or compute_checksum:
                                contents = f.read(CONTENT_LIMIT)
                            if binding.get("loadContents"):
                                files["contents"] = contents.decode("utf-8")
                            if compute_checksum:
                                checksum = hashlib.sha1()  # nosec: B303
                                while contents != b"":
                                    checksum.update(contents)
                                    contents = f.read(1024 * 1024)
                                files["checksum"] = "sha1$%s" % checksum.hexdigest()
                            f.seek(0, 2)
                            file_size = f.tell()
                        files["size"] = file_size

            optional = False
            single = False
            if isinstance(schema["type"], list):
                if "null" in schema["type"]:
                    optional = True
                if "File" in schema["type"] or "Directory" in schema["type"]:
                    single = True
            elif schema["type"] == "File" or schema["type"] == "Directory":
                single = True

            if "outputEval" in binding:
                with SourceLine(binding, "outputEval", WorkflowException,
                                debug):
                    result = builder.do_eval(binding["outputEval"],
                                             context=result)

            if single:
                if not result and not optional:
                    with SourceLine(binding, "glob", WorkflowException, debug):
                        raise WorkflowException(
                            "Did not find output file with glob pattern: '{}'".
                            format(globpatterns))
                elif not result and optional:
                    pass
                elif isinstance(result, list):
                    if len(result) > 1:
                        raise WorkflowException(
                            "Multiple matches for output item that is a single file."
                        )
                    result = result[0]

            if "secondaryFiles" in schema:
                with SourceLine(schema, "secondaryFiles", WorkflowException,
                                debug):
                    for primary in aslist(result):
                        if isinstance(primary, dict):
                            primary.setdefault("secondaryFiles", [])
                            pathprefix = primary["path"][0:primary["path"].rindex("/") + 1]
                            for file in aslist(schema["secondaryFiles"]):
                                if isinstance(file, dict) or "$(" in file or "${" in file:
                                    sfpath = builder.do_eval(file, context=primary)
                                    subst = False
                                else:
                                    sfpath = file
                                    subst = True
                                for sfitem in aslist(sfpath):
                                    if isinstance(sfitem, str):
                                        if subst:
                                            sfitem = {"path": substitute(primary["path"], sfitem)}
                                        else:
                                            sfitem = {"path": pathprefix + sfitem}
                                    if "path" in sfitem and "location" not in sfitem:
                                        revmap(sfitem)
                                    if fs_access.isfile(sfitem["location"]):
                                        sfitem["class"] = "File"
                                        primary["secondaryFiles"].append(
                                            sfitem)
                                    elif fs_access.isdir(sfitem["location"]):
                                        sfitem["class"] = "Directory"
                                        primary["secondaryFiles"].append(
                                            sfitem)

            if "format" in schema:
                for primary in aslist(result):
                    primary["format"] = builder.do_eval(schema["format"],
                                                        context=primary)

            # Ensure files point to local references outside of the run environment
            # TODO: Again removing revmap....
            # adjustFileObjs(result, revmap)

            if not result and optional:
                return None

        if not empty_and_optional and isinstance(
                schema["type"], dict) and schema["type"]["type"] == "record":
            out = {}
            for f in schema["type"]["fields"]:
                out[shortname(
                    f["name"])] = self.collect_output(  # type: ignore
                        f,
                        builder,
                        outdir,
                        fs_access,
                        compute_checksum=compute_checksum)
            return out
        return result
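
The per-file bookkeeping at the end of the glob handling (loadContents, SHA-1 checksum, size) is easier to see in isolation. A standalone sketch of the same incremental-hash pattern over a local file; CONTENT_LIMIT here is only a stand-in for cwltool's content-size cap:

import hashlib

CONTENT_LIMIT = 64 * 1024  # stand-in for cwltool's cap on loaded content

def checksum_and_size(path):
    # Mirrors the loop above: hash the first chunk that was already read for
    # loadContents, continue in 1 MiB blocks, and take the size from the final
    # file position.
    checksum = hashlib.sha1()  # nosec: B303
    with open(path, "rb") as f:
        contents = f.read(CONTENT_LIMIT)
        while contents != b"":
            checksum.update(contents)
            contents = f.read(1024 * 1024)
        f.seek(0, 2)
        size = f.tell()
    return "sha1$%s" % checksum.hexdigest(), size
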
Example 6
    def collect_output(
            self,
            schema,  # type: Dict[Text, Any]
            builder,  # type: Builder
            outdir,  # type: Text
            fs_access,  # type: StdFsAccess
            compute_checksum=True  # type: bool
    ):
        # type: (...) -> Optional[Union[Dict[Text, Any], List[Union[Dict[Text, Any], Text]]]]
        """
        Collect outputs from the step :term:`Process` following its execution.

        .. note::
            When the :term:`CWL` runner tries to forward ``step(i) outputs -> step(i+1) inputs``
            using :meth:`collect_outputs`, it expects exact ``outputBindings`` locations to be matched.
            In other words, a definition like ``outputBindings: {glob: outputs/*.txt}`` will generate results located
            in ``step(i)`` as ``"<tmp-workdir>/outputs/file.txt"`` and ``step(i+1)`` will look explicitly
            in ``"<tmp-workdir>/outputs"`` using the ``glob`` pattern. Because each :term:`Process` in
            the workflow is a distinct/remote entity, each one stages its outputs at different URL locations,
            not sharing the same *root directory*. When we stage intermediate results locally, the sub-dirs are lost.
            Therefore, they act like individual :term:`CWL` runner calls where the *final results* are moved back
            to the local directory for convenient access, but our *local directory* is the URL WPS-outputs location.
            To let :term:`CWL` :term:`Workflow` inter-step mapping work as intended, we must remap the locations,
            ignoring any nested dirs, so that the modified ``outputBindings`` definition can match as if each
            step :term:`Process` output had been generated locally. A standalone sketch of this rebasing rule
            follows the listing.
        """
        result = []  # type: List[Any]
        empty_and_optional = False
        debug = LOGGER.isEnabledFor(logging.DEBUG)
        if "outputBinding" in schema:
            binding = schema["outputBinding"]
            globpatterns = []  # type: List[Text]

            revmap = partial(command_line_tool.revmap_file, builder, outdir)

            if "glob" in binding:
                with SourceLine(binding, "glob", WorkflowException, debug):
                    for glob in aslist(binding["glob"]):
                        glob = builder.do_eval(glob)
                        if glob:
                            globpatterns.extend(aslist(glob))

                    # rebase glob pattern as applicable (see note)
                    for glob in list(globpatterns):
                        if not any(
                                glob.startswith(part)
                                for part in [".", "/", "~"]) and "/" in glob:
                            glob = builder.do_eval(glob.split("/")[-1])
                            if glob:
                                globpatterns.extend(aslist(glob))

                    for glob in globpatterns:
                        if glob.startswith(outdir):
                            glob = glob[len(outdir) + 1:]
                        elif glob == ".":
                            glob = outdir
                        elif glob.startswith("/"):
                            raise WorkflowException(
                                "glob patterns must not start with '/'")
                        try:
                            prefix = fs_access.glob(outdir)
                            key = cmp_to_key(
                                cast(Callable[[Text, Text], int],
                                     locale.strcoll))

                            # Create the stdout/stderr log file if it was declared but not produced
                            if "stdout" in self.tool and "stderr" in self.tool \
                                    and glob in (self.tool["stdout"], self.tool["stderr"]):
                                filepath = Path(fs_access.join(outdir, glob))
                                if not filepath.is_file():
                                    Path(filepath).touch()

                            result.extend([{
                                "location": g,
                                "path": fs_access.join(builder.outdir,
                                                       g[len(prefix[0]) + 1:]),
                                "basename": os.path.basename(g),
                                "nameroot": os.path.splitext(os.path.basename(g))[0],
                                "nameext": os.path.splitext(os.path.basename(g))[1],
                                "class": "File" if fs_access.isfile(g) else "Directory",
                            } for g in sorted(fs_access.glob(fs_access.join(outdir, glob)),
                                              key=key)])
                        except (OSError, IOError) as exc:
                            LOGGER.warning(Text(exc))
                        except Exception:
                            LOGGER.exception("Unexpected error from fs_access")
                            raise

                for files in result:
                    rfile = files.copy()
                    # TODO: this function raises an exception and seems to be related
                    # to Docker (which is not used here)
                    # revmap(rfile)
                    if files["class"] == "Directory":
                        load_listing = builder.loadListing or (
                            binding and binding.get("loadListing"))
                        if load_listing and load_listing != "no_listing":
                            get_listing(fs_access, files,
                                        (load_listing == "deep_listing"))
                    else:
                        with fs_access.open(rfile["location"], "rb") as f:
                            contents = b""
                            if binding.get("loadContents") or compute_checksum:
                                contents = f.read(CONTENT_LIMIT)
                            if binding.get("loadContents"):
                                files["contents"] = contents.decode("utf-8")
                            if compute_checksum:
                                checksum = hashlib.sha1()  # nosec: B303
                                while contents != b"":
                                    checksum.update(contents)
                                    contents = f.read(1024 * 1024)
                                files["checksum"] = f"sha1${checksum.hexdigest()}"
                            f.seek(0, 2)
                            file_size = f.tell()
                        files["size"] = file_size

            optional = False
            single = False
            if isinstance(schema["type"], list):
                if "null" in schema["type"]:
                    optional = True
                if "File" in schema["type"] or "Directory" in schema["type"]:
                    single = True
            elif schema["type"] == "File" or schema["type"] == "Directory":
                single = True

            if "outputEval" in binding:
                with SourceLine(binding, "outputEval", WorkflowException,
                                debug):
                    result = builder.do_eval(binding["outputEval"],
                                             context=result)

            if single:
                if not result and not optional:
                    with SourceLine(binding, "glob", WorkflowException, debug):
                        raise WorkflowException(
                            f"Did not find output file with glob pattern: '{globpatterns}'"
                        )
                elif not result and optional:
                    pass
                elif isinstance(result, list):
                    if len(result) > 1:
                        raise WorkflowException(
                            "Multiple matches for output item that is a single file."
                        )
                    result = result[0]

            if "secondaryFiles" in schema:
                with SourceLine(schema, "secondaryFiles", WorkflowException,
                                debug):
                    for primary in aslist(result):
                        if isinstance(primary, dict):
                            primary.setdefault("secondaryFiles", [])
                            pathprefix = primary["path"][0:primary["path"].rindex("/") + 1]
                            for file in aslist(schema["secondaryFiles"]):
                                if isinstance(file, dict) or "$(" in file or "${" in file:
                                    sfpath = builder.do_eval(file, context=primary)
                                    subst = False
                                else:
                                    sfpath = file
                                    subst = True
                                for sfitem in aslist(sfpath):
                                    if isinstance(sfitem, str):
                                        if subst:
                                            sfitem = {"path": substitute(primary["path"], sfitem)}
                                        else:
                                            sfitem = {"path": pathprefix + sfitem}
                                    if "path" in sfitem and "location" not in sfitem:
                                        revmap(sfitem)
                                    if fs_access.isfile(sfitem["location"]):
                                        sfitem["class"] = "File"
                                        primary["secondaryFiles"].append(
                                            sfitem)
                                    elif fs_access.isdir(sfitem["location"]):
                                        sfitem["class"] = "Directory"
                                        primary["secondaryFiles"].append(
                                            sfitem)

            if "format" in schema:
                for primary in aslist(result):
                    primary["format"] = builder.do_eval(schema["format"],
                                                        context=primary)

            # Ensure files point to local references outside of the run environment
            # TODO: Again removing revmap....
            # adjustFileObjs(result, revmap)

            if not result and optional:
                return None

        if not empty_and_optional and isinstance(
                schema["type"], dict) and schema["type"]["type"] == "record":
            out = {}
            for f in schema["type"]["fields"]:
                out[shortname(
                    f["name"])] = self.collect_output(  # type: ignore
                        f,
                        builder,
                        outdir,
                        fs_access,
                        compute_checksum=compute_checksum)
            return out
        return result
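
The rebasing described in the docstring boils down to the small loop near the top of the glob handling: for every relative pattern that still contains a directory part, the bare file name is also tried (the real code re-evaluates the rebased pattern through builder.do_eval, which this sketch skips). A standalone, made-up illustration:

def rebase_globs(globpatterns):
    # For every relative pattern with a directory component, also try matching
    # on the file-name part alone, mirroring the loop above.
    rebased = list(globpatterns)
    for glob in globpatterns:
        if not any(glob.startswith(part) for part in [".", "/", "~"]) and "/" in glob:
            rebased.append(glob.split("/")[-1])
    return rebased

# rebase_globs(["outputs/*.txt", "./report.json"])
#   -> ["outputs/*.txt", "./report.json", "*.txt"]
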