Example #1
    def validate_hints(self, avsc_names, hints, strict):
        # type: (Any, List[Dict[Text, Any]], bool) -> None
        for i, r in enumerate(hints):
            sl = SourceLine(hints, i, validate.ValidationException)
            with sl:
                if avsc_names.get_name(r["class"], "") is not None:
                    plain_hint = dict((key, r[key]) for key in r if key not in
                                      self.doc_loader.identifiers)  # strip identifiers
                    validate.validate_ex(
                        avsc_names.get_name(plain_hint["class"], ""),
                        plain_hint, strict=strict)
                else:
                    _logger.info(sl.makeError(u"Unknown hint %s" % (r["class"])))
Example #2
    def job(self, joborder, output_callback, **kwargs):
        # type: (Dict[Text, Any], Callable[[Any, Any], Any], **Any) -> Generator
        self.state = {}
        self.processStatus = "success"

        if "outdir" in kwargs:
            del kwargs["outdir"]

        for e, i in enumerate(self.tool["inputs"]):
            with SourceLine(self.tool["inputs"], e, WorkflowException,
                            _logger.isEnabledFor(logging.DEBUG)):
                iid = shortname(i["id"])
                if iid in joborder:
                    self.state[i["id"]] = WorkflowStateItem(
                        i, copy.deepcopy(joborder[iid]), "success")
                elif "default" in i:
                    self.state[i["id"]] = WorkflowStateItem(
                        i, copy.deepcopy(i["default"]), "success")
                else:
                    raise WorkflowException(
                        u"Input '%s' not in input object and does not have a default value."
                        % (i["id"]))

        for s in self.steps:
            for out in s.tool["outputs"]:
                self.state[out["id"]] = None

        completed = 0
        while completed < len(self.steps):
            self.made_progress = False

            for step in self.steps:
                if (kwargs.get("on_error", "stop") == "stop"
                        and self.processStatus != "success"):
                    break

                if not step.submitted:
                    try:
                        step.iterable = self.try_make_job(
                            step, output_callback, **kwargs)
                    except WorkflowException as e:
                        _logger.error(u"[%s] Cannot make job: %s", step.name,
                                      e)
                        _logger.debug("", exc_info=True)
                        self.processStatus = "permanentFail"

                if step.iterable:
                    try:
                        for newjob in step.iterable:
                            if (kwargs.get("on_error", "stop") == "stop"
                                    and self.processStatus != "success"):
                                break
                            if newjob:
                                self.made_progress = True
                                yield newjob
                            else:
                                break
                    except WorkflowException as e:
                        _logger.error(u"[%s] Cannot make job: %s", step.name,
                                      e)
                        _logger.debug("", exc_info=True)
                        self.processStatus = "permanentFail"

            completed = sum(1 for s in self.steps if s.completed)

            if not self.made_progress and completed < len(self.steps):
                if self.processStatus != "success":
                    break
                else:
                    yield None

        if not self.did_callback:
            self.do_output_callback(output_callback)
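This method is a generator: it yields runnable jobs as steps become ready, yields None when no step can progress yet, and returns once every step has completed or the run has failed. A hypothetical driver loop, with the `wf`, `joborder` and `callback` names assumed:

# Sketch of driving the generator; the job interface is an assumption.
for runnable in wf.job(joborder, callback, on_error="continue"):
    if runnable is not None:
        runnable.run()  # assumed; real callers forward their kwargs here
    # on a None yield the caller may wait for outstanding jobs to finish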
Example #3
    def run(self, runtimeContext):
        # type: (RuntimeContext) -> None

        (docker_req, docker_is_req) = self.get_requirement("DockerRequirement")
        self.prov_obj = runtimeContext.prov_obj
        img_id = None
        env = cast(MutableMapping[Text, Text], os.environ)
        user_space_docker_cmd = runtimeContext.user_space_docker_cmd
        if docker_req and user_space_docker_cmd:
            # For user-space docker implementations, a local image name or ID
            # takes precedence over a network pull
            if 'dockerImageId' in docker_req:
                img_id = str(docker_req["dockerImageId"])
            elif 'dockerPull' in docker_req:
                img_id = str(docker_req["dockerPull"])
            else:
                raise WorkflowException(
                    SourceLine(docker_req).makeError(
                        "Docker image must be specified as 'dockerImageId' or "
                        "'dockerPull' when using user space implementations of "
                        "Docker"))
        else:
            try:
                if docker_req and runtimeContext.use_container:
                    img_id = str(
                        self.get_from_requirements(
                            docker_req, True, runtimeContext.pull_image,
                            getdefault(runtimeContext.force_docker_pull,
                                       False),
                            getdefault(runtimeContext.tmp_outdir_prefix,
                                       DEFAULT_TMP_PREFIX)))
                if img_id is None and self.builder.find_default_container:
                    default_container = self.builder.find_default_container()
                    if default_container:
                        img_id = str(default_container)

                if docker_req and img_id is None and runtimeContext.use_container:
                    raise Exception("Docker image not available")

                if self.prov_obj and img_id and runtimeContext.process_run_id:
                    # TODO: Integrate with record_container_id
                    container_agent = self.prov_obj.document.agent(
                        uuid.uuid4().urn, {
                            "prov:type": PROV["SoftwareAgent"],
                            "cwlprov:image": img_id,
                            "prov:label":
                                "Container execution of image %s" % img_id
                        })
                    # FIXME: img_id is not a sha256 id, it might just be "debian:8"
                    #img_entity = document.entity("nih:sha-256;%s" % img_id,
                    #                  {"prov:label": "Container image %s" % img_id} )
                    # The image is the plan for this activity-agent association
                    #document.wasAssociatedWith(process_run_ID, container_agent, img_entity)
                    self.prov_obj.document.wasAssociatedWith(
                        runtimeContext.process_run_id, container_agent)
            except Exception as err:
                container = "Singularity" if runtimeContext.singularity else "Docker"
                _logger.debug("%s error", container, exc_info=True)
                if docker_is_req:
                    raise UnsupportedRequirement(
                        "%s is required to run this tool: %s" %
                        (container, err))
                else:
                    raise WorkflowException(
                        "{0} is not available for this tool, try "
                        "--no-container to disable {0}, or install "
                        "a user space Docker replacement like uDocker with "
                        "--user-space-docker-cmd.: {1}".format(container, err))

        self._setup(runtimeContext)
        runtime = self.create_runtime(env, runtimeContext)
        runtime.append(img_id)
        self._execute(runtime, env, runtimeContext)
Example #4
    def __init__(self, toolpath_object, **kwargs):
        # type: (Dict[Text, Any], **Any) -> None
        """
        kwargs:

        metadata: tool document metadata
        requirements: inherited requirements
        hints: inherited hints
        loader: schema_salad.ref_resolver.Loader used to load tool document
        avsc_names: CWL Avro schema object used to validate document
        strict: flag to determine strict validation (fail on unrecognized fields)
        """

        self.metadata = kwargs.get("metadata", {})  # type: Dict[Text,Any]
        self.names = None  # type: schema.Names

        global SCHEMA_FILE, SCHEMA_DIR, SCHEMA_ANY  # pylint: disable=global-statement
        if SCHEMA_FILE is None:
            get_schema("v1.0")
            SCHEMA_ANY = cast(
                Dict[Text, Any],
                SCHEMA_CACHE["v1.0"][3].idx["https://w3id.org/cwl/salad#Any"])
            SCHEMA_FILE = cast(
                Dict[Text, Any],
                SCHEMA_CACHE["v1.0"][3].idx["https://w3id.org/cwl/cwl#File"])
            SCHEMA_DIR = cast(
                Dict[Text, Any],
                SCHEMA_CACHE["v1.0"][3].idx["https://w3id.org/cwl/cwl#Directory"])

        names = schema.make_avro_schema([SCHEMA_FILE, SCHEMA_DIR, SCHEMA_ANY],
                                        Loader({}))[0]
        if isinstance(names, schema.SchemaParseException):
            raise names
        else:
            self.names = names
        self.tool = toolpath_object
        self.requirements = (
            kwargs.get("requirements", [])
            + self.tool.get("requirements", [])
            + get_overrides(kwargs.get("overrides", []),
                            self.tool["id"]).get("requirements", []))
        self.hints = kwargs.get("hints", []) + self.tool.get("hints", [])
        self.formatgraph = None  # type: Graph
        if "loader" in kwargs:
            self.formatgraph = kwargs["loader"].graph

        self.doc_loader = kwargs["loader"]
        self.doc_schema = kwargs["avsc_names"]

        checkRequirements(self.tool, supportedProcessRequirements)
        self.validate_hints(kwargs["avsc_names"],
                            self.tool.get("hints", []),
                            strict=kwargs.get("strict"))

        self.schemaDefs = {}  # type: Dict[Text,Dict[Text, Any]]

        sd, _ = self.get_requirement("SchemaDefRequirement")

        if sd:
            sdtypes = sd["types"]
            av = schema.make_valid_avro(
                sdtypes,
                {t["name"]: t for t in avroize_type(sdtypes)},
                set())
            for i in av:
                self.schemaDefs[i["name"]] = i  # type: ignore
            schema.AvroSchemaFromJSONData(av, self.names)  # type: ignore

        # Build record schema from inputs
        self.inputs_record_schema = {
            "name": "input_record_schema",
            "type": "record",
            "fields": []
        }  # type: Dict[Text, Any]
        self.outputs_record_schema = {
            "name": "outputs_record_schema",
            "type": "record",
            "fields": []
        }  # type: Dict[Text, Any]

        for key in ("inputs", "outputs"):
            for i in self.tool[key]:
                c = copy.copy(i)
                c["name"] = shortname(c["id"])
                del c["id"]

                if "type" not in c:
                    raise validate.ValidationException(u"Missing 'type' in "
                                                       "parameter '%s'" %
                                                       c["name"])

                if "default" in c and "null" not in aslist(c["type"]):
                    c["type"] = ["null"] + aslist(c["type"])
                else:
                    c["type"] = c["type"]
                c["type"] = avroize_type(c["type"], c["name"])
                if key == "inputs":
                    self.inputs_record_schema["fields"].append(c)
                elif key == "outputs":
                    self.outputs_record_schema["fields"].append(c)

        with SourceLine(toolpath_object, "inputs",
                        validate.ValidationException):
            self.inputs_record_schema = cast(
                Dict[six.text_type, Any],
                schema.make_valid_avro(self.inputs_record_schema, {}, set()))
            schema.AvroSchemaFromJSONData(self.inputs_record_schema,
                                          self.names)
        with SourceLine(toolpath_object, "outputs",
                        validate.ValidationException):
            self.outputs_record_schema = cast(
                Dict[six.text_type, Any],
                schema.make_valid_avro(self.outputs_record_schema, {}, set()))
            schema.AvroSchemaFromJSONData(self.outputs_record_schema,
                                          self.names)

        if toolpath_object.get("class") is not None and not kwargs.get(
                "disable_js_validation", False):
            if kwargs.get("js_hint_options_file") is not None:
                try:
                    with open(kwargs["js_hint_options_file"]) as options_file:
                        validate_js_options = json.load(options_file)
                except (OSError, ValueError) as e:
                    _logger.error("Failed to read options file %s" %
                                  kwargs["js_hint_options_file"])
                    raise e
            else:
                validate_js_options = None

            validate_js_expressions(
                cast(CommentedMap, toolpath_object),
                self.doc_schema.names[toolpath_object["class"]],
                validate_js_options)

        dockerReq, is_req = self.get_requirement("DockerRequirement")

        if dockerReq and dockerReq.get("dockerOutputDirectory") and not is_req:
            _logger.warning(
                SourceLine(item=dockerReq, raise_type=Text).makeError(
                    """When 'dockerOutputDirectory' is declared, DockerRequirement
  should go in the 'requirements' section, not 'hints'."""))

        if not (is_req and dockerReq and dockerReq.get(
                "dockerOutputDirectory") == "/var/spool/cwl"):
            # It is legal to have /var/spool/cwl as dockerOutputDirectory
            # only when DockerRequirement is a hard requirement, so the
            # check is skipped in that one case.
            var_spool_cwl_detector(self.tool)
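One detail worth noting in this constructor: a parameter that declares a `default` is made implicitly optional by prepending "null" to its type. A sketch of that rule in isolation, assuming the `aslist` helper used above:

c = {"name": "threads", "type": "int", "default": 1}
if "default" in c and "null" not in aslist(c["type"]):
    c["type"] = ["null"] + aslist(c["type"])
# c["type"] is now ["null", "int"]: the input may be omitted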
Example #5
    def collect_output_ports(self,
                             ports,
                             builder,
                             outdir,
                             compute_checksum=True,
                             jobname="",
                             readers=None):
        # type: (Set[Dict[Text, Any]], Builder, Text, bool, Text, Dict[Text, Any]) -> Dict[Text, Union[Text, List[Any], Dict[Text, Any]]]
        ret = {}  # type: Dict[Text, Union[Text, List[Any], Dict[Text, Any]]]
        debug = _logger.isEnabledFor(logging.DEBUG)
        try:
            fs_access = builder.make_fs_access(outdir)
            custom_output = fs_access.join(outdir, "cwl.output.json")
            if fs_access.exists(custom_output):
                with fs_access.open(custom_output, "r") as f:
                    ret = json.load(f)
                if debug:
                    _logger.debug(u"Raw output from %s: %s", custom_output,
                                  json.dumps(ret, indent=4))
            else:
                for i, port in enumerate(ports):

                    def makeWorkflowException(msg):
                        return WorkflowException(
                            u"Error collecting output for parameter '%s':\n%s"
                            % (shortname(port["id"]), msg))

                    with SourceLine(ports, i, makeWorkflowException, debug):
                        fragment = shortname(port["id"])
                        ret[fragment] = self.collect_output(
                            port,
                            builder,
                            outdir,
                            fs_access,
                            compute_checksum=compute_checksum)
            if ret:
                revmap = partial(revmap_file, builder, outdir)
                adjustDirObjs(ret, trim_listing)
                visit_class(ret, ("File", "Directory"),
                            cast(Callable[[Any], Any], revmap))
                visit_class(ret, ("File", "Directory"), remove_path)
                normalizeFilesDirs(ret)
                visit_class(ret, ("File", "Directory"),
                            partial(check_valid_locations, fs_access))

                if compute_checksum:
                    adjustFileObjs(ret, partial(compute_checksums, fs_access))

            validate.validate_ex(self.names.get_name("outputs_record_schema",
                                                     ""),
                                 ret,
                                 strict=False,
                                 logger=_logger_validation_warnings)
            if ret is not None and builder.mutation_manager is not None:
                adjustFileObjs(ret, builder.mutation_manager.set_generation)
            return ret if ret is not None else {}
        except validate.ValidationException as e:
            raise WorkflowException("Error validating output record. " +
                                    Text(e) + "\n in " +
                                    json.dumps(ret, indent=4))
        finally:
            if builder.mutation_manager and readers:
                for r in readers.values():
                    builder.mutation_manager.release_reader(jobname, r)
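The first branch above implements the cwl.output.json shortcut: if the tool itself writes that file into the output directory, its contents are taken verbatim as the output object and no outputBinding evaluation happens. A sketch of what a tool might write (the file name and location are illustrative assumptions):

import json

# Written by the tool into its own output directory; collect_output_ports
# then loads it directly instead of globbing for outputs.
with open("cwl.output.json", "w") as f:
    json.dump({"report": {"class": "File", "location": "report.txt"}}, f)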
Example #6
    def __init__(
        self,
        toolpath_object,  # type: Dict[Text, Any]
        pos,  # type: int
        loadingContext,  # type: LoadingContext
        parentworkflowProv=None  # type: Optional[ProvenanceProfile]
    ):  # type: (...) -> None
        if "id" in toolpath_object:
            self.id = toolpath_object["id"]
        else:
            self.id = "#step" + Text(pos)

        loadingContext = loadingContext.copy()

        loadingContext.requirements = copy.deepcopy(
            getdefault(loadingContext.requirements, []))
        assert loadingContext.requirements is not None
        loadingContext.requirements.extend(
            toolpath_object.get("requirements", []))
        loadingContext.requirements.extend(
            get_overrides(getdefault(loadingContext.overrides_list, []),
                          self.id).get("requirements", []))

        loadingContext.hints = copy.deepcopy(
            getdefault(loadingContext.hints, []))
        loadingContext.hints.extend(toolpath_object.get("hints", []))

        try:
            if isinstance(toolpath_object["run"], MutableMapping):
                self.embedded_tool = loadingContext.construct_tool_object(
                    toolpath_object["run"], loadingContext)  # type: Process
            else:
                self.embedded_tool = load_tool(toolpath_object["run"],
                                               loadingContext)
        except validate.ValidationException as vexc:
            if loadingContext.debug:
                _logger.exception("Validation exception")
            raise WorkflowException(
                u"Tool definition %s failed validation:\n%s" %
                (toolpath_object["run"], validate.indent(str(vexc))))

        validation_errors = []
        self.tool = toolpath_object = copy.deepcopy(toolpath_object)
        bound = set()
        for stepfield, toolfield in (("in", "inputs"), ("out", "outputs")):
            toolpath_object[toolfield] = []
            for index, step_entry in enumerate(toolpath_object[stepfield]):
                if isinstance(step_entry, string_types):
                    param = CommentedMap()  # type: CommentedMap
                    inputid = step_entry
                else:
                    param = CommentedMap(iteritems(step_entry))
                    inputid = step_entry["id"]

                shortinputid = shortname(inputid)
                found = False
                for tool_entry in self.embedded_tool.tool[toolfield]:
                    frag = shortname(tool_entry["id"])
                    if frag == shortinputid:
                        # If the step provides a default for a parameter,
                        # the tool's own default must not override it.
                        step_default = None
                        if "default" in param and "default" in tool_entry:
                            step_default = param["default"]
                        param.update(tool_entry)
                        param["_tool_entry"] = tool_entry
                        if step_default is not None:
                            param["default"] = step_default
                        found = True
                        bound.add(frag)
                        break
                if not found:
                    if stepfield == "in":
                        param["type"] = "Any"
                        param["not_connected"] = True
                    else:
                        validation_errors.append(
                            SourceLine(self.tool["out"], index).makeError(
                                "Workflow step output '%s' does not correspond to"
                                % shortname(step_entry)) + "\n" +
                            SourceLine(self.embedded_tool.tool, "outputs").
                            makeError("  tool output (expected '%s')" %
                                      ("', '".join([
                                          shortname(tool_entry["id"])
                                          for tool_entry in
                                          self.embedded_tool.tool[toolfield]
                                      ]))))
                param["id"] = inputid
                param.lc.line = toolpath_object[stepfield].lc.data[index][0]
                param.lc.col = toolpath_object[stepfield].lc.data[index][1]
                param.lc.filename = toolpath_object[stepfield].lc.filename
                toolpath_object[toolfield].append(param)

        missing_values = []
        for _, tool_entry in enumerate(self.embedded_tool.tool["inputs"]):
            if shortname(tool_entry["id"]) not in bound:
                if "null" not in tool_entry[
                        "type"] and "default" not in tool_entry:
                    missing_values.append(shortname(tool_entry["id"]))

        if missing_values:
            validation_errors.append(
                SourceLine(self.tool, "in").makeError(
                    "Step is missing required parameter%s '%s'" %
                    ("s" if len(missing_values) > 1 else "",
                     "', '".join(missing_values))))

        if validation_errors:
            raise validate.ValidationException("\n".join(validation_errors))

        super(WorkflowStep, self).__init__(toolpath_object, loadingContext)

        if self.embedded_tool.tool["class"] == "Workflow":
            feature, _ = self.get_requirement(
                "SubworkflowFeatureRequirement")
            if not feature:
                raise WorkflowException(
                    "Workflow contains embedded workflow but "
                    "SubworkflowFeatureRequirement not in requirements")

        if "scatter" in self.tool:
            (feature, _) = self.get_requirement("ScatterFeatureRequirement")
            if not feature:
                raise WorkflowException(
                    "Workflow contains scatter but ScatterFeatureRequirement "
                    "not in requirements")

            inputparms = copy.deepcopy(self.tool["inputs"])
            outputparms = copy.deepcopy(self.tool["outputs"])
            scatter = aslist(self.tool["scatter"])

            method = self.tool.get("scatterMethod")
            if method is None and len(scatter) != 1:
                raise validate.ValidationException(
                    "Must specify scatterMethod when scattering over multiple inputs"
                )

            inp_map = {i["id"]: i for i in inputparms}
            for inp in scatter:
                if inp not in inp_map:
                    raise validate.ValidationException(
                        SourceLine(self.tool, "scatter").makeError(
                            "Scatter parameter '%s' does not correspond to "
                            "an input parameter of this step, expecting '%s'" %
                            (shortname(inp), "', '".join(
                                shortname(k) for k in inp_map.keys()))))

                inp_map[inp]["type"] = {
                    "type": "array",
                    "items": inp_map[inp]["type"]
                }

            if self.tool.get("scatterMethod") == "nested_crossproduct":
                nesting = len(scatter)
            else:
                nesting = 1

            for _ in range(0, nesting):
                for oparam in outputparms:
                    oparam["type"] = {"type": "array", "items": oparam["type"]}
            self.tool["inputs"] = inputparms
            self.tool["outputs"] = outputparms
        self.prov_obj = None  # type: Optional[ProvenanceProfile]
        if loadingContext.research_obj is not None:
            self.prov_obj = parentworkflowProv
            if self.embedded_tool.tool["class"] == "Workflow":
                self.parent_wf = self.embedded_tool.parent_wf
            else:
                self.parent_wf = self.prov_obj
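The scatter handling at the end rewrites parameter types in place: each scattered input becomes an array of its original type, and every output is wrapped once per nesting level (len(scatter) levels for nested_crossproduct, one otherwise). A sketch of the rewrite:

inp = {"id": "#step/x", "type": "int"}
inp["type"] = {"type": "array", "items": inp["type"]}
# -> {"type": "array", "items": "int"}

# nested_crossproduct over two scattered inputs wraps each output twice:
out = {"id": "#step/y", "type": "File"}
for _ in range(2):
    out["type"] = {"type": "array", "items": out["type"]}
# -> array of array of File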
Example #7
    def get_image(
            dockerRequirement,  # type: Dict[Text, Text]
            pull_image,  # type: bool
            force_pull=False  # type: bool
    ):
        # type: (...) -> bool
        """
        Acquire the software container image in the specified dockerRequirement
        using Singularity and returns the success as a bool. Updates the
        provided dockerRequirement with the specific dockerImageId to the full
        path of the local image, if found. Likewise the
        dockerRequirement['dockerPull'] is updated to a docker:// URI if needed.
        """
        found = False

        candidates = []

        if "dockerImageId" not in dockerRequirement and "dockerPull" in dockerRequirement:
            match = re.search(pattern=r'([a-z]*://)',
                              string=dockerRequirement["dockerPull"])
            candidate = _normalize_image_id(dockerRequirement['dockerPull'])
            candidates.append(candidate)
            dockerRequirement['dockerImageId'] = candidate
            if not match:
                dockerRequirement["dockerPull"] = (
                    "docker://" + dockerRequirement["dockerPull"])
        elif "dockerImageId" in dockerRequirement:
            candidates.append(dockerRequirement['dockerImageId'])
            candidates.append(
                _normalize_image_id(dockerRequirement['dockerImageId']))

        # check if Singularity image is available in $SINGULARITY_CACHEDIR
        targets = [os.getcwd()]
        for env in ("SINGULARITY_CACHEDIR", "SINGULARITY_PULLFOLDER"):
            if env in os.environ:
                targets.append(os.environ[env])
        for target in targets:
            for candidate in candidates:
                path = os.path.join(target, candidate)
                if os.path.isfile(path):
                    _logger.info(
                        "Using local copy of Singularity image found in %s",
                        target)
                    dockerRequirement["dockerImageId"] = path
                    found = True

        if (force_pull or not found) and pull_image:
            cmd = []  # type: List[Text]
            if "dockerPull" in dockerRequirement:
                cmd = [
                    "singularity", "pull", "--force", "--name",
                    str(dockerRequirement["dockerImageId"]),
                    str(dockerRequirement["dockerPull"])
                ]
                _logger.info(Text(cmd))
                check_call(cmd, stdout=sys.stderr)
                found = True
            elif "dockerFile" in dockerRequirement:
                raise WorkflowException(
                    SourceLine(dockerRequirement, 'dockerFile').makeError(
                        "dockerFile is not currently supported when using the "
                        "Singularity runtime for Docker containers."))
            elif "dockerLoad" in dockerRequirement:
                raise WorkflowException(
                    SourceLine(dockerRequirement, 'dockerLoad').makeError(
                        "dockerLoad is not currently supported when using the "
                        "Singularity runtime for Docker containers."))
            elif "dockerImport" in dockerRequirement:
                raise WorkflowException(
                    SourceLine(dockerRequirement, 'dockerImport').makeError(
                        "dockerImport is not currently supported when using the "
                        "Singularity runtime for Docker containers."))

        return found
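Per the docstring, the requirement mapping is updated in place. A hypothetical call, with the image name assumed:

req = {"dockerPull": "debian:8"}
if get_image(req, pull_image=True, force_pull=False):
    # dockerPull was prefixed with docker:// and dockerImageId now holds
    # the normalized local image name (or its full path if a copy was
    # found in the current directory or a Singularity cache directory).
    print(req["dockerPull"], req["dockerImageId"])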
Example #8
    def run(
        self,
        runtimeContext: RuntimeContext,
        tmpdir_lock: Optional[threading.Lock] = None,
    ) -> None:
        if tmpdir_lock:
            with tmpdir_lock:
                if not os.path.exists(self.tmpdir):
                    os.makedirs(self.tmpdir)
        else:
            if not os.path.exists(self.tmpdir):
                os.makedirs(self.tmpdir)

        (docker_req, docker_is_req) = self.get_requirement("DockerRequirement")
        self.prov_obj = runtimeContext.prov_obj
        img_id = None
        env = cast(MutableMapping[str, str], os.environ)
        user_space_docker_cmd = runtimeContext.user_space_docker_cmd
        if docker_req is not None and user_space_docker_cmd:
            # For user-space docker implementations, a local image name or ID
            # takes precedence over a network pull
            if "dockerImageId" in docker_req:
                img_id = str(docker_req["dockerImageId"])
            elif "dockerPull" in docker_req:
                img_id = str(docker_req["dockerPull"])
                cmd = [user_space_docker_cmd, "pull", img_id]
                _logger.info(str(cmd))
                try:
                    subprocess.check_call(cmd, stdout=sys.stderr)  # nosec
                except OSError:
                    raise WorkflowException(
                        SourceLine(docker_req).makeError(
                            "Either Docker container {} is not available with "
                            "user space docker implementation {} or {} is missing "
                            "or broken.".format(img_id, user_space_docker_cmd,
                                                user_space_docker_cmd)))
            else:
                raise WorkflowException(
                    SourceLine(docker_req).makeError(
                        "Docker image must be specified as 'dockerImageId' or "
                        "'dockerPull' when using user space implementations of "
                        "Docker"))
        else:
            try:
                if docker_req is not None and runtimeContext.use_container:
                    img_id = str(
                        self.get_from_requirements(
                            docker_req,
                            runtimeContext.pull_image,
                            getdefault(runtimeContext.force_docker_pull,
                                       False),
                            getdefault(runtimeContext.tmp_outdir_prefix,
                                       DEFAULT_TMP_PREFIX),
                        ))
                if img_id is None and self.builder.find_default_container:
                    default_container = self.builder.find_default_container()
                    if default_container:
                        img_id = str(default_container)

                if (docker_req is not None and img_id is None
                        and runtimeContext.use_container):
                    raise Exception("Docker image not available")

                if (self.prov_obj is not None and img_id is not None
                        and runtimeContext.process_run_id is not None):
                    container_agent = self.prov_obj.document.agent(
                        uuid.uuid4().urn,
                        {
                            "prov:type": PROV["SoftwareAgent"],
                            "cwlprov:image": img_id,
                            "prov:label":
                                "Container execution of image %s" % img_id,
                        },
                    )
                    # FIXME: img_id is not a sha256 id, it might just be "debian:8"
                    # img_entity = document.entity("nih:sha-256;%s" % img_id,
                    #                  {"prov:label": "Container image %s" % img_id} )
                    # The image is the plan for this activity-agent association
                    # document.wasAssociatedWith(process_run_ID, container_agent, img_entity)
                    self.prov_obj.document.wasAssociatedWith(
                        runtimeContext.process_run_id, container_agent)
            except Exception as err:
                container = "Singularity" if runtimeContext.singularity else "Docker"
                _logger.debug("%s error", container, exc_info=True)
                if docker_is_req:
                    raise UnsupportedRequirement(
                        "%s is required to run this tool: %s" %
                        (container, str(err))) from err
                else:
                    raise WorkflowException(
                        "{0} is not available for this tool, try "
                        "--no-container to disable {0}, or install "
                        "a user space Docker replacement like uDocker with "
                        "--user-space-docker-cmd.: {1}".format(container, err))

        self._setup(runtimeContext)
        (runtime, cidfile) = self.create_runtime(env, runtimeContext)
        runtime.append(str(img_id))
        monitor_function = None
        if cidfile:
            monitor_function = functools.partial(
                self.docker_monitor,
                cidfile,
                runtimeContext.tmpdir_prefix,
                not bool(runtimeContext.cidfile_dir),
            )
        elif runtimeContext.user_space_docker_cmd:
            monitor_function = self.process_monitor
        self._execute(runtime, env, runtimeContext, monitor_function)
Example #9
def validate_document(
        document_loader,  # type: Loader
        workflowobj,  # type: CommentedMap
        uri,  # type: Text
        overrides,  # type: List[Dict]
        metadata,  # type: Dict[Text, Any]
        enable_dev=False,  # type: bool
        strict=True,  # type: bool
        preprocess_only=False,  # type: bool
        fetcher_constructor=None,  # type: FetcherConstructorType
        skip_schemas=None,  # type: Optional[bool]
        do_validate=True  # type: bool
):
    # type: (...) -> Tuple[Loader, schema.Names, Union[Dict[Text, Any], List[Dict[Text, Any]]], Dict[Text, Any], Text]
    """Validate a CWL document."""

    if isinstance(workflowobj, MutableSequence):
        workflowobj = cmap({"$graph": workflowobj}, fn=uri)

    if not isinstance(workflowobj, MutableMapping):
        raise ValueError("workflowjobj must be a dict, got '{}': {}".format(
            type(workflowobj), workflowobj))

    jobobj = None
    if "cwl:tool" in workflowobj:
        job_loader = default_loader(fetcher_constructor)  # type: ignore
        jobobj, _ = job_loader.resolve_all(workflowobj,
                                           uri,
                                           checklinks=do_validate)
        uri = urllib.parse.urljoin(
            uri, workflowobj["https://w3id.org/cwl/cwl#tool"])
        del cast(dict, jobobj)["https://w3id.org/cwl/cwl#tool"]

        if (isinstance(jobobj, CommentedMap)
                and "http://commonwl.org/cwltool#overrides" in jobobj):
            overrides.extend(resolve_overrides(jobobj, uri, uri))
            del jobobj["http://commonwl.org/cwltool#overrides"]

        workflowobj = fetch_document(
            uri, fetcher_constructor=fetcher_constructor)[1]

    fileuri = urllib.parse.urldefrag(uri)[0]
    if "cwlVersion" not in workflowobj:
        if 'cwlVersion' in metadata:
            workflowobj['cwlVersion'] = metadata['cwlVersion']
        else:
            raise ValidationException(
                "No cwlVersion found. "
                "Use the following syntax in your CWL document to declare "
                "the version: cwlVersion: <version>.\n"
                "Note: if this is a CWL draft-2 (pre v1.0) document then it "
                "will need to be upgraded first.")

    if not isinstance(workflowobj["cwlVersion"], string_types):
        with SourceLine(workflowobj, "cwlVersion", ValidationException):
            raise ValidationException("'cwlVersion' must be a string, "
                                      "got {}".format(
                                          type(workflowobj["cwlVersion"])))
    # strip out version
    workflowobj["cwlVersion"] = re.sub(r"^(?:cwl:|https://w3id.org/cwl/cwl#)",
                                       "", workflowobj["cwlVersion"])
    if workflowobj["cwlVersion"] not in list(ALLUPDATES):
        # print out all the Supported Versions of cwlVersion
        versions = []
        for version in list(ALLUPDATES):
            if "dev" in version:
                version += " (with --enable-dev flag only)"
            versions.append(version)
        versions.sort()
        raise ValidationException(
            "The CWL reference runner no longer supports pre CWL v1.0 "
            "documents. Supported versions are: "
            "\n{}".format("\n".join(versions)))

    (sch_document_loader, avsc_names) = \
        process.get_schema(workflowobj["cwlVersion"])[:2]

    if isinstance(avsc_names, Exception):
        raise avsc_names

    processobj = None  # type: Union[CommentedMap, CommentedSeq, Text, None]
    document_loader = Loader(sch_document_loader.ctx,
                             schemagraph=sch_document_loader.graph,
                             idx=document_loader.idx,
                             cache=sch_document_loader.cache,
                             fetcher_constructor=fetcher_constructor,
                             skip_schemas=skip_schemas)

    _add_blank_ids(workflowobj)

    workflowobj["id"] = fileuri
    processobj, new_metadata = document_loader.resolve_all(
        workflowobj, fileuri, checklinks=do_validate)
    if not isinstance(processobj, (CommentedMap, CommentedSeq)):
        raise ValidationException("Workflow must be a dict or list.")

    if not new_metadata and isinstance(processobj, CommentedMap):
        new_metadata = cast(
            CommentedMap,
            cmap(
                {
                    "$namespaces": processobj.get("$namespaces", {}),
                    "$schemas": processobj.get("$schemas", []),
                    "cwlVersion": processobj["cwlVersion"]
                },
                fn=fileuri))

    _convert_stdstreams_to_files(workflowobj)

    if preprocess_only:
        return document_loader, avsc_names, processobj, new_metadata, uri

    if do_validate:
        schema.validate_doc(avsc_names, processobj, document_loader, strict)

    if new_metadata.get("cwlVersion") != update.LATEST:
        processobj = cast(
            CommentedMap,
            cmap(
                update.update(processobj, document_loader, fileuri, enable_dev,
                              new_metadata)))

    if jobobj is not None:
        new_metadata[u"cwl:defaults"] = jobobj

    if overrides:
        new_metadata[u"cwltool:overrides"] = overrides

    return document_loader, avsc_names, processobj, new_metadata, uri
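A hypothetical caller for this function; the (loader, workflowobj, uri) return shape of fetch_document is an assumption inferred from how it is used inside the body above:

# Hypothetical usage; fetch_document's return shape is assumed.
document_loader, workflowobj, uri = fetch_document("wf.cwl")
document_loader, avsc_names, processobj, metadata, uri = validate_document(
    document_loader, workflowobj, uri, overrides=[], metadata={})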
Example #10
    def job(self, joborder, output_callback, **kwargs):
        kwargs["work_api"] = self.work_api
        req, _ = self.get_requirement(
            "http://arvados.org/cwl#RunInSingleContainer")
        if req:
            with SourceLine(self.tool, None, WorkflowException,
                            logger.isEnabledFor(logging.DEBUG)):
                if "id" not in self.tool:
                    raise WorkflowException("%s object must have 'id'" %
                                            (self.tool["class"]))
            document_loader, workflowobj, uri = (self.doc_loader,
                                                 self.doc_loader.fetch(
                                                     self.tool["id"]),
                                                 self.tool["id"])

            with Perf(metrics, "subworkflow upload_deps"):
                upload_dependencies(self.arvrunner,
                                    os.path.basename(joborder.get("id", "#")),
                                    document_loader, joborder,
                                    joborder.get("id", "#"), False)

                if self.wf_pdh is None:
                    workflowobj["requirements"] = dedup_reqs(self.requirements)
                    workflowobj["hints"] = dedup_reqs(self.hints)

                    packed = pack(document_loader, workflowobj, uri,
                                  self.metadata)

                    upload_dependencies(self.arvrunner, kwargs.get("name", ""),
                                        document_loader, packed, uri, False)

            with Perf(metrics, "subworkflow adjust"):
                joborder_resolved = copy.deepcopy(joborder)
                joborder_keepmount = copy.deepcopy(joborder)

                reffiles = []
                visit_class(joborder_keepmount, ("File", "Directory"),
                            lambda x: reffiles.append(x))

                mapper = ArvPathMapper(self.arvrunner, reffiles,
                                       kwargs["basedir"], "/keep/%s",
                                       "/keep/%s/%s", **kwargs)

                def keepmount(obj):
                    remove_redundant_fields(obj)
                    with SourceLine(obj, None, WorkflowException,
                                    logger.isEnabledFor(logging.DEBUG)):
                        if "location" not in obj:
                            raise WorkflowException(
                                "%s object is missing required 'location' field: %s"
                                % (obj["class"], obj))
                    with SourceLine(obj, "location", WorkflowException,
                                    logger.isEnabledFor(logging.DEBUG)):
                        if obj["location"].startswith("keep:"):
                            obj["location"] = mapper.mapper(
                                obj["location"]).target
                            if "listing" in obj:
                                del obj["listing"]
                        elif obj["location"].startswith("_:"):
                            del obj["location"]
                        else:
                            raise WorkflowException(
                                "Location is not a keep reference or a literal: '%s'"
                                % obj["location"])

                visit_class(joborder_keepmount, ("File", "Directory"),
                            keepmount)

                def resolved(obj):
                    if obj["location"].startswith("keep:"):
                        obj["location"] = mapper.mapper(
                            obj["location"]).resolved

                visit_class(joborder_resolved, ("File", "Directory"), resolved)

                if self.wf_pdh is None:
                    adjustFileObjs(packed, keepmount)
                    adjustDirObjs(packed, keepmount)
                    self.wf_pdh = upload_workflow_collection(
                        self.arvrunner, shortname(self.tool["id"]), packed)

            wf_runner = cmap({
                "class": "CommandLineTool",
                "baseCommand": "cwltool",
                "inputs": self.tool["inputs"],
                "outputs": self.tool["outputs"],
                "stdout": "cwl.output.json",
                "requirements": workflowobj["requirements"] + [{
                    "class": "InitialWorkDirRequirement",
                    "listing": [{
                        "entryname": "workflow.cwl",
                        "entry": {
                            "class": "File",
                            "location": "keep:%s/workflow.cwl" % self.wf_pdh
                        }
                    }, {
                        "entryname": "cwl.input.yml",
                        "entry": json.dumps(joborder_keepmount,
                                            indent=2,
                                            sort_keys=True,
                                            separators=(',', ': ')).replace(
                                                "\\", "\\\\").replace(
                                                    "$(", "\\$(").replace(
                                                        "${", "\\${")
                    }]
                }],
                "hints": workflowobj["hints"],
                "arguments": [
                    "--no-container", "--move-outputs",
                    "--preserve-entire-environment", "workflow.cwl#main",
                    "cwl.input.yml"
                ]
            })
            kwargs["loader"] = self.doc_loader
            kwargs["avsc_names"] = self.doc_schema
            return ArvadosCommandTool(self.arvrunner, wf_runner,
                                      **kwargs).job(joborder_resolved,
                                                    output_callback, **kwargs)
        else:
            return super(ArvadosWorkflow, self).job(joborder, output_callback,
                                                    **kwargs)
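The job order is embedded as an InitialWorkDirRequirement file literal, so the replace chain above escapes anything CWL would otherwise interpolate as a parameter reference or expression. A sketch of that escaping on its own:

import json

text = json.dumps({"msg": "$(inputs.x)"})
escaped = (text.replace("\\", "\\\\")
               .replace("$(", "\\$(")
               .replace("${", "\\${"))
# escaped == '{"msg": "\\$(inputs.x)"}': the literal survives CWL evaluation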
Example #11
def _convert_stdstreams_to_files(workflowobj):
    # type: (Union[Dict[str, Any], List[Dict[str, Any]]]) -> None

    if isinstance(workflowobj, MutableMapping):
        if workflowobj.get("class") == "CommandLineTool":
            with SourceLine(
                workflowobj,
                "outputs",
                ValidationException,
                _logger.isEnabledFor(logging.DEBUG),
            ):
                outputs = workflowobj.get("outputs", [])
                if not isinstance(outputs, CommentedSeq):
                    raise ValidationException('"outputs" section is not ' "valid.")
                for out in workflowobj.get("outputs", []):
                    if not isinstance(out, CommentedMap):
                        raise ValidationException(
                            "Output '{}' is not a valid OutputParameter.".format(out)
                        )
                    for streamtype in ["stdout", "stderr"]:
                        if out.get("type") == streamtype:
                            if "outputBinding" in out:
                                raise ValidationException(
                                    "Not allowed to specify outputBinding when"
                                    " using %s shortcut." % streamtype
                                )
                            if streamtype in workflowobj:
                                filename = workflowobj[streamtype]
                            else:
                                filename = str(
                                    hashlib.sha1(  # nosec
                                        json_dumps(workflowobj, sort_keys=True).encode(
                                            "utf-8"
                                        )
                                    ).hexdigest()
                                )
                                workflowobj[streamtype] = filename
                            out["type"] = "File"
                            out["outputBinding"] = cmap({"glob": filename})
            for inp in workflowobj.get("inputs", []):
                if inp.get("type") == "stdin":
                    if "inputBinding" in inp:
                        raise ValidationException(
                            "Not allowed to specify inputBinding when"
                            " using stdin shortcut."
                        )
                    if "stdin" in workflowobj:
                        raise ValidationException(
                            "Not allowed to specify stdin path when"
                            " using stdin type shortcut."
                        )
                    else:
                        workflowobj["stdin"] = (
                            "$(inputs.%s.path)" % inp["id"].rpartition("#")[2]
                        )
                        inp["type"] = "File"
        else:
            for entry in workflowobj.values():
                _convert_stdstreams_to_files(entry)
    if isinstance(workflowobj, MutableSequence):
        for entry in workflowobj:
            _convert_stdstreams_to_files(entry)
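The net effect of the conversion: a v1.x `stdout`/`stderr` type shortcut becomes a plain File output globbing the captured stream, with a sha1-derived filename invented when the tool names none. A sketch using the ruamel types the validation above insists on, assuming the function and its module imports are available:

from ruamel.yaml.comments import CommentedMap, CommentedSeq

tool = CommentedMap({
    "class": "CommandLineTool",
    "outputs": CommentedSeq([CommentedMap({"id": "out", "type": "stdout"})]),
})
_convert_stdstreams_to_files(tool)
# tool["stdout"] is now a sha1-derived filename, and the output became:
#   {"id": "out", "type": "File", "outputBinding": {"glob": tool["stdout"]}}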
Example #12
    def collect_output(
            self,
            schema,  # type: Dict[str, Any]
            builder,  # type: Builder
            outdir,  # type: str
            fs_access,  # type: StdFsAccess
            compute_checksum=True,  # type: bool
    ):
        # type: (...) -> Optional[Union[Dict[str, Any], List[Union[Dict[str, Any], str]]]]
        r = []  # type: List[Any]
        empty_and_optional = False
        debug = _logger.isEnabledFor(logging.DEBUG)
        if "outputBinding" in schema:
            binding = schema["outputBinding"]
            globpatterns = []  # type: List[str]

            revmap = partial(revmap_file, builder, outdir)

            if "glob" in binding:
                with SourceLine(binding, "glob", WorkflowException, debug):
                    for gb in aslist(binding["glob"]):
                        gb = builder.do_eval(gb)
                        if gb:
                            globpatterns.extend(aslist(gb))

                    for gb in globpatterns:
                        if gb.startswith(builder.outdir):
                            gb = gb[len(builder.outdir) + 1:]
                        elif gb == ".":
                            gb = outdir
                        elif gb.startswith("/"):
                            raise WorkflowException(
                                "glob patterns must not start with '/'")
                        try:
                            prefix = fs_access.glob(outdir)
                            r.extend([{
                                "location": g,
                                "path": fs_access.join(
                                    builder.outdir, g[len(prefix[0]) + 1:]),
                                "basename": os.path.basename(g),
                                "nameroot": os.path.splitext(
                                    os.path.basename(g))[0],
                                "nameext": os.path.splitext(
                                    os.path.basename(g))[1],
                                "class": ("File" if fs_access.isfile(g)
                                          else "Directory"),
                            } for g in sorted(
                                fs_access.glob(fs_access.join(outdir, gb)),
                                key=cmp_to_key(cast(
                                    Callable[[str, str], int],
                                    locale.strcoll)),
                            )])
                        except (OSError, IOError) as e:
                            _logger.warning(str(e))
                        except Exception:
                            _logger.error("Unexpected error from fs_access",
                                          exc_info=True)
                            raise

                for files in r:
                    rfile = files.copy()
                    revmap(rfile)
                    if files["class"] == "Directory":
                        ll = schema.get("loadListing") or builder.loadListing
                        if ll and ll != "no_listing":
                            get_listing(fs_access, files,
                                        (ll == "deep_listing"))
                    else:
                        if binding.get("loadContents"):
                            with fs_access.open(rfile["location"], "rb") as f:
                                files[
                                    "contents"] = content_limit_respected_read_bytes(
                                        f).decode("utf-8")
                        if compute_checksum:
                            with fs_access.open(rfile["location"], "rb") as f:
                                checksum = hashlib.sha1()  # nosec
                                contents = f.read(1024 * 1024)
                                while contents != b"":
                                    checksum.update(contents)
                                    contents = f.read(1024 * 1024)
                                files["checksum"] = (
                                    "sha1$%s" % checksum.hexdigest())
                        files["size"] = fs_access.size(rfile["location"])

            optional = False
            single = False
            if isinstance(schema["type"], MutableSequence):
                if "null" in schema["type"]:
                    optional = True
                if "File" in schema["type"] or "Directory" in schema["type"]:
                    single = True
            elif schema["type"] == "File" or schema["type"] == "Directory":
                single = True

            if "outputEval" in binding:
                with SourceLine(binding, "outputEval", WorkflowException,
                                debug):
                    r = builder.do_eval(binding["outputEval"], context=r)

            if single:
                if not r and not optional:
                    with SourceLine(binding, "glob", WorkflowException, debug):
                        raise WorkflowException(
                            "Did not find output file with glob pattern: '{}'".
                            format(globpatterns))
                elif not r and optional:
                    pass
                elif isinstance(r, MutableSequence):
                    if len(r) > 1:
                        raise WorkflowException(
                            "Multiple matches for output item that is a single file."
                        )
                    else:
                        r = r[0]

            if "secondaryFiles" in schema:
                with SourceLine(schema, "secondaryFiles", WorkflowException,
                                debug):
                    for primary in aslist(r):
                        if isinstance(primary, MutableMapping):
                            primary.setdefault("secondaryFiles", [])
                            pathprefix = primary["path"][0:primary["path"].
                                                         rindex("/") + 1]
                            for sf in aslist(schema["secondaryFiles"]):
                                if "required" in sf:
                                    sf_required = builder.do_eval(
                                        sf["required"], context=primary)
                                else:
                                    sf_required = False

                                if "$(" in sf["pattern"] or "${" in sf[
                                        "pattern"]:
                                    sfpath = builder.do_eval(sf["pattern"],
                                                             context=primary)
                                else:
                                    sfpath = substitute(
                                        primary["basename"], sf["pattern"])

                                for sfitem in aslist(sfpath):
                                    if not sfitem:
                                        continue
                                    if isinstance(sfitem, str):
                                        sfitem = {"path": pathprefix + sfitem}
                                    if (not fs_access.exists(sfitem["path"])
                                            and sf_required):
                                        raise WorkflowException(
                                            "Missing required secondary file '%s'"
                                            % (sfitem["path"]))
                                    if "path" in sfitem and "location" not in sfitem:
                                        revmap(sfitem)
                                    if fs_access.isfile(sfitem["location"]):
                                        sfitem["class"] = "File"
                                        primary["secondaryFiles"].append(
                                            sfitem)
                                    elif fs_access.isdir(sfitem["location"]):
                                        sfitem["class"] = "Directory"
                                        primary["secondaryFiles"].append(
                                            sfitem)

            if "format" in schema:
                for primary in aslist(r):
                    primary["format"] = builder.do_eval(schema["format"],
                                                        context=primary)

            # Ensure files point to local references outside of the run environment
            adjustFileObjs(r, revmap)

            if not r and optional:
                # Don't convert zero or empty string to None
                if r in [0, ""]:
                    return r
                # For [] or None, return None
                else:
                    return None

        if (not empty_and_optional
                and isinstance(schema["type"], MutableMapping)
                and schema["type"]["type"] == "record"):
            out = {}
            for field in schema["type"]["fields"]:
                out[shortname(field["name"])] = self.collect_output(
                    field,
                    builder,
                    outdir,
                    fs_access,
                    compute_checksum=compute_checksum)
            return out
        return r
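
The checksum loop at the top of this example streams the file in 1 MiB chunks, so large outputs never need to be held in memory at once. A minimal standalone sketch of the same pattern (the stream_sha1 helper name is ours, not cwltool's):

import hashlib

def stream_sha1(path, chunk_size=1024 * 1024):
    # Compute a CWL-style checksum ("sha1$<hex>") without loading the
    # whole file into memory.
    checksum = hashlib.sha1()  # nosec - used as a checksum, not for security
    with open(path, "rb") as f:
        contents = f.read(chunk_size)
        while contents != b"":
            checksum.update(contents)
            contents = f.read(chunk_size)
    return "sha1$%s" % checksum.hexdigest()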
Example n. 13
    def collect_output_ports(
        self,
        ports: Set[Dict[str, Any]],
        builder: Builder,
        outdir: str,
        rcode: int,
        compute_checksum: bool = True,
        jobname: str = "",
        readers: Optional[Dict[str, Any]] = None,
    ) -> OutputPorts:
        ret = {}  # type: OutputPorts
        debug = _logger.isEnabledFor(logging.DEBUG)
        cwl_version = self.metadata.get(
            "http://commonwl.org/cwltool#original_cwlVersion", None)
        if cwl_version != "v1.0":
            builder.resources["exitCode"] = rcode
        try:
            fs_access = builder.make_fs_access(outdir)
            custom_output = fs_access.join(outdir, "cwl.output.json")
            if fs_access.exists(custom_output):
                with fs_access.open(custom_output, "r") as f:
                    ret = json.load(f)
                if debug:
                    _logger.debug(
                        "Raw output from %s: %s",
                        custom_output,
                        json_dumps(ret, indent=4),
                    )
            else:
                for i, port in enumerate(ports):

                    class ParameterOutputWorkflowException(WorkflowException):
                        def __init__(self, msg,
                                     **kwargs):  # type: (str, **Any) -> None
                            super(
                                ParameterOutputWorkflowException, self
                            ).__init__(
                                "Error collecting output for parameter '%s':\n%s"
                                % (shortname(port["id"]), msg),
                                kwargs,
                            )

                    with SourceLine(ports, i, ParameterOutputWorkflowException,
                                    debug):
                        fragment = shortname(port["id"])
                        ret[fragment] = self.collect_output(
                            port,
                            builder,
                            outdir,
                            fs_access,
                            compute_checksum=compute_checksum,
                        )
            if ret:
                revmap = partial(revmap_file, builder, outdir)
                adjustDirObjs(ret, trim_listing)
                visit_class(ret, ("File", "Directory"),
                            cast(Callable[[Any], Any], revmap))
                visit_class(ret, ("File", "Directory"), remove_path)
                normalizeFilesDirs(ret)
                visit_class(
                    ret,
                    ("File", "Directory"),
                    partial(check_valid_locations, fs_access),
                )

                if compute_checksum:
                    adjustFileObjs(ret, partial(compute_checksums, fs_access))
            expected_schema = cast(
                Schema, self.names.get_name("outputs_record_schema", ""))
            validate.validate_ex(expected_schema,
                                 ret,
                                 strict=False,
                                 logger=_logger_validation_warnings)
            if ret is not None and builder.mutation_manager is not None:
                adjustFileObjs(ret, builder.mutation_manager.set_generation)
            return ret if ret is not None else {}
        except validate.ValidationException as e:
            raise WorkflowException("Error validating output record. " +
                                    str(e) + "\n in " +
                                    json_dumps(ret, indent=4)) from e
        finally:
            if builder.mutation_manager and readers:
                for r in readers.values():
                    builder.mutation_manager.release_reader(jobname, r)
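
The control flow above follows the CWL convention that a tool may write a cwl.output.json file into its output directory; when present, it is taken verbatim as the output object and per-port collection is skipped. A simplified sketch of that dispatch, with collect_port standing in for the real collect_output call and shortname handling omitted:

import json
import os

def collect_outputs(outdir, ports, collect_port):
    # If the tool produced its own output object, use it as-is;
    # otherwise collect each declared output port individually.
    custom_output = os.path.join(outdir, "cwl.output.json")
    if os.path.exists(custom_output):
        with open(custom_output) as f:
            return json.load(f)
    return {port["id"]: collect_port(port) for port in ports}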
Example n. 14
    def job(
            self,
            job_order,  # type: Mapping[str, str]
            output_callbacks,  # type: Callable[[Any, Any], Any]
            runtimeContext,  # type: RuntimeContext
    ):
        # type: (...) -> Generator[Union[JobBase, CallbackJob], None, None]

        workReuse, _ = self.get_requirement("WorkReuse")
        enableReuse = workReuse.get("enableReuse", True) if workReuse else True

        jobname = uniquename(runtimeContext.name
                             or shortname(self.tool.get("id", "job")))
        if runtimeContext.cachedir and enableReuse:
            cachecontext = runtimeContext.copy()
            cachecontext.outdir = "/out"
            cachecontext.tmpdir = "/tmp"  # nosec
            cachecontext.stagedir = "/stage"
            cachebuilder = self._init_job(job_order, cachecontext)
            cachebuilder.pathmapper = PathMapper(
                cachebuilder.files,
                runtimeContext.basedir,
                cachebuilder.stagedir,
                separateDirs=False,
            )
            _check_adjust = partial(check_adjust, cachebuilder)
            visit_class(
                [cachebuilder.files, cachebuilder.bindings],
                ("File", "Directory"),
                _check_adjust,
            )

            cmdline = flatten(
                list(map(cachebuilder.generate_arg, cachebuilder.bindings)))
            docker_req, _ = self.get_requirement("DockerRequirement")
            if docker_req is not None and runtimeContext.use_container:
                dockerimg = docker_req.get("dockerImageId") or docker_req.get(
                    "dockerPull")
            elif (runtimeContext.default_container is not None
                  and runtimeContext.use_container):
                dockerimg = runtimeContext.default_container
            else:
                dockerimg = None

            if dockerimg is not None:
                cmdline = ["docker", "run", dockerimg] + cmdline
                # not really run using docker, just for hashing purposes

            keydict = {
                "cmdline": cmdline
            }  # type: Dict[str, Union[Dict[str, Any], List[Any]]]

            for shortcut in ["stdin", "stdout", "stderr"]:
                if shortcut in self.tool:
                    keydict[shortcut] = self.tool[shortcut]

            for location, fobj in cachebuilder.pathmapper.items():
                if fobj.type == "File":
                    checksum = next(
                        (e["checksum"] for e in cachebuilder.files
                         if "location" in e and e["location"] == location
                         and "checksum" in e and e["checksum"] != "sha1$hash"),
                        None,
                    )
                    fobj_stat = os.stat(fobj.resolved)
                    if checksum is not None:
                        keydict[fobj.resolved] = [fobj_stat.st_size, checksum]
                    else:
                        keydict[fobj.resolved] = [
                            fobj_stat.st_size,
                            int(fobj_stat.st_mtime * 1000),
                        ]

            interesting = {
                "DockerRequirement",
                "EnvVarRequirement",
                "InitialWorkDirRequirement",
                "ShellCommandRequirement",
                "NetworkAccess",
            }
            for rh in (self.original_requirements, self.original_hints):
                for r in reversed(rh):
                    if r["class"] in interesting and r["class"] not in keydict:
                        keydict[r["class"]] = r

            keydictstr = json_dumps(keydict,
                                    separators=(",", ":"),
                                    sort_keys=True)
            cachekey = hashlib.md5(
                keydictstr.encode("utf-8")).hexdigest()  # nosec

            _logger.debug("[job %s] keydictstr is %s -> %s", jobname,
                          keydictstr, cachekey)

            jobcache = os.path.join(runtimeContext.cachedir, cachekey)

            # Create a lockfile to manage cache status.
            jobcachepending = "{}.status".format(jobcache)
            jobcachelock = None
            jobstatus = None

            # Opens the file for read/write, or creates an empty file.
            jobcachelock = open(jobcachepending, "a+")

            # get the shared lock to ensure no other process is trying
            # to write to this cache
            shared_file_lock(jobcachelock)
            jobcachelock.seek(0)
            jobstatus = jobcachelock.read()

            if os.path.isdir(jobcache) and jobstatus == "success":
                if docker_req and runtimeContext.use_container:
                    cachebuilder.outdir = (runtimeContext.docker_outdir
                                           or random_outdir())
                else:
                    cachebuilder.outdir = jobcache

                _logger.info("[job %s] Using cached output in %s", jobname,
                             jobcache)
                yield CallbackJob(self, output_callbacks, cachebuilder,
                                  jobcache)
                # we're done with the cache so release lock
                jobcachelock.close()
                return
            else:
                _logger.info("[job %s] Output of job will be cached in %s",
                             jobname, jobcache)

                # turn shared lock into an exclusive lock since we'll
                # be writing the cache directory
                upgrade_lock(jobcachelock)

                shutil.rmtree(jobcache, True)
                os.makedirs(jobcache)
                runtimeContext = runtimeContext.copy()
                runtimeContext.outdir = jobcache

                def update_status_output_callback(
                    output_callbacks: Callable[[List[Dict[str, Any]], str],
                                               None],
                    jobcachelock: IO[Any],
                    outputs: List[Dict[str, Any]],
                    processStatus: str,
                ) -> None:
                    # save status to the lockfile then release the lock
                    jobcachelock.seek(0)
                    jobcachelock.truncate()
                    jobcachelock.write(processStatus)
                    jobcachelock.close()
                    output_callbacks(outputs, processStatus)

                output_callbacks = partial(update_status_output_callback,
                                           output_callbacks, jobcachelock)

        builder = self._init_job(job_order, runtimeContext)

        reffiles = copy.deepcopy(builder.files)

        j = self.make_job_runner(runtimeContext)(
            builder,
            builder.job,
            self.make_path_mapper,
            self.requirements,
            self.hints,
            jobname,
        )
        j.prov_obj = self.prov_obj

        j.successCodes = self.tool.get("successCodes", [])
        j.temporaryFailCodes = self.tool.get("temporaryFailCodes", [])
        j.permanentFailCodes = self.tool.get("permanentFailCodes", [])

        debug = _logger.isEnabledFor(logging.DEBUG)

        if debug:
            _logger.debug(
                "[job %s] initializing from %s%s",
                j.name,
                self.tool.get("id", ""),
                " as part of %s" %
                runtimeContext.part_of if runtimeContext.part_of else "",
            )
            _logger.debug("[job %s] %s", j.name,
                          json_dumps(builder.job, indent=4))

        builder.pathmapper = self.make_path_mapper(reffiles, builder.stagedir,
                                                   runtimeContext, True)
        builder.requirements = j.requirements

        _check_adjust = partial(check_adjust, builder)

        visit_class([builder.files, builder.bindings], ("File", "Directory"),
                    _check_adjust)

        initialWorkdir, _ = self.get_requirement("InitialWorkDirRequirement")
        if initialWorkdir is not None:
            ls = []  # type: List[Dict[str, Any]]
            if isinstance(initialWorkdir["listing"], str):
                ls = builder.do_eval(initialWorkdir["listing"])
            else:
                for t in initialWorkdir["listing"]:
                    if isinstance(t, Mapping) and "entry" in t:
                        entry_exp = builder.do_eval(t["entry"],
                                                    strip_whitespace=False)
                        for entry in aslist(entry_exp):
                            et = {"entry": entry}
                            if "entryname" in t:
                                et["entryname"] = builder.do_eval(
                                    t["entryname"])
                            else:
                                et["entryname"] = None
                            et["writable"] = t.get("writable", False)
                            if et["entry"] is not None:
                                ls.append(et)
                    else:
                        initwd_item = builder.do_eval(t)
                        if not initwd_item:
                            continue
                        if isinstance(initwd_item, MutableSequence):
                            ls.extend(initwd_item)
                        else:
                            ls.append(initwd_item)
            for i, t in enumerate(ls):
                if "entry" in t:
                    if isinstance(t["entry"], str):
                        ls[i] = {
                            "class": "File",
                            "basename": t["entryname"],
                            "contents": t["entry"],
                            "writable": t.get("writable"),
                        }
                    else:
                        if t.get("entryname") or t.get("writable"):
                            t = copy.deepcopy(t)
                            if t.get("entryname"):
                                t["entry"]["basename"] = t["entryname"]
                            t["entry"]["writable"] = t.get("writable")
                        ls[i] = t["entry"]
            j.generatefiles["listing"] = ls
            for l in ls:
                self.updatePathmap(builder.outdir, builder.pathmapper, l)
            visit_class([builder.files, builder.bindings],
                        ("File", "Directory"), _check_adjust)

        if debug:
            _logger.debug(
                "[job %s] path mappings is %s",
                j.name,
                json_dumps(
                    {
                        p: builder.pathmapper.mapper(p)
                        for p in builder.pathmapper.files()
                    },
                    indent=4,
                ),
            )

        if self.tool.get("stdin"):
            with SourceLine(self.tool, "stdin", validate.ValidationException,
                            debug):
                j.stdin = builder.do_eval(self.tool["stdin"])
                if j.stdin:
                    reffiles.append({"class": "File", "path": j.stdin})

        if self.tool.get("stderr"):
            with SourceLine(self.tool, "stderr", validate.ValidationException,
                            debug):
                j.stderr = builder.do_eval(self.tool["stderr"])
                if j.stderr:
                    if os.path.isabs(j.stderr) or ".." in j.stderr:
                        raise validate.ValidationException(
                            "stderr must be a relative path, got '%s'" %
                            j.stderr)

        if self.tool.get("stdout"):
            with SourceLine(self.tool, "stdout", validate.ValidationException,
                            debug):
                j.stdout = builder.do_eval(self.tool["stdout"])
                if j.stdout:
                    if os.path.isabs(j.stdout) or ".." in j.stdout:
                        raise validate.ValidationException(
                            "stdout must be a relative path, got '%s'" %
                            j.stdout)

        if debug:
            _logger.debug(
                "[job %s] command line bindings is %s",
                j.name,
                json_dumps(builder.bindings, indent=4),
            )
        dockerReq, _ = self.get_requirement("DockerRequirement")
        if dockerReq is not None and runtimeContext.use_container:
            out_dir, out_prefix = os.path.split(
                runtimeContext.tmp_outdir_prefix)
            j.outdir = runtimeContext.outdir or tempfile.mkdtemp(
                prefix=out_prefix, dir=out_dir)
            tmpdir_dir, tmpdir_prefix = os.path.split(
                runtimeContext.tmpdir_prefix)
            j.tmpdir = runtimeContext.tmpdir or tempfile.mkdtemp(
                prefix=tmpdir_prefix, dir=tmpdir_dir)
            j.stagedir = tempfile.mkdtemp(prefix=tmpdir_prefix, dir=tmpdir_dir)
        else:
            j.outdir = builder.outdir
            j.tmpdir = builder.tmpdir
            j.stagedir = builder.stagedir

        inplaceUpdateReq, _ = self.get_requirement("InplaceUpdateRequirement")
        if inplaceUpdateReq is not None:
            j.inplace_update = inplaceUpdateReq["inplaceUpdate"]
        normalizeFilesDirs(j.generatefiles)

        readers = {}  # type: Dict[str, Any]
        muts = set()  # type: Set[str]

        if builder.mutation_manager is not None:

            def register_mut(f):  # type: (Dict[str, Any]) -> None
                mm = cast(MutationManager, builder.mutation_manager)
                muts.add(f["location"])
                mm.register_mutation(j.name, f)

            def register_reader(f):  # type: (Dict[str, Any]) -> None
                mm = cast(MutationManager, builder.mutation_manager)
                if f["location"] not in muts:
                    mm.register_reader(j.name, f)
                    readers[f["location"]] = copy.deepcopy(f)

            for li in j.generatefiles["listing"]:
                li = cast(Dict[str, Any], li)
                if li.get("writable") and j.inplace_update:
                    adjustFileObjs(li, register_mut)
                    adjustDirObjs(li, register_mut)
                else:
                    adjustFileObjs(li, register_reader)
                    adjustDirObjs(li, register_reader)

            adjustFileObjs(builder.files, register_reader)
            adjustFileObjs(builder.bindings, register_reader)
            adjustDirObjs(builder.files, register_reader)
            adjustDirObjs(builder.bindings, register_reader)

        timelimit, _ = self.get_requirement("ToolTimeLimit")
        if timelimit is not None:
            with SourceLine(timelimit, "timelimit",
                            validate.ValidationException, debug):
                j.timelimit = builder.do_eval(timelimit["timelimit"])
                if not isinstance(j.timelimit, int) or j.timelimit < 0:
                    raise Exception(
                        "timelimit must be an integer >= 0, got: %s" %
                        j.timelimit)

        networkaccess, _ = self.get_requirement("NetworkAccess")
        if networkaccess is not None:
            with SourceLine(networkaccess, "networkAccess",
                            validate.ValidationException, debug):
                j.networkaccess = builder.do_eval(
                    networkaccess["networkAccess"])
                if not isinstance(j.networkaccess, bool):
                    raise Exception(
                        "networkAccess must be a boolean, got: %s" %
                        j.networkaccess)

        j.environment = {}
        evr, _ = self.get_requirement("EnvVarRequirement")
        if evr is not None:
            for t in evr["envDef"]:
                j.environment[t["envName"]] = builder.do_eval(t["envValue"])

        shellcmd, _ = self.get_requirement("ShellCommandRequirement")
        if shellcmd is not None:
            cmd = []  # type: List[str]
            for b in builder.bindings:
                arg = builder.generate_arg(b)
                if b.get("shellQuote", True):
                    arg = [shellescape.quote(a) for a in aslist(arg)]
                cmd.extend(aslist(arg))
            j.command_line = ["/bin/sh", "-c", " ".join(cmd)]
        else:
            j.command_line = flatten(
                list(map(builder.generate_arg, builder.bindings)))

        j.pathmapper = builder.pathmapper
        j.collect_outputs = partial(
            self.collect_output_ports,
            self.tool["outputs"],
            builder,
            compute_checksum=getdefault(runtimeContext.compute_checksum, True),
            jobname=jobname,
            readers=readers,
        )
        j.output_callback = output_callbacks

        yield j
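
The reuse logic above derives a deterministic cache key: the command line, the stdin/stdout/stderr shortcuts, the relevant requirements, and a per-file entry of (size, checksum) or (size, mtime) are serialized as canonical JSON and hashed with MD5. A simplified sketch of just the key construction (function and parameter names are illustrative):

import hashlib
import json

def cache_key(cmdline, file_entries):
    # file_entries maps a resolved path to [size, checksum] or
    # [size, mtime_ms], matching the keydict built above.
    keydict = {"cmdline": cmdline}
    keydict.update(file_entries)
    keydictstr = json.dumps(keydict, separators=(",", ":"), sort_keys=True)
    return hashlib.md5(keydictstr.encode("utf-8")).hexdigest()  # nosec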
Example n. 15
    def visit(
        self,
        obj: CWLObjectType,
        stagedir: str,
        basedir: str,
        copy: bool = False,
        staged: bool = False,
    ) -> None:
        tgt = convert_pathsep_to_unix(
            os.path.join(stagedir, cast(str, obj["basename"]))
        )
        if obj["location"] in self._pathmap:
            return
        if obj["class"] == "Directory":
            location = cast(str, obj["location"])
            if location.startswith("file://"):
                resolved = uri_file_path(location)
            else:
                resolved = location
            self._pathmap[location] = MapperEnt(
                resolved, tgt, "WritableDirectory" if copy else "Directory", staged
            )
            if location.startswith("file://"):
                staged = False
            self.visitlisting(
                cast(List[CWLObjectType], obj.get("listing", [])),
                tgt,
                basedir,
                copy=copy,
                staged=staged,
            )
        elif obj["class"] == "File":
            path = cast(str, obj["location"])
            ab = abspath(path, basedir)
            if "contents" in obj and path.startswith("_:"):
                self._pathmap[path] = MapperEnt(
                    obj["contents"],
                    tgt,
                    "CreateWritableFile" if copy else "CreateFile",
                    staged,
                )
            else:
                with SourceLine(
                    obj,
                    "location",
                    ValidationException,
                    _logger.isEnabledFor(logging.DEBUG),
                ):
                    deref = ab
                    if urllib.parse.urlsplit(deref).scheme in ["http", "https"]:
                        deref = downloadHttpFile(path)
                    else:
                        # Dereference symbolic links
                        st = os.lstat(deref)
                        while stat.S_ISLNK(st.st_mode):
                            rl = os.readlink(deref)
                            deref = (
                                rl
                                if os.path.isabs(rl)
                                else os.path.join(os.path.dirname(deref), rl)
                            )
                            st = os.lstat(deref)

                    self._pathmap[path] = MapperEnt(
                        deref, tgt, "WritableFile" if copy else "File", staged
                    )
            self.visitlisting(
                cast(List[CWLObjectType], obj.get("secondaryFiles", [])),
                stagedir,
                basedir,
                copy=copy,
                staged=staged,
            )
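
The File branch above dereferences symbolic links one hop at a time with os.lstat/os.readlink rather than calling os.path.realpath, which would also canonicalize the directory components of the path. The same loop in isolation (helper name is ours):

import os
import stat

def dereference_links(path):
    # Follow a chain of symlinks to the final target, resolving each
    # relative link against its own directory, as in the loop above.
    st = os.lstat(path)
    while stat.S_ISLNK(st.st_mode):
        rl = os.readlink(path)
        path = rl if os.path.isabs(rl) else os.path.join(
            os.path.dirname(path), rl)
        st = os.lstat(path)
    return path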
Example n. 16
def resolve_and_validate_document(loadingContext,
                                  workflowobj,
                                  uri,
                                  preprocess_only=False,   # type: bool
                                  skip_schemas=None,       # type: Optional[bool]
                                  ):
    # type: (...) -> Tuple[LoadingContext, Text]
    """Validate a CWL document."""
    loadingContext = loadingContext.copy()

    if not isinstance(workflowobj, MutableMapping):
        raise ValueError("workflowjobj must be a dict, got '{}': {}".format(
            type(workflowobj), workflowobj))

    jobobj = None
    if "cwl:tool" in workflowobj:
        jobobj, _ = loadingContext.loader.resolve_all(workflowobj, uri)
        uri = urllib.parse.urljoin(uri, workflowobj["https://w3id.org/cwl/cwl#tool"])
        del cast(dict, jobobj)["https://w3id.org/cwl/cwl#tool"]

        workflowobj = fetch_document(uri, loadingContext)[1]

    fileuri = urllib.parse.urldefrag(uri)[0]

    cwlVersion = loadingContext.metadata.get("cwlVersion")
    if not cwlVersion:
        cwlVersion = workflowobj.get("cwlVersion")
    if not cwlVersion and fileuri != uri:
        # The tool we're loading is a fragment of a bigger file.  Get
        # the document root element and look for cwlVersion there.
        metadata = fetch_document(fileuri, loadingContext)[1]
        cwlVersion = metadata.get("cwlVersion")
    if not cwlVersion:
        raise ValidationException(
            "No cwlVersion found. "
            "Use the following syntax in your CWL document to declare "
            "the version: cwlVersion: <version>.\n"
            "Note: if this is a CWL draft-2 (pre v1.0) document then it "
            "will need to be upgraded first.")

    if not isinstance(cwlVersion, string_types):
        with SourceLine(workflowobj, "cwlVersion", ValidationException):
            raise ValidationException("'cwlVersion' must be a string, "
                                      "got {}".format(
                                          type(cwlVersion)))
    # strip out version
    cwlVersion = re.sub(
        r"^(?:cwl:|https://w3id.org/cwl/cwl#)", "",
        cwlVersion)
    if cwlVersion not in list(ALLUPDATES):
        # list all supported versions of cwlVersion
        versions = []
        for version in list(ALLUPDATES):
            if "dev" in version:
                version += " (with --enable-dev flag only)"
            versions.append(version)
        versions.sort()
        raise ValidationException(
            "The CWL reference runner no longer supports pre CWL v1.0 "
            "documents. Supported versions are: "
            "\n{}".format("\n".join(versions)))

    if isinstance(jobobj, CommentedMap) and "http://commonwl.org/cwltool#overrides" in jobobj:
        loadingContext.overrides_list.extend(resolve_overrides(jobobj, uri, uri))
        del jobobj["http://commonwl.org/cwltool#overrides"]

    if isinstance(jobobj, CommentedMap) and "https://w3id.org/cwl/cwl#requirements" in jobobj:
        if cwlVersion not in ("v1.1.0-dev1", "v1.1"):
            raise ValidationException(
                    "`cwl:requirements` in the input object is not part of CWL "
                    "v1.0. You can adjust it to use `cwltool:overrides` instead; "
                    "or you can set the cwlVersion to v1.1 or greater.")
        loadingContext.overrides_list.append({"overrideTarget": uri,
                                              "requirements": jobobj["https://w3id.org/cwl/cwl#requirements"]})
        del jobobj["https://w3id.org/cwl/cwl#requirements"]

    (sch_document_loader, avsc_names) = \
        process.get_schema(cwlVersion)[:2]

    if isinstance(avsc_names, Exception):
        raise avsc_names

    processobj = None  # type: Union[CommentedMap, CommentedSeq, Text, None]
    document_loader = Loader(sch_document_loader.ctx,
                             schemagraph=sch_document_loader.graph,
                             idx=loadingContext.loader.idx,
                             cache=sch_document_loader.cache,
                             fetcher_constructor=loadingContext.fetcher_constructor,
                             skip_schemas=skip_schemas)

    if cwlVersion == "v1.0":
        _add_blank_ids(workflowobj)

    processobj, metadata = document_loader.resolve_all(workflowobj, fileuri)
    if loadingContext.metadata:
        metadata = loadingContext.metadata
    if not isinstance(processobj, (CommentedMap, CommentedSeq)):
        raise ValidationException("Workflow must be a CommentedMap or CommentedSeq.")
    if not isinstance(metadata, CommentedMap):
        raise ValidationException("metadata must be a CommentedMap, was %s" % type(metadata))

    if isinstance(processobj, CommentedMap):
        uri = processobj["id"]

    _convert_stdstreams_to_files(workflowobj)

    if preprocess_only:
        return loadingContext, uri

    if loadingContext.do_validate:
        schema.validate_doc(avsc_names, processobj, document_loader, loadingContext.strict)

    # None means default behavior (do update)
    if loadingContext.do_update in (True, None):
        if "cwlVersion" not in metadata:
            metadata["cwlVersion"] = cwlVersion
        processobj = update.update(
            processobj, document_loader, fileuri, loadingContext.enable_dev, metadata)
        document_loader.idx[processobj["id"]] = processobj

    if jobobj is not None:
        loadingContext.jobdefaults = jobobj

    loadingContext.loader = document_loader
    loadingContext.avsc_names = avsc_names
    loadingContext.metadata = metadata

    return loadingContext, uri
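
One detail worth isolating from the function above: cwlVersion may arrive either bare ("v1.0") or as a full IRI, so it is normalized with a regex before the ALLUPDATES lookup. For example:

import re

def normalize_cwl_version(raw):
    # Strip the "cwl:" shorthand or the full CWL namespace IRI so the
    # remainder can be compared against the supported-version table.
    return re.sub(r"^(?:cwl:|https://w3id.org/cwl/cwl#)", "", raw)

assert normalize_cwl_version("https://w3id.org/cwl/cwl#v1.0") == "v1.0"
assert normalize_cwl_version("v1.1") == "v1.1"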
Example n. 17
    def _init_job(self, joborder, runtime_context):
        # type: (Mapping[Text, Text], RuntimeContext) -> Builder

        job = cast(Dict[Text, Union[Dict[Text, Any], List[Any], Text, None]],
                   copy.deepcopy(joborder))

        make_fs_access = getdefault(runtime_context.make_fs_access,
                                    StdFsAccess)
        fs_access = make_fs_access(runtime_context.basedir)

        load_listing_req, _ = self.get_requirement(
            "http://commonwl.org/cwltool#LoadListingRequirement")
        if load_listing_req is not None:
            load_listing = load_listing_req.get("loadListing")
        else:
            load_listing = "deep_listing"  # will default to "no_listing" in CWL v1.1

        # Validate job order
        try:
            fill_in_defaults(self.tool[u"inputs"], job, fs_access)

            normalizeFilesDirs(job)
            schema = self.names.get_name("input_record_schema", "")
            if schema is None:
                raise WorkflowException("Missing input record schema: "
                                        "{}".format(self.names))
            validate.validate_ex(schema,
                                 job,
                                 strict=False,
                                 logger=_logger_validation_warnings)

            if load_listing and load_listing != "no_listing":
                get_listing(fs_access,
                            job,
                            recursive=(load_listing == "deep_listing"))

            visit_class(job, ("File", ),
                        functools.partial(add_sizes, fs_access))

            if load_listing == "deep_listing" and load_listing_req is None:
                for i, inparm in enumerate(self.tool["inputs"]):
                    k = shortname(inparm["id"])
                    if k not in job:
                        continue
                    v = job[k]
                    dircount = [0]

                    def inc(d):  # type: (List[int]) -> None
                        d[0] += 1

                    visit_class(v, ("Directory", ), lambda x: inc(dircount))
                    if dircount[0] == 0:
                        continue
                    filecount = [0]
                    visit_class(v, ("File", ), lambda x: inc(filecount))
                    if filecount[0] > FILE_COUNT_WARNING:
                        # Long lines in this message are okay, will be reflowed based on terminal columns.
                        _logger.warning(
                            strip_dup_lineno(
                                SourceLine(self.tool["inputs"], i, Text).
                                makeError(
                                    """Recursive directory listing has resulted in a large number of File objects (%s) passed to the input parameter '%s'.  This may negatively affect workflow performance and memory use.

If this is a problem, use the hint 'cwltool:LoadListingRequirement' with "shallow_listing" or "no_listing" to change the directory listing behavior:

$namespaces:
  cwltool: "http://commonwl.org/cwltool#"
hints:
  cwltool:LoadListingRequirement:
    loadListing: shallow_listing

""" % (filecount[0], k))))

        except (validate.ValidationException, WorkflowException) as err:
            raise WorkflowException("Invalid job input record:\n" + Text(err))

        files = []  # type: List[Dict[Text, Text]]
        bindings = CommentedSeq()
        tmpdir = u""
        stagedir = u""

        docker_req, _ = self.get_requirement("DockerRequirement")
        default_docker = None

        if docker_req is None and runtime_context.default_container:
            default_docker = runtime_context.default_container

        if (docker_req or default_docker) and runtime_context.use_container:
            if docker_req is not None:
                # Check if docker output directory is absolute
                if docker_req.get("dockerOutputDirectory") and \
                        docker_req.get("dockerOutputDirectory").startswith('/'):
                    outdir = docker_req.get("dockerOutputDirectory")
                else:
                    outdir = docker_req.get("dockerOutputDirectory") or \
                        runtime_context.docker_outdir or random_outdir()
            elif default_docker is not None:
                outdir = runtime_context.docker_outdir or random_outdir()
            tmpdir = runtime_context.docker_tmpdir or "/tmp"
            stagedir = runtime_context.docker_stagedir or "/var/lib/cwl"
        else:
            outdir = fs_access.realpath(
                runtime_context.outdir or tempfile.mkdtemp(prefix=getdefault(
                    runtime_context.tmp_outdir_prefix, DEFAULT_TMP_PREFIX)))
            if self.tool[u"class"] != 'Workflow':
                tmpdir = fs_access.realpath(runtime_context.tmpdir
                                            or tempfile.mkdtemp())
                stagedir = fs_access.realpath(runtime_context.stagedir
                                              or tempfile.mkdtemp())

        builder = Builder(job, files, bindings, self.schemaDefs, self.names,
                          self.requirements, self.hints, {},
                          runtime_context.mutation_manager, self.formatgraph,
                          make_fs_access, fs_access,
                          runtime_context.job_script_provider,
                          runtime_context.eval_timeout, runtime_context.debug,
                          runtime_context.js_console,
                          runtime_context.force_docker_pull, load_listing,
                          outdir, tmpdir, stagedir)

        bindings.extend(
            builder.bind_input(self.inputs_record_schema,
                               job,
                               discover_secondaryFiles=getdefault(
                                   runtime_context.toplevel, False)))

        if self.tool.get("baseCommand"):
            for index, command in enumerate(aslist(self.tool["baseCommand"])):
                bindings.append({
                    "position": [-1000000, index],
                    "datum": command
                })

        if self.tool.get("arguments"):
            for i, arg in enumerate(self.tool["arguments"]):
                lc = self.tool["arguments"].lc.data[i]
                filename = self.tool["arguments"].lc.filename
                bindings.lc.add_kv_line_col(len(bindings), lc)
                if isinstance(arg, MutableMapping):
                    arg = copy.deepcopy(arg)
                    if arg.get("position"):
                        arg["position"] = [arg["position"], i]
                    else:
                        arg["position"] = [0, i]
                    bindings.append(arg)
                elif ("$(" in arg) or ("${" in arg):
                    cm = CommentedMap((("position", [0,
                                                     i]), ("valueFrom", arg)))
                    cm.lc.add_kv_line_col("valueFrom", lc)
                    cm.lc.filename = filename
                    bindings.append(cm)
                else:
                    cm = CommentedMap((("position", [0, i]), ("datum", arg)))
                    cm.lc.add_kv_line_col("datum", lc)
                    cm.lc.filename = filename
                    bindings.append(cm)

        # use python2 like sorting of heterogeneous lists
        # (containing str and int types),
        if PY3:
            key = functools.cmp_to_key(cmp_like_py2)
        else:  # PY2
            key = lambda d: d["position"]

        # This awkward construction replaces the contents of
        # "bindings" in place (because Builder expects it to be
        # mutated in place, sigh, I'm sorry) with its contents sorted,
        # supporting different versions of Python and ruamel.yaml with
        # different behaviors/bugs in CommentedSeq.
        bindings_copy = copy.deepcopy(bindings)
        del bindings[:]
        bindings.extend(sorted(bindings_copy, key=key))

        if self.tool[u"class"] != 'Workflow':
            builder.resources = self.evalResources(builder, runtime_context)
        return builder
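
The binding sort above relies on cwltool's cmp_like_py2 helper because "position" lists may mix ints and strings, which Python 3 refuses to compare directly. A simplified stand-in with the same ordering rule (ints sort before strings), not the library's exact implementation:

import functools

def _cmp_py2(x, y):
    # Python 2 ordering for mixed scalars: ints sort before strings.
    if isinstance(x, int) != isinstance(y, int):
        return -1 if isinstance(x, int) else 1
    return (x > y) - (x < y)

def cmp_positions(a, b):
    for x, y in zip(a["position"], b["position"]):
        c = _cmp_py2(x, y)
        if c != 0:
            return c
    return len(a["position"]) - len(b["position"])

bindings = [{"position": [0, "zz"]}, {"position": [0, 1]}]
bindings.sort(key=functools.cmp_to_key(cmp_positions))
assert bindings[0]["position"] == [0, 1]  # the int position sorts first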
Example n. 18
    def get_image(
            dockerRequirement,  # type: Dict[str, str]
            pull_image,  # type: bool
            force_pull=False,  # type: bool
    ):
        # type: (...) -> bool
        """
        Acquire the software container image in the specified dockerRequirement.

        Uses Singularity and returns the success as a bool. Updates the
        provided dockerRequirement with the specific dockerImageId to the full
        path of the local image, if found. Likewise the
        dockerRequirement['dockerPull'] is updated to a docker:// URI if needed.
        """
        found = False

        candidates = []

        cache_folder = None
        if "CWL_SINGULARITY_CACHE" in os.environ:
            cache_folder = os.environ["CWL_SINGULARITY_CACHE"]
        elif is_version_2_6() and "SINGULARITY_PULLFOLDER" in os.environ:
            cache_folder = os.environ["SINGULARITY_PULLFOLDER"]

        if ("dockerImageId" not in dockerRequirement
                and "dockerPull" in dockerRequirement):
            match = re.search(pattern=r"([a-z]*://)",
                              string=dockerRequirement["dockerPull"])
            img_name = _normalize_image_id(dockerRequirement["dockerPull"])
            candidates.append(img_name)
            if is_version_3_or_newer():
                sif_name = _normalize_sif_id(dockerRequirement["dockerPull"])
                candidates.append(sif_name)
                dockerRequirement["dockerImageId"] = sif_name
            else:
                dockerRequirement["dockerImageId"] = img_name
            if not match:
                dockerRequirement["dockerPull"] = (
                    "docker://" + dockerRequirement["dockerPull"])
        elif "dockerImageId" in dockerRequirement:
            if os.path.isfile(dockerRequirement["dockerImageId"]):
                found = True
            candidates.append(dockerRequirement["dockerImageId"])
            candidates.append(
                _normalize_image_id(dockerRequirement["dockerImageId"]))
            if is_version_3_or_newer() and "dockerPull" in dockerRequirement:
                candidates.append(
                    _normalize_sif_id(dockerRequirement["dockerPull"]))

        targets = [os.getcwd()]
        if "CWL_SINGULARITY_CACHE" in os.environ:
            targets.append(os.environ["CWL_SINGULARITY_CACHE"])
        if is_version_2_6() and "SINGULARITY_PULLFOLDER" in os.environ:
            targets.append(os.environ["SINGULARITY_PULLFOLDER"])
        for target in targets:
            for dirpath, subdirs, files in os.walk(target):
                for entry in files:
                    if entry in candidates:
                        path = os.path.join(dirpath, entry)
                        if os.path.isfile(path):
                            _logger.info(
                                "Using local copy of Singularity image found in %s",
                                dirpath,
                            )
                            dockerRequirement["dockerImageId"] = path
                            found = True
        if (force_pull or not found) and pull_image:
            cmd = []  # type: List[str]
            if "dockerPull" in dockerRequirement:
                if cache_folder:
                    env = os.environ.copy()
                    if is_version_2_6():
                        env["SINGULARITY_PULLFOLDER"] = cache_folder
                        cmd = [
                            "singularity",
                            "pull",
                            "--force",
                            "--name",
                            dockerRequirement["dockerImageId"],
                            str(dockerRequirement["dockerPull"]),
                        ]
                    else:
                        cmd = [
                            "singularity",
                            "pull",
                            "--force",
                            "--name",
                            "{}/{}".format(cache_folder,
                                           dockerRequirement["dockerImageId"]),
                            str(dockerRequirement["dockerPull"]),
                        ]

                    _logger.info(str(cmd))
                    check_call(cmd, env=env, stdout=sys.stderr)  # nosec
                    dockerRequirement["dockerImageId"] = "{}/{}".format(
                        cache_folder, dockerRequirement["dockerImageId"])
                    found = True
                else:
                    cmd = [
                        "singularity",
                        "pull",
                        "--force",
                        "--name",
                        str(dockerRequirement["dockerImageId"]),
                        str(dockerRequirement["dockerPull"]),
                    ]
                    _logger.info(str(cmd))
                    check_call(cmd, stdout=sys.stderr)  # nosec
                    found = True

            elif "dockerFile" in dockerRequirement:
                raise WorkflowException(
                    SourceLine(dockerRequirement, "dockerFile").makeError(
                        "dockerFile is not currently supported when using the "
                        "Singularity runtime for Docker containers."))
            elif "dockerLoad" in dockerRequirement:
                if is_version_3_1_or_newer():
                    if "dockerImageId" in dockerRequirement:
                        name = "{}.sif".format(
                            dockerRequirement["dockerImageId"])
                    else:
                        name = "{}.sif".format(dockerRequirement["dockerLoad"])
                    cmd = [
                        "singularity",
                        "build",
                        name,
                        "docker-archive://{}".format(
                            dockerRequirement["dockerLoad"]),
                    ]
                    _logger.info(str(cmd))
                    check_call(cmd, stdout=sys.stderr)  # nosec
                    found = True
                    dockerRequirement["dockerImageId"] = name
                else:
                    raise WorkflowException(
                        SourceLine(dockerRequirement, "dockerLoad").makeError(
                            "dockerLoad is not currently supported when using "
                            "the Singularity runtime (version less than 3.1) "
                            "for Docker containers."))
            elif "dockerImport" in dockerRequirement:
                raise WorkflowException(
                    SourceLine(dockerRequirement, "dockerImport").makeError(
                        "dockerImport is not currently supported when using the "
                        "Singularity runtime for Docker containers."))

        return found
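
Before pulling, the function walks the working directory and any configured cache folders looking for a file whose basename matches one of the candidate image names. The search in isolation, returning the first hit instead of mutating the requirement (helper name is ours):

import os

def find_local_image(candidates, targets):
    # Return the path of the first file under any target directory whose
    # basename matches a candidate Singularity image name, else None.
    for target in targets:
        for dirpath, _subdirs, files in os.walk(target):
            for entry in files:
                if entry in candidates:
                    path = os.path.join(dirpath, entry)
                    if os.path.isfile(path):
                        return path
    return None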
Example n. 19
def set_secondary(fsaccess, builder, inputschema, secondaryspec, primary,
                  discovered):
    if isinstance(inputschema,
                  Sequence) and not isinstance(inputschema, basestring):
        # union type, collect all possible secondaryFiles
        for i in inputschema:
            set_secondary(fsaccess, builder, i, secondaryspec, primary,
                          discovered)
        return

    if isinstance(inputschema, basestring):
        sd = search_schemadef(inputschema,
                              reversed(builder.hints + builder.requirements))
        if sd:
            inputschema = sd
        else:
            return

    if "secondaryFiles" in inputschema:
        # set secondaryFiles, may be inherited by compound types.
        secondaryspec = inputschema["secondaryFiles"]

    if (isinstance(inputschema["type"], (Mapping, Sequence))
            and not isinstance(inputschema["type"], basestring)):
        # compound type (union, array, record)
        set_secondary(fsaccess, builder, inputschema["type"], secondaryspec,
                      primary, discovered)

    elif (inputschema["type"] == "record" and isinstance(primary, Mapping)):
        #
        # record type, find secondary files associated with fields.
        #
        for f in inputschema["fields"]:
            p = primary.get(shortname(f["name"]))
            if p:
                set_secondary(fsaccess, builder, f, secondaryspec, p,
                              discovered)

    elif (inputschema["type"] == "array" and isinstance(primary, Sequence)):
        #
        # array type, find secondary files of elements
        #
        for p in primary:
            set_secondary(fsaccess, builder, {"type": inputschema["items"]},
                          secondaryspec, p, discovered)

    elif (inputschema["type"] == "File" and secondaryspec
          and isinstance(primary, Mapping) and primary.get("class") == "File"
          and "secondaryFiles" not in primary):
        #
        # Found a file, check for secondaryFiles
        #
        specs = []
        primary["secondaryFiles"] = secondaryspec
        for i, sf in enumerate(aslist(secondaryspec)):
            if builder.cwlVersion == "v1.0":
                pattern = builder.do_eval(sf, context=primary)
            else:
                pattern = builder.do_eval(sf["pattern"], context=primary)
            if pattern is None:
                continue
            if isinstance(pattern, list):
                specs.extend(pattern)
            elif isinstance(pattern, dict):
                specs.append(pattern)
            elif isinstance(pattern, str):
                specs.append({"pattern": pattern})
            else:
                raise SourceLine(
                    primary["secondaryFiles"], i,
                    validate.ValidationException).makeError(
                        "Expression must return list, object, string or null")

        found = []
        for i, sf in enumerate(specs):
            if isinstance(sf, dict):
                if sf.get("class") == "File":
                    pattern = sf["basename"]
                else:
                    pattern = sf["pattern"]
                    required = sf.get("required")
            elif isinstance(sf, str):
                pattern = sf
                required = True
            else:
                raise SourceLine(
                    primary["secondaryFiles"], i,
                    validate.ValidationException).makeError(
                        "Expression must return list, object, string or null")

            sfpath = substitute(primary["location"], pattern)
            required = builder.do_eval(required, context=primary)

            if fsaccess.exists(sfpath):
                found.append({"location": sfpath, "class": "File"})
            elif required:
                raise SourceLine(
                    primary["secondaryFiles"], i,
                    validate.ValidationException).makeError(
                        "Required secondary file '%s' does not exist" % sfpath)

        primary["secondaryFiles"] = cmap(found)
        if discovered is not None:
            discovered[primary["location"]] = primary["secondaryFiles"]
    elif inputschema["type"] not in primitive_types_set:
        set_secondary(fsaccess, builder, inputschema["type"], secondaryspec,
                      primary, discovered)
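
The secondary-file patterns handled above use the CWL "^" convention: each leading caret strips one extension from the primary name before the rest of the pattern is appended. cwltool's substitute helper implements it roughly like this:

def substitute(value, replace):
    # Each leading "^" removes one extension from the primary filename;
    # whatever remains of the pattern is appended.
    if replace.startswith("^"):
        try:
            return substitute(value[0:value.rindex(".")], replace[1:])
        except ValueError:
            # No extension left to strip; drop any remaining carets.
            return value + replace.lstrip("^")
    return value + replace

assert substitute("reads.bam", ".bai") == "reads.bam.bai"
assert substitute("reads.bam", "^.bai") == "reads.bai"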
Example n. 20
    def job(
        self,
        joborder,  # type: MutableMapping[Text, Any]
        output_callback,  # type: Callable[[Any, Any], Any]
        runtimeContext  # type: RuntimeContext
    ):  # type: (...) -> Generator
        self.state = {}
        self.processStatus = "success"

        if _logger.isEnabledFor(logging.DEBUG):
            _logger.debug(u"[%s] %s", self.name, json_dumps(joborder,
                                                            indent=4))

        runtimeContext = runtimeContext.copy()
        runtimeContext.outdir = None

        for index, inp in enumerate(self.tool["inputs"]):
            with SourceLine(self.tool["inputs"], index, WorkflowException,
                            _logger.isEnabledFor(logging.DEBUG)):
                inp_id = shortname(inp["id"])
                if inp_id in joborder:
                    self.state[inp["id"]] = WorkflowStateItem(
                        inp, copy.deepcopy(joborder[inp_id]), "success")
                elif "default" in inp:
                    self.state[inp["id"]] = WorkflowStateItem(
                        inp, copy.deepcopy(inp["default"]), "success")
                else:
                    raise WorkflowException(
                        u"Input '%s' not in input object and does not have a "
                        " default value." % (inp["id"]))

        for step in self.steps:
            for out in step.tool["outputs"]:
                self.state[out["id"]] = None

        completed = 0
        while completed < len(self.steps):
            self.made_progress = False

            for step in self.steps:
                if getdefault(
                        runtimeContext.on_error,
                        "stop") == "stop" and self.processStatus != "success":
                    break

                if not step.submitted:
                    try:
                        step.iterable = self.try_make_job(
                            step, output_callback, runtimeContext)
                    except WorkflowException as exc:
                        _logger.error(u"[%s] Cannot make job: %s", step.name,
                                      exc)
                        _logger.debug("", exc_info=True)
                        self.processStatus = "permanentFail"

                if step.iterable is not None:
                    try:
                        for newjob in step.iterable:
                            if getdefault(runtimeContext.on_error, "stop") == "stop" \
                                    and self.processStatus != "success":
                                break
                            if newjob is not None:
                                self.made_progress = True
                                yield newjob
                            else:
                                break
                    except WorkflowException as exc:
                        _logger.error(u"[%s] Cannot make job: %s", step.name,
                                      exc)
                        _logger.debug("", exc_info=True)
                        self.processStatus = "permanentFail"

            completed = sum(1 for s in self.steps if s.completed)

            if not self.made_progress and completed < len(self.steps):
                if self.processStatus != "success":
                    break
                else:
                    yield None

        if not self.did_callback:
            self.do_output_callback(
                output_callback)  # the callback may already have fired earlier
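
The "yield None" above is the workflow's way of telling its caller that no step is currently runnable but the workflow is not finished; the caller should let in-flight jobs complete and then resume iterating. A hypothetical driver loop illustrating that contract (run() stands in for however the caller actually executes a job):

import time

def drive(workflow_jobs):
    # workflow_jobs is the generator returned by job(); None means
    # "no runnable step right now, poll again after pending jobs finish".
    for runnable in workflow_jobs:
        if runnable is not None:
            runnable.run()
        else:
            time.sleep(1)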
Example n. 21
    def job(self, joborder, output_callback, runtimeContext):

        builder = make_builder(joborder, self.hints, self.requirements,
                               runtimeContext)
        runtimeContext = set_cluster_target(self.tool, self.arvrunner, builder,
                                            runtimeContext)

        req, _ = self.get_requirement(
            "http://arvados.org/cwl#RunInSingleContainer")
        if not req:
            return super(ArvadosWorkflow, self).job(joborder, output_callback,
                                                    runtimeContext)

        # RunInSingleContainer is true

        with SourceLine(self.tool, None, WorkflowException,
                        logger.isEnabledFor(logging.DEBUG)):
            if "id" not in self.tool:
                raise WorkflowException("%s object must have 'id'" %
                                        (self.tool["class"]))
        document_loader, workflowobj, uri = (self.doc_loader,
                                             self.doc_loader.fetch(
                                                 self.tool["id"]),
                                             self.tool["id"])

        discover_secondary_files(self.tool["inputs"], joborder)

        with Perf(metrics, "subworkflow upload_deps"):
            upload_dependencies(self.arvrunner,
                                os.path.basename(joborder.get("id", "#")),
                                document_loader, joborder,
                                joborder.get("id", "#"), False)

            if self.wf_pdh is None:
                workflowobj["requirements"] = dedup_reqs(self.requirements)
                workflowobj["hints"] = dedup_reqs(self.hints)

                packed = pack(document_loader, workflowobj, uri, self.metadata)

                def visit(item):
                    for t in ("hints", "requirements"):
                        if t not in item:
                            continue
                        for req in item[t]:
                            if req["class"] == "ResourceRequirement":
                                dyn = False
                                for k in max_res_pars + sum_res_pars:
                                    if k in req:
                                        if isinstance(req[k], basestring):
                                            if item["id"] == "#main":
                                                # only the top-level requirements/hints may contain expressions
                                                self.dynamic_resource_req.append(
                                                    req)
                                                dyn = True
                                                break
                                            else:
                                                with SourceLine(
                                                        req, k,
                                                        WorkflowException):
                                                    raise WorkflowException(
                                                        "Non-top-level ResourceRequirement in single container cannot have expressions"
                                                    )
                                if not dyn:
                                    self.static_resource_req.append(req)
                            if req["class"] == "DockerRequirement":
                                if "http://arvados.org/cwl#dockerCollectionPDH" in req:
                                    del req[
                                        "http://arvados.org/cwl#dockerCollectionPDH"]

                visit_class(packed["$graph"], ("Workflow", "CommandLineTool"),
                            visit)

                if self.static_resource_req:
                    self.static_resource_req = [
                        get_overall_res_req(self.static_resource_req)
                    ]

                upload_dependencies(self.arvrunner, runtimeContext.name,
                                    document_loader, packed, uri, False)

                # Discover files/directories referenced by the
                # workflow (mainly "default" values)
                visit_class(packed, ("File", "Directory"),
                            self.wf_reffiles.append)

        if self.dynamic_resource_req:
            # Evaluate dynamic resource requirements using current builder
            rs = copy.copy(self.static_resource_req)
            for dyn_rs in self.dynamic_resource_req:
                eval_req = {"class": "ResourceRequirement"}
                for a in max_res_pars + sum_res_pars:
                    if a in dyn_rs:
                        eval_req[a] = builder.do_eval(dyn_rs[a])
                rs.append(eval_req)
            job_res_reqs = [get_overall_res_req(rs)]
        else:
            job_res_reqs = self.static_resource_req

        with Perf(metrics, "subworkflow adjust"):
            joborder_resolved = copy.deepcopy(joborder)
            joborder_keepmount = copy.deepcopy(joborder)

            reffiles = []
            visit_class(joborder_keepmount, ("File", "Directory"),
                        reffiles.append)

            mapper = ArvPathMapper(self.arvrunner, reffiles + self.wf_reffiles,
                                   runtimeContext.basedir, "/keep/%s",
                                   "/keep/%s/%s")

            # For containers API, we need to make sure any extra
            # referenced files (ie referenced by the workflow but
            # not in the inputs) are included in the mounts.
            if self.wf_reffiles:
                runtimeContext = runtimeContext.copy()
                runtimeContext.extra_reffiles = copy.deepcopy(self.wf_reffiles)

            def keepmount(obj):
                remove_redundant_fields(obj)
                with SourceLine(obj, None, WorkflowException,
                                logger.isEnabledFor(logging.DEBUG)):
                    if "location" not in obj:
                        raise WorkflowException(
                            "%s object is missing required 'location' field: %s"
                            % (obj["class"], obj))
                with SourceLine(obj, "location", WorkflowException,
                                logger.isEnabledFor(logging.DEBUG)):
                    if obj["location"].startswith("keep:"):
                        obj["location"] = mapper.mapper(obj["location"]).target
                        if "listing" in obj:
                            del obj["listing"]
                    elif obj["location"].startswith("_:"):
                        del obj["location"]
                    else:
                        raise WorkflowException(
                            "Location is not a keep reference or a literal: '%s'"
                            % obj["location"])

            visit_class(joborder_keepmount, ("File", "Directory"), keepmount)

            def resolved(obj):
                if obj["location"].startswith("keep:"):
                    obj["location"] = mapper.mapper(obj["location"]).resolved

            visit_class(joborder_resolved, ("File", "Directory"), resolved)

            if self.wf_pdh is None:
                adjustFileObjs(packed, keepmount)
                adjustDirObjs(packed, keepmount)
                self.wf_pdh = upload_workflow_collection(
                    self.arvrunner, shortname(self.tool["id"]), packed)

        wf_runner = cmap({
            "class": "CommandLineTool",
            "baseCommand": "cwltool",
            "inputs": self.tool["inputs"],
            "outputs": self.tool["outputs"],
            "stdout": "cwl.output.json",
            "requirements": self.requirements + job_res_reqs + [
                {"class": "InlineJavascriptRequirement"},
                {
                    "class": "InitialWorkDirRequirement",
                    "listing": [{
                        "entryname": "workflow.cwl",
                        "entry": '$({"class": "File", "location": "keep:%s/workflow.cwl"})' % self.wf_pdh
                    }, {
                        "entryname": "cwl.input.yml",
                        "entry": json.dumps(joborder_keepmount,
                                            indent=2,
                                            sort_keys=True,
                                            separators=(',', ': ')).replace(
                                                "\\", "\\\\").replace(
                                                    '$(', '\\$(').replace(
                                                        '${', '\\${')
                    }]
                }
            ],
            "hints": self.hints,
            "arguments": ["--no-container", "--move-outputs",
                          "--preserve-entire-environment",
                          "workflow.cwl#main", "cwl.input.yml"],
            "id": "#"
        })
        return ArvadosCommandTool(self.arvrunner, wf_runner,
                                  self.loadingContext).job(
                                      joborder_resolved, output_callback,
                                      runtimeContext)
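
The trickiest part of the wf_runner wrapper is the cwl.input.yml entry: the job order is serialized to JSON, and backslashes plus the $( and ${ expression openers are escaped so that cwltool stages the file verbatim instead of re-evaluating it as a parameter reference. A standalone sketch of that escaping (the helper name is illustrative):

import json

def escape_for_initial_workdir(obj):
    # Serialize the job order, then escape backslashes and the $( / ${
    # expression openers so cwltool stages the file verbatim.
    text = json.dumps(obj, indent=2, sort_keys=True, separators=(',', ': '))
    return text.replace("\\", "\\\\").replace(
        '$(', '\\$(').replace('${', '\\${')

print(escape_for_initial_workdir({"msg": "$(inputs.x)"}))
# {
#   "msg": "\$(inputs.x)"
# }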
Esempio n. 22
0
    def job(
            self,
            job_order,  # type: Dict[Text, Text]
            output_callbacks,  # type: Callable[[Any, Any], Any]
            **kwargs  # type: Any
    ):
        # type: (...) -> Generator[Union[JobBase, CallbackJob], None, None]

        jobname = uniquename(
            kwargs.get("name", shortname(self.tool.get("id", "job"))))
        if kwargs.get("cachedir"):
            cacheargs = kwargs.copy()
            cacheargs["outdir"] = "/out"
            cacheargs["tmpdir"] = "/tmp"
            cacheargs["stagedir"] = "/stage"
            cachebuilder = self._init_job(job_order, **cacheargs)
            cachebuilder.pathmapper = PathMapper(cachebuilder.files,
                                                 kwargs["basedir"],
                                                 cachebuilder.stagedir,
                                                 separateDirs=False)
            _check_adjust = partial(check_adjust, cachebuilder)
            visit_class([cachebuilder.files, cachebuilder.bindings],
                        ("File", "Directory"), _check_adjust)

            cmdline = flatten(
                list(map(cachebuilder.generate_arg, cachebuilder.bindings)))
            (docker_req,
             docker_is_req) = self.get_requirement("DockerRequirement")
            if docker_req and kwargs.get("use_container"):
                dockerimg = docker_req.get("dockerImageId") or docker_req.get(
                    "dockerPull")
            elif kwargs.get("default_container",
                            None) is not None and kwargs.get("use_container"):
                dockerimg = kwargs.get("default_container")

            if dockerimg:
                cmdline = ["docker", "run", dockerimg] + cmdline
            keydict = {u"cmdline": cmdline}

            if "stdout" in self.tool:
                keydict["stdout"] = self.tool["stdout"]
            for location, f in cachebuilder.pathmapper.items():
                if f.type == "File":
                    checksum = next(
                        (e['checksum'] for e in cachebuilder.files
                         if 'location' in e and e['location'] == location
                         and 'checksum' in e and e['checksum'] != 'sha1$hash'),
                        None)
                    st = os.stat(f.resolved)
                    if checksum:
                        keydict[f.resolved] = [st.st_size, checksum]
                    else:
                        keydict[f.resolved] = [
                            st.st_size, int(st.st_mtime * 1000)
                        ]

            interesting = {
                "DockerRequirement", "EnvVarRequirement",
                "CreateFileRequirement", "ShellCommandRequirement"
            }
            for rh in (self.requirements, self.hints):
                for r in reversed(rh):
                    if r["class"] in interesting and r["class"] not in keydict:
                        keydict[r["class"]] = r

            keydictstr = json.dumps(keydict,
                                    separators=(',', ':'),
                                    sort_keys=True)
            cachekey = hashlib.md5(keydictstr.encode('utf-8')).hexdigest()

            _logger.debug("[job %s] keydictstr is %s -> %s", jobname,
                          keydictstr, cachekey)

            jobcache = os.path.join(kwargs["cachedir"], cachekey)
            jobcachepending = jobcache + ".pending"

            if os.path.isdir(jobcache) and not os.path.isfile(jobcachepending):
                if docker_req and kwargs.get("use_container"):
                    cachebuilder.outdir = kwargs.get(
                        "docker_outdir") or "/var/spool/cwl"
                else:
                    cachebuilder.outdir = jobcache

                _logger.info("[job %s] Using cached output in %s", jobname,
                             jobcache)
                yield CallbackJob(self, output_callbacks, cachebuilder,
                                  jobcache)
                return
            else:
                _logger.info("[job %s] Output of job will be cached in %s",
                             jobname, jobcache)
                shutil.rmtree(jobcache, True)
                os.makedirs(jobcache)
                kwargs["outdir"] = jobcache
                open(jobcachepending, "w").close()

                def rm_pending_output_callback(output_callbacks,
                                               jobcachepending, outputs,
                                               processStatus):
                    if processStatus == "success":
                        os.remove(jobcachepending)
                    output_callbacks(outputs, processStatus)

                output_callbacks = cast(
                    Callable[..., Any],  # known bug in mypy
                    # https://github.com/python/mypy/issues/797
                    partial(rm_pending_output_callback, output_callbacks,
                            jobcachepending))

        builder = self._init_job(job_order, **kwargs)

        reffiles = copy.deepcopy(builder.files)

        j = self.makeJobRunner(**kwargs)
        j.builder = builder
        j.joborder = builder.job
        j.make_pathmapper = self.makePathMapper
        j.stdin = None
        j.stderr = None
        j.stdout = None
        j.successCodes = self.tool.get("successCodes")
        j.temporaryFailCodes = self.tool.get("temporaryFailCodes")
        j.permanentFailCodes = self.tool.get("permanentFailCodes")
        j.requirements = self.requirements
        j.hints = self.hints
        j.name = jobname

        debug = _logger.isEnabledFor(logging.DEBUG)

        if debug:
            _logger.debug(
                u"[job %s] initializing from %s%s", j.name,
                self.tool.get("id", ""), u" as part of %s" %
                kwargs["part_of"] if "part_of" in kwargs else "")
            _logger.debug(u"[job %s] %s", j.name,
                          json.dumps(job_order, indent=4))

        builder.pathmapper = None
        make_path_mapper_kwargs = kwargs
        if "stagedir" in make_path_mapper_kwargs:
            make_path_mapper_kwargs = make_path_mapper_kwargs.copy()
            del make_path_mapper_kwargs["stagedir"]

        builder.pathmapper = self.makePathMapper(reffiles, builder.stagedir,
                                                 **make_path_mapper_kwargs)
        builder.requirements = j.requirements

        _check_adjust = partial(check_adjust, builder)

        visit_class([builder.files, builder.bindings], ("File", "Directory"),
                    _check_adjust)

        initialWorkdir = self.get_requirement("InitialWorkDirRequirement")[0]
        j.generatefiles = {"class": "Directory", "listing": [], "basename": ""}
        if initialWorkdir:
            ls = []  # type: List[Dict[Text, Any]]
            if isinstance(initialWorkdir["listing"], (str, Text)):
                ls = builder.do_eval(initialWorkdir["listing"])
            else:
                for t in initialWorkdir["listing"]:
                    if "entry" in t:
                        et = {u"entry": builder.do_eval(t["entry"])}
                        if "entryname" in t:
                            et["entryname"] = builder.do_eval(t["entryname"])
                        else:
                            et["entryname"] = None
                        et["writable"] = t.get("writable", False)
                        ls.append(et)
                    else:
                        ls.append(builder.do_eval(t))
            for i, t in enumerate(ls):
                if "entry" in t:
                    if isinstance(t["entry"], string_types):
                        ls[i] = {
                            "class": "File",
                            "basename": t["entryname"],
                            "contents": t["entry"],
                            "writable": t.get("writable")
                        }
                    else:
                        if t.get("entryname") or t.get("writable"):
                            t = copy.deepcopy(t)
                            if t.get("entryname"):
                                t["entry"]["basename"] = t["entryname"]
                            t["entry"]["writable"] = t.get("writable")
                        ls[i] = t["entry"]
            j.generatefiles[u"listing"] = ls
            for l in ls:
                self.updatePathmap(builder.outdir, builder.pathmapper, l)
            visit_class([builder.files, builder.bindings],
                        ("File", "Directory"), _check_adjust)

        if debug:
            _logger.debug(
                u"[job %s] path mappings is %s", j.name,
                json.dumps(
                    {
                        p: builder.pathmapper.mapper(p)
                        for p in builder.pathmapper.files()
                    },
                    indent=4))

        if self.tool.get("stdin"):
            with SourceLine(self.tool, "stdin", validate.ValidationException,
                            debug):
                j.stdin = builder.do_eval(self.tool["stdin"])
                reffiles.append({"class": "File", "path": j.stdin})

        if self.tool.get("stderr"):
            with SourceLine(self.tool, "stderr", validate.ValidationException,
                            debug):
                j.stderr = builder.do_eval(self.tool["stderr"])
                if os.path.isabs(j.stderr) or ".." in j.stderr:
                    raise validate.ValidationException(
                        "stderr must be a relative path, got '%s'" % j.stderr)

        if self.tool.get("stdout"):
            with SourceLine(self.tool, "stdout", validate.ValidationException,
                            debug):
                j.stdout = builder.do_eval(self.tool["stdout"])
                if os.path.isabs(j.stdout) or ".." in j.stdout or not j.stdout:
                    raise validate.ValidationException(
                        "stdout must be a relative path, got '%s'" % j.stdout)

        if debug:
            _logger.debug(u"[job %s] command line bindings is %s", j.name,
                          json.dumps(builder.bindings, indent=4))

        dockerReq = self.get_requirement("DockerRequirement")[0]
        if dockerReq and kwargs.get("use_container"):
            out_prefix = kwargs.get("tmp_outdir_prefix")
            j.outdir = kwargs.get("outdir") or tempfile.mkdtemp(
                prefix=out_prefix)
            tmpdir_prefix = kwargs.get('tmpdir_prefix')
            j.tmpdir = kwargs.get("tmpdir") or tempfile.mkdtemp(
                prefix=tmpdir_prefix)
            j.stagedir = tempfile.mkdtemp(prefix=tmpdir_prefix)
        else:
            j.outdir = builder.outdir
            j.tmpdir = builder.tmpdir
            j.stagedir = builder.stagedir

        inplaceUpdateReq = self.get_requirement(
            "http://commonwl.org/cwltool#InplaceUpdateRequirement")[0]

        if inplaceUpdateReq:
            j.inplace_update = inplaceUpdateReq["inplaceUpdate"]
        normalizeFilesDirs(j.generatefiles)

        readers = {}
        muts = set()

        if builder.mutation_manager:

            def register_mut(f):
                muts.add(f["location"])
                builder.mutation_manager.register_mutation(j.name, f)

            def register_reader(f):
                if f["location"] not in muts:
                    builder.mutation_manager.register_reader(j.name, f)
                    readers[f["location"]] = f

            for li in j.generatefiles["listing"]:
                li = cast(Dict[Text, Any], li)
                if li.get("writable") and j.inplace_update:
                    adjustFileObjs(li, register_mut)
                    adjustDirObjs(li, register_mut)
                else:
                    adjustFileObjs(li, register_reader)
                    adjustDirObjs(li, register_reader)

            adjustFileObjs(builder.files, register_reader)
            adjustFileObjs(builder.bindings, register_reader)
            adjustDirObjs(builder.files, register_reader)
            adjustDirObjs(builder.bindings, register_reader)

        j.environment = {}
        evr = self.get_requirement("EnvVarRequirement")[0]
        if evr:
            for t in evr["envDef"]:
                j.environment[t["envName"]] = builder.do_eval(t["envValue"])

        shellcmd = self.get_requirement("ShellCommandRequirement")[0]
        if shellcmd:
            cmd = []  # type: List[Text]
            for b in builder.bindings:
                arg = builder.generate_arg(b)
                if b.get("shellQuote", True):
                    arg = [shellescape.quote(a) for a in aslist(arg)]
                cmd.extend(aslist(arg))
            j.command_line = ["/bin/sh", "-c", " ".join(cmd)]
        else:
            j.command_line = flatten(
                list(map(builder.generate_arg, builder.bindings)))

        j.pathmapper = builder.pathmapper
        j.collect_outputs = partial(self.collect_output_ports,
                                    self.tool["outputs"],
                                    builder,
                                    compute_checksum=kwargs.get(
                                        "compute_checksum", True),
                                    jobname=jobname,
                                    readers=readers)
        j.output_callback = output_callbacks

        yield j
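
The cache key above is just a deterministic digest: the command line, selected requirements, the stdout target, and a size-plus-checksum (or size-plus-mtime) signature per input file are serialized as canonical JSON and hashed with MD5. A reduced sketch of the same idea (cache_key and file_sigs are illustrative names):

import hashlib
import json

def cache_key(cmdline, file_sigs):
    # file_sigs maps each input path to [size, checksum] or
    # [size, mtime_ms], matching the keydict built above.
    keydict = {"cmdline": cmdline}
    keydict.update(file_sigs)
    keydictstr = json.dumps(keydict, separators=(',', ':'), sort_keys=True)
    return hashlib.md5(keydictstr.encode('utf-8')).hexdigest()

print(cache_key(["echo", "hello"], {"/data/in.txt": [42, "sha1$deadbeef"]}))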
Esempio n. 23
0
    def __init__(self,
                 toolpath_object,      # type: MutableMapping[Text, Any]
                 loadingContext        # type: LoadingContext
                ):  # type: (...) -> None
        self.metadata = getdefault(loadingContext.metadata, {})  # type: Dict[Text,Any]
        self.provenance_object = None  # type: Optional[CreateProvProfile]
        self.parent_wf = None          # type: Optional[CreateProvProfile]
        global SCHEMA_FILE, SCHEMA_DIR, SCHEMA_ANY  # pylint: disable=global-statement
        if SCHEMA_FILE is None or SCHEMA_ANY is None or SCHEMA_DIR is None:
            get_schema("v1.0")
            SCHEMA_ANY = cast(Dict[Text, Any],
                              SCHEMA_CACHE["v1.0"][3].idx["https://w3id.org/cwl/salad#Any"])
            SCHEMA_FILE = cast(Dict[Text, Any],
                               SCHEMA_CACHE["v1.0"][3].idx["https://w3id.org/cwl/cwl#File"])
            SCHEMA_DIR = cast(Dict[Text, Any],
                              SCHEMA_CACHE["v1.0"][3].idx["https://w3id.org/cwl/cwl#Directory"])

        names = schema.make_avro_schema([SCHEMA_FILE, SCHEMA_DIR, SCHEMA_ANY],
                                        Loader({}))[0]
        if isinstance(names, schema.SchemaParseException):
            raise names
        else:
            self.names = names
        self.tool = toolpath_object
        self.requirements = copy.deepcopy(getdefault(loadingContext.requirements, []))
        self.requirements.extend(self.tool.get("requirements", []))
        self.requirements.extend(get_overrides(getdefault(loadingContext.overrides_list, []),
                                               self.tool["id"]).get("requirements", []))
        self.hints = copy.deepcopy(getdefault(loadingContext.hints, []))
        self.hints.extend(self.tool.get("hints", []))
        # Versions of requirements and hints which aren't mutated.
        self.original_requirements = copy.deepcopy(self.requirements)
        self.original_hints = copy.deepcopy(self.hints)
        self.doc_loader = loadingContext.loader
        self.doc_schema = loadingContext.avsc_names

        self.formatgraph = None  # type: Optional[Graph]
        if self.doc_loader is not None:
            self.formatgraph = self.doc_loader.graph

        checkRequirements(self.tool, supportedProcessRequirements)
        self.validate_hints(loadingContext.avsc_names, self.tool.get("hints", []),
                            strict=getdefault(loadingContext.strict, False))

        self.schemaDefs = {}  # type: Dict[Text,Dict[Text, Any]]

        sd, _ = self.get_requirement("SchemaDefRequirement")

        if sd is not None:
            sdtypes = sd["types"]
            av = schema.make_valid_avro(sdtypes, {t["name"]: t for t in avroize_type(sdtypes)}, set())
            for i in av:
                self.schemaDefs[i["name"]] = i  # type: ignore
            schema.AvroSchemaFromJSONData(av, self.names)  # type: ignore

        # Build record schema from inputs
        self.inputs_record_schema = {
            "name": "input_record_schema", "type": "record",
            "fields": []}  # type: Dict[Text, Any]
        self.outputs_record_schema = {
            "name": "outputs_record_schema", "type": "record",
            "fields": []}  # type: Dict[Text, Any]

        for key in ("inputs", "outputs"):
            for i in self.tool[key]:
                c = copy.deepcopy(i)
                c["name"] = shortname(c["id"])
                del c["id"]

                if "type" not in c:
                    raise validate.ValidationException(
                        u"Missing 'type' in parameter '{}'".format(c["name"]))

                if "default" in c and "null" not in aslist(c["type"]):
                    nullable = ["null"]
                    nullable.extend(aslist(c["type"]))
                    c["type"] = nullable
                else:
                    c["type"] = c["type"]
                c["type"] = avroize_type(c["type"], c["name"])
                if key == "inputs":
                    self.inputs_record_schema["fields"].append(c)
                elif key == "outputs":
                    self.outputs_record_schema["fields"].append(c)

        with SourceLine(toolpath_object, "inputs", validate.ValidationException):
            self.inputs_record_schema = cast(
                Dict[Text, Any], schema.make_valid_avro(
                    self.inputs_record_schema, {}, set()))
            schema.AvroSchemaFromJSONData(self.inputs_record_schema, self.names)
        with SourceLine(toolpath_object, "outputs", validate.ValidationException):
            self.outputs_record_schema = cast(
                Dict[Text, Any],
                schema.make_valid_avro(self.outputs_record_schema, {}, set()))
            schema.AvroSchemaFromJSONData(self.outputs_record_schema, self.names)

        if toolpath_object.get("class") is not None \
                and not getdefault(loadingContext.disable_js_validation, False):
            if loadingContext.js_hint_options_file is not None:
                try:
                    with open(loadingContext.js_hint_options_file) as options_file:
                        validate_js_options = json.load(options_file)
                except (OSError, ValueError) as err:
                    _logger.error(
                        "Failed to read options file %s",
                        loadingContext.js_hint_options_file)
                    raise err
            else:
                validate_js_options = None
            if self.doc_schema is not None:
                validate_js_expressions(
                    cast(CommentedMap, toolpath_object),
                    self.doc_schema.names[toolpath_object["class"]],
                    validate_js_options)

        dockerReq, is_req = self.get_requirement("DockerRequirement")

        if dockerReq is not None and "dockerOutputDirectory" in dockerReq\
                and is_req is not None and not is_req:
            _logger.warning(SourceLine(
                item=dockerReq, raise_type=Text).makeError(
                    "When 'dockerOutputDirectory' is declared, DockerRequirement "
                    "should go in the 'requirements' section, not 'hints'."))

        if dockerReq is not None and is_req is not None\
                and dockerReq.get("dockerOutputDirectory") == "/var/spool/cwl":
            if is_req:
                # In this specific case, it is legal to have /var/spool/cwl, so skip the check.
                pass
            else:
                # Declared only as a hint, so the check still applies.
                var_spool_cwl_detector(self.tool)
        else:
            var_spool_cwl_detector(self.tool)
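
One subtle step above is that an input with a 'default' becomes implicitly optional: 'null' is prepended to its type union before the Avro record schema is built. A small standalone sketch of that rule (make_nullable is an illustrative name):

def make_nullable(type_, has_default):
    # An input with a default is implicitly optional, so "null" is
    # prepended to its type union (unless it is already nullable).
    types = type_ if isinstance(type_, list) else [type_]
    if has_default and "null" not in types:
        return ["null"] + types
    return type_

print(make_nullable("string", has_default=True))   # ['null', 'string']
print(make_nullable(["int"], has_default=False))   # ['int']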
Esempio n. 24
0
    def collect_output(self,
                       schema,
                       builder,
                       outdir,
                       fs_access,
                       compute_checksum=True):
        # type: (Dict[Text, Any], Builder, Text, StdFsAccess, bool) -> Union[Dict[Text, Any], List[Union[Dict[Text, Any], Text]]]
        r = []  # type: List[Any]
        debug = _logger.isEnabledFor(logging.DEBUG)
        if "outputBinding" in schema:
            binding = schema["outputBinding"]
            globpatterns = []  # type: List[Text]

            revmap = partial(revmap_file, builder, outdir)

            if "glob" in binding:
                with SourceLine(binding, "glob", WorkflowException, debug):
                    for gb in aslist(binding["glob"]):
                        gb = builder.do_eval(gb)
                        if gb:
                            globpatterns.extend(aslist(gb))

                    for gb in globpatterns:
                        if gb.startswith(outdir):
                            gb = gb[len(outdir) + 1:]
                        elif gb == ".":
                            gb = outdir
                        elif gb.startswith("/"):
                            raise WorkflowException(
                                "glob patterns must not start with '/'")
                        try:
                            prefix = fs_access.glob(outdir)
                            r.extend([{
                                "location": g,
                                "path": fs_access.join(
                                    builder.outdir, g[len(prefix[0]) + 1:]),
                                "basename": os.path.basename(g),
                                "nameroot": os.path.splitext(
                                    os.path.basename(g))[0],
                                "nameext": os.path.splitext(
                                    os.path.basename(g))[1],
                                "class": "File" if fs_access.isfile(g)
                                         else "Directory"
                            } for g in fs_access.glob(
                                fs_access.join(outdir, gb))])
                        except (OSError, IOError) as e:
                            _logger.warning(Text(e))
                        except Exception:
                            _logger.error("Unexpected error from fs_access",
                                          exc_info=True)
                            raise

                for files in r:
                    rfile = files.copy()
                    revmap(rfile)
                    if files["class"] == "Directory":
                        ll = builder.loadListing or (
                            binding and binding.get("loadListing"))
                        if ll and ll != "no_listing":
                            get_listing(fs_access, files,
                                        (ll == "deep_listing"))
                    else:
                        with fs_access.open(rfile["location"], "rb") as f:
                            contents = b""
                            if binding.get("loadContents") or compute_checksum:
                                contents = f.read(CONTENT_LIMIT)
                            if binding.get("loadContents"):
                                files["contents"] = contents
                            if compute_checksum:
                                checksum = hashlib.sha1()
                                while contents != b"":
                                    checksum.update(contents)
                                    contents = f.read(1024 * 1024)
                                files["checksum"] = \
                                    "sha1$%s" % checksum.hexdigest()
                            f.seek(0, 2)
                            filesize = f.tell()
                        files["size"] = filesize
                        if "format" in schema:
                            files["format"] = builder.do_eval(schema["format"],
                                                              context=files)

            optional = False
            single = False
            if isinstance(schema["type"], list):
                if "null" in schema["type"]:
                    optional = True
                if "File" in schema["type"] or "Directory" in schema["type"]:
                    single = True
            elif schema["type"] == "File" or schema["type"] == "Directory":
                single = True

            if "outputEval" in binding:
                with SourceLine(binding, "outputEval", WorkflowException,
                                debug):
                    r = builder.do_eval(binding["outputEval"], context=r)

            if single:
                if not r and not optional:
                    with SourceLine(binding, "glob", WorkflowException, debug):
                        raise WorkflowException(
                            "Did not find output file with glob pattern: '{}'".
                            format(globpatterns))
                elif not r and optional:
                    pass
                elif isinstance(r, list):
                    if len(r) > 1:
                        raise WorkflowException(
                            "Multiple matches for output item that is a single file."
                        )
                    else:
                        r = r[0]

            if "secondaryFiles" in schema:
                with SourceLine(schema, "secondaryFiles", WorkflowException,
                                debug):
                    for primary in aslist(r):
                        if isinstance(primary, dict):
                            primary.setdefault("secondaryFiles", [])
                            pathprefix = primary["path"][
                                :primary["path"].rindex("/") + 1]
                            for sf in aslist(schema["secondaryFiles"]):
                                if isinstance(
                                        sf, dict) or "$(" in sf or "${" in sf:
                                    sfpath = builder.do_eval(sf,
                                                             context=primary)
                                    subst = False
                                else:
                                    sfpath = sf
                                    subst = True
                                for sfitem in aslist(sfpath):
                                    if isinstance(sfitem, string_types):
                                        if subst:
                                            sfitem = {
                                                "path":
                                                substitute(
                                                    primary["path"], sfitem)
                                            }
                                        else:
                                            sfitem = {
                                                "path": pathprefix + sfitem
                                            }
                                    if "path" in sfitem and "location" not in sfitem:
                                        revmap(sfitem)
                                    if fs_access.isfile(sfitem["location"]):
                                        sfitem["class"] = "File"
                                        primary["secondaryFiles"].append(
                                            sfitem)
                                    elif fs_access.isdir(sfitem["location"]):
                                        sfitem["class"] = "Directory"
                                        primary["secondaryFiles"].append(
                                            sfitem)

            # Ensure files point to local references outside of the run environment
            adjustFileObjs(
                r,
                cast(  # known bug in mypy
                    # https://github.com/python/mypy/issues/797
                    Callable[[Any], Any],
                    revmap))

            if not r and optional:
                r = None

        if (not r and isinstance(schema["type"], dict)
                and schema["type"]["type"] == "record"):
            out = {}
            for f in schema["type"]["fields"]:
                out[shortname(
                    f["name"])] = self.collect_output(  # type: ignore
                        f,
                        builder,
                        outdir,
                        fs_access,
                        compute_checksum=compute_checksum)
            return out
        return r
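
The secondaryFiles expansion relies on a suffix-substitution rule: a plain pattern is appended to the primary path, and each leading '^' strips one extension first. Roughly, the substitute helper behaves like this (a simplified sketch, not the exact cwltool implementation):

def substitute(value, replace):
    # Each leading "^" strips one extension from the primary path
    # before the remaining suffix is appended.
    if replace.startswith("^"):
        return substitute(value[:value.rindex(".")], replace[1:])
    return value + replace

print(substitute("reads.bam", ".bai"))   # reads.bam.bai
print(substitute("reads.bam", "^.bai"))  # reads.bai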
Esempio n. 25
0
def static_checker(workflow_inputs, workflow_outputs, step_inputs,
                   step_outputs):
    # type: (List[Dict[Text, Any]], List[Dict[Text, Any]], List[Dict[Text, Any]], List[Dict[Text, Any]]) -> None
    """Check if all source and sink types of a workflow are compatible before run time.
    """

    # source parameters: workflow_inputs and step_outputs
    # sink parameters: step_inputs and workflow_outputs

    # make a dictionary of source parameters, indexed by the "id" field
    src_parms = workflow_inputs + step_outputs
    src_dict = {}
    for parm in src_parms:
        src_dict[parm["id"]] = parm

    step_inputs_val = check_all_types(src_dict, step_inputs, "source")
    workflow_outputs_val = check_all_types(src_dict, workflow_outputs,
                                           "outputSource")

    warnings = step_inputs_val["warning"] + workflow_outputs_val["warning"]
    exceptions = step_inputs_val["exception"] + workflow_outputs_val[
        "exception"]

    warning_msgs = []
    exception_msgs = []
    for warning in warnings:
        src = warning.src
        sink = warning.sink
        linkMerge = warning.linkMerge
        msg = SourceLine(src, "type").makeError(
            "Source '%s' of type %s is partially incompatible"
            % (shortname(src["id"]), json.dumps(src["type"]))) + "\n" + \
            SourceLine(sink, "type").makeError(
            "  with sink '%s' of type %s"
            % (shortname(sink["id"]), json.dumps(sink["type"])))
        if linkMerge:
            msg += "\n" + SourceLine(sink).makeError(
                "  source has linkMerge method %s" % linkMerge)
        warning_msgs.append(msg)
    for exception in exceptions:
        src = exception.src
        sink = exception.sink
        linkMerge = exception.linkMerge
        msg = SourceLine(src, "type").makeError(
            "Source '%s' of type %s is incompatible"
            % (shortname(src["id"]), json.dumps(src["type"]))) + "\n" + \
            SourceLine(sink, "type").makeError(
            "  with sink '%s' of type %s"
            % (shortname(sink["id"]), json.dumps(sink["type"])))
        if linkMerge:
            msg += "\n" + SourceLine(sink).makeError(
                "  source has linkMerge method %s" % linkMerge)
        exception_msgs.append(msg)

    for sink in step_inputs:
        if (sink["type"] != 'null' and 'null' not in sink["type"]
                and "source" not in sink and "default" not in sink
                and "valueFrom" not in sink):
            msg = SourceLine(sink).makeError(
                "Required parameter '%s' does not have source, default, or valueFrom expression"
                % shortname(sink["id"]))
            exception_msgs.append(msg)

    all_warning_msg = "\n".join(warning_msgs)
    all_exception_msg = "\n".join(exception_msgs)

    if warnings:
        _logger.warning("Workflow checker warning:\n%s" % all_warning_msg)
    if exceptions:
        raise validate.ValidationException(all_exception_msg)
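
The checker's core is a dictionary of source parameters keyed by 'id', against which every sink's type is tested. A deliberately simplified sketch of such a compatibility test (can_connect is illustrative; the real check_all_types also handles arrays, records, and linkMerge):

def can_connect(src_type, sink_type):
    # Unions are compatible when any non-null member of the source
    # matches a member of the sink.
    src = src_type if isinstance(src_type, list) else [src_type]
    sink = sink_type if isinstance(sink_type, list) else [sink_type]
    return any(s == t for s in src for t in sink if s != "null")

print(can_connect("File", ["null", "File"]))  # True
print(can_connect("int", "File"))             # False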
Esempio n. 26
0
    def generate_arg(self, binding):  # type: (Dict[str, Any]) -> List[str]
        value = binding.get("datum")
        if "valueFrom" in binding:
            with SourceLine(
                    binding,
                    "valueFrom",
                    WorkflowException,
                    _logger.isEnabledFor(logging.DEBUG),
            ):
                value = self.do_eval(binding["valueFrom"], context=value)

        prefix = binding.get("prefix")  # type: Optional[str]
        sep = binding.get("separate", True)
        if prefix is None and not sep:
            with SourceLine(
                    binding,
                    "separate",
                    WorkflowException,
                    _logger.isEnabledFor(logging.DEBUG),
            ):
                raise WorkflowException(
                    "'separate' option cannot be specified without 'prefix'")

        argl = []  # type: MutableSequence[Any]
        if isinstance(value, MutableSequence):
            if binding.get("itemSeparator") and value:
                argl = [
                    binding["itemSeparator"].join(
                        [self.tostr(v) for v in value])
                ]
            elif binding.get("valueFrom"):
                value = [self.tostr(v) for v in value]
                return ([prefix] if prefix else []) + value
            elif prefix and value:
                return [prefix]
            else:
                return []
        elif isinstance(value, MutableMapping) and value.get("class") in (
                "File",
                "Directory",
        ):
            argl = [value]
        elif isinstance(value, MutableMapping):
            return [prefix] if prefix else []
        elif value is True and prefix:
            return [prefix]
        elif value is False or value is None or (value is True and not prefix):
            return []
        else:
            argl = [value]

        args = []
        for j in argl:
            if sep:
                args.extend([prefix, self.tostr(j)])
            else:
                args.append(
                    self.tostr(j) if prefix is None else prefix +
                    self.tostr(j))

        return [a for a in args if a is not None]
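
The binding rules are easier to see with concrete inputs: 'prefix' is emitted as its own token unless 'separate' is false, and 'itemSeparator' joins list items into a single token. A simplified, expression-free sketch (simple_generate_arg is an illustrative name):

def simple_generate_arg(binding):
    # Handles only prefix / separate / itemSeparator for plain values.
    value = binding.get("datum")
    prefix = binding.get("prefix")
    sep = binding.get("separate", True)
    if isinstance(value, list) and binding.get("itemSeparator"):
        value = binding["itemSeparator"].join(str(v) for v in value)
    if sep:
        return ([prefix] if prefix else []) + [str(value)]
    return [prefix + str(value)]

print(simple_generate_arg({"prefix": "-t", "datum": 8}))
# ['-t', '8']
print(simple_generate_arg({"prefix": "-I", "separate": False, "datum": "x"}))
# ['-Ix']
print(simple_generate_arg({"prefix": "-f", "datum": ["a", "b"],
                           "itemSeparator": ","}))
# ['-f', 'a,b']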
Esempio n. 27
0
    def __init__(self, toolpath_object, pos, **kwargs):
        # type: (Dict[Text, Any], int, **Any) -> None
        if "id" in toolpath_object:
            self.id = toolpath_object["id"]
        else:
            self.id = "#step" + Text(pos)

        kwargs["requirements"] = (
            kwargs.get("requirements", []) +
            toolpath_object.get("requirements", []) +
            get_overrides(kwargs.get("overrides", []), self.id))
        kwargs["hints"] = kwargs.get("hints", []) + toolpath_object.get(
            "hints", [])

        try:
            if isinstance(toolpath_object["run"], dict):
                self.embedded_tool = kwargs.get("makeTool")(
                    toolpath_object["run"], **kwargs)
            else:
                self.embedded_tool = load_tool(
                    toolpath_object["run"],
                    kwargs.get("makeTool"),
                    kwargs,
                    enable_dev=kwargs.get("enable_dev"),
                    strict=kwargs.get("strict"),
                    fetcher_constructor=kwargs.get("fetcher_constructor"),
                    resolver=kwargs.get("resolver"),
                    overrides=kwargs.get("overrides"))
        except validate.ValidationException as v:
            raise WorkflowException(
                u"Tool definition %s failed validation:\n%s" %
                (toolpath_object["run"], validate.indent(str(v))))

        validation_errors = []
        self.tool = toolpath_object = copy.deepcopy(toolpath_object)
        bound = set()
        for stepfield, toolfield in (("in", "inputs"), ("out", "outputs")):
            toolpath_object[toolfield] = []
            for n, step_entry in enumerate(toolpath_object[stepfield]):
                if isinstance(step_entry, six.string_types):
                    param = CommentedMap()  # type: CommentedMap
                    inputid = step_entry
                else:
                    param = CommentedMap(six.iteritems(step_entry))
                    inputid = step_entry["id"]

                shortinputid = shortname(inputid)
                found = False
                for tool_entry in self.embedded_tool.tool[toolfield]:
                    frag = shortname(tool_entry["id"])
                    if frag == shortinputid:
                        param.update(tool_entry)
                        found = True
                        bound.add(frag)
                        break
                if not found:
                    if stepfield == "in":
                        param["type"] = "Any"
                    else:
                        step_entry_name = (
                            shortname(step_entry)
                            if isinstance(step_entry, six.string_types)
                            else shortname(step_entry["id"]))
                        validation_errors.append(
                            SourceLine(self.tool["out"], n).makeError(
                                "Workflow step output '%s' does not correspond to"
                                % step_entry_name) + "\n" +
                            SourceLine(self.embedded_tool.tool, "outputs").
                            makeError("  tool output (expected '%s')" %
                                      ("', '".join([
                                          shortname(tool_entry["id"])
                                          for tool_entry in
                                          self.embedded_tool.tool[toolfield]
                                      ]))))
                param["id"] = inputid
                param.lc.line = toolpath_object[stepfield].lc.data[n][0]
                param.lc.col = toolpath_object[stepfield].lc.data[n][1]
                param.lc.filename = toolpath_object[stepfield].lc.filename
                toolpath_object[toolfield].append(param)

        missing = []
        for i, tool_entry in enumerate(self.embedded_tool.tool["inputs"]):
            if shortname(tool_entry["id"]) not in bound:
                if "null" not in tool_entry[
                        "type"] and "default" not in tool_entry:
                    missing.append(shortname(tool_entry["id"]))

        if missing:
            validation_errors.append(
                SourceLine(self.tool, "in").makeError(
                    "Step is missing required parameter%s '%s'" %
                    ("s" if len(missing) > 1 else "", "', '".join(missing))))

        if validation_errors:
            raise validate.ValidationException("\n".join(validation_errors))

        super(WorkflowStep, self).__init__(toolpath_object, **kwargs)

        if self.embedded_tool.tool["class"] == "Workflow":
            (feature,
             _) = self.get_requirement("SubworkflowFeatureRequirement")
            if not feature:
                raise WorkflowException(
                    "Workflow contains embedded workflow but SubworkflowFeatureRequirement not in requirements"
                )

        if "scatter" in self.tool:
            (feature, _) = self.get_requirement("ScatterFeatureRequirement")
            if not feature:
                raise WorkflowException(
                    "Workflow contains scatter but ScatterFeatureRequirement not in requirements"
                )

            inputparms = copy.deepcopy(self.tool["inputs"])
            outputparms = copy.deepcopy(self.tool["outputs"])
            scatter = aslist(self.tool["scatter"])

            method = self.tool.get("scatterMethod")
            if method is None and len(scatter) != 1:
                raise validate.ValidationException(
                    "Must specify scatterMethod when scattering over multiple inputs"
                )

            inp_map = {i["id"]: i for i in inputparms}
            for s in scatter:
                if s not in inp_map:
                    raise validate.ValidationException(
                        SourceLine(self.tool, "scatter").makeError(
                            u"Scatter parameter '%s' does not correspond to an input parameter of this "
                            u"step, expecting '%s'" %
                            (shortname(s), "', '".join(
                                shortname(k) for k in inp_map.keys()))))

                inp_map[s]["type"] = {
                    "type": "array",
                    "items": inp_map[s]["type"]
                }

            if self.tool.get("scatterMethod") == "nested_crossproduct":
                nesting = len(scatter)
            else:
                nesting = 1

            for r in range(0, nesting):
                for op in outputparms:
                    op["type"] = {"type": "array", "items": op["type"]}
            self.tool["inputs"] = inputparms
            self.tool["outputs"] = outputparms
Esempio n. 28
0
def arv_docker_get_image(api_client, dockerRequirement, pull_image,
                         project_uuid, force_pull, tmp_outdir_prefix):
    """Check if a Docker image is available in Keep, if not, upload it using arv-keepdocker."""

    if "http://arvados.org/cwl#dockerCollectionPDH" in dockerRequirement:
        return dockerRequirement["http://arvados.org/cwl#dockerCollectionPDH"]

    if "dockerImageId" not in dockerRequirement and "dockerPull" in dockerRequirement:
        dockerRequirement = copy.deepcopy(dockerRequirement)
        dockerRequirement["dockerImageId"] = dockerRequirement["dockerPull"]
        if hasattr(dockerRequirement, 'lc'):
            dockerRequirement.lc.data[
                "dockerImageId"] = dockerRequirement.lc.data["dockerPull"]

    global cached_lookups
    global cached_lookups_lock
    with cached_lookups_lock:
        if dockerRequirement["dockerImageId"] in cached_lookups:
            return cached_lookups[dockerRequirement["dockerImageId"]]

    with SourceLine(dockerRequirement, "dockerImageId", WorkflowException,
                    logger.isEnabledFor(logging.DEBUG)):
        sp = dockerRequirement["dockerImageId"].split(":")
        image_name = sp[0]
        image_tag = sp[1] if len(sp) > 1 else "latest"

        images = arvados.commands.keepdocker.list_images_in_arv(
            api_client, 3, image_name=image_name, image_tag=image_tag)

        if not images:
            # Fetch Docker image if necessary.
            try:
                result = cwltool.docker.DockerCommandLineJob.get_image(
                    dockerRequirement, pull_image, force_pull,
                    tmp_outdir_prefix)
                if not result:
                    raise WorkflowException("Docker image '%s' not available" %
                                            dockerRequirement["dockerImageId"])
            except OSError as e:
                raise WorkflowException(
                    "While trying to get Docker image '%s', failed to execute 'docker': %s"
                    % (dockerRequirement["dockerImageId"], e))

            # Upload image to Arvados
            args = []
            if project_uuid:
                args.append("--project-uuid=" + project_uuid)
            args.append(image_name)
            args.append(image_tag)
            logger.info("Uploading Docker image %s:%s", image_name, image_tag)
            try:
                arvados.commands.put.api_client = api_client
                arvados.commands.keepdocker.main(args,
                                                 stdout=sys.stderr,
                                                 install_sig_handlers=False,
                                                 api=api_client)
            except SystemExit as e:
                # If e.code is None or zero, then keepdocker exited normally and we can continue
                if e.code:
                    raise WorkflowException("keepdocker exited with code %s" %
                                            e.code)

            images = arvados.commands.keepdocker.list_images_in_arv(
                api_client, 3, image_name=image_name, image_tag=image_tag)

        if not images:
            raise WorkflowException("Could not find Docker image %s:%s" %
                                    (image_name, image_tag))

        pdh = api_client.collections().get(
            uuid=images[0][0]).execute()["portable_data_hash"]

        with cached_lookups_lock:
            cached_lookups[dockerRequirement["dockerImageId"]] = pdh

    return pdh
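
Two details of this function are worth isolating: the image id splits into name and tag with "latest" as the default, and the PDH cache is consulted and updated under a lock so concurrent jobs do not repeat the Keep lookup. A sketch under those assumptions (cached_pdh and resolve are illustrative names):

import threading

cached_lookups = {}
cached_lookups_lock = threading.Lock()

def split_image_id(image_id):
    # "debian:10" -> ("debian", "10"); a bare name defaults to "latest".
    # (A registry host with a port would need smarter parsing.)
    sp = image_id.split(":")
    return sp[0], sp[1] if len(sp) > 1 else "latest"

def cached_pdh(image_id, resolve):
    # Check the cache, fall back to the slow lookup, then record it;
    # both cache touches happen under the lock.
    with cached_lookups_lock:
        if image_id in cached_lookups:
            return cached_lookups[image_id]
    pdh = resolve(image_id)  # e.g. the Keep/keepdocker lookup above
    with cached_lookups_lock:
        cached_lookups[image_id] = pdh
    return pdh

print(split_image_id("arvados/jobs"))  # ('arvados/jobs', 'latest')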
Esempio n. 29
0
    def check_for_abstract_op(tool: CWLObjectType) -> None:
        if tool["class"] == "Operation":
            raise SourceLine(
                tool, "class", WorkflowException, runtime_context.debug
            ).makeError("Workflow has unrunnable abstract Operation")
Esempio n. 30
0
    def run(self, runtimeContext):
        script_parameters = {"command": self.command_line}
        runtime_constraints = {}

        with Perf(metrics, "generatefiles %s" % self.name):
            if self.generatefiles["listing"]:
                vwd = arvados.collection.Collection(
                    api_client=self.arvrunner.api,
                    keep_client=self.arvrunner.keep_client,
                    num_retries=self.arvrunner.num_retries)
                script_parameters["task.vwd"] = {}
                generatemapper = VwdPathMapper(self.generatefiles["listing"],
                                               "",
                                               "",
                                               separateDirs=False)

                with Perf(metrics, "createfiles %s" % self.name):
                    for f, p in generatemapper.items():
                        if p.type == "CreateFile":
                            with vwd.open(p.target, "w") as n:
                                n.write(p.resolved.encode("utf-8"))

                if vwd:
                    with Perf(metrics,
                              "generatefiles.save_new %s" % self.name):
                        info = get_intermediate_collection_info(
                            self.name, None,
                            runtimeContext.intermediate_output_ttl)
                        vwd.save_new(name=info["name"],
                                     owner_uuid=self.arvrunner.project_uuid,
                                     ensure_unique_name=True,
                                     trash_at=info["trash_at"],
                                     properties=info["properties"])

                for f, p in generatemapper.items():
                    if p.type == "File":
                        script_parameters["task.vwd"][p.target] = p.resolved
                    if p.type == "CreateFile":
                        script_parameters["task.vwd"][
                            p.target] = "$(task.keep)/%s/%s" % (
                                vwd.portable_data_hash(), p.target)

        script_parameters["task.env"] = {
            "TMPDIR": self.tmpdir,
            "HOME": self.outdir
        }
        if self.environment:
            script_parameters["task.env"].update(self.environment)

        if self.stdin:
            script_parameters["task.stdin"] = self.stdin

        if self.stdout:
            script_parameters["task.stdout"] = self.stdout

        if self.stderr:
            script_parameters["task.stderr"] = self.stderr

        if self.successCodes:
            script_parameters["task.successCodes"] = self.successCodes
        if self.temporaryFailCodes:
            script_parameters["task.temporaryFailCodes"] = self.temporaryFailCodes
        if self.permanentFailCodes:
            script_parameters["task.permanentFailCodes"] = self.permanentFailCodes

        with Perf(metrics, "arv_docker_get_image %s" % self.name):
            (docker_req,
             docker_is_req) = self.get_requirement("DockerRequirement")
            if docker_req and runtimeContext.use_container is not False:
                if docker_req.get("dockerOutputDirectory"):
                    raise SourceLine(
                        docker_req, "dockerOutputDirectory",
                        UnsupportedRequirement
                    ).makeError(
                        "Option 'dockerOutputDirectory' of DockerRequirement not supported."
                    )
                runtime_constraints["docker_image"] = arv_docker_get_image(
                    self.arvrunner.api, docker_req, runtimeContext.pull_image,
                    self.arvrunner.project_uuid)
            else:
                runtime_constraints["docker_image"] = "arvados/jobs"

        resources = self.builder.resources
        if resources is not None:
            runtime_constraints["min_cores_per_node"] = resources.get(
                "cores", 1)
            runtime_constraints["min_ram_mb_per_node"] = resources.get("ram")
            runtime_constraints["min_scratch_mb_per_node"] = resources.get(
                "tmpdirSize", 0) + resources.get("outdirSize", 0)

        runtime_req, _ = self.get_requirement(
            "http://arvados.org/cwl#RuntimeConstraints")
        if runtime_req:
            if "keep_cache" in runtime_req:
                runtime_constraints["keep_cache_mb_per_task"] = runtime_req[
                    "keep_cache"]
                runtime_constraints["min_ram_mb_per_node"] += runtime_req[
                    "keep_cache"]
            if "outputDirType" in runtime_req:
                if runtime_req["outputDirType"] == "local_output_dir":
                    script_parameters["task.keepTmpOutput"] = False
                elif runtime_req["outputDirType"] == "keep_output_dir":
                    script_parameters["task.keepTmpOutput"] = True

        filters = [["repository", "=", "arvados"],
                   ["script", "=", "crunchrunner"],
                   ["script_version", "in git", crunchrunner_git_commit]]
        if not self.arvrunner.ignore_docker_for_reuse:
            filters.append([
                "docker_image_locator", "in docker",
                runtime_constraints["docker_image"]
            ])

        enable_reuse = runtimeContext.enable_reuse
        if enable_reuse:
            reuse_req, _ = self.get_requirement(
                "http://arvados.org/cwl#ReuseRequirement")
            if reuse_req:
                enable_reuse = reuse_req["enableReuse"]

        self.output_callback = self.arvrunner.get_wrapped_callback(
            self.output_callback)

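        # Submit the job; with find_or_create=enable_reuse the API server
        # may return an existing equivalent job instead of queuing a new one.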
        try:
            with Perf(metrics, "create %s" % self.name):
                response = self.arvrunner.api.jobs().create(
                    body={
                        "owner_uuid": self.arvrunner.project_uuid,
                        "script": "crunchrunner",
                        "repository": "arvados",
                        "script_version": "master",
                        "minimum_script_version": crunchrunner_git_commit,
                        "script_parameters": {
                            "tasks": [script_parameters]
                        },
                        "runtime_constraints": runtime_constraints
                    },
                    filters=filters,
                    find_or_create=enable_reuse).execute(
                        num_retries=self.arvrunner.num_retries)

            self.uuid = response["uuid"]
            self.arvrunner.process_submitted(self)

            self.update_pipeline_component(response)

            if response["state"] == "Complete":
                logger.info("%s reused job %s", self.arvrunner.label(self),
                            response["uuid"])
                # Give read permission to the desired project on reused jobs
                if response["owner_uuid"] != self.arvrunner.project_uuid:
                    try:
                        self.arvrunner.api.links().create(
                            body={
                                'link_class': 'permission',
                                'name': 'can_read',
                                'tail_uuid': self.arvrunner.project_uuid,
                                'head_uuid': response["uuid"],
                            }).execute(num_retries=self.arvrunner.num_retries)
                    except ApiError as e:
                        # The user might not have "manage" access on the job: log
                        # a message and continue.
                        logger.info("Creating read permission on job %s: %s",
                                    response["uuid"], e)
            else:
                logger.info("%s %s is %s", self.arvrunner.label(self),
                            response["uuid"], response["state"])
        except Exception as e:
            logger.exception("%s error", self.arvrunner.label(self))
            self.output_callback({}, "permanentFail")
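
The reuse machinery in this example rests on a single API call: jobs().create with find_or_create plus a set of filters, so the server hands back a matching past job instead of queuing a new one when it can. A minimal standalone sketch of that call, assuming a configured arvados Python SDK (ARVADOS_API_HOST / ARVADOS_API_TOKEN); the commit hash and task body below are placeholders, not values from the original code:

import arvados

api = arvados.api("v1")

crunchrunner_git_commit = "0" * 40  # placeholder; pin a real commit here

# find_or_create=True returns an existing job satisfying the filters,
# if any, and only creates and queues a new job otherwise.
job = api.jobs().create(
    body={
        "script": "crunchrunner",
        "repository": "arvados",
        "script_version": "master",
        "minimum_script_version": crunchrunner_git_commit,
        "script_parameters": {"tasks": [{"command": ["echo", "hello"]}]},
        "runtime_constraints": {"docker_image": "arvados/jobs"},
    },
    filters=[["repository", "=", "arvados"],
             ["script", "=", "crunchrunner"],
             ["script_version", "in git", crunchrunner_git_commit]],
    find_or_create=True,
).execute()

print(job["uuid"], job["state"])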
Esempio n. 31
0
    def job(self, joborder, output_callback, **kwargs):
        # type: (Dict[Text, Any], Callable[[Any, Any], Any], **Any) -> Generator
        self.state = {}
        self.processStatus = "success"

        if "outdir" in kwargs:
            del kwargs["outdir"]

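        # Seed workflow state: every input value comes from the job order
        # or the input's declared default; a missing required input is fatal.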
        for e, i in enumerate(self.tool["inputs"]):
            with SourceLine(self.tool["inputs"], e, WorkflowException):
                iid = shortname(i["id"])
                if iid in joborder:
                    self.state[i["id"]] = WorkflowStateItem(i, copy.deepcopy(joborder[iid]), "success")
                elif "default" in i:
                    self.state[i["id"]] = WorkflowStateItem(i, copy.deepcopy(i["default"]), "success")
                else:
                    raise WorkflowException(
                        u"Input '%s' not in input object and does not have a default value." % (i["id"]))

        for s in self.steps:
            for out in s.tool["outputs"]:
                self.state[out["id"]] = None

        completed = 0
        while completed < len(self.steps):
            self.made_progress = False

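            # One scheduling pass: try to start each unsubmitted step and
            # drain any jobs its iterable produces, yielding them upward.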
            for step in self.steps:
                if kwargs.get("on_error", "stop") == "stop" and self.processStatus != "success":
                    break

                if not step.submitted:
                    try:
                        step.iterable = self.try_make_job(step, **kwargs)
                    except WorkflowException as e:
                        _logger.error(u"[%s] Cannot make job: %s", step.name, e)
                        _logger.debug("", exc_info=True)
                        self.processStatus = "permanentFail"

                if step.iterable:
                    try:
                        for newjob in step.iterable:
                            if kwargs.get("on_error", "stop") == "stop" and self.processStatus != "success":
                                break
                            if newjob:
                                self.made_progress = True
                                yield newjob
                            else:
                                break
                    except WorkflowException as e:
                        _logger.error(u"[%s] Cannot make job: %s", step.name, e)
                        _logger.debug("", exc_info=True)
                        self.processStatus = "permanentFail"

            completed = sum(1 for s in self.steps if s.completed)

            if not self.made_progress and completed < len(self.steps):
                if self.processStatus != "success":
                    break
                else:
                    yield None

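        # All steps done (or the run aborted): gather workflow outputs from
        # the accumulated state, allowing incomplete values on failure.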
        supportsMultipleInput = bool(self.workflow.get_requirement("MultipleInputFeatureRequirement")[0])

        try:
            wo = object_from_state(self.state, self.tool["outputs"], True, supportsMultipleInput, "outputSource",
                                   incomplete=True)
        except WorkflowException as e:
            _logger.error(u"[%s] Cannot collect workflow output: %s", self.name, e)
            wo = {}
            self.processStatus = "permanentFail"

        _logger.info(u"[%s] outdir is %s", self.name, self.outdir)

        output_callback(wo, self.processStatus)
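
Because job() is a generator that yields runnable jobs as they become ready and None when it is merely waiting on running steps, the caller is responsible for driving it. A sketch of such a driver loop, with a hypothetical run() method on the yielded objects and no real concurrency:

def run_workflow(process, joborder, **kwargs):
    # Hypothetical synchronous driver for the job() generator above.
    result = {}

    def collect(out, status):
        result["out"] = out
        result["status"] = status

    for runnable in process.job(joborder, collect, **kwargs):
        if runnable is not None:
            runnable.run()  # execute the yielded job to completion
        # a None yield means no step could progress this pass; a real
        # runner would block here until an in-flight job finishes

    return result.get("out"), result.get("status")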