Example 1
def main():

    parser = argparse.ArgumentParser(
        description='Common Workflow Language dependency manager')
    parser.add_argument("operation",
                        type=str,
                        choices=("install", "update", "clean", "check", "add",
                                 "search"))
    parser.add_argument("dependencies", type=str)
    parser.add_argument("upstream", type=str, nargs="?")
    parser.add_argument("--set-version", type=str, default=None)
    parser.add_argument("--install-to", type=str, default=None)

    args = parser.parse_args()

    if args.operation == "add":
        add_dep(args.dependencies, args.upstream, args.set_version,
                args.install_to)

    if args.operation == "search":
        print("WIP")
        return

    document, document_loader = load_nocheck(args.dependencies)

    lockfile = args.dependencies + ".dep.lock"
    locks = {}
    if os.path.isfile(lockfile):
        with open(lockfile, "r") as l:
            locks = json.load(l)

    verified = {}

    def do_deps(req):
        cwl_deps(os.getcwd(), req, locks, verified, args.operation)

    visit_class(document, (CWLDEP_DEPENDENCIES_URL, ), do_deps)

    unref = False
    for l in locks:
        if l not in verified:
            if args.operation == "clean":
                for i in locks[l]["installed_to"]:
                    logging.warn("Removing %s", i)
                    if os.path.isfile(i):
                        os.remove(i)
                    else:
                        shutil.rmtree(i)
            else:
                logging.warn("In cwldep.lock but not referenced: %s", l)
                verified[l] = locks[l]
                unref = True

    if unref:
        logging.warn("Use 'cwldep clean' to delete unused dependencies.")

    with open(lockfile, "w") as l:
        json.dump(verified, l, indent=4, sort_keys=True)

    document_loader.resolve_all(document, args.dependencies, checklinks=True)
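
All of these examples revolve around the same helper: visit_class walks a nested CWL document and invokes a callback on every mapping whose "class" field matches one of the requested names. The following is only a minimal stand-in for that traversal (a sketch of the pattern, not cwltool's actual implementation):

def visit_class_sketch(rec, cls, op):
    """Call op(d) on every dict in rec whose "class" is in cls (depth-first)."""
    if isinstance(rec, dict):
        if rec.get("class") in cls:
            op(rec)
        for value in rec.values():
            visit_class_sketch(value, cls, op)
    elif isinstance(rec, list):
        for entry in rec:
            visit_class_sketch(entry, cls, op)

# Collect every File object in a job order, as several examples below do.
files = []
visit_class_sketch({"inp": {"class": "File", "path": "a.txt"}},
                   ("File", ), files.append)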
Example 2
    def construct_tool_object(toolpath_object, *args, **kwargs):
        """Fix missing locations."""
        protocol = 'file://'

        def addLocation(d):
            if 'location' not in d and 'path' in d:
                d['location'] = protocol + d['path']

        visit_class(toolpath_object, ('File', 'Directory'), addLocation)
        return workflow.default_make_tool(toolpath_object, *args, **kwargs)
Example 3
    def construct_tool_object(toolpath_object, *args, **kwargs):
        """Fix missing locations."""
        protocol = "file://"

        def addLocation(d):
            if "location" not in d and "path" in d:
                d["location"] = protocol + d["path"]

        visit_class(toolpath_object, ("File", "Directory"), addLocation)
        return workflow.default_make_tool(toolpath_object, *args, **kwargs)
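
Examples 2 and 3 apply the same fix-up: a File or Directory that only carries a local "path" gets a file:// "location" derived from it, so later code that keys off "location" keeps working. Applied by hand to a single File entry, the effect is simply:

# Illustrative File entry loaded with only a local path.
file_obj = {"class": "File", "path": "/data/reads.fastq"}

if "location" not in file_obj and "path" in file_obj:
    file_obj["location"] = "file://" + file_obj["path"]

assert file_obj["location"] == "file:///data/reads.fastq"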
Example 4
def execute_workflow_step(workflow,
                          task_id,
                          job_data,
                          cwl_args=None,
                          executor=None):
    """
    Constructs and executes a single-step workflow based on "workflow"
    and "task_id". "cwl_args" can be used to update the default parameters
    used for the loading and runtime contexts. Exports a JSON file with
    the execution results.
    """

    cwl_args = {} if cwl_args is None else cwl_args
    executor = SingleJobExecutor() if executor is None else executor

    step_tmp_folder, step_cache_folder, step_outputs_folder, step_report = get_temp_folders(
        task_id=task_id, job_data=job_data)

    default_cwl_args = get_default_cwl_args(cwl_args)

    default_cwl_args.update({  # add execution specific parameters
        "tmp_outdir_prefix": step_cache_folder + "/",
        "tmpdir_prefix": step_cache_folder + "/",
        "cidfile_dir": step_tmp_folder,
        "cidfile_prefix": task_id,
        "basedir": os.getcwd(
        ),  # job should already have abs path for inputs, so this is useless
        "outdir": step_outputs_folder
    })

    workflow_step_path = os.path.join(step_tmp_folder,
                                      task_id + "_step_workflow.cwl")

    fast_cwl_step_load(  # will save the new workflow to "workflow_step_path"
        workflow=workflow,
        target_id=task_id,
        cwl_args=default_cwl_args,
        location=workflow_step_path)

    _stderr = sys.stderr  # save the (possibly captured) stderr
    sys.stderr = sys.__stderr__  # let cwltool's logger write to the real stderr
    step_outputs, step_status = executor(
        slow_cwl_load(workflow=workflow_step_path, cwl_args=default_cwl_args),
        job_data, RuntimeContext(default_cwl_args))
    sys.stderr = _stderr

    if step_status != "success":
        raise ValueError("Failed to run workflow step")

    # To remove "http://commonwl.org/cwltool#generation": 0 (copied from cwltool)
    visit_class(step_outputs, ("File", ), MutationManager().unset_generation)

    dump_json(step_outputs, step_report)

    return step_outputs, step_report
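
The visit_class call near the end of Example 4 strips cwltool's internal generation counter from each File before the outputs are written out. A self-contained stand-in for that clean-up (assuming only the annotation key named in the comment) could look like:

GENERATION_KEY = "http://commonwl.org/cwltool#generation"

def unset_generation_sketch(file_obj):
    # Drop the internal mutation-tracking counter so it doesn't leak
    # into the exported JSON report.
    file_obj.pop(GENERATION_KEY, None)

step_outputs = {"result": {"class": "File", "location": "out.txt",
                           GENERATION_KEY: 0}}
for value in step_outputs.values():
    if isinstance(value, dict) and value.get("class") == "File":
        unset_generation_sketch(value)

assert GENERATION_KEY not in step_outputs["result"]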
Example 5
def add_dep(fn, upstream, set_version, install_to):
    document_loader, workflowobj, uri = cwltool.load_tool.fetch_document(fn)
    namespaces = workflowobj.get("$namespaces", cmap({}))

    document_loader.idx = {}
    found = []

    def _add(wf):
        found.append(True)
        hints = wf.setdefault("hints", {})
        obj = cmap({"upstream": upstream})
        if set_version:
            obj["version"] = set_version
        if install_to:
            obj["installTo"] = install_to
        if isinstance(hints, list):
            for h in hints:
                if expand_ns(namespaces,
                             h["class"]) == CWLDEP_DEPENDENCIES_URL:
                    for u in h["dependencies"]:
                        if u["upstream"] == upstream:
                            u.update(obj)
                            return
                    h["dependencies"].append(cmap(obj))
                    return
            hints.append(
                cmap({
                    "class": "dep:Dependencies",
                    "dependencies": [obj]
                }))
        elif isinstance(hints, dict):
            for h in hints:
                if expand_ns(namespaces, h) == CWLDEP_DEPENDENCIES_URL:
                    for u in hints[h]["dependencies"]:
                        if u["upstream"] == upstream:
                            u.update(obj)
                            return
                    hints[h]["dependencies"].append(cmap(obj))
                    return
            hints["dep:Dependencies"] = cmap({"dependencies": [obj]})

    visit_class(workflowobj, ("Workflow", ), _add)

    if not found:
        logging.error("No Workflow found")

    namespaces["dep"] = CWLDEP_URL
    workflowobj["$namespaces"] = namespaces

    del workflowobj["id"]

    with open("_" + fn + "_", "w") as f:
        ruamel.yaml.round_trip_dump(workflowobj, f)
    os.rename("_" + fn + "_", fn)
Example 6
def total_size(outputs):
    """
    Recursively walk through an output dictionary object, totaling
    up file size from each dictionary where 'class' == 'File'
    :param outputs: output dictionary from a CWL job
    :return: Sum of all 'size' field values found
    """
    files = []
    visit_class(outputs, ("File", ), files.append)
    # Per https://www.commonwl.org/v1.0/CommandLineTool.html#File
    # size is optional in the class, so default to 0 if not found
    return sum([f.get('size', 0) for f in files])
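
Reusing the visit_class_sketch stand-in shown after Example 1 in place of the real visit_class, total_size reduces to "collect every File, then sum the optional size fields":

outputs = {
    "bam": {"class": "File", "location": "sample.bam", "size": 1024},
    "logs": [
        {"class": "File", "location": "run.log"},           # no size -> counts as 0
        {"class": "File", "location": "trace.log", "size": 256},
    ],
}

files = []
visit_class_sketch(outputs, ("File", ), files.append)  # stand-in traversal
assert sum(f.get('size', 0) for f in files) == 1280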
Example 7
    def visit_default(obj):
        remove = [False]

        def ensure_default_location(f):
            if "location" not in f and "path" in f:
                f["location"] = f["path"]
                del f["path"]
            if "location" in f and not arvrunner.fs_access.exists(
                    f["location"]):
                # Doesn't exist, remove from list of dependencies to upload
                sc[:] = [x for x in sc if x["location"] != f["location"]]
                # Delete "default" from workflowobj
                remove[0] = True

        visit_class(obj["default"], ("File", "Directory"),
                    ensure_default_location)
        if remove[0]:
            del obj["default"]
Example 8
    def _store_provenance(cls, factory: ProvenanceFactory, out) -> None:
        """Proxy method to cwltool's logic"""
        runtime_context = factory.runtime_context
        loading_context = factory.loading_context
        workflow_object = factory.workflow_object
        uri = factory.uri

        if runtime_context.research_obj is not None:
            runtime_context.research_obj.create_job(out, True)

            def remove_at_id(doc: CWLObjectType) -> None:
                for key in list(doc.keys()):
                    if key == "@id":
                        del doc[key]
                    else:
                        value = doc[key]
                        if isinstance(value, MutableMapping):
                            remove_at_id(value)
                        elif isinstance(value, MutableSequence):
                            for entry in value:
                                if isinstance(entry, MutableMapping):
                                    remove_at_id(entry)

            remove_at_id(out)
            visit_class(
                out,
                ("File", ),
                functools.partial(add_sizes,
                                  runtime_context.make_fs_access("")),
            )

            research_obj = runtime_context.research_obj
            if loading_context.loader is not None:
                research_obj.generate_snapshot(
                    prov_deps(workflow_object, loading_context.loader, uri))

            research_obj.close(factory.store_provenance_directory)
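
The remove_at_id helper in Example 8 deletes every "@id" key at any depth before the outputs are attached to the research object. The same recursion, pulled out as a standalone function for illustration:

def remove_at_id_sketch(doc):
    # Strip "@id" at every level of a nested dict/list structure.
    for key in list(doc.keys()):
        if key == "@id":
            del doc[key]
        else:
            value = doc[key]
            if isinstance(value, dict):
                remove_at_id_sketch(value)
            elif isinstance(value, list):
                for entry in value:
                    if isinstance(entry, dict):
                        remove_at_id_sketch(entry)

out = {"@id": "#main/out",
       "file": {"@id": "#f1", "class": "File", "location": "x.txt"}}
remove_at_id_sketch(out)
assert out == {"file": {"class": "File", "location": "x.txt"}}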
Example 9
def upload_dependencies(arvrunner,
                        name,
                        document_loader,
                        workflowobj,
                        uri,
                        loadref_run,
                        include_primary=True,
                        discovered_secondaryfiles=None):
    """Upload the dependencies of the workflowobj document to Keep.

    Returns a pathmapper object mapping local paths to keep references.  Also
    does an in-place update of references in "workflowobj".

    Use scandeps to find $import, $include, $schemas, run, File and Directory
    fields that represent external references.

    If workflowobj has an "id" field, this will reload the document to ensure
    it is scanning the raw document prior to preprocessing.
    """

    loaded = set()

    def loadref(b, u):
        joined = document_loader.fetcher.urljoin(b, u)
        defrg, _ = urllib.parse.urldefrag(joined)
        if defrg not in loaded:
            loaded.add(defrg)
            # Use fetch_text to get raw file (before preprocessing).
            text = document_loader.fetch_text(defrg)
            if isinstance(text, bytes):
                textIO = StringIO(text.decode('utf-8'))
            else:
                textIO = StringIO(text)
            return yaml.safe_load(textIO)
        else:
            return {}

    if loadref_run:
        loadref_fields = set(("$import", "run"))
    else:
        loadref_fields = set(("$import", ))

    scanobj = workflowobj
    if "id" in workflowobj and not workflowobj["id"].startswith("_:"):
        # Need raw file content (before preprocessing) to ensure
        # that external references in $include and $mixin are captured.
        scanobj = loadref("", workflowobj["id"])

    metadata = scanobj

    sc_result = scandeps(uri,
                         scanobj,
                         loadref_fields,
                         set(("$include", "$schemas", "location")),
                         loadref,
                         urljoin=document_loader.fetcher.urljoin)

    sc = []
    uuids = {}

    def collect_uuids(obj):
        loc = obj.get("location", "")
        sp = loc.split(":")
        if sp[0] == "keep":
            # Collect collection uuids that need to be resolved to
            # portable data hashes
            gp = collection_uuid_pattern.match(loc)
            if gp:
                uuids[gp.groups()[0]] = obj
            if collectionUUID in obj:
                uuids[obj[collectionUUID]] = obj

    def collect_uploads(obj):
        loc = obj.get("location", "")
        sp = loc.split(":")
        if len(sp) < 1:
            return
        if sp[0] in ("file", "http", "https"):
            # Record local files that need to be uploaded;
            # don't include file literals, keep references, etc.
            sc.append(obj)
        collect_uuids(obj)

    visit_class(workflowobj, ("File", "Directory"), collect_uuids)
    visit_class(sc_result, ("File", "Directory"), collect_uploads)

    # Resolve any collection uuids we found to portable data hashes
    # and assign them to uuid_map
    uuid_map = {}
    fetch_uuids = list(uuids.keys())
    while fetch_uuids:
        # For a large number of fetch_uuids, the API server may limit
        # response size, so keep fetching until the API server has
        # nothing more to give us.
        lookups = arvrunner.api.collections().list(
            filters=[["uuid", "in", fetch_uuids]],
            count="none",
            select=["uuid", "portable_data_hash"
                    ]).execute(num_retries=arvrunner.num_retries)

        if not lookups["items"]:
            break

        for l in lookups["items"]:
            uuid_map[l["uuid"]] = l["portable_data_hash"]

        fetch_uuids = [u for u in fetch_uuids if u not in uuid_map]

    normalizeFilesDirs(sc)

    if include_primary and "id" in workflowobj:
        sc.append({"class": "File", "location": workflowobj["id"]})

    if "$schemas" in workflowobj:
        for s in workflowobj["$schemas"]:
            sc.append({"class": "File", "location": s})

    def visit_default(obj):
        remove = [False]

        def ensure_default_location(f):
            if "location" not in f and "path" in f:
                f["location"] = f["path"]
                del f["path"]
            if "location" in f and not arvrunner.fs_access.exists(
                    f["location"]):
                # Doesn't exist, remove from list of dependencies to upload
                sc[:] = [x for x in sc if x["location"] != f["location"]]
                # Delete "default" from workflowobj
                remove[0] = True

        visit_class(obj["default"], ("File", "Directory"),
                    ensure_default_location)
        if remove[0]:
            del obj["default"]

    find_defaults(workflowobj, visit_default)

    discovered = {}

    def discover_default_secondary_files(obj):
        builder_job_order = {}
        for t in obj["inputs"]:
            builder_job_order[shortname(
                t["id"])] = t["default"] if "default" in t else None
        # Need to create a builder object to evaluate expressions.
        builder = make_builder(builder_job_order, obj.get("hints", []),
                               obj.get("requirements", []),
                               ArvRuntimeContext(), metadata)
        discover_secondary_files(arvrunner.fs_access, builder, obj["inputs"],
                                 builder_job_order, discovered)

    copied, _ = document_loader.resolve_all(copy.deepcopy(cmap(workflowobj)),
                                            base_url=uri,
                                            checklinks=False)
    visit_class(copied, ("CommandLineTool", "Workflow"),
                discover_default_secondary_files)

    for d in list(discovered):
        # Only interested in discovered secondaryFiles which are local
        # files that need to be uploaded.
        if d.startswith("file:"):
            sc.extend(discovered[d])
        else:
            del discovered[d]

    mapper = ArvPathMapper(arvrunner,
                           sc,
                           "",
                           "keep:%s",
                           "keep:%s/%s",
                           name=name,
                           single_collection=True)

    def setloc(p):
        loc = p.get("location")
        if loc and (not loc.startswith("_:")) and (
                not loc.startswith("keep:")):
            p["location"] = mapper.mapper(p["location"]).resolved
            return

        if not loc:
            return

        if collectionUUID in p:
            uuid = p[collectionUUID]
            if uuid not in uuid_map:
                raise SourceLine(p, collectionUUID,
                                 validate.ValidationException).makeError(
                                     "Collection uuid %s not found" % uuid)
            gp = collection_pdh_pattern.match(loc)
            if gp and uuid_map[uuid] != gp.groups()[0]:
                # This file entry has both collectionUUID and a PDH
                # location. If the PDH doesn't match the one returned
                # by the API server, raise an error.
                raise SourceLine(
                    p, "location", validate.ValidationException
                ).makeError(
                    "Expected collection uuid %s to be %s but API server reported %s"
                    % (uuid, gp.groups()[0], uuid_map[p[collectionUUID]]))

        gp = collection_uuid_pattern.match(loc)
        if not gp:
            return
        uuid = gp.groups()[0]
        if uuid not in uuid_map:
            raise SourceLine(p, "location",
                             validate.ValidationException).makeError(
                                 "Collection uuid %s not found" % uuid)
        p["location"] = "keep:%s%s" % (uuid_map[uuid], gp.groups()[1]
                                       if gp.groups()[1] else "")
        p[collectionUUID] = uuid

    visit_class(workflowobj, ("File", "Directory"), setloc)
    visit_class(discovered, ("File", "Directory"), setloc)

    if discovered_secondaryfiles is not None:
        for d in discovered:
            discovered_secondaryfiles[mapper.mapper(
                d).resolved] = discovered[d]

    if "$schemas" in workflowobj:
        sch = CommentedSeq()
        for s in workflowobj["$schemas"]:
            if s in mapper:
                sch.append(mapper.mapper(s).resolved)
        workflowobj["$schemas"] = sch

    return mapper
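
The collect_uploads / setloc pair in Example 9 sorts every location by URI scheme: file, http and https references are queued for upload, keep: references are left alone, and "_:" literals are skipped. The filtering rule itself, reduced to a few lines (a sketch of the decision, not the Arvados upload logic):

def needs_upload(location):
    # keep: references already live in Keep; "_:" marks an in-document literal.
    if not location or location.startswith("_:") or location.startswith("keep:"):
        return False
    scheme = location.split(":", 1)[0]
    return scheme in ("file", "http", "https")

assert needs_upload("file:///tmp/input.fastq")
assert not needs_upload("keep:0123456789abcdef0123456789abcdef+42/input.fastq")  # illustrative PDH
assert not needs_upload("_:literal-file")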
Example 10
    def job(self, joborder, output_callback, runtimeContext):

        builder = make_builder(joborder, self.hints, self.requirements, runtimeContext, self.metadata)
        runtimeContext = set_cluster_target(self.tool, self.arvrunner, builder, runtimeContext)

        req, _ = self.get_requirement("http://arvados.org/cwl#RunInSingleContainer")
        if not req:
            return super(ArvadosWorkflow, self).job(joborder, output_callback, runtimeContext)

        # RunInSingleContainer is true

        with SourceLine(self.tool, None, WorkflowException, logger.isEnabledFor(logging.DEBUG)):
            if "id" not in self.tool:
                raise WorkflowException("%s object must have 'id'" % (self.tool["class"]))

        discover_secondary_files(self.arvrunner.fs_access, builder,
                                 self.tool["inputs"], joborder)

        with Perf(metrics, "subworkflow upload_deps"):
            upload_dependencies(self.arvrunner,
                                os.path.basename(joborder.get("id", "#")),
                                self.doc_loader,
                                joborder,
                                joborder.get("id", "#"),
                                False)

            if self.wf_pdh is None:
                packed = pack(self.loadingContext, self.tool["id"], loader=self.doc_loader)

                for p in packed["$graph"]:
                    if p["id"] == "#main":
                        p["requirements"] = dedup_reqs(self.requirements)
                        p["hints"] = dedup_reqs(self.hints)

                def visit(item):
                    if "requirements" in item:
                        item["requirements"] = [i for i in item["requirements"] if i["class"] != "DockerRequirement"]
                    for t in ("hints", "requirements"):
                        if t not in item:
                            continue
                        for req in item[t]:
                            if req["class"] == "ResourceRequirement":
                                dyn = False
                                for k in max_res_pars + sum_res_pars:
                                    if k in req:
                                        if isinstance(req[k], basestring):
                                            if item["id"] == "#main":
                                                # only the top-level requirements/hints may contain expressions
                                                self.dynamic_resource_req.append(req)
                                                dyn = True
                                                break
                                            else:
                                                with SourceLine(req, k, WorkflowException):
                                                    raise WorkflowException("Non-top-level ResourceRequirement in single container cannot have expressions")
                                if not dyn:
                                    self.static_resource_req.append(req)

                visit_class(packed["$graph"], ("Workflow", "CommandLineTool"), visit)

                if self.static_resource_req:
                    self.static_resource_req = [get_overall_res_req(self.static_resource_req)]

                upload_dependencies(self.arvrunner,
                                    runtimeContext.name,
                                    self.doc_loader,
                                    packed,
                                    self.tool["id"],
                                    False)

                # Discover files/directories referenced by the
                # workflow (mainly "default" values)
                visit_class(packed, ("File", "Directory"), self.wf_reffiles.append)


        if self.dynamic_resource_req:
            # Evaluate dynamic resource requirements using current builder
            rs = copy.copy(self.static_resource_req)
            for dyn_rs in self.dynamic_resource_req:
                eval_req = {"class": "ResourceRequirement"}
                for a in max_res_pars + sum_res_pars:
                    if a in dyn_rs:
                        eval_req[a] = builder.do_eval(dyn_rs[a])
                rs.append(eval_req)
            job_res_reqs = [get_overall_res_req(rs)]
        else:
            job_res_reqs = self.static_resource_req

        with Perf(metrics, "subworkflow adjust"):
            joborder_resolved = copy.deepcopy(joborder)
            joborder_keepmount = copy.deepcopy(joborder)

            reffiles = []
            visit_class(joborder_keepmount, ("File", "Directory"), reffiles.append)

            mapper = ArvPathMapper(self.arvrunner, reffiles+self.wf_reffiles, runtimeContext.basedir,
                                   "/keep/%s",
                                   "/keep/%s/%s")

            # For containers API, we need to make sure any extra
            # referenced files (i.e. referenced by the workflow but
            # not in the inputs) are included in the mounts.
            if self.wf_reffiles:
                runtimeContext = runtimeContext.copy()
                runtimeContext.extra_reffiles = copy.deepcopy(self.wf_reffiles)

            def keepmount(obj):
                remove_redundant_fields(obj)
                with SourceLine(obj, None, WorkflowException, logger.isEnabledFor(logging.DEBUG)):
                    if "location" not in obj:
                        raise WorkflowException("%s object is missing required 'location' field: %s" % (obj["class"], obj))
                with SourceLine(obj, "location", WorkflowException, logger.isEnabledFor(logging.DEBUG)):
                    if obj["location"].startswith("keep:"):
                        obj["location"] = mapper.mapper(obj["location"]).target
                        if "listing" in obj:
                            del obj["listing"]
                    elif obj["location"].startswith("_:"):
                        del obj["location"]
                    else:
                        raise WorkflowException("Location is not a keep reference or a literal: '%s'" % obj["location"])

            visit_class(joborder_keepmount, ("File", "Directory"), keepmount)

            def resolved(obj):
                if obj["location"].startswith("keep:"):
                    obj["location"] = mapper.mapper(obj["location"]).resolved

            visit_class(joborder_resolved, ("File", "Directory"), resolved)

            if self.wf_pdh is None:
                adjustFileObjs(packed, keepmount)
                adjustDirObjs(packed, keepmount)
                self.wf_pdh = upload_workflow_collection(self.arvrunner, shortname(self.tool["id"]), packed)

        self.loadingContext = self.loadingContext.copy()
        self.loadingContext.metadata = self.loadingContext.metadata.copy()
        self.loadingContext.metadata["http://commonwl.org/cwltool#original_cwlVersion"] = "v1.0"

        if len(job_res_reqs) == 1:
            # RAM request needs to be at least 128 MiB or the workflow
            # runner itself won't run reliably.
            if job_res_reqs[0].get("ramMin", 1024) < 128:
                job_res_reqs[0]["ramMin"] = 128

        arguments = ["--no-container", "--move-outputs", "--preserve-entire-environment", "workflow.cwl", "cwl.input.yml"]
        if runtimeContext.debug:
            arguments.insert(0, '--debug')

        wf_runner = cmap({
            "class": "CommandLineTool",
            "baseCommand": "cwltool",
            "inputs": self.tool["inputs"],
            "outputs": self.tool["outputs"],
            "stdout": "cwl.output.json",
            "requirements": self.requirements+job_res_reqs+[
                {"class": "InlineJavascriptRequirement"},
                {
                "class": "InitialWorkDirRequirement",
                "listing": [{
                        "entryname": "workflow.cwl",
                        "entry": '$({"class": "File", "location": "keep:%s/workflow.cwl"})' % self.wf_pdh
                    }, {
                        "entryname": "cwl.input.yml",
                        "entry": json.dumps(joborder_keepmount, indent=2, sort_keys=True, separators=(',',': ')).replace("\\", "\\\\").replace('$(', '\$(').replace('${', '\${')
                    }]
            }],
            "hints": self.hints,
            "arguments": arguments,
            "id": "#"
        })
        return ArvadosCommandTool(self.arvrunner, wf_runner, self.loadingContext).job(joborder_resolved, output_callback, runtimeContext)
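
keepmount in Example 10 accepts exactly two kinds of locations: keep: references, which are rewritten to their /keep/... mount paths (dropping any cached listing), and "_:" literals, whose location is removed; anything else raises. With a plain dict standing in for the path mapper:

def keepmount_sketch(obj, keep_targets):
    loc = obj.get("location")
    if loc is None:
        raise ValueError("%s object is missing required 'location' field"
                         % obj.get("class"))
    if loc.startswith("keep:"):
        obj["location"] = keep_targets[loc]   # e.g. "/keep/<pdh>/reads.fastq"
        obj.pop("listing", None)              # the mount provides the listing
    elif loc.startswith("_:"):
        del obj["location"]                   # literal: content travels inline
    else:
        raise ValueError("not a keep reference or a literal: %r" % loc)

f = {"class": "File", "location": "keep:abc+123/reads.fastq"}   # illustrative PDH
keepmount_sketch(f, {"keep:abc+123/reads.fastq": "/keep/abc+123/reads.fastq"})
assert f["location"] == "/keep/abc+123/reads.fastq"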
Example 11
def cwl_deps(basedir, dependencies, locks, verified, operation):
    for d in dependencies["dependencies"]:
        upstream = d["upstream"]
        spup = urllib.parse.urlsplit(upstream)

        if d.get("installTo"):
            installTo = os.path.join(basedir, d.get("installTo"))
        else:
            installTo = os.path.dirname(
                os.path.join(basedir, spup.netloc, spup.path.lstrip("/")))

        if not os.path.isdir(installTo):
            os.makedirs(installTo)

        if spup.scheme == "http" or spup.scheme == "https":
            tgt = os.path.join(installTo, os.path.basename(spup.path))

            if spup.path.endswith(".cwl"):
                deps = {
                    "class": "File",
                    "location": upstream
                }  # type: Dict[Text, Any]

                document, loading_context = load_nocheck(upstream)

                def loadref(base, uri):
                    return loading_context.loader.fetch(
                        loading_context.loader.fetcher.urljoin(base, uri))

                loading_context.loader.idx = {}

                sfs = scandeps(upstream,
                               loading_context.loader.fetch(upstream),
                               {"$import", "run"},
                               {"$include", "$schemas", "location"}, loadref)
                if sfs:
                    deps["secondaryFiles"] = sfs

                def retrieve(obj):
                    sploc = urllib.parse.urlsplit(obj["location"])
                    rp = os.path.relpath(sploc.path,
                                         os.path.dirname(spup.path))
                    tgt = os.path.join(installTo, rp)
                    if not os.path.isdir(os.path.dirname(tgt)):
                        os.makedirs(os.path.dirname(tgt))
                    if verify(
                            tgt, locks,
                            verified) and operation not in ("update", "check"):
                        return
                    download(tgt, obj["location"], "", locks, verified,
                             operation == "check")

                visit_class(deps, ("File", ), retrieve)

                def do_deps(req):
                    cwl_deps(installTo, req, locks, verified, operation)

                visit_class(document, (CWLDEP_DEPENDENCIES_URL, ), do_deps)

            elif spup.path.endswith(".tar.gz") or spup.path.endswith(
                    ".tar.bz2") or spup.path.endswith(".zip"):
                download(tgt, upstream, "", locks, verified,
                         operation == "check")
                if spup.path.endswith(".tar.gz"):
                    with tarfile.open(tgt) as t:
                        t.extractall(installTo)
                elif spup.path.endswith(".tar.bz2"):
                    with tarfile.open(tgt) as t:
                        t.extractall(installTo)
                elif spup.path.endswith(".zip"):
                    with zipfile.ZipFile(tgt) as z:
                        z.extractall(installTo)
                rel = os.path.relpath(tgt, os.getcwd())
                verified[rel]["installed_to"] = [
                    tgt, os.path.relpath(ex, os.getcwd())
                ]

            else:
                rq = requests.get(upstream +
                                  ".git/info/refs?service=git-upload-pack")
                if rq.status_code == 200:
                    if os.path.isdir(os.path.join(tgt, ".git")):
                        subprocess.call(["git", "fetch", "--all"])
                    else:
                        subprocess.call(["git", "clone", upstream, tgt])

                    version = d.get("version")
                    rel = os.path.relpath(tgt, os.getcwd())
                    if rel in locks and operation != "update":
                        version = locks[rel]["version"]

                    if version:
                        print(version)
                        co = subprocess.check_output(
                            ["git", "rev-parse", version], cwd=tgt).rstrip()
                        head = subprocess.check_output(
                            ["git", "rev-parse", "HEAD"], cwd=tgt).rstrip()
                        if head != co:
                            subprocess.call(["git", "checkout", co], cwd=tgt)
                    commit = subprocess.check_output(
                        ["git", "rev-parse", "HEAD"],
                        cwd=tgt).rstrip().decode('utf-8')

                    verified[rel] = {
                        "upstream": upstream,
                        "version": commit,
                        "retrieved_at": datetime.now(tzlocal()).isoformat(),
                        "installed_to": [rel]
                    }

        else:
            logging.error("Scheme %s not supported", spup.scheme)
Example 12
    def arvados_job_spec(self, runtimeContext):
        """Create an Arvados container request for this workflow.

        The returned dict can be passed as the +body+ argument to
        container_requests().create() to create the container request.
        """

        adjustDirObjs(self.job_order, trim_listing)
        visit_class(self.job_order, ("File", "Directory"),
                    trim_anonymous_location)
        visit_class(self.job_order, ("File", "Directory"),
                    remove_redundant_fields)

        secret_mounts = {}
        for param in sorted(self.job_order.keys()):
            if self.secret_store.has_secret(self.job_order[param]):
                mnt = "/secrets/s%d" % len(secret_mounts)
                secret_mounts[mnt] = {
                    "kind": "text",
                    "content":
                    self.secret_store.retrieve(self.job_order[param])
                }
                self.job_order[param] = {"$include": mnt}

        container_req = {
            "name": self.name,
            "output_path": "/var/spool/cwl",
            "cwd": "/var/spool/cwl",
            "priority": self.priority,
            "state": "Committed",
            "container_image": arvados_jobs_image(self.arvrunner,
                                                  self.jobs_image),
            "mounts": {
                "/var/lib/cwl/cwl.input.json": {
                    "kind": "json",
                    "content": self.job_order
                },
                "stdout": {
                    "kind": "file",
                    "path": "/var/spool/cwl/cwl.output.json"
                },
                "/var/spool/cwl": {
                    "kind": "collection",
                    "writable": True
                }
            },
            "secret_mounts": secret_mounts,
            "runtime_constraints": {
                "vcpus":
                math.ceil(self.submit_runner_cores),
                "ram":
                1024 * 1024 * (math.ceil(self.submit_runner_ram) +
                               math.ceil(self.collection_cache_size)),
                "API":
                True
            },
            "use_existing":
            False,  # Never reuse the runner container - see #15497.
            "properties": {}
        }

        if self.embedded_tool.tool.get("id", "").startswith("keep:"):
            sp = self.embedded_tool.tool["id"].split('/')
            workflowcollection = sp[0][5:]
            workflowname = "/".join(sp[1:])
            workflowpath = "/var/lib/cwl/workflow/%s" % workflowname
            container_req["mounts"]["/var/lib/cwl/workflow"] = {
                "kind": "collection",
                "portable_data_hash": "%s" % workflowcollection
            }
        else:
            packed = packed_workflow(self.arvrunner, self.embedded_tool,
                                     self.merged_map)
            workflowpath = "/var/lib/cwl/workflow.json#main"
            container_req["mounts"]["/var/lib/cwl/workflow.json"] = {
                "kind": "json",
                "content": packed
            }
            if self.embedded_tool.tool.get("id", "").startswith("arvwf:"):
                container_req["properties"][
                    "template_uuid"] = self.embedded_tool.tool["id"][6:33]

        # --local means execute the workflow instead of submitting a container request
        # --api=containers means use the containers API
        # --no-log-timestamps means don't add timestamps (the logging infrastructure does this)
        # --disable-validate because we already validated so don't need to do it again
        # --eval-timeout is the timeout for javascript invocation
        # --parallel-task-count is the number of threads to use for job submission
        # --enable/disable-reuse sets desired job reuse
        # --collection-cache-size sets aside memory to store collections
        command = [
            "arvados-cwl-runner", "--local", "--api=containers",
            "--no-log-timestamps", "--disable-validate", "--disable-color",
            "--eval-timeout=%s" % self.arvrunner.eval_timeout,
            "--thread-count=%s" % self.arvrunner.thread_count,
            "--enable-reuse" if self.enable_reuse else "--disable-reuse",
            "--collection-cache-size=%s" % self.collection_cache_size
        ]

        if self.output_name:
            command.append("--output-name=" + self.output_name)
            container_req["output_name"] = self.output_name

        if self.output_tags:
            command.append("--output-tags=" + self.output_tags)

        if runtimeContext.debug:
            command.append("--debug")

        if runtimeContext.storage_classes != "default":
            command.append("--storage-classes=" +
                           runtimeContext.storage_classes)

        if self.on_error:
            command.append("--on-error=" + self.on_error)

        if self.intermediate_output_ttl:
            command.append("--intermediate-output-ttl=%d" %
                           self.intermediate_output_ttl)

        if self.arvrunner.trash_intermediate:
            command.append("--trash-intermediate")

        if self.arvrunner.project_uuid:
            command.append("--project-uuid=" + self.arvrunner.project_uuid)

        if self.enable_dev:
            command.append("--enable-dev")

        command.extend([workflowpath, "/var/lib/cwl/cwl.input.json"])

        container_req["command"] = command

        return container_req
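
The secret-handling loop near the top of Example 12 moves secret job inputs out of cwl.input.json and into dedicated text mounts, leaving only an "$include" pointer behind. Stripped of the Arvados objects (hypothetical is_secret/reveal callables in place of the secret store):

def extract_secrets(job_order, is_secret, reveal):
    secret_mounts = {}
    for param in sorted(job_order.keys()):
        if is_secret(job_order[param]):
            mnt = "/secrets/s%d" % len(secret_mounts)
            secret_mounts[mnt] = {"kind": "text",
                                  "content": reveal(job_order[param])}
            job_order[param] = {"$include": mnt}  # input now points at the mount
    return secret_mounts

job = {"password": "s3cret", "sample": "reads.fastq"}
mounts = extract_secrets(job, is_secret=lambda v: v == "s3cret",
                         reveal=lambda v: v)
assert job["password"] == {"$include": "/secrets/s0"}
assert mounts["/secrets/s0"]["content"] == "s3cret"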
Example 13
    def arv_executor(self,
                     updated_tool,
                     job_order,
                     runtimeContext,
                     logger=None):
        self.debug = runtimeContext.debug

        updated_tool.visit(self.check_features)

        self.project_uuid = runtimeContext.project_uuid
        self.pipeline = None
        self.fs_access = runtimeContext.make_fs_access(runtimeContext.basedir)
        self.secret_store = runtimeContext.secret_store

        self.trash_intermediate = runtimeContext.trash_intermediate
        if self.trash_intermediate and self.work_api != "containers":
            raise Exception(
                "--trash-intermediate is only supported with --api=containers."
            )

        self.intermediate_output_ttl = runtimeContext.intermediate_output_ttl
        if self.intermediate_output_ttl and self.work_api != "containers":
            raise Exception(
                "--intermediate-output-ttl is only supported with --api=containers."
            )
        if self.intermediate_output_ttl < 0:
            raise Exception(
                "Invalid value %d for --intermediate-output-ttl, cannot be less than zero"
                % self.intermediate_output_ttl)

        if runtimeContext.submit_request_uuid and self.work_api != "containers":
            raise Exception(
                "--submit-request-uuid requires containers API, but using '{}' api"
                .format(self.work_api))

        if not runtimeContext.name:
            runtimeContext.name = self.name = updated_tool.tool.get(
                "label") or updated_tool.metadata.get(
                    "label") or os.path.basename(updated_tool.tool["id"])

        # Upload local file references in the job order.
        job_order = upload_job_order(self, "%s input" % runtimeContext.name,
                                     updated_tool, job_order)

        # the last clause means: if it is a command line tool, and we
        # are going to wait for the result, and always_submit_runner
        # is false, then we don't submit a runner process.

        submitting = (runtimeContext.update_workflow
                      or runtimeContext.create_workflow or
                      (runtimeContext.submit
                       and not (updated_tool.tool["class"] == "CommandLineTool"
                                and runtimeContext.wait
                                and not runtimeContext.always_submit_runner)))

        loadingContext = self.loadingContext.copy()
        loadingContext.do_validate = False
        loadingContext.do_update = False
        if submitting:
            # Document may have been auto-updated. Reload the original
            # document with updating disabled because we want to
            # submit the document with its original CWL version, not
            # the auto-updated one.
            tool = load_tool(updated_tool.tool["id"], loadingContext)
        else:
            tool = updated_tool

        # Upload direct dependencies of workflow steps, get back mapping of files to keep references.
        # Also uploads docker images.
        merged_map = upload_workflow_deps(self, tool)

        # Recreate process object (ArvadosWorkflow or
        # ArvadosCommandTool) because tool document may have been
        # updated by upload_workflow_deps in ways that modify
        # inheritance of hints or requirements.
        loadingContext.loader = tool.doc_loader
        loadingContext.avsc_names = tool.doc_schema
        loadingContext.metadata = tool.metadata
        tool = load_tool(tool.tool, loadingContext)

        existing_uuid = runtimeContext.update_workflow
        if existing_uuid or runtimeContext.create_workflow:
            # Create a pipeline template or workflow record and exit.
            if self.work_api == "containers":
                return (upload_workflow(
                    self,
                    tool,
                    job_order,
                    self.project_uuid,
                    uuid=existing_uuid,
                    submit_runner_ram=runtimeContext.submit_runner_ram,
                    name=runtimeContext.name,
                    merged_map=merged_map), "success")

        self.apply_reqs(job_order, tool)

        self.ignore_docker_for_reuse = runtimeContext.ignore_docker_for_reuse
        self.eval_timeout = runtimeContext.eval_timeout

        runtimeContext = runtimeContext.copy()
        runtimeContext.use_container = True
        runtimeContext.tmpdir_prefix = "tmp"
        runtimeContext.work_api = self.work_api

        if self.work_api == "containers":
            if self.ignore_docker_for_reuse:
                raise Exception(
                    "--ignore-docker-for-reuse not supported with containers API."
                )
            runtimeContext.outdir = "/var/spool/cwl"
            runtimeContext.docker_outdir = "/var/spool/cwl"
            runtimeContext.tmpdir = "/tmp"
            runtimeContext.docker_tmpdir = "/tmp"

        if runtimeContext.priority < 1 or runtimeContext.priority > 1000:
            raise Exception("--priority must be in the range 1..1000.")

        if self.should_estimate_cache_size:
            visited = set()
            estimated_size = [0]

            def estimate_collection_cache(obj):
                if obj.get("location", "").startswith("keep:"):
                    m = pdh_size.match(obj["location"][5:])
                    if m and m.group(1) not in visited:
                        visited.add(m.group(1))
                        estimated_size[0] += int(m.group(2))

            visit_class(job_order, ("File", "Directory"),
                        estimate_collection_cache)
            runtimeContext.collection_cache_size = max(
                ((estimated_size[0] * 192) // (1024 * 1024)) + 1, 256)
            self.collection_cache.set_cap(
                runtimeContext.collection_cache_size * 1024 * 1024)

        logger.info("Using collection cache size %s MiB",
                    runtimeContext.collection_cache_size)

        runnerjob = None
        if runtimeContext.submit:
            # Submit a runner job to run the workflow for us.
            if self.work_api == "containers":
                if submitting:
                    tool = RunnerContainer(
                        self,
                        updated_tool,
                        tool,
                        loadingContext,
                        runtimeContext.enable_reuse,
                        self.output_name,
                        self.output_tags,
                        submit_runner_ram=runtimeContext.submit_runner_ram,
                        name=runtimeContext.name,
                        on_error=runtimeContext.on_error,
                        submit_runner_image=runtimeContext.submit_runner_image,
                        intermediate_output_ttl=runtimeContext.
                        intermediate_output_ttl,
                        merged_map=merged_map,
                        priority=runtimeContext.priority,
                        secret_store=self.secret_store,
                        collection_cache_size=runtimeContext.
                        collection_cache_size,
                        collection_cache_is_default=self.
                        should_estimate_cache_size)
                else:
                    runtimeContext.runnerjob = tool.tool["id"]

        if runtimeContext.cwl_runner_job is not None:
            self.uuid = runtimeContext.cwl_runner_job.get('uuid')

        jobiter = tool.job(job_order, self.output_callback, runtimeContext)

        if runtimeContext.submit and not runtimeContext.wait:
            runnerjob = next(jobiter)
            runnerjob.run(runtimeContext)
            return (runnerjob.uuid, "success")

        current_container = arvados_cwl.util.get_current_container(
            self.api, self.num_retries, logger)
        if current_container:
            logger.info("Running inside container %s",
                        current_container.get("uuid"))

        self.poll_api = arvados.api('v1', timeout=runtimeContext.http_timeout)
        self.polling_thread = threading.Thread(target=self.poll_states)
        self.polling_thread.start()

        self.task_queue = TaskQueue(self.workflow_eval_lock, self.thread_count)

        try:
            self.workflow_eval_lock.acquire()

            # Holds the lock while this code runs and releases it when
            # it is safe to do so in self.workflow_eval_lock.wait(),
            # at which point on_message can update job state and
            # process output callbacks.

            loopperf = Perf(metrics, "jobiter")
            loopperf.__enter__()
            for runnable in jobiter:
                loopperf.__exit__()

                if self.stop_polling.is_set():
                    break

                if self.task_queue.error is not None:
                    raise self.task_queue.error

                if runnable:
                    with Perf(metrics, "run"):
                        self.start_run(runnable, runtimeContext)
                else:
                    if (self.task_queue.in_flight + len(self.processes)) > 0:
                        self.workflow_eval_lock.wait(3)
                    else:
                        logger.error(
                            "Workflow is deadlocked, no runnable processes and not waiting on any pending processes."
                        )
                        break

                if self.stop_polling.is_set():
                    break

                loopperf.__enter__()
            loopperf.__exit__()

            while (self.task_queue.in_flight + len(self.processes)) > 0:
                if self.task_queue.error is not None:
                    raise self.task_queue.error
                self.workflow_eval_lock.wait(3)

        except UnsupportedRequirement:
            raise
        except:
            if sys.exc_info()[0] is KeyboardInterrupt or sys.exc_info(
            )[0] is SystemExit:
                logger.error("Interrupted, workflow will be cancelled")
            elif isinstance(sys.exc_info()[1], WorkflowException):
                logger.error(
                    "Workflow execution failed:\n%s",
                    sys.exc_info()[1],
                    exc_info=(sys.exc_info()[1] if self.debug else False))
            else:
                logger.exception("Workflow execution failed")

            if self.pipeline:
                self.api.pipeline_instances().update(
                    uuid=self.pipeline["uuid"], body={
                        "state": "Failed"
                    }).execute(num_retries=self.num_retries)

            if self.work_api == "containers" and not current_container:
                # Not running in a crunch container, so cancel any outstanding processes.
                for p in self.processes:
                    try:
                        self.api.container_requests().update(
                            uuid=p, body={
                                "priority": "0"
                            }).execute(num_retries=self.num_retries)
                    except Exception:
                        pass
        finally:
            self.workflow_eval_lock.release()
            self.task_queue.drain()
            self.stop_polling.set()
            self.polling_thread.join()
            self.task_queue.join()

        if self.final_status == "UnsupportedRequirement":
            raise UnsupportedRequirement("Check log for details.")

        if self.final_output is None:
            raise WorkflowException("Workflow did not return a result.")

        if runtimeContext.submit and isinstance(tool, Runner):
            logger.info("Final output collection %s", tool.final_output)
        else:
            if self.output_name is None:
                self.output_name = "Output of %s" % (shortname(
                    tool.tool["id"]))
            if self.output_tags is None:
                self.output_tags = ""

            storage_classes = runtimeContext.storage_classes.strip().split(",")
            self.final_output, self.final_output_collection = self.make_output_collection(
                self.output_name, storage_classes, self.output_tags,
                self.final_output)
            self.set_crunch_output()

        if runtimeContext.compute_checksum:
            adjustDirObjs(self.final_output,
                          partial(get_listing, self.fs_access))
            adjustFileObjs(self.final_output,
                           partial(compute_checksums, self.fs_access))

        if self.trash_intermediate and self.final_status == "success":
            self.trash_intermediate_output()

        return (self.final_output, self.final_status)
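
The cache sizing in Example 13 multiplies the estimated size of the keep: references by 192, converts to MiB, and never goes below a 256 MiB floor. Worked through with the same formula:

def collection_cache_mib(estimated_size_bytes):
    # Same arithmetic as Example 13: scale by 192, convert to MiB, 256 MiB floor.
    return max(((estimated_size_bytes * 192) // (1024 * 1024)) + 1, 256)

assert collection_cache_mib(0) == 256
assert collection_cache_mib(10 * 1024 * 1024) == 1921   # 10 MiB of referenced data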
Example 14
def execute_workflow_step(workflow,
                          task_id,
                          job_data,
                          cwl_args=None,
                          executor=None):
    """
    Constructs and executes a single-step workflow based on "workflow"
    and "task_id". "cwl_args" can be used to update the default parameters
    used for the loading and runtime contexts. Exports a JSON file with
    the execution results. If the step is evaluated as one that needs to
    be skipped, the returned "skipped" flag is set to True and the
    step_report file will contain nulls. This function doesn't remove any
    temporary data in either the success or the failure scenario.
    """

    cwl_args = {} if cwl_args is None else cwl_args
    executor = SingleJobExecutor() if executor is None else executor

    step_tmp_folder, step_cache_folder, step_outputs_folder, step_report = get_temp_folders(
        task_id=task_id, job_data=job_data)

    default_cwl_args = get_default_cwl_args(cwl_args)

    default_cwl_args.update({  # add execution specific parameters
        "tmp_outdir_prefix": step_cache_folder + "/",
        "tmpdir_prefix": step_cache_folder + "/",
        "cidfile_dir": step_tmp_folder,
        "cidfile_prefix": task_id,
        "basedir": os.getcwd(
        ),  # job should already have abs path for inputs, so this is useless
        "outdir": step_outputs_folder
    })

    workflow_step_path = os.path.join(step_tmp_folder,
                                      task_id + "_step_workflow.cwl")

    fast_cwl_step_load(  # will save the new workflow to "workflow_step_path"
        workflow=workflow,
        target_id=task_id,
        cwl_args=default_cwl_args,
        location=workflow_step_path)

    workflow_data = slow_cwl_load(workflow=workflow_step_path,
                                  cwl_args=default_cwl_args)

    skipped = True
    step_outputs = {
        output_id: None
        for output_id, _ in get_items(workflow_data.tool["outputs"])
    }
    if need_to_run(workflow_data, job_data, task_id):
        skipped = False
        _stderr = sys.stderr  # save the (possibly captured) stderr
        sys.stderr = sys.__stderr__  # let cwltool's logger write to the real stderr
        step_outputs, step_status = executor(workflow_data, job_data,
                                             RuntimeContext(default_cwl_args))
        sys.stderr = _stderr

        if step_status != "success":
            raise ValueError("Failed to run workflow step")

        # To remove "http://commonwl.org/cwltool#generation": 0 (copied from cwltool)
        visit_class(step_outputs, ("File", ),
                    MutationManager().unset_generation)

    dump_json(step_outputs, step_report)

    return step_outputs, step_report, skipped
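
Compared with Example 4, this version first pre-populates every declared output with None and only calls the executor when need_to_run says the step should run, so a skipped step still writes a complete (all-null) report. The shape of that guard, reduced to its essentials (hypothetical should_run flag and run_step callable):

def run_or_skip(output_ids, should_run, run_step):
    """Return (outputs, skipped); outputs are all None when the step is skipped."""
    outputs = {output_id: None for output_id in output_ids}
    skipped = True
    if should_run:
        skipped = False
        outputs = run_step()   # stand-in for the executor call
    return outputs, skipped

outputs, skipped = run_or_skip(["bam", "report"], should_run=False, run_step=dict)
assert skipped and outputs == {"bam": None, "report": None}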