Example #1
    def arvados_job_spec(self, *args, **kwargs):
        self.upload_docker(self.tool)

        workflowfiles = []
        jobfiles = []
        workflowfiles.append({"class":"File", "location": self.tool.tool["id"]})

        self.name = os.path.basename(self.tool.tool["id"])

        def visitFiles(files, path):
            files.append(path)

        document_loader, workflowobj, uri = fetch_document(self.tool.tool["id"])
        loaded = set()
        def loadref(b, u):
            joined = urlparse.urljoin(b, u)
            if joined not in loaded:
                loaded.add(joined)
                return document_loader.fetch(urlparse.urljoin(b, u))
            else:
                return {}

        sc = scandeps(uri, workflowobj,
                      set(("$import", "run")),
                      set(("$include", "$schemas", "path", "location")),
                      loadref)
        adjustFileObjs(sc, partial(visitFiles, workflowfiles))
        adjustFileObjs(self.job_order, partial(visitFiles, jobfiles))
        adjustDirObjs(sc, partial(visitFiles, workflowfiles))
        adjustDirObjs(self.job_order, partial(visitFiles, jobfiles))

        normalizeFilesDirs(jobfiles)
        normalizeFilesDirs(workflowfiles)

        keepprefix = kwargs.get("keepprefix", "")
        workflowmapper = ArvPathMapper(self.arvrunner, workflowfiles, "",
                                       keepprefix+"%s",
                                       keepprefix+"%s/%s",
                                       name=self.name,
                                       **kwargs)

        jobmapper = ArvPathMapper(self.arvrunner, jobfiles, "",
                                  keepprefix+"%s",
                                  keepprefix+"%s/%s",
                                  name=os.path.basename(self.job_order.get("id", "#")),
                                  **kwargs)

        def setloc(p):
            p["location"] = jobmapper.mapper(p["location"])[1]
        adjustFileObjs(self.job_order, setloc)
        adjustDirObjs(self.job_order, setloc)

        if "id" in self.job_order:
            del self.job_order["id"]

        return workflowmapper
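
The loadref closure above keeps a loaded set so that repeated or cyclic $import/run references are fetched only once. A minimal, self-contained sketch of that memoization pattern (Python 3 here; DOCS and its contents are hypothetical stand-ins for document_loader.fetch):

from urllib.parse import urljoin

# Hypothetical in-memory document graph standing in for document_loader.fetch.
DOCS = {
    "file:///wf/main.cwl": {"run": "step.cwl"},
    "file:///wf/step.cwl": {"class": "CommandLineTool"},
}

def make_loadref(fetch):
    loaded = set()
    def loadref(base, url):
        joined = urljoin(base, url)
        if joined in loaded:
            # Already fetched once: return an empty document so the
            # dependency scan stops recursing into this branch.
            return {}
        loaded.add(joined)
        return fetch(joined)
    return loadref

loadref = make_loadref(DOCS.get)
print(loadref("file:///wf/main.cwl", "step.cwl"))  # {'class': 'CommandLineTool'}
print(loadref("file:///wf/main.cwl", "step.cwl"))  # {} on the second visit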
Example #2
    def collect_output_ports(self,
                             ports,                  # type: Set[Dict[Text, Any]]
                             builder,                # type: Builder
                             outdir,                 # type: Text
                             compute_checksum=True,  # type: bool
                             jobname="",             # type: Text
                             readers=None            # type: Dict[Text, Any]
                             ):                      # type: (...) -> OutputPorts
        ret = {}  # type: OutputPorts
        debug = LOGGER.isEnabledFor(logging.DEBUG)
        try:
            fs_access = builder.make_fs_access(outdir)
            custom_output = fs_access.join(outdir, "cwl.output.json")
            if fs_access.exists(custom_output):
                with fs_access.open(custom_output, "r") as f:
                    ret = json.load(f)
                if debug:
                    LOGGER.debug(u"Raw output from %s: %s", custom_output, json.dumps(ret, indent=4))
            else:
                for i, port in enumerate(ports):
                    def make_workflow_exception(msg):
                        return WorkflowException(
                            u"Error collecting output for parameter '%s':\n%s"
                            % (shortname(port["id"]), msg))
                    with SourceLine(ports, i, make_workflow_exception, debug):
                        fragment = shortname(port["id"])
                        ret[fragment] = self.collect_output(port, builder, outdir, fs_access,
                                                            compute_checksum=compute_checksum)
            if ret:
                # revmap = partial(command_line_tool.revmap_file, builder, outdir)
                adjustDirObjs(ret, trim_listing)

                # TODO: Attempt to avoid a crash because the revmap function is not
                #       functional (intended for Docker usage only?)
                # visit_class(ret, ("File", "Directory"), cast(Callable[[Any], Any], revmap))
                visit_class(ret, ("File", "Directory"), command_line_tool.remove_path)
                normalizeFilesDirs(ret)
                visit_class(ret, ("File", "Directory"), partial(command_line_tool.check_valid_locations, fs_access))

                if compute_checksum:
                    adjustFileObjs(ret, partial(compute_checksums, fs_access))

            validate.validate_ex(
                self.names.get_name("outputs_record_schema", ""), ret,
                strict=False, logger=LOGGER)
            if ret is not None and builder.mutation_manager is not None:
                adjustFileObjs(ret, builder.mutation_manager.set_generation)
            return ret if ret is not None else {}
        except validate.ValidationException as exc:
            raise WorkflowException("Error validating output record: {!s}\nIn:\n{}"
                                    .format(exc, json.dumps(ret, indent=4)))
        finally:
            if builder.mutation_manager and readers:
                for reader in readers.values():
                    builder.mutation_manager.release_reader(jobname, reader)
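
collect_output_ports honors the CWL convention that a tool may write cwl.output.json into its output directory: when that file exists, it is loaded verbatim as the output object instead of collecting each port individually. A minimal sketch of that branch, using plain filesystem calls in place of the builder's fs_access abstraction:

import json
import os
import tempfile

def collect_outputs(outdir):
    # CWL convention: a literal cwl.output.json in outdir overrides port collection.
    custom_output = os.path.join(outdir, "cwl.output.json")
    if os.path.exists(custom_output):
        with open(custom_output, "r") as f:
            return json.load(f)
    return {}  # otherwise each output port would be collected individually

with tempfile.TemporaryDirectory() as outdir:
    with open(os.path.join(outdir, "cwl.output.json"), "w") as f:
        json.dump({"answer": 42}, f)
    print(collect_outputs(outdir))  # {'answer': 42}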
Example #3
def upload_dependencies(arvrunner,
                        name,
                        document_loader,
                        workflowobj,
                        uri,
                        loadref_run,
                        include_primary=True,
                        discovered_secondaryfiles=None):
    """Upload the dependencies of the workflowobj document to Keep.

    Returns a pathmapper object mapping local paths to keep references.  Also
    does an in-place update of references in "workflowobj".

    Use scandeps to find $import, $include, $schemas, run, File and Directory
    fields that represent external references.

    If workflowobj has an "id" field, this will reload the document to ensure
    it is scanning the raw document prior to preprocessing.
    """

    loaded = set()

    def loadref(b, u):
        joined = document_loader.fetcher.urljoin(b, u)
        defrg, _ = urllib.parse.urldefrag(joined)
        if defrg not in loaded:
            loaded.add(defrg)
            # Use fetch_text to get raw file (before preprocessing).
            text = document_loader.fetch_text(defrg)
            if isinstance(text, bytes):
                textIO = StringIO(text.decode('utf-8'))
            else:
                textIO = StringIO(text)
            return yaml.safe_load(textIO)
        else:
            return {}

    if loadref_run:
        loadref_fields = set(("$import", "run"))
    else:
        loadref_fields = set(("$import", ))

    scanobj = workflowobj
    if "id" in workflowobj and not workflowobj["id"].startswith("_:"):
        # Need raw file content (before preprocessing) to ensure
        # that external references in $include and $mixin are captured.
        scanobj = loadref("", workflowobj["id"])

    metadata = scanobj

    sc_result = scandeps(uri,
                         scanobj,
                         loadref_fields,
                         set(("$include", "$schemas", "location")),
                         loadref,
                         urljoin=document_loader.fetcher.urljoin)

    sc = []
    uuids = {}

    def collect_uuids(obj):
        loc = obj.get("location", "")
        sp = loc.split(":")
        if sp[0] == "keep":
            # Collect collection uuids that need to be resolved to
            # portable data hashes
            gp = collection_uuid_pattern.match(loc)
            if gp:
                uuids[gp.groups()[0]] = obj
            if collectionUUID in obj:
                uuids[obj[collectionUUID]] = obj

    def collect_uploads(obj):
        loc = obj.get("location", "")
        sp = loc.split(":")
        if len(sp) < 1:
            return
        if sp[0] in ("file", "http", "https"):
            # Record local files that need to be uploaded;
            # don't include file literals, keep references, etc.
            sc.append(obj)
        collect_uuids(obj)

    visit_class(workflowobj, ("File", "Directory"), collect_uuids)
    visit_class(sc_result, ("File", "Directory"), collect_uploads)

    # Resolve any collection uuids we found to portable data hashes
    # and assign them to uuid_map
    uuid_map = {}
    fetch_uuids = list(uuids.keys())
    while fetch_uuids:
        # For a large number of fetch_uuids, the API server may limit
        # response size, so keep fetching until the API server has
        # nothing more to give us.
        lookups = arvrunner.api.collections().list(
            filters=[["uuid", "in", fetch_uuids]],
            count="none",
            select=["uuid", "portable_data_hash"
                    ]).execute(num_retries=arvrunner.num_retries)

        if not lookups["items"]:
            break

        for l in lookups["items"]:
            uuid_map[l["uuid"]] = l["portable_data_hash"]

        fetch_uuids = [u for u in fetch_uuids if u not in uuid_map]

    normalizeFilesDirs(sc)

    if include_primary and "id" in workflowobj:
        sc.append({"class": "File", "location": workflowobj["id"]})

    if "$schemas" in workflowobj:
        for s in workflowobj["$schemas"]:
            sc.append({"class": "File", "location": s})

    def visit_default(obj):
        remove = [False]

        def ensure_default_location(f):
            if "location" not in f and "path" in f:
                f["location"] = f["path"]
                del f["path"]
            if "location" in f and not arvrunner.fs_access.exists(
                    f["location"]):
                # Doesn't exist, remove from list of dependencies to upload
                sc[:] = [x for x in sc if x["location"] != f["location"]]
                # Delete "default" from workflowobj
                remove[0] = True

        visit_class(obj["default"], ("File", "Directory"),
                    ensure_default_location)
        if remove[0]:
            del obj["default"]

    find_defaults(workflowobj, visit_default)

    discovered = {}

    def discover_default_secondary_files(obj):
        builder_job_order = {}
        for t in obj["inputs"]:
            builder_job_order[shortname(t["id"])] = t.get("default")
        # Need to create a builder object to evaluate expressions.
        builder = make_builder(builder_job_order, obj.get("hints", []),
                               obj.get("requirements", []),
                               ArvRuntimeContext(), metadata)
        discover_secondary_files(arvrunner.fs_access, builder, obj["inputs"],
                                 builder_job_order, discovered)

    copied, _ = document_loader.resolve_all(copy.deepcopy(cmap(workflowobj)),
                                            base_url=uri,
                                            checklinks=False)
    visit_class(copied, ("CommandLineTool", "Workflow"),
                discover_default_secondary_files)

    for d in list(discovered):
        # Only interested in discovered secondaryFiles which are local
        # files that need to be uploaded.
        if d.startswith("file:"):
            sc.extend(discovered[d])
        else:
            del discovered[d]

    mapper = ArvPathMapper(arvrunner,
                           sc,
                           "",
                           "keep:%s",
                           "keep:%s/%s",
                           name=name,
                           single_collection=True)

    def setloc(p):
        loc = p.get("location")
        if loc and (not loc.startswith("_:")) and (
                not loc.startswith("keep:")):
            p["location"] = mapper.mapper(p["location"]).resolved
            return

        if not loc:
            return

        if collectionUUID in p:
            uuid = p[collectionUUID]
            if uuid not in uuid_map:
                raise SourceLine(p, collectionUUID,
                                 validate.ValidationException).makeError(
                                     "Collection uuid %s not found" % uuid)
            gp = collection_pdh_pattern.match(loc)
            if gp and uuid_map[uuid] != gp.groups()[0]:
                # This file entry has both collectionUUID and a PDH
                # location. If the PDH doesn't match the one returned
                # by the API server, raise an error.
                raise SourceLine(
                    p, "location", validate.ValidationException
                ).makeError(
                    "Expected collection uuid %s to be %s but API server reported %s"
                    % (uuid, gp.groups()[0], uuid_map[p[collectionUUID]]))

        gp = collection_uuid_pattern.match(loc)
        if not gp:
            return
        uuid = gp.groups()[0]
        if uuid not in uuid_map:
            raise SourceLine(p, "location",
                             validate.ValidationException).makeError(
                                 "Collection uuid %s not found" % uuid)
        p["location"] = "keep:%s%s" % (uuid_map[uuid], gp.groups()[1]
                                       if gp.groups()[1] else "")
        p[collectionUUID] = uuid

    visit_class(workflowobj, ("File", "Directory"), setloc)
    visit_class(discovered, ("File", "Directory"), setloc)

    if discovered_secondaryfiles is not None:
        for d in discovered:
            discovered_secondaryfiles[mapper.mapper(
                d).resolved] = discovered[d]

    if "$schemas" in workflowobj:
        sch = CommentedSeq()
        for s in workflowobj["$schemas"]:
            if s in mapper:
                sch.append(mapper.mapper(s).resolved)
        workflowobj["$schemas"] = sch

    return mapper
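
The uuid-resolution loop above pages through collections().list() because the API server may cap response size; it keeps asking for the still-unresolved uuids until a round trip returns nothing. The same loop shape with a stubbed batch lookup (TABLE and the two-results-per-call limit are illustrative assumptions):

def resolve_all(fetch_uuids, lookup_batch):
    # lookup_batch returns a possibly-partial {uuid: pdh} mapping per call,
    # mimicking an API server that limits response size.
    uuid_map = {}
    while fetch_uuids:
        found = lookup_batch(fetch_uuids)
        if not found:
            break  # the server has nothing more to give us
        uuid_map.update(found)
        # Drop everything resolved so far and ask again for the rest.
        fetch_uuids = [u for u in fetch_uuids if u not in uuid_map]
    return uuid_map

TABLE = {"u1": "pdh1", "u2": "pdh2", "u3": "pdh3"}
batch = lambda uuids: {u: TABLE[u] for u in uuids[:2] if u in TABLE}
print(resolve_all(["u1", "u2", "u3", "u4"], batch))  # u4 stays unresolved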
Example #4
def upload_dependencies(arvrunner, name, document_loader,
                        workflowobj, uri, loadref_run,
                        include_primary=True, discovered_secondaryfiles=None):
    """Upload the dependencies of the workflowobj document to Keep.

    Returns a pathmapper object mapping local paths to keep references.  Also
    does an in-place update of references in "workflowobj".

    Use scandeps to find $import, $include, $schemas, run, File and Directory
    fields that represent external references.

    If workflowobj has an "id" field, this will reload the document to ensure
    it is scanning the raw document prior to preprocessing.
    """

    loaded = set()
    def loadref(b, u):
        joined = document_loader.fetcher.urljoin(b, u)
        defrg, _ = urlparse.urldefrag(joined)
        if defrg not in loaded:
            loaded.add(defrg)
            # Use fetch_text to get raw file (before preprocessing).
            text = document_loader.fetch_text(defrg)
            if isinstance(text, bytes):
                textIO = StringIO(text.decode('utf-8'))
            else:
                textIO = StringIO(text)
            return yaml.safe_load(textIO)
        else:
            return {}

    if loadref_run:
        loadref_fields = set(("$import", "run"))
    else:
        loadref_fields = set(("$import",))

    scanobj = workflowobj
    if "id" in workflowobj:
        # Need raw file content (before preprocessing) to ensure
        # that external references in $include and $mixin are captured.
        scanobj = loadref("", workflowobj["id"])

    sc_result = scandeps(uri, scanobj,
                         loadref_fields,
                         set(("$include", "$schemas", "location")),
                         loadref, urljoin=document_loader.fetcher.urljoin)

    sc = []
    def only_real(obj):
        if obj.get("location", "").startswith("file:"):
            sc.append(obj)

    visit_class(sc_result, ("File", "Directory"), only_real)

    normalizeFilesDirs(sc)

    if include_primary and "id" in workflowobj:
        sc.append({"class": "File", "location": workflowobj["id"]})

    if "$schemas" in workflowobj:
        for s in workflowobj["$schemas"]:
            sc.append({"class": "File", "location": s})

    def visit_default(obj):
        remove = [False]
        def ensure_default_location(f):
            if "location" not in f and "path" in f:
                f["location"] = f["path"]
                del f["path"]
            if "location" in f and not arvrunner.fs_access.exists(f["location"]):
                # Doesn't exist, remove from list of dependencies to upload
                sc[:] = [x for x in sc if x["location"] != f["location"]]
                # Delete "default" from workflowobj
                remove[0] = True
        visit_class(obj["default"], ("File", "Directory"), ensure_default_location)
        if remove[0]:
            del obj["default"]

    find_defaults(workflowobj, visit_default)

    discovered = {}
    def discover_default_secondary_files(obj):
        discover_secondary_files(obj["inputs"],
                                 {shortname(t["id"]): t["default"] for t in obj["inputs"] if "default" in t},
                                 discovered)

    visit_class(workflowobj, ("CommandLineTool", "Workflow"), discover_default_secondary_files)

    for d in discovered:
        sc.extend(discovered[d])

    mapper = ArvPathMapper(arvrunner, sc, "",
                           "keep:%s",
                           "keep:%s/%s",
                           name=name,
                           single_collection=True)

    def setloc(p):
        if "location" in p and (not p["location"].startswith("_:")) and (not p["location"].startswith("keep:")):
            p["location"] = mapper.mapper(p["location"]).resolved

    visit_class(workflowobj, ("File", "Directory"), setloc)
    visit_class(discovered, ("File", "Directory"), setloc)

    if discovered_secondaryfiles is not None:
        for d in discovered:
            discovered_secondaryfiles[mapper.mapper(d).resolved] = discovered[d]

    if "$schemas" in workflowobj:
        sch = []
        for s in workflowobj["$schemas"]:
            sch.append(mapper.mapper(s).resolved)
        workflowobj["$schemas"] = sch

    return mapper
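
Several of these examples lean on visit_class from cwltool to reach every File and Directory object nested anywhere in a document. A minimal reimplementation of that traversal, shown only to make the walk explicit (the examples themselves use cwltool's own visit_class):

def visit_class(rec, cls, op):
    # Recursively apply op to every dict whose "class" field is in cls.
    if isinstance(rec, dict):
        if rec.get("class") in cls:
            op(rec)
        for v in rec.values():
            visit_class(v, cls, op)
    elif isinstance(rec, list):
        for v in rec:
            visit_class(v, cls, op)

doc = {"inputs": [{"class": "File", "location": "file:///data/a.txt"},
                  {"class": "Directory", "location": "file:///data/d"}]}
found = []
visit_class(doc, ("File", "Directory"), lambda f: found.append(f["location"]))
print(found)  # ['file:///data/a.txt', 'file:///data/d']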
Example #5
def upload_dependencies(arvrunner, name, document_loader, workflowobj, uri,
                        loadref_run):
    """Upload the dependencies of the workflowobj document to Keep.

    Returns a pathmapper object mapping local paths to keep references.  Also
    does an in-place update of references in "workflowobj".

    Use scandeps to find $import, $include, $schemas, run, File and Directory
    fields that represent external references.

    If workflowobj has an "id" field, this will reload the document to ensure
    it is scanning the raw document prior to preprocessing.
    """

    loaded = set()

    def loadref(b, u):
        joined = document_loader.fetcher.urljoin(b, u)
        defrg, _ = urlparse.urldefrag(joined)
        if defrg not in loaded:
            loaded.add(defrg)
            # Use fetch_text to get raw file (before preprocessing).
            text = document_loader.fetch_text(defrg)
            if isinstance(text, bytes):
                textIO = StringIO(text.decode('utf-8'))
            else:
                textIO = StringIO(text)
            return yaml.safe_load(textIO)
        else:
            return {}

    if loadref_run:
        loadref_fields = set(("$import", "run"))
    else:
        loadref_fields = set(("$import", ))

    scanobj = workflowobj
    if "id" in workflowobj:
        # Need raw file content (before preprocessing) to ensure
        # that external references in $include and $mixin are captured.
        scanobj = loadref("", workflowobj["id"])

    sc = scandeps(uri,
                  scanobj,
                  loadref_fields,
                  set(("$include", "$schemas", "location")),
                  loadref,
                  urljoin=document_loader.fetcher.urljoin)

    normalizeFilesDirs(sc)

    if "id" in workflowobj:
        sc.append({"class": "File", "location": workflowobj["id"]})

    mapper = ArvPathMapper(arvrunner,
                           sc,
                           "",
                           "keep:%s",
                           "keep:%s/%s",
                           name=name)

    def setloc(p):
        if "location" in p and (not p["location"].startswith("_:")) and (
                not p["location"].startswith("keep:")):
            p["location"] = mapper.mapper(p["location"]).resolved

    adjustFileObjs(workflowobj, setloc)
    adjustDirObjs(workflowobj, setloc)

    return mapper
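
setloc deliberately skips two location forms: a "_:" prefix marks a file literal that exists only inside the document, and "keep:" locations are already Keep references. A compact sketch of that guard, with a hypothetical RESOLVED table standing in for the ArvPathMapper result:

# Hypothetical uploaded-path table standing in for mapper.mapper(...).resolved.
RESOLVED = {"file:///data/input.txt": "keep:d41d8cd98f00b204e9800998ecf8427e+0/input.txt"}

def setloc(p):
    loc = p["location"]
    if loc.startswith("_:") or loc.startswith("keep:"):
        return  # literal or existing keep reference: leave untouched
    p["location"] = RESOLVED[loc]

for f in [{"location": "file:///data/input.txt"},
          {"location": "keep:ffffffffffffffffffffffffffffffff+0/x"},
          {"location": "_:literal-0"}]:
    setloc(f)
    print(f["location"])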
Example #6
def upload_dependencies(arvrunner, name, document_loader,
                        workflowobj, uri, loadref_run, include_primary=True):
    """Upload the dependencies of the workflowobj document to Keep.

    Returns a pathmapper object mapping local paths to keep references.  Also
    does an in-place update of references in "workflowobj".

    Use scandeps to find $import, $include, $schemas, run, File and Directory
    fields that represent external references.

    If workflowobj has an "id" field, this will reload the document to ensure
    it is scanning the raw document prior to preprocessing.
    """

    loaded = set()
    def loadref(b, u):
        joined = document_loader.fetcher.urljoin(b, u)
        defrg, _ = urlparse.urldefrag(joined)
        if defrg not in loaded:
            loaded.add(defrg)
            # Use fetch_text to get raw file (before preprocessing).
            text = document_loader.fetch_text(defrg)
            if isinstance(text, bytes):
                textIO = StringIO(text.decode('utf-8'))
            else:
                textIO = StringIO(text)
            return yaml.safe_load(textIO)
        else:
            return {}

    if loadref_run:
        loadref_fields = set(("$import", "run"))
    else:
        loadref_fields = set(("$import",))

    scanobj = workflowobj
    if "id" in workflowobj:
        # Need raw file content (before preprocessing) to ensure
        # that external references in $include and $mixin are captured.
        scanobj = loadref("", workflowobj["id"])

    sc = scandeps(uri, scanobj,
                  loadref_fields,
                  set(("$include", "$schemas", "location")),
                  loadref, urljoin=document_loader.fetcher.urljoin)

    normalizeFilesDirs(sc)

    if include_primary and "id" in workflowobj:
        sc.append({"class": "File", "location": workflowobj["id"]})

    if "$schemas" in workflowobj:
        for s in workflowobj["$schemas"]:
            sc.append({"class": "File", "location": s})

    def capture_default(obj):
        remove = [False]
        def add_default(f):
            if "location" not in f and "path" in f:
                f["location"] = f["path"]
                del f["path"]
            if "location" in f and not arvrunner.fs_access.exists(f["location"]):
                # Remove from sc
                sc[:] = [x for x in sc if x["location"] != f["location"]]
                # Delete "default" from workflowobj
                remove[0] = True
        visit_class(obj["default"], ("File", "Directory"), add_default)
        if remove[0]:
            del obj["default"]

    find_defaults(workflowobj, capture_default)

    mapper = ArvPathMapper(arvrunner, sc, "",
                           "keep:%s",
                           "keep:%s/%s",
                           name=name,
                           single_collection=True)

    def setloc(p):
        if "location" in p and (not p["location"].startswith("_:")) and (not p["location"].startswith("keep:")):
            p["location"] = mapper.mapper(p["location"]).resolved
    adjustFileObjs(workflowobj, setloc)
    adjustDirObjs(workflowobj, setloc)

    if "$schemas" in workflowobj:
        sch = []
        for s in workflowobj["$schemas"]:
            sch.append(mapper.mapper(s).resolved)
        workflowobj["$schemas"] = sch

    return mapper
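
capture_default signals from the inner closure back to the enclosing function through the single-element list remove = [False], the usual workaround from before Python 3's nonlocal. A sketch of both spellings side by side:

def any_missing_py2_style(files, exists):
    remove = [False]            # mutable cell: Py2 closures cannot rebind outer names
    def visit(f):
        if not exists(f["location"]):
            remove[0] = True
    for f in files:
        visit(f)
    return remove[0]

def any_missing_py3_style(files, exists):
    remove = False
    def visit(f):
        nonlocal remove         # Py3 rebinds the enclosing variable directly
        if not exists(f["location"]):
            remove = True
    for f in files:
        visit(f)
    return remove

files = [{"location": "file:///missing"}]
print(any_missing_py2_style(files, lambda loc: False))  # True
print(any_missing_py3_style(files, lambda loc: False))  # True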
Example #7
def upload_dependencies(arvrunner, name, document_loader,
                        workflowobj, uri, loadref_run,
                        include_primary=True, discovered_secondaryfiles=None):
    """Upload the dependencies of the workflowobj document to Keep.

    Returns a pathmapper object mapping local paths to keep references.  Also
    does an in-place update of references in "workflowobj".

    Use scandeps to find $import, $include, $schemas, run, File and Directory
    fields that represent external references.

    If workflowobj has an "id" field, this will reload the document to ensure
    it is scanning the raw document prior to preprocessing.
    """

    loaded = set()
    def loadref(b, u):
        joined = document_loader.fetcher.urljoin(b, u)
        defrg, _ = urllib.parse.urldefrag(joined)
        if defrg not in loaded:
            loaded.add(defrg)
            # Use fetch_text to get raw file (before preprocessing).
            text = document_loader.fetch_text(defrg)
            if isinstance(text, bytes):
                textIO = StringIO(text.decode('utf-8'))
            else:
                textIO = StringIO(text)
            return yaml.safe_load(textIO)
        else:
            return {}

    if loadref_run:
        loadref_fields = set(("$import", "run"))
    else:
        loadref_fields = set(("$import",))

    scanobj = workflowobj
    if "id" in workflowobj:
        # Need raw file content (before preprocessing) to ensure
        # that external references in $include and $mixin are captured.
        scanobj = loadref("", workflowobj["id"])

    sc_result = scandeps(uri, scanobj,
                         loadref_fields,
                         set(("$include", "$schemas", "location")),
                         loadref, urljoin=document_loader.fetcher.urljoin)

    sc = []
    uuids = {}

    def collect_uuids(obj):
        loc = obj.get("location", "")
        sp = loc.split(":")
        if sp[0] == "keep":
            # Collect collection uuids that need to be resolved to
            # portable data hashes
            gp = collection_uuid_pattern.match(loc)
            if gp:
                uuids[gp.groups()[0]] = obj
            if collectionUUID in obj:
                uuids[obj[collectionUUID]] = obj

    def collect_uploads(obj):
        loc = obj.get("location", "")
        sp = loc.split(":")
        if len(sp) < 1:
            return
        if sp[0] in ("file", "http", "https"):
            # Record local files that need to be uploaded;
            # don't include file literals, keep references, etc.
            sc.append(obj)
        collect_uuids(obj)

    visit_class(workflowobj, ("File", "Directory"), collect_uuids)
    visit_class(sc_result, ("File", "Directory"), collect_uploads)

    # Resolve any collection uuids we found to portable data hashes
    # and assign them to uuid_map
    uuid_map = {}
    fetch_uuids = list(uuids.keys())
    while fetch_uuids:
        # For a large number of fetch_uuids, the API server may limit
        # response size, so keep fetching until the API server has
        # nothing more to give us.
        lookups = arvrunner.api.collections().list(
            filters=[["uuid", "in", fetch_uuids]],
            count="none",
            select=["uuid", "portable_data_hash"]).execute(
                num_retries=arvrunner.num_retries)

        if not lookups["items"]:
            break

        for l in lookups["items"]:
            uuid_map[l["uuid"]] = l["portable_data_hash"]

        fetch_uuids = [u for u in fetch_uuids if u not in uuid_map]

    normalizeFilesDirs(sc)

    if include_primary and "id" in workflowobj:
        sc.append({"class": "File", "location": workflowobj["id"]})

    if "$schemas" in workflowobj:
        for s in workflowobj["$schemas"]:
            sc.append({"class": "File", "location": s})

    def visit_default(obj):
        remove = [False]
        def ensure_default_location(f):
            if "location" not in f and "path" in f:
                f["location"] = f["path"]
                del f["path"]
            if "location" in f and not arvrunner.fs_access.exists(f["location"]):
                # Doesn't exist, remove from list of dependencies to upload
                sc[:] = [x for x in sc if x["location"] != f["location"]]
                # Delete "default" from workflowobj
                remove[0] = True
        visit_class(obj["default"], ("File", "Directory"), ensure_default_location)
        if remove[0]:
            del obj["default"]

    find_defaults(workflowobj, visit_default)

    discovered = {}
    def discover_default_secondary_files(obj):
        discover_secondary_files(obj["inputs"],
                                 {shortname(t["id"]): t["default"] for t in obj["inputs"] if "default" in t},
                                 discovered)

    visit_class(workflowobj, ("CommandLineTool", "Workflow"), discover_default_secondary_files)

    for d in list(discovered):
        # Only interested in discovered secondaryFiles which are local
        # files that need to be uploaded.
        if d.startswith("file:"):
            sc.extend(discovered[d])
        else:
            del discovered[d]

    mapper = ArvPathMapper(arvrunner, sc, "",
                           "keep:%s",
                           "keep:%s/%s",
                           name=name,
                           single_collection=True)

    def setloc(p):
        loc = p.get("location")
        if loc and (not loc.startswith("_:")) and (not loc.startswith("keep:")):
            p["location"] = mapper.mapper(p["location"]).resolved
            return

        if not loc:
            return

        if collectionUUID in p:
            uuid = p[collectionUUID]
            if uuid not in uuid_map:
                raise SourceLine(p, collectionUUID, validate.ValidationException).makeError(
                    "Collection uuid %s not found" % uuid)
            gp = collection_pdh_pattern.match(loc)
            if gp and uuid_map[uuid] != gp.groups()[0]:
                # This file entry has both collectionUUID and a PDH
                # location. If the PDH doesn't match the one returned
                # by the API server, raise an error.
                raise SourceLine(p, "location", validate.ValidationException).makeError(
                    "Expected collection uuid %s to be %s but API server reported %s" % (
                        uuid, gp.groups()[0], uuid_map[p[collectionUUID]]))

        gp = collection_uuid_pattern.match(loc)
        if not gp:
            return
        uuid = gp.groups()[0]
        if uuid not in uuid_map:
            raise SourceLine(p, "location", validate.ValidationException).makeError(
                "Collection uuid %s not found" % uuid)
        p["location"] = "keep:%s%s" % (uuid_map[uuid], gp.groups()[1] if gp.groups()[1] else "")
        p[collectionUUID] = uuid

    visit_class(workflowobj, ("File", "Directory"), setloc)
    visit_class(discovered, ("File", "Directory"), setloc)

    if discovered_secondaryfiles is not None:
        for d in discovered:
            discovered_secondaryfiles[mapper.mapper(d).resolved] = discovered[d]

    if "$schemas" in workflowobj:
        sch = []
        for s in workflowobj["$schemas"]:
            sch.append(mapper.mapper(s).resolved)
        workflowobj["$schemas"] = sch

    return mapper
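
The tail of setloc rewrites keep:<collection-uuid>/path locations to keep:<pdh>/path using the resolved uuid map, and records the uuid back on the object. A sketch of that rewrite; the UUID_LOC pattern, the "arv:collectionUUID" key, and the map contents are illustrative stand-ins for collection_uuid_pattern, the collectionUUID constant, and the API lookups in arvados-cwl-runner:

import re

# Assumed shape for keep:<collection-uuid>[/path]; the real pattern may differ.
UUID_LOC = re.compile(r"^keep:([a-z0-9]{5}-4zz18-[a-z0-9]{15})(/.*)?$")
UUID_MAP = {"zzzzz-4zz18-0123456789abcde": "d41d8cd98f00b204e9800998ecf8427e+0"}

def rewrite(p):
    gp = UUID_LOC.match(p["location"])
    if not gp:
        return
    uuid, tail = gp.groups()
    if uuid not in UUID_MAP:
        raise ValueError("Collection uuid %s not found" % uuid)
    p["location"] = "keep:%s%s" % (UUID_MAP[uuid], tail or "")
    p["arv:collectionUUID"] = uuid  # remember the uuid for later steps

f = {"location": "keep:zzzzz-4zz18-0123456789abcde/data.txt"}
rewrite(f)
print(f["location"])  # keep:d41d8cd98f00b204e9800998ecf8427e+0/data.txt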
Example #8
def upload_dependencies(arvrunner, name, document_loader,
                        workflowobj, uri, loadref_run,
                        include_primary=True, discovered_secondaryfiles=None):
    """Upload the dependencies of the workflowobj document to Keep.

    Returns a pathmapper object mapping local paths to keep references.  Also
    does an in-place update of references in "workflowobj".

    Use scandeps to find $import, $include, $schemas, run, File and Directory
    fields that represent external references.

    If workflowobj has an "id" field, this will reload the document to ensure
    it is scanning the raw document prior to preprocessing.
    """

    loaded = set()
    def loadref(b, u):
        joined = document_loader.fetcher.urljoin(b, u)
        defrg, _ = urlparse.urldefrag(joined)
        if defrg not in loaded:
            loaded.add(defrg)
            # Use fetch_text to get raw file (before preprocessing).
            text = document_loader.fetch_text(defrg)
            if isinstance(text, bytes):
                textIO = StringIO(text.decode('utf-8'))
            else:
                textIO = StringIO(text)
            return yaml.safe_load(textIO)
        else:
            return {}

    if loadref_run:
        loadref_fields = set(("$import", "run"))
    else:
        loadref_fields = set(("$import",))

    scanobj = workflowobj
    if "id" in workflowobj:
        # Need raw file content (before preprocessing) to ensure
        # that external references in $include and $mixin are captured.
        scanobj = loadref("", workflowobj["id"])

    sc_result = scandeps(uri, scanobj,
                         loadref_fields,
                         set(("$include", "$schemas", "location")),
                         loadref, urljoin=document_loader.fetcher.urljoin)

    sc = []
    def only_real(obj):
        # Only interested in local files that need to be uploaded;
        # don't include file literals, keep references, etc.
        sp = obj.get("location", "").split(":")
        if len(sp) > 1 and sp[0] in ("file", "http", "https"):
            sc.append(obj)

    visit_class(sc_result, ("File", "Directory"), only_real)

    normalizeFilesDirs(sc)

    if include_primary and "id" in workflowobj:
        sc.append({"class": "File", "location": workflowobj["id"]})

    if "$schemas" in workflowobj:
        for s in workflowobj["$schemas"]:
            sc.append({"class": "File", "location": s})

    def visit_default(obj):
        remove = [False]
        def ensure_default_location(f):
            if "location" not in f and "path" in f:
                f["location"] = f["path"]
                del f["path"]
            if "location" in f and not arvrunner.fs_access.exists(f["location"]):
                # Doesn't exist, remove from list of dependencies to upload
                sc[:] = [x for x in sc if x["location"] != f["location"]]
                # Delete "default" from workflowobj
                remove[0] = True
        visit_class(obj["default"], ("File", "Directory"), ensure_default_location)
        if remove[0]:
            del obj["default"]

    find_defaults(workflowobj, visit_default)

    discovered = {}
    def discover_default_secondary_files(obj):
        discover_secondary_files(obj["inputs"],
                                 {shortname(t["id"]): t["default"] for t in obj["inputs"] if "default" in t},
                                 discovered)

    visit_class(workflowobj, ("CommandLineTool", "Workflow"), discover_default_secondary_files)

    for d in list(discovered.keys()):
        # Only interested in discovered secondaryFiles which are local
        # files that need to be uploaded.
        if d.startswith("file:"):
            sc.extend(discovered[d])
        else:
            del discovered[d]

    mapper = ArvPathMapper(arvrunner, sc, "",
                           "keep:%s",
                           "keep:%s/%s",
                           name=name,
                           single_collection=True)

    def setloc(p):
        if "location" in p and (not p["location"].startswith("_:")) and (not p["location"].startswith("keep:")):
            p["location"] = mapper.mapper(p["location"]).resolved

    visit_class(workflowobj, ("File", "Directory"), setloc)
    visit_class(discovered, ("File", "Directory"), setloc)

    if discovered_secondaryfiles is not None:
        for d in discovered:
            discovered_secondaryfiles[mapper.mapper(d).resolved] = discovered[d]

    if "$schemas" in workflowobj:
        sch = []
        for s in workflowobj["$schemas"]:
            sch.append(mapper.mapper(s).resolved)
        workflowobj["$schemas"] = sch

    return mapper
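
only_real decides what is an upload candidate purely from the location's URI scheme: file, http, and https qualify, while keep: references and "_:" file literals do not. The same test, distilled into a standalone predicate:

def wants_upload(loc):
    # Split off the scheme; only local or fetchable remote schemes are uploaded.
    sp = loc.split(":")
    return len(sp) > 1 and sp[0] in ("file", "http", "https")

for loc in ["file:///tmp/a.txt",
            "https://example.com/schema.yml",
            "keep:d41d8cd98f00b204e9800998ecf8427e+0/a",
            "_:literal-1"]:
    print(loc, "->", wants_upload(loc))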
Example #9
def run():
    # Timestamps are added by crunch-job, so don't print redundant timestamps.
    arvados.log_handler.setFormatter(
        logging.Formatter('%(name)s %(levelname)s: %(message)s'))

    # Print package versions
    logger.info(arvados_cwl.versionstring())

    api = arvados.api("v1")

    arvados_cwl.add_arv_hints()

    runner = None
    try:
        job_order_object = arvados.current_job()['script_parameters']
        toolpath = "file://%s/%s" % (os.environ['TASK_KEEPMOUNT'],
                                     job_order_object.pop("cwl:tool"))

        pdh_path = re.compile(r'^[0-9a-f]{32}\+\d+(/.+)?$')

        def keeppath(v):
            if pdh_path.match(v):
                return "keep:%s" % v
            else:
                return v

        def keeppathObj(v):
            v["location"] = keeppath(v["location"])

        for k, v in job_order_object.items():
            if isinstance(v, basestring) and arvados.util.keep_locator_pattern.match(v):
                job_order_object[k] = {
                    "class": "File",
                    "location": "keep:%s" % v
                }

        adjustFileObjs(job_order_object, keeppathObj)
        adjustDirObjs(job_order_object, keeppathObj)
        normalizeFilesDirs(job_order_object)
        adjustDirObjs(
            job_order_object,
            functools.partial(
                getListing,
                arvados_cwl.fsaccess.CollectionFsAccess("", api_client=api)))

        output_name = None
        output_tags = None
        enable_reuse = True
        on_error = "continue"
        if "arv:output_name" in job_order_object:
            output_name = job_order_object["arv:output_name"]
            del job_order_object["arv:output_name"]

        if "arv:output_tags" in job_order_object:
            output_tags = job_order_object["arv:output_tags"]
            del job_order_object["arv:output_tags"]

        if "arv:enable_reuse" in job_order_object:
            enable_reuse = job_order_object["arv:enable_reuse"]
            del job_order_object["arv:enable_reuse"]

        if "arv:on_error" in job_order_object:
            on_error = job_order_object["arv:on_error"]
            del job_order_object["arv:on_error"]

        runner = arvados_cwl.ArvCwlRunner(
            api_client=arvados.api('v1', model=OrderedJsonModel()),
            output_name=output_name,
            output_tags=output_tags)

        t = load_tool(toolpath,
                      runner.arv_make_tool,
                      fetcher_constructor=functools.partial(
                          CollectionFetcher,
                          api_client=api,
                          keep_client=arvados.keep.KeepClient(api_client=api,
                                                              num_retries=4)))

        args = argparse.Namespace()
        args.project_uuid = arvados.current_job()["owner_uuid"]
        args.enable_reuse = enable_reuse
        args.on_error = on_error
        args.submit = False
        args.debug = False
        args.quiet = False
        args.ignore_docker_for_reuse = False
        args.basedir = os.getcwd()
        args.name = None
        args.cwl_runner_job = {
            "uuid": arvados.current_job()["uuid"],
            "state": arvados.current_job()["state"]
        }
        runner.arv_executor(t, job_order_object, **vars(args))
    except Exception as e:
        if isinstance(e, WorkflowException):
            logging.info("Workflow error %s", e)
        else:
            logging.exception("Unhandled exception")
        if runner and runner.final_output_collection:
            outputCollection = runner.final_output_collection.portable_data_hash()
        else:
            outputCollection = None
        api.job_tasks().update(uuid=arvados.current_task()['uuid'],
                               body={
                                   'output': outputCollection,
                                   'success': False,
                                   'progress': 1.0
                               }).execute()
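
run() promotes bare Keep locator strings in the job order into CWL File objects with keep: locations. A self-contained sketch of that normalization; the pdh_path regex is the one from the example, standing in for the stricter arvados.util.keep_locator_pattern:

import re

# Portable data hash plus optional path, as in the example above.
pdh_path = re.compile(r'^[0-9a-f]{32}\+\d+(/.+)?$')

def promote(job_order):
    for k, v in job_order.items():
        if isinstance(v, str) and pdh_path.match(v):
            job_order[k] = {"class": "File", "location": "keep:%s" % v}

job_order = {"reads": "d41d8cd98f00b204e9800998ecf8427e+0/reads.fastq",
             "threads": 4}
promote(job_order)
print(job_order["reads"])  # now a CWL File object with a keep: location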
Example #10
def run():
    # Print package versions
    logger.info(arvados_cwl.versionstring())

    api = arvados.api("v1")

    arvados_cwl.add_arv_hints()

    try:
        job_order_object = arvados.current_job()['script_parameters']

        pdh_path = re.compile(r'^[0-9a-f]{32}\+\d+(/.+)?$')

        def keeppath(v):
            if pdh_path.match(v):
                return "keep:%s" % v
            else:
                return v

        def keeppathObj(v):
            v["location"] = keeppath(v["location"])

        job_order_object["cwl:tool"] = "file://%s/%s" % (os.environ['TASK_KEEPMOUNT'], job_order_object["cwl:tool"])

        for k, v in job_order_object.items():
            if isinstance(v, basestring) and arvados.util.keep_locator_pattern.match(v):
                job_order_object[k] = {
                    "class": "File",
                    "location": "keep:%s" % v
                }

        adjustFileObjs(job_order_object, keeppathObj)
        adjustDirObjs(job_order_object, keeppathObj)
        normalizeFilesDirs(job_order_object)
        adjustDirObjs(job_order_object, functools.partial(getListing, arvados_cwl.fsaccess.CollectionFsAccess("", api_client=api)))

        output_name = None
        if "arv:output_name" in job_order_object:
            output_name = job_order_object["arv:output_name"]
            del job_order_object["arv:output_name"]

        runner = arvados_cwl.ArvCwlRunner(api_client=arvados.api('v1', model=OrderedJsonModel()),
                                          output_name=output_name)

        t = load_tool(job_order_object, runner.arv_make_tool)

        args = argparse.Namespace()
        args.project_uuid = arvados.current_job()["owner_uuid"]
        args.enable_reuse = True
        args.submit = False
        args.debug = True
        args.quiet = False
        args.ignore_docker_for_reuse = False
        args.basedir = os.getcwd()
        args.cwl_runner_job={"uuid": arvados.current_job()["uuid"], "state": arvados.current_job()["state"]}
        outputObj = runner.arv_executor(t, job_order_object, **vars(args))

        if runner.final_output_collection:
            outputCollection = runner.final_output_collection.portable_data_hash()
        else:
            outputCollection = None

        api.job_tasks().update(uuid=arvados.current_task()['uuid'],
                               body={
                                   'output': outputCollection,
                                   'success': True,
                                   'progress': 1.0
                               }).execute()
    except Exception as e:
        logging.exception("Unhandled exception")
        api.job_tasks().update(uuid=arvados.current_task()['uuid'],
                               body={
                                   'output': None,
                                   'success': False,
                                   'progress': 1.0
                               }).execute()
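
Both run() variants close out the crunch task with a single job_tasks().update() call whose body carries output, success, and progress. A sketch of that convergence, with fake_update standing in for api.job_tasks().update(...).execute() and a hypothetical task uuid:

def finish_task(update, task_uuid, output, success):
    # Happy path and failure path converge on the same update shape.
    update(uuid=task_uuid, body={
        "output": output,   # portable data hash, or None on failure
        "success": success,
        "progress": 1.0,
    })

def fake_update(uuid, body):  # stand-in for api.job_tasks().update(...).execute()
    print("update", uuid, body)

try:
    raise RuntimeError("workflow failed")
except Exception:
    finish_task(fake_update, "zzzzz-ot0gb-0123456789abcde", None, False)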