def arvados_job_spec(self, *args, **kwargs):
    self.upload_docker(self.tool)

    workflowfiles = []
    jobfiles = []
    workflowfiles.append({"class": "File", "location": self.tool.tool["id"]})

    self.name = os.path.basename(self.tool.tool["id"])

    def visitFiles(files, path):
        files.append(path)

    document_loader, workflowobj, uri = fetch_document(self.tool.tool["id"])
    loaded = set()

    def loadref(b, u):
        joined = urlparse.urljoin(b, u)
        if joined not in loaded:
            loaded.add(joined)
            return document_loader.fetch(urlparse.urljoin(b, u))
        else:
            return {}

    sc = scandeps(uri, workflowobj,
                  set(("$import", "run")),
                  set(("$include", "$schemas", "path", "location")),
                  loadref)
    adjustFileObjs(sc, partial(visitFiles, workflowfiles))
    adjustFileObjs(self.job_order, partial(visitFiles, jobfiles))
    adjustDirObjs(sc, partial(visitFiles, workflowfiles))
    adjustDirObjs(self.job_order, partial(visitFiles, jobfiles))

    normalizeFilesDirs(jobfiles)
    normalizeFilesDirs(workflowfiles)

    keepprefix = kwargs.get("keepprefix", "")
    workflowmapper = ArvPathMapper(self.arvrunner, workflowfiles, "",
                                   keepprefix + "%s",
                                   keepprefix + "%s/%s",
                                   name=self.name,
                                   **kwargs)

    jobmapper = ArvPathMapper(self.arvrunner, jobfiles, "",
                              keepprefix + "%s",
                              keepprefix + "%s/%s",
                              name=os.path.basename(self.job_order.get("id", "#")),
                              **kwargs)

    def setloc(p):
        p["location"] = jobmapper.mapper(p["location"])[1]

    adjustFileObjs(self.job_order, setloc)
    adjustDirObjs(self.job_order, setloc)

    if "id" in self.job_order:
        del self.job_order["id"]

    return workflowmapper
def collect_output_ports(self,
                         ports,                  # type: Set[Dict[Text, Any]]
                         builder,                # type: Builder
                         outdir,                 # type: Text
                         compute_checksum=True,  # type: bool
                         jobname="",             # type: Text
                         readers=None            # type: Dict[Text, Any]
                        ):  # type: (...) -> OutputPorts
    ret = {}  # type: OutputPorts
    debug = LOGGER.isEnabledFor(logging.DEBUG)
    try:
        fs_access = builder.make_fs_access(outdir)
        custom_output = fs_access.join(outdir, "cwl.output.json")
        if fs_access.exists(custom_output):
            with fs_access.open(custom_output, "r") as f:
                ret = json.load(f)
            if debug:
                LOGGER.debug(u"Raw output from %s: %s", custom_output,
                             json.dumps(ret, indent=4))
        else:
            for i, port in enumerate(ports):

                def make_workflow_exception(msg):
                    return WorkflowException(
                        u"Error collecting output for parameter '%s':\n%s"
                        % (shortname(port["id"]), msg))

                with SourceLine(ports, i, make_workflow_exception, debug):
                    fragment = shortname(port["id"])
                    ret[fragment] = self.collect_output(
                        port, builder, outdir, fs_access,
                        compute_checksum=compute_checksum)
        if ret:
            # revmap = partial(command_line_tool.revmap_file, builder, outdir)
            adjustDirObjs(ret, trim_listing)

            # TODO: Attempt to avoid a crash because the revmap function is not
            # functional (intended for Docker usage only?)
            # visit_class(ret, ("File", "Directory"),
            #             cast(Callable[[Any], Any], revmap))
            visit_class(ret, ("File", "Directory"), command_line_tool.remove_path)
            normalizeFilesDirs(ret)
            visit_class(ret, ("File", "Directory"),
                        partial(command_line_tool.check_valid_locations, fs_access))

            if compute_checksum:
                adjustFileObjs(ret, partial(compute_checksums, fs_access))

            validate.validate_ex(
                self.names.get_name("outputs_record_schema", ""),
                ret,
                strict=False,
                logger=LOGGER)
        if ret is not None and builder.mutation_manager is not None:
            adjustFileObjs(ret, builder.mutation_manager.set_generation)
        return ret if ret is not None else {}
    except validate.ValidationException as exc:
        raise WorkflowException(
            "Error validating output record: {!s}\nIn:\n{}"
            .format(exc, json.dumps(ret, indent=4)))
    finally:
        if builder.mutation_manager and readers:
            for reader in readers.values():
                builder.mutation_manager.release_reader(jobname, reader)
def upload_dependencies(arvrunner, name, document_loader,
                        workflowobj, uri, loadref_run,
                        include_primary=True, discovered_secondaryfiles=None):
    """Upload the dependencies of the workflowobj document to Keep.

    Returns a pathmapper object mapping local paths to keep references.  Also
    does an in-place update of references in "workflowobj".

    Use scandeps to find $import, $include, $schemas, run, File and Directory
    fields that represent external references.

    If workflowobj has an "id" field, this will reload the document to ensure
    it is scanning the raw document prior to preprocessing.
    """

    loaded = set()

    def loadref(b, u):
        joined = document_loader.fetcher.urljoin(b, u)
        defrg, _ = urllib.parse.urldefrag(joined)
        if defrg not in loaded:
            loaded.add(defrg)
            # Use fetch_text to get raw file (before preprocessing).
            text = document_loader.fetch_text(defrg)
            if isinstance(text, bytes):
                textIO = StringIO(text.decode('utf-8'))
            else:
                textIO = StringIO(text)
            return yaml.safe_load(textIO)
        else:
            return {}

    if loadref_run:
        loadref_fields = set(("$import", "run"))
    else:
        loadref_fields = set(("$import",))

    scanobj = workflowobj
    if "id" in workflowobj and not workflowobj["id"].startswith("_:"):
        # Need raw file content (before preprocessing) to ensure
        # that external references in $include and $mixin are captured.
        scanobj = loadref("", workflowobj["id"])

    metadata = scanobj

    sc_result = scandeps(uri, scanobj,
                         loadref_fields,
                         set(("$include", "$schemas", "location")),
                         loadref, urljoin=document_loader.fetcher.urljoin)

    sc = []
    uuids = {}

    def collect_uuids(obj):
        loc = obj.get("location", "")
        sp = loc.split(":")
        if sp[0] == "keep":
            # Collect collection uuids that need to be resolved to
            # portable data hashes
            gp = collection_uuid_pattern.match(loc)
            if gp:
                uuids[gp.groups()[0]] = obj
            if collectionUUID in obj:
                uuids[obj[collectionUUID]] = obj

    def collect_uploads(obj):
        loc = obj.get("location", "")
        sp = loc.split(":")
        if len(sp) < 1:
            return
        if sp[0] in ("file", "http", "https"):
            # Record local files that need to be uploaded,
            # don't include file literals, keep references, etc.
            sc.append(obj)
        collect_uuids(obj)

    visit_class(workflowobj, ("File", "Directory"), collect_uuids)
    visit_class(sc_result, ("File", "Directory"), collect_uploads)

    # Resolve any collection uuids we found to portable data hashes
    # and assign them to uuid_map
    uuid_map = {}
    fetch_uuids = list(uuids.keys())
    while fetch_uuids:
        # For a large number of fetch_uuids, the API server may limit the
        # response size, so keep fetching until the API server has nothing
        # more to give us.
        lookups = arvrunner.api.collections().list(
            filters=[["uuid", "in", fetch_uuids]],
            count="none",
            select=["uuid", "portable_data_hash"]).execute(
                num_retries=arvrunner.num_retries)

        if not lookups["items"]:
            break

        for l in lookups["items"]:
            uuid_map[l["uuid"]] = l["portable_data_hash"]

        fetch_uuids = [u for u in fetch_uuids if u not in uuid_map]

    normalizeFilesDirs(sc)

    if include_primary and "id" in workflowobj:
        sc.append({"class": "File", "location": workflowobj["id"]})

    if "$schemas" in workflowobj:
        for s in workflowobj["$schemas"]:
            sc.append({"class": "File", "location": s})

    def visit_default(obj):
        remove = [False]

        def ensure_default_location(f):
            if "location" not in f and "path" in f:
                f["location"] = f["path"]
                del f["path"]
            if "location" in f and not arvrunner.fs_access.exists(f["location"]):
                # Doesn't exist, remove from list of dependencies to upload
                sc[:] = [x for x in sc if x["location"] != f["location"]]
                # Delete "default" from workflowobj
                remove[0] = True

        visit_class(obj["default"], ("File", "Directory"), ensure_default_location)
        if remove[0]:
            del obj["default"]

    find_defaults(workflowobj, visit_default)

    discovered = {}

    def discover_default_secondary_files(obj):
        builder_job_order = {}
        for t in obj["inputs"]:
            builder_job_order[shortname(t["id"])] = t["default"] if "default" in t else None
        # Need to create a builder object to evaluate expressions.
        builder = make_builder(builder_job_order,
                               obj.get("hints", []),
                               obj.get("requirements", []),
                               ArvRuntimeContext(),
                               metadata)
        discover_secondary_files(arvrunner.fs_access,
                                 builder,
                                 obj["inputs"],
                                 builder_job_order,
                                 discovered)

    copied, _ = document_loader.resolve_all(copy.deepcopy(cmap(workflowobj)),
                                            base_url=uri, checklinks=False)
    visit_class(copied, ("CommandLineTool", "Workflow"), discover_default_secondary_files)

    for d in list(discovered):
        # Only interested in discovered secondaryFiles which are local
        # files that need to be uploaded.
        if d.startswith("file:"):
            sc.extend(discovered[d])
        else:
            del discovered[d]

    mapper = ArvPathMapper(arvrunner, sc, "",
                           "keep:%s",
                           "keep:%s/%s",
                           name=name,
                           single_collection=True)

    def setloc(p):
        loc = p.get("location")
        if loc and (not loc.startswith("_:")) and (not loc.startswith("keep:")):
            p["location"] = mapper.mapper(p["location"]).resolved
            return

        if not loc:
            return

        if collectionUUID in p:
            uuid = p[collectionUUID]
            if uuid not in uuid_map:
                raise SourceLine(p, collectionUUID, validate.ValidationException).makeError(
                    "Collection uuid %s not found" % uuid)
            gp = collection_pdh_pattern.match(loc)
            if gp and uuid_map[uuid] != gp.groups()[0]:
                # This file entry has both collectionUUID and a PDH
                # location. If the PDH doesn't match the one returned
                # by the API server, raise an error.
                raise SourceLine(p, "location", validate.ValidationException).makeError(
                    "Expected collection uuid %s to be %s but API server reported %s" % (
                        uuid, gp.groups()[0], uuid_map[p[collectionUUID]]))

        gp = collection_uuid_pattern.match(loc)
        if not gp:
            return
        uuid = gp.groups()[0]
        if uuid not in uuid_map:
            raise SourceLine(p, "location", validate.ValidationException).makeError(
                "Collection uuid %s not found" % uuid)
        p["location"] = "keep:%s%s" % (uuid_map[uuid],
                                       gp.groups()[1] if gp.groups()[1] else "")
        p[collectionUUID] = uuid

    visit_class(workflowobj, ("File", "Directory"), setloc)
    visit_class(discovered, ("File", "Directory"), setloc)

    if discovered_secondaryfiles is not None:
        for d in discovered:
            discovered_secondaryfiles[mapper.mapper(d).resolved] = discovered[d]

    if "$schemas" in workflowobj:
        sch = CommentedSeq()
        for s in workflowobj["$schemas"]:
            if s in mapper:
                sch.append(mapper.mapper(s).resolved)
        workflowobj["$schemas"] = sch

    return mapper
def upload_dependencies(arvrunner, name, document_loader,
                        workflowobj, uri, loadref_run,
                        include_primary=True, discovered_secondaryfiles=None):
    """Upload the dependencies of the workflowobj document to Keep.

    Returns a pathmapper object mapping local paths to keep references.  Also
    does an in-place update of references in "workflowobj".

    Use scandeps to find $import, $include, $schemas, run, File and Directory
    fields that represent external references.

    If workflowobj has an "id" field, this will reload the document to ensure
    it is scanning the raw document prior to preprocessing.
    """

    loaded = set()

    def loadref(b, u):
        joined = document_loader.fetcher.urljoin(b, u)
        defrg, _ = urlparse.urldefrag(joined)
        if defrg not in loaded:
            loaded.add(defrg)
            # Use fetch_text to get raw file (before preprocessing).
            text = document_loader.fetch_text(defrg)
            if isinstance(text, bytes):
                textIO = StringIO(text.decode('utf-8'))
            else:
                textIO = StringIO(text)
            return yaml.safe_load(textIO)
        else:
            return {}

    if loadref_run:
        loadref_fields = set(("$import", "run"))
    else:
        loadref_fields = set(("$import",))

    scanobj = workflowobj
    if "id" in workflowobj:
        # Need raw file content (before preprocessing) to ensure
        # that external references in $include and $mixin are captured.
        scanobj = loadref("", workflowobj["id"])

    sc_result = scandeps(uri, scanobj,
                         loadref_fields,
                         set(("$include", "$schemas", "location")),
                         loadref, urljoin=document_loader.fetcher.urljoin)

    sc = []

    def only_real(obj):
        if obj.get("location", "").startswith("file:"):
            sc.append(obj)

    visit_class(sc_result, ("File", "Directory"), only_real)

    normalizeFilesDirs(sc)

    if include_primary and "id" in workflowobj:
        sc.append({"class": "File", "location": workflowobj["id"]})

    if "$schemas" in workflowobj:
        for s in workflowobj["$schemas"]:
            sc.append({"class": "File", "location": s})

    def visit_default(obj):
        remove = [False]

        def ensure_default_location(f):
            if "location" not in f and "path" in f:
                f["location"] = f["path"]
                del f["path"]
            if "location" in f and not arvrunner.fs_access.exists(f["location"]):
                # Doesn't exist, remove from list of dependencies to upload
                sc[:] = [x for x in sc if x["location"] != f["location"]]
                # Delete "default" from workflowobj
                remove[0] = True

        visit_class(obj["default"], ("File", "Directory"), ensure_default_location)
        if remove[0]:
            del obj["default"]

    find_defaults(workflowobj, visit_default)

    discovered = {}

    def discover_default_secondary_files(obj):
        discover_secondary_files(obj["inputs"],
                                 {shortname(t["id"]): t["default"]
                                  for t in obj["inputs"] if "default" in t},
                                 discovered)

    visit_class(workflowobj, ("CommandLineTool", "Workflow"), discover_default_secondary_files)

    for d in discovered:
        sc.extend(discovered[d])

    mapper = ArvPathMapper(arvrunner, sc, "",
                           "keep:%s",
                           "keep:%s/%s",
                           name=name,
                           single_collection=True)

    def setloc(p):
        if "location" in p and (not p["location"].startswith("_:")) and (not p["location"].startswith("keep:")):
            p["location"] = mapper.mapper(p["location"]).resolved

    visit_class(workflowobj, ("File", "Directory"), setloc)
    visit_class(discovered, ("File", "Directory"), setloc)

    if discovered_secondaryfiles is not None:
        for d in discovered:
            discovered_secondaryfiles[mapper.mapper(d).resolved] = discovered[d]

    if "$schemas" in workflowobj:
        sch = []
        for s in workflowobj["$schemas"]:
            sch.append(mapper.mapper(s).resolved)
        workflowobj["$schemas"] = sch

    return mapper
def upload_dependencies(arvrunner, name, document_loader,
                        workflowobj, uri, loadref_run):
    """Upload the dependencies of the workflowobj document to Keep.

    Returns a pathmapper object mapping local paths to keep references.  Also
    does an in-place update of references in "workflowobj".

    Use scandeps to find $import, $include, $schemas, run, File and Directory
    fields that represent external references.

    If workflowobj has an "id" field, this will reload the document to ensure
    it is scanning the raw document prior to preprocessing.
    """

    loaded = set()

    def loadref(b, u):
        joined = document_loader.fetcher.urljoin(b, u)
        defrg, _ = urlparse.urldefrag(joined)
        if defrg not in loaded:
            loaded.add(defrg)
            # Use fetch_text to get raw file (before preprocessing).
            text = document_loader.fetch_text(defrg)
            if isinstance(text, bytes):
                textIO = StringIO(text.decode('utf-8'))
            else:
                textIO = StringIO(text)
            return yaml.safe_load(textIO)
        else:
            return {}

    if loadref_run:
        loadref_fields = set(("$import", "run"))
    else:
        loadref_fields = set(("$import",))

    scanobj = workflowobj
    if "id" in workflowobj:
        # Need raw file content (before preprocessing) to ensure
        # that external references in $include and $mixin are captured.
        scanobj = loadref("", workflowobj["id"])

    sc = scandeps(uri, scanobj,
                  loadref_fields,
                  set(("$include", "$schemas", "location")),
                  loadref, urljoin=document_loader.fetcher.urljoin)

    normalizeFilesDirs(sc)

    if "id" in workflowobj:
        sc.append({"class": "File", "location": workflowobj["id"]})

    mapper = ArvPathMapper(arvrunner, sc, "",
                           "keep:%s",
                           "keep:%s/%s",
                           name=name)

    def setloc(p):
        if "location" in p and (not p["location"].startswith("_:")) and (not p["location"].startswith("keep:")):
            p["location"] = mapper.mapper(p["location"]).resolved

    adjustFileObjs(workflowobj, setloc)
    adjustDirObjs(workflowobj, setloc)

    return mapper
def upload_dependencies(arvrunner, name, document_loader,
                        workflowobj, uri, loadref_run, include_primary=True):
    """Upload the dependencies of the workflowobj document to Keep.

    Returns a pathmapper object mapping local paths to keep references.  Also
    does an in-place update of references in "workflowobj".

    Use scandeps to find $import, $include, $schemas, run, File and Directory
    fields that represent external references.

    If workflowobj has an "id" field, this will reload the document to ensure
    it is scanning the raw document prior to preprocessing.
    """

    loaded = set()

    def loadref(b, u):
        joined = document_loader.fetcher.urljoin(b, u)
        defrg, _ = urlparse.urldefrag(joined)
        if defrg not in loaded:
            loaded.add(defrg)
            # Use fetch_text to get raw file (before preprocessing).
            text = document_loader.fetch_text(defrg)
            if isinstance(text, bytes):
                textIO = StringIO(text.decode('utf-8'))
            else:
                textIO = StringIO(text)
            return yaml.safe_load(textIO)
        else:
            return {}

    if loadref_run:
        loadref_fields = set(("$import", "run"))
    else:
        loadref_fields = set(("$import",))

    scanobj = workflowobj
    if "id" in workflowobj:
        # Need raw file content (before preprocessing) to ensure
        # that external references in $include and $mixin are captured.
        scanobj = loadref("", workflowobj["id"])

    sc = scandeps(uri, scanobj,
                  loadref_fields,
                  set(("$include", "$schemas", "location")),
                  loadref, urljoin=document_loader.fetcher.urljoin)

    normalizeFilesDirs(sc)

    if include_primary and "id" in workflowobj:
        sc.append({"class": "File", "location": workflowobj["id"]})

    if "$schemas" in workflowobj:
        for s in workflowobj["$schemas"]:
            sc.append({"class": "File", "location": s})

    def capture_default(obj):
        remove = [False]

        def add_default(f):
            if "location" not in f and "path" in f:
                f["location"] = f["path"]
                del f["path"]
            if "location" in f and not arvrunner.fs_access.exists(f["location"]):
                # Remove from sc
                sc[:] = [x for x in sc if x["location"] != f["location"]]
                # Delete "default" from workflowobj
                remove[0] = True

        visit_class(obj["default"], ("File", "Directory"), add_default)
        if remove[0]:
            del obj["default"]

    find_defaults(workflowobj, capture_default)

    mapper = ArvPathMapper(arvrunner, sc, "",
                           "keep:%s",
                           "keep:%s/%s",
                           name=name,
                           single_collection=True)

    def setloc(p):
        if "location" in p and (not p["location"].startswith("_:")) and (not p["location"].startswith("keep:")):
            p["location"] = mapper.mapper(p["location"]).resolved

    adjustFileObjs(workflowobj, setloc)
    adjustDirObjs(workflowobj, setloc)

    if "$schemas" in workflowobj:
        sch = []
        for s in workflowobj["$schemas"]:
            sch.append(mapper.mapper(s).resolved)
        workflowobj["$schemas"] = sch

    return mapper
def upload_dependencies(arvrunner, name, document_loader,
                        workflowobj, uri, loadref_run,
                        include_primary=True, discovered_secondaryfiles=None):
    """Upload the dependencies of the workflowobj document to Keep.

    Returns a pathmapper object mapping local paths to keep references.  Also
    does an in-place update of references in "workflowobj".

    Use scandeps to find $import, $include, $schemas, run, File and Directory
    fields that represent external references.

    If workflowobj has an "id" field, this will reload the document to ensure
    it is scanning the raw document prior to preprocessing.
    """

    loaded = set()

    def loadref(b, u):
        joined = document_loader.fetcher.urljoin(b, u)
        defrg, _ = urllib.parse.urldefrag(joined)
        if defrg not in loaded:
            loaded.add(defrg)
            # Use fetch_text to get raw file (before preprocessing).
            text = document_loader.fetch_text(defrg)
            if isinstance(text, bytes):
                textIO = StringIO(text.decode('utf-8'))
            else:
                textIO = StringIO(text)
            return yaml.safe_load(textIO)
        else:
            return {}

    if loadref_run:
        loadref_fields = set(("$import", "run"))
    else:
        loadref_fields = set(("$import",))

    scanobj = workflowobj
    if "id" in workflowobj:
        # Need raw file content (before preprocessing) to ensure
        # that external references in $include and $mixin are captured.
        scanobj = loadref("", workflowobj["id"])

    sc_result = scandeps(uri, scanobj,
                         loadref_fields,
                         set(("$include", "$schemas", "location")),
                         loadref, urljoin=document_loader.fetcher.urljoin)

    sc = []
    uuids = {}

    def collect_uuids(obj):
        loc = obj.get("location", "")
        sp = loc.split(":")
        if sp[0] == "keep":
            # Collect collection uuids that need to be resolved to
            # portable data hashes
            gp = collection_uuid_pattern.match(loc)
            if gp:
                uuids[gp.groups()[0]] = obj
            if collectionUUID in obj:
                uuids[obj[collectionUUID]] = obj

    def collect_uploads(obj):
        loc = obj.get("location", "")
        sp = loc.split(":")
        if len(sp) < 1:
            return
        if sp[0] in ("file", "http", "https"):
            # Record local files that need to be uploaded,
            # don't include file literals, keep references, etc.
            sc.append(obj)
        collect_uuids(obj)

    visit_class(workflowobj, ("File", "Directory"), collect_uuids)
    visit_class(sc_result, ("File", "Directory"), collect_uploads)

    # Resolve any collection uuids we found to portable data hashes
    # and assign them to uuid_map
    uuid_map = {}
    fetch_uuids = list(uuids.keys())
    while fetch_uuids:
        # For a large number of fetch_uuids, the API server may limit the
        # response size, so keep fetching until the API server has nothing
        # more to give us.
        lookups = arvrunner.api.collections().list(
            filters=[["uuid", "in", fetch_uuids]],
            count="none",
            select=["uuid", "portable_data_hash"]).execute(
                num_retries=arvrunner.num_retries)

        if not lookups["items"]:
            break

        for l in lookups["items"]:
            uuid_map[l["uuid"]] = l["portable_data_hash"]

        fetch_uuids = [u for u in fetch_uuids if u not in uuid_map]

    normalizeFilesDirs(sc)

    if include_primary and "id" in workflowobj:
        sc.append({"class": "File", "location": workflowobj["id"]})

    if "$schemas" in workflowobj:
        for s in workflowobj["$schemas"]:
            sc.append({"class": "File", "location": s})

    def visit_default(obj):
        remove = [False]

        def ensure_default_location(f):
            if "location" not in f and "path" in f:
                f["location"] = f["path"]
                del f["path"]
            if "location" in f and not arvrunner.fs_access.exists(f["location"]):
                # Doesn't exist, remove from list of dependencies to upload
                sc[:] = [x for x in sc if x["location"] != f["location"]]
                # Delete "default" from workflowobj
                remove[0] = True

        visit_class(obj["default"], ("File", "Directory"), ensure_default_location)
        if remove[0]:
            del obj["default"]

    find_defaults(workflowobj, visit_default)

    discovered = {}

    def discover_default_secondary_files(obj):
        discover_secondary_files(obj["inputs"],
                                 {shortname(t["id"]): t["default"]
                                  for t in obj["inputs"] if "default" in t},
                                 discovered)

    visit_class(workflowobj, ("CommandLineTool", "Workflow"), discover_default_secondary_files)

    for d in list(discovered):
        # Only interested in discovered secondaryFiles which are local
        # files that need to be uploaded.
        if d.startswith("file:"):
            sc.extend(discovered[d])
        else:
            del discovered[d]

    mapper = ArvPathMapper(arvrunner, sc, "",
                           "keep:%s",
                           "keep:%s/%s",
                           name=name,
                           single_collection=True)

    def setloc(p):
        loc = p.get("location")
        if loc and (not loc.startswith("_:")) and (not loc.startswith("keep:")):
            p["location"] = mapper.mapper(p["location"]).resolved
            return

        if not loc:
            return

        if collectionUUID in p:
            uuid = p[collectionUUID]
            if uuid not in uuid_map:
                raise SourceLine(p, collectionUUID, validate.ValidationException).makeError(
                    "Collection uuid %s not found" % uuid)
            gp = collection_pdh_pattern.match(loc)
            if gp and uuid_map[uuid] != gp.groups()[0]:
                # This file entry has both collectionUUID and a PDH
                # location. If the PDH doesn't match the one returned
                # by the API server, raise an error.
                raise SourceLine(p, "location", validate.ValidationException).makeError(
                    "Expected collection uuid %s to be %s but API server reported %s" % (
                        uuid, gp.groups()[0], uuid_map[p[collectionUUID]]))

        gp = collection_uuid_pattern.match(loc)
        if not gp:
            return
        uuid = gp.groups()[0]
        if uuid not in uuid_map:
            raise SourceLine(p, "location", validate.ValidationException).makeError(
                "Collection uuid %s not found" % uuid)
        p["location"] = "keep:%s%s" % (uuid_map[uuid],
                                       gp.groups()[1] if gp.groups()[1] else "")
        p[collectionUUID] = uuid

    visit_class(workflowobj, ("File", "Directory"), setloc)
    visit_class(discovered, ("File", "Directory"), setloc)

    if discovered_secondaryfiles is not None:
        for d in discovered:
            discovered_secondaryfiles[mapper.mapper(d).resolved] = discovered[d]

    if "$schemas" in workflowobj:
        sch = []
        for s in workflowobj["$schemas"]:
            sch.append(mapper.mapper(s).resolved)
        workflowobj["$schemas"] = sch

    return mapper
def upload_dependencies(arvrunner, name, document_loader,
                        workflowobj, uri, loadref_run,
                        include_primary=True, discovered_secondaryfiles=None):
    """Upload the dependencies of the workflowobj document to Keep.

    Returns a pathmapper object mapping local paths to keep references.  Also
    does an in-place update of references in "workflowobj".

    Use scandeps to find $import, $include, $schemas, run, File and Directory
    fields that represent external references.

    If workflowobj has an "id" field, this will reload the document to ensure
    it is scanning the raw document prior to preprocessing.
    """

    loaded = set()

    def loadref(b, u):
        joined = document_loader.fetcher.urljoin(b, u)
        defrg, _ = urlparse.urldefrag(joined)
        if defrg not in loaded:
            loaded.add(defrg)
            # Use fetch_text to get raw file (before preprocessing).
            text = document_loader.fetch_text(defrg)
            if isinstance(text, bytes):
                textIO = StringIO(text.decode('utf-8'))
            else:
                textIO = StringIO(text)
            return yaml.safe_load(textIO)
        else:
            return {}

    if loadref_run:
        loadref_fields = set(("$import", "run"))
    else:
        loadref_fields = set(("$import",))

    scanobj = workflowobj
    if "id" in workflowobj:
        # Need raw file content (before preprocessing) to ensure
        # that external references in $include and $mixin are captured.
        scanobj = loadref("", workflowobj["id"])

    sc_result = scandeps(uri, scanobj,
                         loadref_fields,
                         set(("$include", "$schemas", "location")),
                         loadref, urljoin=document_loader.fetcher.urljoin)

    sc = []

    def only_real(obj):
        # Only interested in local files that need to be uploaded,
        # don't include file literals, keep references, etc.
        sp = obj.get("location", "").split(":")
        if len(sp) > 1 and sp[0] in ("file", "http", "https"):
            sc.append(obj)

    visit_class(sc_result, ("File", "Directory"), only_real)

    normalizeFilesDirs(sc)

    if include_primary and "id" in workflowobj:
        sc.append({"class": "File", "location": workflowobj["id"]})

    if "$schemas" in workflowobj:
        for s in workflowobj["$schemas"]:
            sc.append({"class": "File", "location": s})

    def visit_default(obj):
        remove = [False]

        def ensure_default_location(f):
            if "location" not in f and "path" in f:
                f["location"] = f["path"]
                del f["path"]
            if "location" in f and not arvrunner.fs_access.exists(f["location"]):
                # Doesn't exist, remove from list of dependencies to upload
                sc[:] = [x for x in sc if x["location"] != f["location"]]
                # Delete "default" from workflowobj
                remove[0] = True

        visit_class(obj["default"], ("File", "Directory"), ensure_default_location)
        if remove[0]:
            del obj["default"]

    find_defaults(workflowobj, visit_default)

    discovered = {}

    def discover_default_secondary_files(obj):
        discover_secondary_files(obj["inputs"],
                                 {shortname(t["id"]): t["default"]
                                  for t in obj["inputs"] if "default" in t},
                                 discovered)

    visit_class(workflowobj, ("CommandLineTool", "Workflow"), discover_default_secondary_files)

    for d in list(discovered.keys()):
        # Only interested in discovered secondaryFiles which are local
        # files that need to be uploaded.
        if d.startswith("file:"):
            sc.extend(discovered[d])
        else:
            del discovered[d]

    mapper = ArvPathMapper(arvrunner, sc, "",
                           "keep:%s",
                           "keep:%s/%s",
                           name=name,
                           single_collection=True)

    def setloc(p):
        if "location" in p and (not p["location"].startswith("_:")) and (not p["location"].startswith("keep:")):
            p["location"] = mapper.mapper(p["location"]).resolved

    visit_class(workflowobj, ("File", "Directory"), setloc)
    visit_class(discovered, ("File", "Directory"), setloc)

    if discovered_secondaryfiles is not None:
        for d in discovered:
            discovered_secondaryfiles[mapper.mapper(d).resolved] = discovered[d]

    if "$schemas" in workflowobj:
        sch = []
        for s in workflowobj["$schemas"]:
            sch.append(mapper.mapper(s).resolved)
        workflowobj["$schemas"] = sch

    return mapper
def run():
    # Timestamps are added by crunch-job, so don't print redundant timestamps.
    arvados.log_handler.setFormatter(logging.Formatter('%(name)s %(levelname)s: %(message)s'))

    # Print package versions
    logger.info(arvados_cwl.versionstring())

    api = arvados.api("v1")

    arvados_cwl.add_arv_hints()

    runner = None
    try:
        job_order_object = arvados.current_job()['script_parameters']
        toolpath = "file://%s/%s" % (os.environ['TASK_KEEPMOUNT'],
                                     job_order_object.pop("cwl:tool"))

        pdh_path = re.compile(r'^[0-9a-f]{32}\+\d+(/.+)?$')

        def keeppath(v):
            if pdh_path.match(v):
                return "keep:%s" % v
            else:
                return v

        def keeppathObj(v):
            v["location"] = keeppath(v["location"])

        for k, v in job_order_object.items():
            if isinstance(v, basestring) and arvados.util.keep_locator_pattern.match(v):
                job_order_object[k] = {
                    "class": "File",
                    "location": "keep:%s" % v
                }

        adjustFileObjs(job_order_object, keeppathObj)
        adjustDirObjs(job_order_object, keeppathObj)
        normalizeFilesDirs(job_order_object)
        adjustDirObjs(job_order_object,
                      functools.partial(getListing,
                                        arvados_cwl.fsaccess.CollectionFsAccess("", api_client=api)))

        output_name = None
        output_tags = None
        enable_reuse = True
        on_error = "continue"
        if "arv:output_name" in job_order_object:
            output_name = job_order_object["arv:output_name"]
            del job_order_object["arv:output_name"]

        if "arv:output_tags" in job_order_object:
            output_tags = job_order_object["arv:output_tags"]
            del job_order_object["arv:output_tags"]

        if "arv:enable_reuse" in job_order_object:
            enable_reuse = job_order_object["arv:enable_reuse"]
            del job_order_object["arv:enable_reuse"]

        if "arv:on_error" in job_order_object:
            on_error = job_order_object["arv:on_error"]
            del job_order_object["arv:on_error"]

        runner = arvados_cwl.ArvCwlRunner(api_client=arvados.api('v1', model=OrderedJsonModel()),
                                          output_name=output_name, output_tags=output_tags)

        t = load_tool(toolpath, runner.arv_make_tool,
                      fetcher_constructor=functools.partial(
                          CollectionFetcher,
                          api_client=api,
                          keep_client=arvados.keep.KeepClient(api_client=api, num_retries=4)))

        args = argparse.Namespace()
        args.project_uuid = arvados.current_job()["owner_uuid"]
        args.enable_reuse = enable_reuse
        args.on_error = on_error
        args.submit = False
        args.debug = False
        args.quiet = False
        args.ignore_docker_for_reuse = False
        args.basedir = os.getcwd()
        args.name = None
        args.cwl_runner_job = {"uuid": arvados.current_job()["uuid"],
                               "state": arvados.current_job()["state"]}

        runner.arv_executor(t, job_order_object, **vars(args))
    except Exception as e:
        if isinstance(e, WorkflowException):
            logging.info("Workflow error %s", e)
        else:
            logging.exception("Unhandled exception")
        if runner and runner.final_output_collection:
            outputCollection = runner.final_output_collection.portable_data_hash()
        else:
            outputCollection = None
        api.job_tasks().update(uuid=arvados.current_task()['uuid'],
                               body={
                                   'output': outputCollection,
                                   'success': False,
                                   'progress': 1.0
                               }).execute()
def run():
    # Print package versions
    logger.info(arvados_cwl.versionstring())

    api = arvados.api("v1")

    arvados_cwl.add_arv_hints()

    try:
        job_order_object = arvados.current_job()['script_parameters']

        pdh_path = re.compile(r'^[0-9a-f]{32}\+\d+(/.+)?$')

        def keeppath(v):
            if pdh_path.match(v):
                return "keep:%s" % v
            else:
                return v

        def keeppathObj(v):
            v["location"] = keeppath(v["location"])

        job_order_object["cwl:tool"] = "file://%s/%s" % (os.environ['TASK_KEEPMOUNT'],
                                                         job_order_object["cwl:tool"])

        for k, v in job_order_object.items():
            if isinstance(v, basestring) and arvados.util.keep_locator_pattern.match(v):
                job_order_object[k] = {
                    "class": "File",
                    "location": "keep:%s" % v
                }

        adjustFileObjs(job_order_object, keeppathObj)
        adjustDirObjs(job_order_object, keeppathObj)
        normalizeFilesDirs(job_order_object)
        adjustDirObjs(job_order_object,
                      functools.partial(getListing,
                                        arvados_cwl.fsaccess.CollectionFsAccess("", api_client=api)))

        output_name = None
        if "arv:output_name" in job_order_object:
            output_name = job_order_object["arv:output_name"]
            del job_order_object["arv:output_name"]

        runner = arvados_cwl.ArvCwlRunner(api_client=arvados.api('v1', model=OrderedJsonModel()),
                                          output_name=output_name)

        t = load_tool(job_order_object, runner.arv_make_tool)

        args = argparse.Namespace()
        args.project_uuid = arvados.current_job()["owner_uuid"]
        args.enable_reuse = True
        args.submit = False
        args.debug = True
        args.quiet = False
        args.ignore_docker_for_reuse = False
        args.basedir = os.getcwd()
        args.cwl_runner_job = {"uuid": arvados.current_job()["uuid"],
                               "state": arvados.current_job()["state"]}
        outputObj = runner.arv_executor(t, job_order_object, **vars(args))

        if runner.final_output_collection:
            outputCollection = runner.final_output_collection.portable_data_hash()
        else:
            outputCollection = None

        api.job_tasks().update(uuid=arvados.current_task()['uuid'],
                               body={
                                   'output': outputCollection,
                                   'success': True,
                                   'progress': 1.0
                               }).execute()
    except Exception as e:
        logging.exception("Unhandled exception")
        api.job_tasks().update(uuid=arvados.current_task()['uuid'],
                               body={
                                   'output': None,
                                   'success': False,
                                   'progress': 1.0
                               }).execute()