def setSecondary(fileobj):
    """
    Recursively attach default ``secondaryFiles`` entries to ``File`` objects.

    A ``File`` dict without ``secondaryFiles`` gets one entry per pattern in
    ``t["secondaryFiles"]``, derived from its primary ``location``; lists are
    walked element by element.

    NOTE(review): ``t`` (the input type definition) is a free variable here,
    presumably bound by an enclosing scope (closure over a loop variable) —
    confirm this function is only defined where ``t`` is in scope.
    """
    if isinstance(fileobj, dict) and fileobj.get("class") == "File":
        # Only fill in secondaryFiles when they were not already provided.
        if "secondaryFiles" not in fileobj:
            derived = []
            for pattern in t["secondaryFiles"]:
                derived.append({
                    "location": substitute(fileobj["location"], pattern),
                    "class": "File",
                })
            fileobj["secondaryFiles"] = derived
    if isinstance(fileobj, list):
        for member in fileobj:
            setSecondary(member)
def setSecondary(t, fileobj, discovered):
    """
    Recursively attach default ``secondaryFiles`` entries to ``File`` objects.

    :param t: input type definition supplying the ``secondaryFiles`` patterns.
    :param fileobj: job-order value to process (``File`` dict, list, or other).
    :param discovered: optional mapping that records newly derived secondary
        files, keyed by the primary file ``location``.
    """
    is_file_entry = isinstance(fileobj, dict) and fileobj.get("class") == "File"
    if is_file_entry:
        # Only fill in secondaryFiles when they were not already provided.
        if "secondaryFiles" not in fileobj:
            secondary = [
                {"location": substitute(fileobj["location"], pattern), "class": "File"}
                for pattern in t["secondaryFiles"]
            ]
            fileobj["secondaryFiles"] = cmap(secondary)
            if discovered is not None:
                discovered[fileobj["location"]] = fileobj["secondaryFiles"]
    elif isinstance(fileobj, list):
        for member in fileobj:
            setSecondary(t, member, discovered)
def set_secondary(typedef, fileobj, discovered):
    """
    Pull over missing secondaryFiles to the job object entry.

    Adapted from:
    https://github.com/curoverse/arvados/blob/2b0b06579199967eca3d44d955ad64195d2db3c3/sdk/cwl/arvados_cwl/runner.py#L67

    :param typedef: input type definition providing ``secondaryFiles`` specs
        (each spec is expected to carry a ``pattern`` key).
    :param fileobj: job-order value to process (``File`` mapping, sequence, or other).
    :param discovered: optional mapping that records newly derived secondary
        files, keyed by the primary file ``location``.
    """
    found_file = isinstance(fileobj, MutableMapping) and fileobj.get("class") == "File"
    if found_file:
        # Only fill in secondaryFiles when they were not already provided.
        if "secondaryFiles" not in fileobj:
            location = fileobj["location"]
            fileobj["secondaryFiles"] = cmap([
                {"location": substitute(location, spec["pattern"]), "class": "File"}
                for spec in typedef["secondaryFiles"]
            ])
            if discovered is not None:
                discovered[location] = fileobj["secondaryFiles"]
    elif isinstance(fileobj, MutableSequence):
        for entry in fileobj:
            set_secondary(typedef, entry, discovered)
def set_secondary(fsaccess, builder, inputschema, secondaryspec, primary, discovered):
    """
    Discover and attach ``secondaryFiles`` for a primary ``File`` job value.

    Walks the (possibly compound) ``inputschema`` alongside the ``primary``
    job value. For ``File`` values without ``secondaryFiles``, each spec in
    ``secondaryspec`` is evaluated against the primary location, checked for
    existence via ``fsaccess``, and the surviving entries are stored on the
    primary and, when ``discovered`` is given, recorded there keyed by the
    primary file location.
    """
    if isinstance(inputschema, Sequence) and not isinstance(inputschema, basestring):
        # union type, collect all possible secondaryFiles
        for i in inputschema:
            set_secondary(fsaccess, builder, i, secondaryspec, primary, discovered)
        return

    if isinstance(inputschema, basestring):
        # named type reference: resolve it against SchemaDef hints/requirements
        sd = search_schemadef(inputschema, reversed(builder.hints + builder.requirements))
        if sd:
            inputschema = sd
        else:
            return

    if "secondaryFiles" in inputschema:
        # set secondaryFiles, may be inherited by compound types.
        secondaryspec = inputschema["secondaryFiles"]

    if (isinstance(inputschema["type"], (Mapping, Sequence)) and
            not isinstance(inputschema["type"], basestring)):
        # compound type (union, array, record)
        set_secondary(fsaccess, builder, inputschema["type"], secondaryspec, primary, discovered)

    elif (inputschema["type"] == "record" and
          isinstance(primary, Mapping)):
        #
        # record type, find secondary files associated with fields.
        #
        for f in inputschema["fields"]:
            p = primary.get(shortname(f["name"]))
            if p:
                set_secondary(fsaccess, builder, f, secondaryspec, p, discovered)

    elif (inputschema["type"] == "array" and
          isinstance(primary, Sequence)):
        #
        # array type, find secondary files of elements
        #
        for p in primary:
            set_secondary(fsaccess, builder, {"type": inputschema["items"]}, secondaryspec, p, discovered)

    elif (inputschema["type"] == "File" and
          secondaryspec and
          isinstance(primary, Mapping) and
          primary.get("class") == "File" and
          "secondaryFiles" not in primary):
        #
        # Found a file, check for secondaryFiles
        #
        specs = []
        primary["secondaryFiles"] = secondaryspec
        # First pass: evaluate every spec into concrete pattern/object entries.
        for i, sf in enumerate(aslist(secondaryspec)):
            if builder.cwlVersion == "v1.0":
                # CWL v1.0: secondaryFiles entries are bare strings/expressions
                pattern = builder.do_eval(sf, context=primary)
            else:
                # CWL >= v1.1: entries are objects with a "pattern" field
                pattern = builder.do_eval(sf["pattern"], context=primary)
            if pattern is None:
                continue
            if isinstance(pattern, list):
                specs.extend(pattern)
            elif isinstance(pattern, dict):
                specs.append(pattern)
            elif isinstance(pattern, str):
                if builder.cwlVersion == "v1.0":
                    specs.append({"pattern": pattern, "required": True})
                else:
                    specs.append({
                        "pattern": pattern,
                        "required": sf.get("required")
                    })
            else:
                raise SourceLine(
                    primary["secondaryFiles"], i,
                    validate.ValidationException).makeError(
                    "Expression must return list, object, string or null")

        found = []
        # Second pass: resolve each spec to a path and keep only existing ones.
        for i, sf in enumerate(specs):
            if isinstance(sf, dict):
                if sf.get("class") == "File":
                    # literal File object returned by the expression
                    pattern = None
                    if sf.get("location") is None:
                        raise SourceLine(
                            primary["secondaryFiles"], i,
                            validate.ValidationException).makeError(
                            "File object is missing 'location': %s" % sf)
                    sfpath = sf["location"]
                    required = True
                else:
                    pattern = sf["pattern"]
                    required = sf.get("required")
            elif isinstance(sf, str):
                pattern = sf
                required = True
            else:
                raise SourceLine(
                    primary["secondaryFiles"], i,
                    validate.ValidationException).makeError(
                    "Expression must return list, object, string or null")

            if pattern is not None:
                # derive the secondary path from the primary file location
                sfpath = substitute(primary["location"], pattern)

            # "required" may itself be an expression; evaluate it now
            required = builder.do_eval(required, context=primary)

            if fsaccess.exists(sfpath):
                if pattern is not None:
                    found.append({"location": sfpath, "class": "File"})
                else:
                    found.append(sf)
            elif required:
                raise SourceLine(
                    primary["secondaryFiles"], i,
                    validate.ValidationException).makeError(
                    "Required secondary file '%s' does not exist" % sfpath)

        primary["secondaryFiles"] = cmap(found)
        if discovered is not None:
            discovered[primary["location"]] = primary["secondaryFiles"]
    elif inputschema["type"] not in primitive_types_set:
        # non-primitive named/complex type: recurse into the referenced type
        set_secondary(fsaccess, builder, inputschema["type"], secondaryspec, primary, discovered)
def collect_output(
        self,
        schema,                 # type: Dict[Text, Any]
        builder,                # type: Builder
        outdir,                 # type: Text
        fs_access,              # type: StdFsAccess
        compute_checksum=True   # type: bool
):
    # type: (...) -> Optional[Union[Dict[Text, Any], List[Union[Dict[Text, Any], Text]]]]
    """
    Collect the value of one output parameter after tool execution.

    Resolves ``outputBinding`` (``glob``, ``loadContents``, ``outputEval``),
    fills in file metadata (basename/nameroot/nameext/size/checksum),
    resolves ``secondaryFiles`` and ``format``, and recurses into fields when
    the output type is a record.
    """
    result = []  # type: List[Any]
    empty_and_optional = False
    debug = LOGGER.isEnabledFor(logging.DEBUG)
    if "outputBinding" in schema:
        binding = schema["outputBinding"]
        globpatterns = []  # type: List[Text]
        revmap = partial(command_line_tool.revmap_file, builder, outdir)
        if "glob" in binding:
            with SourceLine(binding, "glob", WorkflowException, debug):
                # Evaluate the glob expressions into concrete patterns.
                for glob in aslist(binding["glob"]):
                    glob = builder.do_eval(glob)
                    if glob:
                        globpatterns.extend(aslist(glob))
                for glob in globpatterns:
                    # Normalize the glob relative to the output directory.
                    if glob.startswith(outdir):
                        glob = glob[len(outdir) + 1:]
                    elif glob == ".":
                        glob = outdir
                    elif glob.startswith("/"):
                        raise WorkflowException(
                            "glob patterns must not start with '/'")
                    try:
                        prefix = fs_access.glob(outdir)
                        # locale-aware sort key for deterministic ordering
                        key = cmp_to_key(
                            cast(Callable[[Text, Text], int], locale.strcoll))
                        # In case of stdout.log or stderr.log file not created
                        if "stdout" in self.tool and "stderr" in self.tool \
                                and glob in (self.tool["stdout"], self.tool["stderr"]):
                            filepath = Path(fs_access.join(outdir, glob))
                            if not filepath.is_file():
                                Path(filepath).touch()
                        result.extend([{
                            "location": g,
                            "path": fs_access.join(builder.outdir, g[len(prefix[0]) + 1:]),
                            "basename": os.path.basename(g),
                            "nameroot": os.path.splitext(os.path.basename(g))[0],
                            "nameext": os.path.splitext(os.path.basename(g))[1],
                            "class": "File" if fs_access.isfile(g) else "Directory"
                        } for g in sorted(fs_access.glob(
                            fs_access.join(outdir, glob)), key=key)])
                    except (OSError, IOError) as exc:
                        LOGGER.warning(Text(exc))
                    except Exception:
                        LOGGER.exception("Unexpected error from fs_access")
                        raise
            for files in result:
                rfile = files.copy()
                # TODO This function raise an exception and seems to be related to docker (which is not used here)
                # revmap(rfile)
                if files["class"] == "Directory":
                    load_listing = builder.loadListing or (
                        binding and binding.get("loadListing"))
                    if load_listing and load_listing != "no_listing":
                        get_listing(fs_access, files, (load_listing == "deep_listing"))
                else:
                    with fs_access.open(rfile["location"], "rb") as f:
                        contents = b""
                        if binding.get("loadContents") or compute_checksum:
                            contents = f.read(CONTENT_LIMIT)
                        if binding.get("loadContents"):
                            files["contents"] = contents.decode("utf-8")
                        if compute_checksum:
                            checksum = hashlib.sha1()  # nosec: B303
                            while contents != b"":
                                checksum.update(contents)
                                contents = f.read(1024 * 1024)
                            files["checksum"] = "sha1$%s" % checksum.hexdigest()
                        # size is obtained by seeking to the end of the stream
                        f.seek(0, 2)
                        file_size = f.tell()
                        files["size"] = file_size
        optional = False
        single = False
        if isinstance(schema["type"], list):
            # union type: may allow null and/or a single File/Directory
            if "null" in schema["type"]:
                optional = True
            if "File" in schema["type"] or "Directory" in schema["type"]:
                single = True
        elif schema["type"] == "File" or schema["type"] == "Directory":
            single = True
        if "outputEval" in binding:
            with SourceLine(binding, "outputEval", WorkflowException, debug):
                result = builder.do_eval(binding["outputEval"], context=result)
        if single:
            if not result and not optional:
                with SourceLine(binding, "glob", WorkflowException, debug):
                    raise WorkflowException(
                        "Did not find output file with glob pattern: '{}'".
                        format(globpatterns))
            elif not result and optional:
                pass
            elif isinstance(result, list):
                if len(result) > 1:
                    raise WorkflowException(
                        "Multiple matches for output item that is a single file."
                    )
                result = result[0]
        if "secondaryFiles" in schema:
            with SourceLine(schema, "secondaryFiles", WorkflowException, debug):
                for primary in aslist(result):
                    if isinstance(primary, dict):
                        primary.setdefault("secondaryFiles", [])
                        # directory prefix of the primary path, trailing "/" kept
                        pathprefix = primary["path"][0:primary["path"].rindex("/") + 1]
                        for file in aslist(schema["secondaryFiles"]):
                            if isinstance(file, dict) or "$(" in file or "${" in file:
                                # expression/object form: evaluate against the primary
                                sfpath = builder.do_eval(file, context=primary)
                                subst = False
                            else:
                                # plain pattern: apply CWL suffix substitution rules
                                sfpath = file
                                subst = True
                            for sfitem in aslist(sfpath):
                                if isinstance(sfitem, str):
                                    if subst:
                                        sfitem = {"path": substitute(primary["path"], sfitem)}
                                    else:
                                        sfitem = {"path": pathprefix + sfitem}
                                if "path" in sfitem and "location" not in sfitem:
                                    revmap(sfitem)
                                if fs_access.isfile(sfitem["location"]):
                                    sfitem["class"] = "File"
                                    primary["secondaryFiles"].append(sfitem)
                                elif fs_access.isdir(sfitem["location"]):
                                    sfitem["class"] = "Directory"
                                    primary["secondaryFiles"].append(sfitem)
        if "format" in schema:
            for primary in aslist(result):
                primary["format"] = builder.do_eval(schema["format"], context=primary)
        # Ensure files point to local references outside of the run environment
        # TODO: Again removing revmap....
        # adjustFileObjs(result, revmap)
    if not result and optional:
        return None
    if not empty_and_optional and isinstance(
            schema["type"], dict) and schema["type"]["type"] == "record":
        # record output: collect each field independently
        out = {}
        for f in schema["type"]["fields"]:
            out[shortname(f["name"])] = self.collect_output(  # type: ignore
                f, builder, outdir, fs_access, compute_checksum=compute_checksum)
        return out
    return result
def collect_output(
        self,
        schema,                 # type: Dict[Text, Any]
        builder,                # type: Builder
        outdir,                 # type: Text
        fs_access,              # type: StdFsAccess
        compute_checksum=True   # type: bool
):
    # type: (...) -> Optional[Union[Dict[Text, Any], List[Union[Dict[Text, Any], Text]]]]
    """
    Collect outputs from the step :term:`Process` following its execution.

    .. note:
        When :term:`CWL` runner tries to forward ``step(i) outputs -> step(i+1) inputs``
        using :meth:`collect_outputs`, it expects exact ``outputBindings`` locations
        to be matched. In other words, a definition like
        ``outputBindings: {glob: outputs/*.txt}`` will generate results located in
        ``step(i)`` as ``"<tmp-workdir>/outputs/file.txt"`` and ``step(i+1)`` will
        look explicitly in ``"<tmp-workdir>/outputs`` using the ``glob`` pattern.
        Because each of our :term:`Process` in the workflow are distinct/remote
        entities, each one stages its outputs at different URL locations, not
        sharing the same *root directory*. When we stage intermediate results
        locally, the sub-dirs are lost. Therefore, they act like individual
        :term:`CWL` runner calls where the *final results* are moved back to the
        local directory for convenient access, but our *local directory* is the
        URL WPS-outputs location. To let :term:`CWL` :term:`Workflow` inter-steps
        mapping work as intended, we must remap the locations ignoring any nested
        dirs where the modified *outputBindings* definition will be able to match
        as if each step :term:`Process` outputs were generated locally.
    """
    result = []  # type: List[Any]
    empty_and_optional = False
    debug = LOGGER.isEnabledFor(logging.DEBUG)
    if "outputBinding" in schema:
        binding = schema["outputBinding"]
        globpatterns = []  # type: List[Text]
        revmap = partial(command_line_tool.revmap_file, builder, outdir)
        if "glob" in binding:
            with SourceLine(binding, "glob", WorkflowException, debug):
                # Evaluate the glob expressions into concrete patterns.
                for glob in aslist(binding["glob"]):
                    glob = builder.do_eval(glob)
                    if glob:
                        globpatterns.extend(aslist(glob))
                # rebase glob pattern as applicable (see note)
                for glob in list(globpatterns):
                    if not any(
                        glob.startswith(part) for part in [".", "/", "~"]
                    ) and "/" in glob:
                        # keep only the basename so flattened staging still matches
                        glob = builder.do_eval(glob.split("/")[-1])
                        if glob:
                            globpatterns.extend(aslist(glob))
                for glob in globpatterns:
                    # Normalize the glob relative to the output directory.
                    if glob.startswith(outdir):
                        glob = glob[len(outdir) + 1:]
                    elif glob == ".":
                        glob = outdir
                    elif glob.startswith("/"):
                        raise WorkflowException(
                            "glob patterns must not start with '/'")
                    try:
                        prefix = fs_access.glob(outdir)
                        # locale-aware sort key for deterministic ordering
                        key = cmp_to_key(
                            cast(Callable[[Text, Text], int], locale.strcoll))
                        # In case of stdout.log or stderr.log file not created
                        if "stdout" in self.tool and "stderr" in self.tool \
                                and glob in (self.tool["stdout"], self.tool["stderr"]):
                            filepath = Path(fs_access.join(outdir, glob))
                            if not filepath.is_file():
                                Path(filepath).touch()
                        result.extend([{
                            "location": g,
                            "path": fs_access.join(builder.outdir, g[len(prefix[0]) + 1:]),
                            "basename": os.path.basename(g),
                            "nameroot": os.path.splitext(os.path.basename(g))[0],
                            "nameext": os.path.splitext(os.path.basename(g))[1],
                            "class": "File" if fs_access.isfile(g) else "Directory"
                        } for g in sorted(fs_access.glob(
                            fs_access.join(outdir, glob)), key=key)])
                    except (OSError, IOError) as exc:
                        LOGGER.warning(Text(exc))
                    except Exception:
                        LOGGER.exception("Unexpected error from fs_access")
                        raise
            for files in result:
                rfile = files.copy()
                # TODO This function raise an exception and seems to be related to docker (which is not used here)
                # revmap(rfile)
                if files["class"] == "Directory":
                    load_listing = builder.loadListing or (
                        binding and binding.get("loadListing"))
                    if load_listing and load_listing != "no_listing":
                        get_listing(fs_access, files, (load_listing == "deep_listing"))
                else:
                    with fs_access.open(rfile["location"], "rb") as f:
                        contents = b""
                        if binding.get("loadContents") or compute_checksum:
                            contents = f.read(CONTENT_LIMIT)
                        if binding.get("loadContents"):
                            files["contents"] = contents.decode("utf-8")
                        if compute_checksum:
                            checksum = hashlib.sha1()  # nosec: B303
                            while contents != b"":
                                checksum.update(contents)
                                contents = f.read(1024 * 1024)
                            files["checksum"] = f"sha1${checksum.hexdigest()}"
                        # size is obtained by seeking to the end of the stream
                        f.seek(0, 2)
                        file_size = f.tell()
                        files["size"] = file_size
        optional = False
        single = False
        if isinstance(schema["type"], list):
            # union type: may allow null and/or a single File/Directory
            if "null" in schema["type"]:
                optional = True
            if "File" in schema["type"] or "Directory" in schema["type"]:
                single = True
        elif schema["type"] == "File" or schema["type"] == "Directory":
            single = True
        if "outputEval" in binding:
            with SourceLine(binding, "outputEval", WorkflowException, debug):
                result = builder.do_eval(binding["outputEval"], context=result)
        if single:
            if not result and not optional:
                with SourceLine(binding, "glob", WorkflowException, debug):
                    raise WorkflowException(
                        f"Did not find output file with glob pattern: '{globpatterns}'"
                    )
            elif not result and optional:
                pass
            elif isinstance(result, list):
                if len(result) > 1:
                    raise WorkflowException(
                        "Multiple matches for output item that is a single file."
                    )
                result = result[0]
        if "secondaryFiles" in schema:
            with SourceLine(schema, "secondaryFiles", WorkflowException, debug):
                for primary in aslist(result):
                    if isinstance(primary, dict):
                        primary.setdefault("secondaryFiles", [])
                        # directory prefix of the primary path, trailing "/" kept
                        pathprefix = primary["path"][0:primary["path"].rindex("/") + 1]
                        for file in aslist(schema["secondaryFiles"]):
                            if isinstance(file, dict) or "$(" in file or "${" in file:
                                # expression/object form: evaluate against the primary
                                sfpath = builder.do_eval(file, context=primary)
                                subst = False
                            else:
                                # plain pattern: apply CWL suffix substitution rules
                                sfpath = file
                                subst = True
                            for sfitem in aslist(sfpath):
                                if isinstance(sfitem, str):
                                    if subst:
                                        sfitem = {"path": substitute(primary["path"], sfitem)}
                                    else:
                                        sfitem = {"path": pathprefix + sfitem}
                                if "path" in sfitem and "location" not in sfitem:
                                    revmap(sfitem)
                                if fs_access.isfile(sfitem["location"]):
                                    sfitem["class"] = "File"
                                    primary["secondaryFiles"].append(sfitem)
                                elif fs_access.isdir(sfitem["location"]):
                                    sfitem["class"] = "Directory"
                                    primary["secondaryFiles"].append(sfitem)
        if "format" in schema:
            for primary in aslist(result):
                primary["format"] = builder.do_eval(schema["format"], context=primary)
        # Ensure files point to local references outside of the run environment
        # TODO: Again removing revmap....
        # adjustFileObjs(result, revmap)
    if not result and optional:
        return None
    if not empty_and_optional and isinstance(
            schema["type"], dict) and schema["type"]["type"] == "record":
        # record output: collect each field independently
        out = {}
        for f in schema["type"]["fields"]:
            out[shortname(f["name"])] = self.collect_output(  # type: ignore
                f, builder, outdir, fs_access, compute_checksum=compute_checksum)
        return out
    return result