def validate_hints(self, avsc_names, hints, strict):
    # type: (Any, List[Dict[Text, Any]], bool) -> None
    """Check every hint against the CWL Avro schema.

    Hints with a recognized class are schema-validated (after stripping
    identifier fields); unknown hint classes are logged at INFO level but
    are not an error.
    """
    identifiers = self.doc_loader.identifiers
    for index, hint in enumerate(hints):
        srcline = SourceLine(hints, index, validate.ValidationException)
        with srcline:
            if avsc_names.get_name(hint["class"], "") is None:
                # Unrecognized hints are informational only, never fatal.
                _logger.info(srcline.makeError(u"Unknown hint %s" % (hint["class"])))
                continue
            # strip identifiers before validating against the schema
            stripped = {key: hint[key] for key in hint
                        if key not in identifiers}
            validate.validate_ex(
                avsc_names.get_name(stripped["class"], ""),
                stripped, strict=strict)
def job(self, joborder, output_callback, **kwargs):
    # type: (Dict[Text, Any], Callable[[Any, Any], Any], **Any) -> Generator
    """Generate runnable jobs for this workflow's steps until all complete.

    Yields jobs as they become ready, or None when no progress can be made
    right now (so the caller can wait on in-flight work).  Invokes
    output_callback via do_output_callback once everything has finished.
    """
    self.state = {}
    self.processStatus = "success"
    # "outdir" is a per-job notion; drop it before fanning kwargs out to steps.
    if "outdir" in kwargs:
        del kwargs["outdir"]
    # Seed workflow state from the input object, falling back to declared
    # defaults; a missing input with no default is a hard error.
    for e, i in enumerate(self.tool["inputs"]):
        with SourceLine(self.tool["inputs"], e, WorkflowException,
                        _logger.isEnabledFor(logging.DEBUG)):
            iid = shortname(i["id"])
            if iid in joborder:
                self.state[i["id"]] = WorkflowStateItem(
                    i, copy.deepcopy(joborder[iid]), "success")
            elif "default" in i:
                self.state[i["id"]] = WorkflowStateItem(
                    i, copy.deepcopy(i["default"]), "success")
            else:
                raise WorkflowException(
                    u"Input '%s' not in input object and does not have a default value." % (i["id"]))
    # All step outputs start unresolved.
    for s in self.steps:
        for out in s.tool["outputs"]:
            self.state[out["id"]] = None
    completed = 0
    while completed < len(self.steps):
        self.made_progress = False
        for step in self.steps:
            # Under on_error=stop, halt scheduling after the first failure.
            if kwargs.get(
                    "on_error", "stop") == "stop" and self.processStatus != "success":
                break
            if not step.submitted:
                try:
                    step.iterable = self.try_make_job(
                        step, output_callback, **kwargs)
                except WorkflowException as e:
                    _logger.error(u"[%s] Cannot make job: %s", step.name, e)
                    _logger.debug("", exc_info=True)
                    self.processStatus = "permanentFail"
            if step.iterable:
                try:
                    for newjob in step.iterable:
                        if kwargs.get(
                                "on_error", "stop"
                        ) == "stop" and self.processStatus != "success":
                            break
                        if newjob:
                            self.made_progress = True
                            yield newjob
                        else:
                            # Step has nothing ready right now; move on.
                            break
                except WorkflowException as e:
                    _logger.error(u"[%s] Cannot make job: %s", step.name, e)
                    _logger.debug("", exc_info=True)
                    self.processStatus = "permanentFail"
        completed = sum(1 for s in self.steps if s.completed)
        if not self.made_progress and completed < len(self.steps):
            if self.processStatus != "success":
                break
            else:
                # Stalled but not failed: let the caller wait for running jobs.
                yield None
    if not self.did_callback:
        self.do_output_callback(output_callback)
def run(self, runtimeContext):  # type: (RuntimeContext) -> None
    """Resolve the container image for this job and execute it.

    Determines the image id from DockerRequirement (honoring user-space
    docker implementations and default-container fallbacks), records
    provenance for the container when enabled, then builds the runtime
    command line and runs the job.
    """
    (docker_req, docker_is_req) = self.get_requirement("DockerRequirement")
    self.prov_obj = runtimeContext.prov_obj
    img_id = None
    env = cast(MutableMapping[Text, Text], os.environ)
    user_space_docker_cmd = runtimeContext.user_space_docker_cmd
    if docker_req and user_space_docker_cmd:
        # For user-space docker implementations, a local image name or ID
        # takes precedence over a network pull
        if 'dockerImageId' in docker_req:
            img_id = str(docker_req["dockerImageId"])
        elif 'dockerPull' in docker_req:
            img_id = str(docker_req["dockerPull"])
        else:
            raise WorkflowException(
                SourceLine(docker_req).makeError(
                    "Docker image must be specified as 'dockerImageId' or "
                    "'dockerPull' when using user space implementations of "
                    "Docker"))
    else:
        try:
            if docker_req and runtimeContext.use_container:
                img_id = str(
                    self.get_from_requirements(
                        docker_req, True, runtimeContext.pull_image,
                        getdefault(runtimeContext.force_docker_pull, False),
                        getdefault(runtimeContext.tmp_outdir_prefix,
                                   DEFAULT_TMP_PREFIX)))
            if img_id is None:
                # Fall back to a configured default container, if any.
                if self.builder.find_default_container:
                    default_container = self.builder.find_default_container()
                    if default_container:
                        img_id = str(default_container)
            if docker_req and img_id is None and runtimeContext.use_container:
                raise Exception("Docker image not available")
            if self.prov_obj and img_id and runtimeContext.process_run_id:
                # Record the container as a provenance software agent.
                # TODO: Integrate with record_container_id
                container_agent = self.prov_obj.document.agent(
                    uuid.uuid4().urn, {
                        "prov:type": PROV["SoftwareAgent"],
                        "cwlprov:image": img_id,
                        "prov:label": "Container execution of image %s" % img_id
                    })
                # FIXME: img_id is not a sha256 id, it might just be "debian:8"
                #img_entity = document.entity("nih:sha-256;%s" % img_id,
                #          {"prov:label": "Container image %s" % img_id} )
                # The image is the plan for this activity-agent association
                #document.wasAssociatedWith(process_run_ID, container_agent, img_entity)
                self.prov_obj.document.wasAssociatedWith(
                    runtimeContext.process_run_id, container_agent)
        except Exception as err:
            container = "Singularity" if runtimeContext.singularity else "Docker"
            _logger.debug("%s error", container, exc_info=True)
            if docker_is_req:
                raise UnsupportedRequirement(
                    "%s is required to run this tool: %s" % (container, err))
            else:
                raise WorkflowException(
                    "{0} is not available for this tool, try "
                    "--no-container to disable {0}, or install "
                    "a user space Docker replacement like uDocker with "
                    "--user-space-docker-cmd.: {1}".format(container, err))
    self._setup(runtimeContext)
    runtime = self.create_runtime(env, runtimeContext)
    # NOTE(review): img_id can still be None here (containers disabled and no
    # default container); presumably create_runtime/_execute tolerate a None
    # trailing argument -- confirm against callers.
    runtime.append(img_id)
    self._execute(runtime, env, runtimeContext)
def __init__(self, toolpath_object, **kwargs):
    # type: (Dict[Text, Any], **Any) -> None
    """Build a Process from a resolved CWL tool document.

    kwargs:

    metadata: tool document metadata
    requirements: inherited requirements
    hints: inherited hints
    loader: schema_salad.ref_resolver.Loader used to load tool document
    avsc_names: CWL Avro schema object used to validate document
    strict: flag to determine strict validation (fail on unrecognized fields)

    Raises validate.ValidationException on malformed inputs/outputs and
    WorkflowException (via checkRequirements) on unsupported requirements.
    """
    self.metadata = kwargs.get("metadata", {})  # type: Dict[Text,Any]
    self.names = None  # type: schema.Names

    # Lazily populate the module-level CWL v1.0 schema fragments on first use.
    global SCHEMA_FILE, SCHEMA_DIR, SCHEMA_ANY  # pylint: disable=global-statement
    if SCHEMA_FILE is None:
        get_schema("v1.0")
        SCHEMA_ANY = cast(
            Dict[Text, Any],
            SCHEMA_CACHE["v1.0"][3].idx["https://w3id.org/cwl/salad#Any"])
        SCHEMA_FILE = cast(
            Dict[Text, Any],
            SCHEMA_CACHE["v1.0"][3].idx["https://w3id.org/cwl/cwl#File"])
        SCHEMA_DIR = cast(
            Dict[Text, Any],
            SCHEMA_CACHE["v1.0"][3].idx["https://w3id.org/cwl/cwl#Directory"])

    names = schema.make_avro_schema([SCHEMA_FILE, SCHEMA_DIR, SCHEMA_ANY],
                                    Loader({}))[0]
    if isinstance(names, schema.SchemaParseException):
        raise names
    else:
        self.names = names
    self.tool = toolpath_object
    # Inherited requirements/hints come first so the tool's own entries and
    # command-line overrides can take precedence downstream.
    self.requirements = (kwargs.get("requirements", []) + self.tool.get(
        "requirements", []) + get_overrides(kwargs.get(
            "overrides", []), self.tool["id"]).get("requirements", []))
    self.hints = kwargs.get("hints", []) + self.tool.get("hints", [])
    self.formatgraph = None  # type: Graph
    if "loader" in kwargs:
        self.formatgraph = kwargs["loader"].graph

    self.doc_loader = kwargs["loader"]
    self.doc_schema = kwargs["avsc_names"]

    checkRequirements(self.tool, supportedProcessRequirements)
    self.validate_hints(kwargs["avsc_names"], self.tool.get("hints", []),
                        strict=kwargs.get("strict"))

    self.schemaDefs = {}  # type: Dict[Text,Dict[Text, Any]]

    sd, _ = self.get_requirement("SchemaDefRequirement")

    if sd:
        sdtypes = sd["types"]
        av = schema.make_valid_avro(
            sdtypes, {t["name"]: t for t in avroize_type(sdtypes)}, set())
        for i in av:
            self.schemaDefs[i["name"]] = i  # type: ignore
        schema.AvroSchemaFromJSONData(av, self.names)  # type: ignore

    # Build record schema from inputs
    self.inputs_record_schema = {
        "name": "input_record_schema",
        "type": "record",
        "fields": []
    }  # type: Dict[Text, Any]
    self.outputs_record_schema = {
        "name": "outputs_record_schema",
        "type": "record",
        "fields": []
    }  # type: Dict[Text, Any]

    for key in ("inputs", "outputs"):
        for i in self.tool[key]:
            c = copy.copy(i)
            c["name"] = shortname(c["id"])
            del c["id"]

            if "type" not in c:
                raise validate.ValidationException(u"Missing 'type' in "
                                                   "parameter '%s'" % c["name"])

            # A parameter with a default must accept null so the default
            # can be applied when the input is absent.
            # (Removed a no-op "else: c['type'] = c['type']" branch here.)
            if "default" in c and "null" not in aslist(c["type"]):
                c["type"] = ["null"] + aslist(c["type"])
            c["type"] = avroize_type(c["type"], c["name"])
            if key == "inputs":
                self.inputs_record_schema["fields"].append(c)
            elif key == "outputs":
                self.outputs_record_schema["fields"].append(c)

    with SourceLine(toolpath_object, "inputs", validate.ValidationException):
        self.inputs_record_schema = cast(
            Dict[six.text_type, Any],
            schema.make_valid_avro(self.inputs_record_schema, {}, set()))
        schema.AvroSchemaFromJSONData(self.inputs_record_schema, self.names)
    with SourceLine(toolpath_object, "outputs", validate.ValidationException):
        self.outputs_record_schema = cast(
            Dict[six.text_type, Any],
            schema.make_valid_avro(self.outputs_record_schema, {}, set()))
        schema.AvroSchemaFromJSONData(self.outputs_record_schema, self.names)

    if toolpath_object.get("class") is not None and not kwargs.get(
            "disable_js_validation", False):
        if kwargs.get("js_hint_options_file") is not None:
            try:
                with open(kwargs["js_hint_options_file"]) as options_file:
                    validate_js_options = json.load(options_file)
            except (OSError, ValueError):
                # Use lazy %-args and a bare raise to keep the original
                # traceback intact.
                _logger.error("Failed to read options file %s",
                              kwargs["js_hint_options_file"])
                raise
        else:
            validate_js_options = None

        validate_js_expressions(
            cast(CommentedMap, toolpath_object),
            self.doc_schema.names[toolpath_object["class"]],
            validate_js_options)

    dockerReq, is_req = self.get_requirement("DockerRequirement")

    if dockerReq and dockerReq.get("dockerOutputDirectory") and not is_req:
        # logging.Logger.warn is a deprecated alias; use warning().
        _logger.warning(
            SourceLine(item=dockerReq, raise_type=Text).makeError(
                """When 'dockerOutputDirectory' is declared, DockerRequirement should go in the 'requirements' section, not 'hints'."""))

    if dockerReq and dockerReq.get(
            "dockerOutputDirectory") == "/var/spool/cwl":
        if is_req:
            # In this specific case, it is legal to have /var/spool/cwl, so skip the check.
            pass
        else:
            # Must be a requirement
            var_spool_cwl_detector(self.tool)
    else:
        var_spool_cwl_detector(self.tool)
def collect_output_ports(self, ports, builder, outdir, compute_checksum=True,
                         jobname="", readers=None):
    # type: (Set[Dict[Text, Any]], Builder, Text, bool, Text, Dict[Text, Any]) -> Dict[Text, Union[Text, List[Any], Dict[Text, Any]]]
    """Collect, normalize and validate the output object for this job.

    If the tool wrote a cwl.output.json into outdir, that object is used
    verbatim; otherwise each output port is evaluated via collect_output.
    Always releases any mutation-manager readers in the finally block.
    """
    ret = {}  # type: Dict[Text, Union[Text, List[Any], Dict[Text, Any]]]
    debug = _logger.isEnabledFor(logging.DEBUG)
    try:
        fs_access = builder.make_fs_access(outdir)
        custom_output = fs_access.join(outdir, "cwl.output.json")
        if fs_access.exists(custom_output):
            # Tool provided its own output object; take it as-is.
            with fs_access.open(custom_output, "r") as f:
                ret = json.load(f)
            if debug:
                _logger.debug(u"Raw output from %s: %s", custom_output,
                              json.dumps(ret, indent=4))
        else:
            for i, port in enumerate(ports):

                # Redefined per iteration so it closes over this port and is
                # only used within the same iteration.
                def makeWorkflowException(msg):
                    return WorkflowException(
                        u"Error collecting output for parameter '%s':\n%s"
                        % (shortname(port["id"]), msg))

                with SourceLine(ports, i, makeWorkflowException, debug):
                    fragment = shortname(port["id"])
                    ret[fragment] = self.collect_output(
                        port, builder, outdir, fs_access,
                        compute_checksum=compute_checksum)
        if ret:
            # Map paths back out of the container and normalize the
            # File/Directory objects before validation.
            revmap = partial(revmap_file, builder, outdir)
            adjustDirObjs(ret, trim_listing)
            visit_class(ret, ("File", "Directory"),
                        cast(Callable[[Any], Any], revmap))
            visit_class(ret, ("File", "Directory"), remove_path)
            normalizeFilesDirs(ret)
            visit_class(ret, ("File", "Directory"),
                        partial(check_valid_locations, fs_access))
            if compute_checksum:
                adjustFileObjs(ret, partial(compute_checksums, fs_access))

        validate.validate_ex(self.names.get_name("outputs_record_schema", ""),
                             ret, strict=False,
                             logger=_logger_validation_warnings)
        if ret is not None and builder.mutation_manager is not None:
            adjustFileObjs(ret, builder.mutation_manager.set_generation)
        return ret if ret is not None else {}
    except validate.ValidationException as e:
        raise WorkflowException(
            "Error validating output record. " + Text(e) + "\n in "
            + json.dumps(ret, indent=4))
    finally:
        if builder.mutation_manager and readers:
            for r in readers.values():
                builder.mutation_manager.release_reader(jobname, r)
def __init__(self,
             toolpath_object,  # type: Dict[Text, Any]
             pos,  # type: int
             loadingContext,  # type: LoadingContext
             parentworkflowProv=None  # type: Optional[ProvenanceProfile]
             ):  # type: (...) -> None
    """Initialize a workflow step: load its tool and bind in/out parameters.

    Loads (or constructs) the embedded tool for "run", matches each entry of
    the step's "in"/"out" against the tool's inputs/outputs, applies scatter
    typing, and records provenance linkage.
    """
    if "id" in toolpath_object:
        self.id = toolpath_object["id"]
    else:
        self.id = "#step" + Text(pos)
    # Work on a private copy of the context so inherited requirements/hints
    # can be extended without affecting siblings.
    loadingContext = loadingContext.copy()
    loadingContext.requirements = copy.deepcopy(
        getdefault(loadingContext.requirements, []))
    assert loadingContext.requirements is not None
    loadingContext.requirements.extend(
        toolpath_object.get("requirements", []))
    loadingContext.requirements.extend(
        get_overrides(getdefault(loadingContext.overrides_list, []),
                      self.id).get("requirements", []))
    loadingContext.hints = copy.deepcopy(
        getdefault(loadingContext.hints, []))
    loadingContext.hints.extend(toolpath_object.get("hints", []))
    try:
        if isinstance(toolpath_object["run"], MutableMapping):
            # Inline (embedded) tool document.
            self.embedded_tool = loadingContext.construct_tool_object(
                toolpath_object["run"], loadingContext)  # type: Process
        else:
            # "run" is a reference to an external document.
            self.embedded_tool = load_tool(toolpath_object["run"],
                                           loadingContext)
    except validate.ValidationException as vexc:
        if loadingContext.debug:
            _logger.exception("Validation exception")
        raise WorkflowException(
            u"Tool definition %s failed validation:\n%s" %
            (toolpath_object["run"], validate.indent(str(vexc))))
    validation_errors = []
    self.tool = toolpath_object = copy.deepcopy(toolpath_object)
    bound = set()
    # Match each step "in"/"out" entry against the embedded tool's
    # inputs/outputs, merging tool metadata into the step parameter.
    for stepfield, toolfield in (("in", "inputs"), ("out", "outputs")):
        toolpath_object[toolfield] = []
        for index, step_entry in enumerate(toolpath_object[stepfield]):
            if isinstance(step_entry, string_types):
                param = CommentedMap()  # type: CommentedMap
                inputid = step_entry
            else:
                param = CommentedMap(iteritems(step_entry))
                inputid = step_entry["id"]
            shortinputid = shortname(inputid)
            found = False
            for tool_entry in self.embedded_tool.tool[toolfield]:
                frag = shortname(tool_entry["id"])
                if frag == shortinputid:
                    # if the case that the step has a default for a parameter,
                    # we do not want the default of the tool to override it
                    step_default = None
                    if "default" in param and "default" in tool_entry:
                        step_default = param["default"]
                    param.update(tool_entry)
                    param["_tool_entry"] = tool_entry
                    if step_default is not None:
                        param["default"] = step_default
                    found = True
                    bound.add(frag)
                    break
            if not found:
                if stepfield == "in":
                    # Unmatched inputs are tolerated but flagged.
                    param["type"] = "Any"
                    param["not_connected"] = True
                else:
                    validation_errors.append(
                        SourceLine(self.tool["out"], index).makeError(
                            "Workflow step output '%s' does not correspond to"
                            % shortname(step_entry))
                        + "\n" + SourceLine(self.embedded_tool.tool, "outputs").
                        makeError(" tool output (expected '%s')" % (
                            "', '".join([
                                shortname(tool_entry["id"])
                                for tool_entry in
                                self.embedded_tool.tool[toolfield]]))))
            param["id"] = inputid
            # Preserve the original source position for error reporting.
            param.lc.line = toolpath_object[stepfield].lc.data[index][0]
            param.lc.col = toolpath_object[stepfield].lc.data[index][1]
            param.lc.filename = toolpath_object[stepfield].lc.filename
            toolpath_object[toolfield].append(param)
    missing_values = []
    # Required tool inputs (non-null type, no default) must be bound.
    for _, tool_entry in enumerate(self.embedded_tool.tool["inputs"]):
        if shortname(tool_entry["id"]) not in bound:
            if "null" not in tool_entry[
                    "type"] and "default" not in tool_entry:
                missing_values.append(shortname(tool_entry["id"]))
    if missing_values:
        validation_errors.append(
            SourceLine(self.tool, "in").makeError(
                "Step is missing required parameter%s '%s'" %
                ("s" if len(missing_values) > 1 else "",
                 "', '".join(missing_values))))
    if validation_errors:
        raise validate.ValidationException("\n".join(validation_errors))
    super(WorkflowStep, self).__init__(toolpath_object, loadingContext)
    if self.embedded_tool.tool["class"] == "Workflow":
        (feature, _) = self.get_requirement("SubworkflowFeatureRequirement")
        if not feature:
            raise WorkflowException(
                "Workflow contains embedded workflow but "
                "SubworkflowFeatureRequirement not in requirements")
    if "scatter" in self.tool:
        (feature, _) = self.get_requirement("ScatterFeatureRequirement")
        if not feature:
            raise WorkflowException(
                "Workflow contains scatter but ScatterFeatureRequirement "
                "not in requirements")
        inputparms = copy.deepcopy(self.tool["inputs"])
        outputparms = copy.deepcopy(self.tool["outputs"])
        scatter = aslist(self.tool["scatter"])
        method = self.tool.get("scatterMethod")
        if method is None and len(scatter) != 1:
            raise validate.ValidationException(
                "Must specify scatterMethod when scattering over multiple inputs"
            )
        inp_map = {i["id"]: i for i in inputparms}
        # Each scattered input becomes an array of its original type.
        for inp in scatter:
            if inp not in inp_map:
                raise validate.ValidationException(
                    SourceLine(self.tool, "scatter").makeError(
                        "Scatter parameter '%s' does not correspond to "
                        "an input parameter of this step, expecting '%s'"
                        % (shortname(inp), "', '".join(
                            shortname(k) for k in inp_map.keys()))))
            inp_map[inp]["type"] = {
                "type": "array",
                "items": inp_map[inp]["type"]
            }
        # nested_crossproduct wraps outputs once per scatter dimension;
        # all other methods produce a single level of nesting.
        if self.tool.get("scatterMethod") == "nested_crossproduct":
            nesting = len(scatter)
        else:
            nesting = 1
        for _ in range(0, nesting):
            for oparam in outputparms:
                oparam["type"] = {"type": "array", "items": oparam["type"]}
        self.tool["inputs"] = inputparms
        self.tool["outputs"] = outputparms
    self.prov_obj = None  # type: Optional[ProvenanceProfile]
    if loadingContext.research_obj is not None:
        self.prov_obj = parentworkflowProv
        if self.embedded_tool.tool["class"] == "Workflow":
            self.parent_wf = self.embedded_tool.parent_wf
        else:
            self.parent_wf = self.prov_obj
def get_image(
        dockerRequirement,  # type: Dict[Text, Text]
        pull_image,  # type: bool
        force_pull=False  # type: bool
):  # type: (...) -> bool
    """
    Acquire the software container image in the specified dockerRequirement
    using Singularity and returns the success as a bool.

    Updates the provided dockerRequirement with the specific dockerImageId to
    the full path of the local image, if found. Likewise the
    dockerRequirement['dockerPull'] is updated to a docker:// URI if needed.
    """
    found = False
    candidates = []
    # Derive candidate local filenames from dockerPull / dockerImageId.
    if "dockerImageId" not in dockerRequirement and "dockerPull" in dockerRequirement:
        match = re.search(pattern=r'([a-z]*://)',
                          string=dockerRequirement["dockerPull"])
        candidate = _normalize_image_id(dockerRequirement['dockerPull'])
        candidates.append(candidate)
        dockerRequirement['dockerImageId'] = candidate
        if not match:
            # No URI scheme present: assume a Docker registry reference.
            dockerRequirement[
                "dockerPull"] = "docker://" + dockerRequirement[
                    "dockerPull"]
    elif "dockerImageId" in dockerRequirement:
        candidates.append(dockerRequirement['dockerImageId'])
        candidates.append(
            _normalize_image_id(dockerRequirement['dockerImageId']))
    # check if Singularity image is available in $SINGULARITY_CACHEDIR
    targets = [os.getcwd()]
    for env in ("SINGULARITY_CACHEDIR", "SINGULARITY_PULLFOLDER"):
        if env in os.environ:
            targets.append(os.environ[env])
    # NOTE(review): no break after a hit, so when an image exists in several
    # targets the LAST match (e.g. cache dir, not cwd) wins and the info
    # message is logged once per match -- confirm this precedence is intended.
    for target in targets:
        for candidate in candidates:
            path = os.path.join(target, candidate)
            if os.path.isfile(path):
                _logger.info(
                    "Using local copy of Singularity image found in %s",
                    target)
                dockerRequirement["dockerImageId"] = path
                found = True
    if (force_pull or not found) and pull_image:
        cmd = []  # type: List[Text]
        if "dockerPull" in dockerRequirement:
            cmd = [
                "singularity", "pull", "--force", "--name",
                str(dockerRequirement["dockerImageId"]),
                str(dockerRequirement["dockerPull"])
            ]
            _logger.info(Text(cmd))
            check_call(cmd, stdout=sys.stderr)
            found = True
        elif "dockerFile" in dockerRequirement:
            raise WorkflowException(
                SourceLine(dockerRequirement, 'dockerFile').makeError(
                    "dockerFile is not currently supported when using the "
                    "Singularity runtime for Docker containers."))
        elif "dockerLoad" in dockerRequirement:
            raise WorkflowException(
                SourceLine(dockerRequirement, 'dockerLoad').makeError(
                    "dockerLoad is not currently supported when using the "
                    "Singularity runtime for Docker containers."))
        elif "dockerImport" in dockerRequirement:
            raise WorkflowException(
                SourceLine(dockerRequirement, 'dockerImport').makeError(
                    "dockerImport is not currently supported when using the "
                    "Singularity runtime for Docker containers."))
    return found
def run(
    self,
    runtimeContext: RuntimeContext,
    tmpdir_lock: Optional[threading.Lock] = None,
) -> None:
    """Resolve the container image, then build and execute the job.

    Optionally serializes tmpdir creation under tmpdir_lock, pre-pulls
    images for user-space docker implementations, records container
    provenance, and wires up a docker/process monitor before executing.
    """
    if tmpdir_lock:
        with tmpdir_lock:
            if not os.path.exists(self.tmpdir):
                os.makedirs(self.tmpdir)
    else:
        if not os.path.exists(self.tmpdir):
            os.makedirs(self.tmpdir)
    (docker_req, docker_is_req) = self.get_requirement("DockerRequirement")
    self.prov_obj = runtimeContext.prov_obj
    img_id = None
    env = cast(MutableMapping[str, str], os.environ)
    user_space_docker_cmd = runtimeContext.user_space_docker_cmd
    if docker_req is not None and user_space_docker_cmd:
        # For user-space docker implementations, a local image name or ID
        # takes precedence over a network pull
        if "dockerImageId" in docker_req:
            img_id = str(docker_req["dockerImageId"])
        elif "dockerPull" in docker_req:
            img_id = str(docker_req["dockerPull"])
            cmd = [user_space_docker_cmd, "pull", img_id]
            _logger.info(str(cmd))
            try:
                subprocess.check_call(cmd, stdout=sys.stderr)  # nosec
            except OSError:
                raise WorkflowException(
                    SourceLine(docker_req).makeError(
                        "Either Docker container {} is not available with "
                        "user space docker implementation {} or {} is missing "
                        "or broken.".format(img_id, user_space_docker_cmd,
                                            user_space_docker_cmd)))
        else:
            raise WorkflowException(
                SourceLine(docker_req).makeError(
                    "Docker image must be specified as 'dockerImageId' or "
                    "'dockerPull' when using user space implementations of "
                    "Docker"))
    else:
        try:
            if docker_req is not None and runtimeContext.use_container:
                img_id = str(
                    self.get_from_requirements(
                        docker_req,
                        runtimeContext.pull_image,
                        getdefault(runtimeContext.force_docker_pull, False),
                        getdefault(runtimeContext.tmp_outdir_prefix,
                                   DEFAULT_TMP_PREFIX),
                    ))
            if img_id is None:
                # Fall back to a configured default container, if any.
                if self.builder.find_default_container:
                    default_container = self.builder.find_default_container(
                    )
                    if default_container:
                        img_id = str(default_container)
            if (docker_req is not None and img_id is None
                    and runtimeContext.use_container):
                raise Exception("Docker image not available")
            if (self.prov_obj is not None and img_id is not None
                    and runtimeContext.process_run_id is not None):
                # Record the container as a provenance software agent.
                container_agent = self.prov_obj.document.agent(
                    uuid.uuid4().urn,
                    {
                        "prov:type": PROV["SoftwareAgent"],
                        "cwlprov:image": img_id,
                        "prov:label": "Container execution of image %s"
                                      % img_id,
                    },
                )
                # FIXME: img_id is not a sha256 id, it might just be "debian:8"
                # img_entity = document.entity("nih:sha-256;%s" % img_id,
                #                  {"prov:label": "Container image %s" % img_id} )
                # The image is the plan for this activity-agent association
                # document.wasAssociatedWith(process_run_ID, container_agent, img_entity)
                self.prov_obj.document.wasAssociatedWith(
                    runtimeContext.process_run_id, container_agent)
        except Exception as err:
            container = "Singularity" if runtimeContext.singularity else "Docker"
            _logger.debug("%s error", container, exc_info=True)
            if docker_is_req:
                raise UnsupportedRequirement(
                    "%s is required to run this tool: %s"
                    % (container, str(err))) from err
            else:
                raise WorkflowException(
                    "{0} is not available for this tool, try "
                    "--no-container to disable {0}, or install "
                    "a user space Docker replacement like uDocker with "
                    "--user-space-docker-cmd.: {1}".format(container, err))
    self._setup(runtimeContext)
    (runtime, cidfile) = self.create_runtime(env, runtimeContext)
    runtime.append(str(img_id))
    # Choose a monitor: docker cid-file based when available, otherwise a
    # plain process monitor for user-space docker implementations.
    monitor_function = None
    if cidfile:
        monitor_function = functools.partial(
            self.docker_monitor,
            cidfile,
            runtimeContext.tmpdir_prefix,
            not bool(runtimeContext.cidfile_dir),
        )
    elif runtimeContext.user_space_docker_cmd:
        monitor_function = functools.partial(self.process_monitor)
    self._execute(runtime, env, runtimeContext, monitor_function)
def validate_document(
        document_loader,  # type: Loader
        workflowobj,  # type: CommentedMap
        uri,  # type: Text
        overrides,  # type: List[Dict]
        metadata,  # type: Dict[Text, Any]
        enable_dev=False,  # type: bool
        strict=True,  # type: bool
        preprocess_only=False,  # type: bool
        fetcher_constructor=None,  # type: FetcherConstructorType
        skip_schemas=None,  # type: bool
        do_validate=True  # type: bool
):
    # type: (...) -> Tuple[Loader, schema.Names, Union[Dict[Text, Any], List[Dict[Text, Any]]], Dict[Text, Any], Text]
    """Validate a CWL document."""
    if isinstance(workflowobj, MutableSequence):
        # Wrap a bare list of processes in a $graph document.
        workflowobj = cmap({"$graph": workflowobj}, fn=uri)
    if not isinstance(workflowobj, MutableMapping):
        raise ValueError("workflowjobj must be a dict, got '{}': {}".format(
            type(workflowobj), workflowobj))
    jobobj = None
    if "cwl:tool" in workflowobj:
        # Document is a job order that points at its tool via cwl:tool;
        # split the job object out and re-fetch the tool itself.
        job_loader = default_loader(fetcher_constructor)  # type: ignore
        jobobj, _ = job_loader.resolve_all(workflowobj, uri,
                                           checklinks=do_validate)
        uri = urllib.parse.urljoin(
            uri, workflowobj["https://w3id.org/cwl/cwl#tool"])
        del cast(dict, jobobj)["https://w3id.org/cwl/cwl#tool"]
        if isinstance(jobobj, CommentedMap
                      ) and "http://commonwl.org/cwltool#overrides" in jobobj:
            overrides.extend(resolve_overrides(jobobj, uri, uri))
            del jobobj["http://commonwl.org/cwltool#overrides"]
        workflowobj = fetch_document(
            uri, fetcher_constructor=fetcher_constructor)[1]
    fileuri = urllib.parse.urldefrag(uri)[0]
    if "cwlVersion" not in workflowobj:
        if 'cwlVersion' in metadata:
            workflowobj['cwlVersion'] = metadata['cwlVersion']
        else:
            raise ValidationException(
                "No cwlVersion found. "
                "Use the following syntax in your CWL document to declare "
                "the version: cwlVersion: <version>.\n"
                "Note: if this is a CWL draft-2 (pre v1.0) document then it "
                "will need to be upgraded first.")
    if not isinstance(workflowobj["cwlVersion"], string_types):
        with SourceLine(workflowobj, "cwlVersion", ValidationException):
            raise ValidationException("'cwlVersion' must be a string, "
                                      "got {}".format(
                                          type(workflowobj["cwlVersion"])))
    # strip out version
    workflowobj["cwlVersion"] = re.sub(
        r"^(?:cwl:|https://w3id.org/cwl/cwl#)", "",
        workflowobj["cwlVersion"])
    if workflowobj["cwlVersion"] not in list(ALLUPDATES):
        # print out all the Supported Versions of cwlVersion
        versions = []
        for version in list(ALLUPDATES):
            if "dev" in version:
                version += " (with --enable-dev flag only)"
            versions.append(version)
        versions.sort()
        raise ValidationException(
            "The CWL reference runner no longer supports pre CWL v1.0 "
            "documents. Supported versions are: "
            "\n{}".format("\n".join(versions)))
    (sch_document_loader, avsc_names) = \
        process.get_schema(workflowobj["cwlVersion"])[:2]
    if isinstance(avsc_names, Exception):
        raise avsc_names
    processobj = None  # type: Union[CommentedMap, CommentedSeq, Text, None]
    # Rebuild the loader on top of the version-specific schema context,
    # keeping the already-populated index.
    document_loader = Loader(sch_document_loader.ctx,
                             schemagraph=sch_document_loader.graph,
                             idx=document_loader.idx,
                             cache=sch_document_loader.cache,
                             fetcher_constructor=fetcher_constructor,
                             skip_schemas=skip_schemas)
    _add_blank_ids(workflowobj)
    workflowobj["id"] = fileuri
    processobj, new_metadata = document_loader.resolve_all(
        workflowobj, fileuri, checklinks=do_validate)
    if not isinstance(processobj, (CommentedMap, CommentedSeq)):
        raise ValidationException("Workflow must be a dict or list.")
    if not new_metadata and isinstance(processobj, CommentedMap):
        # Synthesize minimal metadata when the loader produced none.
        new_metadata = cast(
            CommentedMap,
            cmap(
                {
                    "$namespaces": processobj.get("$namespaces", {}),
                    "$schemas": processobj.get("$schemas", []),
                    "cwlVersion": processobj["cwlVersion"]
                },
                fn=fileuri))
    _convert_stdstreams_to_files(workflowobj)
    if preprocess_only:
        return document_loader, avsc_names, processobj, new_metadata, uri
    if do_validate:
        schema.validate_doc(avsc_names, processobj, document_loader, strict)
    # Upgrade older (but still supported) documents to the latest version.
    if new_metadata.get("cwlVersion") != update.LATEST:
        processobj = cast(
            CommentedMap,
            cmap(
                update.update(processobj, document_loader, fileuri,
                              enable_dev, new_metadata)))
    if jobobj is not None:
        new_metadata[u"cwl:defaults"] = jobobj
    if overrides:
        new_metadata[u"cwltool:overrides"] = overrides
    return document_loader, avsc_names, processobj, new_metadata, uri
def job(self, joborder, output_callback, **kwargs):
    """Run this workflow, optionally packing it into a single container.

    With the RunInSingleContainer hint, the workflow is packed, uploaded to
    a Keep collection, and wrapped in a synthetic CommandLineTool that runs
    cwltool inside one container; otherwise defers to the normal workflow
    execution path.
    """
    kwargs["work_api"] = self.work_api
    req, _ = self.get_requirement(
        "http://arvados.org/cwl#RunInSingleContainer")
    if req:
        with SourceLine(self.tool, None, WorkflowException,
                        logger.isEnabledFor(logging.DEBUG)):
            if "id" not in self.tool:
                raise WorkflowException("%s object must have 'id'" %
                                        (self.tool["class"]))
        document_loader, workflowobj, uri = (self.doc_loader,
                                             self.doc_loader.fetch(
                                                 self.tool["id"]),
                                             self.tool["id"])
        with Perf(metrics, "subworkflow upload_deps"):
            upload_dependencies(self.arvrunner,
                                os.path.basename(joborder.get("id", "#")),
                                document_loader, joborder,
                                joborder.get("id", "#"), False)
            if self.wf_pdh is None:
                # First run: pack the workflow and upload its dependencies.
                workflowobj["requirements"] = dedup_reqs(self.requirements)
                workflowobj["hints"] = dedup_reqs(self.hints)
                packed = pack(document_loader, workflowobj, uri,
                              self.metadata)
                upload_dependencies(self.arvrunner, kwargs.get("name", ""),
                                    document_loader, packed, uri, False)
        with Perf(metrics, "subworkflow adjust"):
            # Two views of the job order: one with local paths resolved,
            # one with Keep mount paths for use inside the container.
            joborder_resolved = copy.deepcopy(joborder)
            joborder_keepmount = copy.deepcopy(joborder)
            reffiles = []
            visit_class(joborder_keepmount, ("File", "Directory"),
                        lambda x: reffiles.append(x))
            mapper = ArvPathMapper(self.arvrunner, reffiles,
                                   kwargs["basedir"], "/keep/%s",
                                   "/keep/%s/%s", **kwargs)

            def keepmount(obj):
                # Rewrite locations to /keep/... mount paths.
                remove_redundant_fields(obj)
                with SourceLine(obj, None, WorkflowException,
                                logger.isEnabledFor(logging.DEBUG)):
                    if "location" not in obj:
                        raise WorkflowException(
                            "%s object is missing required 'location' field: %s"
                            % (obj["class"], obj))
                with SourceLine(obj, "location", WorkflowException,
                                logger.isEnabledFor(logging.DEBUG)):
                    if obj["location"].startswith("keep:"):
                        obj["location"] = mapper.mapper(
                            obj["location"]).target
                        if "listing" in obj:
                            del obj["listing"]
                    elif obj["location"].startswith("_:"):
                        # Literal content: no location needed.
                        del obj["location"]
                    else:
                        raise WorkflowException(
                            "Location is not a keep reference or a literal: '%s'"
                            % obj["location"])

            visit_class(joborder_keepmount, ("File", "Directory"), keepmount)

            def resolved(obj):
                if obj["location"].startswith("keep:"):
                    obj["location"] = mapper.mapper(
                        obj["location"]).resolved

            visit_class(joborder_resolved, ("File", "Directory"), resolved)
            if self.wf_pdh is None:
                adjustFileObjs(packed, keepmount)
                adjustDirObjs(packed, keepmount)
                self.wf_pdh = upload_workflow_collection(
                    self.arvrunner, shortname(self.tool["id"]), packed)
        # Synthetic tool that runs cwltool over the packed workflow inside
        # one container; input is staged as cwl.input.yml.
        # NOTE(review): '\$(' and '\${' below are invalid escape sequences
        # (DeprecationWarning on Python 3); they evaluate to literal
        # backslash-dollar, which appears intended -- confirm and consider
        # raw strings upstream.
        wf_runner = cmap({
            "class": "CommandLineTool",
            "baseCommand": "cwltool",
            "inputs": self.tool["inputs"],
            "outputs": self.tool["outputs"],
            "stdout": "cwl.output.json",
            "requirements": workflowobj["requirements"] + [{
                "class": "InitialWorkDirRequirement",
                "listing": [{
                    "entryname": "workflow.cwl",
                    "entry": {
                        "class": "File",
                        "location": "keep:%s/workflow.cwl" % self.wf_pdh
                    }
                }, {
                    "entryname": "cwl.input.yml",
                    "entry": json.dumps(joborder_keepmount,
                                        indent=2,
                                        sort_keys=True,
                                        separators=(',', ': ')).replace(
                                            "\\", "\\\\").replace(
                                                '$(', '\$(').replace(
                                                    '${', '\${')
                }]
            }],
            "hints": workflowobj["hints"],
            "arguments": [
                "--no-container", "--move-outputs",
                "--preserve-entire-environment", "workflow.cwl#main",
                "cwl.input.yml"
            ]
        })
        kwargs["loader"] = self.doc_loader
        kwargs["avsc_names"] = self.doc_schema
        return ArvadosCommandTool(self.arvrunner, wf_runner,
                                  **kwargs).job(joborder_resolved,
                                                output_callback, **kwargs)
    else:
        return super(ArvadosWorkflow, self).job(joborder, output_callback,
                                                **kwargs)
def _convert_stdstreams_to_files(workflowobj):
    # type: (Union[Dict[str, Any], List[Dict[str, Any]]]) -> None
    """Rewrite stdout/stderr/stdin type shortcuts into explicit File params.

    Recurses through the document; for every CommandLineTool, outputs of
    type stdout/stderr get a File type plus a glob outputBinding (inventing
    a deterministic filename when none is given), and stdin-typed inputs
    become File inputs wired to the tool's "stdin" field.
    """
    if isinstance(workflowobj, MutableMapping):
        if workflowobj.get("class") != "CommandLineTool":
            # Not a tool: descend into every nested value.
            for child in workflowobj.values():
                _convert_stdstreams_to_files(child)
        else:
            with SourceLine(
                    workflowobj,
                    "outputs",
                    ValidationException,
                    _logger.isEnabledFor(logging.DEBUG),
            ):
                outputs = workflowobj.get("outputs", [])
                if not isinstance(outputs, CommentedSeq):
                    raise ValidationException('"outputs" section is not '
                                              "valid.")
                for out in workflowobj.get("outputs", []):
                    if not isinstance(out, CommentedMap):
                        raise ValidationException(
                            "Output '{}' is not a valid "
                            "OutputParameter.".format(out))
                    for streamtype in ["stdout", "stderr"]:
                        if out.get("type") != streamtype:
                            continue
                        if "outputBinding" in out:
                            raise ValidationException(
                                "Not allowed to specify outputBinding when"
                                " using %s shortcut." % streamtype)
                        if streamtype in workflowobj:
                            filename = workflowobj[streamtype]
                        else:
                            # Deterministic name derived from the tool body.
                            filename = str(
                                hashlib.sha1(  # nosec
                                    json_dumps(workflowobj,
                                               sort_keys=True).encode(
                                                   "utf-8")).hexdigest())
                            workflowobj[streamtype] = filename
                        out["type"] = "File"
                        out["outputBinding"] = cmap({"glob": filename})
            for inp in workflowobj.get("inputs", []):
                if inp.get("type") != "stdin":
                    continue
                if "inputBinding" in inp:
                    raise ValidationException(
                        "Not allowed to specify inputBinding when"
                        " using stdin shortcut.")
                if "stdin" in workflowobj:
                    raise ValidationException(
                        "Not allowed to specify stdin path when"
                        " using stdin type shortcut.")
                workflowobj["stdin"] = (
                    "$(inputs.%s.path)" % inp["id"].rpartition("#")[2])
                inp["type"] = "File"
    if isinstance(workflowobj, MutableSequence):
        for child in workflowobj:
            _convert_stdstreams_to_files(child)
def collect_output(
        self,
        schema,  # type: Dict[str, Any]
        builder,  # type: Builder
        outdir,  # type: str
        fs_access,  # type: StdFsAccess
        compute_checksum=True,  # type: bool
):
    # type: (...) -> Optional[Union[Dict[str, Any], List[Union[Dict[str, Any], str]]]]
    """Collect the value of a single output parameter after a tool run.

    Evaluates the parameter's outputBinding (glob, loadContents,
    outputEval), attaches size/checksum metadata and secondaryFiles, and
    recurses into record-typed outputs field by field.  Returns the
    collected value, or None for a missing optional output.
    """
    r = []  # type: List[Any]
    empty_and_optional = False
    debug = _logger.isEnabledFor(logging.DEBUG)
    if "outputBinding" in schema:
        binding = schema["outputBinding"]
        globpatterns = []  # type: List[str]

        # revmap rewrites container-internal paths back to host paths.
        revmap = partial(revmap_file, builder, outdir)

        if "glob" in binding:
            with SourceLine(binding, "glob", WorkflowException, debug):
                for gb in aslist(binding["glob"]):
                    gb = builder.do_eval(gb)
                    if gb:
                        globpatterns.extend(aslist(gb))

                for gb in globpatterns:
                    if gb.startswith(builder.outdir):
                        # Strip the container outdir prefix so the glob is
                        # relative to the real output directory.
                        gb = gb[len(builder.outdir) + 1:]
                    elif gb == ".":
                        gb = outdir
                    elif gb.startswith("/"):
                        raise WorkflowException(
                            "glob patterns must not start with '/'")
                    try:
                        prefix = fs_access.glob(outdir)
                        # Build a File/Directory object per match, sorted
                        # with locale-aware collation for stable ordering.
                        r.extend([{
                            "location": g,
                            "path": fs_access.join(
                                builder.outdir, g[len(prefix[0]) + 1:]),
                            "basename": os.path.basename(g),
                            "nameroot": os.path.splitext(
                                os.path.basename(g))[0],
                            "nameext": os.path.splitext(
                                os.path.basename(g))[1],
                            "class": "File" if fs_access.isfile(g)
                            else "Directory",
                        } for g in sorted(
                            fs_access.glob(fs_access.join(outdir, gb)),
                            key=cmp_to_key(
                                cast(
                                    Callable[[str, str], int],
                                    locale.strcoll,
                                )),
                        )])
                    except (OSError, IOError) as e:
                        # A failed glob is non-fatal; just report it.
                        _logger.warning(str(e))
                    except Exception:
                        _logger.error("Unexpected error from fs_access",
                                      exc_info=True)
                        raise

        for files in r:
            rfile = files.copy()
            revmap(rfile)
            if files["class"] == "Directory":
                ll = schema.get("loadListing") or builder.loadListing
                if ll and ll != "no_listing":
                    get_listing(fs_access, files, (ll == "deep_listing"))
            else:
                if binding.get("loadContents"):
                    with fs_access.open(rfile["location"], "rb") as f:
                        files[
                            "contents"] = content_limit_respected_read_bytes(
                                f).decode("utf-8")
                if compute_checksum:
                    # Stream the file in 1 MiB chunks to bound memory use.
                    with fs_access.open(rfile["location"], "rb") as f:
                        checksum = hashlib.sha1()  # nosec
                        contents = f.read(1024 * 1024)
                        while contents != b"":
                            checksum.update(contents)
                            contents = f.read(1024 * 1024)
                        files[
                            "checksum"] = "sha1$%s" % checksum.hexdigest(
                            )
                files["size"] = fs_access.size(rfile["location"])

        # Determine whether the declared type is optional and/or expects a
        # single File/Directory (rather than an array of them).
        optional = False
        single = False
        if isinstance(schema["type"], MutableSequence):
            if "null" in schema["type"]:
                optional = True
            if "File" in schema["type"] or "Directory" in schema["type"]:
                single = True
        elif schema["type"] == "File" or schema["type"] == "Directory":
            single = True

        if "outputEval" in binding:
            with SourceLine(binding, "outputEval", WorkflowException, debug):
                r = builder.do_eval(binding["outputEval"], context=r)

        if single:
            if not r and not optional:
                with SourceLine(binding, "glob", WorkflowException, debug):
                    raise WorkflowException(
                        "Did not find output file with glob pattern: '{}'".
                        format(globpatterns))
            elif not r and optional:
                pass
            elif isinstance(r, MutableSequence):
                if len(r) > 1:
                    raise WorkflowException(
                        "Multiple matches for output item that is a single file."
                    )
                else:
                    r = r[0]

        if "secondaryFiles" in schema:
            with SourceLine(schema, "secondaryFiles", WorkflowException,
                            debug):
                for primary in aslist(r):
                    if isinstance(primary, MutableMapping):
                        primary.setdefault("secondaryFiles", [])
                        # Directory portion of the primary path; secondary
                        # files resolved from bare strings live next to it.
                        pathprefix = primary["path"][0:primary["path"].
                                                     rindex("/") + 1]
                        for sf in aslist(schema["secondaryFiles"]):
                            if "required" in sf:
                                sf_required = builder.do_eval(
                                    sf["required"], context=primary)
                            else:
                                sf_required = False
                            if "$(" in sf["pattern"] or "${" in sf[
                                    "pattern"]:
                                sfpath = builder.do_eval(sf["pattern"],
                                                         context=primary)
                            else:
                                sfpath = substitute(
                                    primary["basename"], sf["pattern"])

                            for sfitem in aslist(sfpath):
                                if not sfitem:
                                    continue
                                if isinstance(sfitem, str):
                                    sfitem = {"path": pathprefix + sfitem}
                                if (not fs_access.exists(sfitem["path"])
                                        and sf_required):
                                    raise WorkflowException(
                                        "Missing required secondary file '%s'"
                                        % (sfitem["path"]))
                                if "path" in sfitem and "location" not in sfitem:
                                    revmap(sfitem)
                                # Silently drop entries that exist as
                                # neither file nor directory.
                                if fs_access.isfile(sfitem["location"]):
                                    sfitem["class"] = "File"
                                    primary["secondaryFiles"].append(
                                        sfitem)
                                elif fs_access.isdir(sfitem["location"]):
                                    sfitem["class"] = "Directory"
                                    primary["secondaryFiles"].append(
                                        sfitem)

        if "format" in schema:
            for primary in aslist(r):
                primary["format"] = builder.do_eval(schema["format"],
                                                    context=primary)

        # Ensure files point to local references outside of the run environment
        adjustFileObjs(r, revmap)

        if not r and optional:
            # Don't convert zero or empty string to None
            if r in [0, ""]:
                return r
            # For [] or None, return None
            else:
                return None

    if (not empty_and_optional and isinstance(schema["type"], MutableMapping)
            and schema["type"]["type"] == "record"):
        # Record type without an outputBinding: collect each field
        # independently and assemble the record.
        out = {}
        for field in schema["type"]["fields"]:
            out[shortname(field["name"])] = self.collect_output(
                field,
                builder,
                outdir,
                fs_access,
                compute_checksum=compute_checksum)
        return out
    return r
def collect_output_ports(
    self,
    ports: Set[Dict[str, Any]],
    builder: Builder,
    outdir: str,
    rcode: int,
    compute_checksum: bool = True,
    jobname: str = "",
    readers: Optional[Dict[str, Any]] = None,
) -> OutputPorts:
    """Collect and validate all tool outputs after a run.

    If the tool wrote a ``cwl.output.json`` into the output directory that
    file is taken verbatim as the output object; otherwise each declared
    output port is collected via :meth:`collect_output`.  The result is
    path-normalized, validated against the outputs record schema, and
    returned.  Mutation-manager readers registered for this job are always
    released, even on failure.
    """
    ret = {}  # type: OutputPorts
    debug = _logger.isEnabledFor(logging.DEBUG)
    cwl_version = self.metadata.get(
        "http://commonwl.org/cwltool#original_cwlVersion", None)
    # exitCode only became available to output expressions after v1.0.
    if cwl_version != "v1.0":
        builder.resources["exitCode"] = rcode
    try:
        fs_access = builder.make_fs_access(outdir)
        custom_output = fs_access.join(outdir, "cwl.output.json")
        if fs_access.exists(custom_output):
            with fs_access.open(custom_output, "r") as f:
                ret = json.load(f)
            if debug:
                _logger.debug(
                    "Raw output from %s: %s",
                    custom_output,
                    json_dumps(ret, indent=4),
                )
        else:
            for i, port in enumerate(ports):

                # Wraps any collection failure with the offending
                # parameter's name; closes over the loop variable `port`.
                class ParameterOutputWorkflowException(WorkflowException):
                    def __init__(self, msg, **kwargs):
                        # type: (str, **Any) -> None
                        super(
                            ParameterOutputWorkflowException, self
                        ).__init__(
                            "Error collecting output for parameter '%s':\n%s"
                            % (shortname(port["id"]), msg),
                            kwargs,
                        )

                with SourceLine(ports, i, ParameterOutputWorkflowException,
                                debug):
                    fragment = shortname(port["id"])
                    ret[fragment] = self.collect_output(
                        port,
                        builder,
                        outdir,
                        fs_access,
                        compute_checksum=compute_checksum,
                    )
        if ret:
            # Normalize paths/locations so results are valid outside the
            # run environment, then sanity-check they exist.
            revmap = partial(revmap_file, builder, outdir)
            adjustDirObjs(ret, trim_listing)
            visit_class(ret, ("File", "Directory"),
                        cast(Callable[[Any], Any], revmap))
            visit_class(ret, ("File", "Directory"), remove_path)
            normalizeFilesDirs(ret)
            visit_class(
                ret,
                ("File", "Directory"),
                partial(check_valid_locations, fs_access),
            )
            if compute_checksum:
                adjustFileObjs(ret, partial(compute_checksums, fs_access))
        expected_schema = cast(
            Schema, self.names.get_name("outputs_record_schema", ""))
        validate.validate_ex(expected_schema,
                             ret,
                             strict=False,
                             logger=_logger_validation_warnings)
        if ret is not None and builder.mutation_manager is not None:
            adjustFileObjs(ret, builder.mutation_manager.set_generation)
        return ret if ret is not None else {}
    except validate.ValidationException as e:
        raise WorkflowException("Error validating output record. " + str(e) +
                                "\n in " +
                                json_dumps(ret, indent=4)) from e
    finally:
        # Always release mutation-manager readers for this job.
        if builder.mutation_manager and readers:
            for r in readers.values():
                builder.mutation_manager.release_reader(jobname, r)
def job(
    self,
    job_order,  # type: Mapping[str, str]
    output_callbacks,  # type: Callable[[Any, Any], Any]
    runtimeContext,  # type: RuntimeContext
):
    # type: (...) -> Generator[Union[JobBase, CallbackJob], None, None]
    """Prepare and yield the runnable job for this CommandLineTool.

    When caching is enabled (``runtimeContext.cachedir`` plus WorkReuse),
    computes a cache key from the command line, std-stream shortcuts, input
    file stats and relevant requirements; on a cache hit yields a
    CallbackJob serving the cached output, otherwise arranges for the run
    to populate the cache under a file lock.  In all cases builds the
    Builder, resolves InitialWorkDirRequirement / std streams / docker
    dirs / env vars / time and network limits, registers mutation-manager
    readers, and finally yields the configured job object.
    """
    workReuse, _ = self.get_requirement("WorkReuse")
    enableReuse = workReuse.get("enableReuse", True) if workReuse else True

    jobname = uniquename(runtimeContext.name
                         or shortname(self.tool.get("id", "job")))
    if runtimeContext.cachedir and enableReuse:
        # Build with fixed container-style paths so the cache key is
        # independent of the host's temp directories.
        cachecontext = runtimeContext.copy()
        cachecontext.outdir = "/out"
        cachecontext.tmpdir = "/tmp"  # nosec
        cachecontext.stagedir = "/stage"
        cachebuilder = self._init_job(job_order, cachecontext)
        cachebuilder.pathmapper = PathMapper(
            cachebuilder.files,
            runtimeContext.basedir,
            cachebuilder.stagedir,
            separateDirs=False,
        )
        _check_adjust = partial(check_adjust, cachebuilder)
        visit_class(
            [cachebuilder.files, cachebuilder.bindings],
            ("File", "Directory"),
            _check_adjust,
        )

        cmdline = flatten(
            list(map(cachebuilder.generate_arg, cachebuilder.bindings)))
        docker_req, _ = self.get_requirement("DockerRequirement")
        if docker_req is not None and runtimeContext.use_container:
            dockerimg = docker_req.get("dockerImageId") or docker_req.get(
                "dockerPull")
        elif (runtimeContext.default_container is not None
              and runtimeContext.use_container):
            dockerimg = runtimeContext.default_container
        else:
            dockerimg = None

        if dockerimg is not None:
            cmdline = ["docker", "run", dockerimg] + cmdline
            # not really run using docker, just for hashing purposes

        keydict = {
            "cmdline": cmdline
        }  # type: Dict[str, Union[Dict[str, Any], List[Any]]]

        for shortcut in ["stdin", "stdout", "stderr"]:
            if shortcut in self.tool:
                keydict[shortcut] = self.tool[shortcut]

        for location, fobj in cachebuilder.pathmapper.items():
            if fobj.type == "File":
                # Prefer a real checksum for the key; fall back to
                # size + mtime when no usable checksum is recorded.
                checksum = next(
                    (e["checksum"] for e in cachebuilder.files
                     if "location" in e and e["location"] == location
                     and "checksum" in e and e["checksum"] != "sha1$hash"),
                    None,
                )
                fobj_stat = os.stat(fobj.resolved)
                if checksum is not None:
                    keydict[fobj.resolved] = [fobj_stat.st_size, checksum]
                else:
                    keydict[fobj.resolved] = [
                        fobj_stat.st_size,
                        int(fobj_stat.st_mtime * 1000),
                    ]

        # Only requirements that can change the command's behavior
        # participate in the cache key.
        interesting = {
            "DockerRequirement",
            "EnvVarRequirement",
            "InitialWorkDirRequirement",
            "ShellCommandRequirement",
            "NetworkAccess",
        }
        for rh in (self.original_requirements, self.original_hints):
            for r in reversed(rh):
                if r["class"] in interesting and r["class"] not in keydict:
                    keydict[r["class"]] = r

        keydictstr = json_dumps(keydict, separators=(",", ":"),
                                sort_keys=True)
        cachekey = hashlib.md5(
            keydictstr.encode("utf-8")).hexdigest()  # nosec

        _logger.debug("[job %s] keydictstr is %s -> %s", jobname,
                      keydictstr, cachekey)

        jobcache = os.path.join(runtimeContext.cachedir, cachekey)

        # Create a lockfile to manage cache status.
        jobcachepending = "{}.status".format(jobcache)
        jobcachelock = None
        jobstatus = None

        # Opens the file for read/write, or creates an empty file.
        jobcachelock = open(jobcachepending, "a+")

        # get the shared lock to ensure no other process is trying
        # to write to this cache
        shared_file_lock(jobcachelock)
        jobcachelock.seek(0)
        jobstatus = jobcachelock.read()

        if os.path.isdir(jobcache) and jobstatus == "success":
            if docker_req and runtimeContext.use_container:
                cachebuilder.outdir = (runtimeContext.docker_outdir
                                       or random_outdir())
            else:
                cachebuilder.outdir = jobcache

            _logger.info("[job %s] Using cached output in %s", jobname,
                         jobcache)
            yield CallbackJob(self, output_callbacks, cachebuilder,
                              jobcache)
            # we're done with the cache so release lock
            jobcachelock.close()
            return
        else:
            _logger.info("[job %s] Output of job will be cached in %s",
                         jobname, jobcache)

            # turn shared lock into an exclusive lock since we'll
            # be writing the cache directory
            upgrade_lock(jobcachelock)

            shutil.rmtree(jobcache, True)
            os.makedirs(jobcache)
            runtimeContext = runtimeContext.copy()
            runtimeContext.outdir = jobcache

            def update_status_output_callback(
                output_callbacks: Callable[[List[Dict[str, Any]], str],
                                           None],
                jobcachelock: IO[Any],
                outputs: List[Dict[str, Any]],
                processStatus: str,
            ) -> None:
                # save status to the lockfile then release the lock
                jobcachelock.seek(0)
                jobcachelock.truncate()
                jobcachelock.write(processStatus)
                jobcachelock.close()
                output_callbacks(outputs, processStatus)

            output_callbacks = partial(update_status_output_callback,
                                       output_callbacks, jobcachelock)

    builder = self._init_job(job_order, runtimeContext)

    reffiles = copy.deepcopy(builder.files)

    j = self.make_job_runner(runtimeContext)(
        builder,
        builder.job,
        self.make_path_mapper,
        self.requirements,
        self.hints,
        jobname,
    )
    j.prov_obj = self.prov_obj

    j.successCodes = self.tool.get("successCodes", [])
    j.temporaryFailCodes = self.tool.get("temporaryFailCodes", [])
    j.permanentFailCodes = self.tool.get("permanentFailCodes", [])

    debug = _logger.isEnabledFor(logging.DEBUG)

    if debug:
        _logger.debug(
            "[job %s] initializing from %s%s",
            j.name,
            self.tool.get("id", ""),
            " as part of %s" % runtimeContext.part_of
            if runtimeContext.part_of else "",
        )
        _logger.debug("[job %s] %s", j.name,
                      json_dumps(builder.job, indent=4))

    builder.pathmapper = self.make_path_mapper(reffiles, builder.stagedir,
                                               runtimeContext, True)
    builder.requirements = j.requirements

    _check_adjust = partial(check_adjust, builder)

    visit_class([builder.files, builder.bindings], ("File", "Directory"),
                _check_adjust)

    initialWorkdir, _ = self.get_requirement("InitialWorkDirRequirement")
    if initialWorkdir is not None:
        ls = []  # type: List[Dict[str, Any]]
        if isinstance(initialWorkdir["listing"], str):
            # Whole listing is a single expression producing the list.
            ls = builder.do_eval(initialWorkdir["listing"])
        else:
            for t in initialWorkdir["listing"]:
                if isinstance(t, Mapping) and "entry" in t:
                    entry_exp = builder.do_eval(t["entry"],
                                                strip_whitespace=False)
                    for entry in aslist(entry_exp):
                        et = {"entry": entry}
                        if "entryname" in t:
                            et["entryname"] = builder.do_eval(
                                t["entryname"])
                        else:
                            et["entryname"] = None
                        et["writable"] = t.get("writable", False)
                        if et["entry"] is not None:
                            ls.append(et)
                else:
                    initwd_item = builder.do_eval(t)
                    if not initwd_item:
                        continue
                    if isinstance(initwd_item, MutableSequence):
                        ls.extend(initwd_item)
                    else:
                        ls.append(initwd_item)
        # Second pass: turn Dirent-style {entry, entryname, writable}
        # wrappers into plain File/Directory objects.
        for i, t in enumerate(ls):
            if "entry" in t:
                if isinstance(t["entry"], str):
                    ls[i] = {
                        "class": "File",
                        "basename": t["entryname"],
                        "contents": t["entry"],
                        "writable": t.get("writable"),
                    }
                else:
                    if t.get("entryname") or t.get("writable"):
                        t = copy.deepcopy(t)
                        if t.get("entryname"):
                            t["entry"]["basename"] = t["entryname"]
                        t["entry"]["writable"] = t.get("writable")
                    ls[i] = t["entry"]
        j.generatefiles["listing"] = ls
        for l in ls:
            self.updatePathmap(builder.outdir, builder.pathmapper, l)
        visit_class([builder.files, builder.bindings],
                    ("File", "Directory"), _check_adjust)

    if debug:
        _logger.debug(
            "[job %s] path mappings is %s",
            j.name,
            json_dumps(
                {
                    p: builder.pathmapper.mapper(p)
                    for p in builder.pathmapper.files()
                },
                indent=4,
            ),
        )

    if self.tool.get("stdin"):
        with SourceLine(self.tool, "stdin", validate.ValidationException,
                        debug):
            j.stdin = builder.do_eval(self.tool["stdin"])
            if j.stdin:
                reffiles.append({"class": "File", "path": j.stdin})

    if self.tool.get("stderr"):
        with SourceLine(self.tool, "stderr", validate.ValidationException,
                        debug):
            j.stderr = builder.do_eval(self.tool["stderr"])
            if j.stderr:
                if os.path.isabs(j.stderr) or ".." in j.stderr:
                    raise validate.ValidationException(
                        "stderr must be a relative path, got '%s'" %
                        j.stderr)

    if self.tool.get("stdout"):
        with SourceLine(self.tool, "stdout", validate.ValidationException,
                        debug):
            j.stdout = builder.do_eval(self.tool["stdout"])
            if j.stdout:
                if os.path.isabs(
                        j.stdout) or ".." in j.stdout or not j.stdout:
                    raise validate.ValidationException(
                        "stdout must be a relative path, got '%s'" %
                        j.stdout)

    if debug:
        _logger.debug(
            "[job %s] command line bindings is %s",
            j.name,
            json_dumps(builder.bindings, indent=4),
        )
    dockerReq, _ = self.get_requirement("DockerRequirement")
    if dockerReq is not None and runtimeContext.use_container:
        # Containerized run: host-side temp dirs are created here and the
        # container sees the builder's fixed paths.
        out_dir, out_prefix = os.path.split(
            runtimeContext.tmp_outdir_prefix)
        j.outdir = runtimeContext.outdir or tempfile.mkdtemp(
            prefix=out_prefix, dir=out_dir)
        tmpdir_dir, tmpdir_prefix = os.path.split(
            runtimeContext.tmpdir_prefix)
        j.tmpdir = runtimeContext.tmpdir or tempfile.mkdtemp(
            prefix=tmpdir_prefix, dir=tmpdir_dir)
        j.stagedir = tempfile.mkdtemp(prefix=tmpdir_prefix, dir=tmpdir_dir)
    else:
        j.outdir = builder.outdir
        j.tmpdir = builder.tmpdir
        j.stagedir = builder.stagedir

    inplaceUpdateReq, _ = self.get_requirement("InplaceUpdateRequirement")
    if inplaceUpdateReq is not None:
        j.inplace_update = inplaceUpdateReq["inplaceUpdate"]
    normalizeFilesDirs(j.generatefiles)

    readers = {}  # type: Dict[str, Any]
    muts = set()  # type: Set[str]

    if builder.mutation_manager is not None:

        def register_mut(f):  # type: (Dict[str, Any]) -> None
            # Record an in-place mutation of f by this job.
            mm = cast(MutationManager, builder.mutation_manager)
            muts.add(f["location"])
            mm.register_mutation(j.name, f)

        def register_reader(f):  # type: (Dict[str, Any]) -> None
            # Record a read of f unless this job already mutates it.
            mm = cast(MutationManager, builder.mutation_manager)
            if f["location"] not in muts:
                mm.register_reader(j.name, f)
                readers[f["location"]] = copy.deepcopy(f)

        for li in j.generatefiles["listing"]:
            li = cast(Dict[str, Any], li)
            if li.get("writable") and j.inplace_update:
                adjustFileObjs(li, register_mut)
                adjustDirObjs(li, register_mut)
            else:
                adjustFileObjs(li, register_reader)
                adjustDirObjs(li, register_reader)

        adjustFileObjs(builder.files, register_reader)
        adjustFileObjs(builder.bindings, register_reader)
        adjustDirObjs(builder.files, register_reader)
        adjustDirObjs(builder.bindings, register_reader)

    timelimit, _ = self.get_requirement("ToolTimeLimit")
    if timelimit is not None:
        with SourceLine(timelimit, "timelimit",
                        validate.ValidationException, debug):
            j.timelimit = builder.do_eval(timelimit["timelimit"])
            if not isinstance(j.timelimit, int) or j.timelimit < 0:
                raise Exception(
                    "timelimit must be an integer >= 0, got: %s" %
                    j.timelimit)

    networkaccess, _ = self.get_requirement("NetworkAccess")
    if networkaccess is not None:
        with SourceLine(networkaccess, "networkAccess",
                        validate.ValidationException, debug):
            j.networkaccess = builder.do_eval(
                networkaccess["networkAccess"])
            if not isinstance(j.networkaccess, bool):
                raise Exception(
                    "networkAccess must be a boolean, got: %s" %
                    j.networkaccess)

    j.environment = {}
    evr, _ = self.get_requirement("EnvVarRequirement")
    if evr is not None:
        for t in evr["envDef"]:
            j.environment[t["envName"]] = builder.do_eval(t["envValue"])

    shellcmd, _ = self.get_requirement("ShellCommandRequirement")
    if shellcmd is not None:
        # Build one shell string; quote args unless shellQuote is False.
        cmd = []  # type: List[str]
        for b in builder.bindings:
            arg = builder.generate_arg(b)
            if b.get("shellQuote", True):
                arg = [shellescape.quote(a) for a in aslist(arg)]
            cmd.extend(aslist(arg))
        j.command_line = ["/bin/sh", "-c", " ".join(cmd)]
    else:
        j.command_line = flatten(
            list(map(builder.generate_arg, builder.bindings)))

    j.pathmapper = builder.pathmapper
    j.collect_outputs = partial(
        self.collect_output_ports,
        self.tool["outputs"],
        builder,
        compute_checksum=getdefault(runtimeContext.compute_checksum, True),
        jobname=jobname,
        readers=readers,
    )
    j.output_callback = output_callbacks

    yield j
def visit(
    self,
    obj: CWLObjectType,
    stagedir: str,
    basedir: str,
    copy: bool = False,
    staged: bool = False,
) -> None:
    """Add one File/Directory object (and its children) to the path map.

    Maps the object's location to a target path under ``stagedir`` with a
    MapperEnt whose type records whether it is a (writable) copy, a
    create-from-contents literal, or a plain reference.  Recurses into a
    Directory's listing and a File's secondaryFiles.  Already-mapped
    locations are skipped.
    """
    tgt = convert_pathsep_to_unix(
        os.path.join(stagedir, cast(str, obj["basename"]))
    )
    if obj["location"] in self._pathmap:
        return
    if obj["class"] == "Directory":
        location = cast(str, obj["location"])
        if location.startswith("file://"):
            resolved = uri_file_path(location)
        else:
            resolved = location
        self._pathmap[location] = MapperEnt(
            resolved, tgt, "WritableDirectory" if copy else "Directory",
            staged
        )
        if location.startswith("file://"):
            # Children of an on-disk directory are not staged
            # individually; the directory itself is the staging unit.
            staged = False
        self.visitlisting(
            cast(List[CWLObjectType], obj.get("listing", [])),
            tgt,
            basedir,
            copy=copy,
            staged=staged,
        )
    elif obj["class"] == "File":
        path = cast(str, obj["location"])
        ab = abspath(path, basedir)
        if "contents" in obj and path.startswith("_:"):
            # File literal: created from its contents, no backing path.
            self._pathmap[path] = MapperEnt(
                obj["contents"],
                tgt,
                "CreateWritableFile" if copy else "CreateFile",
                staged,
            )
        else:
            with SourceLine(
                obj,
                "location",
                ValidationException,
                _logger.isEnabledFor(logging.DEBUG),
            ):
                deref = ab
                if urllib.parse.urlsplit(deref).scheme in ["http",
                                                           "https"]:
                    deref = downloadHttpFile(path)
                else:
                    # Dereference symbolic links
                    st = os.lstat(deref)
                    while stat.S_ISLNK(st.st_mode):
                        rl = os.readlink(deref)
                        deref = (
                            rl if os.path.isabs(rl)
                            else os.path.join(os.path.dirname(deref), rl)
                        )
                        st = os.lstat(deref)

                self._pathmap[path] = MapperEnt(
                    deref, tgt, "WritableFile" if copy else "File", staged
                )
        self.visitlisting(
            cast(List[CWLObjectType], obj.get("secondaryFiles", [])),
            stagedir,
            basedir,
            copy=copy,
            staged=staged,
        )
def resolve_and_validate_document(
        loadingContext,
        workflowobj,
        uri,
        preprocess_only=False,  # type: bool
        skip_schemas=None,  # type: bool
):
    # type: (...) -> Tuple[LoadingContext, Text]
    """Validate a CWL document.

    Resolves a ``cwl:tool`` job-order wrapper if present, determines and
    checks the document's cwlVersion, extracts overrides/requirements from
    the job object, resolves the document against the version's schema,
    optionally validates and updates it to the internal version, and
    returns the (copied, updated) loading context plus the resolved uri.
    """
    loadingContext = loadingContext.copy()

    if not isinstance(workflowobj, MutableMapping):
        raise ValueError("workflowjobj must be a dict, got '{}': {}".format(
            type(workflowobj), workflowobj))

    jobobj = None
    if "cwl:tool" in workflowobj:
        # Input object embedding a pointer to the tool: split them apart.
        jobobj, _ = loadingContext.loader.resolve_all(workflowobj, uri)
        uri = urllib.parse.urljoin(
            uri, workflowobj["https://w3id.org/cwl/cwl#tool"])
        del cast(dict, jobobj)["https://w3id.org/cwl/cwl#tool"]
        workflowobj = fetch_document(uri, loadingContext)[1]

    fileuri = urllib.parse.urldefrag(uri)[0]

    cwlVersion = loadingContext.metadata.get("cwlVersion")
    if not cwlVersion:
        cwlVersion = workflowobj.get("cwlVersion")
        if not cwlVersion and fileuri != uri:
            # The tool we're loading is a fragment of a bigger file.  Get
            # the document root element and look for cwlVersion there.
            metadata = fetch_document(fileuri, loadingContext)[1]
            cwlVersion = metadata.get("cwlVersion")
    if not cwlVersion:
        raise ValidationException(
            "No cwlVersion found. "
            "Use the following syntax in your CWL document to declare "
            "the version: cwlVersion: <version>.\n"
            "Note: if this is a CWL draft-2 (pre v1.0) document then it "
            "will need to be upgraded first.")

    if not isinstance(cwlVersion, string_types):
        with SourceLine(workflowobj, "cwlVersion", ValidationException):
            raise ValidationException("'cwlVersion' must be a string, "
                                      "got {}".format(
                                          type(cwlVersion)))
    # strip out version
    cwlVersion = re.sub(
        r"^(?:cwl:|https://w3id.org/cwl/cwl#)", "", cwlVersion)
    if cwlVersion not in list(ALLUPDATES):
        # print out all the Supported Versions of cwlVersion
        versions = []
        for version in list(ALLUPDATES):
            if "dev" in version:
                version += " (with --enable-dev flag only)"
            versions.append(version)
        versions.sort()
        raise ValidationException(
            "The CWL reference runner no longer supports pre CWL v1.0 "
            "documents. Supported versions are: "
            "\n{}".format("\n".join(versions)))

    if isinstance(jobobj, CommentedMap) and "http://commonwl.org/cwltool#overrides" in jobobj:
        loadingContext.overrides_list.extend(
            resolve_overrides(jobobj, uri, uri))
        del jobobj["http://commonwl.org/cwltool#overrides"]

    if isinstance(jobobj, CommentedMap) and "https://w3id.org/cwl/cwl#requirements" in jobobj:
        # `cwl:requirements` in the input object is only legal v1.1+.
        if cwlVersion not in ("v1.1.0-dev1", "v1.1"):
            raise ValidationException(
                "`cwl:requirements` in the input object is not part of CWL "
                "v1.0. You can adjust to use `cwltool:overrides` instead; or you "
                "can set the cwlVersion to v1.1 or greater.")
        loadingContext.overrides_list.append(
            {"overrideTarget": uri,
             "requirements": jobobj["https://w3id.org/cwl/cwl#requirements"]})
        del jobobj["https://w3id.org/cwl/cwl#requirements"]

    (sch_document_loader, avsc_names) = \
        process.get_schema(cwlVersion)[:2]

    if isinstance(avsc_names, Exception):
        raise avsc_names

    processobj = None  # type: Union[CommentedMap, CommentedSeq, Text, None]
    # New Loader sharing the original's index so fragments resolve, but
    # using the schema graph/context for this cwlVersion.
    document_loader = Loader(
        sch_document_loader.ctx,
        schemagraph=sch_document_loader.graph,
        idx=loadingContext.loader.idx,
        cache=sch_document_loader.cache,
        fetcher_constructor=loadingContext.fetcher_constructor,
        skip_schemas=skip_schemas)

    if cwlVersion == "v1.0":
        _add_blank_ids(workflowobj)

    processobj, metadata = document_loader.resolve_all(workflowobj, fileuri)
    if loadingContext.metadata:
        metadata = loadingContext.metadata
    if not isinstance(processobj, (CommentedMap, CommentedSeq)):
        raise ValidationException(
            "Workflow must be a CommentedMap or CommentedSeq.")
    if not isinstance(metadata, CommentedMap):
        raise ValidationException(
            "metadata must be a CommentedMap, was %s" % type(metadata))

    if isinstance(processobj, CommentedMap):
        uri = processobj["id"]

    _convert_stdstreams_to_files(workflowobj)

    if preprocess_only:
        return loadingContext, uri

    if loadingContext.do_validate:
        schema.validate_doc(avsc_names, processobj, document_loader,
                            loadingContext.strict)

    # None means default behavior (do update)
    if loadingContext.do_update in (True, None):
        if "cwlVersion" not in metadata:
            metadata["cwlVersion"] = cwlVersion
        processobj = update.update(
            processobj, document_loader, fileuri,
            loadingContext.enable_dev, metadata)
        document_loader.idx[processobj["id"]] = processobj

    if jobobj is not None:
        loadingContext.jobdefaults = jobobj

    loadingContext.loader = document_loader
    loadingContext.avsc_names = avsc_names
    loadingContext.metadata = metadata

    return loadingContext, uri
def _init_job(self, joborder, runtime_context):
    # type: (Mapping[Text, Text], RuntimeContext) -> Builder
    """Validate the job order and construct the Builder for this process.

    Fills in input defaults, normalizes and validates the input record,
    performs directory listing per LoadListingRequirement (warning on very
    large recursive listings), chooses out/tmp/stage directories for
    container vs. local execution, binds baseCommand and arguments, and
    returns the fully initialized Builder with sorted bindings.
    """
    job = cast(Dict[Text, Union[Dict[Text, Any], List[Any], Text, None]],
               copy.deepcopy(joborder))

    make_fs_access = getdefault(runtime_context.make_fs_access, StdFsAccess)
    fs_access = make_fs_access(runtime_context.basedir)

    load_listing_req, _ = self.get_requirement(
        "http://commonwl.org/cwltool#LoadListingRequirement")

    if load_listing_req is not None:
        load_listing = load_listing_req.get("loadListing")
    else:
        load_listing = "deep_listing"  # will default to "no_listing" in CWL v1.1

    # Validate job order
    try:
        fill_in_defaults(self.tool[u"inputs"], job, fs_access)
        normalizeFilesDirs(job)
        schema = self.names.get_name("input_record_schema", "")
        if schema is None:
            raise WorkflowException("Missing input record schema: "
                                    "{}".format(self.names))
        validate.validate_ex(schema, job, strict=False,
                             logger=_logger_validation_warnings)

        if load_listing and load_listing != "no_listing":
            get_listing(fs_access, job,
                        recursive=(load_listing == "deep_listing"))

        visit_class(job, ("File", ),
                    functools.partial(add_sizes, fs_access))

        if load_listing == "deep_listing" and load_listing_req is None:
            # Implicit deep listing: warn when it pulls in a huge number
            # of File objects for any single input parameter.
            for i, inparm in enumerate(self.tool["inputs"]):
                k = shortname(inparm["id"])
                if k not in job:
                    continue
                v = job[k]
                dircount = [0]

                def inc(d):  # type: (List[int]) -> None
                    d[0] += 1

                visit_class(v, ("Directory", ), lambda x: inc(dircount))
                if dircount[0] == 0:
                    continue
                filecount = [0]
                visit_class(v, ("File", ), lambda x: inc(filecount))
                if filecount[0] > FILE_COUNT_WARNING:
                    # Long lines in this message are okay, will be reflowed based on terminal columns.
                    _logger.warning(
                        strip_dup_lineno(
                            SourceLine(self.tool["inputs"], i, Text).
                            makeError(
                                """Recursive directory listing has resulted in a large number of File objects (%s) passed to the input parameter '%s'.  This may negatively affect workflow performance and memory use.

If this is a problem, use the hint 'cwltool:LoadListingRequirement' with "shallow_listing" or "no_listing" to change the directory listing behavior:

  $namespaces:
    cwltool: "http://commonwl.org/cwltool#"
  hints:
    cwltool:LoadListingRequirement:
      loadListing: shallow_listing

""" % (filecount[0], k))))

    except (validate.ValidationException, WorkflowException) as err:
        raise WorkflowException("Invalid job input record:\n" + Text(err))

    files = []  # type: List[Dict[Text, Text]]
    bindings = CommentedSeq()
    tmpdir = u""
    stagedir = u""

    docker_req, _ = self.get_requirement("DockerRequirement")
    default_docker = None

    if docker_req is None and runtime_context.default_container:
        default_docker = runtime_context.default_container

    if (docker_req or default_docker) and runtime_context.use_container:
        if docker_req is not None:
            # Check if docker output directory is absolute
            if docker_req.get("dockerOutputDirectory") and \
                    docker_req.get("dockerOutputDirectory").startswith('/'):
                outdir = docker_req.get("dockerOutputDirectory")
            else:
                outdir = docker_req.get("dockerOutputDirectory") or \
                    runtime_context.docker_outdir or random_outdir()
        elif default_docker is not None:
            outdir = runtime_context.docker_outdir or random_outdir()
        tmpdir = runtime_context.docker_tmpdir or "/tmp"
        stagedir = runtime_context.docker_stagedir or "/var/lib/cwl"
    else:
        outdir = fs_access.realpath(
            runtime_context.outdir or tempfile.mkdtemp(
                prefix=getdefault(runtime_context.tmp_outdir_prefix,
                                  DEFAULT_TMP_PREFIX)))
        if self.tool[u"class"] != 'Workflow':
            tmpdir = fs_access.realpath(runtime_context.tmpdir
                                        or tempfile.mkdtemp())
            stagedir = fs_access.realpath(runtime_context.stagedir
                                          or tempfile.mkdtemp())

    builder = Builder(job,
                      files,
                      bindings,
                      self.schemaDefs,
                      self.names,
                      self.requirements,
                      self.hints,
                      {},
                      runtime_context.mutation_manager,
                      self.formatgraph,
                      make_fs_access,
                      fs_access,
                      runtime_context.job_script_provider,
                      runtime_context.eval_timeout,
                      runtime_context.debug,
                      runtime_context.js_console,
                      runtime_context.force_docker_pull,
                      load_listing,
                      outdir,
                      tmpdir,
                      stagedir)

    bindings.extend(
        builder.bind_input(self.inputs_record_schema,
                           job,
                           discover_secondaryFiles=getdefault(
                               runtime_context.toplevel, False)))

    if self.tool.get("baseCommand"):
        # baseCommand words sort before everything else (-1000000).
        for index, command in enumerate(aslist(self.tool["baseCommand"])):
            bindings.append({
                "position": [-1000000, index],
                "datum": command
            })

    if self.tool.get("arguments"):
        for i, arg in enumerate(self.tool["arguments"]):
            # Preserve source line/column info so errors point at the
            # original YAML.
            lc = self.tool["arguments"].lc.data[i]
            filename = self.tool["arguments"].lc.filename
            bindings.lc.add_kv_line_col(len(bindings), lc)
            if isinstance(arg, MutableMapping):
                arg = copy.deepcopy(arg)
                if arg.get("position"):
                    arg["position"] = [arg["position"], i]
                else:
                    arg["position"] = [0, i]
                bindings.append(arg)
            elif ("$(" in arg) or ("${" in arg):
                cm = CommentedMap((("position", [0, i]),
                                   ("valueFrom", arg)))
                cm.lc.add_kv_line_col("valueFrom", lc)
                cm.lc.filename = filename
                bindings.append(cm)
            else:
                cm = CommentedMap((("position", [0, i]),
                                   ("datum", arg)))
                cm.lc.add_kv_line_col("datum", lc)
                cm.lc.filename = filename
                bindings.append(cm)

    # use python2 like sorting of heterogeneous lists
    # (containing str and int types),
    if PY3:
        key = functools.cmp_to_key(cmp_like_py2)
    else:  # PY2
        key = lambda d: d["position"]

    # This awkward construction replaces the contents of
    # "bindings" in place (because Builder expects it to be
    # mutated in place, sigh, I'm sorry) with its contents sorted,
    # supporting different versions of Python and ruamel.yaml with
    # different behaviors/bugs in CommentedSeq.
    bindings_copy = copy.deepcopy(bindings)
    del bindings[:]
    bindings.extend(sorted(bindings_copy, key=key))

    if self.tool[u"class"] != 'Workflow':
        builder.resources = self.evalResources(builder, runtime_context)
    return builder
def get_image(
    dockerRequirement,  # type: Dict[str, str]
    pull_image,  # type: bool
    force_pull=False,  # type: bool
):
    # type: (...) -> bool
    """
    Acquire the software container image in the specified dockerRequirement.

    Uses Singularity and returns the success as a bool. Updates the
    provided dockerRequirement with the specific dockerImageId to the
    full path of the local image, if found. Likewise the
    dockerRequirement['dockerPull'] is updated to a docker:// URI if needed.
    """
    found = False
    candidates = []  # local filenames that would satisfy this requirement

    cache_folder = None
    if "CWL_SINGULARITY_CACHE" in os.environ:
        cache_folder = os.environ["CWL_SINGULARITY_CACHE"]
    elif is_version_2_6() and "SINGULARITY_PULLFOLDER" in os.environ:
        cache_folder = os.environ["SINGULARITY_PULLFOLDER"]

    if ("dockerImageId" not in dockerRequirement
            and "dockerPull" in dockerRequirement):
        # Derive a local image name from the pull spec; Singularity >= 3
        # produces .sif images, older versions .img.
        match = re.search(pattern=r"([a-z]*://)",
                          string=dockerRequirement["dockerPull"])
        img_name = _normalize_image_id(dockerRequirement["dockerPull"])
        candidates.append(img_name)
        if is_version_3_or_newer():
            sif_name = _normalize_sif_id(dockerRequirement["dockerPull"])
            candidates.append(sif_name)
            dockerRequirement["dockerImageId"] = sif_name
        else:
            dockerRequirement["dockerImageId"] = img_name
        if not match:
            # Bare image references must be given a docker:// scheme for
            # `singularity pull`.
            dockerRequirement["dockerPull"] = (
                "docker://" + dockerRequirement["dockerPull"])
    elif "dockerImageId" in dockerRequirement:
        if os.path.isfile(dockerRequirement["dockerImageId"]):
            found = True
        candidates.append(dockerRequirement["dockerImageId"])
        candidates.append(
            _normalize_image_id(dockerRequirement["dockerImageId"]))
        if is_version_3_or_newer():
            # Fixed: previously normalized dockerRequirement["dockerPull"],
            # which raises KeyError in this branch when only a
            # dockerImageId was supplied.
            candidates.append(
                _normalize_sif_id(dockerRequirement["dockerImageId"]))

    # Search the working directory and any configured cache folders for an
    # already-present image matching one of the candidate names.
    targets = [os.getcwd()]
    if "CWL_SINGULARITY_CACHE" in os.environ:
        targets.append(os.environ["CWL_SINGULARITY_CACHE"])
    if is_version_2_6() and "SINGULARITY_PULLFOLDER" in os.environ:
        targets.append(os.environ["SINGULARITY_PULLFOLDER"])
    for target in targets:
        for dirpath, subdirs, files in os.walk(target):
            for entry in files:
                if entry in candidates:
                    path = os.path.join(dirpath, entry)
                    if os.path.isfile(path):
                        _logger.info(
                            "Using local copy of Singularity image found in %s",
                            dirpath,
                        )
                        dockerRequirement["dockerImageId"] = path
                        found = True

    if (force_pull or not found) and pull_image:
        cmd = []  # type: List[str]
        if "dockerPull" in dockerRequirement:
            if cache_folder:
                env = os.environ.copy()
                if is_version_2_6():
                    # 2.6 honors SINGULARITY_PULLFOLDER for placement.
                    env["SINGULARITY_PULLFOLDER"] = cache_folder
                    cmd = [
                        "singularity",
                        "pull",
                        "--force",
                        "--name",
                        dockerRequirement["dockerImageId"],
                        str(dockerRequirement["dockerPull"]),
                    ]
                else:
                    cmd = [
                        "singularity",
                        "pull",
                        "--force",
                        "--name",
                        "{}/{}".format(cache_folder,
                                       dockerRequirement["dockerImageId"]),
                        str(dockerRequirement["dockerPull"]),
                    ]

                _logger.info(str(cmd))
                check_call(cmd, env=env, stdout=sys.stderr)  # nosec
                dockerRequirement["dockerImageId"] = "{}/{}".format(
                    cache_folder, dockerRequirement["dockerImageId"])
                found = True
            else:
                cmd = [
                    "singularity",
                    "pull",
                    "--force",
                    "--name",
                    str(dockerRequirement["dockerImageId"]),
                    str(dockerRequirement["dockerPull"]),
                ]
                _logger.info(str(cmd))
                check_call(cmd, stdout=sys.stderr)  # nosec
                found = True

        elif "dockerFile" in dockerRequirement:
            raise WorkflowException(
                SourceLine(dockerRequirement, "dockerFile").makeError(
                    "dockerFile is not currently supported when using the "
                    "Singularity runtime for Docker containers."))
        elif "dockerLoad" in dockerRequirement:
            if is_version_3_1_or_newer():
                if "dockerImageId" in dockerRequirement:
                    name = "{}.sif".format(
                        dockerRequirement["dockerImageId"])
                else:
                    name = "{}.sif".format(dockerRequirement["dockerLoad"])
                cmd = [
                    "singularity",
                    "build",
                    name,
                    "docker-archive://{}".format(
                        dockerRequirement["dockerLoad"]),
                ]
                _logger.info(str(cmd))
                check_call(cmd, stdout=sys.stderr)  # nosec
                found = True
                dockerRequirement["dockerImageId"] = name
            else:
                # Fixed: this exception was previously raised
                # unconditionally, even after a successful build on
                # Singularity >= 3.1 (the message itself says it only
                # applies to versions below 3.1).
                raise WorkflowException(
                    SourceLine(dockerRequirement, "dockerLoad").makeError(
                        "dockerLoad is not currently supported when using the "
                        "Singularity runtime (version less than 3.1) for Docker containers."
                    ))
        elif "dockerImport" in dockerRequirement:
            raise WorkflowException(
                SourceLine(dockerRequirement, "dockerImport").makeError(
                    "dockerImport is not currently supported when using the "
                    "Singularity runtime for Docker containers."))

    return found
def set_secondary(fsaccess, builder, inputschema, secondaryspec, primary, discovered):
    """Resolve and attach secondaryFiles to File values in a CWL input object.

    Recursively walks ``inputschema`` alongside the input value ``primary``,
    evaluating secondaryFiles patterns and recording files that exist.

    :param fsaccess: filesystem accessor; only ``exists()`` is used here.
    :param builder: provides ``do_eval``, ``cwlVersion``, ``hints``, ``requirements``.
    :param inputschema: CWL type schema (dict, list union, or type-name string).
    :param secondaryspec: currently-inherited secondaryFiles specification.
    :param primary: the input value corresponding to ``inputschema``; mutated
        in place when a File gains a "secondaryFiles" entry.
    :param discovered: optional dict; when given, maps primary location ->
        discovered secondaryFiles for later use by the caller.
    """
    if isinstance(inputschema, Sequence) and not isinstance(inputschema, basestring):
        # union type, collect all possible secondaryFiles
        for i in inputschema:
            set_secondary(fsaccess, builder, i, secondaryspec, primary, discovered)
        return

    if isinstance(inputschema, basestring):
        # Type given by name: look it up in SchemaDefRequirement entries
        # (reversed so later declarations win); bail out if not found.
        sd = search_schemadef(inputschema, reversed(builder.hints + builder.requirements))
        if sd:
            inputschema = sd
        else:
            return

    if "secondaryFiles" in inputschema:
        # set secondaryFiles, may be inherited by compound types.
        secondaryspec = inputschema["secondaryFiles"]

    if (isinstance(inputschema["type"], (Mapping, Sequence)) and
            not isinstance(inputschema["type"], basestring)):
        # compound type (union, array, record)
        set_secondary(fsaccess, builder, inputschema["type"], secondaryspec,
                      primary, discovered)

    elif (inputschema["type"] == "record" and isinstance(primary, Mapping)):
        #
        # record type, find secondary files associated with fields.
        #
        for f in inputschema["fields"]:
            p = primary.get(shortname(f["name"]))
            if p:
                set_secondary(fsaccess, builder, f, secondaryspec, p, discovered)

    elif (inputschema["type"] == "array" and isinstance(primary, Sequence)):
        #
        # array type, find secondary files of elements
        #
        for p in primary:
            set_secondary(fsaccess, builder, {"type": inputschema["items"]},
                          secondaryspec, p, discovered)

    elif (inputschema["type"] == "File" and
          secondaryspec and
          isinstance(primary, Mapping) and
          primary.get("class") == "File" and
          "secondaryFiles" not in primary):
        #
        # Found a file, check for secondaryFiles
        #
        specs = []
        # Temporarily attach the raw spec so SourceLine errors below can
        # point at it; replaced with the resolved list at the end.
        primary["secondaryFiles"] = secondaryspec
        for i, sf in enumerate(aslist(secondaryspec)):
            # v1.0 secondaryFiles entries are bare expressions/strings;
            # v1.1+ entries are objects with a "pattern" field.
            if builder.cwlVersion == "v1.0":
                pattern = builder.do_eval(sf, context=primary)
            else:
                pattern = builder.do_eval(sf["pattern"], context=primary)
            if pattern is None:
                continue
            # Normalize the evaluation result into a list of spec dicts.
            if isinstance(pattern, list):
                specs.extend(pattern)
            elif isinstance(pattern, dict):
                specs.append(pattern)
            elif isinstance(pattern, str):
                specs.append({"pattern": pattern})
            else:
                raise SourceLine(primary["secondaryFiles"], i,
                                 validate.ValidationException).makeError(
                    "Expression must return list, object, string or null")

        found = []
        for i, sf in enumerate(specs):
            if isinstance(sf, dict):
                if sf.get("class") == "File":
                    # Expression returned a concrete File object.
                    pattern = sf["basename"]
                else:
                    pattern = sf["pattern"]
                required = sf.get("required")
            elif isinstance(sf, str):
                pattern = sf
                required = True
            else:
                raise SourceLine(primary["secondaryFiles"], i,
                                 validate.ValidationException).makeError(
                    "Expression must return list, object, string or null")

            # Apply the CWL "^" suffix-substitution rules to build the path.
            sfpath = substitute(primary["location"], pattern)
            # "required" may itself be an expression.
            required = builder.do_eval(required, context=primary)

            if fsaccess.exists(sfpath):
                found.append({"location": sfpath, "class": "File"})
            elif required:
                raise SourceLine(primary["secondaryFiles"], i,
                                 validate.ValidationException).makeError(
                    "Required secondary file '%s' does not exist" % sfpath)

        primary["secondaryFiles"] = cmap(found)
        if discovered is not None:
            discovered[primary["location"]] = primary["secondaryFiles"]
    elif inputschema["type"] not in primitive_types_set:
        # Named non-primitive type: recurse to resolve it by name.
        set_secondary(fsaccess, builder, inputschema["type"], secondaryspec,
                      primary, discovered)
def job(
        self,
        joborder,         # type: MutableMapping[Text, Any]
        output_callback,  # type: Callable[[Any, Any], Any]
        runtimeContext    # type: RuntimeContext
):  # type: (...) -> Generator
    """Generate runnable jobs for the steps of this workflow.

    Seeds ``self.state`` from the input object (falling back to declared
    input defaults), then repeatedly sweeps over the steps, yielding each
    runnable job until every step has completed or no further progress can
    be made.  Yields None when temporarily blocked so the caller can drain
    pending jobs.

    :raises WorkflowException: if a required input is missing and has no
        default value.
    """
    self.state = {}
    self.processStatus = "success"

    if _logger.isEnabledFor(logging.DEBUG):
        _logger.debug(u"[%s] %s", self.name, json_dumps(joborder, indent=4))

    # Work on a private copy; each step will compute its own outdir.
    runtimeContext = runtimeContext.copy()
    runtimeContext.outdir = None

    # Initialize workflow state from the input object, falling back to
    # declared defaults.  A missing, defaultless input is a hard error.
    for index, inp in enumerate(self.tool["inputs"]):
        with SourceLine(self.tool["inputs"], index, WorkflowException,
                        _logger.isEnabledFor(logging.DEBUG)):
            inp_id = shortname(inp["id"])
            if inp_id in joborder:
                self.state[inp["id"]] = WorkflowStateItem(
                    inp, copy.deepcopy(joborder[inp_id]), "success")
            elif "default" in inp:
                self.state[inp["id"]] = WorkflowStateItem(
                    inp, copy.deepcopy(inp["default"]), "success")
            else:
                # Fixed: previous message concatenation produced a double
                # space ("have a  default value").
                raise WorkflowException(
                    u"Input '%s' not in input object and does not have a "
                    u"default value." % (inp["id"]))

    # Step outputs start out unset; they are filled in as steps complete.
    for step in self.steps:
        for out in step.tool["outputs"]:
            self.state[out["id"]] = None

    completed = 0
    while completed < len(self.steps):
        self.made_progress = False

        for step in self.steps:
            if getdefault(runtimeContext.on_error, "stop") == "stop" \
                    and self.processStatus != "success":
                break

            if not step.submitted:
                try:
                    step.iterable = self.try_make_job(
                        step, output_callback, runtimeContext)
                except WorkflowException as exc:
                    _logger.error(u"[%s] Cannot make job: %s", step.name, exc)
                    _logger.debug("", exc_info=True)
                    self.processStatus = "permanentFail"

            if step.iterable is not None:
                try:
                    for newjob in step.iterable:
                        if getdefault(runtimeContext.on_error, "stop") == "stop" \
                                and self.processStatus != "success":
                            break
                        if newjob is not None:
                            self.made_progress = True
                            yield newjob
                        else:
                            # Step is waiting on an upstream dependency.
                            break
                except WorkflowException as exc:
                    _logger.error(u"[%s] Cannot make job: %s", step.name, exc)
                    _logger.debug("", exc_info=True)
                    self.processStatus = "permanentFail"

        completed = sum(1 for s in self.steps if s.completed)

        if not self.made_progress and completed < len(self.steps):
            if self.processStatus != "success":
                break
            else:
                # Nothing runnable right now; let the caller run jobs.
                yield None

    if not self.did_callback:
        # The output callback may already have fired from a step callback.
        self.do_output_callback(output_callback)
def job(self, joborder, output_callback, runtimeContext):
    """Run this workflow, collapsing it into one container when requested.

    Without the arv:RunInSingleContainer requirement, defers to the normal
    Workflow.job machinery.  With it, packs the whole workflow plus its
    inputs into a Keep collection and emits a single CommandLineTool job
    that runs ``cwltool`` inside one container.

    :param joborder: the input object for this workflow run.
    :param output_callback: invoked with (outputs, processStatus).
    :param runtimeContext: runtime settings; copied before mutation.
    """
    builder = make_builder(joborder, self.hints, self.requirements, runtimeContext)
    runtimeContext = set_cluster_target(self.tool, self.arvrunner, builder, runtimeContext)

    req, _ = self.get_requirement("http://arvados.org/cwl#RunInSingleContainer")
    if not req:
        # Normal path: schedule each step as its own container.
        return super(ArvadosWorkflow, self).job(joborder, output_callback, runtimeContext)

    # RunInSingleContainer is true

    with SourceLine(self.tool, None, WorkflowException, logger.isEnabledFor(logging.DEBUG)):
        if "id" not in self.tool:
            raise WorkflowException("%s object must have 'id'" % (self.tool["class"]))
    document_loader, workflowobj, uri = (self.doc_loader,
                                         self.doc_loader.fetch(self.tool["id"]),
                                         self.tool["id"])

    discover_secondary_files(self.tool["inputs"], joborder)

    with Perf(metrics, "subworkflow upload_deps"):
        # Upload files referenced by the input object itself.
        upload_dependencies(self.arvrunner,
                            os.path.basename(joborder.get("id", "#")),
                            document_loader,
                            joborder,
                            joborder.get("id", "#"),
                            False)

        # self.wf_pdh caches the packed-workflow collection; only pack and
        # upload on the first invocation.
        if self.wf_pdh is None:
            workflowobj["requirements"] = dedup_reqs(self.requirements)
            workflowobj["hints"] = dedup_reqs(self.hints)

            packed = pack(document_loader, workflowobj, uri, self.metadata)

            def visit(item):
                # Collect ResourceRequirements (separating static ones from
                # expression-bearing ones) and strip cached docker PDHs.
                for t in ("hints", "requirements"):
                    if t not in item:
                        continue
                    for req in item[t]:
                        if req["class"] == "ResourceRequirement":
                            dyn = False
                            for k in max_res_pars + sum_res_pars:
                                if k in req:
                                    if isinstance(req[k], basestring):
                                        if item["id"] == "#main":
                                            # only the top-level requirements/hints may contain expressions
                                            self.dynamic_resource_req.append(req)
                                            dyn = True
                                            break
                                    else:
                                        with SourceLine(req, k, WorkflowException):
                                            raise WorkflowException(
                                                "Non-top-level ResourceRequirement in single container cannot have expressions")
                            if not dyn:
                                self.static_resource_req.append(req)
                        if req["class"] == "DockerRequirement":
                            if "http://arvados.org/cwl#dockerCollectionPDH" in req:
                                del req["http://arvados.org/cwl#dockerCollectionPDH"]

            visit_class(packed["$graph"], ("Workflow", "CommandLineTool"), visit)

            if self.static_resource_req:
                # Merge all static requirements into one overall request.
                self.static_resource_req = [get_overall_res_req(self.static_resource_req)]

            upload_dependencies(self.arvrunner,
                                runtimeContext.name,
                                document_loader,
                                packed,
                                uri,
                                False)

            # Discover files/directories referenced by the
            # workflow (mainly "default" values)
            visit_class(packed, ("File", "Directory"), self.wf_reffiles.append)

    if self.dynamic_resource_req:
        # Evaluate dynamic resource requirements using current builder
        rs = copy.copy(self.static_resource_req)
        for dyn_rs in self.dynamic_resource_req:
            eval_req = {"class": "ResourceRequirement"}
            for a in max_res_pars + sum_res_pars:
                if a in dyn_rs:
                    eval_req[a] = builder.do_eval(dyn_rs[a])
            rs.append(eval_req)
        job_res_reqs = [get_overall_res_req(rs)]
    else:
        job_res_reqs = self.static_resource_req

    with Perf(metrics, "subworkflow adjust"):
        # Two views of the input object: one with locations resolved for
        # the submitter, one rewritten to /keep/ mount paths for inside
        # the container.
        joborder_resolved = copy.deepcopy(joborder)
        joborder_keepmount = copy.deepcopy(joborder)

        reffiles = []
        visit_class(joborder_keepmount, ("File", "Directory"), reffiles.append)

        mapper = ArvPathMapper(self.arvrunner, reffiles + self.wf_reffiles,
                               runtimeContext.basedir,
                               "/keep/%s",
                               "/keep/%s/%s")

        # For containers API, we need to make sure any extra
        # referenced files (ie referenced by the workflow but
        # not in the inputs) are included in the mounts.
        if self.wf_reffiles:
            runtimeContext = runtimeContext.copy()
            runtimeContext.extra_reffiles = copy.deepcopy(self.wf_reffiles)

        def keepmount(obj):
            # Rewrite a File/Directory location to its /keep/ mount path;
            # literals ("_:") lose their location entirely.
            remove_redundant_fields(obj)
            with SourceLine(obj, None, WorkflowException, logger.isEnabledFor(logging.DEBUG)):
                if "location" not in obj:
                    raise WorkflowException("%s object is missing required 'location' field: %s" % (obj["class"], obj))
            with SourceLine(obj, "location", WorkflowException, logger.isEnabledFor(logging.DEBUG)):
                if obj["location"].startswith("keep:"):
                    obj["location"] = mapper.mapper(obj["location"]).target
                    if "listing" in obj:
                        del obj["listing"]
                elif obj["location"].startswith("_:"):
                    del obj["location"]
                else:
                    raise WorkflowException("Location is not a keep reference or a literal: '%s'" % obj["location"])

        visit_class(joborder_keepmount, ("File", "Directory"), keepmount)

        def resolved(obj):
            if obj["location"].startswith("keep:"):
                obj["location"] = mapper.mapper(obj["location"]).resolved

        visit_class(joborder_resolved, ("File", "Directory"), resolved)

        if self.wf_pdh is None:
            # First run: rewrite the packed workflow's own file references
            # and upload it as a collection; remember its portable data hash.
            adjustFileObjs(packed, keepmount)
            adjustDirObjs(packed, keepmount)
            self.wf_pdh = upload_workflow_collection(self.arvrunner, shortname(self.tool["id"]), packed)

    # Synthesize a CommandLineTool that stages workflow.cwl and
    # cwl.input.yml from Keep and runs cwltool on them in-place.
    wf_runner = cmap({
        "class": "CommandLineTool",
        "baseCommand": "cwltool",
        "inputs": self.tool["inputs"],
        "outputs": self.tool["outputs"],
        "stdout": "cwl.output.json",
        "requirements": self.requirements + job_res_reqs + [{
            "class": "InlineJavascriptRequirement"
        }, {
            "class": "InitialWorkDirRequirement",
            "listing": [{
                "entryname": "workflow.cwl",
                "entry": '$({"class": "File", "location": "keep:%s/workflow.cwl"})' % self.wf_pdh
            }, {
                "entryname": "cwl.input.yml",
                # Escape backslashes and expression openers so cwltool does
                # not re-evaluate expressions embedded in the input data.
                "entry": json.dumps(joborder_keepmount, indent=2, sort_keys=True,
                                    separators=(',', ': ')).replace(
                    "\\", "\\\\").replace('$(', '\$(').replace('${', '\${')
            }]
        }],
        "hints": self.hints,
        "arguments": [
            "--no-container", "--move-outputs",
            "--preserve-entire-environment", "workflow.cwl#main",
            "cwl.input.yml"
        ],
        "id": "#"
    })
    return ArvadosCommandTool(self.arvrunner, wf_runner, self.loadingContext).job(
        joborder_resolved, output_callback, runtimeContext)
def job(
        self,
        job_order,         # type: Dict[Text, Text]
        output_callbacks,  # type: Callable[[Any, Any], Any]
        **kwargs           # type: Any
):
    # type: (...) -> Generator[Union[JobBase, CallbackJob], None, None]
    """Build and yield the runnable job for this CommandLineTool.

    When kwargs["cachedir"] is set, first computes a cache key from the
    command line, inputs, and relevant requirements; on a cache hit yields
    a CallbackJob that replays the cached output instead of running.
    Otherwise constructs a job runner with path mappings, staged initial
    work dir, stdin/stdout/stderr redirection, environment, and the final
    command line, then yields it.
    """
    jobname = uniquename(kwargs.get("name", shortname(self.tool.get("id", "job"))))

    if kwargs.get("cachedir"):
        # --- Cache-key computation -----------------------------------
        # Build the command line against fixed container paths so the key
        # is independent of the local scratch directories.
        cacheargs = kwargs.copy()
        cacheargs["outdir"] = "/out"
        cacheargs["tmpdir"] = "/tmp"
        cacheargs["stagedir"] = "/stage"
        cachebuilder = self._init_job(job_order, **cacheargs)
        cachebuilder.pathmapper = PathMapper(cachebuilder.files,
                                             kwargs["basedir"],
                                             cachebuilder.stagedir,
                                             separateDirs=False)
        _check_adjust = partial(check_adjust, cachebuilder)
        visit_class([cachebuilder.files, cachebuilder.bindings],
                    ("File", "Directory"), _check_adjust)

        cmdline = flatten(list(map(cachebuilder.generate_arg, cachebuilder.bindings)))
        (docker_req, docker_is_req) = self.get_requirement("DockerRequirement")
        if docker_req and kwargs.get("use_container"):
            dockerimg = docker_req.get("dockerImageId") or docker_req.get("dockerPull")
        elif kwargs.get("default_container", None) is not None and kwargs.get("use_container"):
            dockerimg = kwargs.get("default_container")

        if dockerimg:
            # The image identity is part of the cache key.
            cmdline = ["docker", "run", dockerimg] + cmdline
        keydict = {u"cmdline": cmdline}

        if "stdout" in self.tool:
            keydict["stdout"] = self.tool["stdout"]
        # Fold each input file's size plus checksum (or mtime when no
        # usable checksum is recorded) into the key.
        for location, f in cachebuilder.pathmapper.items():
            if f.type == "File":
                checksum = next(
                    (e['checksum'] for e in cachebuilder.files
                     if 'location' in e and e['location'] == location
                     and 'checksum' in e
                     and e['checksum'] != 'sha1$hash'), None)
                st = os.stat(f.resolved)
                if checksum:
                    keydict[f.resolved] = [st.st_size, checksum]
                else:
                    keydict[f.resolved] = [st.st_size, int(st.st_mtime * 1000)]

        # Only requirements that can affect the tool's observable output
        # participate in the key; reversed() keeps the effective (last
        # declared) instance of each class.
        interesting = {"DockerRequirement",
                       "EnvVarRequirement",
                       "CreateFileRequirement",
                       "ShellCommandRequirement"}
        for rh in (self.requirements, self.hints):
            for r in reversed(rh):
                if r["class"] in interesting and r["class"] not in keydict:
                    keydict[r["class"]] = r

        keydictstr = json.dumps(keydict, separators=(',', ':'), sort_keys=True)
        # NOTE(review): md5 here is a cache key, not a security boundary.
        cachekey = hashlib.md5(keydictstr.encode('utf-8')).hexdigest()

        _logger.debug("[job %s] keydictstr is %s -> %s", jobname, keydictstr, cachekey)

        jobcache = os.path.join(kwargs["cachedir"], cachekey)
        jobcachepending = jobcache + ".pending"

        if os.path.isdir(jobcache) and not os.path.isfile(jobcachepending):
            # Cache hit: replay the cached output directory.
            if docker_req and kwargs.get("use_container"):
                cachebuilder.outdir = kwargs.get("docker_outdir") or "/var/spool/cwl"
            else:
                cachebuilder.outdir = jobcache

            _logger.info("[job %s] Using cached output in %s", jobname, jobcache)
            yield CallbackJob(self, output_callbacks, cachebuilder, jobcache)
            return
        else:
            # Cache miss: run for real, writing output into the cache slot;
            # the ".pending" marker is removed only on success.
            _logger.info("[job %s] Output of job will be cached in %s", jobname, jobcache)
            shutil.rmtree(jobcache, True)
            os.makedirs(jobcache)
            kwargs["outdir"] = jobcache
            open(jobcachepending, "w").close()

            def rm_pending_output_callback(output_callbacks, jobcachepending,
                                           outputs, processStatus):
                if processStatus == "success":
                    os.remove(jobcachepending)
                output_callbacks(outputs, processStatus)

            output_callbacks = cast(
                Callable[..., Any],  # known bug in mypy
                # https://github.com/python/mypy/issues/797
                partial(rm_pending_output_callback, output_callbacks, jobcachepending))

    # --- Job construction --------------------------------------------
    builder = self._init_job(job_order, **kwargs)

    reffiles = copy.deepcopy(builder.files)

    j = self.makeJobRunner(**kwargs)
    j.builder = builder
    j.joborder = builder.job
    j.make_pathmapper = self.makePathMapper
    j.stdin = None
    j.stderr = None
    j.stdout = None
    j.successCodes = self.tool.get("successCodes")
    j.temporaryFailCodes = self.tool.get("temporaryFailCodes")
    j.permanentFailCodes = self.tool.get("permanentFailCodes")
    j.requirements = self.requirements
    j.hints = self.hints
    j.name = jobname

    debug = _logger.isEnabledFor(logging.DEBUG)

    if debug:
        _logger.debug(u"[job %s] initializing from %s%s",
                      j.name,
                      self.tool.get("id", ""),
                      u" as part of %s" % kwargs["part_of"] if "part_of" in kwargs else "")
        _logger.debug(u"[job %s] %s", j.name, json.dumps(job_order, indent=4))

    # Map input files into the (possibly containerized) execution
    # environment; "stagedir" is consumed here, not passed through.
    builder.pathmapper = None
    make_path_mapper_kwargs = kwargs
    if "stagedir" in make_path_mapper_kwargs:
        make_path_mapper_kwargs = make_path_mapper_kwargs.copy()
        del make_path_mapper_kwargs["stagedir"]

    builder.pathmapper = self.makePathMapper(reffiles, builder.stagedir,
                                             **make_path_mapper_kwargs)
    builder.requirements = j.requirements

    _check_adjust = partial(check_adjust, builder)

    visit_class([builder.files, builder.bindings], ("File", "Directory"), _check_adjust)

    # --- InitialWorkDirRequirement: files generated into the work dir ---
    initialWorkdir = self.get_requirement("InitialWorkDirRequirement")[0]
    j.generatefiles = {"class": "Directory", "listing": [], "basename": ""}
    if initialWorkdir:
        ls = []  # type: List[Dict[Text, Any]]
        if isinstance(initialWorkdir["listing"], (str, Text)):
            # Whole listing given as a single expression.
            ls = builder.do_eval(initialWorkdir["listing"])
        else:
            for t in initialWorkdir["listing"]:
                if "entry" in t:
                    et = {u"entry": builder.do_eval(t["entry"])}
                    if "entryname" in t:
                        et["entryname"] = builder.do_eval(t["entryname"])
                    else:
                        et["entryname"] = None
                    et["writable"] = t.get("writable", False)
                    ls.append(et)
                else:
                    ls.append(builder.do_eval(t))
        # Normalize each entry into a File/Directory object.
        for i, t in enumerate(ls):
            if "entry" in t:
                if isinstance(t["entry"], string_types):
                    # Literal file contents.
                    ls[i] = {
                        "class": "File",
                        "basename": t["entryname"],
                        "contents": t["entry"],
                        "writable": t.get("writable")
                    }
                else:
                    if t.get("entryname") or t.get("writable"):
                        # Deep-copy before renaming so the source object in
                        # the input document is not mutated.
                        t = copy.deepcopy(t)
                        if t.get("entryname"):
                            t["entry"]["basename"] = t["entryname"]
                        t["entry"]["writable"] = t.get("writable")
                    ls[i] = t["entry"]
        j.generatefiles[u"listing"] = ls
        for l in ls:
            self.updatePathmap(builder.outdir, builder.pathmapper, l)
        visit_class([builder.files, builder.bindings], ("File", "Directory"), _check_adjust)

    if debug:
        _logger.debug(u"[job %s] path mappings is %s", j.name,
                      json.dumps({p: builder.pathmapper.mapper(p)
                                  for p in builder.pathmapper.files()}, indent=4))

    # --- stdin / stderr / stdout redirection -------------------------
    if self.tool.get("stdin"):
        with SourceLine(self.tool, "stdin", validate.ValidationException, debug):
            j.stdin = builder.do_eval(self.tool["stdin"])
            reffiles.append({"class": "File", "path": j.stdin})

    if self.tool.get("stderr"):
        with SourceLine(self.tool, "stderr", validate.ValidationException, debug):
            j.stderr = builder.do_eval(self.tool["stderr"])
            if os.path.isabs(j.stderr) or ".." in j.stderr:
                raise validate.ValidationException(
                    "stderr must be a relative path, got '%s'" % j.stderr)

    if self.tool.get("stdout"):
        with SourceLine(self.tool, "stdout", validate.ValidationException, debug):
            j.stdout = builder.do_eval(self.tool["stdout"])
            if os.path.isabs(j.stdout) or ".." in j.stdout or not j.stdout:
                raise validate.ValidationException(
                    "stdout must be a relative path, got '%s'" % j.stdout)

    if debug:
        _logger.debug(u"[job %s] command line bindings is %s", j.name,
                      json.dumps(builder.bindings, indent=4))

    # --- Scratch directories -----------------------------------------
    dockerReq = self.get_requirement("DockerRequirement")[0]
    if dockerReq and kwargs.get("use_container"):
        out_prefix = kwargs.get("tmp_outdir_prefix")
        j.outdir = kwargs.get("outdir") or tempfile.mkdtemp(prefix=out_prefix)
        tmpdir_prefix = kwargs.get('tmpdir_prefix')
        j.tmpdir = kwargs.get("tmpdir") or tempfile.mkdtemp(prefix=tmpdir_prefix)
        j.stagedir = tempfile.mkdtemp(prefix=tmpdir_prefix)
    else:
        j.outdir = builder.outdir
        j.tmpdir = builder.tmpdir
        j.stagedir = builder.stagedir

    inplaceUpdateReq = self.get_requirement(
        "http://commonwl.org/cwltool#InplaceUpdateRequirement")[0]

    if inplaceUpdateReq:
        j.inplace_update = inplaceUpdateReq["inplaceUpdate"]
    normalizeFilesDirs(j.generatefiles)

    # --- Mutation tracking -------------------------------------------
    readers = {}
    muts = set()

    if builder.mutation_manager:
        def register_mut(f):
            muts.add(f["location"])
            builder.mutation_manager.register_mutation(j.name, f)

        def register_reader(f):
            if f["location"] not in muts:
                builder.mutation_manager.register_reader(j.name, f)
                readers[f["location"]] = f

        # Writable in-place entries are mutations; everything else is read.
        for li in j.generatefiles["listing"]:
            li = cast(Dict[Text, Any], li)
            if li.get("writable") and j.inplace_update:
                adjustFileObjs(li, register_mut)
                adjustDirObjs(li, register_mut)
            else:
                adjustFileObjs(li, register_reader)
                adjustDirObjs(li, register_reader)

        adjustFileObjs(builder.files, register_reader)
        adjustFileObjs(builder.bindings, register_reader)
        adjustDirObjs(builder.files, register_reader)
        adjustDirObjs(builder.bindings, register_reader)

    # --- Environment and command line --------------------------------
    j.environment = {}
    evr = self.get_requirement("EnvVarRequirement")[0]
    if evr:
        for t in evr["envDef"]:
            j.environment[t["envName"]] = builder.do_eval(t["envValue"])

    shellcmd = self.get_requirement("ShellCommandRequirement")[0]
    if shellcmd:
        cmd = []  # type: List[Text]
        for b in builder.bindings:
            arg = builder.generate_arg(b)
            if b.get("shellQuote", True):
                arg = [shellescape.quote(a) for a in aslist(arg)]
            cmd.extend(aslist(arg))
        j.command_line = ["/bin/sh", "-c", " ".join(cmd)]
    else:
        j.command_line = flatten(list(map(builder.generate_arg, builder.bindings)))

    j.pathmapper = builder.pathmapper
    j.collect_outputs = partial(self.collect_output_ports,
                                self.tool["outputs"],
                                builder,
                                compute_checksum=kwargs.get("compute_checksum", True),
                                jobname=jobname,
                                readers=readers)
    j.output_callback = output_callbacks

    yield j
def __init__(self,
             toolpath_object,  # type: MutableMapping[Text, Any]
             loadingContext    # type: LoadingContext
            ):  # type: (...) -> None
    """Initialize a Process from a parsed CWL tool document.

    Loads the base CWL schemas (once, into module globals), merges
    requirements/hints from the loading context, the document, and any
    overrides, builds the Avro input/output record schemas, optionally
    validates embedded JavaScript expressions, and runs DockerRequirement
    sanity checks.

    :raises validate.ValidationException: on schema/type errors.
    """
    self.metadata = getdefault(loadingContext.metadata, {})  # type: Dict[Text,Any]
    self.provenance_object = None  # type: Optional[CreateProvProfile]
    self.parent_wf = None  # type: Optional[CreateProvProfile]
    global SCHEMA_FILE, SCHEMA_DIR, SCHEMA_ANY  # pylint: disable=global-statement
    if SCHEMA_FILE is None or SCHEMA_ANY is None or SCHEMA_DIR is None:
        # Lazily populate the shared base-type schemas from the v1.0 cache.
        get_schema("v1.0")
        SCHEMA_ANY = cast(Dict[Text, Any],
                          SCHEMA_CACHE["v1.0"][3].idx["https://w3id.org/cwl/salad#Any"])
        SCHEMA_FILE = cast(Dict[Text, Any],
                           SCHEMA_CACHE["v1.0"][3].idx["https://w3id.org/cwl/cwl#File"])
        SCHEMA_DIR = cast(Dict[Text, Any],
                          SCHEMA_CACHE["v1.0"][3].idx["https://w3id.org/cwl/cwl#Directory"])

    names = schema.make_avro_schema([SCHEMA_FILE, SCHEMA_DIR, SCHEMA_ANY],
                                    Loader({}))[0]
    if isinstance(names, schema.SchemaParseException):
        raise names
    else:
        self.names = names
    self.tool = toolpath_object
    # Requirements: context-supplied, then document-declared, then overrides.
    self.requirements = copy.deepcopy(getdefault(loadingContext.requirements, []))
    self.requirements.extend(self.tool.get("requirements", []))
    self.requirements.extend(get_overrides(
        getdefault(loadingContext.overrides_list, []),
        self.tool["id"]).get("requirements", []))
    self.hints = copy.deepcopy(getdefault(loadingContext.hints, []))
    self.hints.extend(self.tool.get("hints", []))
    # Versions of requirements and hints which aren't mutated.
    self.original_requirements = copy.deepcopy(self.requirements)
    self.original_hints = copy.deepcopy(self.hints)
    self.doc_loader = loadingContext.loader
    self.doc_schema = loadingContext.avsc_names

    self.formatgraph = None  # type: Optional[Graph]
    if self.doc_loader is not None:
        self.formatgraph = self.doc_loader.graph

    checkRequirements(self.tool, supportedProcessRequirements)
    self.validate_hints(loadingContext.avsc_names, self.tool.get("hints", []),
                        strict=getdefault(loadingContext.strict, False))

    self.schemaDefs = {}  # type: Dict[Text,Dict[Text, Any]]

    sd, _ = self.get_requirement("SchemaDefRequirement")

    if sd is not None:
        # Register user-defined types so they can be referenced by name.
        sdtypes = sd["types"]
        av = schema.make_valid_avro(sdtypes,
                                    {t["name"]: t for t in avroize_type(sdtypes)},
                                    set())
        for i in av:
            self.schemaDefs[i["name"]] = i  # type: ignore
        schema.AvroSchemaFromJSONData(av, self.names)  # type: ignore

    # Build record schema from inputs
    self.inputs_record_schema = {
        "name": "input_record_schema", "type": "record",
        "fields": []}  # type: Dict[Text, Any]
    self.outputs_record_schema = {
        "name": "outputs_record_schema", "type": "record",
        "fields": []}  # type: Dict[Text, Any]

    for key in ("inputs", "outputs"):
        for i in self.tool[key]:
            c = copy.deepcopy(i)
            c["name"] = shortname(c["id"])
            del c["id"]

            if "type" not in c:
                raise validate.ValidationException(
                    u"Missing 'type' in parameter '{}'".format(c["name"]))

            if "default" in c and "null" not in aslist(c["type"]):
                # A parameter with a default is implicitly nullable.
                nullable = ["null"]
                nullable.extend(aslist(c["type"]))
                c["type"] = nullable
            else:
                c["type"] = c["type"]  # no-op branch, kept for symmetry
            c["type"] = avroize_type(c["type"], c["name"])
            if key == "inputs":
                self.inputs_record_schema["fields"].append(c)
            elif key == "outputs":
                self.outputs_record_schema["fields"].append(c)

    with SourceLine(toolpath_object, "inputs", validate.ValidationException):
        self.inputs_record_schema = cast(
            Dict[Text, Any],
            schema.make_valid_avro(self.inputs_record_schema, {}, set()))
        schema.AvroSchemaFromJSONData(self.inputs_record_schema, self.names)
    with SourceLine(toolpath_object, "outputs", validate.ValidationException):
        self.outputs_record_schema = cast(
            Dict[Text, Any],
            schema.make_valid_avro(self.outputs_record_schema, {}, set()))
        schema.AvroSchemaFromJSONData(self.outputs_record_schema, self.names)

    if toolpath_object.get("class") is not None \
            and not getdefault(loadingContext.disable_js_validation, False):
        if loadingContext.js_hint_options_file is not None:
            try:
                with open(loadingContext.js_hint_options_file) as options_file:
                    validate_js_options = json.load(options_file)
            except (OSError, ValueError) as err:
                _logger.error(
                    "Failed to read options file %s",
                    loadingContext.js_hint_options_file)
                raise err
        else:
            validate_js_options = None
        if self.doc_schema is not None:
            validate_js_expressions(
                cast(CommentedMap, toolpath_object),
                self.doc_schema.names[toolpath_object["class"]],
                validate_js_options)

    dockerReq, is_req = self.get_requirement("DockerRequirement")

    if dockerReq is not None and "dockerOutputDirectory" in dockerReq\
            and is_req is not None and not is_req:
        # dockerOutputDirectory supplied only as a hint: warn but continue.
        _logger.warning(SourceLine(
            item=dockerReq, raise_type=Text).makeError(
                "When 'dockerOutputDirectory' is declared, DockerRequirement "
                "should go in the 'requirements' section, not 'hints'.""")

    if dockerReq is not None and is_req is not None\
            and dockerReq.get("dockerOutputDirectory") == "/var/spool/cwl":
        if is_req:
            # In this specific case, it is legal to have /var/spool/cwl, so skip the check.
            pass
        else:
            # Must be a requirement
            var_spool_cwl_detector(self.tool)
    else:
        var_spool_cwl_detector(self.tool)
def collect_output(self, schema, builder, outdir, fs_access, compute_checksum=True):
    # type: (Dict[Text, Any], Builder, Text, StdFsAccess, bool) -> Union[Dict[Text, Any], List[Union[Dict[Text, Any], Text]]]
    """Collect one output parameter's value from the job's output directory.

    Evaluates the parameter's outputBinding (glob, loadContents,
    outputEval), attaches size/checksum/secondaryFiles metadata to
    discovered files, and recurses into record fields when no binding
    produced a value.

    :param schema: the output parameter schema (with optional outputBinding).
    :param builder: expression evaluation context.
    :param outdir: the job output directory to search.
    :param fs_access: filesystem abstraction used for globbing and reading.
    :param compute_checksum: when True, compute sha1 checksums of files.
    :returns: the output value (File/Directory object(s), eval result, or
        a dict of field values for record types); None for absent optionals.
    """
    r = []  # type: List[Any]
    debug = _logger.isEnabledFor(logging.DEBUG)
    if "outputBinding" in schema:
        binding = schema["outputBinding"]
        globpatterns = []  # type: List[Text]

        revmap = partial(revmap_file, builder, outdir)

        if "glob" in binding:
            with SourceLine(binding, "glob", WorkflowException, debug):
                # Evaluate glob expressions; each may yield one or more patterns.
                for gb in aslist(binding["glob"]):
                    gb = builder.do_eval(gb)
                    if gb:
                        globpatterns.extend(aslist(gb))

                for gb in globpatterns:
                    if gb.startswith(outdir):
                        gb = gb[len(outdir) + 1:]
                    elif gb == ".":
                        gb = outdir
                    elif gb.startswith("/"):
                        raise WorkflowException(
                            "glob patterns must not start with '/'")
                    try:
                        prefix = fs_access.glob(outdir)
                        r.extend([{"location": g,
                                   "path": fs_access.join(builder.outdir,
                                                          g[len(prefix[0]) + 1:]),
                                   "basename": os.path.basename(g),
                                   "nameroot": os.path.splitext(
                                       os.path.basename(g))[0],
                                   "nameext": os.path.splitext(
                                       os.path.basename(g))[1],
                                   "class": "File" if fs_access.isfile(g)
                                   else "Directory"}
                                  for g in fs_access.glob(
                                      fs_access.join(outdir, gb))])
                    except (OSError, IOError) as e:
                        _logger.warning(Text(e))
                    except Exception:
                        # Was a bare "except:"; narrowed so that
                        # KeyboardInterrupt/SystemExit are not intercepted.
                        # Still logged and re-raised.
                        _logger.error("Unexpected error from fs_access",
                                      exc_info=True)
                        raise

            # Attach listing / contents / checksum / size metadata.
            for files in r:
                rfile = files.copy()
                revmap(rfile)
                if files["class"] == "Directory":
                    ll = builder.loadListing or (
                        binding and binding.get("loadListing"))
                    if ll and ll != "no_listing":
                        get_listing(fs_access, files, (ll == "deep_listing"))
                else:
                    with fs_access.open(rfile["location"], "rb") as f:
                        contents = b""
                        if binding.get("loadContents") or compute_checksum:
                            contents = f.read(CONTENT_LIMIT)
                        if binding.get("loadContents"):
                            files["contents"] = contents
                        if compute_checksum:
                            checksum = hashlib.sha1()
                            while contents != b"":
                                checksum.update(contents)
                                contents = f.read(1024 * 1024)
                            files["checksum"] = "sha1$%s" % checksum.hexdigest()
                        f.seek(0, 2)
                        filesize = f.tell()
                    files["size"] = filesize
                    if "format" in schema:
                        files["format"] = builder.do_eval(schema["format"],
                                                          context=files)

        # Determine whether this output is optional and/or a single file.
        optional = False
        single = False
        if isinstance(schema["type"], list):
            if "null" in schema["type"]:
                optional = True
            if "File" in schema["type"] or "Directory" in schema["type"]:
                single = True
        elif schema["type"] == "File" or schema["type"] == "Directory":
            single = True

        if "outputEval" in binding:
            with SourceLine(binding, "outputEval", WorkflowException, debug):
                r = builder.do_eval(binding["outputEval"], context=r)

        if single:
            if not r and not optional:
                with SourceLine(binding, "glob", WorkflowException, debug):
                    raise WorkflowException(
                        "Did not find output file with glob pattern: '{}'".
                        format(globpatterns))
            elif not r and optional:
                pass
            elif isinstance(r, list):
                if len(r) > 1:
                    raise WorkflowException(
                        "Multiple matches for output item that is a single file.")
                else:
                    r = r[0]

        if "secondaryFiles" in schema:
            with SourceLine(schema, "secondaryFiles", WorkflowException, debug):
                for primary in aslist(r):
                    if isinstance(primary, dict):
                        primary.setdefault("secondaryFiles", [])
                        pathprefix = primary["path"][0:primary["path"].rindex("/") + 1]
                        for sf in aslist(schema["secondaryFiles"]):
                            # Dicts and expression strings are evaluated;
                            # plain strings go through suffix substitution.
                            if isinstance(sf, dict) or "$(" in sf or "${" in sf:
                                sfpath = builder.do_eval(sf, context=primary)
                                subst = False
                            else:
                                sfpath = sf
                                subst = True
                            for sfitem in aslist(sfpath):
                                if isinstance(sfitem, string_types):
                                    if subst:
                                        sfitem = {"path": substitute(
                                            primary["path"], sfitem)}
                                    else:
                                        sfitem = {"path": pathprefix + sfitem}
                                if "path" in sfitem and "location" not in sfitem:
                                    revmap(sfitem)
                                # Only keep secondary files that exist.
                                if fs_access.isfile(sfitem["location"]):
                                    sfitem["class"] = "File"
                                    primary["secondaryFiles"].append(sfitem)
                                elif fs_access.isdir(sfitem["location"]):
                                    sfitem["class"] = "Directory"
                                    primary["secondaryFiles"].append(sfitem)

        # Ensure files point to local references outside of the run environment
        adjustFileObjs(r, cast(  # known bug in mypy
            # https://github.com/python/mypy/issues/797
            Callable[[Any], Any], revmap))

        if not r and optional:
            r = None

    if (not r and isinstance(schema["type"], dict)
            and schema["type"]["type"] == "record"):
        # No binding produced a value: collect each record field separately.
        out = {}
        for f in schema["type"]["fields"]:
            out[shortname(f["name"])] = self.collect_output(  # type: ignore
                f, builder, outdir, fs_access,
                compute_checksum=compute_checksum)
        return out
    return r
def static_checker(workflow_inputs, workflow_outputs, step_inputs, step_outputs):
    # type: (List[Dict[Text, Any]], List[Dict[Text, Any]], List[Dict[Text, Any]], List[Dict[Text, Any]]) -> None
    """Check if all source and sink types of a workflow are compatible before run time.

    Sources are workflow inputs plus step outputs; sinks are step inputs
    plus workflow outputs.  Partial incompatibilities are logged as
    warnings; hard incompatibilities and required-but-unconnected sinks
    raise a ValidationException.
    """
    # Index every source parameter by its "id" field.
    src_dict = {parm["id"]: parm for parm in workflow_inputs + step_outputs}

    step_inputs_val = check_all_types(src_dict, step_inputs, "source")
    workflow_outputs_val = check_all_types(src_dict, workflow_outputs,
                                           "outputSource")

    warnings = step_inputs_val["warning"] + workflow_outputs_val["warning"]
    exceptions = step_inputs_val["exception"] + workflow_outputs_val["exception"]

    def _describe(issue, src_template):
        # Render one incompatibility as a two-line (or three-line, with
        # linkMerge) source/sink message.
        src = issue.src
        sink = issue.sink
        text = SourceLine(src, "type").makeError(
            src_template % (shortname(src["id"]), json.dumps(src["type"]))) \
            + "\n" + SourceLine(sink, "type").makeError(
                " with sink '%s' of type %s"
                % (shortname(sink["id"]), json.dumps(sink["type"])))
        if issue.linkMerge:
            text += "\n" + SourceLine(sink).makeError(
                " source has linkMerge method %s" % issue.linkMerge)
        return text

    warning_msgs = [
        _describe(w, "Source '%s' of type %s is partially incompatible")
        for w in warnings]
    exception_msgs = [
        _describe(e, "Source '%s' of type %s is incompatible")
        for e in exceptions]

    # A non-nullable step input with no source, default, or valueFrom can
    # never receive a value.
    for sink in step_inputs:
        if (sink["type"] != 'null' and 'null' not in sink["type"]
                and "source" not in sink
                and "default" not in sink
                and "valueFrom" not in sink):
            exception_msgs.append(SourceLine(sink).makeError(
                "Required parameter '%s' does not have source, default, or valueFrom expression"
                % shortname(sink["id"])))

    all_warning_msg = "\n".join(warning_msgs)
    all_exception_msg = "\n".join(exception_msgs)

    if warnings:
        _logger.warning("Workflow checker warning:\n%s" % all_warning_msg)
    if exceptions:
        raise validate.ValidationException(all_exception_msg)
def generate_arg(self, binding): # type: (Dict[str, Any]) -> List[str] value = binding.get("datum") if "valueFrom" in binding: with SourceLine( binding, "valueFrom", WorkflowException, _logger.isEnabledFor(logging.DEBUG), ): value = self.do_eval(binding["valueFrom"], context=value) prefix = binding.get("prefix") # type: Optional[str] sep = binding.get("separate", True) if prefix is None and not sep: with SourceLine( binding, "separate", WorkflowException, _logger.isEnabledFor(logging.DEBUG), ): raise WorkflowException( "'separate' option can not be specified without prefix") argl = [] # type: MutableSequence[MutableMapping[str, str]] if isinstance(value, MutableSequence): if binding.get("itemSeparator") and value: argl = [ binding["itemSeparator"].join( [self.tostr(v) for v in value]) ] elif binding.get("valueFrom"): value = [self.tostr(v) for v in value] return ([prefix] if prefix else []) + value elif prefix and value: return [prefix] else: return [] elif isinstance(value, MutableMapping) and value.get("class") in ( "File", "Directory", ): argl = [value] elif isinstance(value, MutableMapping): return [prefix] if prefix else [] elif value is True and prefix: return [prefix] elif value is False or value is None or (value is True and not prefix): return [] else: argl = [value] args = [] for j in argl: if sep: args.extend([prefix, self.tostr(j)]) else: args.append( self.tostr(j) if prefix is None else prefix + self.tostr(j)) return [a for a in args if a is not None]
def __init__(self, toolpath_object, pos, **kwargs):
    # type: (Dict[Text, Any], int, **Any) -> None
    """Construct a workflow step: load its tool, bind step in/out to tool
    parameters, validate them, and apply scatter type wrapping.

    Raises WorkflowException if the embedded tool fails validation, and
    validate.ValidationException for unmatched outputs, missing required
    inputs, or bad scatter configuration.
    """
    if "id" in toolpath_object:
        self.id = toolpath_object["id"]
    else:
        self.id = "#step" + Text(pos)

    # Step-level requirements/hints are layered on top of inherited ones.
    kwargs["requirements"] = (
        kwargs.get("requirements", []) +
        toolpath_object.get("requirements", []) +
        get_overrides(kwargs.get("overrides", []), self.id))
    kwargs["hints"] = kwargs.get("hints", []) + toolpath_object.get(
        "hints", [])

    try:
        if isinstance(toolpath_object["run"], dict):
            # "run" is an inline (embedded) process description.
            self.embedded_tool = kwargs.get("makeTool")(
                toolpath_object["run"], **kwargs)
        else:
            # "run" is a reference to an external document.
            self.embedded_tool = load_tool(
                toolpath_object["run"],
                kwargs.get("makeTool"),
                kwargs,
                enable_dev=kwargs.get("enable_dev"),
                strict=kwargs.get("strict"),
                fetcher_constructor=kwargs.get("fetcher_constructor"),
                resolver=kwargs.get("resolver"),
                overrides=kwargs.get("overrides"))
    except validate.ValidationException as v:
        raise WorkflowException(
            u"Tool definition %s failed validation:\n%s" %
            (toolpath_object["run"], validate.indent(str(v))))

    validation_errors = []
    self.tool = toolpath_object = copy.deepcopy(toolpath_object)
    bound = set()
    # Match each step "in"/"out" entry with the corresponding tool
    # "inputs"/"outputs" parameter and merge the tool's metadata into it.
    for stepfield, toolfield in (("in", "inputs"), ("out", "outputs")):
        toolpath_object[toolfield] = []
        for n, step_entry in enumerate(toolpath_object[stepfield]):
            if isinstance(step_entry, six.string_types):
                param = CommentedMap()  # type: CommentedMap
                inputid = step_entry
            else:
                param = CommentedMap(six.iteritems(step_entry))
                inputid = step_entry["id"]

            shortinputid = shortname(inputid)
            found = False
            for tool_entry in self.embedded_tool.tool[toolfield]:
                frag = shortname(tool_entry["id"])
                if frag == shortinputid:
                    param.update(tool_entry)
                    found = True
                    bound.add(frag)
                    break
            if not found:
                if stepfield == "in":
                    # Extra step inputs (e.g. only used by valueFrom) are
                    # permitted; give them the most permissive type.
                    param["type"] = "Any"
                else:
                    # BUG FIX: the conditional must be parenthesized inside
                    # the "%" operand; previously it bound as
                    # (fmt % x) if cond else y, so dict entries lost the
                    # "does not correspond to" text entirely.
                    validation_errors.append(
                        SourceLine(self.tool["out"], n).makeError(
                            "Workflow step output '%s' does not correspond to"
                            % (shortname(step_entry)
                               if isinstance(step_entry, six.string_types)
                               else shortname(step_entry["id"])))
                        + "\n" +
                        SourceLine(self.embedded_tool.tool, "outputs").
                        makeError("  tool output (expected '%s')" % (
                            "', '".join([
                                shortname(tool_entry["id"])
                                for tool_entry in
                                self.embedded_tool.tool[toolfield]
                            ]))))
            param["id"] = inputid
            # Preserve original source position for later error reporting.
            param.lc.line = toolpath_object[stepfield].lc.data[n][0]
            param.lc.col = toolpath_object[stepfield].lc.data[n][1]
            param.lc.filename = toolpath_object[stepfield].lc.filename
            toolpath_object[toolfield].append(param)

    # Any tool input that is neither optional nor defaulted must have been
    # bound by a step "in" entry.
    missing = []
    for tool_entry in self.embedded_tool.tool["inputs"]:
        if shortname(tool_entry["id"]) not in bound:
            if "null" not in tool_entry[
                    "type"] and "default" not in tool_entry:
                missing.append(shortname(tool_entry["id"]))
    if missing:
        validation_errors.append(
            SourceLine(self.tool, "in").makeError(
                "Step is missing required parameter%s '%s'" %
                ("s" if len(missing) > 1 else "", "', '".join(missing))))

    if validation_errors:
        raise validate.ValidationException("\n".join(validation_errors))

    super(WorkflowStep, self).__init__(toolpath_object, **kwargs)

    if self.embedded_tool.tool["class"] == "Workflow":
        (feature, _) = self.get_requirement("SubworkflowFeatureRequirement")
        if not feature:
            raise WorkflowException(
                "Workflow contains embedded workflow but SubworkflowFeatureRequirement not in requirements"
            )

    if "scatter" in self.tool:
        (feature, _) = self.get_requirement("ScatterFeatureRequirement")
        if not feature:
            raise WorkflowException(
                "Workflow contains scatter but ScatterFeatureRequirement not in requirements"
            )

        inputparms = copy.deepcopy(self.tool["inputs"])
        outputparms = copy.deepcopy(self.tool["outputs"])
        scatter = aslist(self.tool["scatter"])

        method = self.tool.get("scatterMethod")
        if method is None and len(scatter) != 1:
            raise validate.ValidationException(
                "Must specify scatterMethod when scattering over multiple inputs"
            )

        inp_map = {i["id"]: i for i in inputparms}
        for s in scatter:
            if s not in inp_map:
                raise validate.ValidationException(
                    SourceLine(self.tool, "scatter").makeError(
                        u"Scatter parameter '%s' does not correspond to an input parameter of this "
                        u"step, expecting '%s'" %
                        (shortname(s), "', '".join(
                            shortname(k) for k in inp_map.keys()))))

            # Each scattered input is presented to the step as an array.
            inp_map[s]["type"] = {
                "type": "array",
                "items": inp_map[s]["type"]
            }

        # nested_crossproduct nests outputs one array level per scatter
        # variable; every other method adds exactly one level.
        if self.tool.get("scatterMethod") == "nested_crossproduct":
            nesting = len(scatter)
        else:
            nesting = 1

        for _ in range(nesting):
            for op in outputparms:
                op["type"] = {"type": "array", "items": op["type"]}
        self.tool["inputs"] = inputparms
        self.tool["outputs"] = outputparms
def arv_docker_get_image(api_client, dockerRequirement, pull_image, project_uuid,
                         force_pull, tmp_outdir_prefix):
    """Check if a Docker image is available in Keep, if not, upload it using arv-keepdocker.

    Returns the portable data hash (PDH) of the Keep collection holding the
    image.  Results are memoized in the module-level ``cached_lookups`` dict,
    guarded by ``cached_lookups_lock``.
    """
    # An explicit collection PDH in the requirement short-circuits everything.
    if "http://arvados.org/cwl#dockerCollectionPDH" in dockerRequirement:
        return dockerRequirement["http://arvados.org/cwl#dockerCollectionPDH"]

    if "dockerImageId" not in dockerRequirement and "dockerPull" in dockerRequirement:
        # Derive dockerImageId from dockerPull; deepcopy so the caller's
        # requirement object is not mutated.
        dockerRequirement = copy.deepcopy(dockerRequirement)
        dockerRequirement["dockerImageId"] = dockerRequirement["dockerPull"]
        if hasattr(dockerRequirement, 'lc'):
            # Keep ruamel line/column info aligned for error reporting.
            dockerRequirement.lc.data[
                "dockerImageId"] = dockerRequirement.lc.data["dockerPull"]

    global cached_lookups
    global cached_lookups_lock
    # Fast path: previously resolved image id.
    with cached_lookups_lock:
        if dockerRequirement["dockerImageId"] in cached_lookups:
            return cached_lookups[dockerRequirement["dockerImageId"]]

    with SourceLine(dockerRequirement, "dockerImageId", WorkflowException,
                    logger.isEnabledFor(logging.DEBUG)):
        # NOTE(review): splitting on the first ":" misparses image ids that
        # include a registry port (e.g. "myregistry:5000/img") — confirm
        # whether such ids can reach this code path.
        sp = dockerRequirement["dockerImageId"].split(":")
        image_name = sp[0]
        image_tag = sp[1] if len(sp) > 1 else "latest"

        images = arvados.commands.keepdocker.list_images_in_arv(
            api_client, 3, image_name=image_name, image_tag=image_tag)

        if not images:
            # Fetch Docker image if necessary.
            try:
                result = cwltool.docker.DockerCommandLineJob.get_image(
                    dockerRequirement, pull_image, force_pull,
                    tmp_outdir_prefix)
                if not result:
                    raise WorkflowException("Docker image '%s' not available"
                                            % dockerRequirement["dockerImageId"])
            except OSError as e:
                raise WorkflowException(
                    "While trying to get Docker image '%s', failed to execute 'docker': %s"
                    % (dockerRequirement["dockerImageId"], e))

            # Upload image to Arvados
            args = []
            if project_uuid:
                args.append("--project-uuid=" + project_uuid)
            args.append(image_name)
            args.append(image_tag)
            logger.info("Uploading Docker image %s:%s", image_name, image_tag)
            try:
                arvados.commands.put.api_client = api_client
                arvados.commands.keepdocker.main(args,
                                                 stdout=sys.stderr,
                                                 install_sig_handlers=False,
                                                 api=api_client)
            except SystemExit as e:
                # If e.code is None or zero, then keepdocker exited normally and we can continue
                if e.code:
                    raise WorkflowException("keepdocker exited with code %s"
                                            % e.code)

            # Re-list: the upload should have made the image visible.
            images = arvados.commands.keepdocker.list_images_in_arv(
                api_client, 3, image_name=image_name, image_tag=image_tag)

        if not images:
            raise WorkflowException("Could not find Docker image %s:%s"
                                    % (image_name, image_tag))

        pdh = api_client.collections().get(
            uuid=images[0][0]).execute()["portable_data_hash"]

        # Populate the cache for subsequent lookups of the same image id.
        with cached_lookups_lock:
            cached_lookups[dockerRequirement["dockerImageId"]] = pdh

    return pdh
def check_for_abstract_op(tool: CWLObjectType) -> None: if tool["class"] == "Operation": raise SourceLine( tool, "class", WorkflowException, runtime_context.debug ).makeError("Workflow has unrunnable abstract Operation")
def run(self, runtimeContext):
    """Submit this job to the Arvados jobs API (crunchrunner).

    Builds script_parameters and runtime_constraints from the tool's
    requirements, stages any generated files into a Keep collection,
    then creates (or reuses) an Arvados job.  On any failure the output
    callback is invoked with "permanentFail".
    """
    script_parameters = {"command": self.command_line}
    runtime_constraints = {}

    with Perf(metrics, "generatefiles %s" % self.name):
        if self.generatefiles["listing"]:
            # Stage InitialWorkDirRequirement content into a new Keep
            # collection ("virtual working directory").
            vwd = arvados.collection.Collection(
                api_client=self.arvrunner.api,
                keep_client=self.arvrunner.keep_client,
                num_retries=self.arvrunner.num_retries)
            script_parameters["task.vwd"] = {}
            generatemapper = VwdPathMapper(self.generatefiles["listing"],
                                           "", "",
                                           separateDirs=False)

            with Perf(metrics, "createfiles %s" % self.name):
                for f, p in generatemapper.items():
                    if p.type == "CreateFile":
                        with vwd.open(p.target, "w") as n:
                            n.write(p.resolved.encode("utf-8"))

            if vwd:
                with Perf(metrics, "generatefiles.save_new %s" % self.name):
                    info = get_intermediate_collection_info(
                        self.name, None,
                        runtimeContext.intermediate_output_ttl)
                    vwd.save_new(name=info["name"],
                                 owner_uuid=self.arvrunner.project_uuid,
                                 ensure_unique_name=True,
                                 trash_at=info["trash_at"],
                                 properties=info["properties"])

            # Map each staged path to its Keep location for crunchrunner.
            for f, p in generatemapper.items():
                if p.type == "File":
                    script_parameters["task.vwd"][p.target] = p.resolved
                if p.type == "CreateFile":
                    script_parameters["task.vwd"][
                        p.target] = "$(task.keep)/%s/%s" % (
                            vwd.portable_data_hash(), p.target)

    script_parameters["task.env"] = {
        "TMPDIR": self.tmpdir,
        "HOME": self.outdir
    }
    if self.environment:
        script_parameters["task.env"].update(self.environment)

    if self.stdin:
        script_parameters["task.stdin"] = self.stdin

    if self.stdout:
        script_parameters["task.stdout"] = self.stdout

    if self.stderr:
        script_parameters["task.stderr"] = self.stderr

    if self.successCodes:
        script_parameters["task.successCodes"] = self.successCodes
    if self.temporaryFailCodes:
        script_parameters[
            "task.temporaryFailCodes"] = self.temporaryFailCodes
    if self.permanentFailCodes:
        script_parameters[
            "task.permanentFailCodes"] = self.permanentFailCodes

    with Perf(metrics, "arv_docker_get_image %s" % self.name):
        (docker_req,
         docker_is_req) = self.get_requirement("DockerRequirement")
        if docker_req and runtimeContext.use_container is not False:
            if docker_req.get("dockerOutputDirectory"):
                raise SourceLine(
                    docker_req, "dockerOutputDirectory",
                    UnsupportedRequirement
                ).makeError(
                    "Option 'dockerOutputDirectory' of DockerRequirement not supported."
                )
            runtime_constraints["docker_image"] = arv_docker_get_image(
                self.arvrunner.api, docker_req, runtimeContext.pull_image,
                self.arvrunner.project_uuid)
        else:
            # Fall back to the stock arvados/jobs image.
            runtime_constraints["docker_image"] = "arvados/jobs"

    resources = self.builder.resources
    if resources is not None:
        runtime_constraints["min_cores_per_node"] = resources.get(
            "cores", 1)
        runtime_constraints["min_ram_mb_per_node"] = resources.get("ram")
        # Scratch covers both tmpdir and outdir requests.
        runtime_constraints["min_scratch_mb_per_node"] = resources.get(
            "tmpdirSize", 0) + resources.get("outdirSize", 0)

    runtime_req, _ = self.get_requirement(
        "http://arvados.org/cwl#RuntimeConstraints")
    if runtime_req:
        if "keep_cache" in runtime_req:
            runtime_constraints["keep_cache_mb_per_task"] = runtime_req[
                "keep_cache"]
            # Keep cache lives in node RAM, so bump the RAM request too.
            runtime_constraints["min_ram_mb_per_node"] += runtime_req[
                "keep_cache"]
        if "outputDirType" in runtime_req:
            if runtime_req["outputDirType"] == "local_output_dir":
                script_parameters["task.keepTmpOutput"] = False
            elif runtime_req["outputDirType"] == "keep_output_dir":
                script_parameters["task.keepTmpOutput"] = True

    # Filters used by find_or_create to locate an equivalent finished job
    # for reuse.
    filters = [["repository", "=", "arvados"],
               ["script", "=", "crunchrunner"],
               ["script_version", "in git", crunchrunner_git_commit]]
    if not self.arvrunner.ignore_docker_for_reuse:
        filters.append([
            "docker_image_locator", "in docker",
            runtime_constraints["docker_image"]
        ])

    enable_reuse = runtimeContext.enable_reuse
    if enable_reuse:
        # A tool-level ReuseRequirement can override the global setting.
        reuse_req, _ = self.get_requirement(
            "http://arvados.org/cwl#ReuseRequirement")
        if reuse_req:
            enable_reuse = reuse_req["enableReuse"]

    self.output_callback = self.arvrunner.get_wrapped_callback(
        self.output_callback)

    try:
        with Perf(metrics, "create %s" % self.name):
            response = self.arvrunner.api.jobs().create(
                body={
                    "owner_uuid": self.arvrunner.project_uuid,
                    "script": "crunchrunner",
                    "repository": "arvados",
                    "script_version": "master",
                    "minimum_script_version": crunchrunner_git_commit,
                    "script_parameters": {
                        "tasks": [script_parameters]
                    },
                    "runtime_constraints": runtime_constraints
                },
                filters=filters,
                find_or_create=enable_reuse).execute(
                    num_retries=self.arvrunner.num_retries)

        self.uuid = response["uuid"]
        self.arvrunner.process_submitted(self)

        self.update_pipeline_component(response)

        if response["state"] == "Complete":
            # find_or_create returned an already-finished job.
            logger.info("%s reused job %s", self.arvrunner.label(self),
                        response["uuid"])
            # Give read permission to the desired project on reused jobs
            if response["owner_uuid"] != self.arvrunner.project_uuid:
                try:
                    self.arvrunner.api.links().create(
                        body={
                            'link_class': 'permission',
                            'name': 'can_read',
                            'tail_uuid': self.arvrunner.project_uuid,
                            'head_uuid': response["uuid"],
                        }).execute(num_retries=self.arvrunner.num_retries)
                except ApiError as e:
                    # The user might not have "manage" access on the job: log
                    # a message and continue.
                    logger.info("Creating read permission on job %s: %s",
                                response["uuid"], e)
        else:
            logger.info("%s %s is %s", self.arvrunner.label(self),
                        response["uuid"], response["state"])
    except Exception as e:
        # Submission failed; report permanent failure to the callback so the
        # workflow engine can react.
        logger.exception("%s error" % (self.arvrunner.label(self)))
        self.output_callback({}, "permanentFail")
def job(self, joborder, output_callback, **kwargs):
    # type: (Dict[Text, Any], Callable[[Any, Any], Any], **Any) -> Generator
    """Generator that drives the workflow's steps to completion.

    Initializes per-input workflow state from ``joborder`` (or defaults),
    then repeatedly asks each unsubmitted step for runnable jobs, yielding
    each job (or None when blocked but not finished).  When no further
    progress is possible, collects the workflow outputs and invokes
    ``output_callback(outputs, status)``.
    """
    self.state = {}
    self.processStatus = "success"

    if "outdir" in kwargs:
        del kwargs["outdir"]

    # Seed state with each workflow input: from the job order if supplied,
    # otherwise from the declared default; missing required inputs fail.
    for e, i in enumerate(self.tool["inputs"]):
        with SourceLine(self.tool["inputs"], e, WorkflowException):
            iid = shortname(i["id"])
            if iid in joborder:
                self.state[i["id"]] = WorkflowStateItem(
                    i, copy.deepcopy(joborder[iid]), "success")
            elif "default" in i:
                self.state[i["id"]] = WorkflowStateItem(
                    i, copy.deepcopy(i["default"]), "success")
            else:
                raise WorkflowException(
                    u"Input '%s' not in input object and does not have a default value."
                    % (i["id"]))

    # Step outputs start out unresolved.
    for s in self.steps:
        for out in s.tool["outputs"]:
            self.state[out["id"]] = None

    completed = 0
    while completed < len(self.steps):
        self.made_progress = False

        for step in self.steps:
            # With on_error=stop (the default), abort scheduling as soon
            # as any step has failed.
            if kwargs.get("on_error",
                          "stop") == "stop" and self.processStatus != "success":
                break

            if not step.submitted:
                try:
                    step.iterable = self.try_make_job(step, **kwargs)
                except WorkflowException as e:
                    _logger.error(u"[%s] Cannot make job: %s", step.name, e)
                    _logger.debug("", exc_info=True)
                    self.processStatus = "permanentFail"

            if step.iterable:
                try:
                    for newjob in step.iterable:
                        if kwargs.get(
                                "on_error", "stop"
                        ) == "stop" and self.processStatus != "success":
                            break
                        if newjob:
                            self.made_progress = True
                            yield newjob
                        else:
                            # Step is blocked on unresolved inputs; retry
                            # on a later pass.
                            break
                except WorkflowException as e:
                    _logger.error(u"[%s] Cannot make job: %s", step.name, e)
                    _logger.debug("", exc_info=True)
                    self.processStatus = "permanentFail"

        completed = sum(1 for s in self.steps if s.completed)

        if not self.made_progress and completed < len(self.steps):
            if self.processStatus != "success":
                break
            else:
                # Nothing runnable right now; yield None so the caller can
                # wait for outstanding jobs to finish.
                yield None

    supportsMultipleInput = bool(
        self.workflow.get_requirement("MultipleInputFeatureRequirement")[0])

    # Gather final outputs; incomplete=True permits partial results after
    # a failure.
    try:
        wo = object_from_state(self.state, self.tool["outputs"], True,
                               supportsMultipleInput, "outputSource",
                               incomplete=True)
    except WorkflowException as e:
        _logger.error(u"[%s] Cannot collect workflow output: %s", self.name, e)
        wo = {}
        self.processStatus = "permanentFail"

    _logger.info(u"[%s] outdir is %s", self.name, self.outdir)

    output_callback(wo, self.processStatus)