def test_pack_input_named_name() -> None:
    loadingContext, workflowobj, uri = fetch_document(
        get_data("tests/wf/trick_revsort.cwl")
    )
    loadingContext.do_update = False
    loadingContext, uri = resolve_and_validate_document(
        loadingContext, workflowobj, uri
    )
    loader = loadingContext.loader
    assert loader
    loader.resolve_ref(uri)[0]

    with open(get_data("tests/wf/expect_trick_packed.cwl")) as packed_file:
        expect_packed = yaml.main.round_trip_load(packed_file)

    packed = cwltool.pack.pack(loadingContext, uri)
    adjustFileObjs(
        packed, partial(make_relative, os.path.abspath(get_data("tests/wf")))
    )
    adjustDirObjs(packed, partial(make_relative, os.path.abspath(get_data("tests/wf"))))

    assert "$schemas" in packed
    assert len(packed["$schemas"]) == len(expect_packed["$schemas"])
    del packed["$schemas"]
    del expect_packed["$schemas"]

    assert packed == expect_packed
def test_packing(unpacked: str, expected: str) -> None:
    """Compare packed output against the expected result for various workflows and --pack."""
    loadingContext, workflowobj, uri = fetch_document(get_data(unpacked))
    loadingContext.do_update = False
    loadingContext, uri = resolve_and_validate_document(
        loadingContext, workflowobj, uri
    )

    packed = json.loads(print_pack(loadingContext, uri))

    context_dir = os.path.abspath(os.path.dirname(get_data(unpacked)))
    adjustFileObjs(packed, partial(make_relative, context_dir))
    adjustDirObjs(packed, partial(make_relative, context_dir))

    with open(get_data(expected)) as packed_file:
        expect_packed = json.load(packed_file)

    if "$schemas" in expect_packed:
        assert "$schemas" in packed
        packed_schemas = packed["$schemas"]
        assert isinstance(packed_schemas, Sized)
        assert len(packed_schemas) == len(expect_packed["$schemas"])
        del packed["$schemas"]
        del expect_packed["$schemas"]

    assert packed == expect_packed
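# test_packing above takes (unpacked, expected) as arguments; in a pytest
# suite that pairing is normally supplied by a parametrize decorator. A
# minimal sketch of that wiring, with a single assumed path pair rather than
# the full upstream parameter list:
#
# @pytest.mark.parametrize(
#     "unpacked,expected",
#     [("tests/wf/revsort.cwl", "tests/wf/expect_packed.cwl")],
# )
# def test_packing(unpacked: str, expected: str) -> None:
#     ...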
def done(self, record):
    """Base method for handling a completed runner."""
    try:
        if record["state"] == "Complete":
            if record.get("exit_code") is not None:
                if record["exit_code"] == 33:
                    processStatus = "UnsupportedRequirement"
                elif record["exit_code"] == 0:
                    processStatus = "success"
                else:
                    processStatus = "permanentFail"
            else:
                processStatus = "success"
        else:
            processStatus = "permanentFail"

        outputs = {}

        if processStatus == "permanentFail":
            logc = arvados.collection.CollectionReader(record["log"],
                                                       api_client=self.arvrunner.api,
                                                       keep_client=self.arvrunner.keep_client,
                                                       num_retries=self.arvrunner.num_retries)
            done.logtail(logc, logger.error, "%s (%s) error log:" % (self.arvrunner.label(self), record["uuid"]), maxlen=40)

        self.final_output = record["output"]
        outc = arvados.collection.CollectionReader(self.final_output,
                                                   api_client=self.arvrunner.api,
                                                   keep_client=self.arvrunner.keep_client,
                                                   num_retries=self.arvrunner.num_retries)
        if "cwl.output.json" in outc:
            with outc.open("cwl.output.json", "rb") as f:
                if f.size() > 0:
                    outputs = json.loads(f.read().decode())

        def keepify(fileobj):
            path = fileobj["location"]
            if not path.startswith("keep:"):
                fileobj["location"] = "keep:%s/%s" % (record["output"], path)

        adjustFileObjs(outputs, keepify)
        adjustDirObjs(outputs, keepify)
    except Exception:
        logger.exception("[%s] While getting final output object", self.name)
        self.arvrunner.output_callback({}, "permanentFail")
    else:
        self.arvrunner.output_callback(outputs, processStatus)
def upload_workflow(arvRunner, tool, job_order, project_uuid, uuid=None,
                    submit_runner_ram=0, name=None, merged_map=None,
                    submit_runner_image=None):

    packed = packed_workflow(arvRunner, tool, merged_map)

    adjustDirObjs(job_order, trim_listing)
    adjustFileObjs(job_order, trim_anonymous_location)
    adjustDirObjs(job_order, trim_anonymous_location)

    main = [p for p in packed["$graph"] if p["id"] == "#main"][0]
    for inp in main["inputs"]:
        sn = shortname(inp["id"])
        if sn in job_order:
            inp["default"] = job_order[sn]

    if not name:
        name = tool.tool.get("label", os.path.basename(tool.tool["id"]))

    upload_dependencies(arvRunner, name, tool.doc_loader,
                        packed, tool.tool["id"], False)

    wf_runner_resources = None

    hints = main.get("hints", [])
    found = False
    for h in hints:
        if h["class"] == "http://arvados.org/cwl#WorkflowRunnerResources":
            wf_runner_resources = h
            found = True
            break
    if not found:
        wf_runner_resources = {"class": "http://arvados.org/cwl#WorkflowRunnerResources"}
        hints.append(wf_runner_resources)

    wf_runner_resources["acrContainerImage"] = arvados_jobs_image(
        arvRunner, submit_runner_image or "arvados/jobs:" + __version__)

    if submit_runner_ram:
        wf_runner_resources["ramMin"] = submit_runner_ram

    main["hints"] = hints

    body = {
        "workflow": {
            "name": name,
            "description": tool.tool.get("doc", ""),
            "definition": json.dumps(packed, sort_keys=True, indent=4, separators=(',', ': '))
        }}
    if project_uuid:
        body["workflow"]["owner_uuid"] = project_uuid

    if uuid:
        call = arvRunner.api.workflows().update(uuid=uuid, body=body)
    else:
        call = arvRunner.api.workflows().create(body=body)
    return call.execute(num_retries=arvRunner.num_retries)["uuid"]
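# upload_workflow above returns the UUID of the created or updated Arvados
# workflow record. An illustrative call, with argument values assumed for the
# sake of the example:
#
# wf_uuid = upload_workflow(arvRunner, tool, job_order, project_uuid,
#                           submit_runner_ram=1024, name="my-workflow",
#                           merged_map=merged_map)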
def test_pack_fragment() -> None:
    with open(get_data("tests/wf/scatter2_subwf.cwl")) as packed_file:
        expect_packed = yaml.main.safe_load(packed_file)

    loadingContext, workflowobj, uri = fetch_document(get_data("tests/wf/scatter2.cwl"))
    packed = cwltool.pack.pack(loadingContext, uri + "#scatterstep/mysub")
    adjustFileObjs(
        packed, partial(make_relative, os.path.abspath(get_data("tests/wf")))
    )
    adjustDirObjs(packed, partial(make_relative, os.path.abspath(get_data("tests/wf"))))

    assert json.dumps(packed, sort_keys=True, indent=2) == json.dumps(
        expect_packed, sort_keys=True, indent=2
    )
def test_pack() -> None:
    loadingContext, workflowobj, uri = fetch_document(get_data("tests/wf/revsort.cwl"))

    with open(get_data("tests/wf/expect_packed.cwl")) as packed_file:
        expect_packed = yaml.main.safe_load(packed_file)

    packed = cwltool.pack.pack(loadingContext, uri)
    adjustFileObjs(
        packed, partial(make_relative, os.path.abspath(get_data("tests/wf")))
    )
    adjustDirObjs(packed, partial(make_relative, os.path.abspath(get_data("tests/wf"))))

    assert "$schemas" in packed
    assert len(packed["$schemas"]) == len(expect_packed["$schemas"])
    del packed["$schemas"]
    del expect_packed["$schemas"]

    assert packed == expect_packed
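# The packing tests above relativize absolute file:// locations so the packed
# document can be compared against the checked-in expectation files. A rough
# sketch of the behaviour they assume from make_relative (the real helper
# lives in cwltool; uri_to_path below is a stand-in for a URI-to-path
# conversion, not an actual cwltool function):
#
# def make_relative(base: str, obj: Dict[str, Any]) -> None:
#     uri = obj.get("location", obj.get("path"))
#     if uri and uri.startswith("file://"):
#         obj["location"] = os.path.relpath(uri_to_path(uri), base)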
def job(self, joborder, output_callback, runtimeContext):

    builder = make_builder(joborder, self.hints, self.requirements, runtimeContext, self.metadata)
    runtimeContext = set_cluster_target(self.tool, self.arvrunner, builder, runtimeContext)

    req, _ = self.get_requirement("http://arvados.org/cwl#RunInSingleContainer")
    if not req:
        return super(ArvadosWorkflow, self).job(joborder, output_callback, runtimeContext)

    # RunInSingleContainer is true

    with SourceLine(self.tool, None, WorkflowException, logger.isEnabledFor(logging.DEBUG)):
        if "id" not in self.tool:
            raise WorkflowException("%s object must have 'id'" % (self.tool["class"]))

    discover_secondary_files(self.arvrunner.fs_access, builder, self.tool["inputs"], joborder)

    with Perf(metrics, "subworkflow upload_deps"):
        upload_dependencies(self.arvrunner,
                            os.path.basename(joborder.get("id", "#")),
                            self.doc_loader,
                            joborder,
                            joborder.get("id", "#"),
                            False)

        if self.wf_pdh is None:
            packed = pack(self.loadingContext, self.tool["id"], loader=self.doc_loader)

            for p in packed["$graph"]:
                if p["id"] == "#main":
                    p["requirements"] = dedup_reqs(self.requirements)
                    p["hints"] = dedup_reqs(self.hints)

            def visit(item):
                if "requirements" in item:
                    item["requirements"] = [i for i in item["requirements"] if i["class"] != "DockerRequirement"]
                for t in ("hints", "requirements"):
                    if t not in item:
                        continue
                    for req in item[t]:
                        if req["class"] == "ResourceRequirement":
                            dyn = False
                            for k in max_res_pars + sum_res_pars:
                                if k in req:
                                    if isinstance(req[k], basestring):
                                        if item["id"] == "#main":
                                            # only the top-level requirements/hints may contain expressions
                                            self.dynamic_resource_req.append(req)
                                            dyn = True
                                            break
                                        else:
                                            with SourceLine(req, k, WorkflowException):
                                                raise WorkflowException("Non-top-level ResourceRequirement in single container cannot have expressions")
                            if not dyn:
                                self.static_resource_req.append(req)

            visit_class(packed["$graph"], ("Workflow", "CommandLineTool"), visit)

            if self.static_resource_req:
                self.static_resource_req = [get_overall_res_req(self.static_resource_req)]

            upload_dependencies(self.arvrunner,
                                runtimeContext.name,
                                self.doc_loader,
                                packed,
                                self.tool["id"],
                                False)

            # Discover files/directories referenced by the
            # workflow (mainly "default" values)
            visit_class(packed, ("File", "Directory"), self.wf_reffiles.append)

    if self.dynamic_resource_req:
        # Evaluate dynamic resource requirements using current builder
        rs = copy.copy(self.static_resource_req)
        for dyn_rs in self.dynamic_resource_req:
            eval_req = {"class": "ResourceRequirement"}
            for a in max_res_pars + sum_res_pars:
                if a in dyn_rs:
                    eval_req[a] = builder.do_eval(dyn_rs[a])
            rs.append(eval_req)
        job_res_reqs = [get_overall_res_req(rs)]
    else:
        job_res_reqs = self.static_resource_req

    with Perf(metrics, "subworkflow adjust"):
        joborder_resolved = copy.deepcopy(joborder)
        joborder_keepmount = copy.deepcopy(joborder)

        reffiles = []
        visit_class(joborder_keepmount, ("File", "Directory"), reffiles.append)

        mapper = ArvPathMapper(self.arvrunner, reffiles + self.wf_reffiles, runtimeContext.basedir,
                               "/keep/%s",
                               "/keep/%s/%s")

        # For containers API, we need to make sure any extra
        # referenced files (ie referenced by the workflow but
        # not in the inputs) are included in the mounts.
        if self.wf_reffiles:
            runtimeContext = runtimeContext.copy()
            runtimeContext.extra_reffiles = copy.deepcopy(self.wf_reffiles)

        def keepmount(obj):
            remove_redundant_fields(obj)
            with SourceLine(obj, None, WorkflowException, logger.isEnabledFor(logging.DEBUG)):
                if "location" not in obj:
                    raise WorkflowException("%s object is missing required 'location' field: %s" % (obj["class"], obj))
            with SourceLine(obj, "location", WorkflowException, logger.isEnabledFor(logging.DEBUG)):
                if obj["location"].startswith("keep:"):
                    obj["location"] = mapper.mapper(obj["location"]).target
                    if "listing" in obj:
                        del obj["listing"]
                elif obj["location"].startswith("_:"):
                    del obj["location"]
                else:
                    raise WorkflowException("Location is not a keep reference or a literal: '%s'" % obj["location"])

        visit_class(joborder_keepmount, ("File", "Directory"), keepmount)

        def resolved(obj):
            if obj["location"].startswith("keep:"):
                obj["location"] = mapper.mapper(obj["location"]).resolved

        visit_class(joborder_resolved, ("File", "Directory"), resolved)

        if self.wf_pdh is None:
            adjustFileObjs(packed, keepmount)
            adjustDirObjs(packed, keepmount)
            self.wf_pdh = upload_workflow_collection(self.arvrunner, shortname(self.tool["id"]), packed)

    self.loadingContext = self.loadingContext.copy()
    self.loadingContext.metadata = self.loadingContext.metadata.copy()
    self.loadingContext.metadata["http://commonwl.org/cwltool#original_cwlVersion"] = "v1.0"

    if len(job_res_reqs) == 1:
        # RAM request needs to be at least 128 MiB or the workflow
        # runner itself won't run reliably.
        if job_res_reqs[0].get("ramMin", 1024) < 128:
            job_res_reqs[0]["ramMin"] = 128

    arguments = ["--no-container", "--move-outputs", "--preserve-entire-environment", "workflow.cwl", "cwl.input.yml"]
    if runtimeContext.debug:
        arguments.insert(0, '--debug')

    wf_runner = cmap({
        "class": "CommandLineTool",
        "baseCommand": "cwltool",
        "inputs": self.tool["inputs"],
        "outputs": self.tool["outputs"],
        "stdout": "cwl.output.json",
        "requirements": self.requirements + job_res_reqs + [
            {"class": "InlineJavascriptRequirement"},
            {
                "class": "InitialWorkDirRequirement",
                "listing": [{
                        "entryname": "workflow.cwl",
                        "entry": '$({"class": "File", "location": "keep:%s/workflow.cwl"})' % self.wf_pdh
                    }, {
                        "entryname": "cwl.input.yml",
                        # Backslash-escape CWL expression markers so cwltool does not
                        # evaluate them when it loads cwl.input.yml.
                        "entry": json.dumps(joborder_keepmount, indent=2, sort_keys=True,
                                            separators=(',', ': ')).replace("\\", "\\\\").replace('$(', '\\$(').replace('${', '\\${')
                    }]
            }],
        "hints": self.hints,
        "arguments": arguments,
        "id": "#"
    })
    return ArvadosCommandTool(self.arvrunner, wf_runner, self.loadingContext).job(joborder_resolved, output_callback, runtimeContext)
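# The wf_runner document constructed in job() above wraps the packed
# sub-workflow in a single CommandLineTool: InitialWorkDirRequirement stages
# workflow.cwl (from the uploaded keep collection) and cwl.input.yml, the tool
# runs `cwltool --no-container --move-outputs --preserve-entire-environment
# workflow.cwl cwl.input.yml`, and stdout is captured as cwl.output.json, so
# the whole sub-workflow executes inside one container.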
def arv_executor(self, updated_tool, job_order, runtimeContext, logger=None):
    self.debug = runtimeContext.debug

    updated_tool.visit(self.check_features)

    self.project_uuid = runtimeContext.project_uuid
    self.pipeline = None
    self.fs_access = runtimeContext.make_fs_access(runtimeContext.basedir)
    self.secret_store = runtimeContext.secret_store

    self.trash_intermediate = runtimeContext.trash_intermediate
    if self.trash_intermediate and self.work_api != "containers":
        raise Exception("--trash-intermediate is only supported with --api=containers.")

    self.intermediate_output_ttl = runtimeContext.intermediate_output_ttl
    if self.intermediate_output_ttl and self.work_api != "containers":
        raise Exception("--intermediate-output-ttl is only supported with --api=containers.")
    if self.intermediate_output_ttl < 0:
        raise Exception("Invalid value %d for --intermediate-output-ttl, cannot be less than zero" % self.intermediate_output_ttl)

    if runtimeContext.submit_request_uuid and self.work_api != "containers":
        raise Exception("--submit-request-uuid requires containers API, but using '{}' api".format(self.work_api))

    if not runtimeContext.name:
        runtimeContext.name = self.name = updated_tool.tool.get("label") or updated_tool.metadata.get("label") or os.path.basename(updated_tool.tool["id"])

    # Upload local file references in the job order.
    job_order = upload_job_order(self, "%s input" % runtimeContext.name,
                                 updated_tool, job_order)

    # the last clause means: if it is a command line tool, and we
    # are going to wait for the result, and always_submit_runner
    # is false, then we don't submit a runner process.
    submitting = (runtimeContext.update_workflow or
                  runtimeContext.create_workflow or
                  (runtimeContext.submit and not
                   (updated_tool.tool["class"] == "CommandLineTool" and
                    runtimeContext.wait and
                    not runtimeContext.always_submit_runner)))

    loadingContext = self.loadingContext.copy()
    loadingContext.do_validate = False
    loadingContext.do_update = False
    if submitting:
        # Document may have been auto-updated. Reload the original
        # document with updating disabled because we want to
        # submit the document with its original CWL version, not
        # the auto-updated one.
        tool = load_tool(updated_tool.tool["id"], loadingContext)
    else:
        tool = updated_tool

    # Upload direct dependencies of workflow steps, get back mapping of files to keep references.
    # Also uploads docker images.
    merged_map = upload_workflow_deps(self, tool)

    # Recreate process object (ArvadosWorkflow or
    # ArvadosCommandTool) because tool document may have been
    # updated by upload_workflow_deps in ways that modify
    # inheritance of hints or requirements.
    loadingContext.loader = tool.doc_loader
    loadingContext.avsc_names = tool.doc_schema
    loadingContext.metadata = tool.metadata
    tool = load_tool(tool.tool, loadingContext)

    existing_uuid = runtimeContext.update_workflow
    if existing_uuid or runtimeContext.create_workflow:
        # Create a pipeline template or workflow record and exit.
        if self.work_api == "containers":
            return (upload_workflow(self, tool, job_order,
                                    self.project_uuid,
                                    uuid=existing_uuid,
                                    submit_runner_ram=runtimeContext.submit_runner_ram,
                                    name=runtimeContext.name,
                                    merged_map=merged_map),
                    "success")

    self.apply_reqs(job_order, tool)

    self.ignore_docker_for_reuse = runtimeContext.ignore_docker_for_reuse
    self.eval_timeout = runtimeContext.eval_timeout

    runtimeContext = runtimeContext.copy()
    runtimeContext.use_container = True
    runtimeContext.tmpdir_prefix = "tmp"
    runtimeContext.work_api = self.work_api

    if self.work_api == "containers":
        if self.ignore_docker_for_reuse:
            raise Exception("--ignore-docker-for-reuse not supported with containers API.")
        runtimeContext.outdir = "/var/spool/cwl"
        runtimeContext.docker_outdir = "/var/spool/cwl"
        runtimeContext.tmpdir = "/tmp"
        runtimeContext.docker_tmpdir = "/tmp"

    if runtimeContext.priority < 1 or runtimeContext.priority > 1000:
        raise Exception("--priority must be in the range 1..1000.")

    if self.should_estimate_cache_size:
        visited = set()
        estimated_size = [0]

        def estimate_collection_cache(obj):
            if obj.get("location", "").startswith("keep:"):
                m = pdh_size.match(obj["location"][5:])
                if m and m.group(1) not in visited:
                    visited.add(m.group(1))
                    estimated_size[0] += int(m.group(2))

        visit_class(job_order, ("File", "Directory"), estimate_collection_cache)
        runtimeContext.collection_cache_size = max(((estimated_size[0] * 192) // (1024 * 1024)) + 1, 256)
        self.collection_cache.set_cap(runtimeContext.collection_cache_size * 1024 * 1024)

    logger.info("Using collection cache size %s MiB", runtimeContext.collection_cache_size)

    runnerjob = None
    if runtimeContext.submit:
        # Submit a runner job to run the workflow for us.
        if self.work_api == "containers":
            if submitting:
                tool = RunnerContainer(self, updated_tool,
                                       tool, loadingContext, runtimeContext.enable_reuse,
                                       self.output_name,
                                       self.output_tags,
                                       submit_runner_ram=runtimeContext.submit_runner_ram,
                                       name=runtimeContext.name,
                                       on_error=runtimeContext.on_error,
                                       submit_runner_image=runtimeContext.submit_runner_image,
                                       intermediate_output_ttl=runtimeContext.intermediate_output_ttl,
                                       merged_map=merged_map,
                                       priority=runtimeContext.priority,
                                       secret_store=self.secret_store,
                                       collection_cache_size=runtimeContext.collection_cache_size,
                                       collection_cache_is_default=self.should_estimate_cache_size)
            else:
                runtimeContext.runnerjob = tool.tool["id"]

    if runtimeContext.cwl_runner_job is not None:
        self.uuid = runtimeContext.cwl_runner_job.get('uuid')

    jobiter = tool.job(job_order,
                       self.output_callback,
                       runtimeContext)

    if runtimeContext.submit and not runtimeContext.wait:
        runnerjob = next(jobiter)
        runnerjob.run(runtimeContext)
        return (runnerjob.uuid, "success")

    current_container = arvados_cwl.util.get_current_container(self.api, self.num_retries, logger)
    if current_container:
        logger.info("Running inside container %s", current_container.get("uuid"))

    self.poll_api = arvados.api('v1', timeout=runtimeContext.http_timeout)
    self.polling_thread = threading.Thread(target=self.poll_states)
    self.polling_thread.start()

    self.task_queue = TaskQueue(self.workflow_eval_lock, self.thread_count)

    try:
        self.workflow_eval_lock.acquire()

        # Holds the lock while this code runs and releases it when
        # it is safe to do so in self.workflow_eval_lock.wait(),
        # at which point on_message can update job state and
        # process output callbacks.

        loopperf = Perf(metrics, "jobiter")
        loopperf.__enter__()
        for runnable in jobiter:
            loopperf.__exit__()

            if self.stop_polling.is_set():
                break

            if self.task_queue.error is not None:
                raise self.task_queue.error

            if runnable:
                with Perf(metrics, "run"):
                    self.start_run(runnable, runtimeContext)
            else:
                if (self.task_queue.in_flight + len(self.processes)) > 0:
                    self.workflow_eval_lock.wait(3)
                else:
                    logger.error("Workflow is deadlocked, no runnable processes and not waiting on any pending processes.")
                    break

            if self.stop_polling.is_set():
                break

            loopperf.__enter__()
        loopperf.__exit__()

        while (self.task_queue.in_flight + len(self.processes)) > 0:
            if self.task_queue.error is not None:
                raise self.task_queue.error
            self.workflow_eval_lock.wait(3)

    except UnsupportedRequirement:
        raise
    except:
        if sys.exc_info()[0] is KeyboardInterrupt or sys.exc_info()[0] is SystemExit:
            logger.error("Interrupted, workflow will be cancelled")
        elif isinstance(sys.exc_info()[1], WorkflowException):
            logger.error("Workflow execution failed:\n%s", sys.exc_info()[1],
                         exc_info=(sys.exc_info()[1] if self.debug else False))
        else:
            logger.exception("Workflow execution failed")

        if self.pipeline:
            self.api.pipeline_instances().update(uuid=self.pipeline["uuid"],
                                                 body={"state": "Failed"}).execute(num_retries=self.num_retries)

        if self.work_api == "containers" and not current_container:
            # Not running in a crunch container, so cancel any outstanding processes.
            for p in self.processes:
                try:
                    self.api.container_requests().update(uuid=p,
                                                         body={"priority": "0"}).execute(num_retries=self.num_retries)
                except Exception:
                    pass
    finally:
        self.workflow_eval_lock.release()
        self.task_queue.drain()
        self.stop_polling.set()
        self.polling_thread.join()
        self.task_queue.join()

    if self.final_status == "UnsupportedRequirement":
        raise UnsupportedRequirement("Check log for details.")

    if self.final_output is None:
        raise WorkflowException("Workflow did not return a result.")

    if runtimeContext.submit and isinstance(tool, Runner):
        logger.info("Final output collection %s", tool.final_output)
    else:
        if self.output_name is None:
            self.output_name = "Output of %s" % (shortname(tool.tool["id"]))
        if self.output_tags is None:
            self.output_tags = ""

        storage_classes = runtimeContext.storage_classes.strip().split(",")
        self.final_output, self.final_output_collection = self.make_output_collection(
            self.output_name, storage_classes, self.output_tags, self.final_output)
        self.set_crunch_output()

    if runtimeContext.compute_checksum:
        adjustDirObjs(self.final_output, partial(get_listing, self.fs_access))
        adjustFileObjs(self.final_output, partial(compute_checksums, self.fs_access))

    if self.trash_intermediate and self.final_status == "success":
        self.trash_intermediate_output()

    return (self.final_output, self.final_status)
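# Note on the collection cache sizing in arv_executor above: estimated_size
# sums the "+<bytes>" suffix of each referenced portable data hash, which is
# the size of the collection manifest text rather than of the data itself.
# The cap is then manifest bytes * 192 converted to MiB, with a floor of
# 256 MiB. Illustrative arithmetic (numbers assumed, not from the source):
# manifests totalling 2 MiB give ((2 * 1024 * 1024 * 192) // (1024 * 1024)) + 1
# = 385, so the cache cap would be set to 385 MiB.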
def make_output_collection(self, name, storage_classes, tagsString, outputObj):
    outputObj = copy.deepcopy(outputObj)

    files = []

    def capture(fileobj):
        files.append(fileobj)

    adjustDirObjs(outputObj, capture)
    adjustFileObjs(outputObj, capture)

    generatemapper = NoFollowPathMapper(files, "", "", separateDirs=False)

    final = arvados.collection.Collection(api_client=self.api,
                                          keep_client=self.keep_client,
                                          num_retries=self.num_retries)

    for k, v in generatemapper.items():
        if v.type == "Directory" and v.resolved.startswith("_:"):
            continue
        if v.type == "CreateFile" and (k.startswith("_:") or v.resolved.startswith("_:")):
            with final.open(v.target, "wb") as f:
                f.write(v.resolved.encode("utf-8"))
            continue

        if not v.resolved.startswith("keep:"):
            raise Exception("Output source is not in keep or a literal")
        sp = v.resolved.split("/")
        srccollection = sp[0][5:]
        try:
            reader = self.collection_cache.get(srccollection)
            srcpath = "/".join(sp[1:]) if len(sp) > 1 else "."
            final.copy(srcpath, v.target, source_collection=reader, overwrite=False)
        except arvados.errors.ArgumentError as e:
            logger.error("Creating CollectionReader for '%s' '%s': %s", k, v, e)
            raise
        except IOError as e:
            logger.error("While preparing output collection: %s", e)
            raise

    def rewrite(fileobj):
        fileobj["location"] = generatemapper.mapper(fileobj["location"]).target
        for k in ("listing", "contents", "nameext", "nameroot", "dirname"):
            if k in fileobj:
                del fileobj[k]

    adjustDirObjs(outputObj, rewrite)
    adjustFileObjs(outputObj, rewrite)

    with final.open("cwl.output.json", "w") as f:
        res = str(json.dumps(outputObj, sort_keys=True, indent=4,
                             separators=(',', ': '), ensure_ascii=False))
        f.write(res)

    final.save_new(name=name, owner_uuid=self.project_uuid,
                   storage_classes=storage_classes, ensure_unique_name=True)

    logger.info("Final output collection %s \"%s\" (%s)", final.portable_data_hash(),
                final.api_response()["name"],
                final.manifest_locator())

    final_uuid = final.manifest_locator()
    tags = tagsString.split(',')
    for tag in tags:
        self.api.links().create(body={
            "head_uuid": final_uuid, "link_class": "tag", "name": tag
        }).execute(num_retries=self.num_retries)

    def finalcollection(fileobj):
        fileobj["location"] = "keep:%s/%s" % (final.portable_data_hash(), fileobj["location"])

    adjustDirObjs(outputObj, finalcollection)
    adjustFileObjs(outputObj, finalcollection)

    return (outputObj, final)