def arvExecutor(self, tool, job_order, **kwargs):
    self.debug = kwargs.get("debug")

    tool.visit(self.check_writable)

    if kwargs.get("quiet"):
        logger.setLevel(logging.WARN)
        logging.getLogger('arvados.arv-run').setLevel(logging.WARN)

    useruuid = self.api.users().current().execute()["uuid"]
    self.project_uuid = kwargs.get("project_uuid") if kwargs.get("project_uuid") else useruuid
    self.pipeline = None
    make_fs_access = kwargs.get("make_fs_access") or partial(
        CollectionFsAccess, api_client=self.api)
    self.fs_access = make_fs_access(kwargs["basedir"])

    if kwargs.get("create_template"):
        tmpl = RunnerTemplate(self, tool, job_order, kwargs.get("enable_reuse"))
        tmpl.save()
        # cwltool.main will write our return value to stdout.
        return tmpl.uuid

    self.debug = kwargs.get("debug")
    self.ignore_docker_for_reuse = kwargs.get("ignore_docker_for_reuse")

    kwargs["make_fs_access"] = make_fs_access
    kwargs["enable_reuse"] = kwargs.get("enable_reuse")
    kwargs["use_container"] = True
    kwargs["tmpdir_prefix"] = "tmp"
    kwargs["on_error"] = "continue"
    kwargs["compute_checksum"] = kwargs.get("compute_checksum")

    if self.work_api == "containers":
        kwargs["outdir"] = "/var/spool/cwl"
        kwargs["docker_outdir"] = "/var/spool/cwl"
        kwargs["tmpdir"] = "/tmp"
        kwargs["docker_tmpdir"] = "/tmp"
    elif self.work_api == "jobs":
        kwargs["outdir"] = "$(task.outdir)"
        kwargs["docker_outdir"] = "$(task.outdir)"
        kwargs["tmpdir"] = "$(task.tmpdir)"

    runnerjob = None
    if kwargs.get("submit"):
        if self.work_api == "containers":
            if tool.tool["class"] == "CommandLineTool":
                runnerjob = tool.job(job_order,
                                     self.output_callback,
                                     **kwargs).next()
            else:
                runnerjob = RunnerContainer(self, tool, job_order, kwargs.get("enable_reuse"))
        else:
            runnerjob = RunnerJob(self, tool, job_order, kwargs.get("enable_reuse"))

    if not kwargs.get("submit") and "cwl_runner_job" not in kwargs and not self.work_api == "containers":
        # Create pipeline for local run
        self.pipeline = self.api.pipeline_instances().create(
            body={
                "owner_uuid": self.project_uuid,
                "name": shortname(tool.tool["id"]),
                "components": {},
                "state": "RunningOnClient"}).execute(num_retries=self.num_retries)
        logger.info("Pipeline instance %s", self.pipeline["uuid"])

    if runnerjob and not kwargs.get("wait"):
        runnerjob.run()
        return runnerjob.uuid

    self.poll_api = arvados.api('v1')
    self.polling_thread = threading.Thread(target=self.poll_states)
    self.polling_thread.start()

    if runnerjob:
        jobiter = iter((runnerjob,))
    else:
        if "cwl_runner_job" in kwargs:
            self.uuid = kwargs.get("cwl_runner_job").get('uuid')
        jobiter = tool.job(job_order,
                           self.output_callback,
                           **kwargs)

    try:
        self.cond.acquire()
        # Will continue to hold the lock for the duration of this code
        # except when in cond.wait(), at which point on_message can update
        # job state and process output callbacks.

        for runnable in jobiter:
            if runnable:
                runnable.run(**kwargs)
            else:
                if self.processes:
                    self.cond.wait(1)
                else:
                    logger.error("Workflow is deadlocked, no runnable jobs and not waiting on any pending jobs.")
                    break

        while self.processes:
            self.cond.wait(1)

    except UnsupportedRequirement:
        raise
    except:
        if sys.exc_info()[0] is KeyboardInterrupt:
            logger.error("Interrupted, marking pipeline as failed")
        else:
            logger.error("Caught unhandled exception, marking pipeline as failed. Error was: %s",
                         sys.exc_info()[1],
                         exc_info=(sys.exc_info()[1] if self.debug else False))
        if self.pipeline:
            self.api.pipeline_instances().update(
                uuid=self.pipeline["uuid"],
                body={"state": "Failed"}).execute(num_retries=self.num_retries)
        if runnerjob and runnerjob.uuid and self.work_api == "containers":
            self.api.container_requests().update(
                uuid=runnerjob.uuid,
                body={"priority": "0"}).execute(num_retries=self.num_retries)
    finally:
        self.cond.release()
        self.stop_polling.set()
        self.polling_thread.join()

    if self.final_status == "UnsupportedRequirement":
        raise UnsupportedRequirement("Check log for details.")

    if self.final_status != "success":
        raise WorkflowException("Workflow failed.")

    if self.final_output is None:
        raise WorkflowException("Workflow did not return a result.")

    if kwargs.get("compute_checksum"):
        adjustFileObjs(self.final_output, partial(compute_checksums, self.fs_access))

    return self.final_output
def run(self,
        runtimeContext,    # type: RuntimeContext
        tmpdir_lock=None   # type: Optional[threading.Lock]
        ):  # type: (...) -> None
    log.debug(
        "[job %s] self.__dict__ in run() ----------------------",
        self.name
    )
    log.debug(pformat(self.__dict__))

    if not self.successCodes:
        self.successCodes = [0]

    task = self.create_task_msg()

    log.info(
        "[job %s] CREATED TASK MSG----------------------",
        self.name
    )
    log.info(pformat(task))

    try:
        self.id = self.client.create_task(task)
        log.info(
            "[job %s] SUBMITTED TASK ----------------------",
            self.name
        )
        log.info("[job %s] task id: %s ", self.name, self.id)
    except Exception as e:
        log.error(
            "[job %s] Failed to submit task to TES service:\n%s",
            self.name, e
        )
        raise WorkflowException(e)

    max_tries = 10
    current_try = 1
    self.exit_code = None
    while not self.is_done():
        delay = 1.5 * current_try**2
        time.sleep(
            random.randint(
                round(delay - 0.5 * delay),
                round(delay + 0.5 * delay)))
        try:
            task = self.client.get_task(self.id, "MINIMAL")
            self.state = task.state
            log.debug(
                "[job %s] POLLING %s, result: %s", self.name,
                pformat(self.id), task.state
            )
        except Exception as e:
            log.error("[job %s] POLLING ERROR %s", self.name, e)
            if current_try <= max_tries:
                current_try += 1
                continue
            else:
                log.error("[job %s] MAX POLLING RETRIES EXCEEDED", self.name)
                break

    try:
        process_status = None
        if self.state != "COMPLETE" \
                and self.exit_code not in self.successCodes:
            process_status = "permanentFail"
            log.error("[job %s] job error:\n%s", self.name, self.state)

        remote_cwl_output_json = False
        if self.remote_storage_url:
            remote_fs_access = runtimeContext.make_fs_access(
                self.remote_storage_url)
            remote_cwl_output_json = remote_fs_access.exists(
                remote_fs_access.join(
                    self.remote_storage_url, "cwl.output.json"))

        if self.remote_storage_url:
            original_outdir = self.builder.outdir
            if not remote_cwl_output_json:
                self.builder.outdir = self.remote_storage_url
            outputs = self.collect_outputs(self.remote_storage_url,
                                           self.exit_code)
            self.builder.outdir = original_outdir
        else:
            outputs = self.collect_outputs(self.outdir, self.exit_code)

        cleaned_outputs = {}
        for k, v in outputs.items():
            if isinstance(k, bytes):
                k = k.decode("utf8")
            if isinstance(v, bytes):
                v = v.decode("utf8")
            cleaned_outputs[k] = v
        self.outputs = cleaned_outputs
        if not process_status:
            process_status = "success"
    except (WorkflowException, Exception) as err:
        log.error("[job %s] job error:\n%s", self.name, err)
        if log.isEnabledFor(logging.DEBUG):
            log.exception(err)
        process_status = "permanentFail"
    finally:
        if self.outputs is None:
            self.outputs = {}
        with self.runtime_context.workflow_eval_lock:
            self.output_callback(self.outputs, process_status)
        log.info(
            "[job %s] OUTPUTS ------------------",
            self.name
        )
        log.info(pformat(self.outputs))
        self.cleanup(self.runtime_context.rm_tmpdir)
    return
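The polling loop above spaces its status requests with a quadratic back-off plus roughly +/-50% jitter. A minimal sketch of that delay calculation, assuming the same `random.randint` rounding as the loop (the helper name is ours, not part of cwl-tes):

import random

def poll_delay(attempt: int) -> int:
    """Quadratic back-off with +/-50% jitter, mirroring the loop above.

    attempt=1 gives a base of 1.5 s, attempt=4 a base of 24 s; only the
    max-retry counter bounds the loop, not the delay itself.
    """
    base = 1.5 * attempt ** 2
    return random.randint(round(base - 0.5 * base), round(base + 0.5 * base))

# e.g. poll_delay(3) returns an integer between roughly 7 and 20 seconds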
def run(self, runtimeContext):  # noqa: C901
    """Run a job."""
    self._setup(runtimeContext)
    env = self.environment
    if not os.path.exists(self.tmpdir):
        os.makedirs(self.tmpdir)
    vars_to_preserve = runtimeContext.preserve_environment
    if runtimeContext.preserve_entire_environment:
        vars_to_preserve = os.environ
    if vars_to_preserve is not None:
        for key, value in os.environ.items():
            if key in vars_to_preserve and key not in env:
                env[key] = value
    env["HOME"] = self.builder.outdir
    env["TMPDIR"] = self.tmpdir
    if "PATH" not in env:
        env["PATH"] = os.environ["PATH"]
    if "SYSTEMROOT" not in env and "SYSTEMROOT" in os.environ:
        env["SYSTEMROOT"] = os.environ["SYSTEMROOT"]

    try:
        stage_files(self.pathmapper, ignore_writable=True, symlink=False)
        if getattr(self, "generatemapper", ""):
            stage_files(
                self.generatemapper,
                ignore_writable=self.inplace_update,
                symlink=False,
            )
            relink_initialworkdir(
                self.generatemapper,
                self.outdir,
                self.builder.outdir,
                inplace_update=self.inplace_update,
            )
    except OSError:
        # cwltool/process.py, line 239, in stage_files
        #   shutil.copytree(p.resolved, p.target)
        pass

    self.add_volumes(self.pathmapper)
    if getattr(self, "generatemapper", ""):
        self.add_volumes(self.generatemapper)

    # useful for debugging
    log.debug(f"[job {self.name}] self.__dict__ in run() ---------------")
    log.debug(pformat(self.__dict__))

    task = self.create_task_msg(
        runtimeContext.working_dir, runtimeContext.workflow_uuid
    )

    log.info(f"[job {self.name}] CREATED TASK MSG----------------------")
    log.info(pformat(task))

    try:
        # task_id = job_id received from job-controller
        task_id = runtimeContext.pipeline.service.submit(**task)
        task_id = str(task_id["job_id"])
        running_jobs = {"total": 1, "job_ids": [task_id]}
        runtimeContext.publisher.publish_workflow_status(
            runtimeContext.workflow_uuid,
            1,
            message={"progress": {"running": running_jobs}},
        )
        log.info(f"[job {self.name}] SUBMITTED TASK --------------------")
        log.info(f"[job {self.name}] task id: {task_id} ")
        self.task_name_map[self.name] = task_id
        operation = runtimeContext.pipeline.service.check_status(task_id)
    except Exception as e:
        log.error(
            f"[job {self.name}] "
            f"Failed to submit task to job controller:\n{e}"
        )
        raise WorkflowException(e)

    def callback(rcode):
        try:
            outputs = self.collect_outputs(self.outdir, rcode=rcode)
            cleaned_outputs = {}
            for k, v in outputs.items():
                if isinstance(k, bytes):
                    k = k.decode("utf8")
                if isinstance(v, bytes):
                    v = v.decode("utf8")
                cleaned_outputs[k] = v
            self.outputs = cleaned_outputs
            self.output_callback(self.outputs, "success")
        except WorkflowException as e:
            log.error(f"[job {self.name}] workflow job error:\n{e}")
            self.output_callback({}, "permanentFail")
        except Exception as e:
            log.error(f"[job {self.name}] job error:\n{e}")
            self.output_callback({}, "permanentFail")
        finally:
            if self.outputs is not None:
                log.info(f"[job {self.name}] OUTPUTS ------------------")
                log.info(pformat(self.outputs))
            self.cleanup(runtimeContext.rm_tmpdir)

    poll = ReanaPipelinePoll(
        workflow_uuid=runtimeContext.workflow_uuid,
        task_id=self.task_name_map.get(self.name),
        jobname=self.name,
        service=runtimeContext.pipeline.service,
        operation=operation,
        callback=callback,
        publisher=runtimeContext.publisher,
    )

    runtimeContext.pipeline.add_thread(poll)
    poll.start()
def arv_executor(self, updated_tool, job_order, runtimeContext, logger=None): self.debug = runtimeContext.debug workbench1 = self.api.config()["Services"]["Workbench1"]["ExternalURL"] workbench2 = self.api.config()["Services"]["Workbench2"]["ExternalURL"] controller = self.api.config()["Services"]["Controller"]["ExternalURL"] logger.info("Using cluster %s (%s)", self.api.config()["ClusterID"], workbench2 or workbench1 or controller) updated_tool.visit(self.check_features) self.project_uuid = runtimeContext.project_uuid self.pipeline = None self.fs_access = runtimeContext.make_fs_access(runtimeContext.basedir) self.secret_store = runtimeContext.secret_store self.trash_intermediate = runtimeContext.trash_intermediate if self.trash_intermediate and self.work_api != "containers": raise Exception( "--trash-intermediate is only supported with --api=containers." ) self.intermediate_output_ttl = runtimeContext.intermediate_output_ttl if self.intermediate_output_ttl and self.work_api != "containers": raise Exception( "--intermediate-output-ttl is only supported with --api=containers." ) if self.intermediate_output_ttl < 0: raise Exception( "Invalid value %d for --intermediate-output-ttl, cannot be less than zero" % self.intermediate_output_ttl) if runtimeContext.submit_request_uuid and self.work_api != "containers": raise Exception( "--submit-request-uuid requires containers API, but using '{}' api" .format(self.work_api)) default_storage_classes = ",".join([ k for k, v in self.api.config().get("StorageClasses", { "default": { "Default": True } }).items() if v.get("Default") is True ]) if runtimeContext.storage_classes == "default": runtimeContext.storage_classes = default_storage_classes if runtimeContext.intermediate_storage_classes == "default": runtimeContext.intermediate_storage_classes = default_storage_classes if not runtimeContext.name: runtimeContext.name = self.name = updated_tool.tool.get( "label") or updated_tool.metadata.get( "label") or os.path.basename(updated_tool.tool["id"]) # Upload local file references in the job order. job_order = upload_job_order(self, "%s input" % runtimeContext.name, updated_tool, job_order) # the last clause means: if it is a command line tool, and we # are going to wait for the result, and always_submit_runner # is false, then we don't submit a runner process. submitting = (runtimeContext.update_workflow or runtimeContext.create_workflow or (runtimeContext.submit and not (updated_tool.tool["class"] == "CommandLineTool" and runtimeContext.wait and not runtimeContext.always_submit_runner))) loadingContext = self.loadingContext.copy() loadingContext.do_validate = False if submitting: loadingContext.do_update = False # Document may have been auto-updated. Reload the original # document with updating disabled because we want to # submit the document with its original CWL version, not # the auto-updated one. tool = load_tool(updated_tool.tool["id"], loadingContext) else: tool = updated_tool # Upload direct dependencies of workflow steps, get back mapping of files to keep references. # Also uploads docker images. merged_map = upload_workflow_deps(self, tool) # Recreate process object (ArvadosWorkflow or # ArvadosCommandTool) because tool document may have been # updated by upload_workflow_deps in ways that modify # inheritance of hints or requirements. 
loadingContext.loader = tool.doc_loader loadingContext.avsc_names = tool.doc_schema loadingContext.metadata = tool.metadata tool = load_tool(tool.tool, loadingContext) existing_uuid = runtimeContext.update_workflow if existing_uuid or runtimeContext.create_workflow: # Create a pipeline template or workflow record and exit. if self.work_api == "containers": uuid = upload_workflow( self, tool, job_order, self.project_uuid, uuid=existing_uuid, submit_runner_ram=runtimeContext.submit_runner_ram, name=runtimeContext.name, merged_map=merged_map, submit_runner_image=runtimeContext.submit_runner_image) self.stdout.write(uuid + "\n") return (None, "success") self.apply_reqs(job_order, tool) self.ignore_docker_for_reuse = runtimeContext.ignore_docker_for_reuse self.eval_timeout = runtimeContext.eval_timeout runtimeContext = runtimeContext.copy() runtimeContext.use_container = True runtimeContext.tmpdir_prefix = "tmp" runtimeContext.work_api = self.work_api if self.work_api == "containers": if self.ignore_docker_for_reuse: raise Exception( "--ignore-docker-for-reuse not supported with containers API." ) runtimeContext.outdir = "/var/spool/cwl" runtimeContext.docker_outdir = "/var/spool/cwl" runtimeContext.tmpdir = "/tmp" runtimeContext.docker_tmpdir = "/tmp" if runtimeContext.priority < 1 or runtimeContext.priority > 1000: raise Exception("--priority must be in the range 1..1000.") if self.should_estimate_cache_size: visited = set() estimated_size = [0] def estimate_collection_cache(obj): if obj.get("location", "").startswith("keep:"): m = pdh_size.match(obj["location"][5:]) if m and m.group(1) not in visited: visited.add(m.group(1)) estimated_size[0] += int(m.group(2)) visit_class(job_order, ("File", "Directory"), estimate_collection_cache) runtimeContext.collection_cache_size = max( ((estimated_size[0] * 192) // (1024 * 1024)) + 1, 256) self.collection_cache.set_cap( runtimeContext.collection_cache_size * 1024 * 1024) logger.info("Using collection cache size %s MiB", runtimeContext.collection_cache_size) runnerjob = None if runtimeContext.submit: # Submit a runner job to run the workflow for us. if self.work_api == "containers": if submitting: tool = RunnerContainer( self, updated_tool, tool, loadingContext, runtimeContext.enable_reuse, self.output_name, self.output_tags, submit_runner_ram=runtimeContext.submit_runner_ram, name=runtimeContext.name, on_error=runtimeContext.on_error, submit_runner_image=runtimeContext.submit_runner_image, intermediate_output_ttl=runtimeContext. intermediate_output_ttl, merged_map=merged_map, priority=runtimeContext.priority, secret_store=self.secret_store, collection_cache_size=runtimeContext. collection_cache_size, collection_cache_is_default=self. 
should_estimate_cache_size) else: runtimeContext.runnerjob = tool.tool["id"] if runtimeContext.cwl_runner_job is not None: self.uuid = runtimeContext.cwl_runner_job.get('uuid') jobiter = tool.job(job_order, self.output_callback, runtimeContext) if runtimeContext.submit and not runtimeContext.wait: runnerjob = next(jobiter) runnerjob.run(runtimeContext) self.stdout.write(runnerjob.uuid + "\n") return (None, "success") current_container = arvados_cwl.util.get_current_container( self.api, self.num_retries, logger) if current_container: logger.info("Running inside container %s", current_container.get("uuid")) self.poll_api = arvados.api('v1', timeout=runtimeContext.http_timeout) self.polling_thread = threading.Thread(target=self.poll_states) self.polling_thread.start() self.task_queue = TaskQueue(self.workflow_eval_lock, self.thread_count) try: self.workflow_eval_lock.acquire() # Holds the lock while this code runs and releases it when # it is safe to do so in self.workflow_eval_lock.wait(), # at which point on_message can update job state and # process output callbacks. loopperf = Perf(metrics, "jobiter") loopperf.__enter__() for runnable in jobiter: loopperf.__exit__() if self.stop_polling.is_set(): break if self.task_queue.error is not None: raise self.task_queue.error if runnable: with Perf(metrics, "run"): self.start_run(runnable, runtimeContext) else: if (self.task_queue.in_flight + len(self.processes)) > 0: self.workflow_eval_lock.wait(3) else: logger.error( "Workflow is deadlocked, no runnable processes and not waiting on any pending processes." ) break if self.stop_polling.is_set(): break loopperf.__enter__() loopperf.__exit__() while (self.task_queue.in_flight + len(self.processes)) > 0: if self.task_queue.error is not None: raise self.task_queue.error self.workflow_eval_lock.wait(3) except UnsupportedRequirement: raise except: if sys.exc_info()[0] is KeyboardInterrupt or sys.exc_info( )[0] is SystemExit: logger.error("Interrupted, workflow will be cancelled") elif isinstance(sys.exc_info()[1], WorkflowException): logger.error( "Workflow execution failed:\n%s", sys.exc_info()[1], exc_info=(sys.exc_info()[1] if self.debug else False)) else: logger.exception("Workflow execution failed") if self.pipeline: self.api.pipeline_instances().update( uuid=self.pipeline["uuid"], body={ "state": "Failed" }).execute(num_retries=self.num_retries) if self.work_api == "containers" and not current_container: # Not running in a crunch container, so cancel any outstanding processes. 
for p in self.processes: try: self.api.container_requests().update( uuid=p, body={ "priority": "0" }).execute(num_retries=self.num_retries) except Exception: pass finally: self.workflow_eval_lock.release() self.task_queue.drain() self.stop_polling.set() self.polling_thread.join() self.task_queue.join() if self.final_status == "UnsupportedRequirement": raise UnsupportedRequirement("Check log for details.") if self.final_output is None: raise WorkflowException("Workflow did not return a result.") if runtimeContext.submit and isinstance(tool, Runner): logger.info("Final output collection %s", tool.final_output) if workbench2 or workbench1: logger.info("Output at %scollections/%s", workbench2 or workbench1, tool.final_output) else: if self.output_name is None: self.output_name = "Output of %s" % (shortname( tool.tool["id"])) if self.output_tags is None: self.output_tags = "" storage_classes = "" storage_class_req, _ = tool.get_requirement( "http://arvados.org/cwl#OutputStorageClass") if storage_class_req and storage_class_req.get( "finalStorageClass"): storage_classes = aslist( storage_class_req["finalStorageClass"]) else: storage_classes = runtimeContext.storage_classes.strip().split( ",") self.final_output, self.final_output_collection = self.make_output_collection( self.output_name, storage_classes, self.output_tags, self.final_output) self.set_crunch_output() if runtimeContext.compute_checksum: adjustDirObjs(self.final_output, partial(get_listing, self.fs_access)) adjustFileObjs(self.final_output, partial(compute_checksums, self.fs_access)) if self.trash_intermediate and self.final_status == "success": self.trash_intermediate_output() return (self.final_output, self.final_status)
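The collection cache sizing above appears to key off the size field embedded in each Keep portable data hash (which reflects the manifest text length), budgeting roughly 192x that total with a 256 MiB floor. A small worked example under that assumption:

def collection_cache_mib(total_manifest_bytes: int) -> int:
    # Same arithmetic as above: ~192x the summed manifest size, min 256 MiB.
    return max(((total_manifest_bytes * 192) // (1024 * 1024)) + 1, 256)

collection_cache_mib(0)                # 256 -> floor applies
collection_cache_mib(4 * 1024 * 1024)  # 769 -> ~192x a 4 MiB manifest total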
def run(self, runtimeContext): # ArvadosCommandTool subclasses from cwltool.CommandLineTool, # which calls makeJobRunner() to get a new ArvadosContainer # object. The fields that define execution such as # command_line, environment, etc are set on the # ArvadosContainer object by CommandLineTool.job() before # run() is called. runtimeContext = self.job_runtime container_request = { "command": self.command_line, "name": self.name, "output_path": self.outdir, "cwd": self.outdir, "priority": runtimeContext.priority, "state": "Committed", "properties": {}, } runtime_constraints = {} if runtimeContext.project_uuid: container_request["owner_uuid"] = runtimeContext.project_uuid if self.arvrunner.secret_store.has_secret(self.command_line): raise WorkflowException( "Secret material leaked on command line, only file literals may contain secrets" ) if self.arvrunner.secret_store.has_secret(self.environment): raise WorkflowException( "Secret material leaked in environment, only file literals may contain secrets" ) resources = self.builder.resources if resources is not None: runtime_constraints["vcpus"] = math.ceil(resources.get("cores", 1)) runtime_constraints["ram"] = math.ceil( resources.get("ram") * 2**20) mounts = { self.outdir: { "kind": "tmp", "capacity": math.ceil(resources.get("outdirSize", 0) * 2**20) }, self.tmpdir: { "kind": "tmp", "capacity": math.ceil(resources.get("tmpdirSize", 0) * 2**20) } } secret_mounts = {} scheduling_parameters = {} rf = [ self.pathmapper.mapper(f) for f in self.pathmapper.referenced_files ] rf.sort(key=lambda k: k.resolved) prevdir = None for resolved, target, tp, stg in rf: if not stg: continue if prevdir and target.startswith(prevdir): continue if tp == "Directory": targetdir = target else: targetdir = os.path.dirname(target) sp = resolved.split("/", 1) pdh = sp[0][5:] # remove "keep:" mounts[targetdir] = { "kind": "collection", "portable_data_hash": pdh } if pdh in self.pathmapper.pdh_to_uuid: mounts[targetdir]["uuid"] = self.pathmapper.pdh_to_uuid[pdh] if len(sp) == 2: if tp == "Directory": path = sp[1] else: path = os.path.dirname(sp[1]) if path and path != "/": mounts[targetdir]["path"] = path prevdir = targetdir + "/" with Perf(metrics, "generatefiles %s" % self.name): if self.generatefiles["listing"]: vwd = arvados.collection.Collection( api_client=self.arvrunner.api, keep_client=self.arvrunner.keep_client, num_retries=self.arvrunner.num_retries) generatemapper = NoFollowPathMapper( self.generatefiles["listing"], "", "", separateDirs=False) sorteditems = sorted(generatemapper.items(), key=lambda n: n[1].target) logger.debug("generatemapper is %s", sorteditems) with Perf(metrics, "createfiles %s" % self.name): for f, p in sorteditems: if not p.target: pass elif p.type in ("File", "Directory", "WritableFile", "WritableDirectory"): if p.resolved.startswith("_:"): vwd.mkdirs(p.target) else: source, path = self.arvrunner.fs_access.get_collection( p.resolved) vwd.copy(path or ".", p.target, source_collection=source) elif p.type == "CreateFile": if self.arvrunner.secret_store.has_secret( p.resolved): secret_mounts[ "%s/%s" % (self.outdir, p.target)] = { "kind": "text", "content": self.arvrunner.secret_store.retrieve( p.resolved) } else: with vwd.open(p.target, "w") as n: n.write(p.resolved) def keepemptydirs(p): if isinstance(p, arvados.collection.RichCollectionBase): if len(p) == 0: p.open(".keep", "w").close() else: for c in p: keepemptydirs(p[c]) keepemptydirs(vwd) if not runtimeContext.current_container: runtimeContext.current_container = 
arvados_cwl.util.get_current_container( self.arvrunner.api, self.arvrunner.num_retries, logger) info = arvados_cwl.util.get_intermediate_collection_info( self.name, runtimeContext.current_container, runtimeContext.intermediate_output_ttl) vwd.save_new(name=info["name"], owner_uuid=runtimeContext.project_uuid, ensure_unique_name=True, trash_at=info["trash_at"], properties=info["properties"]) prev = None for f, p in sorteditems: if (not p.target or self.arvrunner.secret_store.has_secret( p.resolved) or (prev is not None and p.target.startswith(prev))): continue mountpoint = "%s/%s" % (self.outdir, p.target) mounts[mountpoint] = { "kind": "collection", "portable_data_hash": vwd.portable_data_hash(), "path": p.target } if p.type.startswith("Writable"): mounts[mountpoint]["writable"] = True prev = p.target + "/" container_request["environment"] = { "TMPDIR": self.tmpdir, "HOME": self.outdir } if self.environment: container_request["environment"].update(self.environment) if self.stdin: sp = self.stdin[6:].split("/", 1) mounts["stdin"] = { "kind": "collection", "portable_data_hash": sp[0], "path": sp[1] } if self.stderr: mounts["stderr"] = { "kind": "file", "path": "%s/%s" % (self.outdir, self.stderr) } if self.stdout: mounts["stdout"] = { "kind": "file", "path": "%s/%s" % (self.outdir, self.stdout) } (docker_req, docker_is_req) = self.get_requirement("DockerRequirement") if not docker_req: docker_req = {"dockerImageId": "arvados/jobs"} container_request["container_image"] = arv_docker_get_image( self.arvrunner.api, docker_req, runtimeContext.pull_image, runtimeContext.project_uuid) network_req, _ = self.get_requirement("NetworkAccess") if network_req: runtime_constraints["API"] = network_req["networkAccess"] api_req, _ = self.get_requirement( "http://arvados.org/cwl#APIRequirement") if api_req: runtime_constraints["API"] = True runtime_req, _ = self.get_requirement( "http://arvados.org/cwl#RuntimeConstraints") if runtime_req: if "keep_cache" in runtime_req: runtime_constraints["keep_cache_ram"] = math.ceil( runtime_req["keep_cache"] * 2**20) if "outputDirType" in runtime_req: if runtime_req["outputDirType"] == "local_output_dir": # Currently the default behavior. 
pass elif runtime_req["outputDirType"] == "keep_output_dir": mounts[self.outdir] = { "kind": "collection", "writable": True } partition_req, _ = self.get_requirement( "http://arvados.org/cwl#PartitionRequirement") if partition_req: scheduling_parameters["partitions"] = aslist( partition_req["partition"]) intermediate_output_req, _ = self.get_requirement( "http://arvados.org/cwl#IntermediateOutput") if intermediate_output_req: self.output_ttl = intermediate_output_req["outputTTL"] else: self.output_ttl = self.arvrunner.intermediate_output_ttl if self.output_ttl < 0: raise WorkflowException( "Invalid value %d for output_ttl, cannot be less than zero" % container_request["output_ttl"]) if self.timelimit is not None and self.timelimit > 0: scheduling_parameters["max_run_time"] = self.timelimit extra_submit_params = {} if runtimeContext.submit_runner_cluster: extra_submit_params[ "cluster_id"] = runtimeContext.submit_runner_cluster container_request["output_name"] = "Output for step %s" % (self.name) container_request["output_ttl"] = self.output_ttl container_request["mounts"] = mounts container_request["secret_mounts"] = secret_mounts container_request["runtime_constraints"] = runtime_constraints container_request["scheduling_parameters"] = scheduling_parameters enable_reuse = runtimeContext.enable_reuse if enable_reuse: reuse_req, _ = self.get_requirement("WorkReuse") if reuse_req: enable_reuse = reuse_req["enableReuse"] reuse_req, _ = self.get_requirement( "http://arvados.org/cwl#ReuseRequirement") if reuse_req: enable_reuse = reuse_req["enableReuse"] container_request["use_existing"] = enable_reuse if runtimeContext.runnerjob.startswith("arvwf:"): wfuuid = runtimeContext.runnerjob[6:runtimeContext.runnerjob. index("#")] wfrecord = self.arvrunner.api.workflows().get(uuid=wfuuid).execute( num_retries=self.arvrunner.num_retries) if container_request["name"] == "main": container_request["name"] = wfrecord["name"] container_request["properties"]["template_uuid"] = wfuuid self.output_callback = self.arvrunner.get_wrapped_callback( self.output_callback) try: if runtimeContext.submit_request_uuid: response = self.arvrunner.api.container_requests().update( uuid=runtimeContext.submit_request_uuid, body=container_request, **extra_submit_params).execute( num_retries=self.arvrunner.num_retries) else: response = self.arvrunner.api.container_requests().create( body=container_request, **extra_submit_params).execute( num_retries=self.arvrunner.num_retries) self.uuid = response["uuid"] self.arvrunner.process_submitted(self) if response["state"] == "Final": logger.info("%s reused container %s", self.arvrunner.label(self), response["container_uuid"]) else: logger.info("%s %s state is %s", self.arvrunner.label(self), response["uuid"], response["state"]) except Exception: logger.exception("%s got an error", self.arvrunner.label(self)) self.output_callback({}, "permanentFail")
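For orientation, the container request assembled above ends up with roughly the following shape; the field names follow the code, while every value here is made up for illustration and several optional fields (secret_mounts, output_ttl, owner_uuid, ...) are omitted.

# Illustrative shape of the container request body built above.
container_request = {
    "command": ["echo", "hello"],
    "name": "step1",
    "output_path": "/var/spool/cwl",
    "cwd": "/var/spool/cwl",
    "priority": 500,
    "state": "Committed",
    "properties": {},
    "container_image": "arvados/jobs",
    "environment": {"HOME": "/var/spool/cwl", "TMPDIR": "/tmp"},
    "mounts": {
        "/var/spool/cwl": {"kind": "tmp", "capacity": 1024 * 2**20},
        "/tmp": {"kind": "tmp", "capacity": 1024 * 2**20},
    },
    "runtime_constraints": {"vcpus": 1, "ram": 1024 * 2**20},
    "scheduling_parameters": {},
    "use_existing": True,
}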
def arv_executor(self, tool, job_order, runtimeContext, logger=None): self.debug = runtimeContext.debug tool.visit(self.check_features) self.project_uuid = runtimeContext.project_uuid self.pipeline = None self.fs_access = runtimeContext.make_fs_access(runtimeContext.basedir) self.secret_store = runtimeContext.secret_store self.trash_intermediate = runtimeContext.trash_intermediate if self.trash_intermediate and self.work_api != "containers": raise Exception( "--trash-intermediate is only supported with --api=containers." ) self.intermediate_output_ttl = runtimeContext.intermediate_output_ttl if self.intermediate_output_ttl and self.work_api != "containers": raise Exception( "--intermediate-output-ttl is only supported with --api=containers." ) if self.intermediate_output_ttl < 0: raise Exception( "Invalid value %d for --intermediate-output-ttl, cannot be less than zero" % self.intermediate_output_ttl) if runtimeContext.submit_request_uuid and self.work_api != "containers": raise Exception( "--submit-request-uuid requires containers API, but using '{}' api" .format(self.work_api)) if not runtimeContext.name: runtimeContext.name = self.name = tool.tool.get( "label") or tool.metadata.get("label") or os.path.basename( tool.tool["id"]) # Upload direct dependencies of workflow steps, get back mapping of files to keep references. # Also uploads docker images. merged_map = upload_workflow_deps(self, tool) # Reload tool object which may have been updated by # upload_workflow_deps # Don't validate this time because it will just print redundant errors. loadingContext = self.loadingContext.copy() loadingContext.loader = tool.doc_loader loadingContext.avsc_names = tool.doc_schema loadingContext.metadata = tool.metadata loadingContext.do_validate = False tool = self.arv_make_tool(tool.doc_loader.idx[tool.tool["id"]], loadingContext) # Upload local file references in the job order. job_order = upload_job_order(self, "%s input" % runtimeContext.name, tool, job_order) existing_uuid = runtimeContext.update_workflow if existing_uuid or runtimeContext.create_workflow: # Create a pipeline template or workflow record and exit. if self.work_api == "jobs": tmpl = RunnerTemplate( self, tool, job_order, runtimeContext.enable_reuse, uuid=existing_uuid, submit_runner_ram=runtimeContext.submit_runner_ram, name=runtimeContext.name, merged_map=merged_map, loadingContext=loadingContext) tmpl.save() # cwltool.main will write our return value to stdout. return (tmpl.uuid, "success") elif self.work_api == "containers": return (upload_workflow( self, tool, job_order, self.project_uuid, uuid=existing_uuid, submit_runner_ram=runtimeContext.submit_runner_ram, name=runtimeContext.name, merged_map=merged_map), "success") self.ignore_docker_for_reuse = runtimeContext.ignore_docker_for_reuse self.eval_timeout = runtimeContext.eval_timeout runtimeContext = runtimeContext.copy() runtimeContext.use_container = True runtimeContext.tmpdir_prefix = "tmp" runtimeContext.work_api = self.work_api if self.work_api == "containers": if self.ignore_docker_for_reuse: raise Exception( "--ignore-docker-for-reuse not supported with containers API." 
) runtimeContext.outdir = "/var/spool/cwl" runtimeContext.docker_outdir = "/var/spool/cwl" runtimeContext.tmpdir = "/tmp" runtimeContext.docker_tmpdir = "/tmp" elif self.work_api == "jobs": if runtimeContext.priority != DEFAULT_PRIORITY: raise Exception("--priority not implemented for jobs API.") runtimeContext.outdir = "$(task.outdir)" runtimeContext.docker_outdir = "$(task.outdir)" runtimeContext.tmpdir = "$(task.tmpdir)" if runtimeContext.priority < 1 or runtimeContext.priority > 1000: raise Exception("--priority must be in the range 1..1000.") if self.should_estimate_cache_size: visited = set() estimated_size = [0] def estimate_collection_cache(obj): if obj.get("location", "").startswith("keep:"): m = pdh_size.match(obj["location"][5:]) if m and m.group(1) not in visited: visited.add(m.group(1)) estimated_size[0] += int(m.group(2)) visit_class(job_order, ("File", "Directory"), estimate_collection_cache) runtimeContext.collection_cache_size = max( ((estimated_size[0] * 192) / (1024 * 1024)) + 1, 256) self.collection_cache.set_cap( runtimeContext.collection_cache_size * 1024 * 1024) logger.info("Using collection cache size %s MiB", runtimeContext.collection_cache_size) runnerjob = None if runtimeContext.submit: # Submit a runner job to run the workflow for us. if self.work_api == "containers": if tool.tool[ "class"] == "CommandLineTool" and runtimeContext.wait and ( not runtimeContext.always_submit_runner): runtimeContext.runnerjob = tool.tool["id"] else: tool = RunnerContainer( self, tool, loadingContext, runtimeContext.enable_reuse, self.output_name, self.output_tags, submit_runner_ram=runtimeContext.submit_runner_ram, name=runtimeContext.name, on_error=runtimeContext.on_error, submit_runner_image=runtimeContext.submit_runner_image, intermediate_output_ttl=runtimeContext. intermediate_output_ttl, merged_map=merged_map, priority=runtimeContext.priority, secret_store=self.secret_store, collection_cache_size=runtimeContext. collection_cache_size, collection_cache_is_default=self. should_estimate_cache_size) elif self.work_api == "jobs": tool = RunnerJob( self, tool, loadingContext, runtimeContext.enable_reuse, self.output_name, self.output_tags, submit_runner_ram=runtimeContext.submit_runner_ram, name=runtimeContext.name, on_error=runtimeContext.on_error, submit_runner_image=runtimeContext.submit_runner_image, merged_map=merged_map) elif runtimeContext.cwl_runner_job is None and self.work_api == "jobs": # Create pipeline for local run self.pipeline = self.api.pipeline_instances().create( body={ "owner_uuid": self.project_uuid, "name": runtimeContext.name if runtimeContext. 
name else shortname(tool.tool["id"]), "components": {}, "state": "RunningOnClient" }).execute(num_retries=self.num_retries) logger.info("Pipeline instance %s", self.pipeline["uuid"]) if runtimeContext.cwl_runner_job is not None: self.uuid = runtimeContext.cwl_runner_job.get('uuid') jobiter = tool.job(job_order, self.output_callback, runtimeContext) if runtimeContext.submit and not runtimeContext.wait: runnerjob = jobiter.next() runnerjob.run(runtimeContext) return (runnerjob.uuid, "success") current_container = arvados_cwl.util.get_current_container( self.api, self.num_retries, logger) if current_container: logger.info("Running inside container %s", current_container.get("uuid")) self.poll_api = arvados.api('v1', timeout=runtimeContext.http_timeout) self.polling_thread = threading.Thread(target=self.poll_states) self.polling_thread.start() self.task_queue = TaskQueue(self.workflow_eval_lock, self.thread_count) try: self.workflow_eval_lock.acquire() # Holds the lock while this code runs and releases it when # it is safe to do so in self.workflow_eval_lock.wait(), # at which point on_message can update job state and # process output callbacks. loopperf = Perf(metrics, "jobiter") loopperf.__enter__() for runnable in jobiter: loopperf.__exit__() if self.stop_polling.is_set(): break if self.task_queue.error is not None: raise self.task_queue.error if runnable: with Perf(metrics, "run"): self.start_run(runnable, runtimeContext) else: if (self.task_queue.in_flight + len(self.processes)) > 0: self.workflow_eval_lock.wait(3) else: logger.error( "Workflow is deadlocked, no runnable processes and not waiting on any pending processes." ) break if self.stop_polling.is_set(): break loopperf.__enter__() loopperf.__exit__() while (self.task_queue.in_flight + len(self.processes)) > 0: if self.task_queue.error is not None: raise self.task_queue.error self.workflow_eval_lock.wait(3) except UnsupportedRequirement: raise except: if sys.exc_info()[0] is KeyboardInterrupt or sys.exc_info( )[0] is SystemExit: logger.error("Interrupted, workflow will be cancelled") else: logger.error( "Execution failed:\n%s", sys.exc_info()[1], exc_info=(sys.exc_info()[1] if self.debug else False)) if self.pipeline: self.api.pipeline_instances().update( uuid=self.pipeline["uuid"], body={ "state": "Failed" }).execute(num_retries=self.num_retries) if runtimeContext.submit and isinstance(tool, Runner): runnerjob = tool if runnerjob.uuid and self.work_api == "containers": self.api.container_requests().update( uuid=runnerjob.uuid, body={ "priority": "0" }).execute(num_retries=self.num_retries) finally: self.workflow_eval_lock.release() self.task_queue.drain() self.stop_polling.set() self.polling_thread.join() self.task_queue.join() if self.final_status == "UnsupportedRequirement": raise UnsupportedRequirement("Check log for details.") if self.final_output is None: raise WorkflowException("Workflow did not return a result.") if runtimeContext.submit and isinstance(tool, Runner): logger.info("Final output collection %s", tool.final_output) else: if self.output_name is None: self.output_name = "Output of %s" % (shortname( tool.tool["id"])) if self.output_tags is None: self.output_tags = "" storage_classes = runtimeContext.storage_classes.strip().split(",") self.final_output, self.final_output_collection = self.make_output_collection( self.output_name, storage_classes, self.output_tags, self.final_output) self.set_crunch_output() if runtimeContext.compute_checksum: adjustDirObjs(self.final_output, partial(get_listing, self.fs_access)) 
adjustFileObjs(self.final_output, partial(compute_checksums, self.fs_access)) if self.trash_intermediate and self.final_status == "success": self.trash_intermediate_output() return (self.final_output, self.final_status)
def collect_output( self, schema, # type: Dict[Text, Any] builder, # type: Builder outdir, # type: Text fs_access, # type: StdFsAccess compute_checksum=True # type: bool ): # type: (...) -> Optional[Union[Dict[Text, Any], List[Union[Dict[Text, Any], Text]]]] """ Collect outputs from the step :term:`Process` following its execution. .. note: When :term:`CWL` runner tries to forward ``step(i) outputs -> step(i+1) inputs`` using :meth:`collect_outputs`, it expects exact ``outputBindings`` locations to be matched. In other words, a definition like ``outputBindings: {glob: outputs/*.txt}`` will generate results located in ``step(i)`` as ``"<tmp-workdir>/outputs/file.txt"`` and ``step(i+1)`` will look explicitly in ``"<tmp-workdir>/outputs`` using the ``glob`` pattern. Because each of our :term:`Process` in the workflow are distinct/remote entities, each one stages its outputs at different URL locations, not sharing the same *root directory*. When we stage intermediate results locally, the sub-dirs are lost. Therefore, they act like individual :term:`CWL` runner calls where the *final results* are moved back to the local directory for convenient access, but our *local directory* is the URL WPS-outputs location. To let :term:`CWL` :term:`Workflow` inter-steps mapping work as intended, we must remap the locations ignoring any nested dirs where the modified *outputBindings* definition will be able to match as if each step :term:`Process` outputs were generated locally. """ result = [] # type: List[Any] empty_and_optional = False debug = LOGGER.isEnabledFor(logging.DEBUG) if "outputBinding" in schema: binding = schema["outputBinding"] globpatterns = [] # type: List[Text] revmap = partial(command_line_tool.revmap_file, builder, outdir) if "glob" in binding: with SourceLine(binding, "glob", WorkflowException, debug): for glob in aslist(binding["glob"]): glob = builder.do_eval(glob) if glob: globpatterns.extend(aslist(glob)) # rebase glob pattern as applicable (see note) for glob in list(globpatterns): if not any( glob.startswith(part) for part in [".", "/", "~"]) and "/" in glob: glob = builder.do_eval(glob.split("/")[-1]) if glob: globpatterns.extend(aslist(glob)) for glob in globpatterns: if glob.startswith(outdir): glob = glob[len(outdir) + 1:] elif glob == ".": glob = outdir elif glob.startswith("/"): raise WorkflowException( "glob patterns must not start with '/'") try: prefix = fs_access.glob(outdir) key = cmp_to_key( cast(Callable[[Text, Text], int], locale.strcoll)) # In case of stdout.log or stderr.log file not created if "stdout" in self.tool and "stderr" in self.tool \ and glob in (self.tool["stdout"], self.tool["stderr"]): filepath = Path(fs_access.join(outdir, glob)) if not filepath.is_file(): Path(filepath).touch() result.extend([{ "location": g, "path": fs_access.join(builder.outdir, g[len(prefix[0]) + 1:]), "basename": os.path.basename(g), "nameroot": os.path.splitext(os.path.basename(g))[0], "nameext": os.path.splitext(os.path.basename(g))[1], "class": "File" if fs_access.isfile(g) else "Directory" } for g in sorted(fs_access.glob( fs_access.join(outdir, glob)), key=key)]) except (OSError, IOError) as exc: LOGGER.warning(Text(exc)) except Exception: LOGGER.exception("Unexpected error from fs_access") raise for files in result: rfile = files.copy() # TODO This function raise an exception and seems to be related to docker (which is not used here) # revmap(rfile) if files["class"] == "Directory": load_listing = builder.loadListing or ( binding and binding.get("loadListing")) if 
load_listing and load_listing != "no_listing": get_listing(fs_access, files, (load_listing == "deep_listing")) else: with fs_access.open(rfile["location"], "rb") as f: contents = b"" if binding.get("loadContents") or compute_checksum: contents = f.read(CONTENT_LIMIT) if binding.get("loadContents"): files["contents"] = contents.decode("utf-8") if compute_checksum: checksum = hashlib.sha1() # nosec: B303 while contents != b"": checksum.update(contents) contents = f.read(1024 * 1024) files[ "checksum"] = f"sha1${checksum.hexdigest()}" f.seek(0, 2) file_size = f.tell() files["size"] = file_size optional = False single = False if isinstance(schema["type"], list): if "null" in schema["type"]: optional = True if "File" in schema["type"] or "Directory" in schema["type"]: single = True elif schema["type"] == "File" or schema["type"] == "Directory": single = True if "outputEval" in binding: with SourceLine(binding, "outputEval", WorkflowException, debug): result = builder.do_eval(binding["outputEval"], context=result) if single: if not result and not optional: with SourceLine(binding, "glob", WorkflowException, debug): raise WorkflowException( f"Did not find output file with glob pattern: '{globpatterns}'" ) elif not result and optional: pass elif isinstance(result, list): if len(result) > 1: raise WorkflowException( "Multiple matches for output item that is a single file." ) result = result[0] if "secondaryFiles" in schema: with SourceLine(schema, "secondaryFiles", WorkflowException, debug): for primary in aslist(result): if isinstance(primary, dict): primary.setdefault("secondaryFiles", []) pathprefix = primary["path"][0:primary["path"]. rindex("/") + 1] for file in aslist(schema["secondaryFiles"]): if isinstance( file, dict) or "$(" in file or "${" in file: sfpath = builder.do_eval(file, context=primary) subst = False else: sfpath = file subst = True for sfitem in aslist(sfpath): if isinstance(sfitem, str): if subst: sfitem = { "path": substitute( primary["path"], sfitem) } else: sfitem = { "path": pathprefix + sfitem } if "path" in sfitem and "location" not in sfitem: revmap(sfitem) if fs_access.isfile(sfitem["location"]): sfitem["class"] = "File" primary["secondaryFiles"].append( sfitem) elif fs_access.isdir(sfitem["location"]): sfitem["class"] = "Directory" primary["secondaryFiles"].append( sfitem) if "format" in schema: for primary in aslist(result): primary["format"] = builder.do_eval(schema["format"], context=primary) # Ensure files point to local references outside of the run environment # TODO: Again removing revmap.... # adjustFileObjs(result, revmap) if not result and optional: return None if not empty_and_optional and isinstance( schema["type"], dict) and schema["type"]["type"] == "record": out = {} for f in schema["type"]["fields"]: out[shortname( f["name"])] = self.collect_output( # type: ignore f, builder, outdir, fs_access, compute_checksum=compute_checksum) return out return result
def add_volumes(self, pathmapper, mounts, secret_store=None):
    # type: (PathMapper, List[Text], SecretStore) -> List[Text]
    host_outdir = self.outdir
    container_outdir = self.builder.outdir
    for src, vol in pathmapper.items():
        if not vol.staged:
            continue
        host_outdir_tgt = None  # type: Optional[Text]
        if vol.target.startswith(container_outdir + "/"):
            host_outdir_tgt = os.path.join(
                host_outdir, vol.target[len(container_outdir) + 1:])
        if vol.type in ("File", "Directory"):
            if not vol.resolved.startswith("_:"):
                mounts.append(u"%s:%s" % (
                    docker_windows_path_adjust(vol.resolved),
                    docker_windows_path_adjust(vol.target)))
        elif vol.type == "WritableFile":
            if self.inplace_update:
                mounts.append(u"%s:%s" % (
                    docker_windows_path_adjust(vol.resolved),
                    docker_windows_path_adjust(vol.target)))
            else:
                if host_outdir_tgt:
                    shutil.copy(vol.resolved, host_outdir_tgt)
                    ensure_writable(host_outdir_tgt)
                else:
                    raise WorkflowException(
                        "Unable to compute host_outdir_tgt for "
                        "WritableFile.")
        elif vol.type == "WritableDirectory":
            if vol.resolved.startswith("_:"):
                if host_outdir_tgt:
                    os.makedirs(host_outdir_tgt, 0o0755)
                else:
                    raise WorkflowException(
                        "Unable to compute host_outdir_tgt for "
                        "WritableDirectory.")
            else:
                if self.inplace_update:
                    mounts.append(u"%s:%s" % (
                        docker_windows_path_adjust(vol.resolved),
                        docker_windows_path_adjust(vol.target)))
                else:
                    if host_outdir_tgt:
                        shutil.copytree(vol.resolved, host_outdir_tgt)
                        ensure_writable(host_outdir_tgt)
                    else:
                        raise WorkflowException(
                            "Unable to compute host_outdir_tgt for "
                            "WritableDirectory.")
        elif vol.type == "CreateFile":
            if secret_store:
                contents = secret_store.retrieve(vol.resolved)
            else:
                contents = vol.resolved
            if host_outdir_tgt:
                with open(host_outdir_tgt, "wb") as f:
                    f.write(contents.encode("utf-8"))
            else:
                fd, createtmp = tempfile.mkstemp(dir=self.tmpdir)
                with os.fdopen(fd, "wb") as f:
                    f.write(contents.encode("utf-8"))
                mounts.append(u"%s:%s" % (
                    docker_windows_path_adjust(createtmp),
                    docker_windows_path_adjust(vol.target)))
    return mounts
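Each staged `File`/`Directory` above becomes a `host:container` bind string of the kind accepted by `docker run -v`. A tiny illustration with made-up paths:

# Illustrative only: the mount-string format produced by add_volumes().
mounts = []
mounts.append("%s:%s" % ("/data/inputs/reads.fq",
                         "/var/lib/cwl/inputs/reads.fq"))
# -> later passed to the container runtime as:
#    docker run -v /data/inputs/reads.fq:/var/lib/cwl/inputs/reads.fq ...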
def executor(self, tool, job_order, **kwargs):
    final_output = []
    final_status = []

    def output_callback(out, status):
        final_status.append(status)
        final_output.append(out)

    if "basedir" not in kwargs:
        raise WorkflowException("Must provide 'basedir' in kwargs")

    output_dirs = set()

    if kwargs.get("outdir"):
        finaloutdir = os.path.abspath(kwargs.get("outdir"))
    else:
        finaloutdir = None

    if kwargs.get("tmp_outdir_prefix"):
        kwargs["outdir"] = tempfile.mkdtemp(
            prefix=kwargs["tmp_outdir_prefix"])
    else:
        kwargs["outdir"] = tempfile.mkdtemp()

    output_dirs.add(kwargs["outdir"])
    kwargs["mutation_manager"] = MutationManager()

    jobReqs = None
    if "cwl:requirements" in job_order:
        jobReqs = job_order["cwl:requirements"]
    elif ("cwl:defaults" in tool.metadata
          and "cwl:requirements" in tool.metadata["cwl:defaults"]):
        jobReqs = tool.metadata["cwl:defaults"]["cwl:requirements"]
    if jobReqs:
        for req in jobReqs:
            tool.requirements.append(req)

    if kwargs.get("default_container"):
        tool.requirements.insert(0, {
            "class": "DockerRequirement",
            "dockerPull": kwargs["default_container"]
        })

    jobs = tool.job(job_order, output_callback, **kwargs)
    try:
        for runnable in jobs:
            if runnable:
                builder = kwargs.get("builder", None)
                if builder is not None:
                    runnable.builder = builder
                if runnable.outdir:
                    output_dirs.add(runnable.outdir)
                runnable.run(**kwargs)
            else:
                time.sleep(1)
    except WorkflowException as e:
        raise e
    except Exception as e:
        log.error('Workflow error')
        raise WorkflowException(str(e))

    self.wait()
    log.info('All processes have joined')

    if final_output and final_output[0] and finaloutdir:
        final_output[0] = relocateOutputs(
            final_output[0], finaloutdir, output_dirs,
            kwargs.get("move_outputs"), kwargs["make_fs_access"](""))

    if kwargs.get("rm_tmpdir"):
        cleanIntermediate(output_dirs)

    if final_output and final_status:
        return (final_output[0], final_status[0])
    else:
        return (None, "permanentFail")
def make_workflow_exception(msg):
    name = shortname(port["id"])
    return WorkflowException(
        f"Error collecting output for parameter '{name}':\n{msg}"
    )
def arv_docker_get_image(api_client, dockerRequirement, pull_image,
                         project_uuid, force_pull, tmp_outdir_prefix,
                         match_local_docker):
    """Check if a Docker image is available in Keep, if not, upload it using arv-keepdocker."""

    if "http://arvados.org/cwl#dockerCollectionPDH" in dockerRequirement:
        return dockerRequirement["http://arvados.org/cwl#dockerCollectionPDH"]

    if "dockerImageId" not in dockerRequirement and "dockerPull" in dockerRequirement:
        dockerRequirement = copy.deepcopy(dockerRequirement)
        dockerRequirement["dockerImageId"] = dockerRequirement["dockerPull"]
        if hasattr(dockerRequirement, 'lc'):
            dockerRequirement.lc.data["dockerImageId"] = dockerRequirement.lc.data["dockerPull"]

    global cached_lookups
    global cached_lookups_lock
    with cached_lookups_lock:
        if dockerRequirement["dockerImageId"] in cached_lookups:
            return cached_lookups[dockerRequirement["dockerImageId"]]

    with SourceLine(dockerRequirement, "dockerImageId", WorkflowException,
                    logger.isEnabledFor(logging.DEBUG)):
        sp = dockerRequirement["dockerImageId"].split(":")
        image_name = sp[0]
        image_tag = sp[1] if len(sp) > 1 else "latest"

        images = arvados.commands.keepdocker.list_images_in_arv(
            api_client, 3, image_name=image_name, image_tag=image_tag)

        if images and match_local_docker:
            local_image_id = determine_image_id(
                dockerRequirement["dockerImageId"])
            if local_image_id:
                # find it in the list
                found = False
                for i in images:
                    if i[1]["dockerhash"] == local_image_id:
                        found = True
                        images = [i]
                        break
                if not found:
                    # force re-upload.
                    images = []

        if not images:
            # Fetch Docker image if necessary.
            try:
                result = cwltool.docker.DockerCommandLineJob.get_image(
                    dockerRequirement, pull_image, force_pull,
                    tmp_outdir_prefix)
                if not result:
                    raise WorkflowException("Docker image '%s' not available" %
                                            dockerRequirement["dockerImageId"])
            except OSError as e:
                raise WorkflowException(
                    "While trying to get Docker image '%s', failed to execute 'docker': %s" %
                    (dockerRequirement["dockerImageId"], e))

            # Upload image to Arvados
            args = []
            if project_uuid:
                args.append("--project-uuid=" + project_uuid)
            args.append(image_name)
            args.append(image_tag)
            logger.info("Uploading Docker image %s:%s", image_name, image_tag)
            try:
                arvados.commands.put.api_client = api_client
                arvados.commands.keepdocker.main(args,
                                                 stdout=sys.stderr,
                                                 install_sig_handlers=False,
                                                 api=api_client)
            except SystemExit as e:
                # If e.code is None or zero, then keepdocker exited normally and we can continue
                if e.code:
                    raise WorkflowException("keepdocker exited with code %s" % e.code)

            images = arvados.commands.keepdocker.list_images_in_arv(
                api_client, 3, image_name=image_name, image_tag=image_tag)

        if not images:
            raise WorkflowException("Could not find Docker image %s:%s" %
                                    (image_name, image_tag))

        pdh = api_client.collections().get(
            uuid=images[0][0]).execute()["portable_data_hash"]

        with cached_lookups_lock:
            cached_lookups[dockerRequirement["dockerImageId"]] = pdh

    return pdh
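The `cached_lookups` handling above is a lock-protected memoisation: check the cache under the lock, do the slow Keep/Docker work outside it, then write the result back under the lock. A minimal sketch of the same pattern with illustrative names:

import threading

cached_lookups = {}
cached_lookups_lock = threading.Lock()

def resolve_image(image_id, slow_lookup):
    # Fast path: already resolved to a portable data hash.
    with cached_lookups_lock:
        if image_id in cached_lookups:
            return cached_lookups[image_id]
    # Slow path runs outside the lock (list/upload via arv-keepdocker).
    pdh = slow_lookup(image_id)
    with cached_lookups_lock:
        cached_lookups[image_id] = pdh
    return pdh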
def setUp(self):
    self.executor = ThreadPoolJobExecutor(1000, 2)
    self.workflow_exception = WorkflowException('workflow exception')
    self.logger = Mock()
def validate_cluster_target(arvrunner, runtimeContext):
    if (runtimeContext.submit_runner_cluster and
            runtimeContext.submit_runner_cluster not in arvrunner.api._rootDesc["remoteHosts"] and
            runtimeContext.submit_runner_cluster != arvrunner.api._rootDesc["uuidPrefix"]):
        raise WorkflowException(
            "Unknown or invalid cluster id '%s' known remote clusters are %s" %
            (runtimeContext.submit_runner_cluster,
             ", ".join(list(arvrunner.api._rootDesc["remoteHosts"].keys()))))
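A hypothetical usage sketch, faking just the two attributes the check reads (`api._rootDesc` and `submit_runner_cluster`); the cluster ids are invented:

from types import SimpleNamespace

api = SimpleNamespace(_rootDesc={"remoteHosts": {"pirca": "pirca.example.org"},
                                 "uuidPrefix": "zzzzz"})
arvrunner = SimpleNamespace(api=api)

validate_cluster_target(arvrunner, SimpleNamespace(submit_runner_cluster="pirca"))  # known remote: ok
validate_cluster_target(arvrunner, SimpleNamespace(submit_runner_cluster="zzzzz"))  # home cluster: ok
# validate_cluster_target(arvrunner, SimpleNamespace(submit_runner_cluster="xyz12"))
# -> WorkflowException: Unknown or invalid cluster id 'xyz12' ...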
def _parsl_execute( self, runtime, # type: List[Text] env, # type: MutableMapping[Text, Text] runtimeContext # type: RuntimeContext ): # type: (...) -> None scr, _ = self.get_requirement("ShellCommandRequirement") shouldquote = needs_shell_quoting_re.search # type: Callable[[Any], Any] if scr: shouldquote = lambda x: False _logger.info( u"[job %s] %s$ %s%s%s%s", self.name, self.outdir, " \\\n ".join([ shellescape.quote(Text(arg)) if shouldquote(Text(arg)) else Text(arg) for arg in (runtime + self.command_line) ]), u' < %s' % self.stdin if self.stdin else '', u' > %s' % os.path.join(self.outdir, self.stdout) if self.stdout else '', u' 2> %s' % os.path.join(self.outdir, self.stderr) if self.stderr else '') if self.joborder and runtimeContext.research_obj: job_order = self.joborder assert runtimeContext.prov_obj runtimeContext.prov_obj.used_artefacts( job_order, runtimeContext.process_run_id, runtimeContext.reference_locations, str(self.name)) outputs = {} # type: Dict[Text,Text] try: stdin_path = None if self.stdin: rmap = self.pathmapper.reversemap(self.stdin) if not rmap: raise WorkflowException("{} missing from pathmapper".format( self.stdin)) else: stdin_path = rmap[1] stderr_path = None if self.stderr: abserr = os.path.join(self.outdir, self.stderr) dnerr = os.path.dirname(abserr) if dnerr and not os.path.exists(dnerr): os.makedirs(dnerr) stderr_path = abserr stdout_path = None if self.stdout: absout = os.path.join(self.outdir, self.stdout) dn = os.path.dirname(absout) if dn and not os.path.exists(dn): os.makedirs(dn) stdout_path = absout commands = [Text(x) for x in (runtime + self.command_line)] if runtimeContext.secret_store: commands = runtimeContext.secret_store.retrieve(commands) env = runtimeContext.secret_store.retrieve(env) job_script_contents = None # type: Optional[Text] builder = getattr(self, "builder", None) # type: Builder if builder is not None: job_script_contents = builder.build_job_script(commands) print("Running my own execution layer") rcode = _job_popen( commands, stdin_path=stdin_path, stdout_path=stdout_path, stderr_path=stderr_path, env=env, cwd=self.outdir, job_dir=tempfile.mkdtemp(prefix=getdefault( runtimeContext.tmp_outdir_prefix, DEFAULT_TMP_PREFIX)), job_script_contents=job_script_contents, timelimit=self.timelimit, name=self.name) if self.successCodes and rcode in self.successCodes: processStatus = "success" elif self.temporaryFailCodes and rcode in self.temporaryFailCodes: processStatus = "temporaryFail" elif self.permanentFailCodes and rcode in self.permanentFailCodes: processStatus = "permanentFail" elif rcode == 0: processStatus = "success" else: processStatus = "permanentFail" if self.generatefiles["listing"]: assert self.generatemapper is not None relink_initialworkdir(self.generatemapper, self.outdir, self.builder.outdir, inplace_update=self.inplace_update) outputs = self.collect_outputs(self.outdir) outputs = bytes2str_in_dicts(outputs) # type: ignore except OSError as e: if e.errno == 2: if runtime: _logger.error(u"'%s' not found", runtime[0]) else: _logger.error(u"'%s' not found", self.command_line[0]) else: _logger.exception("Exception while running job") processStatus = "permanentFail" except WorkflowException as e: _logger.error(u"[job %s] Job error:\n%s" % (self.name, e)) processStatus = "permanentFail" except Exception as e: _logger.exception("Exception while running job") processStatus = "permanentFail" if runtimeContext.research_obj and self.prov_obj and \ runtimeContext.process_run_id: #creating entities for the outputs produced by each 
step (in the provenance document) self.prov_obj.generate_output_prov(outputs, runtimeContext.process_run_id, str(self.name)) self.prov_obj.document.wasEndedBy(runtimeContext.process_run_id, None, self.prov_obj.workflow_run_uri, datetime.datetime.now()) if processStatus != "success": _logger.warning(u"[job %s] completed %s", self.name, processStatus) else: _logger.info(u"[job %s] completed %s", self.name, processStatus) if _logger.isEnabledFor(logging.DEBUG): _logger.debug(u"[job %s] %s", self.name, json_dumps(outputs, indent=4)) if self.generatemapper and runtimeContext.secret_store: # Delete any runtime-generated files containing secrets. for f, p in self.generatemapper.items(): if p.type == "CreateFile": if runtimeContext.secret_store.has_secret(p.resolved): host_outdir = self.outdir container_outdir = self.builder.outdir host_outdir_tgt = p.target if p.target.startswith(container_outdir + "/"): host_outdir_tgt = os.path.join( host_outdir, p.target[len(container_outdir) + 1:]) os.remove(host_outdir_tgt) if runtimeContext.workflow_eval_lock is None: raise WorkflowException( "runtimeContext.workflow_eval_lock must not be None") with runtimeContext.workflow_eval_lock: self.output_callback(outputs, processStatus) if self.stagedir and os.path.exists(self.stagedir): _logger.debug(u"[job %s] Removing input staging directory %s", self.name, self.stagedir) shutil.rmtree(self.stagedir, True) if runtimeContext.rm_tmpdir: _logger.debug(u"[job %s] Removing temporary directory %s", self.name, self.tmpdir) shutil.rmtree(self.tmpdir, True)
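The exit-code handling above maps the process return code onto the three CWL process statuses, with the tool's `successCodes`/`temporaryFailCodes`/`permanentFailCodes` taking precedence over the default zero/non-zero rule. A compact restatement of that mapping (the helper name is ours):

def status_for(rcode, success=(), temporary_fail=(), permanent_fail=()):
    # Explicit code lists from the tool document win; otherwise 0 means
    # success and anything else is a permanent failure.
    if success and rcode in success:
        return "success"
    if temporary_fail and rcode in temporary_fail:
        return "temporaryFail"
    if permanent_fail and rcode in permanent_fail:
        return "permanentFail"
    return "success" if rcode == 0 else "permanentFail"

status_for(0)                      # "success"
status_for(3, temporary_fail=[3])  # "temporaryFail"
status_for(1)                      # "permanentFail"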
def collect_output_ports( self, ports, # type: Set[Dict[Text, Any]] builder, # type: Builder outdir, # type: Text compute_checksum=True, # type: bool jobname="", # type: Text readers=None # type: Dict[Text, Any] ): # type: (...) -> OutputPorts ret = {} # type: OutputPorts debug = LOGGER.isEnabledFor(logging.DEBUG) try: fs_access = builder.make_fs_access(outdir) custom_output = fs_access.join(outdir, "cwl.output.json") if fs_access.exists(custom_output): with fs_access.open(custom_output, "r") as f: ret = json.load(f) if debug: LOGGER.debug(u"Raw output from %s: %s", custom_output, json.dumps(ret, indent=4)) else: for i, port in enumerate(ports): def make_workflow_exception(msg): return WorkflowException( u"Error collecting output for parameter '%s':\n%s" % (shortname(port["id"]), msg)) with SourceLine(ports, i, make_workflow_exception, debug): fragment = shortname(port["id"]) ret[fragment] = self.collect_output( port, builder, outdir, fs_access, compute_checksum=compute_checksum) if ret: # revmap = partial(command_line_tool.revmap_file, builder, outdir) adjustDirObjs(ret, trim_listing) # TODO: Attempt to avoid a crash because the revmap fct is not functional # (intend for a docker usage only?) # visit_class(ret, ("File", "Directory"), cast(Callable[[Any], Any], revmap)) visit_class(ret, ("File", "Directory"), command_line_tool.remove_path) normalizeFilesDirs(ret) visit_class( ret, ("File", "Directory"), partial(command_line_tool.check_valid_locations, fs_access)) if compute_checksum: adjustFileObjs(ret, partial(compute_checksums, fs_access)) validate.validate_ex(self.names.get_name("outputs_record_schema", ""), ret, strict=False, logger=LOGGER) if ret is not None and builder.mutation_manager is not None: adjustFileObjs(ret, builder.mutation_manager.set_generation) return ret if ret is not None else {} except validate.ValidationException as exc: raise WorkflowException( "Error validating output record: {!s}\nIn:\n{}".format( exc, json.dumps(ret, indent=4))) finally: if builder.mutation_manager and readers: for reader in readers.values(): builder.mutation_manager.release_reader(jobname, reader)
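# collect_output_ports() gives a tool-written cwl.output.json absolute
# precedence over per-port "outputBinding" collection.  A sketch of that
# decision in isolation, using plain os/json instead of the fs_access
# abstraction above; collect_one_port is a hypothetical stand-in for
# self.collect_output, and the split("#") is only a rough shortname().

import json
import os


def collect_ports(outdir, ports, collect_one_port):
    # type: (str, list, callable) -> dict
    custom_output = os.path.join(outdir, "cwl.output.json")
    if os.path.exists(custom_output):
        # The tool already described its own outputs; trust it verbatim.
        with open(custom_output) as handle:
            return json.load(handle)
    # Otherwise evaluate each declared output port individually.
    return {port["id"].split("#")[-1]: collect_one_port(port, outdir)
            for port in ports}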
def done(self, record): try: self.update_pipeline_component(record) except: pass try: if record["state"] == "Complete": processStatus = "success" else: processStatus = "permanentFail" try: outputs = {} if record["output"]: logc = arvados.collection.Collection(record["log"]) log = logc.open(logc.keys()[0]) tmpdir = None outdir = None keepdir = None for l in log: # Determine the tmpdir, outdir and keepdir paths from # the job run. Unfortunately, we can't take the first # values we find (which are expected to be near the # top) and stop scanning because if the node fails and # the job restarts on a different node these values # will different runs, and we need to know about the # final run that actually produced output. g = tmpdirre.match(l) if g: tmpdir = g.group(1) g = outdirre.match(l) if g: outdir = g.group(1) g = keepre.match(l) if g: keepdir = g.group(1) colname = "Output %s of %s" % (record["output"][0:7], self.name) # check if collection already exists with same owner, name and content collection_exists = self.arvrunner.api.collections().list( filters=[[ "owner_uuid", "=", self.arvrunner.project_uuid ], ['portable_data_hash', '=', record["output"]], ["name", "=", colname]]).execute( num_retries=self.arvrunner.num_retries) if not collection_exists["items"]: # Create a collection located in the same project as the # pipeline with the contents of the output. # First, get output record. collections = self.arvrunner.api.collections().list( limit=1, filters=[[ 'portable_data_hash', '=', record["output"] ]], select=[ "manifest_text" ]).execute(num_retries=self.arvrunner.num_retries) if not collections["items"]: raise WorkflowException( "Job output '%s' cannot be found on API server" % (record["output"])) # Create new collection in the parent project # with the output contents. self.arvrunner.api.collections().create( body={ "owner_uuid": self.arvrunner.project_uuid, "name": colname, "portable_data_hash": record["output"], "manifest_text": collections["items"][0]["manifest_text"] }, ensure_unique_name=True).execute( num_retries=self.arvrunner.num_retries) self.builder.outdir = outdir self.builder.pathmapper.keepdir = keepdir outputs = self.collect_outputs("keep:" + record["output"]) except WorkflowException as e: logger.error("Error while collecting job outputs:\n%s", e, exc_info=(e if self.arvrunner.debug else False)) processStatus = "permanentFail" except Exception as e: logger.exception( "Got unknown exception while collecting job outputs:") processStatus = "permanentFail" self.output_callback(outputs, processStatus) finally: del self.arvrunner.jobs[record["uuid"]]
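# done() recovers tmpdir/outdir/keepdir by scanning the whole job log and
# keeping the *last* match for each, because a job restarted on another node
# reports new paths.  A sketch of that scan; the real tmpdirre/outdirre/keepre
# patterns are not shown in this section, so the regexes below are
# illustrative assumptions only.

import re

TMPDIR_RE = re.compile(r"task\.tmpdir=(\S+)")   # assumed log format
OUTDIR_RE = re.compile(r"task\.outdir=(\S+)")   # assumed log format
KEEPDIR_RE = re.compile(r"task\.keep=(\S+)")    # assumed log format


def scan_job_log(lines):
    # type: (list) -> tuple
    tmpdir = outdir = keepdir = None
    for line in lines:
        m = TMPDIR_RE.match(line)
        if m:
            tmpdir = m.group(1)      # later matches overwrite earlier ones
        m = OUTDIR_RE.match(line)
        if m:
            outdir = m.group(1)
        m = KEEPDIR_RE.match(line)
        if m:
            keepdir = m.group(1)
    return tmpdir, outdir, keepdir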
def run(self, runtimeContext): # type: (RuntimeContext) -> None (docker_req, docker_is_req) = self.get_requirement("DockerRequirement") self.prov_obj = runtimeContext.prov_obj img_id = None env = cast(MutableMapping[Text, Text], os.environ) user_space_docker_cmd = runtimeContext.user_space_docker_cmd if docker_req and user_space_docker_cmd: # For user-space docker implementations, a local image name or ID # takes precedence over a network pull if 'dockerImageId' in docker_req: img_id = str(docker_req["dockerImageId"]) elif 'dockerPull' in docker_req: img_id = str(docker_req["dockerPull"]) # else: # raise WorkflowException(SourceLine(docker_req).makeError( # "Docker image must be specified as 'dockerImageId' or " # "'dockerPull' when using user space implementations of " # "Docker")) else: try: if docker_req and runtimeContext.use_container: img_id = str( self.get_from_requirements( docker_req, True, runtimeContext.pull_image, getdefault(runtimeContext.force_docker_pull, False), getdefault(runtimeContext.tmp_outdir_prefix, DEFAULT_TMP_PREFIX))) if img_id is None: if self.builder.find_default_container: default_container = self.builder.find_default_container() if default_container: img_id = str(default_container) if docker_req and img_id is None and runtimeContext.use_container: raise Exception("Docker image not available") if self.prov_obj and img_id and runtimeContext.process_run_id: # TODO: Integrate with record_container_id container_agent = self.prov_obj.document.agent( uuid.uuid4().urn, {"prov:type": PROV["SoftwareAgent"], "cwlprov:image": img_id, "prov:label": "Container execution of image %s" % img_id}) # FIXME: img_id is not a sha256 id, it might just be "debian:8" #img_entity = document.entity("nih:sha-256;%s" % img_id, # {"prov:label": "Container image %s" % img_id} ) # The image is the plan for this activity-agent association #document.wasAssociatedWith(process_run_ID, container_agent, img_entity) self.prov_obj.document.wasAssociatedWith( runtimeContext.process_run_id, container_agent) except Exception as err: container = "Shifter" _logger.debug("%s error", container, exc_info=True) if docker_is_req: raise UnsupportedRequirement( "%s is required to run this tool: %s" % (container, err)) else: raise WorkflowException( "{0} is not available for this tool, try " "--no-container to disable {0}, or install " "a user space Docker replacement like uDocker with " "--user-space-docker-cmd.: {1}".format(container, err)) self._setup(runtimeContext) stageFiles(self.pathmapper, ignoreWritable=True, symLink=True, secret_store=runtimeContext.secret_store) runtime = self.create_runtime(env, runtimeContext, img_id) self._execute(runtime, env, runtimeContext)
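# run() resolves the container image roughly in this order: an explicit
# dockerImageId, then dockerPull, then a configured default container; a
# missing image is fatal only when DockerRequirement is a hard requirement
# rather than a hint.  A condensed sketch of that precedence, assuming
# docker_req is the plain requirement mapping and find_default_container
# may be None or return None; it omits the registry pull and provenance
# bookkeeping done above.

def resolve_image(docker_req, docker_is_req, find_default_container=None):
    # type: (dict, bool, callable) -> str
    img_id = None
    if docker_req:
        img_id = docker_req.get("dockerImageId") or docker_req.get("dockerPull")
    if img_id is None and find_default_container is not None:
        img_id = find_default_container()
    if img_id is None and docker_is_req:
        raise RuntimeError("Docker image not available but DockerRequirement "
                           "is mandatory for this tool")
    return img_id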
def collect_output( self, schema, # type: Dict[Text, Any] builder, # type: Builder outdir, # type: Text fs_access, # type: StdFsAccess compute_checksum=True # type: bool ): # type: (...) -> Optional[Union[Dict[Text, Any], List[Union[Dict[Text, Any], Text]]]] result = [] # type: List[Any] empty_and_optional = False debug = LOGGER.isEnabledFor(logging.DEBUG) if "outputBinding" in schema: binding = schema["outputBinding"] globpatterns = [] # type: List[Text] revmap = partial(command_line_tool.revmap_file, builder, outdir) if "glob" in binding: with SourceLine(binding, "glob", WorkflowException, debug): for glob in aslist(binding["glob"]): glob = builder.do_eval(glob) if glob: globpatterns.extend(aslist(glob)) for glob in globpatterns: if glob.startswith(outdir): glob = glob[len(outdir) + 1:] elif glob == ".": glob = outdir elif glob.startswith("/"): raise WorkflowException( "glob patterns must not start with '/'") try: prefix = fs_access.glob(outdir) key = cmp_to_key( cast(Callable[[Text, Text], int], locale.strcoll)) # In case of stdout.log or stderr.log file not created if "stdout" in self.tool and "stderr" in self.tool \ and glob in (self.tool["stdout"], self.tool["stderr"]): filepath = Path(fs_access.join(outdir, glob)) if not filepath.is_file(): Path(filepath).touch() result.extend([{ "location": g, "path": fs_access.join(builder.outdir, g[len(prefix[0]) + 1:]), "basename": os.path.basename(g), "nameroot": os.path.splitext(os.path.basename(g))[0], "nameext": os.path.splitext(os.path.basename(g))[1], "class": "File" if fs_access.isfile(g) else "Directory" } for g in sorted(fs_access.glob( fs_access.join(outdir, glob)), key=key)]) except (OSError, IOError) as exc: LOGGER.warning(Text(exc)) except Exception: LOGGER.exception("Unexpected error from fs_access") raise for files in result: rfile = files.copy() # TODO This function raise an exception and seems to be related to docker (which is not used here) # revmap(rfile) if files["class"] == "Directory": load_listing = builder.loadListing or ( binding and binding.get("loadListing")) if load_listing and load_listing != "no_listing": get_listing(fs_access, files, (load_listing == "deep_listing")) else: with fs_access.open(rfile["location"], "rb") as f: contents = b"" if binding.get("loadContents") or compute_checksum: contents = f.read(CONTENT_LIMIT) if binding.get("loadContents"): files["contents"] = contents.decode("utf-8") if compute_checksum: checksum = hashlib.sha1() # nosec: B303 while contents != b"": checksum.update(contents) contents = f.read(1024 * 1024) files[ "checksum"] = "sha1$%s" % checksum.hexdigest( ) f.seek(0, 2) file_size = f.tell() files["size"] = file_size optional = False single = False if isinstance(schema["type"], list): if "null" in schema["type"]: optional = True if "File" in schema["type"] or "Directory" in schema["type"]: single = True elif schema["type"] == "File" or schema["type"] == "Directory": single = True if "outputEval" in binding: with SourceLine(binding, "outputEval", WorkflowException, debug): result = builder.do_eval(binding["outputEval"], context=result) if single: if not result and not optional: with SourceLine(binding, "glob", WorkflowException, debug): raise WorkflowException( "Did not find output file with glob pattern: '{}'". format(globpatterns)) elif not result and optional: pass elif isinstance(result, list): if len(result) > 1: raise WorkflowException( "Multiple matches for output item that is a single file." 
) result = result[0] if "secondaryFiles" in schema: with SourceLine(schema, "secondaryFiles", WorkflowException, debug): for primary in aslist(result): if isinstance(primary, dict): primary.setdefault("secondaryFiles", []) pathprefix = primary["path"][0:primary["path"]. rindex("/") + 1] for file in aslist(schema["secondaryFiles"]): if isinstance( file, dict) or "$(" in file or "${" in file: sfpath = builder.do_eval(file, context=primary) subst = False else: sfpath = file subst = True for sfitem in aslist(sfpath): if isinstance(sfitem, str): if subst: sfitem = { "path": substitute( primary["path"], sfitem) } else: sfitem = { "path": pathprefix + sfitem } if "path" in sfitem and "location" not in sfitem: revmap(sfitem) if fs_access.isfile(sfitem["location"]): sfitem["class"] = "File" primary["secondaryFiles"].append( sfitem) elif fs_access.isdir(sfitem["location"]): sfitem["class"] = "Directory" primary["secondaryFiles"].append( sfitem) if "format" in schema: for primary in aslist(result): primary["format"] = builder.do_eval(schema["format"], context=primary) # Ensure files point to local references outside of the run environment # TODO: Again removing revmap.... # adjustFileObjs(result, revmap) if not result and optional: return None if not empty_and_optional and isinstance( schema["type"], dict) and schema["type"]["type"] == "record": out = {} for f in schema["type"]["fields"]: out[shortname( f["name"])] = self.collect_output( # type: ignore f, builder, outdir, fs_access, compute_checksum=compute_checksum) return out return result
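# The secondaryFiles loop above turns plain string patterns into sibling
# paths of the primary file via substitute().  A sketch of the usual CWL
# pattern rule as assumed here: each leading "^" strips one extension from
# the primary path before the remainder of the pattern is appended.

def substitute_pattern(primary_path, pattern):
    # type: (str, str) -> str
    path = primary_path
    while pattern.startswith("^"):
        dot = path.rfind(".")
        path = path[:dot] if dot >= 0 else path   # drop one extension per "^"
        pattern = pattern[1:]
    return path + pattern


# Example: "^.bai" next to "reads.sorted.bam" -> "reads.sorted.bai",
# while a plain ".tbi" simply appends: "calls.vcf.gz" -> "calls.vcf.gz.tbi".
assert substitute_pattern("reads.sorted.bam", "^.bai") == "reads.sorted.bai"
assert substitute_pattern("calls.vcf.gz", ".tbi") == "calls.vcf.gz.tbi"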
def executor(self, tool, job_order, runtimeContext, **kwargs): """Executor method.""" final_output = [] final_status = [] def output_callback(out, processStatus): final_status.append(processStatus) final_output.append(out) if not runtimeContext.basedir: raise WorkflowException('`runtimeContext` should contain a ' '`basedir`') output_dirs = set() if runtimeContext.outdir: finaloutdir = os.path.abspath(runtimeContext.outdir) else: finaloutdir = None if runtimeContext.tmp_outdir_prefix: runtimeContext.outdir = tempfile.mkdtemp( prefix=runtimeContext.tmp_outdir_prefix) else: runtimeContext.outdir = tempfile.mkdtemp() output_dirs.add(runtimeContext.outdir) runtimeContext.mutation_manager = MutationManager() jobReqs = None if "cwl:requirements" in job_order: jobReqs = job_order["cwl:requirements"] elif ("cwl:defaults" in tool.metadata and "cwl:requirements" in tool.metadata["cwl:defaults"]): jobReqs = tool.metadata["cwl:defaults"]["cwl:requirements"] if jobReqs: for req in jobReqs: tool.requirements.append(req) if not runtimeContext.default_container: runtimeContext.default_container = 'frolvlad/alpine-bash' runtimeContext.docker_outdir = os.path.join(runtimeContext.working_dir, "cwl/docker_outdir") runtimeContext.docker_tmpdir = os.path.join(runtimeContext.working_dir, "cwl/docker_tmpdir") runtimeContext.docker_stagedir = os.path.join( runtimeContext.working_dir, "cwl/docker_stagedir") jobs = tool.job(job_order, output_callback, runtimeContext) try: for runnable in jobs: if runnable: if runtimeContext.builder: runnable.builder = runtimeContext.builder if runnable.outdir: output_dirs.add(runnable.outdir) runnable.run(runtimeContext) else: # log.error( # "Workflow cannot make any more progress" # ) # break time.sleep(1) except WorkflowException as e: traceback.print_exc() raise e except Exception as e: traceback.print_exc() raise WorkflowException(str(e)) # wait for all processes to finish self.wait() if final_output and final_output[0] and finaloutdir: final_output[0] = relocateOutputs( final_output[0], finaloutdir, output_dirs, runtimeContext.move_outputs, runtimeContext.make_fs_access("")) if runtimeContext.rm_tmpdir: cleanIntermediate(output_dirs) if final_output and final_status: return str(final_output[0]), str(final_status[0]) else: return None, "permanentFail"
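# executor() threads results back through a closure that appends to two
# lists and then reads element 0, falling back to "permanentFail" when no
# callback ever fired.  The same pattern in isolation, as a minimal sketch
# (run_jobs is a hypothetical stand-in for iterating tool.job(...)):

def run_and_collect(run_jobs):
    # type: (callable) -> tuple
    final_output = []
    final_status = []

    def output_callback(out, process_status):
        final_status.append(process_status)
        final_output.append(out)

    run_jobs(output_callback)

    if final_output and final_status:
        return final_output[0], final_status[0]
    return None, "permanentFail"


# Example with a fake single job:
print(run_and_collect(lambda cb: cb({"result": "keep:abc"}, "success")))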
def execute(self, runtime, env, runtime_context): # noqa: E811 # type: (List[Text], MutableMapping[Text, Text], RuntimeContext) -> None self.results = self.wps_process.execute(self.builder.job, self.outdir, self.expected_outputs) if self.joborder and runtime_context.research_obj: job_order = self.joborder assert runtime_context.prov_obj assert runtime_context.process_run_id runtime_context.prov_obj.used_artefacts( job_order, runtime_context.process_run_id, str(self.name)) outputs = {} # type: Dict[Text, Text] try: rcode = 0 if self.successCodes: process_status = "success" elif self.temporaryFailCodes: process_status = "temporaryFail" elif self.permanentFailCodes: process_status = "permanentFail" elif rcode == 0: process_status = "success" else: process_status = "permanentFail" if self.generatefiles["listing"]: assert self.generatemapper is not None relink_initialworkdir(self.generatemapper, self.outdir, self.builder.outdir, inplace_update=self.inplace_update) outputs = self.collect_outputs(self.outdir) outputs = bytes2str_in_dicts(outputs) # type: ignore except OSError as exc: if exc.errno == 2: if runtime: LOGGER.exception(u"'%s' not found", runtime[0]) else: LOGGER.exception(u"'%s' not found", self.command_line[0]) else: LOGGER.exception("Exception while running job") process_status = "permanentFail" except WorkflowException as err: LOGGER.exception(u"[job %s] Job error:\n%s", self.name, err) process_status = "permanentFail" except Exception: # noqa: W0703 # nosec: B110 LOGGER.exception("Exception while running job") process_status = "permanentFail" if runtime_context.research_obj and self.prov_obj and \ runtime_context.process_run_id: # creating entities for the outputs produced by each step (in the provenance document) self.prov_obj.generate_output_prov(outputs, runtime_context.process_run_id, str(self.name)) self.prov_obj.document.wasEndedBy(runtime_context.process_run_id, None, self.prov_obj.workflow_run_uri, now()) if process_status != "success": LOGGER.warning(u"[job %s] completed %s", self.name, process_status) else: LOGGER.info(u"[job %s] completed %s", self.name, process_status) if LOGGER.isEnabledFor(logging.DEBUG): LOGGER.debug(u"[job %s] %s", self.name, json.dumps(outputs, indent=4)) if self.generatemapper and runtime_context.secret_store: # Delete any runtime-generated files containing secrets. for _, path_item in self.generatemapper.items(): if path_item.type == "CreateFile": if runtime_context.secret_store.has_secret( path_item.resolved): host_outdir = self.outdir container_outdir = self.builder.outdir host_outdir_tgt = path_item.target if path_item.target.startswith(container_outdir + "/"): host_outdir_tgt = os.path.join( host_outdir, path_item.target[len(container_outdir) + 1:]) os.remove(host_outdir_tgt) if runtime_context.workflow_eval_lock is None: raise WorkflowException( "runtime_context.workflow_eval_lock must not be None") with runtime_context.workflow_eval_lock: self.output_callback(outputs, process_status) if self.stagedir and os.path.exists(self.stagedir): LOGGER.debug(u"[job %s] Removing input staging directory %s", self.name, self.stagedir) shutil.rmtree(self.stagedir, True) if runtime_context.rm_tmpdir: LOGGER.debug(u"[job %s] Removing temporary directory %s", self.name, self.tmpdir) shutil.rmtree(self.tmpdir, True)
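# Both execute paths above delete runtime-generated files that held secrets,
# first translating the in-container target path back to a host path by
# swapping the container outdir prefix for the host outdir.  That translation
# in isolation, as a sketch assuming POSIX-style paths:

import os


def container_target_to_host(target, container_outdir, host_outdir):
    # type: (str, str, str) -> str
    if target.startswith(container_outdir + "/"):
        return os.path.join(host_outdir, target[len(container_outdir) + 1:])
    return target


# Example: a credentials file written to /var/spool/cwl inside the container
# maps back into the host job directory before being removed.
print(container_target_to_host("/var/spool/cwl/credentials.json",
                               "/var/spool/cwl", "/tmp/job1/outdir"))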
def arv_executor(self, tool, job_order, **kwargs): self.debug = kwargs.get("debug") tool.visit(self.check_features) self.project_uuid = kwargs.get("project_uuid") self.pipeline = None make_fs_access = kwargs.get("make_fs_access") or partial( CollectionFsAccess, collection_cache=self.collection_cache) self.fs_access = make_fs_access(kwargs["basedir"]) self.trash_intermediate = kwargs["trash_intermediate"] if self.trash_intermediate and self.work_api != "containers": raise Exception( "--trash-intermediate is only supported with --api=containers." ) self.intermediate_output_ttl = kwargs["intermediate_output_ttl"] if self.intermediate_output_ttl and self.work_api != "containers": raise Exception( "--intermediate-output-ttl is only supported with --api=containers." ) if self.intermediate_output_ttl < 0: raise Exception( "Invalid value %d for --intermediate-output-ttl, cannot be less than zero" % self.intermediate_output_ttl) if not kwargs.get("name"): kwargs["name"] = self.name = tool.tool.get( "label") or tool.metadata.get("label") or os.path.basename( tool.tool["id"]) # Upload direct dependencies of workflow steps, get back mapping of files to keep references. # Also uploads docker images. override_tools = {} upload_workflow_deps(self, tool, override_tools) # Reload tool object which may have been updated by # upload_workflow_deps tool = self.arv_make_tool(tool.doc_loader.idx[tool.tool["id"]], makeTool=self.arv_make_tool, loader=tool.doc_loader, avsc_names=tool.doc_schema, metadata=tool.metadata, override_tools=override_tools) # Upload local file references in the job order. job_order = upload_job_order(self, "%s input" % kwargs["name"], tool, job_order) existing_uuid = kwargs.get("update_workflow") if existing_uuid or kwargs.get("create_workflow"): # Create a pipeline template or workflow record and exit. if self.work_api == "jobs": tmpl = RunnerTemplate( self, tool, job_order, kwargs.get("enable_reuse"), uuid=existing_uuid, submit_runner_ram=kwargs.get("submit_runner_ram"), name=kwargs["name"]) tmpl.save() # cwltool.main will write our return value to stdout. return (tmpl.uuid, "success") elif self.work_api == "containers": return (upload_workflow( self, tool, job_order, self.project_uuid, uuid=existing_uuid, submit_runner_ram=kwargs.get("submit_runner_ram"), name=kwargs["name"]), "success") self.ignore_docker_for_reuse = kwargs.get("ignore_docker_for_reuse") kwargs["make_fs_access"] = make_fs_access kwargs["enable_reuse"] = kwargs.get("enable_reuse") kwargs["use_container"] = True kwargs["tmpdir_prefix"] = "tmp" kwargs["compute_checksum"] = kwargs.get("compute_checksum") if self.work_api == "containers": kwargs["outdir"] = "/var/spool/cwl" kwargs["docker_outdir"] = "/var/spool/cwl" kwargs["tmpdir"] = "/tmp" kwargs["docker_tmpdir"] = "/tmp" elif self.work_api == "jobs": kwargs["outdir"] = "$(task.outdir)" kwargs["docker_outdir"] = "$(task.outdir)" kwargs["tmpdir"] = "$(task.tmpdir)" runnerjob = None if kwargs.get("submit"): # Submit a runner job to run the workflow for us. 
if self.work_api == "containers": if tool.tool["class"] == "CommandLineTool" and kwargs.get( "wait"): kwargs["runnerjob"] = tool.tool["id"] runnerjob = tool.job(job_order, self.output_callback, **kwargs).next() else: runnerjob = RunnerContainer( self, tool, job_order, kwargs.get("enable_reuse"), self.output_name, self.output_tags, submit_runner_ram=kwargs.get("submit_runner_ram"), name=kwargs.get("name"), on_error=kwargs.get("on_error"), submit_runner_image=kwargs.get("submit_runner_image"), intermediate_output_ttl=kwargs.get( "intermediate_output_ttl")) elif self.work_api == "jobs": runnerjob = RunnerJob( self, tool, job_order, kwargs.get("enable_reuse"), self.output_name, self.output_tags, submit_runner_ram=kwargs.get("submit_runner_ram"), name=kwargs.get("name"), on_error=kwargs.get("on_error"), submit_runner_image=kwargs.get("submit_runner_image")) elif "cwl_runner_job" not in kwargs and self.work_api == "jobs": # Create pipeline for local run self.pipeline = self.api.pipeline_instances().create( body={ "owner_uuid": self.project_uuid, "name": kwargs["name"] if kwargs. get("name") else shortname(tool.tool["id"]), "components": {}, "state": "RunningOnClient" }).execute(num_retries=self.num_retries) logger.info("Pipeline instance %s", self.pipeline["uuid"]) if runnerjob and not kwargs.get("wait"): runnerjob.run(wait=kwargs.get("wait")) return (runnerjob.uuid, "success") self.poll_api = arvados.api('v1') self.polling_thread = threading.Thread(target=self.poll_states) self.polling_thread.start() if runnerjob: jobiter = iter((runnerjob, )) else: if "cwl_runner_job" in kwargs: self.uuid = kwargs.get("cwl_runner_job").get('uuid') jobiter = tool.job(job_order, self.output_callback, **kwargs) try: self.cond.acquire() # Will continue to hold the lock for the duration of this code # except when in cond.wait(), at which point on_message can update # job state and process output callbacks. loopperf = Perf(metrics, "jobiter") loopperf.__enter__() for runnable in jobiter: loopperf.__exit__() if self.stop_polling.is_set(): break if runnable: with Perf(metrics, "run"): runnable.run(**kwargs) else: if self.processes: self.cond.wait(1) else: logger.error( "Workflow is deadlocked, no runnable jobs and not waiting on any pending jobs." 
) break loopperf.__enter__() loopperf.__exit__() while self.processes: self.cond.wait(1) except UnsupportedRequirement: raise except: if sys.exc_info()[0] is KeyboardInterrupt: logger.error("Interrupted, marking pipeline as failed") else: logger.error( "Execution failed: %s", sys.exc_info()[1], exc_info=(sys.exc_info()[1] if self.debug else False)) if self.pipeline: self.api.pipeline_instances().update( uuid=self.pipeline["uuid"], body={ "state": "Failed" }).execute(num_retries=self.num_retries) if runnerjob and runnerjob.uuid and self.work_api == "containers": self.api.container_requests().update( uuid=runnerjob.uuid, body={ "priority": "0" }).execute(num_retries=self.num_retries) finally: self.cond.release() self.stop_polling.set() self.polling_thread.join() if self.final_status == "UnsupportedRequirement": raise UnsupportedRequirement("Check log for details.") if self.final_output is None: raise WorkflowException("Workflow did not return a result.") if kwargs.get("submit") and isinstance(runnerjob, Runner): logger.info("Final output collection %s", runnerjob.final_output) else: if self.output_name is None: self.output_name = "Output of %s" % (shortname( tool.tool["id"])) if self.output_tags is None: self.output_tags = "" self.final_output, self.final_output_collection = self.make_output_collection( self.output_name, self.output_tags, self.final_output) self.set_crunch_output() if kwargs.get("compute_checksum"): adjustDirObjs(self.final_output, partial(get_listing, self.fs_access)) adjustFileObjs(self.final_output, partial(compute_checksums, self.fs_access)) if self.trash_intermediate and self.final_status == "success": self.trash_intermediate_output() return (self.final_output, self.final_status)
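# Both arv_executor variants drive the job iterator under one condition
# variable: run whatever is runnable, wait while anything is still pending,
# and declare a deadlock when neither holds.  The loop shape in isolation,
# as a sketch; run_job and pending_count are hypothetical callables standing
# in for runnable.run(**kwargs) and self.processes, and the lock is held
# throughout except inside cond.wait(), as in the code above.

import threading


def drive_jobs(jobiter, run_job, pending_count, cond=None):
    # type: (object, callable, callable, threading.Condition) -> None
    cond = cond or threading.Condition()
    with cond:
        for runnable in jobiter:
            if runnable is not None:
                run_job(runnable)
            elif pending_count():
                cond.wait(1)   # polling/on_message can update state meanwhile
            else:
                raise RuntimeError("Workflow is deadlocked: no runnable jobs "
                                   "and not waiting on any pending jobs")
        while pending_count():
            cond.wait(1)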
def arv_executor(self, tool, job_order, **kwargs): self.debug = kwargs.get("debug") tool.visit(self.check_writable) self.project_uuid = kwargs.get("project_uuid") self.pipeline = None make_fs_access = kwargs.get("make_fs_access") or partial( CollectionFsAccess, api_client=self.api, keep_client=self.keep_client) self.fs_access = make_fs_access(kwargs["basedir"]) existing_uuid = kwargs.get("update_workflow") if existing_uuid or kwargs.get("create_workflow"): if self.work_api == "jobs": tmpl = RunnerTemplate( self, tool, job_order, kwargs.get("enable_reuse"), uuid=existing_uuid, submit_runner_ram=kwargs.get("submit_runner_ram"), name=kwargs.get("name")) tmpl.save() # cwltool.main will write our return value to stdout. return tmpl.uuid else: return upload_workflow( self, tool, job_order, self.project_uuid, uuid=existing_uuid, submit_runner_ram=kwargs.get("submit_runner_ram"), name=kwargs.get("name")) self.ignore_docker_for_reuse = kwargs.get("ignore_docker_for_reuse") kwargs["make_fs_access"] = make_fs_access kwargs["enable_reuse"] = kwargs.get("enable_reuse") kwargs["use_container"] = True kwargs["tmpdir_prefix"] = "tmp" kwargs["on_error"] = "continue" kwargs["compute_checksum"] = kwargs.get("compute_checksum") if not kwargs["name"]: del kwargs["name"] if self.work_api == "containers": kwargs["outdir"] = "/var/spool/cwl" kwargs["docker_outdir"] = "/var/spool/cwl" kwargs["tmpdir"] = "/tmp" kwargs["docker_tmpdir"] = "/tmp" elif self.work_api == "jobs": kwargs["outdir"] = "$(task.outdir)" kwargs["docker_outdir"] = "$(task.outdir)" kwargs["tmpdir"] = "$(task.tmpdir)" upload_instance(self, shortname(tool.tool["id"]), tool, job_order) runnerjob = None if kwargs.get("submit"): if self.work_api == "containers": if tool.tool["class"] == "CommandLineTool": kwargs["runnerjob"] = tool.tool["id"] runnerjob = tool.job(job_order, self.output_callback, **kwargs).next() else: runnerjob = RunnerContainer( self, tool, job_order, kwargs.get("enable_reuse"), self.output_name, self.output_tags, submit_runner_ram=kwargs.get("submit_runner_ram"), name=kwargs.get("name")) else: runnerjob = RunnerJob( self, tool, job_order, kwargs.get("enable_reuse"), self.output_name, self.output_tags, submit_runner_ram=kwargs.get("submit_runner_ram"), name=kwargs.get("name")) if not kwargs.get( "submit" ) and "cwl_runner_job" not in kwargs and not self.work_api == "containers": # Create pipeline for local run self.pipeline = self.api.pipeline_instances().create( body={ "owner_uuid": self.project_uuid, "name": kwargs["name"] if kwargs. get("name") else shortname(tool.tool["id"]), "components": {}, "state": "RunningOnClient" }).execute(num_retries=self.num_retries) logger.info("Pipeline instance %s", self.pipeline["uuid"]) if runnerjob and not kwargs.get("wait"): runnerjob.run(wait=kwargs.get("wait")) return runnerjob.uuid self.poll_api = arvados.api('v1') self.polling_thread = threading.Thread(target=self.poll_states) self.polling_thread.start() if runnerjob: jobiter = iter((runnerjob, )) else: if "cwl_runner_job" in kwargs: self.uuid = kwargs.get("cwl_runner_job").get('uuid') jobiter = tool.job(job_order, self.output_callback, **kwargs) try: self.cond.acquire() # Will continue to hold the lock for the duration of this code # except when in cond.wait(), at which point on_message can update # job state and process output callbacks. 
loopperf = Perf(metrics, "jobiter") loopperf.__enter__() for runnable in jobiter: loopperf.__exit__() if self.stop_polling.is_set(): break if runnable: with Perf(metrics, "run"): runnable.run(**kwargs) else: if self.processes: self.cond.wait(1) else: logger.error( "Workflow is deadlocked, no runnable jobs and not waiting on any pending jobs." ) break loopperf.__enter__() loopperf.__exit__() while self.processes: self.cond.wait(1) except UnsupportedRequirement: raise except: if sys.exc_info()[0] is KeyboardInterrupt: logger.error("Interrupted, marking pipeline as failed") else: logger.error( "Execution failed: %s", sys.exc_info()[1], exc_info=(sys.exc_info()[1] if self.debug else False)) if self.pipeline: self.api.pipeline_instances().update( uuid=self.pipeline["uuid"], body={ "state": "Failed" }).execute(num_retries=self.num_retries) if runnerjob and runnerjob.uuid and self.work_api == "containers": self.api.container_requests().update( uuid=runnerjob.uuid, body={ "priority": "0" }).execute(num_retries=self.num_retries) finally: self.cond.release() self.stop_polling.set() self.polling_thread.join() if self.final_status == "UnsupportedRequirement": raise UnsupportedRequirement("Check log for details.") if self.final_output is None: raise WorkflowException("Workflow did not return a result.") if kwargs.get("submit") and isinstance(runnerjob, Runner): logger.info("Final output collection %s", runnerjob.final_output) else: if self.output_name is None: self.output_name = "Output of %s" % (shortname( tool.tool["id"])) if self.output_tags is None: self.output_tags = "" self.final_output, self.final_output_collection = self.make_output_collection( self.output_name, self.output_tags, self.final_output) self.set_crunch_output() if self.final_status != "success": raise WorkflowException("Workflow failed.") if kwargs.get("compute_checksum"): adjustDirObjs(self.final_output, partial(getListing, self.fs_access)) adjustFileObjs(self.final_output, partial(compute_checksums, self.fs_access)) return self.final_output
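# The final compute_checksum pass fills each File object's "checksum" field;
# the convention used in collect_output() above is "sha1$<hexdigest>".  A
# streaming sketch of that computation for a single local file, assuming
# "location" is a plain local path here (the real compute_checksums goes
# through fs_access and handles keep:/file:// locations):

import hashlib
import os


def compute_file_checksum(fileobj, chunk_size=1024 * 1024):
    # type: (dict, int) -> dict
    path = fileobj["location"]
    digest = hashlib.sha1()  # nosec: B303 - sha1 is the convention used above
    with open(path, "rb") as handle:
        for chunk in iter(lambda: handle.read(chunk_size), b""):
            digest.update(chunk)
    fileobj["checksum"] = "sha1$%s" % digest.hexdigest()
    fileobj["size"] = os.path.getsize(path)
    return fileobj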