def check_writable(self, obj): if isinstance(obj, dict): if obj.get("writable"): raise UnsupportedRequirement("InitialWorkDir feature 'writable: true' not supported") for v in obj.itervalues(): self.check_writable(v) if isinstance(obj, list): for v in obj: self.check_writable(v)
def upload_docker(arvrunner, tool): if isinstance(tool, CommandLineTool): (docker_req, docker_is_req) = get_feature(tool, "DockerRequirement") if docker_req: if docker_req.get("dockerOutputDirectory"): # TODO: can be supported by containers API, but not jobs API. raise UnsupportedRequirement( "Option 'dockerOutputDirectory' of DockerRequirement not supported." ) arv_docker_get_image(arvrunner.api, docker_req, True, arvrunner.project_uuid) elif isinstance(tool, cwltool.workflow.Workflow): for s in tool.steps: upload_docker(arvrunner, s.embedded_tool)
def arv_executor(self, updated_tool, job_order, runtimeContext, logger=None): self.debug = runtimeContext.debug updated_tool.visit(self.check_features) self.project_uuid = runtimeContext.project_uuid self.pipeline = None self.fs_access = runtimeContext.make_fs_access(runtimeContext.basedir) self.secret_store = runtimeContext.secret_store self.trash_intermediate = runtimeContext.trash_intermediate if self.trash_intermediate and self.work_api != "containers": raise Exception("--trash-intermediate is only supported with --api=containers.") self.intermediate_output_ttl = runtimeContext.intermediate_output_ttl if self.intermediate_output_ttl and self.work_api != "containers": raise Exception("--intermediate-output-ttl is only supported with --api=containers.") if self.intermediate_output_ttl < 0: raise Exception("Invalid value %d for --intermediate-output-ttl, cannot be less than zero" % self.intermediate_output_ttl) if runtimeContext.submit_request_uuid and self.work_api != "containers": raise Exception("--submit-request-uuid requires containers API, but using '{}' api".format(self.work_api)) if not runtimeContext.name: runtimeContext.name = self.name = updated_tool.tool.get("label") or updated_tool.metadata.get("label") or os.path.basename(updated_tool.tool["id"]) # Upload local file references in the job order. job_order = upload_job_order(self, "%s input" % runtimeContext.name, updated_tool, job_order) # the last clause means: if it is a command line tool, and we # are going to wait for the result, and always_submit_runner # is false, then we don't submit a runner process. submitting = (runtimeContext.update_workflow or runtimeContext.create_workflow or (runtimeContext.submit and not (updated_tool.tool["class"] == "CommandLineTool" and runtimeContext.wait and not runtimeContext.always_submit_runner))) loadingContext = self.loadingContext.copy() loadingContext.do_validate = False loadingContext.do_update = False if submitting: # Document may have been auto-updated. Reload the original # document with updating disabled because we want to # submit the document with its original CWL version, not # the auto-updated one. tool = load_tool(updated_tool.tool["id"], loadingContext) else: tool = updated_tool # Upload direct dependencies of workflow steps, get back mapping of files to keep references. # Also uploads docker images. merged_map = upload_workflow_deps(self, tool) # Recreate process object (ArvadosWorkflow or # ArvadosCommandTool) because tool document may have been # updated by upload_workflow_deps in ways that modify # inheritance of hints or requirements. loadingContext.loader = tool.doc_loader loadingContext.avsc_names = tool.doc_schema loadingContext.metadata = tool.metadata tool = load_tool(tool.tool, loadingContext) existing_uuid = runtimeContext.update_workflow if existing_uuid or runtimeContext.create_workflow: # Create a pipeline template or workflow record and exit. if self.work_api == "containers": return (upload_workflow(self, tool, job_order, self.project_uuid, uuid=existing_uuid, submit_runner_ram=runtimeContext.submit_runner_ram, name=runtimeContext.name, merged_map=merged_map), "success") self.apply_reqs(job_order, tool) self.ignore_docker_for_reuse = runtimeContext.ignore_docker_for_reuse self.eval_timeout = runtimeContext.eval_timeout runtimeContext = runtimeContext.copy() runtimeContext.use_container = True runtimeContext.tmpdir_prefix = "tmp" runtimeContext.work_api = self.work_api if self.work_api == "containers": if self.ignore_docker_for_reuse: raise Exception("--ignore-docker-for-reuse not supported with containers API.") runtimeContext.outdir = "/var/spool/cwl" runtimeContext.docker_outdir = "/var/spool/cwl" runtimeContext.tmpdir = "/tmp" runtimeContext.docker_tmpdir = "/tmp" if runtimeContext.priority < 1 or runtimeContext.priority > 1000: raise Exception("--priority must be in the range 1..1000.") if self.should_estimate_cache_size: visited = set() estimated_size = [0] def estimate_collection_cache(obj): if obj.get("location", "").startswith("keep:"): m = pdh_size.match(obj["location"][5:]) if m and m.group(1) not in visited: visited.add(m.group(1)) estimated_size[0] += int(m.group(2)) visit_class(job_order, ("File", "Directory"), estimate_collection_cache) runtimeContext.collection_cache_size = max(((estimated_size[0]*192) // (1024*1024))+1, 256) self.collection_cache.set_cap(runtimeContext.collection_cache_size*1024*1024) logger.info("Using collection cache size %s MiB", runtimeContext.collection_cache_size) runnerjob = None if runtimeContext.submit: # Submit a runner job to run the workflow for us. if self.work_api == "containers": if submitting: tool = RunnerContainer(self, updated_tool, tool, loadingContext, runtimeContext.enable_reuse, self.output_name, self.output_tags, submit_runner_ram=runtimeContext.submit_runner_ram, name=runtimeContext.name, on_error=runtimeContext.on_error, submit_runner_image=runtimeContext.submit_runner_image, intermediate_output_ttl=runtimeContext.intermediate_output_ttl, merged_map=merged_map, priority=runtimeContext.priority, secret_store=self.secret_store, collection_cache_size=runtimeContext.collection_cache_size, collection_cache_is_default=self.should_estimate_cache_size) else: runtimeContext.runnerjob = tool.tool["id"] if runtimeContext.cwl_runner_job is not None: self.uuid = runtimeContext.cwl_runner_job.get('uuid') jobiter = tool.job(job_order, self.output_callback, runtimeContext) if runtimeContext.submit and not runtimeContext.wait: runnerjob = next(jobiter) runnerjob.run(runtimeContext) return (runnerjob.uuid, "success") current_container = arvados_cwl.util.get_current_container(self.api, self.num_retries, logger) if current_container: logger.info("Running inside container %s", current_container.get("uuid")) self.poll_api = arvados.api('v1', timeout=runtimeContext.http_timeout) self.polling_thread = threading.Thread(target=self.poll_states) self.polling_thread.start() self.task_queue = TaskQueue(self.workflow_eval_lock, self.thread_count) try: self.workflow_eval_lock.acquire() # Holds the lock while this code runs and releases it when # it is safe to do so in self.workflow_eval_lock.wait(), # at which point on_message can update job state and # process output callbacks. loopperf = Perf(metrics, "jobiter") loopperf.__enter__() for runnable in jobiter: loopperf.__exit__() if self.stop_polling.is_set(): break if self.task_queue.error is not None: raise self.task_queue.error if runnable: with Perf(metrics, "run"): self.start_run(runnable, runtimeContext) else: if (self.task_queue.in_flight + len(self.processes)) > 0: self.workflow_eval_lock.wait(3) else: logger.error("Workflow is deadlocked, no runnable processes and not waiting on any pending processes.") break if self.stop_polling.is_set(): break loopperf.__enter__() loopperf.__exit__() while (self.task_queue.in_flight + len(self.processes)) > 0: if self.task_queue.error is not None: raise self.task_queue.error self.workflow_eval_lock.wait(3) except UnsupportedRequirement: raise except: if sys.exc_info()[0] is KeyboardInterrupt or sys.exc_info()[0] is SystemExit: logger.error("Interrupted, workflow will be cancelled") elif isinstance(sys.exc_info()[1], WorkflowException): logger.error("Workflow execution failed:\n%s", sys.exc_info()[1], exc_info=(sys.exc_info()[1] if self.debug else False)) else: logger.exception("Workflow execution failed") if self.pipeline: self.api.pipeline_instances().update(uuid=self.pipeline["uuid"], body={"state": "Failed"}).execute(num_retries=self.num_retries) if self.work_api == "containers" and not current_container: # Not running in a crunch container, so cancel any outstanding processes. for p in self.processes: try: self.api.container_requests().update(uuid=p, body={"priority": "0"} ).execute(num_retries=self.num_retries) except Exception: pass finally: self.workflow_eval_lock.release() self.task_queue.drain() self.stop_polling.set() self.polling_thread.join() self.task_queue.join() if self.final_status == "UnsupportedRequirement": raise UnsupportedRequirement("Check log for details.") if self.final_output is None: raise WorkflowException("Workflow did not return a result.") if runtimeContext.submit and isinstance(tool, Runner): logger.info("Final output collection %s", tool.final_output) else: if self.output_name is None: self.output_name = "Output of %s" % (shortname(tool.tool["id"])) if self.output_tags is None: self.output_tags = "" storage_classes = runtimeContext.storage_classes.strip().split(",") self.final_output, self.final_output_collection = self.make_output_collection(self.output_name, storage_classes, self.output_tags, self.final_output) self.set_crunch_output() if runtimeContext.compute_checksum: adjustDirObjs(self.final_output, partial(get_listing, self.fs_access)) adjustFileObjs(self.final_output, partial(compute_checksums, self.fs_access)) if self.trash_intermediate and self.final_status == "success": self.trash_intermediate_output() return (self.final_output, self.final_status)
def arv_executor(self, tool, job_order, **kwargs): self.debug = kwargs.get("debug") tool.visit(self.check_features) self.project_uuid = kwargs.get("project_uuid") self.pipeline = None make_fs_access = kwargs.get("make_fs_access") or partial( CollectionFsAccess, api_client=self.api, keep_client=self.keep_client) self.fs_access = make_fs_access(kwargs["basedir"]) if not kwargs.get("name"): kwargs["name"] = self.name = tool.tool.get( "label") or tool.metadata.get("label") or os.path.basename( tool.tool["id"]) # Upload direct dependencies of workflow steps, get back mapping of files to keep references. # Also uploads docker images. upload_workflow_deps(self, tool) # Reload tool object which may have been updated by # upload_workflow_deps tool = self.arv_make_tool(tool.doc_loader.idx[tool.tool["id"]], makeTool=self.arv_make_tool, loader=tool.doc_loader, avsc_names=tool.doc_schema, metadata=tool.metadata) # Upload local file references in the job order. job_order = upload_job_order(self, "%s input" % kwargs["name"], tool, job_order) existing_uuid = kwargs.get("update_workflow") if existing_uuid or kwargs.get("create_workflow"): # Create a pipeline template or workflow record and exit. if self.work_api == "jobs": tmpl = RunnerTemplate( self, tool, job_order, kwargs.get("enable_reuse"), uuid=existing_uuid, submit_runner_ram=kwargs.get("submit_runner_ram"), name=kwargs["name"]) tmpl.save() # cwltool.main will write our return value to stdout. return (tmpl.uuid, "success") elif self.work_api == "containers": return (upload_workflow( self, tool, job_order, self.project_uuid, uuid=existing_uuid, submit_runner_ram=kwargs.get("submit_runner_ram"), name=kwargs["name"]), "success") self.ignore_docker_for_reuse = kwargs.get("ignore_docker_for_reuse") kwargs["make_fs_access"] = make_fs_access kwargs["enable_reuse"] = kwargs.get("enable_reuse") kwargs["use_container"] = True kwargs["tmpdir_prefix"] = "tmp" kwargs["compute_checksum"] = kwargs.get("compute_checksum") if self.work_api == "containers": kwargs["outdir"] = "/var/spool/cwl" kwargs["docker_outdir"] = "/var/spool/cwl" kwargs["tmpdir"] = "/tmp" kwargs["docker_tmpdir"] = "/tmp" elif self.work_api == "jobs": kwargs["outdir"] = "$(task.outdir)" kwargs["docker_outdir"] = "$(task.outdir)" kwargs["tmpdir"] = "$(task.tmpdir)" runnerjob = None if kwargs.get("submit"): # Submit a runner job to run the workflow for us. if self.work_api == "containers": if tool.tool["class"] == "CommandLineTool": kwargs["runnerjob"] = tool.tool["id"] upload_dependencies(self, kwargs["name"], tool.doc_loader, tool.tool, tool.tool["id"], False) runnerjob = tool.job(job_order, self.output_callback, **kwargs).next() else: runnerjob = RunnerContainer( self, tool, job_order, kwargs.get("enable_reuse"), self.output_name, self.output_tags, submit_runner_ram=kwargs.get("submit_runner_ram"), name=kwargs.get("name"), on_error=kwargs.get("on_error"), submit_runner_image=kwargs.get("submit_runner_image")) elif self.work_api == "jobs": runnerjob = RunnerJob( self, tool, job_order, kwargs.get("enable_reuse"), self.output_name, self.output_tags, submit_runner_ram=kwargs.get("submit_runner_ram"), name=kwargs.get("name"), on_error=kwargs.get("on_error"), submit_runner_image=kwargs.get("submit_runner_image")) if not kwargs.get( "submit" ) and "cwl_runner_job" not in kwargs and self.work_api == "jobs": # Create pipeline for local run self.pipeline = self.api.pipeline_instances().create( body={ "owner_uuid": self.project_uuid, "name": kwargs["name"] if kwargs. get("name") else shortname(tool.tool["id"]), "components": {}, "state": "RunningOnClient" }).execute(num_retries=self.num_retries) logger.info("Pipeline instance %s", self.pipeline["uuid"]) if runnerjob and not kwargs.get("wait"): runnerjob.run(wait=kwargs.get("wait")) return (runnerjob.uuid, "success") self.poll_api = arvados.api('v1') self.polling_thread = threading.Thread(target=self.poll_states) self.polling_thread.start() if runnerjob: jobiter = iter((runnerjob, )) else: if "cwl_runner_job" in kwargs: self.uuid = kwargs.get("cwl_runner_job").get('uuid') jobiter = tool.job(job_order, self.output_callback, **kwargs) try: self.cond.acquire() # Will continue to hold the lock for the duration of this code # except when in cond.wait(), at which point on_message can update # job state and process output callbacks. loopperf = Perf(metrics, "jobiter") loopperf.__enter__() for runnable in jobiter: loopperf.__exit__() if self.stop_polling.is_set(): break if runnable: with Perf(metrics, "run"): runnable.run(**kwargs) else: if self.processes: self.cond.wait(1) else: logger.error( "Workflow is deadlocked, no runnable jobs and not waiting on any pending jobs." ) break loopperf.__enter__() loopperf.__exit__() while self.processes: self.cond.wait(1) except UnsupportedRequirement: raise except: if sys.exc_info()[0] is KeyboardInterrupt: logger.error("Interrupted, marking pipeline as failed") else: logger.error( "Execution failed: %s", sys.exc_info()[1], exc_info=(sys.exc_info()[1] if self.debug else False)) if self.pipeline: self.api.pipeline_instances().update( uuid=self.pipeline["uuid"], body={ "state": "Failed" }).execute(num_retries=self.num_retries) if runnerjob and runnerjob.uuid and self.work_api == "containers": self.api.container_requests().update( uuid=runnerjob.uuid, body={ "priority": "0" }).execute(num_retries=self.num_retries) finally: self.cond.release() self.stop_polling.set() self.polling_thread.join() if self.final_status == "UnsupportedRequirement": raise UnsupportedRequirement("Check log for details.") if self.final_output is None: raise WorkflowException("Workflow did not return a result.") if kwargs.get("submit") and isinstance(runnerjob, Runner): logger.info("Final output collection %s", runnerjob.final_output) else: if self.output_name is None: self.output_name = "Output of %s" % (shortname( tool.tool["id"])) if self.output_tags is None: self.output_tags = "" self.final_output, self.final_output_collection = self.make_output_collection( self.output_name, self.output_tags, self.final_output) self.set_crunch_output() if kwargs.get("compute_checksum"): adjustDirObjs(self.final_output, partial(getListing, self.fs_access)) adjustFileObjs(self.final_output, partial(compute_checksums, self.fs_access)) return (self.final_output, self.final_status)
def run(self, dry_run=False, pull_image=True, **kwargs): container_request = { "command": self.command_line, "owner_uuid": self.arvrunner.project_uuid, "name": self.name, "output_path": self.outdir, "cwd": self.outdir, "priority": 1, "state": "Committed" } runtime_constraints = {} mounts = { self.outdir: { "kind": "tmp" } } dirs = set() for f in self.pathmapper.files(): _, p, tp = self.pathmapper.mapper(f) if tp == "Directory" and '/' not in p[6:]: mounts[p] = { "kind": "collection", "portable_data_hash": p[6:] } dirs.add(p[6:]) for f in self.pathmapper.files(): _, p, tp = self.pathmapper.mapper(f) if p[6:].split("/")[0] not in dirs: mounts[p] = { "kind": "collection", "portable_data_hash": p[6:] } if self.generatefiles["listing"]: raise UnsupportedRequirement("Generate files not supported") container_request["environment"] = {"TMPDIR": self.tmpdir, "HOME": self.outdir} if self.environment: container_request["environment"].update(self.environment) if self.stdin: raise UnsupportedRequirement("Stdin redirection currently not suppported") if self.stderr: raise UnsupportedRequirement("Stderr redirection currently not suppported") if self.stdout: mounts["stdout"] = {"kind": "file", "path": "%s/%s" % (self.outdir, self.stdout)} (docker_req, docker_is_req) = get_feature(self, "DockerRequirement") if not docker_req: docker_req = {"dockerImageId": "arvados/jobs"} container_request["container_image"] = arv_docker_get_image(self.arvrunner.api, docker_req, pull_image, self.arvrunner.project_uuid) resources = self.builder.resources if resources is not None: runtime_constraints["vcpus"] = resources.get("cores", 1) runtime_constraints["ram"] = resources.get("ram") * 2**20 container_request["mounts"] = mounts container_request["runtime_constraints"] = runtime_constraints try: response = self.arvrunner.api.container_requests().create( body=container_request ).execute(num_retries=self.arvrunner.num_retries) self.arvrunner.processes[response["container_uuid"]] = self logger.info("Container %s (%s) request state is %s", self.name, response["uuid"], response["state"]) if response["state"] == "Final": self.done(response) except Exception as e: logger.error("Got error %s" % str(e)) self.output_callback({}, "permanentFail")
def run(self, runtimeContext): # type: (RuntimeContext) -> None (docker_req, docker_is_req) = self.get_requirement("DockerRequirement") self.prov_obj = runtimeContext.prov_obj img_id = None env = cast(MutableMapping[Text, Text], os.environ) user_space_docker_cmd = runtimeContext.user_space_docker_cmd if docker_req and user_space_docker_cmd: # For user-space docker implementations, a local image name or ID # takes precedence over a network pull if 'dockerImageId' in docker_req: img_id = str(docker_req["dockerImageId"]) elif 'dockerPull' in docker_req: img_id = str(docker_req["dockerPull"]) # else: # raise WorkflowException(SourceLine(docker_req).makeError( # "Docker image must be specified as 'dockerImageId' or " # "'dockerPull' when using user space implementations of " # "Docker")) else: try: if docker_req and runtimeContext.use_container: img_id = str( self.get_from_requirements( docker_req, True, runtimeContext.pull_image, getdefault(runtimeContext.force_docker_pull, False), getdefault(runtimeContext.tmp_outdir_prefix, DEFAULT_TMP_PREFIX))) if img_id is None: if self.builder.find_default_container: default_container = self.builder.find_default_container() if default_container: img_id = str(default_container) if docker_req and img_id is None and runtimeContext.use_container: raise Exception("Docker image not available") if self.prov_obj and img_id and runtimeContext.process_run_id: # TODO: Integrate with record_container_id container_agent = self.prov_obj.document.agent( uuid.uuid4().urn, {"prov:type": PROV["SoftwareAgent"], "cwlprov:image": img_id, "prov:label": "Container execution of image %s" % img_id}) # FIXME: img_id is not a sha256 id, it might just be "debian:8" #img_entity = document.entity("nih:sha-256;%s" % img_id, # {"prov:label": "Container image %s" % img_id} ) # The image is the plan for this activity-agent association #document.wasAssociatedWith(process_run_ID, container_agent, img_entity) self.prov_obj.document.wasAssociatedWith( runtimeContext.process_run_id, container_agent) except Exception as err: container = "Shifter" _logger.debug("%s error", container, exc_info=True) if docker_is_req: raise UnsupportedRequirement( "%s is required to run this tool: %s" % (container, err)) else: raise WorkflowException( "{0} is not available for this tool, try " "--no-container to disable {0}, or install " "a user space Docker replacement like uDocker with " "--user-space-docker-cmd.: {1}".format(container, err)) self._setup(runtimeContext) stageFiles(self.pathmapper, ignoreWritable=True, symLink=True, secret_store=runtimeContext.secret_store) runtime = self.create_runtime(env, runtimeContext, img_id) self._execute(runtime, env, runtimeContext)
def arv_executor(self, tool, job_order, runtimeContext, logger=None): self.debug = runtimeContext.debug tool.visit(self.check_features) self.project_uuid = runtimeContext.project_uuid self.pipeline = None self.fs_access = runtimeContext.make_fs_access(runtimeContext.basedir) self.secret_store = runtimeContext.secret_store self.trash_intermediate = runtimeContext.trash_intermediate if self.trash_intermediate and self.work_api != "containers": raise Exception( "--trash-intermediate is only supported with --api=containers." ) self.intermediate_output_ttl = runtimeContext.intermediate_output_ttl if self.intermediate_output_ttl and self.work_api != "containers": raise Exception( "--intermediate-output-ttl is only supported with --api=containers." ) if self.intermediate_output_ttl < 0: raise Exception( "Invalid value %d for --intermediate-output-ttl, cannot be less than zero" % self.intermediate_output_ttl) if runtimeContext.submit_request_uuid and self.work_api != "containers": raise Exception( "--submit-request-uuid requires containers API, but using '{}' api" .format(self.work_api)) if not runtimeContext.name: runtimeContext.name = self.name = tool.tool.get( "label") or tool.metadata.get("label") or os.path.basename( tool.tool["id"]) # Upload direct dependencies of workflow steps, get back mapping of files to keep references. # Also uploads docker images. merged_map = upload_workflow_deps(self, tool) # Reload tool object which may have been updated by # upload_workflow_deps # Don't validate this time because it will just print redundant errors. loadingContext = self.loadingContext.copy() loadingContext.loader = tool.doc_loader loadingContext.avsc_names = tool.doc_schema loadingContext.metadata = tool.metadata loadingContext.do_validate = False tool = self.arv_make_tool(tool.doc_loader.idx[tool.tool["id"]], loadingContext) # Upload local file references in the job order. job_order = upload_job_order(self, "%s input" % runtimeContext.name, tool, job_order) existing_uuid = runtimeContext.update_workflow if existing_uuid or runtimeContext.create_workflow: # Create a pipeline template or workflow record and exit. if self.work_api == "jobs": tmpl = RunnerTemplate( self, tool, job_order, runtimeContext.enable_reuse, uuid=existing_uuid, submit_runner_ram=runtimeContext.submit_runner_ram, name=runtimeContext.name, merged_map=merged_map) tmpl.save() # cwltool.main will write our return value to stdout. return (tmpl.uuid, "success") elif self.work_api == "containers": return (upload_workflow( self, tool, job_order, self.project_uuid, uuid=existing_uuid, submit_runner_ram=runtimeContext.submit_runner_ram, name=runtimeContext.name, merged_map=merged_map), "success") self.ignore_docker_for_reuse = runtimeContext.ignore_docker_for_reuse self.eval_timeout = runtimeContext.eval_timeout runtimeContext = runtimeContext.copy() runtimeContext.use_container = True runtimeContext.tmpdir_prefix = "tmp" runtimeContext.work_api = self.work_api if self.work_api == "containers": if self.ignore_docker_for_reuse: raise Exception( "--ignore-docker-for-reuse not supported with containers API." ) runtimeContext.outdir = "/var/spool/cwl" runtimeContext.docker_outdir = "/var/spool/cwl" runtimeContext.tmpdir = "/tmp" runtimeContext.docker_tmpdir = "/tmp" elif self.work_api == "jobs": if runtimeContext.priority != DEFAULT_PRIORITY: raise Exception("--priority not implemented for jobs API.") runtimeContext.outdir = "$(task.outdir)" runtimeContext.docker_outdir = "$(task.outdir)" runtimeContext.tmpdir = "$(task.tmpdir)" if runtimeContext.priority < 1 or runtimeContext.priority > 1000: raise Exception("--priority must be in the range 1..1000.") runnerjob = None if runtimeContext.submit: # Submit a runner job to run the workflow for us. if self.work_api == "containers": if tool.tool[ "class"] == "CommandLineTool" and runtimeContext.wait: runtimeContext.runnerjob = tool.tool["id"] runnerjob = tool.job(job_order, self.output_callback, runtimeContext).next() else: runnerjob = RunnerContainer( self, tool, job_order, runtimeContext.enable_reuse, self.output_name, self.output_tags, submit_runner_ram=runtimeContext.submit_runner_ram, name=runtimeContext.name, on_error=runtimeContext.on_error, submit_runner_image=runtimeContext.submit_runner_image, intermediate_output_ttl=runtimeContext. intermediate_output_ttl, merged_map=merged_map, priority=runtimeContext.priority, secret_store=self.secret_store) elif self.work_api == "jobs": runnerjob = RunnerJob( self, tool, job_order, runtimeContext.enable_reuse, self.output_name, self.output_tags, submit_runner_ram=runtimeContext.submit_runner_ram, name=runtimeContext.name, on_error=runtimeContext.on_error, submit_runner_image=runtimeContext.submit_runner_image, merged_map=merged_map) elif runtimeContext.cwl_runner_job is None and self.work_api == "jobs": # Create pipeline for local run self.pipeline = self.api.pipeline_instances().create( body={ "owner_uuid": self.project_uuid, "name": runtimeContext.name if runtimeContext. name else shortname(tool.tool["id"]), "components": {}, "state": "RunningOnClient" }).execute(num_retries=self.num_retries) logger.info("Pipeline instance %s", self.pipeline["uuid"]) if runnerjob and not runtimeContext.wait: submitargs = runtimeContext.copy() submitargs.submit = False runnerjob.run(submitargs) return (runnerjob.uuid, "success") self.poll_api = arvados.api('v1') self.polling_thread = threading.Thread(target=self.poll_states) self.polling_thread.start() self.task_queue = TaskQueue(self.workflow_eval_lock, self.thread_count) if runnerjob: jobiter = iter((runnerjob, )) else: if runtimeContext.cwl_runner_job is not None: self.uuid = runtimeContext.cwl_runner_job.get('uuid') jobiter = tool.job(job_order, self.output_callback, runtimeContext) try: self.workflow_eval_lock.acquire() # Holds the lock while this code runs and releases it when # it is safe to do so in self.workflow_eval_lock.wait(), # at which point on_message can update job state and # process output callbacks. loopperf = Perf(metrics, "jobiter") loopperf.__enter__() for runnable in jobiter: loopperf.__exit__() if self.stop_polling.is_set(): break if self.task_queue.error is not None: raise self.task_queue.error if runnable: with Perf(metrics, "run"): self.start_run(runnable, runtimeContext) else: if (self.task_queue.in_flight + len(self.processes)) > 0: self.workflow_eval_lock.wait(3) else: logger.error( "Workflow is deadlocked, no runnable processes and not waiting on any pending processes." ) break loopperf.__enter__() loopperf.__exit__() while (self.task_queue.in_flight + len(self.processes)) > 0: if self.task_queue.error is not None: raise self.task_queue.error self.workflow_eval_lock.wait(3) except UnsupportedRequirement: raise except: if sys.exc_info()[0] is KeyboardInterrupt or sys.exc_info( )[0] is SystemExit: logger.error("Interrupted, workflow will be cancelled") else: logger.error( "Execution failed: %s", sys.exc_info()[1], exc_info=(sys.exc_info()[1] if self.debug else False)) if self.pipeline: self.api.pipeline_instances().update( uuid=self.pipeline["uuid"], body={ "state": "Failed" }).execute(num_retries=self.num_retries) if runnerjob and runnerjob.uuid and self.work_api == "containers": self.api.container_requests().update( uuid=runnerjob.uuid, body={ "priority": "0" }).execute(num_retries=self.num_retries) finally: self.workflow_eval_lock.release() self.task_queue.drain() self.stop_polling.set() self.polling_thread.join() self.task_queue.join() if self.final_status == "UnsupportedRequirement": raise UnsupportedRequirement("Check log for details.") if self.final_output is None: raise WorkflowException("Workflow did not return a result.") if runtimeContext.submit and isinstance(runnerjob, Runner): logger.info("Final output collection %s", runnerjob.final_output) else: if self.output_name is None: self.output_name = "Output of %s" % (shortname( tool.tool["id"])) if self.output_tags is None: self.output_tags = "" storage_classes = runtimeContext.storage_classes.strip().split(",") self.final_output, self.final_output_collection = self.make_output_collection( self.output_name, storage_classes, self.output_tags, self.final_output) self.set_crunch_output() if runtimeContext.compute_checksum: adjustDirObjs(self.final_output, partial(get_listing, self.fs_access)) adjustFileObjs(self.final_output, partial(compute_checksums, self.fs_access)) if self.trash_intermediate and self.final_status == "success": self.trash_intermediate_output() return (self.final_output, self.final_status)
def run(self, dry_run=False, pull_image=True, **kwargs): container_request = { "command": self.command_line, "owner_uuid": self.arvrunner.project_uuid, "name": self.name, "output_path": self.outdir, "cwd": self.outdir, "priority": 1, "state": "Committed", "properties": {} } runtime_constraints = {} mounts = {self.outdir: {"kind": "tmp"}} scheduling_parameters = {} dirs = set() for f in self.pathmapper.files(): pdh, p, tp = self.pathmapper.mapper(f) if tp == "Directory" and '/' not in pdh: mounts[p] = { "kind": "collection", "portable_data_hash": pdh[5:] } dirs.add(pdh) for f in self.pathmapper.files(): res, p, tp = self.pathmapper.mapper(f) if res.startswith("keep:"): res = res[5:] elif res.startswith("/keep/"): res = res[6:] else: continue sp = res.split("/", 1) pdh = sp[0] if pdh not in dirs: mounts[p] = {"kind": "collection", "portable_data_hash": pdh} if len(sp) == 2: mounts[p]["path"] = sp[1] with Perf(metrics, "generatefiles %s" % self.name): if self.generatefiles["listing"]: vwd = arvados.collection.Collection( api_client=self.arvrunner.api, keep_client=self.arvrunner.keep_client, num_retries=self.arvrunner.num_retries) generatemapper = NoFollowPathMapper([self.generatefiles], "", "", separateDirs=False) with Perf(metrics, "createfiles %s" % self.name): for f, p in generatemapper.items(): if not p.target: pass elif p.type in ("File", "Directory"): source, path = self.arvrunner.fs_access.get_collection( p.resolved) vwd.copy(path, p.target, source_collection=source) elif p.type == "CreateFile": with vwd.open(p.target, "w") as n: n.write(p.resolved.encode("utf-8")) with Perf(metrics, "generatefiles.save_new %s" % self.name): vwd.save_new() for f, p in generatemapper.items(): if not p.target: continue mountpoint = "%s/%s" % (self.outdir, p.target) mounts[mountpoint] = { "kind": "collection", "portable_data_hash": vwd.portable_data_hash(), "path": p.target } container_request["environment"] = { "TMPDIR": self.tmpdir, "HOME": self.outdir } if self.environment: container_request["environment"].update(self.environment) if self.stdin: raise UnsupportedRequirement( "Stdin redirection currently not suppported") if self.stderr: raise UnsupportedRequirement( "Stderr redirection currently not suppported") if self.stdout: mounts["stdout"] = { "kind": "file", "path": "%s/%s" % (self.outdir, self.stdout) } (docker_req, docker_is_req) = get_feature(self, "DockerRequirement") if not docker_req: docker_req = {"dockerImageId": "arvados/jobs"} container_request["container_image"] = arv_docker_get_image( self.arvrunner.api, docker_req, pull_image, self.arvrunner.project_uuid) resources = self.builder.resources if resources is not None: runtime_constraints["vcpus"] = resources.get("cores", 1) runtime_constraints["ram"] = resources.get("ram") * 2**20 api_req, _ = get_feature(self, "http://arvados.org/cwl#APIRequirement") if api_req: runtime_constraints["API"] = True runtime_req, _ = get_feature( self, "http://arvados.org/cwl#RuntimeConstraints") if runtime_req: if "keep_cache" in runtime_req: runtime_constraints[ "keep_cache_ram"] = runtime_req["keep_cache"] * 2**20 partition_req, _ = get_feature( self, "http://arvados.org/cwl#PartitionRequirement") if partition_req: scheduling_parameters["partitions"] = aslist( partition_req["partition"]) container_request["mounts"] = mounts container_request["runtime_constraints"] = runtime_constraints container_request["use_existing"] = kwargs.get("enable_reuse", True) container_request["scheduling_parameters"] = scheduling_parameters if kwargs.get("runnerjob", "").startswith("arvwf:"): wfuuid = kwargs["runnerjob"][6:kwargs["runnerjob"].index("#")] wfrecord = self.arvrunner.api.workflows().get(uuid=wfuuid).execute( num_retries=self.arvrunner.num_retries) if container_request["name"] == "main": container_request["name"] = wfrecord["name"] container_request["properties"]["template_uuid"] = wfuuid try: response = self.arvrunner.api.container_requests().create( body=container_request).execute( num_retries=self.arvrunner.num_retries) self.uuid = response["uuid"] self.arvrunner.processes[self.uuid] = self if response["state"] == "Final": logger.info("%s reused container %s", self.arvrunner.label(self), response["container_uuid"]) self.done(response) else: logger.info("%s %s state is %s", self.arvrunner.label(self), response["uuid"], response["state"]) except Exception as e: logger.error("%s got error %s" % (self.arvrunner.label(self), str(e))) self.output_callback({}, "permanentFail")
def arvExecutor(self, tool, job_order, **kwargs): self.debug = kwargs.get("debug") tool.visit(self.check_writable) if kwargs.get("quiet"): logger.setLevel(logging.WARN) logging.getLogger('arvados.arv-run').setLevel(logging.WARN) useruuid = self.api.users().current().execute()["uuid"] self.project_uuid = kwargs.get("project_uuid") if kwargs.get( "project_uuid") else useruuid self.pipeline = None make_fs_access = kwargs.get("make_fs_access") or partial( CollectionFsAccess, api_client=self.api) self.fs_access = make_fs_access(kwargs["basedir"]) if kwargs.get("create_template"): tmpl = RunnerTemplate(self, tool, job_order, kwargs.get("enable_reuse")) tmpl.save() # cwltool.main will write our return value to stdout. return tmpl.uuid self.debug = kwargs.get("debug") self.ignore_docker_for_reuse = kwargs.get("ignore_docker_for_reuse") kwargs["make_fs_access"] = make_fs_access kwargs["enable_reuse"] = kwargs.get("enable_reuse") kwargs["use_container"] = True kwargs["tmpdir_prefix"] = "tmp" kwargs["on_error"] = "continue" kwargs["compute_checksum"] = kwargs.get("compute_checksum") if self.work_api == "containers": kwargs["outdir"] = "/var/spool/cwl" kwargs["docker_outdir"] = "/var/spool/cwl" kwargs["tmpdir"] = "/tmp" kwargs["docker_tmpdir"] = "/tmp" elif self.work_api == "jobs": kwargs["outdir"] = "$(task.outdir)" kwargs["docker_outdir"] = "$(task.outdir)" kwargs["tmpdir"] = "$(task.tmpdir)" runnerjob = None if kwargs.get("submit"): if self.work_api == "containers": if tool.tool["class"] == "CommandLineTool": runnerjob = tool.job(job_order, self.output_callback, **kwargs).next() else: runnerjob = RunnerContainer(self, tool, job_order, kwargs.get("enable_reuse")) else: runnerjob = RunnerJob(self, tool, job_order, kwargs.get("enable_reuse")) if not kwargs.get( "submit" ) and "cwl_runner_job" not in kwargs and not self.work_api == "containers": # Create pipeline for local run self.pipeline = self.api.pipeline_instances().create( body={ "owner_uuid": self.project_uuid, "name": shortname(tool.tool["id"]), "components": {}, "state": "RunningOnClient" }).execute(num_retries=self.num_retries) logger.info("Pipeline instance %s", self.pipeline["uuid"]) if runnerjob and not kwargs.get("wait"): runnerjob.run() return runnerjob.uuid self.poll_api = arvados.api('v1') self.polling_thread = threading.Thread(target=self.poll_states) self.polling_thread.start() if runnerjob: jobiter = iter((runnerjob, )) else: if "cwl_runner_job" in kwargs: self.uuid = kwargs.get("cwl_runner_job").get('uuid') jobiter = tool.job(job_order, self.output_callback, **kwargs) try: self.cond.acquire() # Will continue to hold the lock for the duration of this code # except when in cond.wait(), at which point on_message can update # job state and process output callbacks. for runnable in jobiter: if runnable: runnable.run(**kwargs) else: if self.processes: self.cond.wait(1) else: logger.error( "Workflow is deadlocked, no runnable jobs and not waiting on any pending jobs." ) break while self.processes: self.cond.wait(1) except UnsupportedRequirement: raise except: if sys.exc_info()[0] is KeyboardInterrupt: logger.error("Interrupted, marking pipeline as failed") else: logger.error( "Caught unhandled exception, marking pipeline as failed. Error was: %s", sys.exc_info()[1], exc_info=(sys.exc_info()[1] if self.debug else False)) if self.pipeline: self.api.pipeline_instances().update( uuid=self.pipeline["uuid"], body={ "state": "Failed" }).execute(num_retries=self.num_retries) if runnerjob and runnerjob.uuid and self.work_api == "containers": self.api.container_requests().update( uuid=runnerjob.uuid, body={ "priority": "0" }).execute(num_retries=self.num_retries) finally: self.cond.release() self.stop_polling.set() self.polling_thread.join() if self.final_status == "UnsupportedRequirement": raise UnsupportedRequirement("Check log for details.") if self.final_status != "success": raise WorkflowException("Workflow failed.") if self.final_output is None: raise WorkflowException("Workflow did not return a result.") if kwargs.get("compute_checksum"): adjustFileObjs(self.final_output, partial(compute_checksums, self.fs_access)) return self.final_output
def run(self, dry_run=False, pull_image=True, **kwargs): container_request = { "command": self.command_line, "owner_uuid": self.arvrunner.project_uuid, "name": self.name, "output_path": self.outdir, "cwd": self.outdir, "priority": 1, "state": "Committed", "properties": {} } runtime_constraints = {} mounts = {self.outdir: {"kind": "tmp"}} scheduling_parameters = {} dirs = set() for f in self.pathmapper.files(): _, p, tp = self.pathmapper.mapper(f) if tp == "Directory" and '/' not in p[6:]: mounts[p] = {"kind": "collection", "portable_data_hash": p[6:]} dirs.add(p[6:]) for f in self.pathmapper.files(): _, p, tp = self.pathmapper.mapper(f) if p[6:].split("/")[0] not in dirs: mounts[p] = {"kind": "collection", "portable_data_hash": p[6:]} if self.generatefiles["listing"]: raise UnsupportedRequirement( "InitialWorkDirRequirement not supported with --api=containers" ) container_request["environment"] = { "TMPDIR": self.tmpdir, "HOME": self.outdir } if self.environment: container_request["environment"].update(self.environment) if self.stdin: raise UnsupportedRequirement( "Stdin redirection currently not suppported") if self.stderr: raise UnsupportedRequirement( "Stderr redirection currently not suppported") if self.stdout: mounts["stdout"] = { "kind": "file", "path": "%s/%s" % (self.outdir, self.stdout) } (docker_req, docker_is_req) = get_feature(self, "DockerRequirement") if not docker_req: docker_req = {"dockerImageId": arvados_jobs_image(self.arvrunner)} container_request["container_image"] = arv_docker_get_image( self.arvrunner.api, docker_req, pull_image, self.arvrunner.project_uuid) resources = self.builder.resources if resources is not None: runtime_constraints["vcpus"] = resources.get("cores", 1) runtime_constraints["ram"] = resources.get("ram") * 2**20 api_req, _ = get_feature(self, "http://arvados.org/cwl#APIRequirement") if api_req: runtime_constraints["API"] = True runtime_req, _ = get_feature( self, "http://arvados.org/cwl#RuntimeConstraints") if runtime_req: if "keep_cache" in runtime_req: runtime_constraints["keep_cache_ram"] = runtime_req[ "keep_cache"] partition_req, _ = get_feature( self, "http://arvados.org/cwl#PartitionRequirement") if partition_req: scheduling_parameters["partitions"] = aslist( partition_req["partition"]) container_request["mounts"] = mounts container_request["runtime_constraints"] = runtime_constraints container_request["use_existing"] = kwargs.get("enable_reuse", True) container_request["scheduling_parameters"] = scheduling_parameters if kwargs.get("runnerjob", "").startswith("arvwf:"): wfuuid = kwargs["runnerjob"][6:kwargs["runnerjob"].index("#")] wfrecord = self.arvrunner.api.workflows().get(uuid=wfuuid).execute( num_retries=self.arvrunner.num_retries) if container_request["name"] == "main": container_request["name"] = wfrecord["name"] container_request["properties"]["template_uuid"] = wfuuid try: response = self.arvrunner.api.container_requests().create( body=container_request).execute( num_retries=self.arvrunner.num_retries) self.uuid = response["uuid"] self.arvrunner.processes[self.uuid] = self logger.info("Container request %s (%s) state is %s", self.name, response["uuid"], response["state"]) if response["state"] == "Final": self.done(response) except Exception as e: logger.error("Got error %s" % str(e)) self.output_callback({}, "permanentFail")