Example #1
 def check_writable(self, obj):
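     # Recursively walk a CWL document fragment: any nested dict carrying
     # "writable: true" (an InitialWorkDir entry) is rejected, since this
     # runner cannot stage writable inputs. (obj.itervalues() marks this
     # snippet as Python 2 code.)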
     if isinstance(obj, dict):
         if obj.get("writable"):
             raise UnsupportedRequirement("InitialWorkDir feature 'writable: true' not supported")
         for v in obj.itervalues():
             self.check_writable(v)
     if isinstance(obj, list):
         for v in obj:
             self.check_writable(v)
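
A minimal sketch of how this visitor is typically wired up, assuming a loaded
cwltool Process object with a .visit() method (as used in Example #9 below,
where the runner calls tool.visit(self.check_writable)):

    from cwltool.errors import UnsupportedRequirement

    try:
        tool.visit(runner.check_writable)  # raises on any "writable: true"
    except UnsupportedRequirement as err:
        print(err)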
Example #2
def upload_docker(arvrunner, tool):
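    # Ensure Docker images are available in Arvados before the workflow runs:
    # a CommandLineTool with a DockerRequirement has its image fetched and
    # uploaded; a Workflow is handled by recursing into each step's
    # embedded tool.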
    if isinstance(tool, CommandLineTool):
        (docker_req, docker_is_req) = get_feature(tool, "DockerRequirement")
        if docker_req:
            if docker_req.get("dockerOutputDirectory"):
                # TODO: can be supported by containers API, but not jobs API.
                raise UnsupportedRequirement(
                    "Option 'dockerOutputDirectory' of DockerRequirement not supported."
                )
            arv_docker_get_image(arvrunner.api, docker_req, True,
                                 arvrunner.project_uuid)
    elif isinstance(tool, cwltool.workflow.Workflow):
        for s in tool.steps:
            upload_docker(arvrunner, s.embedded_tool)
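
The (docker_req, docker_is_req) pair comes from get_feature, which these
snippets do not show. A rough, hypothetical reconstruction from the call
sites (the real helper in arvados-cwl-runner may differ):

    def get_feature(obj, feature_name):
        # Requirements are binding, hints are advisory; the boolean tells
        # the caller which kind matched.
        for t in getattr(obj, "requirements", []):
            if t["class"] == feature_name:
                return (t, True)
        for t in getattr(obj, "hints", []):
            if t["class"] == feature_name:
                return (t, False)
        return (None, None)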
Example #3
    def arv_executor(self, updated_tool, job_order, runtimeContext, logger=None):
        self.debug = runtimeContext.debug

        updated_tool.visit(self.check_features)

        self.project_uuid = runtimeContext.project_uuid
        self.pipeline = None
        self.fs_access = runtimeContext.make_fs_access(runtimeContext.basedir)
        self.secret_store = runtimeContext.secret_store

        self.trash_intermediate = runtimeContext.trash_intermediate
        if self.trash_intermediate and self.work_api != "containers":
            raise Exception("--trash-intermediate is only supported with --api=containers.")

        self.intermediate_output_ttl = runtimeContext.intermediate_output_ttl
        if self.intermediate_output_ttl and self.work_api != "containers":
            raise Exception("--intermediate-output-ttl is only supported with --api=containers.")
        if self.intermediate_output_ttl < 0:
            raise Exception("Invalid value %d for --intermediate-output-ttl, cannot be less than zero" % self.intermediate_output_ttl)

        if runtimeContext.submit_request_uuid and self.work_api != "containers":
            raise Exception("--submit-request-uuid requires containers API, but using '{}' api".format(self.work_api))

        if not runtimeContext.name:
            runtimeContext.name = self.name = updated_tool.tool.get("label") or updated_tool.metadata.get("label") or os.path.basename(updated_tool.tool["id"])

        # Upload local file references in the job order.
        job_order = upload_job_order(self, "%s input" % runtimeContext.name,
                                     updated_tool, job_order)

        # the last clause means: if it is a command line tool, and we
        # are going to wait for the result, and always_submit_runner
        # is false, then we don't submit a runner process.

        submitting = (runtimeContext.update_workflow or
                      runtimeContext.create_workflow or
                      (runtimeContext.submit and not
                       (updated_tool.tool["class"] == "CommandLineTool" and
                        runtimeContext.wait and
                        not runtimeContext.always_submit_runner)))

        loadingContext = self.loadingContext.copy()
        loadingContext.do_validate = False
        loadingContext.do_update = False
        if submitting:
            # Document may have been auto-updated. Reload the original
            # document with updating disabled because we want to
            # submit the document with its original CWL version, not
            # the auto-updated one.
            tool = load_tool(updated_tool.tool["id"], loadingContext)
        else:
            tool = updated_tool

        # Upload direct dependencies of workflow steps, get back mapping of files to keep references.
        # Also uploads docker images.
        merged_map = upload_workflow_deps(self, tool)

        # Recreate process object (ArvadosWorkflow or
        # ArvadosCommandTool) because tool document may have been
        # updated by upload_workflow_deps in ways that modify
        # inheritance of hints or requirements.
        loadingContext.loader = tool.doc_loader
        loadingContext.avsc_names = tool.doc_schema
        loadingContext.metadata = tool.metadata
        tool = load_tool(tool.tool, loadingContext)

        existing_uuid = runtimeContext.update_workflow
        if existing_uuid or runtimeContext.create_workflow:
            # Create a pipeline template or workflow record and exit.
            if self.work_api == "containers":
                return (upload_workflow(self, tool, job_order,
                                        self.project_uuid,
                                        uuid=existing_uuid,
                                        submit_runner_ram=runtimeContext.submit_runner_ram,
                                        name=runtimeContext.name,
                                        merged_map=merged_map),
                        "success")

        self.apply_reqs(job_order, tool)

        self.ignore_docker_for_reuse = runtimeContext.ignore_docker_for_reuse
        self.eval_timeout = runtimeContext.eval_timeout

        runtimeContext = runtimeContext.copy()
        runtimeContext.use_container = True
        runtimeContext.tmpdir_prefix = "tmp"
        runtimeContext.work_api = self.work_api

        if self.work_api == "containers":
            if self.ignore_docker_for_reuse:
                raise Exception("--ignore-docker-for-reuse not supported with containers API.")
            runtimeContext.outdir = "/var/spool/cwl"
            runtimeContext.docker_outdir = "/var/spool/cwl"
            runtimeContext.tmpdir = "/tmp"
            runtimeContext.docker_tmpdir = "/tmp"

        if runtimeContext.priority < 1 or runtimeContext.priority > 1000:
            raise Exception("--priority must be in the range 1..1000.")

        if self.should_estimate_cache_size:
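            # Estimate how much RAM the Keep collection cache needs: sum the
            # size component of each unique "keep:" portable data hash in the
            # job order, then scale by 192 with a 256 MiB floor. (pdh_size is
            # assumed to be a module-level regex along the lines of
            # re.compile(r"([0-9a-f]{32})\+(\d+)"), hash in group 1, size in
            # group 2.)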
            visited = set()
            estimated_size = [0]
            def estimate_collection_cache(obj):
                if obj.get("location", "").startswith("keep:"):
                    m = pdh_size.match(obj["location"][5:])
                    if m and m.group(1) not in visited:
                        visited.add(m.group(1))
                        estimated_size[0] += int(m.group(2))
            visit_class(job_order, ("File", "Directory"), estimate_collection_cache)
            runtimeContext.collection_cache_size = max(((estimated_size[0]*192) // (1024*1024))+1, 256)
            self.collection_cache.set_cap(runtimeContext.collection_cache_size*1024*1024)

        logger.info("Using collection cache size %s MiB", runtimeContext.collection_cache_size)

        runnerjob = None
        if runtimeContext.submit:
            # Submit a runner job to run the workflow for us.
            if self.work_api == "containers":
                if submitting:
                    tool = RunnerContainer(self, updated_tool,
                                           tool, loadingContext, runtimeContext.enable_reuse,
                                           self.output_name,
                                           self.output_tags,
                                           submit_runner_ram=runtimeContext.submit_runner_ram,
                                           name=runtimeContext.name,
                                           on_error=runtimeContext.on_error,
                                           submit_runner_image=runtimeContext.submit_runner_image,
                                           intermediate_output_ttl=runtimeContext.intermediate_output_ttl,
                                           merged_map=merged_map,
                                           priority=runtimeContext.priority,
                                           secret_store=self.secret_store,
                                           collection_cache_size=runtimeContext.collection_cache_size,
                                           collection_cache_is_default=self.should_estimate_cache_size)
                else:
                    runtimeContext.runnerjob = tool.tool["id"]

        if runtimeContext.cwl_runner_job is not None:
            self.uuid = runtimeContext.cwl_runner_job.get('uuid')

        jobiter = tool.job(job_order,
                           self.output_callback,
                           runtimeContext)

        if runtimeContext.submit and not runtimeContext.wait:
            runnerjob = next(jobiter)
            runnerjob.run(runtimeContext)
            return (runnerjob.uuid, "success")

        current_container = arvados_cwl.util.get_current_container(self.api, self.num_retries, logger)
        if current_container:
            logger.info("Running inside container %s", current_container.get("uuid"))

        self.poll_api = arvados.api('v1', timeout=runtimeContext.http_timeout)
        self.polling_thread = threading.Thread(target=self.poll_states)
        self.polling_thread.start()

        self.task_queue = TaskQueue(self.workflow_eval_lock, self.thread_count)

        try:
            self.workflow_eval_lock.acquire()

            # Holds the lock while this code runs and releases it when
            # it is safe to do so in self.workflow_eval_lock.wait(),
            # at which point on_message can update job state and
            # process output callbacks.

            loopperf = Perf(metrics, "jobiter")
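            # Perf is driven manually via __enter__/__exit__ so that only the
            # time spent pulling the next runnable out of the job iterator is
            # charged to the "jobiter" metric, not the body of the loop.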
            loopperf.__enter__()
            for runnable in jobiter:
                loopperf.__exit__()

                if self.stop_polling.is_set():
                    break

                if self.task_queue.error is not None:
                    raise self.task_queue.error

                if runnable:
                    with Perf(metrics, "run"):
                        self.start_run(runnable, runtimeContext)
                else:
                    if (self.task_queue.in_flight + len(self.processes)) > 0:
                        self.workflow_eval_lock.wait(3)
                    else:
                        logger.error("Workflow is deadlocked, no runnable processes and not waiting on any pending processes.")
                        break

                if self.stop_polling.is_set():
                    break

                loopperf.__enter__()
            loopperf.__exit__()

            while (self.task_queue.in_flight + len(self.processes)) > 0:
                if self.task_queue.error is not None:
                    raise self.task_queue.error
                self.workflow_eval_lock.wait(3)

        except UnsupportedRequirement:
            raise
        except:
            if sys.exc_info()[0] is KeyboardInterrupt or sys.exc_info()[0] is SystemExit:
                logger.error("Interrupted, workflow will be cancelled")
            elif isinstance(sys.exc_info()[1], WorkflowException):
                logger.error("Workflow execution failed:\n%s", sys.exc_info()[1], exc_info=(sys.exc_info()[1] if self.debug else False))
            else:
                logger.exception("Workflow execution failed")

            if self.pipeline:
                self.api.pipeline_instances().update(uuid=self.pipeline["uuid"],
                                                     body={"state": "Failed"}).execute(num_retries=self.num_retries)

            if self.work_api == "containers" and not current_container:
                # Not running in a crunch container, so cancel any outstanding processes.
                for p in self.processes:
                    try:
                        self.api.container_requests().update(uuid=p,
                                                             body={"priority": "0"}
                        ).execute(num_retries=self.num_retries)
                    except Exception:
                        pass
        finally:
            self.workflow_eval_lock.release()
            self.task_queue.drain()
            self.stop_polling.set()
            self.polling_thread.join()
            self.task_queue.join()

        if self.final_status == "UnsupportedRequirement":
            raise UnsupportedRequirement("Check log for details.")

        if self.final_output is None:
            raise WorkflowException("Workflow did not return a result.")

        if runtimeContext.submit and isinstance(tool, Runner):
            logger.info("Final output collection %s", tool.final_output)
        else:
            if self.output_name is None:
                self.output_name = "Output of %s" % (shortname(tool.tool["id"]))
            if self.output_tags is None:
                self.output_tags = ""

            storage_classes = runtimeContext.storage_classes.strip().split(",")
            self.final_output, self.final_output_collection = self.make_output_collection(self.output_name, storage_classes, self.output_tags, self.final_output)
            self.set_crunch_output()

        if runtimeContext.compute_checksum:
            adjustDirObjs(self.final_output, partial(get_listing, self.fs_access))
            adjustFileObjs(self.final_output, partial(compute_checksums, self.fs_access))

        if self.trash_intermediate and self.final_status == "success":
            self.trash_intermediate_output()

        return (self.final_output, self.final_status)
Example #4
    def arv_executor(self, tool, job_order, **kwargs):
        self.debug = kwargs.get("debug")

        tool.visit(self.check_features)

        self.project_uuid = kwargs.get("project_uuid")
        self.pipeline = None
        make_fs_access = kwargs.get("make_fs_access") or partial(
            CollectionFsAccess,
            api_client=self.api,
            keep_client=self.keep_client)
        self.fs_access = make_fs_access(kwargs["basedir"])

        if not kwargs.get("name"):
            kwargs["name"] = self.name = tool.tool.get(
                "label") or tool.metadata.get("label") or os.path.basename(
                    tool.tool["id"])

        # Upload direct dependencies of workflow steps, get back mapping of files to keep references.
        # Also uploads docker images.
        upload_workflow_deps(self, tool)

        # Reload tool object which may have been updated by
        # upload_workflow_deps
        tool = self.arv_make_tool(tool.doc_loader.idx[tool.tool["id"]],
                                  makeTool=self.arv_make_tool,
                                  loader=tool.doc_loader,
                                  avsc_names=tool.doc_schema,
                                  metadata=tool.metadata)

        # Upload local file references in the job order.
        job_order = upload_job_order(self, "%s input" % kwargs["name"], tool,
                                     job_order)

        existing_uuid = kwargs.get("update_workflow")
        if existing_uuid or kwargs.get("create_workflow"):
            # Create a pipeline template or workflow record and exit.
            if self.work_api == "jobs":
                tmpl = RunnerTemplate(
                    self,
                    tool,
                    job_order,
                    kwargs.get("enable_reuse"),
                    uuid=existing_uuid,
                    submit_runner_ram=kwargs.get("submit_runner_ram"),
                    name=kwargs["name"])
                tmpl.save()
                # cwltool.main will write our return value to stdout.
                return (tmpl.uuid, "success")
            elif self.work_api == "containers":
                return (upload_workflow(
                    self,
                    tool,
                    job_order,
                    self.project_uuid,
                    uuid=existing_uuid,
                    submit_runner_ram=kwargs.get("submit_runner_ram"),
                    name=kwargs["name"]), "success")

        self.ignore_docker_for_reuse = kwargs.get("ignore_docker_for_reuse")

        kwargs["make_fs_access"] = make_fs_access
        kwargs["enable_reuse"] = kwargs.get("enable_reuse")
        kwargs["use_container"] = True
        kwargs["tmpdir_prefix"] = "tmp"
        kwargs["compute_checksum"] = kwargs.get("compute_checksum")

        if self.work_api == "containers":
            kwargs["outdir"] = "/var/spool/cwl"
            kwargs["docker_outdir"] = "/var/spool/cwl"
            kwargs["tmpdir"] = "/tmp"
            kwargs["docker_tmpdir"] = "/tmp"
        elif self.work_api == "jobs":
            kwargs["outdir"] = "$(task.outdir)"
            kwargs["docker_outdir"] = "$(task.outdir)"
            kwargs["tmpdir"] = "$(task.tmpdir)"

        runnerjob = None
        if kwargs.get("submit"):
            # Submit a runner job to run the workflow for us.
            if self.work_api == "containers":
                if tool.tool["class"] == "CommandLineTool":
                    kwargs["runnerjob"] = tool.tool["id"]
                    upload_dependencies(self, kwargs["name"], tool.doc_loader,
                                        tool.tool, tool.tool["id"], False)
                    runnerjob = tool.job(job_order, self.output_callback,
                                         **kwargs).next()
                else:
                    runnerjob = RunnerContainer(
                        self,
                        tool,
                        job_order,
                        kwargs.get("enable_reuse"),
                        self.output_name,
                        self.output_tags,
                        submit_runner_ram=kwargs.get("submit_runner_ram"),
                        name=kwargs.get("name"),
                        on_error=kwargs.get("on_error"),
                        submit_runner_image=kwargs.get("submit_runner_image"))
            elif self.work_api == "jobs":
                runnerjob = RunnerJob(
                    self,
                    tool,
                    job_order,
                    kwargs.get("enable_reuse"),
                    self.output_name,
                    self.output_tags,
                    submit_runner_ram=kwargs.get("submit_runner_ram"),
                    name=kwargs.get("name"),
                    on_error=kwargs.get("on_error"),
                    submit_runner_image=kwargs.get("submit_runner_image"))

        if not kwargs.get("submit") and "cwl_runner_job" not in kwargs and self.work_api == "jobs":
            # Create pipeline for local run
            self.pipeline = self.api.pipeline_instances().create(
                body={
                    "owner_uuid": self.project_uuid,
                    "name": kwargs["name"] if kwargs.get("name") else shortname(tool.tool["id"]),
                    "components": {},
                    "state": "RunningOnClient"
                }).execute(num_retries=self.num_retries)
            logger.info("Pipeline instance %s", self.pipeline["uuid"])

        if runnerjob and not kwargs.get("wait"):
            runnerjob.run(wait=kwargs.get("wait"))
            return (runnerjob.uuid, "success")

        self.poll_api = arvados.api('v1')
        self.polling_thread = threading.Thread(target=self.poll_states)
        self.polling_thread.start()

        if runnerjob:
            jobiter = iter((runnerjob, ))
        else:
            if "cwl_runner_job" in kwargs:
                self.uuid = kwargs.get("cwl_runner_job").get('uuid')
            jobiter = tool.job(job_order, self.output_callback, **kwargs)

        try:
            self.cond.acquire()
            # Will continue to hold the lock for the duration of this code
            # except when in cond.wait(), at which point on_message can update
            # job state and process output callbacks.

            loopperf = Perf(metrics, "jobiter")
            loopperf.__enter__()
            for runnable in jobiter:
                loopperf.__exit__()

                if self.stop_polling.is_set():
                    break

                if runnable:
                    with Perf(metrics, "run"):
                        runnable.run(**kwargs)
                else:
                    if self.processes:
                        self.cond.wait(1)
                    else:
                        logger.error(
                            "Workflow is deadlocked, no runnable jobs and not waiting on any pending jobs."
                        )
                        break
                loopperf.__enter__()
            loopperf.__exit__()

            while self.processes:
                self.cond.wait(1)

        except UnsupportedRequirement:
            raise
        except:
            if sys.exc_info()[0] is KeyboardInterrupt:
                logger.error("Interrupted, marking pipeline as failed")
            else:
                logger.error(
                    "Execution failed: %s",
                    sys.exc_info()[1],
                    exc_info=(sys.exc_info()[1] if self.debug else False))
            if self.pipeline:
                self.api.pipeline_instances().update(
                    uuid=self.pipeline["uuid"], body={
                        "state": "Failed"
                    }).execute(num_retries=self.num_retries)
            if runnerjob and runnerjob.uuid and self.work_api == "containers":
                self.api.container_requests().update(
                    uuid=runnerjob.uuid, body={
                        "priority": "0"
                    }).execute(num_retries=self.num_retries)
        finally:
            self.cond.release()
            self.stop_polling.set()
            self.polling_thread.join()

        if self.final_status == "UnsupportedRequirement":
            raise UnsupportedRequirement("Check log for details.")

        if self.final_output is None:
            raise WorkflowException("Workflow did not return a result.")

        if kwargs.get("submit") and isinstance(runnerjob, Runner):
            logger.info("Final output collection %s", runnerjob.final_output)
        else:
            if self.output_name is None:
                self.output_name = "Output of %s" % (shortname(
                    tool.tool["id"]))
            if self.output_tags is None:
                self.output_tags = ""
            self.final_output, self.final_output_collection = self.make_output_collection(
                self.output_name, self.output_tags, self.final_output)
            self.set_crunch_output()

        if kwargs.get("compute_checksum"):
            adjustDirObjs(self.final_output, partial(getListing,
                                                     self.fs_access))
            adjustFileObjs(self.final_output,
                           partial(compute_checksums, self.fs_access))

        return (self.final_output, self.final_status)
Example #5
    def run(self, dry_run=False, pull_image=True, **kwargs):
        container_request = {
            "command": self.command_line,
            "owner_uuid": self.arvrunner.project_uuid,
            "name": self.name,
            "output_path": self.outdir,
            "cwd": self.outdir,
            "priority": 1,
            "state": "Committed"
        }
        runtime_constraints = {}
        mounts = {
            self.outdir: {
                "kind": "tmp"
            }
        }

        dirs = set()
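        # Two passes over the path map: first mount each collection referenced
        # as a whole directory, then mount individual files whose parent
        # collection is not already mounted. Mapped targets take the form
        # "/keep/<portable data hash>/...", so p[6:] strips the "/keep/"
        # prefix.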
        for f in self.pathmapper.files():
            _, p, tp = self.pathmapper.mapper(f)
            if tp == "Directory" and '/' not in p[6:]:
                mounts[p] = {
                    "kind": "collection",
                    "portable_data_hash": p[6:]
                }
                dirs.add(p[6:])
        for f in self.pathmapper.files():
            _, p, tp = self.pathmapper.mapper(f)
            if p[6:].split("/")[0] not in dirs:
                mounts[p] = {
                    "kind": "collection",
                    "portable_data_hash": p[6:]
                }

        if self.generatefiles["listing"]:
            raise UnsupportedRequirement("Generate files not supported")

        container_request["environment"] = {"TMPDIR": self.tmpdir, "HOME": self.outdir}
        if self.environment:
            container_request["environment"].update(self.environment)

        if self.stdin:
            raise UnsupportedRequirement("Stdin redirection currently not suppported")

        if self.stderr:
            raise UnsupportedRequirement("Stderr redirection currently not suppported")

        if self.stdout:
            mounts["stdout"] = {"kind": "file",
                                "path": "%s/%s" % (self.outdir, self.stdout)}

        (docker_req, docker_is_req) = get_feature(self, "DockerRequirement")
        if not docker_req:
            docker_req = {"dockerImageId": "arvados/jobs"}

        container_request["container_image"] = arv_docker_get_image(self.arvrunner.api,
                                                                     docker_req,
                                                                     pull_image,
                                                                     self.arvrunner.project_uuid)

        resources = self.builder.resources
        if resources is not None:
            runtime_constraints["vcpus"] = resources.get("cores", 1)
            runtime_constraints["ram"] = resources.get("ram") * 2**20

        container_request["mounts"] = mounts
        container_request["runtime_constraints"] = runtime_constraints

        try:
            response = self.arvrunner.api.container_requests().create(
                body=container_request
            ).execute(num_retries=self.arvrunner.num_retries)

            self.arvrunner.processes[response["container_uuid"]] = self

            logger.info("Container %s (%s) request state is %s", self.name, response["uuid"], response["state"])

            if response["state"] == "Final":
                self.done(response)
        except Exception as e:
            logger.error("Got error %s" % str(e))
            self.output_callback({}, "permanentFail")
Example #6
    def run(self, runtimeContext):
        # type: (RuntimeContext) -> None

        (docker_req, docker_is_req) = self.get_requirement("DockerRequirement")
        self.prov_obj = runtimeContext.prov_obj
        img_id = None
        env = cast(MutableMapping[Text, Text], os.environ)
        user_space_docker_cmd = runtimeContext.user_space_docker_cmd
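        # Image resolution order: a user-space Docker replacement uses the
        # image reference as-is; otherwise the image is resolved (and possibly
        # pulled) from the DockerRequirement, falling back to the builder's
        # default container if one is configured.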
        if docker_req and user_space_docker_cmd:
            # For user-space docker implementations, a local image name or ID
            # takes precedence over a network pull
            if 'dockerImageId' in docker_req:
                img_id = str(docker_req["dockerImageId"])
            elif 'dockerPull' in docker_req:
                img_id = str(docker_req["dockerPull"])
            # else:
            #     raise WorkflowException(SourceLine(docker_req).makeError(
            #         "Docker image must be specified as 'dockerImageId' or "
            #         "'dockerPull' when using user space implementations of "
            #         "Docker"))
        else:
            try:
                if docker_req and runtimeContext.use_container:
                    img_id = str(
                        self.get_from_requirements(
                            docker_req, True, runtimeContext.pull_image,
                            getdefault(runtimeContext.force_docker_pull, False),
                            getdefault(runtimeContext.tmp_outdir_prefix, DEFAULT_TMP_PREFIX)))
                if img_id is None:
                    if self.builder.find_default_container:
                        default_container = self.builder.find_default_container()
                        if default_container:
                            img_id = str(default_container)

                if docker_req and img_id is None and runtimeContext.use_container:
                    raise Exception("Docker image not available")

                if self.prov_obj and img_id and runtimeContext.process_run_id:
                    # TODO: Integrate with record_container_id
                    container_agent = self.prov_obj.document.agent(
                        uuid.uuid4().urn,
                        {"prov:type": PROV["SoftwareAgent"],
                         "cwlprov:image": img_id,
                         "prov:label": "Container execution of image %s" % img_id})
                    # FIXME: img_id is not a sha256 id, it might just be "debian:8"
                    #img_entity = document.entity("nih:sha-256;%s" % img_id,
                    #                  {"prov:label": "Container image %s" % img_id} )
                    # The image is the plan for this activity-agent association
                    #document.wasAssociatedWith(process_run_ID, container_agent, img_entity)
                    self.prov_obj.document.wasAssociatedWith(
                        runtimeContext.process_run_id, container_agent)
            except Exception as err:
                container = "Shifter"
                _logger.debug("%s error", container, exc_info=True)
                if docker_is_req:
                    raise UnsupportedRequirement(
                        "%s is required to run this tool: %s" % (container, err))
                else:
                    raise WorkflowException(
                        "{0} is not available for this tool, try "
                        "--no-container to disable {0}, or install "
                        "a user space Docker replacement like uDocker with "
                        "--user-space-docker-cmd.: {1}".format(container, err))

        self._setup(runtimeContext)
        stageFiles(self.pathmapper, ignoreWritable=True, symLink=True,
                   secret_store=runtimeContext.secret_store)
        runtime = self.create_runtime(env, runtimeContext, img_id)
        self._execute(runtime, env, runtimeContext)
Example #7
    def arv_executor(self, tool, job_order, runtimeContext, logger=None):
        self.debug = runtimeContext.debug

        tool.visit(self.check_features)

        self.project_uuid = runtimeContext.project_uuid
        self.pipeline = None
        self.fs_access = runtimeContext.make_fs_access(runtimeContext.basedir)
        self.secret_store = runtimeContext.secret_store

        self.trash_intermediate = runtimeContext.trash_intermediate
        if self.trash_intermediate and self.work_api != "containers":
            raise Exception(
                "--trash-intermediate is only supported with --api=containers."
            )

        self.intermediate_output_ttl = runtimeContext.intermediate_output_ttl
        if self.intermediate_output_ttl and self.work_api != "containers":
            raise Exception(
                "--intermediate-output-ttl is only supported with --api=containers."
            )
        if self.intermediate_output_ttl < 0:
            raise Exception(
                "Invalid value %d for --intermediate-output-ttl, cannot be less than zero"
                % self.intermediate_output_ttl)

        if runtimeContext.submit_request_uuid and self.work_api != "containers":
            raise Exception(
                "--submit-request-uuid requires containers API, but using '{}' api"
                .format(self.work_api))

        if not runtimeContext.name:
            runtimeContext.name = self.name = (tool.tool.get("label")
                                               or tool.metadata.get("label")
                                               or os.path.basename(tool.tool["id"]))

        # Upload direct dependencies of workflow steps, get back mapping of files to keep references.
        # Also uploads docker images.
        merged_map = upload_workflow_deps(self, tool)

        # Reload tool object which may have been updated by
        # upload_workflow_deps
        # Don't validate this time because it will just print redundant errors.
        loadingContext = self.loadingContext.copy()
        loadingContext.loader = tool.doc_loader
        loadingContext.avsc_names = tool.doc_schema
        loadingContext.metadata = tool.metadata
        loadingContext.do_validate = False

        tool = self.arv_make_tool(tool.doc_loader.idx[tool.tool["id"]],
                                  loadingContext)

        # Upload local file references in the job order.
        job_order = upload_job_order(self, "%s input" % runtimeContext.name,
                                     tool, job_order)

        existing_uuid = runtimeContext.update_workflow
        if existing_uuid or runtimeContext.create_workflow:
            # Create a pipeline template or workflow record and exit.
            if self.work_api == "jobs":
                tmpl = RunnerTemplate(
                    self,
                    tool,
                    job_order,
                    runtimeContext.enable_reuse,
                    uuid=existing_uuid,
                    submit_runner_ram=runtimeContext.submit_runner_ram,
                    name=runtimeContext.name,
                    merged_map=merged_map)
                tmpl.save()
                # cwltool.main will write our return value to stdout.
                return (tmpl.uuid, "success")
            elif self.work_api == "containers":
                return (upload_workflow(
                    self,
                    tool,
                    job_order,
                    self.project_uuid,
                    uuid=existing_uuid,
                    submit_runner_ram=runtimeContext.submit_runner_ram,
                    name=runtimeContext.name,
                    merged_map=merged_map), "success")

        self.ignore_docker_for_reuse = runtimeContext.ignore_docker_for_reuse
        self.eval_timeout = runtimeContext.eval_timeout

        runtimeContext = runtimeContext.copy()
        runtimeContext.use_container = True
        runtimeContext.tmpdir_prefix = "tmp"
        runtimeContext.work_api = self.work_api

        if self.work_api == "containers":
            if self.ignore_docker_for_reuse:
                raise Exception(
                    "--ignore-docker-for-reuse not supported with containers API."
                )
            runtimeContext.outdir = "/var/spool/cwl"
            runtimeContext.docker_outdir = "/var/spool/cwl"
            runtimeContext.tmpdir = "/tmp"
            runtimeContext.docker_tmpdir = "/tmp"
        elif self.work_api == "jobs":
            if runtimeContext.priority != DEFAULT_PRIORITY:
                raise Exception("--priority not implemented for jobs API.")
            runtimeContext.outdir = "$(task.outdir)"
            runtimeContext.docker_outdir = "$(task.outdir)"
            runtimeContext.tmpdir = "$(task.tmpdir)"

        if runtimeContext.priority < 1 or runtimeContext.priority > 1000:
            raise Exception("--priority must be in the range 1..1000.")

        runnerjob = None
        if runtimeContext.submit:
            # Submit a runner job to run the workflow for us.
            if self.work_api == "containers":
                if tool.tool["class"] == "CommandLineTool" and runtimeContext.wait:
                    runtimeContext.runnerjob = tool.tool["id"]
                    runnerjob = tool.job(job_order, self.output_callback,
                                         runtimeContext).next()
                else:
                    runnerjob = RunnerContainer(
                        self,
                        tool,
                        job_order,
                        runtimeContext.enable_reuse,
                        self.output_name,
                        self.output_tags,
                        submit_runner_ram=runtimeContext.submit_runner_ram,
                        name=runtimeContext.name,
                        on_error=runtimeContext.on_error,
                        submit_runner_image=runtimeContext.submit_runner_image,
                        intermediate_output_ttl=runtimeContext.intermediate_output_ttl,
                        merged_map=merged_map,
                        priority=runtimeContext.priority,
                        secret_store=self.secret_store)
            elif self.work_api == "jobs":
                runnerjob = RunnerJob(
                    self,
                    tool,
                    job_order,
                    runtimeContext.enable_reuse,
                    self.output_name,
                    self.output_tags,
                    submit_runner_ram=runtimeContext.submit_runner_ram,
                    name=runtimeContext.name,
                    on_error=runtimeContext.on_error,
                    submit_runner_image=runtimeContext.submit_runner_image,
                    merged_map=merged_map)
        elif runtimeContext.cwl_runner_job is None and self.work_api == "jobs":
            # Create pipeline for local run
            self.pipeline = self.api.pipeline_instances().create(
                body={
                    "owner_uuid": self.project_uuid,
                    "name": runtimeContext.name if runtimeContext.name else shortname(tool.tool["id"]),
                    "components": {},
                    "state": "RunningOnClient"
                }).execute(num_retries=self.num_retries)
            logger.info("Pipeline instance %s", self.pipeline["uuid"])

        if runnerjob and not runtimeContext.wait:
            submitargs = runtimeContext.copy()
            submitargs.submit = False
            runnerjob.run(submitargs)
            return (runnerjob.uuid, "success")

        self.poll_api = arvados.api('v1')
        self.polling_thread = threading.Thread(target=self.poll_states)
        self.polling_thread.start()

        self.task_queue = TaskQueue(self.workflow_eval_lock, self.thread_count)

        if runnerjob:
            jobiter = iter((runnerjob, ))
        else:
            if runtimeContext.cwl_runner_job is not None:
                self.uuid = runtimeContext.cwl_runner_job.get('uuid')
            jobiter = tool.job(job_order, self.output_callback, runtimeContext)

        try:
            self.workflow_eval_lock.acquire()
            # Holds the lock while this code runs and releases it when
            # it is safe to do so in self.workflow_eval_lock.wait(),
            # at which point on_message can update job state and
            # process output callbacks.

            loopperf = Perf(metrics, "jobiter")
            loopperf.__enter__()
            for runnable in jobiter:
                loopperf.__exit__()

                if self.stop_polling.is_set():
                    break

                if self.task_queue.error is not None:
                    raise self.task_queue.error

                if runnable:
                    with Perf(metrics, "run"):
                        self.start_run(runnable, runtimeContext)
                else:
                    if (self.task_queue.in_flight + len(self.processes)) > 0:
                        self.workflow_eval_lock.wait(3)
                    else:
                        logger.error(
                            "Workflow is deadlocked, no runnable processes and not waiting on any pending processes."
                        )
                        break
                loopperf.__enter__()
            loopperf.__exit__()

            while (self.task_queue.in_flight + len(self.processes)) > 0:
                if self.task_queue.error is not None:
                    raise self.task_queue.error
                self.workflow_eval_lock.wait(3)

        except UnsupportedRequirement:
            raise
        except:
            if sys.exc_info()[0] is KeyboardInterrupt or sys.exc_info()[0] is SystemExit:
                logger.error("Interrupted, workflow will be cancelled")
            else:
                logger.error(
                    "Execution failed: %s",
                    sys.exc_info()[1],
                    exc_info=(sys.exc_info()[1] if self.debug else False))
            if self.pipeline:
                self.api.pipeline_instances().update(
                    uuid=self.pipeline["uuid"], body={
                        "state": "Failed"
                    }).execute(num_retries=self.num_retries)
            if runnerjob and runnerjob.uuid and self.work_api == "containers":
                self.api.container_requests().update(
                    uuid=runnerjob.uuid, body={
                        "priority": "0"
                    }).execute(num_retries=self.num_retries)
        finally:
            self.workflow_eval_lock.release()
            self.task_queue.drain()
            self.stop_polling.set()
            self.polling_thread.join()
            self.task_queue.join()

        if self.final_status == "UnsupportedRequirement":
            raise UnsupportedRequirement("Check log for details.")

        if self.final_output is None:
            raise WorkflowException("Workflow did not return a result.")

        if runtimeContext.submit and isinstance(runnerjob, Runner):
            logger.info("Final output collection %s", runnerjob.final_output)
        else:
            if self.output_name is None:
                self.output_name = "Output of %s" % (shortname(
                    tool.tool["id"]))
            if self.output_tags is None:
                self.output_tags = ""

            storage_classes = runtimeContext.storage_classes.strip().split(",")
            self.final_output, self.final_output_collection = self.make_output_collection(
                self.output_name, storage_classes, self.output_tags,
                self.final_output)
            self.set_crunch_output()

        if runtimeContext.compute_checksum:
            adjustDirObjs(self.final_output,
                          partial(get_listing, self.fs_access))
            adjustFileObjs(self.final_output,
                           partial(compute_checksums, self.fs_access))

        if self.trash_intermediate and self.final_status == "success":
            self.trash_intermediate_output()

        return (self.final_output, self.final_status)
Example #8
    def run(self, dry_run=False, pull_image=True, **kwargs):
        container_request = {
            "command": self.command_line,
            "owner_uuid": self.arvrunner.project_uuid,
            "name": self.name,
            "output_path": self.outdir,
            "cwd": self.outdir,
            "priority": 1,
            "state": "Committed",
            "properties": {}
        }
        runtime_constraints = {}
        mounts = {self.outdir: {"kind": "tmp"}}
        scheduling_parameters = {}

        dirs = set()
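        # Two passes over the path map: first mount each collection referenced
        # as a whole directory, then mount individual files whose parent
        # collection was not already mounted, recording the path within the
        # collection.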
        for f in self.pathmapper.files():
            pdh, p, tp = self.pathmapper.mapper(f)
            if tp == "Directory" and '/' not in pdh:
                mounts[p] = {
                    "kind": "collection",
                    "portable_data_hash": pdh[5:]
                }
                dirs.add(pdh)

        for f in self.pathmapper.files():
            res, p, tp = self.pathmapper.mapper(f)
            if res.startswith("keep:"):
                res = res[5:]
            elif res.startswith("/keep/"):
                res = res[6:]
            else:
                continue
            sp = res.split("/", 1)
            pdh = sp[0]
            if pdh not in dirs:
                mounts[p] = {"kind": "collection", "portable_data_hash": pdh}
                if len(sp) == 2:
                    mounts[p]["path"] = sp[1]

        with Perf(metrics, "generatefiles %s" % self.name):
            if self.generatefiles["listing"]:
                vwd = arvados.collection.Collection(
                    api_client=self.arvrunner.api,
                    keep_client=self.arvrunner.keep_client,
                    num_retries=self.arvrunner.num_retries)
                generatemapper = NoFollowPathMapper([self.generatefiles],
                                                    "",
                                                    "",
                                                    separateDirs=False)

                with Perf(metrics, "createfiles %s" % self.name):
                    for f, p in generatemapper.items():
                        if not p.target:
                            pass
                        elif p.type in ("File", "Directory"):
                            source, path = self.arvrunner.fs_access.get_collection(
                                p.resolved)
                            vwd.copy(path, p.target, source_collection=source)
                        elif p.type == "CreateFile":
                            with vwd.open(p.target, "w") as n:
                                n.write(p.resolved.encode("utf-8"))

                with Perf(metrics, "generatefiles.save_new %s" % self.name):
                    vwd.save_new()

                for f, p in generatemapper.items():
                    if not p.target:
                        continue
                    mountpoint = "%s/%s" % (self.outdir, p.target)
                    mounts[mountpoint] = {
                        "kind": "collection",
                        "portable_data_hash": vwd.portable_data_hash(),
                        "path": p.target
                    }

        container_request["environment"] = {
            "TMPDIR": self.tmpdir,
            "HOME": self.outdir
        }
        if self.environment:
            container_request["environment"].update(self.environment)

        if self.stdin:
            raise UnsupportedRequirement(
                "Stdin redirection currently not suppported")

        if self.stderr:
            raise UnsupportedRequirement(
                "Stderr redirection currently not suppported")

        if self.stdout:
            mounts["stdout"] = {
                "kind": "file",
                "path": "%s/%s" % (self.outdir, self.stdout)
            }

        (docker_req, docker_is_req) = get_feature(self, "DockerRequirement")
        if not docker_req:
            docker_req = {"dockerImageId": "arvados/jobs"}

        container_request["container_image"] = arv_docker_get_image(
            self.arvrunner.api, docker_req, pull_image,
            self.arvrunner.project_uuid)

        resources = self.builder.resources
        if resources is not None:
            runtime_constraints["vcpus"] = resources.get("cores", 1)
            runtime_constraints["ram"] = resources.get("ram") * 2**20

        api_req, _ = get_feature(self, "http://arvados.org/cwl#APIRequirement")
        if api_req:
            runtime_constraints["API"] = True

        runtime_req, _ = get_feature(
            self, "http://arvados.org/cwl#RuntimeConstraints")
        if runtime_req:
            if "keep_cache" in runtime_req:
                runtime_constraints["keep_cache_ram"] = runtime_req["keep_cache"] * 2**20

        partition_req, _ = get_feature(
            self, "http://arvados.org/cwl#PartitionRequirement")
        if partition_req:
            scheduling_parameters["partitions"] = aslist(
                partition_req["partition"])

        container_request["mounts"] = mounts
        container_request["runtime_constraints"] = runtime_constraints
        container_request["use_existing"] = kwargs.get("enable_reuse", True)
        container_request["scheduling_parameters"] = scheduling_parameters

        if kwargs.get("runnerjob", "").startswith("arvwf:"):
            wfuuid = kwargs["runnerjob"][6:kwargs["runnerjob"].index("#")]
            wfrecord = self.arvrunner.api.workflows().get(uuid=wfuuid).execute(
                num_retries=self.arvrunner.num_retries)
            if container_request["name"] == "main":
                container_request["name"] = wfrecord["name"]
            container_request["properties"]["template_uuid"] = wfuuid

        try:
            response = self.arvrunner.api.container_requests().create(
                body=container_request).execute(
                    num_retries=self.arvrunner.num_retries)

            self.uuid = response["uuid"]
            self.arvrunner.processes[self.uuid] = self

            if response["state"] == "Final":
                logger.info("%s reused container %s",
                            self.arvrunner.label(self),
                            response["container_uuid"])
                self.done(response)
            else:
                logger.info("%s %s state is %s", self.arvrunner.label(self),
                            response["uuid"], response["state"])
        except Exception as e:
            logger.error("%s got error %s" %
                         (self.arvrunner.label(self), str(e)))
            self.output_callback({}, "permanentFail")
Example #9
    def arvExecutor(self, tool, job_order, **kwargs):
        self.debug = kwargs.get("debug")

        tool.visit(self.check_writable)

        if kwargs.get("quiet"):
            logger.setLevel(logging.WARN)
            logging.getLogger('arvados.arv-run').setLevel(logging.WARN)

        useruuid = self.api.users().current().execute()["uuid"]
        self.project_uuid = kwargs.get("project_uuid") if kwargs.get(
            "project_uuid") else useruuid
        self.pipeline = None
        make_fs_access = kwargs.get("make_fs_access") or partial(
            CollectionFsAccess, api_client=self.api)
        self.fs_access = make_fs_access(kwargs["basedir"])

        if kwargs.get("create_template"):
            tmpl = RunnerTemplate(self, tool, job_order,
                                  kwargs.get("enable_reuse"))
            tmpl.save()
            # cwltool.main will write our return value to stdout.
            return tmpl.uuid

        self.debug = kwargs.get("debug")
        self.ignore_docker_for_reuse = kwargs.get("ignore_docker_for_reuse")

        kwargs["make_fs_access"] = make_fs_access
        kwargs["enable_reuse"] = kwargs.get("enable_reuse")
        kwargs["use_container"] = True
        kwargs["tmpdir_prefix"] = "tmp"
        kwargs["on_error"] = "continue"
        kwargs["compute_checksum"] = kwargs.get("compute_checksum")

        if self.work_api == "containers":
            kwargs["outdir"] = "/var/spool/cwl"
            kwargs["docker_outdir"] = "/var/spool/cwl"
            kwargs["tmpdir"] = "/tmp"
            kwargs["docker_tmpdir"] = "/tmp"
        elif self.work_api == "jobs":
            kwargs["outdir"] = "$(task.outdir)"
            kwargs["docker_outdir"] = "$(task.outdir)"
            kwargs["tmpdir"] = "$(task.tmpdir)"

        runnerjob = None
        if kwargs.get("submit"):
            if self.work_api == "containers":
                if tool.tool["class"] == "CommandLineTool":
                    runnerjob = tool.job(job_order, self.output_callback,
                                         **kwargs).next()
                else:
                    runnerjob = RunnerContainer(self, tool, job_order,
                                                kwargs.get("enable_reuse"))
            else:
                runnerjob = RunnerJob(self, tool, job_order,
                                      kwargs.get("enable_reuse"))

        if not kwargs.get("submit") and "cwl_runner_job" not in kwargs and self.work_api != "containers":
            # Create pipeline for local run
            self.pipeline = self.api.pipeline_instances().create(
                body={
                    "owner_uuid": self.project_uuid,
                    "name": shortname(tool.tool["id"]),
                    "components": {},
                    "state": "RunningOnClient"
                }).execute(num_retries=self.num_retries)
            logger.info("Pipeline instance %s", self.pipeline["uuid"])

        if runnerjob and not kwargs.get("wait"):
            runnerjob.run()
            return runnerjob.uuid

        self.poll_api = arvados.api('v1')
        self.polling_thread = threading.Thread(target=self.poll_states)
        self.polling_thread.start()
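        # poll_states runs in the background and reports Arvados state changes
        # through on_message, which updates job state under self.cond so the
        # cond.wait(1) loops below can wake up and dispatch the next runnable
        # job.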

        if runnerjob:
            jobiter = iter((runnerjob, ))
        else:
            if "cwl_runner_job" in kwargs:
                self.uuid = kwargs.get("cwl_runner_job").get('uuid')
            jobiter = tool.job(job_order, self.output_callback, **kwargs)

        try:
            self.cond.acquire()
            # Will continue to hold the lock for the duration of this code
            # except when in cond.wait(), at which point on_message can update
            # job state and process output callbacks.

            for runnable in jobiter:
                if runnable:
                    runnable.run(**kwargs)
                else:
                    if self.processes:
                        self.cond.wait(1)
                    else:
                        logger.error(
                            "Workflow is deadlocked, no runnable jobs and not waiting on any pending jobs."
                        )
                        break

            while self.processes:
                self.cond.wait(1)

        except UnsupportedRequirement:
            raise
        except:
            if sys.exc_info()[0] is KeyboardInterrupt:
                logger.error("Interrupted, marking pipeline as failed")
            else:
                logger.error(
                    "Caught unhandled exception, marking pipeline as failed.  Error was: %s",
                    sys.exc_info()[1],
                    exc_info=(sys.exc_info()[1] if self.debug else False))
            if self.pipeline:
                self.api.pipeline_instances().update(
                    uuid=self.pipeline["uuid"], body={
                        "state": "Failed"
                    }).execute(num_retries=self.num_retries)
            if runnerjob and runnerjob.uuid and self.work_api == "containers":
                self.api.container_requests().update(
                    uuid=runnerjob.uuid, body={
                        "priority": "0"
                    }).execute(num_retries=self.num_retries)
        finally:
            self.cond.release()
            self.stop_polling.set()
            self.polling_thread.join()

        if self.final_status == "UnsupportedRequirement":
            raise UnsupportedRequirement("Check log for details.")

        if self.final_status != "success":
            raise WorkflowException("Workflow failed.")

        if self.final_output is None:
            raise WorkflowException("Workflow did not return a result.")

        if kwargs.get("compute_checksum"):
            adjustFileObjs(self.final_output,
                           partial(compute_checksums, self.fs_access))

        return self.final_output
Example #10
    def run(self, dry_run=False, pull_image=True, **kwargs):
        container_request = {
            "command": self.command_line,
            "owner_uuid": self.arvrunner.project_uuid,
            "name": self.name,
            "output_path": self.outdir,
            "cwd": self.outdir,
            "priority": 1,
            "state": "Committed",
            "properties": {}
        }
        runtime_constraints = {}
        mounts = {self.outdir: {"kind": "tmp"}}
        scheduling_parameters = {}

        dirs = set()
        for f in self.pathmapper.files():
            _, p, tp = self.pathmapper.mapper(f)
            if tp == "Directory" and '/' not in p[6:]:
                mounts[p] = {"kind": "collection", "portable_data_hash": p[6:]}
                dirs.add(p[6:])
        for f in self.pathmapper.files():
            _, p, tp = self.pathmapper.mapper(f)
            if p[6:].split("/")[0] not in dirs:
                mounts[p] = {"kind": "collection", "portable_data_hash": p[6:]}

        if self.generatefiles["listing"]:
            raise UnsupportedRequirement(
                "InitialWorkDirRequirement not supported with --api=containers"
            )

        container_request["environment"] = {
            "TMPDIR": self.tmpdir,
            "HOME": self.outdir
        }
        if self.environment:
            container_request["environment"].update(self.environment)

        if self.stdin:
            raise UnsupportedRequirement(
                "Stdin redirection currently not suppported")

        if self.stderr:
            raise UnsupportedRequirement(
                "Stderr redirection currently not suppported")

        if self.stdout:
            mounts["stdout"] = {
                "kind": "file",
                "path": "%s/%s" % (self.outdir, self.stdout)
            }

        (docker_req, docker_is_req) = get_feature(self, "DockerRequirement")
        if not docker_req:
            docker_req = {"dockerImageId": arvados_jobs_image(self.arvrunner)}

        container_request["container_image"] = arv_docker_get_image(
            self.arvrunner.api, docker_req, pull_image,
            self.arvrunner.project_uuid)

        resources = self.builder.resources
        if resources is not None:
            runtime_constraints["vcpus"] = resources.get("cores", 1)
            runtime_constraints["ram"] = resources.get("ram") * 2**20

        api_req, _ = get_feature(self, "http://arvados.org/cwl#APIRequirement")
        if api_req:
            runtime_constraints["API"] = True

        runtime_req, _ = get_feature(
            self, "http://arvados.org/cwl#RuntimeConstraints")
        if runtime_req:
            if "keep_cache" in runtime_req:
                runtime_constraints["keep_cache_ram"] = runtime_req[
                    "keep_cache"]

        partition_req, _ = get_feature(
            self, "http://arvados.org/cwl#PartitionRequirement")
        if partition_req:
            scheduling_parameters["partitions"] = aslist(
                partition_req["partition"])

        container_request["mounts"] = mounts
        container_request["runtime_constraints"] = runtime_constraints
        container_request["use_existing"] = kwargs.get("enable_reuse", True)
        container_request["scheduling_parameters"] = scheduling_parameters

        if kwargs.get("runnerjob", "").startswith("arvwf:"):
            wfuuid = kwargs["runnerjob"][6:kwargs["runnerjob"].index("#")]
            wfrecord = self.arvrunner.api.workflows().get(uuid=wfuuid).execute(
                num_retries=self.arvrunner.num_retries)
            if container_request["name"] == "main":
                container_request["name"] = wfrecord["name"]
            container_request["properties"]["template_uuid"] = wfuuid

        try:
            response = self.arvrunner.api.container_requests().create(
                body=container_request).execute(
                    num_retries=self.arvrunner.num_retries)

            self.uuid = response["uuid"]
            self.arvrunner.processes[self.uuid] = self

            logger.info("Container request %s (%s) state is %s", self.name,
                        response["uuid"], response["state"])

            if response["state"] == "Final":
                self.done(response)
        except Exception as e:
            logger.error("Got error %s" % str(e))
            self.output_callback({}, "permanentFail")