Code example #1
    def done(self, record):
        if record["state"] == "Complete":
            if record.get("exit_code") is not None:
                if record["exit_code"] == 33:
                    processStatus = "UnsupportedRequirement"
                elif record["exit_code"] == 0:
                    processStatus = "success"
                else:
                    processStatus = "permanentFail"
            else:
                processStatus = "success"
        else:
            processStatus = "permanentFail"

        outputs = None
        try:
            try:
                outc = arvados.collection.Collection(record["output"])
                with outc.open("cwl.output.json") as f:
                    outputs = json.load(f)
                def keepify(fileobj):
                    path = fileobj["location"]
                    if not path.startswith("keep:"):
                        fileobj["location"] = "keep:%s/%s" % (record["output"], path)
                adjustFileObjs(outputs, keepify)
                adjustDirObjs(outputs, keepify)
            except Exception as e:
                logger.error("While getting final output object: %s", e)
            self.arvrunner.output_callback(outputs, processStatus)
        finally:
            del self.arvrunner.processes[record["uuid"]]
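Every example on this page uses the same visitor idiom: adjustFileObjs and adjustDirObjs walk a nested CWL document or output object and apply a callback to each dict whose "class" is "File" or "Directory". The sketch below is a minimal, self-contained illustration of that idiom using the keepify() rewrite from code example #1; the local visitor is only a stand-in for cwltool's own helpers, and the record/output values are made-up placeholders.

import json

# Stand-in for cwltool's adjustFileObjs / adjustDirObjs: recursively apply
# `op` to every {"class": "File"} / {"class": "Directory"} dict inside `obj`.
def visit_class_objs(obj, cls, op):
    if isinstance(obj, dict):
        if obj.get("class") == cls:
            op(obj)
        for value in obj.values():
            visit_class_objs(value, cls, op)
    elif isinstance(obj, list):
        for value in obj:
            visit_class_objs(value, cls, op)

# Made-up record and output object, for illustration only.
record = {"output": "a1b2c3d4e5f6a1b2c3d4e5f6a1b2c3d4+123"}
outputs = {"result": {"class": "File", "location": "out.txt"}}

# Same rewrite as keepify() above: prefix relative locations with the
# keep: URI of the output collection.
def keepify(fileobj):
    path = fileobj["location"]
    if not path.startswith("keep:"):
        fileobj["location"] = "keep:%s/%s" % (record["output"], path)

visit_class_objs(outputs, "File", keepify)
visit_class_objs(outputs, "Directory", keepify)
print(json.dumps(outputs, indent=2))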
Code example #2
File: test_pack.py  Project: illusional/cwltool
def test_pack_input_named_name():
    loadingContext, workflowobj, uri = fetch_document(
        get_data("tests/wf/trick_revsort.cwl")
    )
    loadingContext.do_update = False
    loadingContext, uri = resolve_and_validate_document(
        loadingContext, workflowobj, uri
    )
    processobj = loadingContext.loader.resolve_ref(uri)[0]

    with open(get_data("tests/wf/expect_trick_packed.cwl")) as packed_file:
        expect_packed = yaml.round_trip_load(packed_file)

    packed = cwltool.pack.pack(
        loadingContext.loader, processobj, uri, loadingContext.metadata
    )
    adjustFileObjs(
        packed, partial(make_relative, os.path.abspath(get_data("tests/wf")))
    )
    adjustDirObjs(packed, partial(make_relative, os.path.abspath(get_data("tests/wf"))))

    assert "$schemas" in packed
    assert len(packed["$schemas"]) == len(expect_packed["$schemas"])
    del packed["$schemas"]
    del expect_packed["$schemas"]

    assert packed == expect_packed
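The packing tests rely on functools.partial to turn a two-argument helper such as make_relative(base, fileobj) into the one-argument callback that adjustFileObjs and adjustDirObjs call. A simplified, hypothetical version of that binding is sketched below; the make_relative here is an illustration, not cwltool's implementation.

import os
from functools import partial

# Simplified stand-in for make_relative(base, fileobj): rewrite an absolute
# file:// location so that it is relative to `base`.
def make_relative(base, fileobj):
    loc = fileobj.get("location", "")
    if loc.startswith("file://"):
        fileobj["location"] = os.path.relpath(loc[len("file://"):], base)

packed_file = {"class": "File",
               "location": "file:///home/user/tests/wf/revsort.cwl"}

# partial() binds the base directory, leaving a callback that takes only the
# File/Directory dict, which is the signature the adjust* visitors expect.
fix = partial(make_relative, "/home/user/tests/wf")
fix(packed_file)
print(packed_file["location"])  # -> revsort.cwl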
Code example #3
    def done(self, record):
        """Base method for handling a completed runner."""

        try:
            if record["state"] == "Complete":
                if record.get("exit_code") is not None:
                    if record["exit_code"] == 33:
                        processStatus = "UnsupportedRequirement"
                    elif record["exit_code"] == 0:
                        processStatus = "success"
                    else:
                        processStatus = "permanentFail"
                else:
                    processStatus = "success"
            else:
                processStatus = "permanentFail"

            outputs = {}

            if processStatus == "permanentFail":
                logc = arvados.collection.CollectionReader(
                    record["log"],
                    api_client=self.arvrunner.api,
                    keep_client=self.arvrunner.keep_client,
                    num_retries=self.arvrunner.num_retries)
                done.logtail(logc,
                             logger,
                             "%s error log:" % self.arvrunner.label(self),
                             maxlen=40)

            self.final_output = record["output"]
            outc = arvados.collection.CollectionReader(
                self.final_output,
                api_client=self.arvrunner.api,
                keep_client=self.arvrunner.keep_client,
                num_retries=self.arvrunner.num_retries)
            if "cwl.output.json" in outc:
                with outc.open("cwl.output.json") as f:
                    if f.size() > 0:
                        outputs = json.load(f)

            def keepify(fileobj):
                path = fileobj["location"]
                if not path.startswith("keep:"):
                    fileobj["location"] = "keep:%s/%s" % (record["output"],
                                                          path)

            adjustFileObjs(outputs, keepify)
            adjustDirObjs(outputs, keepify)
        except Exception as e:
            logger.exception("[%s] While getting final output object: %s",
                             self.name, e)
            self.arvrunner.output_callback({}, "permanentFail")
        else:
            self.arvrunner.output_callback(outputs, processStatus)
        finally:
            if record["uuid"] in self.arvrunner.processes:
                del self.arvrunner.processes[record["uuid"]]
Code example #4
File: cwltoil.py  Project: chapmanb/toil
 def importFiles(tool):
     visit_class(tool, ("File", "Directory"), pathToLoc)
     normalizeFilesDirs(tool)
     adjustDirObjs(tool, functools.partial(get_listing,
                                           cwltool.stdfsaccess.StdFsAccess(""),
                                           recursive=True))
     adjustFileObjs(tool, functools.partial(uploadFile,
                                            toil.importFile,
                                            fileindex, existing, skip_broken=True))
Code example #5
 def importFiles(tool):
     visit_class(tool, ("File", "Directory"), pathToLoc)
     normalizeFilesDirs(tool)
     adjustDirObjs(tool, functools.partial(get_listing,
                                           cwltool.stdfsaccess.StdFsAccess(""),
                                           recursive=True))
     adjustFileObjs(tool, functools.partial(uploadFile,
                                            toil.importFile,
                                            fileindex, existing, skip_broken=True))
Code example #6
File: cwltoil.py  Project: vallurumk/toil
 def import_files(tool):
     visit_class(tool, ("File", "Directory"), path_to_loc)
     visit_class(tool, ("File", ), functools.partial(
         add_sizes, fs_access))
     normalizeFilesDirs(tool)
     adjustDirObjs(tool, functools.partial(
         get_listing, fs_access, recursive=True))
     adjustFileObjs(tool, functools.partial(
         uploadFile, toil.importFile, fileindex, existing,
         skip_broken=True))
Code example #7
File: wps_workflow.py  Project: crim-ca/weaver
    def collect_output_ports(self,
                             ports,                  # type: Set[Dict[Text, Any]]
                             builder,                # type: Builder
                             outdir,                 # type: Text
                             compute_checksum=True,  # type: bool
                             jobname="",             # type: Text
                             readers=None            # type: Dict[Text, Any]
                             ):                      # type: (...) -> OutputPorts
        ret = {}  # type: OutputPorts
        debug = LOGGER.isEnabledFor(logging.DEBUG)
        try:
            fs_access = builder.make_fs_access(outdir)
            custom_output = fs_access.join(outdir, "cwl.output.json")
            if fs_access.exists(custom_output):
                with fs_access.open(custom_output, "r") as f:
                    ret = json.load(f)
                if debug:
                    LOGGER.debug(u"Raw output from %s: %s", custom_output, json.dumps(ret, indent=4))
            else:
                for i, port in enumerate(ports):
                    def make_workflow_exception(msg):
                        return WorkflowException(
                            u"Error collecting output for parameter '%s':\n%s"
                            % (shortname(port["id"]), msg))
                    with SourceLine(ports, i, make_workflow_exception, debug):
                        fragment = shortname(port["id"])
                        ret[fragment] = self.collect_output(port, builder, outdir, fs_access,
                                                            compute_checksum=compute_checksum)
            if ret:
                # revmap = partial(command_line_tool.revmap_file, builder, outdir)
                adjustDirObjs(ret, trim_listing)

                # TODO: Attempt to avoid a crash because the revmap fct is not functional
                #       (intend for a docker usage only?)
                # visit_class(ret, ("File", "Directory"), cast(Callable[[Any], Any], revmap))
                visit_class(ret, ("File", "Directory"), command_line_tool.remove_path)
                normalizeFilesDirs(ret)
                visit_class(ret, ("File", "Directory"), partial(command_line_tool.check_valid_locations, fs_access))

                if compute_checksum:
                    adjustFileObjs(ret, partial(compute_checksums, fs_access))

            validate.validate_ex(
                self.names.get_name("outputs_record_schema", ""), ret,
                strict=False, logger=LOGGER)
            if ret is not None and builder.mutation_manager is not None:
                adjustFileObjs(ret, builder.mutation_manager.set_generation)
            return ret if ret is not None else {}
        except validate.ValidationException as exc:
            raise WorkflowException("Error validating output record: {!s}\nIn:\n{}"
                                    .format(exc, json.dumps(ret, indent=4)))
        finally:
            if builder.mutation_manager and readers:
                for reader in readers.values():
                    builder.mutation_manager.release_reader(jobname, reader)
Code example #8
    def run(self, fileStore):
        cwljob = resolve_indirect(self.cwljob)
        fillInDefaults(self.step_inputs, cwljob)

        opts = copy.deepcopy(self.executor_options)
        # Exports temporary directory for batch systems that reset TMPDIR
        os.environ["TMPDIR"] = os.path.realpath(
            opts.pop("tmpdir", None) or fileStore.getLocalTempDir())
        outdir = os.path.join(fileStore.getLocalTempDir(), "out")
        os.mkdir(outdir)
        tmp_outdir_prefix = os.path.join(
            opts.pop("workdir", None) or os.environ["TMPDIR"], "out_tmpdir")

        index = {}
        existing = {}
        opts.update({
            't':
            self.cwltool,
            'job_order_object':
            cwljob,
            'basedir':
            os.getcwd(),
            'outdir':
            outdir,
            'tmp_outdir_prefix':
            tmp_outdir_prefix,
            'tmpdir_prefix':
            fileStore.getLocalTempDir(),
            'make_fs_access':
            functools.partial(ToilFsAccess, fileStore=fileStore),
            'toil_get_file':
            functools.partial(toilGetFile, fileStore, index, existing),
            'no_match_user':
            False
        })
        del opts['job_order']

        # Run the tool
        (output, status) = cwltool.main.single_job_executor(**opts)
        if status != "success":
            raise cwltool.errors.WorkflowException(status)

        adjustDirObjs(
            output,
            functools.partial(get_listing,
                              cwltool.stdfsaccess.StdFsAccess(outdir),
                              recursive=True))

        adjustFileObjs(
            output,
            functools.partial(
                uploadFile, functools.partial(writeGlobalFileWrapper,
                                              fileStore), index, existing))

        return output
Code example #9
File: cwltoil.py  Project: stevencox/toil
    def run(self, fileStore):
        cwljob = resolve_indirect(self.cwljob)
        fillInDefaults(self.step_inputs, cwljob)

        opts = copy.deepcopy(self.executor_options)
        # Exports temporary directory for batch systems that reset TMPDIR
        os.environ["TMPDIR"] = os.path.realpath(
            opts.pop("tmpdir", None) or fileStore.getLocalTempDir())
        outdir = os.path.join(fileStore.getLocalTempDir(), "out")
        os.mkdir(outdir)
        tmp_outdir_prefix = os.path.join(
            opts.pop("workdir", None) or os.environ["TMPDIR"], "out_tmpdir")

        index = {}
        existing = {}

        # Run the tool
        (output, status) = cwltool.main.single_job_executor(
            self.cwltool,
            cwljob,
            basedir=os.getcwd(),
            outdir=outdir,
            tmp_outdir_prefix=tmp_outdir_prefix,
            tmpdir_prefix=fileStore.getLocalTempDir(),
            make_fs_access=functools.partial(ToilFsAccess,
                                             fileStore=fileStore),
            toil_get_file=functools.partial(toilGetFile, fileStore, index,
                                            existing),
            no_match_user=False,
            **opts)
        if status != "success":
            raise cwltool.errors.WorkflowException(status)

        adjustDirObjs(
            output,
            functools.partial(get_listing,
                              cwltool.stdfsaccess.StdFsAccess(outdir),
                              recursive=True))

        def make_dir_literal(obj):
            if "location" in obj and obj["location"].startswith("file:"):
                obj["location"] = "_:" + str(uuid.uuid4())

        adjustDirObjs(output, make_dir_literal)

        adjustFileObjs(
            output,
            functools.partial(
                uploadFile, functools.partial(writeGlobalFileWrapper,
                                              fileStore), index, existing))

        return output
Code example #10
    def make_output_collection(self, name, outputObj):
        outputObj = copy.deepcopy(outputObj)

        files = []
        def capture(fileobj):
            files.append(fileobj)

        adjustDirObjs(outputObj, capture)
        adjustFileObjs(outputObj, capture)

        generatemapper = FinalOutputPathMapper(files, "", "", separateDirs=False)

        final = arvados.collection.Collection(api_client=self.api,
                                              keep_client=self.keep_client,
                                              num_retries=self.num_retries)

        srccollections = {}
        for k,v in generatemapper.items():
            sp = k.split("/")
            srccollection = sp[0][5:]
            if srccollection not in srccollections:
                srccollections[srccollection] = arvados.collection.CollectionReader(
                    srccollection,
                    api_client=self.api,
                    keep_client=self.keep_client,
                    num_retries=self.num_retries)
            reader = srccollections[srccollection]
            try:
                srcpath = "/".join(sp[1:]) if len(sp) > 1 else "."
                final.copy(srcpath, v.target, source_collection=reader, overwrite=False)
            except IOError as e:
                logger.warn("While preparing output collection: %s", e)

        def rewrite(fileobj):
            fileobj["location"] = generatemapper.mapper(fileobj["location"]).target
            for k in ("basename", "size", "listing"):
                if k in fileobj:
                    del fileobj[k]

        adjustDirObjs(outputObj, rewrite)
        adjustFileObjs(outputObj, rewrite)

        with final.open("cwl.output.json", "w") as f:
            json.dump(outputObj, f, sort_keys=True, indent=4, separators=(',',': '))

        final.save_new(name=name, owner_uuid=self.project_uuid, ensure_unique_name=True)

        logger.info("Final output collection %s \"%s\" (%s)", final.portable_data_hash(),
                    final.api_response()["name"],
                    final.manifest_locator())

        self.final_output_collection = final
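make_output_collection() above applies the visitors twice: first with a capture callback that collects references to every File/Directory into a flat list for the path mapper, then with a rewrite callback that points each location at its final target and strips fields (basename, size, listing) that would otherwise go stale. A reduced, self-contained version of that two-pass pattern, with a plain dict standing in for FinalOutputPathMapper, might look like this:

# Minimal sketch of the capture-then-rewrite pattern; `mapping` is a
# hypothetical stand-in for FinalOutputPathMapper.
def visit_files_dirs(obj, op):
    """Apply op to every File/Directory dict, like adjustFileObjs plus adjustDirObjs."""
    if isinstance(obj, dict):
        if obj.get("class") in ("File", "Directory"):
            op(obj)
        for value in obj.values():
            visit_files_dirs(value, op)
    elif isinstance(obj, list):
        for value in obj:
            visit_files_dirs(value, op)

output_obj = {"report": {"class": "File",
                         "location": "keep:abc+1/report.txt",
                         "size": 42}}

# Pass 1: capture references to every File/Directory object.
files = []
visit_files_dirs(output_obj, files.append)

mapping = {"keep:abc+1/report.txt": "report.txt"}

# Pass 2: rewrite each captured object in place; because `files` holds
# references into output_obj, the original structure is updated as well.
for fileobj in files:
    fileobj["location"] = mapping[fileobj["location"]]
    for key in ("basename", "size", "listing"):
        fileobj.pop(key, None)

print(output_obj)  # {'report': {'class': 'File', 'location': 'report.txt'}}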
Code example #11
File: cwltoil.py  Project: vallurumk/toil
    def run(self, file_store):
        cwljob = resolve_indirect(self.cwljob)
        fill_in_defaults(
            self.step_inputs, cwljob,
            self.runtime_context.make_fs_access(""))
        for inp_id in cwljob.keys():
            found = False
            for field in self.cwltool.inputs_record_schema['fields']:
                if field['name'] == inp_id:
                    found = True
            if not found:
                cwljob.pop(inp_id)

        # Exports temporary directory for batch systems that reset TMPDIR
        os.environ["TMPDIR"] = os.path.realpath(
            self.runtime_context.tmpdir or file_store.getLocalTempDir())
        outdir = os.path.join(file_store.getLocalTempDir(), "out")
        os.mkdir(outdir)
        top_tmp_outdir = self.workdir or os.environ["TMPDIR"]
        tmp_outdir_prefix = os.path.join(
            _makeNestedTempDir(top=top_tmp_outdir, seed=outdir, levels=2),
            "out_tmpdir")
        self.openTempDirs.append(top_tmp_outdir)

        index = {}
        existing = {}
        runtime_context = self.runtime_context.copy()
        runtime_context.basedir = os.getcwd()
        runtime_context.outdir = outdir
        runtime_context.tmp_outdir_prefix = tmp_outdir_prefix
        runtime_context.tmpdir_prefix = file_store.getLocalTempDir()
        runtime_context.make_fs_access = functools.partial(
            ToilFsAccess, file_store=file_store)
        runtime_context.toil_get_file = functools.partial(
            toil_get_file, file_store, index, existing)
        # Run the tool
        (output, status) = cwltool.executors.SingleJobExecutor().execute(
            self.cwltool, cwljob, runtime_context, cwllogger)
        if status != "success":
            raise cwltool.errors.WorkflowException(status)

        adjustDirObjs(output, functools.partial(
            get_listing, cwltool.stdfsaccess.StdFsAccess(outdir),
            recursive=True))

        adjustFileObjs(output, functools.partial(
            uploadFile, functools.partial(writeGlobalFileWrapper, file_store),
            index, existing))

        return output
Code example #12
def upload_workflow(arvRunner, tool, job_order, project_uuid, uuid=None,
                    submit_runner_ram=0, name=None, merged_map=None):

    packed = packed_workflow(arvRunner, tool, merged_map)

    adjustDirObjs(job_order, trim_listing)
    adjustFileObjs(job_order, trim_anonymous_location)
    adjustDirObjs(job_order, trim_anonymous_location)

    main = [p for p in packed["$graph"] if p["id"] == "#main"][0]
    for inp in main["inputs"]:
        sn = shortname(inp["id"])
        if sn in job_order:
            inp["default"] = job_order[sn]

    if not name:
        name = tool.tool.get("label", os.path.basename(tool.tool["id"]))

    upload_dependencies(arvRunner, name, tool.doc_loader,
                        packed, tool.tool["id"], False)

    if submit_runner_ram:
        hints = main.get("hints", [])
        found = False
        for h in hints:
            if h["class"] == "http://arvados.org/cwl#WorkflowRunnerResources":
                h["ramMin"] = submit_runner_ram
                found = True
                break
        if not found:
            hints.append({"class": "http://arvados.org/cwl#WorkflowRunnerResources",
                          "ramMin": submit_runner_ram})
        main["hints"] = hints

    body = {
        "workflow": {
            "name": name,
            "description": tool.tool.get("doc", ""),
            "definition":json.dumps(packed, sort_keys=True, indent=4, separators=(',',': '))
        }}
    if project_uuid:
        body["workflow"]["owner_uuid"] = project_uuid

    if uuid:
        call = arvRunner.api.workflows().update(uuid=uuid, body=body)
    else:
        call = arvRunner.api.workflows().create(body=body)
    return call.execute(num_retries=arvRunner.num_retries)["uuid"]
Code example #13
File: arvworkflow.py  Project: chapmanb/arvados
def upload_workflow(arvRunner, tool, job_order, project_uuid, uuid=None,
                    submit_runner_ram=0, name=None, merged_map=None):

    packed = packed_workflow(arvRunner, tool, merged_map)

    adjustDirObjs(job_order, trim_listing)
    adjustFileObjs(job_order, trim_anonymous_location)
    adjustDirObjs(job_order, trim_anonymous_location)

    main = [p for p in packed["$graph"] if p["id"] == "#main"][0]
    for inp in main["inputs"]:
        sn = shortname(inp["id"])
        if sn in job_order:
            inp["default"] = job_order[sn]

    if not name:
        name = tool.tool.get("label", os.path.basename(tool.tool["id"]))

    upload_dependencies(arvRunner, name, tool.doc_loader,
                        packed, tool.tool["id"], False)

    if submit_runner_ram:
        hints = main.get("hints", [])
        found = False
        for h in hints:
            if h["class"] == "http://arvados.org/cwl#WorkflowRunnerResources":
                h["ramMin"] = submit_runner_ram
                found = True
                break
        if not found:
            hints.append({"class": "http://arvados.org/cwl#WorkflowRunnerResources",
                          "ramMin": submit_runner_ram})
        main["hints"] = hints

    body = {
        "workflow": {
            "name": name,
            "description": tool.tool.get("doc", ""),
            "definition":json.dumps(packed, sort_keys=True, indent=4, separators=(',',': '))
        }}
    if project_uuid:
        body["workflow"]["owner_uuid"] = project_uuid

    if uuid:
        call = arvRunner.api.workflows().update(uuid=uuid, body=body)
    else:
        call = arvRunner.api.workflows().create(body=body)
    return call.execute(num_retries=arvRunner.num_retries)["uuid"]
Code example #14
File: test_pack.py  Project: sbt4104/cwltool
def test_pack_fragment():
    with open(get_data("tests/wf/scatter2_subwf.cwl")) as packed_file:
        expect_packed = yaml.safe_load(packed_file)

    loadingContext, workflowobj, uri = fetch_document(
        get_data("tests/wf/scatter2.cwl"))
    packed = cwltool.pack.pack(loadingContext, uri + "#scatterstep/mysub")
    adjustFileObjs(
        packed, partial(make_relative, os.path.abspath(get_data("tests/wf"))))
    adjustDirObjs(
        packed, partial(make_relative, os.path.abspath(get_data("tests/wf"))))

    assert json.dumps(packed, sort_keys=True,
                      indent=2) == json.dumps(expect_packed,
                                              sort_keys=True,
                                              indent=2)
Code example #15
File: runner.py  Project: wtsi-hgi/arvados
    def done(self, record):
        """Base method for handling a completed runner."""

        try:
            if record["state"] == "Complete":
                if record.get("exit_code") is not None:
                    if record["exit_code"] == 33:
                        processStatus = "UnsupportedRequirement"
                    elif record["exit_code"] == 0:
                        processStatus = "success"
                    else:
                        processStatus = "permanentFail"
                else:
                    processStatus = "success"
            else:
                processStatus = "permanentFail"

            outputs = {}

            if processStatus == "permanentFail":
                logc = arvados.collection.CollectionReader(record["log"],
                                                           api_client=self.arvrunner.api,
                                                           keep_client=self.arvrunner.keep_client,
                                                           num_retries=self.arvrunner.num_retries)
                done.logtail(logc, logger.error, "%s (%s) error log:" % (self.arvrunner.label(self), record["uuid"]), maxlen=40)

            self.final_output = record["output"]
            outc = arvados.collection.CollectionReader(self.final_output,
                                                       api_client=self.arvrunner.api,
                                                       keep_client=self.arvrunner.keep_client,
                                                       num_retries=self.arvrunner.num_retries)
            if "cwl.output.json" in outc:
                with outc.open("cwl.output.json") as f:
                    if f.size() > 0:
                        outputs = json.load(f)
            def keepify(fileobj):
                path = fileobj["location"]
                if not path.startswith("keep:"):
                    fileobj["location"] = "keep:%s/%s" % (record["output"], path)
            adjustFileObjs(outputs, keepify)
            adjustDirObjs(outputs, keepify)
        except Exception as e:
            logger.exception("[%s] While getting final output object: %s", self.name, e)
            self.arvrunner.output_callback({}, "permanentFail")
        else:
            self.arvrunner.output_callback(outputs, processStatus)
Code example #16
    def test_pack(self):

        document_loader, workflowobj, uri = fetch_document(
            get_data("tests/wf/revsort.cwl"))
        document_loader, avsc_names, processobj, metadata, uri = validate_document(
            document_loader, workflowobj, uri)
        packed = cwltool.pack.pack(document_loader, processobj, uri, metadata)
        with open(get_data("tests/wf/expect_packed.cwl")) as f:
            expect_packed = json.load(f)
        adjustFileObjs(packed, partial(makeRelative,
            os.path.abspath(get_data("tests/wf"))))
        adjustDirObjs(packed, partial(makeRelative,
            os.path.abspath(get_data("tests/wf"))))
        self.assertIn("$schemas", packed)
        del packed["$schemas"]
        del expect_packed["$schemas"]

        self.assertEqual(expect_packed, packed)
Code example #17
    def arvados_job_spec(self, dry_run=False, pull_image=True, **kwargs):
        """Create an Arvados job specification for this workflow.

        The returned dict can be used to create a job (i.e., passed as
        the +body+ argument to jobs().create()), or as a component in
        a pipeline template or pipeline instance.
        """

        if self.tool.tool["id"].startswith("keep:"):
            self.job_order["cwl:tool"] = self.tool.tool["id"][5:]
        else:
            packed = packed_workflow(self.arvrunner, self.tool)
            wf_pdh = upload_workflow_collection(self.arvrunner, self.name,
                                                packed)
            self.job_order["cwl:tool"] = "%s/workflow.cwl#main" % wf_pdh

        adjustDirObjs(self.job_order, trim_listing)
        adjustFileObjs(self.job_order, trim_anonymous_location)
        adjustDirObjs(self.job_order, trim_anonymous_location)

        if self.output_name:
            self.job_order["arv:output_name"] = self.output_name

        if self.output_tags:
            self.job_order["arv:output_tags"] = self.output_tags

        self.job_order["arv:enable_reuse"] = self.enable_reuse

        if self.on_error:
            self.job_order["arv:on_error"] = self.on_error

        return {
            "script": "cwl-runner",
            "script_version": "master",
            "minimum_script_version":
            "570509ab4d2ef93d870fd2b1f2eab178afb1bad9",
            "repository": "arvados",
            "script_parameters": self.job_order,
            "runtime_constraints": {
                "docker_image": arvados_jobs_image(self.arvrunner,
                                                   self.jobs_image),
                "min_ram_mb_per_node": self.submit_runner_ram
            }
        }
Code example #18
File: test_pack.py  Project: sbt4104/cwltool
def test_pack():
    loadingContext, workflowobj, uri = fetch_document(
        get_data("tests/wf/revsort.cwl"))

    with open(get_data("tests/wf/expect_packed.cwl")) as packed_file:
        expect_packed = yaml.safe_load(packed_file)

    packed = cwltool.pack.pack(loadingContext, uri)
    adjustFileObjs(
        packed, partial(make_relative, os.path.abspath(get_data("tests/wf"))))
    adjustDirObjs(
        packed, partial(make_relative, os.path.abspath(get_data("tests/wf"))))

    assert "$schemas" in packed
    assert len(packed["$schemas"]) == len(expect_packed["$schemas"])
    del packed["$schemas"]
    del expect_packed["$schemas"]

    assert packed == expect_packed
Code example #19
File: test_pack.py  Project: pvanheus/cwltool
    def test_pack(self):
        load_tool.loaders = {}

        document_loader, workflowobj, uri = fetch_document(
            get_data("tests/wf/revsort.cwl"))
        document_loader, avsc_names, processobj, metadata, uri = validate_document(
            document_loader, workflowobj, uri)
        packed = cwltool.pack.pack(document_loader, processobj, uri, metadata)
        with open(get_data("tests/wf/expect_packed.cwl")) as f:
            expect_packed = json.load(f)
        adjustFileObjs(packed, partial(make_relative,
            os.path.abspath(get_data("tests/wf"))))
        adjustDirObjs(packed, partial(make_relative,
            os.path.abspath(get_data("tests/wf"))))
        self.assertIn("$schemas", packed)
        del packed["$schemas"]
        del expect_packed["$schemas"]

        self.assertEqual(expect_packed, packed)
Code example #20
File: arvworkflow.py  Project: oliveralka/arvados
def upload_workflow(arvRunner,
                    tool,
                    job_order,
                    project_uuid,
                    uuid=None,
                    submit_runner_ram=0,
                    name=None):

    packed = packed_workflow(arvRunner, tool)

    adjustDirObjs(job_order, trim_listing)
    adjustFileObjs(job_order, trim_anonymous_location)
    adjustDirObjs(job_order, trim_anonymous_location)

    main = [p for p in packed["$graph"] if p["id"] == "#main"][0]
    for inp in main["inputs"]:
        sn = shortname(inp["id"])
        if sn in job_order:
            inp["default"] = job_order[sn]

    if not name:
        name = tool.tool.get("label", os.path.basename(tool.tool["id"]))

    upload_dependencies(arvRunner, name, tool.doc_loader, packed,
                        tool.tool["id"], False)

    # TODO nowhere for submit_runner_ram to go.

    body = {
        "workflow": {
            "name": name,
            "description": tool.tool.get("doc", ""),
            "definition": yaml.round_trip_dump(packed)
        }
    }
    if project_uuid:
        body["workflow"]["owner_uuid"] = project_uuid

    if uuid:
        call = arvRunner.api.workflows().update(uuid=uuid, body=body)
    else:
        call = arvRunner.api.workflows().create(body=body)
    return call.execute(num_retries=arvRunner.num_retries)["uuid"]
Code example #21
File: cwltoil.py  Project: chapmanb/toil
    def run(self, fileStore):
        cwljob = resolve_indirect(self.cwljob)
        fillInDefaults(self.step_inputs, cwljob)

        opts = copy.deepcopy(self.executor_options)
        # Exports temporary directory for batch systems that reset TMPDIR
        os.environ["TMPDIR"] = os.path.realpath(opts.pop("tmpdir", None) or fileStore.getLocalTempDir())
        outdir = os.path.join(fileStore.getLocalTempDir(), "out")
        os.mkdir(outdir)
        tmp_outdir_prefix = os.path.join(opts.pop("workdir", None) or os.environ["TMPDIR"], "out_tmpdir")

        index = {}
        existing = {}

        # Run the tool
        (output, status) = cwltool.main.single_job_executor(self.cwltool, cwljob,
                                                            basedir=os.getcwd(),
                                                            outdir=outdir,
                                                            tmp_outdir_prefix=tmp_outdir_prefix,
                                                            tmpdir_prefix=fileStore.getLocalTempDir(),
                                                            make_fs_access=functools.partial(ToilFsAccess, fileStore=fileStore),
                                                            toil_get_file=functools.partial(toilGetFile, fileStore, index, existing),
                                                            no_match_user=False,
                                                            **opts)
        if status != "success":
            raise cwltool.errors.WorkflowException(status)

        adjustDirObjs(output, functools.partial(get_listing,
                                                cwltool.stdfsaccess.StdFsAccess(outdir),
                                                recursive=True))

        def make_dir_literal(obj):
            if "location" in obj and obj["location"].startswith("file:"):
                obj["location"] = "_:" + str(uuid.uuid4())

        adjustDirObjs(output, make_dir_literal)

        adjustFileObjs(output, functools.partial(uploadFile,
                                                 functools.partial(writeGlobalFileWrapper, fileStore),
                                                 index, existing))

        return output
Code example #22
File: test_pack.py  Project: hmenager/cwltool
def test_pack():
    loadingContext, workflowobj, uri = fetch_document(get_data("tests/wf/revsort.cwl"))
    loadingContext.do_update = False
    loadingContext, uri = resolve_and_validate_document(
        loadingContext, workflowobj, uri)
    processobj = loadingContext.loader.resolve_ref(uri)[0]

    with open(get_data("tests/wf/expect_packed.cwl")) as packed_file:
        expect_packed = json.load(packed_file)

    packed = cwltool.pack.pack(loadingContext.loader, processobj, uri, loadingContext.metadata)
    adjustFileObjs(packed, partial(make_relative, os.path.abspath(get_data("tests/wf"))))
    adjustDirObjs(packed, partial(make_relative, os.path.abspath(get_data("tests/wf"))))

    assert "$schemas" in packed
    assert len(packed["$schemas"]) == len(expect_packed["$schemas"])
    del packed["$schemas"]
    del expect_packed["$schemas"]

    assert packed == expect_packed
Code example #23
def test_pack():
    load_tool.loaders = {}

    document_loader, workflowobj, uri = fetch_document(get_data("tests/wf/revsort.cwl"))
    document_loader, _, processobj, metadata, uri = validate_document(
        document_loader, workflowobj, uri, [], {})

    with open(get_data("tests/wf/expect_packed.cwl")) as packed_file:
        expect_packed = json.load(packed_file)

    packed = cwltool.pack.pack(document_loader, processobj, uri, metadata)
    adjustFileObjs(packed, partial(make_relative, os.path.abspath(get_data("tests/wf"))))
    adjustDirObjs(packed, partial(make_relative, os.path.abspath(get_data("tests/wf"))))

    assert "$schemas" in packed
    assert len(packed["$schemas"]) == len(expect_packed["$schemas"])
    del packed["$schemas"]
    del expect_packed["$schemas"]

    assert packed == expect_packed
Code example #24
File: arvjob.py  Project: tomclegg/arvados
    def arvados_job_spec(self, dry_run=False, pull_image=True, **kwargs):
        """Create an Arvados job specification for this workflow.

        The returned dict can be used to create a job (i.e., passed as
        the +body+ argument to jobs().create()), or as a component in
        a pipeline template or pipeline instance.
        """

        if self.tool.tool["id"].startswith("keep:"):
            self.job_order["cwl:tool"] = self.tool.tool["id"][5:]
        else:
            packed = packed_workflow(self.arvrunner, self.tool)
            wf_pdh = upload_workflow_collection(self.arvrunner, self.name, packed)
            self.job_order["cwl:tool"] = "%s/workflow.cwl#main" % wf_pdh

        adjustDirObjs(self.job_order, trim_listing)
        adjustFileObjs(self.job_order, trim_anonymous_location)
        adjustDirObjs(self.job_order, trim_anonymous_location)

        if self.output_name:
            self.job_order["arv:output_name"] = self.output_name

        if self.output_tags:
            self.job_order["arv:output_tags"] = self.output_tags

        self.job_order["arv:enable_reuse"] = self.enable_reuse

        if self.on_error:
            self.job_order["arv:on_error"] = self.on_error

        return {
            "script": "cwl-runner",
            "script_version": "master",
            "minimum_script_version": "570509ab4d2ef93d870fd2b1f2eab178afb1bad9",
            "repository": "arvados",
            "script_parameters": self.job_order,
            "runtime_constraints": {
                "docker_image": arvados_jobs_image(self.arvrunner, self.jobs_image),
                "min_ram_mb_per_node": self.submit_runner_ram
            }
        }
Code example #25
File: arvworkflow.py  Project: tomclegg/arvados
def upload_workflow(arvRunner, tool, job_order, project_uuid, uuid=None,
                    submit_runner_ram=0, name=None):

    packed = packed_workflow(arvRunner, tool)

    adjustDirObjs(job_order, trim_listing)
    adjustFileObjs(job_order, trim_anonymous_location)
    adjustDirObjs(job_order, trim_anonymous_location)

    main = [p for p in packed["$graph"] if p["id"] == "#main"][0]
    for inp in main["inputs"]:
        sn = shortname(inp["id"])
        if sn in job_order:
            inp["default"] = job_order[sn]

    if not name:
        name = tool.tool.get("label", os.path.basename(tool.tool["id"]))

    upload_dependencies(arvRunner, name, tool.doc_loader,
                        packed, tool.tool["id"], False)

    # TODO nowhere for submit_runner_ram to go.

    body = {
        "workflow": {
            "name": name,
            "description": tool.tool.get("doc", ""),
            "definition":yaml.round_trip_dump(packed)
        }}
    if project_uuid:
        body["workflow"]["owner_uuid"] = project_uuid

    if uuid:
        call = arvRunner.api.workflows().update(uuid=uuid, body=body)
    else:
        call = arvRunner.api.workflows().create(body=body)
    return call.execute(num_retries=arvRunner.num_retries)["uuid"]
Code example #26
    def arvados_job_spec(self, *args, **kwargs):
        self.upload_docker(self.tool)

        workflowfiles = []
        jobfiles = []
        workflowfiles.append({"class":"File", "location": self.tool.tool["id"]})

        self.name = os.path.basename(self.tool.tool["id"])

        def visitFiles(files, path):
            files.append(path)

        document_loader, workflowobj, uri = fetch_document(self.tool.tool["id"])
        loaded = set()
        def loadref(b, u):
            joined = urlparse.urljoin(b, u)
            if joined not in loaded:
                loaded.add(joined)
                return document_loader.fetch(urlparse.urljoin(b, u))
            else:
                return {}

        sc = scandeps(uri, workflowobj,
                      set(("$import", "run")),
                      set(("$include", "$schemas", "path", "location")),
                      loadref)
        adjustFileObjs(sc, partial(visitFiles, workflowfiles))
        adjustFileObjs(self.job_order, partial(visitFiles, jobfiles))
        adjustDirObjs(sc, partial(visitFiles, workflowfiles))
        adjustDirObjs(self.job_order, partial(visitFiles, jobfiles))

        normalizeFilesDirs(jobfiles)
        normalizeFilesDirs(workflowfiles)

        keepprefix = kwargs.get("keepprefix", "")
        workflowmapper = ArvPathMapper(self.arvrunner, workflowfiles, "",
                                       keepprefix+"%s",
                                       keepprefix+"%s/%s",
                                       name=self.name,
                                       **kwargs)

        jobmapper = ArvPathMapper(self.arvrunner, jobfiles, "",
                                  keepprefix+"%s",
                                  keepprefix+"%s/%s",
                                  name=os.path.basename(self.job_order.get("id", "#")),
                                  **kwargs)

        def setloc(p):
            p["location"] = jobmapper.mapper(p["location"])[1]
        adjustFileObjs(self.job_order, setloc)
        adjustDirObjs(self.job_order, setloc)

        if "id" in self.job_order:
            del self.job_order["id"]

        return workflowmapper
Code example #27
File: __init__.py  Project: resurgo-genetics/arvados
    def arv_executor(self, tool, job_order, **kwargs):
        self.debug = kwargs.get("debug")

        tool.visit(self.check_features)

        self.project_uuid = kwargs.get("project_uuid")
        self.pipeline = None
        make_fs_access = kwargs.get("make_fs_access") or partial(
            CollectionFsAccess,
            api_client=self.api,
            keep_client=self.keep_client)
        self.fs_access = make_fs_access(kwargs["basedir"])

        if not kwargs.get("name"):
            kwargs["name"] = self.name = tool.tool.get(
                "label") or tool.metadata.get("label") or os.path.basename(
                    tool.tool["id"])

        # Upload direct dependencies of workflow steps, get back mapping of files to keep references.
        # Also uploads docker images.
        upload_workflow_deps(self, tool)

        # Reload tool object which may have been updated by
        # upload_workflow_deps
        tool = self.arv_make_tool(tool.doc_loader.idx[tool.tool["id"]],
                                  makeTool=self.arv_make_tool,
                                  loader=tool.doc_loader,
                                  avsc_names=tool.doc_schema,
                                  metadata=tool.metadata)

        # Upload local file references in the job order.
        job_order = upload_job_order(self, "%s input" % kwargs["name"], tool,
                                     job_order)

        existing_uuid = kwargs.get("update_workflow")
        if existing_uuid or kwargs.get("create_workflow"):
            # Create a pipeline template or workflow record and exit.
            if self.work_api == "jobs":
                tmpl = RunnerTemplate(
                    self,
                    tool,
                    job_order,
                    kwargs.get("enable_reuse"),
                    uuid=existing_uuid,
                    submit_runner_ram=kwargs.get("submit_runner_ram"),
                    name=kwargs["name"])
                tmpl.save()
                # cwltool.main will write our return value to stdout.
                return (tmpl.uuid, "success")
            elif self.work_api == "containers":
                return (upload_workflow(
                    self,
                    tool,
                    job_order,
                    self.project_uuid,
                    uuid=existing_uuid,
                    submit_runner_ram=kwargs.get("submit_runner_ram"),
                    name=kwargs["name"]), "success")

        self.ignore_docker_for_reuse = kwargs.get("ignore_docker_for_reuse")

        kwargs["make_fs_access"] = make_fs_access
        kwargs["enable_reuse"] = kwargs.get("enable_reuse")
        kwargs["use_container"] = True
        kwargs["tmpdir_prefix"] = "tmp"
        kwargs["compute_checksum"] = kwargs.get("compute_checksum")

        if self.work_api == "containers":
            kwargs["outdir"] = "/var/spool/cwl"
            kwargs["docker_outdir"] = "/var/spool/cwl"
            kwargs["tmpdir"] = "/tmp"
            kwargs["docker_tmpdir"] = "/tmp"
        elif self.work_api == "jobs":
            kwargs["outdir"] = "$(task.outdir)"
            kwargs["docker_outdir"] = "$(task.outdir)"
            kwargs["tmpdir"] = "$(task.tmpdir)"

        runnerjob = None
        if kwargs.get("submit"):
            # Submit a runner job to run the workflow for us.
            if self.work_api == "containers":
                if tool.tool["class"] == "CommandLineTool":
                    kwargs["runnerjob"] = tool.tool["id"]
                    upload_dependencies(self, kwargs["name"], tool.doc_loader,
                                        tool.tool, tool.tool["id"], False)
                    runnerjob = tool.job(job_order, self.output_callback,
                                         **kwargs).next()
                else:
                    runnerjob = RunnerContainer(
                        self,
                        tool,
                        job_order,
                        kwargs.get("enable_reuse"),
                        self.output_name,
                        self.output_tags,
                        submit_runner_ram=kwargs.get("submit_runner_ram"),
                        name=kwargs.get("name"),
                        on_error=kwargs.get("on_error"),
                        submit_runner_image=kwargs.get("submit_runner_image"))
            elif self.work_api == "jobs":
                runnerjob = RunnerJob(
                    self,
                    tool,
                    job_order,
                    kwargs.get("enable_reuse"),
                    self.output_name,
                    self.output_tags,
                    submit_runner_ram=kwargs.get("submit_runner_ram"),
                    name=kwargs.get("name"),
                    on_error=kwargs.get("on_error"),
                    submit_runner_image=kwargs.get("submit_runner_image"))

        if not kwargs.get(
                "submit"
        ) and "cwl_runner_job" not in kwargs and self.work_api == "jobs":
            # Create pipeline for local run
            self.pipeline = self.api.pipeline_instances().create(
                body={
                    "owner_uuid":
                    self.project_uuid,
                    "name":
                    kwargs["name"] if kwargs.
                    get("name") else shortname(tool.tool["id"]),
                    "components": {},
                    "state":
                    "RunningOnClient"
                }).execute(num_retries=self.num_retries)
            logger.info("Pipeline instance %s", self.pipeline["uuid"])

        if runnerjob and not kwargs.get("wait"):
            runnerjob.run(wait=kwargs.get("wait"))
            return (runnerjob.uuid, "success")

        self.poll_api = arvados.api('v1')
        self.polling_thread = threading.Thread(target=self.poll_states)
        self.polling_thread.start()

        if runnerjob:
            jobiter = iter((runnerjob, ))
        else:
            if "cwl_runner_job" in kwargs:
                self.uuid = kwargs.get("cwl_runner_job").get('uuid')
            jobiter = tool.job(job_order, self.output_callback, **kwargs)

        try:
            self.cond.acquire()
            # Will continue to hold the lock for the duration of this code
            # except when in cond.wait(), at which point on_message can update
            # job state and process output callbacks.

            loopperf = Perf(metrics, "jobiter")
            loopperf.__enter__()
            for runnable in jobiter:
                loopperf.__exit__()

                if self.stop_polling.is_set():
                    break

                if runnable:
                    with Perf(metrics, "run"):
                        runnable.run(**kwargs)
                else:
                    if self.processes:
                        self.cond.wait(1)
                    else:
                        logger.error(
                            "Workflow is deadlocked, no runnable jobs and not waiting on any pending jobs."
                        )
                        break
                loopperf.__enter__()
            loopperf.__exit__()

            while self.processes:
                self.cond.wait(1)

        except UnsupportedRequirement:
            raise
        except:
            if sys.exc_info()[0] is KeyboardInterrupt:
                logger.error("Interrupted, marking pipeline as failed")
            else:
                logger.error(
                    "Execution failed: %s",
                    sys.exc_info()[1],
                    exc_info=(sys.exc_info()[1] if self.debug else False))
            if self.pipeline:
                self.api.pipeline_instances().update(
                    uuid=self.pipeline["uuid"], body={
                        "state": "Failed"
                    }).execute(num_retries=self.num_retries)
            if runnerjob and runnerjob.uuid and self.work_api == "containers":
                self.api.container_requests().update(
                    uuid=runnerjob.uuid, body={
                        "priority": "0"
                    }).execute(num_retries=self.num_retries)
        finally:
            self.cond.release()
            self.stop_polling.set()
            self.polling_thread.join()

        if self.final_status == "UnsupportedRequirement":
            raise UnsupportedRequirement("Check log for details.")

        if self.final_output is None:
            raise WorkflowException("Workflow did not return a result.")

        if kwargs.get("submit") and isinstance(runnerjob, Runner):
            logger.info("Final output collection %s", runnerjob.final_output)
        else:
            if self.output_name is None:
                self.output_name = "Output of %s" % (shortname(
                    tool.tool["id"]))
            if self.output_tags is None:
                self.output_tags = ""
            self.final_output, self.final_output_collection = self.make_output_collection(
                self.output_name, self.output_tags, self.final_output)
            self.set_crunch_output()

        if kwargs.get("compute_checksum"):
            adjustDirObjs(self.final_output, partial(getListing,
                                                     self.fs_access))
            adjustFileObjs(self.final_output,
                           partial(compute_checksums, self.fs_access))

        return (self.final_output, self.final_status)
Code example #28
File: crunch_script.py  Project: tomclegg/arvados
def run():
    # Timestamps are added by crunch-job, so don't print redundant timestamps.
    arvados.log_handler.setFormatter(logging.Formatter('%(name)s %(levelname)s: %(message)s'))

    # Print package versions
    logger.info(arvados_cwl.versionstring())

    api = arvados.api("v1")

    arvados_cwl.add_arv_hints()

    runner = None
    try:
        job_order_object = arvados.current_job()['script_parameters']
        toolpath = "file://%s/%s" % (os.environ['TASK_KEEPMOUNT'], job_order_object.pop("cwl:tool"))

        pdh_path = re.compile(r'^[0-9a-f]{32}\+\d+(/.+)?$')

        def keeppath(v):
            if pdh_path.match(v):
                return "keep:%s" % v
            else:
                return v

        def keeppathObj(v):
            if "location" in v:
                v["location"] = keeppath(v["location"])

        for k,v in job_order_object.items():
            if isinstance(v, basestring) and arvados.util.keep_locator_pattern.match(v):
                job_order_object[k] = {
                    "class": "File",
                    "location": "keep:%s" % v
                }

        adjustFileObjs(job_order_object, keeppathObj)
        adjustDirObjs(job_order_object, keeppathObj)
        normalizeFilesDirs(job_order_object)

        output_name = None
        output_tags = None
        enable_reuse = True
        on_error = "continue"
        if "arv:output_name" in job_order_object:
            output_name = job_order_object["arv:output_name"]
            del job_order_object["arv:output_name"]

        if "arv:output_tags" in job_order_object:
            output_tags = job_order_object["arv:output_tags"]
            del job_order_object["arv:output_tags"]

        if "arv:enable_reuse" in job_order_object:
            enable_reuse = job_order_object["arv:enable_reuse"]
            del job_order_object["arv:enable_reuse"]

        if "arv:on_error" in job_order_object:
            on_error = job_order_object["arv:on_error"]
            del job_order_object["arv:on_error"]

        runner = arvados_cwl.ArvCwlRunner(api_client=arvados.api('v1', model=OrderedJsonModel()),
                                          output_name=output_name, output_tags=output_tags)

        make_fs_access = functools.partial(CollectionFsAccess,
                                 collection_cache=runner.collection_cache)

        t = load_tool(toolpath, runner.arv_make_tool,
                      fetcher_constructor=functools.partial(CollectionFetcher,
                                                  api_client=runner.api,
                                                  fs_access=make_fs_access(""),
                                                  num_retries=runner.num_retries))

        args = argparse.Namespace()
        args.project_uuid = arvados.current_job()["owner_uuid"]
        args.enable_reuse = enable_reuse
        args.on_error = on_error
        args.submit = False
        args.debug = False
        args.quiet = False
        args.ignore_docker_for_reuse = False
        args.basedir = os.getcwd()
        args.name = None
        args.cwl_runner_job={"uuid": arvados.current_job()["uuid"], "state": arvados.current_job()["state"]}
        args.make_fs_access = make_fs_access

        runner.arv_executor(t, job_order_object, **vars(args))
    except Exception as e:
        if isinstance(e, WorkflowException):
            logging.info("Workflow error %s", e)
        else:
            logging.exception("Unhandled exception")
        if runner and runner.final_output_collection:
            outputCollection = runner.final_output_collection.portable_data_hash()
        else:
            outputCollection = None
        api.job_tasks().update(uuid=arvados.current_task()['uuid'],
                                             body={
                                                 'output': outputCollection,
                                                 'success': False,
                                                 'progress':1.0
                                             }).execute()
Code example #29
    def job(self, joborder, output_callback, **kwargs):
        kwargs["work_api"] = self.work_api
        req, _ = self.get_requirement("http://arvados.org/cwl#RunInSingleContainer")
        if req:
            with SourceLine(self.tool, None, WorkflowException, logger.isEnabledFor(logging.DEBUG)):
                if "id" not in self.tool:
                    raise WorkflowException("%s object must have 'id'" % (self.tool["class"]))
            document_loader, workflowobj, uri = (self.doc_loader, self.doc_loader.fetch(self.tool["id"]), self.tool["id"])

            discover_secondary_files(self.tool["inputs"], joborder)

            with Perf(metrics, "subworkflow upload_deps"):
                upload_dependencies(self.arvrunner,
                                    os.path.basename(joborder.get("id", "#")),
                                    document_loader,
                                    joborder,
                                    joborder.get("id", "#"),
                                    False)

                if self.wf_pdh is None:
                    workflowobj["requirements"] = dedup_reqs(self.requirements)
                    workflowobj["hints"] = dedup_reqs(self.hints)

                    packed = pack(document_loader, workflowobj, uri, self.metadata)

                    upload_dependencies(self.arvrunner,
                                        kwargs.get("name", ""),
                                        document_loader,
                                        packed,
                                        uri,
                                        False)

            with Perf(metrics, "subworkflow adjust"):
                joborder_resolved = copy.deepcopy(joborder)
                joborder_keepmount = copy.deepcopy(joborder)

                reffiles = []
                visit_class(joborder_keepmount, ("File", "Directory"), lambda x: reffiles.append(x))

                mapper = ArvPathMapper(self.arvrunner, reffiles, kwargs["basedir"],
                                 "/keep/%s",
                                 "/keep/%s/%s",
                                 **kwargs)

                def keepmount(obj):
                    remove_redundant_fields(obj)
                    with SourceLine(obj, None, WorkflowException, logger.isEnabledFor(logging.DEBUG)):
                        if "location" not in obj:
                            raise WorkflowException("%s object is missing required 'location' field: %s" % (obj["class"], obj))
                    with SourceLine(obj, "location", WorkflowException, logger.isEnabledFor(logging.DEBUG)):
                        if obj["location"].startswith("keep:"):
                            obj["location"] = mapper.mapper(obj["location"]).target
                            if "listing" in obj:
                                del obj["listing"]
                        elif obj["location"].startswith("_:"):
                            del obj["location"]
                        else:
                            raise WorkflowException("Location is not a keep reference or a literal: '%s'" % obj["location"])

                visit_class(joborder_keepmount, ("File", "Directory"), keepmount)

                def resolved(obj):
                    if obj["location"].startswith("keep:"):
                        obj["location"] = mapper.mapper(obj["location"]).resolved

                visit_class(joborder_resolved, ("File", "Directory"), resolved)

                if self.wf_pdh is None:
                    adjustFileObjs(packed, keepmount)
                    adjustDirObjs(packed, keepmount)
                    self.wf_pdh = upload_workflow_collection(self.arvrunner, shortname(self.tool["id"]), packed)

            wf_runner = cmap({
                "class": "CommandLineTool",
                "baseCommand": "cwltool",
                "inputs": self.tool["inputs"],
                "outputs": self.tool["outputs"],
                "stdout": "cwl.output.json",
                "requirements": self.requirements+[
                    {
                    "class": "InitialWorkDirRequirement",
                    "listing": [{
                            "entryname": "workflow.cwl",
                            "entry": {
                                "class": "File",
                                "location": "keep:%s/workflow.cwl" % self.wf_pdh
                            }
                        }, {
                            "entryname": "cwl.input.yml",
                            "entry": json.dumps(joborder_keepmount, indent=2, sort_keys=True, separators=(',',': ')).replace("\\", "\\\\").replace('$(', '\$(').replace('${', '\${')
                        }]
                }],
                "hints": self.hints,
                "arguments": ["--no-container", "--move-outputs", "--preserve-entire-environment", "workflow.cwl#main", "cwl.input.yml"],
                "id": "#"
            })
            kwargs["loader"] = self.doc_loader
            kwargs["avsc_names"] = self.doc_schema
            return ArvadosCommandTool(self.arvrunner, wf_runner, **kwargs).job(joborder_resolved, output_callback, **kwargs)
        else:
            return super(ArvadosWorkflow, self).job(joborder, output_callback, **kwargs)
Code example #30
File: __init__.py Project: graingert/arvados
    def arv_executor(self, tool, job_order, runtimeContext, logger=None):
        self.debug = runtimeContext.debug

        tool.visit(self.check_features)

        self.project_uuid = runtimeContext.project_uuid
        self.pipeline = None
        self.fs_access = runtimeContext.make_fs_access(runtimeContext.basedir)
        self.secret_store = runtimeContext.secret_store

        self.trash_intermediate = runtimeContext.trash_intermediate
        if self.trash_intermediate and self.work_api != "containers":
            raise Exception(
                "--trash-intermediate is only supported with --api=containers."
            )

        self.intermediate_output_ttl = runtimeContext.intermediate_output_ttl
        if self.intermediate_output_ttl and self.work_api != "containers":
            raise Exception(
                "--intermediate-output-ttl is only supported with --api=containers."
            )
        if self.intermediate_output_ttl < 0:
            raise Exception(
                "Invalid value %d for --intermediate-output-ttl, cannot be less than zero"
                % self.intermediate_output_ttl)

        if runtimeContext.submit_request_uuid and self.work_api != "containers":
            raise Exception(
                "--submit-request-uuid requires containers API, but using '{}' api"
                .format(self.work_api))

        if not runtimeContext.name:
            runtimeContext.name = self.name = tool.tool.get("label") or tool.metadata.get("label") or os.path.basename(tool.tool["id"])

        # Upload direct dependencies of workflow steps, get back mapping of files to keep references.
        # Also uploads docker images.
        merged_map = upload_workflow_deps(self, tool)

        # Reload tool object which may have been updated by
        # upload_workflow_deps
        # Don't validate this time because it will just print redundant errors.
        loadingContext = self.loadingContext.copy()
        loadingContext.loader = tool.doc_loader
        loadingContext.avsc_names = tool.doc_schema
        loadingContext.metadata = tool.metadata
        loadingContext.do_validate = False

        tool = self.arv_make_tool(tool.doc_loader.idx[tool.tool["id"]],
                                  loadingContext)

        # Upload local file references in the job order.
        job_order = upload_job_order(self, "%s input" % runtimeContext.name,
                                     tool, job_order)

        existing_uuid = runtimeContext.update_workflow
        if existing_uuid or runtimeContext.create_workflow:
            # Create a pipeline template or workflow record and exit.
            if self.work_api == "jobs":
                tmpl = RunnerTemplate(
                    self,
                    tool,
                    job_order,
                    runtimeContext.enable_reuse,
                    uuid=existing_uuid,
                    submit_runner_ram=runtimeContext.submit_runner_ram,
                    name=runtimeContext.name,
                    merged_map=merged_map)
                tmpl.save()
                # cwltool.main will write our return value to stdout.
                return (tmpl.uuid, "success")
            elif self.work_api == "containers":
                return (upload_workflow(
                    self,
                    tool,
                    job_order,
                    self.project_uuid,
                    uuid=existing_uuid,
                    submit_runner_ram=runtimeContext.submit_runner_ram,
                    name=runtimeContext.name,
                    merged_map=merged_map), "success")

        self.ignore_docker_for_reuse = runtimeContext.ignore_docker_for_reuse
        self.eval_timeout = runtimeContext.eval_timeout

        runtimeContext = runtimeContext.copy()
        runtimeContext.use_container = True
        runtimeContext.tmpdir_prefix = "tmp"
        runtimeContext.work_api = self.work_api

        if self.work_api == "containers":
            if self.ignore_docker_for_reuse:
                raise Exception(
                    "--ignore-docker-for-reuse not supported with containers API."
                )
            runtimeContext.outdir = "/var/spool/cwl"
            runtimeContext.docker_outdir = "/var/spool/cwl"
            runtimeContext.tmpdir = "/tmp"
            runtimeContext.docker_tmpdir = "/tmp"
        elif self.work_api == "jobs":
            if runtimeContext.priority != DEFAULT_PRIORITY:
                raise Exception("--priority not implemented for jobs API.")
            runtimeContext.outdir = "$(task.outdir)"
            runtimeContext.docker_outdir = "$(task.outdir)"
            runtimeContext.tmpdir = "$(task.tmpdir)"

        if runtimeContext.priority < 1 or runtimeContext.priority > 1000:
            raise Exception("--priority must be in the range 1..1000.")

        runnerjob = None
        if runtimeContext.submit:
            # Submit a runner job to run the workflow for us.
            if self.work_api == "containers":
                if tool.tool["class"] == "CommandLineTool" and runtimeContext.wait:
                    runtimeContext.runnerjob = tool.tool["id"]
                    runnerjob = tool.job(job_order, self.output_callback,
                                         runtimeContext).next()
                else:
                    runnerjob = RunnerContainer(
                        self,
                        tool,
                        job_order,
                        runtimeContext.enable_reuse,
                        self.output_name,
                        self.output_tags,
                        submit_runner_ram=runtimeContext.submit_runner_ram,
                        name=runtimeContext.name,
                        on_error=runtimeContext.on_error,
                        submit_runner_image=runtimeContext.submit_runner_image,
                        intermediate_output_ttl=runtimeContext.intermediate_output_ttl,
                        merged_map=merged_map,
                        priority=runtimeContext.priority,
                        secret_store=self.secret_store)
            elif self.work_api == "jobs":
                runnerjob = RunnerJob(
                    self,
                    tool,
                    job_order,
                    runtimeContext.enable_reuse,
                    self.output_name,
                    self.output_tags,
                    submit_runner_ram=runtimeContext.submit_runner_ram,
                    name=runtimeContext.name,
                    on_error=runtimeContext.on_error,
                    submit_runner_image=runtimeContext.submit_runner_image,
                    merged_map=merged_map)
        elif runtimeContext.cwl_runner_job is None and self.work_api == "jobs":
            # Create pipeline for local run
            self.pipeline = self.api.pipeline_instances().create(
                body={
                    "owner_uuid":
                    self.project_uuid,
                    "name":
                    runtimeContext.name if runtimeContext.
                    name else shortname(tool.tool["id"]),
                    "components": {},
                    "state":
                    "RunningOnClient"
                }).execute(num_retries=self.num_retries)
            logger.info("Pipeline instance %s", self.pipeline["uuid"])

        if runnerjob and not runtimeContext.wait:
            submitargs = runtimeContext.copy()
            submitargs.submit = False
            runnerjob.run(submitargs)
            return (runnerjob.uuid, "success")

        self.poll_api = arvados.api('v1')
        self.polling_thread = threading.Thread(target=self.poll_states)
        self.polling_thread.start()

        self.task_queue = TaskQueue(self.workflow_eval_lock, self.thread_count)

        if runnerjob:
            jobiter = iter((runnerjob, ))
        else:
            if runtimeContext.cwl_runner_job is not None:
                self.uuid = runtimeContext.cwl_runner_job.get('uuid')
            jobiter = tool.job(job_order, self.output_callback, runtimeContext)

        try:
            self.workflow_eval_lock.acquire()
            # Holds the lock while this code runs and releases it when
            # it is safe to do so in self.workflow_eval_lock.wait(),
            # at which point on_message can update job state and
            # process output callbacks.

            loopperf = Perf(metrics, "jobiter")
            loopperf.__enter__()
            for runnable in jobiter:
                loopperf.__exit__()

                if self.stop_polling.is_set():
                    break

                if self.task_queue.error is not None:
                    raise self.task_queue.error

                if runnable:
                    with Perf(metrics, "run"):
                        self.start_run(runnable, runtimeContext)
                else:
                    if (self.task_queue.in_flight + len(self.processes)) > 0:
                        self.workflow_eval_lock.wait(3)
                    else:
                        logger.error(
                            "Workflow is deadlocked, no runnable processes and not waiting on any pending processes."
                        )
                        break
                loopperf.__enter__()
            loopperf.__exit__()

            while (self.task_queue.in_flight + len(self.processes)) > 0:
                if self.task_queue.error is not None:
                    raise self.task_queue.error
                self.workflow_eval_lock.wait(3)

        except UnsupportedRequirement:
            raise
        except:
            if sys.exc_info()[0] is KeyboardInterrupt or sys.exc_info()[0] is SystemExit:
                logger.error("Interrupted, workflow will be cancelled")
            else:
                logger.error(
                    "Execution failed: %s",
                    sys.exc_info()[1],
                    exc_info=(sys.exc_info()[1] if self.debug else False))
            if self.pipeline:
                self.api.pipeline_instances().update(
                    uuid=self.pipeline["uuid"], body={
                        "state": "Failed"
                    }).execute(num_retries=self.num_retries)
            if runnerjob and runnerjob.uuid and self.work_api == "containers":
                self.api.container_requests().update(
                    uuid=runnerjob.uuid, body={
                        "priority": "0"
                    }).execute(num_retries=self.num_retries)
        finally:
            self.workflow_eval_lock.release()
            self.task_queue.drain()
            self.stop_polling.set()
            self.polling_thread.join()
            self.task_queue.join()

        if self.final_status == "UnsupportedRequirement":
            raise UnsupportedRequirement("Check log for details.")

        if self.final_output is None:
            raise WorkflowException("Workflow did not return a result.")

        if runtimeContext.submit and isinstance(runnerjob, Runner):
            logger.info("Final output collection %s", runnerjob.final_output)
        else:
            if self.output_name is None:
                self.output_name = "Output of %s" % (shortname(
                    tool.tool["id"]))
            if self.output_tags is None:
                self.output_tags = ""

            storage_classes = runtimeContext.storage_classes.strip().split(",")
            self.final_output, self.final_output_collection = self.make_output_collection(
                self.output_name, storage_classes, self.output_tags,
                self.final_output)
            self.set_crunch_output()

        if runtimeContext.compute_checksum:
            adjustDirObjs(self.final_output,
                          partial(get_listing, self.fs_access))
            adjustFileObjs(self.final_output,
                           partial(compute_checksums, self.fs_access))

        if self.trash_intermediate and self.final_status == "success":
            self.trash_intermediate_output()

        return (self.final_output, self.final_status)
Code example #31
File: arvcontainer.py Project: top501/arvados
    def arvados_job_spec(self, dry_run=False, pull_image=True, **kwargs):
        """Create an Arvados container request for this workflow.

        The returned dict can be used to create a container request, passed
        as the +body+ argument to container_requests().create().
        """

        adjustDirObjs(self.job_order, trim_listing)
        adjustFileObjs(self.job_order, trim_anonymous_location)
        adjustDirObjs(self.job_order, trim_anonymous_location)

        container_req = {
            "owner_uuid": self.arvrunner.project_uuid,
            "name": self.name,
            "output_path": "/var/spool/cwl",
            "cwd": "/var/spool/cwl",
            "priority": 1,
            "state": "Committed",
            "container_image": arvados_jobs_image(self.arvrunner,
                                                  self.jobs_image),
            "mounts": {
                "/var/lib/cwl/cwl.input.json": {
                    "kind": "json",
                    "content": self.job_order
                },
                "stdout": {
                    "kind": "file",
                    "path": "/var/spool/cwl/cwl.output.json"
                },
                "/var/spool/cwl": {
                    "kind": "collection",
                    "writable": True
                }
            },
            "runtime_constraints": {
                "vcpus": 1,
                "ram": 1024 * 1024 * self.submit_runner_ram,
                "API": True
            },
            "properties": {}
        }

        if self.tool.tool.get("id", "").startswith("keep:"):
            sp = self.tool.tool["id"].split('/')
            workflowcollection = sp[0][5:]
            workflowname = "/".join(sp[1:])
            workflowpath = "/var/lib/cwl/workflow/%s" % workflowname
            container_req["mounts"]["/var/lib/cwl/workflow"] = {
                "kind": "collection",
                "portable_data_hash": "%s" % workflowcollection
            }
        else:
            packed = packed_workflow(self.arvrunner, self.tool)
            workflowpath = "/var/lib/cwl/workflow.json#main"
            container_req["mounts"]["/var/lib/cwl/workflow.json"] = {
                "kind": "json",
                "content": packed
            }
            if self.tool.tool.get("id", "").startswith("arvwf:"):
                container_req["properties"]["template_uuid"] = self.tool.tool[
                    "id"][6:33]

        command = [
            "arvados-cwl-runner", "--local", "--api=containers",
            "--no-log-timestamps"
        ]
        if self.output_name:
            command.append("--output-name=" + self.output_name)
            container_req["output_name"] = self.output_name

        if self.output_tags:
            command.append("--output-tags=" + self.output_tags)

        if kwargs.get("debug"):
            command.append("--debug")

        if self.enable_reuse:
            command.append("--enable-reuse")
        else:
            command.append("--disable-reuse")

        if self.on_error:
            command.append("--on-error=" + self.on_error)

        if self.intermediate_output_ttl:
            command.append("--intermediate-output-ttl=%d" %
                           self.intermediate_output_ttl)

        if self.arvrunner.trash_intermediate:
            command.append("--trash-intermediate")

        command.extend([workflowpath, "/var/lib/cwl/cwl.input.json"])

        container_req["command"] = command

        return container_req
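
For orientation, here is a minimal usage sketch (not from the original source): assuming `runner` is an already-configured instance of the class above and the Arvados Python SDK is available, the returned dict can be submitted exactly as the docstring describes.

import arvados

# Build the container request body from the runner object defined above
# (`runner` is an assumption for this sketch).
container_req = runner.arvados_job_spec(debug=False)

# Submit it as the +body+ argument of container_requests().create().
api = arvados.api('v1')
response = api.container_requests().create(body=container_req).execute(num_retries=3)

print("Submitted container request %s (state %s)" % (response["uuid"], response["state"]))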
Code example #32
File: executor.py Project: tfmorris/arvados
    def arv_executor(self, updated_tool, job_order, runtimeContext, logger=None):
        self.debug = runtimeContext.debug

        updated_tool.visit(self.check_features)

        self.project_uuid = runtimeContext.project_uuid
        self.pipeline = None
        self.fs_access = runtimeContext.make_fs_access(runtimeContext.basedir)
        self.secret_store = runtimeContext.secret_store

        self.trash_intermediate = runtimeContext.trash_intermediate
        if self.trash_intermediate and self.work_api != "containers":
            raise Exception("--trash-intermediate is only supported with --api=containers.")

        self.intermediate_output_ttl = runtimeContext.intermediate_output_ttl
        if self.intermediate_output_ttl and self.work_api != "containers":
            raise Exception("--intermediate-output-ttl is only supported with --api=containers.")
        if self.intermediate_output_ttl < 0:
            raise Exception("Invalid value %d for --intermediate-output-ttl, cannot be less than zero" % self.intermediate_output_ttl)

        if runtimeContext.submit_request_uuid and self.work_api != "containers":
            raise Exception("--submit-request-uuid requires containers API, but using '{}' api".format(self.work_api))

        if not runtimeContext.name:
            runtimeContext.name = self.name = updated_tool.tool.get("label") or updated_tool.metadata.get("label") or os.path.basename(updated_tool.tool["id"])

        # Upload local file references in the job order.
        job_order = upload_job_order(self, "%s input" % runtimeContext.name,
                                     updated_tool, job_order)

        # the last clause means: if it is a command line tool, and we
        # are going to wait for the result, and always_submit_runner
        # is false, then we don't submit a runner process.

        submitting = (runtimeContext.update_workflow or
                      runtimeContext.create_workflow or
                      (runtimeContext.submit and not
                       (updated_tool.tool["class"] == "CommandLineTool" and
                        runtimeContext.wait and
                        not runtimeContext.always_submit_runner)))

        loadingContext = self.loadingContext.copy()
        loadingContext.do_validate = False
        loadingContext.do_update = False
        if submitting:
            # Document may have been auto-updated. Reload the original
            # document with updating disabled because we want to
            # submit the document with its original CWL version, not
            # the auto-updated one.
            tool = load_tool(updated_tool.tool["id"], loadingContext)
        else:
            tool = updated_tool

        # Upload direct dependencies of workflow steps, get back mapping of files to keep references.
        # Also uploads docker images.
        merged_map = upload_workflow_deps(self, tool)

        # Recreate process object (ArvadosWorkflow or
        # ArvadosCommandTool) because tool document may have been
        # updated by upload_workflow_deps in ways that modify
        # inheritance of hints or requirements.
        loadingContext.loader = tool.doc_loader
        loadingContext.avsc_names = tool.doc_schema
        loadingContext.metadata = tool.metadata
        tool = load_tool(tool.tool, loadingContext)

        existing_uuid = runtimeContext.update_workflow
        if existing_uuid or runtimeContext.create_workflow:
            # Create a pipeline template or workflow record and exit.
            if self.work_api == "containers":
                return (upload_workflow(self, tool, job_order,
                                        self.project_uuid,
                                        uuid=existing_uuid,
                                        submit_runner_ram=runtimeContext.submit_runner_ram,
                                        name=runtimeContext.name,
                                        merged_map=merged_map),
                        "success")

        self.apply_reqs(job_order, tool)

        self.ignore_docker_for_reuse = runtimeContext.ignore_docker_for_reuse
        self.eval_timeout = runtimeContext.eval_timeout

        runtimeContext = runtimeContext.copy()
        runtimeContext.use_container = True
        runtimeContext.tmpdir_prefix = "tmp"
        runtimeContext.work_api = self.work_api

        if self.work_api == "containers":
            if self.ignore_docker_for_reuse:
                raise Exception("--ignore-docker-for-reuse not supported with containers API.")
            runtimeContext.outdir = "/var/spool/cwl"
            runtimeContext.docker_outdir = "/var/spool/cwl"
            runtimeContext.tmpdir = "/tmp"
            runtimeContext.docker_tmpdir = "/tmp"

        if runtimeContext.priority < 1 or runtimeContext.priority > 1000:
            raise Exception("--priority must be in the range 1..1000.")

        if self.should_estimate_cache_size:
            visited = set()
            estimated_size = [0]
            def estimate_collection_cache(obj):
                if obj.get("location", "").startswith("keep:"):
                    m = pdh_size.match(obj["location"][5:])
                    if m and m.group(1) not in visited:
                        visited.add(m.group(1))
                        estimated_size[0] += int(m.group(2))
            visit_class(job_order, ("File", "Directory"), estimate_collection_cache)
            runtimeContext.collection_cache_size = max(((estimated_size[0]*192) // (1024*1024))+1, 256)
            self.collection_cache.set_cap(runtimeContext.collection_cache_size*1024*1024)

        logger.info("Using collection cache size %s MiB", runtimeContext.collection_cache_size)

        runnerjob = None
        if runtimeContext.submit:
            # Submit a runner job to run the workflow for us.
            if self.work_api == "containers":
                if submitting:
                    tool = RunnerContainer(self, updated_tool,
                                           tool, loadingContext, runtimeContext.enable_reuse,
                                           self.output_name,
                                           self.output_tags,
                                           submit_runner_ram=runtimeContext.submit_runner_ram,
                                           name=runtimeContext.name,
                                           on_error=runtimeContext.on_error,
                                           submit_runner_image=runtimeContext.submit_runner_image,
                                           intermediate_output_ttl=runtimeContext.intermediate_output_ttl,
                                           merged_map=merged_map,
                                           priority=runtimeContext.priority,
                                           secret_store=self.secret_store,
                                           collection_cache_size=runtimeContext.collection_cache_size,
                                           collection_cache_is_default=self.should_estimate_cache_size)
                else:
                    runtimeContext.runnerjob = tool.tool["id"]

        if runtimeContext.cwl_runner_job is not None:
            self.uuid = runtimeContext.cwl_runner_job.get('uuid')

        jobiter = tool.job(job_order,
                           self.output_callback,
                           runtimeContext)

        if runtimeContext.submit and not runtimeContext.wait:
            runnerjob = next(jobiter)
            runnerjob.run(runtimeContext)
            return (runnerjob.uuid, "success")

        current_container = arvados_cwl.util.get_current_container(self.api, self.num_retries, logger)
        if current_container:
            logger.info("Running inside container %s", current_container.get("uuid"))

        self.poll_api = arvados.api('v1', timeout=runtimeContext.http_timeout)
        self.polling_thread = threading.Thread(target=self.poll_states)
        self.polling_thread.start()

        self.task_queue = TaskQueue(self.workflow_eval_lock, self.thread_count)

        try:
            self.workflow_eval_lock.acquire()

            # Holds the lock while this code runs and releases it when
            # it is safe to do so in self.workflow_eval_lock.wait(),
            # at which point on_message can update job state and
            # process output callbacks.

            loopperf = Perf(metrics, "jobiter")
            loopperf.__enter__()
            for runnable in jobiter:
                loopperf.__exit__()

                if self.stop_polling.is_set():
                    break

                if self.task_queue.error is not None:
                    raise self.task_queue.error

                if runnable:
                    with Perf(metrics, "run"):
                        self.start_run(runnable, runtimeContext)
                else:
                    if (self.task_queue.in_flight + len(self.processes)) > 0:
                        self.workflow_eval_lock.wait(3)
                    else:
                        logger.error("Workflow is deadlocked, no runnable processes and not waiting on any pending processes.")
                        break

                if self.stop_polling.is_set():
                    break

                loopperf.__enter__()
            loopperf.__exit__()

            while (self.task_queue.in_flight + len(self.processes)) > 0:
                if self.task_queue.error is not None:
                    raise self.task_queue.error
                self.workflow_eval_lock.wait(3)

        except UnsupportedRequirement:
            raise
        except:
            if sys.exc_info()[0] is KeyboardInterrupt or sys.exc_info()[0] is SystemExit:
                logger.error("Interrupted, workflow will be cancelled")
            elif isinstance(sys.exc_info()[1], WorkflowException):
                logger.error("Workflow execution failed:\n%s", sys.exc_info()[1], exc_info=(sys.exc_info()[1] if self.debug else False))
            else:
                logger.exception("Workflow execution failed")

            if self.pipeline:
                self.api.pipeline_instances().update(uuid=self.pipeline["uuid"],
                                                     body={"state": "Failed"}).execute(num_retries=self.num_retries)

            if self.work_api == "containers" and not current_container:
                # Not running in a crunch container, so cancel any outstanding processes.
                for p in self.processes:
                    try:
                        self.api.container_requests().update(uuid=p,
                                                             body={"priority": "0"}
                        ).execute(num_retries=self.num_retries)
                    except Exception:
                        pass
        finally:
            self.workflow_eval_lock.release()
            self.task_queue.drain()
            self.stop_polling.set()
            self.polling_thread.join()
            self.task_queue.join()

        if self.final_status == "UnsupportedRequirement":
            raise UnsupportedRequirement("Check log for details.")

        if self.final_output is None:
            raise WorkflowException("Workflow did not return a result.")

        if runtimeContext.submit and isinstance(tool, Runner):
            logger.info("Final output collection %s", tool.final_output)
        else:
            if self.output_name is None:
                self.output_name = "Output of %s" % (shortname(tool.tool["id"]))
            if self.output_tags is None:
                self.output_tags = ""

            storage_classes = runtimeContext.storage_classes.strip().split(",")
            self.final_output, self.final_output_collection = self.make_output_collection(self.output_name, storage_classes, self.output_tags, self.final_output)
            self.set_crunch_output()

        if runtimeContext.compute_checksum:
            adjustDirObjs(self.final_output, partial(get_listing, self.fs_access))
            adjustFileObjs(self.final_output, partial(compute_checksums, self.fs_access))

        if self.trash_intermediate and self.final_status == "success":
            self.trash_intermediate_output()

        return (self.final_output, self.final_status)
Code example #33
File: arvworkflow.py Project: tomclegg/arvados
    def job(self, joborder, output_callback, **kwargs):
        kwargs["work_api"] = self.work_api
        req, _ = self.get_requirement("http://arvados.org/cwl#RunInSingleContainer")
        if req:
            with SourceLine(self.tool, None, WorkflowException):
                if "id" not in self.tool:
                    raise WorkflowException("%s object must have 'id'" % (self.tool["class"]))
            document_loader, workflowobj, uri = (self.doc_loader, self.doc_loader.fetch(self.tool["id"]), self.tool["id"])

            with Perf(metrics, "subworkflow upload_deps"):
                upload_dependencies(self.arvrunner,
                                    os.path.basename(joborder.get("id", "#")),
                                    document_loader,
                                    joborder,
                                    joborder.get("id", "#"),
                                    False)

                if self.wf_pdh is None:
                    workflowobj["requirements"] = dedup_reqs(self.requirements)
                    workflowobj["hints"] = dedup_reqs(self.hints)

                    packed = pack(document_loader, workflowobj, uri, self.metadata)

                    upload_dependencies(self.arvrunner,
                                        kwargs.get("name", ""),
                                        document_loader,
                                        packed,
                                        uri,
                                        False)

            with Perf(metrics, "subworkflow adjust"):
                joborder_keepmount = copy.deepcopy(joborder)

                def keepmount(obj):
                    with SourceLine(obj, None, WorkflowException):
                        if "location" not in obj:
                            raise WorkflowException("%s object is missing required 'location' field: %s" % (obj["class"], obj))
                    with SourceLine(obj, "location", WorkflowException):
                        if obj["location"].startswith("keep:"):
                            obj["location"] = "/keep/" + obj["location"][5:]
                            if "listing" in obj:
                                del obj["listing"]
                        elif obj["location"].startswith("_:"):
                            del obj["location"]
                        else:
                            raise WorkflowException("Location is not a keep reference or a literal: '%s'" % obj["location"])

                adjustFileObjs(joborder_keepmount, keepmount)
                adjustDirObjs(joborder_keepmount, keepmount)

                if self.wf_pdh is None:
                    adjustFileObjs(packed, keepmount)
                    adjustDirObjs(packed, keepmount)
                    self.wf_pdh = upload_workflow_collection(self.arvrunner, shortname(self.tool["id"]), packed)

            wf_runner = cmap({
                "class": "CommandLineTool",
                "baseCommand": "cwltool",
                "inputs": self.tool["inputs"],
                "outputs": self.tool["outputs"],
                "stdout": "cwl.output.json",
                "requirements": workflowobj["requirements"]+[
                    {
                    "class": "InitialWorkDirRequirement",
                    "listing": [{
                            "entryname": "workflow.cwl",
                            "entry": {
                                "class": "File",
                                "location": "keep:%s/workflow.cwl" % self.wf_pdh
                            }
                        }, {
                            "entryname": "cwl.input.yml",
                            "entry": json.dumps(joborder_keepmount, indent=2, sort_keys=True, separators=(',',': ')).replace("\\", "\\\\").replace('$(', '\$(').replace('${', '\${')
                        }]
                }],
                "hints": workflowobj["hints"],
                "arguments": ["--no-container", "--move-outputs", "--preserve-entire-environment", "workflow.cwl#main", "cwl.input.yml"]
            })
            kwargs["loader"] = self.doc_loader
            kwargs["avsc_names"] = self.doc_schema
            return ArvadosCommandTool(self.arvrunner, wf_runner, **kwargs).job(joborder, output_callback, **kwargs)
        else:
            return super(ArvadosWorkflow, self).job(joborder, output_callback, **kwargs)
Code example #34
File: runner.py Project: thehyve/arvados
def upload_dependencies(arvrunner, name, document_loader,
                        workflowobj, uri, loadref_run, include_primary=True):
    """Upload the dependencies of the workflowobj document to Keep.

    Returns a pathmapper object mapping local paths to keep references.  Also
    does an in-place update of references in "workflowobj".

    Use scandeps to find $import, $include, $schemas, run, File and Directory
    fields that represent external references.

    If workflowobj has an "id" field, this will reload the document to ensure
    it is scanning the raw document prior to preprocessing.
    """

    loaded = set()
    def loadref(b, u):
        joined = document_loader.fetcher.urljoin(b, u)
        defrg, _ = urlparse.urldefrag(joined)
        if defrg not in loaded:
            loaded.add(defrg)
            # Use fetch_text to get raw file (before preprocessing).
            text = document_loader.fetch_text(defrg)
            if isinstance(text, bytes):
                textIO = StringIO(text.decode('utf-8'))
            else:
                textIO = StringIO(text)
            return yaml.safe_load(textIO)
        else:
            return {}

    if loadref_run:
        loadref_fields = set(("$import", "run"))
    else:
        loadref_fields = set(("$import",))

    scanobj = workflowobj
    if "id" in workflowobj:
        # Need raw file content (before preprocessing) to ensure
        # that external references in $include and $mixin are captured.
        scanobj = loadref("", workflowobj["id"])

    sc = scandeps(uri, scanobj,
                  loadref_fields,
                  set(("$include", "$schemas", "location")),
                  loadref, urljoin=document_loader.fetcher.urljoin)

    normalizeFilesDirs(sc)

    if include_primary and "id" in workflowobj:
        sc.append({"class": "File", "location": workflowobj["id"]})

    if "$schemas" in workflowobj:
        for s in workflowobj["$schemas"]:
            sc.append({"class": "File", "location": s})

    def capture_default(obj):
        remove = [False]
        def add_default(f):
            if "location" not in f and "path" in f:
                f["location"] = f["path"]
                del f["path"]
            if "location" in f and not arvrunner.fs_access.exists(f["location"]):
                # Remove from sc
                sc[:] = [x for x in sc if x["location"] != f["location"]]
                # Delete "default" from workflowobj
                remove[0] = True
        visit_class(obj["default"], ("File", "Directory"), add_default)
        if remove[0]:
            del obj["default"]

    find_defaults(workflowobj, capture_default)

    mapper = ArvPathMapper(arvrunner, sc, "",
                           "keep:%s",
                           "keep:%s/%s",
                           name=name,
                           single_collection=True)

    def setloc(p):
        if "location" in p and (not p["location"].startswith("_:")) and (not p["location"].startswith("keep:")):
            p["location"] = mapper.mapper(p["location"]).resolved
    adjustFileObjs(workflowobj, setloc)
    adjustDirObjs(workflowobj, setloc)

    if "$schemas" in workflowobj:
        sch = []
        for s in workflowobj["$schemas"]:
            sch.append(mapper.mapper(s).resolved)
        workflowobj["$schemas"] = sch

    return mapper
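
As a rough illustration (the runner, loader and workflow object below are assumed to be set up by the caller, and the printed path is invented), the pathmapper returned above can be inspected to see where each local reference ended up, mirroring the setloc() helper:

mapper = upload_dependencies(arvrunner,
                             "example workflow",
                             document_loader,
                             workflowobj,
                             workflowobj["id"],
                             False)

# Every non-literal, non-keep location in workflowobj has been rewritten in
# place; the mapper records the same translation explicitly.
for src, entry in mapper.items():
    print(src, "->", entry.resolved)   # e.g. keep:<portable data hash>/input.fastq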
Code example #35
File: __init__.py Project: 151706061/arvados
    def arvExecutor(self, tool, job_order, **kwargs):
        self.debug = kwargs.get("debug")

        tool.visit(self.check_writable)

        if kwargs.get("quiet"):
            logger.setLevel(logging.WARN)
            logging.getLogger('arvados.arv-run').setLevel(logging.WARN)

        useruuid = self.api.users().current().execute()["uuid"]
        self.project_uuid = kwargs.get("project_uuid") if kwargs.get(
            "project_uuid") else useruuid
        self.pipeline = None
        make_fs_access = kwargs.get("make_fs_access") or partial(
            CollectionFsAccess, api_client=self.api)
        self.fs_access = make_fs_access(kwargs["basedir"])

        if kwargs.get("create_template"):
            tmpl = RunnerTemplate(self, tool, job_order,
                                  kwargs.get("enable_reuse"))
            tmpl.save()
            # cwltool.main will write our return value to stdout.
            return tmpl.uuid

        self.debug = kwargs.get("debug")
        self.ignore_docker_for_reuse = kwargs.get("ignore_docker_for_reuse")

        kwargs["make_fs_access"] = make_fs_access
        kwargs["enable_reuse"] = kwargs.get("enable_reuse")
        kwargs["use_container"] = True
        kwargs["tmpdir_prefix"] = "tmp"
        kwargs["on_error"] = "continue"
        kwargs["compute_checksum"] = kwargs.get("compute_checksum")

        if self.work_api == "containers":
            kwargs["outdir"] = "/var/spool/cwl"
            kwargs["docker_outdir"] = "/var/spool/cwl"
            kwargs["tmpdir"] = "/tmp"
            kwargs["docker_tmpdir"] = "/tmp"
        elif self.work_api == "jobs":
            kwargs["outdir"] = "$(task.outdir)"
            kwargs["docker_outdir"] = "$(task.outdir)"
            kwargs["tmpdir"] = "$(task.tmpdir)"

        runnerjob = None
        if kwargs.get("submit"):
            if self.work_api == "containers":
                if tool.tool["class"] == "CommandLineTool":
                    runnerjob = tool.job(job_order, self.output_callback,
                                         **kwargs).next()
                else:
                    runnerjob = RunnerContainer(self, tool, job_order,
                                                kwargs.get("enable_reuse"))
            else:
                runnerjob = RunnerJob(self, tool, job_order,
                                      kwargs.get("enable_reuse"))

        if not kwargs.get("submit") and "cwl_runner_job" not in kwargs and self.work_api != "containers":
            # Create pipeline for local run
            self.pipeline = self.api.pipeline_instances().create(
                body={
                    "owner_uuid": self.project_uuid,
                    "name": shortname(tool.tool["id"]),
                    "components": {},
                    "state": "RunningOnClient"
                }).execute(num_retries=self.num_retries)
            logger.info("Pipeline instance %s", self.pipeline["uuid"])

        if runnerjob and not kwargs.get("wait"):
            runnerjob.run()
            return runnerjob.uuid

        self.poll_api = arvados.api('v1')
        self.polling_thread = threading.Thread(target=self.poll_states)
        self.polling_thread.start()

        if runnerjob:
            jobiter = iter((runnerjob, ))
        else:
            if "cwl_runner_job" in kwargs:
                self.uuid = kwargs.get("cwl_runner_job").get('uuid')
            jobiter = tool.job(job_order, self.output_callback, **kwargs)

        try:
            self.cond.acquire()
            # Will continue to hold the lock for the duration of this code
            # except when in cond.wait(), at which point on_message can update
            # job state and process output callbacks.

            for runnable in jobiter:
                if runnable:
                    runnable.run(**kwargs)
                else:
                    if self.processes:
                        self.cond.wait(1)
                    else:
                        logger.error(
                            "Workflow is deadlocked, no runnable jobs and not waiting on any pending jobs."
                        )
                        break

            while self.processes:
                self.cond.wait(1)

        except UnsupportedRequirement:
            raise
        except:
            if sys.exc_info()[0] is KeyboardInterrupt:
                logger.error("Interrupted, marking pipeline as failed")
            else:
                logger.error(
                    "Caught unhandled exception, marking pipeline as failed.  Error was: %s",
                    sys.exc_info()[1],
                    exc_info=(sys.exc_info()[1] if self.debug else False))
            if self.pipeline:
                self.api.pipeline_instances().update(
                    uuid=self.pipeline["uuid"], body={
                        "state": "Failed"
                    }).execute(num_retries=self.num_retries)
            if runnerjob and runnerjob.uuid and self.work_api == "containers":
                self.api.container_requests().update(
                    uuid=runnerjob.uuid, body={
                        "priority": "0"
                    }).execute(num_retries=self.num_retries)
        finally:
            self.cond.release()
            self.stop_polling.set()
            self.polling_thread.join()

        if self.final_status == "UnsupportedRequirement":
            raise UnsupportedRequirement("Check log for details.")

        if self.final_status != "success":
            raise WorkflowException("Workflow failed.")

        if self.final_output is None:
            raise WorkflowException("Workflow did not return a result.")

        if kwargs.get("compute_checksum"):
            adjustFileObjs(self.final_output,
                           partial(compute_checksums, self.fs_access))

        return self.final_output
Code example #36
File: arvcontainer.py Project: tomclegg/arvados
    def arvados_job_spec(self, dry_run=False, pull_image=True, **kwargs):
        """Create an Arvados container request for this workflow.

        The returned dict can be used to create a container request, passed
        as the +body+ argument to container_requests().create().
        """

        adjustDirObjs(self.job_order, trim_listing)
        adjustFileObjs(self.job_order, trim_anonymous_location)
        adjustDirObjs(self.job_order, trim_anonymous_location)

        container_req = {
            "owner_uuid": self.arvrunner.project_uuid,
            "name": self.name,
            "output_path": "/var/spool/cwl",
            "cwd": "/var/spool/cwl",
            "priority": 1,
            "state": "Committed",
            "container_image": arvados_jobs_image(self.arvrunner, self.jobs_image),
            "mounts": {
                "/var/lib/cwl/cwl.input.json": {
                    "kind": "json",
                    "content": self.job_order
                },
                "stdout": {
                    "kind": "file",
                    "path": "/var/spool/cwl/cwl.output.json"
                },
                "/var/spool/cwl": {
                    "kind": "collection",
                    "writable": True
                }
            },
            "runtime_constraints": {
                "vcpus": 1,
                "ram": 1024*1024 * self.submit_runner_ram,
                "API": True
            },
            "properties": {}
        }

        if self.tool.tool.get("id", "").startswith("keep:"):
            sp = self.tool.tool["id"].split('/')
            workflowcollection = sp[0][5:]
            workflowname = "/".join(sp[1:])
            workflowpath = "/var/lib/cwl/workflow/%s" % workflowname
            container_req["mounts"]["/var/lib/cwl/workflow"] = {
                "kind": "collection",
                "portable_data_hash": "%s" % workflowcollection
            }
        else:
            packed = packed_workflow(self.arvrunner, self.tool)
            workflowpath = "/var/lib/cwl/workflow.json#main"
            container_req["mounts"]["/var/lib/cwl/workflow.json"] = {
                "kind": "json",
                "content": packed
            }
            if self.tool.tool.get("id", "").startswith("arvwf:"):
                container_req["properties"]["template_uuid"] = self.tool.tool["id"][6:33]

        command = ["arvados-cwl-runner", "--local", "--api=containers", "--no-log-timestamps"]
        if self.output_name:
            command.append("--output-name=" + self.output_name)
            container_req["output_name"] = self.output_name

        if self.output_tags:
            command.append("--output-tags=" + self.output_tags)

        if kwargs.get("debug"):
            command.append("--debug")

        if self.enable_reuse:
            command.append("--enable-reuse")
        else:
            command.append("--disable-reuse")

        if self.on_error:
            command.append("--on-error=" + self.on_error)

        command.extend([workflowpath, "/var/lib/cwl/cwl.input.json"])

        container_req["command"] = command

        return container_req
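
A hedged sanity-check sketch (the `runner` instance is an assumption): regardless of how the workflow is mounted, the assembled command always starts with the fixed arvados-cwl-runner invocation and ends with the input JSON path, which the snippet below simply asserts.

spec = runner.arvados_job_spec(debug=False)

# The first four elements and the final element are fixed by the code above;
# optional flags such as --output-name or --enable-reuse sit in between.
assert spec["command"][:4] == ["arvados-cwl-runner", "--local",
                               "--api=containers", "--no-log-timestamps"]
assert spec["command"][-1] == "/var/lib/cwl/cwl.input.json"

print(spec["command"])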
Code example #37
File: crunch_script.py Project: wtsi-hgi/arvados
def run():
    # Timestamps are added by crunch-job, so don't print redundant timestamps.
    arvados.log_handler.setFormatter(logging.Formatter('%(name)s %(levelname)s: %(message)s'))

    # Print package versions
    logger.info(arvados_cwl.versionstring())

    api = arvados.api("v1")

    arvados_cwl.add_arv_hints()

    runner = None
    try:
        job_order_object = arvados.current_job()['script_parameters']
        toolpath = "file://%s/%s" % (os.environ['TASK_KEEPMOUNT'], job_order_object.pop("cwl:tool"))

        pdh_path = re.compile(r'^[0-9a-f]{32}\+\d+(/.+)?$')

        def keeppath(v):
            if pdh_path.match(v):
                return "keep:%s" % v
            else:
                return v

        def keeppathObj(v):
            if "location" in v:
                v["location"] = keeppath(v["location"])

        for k,v in job_order_object.items():
            if isinstance(v, basestring) and arvados.util.keep_locator_pattern.match(v):
                job_order_object[k] = {
                    "class": "File",
                    "location": "keep:%s" % v
                }

        adjustFileObjs(job_order_object, keeppathObj)
        adjustDirObjs(job_order_object, keeppathObj)
        normalizeFilesDirs(job_order_object)

        output_name = None
        output_tags = None
        enable_reuse = True
        on_error = "continue"
        debug = False

        if "arv:output_name" in job_order_object:
            output_name = job_order_object["arv:output_name"]
            del job_order_object["arv:output_name"]

        if "arv:output_tags" in job_order_object:
            output_tags = job_order_object["arv:output_tags"]
            del job_order_object["arv:output_tags"]

        if "arv:enable_reuse" in job_order_object:
            enable_reuse = job_order_object["arv:enable_reuse"]
            del job_order_object["arv:enable_reuse"]

        if "arv:on_error" in job_order_object:
            on_error = job_order_object["arv:on_error"]
            del job_order_object["arv:on_error"]

        if "arv:debug" in job_order_object:
            debug = job_order_object["arv:debug"]
            del job_order_object["arv:debug"]

        arvargs = argparse.Namespace()
        arvargs.work_api = "jobs"
        arvargs.output_name = output_name
        arvargs.output_tags = output_tags
        arvargs.thread_count = 1

        runner = arvados_cwl.ArvCwlRunner(api_client=arvados.safeapi.ThreadSafeApiCache(
            api_params={"model": OrderedJsonModel()}, keep_params={"num_retries": 4}),
                                          arvargs=arvargs)

        make_fs_access = functools.partial(CollectionFsAccess,
                                 collection_cache=runner.collection_cache)

        t = load_tool(toolpath, runner.loadingContext)

        if debug:
            logger.setLevel(logging.DEBUG)
            logging.getLogger('arvados').setLevel(logging.DEBUG)
            logging.getLogger("cwltool").setLevel(logging.DEBUG)

        args = ArvRuntimeContext(vars(arvargs))
        args.project_uuid = arvados.current_job()["owner_uuid"]
        args.enable_reuse = enable_reuse
        args.on_error = on_error
        args.submit = False
        args.debug = debug
        args.quiet = False
        args.ignore_docker_for_reuse = False
        args.basedir = os.getcwd()
        args.name = None
        args.cwl_runner_job={"uuid": arvados.current_job()["uuid"], "state": arvados.current_job()["state"]}
        args.make_fs_access = make_fs_access
        args.trash_intermediate = False
        args.intermediate_output_ttl = 0
        args.priority = arvados_cwl.DEFAULT_PRIORITY
        args.do_validate = True
        args.disable_js_validation = False
        args.tmp_outdir_prefix = "tmp"

        runner.arv_executor(t, job_order_object, args, logger=logger)
    except Exception as e:
        if isinstance(e, WorkflowException):
            logging.info("Workflow error %s", e)
        else:
            logging.exception("Unhandled exception")
        if runner and runner.final_output_collection:
            outputCollection = runner.final_output_collection.portable_data_hash()
        else:
            outputCollection = None
        api.job_tasks().update(uuid=arvados.current_task()['uuid'],
                               body={
                                   'output': outputCollection,
                                   'success': False,
                                   'progress': 1.0
                               }).execute()
Code example #38
def add_details_to_outputs(outputs):
    adjustFileObjs(outputs, expand_to_file_uri)
    adjustDirObjs(outputs, expand_to_file_uri)
    normalizeFilesDirs(outputs)
    adjustFileObjs(outputs, partial(compute_checksums, StdFsAccess("")))
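
A small usage sketch (the output name and path are invented, and the file is assumed to exist locally): add_details_to_outputs() works in place, letting the expand_to_file_uri() helper defined elsewhere turn relative locations into file:// URIs before compute_checksums() fills in checksum and size.

outputs = {
    "report": {
        "class": "File",
        "location": "results/report.txt"
    }
}

add_details_to_outputs(outputs)

print(outputs["report"]["location"])      # now an absolute file:// URI
print(outputs["report"].get("checksum"))  # e.g. sha1$<digest>
print(outputs["report"].get("size"))      # size in bytes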
Code example #39
File: runner.py Project: hotkee/arvados
def upload_dependencies(arvrunner, name, document_loader, workflowobj, uri,
                        loadref_run):
    """Upload the dependencies of the workflowobj document to Keep.

    Returns a pathmapper object mapping local paths to keep references.  Also
    does an in-place update of references in "workflowobj".

    Use scandeps to find $import, $include, $schemas, run, File and Directory
    fields that represent external references.

    If workflowobj has an "id" field, this will reload the document to ensure
    it is scanning the raw document prior to preprocessing.
    """

    loaded = set()

    def loadref(b, u):
        joined = document_loader.fetcher.urljoin(b, u)
        defrg, _ = urlparse.urldefrag(joined)
        if defrg not in loaded:
            loaded.add(defrg)
            # Use fetch_text to get raw file (before preprocessing).
            text = document_loader.fetch_text(defrg)
            if isinstance(text, bytes):
                textIO = StringIO(text.decode('utf-8'))
            else:
                textIO = StringIO(text)
            return yaml.safe_load(textIO)
        else:
            return {}

    if loadref_run:
        loadref_fields = set(("$import", "run"))
    else:
        loadref_fields = set(("$import", ))

    scanobj = workflowobj
    if "id" in workflowobj:
        # Need raw file content (before preprocessing) to ensure
        # that external references in $include and $mixin are captured.
        scanobj = loadref("", workflowobj["id"])

    sc = scandeps(uri,
                  scanobj,
                  loadref_fields,
                  set(("$include", "$schemas", "location")),
                  loadref,
                  urljoin=document_loader.fetcher.urljoin)

    normalizeFilesDirs(sc)

    if "id" in workflowobj:
        sc.append({"class": "File", "location": workflowobj["id"]})

    mapper = ArvPathMapper(arvrunner,
                           sc,
                           "",
                           "keep:%s",
                           "keep:%s/%s",
                           name=name)

    def setloc(p):
        if "location" in p and (not p["location"].startswith("_:")) and (
                not p["location"].startswith("keep:")):
            p["location"] = mapper.mapper(p["location"]).resolved

    adjustFileObjs(workflowobj, setloc)
    adjustDirObjs(workflowobj, setloc)

    return mapper
Code example #40
File: executor.py Project: tfmorris/arvados
    def make_output_collection(self, name, storage_classes, tagsString, outputObj):
        outputObj = copy.deepcopy(outputObj)

        files = []
        def capture(fileobj):
            files.append(fileobj)

        adjustDirObjs(outputObj, capture)
        adjustFileObjs(outputObj, capture)

        generatemapper = NoFollowPathMapper(files, "", "", separateDirs=False)

        final = arvados.collection.Collection(api_client=self.api,
                                              keep_client=self.keep_client,
                                              num_retries=self.num_retries)

        for k, v in generatemapper.items():
            if v.type == "Directory" and v.resolved.startswith("_:"):
                continue
            if v.type == "CreateFile" and (k.startswith("_:") or v.resolved.startswith("_:")):
                with final.open(v.target, "wb") as f:
                    f.write(v.resolved.encode("utf-8"))
                    continue

            if not v.resolved.startswith("keep:"):
                raise Exception("Output source is not in keep or a literal")
            sp = v.resolved.split("/")
            srccollection = sp[0][5:]
            try:
                reader = self.collection_cache.get(srccollection)
                srcpath = "/".join(sp[1:]) if len(sp) > 1 else "."
                final.copy(srcpath, v.target, source_collection=reader, overwrite=False)
            except arvados.errors.ArgumentError as e:
                logger.error("Creating CollectionReader for '%s' '%s': %s", k, v, e)
                raise
            except IOError as e:
                logger.error("While preparing output collection: %s", e)
                raise

        def rewrite(fileobj):
            fileobj["location"] = generatemapper.mapper(fileobj["location"]).target
            for k in ("listing", "contents", "nameext", "nameroot", "dirname"):
                if k in fileobj:
                    del fileobj[k]

        adjustDirObjs(outputObj, rewrite)
        adjustFileObjs(outputObj, rewrite)

        with final.open("cwl.output.json", "w") as f:
            res = str(json.dumps(outputObj, sort_keys=True, indent=4, separators=(',',': '), ensure_ascii=False))
            f.write(res)

        final.save_new(name=name, owner_uuid=self.project_uuid, storage_classes=storage_classes, ensure_unique_name=True)

        logger.info("Final output collection %s \"%s\" (%s)", final.portable_data_hash(),
                    final.api_response()["name"],
                    final.manifest_locator())

        final_uuid = final.manifest_locator()
        tags = tagsString.split(',')
        for tag in tags:
            self.api.links().create(body={
                "head_uuid": final_uuid, "link_class": "tag", "name": tag
            }).execute(num_retries=self.num_retries)

        def finalcollection(fileobj):
            fileobj["location"] = "keep:%s/%s" % (final.portable_data_hash(), fileobj["location"])

        adjustDirObjs(outputObj, finalcollection)
        adjustFileObjs(outputObj, finalcollection)

        return (outputObj, final)
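
The copy loop above splits each resolved keep: reference into a source collection (the portable data hash) and a path within that collection. A small illustrative helper, not part of the code above, could be:

def split_keep_reference(resolved):
    # Split "keep:<pdh>/sub/path" into (pdh, "sub/path"); a bare
    # "keep:<pdh>" maps to the collection root ".".
    if not resolved.startswith("keep:"):
        raise ValueError("Output source is not in keep or a literal: %r" % resolved)
    parts = resolved.split("/")
    collection = parts[0][len("keep:"):]
    subpath = "/".join(parts[1:]) if len(parts) > 1 else "."
    return collection, subpath

print(split_keep_reference("keep:99999999999999999999999999999998+99/dir/out.txt"))
# ('99999999999999999999999999999998+99', 'dir/out.txt')
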
Code example #41
    def job(self, joborder, output_callback, runtimeContext):

        builder = make_builder(joborder, self.hints, self.requirements, runtimeContext)
        runtimeContext = set_cluster_target(self.tool, self.arvrunner, builder, runtimeContext)

        req, _ = self.get_requirement("http://arvados.org/cwl#RunInSingleContainer")
        if not req:
            return super(ArvadosWorkflow, self).job(joborder, output_callback, runtimeContext)

        # RunInSingleContainer is true

        with SourceLine(self.tool, None, WorkflowException, logger.isEnabledFor(logging.DEBUG)):
            if "id" not in self.tool:
                raise WorkflowException("%s object must have 'id'" % (self.tool["class"]))

        discover_secondary_files(self.arvrunner.fs_access, builder,
                                 self.tool["inputs"], joborder)

        with Perf(metrics, "subworkflow upload_deps"):
            upload_dependencies(self.arvrunner,
                                os.path.basename(joborder.get("id", "#")),
                                self.doc_loader,
                                joborder,
                                joborder.get("id", "#"),
                                False)

            if self.wf_pdh is None:
                packed = pack(self.loadingContext, self.tool["id"], loader=self.doc_loader)

                for p in packed["$graph"]:
                    if p["id"] == "#main":
                        p["requirements"] = dedup_reqs(self.requirements)
                        p["hints"] = dedup_reqs(self.hints)

                def visit(item):
                    if "requirements" in item:
                        item["requirements"] = [i for i in item["requirements"] if i["class"] != "DockerRequirement"]
                    for t in ("hints", "requirements"):
                        if t not in item:
                            continue
                        for req in item[t]:
                            if req["class"] == "ResourceRequirement":
                                dyn = False
                                for k in max_res_pars + sum_res_pars:
                                    if k in req:
                                        if isinstance(req[k], basestring):
                                            if item["id"] == "#main":
                                                # only the top-level requirements/hints may contain expressions
                                                self.dynamic_resource_req.append(req)
                                                dyn = True
                                                break
                                            else:
                                                with SourceLine(req, k, WorkflowException):
                                                    raise WorkflowException("Non-top-level ResourceRequirement in single container cannot have expressions")
                                if not dyn:
                                    self.static_resource_req.append(req)

                visit_class(packed["$graph"], ("Workflow", "CommandLineTool"), visit)

                if self.static_resource_req:
                    self.static_resource_req = [get_overall_res_req(self.static_resource_req)]

                upload_dependencies(self.arvrunner,
                                    runtimeContext.name,
                                    self.doc_loader,
                                    packed,
                                    self.tool["id"],
                                    False)

                # Discover files/directories referenced by the
                # workflow (mainly "default" values)
                visit_class(packed, ("File", "Directory"), self.wf_reffiles.append)


        if self.dynamic_resource_req:
            # Evaluate dynamic resource requirements using current builder
            rs = copy.copy(self.static_resource_req)
            for dyn_rs in self.dynamic_resource_req:
                eval_req = {"class": "ResourceRequirement"}
                for a in max_res_pars + sum_res_pars:
                    if a in dyn_rs:
                        eval_req[a] = builder.do_eval(dyn_rs[a])
                rs.append(eval_req)
            job_res_reqs = [get_overall_res_req(rs)]
        else:
            job_res_reqs = self.static_resource_req

        with Perf(metrics, "subworkflow adjust"):
            joborder_resolved = copy.deepcopy(joborder)
            joborder_keepmount = copy.deepcopy(joborder)

            reffiles = []
            visit_class(joborder_keepmount, ("File", "Directory"), reffiles.append)

            mapper = ArvPathMapper(self.arvrunner, reffiles+self.wf_reffiles, runtimeContext.basedir,
                                   "/keep/%s",
                                   "/keep/%s/%s")

            # For containers API, we need to make sure any extra
            # referenced files (i.e. referenced by the workflow but
            # not in the inputs) are included in the mounts.
            if self.wf_reffiles:
                runtimeContext = runtimeContext.copy()
                runtimeContext.extra_reffiles = copy.deepcopy(self.wf_reffiles)

            def keepmount(obj):
                remove_redundant_fields(obj)
                with SourceLine(obj, None, WorkflowException, logger.isEnabledFor(logging.DEBUG)):
                    if "location" not in obj:
                        raise WorkflowException("%s object is missing required 'location' field: %s" % (obj["class"], obj))
                with SourceLine(obj, "location", WorkflowException, logger.isEnabledFor(logging.DEBUG)):
                    if obj["location"].startswith("keep:"):
                        obj["location"] = mapper.mapper(obj["location"]).target
                        if "listing" in obj:
                            del obj["listing"]
                    elif obj["location"].startswith("_:"):
                        del obj["location"]
                    else:
                        raise WorkflowException("Location is not a keep reference or a literal: '%s'" % obj["location"])

            visit_class(joborder_keepmount, ("File", "Directory"), keepmount)

            def resolved(obj):
                if obj["location"].startswith("keep:"):
                    obj["location"] = mapper.mapper(obj["location"]).resolved

            visit_class(joborder_resolved, ("File", "Directory"), resolved)

            if self.wf_pdh is None:
                adjustFileObjs(packed, keepmount)
                adjustDirObjs(packed, keepmount)
                self.wf_pdh = upload_workflow_collection(self.arvrunner, shortname(self.tool["id"]), packed)

        self.loadingContext = self.loadingContext.copy()
        self.loadingContext.metadata = self.loadingContext.metadata.copy()
        self.loadingContext.metadata["http://commonwl.org/cwltool#original_cwlVersion"] = "v1.0"

        if len(job_res_reqs) == 1:
            # RAM request needs to be at least 128 MiB or the workflow
            # runner itself won't run reliably.
            if job_res_reqs[0].get("ramMin", 1024) < 128:
                job_res_reqs[0]["ramMin"] = 128

        arguments = ["--no-container", "--move-outputs", "--preserve-entire-environment", "workflow.cwl", "cwl.input.yml"]
        if runtimeContext.debug:
            arguments.insert(0, '--debug')

        wf_runner = cmap({
            "class": "CommandLineTool",
            "baseCommand": "cwltool",
            "inputs": self.tool["inputs"],
            "outputs": self.tool["outputs"],
            "stdout": "cwl.output.json",
            "requirements": self.requirements+job_res_reqs+[
                {"class": "InlineJavascriptRequirement"},
                {
                "class": "InitialWorkDirRequirement",
                "listing": [{
                        "entryname": "workflow.cwl",
                        "entry": '$({"class": "File", "location": "keep:%s/workflow.cwl"})' % self.wf_pdh
                    }, {
                        "entryname": "cwl.input.yml",
                        "entry": json.dumps(joborder_keepmount, indent=2, sort_keys=True, separators=(',',': ')).replace("\\", "\\\\").replace('$(', '\$(').replace('${', '\${')
                    }]
            }],
            "hints": self.hints,
            "arguments": arguments,
            "id": "#"
        })
        return ArvadosCommandTool(self.arvrunner, wf_runner, self.loadingContext).job(joborder_resolved, output_callback, runtimeContext)
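
The InitialWorkDirRequirement entry above serializes the keep-mounted job order to JSON and escapes backslashes and CWL parameter references so cwltool treats the embedded text as literal data rather than expressions to evaluate. A hedged sketch of that escaping, with escape_for_entry as an illustrative name only:

import json

def escape_for_entry(obj):
    # Double backslashes first, then escape "$(" and "${" so the embedded
    # document is not interpreted as CWL parameter references.
    text = json.dumps(obj, indent=2, sort_keys=True, separators=(',', ': '))
    return text.replace("\\", "\\\\").replace("$(", "\\$(").replace("${", "\\${")

print(escape_for_entry({"pattern": "$(inputs.sample)", "path": "C:\\data"}))
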
Code example #42
File: __init__.py Project: tomclegg/arvados
    def arv_executor(self, tool, job_order, **kwargs):
        self.debug = kwargs.get("debug")

        tool.visit(self.check_features)

        self.project_uuid = kwargs.get("project_uuid")
        self.pipeline = None
        make_fs_access = kwargs.get("make_fs_access") or partial(CollectionFsAccess,
                                                                 collection_cache=self.collection_cache)
        self.fs_access = make_fs_access(kwargs["basedir"])

        if not kwargs.get("name"):
            kwargs["name"] = self.name = tool.tool.get("label") or tool.metadata.get("label") or os.path.basename(tool.tool["id"])

        # Upload direct dependencies of workflow steps, get back mapping of files to keep references.
        # Also uploads docker images.
        upload_workflow_deps(self, tool)

        # Reload tool object which may have been updated by
        # upload_workflow_deps
        tool = self.arv_make_tool(tool.doc_loader.idx[tool.tool["id"]],
                                  makeTool=self.arv_make_tool,
                                  loader=tool.doc_loader,
                                  avsc_names=tool.doc_schema,
                                  metadata=tool.metadata)

        # Upload local file references in the job order.
        job_order = upload_job_order(self, "%s input" % kwargs["name"],
                                     tool, job_order)

        existing_uuid = kwargs.get("update_workflow")
        if existing_uuid or kwargs.get("create_workflow"):
            # Create a pipeline template or workflow record and exit.
            if self.work_api == "jobs":
                tmpl = RunnerTemplate(self, tool, job_order,
                                      kwargs.get("enable_reuse"),
                                      uuid=existing_uuid,
                                      submit_runner_ram=kwargs.get("submit_runner_ram"),
                                      name=kwargs["name"])
                tmpl.save()
                # cwltool.main will write our return value to stdout.
                return (tmpl.uuid, "success")
            elif self.work_api == "containers":
                return (upload_workflow(self, tool, job_order,
                                        self.project_uuid,
                                        uuid=existing_uuid,
                                        submit_runner_ram=kwargs.get("submit_runner_ram"),
                                        name=kwargs["name"]),
                        "success")

        self.ignore_docker_for_reuse = kwargs.get("ignore_docker_for_reuse")

        kwargs["make_fs_access"] = make_fs_access
        kwargs["enable_reuse"] = kwargs.get("enable_reuse")
        kwargs["use_container"] = True
        kwargs["tmpdir_prefix"] = "tmp"
        kwargs["compute_checksum"] = kwargs.get("compute_checksum")

        if self.work_api == "containers":
            kwargs["outdir"] = "/var/spool/cwl"
            kwargs["docker_outdir"] = "/var/spool/cwl"
            kwargs["tmpdir"] = "/tmp"
            kwargs["docker_tmpdir"] = "/tmp"
        elif self.work_api == "jobs":
            kwargs["outdir"] = "$(task.outdir)"
            kwargs["docker_outdir"] = "$(task.outdir)"
            kwargs["tmpdir"] = "$(task.tmpdir)"

        runnerjob = None
        if kwargs.get("submit"):
            # Submit a runner job to run the workflow for us.
            if self.work_api == "containers":
                if tool.tool["class"] == "CommandLineTool":
                    kwargs["runnerjob"] = tool.tool["id"]
                    upload_dependencies(self,
                                        kwargs["name"],
                                        tool.doc_loader,
                                        tool.tool,
                                        tool.tool["id"],
                                        False)
                    runnerjob = tool.job(job_order,
                                         self.output_callback,
                                         **kwargs).next()
                else:
                    runnerjob = RunnerContainer(self, tool, job_order, kwargs.get("enable_reuse"),
                                                self.output_name,
                                                self.output_tags,
                                                submit_runner_ram=kwargs.get("submit_runner_ram"),
                                                name=kwargs.get("name"),
                                                on_error=kwargs.get("on_error"),
                                                submit_runner_image=kwargs.get("submit_runner_image"))
            elif self.work_api == "jobs":
                runnerjob = RunnerJob(self, tool, job_order, kwargs.get("enable_reuse"),
                                      self.output_name,
                                      self.output_tags,
                                      submit_runner_ram=kwargs.get("submit_runner_ram"),
                                      name=kwargs.get("name"),
                                      on_error=kwargs.get("on_error"),
                                      submit_runner_image=kwargs.get("submit_runner_image"))

        if not kwargs.get("submit") and "cwl_runner_job" not in kwargs and self.work_api == "jobs":
            # Create pipeline for local run
            self.pipeline = self.api.pipeline_instances().create(
                body={
                    "owner_uuid": self.project_uuid,
                    "name": kwargs["name"] if kwargs.get("name") else shortname(tool.tool["id"]),
                    "components": {},
                    "state": "RunningOnClient"}).execute(num_retries=self.num_retries)
            logger.info("Pipeline instance %s", self.pipeline["uuid"])

        if runnerjob and not kwargs.get("wait"):
            runnerjob.run(wait=kwargs.get("wait"))
            return (runnerjob.uuid, "success")

        self.poll_api = arvados.api('v1')
        self.polling_thread = threading.Thread(target=self.poll_states)
        self.polling_thread.start()

        if runnerjob:
            jobiter = iter((runnerjob,))
        else:
            if "cwl_runner_job" in kwargs:
                self.uuid = kwargs.get("cwl_runner_job").get('uuid')
            jobiter = tool.job(job_order,
                               self.output_callback,
                               **kwargs)

        try:
            self.cond.acquire()
            # Will continue to hold the lock for the duration of this code
            # except when in cond.wait(), at which point on_message can update
            # job state and process output callbacks.

            loopperf = Perf(metrics, "jobiter")
            loopperf.__enter__()
            for runnable in jobiter:
                loopperf.__exit__()

                if self.stop_polling.is_set():
                    break

                if runnable:
                    with Perf(metrics, "run"):
                        runnable.run(**kwargs)
                else:
                    if self.processes:
                        self.cond.wait(1)
                    else:
                        logger.error("Workflow is deadlocked, no runnable jobs and not waiting on any pending jobs.")
                        break
                loopperf.__enter__()
            loopperf.__exit__()

            while self.processes:
                self.cond.wait(1)

        except UnsupportedRequirement:
            raise
        except:
            if sys.exc_info()[0] is KeyboardInterrupt:
                logger.error("Interrupted, marking pipeline as failed")
            else:
                logger.error("Execution failed: %s", sys.exc_info()[1], exc_info=(sys.exc_info()[1] if self.debug else False))
            if self.pipeline:
                self.api.pipeline_instances().update(uuid=self.pipeline["uuid"],
                                                     body={"state": "Failed"}).execute(num_retries=self.num_retries)
            if runnerjob and runnerjob.uuid and self.work_api == "containers":
                self.api.container_requests().update(uuid=runnerjob.uuid,
                                                     body={"priority": "0"}).execute(num_retries=self.num_retries)
        finally:
            self.cond.release()
            self.stop_polling.set()
            self.polling_thread.join()

        if self.final_status == "UnsupportedRequirement":
            raise UnsupportedRequirement("Check log for details.")

        if self.final_output is None:
            raise WorkflowException("Workflow did not return a result.")

        if kwargs.get("submit") and isinstance(runnerjob, Runner):
            logger.info("Final output collection %s", runnerjob.final_output)
        else:
            if self.output_name is None:
                self.output_name = "Output of %s" % (shortname(tool.tool["id"]))
            if self.output_tags is None:
                self.output_tags = ""
            self.final_output, self.final_output_collection = self.make_output_collection(self.output_name, self.output_tags, self.final_output)
            self.set_crunch_output()

        if kwargs.get("compute_checksum"):
            adjustDirObjs(self.final_output, partial(get_listing, self.fs_access))
            adjustFileObjs(self.final_output, partial(compute_checksums, self.fs_access))

        return (self.final_output, self.final_status)
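
The compute_checksum step above passes functools.partial(compute_checksums, self.fs_access) to adjustFileObjs because the visitor callback receives only the File object, so any extra arguments must be bound up front. A minimal sketch of that idea, using a hypothetical open_local stand-in for fs_access:

import functools
import hashlib

def compute_checksum(open_func, fileobj):
    # Skip files that already carry a checksum; otherwise hash in 1 MiB chunks.
    if "checksum" in fileobj:
        return
    digest = hashlib.sha1()
    with open_func(fileobj["location"]) as f:
        for chunk in iter(lambda: f.read(1024 * 1024), b""):
            digest.update(chunk)
    fileobj["checksum"] = "sha1$%s" % digest.hexdigest()

def open_local(location):
    return open(location, "rb")

visit = functools.partial(compute_checksum, open_local)
# adjustFileObjs(final_output, visit) would then call visit(fileobj) per File.
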
Code example #43
File: __init__.py Project: ginkgobioworks/arvados
    def arvExecutor(self, tool, job_order, **kwargs):
        self.debug = kwargs.get("debug")

        if kwargs.get("quiet"):
            logger.setLevel(logging.WARN)
            logging.getLogger('arvados.arv-run').setLevel(logging.WARN)

        useruuid = self.api.users().current().execute()["uuid"]
        self.project_uuid = kwargs.get("project_uuid") if kwargs.get("project_uuid") else useruuid
        self.pipeline = None
        self.fs_access = CollectionFsAccess(kwargs["basedir"], api_client=self.api)

        if kwargs.get("create_template"):
            tmpl = RunnerTemplate(self, tool, job_order, kwargs.get("enable_reuse"))
            tmpl.save()
            # cwltool.main will write our return value to stdout.
            return tmpl.uuid

        self.debug = kwargs.get("debug")
        self.ignore_docker_for_reuse = kwargs.get("ignore_docker_for_reuse")

        kwargs["fs_access"] = self.fs_access
        kwargs["enable_reuse"] = kwargs.get("enable_reuse")
        kwargs["use_container"] = True
        kwargs["tmpdir_prefix"] = "tmp"
        kwargs["on_error"] = "continue"
        kwargs["compute_checksum"] = kwargs.get("compute_checksum")

        if self.work_api == "containers":
            kwargs["outdir"] = "/var/spool/cwl"
            kwargs["docker_outdir"] = "/var/spool/cwl"
            kwargs["tmpdir"] = "/tmp"
        elif self.work_api == "jobs":
            kwargs["outdir"] = "$(task.outdir)"
            kwargs["docker_outdir"] = "$(task.outdir)"
            kwargs["tmpdir"] = "$(task.tmpdir)"

        runnerjob = None
        if kwargs.get("submit"):
            if self.work_api == "containers":
                if tool.tool["class"] == "CommandLineTool":
                    runnerjob = tool.job(job_order,
                                         self.output_callback,
                                         **kwargs).next()
                else:
                    runnerjob = RunnerContainer(self, tool, job_order, kwargs.get("enable_reuse"))
            else:
                runnerjob = RunnerJob(self, tool, job_order, kwargs.get("enable_reuse"))

        if not kwargs.get("submit") and "cwl_runner_job" not in kwargs and not self.work_api == "containers":
            # Create pipeline for local run
            self.pipeline = self.api.pipeline_instances().create(
                body={
                    "owner_uuid": self.project_uuid,
                    "name": shortname(tool.tool["id"]),
                    "components": {},
                    "state": "RunningOnClient"}).execute(num_retries=self.num_retries)
            logger.info("Pipeline instance %s", self.pipeline["uuid"])

        if runnerjob and not kwargs.get("wait"):
            runnerjob.run()
            return runnerjob.uuid

        arvados.config.settings()["ARVADOS_DISABLE_WEBSOCKETS"] = "1"

        if self.work_api == "containers":
            events = arvados.events.subscribe(arvados.api('v1'), [["object_uuid", "is_a", "arvados#container"]], self.on_message)
        if self.work_api == "jobs":
            events = arvados.events.subscribe(arvados.api('v1'), [["object_uuid", "is_a", "arvados#job"]], self.on_message)

        if runnerjob:
            jobiter = iter((runnerjob,))
        else:
            if "cwl_runner_job" in kwargs:
                self.uuid = kwargs.get("cwl_runner_job").get('uuid')
            jobiter = tool.job(job_order,
                               self.output_callback,
                               **kwargs)

        try:
            self.cond.acquire()
            # Will continue to hold the lock for the duration of this code
            # except when in cond.wait(), at which point on_message can update
            # job state and process output callbacks.

            for runnable in jobiter:
                if runnable:
                    runnable.run(**kwargs)
                else:
                    if self.processes:
                        self.cond.wait(1)
                    else:
                        logger.error("Workflow is deadlocked, no runnable jobs and not waiting on any pending jobs.")
                        break

            while self.processes:
                self.cond.wait(1)

            events.close()
        except UnsupportedRequirement:
            raise
        except:
            if sys.exc_info()[0] is KeyboardInterrupt:
                logger.error("Interrupted, marking pipeline as failed")
            else:
                logger.error("Caught unhandled exception, marking pipeline as failed.  Error was: %s", sys.exc_info()[1], exc_info=(sys.exc_info()[1] if self.debug else False))
            if self.pipeline:
                self.api.pipeline_instances().update(uuid=self.pipeline["uuid"],
                                                     body={"state": "Failed"}).execute(num_retries=self.num_retries)
            if runnerjob and runnerjob.uuid and self.work_api == "containers":
                self.api.container_requests().update(uuid=runnerjob.uuid,
                                                     body={"priority": "0"}).execute(num_retries=self.num_retries)
        finally:
            self.cond.release()

        if self.final_status == "UnsupportedRequirement":
            raise UnsupportedRequirement("Check log for details.")

        if self.final_status != "success":
            raise WorkflowException("Workflow failed.")

        if self.final_output is None:
            raise WorkflowException("Workflow did not return a result.")

        if kwargs.get("compute_checksum"):
            def compute_checksums(fileobj):
                if "checksum" not in fileobj:
                    checksum = hashlib.sha1()
                    with self.fs_access.open(fileobj["location"], "rb") as f:
                        contents = f.read(1024*1024)
                        while contents != "":
                            checksum.update(contents)
                            contents = f.read(1024*1024)
                    fileobj["checksum"] = "sha1$%s" % checksum.hexdigest()

            adjustFileObjs(self.final_output, compute_checksums)

        return self.final_output
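
The run loop above holds the condition lock except while in cond.wait(), which is when on_message can update job state and fire output callbacks. A minimal standalone sketch of that wait pattern (not the executor itself; on_done stands in for a completion callback):

import threading

cond = threading.Condition()
processes = {"job1": "Running"}

def on_done(uuid):
    # Runs on another thread: update shared state under the lock, then notify.
    with cond:
        processes.pop(uuid, None)
        cond.notify()

threading.Timer(0.1, on_done, args=("job1",)).start()

with cond:
    while processes:        # re-check after every wakeup or 1-second timeout
        cond.wait(1)
print("all processes finished")
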
Code example #44
File: arvworkflow.py Project: chapmanb/arvados
    def job(self, joborder, output_callback, runtimeContext):

        builder = make_builder(joborder, self.hints, self.requirements, runtimeContext)
        runtimeContext = set_cluster_target(self.tool, self.arvrunner, builder, runtimeContext)

        req, _ = self.get_requirement("http://arvados.org/cwl#RunInSingleContainer")
        if not req:
            return super(ArvadosWorkflow, self).job(joborder, output_callback, runtimeContext)

        # RunInSingleContainer is true

        with SourceLine(self.tool, None, WorkflowException, logger.isEnabledFor(logging.DEBUG)):
            if "id" not in self.tool:
                raise WorkflowException("%s object must have 'id'" % (self.tool["class"]))
        document_loader, workflowobj, uri = (self.doc_loader, self.doc_loader.fetch(self.tool["id"]), self.tool["id"])

        discover_secondary_files(self.tool["inputs"], joborder)

        with Perf(metrics, "subworkflow upload_deps"):
            upload_dependencies(self.arvrunner,
                                os.path.basename(joborder.get("id", "#")),
                                document_loader,
                                joborder,
                                joborder.get("id", "#"),
                                False)

            if self.wf_pdh is None:
                workflowobj["requirements"] = dedup_reqs(self.requirements)
                workflowobj["hints"] = dedup_reqs(self.hints)

                packed = pack(document_loader, workflowobj, uri, self.metadata)

                def visit(item):
                    for t in ("hints", "requirements"):
                        if t not in item:
                            continue
                        for req in item[t]:
                            if req["class"] == "ResourceRequirement":
                                dyn = False
                                for k in max_res_pars + sum_res_pars:
                                    if k in req:
                                        if isinstance(req[k], basestring):
                                            if item["id"] == "#main":
                                                # only the top-level requirements/hints may contain expressions
                                                self.dynamic_resource_req.append(req)
                                                dyn = True
                                                break
                                            else:
                                                with SourceLine(req, k, WorkflowException):
                                                    raise WorkflowException("Non-top-level ResourceRequirement in single container cannot have expressions")
                                if not dyn:
                                    self.static_resource_req.append(req)
                            if req["class"] == "DockerRequirement":
                                if "http://arvados.org/cwl#dockerCollectionPDH" in req:
                                    del req["http://arvados.org/cwl#dockerCollectionPDH"]

                visit_class(packed["$graph"], ("Workflow", "CommandLineTool"), visit)

                if self.static_resource_req:
                    self.static_resource_req = [get_overall_res_req(self.static_resource_req)]

                upload_dependencies(self.arvrunner,
                                    runtimeContext.name,
                                    document_loader,
                                    packed,
                                    uri,
                                    False)

                # Discover files/directories referenced by the
                # workflow (mainly "default" values)
                visit_class(packed, ("File", "Directory"), self.wf_reffiles.append)


        if self.dynamic_resource_req:
            # Evaluate dynamic resource requirements using current builder
            rs = copy.copy(self.static_resource_req)
            for dyn_rs in self.dynamic_resource_req:
                eval_req = {"class": "ResourceRequirement"}
                for a in max_res_pars + sum_res_pars:
                    if a in dyn_rs:
                        eval_req[a] = builder.do_eval(dyn_rs[a])
                rs.append(eval_req)
            job_res_reqs = [get_overall_res_req(rs)]
        else:
            job_res_reqs = self.static_resource_req

        with Perf(metrics, "subworkflow adjust"):
            joborder_resolved = copy.deepcopy(joborder)
            joborder_keepmount = copy.deepcopy(joborder)

            reffiles = []
            visit_class(joborder_keepmount, ("File", "Directory"), reffiles.append)

            mapper = ArvPathMapper(self.arvrunner, reffiles+self.wf_reffiles, runtimeContext.basedir,
                                   "/keep/%s",
                                   "/keep/%s/%s")

            # For containers API, we need to make sure any extra
            # referenced files (i.e. referenced by the workflow but
            # not in the inputs) are included in the mounts.
            if self.wf_reffiles:
                runtimeContext = runtimeContext.copy()
                runtimeContext.extra_reffiles = copy.deepcopy(self.wf_reffiles)

            def keepmount(obj):
                remove_redundant_fields(obj)
                with SourceLine(obj, None, WorkflowException, logger.isEnabledFor(logging.DEBUG)):
                    if "location" not in obj:
                        raise WorkflowException("%s object is missing required 'location' field: %s" % (obj["class"], obj))
                with SourceLine(obj, "location", WorkflowException, logger.isEnabledFor(logging.DEBUG)):
                    if obj["location"].startswith("keep:"):
                        obj["location"] = mapper.mapper(obj["location"]).target
                        if "listing" in obj:
                            del obj["listing"]
                    elif obj["location"].startswith("_:"):
                        del obj["location"]
                    else:
                        raise WorkflowException("Location is not a keep reference or a literal: '%s'" % obj["location"])

            visit_class(joborder_keepmount, ("File", "Directory"), keepmount)

            def resolved(obj):
                if obj["location"].startswith("keep:"):
                    obj["location"] = mapper.mapper(obj["location"]).resolved

            visit_class(joborder_resolved, ("File", "Directory"), resolved)

            if self.wf_pdh is None:
                adjustFileObjs(packed, keepmount)
                adjustDirObjs(packed, keepmount)
                self.wf_pdh = upload_workflow_collection(self.arvrunner, shortname(self.tool["id"]), packed)

        wf_runner = cmap({
            "class": "CommandLineTool",
            "baseCommand": "cwltool",
            "inputs": self.tool["inputs"],
            "outputs": self.tool["outputs"],
            "stdout": "cwl.output.json",
            "requirements": self.requirements+job_res_reqs+[
                {"class": "InlineJavascriptRequirement"},
                {
                "class": "InitialWorkDirRequirement",
                "listing": [{
                        "entryname": "workflow.cwl",
                        "entry": '$({"class": "File", "location": "keep:%s/workflow.cwl"})' % self.wf_pdh
                    }, {
                        "entryname": "cwl.input.yml",
                        "entry": json.dumps(joborder_keepmount, indent=2, sort_keys=True, separators=(',',': ')).replace("\\", "\\\\").replace('$(', '\$(').replace('${', '\${')
                    }]
            }],
            "hints": self.hints,
            "arguments": ["--no-container", "--move-outputs", "--preserve-entire-environment", "workflow.cwl#main", "cwl.input.yml"],
            "id": "#"
        })
        return ArvadosCommandTool(self.arvrunner, wf_runner, self.loadingContext).job(joborder_resolved, output_callback, runtimeContext)
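
get_overall_res_req, max_res_pars, and sum_res_pars are referenced above but not shown. As a rough, hedged sketch, merging static ResourceRequirements presumably takes the maximum of per-step values such as cores and RAM and sums space-like values; the key split below is an assumption for illustration only, not the original definition:

MAX_KEYS = ("coresMin", "ramMin")      # assumed "take the max" parameters
SUM_KEYS = ("outdirMin", "tmpdirMin")  # assumed "sum across steps" parameters

def overall_resource_req(reqs):
    merged = {"class": "ResourceRequirement"}
    for key in MAX_KEYS + SUM_KEYS:
        values = [r[key] for r in reqs if key in r]
        if values:
            merged[key] = max(values) if key in MAX_KEYS else sum(values)
    return merged

print(overall_resource_req([
    {"class": "ResourceRequirement", "coresMin": 1, "outdirMin": 1024},
    {"class": "ResourceRequirement", "coresMin": 4, "outdirMin": 2048},
]))
# {'class': 'ResourceRequirement', 'coresMin': 4, 'outdirMin': 3072}
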
Code example #45
File: executor.py Project: chapmanb/arvados
    def arv_executor(self, tool, job_order, runtimeContext, logger=None):
        self.debug = runtimeContext.debug

        tool.visit(self.check_features)

        self.project_uuid = runtimeContext.project_uuid
        self.pipeline = None
        self.fs_access = runtimeContext.make_fs_access(runtimeContext.basedir)
        self.secret_store = runtimeContext.secret_store

        self.trash_intermediate = runtimeContext.trash_intermediate
        if self.trash_intermediate and self.work_api != "containers":
            raise Exception("--trash-intermediate is only supported with --api=containers.")

        self.intermediate_output_ttl = runtimeContext.intermediate_output_ttl
        if self.intermediate_output_ttl and self.work_api != "containers":
            raise Exception("--intermediate-output-ttl is only supported with --api=containers.")
        if self.intermediate_output_ttl < 0:
            raise Exception("Invalid value %d for --intermediate-output-ttl, cannot be less than zero" % self.intermediate_output_ttl)

        if runtimeContext.submit_request_uuid and self.work_api != "containers":
            raise Exception("--submit-request-uuid requires containers API, but using '{}' api".format(self.work_api))

        if not runtimeContext.name:
            runtimeContext.name = self.name = tool.tool.get("label") or tool.metadata.get("label") or os.path.basename(tool.tool["id"])

        # Upload direct dependencies of workflow steps, get back mapping of files to keep references.
        # Also uploads docker images.
        merged_map = upload_workflow_deps(self, tool)

        # Reload tool object which may have been updated by
        # upload_workflow_deps
        # Don't validate this time because it will just print redundant errors.
        loadingContext = self.loadingContext.copy()
        loadingContext.loader = tool.doc_loader
        loadingContext.avsc_names = tool.doc_schema
        loadingContext.metadata = tool.metadata
        loadingContext.do_validate = False

        tool = self.arv_make_tool(tool.doc_loader.idx[tool.tool["id"]],
                                  loadingContext)

        # Upload local file references in the job order.
        job_order = upload_job_order(self, "%s input" % runtimeContext.name,
                                     tool, job_order)

        existing_uuid = runtimeContext.update_workflow
        if existing_uuid or runtimeContext.create_workflow:
            # Create a pipeline template or workflow record and exit.
            if self.work_api == "jobs":
                tmpl = RunnerTemplate(self, tool, job_order,
                                      runtimeContext.enable_reuse,
                                      uuid=existing_uuid,
                                      submit_runner_ram=runtimeContext.submit_runner_ram,
                                      name=runtimeContext.name,
                                      merged_map=merged_map,
                                      loadingContext=loadingContext)
                tmpl.save()
                # cwltool.main will write our return value to stdout.
                return (tmpl.uuid, "success")
            elif self.work_api == "containers":
                return (upload_workflow(self, tool, job_order,
                                        self.project_uuid,
                                        uuid=existing_uuid,
                                        submit_runner_ram=runtimeContext.submit_runner_ram,
                                        name=runtimeContext.name,
                                        merged_map=merged_map),
                        "success")

        self.ignore_docker_for_reuse = runtimeContext.ignore_docker_for_reuse
        self.eval_timeout = runtimeContext.eval_timeout

        runtimeContext = runtimeContext.copy()
        runtimeContext.use_container = True
        runtimeContext.tmpdir_prefix = "tmp"
        runtimeContext.work_api = self.work_api

        if self.work_api == "containers":
            if self.ignore_docker_for_reuse:
                raise Exception("--ignore-docker-for-reuse not supported with containers API.")
            runtimeContext.outdir = "/var/spool/cwl"
            runtimeContext.docker_outdir = "/var/spool/cwl"
            runtimeContext.tmpdir = "/tmp"
            runtimeContext.docker_tmpdir = "/tmp"
        elif self.work_api == "jobs":
            if runtimeContext.priority != DEFAULT_PRIORITY:
                raise Exception("--priority not implemented for jobs API.")
            runtimeContext.outdir = "$(task.outdir)"
            runtimeContext.docker_outdir = "$(task.outdir)"
            runtimeContext.tmpdir = "$(task.tmpdir)"

        if runtimeContext.priority < 1 or runtimeContext.priority > 1000:
            raise Exception("--priority must be in the range 1..1000.")

        if self.should_estimate_cache_size:
            visited = set()
            estimated_size = [0]
            def estimate_collection_cache(obj):
                if obj.get("location", "").startswith("keep:"):
                    m = pdh_size.match(obj["location"][5:])
                    if m and m.group(1) not in visited:
                        visited.add(m.group(1))
                        estimated_size[0] += int(m.group(2))
            visit_class(job_order, ("File", "Directory"), estimate_collection_cache)
            runtimeContext.collection_cache_size = max(((estimated_size[0]*192) // (1024*1024))+1, 256)
            self.collection_cache.set_cap(runtimeContext.collection_cache_size*1024*1024)

        logger.info("Using collection cache size %s MiB", runtimeContext.collection_cache_size)

        runnerjob = None
        if runtimeContext.submit:
            # Submit a runner job to run the workflow for us.
            if self.work_api == "containers":
                if tool.tool["class"] == "CommandLineTool" and runtimeContext.wait and (not runtimeContext.always_submit_runner):
                    runtimeContext.runnerjob = tool.tool["id"]
                else:
                    tool = RunnerContainer(self, tool, loadingContext, runtimeContext.enable_reuse,
                                                self.output_name,
                                                self.output_tags,
                                                submit_runner_ram=runtimeContext.submit_runner_ram,
                                                name=runtimeContext.name,
                                                on_error=runtimeContext.on_error,
                                                submit_runner_image=runtimeContext.submit_runner_image,
                                                intermediate_output_ttl=runtimeContext.intermediate_output_ttl,
                                                merged_map=merged_map,
                                                priority=runtimeContext.priority,
                                                secret_store=self.secret_store,
                                                collection_cache_size=runtimeContext.collection_cache_size,
                                                collection_cache_is_default=self.should_estimate_cache_size)
            elif self.work_api == "jobs":
                tool = RunnerJob(self, tool, loadingContext, runtimeContext.enable_reuse,
                                      self.output_name,
                                      self.output_tags,
                                      submit_runner_ram=runtimeContext.submit_runner_ram,
                                      name=runtimeContext.name,
                                      on_error=runtimeContext.on_error,
                                      submit_runner_image=runtimeContext.submit_runner_image,
                                      merged_map=merged_map)
        elif runtimeContext.cwl_runner_job is None and self.work_api == "jobs":
            # Create pipeline for local run
            self.pipeline = self.api.pipeline_instances().create(
                body={
                    "owner_uuid": self.project_uuid,
                    "name": runtimeContext.name if runtimeContext.name else shortname(tool.tool["id"]),
                    "components": {},
                    "state": "RunningOnClient"}).execute(num_retries=self.num_retries)
            logger.info("Pipeline instance %s", self.pipeline["uuid"])

        if runtimeContext.cwl_runner_job is not None:
            self.uuid = runtimeContext.cwl_runner_job.get('uuid')

        jobiter = tool.job(job_order,
                           self.output_callback,
                           runtimeContext)

        if runtimeContext.submit and not runtimeContext.wait:
            runnerjob = next(jobiter)
            runnerjob.run(runtimeContext)
            return (runnerjob.uuid, "success")

        current_container = arvados_cwl.util.get_current_container(self.api, self.num_retries, logger)
        if current_container:
            logger.info("Running inside container %s", current_container.get("uuid"))

        self.poll_api = arvados.api('v1', timeout=runtimeContext.http_timeout)
        self.polling_thread = threading.Thread(target=self.poll_states)
        self.polling_thread.start()

        self.task_queue = TaskQueue(self.workflow_eval_lock, self.thread_count)

        try:
            self.workflow_eval_lock.acquire()

            # Holds the lock while this code runs and releases it when
            # it is safe to do so in self.workflow_eval_lock.wait(),
            # at which point on_message can update job state and
            # process output callbacks.

            loopperf = Perf(metrics, "jobiter")
            loopperf.__enter__()
            for runnable in jobiter:
                loopperf.__exit__()

                if self.stop_polling.is_set():
                    break

                if self.task_queue.error is not None:
                    raise self.task_queue.error

                if runnable:
                    with Perf(metrics, "run"):
                        self.start_run(runnable, runtimeContext)
                else:
                    if (self.task_queue.in_flight + len(self.processes)) > 0:
                        self.workflow_eval_lock.wait(3)
                    else:
                        logger.error("Workflow is deadlocked, no runnable processes and not waiting on any pending processes.")
                        break

                if self.stop_polling.is_set():
                    break

                loopperf.__enter__()
            loopperf.__exit__()

            while (self.task_queue.in_flight + len(self.processes)) > 0:
                if self.task_queue.error is not None:
                    raise self.task_queue.error
                self.workflow_eval_lock.wait(3)

        except UnsupportedRequirement:
            raise
        except:
            if sys.exc_info()[0] is KeyboardInterrupt or sys.exc_info()[0] is SystemExit:
                logger.error("Interrupted, workflow will be cancelled")
            else:
                logger.error("Execution failed:\n%s", sys.exc_info()[1], exc_info=(sys.exc_info()[1] if self.debug else False))
            if self.pipeline:
                self.api.pipeline_instances().update(uuid=self.pipeline["uuid"],
                                                     body={"state": "Failed"}).execute(num_retries=self.num_retries)
            if runtimeContext.submit and isinstance(tool, Runner):
                runnerjob = tool
                if runnerjob.uuid and self.work_api == "containers":
                    self.api.container_requests().update(uuid=runnerjob.uuid,
                                                     body={"priority": "0"}).execute(num_retries=self.num_retries)
        finally:
            self.workflow_eval_lock.release()
            self.task_queue.drain()
            self.stop_polling.set()
            self.polling_thread.join()
            self.task_queue.join()

        if self.final_status == "UnsupportedRequirement":
            raise UnsupportedRequirement("Check log for details.")

        if self.final_output is None:
            raise WorkflowException("Workflow did not return a result.")

        if runtimeContext.submit and isinstance(tool, Runner):
            logger.info("Final output collection %s", tool.final_output)
        else:
            if self.output_name is None:
                self.output_name = "Output of %s" % (shortname(tool.tool["id"]))
            if self.output_tags is None:
                self.output_tags = ""

            storage_classes = runtimeContext.storage_classes.strip().split(",")
            self.final_output, self.final_output_collection = self.make_output_collection(self.output_name, storage_classes, self.output_tags, self.final_output)
            self.set_crunch_output()

        if runtimeContext.compute_checksum:
            adjustDirObjs(self.final_output, partial(get_listing, self.fs_access))
            adjustFileObjs(self.final_output, partial(compute_checksums, self.fs_access))

        if self.trash_intermediate and self.final_status == "success":
            self.trash_intermediate_output()

        return (self.final_output, self.final_status)
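
The cache-size estimate above relies on a pdh_size regex that is not shown in the snippet. Since a Keep portable data hash has the form <md5>+<size in bytes>, a plausible stand-in (an assumption, not the original definition) is sketched below: each distinct collection contributes its size once, and the result is scaled and clamped to a 256 MiB floor.

import re

pdh_size = re.compile(r"([0-9a-f]{32}\+)(\d+)")

def estimate_cache_mib(locations):
    visited = set()
    estimated = 0
    for loc in locations:
        if loc.startswith("keep:"):
            m = pdh_size.match(loc[5:])
            if m and m.group(1) not in visited:
                visited.add(m.group(1))
                estimated += int(m.group(2))
    return max(((estimated * 192) // (1024 * 1024)) + 1, 256)

print(estimate_cache_mib(["keep:" + "f" * 32 + "+1048576/input.fa"]))
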
Code example #46
File: executor.py Project: chapmanb/arvados
    def make_output_collection(self, name, storage_classes, tagsString, outputObj):
        outputObj = copy.deepcopy(outputObj)

        files = []
        def capture(fileobj):
            files.append(fileobj)

        adjustDirObjs(outputObj, capture)
        adjustFileObjs(outputObj, capture)

        generatemapper = NoFollowPathMapper(files, "", "", separateDirs=False)

        final = arvados.collection.Collection(api_client=self.api,
                                              keep_client=self.keep_client,
                                              num_retries=self.num_retries)

        for k,v in generatemapper.items():
            if k.startswith("_:"):
                if v.type == "Directory":
                    continue
                if v.type == "CreateFile":
                    with final.open(v.target, "wb") as f:
                        f.write(v.resolved.encode("utf-8"))
                    continue

            if not k.startswith("keep:"):
                raise Exception("Output source is not in keep or a literal")
            sp = k.split("/")
            srccollection = sp[0][5:]
            try:
                reader = self.collection_cache.get(srccollection)
                srcpath = "/".join(sp[1:]) if len(sp) > 1 else "."
                final.copy(srcpath, v.target, source_collection=reader, overwrite=False)
            except arvados.errors.ArgumentError as e:
                logger.error("Creating CollectionReader for '%s' '%s': %s", k, v, e)
                raise
            except IOError as e:
                logger.warning("While preparing output collection: %s", e)

        def rewrite(fileobj):
            fileobj["location"] = generatemapper.mapper(fileobj["location"]).target
            for k in ("listing", "contents", "nameext", "nameroot", "dirname"):
                if k in fileobj:
                    del fileobj[k]

        adjustDirObjs(outputObj, rewrite)
        adjustFileObjs(outputObj, rewrite)

        with final.open("cwl.output.json", "w") as f:
            res = str(json.dumps(outputObj, sort_keys=True, indent=4, separators=(',',': '), ensure_ascii=False))
            f.write(res)

        final.save_new(name=name, owner_uuid=self.project_uuid, storage_classes=storage_classes, ensure_unique_name=True)

        logger.info("Final output collection %s \"%s\" (%s)", final.portable_data_hash(),
                    final.api_response()["name"],
                    final.manifest_locator())

        final_uuid = final.manifest_locator()
        tags = tagsString.split(',')
        for tag in tags:
            self.api.links().create(body={
                "head_uuid": final_uuid, "link_class": "tag", "name": tag
            }).execute(num_retries=self.num_retries)

        def finalcollection(fileobj):
            fileobj["location"] = "keep:%s/%s" % (final.portable_data_hash(), fileobj["location"])

        adjustDirObjs(outputObj, finalcollection)
        adjustFileObjs(outputObj, finalcollection)

        return (outputObj, final)
Code example #47
def run():
    # Timestamps are added by crunch-job, so don't print redundant timestamps.
    arvados.log_handler.setFormatter(logging.Formatter('%(name)s %(levelname)s: %(message)s'))

    # Print package versions
    logger.info(arvados_cwl.versionstring())

    api = arvados.api("v1")

    arvados_cwl.add_arv_hints()

    runner = None
    try:
        job_order_object = arvados.current_job()['script_parameters']
        toolpath = "file://%s/%s" % (os.environ['TASK_KEEPMOUNT'], job_order_object.pop("cwl:tool"))

        pdh_path = re.compile(r'^[0-9a-f]{32}\+\d+(/.+)?$')

        def keeppath(v):
            if pdh_path.match(v):
                return "keep:%s" % v
            else:
                return v

        def keeppathObj(v):
            if "location" in v:
                v["location"] = keeppath(v["location"])

        for k,v in job_order_object.items():
            if isinstance(v, basestring) and arvados.util.keep_locator_pattern.match(v):
                job_order_object[k] = {
                    "class": "File",
                    "location": "keep:%s" % v
                }

        adjustFileObjs(job_order_object, keeppathObj)
        adjustDirObjs(job_order_object, keeppathObj)
        normalizeFilesDirs(job_order_object)

        output_name = None
        output_tags = None
        enable_reuse = True
        on_error = "continue"
        debug = False

        if "arv:output_name" in job_order_object:
            output_name = job_order_object["arv:output_name"]
            del job_order_object["arv:output_name"]

        if "arv:output_tags" in job_order_object:
            output_tags = job_order_object["arv:output_tags"]
            del job_order_object["arv:output_tags"]

        if "arv:enable_reuse" in job_order_object:
            enable_reuse = job_order_object["arv:enable_reuse"]
            del job_order_object["arv:enable_reuse"]

        if "arv:on_error" in job_order_object:
            on_error = job_order_object["arv:on_error"]
            del job_order_object["arv:on_error"]

        if "arv:debug" in job_order_object:
            debug = job_order_object["arv:debug"]
            del job_order_object["arv:debug"]

        runner = arvados_cwl.ArvCwlRunner(api_client=arvados.safeapi.ThreadSafeApiCache(
            api_params={"model": OrderedJsonModel()}, keep_params={"num_retries": 4}),
                                          output_name=output_name, output_tags=output_tags)

        make_fs_access = functools.partial(CollectionFsAccess,
                                 collection_cache=runner.collection_cache)

        t = load_tool(toolpath, runner.arv_make_tool,
                      fetcher_constructor=functools.partial(CollectionFetcher,
                                                  api_client=runner.api,
                                                  fs_access=make_fs_access(""),
                                                  num_retries=runner.num_retries))

        if debug:
            logger.setLevel(logging.DEBUG)
            logging.getLogger('arvados').setLevel(logging.DEBUG)
            logging.getLogger("cwltool").setLevel(logging.DEBUG)

        args = argparse.Namespace()
        args.project_uuid = arvados.current_job()["owner_uuid"]
        args.enable_reuse = enable_reuse
        args.on_error = on_error
        args.submit = False
        args.debug = debug
        args.quiet = False
        args.ignore_docker_for_reuse = False
        args.basedir = os.getcwd()
        args.name = None
        args.cwl_runner_job={"uuid": arvados.current_job()["uuid"], "state": arvados.current_job()["state"]}
        args.make_fs_access = make_fs_access
        args.trash_intermediate = False
        args.intermediate_output_ttl = 0
        args.priority = arvados_cwl.DEFAULT_PRIORITY
        args.do_validate = True
        args.disable_js_validation = False

        runner.arv_executor(t, job_order_object, **vars(args))
    except Exception as e:
        if isinstance(e, WorkflowException):
            logging.info("Workflow error %s", e)
        else:
            logging.exception("Unhandled exception")
        if runner and runner.final_output_collection:
            outputCollection = runner.final_output_collection.portable_data_hash()
        else:
            outputCollection = None
        api.job_tasks().update(uuid=arvados.current_task()['uuid'],
                                             body={
                                                 'output': outputCollection,
                                                 'success': False,
                                                 'progress':1.0
                                             }).execute()