コード例 #1
0
ファイル: provenance.py プロジェクト: rupertnash/cwltool
    def user_provenance(self, document: ProvDocument) -> None:
        """Describe the launching account and its owner as agents in *document*.

        The account agent is labelled with the username reported by
        ``_whoami()``; the person agent is identified by ``self.orcid`` when
        set, otherwise by ``USER_UUID``. If ``self.full_name`` is empty it is
        filled in from ``_whoami()`` as a side effect.
        """
        self.self_check()
        username, fullname = _whoami()

        # Keep an explicitly supplied name; only fall back to the detected one
        if not self.full_name:
            self.full_name = fullname

        for namespace in (UUID, ORCID, FOAF):
            document.add_namespace(namespace)

        account_attributes = {
            provM.PROV_TYPE: FOAF["OnlineAccount"],
            "prov:label": username,
            FOAF["accountName"]: username,
        }
        account = document.agent(ACCOUNT_UUID, account_attributes)

        person_attributes = {
            provM.PROV_TYPE: PROV["Person"],
            "prov:label": self.full_name,
            FOAF["name"]: self.full_name,
            FOAF["account"]: account,
        }
        user = document.agent(self.orcid or USER_UUID, person_attributes)

        # However cwltool was launched (interactive shell, script, or some
        # other program started by one of those), the account is ultimately
        # acting on behalf of this user - even if the recorded name is wrong.
        document.actedOnBehalfOf(account, user)
コード例 #2
0
class ProvenanceProfile:
    """
    Provenance profile.

    Populated as the workflow runs.
    """
    def __init__(
        self,
        research_object: "ResearchObject",
        full_name: str,
        host_provenance: bool,
        user_provenance: bool,
        orcid: str,
        fsaccess: StdFsAccess,
        run_uuid: Optional[uuid.UUID] = None,
    ) -> None:
        """Store the run settings and build the initial PROV document.

        A fresh ``ProvDocument`` is created, a run UUID is minted when
        *run_uuid* is not given, and ``generate_prov_doc()`` is invoked last
        so that it sees every attribute set here.
        """
        # Collaborating objects
        self.research_object = research_object
        self.fsaccess = fsaccess
        self.document = ProvDocument()

        # Values taken over from the research object
        self.folder = research_object.folder
        self.engine_uuid = research_object.engine_uuid  # type: str
        self.add_to_manifest = research_object.add_to_manifest

        # Which optional provenance to record
        self.host_provenance = host_provenance
        self.user_provenance = user_provenance

        # Who is credited as creator
        self.orcid = orcid
        self.full_name = full_name
        if orcid:
            _logger.debug("[provenance] Creator ORCID: %s", orcid)
        if full_name:
            _logger.debug("[provenance] Creator Full name: %s", full_name)

        # Identity of this workflow run
        self.workflow_run_uuid = run_uuid or uuid.uuid4()
        self.workflow_run_uri = self.workflow_run_uuid.urn  # type: str

        self.generate_prov_doc()

    def __str__(self) -> str:
        """Represent this Provenvance profile as a string."""
        return "ProvenanceProfile <{}> in <{}>".format(
            self.workflow_run_uri,
            self.research_object,
        )

    def generate_prov_doc(self) -> Tuple[str, ProvDocument]:
        """Populate ``self.document`` with the baseline records of a run.

        In order, this:

        * registers the namespaces used by the profile (and stores the
          ``metadata``, ``provenance`` and ``wf`` namespaces on ``self``);
        * declares the account agent and, when an ORCID or full name is
          known, a person agent the account acts on behalf of - otherwise
          optionally records host and/or user provenance;
        * declares the workflow engine agent and the workflow run activity
          and links them together.

        Also sets ``self.cwltool_version``.

        Returns:
            A ``(workflow_run_uri, document)`` tuple.
        """
        def host_provenance(document: ProvDocument) -> None:
            """Attach the fully-qualified hostname to the account agent."""
            document.add_namespace(CWLPROV)
            document.add_namespace(UUID)
            document.add_namespace(FOAF)

            hostname = getfqdn()
            # won't have a foaf:accountServiceHomepage for unix hosts, but
            # we can at least provide hostname
            document.agent(
                ACCOUNT_UUID,
                {
                    PROV_TYPE: FOAF["OnlineAccount"],
                    "prov:location": hostname,
                    CWLPROV["hostname"]: hostname,
                },
            )

        # Only the last whitespace-separated token of the version string is kept
        self.cwltool_version = "cwltool %s" % versionstring().split()[-1]
        self.document.add_namespace("wfprov",
                                    "http://purl.org/wf4ever/wfprov#")
        # document.add_namespace('prov', 'http://www.w3.org/ns/prov#')
        self.document.add_namespace("wfdesc",
                                    "http://purl.org/wf4ever/wfdesc#")
        # TODO: Make this ontology. For now only has cwlprov:image
        self.document.add_namespace("cwlprov", "https://w3id.org/cwl/prov#")
        self.document.add_namespace("foaf", "http://xmlns.com/foaf/0.1/")
        self.document.add_namespace("schema", "http://schema.org/")
        self.document.add_namespace("orcid", "https://orcid.org/")
        self.document.add_namespace("id", "urn:uuid:")
        # NOTE: Internet draft expired 2004-03-04 (!)
        #  https://tools.ietf.org/html/draft-thiemann-hash-urn-01
        # TODO: Change to nih:sha-256; hashes
        #  https://tools.ietf.org/html/rfc6920#section-7
        self.document.add_namespace("data", "urn:hash::sha1:")
        # Also needed for docker images
        self.document.add_namespace(SHA256, "nih:sha-256;")

        # info only, won't really be used by prov as sub-resources use /
        self.document.add_namespace("researchobject",
                                    self.research_object.base_uri)
        # annotations
        self.metadata_ns = self.document.add_namespace(
            "metadata", self.research_object.base_uri + METADATA + "/")
        # Pre-register provenance directory so we can refer to its files
        self.provenance_ns = self.document.add_namespace(
            "provenance",
            self.research_object.base_uri + posix_path(PROVENANCE) + "/")
        ro_identifier_workflow = self.research_object.base_uri + "workflow/packed.cwl#"
        self.wf_ns = self.document.add_namespace("wf", ro_identifier_workflow)
        ro_identifier_input = (self.research_object.base_uri +
                               "workflow/primary-job.json#")
        self.document.add_namespace("input", ro_identifier_input)

        # More info about the account (e.g. username, fullname)
        # may or may not have been previously logged by user_provenance()
        # .. but we always know cwltool was launched (directly or indirectly)
        # by a user account, as cwltool is a command line tool
        account = self.document.agent(ACCOUNT_UUID)
        if self.orcid or self.full_name:
            # An explicitly supplied identity takes precedence over anything
            # that would be detected from the host or the user account
            person = {PROV_TYPE: PROV["Person"], "prov:type": SCHEMA["Person"]}
            if self.full_name:
                person["prov:label"] = self.full_name
                person["foaf:name"] = self.full_name
                person["schema:name"] = self.full_name
            else:
                # TODO: Look up name from ORCID API?
                pass
            # Without an ORCID the person is identified by a freshly minted UUID
            agent = self.document.agent(self.orcid or uuid.uuid4().urn, person)
            self.document.actedOnBehalfOf(account, agent)
        else:
            if self.host_provenance:
                host_provenance(self.document)
            if self.user_provenance:
                self.research_object.user_provenance(self.document)
        # The execution of cwltool
        wfengine = self.document.agent(
            self.engine_uuid,
            {
                PROV_TYPE: PROV["SoftwareAgent"],
                "prov:type": WFPROV["WorkflowEngine"],
                "prov:label": self.cwltool_version,
            },
        )
        # FIXME: This datetime will be a bit too delayed, we should
        # capture when cwltool.py earliest started?
        self.document.wasStartedBy(wfengine, None, account,
                                   datetime.datetime.now())
        # define workflow run level activity
        self.document.activity(
            self.workflow_run_uri,
            datetime.datetime.now(),
            None,
            {
                PROV_TYPE: WFPROV["WorkflowRun"],
                "prov:label": "Run of workflow/packed.cwl#main",
            },
        )
        # association between SoftwareAgent and WorkflowRun
        main_workflow = "wf:main"
        self.document.wasAssociatedWith(self.workflow_run_uri,
                                        self.engine_uuid, main_workflow)
        self.document.wasStartedBy(self.workflow_run_uri, None,
                                   self.engine_uuid, datetime.datetime.now())
        return (self.workflow_run_uri, self.document)

    def evaluate(
        self,
        process: Process,
        job: JobsType,
        job_order_object: CWLObjectType,
        research_obj: "ResearchObject",
    ) -> None:
        """Evaluate the nature of job."""
        if not hasattr(process, "steps"):
            # record provenance of independent commandline tool executions
            self.prospective_prov(job)
            customised_job = copy_job_order(job, job_order_object)
            self.used_artefacts(customised_job, self.workflow_run_uri)
            research_obj.create_job(customised_job)
        elif hasattr(job, "workflow"):
            # record provenance of workflow executions
            self.prospective_prov(job)
            customised_job = copy_job_order(job, job_order_object)
            self.used_artefacts(customised_job, self.workflow_run_uri)

    def record_process_start(
            self,
            process: Process,
            job: JobsType,
            process_run_id: Optional[str] = None) -> Optional[str]:
        if not hasattr(process, "steps"):
            process_run_id = self.workflow_run_uri
        elif not hasattr(job, "workflow"):
            # commandline tool execution as part of workflow
            name = ""
            if isinstance(job, (CommandLineJob, JobBase, WorkflowJob)):
                name = job.name
            process_name = urllib.parse.quote(name, safe=":/,#")
            process_run_id = self.start_process(process_name,
                                                datetime.datetime.now())
        return process_run_id

    def start_process(
        self,
        process_name: str,
        when: datetime.datetime,
        process_run_id: Optional[str] = None,
    ) -> str:
        """Declare a process run activity started by the workflow run.

        A new UUID URN is minted when *process_run_id* is ``None``. The
        activity is associated with the engine under the plan
        ``wf:main/<process_name>`` and marked as started at *when*.

        Returns:
            The id of the recorded process run.
        """
        run_id = uuid.uuid4().urn if process_run_id is None else process_run_id
        attributes = {
            PROV_TYPE: WFPROV["ProcessRun"],
            PROV_LABEL: "Run of workflow/packed.cwl#main/" + process_name,
        }
        self.document.activity(run_id, None, None, attributes)

        plan = "wf:main/" + process_name
        self.document.wasAssociatedWith(run_id, self.engine_uuid, plan)
        self.document.wasStartedBy(
            run_id, None, self.workflow_run_uri, when, None, None
        )
        return run_id

    def record_process_end(
        self,
        process_name: str,
        process_run_id: str,
        outputs: Union[CWLObjectType, MutableSequence[CWLObjectType], None],
        when: datetime.datetime,
    ) -> None:
        self.generate_output_prov(outputs, process_run_id, process_name)
        self.document.wasEndedBy(process_run_id, None, self.workflow_run_uri,
                                 when)

    def declare_file(
            self, value: CWLObjectType) -> Tuple[ProvEntity, ProvEntity, str]:
        """Declare PROV entities for a ``class: File`` value.

        The content entity is resolved by trying, in order: an existing
        data file matching the value's checksum, the file at ``location``
        (which is added to the research object), and finally the inline
        ``contents``. A second, specialized entity carrying the
        basename/nameroot/nameext is then declared and linked to the content
        entity. Secondary files are declared recursively and recorded as
        derived from the specialized entity.

        Side effects on *value*: ``checksum`` is filled in when the file had
        to be read from ``location`` and none was present, and ``@id`` is set
        if it was missing.

        Returns:
            ``(file_entity, entity, checksum)`` - the specialized entity, the
            content entity and the checksum string.

        Raises:
            ValueError: if *value* is not ``class: File``, if none of
                checksum/location/contents produced an entity, or if a
                secondary file is neither a File nor a Directory.
        """
        if value["class"] != "File":
            raise ValueError("Must have class:File: %s" % value)
        # Need to determine file hash aka RO filename
        entity = None  # type: Optional[ProvEntity]
        checksum = None
        if "checksum" in value:
            csum = cast(str, value["checksum"])
            # Checksums are stored as "<method>$<hexdigest>"
            (method, checksum) = csum.split("$", 1)
            # Only reuse the checksum if that exact data file is already stored
            if method == SHA1 and self.research_object.has_data_file(checksum):
                entity = self.document.entity("data:" + checksum)

        if not entity and "location" in value:
            location = str(value["location"])
            # If we made it here, we'll have to add it to the RO
            with self.fsaccess.open(location, "rb") as fhandle:
                relative_path = self.research_object.add_data_file(fhandle)
                # FIXME: This naively relies on add_data_file setting hash as filename
                checksum = PurePath(relative_path).name
                entity = self.document.entity("data:" + checksum,
                                              {PROV_TYPE: WFPROV["Artifact"]})
                if "checksum" not in value:
                    value["checksum"] = f"{SHA1}${checksum}"

        if not entity and "contents" in value:
            # Anonymous file, add content as string
            entity, checksum = self.declare_string(cast(
                str, value["contents"]))

        # By here one of them should have worked!
        if not entity or not checksum:
            raise ValueError(
                "class:File but missing checksum/location/content: %r" % value)

        # Track filename and extension, this is generally useful only for
        # secondaryFiles. Note that multiple uses of a file might thus record
        # different names for the same entity, so we'll
        # make/track a specialized entity by UUID
        file_id = value.setdefault("@id", uuid.uuid4().urn)
        # A specialized entity that has just these names
        file_entity = self.document.entity(
            file_id,
            [(PROV_TYPE, WFPROV["Artifact"]), (PROV_TYPE, WF4EVER["File"])],
        )  # type: ProvEntity

        if "basename" in value:
            file_entity.add_attributes(
                {CWLPROV["basename"]: value["basename"]})
        if "nameroot" in value:
            file_entity.add_attributes(
                {CWLPROV["nameroot"]: value["nameroot"]})
        if "nameext" in value:
            file_entity.add_attributes({CWLPROV["nameext"]: value["nameext"]})
        self.document.specializationOf(file_entity, entity)

        # Check for secondaries
        for sec in cast(MutableSequence[CWLObjectType],
                        value.get("secondaryFiles", [])):
            # TODO: Record these in a specializationOf entity with UUID?
            if sec["class"] == "File":
                (sec_entity, _, _) = self.declare_file(sec)
            elif sec["class"] == "Directory":
                sec_entity = self.declare_directory(sec)
            else:
                raise ValueError(f"Got unexpected secondaryFiles value: {sec}")
            # We don't know how/when/where the secondary file was generated,
            # but CWL convention is a kind of summary/index derived
            # from the original file. As its generally in a different format
            # then prov:Quotation is not appropriate.
            self.document.derivation(
                sec_entity,
                file_entity,
                other_attributes={PROV["type"]: CWLPROV["SecondaryFile"]},
            )

        return file_entity, entity, checksum

    def declare_directory(self, value: CWLObjectType) -> ProvEntity:
        """Declare a directory and, recursively, every entry in its listing.

        The directory is recorded twice: as a collection/dictionary entity in
        ``self.document`` and as an ORE aggregation in a separate bundle that
        is serialized as Turtle into the research object's metadata folder
        and registered as an annotation of the directory.

        Side effects on *value*: ``@id`` is set if missing, and
        ``get_listing`` is invoked on it when it has no ``listing`` key
        (presumably to populate the listing - confirm against that helper).

        Returns:
            The collection entity declared in ``self.document``.
        """
        # FIXME: Calculate a hash-like identifier for directory
        # so we get same value if it's the same filenames/hashes
        # in a different location.
        # For now, mint a new UUID to identify this directory, but
        # attempt to keep it inside the value dictionary
        dir_id = cast(str, value.setdefault("@id", uuid.uuid4().urn))

        # New annotation file to keep the ORE Folder listing
        ore_doc_fn = dir_id.replace("urn:uuid:", "directory-") + ".ttl"
        dir_bundle = self.document.bundle(self.metadata_ns[ore_doc_fn])

        coll = self.document.entity(
            dir_id,
            [
                (PROV_TYPE, WFPROV["Artifact"]),
                (PROV_TYPE, PROV["Collection"]),
                (PROV_TYPE, PROV["Dictionary"]),
                (PROV_TYPE, RO["Folder"]),
            ],
        )
        # ORE description of ro:Folder, saved separately
        coll_b = dir_bundle.entity(
            dir_id,
            [(PROV_TYPE, RO["Folder"]), (PROV_TYPE, ORE["Aggregation"])],
        )
        self.document.mentionOf(dir_id + "#ore", dir_id, dir_bundle.identifier)

        # dir_manifest = dir_bundle.entity(
        #     dir_bundle.identifier, {PROV["type"]: ORE["ResourceMap"],
        #                             ORE["describes"]: coll_b.identifier})

        # Attributes are collected while iterating and attached in one go below
        coll_attribs = [(ORE["isDescribedBy"], dir_bundle.identifier)]
        coll_b_attribs = []  # type: List[Tuple[Identifier, ProvEntity]]

        # FIXME: .listing might not be populated yet - hopefully
        # a later call to this method will sort that
        is_empty = True

        if "listing" not in value:
            get_listing(self.fsaccess, value)
        for entry in cast(MutableSequence[CWLObjectType],
                          value.get("listing", [])):
            is_empty = False
            # Declare child-artifacts
            entity = self.declare_artefact(entry)
            self.document.membership(coll, entity)
            # Membership relation aka our ORE Proxy
            # (the same id is declared in both the document and the bundle)
            m_id = uuid.uuid4().urn
            m_entity = self.document.entity(m_id)
            m_b = dir_bundle.entity(m_id)

            # PROV-O style Dictionary
            # https://www.w3.org/TR/prov-dictionary/#dictionary-ontological-definition
            # ..as prov.py do not currently allow PROV-N extensions
            # like hadDictionaryMember(..)
            m_entity.add_asserted_type(PROV["KeyEntityPair"])

            m_entity.add_attributes({
                PROV["pairKey"]: entry["basename"],
                PROV["pairEntity"]: entity,
            })

            # As well as a being a
            # http://wf4ever.github.io/ro/2016-01-28/ro/#FolderEntry
            m_b.add_asserted_type(RO["FolderEntry"])
            m_b.add_asserted_type(ORE["Proxy"])
            m_b.add_attributes({
                RO["entryName"]: entry["basename"],
                ORE["proxyIn"]: coll,
                ORE["proxyFor"]: entity,
            })
            coll_attribs.append((PROV["hadDictionaryMember"], m_entity))
            coll_b_attribs.append((ORE["aggregates"], m_b))

        coll.add_attributes(coll_attribs)
        coll_b.add_attributes(coll_b_attribs)

        # Also Save ORE Folder as annotation metadata
        ore_doc = ProvDocument()
        ore_doc.add_namespace(ORE)
        ore_doc.add_namespace(RO)
        ore_doc.add_namespace(UUID)
        ore_doc.add_bundle(dir_bundle)
        ore_doc = ore_doc.flattened()
        ore_doc_path = str(PurePosixPath(METADATA, ore_doc_fn))
        with self.research_object.write_bag_file(
                ore_doc_path) as provenance_file:
            ore_doc.serialize(provenance_file,
                              format="rdf",
                              rdf_format="turtle")
        self.research_object.add_annotation(dir_id, [ore_doc_fn],
                                            ORE["isDescribedBy"].uri)

        if is_empty:
            # Empty directory
            coll.add_asserted_type(PROV["EmptyCollection"])
            coll.add_asserted_type(PROV["EmptyDictionary"])
        self.research_object.add_uri(coll.identifier.uri)
        return coll

    def declare_string(self, value: str) -> Tuple[ProvEntity, str]:
        """Store *value* as a text data file and declare an entity for it.

        Returns:
            ``(entity, checksum)`` where the checksum is the file name that
            ``add_data_file`` reported for the stored content.
        """
        text = str(value)
        stored_path = PurePosixPath(
            self.research_object.add_data_file(
                BytesIO(text.encode(ENCODING)), content_type=TEXT_PLAIN
            )
        )
        checksum = stored_path.name
        # FIXME: Don't naively assume add_data_file uses hash in filename!
        attributes = {
            PROV_TYPE: WFPROV["Artifact"],
            PROV_VALUE: text,
        }
        entity = self.document.entity(
            f"data:{stored_path.stem}", attributes
        )  # type: ProvEntity
        return entity, checksum

    def declare_artefact(self, value: Optional[CWLOutputType]) -> ProvEntity:
        """Declare a PROV entity for an arbitrary job/output value.

        Dispatches on the type of *value*:

        * ``None`` - a shared ``cwlprov:None`` entity;
        * ``bool``/``int``/``float`` - a new entity carrying the value;
        * ``str`` - stored as a text data file (see ``declare_string``);
        * ``bytes`` - stored as a data file;
        * mapping - a File, a Directory, or a generic dictionary whose
          members are declared recursively; the mapping gets an ``@id`` so
          that it is only declared once per document;
        * any other iterable - a collection of recursively declared members;
        * anything else - an entity labelled with ``repr(value)``.

        Returns:
            The entity representing *value*.
        """
        if value is None:
            # FIXME: If this can happen in CWL, we'll
            # need a better way to represent this in PROV
            return self.document.entity(CWLPROV["None"], {PROV_LABEL: "None"})

        if isinstance(value, (bool, int, float)):
            # Typically used in job documents for flags

            # FIXME: Make consistent hash URIs for these
            # that somehow include the type
            # (so "1" != 1 != "1.0" != true)
            entity = self.document.entity(uuid.uuid4().urn,
                                          {PROV_VALUE: value})
            self.research_object.add_uri(entity.identifier.uri)
            return entity

        if isinstance(value, str):
            (entity, _) = self.declare_string(value)
            return entity

        if isinstance(value, bytes):
            byte_s = BytesIO(value)
            data_file = self.research_object.add_data_file(byte_s)
            # FIXME: Don't naively assume add_data_file uses hash in filename!
            data_id = "data:%s" % PurePosixPath(data_file).stem
            # NOTE: str() of bytes records the b'...' representation, not
            # decoded text; kept as-is so existing output does not change.
            return self.document.entity(
                data_id,
                {
                    PROV_TYPE: WFPROV["Artifact"],
                    PROV_VALUE: str(value)
                },
            )

        if isinstance(value, MutableMapping):
            if "@id" in value:
                # Already processed this value, but it might not be in this PROV
                entities = self.document.get_record(value["@id"])
                if entities:
                    return entities[0]
                # else, unknown in PROV, re-add below as if it's fresh

            # Base case - we found a File we need to update
            if value.get("class") == "File":
                (entity, _, _) = self.declare_file(value)
                value["@id"] = entity.identifier.uri
                return entity

            if value.get("class") == "Directory":
                entity = self.declare_directory(value)
                value["@id"] = entity.identifier.uri
                return entity
            coll_id = value.setdefault("@id", uuid.uuid4().urn)
            # some other kind of dictionary?
            # TODO: also Save as JSON
            coll = self.document.entity(
                coll_id,
                [
                    (PROV_TYPE, WFPROV["Artifact"]),
                    (PROV_TYPE, PROV["Collection"]),
                    (PROV_TYPE, PROV["Dictionary"]),
                ],
            )

            if value.get("class"):
                _logger.warning("Unknown data class %s.", value["class"])
                # FIXME: The class might be "http://example.com/somethingelse"
                coll.add_asserted_type(CWLPROV[value["class"]])

            # Let's iterate and recurse
            coll_attribs = []  # type: List[Tuple[Identifier, ProvEntity]]
            for (key, val) in value.items():
                v_ent = self.declare_artefact(val)
                self.document.membership(coll, v_ent)
                m_entity = self.document.entity(uuid.uuid4().urn)
                # Note: only support PROV-O style dictionary
                # https://www.w3.org/TR/prov-dictionary/#dictionary-ontological-definition
                # as prov.py do not easily allow PROV-N extensions
                m_entity.add_asserted_type(PROV["KeyEntityPair"])
                m_entity.add_attributes({
                    PROV["pairKey"]: str(key),
                    PROV["pairEntity"]: v_ent
                })
                coll_attribs.append((PROV["hadDictionaryMember"], m_entity))
            coll.add_attributes(coll_attribs)
            self.research_object.add_uri(coll.identifier.uri)
            return coll

        # some other kind of Collection?
        # TODO: also save as JSON
        # NOTE: the try deliberately spans the whole branch, so a TypeError
        # raised while declaring a member also ends up in the repr() fallback.
        try:
            members = []
            for each_input_obj in iter(value):
                # Recurse and register any nested objects
                e = self.declare_artefact(each_input_obj)
                members.append(e)

            # If we reached this, then we were allowed to iterate
            coll = self.document.entity(
                uuid.uuid4().urn,
                [
                    (PROV_TYPE, WFPROV["Artifact"]),
                    (PROV_TYPE, PROV["Collection"]),
                ],
            )
            if not members:
                coll.add_asserted_type(PROV["EmptyCollection"])
            else:
                for member in members:
                    # FIXME: This won't preserve order, for that
                    # we would need to use PROV.Dictionary
                    # with numeric keys
                    self.document.membership(coll, member)
            self.research_object.add_uri(coll.identifier.uri)
            # FIXME: list value does not support adding "@id"
            return coll
        except TypeError:
            _logger.warning("Unrecognized type %s of %r", type(value), value)
            # Let's just fall back to Python repr()
            entity = self.document.entity(uuid.uuid4().urn,
                                          {PROV_LABEL: repr(value)})
            self.research_object.add_uri(entity.identifier.uri)
            return entity

    def used_artefacts(
        self,
        job_order: Union[CWLObjectType, List[CWLObjectType]],
        process_run_id: str,
        name: Optional[str] = None,
    ) -> None:
        """Add used() for each data artefact."""
        if isinstance(job_order, list):
            for entry in job_order:
                self.used_artefacts(entry, process_run_id, name)
        else:
            # FIXME: Use workflow name in packed.cwl, "main" is wrong for nested workflows
            base = "main"
            if name is not None:
                base += "/" + name
            for key, value in job_order.items():
                prov_role = self.wf_ns[f"{base}/{key}"]
                try:
                    entity = self.declare_artefact(value)
                    self.document.used(
                        process_run_id,
                        entity,
                        datetime.datetime.now(),
                        None,
                        {"prov:role": prov_role},
                    )
                except OSError:
                    pass

    def generate_output_prov(
        self,
        final_output: Union[CWLObjectType, MutableSequence[CWLObjectType],
                            None],
        process_run_id: Optional[str],
        name: Optional[str],
    ) -> None:
        """Call wasGeneratedBy() for each output,copy the files into the RO."""
        if isinstance(final_output, MutableSequence):
            for entry in final_output:
                self.generate_output_prov(entry, process_run_id, name)
        elif final_output is not None:
            # Timestamp should be created at the earliest
            timestamp = datetime.datetime.now()

            # For each output, find/register the corresponding
            # entity (UUID) and document it as generated in
            # a role corresponding to the output
            for output, value in final_output.items():
                entity = self.declare_artefact(value)
                if name is not None:
                    name = urllib.parse.quote(str(name), safe=":/,#")
                    # FIXME: Probably not "main" in nested workflows
                    role = self.wf_ns[f"main/{name}/{output}"]
                else:
                    role = self.wf_ns["main/%s" % output]

                if not process_run_id:
                    process_run_id = self.workflow_run_uri

                self.document.wasGeneratedBy(entity, process_run_id, timestamp,
                                             None, {"prov:role": role})

    def prospective_prov(self, job: JobsType) -> None:
        """Declare ``wf:main`` (and its steps) as wfdesc plans.

        For anything but a ``WorkflowJob`` only ``wf:main`` is declared, as a
        ``wfdesc:Process``. For a ``WorkflowJob`` it is declared as a
        ``wfdesc:Workflow`` and each step is declared as a sub-process of it.
        """
        is_workflow = isinstance(job, WorkflowJob)
        # A direct command line tool execution is a plain Process
        main_type = WFDESC["Workflow" if is_workflow else "Process"]
        self.document.entity(
            "wf:main",
            {
                PROV_TYPE: main_type,
                "prov:type": PROV["Plan"],
                "prov:label": "Prospective provenance",
            },
        )
        if not is_workflow:
            return

        for step in job.steps:
            # The first five characters of the step name are dropped
            # (presumably a fixed prefix - TODO confirm against the caller)
            short_name = str(step.name)[5:]
            step_id = urllib.parse.quote("wf:main/" + short_name, safe=":/,#")
            step_plan = self.document.entity(
                step_id,
                {
                    PROV_TYPE: WFDESC["Process"],
                    "prov:type": PROV["Plan"]
                },
            )
            sub_process_attributes = {
                "wfdesc:hasSubProcess": step_plan,
                "prov:label": "Prospective provenance",
            }
            self.document.entity("wf:main", sub_process_attributes)
        # TODO: Declare roles/parameters as well

    def activity_has_provenance(self, activity, prov_ids):
        # type: (str, List[Identifier]) -> None
        """Link *activity* to the nested PROV files that describe it.

        Adds http://www.w3.org/TR/prov-aq/ ``has_provenance`` attributes to
        the activity and registers the same files as an annotation in the
        research object.
        """
        # NOTE: This only works if the corresponding metadata/provenance arcp
        # URI is a pre-registered namespace in the PROV Document
        links = []
        for prov_id in prov_ids:
            links.append((PROV["has_provenance"], prov_id))
        self.document.activity(activity, other_attributes=links)

        # Tip: https://www.w3.org/TR/prov-links/#term-mention can't be used
        # here, as prov:mentionOf() is only for entities, not activities
        file_uris = [prov_id.uri for prov_id in prov_ids]
        relation_uri = PROV["has_provenance"].uri
        self.research_object.add_annotation(activity, file_uris, relation_uri)

    def finalize_prov_profile(self, name):
        # type: (Optional[str]) -> List[Identifier]
        """Serialize the PROV document into the RO in every supported format.

        The main workflow (``name is None``) uses the fixed base name
        ``primary.cwlprov``; any other run uses a name derived from *name*
        and the run UUID. One file per format is written below the
        provenance folder.

        Returns:
            The provenance-namespace identifiers of the written files, in
            the order they were written.
        """
        # NOTE: Relative posix path
        if name is None:
            # main workflow, fixed filenames
            filename = "primary.cwlprov"
        else:
            # ASCII-friendly filename, avoiding % as we don't want %2520 in manifest.json
            wf_name = urllib.parse.quote(str(name), safe="").replace("%", "_")
            # Similarly named workflows could collide on wf_name alone, but
            # the run uuid is included as well, which also covers the case
            # of this step being run in multiple places or iterations
            filename = f"{wf_name}.{self.workflow_run_uuid}.cwlprov"

        basename = str(PurePosixPath(PROVENANCE) / filename)

        # TODO: Also support other profiles than CWLProv, e.g. ProvOne

        # (file extension, keyword arguments for ProvDocument.serialize)
        serializations = (
            # https://www.w3.org/TR/prov-xml/
            (".xml", {"format": "xml", "indent": 4}),
            # https://www.w3.org/TR/prov-n/
            (".provn", {"format": "provn", "indent": 2}),
            # https://www.w3.org/Submission/prov-json/
            (".json", {"format": "json", "indent": 2}),
            # "rdf" aka https://www.w3.org/TR/prov-o/
            # which can be serialized to ttl/nt/jsonld (and more!)
            # https://www.w3.org/TR/turtle/
            (".ttl", {"format": "rdf", "rdf_format": "turtle"}),
            # https://www.w3.org/TR/n-triples/
            (".nt", {"format": "rdf", "rdf_format": "ntriples"}),
            # https://www.w3.org/TR/json-ld/
            # TODO: Use a nice JSON-LD context
            # see also https://eprints.soton.ac.uk/395985/
            # 404 Not Found on https://provenance.ecs.soton.ac.uk/prov.jsonld :(
            (".jsonld", {"format": "rdf", "rdf_format": "json-ld"}),
        )

        # list of prov identifiers of provenance files
        prov_ids = []
        for extension, options in serializations:
            with self.research_object.write_bag_file(
                    basename + extension) as provenance_file:
                self.document.serialize(provenance_file, **options)
                prov_ids.append(self.provenance_ns[filename + extension])

        _logger.debug("[provenance] added provenance: %s", prov_ids)
        return prov_ids
コード例 #3
0
def primer_example():
    """Build and return the PROV primer example as a ``ProvDocument``.

    Every call below is preceded by a comment giving the PROV-N assertion
    it encodes; the assertions come from the ProvToolbox ``primer.pn``
    file linked at the top of the body.

    :return: the populated document
    """
    # https://github.com/lucmoreau/ProvToolbox/blob/master/prov-n/src/test/resources/prov/primer.pn
    # ===========================================================================
    # document
    g = ProvDocument()

    #    prefix ex <http://example/>
    #    prefix dcterms <http://purl.org/dc/terms/>
    #    prefix foaf <http://xmlns.com/foaf/0.1/>
    ex = Namespace(
        "ex", "http://example/"
    )  # namespaces do not need to be explicitly added to a document
    g.add_namespace("dcterms", "http://purl.org/dc/terms/")
    g.add_namespace("foaf", "http://xmlns.com/foaf/0.1/")

    #    entity(ex:article, [dcterms:title="Crime rises in cities"])
    # first time the ex namespace was used, it is added to the document automatically
    g.entity(ex["article"], {"dcterms:title": "Crime rises in cities"})
    #    entity(ex:articleV1)
    g.entity(ex["articleV1"])
    #    entity(ex:articleV2)
    g.entity(ex["articleV2"])
    #    entity(ex:dataSet1)
    g.entity(ex["dataSet1"])
    #    entity(ex:dataSet2)
    g.entity(ex["dataSet2"])
    #    entity(ex:regionList)
    g.entity(ex["regionList"])
    #    entity(ex:composition)
    g.entity(ex["composition"])
    #    entity(ex:chart1)
    g.entity(ex["chart1"])
    #    entity(ex:chart2)
    g.entity(ex["chart2"])
    #    entity(ex:blogEntry)
    g.entity(ex["blogEntry"])

    #    activity(ex:compile)
    g.activity(
        "ex:compile")  # since ex is registered, it can be used like this
    #    activity(ex:compile2)
    g.activity("ex:compile2")
    #    activity(ex:compose)
    g.activity("ex:compose")
    #    activity(ex:correct, 2012-03-31T09:21:00, 2012-04-01T15:21:00)
    g.activity("ex:correct", "2012-03-31T09:21:00",
               "2012-04-01T15:21:00")  # date time can be provided as strings
    #    activity(ex:illustrate)
    g.activity("ex:illustrate")

    #    used(ex:compose, ex:dataSet1, -,   [ prov:role = "ex:dataToCompose"])
    g.used("ex:compose",
           "ex:dataSet1",
           other_attributes={"prov:role": "ex:dataToCompose"})
    #    used(ex:compose, ex:regionList, -, [ prov:role = "ex:regionsToAggregateBy"])
    g.used(
        "ex:compose",
        "ex:regionList",
        other_attributes={"prov:role": "ex:regionsToAggregateBy"},
    )
    #    wasGeneratedBy(ex:composition, ex:compose, -)
    g.wasGeneratedBy("ex:composition", "ex:compose")

    #    used(ex:illustrate, ex:composition, -)
    g.used("ex:illustrate", "ex:composition")
    #    wasGeneratedBy(ex:chart1, ex:illustrate, -)
    g.wasGeneratedBy("ex:chart1", "ex:illustrate")

    #    wasGeneratedBy(ex:chart1, ex:compile,  2012-03-02T10:30:00)
    g.wasGeneratedBy("ex:chart1", "ex:compile", "2012-03-02T10:30:00")
    #    wasGeneratedBy(ex:chart2, ex:compile2, 2012-04-01T15:21:00)
    # This assertion used to be listed without the call that records it.
    g.wasGeneratedBy("ex:chart2", "ex:compile2", "2012-04-01T15:21:00")
    #
    #
    #    agent(ex:derek, [ prov:type="prov:Person", foaf:givenName = "Derek",
    #           foaf:mbox= "<mailto:derek@example.org>"])
    g.agent(
        "ex:derek",
        {
            "prov:type": PROV["Person"],
            "foaf:givenName": "Derek",
            "foaf:mbox": "<mailto:derek@example.org>",
        },
    )
    #    wasAssociatedWith(ex:compose, ex:derek, -)
    g.wasAssociatedWith("ex:compose", "ex:derek")
    #    wasAssociatedWith(ex:illustrate, ex:derek, -)
    g.wasAssociatedWith("ex:illustrate", "ex:derek")
    #
    #    agent(ex:chartgen, [ prov:type="prov:Organization",
    #           foaf:name = "Chart Generators Inc"])
    g.agent(
        "ex:chartgen",
        {
            "prov:type": PROV["Organization"],
            "foaf:name": "Chart Generators Inc"
        },
    )
    #    actedOnBehalfOf(ex:derek, ex:chartgen, ex:compose)
    g.actedOnBehalfOf("ex:derek", "ex:chartgen", "ex:compose")
    #    wasAttributedTo(ex:chart1, ex:derek)
    g.wasAttributedTo("ex:chart1", "ex:derek")

    #    wasGeneratedBy(ex:dataSet2, ex:correct, -)
    g.wasGeneratedBy("ex:dataSet2", "ex:correct")
    #    used(ex:correct, ex:dataSet1, -)
    g.used("ex:correct", "ex:dataSet1")
    #    wasDerivedFrom(ex:dataSet2, ex:dataSet1, [prov:type='prov:Revision'])
    g.wasDerivedFrom("ex:dataSet2",
                     "ex:dataSet1",
                     other_attributes={"prov:type": PROV["Revision"]})
    #    wasDerivedFrom(ex:chart2, ex:dataSet2)
    g.wasDerivedFrom("ex:chart2", "ex:dataSet2")

    #    wasDerivedFrom(ex:blogEntry, ex:article, [prov:type='prov:Quotation'])
    g.wasDerivedFrom("ex:blogEntry",
                     "ex:article",
                     other_attributes={"prov:type": PROV["Quotation"]})
    #    specializationOf(ex:articleV1, ex:article)
    g.specializationOf("ex:articleV1", "ex:article")
    #    wasDerivedFrom(ex:articleV1, ex:dataSet1)
    g.wasDerivedFrom("ex:articleV1", "ex:dataSet1")

    #    specializationOf(ex:articleV2, ex:article)
    g.specializationOf("ex:articleV2", "ex:article")
    #    wasDerivedFrom(ex:articleV2, ex:dataSet2)
    g.wasDerivedFrom("ex:articleV2", "ex:dataSet2")

    #    alternateOf(ex:articleV2, ex:articleV1)
    g.alternateOf("ex:articleV2", "ex:articleV1")

    # endDocument
    return g
コード例 #4
0
ファイル: examples.py プロジェクト: KNMI/wps_prov
def primer_example():
    """Build and return the PROV primer example as a ``ProvDocument``.

    Every call below is preceded by a comment giving the PROV-N assertion
    it encodes; the assertions come from the ProvToolbox ``primer.pn``
    file linked at the top of the body.

    :return: the populated document
    """
    # https://github.com/lucmoreau/ProvToolbox/blob/master/prov-n/src/test/resources/prov/primer.pn
    #===========================================================================
    # document
    g = ProvDocument()

    #    prefix ex <http://example/>
    #    prefix dcterms <http://purl.org/dc/terms/>
    #    prefix foaf <http://xmlns.com/foaf/0.1/>
    ex = Namespace('ex', 'http://example/')  # namespaces do not need to be explicitly added to a document
    g.add_namespace("dcterms", "http://purl.org/dc/terms/")
    g.add_namespace("foaf", "http://xmlns.com/foaf/0.1/")

    #    entity(ex:article, [dcterms:title="Crime rises in cities"])
    # first time the ex namespace was used, it is added to the document automatically
    g.entity(ex['article'], {'dcterms:title': "Crime rises in cities"})
    #    entity(ex:articleV1)
    g.entity(ex['articleV1'])
    #    entity(ex:articleV2)
    g.entity(ex['articleV2'])
    #    entity(ex:dataSet1)
    g.entity(ex['dataSet1'])
    #    entity(ex:dataSet2)
    g.entity(ex['dataSet2'])
    #    entity(ex:regionList)
    g.entity(ex['regionList'])
    #    entity(ex:composition)
    g.entity(ex['composition'])
    #    entity(ex:chart1)
    g.entity(ex['chart1'])
    #    entity(ex:chart2)
    g.entity(ex['chart2'])
    #    entity(ex:blogEntry)
    g.entity(ex['blogEntry'])

    #    activity(ex:compile)
    g.activity('ex:compile')  # since ex is registered, it can be used like this
    #    activity(ex:compile2)
    g.activity('ex:compile2')
    #    activity(ex:compose)
    g.activity('ex:compose')
    #    activity(ex:correct, 2012-03-31T09:21:00, 2012-04-01T15:21:00)
    g.activity('ex:correct', '2012-03-31T09:21:00', '2012-04-01T15:21:00')  # date time can be provided as strings
    #    activity(ex:illustrate)
    g.activity('ex:illustrate')

    #    used(ex:compose, ex:dataSet1, -,   [ prov:role = "ex:dataToCompose"])
    g.used('ex:compose', 'ex:dataSet1', other_attributes={'prov:role': "ex:dataToCompose"})
    #    used(ex:compose, ex:regionList, -, [ prov:role = "ex:regionsToAggregateBy"])
    g.used('ex:compose', 'ex:regionList', other_attributes={'prov:role': "ex:regionsToAggregateBy"})
    #    wasGeneratedBy(ex:composition, ex:compose, -)
    g.wasGeneratedBy('ex:composition', 'ex:compose')

    #    used(ex:illustrate, ex:composition, -)
    g.used('ex:illustrate', 'ex:composition')
    #    wasGeneratedBy(ex:chart1, ex:illustrate, -)
    g.wasGeneratedBy('ex:chart1', 'ex:illustrate')

    #    wasGeneratedBy(ex:chart1, ex:compile,  2012-03-02T10:30:00)
    g.wasGeneratedBy('ex:chart1', 'ex:compile', '2012-03-02T10:30:00')
    #    wasGeneratedBy(ex:chart2, ex:compile2, 2012-04-01T15:21:00)
    # This assertion used to be listed without the call that records it.
    g.wasGeneratedBy('ex:chart2', 'ex:compile2', '2012-04-01T15:21:00')
    #
    #
    #    agent(ex:derek, [ prov:type="prov:Person", foaf:givenName = "Derek",
    #           foaf:mbox= "<mailto:derek@example.org>"])
    g.agent('ex:derek', {
        'prov:type': PROV["Person"], 'foaf:givenName': "Derek", 'foaf:mbox': "<mailto:derek@example.org>"
    })
    #    wasAssociatedWith(ex:compose, ex:derek, -)
    g.wasAssociatedWith('ex:compose', 'ex:derek')
    #    wasAssociatedWith(ex:illustrate, ex:derek, -)
    g.wasAssociatedWith('ex:illustrate', 'ex:derek')
    #
    #    agent(ex:chartgen, [ prov:type="prov:Organization",
    #           foaf:name = "Chart Generators Inc"])
    g.agent('ex:chartgen', {'prov:type': PROV["Organization"], 'foaf:name': "Chart Generators Inc"})
    #    actedOnBehalfOf(ex:derek, ex:chartgen, ex:compose)
    g.actedOnBehalfOf('ex:derek', 'ex:chartgen', 'ex:compose')
    #    wasAttributedTo(ex:chart1, ex:derek)
    g.wasAttributedTo('ex:chart1', 'ex:derek')

    #    wasGeneratedBy(ex:dataSet2, ex:correct, -)
    g.wasGeneratedBy('ex:dataSet2', 'ex:correct')
    #    used(ex:correct, ex:dataSet1, -)
    g.used('ex:correct', 'ex:dataSet1')
    #    wasDerivedFrom(ex:dataSet2, ex:dataSet1, [prov:type='prov:Revision'])
    g.wasDerivedFrom('ex:dataSet2', 'ex:dataSet1', other_attributes={'prov:type': PROV['Revision']})
    #    wasDerivedFrom(ex:chart2, ex:dataSet2)
    g.wasDerivedFrom('ex:chart2', 'ex:dataSet2')

    #    wasDerivedFrom(ex:blogEntry, ex:article, [prov:type='prov:Quotation'])
    g.wasDerivedFrom('ex:blogEntry', 'ex:article', other_attributes={'prov:type': PROV['Quotation']})
    #    specializationOf(ex:articleV1, ex:article)
    g.specializationOf('ex:articleV1', 'ex:article')
    #    wasDerivedFrom(ex:articleV1, ex:dataSet1)
    g.wasDerivedFrom('ex:articleV1', 'ex:dataSet1')

    #    specializationOf(ex:articleV2, ex:article)
    g.specializationOf('ex:articleV2', 'ex:article')
    #    wasDerivedFrom(ex:articleV2, ex:dataSet2)
    g.wasDerivedFrom('ex:articleV2', 'ex:dataSet2')

    #    alternateOf(ex:articleV2, ex:articleV1)
    g.alternateOf('ex:articleV2', 'ex:articleV1')

    # endDocument
    return g
コード例 #5
0
def to_prov(obj, namespace, service):
    """
    Describe a registered microservice as a PROV document.

    The document records, in this order: two hard-coded maintainers and
    their organization, the ADAMA platform, the microservice looked up
    from ``namespace``/``service``, its authors (each with an optional
    sponsor organization), its data sources, and a response entity.
    ``obj`` is accepted but not read by this function.

    :type obj: dict
    :rtype: prov.model.ProvDocument
    :raises APIException: if an author entry lacks ``name`` or ``email``
    """
    doc = ProvDocument()
    aip = Namespace('aip', 'https://araport.org/provenance/')

    doc.add_namespace("dcterms", "http://purl.org/dc/terms/")
    doc.add_namespace("foaf", "http://xmlns.com/foaf/0.1/")

    # Hard coded for now.
    # NOTE(review): the mailbox values below look mangled by e-mail
    # obfuscation -- confirm against the upstream source.
    vaughn_attrs = {
        'prov:type': PROV["Person"],
        'foaf:givenName': "Matthew Vaughn",
        'foaf:mbox': "<mailto:[email protected]>",
    }
    vaughn = doc.agent(aip['matthew_vaughn'], vaughn_attrs)

    moreira_attrs = {
        'prov:type': PROV["Person"],
        'foaf:givenName': "Walter Moreira",
        'foaf:mbox': "<mailto:[email protected]>",
    }
    walter = doc.agent(aip['walter_moreira'], moreira_attrs)

    utexas_attrs = {
        'prov:type': PROV["Organization"],
        'foaf:givenName': "University of Texas at Austin",
    }
    utexas = doc.agent(aip['university_of_texas'], utexas_attrs)

    doc.actedOnBehalfOf(walter, utexas)
    doc.actedOnBehalfOf(vaughn, utexas)

    platform_attrs = {
        'dcterms:title': "ADAMA",
        'dcterms:description': "Araport Data And Microservices API",
        'dcterms:language': "en-US",
        'dcterms:identifier': "https://api.araport.org/community/v0.3/",
        'dcterms:updated': "2015-04-17T09:44:56",
    }
    platform = doc.agent(aip['adama_platform'], platform_attrs)
    doc.wasGeneratedBy(platform, walter)
    doc.wasGeneratedBy(platform, vaughn)

    # The microservice itself, described from its stored registration.
    iden = service_iden(namespace, service)
    srv = service_store[iden]['service']
    service_url = api_url_for('service',
                              namespace=namespace,
                              service=service)
    microservice = doc.agent(aip[iden], {
        'dcterms:title': srv.name.title(),
        'dcterms:description': srv.description,
        'dcterms:language': "en-US",
        'dcterms:identifier': service_url,
        'dcterms:source': srv.git_repository,
    })

    doc.used(microservice, platform, datetime.datetime.now())

    # One agent per author; a sponsor organization is added when named.
    for author in getattr(srv, 'authors', []):
        try:
            author_name, author_email = author['name'], author['email']
        except KeyError:
            raise APIException(
                'name and email are required in author field')
        author_agent = doc.agent(aip[slugify(author_name)], {
            'prov:type': PROV['Person'],
            'foaf:givenName': author_name,
            'foaf:mbox': '<mailto:{}>'.format(author_email),
        })
        sponsor_name = author.get('sponsor_organization_name', None)
        if sponsor_name:
            sponsor_agent = doc.agent(aip[slugify(sponsor_name)], {
                'prov:type': PROV['Organization'],
                'foaf:givenName': sponsor_name,
                'dcterms:identifier': author.get('sponsor_uri', ''),
            })
            doc.actedOnBehalfOf(author_agent, sponsor_agent)
        doc.wasGeneratedBy(microservice, author_agent,
                           datetime.datetime.now())

    for source_entity in process_sources(srv.sources, doc, aip):
        doc.used(microservice, source_entity, datetime.datetime.now())

    response = doc.entity(aip['adama_response'])
    doc.wasGeneratedBy(response, aip[srv.type], datetime.datetime.now())
    doc.used(aip[srv.type], microservice, datetime.datetime.now())

    return doc
コード例 #6
0
class Context(object):
    """
    Container for all of the run specific data.

    It is intended to be used as a singleton; nothing in this class
    enforces that.
    """

    def __init__(self):
        """Create an empty context and start its provenance document."""
        # Warning;
        # If new data is added with a site dimension the
        # clip exposure function may need to be updated
        # so the site data stays consistent.

        # --------------  These variables are saved ----
        #  If new variables are added the save functions
        # will need to be modified.

        # Latitude and longitude values of the exposure data
        # Has a site dimension
        self.exposure_lat = None
        self.exposure_long = None

        # Data with a site dimension
        # key - data name
        # value - A numpy array. First dimension is site. (0 axis)
        # Has a site dimension
        self.exposure_att = None

        # Data for aggregation across sites
        self.exposure_agg = None

        #
        # --------------  The above variables are saved ----

        # key - intensity measure
        # value - One instance of RealisedVulnerabilityCurves.  An att in this
        #         class has a site dimension.
        self.exposure_vuln_curves = None

        # A dictionary of the vulnerability sets.
        # Not associated with exposures.
        # key - vulnerability set ID
        # value - vulnerability set instance
        self.vulnerability_sets = {}

        # A dictionary with keys being vulnerability_set_ids and
        # value being the exposure attribute who's values are vulnerability
        # function ID's.
        self.vul_function_titles = {}

        # Pivot table built by the most recent successful `tabulate` call.
        self.pivot = None

        # A `prov.ProvDocument` to manage provenance information, including
        # adding required namespaces
        self.prov = ProvDocument()
        self.prov.set_default_namespace("")
        self.prov.add_namespace('prov', 'http://www.w3.org/ns/prov#')
        self.prov.add_namespace('xsd', 'http://www.w3.org/2001/XMLSchema#')
        self.prov.add_namespace('foaf', 'http://xmlns.com/foaf/0.1/')
        self.prov.add_namespace('void', 'http://vocab.deri.ie/void#')
        self.prov.add_namespace('dcterms', 'http://purl.org/dc/terms/')

        commit, branch, dt = misc.get_git_commit()
        # Create the fundamental software agent that is this code:
        self.prov.agent(
            ":hazimp", {
                "prov:type": "prov:SoftwareAgent",
                "prov:Revision": commit,
                "prov:branch": branch,
                "prov:date": dt
            })
        # Look the user up once so both records use the same identifier.
        user_id = f":{getpass.getuser()}"
        self.prov.agent(user_id, {"prov:type": "foaf:Person"})
        self.prov.actedOnBehalfOf(":hazimp", user_id)
        self.provlabel = ''

    def set_prov_label(self, label, title="HazImp analysis"):
        """
        Set the qualified label for the provenance data

        An activity with that label is recorded (started now, typed
        ``void:Analysis``) and attributed to the ``:hazimp`` agent.

        :param label: label of the analysis; stored as ``:<label>``
        :param title: value recorded as the activity's ``dcterms:title``
        """

        self.provlabel = f":{label}"
        self.prov.activity(self.provlabel,
                           datetime.now().strftime(DATEFMT), None, {
                               "dcterms:title": title,
                               "prov:type": "void:Analysis"
                           })
        self.prov.wasAttributedTo(self.provlabel, ":hazimp")

    def get_site_shape(self):
        """
        Get the numpy shape of sites the context is storing.
        It is based on the shape of exposure_long.

        :return: ``exposure_long.shape``, or the integer ``0`` (not a
                 tuple) when ``exposure_long`` is None.
        """
        if self.exposure_long is None:
            # Kept as the plain integer 0 for backward compatibility.
            return 0
        return self.exposure_long.shape

    def clip_exposure(self, min_long, min_lat, max_long, max_lat):
        """ min_long, min_lat, max_long, max_lat
        Clip the exposure data so only the exposure values within
        the rectangle formed by  max_lat, min_lat, max_long and
        min_long are included.

        Sites that survive keep their original relative order.

        Note: This must be called before the exposure_vuln_curves
        are determined, since the curves have a site dimension.
        """
        assert self.exposure_vuln_curves is None

        # Indexes of every site lying outside the rectangle on any side.
        bad_indexes = numpy.concatenate([
            numpy.where(self.exposure_long < min_long)[0],
            numpy.where(self.exposure_long > max_long)[0],
            numpy.where(self.exposure_lat < min_lat)[0],
            numpy.where(self.exposure_lat > max_lat)[0],
        ])
        # setdiff1d returns sorted indexes, so the site order is stable
        # (iterating a Python set gives no such guarantee).
        good_indexes = numpy.setdiff1d(
            numpy.arange(self.exposure_lat.size), bad_indexes)
        nothing_left = good_indexes.shape[0] == 0

        if nothing_left:
            self.exposure_lat = numpy.array([])
            self.exposure_long = numpy.array([])
        else:
            self.exposure_lat = self.exposure_lat[good_indexes]
            self.exposure_long = self.exposure_long[good_indexes]

        if isinstance(self.exposure_att, dict):
            for key in self.exposure_att:
                if nothing_left:
                    exp_att = numpy.array([])
                else:
                    exp_att = self.exposure_att[key][good_indexes]
                self.exposure_att[key] = exp_att
        else:
            # Non-dict attributes are expected to offer a positional
            # `take` (presumably a pandas DataFrame -- confirm).
            self.exposure_att = self.exposure_att.take(good_indexes)

    def save_exposure_atts(self, filename, use_parallel=True):
        """
        Save the exposure attributes, including latitude and longitude.
        The file type saved is based on the filename extension.
        Options
           '.csv': Save with `save_csv`.
           anything else: Save the arrays into a single file in
                   uncompressed .npz format.

        :param use_parallel: Set to True for parallel behaviour
        Which is only node 0 writing to file.
        :param filename: The file to be written.
        :return write_dict: The whole dictionary, returned for testing.
            None on nodes that do not write.
        """
        filename, bucket_name, bucket_key = \
            misc.create_temp_file_path_for_s3(filename)
        s1 = self.prov.entity(
            ":HazImp output file", {
                "prov:label": "Full HazImp output file",
                "prov:type": "void:Dataset",
                "prov:atLocation": os.path.basename(filename)
            })
        a1 = self.prov.activity(":SaveImpactData",
                                datetime.now().strftime(DATEFMT), None)
        self.prov.wasGeneratedBy(s1, a1)
        self.prov.wasInformedBy(a1, self.provlabel)
        write_dict = self.exposure_att.copy()
        write_dict[EX_LAT] = self.exposure_lat
        write_dict[EX_LONG] = self.exposure_long

        if use_parallel:
            assert misc.INTID in write_dict
            write_dict = parallel.gather_dict(write_dict,
                                              write_dict[misc.INTID])

        if parallel.STATE.rank == 0 or not use_parallel:
            if filename.endswith('.csv'):
                save_csv(write_dict, filename)
            else:
                numpy.savez(filename, **write_dict)
            misc.upload_to_s3_if_applicable(filename, bucket_name, bucket_key)
            # The write_dict is returned for testing
            # When running in paralled this is a way of getting all
            # of the context info
            return write_dict
        return None

    def save_exposure_aggregation(self, filename, use_parallel=True):
        """
        Save the aggregated exposure attributes.
        The file type saved is based on the filename extension.
        Options
           '.csv': Save with `save_csv_agg`.
           anything else: Save the arrays into a single file in
                   uncompressed .npz format.

        :param use_parallel: Set to True for parallel behaviour which
        is only node 0 writing to file.
        :param filename: The file to be written.
        :return write_dict: The whole dictionary, returned for testing.
            None on nodes that do not write.
        """
        write_dict = self.exposure_agg.copy()

        s1 = self.prov.entity(
            ":Aggregated HazImp output file", {
                "prov:label": "Aggregated HazImp output file",
                "prov:type": "void:Dataset",
                "prov:atLocation": os.path.basename(filename)
            })
        a1 = self.prov.activity(":SaveAggregatedImpactData",
                                datetime.now().strftime(DATEFMT), None)
        self.prov.wasGeneratedBy(s1, a1)
        # Refer to the aggregation activity by identifier; calling
        # `self.prov.activity(...)` here would add a second activity record.
        self.prov.wasInformedBy(a1, ":AggregateLoss")

        if parallel.STATE.rank == 0 or not use_parallel:
            if filename.endswith('.csv'):
                save_csv_agg(write_dict, filename)
            else:
                numpy.savez(filename, **write_dict)
            # The write_dict is returned for testing
            # When running in paralled this is a way of getting all
            # of the context info
            return write_dict
        return None

    def save_aggregation(self,
                         filename,
                         boundaries,
                         impactcode,
                         boundarycode,
                         categories,
                         use_parallel=True):
        """
        Save data aggregated to geospatial regions

        :param str filename: Destination filename
        :param boundaries: Location of the boundary file; fetched from S3
                           first when needed
        :param impactcode: Passed through to `aggregate.choropleth`
        :param boundarycode: Passed through to `aggregate.choropleth` and
                             recorded as ``void:boundary_code``
        :param categories: Passed through to `aggregate.choropleth`
        :param bool use_parallel: True for parallel behaviout, which
                                  is only node 0 writing to file

        """
        LOGGER.info("Saving aggregated data")
        boundaries = misc.download_file_from_s3_if_needed(boundaries)
        filename, bucket_name, bucket_key = \
            misc.create_temp_file_path_for_s3(filename)
        write_dict = self.exposure_att.copy()
        dt = datetime.now().strftime(DATEFMT)
        atts = {
            "prov:type": "void:Dataset",
            "prov:atLocation": os.path.basename(boundaries),
            "prov:generatedAtTime": misc.get_file_mtime(boundaries),
            "void:boundary_code": boundarycode
        }
        bdyent = self.prov.entity(":Aggregation boundaries", atts)
        aggact = self.prov.activity(":AggregationByRegions", dt, None,
                                    {'prov:type': "Spatial aggregation"})
        aggatts = {
            "prov:type": "void:Dataset",
            "prov:atLocation": os.path.basename(filename),
            "prov:generatedAtTime": dt
        }
        aggfileent = self.prov.entity(":AggregationFile", aggatts)
        self.prov.used(aggact, bdyent)
        self.prov.wasInformedBy(aggact, self.provlabel)
        self.prov.wasGeneratedBy(aggfileent, aggact)

        if parallel.STATE.rank != 0 and use_parallel:
            # Only node 0 writes when running in parallel.
            return

        aggregate.choropleth(write_dict, boundaries, impactcode,
                             boundarycode, filename, categories)
        misc.upload_to_s3_if_applicable(filename, bucket_name, bucket_key)
        if (bucket_name is not None and bucket_key is not None
                and bucket_key.endswith('.shp')):
            # A shapefile is several files sharing one root name, so the
            # companion files are uploaded next to the .shp.
            rootname = os.path.splitext(filename)[0]
            # Strip the suffix verified above rather than relying on the
            # temporary file having an extension of the same length.
            base_bucket_key = bucket_key[:-len('.shp')]
            for ext in ('.dbf', '.shx', '.prj'):
                misc.upload_to_s3_if_applicable(rootname + ext, bucket_name,
                                                base_bucket_key + ext)
            misc.upload_to_s3_if_applicable(rootname + '.cpg', bucket_name,
                                            base_bucket_key + '.cpg', True)

    def aggregate_loss(self, groupby=None, kwargs=None):
        """
        Aggregate data by the `groupby` attribute, using the `kwargs` to
        perform any arithmetic aggregation on fields (e.g. summation,
        mean, etc.)

        The result is stored in `exposure_agg`.

        :param str groupby: A column in the `DataFrame` that corresponds to
        regions by which to aggregate data
        :param dict kwargs: A `dict` with keys of valid column names (from the
        `DataFrame`) and values being lists of aggregation functions to apply
        to the columns.

        For example::

        kwargs = {'REPLACEMENT_VALUE': ['mean', 'sum'],
                'structural_loss_ratio': ['mean', 'std']}


        See
        https://pandas.pydata.org/pandas-docs/stable/user_guide/groupby.html#aggregation
        for more guidance on using aggregation with `DataFrames`

        """
        LOGGER.info(f"Aggregating loss using {groupby} attribute")
        a1 = self.prov.activity(":AggregateLoss",
                                datetime.now().strftime(DATEFMT), None, {
                                    "prov:type": "Aggregation",
                                    "void:aggregator": repr(groupby)
                                })
        self.prov.wasInformedBy(a1, self.provlabel)
        self.exposure_agg = aggregate.aggregate_loss_atts(
            self.exposure_att, groupby, kwargs)

    def categorise(self, bins, labels, field_name):
        """
        Bin values into discrete intervals.

        The values binned are those of the loss category type of the last
        entry in `exposure_vuln_curves`.

        :param list bins: Monotonically increasing array of bin edges,
                          including the rightmost edge, allowing for
                          non-uniform bin widths.
        :param labels: Specifies the labels for the returned
                       bins. Must be the same length as the resulting bins.
        :param str field_name: Name of the new column in the `exposure_att`
                                `DataFrame`
        :raises ValueError: if no vulnerability curves have been set
        """
        if not self.exposure_vuln_curves:
            # Without curves there is no loss category to bin.
            raise ValueError(
                "Cannot categorise: no vulnerability curves have been set")

        lct = None
        for intensity_key in self.exposure_vuln_curves:
            lct = self.exposure_vuln_curves[intensity_key].loss_category_type
        LOGGER.info(f"Categorising {lct} values into {len(labels)} categories")
        self.exposure_att[field_name] = pd.cut(self.exposure_att[lct],
                                               bins,
                                               right=False,
                                               labels=labels)

    def _has_columns(self, names):
        """Return True if `names` (one column or a list) are all present."""
        available = self.exposure_att.columns
        if isinstance(names, list):
            # A list cannot be tested with `in` directly (it is unhashable).
            return all(name in available for name in names)
        return names in available

    def tabulate(self, file_name, index=None, columns=None, aggfunc=None):
        """
        Reshape data (produce a "pivot" table) based on column values. Uses
        unique values from specified `index` / `columns` to form axes of the
        resulting DataFrame, then writes to an Excel file. This function does
        not support data aggregation - multiple values will result in a
        MultiIndex in the columns.
        See
        https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.pivot_table.html
        for further details.

        Nothing is written (and an error is logged) when `index` or
        `columns` is not present in the exposure data. A `ValueError`
        while writing is logged and swallowed; `TypeError` and `KeyError`
        are logged and re-raised.

        Parameters
        ----------
        file_name : destination for the pivot table
        index : column or list of columns
            Keys to group by on the pivot table index.  If an array is passed,
            it is being used as the same manner as column values.
        columns : column, or list of the columns
            Keys to group by on the pivot table column.  If an array is passed,
            it is being used as the same manner as column values.
        aggfunc : function, list of functions, dict, default numpy.mean
            If list of functions passed, the resulting pivot table will have
            hierarchical columns whose top level are the function names
            (inferred from the function objects themselves)
            If dict is passed, the key is column to aggregate and value
            is function or list of functions.
        """
        if not self._has_columns(index):
            LOGGER.error(f"Cannot tabulate data using {index} as index")
            LOGGER.error(f"{index} is not an attribute of the exposure data")
            return

        if not self._has_columns(columns):
            LOGGER.error(
                f"Required attribute(s) {columns} not in the exposure data")
            LOGGER.error(
                "Maybe you need to run a categorise job before this one?")
            return

        dt = datetime.now().strftime(DATEFMT)
        a1 = self.prov.activity(
            ":Tabulate", dt, None, {
                "prov:type": "Tabulation",
                "void:aggregator": repr(index),
                "void:attributes": repr(columns),
                "void:aggregation": repr(aggfunc)
            })
        tblatts = {
            "prov:type": "void:Dataset",
            "prov:atLocation": os.path.basename(file_name),
            "prov:generatedAtTime": dt
        }
        tblfileent = self.prov.entity(":TabulationFile", tblatts)

        self.pivot = self.exposure_att.pivot_table(index=index,
                                                   columns=columns,
                                                   aggfunc=aggfunc,
                                                   fill_value=0)
        try:
            self.pivot.to_excel(file_name)
        except (TypeError, KeyError) as err:
            LOGGER.error(err)
            raise
        except ValueError as ve:
            # Best effort: report the failure but let the run continue.
            LOGGER.error(f"Unable to save tabulated data to {file_name}")
            LOGGER.error(ve)
        else:
            # Only link the file to the activity once it has been written.
            self.prov.wasGeneratedBy(tblfileent, a1)
            self.prov.wasInformedBy(a1, self.provlabel)
コード例 #7
0
def get_provenance_history(uuid, normalized_provenance_dict):
    """Build a PROV document from a normalized provenance graph and serialize it.

    Args:
        uuid: Identifier of the entity the history was requested for; it is
            only used in the error messages raised by the validation step.
        normalized_provenance_dict: Mapping with a 'nodes' list (each node is
            a dict carrying at least 'uuid') and a 'relationships' list (each
            item carries 'fromNode', 'toNode' and 'rel_data' -> 'type').

    Returns:
        The value returned by ``ProvDocument.serialize()`` for the document
        that was built (the original comment describes it as PROV-JSON).

    Raises:
        LookupError: If 'relationships' or 'nodes' is missing from the input,
            or if an entity node yields neither a user UUID nor a user email
            to derive the agent id from.
    """
    prov_doc = ProvDocument()
    # The 'prov' prefix is a built-in namespace, no need to redefine it here
    prov_doc.add_namespace(HUBMAP_NAMESPACE, 'https://hubmapconsortium.org/')

    # Basic validation of the input shape
    if 'relationships' not in normalized_provenance_dict:
        raise LookupError(
            f'Missing "relationships" key from the normalized_provenance_dict for Entity of uuid: {uuid}'
        )

    if 'nodes' not in normalized_provenance_dict:
        raise LookupError(
            f'Missing "nodes" key from the normalized_provenance_dict for Entity of uuid: {uuid}'
        )

    # Pack the nodes into a dictionary using the uuid as key
    nodes_dict = {
        node['uuid']: node
        for node in normalized_provenance_dict['nodes']
    }

    # Entity attributes that hold timestamps and must be converted
    entity_timestamp_keys = ('created_timestamp', 'last_modified_timestamp',
                             'published_timestamp')

    # Loop through the relationships and build the provenance document
    for rel_dict in normalized_provenance_dict['relationships']:
        rel_type = rel_dict['rel_data']['type']

        # (Activity) - [ACTIVITY_OUTPUT] -> (Entity)
        if rel_type == 'ACTIVITY_OUTPUT':
            activity_uuid = rel_dict['fromNode']['uuid']
            entity_uuid = rel_dict['toNode']['uuid']
        # (Entity) - [ACTIVITY_INPUT] -> (Activity)
        elif rel_type == 'ACTIVITY_INPUT':
            entity_uuid = rel_dict['fromNode']['uuid']
            activity_uuid = rel_dict['toNode']['uuid']
        else:
            # Any other relationship type has no activity/entity pair to
            # record. Without this guard the uuids would be unbound on the
            # first iteration or left over from the previous relationship.
            continue

        activity_node = nodes_dict[activity_uuid]
        entity_node = nodes_dict[entity_uuid]

        activity_uri = None
        entity_uri = None

        # Skip Lab nodes for agent and organization
        if entity_node['entity_type'] != 'Lab':
            # Get the agent information from the entity node
            agent_record = get_agent_record(entity_node)

            # Use the user UUID as agent ID if present,
            # otherwise fall back to the email with '@' and '.' replaced
            created_by_user_sub_prov_key = f'{HUBMAP_NAMESPACE}:userUUID'
            created_by_user_email_prov_key = f'{HUBMAP_NAMESPACE}:userEmail'
            if created_by_user_sub_prov_key in agent_record:
                agent_id = agent_record[created_by_user_sub_prov_key]
            elif created_by_user_email_prov_key in agent_record:
                agent_id = str(
                    agent_record[created_by_user_email_prov_key]).replace(
                        '@', '-').replace('.', '-')
            else:
                msg = f"Both 'created_by_user_sub' and 'created_by_user_email' are missing form entity of uuid: {entity_node['uuid']}"
                logger.error(msg)
                raise LookupError(msg)

            # Build the agent uri
            agent_uri = build_uri(HUBMAP_NAMESPACE, 'agent', agent_id)

            # Only add the same agent once
            # Multiple entities can be associated to the same agent
            agent = prov_doc.get_record(agent_uri)
            if not agent:
                doc_agent = prov_doc.agent(agent_uri, agent_record)
            else:
                doc_agent = agent[0]

            # Organization
            # Get the organization information from the entity node
            org_record = get_organization_record(entity_node)

            # Build the organization uri
            group_uuid_prov_key = f'{HUBMAP_NAMESPACE}:groupUUID'
            org_uri = build_uri(HUBMAP_NAMESPACE, 'organization',
                                org_record[group_uuid_prov_key])

            # Only add the same organization once
            # Multiple entities can be associated to different agents who are
            # from the same organization
            org = prov_doc.get_record(org_uri)
            if not org:
                doc_org = prov_doc.agent(org_uri, org_record)
            else:
                doc_org = org[0]

            # Build the activity uri
            activity_uri = build_uri(HUBMAP_NAMESPACE, 'activities',
                                     activity_node['uuid'])

            # Register activity if not already registered
            if not prov_doc.get_record(activity_uri):
                # Shared attributes to be added to the PROV document
                activity_attributes = {'prov:type': 'Activity'}

                # Convert the timestamp integer to datetime string
                # Note: in our case, prov:startTime is the same as prov:endTime
                activity_time = timestamp_to_datetime(
                    activity_node['created_timestamp'])

                # Add prefix to all other attributes
                for key in activity_node:
                    prov_key = f'{HUBMAP_NAMESPACE}:{key}'
                    # Use datetime string instead of timestamp integer
                    if key == 'created_timestamp':
                        activity_attributes[prov_key] = activity_time
                    else:
                        activity_attributes[prov_key] = activity_node[key]

                # Register activity
                doc_activity = prov_doc.activity(activity_uri, activity_time,
                                                 activity_time,
                                                 activity_attributes)

                # Relationship: the agent actedOnBehalfOf the org
                prov_doc.actedOnBehalfOf(doc_agent, doc_org, doc_activity)

            # Build the entity uri
            entity_uri = build_uri(HUBMAP_NAMESPACE, 'entities',
                                   entity_node['uuid'])

            # Register entity if not already registered
            if not prov_doc.get_record(entity_uri):
                # Shared attributes to be added to the PROV document
                entity_attributes = {'prov:type': 'Entity'}

                # Add prefix to all other attributes
                for key in entity_node:
                    # Entity property values can be list or dict, skip them:
                    # they are unhashable when calling `prov_doc.entity()`
                    if isinstance(entity_node[key], (list, dict)):
                        continue

                    prov_key = f'{HUBMAP_NAMESPACE}:{key}'
                    if key in entity_timestamp_keys:
                        # Convert the entity's own timestamp. The activity
                        # time is only computed when the activity is newly
                        # registered, so reusing it here could pick up a
                        # value left over from a different activity.
                        entity_attributes[prov_key] = timestamp_to_datetime(
                            entity_node[key])
                    else:
                        entity_attributes[prov_key] = entity_node[key]

                # Register entity
                prov_doc.entity(entity_uri, entity_attributes)

        # Build activity uri and entity uri if not already built
        # (this is the case for the Lab nodes)
        if activity_uri is None:
            activity_uri = build_uri(HUBMAP_NAMESPACE, 'activities',
                                     activity_node['uuid'])

        if entity_uri is None:
            entity_uri = build_uri(HUBMAP_NAMESPACE, 'entities',
                                   entity_node['uuid'])

        # The following relationships apply to all nodes including Lab
        # entity nodes
        if rel_type == 'ACTIVITY_OUTPUT':
            # Relationship: the entity wasGeneratedBy the activity
            prov_doc.wasGeneratedBy(entity_uri, activity_uri)
        else:
            # rel_type is 'ACTIVITY_INPUT' here
            # Relationship: the activity used the entity
            prov_doc.used(activity_uri, entity_uri)

    # Format into json string based on the PROV-JSON Serialization
    # https://www.w3.org/Submission/prov-json/
    serialized_json = prov_doc.serialize()

    return serialized_json
コード例 #8
0
def example():
    """Assemble a sample PROV document for an ADAMA response and emit it.

    Registers people, organisations, the ADAMA platform and a microservice as
    agents, two data sources as entities, a query activity and the response
    entity, then prints the PROV-N text and the default serialization and
    writes a rendering of the graph to 'Sources.png'.
    """
    doc = ProvDocument()
    now = datetime.datetime.now

    # Local namespace; it does not exist yet so it is created here
    aip = Namespace('aip', 'https://araport.org/provenance/')
    # Dublin Core and FOAF vocabularies
    doc.add_namespace("dcterms", "http://purl.org/dc/terms/")
    doc.add_namespace("foaf", "http://xmlns.com/foaf/0.1/")

    # Sponsors and contributors are Agents, addressed as aip:<name>.
    # These could be learned from a profiles service, at the cost of a
    # dependency on Agave; they are hard coded for now.
    matthew_attrs = {
        'prov:type': PROV["Person"],
        'foaf:givenName': "Matthew Vaughn",
        'foaf:mbox': "<mailto:[email protected]>",
    }
    matthew = doc.agent(aip['matthew_vaughn'], matthew_attrs)

    walter_attrs = {
        'prov:type': PROV["Person"],
        'foaf:givenName': "Walter Moreira",
        'foaf:mbox': "<mailto:[email protected]>",
    }
    walter = doc.agent(aip['walter_moreira'], walter_attrs)

    texas_attrs = {
        'prov:type': PROV["Organization"],
        'foaf:givenName': "University of Texas at Austin",
    }
    texas = doc.agent(aip['university_of_texas'], texas_attrs)

    # Delegation to the host university. This may be hard to do for other
    # users since their host institution is not always captured.
    doc.actedOnBehalfOf(walter, texas)
    doc.actedOnBehalfOf(matthew, texas)

    # The ADAMA platform is an Agent; title, description and language are
    # hard coded. dcterms:updated would really be the date ADAMA was updated.
    platform_attrs = {
        'dcterms:title': "ADAMA",
        'dcterms:description': "Araport Data and Microservices API",
        'dcterms:language': "en-US",
        'dcterms:identifier': "https://api.araport.org/community/v0.3/",
        'dcterms:updated': "2015-04-17T09:44:56",
    }
    platform = doc.agent(aip['adama_platform'], platform_attrs)
    doc.wasGeneratedBy(platform, walter)

    # The ADAMA microservice is an Agent too. Title and description come from
    # the service metadata, the identifier is the deployment URI and the
    # source URL is a dummy. Each microservice name should be unique: it is
    # the combination of namespace, service name and version.
    microservice_name = 'mwvaughn/bar_annotation_v1.0.0'
    microservice_attrs = {
        'dcterms:title': "BAR Annotation Service",
        'dcterms:description': "Returns annotation from locus ID",
        'dcterms:language': "en-US",
        'dcterms:identifier':
            "https://api.araport.org/community/v0.3/mwvaughn/bar_annotation_v1.0.0",
        'dcterms:source':
            "https://github.com/Arabidopsis-Information-Portal/prov-enabled-api-sample",
    }
    microservice = doc.agent(aip[microservice_name], microservice_attrs)

    # The microservice was generated by me (ideally dated with the service's
    # update time rather than now) and it used the platform now.
    doc.wasGeneratedBy(microservice, matthew, now())
    doc.used(microservice, platform, now())

    # Sources: BAR agents
    nick_attrs = {
        'prov:type': PROV["Person"],
        'foaf:givenName': "Nicholas Provart",
        'foaf:mbox': "*****@*****.**",
    }
    nick = doc.agent(aip['nicholas_provart'], nick_attrs)

    toronto_attrs = {
        'prov:type': PROV["Organization"],
        'foaf:givenName': "University of Toronto",
        'dcterms:identifier': "http://www.utoronto.ca/",
    }
    toronto = doc.agent(aip['university_of_toronto'], toronto_attrs)
    doc.actedOnBehalfOf(nick, toronto)

    # BAR entity; all fields are derived from Sources.yml. Language, updated
    # and license are optional fields.
    bar_attrs = {
        'dcterms:title': "BAR Arabidopsis AGI -> Annotation",
        'dcterms:description': "Most recent annotation for given AGI",
        'dcterms:language': "en-US",
        'dcterms:identifier':
            "http://bar.utoronto.ca/webservices/agiToAnnot.php",
        'dcterms:updated': "2015-04-17T09:44:56",
        'dcterms:license': "Creative Commons 3.0",
    }
    bar_source = doc.entity(aip['datasource1'], bar_attrs)
    # Attribution to Nick
    doc.wasAttributedTo(bar_source, nick)

    # TAIR agents
    eva_attrs = {
        'prov:type': PROV["Person"],
        'foaf:givenName': "Eva Huala",
    }
    eva = doc.agent(aip['eva_huala'], eva_attrs)

    phoenix_attrs = {
        'prov:type': PROV["Organization"],
        'foaf:givenName': "Phoenix Bioinformatics",
    }
    phoenix = doc.agent(aip['phoenix_bioinformatics'], phoenix_attrs)
    doc.actedOnBehalfOf(eva, phoenix)

    # TAIR entity; the citation is an optional plain-text bibliographic entry
    tair_attrs = {
        'dcterms:title': "TAIR",
        'dcterms:description': "The Arabidopsis Information Resource",
        'dcterms:language': "en-US",
        'dcterms:identifier': "https://www.arabidopsis.org/",
        'dcterms:citation':
            "The Arabidopsis Information Resource (TAIR): improved gene annotation and new tools. Nucleic Acids Research 2011 doi: 10.1093/nar/gkr1090",
    }
    tair_source = doc.entity(aip['datasource2'], tair_attrs)
    doc.wasAttributedTo(tair_source, eva)

    # The two sources are nested in Sources.yml; "derived from" is used for
    # simplicity in this prototype.
    doc.wasDerivedFrom(aip['datasource1'], aip['datasource2'])

    # Activity for the microservice type in use. Alternatives would be
    # do_map, do_generic, do_passthrough and, in future, generate.
    query_id = aip['do_query']
    doc.activity(query_id, now())

    # The current ADAMA response is the subject of the PROV record
    adama_response = doc.entity(aip['adama_response'])

    # The response is generated by the query action (time-stamped), the
    # query used the microservice and the microservice used datasource1.
    doc.wasGeneratedBy(adama_response, query_id, now())
    doc.used(query_id, microservice, now())
    doc.used(microservice, bar_source, now())

    # Print PROV-N, then the default serialization
    print(doc.get_provn())
    print(doc.serialize())
    # Write out as a pretty picture
    prov.dot.prov_to_dot(doc).write_png('Sources.png')
コード例 #9
0
def example():
    """Build a demonstration PROV document describing an ADAMA response.

    Agents (people, organisations, the ADAMA platform and one microservice),
    two data-source entities, a query activity and the response entity are
    registered in turn. The PROV-N text and the default serialization are
    printed and the graph is rendered to 'Sources.png'.
    """
    g = ProvDocument()
    # Local namespace; it doesn't exist yet so we are creating it
    ap = Namespace('aip', 'https://araport.org/provenance/')
    # Dublin Core
    g.add_namespace("dcterms", "http://purl.org/dc/terms/")
    # FOAF
    g.add_namespace("foaf", "http://xmlns.com/foaf/0.1/")

    def register_agent(local_name, prov_kind, given_name, extra=None):
        """Register aip:<local_name> as an agent of the given PROV type."""
        attributes = {
            'prov:type': PROV[prov_kind],
            'foaf:givenName': given_name,
        }
        if extra:
            attributes.update(extra)
        return g.agent(ap[local_name], attributes)

    def timestamp():
        """Return the current local time for time-stamping a relation."""
        return datetime.datetime.now()

    # Sponsors and contributors as Agents (hard coded for now; could be
    # learned from a profiles service at the cost of an Agave dependency)
    me = register_agent('matthew_vaughn', "Person", "Matthew Vaughn",
                        {'foaf:mbox': "<mailto:[email protected]>"})
    walter = register_agent('walter_moreira', "Person", "Walter Moreira",
                            {'foaf:mbox': "<mailto:[email protected]>"})
    utexas = register_agent('university_of_texas', "Organization",
                            "University of Texas at Austin")

    # Set delegation to our host university
    for person in (walter, me):
        g.actedOnBehalfOf(person, utexas)

    # The ADAMA platform as an Agent, with hard-coded descriptive fields
    adama_platform = g.agent(
        ap['adama_platform'],
        {
            'dcterms:title': "ADAMA",
            'dcterms:description': "Araport Data and Microservices API",
            'dcterms:language': "en-US",
            'dcterms:identifier': "https://api.araport.org/community/v0.3/",
            'dcterms:updated': "2015-04-17T09:44:56",
        },
    )
    g.wasGeneratedBy(adama_platform, walter)

    # The ADAMA microservice as an Agent. Its name should be unique: the
    # combination of namespace, service name and version.
    microservice_name = 'mwvaughn/bar_annotation_v1.0.0'
    adama_microservice = g.agent(
        ap[microservice_name],
        {
            'dcterms:title': "BAR Annotation Service",
            'dcterms:description': "Returns annotation from locus ID",
            'dcterms:language': "en-US",
            'dcterms:identifier': "https://api.araport.org/community/v0.3/mwvaughn/bar_annotation_v1.0.0",
            'dcterms:source': "https://github.com/Arabidopsis-Information-Portal/prov-enabled-api-sample",
        },
    )

    # The microservice was generated by me, and it used the platform now
    g.wasGeneratedBy(adama_microservice, me, timestamp())
    g.used(adama_microservice, adama_platform, timestamp())

    # Sources: BAR agents
    nick = register_agent('nicholas_provart', "Person", "Nicholas Provart",
                          {'foaf:mbox': "*****@*****.**"})
    utoronto = register_agent(
        'university_of_toronto', "Organization", "University of Toronto",
        {'dcterms:identifier': "http://www.utoronto.ca/"})
    g.actedOnBehalfOf(nick, utoronto)

    # BAR entity, all fields derived from Sources.yml
    datasource1 = g.entity(
        ap['datasource1'],
        {
            'dcterms:title': "BAR Arabidopsis AGI -> Annotation",
            'dcterms:description': "Most recent annotation for given AGI",
            'dcterms:language': "en-US",
            'dcterms:identifier': "http://bar.utoronto.ca/webservices/agiToAnnot.php",
            'dcterms:updated': "2015-04-17T09:44:56",
            'dcterms:license': "Creative Commons 3.0",
        },
    )
    # Set up attribution to Nick
    g.wasAttributedTo(datasource1, nick)

    # TAIR agents
    eva = register_agent('eva_huala', "Person", "Eva Huala")
    phoenix = register_agent('phoenix_bioinformatics', "Organization",
                             "Phoenix Bioinformatics")
    g.actedOnBehalfOf(eva, phoenix)

    # TAIR entity, all fields derived from Sources.yml
    datasource2 = g.entity(
        ap['datasource2'],
        {
            'dcterms:title': "TAIR",
            'dcterms:description': "The Arabidopsis Information Resource",
            'dcterms:language': "en-US",
            'dcterms:identifier': "https://www.arabidopsis.org/",
            'dcterms:citation': "The Arabidopsis Information Resource (TAIR): improved gene annotation and new tools. Nucleic Acids Research 2011 doi: 10.1093/nar/gkr1090",
        },
    )
    g.wasAttributedTo(datasource2, eva)

    # The two sources are nested in Sources.yml; express that as a
    # derivation for simplicity in this prototype
    g.wasDerivedFrom(ap['datasource1'], ap['datasource2'])

    # Activity for the microservice type in use (do_map, do_generic,
    # do_passthrough and generate are the other candidates)
    do_query = ap['do_query']
    g.activity(do_query, timestamp())

    # The current ADAMA response is what is returned to the user and is
    # thus the subject of the PROV record
    response = g.entity(ap['adama_response'])

    # Response generated by the query, which used the microservice, which
    # in turn used datasource1; each relation is time-stamped
    g.wasGeneratedBy(response, do_query, timestamp())
    g.used(do_query, adama_microservice, timestamp())
    g.used(adama_microservice, datasource1, timestamp())

    # Print prov_n
    print(g.get_provn())
    # Print the default serialization
    print(g.serialize())
    # Write out as a pretty picture
    picture = prov.dot.prov_to_dot(g)
    picture.write_png('Sources.png')
コード例 #10
0
    def get_provenance_history(self, driver, uuid, depth=None):
        prov_doc = ProvDocument()
        #prov_doc.
        #NOTE!! There is a bug with the JSON serializer.  I can't add the prov prefix using this mechanism

        prov_doc.add_namespace('ex', 'http://example.org/')
        prov_doc.add_namespace('hubmap', 'https://hubmapconsortium.org/')

        #prov_doc.add_namespace('dct', 'http://purl.org/dc/terms/')
        #prov_doc.add_namespace('foaf','http://xmlns.com/foaf/0.1/')
        relation_list = []
        with driver.session() as session:
            try:
                # max_level_str is the string used to put a limit on the number of levels to traverse
                max_level_str = ''
                if depth is not None and len(str(depth)) > 0:
                    max_level_str = """maxLevel: {depth},""".format(
                        depth=depth)
                """
                Basically this Cypher query returns a collection of nodes and relationships.  The relationships include ACTIVITY_INPUT, ACTIVITY_OUTPUT and
                HAS_METADATA.  First, we build a dictionary of the nodes using uuid as a key.  Next, we loop through the relationships looking for HAS_METADATA 
                relationships.  The HAS_METADATA relationships connect the Entity nodes with their metadata.  The data from the Metadata node
                becomes the 'metadata' attribute for the Entity node.
                """
                """Possible replacement:
                THIS WORKS...NEEDS LOTS of COMMENTS!!
                MATCH (entity_metadata)<-[r1:HAS_METADATA]-(e)<-[r2:ACTIVITY_OUTPUT]-(a:Activity)-[r3:HAS_METADATA]->(activity_metadata) 
                                WHERE e.hubmap_identifier = 'TEST0010-LK-1-1'
                                WITH [e,a, entity_metadata, activity_metadata] AS entities, COLLECT(r1) + COLLECT(r2) + COLLECT(r3) AS relationships
                                WITH [node in entities | node {.*, label:labels(node)}] AS nodes, [rel in relationships | rel { .*, fromNode: { label:labels(startNode(rel))[0], uuid:startNode(rel).uuid } , toNode: { label:labels(endNode(rel))[0], uuid:endNode(rel).uuid }, rel_data: { type: type(rel) } } ] as rels
                                RETURN nodes, rels
                UNION OPTIONAL MATCH (activity_metadata)<-[r1:HAS_METADATA]-(a:Activity)<-[r2:ACTIVITY_INPUT|:ACTIVITY_OUTPUT*]-(parent)-[r3:HAS_METADATA]->(parent_metadata),
                (e)<-[r4:ACTIVITY_OUTPUT]-(a:Activity) 
                                WHERE e.hubmap_identifier = 'TEST0010-LK-1-1'
                                WITH [parent,parent_metadata, a, activity_metadata] AS nodes, [rel in COLLECT(r1) + COLLECT(r3) + COLLECT(r4)+COLLECT(apoc.convert.toRelationship(r2)) | rel { .*, fromNode: { label:labels(startNode(rel))[0], uuid:startNode(rel).uuid } , toNode: { label:labels(endNode(rel))[0], uuid:endNode(rel).uuid }, rel_data: { type: type(rel) } } ] as rels
                                RETURN DISTINCT nodes, rels                

                uuid for TEST0010-LK-1-1 for testing: eda3916db4695d834eb6c51a893d06f1
                """

                stmt = """MATCH (n:Entity {{ uuid: '{uuid}' }}) 
                CALL apoc.path.subgraphAll(n, {{ {max_level_str} relationshipFilter:'<ACTIVITY_INPUT|<ACTIVITY_OUTPUT|HAS_METADATA>' }}) YIELD nodes, relationships
                WITH [node in nodes | node {{ .*, label:labels(node)[0] }} ] as nodes, 
                     [rel in relationships | rel {{ .*, fromNode: {{ label:labels(startNode(rel))[0], uuid:startNode(rel).uuid }} , toNode: {{ label:labels(endNode(rel))[0], uuid:endNode(rel).uuid }}, rel_data: {{ type: type(rel) }} }} ] as rels
                WITH {{ nodes:nodes, relationships:rels }} as json
                RETURN json""".format(uuid=uuid, max_level_str=max_level_str)

                result = session.run(stmt)

                #there should only be one record
                for jsonData in result:
                    try:
                        record = dict(jsonData)['json']

                        if 'relationships' not in record:
                            raise LookupError(
                                'Error, unable to find relationships for uuid:'
                                + uuid)
                        if 'nodes' not in record:
                            raise LookupError(
                                'Error, unable to find nodes for uuid:' + uuid)

                        node_dict = {}
                        # pack the nodes into a dictionary using the uuid as a key
                        for node_record in record['nodes']:
                            node_dict[node_record['uuid']] = node_record

                        # TODO: clean up nodes
                        # remove nodes that lack metadata

                        # need to devise a methodology for this
                        # try preprocessing the record['relationships'] here:
                        # make a copy of the node_dict called unreferenced_node_dict
                        # loop through the relationships and find all the has_metadata relationships
                        # for each node pair in the has_metadata relationship, delete it from the unreferenced_node_dict
                        # once the loop is finished, continue as before
                        # add some logic when generating the wasGenerated and used relationships.  If either node is in the
                        # unreferenced_node_dict, then ignore the relationship

                        # now, connect the nodes
                        for rel_record in record['relationships']:
                            from_uuid = rel_record['fromNode']['uuid']
                            to_uuid = rel_record['toNode']['uuid']
                            from_node = node_dict[from_uuid]
                            to_node = node_dict[to_uuid]
                            if rel_record['rel_data'][
                                    'type'] == HubmapConst.HAS_METADATA_REL:
                                # assign the metadata node as the metadata attribute
                                # just extract the provenance information from the metadata node

                                entity_timestamp_json = Provenance.get_json_timestamp(
                                    int(to_node[
                                        HubmapConst.
                                        PROVENANCE_CREATE_TIMESTAMP_ATTRIBUTE])
                                )
                                provenance_data = {
                                    ProvConst.PROV_GENERATED_TIME_ATTRIBUTE:
                                    entity_timestamp_json
                                }
                                type_code = None
                                isEntity = True
                                if HubmapConst.ENTITY_TYPE_ATTRIBUTE in from_node:
                                    type_code = from_node[
                                        HubmapConst.ENTITY_TYPE_ATTRIBUTE]
                                elif HubmapConst.ACTIVITY_TYPE_ATTRIBUTE in from_node:
                                    type_code = from_node[
                                        HubmapConst.ACTIVITY_TYPE_ATTRIBUTE]
                                    isEntity = False
                                label_text = None
                                if HubmapConst.LAB_IDENTIFIER_ATTRIBUTE in from_node:
                                    label_text = from_node[
                                        HubmapConst.LAB_IDENTIFIER_ATTRIBUTE]
                                else:
                                    label_text = from_node[
                                        HubmapConst.UUID_ATTRIBUTE]

                                # build metadata attribute from the Metadata node
                                metadata_attribute = {}
                                for attribute_key in to_node:
                                    if attribute_key not in self.metadata_ignore_attributes:
                                        if attribute_key in self.known_attribute_map:
                                            # special case: timestamps
                                            if attribute_key == HubmapConst.PROVENANCE_MODIFIED_TIMESTAMP_ATTRIBUTE:
                                                provenance_data[
                                                    self.known_attribute_map[
                                                        attribute_key]] = Provenance.get_json_timestamp(
                                                            int(to_node[
                                                                attribute_key])
                                                        )
                                        else:  #add any extraneous data to the metadata attribute
                                            metadata_attribute[
                                                attribute_key] = to_node[
                                                    attribute_key]

                                # Add the agent and organization here, plus the relationships linking them to the entity/activity.
                                agent_record = self.get_agent_record(to_node)
                                agent_unique_id = str(agent_record[
                                    ProvConst.HUBMAP_PROV_USER_EMAIL]).replace(
                                        '@', '-')
                                agent_unique_id = str(agent_unique_id).replace(
                                    '.', '-')
                                if ProvConst.HUBMAP_PROV_USER_UUID in agent_record:
                                    agent_unique_id = agent_record[
                                        ProvConst.HUBMAP_PROV_USER_UUID]
                                agent_uri = Provenance.build_uri(
                                    'hubmap', 'agent', agent_unique_id)
                                organization_record = self.get_organization_record(
                                    to_node)
                                organization_uri = Provenance.build_uri(
                                    'hubmap', 'organization',
                                    organization_record[
                                        ProvConst.HUBMAP_PROV_GROUP_UUID])
                                doc_agent = None
                                doc_org = None

                                get_agent = prov_doc.get_record(agent_uri)
                                # only add this once
                                if len(get_agent) == 0:
                                    doc_agent = prov_doc.agent(
                                        agent_uri, agent_record)
                                else:
                                    doc_agent = get_agent[0]

                                get_org = prov_doc.get_record(organization_uri)
                                # only add this once
                                if len(get_org) == 0:
                                    doc_org = prov_doc.agent(
                                        organization_uri, organization_record)
                                else:
                                    doc_org = get_org[0]

                                other_attributes = {
                                    ProvConst.PROV_LABEL_ATTRIBUTE:
                                    label_text,
                                    ProvConst.PROV_TYPE_ATTRIBUTE:
                                    type_code,
                                    ProvConst.HUBMAP_DOI_ATTRIBUTE:
                                    from_node[HubmapConst.DOI_ATTRIBUTE],
                                    ProvConst.HUBMAP_DISPLAY_DOI_ATTRIBUTE:
                                    from_node[
                                        HubmapConst.DISPLAY_DOI_ATTRIBUTE],
                                    ProvConst.HUBMAP_DISPLAY_IDENTIFIER_ATTRIBUTE:
                                    label_text,
                                    ProvConst.HUBMAP_UUID_ATTRIBUTE:
                                    from_node[HubmapConst.UUID_ATTRIBUTE]
                                }
                                # only add metadata if it contains data
                                if len(metadata_attribute) > 0:
                                    other_attributes[
                                        ProvConst.
                                        HUBMAP_METADATA_ATTRIBUTE] = json.dumps(
                                            metadata_attribute)
                                # add the provenance data to the other_attributes
                                other_attributes.update(provenance_data)
                                if isEntity == True:
                                    prov_doc.entity(
                                        Provenance.build_uri(
                                            'hubmap', 'entities',
                                            from_node['uuid']),
                                        other_attributes)
                                else:
                                    activity_timestamp_json = Provenance.get_json_timestamp(
                                        int(to_node[
                                            HubmapConst.
                                            PROVENANCE_CREATE_TIMESTAMP_ATTRIBUTE]
                                            ))
                                    doc_activity = prov_doc.activity(
                                        Provenance.build_uri(
                                            'hubmap', 'activities',
                                            from_node['uuid']),
                                        activity_timestamp_json,
                                        activity_timestamp_json,
                                        other_attributes)
                                    prov_doc.actedOnBehalfOf(
                                        doc_agent, doc_org, doc_activity)
                            elif rel_record['rel_data']['type'] in [
                                    HubmapConst.ACTIVITY_OUTPUT_REL,
                                    HubmapConst.ACTIVITY_INPUT_REL
                            ]:
                                to_node_uri = None
                                from_node_uri = None
                                if HubmapConst.ENTITY_TYPE_ATTRIBUTE in to_node:
                                    to_node_uri = Provenance.build_uri(
                                        'hubmap', 'entities', to_node['uuid'])
                                else:
                                    to_node_uri = Provenance.build_uri(
                                        'hubmap', 'activities',
                                        to_node['uuid'])
                                if HubmapConst.ENTITY_TYPE_ATTRIBUTE in from_node:
                                    from_node_uri = Provenance.build_uri(
                                        'hubmap', 'entities',
                                        from_node['uuid'])
                                else:
                                    from_node_uri = Provenance.build_uri(
                                        'hubmap', 'activities',
                                        from_node['uuid'])

                                if rel_record['rel_data'][
                                        'type'] == 'ACTIVITY_OUTPUT':
                                    #prov_doc.wasGeneratedBy(entity, activity, time, identifier, other_attributes)
                                    prov_doc.wasGeneratedBy(
                                        to_node_uri, from_node_uri)

                                if rel_record['rel_data'][
                                        'type'] == 'ACTIVITY_INPUT':
                                    #prov_doc.used(activity, entity, time, identifier, other_attributes)
                                    prov_doc.used(to_node_uri, from_node_uri)

                                # for now, simply create a "relation" where the fromNode's uuid is connected to a toNode's uuid via a relationship:
                                # ex: {'fromNodeUUID': '42e10053358328c9079f1c8181287b6d', 'relationship': 'ACTIVITY_OUTPUT', 'toNodeUUID': '398400024fda58e293cdb435db3c777e'}
                                rel_data_record = {
                                    'fromNodeUUID': from_node['uuid'],
                                    'relationship':
                                    rel_record['rel_data']['type'],
                                    'toNodeUUID': to_node['uuid']
                                }
                                relation_list.append(rel_data_record)
                        return_data = {
                            'nodes': node_dict,
                            'relations': relation_list
                        }
                    except Exception as e:
                        raise e

                # Work around a bug in the JSON serializer by manually inserting the prov prefix.

                output_doc = prov_doc.serialize(indent=2)
                output_doc = output_doc.replace(
                    '"prefix": {',
                    '"prefix": {\n    "prov" : "http://www.w3.org/ns/prov#", ')

                #output_doc = prov_doc.serialize(format='rdf', rdf_format='trig')
                #output_doc = prov_doc.serialize(format='provn')
                return output_doc

            except ConnectionError as ce:
                print('A connection error occurred: ', str(ce.args[0]))
                raise ce
            except ValueError as ve:
                print('A value error occurred: ', ve.value)
                raise ve
            except Exception as e:
                print('An exception occurred in get_provenance_history: ' +
                      str(e))
                traceback.print_exc()