Esempio n. 1
0
def get_provenance_history(uuid, normalized_provenance_dict):
    """Build a PROV document from a normalized provenance graph and serialize it.

    Every relationship links one activity node and one entity node. For each
    relationship whose entity is not a Lab, the agent, organization, activity
    and entity records are registered in the document (each only once). A
    ``wasGeneratedBy`` (ACTIVITY_OUTPUT) or ``used`` (ACTIVITY_INPUT) relation
    is then added for every relationship, Lab entities included.
    Relationships of any other type are logged and skipped.

    Parameters
    ----------
    uuid : str
        The uuid of the target entity, only used in validation error messages
    normalized_provenance_dict : dict
        Must contain the keys 'nodes' (a list of node dicts, each with a
        'uuid' key) and 'relationships' (a list of dicts providing
        rel_data.type, fromNode.uuid and toNode.uuid)

    Returns
    -------
    The value returned by ``ProvDocument.serialize()`` called with no
    arguments (PROV-JSON, https://www.w3.org/Submission/prov-json/)

    Raises
    ------
    LookupError
        If 'relationships' or 'nodes' is missing from the input, or if the
        agent record of a non-Lab entity has neither a user UUID nor an email
    """
    # A bit validation, done first so nothing is built for invalid input
    for required_key in ('relationships', 'nodes'):
        if required_key not in normalized_provenance_dict:
            raise LookupError(
                f'Missing "{required_key}" key from the normalized_provenance_dict for Entity of uuid: {uuid}'
            )

    prov_doc = ProvDocument()
    # The 'prov' prefix is a built-in namespace, no need to redefine here
    prov_doc.add_namespace(HUBMAP_NAMESPACE, 'https://hubmapconsortium.org/')

    # Pack the nodes into a dictionary using the uuid as key
    nodes_dict = {
        node['uuid']: node
        for node in normalized_provenance_dict['nodes']
    }

    # Loop-invariant attribute keys
    created_by_user_sub_prov_key = f'{HUBMAP_NAMESPACE}:userUUID'
    created_by_user_email_prov_key = f'{HUBMAP_NAMESPACE}:userEmail'
    group_uuid_prov_key = f'{HUBMAP_NAMESPACE}:groupUUID'
    entity_timestamp_keys = ('created_timestamp', 'last_modified_timestamp',
                             'published_timestamp')

    # Loop through the relationships and build the provenance document
    for rel_dict in normalized_provenance_dict['relationships']:
        rel_type = rel_dict['rel_data']['type']

        # (Activity) - [ACTIVITY_OUTPUT] -> (Entity)
        if rel_type == 'ACTIVITY_OUTPUT':
            activity_uuid = rel_dict['fromNode']['uuid']
            entity_uuid = rel_dict['toNode']['uuid']
        # (Entity) - [ACTIVITY_INPUT] -> (Activity)
        elif rel_type == 'ACTIVITY_INPUT':
            entity_uuid = rel_dict['fromNode']['uuid']
            activity_uuid = rel_dict['toNode']['uuid']
        else:
            # Without this guard the uuids below would be unbound (first
            # iteration) or silently reused from the previous relationship
            logger.warning(
                "Skipping relationship of unsupported type %r in the provenance of entity %s",
                rel_type, uuid)
            continue

        activity_node = nodes_dict[activity_uuid]
        entity_node = nodes_dict[entity_uuid]

        # Both URIs are needed for every relationship, Lab nodes included
        activity_uri = build_uri(HUBMAP_NAMESPACE, 'activities',
                                 activity_node['uuid'])
        entity_uri = build_uri(HUBMAP_NAMESPACE, 'entities',
                               entity_node['uuid'])

        # Skip Lab nodes for agent and organization
        if entity_node['entity_type'] != 'Lab':
            # Get the agent information from the entity node
            agent_record = get_agent_record(entity_node)

            # Use 'created_by_user_sub' as agent ID if presents
            # Otherwise, fall back to use email by replacing @ and .
            if created_by_user_sub_prov_key in agent_record:
                agent_id = agent_record[created_by_user_sub_prov_key]
            elif created_by_user_email_prov_key in agent_record:
                agent_id = str(
                    agent_record[created_by_user_email_prov_key]).replace(
                        '@', '-').replace('.', '-')
            else:
                msg = f"Both 'created_by_user_sub' and 'created_by_user_email' are missing form entity of uuid: {entity_node['uuid']}"
                logger.error(msg)
                raise LookupError(msg)

            # Build the agent uri
            agent_uri = build_uri(HUBMAP_NAMESPACE, 'agent', agent_id)

            # Only add the same agent once
            # Multiple entities can be associated to the same agent
            existing_agents = prov_doc.get_record(agent_uri)
            if existing_agents:
                doc_agent = existing_agents[0]
            else:
                doc_agent = prov_doc.agent(agent_uri, agent_record)

            # Organization
            # Get the organization information from the entity node
            org_record = get_organization_record(entity_node)

            # Build the organization uri
            org_uri = build_uri(HUBMAP_NAMESPACE, 'organization',
                                org_record[group_uuid_prov_key])

            # Only add the same organization once
            # Multiple entities can be associated to different agents who are from the same organization
            existing_orgs = prov_doc.get_record(org_uri)
            if existing_orgs:
                doc_org = existing_orgs[0]
            else:
                doc_org = prov_doc.agent(org_uri, org_record)

            # Register activity if not already registered
            if not prov_doc.get_record(activity_uri):
                # Shared attributes to be added to the PROV document
                activity_attributes = {'prov:type': 'Activity'}

                # Convert the timestamp integer to datetime string
                # Note: in our case, prov:startTime is the same as prov:endTime
                activity_time = timestamp_to_datetime(
                    activity_node['created_timestamp'])

                # Add prefix to all other attributes
                for key, node_value in activity_node.items():
                    prov_key = f'{HUBMAP_NAMESPACE}:{key}'
                    # Use datetime string instead of timestamp integer
                    if key == 'created_timestamp':
                        activity_attributes[prov_key] = activity_time
                    else:
                        activity_attributes[prov_key] = node_value

                # Register activity
                doc_activity = prov_doc.activity(activity_uri, activity_time,
                                                 activity_time,
                                                 activity_attributes)

                # Relationship: the agent actedOnBehalfOf the org
                prov_doc.actedOnBehalfOf(doc_agent, doc_org, doc_activity)

            # Register entity if not already registered
            if not prov_doc.get_record(entity_uri):
                # Shared attributes to be added to the PROV document
                entity_attributes = {'prov:type': 'Entity'}

                # Add prefix to all other attributes
                for key, node_value in entity_node.items():
                    # Entity property values can be list or dict, skip
                    # And list and dict are unhashable types when calling `prov_doc.entity()`
                    if isinstance(node_value, (list, dict)):
                        continue

                    prov_key = f'{HUBMAP_NAMESPACE}:{key}'
                    if key in entity_timestamp_keys:
                        # Convert the entity's OWN timestamp. The activity
                        # time must not be used here: it belongs to another
                        # node and is not even computed when the activity
                        # was registered by an earlier relationship
                        entity_attributes[prov_key] = timestamp_to_datetime(
                            node_value)
                    else:
                        entity_attributes[prov_key] = node_value

                # Register entity
                prov_doc.entity(entity_uri, entity_attributes)

        # The following relationships apply to all node including Lab entity nodes
        if rel_type == 'ACTIVITY_OUTPUT':
            # Relationship: the entity wasGeneratedBy the activity
            prov_doc.wasGeneratedBy(entity_uri, activity_uri)
        else:
            # Only ACTIVITY_INPUT can reach this point
            # Relationship: the activity used the entity
            prov_doc.used(activity_uri, entity_uri)

    # Format into json string based on the PROV-JSON Serialization
    # https://www.w3.org/Submission/prov-json/
    return prov_doc.serialize()
Esempio n. 2
0
class ProvenanceProfile:
    """
    Provenance profile.

    Populated as the workflow runs.
    """
    def __init__(
        self,
        research_object: "ResearchObject",
        full_name: str,
        host_provenance: bool,
        user_provenance: bool,
        orcid: str,
        fsaccess: StdFsAccess,
        run_uuid: Optional[uuid.UUID] = None,
    ) -> None:
        """
        Store the run settings and build the initial PROV document.

        A fresh UUID is minted for the workflow run when ``run_uuid`` is not
        given. The constructor finishes by calling ``generate_prov_doc()``.
        """
        # Collaborators, plus the values taken over from the research object
        self.research_object = research_object
        self.fsaccess = fsaccess
        self.folder = research_object.folder
        self.engine_uuid = research_object.engine_uuid  # type: str
        self.add_to_manifest = research_object.add_to_manifest
        self.document = ProvDocument()

        # What should be recorded about the host and the user
        self.host_provenance = host_provenance
        self.user_provenance = user_provenance

        # Creator identity; each part is optional
        self.orcid = orcid
        if orcid:
            _logger.debug("[provenance] Creator ORCID: %s", orcid)
        self.full_name = full_name
        if full_name:
            _logger.debug("[provenance] Creator Full name: %s", full_name)

        # Identity of this workflow run
        if not run_uuid:
            run_uuid = uuid.uuid4()
        self.workflow_run_uuid = run_uuid
        self.workflow_run_uri = run_uuid.urn  # type: str

        self.generate_prov_doc()

    def __str__(self) -> str:
        """Describe this profile by its workflow run URI and research object."""
        template = "ProvenanceProfile <{}> in <{}>"
        return template.format(self.workflow_run_uri, self.research_object)

    def generate_prov_doc(self) -> Tuple[str, ProvDocument]:
        """
        Populate ``self.document`` with the baseline provenance of a run.

        Registers the namespaces used by the profile, then declares the
        account agent, optionally a person agent (when an ORCID or full name
        is known) or host/user provenance (otherwise), the workflow engine
        agent and the workflow-run activity, and links them together.

        As side effects it sets ``self.cwltool_version``,
        ``self.metadata_ns``, ``self.provenance_ns`` and ``self.wf_ns``.

        :return: the tuple ``(self.workflow_run_uri, self.document)``
        """
        def host_provenance(document: ProvDocument) -> None:
            """Record host provenance."""
            document.add_namespace(CWLPROV)
            document.add_namespace(UUID)
            document.add_namespace(FOAF)

            hostname = getfqdn()
            # won't have a foaf:accountServiceHomepage for unix hosts, but
            # we can at least provide hostname
            document.agent(
                ACCOUNT_UUID,
                {
                    PROV_TYPE: FOAF["OnlineAccount"],
                    "prov:location": hostname,
                    CWLPROV["hostname"]: hostname,
                },
            )

        # Label later attached to the workflow engine agent; built from the
        # last whitespace-separated token of versionstring()
        self.cwltool_version = "cwltool %s" % versionstring().split()[-1]
        self.document.add_namespace("wfprov",
                                    "http://purl.org/wf4ever/wfprov#")
        # document.add_namespace('prov', 'http://www.w3.org/ns/prov#')
        self.document.add_namespace("wfdesc",
                                    "http://purl.org/wf4ever/wfdesc#")
        # TODO: Make this ontology. For now only has cwlprov:image
        self.document.add_namespace("cwlprov", "https://w3id.org/cwl/prov#")
        self.document.add_namespace("foaf", "http://xmlns.com/foaf/0.1/")
        self.document.add_namespace("schema", "http://schema.org/")
        self.document.add_namespace("orcid", "https://orcid.org/")
        self.document.add_namespace("id", "urn:uuid:")
        # NOTE: Internet draft expired 2004-03-04 (!)
        #  https://tools.ietf.org/html/draft-thiemann-hash-urn-01
        # TODO: Change to nih:sha-256; hashes
        #  https://tools.ietf.org/html/rfc6920#section-7
        self.document.add_namespace("data", "urn:hash::sha1:")
        # Also needed for docker images
        self.document.add_namespace(SHA256, "nih:sha-256;")

        # info only, won't really be used by prov as sub-resources use /
        self.document.add_namespace("researchobject",
                                    self.research_object.base_uri)
        # annotations
        self.metadata_ns = self.document.add_namespace(
            "metadata", self.research_object.base_uri + METADATA + "/")
        # Pre-register provenance directory so we can refer to its files
        self.provenance_ns = self.document.add_namespace(
            "provenance",
            self.research_object.base_uri + posix_path(PROVENANCE) + "/")
        # Namespaces pointing into the packed workflow and the primary job
        # document stored under the research object's base URI
        ro_identifier_workflow = self.research_object.base_uri + "workflow/packed.cwl#"
        self.wf_ns = self.document.add_namespace("wf", ro_identifier_workflow)
        ro_identifier_input = (self.research_object.base_uri +
                               "workflow/primary-job.json#")
        self.document.add_namespace("input", ro_identifier_input)

        # More info about the account (e.g. username, fullname)
        # may or may not have been previously logged by user_provenance()
        # .. but we always know cwltool was launched (directly or indirectly)
        # by a user account, as cwltool is a command line tool
        account = self.document.agent(ACCOUNT_UUID)
        if self.orcid or self.full_name:
            # A creator identity was supplied: declare a person agent and
            # state that the account acted on that person's behalf
            person = {PROV_TYPE: PROV["Person"], "prov:type": SCHEMA["Person"]}
            if self.full_name:
                person["prov:label"] = self.full_name
                person["foaf:name"] = self.full_name
                person["schema:name"] = self.full_name
            else:
                # TODO: Look up name from ORCID API?
                pass
            # Without an ORCID the person is identified by a new random UUID
            agent = self.document.agent(self.orcid or uuid.uuid4().urn, person)
            self.document.actedOnBehalfOf(account, agent)
        else:
            # No creator identity: host and user provenance are recorded
            # only in this branch, and only when the matching flag is set
            if self.host_provenance:
                host_provenance(self.document)
            if self.user_provenance:
                self.research_object.user_provenance(self.document)
        # The execution of cwltool
        wfengine = self.document.agent(
            self.engine_uuid,
            {
                PROV_TYPE: PROV["SoftwareAgent"],
                "prov:type": WFPROV["WorkflowEngine"],
                "prov:label": self.cwltool_version,
            },
        )
        # FIXME: This datetime will be a bit too delayed, we should
        # capture when cwltool.py earliest started?
        self.document.wasStartedBy(wfengine, None, account,
                                   datetime.datetime.now())
        # define workflow run level activity
        # (start time is "now"; the end time is left unset here)
        self.document.activity(
            self.workflow_run_uri,
            datetime.datetime.now(),
            None,
            {
                PROV_TYPE: WFPROV["WorkflowRun"],
                "prov:label": "Run of workflow/packed.cwl#main",
            },
        )
        # association between SoftwareAgent and WorkflowRun
        main_workflow = "wf:main"
        self.document.wasAssociatedWith(self.workflow_run_uri,
                                        self.engine_uuid, main_workflow)
        self.document.wasStartedBy(self.workflow_run_uri, None,
                                   self.engine_uuid, datetime.datetime.now())
        return (self.workflow_run_uri, self.document)

    def evaluate(
        self,
        process: Process,
        job: JobsType,
        job_order_object: CWLObjectType,
        research_obj: "ResearchObject",
    ) -> None:
        """
        Record prospective provenance and used inputs for a tool or workflow.

        Handles two cases: a process without ``steps`` (an independent
        command line tool execution) and a job that has a ``workflow``
        attribute (a workflow execution). Anything else is ignored. Only in
        the tool case is the customised job also handed to
        ``research_obj.create_job()``.
        """
        standalone_tool = not hasattr(process, "steps")
        if not standalone_tool and not hasattr(job, "workflow"):
            # Neither an independent tool nor a workflow: nothing to record
            return

        self.prospective_prov(job)
        job_inputs = copy_job_order(job, job_order_object)
        self.used_artefacts(job_inputs, self.workflow_run_uri)

        if standalone_tool:
            research_obj.create_job(job_inputs)

    def record_process_start(
            self,
            process: Process,
            job: JobsType,
            process_run_id: Optional[str] = None) -> Optional[str]:
        """
        Work out the run identifier for a starting process.

        A process without ``steps`` maps to the workflow run URI itself. A
        job without a ``workflow`` attribute (a command line tool running as
        part of a workflow) gets a new process run recorded through
        ``start_process()``. In every other case the given
        ``process_run_id`` is returned unchanged.
        """
        if not hasattr(process, "steps"):
            return self.workflow_run_uri

        if hasattr(job, "workflow"):
            return process_run_id

        # commandline tool execution as part of workflow
        if isinstance(job, (CommandLineJob, JobBase, WorkflowJob)):
            job_name = job.name
        else:
            job_name = ""
        quoted_name = urllib.parse.quote(job_name, safe=":/,#")
        return self.start_process(quoted_name, datetime.datetime.now())

    def start_process(
        self,
        process_name: str,
        when: datetime.datetime,
        process_run_id: Optional[str] = None,
    ) -> str:
        """
        Declare a process-run activity and tie it to the workflow run.

        A new UUID URN is used as the identifier when ``process_run_id`` is
        not given. The identifier that was used is returned.
        """
        run_id = uuid.uuid4().urn if process_run_id is None else process_run_id

        label = "Run of workflow/packed.cwl#main/" + process_name
        attributes = {
            PROV_TYPE: WFPROV["ProcessRun"],
            PROV_LABEL: label,
        }
        self.document.activity(run_id, None, None, attributes)

        plan = "wf:main/" + process_name
        self.document.wasAssociatedWith(run_id, self.engine_uuid, plan)

        # The process run is started by the enclosing workflow run at `when`
        self.document.wasStartedBy(run_id, None, self.workflow_run_uri, when,
                                   None, None)
        return run_id

    def record_process_end(
        self,
        process_name: str,
        process_run_id: str,
        outputs: Union[CWLObjectType, MutableSequence[CWLObjectType], None],
        when: datetime.datetime,
    ) -> None:
        """
        Record the outputs of a process run, then mark the run as ended.

        Output provenance is generated first; afterwards a ``wasEndedBy``
        statement links the run to the workflow run URI at time ``when``.
        """
        self.generate_output_prov(outputs, process_run_id, process_name)

        ended_by = self.workflow_run_uri
        self.document.wasEndedBy(process_run_id, None, ended_by, when)

    def declare_file(
            self, value: CWLObjectType) -> Tuple[ProvEntity, ProvEntity, str]:
        """
        Declare PROV entities for a CWL ``File`` object.

        The data entity is resolved from, in order: an existing SHA1
        checksum already known to the research object, the file ``location``
        (its content is added to the research object), or inline
        ``contents``. A second, specialized entity identified by the value's
        ``@id`` carries the file names. Secondary files are declared
        recursively and recorded as derivations of the specialized entity.

        ``value`` may be modified: ``checksum`` and ``@id`` are filled in
        when absent.

        :return: ``(file_entity, data_entity, checksum)``
        :raises ValueError: if ``value`` is not ``class: File``, if no data
            entity/checksum could be determined, or if a secondary file is
            neither a File nor a Directory
        """
        if value["class"] != "File":
            raise ValueError("Must have class:File: %s" % value)

        # Need to determine file hash aka RO filename
        data_entity = None  # type: Optional[ProvEntity]
        checksum = None

        # Attempt 1: a checksum the research object already holds data for
        if "checksum" in value:
            method, checksum = cast(str, value["checksum"]).split("$", 1)
            if method == SHA1 and self.research_object.has_data_file(checksum):
                data_entity = self.document.entity("data:" + checksum)

        # Attempt 2: read the file and add it to the research object
        if not data_entity and "location" in value:
            with self.fsaccess.open(str(value["location"]), "rb") as stream:
                stored_path = self.research_object.add_data_file(stream)
                # FIXME: This naively relies on add_data_file setting hash as filename
                checksum = PurePath(stored_path).name
                data_entity = self.document.entity(
                    "data:" + checksum, {PROV_TYPE: WFPROV["Artifact"]})
                if "checksum" not in value:
                    value["checksum"] = f"{SHA1}${checksum}"

        # Attempt 3: anonymous file, add content as string
        if not data_entity and "contents" in value:
            data_entity, checksum = self.declare_string(
                cast(str, value["contents"]))

        # By here one of them should have worked!
        if not (data_entity and checksum):
            raise ValueError(
                "class:File but missing checksum/location/content: %r" % value)

        # Track filename and extension, this is generally useful only for
        # secondaryFiles. Note that multiple uses of a file might thus record
        # different names for the same entity, so a specialized entity is
        # made/tracked by UUID
        file_id = value.setdefault("@id", uuid.uuid4().urn)
        file_entity = self.document.entity(
            file_id,
            [(PROV_TYPE, WFPROV["Artifact"]), (PROV_TYPE, WF4EVER["File"])],
        )  # type: ProvEntity

        # Copy over whichever name attributes the value provides
        for name_key in ("basename", "nameroot", "nameext"):
            if name_key in value:
                file_entity.add_attributes(
                    {CWLPROV[name_key]: value[name_key]})
        self.document.specializationOf(file_entity, data_entity)

        # Check for secondaries
        secondaries = cast(MutableSequence[CWLObjectType],
                           value.get("secondaryFiles", []))
        for secondary in secondaries:
            # TODO: Record these in a specializationOf entity with UUID?
            if secondary["class"] == "File":
                sec_entity, _, _ = self.declare_file(secondary)
            elif secondary["class"] == "Directory":
                sec_entity = self.declare_directory(secondary)
            else:
                raise ValueError(
                    f"Got unexpected secondaryFiles value: {secondary}")
            # We don't know how/when/where the secondary file was generated,
            # but CWL convention is a kind of summary/index derived
            # from the original file. As its generally in a different format
            # then prov:Quotation is not appropriate.
            self.document.derivation(
                sec_entity,
                file_entity,
                other_attributes={PROV["type"]: CWLPROV["SecondaryFile"]},
            )

        return file_entity, data_entity, checksum

    def declare_directory(self, value: CWLObjectType) -> ProvEntity:
        """
        Declare a CWL ``Directory`` and everything listed inside it.

        The directory becomes a collection/dictionary entity in
        ``self.document`` plus a folder/aggregation entity in a separate
        bundle. Every listing entry is declared through
        ``declare_artefact()`` and attached both as a collection member and
        through a key/entity pair. The bundle is additionally written out as
        a Turtle file under the metadata folder and registered as an
        annotation of the directory.

        ``value`` may be modified: an ``@id`` is minted when absent, and
        ``get_listing()`` is called on it when it has no ``listing`` key.

        :param value: the CWL Directory object
        :return: the collection entity declared in ``self.document``
        """
        # FIXME: Calculate a hash-like identifier for directory
        # so we get same value if it's the same filenames/hashes
        # in a different location.
        # For now, mint a new UUID to identify this directory, but
        # attempt to keep it inside the value dictionary
        dir_id = cast(str, value.setdefault("@id", uuid.uuid4().urn))

        # New annotation file to keep the ORE Folder listing
        ore_doc_fn = dir_id.replace("urn:uuid:", "directory-") + ".ttl"
        dir_bundle = self.document.bundle(self.metadata_ns[ore_doc_fn])

        # The directory as seen from the main provenance document
        coll = self.document.entity(
            dir_id,
            [
                (PROV_TYPE, WFPROV["Artifact"]),
                (PROV_TYPE, PROV["Collection"]),
                (PROV_TYPE, PROV["Dictionary"]),
                (PROV_TYPE, RO["Folder"]),
            ],
        )
        # ORE description of ro:Folder, saved separately
        coll_b = dir_bundle.entity(
            dir_id,
            [(PROV_TYPE, RO["Folder"]), (PROV_TYPE, ORE["Aggregation"])],
        )
        self.document.mentionOf(dir_id + "#ore", dir_id, dir_bundle.identifier)

        # dir_manifest = dir_bundle.entity(
        #     dir_bundle.identifier, {PROV["type"]: ORE["ResourceMap"],
        #                             ORE["describes"]: coll_b.identifier})

        # Attributes are collected while walking the listing and attached
        # to both entities in one go afterwards
        coll_attribs = [(ORE["isDescribedBy"], dir_bundle.identifier)]
        coll_b_attribs = []  # type: List[Tuple[Identifier, ProvEntity]]

        # FIXME: .listing might not be populated yet - hopefully
        # a later call to this method will sort that
        is_empty = True

        if "listing" not in value:
            get_listing(self.fsaccess, value)
        for entry in cast(MutableSequence[CWLObjectType],
                          value.get("listing", [])):
            is_empty = False
            # Declare child-artifacts
            entity = self.declare_artefact(entry)
            self.document.membership(coll, entity)
            # Membership relation aka our ORE Proxy
            # (the same new identifier is used in the document and the bundle)
            m_id = uuid.uuid4().urn
            m_entity = self.document.entity(m_id)
            m_b = dir_bundle.entity(m_id)

            # PROV-O style Dictionary
            # https://www.w3.org/TR/prov-dictionary/#dictionary-ontological-definition
            # ..as prov.py do not currently allow PROV-N extensions
            # like hadDictionaryMember(..)
            m_entity.add_asserted_type(PROV["KeyEntityPair"])

            m_entity.add_attributes({
                PROV["pairKey"]: entry["basename"],
                PROV["pairEntity"]: entity,
            })

            # As well as a being a
            # http://wf4ever.github.io/ro/2016-01-28/ro/#FolderEntry
            m_b.add_asserted_type(RO["FolderEntry"])
            m_b.add_asserted_type(ORE["Proxy"])
            m_b.add_attributes({
                RO["entryName"]: entry["basename"],
                ORE["proxyIn"]: coll,
                ORE["proxyFor"]: entity,
            })
            coll_attribs.append((PROV["hadDictionaryMember"], m_entity))
            coll_b_attribs.append((ORE["aggregates"], m_b))

        coll.add_attributes(coll_attribs)
        coll_b.add_attributes(coll_b_attribs)

        # Also Save ORE Folder as annotation metadata
        # A separate document holding only the bundle is flattened and
        # serialized as Turtle into the metadata folder
        ore_doc = ProvDocument()
        ore_doc.add_namespace(ORE)
        ore_doc.add_namespace(RO)
        ore_doc.add_namespace(UUID)
        ore_doc.add_bundle(dir_bundle)
        ore_doc = ore_doc.flattened()
        ore_doc_path = str(PurePosixPath(METADATA, ore_doc_fn))
        with self.research_object.write_bag_file(
                ore_doc_path) as provenance_file:
            ore_doc.serialize(provenance_file,
                              format="rdf",
                              rdf_format="turtle")
        self.research_object.add_annotation(dir_id, [ore_doc_fn],
                                            ORE["isDescribedBy"].uri)

        if is_empty:
            # Empty directory
            coll.add_asserted_type(PROV["EmptyCollection"])
            coll.add_asserted_type(PROV["EmptyDictionary"])
        self.research_object.add_uri(coll.identifier.uri)
        return coll

    def declare_string(self, value: str) -> Tuple[ProvEntity, str]:
        """
        Store a string as a data file and declare an entity for it.

        The text is encoded with ``ENCODING`` and added to the research
        object as plain text. The entity carries the text as its value.

        :return: the entity and the file name of the stored data file
        """
        text = str(value)
        stored = PurePosixPath(
            self.research_object.add_data_file(BytesIO(text.encode(ENCODING)),
                                               content_type=TEXT_PLAIN))
        checksum = stored.name
        # FIXME: Don't naively assume add_data_file uses hash in filename!
        data_id = "data:%s" % stored.stem
        attributes = {
            PROV_TYPE: WFPROV["Artifact"],
            PROV_VALUE: text,
        }
        entity = self.document.entity(data_id, attributes)  # type: ProvEntity
        return entity, checksum

    def declare_artefact(self, value: Optional[CWLOutputType]) -> ProvEntity:
        """
        Create data artefact entities for a value and all values nested in it.

        Dispatches on the type of ``value``:

        * ``None`` -> the shared ``cwlprov:None`` entity
        * ``bool``/``int``/``float`` -> a new entity holding the value
        * ``str`` -> stored through ``declare_string()``
        * ``bytes`` -> stored as a data file in the research object
        * mapping -> an already declared ``@id`` is reused; ``File`` and
          ``Directory`` objects are delegated to ``declare_file()`` /
          ``declare_directory()``; any other mapping becomes a dictionary
          collection whose items are declared recursively
        * any other iterable -> a collection whose members are declared
          recursively
        * anything else -> an entity labelled with ``repr(value)``

        Mappings may be modified: an ``@id`` key is set on them.

        :param value: the value to declare
        :return: the entity representing ``value``
        """
        if value is None:
            # FIXME: If this can happen in CWL, we'll
            # need a better way to represent this in PROV
            return self.document.entity(CWLPROV["None"], {PROV_LABEL: "None"})

        if isinstance(value, (bool, int, float)):
            # Typically used in job documents for flags

            # FIXME: Make consistent hash URIs for these
            # that somehow include the type
            # (so "1" != 1 != "1.0" != true)
            entity = self.document.entity(uuid.uuid4().urn,
                                          {PROV_VALUE: value})
            self.research_object.add_uri(entity.identifier.uri)
            return entity

        if isinstance(value, str):
            (entity, _) = self.declare_string(value)
            return entity

        if isinstance(value, bytes):
            byte_s = BytesIO(value)
            data_file = self.research_object.add_data_file(byte_s)
            # FIXME: Don't naively assume add_data_file uses hash in filename!
            data_id = "data:%s" % PurePosixPath(data_file).stem
            # NOTE(review): str() on bytes yields the "b'...'" literal form,
            # not decoded text - kept as-is, confirm this is intended
            return self.document.entity(
                data_id,
                {
                    PROV_TYPE: WFPROV["Artifact"],
                    PROV_VALUE: str(value)
                },
            )

        if isinstance(value, MutableMapping):
            if "@id" in value:
                # Already processed this value, but it might not be in this PROV
                entities = self.document.get_record(value["@id"])
                if entities:
                    return entities[0]
                # else, unknown in PROV, re-add below as if it's fresh

            # Base case - we found a File we need to update
            if value.get("class") == "File":
                (entity, _, _) = self.declare_file(value)
                value["@id"] = entity.identifier.uri
                return entity

            if value.get("class") == "Directory":
                entity = self.declare_directory(value)
                value["@id"] = entity.identifier.uri
                return entity
            coll_id = value.setdefault("@id", uuid.uuid4().urn)
            # some other kind of dictionary?
            # TODO: also Save as JSON
            coll = self.document.entity(
                coll_id,
                [
                    (PROV_TYPE, WFPROV["Artifact"]),
                    (PROV_TYPE, PROV["Collection"]),
                    (PROV_TYPE, PROV["Dictionary"]),
                ],
            )

            if value.get("class"):
                _logger.warning("Unknown data class %s.", value["class"])
                # FIXME: The class might be "http://example.com/somethingelse"
                coll.add_asserted_type(CWLPROV[value["class"]])

            # Let's iterate and recurse
            coll_attribs = []  # type: List[Tuple[Identifier, ProvEntity]]
            for (key, val) in value.items():
                v_ent = self.declare_artefact(val)
                self.document.membership(coll, v_ent)
                m_entity = self.document.entity(uuid.uuid4().urn)
                # Note: only support PROV-O style dictionary
                # https://www.w3.org/TR/prov-dictionary/#dictionary-ontological-definition
                # as prov.py do not easily allow PROV-N extensions
                m_entity.add_asserted_type(PROV["KeyEntityPair"])
                m_entity.add_attributes({
                    PROV["pairKey"]: str(key),
                    PROV["pairEntity"]: v_ent
                })
                coll_attribs.append((PROV["hadDictionaryMember"], m_entity))
            coll.add_attributes(coll_attribs)
            self.research_object.add_uri(coll.identifier.uri)
            return coll

        # some other kind of Collection?
        # TODO: also save as JSON
        # Only the iterability probe is guarded: a TypeError raised while
        # declaring a nested member is a real error and must not be
        # misreported as "unrecognized type" of the enclosing value.
        try:
            candidates = iter(value)
        except TypeError:
            _logger.warning("Unrecognized type %s of %r", type(value), value)
            # Let's just fall back to Python repr()
            entity = self.document.entity(uuid.uuid4().urn,
                                          {PROV_LABEL: repr(value)})
            self.research_object.add_uri(entity.identifier.uri)
            return entity

        # Recurse and register any nested objects
        members = [self.declare_artefact(member) for member in candidates]

        coll = self.document.entity(
            uuid.uuid4().urn,
            [
                (PROV_TYPE, WFPROV["Artifact"]),
                (PROV_TYPE, PROV["Collection"]),
            ],
        )
        if not members:
            coll.add_asserted_type(PROV["EmptyCollection"])
        else:
            for member in members:
                # FIXME: This won't preserve order, for that
                # we would need to use PROV.Dictionary
                # with numeric keys
                self.document.membership(coll, member)
        self.research_object.add_uri(coll.identifier.uri)
        # FIXME: list value does not support adding "@id"
        return coll

    def used_artefacts(
        self,
        job_order: Union[CWLObjectType, List[CWLObjectType]],
        process_run_id: str,
        name: Optional[str] = None,
    ) -> None:
        """
        Add used() for each data artefact.

        A list of job orders is handled entry by entry. For a single job
        order every input is declared through ``declare_artefact()`` and
        recorded as used by ``process_run_id`` in the role
        ``wf:main[/<name>]/<input key>``.

        An input whose artefact cannot be declared because of an ``OSError``
        is skipped on a best-effort basis; the skip is logged as a warning
        so the gap in the provenance is visible.

        :param job_order: one job order mapping, or a list of them
        :param process_run_id: identifier of the activity using the inputs
        :param name: optional step name inserted into the role
        """
        if isinstance(job_order, list):
            for entry in job_order:
                self.used_artefacts(entry, process_run_id, name)
            return

        # FIXME: Use workflow name in packed.cwl, "main" is wrong for nested workflows
        base = "main"
        if name is not None:
            base += "/" + name

        for key, value in job_order.items():
            prov_role = self.wf_ns[f"{base}/{key}"]
            try:
                entity = self.declare_artefact(value)
            except OSError as err:
                # Best effort: keep recording the remaining inputs
                _logger.warning(
                    "[provenance] Could not record input %s used by %s: %s",
                    key, process_run_id, err)
                continue
            self.document.used(
                process_run_id,
                entity,
                datetime.datetime.now(),
                None,
                {"prov:role": prov_role},
            )

    def generate_output_prov(
        self,
        final_output: Union[CWLObjectType, MutableSequence[CWLObjectType],
                            None],
        process_run_id: Optional[str],
        name: Optional[str],
    ) -> None:
        """Record ``wasGeneratedBy`` for every output value.

        A sequence of output objects is handled by recursing into each
        element; ``None`` is ignored.  For a single output object, each
        value is registered through ``declare_artefact`` and documented as
        generated by ``process_run_id`` in a role looked up in
        ``self.wf_ns`` under ``main[/<quoted name>]/<output>``.

        :param final_output: mapping of output ids to values, a sequence of
            such mappings, or ``None``.
        :param process_run_id: identifier of the generating activity; when
            empty or ``None``, ``self.workflow_run_uri`` is used instead.
        :param name: optional name that is URL-quoted and placed between
            ``main`` and the output id when building the role.
        """
        if isinstance(final_output, MutableSequence):
            for entry in final_output:
                self.generate_output_prov(entry, process_run_id, name)
            return
        if final_output is None:
            return

        # Timestamp should be created at the earliest
        timestamp = datetime.datetime.now()

        if not process_run_id:
            process_run_id = self.workflow_run_uri

        # Quote the name exactly once, before the loop.  Quoting it again
        # for every output would escape the "%" produced by the previous
        # pass ("%20" -> "%2520") and give later outputs a different role
        # prefix than the first one.
        # FIXME: Probably not "main" in nested workflows
        role_base = "main"
        if name is not None:
            role_base += "/" + urllib.parse.quote(str(name), safe=":/,#")

        # For each output, find/register the corresponding
        # entity (UUID) and document it as generated in
        # a role corresponding to the output
        for output, value in final_output.items():
            entity = self.declare_artefact(value)
            role = self.wf_ns[f"{role_base}/{output}"]
            self.document.wasGeneratedBy(entity, process_run_id, timestamp,
                                         None, {"prov:role": role})

    def prospective_prov(self, job: JobsType) -> None:
        """Declare the prospective provenance plan ``wf:main``.

        ``wf:main`` is declared as a ``prov:Plan`` that is a wfdesc
        ``Workflow`` when ``job`` is a ``WorkflowJob`` and a wfdesc
        ``Process`` otherwise.  For a workflow, every step is additionally
        declared as its own ``Process`` plan and attached to ``wf:main``
        through ``wfdesc:hasSubProcess``.

        :param job: the job whose prospective provenance is recorded.
        """
        is_workflow = isinstance(job, WorkflowJob)
        # Anything that is not a WorkflowJob is a direct command line tool
        # execution and is described as a single Process.
        main_type = WFDESC["Workflow"] if is_workflow else WFDESC["Process"]
        self.document.entity(
            "wf:main",
            {
                PROV_TYPE: main_type,
                "prov:type": PROV["Plan"],
                "prov:label": "Prospective provenance",
            },
        )
        if not is_workflow:
            return

        for step in job.steps:
            # The first five characters of the step name are dropped before
            # quoting (presumably a fixed prefix -- confirm against the
            # code that names steps).
            step_id = urllib.parse.quote("wf:main/" + str(step.name)[5:],
                                         safe=":/,#")
            step_plan = self.document.entity(
                step_id,
                {
                    PROV_TYPE: WFDESC["Process"],
                    "prov:type": PROV["Plan"]
                },
            )
            # Declaring "wf:main" again attaches the step to it.
            self.document.entity(
                "wf:main",
                {
                    "wfdesc:hasSubProcess": step_plan,
                    "prov:label": "Prospective provenance",
                },
            )
        # TODO: Declare roles/parameters as well

    def activity_has_provenance(self, activity, prov_ids):
        # type: (str, List[Identifier]) -> None
        """Link an activity to the nested PROV files that describe it.

        Adds http://www.w3.org/TR/prov-aq/ ``has_provenance`` attributes to
        the activity in the PROV document and registers the same relation
        as an annotation on the research object.

        :param activity: identifier of the activity being described.
        :param prov_ids: identifiers of the provenance files.
        """
        has_provenance = PROV["has_provenance"]

        # NOTE: The below will only work if the corresponding metadata/provenance arcp URI
        # is a pre-registered namespace in the PROV Document
        attribs = []
        for prov_id in prov_ids:
            attribs.append((has_provenance, prov_id))
        self.document.activity(activity, other_attributes=attribs)

        # Tip: we can't use https://www.w3.org/TR/prov-links/#term-mention
        # as prov:mentionOf() is only for entities, not activities
        uris = [prov_id.uri for prov_id in prov_ids]
        self.research_object.add_annotation(activity, uris,
                                            has_provenance.uri)

    def finalize_prov_profile(self, name):
        # type: (Optional[str]) -> List[Identifier]
        """Write the PROV document into the research object in every format.

        The document is serialized as PROV-XML, PROV-N, PROV-JSON and three
        RDF flavours (Turtle, N-Triples, JSON-LD), each into its own bag
        file under the ``PROVENANCE`` folder.

        :param name: ``None`` for the main workflow, which gets the fixed
            filename ``primary.cwlprov``; otherwise a name that is combined
            with the workflow run uuid to build the filename.
        :return: the identifiers of the written files, looked up in
            ``self.provenance_ns``, in the order they were written.
        """
        if name is None:
            # main workflow, fixed filenames
            filename = "primary.cwlprov"
        else:
            # ASCII-friendly filename, avoiding % as we don't want %2520 in manifest.json
            # Note that this could cause overlaps for similarly named
            # workflows, but that's OK as we'll also include run uuid
            # which also covers the case of this step being run in
            # multiple places or iterations
            safe_name = urllib.parse.quote(str(name), safe="").replace("%", "_")
            filename = f"{safe_name}.{self.workflow_run_uuid}.cwlprov"

        # NOTE: Relative posix path
        basename = str(PurePosixPath(PROVENANCE) / filename)

        # TODO: Also support other profiles than CWLProv, e.g. ProvOne

        # (file extension, keyword arguments for serialize), in write order
        serializations = (
            # https://www.w3.org/TR/prov-xml/
            (".xml", {"format": "xml", "indent": 4}),
            # https://www.w3.org/TR/prov-n/
            (".provn", {"format": "provn", "indent": 2}),
            # https://www.w3.org/Submission/prov-json/
            (".json", {"format": "json", "indent": 2}),
            # "rdf" aka https://www.w3.org/TR/prov-o/
            # which can be serialized to ttl/nt/jsonld (and more!)
            # https://www.w3.org/TR/turtle/
            (".ttl", {"format": "rdf", "rdf_format": "turtle"}),
            # https://www.w3.org/TR/n-triples/
            (".nt", {"format": "rdf", "rdf_format": "ntriples"}),
            # https://www.w3.org/TR/json-ld/
            # TODO: Use a nice JSON-LD context
            # see also https://eprints.soton.ac.uk/395985/
            # 404 Not Found on https://provenance.ecs.soton.ac.uk/prov.jsonld :(
            (".jsonld", {"format": "rdf", "rdf_format": "json-ld"}),
        )

        # list of prov identifiers of provenance files
        prov_ids = []
        for extension, options in serializations:
            bag_path = basename + extension
            with self.research_object.write_bag_file(
                    bag_path) as provenance_file:
                self.document.serialize(provenance_file, **options)
                prov_ids.append(self.provenance_ns[filename + extension])

        _logger.debug("[provenance] added provenance: %s", prov_ids)
        return prov_ids
Esempio n. 3
0
    def get_provenance_history(self, driver, uuid, depth=None):
        prov_doc = ProvDocument()
        #prov_doc.
        #NOTE!! There is a bug with the JSON serializer.  I can't add the prov prefix using this mechanism

        prov_doc.add_namespace('ex', 'http://example.org/')
        prov_doc.add_namespace('hubmap', 'https://hubmapconsortium.org/')

        #prov_doc.add_namespace('dct', 'http://purl.org/dc/terms/')
        #prov_doc.add_namespace('foaf','http://xmlns.com/foaf/0.1/')
        relation_list = []
        with driver.session() as session:
            try:
                # max_level_str is the string used to put a limit on the number of levels to traverse
                max_level_str = ''
                if depth is not None and len(str(depth)) > 0:
                    max_level_str = """maxLevel: {depth},""".format(
                        depth=depth)
                """
                Basically this Cypher query returns a collection of nodes and relationships.  The relationships include ACTIVITY_INPUT, ACTIVITY_OUTPUT and
                HAS_METADATA.  First, we build a dictionary of the nodes using uuid as a key.  Next, we loop through the relationships looking for HAS_METADATA 
                relationships.  The HAS_METADATA relationships connect the Entity nodes with their metadata.  The data from the Metadata node
                becomes the 'metadata' attribute for the Entity node.
                """
                """Possible replacement:
                THIS WORKS...NEEDS LOTS of COMMENTS!!
                MATCH (entity_metadata)<-[r1:HAS_METADATA]-(e)<-[r2:ACTIVITY_OUTPUT]-(a:Activity)-[r3:HAS_METADATA]->(activity_metadata) 
                                WHERE e.hubmap_identifier = 'TEST0010-LK-1-1'
                                WITH [e,a, entity_metadata, activity_metadata] AS entities, COLLECT(r1) + COLLECT(r2) + COLLECT(r3) AS relationships
                                WITH [node in entities | node {.*, label:labels(node)}] AS nodes, [rel in relationships | rel { .*, fromNode: { label:labels(startNode(rel))[0], uuid:startNode(rel).uuid } , toNode: { label:labels(endNode(rel))[0], uuid:endNode(rel).uuid }, rel_data: { type: type(rel) } } ] as rels
                                RETURN nodes, rels
                UNION OPTIONAL MATCH (activity_metadata)<-[r1:HAS_METADATA]-(a:Activity)<-[r2:ACTIVITY_INPUT|:ACTIVITY_OUTPUT*]-(parent)-[r3:HAS_METADATA]->(parent_metadata),
                (e)<-[r4:ACTIVITY_OUTPUT]-(a:Activity) 
                                WHERE e.hubmap_identifier = 'TEST0010-LK-1-1'
                                WITH [parent,parent_metadata, a, activity_metadata] AS nodes, [rel in COLLECT(r1) + COLLECT(r3) + COLLECT(r4)+COLLECT(apoc.convert.toRelationship(r2)) | rel { .*, fromNode: { label:labels(startNode(rel))[0], uuid:startNode(rel).uuid } , toNode: { label:labels(endNode(rel))[0], uuid:endNode(rel).uuid }, rel_data: { type: type(rel) } } ] as rels
                                RETURN DISTINCT nodes, rels                

                uuid for TEST0010-LK-1-1 for testing: eda3916db4695d834eb6c51a893d06f1
                """

                stmt = """MATCH (n:Entity {{ uuid: '{uuid}' }}) 
                CALL apoc.path.subgraphAll(n, {{ {max_level_str} relationshipFilter:'<ACTIVITY_INPUT|<ACTIVITY_OUTPUT|HAS_METADATA>' }}) YIELD nodes, relationships
                WITH [node in nodes | node {{ .*, label:labels(node)[0] }} ] as nodes, 
                     [rel in relationships | rel {{ .*, fromNode: {{ label:labels(startNode(rel))[0], uuid:startNode(rel).uuid }} , toNode: {{ label:labels(endNode(rel))[0], uuid:endNode(rel).uuid }}, rel_data: {{ type: type(rel) }} }} ] as rels
                WITH {{ nodes:nodes, relationships:rels }} as json
                RETURN json""".format(uuid=uuid, max_level_str=max_level_str)

                result = session.run(stmt)

                #there should only be one record
                for jsonData in result:
                    try:
                        record = dict(jsonData)['json']

                        if 'relationships' not in record:
                            raise LookupError(
                                'Error, unable to find relationships for uuid:'
                                + uuid)
                        if 'nodes' not in record:
                            raise LookupError(
                                'Error, unable to find nodes for uuid:' + uuid)

                        node_dict = {}
                        # pack the nodes into a dictionary using the uuid as a key
                        for node_record in record['nodes']:
                            node_dict[node_record['uuid']] = node_record

                        # TODO: clean up nodes
                        # remove nodes that lack metadata

                        # need to devise a methodology for this
                        # try preprocessing the record['relationships'] here:
                        # make a copy of the node_dict called unreferenced_node_dict
                        # loop through the relationships and find all the has_metadata relationships
                        # for each node pair in the has_metadata relationship, delete it from the unreferenced_node_dict
                        # once the loop is finished, continue as before
                        # add some logic when generating the wasGenerated and used relationships.  If either node is in the
                        # unreferenced_node_dict, then ignore the relationship

                        # now, connect the nodes
                        for rel_record in record['relationships']:
                            from_uuid = rel_record['fromNode']['uuid']
                            to_uuid = rel_record['toNode']['uuid']
                            from_node = node_dict[from_uuid]
                            to_node = node_dict[to_uuid]
                            if rel_record['rel_data'][
                                    'type'] == HubmapConst.HAS_METADATA_REL:
                                # assign the metadata node as the metadata attribute
                                # just extract the provenance information from the metadata node

                                entity_timestamp_json = Provenance.get_json_timestamp(
                                    int(to_node[
                                        HubmapConst.
                                        PROVENANCE_CREATE_TIMESTAMP_ATTRIBUTE])
                                )
                                provenance_data = {
                                    ProvConst.PROV_GENERATED_TIME_ATTRIBUTE:
                                    entity_timestamp_json
                                }
                                type_code = None
                                isEntity = True
                                if HubmapConst.ENTITY_TYPE_ATTRIBUTE in from_node:
                                    type_code = from_node[
                                        HubmapConst.ENTITY_TYPE_ATTRIBUTE]
                                elif HubmapConst.ACTIVITY_TYPE_ATTRIBUTE in from_node:
                                    type_code = from_node[
                                        HubmapConst.ACTIVITY_TYPE_ATTRIBUTE]
                                    isEntity = False
                                label_text = None
                                if HubmapConst.LAB_IDENTIFIER_ATTRIBUTE in from_node:
                                    label_text = from_node[
                                        HubmapConst.LAB_IDENTIFIER_ATTRIBUTE]
                                else:
                                    label_text = from_node[
                                        HubmapConst.UUID_ATTRIBUTE]

                                # build metadata attribute from the Metadata node
                                metadata_attribute = {}
                                for attribute_key in to_node:
                                    if attribute_key not in self.metadata_ignore_attributes:
                                        if attribute_key in self.known_attribute_map:
                                            # special case: timestamps
                                            if attribute_key == HubmapConst.PROVENANCE_MODIFIED_TIMESTAMP_ATTRIBUTE:
                                                provenance_data[
                                                    self.known_attribute_map[
                                                        attribute_key]] = Provenance.get_json_timestamp(
                                                            int(to_node[
                                                                attribute_key])
                                                        )
                                        else:  #add any extraneous data to the metadata attribute
                                            metadata_attribute[
                                                attribute_key] = to_node[
                                                    attribute_key]

                                # Need to add the agent and organization here, plus the appropriate relationships (between the entity and the agent plus orgainzation)
                                agent_record = self.get_agent_record(to_node)
                                agent_unique_id = str(agent_record[
                                    ProvConst.HUBMAP_PROV_USER_EMAIL]).replace(
                                        '@', '-')
                                agent_unique_id = str(agent_unique_id).replace(
                                    '.', '-')
                                if ProvConst.HUBMAP_PROV_USER_UUID in agent_record:
                                    agent_unique_id = agent_record[
                                        ProvConst.HUBMAP_PROV_USER_UUID]
                                agent_uri = Provenance.build_uri(
                                    'hubmap', 'agent', agent_unique_id)
                                organization_record = self.get_organization_record(
                                    to_node)
                                organization_uri = Provenance.build_uri(
                                    'hubmap', 'organization',
                                    organization_record[
                                        ProvConst.HUBMAP_PROV_GROUP_UUID])
                                doc_agent = None
                                doc_org = None

                                get_agent = prov_doc.get_record(agent_uri)
                                # only add this once
                                if len(get_agent) == 0:
                                    doc_agent = prov_doc.agent(
                                        agent_uri, agent_record)
                                else:
                                    doc_agent = get_agent[0]

                                get_org = prov_doc.get_record(organization_uri)
                                # only add this once
                                if len(get_org) == 0:
                                    doc_org = prov_doc.agent(
                                        organization_uri, organization_record)
                                else:
                                    doc_org = get_org[0]

                                other_attributes = {
                                    ProvConst.PROV_LABEL_ATTRIBUTE:
                                    label_text,
                                    ProvConst.PROV_TYPE_ATTRIBUTE:
                                    type_code,
                                    ProvConst.HUBMAP_DOI_ATTRIBUTE:
                                    from_node[HubmapConst.DOI_ATTRIBUTE],
                                    ProvConst.HUBMAP_DISPLAY_DOI_ATTRIBUTE:
                                    from_node[
                                        HubmapConst.DISPLAY_DOI_ATTRIBUTE],
                                    ProvConst.HUBMAP_DISPLAY_IDENTIFIER_ATTRIBUTE:
                                    label_text,
                                    ProvConst.HUBMAP_UUID_ATTRIBUTE:
                                    from_node[HubmapConst.UUID_ATTRIBUTE]
                                }
                                # only add metadata if it contains data
                                if len(metadata_attribute) > 0:
                                    other_attributes[
                                        ProvConst.
                                        HUBMAP_METADATA_ATTRIBUTE] = json.dumps(
                                            metadata_attribute)
                                # add the provenance data to the other_attributes
                                other_attributes.update(provenance_data)
                                if isEntity == True:
                                    prov_doc.entity(
                                        Provenance.build_uri(
                                            'hubmap', 'entities',
                                            from_node['uuid']),
                                        other_attributes)
                                else:
                                    activity_timestamp_json = Provenance.get_json_timestamp(
                                        int(to_node[
                                            HubmapConst.
                                            PROVENANCE_CREATE_TIMESTAMP_ATTRIBUTE]
                                            ))
                                    doc_activity = prov_doc.activity(
                                        Provenance.build_uri(
                                            'hubmap', 'activities',
                                            from_node['uuid']),
                                        activity_timestamp_json,
                                        activity_timestamp_json,
                                        other_attributes)
                                    prov_doc.actedOnBehalfOf(
                                        doc_agent, doc_org, doc_activity)
                            elif rel_record['rel_data']['type'] in [
                                    HubmapConst.ACTIVITY_OUTPUT_REL,
                                    HubmapConst.ACTIVITY_INPUT_REL
                            ]:
                                to_node_uri = None
                                from_node_uri = None
                                if HubmapConst.ENTITY_TYPE_ATTRIBUTE in to_node:
                                    to_node_uri = Provenance.build_uri(
                                        'hubmap', 'entities', to_node['uuid'])
                                else:
                                    to_node_uri = Provenance.build_uri(
                                        'hubmap', 'activities',
                                        to_node['uuid'])
                                if HubmapConst.ENTITY_TYPE_ATTRIBUTE in from_node:
                                    from_node_uri = Provenance.build_uri(
                                        'hubmap', 'entities',
                                        from_node['uuid'])
                                else:
                                    from_node_uri = Provenance.build_uri(
                                        'hubmap', 'activities',
                                        from_node['uuid'])

                                if rel_record['rel_data'][
                                        'type'] == 'ACTIVITY_OUTPUT':
                                    #prov_doc.wasGeneratedBy(entity, activity, time, identifier, other_attributes)
                                    prov_doc.wasGeneratedBy(
                                        to_node_uri, from_node_uri)

                                if rel_record['rel_data'][
                                        'type'] == 'ACTIVITY_INPUT':
                                    #prov_doc.used(activity, entity, time, identifier, other_attributes)
                                    prov_doc.used(to_node_uri, from_node_uri)

                                # for now, simply create a "relation" where the fromNode's uuid is connected to a toNode's uuid via a relationship:
                                # ex: {'fromNodeUUID': '42e10053358328c9079f1c8181287b6d', 'relationship': 'ACTIVITY_OUTPUT', 'toNodeUUID': '398400024fda58e293cdb435db3c777e'}
                                rel_data_record = {
                                    'fromNodeUUID': from_node['uuid'],
                                    'relationship':
                                    rel_record['rel_data']['type'],
                                    'toNodeUUID': to_node['uuid']
                                }
                                relation_list.append(rel_data_record)
                        return_data = {
                            'nodes': node_dict,
                            'relations': relation_list
                        }
                    except Exception as e:
                        raise e

                # there is a bug in the JSON serializer.  So manually insert the prov prefix

                output_doc = prov_doc.serialize(indent=2)
                output_doc = output_doc.replace(
                    '"prefix": {',
                    '"prefix": {\n    "prov" : "http://www.w3.org/ns/prov#", ')

                #output_doc = prov_doc.serialize(format='rdf', rdf_format='trig')
                #output_doc = prov_doc.serialize(format='provn')
                return output_doc

            except ConnectionError as ce:
                print('A connection error occurred: ', str(ce.args[0]))
                raise ce
            except ValueError as ve:
                print('A value error occurred: ', ve.value)
                raise ve
            except Exception as e:
                print('An exception occurred in get_provenance_history: ' +
                      str(e))
                traceback.print_exc()