def get_provenance_history(uuid, normalized_provenance_dict):
    prov_doc = ProvDocument()
    # The 'prov' prefix is a built-in namespace, no need to redefine it here
    prov_doc.add_namespace(HUBMAP_NAMESPACE, 'https://hubmapconsortium.org/')

    # A bit of validation
    if 'relationships' not in normalized_provenance_dict:
        raise LookupError(
            f'Missing "relationships" key from the normalized_provenance_dict for Entity of uuid: {uuid}'
        )
    if 'nodes' not in normalized_provenance_dict:
        raise LookupError(
            f'Missing "nodes" key from the normalized_provenance_dict for Entity of uuid: {uuid}'
        )

    # Pack the nodes into a dictionary using the uuid as key
    nodes_dict = {}
    for node in normalized_provenance_dict['nodes']:
        nodes_dict[node['uuid']] = node

    # Loop through the relationships and build the provenance document
    for rel_dict in normalized_provenance_dict['relationships']:
        # (Activity) - [ACTIVITY_OUTPUT] -> (Entity)
        if rel_dict['rel_data']['type'] == 'ACTIVITY_OUTPUT':
            activity_uuid = rel_dict['fromNode']['uuid']
            entity_uuid = rel_dict['toNode']['uuid']
        # (Entity) - [ACTIVITY_INPUT] -> (Activity)
        elif rel_dict['rel_data']['type'] == 'ACTIVITY_INPUT':
            entity_uuid = rel_dict['fromNode']['uuid']
            activity_uuid = rel_dict['toNode']['uuid']

        activity_node = nodes_dict[activity_uuid]
        entity_node = nodes_dict[entity_uuid]

        activity_uri = None
        entity_uri = None

        # Skip Lab nodes for agent and organization
        if entity_node['entity_type'] != 'Lab':
            # Get the agent information from the entity node
            agent_record = get_agent_record(entity_node)

            # Use 'created_by_user_sub' as agent ID if present
            # Otherwise, fall back to the email with @ and . replaced
            created_by_user_sub_prov_key = f'{HUBMAP_NAMESPACE}:userUUID'
            created_by_user_email_prov_key = f'{HUBMAP_NAMESPACE}:userEmail'
            if created_by_user_sub_prov_key in agent_record:
                agent_id = agent_record[created_by_user_sub_prov_key]
            elif created_by_user_email_prov_key in agent_record:
                agent_id = str(agent_record[created_by_user_email_prov_key]).replace('@', '-')
                agent_id = str(agent_id).replace('.', '-')
            else:
                msg = f"Both 'created_by_user_sub' and 'created_by_user_email' are missing from entity of uuid: {entity_node['uuid']}"
                logger.error(msg)
                raise LookupError(msg)

            # Build the agent uri
            agent_uri = build_uri(HUBMAP_NAMESPACE, 'agent', agent_id)

            # Only add the same agent once
            # Multiple entities can be associated to the same agent
            agent = prov_doc.get_record(agent_uri)
            if len(agent) == 0:
                doc_agent = prov_doc.agent(agent_uri, agent_record)
            else:
                doc_agent = agent[0]

            # Organization
            # Get the organization information from the entity node
            org_record = get_organization_record(entity_node)

            # Build the organization uri
            group_uuid_prov_key = f'{HUBMAP_NAMESPACE}:groupUUID'
            org_uri = build_uri(HUBMAP_NAMESPACE, 'organization', org_record[group_uuid_prov_key])

            # Only add the same organization once
            # Multiple entities can be associated to different agents who are from the same organization
            org = prov_doc.get_record(org_uri)
            if len(org) == 0:
                doc_org = prov_doc.agent(org_uri, org_record)
            else:
                doc_org = org[0]

            # Build the activity uri
            activity_uri = build_uri(HUBMAP_NAMESPACE, 'activities', activity_node['uuid'])

            # Register the activity if not already registered
            activity = prov_doc.get_record(activity_uri)
            if len(activity) == 0:
                # Shared attributes to be added to the PROV document
                activity_attributes = {'prov:type': 'Activity'}

                # Convert the timestamp integer to a datetime string
                # Note: in our case, prov:startTime is the same as prov:endTime
                activity_time = timestamp_to_datetime(activity_node['created_timestamp'])
                # Add prefix to all other attributes
                for key in activity_node:
                    prov_key = f'{HUBMAP_NAMESPACE}:{key}'
                    # Use the datetime string instead of the timestamp integer
                    if key == 'created_timestamp':
                        activity_attributes[prov_key] = activity_time
                    else:
                        activity_attributes[prov_key] = activity_node[key]

                # Register the activity
                doc_activity = prov_doc.activity(activity_uri, activity_time, activity_time, activity_attributes)

                # Relationship: the agent actedOnBehalfOf the org
                prov_doc.actedOnBehalfOf(doc_agent, doc_org, doc_activity)
            else:
                doc_activity = activity[0]

            # Build the entity uri
            entity_uri = build_uri(HUBMAP_NAMESPACE, 'entities', entity_node['uuid'])

            # Register the entity if not already registered
            if len(prov_doc.get_record(entity_uri)) == 0:
                # Shared attributes to be added to the PROV document
                entity_attributes = {'prov:type': 'Entity'}

                # Add prefix to all other attributes
                for key in entity_node:
                    # Entity property values can be list or dict, skip those
                    # (list and dict are unhashable types when calling `prov_doc.entity()`)
                    if not isinstance(entity_node[key], (list, dict)):
                        prov_key = f'{HUBMAP_NAMESPACE}:{key}'
                        # Use a datetime string instead of the timestamp integer
                        # (convert the entity's own timestamp; the activity's time
                        # is not defined on this code path)
                        if key in ['created_timestamp', 'last_modified_timestamp', 'published_timestamp']:
                            entity_attributes[prov_key] = timestamp_to_datetime(entity_node[key])
                        else:
                            entity_attributes[prov_key] = entity_node[key]

                # Register the entity
                prov_doc.entity(entity_uri, entity_attributes)

        # Build the activity uri and entity uri if not already built
        # For the Lab nodes
        if activity_uri is None:
            activity_uri = build_uri(HUBMAP_NAMESPACE, 'activities', activity_node['uuid'])
        if entity_uri is None:
            entity_uri = build_uri(HUBMAP_NAMESPACE, 'entities', entity_node['uuid'])

        # The following relationships apply to all nodes, including Lab entity nodes
        # (Activity) - [ACTIVITY_OUTPUT] -> (Entity)
        if rel_dict['rel_data']['type'] == 'ACTIVITY_OUTPUT':
            # Relationship: the entity wasGeneratedBy the activity
            prov_doc.wasGeneratedBy(entity_uri, activity_uri)
        # (Entity) - [ACTIVITY_INPUT] -> (Activity)
        elif rel_dict['rel_data']['type'] == 'ACTIVITY_INPUT':
            # Relationship: the activity used the entity
            prov_doc.used(activity_uri, entity_uri)

    # Format into a JSON string based on the PROV-JSON serialization
    # https://www.w3.org/Submission/prov-json/
    serialized_json = prov_doc.serialize()

    return serialized_json
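
# --- A minimal, self-contained sketch (not part of the original module) of the
# `prov` library calls that get_provenance_history() above builds on: register
# agents, an activity and entities, link them with actedOnBehalfOf / used /
# wasGeneratedBy, and serialize to PROV-JSON. The 'hubmap' prefix matches the
# code above; the identifiers and attribute names are illustrative assumptions.
from datetime import datetime

from prov.model import ProvDocument


def _prov_document_sketch():
    doc = ProvDocument()
    doc.add_namespace('hubmap', 'https://hubmapconsortium.org/')

    # Agent and organization (both are PROV agents)
    agent = doc.agent('hubmap:agent/1234', {'hubmap:userDisplayName': 'Example User'})
    org = doc.agent('hubmap:organization/5678', {'hubmap:groupName': 'Example Group'})

    # prov:startTime equals prov:endTime, mirroring the function above
    when = datetime(2020, 1, 1, 12, 0, 0)
    activity = doc.activity('hubmap:activities/abcd', when, when, {'prov:type': 'Activity'})

    source = doc.entity('hubmap:entities/efgh', {'prov:type': 'Entity'})
    derived = doc.entity('hubmap:entities/ijkl', {'prov:type': 'Entity'})

    doc.actedOnBehalfOf(agent, org, activity)  # the agent acted on behalf of the org
    doc.used(activity, source)                 # (Entity) -[ACTIVITY_INPUT]-> (Activity)
    doc.wasGeneratedBy(derived, activity)      # (Activity) -[ACTIVITY_OUTPUT]-> (Entity)

    # serialize() defaults to PROV-JSON (https://www.w3.org/Submission/prov-json/)
    return doc.serialize(indent=2)
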
class ProvenanceProfile:
    """
    Provenance profile.

    Populated as the workflow runs.
    """

    def __init__(
        self,
        research_object: "ResearchObject",
        full_name: str,
        host_provenance: bool,
        user_provenance: bool,
        orcid: str,
        fsaccess: StdFsAccess,
        run_uuid: Optional[uuid.UUID] = None,
    ) -> None:
        """Initialize the provenance profile."""
        self.fsaccess = fsaccess
        self.orcid = orcid
        self.research_object = research_object
        self.folder = self.research_object.folder
        self.document = ProvDocument()
        self.host_provenance = host_provenance
        self.user_provenance = user_provenance
        self.engine_uuid = research_object.engine_uuid  # type: str
        self.add_to_manifest = self.research_object.add_to_manifest
        if self.orcid:
            _logger.debug("[provenance] Creator ORCID: %s", self.orcid)
        self.full_name = full_name
        if self.full_name:
            _logger.debug("[provenance] Creator Full name: %s", self.full_name)
        self.workflow_run_uuid = run_uuid or uuid.uuid4()
        self.workflow_run_uri = self.workflow_run_uuid.urn  # type: str
        self.generate_prov_doc()

    def __str__(self) -> str:
        """Represent this ProvenanceProfile as a string."""
        return "ProvenanceProfile <{}> in <{}>".format(
            self.workflow_run_uri,
            self.research_object,
        )

    def generate_prov_doc(self) -> Tuple[str, ProvDocument]:
        """Add basic namespaces."""

        def host_provenance(document: ProvDocument) -> None:
            """Record host provenance."""
            document.add_namespace(CWLPROV)
            document.add_namespace(UUID)
            document.add_namespace(FOAF)

            hostname = getfqdn()
            # won't have a foaf:accountServiceHomepage for unix hosts, but
            # we can at least provide the hostname
            document.agent(
                ACCOUNT_UUID,
                {
                    PROV_TYPE: FOAF["OnlineAccount"],
                    "prov:location": hostname,
                    CWLPROV["hostname"]: hostname,
                },
            )

        self.cwltool_version = "cwltool %s" % versionstring().split()[-1]
        self.document.add_namespace("wfprov", "http://purl.org/wf4ever/wfprov#")
        # document.add_namespace('prov', 'http://www.w3.org/ns/prov#')
        self.document.add_namespace("wfdesc", "http://purl.org/wf4ever/wfdesc#")
        # TODO: Make this ontology. For now only has cwlprov:image
        self.document.add_namespace("cwlprov", "https://w3id.org/cwl/prov#")
        self.document.add_namespace("foaf", "http://xmlns.com/foaf/0.1/")
        self.document.add_namespace("schema", "http://schema.org/")
        self.document.add_namespace("orcid", "https://orcid.org/")
        self.document.add_namespace("id", "urn:uuid:")
        # NOTE: Internet draft expired 2004-03-04 (!)
        # https://tools.ietf.org/html/draft-thiemann-hash-urn-01
        # TODO: Change to nih:sha-256; hashes
        # https://tools.ietf.org/html/rfc6920#section-7
        self.document.add_namespace("data", "urn:hash::sha1:")
        # Also needed for docker images
        self.document.add_namespace(SHA256, "nih:sha-256;")

        # info only, won't really be used by prov as sub-resources use /
        self.document.add_namespace("researchobject", self.research_object.base_uri)
        # annotations
        self.metadata_ns = self.document.add_namespace(
            "metadata", self.research_object.base_uri + METADATA + "/")
        # Pre-register provenance directory so we can refer to its files
        self.provenance_ns = self.document.add_namespace(
            "provenance", self.research_object.base_uri + posix_path(PROVENANCE) + "/")
        ro_identifier_workflow = self.research_object.base_uri + "workflow/packed.cwl#"
        self.wf_ns = self.document.add_namespace("wf", ro_identifier_workflow)
        ro_identifier_input = (self.research_object.base_uri + "workflow/primary-job.json#")
        self.document.add_namespace("input", ro_identifier_input)

        # More info about the account (e.g. username, fullname)
        # may or may not have been previously logged by user_provenance()
        # .. but we always know cwltool was launched (directly or indirectly)
        # by a user account, as cwltool is a command line tool
        account = self.document.agent(ACCOUNT_UUID)
        if self.orcid or self.full_name:
            person = {PROV_TYPE: PROV["Person"], "prov:type": SCHEMA["Person"]}
            if self.full_name:
                person["prov:label"] = self.full_name
                person["foaf:name"] = self.full_name
                person["schema:name"] = self.full_name
            else:
                # TODO: Look up name from ORCID API?
                pass
            agent = self.document.agent(self.orcid or uuid.uuid4().urn, person)
            self.document.actedOnBehalfOf(account, agent)
        else:
            if self.host_provenance:
                host_provenance(self.document)
            if self.user_provenance:
                self.research_object.user_provenance(self.document)

        # The execution of cwltool
        wfengine = self.document.agent(
            self.engine_uuid,
            {
                PROV_TYPE: PROV["SoftwareAgent"],
                "prov:type": WFPROV["WorkflowEngine"],
                "prov:label": self.cwltool_version,
            },
        )
        # FIXME: This datetime will be a bit too delayed, we should
        # capture the time when cwltool.py first started instead
        self.document.wasStartedBy(wfengine, None, account, datetime.datetime.now())
        # define workflow run level activity
        self.document.activity(
            self.workflow_run_uri,
            datetime.datetime.now(),
            None,
            {
                PROV_TYPE: WFPROV["WorkflowRun"],
                "prov:label": "Run of workflow/packed.cwl#main",
            },
        )
        # association between SoftwareAgent and WorkflowRun
        main_workflow = "wf:main"
        self.document.wasAssociatedWith(self.workflow_run_uri, self.engine_uuid, main_workflow)
        self.document.wasStartedBy(self.workflow_run_uri, None, self.engine_uuid,
                                   datetime.datetime.now())
        return (self.workflow_run_uri, self.document)

    def evaluate(
        self,
        process: Process,
        job: JobsType,
        job_order_object: CWLObjectType,
        research_obj: "ResearchObject",
    ) -> None:
        """Evaluate the nature of the job."""
        if not hasattr(process, "steps"):
            # record provenance of independent commandline tool executions
            self.prospective_prov(job)
            customised_job = copy_job_order(job, job_order_object)
            self.used_artefacts(customised_job, self.workflow_run_uri)
            research_obj.create_job(customised_job)
        elif hasattr(job, "workflow"):
            # record provenance of workflow executions
            self.prospective_prov(job)
            customised_job = copy_job_order(job, job_order_object)
            self.used_artefacts(customised_job, self.workflow_run_uri)

    def record_process_start(
        self, process: Process, job: JobsType, process_run_id: Optional[str] = None
    ) -> Optional[str]:
        if not hasattr(process, "steps"):
            process_run_id = self.workflow_run_uri
        elif not hasattr(job, "workflow"):
            # commandline tool execution as part of a workflow
            name = ""
            if isinstance(job, (CommandLineJob, JobBase, WorkflowJob)):
                name = job.name
            process_name = urllib.parse.quote(name, safe=":/,#")
            process_run_id = self.start_process(process_name, datetime.datetime.now())
        return process_run_id

    def start_process(
        self,
        process_name: str,
        when: datetime.datetime,
        process_run_id: Optional[str] = None,
    ) -> str:
        """Record the start of each Process."""
        if process_run_id is None:
            process_run_id = uuid.uuid4().urn
        prov_label = "Run of workflow/packed.cwl#main/" + process_name
        self.document.activity(
            process_run_id,
            None,
            None,
            {PROV_TYPE: WFPROV["ProcessRun"], PROV_LABEL: prov_label},
        )
        self.document.wasAssociatedWith(process_run_id, self.engine_uuid,
                                        str("wf:main/" + process_name))
        self.document.wasStartedBy(process_run_id, None, self.workflow_run_uri,
                                   when, None, None)
        return process_run_id

    def record_process_end(
        self,
        process_name: str,
        process_run_id: str,
        outputs: Union[CWLObjectType, MutableSequence[CWLObjectType], None],
        when: datetime.datetime,
    ) -> None:
        self.generate_output_prov(outputs, process_run_id, process_name)
        self.document.wasEndedBy(process_run_id, None, self.workflow_run_uri, when)

    def declare_file(self, value: CWLObjectType) -> Tuple[ProvEntity, ProvEntity, str]:
        if value["class"] != "File":
            raise ValueError("Must have class:File: %s" % value)
        # Need to determine the file hash aka RO filename
        entity = None  # type: Optional[ProvEntity]
        checksum = None
        if "checksum" in value:
            csum = cast(str, value["checksum"])
            (method, checksum) = csum.split("$", 1)
            if method == SHA1 and self.research_object.has_data_file(checksum):
                entity = self.document.entity("data:" + checksum)

        if not entity and "location" in value:
            location = str(value["location"])
            # If we made it here, we'll have to add it to the RO
            with self.fsaccess.open(location, "rb") as fhandle:
                relative_path = self.research_object.add_data_file(fhandle)
                # FIXME: This naively relies on add_data_file setting hash as filename
                checksum = PurePath(relative_path).name
                entity = self.document.entity("data:" + checksum,
                                              {PROV_TYPE: WFPROV["Artifact"]})
                if "checksum" not in value:
                    value["checksum"] = f"{SHA1}${checksum}"

        if not entity and "contents" in value:
            # Anonymous file, add content as string
            entity, checksum = self.declare_string(cast(str, value["contents"]))

        # By here one of them should have worked!
        if not entity or not checksum:
            raise ValueError("class:File but missing checksum/location/content: %r" % value)

        # Track filename and extension, this is generally useful only for
        # secondaryFiles. Note that multiple uses of a file might thus record
        # different names for the same entity, so we'll
        # make/track a specialized entity by UUID
        file_id = value.setdefault("@id", uuid.uuid4().urn)
        # A specialized entity that has just these names
        file_entity = self.document.entity(
            file_id,
            [(PROV_TYPE, WFPROV["Artifact"]), (PROV_TYPE, WF4EVER["File"])],
        )  # type: ProvEntity

        if "basename" in value:
            file_entity.add_attributes({CWLPROV["basename"]: value["basename"]})
        if "nameroot" in value:
            file_entity.add_attributes({CWLPROV["nameroot"]: value["nameroot"]})
        if "nameext" in value:
            file_entity.add_attributes({CWLPROV["nameext"]: value["nameext"]})
        self.document.specializationOf(file_entity, entity)

        # Check for secondaries
        for sec in cast(MutableSequence[CWLObjectType], value.get("secondaryFiles", [])):
            # TODO: Record these in a specializationOf entity with UUID?
            if sec["class"] == "File":
                (sec_entity, _, _) = self.declare_file(sec)
            elif sec["class"] == "Directory":
                sec_entity = self.declare_directory(sec)
            else:
                raise ValueError(f"Got unexpected secondaryFiles value: {sec}")
            # We don't know how/when/where the secondary file was generated,
            # but CWL convention is a kind of summary/index derived
            # from the original file. As it's generally in a different format,
            # prov:Quotation is not appropriate.
            self.document.derivation(
                sec_entity,
                file_entity,
                other_attributes={PROV["type"]: CWLPROV["SecondaryFile"]},
            )

        return file_entity, entity, checksum

    def declare_directory(self, value: CWLObjectType) -> ProvEntity:
        """Register any nested files/directories."""
        # FIXME: Calculate a hash-like identifier for the directory
        # so we get the same value if it's the same filenames/hashes
        # in a different location.
        # For now, mint a new UUID to identify this directory, but
        # attempt to keep it inside the value dictionary
        dir_id = cast(str, value.setdefault("@id", uuid.uuid4().urn))

        # New annotation file to keep the ORE Folder listing
        ore_doc_fn = dir_id.replace("urn:uuid:", "directory-") + ".ttl"
        dir_bundle = self.document.bundle(self.metadata_ns[ore_doc_fn])

        coll = self.document.entity(
            dir_id,
            [
                (PROV_TYPE, WFPROV["Artifact"]),
                (PROV_TYPE, PROV["Collection"]),
                (PROV_TYPE, PROV["Dictionary"]),
                (PROV_TYPE, RO["Folder"]),
            ],
        )
        # ORE description of ro:Folder, saved separately
        coll_b = dir_bundle.entity(
            dir_id,
            [(PROV_TYPE, RO["Folder"]), (PROV_TYPE, ORE["Aggregation"])],
        )
        self.document.mentionOf(dir_id + "#ore", dir_id, dir_bundle.identifier)

        # dir_manifest = dir_bundle.entity(
        #     dir_bundle.identifier, {PROV["type"]: ORE["ResourceMap"],
        #                             ORE["describes"]: coll_b.identifier})

        coll_attribs = [(ORE["isDescribedBy"], dir_bundle.identifier)]
        coll_b_attribs = []  # type: List[Tuple[Identifier, ProvEntity]]

        # FIXME: .listing might not be populated yet - hopefully
        # a later call to this method will sort that out
        is_empty = True
        if "listing" not in value:
            get_listing(self.fsaccess, value)
        for entry in cast(MutableSequence[CWLObjectType], value.get("listing", [])):
            is_empty = False
            # Declare child-artifacts
            entity = self.declare_artefact(entry)
            self.document.membership(coll, entity)
            # Membership relation aka our ORE Proxy
            m_id = uuid.uuid4().urn
            m_entity = self.document.entity(m_id)
            m_b = dir_bundle.entity(m_id)

            # PROV-O style Dictionary
            # https://www.w3.org/TR/prov-dictionary/#dictionary-ontological-definition
            # ..as prov.py does not currently allow PROV-N extensions
            # like hadDictionaryMember(..)
            m_entity.add_asserted_type(PROV["KeyEntityPair"])
            m_entity.add_attributes({
                PROV["pairKey"]: entry["basename"],
                PROV["pairEntity"]: entity,
            })

            # As well as being a
            # http://wf4ever.github.io/ro/2016-01-28/ro/#FolderEntry
            m_b.add_asserted_type(RO["FolderEntry"])
            m_b.add_asserted_type(ORE["Proxy"])
            m_b.add_attributes({
                RO["entryName"]: entry["basename"],
                ORE["proxyIn"]: coll,
                ORE["proxyFor"]: entity,
            })
            coll_attribs.append((PROV["hadDictionaryMember"], m_entity))
            coll_b_attribs.append((ORE["aggregates"], m_b))

        coll.add_attributes(coll_attribs)
        coll_b.add_attributes(coll_b_attribs)

        # Also save the ORE Folder as annotation metadata
        ore_doc = ProvDocument()
        ore_doc.add_namespace(ORE)
        ore_doc.add_namespace(RO)
        ore_doc.add_namespace(UUID)
        ore_doc.add_bundle(dir_bundle)
        ore_doc = ore_doc.flattened()
        ore_doc_path = str(PurePosixPath(METADATA, ore_doc_fn))
        with self.research_object.write_bag_file(ore_doc_path) as provenance_file:
            ore_doc.serialize(provenance_file, format="rdf", rdf_format="turtle")
        self.research_object.add_annotation(dir_id, [ore_doc_fn], ORE["isDescribedBy"].uri)

        if is_empty:
            # Empty directory
            coll.add_asserted_type(PROV["EmptyCollection"])
            coll.add_asserted_type(PROV["EmptyDictionary"])
        self.research_object.add_uri(coll.identifier.uri)
        return coll

    def declare_string(self, value: str) -> Tuple[ProvEntity, str]:
        """Save as string in UTF-8."""
        byte_s = BytesIO(str(value).encode(ENCODING))
        data_file = self.research_object.add_data_file(byte_s, content_type=TEXT_PLAIN)
        checksum = PurePosixPath(data_file).name
        # FIXME: Don't naively assume add_data_file uses hash in filename!
data_id = "data:%s" % PurePosixPath(data_file).stem entity = self.document.entity(data_id, { PROV_TYPE: WFPROV["Artifact"], PROV_VALUE: str(value) }) # type: ProvEntity return entity, checksum def declare_artefact(self, value: Optional[CWLOutputType]) -> ProvEntity: """Create data artefact entities for all file objects.""" if value is None: # FIXME: If this can happen in CWL, we'll # need a better way to represent this in PROV return self.document.entity(CWLPROV["None"], {PROV_LABEL: "None"}) if isinstance(value, (bool, int, float)): # Typically used in job documents for flags # FIXME: Make consistent hash URIs for these # that somehow include the type # (so "1" != 1 != "1.0" != true) entity = self.document.entity(uuid.uuid4().urn, {PROV_VALUE: value}) self.research_object.add_uri(entity.identifier.uri) return entity if isinstance(value, (str, str)): (entity, _) = self.declare_string(value) return entity if isinstance(value, bytes): # If we got here then we must be in Python 3 byte_s = BytesIO(value) data_file = self.research_object.add_data_file(byte_s) # FIXME: Don't naively assume add_data_file uses hash in filename! data_id = "data:%s" % PurePosixPath(data_file).stem return self.document.entity( data_id, { PROV_TYPE: WFPROV["Artifact"], PROV_VALUE: str(value) }, ) if isinstance(value, MutableMapping): if "@id" in value: # Already processed this value, but it might not be in this PROV entities = self.document.get_record(value["@id"]) if entities: return entities[0] # else, unknown in PROV, re-add below as if it's fresh # Base case - we found a File we need to update if value.get("class") == "File": (entity, _, _) = self.declare_file(value) value["@id"] = entity.identifier.uri return entity if value.get("class") == "Directory": entity = self.declare_directory(value) value["@id"] = entity.identifier.uri return entity coll_id = value.setdefault("@id", uuid.uuid4().urn) # some other kind of dictionary? # TODO: also Save as JSON coll = self.document.entity( coll_id, [ (PROV_TYPE, WFPROV["Artifact"]), (PROV_TYPE, PROV["Collection"]), (PROV_TYPE, PROV["Dictionary"]), ], ) if value.get("class"): _logger.warning("Unknown data class %s.", value["class"]) # FIXME: The class might be "http://example.com/somethingelse" coll.add_asserted_type(CWLPROV[value["class"]]) # Let's iterate and recurse coll_attribs = [] # type: List[Tuple[Identifier, ProvEntity]] for (key, val) in value.items(): v_ent = self.declare_artefact(val) self.document.membership(coll, v_ent) m_entity = self.document.entity(uuid.uuid4().urn) # Note: only support PROV-O style dictionary # https://www.w3.org/TR/prov-dictionary/#dictionary-ontological-definition # as prov.py do not easily allow PROV-N extensions m_entity.add_asserted_type(PROV["KeyEntityPair"]) m_entity.add_attributes({ PROV["pairKey"]: str(key), PROV["pairEntity"]: v_ent }) coll_attribs.append((PROV["hadDictionaryMember"], m_entity)) coll.add_attributes(coll_attribs) self.research_object.add_uri(coll.identifier.uri) return coll # some other kind of Collection? 
        # TODO: also save as JSON
        try:
            members = []
            for each_input_obj in iter(value):
                # Recurse and register any nested objects
                e = self.declare_artefact(each_input_obj)
                members.append(e)

            # If we reached this, then we were allowed to iterate
            coll = self.document.entity(
                uuid.uuid4().urn,
                [
                    (PROV_TYPE, WFPROV["Artifact"]),
                    (PROV_TYPE, PROV["Collection"]),
                ],
            )
            if not members:
                coll.add_asserted_type(PROV["EmptyCollection"])
            else:
                for member in members:
                    # FIXME: This won't preserve order, for that
                    # we would need to use PROV.Dictionary
                    # with numeric keys
                    self.document.membership(coll, member)
            self.research_object.add_uri(coll.identifier.uri)
            # FIXME: list value does not support adding "@id"
            return coll
        except TypeError:
            _logger.warning("Unrecognized type %s of %r", type(value), value)
            # Let's just fall back to Python repr()
            entity = self.document.entity(uuid.uuid4().urn, {PROV_LABEL: repr(value)})
            self.research_object.add_uri(entity.identifier.uri)
            return entity

    def used_artefacts(
        self,
        job_order: Union[CWLObjectType, List[CWLObjectType]],
        process_run_id: str,
        name: Optional[str] = None,
    ) -> None:
        """Add used() for each data artefact."""
        if isinstance(job_order, list):
            for entry in job_order:
                self.used_artefacts(entry, process_run_id, name)
        else:
            # FIXME: Use the workflow name in packed.cwl, "main" is wrong for nested workflows
            base = "main"
            if name is not None:
                base += "/" + name
            for key, value in job_order.items():
                prov_role = self.wf_ns[f"{base}/{key}"]
                try:
                    entity = self.declare_artefact(value)
                    self.document.used(
                        process_run_id,
                        entity,
                        datetime.datetime.now(),
                        None,
                        {"prov:role": prov_role},
                    )
                except OSError:
                    pass

    def generate_output_prov(
        self,
        final_output: Union[CWLObjectType, MutableSequence[CWLObjectType], None],
        process_run_id: Optional[str],
        name: Optional[str],
    ) -> None:
        """Call wasGeneratedBy() for each output and copy the files into the RO."""
        if isinstance(final_output, MutableSequence):
            for entry in final_output:
                self.generate_output_prov(entry, process_run_id, name)
        elif final_output is not None:
            # Timestamp should be created at the earliest
            timestamp = datetime.datetime.now()

            # For each output, find/register the corresponding
            # entity (UUID) and document it as generated in
            # a role corresponding to the output
            for output, value in final_output.items():
                entity = self.declare_artefact(value)
                if name is not None:
                    name = urllib.parse.quote(str(name), safe=":/,#")
                    # FIXME: Probably not "main" in nested workflows
                    role = self.wf_ns[f"main/{name}/{output}"]
                else:
                    role = self.wf_ns["main/%s" % output]

                if not process_run_id:
                    process_run_id = self.workflow_run_uri

                self.document.wasGeneratedBy(entity, process_run_id, timestamp,
                                             None, {"prov:role": role})

    def prospective_prov(self, job: JobsType) -> None:
        """Create a prospective prov recording as a wfdesc prov:Plan."""
        if not isinstance(job, WorkflowJob):
            # direct command line tool execution
            self.document.entity(
                "wf:main",
                {
                    PROV_TYPE: WFDESC["Process"],
                    "prov:type": PROV["Plan"],
                    "prov:label": "Prospective provenance",
                },
            )
            return

        self.document.entity(
            "wf:main",
            {
                PROV_TYPE: WFDESC["Workflow"],
                "prov:type": PROV["Plan"],
                "prov:label": "Prospective provenance",
            },
        )

        for step in job.steps:
            stepnametemp = "wf:main/" + str(step.name)[5:]
            stepname = urllib.parse.quote(stepnametemp, safe=":/,#")
            provstep = self.document.entity(
                stepname,
                {PROV_TYPE: WFDESC["Process"], "prov:type": PROV["Plan"]},
            )
            self.document.entity(
                "wf:main",
                {
                    "wfdesc:hasSubProcess": provstep,
                    "prov:label": "Prospective provenance",
                },
            )
        # TODO: Declare roles/parameters as well
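
    # Illustrative (assumed, not captured from a real run) PROV-N shape of what
    # prospective_prov() above asserts for a two-step workflow:
    #
    #   entity(wf:main, [prov:type='wfdesc:Workflow', prov:type='prov:Plan',
    #                    prov:label="Prospective provenance"])
    #   entity(wf:main/step1, [prov:type='wfdesc:Process', prov:type='prov:Plan'])
    #   entity(wf:main, [wfdesc:hasSubProcess='wf:main/step1',
    #                    prov:label="Prospective provenance"])
    #   entity(wf:main/step2, [prov:type='wfdesc:Process', prov:type='prov:Plan'])
    #   entity(wf:main, [wfdesc:hasSubProcess='wf:main/step2',
    #                    prov:label="Prospective provenance"])
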
    def activity_has_provenance(self, activity, prov_ids):
        # type: (str, List[Identifier]) -> None
        """Add http://www.w3.org/TR/prov-aq/ relations to nested PROV files."""
        # NOTE: The below will only work if the corresponding metadata/provenance arcp URI
        # is a pre-registered namespace in the PROV Document
        attribs = [(PROV["has_provenance"], prov_id) for prov_id in prov_ids]
        self.document.activity(activity, other_attributes=attribs)
        # Tip: we can't use https://www.w3.org/TR/prov-links/#term-mention
        # as prov:mentionOf() is only for entities, not activities
        uris = [i.uri for i in prov_ids]
        self.research_object.add_annotation(activity, uris, PROV["has_provenance"].uri)

    def finalize_prov_profile(self, name):
        # type: (Optional[str]) -> List[Identifier]
        """Transfer the provenance-related files to the RO."""
        # NOTE: Relative posix path
        if name is None:
            # main workflow, fixed filenames
            filename = "primary.cwlprov"
        else:
            # ASCII-friendly filename, avoiding % as we don't want %2520 in manifest.json
            wf_name = urllib.parse.quote(str(name), safe="").replace("%", "_")
            # Note that the above could cause overlaps for similarly named
            # workflows, but that's OK as we'll also include the run uuid,
            # which also covers the case of this step being run in
            # multiple places or iterations
            filename = f"{wf_name}.{self.workflow_run_uuid}.cwlprov"

        basename = str(PurePosixPath(PROVENANCE) / filename)

        # TODO: Also support other profiles than CWLProv, e.g. ProvOne

        # list of prov identifiers of provenance files
        prov_ids = []

        # https://www.w3.org/TR/prov-xml/
        with self.research_object.write_bag_file(basename + ".xml") as provenance_file:
            self.document.serialize(provenance_file, format="xml", indent=4)
            prov_ids.append(self.provenance_ns[filename + ".xml"])

        # https://www.w3.org/TR/prov-n/
        with self.research_object.write_bag_file(basename + ".provn") as provenance_file:
            self.document.serialize(provenance_file, format="provn", indent=2)
            prov_ids.append(self.provenance_ns[filename + ".provn"])

        # https://www.w3.org/Submission/prov-json/
        with self.research_object.write_bag_file(basename + ".json") as provenance_file:
            self.document.serialize(provenance_file, format="json", indent=2)
            prov_ids.append(self.provenance_ns[filename + ".json"])

        # "rdf" aka https://www.w3.org/TR/prov-o/
        # which can be serialized to ttl/nt/jsonld (and more!)

        # https://www.w3.org/TR/turtle/
        with self.research_object.write_bag_file(basename + ".ttl") as provenance_file:
            self.document.serialize(provenance_file, format="rdf", rdf_format="turtle")
            prov_ids.append(self.provenance_ns[filename + ".ttl"])

        # https://www.w3.org/TR/n-triples/
        with self.research_object.write_bag_file(basename + ".nt") as provenance_file:
            self.document.serialize(provenance_file, format="rdf", rdf_format="ntriples")
            prov_ids.append(self.provenance_ns[filename + ".nt"])

        # https://www.w3.org/TR/json-ld/
        # TODO: Use a nice JSON-LD context
        # see also https://eprints.soton.ac.uk/395985/
        # 404 Not Found on https://provenance.ecs.soton.ac.uk/prov.jsonld :(
        with self.research_object.write_bag_file(basename + ".jsonld") as provenance_file:
            self.document.serialize(provenance_file, format="rdf", rdf_format="json-ld")
            prov_ids.append(self.provenance_ns[filename + ".jsonld"])

        _logger.debug("[provenance] added provenance: %s", prov_ids)
        return prov_ids
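
# --- A small standalone sketch (an assumption for illustration, not part of the
# class above) of the multi-format serialization pattern finalize_prov_profile()
# uses: one ProvDocument written as PROV-XML, PROV-N, PROV-JSON and, via the
# "rdf" serializer, Turtle. Requires the `prov` package (plus `rdflib` for RDF).
import io

from prov.model import ProvDocument


def _serialize_all_formats() -> dict:
    doc = ProvDocument()
    doc.add_namespace('ex', 'http://example.org/')
    doc.entity('ex:entity1', {'prov:label': 'example entity'})

    outputs = {}
    for fmt, kwargs in [
        ('xml', {'indent': 4}),
        ('provn', {}),
        ('json', {'indent': 2}),
        ('rdf', {'rdf_format': 'turtle'}),
    ]:
        # prov serializers accept text streams; StringIO stands in for the
        # bag files that finalize_prov_profile() writes
        buffer = io.StringIO()
        doc.serialize(buffer, format=fmt, **kwargs)
        outputs[fmt] = buffer.getvalue()
    return outputs
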
def get_provenance_history(self, driver, uuid, depth=None):
    prov_doc = ProvDocument()
    # NOTE!! There is a bug with the JSON serializer. I can't add the prov prefix using this mechanism
    prov_doc.add_namespace('ex', 'http://example.org/')
    prov_doc.add_namespace('hubmap', 'https://hubmapconsortium.org/')
    #prov_doc.add_namespace('dct', 'http://purl.org/dc/terms/')
    #prov_doc.add_namespace('foaf','http://xmlns.com/foaf/0.1/')

    relation_list = []
    with driver.session() as session:
        try:
            # max_level_str is the string used to put a limit on the number of levels to traverse
            max_level_str = ''
            if depth is not None and len(str(depth)) > 0:
                max_level_str = """maxLevel: {depth},""".format(depth=depth)

            """
            Basically this Cypher query returns a collection of nodes and relationships.
            The relationships include ACTIVITY_INPUT, ACTIVITY_OUTPUT and HAS_METADATA.
            First, we build a dictionary of the nodes using uuid as a key.
            Next, we loop through the relationships looking for HAS_METADATA relationships.
            The HAS_METADATA relationships connect the Entity nodes with their metadata.
            The data from the Metadata node becomes the 'metadata' attribute for the Entity node.
            """

            """Possible replacement: THIS WORKS...NEEDS LOTS of COMMENTS!!

            MATCH (entity_metadata)<-[r1:HAS_METADATA]-(e)<-[r2:ACTIVITY_OUTPUT]-(a:Activity)-[r3:HAS_METADATA]->(activity_metadata)
            WHERE e.hubmap_identifier = 'TEST0010-LK-1-1'
            WITH [e,a, entity_metadata, activity_metadata] AS entities,
                 COLLECT(r1) + COLLECT(r2) + COLLECT(r3) AS relationships
            WITH [node in entities | node {.*, label:labels(node)}] AS nodes,
                 [rel in relationships | rel {.*,
                     fromNode: {label:labels(startNode(rel))[0], uuid:startNode(rel).uuid},
                     toNode: {label:labels(endNode(rel))[0], uuid:endNode(rel).uuid},
                     rel_data: {type: type(rel)}}] AS rels
            RETURN nodes, rels
            UNION OPTIONAL MATCH (activity_metadata)<-[r1:HAS_METADATA]-(a:Activity)<-[r2:ACTIVITY_INPUT|:ACTIVITY_OUTPUT*]-(parent)-[r3:HAS_METADATA]->(parent_metadata),
                  (e)<-[r4:ACTIVITY_OUTPUT]-(a:Activity)
            WHERE e.hubmap_identifier = 'TEST0010-LK-1-1'
            WITH [parent,parent_metadata, a, activity_metadata] AS nodes,
                 [rel in COLLECT(r1) + COLLECT(r3) + COLLECT(r4) + COLLECT(apoc.convert.toRelationship(r2)) | rel {.*,
                     fromNode: {label:labels(startNode(rel))[0], uuid:startNode(rel).uuid},
                     toNode: {label:labels(endNode(rel))[0], uuid:endNode(rel).uuid},
                     rel_data: {type: type(rel)}}] AS rels
            RETURN DISTINCT nodes, rels

            uuid for TEST0010-LK-1-1 for testing: eda3916db4695d834eb6c51a893d06f1
            """

            stmt = """MATCH (n:Entity {{ uuid: '{uuid}' }})
            CALL apoc.path.subgraphAll(n, {{ {max_level_str} relationshipFilter:'<ACTIVITY_INPUT|<ACTIVITY_OUTPUT|HAS_METADATA>' }}) YIELD nodes, relationships
            WITH [node in nodes | node {{ .*, label:labels(node)[0] }} ] as nodes,
                 [rel in relationships | rel {{ .*, fromNode: {{ label:labels(startNode(rel))[0], uuid:startNode(rel).uuid }}, toNode: {{ label:labels(endNode(rel))[0], uuid:endNode(rel).uuid }}, rel_data: {{ type: type(rel) }} }} ] as rels
            WITH {{ nodes:nodes, relationships:rels }} as json
            RETURN json""".format(uuid=uuid, max_level_str=max_level_str)

            result = session.run(stmt)

            # there should only be one record
            for jsonData in result:
                try:
                    record = dict(jsonData)['json']

                    if 'relationships' not in record:
                        raise LookupError('Error, unable to find relationships for uuid:' + uuid)
                    if 'nodes' not in record:
                        raise LookupError('Error, unable to find nodes for uuid:' + uuid)

                    node_dict = {}
                    # pack the nodes into a dictionary using the uuid as a key
                    for node_record in record['nodes']:
                        node_dict[node_record['uuid']] = node_record

                    # TODO: clean up nodes
                    # remove nodes that lack metadata
                    # need to devise a methodology for this
                    # try preprocessing the record['relationships'] here:
                    # make a copy of the node_dict called unreferenced_node_dict
                    # loop through the relationships and find all the HAS_METADATA relationships
                    # for each node pair in the HAS_METADATA relationship, delete it from the unreferenced_node_dict
                    # once the loop is finished, continue as before
                    # add some logic when generating the wasGeneratedBy and used relationships:
                    # if either node is in the unreferenced_node_dict, then ignore the relationship

                    # now, connect the nodes
                    for rel_record in record['relationships']:
                        from_uuid = rel_record['fromNode']['uuid']
                        to_uuid = rel_record['toNode']['uuid']
                        from_node = node_dict[from_uuid]
                        to_node = node_dict[to_uuid]

                        if rel_record['rel_data']['type'] == HubmapConst.HAS_METADATA_REL:
                            # assign the metadata node as the metadata attribute
                            # just extract the provenance information from the metadata node
                            entity_timestamp_json = Provenance.get_json_timestamp(
                                int(to_node[HubmapConst.PROVENANCE_CREATE_TIMESTAMP_ATTRIBUTE]))
                            provenance_data = {
                                ProvConst.PROV_GENERATED_TIME_ATTRIBUTE: entity_timestamp_json
                            }

                            type_code = None
                            isEntity = True
                            if HubmapConst.ENTITY_TYPE_ATTRIBUTE in from_node:
                                type_code = from_node[HubmapConst.ENTITY_TYPE_ATTRIBUTE]
                            elif HubmapConst.ACTIVITY_TYPE_ATTRIBUTE in from_node:
                                type_code = from_node[HubmapConst.ACTIVITY_TYPE_ATTRIBUTE]
                                isEntity = False

                            label_text = None
                            if HubmapConst.LAB_IDENTIFIER_ATTRIBUTE in from_node:
                                label_text = from_node[HubmapConst.LAB_IDENTIFIER_ATTRIBUTE]
                            else:
                                label_text = from_node[HubmapConst.UUID_ATTRIBUTE]

                            # build the metadata attribute from the Metadata node
                            metadata_attribute = {}
                            for attribute_key in to_node:
                                if attribute_key not in self.metadata_ignore_attributes:
                                    if attribute_key in self.known_attribute_map:
                                        # special case: timestamps
                                        if attribute_key == HubmapConst.PROVENANCE_MODIFIED_TIMESTAMP_ATTRIBUTE:
                                            provenance_data[self.known_attribute_map[attribute_key]] = \
                                                Provenance.get_json_timestamp(int(to_node[attribute_key]))
                                    else:
                                        # add any extraneous data to the metadata attribute
                                        metadata_attribute[attribute_key] = to_node[attribute_key]

                            # Need to add the agent and organization here, plus the appropriate
                            # relationships (between the entity and the agent plus organization)
                            agent_record = self.get_agent_record(to_node)
                            agent_unique_id = str(agent_record[ProvConst.HUBMAP_PROV_USER_EMAIL]).replace('@', '-')
                            agent_unique_id = str(agent_unique_id).replace('.', '-')
                            if ProvConst.HUBMAP_PROV_USER_UUID in agent_record:
                                agent_unique_id = agent_record[ProvConst.HUBMAP_PROV_USER_UUID]
                            agent_uri = Provenance.build_uri('hubmap', 'agent', agent_unique_id)
                            organization_record = self.get_organization_record(to_node)
                            organization_uri = Provenance.build_uri(
                                'hubmap', 'organization',
                                organization_record[ProvConst.HUBMAP_PROV_GROUP_UUID])
                            doc_agent = None
                            doc_org = None

                            get_agent = prov_doc.get_record(agent_uri)
                            # only add this agent once
                            if len(get_agent) == 0:
                                doc_agent = prov_doc.agent(agent_uri, agent_record)
                            else:
                                doc_agent = get_agent[0]

                            get_org = prov_doc.get_record(organization_uri)
                            # only add this organization once
                            if len(get_org) == 0:
                                doc_org = prov_doc.agent(organization_uri, organization_record)
                            else:
                                doc_org = get_org[0]

                            other_attributes = {
                                ProvConst.PROV_LABEL_ATTRIBUTE: label_text,
                                ProvConst.PROV_TYPE_ATTRIBUTE: type_code,
                                ProvConst.HUBMAP_DOI_ATTRIBUTE: from_node[HubmapConst.DOI_ATTRIBUTE],
                                ProvConst.HUBMAP_DISPLAY_DOI_ATTRIBUTE: from_node[HubmapConst.DISPLAY_DOI_ATTRIBUTE],
                                ProvConst.HUBMAP_DISPLAY_IDENTIFIER_ATTRIBUTE: label_text,
                                ProvConst.HUBMAP_UUID_ATTRIBUTE: from_node[HubmapConst.UUID_ATTRIBUTE]
                            }
                            # only add metadata if it contains data
                            if len(metadata_attribute) > 0:
                                other_attributes[ProvConst.HUBMAP_METADATA_ATTRIBUTE] = \
                                    json.dumps(metadata_attribute)
                            # add the provenance data to the other_attributes
                            other_attributes.update(provenance_data)

                            if isEntity:
                                prov_doc.entity(
                                    Provenance.build_uri('hubmap', 'entities', from_node['uuid']),
                                    other_attributes)
                            else:
                                activity_timestamp_json = Provenance.get_json_timestamp(
                                    int(to_node[HubmapConst.PROVENANCE_CREATE_TIMESTAMP_ATTRIBUTE]))
                                doc_activity = prov_doc.activity(
                                    Provenance.build_uri('hubmap', 'activities', from_node['uuid']),
                                    activity_timestamp_json, activity_timestamp_json,
                                    other_attributes)
                                prov_doc.actedOnBehalfOf(doc_agent, doc_org, doc_activity)
                        elif rel_record['rel_data']['type'] in [HubmapConst.ACTIVITY_OUTPUT_REL,
                                                                HubmapConst.ACTIVITY_INPUT_REL]:
                            to_node_uri = None
                            from_node_uri = None
                            if HubmapConst.ENTITY_TYPE_ATTRIBUTE in to_node:
                                to_node_uri = Provenance.build_uri('hubmap', 'entities', to_node['uuid'])
                            else:
                                to_node_uri = Provenance.build_uri('hubmap', 'activities', to_node['uuid'])
                            if HubmapConst.ENTITY_TYPE_ATTRIBUTE in from_node:
                                from_node_uri = Provenance.build_uri('hubmap', 'entities', from_node['uuid'])
                            else:
                                from_node_uri = Provenance.build_uri('hubmap', 'activities', from_node['uuid'])

                            if rel_record['rel_data']['type'] == 'ACTIVITY_OUTPUT':
                                # prov_doc.wasGeneratedBy(entity, activity, time, identifier, other_attributes)
                                prov_doc.wasGeneratedBy(to_node_uri, from_node_uri)
                            if rel_record['rel_data']['type'] == 'ACTIVITY_INPUT':
                                # prov_doc.used(activity, entity, time, identifier, other_attributes)
                                prov_doc.used(to_node_uri, from_node_uri)

                            # for now, simply create a "relation" where the fromNode's uuid is
                            # connected to a toNode's uuid via a relationship, e.g.:
                            # {'fromNodeUUID': '42e10053358328c9079f1c8181287b6d',
                            #  'relationship': 'ACTIVITY_OUTPUT',
                            #  'toNodeUUID': '398400024fda58e293cdb435db3c777e'}
                            rel_data_record = {
                                'fromNodeUUID': from_node['uuid'],
                                'relationship': rel_record['rel_data']['type'],
                                'toNodeUUID': to_node['uuid']
                            }
                            relation_list.append(rel_data_record)

                    return_data = {'nodes': node_dict, 'relations': relation_list}
                except Exception as e:
                    raise e

            # there is a bug in the JSON serializer, so manually insert the prov prefix
            output_doc = prov_doc.serialize(indent=2)
            output_doc = output_doc.replace(
                '"prefix": {',
                '"prefix": {\n    "prov" : "http://www.w3.org/ns/prov#", ')

            #output_doc = prov_doc.serialize(format='rdf', rdf_format='trig')
            #output_doc = prov_doc.serialize(format='provn')

            return output_doc

        except ConnectionError as ce:
            print('A connection error occurred: ', str(ce.args[0]))
            raise ce
        except ValueError as ve:
            # ValueError has no `.value` attribute; use str(ve) instead
            print('A value error occurred: ', str(ve))
            raise ve
        except Exception as e:
            print('An exception occurred in get_provenance_history: ' + str(e))
            traceback.print_exc()
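
# --- Hedged sketch (not from the original code): the query above interpolates
# `uuid` directly into the Cypher string with str.format(). With the official
# `neo4j` Python driver, the uuid and the apoc maxLevel can instead be passed
# as query parameters, which avoids quoting/injection issues. The function and
# parameter names below are illustrative assumptions.
from neo4j import GraphDatabase  # needed for the usage example below


def fetch_provenance_subgraph(driver, entity_uuid, depth=None):
    stmt = """
        MATCH (n:Entity { uuid: $uuid })
        CALL apoc.path.subgraphAll(n, {
            maxLevel: $max_level,
            relationshipFilter: '<ACTIVITY_INPUT|<ACTIVITY_OUTPUT|HAS_METADATA>'
        }) YIELD nodes, relationships
        WITH [node IN nodes | node { .*, label: labels(node)[0] }] AS nodes,
             [rel IN relationships | rel { .*,
                 fromNode: { label: labels(startNode(rel))[0], uuid: startNode(rel).uuid },
                 toNode: { label: labels(endNode(rel))[0], uuid: endNode(rel).uuid },
                 rel_data: { type: type(rel) } }] AS rels
        RETURN { nodes: nodes, relationships: rels } AS json
    """
    with driver.session() as session:
        # apoc treats maxLevel: -1 as "no limit"
        record = session.run(stmt, uuid=entity_uuid,
                             max_level=depth if depth is not None else -1).single()
        return record['json'] if record else None

# Example usage (connection details are placeholders):
#   driver = GraphDatabase.driver('bolt://localhost:7687', auth=('neo4j', 'password'))
#   subgraph = fetch_provenance_subgraph(driver, 'eda3916db4695d834eb6c51a893d06f1', depth=3)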