def base_connector_bundle_parameter_example():
    doc = ProvDocument()
    doc.add_namespace("ex", "http://example.com")

    attributes = dict()

    namespaces = dict()
    namespaces.update({"ex": "http://example.com"})

    type_map = dict()
    type_map.update({"int value": "int"})
    type_map.update({"date value": "xds:datetime"})

    metadata = dict()
    metadata.update(
        {METADATA_KEY_PROV_TYPE: doc.valid_qualified_name("prov:Bundle")})
    metadata.update(
        {METADATA_KEY_IDENTIFIER: doc.valid_qualified_name("ex:bundle name")})
    metadata.update({METADATA_KEY_TYPE_MAP: type_map})
    metadata.update({METADATA_KEY_NAMESPACES: namespaces})

    return_data = dict()
    return_data.update({"attributes": attributes})
    return_data.update({"metadata": metadata})
    return return_data
def deriveDependency(self, aDO, aRO, derivedList):
    d1 = ProvDocument()  # d1 is now an empty provenance document
    d1.add_namespace("dt", "http://cs.ncl.ac.uk/dtsim/")
    e1 = d1.entity(DTns + aRO.id)  # deriving
    ag1 = d1.agent(DTns + str(aDO.id))
    for der in derivedList:
        # create provlet
        e2 = d1.entity(DTns + der.id)  # derived
        d1.wasAttributedTo(e2, ag1)
        d1.wasDerivedFrom(e2, e1)
        # update upstream pointer
        der.upstream = [(aRO, None)]  # der is upstream from aRO with no activity
        # update downstream
        aRO.downstream.append((der, None))  # der is downstream from aRO with no activity

    # update global graph
    e1 = pGlobal.entity(DTns + aRO.id)  # deriving
    ag1 = pGlobal.agent(DTns + str(aDO.id))
    for der in derivedList:
        e2 = pGlobal.entity(DTns + der.id)  # derived
        pGlobal.wasAttributedTo(e2, ag1)
        pGlobal.wasDerivedFrom(e2, e1)

    # trigger credit recomputation
    for der in derivedList:
        # aRO needs its credit updated with der.currentTotalCredit
        aCreditManager.addDerivationCredit(aRO, der.currentTotalCredit)

    # self.notify(d1)
    return d1
def test_namespace_inheritance(self):
    prov_doc = ProvDocument()
    prov_doc.add_namespace("ex", "http://www.example.org/")
    bundle = prov_doc.bundle("ex:bundle")
    e1 = bundle.entity("ex:e1")
    self.assertIsNotNone(e1.identifier, "e1's identifier is None!")
    self.do_tests(prov_doc)
def datatypes():
    g = ProvDocument()
    ex = Namespace('ex', 'http://example.org/')
    g.add_namespace(ex)

    attributes = {
        'ex:int': 100,
        'ex:float': 100.123456,
        'ex:long': 123456789000,
        'ex:bool': True,
        'ex:str': 'Some string',
        'ex:unicode': u'Some unicode string with accents: Huỳnh Trung Đông',
        'ex:timedate': datetime.datetime(2012, 12, 12, 14, 7, 48),
        'ex:intstr': Literal("PROV Internationalized string",
                             PROV["InternationalizedString"], "en"),
    }
    multiline = """Line1
Line2
Line3"""
    attributes['ex:multi-line'] = multiline
    g.entity('ex:e1', attributes)
    return g
class ProjectProvenance:
    def __init__(self, database_helper, full_provenance=False):
        """
        Initializes the provenance for the mjclawar_rarshad project

        Parameters
        ----------
        database_helper: DatabaseHelper
        full_provenance: bool
        """
        assert isinstance(database_helper, DatabaseHelper)
        self.database_helper = database_helper

        if full_provenance:
            self.prov_doc = ProvDocument.deserialize(dir_info.plan_json)
        else:
            self.prov_doc = ProvDocument()
        self.prov_doc.add_namespace(mcras.BDP_NAMESPACE.name, mcras.BDP_NAMESPACE.link)
        self.prov_doc.add_namespace(mcras.ALG_NAMESPACE.name, mcras.ALG_NAMESPACE.link)
        self.prov_doc.add_namespace(mcras.DAT_NAMESPACE.name, mcras.DAT_NAMESPACE.link)
        self.prov_doc.add_namespace(mcras.LOG_NAMESPACE.name, mcras.LOG_NAMESPACE.link)
        self.prov_doc.add_namespace(mcras.ONT_NAMESPACE.name, mcras.ONT_NAMESPACE.link)

    def write_provenance_json(self):
        self.prov_doc.serialize(dir_info.plan_json)
def test_namespace_inheritance(self):
    prov_doc = ProvDocument()
    prov_doc.add_namespace('ex', 'http://www.example.org/')
    bundle = prov_doc.bundle('ex:bundle')
    e1 = bundle.entity('ex:e1')
    self.assertIsNotNone(e1.identifier, "e1's identifier is None!")
    self.assertRoundTripEquivalence(prov_doc)
def test_cmip6_data_citation_url(tmp_path):
    """Test3: CMIP6 info_url is retrieved from ES-DOC."""
    # Create fake provenance
    provenance = ProvDocument()
    provenance.add_namespace('file', uri=ESMVALTOOL_URI_PREFIX + 'file')
    provenance.add_namespace('attribute',
                             uri=ESMVALTOOL_URI_PREFIX + 'attribute')
    attributes = {
        'attribute:mip_era': 'CMIP6',
        'attribute:activity_id': 'activity',
        'attribute:institution_id': 'institution',
        'attribute:source_id': 'source',
        'attribute:experiment_id': 'experiment',
    }
    filename = str(tmp_path / 'output.nc')
    provenance.entity('file:' + filename, attributes)

    _write_citation_files(filename, provenance)
    citation_url = tmp_path / 'output_data_citation_info.txt'

    # Create fake info url
    fake_url_prefix = '.'.join(attributes.values())
    text = '\n'.join([
        "Follow the links below to find more information about CMIP6 data:",
        f"- {CMIP6_URL_STEM}/cmip6?input={fake_url_prefix}",
        '',
    ])
    assert citation_url.read_text() == text
def base_connector_bundle_parameter_example():
    """
    This example returns a dict with example arguments for a db_adapter

    :return: dict {attributes, metadata}
    :rtype: dict
    """
    doc = ProvDocument()
    doc.add_namespace("ex", "http://example.com")

    attributes = dict()
    attributes.update({"prov:type": "prov:Bundle"})

    namespaces = dict()
    namespaces.update({"ex": "http://example.com"})

    type_map = dict()
    type_map.update({"int value": "int"})
    type_map.update({"date value": "xds:datetime"})

    metadata = dict()
    metadata.update(
        {METADATA_KEY_PROV_TYPE: doc.valid_qualified_name("prov:Entity")})
    metadata.update(
        {METADATA_KEY_IDENTIFIER: doc.valid_qualified_name("ex:bundle name")})
    metadata.update({METADATA_KEY_TYPE_MAP: type_map})
    metadata.update({METADATA_KEY_NAMESPACES: namespaces})

    return_data = dict()
    return_data.update({"attributes": attributes})
    return_data.update({"metadata": metadata})
    return return_data
def prov_db_unknown_prov_typ_example():
    doc = ProvDocument()
    doc.add_namespace("ex", "https://example.com")
    doc.entity(identifier="ex:Entity1")
    doc.entity(identifier="ex:Entity2")
    doc.influence(influencee="ex:Entity1", influencer="ex:Entity2")
    return doc
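# Illustrative sketch, not part of the original snippets: how a small document
# like the one built above can be inspected. It assumes only the public
# prov.model API (get_provn and serialize, which defaults to PROV-JSON).
from prov.model import ProvDocument

demo_doc = ProvDocument()
demo_doc.add_namespace("ex", "https://example.com")
demo_doc.entity(identifier="ex:Entity1")
demo_doc.entity(identifier="ex:Entity2")
demo_doc.influence(influencee="ex:Entity1", influencer="ex:Entity2")

print(demo_doc.get_provn())          # PROV-N text representation
print(demo_doc.serialize(indent=2))  # PROV-JSON string (the default format)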
def datatypes(): g = ProvDocument() ex = Namespace("ex", "http://example.org/") g.add_namespace(ex) attributes = { "ex:int": 100, "ex:float": 100.123456, "ex:long": 123456789000, "ex:bool": True, "ex:str": "Some string", "ex:unicode": "Some unicode string with accents: Huỳnh Trung Đông", "ex:timedate": datetime.datetime(2012, 12, 12, 14, 7, 48), "ex:intstr": Literal("PROV Internationalized string", PROV["InternationalizedString"], "en"), } multiline = """Line1 Line2 Line3""" attributes["ex:multi-line"] = multiline g.entity("ex:e1", attributes) return g
def base_connector_relation_parameter_example():
    doc = ProvDocument()
    doc.add_namespace("ex", "http://example.com")
    doc.add_namespace("custom", "http://custom.com")

    namespaces = dict()
    namespaces.update({"ex": "http://example.com"})
    namespaces.update({"custom": "http://custom.com"})

    type_map = dict()
    type_map.update({"int value": "int"})
    type_map.update({"date value": "xds:datetime"})

    metadata = dict()
    metadata.update({METADATA_KEY_PROV_TYPE: PROV_RECORD_IDS_MAP["mentionOf"]})
    metadata.update({METADATA_KEY_IDENTIFIER: "identifier for the relation"})
    metadata.update({METADATA_KEY_TYPE_MAP: type_map})
    metadata.update({METADATA_KEY_NAMESPACES: namespaces})

    return_data = dict()
    return_data.update({"attributes": attributes_dict_example()})
    return_data.update({"metadata": metadata})
    return_data.update({"from_node": doc.valid_qualified_name("ex:Yoda")})
    return_data.update(
        {"to_node": doc.valid_qualified_name("ex:Luke Skywalker")})
    return_data.update({"doc": doc})
    return return_data
def test_references(tmp_path, monkeypatch):
    """Test1: references are replaced with bibtex."""
    # Create fake provenance
    provenance = ProvDocument()
    provenance.add_namespace('file', uri=ESMVALTOOL_URI_PREFIX + 'file')
    provenance.add_namespace('attribute',
                             uri=ESMVALTOOL_URI_PREFIX + 'attribute')
    filename = str(tmp_path / 'output.nc')
    attributes = {
        'attribute:references': 'test_tag',
        'attribute:script_file': 'diagnostics.py'
    }
    provenance.entity('file:' + filename, attributes)

    # Create fake bibtex references tag file
    references_path = tmp_path / 'references'
    references_path.mkdir()
    monkeypatch.setattr(esmvalcore._citation.DIAGNOSTICS, 'path', tmp_path)
    fake_bibtex_file = references_path / 'test_tag.bibtex'
    fake_bibtex = "Fake bibtex file content\n"
    fake_bibtex_file.write_text(fake_bibtex)

    _write_citation_files(filename, provenance)
    citation_file = tmp_path / 'output_citation.bibtex'
    citation = citation_file.read_text()
    assert citation == '\n'.join([ESMVALTOOL_PAPER, fake_bibtex])
def document_with_n_bundles_having_default_namespace(n):
    prov_doc = ProvDocument()
    prov_doc.add_namespace('ex', 'http://www.example.org/')
    for i in range(n):
        x = str(i + 1)
        bundle = prov_doc.bundle('ex:bundle/' + x)
        bundle.set_default_namespace('http://www.example.org/default/' + x)
        bundle.entity('e')
    return prov_doc
def document_with_n_bundles_having_default_namespace(n):
    prov_doc = ProvDocument()
    prov_doc.add_namespace("ex", "http://www.example.org/")
    for i in range(n):
        x = str(i + 1)
        bundle = prov_doc.bundle("ex:bundle/" + x)
        bundle.set_default_namespace("http://www.example.org/default/" + x)
        bundle.entity("e")
    return prov_doc
def test_xsd_qnames(self):
    prov_doc = ProvDocument()
    ex = Namespace('ex', 'http://www.example.org')
    prov_doc.add_namespace(ex)
    an_xsd_qname = XSDQName(ex['a_value'])
    prov_doc.entity('ex:e1', {'prov:value': an_xsd_qname})
    self.assertPROVJSONRoundTripEquivalence(prov_doc)
def long_literals():
    g = ProvDocument()

    long_uri = "http://Lorem.ipsum/dolor/sit/amet/consectetur/adipiscing/elit/Quisque/vel/sollicitudin/felis/nec/venenatis/massa/Aenean/lectus/arcu/sagittis/sit/amet/nisl/nec/varius/eleifend/sem/In/hac/habitasse/platea/dictumst/Aliquam/eget/fermentum/enim/Curabitur/auctor/elit/non/ipsum/interdum/at/orci/aliquam/"
    ex = Namespace('ex', long_uri)
    g.add_namespace(ex)

    g.entity('ex:e1', {'prov:label': 'Lorem ipsum dolor sit amet, consectetur adipiscing elit. Donec pellentesque luctus nulla vel ullamcorper. Donec sit amet ligula sit amet lorem pretium rhoncus vel vel lorem. Sed at consequat metus, eget eleifend massa. Fusce a facilisis turpis. Lorem volutpat.'})
    return g
def job2prov(job):
    """
    Create ProvDocument based on job description
    :param job: UWS job
    :return: ProvDocument
    """
    # job.jdl.content = {
    #     'description': description,
    #     'parameters': parameters,
    #     'results': results,
    #     'executionduration': execdur,
    #     'quote': quote
    # }
    # parameters[pname] = {
    #     'type': p.get('type'),
    #     'required': p.get('required'),
    #     'default': p.get('default'),
    #     'description': list(p)[0].text,
    # }
    # results[r.get('value')] = {
    #     'mediaType': r.get('mediaType'),
    #     'default': r.get('default'),
    #     'description': list(r)[0].text,
    # }

    pdoc = ProvDocument()
    # Declaring namespaces for various prefixes used in the example
    pdoc.add_namespace('prov', 'http://www.w3.org/ns/prov#')
    pdoc.add_namespace('voprov', 'http://www.ivoa.net/ns/voprov#')
    pdoc.add_namespace('cta', 'http://www.cta-observatory.org#')
    pdoc.add_namespace('uwsdata', 'https://voparis-uws-test.obspm.fr/rest/' +
                       job.jobname + '/' + job.jobid + '/')
    pdoc.add_namespace('ctajobs', 'http://www.cta-observatory.org#')
    # Adding an activity
    ctbin = pdoc.activity('ctajobs:' + job.jobname, job.start_time, job.end_time)
    # TODO: add job description, version, url, ...
    # Agent
    pdoc.agent('cta:consortium', other_attributes={'prov:type': "Organization"})
    pdoc.wasAssociatedWith(ctbin, 'cta:consortium')
    # Entities, in and out with relations
    e_in = []
    # note: the original used dict.iteritems() (Python 2); .items() works on both
    for pname, pdict in job.jdl.content['parameters'].items():
        # if pname.startswith('in'):
        if any(x in pdict['type'] for x in ['file', 'xs:anyURI']):
            e_in.append(pdoc.entity('uwsdata:parameters/' + pname))
            # TODO: use publisher_did? add prov attributes, add voprov attributes?
            ctbin.used(e_in[-1])
    e_out = []
    for rname, rdict in job.jdl.content['results'].items():
        e_out.append(pdoc.entity('uwsdata:results/' + rname))
        # TODO: use publisher_did? add prov attributes, add voprov attributes?
        e_out[-1].wasGeneratedBy(ctbin)
        for e in e_in:
            e_out[-1].wasDerivedFrom(e)
    return pdoc
def gen_prov_graph(file_path, option):
    '''
    generates prov graph from form json file

    option = "all": add attributes to nodes
    '''
    form_file = open(file_path, "r")
    json_info = form_file.read()
    form_file.close()

    sf_dict = json.loads(json_info)

    d1 = ProvDocument()
    d1.add_namespace('subm', 'http://www.enes.org/enes_entity/data_submsission')

    global_in_out = d1.entity("subm:" + "form_name_xx")

    print("workflow definition: ", sf_dict['workflow'])
    for [act_name, act] in sf_dict['workflow']:
        print("adding entities for workflow_step: ", act_name)
        entity_in_dict = sf_dict[act_name]['entity_in']
        entity_out_dict = sf_dict[act_name]['entity_out']
        agent_dict = sf_dict[act_name]['agent']
        activity_dict = sf_dict[act_name]['activity']

        # generate nodes
        in_node = d1.entity("subm:" + entity_in_dict['i_name'])
        out_node = d1.entity("subm:" + entity_out_dict['i_name'])
        agent = d1.agent("subm:" + agent_dict['i_name'])
        activity = d1.activity("subm:" + activity_dict['i_name'])

        # clean up and prefix dictionaries
        entity_in_dict = prefix_dict(entity_in_dict, 'subm')
        entity_out_dict = prefix_dict(entity_out_dict, 'subm')
        agent_dict = prefix_dict(agent_dict, 'subm')
        activity_dict = prefix_dict(activity_dict, 'subm')

        if option == "all":
            in_node.add_attributes(entity_in_dict)
            out_node.add_attributes(entity_out_dict)
            agent.add_attributes(agent_dict)
            activity.add_attributes(activity_dict)

        # connect nodes in graph
        d1.wasGeneratedBy(out_node, activity)
        d1.used(activity, in_node)
        d1.wasAssociatedWith(activity, agent)
        d1.wasDerivedFrom(in_node, out_node)
        d1.used(activity, global_in_out)
        d1.wasGeneratedBy(global_in_out, activity)

    return d1
def test_xsd_qnames(self):
    prov_doc = ProvDocument()
    ex = Namespace('ex', 'http://www.example.org/')
    prov_doc.add_namespace(ex)
    ex1 = Namespace('ex1', 'http://www.example1.org/')  # ex1 is not added to the document

    an_xsd_qname = XSDQName(ex['a_value'])
    another_xsd_qname = XSDQName(ex1['another_value'])

    e1 = prov_doc.entity('ex:e1', {'prov:value': an_xsd_qname,
                                   'prov:type': another_xsd_qname})
    for _, attr_value in e1.attributes:
        self.assertIsInstance(attr_value, XSDQName)

    self.assertRoundTripEquivalence(prov_doc)
def long_literals():
    g = ProvDocument()

    long_uri = "http://Lorem.ipsum/dolor/sit/amet/consectetur/adipiscing/elit/Quisque/vel/sollicitudin/felis/nec/venenatis/massa/Aenean/lectus/arcu/sagittis/sit/amet/nisl/nec/varius/eleifend/sem/In/hac/habitasse/platea/dictumst/Aliquam/eget/fermentum/enim/Curabitur/auctor/elit/non/ipsum/interdum/at/orci/aliquam/"
    ex = Namespace('ex', long_uri)
    g.add_namespace(ex)

    g.entity(
        'ex:e1', {
            'prov:label':
            'Lorem ipsum dolor sit amet, consectetur adipiscing elit. Donec pellentesque luctus nulla vel ullamcorper. Donec sit amet ligula sit amet lorem pretium rhoncus vel vel lorem. Sed at consequat metus, eget eleifend massa. Fusce a facilisis turpis. Lorem volutpat.'
        })
    return g
def generateProvlet(self, aDO, aRO):
    # create provlet
    d1 = ProvDocument()  # d1 is now an empty provenance document
    d1.add_namespace("dt", "http://cs.ncl.ac.uk/dtsim/")

    e1 = d1.entity(DTns + aRO.id)
    ag1 = d1.agent(DTns + str(aDO.id))
    d1.wasAttributedTo(e1, ag1)

    # update global graph
    e1 = pGlobal.entity(DTns + aRO.id)
    ag1 = pGlobal.agent(DTns + str(aDO.id))
    pGlobal.wasAttributedTo(e1, ag1)

    # self.notify(d1)
    return d1
def host_provenance(document: ProvDocument) -> None:
    """Record host provenance."""
    document.add_namespace(CWLPROV)
    document.add_namespace(UUID)
    document.add_namespace(FOAF)

    hostname = getfqdn()
    # won't have a foaf:accountServiceHomepage for unix hosts, but
    # we can at least provide hostname
    document.agent(
        ACCOUNT_UUID,
        {
            PROV_TYPE: FOAF["OnlineAccount"],
            "prov:location": hostname,
            CWLPROV["hostname"]: hostname,
        },
    )
def datatypes():
    g = ProvDocument()
    ex = Namespace('ex', 'http://example.org/')
    g.add_namespace(ex)

    attributes = {'ex:int': 100,
                  'ex:float': 100.123456,
                  'ex:long': 123456789000,
                  'ex:bool': True,
                  'ex:str': 'Some string',
                  'ex:unicode': u'Some unicode string with accents: Huỳnh Trung Đông',
                  'ex:timedate': datetime.datetime(2012, 12, 12, 14, 7, 48),
                  'ex:intstr': Literal("PROV Internationalized string",
                                       PROV["InternationalizedString"], "en"),
                  }
    multiline = """Line1
Line2
Line3"""
    attributes['ex:multi-line'] = multiline
    g.entity('ex:e1', attributes)
    return g
def test_cmip6_data_citation(tmp_path, monkeypatch):
    """Test2: CMIP6 citation info is retrieved from ES-DOC."""
    # Create fake provenance
    provenance = ProvDocument()
    provenance.add_namespace('file', uri=ESMVALTOOL_URI_PREFIX + 'file')
    provenance.add_namespace('attribute',
                             uri=ESMVALTOOL_URI_PREFIX + 'attribute')
    attributes = {
        'attribute:mip_era': 'CMIP6',
        'attribute:activity_id': 'activity',
        'attribute:institution_id': 'institution',
        'attribute:source_id': 'source',
        'attribute:experiment_id': 'experiment',
    }
    filename = str(tmp_path / 'output.nc')
    provenance.entity('file:' + filename, attributes)

    monkeypatch.setattr(esmvalcore._citation, '_get_response',
                        mock_get_response)
    _write_citation_files(filename, provenance)
    citation_file = tmp_path / 'output_citation.bibtex'

    # Create fake bibtex entry
    url = 'url not found'
    title = 'title is found'
    publisher = 'publisher not found'
    year = 'publicationYear not found'
    authors = 'creators not found'
    doi = 'doi not found'
    fake_bibtex_entry = textwrap.dedent(f"""
        @misc{{{url},
        \turl = {{{url}}},
        \ttitle = {{{title}}},
        \tpublisher = {{{publisher}}},
        \tyear = {year},
        \tauthor = {{{authors}}},
        \tdoi = {{{doi}}},
        }}
        """).lstrip()
    assert citation_file.read_text() == '\n'.join(
        [ESMVALTOOL_PAPER, fake_bibtex_entry])
def insert_document_with_bundles(instance):
    args_record = base_connector_record_parameter_example()
    args_bundle = base_connector_bundle_parameter_example()
    doc = ProvDocument()
    doc.add_namespace("ex", "http://example.com")

    # document with 1 record
    doc_id = instance.save_document()
    doc_record_id = instance.save_record(doc_id, args_record["attributes"],
                                         args_record["metadata"])

    # bundle with 1 record
    bundle_id = instance.save_bundle(doc_id, args_bundle["attributes"],
                                     args_bundle["metadata"])
    bundle_record_id = instance.save_record(bundle_id,
                                            args_record["attributes"],
                                            args_record["metadata"])

    # add relation
    from_record_args = base_connector_record_parameter_example()
    to_record_args = base_connector_record_parameter_example()
    relation_args = base_connector_relation_parameter_example()

    from_label = doc.valid_qualified_name("ex:FROM NODE")
    to_label = doc.valid_qualified_name("ex:TO NODE")
    from_record_args["metadata"][METADATA_KEY_IDENTIFIER] = from_label
    to_record_args["metadata"][METADATA_KEY_IDENTIFIER] = to_label

    from_record_id = instance.save_record(doc_id,
                                          from_record_args["attributes"],
                                          from_record_args["metadata"])
    to_record_id = instance.save_record(doc_id, to_record_args["attributes"],
                                        to_record_args["metadata"])
    relation_id = instance.save_relation(doc_id, from_label, doc_id, to_label,
                                         relation_args["attributes"],
                                         relation_args["metadata"])

    return {
        "relation_id": relation_id,
        "from_record_id": from_record_id,
        "to_record_id": to_record_id,
        "bundle_id": bundle_id,
        "bundle_record_id": bundle_record_id,
        "doc_id": doc_id,
        "doc_record_id": doc_record_id
    }
def bundles2():
    # https://github.com/lucmoreau/ProvToolbox/blob/master/prov-n/src/test/resources/prov/bundles2.provn
    #===========================================================================
    # document
    g = ProvDocument()

    # prefix ex <http://example.org/example/>
    g.add_namespace("ex", "http://www.example.com/")

    # prefix alice <http://example.org/alice/>
    # prefix bob <http://example.org/bob/>
    g.add_namespace('alice', 'http://example.org/alice/')
    g.add_namespace('bob', 'http://example.org/bob/')

    # entity(bob:bundle4, [prov:type='prov:Bundle'])
    # wasGeneratedBy(bob:bundle4, -, 2012-05-24T10:30:00)
    # agent(ex:Bob)
    # wasAttributedTo(bob:bundle4, ex:Bob)
    g.entity('bob:bundle4', {'prov:type': PROV['Bundle']})
    g.wasGeneratedBy('bob:bundle4', time='2012-05-24T10:30:00')
    g.agent('ex:Bob')
    g.wasAttributedTo('bob:bundle4', 'ex:Bob')

    # entity(alice:bundle5, [ prov:type='prov:Bundle' ])
    # wasGeneratedBy(alice:bundle5, -, 2012-05-25T11:15:00)
    # agent(ex:Alice)
    # wasAttributedTo(alice:bundle5, ex:Alice)
    g.entity('alice:bundle5', {'prov:type': PROV['Bundle']})
    g.wasGeneratedBy('alice:bundle5', time='2012-05-25T11:15:00')
    g.agent('ex:Alice')
    g.wasAttributedTo('alice:bundle5', 'ex:Alice')

    # bundle bob:bundle4
    #     entity(ex:report1, [ prov:type="report", ex:version=1 ])
    #     wasGeneratedBy(ex:report1, -, 2012-05-24T10:00:01)
    # endBundle
    b4 = g.bundle('bob:bundle4')
    b4.entity('ex:report1', {'prov:type': "report", 'ex:version': 1})
    b4.wasGeneratedBy('ex:report1', time='2012-05-24T10:00:01')

    # bundle alice:bundle5
    #     entity(ex:report1bis)
    #     mentionOf(ex:report1bis, ex:report1, bob:bundle4)
    #     entity(ex:report2, [ prov:type="report", ex:version=2 ])
    #     wasGeneratedBy(ex:report2, -, 2012-05-25T11:00:01)
    #     wasDerivedFrom(ex:report2, ex:report1bis)
    # endBundle
    b5 = g.bundle('alice:bundle5')
    b5.entity('ex:report1bis')
    b5.mentionOf('ex:report1bis', 'ex:report1', 'bob:bundle4')
    b5.entity('ex:report2', [('prov:type', "report"), ('ex:version', 2)])
    b5.wasGeneratedBy('ex:report2', time='2012-05-25T11:00:01')
    b5.wasDerivedFrom('ex:report2', 'ex:report1bis')

    # endDocument
    return g
def bundles2():
    # https://github.com/lucmoreau/ProvToolbox/blob/master/prov-n/src/test/resources/prov/bundles2.provn
    # ===========================================================================
    # document
    g = ProvDocument()

    # prefix ex <http://example.org/example/>
    g.add_namespace("ex", "http://www.example.com/")

    # prefix alice <http://example.org/alice/>
    # prefix bob <http://example.org/bob/>
    g.add_namespace("alice", "http://example.org/alice/")
    g.add_namespace("bob", "http://example.org/bob/")

    # entity(bob:bundle4, [prov:type='prov:Bundle'])
    # wasGeneratedBy(bob:bundle4, -, 2012-05-24T10:30:00)
    # agent(ex:Bob)
    # wasAttributedTo(bob:bundle4, ex:Bob)
    g.entity("bob:bundle4", {"prov:type": PROV["Bundle"]})
    g.wasGeneratedBy("bob:bundle4", time="2012-05-24T10:30:00")
    g.agent("ex:Bob")
    g.wasAttributedTo("bob:bundle4", "ex:Bob")

    # entity(alice:bundle5, [ prov:type='prov:Bundle' ])
    # wasGeneratedBy(alice:bundle5, -, 2012-05-25T11:15:00)
    # agent(ex:Alice)
    # wasAttributedTo(alice:bundle5, ex:Alice)
    g.entity("alice:bundle5", {"prov:type": PROV["Bundle"]})
    g.wasGeneratedBy("alice:bundle5", time="2012-05-25T11:15:00")
    g.agent("ex:Alice")
    g.wasAttributedTo("alice:bundle5", "ex:Alice")

    # bundle bob:bundle4
    #     entity(ex:report1, [ prov:type="report", ex:version=1 ])
    #     wasGeneratedBy(ex:report1, -, 2012-05-24T10:00:01)
    # endBundle
    b4 = g.bundle("bob:bundle4")
    b4.entity("ex:report1", {"prov:type": "report", "ex:version": 1})
    b4.wasGeneratedBy("ex:report1", time="2012-05-24T10:00:01")

    # bundle alice:bundle5
    #     entity(ex:report1bis)
    #     mentionOf(ex:report1bis, ex:report1, bob:bundle4)
    #     entity(ex:report2, [ prov:type="report", ex:version=2 ])
    #     wasGeneratedBy(ex:report2, -, 2012-05-25T11:00:01)
    #     wasDerivedFrom(ex:report2, ex:report1bis)
    # endBundle
    b5 = g.bundle("alice:bundle5")
    b5.entity("ex:report1bis")
    b5.mentionOf("ex:report1bis", "ex:report1", "bob:bundle4")
    b5.entity("ex:report2", [("prov:type", "report"), ("ex:version", 2)])
    b5.wasGeneratedBy("ex:report2", time="2012-05-25T11:00:01")
    b5.wasDerivedFrom("ex:report2", "ex:report1bis")

    # endDocument
    return g
def bundles1():
    # https://github.com/lucmoreau/ProvToolbox/blob/master/prov-n/src/test/resources/prov/bundles1.provn
    # ===============================================================================
    # document
    g = ProvDocument()

    # prefix ex <http://example.org/example/>
    EX = Namespace("ex", "http://www.example.com/")
    g.add_namespace(EX)

    # prefix alice <http://example.org/alice/>
    # prefix bob <http://example.org/bob/>
    g.add_namespace("alice", "http://example.org/alice/")
    g.add_namespace("bob", "http://example.org/bob/")

    # entity(bob:bundle1, [prov:type='prov:Bundle'])
    g.entity("bob:bundle1", {"prov:type": PROV["Bundle"]})
    # wasGeneratedBy(bob:bundle1, -, 2012-05-24T10:30:00)
    g.wasGeneratedBy("bob:bundle1", time="2012-05-24T10:30:00")
    # agent(ex:Bob)
    g.agent("ex:Bob")
    # wasAttributedTo(bob:bundle1, ex:Bob)
    g.wasAttributedTo("bob:bundle1", "ex:Bob")

    # entity(alice:bundle2, [ prov:type='prov:Bundle' ])
    g.entity("alice:bundle2", {"prov:type": PROV["Bundle"]})
    # wasGeneratedBy(alice:bundle2, -, 2012-05-25T11:15:00)
    g.wasGeneratedBy("alice:bundle2", time="2012-05-25T11:15:00")
    # agent(ex:Alice)
    g.agent("ex:Alice")
    # wasAttributedTo(alice:bundle2, ex:Alice)
    g.wasAttributedTo("alice:bundle2", "ex:Alice")

    # bundle bob:bundle1
    b1 = g.bundle("bob:bundle1")
    # entity(ex:report1, [ prov:type="report", ex:version=1 ])
    b1.entity("ex:report1", {"prov:type": "report", "ex:version": 1})
    # wasGeneratedBy(ex:report1, -, 2012-05-24T10:00:01)
    b1.wasGeneratedBy("ex:report1", time="2012-05-24T10:00:01")
    # endBundle

    # bundle alice:bundle2
    b2 = g.bundle("alice:bundle2")
    # entity(ex:report1)
    b2.entity("ex:report1")
    # entity(ex:report2, [ prov:type="report", ex:version=2 ])
    b2.entity("ex:report2", {"prov:type": "report", "ex:version": 2})
    # wasGeneratedBy(ex:report2, -, 2012-05-25T11:00:01)
    b2.wasGeneratedBy("ex:report2", time="2012-05-25T11:00:01")
    # wasDerivedFrom(ex:report2, ex:report1)
    b2.wasDerivedFrom("ex:report2", "ex:report1")
    # endBundle

    # endDocument
    return g
def bundles1():
    # https://github.com/lucmoreau/ProvToolbox/blob/master/prov-n/src/test/resources/prov/bundles1.provn
    #===============================================================================
    # document
    g = ProvDocument()

    # prefix ex <http://example.org/example/>
    EX = Namespace("ex", "http://www.example.com/")
    g.add_namespace(EX)

    # prefix alice <http://example.org/alice/>
    # prefix bob <http://example.org/bob/>
    g.add_namespace('alice', 'http://example.org/alice/')
    g.add_namespace('bob', 'http://example.org/bob/')

    # entity(bob:bundle1, [prov:type='prov:Bundle'])
    g.entity('bob:bundle1', {'prov:type': PROV['Bundle']})
    # wasGeneratedBy(bob:bundle1, -, 2012-05-24T10:30:00)
    g.wasGeneratedBy('bob:bundle1', time='2012-05-24T10:30:00')
    # agent(ex:Bob)
    g.agent('ex:Bob')
    # wasAttributedTo(bob:bundle1, ex:Bob)
    g.wasAttributedTo('bob:bundle1', 'ex:Bob')

    # entity(alice:bundle2, [ prov:type='prov:Bundle' ])
    g.entity('alice:bundle2', {'prov:type': PROV['Bundle']})
    # wasGeneratedBy(alice:bundle2, -, 2012-05-25T11:15:00)
    g.wasGeneratedBy('alice:bundle2', time='2012-05-25T11:15:00')
    # agent(ex:Alice)
    g.agent('ex:Alice')
    # wasAttributedTo(alice:bundle2, ex:Alice)
    g.wasAttributedTo('alice:bundle2', 'ex:Alice')

    # bundle bob:bundle1
    b1 = g.bundle('bob:bundle1')
    # entity(ex:report1, [ prov:type="report", ex:version=1 ])
    b1.entity('ex:report1', {'prov:type': "report", 'ex:version': 1})
    # wasGeneratedBy(ex:report1, -, 2012-05-24T10:00:01)
    b1.wasGeneratedBy('ex:report1', time='2012-05-24T10:00:01')
    # endBundle

    # bundle alice:bundle2
    b2 = g.bundle('alice:bundle2')
    # entity(ex:report1)
    b2.entity('ex:report1')
    # entity(ex:report2, [ prov:type="report", ex:version=2 ])
    b2.entity('ex:report2', {'prov:type': "report", 'ex:version': 2})
    # wasGeneratedBy(ex:report2, -, 2012-05-25T11:00:01)
    b2.wasGeneratedBy('ex:report2', time='2012-05-25T11:00:01')
    # wasDerivedFrom(ex:report2, ex:report1)
    b2.wasDerivedFrom('ex:report2', 'ex:report1')
    # endBundle

    # endDocument
    return g
def transform_to_prov(context_model):
    from prov.model import ProvDocument
    from prov.dot import prov_to_dot
    doc = ProvDocument()
    doc.add_namespace('is', 'http://www.provbook.org/nownews/is/#')
    doc.add_namespace('void', 'http://vocab.deri.ie/void#')
    doc.add_namespace('nowpeople', 'http://www.provbook.org/nownews/people/')

    input_data = doc.entity("void:Inputdata")
    backend_agent = doc.agent("nowpeople:EODC")
    user_agent = doc.agent("nowpeople:OpenEO-User")
    doc.wasAttributedTo(input_data, backend_agent)

    process_details = context_model["process_details"]

    prev_key = input_data
    for key in process_details:
        key_entity = doc.entity("void:" + key + "_output")
        key_activity = doc.activity('is:' + key)
        doc.used(key_activity, prev_key)
        doc.wasDerivedFrom(key_entity, prev_key)
        doc.wasGeneratedBy(key_entity, key_activity,
                           time=process_details[key]["timing"]["end"])
        doc.wasStartedBy(key_activity, user_agent,
                         time=process_details[key]["timing"]["start"])
        prev_key = key_entity

    dot = prov_to_dot(doc)
    dot.write_png('output-prov.png')
    return doc
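# Hypothetical usage sketch for transform_to_prov above (not from the original
# code): the process names and timestamps are invented, and only the
# "process_details"/"timing" keys that the function actually reads are assumed.
# Writing the PNG additionally requires pydot/Graphviz, which prov.dot uses.
example_context_model = {
    "process_details": {
        "load_collection": {"timing": {"start": "2020-01-01T10:00:00",
                                       "end": "2020-01-01T10:01:00"}},
        "ndvi": {"timing": {"start": "2020-01-01T10:01:00",
                            "end": "2020-01-01T10:02:00"}},
    }
}
example_doc = transform_to_prov(example_context_model)  # also writes 'output-prov.png'
print(example_doc.get_provn())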
def user_provenance(self, document: ProvDocument) -> None:
    """Add the user provenance."""
    self.self_check()
    (username, fullname) = _whoami()

    if not self.full_name:
        self.full_name = fullname

    document.add_namespace(UUID)
    document.add_namespace(ORCID)
    document.add_namespace(FOAF)
    account = document.agent(
        ACCOUNT_UUID,
        {
            provM.PROV_TYPE: FOAF["OnlineAccount"],
            "prov:label": username,
            FOAF["accountName"]: username,
        },
    )

    user = document.agent(
        self.orcid or USER_UUID,
        {
            provM.PROV_TYPE: PROV["Person"],
            "prov:label": self.full_name,
            FOAF["name"]: self.full_name,
            FOAF["account"]: account,
        },
    )
    # cwltool may be started on the shell (directly by user),
    # by shell script (indirectly by user)
    # or from a different program
    # (which again is launched by any of the above)
    #
    # We can't tell in which way, but ultimately we're still
    # acting on behalf of that user (even if we might
    # get their name wrong!)
    document.actedOnBehalfOf(account, user)
def get_base_prov_document():
    d = ProvDocument()
    d.add_namespace("invenio", "http://example.org/invenio/")
    return d
def toW3Cprov(ling, bundl, format='w3c-prov-xml'):
    g = ProvDocument()
    vc = Namespace("knmi", "http://knmi.nl")  # namespaces do not need to be explicitly added to a document
    con = Namespace("dfp", "http://dispel4py.org")
    g.add_namespace("dcterms", "http://purl.org/dc/terms/")

    'specify bundle'
    bundle = None
    for trace in bundl:
        'specifying user'
        # first time the namespace is used, it is added to the document automatically
        ag = g.agent(vc[trace["username"]],
                     other_attributes={"dcterms:author": trace["username"]})
        if trace['type'] == 'workflow_run':
            trace.update({'runId': trace['_id']})
            bundle = g.bundle(vc[trace["runId"]])
            bundle.actedOnBehalfOf(vc[trace["runId"]], vc[trace["username"]])

            dic = {}
            i = 0
            for key in trace:
                if key != "input":
                    if ':' in key:
                        dic.update({key: trace[key]})
                    else:
                        dic.update({vc[key]: trace[key]})
            dic.update({'prov:type': PROV['Bundle']})
            g.entity(vc[trace["runId"]], dic)

            dic = {}
            i = 0
            if type(trace['input']) != list:
                trace['input'] = [trace['input']]
            for y in trace['input']:
                for key in y:
                    if ':' in key:
                        dic.update({key: y[key]})
                    else:
                        dic.update({vc[key]: y[key]})
                dic.update({'prov:type': 'worklfow_input'})
                bundle.entity(vc[trace["_id"] + "_" + str(i)], dic)
                bundle.used(vc[trace["_id"]], vc[trace["_id"] + "_" + str(i)],
                            identifier=vc["used_" + trace["_id"] + "_" + str(i)])
                i = i + 1

    'specify lineage'
    for trace in ling:
        # pprint(trace)
        try:
            bundle = g.bundle(vc[trace["runId"]])
            bundle.wasAttributedTo(vc[trace["runId"]],
                                   vc["ag_" + trace["username"]],
                                   identifier=vc["attr_" + trace["runId"]])
        except:
            pass

        'specifying creator of the activity (to be collected from the registry)'
        if 'creator' in trace:
            bundle.agent(vc["ag_" + trace["creator"]],
                         other_attributes={"dcterms:creator": trace["creator"]})
            bundle.wasAssociatedWith('process_' + trace["iterationId"],
                                     vc["ag_" + trace["creator"]])
            bundle.wasAttributedTo(vc[trace["runId"]], vc["ag_" + trace["creator"]])

        'adding activity information for lineage'
        dic = {}
        for key in trace:
            if type(trace[key]) != list:
                if ':' in key:
                    dic.update({key: trace[key]})
                else:
                    if key == 'location':
                        dic.update({"prov:location": trace[key]})
                    else:
                        dic.update({vc[key]: trace[key]})
        # add the prov:type before passing the attributes (dict.update returns None)
        dic.update({'prov:type': trace["name"]})
        bundle.activity(vc["process_" + trace["iterationId"]],
                        trace["startTime"], trace["endTime"], dic)

        'adding parameters to the document as input entities'
        dic = {}
        for x in trace["parameters"]:
            # print(x)
            if ':' in x["key"]:
                dic.update({x["key"]: x["val"]})
            else:
                dic.update({vc[x["key"]]: x["val"]})
        dic.update({'prov:type': 'parameters'})
        bundle.entity(vc["parameters_" + trace["instanceId"]], dic)
        bundle.used(vc['process_' + trace["iterationId"]],
                    vc["parameters_" + trace["instanceId"]],
                    identifier=vc["used_" + trace["iterationId"]])

        'adding input dependencies to the document as input entities'
        dic = {}
        for x in trace["derivationIds"]:
            'state could be added'
            # dic.update({'prov:type': 'parameters'})
            bundle.used(vc['process_' + trace["iterationId"]],
                        vc[x["DerivedFromDatasetID"]],
                        identifier=vc["used_" + x["DerivedFromDatasetID"]])

        'adding entities to the document as output metadata'
        for x in trace["streams"]:
            i = 0
            parent_dic = {}
            for key in x:
                if key == 'con:immediateAccess':
                    parent_dic.update({vc['immediateAccess']: x[key]})
                elif key == 'location':
                    parent_dic.update({"prov:location": str(x[key])})
                else:
                    parent_dic.update({vc[key]: str(x[key])})

            c1 = bundle.collection(vc[x["id"]], other_attributes=parent_dic)
            bundle.wasGeneratedBy(vc[x["id"]],
                                  vc["process_" + trace["iterationId"]],
                                  identifier=vc["wgb_" + x["id"]])
            for d in trace['derivationIds']:
                bundle.wasDerivedFrom(vc[x["id"]], vc[d['DerivedFromDatasetID']],
                                      identifier=vc["wdf_" + x["id"]])

            for y in x["content"]:
                dic = {}
                if isinstance(y, dict):
                    val = None
                    for key in y:
                        try:
                            val = num(y[key])
                        except Exception as e:
                            val = str(y[key])
                        if ':' in key:
                            dic.update({key: val})
                        else:
                            dic.update({vc[key]: val})
                else:
                    dic = {vc['text']: y}

                dic.update({"verce:parent_entity": vc["data_" + x["id"]]})
                print(x["id"])
                print(str(i))
                print(dic)
                e1 = bundle.entity(vc["data_" + x["id"] + "_" + str(i)], dic)
                bundle.hadMember(c1, e1)
                bundle.wasGeneratedBy(vc["data_" + x["id"] + "_" + str(i)],
                                      vc["process_" + trace["iterationId"]],
                                      identifier=vc["wgb_" + x["id"] + "_" + str(i)])
                for d in trace['derivationIds']:
                    bundle.wasDerivedFrom(vc["data_" + x["id"] + "_" + str(i)],
                                          vc[d['DerivedFromDatasetID']],
                                          identifier=vc["wdf_" + "data_" + x["id"] + "_" + str(i)])
                i = i + 1
def write_targets_prov(self, tlist, C, bundle_id):
    # Initialisation
    # cs = b.agent('CrowdScanner')
    if self.document_id == -1:
        d = ProvDocument()
        d.add_namespace(AO)
        d.set_default_namespace(self.defaultns % self.game_id)
        if uploadprov:
            provstore_document = self.api.document.create(
                d, name="Operation%s CrowdScanner" % self.game_id, public=True)
            document_uri = provstore_document.url
            logging.info("prov doc URI: " + str(document_uri))
            self.provfilelist.append(provstore_document.id)
            self.savelocalrecord()
            self.document_id = provstore_document.id

    b = ProvDocument()  # Create a new document for this update
    b.add_namespace(AO)
    b.set_default_namespace(self.defaultns % self.game_id)

    # cs to be used with all targets
    cs = b.agent('agent/CrowdScanner',
                 (('prov:type', AO['IBCCAlgo']), ('prov:type', PROV['SoftwareAgent'])))

    timestamp = time.time()  # Record the timestamp at each update to generate unique identifiers
    startTime = datetime.datetime.fromtimestamp(timestamp)
    endTime = startTime
    activity = b.activity('activity/cs/update_report_%s' % timestamp, startTime, endTime)
    activity.wasAssociatedWith(cs)

    # Add target and report entities
    for i, tdata in enumerate(tlist):
        if self.changedtargets[i] == 0:
            continue

        # Target entity for target i
        tid = int(tdata[0])
        x = tdata[1]
        y = tdata[2]
        # targettype = tdata[3]  # don't record here, it will be revealed and recorded by UAVs
        v = int(tdata[4])
        agentids = tdata[7]

        targetattributes = {'ao:longitude': x, 'ao:latitude': y, }
        # 'ao:asset_type': str(targettype)}
        target_v0 = b.entity('cs/target/' + str(tid) + '.' + str(v), targetattributes)
        # Post the root report if this is the first version
        if v == 0:
            self.targets[tid] = b.entity('cs/target/' + str(tid))
        else:
            try:
                target_v0.wasDerivedFrom(self.targetversions[tid])
            except KeyError:
                logging.error("Got a key error for key " + str(tid) +
                              ', which is supposed to be version' + str(v))
        self.targetversions[tid] = target_v0
        target_v0.specializationOf(self.targets[tid])
        target_v0.wasAttributedTo(cs)

        # Report entities for origins of target i
        for j, r in enumerate(self.target_rep_ids[i]):
            if r not in self.postedreports:
                Crow = C[r, :]
                x = Crow[1]
                y = Crow[2]
                reptext = tdata[5][j].decode('utf8')
                # Try to replace unusual characters
                reptext = reptext.encode('ascii', 'replace')
                agentid = agentids[j]

                reporter_name = 'agent/crowdreporter%s' % agentid
                b.agent(reporter_name,
                        (('prov:type', AO['CrowdReporter']), ('prov:type', PROV['Person'])))

                reportattributes = {'ao:longitude': x, 'ao:latitude': y, 'ao:report': reptext}

                self.postedreports[r] = b.entity('cs/report/' + str(r), reportattributes)
                self.postedreports[r].wasAttributedTo(reporter_name)
            activity.used(self.postedreports[r])
            target_v0.wasDerivedFrom(self.postedreports[r])

    if uploadprov:
        # Invalidate old targets no longer in use
        for i, tid in enumerate(self.targets_to_invalidate):
            target_v = self.targetversions[tid]
            b.wasInvalidatedBy(target_v, activity)
        # Post the document to the server
        # bundle = b.bundle('crowd_scanner')
        bundle_id = 'bundle/csupdate/%s' % timestamp
        self.api.add_bundle(self.document_id, b.serialize(), bundle_id)
def computeDT(self, aRO):
    aDT = ProvDocument()
    aDT.add_namespace("dt", "http://cs.ncl.ac.uk/dtsim/")
    self.recComputeDT(aRO, aDT)
    # print "final DT: {dt}".format(dt=aDT.get_provn())
    return aDT
def primer_example():
    # https://github.com/lucmoreau/ProvToolbox/blob/master/prov-n/src/test/resources/prov/primer.pn
    # ===========================================================================
    # document
    g = ProvDocument()

    # prefix ex <http://example/>
    # prefix dcterms <http://purl.org/dc/terms/>
    # prefix foaf <http://xmlns.com/foaf/0.1/>
    ex = Namespace(
        "ex", "http://example/"
    )  # namespaces do not need to be explicitly added to a document
    g.add_namespace("dcterms", "http://purl.org/dc/terms/")
    g.add_namespace("foaf", "http://xmlns.com/foaf/0.1/")

    # entity(ex:article, [dcterms:title="Crime rises in cities"])
    # first time the ex namespace was used, it is added to the document automatically
    g.entity(ex["article"], {"dcterms:title": "Crime rises in cities"})
    # entity(ex:articleV1)
    g.entity(ex["articleV1"])
    # entity(ex:articleV2)
    g.entity(ex["articleV2"])
    # entity(ex:dataSet1)
    g.entity(ex["dataSet1"])
    # entity(ex:dataSet2)
    g.entity(ex["dataSet2"])
    # entity(ex:regionList)
    g.entity(ex["regionList"])
    # entity(ex:composition)
    g.entity(ex["composition"])
    # entity(ex:chart1)
    g.entity(ex["chart1"])
    # entity(ex:chart2)
    g.entity(ex["chart2"])
    # entity(ex:blogEntry)
    g.entity(ex["blogEntry"])

    # activity(ex:compile)
    g.activity("ex:compile")  # since ex is registered, it can be used like this
    # activity(ex:compile2)
    g.activity("ex:compile2")
    # activity(ex:compose)
    g.activity("ex:compose")
    # activity(ex:correct, 2012-03-31T09:21:00, 2012-04-01T15:21:00)
    g.activity("ex:correct", "2012-03-31T09:21:00",
               "2012-04-01T15:21:00")  # date time can be provided as strings
    # activity(ex:illustrate)
    g.activity("ex:illustrate")

    # used(ex:compose, ex:dataSet1, -, [ prov:role = "ex:dataToCompose"])
    g.used("ex:compose", "ex:dataSet1",
           other_attributes={"prov:role": "ex:dataToCompose"})
    # used(ex:compose, ex:regionList, -, [ prov:role = "ex:regionsToAggregateBy"])
    g.used(
        "ex:compose",
        "ex:regionList",
        other_attributes={"prov:role": "ex:regionsToAggregateBy"},
    )
    # wasGeneratedBy(ex:composition, ex:compose, -)
    g.wasGeneratedBy("ex:composition", "ex:compose")

    # used(ex:illustrate, ex:composition, -)
    g.used("ex:illustrate", "ex:composition")
    # wasGeneratedBy(ex:chart1, ex:illustrate, -)
    g.wasGeneratedBy("ex:chart1", "ex:illustrate")

    # wasGeneratedBy(ex:chart1, ex:compile, 2012-03-02T10:30:00)
    g.wasGeneratedBy("ex:chart1", "ex:compile", "2012-03-02T10:30:00")
    # wasGeneratedBy(ex:chart2, ex:compile2, 2012-04-01T15:21:00)
    #
    # agent(ex:derek, [ prov:type="prov:Person", foaf:givenName = "Derek",
    #        foaf:mbox= "<mailto:[email protected]>"])
    g.agent(
        "ex:derek",
        {
            "prov:type": PROV["Person"],
            "foaf:givenName": "Derek",
            "foaf:mbox": "<mailto:[email protected]>",
        },
    )
    # wasAssociatedWith(ex:compose, ex:derek, -)
    g.wasAssociatedWith("ex:compose", "ex:derek")
    # wasAssociatedWith(ex:illustrate, ex:derek, -)
    g.wasAssociatedWith("ex:illustrate", "ex:derek")
    #
    # agent(ex:chartgen, [ prov:type="prov:Organization",
    #        foaf:name = "Chart Generators Inc"])
    g.agent(
        "ex:chartgen",
        {
            "prov:type": PROV["Organization"],
            "foaf:name": "Chart Generators Inc"
        },
    )
    # actedOnBehalfOf(ex:derek, ex:chartgen, ex:compose)
    g.actedOnBehalfOf("ex:derek", "ex:chartgen", "ex:compose")
    # wasAttributedTo(ex:chart1, ex:derek)
    g.wasAttributedTo("ex:chart1", "ex:derek")

    # wasGeneratedBy(ex:dataSet2, ex:correct, -)
    g.wasGeneratedBy("ex:dataSet2", "ex:correct")
    # used(ex:correct, ex:dataSet1, -)
    g.used("ex:correct", "ex:dataSet1")
    # wasDerivedFrom(ex:dataSet2, ex:dataSet1, [prov:type='prov:Revision'])
    g.wasDerivedFrom("ex:dataSet2", "ex:dataSet1",
                     other_attributes={"prov:type": PROV["Revision"]})
    # wasDerivedFrom(ex:chart2, ex:dataSet2)
    g.wasDerivedFrom("ex:chart2", "ex:dataSet2")
    # wasDerivedFrom(ex:blogEntry, ex:article, [prov:type='prov:Quotation'])
    g.wasDerivedFrom("ex:blogEntry", "ex:article",
                     other_attributes={"prov:type": PROV["Quotation"]})

    # specializationOf(ex:articleV1, ex:article)
    g.specializationOf("ex:articleV1", "ex:article")
    # wasDerivedFrom(ex:articleV1, ex:dataSet1)
    g.wasDerivedFrom("ex:articleV1", "ex:dataSet1")
    # specializationOf(ex:articleV2, ex:article)
    g.specializationOf("ex:articleV2", "ex:article")
    # wasDerivedFrom(ex:articleV2, ex:dataSet2)
    g.wasDerivedFrom("ex:articleV2", "ex:dataSet2")
    # alternateOf(ex:articleV2, ex:articleV1)
    g.alternateOf("ex:articleV2", "ex:articleV1")

    # endDocument
    return g
def useGenDependency(self, aDO, usedList, genList, throughActivity):
    aID = throughActivity.id
    # create provlet
    d1 = ProvDocument()  # d1 is now an empty provenance document
    d1.add_namespace("dt", "http://cs.ncl.ac.uk/dtsim/")

    usedEntities = []
    for aRO in usedList:
        usedEntities.append(d1.entity(DTns + aRO.id))
    genEntities = []
    for aRO1 in genList:
        genEntities.append(d1.entity(DTns + aRO1.id))
    a = d1.activity(DTns + aID)
    ag1 = d1.agent(DTns + str(aDO.id))
    d1.wasAssociatedWith(a, ag1)
    for ue in usedEntities:
        d1.used(a, ue)
    for gene in genEntities:
        d1.wasAttributedTo(gene, ag1)
        d1.wasGeneratedBy(gene, a)

    # associate this provlet to each generated RO
    for aRO1 in genList:
        aRO1.provlet = d1
        print("event {n}: DO {do}: {ro1} <- wgby <- {act} <- used {ro}".format(
            n=currentReuseCount, do=aDO.id, ro1=aRO1.id, act=aID, ro=aRO.id))

    for genRO in genList:
        for uRO in usedList:
            # update upstream pointer
            genRO.upstream.append(
                (uRO, throughActivity)
            )  # dep on uRO through activity aID  FIXME URGENTLY!!! not designed for M-M
    for uRO in usedList:
        for genRO in genList:
            # update downstream
            uRO.downstream.append((genRO, throughActivity))  # genRO is downstream from uRO through activity aID

    # update global graph
    globalUsedEntities = []
    for aRO in usedList:
        globalUsedEntities.append(pGlobal.entity(DTns + aRO.id))
    globalGenEntities = []
    for aR1 in genList:
        globalGenEntities.append(pGlobal.entity(DTns + aR1.id))
    a = pGlobal.activity(DTns + aID)
    ag1 = pGlobal.agent(DTns + str(aDO.id))
    pGlobal.wasAssociatedWith(a, ag1)
    for ue in globalUsedEntities:
        pGlobal.used(a, ue)
    for gene in globalGenEntities:
        pGlobal.wasAttributedTo(gene, ag1)
        pGlobal.wasGeneratedBy(gene, a)

    # trigger credit recomputation
    # each used RO needs its credit updated with aRO1.credit for each generated aRO1 through activity aID
    aCreditManager.addGenerationCredit(usedList, genList, throughActivity)

    # self.notify(d1)
    return d1
def w3c_publication_1():
    # https://github.com/lucmoreau/ProvToolbox/blob/master/asn/src/test/resources/prov/w3c-publication1.prov-asn
    # ===========================================================================
    # bundle
    #
    # prefix ex <http://example.org/>
    #
    # prefix w3 <http://www.w3.org/>
    # prefix tr <http://www.w3.org/TR/2011/>
    # prefix process <http://www.w3.org/2005/10/Process-20051014/tr.html#>
    # prefix email <https://lists.w3.org/Archives/Member/w3c-archive/>
    # prefix chairs <https://lists.w3.org/Archives/Member/chairs/>
    # prefix trans <http://www.w3.org/2005/08/01-transitions.html#>
    # prefix rec54 <http://www.w3.org/2001/02pd/rec54#>
    #
    # entity(tr:WD-prov-dm-20111018, [ prov:type='rec54:WD' ])
    # entity(tr:WD-prov-dm-20111215, [ prov:type='rec54:WD' ])
    # entity(process:rec-advance, [ prov:type='prov:Plan' ])
    #
    # entity(chairs:2011OctDec/0004, [ prov:type='trans:transreq' ])
    # entity(email:2011Oct/0141, [ prov:type='trans:pubreq' ])
    # entity(email:2011Dec/0111, [ prov:type='trans:pubreq' ])
    #
    # wasDerivedFrom(tr:WD-prov-dm-20111215, tr:WD-prov-dm-20111018)
    #
    # activity(ex:act1,-,-,[prov:type="publish"])
    # activity(ex:act2,-,-,[prov:type="publish"])
    #
    # wasGeneratedBy(tr:WD-prov-dm-20111018, ex:act1, -)
    # wasGeneratedBy(tr:WD-prov-dm-20111215, ex:act2, -)
    #
    # used(ex:act1, chairs:2011OctDec/0004, -)
    # used(ex:act1, email:2011Oct/0141, -)
    # used(ex:act2, email:2011Dec/0111, -)
    #
    # agent(w3:Consortium, [ prov:type='prov:Organization' ])
    #
    # wasAssociatedWith(ex:act1, w3:Consortium, process:rec-advance)
    # wasAssociatedWith(ex:act2, w3:Consortium, process:rec-advance)
    #
    # endBundle
    # ===========================================================================
    g = ProvDocument()

    g.add_namespace("ex", "http://example.org/")
    g.add_namespace("w3", "http://www.w3.org/")
    g.add_namespace("tr", "http://www.w3.org/TR/2011/")
    g.add_namespace("process",
                    "http://www.w3.org/2005/10/Process-20051014/tr.html#")
    g.add_namespace("email", "https://lists.w3.org/Archives/Member/w3c-archive/")
    g.add_namespace("chairs", "https://lists.w3.org/Archives/Member/chairs/")
    g.add_namespace("trans", "http://www.w3.org/2005/08/01-transitions.html#")
    g.add_namespace("rec54", "http://www.w3.org/2001/02pd/rec54#")

    g.entity("tr:WD-prov-dm-20111018", {"prov:type": "rec54:WD"})
    g.entity("tr:WD-prov-dm-20111215", {"prov:type": "rec54:WD"})
    g.entity("process:rec-advance", {"prov:type": "prov:Plan"})

    g.entity("chairs:2011OctDec/0004", {"prov:type": "trans:transreq"})
    g.entity("email:2011Oct/0141", {"prov:type": "trans:pubreq"})
    g.entity("email:2011Dec/0111", {"prov:type": "trans:pubreq"})

    g.wasDerivedFrom("tr:WD-prov-dm-20111215", "tr:WD-prov-dm-20111018")

    g.activity("ex:act1", other_attributes={"prov:type": "publish"})
    g.activity("ex:act2", other_attributes={"prov:type": "publish"})

    g.wasGeneratedBy("tr:WD-prov-dm-20111018", "ex:act1")
    g.wasGeneratedBy("tr:WD-prov-dm-20111215", "ex:act2")

    g.used("ex:act1", "chairs:2011OctDec/0004")
    g.used("ex:act1", "email:2011Oct/0141")
    g.used("ex:act2", "email:2011Dec/0111")

    g.agent("w3:Consortium", other_attributes={"prov:type": "Organization"})

    g.wasAssociatedWith("ex:act1", "w3:Consortium", "process:rec-advance")
    g.wasAssociatedWith("ex:act2", "w3:Consortium", "process:rec-advance")

    return g
def document_1(self):
    d1 = ProvDocument()
    ns_ex = d1.add_namespace('ex', EX_URI)
    d1.entity(ns_ex['e1'])
    return d1
class BioProvDocument:
    """
    Class containing base provenance information for a Project.
    """

    def __init__(
        self,
        project,
        add_attributes=False,
        add_users=True,
        _add_project_namespaces=True,
        _iter_samples=True,
        _iter_project=True,
    ):
        """
        Constructs the W3C-PROV document for a project.

        :param Project project: instance of bioprov.src.Project.
        :param bool add_attributes: whether to add object attributes.
        :param bool add_users: whether to add users and environments.
        :param bool _add_project_namespaces:
        :param bool _iter_samples:
        :param bool _iter_project:
        """
        # Assert Project is good before constructing instance
        assert isinstance(project, Project), Warnings()["incorrect_type"](project, Project)
        self.ProvDocument = ProvDocument()
        self.project = project
        self.project.document = self.ProvDocument
        self._dot = prov_to_dot(self.ProvDocument)
        self._provn = self.ProvDocument.get_provn()
        self._entities = dict()
        self._activities = dict()
        self._agents = dict()
        self._user_bundles = dict()
        self._provstore_document = None

        # Don't add attributes if you plan on exporting to graphic format
        self.add_attributes = add_attributes

        # Set this before running Namespaces
        if add_users:
            self._create_envs_and_users = True
        else:
            self._create_envs_and_users = False

        # Default actions to create the document
        if _add_project_namespaces:
            self._add_project_namespaces()

        if self._create_envs_and_users:
            self._iter_envs_and_users()

        if _iter_project:
            self._iter_project()

        if _iter_samples:
            self._iter_samples()

    def __repr__(self):
        return "BioProvDocument describing Project '{}' with {} samples.".format(
            self.project.tag, len(self.project))

    @property
    def dot(self):
        self._dot = prov_to_dot(self.ProvDocument)
        return self._dot

    @dot.setter
    def dot(self, value):
        self._dot = value

    @property
    def provn(self):
        self._provn = self.ProvDocument.get_provn()
        return self._provn

    @provn.setter
    def provn(self, value):
        self._provn = value

    @property
    def provstore_document(self):
        self._provstore_document = self.ProvDocument
        return self._provstore_document

    @provstore_document.setter
    def provstore_document(self, value):
        self._provstore_document = value

    def _add_project_namespaces(self):
        """
        Runs the three _add_namespace functions.

        :return:
        """
        self._add_project_namespace()
        if self._create_envs_and_users:
            self._add_env_and_user_namespace()
        self._add_samples_namespace()
        self._add_activities_namespace()

    def _add_project_namespace(self):
        """
        Creates the Project Namespace and Project Entity.
        # Sets the default Namespace of the BioProvDocument as the Project.

        :return: updates self.project and self.ProvDocument.
        """
        self.ProvDocument.add_namespace("project", str(self.project))

    def _add_env_and_user_namespace(self):
        self.ProvDocument.add_namespace(
            "users",
            f"Users associated with BioProv Project '{self.project.tag}'")

    def _add_samples_namespace(self):
        self.ProvDocument.add_namespace(
            "samples",
            f"Samples associated with bioprov Project '{self.project.tag}'",
        )

    def _add_files_namespace(self):
        self.ProvDocument.add_namespace(
            "files",
            f"Files associated with bioprov Project '{self.project.tag}'")

    def _iter_project(self):
        self._create_sample_bundle(self.project, kind="Project")
        self._create_sample_file_entities(self.project, kind="Project")
        self._create_program_entities(self.project, kind="Project")

    def _iter_envs_and_users(self):
        for _user, _env_dict in self.project.users.items():
            _user_preffix = f"users:{_user}"
            _user_bundle = self._user_bundles[_user] = self.ProvDocument.bundle(_user_preffix)
            _user_bundle.set_default_namespace(_user)
            _user_bundle.add_namespace(
                "envs", f"Environments associated with User '{_user}'")
            self._agents[_user] = _user_bundle.agent(_user_preffix)

    def _iter_samples(self):
        for _, sample in self.project.samples.items():
            for statement in (
                    self._create_sample_bundle(sample),
                    self._create_sample_file_entities(sample),
                    self._create_program_entities(sample),
            ):
                try:
                    statement
                except KeyError:
                    config.logger.debug(
                        f"Could not run function '{statement.__name__}' for sample {sample.name}."
                    )
                    pass

    def _create_sample_bundle(self, object_, kind="Sample"):
        """
        Creates a ProvBundle for the Sample and associates it to self.ProvDocument.

        :param object_: instance of bioprov.Sample
        :return: updates self.ProvDocument by creating PROV objects for the sample.
        """
        choices = ("Sample", "Project")
        assert kind in choices, Warnings()["choices"](kind, choices, "kind")

        # Sample PROV attributes: bundle, namespace, entity
        object_.ProvBundle = self.ProvDocument.bundle(object_.namespace_preffix)
        object_.ProvBundle.set_default_namespace(object_.name)
        self._entities[object_.name] = object_.entity = object_.ProvBundle.entity(
            object_.namespace_preffix)
        if kind == "Sample":
            object_.ProvBundle.wasDerivedFrom(self._entities[object_.name],
                                              self.project.entity)

    def _create_sample_file_entities(self, sample, kind="Sample"):
        """
        Creates a ProvBundle for the Sample and associates it to self.ProvDocument.

        :param sample: instance of bioprov.Sample
        :return: updates the sample.ProvBundle by creating PROV objects for the files.
        """
        sample.files_namespace_preffix = "files"
        sample.file_namespace = sample.ProvBundle.add_namespace(
            sample.files_namespace_preffix,
            f"Files associated with {kind} {sample.name}",
        )
        # Files PROV attributes: namespace, entities
        for key, file in sample.files.items():
            # This prevents errors when the file refers to a project csv or JSON
            if file.name == sample.name:
                file.name = file.basename

            # Same function call, but in the first we pass the 'other_attributes' argument
            if self.add_attributes:
                self._entities[file.name] = sample.ProvBundle.entity(
                    f"{sample.files_namespace_preffix}:{file.tag}",
                    other_attributes=build_prov_attributes(
                        file.serializer(), sample.file_namespace),
                )
            else:
                self._entities[file.name] = sample.ProvBundle.entity(
                    f"{sample.files_namespace_preffix}:{file.tag}",
                )

            # Adding relationships
            sample.ProvBundle.wasDerivedFrom(
                self._entities[file.name],
                self._entities[sample.name],
            )

    def _create_program_entities(self, sample, kind="Sample"):
        # Programs PROV attributes: namespace, entities
        programs_namespace_prefix = f"programs"
        programs_namespace = sample.ProvBundle.add_namespace(
            programs_namespace_prefix,
            f"Programs associated with {kind} {sample.name}",
        )
        for key, program in sample.programs.items():
            last_run = program.runs[str(len(program.runs))]

            # We want to exclude _runs from the program serializer
            # So we put a custom serializer filter
            keys = ("sample", "_runs")
            serialized_program = serializer_filter(program, keys)
            try:
                del serialized_program["params"]
            except KeyError:
                pass

            # Same function call, but in the first we pass the 'other_attributes' argument
            if self.add_attributes:
                self._activities[program.name] = sample.ProvBundle.activity(
                    f"{programs_namespace_prefix}:{program.name}",
                    startTime=last_run.start_time,
                    endTime=last_run.end_time,
                    other_attributes=build_prov_attributes(
                        serialized_program, programs_namespace),
                )
            else:
                self._activities[program.name] = sample.ProvBundle.activity(
                    f"{programs_namespace_prefix}:{program.name}",
                    startTime=last_run.start_time,
                    endTime=last_run.end_time,
                )

            if self._create_envs_and_users:
                for _user, _env_dict in self.project.users.items():
                    _user_bundle = self._user_bundles[_user]
                    for _env_hash, _env in _env_dict.items():
                        if _env_hash == last_run.env:
                            if self.add_attributes:
                                self._agents[_env_hash] = _user_bundle.agent(
                                    f"envs:{_env}",
                                    other_attributes=build_prov_attributes(
                                        _env.env_dict, _env.env_namespace),
                                )
                            else:
                                self._agents[_env_hash] = _user_bundle.agent(
                                    f"envs:{_env}")
                            if not _env.actedOnBehalfOf:
                                _user_bundle.actedOnBehalfOf(
                                    self._agents[_env_hash],
                                    self._agents[_user])
                                _env.actedOnBehalfOf = True
                            sample.ProvBundle.wasAssociatedWith(
                                self._activities[program.name],
                                self._agents[last_run.env])

            inputs, outputs = self._get_IO_from_params(program)
            self._add_IO_relationships(sample, program, inputs, "input")
            self._add_IO_relationships(sample, program, outputs, "output")

    def _add_IO_relationships(self, sample, program, io_list, io_type):
        # TODO: replace Sample for Project when implementing Project.files and programs
        """
        Add PROV relationships between Program and input/output files.

        :param sample: instance of bioprov.Sample
        :param program: instance of bioprov.Program
        :param io_list: list of input/output files
        :param io_type: 'input' or 'output'
        :return: Adds relationship between
        """
        # Small assertion block
        choices = ("input", "output")
        assert io_type in choices, Warnings()["choices"](io_type, choices, "io_type")

        # Start function
        sample_files = [str(file) for _, file in sample.files.items()]
        for value in io_list:
            if value in sample_files:
                file_obj = [
                    file_ for _, file_ in sample.files.items()
                    if str(file_) == value
                ]
                if file_obj:
                    file_obj, *_ = file_obj
                    if io_type == "input":
                        sample.ProvBundle.used(
                            self._entities[file_obj.name],
                            self._activities[program.name],
                        )
                    elif io_type == "output":
                        sample.ProvBundle.wasGeneratedBy(
                            self._entities[file_obj.name],
                            self._activities[program.name],
                        )

    @staticmethod
    def _get_IO_from_params(program):
        """
        :param program: instance of bioprov.Program
        :return: list of input parameter values and list of output parameter values
        """
        # Relationships based on Parameters
        inputs, outputs = [], []

        for _, parameter in program.params.items():
            assert isinstance(parameter, Parameter), (
                Warnings()["incorrect_type"](parameter, Parameter) +
                "\nPlease check if Programs were correctly deserialized.")
            if parameter.kind == "input":
                # This loop is because some positional arguments may have empty values (value stored in parameter.key)
                if parameter.value:
                    inputs.append(parameter.value)
                else:
                    inputs.append(parameter.key)
            elif parameter.kind == "output":
                if parameter.value:
                    outputs.append(parameter.value)
                else:
                    outputs.append(parameter.key)

        return inputs, outputs

    def _add_activities_namespace(self):
        """
        Add activities Namespace to self.

        :return:
        """
        if len(self.ProvDocument.namespaces) == 0:
            self.ProvDocument.add_namespace(
                "activities",
                f"Activities associated with bioprov Project '{self.project.tag}'",
            )

    def upload_to_provstore(self, api=None):
        """
        Uploads self.ProvDocument to ProvStore (https://openprovenance.org/store/)

        :param api: provstore.api.Api
        :return: Sends POST request to ProvStore API and updates self.ProvDocument if successful.
        """
        if api is None:
            api = config.provstore_api
        try:
            self.provstore_document = api.document.create(
                self.ProvDocument, name=self.project.tag)
        except ConnectionError:
            logging.error(
                "Could not create remote document. Please check your internet connection and ProvStore credentials."
            )

    def write_provn(self, path=None):
        """
        Writes PROVN output of document.

        :param path: Path to write file.
        :return: Writes file.
        """
        if path is None:
            path = f"./{self.project.tag}_provn"
            if self.add_attributes:
                path += "_attrs"
            path += ".txt"
        path = Path(path)
        assert (
            path.parent.exists()
        ), f"Directory '{path.parent}' not found.\nPlease provide a valid directory."
        if path.exists():
            logging.info(f"Overwriting file at '{path}'")
        with open(path, "w") as f:
            f.write(self.provn)
        if path.exists():
            logging.info(f"Wrote PROVN record to {path}.")
def document_2(self): d2 = ProvDocument() ns_ex = d2.add_namespace('ex', EX2_URI) d2.activity(ns_ex['a1']) return d2
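Fixtures like document_2 are usually checked by round-tripping the document through a serializer and comparing the result. A minimal sketch assuming only the prov package; the URI stands in for EX2_URI:

from prov.model import ProvDocument

d2 = ProvDocument()
ns_ex = d2.add_namespace("ex", "http://example.org/2/")   # stand-in for EX2_URI
d2.activity(ns_ex["a1"])

json_str = d2.serialize(format="json")                    # PROV-JSON string
d3 = ProvDocument.deserialize(content=json_str, format="json")
assert d2 == d3                                           # record-for-record equality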
def to_prov(obj, namespace, service): """ :type obj: dict :rtype: prov.model.ProvDocument """ g = ProvDocument() ap = Namespace('aip', 'https://araport.org/provenance/') g.add_namespace("dcterms", "http://purl.org/dc/terms/") g.add_namespace("foaf", "http://xmlns.com/foaf/0.1/") vaughn = g.agent(ap['matthew_vaughn'], { 'prov:type': PROV["Person"], 'foaf:givenName': "Matthew Vaughn", 'foaf:mbox': "<mailto:[email protected]>" }) # Hard coded for now walter = g.agent(ap['walter_moreira'], { 'prov:type': PROV["Person"], 'foaf:givenName': "Walter Moreira", 'foaf:mbox': "<mailto:[email protected]>" }) utexas = g.agent(ap['university_of_texas'], { 'prov:type': PROV["Organization"], 'foaf:givenName': "University of Texas at Austin" }) g.actedOnBehalfOf(walter, utexas) g.actedOnBehalfOf(vaughn, utexas) adama_platform = g.agent( ap['adama_platform'], {'dcterms:title': "ADAMA", 'dcterms:description': "Araport Data And Microservices API", 'dcterms:language': "en-US", 'dcterms:identifier': "https://api.araport.org/community/v0.3/", 'dcterms:updated': "2015-04-17T09:44:56"}) g.wasGeneratedBy(adama_platform, walter) g.wasGeneratedBy(adama_platform, vaughn) iden = service_iden(namespace, service) srv = service_store[iden]['service'] adama_microservice = g.agent( ap[iden], {'dcterms:title': srv.name.title(), 'dcterms:description': srv.description, 'dcterms:language': "en-US", 'dcterms:identifier': api_url_for('service', namespace=namespace, service=service), 'dcterms:source': srv.git_repository }) g.used(adama_microservice, adama_platform, datetime.datetime.now()) for author in getattr(srv, 'authors', []): try: author_name = author['name'] author_email = author['email'] except KeyError: raise APIException( 'name and email are required in author field') author_agent = g.agent( ap[slugify(author_name)], {'prov:type': PROV['Person'], 'foaf:givenName': author_name, 'foaf:mbox': '<mailto:{}>'.format(author_email)}) sponsor_name = author.get('sponsor_organization_name', None) if sponsor_name: sponsor_agent = g.agent( ap[slugify(sponsor_name)], {'prov:type': PROV['Organization'], 'foaf:givenName': sponsor_name, 'dcterms:identifier': author.get('sponsor_uri', '')}) g.actedOnBehalfOf(author_agent, sponsor_agent) g.wasGeneratedBy(adama_microservice, author_agent, datetime.datetime.now()) sources_entities = process_sources(srv.sources, g, ap) for src in sources_entities: g.used(adama_microservice, src, datetime.datetime.now()) response = g.entity(ap['adama_response']) g.wasGeneratedBy(response, ap[srv.type], datetime.datetime.now()) g.used(ap[srv.type], adama_microservice, datetime.datetime.now()) return g
def primer_example(): # https://github.com/lucmoreau/ProvToolbox/blob/master/prov-n/src/test/resources/prov/primer.pn #=========================================================================== # document g = ProvDocument() # prefix ex <http://example/> # prefix dcterms <http://purl.org/dc/terms/> # prefix foaf <http://xmlns.com/foaf/0.1/> ex = Namespace('ex', 'http://example/') # namespaces do not need to be explicitly added to a document g.add_namespace("dcterms", "http://purl.org/dc/terms/") g.add_namespace("foaf", "http://xmlns.com/foaf/0.1/") # entity(ex:article, [dcterms:title="Crime rises in cities"]) # first time the ex namespace was used, it is added to the document automatically g.entity(ex['article'], {'dcterms:title': "Crime rises in cities"}) # entity(ex:articleV1) g.entity(ex['articleV1']) # entity(ex:articleV2) g.entity(ex['articleV2']) # entity(ex:dataSet1) g.entity(ex['dataSet1']) # entity(ex:dataSet2) g.entity(ex['dataSet2']) # entity(ex:regionList) g.entity(ex['regionList']) # entity(ex:composition) g.entity(ex['composition']) # entity(ex:chart1) g.entity(ex['chart1']) # entity(ex:chart2) g.entity(ex['chart2']) # entity(ex:blogEntry) g.entity(ex['blogEntry']) # activity(ex:compile) g.activity('ex:compile') # since ex is registered, it can be used like this # activity(ex:compile2) g.activity('ex:compile2') # activity(ex:compose) g.activity('ex:compose') # activity(ex:correct, 2012-03-31T09:21:00, 2012-04-01T15:21:00) g.activity('ex:correct', '2012-03-31T09:21:00', '2012-04-01T15:21:00') # date time can be provided as strings # activity(ex:illustrate) g.activity('ex:illustrate') # used(ex:compose, ex:dataSet1, -, [ prov:role = "ex:dataToCompose"]) g.used('ex:compose', 'ex:dataSet1', other_attributes={'prov:role': "ex:dataToCompose"}) # used(ex:compose, ex:regionList, -, [ prov:role = "ex:regionsToAggregateBy"]) g.used('ex:compose', 'ex:regionList', other_attributes={'prov:role': "ex:regionsToAggregateBy"}) # wasGeneratedBy(ex:composition, ex:compose, -) g.wasGeneratedBy('ex:composition', 'ex:compose') # used(ex:illustrate, ex:composition, -) g.used('ex:illustrate', 'ex:composition') # wasGeneratedBy(ex:chart1, ex:illustrate, -) g.wasGeneratedBy('ex:chart1', 'ex:illustrate') # wasGeneratedBy(ex:chart1, ex:compile, 2012-03-02T10:30:00) g.wasGeneratedBy('ex:chart1', 'ex:compile', '2012-03-02T10:30:00') # wasGeneratedBy(ex:chart2, ex:compile2, 2012-04-01T15:21:00) # # # agent(ex:derek, [ prov:type="prov:Person", foaf:givenName = "Derek", # foaf:mbox= "<mailto:[email protected]>"]) g.agent('ex:derek', { 'prov:type': PROV["Person"], 'foaf:givenName': "Derek", 'foaf:mbox': "<mailto:[email protected]>" }) # wasAssociatedWith(ex:compose, ex:derek, -) g.wasAssociatedWith('ex:compose', 'ex:derek') # wasAssociatedWith(ex:illustrate, ex:derek, -) g.wasAssociatedWith('ex:illustrate', 'ex:derek') # # agent(ex:chartgen, [ prov:type="prov:Organization", # foaf:name = "Chart Generators Inc"]) g.agent('ex:chartgen', {'prov:type': PROV["Organization"], 'foaf:name': "Chart Generators Inc"}) # actedOnBehalfOf(ex:derek, ex:chartgen, ex:compose) g.actedOnBehalfOf('ex:derek', 'ex:chartgen', 'ex:compose') # wasAttributedTo(ex:chart1, ex:derek) g.wasAttributedTo('ex:chart1', 'ex:derek') # wasGeneratedBy(ex:dataSet2, ex:correct, -) g.wasGeneratedBy('ex:dataSet2', 'ex:correct') # used(ex:correct, ex:dataSet1, -) g.used('ex:correct', 'ex:dataSet1') # wasDerivedFrom(ex:dataSet2, ex:dataSet1, [prov:type='prov:Revision']) g.wasDerivedFrom('ex:dataSet2', 'ex:dataSet1', other_attributes={'prov:type': 
PROV['Revision']}) # wasDerivedFrom(ex:chart2, ex:dataSet2) g.wasDerivedFrom('ex:chart2', 'ex:dataSet2') # wasDerivedFrom(ex:blogEntry, ex:article, [prov:type='prov:Quotation']) g.wasDerivedFrom('ex:blogEntry', 'ex:article', other_attributes={'prov:type': PROV['Quotation']}) # specializationOf(ex:articleV1, ex:article) g.specializationOf('ex:articleV1', 'ex:article') # wasDerivedFrom(ex:articleV1, ex:dataSet1) g.wasDerivedFrom('ex:articleV1', 'ex:dataSet1') # specializationOf(ex:articleV2, ex:article) g.specializationOf('ex:articleV2', 'ex:article') # wasDerivedFrom(ex:articleV2, ex:dataSet2) g.wasDerivedFrom('ex:articleV2', 'ex:dataSet2') # alternateOf(ex:articleV2, ex:articleV1) g.alternateOf('ex:articleV2', 'ex:articleV1') # endDocument return g
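A short usage sketch for the primer document above: print the PROV-N rendering and save a PROV-JSON file (the filename is hypothetical):

g = primer_example()
print(g.get_provn())                                  # human-readable PROV-N
g.serialize("primer.json", format="json", indent=2)   # PROV-JSON written to disk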
# simulate uploading by simply recording this ID into a dictionary def uploadRO(self, aRO): repoSim[aRO.id] = aRO ROIdQueue.append(aRO.id) # ========================= # DT manager class # ========================= ## the DT manager maintains a composite graph, ## which gets updated with every new incoming provlet # global prov document pGlobal = ProvDocument() # empty provenance document pGlobal.add_namespace("dt", "http://cs.ncl.ac.uk/dtsim/") class DTManager: PATH = "graphs" SEP = "/" def notify(self, provDoc): print("DT manager adding provlet\n{d}\n".format(d=provDoc.get_provn())) # ========================= ## generate provlets on demand # ========================= # aRO attributed to aDO
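One possible way for the DT manager to fold an incoming provlet into the composite graph is prov's own ProvBundle.update() followed by unified(), rather than re-asserting records by hand. A minimal sketch assuming a provlet without nested bundles; the entity identifier is hypothetical:

from prov.model import ProvDocument

pGlobal = ProvDocument()
pGlobal.add_namespace("dt", "http://cs.ncl.ac.uk/dtsim/")

provlet = ProvDocument()                  # a newly generated provlet
provlet.add_namespace("dt", "http://cs.ncl.ac.uk/dtsim/")
provlet.entity("dt:resource1")            # hypothetical record

pGlobal.update(provlet)                   # append the provlet's records
pGlobal = pGlobal.unified()               # merge records that share an identifier
print(pGlobal.get_provn())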
def declare_directory(self, value: CWLObjectType) -> ProvEntity: """Register any nested files/directories.""" # FIXME: Calculate a hash-like identifier for directory # so we get same value if it's the same filenames/hashes # in a different location. # For now, mint a new UUID to identify this directory, but # attempt to keep it inside the value dictionary dir_id = cast(str, value.setdefault("@id", uuid.uuid4().urn)) # New annotation file to keep the ORE Folder listing ore_doc_fn = dir_id.replace("urn:uuid:", "directory-") + ".ttl" dir_bundle = self.document.bundle(self.metadata_ns[ore_doc_fn]) coll = self.document.entity( dir_id, [ (PROV_TYPE, WFPROV["Artifact"]), (PROV_TYPE, PROV["Collection"]), (PROV_TYPE, PROV["Dictionary"]), (PROV_TYPE, RO["Folder"]), ], ) # ORE description of ro:Folder, saved separately coll_b = dir_bundle.entity( dir_id, [(PROV_TYPE, RO["Folder"]), (PROV_TYPE, ORE["Aggregation"])], ) self.document.mentionOf(dir_id + "#ore", dir_id, dir_bundle.identifier) # dir_manifest = dir_bundle.entity( # dir_bundle.identifier, {PROV["type"]: ORE["ResourceMap"], # ORE["describes"]: coll_b.identifier}) coll_attribs = [(ORE["isDescribedBy"], dir_bundle.identifier)] coll_b_attribs = [] # type: List[Tuple[Identifier, ProvEntity]] # FIXME: .listing might not be populated yet - hopefully # a later call to this method will sort that is_empty = True if "listing" not in value: get_listing(self.fsaccess, value) for entry in cast(MutableSequence[CWLObjectType], value.get("listing", [])): is_empty = False # Declare child-artifacts entity = self.declare_artefact(entry) self.document.membership(coll, entity) # Membership relation aka our ORE Proxy m_id = uuid.uuid4().urn m_entity = self.document.entity(m_id) m_b = dir_bundle.entity(m_id) # PROV-O style Dictionary # https://www.w3.org/TR/prov-dictionary/#dictionary-ontological-definition # ..as prov.py do not currently allow PROV-N extensions # like hadDictionaryMember(..) m_entity.add_asserted_type(PROV["KeyEntityPair"]) m_entity.add_attributes({ PROV["pairKey"]: entry["basename"], PROV["pairEntity"]: entity, }) # As well as a being a # http://wf4ever.github.io/ro/2016-01-28/ro/#FolderEntry m_b.add_asserted_type(RO["FolderEntry"]) m_b.add_asserted_type(ORE["Proxy"]) m_b.add_attributes({ RO["entryName"]: entry["basename"], ORE["proxyIn"]: coll, ORE["proxyFor"]: entity, }) coll_attribs.append((PROV["hadDictionaryMember"], m_entity)) coll_b_attribs.append((ORE["aggregates"], m_b)) coll.add_attributes(coll_attribs) coll_b.add_attributes(coll_b_attribs) # Also Save ORE Folder as annotation metadata ore_doc = ProvDocument() ore_doc.add_namespace(ORE) ore_doc.add_namespace(RO) ore_doc.add_namespace(UUID) ore_doc.add_bundle(dir_bundle) ore_doc = ore_doc.flattened() ore_doc_path = str(PurePosixPath(METADATA, ore_doc_fn)) with self.research_object.write_bag_file( ore_doc_path) as provenance_file: ore_doc.serialize(provenance_file, format="rdf", rdf_format="turtle") self.research_object.add_annotation(dir_id, [ore_doc_fn], ORE["isDescribedBy"].uri) if is_empty: # Empty directory coll.add_asserted_type(PROV["EmptyCollection"]) coll.add_asserted_type(PROV["EmptyDictionary"]) self.research_object.add_uri(coll.identifier.uri) return coll
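The PROV-O style dictionary membership used above (a KeyEntityPair with prov:pairKey and prov:pairEntity, since prov.py has no hadDictionaryMember() helper) can be reproduced with plain prov calls. A minimal sketch assuming only the prov package; all identifiers are hypothetical:

from prov.model import ProvDocument
from prov.constants import PROV

doc = ProvDocument()
doc.add_namespace("ex", "http://example.org/")

coll = doc.entity("ex:folder", [(PROV["type"], PROV["Collection"]),
                                (PROV["type"], PROV["Dictionary"])])
member = doc.entity("ex:file1")
doc.membership(coll, member)                       # plain prov:Collection membership

pair = doc.entity("ex:pair1")                      # PROV-O style key/entity pair
pair.add_asserted_type(PROV["KeyEntityPair"])
pair.add_attributes({PROV["pairKey"]: "file1.txt",
                     PROV["pairEntity"]: member})
coll.add_attributes([(PROV["hadDictionaryMember"], pair)])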
def w3c_publication_1(): # https://github.com/lucmoreau/ProvToolbox/blob/master/asn/src/test/resources/prov/w3c-publication1.prov-asn #=========================================================================== # bundle # # prefix ex <http://example.org/> # # prefix w3 <http://www.w3.org/> # prefix tr <http://www.w3.org/TR/2011/> # prefix process <http://www.w3.org/2005/10/Process-20051014/tr.html#> # prefix email <https://lists.w3.org/Archives/Member/w3c-archive/> # prefix chairs <https://lists.w3.org/Archives/Member/chairs/> # prefix trans <http://www.w3.org/2005/08/01-transitions.html#> # prefix rec54 <http://www.w3.org/2001/02pd/rec54#> # # # entity(tr:WD-prov-dm-20111018, [ prov:type='rec54:WD' ]) # entity(tr:WD-prov-dm-20111215, [ prov:type='rec54:WD' ]) # entity(process:rec-advance, [ prov:type='prov:Plan' ]) # # # entity(chairs:2011OctDec/0004, [ prov:type='trans:transreq' ]) # entity(email:2011Oct/0141, [ prov:type='trans:pubreq' ]) # entity(email:2011Dec/0111, [ prov:type='trans:pubreq' ]) # # # wasDerivedFrom(tr:WD-prov-dm-20111215, tr:WD-prov-dm-20111018) # # # activity(ex:act1,-,-,[prov:type="publish"]) # activity(ex:act2,-,-,[prov:type="publish"]) # # wasGeneratedBy(tr:WD-prov-dm-20111018, ex:act1, -) # wasGeneratedBy(tr:WD-prov-dm-20111215, ex:act2, -) # # used(ex:act1, chairs:2011OctDec/0004, -) # used(ex:act1, email:2011Oct/0141, -) # used(ex:act2, email:2011Dec/0111, -) # # agent(w3:Consortium, [ prov:type='prov:Organization' ]) # # wasAssociatedWith(ex:act1, w3:Consortium, process:rec-advance) # wasAssociatedWith(ex:act2, w3:Consortium, process:rec-advance) # # endBundle #=========================================================================== g = ProvDocument() g.add_namespace('ex', 'http://example.org/') g.add_namespace('w3', 'http://www.w3.org/') g.add_namespace('tr', 'http://www.w3.org/TR/2011/') g.add_namespace('process', 'http://www.w3.org/2005/10/Process-20051014/tr.html#') g.add_namespace('email', 'https://lists.w3.org/Archives/Member/w3c-archive/') g.add_namespace('chairs', 'https://lists.w3.org/Archives/Member/chairs/') g.add_namespace('trans', 'http://www.w3.org/2005/08/01-transitions.html#') g.add_namespace('rec54', 'http://www.w3.org/2001/02pd/rec54#') g.entity('tr:WD-prov-dm-20111018', {'prov:type': 'rec54:WD'}) g.entity('tr:WD-prov-dm-20111215', {'prov:type': 'rec54:WD'}) g.entity('process:rec-advance', {'prov:type': 'prov:Plan'}) g.entity('chairs:2011OctDec/0004', {'prov:type': 'trans:transreq'}) g.entity('email:2011Oct/0141', {'prov:type': 'trans:pubreq'}) g.entity('email:2011Dec/0111', {'prov:type': 'trans:pubreq'}) g.wasDerivedFrom('tr:WD-prov-dm-20111215', 'tr:WD-prov-dm-20111018') g.activity('ex:act1', other_attributes={'prov:type': "publish"}) g.activity('ex:act2', other_attributes={'prov:type': "publish"}) g.wasGeneratedBy('tr:WD-prov-dm-20111018', 'ex:act1') g.wasGeneratedBy('tr:WD-prov-dm-20111215', 'ex:act2') g.used('ex:act1', 'chairs:2011OctDec/0004') g.used('ex:act1', 'email:2011Oct/0141') g.used('ex:act2', 'email:2011Dec/0111') g.agent('w3:Consortium', other_attributes={'prov:type': "Organization"}) g.wasAssociatedWith('ex:act1', 'w3:Consortium', 'process:rec-advance') g.wasAssociatedWith('ex:act2', 'w3:Consortium', 'process:rec-advance') return g
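The comment block above describes a bundle ... endBundle, while the code asserts everything at document level. If the bundled form is wanted, prov supports named bundles directly; a minimal sketch with a hypothetical bundle identifier and a subset of the statements:

from prov.model import ProvDocument

g = ProvDocument()
g.add_namespace("ex", "http://example.org/")
g.add_namespace("tr", "http://www.w3.org/TR/2011/")
b = g.bundle("ex:bundle1")                               # hypothetical bundle identifier
b.entity("tr:WD-prov-dm-20111018")                       # asserted inside the bundle
b.activity("ex:act1")
b.wasGeneratedBy("tr:WD-prov-dm-20111018", "ex:act1")
print(g.get_provn())                                     # renders bundle ... endBundle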
class Provenance(object): def __init__(self, output_dir): self.output_dir = output_dir self.doc = None self.workflow = None def start(self, workflow=False): from daops import __version__ as daops_version from housemartin import __version__ as housemartin_version self.doc = ProvDocument() # Declaring namespaces for various prefixes self.doc.set_default_namespace(uri="http://purl.org/roocs/prov#") self.doc.add_namespace("prov", uri="http://www.w3.org/ns/prov#") self.doc.add_namespace( "provone", uri="http://purl.dataone.org/provone/2015/01/15/ontology#" ) self.doc.add_namespace("dcterms", uri="http://purl.org/dc/terms/") # Define entities project_cds = self.doc.agent( ":copernicus_CDS", { "prov:type": "prov:Organization", "dcterms:title": "Copernicus Climate Data Store", }, ) self.sw_housemartin = self.doc.agent( ":housemartin", { "prov:type": "prov:SoftwareAgent", "dcterms:source": f"https://github.com/cedadev/housemartin/releases/tag/v{housemartin_version}", }, ) self.doc.wasAttributedTo(self.sw_housemartin, project_cds) self.sw_daops = self.doc.agent( ":daops", { "prov:type": "prov:SoftwareAgent", "dcterms:source": f"https://github.com/roocs/daops/releases/tag/v{daops_version}", }, ) # workflow if workflow is True: self.workflow = self.doc.entity( ":workflow", {"prov:type": "provone:Workflow"} ) orchestrate = self.doc.activity( ":orchestrate", other_attributes={ "prov:startedAtTime": "2020-11-26T09:15:00", "prov:endedAtTime": "2020-11-26T09:30:00", }, ) self.doc.wasAssociatedWith( orchestrate, agent=self.sw_housemartin, plan=self.workflow ) def add_operator(self, operator, parameters, collection, output): op = self.doc.activity( f":{operator}", other_attributes={ ":time": parameters.get("time"), ":apply_fixes": parameters.get("apply_fixes"), }, ) # input data ds_in = os.path.basename(collection[0]) # ds_in_attrs = { # 'prov:type': 'provone:Data', # 'prov:value': f'{ds_in}', # } op_in = self.doc.entity(f":{ds_in}") # operator started by daops if self.workflow: self.doc.wasAssociatedWith(op, agent=self.sw_daops, plan=self.workflow) else: self.doc.start(op, starter=self.sw_daops, trigger=self.sw_housemartin) # Generated output file ds_out = os.path.basename(output[0]) # ds_out_attrs = { # 'prov:type': 'provone:Data', # 'prov:value': f'{ds_out}', # } op_out = self.doc.entity(f":{ds_out}") self.doc.wasDerivedFrom(op_out, op_in, activity=op) def write_json(self): outfile = os.path.join(self.output_dir, "provenance.json") self.doc.serialize(outfile, format="json") return outfile def write_png(self): outfile = os.path.join(self.output_dir, "provenance.png") figure = prov_to_dot(self.doc) figure.write_png(outfile) return outfile
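A hedged usage sketch of the Provenance class above; the operator name, parameters, and paths are hypothetical, and daops, housemartin, and prov must be importable:

p = Provenance(output_dir="/tmp/prov_out")               # hypothetical directory
p.start(workflow=True)
p.add_operator(
    operator="subset",                                   # hypothetical operator name
    parameters={"time": "2085-01-01/2120-12-30", "apply_fixes": False},
    collection=["/data/cmip6/tas_input.nc"],             # hypothetical input file
    output=["/tmp/prov_out/tas_subset.nc"],              # hypothetical output file
)
print(p.write_json())                                    # path to provenance.json
print(p.write_png())                                     # path to provenance.png (needs Graphviz)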
def example(): g = ProvDocument() # Local namespace # Doesnt exist yet so we are creating it ap = Namespace('aip', 'https://araport.org/provenance/') # Dublin Core g.add_namespace("dcterms", "http://purl.org/dc/terms/") # FOAF g.add_namespace("foaf", "http://xmlns.com/foaf/0.1/") # Add sponsors and contributors as Agents # ap['matthew_vaughn'] # aip:matthew_vaughn # https://araport.org/provenance/:matthew_vaughn # Learn this from a call to profiles service? Adds a dependency on Agave so I am open to figuring out another way me = g.agent(ap['matthew_vaughn'], { 'prov:type': PROV["Person"], 'foaf:givenName': "Matthew Vaughn", 'foaf:mbox': "<mailto:[email protected]>" }) # Hard coded for now walter = g.agent(ap['walter_moreira'], { 'prov:type': PROV["Person"], 'foaf:givenName': "Walter Moreira", 'foaf:mbox': "<mailto:[email protected]>" }) utexas = g.agent(ap['university_of_texas'], { 'prov:type': PROV["Organization"], 'foaf:givenName': "University of Texas at Austin" }) # Set delegation to our host University # We may have trouble doing this for other users since we don't always capture their host instituion g.actedOnBehalfOf(walter, utexas) g.actedOnBehalfOf(me, utexas) # Include the ADAMA platform as an Agent and set attribution # dcterms:title and dcterms:description are hardcoded # dcterms:language is hard-coded # dcterms:source is the URI of the public git source repository for ADAMA # "dcterms:updated": "2015-04-17T09:44:56" - this would actually be the date ADAMA was updated adama_platform = g.agent(ap['adama_platform'], {'dcterms:title': "ADAMA", 'dcterms:description': "Araport Data and Microservices API", 'dcterms:language':"en-US", 'dcterms:identifier':"https://api.araport.org/community/v0.3/", 'dcterms:updated': "2015-04-17T09:44:56" }) g.wasGeneratedBy(adama_platform, walter) # Include the ADAMA microservice as an Agent and set attribution+delegation # dcterms:title and dcterms:description are inherited from the service's metadata # dcterms:language is hard-coded # dcterms:identifier is the deployment URI for the service # dcterms:source is the URI of the public git source repository. The URL in this example is just a dummy # # The name for each microservice should be unique. 
We've decided to # use the combination of namespace, service name, and version microservice_name = 'mwvaughn/bar_annotation_v1.0.0' adama_microservice = g.agent(ap[microservice_name], {'dcterms:title': "BAR Annotation Service", 'dcterms:description': "Returns annotation from locus ID", 'dcterms:language':"en-US", 'dcterms:identifier':"https://api.araport.org/community/v0.3/mwvaughn/bar_annotation_v1.0.0", 'dcterms:source':"https://github.com/Arabidopsis-Information-Portal/prov-enabled-api-sample" }) # the microservice was generated by me on date X (don't use now, use when the service was updated) g.wasGeneratedBy(adama_microservice, me, datetime.datetime.now()) # The microservice used the platform now g.used(adama_microservice, adama_platform, datetime.datetime.now()) # Sources # # Define BAR # Agents nick = g.agent(ap['nicholas_provart'], { 'prov:type': PROV["Person"], 'foaf:givenName': "Nicholas Provart", 'foaf:mbox': "*****@*****.**" }) utoronto = g.agent(ap['university_of_toronto'], { 'prov:type': PROV["Organization"], 'foaf:givenName': "University of Toronto", 'dcterms:identifier':"http://www.utoronto.ca/" }) g.actedOnBehalfOf(nick, utoronto) # Entity # All fields derived from Sources.yml # dcterms:title and dcterms:description come straight from the YAML # dcterms:identifier - URI pointing to the source's canonical URI representation # optional - dcterms:language: Recommended best practice is to use a controlled vocabulary such as RFC 4646 # optional - dcterms:updated: date the source was published or last updated # optional - dcterms:license: Simple string or URI to license. Validate URI if provided? datasource1 = g.entity(ap['datasource1'], {'dcterms:title': "BAR Arabidopsis AGI -> Annotation", 'dcterms:description': "Most recent annotation for given AGI", 'dcterms:language':"en-US", 'dcterms:identifier':"http://bar.utoronto.ca/webservices/agiToAnnot.php", 'dcterms:updated':"2015-04-17T09:44:56", 'dcterms:license':"Creative Commons 3.0" }) # Set up attribution to Nick g.wasAttributedTo(datasource1, nick) # Define TAIR # Agents # dcterms:language: Recommended best practice is to use a controlled vocabulary such as RFC 4646 eva = g.agent(ap['eva_huala'], { 'prov:type': PROV["Person"], 'foaf:givenName': "Eva Huala" }) phoenix = g.agent(ap['phoenix_bioinformatics'], { 'prov:type': PROV["Organization"], 'foaf:givenName': "Phoenix Bioinformatics" }) g.actedOnBehalfOf(eva, phoenix) # Entity # All fields derived from Sources.yml # optional - dcterms:citation: Plain text bibliographic citation. If only provided as doi, should we try to validate it? datasource2 = g.entity(ap['datasource2'], {'dcterms:title': "TAIR", 'dcterms:description': "The Arabidopsis Information Resource", 'dcterms:language':"en-US", 'dcterms:identifier':"https://www.arabidopsis.org/", 'dcterms:citation':"The Arabidopsis Information Resource (TAIR): improved gene annotation and new tools. Nucleic Acids Research 2011 doi: 10.1093/nar/gkr1090"}) g.wasAttributedTo(datasource2, eva) # In Sources.yml, these two sources are nested. 
Define that relationship here # There are other types of relationships but we will just use derived from for simplicity in this prototype g.wasDerivedFrom(ap['datasource1'], ap['datasource2']) # Depending on which ADAMA microservice type we are using, define an activity # Eventually, break these into more atomic actions in a chain action1 = g.activity(ap['do_query'], datetime.datetime.now()) # action1 = g.activity(ap['do_map'], datetime.datetime.now()) # action1 = g.activity(ap['do_generic'], datetime.datetime.now()) # action1 = g.activity(ap['do_passthrough'], datetime.datetime.now()) # Future... Support for ADAMA-native microservices # action1 = g.activity(ap['generate'], datetime.datetime.now()) # Define current ADAMA response as an Entity # This is what's being returned to the user and is thus the subject of the PROV record # May be able to add more attributes to it but this is the minimum response = g.entity(ap['adama_response']) # Response is generated by the process_query action # Time-stamp it! g.wasGeneratedBy(response, ap['do_query'], datetime.datetime.now()) # The process_query used the microservice g.used(ap['do_query'], adama_microservice, datetime.datetime.now()) # The microservice used datasource1 g.used(adama_microservice, datasource1, datetime.datetime.now()) # Print prov_n print(g.get_provn()) # Print prov-json print(g.serialize()) # Write out as a pretty picture graph = prov.dot.prov_to_dot(g) graph.write_png('Sources.png')
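The dcterms attributes in the example are described as coming straight from Sources.yml; the mapping is just a dictionary-to-attributes translation. A minimal sketch assuming only the prov package; the source record shown is hypothetical:

from prov.model import ProvDocument

g = ProvDocument()
g.add_namespace("dcterms", "http://purl.org/dc/terms/")
ap = g.add_namespace("aip", "https://araport.org/provenance/")

source = {                                               # hypothetical Sources.yml entry
    "title": "BAR Arabidopsis AGI -> Annotation",
    "description": "Most recent annotation for given AGI",
    "identifier": "http://bar.utoronto.ca/webservices/agiToAnnot.php",
}
g.entity(ap["datasource1"], {
    "dcterms:title": source["title"],
    "dcterms:description": source["description"],
    "dcterms:identifier": source["identifier"],
})
print(g.get_provn())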
class ProvenanceProfile: """ Provenance profile. Populated as the workflow runs. """ def __init__( self, research_object: "ResearchObject", full_name: str, host_provenance: bool, user_provenance: bool, orcid: str, fsaccess: StdFsAccess, run_uuid: Optional[uuid.UUID] = None, ) -> None: """Initialize the provenance profile.""" self.fsaccess = fsaccess self.orcid = orcid self.research_object = research_object self.folder = self.research_object.folder self.document = ProvDocument() self.host_provenance = host_provenance self.user_provenance = user_provenance self.engine_uuid = research_object.engine_uuid # type: str self.add_to_manifest = self.research_object.add_to_manifest if self.orcid: _logger.debug("[provenance] Creator ORCID: %s", self.orcid) self.full_name = full_name if self.full_name: _logger.debug("[provenance] Creator Full name: %s", self.full_name) self.workflow_run_uuid = run_uuid or uuid.uuid4() self.workflow_run_uri = self.workflow_run_uuid.urn # type: str self.generate_prov_doc() def __str__(self) -> str: """Represent this Provenvance profile as a string.""" return "ProvenanceProfile <{}> in <{}>".format( self.workflow_run_uri, self.research_object, ) def generate_prov_doc(self) -> Tuple[str, ProvDocument]: """Add basic namespaces.""" def host_provenance(document: ProvDocument) -> None: """Record host provenance.""" document.add_namespace(CWLPROV) document.add_namespace(UUID) document.add_namespace(FOAF) hostname = getfqdn() # won't have a foaf:accountServiceHomepage for unix hosts, but # we can at least provide hostname document.agent( ACCOUNT_UUID, { PROV_TYPE: FOAF["OnlineAccount"], "prov:location": hostname, CWLPROV["hostname"]: hostname, }, ) self.cwltool_version = "cwltool %s" % versionstring().split()[-1] self.document.add_namespace("wfprov", "http://purl.org/wf4ever/wfprov#") # document.add_namespace('prov', 'http://www.w3.org/ns/prov#') self.document.add_namespace("wfdesc", "http://purl.org/wf4ever/wfdesc#") # TODO: Make this ontology. For now only has cwlprov:image self.document.add_namespace("cwlprov", "https://w3id.org/cwl/prov#") self.document.add_namespace("foaf", "http://xmlns.com/foaf/0.1/") self.document.add_namespace("schema", "http://schema.org/") self.document.add_namespace("orcid", "https://orcid.org/") self.document.add_namespace("id", "urn:uuid:") # NOTE: Internet draft expired 2004-03-04 (!) # https://tools.ietf.org/html/draft-thiemann-hash-urn-01 # TODO: Change to nih:sha-256; hashes # https://tools.ietf.org/html/rfc6920#section-7 self.document.add_namespace("data", "urn:hash::sha1:") # Also needed for docker images self.document.add_namespace(SHA256, "nih:sha-256;") # info only, won't really be used by prov as sub-resources use / self.document.add_namespace("researchobject", self.research_object.base_uri) # annotations self.metadata_ns = self.document.add_namespace( "metadata", self.research_object.base_uri + METADATA + "/") # Pre-register provenance directory so we can refer to its files self.provenance_ns = self.document.add_namespace( "provenance", self.research_object.base_uri + posix_path(PROVENANCE) + "/") ro_identifier_workflow = self.research_object.base_uri + "workflow/packed.cwl#" self.wf_ns = self.document.add_namespace("wf", ro_identifier_workflow) ro_identifier_input = (self.research_object.base_uri + "workflow/primary-job.json#") self.document.add_namespace("input", ro_identifier_input) # More info about the account (e.g. username, fullname) # may or may not have been previously logged by user_provenance() # .. 
but we always know cwltool was launched (directly or indirectly) # by a user account, as cwltool is a command line tool account = self.document.agent(ACCOUNT_UUID) if self.orcid or self.full_name: person = {PROV_TYPE: PROV["Person"], "prov:type": SCHEMA["Person"]} if self.full_name: person["prov:label"] = self.full_name person["foaf:name"] = self.full_name person["schema:name"] = self.full_name else: # TODO: Look up name from ORCID API? pass agent = self.document.agent(self.orcid or uuid.uuid4().urn, person) self.document.actedOnBehalfOf(account, agent) else: if self.host_provenance: host_provenance(self.document) if self.user_provenance: self.research_object.user_provenance(self.document) # The execution of cwltool wfengine = self.document.agent( self.engine_uuid, { PROV_TYPE: PROV["SoftwareAgent"], "prov:type": WFPROV["WorkflowEngine"], "prov:label": self.cwltool_version, }, ) # FIXME: This datetime will be a bit too delayed, we should # capture when cwltool.py earliest started? self.document.wasStartedBy(wfengine, None, account, datetime.datetime.now()) # define workflow run level activity self.document.activity( self.workflow_run_uri, datetime.datetime.now(), None, { PROV_TYPE: WFPROV["WorkflowRun"], "prov:label": "Run of workflow/packed.cwl#main", }, ) # association between SoftwareAgent and WorkflowRun main_workflow = "wf:main" self.document.wasAssociatedWith(self.workflow_run_uri, self.engine_uuid, main_workflow) self.document.wasStartedBy(self.workflow_run_uri, None, self.engine_uuid, datetime.datetime.now()) return (self.workflow_run_uri, self.document) def evaluate( self, process: Process, job: JobsType, job_order_object: CWLObjectType, research_obj: "ResearchObject", ) -> None: """Evaluate the nature of job.""" if not hasattr(process, "steps"): # record provenance of independent commandline tool executions self.prospective_prov(job) customised_job = copy_job_order(job, job_order_object) self.used_artefacts(customised_job, self.workflow_run_uri) research_obj.create_job(customised_job) elif hasattr(job, "workflow"): # record provenance of workflow executions self.prospective_prov(job) customised_job = copy_job_order(job, job_order_object) self.used_artefacts(customised_job, self.workflow_run_uri) def record_process_start( self, process: Process, job: JobsType, process_run_id: Optional[str] = None) -> Optional[str]: if not hasattr(process, "steps"): process_run_id = self.workflow_run_uri elif not hasattr(job, "workflow"): # commandline tool execution as part of workflow name = "" if isinstance(job, (CommandLineJob, JobBase, WorkflowJob)): name = job.name process_name = urllib.parse.quote(name, safe=":/,#") process_run_id = self.start_process(process_name, datetime.datetime.now()) return process_run_id def start_process( self, process_name: str, when: datetime.datetime, process_run_id: Optional[str] = None, ) -> str: """Record the start of each Process.""" if process_run_id is None: process_run_id = uuid.uuid4().urn prov_label = "Run of workflow/packed.cwl#main/" + process_name self.document.activity( process_run_id, None, None, { PROV_TYPE: WFPROV["ProcessRun"], PROV_LABEL: prov_label }, ) self.document.wasAssociatedWith(process_run_id, self.engine_uuid, str("wf:main/" + process_name)) self.document.wasStartedBy(process_run_id, None, self.workflow_run_uri, when, None, None) return process_run_id def record_process_end( self, process_name: str, process_run_id: str, outputs: Union[CWLObjectType, MutableSequence[CWLObjectType], None], when: datetime.datetime, ) -> None: 
self.generate_output_prov(outputs, process_run_id, process_name) self.document.wasEndedBy(process_run_id, None, self.workflow_run_uri, when) def declare_file( self, value: CWLObjectType) -> Tuple[ProvEntity, ProvEntity, str]: if value["class"] != "File": raise ValueError("Must have class:File: %s" % value) # Need to determine file hash aka RO filename entity = None # type: Optional[ProvEntity] checksum = None if "checksum" in value: csum = cast(str, value["checksum"]) (method, checksum) = csum.split("$", 1) if method == SHA1 and self.research_object.has_data_file(checksum): entity = self.document.entity("data:" + checksum) if not entity and "location" in value: location = str(value["location"]) # If we made it here, we'll have to add it to the RO with self.fsaccess.open(location, "rb") as fhandle: relative_path = self.research_object.add_data_file(fhandle) # FIXME: This naively relies on add_data_file setting hash as filename checksum = PurePath(relative_path).name entity = self.document.entity("data:" + checksum, {PROV_TYPE: WFPROV["Artifact"]}) if "checksum" not in value: value["checksum"] = f"{SHA1}${checksum}" if not entity and "contents" in value: # Anonymous file, add content as string entity, checksum = self.declare_string(cast( str, value["contents"])) # By here one of them should have worked! if not entity or not checksum: raise ValueError( "class:File but missing checksum/location/content: %r" % value) # Track filename and extension, this is generally useful only for # secondaryFiles. Note that multiple uses of a file might thus record # different names for the same entity, so we'll # make/track a specialized entity by UUID file_id = value.setdefault("@id", uuid.uuid4().urn) # A specialized entity that has just these names file_entity = self.document.entity( file_id, [(PROV_TYPE, WFPROV["Artifact"]), (PROV_TYPE, WF4EVER["File"])], ) # type: ProvEntity if "basename" in value: file_entity.add_attributes( {CWLPROV["basename"]: value["basename"]}) if "nameroot" in value: file_entity.add_attributes( {CWLPROV["nameroot"]: value["nameroot"]}) if "nameext" in value: file_entity.add_attributes({CWLPROV["nameext"]: value["nameext"]}) self.document.specializationOf(file_entity, entity) # Check for secondaries for sec in cast(MutableSequence[CWLObjectType], value.get("secondaryFiles", [])): # TODO: Record these in a specializationOf entity with UUID? if sec["class"] == "File": (sec_entity, _, _) = self.declare_file(sec) elif sec["class"] == "Directory": sec_entity = self.declare_directory(sec) else: raise ValueError(f"Got unexpected secondaryFiles value: {sec}") # We don't know how/when/where the secondary file was generated, # but CWL convention is a kind of summary/index derived # from the original file. As its generally in a different format # then prov:Quotation is not appropriate. self.document.derivation( sec_entity, file_entity, other_attributes={PROV["type"]: CWLPROV["SecondaryFile"]}, ) return file_entity, entity, checksum def declare_directory(self, value: CWLObjectType) -> ProvEntity: """Register any nested files/directories.""" # FIXME: Calculate a hash-like identifier for directory # so we get same value if it's the same filenames/hashes # in a different location. 
# For now, mint a new UUID to identify this directory, but # attempt to keep it inside the value dictionary dir_id = cast(str, value.setdefault("@id", uuid.uuid4().urn)) # New annotation file to keep the ORE Folder listing ore_doc_fn = dir_id.replace("urn:uuid:", "directory-") + ".ttl" dir_bundle = self.document.bundle(self.metadata_ns[ore_doc_fn]) coll = self.document.entity( dir_id, [ (PROV_TYPE, WFPROV["Artifact"]), (PROV_TYPE, PROV["Collection"]), (PROV_TYPE, PROV["Dictionary"]), (PROV_TYPE, RO["Folder"]), ], ) # ORE description of ro:Folder, saved separately coll_b = dir_bundle.entity( dir_id, [(PROV_TYPE, RO["Folder"]), (PROV_TYPE, ORE["Aggregation"])], ) self.document.mentionOf(dir_id + "#ore", dir_id, dir_bundle.identifier) # dir_manifest = dir_bundle.entity( # dir_bundle.identifier, {PROV["type"]: ORE["ResourceMap"], # ORE["describes"]: coll_b.identifier}) coll_attribs = [(ORE["isDescribedBy"], dir_bundle.identifier)] coll_b_attribs = [] # type: List[Tuple[Identifier, ProvEntity]] # FIXME: .listing might not be populated yet - hopefully # a later call to this method will sort that is_empty = True if "listing" not in value: get_listing(self.fsaccess, value) for entry in cast(MutableSequence[CWLObjectType], value.get("listing", [])): is_empty = False # Declare child-artifacts entity = self.declare_artefact(entry) self.document.membership(coll, entity) # Membership relation aka our ORE Proxy m_id = uuid.uuid4().urn m_entity = self.document.entity(m_id) m_b = dir_bundle.entity(m_id) # PROV-O style Dictionary # https://www.w3.org/TR/prov-dictionary/#dictionary-ontological-definition # ..as prov.py do not currently allow PROV-N extensions # like hadDictionaryMember(..) m_entity.add_asserted_type(PROV["KeyEntityPair"]) m_entity.add_attributes({ PROV["pairKey"]: entry["basename"], PROV["pairEntity"]: entity, }) # As well as a being a # http://wf4ever.github.io/ro/2016-01-28/ro/#FolderEntry m_b.add_asserted_type(RO["FolderEntry"]) m_b.add_asserted_type(ORE["Proxy"]) m_b.add_attributes({ RO["entryName"]: entry["basename"], ORE["proxyIn"]: coll, ORE["proxyFor"]: entity, }) coll_attribs.append((PROV["hadDictionaryMember"], m_entity)) coll_b_attribs.append((ORE["aggregates"], m_b)) coll.add_attributes(coll_attribs) coll_b.add_attributes(coll_b_attribs) # Also Save ORE Folder as annotation metadata ore_doc = ProvDocument() ore_doc.add_namespace(ORE) ore_doc.add_namespace(RO) ore_doc.add_namespace(UUID) ore_doc.add_bundle(dir_bundle) ore_doc = ore_doc.flattened() ore_doc_path = str(PurePosixPath(METADATA, ore_doc_fn)) with self.research_object.write_bag_file( ore_doc_path) as provenance_file: ore_doc.serialize(provenance_file, format="rdf", rdf_format="turtle") self.research_object.add_annotation(dir_id, [ore_doc_fn], ORE["isDescribedBy"].uri) if is_empty: # Empty directory coll.add_asserted_type(PROV["EmptyCollection"]) coll.add_asserted_type(PROV["EmptyDictionary"]) self.research_object.add_uri(coll.identifier.uri) return coll def declare_string(self, value: str) -> Tuple[ProvEntity, str]: """Save as string in UTF-8.""" byte_s = BytesIO(str(value).encode(ENCODING)) data_file = self.research_object.add_data_file(byte_s, content_type=TEXT_PLAIN) checksum = PurePosixPath(data_file).name # FIXME: Don't naively assume add_data_file uses hash in filename! 
data_id = "data:%s" % PurePosixPath(data_file).stem entity = self.document.entity(data_id, { PROV_TYPE: WFPROV["Artifact"], PROV_VALUE: str(value) }) # type: ProvEntity return entity, checksum def declare_artefact(self, value: Optional[CWLOutputType]) -> ProvEntity: """Create data artefact entities for all file objects.""" if value is None: # FIXME: If this can happen in CWL, we'll # need a better way to represent this in PROV return self.document.entity(CWLPROV["None"], {PROV_LABEL: "None"}) if isinstance(value, (bool, int, float)): # Typically used in job documents for flags # FIXME: Make consistent hash URIs for these # that somehow include the type # (so "1" != 1 != "1.0" != true) entity = self.document.entity(uuid.uuid4().urn, {PROV_VALUE: value}) self.research_object.add_uri(entity.identifier.uri) return entity if isinstance(value, (str, str)): (entity, _) = self.declare_string(value) return entity if isinstance(value, bytes): # If we got here then we must be in Python 3 byte_s = BytesIO(value) data_file = self.research_object.add_data_file(byte_s) # FIXME: Don't naively assume add_data_file uses hash in filename! data_id = "data:%s" % PurePosixPath(data_file).stem return self.document.entity( data_id, { PROV_TYPE: WFPROV["Artifact"], PROV_VALUE: str(value) }, ) if isinstance(value, MutableMapping): if "@id" in value: # Already processed this value, but it might not be in this PROV entities = self.document.get_record(value["@id"]) if entities: return entities[0] # else, unknown in PROV, re-add below as if it's fresh # Base case - we found a File we need to update if value.get("class") == "File": (entity, _, _) = self.declare_file(value) value["@id"] = entity.identifier.uri return entity if value.get("class") == "Directory": entity = self.declare_directory(value) value["@id"] = entity.identifier.uri return entity coll_id = value.setdefault("@id", uuid.uuid4().urn) # some other kind of dictionary? # TODO: also Save as JSON coll = self.document.entity( coll_id, [ (PROV_TYPE, WFPROV["Artifact"]), (PROV_TYPE, PROV["Collection"]), (PROV_TYPE, PROV["Dictionary"]), ], ) if value.get("class"): _logger.warning("Unknown data class %s.", value["class"]) # FIXME: The class might be "http://example.com/somethingelse" coll.add_asserted_type(CWLPROV[value["class"]]) # Let's iterate and recurse coll_attribs = [] # type: List[Tuple[Identifier, ProvEntity]] for (key, val) in value.items(): v_ent = self.declare_artefact(val) self.document.membership(coll, v_ent) m_entity = self.document.entity(uuid.uuid4().urn) # Note: only support PROV-O style dictionary # https://www.w3.org/TR/prov-dictionary/#dictionary-ontological-definition # as prov.py do not easily allow PROV-N extensions m_entity.add_asserted_type(PROV["KeyEntityPair"]) m_entity.add_attributes({ PROV["pairKey"]: str(key), PROV["pairEntity"]: v_ent }) coll_attribs.append((PROV["hadDictionaryMember"], m_entity)) coll.add_attributes(coll_attribs) self.research_object.add_uri(coll.identifier.uri) return coll # some other kind of Collection? 
# TODO: also save as JSON try: members = [] for each_input_obj in iter(value): # Recurse and register any nested objects e = self.declare_artefact(each_input_obj) members.append(e) # If we reached this, then we were allowed to iterate coll = self.document.entity( uuid.uuid4().urn, [ (PROV_TYPE, WFPROV["Artifact"]), (PROV_TYPE, PROV["Collection"]), ], ) if not members: coll.add_asserted_type(PROV["EmptyCollection"]) else: for member in members: # FIXME: This won't preserve order, for that # we would need to use PROV.Dictionary # with numeric keys self.document.membership(coll, member) self.research_object.add_uri(coll.identifier.uri) # FIXME: list value does not support adding "@id" return coll except TypeError: _logger.warning("Unrecognized type %s of %r", type(value), value) # Let's just fall back to Python repr() entity = self.document.entity(uuid.uuid4().urn, {PROV_LABEL: repr(value)}) self.research_object.add_uri(entity.identifier.uri) return entity def used_artefacts( self, job_order: Union[CWLObjectType, List[CWLObjectType]], process_run_id: str, name: Optional[str] = None, ) -> None: """Add used() for each data artefact.""" if isinstance(job_order, list): for entry in job_order: self.used_artefacts(entry, process_run_id, name) else: # FIXME: Use workflow name in packed.cwl, "main" is wrong for nested workflows base = "main" if name is not None: base += "/" + name for key, value in job_order.items(): prov_role = self.wf_ns[f"{base}/{key}"] try: entity = self.declare_artefact(value) self.document.used( process_run_id, entity, datetime.datetime.now(), None, {"prov:role": prov_role}, ) except OSError: pass def generate_output_prov( self, final_output: Union[CWLObjectType, MutableSequence[CWLObjectType], None], process_run_id: Optional[str], name: Optional[str], ) -> None: """Call wasGeneratedBy() for each output,copy the files into the RO.""" if isinstance(final_output, MutableSequence): for entry in final_output: self.generate_output_prov(entry, process_run_id, name) elif final_output is not None: # Timestamp should be created at the earliest timestamp = datetime.datetime.now() # For each output, find/register the corresponding # entity (UUID) and document it as generated in # a role corresponding to the output for output, value in final_output.items(): entity = self.declare_artefact(value) if name is not None: name = urllib.parse.quote(str(name), safe=":/,#") # FIXME: Probably not "main" in nested workflows role = self.wf_ns[f"main/{name}/{output}"] else: role = self.wf_ns["main/%s" % output] if not process_run_id: process_run_id = self.workflow_run_uri self.document.wasGeneratedBy(entity, process_run_id, timestamp, None, {"prov:role": role}) def prospective_prov(self, job: JobsType) -> None: """Create prospective prov recording as wfdesc prov:Plan.""" if not isinstance(job, WorkflowJob): # direct command line tool execution self.document.entity( "wf:main", { PROV_TYPE: WFDESC["Process"], "prov:type": PROV["Plan"], "prov:label": "Prospective provenance", }, ) return self.document.entity( "wf:main", { PROV_TYPE: WFDESC["Workflow"], "prov:type": PROV["Plan"], "prov:label": "Prospective provenance", }, ) for step in job.steps: stepnametemp = "wf:main/" + str(step.name)[5:] stepname = urllib.parse.quote(stepnametemp, safe=":/,#") provstep = self.document.entity( stepname, { PROV_TYPE: WFDESC["Process"], "prov:type": PROV["Plan"] }, ) self.document.entity( "wf:main", { "wfdesc:hasSubProcess": provstep, "prov:label": "Prospective provenance", }, ) # TODO: Declare roles/parameters as well 
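The prov:role pattern that used_artefacts() and generate_output_prov() rely on can be shown in isolation. A minimal sketch assuming only the prov package; the run and artefact identifiers and the role names are hypothetical:

import datetime
from prov.model import ProvDocument

doc = ProvDocument()
wf = doc.add_namespace("wf", "http://example.org/workflow/packed.cwl#")  # hypothetical
doc.add_namespace("id", "urn:uuid:")

run = doc.activity("id:0f6d7a12-0000-0000-0000-000000000001")            # hypothetical run
data = doc.entity("id:0f6d7a12-0000-0000-0000-000000000002")             # hypothetical artefact

# Input: used() carries a prov:role naming the workflow parameter
doc.used(run, data, datetime.datetime.now(),
         other_attributes={"prov:role": wf["main/input_file"]})
# Output: wasGeneratedBy() carries a prov:role naming the output parameter
doc.wasGeneratedBy(data, run, datetime.datetime.now(),
                   other_attributes={"prov:role": wf["main/result"]})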
def activity_has_provenance(self, activity, prov_ids): # type: (str, List[Identifier]) -> None """Add http://www.w3.org/TR/prov-aq/ relations to nested PROV files.""" # NOTE: The below will only work if the corresponding metadata/provenance arcp URI # is a pre-registered namespace in the PROV Document attribs = [(PROV["has_provenance"], prov_id) for prov_id in prov_ids] self.document.activity(activity, other_attributes=attribs) # Tip: we can't use https://www.w3.org/TR/prov-links/#term-mention # as prov:mentionOf() is only for entities, not activities uris = [i.uri for i in prov_ids] self.research_object.add_annotation(activity, uris, PROV["has_provenance"].uri) def finalize_prov_profile(self, name): # type: (Optional[str]) -> List[Identifier] """Transfer the provenance related files to the RO.""" # NOTE: Relative posix path if name is None: # main workflow, fixed filenames filename = "primary.cwlprov" else: # ASCII-friendly filename, avoiding % as we don't want %2520 in manifest.json wf_name = urllib.parse.quote(str(name), safe="").replace("%", "_") # Note that the above could cause overlaps for similarly named # workflows, but that's OK as we'll also include run uuid # which also covers thhe case of this step being run in # multiple places or iterations filename = f"{wf_name}.{self.workflow_run_uuid}.cwlprov" basename = str(PurePosixPath(PROVENANCE) / filename) # TODO: Also support other profiles than CWLProv, e.g. ProvOne # list of prov identifiers of provenance files prov_ids = [] # https://www.w3.org/TR/prov-xml/ with self.research_object.write_bag_file(basename + ".xml") as provenance_file: self.document.serialize(provenance_file, format="xml", indent=4) prov_ids.append(self.provenance_ns[filename + ".xml"]) # https://www.w3.org/TR/prov-n/ with self.research_object.write_bag_file(basename + ".provn") as provenance_file: self.document.serialize(provenance_file, format="provn", indent=2) prov_ids.append(self.provenance_ns[filename + ".provn"]) # https://www.w3.org/Submission/prov-json/ with self.research_object.write_bag_file(basename + ".json") as provenance_file: self.document.serialize(provenance_file, format="json", indent=2) prov_ids.append(self.provenance_ns[filename + ".json"]) # "rdf" aka https://www.w3.org/TR/prov-o/ # which can be serialized to ttl/nt/jsonld (and more!) # https://www.w3.org/TR/turtle/ with self.research_object.write_bag_file(basename + ".ttl") as provenance_file: self.document.serialize(provenance_file, format="rdf", rdf_format="turtle") prov_ids.append(self.provenance_ns[filename + ".ttl"]) # https://www.w3.org/TR/n-triples/ with self.research_object.write_bag_file(basename + ".nt") as provenance_file: self.document.serialize(provenance_file, format="rdf", rdf_format="ntriples") prov_ids.append(self.provenance_ns[filename + ".nt"]) # https://www.w3.org/TR/json-ld/ # TODO: Use a nice JSON-LD context # see also https://eprints.soton.ac.uk/395985/ # 404 Not Found on https://provenance.ecs.soton.ac.uk/prov.jsonld :( with self.research_object.write_bag_file(basename + ".jsonld") as provenance_file: self.document.serialize(provenance_file, format="rdf", rdf_format="json-ld") prov_ids.append(self.provenance_ns[filename + ".jsonld"]) _logger.debug("[provenance] added provenance: %s", prov_ids) return prov_ids
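Outside the research-object plumbing, the fan-out in finalize_prov_profile is repeated serialize() calls on the same document. A minimal sketch with hypothetical filenames; the Turtle output needs prov's rdf extra (rdflib):

from prov.model import ProvDocument

doc = ProvDocument()
doc.add_namespace("ex", "http://example.org/")            # hypothetical namespace
doc.entity("ex:artifact")

doc.serialize("primary.cwlprov.json", format="json", indent=2)            # PROV-JSON
doc.serialize("primary.cwlprov.ttl", format="rdf", rdf_format="turtle")   # PROV-O Turtle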
class NIDMExporter(): """ Generic class to parse a result directory to extract the pieces of information to be stored in NIDM-Results and to generate a NIDM-Results export. """ def __init__(self, version, out_dir, zipped=True): out_dirname = os.path.basename(out_dir) out_path = os.path.dirname(out_dir) # Create output path from output name self.zipped = zipped if not self.zipped: out_dirname = out_dirname+".nidm" else: out_dirname = out_dirname+".nidm.zip" out_dir = os.path.join(out_path, out_dirname) # Quit if output path already exists and user doesn't want to overwrite # it if os.path.exists(out_dir): msg = out_dir+" already exists, overwrite?" if not input("%s (y/N) " % msg).lower() == 'y': quit("Bye.") if os.path.isdir(out_dir): shutil.rmtree(out_dir) else: os.remove(out_dir) self.out_dir = out_dir if version == "dev": self.version = {'major': 10000, 'minor': 0, 'revision': 0, 'num': version} else: major, minor, revision = version.split(".") if "-rc" in revision: revision, rc = revision.split("-rc") else: rc = -1 self.version = {'major': int(major), 'minor': int(minor), 'revision': int(revision), 'rc': int(rc), 'num': version} # Initialise prov document self.doc = ProvDocument() self._add_namespaces() # A temp directory that will contain the exported data self.export_dir = tempfile.mkdtemp(prefix="nidm-", dir=out_path) self.prepend_path = '' def parse(self): """ Parse a result directory to extract the pieces information to be stored in NIDM-Results. """ try: # Methods: find_software, find_model_fitting, find_contrasts and # find_inferences should be defined in the children classes and # return a list of NIDM Objects as specified in the objects module # Object of type Software describing the neuroimaging software # package used for the analysis self.software = self._find_software() # List of objects of type ModelFitting describing the # model fitting step in NIDM-Results (main activity: Model # Parameters Estimation) self.model_fittings = self._find_model_fitting() # Dictionary of (key, value) pairs where where key is a tuple # containing the identifier of a ModelParametersEstimation object # and a tuple of identifiers of ParameterEstimateMap objects and # value is an object of type Contrast describing the contrast # estimation step in NIDM-Results (main activity: Contrast # Estimation) self.contrasts = self._find_contrasts() # Inference activity and entities # Dictionary of (key, value) pairs where key is the identifier of a # ContrastEstimation object and value is an object of type # Inference describing the inference step in NIDM-Results (main # activity: Inference) self.inferences = self._find_inferences() except Exception: self.cleanup() raise def cleanup(self): if os.path.isdir(self.export_dir): shutil.rmtree(self.export_dir) def add_object(self, nidm_object, export_file=True): """ Add a NIDMObject to a NIDM-Results export. 
""" if not export_file: export_dir = None else: export_dir = self.export_dir if not isinstance(nidm_object, NIDMFile): nidm_object.export(self.version, export_dir) else: nidm_object.export(self.version, export_dir, self.prepend_path) # ProvDocument: add object to the bundle if nidm_object.prov_type == PROV['Activity']: self.bundle.activity(nidm_object.id, other_attributes=nidm_object.attributes) elif nidm_object.prov_type == PROV['Entity']: self.bundle.entity(nidm_object.id, other_attributes=nidm_object.attributes) elif nidm_object.prov_type == PROV['Agent']: self.bundle.agent(nidm_object.id, other_attributes=nidm_object.attributes) # self.bundle.update(nidm_object.p) def export(self): """ Generate a NIDM-Results export. """ try: if not os.path.isdir(self.export_dir): os.mkdir(self.export_dir) # Initialise main bundle self._create_bundle(self.version) self.add_object(self.software) # Add model fitting steps if not isinstance(self.model_fittings, list): self.model_fittings = list(self.model_fittings.values()) for model_fitting in self.model_fittings: # Design Matrix # model_fitting.activity.used(model_fitting.design_matrix) self.bundle.used(model_fitting.activity.id, model_fitting.design_matrix.id) self.add_object(model_fitting.design_matrix) # *** Export visualisation of the design matrix self.add_object(model_fitting.design_matrix.image) if model_fitting.design_matrix.image.file is not None: self.add_object(model_fitting.design_matrix.image.file) if model_fitting.design_matrix.hrf_models is not None: # drift model self.add_object(model_fitting.design_matrix.drift_model) if self.version['major'] > 1 or \ (self.version['major'] == 1 and self.version['minor'] >= 3): # Machine # model_fitting.data.wasAttributedTo(model_fitting.machine) self.bundle.wasAttributedTo(model_fitting.data.id, model_fitting.machine.id) self.add_object(model_fitting.machine) # Imaged subject or group(s) for sub in model_fitting.subjects: self.add_object(sub) # model_fitting.data.wasAttributedTo(sub) self.bundle.wasAttributedTo(model_fitting.data.id, sub.id) # Data # model_fitting.activity.used(model_fitting.data) self.bundle.used(model_fitting.activity.id, model_fitting.data.id) self.add_object(model_fitting.data) # Error Model # model_fitting.activity.used(model_fitting.error_model) self.bundle.used(model_fitting.activity.id, model_fitting.error_model.id) self.add_object(model_fitting.error_model) # Parameter Estimate Maps for param_estimate in model_fitting.param_estimates: # param_estimate.wasGeneratedBy(model_fitting.activity) self.bundle.wasGeneratedBy(param_estimate.id, model_fitting.activity.id) self.add_object(param_estimate) self.add_object(param_estimate.coord_space) self.add_object(param_estimate.file) if param_estimate.derfrom is not None: self.bundle.wasDerivedFrom(param_estimate.id, param_estimate.derfrom.id) self.add_object(param_estimate.derfrom) self.add_object(param_estimate.derfrom.file, export_file=False) # Residual Mean Squares Map # model_fitting.rms_map.wasGeneratedBy(model_fitting.activity) self.add_object(model_fitting.rms_map) self.bundle.wasGeneratedBy(model_fitting.rms_map.id, model_fitting.activity.id) self.add_object(model_fitting.rms_map.coord_space) self.add_object(model_fitting.rms_map.file) if model_fitting.rms_map.derfrom is not None: self.bundle.wasDerivedFrom( model_fitting.rms_map.id, model_fitting.rms_map.derfrom.id) self.add_object(model_fitting.rms_map.derfrom) self.add_object(model_fitting.rms_map.derfrom.file, export_file=False) # Resels per Voxel Map if model_fitting.rpv_map 
is not None: self.add_object(model_fitting.rpv_map) self.bundle.wasGeneratedBy(model_fitting.rpv_map.id, model_fitting.activity.id) self.add_object(model_fitting.rpv_map.coord_space) self.add_object(model_fitting.rpv_map.file) if model_fitting.rpv_map.inf_id is not None: self.bundle.used(model_fitting.rpv_map.inf_id, model_fitting.rpv_map.id) if model_fitting.rpv_map.derfrom is not None: self.bundle.wasDerivedFrom( model_fitting.rpv_map.id, model_fitting.rpv_map.derfrom.id) self.add_object(model_fitting.rpv_map.derfrom) self.add_object(model_fitting.rpv_map.derfrom.file, export_file=False) # Mask # model_fitting.mask_map.wasGeneratedBy(model_fitting.activity) self.bundle.wasGeneratedBy(model_fitting.mask_map.id, model_fitting.activity.id) self.add_object(model_fitting.mask_map) if model_fitting.mask_map.derfrom is not None: self.bundle.wasDerivedFrom( model_fitting.mask_map.id, model_fitting.mask_map.derfrom.id) self.add_object(model_fitting.mask_map.derfrom) self.add_object(model_fitting.mask_map.derfrom.file, export_file=False) # Create coordinate space export self.add_object(model_fitting.mask_map.coord_space) # Create "Mask map" entity self.add_object(model_fitting.mask_map.file) # Grand Mean map # model_fitting.grand_mean_map.wasGeneratedBy(model_fitting.activity) self.bundle.wasGeneratedBy(model_fitting.grand_mean_map.id, model_fitting.activity.id) self.add_object(model_fitting.grand_mean_map) # Coordinate space entity self.add_object(model_fitting.grand_mean_map.coord_space) # Grand Mean Map entity self.add_object(model_fitting.grand_mean_map.file) # Model Parameters Estimation activity self.add_object(model_fitting.activity) self.bundle.wasAssociatedWith(model_fitting.activity.id, self.software.id) # model_fitting.activity.wasAssociatedWith(self.software) # self.add_object(model_fitting) # Add contrast estimation steps analysis_masks = dict() for (model_fitting_id, pe_ids), contrasts in list( self.contrasts.items()): for contrast in contrasts: model_fitting = self._get_model_fitting(model_fitting_id) # for contrast in contrasts: # contrast.estimation.used(model_fitting.rms_map) self.bundle.used(contrast.estimation.id, model_fitting.rms_map.id) # contrast.estimation.used(model_fitting.mask_map) self.bundle.used(contrast.estimation.id, model_fitting.mask_map.id) analysis_masks[contrast.estimation.id] = \ model_fitting.mask_map.id self.bundle.used(contrast.estimation.id, contrast.weights.id) self.bundle.used(contrast.estimation.id, model_fitting.design_matrix.id) # contrast.estimation.wasAssociatedWith(self.software) self.bundle.wasAssociatedWith(contrast.estimation.id, self.software.id) for pe_id in pe_ids: # contrast.estimation.used(pe_id) self.bundle.used(contrast.estimation.id, pe_id) # Create estimation activity self.add_object(contrast.estimation) # Create contrast weights self.add_object(contrast.weights) if contrast.contrast_map is not None: # Create contrast Map # contrast.contrast_map.wasGeneratedBy(contrast.estimation) self.bundle.wasGeneratedBy(contrast.contrast_map.id, contrast.estimation.id) self.add_object(contrast.contrast_map) self.add_object(contrast.contrast_map.coord_space) # Copy contrast map in export directory self.add_object(contrast.contrast_map.file) if contrast.contrast_map.derfrom is not None: self.bundle.wasDerivedFrom( contrast.contrast_map.id, contrast.contrast_map.derfrom.id) self.add_object(contrast.contrast_map.derfrom) self.add_object(contrast.contrast_map.derfrom.file, export_file=False) # Create Std Err. Map (T-tests) or Explained Mean Sq. 
                    # Create Std Err. Map (T-tests) or Explained Mean Sq. Map
                    # (F-tests)
                    # contrast.stderr_or_expl_mean_sq_map.wasGeneratedBy
                    # (contrast.estimation)
                    stderr_explmeansq_map = (
                        contrast.stderr_or_expl_mean_sq_map)
                    self.bundle.wasGeneratedBy(
                        stderr_explmeansq_map.id, contrast.estimation.id)
                    self.add_object(stderr_explmeansq_map)
                    self.add_object(stderr_explmeansq_map.coord_space)
                    if isinstance(stderr_explmeansq_map, ContrastStdErrMap) \
                            and stderr_explmeansq_map.contrast_var:
                        self.add_object(stderr_explmeansq_map.contrast_var)
                        if stderr_explmeansq_map.var_coord_space:
                            self.add_object(
                                stderr_explmeansq_map.var_coord_space)
                        if stderr_explmeansq_map.contrast_var.coord_space:
                            self.add_object(
                                stderr_explmeansq_map.contrast_var.coord_space)
                        self.add_object(
                            stderr_explmeansq_map.contrast_var.file,
                            export_file=False)
                        self.bundle.wasDerivedFrom(
                            stderr_explmeansq_map.id,
                            stderr_explmeansq_map.contrast_var.id)
                    self.add_object(stderr_explmeansq_map.file)

                    # Create Statistic Map
                    # contrast.stat_map.wasGeneratedBy(contrast.estimation)
                    self.bundle.wasGeneratedBy(contrast.stat_map.id,
                                               contrast.estimation.id)
                    self.add_object(contrast.stat_map)
                    self.add_object(contrast.stat_map.coord_space)
                    # Copy Statistical map in export directory
                    self.add_object(contrast.stat_map.file)
                    if contrast.stat_map.derfrom is not None:
                        self.bundle.wasDerivedFrom(
                            contrast.stat_map.id,
                            contrast.stat_map.derfrom.id)
                        self.add_object(contrast.stat_map.derfrom)
                        self.add_object(contrast.stat_map.derfrom.file,
                                        export_file=False)

                    # Create Z Statistic Map
                    if contrast.z_stat_map:
                        # contrast.z_stat_map.wasGeneratedBy(contrast.estimation)
                        self.bundle.wasGeneratedBy(contrast.z_stat_map.id,
                                                   contrast.estimation.id)
                        self.add_object(contrast.z_stat_map)
                        self.add_object(contrast.z_stat_map.coord_space)
                        # Copy Statistical map in export directory
                        self.add_object(contrast.z_stat_map.file)

                    # self.add_object(contrast)

            # Add inference steps
            for contrast_id, inferences in list(self.inferences.items()):
                contrast = self._get_contrast(contrast_id)

                for inference in inferences:
                    if contrast.z_stat_map:
                        used_id = contrast.z_stat_map.id
                    else:
                        used_id = contrast.stat_map.id
                    # inference.inference_act.used(used_id)
                    self.bundle.used(inference.inference_act.id, used_id)
                    # inference.inference_act.wasAssociatedWith(self.software)
                    self.bundle.wasAssociatedWith(inference.inference_act.id,
                                                  self.software.id)
                    # self.add_object(inference)

                    # Excursion set
                    # inference.excursion_set.wasGeneratedBy(inference.inference_act)
                    self.bundle.wasGeneratedBy(inference.excursion_set.id,
                                               inference.inference_act.id)
                    self.add_object(inference.excursion_set)
                    self.add_object(inference.excursion_set.coord_space)
                    if inference.excursion_set.visu is not None:
                        self.add_object(inference.excursion_set.visu)
                        if inference.excursion_set.visu.file is not None:
                            self.add_object(inference.excursion_set.visu.file)
                    # Copy "Excursion set map" file in export directory
                    self.add_object(inference.excursion_set.file)
                    if inference.excursion_set.clust_map is not None:
                        self.add_object(inference.excursion_set.clust_map)
                        self.add_object(inference.excursion_set.clust_map.file)
                        self.add_object(
                            inference.excursion_set.clust_map.coord_space)
                    if inference.excursion_set.mip is not None:
                        self.add_object(inference.excursion_set.mip)
                        self.add_object(inference.excursion_set.mip.file)

                    # Height threshold
                    if inference.height_thresh.equiv_thresh is not None:
                        for equiv in inference.height_thresh.equiv_thresh:
                            self.add_object(equiv)
                    self.add_object(inference.height_thresh)
                    # Extent threshold
                    if inference.extent_thresh.equiv_thresh is not None:
                        for equiv in inference.extent_thresh.equiv_thresh:
                            self.add_object(equiv)
                    self.add_object(inference.extent_thresh)

                    # Display Mask (potentially more than 1)
                    if inference.disp_mask:
                        for mask in inference.disp_mask:
                            # inference.inference_act.used(mask)
                            self.bundle.used(inference.inference_act.id,
                                             mask.id)
                            self.add_object(mask)
                            # Create coordinate space entity
                            self.add_object(mask.coord_space)
                            # Create "Display Mask Map" entity
                            self.add_object(mask.file)
                            if mask.derfrom is not None:
                                self.bundle.wasDerivedFrom(mask.id,
                                                           mask.derfrom.id)
                                self.add_object(mask.derfrom)
                                self.add_object(mask.derfrom.file,
                                                export_file=False)

                    # Search Space
                    # inference.search_space.wasGeneratedBy(inference.inference_act)
                    self.bundle.wasGeneratedBy(inference.search_space.id,
                                               inference.inference_act.id)
                    self.add_object(inference.search_space)
                    self.add_object(inference.search_space.coord_space)
                    # Copy "Mask map" in export directory
                    self.add_object(inference.search_space.file)

                    # Peak Definition
                    if inference.peak_criteria:
                        # inference.inference_act.used(inference.peak_criteria)
                        self.bundle.used(inference.inference_act.id,
                                         inference.peak_criteria.id)
                        self.add_object(inference.peak_criteria)

                    # Cluster Definition
                    if inference.cluster_criteria:
                        # inference.inference_act.used(inference.cluster_criteria)
                        self.bundle.used(inference.inference_act.id,
                                         inference.cluster_criteria.id)
                        self.add_object(inference.cluster_criteria)

                    if inference.clusters:
                        # Clusters and peaks
                        for cluster in inference.clusters:
                            # cluster.wasDerivedFrom(inference.excursion_set)
                            self.bundle.wasDerivedFrom(
                                cluster.id, inference.excursion_set.id)
                            self.add_object(cluster)
                            for peak in cluster.peaks:
                                self.bundle.wasDerivedFrom(peak.id, cluster.id)
                                self.add_object(peak)
                                self.add_object(peak.coordinate)
                            if cluster.cog is not None:
                                self.bundle.wasDerivedFrom(cluster.cog.id,
                                                           cluster.id)
                                self.add_object(cluster.cog)
                                self.add_object(cluster.cog.coordinate)

                    # Inference activity
                    # inference.inference_act.wasAssociatedWith(inference.software_id)
                    # inference.inference_act.used(inference.height_thresh)
                    self.bundle.used(inference.inference_act.id,
                                     inference.height_thresh.id)
                    # inference.inference_act.used(inference.extent_thresh)
                    self.bundle.used(inference.inference_act.id,
                                     inference.extent_thresh.id)
                    self.bundle.used(inference.inference_act.id,
                                     analysis_masks[contrast.estimation.id])
                    self.add_object(inference.inference_act)

            # Write-out prov file
            self.save_prov_to_files()

            return self.out_dir
        except Exception:
            self.cleanup()
            raise

    def _get_model_fitting(self, mf_id):
        """
        Retrieve the model fitting with identifier 'mf_id' from the list of
        model fitting objects stored in self.model_fittings.
        """
        for model_fitting in self.model_fittings:
            if model_fitting.activity.id == mf_id:
                return model_fitting

        raise Exception("Model fitting activity with id: " + str(mf_id) +
                        " not found.")

    def _get_contrast(self, con_id):
        """
        Retrieve the contrast with identifier 'con_id' from the list of
        contrast objects stored in self.contrasts.
        """
        for contrasts in list(self.contrasts.values()):
            for contrast in contrasts:
                if contrast.estimation.id == con_id:
                    return contrast
        raise Exception("Contrast activity with id: " + str(con_id) +
                        " not found.")

    def _add_namespaces(self):
        """
        Add namespaces to the NIDM document.
""" self.doc.add_namespace(NIDM) self.doc.add_namespace(NIIRI) self.doc.add_namespace(CRYPTO) self.doc.add_namespace(DCT) self.doc.add_namespace(DC) self.doc.add_namespace(NFO) self.doc.add_namespace(OBO) self.doc.add_namespace(SCR) self.doc.add_namespace(NIF) def _create_bundle(self, version): """ Initialise NIDM-Results bundle. """ # *** Bundle entity if not hasattr(self, 'bundle_ent'): self.bundle_ent = NIDMResultsBundle(nidm_version=version['num']) self.bundle = ProvBundle(identifier=self.bundle_ent.id) self.bundle_ent.export(self.version, self.export_dir) # # provn export # self.bundle = ProvBundle(identifier=bundle_id) self.doc.entity(self.bundle_ent.id, other_attributes=self.bundle_ent.attributes) # *** NIDM-Results Export Activity if version['num'] not in ["1.0.0", "1.1.0"]: if not hasattr(self, 'export_act'): self.export_act = NIDMResultsExport() self.export_act.export(self.version, self.export_dir) # self.doc.update(self.export_act.p) self.doc.activity(self.export_act.id, other_attributes=self.export_act.attributes) # *** bundle was Generated by NIDM-Results Export Activity if not hasattr(self, 'export_time'): self.export_time = str(datetime.datetime.now().time()) if version['num'] in ["1.0.0", "1.1.0"]: self.doc.wasGeneratedBy(entity=self.bundle_ent.id, time=self.export_time) else: # provn self.doc.wasGeneratedBy( entity=self.bundle_ent.id, activity=self.export_act.id, time=self.export_time) # *** NIDM-Results Exporter (Software Agent) if version['num'] not in ["1.0.0", "1.1.0"]: if not hasattr(self, 'exporter'): self.exporter = self._get_exporter() self.exporter.export(self.version, self.export_dir) # self.doc.update(self.exporter.p) self.doc.agent(self.exporter.id, other_attributes=self.exporter.attributes) self.doc.wasAssociatedWith(self.export_act.id, self.exporter.id) def _get_model_parameters_estimations(self, error_model): """ Infer model estimation method from the 'error_model'. Return an object of type ModelParametersEstimation. """ if error_model.dependance == NIDM_INDEPEDENT_ERROR: if error_model.variance_homo: estimation_method = STATO_OLS else: estimation_method = STATO_WLS else: estimation_method = STATO_GLS mpe = ModelParametersEstimation(estimation_method, self.software.id) return mpe def use_prefixes(self, ttl): prefix_file = os.path.join(os.path.dirname(__file__), 'prefixes.csv') context = dict() with open(prefix_file, encoding="ascii") as csvfile: reader = csv.reader(csvfile) next(reader, None) # skip the headers for alphanum_id, prefix, uri in reader: if alphanum_id in ttl: context[prefix] = uri ttl = "@prefix " + prefix + ": <" + uri + "> .\n" + ttl ttl = ttl.replace(alphanum_id, prefix + ":") if uri in ttl: ttl = ttl.replace(alphanum_id, prefix + ":") elif uri in ttl: context[prefix] = uri ttl = "@prefix " + prefix + ": <" + uri + "> .\n" + ttl ttl = ttl.replace(alphanum_id, prefix + ":") return (ttl, context) def save_prov_to_files(self, showattributes=False): """ Write-out provn serialisation to nidm.provn. 
""" self.doc.add_bundle(self.bundle) # provn_file = os.path.join(self.export_dir, 'nidm.provn') # provn_fid = open(provn_file, 'w') # # FIXME None # # provn_fid.write(self.doc.get_provn(4).replace("None", "-")) # provn_fid.close() ttl_file = os.path.join(self.export_dir, 'nidm.ttl') ttl_txt = self.doc.serialize(format='rdf', rdf_format='turtle') ttl_txt, json_context = self.use_prefixes(ttl_txt) # Add namespaces to json-ld context for namespace in self.doc._namespaces.get_registered_namespaces(): json_context[namespace._prefix] = namespace._uri for namespace in \ list(self.doc._namespaces._default_namespaces.values()): json_context[namespace._prefix] = namespace._uri json_context["xsd"] = "http://www.w3.org/2000/01/rdf-schema#" # Work-around to issue with INF value in rdflib (reported in # https://github.com/RDFLib/rdflib/pull/655) ttl_txt = ttl_txt.replace(' inf ', ' "INF"^^xsd:float ') with open(ttl_file, 'w') as ttl_fid: ttl_fid.write(ttl_txt) # print(json_context) jsonld_file = os.path.join(self.export_dir, 'nidm.json') jsonld_txt = self.doc.serialize(format='rdf', rdf_format='json-ld', context=json_context) with open(jsonld_file, 'w') as jsonld_fid: jsonld_fid.write(jsonld_txt) # provjsonld_file = os.path.join(self.export_dir, 'nidm.provjsonld') # provjsonld_txt = self.doc.serialize(format='jsonld') # with open(provjsonld_file, 'w') as provjsonld_fid: # provjsonld_fid.write(provjsonld_txt) # provn_file = os.path.join(self.export_dir, 'nidm.provn') # provn_txt = self.doc.serialize(format='provn') # with open(provn_file, 'w') as provn_fid: # provn_fid.write(provn_txt) # Post-processing if not self.zipped: # Just rename temp directory to output_path os.rename(self.export_dir, self.out_dir) else: # Create a zip file that contains the content of the temp directory os.chdir(self.export_dir) zf = zipfile.ZipFile(os.path.join("..", self.out_dir), mode='w') try: for root, dirnames, filenames in os.walk("."): for filename in filenames: zf.write(os.path.join(filename)) finally: zf.close() # Need to move up before deleting the folder os.chdir("..") shutil.rmtree(os.path.join("..", self.export_dir))