def bundles2():
    """Build the ``bundles2.provn`` example document from ProvToolbox.

    Example source:
    https://github.com/lucmoreau/ProvToolbox/blob/master/prov-n/src/test/resources/prov/bundles2.provn

    The document describes two bundles (``bob:bundle4`` and
    ``alice:bundle5``) at the top level and then fills each bundle with its
    report entities.

    NOTE: the PROV-N source declares ``ex`` as
    ``http://example.org/example/``; this function registers
    ``http://www.example.com/`` for that prefix.

    :return: the populated ProvDocument.
    """
    document = ProvDocument()

    # Prefix declarations, in the order used by the example.
    for prefix, uri in (
        ("ex", "http://www.example.com/"),
        ("alice", "http://example.org/alice/"),
        ("bob", "http://example.org/bob/"),
    ):
        document.add_namespace(prefix, uri)

    # Top-level description of each bundle: entity, generation, agent and
    # attribution -- first for Bob's bundle, then for Alice's.
    bundle_descriptions = (
        ("bob:bundle4", "2012-05-24T10:30:00", "ex:Bob"),
        ("alice:bundle5", "2012-05-25T11:15:00", "ex:Alice"),
    )
    for bundle_id, generated_at, owner in bundle_descriptions:
        document.entity(bundle_id, {"prov:type": PROV["Bundle"]})
        document.wasGeneratedBy(bundle_id, time=generated_at)
        document.agent(owner)
        document.wasAttributedTo(bundle_id, owner)

    # bundle bob:bundle4
    bob_bundle = document.bundle("bob:bundle4")
    bob_bundle.entity("ex:report1", {"prov:type": "report", "ex:version": 1})
    bob_bundle.wasGeneratedBy("ex:report1", time="2012-05-24T10:00:01")

    # bundle alice:bundle5
    alice_bundle = document.bundle("alice:bundle5")
    alice_bundle.entity("ex:report1bis")
    alice_bundle.mentionOf("ex:report1bis", "ex:report1", "bob:bundle4")
    alice_bundle.entity(
        "ex:report2", [("prov:type", "report"), ("ex:version", 2)]
    )
    alice_bundle.wasGeneratedBy("ex:report2", time="2012-05-25T11:00:01")
    alice_bundle.wasDerivedFrom("ex:report2", "ex:report1bis")

    return document
def bundles2():
    """Build the ``bundles2.provn`` example document from ProvToolbox.

    Example source:
    https://github.com/lucmoreau/ProvToolbox/blob/master/prov-n/src/test/resources/prov/bundles2.provn

    The document describes two bundles (``bob:bundle4`` and
    ``alice:bundle5``) at the top level and then fills each bundle with its
    report entities.

    NOTE: the PROV-N source declares ``ex`` as
    ``http://example.org/example/``; this function registers
    ``http://www.example.com/`` for that prefix.

    :return: the populated ProvDocument.
    """
    document = ProvDocument()

    # Prefix declarations, in the order used by the example.
    for prefix, uri in (
        ("ex", "http://www.example.com/"),
        ("alice", "http://example.org/alice/"),
        ("bob", "http://example.org/bob/"),
    ):
        document.add_namespace(prefix, uri)

    # Top-level description of each bundle: entity, generation, agent and
    # attribution -- first for Bob's bundle, then for Alice's.
    bundle_descriptions = (
        ("bob:bundle4", "2012-05-24T10:30:00", "ex:Bob"),
        ("alice:bundle5", "2012-05-25T11:15:00", "ex:Alice"),
    )
    for bundle_id, generated_at, owner in bundle_descriptions:
        document.entity(bundle_id, {"prov:type": PROV["Bundle"]})
        document.wasGeneratedBy(bundle_id, time=generated_at)
        document.agent(owner)
        document.wasAttributedTo(bundle_id, owner)

    # bundle bob:bundle4
    bob_bundle = document.bundle("bob:bundle4")
    bob_bundle.entity("ex:report1", {"prov:type": "report", "ex:version": 1})
    bob_bundle.wasGeneratedBy("ex:report1", time="2012-05-24T10:00:01")

    # bundle alice:bundle5
    alice_bundle = document.bundle("alice:bundle5")
    alice_bundle.entity("ex:report1bis")
    alice_bundle.mentionOf("ex:report1bis", "ex:report1", "bob:bundle4")
    alice_bundle.entity(
        "ex:report2", [("prov:type", "report"), ("ex:version", 2)]
    )
    alice_bundle.wasGeneratedBy("ex:report2", time="2012-05-25T11:00:01")
    alice_bundle.wasDerivedFrom("ex:report2", "ex:report1bis")

    return document
def bundles2():
    """Build the ``bundles2.provn`` example document from ProvToolbox.

    Example source:
    https://github.com/lucmoreau/ProvToolbox/blob/master/prov-n/src/test/resources/prov/bundles2.provn

    The document describes two bundles (``bob:bundle4`` and
    ``alice:bundle5``) at the top level and then fills each bundle with its
    report entities.

    NOTE: the PROV-N source declares ``ex`` as
    ``http://example.org/example/``; this function registers
    ``http://www.example.com/`` for that prefix.

    :return: the populated ProvDocument.
    """
    document = ProvDocument()

    # Prefix declarations, in the order used by the example.
    for prefix, uri in (
        ("ex", "http://www.example.com/"),
        ("alice", "http://example.org/alice/"),
        ("bob", "http://example.org/bob/"),
    ):
        document.add_namespace(prefix, uri)

    # Top-level description of each bundle: entity, generation, agent and
    # attribution -- first for Bob's bundle, then for Alice's.
    bundle_descriptions = (
        ("bob:bundle4", "2012-05-24T10:30:00", "ex:Bob"),
        ("alice:bundle5", "2012-05-25T11:15:00", "ex:Alice"),
    )
    for bundle_id, generated_at, owner in bundle_descriptions:
        document.entity(bundle_id, {"prov:type": PROV["Bundle"]})
        document.wasGeneratedBy(bundle_id, time=generated_at)
        document.agent(owner)
        document.wasAttributedTo(bundle_id, owner)

    # bundle bob:bundle4
    bob_bundle = document.bundle("bob:bundle4")
    bob_bundle.entity("ex:report1", {"prov:type": "report", "ex:version": 1})
    bob_bundle.wasGeneratedBy("ex:report1", time="2012-05-24T10:00:01")

    # bundle alice:bundle5
    alice_bundle = document.bundle("alice:bundle5")
    alice_bundle.entity("ex:report1bis")
    alice_bundle.mentionOf("ex:report1bis", "ex:report1", "bob:bundle4")
    alice_bundle.entity(
        "ex:report2", [("prov:type", "report"), ("ex:version", 2)]
    )
    alice_bundle.wasGeneratedBy("ex:report2", time="2012-05-25T11:00:01")
    alice_bundle.wasDerivedFrom("ex:report2", "ex:report1bis")

    return document
def bundles1():
    """Build the ``bundles1.provn`` example document from ProvToolbox.

    Example source:
    https://github.com/lucmoreau/ProvToolbox/blob/master/prov-n/src/test/resources/prov/bundles1.provn

    The document describes two bundles (``bob:bundle1`` and
    ``alice:bundle2``) at the top level and then fills each bundle with its
    report entities.

    NOTE: the PROV-N source declares ``ex`` as
    ``http://example.org/example/``; this function registers
    ``http://www.example.com/`` for that prefix.

    :return: the populated ProvDocument.
    """
    document = ProvDocument()

    # The "ex" prefix is registered through a Namespace object, the other
    # two through (prefix, uri) pairs.
    document.add_namespace(Namespace("ex", "http://www.example.com/"))
    document.add_namespace("alice", "http://example.org/alice/")
    document.add_namespace("bob", "http://example.org/bob/")

    # Top-level description of each bundle: entity, generation, agent and
    # attribution -- first for Bob's bundle, then for Alice's.
    bundle_descriptions = (
        ("bob:bundle1", "2012-05-24T10:30:00", "ex:Bob"),
        ("alice:bundle2", "2012-05-25T11:15:00", "ex:Alice"),
    )
    for bundle_id, generated_at, owner in bundle_descriptions:
        document.entity(bundle_id, {"prov:type": PROV["Bundle"]})
        document.wasGeneratedBy(bundle_id, time=generated_at)
        document.agent(owner)
        document.wasAttributedTo(bundle_id, owner)

    # bundle bob:bundle1
    bob_bundle = document.bundle("bob:bundle1")
    bob_bundle.entity("ex:report1", {"prov:type": "report", "ex:version": 1})
    bob_bundle.wasGeneratedBy("ex:report1", time="2012-05-24T10:00:01")

    # bundle alice:bundle2
    alice_bundle = document.bundle("alice:bundle2")
    alice_bundle.entity("ex:report1")
    alice_bundle.entity("ex:report2", {"prov:type": "report", "ex:version": 2})
    alice_bundle.wasGeneratedBy("ex:report2", time="2012-05-25T11:00:01")
    alice_bundle.wasDerivedFrom("ex:report2", "ex:report1")

    return document
def bundles1():
    """Build the ``bundles1.provn`` example document from ProvToolbox.

    Example source:
    https://github.com/lucmoreau/ProvToolbox/blob/master/prov-n/src/test/resources/prov/bundles1.provn

    The document describes two bundles (``bob:bundle1`` and
    ``alice:bundle2``) at the top level and then fills each bundle with its
    report entities.

    NOTE: the PROV-N source declares ``ex`` as
    ``http://example.org/example/``; this function registers
    ``http://www.example.com/`` for that prefix.

    :return: the populated ProvDocument.
    """
    document = ProvDocument()

    # The "ex" prefix is registered through a Namespace object, the other
    # two through (prefix, uri) pairs.
    document.add_namespace(Namespace("ex", "http://www.example.com/"))
    document.add_namespace("alice", "http://example.org/alice/")
    document.add_namespace("bob", "http://example.org/bob/")

    # Top-level description of each bundle: entity, generation, agent and
    # attribution -- first for Bob's bundle, then for Alice's.
    bundle_descriptions = (
        ("bob:bundle1", "2012-05-24T10:30:00", "ex:Bob"),
        ("alice:bundle2", "2012-05-25T11:15:00", "ex:Alice"),
    )
    for bundle_id, generated_at, owner in bundle_descriptions:
        document.entity(bundle_id, {"prov:type": PROV["Bundle"]})
        document.wasGeneratedBy(bundle_id, time=generated_at)
        document.agent(owner)
        document.wasAttributedTo(bundle_id, owner)

    # bundle bob:bundle1
    bob_bundle = document.bundle("bob:bundle1")
    bob_bundle.entity("ex:report1", {"prov:type": "report", "ex:version": 1})
    bob_bundle.wasGeneratedBy("ex:report1", time="2012-05-24T10:00:01")

    # bundle alice:bundle2
    alice_bundle = document.bundle("alice:bundle2")
    alice_bundle.entity("ex:report1")
    alice_bundle.entity("ex:report2", {"prov:type": "report", "ex:version": 2})
    alice_bundle.wasGeneratedBy("ex:report2", time="2012-05-25T11:00:01")
    alice_bundle.wasDerivedFrom("ex:report2", "ex:report1")

    return document
def test_bundle_update_simple(self):
    """Check ``ProvBundle.update`` argument validation and record merging.

    Updating with an integer or with the owning document must raise
    ProvException; updating with another bundle leaves two records in the
    target bundle.
    """
    document = ProvDocument()
    document.set_default_namespace(EX_URI)

    target = document.bundle("b1")
    target.entity("e")
    source = document.bundle("b2")
    source.entity("e")

    # Neither a plain value nor the document is accepted by update().
    for invalid_argument in (1, document):
        with self.assertRaises(ProvException):
            target.update(invalid_argument)

    target.update(source)
    self.assertEqual(len(target.get_records()), 2)
def test_bundle_update_simple(self):
    """Check ``ProvBundle.update`` argument validation and record merging.

    Updating with an integer or with the owning document must raise
    ProvException; updating with another bundle leaves two records in the
    target bundle.
    """
    document = ProvDocument()
    document.set_default_namespace(EX_URI)

    target = document.bundle("b1")
    target.entity("e")
    source = document.bundle("b2")
    source.entity("e")

    # Neither a plain value nor the document is accepted by update().
    for invalid_argument in (1, document):
        with self.assertRaises(ProvException):
            target.update(invalid_argument)

    target.update(source)
    self.assertEqual(len(target.get_records()), 2)
def test_default_namespace_inheritance(self):
    """An entity in a bundle gets an identifier from the document's default namespace.

    Only the document declares a default namespace; the bundle and its
    entity use unprefixed names. The document is then handed to
    ``self.do_tests``.
    """
    document = ProvDocument()
    document.set_default_namespace("http://www.example.org/")

    entity = document.bundle("bundle").entity("e1")

    self.assertIsNotNone(entity.identifier, "e1's identifier is None!")
    self.do_tests(document)
def test_namespace_inheritance(self):
    """An entity in a bundle can use a prefix registered on the document.

    The ``ex`` prefix is added to the document only; the bundle and its
    entity both use ``ex:`` names. The document is then handed to
    ``self.do_tests``.
    """
    document = ProvDocument()
    document.add_namespace("ex", "http://www.example.org/")

    entity = document.bundle("ex:bundle").entity("ex:e1")

    self.assertIsNotNone(entity.identifier, "e1's identifier is None!")
    self.do_tests(document)
def get_document_as_prov(self, document_id=None):
    """
    Get a ProvDocument from the database based on the document_id

    :param document_id: The id as a string value
    :raises InvalidArgumentTypeException: If document_id is not a string
        (this includes the default value ``None``)
    :return: ProvDocument
    """
    # isinstance (rather than an exact type comparison) also accepts
    # subclasses of str.
    if not isinstance(document_id, str):
        raise InvalidArgumentTypeException()

    raw_doc = self._adapter.get_document(document_id)

    # Rebuild the document-level records first ...
    prov_document = ProvDocument()
    for record in raw_doc.document.records:
        self._parse_record(prov_document, record)

    # ... then one ProvBundle per stored bundle.
    # NOTE(review): the "- 2" presumably accounts for a two-character
    # placeholder inside PROV_API_BUNDLE_IDENTIFIER_PREFIX -- confirm
    # against the constant's definition.
    prefix_length = len(PROV_API_BUNDLE_IDENTIFIER_PREFIX) - 2
    for bundle in raw_doc.bundles:
        prefixed_identifier = bundle.bundle_record.metadata[
            METADATA_KEY_IDENTIFIER]
        # remove prefix
        identifier = prefixed_identifier[prefix_length:]
        prov_bundle = prov_document.bundle(identifier=identifier)
        for record in bundle.records:
            self._parse_record(prov_bundle, record)

    return prov_document
def test_namespace_inheritance(self):
    """An entity in a bundle can use a prefix registered on the document.

    The ``ex`` prefix is added to the document only; the bundle and its
    entity both use ``ex:`` names. The document is then handed to
    ``self.do_tests``.
    """
    document = ProvDocument()
    document.add_namespace("ex", "http://www.example.org/")

    entity = document.bundle("ex:bundle").entity("ex:e1")

    self.assertIsNotNone(entity.identifier, "e1's identifier is None!")
    self.do_tests(document)
def test_default_namespace_inheritance(self):
    """An entity in a bundle gets an identifier from the document's default namespace.

    Only the document declares a default namespace; the bundle and its
    entity use unprefixed names. The document is then handed to
    ``self.assertRoundTripEquivalence``.
    """
    document = ProvDocument()
    document.set_default_namespace("http://www.example.org/")

    entity = document.bundle("bundle").entity("e1")

    self.assertIsNotNone(entity.identifier, "e1's identifier is None!")
    self.assertRoundTripEquivalence(document)
def document_with_n_bundles_having_default_namespace(n):
    """Create a document holding *n* bundles, each with its own default namespace.

    Bundles are named ``ex:bundle/1`` ... ``ex:bundle/n``; bundle *k* uses
    ``http://www.example.org/default/k`` as its default namespace and
    contains a single entity ``e``.

    :param n: number of bundles to create.
    :return: the populated ProvDocument.
    """
    document = ProvDocument()
    document.add_namespace("ex", "http://www.example.org/")

    for number in range(1, n + 1):
        suffix = str(number)
        numbered_bundle = document.bundle("ex:bundle/" + suffix)
        numbered_bundle.set_default_namespace(
            "http://www.example.org/default/" + suffix
        )
        numbered_bundle.entity("e")

    return document
def document_with_n_bundles_having_default_namespace(n):
    """Create a document holding *n* bundles, each with its own default namespace.

    Bundles are named ``ex:bundle/1`` ... ``ex:bundle/n``; bundle *k* uses
    ``http://www.example.org/default/k`` as its default namespace and
    contains a single entity ``e``.

    :param n: number of bundles to create.
    :return: the populated ProvDocument.
    """
    document = ProvDocument()
    document.add_namespace("ex", "http://www.example.org/")

    for number in range(1, n + 1):
        suffix = str(number)
        numbered_bundle = document.bundle("ex:bundle/" + suffix)
        numbered_bundle.set_default_namespace(
            "http://www.example.org/default/" + suffix
        )
        numbered_bundle.entity("e")

    return document
def document_with_n_bundles_having_default_namespace(n):
    """Create a document holding *n* bundles, each with its own default namespace.

    Bundles are named ``ex:bundle/1`` ... ``ex:bundle/n``; bundle *k* uses
    ``http://www.example.org/default/k`` as its default namespace and
    contains a single entity ``e``.

    :param n: number of bundles to create.
    :return: the populated ProvDocument.
    """
    document = ProvDocument()
    document.add_namespace("ex", "http://www.example.org/")

    for number in range(1, n + 1):
        suffix = str(number)
        numbered_bundle = document.bundle("ex:bundle/" + suffix)
        numbered_bundle.set_default_namespace(
            "http://www.example.org/default/" + suffix
        )
        numbered_bundle.entity("e")

    return document
def test_document_update_simple(self):
    """Check ``ProvDocument.update`` argument validation and merging.

    The first document has one entity and bundle ``b1``; the second has one
    entity and bundles ``b1`` and ``b2``. Updating with an integer must raise
    ProvException; after updating the first document with the second, the
    first is expected to report two records and two bundles.
    """
    first = ProvDocument()
    first.set_default_namespace(EX_URI)
    first.entity("e")
    first.bundle("b1").entity("e")

    second = ProvDocument()
    second.set_default_namespace(EX_URI)
    second.entity("e")
    for bundle_id in ("b1", "b2"):
        second.bundle(bundle_id).entity("e")

    with self.assertRaises(ProvException):
        first.update(1)

    first.update(second)
    self.assertEqual(len(first.get_records()), 2)
    self.assertEqual(len(first.bundles), 2)
def test_document_update_simple(self):
    """Check ``ProvDocument.update`` argument validation and merging.

    The first document has one entity and bundle ``b1``; the second has one
    entity and bundles ``b1`` and ``b2``. Updating with an integer must raise
    ProvException; after updating the first document with the second, the
    first is expected to report two records and two bundles.
    """
    first = ProvDocument()
    first.set_default_namespace(EX_URI)
    first.entity("e")
    first.bundle("b1").entity("e")

    second = ProvDocument()
    second.set_default_namespace(EX_URI)
    second.entity("e")
    for bundle_id in ("b1", "b2"):
        second.bundle(bundle_id).entity("e")

    with self.assertRaises(ProvException):
        first.update(1)

    first.update(second)
    self.assertEqual(len(first.get_records()), 2)
    self.assertEqual(len(first.bundles), 2)
def toW3Cprov(ling, bundl, format='w3c-prov-xml'):
    """Build a W3C PROV document from workflow-run records and lineage traces.

    :param ling: iterable of lineage trace dicts. The code reads the keys
        ``runId``, ``iterationId``, ``instanceId``, ``startTime``,
        ``endTime``, ``name``, ``parameters``, ``derivationIds`` and
        ``streams``; ``username`` and ``creator`` are used when present.
    :param bundl: iterable of run record dicts. ``username`` and ``type``
        are read for every record; ``_id`` and ``input`` are read for
        records whose ``type`` is ``'workflow_run'``. Such records are
        mutated in place (``runId`` is added, ``input`` is wrapped in a list).
    :param format: requested output format. Not used by the code visible
        here.

    NOTE(review): as visible here the function builds the document but
    neither returns nor serialises it -- confirm whether a trailing
    return/serialisation step is missing.
    """
    g = ProvDocument()
    # Namespaces used through ``vc[...]`` do not need to be explicitly added
    # to a document; they are registered on first use.
    vc = Namespace("knmi", "http://knmi.nl")
    g.add_namespace("dcterms", "http://purl.org/dc/terms/")

    # ---- specify bundle -------------------------------------------------
    bundle = None
    for trace in bundl:
        # specifying user
        g.agent(vc[trace["username"]],
                other_attributes={"dcterms:author": trace["username"]})

        if trace['type'] == 'workflow_run':
            trace.update({'runId': trace['_id']})
            bundle = g.bundle(vc[trace["runId"]])
            bundle.actedOnBehalfOf(vc[trace["runId"]], vc[trace["username"]])

            # Every field of the run except "input" becomes an attribute of
            # the document-level entity describing the bundle.
            dic = {}
            for key in trace:
                if key != "input":
                    if ':' in key:
                        dic.update({key: trace[key]})
                    else:
                        dic.update({vc[key]: trace[key]})
            dic.update({'prov:type': PROV['Bundle']})
            g.entity(vc[trace["runId"]], dic)

            # One entity per workflow input, used by the run.
            # NOTE(review): ``dic`` is not reset between inputs, so later
            # inputs also carry the attributes of earlier ones -- kept as in
            # the original; confirm that this is intended.
            dic = {}
            if not isinstance(trace['input'], list):
                trace['input'] = [trace['input']]
            for i, y in enumerate(trace['input']):
                for key in y:
                    if ':' in key:
                        dic.update({key: y[key]})
                    else:
                        dic.update({vc[key]: y[key]})
                dic.update({'prov:type': 'worklfow_input'})
                bundle.entity(vc[trace["_id"] + "_" + str(i)], dic)
                bundle.used(vc[trace["_id"]],
                            vc[trace["_id"] + "_" + str(i)],
                            identifier=vc["used_" + trace["_id"] + "_" + str(i)])

    # ---- specify lineage ------------------------------------------------
    for trace in ling:
        try:
            bundle = g.bundle(vc[trace["runId"]])
            bundle.wasAttributedTo(vc[trace["runId"]],
                                   vc["ag_" + trace["username"]],
                                   identifier=vc["attr_" + trace["runId"]])
        except Exception:
            # Deliberate best effort, as in the original: on any failure
            # ``bundle`` keeps whatever value it had when the error occurred
            # and processing continues.
            pass

        process_id = vc["process_" + trace["iterationId"]]

        # specifying creator of the activity (to be collected from the registry)
        if 'creator' in trace:
            creator_id = vc["ag_" + trace["creator"]]
            bundle.agent(creator_id,
                         other_attributes={"dcterms:creator": trace["creator"]})
            # The activity is declared below under the qualified name
            # vc["process_..."], so the association must reference that same
            # qualified name rather than a bare string.
            bundle.wasAssociatedWith(process_id, creator_id)
            bundle.wasAttributedTo(vc[trace["runId"]], creator_id)

        # adding activity information for lineage
        dic = {}
        for key in trace:
            if not isinstance(trace[key], list):
                if ':' in key:
                    dic.update({key: trace[key]})
                elif key == 'location':
                    dic.update({"prov:location": trace[key]})
                else:
                    dic.update({vc[key]: trace[key]})
        # dict.update() returns None, so the type must be added *before* the
        # dictionary is handed over; otherwise no attributes are recorded.
        dic.update({'prov:type': trace["name"]})
        bundle.activity(process_id, trace["startTime"], trace["endTime"], dic)

        # adding parameters to the document as input entities
        dic = {}
        for x in trace["parameters"]:
            if ':' in x["key"]:
                dic.update({x["key"]: x["val"]})
            else:
                dic.update({vc[x["key"]]: x["val"]})
        dic.update({'prov:type': 'parameters'})
        bundle.entity(vc["parameters_" + trace["instanceId"]], dic)
        bundle.used(process_id,
                    vc["parameters_" + trace["instanceId"]],
                    identifier=vc["used_" + trace["iterationId"]])

        # adding input dependencies to the document as input entities
        # (state could be added)
        for x in trace["derivationIds"]:
            bundle.used(process_id,
                        vc[x["DerivedFromDatasetID"]],
                        identifier=vc["used_" + x["DerivedFromDatasetID"]])

        # adding entities to the document as output metadata
        for x in trace["streams"]:
            parent_dic = {}
            for key in x:
                if key == 'con:immediateAccess':
                    parent_dic.update({vc['immediateAccess']: x[key]})
                elif key == 'location':
                    parent_dic.update({"prov:location": str(x[key])})
                else:
                    parent_dic.update({vc[key]: str(x[key])})

            c1 = bundle.collection(vc[x["id"]], other_attributes=parent_dic)
            bundle.wasGeneratedBy(vc[x["id"]], process_id,
                                  identifier=vc["wgb_" + x["id"]])

            for d in trace['derivationIds']:
                bundle.wasDerivedFrom(vc[x["id"]],
                                      vc[d['DerivedFromDatasetID']],
                                      identifier=vc["wdf_" + x["id"]])

            # Each content item becomes a member entity of the collection.
            for i, y in enumerate(x["content"]):
                if isinstance(y, dict):
                    dic = {}
                    for key in y:
                        # Prefer the numeric form; fall back to the string.
                        try:
                            val = num(y[key])
                        except Exception:
                            val = str(y[key])

                        if ':' in key:
                            dic.update({key: val})
                        else:
                            dic.update({vc[key]: val})
                else:
                    dic = {vc['text']: y}

                dic.update({"verce:parent_entity": vc["data_" + x["id"]]})
                # Debug output kept from the original, written so that it is
                # valid under both Python 2 and Python 3.
                print(x["id"])
                print(str(i))
                print(dic)

                data_id = vc["data_" + x["id"] + "_" + str(i)]
                e1 = bundle.entity(data_id, dic)
                bundle.hadMember(c1, e1)
                bundle.wasGeneratedBy(
                    data_id, process_id,
                    identifier=vc["wgb_" + x["id"] + "_" + str(i)])

                for d in trace['derivationIds']:
                    bundle.wasDerivedFrom(
                        data_id,
                        vc[d['DerivedFromDatasetID']],
                        identifier=vc["wdf_" + "data_" + x["id"] + "_" + str(i)])
def toW3Cprov(ling, bundl, format='w3c-prov-xml'):
    """Build a W3C PROV document from workflow-run records and lineage traces.

    :param ling: iterable of lineage trace dicts. The code reads the keys
        ``runId``, ``iterationId``, ``instanceId``, ``startTime``,
        ``endTime``, ``name``, ``parameters``, ``derivationIds`` and
        ``streams``; ``username`` and ``creator`` are used when present.
    :param bundl: iterable of run record dicts. ``username`` and ``type``
        are read for every record; ``_id`` and ``input`` are read for
        records whose ``type`` is ``'workflow_run'``. Such records are
        mutated in place (``runId`` is added, ``input`` is wrapped in a list).
    :param format: requested output format. Not used by the code visible
        here.

    NOTE(review): as visible here the function builds the document but
    neither returns nor serialises it -- confirm whether a trailing
    return/serialisation step is missing.
    """
    g = ProvDocument()
    # Namespaces used through ``vc[...]`` do not need to be explicitly added
    # to a document; they are registered on first use.
    vc = Namespace("knmi", "http://knmi.nl")
    g.add_namespace("dcterms", "http://purl.org/dc/terms/")

    # ---- specify bundle -------------------------------------------------
    bundle = None
    for trace in bundl:
        # specifying user
        g.agent(vc[trace["username"]],
                other_attributes={"dcterms:author": trace["username"]})

        if trace['type'] == 'workflow_run':
            trace.update({'runId': trace['_id']})
            bundle = g.bundle(vc[trace["runId"]])
            bundle.actedOnBehalfOf(vc[trace["runId"]], vc[trace["username"]])

            # Every field of the run except "input" becomes an attribute of
            # the document-level entity describing the bundle.
            dic = {}
            for key in trace:
                if key != "input":
                    if ':' in key:
                        dic.update({key: trace[key]})
                    else:
                        dic.update({vc[key]: trace[key]})
            dic.update({'prov:type': PROV['Bundle']})
            g.entity(vc[trace["runId"]], dic)

            # One entity per workflow input, used by the run.
            # NOTE(review): ``dic`` is not reset between inputs, so later
            # inputs also carry the attributes of earlier ones -- kept as in
            # the original; confirm that this is intended.
            dic = {}
            if not isinstance(trace['input'], list):
                trace['input'] = [trace['input']]
            for i, y in enumerate(trace['input']):
                for key in y:
                    if ':' in key:
                        dic.update({key: y[key]})
                    else:
                        dic.update({vc[key]: y[key]})
                dic.update({'prov:type': 'worklfow_input'})
                bundle.entity(vc[trace["_id"] + "_" + str(i)], dic)
                bundle.used(vc[trace["_id"]],
                            vc[trace["_id"] + "_" + str(i)],
                            identifier=vc["used_" + trace["_id"] + "_" + str(i)])

    # ---- specify lineage ------------------------------------------------
    for trace in ling:
        try:
            bundle = g.bundle(vc[trace["runId"]])
            bundle.wasAttributedTo(vc[trace["runId"]],
                                   vc["ag_" + trace["username"]],
                                   identifier=vc["attr_" + trace["runId"]])
        except Exception:
            # Deliberate best effort, as in the original: on any failure
            # ``bundle`` keeps whatever value it had when the error occurred
            # and processing continues.
            pass

        process_id = vc["process_" + trace["iterationId"]]

        # specifying creator of the activity (to be collected from the registry)
        if 'creator' in trace:
            creator_id = vc["ag_" + trace["creator"]]
            bundle.agent(creator_id,
                         other_attributes={"dcterms:creator": trace["creator"]})
            # The activity is declared below under the qualified name
            # vc["process_..."], so the association must reference that same
            # qualified name rather than a bare string.
            bundle.wasAssociatedWith(process_id, creator_id)
            bundle.wasAttributedTo(vc[trace["runId"]], creator_id)

        # adding activity information for lineage
        dic = {}
        for key in trace:
            if not isinstance(trace[key], list):
                if ':' in key:
                    dic.update({key: trace[key]})
                elif key == 'location':
                    dic.update({"prov:location": trace[key]})
                else:
                    dic.update({vc[key]: trace[key]})
        # dict.update() returns None, so the type must be added *before* the
        # dictionary is handed over; otherwise no attributes are recorded.
        dic.update({'prov:type': trace["name"]})
        bundle.activity(process_id, trace["startTime"], trace["endTime"], dic)

        # adding parameters to the document as input entities
        dic = {}
        for x in trace["parameters"]:
            if ':' in x["key"]:
                dic.update({x["key"]: x["val"]})
            else:
                dic.update({vc[x["key"]]: x["val"]})
        dic.update({'prov:type': 'parameters'})
        bundle.entity(vc["parameters_" + trace["instanceId"]], dic)
        bundle.used(process_id,
                    vc["parameters_" + trace["instanceId"]],
                    identifier=vc["used_" + trace["iterationId"]])

        # adding input dependencies to the document as input entities
        # (state could be added)
        for x in trace["derivationIds"]:
            bundle.used(process_id,
                        vc[x["DerivedFromDatasetID"]],
                        identifier=vc["used_" + x["DerivedFromDatasetID"]])

        # adding entities to the document as output metadata
        for x in trace["streams"]:
            parent_dic = {}
            for key in x:
                if key == 'con:immediateAccess':
                    parent_dic.update({vc['immediateAccess']: x[key]})
                elif key == 'location':
                    parent_dic.update({"prov:location": str(x[key])})
                else:
                    parent_dic.update({vc[key]: str(x[key])})

            c1 = bundle.collection(vc[x["id"]], other_attributes=parent_dic)
            bundle.wasGeneratedBy(vc[x["id"]], process_id,
                                  identifier=vc["wgb_" + x["id"]])

            for d in trace['derivationIds']:
                bundle.wasDerivedFrom(vc[x["id"]],
                                      vc[d['DerivedFromDatasetID']],
                                      identifier=vc["wdf_" + x["id"]])

            # Each content item becomes a member entity of the collection.
            for i, y in enumerate(x["content"]):
                if isinstance(y, dict):
                    dic = {}
                    for key in y:
                        # Prefer the numeric form; fall back to the string.
                        try:
                            val = num(y[key])
                        except Exception:
                            val = str(y[key])

                        if ':' in key:
                            dic.update({key: val})
                        else:
                            dic.update({vc[key]: val})
                else:
                    dic = {vc['text']: y}

                dic.update({"verce:parent_entity": vc["data_" + x["id"]]})
                # Debug output kept from the original, written so that it is
                # valid under both Python 2 and Python 3.
                print(x["id"])
                print(str(i))
                print(dic)

                data_id = vc["data_" + x["id"] + "_" + str(i)]
                e1 = bundle.entity(data_id, dic)
                bundle.hadMember(c1, e1)
                bundle.wasGeneratedBy(
                    data_id, process_id,
                    identifier=vc["wgb_" + x["id"] + "_" + str(i)])

                for d in trace['derivationIds']:
                    bundle.wasDerivedFrom(
                        data_id,
                        vc[d['DerivedFromDatasetID']],
                        identifier=vc["wdf_" + "data_" + x["id"] + "_" + str(i)])
class BioProvDocument:
    """
    Class containing base provenance information for a Project.
    """

    def __init__(
        self,
        project,
        add_attributes=False,
        add_users=True,
        _add_project_namespaces=True,
        _iter_samples=True,
        _iter_project=True,
    ):
        """
        Constructs the W3C-PROV document for a project.

        :param Project project: instance of bioprov.src.Project.
        :param bool add_attributes: whether to add object attributes.
        :param bool add_users: whether to add users and environments.
        :param bool _add_project_namespaces: whether to register the
                                             project namespaces.
        :param bool _iter_samples: whether to add the project's samples.
        :param bool _iter_project: whether to add the project itself.
        """

        # Assert Project is good before constructing instance
        assert isinstance(project, Project), Warnings()["incorrect_type"](
            project, Project
        )
        self.ProvDocument = ProvDocument()
        self.project = project
        self.project.document = self.ProvDocument
        self._dot = prov_to_dot(self.ProvDocument)
        self._provn = self.ProvDocument.get_provn()
        self._entities = dict()
        self._activities = dict()
        self._agents = dict()
        self._user_bundles = dict()
        self._provstore_document = None

        # Don't add attributes if you plan on exporting to graphic format
        self.add_attributes = add_attributes

        # Set this before running Namespaces
        self._create_envs_and_users = bool(add_users)

        # Default actions to create the document
        if _add_project_namespaces:
            self._add_project_namespaces()

        if self._create_envs_and_users:
            self._iter_envs_and_users()

        # The project must be processed before the samples: sample entities
        # are derived from self.project.entity, which _iter_project sets.
        if _iter_project:
            self._iter_project()

        if _iter_samples:
            self._iter_samples()

    def __repr__(self):
        return "BioProvDocument describing Project '{}' with {} samples.".format(
            self.project.tag, len(self.project)
        )

    @property
    def dot(self):
        """Dot representation, recomputed from self.ProvDocument on every access."""
        self._dot = prov_to_dot(self.ProvDocument)
        return self._dot

    @dot.setter
    def dot(self, value):
        self._dot = value

    @property
    def provn(self):
        """PROV-N text, recomputed from self.ProvDocument on every access."""
        self._provn = self.ProvDocument.get_provn()
        return self._provn

    @provn.setter
    def provn(self, value):
        self._provn = value

    @property
    def provstore_document(self):
        # NOTE(review): the getter overwrites whatever the setter stored
        # (e.g. the result of upload_to_provstore) with self.ProvDocument --
        # confirm that this is intended.
        self._provstore_document = self.ProvDocument
        return self._provstore_document

    @provstore_document.setter
    def provstore_document(self, value):
        self._provstore_document = value

    def _add_project_namespaces(self):
        """
        Runs the _add_*_namespace functions: project, users (only when
        users and environments are enabled), samples and activities.

        :return: updates self.ProvDocument.
        """
        self._add_project_namespace()
        if self._create_envs_and_users:
            self._add_env_and_user_namespace()
        self._add_samples_namespace()
        self._add_activities_namespace()

    def _add_project_namespace(self):
        """
        Adds the "project" Namespace to the document.

        :return: updates self.ProvDocument.
        """
        self.ProvDocument.add_namespace("project", str(self.project))

    def _add_env_and_user_namespace(self):
        """Adds the "users" Namespace to the document."""
        self.ProvDocument.add_namespace(
            "users", f"Users associated with BioProv Project '{self.project.tag}'"
        )

    def _add_samples_namespace(self):
        """Adds the "samples" Namespace to the document."""
        self.ProvDocument.add_namespace(
            "samples",
            f"Samples associated with bioprov Project '{self.project.tag}'",
        )

    def _add_files_namespace(self):
        """Adds the "files" Namespace to the document."""
        self.ProvDocument.add_namespace(
            "files", f"Files associated with bioprov Project '{self.project.tag}'"
        )

    def _iter_project(self):
        """Creates the bundle, file entities and program activities of the project."""
        self._create_sample_bundle(self.project, kind="Project")
        self._create_sample_file_entities(self.project, kind="Project")
        self._create_program_entities(self.project, kind="Project")

    def _iter_envs_and_users(self):
        """Creates one bundle and one agent per user of the project."""
        for _user in self.project.users:
            _user_preffix = f"users:{_user}"
            _user_bundle = self._user_bundles[_user] = self.ProvDocument.bundle(
                _user_preffix
            )
            _user_bundle.set_default_namespace(_user)
            _user_bundle.add_namespace(
                "envs", f"Environments associated with User '{_user}'"
            )
            self._agents[_user] = _user_bundle.agent(_user_preffix)

    def _iter_samples(self):
        """
        Creates the bundle, file entities and program activities of every
        sample in the project.

        Each step runs inside its own ``try`` so that a KeyError raised by
        one step is logged at debug level and the remaining steps and
        samples are still processed. (Calling the steps while building the
        tuple, as before, left the ``except`` clause unreachable.)
        """
        for sample in self.project.samples.values():
            for step in (
                self._create_sample_bundle,
                self._create_sample_file_entities,
                self._create_program_entities,
            ):
                try:
                    step(sample)
                except KeyError:
                    config.logger.debug(
                        f"Could not run function '{step.__name__}' for sample {sample.name}."
                    )

    def _create_sample_bundle(self, object_, kind="Sample"):
        """
        Creates a ProvBundle for the Sample and associates it to self.ProvDocument.

        :param object_: instance of bioprov.Sample (or the Project when
                        kind is "Project").
        :param kind: "Sample" or "Project".
        :return: updates self.ProvDocument by creating PROV objects for the sample.
        """
        choices = ("Sample", "Project")
        assert kind in choices, Warnings()["choices"](kind, choices, "kind")

        # Sample PROV attributes: bundle, namespace, entity
        object_.ProvBundle = self.ProvDocument.bundle(object_.namespace_preffix)
        object_.ProvBundle.set_default_namespace(object_.name)
        self._entities[object_.name] = object_.entity = object_.ProvBundle.entity(
            object_.namespace_preffix
        )

        # Only samples are derived from the project entity.
        if kind == "Sample":
            object_.ProvBundle.wasDerivedFrom(
                self._entities[object_.name], self.project.entity
            )

    def _create_sample_file_entities(self, sample, kind="Sample"):
        """
        Creates one PROV entity per file of the Sample inside sample.ProvBundle
        and derives each of them from the sample entity.

        :param sample: instance of bioprov.Sample
        :param kind: label used in the namespace description.
        :return: updates the sample.ProvBundle by creating PROV objects for the files.
        """
        sample.files_namespace_preffix = "files"
        sample.file_namespace = sample.ProvBundle.add_namespace(
            sample.files_namespace_preffix,
            f"Files associated with {kind} {sample.name}",
        )

        # Files PROV attributes: namespace, entities
        for file_ in sample.files.values():
            # This prevents errors when the file refers to a project csv or JSON
            if file_.name == sample.name:
                file_.name = file_.basename

            # Same function call, but in the first we pass the
            # 'other_attributes' argument
            if self.add_attributes:
                self._entities[file_.name] = sample.ProvBundle.entity(
                    f"{sample.files_namespace_preffix}:{file_.tag}",
                    other_attributes=build_prov_attributes(
                        file_.serializer(), sample.file_namespace
                    ),
                )
            else:
                self._entities[file_.name] = sample.ProvBundle.entity(
                    f"{sample.files_namespace_preffix}:{file_.tag}",
                )

            # Adding relationships
            sample.ProvBundle.wasDerivedFrom(
                self._entities[file_.name],
                self._entities[sample.name],
            )

    def _create_program_entities(self, sample, kind="Sample"):
        """
        Creates one PROV activity per program of the Sample, associates it
        with the environment agent of its last run (when users and
        environments are enabled) and links it to its input/output files.

        :param sample: instance of bioprov.Sample
        :param kind: label used in the namespace description.
        :return: updates sample.ProvBundle and the user bundles.
        """
        # Programs PROV attributes: namespace, entities
        programs_namespace_prefix = "programs"
        programs_namespace = sample.ProvBundle.add_namespace(
            programs_namespace_prefix,
            f"Programs associated with {kind} {sample.name}",
        )

        for program in sample.programs.values():
            # Runs are looked up by the string form of their count, i.e.
            # the most recent one.
            last_run = program.runs[str(len(program.runs))]

            # We want to exclude _runs from the program serializer
            # So we put a custom serializer filter
            keys = ("sample", "_runs")
            serialized_program = serializer_filter(program, keys)
            try:
                del serialized_program["params"]
            except KeyError:
                pass

            # Same function call, but in the first we pass the
            # 'other_attributes' argument
            if self.add_attributes:
                self._activities[program.name] = sample.ProvBundle.activity(
                    f"{programs_namespace_prefix}:{program.name}",
                    startTime=last_run.start_time,
                    endTime=last_run.end_time,
                    other_attributes=build_prov_attributes(
                        serialized_program, programs_namespace
                    ),
                )
            else:
                self._activities[program.name] = sample.ProvBundle.activity(
                    f"{programs_namespace_prefix}:{program.name}",
                    startTime=last_run.start_time,
                    endTime=last_run.end_time,
                )

            if self._create_envs_and_users:
                for _user, _env_dict in self.project.users.items():
                    _user_bundle = self._user_bundles[_user]
                    for _env_hash, _env in _env_dict.items():
                        if _env_hash == last_run.env:
                            if self.add_attributes:
                                self._agents[_env_hash] = _user_bundle.agent(
                                    f"envs:{_env}",
                                    other_attributes=build_prov_attributes(
                                        _env.env_dict, _env.env_namespace
                                    ),
                                )
                            else:
                                self._agents[_env_hash] = _user_bundle.agent(
                                    f"envs:{_env}"
                                )

                            # Record the env -> user delegation only once.
                            if not _env.actedOnBehalfOf:
                                _user_bundle.actedOnBehalfOf(
                                    self._agents[_env_hash], self._agents[_user]
                                )
                                _env.actedOnBehalfOf = True

                sample.ProvBundle.wasAssociatedWith(
                    self._activities[program.name], self._agents[last_run.env]
                )

            inputs, outputs = self._get_IO_from_params(program)
            self._add_IO_relationships(sample, program, inputs, "input")
            self._add_IO_relationships(sample, program, outputs, "output")

    def _add_IO_relationships(self, sample, program, io_list, io_type):
        # TODO: replace Sample for Project when implementing Project.files and programs
        """
        Add PROV relationships between Program and input/output files.

        Values of io_list that do not match the string form of any file in
        sample.files are ignored.

        :param sample: instance of bioprov.Sample
        :param program: instance of bioprov.Program
        :param io_list: list of input/output files
        :param io_type: 'input' or 'output'
        :return: Adds 'used' (input) or 'wasGeneratedBy' (output) records
                 to sample.ProvBundle.
        """
        # Small assertion block
        choices = ("input", "output")
        assert io_type in choices, Warnings()["choices"](io_type, choices, "io_type")

        # Start function
        # String form of every file, computed once instead of per value.
        files_by_path = [(str(file_), file_) for file_ in sample.files.values()]

        for value in io_list:
            # First file whose string form matches the parameter value.
            file_obj = next(
                (file_ for path, file_ in files_by_path if path == value), None
            )
            if file_obj is None:
                continue

            entity = self._entities[file_obj.name]
            activity = self._activities[program.name]
            if io_type == "input":
                # prov's used() takes (activity, entity), the reverse of
                # wasGeneratedBy(entity, activity).
                sample.ProvBundle.used(activity, entity)
            elif io_type == "output":
                sample.ProvBundle.wasGeneratedBy(entity, activity)

    @staticmethod
    def _get_IO_from_params(program):
        """
        :param program: instance of bioprov.Program
        :return: list of input parameter values and list of output parameter values
        """
        # Relationships based on Parameters
        inputs, outputs = [], []

        for parameter in program.params.values():
            assert isinstance(parameter, Parameter), (
                Warnings()["incorrect_type"](parameter, Parameter)
                + "\nPlease check if Programs were correctly deserialized."
            )
            # Some positional arguments may have empty values; the value is
            # then stored in parameter.key.
            if parameter.kind == "input":
                inputs.append(parameter.value or parameter.key)
            elif parameter.kind == "output":
                outputs.append(parameter.value or parameter.key)

        return inputs, outputs

    def _add_activities_namespace(self):
        """
        Add activities Namespace to self.

        NOTE(review): the namespace is only added when the document has no
        namespaces yet, and _add_project_namespaces calls this after adding
        others -- confirm whether the condition is intended.

        :return: updates self.ProvDocument.
        """
        if len(self.ProvDocument.namespaces) == 0:
            self.ProvDocument.add_namespace(
                "activities",
                f"Activities associated with bioprov Project '{self.project.tag}'",
            )

    def upload_to_provstore(self, api=None):
        """
        Uploads self.ProvDocument to ProvStore (https://openprovenance.org/store/)

        :param api: provstore.api.Api; defaults to config.provstore_api.
        :return: Sends POST request to ProvStore API and updates
                 self.provstore_document if successful.
        """
        if api is None:
            api = config.provstore_api

        try:
            self.provstore_document = api.document.create(
                self.ProvDocument, name=self.project.tag
            )
        except ConnectionError:
            logging.error(
                "Could not create remote document. Please check your internet connection and ProvStore credentials."
            )

    def write_provn(self, path=None):
        """
        Writes PROVN output of document.

        :param path: Path to write file. Defaults to
                     './<project tag>_provn[_attrs].txt'.
        :return: Writes file.
        """
        if path is None:
            path = f"./{self.project.tag}_provn"
            if self.add_attributes:
                path += "_attrs"
            path += ".txt"

        path = Path(path)
        assert (
            path.parent.exists()
        ), f"Directory '{path.parent}' not found.\nPlease provide a valid directory."

        if path.exists():
            logging.info(f"Overwriting file at '{path}'")

        with open(path, "w") as f:
            f.write(self.provn)

        if path.exists():
            logging.info(f"Wrote PROVN record to {path}.")
class ProvenanceProfile:
    """
    Provenance profile of one workflow run.

    Wraps a ``ProvDocument`` that is populated as the workflow runs and is
    finally serialized into the research object by
    :meth:`finalize_prov_profile`.
    """

    def __init__(
        self,
        research_object: "ResearchObject",
        full_name: str,
        host_provenance: bool,
        user_provenance: bool,
        orcid: str,
        fsaccess: StdFsAccess,
        run_uuid: Optional[uuid.UUID] = None,
    ) -> None:
        """
        Initialize the provenance profile and create the base PROV document.

        :param research_object: research object that receives data files,
            annotations and the serialized provenance
        :param full_name: full name of the creator (may be empty)
        :param host_provenance: whether to record host details
        :param user_provenance: whether to record user account details
        :param orcid: ORCID of the creator (may be empty)
        :param fsaccess: file system access used to open input files
        :param run_uuid: UUID of this workflow run; a random UUID is minted
            when it is not given
        """
        self.fsaccess = fsaccess
        self.orcid = orcid
        self.research_object = research_object
        self.folder = self.research_object.folder
        self.document = ProvDocument()
        self.host_provenance = host_provenance
        self.user_provenance = user_provenance
        self.engine_uuid = research_object.engine_uuid  # type: str
        self.add_to_manifest = self.research_object.add_to_manifest
        if self.orcid:
            _logger.debug("[provenance] Creator ORCID: %s", self.orcid)
        self.full_name = full_name
        if self.full_name:
            _logger.debug("[provenance] Creator Full name: %s", self.full_name)
        self.workflow_run_uuid = run_uuid or uuid.uuid4()
        self.workflow_run_uri = self.workflow_run_uuid.urn  # type: str
        self.generate_prov_doc()

    def __str__(self) -> str:
        """Represent this Provenance profile as a string."""
        return f"ProvenanceProfile <{self.workflow_run_uri}> in <{self.research_object}>"

    def generate_prov_doc(self) -> Tuple[str, ProvDocument]:
        """
        Add the basic namespaces, agents and the workflow run activity.

        :return: tuple of the workflow run URI and the PROV document
        """

        def host_provenance(document: ProvDocument) -> None:
            """Record host provenance."""
            document.add_namespace(CWLPROV)
            document.add_namespace(UUID)
            document.add_namespace(FOAF)

            hostname = getfqdn()
            # won't have a foaf:accountServiceHomepage for unix hosts, but
            # we can at least provide hostname
            document.agent(
                ACCOUNT_UUID,
                {
                    PROV_TYPE: FOAF["OnlineAccount"],
                    "prov:location": hostname,
                    CWLPROV["hostname"]: hostname,
                },
            )

        self.cwltool_version = "cwltool %s" % versionstring().split()[-1]
        self.document.add_namespace("wfprov", "http://purl.org/wf4ever/wfprov#")
        # document.add_namespace('prov', 'http://www.w3.org/ns/prov#')
        self.document.add_namespace("wfdesc", "http://purl.org/wf4ever/wfdesc#")
        # TODO: Make this ontology. For now only has cwlprov:image
        self.document.add_namespace("cwlprov", "https://w3id.org/cwl/prov#")
        self.document.add_namespace("foaf", "http://xmlns.com/foaf/0.1/")
        self.document.add_namespace("schema", "http://schema.org/")
        self.document.add_namespace("orcid", "https://orcid.org/")
        self.document.add_namespace("id", "urn:uuid:")
        # NOTE: Internet draft expired 2004-03-04 (!)
        #  https://tools.ietf.org/html/draft-thiemann-hash-urn-01
        # TODO: Change to nih:sha-256; hashes
        #  https://tools.ietf.org/html/rfc6920#section-7
        self.document.add_namespace("data", "urn:hash::sha1:")
        # Also needed for docker images
        self.document.add_namespace(SHA256, "nih:sha-256;")

        # info only, won't really be used by prov as sub-resources use /
        self.document.add_namespace("researchobject", self.research_object.base_uri)
        # annotations
        self.metadata_ns = self.document.add_namespace(
            "metadata", self.research_object.base_uri + METADATA + "/"
        )
        # Pre-register provenance directory so we can refer to its files
        self.provenance_ns = self.document.add_namespace(
            "provenance",
            self.research_object.base_uri + posix_path(PROVENANCE) + "/",
        )
        ro_identifier_workflow = self.research_object.base_uri + "workflow/packed.cwl#"
        self.wf_ns = self.document.add_namespace("wf", ro_identifier_workflow)
        ro_identifier_input = (
            self.research_object.base_uri + "workflow/primary-job.json#"
        )
        self.document.add_namespace("input", ro_identifier_input)

        # More info about the account (e.g. username, fullname)
        # may or may not have been previously logged by user_provenance()
        # .. but we always know cwltool was launched (directly or indirectly)
        # by a user account, as cwltool is a command line tool
        account = self.document.agent(ACCOUNT_UUID)
        if self.orcid or self.full_name:
            # NOTE(review): PROV_TYPE and the literal "prov:type" are used as
            # two separate dict keys, presumably so both types are asserted
            # on the same agent - confirm against prov's attribute handling.
            person = {PROV_TYPE: PROV["Person"], "prov:type": SCHEMA["Person"]}
            if self.full_name:
                person["prov:label"] = self.full_name
                person["foaf:name"] = self.full_name
                person["schema:name"] = self.full_name
            # TODO: Look up name from ORCID API when no full name is given?
            agent = self.document.agent(self.orcid or uuid.uuid4().urn, person)
            self.document.actedOnBehalfOf(account, agent)
        else:
            if self.host_provenance:
                host_provenance(self.document)
            if self.user_provenance:
                self.research_object.user_provenance(self.document)

        # The execution of cwltool
        wfengine = self.document.agent(
            self.engine_uuid,
            {
                PROV_TYPE: PROV["SoftwareAgent"],
                "prov:type": WFPROV["WorkflowEngine"],
                "prov:label": self.cwltool_version,
            },
        )
        # FIXME: This datetime will be a bit too delayed, we should
        # capture when cwltool.py earliest started?
        self.document.wasStartedBy(wfengine, None, account, datetime.datetime.now())
        # define workflow run level activity
        self.document.activity(
            self.workflow_run_uri,
            datetime.datetime.now(),
            None,
            {
                PROV_TYPE: WFPROV["WorkflowRun"],
                "prov:label": "Run of workflow/packed.cwl#main",
            },
        )
        # association between SoftwareAgent and WorkflowRun
        main_workflow = "wf:main"
        self.document.wasAssociatedWith(
            self.workflow_run_uri, self.engine_uuid, main_workflow
        )
        self.document.wasStartedBy(
            self.workflow_run_uri, None, self.engine_uuid, datetime.datetime.now()
        )
        return (self.workflow_run_uri, self.document)

    def evaluate(
        self,
        process: Process,
        job: JobsType,
        job_order_object: CWLObjectType,
        research_obj: "ResearchObject",
    ) -> None:
        """
        Evaluate the nature of job and record its prospective provenance.

        Nothing is recorded for a process that has steps when the job has no
        ``workflow`` attribute.
        """
        if not hasattr(process, "steps"):
            # record provenance of independent commandline tool executions
            self.prospective_prov(job)
            customised_job = copy_job_order(job, job_order_object)
            self.used_artefacts(customised_job, self.workflow_run_uri)
            research_obj.create_job(customised_job)
        elif hasattr(job, "workflow"):
            # record provenance of workflow executions
            self.prospective_prov(job)
            customised_job = copy_job_order(job, job_order_object)
            self.used_artefacts(customised_job, self.workflow_run_uri)

    def record_process_start(
        self,
        process: Process,
        job: JobsType,
        process_run_id: Optional[str] = None,
    ) -> Optional[str]:
        """
        Record the start of a process and return the id of its run.

        :param process: the process being started
        :param job: the job that executes the process
        :param process_run_id: returned unchanged when neither branch applies
        :return: the workflow run URI for a process without steps, a newly
            started process run id for a job without a ``workflow``
            attribute, otherwise ``process_run_id`` as passed in
        """
        if not hasattr(process, "steps"):
            process_run_id = self.workflow_run_uri
        elif not hasattr(job, "workflow"):
            # commandline tool execution as part of workflow
            name = ""
            if isinstance(job, (CommandLineJob, JobBase, WorkflowJob)):
                name = job.name
            process_name = urllib.parse.quote(name, safe=":/,#")
            process_run_id = self.start_process(process_name, datetime.datetime.now())
        return process_run_id

    def start_process(
        self,
        process_name: str,
        when: datetime.datetime,
        process_run_id: Optional[str] = None,
    ) -> str:
        """
        Record the start of each Process.

        :param process_name: name of the process, appended to ``wf:main/``
        :param when: start time recorded in the wasStartedBy relation
        :param process_run_id: id of the run; a UUID URN is minted if omitted
        :return: the id of the process run
        """
        if process_run_id is None:
            process_run_id = uuid.uuid4().urn
        prov_label = "Run of workflow/packed.cwl#main/" + process_name
        self.document.activity(
            process_run_id,
            None,
            None,
            {PROV_TYPE: WFPROV["ProcessRun"], PROV_LABEL: prov_label},
        )
        self.document.wasAssociatedWith(
            process_run_id, self.engine_uuid, str("wf:main/" + process_name)
        )
        self.document.wasStartedBy(
            process_run_id, None, self.workflow_run_uri, when, None, None
        )
        return process_run_id

    def record_process_end(
        self,
        process_name: str,
        process_run_id: str,
        outputs: Union[CWLObjectType, MutableSequence[CWLObjectType], None],
        when: datetime.datetime,
    ) -> None:
        """Record the outputs of a process run and that it ended at ``when``."""
        self.generate_output_prov(outputs, process_run_id, process_name)
        self.document.wasEndedBy(process_run_id, None, self.workflow_run_uri, when)

    def declare_file(
        self, value: CWLObjectType
    ) -> Tuple[ProvEntity, ProvEntity, str]:
        """
        Declare a ``class: File`` object and its secondary files.

        ``value`` gains an ``@id`` entry (and a ``checksum`` entry when the
        file had to be added from its location) if it lacks one.

        :param value: the File object
        :return: tuple of the specialized file entity (identified by
            ``@id``), the data entity (identified by checksum) and the
            checksum
        :raises ValueError: if ``value`` is not of class File, has none of
            checksum/location/contents usable, or has a secondary file that
            is neither File nor Directory
        """
        if value["class"] != "File":
            raise ValueError("Must have class:File: %s" % value)
        # Need to determine file hash aka RO filename
        entity = None  # type: Optional[ProvEntity]
        checksum = None
        if "checksum" in value:
            csum = cast(str, value["checksum"])
            (method, checksum) = csum.split("$", 1)
            if method == SHA1 and self.research_object.has_data_file(checksum):
                entity = self.document.entity("data:" + checksum)

        if not entity and "location" in value:
            location = str(value["location"])
            # If we made it here, we'll have to add it to the RO
            with self.fsaccess.open(location, "rb") as fhandle:
                relative_path = self.research_object.add_data_file(fhandle)
                # FIXME: This naively relies on add_data_file setting hash as filename
                checksum = PurePath(relative_path).name
                entity = self.document.entity(
                    "data:" + checksum, {PROV_TYPE: WFPROV["Artifact"]}
                )
                if "checksum" not in value:
                    value["checksum"] = f"{SHA1}${checksum}"

        if not entity and "contents" in value:
            # Anonymous file, add content as string
            entity, checksum = self.declare_string(cast(str, value["contents"]))

        # By here one of them should have worked!
        if not entity or not checksum:
            raise ValueError(
                "class:File but missing checksum/location/content: %r" % value
            )

        # Track filename and extension, this is generally useful only for
        # secondaryFiles. Note that multiple uses of a file might thus record
        # different names for the same entity, so we'll
        # make/track a specialized entity by UUID
        file_id = value.setdefault("@id", uuid.uuid4().urn)
        # A specialized entity that has just these names
        file_entity = self.document.entity(
            file_id,
            [(PROV_TYPE, WFPROV["Artifact"]), (PROV_TYPE, WF4EVER["File"])],
        )  # type: ProvEntity

        for name_key in ("basename", "nameroot", "nameext"):
            if name_key in value:
                file_entity.add_attributes({CWLPROV[name_key]: value[name_key]})
        self.document.specializationOf(file_entity, entity)

        # Check for secondaries
        for sec in cast(
            MutableSequence[CWLObjectType], value.get("secondaryFiles", [])
        ):
            # TODO: Record these in a specializationOf entity with UUID?
            if sec["class"] == "File":
                (sec_entity, _, _) = self.declare_file(sec)
            elif sec["class"] == "Directory":
                sec_entity = self.declare_directory(sec)
            else:
                raise ValueError(f"Got unexpected secondaryFiles value: {sec}")
            # We don't know how/when/where the secondary file was generated,
            # but CWL convention is a kind of summary/index derived
            # from the original file. As its generally in a different format
            # then prov:Quotation is not appropriate.
            self.document.derivation(
                sec_entity,
                file_entity,
                other_attributes={PROV["type"]: CWLPROV["SecondaryFile"]},
            )

        return file_entity, entity, checksum

    def declare_directory(self, value: CWLObjectType) -> ProvEntity:
        """
        Register a Directory and any nested files/directories.

        Besides the collection entity in the main document, an ORE folder
        description is kept in a separate bundle, which is written to the
        research object as a Turtle annotation file.

        :param value: the Directory object; gains an ``@id`` entry if missing
        :return: the collection entity that represents the directory
        """
        # FIXME: Calculate a hash-like identifier for directory
        # so we get same value if it's the same filenames/hashes
        # in a different location.
        # For now, mint a new UUID to identify this directory, but
        # attempt to keep it inside the value dictionary
        dir_id = cast(str, value.setdefault("@id", uuid.uuid4().urn))

        # New annotation file to keep the ORE Folder listing
        ore_doc_fn = dir_id.replace("urn:uuid:", "directory-") + ".ttl"
        dir_bundle = self.document.bundle(self.metadata_ns[ore_doc_fn])

        coll = self.document.entity(
            dir_id,
            [
                (PROV_TYPE, WFPROV["Artifact"]),
                (PROV_TYPE, PROV["Collection"]),
                (PROV_TYPE, PROV["Dictionary"]),
                (PROV_TYPE, RO["Folder"]),
            ],
        )
        # ORE description of ro:Folder, saved separately
        coll_b = dir_bundle.entity(
            dir_id,
            [(PROV_TYPE, RO["Folder"]), (PROV_TYPE, ORE["Aggregation"])],
        )
        self.document.mentionOf(dir_id + "#ore", dir_id, dir_bundle.identifier)

        # dir_manifest = dir_bundle.entity(
        #     dir_bundle.identifier, {PROV["type"]: ORE["ResourceMap"],
        #                             ORE["describes"]: coll_b.identifier})

        coll_attribs = [(ORE["isDescribedBy"], dir_bundle.identifier)]
        coll_b_attribs = []  # type: List[Tuple[Identifier, ProvEntity]]

        # FIXME: .listing might not be populated yet - hopefully
        # a later call to this method will sort that
        is_empty = True

        if "listing" not in value:
            get_listing(self.fsaccess, value)
        for entry in cast(MutableSequence[CWLObjectType], value.get("listing", [])):
            is_empty = False
            # Declare child-artifacts
            entity = self.declare_artefact(entry)
            self.document.membership(coll, entity)
            # Membership relation aka our ORE Proxy
            m_id = uuid.uuid4().urn
            m_entity = self.document.entity(m_id)
            m_b = dir_bundle.entity(m_id)

            # PROV-O style Dictionary
            # https://www.w3.org/TR/prov-dictionary/#dictionary-ontological-definition
            # ..as prov.py do not currently allow PROV-N extensions
            # like hadDictionaryMember(..)
            m_entity.add_asserted_type(PROV["KeyEntityPair"])
            m_entity.add_attributes(
                {
                    PROV["pairKey"]: entry["basename"],
                    PROV["pairEntity"]: entity,
                }
            )

            # As well as a being a
            # http://wf4ever.github.io/ro/2016-01-28/ro/#FolderEntry
            m_b.add_asserted_type(RO["FolderEntry"])
            m_b.add_asserted_type(ORE["Proxy"])
            m_b.add_attributes(
                {
                    RO["entryName"]: entry["basename"],
                    ORE["proxyIn"]: coll,
                    ORE["proxyFor"]: entity,
                }
            )
            coll_attribs.append((PROV["hadDictionaryMember"], m_entity))
            coll_b_attribs.append((ORE["aggregates"], m_b))

        coll.add_attributes(coll_attribs)
        coll_b.add_attributes(coll_b_attribs)

        # Also Save ORE Folder as annotation metadata
        ore_doc = ProvDocument()
        ore_doc.add_namespace(ORE)
        ore_doc.add_namespace(RO)
        ore_doc.add_namespace(UUID)
        ore_doc.add_bundle(dir_bundle)
        ore_doc = ore_doc.flattened()
        ore_doc_path = str(PurePosixPath(METADATA, ore_doc_fn))
        with self.research_object.write_bag_file(ore_doc_path) as provenance_file:
            ore_doc.serialize(provenance_file, format="rdf", rdf_format="turtle")
        self.research_object.add_annotation(
            dir_id, [ore_doc_fn], ORE["isDescribedBy"].uri
        )

        if is_empty:
            # Empty directory
            coll.add_asserted_type(PROV["EmptyCollection"])
            coll.add_asserted_type(PROV["EmptyDictionary"])
        self.research_object.add_uri(coll.identifier.uri)
        return coll

    def declare_string(self, value: str) -> Tuple[ProvEntity, str]:
        """
        Save as string in UTF-8.

        :param value: the text to store as a data file in the research object
        :return: tuple of the data entity and the name of the stored file,
            which is used as the checksum
        """
        byte_s = BytesIO(str(value).encode(ENCODING))
        data_file = self.research_object.add_data_file(byte_s, content_type=TEXT_PLAIN)
        checksum = PurePosixPath(data_file).name
        # FIXME: Don't naively assume add_data_file uses hash in filename!
        data_id = "data:%s" % PurePosixPath(data_file).stem
        entity = self.document.entity(
            data_id, {PROV_TYPE: WFPROV["Artifact"], PROV_VALUE: str(value)}
        )  # type: ProvEntity
        return entity, checksum

    def declare_artefact(self, value: Optional[CWLOutputType]) -> ProvEntity:
        """
        Create data artefact entities for all file objects.

        Dispatches on the type of ``value``: ``None``, primitive flags and
        numbers, text, bytes, mappings (File, Directory or other dictionary)
        and finally any other iterable. A value that cannot be iterated is
        recorded by its ``repr()``.

        :param value: the value to declare; mappings gain an ``@id`` entry
        :return: the entity that represents ``value``
        """
        if value is None:
            # FIXME: If this can happen in CWL, we'll
            # need a better way to represent this in PROV
            return self.document.entity(CWLPROV["None"], {PROV_LABEL: "None"})

        if isinstance(value, (bool, int, float)):
            # Typically used in job documents for flags

            # FIXME: Make consistent hash URIs for these
            # that somehow include the type
            # (so "1" != 1 != "1.0" != true)
            entity = self.document.entity(uuid.uuid4().urn, {PROV_VALUE: value})
            self.research_object.add_uri(entity.identifier.uri)
            return entity

        if isinstance(value, str):
            (entity, _) = self.declare_string(value)
            return entity

        if isinstance(value, bytes):
            # If we got here then we must be in Python 3
            byte_s = BytesIO(value)
            data_file = self.research_object.add_data_file(byte_s)
            # FIXME: Don't naively assume add_data_file uses hash in filename!
            data_id = "data:%s" % PurePosixPath(data_file).stem
            return self.document.entity(
                data_id,
                {PROV_TYPE: WFPROV["Artifact"], PROV_VALUE: str(value)},
            )

        if isinstance(value, MutableMapping):
            if "@id" in value:
                # Already processed this value, but it might not be in this PROV
                entities = self.document.get_record(value["@id"])
                if entities:
                    return entities[0]
                # else, unknown in PROV, re-add below as if it's fresh

            # Base case - we found a File we need to update
            if value.get("class") == "File":
                (entity, _, _) = self.declare_file(value)
                value["@id"] = entity.identifier.uri
                return entity

            if value.get("class") == "Directory":
                entity = self.declare_directory(value)
                value["@id"] = entity.identifier.uri
                return entity
            coll_id = value.setdefault("@id", uuid.uuid4().urn)
            # some other kind of dictionary?
            # TODO: also Save as JSON
            coll = self.document.entity(
                coll_id,
                [
                    (PROV_TYPE, WFPROV["Artifact"]),
                    (PROV_TYPE, PROV["Collection"]),
                    (PROV_TYPE, PROV["Dictionary"]),
                ],
            )

            if value.get("class"):
                _logger.warning("Unknown data class %s.", value["class"])
                # FIXME: The class might be "http://example.com/somethingelse"
                coll.add_asserted_type(CWLPROV[value["class"]])

            # Let's iterate and recurse
            coll_attribs = []  # type: List[Tuple[Identifier, ProvEntity]]
            for (key, val) in value.items():
                v_ent = self.declare_artefact(val)
                self.document.membership(coll, v_ent)
                m_entity = self.document.entity(uuid.uuid4().urn)
                # Note: only support PROV-O style dictionary
                # https://www.w3.org/TR/prov-dictionary/#dictionary-ontological-definition
                # as prov.py do not easily allow PROV-N extensions
                m_entity.add_asserted_type(PROV["KeyEntityPair"])
                m_entity.add_attributes(
                    {PROV["pairKey"]: str(key), PROV["pairEntity"]: v_ent}
                )
                coll_attribs.append((PROV["hadDictionaryMember"], m_entity))
            coll.add_attributes(coll_attribs)
            self.research_object.add_uri(coll.identifier.uri)
            return coll

        # some other kind of Collection?
        # TODO: also save as JSON
        # NOTE: the try block deliberately spans the whole branch, so a
        # TypeError raised anywhere in it (not only by iter()) ends up in the
        # repr() fallback below.
        try:
            members = []
            for each_input_obj in iter(value):
                # Recurse and register any nested objects
                e = self.declare_artefact(each_input_obj)
                members.append(e)

            # If we reached this, then we were allowed to iterate
            coll = self.document.entity(
                uuid.uuid4().urn,
                [
                    (PROV_TYPE, WFPROV["Artifact"]),
                    (PROV_TYPE, PROV["Collection"]),
                ],
            )
            if not members:
                coll.add_asserted_type(PROV["EmptyCollection"])
            else:
                for member in members:
                    # FIXME: This won't preserve order, for that
                    # we would need to use PROV.Dictionary
                    # with numeric keys
                    self.document.membership(coll, member)
            self.research_object.add_uri(coll.identifier.uri)
            # FIXME: list value does not support adding "@id"
            return coll
        except TypeError:
            _logger.warning("Unrecognized type %s of %r", type(value), value)
            # Let's just fall back to Python repr()
            entity = self.document.entity(uuid.uuid4().urn, {PROV_LABEL: repr(value)})
            self.research_object.add_uri(entity.identifier.uri)
            return entity

    def used_artefacts(
        self,
        job_order: Union[CWLObjectType, List[CWLObjectType]],
        process_run_id: str,
        name: Optional[str] = None,
    ) -> None:
        """
        Add used() for each data artefact.

        :param job_order: a job order object, or a list of them
        :param process_run_id: id of the activity that used the artefacts
        :param name: optional step name, inserted into the role after "main"
        """
        if isinstance(job_order, list):
            for entry in job_order:
                self.used_artefacts(entry, process_run_id, name)
            return

        # FIXME: Use workflow name in packed.cwl, "main" is wrong for nested workflows
        base = "main"
        if name is not None:
            base += "/" + name
        for key, value in job_order.items():
            prov_role = self.wf_ns[f"{base}/{key}"]
            try:
                entity = self.declare_artefact(value)
                self.document.used(
                    process_run_id,
                    entity,
                    datetime.datetime.now(),
                    None,
                    {"prov:role": prov_role},
                )
            except OSError as exc:
                # Best effort: an artefact that cannot be read is left out of
                # the provenance, but leave a trace of that in the log.
                _logger.debug(
                    "[provenance] Could not record used artefact %s: %s", key, exc
                )

    def generate_output_prov(
        self,
        final_output: Union[CWLObjectType, MutableSequence[CWLObjectType], None],
        process_run_id: Optional[str],
        name: Optional[str],
    ) -> None:
        """
        Call wasGeneratedBy() for each output,copy the files into the RO.

        :param final_output: an output object, a sequence of them, or None
            (in which case nothing is recorded)
        :param process_run_id: id of the generating activity; the workflow
            run URI is used when it is empty
        :param name: optional step name, percent-encoded into the role
        """
        if isinstance(final_output, MutableSequence):
            for entry in final_output:
                self.generate_output_prov(entry, process_run_id, name)
        elif final_output is not None:
            # Timestamp should be created at the earliest
            timestamp = datetime.datetime.now()
            if not process_run_id:
                process_run_id = self.workflow_run_uri

            # Quote the name exactly once, before the loop: quoting it again
            # for every output would turn "%20" into "%2520" from the second
            # output onwards.
            if name is not None:
                quoted_name = urllib.parse.quote(str(name), safe=":/,#")
                # FIXME: Probably not "main" in nested workflows
                role_prefix = f"main/{quoted_name}/"
            else:
                role_prefix = "main/"

            # For each output, find/register the corresponding
            # entity (UUID) and document it as generated in
            # a role corresponding to the output
            for output, value in final_output.items():
                entity = self.declare_artefact(value)
                role = self.wf_ns[f"{role_prefix}{output}"]
                self.document.wasGeneratedBy(
                    entity, process_run_id, timestamp, None, {"prov:role": role}
                )

    def prospective_prov(self, job: JobsType) -> None:
        """
        Create prospective prov recording as wfdesc prov:Plan.

        A job that is not a WorkflowJob is declared as a single process;
        a WorkflowJob is declared as a workflow with one sub-process per step.
        """
        if not isinstance(job, WorkflowJob):
            # direct command line tool execution
            self.document.entity(
                "wf:main",
                {
                    PROV_TYPE: WFDESC["Process"],
                    "prov:type": PROV["Plan"],
                    "prov:label": "Prospective provenance",
                },
            )
            return

        self.document.entity(
            "wf:main",
            {
                PROV_TYPE: WFDESC["Workflow"],
                "prov:type": PROV["Plan"],
                "prov:label": "Prospective provenance",
            },
        )

        for step in job.steps:
            # NOTE(review): the [5:] slice drops the first five characters of
            # the step name, presumably a fixed prefix - confirm.
            stepnametemp = "wf:main/" + str(step.name)[5:]
            stepname = urllib.parse.quote(stepnametemp, safe=":/,#")
            provstep = self.document.entity(
                stepname,
                {PROV_TYPE: WFDESC["Process"], "prov:type": PROV["Plan"]},
            )
            self.document.entity(
                "wf:main",
                {
                    "wfdesc:hasSubProcess": provstep,
                    "prov:label": "Prospective provenance",
                },
            )
        # TODO: Declare roles/parameters as well

    def activity_has_provenance(self, activity, prov_ids):
        # type: (str, List[Identifier]) -> None
        """
        Add http://www.w3.org/TR/prov-aq/ relations to nested PROV files.

        :param activity: id of the activity the provenance files describe
        :param prov_ids: identifiers of the provenance files
        """
        # NOTE: The below will only work if the corresponding metadata/provenance arcp URI
        # is a pre-registered namespace in the PROV Document
        attribs = [(PROV["has_provenance"], prov_id) for prov_id in prov_ids]
        self.document.activity(activity, other_attributes=attribs)
        # Tip: we can't use https://www.w3.org/TR/prov-links/#term-mention
        # as prov:mentionOf() is only for entities, not activities
        uris = [i.uri for i in prov_ids]
        self.research_object.add_annotation(activity, uris, PROV["has_provenance"].uri)

    def finalize_prov_profile(self, name):
        # type: (Optional[str]) -> List[Identifier]
        """
        Transfer the provenance related files to the RO.

        The document is written once per supported serialization.

        :param name: name of the workflow, or None for the main workflow
        :return: identifiers of the written provenance files, in the order
            they were written
        """
        # NOTE: Relative posix path
        if name is None:
            # main workflow, fixed filenames
            filename = "primary.cwlprov"
        else:
            # ASCII-friendly filename, avoiding % as we don't want %2520 in manifest.json
            wf_name = urllib.parse.quote(str(name), safe="").replace("%", "_")
            # Note that the above could cause overlaps for similarly named
            # workflows, but that's OK as we'll also include run uuid
            # which also covers the case of this step being run in
            # multiple places or iterations
            filename = f"{wf_name}.{self.workflow_run_uuid}.cwlprov"

        basename = str(PurePosixPath(PROVENANCE) / filename)

        # TODO: Also support other profiles than CWLProv, e.g. ProvOne

        # File extension and serialize() options of each written format
        serializations = (
            # https://www.w3.org/TR/prov-xml/
            (".xml", {"format": "xml", "indent": 4}),
            # https://www.w3.org/TR/prov-n/
            (".provn", {"format": "provn", "indent": 2}),
            # https://www.w3.org/Submission/prov-json/
            (".json", {"format": "json", "indent": 2}),
            # "rdf" aka https://www.w3.org/TR/prov-o/
            # which can be serialized to ttl/nt/jsonld (and more!)
            # https://www.w3.org/TR/turtle/
            (".ttl", {"format": "rdf", "rdf_format": "turtle"}),
            # https://www.w3.org/TR/n-triples/
            (".nt", {"format": "rdf", "rdf_format": "ntriples"}),
            # https://www.w3.org/TR/json-ld/
            # TODO: Use a nice JSON-LD context
            # see also https://eprints.soton.ac.uk/395985/
            # 404 Not Found on https://provenance.ecs.soton.ac.uk/prov.jsonld :(
            (".jsonld", {"format": "rdf", "rdf_format": "json-ld"}),
        )

        # list of prov identifiers of provenance files
        prov_ids = []
        for extension, options in serializations:
            with self.research_object.write_bag_file(
                basename + extension
            ) as provenance_file:
                self.document.serialize(provenance_file, **options)
                prov_ids.append(self.provenance_ns[filename + extension])

        _logger.debug("[provenance] added provenance: %s", prov_ids)
        return prov_ids
from prov.model import ProvDocument
from provdbconnector import ProvApi
from provdbconnector.db_adapters.in_memory import SimpleInMemoryAdapter

# Example: store a PROV document that contains two bundles in the
# in-memory adapter and print it back as PROV-N.
prov_api = ProvApi(adapter=SimpleInMemoryAdapter, auth_info=None)

# Build the document: one agent, one activity and their association
prov_document = ProvDocument()
prov_document.add_namespace("ex", "http://example.com")
prov_document.agent("ex:Bob")
prov_document.activity("ex:Alice")
prov_document.association("ex:Alice", "ex:Bob")

# Add the bundles, each one holding a single agent
for bundle_name, agent_name in (
    ("ex:bundle1", "ex:Yoda"),
    ("ex:bundle2", "ex:Jabba the Hutt"),
):
    prov_document.bundle(bundle_name).agent(agent_name)

# Store the document, then read it back in PROV-N notation
document_id = prov_api.create_document(prov_document)
provn_text = prov_api.get_document_as_provn(document_id)
print(provn_text)

# Output:
#
# document
# prefix ex <http://example.com>
#
# agent(ex:Bob)