def datatypes():
    g = ProvDocument()
    ex = Namespace('ex', 'http://example.org/')
    g.add_namespace(ex)

    attributes = {
        'ex:int': 100,
        'ex:float': 100.123456,
        'ex:long': 123456789000,
        'ex:bool': True,
        'ex:str': 'Some string',
        'ex:unicode': u'Some unicode string with accents: Huỳnh Trung Đông',
        'ex:timedate': datetime.datetime(2012, 12, 12, 14, 7, 48),
        'ex:intstr': Literal("PROV Internationalized string",
                             PROV["InternationalizedString"], "en"),
    }
    multiline = """Line1
Line2
Line3"""
    attributes['ex:multi-line'] = multiline
    g.entity('ex:e1', attributes)
    return g
def add_namespaces_to_bundle(prov_bundle, metadata):
    namespaces = dict()
    try:
        namespace_str = metadata[METADATA_KEY_NAMESPACES]
    except KeyError:
        # A missing dict key raises KeyError (the original caught ValueError
        # and built the exception without raising it)
        raise SerializerException(
            "No valid namespace provided, should be a string or a dict: {}".format(metadata))

    if type(namespace_str) is str:
        io = StringIO(namespace_str)
        namespaces = json.load(io)
    elif type(namespace_str) is dict:
        namespaces = namespace_str
    else:
        raise SerializerException(
            "Namespaces metadata should be returned as a JSON string or dict, "
            "not as {}".format(type(namespace_str)))

    for prefix, uri in namespaces.items():
        if prefix is not None and uri is not None:
            if prefix != 'default':
                prov_bundle.add_namespace(Namespace(prefix, uri))
            else:
                prov_bundle.set_default_namespace(uri)
        else:
            raise SerializerException(
                "No valid namespace provided for the metadata: {}".format(metadata))
def datatypes():
    g = ProvDocument()
    ex = Namespace("ex", "http://example.org/")
    g.add_namespace(ex)

    attributes = {
        "ex:int": 100,
        "ex:float": 100.123456,
        "ex:long": 123456789000,
        "ex:bool": True,
        "ex:str": "Some string",
        "ex:unicode": "Some unicode string with accents: Huỳnh Trung Đông",
        "ex:timedate": datetime.datetime(2012, 12, 12, 14, 7, 48),
        "ex:intstr": Literal("PROV Internationalized string",
                             PROV["InternationalizedString"], "en"),
    }
    multiline = """Line1
Line2
Line3"""
    attributes["ex:multi-line"] = multiline
    g.entity("ex:e1", attributes)
    return g
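# --- Hedged usage sketch (not part of the original snippets) ---
# Shows how the datatypes() document above could be serialized; assumes the
# usual imports for these snippets (prov.model.ProvDocument et al.) and that
# the prov package is installed.
if __name__ == '__main__':
    doc = datatypes()
    print(doc.get_provn())   # human-readable PROV-N view
    print(doc.serialize())   # PROV-JSON, the package's default serializer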
def example_graph():
    FOAF = Namespace("foaf", "http://xmlns.com/foaf/0.1/")
    EX = Namespace("ex", "http://www.example.com/")
    DCTERMS = Namespace("dcterms", "http://purl.org/dc/terms/")

    # create a provenance container
    g = ProvBundle()
    # Set the default namespace name
    g.set_default_namespace(EX.get_uri())
    g.add_namespace(DCTERMS)

    # add entities, first define the attributes in a dictionary
    e0_attrs = {PROV["type"]: "File",
                EX["path"]: "/shared/crime.txt",
                EX["creator"]: "Alice"}
    # then create the entity
    # If you give the id as a string, it will be treated as a localname
    # under the default namespace
    e0 = g.entity(EX["e0"], e0_attrs)

    # define the attributes for the next entity
    lit0 = Literal("2011-11-16T16:06:00", XSD["dateTime"])
    attrdict = {PROV["type"]: EX["File"],
                EX["path"]: "/shared/crime.txt",
                DCTERMS["creator"]: FOAF['Alice'],
                EX["content"]: "",
                DCTERMS["create"]: lit0}
    # create the entity, note this time we give the id as a PROVQname
    e1 = g.entity(FOAF['Foo'], attrdict)

    # add activities
    # You can give the attributes during the creation if there are not many
    a0 = g.activity(EX['a0'], datetime.datetime(2008, 7, 6, 5, 4, 3), None,
                    {PROV["type"]: EX["create-file"]})

    g0 = g.wasGeneratedBy(e0, a0, None, "g0", {EX["fct"]: "create"})

    attrdict = {EX["fct"]: "load",
                EX["typeexample"]: Literal("MyValue", EX["MyType"])}
    u0 = g.used(a0, e1, None, "u0", attrdict)

    # The id for a relation is an optional argument; the system will generate one
    # if you do not specify it
    g.wasDerivedFrom(e0, e1, a0, g0, u0)

    return g
def collections():
    g = ProvDocument()
    ex = Namespace('ex', 'http://example.org/')

    c1 = g.collection(ex['c1'])
    e1 = g.entity('ex:e1')
    g.hadMember(c1, e1)

    return g
def test_xsd_qnames(self):
    prov_doc = ProvDocument()
    ex = Namespace('ex', 'http://www.example.org')
    prov_doc.add_namespace(ex)
    an_xsd_qname = XSDQName(ex['a_value'])
    prov_doc.entity('ex:e1', {'prov:value': an_xsd_qname})

    self.assertPROVJSONRoundTripEquivalence(prov_doc)
def collections():
    g = ProvDocument()
    ex = Namespace("ex", "http://example.org/")

    c1 = g.collection(ex["c1"])
    e1 = g.entity("ex:e1")
    g.hadMember(c1, e1)

    return g
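# --- Hedged sketch (illustrative, not from the original source) ---
# hadMember() records collection membership; a quick check that the relation
# shows up in the PROV-N rendering of the document built above.
doc = collections()
assert 'hadMember' in doc.get_provn()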
def bundles1():
    # https://github.com/lucmoreau/ProvToolbox/blob/master/prov-n/src/test/resources/prov/bundles1.provn
    # ===========================================================================
    # document
    g = ProvDocument()

    #   prefix ex <http://example.org/example/>
    EX = Namespace("ex", "http://www.example.com/")
    g.add_namespace(EX)

    #   prefix alice <http://example.org/alice/>
    #   prefix bob <http://example.org/bob/>
    g.add_namespace("alice", "http://example.org/alice/")
    g.add_namespace("bob", "http://example.org/bob/")

    #   entity(bob:bundle1, [prov:type='prov:Bundle'])
    g.entity("bob:bundle1", {"prov:type": PROV["Bundle"]})
    #   wasGeneratedBy(bob:bundle1, -, 2012-05-24T10:30:00)
    g.wasGeneratedBy("bob:bundle1", time="2012-05-24T10:30:00")
    #   agent(ex:Bob)
    g.agent("ex:Bob")
    #   wasAttributedTo(bob:bundle1, ex:Bob)
    g.wasAttributedTo("bob:bundle1", "ex:Bob")

    #   entity(alice:bundle2, [ prov:type='prov:Bundle' ])
    g.entity("alice:bundle2", {"prov:type": PROV["Bundle"]})
    #   wasGeneratedBy(alice:bundle2, -, 2012-05-25T11:15:00)
    g.wasGeneratedBy("alice:bundle2", time="2012-05-25T11:15:00")
    #   agent(ex:Alice)
    g.agent("ex:Alice")
    #   wasAttributedTo(alice:bundle2, ex:Alice)
    g.wasAttributedTo("alice:bundle2", "ex:Alice")

    #   bundle bob:bundle1
    b1 = g.bundle("bob:bundle1")
    #     entity(ex:report1, [ prov:type="report", ex:version=1 ])
    b1.entity("ex:report1", {"prov:type": "report", "ex:version": 1})
    #     wasGeneratedBy(ex:report1, -, 2012-05-24T10:00:01)
    b1.wasGeneratedBy("ex:report1", time="2012-05-24T10:00:01")
    #   endBundle

    #   bundle alice:bundle2
    b2 = g.bundle("alice:bundle2")
    #     entity(ex:report1)
    b2.entity("ex:report1")
    #     entity(ex:report2, [ prov:type="report", ex:version=2 ])
    b2.entity("ex:report2", {"prov:type": "report", "ex:version": 2})
    #     wasGeneratedBy(ex:report2, -, 2012-05-25T11:00:01)
    b2.wasGeneratedBy("ex:report2", time="2012-05-25T11:00:01")
    #     wasDerivedFrom(ex:report2, ex:report1)
    b2.wasDerivedFrom("ex:report2", "ex:report1")
    #   endBundle

    # endDocument
    return g
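# --- Hedged sketch (illustrative, not from the original source) ---
# Bundles are rendered as nested "bundle ... endBundle" blocks in PROV-N,
# mirroring the commented PROV-N statements inside bundles1() above.
print(bundles1().get_provn())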
def ctfToProv():
    d1 = ProvDocument()
    dummy = ProvDocument()
    ex = Namespace(
        'ex', 'http://example/'
    )  # namespaces do not need to be explicitly added to a document
    # data = event_field(os.path.join(trace_path, '../config.yaml'))
    counter = 0
    counter_1 = 0
    relationships = []
    entities = []
    activities = []

    for event in trace_collection.events:
        dataset = {
            'ex:' + k: event[k]
            for k in event.field_list_with_scope(
                babeltrace.CTFScope.EVENT_FIELDS)
        }
        dataset.update(
            {'ex:' + 'timestamp': (event['timestamp'] / 1000000000)})
        # dataset.update({'ex:' + 'name': event.name})
        e1 = d1.entity(ex['event' + str(counter)], dataset)
        entities.append(e1)

        producer_agent = d1.agent('ex:' + event['producer_id'])
        controller_agent = d1.agent('ex:' + event['controller_id'])
        activity = d1.activity('ex:' + event['activity'] + str(counter_1))
        activities.append(activity)
        d1.wasGeneratedBy(e1, activity)

        # strings used to detect if the relationship already exists in the d1 document
        association_relationship = str(
            dummy.wasAssociatedWith(activity, producer_agent))
        used_relationship = str(dummy.used(controller_agent, producer_agent))

        # Add activity to producer agent if it has not been added before.
        d1.wasAssociatedWith(activity, producer_agent)
        # if association_relationship not in relationships:
        #     d1.wasAssociatedWith(activity, producer_agent)
        #     relationships.append(association_relationship)

        # Add producer agent to controller agent if it has not been added yet.
        if used_relationship not in relationships:
            d1.used(controller_agent, producer_agent)
            relationships.append(used_relationship)

        # Add temporal relationship between this event and the previous one.
        if counter > 0:
            d1.wasAssociatedWith(entities[counter - 1], e1)

        counter += 1
        counter_1 += 1

    return d1
def long_literals():
    g = ProvDocument()

    long_uri = "http://Lorem.ipsum/dolor/sit/amet/consectetur/adipiscing/elit/Quisque/vel/sollicitudin/felis/nec/venenatis/massa/Aenean/lectus/arcu/sagittis/sit/amet/nisl/nec/varius/eleifend/sem/In/hac/habitasse/platea/dictumst/Aliquam/eget/fermentum/enim/Curabitur/auctor/elit/non/ipsum/interdum/at/orci/aliquam/"
    ex = Namespace('ex', long_uri)
    g.add_namespace(ex)

    g.entity(
        'ex:e1', {
            'prov:label':
                'Lorem ipsum dolor sit amet, consectetur adipiscing elit. '
                'Donec pellentesque luctus nulla vel ullamcorper. Donec sit '
                'amet ligula sit amet lorem pretium rhoncus vel vel lorem. '
                'Sed at consequat metus, eget eleifend massa. Fusce a '
                'facilisis turpis. Lorem volutpat.'
        })

    return g
def add_namespace(self, db_node, prov_bundle):
    try:
        prefixes = db_node.properties[DOC_PROPERTY_NAME_NAMESPACE_PREFIX]
        uris = db_node.properties[DOC_PROPERTY_NAME_NAMESPACE_URI]
    except KeyError:
        return

    for prefix, uri in zip(prefixes, uris):
        if prefix is not None and uri is not None:
            if prefix != 'default':
                prov_bundle.add_namespace(Namespace(prefix, uri))
            else:
                prov_bundle.set_default_namespace(uri)
        else:
            # raise the exception instead of constructing and discarding it
            raise ProvDeserializerException(
                "No valid namespace provided for the node: %s" % db_node)
def add_namespaces_to_bundle(prov_bundle, metadata):
    """
    Add all namespaces in the metadata dict to the provided bundle.

    :param prov_bundle: the bundle to which the namespaces are added
    :param metadata: metadata dict holding the namespaces under METADATA_KEY_NAMESPACES
    :return: None
    """
    namespaces = dict()
    try:
        namespace_str = metadata[METADATA_KEY_NAMESPACES]
    except KeyError:
        # A missing dict key raises KeyError (the original caught ValueError
        # and built the exception without raising it)
        raise SerializerException(
            "No valid namespace provided, should be a string or a dict: {}".format(metadata))

    if type(namespace_str) is str:
        io = StringIO(namespace_str)
        namespaces = json.load(io)
    elif type(namespace_str) is dict:
        namespaces = namespace_str
    elif type(namespace_str) is list:
        for entry in namespace_str:
            if type(entry) is str:
                io = StringIO(entry)
                namespaces.update(json.load(io))
            else:
                raise SerializerException(
                    "Namespaces metadata should be returned as a JSON string, dict or "
                    "list of JSON strings, not as {}".format(type(namespace_str)))
    else:
        raise SerializerException(
            "Namespaces metadata should be returned as a JSON string, dict or "
            "list of JSON strings, not as {}".format(type(namespace_str)))

    for prefix, uri in namespaces.items():
        if prefix is not None and uri is not None:
            if prefix != 'default':
                prov_bundle.add_namespace(Namespace(prefix, uri))
            else:
                prov_bundle.set_default_namespace(uri)
        else:
            raise SerializerException(
                "No valid namespace provided for the metadata: {}".format(metadata))
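# --- Hedged example call (illustrative, not from the original source) ---
# METADATA_KEY_NAMESPACES and SerializerException are assumed to be defined in
# the surrounding module. Namespaces may arrive as a JSON string, a dict, or a
# list of JSON strings; the 'default' prefix sets the default namespace.
doc = ProvDocument()
add_namespaces_to_bundle(doc, {
    METADATA_KEY_NAMESPACES: '{"ex": "http://example.org/", '
                             '"default": "http://example.org/default/"}'
})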
def update(self):
    """
    Checks the current environment and updates attributes using the
    os.environ module.

    :return: Sets attributes on self.
    """
    env_dict = dict(os.environ.items())
    env_hash = dict_to_sha256(env_dict)
    if env_hash != self.env_hash:
        self.env_dict = env_dict
        self.env_hash = env_hash

    # this is only to prevent build errors
    try:
        self.user = self.env_dict["USER"]
    except KeyError:  # pragma: no cover
        # also assign self.user so the attribute always exists
        self.user = self.env_dict["USER"] = "******"

    self.env_namespace = Namespace("envs", str(self))
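# --- Hedged sketch (hypothetical, not the original implementation) ---
# dict_to_sha256 is referenced above but not shown; a minimal version could
# hash a canonical JSON serialization so the digest is key-order independent.
import hashlib
import json

def dict_to_sha256(d):
    payload = json.dumps(d, sort_keys=True).encode('utf-8')
    return hashlib.sha256(payload).hexdigest()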
def __init__(self, *args, **kwargs):
    """Constructor."""

    # update namespaces
    if 'namespaces' not in kwargs:
        kwargs['namespaces'] = self.NAMESPACES
    else:
        if isinstance(kwargs['namespaces'], dict):
            kwargs['namespaces'] = [
                Namespace(prefix, uri)
                for prefix, uri in list(kwargs['namespaces'].items())
            ]
        kwargs['namespaces'].extend(self.NAMESPACES)

    # track organizations to remove redundancy
    self.prov_es_orgs = {}

    super(ProvEsDocument, self).__init__(*args, **kwargs)
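# --- Hedged usage sketch (illustrative, not from the original source) ---
# The constructor above accepts namespaces as a dict (converted to Namespace
# objects) or as a list; the class-level NAMESPACES are always appended.
doc = ProvEsDocument(namespaces={'ex': 'http://example.org/'})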
""" Definition of constants. @author: Camille Maumet <*****@*****.**> @copyright: University of Warwick 2013-2014 """ from prov.model import Namespace from prov.model import PROV NIDM = Namespace('nidm', "http://www.incf.org/ns/nidash/nidm#") NIIRI = Namespace("niiri", "http://iri.nidash.org/") CRYPTO = Namespace("crypto", "http://id.loc.gov/vocabulary/preservation/cryptographicHashFunctions#") FSL = Namespace("fsl", "http://www.incf.org/ns/nidash/fsl#") DCT = Namespace("dct", "http://purl.org/dc/terms/") GAUSSIAN_DISTRIBUTION = NIDM['GaussianDistribution'] INDEPEDENT_CORR = NIDM['IndependentError'] SERIALLY_CORR = NIDM['SeriallyCorrelatedError'] COMPOUND_SYMMETRY_CORR = NIDM['CompoundSymmetricError'] ARBITRARILY_CORR = NIDM['ArbitriralyCorrelatedError'] CORRELATION_ENUM = { INDEPEDENT_CORR, SERIALLY_CORR, COMPOUND_SYMMETRY_CORR, ARBITRARILY_CORR } SPATIALLY_GLOBAL = NIDM['SpatiallyGlocal']
def primer_example():
    # https://github.com/lucmoreau/ProvToolbox/blob/master/prov-n/src/test/resources/prov/primer.pn
    # ===========================================================================
    # document
    g = ProvDocument()

    #    prefix ex <http://example/>
    #    prefix dcterms <http://purl.org/dc/terms/>
    #    prefix foaf <http://xmlns.com/foaf/0.1/>
    ex = Namespace(
        "ex", "http://example/"
    )  # namespaces do not need to be explicitly added to a document
    g.add_namespace("dcterms", "http://purl.org/dc/terms/")
    g.add_namespace("foaf", "http://xmlns.com/foaf/0.1/")

    #    entity(ex:article, [dcterms:title="Crime rises in cities"])
    # first time the ex namespace was used, it is added to the document automatically
    g.entity(ex["article"], {"dcterms:title": "Crime rises in cities"})
    #    entity(ex:articleV1)
    g.entity(ex["articleV1"])
    #    entity(ex:articleV2)
    g.entity(ex["articleV2"])
    #    entity(ex:dataSet1)
    g.entity(ex["dataSet1"])
    #    entity(ex:dataSet2)
    g.entity(ex["dataSet2"])
    #    entity(ex:regionList)
    g.entity(ex["regionList"])
    #    entity(ex:composition)
    g.entity(ex["composition"])
    #    entity(ex:chart1)
    g.entity(ex["chart1"])
    #    entity(ex:chart2)
    g.entity(ex["chart2"])
    #    entity(ex:blogEntry)
    g.entity(ex["blogEntry"])

    #    activity(ex:compile)
    g.activity("ex:compile")  # since ex is registered, it can be used like this
    #    activity(ex:compile2)
    g.activity("ex:compile2")
    #    activity(ex:compose)
    g.activity("ex:compose")
    #    activity(ex:correct, 2012-03-31T09:21:00, 2012-04-01T15:21:00)
    g.activity("ex:correct", "2012-03-31T09:21:00", "2012-04-01T15:21:00")
    # date times can be provided as strings
    #    activity(ex:illustrate)
    g.activity("ex:illustrate")

    #    used(ex:compose, ex:dataSet1, -, [ prov:role = "ex:dataToCompose"])
    g.used("ex:compose", "ex:dataSet1",
           other_attributes={"prov:role": "ex:dataToCompose"})
    #    used(ex:compose, ex:regionList, -, [ prov:role = "ex:regionsToAggregateBy"])
    g.used(
        "ex:compose",
        "ex:regionList",
        other_attributes={"prov:role": "ex:regionsToAggregateBy"},
    )
    #    wasGeneratedBy(ex:composition, ex:compose, -)
    g.wasGeneratedBy("ex:composition", "ex:compose")

    #    used(ex:illustrate, ex:composition, -)
    g.used("ex:illustrate", "ex:composition")
    #    wasGeneratedBy(ex:chart1, ex:illustrate, -)
    g.wasGeneratedBy("ex:chart1", "ex:illustrate")

    #    wasGeneratedBy(ex:chart1, ex:compile, 2012-03-02T10:30:00)
    g.wasGeneratedBy("ex:chart1", "ex:compile", "2012-03-02T10:30:00")
    #    wasGeneratedBy(ex:chart2, ex:compile2, 2012-04-01T15:21:00)

    #    agent(ex:derek, [ prov:type="prov:Person", foaf:givenName = "Derek",
    #          foaf:mbox= "<mailto:[email protected]>"])
    g.agent(
        "ex:derek",
        {
            "prov:type": PROV["Person"],
            "foaf:givenName": "Derek",
            "foaf:mbox": "<mailto:[email protected]>",
        },
    )
    #    wasAssociatedWith(ex:compose, ex:derek, -)
    g.wasAssociatedWith("ex:compose", "ex:derek")
    #    wasAssociatedWith(ex:illustrate, ex:derek, -)
    g.wasAssociatedWith("ex:illustrate", "ex:derek")

    #    agent(ex:chartgen, [ prov:type="prov:Organization",
    #          foaf:name = "Chart Generators Inc"])
    g.agent(
        "ex:chartgen",
        {
            "prov:type": PROV["Organization"],
            "foaf:name": "Chart Generators Inc",
        },
    )
    #    actedOnBehalfOf(ex:derek, ex:chartgen, ex:compose)
    g.actedOnBehalfOf("ex:derek", "ex:chartgen", "ex:compose")
    #    wasAttributedTo(ex:chart1, ex:derek)
    g.wasAttributedTo("ex:chart1", "ex:derek")

    #    wasGeneratedBy(ex:dataSet2, ex:correct, -)
    g.wasGeneratedBy("ex:dataSet2", "ex:correct")
    #    used(ex:correct, ex:dataSet1, -)
    g.used("ex:correct", "ex:dataSet1")
    #    wasDerivedFrom(ex:dataSet2, ex:dataSet1, [prov:type='prov:Revision'])
    g.wasDerivedFrom("ex:dataSet2", "ex:dataSet1",
                     other_attributes={"prov:type": PROV["Revision"]})
    #    wasDerivedFrom(ex:chart2, ex:dataSet2)
    g.wasDerivedFrom("ex:chart2", "ex:dataSet2")
    #    wasDerivedFrom(ex:blogEntry, ex:article, [prov:type='prov:Quotation'])
    g.wasDerivedFrom("ex:blogEntry", "ex:article",
                     other_attributes={"prov:type": PROV["Quotation"]})
    #    specializationOf(ex:articleV1, ex:article)
    g.specializationOf("ex:articleV1", "ex:article")
    #    wasDerivedFrom(ex:articleV1, ex:dataSet1)
    g.wasDerivedFrom("ex:articleV1", "ex:dataSet1")
    #    specializationOf(ex:articleV2, ex:article)
    g.specializationOf("ex:articleV2", "ex:article")
    #    wasDerivedFrom(ex:articleV2, ex:dataSet2)
    g.wasDerivedFrom("ex:articleV2", "ex:dataSet2")
    #    alternateOf(ex:articleV2, ex:articleV1)
    g.alternateOf("ex:articleV2", "ex:articleV1")

    # endDocument
    return g
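# --- Hedged usage sketch (illustrative, not from the original source) ---
# Besides PROV-N, the primer document can be rendered as a graph image via
# prov.dot, the same helper the example() snippet further below relies on.
from prov.dot import prov_to_dot
prov_to_dot(primer_example()).write_png('primer.png')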
""" Definition of constants. @author: Camille Maumet <*****@*****.**> @copyright: University of Warwick 2013-2014 """ from prov.model import PROV, Namespace, NamespaceManager NIDM = Namespace('nidm', "http://purl.org/nidash/nidm#") NIIRI = Namespace("niiri", "http://iri.nidash.org/") CRYPTO = Namespace( "crypto", "http://id.loc.gov/vocabulary/preservation/cryptographicHashFunctions#") FSL = Namespace("fsl", "http://purl.org/nidash/fsl#") SPM = Namespace("spm", "http://purl.org/nidash/spm#") AFNI = Namespace("afni", "http://purl.org/nidash/afni#") DCT = Namespace("dct", "http://purl.org/dc/terms/") OBO = Namespace("obo", "http://purl.obolibrary.org/obo/") DCTYPE = Namespace("dctype", "http://purl.org/dc/dcmitype/") NLX_OLD = Namespace("nlx_old", "http://neurolex.org/wiki/") DC = Namespace("dc", "http://purl.org/dc/elements/1.1/") NFO = Namespace( "nfo", "http://www.semanticdesktop.org/ontologies/2007/03/22/nfo#") SCR = Namespace("scr", "http://scicrunch.org/resolver/") NIF = Namespace("nif", "http://uri.neuinfo.org/nif/nifstd/") namespaces = (PROV, NIDM, NIIRI, CRYPTO, FSL, SPM, AFNI, DCT, OBO, DCTYPE, DC, NFO, SCR, NIF) namespace_manager = NamespaceManager(namespaces)
def w3c_publication_2():
    # https://github.com/lucmoreau/ProvToolbox/blob/master/asn/src/test/resources/prov/w3c-publication2.prov-asn
    # ===========================================================================
    # bundle
    #
    # prefix ex <http://example.org/>
    # prefix rec <http://example.org/record>
    #
    # prefix w3 <http://www.w3.org/TR/2011/>
    # prefix hg <http://dvcs.w3.org/hg/prov/raw-file/9628aaff6e20/model/releases/WD-prov-dm-20111215/>
    #
    # entity(hg:Overview.html, [ prov:type="file in hg" ])
    # entity(w3:WD-prov-dm-20111215, [ prov:type="html4" ])
    #
    # activity(ex:rcp,-,-,[prov:type="copy directory"])
    #
    # wasGeneratedBy(rec:g; w3:WD-prov-dm-20111215, ex:rcp, -)
    #
    # entity(ex:req3, [ prov:type="http://www.w3.org/2005/08/01-transitions.html#pubreq" %% xsd:anyURI ])
    #
    # used(rec:u; ex:rcp, hg:Overview.html, -)
    # used(ex:rcp, ex:req3, -)
    #
    # wasDerivedFrom(w3:WD-prov-dm-20111215, hg:Overview.html, ex:rcp, rec:g, rec:u)
    #
    # agent(ex:webmaster, [ prov:type='prov:Person' ])
    #
    # wasAssociatedWith(ex:rcp, ex:webmaster, -)
    #
    # endBundle
    # ===========================================================================
    ex = Namespace("ex", "http://example.org/")
    rec = Namespace("rec", "http://example.org/record")
    w3 = Namespace("w3", "http://www.w3.org/TR/2011/")
    hg = Namespace(
        "hg",
        "http://dvcs.w3.org/hg/prov/raw-file/9628aaff6e20/model/releases/WD-prov-dm-20111215/",
    )

    g = ProvDocument()

    g.entity(hg["Overview.html"], {"prov:type": "file in hg"})
    g.entity(w3["WD-prov-dm-20111215"], {"prov:type": "html4"})

    g.activity(ex["rcp"], None, None, {"prov:type": "copy directory"})

    g.wasGeneratedBy("w3:WD-prov-dm-20111215", "ex:rcp", identifier=rec["g"])

    g.entity(
        "ex:req3",
        {
            "prov:type": Identifier(
                "http://www.w3.org/2005/08/01-transitions.html#pubreq")
        },
    )

    g.used("ex:rcp", "hg:Overview.html", identifier="rec:u")
    g.used("ex:rcp", "ex:req3")

    g.wasDerivedFrom("w3:WD-prov-dm-20111215", "hg:Overview.html", "ex:rcp",
                     "rec:g", "rec:u")

    g.agent("ex:webmaster", {"prov:type": "Person"})

    g.wasAssociatedWith("ex:rcp", "ex:webmaster")

    return g
def ctfToProv():
    d1 = ProvDocument()
    dummy = ProvDocument()
    ex = Namespace(
        'ex', 'http://example/'
    )  # namespaces do not need to be explicitly added to a document
    # data = event_field(os.path.join(trace_path, '../config.yaml'))
    counter = 0
    # counter_1 = 0
    relationships = []
    entityActivityList = []
    # activities = []
    can_events = defaultdict(list)

    for event in trace_collection.events:
        dataset = {
            'ex:' + k: event[k]
            for k in event.field_list_with_scope(
                babeltrace.CTFScope.EVENT_FIELDS)
        }
        # dataset.update({'ex:' + 'timestamp': (event['timestamp'] / 1000000000)})
        dataset.update({'ex:' + 'name': event.name})

        # # calculates PGN
        # pf = str(bin(int(dataset['node_id'], 16)))[5:13]
        # if int(pf) > 240:
        #     pgn = int(str(bin(int(dataset['node_id'], 16)))[3:21], 2)
        # else:
        #     pgn = int(str(bin(int(dataset['node_id'], 16)))[3:13], 2)

        # # Gets source address.
        # sa = str(bin(int(dataset['node_id'], 16)))[-8:]  # gets last byte.
        sa = event['producer_id']
        activity = event['activity']

        e1 = d1.entity(ex['event' + str(counter)], dataset)

        # create class object to store entity and activity data field.
        entity_activity = entityActivity()
        entity_activity.addEntityActivity(e1, activity)
        # entityActivityList.append(e1)
        # can_events.setdefault(str(sa), []).append(e1)
        can_events[sa].append(entity_activity)

        # node_id = d1.agent('ex:' + event['node_id'])
        controller_agent = d1.agent('ex:' + event['controller_id'])
        # activity = d1.activity('ex:' + event['activity'] + str(counter))
        # activities.append(activity)
        # d1.wasGeneratedBy(e1, activity)

        # strings used to detect if the relationship already exists in the d1 document
        # association_relationship = str(dummy.wasAssociatedWith(activity, sa))
        # used_relationship = str(dummy.used(network_id, sa))

        # add activity to sensor agent
        # d1.wasAssociatedWith(activity, sensor_agent)

        # check if the association already exists
        # if association_relationship not in relationships:
        #     d1.wasAssociatedWith(activity, sensor_agent)
        #     relationships.append(association_relationship)

        # if used_relationship not in relationships:
        #     d1.used(network_id, sa)
        #     relationships.append(used_relationship)

        # counter += 1
        # counter_1 += 1

    # for index in range(len(entityActivityList) - 1):
    #     d1.wasAssociatedWith(entityActivityList[index], entityActivityList[index + 1])

    # for index in range(len(entityActivityList)):
    #     d1.wasGeneratedBy(entityActivityList[index], activities[index])
    #     d1.wasAssociatedWith(activities[index], sa)

    for key in can_events.keys():
        producer_agent = d1.agent('ex:' + str(key))
        used_relationship = str(dummy.used(controller_agent, producer_agent))
        # association_relationship = str(dummy.wasAssociatedWith(activity, sa))
        if used_relationship not in relationships:
            d1.used(controller_agent, producer_agent)
            relationships.append(used_relationship)

        entityActivityList = can_events[key]
        for index in range(len(entityActivityList) - 1):
            d1.wasAssociatedWith(entityActivityList[index].getEntity(),
                                 entityActivityList[index + 1].getEntity())
            d1.wasGeneratedBy(entityActivityList[index],
                              entityActivityList[index].getActivity())
            d1.wasAssociatedWith(entityActivityList[index].getActivity(),
                                 producer_agent)

    return d1
def bidsmri2project(directory, args):
    # initialize empty cde graph...it may get replaced if we're doing variable to term mapping or not
    cde = Graph()

    # Parse dataset_description.json file in BIDS directory
    if os.path.isdir(os.path.join(directory)):
        try:
            with open(os.path.join(directory, 'dataset_description.json')) as data_file:
                dataset = json.load(data_file)
        except OSError:
            logging.critical("Cannot find dataset_description.json file which is required in the BIDS spec")
            exit("-1")
    else:
        logging.critical("Error: BIDS directory %s does not exist!" % os.path.join(directory))
        exit("-1")

    # create project / nidm-exp doc
    project = Project()

    # if there are git annex sources then add them
    num_sources = addGitAnnexSources(obj=project.get_uuid(), bids_root=directory)
    # else just add the local path to the dataset
    if num_sources == 0:
        project.add_attributes({Constants.PROV['Location']: "file:/" + directory})

    # add various attributes if they exist in BIDS dataset
    for key in dataset:
        # if key from dataset_description file is mapped to term in BIDS_Constants.py then add to NIDM object
        if key in BIDS_Constants.dataset_description:
            if type(dataset[key]) is list:
                project.add_attributes(
                    {BIDS_Constants.dataset_description[key]: "".join(dataset[key])})
            else:
                project.add_attributes(
                    {BIDS_Constants.dataset_description[key]: dataset[key]})

    # get BIDS layout
    bids_layout = BIDSLayout(directory)

    # create empty dictionary for sessions where key is subject id and used later to link scans to same session as demographics
    session = {}
    participant = {}
    # Parse participants.tsv file in BIDS directory and create study and acquisition objects
    if os.path.isfile(os.path.join(directory, 'participants.tsv')):
        with open(os.path.join(directory, 'participants.tsv')) as csvfile:
            participants_data = csv.DictReader(csvfile, delimiter='\t')

            # logic to map variables to terms.
            # first iterate over variables in dataframe and check which ones are already mapped
            # as BIDS constants and which are not. For those that are not
            # we want to use the variable-term mapping functions to help the user do the mapping
            # iterate over columns
            mapping_list = []
            column_to_terms = {}
            for field in participants_data.fieldnames:
                # column is not in BIDS_Constants
                if not (field in BIDS_Constants.participants):
                    # add column to list for column_to_terms mapping
                    mapping_list.append(field)

            # if user didn't supply a json mapping file but we're doing some variable-term mapping,
            # create an empty one for column_to_terms to use
            if args.json_map == False:
                # defaults to participants.json because here we're mapping the participants.tsv file variables to terms
                # if participants.json file doesn't exist then run without json mapping file
                if not os.path.isfile(os.path.join(directory, 'participants.json')):
                    # maps variables in CSV file to terms
                    temp = DataFrame(columns=mapping_list)
                    if args.no_concepts:
                        column_to_terms, cde = map_variables_to_terms(
                            directory=directory, assessment_name='participants.tsv',
                            df=temp, output_file=os.path.join(directory, 'participants.json'),
                            bids=True, associate_concepts=False)
                    else:
                        column_to_terms, cde = map_variables_to_terms(
                            directory=directory, assessment_name='participants.tsv',
                            df=temp, output_file=os.path.join(directory, 'participants.json'),
                            bids=True)
                else:
                    # maps variables in CSV file to terms
                    temp = DataFrame(columns=mapping_list)
                    if args.no_concepts:
                        column_to_terms, cde = map_variables_to_terms(
                            directory=directory, assessment_name='participants.tsv', df=temp,
                            output_file=os.path.join(directory, 'participants.json'),
                            json_file=os.path.join(directory, 'participants.json'),
                            bids=True, associate_concepts=False)
                    else:
                        column_to_terms, cde = map_variables_to_terms(
                            directory=directory, assessment_name='participants.tsv', df=temp,
                            output_file=os.path.join(directory, 'participants.json'),
                            json_file=os.path.join(directory, 'participants.json'),
                            bids=True)
            else:
                # maps variables in CSV file to terms
                temp = DataFrame(columns=mapping_list)
                if args.no_concepts:
                    column_to_terms, cde = map_variables_to_terms(
                        directory=directory, assessment_name='participants.tsv', df=temp,
                        output_file=os.path.join(directory, 'participants.json'),
                        json_file=args.json_map, bids=True, associate_concepts=False)
                else:
                    column_to_terms, cde = map_variables_to_terms(
                        directory=directory, assessment_name='participants.tsv', df=temp,
                        output_file=os.path.join(directory, 'participants.json'),
                        json_file=args.json_map, bids=True)

            for row in participants_data:
                # create session object for subject to be used for participant metadata and image data
                # parse subject id from "sub-XXXX" string
                temp = row['participant_id'].split("-")
                # for ambiguity in BIDS datasets. Sometimes participant_id is sub-XXXX and other times it's just XXXX
                if len(temp) > 1:
                    subjid = temp[1]
                else:
                    subjid = temp[0]
                logging.info(subjid)
                session[subjid] = Session(project)

                # add acquisition object
                acq = AssessmentAcquisition(session=session[subjid])

                acq_entity = AssessmentObject(acquisition=acq)
                participant[subjid] = {}
                participant[subjid]['person'] = acq.add_person(
                    attributes=({Constants.NIDM_SUBJECTID: row['participant_id']}))

                # add nfo:filename entry to assessment entity to reflect provenance of where this data came from
                acq_entity.add_attributes(
                    {Constants.NIDM_FILENAME: getRelPathToBIDS(
                        os.path.join(directory, 'participants.tsv'), directory)})
                # acq_entity.add_attributes({Constants.NIDM_FILENAME: os.path.join(directory, 'participants.tsv')})

                # add qualified association of participant with acquisition activity
                acq.add_qualified_association(
                    person=participant[subjid]['person'], role=Constants.NIDM_PARTICIPANT)
                # print(acq)

                # if there are git annex sources for participants.tsv file then add them
                num_sources = addGitAnnexSources(obj=acq_entity.get_uuid(), bids_root=directory)
                # else just add the local path to the dataset
                if num_sources == 0:
                    acq_entity.add_attributes(
                        {Constants.PROV['Location']:
                             "file:/" + os.path.join(directory, 'participants.tsv')})

                # if there's a JSON sidecar file then create an entity and associate it with all the assessment entities
                if os.path.isfile(os.path.join(directory, 'participants.json')):
                    json_sidecar = AssessmentObject(acquisition=acq)
                    json_sidecar.add_attributes(
                        {PROV_TYPE: QualifiedName(
                            Namespace("bids", Constants.BIDS), "sidecar_file"),
                         Constants.NIDM_FILENAME: getRelPathToBIDS(
                             os.path.join(directory, 'participants.json'), directory)})

                    # add Git Annex Sources
                    # if there are git annex sources for participants.tsv file then add them
                    num_sources = addGitAnnexSources(
                        obj=json_sidecar.get_uuid(),
                        filepath=os.path.join(directory, 'participants.json'),
                        bids_root=directory)
                    # else just add the local path to the dataset
                    if num_sources == 0:
                        json_sidecar.add_attributes(
                            {Constants.PROV['Location']:
                                 "file:/" + os.path.join(directory, 'participants.json')})

                # check if json_sidecar entity exists and if so associate assessment entity with it
                if 'json_sidecar' in locals():
                    # connect json_entity with acq_entity
                    acq_entity.add_attributes(
                        {Constants.PROV["wasInfluencedBy"]: json_sidecar})

                for key, value in row.items():
                    if not value:
                        continue
                    # for variables in participants.tsv file who have term mappings in BIDS_Constants.py use those,
                    # add to json_map so we don't have to map these if user supplied arguments to map variables
                    if key in BIDS_Constants.participants:
                        # WIP
                        # Here we are adding to CDE graph data elements for BIDS Constants that remain fixed for each BIDS-compliant dataset
                        if not (BIDS_Constants.participants[key] == Constants.NIDM_SUBJECTID):
                            # create a namespace with the URL for fixed BIDS_Constants term
                            # item_ns = Namespace(str(Constants.BIDS.namespace.uri))
                            # add prefix to namespace which is the BIDS fixed variable name
                            # cde.bind(prefix="bids", namespace=item_ns)
                            # ID for BIDS variables is always the same bids:[bids variable]
                            cde_id = Constants.BIDS[key]
                            # add the data element to the CDE graph
                            cde.add((cde_id, RDF.type, Constants.NIDM['DataElement']))
                            cde.add((cde_id, RDF.type, Constants.PROV['Entity']))
                            # add some basic information about this data element
                            cde.add((cde_id, Constants.RDFS['label'],
                                     Literal(BIDS_Constants.participants[key].localpart)))
                            cde.add((cde_id, Constants.NIDM['isAbout'],
                                     URIRef(BIDS_Constants.participants[key].uri)))
                            cde.add((cde_id, Constants.NIDM['source_variable'],
                                     Literal(key)))
                            cde.add((cde_id, Constants.NIDM['description'],
                                     Literal("participant/subject identifier")))
                            cde.add((cde_id, Constants.RDFS['comment'],
                                     Literal("BIDS participants_id variable fixed in specification")))
                            cde.add((cde_id, Constants.RDFS['valueType'],
                                     URIRef(Constants.XSD["string"])))

                            acq_entity.add_attributes({cde_id: Literal(value)})
                        # if this was the participant_id, we already handled it above creating agent / qualified association
                        # if not (BIDS_Constants.participants[key] == Constants.NIDM_SUBJECTID):
                        #     acq_entity.add_attributes({BIDS_Constants.participants[key]: value})

                    # else if user added -mapvars flag to command line then we'll use the variable->term mapping
                    # procedures to help user map variables to terms (also used in CSV2NIDM.py)
                    else:
                        # WIP: trying to add new support for CDEs...
                        add_attributes_with_cde(prov_object=acq_entity, cde=cde,
                                                row_variable=key, value=value)
                        # if key in column_to_terms:
                        #     acq_entity.add_attributes({QualifiedName(provNamespace(Core.safe_string(None, string=str(key)), column_to_terms[key]["url"]), ""): value})
                        # else:
                        #     acq_entity.add_attributes({Constants.BIDS[key.replace(" ", "_")]: value})

    # create acquisition objects for each scan for each subject
    # loop through all subjects in dataset
    for subject_id in bids_layout.get_subjects():
        logging.info("Converting subject: %s" % subject_id)
        # skip .git directories...added to support datalad datasets
        if subject_id.startswith("."):
            continue

        # check if there are session numbers. If so, store them in the session activity and create new
        # sessions for these imaging acquisitions. Because we don't know which imaging session the root
        # participants.tsv file data may be associated with, we simply link the imaging acquisitions to
        # different sessions (i.e. the participants.tsv file goes into an AssessmentAcquisition linked to
        # a unique session and the imaging acquisitions go into MRAcquisitions, each with a unique session)
        imaging_sessions = bids_layout.get_sessions(subject=subject_id)
        # if session_dirs has entries then get any metadata about session and store in session activity
        # bids_layout.get(subject=subject_id, type='session', extensions='.tsv')
        # bids_layout.get(subject=subject_id, type='scans', extensions='.tsv')
        # bids_layout.get(extensions='.tsv', return_type='obj')

        # loop through each session if there is a sessions directory
        if len(imaging_sessions) > 0:
            for img_session in imaging_sessions:
                # create a new session
                ses = Session(project)
                # add session number as metadata
                ses.add_attributes({Constants.BIDS['session_number']: img_session})
                addimagingsessions(bids_layout=bids_layout, subject_id=subject_id,
                                   session=ses, participant=participant,
                                   directory=directory, img_session=img_session)
        # else we have no ses-* directories in the BIDS layout
        addimagingsessions(bids_layout=bids_layout, subject_id=subject_id,
                           session=Session(project), participant=participant,
                           directory=directory)

    # Added temporarily to support phenotype files
    # for each *.tsv / *.json file pair in the phenotypes directory
    # WIP: ADD VARIABLE -> TERM MAPPING HERE
    for tsv_file in glob.glob(os.path.join(directory, "phenotype", "*.tsv")):
        # for now, open the TSV file, extract the row for this subject, store it in an acquisition object
        # and link to the associated JSON data dictionary file
        with open(tsv_file) as phenofile:
            pheno_data = csv.DictReader(phenofile, delimiter='\t')
            for row in pheno_data:
                subjid = row['participant_id'].split("-")
                if not subjid[1] == subject_id:
                    continue
                else:
                    # add acquisition object
                    acq = AssessmentAcquisition(session=session[subjid[1]])
                    # add qualified association with person
                    acq.add_qualified_association(
                        person=participant[subject_id]['person'],
                        role=Constants.NIDM_PARTICIPANT)

                    acq_entity = AssessmentObject(acquisition=acq)

                    for key, value in row.items():
                        if not value:
                            continue
                        # we're using participant_id in NIDM in agent so don't add to assessment as a triple.
                        # BIDS phenotype files seem to have an index column with no column header variable name so skip those
                        if (not key == "participant_id") and (key != ""):
                            # for now we're using a placeholder namespace for BIDS and simply the variable names as the concept IDs..
                            acq_entity.add_attributes({Constants.BIDS[key]: value})

                    # link TSV file
                    acq_entity.add_attributes(
                        {Constants.NIDM_FILENAME: getRelPathToBIDS(tsv_file, directory)})
                    # acq_entity.add_attributes({Constants.NIDM_FILENAME: tsv_file})

                    # if there are git annex sources for participants.tsv file then add them
                    num_sources = addGitAnnexSources(obj=acq_entity.get_uuid(),
                                                     bids_root=directory)
                    # else just add the local path to the dataset
                    if num_sources == 0:
                        acq_entity.add_attributes(
                            {Constants.PROV['Location']: "file:/" + tsv_file})

                    # link associated JSON file if it exists
                    data_dict = os.path.join(
                        directory, "phenotype",
                        os.path.splitext(os.path.basename(tsv_file))[0] + ".json")
                    if os.path.isfile(data_dict):
                        # if file exists, create a new entity and associate it with the appropriate
                        # activity and a used relationship with the TSV-related entity
                        json_entity = AssessmentObject(acquisition=acq)
                        json_entity.add_attributes(
                            {PROV_TYPE: Constants.BIDS["sidecar_file"],
                             Constants.NIDM_FILENAME: getRelPathToBIDS(data_dict, directory)})
                        # add Git Annex Sources
                        # if there are git annex sources for participants.tsv file then add them
                        num_sources = addGitAnnexSources(
                            obj=json_entity.get_uuid(), filepath=data_dict,
                            bids_root=directory)
                        # else just add the local path to the dataset
                        if num_sources == 0:
                            json_entity.add_attributes(
                                {Constants.PROV['Location']: "file:/" + data_dict})

                        # connect json_entity with acq_entity
                        acq_entity.add_attributes(
                            {Constants.PROV["wasInfluencedBy"]: json_entity.get_uuid()})

    return project, cde
    Namespace,
)

import numpy as np

from .common import calculate_provenance_network_metrics

POKEMON_GO_DATA_COLUMNS = [
    "n_balls_collected",
    "n_pokemons_captured",
    "n_pokemons_disposed",
    "strength_captured_avg",
    "strength_disposed_avg",
]

logger = logging.getLogger(__name__)

NS_PGO = Namespace("pgo", "http://sociam.org/pokemongo#")
PGO_strength = NS_PGO["strength"]


def create_graph_index(dataset_path, output_path):
    logger.debug("Working in folder: %s", dataset_path)
    dataset_path = Path(dataset_path)
    graph_index_filepath = dataset_path / "graphs.csv"
    if not graph_index_filepath.exists():
        logger.error("Graphs index file is not found: %s", graph_index_filepath)
        exit(1)
    logger.debug("Reading graphs index...")
    graphs = pd.read_csv(graph_index_filepath)
def decode_json_container(jc, bundle):
    if 'prefix' in jc:
        prefixes = jc['prefix']
        for prefix, uri in prefixes.items():
            if prefix != 'default':
                bundle.add_namespace(Namespace(prefix, uri))
            else:
                bundle.set_default_namespace(uri)
        del jc['prefix']

    for rec_type_str in jc:
        rec_type = PROV_RECORD_IDS_MAP[rec_type_str]
        for rec_id, content in jc[rec_type_str].items():
            if hasattr(content, 'items'):  # it is a dict
                # There is only one element, create a singleton list
                elements = [content]
            else:
                # expect it to be a list of dictionaries
                elements = content

            for element in elements:
                attributes = dict()
                other_attributes = []
                # this is for the multiple-entity membership hack to come
                membership_extra_members = None
                for attr_name, values in element.items():
                    attr = (PROV_ATTRIBUTES_ID_MAP[attr_name]
                            if attr_name in PROV_ATTRIBUTES_ID_MAP
                            else valid_qualified_name(bundle, attr_name))
                    if attr in PROV_ATTRIBUTES:
                        if isinstance(values, list):
                            # only one value is allowed
                            if len(values) > 1:
                                # unless it is the membership hack
                                if rec_type == PROV_MEMBERSHIP and \
                                        attr == PROV_ATTR_ENTITY:
                                    # This is a membership relation with
                                    # multiple entities
                                    # HACK: create multiple membership
                                    # relations, one for each entity

                                    # Store all the extra entities
                                    membership_extra_members = values[1:]
                                    # Create the first membership relation as
                                    # normal for the first entity
                                    value = values[0]
                                else:
                                    error_msg = (
                                        'The prov package does not support PROV'
                                        ' attributes having multiple values.')
                                    logger.error(error_msg)
                                    raise ProvJSONException(error_msg)
                            else:
                                value = values[0]
                        else:
                            value = values
                        value = (valid_qualified_name(bundle, value)
                                 if attr in PROV_ATTRIBUTE_QNAMES
                                 else parse_xsd_datetime(value))
                        attributes[attr] = value
                    else:
                        if isinstance(values, list):
                            other_attributes.extend(
                                (attr, decode_json_representation(value, bundle))
                                for value in values)
                        else:
                            # single value
                            other_attributes.append(
                                (attr, decode_json_representation(values, bundle)))

                bundle.new_record(rec_type, rec_id, attributes, other_attributes)

                # HACK: creating extra (unidentified) membership relations
                if membership_extra_members:
                    collection = attributes[PROV_ATTR_COLLECTION]
                    for member in membership_extra_members:
                        bundle.membership(collection,
                                          valid_qualified_name(bundle, member))
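# --- Hedged sketch of the input shape (illustrative names only) ---
# decode_json_container() expects a PROV-JSON container: an optional 'prefix'
# map plus one object per record type, keyed by record id.
container = {
    'prefix': {'ex': 'http://example.org/'},
    'entity': {'ex:e1': {'prov:label': 'an entity'}},
}
doc = ProvDocument()
decode_json_container(container, doc)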
def example():
    g = ProvDocument()

    # Local namespace
    # Doesn't exist yet so we are creating it
    ap = Namespace('aip', 'https://araport.org/provenance/')
    # Dublin Core
    g.add_namespace("dcterms", "http://purl.org/dc/terms/")
    # FOAF
    g.add_namespace("foaf", "http://xmlns.com/foaf/0.1/")

    # Add sponsors and contributors as Agents
    # ap['matthew_vaughn']
    # aip:matthew_vaughn
    # https://araport.org/provenance/:matthew_vaughn
    # Learn this from a call to profiles service? Adds a dependency on Agave
    # so I am open to figuring out another way
    me = g.agent(
        ap['matthew_vaughn'], {
            'prov:type': PROV["Person"],
            'foaf:givenName': "Matthew Vaughn",
            'foaf:mbox': "<mailto:[email protected]>"
        })
    # Hard coded for now
    walter = g.agent(
        ap['walter_moreira'], {
            'prov:type': PROV["Person"],
            'foaf:givenName': "Walter Moreira",
            'foaf:mbox': "<mailto:[email protected]>"
        })
    utexas = g.agent(
        ap['university_of_texas'], {
            'prov:type': PROV["Organization"],
            'foaf:givenName': "University of Texas at Austin"
        })

    # Set delegation to our host University
    # We may have trouble doing this for other users since we don't always
    # capture their host institution
    g.actedOnBehalfOf(walter, utexas)
    g.actedOnBehalfOf(me, utexas)

    # Include the ADAMA platform as an Agent and set attribution
    # dcterms:title and dcterms:description are hardcoded
    # dcterms:language is hard-coded
    # dcterms:source is the URI of the public git source repository for ADAMA
    # "dcterms:updated": "2015-04-17T09:44:56" - this would actually be the date ADAMA was updated
    adama_platform = g.agent(
        ap['adama_platform'], {
            'dcterms:title': "ADAMA",
            'dcterms:description': "Araport Data and Microservices API",
            'dcterms:language': "en-US",
            'dcterms:identifier': "https://api.araport.org/community/v0.3/",
            'dcterms:updated': "2015-04-17T09:44:56"
        })
    g.wasGeneratedBy(adama_platform, walter)

    # Include the ADAMA microservice as an Agent and set attribution+delegation
    # dcterms:title and dcterms:description are inherited from the service's metadata
    # dcterms:language is hard-coded
    # dcterms:identifier is the deployment URI for the service
    # dcterms:source is the URI of the public git source repository. The URL in this example is just a dummy
    #
    # The name for each microservice should be unique. We've decided to
    # use the combination of namespace, service name, and version
    microservice_name = 'mwvaughn/bar_annotation_v1.0.0'
    adama_microservice = g.agent(
        ap[microservice_name], {
            'dcterms:title': "BAR Annotation Service",
            'dcterms:description': "Returns annotation from locus ID",
            'dcterms:language': "en-US",
            'dcterms:identifier': "https://api.araport.org/community/v0.3/mwvaughn/bar_annotation_v1.0.0",
            'dcterms:source': "https://github.com/Arabidopsis-Information-Portal/prov-enabled-api-sample"
        })

    # the microservice was generated by me on date X (don't use now, use when the service was updated)
    g.wasGeneratedBy(adama_microservice, me, datetime.datetime.now())
    # The microservice used the platform now
    g.used(adama_microservice, adama_platform, datetime.datetime.now())

    # Sources
    #
    # Define BAR
    # Agents
    nick = g.agent(
        ap['nicholas_provart'], {
            'prov:type': PROV["Person"],
            'foaf:givenName': "Nicholas Provart",
            'foaf:mbox': "*****@*****.**"
        })
    utoronto = g.agent(
        ap['university_of_toronto'], {
            'prov:type': PROV["Organization"],
            'foaf:givenName': "University of Toronto",
            'dcterms:identifier': "http://www.utoronto.ca/"
        })
    g.actedOnBehalfOf(nick, utoronto)

    # Entity
    # All fields derived from Sources.yml
    # dcterms:title and dcterms:description come straight from the YAML
    # dcterms:identifier - URI pointing to the source's canonical URI representation
    # optional - dcterms:language: Recommended best practice is to use a controlled vocabulary such as RFC 4646
    # optional - dcterms:updated: date the source was published or last updated
    # optional - dcterms:license: Simple string or URI to license. Validate URI if provided?
    datasource1 = g.entity(
        ap['datasource1'], {
            'dcterms:title': "BAR Arabidopsis AGI -> Annotation",
            'dcterms:description': "Most recent annotation for given AGI",
            'dcterms:language': "en-US",
            'dcterms:identifier': "http://bar.utoronto.ca/webservices/agiToAnnot.php",
            'dcterms:updated': "2015-04-17T09:44:56",
            'dcterms:license': "Creative Commons 3.0"
        })
    # Set up attribution to Nick
    g.wasAttributedTo(datasource1, nick)

    # Define TAIR
    # Agents
    # dcterms:language: Recommended best practice is to use a controlled vocabulary such as RFC 4646
    eva = g.agent(ap['eva_huala'], {
        'prov:type': PROV["Person"],
        'foaf:givenName': "Eva Huala"
    })
    phoenix = g.agent(
        ap['phoenix_bioinformatics'], {
            'prov:type': PROV["Organization"],
            'foaf:givenName': "Phoenix Bioinformatics"
        })
    g.actedOnBehalfOf(eva, phoenix)

    # Entity
    # All fields derived from Sources.yml
    # optional - dcterms:citation: Plain text bibliographic citation. If only provided as doi, should we try to validate it?
    datasource2 = g.entity(
        ap['datasource2'], {
            'dcterms:title': "TAIR",
            'dcterms:description': "The Arabidopsis Information Resource",
            'dcterms:language': "en-US",
            'dcterms:identifier': "https://www.arabidopsis.org/",
            'dcterms:citation': "The Arabidopsis Information Resource (TAIR): improved gene annotation and new tools. Nucleic Acids Research 2011 doi: 10.1093/nar/gkr1090"
        })
    g.wasAttributedTo(datasource2, eva)

    # In Sources.yml, these two sources are nested. Define that relationship here
    # There are other types of relationships but we will just use derived-from
    # for simplicity in this prototype
    g.wasDerivedFrom(ap['datasource1'], ap['datasource2'])

    # Depending on which ADAMA microservice type we are using, define an activity
    # Eventually, break these into more atomic actions in a chain
    action1 = g.activity(ap['do_query'], datetime.datetime.now())
    # action1 = g.activity(ap['do_map'], datetime.datetime.now())
    # action1 = g.activity(ap['do_generic'], datetime.datetime.now())
    # action1 = g.activity(ap['do_passthrough'], datetime.datetime.now())
    # Future... Support for ADAMA-native microservices
    # action1 = g.activity(ap['generate'], datetime.datetime.now())

    # Define current ADAMA response as an Entity
    # This is what's being returned to the user and is thus the subject of the PROV record
    # May be able to add more attributes to it but this is the minimum
    response = g.entity(ap['adama_response'])

    # Response is generated by the process_query action
    # Time-stamp it!
    g.wasGeneratedBy(response, ap['do_query'], datetime.datetime.now())
    # The process_query used the microservice
    g.used(ap['do_query'], adama_microservice, datetime.datetime.now())
    # The microservice used datasource1
    g.used(adama_microservice, datasource1, datetime.datetime.now())

    # Print PROV-N
    print(g.get_provn())
    # Print PROV-JSON
    print(g.serialize())

    # Write out as a pretty picture
    graph = prov.dot.prov_to_dot(g)
    graph.write_png('Sources.png')
def w3c_publication_2():
    # https://github.com/lucmoreau/ProvToolbox/blob/master/asn/src/test/resources/prov/w3c-publication2.prov-asn
    # ===========================================================================
    # bundle
    #
    # prefix ex <http://example.org/>
    # prefix rec <http://example.org/record>
    #
    # prefix w3 <http://www.w3.org/TR/2011/>
    # prefix hg <http://dvcs.w3.org/hg/prov/raw-file/9628aaff6e20/model/releases/WD-prov-dm-20111215/>
    #
    # entity(hg:Overview.html, [ prov:type="file in hg" ])
    # entity(w3:WD-prov-dm-20111215, [ prov:type="html4" ])
    #
    # activity(ex:rcp,-,-,[prov:type="copy directory"])
    #
    # wasGeneratedBy(rec:g; w3:WD-prov-dm-20111215, ex:rcp, -)
    #
    # entity(ex:req3, [ prov:type="http://www.w3.org/2005/08/01-transitions.html#pubreq" %% xsd:anyURI ])
    #
    # used(rec:u; ex:rcp, hg:Overview.html, -)
    # used(ex:rcp, ex:req3, -)
    #
    # wasDerivedFrom(w3:WD-prov-dm-20111215, hg:Overview.html, ex:rcp, rec:g, rec:u)
    #
    # agent(ex:webmaster, [ prov:type='prov:Person' ])
    #
    # wasAssociatedWith(ex:rcp, ex:webmaster, -)
    #
    # endBundle
    # ===========================================================================
    ex = Namespace('ex', 'http://example.org/')
    rec = Namespace('rec', 'http://example.org/record')
    w3 = Namespace('w3', 'http://www.w3.org/TR/2011/')
    hg = Namespace(
        'hg',
        'http://dvcs.w3.org/hg/prov/raw-file/9628aaff6e20/model/releases/WD-prov-dm-20111215/'
    )

    g = ProvDocument()

    g.entity(hg['Overview.html'], {'prov:type': "file in hg"})
    g.entity(w3['WD-prov-dm-20111215'], {'prov:type': "html4"})

    g.activity(ex['rcp'], None, None, {'prov:type': "copy directory"})

    g.wasGeneratedBy('w3:WD-prov-dm-20111215', 'ex:rcp', identifier=rec['g'])

    g.entity(
        'ex:req3', {
            'prov:type': Identifier(
                "http://www.w3.org/2005/08/01-transitions.html#pubreq")
        })

    g.used('ex:rcp', 'hg:Overview.html', identifier='rec:u')
    g.used('ex:rcp', 'ex:req3')

    g.wasDerivedFrom('w3:WD-prov-dm-20111215', 'hg:Overview.html', 'ex:rcp',
                     'rec:g', 'rec:u')

    g.agent('ex:webmaster', {'prov:type': "Person"})

    g.wasAssociatedWith('ex:rcp', 'ex:webmaster')

    return g
from prov.dot import prov_to_dot

import db

logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)

output_path = Path("outputs")

# Provenance initialisation
# Namespaces
ns_type_uri = "https://openprovenance.org/ns/mimic#"
ns_data_uri = "https://mimic.physionet.org/mimiciii/1.4"

# Types
ns_type = Namespace("mimic", ns_type_uri)
ns_attrs = Namespace("attrs", ns_type_uri + "attr_")

# Data
gen_data_ns = lambda name: Namespace(name, f"{ns_data_uri}/{name}/")
ns_patients = gen_data_ns("patients")
ns_unit = gen_data_ns("units")
ns_staff = gen_data_ns("staff")
ns_process = gen_data_ns("process")
ns_admissions = gen_data_ns("admissions")
ns_stay = gen_data_ns("stay")
ns_procedures = gen_data_ns("procedures")

all_namespaces = [
    ns_type,
    ns_attrs,
def toW3Cprov(ling, bundl, format='w3c-prov-xml'):
    g = ProvDocument()
    vc = Namespace(
        "knmi", "http://knmi.nl"
    )  # namespaces do not need to be explicitly added to a document
    con = Namespace("dfp", "http://dispel4py.org")
    g.add_namespace("dcterms", "http://purl.org/dc/terms/")

    # specify bundle
    bundle = None
    for trace in bundl:
        # specifying user
        ag = g.agent(
            vc[trace["username"]],
            other_attributes={"dcterms:author": trace["username"]}
        )  # first time the ex namespace was used, it is added to the document automatically

        if trace['type'] == 'workflow_run':
            trace.update({'runId': trace['_id']})
            bundle = g.bundle(vc[trace["runId"]])
            bundle.actedOnBehalfOf(vc[trace["runId"]], vc[trace["username"]])

            dic = {}
            i = 0
            for key in trace:
                if key != "input":
                    if ':' in key:
                        dic.update({key: trace[key]})
                    else:
                        dic.update({vc[key]: trace[key]})
            dic.update({'prov:type': PROV['Bundle']})
            g.entity(vc[trace["runId"]], dic)

            dic = {}
            i = 0
            if type(trace['input']) != list:
                trace['input'] = [trace['input']]
            for y in trace['input']:
                for key in y:
                    if ':' in key:
                        dic.update({key: y[key]})
                    else:
                        dic.update({vc[key]: y[key]})
                dic.update({'prov:type': 'workflow_input'})
                bundle.entity(vc[trace["_id"] + "_" + str(i)], dic)
                bundle.used(vc[trace["_id"]],
                            vc[trace["_id"] + "_" + str(i)],
                            identifier=vc["used_" + trace["_id"] + "_" + str(i)])
                i = i + 1

    # specify lineage
    for trace in ling:
        # pprint(trace)
        try:
            bundle = g.bundle(vc[trace["runId"]])
            bundle.wasAttributedTo(vc[trace["runId"]],
                                   vc["ag_" + trace["username"]],
                                   identifier=vc["attr_" + trace["runId"]])
        except Exception:
            pass

        # specifying creator of the activity (to be collected from the registry)
        if 'creator' in trace:
            bundle.agent(
                vc["ag_" + trace["creator"]],
                other_attributes={"dcterms:creator": trace["creator"]}
            )  # first time the ex namespace was used, it is added to the document automatically
            bundle.wasAssociatedWith('process_' + trace["iterationId"],
                                     vc["ag_" + trace["creator"]])
            bundle.wasAttributedTo(vc[trace["runId"]],
                                   vc["ag_" + trace["creator"]])

        # adding activity information for lineage
        dic = {}
        for key in trace:
            if type(trace[key]) != list:
                if ':' in key:
                    dic.update({key: trace[key]})
                else:
                    if key == 'location':
                        dic.update({"prov:location": trace[key]})
                    else:
                        dic.update({vc[key]: trace[key]})
        # dict.update() returns None, so update first and then pass the dict
        # (the original passed the update() call itself as other_attributes)
        dic.update({'prov:type': trace["name"]})
        bundle.activity(vc["process_" + trace["iterationId"]],
                        trace["startTime"], trace["endTime"], dic)

        # adding parameters to the document as input entities
        dic = {}
        for x in trace["parameters"]:
            # print(x)
            if ':' in x["key"]:
                dic.update({x["key"]: x["val"]})
            else:
                dic.update({vc[x["key"]]: x["val"]})
        dic.update({'prov:type': 'parameters'})
        bundle.entity(vc["parameters_" + trace["instanceId"]], dic)
        bundle.used(vc['process_' + trace["iterationId"]],
                    vc["parameters_" + trace["instanceId"]],
                    identifier=vc["used_" + trace["iterationId"]])

        # adding input dependencies to the document as input entities
        dic = {}
        for x in trace["derivationIds"]:
            # state could be added
            # dic.update({'prov:type': 'parameters'})
            bundle.used(vc['process_' + trace["iterationId"]],
                        vc[x["DerivedFromDatasetID"]],
                        identifier=vc["used_" + x["DerivedFromDatasetID"]])

        # adding entities to the document as output metadata
        for x in trace["streams"]:
            i = 0
            parent_dic = {}
            for key in x:
                if key == 'con:immediateAccess':
                    parent_dic.update({vc['immediateAccess']: x[key]})
                elif key == 'location':
                    parent_dic.update({"prov:location": str(x[key])})
                else:
                    parent_dic.update({vc[key]: str(x[key])})

            c1 = bundle.collection(vc[x["id"]], other_attributes=parent_dic)
            bundle.wasGeneratedBy(vc[x["id"]],
                                  vc["process_" + trace["iterationId"]],
                                  identifier=vc["wgb_" + x["id"]])

            for d in trace['derivationIds']:
                bundle.wasDerivedFrom(vc[x["id"]],
                                      vc[d['DerivedFromDatasetID']],
                                      identifier=vc["wdf_" + x["id"]])

            for y in x["content"]:
                dic = {}
                if isinstance(y, dict):
                    val = None
                    for key in y:
                        try:
                            val = num(y[key])  # num() is defined elsewhere in the module
                        except Exception as e:  # Python 3 syntax (was "except Exception, e:")
                            val = str(y[key])
                        if ':' in key:
                            dic.update({key: val})
                        else:
                            dic.update({vc[key]: val})
                else:
                    dic = {vc['text']: y}

                dic.update({"verce:parent_entity": vc["data_" + x["id"]]})
                print(x["id"])
                print(str(i))
                print(dic)

                e1 = bundle.entity(vc["data_" + x["id"] + "_" + str(i)], dic)
                bundle.hadMember(c1, e1)
                bundle.wasGeneratedBy(vc["data_" + x["id"] + "_" + str(i)],
                                      vc["process_" + trace["iterationId"]],
                                      identifier=vc["wgb_" + x["id"] + "_" + str(i)])
                for d in trace['derivationIds']:
                    bundle.wasDerivedFrom(
                        vc["data_" + x["id"] + "_" + str(i)],
                        vc[d['DerivedFromDatasetID']],
                        identifier=vc["wdf_" + "data_" + x["id"] + "_" + str(i)])
                i = i + 1