def dump_as_rdf(g: Dataset, table_name: str) -> bool:
    """
    Dump the contents of Graph g in RDF turtle

    :param g: Dataset to dump
    :param table_name: name of the base table
    :return: success indicator
    """

    # Propagate the mapped concepts up the tree
    def add_to_ancestors(s: URIRef, vm: URIRef):
        g.add((s, ISO['enumeratedConceptualDomain.hasMember'], vm))
        for parent in g.objects(s, SKOS.broader):
            add_to_ancestors(parent, vm)

    if COMPUTE_MEMBERS and EXPLICIT_MEMBERS:
        for subj, obj in g.subject_objects(SKOS.exactMatch):
            add_to_ancestors(subj, obj)

        # TODO: this gives us a list of all concepts in the scheme... useful?
        for scheme, tc in g.subject_objects(SKOS.hasTopConcept):
            for member in g.objects(tc, ISO['enumeratedConceptualDomain.hasMember']):
                g.add((scheme, ISO['enumeratedConceptualDomain.hasMember'], member))

    for name, ns in namespaces.items():
        g.bind(name.lower(), ns)

    outfile = os.path.join(DATA_DIR, table_name + '.ttl')
    print(f"Saving output to {outfile}")
    g.serialize(outfile, format='turtle')
    print(f"{len(g)} triples written")
    return True
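# A minimal usage sketch for dump_as_rdf, assuming it runs in the same module.
# DATA_DIR, namespaces, ISO and the two flags are module globals in the original
# source; the values below are illustrative stand-ins.
import os
from rdflib import Dataset, Namespace, URIRef
from rdflib.namespace import SKOS

DATA_DIR = '/tmp'                                 # assumed output directory
ISO = Namespace('http://example.org/iso11179#')   # stand-in for the real ISO namespace
namespaces = {'iso': ISO, 'skos': SKOS}
COMPUTE_MEMBERS = EXPLICIT_MEMBERS = True

g = Dataset()
g.add((URIRef('http://example.org/concept/1'), SKOS.exactMatch,
       URIRef('http://example.org/value/1')))
dump_as_rdf(g, 'example_table')                   # writes /tmp/example_table.ttl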
def createNanopubs(g):
    ds = Dataset()
    ds.namespace_manager.bind("ddi", "http://purl.org/net/nlprepository/spl-ddi-annotation-poc#")
    ds.namespace_manager.bind("prov", "http://www.w3.org/ns/prov#")
    ds.namespace_manager.bind("np", "http://www.nanopub.org/nschema#")

    bindings = g.query(interactSelect)
    for b in bindings:
        npURI = URIRef(b['inter'] + "-nanopub")
        headURI = URIRef(b['inter'] + "-head")
        aURI = URIRef(b['inter'] + "-assertion")
        pubInfoURI = URIRef(b['inter'] + "-pubInfo")
        provURI = URIRef(b['inter'] + "-provenance")

        head = ds.add_graph(headURI)
        head.add((npURI, RDF.type, np['Nanopublication']))
        head.add((aURI, RDF.type, np['Assertion']))
        head.add((provURI, RDF.type, np['Provenance']))
        head.add((pubInfoURI, RDF.type, np['PublicationInfo']))
        head.add((npURI, np['hasAssertion'], aURI))
        head.add((npURI, np['hasProvenance'], provURI))
        head.add((npURI, np['hasPublicationInfo'], pubInfoURI))
        # print(head.serialize())

        a = ds.add_graph(aURI)
        a.add((b['s'], URIRef('http://dbmi-icode-01.dbmi.pitt.edu/dikb/vocab/interactsWith'), b['o']))
        a.add((b['s'], RDF.type, sio["SIO_010038"]))
        a.add((b['o'], RDF.type, sio["SIO_010038"]))

        prov = ds.add_graph(provURI)
        prov.add((aURI, w3prov['wasDerivedFrom'], b['inter']))

    print(ds.serialize(format='trig'))
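# The head-graph pattern above in isolation: a self-contained sketch of a
# nanopublication head built with an rdflib Dataset (all example.org URIs and
# graph names are placeholders).
from rdflib import Dataset, Namespace, URIRef
from rdflib.namespace import RDF

np = Namespace("http://www.nanopub.org/nschema#")

ds = Dataset()
npURI = URIRef("http://example.org/nanopub-1")
head = ds.graph(URIRef("http://example.org/nanopub-1-head"))
head.add((npURI, RDF.type, np['Nanopublication']))
head.add((npURI, np['hasAssertion'], URIRef("http://example.org/nanopub-1-assertion")))
head.add((npURI, np['hasProvenance'], URIRef("http://example.org/nanopub-1-provenance")))
head.add((npURI, np['hasPublicationInfo'], URIRef("http://example.org/nanopub-1-pubInfo")))
print(ds.serialize(format='trig'))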
def test_roundtrip():
    d = Dataset()
    d.parse(Path(__file__).parent / "test_parser_hext_multigraph.ndjson",
            format="hext", publicID=d.default_context.identifier)
    d.default_union = True
    with open(str(Path(__file__).parent / "test_parser_hext_multigraph.ndjson")) as i:
        ordered_input = "".join(sorted(i.readlines())).strip()

    ordered_output = "\n".join(sorted(d.serialize(format="hext").split("\n"))).strip()

    assert ordered_output == ordered_input
def test_hext_dataset_linecount():
    d = Dataset()
    assert len(d) == 0
    d.parse(Path(__file__).parent / "test_parser_hext_multigraph.ndjson",
            format="hext", publicID=d.default_context.identifier)
    total_triples = 0
    # count all the triples in the Dataset
    for context in d.contexts():
        for triple in context.triples((None, None, None)):
            total_triples += 1
    assert total_triples == 18

    # count the number of serialized Hextuples, should be 22, as per the original file
    lc = len(d.serialize(format="hext").splitlines())
    assert lc == 22
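# The same 18-triple count can be taken over quads, since Dataset.quads
# iterates every context; a short sketch against the same test file:
from pathlib import Path
from rdflib import Dataset

d = Dataset()
d.parse(Path(__file__).parent / "test_parser_hext_multigraph.ndjson",
        format="hext", publicID=d.default_context.identifier)
assert sum(1 for _ in d.quads((None, None, None, None))) == 18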
def test_hext_json_representation():
    """Tests to see if every line in the ND-JSON Hextuple result is, in fact, JSON"""
    d = Dataset()
    trig_data = """
        PREFIX ex: <http://example.com/>
        PREFIX owl: <http://www.w3.org/2002/07/owl#>
        PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
        PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>

        ex:g1 {
            ex:s1
                ex:p1 ex:o1 , ex:o2 ;
                ex:p2 [
                    a owl:Thing ;
                    rdf:value "thingy" ;
                ] ;
                ex:p3 "Object 3" , "Object 4 - English"@en ;
                ex:p4 "2021-12-03"^^xsd:date ;
                ex:p5 42 ;
                ex:p6 "42" ;
            .
        }

        ex:g2 {
            ex:s1
                ex:p1 ex:o1 , ex:o2 ;
            .

            ex:s11 ex:p11 ex:o11 , ex:o12 .
        }

        # default graph triples
        ex:s1 ex:p1 ex:o1 , ex:o2 .
        ex:s21 ex:p21 ex:o21 , ex:o22 .
        """
    d.parse(data=trig_data, format="trig")
    out = d.serialize(format="hext")
    for line in out.splitlines():
        j = json.loads(line)
        assert isinstance(j, list)
class Fragment(object):
    HYDRA = Namespace("http://www.w3.org/ns/hydra/core#")
    VOID = Namespace("http://rdfs.org/ns/void#")
    FOAF = Namespace("http://xmlns.com/foaf/0.1/")
    DCTERMS = Namespace("http://purl.org/dc/terms/")

    def __init__(self):
        self.rdf_graph = Dataset()

    def add_data_triple(self, subject, predicate, obj):
        self.rdf_graph.add((subject, predicate, obj))

    def add_graph(self, identifier):
        self.rdf_graph.graph(identifier)

    def add_meta_quad(self, graph, subject, predicate, obj):
        # rdflib quads are (s, p, o, g); the graph identifier goes last
        self.rdf_graph.add((subject, predicate, obj, graph))

    def add_prefix(self, prefix, uri):
        self.rdf_graph.bind(prefix, uri)

    def serialize(self):
        return self.rdf_graph.serialize(format="trig", encoding="utf-8")
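# A short usage sketch for Fragment; every URI below is illustrative only.
from rdflib import Literal, URIRef

fragment = Fragment()
fragment.add_prefix('hydra', Fragment.HYDRA)
meta = URIRef('http://example.org/fragment#metadata')
fragment.add_graph(meta)
fragment.add_data_triple(URIRef('http://example.org/s'),
                         URIRef('http://example.org/p'),
                         Literal('o'))
# the wrapper takes the graph first; internally it stores (s, p, o, graph)
fragment.add_meta_quad(meta, URIRef('http://example.org/fragment'),
                       Fragment.HYDRA.totalItems, Literal(1))
print(fragment.serialize().decode('utf-8'))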
""" @prefix ex: <http://example.com/> . ex:graph-1 { ex:subject-x ex:predicate-x "Triple X" . ex:subject-z ex:predicate-z "Triple Z" . } ex:graph-2 { ex:subject-y ex:predicate-y "Triple Y" . } """ print("Printing Serialised Dataset:") print("---") print(d.serialize(format="trig")) print("---") print() print() # # Use & Query # # print the length of the Dataset, i.e. the count of all triples in all Graphs # we should get """ 3 """ print("Printing Dataset Length:") print("---")
def visit_sparql(url, format='html', depth=1):
    sparqls = get_sparql_endpoints(url)
    predicates = get_predicates(sparqls, url)

    if format == 'html':
        limit_fraction = QUERY_RESULTS_LIMIT / 3
        if len(predicates) > 1:
            predicate_query_limit_fraction = (limit_fraction * 2) / len(predicates)
        else:
            predicate_query_limit_fraction = limit_fraction * 2

        results = []

        def predicate_specific_sparql(sparql, query):
            log.debug(query)
            sparql.setQuery(query)
            res = sparql.query().convert()
            results.extend(list(res["results"]["bindings"]))

        threads = []
        local_results = []
        for p in predicates:
            q = u"""SELECT DISTINCT ?s ?p ?o ?g WHERE {{
                {{
                    GRAPH ?g {{
                        {{ <{url}> <{predicate}> ?o . BIND(<{url}> as ?s) BIND(<{predicate}> as ?p) }}
                        UNION
                        {{ ?s <{predicate}> <{url}>. BIND(<{url}> as ?o) BIND(<{predicate}> as ?p) }}
                    }}
                }} UNION {{
                    {{ <{url}> <{predicate}> ?o . BIND(<{url}> as ?s) BIND(<{predicate}> as ?p) }}
                    UNION
                    {{ ?s <{predicate}> <{url}>. BIND(<{url}> as ?o) BIND(<{predicate}> as ?p) }}
                }}
            }} LIMIT {limit}""".format(url=url, predicate=p,
                                       limit=predicate_query_limit_fraction)

            for s in sparqls:
                # Start processes for each endpoint, for each predicate query
                process = Thread(target=predicate_specific_sparql, args=[s, q])
                process.start()
                threads.append(process)

        url_is_predicate_query = u"""SELECT DISTINCT ?s ?p ?o ?g WHERE {{
            {{
                GRAPH ?g {{ ?s <{url}> ?o. BIND(<{url}> as ?p) }}
            }} UNION {{
                ?s <{url}> ?o. BIND(<{url}> as ?p)
            }}
        }} LIMIT {limit}""".format(url=url, limit=limit_fraction)

        for s in sparqls:
            process = Thread(target=predicate_specific_sparql,
                             args=[s, url_is_predicate_query])
            process.start()
            threads.append(process)

        # We now pause execution on the main thread by 'joining' all of our started threads.
        # This ensures that each has finished processing the urls.
        for process in threads:
            process.join()

        if LDF_STATEMENTS_URL is not None:
            retrieve_ldf_results(url)

        # We also add local results (result of dereferencing)
        local_results = list(visit_local(url, format))
        results.extend(local_results)

        # If a Druid statements URL is specified, we'll try to retrieve it as well
        if DRUID_STATEMENTS_URL is not None:
            results.extend(visit_druid(url, format))

        if depth > 1:
            # If depth is larger than 1, we proceed to extend the results with the results of
            # visiting all object resources for every triple in the resultset.
            newresults = []

            objects = set([
                r['o']['value'] for r in results
                if r['o']['value'] != url and r['o']['type'] == 'uri'
            ])
            for o in objects:
                newresults.extend(visit(o, format=format, depth=depth - 1))

            results.extend(newresults)
    else:
        q = u"""
        CONSTRUCT {{
            ?s ?p ?o .
        }} WHERE {{
            {{
                GRAPH ?g {{
                    {{ <{url}> ?p ?o . BIND(<{url}> as ?s) }}
                    UNION
                    {{ ?s ?p <{url}>. BIND(<{url}> as ?o) }}
                    UNION
                    {{ ?s <{url}> ?o. BIND(<{url}> as ?p) }}
                }}
            }} UNION {{
                {{ <{url}> ?p ?o . BIND(<{url}> as ?s) }}
                UNION
                {{ ?s ?p <{url}>. BIND(<{url}> as ?o) }}
                UNION
                {{ ?s <{url}> ?o. BIND(<{url}> as ?p) }}
            }}
        }} LIMIT {limit}""".format(url=url, limit=QUERY_RESULTS_LIMIT)

        result_dataset = Dataset()
        for s in sparqls:
            s.setQuery(q)
            s.setReturnFormat(XML)
            result_dataset += s.query().convert()

        if format == 'jsonld':
            results = result_dataset.serialize(format='json-ld')
        elif format == 'rdfxml':
            s.setReturnFormat(XML)
            results = result_dataset.serialize(format='pretty-xml')
        elif format == 'turtle':
            s.setReturnFormat(XML)
            results = result_dataset.serialize(format='turtle')
        else:
            results = 'Nothing'

    log.debug("Received results")
    return results
def main(source, target, geometryfile='data/point2wkt.json'):
    with open(source) as infile:
        data = json.load(infile)

    with open(geometryfile) as infile:
        point2wkt = json.load(infile)

    ds = Dataset()
    dataset = lp.term('')

    g = rdfSubject.db = ds.graph(identifier=lp)

    ### Custom triples / Ontology
    g.add((lpOnt.Adres, OWL.equivalentClass, schema.PostalAddress))
    g.add((lpOnt.Straat, OWL.equivalentClass, hg.Street))
    g.add((lpOnt.Buurt, OWL.equivalentClass, hg.Neighbourhood))

    g.add((lpOnt.adres, OWL.equivalentProperty, schema.address))

    ########
    # Data #
    ########

    adres2locatie = defaultdict(lambda: defaultdict(list))

    for n, adresLabel in enumerate(data, 1):
        if n % 5000 == 0:
            print(f"{n}/{len(data)}", end='\r')
            # break

        # # geometry
        # wkt = point2wkt.get(locatiepunt)
        # wktLiteral = Literal(wkt, datatype=geo.wktLiteral)
        # geometry = Geometry(lpGeo.term(str(locatiepunt)),
        #                     asWKT=wktLiteral,
        #                     label=[str(locatiepunt)])

        addresses = getAdres(data[adresLabel], adresLabel, point2wkt)

        # adres2locatie[adres][year].append(geometry)

        # observations.append(locpdetail)
        # locp.observation = observations

        # addresses.append(
        #     Role(
        #         None,
        #         label=address.label,
        #         address=address,
        #         hasLatestBeginTimeStamp=locpdetail.hasLatestBeginTimeStamp,
        #         hasEarliestEndTimeStamp=locpdetail.hasEarliestEndTimeStamp,
        #         startDate=Literal(year, datatype=XSD.gYear)))

    ds.bind('create', create)
    ds.bind('schema', schema)
    ds.bind('sem', sem)
    ds.bind('geo', geo)
    ds.bind('juso', juso)
    ds.bind('qb', qb)
    ds.bind('void', void)

    print("Serializing!")
    ds.serialize(target, format='trig')
if row['type'] != "Huisartsenposten":
    dataset.add((newClass, RDFS['label'],
                 Literal(row['type_en'], lang="en")))  # , datatype=XSD['string'])))
else:
    dataset.add((newClass, RDFS['label'],
                 Literal(row['type_en'] + " - Out of office hours",
                         lang="en")))  # , datatype=XSD['string'])))

if short[i] == "opvo":
    substrkg = ["ezond", "pvoed", "OKT"]
    for substr in substrkg:
        if substr in row['titel']:
            dataset.add((thing, VOCAB['providesInformationAbout'],
                         VOCAB['childDevelopment']))
    substrkg = ["peel", "pel"]
    # note: `substr` below still holds the last value of the loop above;
    # a second loop over the new `substrkg` list was probably intended
    if substr in row['titel']:
        dataset.add((thing, VOCAB['providesExercisesFor'], VOCAB['children']))
    # dataset.add((thing, RDF['type'], VOCAB['childDevelopmentCenter']))

if short[i] == "lhbt":
    substr = ["COC", "Hiv", "Coaching", "Trans", "seksuele identiteit"]
    for subs in substr:
        if subs in row['titel']:
            dataset.add((thing, VOCAB['providesCoachingAbout'],
                         VOCAB['lhbtIssues']))
            dataset.add((thing, VOCAB['providesInformationAbout'],
                         VOCAB['lhbtIssues']))

# Messy dataset...
with open('outputTTL/' + short[i] + '-rdf.ttl', 'wb') as f:
    dataset.serialize(f, format='turtle')
def from_csvw(metadata_filepath):
    pmd_metadata = Dataset()

    with open(metadata_filepath) as file:
        json_string = file.read()

    g = Graph().parse(data=json_string, format='json-ld')

    # Get all datacubes.
    datacubes = g.query("""
        PREFIX dcat: <http://www.w3.org/ns/dcat#>
        SELECT * WHERE {
            ?dataset a dcat:Dataset .
        }
    """)

    for datacube in datacubes:
        # Try and find a sensible id for each dcat:Dataset specified in the
        # metadata file to derive additional URIs for PMD resources
        datacube_uri = datacube[0]
        datacube_id = urlparse(datacube_uri).path.rsplit('/', 1)[-1]

        # Create sensible URIs for PMD specific resources
        catalog_uri = "http://gss-data.org.uk/catalog/datasets"
        graph_uri = f"http://gss-data.org.uk/graph/{datacube_id}"
        metadata_graph_uri = f"http://gss-data.org.uk/graph/{datacube_id}#metadata"
        catalog_record_uri = f"http://gss-data.org.uk/catalog/{datacube_id}"
        dataset_uri = f"http://gss-data.org.uk/data/{datacube_id}"

        metadata = Graph('IOMemory', URIRef(metadata_graph_uri))
        metadata.bind('dcat', DCAT)
        metadata.bind('dct', DCTERMS)
        metadata.bind('foaf', FOAF)
        metadata.bind('qb', QB)
        metadata.bind('pmdcat', PMDCAT)
        metadata.bind('rdf', RDF)
        metadata.bind('rdfs', RDFS)
        metadata.bind('vcard', VCARD)

        graph = URIRef(graph_uri)
        metadata_graph = URIRef(metadata_graph_uri)
        catalog = URIRef(catalog_uri)
        catalog_record = URIRef(catalog_record_uri)
        dataset = URIRef(dataset_uri)
        datacube = URIRef(datacube_uri)

        triples = [
            # Metadata required by PMD: ------------------------------------
            (catalog, RDF.type, DCAT.Catalog),
            (catalog, DCAT.record, catalog_record),
            (catalog_record, RDF.type, DCAT.CatalogRecord),
            (catalog_record, FOAF.primaryTopic, dataset),
            (catalog_record, PMDCAT.metadataGraph, metadata_graph),
            (dataset, RDF.type, PMDCAT.Dataset),
            (dataset, PMDCAT.datasetContents, datacube),
            (dataset, PMDCAT.graph, graph),
            (datacube, RDF.type, PMDCAT.DataCube)
        ]

        # Get metadata attached to a datacube-like object and assign it
        # to the dcat:Dataset catalog entry.
        user_defined_metadata = g.query("""
            PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
            PREFIX csvw: <http://www.w3.org/ns/csvw#>
            PREFIX qb: <http://purl.org/linked-data/cube#>
            PREFIX vcard: <http://www.w3.org/2006/vcard/ns#>
            SELECT ?dataset ?p ?o WHERE {
                {
                    ?datacube ?p ?o .
                    FILTER (?p NOT IN (
                        rdf:type, qb:structure, csvw:tableSchema, csvw:url
                    )) .
                }
            }
        """, initBindings={"dataset": dataset, "datacube": datacube})

        contact_metadata = g.query("""
            PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
            PREFIX vcard: <http://www.w3.org/2006/vcard/ns#>
            SELECT ?contact ?p ?o WHERE {
                {
                    ?datacube ?p0 ?contact .
                    ?contact a vcard:Individual .
                    ?contact ?p ?o .
                }
            }
        """, initBindings={"datacube": datacube})

        triples.extend(list(user_defined_metadata))
        triples.extend(list(contact_metadata))

        for triple in triples:
            if triple[2] is not None:
                metadata.add(triple)

        pmd_metadata.add_graph(metadata)

    pmd_metadata.serialize(
        metadata_filepath.replace(".csv-metadata.json", ".trig"), format="trig")
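# Hypothetical invocation (the path is illustrative): reads the CSVW metadata,
# builds the PMD catalog graph, and writes a sibling .trig file.
from_csvw('data/example.csv-metadata.json')  # writes data/example.trig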
def createNanopubs(g):
    ds = Dataset()
    ds.namespace_manager.bind("ddi", "http://dbmi-icode-01.dbmi.pitt.edu/mp/")
    ds.namespace_manager.bind("np", "http://www.nanopub.org/nschema#")
    ds.namespace_manager.bind("rdf", "http://www.w3.org/1999/02/22-rdf-syntax-ns#")
    ds.namespace_manager.bind("rdfs", "http://www.w3.org/2000/01/rdf-schema#")
    ds.namespace_manager.bind("owl", "http://www.w3.org/2002/07/owl#")
    ds.namespace_manager.bind("obo", "http://purl.obolibrary.org/obo/")
    ds.namespace_manager.bind("oboInOwl", "http://www.geneontology.org/formats/oboInOwl#")
    ds.namespace_manager.bind("xsd", "http://www.w3.org/2001/XMLSchema#")
    ds.namespace_manager.bind("dc", "http://purl.org/dc/elements/1.1/")
    ds.namespace_manager.bind("mp", "http://purl.org/mp/")
    ds.namespace_manager.bind("prov", "http://www.w3.org/ns/prov#")
    ds.namespace_manager.bind("dikbEvidence", "http://dbmi-icode-01.dbmi.pitt.edu/dikb-evidence/DIKB_evidence_ontology_v1.3.owl#")

    bindings = g.query(interactSelect)
    for b in bindings:
        asIndex = str(b['a']).rfind('-')
        identifier = str(b['a'])[asIndex:]
        predicateType = str(b['t'])

        # format the identifier into the string *before* wrapping it in a URIRef
        npURI = URIRef('http://dbmi-icode-01.dbmi.pitt.edu/mp/ddi-spl-annotation-nanopub%s' % identifier)
        headURI = URIRef('http://dbmi-icode-01.dbmi.pitt.edu/mp/ddi-spl-annotation-np-head%s' % identifier)
        pubInfoURI = URIRef('http://dbmi-icode-01.dbmi.pitt.edu/mp/ddi-spl-annotation-np-pubInfo%s' % identifier)
        provURI = URIRef('http://dbmi-icode-01.dbmi.pitt.edu/mp/ddi-spl-annotation-np-provenance%s' % identifier)
        aURI = URIRef('http://dbmi-icode-01.dbmi.pitt.edu/mp/ddi-spl-annotation-np-assertion%s' % identifier)

        ds.add((aURI, RDF.type, np.assertion))

        head = ds.add_graph(headURI)
        head.add((npURI, RDF.type, np['Nanopublication']))
        head.add((provURI, RDF.type, np['Provenance']))
        head.add((pubInfoURI, RDF.type, np['PublicationInfo']))
        head.add((npURI, np['hasAssertion'], aURI))
        head.add((npURI, np['hasProvenance'], provURI))
        head.add((npURI, np['hasPublicationInfo'], pubInfoURI))

        pub = ds.add_graph(pubInfoURI)
        pub.add((npURI, prov.wasAttributedTo, URIRef('http://orcid.org/0000-0002-2993-2085')))
        pub.add((npURI, prov.generatedAtTime, Literal(datetime.now())))

        if predicateType == "http://purl.obolibrary.org/obo/DIDEO_00000000":
            provenance = ds.add_graph(provURI)
            provenance.add((aURI, prov.wasAttributedTo, URIRef('http://orcid.org/0000-0002-2993-2085')))
            provenance.add((aURI, prov.generatedAtTime, Literal(datetime.now())))
            provenance.add((aURI, prov.wasDerivedFrom, Literal("Derived from the DIKB's evidence base using the listed belief criteria")))
            provenance.add((aURI, prov.hadMember, dikbEvidence.EV_PK_DDI_RCT))
            provenance.add((aURI, prov.hadMember, dikbEvidence.EV_PK_DDI_NR))
            provenance.add((aURI, prov.hadMember, dikbEvidence.EV_PK_DDI_Par_Grps))

        elif predicateType == "http://purl.obolibrary.org/obo/DIDEO_00000096":
            provenance = ds.add_graph(provURI)
            provenance.add((aURI, prov.wasAttributedTo, URIRef('http://orcid.org/0000-0002-2993-2085')))
            provenance.add((aURI, prov.generatedAtTime, Literal(datetime.now())))
            provenance.add((aURI, prov.wasDerivedFrom, Literal("Derived from the DIKB's evidence base using the listed belief criteria")))
            provenance.add((aURI, prov.hadMember, dikbEvidence.EV_PK_DDI_RCT))
            provenance.add((aURI, prov.hadMember, dikbEvidence.EV_PK_DDI_NR))
            provenance.add((aURI, prov.hadMember, dikbEvidence.EV_PK_DDI_Par_Grps))
            provenance.add((aURI, prov.hadMember, dikbEvidence.EV_CT_PK_Genotype))
            provenance.add((aURI, prov.hadMember, dikbEvidence.EV_CT_PK_Phenotype))

        elif predicateType == "http://purl.obolibrary.org/obo/RO_0002449":
            provenance = ds.add_graph(provURI)
            provenance.add((aURI, prov.wasAttributedTo, URIRef('http://orcid.org/0000-0002-2993-2085')))
            provenance.add((aURI, prov.generatedAtTime, Literal(datetime.now())))
            provenance.add((aURI, prov.wasDerivedFrom, Literal("Derived from the DIKB's evidence base using the listed belief criteria")))
            provenance.add((aURI, prov.hadMember, dikbEvidence.EV_PK_DDI_RCT))
            provenance.add((aURI, prov.hadMember, dikbEvidence.EV_PK_DDI_NR))
            provenance.add((aURI, prov.hadMember, dikbEvidence.EV_PK_DDI_Par_Grps))

    print(ds.serialize(format='trig'))
            # print(ref)
            ref_tuple = ref.split(':')
            suppref_text = ref_tuple[0]
            suppref_doi = ref_tuple[1]
            suppref_subj = FOODHKG_INST[get_hash(suppref_text)]
            dataset = createSupportingRefTriples(
                dataset, supp_subj, suppref_subj, suppref_doi, ref)
    return dataset


if __name__ == '__main__':
    df = pd.read_excel('data/food-claims-kg.xlsx',
                       sheet_name='13. Authorised')
    dataset = Dataset()
    for index, row in df.iterrows():
        print('-', row['Supporting Evidence Reference 1'],
              '-', row['Status'], ':', row['Health relationship'])
        if row['Status'] == 'Finished':
            dataset = turn_into_mp(row, dataset)

    df = pd.read_excel('data/food-claims-kg.xlsx',
                       sheet_name='14. Authorised')
    for index, row in df.iterrows():
        if row['Finished?'] == 'Finished':
            dataset = turn_into_mp(row, dataset)

    add_umls_mappings()

    dataset.serialize('data/output/food_health_kg.ttl', format='turtle')
class BurstConverter(object):
    """The actual converter, that processes the chunk of lines from the CSV
    file, and uses the instructions from the ``schema`` graph to produce RDF."""

    def __init__(self, identifier, columns, schema, metadata_graph, encoding, output_format):
        self.ds = Dataset()
        # self.ds = apply_default_namespaces(Dataset())
        self.g = self.ds.graph(URIRef(identifier))

        self.columns = columns
        self.schema = schema
        self.metadata_graph = metadata_graph
        self.encoding = encoding
        self.output_format = output_format

        self.templates = {}

        self.aboutURLSchema = self.schema.csvw_aboutUrl

    def equal_to_null(self, nulls, row):
        """Determines whether a value in a cell matches a 'null' value as
        specified in the CSVW schema"""
        for n in nulls:
            n = Item(self.metadata_graph, n)
            col = str(n.csvw_name)
            val = str(n.csvw_null)
            if row[col] == val:
                logger.debug("Value of column {} ('{}') is equal to specified 'null' value: '{}'".format(
                    col, unicode(row[col]).encode('utf-8'), val))
                # There is a match with null value
                return True
        # There is no match with null value
        return False

    def process(self, count, rows, chunksize):
        """Process the rows fed to the converter. Count and chunksize are
        used to determine the current row number (needed for default
        observation identifiers)"""
        obs_count = count * chunksize

        # logger.info("Row: {}".format(obs_count)) #removed for readability

        # We iterate row by row, and then column by column, as given by the CSVW mapping file.
        mult_proc_counter = 0
        iter_error_counter = 0
        for row in rows:
            # This fixes issue:10
            if row is None:
                mult_proc_counter += 1
                # logger.debug( #removed for readability
                #     "Skipping empty row caused by multiprocessing (multiple of chunksize exceeds number of rows in file)...")
                continue

            # set the '_row' value in case we need to generate 'default' URIs for each observation ()
            # logger.debug("row: {}".format(obs_count)) #removed for readability
            row[u'_row'] = obs_count
            count += 1

            # The self.columns dictionary gives the mapping definition per column in the 'columns'
            # array of the CSVW tableSchema definition.
            for c in self.columns:
                c = Item(self.metadata_graph, c)
                # default about URL
                s = self.expandURL(self.aboutURLSchema, row)

                try:
                    # Can also be used to prevent the triggering of virtual
                    # columns!

                    # Get the raw value from the cell in the CSV file
                    value = row[unicode(c.csvw_name)]

                    # This checks whether we should continue parsing this cell, or skip it.
                    if self.isValueNull(value, c):
                        continue

                    # If the null values are specified in an array, we need to parse it as a collection (list)
                    elif isinstance(c.csvw_null, Item):
                        nulls = Collection(self.metadata_graph, BNode(c.csvw_null))

                        if self.equal_to_null(nulls, row):
                            # Continue to next column specification in this row, if the value is equal to (one of) the null values.
                            continue
                except:
                    # No column name specified (virtual) because there clearly was no c.csvw_name key in the row.
                    # logger.debug(traceback.format_exc()) #removed for readability
                    iter_error_counter += 1
                    if isinstance(c.csvw_null, Item):
                        nulls = Collection(self.metadata_graph, BNode(c.csvw_null))
                        if self.equal_to_null(nulls, row):
                            # Continue to next column specification in this row, if the value is equal to (one of) the null values.
                            continue

                try:
                    # This overrides the subject resource 's' that has been created earlier based on the
                    # schema wide aboutURLSchema specification.
                    if unicode(c.csvw_virtual) == u'true' and c.csvw_aboutUrl is not None:
                        s = self.expandURL(c.csvw_aboutUrl, row)

                    if c.csvw_valueUrl is not None:
                        # This is an object property, because the value needs to be cast to a URL
                        p = self.expandURL(c.csvw_propertyUrl, row)
                        o = self.expandURL(c.csvw_valueUrl, row)
                        if self.isValueNull(os.path.basename(unicode(o)), c):
                            logger.debug("skipping empty value")
                            continue

                        if unicode(c.csvw_virtual) == u'true' and c.csvw_datatype is not None and URIRef(c.csvw_datatype) == XSD.anyURI:
                            # Special case: this is a virtual column with object values that are URIs
                            # For now using a test special property
                            value = row[unicode(c.csvw_name)].encode('utf-8')
                            o = URIRef(iribaker.to_iri(value))

                        if unicode(c.csvw_virtual) == u'true' and c.csvw_datatype is not None and URIRef(c.csvw_datatype) == XSD.linkURI:
                            about_url = str(c.csvw_aboutUrl)
                            about_url = about_url[about_url.find("{"):about_url.find("}") + 1]
                            s = self.expandURL(about_url, row)
                            # logger.debug("s: {}".format(s))
                            value_url = str(c.csvw_valueUrl)
                            value_url = value_url[value_url.find("{"):value_url.find("}") + 1]
                            o = self.expandURL(value_url, row)
                            # logger.debug("o: {}".format(o))

                        # For coded properties, the collectionUrl can be used to indicate that the
                        # value URL is a concept and a member of a SKOS Collection with that URL.
                        if c.csvw_collectionUrl is not None:
                            collection = self.expandURL(c.csvw_collectionUrl, row)
                            self.g.add((collection, RDF.type, SKOS['Collection']))
                            self.g.add((o, RDF.type, SKOS['Concept']))
                            self.g.add((collection, SKOS['member'], o))

                        # For coded properties, the schemeUrl can be used to indicate that the
                        # value URL is a concept and a member of a SKOS Scheme with that URL.
                        if c.csvw_schemeUrl is not None:
                            scheme = self.expandURL(c.csvw_schemeUrl, row)
                            self.g.add((scheme, RDF.type, SKOS['Scheme']))
                            self.g.add((o, RDF.type, SKOS['Concept']))
                            self.g.add((o, SKOS['inScheme'], scheme))
                    else:
                        # This is a datatype property
                        if c.csvw_value is not None:
                            value = self.render_pattern(unicode(c.csvw_value), row)
                        elif c.csvw_name is not None:
                            # print s
                            # print c.csvw_name, self.encoding
                            # print row[unicode(c.csvw_name)], type(row[unicode(c.csvw_name)])
                            # print row[unicode(c.csvw_name)].encode('utf-8')
                            # print '...'
                            value = row[unicode(c.csvw_name)].encode('utf-8')
                        else:
                            raise Exception("No 'name' or 'csvw:value' attribute found for this column specification")

                        # If propertyUrl is specified, use it, otherwise use
                        # the column name
                        if c.csvw_propertyUrl is not None:
                            p = self.expandURL(c.csvw_propertyUrl, row)
                        else:
                            if "" in self.metadata_graph.namespaces():
                                propertyUrl = self.metadata_graph.namespaces()[""][unicode(c.csvw_name)]
                            else:
                                propertyUrl = "{}{}".format(get_namespaces()['sdv'], unicode(c.csvw_name))

                            p = self.expandURL(propertyUrl, row)

                        if c.csvw_datatype is not None:
                            if URIRef(c.csvw_datatype) == XSD.anyURI:
                                # The xsd:anyURI datatype will be cast to a proper IRI resource.
                                o = URIRef(iribaker.to_iri(value))
                            elif URIRef(c.csvw_datatype) == XSD.string and c.csvw_lang is not None:
                                # If it is a string datatype that has a language, we turn it into a
                                # language tagged literal
                                # We also render the lang value in case it is a
                                # pattern.
                                o = Literal(value, lang=self.render_pattern(c.csvw_lang, row))
                            else:
                                o = Literal(value, datatype=c.csvw_datatype, normalize=False)
                        else:
                            # It's just a plain literal without datatype.
                            o = Literal(value)

                    # Add the triple to the assertion graph
                    self.g.add((s, p, o))

                    # Add provenance relating the propertyUrl to the column id
                    if '@id' in c:
                        self.g.add((p, PROV['wasDerivedFrom'], URIRef(c['@id'])))
                except:
                    # print row[0], value
                    traceback.print_exc()

            # We increment the observation (row number) with one
            obs_count += 1

        logger.debug(
            "{} row skips caused by multiprocessing (multiple of chunksize exceeds number of rows in file)...".format(mult_proc_counter))
        logger.debug(
            "{} errors encountered while trying to iterate over a NoneType...".format(iter_error_counter))
        logger.info("... done")
        return self.ds.serialize(format=self.output_format)

    # def serialize(self):
    #     trig_file_name = self.file_name + '.trig'
    #     logger.info("Starting serialization to {}".format(trig_file_name))
    #
    #     with open(trig_file_name, 'w') as f:
    #         self.np.serialize(f, format='trig')
    #     logger.info("... done")

    def render_pattern(self, pattern, row):
        """Takes a Jinja or Python formatted string, and applies it to the row value"""
        # Significant speedup by not re-instantiating Jinja templates for every
        # row.
        if pattern in self.templates:
            template = self.templates[pattern]
        else:
            template = self.templates[pattern] = Template(pattern)

        # TODO This should take into account the special CSVW instructions such as {_row}

        # First we interpret the url_pattern as a Jinja2 template, and pass all
        # column/value pairs as arguments
        rendered_template = template.render(**row)

        try:
            # We then format the resulting string using the standard Python2
            # expressions
            return rendered_template.format(**row)
        except:
            logger.warning(
                u"Could not apply python string formatting, probably due to mismatched curly brackets. IRI will be '{}'.".format(rendered_template))
            return rendered_template

    def expandURL(self, url_pattern, row, datatype=False):
        """Takes a Jinja or Python formatted string, applies it to the row
        values, and returns it as a URIRef"""
        url = self.render_pattern(unicode(url_pattern), row)

        # DEPRECATED
        # for ns, nsuri in namespaces.items():
        #     if url.startswith(ns):
        #         url = url.replace(ns + ':', nsuri)
        #         break

        try:
            iri = iribaker.to_iri(url)
            rfc3987.parse(iri, rule='IRI')
        except:
            raise Exception(u"Cannot convert `{}` to valid IRI".format(url))

        # print "Baked: ", iri
        return URIRef(iri)

    def isValueNull(self, value, c):
        """This checks whether we should continue parsing this cell, or skip it because it is empty or a null value."""
        try:
            if len(value) == 0 and unicode(c.csvw_parseOnEmpty) == u"true":
                print("Not skipping empty value")
                return False  # because it should not be skipped
            elif len(value) == 0 or value == unicode(c.csvw_null) or value in [unicode(n) for n in c.csvw_null] or value == unicode(self.schema.csvw_null):
                # Skip value if length is zero and equal to (one of) the null value(s)
                logger.debug(
                    "Length is 0 or value is equal to specified 'null' value")
                return True
        except:
            logger.debug("null does not exist or is not a list.")
        return False
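# expandURL leans on iribaker to coerce the rendered string into a valid IRI;
# a standalone illustration (the URL itself is made up):
import iribaker

print(iribaker.to_iri('http://example.org/some resource'))
# characters that are invalid in IRIs, such as the space above, are
# percent-encoded, e.g. http://example.org/some%20resource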
def main(search=None, cache=None, identifiers=[]):
    ns = Namespace("https://data.create.humanities.uva.nl/id/rkd/")

    ds = Dataset()
    ds.bind('rdfs', RDFS)
    ds.bind('schema', schema)
    ds.bind('sem', sem)
    ds.bind('bio', bio)
    ds.bind('foaf', foaf)
    ds.bind('void', void)
    ds.bind('skos', SKOS)
    ds.bind('owl', OWL)
    ds.bind('dc', dc)

    ds.bind('rkdArtist', URIRef("https://data.rkd.nl/artists/"))
    ds.bind('rkdThes', nsThesaurus)
    ds.bind('rkdPerson', nsPerson)
    ds.bind('rkdImage', URIRef("https://rkd.nl/explore/images/"))
    ds.bind('rkdThumb', URIRef("https://images.rkd.nl/rkd/thumb/650x650/"))

    ds.bind('aat', URIRef("http://vocab.getty.edu/aat/"))

    ## First the images
    g = rdfSubject.db = ds.graph(identifier=ns)

    # Load cache thesaurus
    if os.path.isfile('rkdthesaurus.json'):
        with open('rkdthesaurus.json') as infile:
            thesaurusDict = json.load(infile)
    else:
        thesaurusDict = dict()

    # Load cache images
    if os.path.isfile('imagecache.json'):
        with open('imagecache.json') as infile:
            imageCache = json.load(infile)
    else:
        imageCache = dict()

    # to fetch all identifiers from the search
    if search:
        thesaurusDict, imageCache = parseURL(search,
                                             thesaurusDict=thesaurusDict,
                                             imageCache=imageCache)
    elif cache:
        # assume that everything in the thesaurus is also cached
        for doc in cache.values():
            parseData(doc, thesaurusDict=thesaurusDict)
    elif identifiers:
        for i in identifiers:
            thesaurusDict, imageCache = parseURL(APIURL + str(i),
                                                 thesaurusDict=thesaurusDict,
                                                 imageCache=imageCache)

    # Any images without labels?
    # These were not included in the search, but fetch them anyway.
    print("Finding referred images that were not included")
    q = """
    PREFIX schema: <http://schema.org/>
    SELECT ?uri WHERE {
        ?role a schema:Role ; schema:isRelatedTo ?uri .

        FILTER NOT EXISTS { ?uri schema:name ?name }
    }
    """
    images = g.query(q)
    print(f"Found {len(images)}!")

    for i in images:
        identifier = str(i['uri']).replace('https://rkd.nl/explore/images/', '')
        thesaurusDict, imageCache = parseURL(
            "https://api.rkd.nl/api/record/images/" + str(identifier),
            thesaurusDict=thesaurusDict,
            imageCache=imageCache)

    ## Then the thesaurus
    print("Converting the thesaurus")
    rdfSubject.db = ds.graph(identifier=ns.term('thesaurus/'))

    ids = list(thesaurusDict.keys())
    for i in ids:
        _, thesaurusDict = getThesaurus(i, thesaurusDict, 'concept')

    # Save updated cache
    with open('rkdthesaurus.json', 'w') as outfile:
        json.dump(thesaurusDict, outfile)

    with open('imagecache.json', 'w') as outfile:
        json.dump(imageCache, outfile)

    ## Serialize
    print("Serializing!")
    ds.serialize('rkdportraits14751825.trig', format='trig')
def createNanopubs(g):
    ds = Dataset()
    ds.namespace_manager.bind("ddi", "http://dbmi-icode-01.dbmi.pitt.edu/mp/")
    ds.namespace_manager.bind("np", "http://www.nanopub.org/nschema#")
    ds.namespace_manager.bind("rdf", "http://www.w3.org/1999/02/22-rdf-syntax-ns#")
    ds.namespace_manager.bind("rdfs", "http://www.w3.org/2000/01/rdf-schema#")
    ds.namespace_manager.bind("owl", "http://www.w3.org/2002/07/owl#")
    ds.namespace_manager.bind("obo", "http://purl.obolibrary.org/obo/")
    ds.namespace_manager.bind("oboInOwl", "http://www.geneontology.org/formats/oboInOwl#")
    ds.namespace_manager.bind("xsd", "http://www.w3.org/2001/XMLSchema#")
    ds.namespace_manager.bind("dc", "http://purl.org/dc/elements/1.1/")
    ds.namespace_manager.bind("mp", "http://purl.org/mp/")

    assertionCount = 1
    enzymeCount = 1

    pddiD = dict([line.split(',', 1) for line in open('../../data/np-graphs/processed-dikb-ddis-for-nanopub.csv')])
    cL = dict([line.split('\t') for line in open('../../data/chebi_mapping.txt')])
    pL = dict([line.split('\t') for line in open('../../data/pro_mapping.txt')])
    substrateD = {}
    inhibitorD = {}

    bindings = g.query(interactSelect)
    for b in bindings:
        if str(b['c']) in pddiD:
            tempClaim = pddiD[str(b['c'])]
            claimInfo = tempClaim.split(',')
            claimSub = claimInfo[1]
            claimObj = claimInfo[2]
            predicateType = claimInfo[0].strip('\n')

            if predicateType == "increases_auc":
                # build the URI string first, then wrap it in a URIRef
                aURI = URIRef("http://dbmi-icode-01.dbmi.pitt.edu/mp/ddi-spl-annotation-np-assertion-%s" % assertionCount)
                assertionCount += 1

                bn1 = BNode('1')
                bn2 = BNode('2')
                bn3 = BNode('3')
                bn4 = BNode('4')
                bn5 = BNode('5')
                bn6 = BNode('6')
                bn7 = BNode('7')
                bn8 = BNode('8')
                bn9 = BNode('9')
                bn10 = BNode('10')

                assertionLabel = cL[claimSub.strip('\n')].strip('\n') + " - " + cL[claimObj.strip('\n')].strip('\n') + " potential drug-drug interaction"

                a = ds.add_graph(aURI)
                a.add((aURI, RDF.type, np.assertion))
                a.add((aURI, RDF.type, owl.Class))
                a.add((aURI, RDFS.label, Literal(assertionLabel.lower())))
                a.add((aURI, RDFS.subClassOf, URIRef("http://purl.obolibrary.org/obo/DIDEO_00000000")))

                a.add((bn1, RDF.type, owl.Restriction))
                a.add((bn1, owl.onProperty, URIRef("http://purl.obolibrary.org/obo/IAO_0000136")))
                a.add((bn2, RDF.type, owl.Class))
                a.add((bn3, RDF.first, URIRef("http://purl.obolibrary.org/obo/DIDEO_00000012")))
                a.add((bn5, RDF.first, bn4))
                a.add((bn3, RDF.rest, bn5))
                a.add((bn4, RDF.type, owl.Restriction))
                a.add((bn4, owl.onProperty, URIRef("http://purl.obolibrary.org/obo/BFO_0000052")))
                a.add((bn4, owl.hasValue, URIRef(claimSub.strip('\n'))))
                a.add((bn5, RDF.rest, RDF.nil))
                a.add((bn2, owl.intersectionOf, bn3))
                a.add((bn1, owl.someValuesFrom, bn2))
                a.add((aURI, RDFS.subClassOf, bn1))

                a.add((bn6, RDF.type, owl.Restriction))
                a.add((bn6, owl.onProperty, URIRef("http://purl.obolibrary.org/obo/IAO_0000136")))
                a.add((bn7, RDF.type, owl.Class))
                a.add((bn8, RDF.first, URIRef("http://purl.obolibrary.org/obo/DIDEO_00000013")))
                a.add((bn10, RDF.first, bn9))
                a.add((bn8, RDF.rest, bn10))
                a.add((bn9, RDF.type, owl.Restriction))
                a.add((bn9, owl.onProperty, URIRef("http://purl.obolibrary.org/obo/BFO_0000052")))
                a.add((bn9, owl.hasValue, URIRef(claimObj.strip('\n'))))
                a.add((bn10, RDF.rest, RDF.nil))
                a.add((bn7, owl.intersectionOf, bn8))
                a.add((bn6, owl.someValuesFrom, bn7))
                a.add((aURI, RDFS.subClassOf, bn6))

                ds.add((aURI, mp.formalizes, b['c']))
                ds.add((b['c'], mp.formalizedAs, aURI))

            elif predicateType == "substrate_of":
                aURI = URIRef("http://dbmi-icode-01.dbmi.pitt.edu/mp/ddi-spl-annotation-np-assertion-%s" % assertionCount)
                assertionCount += 1

                dLabel = cL[claimSub.strip('\n')].strip('\n')
                eLabel = pL[claimObj.strip('\n')].strip('\n')
                assertionLabel = dLabel + " substrate of " + eLabel

                a = ds.add_graph(aURI)
                ds.add((aURI, RDF.type, np.assertion))
                ds.add((aURI, RDFS.label, Literal(assertionLabel.lower())))
                ds.add((aURI, mp.formalizes, b['c']))
                ds.add((b['c'], mp.formalizedAs, aURI))

                a.add((URIRef(claimObj.strip('\n')), RDF.type, URIRef("http://purl.obolibrary.org/obo/OBI_0000427")))
                a.add((URIRef(claimObj.strip('\n')), RDFS.label, Literal(eLabel.lower())))
                a.add((URIRef(claimObj.strip('\n')), URIRef("http://purl.obolibrary.org/obo/DIDEO_00000096"), URIRef(claimSub.strip('\n'))))
                a.add((URIRef(claimSub.strip('\n')), RDF.type, URIRef("http://purl.obolibrary.org/obo/CHEBI_24431")))
                a.add((URIRef(claimSub.strip('\n')), RDFS.label, Literal(dLabel.lower())))

            elif predicateType == "inhibits":
                aURI = URIRef("http://dbmi-icode-01.dbmi.pitt.edu/mp/ddi-spl-annotation-np-assertion-%s" % assertionCount)
                assertionCount += 1

                dLabel = cL[claimSub.strip('\n')].strip('\n')
                eLabel = pL[claimObj.strip('\n')].strip('\n')
                assertionLabel = dLabel + " inhibits " + eLabel

                a = ds.add_graph(aURI)
                ds.add((aURI, RDF.type, np.assertion))
                ds.add((aURI, RDFS.label, Literal(assertionLabel.lower())))
                ds.add((aURI, mp.formalizes, b['c']))
                ds.add((b['c'], mp.formalizedAs, aURI))

                a.add((URIRef(claimSub.strip('\n')), RDF.type, URIRef("http://purl.obolibrary.org/obo/CHEBI_24431")))
                a.add((URIRef(claimSub.strip('\n')), RDFS.label, Literal(dLabel.lower())))
                a.add((URIRef(claimSub.strip('\n')), URIRef("http://purl.obolibrary.org/obo/RO_0002449"), URIRef(claimObj.strip('\n'))))

    print(ds.serialize(format='trig'))
class BurstConverter(object):
    """The actual converter, that processes the chunk of lines from the CSV
    file, and uses the instructions from the ``schema`` graph to produce RDF."""

    def __init__(self, identifier, columns, schema, metadata_graph, encoding, output_format):
        self.ds = Dataset()
        # self.ds = apply_default_namespaces(Dataset())
        self.g = self.ds.graph(URIRef(identifier))

        self.columns = columns
        self.schema = schema
        self.metadata_graph = metadata_graph
        self.encoding = encoding
        self.output_format = output_format

        self.templates = {}

        self.aboutURLSchema = self.schema.csvw_aboutUrl

    def equal_to_null(self, nulls, row):
        """Determines whether a value in a cell matches a 'null' value as
        specified in the CSVW schema"""
        for n in nulls:
            n = Item(self.metadata_graph, n)
            col = str(n.csvw_name)
            val = str(n.csvw_null)
            if row[col] == val:
                # logger.debug("Value of column {} ('{}') is equal to specified 'null' value: '{}'".format(col, unicode(row[col]).encode('utf-8'), val))
                # There is a match with null value
                return True
        # There is no match with null value
        return False

    def process(self, count, rows, chunksize):
        """Process the rows fed to the converter. Count and chunksize are
        used to determine the current row number (needed for default
        observation identifiers)"""
        obs_count = count * chunksize

        # logger.info("Row: {}".format(obs_count)) #removed for readability

        # We iterate row by row, and then column by column, as given by the CSVW mapping file.
        mult_proc_counter = 0
        iter_error_counter = 0
        for row in rows:
            # This fixes issue:10
            if row is None:
                mult_proc_counter += 1
                # logger.debug( #removed for readability
                #     "Skipping empty row caused by multiprocessing (multiple of chunksize exceeds number of rows in file)...")
                continue

            # set the '_row' value in case we need to generate 'default' URIs for each observation ()
            # logger.debug("row: {}".format(obs_count)) #removed for readability
            row[u'_row'] = obs_count
            count += 1

            # print(row)

            # The self.columns dictionary gives the mapping definition per column in the 'columns'
            # array of the CSVW tableSchema definition.
            for c in self.columns:
                c = Item(self.metadata_graph, c)
                # default about URL
                s = self.expandURL(self.aboutURLSchema, row)

                try:
                    # Can also be used to prevent the triggering of virtual
                    # columns!

                    # Get the raw value from the cell in the CSV file
                    try:
                        # Python 2
                        value = row[unicode(c.csvw_name)]
                    except NameError:
                        # Python 3
                        value = row[str(c.csvw_name)]

                    # This checks whether we should continue parsing this cell, or skip it.
                    if self.isValueNull(value, c):
                        continue

                    # If the null values are specified in an array, we need to parse it as a collection (list)
                    elif isinstance(c.csvw_null, Item):
                        nulls = Collection(self.metadata_graph, BNode(c.csvw_null))

                        if self.equal_to_null(nulls, row):
                            # Continue to next column specification in this row, if the value is equal to (one of) the null values.
                            continue
                except:
                    # No column name specified (virtual) because there clearly was no c.csvw_name key in the row.
                    # logger.debug(traceback.format_exc()) #removed for readability
                    iter_error_counter += 1
                    if isinstance(c.csvw_null, Item):
                        nulls = Collection(self.metadata_graph, BNode(c.csvw_null))
                        if self.equal_to_null(nulls, row):
                            # Continue to next column specification in this row, if the value is equal to (one of) the null values.
                            continue

                try:
                    # This overrides the subject resource 's' that has been created earlier based on the
                    # schema wide aboutURLSchema specification.
                    try:
                        csvw_virtual = unicode(c.csvw_virtual)
                        csvw_name = unicode(c.csvw_name)
                        csvw_value = unicode(c.csvw_value)
                        about_url = unicode(c.csvw_aboutUrl)
                        value_url = unicode(c.csvw_valueUrl)
                    except NameError:
                        csvw_virtual = str(c.csvw_virtual)
                        csvw_name = str(c.csvw_name)
                        csvw_value = str(c.csvw_value)
                        about_url = str(c.csvw_aboutUrl)
                        value_url = str(c.csvw_valueUrl)

                    if csvw_virtual == u'true' and c.csvw_aboutUrl is not None:
                        s = self.expandURL(c.csvw_aboutUrl, row)

                    if c.csvw_valueUrl is not None:
                        # This is an object property, because the value needs to be cast to a URL
                        p = self.expandURL(c.csvw_propertyUrl, row)
                        o = self.expandURL(c.csvw_valueUrl, row)
                        try:
                            if self.isValueNull(os.path.basename(unicode(o)), c):
                                logger.debug("skipping empty value")
                                continue
                        except NameError:
                            if self.isValueNull(os.path.basename(str(o)), c):
                                logger.debug("skipping empty value")
                                continue

                        if csvw_virtual == u'true' and c.csvw_datatype is not None and URIRef(c.csvw_datatype) == XSD.anyURI:
                            # Special case: this is a virtual column with object values that are URIs
                            # For now using a test special property
                            value = row[csvw_name].encode('utf-8')
                            o = URIRef(iribaker.to_iri(value))

                        if csvw_virtual == u'true' and c.csvw_datatype is not None and URIRef(c.csvw_datatype) == XSD.linkURI:
                            about_url = about_url[about_url.find("{"):about_url.find("}") + 1]
                            s = self.expandURL(about_url, row)
                            # logger.debug("s: {}".format(s))
                            value_url = value_url[value_url.find("{"):value_url.find("}") + 1]
                            o = self.expandURL(value_url, row)
                            # logger.debug("o: {}".format(o))

                        # For coded properties, the collectionUrl can be used to indicate that the
                        # value URL is a concept and a member of a SKOS Collection with that URL.
                        if c.csvw_collectionUrl is not None:
                            collection = self.expandURL(c.csvw_collectionUrl, row)
                            self.g.add((collection, RDF.type, SKOS['Collection']))
                            self.g.add((o, RDF.type, SKOS['Concept']))
                            self.g.add((collection, SKOS['member'], o))

                        # For coded properties, the schemeUrl can be used to indicate that the
                        # value URL is a concept and a member of a SKOS Scheme with that URL.
                        if c.csvw_schemeUrl is not None:
                            scheme = self.expandURL(c.csvw_schemeUrl, row)
                            self.g.add((scheme, RDF.type, SKOS['Scheme']))
                            self.g.add((o, RDF.type, SKOS['Concept']))
                            self.g.add((o, SKOS['inScheme'], scheme))
                    else:
                        # This is a datatype property
                        if c.csvw_value is not None:
                            value = self.render_pattern(csvw_value, row)
                        elif c.csvw_name is not None:
                            # print s
                            # print c.csvw_name, self.encoding
                            # print row[unicode(c.csvw_name)], type(row[unicode(c.csvw_name)])
                            # print row[unicode(c.csvw_name)].encode('utf-8')
                            # print '...'
                            value = row[csvw_name].encode('utf-8')
                        else:
                            raise Exception("No 'name' or 'csvw:value' attribute found for this column specification")

                        # If propertyUrl is specified, use it, otherwise use
                        # the column name
                        if c.csvw_propertyUrl is not None:
                            p = self.expandURL(c.csvw_propertyUrl, row)
                        else:
                            if "" in self.metadata_graph.namespaces():
                                propertyUrl = self.metadata_graph.namespaces()[""][csvw_name]
                            else:
                                propertyUrl = "{}{}".format(get_namespaces()['sdv'], csvw_name)

                            p = self.expandURL(propertyUrl, row)

                        if c.csvw_datatype is not None:
                            if URIRef(c.csvw_datatype) == XSD.anyURI:
                                # The xsd:anyURI datatype will be cast to a proper IRI resource.
                                o = URIRef(iribaker.to_iri(value))
                            elif URIRef(c.csvw_datatype) == XSD.string and c.csvw_lang is not None:
                                # If it is a string datatype that has a language, we turn it into a
                                # language tagged literal
                                # We also render the lang value in case it is a
                                # pattern.
                                o = Literal(value, lang=self.render_pattern(c.csvw_lang, row))
                            else:
                                try:
                                    csvw_datatype = unicode(c.csvw_datatype)
                                except NameError:
                                    csvw_datatype = str(c.csvw_datatype).split(')')[0].split('(')[-1]
                                # csvw_datatype = str(c.csvw_datatype)
                                # print(type(csvw_datatype))
                                # print(csvw_datatype)
                                o = Literal(value, datatype=csvw_datatype, normalize=False)
                        else:
                            # It's just a plain literal without datatype.
                            o = Literal(value)

                    # Add the triple to the assertion graph
                    self.g.add((s, p, o))

                    # Add provenance relating the propertyUrl to the column id
                    if '@id' in c:
                        self.g.add((p, PROV['wasDerivedFrom'], URIRef(c['@id'])))
                except:
                    # print row[0], value
                    traceback.print_exc()

            # We increment the observation (row number) with one
            obs_count += 1

        # for s,p,o in self.g.triples((None,None,None)):
        #     print(s.__repr__,p.__repr__,o.__repr__)

        logger.debug(
            "{} row skips caused by multiprocessing (multiple of chunksize exceeds number of rows in file)...".format(mult_proc_counter))
        logger.debug(
            "{} errors encountered while trying to iterate over a NoneType...".format(iter_error_counter))
        logger.info("... done")
        return self.ds.serialize(format=self.output_format)

    # def serialize(self):
    #     trig_file_name = self.file_name + '.trig'
    #     logger.info("Starting serialization to {}".format(trig_file_name))
    #
    #     with open(trig_file_name, 'w') as f:
    #         self.np.serialize(f, format='trig')
    #     logger.info("... done")

    def render_pattern(self, pattern, row):
        """Takes a Jinja or Python formatted string, and applies it to the row value"""
        # Significant speedup by not re-instantiating Jinja templates for every
        # row.
        if pattern in self.templates:
            template = self.templates[pattern]
        else:
            template = self.templates[pattern] = Template(pattern)

        # TODO This should take into account the special CSVW instructions such as {_row}

        # First we interpret the url_pattern as a Jinja2 template, and pass all
        # column/value pairs as arguments
        # row = {str('Int'): int('104906'), str('Country'): str('Luxembourg'), str('_row'): 1, str('Rank'): str('2')}
        # print(pattern)
        # print(type(pattern))
        # print(row)
        # print(type(row))
        # rendered_template = template.render(Int=120000)
        rendered_template = template.render(**row)

        try:
            # We then format the resulting string using the standard Python2
            # expressions
            return rendered_template.format(**row)
        except:
            logger.warning(
                u"Could not apply python string formatting, probably due to mismatched curly brackets. IRI will be '{}'.".format(rendered_template))
            return rendered_template

    def expandURL(self, url_pattern, row, datatype=False):
        """Takes a Jinja or Python formatted string, applies it to the row
        values, and returns it as a URIRef"""
        try:
            unicode_url_pattern = unicode(url_pattern)
        except NameError:
            unicode_url_pattern = str(url_pattern).split(')')[0].split('(')[-1]
        # print(unicode_url_pattern)

        url = self.render_pattern(unicode_url_pattern, row)

        # DEPRECATED
        # for ns, nsuri in namespaces.items():
        #     if url.startswith(ns):
        #         url = url.replace(ns + ':', nsuri)
        #         break

        try:
            iri = iribaker.to_iri(url)
            rfc3987.parse(iri, rule='IRI')
        except:
            raise Exception(u"Cannot convert `{}` to valid IRI".format(url))

        # print(iri)
        return URIRef(iri)

    def isValueNull(self, value, c):
        """This checks whether we should continue parsing this cell, or skip it because it is empty or a null value."""
        try:
            if len(value) == 0 and unicode(c.csvw_parseOnEmpty) == u"true":
                # print("Not skipping empty value")
                return False  # because it should not be skipped
            elif len(value) == 0 or value == unicode(c.csvw_null) or value in [unicode(n) for n in c.csvw_null] or value == unicode(self.schema.csvw_null):
                # Skip value if length is zero and equal to (one of) the null value(s)
                # logger.debug(
                #     "Length is 0 or value is equal to specified 'null' value")
                return True
        except:
            # logger.debug("null does not exist or is not a list.")
            # this line would print for every cell in a csv without a defined null value.
            pass
        return False
def test_scenarios() -> None:
    """
    Testing scenarios:
        1. no base set
        2. base set at graph creation
        3. base set at serialization
        4. base set at both graph creation & serialization, serialization overrides
        5. multiple serialization side effect checking
        6. checking results for RDF/XML
        7. checking results for N3
        8. checking results for TriX & TriG
    """

    # variables
    base_one = Namespace("http://one.org/")
    base_two = Namespace("http://two.org/")
    title = Literal("Title", lang="en")
    description = Literal("Test Description", lang="en")
    creator = URIRef("https://creator.com")
    cs = URIRef("")

    # starting graph
    g = Graph()
    g.add((cs, RDF.type, SKOS.ConceptScheme))
    g.add((cs, DCTERMS.creator, creator))
    g.add((cs, DCTERMS.source, URIRef("nick")))
    g.bind("dct", DCTERMS)
    g.bind("skos", SKOS)

    # 1. no base set for graph, no base set for serialization
    g1 = Graph()
    g1 += g
    # @base should not be in output
    assert "@base" not in g.serialize(format="turtle")

    # 2. base one set for graph, no base set for serialization
    g2 = Graph(base=base_one)
    g2 += g
    # @base should be in output, from Graph (one)
    assert "@base <http://one.org/> ." in g2.serialize(format="turtle")

    # 3. no base set for graph, base two set for serialization
    g3 = Graph()
    g3 += g
    # @base should be in output, from serialization (two)
    assert "@base <http://two.org/> ." in g3.serialize(format="turtle", base=base_two)

    # 4. base one set for graph, base two set for serialization, serialization overrides
    g4 = Graph(base=base_one)
    g4 += g
    # @base should be in output, from serialization (two)
    assert "@base <http://two.org/> ." in g4.serialize(format="turtle", base=base_two)
    # just checking that the graph setting (one) hasn't snuck through
    assert "@base <http://one.org/> ." not in g4.serialize(format="turtle", base=base_two)

    # 5. multiple serialization side effect checking
    g5 = Graph()
    g5 += g
    # @base should be in output, from serialization (two)
    assert "@base <http://two.org/> ." in g5.serialize(format="turtle", base=base_two)

    # checking for side effects - no base now set for this serialization
    # @base should not be in output
    assert "@base" not in g5.serialize(format="turtle")

    # 6. checking results for RDF/XML
    g6 = Graph()
    g6 += g
    g6.bind("dct", DCTERMS)
    g6.bind("skos", SKOS)
    assert "@xml:base" not in g6.serialize(format="xml")
    assert 'xml:base="http://one.org/"' in g6.serialize(format="xml", base=base_one)
    g6.base = base_two
    assert 'xml:base="http://two.org/"' in g6.serialize(format="xml")
    assert 'xml:base="http://one.org/"' in g6.serialize(format="xml", base=base_one)

    # 7. checking results for N3
    g7 = Graph()
    g7 += g
    g7.bind("dct", DCTERMS)
    g7.bind("skos", SKOS)
    assert "@xml:base" not in g7.serialize(format="xml")
    assert "@base <http://one.org/> ." in g7.serialize(format="n3", base=base_one)
    g7.base = base_two
    assert "@base <http://two.org/> ." in g7.serialize(format="n3")
    assert "@base <http://one.org/> ." in g7.serialize(format="n3", base=base_one)

    # 8. checking results for TriX & TriG
    # TriX can specify a base per graph but setting a base for the whole
    base_three = Namespace("http://three.org/")
    ds1 = Dataset()
    ds1.bind("dct", DCTERMS)
    ds1.bind("skos", SKOS)
    g8 = ds1.graph(URIRef("http://g8.com/"), base=base_one)
    g9 = ds1.graph(URIRef("http://g9.com/"))
    g8 += g
    g9 += g
    g9.base = base_two
    ds1.base = base_three

    trix = ds1.serialize(format="trix", base=Namespace("http://two.org/"))
    assert '<graph xml:base="http://one.org/">' in trix
    assert '<graph xml:base="http://two.org/">' in trix
    assert '<TriX xml:base="http://two.org/"' in trix

    trig = ds1.serialize(format="trig", base=Namespace("http://two.org/"))
    assert "@base <http://one.org/> ." not in trig
    assert "@base <http://three.org/> ." not in trig
    assert "@base <http://two.org/> ." in trig
        date=Literal(datetime.datetime.now().isoformat(),
                     datatype=XSD.datetime),
        created=None,
        issued=None,
        modified=None,
        exampleResource=exampleResource,
        vocabulary=[URIRef("https://schema.org/")],
        triples=sum(1 for i in ds.graph(
            identifier="https://data.create.humanities.uva.nl/id/kohier1674/"
        ).subjects()),
        temporalCoverage=Literal("1674", datatype=XSD.gYear, normalize=False),
        licenseprop=URIRef(
            "https://creativecommons.org/licenses/by-nc-sa/4.0/"),
        distribution=download)

    ds.bind('owl', OWL)
    ds.bind('create', create)
    ds.bind('schema', schema)
    ds.bind('void', void)
    ds.bind('foaf', foaf)
    ds.bind('edm', edm)
    ds.bind('pnv', pnv)
    ds.bind('roar', roar)
    ds.bind('dc', dc)
    ds.bind('dcterms', dcterms)
    ds.bind('oa', oa)
    ds.bind('prov', prov)

    print("Serializing!")
    ds.serialize('data/kohier1674.trig', format='trig')
                ), Literal(s.strip("' "), lang=lang)))

        # same as
        if not pd.isnull(row['skos_exactMatch']) and row['skos_exactMatch'].strip() != '-':
            for s in row['skos_exactMatch'].split(';'):
                s = s.strip()
                if s[:4] == 'http':
                    graph.add((uri, SKOS.exactMatch, URIRef(s)))
                elif len(s.split(':')) == 2:
                    p = s.split(':')[0]
                    q = s.split(':')[1]
                    if p in vocabs:
                        graph.add(
                            (uri, SKOS.exactMatch, URIRef(vocabs[p] + q)))

        # format extras
        if sheet == 'formats':
            if not pd.isnull(row['premis_formatVersion']) and row['premis_formatVersion'].strip() != '':
                graph.add((uri, URIRef(f'{vocabs["premis"]}formatVersion'),
                           Literal(row['premis_formatVersion'].strip())))
            if not pd.isnull(row['ebucore_hasMimeType']) and row['ebucore_hasMimeType'].strip() != '':
                graph.add((uri, URIRef(f'{vocabs["ebucore"]}hasMimeType'),
                           Literal(row['ebucore_hasMimeType'].strip())))

with open('target/vocabulary.ttl', 'wb') as f:
    f.write(graph.serialize(format='turtle'))

with open('target/vocabulary.nq', 'wb') as f:
    f.write(ds.serialize(format='nquads'))
def test_hext_dataset():
    """Tests context-aware (multigraph) data"""
    d = Dataset()
    trig_data = """
        PREFIX ex: <http://example.com/>
        PREFIX owl: <http://www.w3.org/2002/07/owl#>
        PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
        PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>

        ex:g1 {
            ex:s1
                ex:p1 ex:o1 , ex:o2 ;
                ex:p2 [
                    a owl:Thing ;
                    rdf:value "thingy" ;
                ] ;
                ex:p3 "Object 3" , "Object 4 - English"@en ;
                ex:p4 "2021-12-03"^^xsd:date ;
                ex:p5 42 ;
                ex:p6 "42" ;
            .
        }

        ex:g2 {
            ex:s1
                ex:p1 ex:o1 , ex:o2 ;
            .

            ex:s11 ex:p11 ex:o11 , ex:o12 .
        }

        # default graph triples
        ex:s1 ex:p1 ex:o1 , ex:o2 .
        ex:s21 ex:p21 ex:o21 , ex:o22 .
        """
    d.parse(data=trig_data, format="trig", publicID=d.default_context.identifier)
    out = d.serialize(format="hext")
    # note: can't test for BNs in result as they will be different every time
    testing_lines = [
        [False, '["http://example.com/s21", "http://example.com/p21", "http://example.com/o21", "globalId", "", ""]'],
        [False, '["http://example.com/s21", "http://example.com/p21", "http://example.com/o22", "globalId", "", ""]'],
        [False, '["http://example.com/s1", "http://example.com/p1", "http://example.com/o2", "globalId", "", ""]'],
        [False, '["http://example.com/s1", "http://example.com/p1", "http://example.com/o1", "globalId", "", ""]'],
        [False, '["http://example.com/s11", "http://example.com/p11", "http://example.com/o12", "globalId", "", "http://example.com/g2"]'],
        [False, '["http://example.com/s1", "http://example.com/p1", "http://example.com/o2", "globalId", "", "http://example.com/g2"]'],
        [False, '["http://example.com/s11", "http://example.com/p11", "http://example.com/o11", "globalId", "", "http://example.com/g2"]'],
        [False, '["http://example.com/s1", "http://example.com/p1", "http://example.com/o1", "globalId", "", "http://example.com/g2"]'],
        [False, '["http://example.com/s1", "http://example.com/p1", "http://example.com/o2", "globalId", "", "http://example.com/g1"]'],
        [False, '["http://example.com/s1", "http://example.com/p2"'],
        [False, '"http://www.w3.org/1999/02/22-rdf-syntax-ns#value", "thingy", "http://www.w3.org/2001/XMLSchema#string", "", "http://example.com/g1"]'],
        [False, '"http://www.w3.org/1999/02/22-rdf-syntax-ns#type", "http://www.w3.org/2002/07/owl#Thing", "globalId", "", "http://example.com/g1"]'],
        [False, '["http://example.com/s1", "http://example.com/p3", "Object 4 - English", "http://www.w3.org/1999/02/22-rdf-syntax-ns#langString", "en", "http://example.com/g1"]'],
        [False, '["http://example.com/s1", "http://example.com/p6", "42", "http://www.w3.org/2001/XMLSchema#string", "", "http://example.com/g1"]'],
        [False, '["http://example.com/s1", "http://example.com/p4", "2021-12-03", "http://www.w3.org/2001/XMLSchema#date", "", "http://example.com/g1"]'],
        [False, '["http://example.com/s1", "http://example.com/p1", "http://example.com/o1", "globalId", "", "http://example.com/g1"]'],
        [False, '["http://example.com/s1", "http://example.com/p5", "42", "http://www.w3.org/2001/XMLSchema#integer", "", "http://example.com/g1"]'],
        [False, '["http://example.com/s1", "http://example.com/p3", "Object 3", "http://www.w3.org/2001/XMLSchema#string", "", "http://example.com/g1"]'],
    ]
    for line in out.splitlines():
        for test in testing_lines:
            if test[1] in line:
                test[0] = True

    assert all([x[0] for x in testing_lines])
class LongTermMemory(object):

    ONE_TO_ONE_PREDICATES = [
        'age', 'born_in', 'faceID', 'favorite', 'favorite_of', 'id',
        'is_from', 'manufactured_in', 'mother_is', 'name'
    ]

    def __init__(self, address=config.BRAIN_URL_LOCAL):
        """
        Interact with Triple store

        Parameters
        ----------
        address: str
            IP address and port of the Triple store
        """
        self.address = address
        self.namespaces = {}
        self.ontology_paths = {}
        self.format = 'trig'
        self.dataset = Dataset()
        self.query_prefixes = """
                    prefix gaf: <http://groundedannotationframework.org/gaf#>
                    prefix grasp: <http://groundedannotationframework.org/grasp#>
                    prefix leolaniInputs: <http://cltl.nl/leolani/inputs/>
                    prefix leolaniFriends: <http://cltl.nl/leolani/friends/>
                    prefix leolaniTalk: <http://cltl.nl/leolani/talk/>
                    prefix leolaniTime: <http://cltl.nl/leolani/time/>
                    prefix leolaniWorld: <http://cltl.nl/leolani/world/>
                    prefix n2mu: <http://cltl.nl/leolani/n2mu/>
                    prefix ns1: <urn:x-rdflib:>
                    prefix owl: <http://www.w3.org/2002/07/owl#>
                    prefix prov: <http://www.w3.org/ns/prov#>
                    prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
                    prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
                    prefix sem: <http://semanticweb.cs.vu.nl/2009/11/sem/>
                    prefix skos: <http://www.w3.org/2004/02/skos/core#>
                    prefix time: <http://www.w3.org/TR/owl-time/#>
                    prefix xml: <http://www.w3.org/XML/1998/namespace>
                    prefix xml1: <https://www.w3.org/TR/xmlschema-2/#>
                    prefix xsd: <http://www.w3.org/2001/XMLSchema#>
                    """

        self._define_namespaces()
        self._get_ontology_path()
        self._bind_namespaces()

        self.my_uri = None

        self._log = logger.getChild(self.__class__.__name__)
        self._log.debug("Booted")

    #################################### Main functions to interact with the brain ####################################

    def update(self, capsule):
        """
        Main function to interact with if a statement is coming into the brain.
        Takes in a structured parsed statement, transforms it into triples, and posts them to the triple store
        :param capsule: Structured data of a parsed statement
        :return: json response containing the status for posting the triples, and the original statement
        """
        # Case fold
        capsule = casefold_capsule(capsule)

        # Create graphs and triples
        self._model_graphs_(capsule)
        data = self._serialize(config.BRAIN_LOG)
        code = self._upload_to_brain(data)

        # Create JSON output
        capsule["date"] = str(capsule["date"])
        output = {'response': code, 'statement': capsule}

        return output

    def experience(self, capsule):
        """
        Main function to interact with if an experience is coming into the brain.
        Takes in a structured parsed statement, transforms it into triples, and posts them to the triple store
        :param capsule: Structured data of a parsed statement
        :return: json response containing the status for posting the triples, and the original statement
        """
        # Case fold
        capsule = casefold_capsule(capsule)

        # Create graphs and triples
        self._model_graphs_(capsule, type='Experience')
        data = self._serialize(config.BRAIN_LOG)
        code = self._upload_to_brain(data)

        # Create JSON output
        capsule["date"] = str(capsule["date"])
        output = {'response': code, 'statement': capsule}

        return output

    def query_brain(self, capsule):
        """
        Main function to interact with if a question is coming into the brain.
        Takes in a structured parsed question, transforms it into a query, and queries the triple store for a response
        :param capsule: Structured data of a parsed question
        :return: json response containing the results of the query, and the original question
        """
        # Case fold
        capsule = casefold_capsule(capsule)

        # Generate query
        query = self._create_query(capsule)

        # Perform query
        response = self._submit_query(query)

        # Create JSON output
        if 'date' in capsule.keys():
            capsule["date"] = str(capsule["date"])
        output = {'response': response, 'question': capsule}

        return output

    def process_visual(self, item, exact_only=True):
        """
        Main function to determine if this item can be recognized by the brain, learned, or none
        :param item:
        :return:
        """
        if casefold(item) in self.get_classes():
            # If this is in the ontology already, create sensor triples directly
            text = 'I know about %s. I will remember this object' % item
            return item, text

        temp = self.get_labels_and_classes()
        if casefold(item) in temp.keys():
            # If this is in the ontology already, create sensor triples directly
            text = 'I know about %s. It is of type %s. I will remember this object' % (item, temp[item])
            return item, text

        # Query the web for information
        class_type, description = self.exact_match_dbpedia(item)
        if class_type is not None:
            # Had to learn it, but I can create triples now
            text = 'I did not know what %s is, but I searched on the web and I found that it is a %s. ' \
                   'I will remember this object' % (item, class_type)
            return casefold(class_type), text

        if not exact_only:
            # Second go at dbpedia, relaxed approach
            class_type, description = self.keyword_match_dbpedia(item)
            if class_type is not None:
                # Had to really search for it to learn it, but I can create triples now
                text = 'I did not know what %s is, but I searched for fuzzy matches on the web and I found that it ' \
                       'is a %s. I will remember this object' % (item, class_type)
                return casefold(class_type), text

        # Failure, nothing found
        text = 'I am sorry, I could not learn anything on %s so I will not remember it' % item
        return None, text

    ########## management system for keeping track of chats and turns ##########

    def get_last_chat_id(self):
        """
        Get the id for the last interaction recorded
        :return: id
        """
        query = read_query('last_chat_id')
        response = self._submit_query(query)

        return int(response[0]['chatid']['value']) if response else 0

    def get_last_turn_id(self, chat_id):
        """
        Get the id for the last turn in the given chat
        :param chat_id: id for chat of interest
        :return: id
        """
        query = read_query('last_turn_id') % (chat_id)
        response = self._submit_query(query)

        last_turn = 0
        for turn in response:
            turn_uri = turn['s']['value']
            turn_id = turn_uri.split('/')[-1][10:]
            turn_id = int(turn_id)
            if turn_id > last_turn:
                last_turn = turn_id

        return last_turn

    ########## brain structure exploration ##########

    def get_predicates(self):
        """
        Get predicates in social ontology
        :return:
        """
        query = read_query('predicates')
        response = self._submit_query(query)

        return [elem['p']['value'].split('/')[-1] for elem in response]

    def get_classes(self):
        """
        Get classes in social ontology
        :return:
        """
        query = read_query('classes')
        response = self._submit_query(query)

        return [elem['o']['value'].split('/')[-1] for elem in response]

    def get_labels_and_classes(self):
        """
        Get labels and their classes in social ontology
        :return:
        """
        query = read_query('labels_and_classes')
        response = self._submit_query(query)

        temp = dict()
        for r in response:
            temp[r['l']['value']] = r['o']['value'].split('/')[-1]

        return temp

    ########## learned facts exploration ##########

    def count_statements(self):
        """
        Count statements or 'facts' in the brain
        :return:
        """
        query = read_query('count_statements')
        response = self._submit_query(query)
        return response[0]['count']['value']

    def count_friends(self):
        """
        Count number of people I have talked to
        :return:
        """
        query = read_query('count_friends')
        response = self._submit_query(query)
        return response[0]['count']['value']

    def get_my_friends(self):
        """
        Get names of people I have talked to
        :return:
        """
        query = read_query('my_friends')
        response = self._submit_query(query)
        return [elem['name']['value'].split('/')[-1] for elem in response]

    def get_best_friends(self):
        """
        Get names of the 5 people I have talked to the most
        :return:
        """
        query = read_query('best_friends')
        response = self._submit_query(query)
        return [elem['name']['value'] for elem in response]

    def get_instance_of_type(self, instance_type):
        """
        Get instances of a certain class type
        :param instance_type: name of class in ontology
        :return:
        """
        query = read_query('instance_of_type') % (instance_type)
        response = self._submit_query(query)
        return [elem['name']['value'] for elem in response]

    def when_last_chat_with(self, actor_label):
        """
        Get time value for the last time I chatted with this person
        :param actor_label: name of person
        :return:
        """
        query = read_query('when_last_chat_with') % (actor_label)
        response = self._submit_query(query)
        return response[0]['time']['value'].split('/')[-1]

    def get_triples_with_predicate(self, predicate):
        """
        Get triples that contain this predicate
        :param predicate:
        :return:
        """
        query = read_query('triples_with_predicate') % predicate
        response = self._submit_query(query)
        return [(elem['sname']['value'], elem['oname']['value']) for elem in response]

    ########## conflicts ##########

    def get_all_conflicts(self):
        """
        Aggregate all conflicts in brain
        :return:
        """
        conflicts = []
        for predicate in self.ONE_TO_ONE_PREDICATES:
            conflicts.extend(self._get_conflicts_with_predicate(predicate))

        return conflicts

    ########## semantic web ##########

    def exact_match_dbpedia(self, item):
        """
        Query dbpedia for information on this item to get its semantic type and description.
        :param item:
        :return:
        """
        # Gather combinations
        combinations = [item, item.lower(), item.capitalize(), item.title()]

        for comb in combinations:
            # Try exact matching query
            query = read_query('dbpedia_type_and_description') % (comb)
            response = self._submit_query(query)

            # break if we have a hit
            if response:
                break

        class_type = response[0]['label_type']['value'] if response else None
        description = response[0]['description']['value'].split('.')[0] if response else None

        return class_type, description

    def keyword_match_dbpedia(self, item):
        # Query API
        r = requests.get('http://lookup.dbpedia.org/api/search.asmx/KeywordSearch',
                         params={'QueryString': item, 'MaxHits': '10'},
                         headers={'Accept': 'application/json'}).json()['results']

        # Fuzzy match (fix: match against the item itself, not the literal string "item")
        choices = [e['label'] for e in r]
        best_match = process.extractOne(item, choices)

        # Get best match object
        r = [{'label': e['label'], 'classes': e['classes'], 'description': e['description']}
             for e in r if e['label'] == best_match[0]]

        if r:
            r = r[0]

            if r['classes']:
                # process dbpedia classes only
                r['classes'] = [c['label'] for c in r['classes'] if 'dbpedia' in c['uri']]
        else:
            r = {'label': None, 'classes': None, 'description': None}

        return r['classes'][0] if r['classes'] else None, \
            r['description'].split('.')[0] if r['description'] else None

    ######################################## Helpers for setting up connection ########################################

    def _define_namespaces(self):
        """
        Define namespaces for different layers (ontology/vocab and resource).
        Assign them to self
        :return:
        """
        # Namespaces for the instance layer
        instance_vocab = 'http://cltl.nl/leolani/n2mu/'
        self.namespaces['N2MU'] = Namespace(instance_vocab)
        instance_resource = 'http://cltl.nl/leolani/world/'
        self.namespaces['LW'] = Namespace(instance_resource)

        # Namespaces for the mention layer
        mention_vocab = 'http://groundedannotationframework.org/gaf#'
        self.namespaces['GAF'] = Namespace(mention_vocab)
        mention_resource = 'http://cltl.nl/leolani/talk/'
        self.namespaces['LTa'] = Namespace(mention_resource)

        # Namespaces for the attribution layer
        attribution_vocab = 'http://groundedannotationframework.org/grasp#'
        self.namespaces['GRASP'] = Namespace(attribution_vocab)
        attribution_resource_friends = 'http://cltl.nl/leolani/friends/'
        self.namespaces['LF'] = Namespace(attribution_resource_friends)
        attribution_resource_inputs = 'http://cltl.nl/leolani/inputs/'
        self.namespaces['LI'] = Namespace(attribution_resource_inputs)

        # Namespaces for the temporal layer-ish
        time_vocab = 'http://www.w3.org/TR/owl-time/#'
        self.namespaces['TIME'] = Namespace(time_vocab)
        time_resource = 'http://cltl.nl/leolani/time/'
        self.namespaces['LTi'] = Namespace(time_resource)

        # The namespaces of external ontologies
        skos = 'http://www.w3.org/2004/02/skos/core#'
        self.namespaces['SKOS'] = Namespace(skos)
        prov = 'http://www.w3.org/ns/prov#'
        self.namespaces['PROV'] = Namespace(prov)
        sem = 'http://semanticweb.cs.vu.nl/2009/11/sem/'
        self.namespaces['SEM'] = Namespace(sem)
        xml = 'https://www.w3.org/TR/xmlschema-2/#'
        self.namespaces['XML'] = Namespace(xml)

    def _get_ontology_path(self):
        """
        Define ontology paths to key vocabularies
        :return:
        """
        self.ontology_paths['n2mu'] = './../../knowledge_representation/ontologies/leolani.ttl'
        self.ontology_paths['gaf'] = './../../knowledge_representation/ontologies/gaf.rdf'
        self.ontology_paths['grasp'] = './../../knowledge_representation/ontologies/grasp.rdf'
        self.ontology_paths['sem'] = './../../knowledge_representation/ontologies/sem.rdf'

    def _bind_namespaces(self):
        """
        Bind namespaces
        :return:
        """
        self.dataset.bind('n2mu', self.namespaces['N2MU'])
        self.dataset.bind('leolaniWorld', self.namespaces['LW'])
        self.dataset.bind('gaf', self.namespaces['GAF'])
        self.dataset.bind('leolaniTalk', self.namespaces['LTa'])
        self.dataset.bind('grasp', self.namespaces['GRASP'])
        self.dataset.bind('leolaniFriends', self.namespaces['LF'])
        self.dataset.bind('leolaniInputs', self.namespaces['LI'])
        self.dataset.bind('time', self.namespaces['TIME'])
        self.dataset.bind('leolaniTime', self.namespaces['LTi'])
        self.dataset.bind('skos', self.namespaces['SKOS'])
        self.dataset.bind('prov', self.namespaces['PROV'])
        self.dataset.bind('sem', self.namespaces['SEM'])
        self.dataset.bind('xml', self.namespaces['XML'])
        self.dataset.bind('owl', OWL)

    ######################################## Helpers for statement processing ########################################

    def create_chat_id(self, actor, date):
        """
        Determine chat id depending on my last conversation with this person
        :param actor:
        :param date:
        :return:
        """
        self._log.debug('Chat with {} on {}'.format(actor, date))

        query = read_query('last_chat_with') % (actor)
        response = self._submit_query(query)

        if response and int(response[0]['day']['value']) == int(date.day) \
                and int(response[0]['month']['value']) == int(date.month) \
                and int(response[0]['year']['value']) == int(date.year):
            # Chatted with this person today so same chat id
            chat_id = int(response[0]['chatid']['value'])
        else:
            # Either have never chatted with this person, or I have but not today. Add one to latest chat
            chat_id = self.get_last_chat_id() + 1

        return chat_id

    def create_turn_id(self, chat_id):
        self._log.debug('Turn in chat {}'.format(chat_id))

        query = read_query('last_turn_in_chat') % (chat_id)
        response = self._submit_query(query)
        # fix: _submit_query returns a list of bindings, so index the first one
        return int(response[0]['turnid']['value']) + 1 if response else 1

    def _generate_leolani(self, instance_graph):
        # Create Leolani
        leolani_id = 'leolani'
        leolani_label = 'leolani'

        leolani = URIRef(to_iri(self.namespaces['LW'] + leolani_id))
        leolani_label = Literal(leolani_label)
        leolani_type1 = URIRef(to_iri(self.namespaces['N2MU'] + 'robot'))
        leolani_type2 = URIRef(to_iri(self.namespaces['GRASP'] + 'Instance'))

        instance_graph.add((leolani, RDFS.label, leolani_label))
        instance_graph.add((leolani, RDF.type, leolani_type1))
        instance_graph.add((leolani, RDF.type, leolani_type2))

        self.my_uri = leolani

        return leolani

    def _generate_subject(self, capsule, instance_graph):
        if capsule['subject']['type'] == '':  # We only get the label
            subject_vocab = OWL
            subject_type = 'Thing'
        else:
            subject_vocab = self.namespaces['N2MU']
            subject_type = capsule['subject']['type']

        subject_id = capsule['subject']['label']

        subject = URIRef(to_iri(self.namespaces['LW'] + subject_id))
        subject_label = Literal(subject_id)
        subject_type1 = URIRef(to_iri(subject_vocab + subject_type))
        subject_type2 = URIRef(to_iri(self.namespaces['GRASP'] + 'Instance'))

        instance_graph.add((subject, RDFS.label, subject_label))
        instance_graph.add((subject, RDF.type, subject_type1))
        instance_graph.add((subject, RDF.type, subject_type2))

        return subject, subject_label

    def _create_leolani_world(self, capsule, type='Statement'):
        # Instance graph
        instance_graph_uri = URIRef(to_iri(self.namespaces['LW'] + 'Instances'))
        instance_graph = self.dataset.graph(instance_graph_uri)

        # Subject
        if type == 'Statement':
            subject, subject_label = self._generate_subject(capsule, instance_graph)
        elif type == 'Experience':
            subject = self._generate_leolani(instance_graph) if self.my_uri is None else self.my_uri
            subject_label = 'leolani'

        # Object
        if capsule['object']['type'] == '':  # We only get the label
            object_vocab = OWL
            object_type = 'Thing'
        else:
            object_vocab = self.namespaces['N2MU']
            object_type = capsule['object']['type']

        object_id = capsule['object']['label']

        object = URIRef(to_iri(self.namespaces['LW'] + object_id))
        object_label = Literal(object_id)
        object_type1 = URIRef(to_iri(object_vocab + object_type))
        object_type2 = URIRef(to_iri(self.namespaces['GRASP'] + 'Instance'))

        instance_graph.add((object, RDFS.label, object_label))
        instance_graph.add((object, RDF.type, object_type1))
        instance_graph.add((object, RDF.type, object_type2))

        if type == 'Statement':
            claim_graph, statement = self._create_claim_graph(subject, subject_label,
                                                              object, object_label,
                                                              capsule['predicate']['type'],
                                                              type='Statement')
        elif type == 'Experience':
            claim_graph, statement = self._create_claim_graph(subject, subject_label,
                                                              object, object_label,
                                                              'sees', type='Experience')

        return instance_graph, claim_graph, subject, object, statement

    def _create_claim_graph(self, subject, subject_label, object, object_label, predicate, type='Statement'):
        # Claim graph
        claim_graph_uri = URIRef(to_iri(self.namespaces['LW'] + 'Claims'))
        claim_graph = self.dataset.graph(claim_graph_uri)

        # Statement
        statement_id = hash_statement_id([subject_label, predicate, object_label])

        statement = URIRef(to_iri(self.namespaces['LW'] + statement_id))
        statement_type1 = URIRef(to_iri(self.namespaces['GRASP'] + type))
        statement_type2 = URIRef(to_iri(self.namespaces['GRASP'] + 'Instance'))
        statement_type3 = URIRef(to_iri(self.namespaces['SEM'] + 'Event'))

        # Create graph and add triple
        graph = self.dataset.graph(statement)
        graph.add((subject, self.namespaces['N2MU'][predicate], object))

        claim_graph.add((statement, RDF.type, statement_type1))
        claim_graph.add((statement, RDF.type, statement_type2))
        claim_graph.add((statement, RDF.type, statement_type3))

        return claim_graph, statement

    def _create_leolani_talk(self, capsule, leolani, type='Statement'):
        # Interaction graph
        if type == 'Statement':
            graph_to_write = 'Interactions'
        elif type == 'Experience':
            graph_to_write = 'Sensors'

        interaction_graph_uri = URIRef(to_iri(self.namespaces['LTa'] + graph_to_write))
        interaction_graph = self.dataset.graph(interaction_graph_uri)

        # Time
        date = capsule["date"]
        time = URIRef(to_iri(self.namespaces['LTi'] + str(capsule["date"].isoformat())))
        time_type = URIRef(to_iri(self.namespaces['TIME'] + 'DateTimeDescription'))
        day = Literal(date.day, datatype=self.namespaces['XML']['gDay'])
        month = Literal(date.month, datatype=self.namespaces['XML']['gMonthDay'])
        year = Literal(date.year, datatype=self.namespaces['XML']['gYear'])
        time_unitType = URIRef(to_iri(self.namespaces['TIME'] + 'unitDay'))

        interaction_graph.add((time, RDF.type, time_type))
        interaction_graph.add((time, self.namespaces['TIME']['day'], day))
        interaction_graph.add((time, self.namespaces['TIME']['month'], month))
        interaction_graph.add((time, self.namespaces['TIME']['year'], year))
        interaction_graph.add((time, self.namespaces['TIME']['unitType'], time_unitType))

        # Actor
        actor_id = capsule['author']
        actor_label = capsule['author']

        actor = URIRef(to_iri(self.namespaces['LF'] + actor_id))
        actor_label = Literal(actor_label)
        actor_type1 = URIRef(to_iri(self.namespaces['SEM'] + 'Actor'))
        actor_type2 = URIRef(to_iri(self.namespaces['GRASP'] + 'Instance'))

        if type == 'Statement':
            actor_type3 = URIRef(to_iri(self.namespaces['N2MU'] + 'person'))
        elif type == 'Experience':
            actor_type3 = URIRef(to_iri(self.namespaces['N2MU'] + 'sensor'))

        interaction_graph.add((actor, RDFS.label, actor_label))
        interaction_graph.add((actor, RDF.type, actor_type1))
        interaction_graph.add((actor, RDF.type, actor_type2))
        interaction_graph.add((actor, RDF.type, actor_type3))

        # Add leolani knows/senses actor
        if type == 'Statement':
            predicate = 'knows'
        elif type == 'Experience':
            predicate = 'senses'

        interaction_graph.add((leolani, self.namespaces['N2MU'][predicate], actor))
        _, _ = self._create_claim_graph(leolani, 'leolani', actor, actor_label, predicate, type)

        # Event and subevent
        event_id = self.create_chat_id(actor_label, date)
        if type == 'Statement':
            event_label = 'chat%s' % event_id
        elif type == 'Experience':
            event_label = 'visual%s' % event_id

        subevent_id = self.create_turn_id(event_id)
        if type == 'Statement':
            subevent_label = event_label + '_turn%s' % subevent_id
        elif type == 'Experience':
            subevent_label = event_label + '_object%s' % subevent_id

        turn = URIRef(to_iri(self.namespaces['LTa'] + subevent_label))
        turn_type1 = URIRef(to_iri(self.namespaces['SEM'] + 'Event'))
        if type == 'Statement':
            turn_type2 = URIRef(to_iri(self.namespaces['GRASP'] + 'Turn'))
        elif type == 'Experience':
            turn_type2 = URIRef(to_iri(self.namespaces['GRASP'] + 'Object'))

        interaction_graph.add((turn, RDF.type, turn_type1))
        interaction_graph.add((turn, RDF.type, turn_type2))
        interaction_graph.add((turn, self.namespaces['N2MU']['id'], Literal(subevent_id)))
        interaction_graph.add((turn, self.namespaces['SEM']['hasActor'], actor))
        interaction_graph.add((turn, self.namespaces['SEM']['hasTime'], time))

        chat = URIRef(to_iri(self.namespaces['LTa'] + event_label))
        chat_type1 = URIRef(to_iri(self.namespaces['SEM'] + 'Event'))
        if type == 'Statement':
            chat_type2 = URIRef(to_iri(self.namespaces['GRASP'] + 'Chat'))
        elif type == 'Experience':
            chat_type2 = URIRef(to_iri(self.namespaces['GRASP'] + 'Visual'))

        interaction_graph.add((chat, RDF.type, chat_type1))
        interaction_graph.add((chat, RDF.type, chat_type2))
        interaction_graph.add((chat, self.namespaces['N2MU']['id'], Literal(event_id)))
        interaction_graph.add((chat, self.namespaces['SEM']['hasActor'], actor))
        interaction_graph.add((chat, self.namespaces['SEM']['hasTime'], time))
        interaction_graph.add((chat, self.namespaces['SEM']['hasSubevent'], turn))

        perspective_graph, mention, attribution = self._create_perspective_graph(capsule, subevent_label)

        # Link interactions and perspectives
        perspective_graph.add((mention, self.namespaces['GRASP']['wasAttributedTo'], actor))
        perspective_graph.add((mention, self.namespaces['GRASP']['hasAttribution'], attribution))
        perspective_graph.add((mention, self.namespaces['PROV']['wasDerivedFrom'], chat))
        perspective_graph.add((mention, self.namespaces['PROV']['wasDerivedFrom'], turn))

        return interaction_graph, perspective_graph, actor, time, mention, attribution

    def _create_perspective_graph(self, capsule, turn_label, type='Statement'):
        # Perspective graph
        perspective_graph_uri = URIRef(to_iri(self.namespaces['LTa'] + 'Perspectives'))
        perspective_graph = self.dataset.graph(perspective_graph_uri)

        # Mention
        if type == 'Statement':
            mention_id = turn_label + '_char%s' % capsule['position']
        elif type == 'Experience':
            mention_id = turn_label + '_pixel%s' % capsule['position']
        mention = URIRef(to_iri(self.namespaces['LTa'] + mention_id))
        mention_type = URIRef(to_iri(self.namespaces['GRASP'] + 'Mention'))

        perspective_graph.add((mention, RDF.type, mention_type))

        # Attribution
        attribution_id = mention_id + '_CERTAIN'
        attribution = URIRef(to_iri(self.namespaces['LTa'] + attribution_id))
        attribution_type = URIRef(to_iri(self.namespaces['GRASP'] + 'Attribution'))
        attribution_value = URIRef(to_iri(self.namespaces['GRASP'] + 'CERTAIN'))

        perspective_graph.add((attribution, RDF.type, attribution_type))
        perspective_graph.add((attribution, RDF.value, attribution_value))

        return perspective_graph, mention, attribution

    def _serialize(self, file_path):
        """
        Save graph to local file and return the serialized string
        :param file_path: path to where data will be saved
        :return: serialized data as string
        """
        # Save to file but return the python representation
        with open(file_path + '.' + self.format, 'w') as f:
            self.dataset.serialize(f, format=self.format)
        return self.dataset.serialize(format=self.format)

    def _upload_to_brain(self, data):
        """
        Post data to the brain
        :param data: serialized data as string
        :return: response status
        """
        self._log.debug("Posting triples")

        # From serialized string
        post_url = self.address + "/statements"
        response = requests.post(post_url,
                                 data=data,
                                 headers={'Content-Type': 'application/x-' + self.format})

        return str(response.status_code)

    def _model_graphs_(self, capsule, type='Statement'):
        # Leolani world (includes instance and claim graphs)
        instance_graph, claim_graph, subject, object, instance = self._create_leolani_world(capsule, type)

        # Identity
        leolani = self._generate_leolani(instance_graph) if self.my_uri is None else self.my_uri

        # Leolani talk (includes interaction and perspective graphs)
        interaction_graph, perspective_graph, actor, time, mention, attribution = \
            self._create_leolani_talk(capsule, leolani, type)

        # Interconnections
        instance_graph.add((subject, self.namespaces['GRASP']['denotedIn'], mention))
        instance_graph.add((object, self.namespaces['GRASP']['denotedIn'], mention))
        instance_graph.add((instance, self.namespaces['GRASP']['denotedBy'], mention))
        instance_graph.add((instance, self.namespaces['SEM']['hasActor'], actor))
        instance_graph.add((instance, self.namespaces['SEM']['hasTime'], time))

        perspective_graph.add((mention, self.namespaces['GRASP']['containsDenotation'], subject))
        perspective_graph.add((mention, self.namespaces['GRASP']['containsDenotation'], object))
        perspective_graph.add((mention, self.namespaces['GRASP']['denotes'], instance))
        perspective_graph.add((attribution, self.namespaces['GRASP']['isAttributionFor'], mention))

    ######################################### Helpers for question processing #########################################

    def _create_query(self, parsed_question):
        _ = hash_statement_id([parsed_question['subject']['label'],
                               parsed_question['predicate']['type'],
                               parsed_question['object']['label']])

        # Query subject
        if parsed_question['subject']['label'] == "":
            # Case fold
            # object_label = casefold_label(parsed_question['object']['label'])

            query = """
                SELECT ?slabel ?authorlabel
                WHERE {
                    ?s n2mu:%s ?o .
                    ?s rdfs:label ?slabel .
                    ?o rdfs:label '%s' .
                    GRAPH ?g {
                        ?s n2mu:%s ?o .
                    } .
                    ?g grasp:denotedBy ?m .
                    ?m grasp:wasAttributedTo ?author .
                    ?author rdfs:label ?authorlabel .
                }
                """ % (parsed_question['predicate']['type'],
                       parsed_question['object']['label'],
                       parsed_question['predicate']['type'])

        # Query object
        elif parsed_question['object']['label'] == "":
            query = """
                SELECT ?olabel ?authorlabel
                WHERE {
                    ?s n2mu:%s ?o .
                    ?s rdfs:label '%s' .
                    ?o rdfs:label ?olabel .
                    GRAPH ?g {
                        ?s n2mu:%s ?o .
                    } .
                    ?g grasp:denotedBy ?m .
                    ?m grasp:wasAttributedTo ?author .
                    ?author rdfs:label ?authorlabel .
                }
                """ % (parsed_question['predicate']['type'],
                       parsed_question['subject']['label'],
                       parsed_question['predicate']['type'])

        # Query existence
        else:
            query = """
                SELECT ?authorlabel ?v
                WHERE {
                    ?s n2mu:%s ?o .
                    ?s rdfs:label '%s' .
                    ?o rdfs:label '%s' .
                    GRAPH ?g {
                        ?s n2mu:%s ?o .
                    } .
                    ?g grasp:denotedBy ?m .
                    ?m grasp:wasAttributedTo ?author .
                    ?author rdfs:label ?authorlabel .
                    ?m grasp:hasAttribution ?att .
                    ?att rdf:value ?v .
                }
                """ % (parsed_question['predicate']['type'],
                       parsed_question['subject']['label'],
                       parsed_question['object']['label'],
                       parsed_question['predicate']['type'])

        query = self.query_prefixes + query

        return query

    def _submit_query(self, query):
        # Set up connection
        sparql = SPARQLWrapper(self.address)

        # Response parameters
        sparql.setQuery(query)
        sparql.setReturnFormat(JSON)
        sparql.addParameter('Accept', 'application/sparql-results+json')
        response = sparql.query().convert()

        return response["results"]["bindings"]

    ######################################### Helpers for conflict processing #########################################

    def _get_conflicts_with_predicate(self, one_to_one_predicate):
        query = """
            PREFIX n2mu: <http://cltl.nl/leolani/n2mu/>
            PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
            PREFIX grasp: <http://groundedannotationframework.org/grasp#>

            select ?sname
                    (group_concat(?oname ; separator=";") as ?onames)
                    (group_concat(?authorlabel ; separator=";") as ?authorlabels)
            where {
                GRAPH ?g {
                    ?s n2mu:%s ?o .
                } .
                ?s rdfs:label ?sname .
                ?o rdfs:label ?oname .
                ?g grasp:denotedBy ?m .
                ?m grasp:wasAttributedTo ?author .
                ?author rdfs:label ?authorlabel .
            } group by ?sname having (count(distinct ?oname) > 1)
            """ % one_to_one_predicate

        response = self._submit_query(query)
        conflicts = []
        for item in response:
            conflict = {'subject': item['sname']['value'],
                        'predicate': one_to_one_predicate,
                        'objects': []}

            values = item['onames']['value'].split(';')
            authors = item['authorlabels']['value'].split(';')

            for val, auth in zip(values, authors):
                option = {'value': val, 'author': auth}
                conflict['objects'].append(option)

            conflicts.append(conflict)

        return conflicts
def update_test(t):
    # the update-eval tests refer to graphs on http://example.org
    rdflib_sparql_module.SPARQL_LOAD_GRAPHS = False

    uri, name, comment, data, graphdata, query, res, syntax = t

    if uri in skiptests:
        raise SkipTest()

    try:
        g = Dataset()

        if not res:
            if syntax:
                with bopen(query[7:]) as f:
                    translateUpdate(parseUpdate(f))
            else:
                try:
                    with bopen(query[7:]) as f:
                        translateUpdate(parseUpdate(f))
                    raise AssertionError("Query shouldn't have parsed!")
                except:
                    pass  # negative syntax test
            return

        resdata, resgraphdata = res

        # read input graphs
        if data:
            g.default_context.load(data, format=_fmt(data))

        if graphdata:
            for x, l in graphdata:
                g.load(x, publicID=URIRef(l), format=_fmt(x))

        with bopen(query[7:]) as f:
            req = translateUpdate(parseUpdate(f))
        evalUpdate(g, req)

        # read expected results
        resg = Dataset()
        if resdata:
            resg.default_context.load(resdata, format=_fmt(resdata))

        if resgraphdata:
            for x, l in resgraphdata:
                resg.load(x, publicID=URIRef(l), format=_fmt(x))

        eq(
            set(x.identifier for x in g.contexts() if x != g.default_context),
            set(x.identifier for x in resg.contexts() if x != resg.default_context),
            "named graphs in datasets do not match",
        )
        assert isomorphic(
            g.default_context, resg.default_context
        ), "Default graphs are not isomorphic"

        for x in g.contexts():
            if x == g.default_context:
                continue
            assert isomorphic(x, resg.get_context(x.identifier)), (
                "Graphs with ID %s are not isomorphic" % x.identifier
            )

    except Exception as e:
        if isinstance(e, AssertionError):
            failed_tests.append(uri)
            fails[str(e)] += 1
        else:
            error_tests.append(uri)
            errors[str(e)] += 1

        if DEBUG_ERROR and not isinstance(e, AssertionError) or DEBUG_FAIL:
            print("======================================")
            print(uri)
            print(name)
            print(comment)

            if not res:
                if syntax:
                    print("Positive syntax test")
                else:
                    print("Negative syntax test")

            if data:
                print("----------------- DATA --------------------")
                print(">>>", data)
                print(bopen_read_close(data[7:]))

            if graphdata:
                print("----------------- GRAPHDATA --------------------")
                for x, l in graphdata:
                    print(">>>", x, l)
                    print(bopen_read_close(x[7:]))

            print("----------------- Request -------------------")
            print(">>>", query)
            print(bopen_read_close(query[7:]))

            if res:
                if resdata:
                    print("----------------- RES DATA --------------------")
                    print(">>>", resdata)
                    print(bopen_read_close(resdata[7:]))
                if resgraphdata:
                    print("----------------- RES GRAPHDATA -------------------")
                    for x, l in resgraphdata:
                        print(">>>", x, l)
                        print(bopen_read_close(x[7:]))

            print("------------- MY RESULT ----------")
            print(g.serialize(format="trig"))

            try:
                pq = translateUpdate(parseUpdate(bopen_read_close(query[7:])))
                print("----------------- Parsed ------------------")
                pprintAlgebra(pq)
                # print pq
            except:
                print("(parser error)")

            print(decodeStringEscape(str(e)))

            import pdb

            pdb.post_mortem(sys.exc_info()[2])
        raise
    # Info on the item
    g.add((item, RDF.type, saa.Item))
    g.add((item, saa.term('index'), Literal(record['assigned_item_no'])))

    if record['persistent_uid'] != "":
        g.add((item, saa.identifier, Literal(record['persistent_uid'])))

    g.add((item, RDFS.label, Literal(record['title'], lang='nl')))
    g.add((item, saa.artist, Literal(record['artist_name_1'])))
    g.add((item, saa.transcription, Literal(record['entry'], lang='nl')))
    g.add((item, saa.workType, Literal(record['object_type_1'], lang='nl')))

    if record['room'] != "":
        g.add((item, saa.room, Literal(record['room'], lang='nl')))

    if record['valuation_amount'] != "":
        g.add((item, saa.valuation, Literal(record['valuation_amount'])))

    return g


if __name__ == "__main__":
    ds = Dataset()
    ds.bind('ga', ga)
    ds.bind('saa', saa)

    ds = main(dataset=ds)

    ds.serialize('Dutch_Archival_Descriptions_Getty.trig', format='trig')
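# A quick sanity check (a sketch, assuming the script above has been run):
# reload the serialized TriG file into a fresh Dataset and count the quads
# that round-tripped.
from rdflib import Dataset

check = Dataset()
check.parse('Dutch_Archival_Descriptions_Getty.trig', format='trig')
print(len(list(check.quads((None, None, None, None)))), 'quads reloaded')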
format="n3", base=base_one).decode("utf-8") g7.base = base_two assert "@base <http://two.org/> ." in g7.serialize(format="n3").decode("utf-8") assert "@base <http://one.org/> ." in g7.serialize( format="n3", base=base_one).decode("utf-8") # 8. checking results for TriX & TriG # TriX can specify a base per graph but setting a base for the whole base_three = Namespace("http://three.org/") ds1 = Dataset() ds1.bind("dct", DCTERMS) ds1.bind("skos", SKOS) g8 = ds1.graph(URIRef("http://g8.com/"), base=base_one) g9 = ds1.graph(URIRef("http://g9.com/")) g8 += g g9 += g g9.base = base_two ds1.base = base_three trix = ds1.serialize(format="trix", base=Namespace("http://two.org/")).decode("utf-8") assert '<graph xml:base="http://one.org/">' in trix assert '<graph xml:base="http://two.org/">' in trix assert '<TriX xml:base="http://two.org/"' in trix trig = ds1.serialize(format="trig", base=Namespace("http://two.org/")).decode("utf-8") assert "@base <http://one.org/> ." not in trig assert "@base <http://three.org/> ." not in trig assert "@base <http://two.org/> ." in trig
r1.ingredients.append(i1_1)
r1.tags.append(t1)
r1.add_prov("wasDerivedFrom", URIRef("http://recipes.com/r/Foo"))
r1.add_pub_info("wasAttributedTo", Literal("Jeff the Data Guy"))

summed = Dataset()
for quad in r1.__publish__():
    summed.add(quad)
summed.namespace_manager.bind("np", data.NP, True)
summed.namespace_manager.bind("recipe-kb", data.BASE, True)
summed.namespace_manager.bind("prov", data.PROV, True)
print(summed.serialize(format="trig").decode("utf-8"))

u1 = data.USDAEntry(12345, "CHEESE,SERIOUSLY SPICY", [])
l1 = data.Linkage(data.IngredientName(i1_1.name), u1)

summed = Dataset()
for quad in l1.__publish__():
    summed.add(quad)
summed.namespace_manager.bind("np", data.NP, True)
summed.namespace_manager.bind("recipe-kb", data.BASE, True)
summed.namespace_manager.bind("prov", data.PROV, True)
print(summed.serialize(format="trig").decode("utf-8"))
def update_test(t):
    # the update-eval tests refer to graphs on http://example.org
    rdflib_sparql_module.SPARQL_LOAD_GRAPHS = False

    uri, name, comment, data, graphdata, query, res, syntax = t

    if uri in skiptests:
        raise SkipTest()

    try:
        g = Dataset()

        if not res:
            if syntax:
                translateUpdate(parseUpdate(open(query[7:])))
            else:
                try:
                    translateUpdate(parseUpdate(open(query[7:])))
                    raise AssertionError("Query shouldn't have parsed!")
                except:
                    pass  # negative syntax test
            return

        resdata, resgraphdata = res

        # read input graphs
        if data:
            g.default_context.load(data, format=_fmt(data))

        if graphdata:
            for x, l in graphdata:
                g.load(x, publicID=URIRef(l), format=_fmt(x))

        req = translateUpdate(parseUpdate(open(query[7:])))
        evalUpdate(g, req)

        # read expected results
        resg = Dataset()
        if resdata:
            resg.default_context.load(resdata, format=_fmt(resdata))

        if resgraphdata:
            for x, l in resgraphdata:
                resg.load(x, publicID=URIRef(l), format=_fmt(x))

        eq(set(x.identifier for x in g.contexts() if x != g.default_context),
           set(x.identifier for x in resg.contexts() if x != resg.default_context),
           'named graphs in datasets do not match')
        assert isomorphic(g.default_context, resg.default_context), \
            'Default graphs are not isomorphic'

        for x in g.contexts():
            if x == g.default_context:
                continue
            assert isomorphic(x, resg.get_context(x.identifier)), \
                "Graphs with ID %s are not isomorphic" % x.identifier

    except Exception, e:
        if isinstance(e, AssertionError):
            failed_tests.append(uri)
            fails[str(e)] += 1
        else:
            error_tests.append(uri)
            errors[str(e)] += 1

        if DEBUG_ERROR and not isinstance(e, AssertionError) or DEBUG_FAIL:
            print "======================================"
            print uri
            print name
            print comment

            if not res:
                if syntax:
                    print "Positive syntax test"
                else:
                    print "Negative syntax test"

            if data:
                print "----------------- DATA --------------------"
                print ">>>", data
                print open(data[7:]).read()
            if graphdata:
                print "----------------- GRAPHDATA --------------------"
                for x, l in graphdata:
                    print ">>>", x, l
                    print open(x[7:]).read()

            print "----------------- Request -------------------"
            print ">>>", query
            print open(query[7:]).read()

            if res:
                if resdata:
                    print "----------------- RES DATA --------------------"
                    print ">>>", resdata
                    print open(resdata[7:]).read()
                if resgraphdata:
                    print "----------------- RES GRAPHDATA -------------------"
                    for x, l in resgraphdata:
                        print ">>>", x, l
                        print open(x[7:]).read()

            print "------------- MY RESULT ----------"
            print g.serialize(format='trig')

            try:
                pq = translateUpdate(parseUpdate(open(query[7:]).read()))
                print "----------------- Parsed ------------------"
                pprintAlgebra(pq)
                # print pq
            except:
                print "(parser error)"

            print decodeStringEscape(unicode(e))

            import pdb
            pdb.post_mortem(sys.exc_info()[2])
        raise
            # create triples containing subject (neurodkg instances), predicate (several are defined above),
            # and object (neurodkg instances) and add them to the dataset
            dataset.add((NEURO_INST[subj], URIRef(predicate_to_uri[pred]), NEURO_INST[obj]))

        # object id: differentiating between the cases of having a disease ID or not
        elif str(obj_id) != 'nan':
            print(obj_id)
            curie = obj_id.replace(' ', '').split(':')
            if len(curie) <= 1:
                print(obj_id)
            prefix = curie[0].lower()
            obj_id = curie[1]
            print(curie)

            # if a disease ID was found, then add the ID and ontology as object of the triple
            # obj_uri = BASE[prefix+':'+obj_id]
            obj_uri = URIRef(prefix_dict[prefix] + obj_id)
            dataset.add((NEURO_INST[subj], URIRef(predicate_to_uri[pred]), obj_uri))
            # also attach the human-readable disease label to the identifier URI
            dataset.add((obj_uri, RDFS['label'], Literal(obj)))
        else:
            # no disease ID: use a known URI for the object if available, else fall back to a literal
            if obj in object_to_uri:
                obj_uri = object_to_uri[obj]
                dataset.add((NEURO_INST[subj], URIRef(predicate_to_uri[pred]), URIRef(obj_uri)))
            else:
                dataset.add((NEURO_INST[subj], URIRef(predicate_to_uri[pred]), Literal(obj)))

        print("---------", index)

# saving the dataset as a turtle file
dataset.serialize('data/output/neuro_dkg.ttl', format='turtle')
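# A small sketch of the CURIE expansion step above. prefix_dict here is a
# stand-in with one illustrative mapping; the real script defines its own
# dictionary elsewhere.
from rdflib import URIRef

prefix_dict = {'mondo': 'http://purl.obolibrary.org/obo/MONDO_'}  # illustrative

def expand_curie(text):
    """Turn 'PREFIX: 1234' into a URIRef, or return None when the text is
    not a prefix:id pair or the prefix is unknown."""
    parts = text.replace(' ', '').split(':')
    if len(parts) != 2 or parts[0].lower() not in prefix_dict:
        return None
    return URIRef(prefix_dict[parts[0].lower()] + parts[1])

print(expand_curie('MONDO: 0005071'))  # -> .../MONDO_0005071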