Example 1
def dump_as_rdf(g: Dataset, table_name: str) -> bool:
    """
    Dump the contents of Dataset g as RDF Turtle
    :param g: Dataset to dump
    :param table_name: name of the base table
    :return: success indicator
    """

    # Propagate the mapped concepts up the tree
    def add_to_ancestors(s: URIRef, vm: URIRef):
        g.add((s, ISO['enumeratedConceptualDomain.hasMember'], vm))
        for parent in g.objects(s, SKOS.broader):
            add_to_ancestors(parent, vm)

    if COMPUTE_MEMBERS and EXPLICIT_MEMBERS:
        for subj, obj in g.subject_objects(SKOS.exactMatch):
            add_to_ancestors(subj, obj)
        # TODO: this gives us a list of all concepts in the scheme... useful?
        for scheme, tc in g.subject_objects(SKOS.hasTopConcept):
            for member in g.objects(
                    tc, ISO['enumeratedConceptualDomain.hasMember']):
                g.add((scheme, ISO['enumeratedConceptualDomain.hasMember'],
                       member))

    for name, ns in namespaces.items():
        g.bind(name.lower(), ns)
    outfile = os.path.join(DATA_DIR, table_name + '.ttl')
    print(f"Saving output to {outfile}")
    g.serialize(outfile, format='turtle')
    print(f"{len(g)} triples written")
    return True
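The function relies on several module-level settings. A minimal invocation sketch, with placeholder values for the globals it assumes (`DATA_DIR`, `COMPUTE_MEMBERS`, `EXPLICIT_MEMBERS`, `ISO`, `namespaces` are all stand-ins here):

import os
from rdflib import Dataset, Namespace
from rdflib.namespace import SKOS

# Placeholder globals assumed by dump_as_rdf:
DATA_DIR = 'data'
COMPUTE_MEMBERS = True
EXPLICIT_MEMBERS = True
ISO = Namespace('http://example.org/iso11179#')  # placeholder namespace
namespaces = {'iso': ISO, 'skos': SKOS}

g = Dataset()
# ... populate g with SKOS concepts and skos:exactMatch links ...
dump_as_rdf(g, 'my_table')  # writes data/my_table.ttl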
Example 2
def createNanopubs(g):
	ds = Dataset()
	ds.namespace_manager.bind("ddi","http://purl.org/net/nlprepository/spl-ddi-annotation-poc#")
	ds.namespace_manager.bind("prov","http://www.w3.org/ns/prov#")
	ds.namespace_manager.bind("np", "http://www.nanopub.org/nschema#")
	
	bindings = g.query(interactSelect)
	for b in bindings:
		npURI = URIRef(b['inter'] + "-nanopub")
		headURI = URIRef(b['inter'] + "-head")
		aURI =  URIRef(b['inter'] + "-assertion")
		pubInfoURI = URIRef(b['inter'] + "-pubInfo")
		provURI = URIRef(b['inter'] + "-provenance")
		
		
		head = ds.add_graph(headURI)
		head.add((npURI, RDF.type, np['Nanopublication']))
		head.add((aURI, RDF.type, np['Assertion']))
		head.add((provURI, RDF.type, np['Provenance']))
		head.add((pubInfoURI, RDF.type, np['PublicationInfo']))
		head.add((npURI, np['hasAssertion'], aURI))
		head.add((npURI, np['hasProvenance'], provURI))
		head.add((npURI, np['hasPublicationInfo'], pubInfoURI))

		#print head.serialize()
		
		a = ds.add_graph(aURI)
		a.add((b['s'], URIRef('http://dbmi-icode-01.dbmi.pitt.edu/dikb/vocab/interactsWith'), b['o']))
		a.add((b['s'], RDF.type, sio["SIO_010038"]))
		a.add((b['o'], RDF.type,  sio["SIO_010038"]))
		
		prov = ds.add_graph(provURI)
		prov.add((aURI, w3prov['wasDerivedFrom'], b['inter']))
		
	print(ds.serialize(format='trig'))
Example 3
def test_roundtrip():
    d = Dataset()
    d.parse(Path(__file__).parent / "test_parser_hext_multigraph.ndjson",
            format="hext",
            publicID=d.default_context.identifier)
    d.default_union = True
    with open(str(
            Path(__file__).parent /
            "test_parser_hext_multigraph.ndjson")) as i:
        ordered_input = "".join(sorted(i.readlines())).strip()

    ordered_output = "\n".join(sorted(
        d.serialize(format="hext").split("\n"))).strip()

    assert ordered_output == ordered_input
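The sort-before-compare works because Hextuples is line-delimited (one JSON array per statement), so order-insensitive equality reduces to comparing sorted line sets. A smaller sketch of the same parse/serialize roundtrip with inline data instead of a fixture file:

from rdflib import Dataset, URIRef

# One hextuple per line: [subject, predicate, value, datatype, language, graph]
hext_line = ('["http://example.com/s", "http://example.com/p", '
             '"http://example.com/o", "globalId", "", "http://example.com/g"]')
d = Dataset()
d.parse(data=hext_line, format="hext", publicID=d.default_context.identifier)
d.default_union = True  # so lookups span the named graphs too
assert (URIRef("http://example.com/s"),
        URIRef("http://example.com/p"),
        URIRef("http://example.com/o")) in d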
Example 4
def test_hext_dataset_linecount():
    d = Dataset()
    assert len(d) == 0
    d.parse(Path(__file__).parent / "test_parser_hext_multigraph.ndjson",
            format="hext",
            publicID=d.default_context.identifier)
    total_triples = 0
    # count all the triples in the Dataset
    for context in d.contexts():
        for triple in context.triples((None, None, None)):
            total_triples += 1
    assert total_triples == 18

    # count the number of serialized Hextuples, should be 22, as per the original file
    lc = len(d.serialize(format="hext").splitlines())
    assert lc == 22
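The nested context loop above has a one-line equivalent using `Dataset.quads()`, which iterates over every (s, p, o, g) across all contexts; a sketch assuming the same fixture file:

total_triples = sum(1 for _ in d.quads((None, None, None, None)))
assert total_triples == 18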
Example 5
def test_hext_json_representation():
    """Tests to see if every link in the ND-JSON Hextuple result is, in fact, JSON"""
    d = Dataset()
    trig_data = """
            PREFIX ex: <http://example.com/>
            PREFIX owl: <http://www.w3.org/2002/07/owl#>
            PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
            PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>

            ex:g1 {
                ex:s1
                    ex:p1 ex:o1 , ex:o2 ;
                    ex:p2 [
                        a owl:Thing ;
                        rdf:value "thingy" ;
                    ] ;
                    ex:p3 "Object 3" , "Object 4 - English"@en ;
                    ex:p4 "2021-12-03"^^xsd:date ;
                    ex:p5 42 ;
                    ex:p6 "42" ;
                .
            }

            ex:g2 {
                ex:s1
                    ex:p1 ex:o1 , ex:o2 ;
                .
                ex:s11 ex:p11 ex:o11 , ex:o12 .
            }

            # default graph triples
            ex:s1 ex:p1 ex:o1 , ex:o2 .
            ex:s21 ex:p21 ex:o21 , ex:o22 .
           """
    d.parse(data=trig_data, format="trig")
    out = d.serialize(format="hext")
    for line in out.splitlines():
        j = json.loads(line)
        assert isinstance(j, list)
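Per the Hextuples spec, each line is a six-item array: subject, predicate, value, datatype ("globalId" for IRIs, "localId" for blank nodes, or a datatype IRI for literals), language, and graph. A sketch tightening the loop above with that arity check:

for line in out.splitlines():
    j = json.loads(line)
    assert isinstance(j, list)
    assert len(j) == 6  # subject, predicate, value, datatype, language, graph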
Example 6
class Fragment(object):

    HYDRA = Namespace("http://www.w3.org/ns/hydra/core#")
    VOID = Namespace("http://rdfs.org/ns/void#")
    FOAF = Namespace("http://xmlns.com/foaf/0.1/")
    DCTERMS = Namespace("http://purl.org/dc/terms/")

    def __init__(self):
        self.rdf_graph = Dataset()

    def add_data_triple(self, subject, predicate, obj):
        self.rdf_graph.add((subject, predicate, obj))

    def add_graph(self, identifier):
        self.rdf_graph.graph(identifier)

    def add_meta_quad(self, graph, subject, predicate, obj):
        # rdflib quads are ordered (s, p, o, graph/context)
        self.rdf_graph.add((subject, predicate, obj, graph))

    def add_prefix(self, prefix, uri):
        self.rdf_graph.bind(prefix, uri)

    def serialize(self):
        return self.rdf_graph.serialize(format="trig", encoding="utf-8")
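A usage sketch for the `Fragment` wrapper above (the URIs are illustrative):

from rdflib import URIRef, Literal

f = Fragment()
f.add_prefix("ex", "http://example.com/")
f.add_data_triple(URIRef("http://example.com/s"),
                  URIRef("http://example.com/p"),
                  Literal("an object"))
print(f.serialize().decode("utf-8"))  # serialize() returns utf-8 bytes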
Example 7
"""
@prefix ex: <http://example.com/> .

ex:graph-1 {
    ex:subject-x ex:predicate-x "Triple X" .

    ex:subject-z ex:predicate-z "Triple Z" .
}

ex:graph-2 {
    ex:subject-y ex:predicate-y "Triple Y" .
}
"""
print("Printing Serialised Dataset:")
print("---")
print(d.serialize(format="trig"))
print("---")
print()
print()

#
#   Use & Query
#

# print the length of the Dataset, i.e. the count of all triples in all Graphs
# we should get
"""
3
"""
print("Printing Dataset Length:")
print("---")
Example 8
def visit_sparql(url, format='html', depth=1):
    sparqls = get_sparql_endpoints(url)
    predicates = get_predicates(sparqls, url)

    if format == 'html':
        limit_fraction = QUERY_RESULTS_LIMIT // 3
        if len(predicates) > 1:
            # integer division keeps the SPARQL LIMIT clause an integer
            predicate_query_limit_fraction = (limit_fraction *
                                              2) // len(predicates)
        else:
            predicate_query_limit_fraction = limit_fraction * 2

        results = []

        def predicate_specific_sparql(sparql, query):
            log.debug(query)

            sparql.setQuery(query)
            res = sparql.query().convert()
            results.extend(list(res["results"]["bindings"]))

        threads = []
        local_results = []
        for p in predicates:
            q = u"""SELECT DISTINCT ?s ?p ?o ?g WHERE {{
                {{
                GRAPH ?g {{
                    {{
                        <{url}> <{predicate}> ?o .
                        BIND(<{url}> as ?s)
                        BIND(<{predicate}> as ?p)
                    }} UNION {{
                        ?s <{predicate}> <{url}>.
                        BIND(<{url}> as ?o)
                        BIND(<{predicate}> as ?p)
                    }}
                }}
                }} UNION {{
                    {{
                        <{url}> <{predicate}> ?o .
                        BIND(<{url}> as ?s)
                        BIND(<{predicate}> as ?p)
                    }} UNION {{
                        ?s <{predicate}> <{url}>.
                        BIND(<{url}> as ?o)
                        BIND(<{predicate}> as ?p)
                    }}
                }}
            }} LIMIT {limit}""".format(url=url,
                                       predicate=p,
                                       limit=predicate_query_limit_fraction)

            for s in sparqls:
                # Start processes for each endpoint, for each predicate query
                process = Thread(target=predicate_specific_sparql, args=[s, q])
                process.start()
                threads.append(process)

        url_is_predicate_query = u"""SELECT DISTINCT ?s ?p ?o ?g WHERE {{
            {{
            GRAPH ?g {{
                ?s <{url}> ?o.
                BIND(<{url}> as ?p)
            }}
            }} UNION {{
                ?s <{url}> ?o.
                BIND(<{url}> as ?p)
            }}
        }} LIMIT {limit}""".format(url=url, limit=limit_fraction)

        for s in sparqls:
            process = Thread(target=predicate_specific_sparql,
                             args=[s, url_is_predicate_query])
            process.start()
            threads.append(process)

        # We now pause execution on the main thread by 'joining' all of our started threads.
        # This ensures that each has finished processing the urls.
        for process in threads:
            process.join()

        if LDF_STATEMENTS_URL is not None:
            retrieve_ldf_results(url)

        # We also add local results (result of dereferencing)
        local_results = list(visit_local(url, format))

        results.extend(local_results)

        # If a Druid statements URL is specified, we'll try to receive it as
        # well
        if DRUID_STATEMENTS_URL is not None:
            results.extend(visit_druid(url, format))

        if depth > 1:
            # If depth is larger than 1, we proceed to extend the results with the results of
            # visiting all object resources for every triple in the resultset.
            newresults = []

            objects = set([
                r['o']['value'] for r in results
                if r['o']['value'] != url and r['o']['type'] == 'uri'
            ])

            for o in objects:
                newresults.extend(visit(o, format=format, depth=depth - 1))

            results.extend(newresults)

    else:
        q = u"""
        CONSTRUCT {{
            ?s ?p ?o .
        }} WHERE {{
            {{
            GRAPH ?g {{
                {{
                    <{url}> ?p ?o .
                    BIND(<{url}> as ?s)
                }} UNION {{
                    ?s ?p <{url}>.
                    BIND(<{url}> as ?o)
                }} UNION {{
                    ?s <{url}> ?o.
                    BIND(<{url}> as ?p)
                }}
            }}
            }} UNION {{
                {{
                    <{url}> ?p ?o .
                    BIND(<{url}> as ?s)
                }} UNION {{
                    ?s ?p <{url}>.
                    BIND(<{url}> as ?o)
                }} UNION {{
                    ?s <{url}> ?o.
                    BIND(<{url}> as ?p)
                }}
            }}
        }} LIMIT {limit}""".format(url=url, limit=QUERY_RESULTS_LIMIT)

        result_dataset = Dataset()

        for s in sparqls:
            s.setQuery(q)
            s.setReturnFormat(XML)

            result_dataset += s.query().convert()

        if format == 'jsonld':
            results = result_dataset.serialize(format='json-ld')
        elif format == 'rdfxml':
            s.setReturnFormat(XML)
            results = result_dataset.serialize(format='pretty-xml')
        elif format == 'turtle':
            s.setReturnFormat(XML)
            results = result_dataset.serialize(format='turtle')
        else:
            results = 'Nothing'

    log.debug("Received results")

    return results
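The fan-out/join pattern used above, one worker `Thread` per (endpoint, query) pair, all extending a shared results list, reduces to the following sketch; `run_query`, `endpoints`, and `queries` are hypothetical stand-ins:

from threading import Thread

results = []

def worker(endpoint, query):
    # run_query is a hypothetical stand-in for setQuery()/query().convert();
    # list.extend is atomic under CPython's GIL, so no lock is needed here
    results.extend(run_query(endpoint, query))

threads = [Thread(target=worker, args=(e, q))
           for e in endpoints for q in queries]
for t in threads:
    t.start()
for t in threads:
    t.join()  # block until every worker has finished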
Example 9
def main(source, target, geometryfile='data/point2wkt.json'):
    with open(source) as infile:
        data = json.load(infile)

    with open(geometryfile) as infile:
        point2wkt = json.load(infile)

    ds = Dataset()
    dataset = lp.term('')

    g = rdfSubject.db = ds.graph(identifier=lp)

    ### Custom triples / Ontology

    g.add((lpOnt.Adres, OWL.equivalentClass, schema.PostalAddress))

    g.add((lpOnt.Straat, OWL.equivalentClass, hg.Street))
    g.add((lpOnt.Buurt, OWL.equivalentClass, hg.Neighbourhood))

    g.add((lpOnt.adres, OWL.equivalentProperty, schema.address))

    ########
    # Data #
    ########

    adres2locatie = defaultdict(lambda: defaultdict(list))

    for n, adresLabel in enumerate(data, 1):

        if n % 5000 == 0:
            print(f"{n}/{len(data)}", end='\r')
            # break

        # # geometry
        # wkt = point2wkt.get(locatiepunt)

        # wktLiteral = Literal(wkt, datatype=geo.wktLiteral)
        # geometry = Geometry(lpGeo.term(str(locatiepunt)),
        #                     asWKT=wktLiteral,
        #                     label=[str(locatiepunt)])

        addresses = getAdres(data[adresLabel], adresLabel, point2wkt)

        # adres2locatie[adres][year].append(geometry)

        # observations.append(locpdetail)
        # locp.observation = observations

        # addresses.append(
        #     Role(
        #         None,
        #         label=address.label,
        #         address=address,
        #         hasLatestBeginTimeStamp=locpdetail.hasLatestBeginTimeStamp,
        #         hasEarliestEndTimeStamp=locpdetail.hasEarliestEndTimeStamp,
        #         startDate=Literal(year, datatype=XSD.gYear)))

    ds.bind('create', create)
    ds.bind('schema', schema)
    ds.bind('sem', sem)
    ds.bind('geo', geo)
    ds.bind('juso', juso)
    ds.bind('qb', qb)
    ds.bind('void', void)

    print("Serializing!")
    ds.serialize(target, format='trig')
Example 10
            if row['type'] != "Huisartsenposten":
                dataset.add((newClass, RDFS['label'],
                             Literal(row['type_en'],
                                     lang="en")))  #, datatype=XSD['string'])))
            else:
                dataset.add((newClass, RDFS['label'],
                             Literal(row['type_en'] + " - Out of office hours",
                                     lang="en")))  #, datatype=XSD['string'])))
        if short[i] == "opvo":
            substrkg = ["ezond", "pvoed", "OKT"]
            for substr in substrkg:
                if substr in row['titel']:
                    dataset.add((thing, VOCAB['providesInformationAbout'],
                                 VOCAB['childDevelopment']))
            substrkg = ["peel", "pel"]
            if substr in row['titel']:
                dataset.add(
                    (thing, VOCAB['providesExercisesFor'], VOCAB['children']))
            #dataset.add((thing, RDF['type'], VOCAB['childDevelopmentCenter']))
        if short[i] == "lhbt":
            substr = ["COC", "Hiv", "Coaching", "Trans", "seksuele identiteit"]
            for subs in substr:
                if subs in row['titel']:
                    dataset.add((thing, VOCAB['providesCoachingAbout'],
                                 VOCAB['lhbtIssues']))
            dataset.add((thing, VOCAB['providesInformationAbout'],
                         VOCAB['lhbtIssues']))
            # Messy dataset...
    with open('outputTTL/' + short[i] + '-rdf.ttl', 'wb') as f:
        dataset.serialize(f, format='turtle')
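The repeated keyword checks in this example could be folded into a small helper; a hypothetical refactor sketch, not part of the original script:

def add_if_title_matches(dataset, subject, keywords, predicate, obj, title):
    """Add (subject, predicate, obj) once if any keyword occurs in title."""
    if any(kw in title for kw in keywords):
        dataset.add((subject, predicate, obj))

# e.g. the childDevelopment block becomes:
# add_if_title_matches(dataset, thing, ["ezond", "pvoed", "OKT"],
#                      VOCAB['providesInformationAbout'],
#                      VOCAB['childDevelopment'], row['titel'])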
Example 11
    def from_csvw(metadata_filepath):

        pmd_metadata = Dataset()

        with open(metadata_filepath) as file:
            json_string = file.read()
            g = Graph().parse(data=json_string, format='json-ld')

        # Get all datacubes.
        datacubes = g.query("""
            PREFIX dcat: <http://www.w3.org/ns/dcat#>
            SELECT *
            WHERE {
                ?dataset a dcat:Dataset .
            }
            """)

        for datacube in datacubes:
            # Try to find a sensible id for each dcat:Dataset specified in the
            # metadata file to derive additional URIs for PMD resources
            datacube_uri = datacube[0]
            datacube_id = urlparse(datacube_uri).path.rsplit('/', 1)[-1]

            # Create sensible URIs for PMD specific resources
            catalog_uri = "http://gss-data.org.uk/catalog/datasets"
            graph_uri = f"http://gss-data.org.uk/graph/{datacube_id}"
            metadata_graph_uri = f"http://gss-data.org.uk/graph/{datacube_id}#metadata"
            catalog_record_uri = f"http://gss-data.org.uk/catalog/{datacube_id}"
            dataset_uri = f"http://gss-data.org.uk/data/{datacube_id}"

            metadata = Graph('IOMemory', URIRef(metadata_graph_uri))
            metadata.bind('dcat', DCAT)
            metadata.bind('dct', DCTERMS)
            metadata.bind('foaf', FOAF)
            metadata.bind('qb', QB)
            metadata.bind('pmdcat', PMDCAT)
            metadata.bind('rdf', RDF)
            metadata.bind('rdfs', RDFS)
            metadata.bind('vcard', VCARD)

            graph = URIRef(graph_uri)
            metadata_graph = URIRef(metadata_graph_uri)
            catalog = URIRef(catalog_uri)
            catalog_record = URIRef(catalog_record_uri)
            dataset = URIRef(dataset_uri)
            datacube = URIRef(datacube_uri)

            triples = [
                # Metadata required by PMD: ------------------------------------
                (catalog, RDF.type, DCAT.Catalog),
                (catalog, DCAT.record, catalog_record),
                (catalog_record, RDF.type, DCAT.CatalogRecord),
                (catalog_record, FOAF.primaryTopic, dataset),
                (catalog_record, PMDCAT.metadataGraph, metadata_graph),
                (dataset, RDF.type, PMDCAT.Dataset),
                (dataset, PMDCAT.datasetContents, datacube),
                (dataset, PMDCAT.graph, graph),
                (datacube, RDF.type, PMDCAT.DataCube)
            ]

            # Get metadata attached to a datacube-like object and assign it
            # to the dcat:Dataset catalog entry.
            user_defined_metadata = g.query("""
                PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
                PREFIX csvw: <http://www.w3.org/ns/csvw#>
                PREFIX qb: <http://purl.org/linked-data/cube#>
                PREFIX vcard: <http://www.w3.org/2006/vcard/ns#>
                SELECT ?dataset ?p ?o
                WHERE {
                    {
                        ?datacube ?p ?o .
                        FILTER (?p NOT IN (
                            rdf:type, qb:structure, csvw:tableSchema, csvw:url
                        )) .
                    }
                }
                """,
                                            initBindings={
                                                "dataset": dataset,
                                                "datacube": datacube
                                            })

            contact_metadata = g.query("""
                PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
                PREFIX vcard: <http://www.w3.org/2006/vcard/ns#>
                SELECT ?contact ?p ?o
                WHERE {
                    {
                        ?datacube ?p0 ?contact .
                        ?contact a vcard:Individual .
                        ?contact ?p ?o .
                    }
                }
                """,
                                       initBindings={"datacube": datacube})

            triples.extend(list(user_defined_metadata))
            triples.extend(list(contact_metadata))

            for triple in triples:
                if triple[2] is not None:
                    metadata.add(triple)

            pmd_metadata.add_graph(metadata)

        pmd_metadata.serialize(metadata_filepath.replace(
            ".csv-metadata.json", ".trig"),
                               format="trig")
Example 12
def createNanopubs(g):
	ds = Dataset()
	ds.namespace_manager.bind("ddi","http://dbmi-icode-01.dbmi.pitt.edu/mp/")
	ds.namespace_manager.bind("np", "http://www.nanopub.org/nschema#")
	ds.namespace_manager.bind("rdf", "http://www.w3.org/1999/02/22-rdf-syntax-ns#")
	ds.namespace_manager.bind("rdfs", "http://www.w3.org/2000/01/rdf-schema#")
	ds.namespace_manager.bind("owl", "http://www.w3.org/2002/07/owl#")
	ds.namespace_manager.bind("obo", "http://purl.obolibrary.org/obo/")
	ds.namespace_manager.bind("oboInOwl", "http://www.geneontology.org/formats/oboInOwl#")
	ds.namespace_manager.bind("xsd", "http://www.w3.org/2001/XMLSchema#")
	ds.namespace_manager.bind("dc", "http://purl.org/dc/elements/1.1/")
	ds.namespace_manager.bind("mp", "http://purl.org/mp/")
	ds.namespace_manager.bind("prov", "http://www.w3.org/ns/prov#")
	ds.namespace_manager.bind("dikbEvidence", "http://dbmi-icode-01.dbmi.pitt.edu/dikb-evidence/DIKB_evidence_ontology_v1.3.owl#")
	
	bindings = g.query(interactSelect)
	for b in bindings:
	
		# str() the rdflib terms (bytes.decode is a Python 2 leftover) and apply
		# %-formatting to the string *before* wrapping it in URIRef;
		# URIRef('...%s') % identifier yields a plain str, not a URIRef
		asIndex = str(b['a']).rfind('-')
		identifier = str(b['a'])[asIndex:]
		predicateType = str(b['t'])

		npURI = URIRef('http://dbmi-icode-01.dbmi.pitt.edu/mp/ddi-spl-annotation-nanopub%s' % identifier)
		headURI = URIRef('http://dbmi-icode-01.dbmi.pitt.edu/mp/ddi-spl-annotation-np-head%s' % identifier)
		pubInfoURI = URIRef('http://dbmi-icode-01.dbmi.pitt.edu/mp/ddi-spl-annotation-np-pubInfo%s' % identifier)
		provURI = URIRef('http://dbmi-icode-01.dbmi.pitt.edu/mp/ddi-spl-annotation-np-provenance%s' % identifier)
		aURI = URIRef('http://dbmi-icode-01.dbmi.pitt.edu/mp/ddi-spl-annotation-np-assertion%s' % identifier)

		ds.add(( aURI, RDF.type, np.assertion))
		
		head = ds.add_graph(headURI)
		head.add((npURI, RDF.type, np['Nanopublication']))
		head.add((provURI, RDF.type, np['Provenance']))
		head.add((pubInfoURI, RDF.type, np['PublicationInfo']))
		head.add((npURI, np['hasAssertion'], aURI))
		head.add((npURI, np['hasProvenance'], provURI))
		head.add((npURI, np['hasPublicationInfo'], pubInfoURI))

		pub = ds.add_graph(pubInfoURI)
		pub.add((npURI, prov.wasAttributedTo, URIRef('http://orcid.org/0000-0002-2993-2085')))
		pub.add((npURI, prov.generatedAtTime, Literal(datetime.now()) ))
		
		if(predicateType == "http://purl.obolibrary.org/obo/DIDEO_00000000"):

			provenance = ds.add_graph(provURI)
			provenance.add(( aURI, prov.wasAttributedTo, URIRef('http://orcid.org/0000-0002-2993-2085')))
			provenance.add(( aURI, prov.generatedAtTime, Literal(datetime.now()) ))
			provenance.add(( aURI, prov.wasDerivedFrom, Literal("Derived from the DIKB's evidence base using the listed belief criteria")))
			provenance.add(( aURI, prov.hadMember, dikbEvidence.EV_PK_DDI_RCT ))
			provenance.add(( aURI, prov.hadMember, dikbEvidence.EV_PK_DDI_NR ))
			provenance.add(( aURI, prov.hadMember, dikbEvidence.EV_PK_DDI_Par_Grps ))						 
					
		elif(predicateType == "http://purl.obolibrary.org/obo/DIDEO_00000096"):

			provenance = ds.add_graph(provURI)
			provenance.add(( aURI, prov.wasAttributedTo, URIRef('http://orcid.org/0000-0002-2993-2085')))
			provenance.add(( aURI, prov.generatedAtTime, Literal(datetime.now()) ))
			provenance.add(( aURI, prov.wasDerivedFrom, Literal("Derived from the DIKB's evidence base using the listed belief criteria")))
			provenance.add(( aURI, prov.hadMember, dikbEvidence.EV_PK_DDI_RCT ))
			provenance.add(( aURI, prov.hadMember, dikbEvidence.EV_PK_DDI_NR ))
			provenance.add(( aURI, prov.hadMember, dikbEvidence.EV_PK_DDI_Par_Grps )) 
			provenance.add(( aURI, prov.hadMember, dikbEvidence.EV_CT_PK_Genotype ))
			provenance.add(( aURI, prov.hadMember, dikbEvidence.EV_CT_PK_Phenotype )) 
					
		elif(predicateType == "http://purl.obolibrary.org/obo/RO_0002449"):

			provenance = ds.add_graph(provURI)
			provenance.add(( aURI, prov.wasAttributedTo, URIRef('http://orcid.org/0000-0002-2993-2085')))
			provenance.add(( aURI, prov.generatedAtTime, Literal(datetime.now()) ))
			provenance.add(( aURI, prov.wasDerivedFrom, Literal("Derived from the DIKB's evidence base using the listed belief criteria")))
			provenance.add(( aURI, prov.hadMember, dikbEvidence.EV_PK_DDI_RCT ))
			provenance.add(( aURI, prov.hadMember, dikbEvidence.EV_PK_DDI_NR ))
			provenance.add(( aURI, prov.hadMember, dikbEvidence.EV_PK_DDI_Par_Grps )) 
						
	print(ds.serialize(format='trig'))
Example 13
                # print(ref)
                ref_tuple = ref.split(':')
                suppref_text = ref_tuple[0]
                suppref_doi = ref_tuple[1]
                suppref_subj = FOODHKG_INST[get_hash(suppref_text)]
                dataset = createSupportingRefTriples(dataset, supp_subj,
                                                     suppref_subj, suppref_doi,
                                                     ref)

    return dataset


if __name__ == '__main__':
    df = pd.read_excel('data/food-claims-kg.xlsx', sheet_name='13. Authorised')

    dataset = Dataset()
    for index, row in df.iterrows():
        print('-', row['Supporting Evidence Reference 1'], '-', row['Status'],
              ':', row['Health relationship'])
        if row['Status'] == 'Finished':
            dataset = turn_into_mp(row, dataset)

    df = pd.read_excel('data/food-claims-kg.xlsx', sheet_name='14. Authorised')

    for index, row in df.iterrows():
        if row['Finished?'] == 'Finished':
            dataset = turn_into_mp(row, dataset)
    add_umls_mappings()

    dataset.serialize('data/output/food_health_kg.ttl', format='turtle')
Example 14
class BurstConverter(object):
    """The actual converter, that processes the chunk of lines from the CSV file, and uses the instructions from the ``schema`` graph to produce RDF."""

    def __init__(self, identifier, columns, schema, metadata_graph, encoding, output_format):
        self.ds = Dataset()
        # self.ds = apply_default_namespaces(Dataset())
        self.g = self.ds.graph(URIRef(identifier))

        self.columns = columns
        self.schema = schema
        self.metadata_graph = metadata_graph
        self.encoding = encoding
        self.output_format = output_format

        self.templates = {}

        self.aboutURLSchema = self.schema.csvw_aboutUrl

    def equal_to_null(self, nulls, row):
        """Determines whether a value in a cell matches a 'null' value as specified in the CSVW schema)"""
        for n in nulls:
            n = Item(self.metadata_graph, n)
            col = str(n.csvw_name)
            val = str(n.csvw_null)
            if row[col] == val:
                logger.debug("Value of column {} ('{}') is equal to specified 'null' value: '{}'".format(col, unicode(row[col]).encode('utf-8'), val))
                # There is a match with null value
                return True
        # There is no match with null value
        return False

    def process(self, count, rows, chunksize):
        """Process the rows fed to the converter. Count and chunksize are used to determine the
        current row number (needed for default observation identifiers)"""
        obs_count = count * chunksize

        # logger.info("Row: {}".format(obs_count)) #removed for readability

        # We iterate row by row, and then column by column, as given by the CSVW mapping file.
        mult_proc_counter = 0
        iter_error_counter= 0
        for row in rows:
            # This fixes issue:10
            if row is None:
                mult_proc_counter += 1
                # logger.debug( #removed for readability
                #     "Skipping empty row caused by multiprocessing (multiple of chunksize exceeds number of rows in file)...")
                continue

            # set the '_row' value in case we need to generate 'default' URIs for each observation
            # logger.debug("row: {}".format(obs_count)) #removed for readability
            row[u'_row'] = obs_count
            count += 1

            # The self.columns dictionary gives the mapping definition per column in the 'columns'
            # array of the CSVW tableSchema definition.
            for c in self.columns:

                c = Item(self.metadata_graph, c)
                # default about URL
                s = self.expandURL(self.aboutURLSchema, row)

                try:
                    # Can also be used to prevent the triggering of virtual
                    # columns!

                    # Get the raw value from the cell in the CSV file
                    value = row[unicode(c.csvw_name)]
                    # This checks whether we should continue parsing this cell, or skip it.
                    if self.isValueNull(value, c):
                        continue

                    # If the null values are specified in an array, we need to parse it as a collection (list)
                    elif isinstance(c.csvw_null, Item):
                        nulls = Collection(self.metadata_graph, BNode(c.csvw_null))

                        if self.equal_to_null(nulls, row):
                            # Continue to next column specification in this row, if the value is equal to (one of) the null values.
                            continue
                except:
                    # No column name specified (virtual) because there clearly was no c.csvw_name key in the row.
                    # logger.debug(traceback.format_exc()) #removed for readability
                    iter_error_counter +=1
                    if isinstance(c.csvw_null, Item):
                        nulls = Collection(self.metadata_graph, BNode(c.csvw_null))
                        if self.equal_to_null(nulls, row):
                            # Continue to next column specification in this row, if the value is equal to (one of) the null values.
                            continue

                try:
                    # This overrides the subject resource 's' that has been created earlier based on the
                    # schema wide aboutURLSchema specification.
                    if unicode(c.csvw_virtual) == u'true' and c.csvw_aboutUrl is not None:
                        s = self.expandURL(c.csvw_aboutUrl, row)

                    if c.csvw_valueUrl is not None:
                        # This is an object property, because the value needs to be cast to a URL
                        p = self.expandURL(c.csvw_propertyUrl, row)
                        o = self.expandURL(c.csvw_valueUrl, row)
                        if self.isValueNull(os.path.basename(unicode(o)), c):
                            logger.debug("skipping empty value")
                            continue

                        if unicode(c.csvw_virtual) == u'true' and c.csvw_datatype is not None and URIRef(c.csvw_datatype) == XSD.anyURI:
                            # Special case: this is a virtual column with object values that are URIs
                            # For now using a test special property
                            value = row[unicode(c.csvw_name)].encode('utf-8')
                            o = URIRef(iribaker.to_iri(value))

                        if unicode(c.csvw_virtual) == u'true' and c.csvw_datatype is not None and URIRef(c.csvw_datatype) == XSD.linkURI:
                            about_url = str(c.csvw_aboutUrl)
                            about_url = about_url[about_url.find("{"):about_url.find("}")+1]
                            s = self.expandURL(about_url, row)
                            # logger.debug("s: {}".format(s))
                            value_url = str(c.csvw_valueUrl)
                            value_url = value_url[value_url.find("{"):value_url.find("}")+1]
                            o = self.expandURL(value_url, row)
                            # logger.debug("o: {}".format(o))

                        # For coded properties, the collectionUrl can be used to indicate that the
                        # value URL is a concept and a member of a SKOS Collection with that URL.
                        if c.csvw_collectionUrl is not None:
                            collection = self.expandURL(c.csvw_collectionUrl, row)
                            self.g.add((collection, RDF.type, SKOS['Collection']))
                            self.g.add((o, RDF.type, SKOS['Concept']))
                            self.g.add((collection, SKOS['member'], o))

                        # For coded properties, the schemeUrl can be used to indicate that the
                        # value URL is a concept and a member of a SKOS Scheme with that URL.
                        if c.csvw_schemeUrl is not None:
                            scheme = self.expandURL(c.csvw_schemeUrl, row)
                            self.g.add((scheme, RDF.type, SKOS['Scheme']))
                            self.g.add((o, RDF.type, SKOS['Concept']))
                            self.g.add((o, SKOS['inScheme'], scheme))
                    else:
                        # This is a datatype property
                        if c.csvw_value is not None:
                            value = self.render_pattern(unicode(c.csvw_value), row)
                        elif c.csvw_name is not None:
                            # print s
                            # print c.csvw_name, self.encoding
                            # print row[unicode(c.csvw_name)], type(row[unicode(c.csvw_name)])
                            # print row[unicode(c.csvw_name)].encode('utf-8')
                            # print '...'
                            value = row[unicode(c.csvw_name)].encode('utf-8')
                        else:
                            raise Exception("No 'name' or 'csvw:value' attribute found for this column specification")

                        # If propertyUrl is specified, use it, otherwise use
                        # the column name
                        if c.csvw_propertyUrl is not None:
                            p = self.expandURL(c.csvw_propertyUrl, row)
                        else:
                            if "" in self.metadata_graph.namespaces():
                                propertyUrl = self.metadata_graph.namespaces()[""][
                                    unicode(c.csvw_name)]
                            else:
                                propertyUrl = "{}{}".format(get_namespaces()['sdv'],
                                    unicode(c.csvw_name))

                            p = self.expandURL(propertyUrl, row)

                        if c.csvw_datatype is not None:
                            if URIRef(c.csvw_datatype) == XSD.anyURI:
                                # The xsd:anyURI datatype will be cast to a proper IRI resource.
                                o = URIRef(iribaker.to_iri(value))
                            elif URIRef(c.csvw_datatype) == XSD.string and c.csvw_lang is not None:
                                # If it is a string datatype that has a language, we turn it into a
                                # language tagged literal
                                # We also render the lang value in case it is a
                                # pattern.
                                o = Literal(value, lang=self.render_pattern(
                                    c.csvw_lang, row))
                            else:
                                o = Literal(value, datatype=c.csvw_datatype, normalize=False)
                        else:
                            # It's just a plain literal without datatype.
                            o = Literal(value)

                    # Add the triple to the assertion graph
                    self.g.add((s, p, o))

                    # Add provenance relating the propertyUrl to the column id
                    if '@id' in c:
                        self.g.add((p, PROV['wasDerivedFrom'], URIRef(c['@id'])))

                except:
                    # print row[0], value
                    traceback.print_exc()

            # We increment the observation (row number) with one
            obs_count += 1

        logger.debug(
            "{} row skips caused by multiprocessing (multiple of chunksize exceeds number of rows in file)...".format(mult_proc_counter))
        logger.debug(
            "{} errors encountered while trying to iterate over a NoneType...".format(iter_error_counter))
        logger.info("... done")
        return self.ds.serialize(format=self.output_format)

    # def serialize(self):
    #     trig_file_name = self.file_name + '.trig'
    #     logger.info("Starting serialization to {}".format(trig_file_name))
    #
    #     with open(trig_file_name, 'w') as f:
    #         self.np.serialize(f, format='trig')
    #     logger.info("... done")

    def render_pattern(self, pattern, row):
        """Takes a Jinja or Python formatted string, and applies it to the row value"""
        # Significant speedup by not re-instantiating Jinja templates for every
        # row.
        if pattern in self.templates:
            template = self.templates[pattern]
        else:
            template = self.templates[pattern] = Template(pattern)

        # TODO This should take into account the special CSVW instructions such as {_row}
        # First we interpret the url_pattern as a Jinja2 template, and pass all
        # column/value pairs as arguments
        rendered_template = template.render(**row)

        try:
            # We then format the resulting string using the standard Python2
            # expressions
            return rendered_template.format(**row)
        except:
            logger.warning(
                u"Could not apply python string formatting, probably due to mismatched curly brackets. IRI will be '{}'. ".format(rendered_template))
            return rendered_template

    def expandURL(self, url_pattern, row, datatype=False):
        """Takes a Jinja or Python formatted string, applies it to the row values, and returns it as a URIRef"""
        url = self.render_pattern(unicode(url_pattern), row)

        # DEPRECATED
        # for ns, nsuri in namespaces.items():
        #     if url.startswith(ns):
        #         url = url.replace(ns + ':', nsuri)
        #         break

        try:
            iri = iribaker.to_iri(url)
            rfc3987.parse(iri, rule='IRI')
        except:
            raise Exception(u"Cannot convert `{}` to valid IRI".format(url))

        # print "Baked: ", iri
        return URIRef(iri)

    def isValueNull(self, value, c):
        """This checks whether we should continue parsing this cell, or skip it because it is empty or a null value."""
        try:
            if len(value) == 0 and unicode(c.csvw_parseOnEmpty) == u"true":
                print("Not skipping empty value")
                return False #because it should not be skipped
            elif len(value) == 0 or value == unicode(c.csvw_null) or value in [unicode(n) for n in c.csvw_null] or value == unicode(self.schema.csvw_null):
                # Skip value if length is zero and equal to (one of) the null value(s)
                logger.debug(
                    "Length is 0 or value is equal to specified 'null' value")
                return True
        except:
            logger.debug("null does not exist or is not a list.")
        return False
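`render_pattern` above layers two template passes: Jinja2 first (`{{col}}` placeholders), then Python `str.format` (`{col}` placeholders), which is why mismatched curly brackets trigger the warning. The pipeline in isolation, with an illustrative pattern:

from jinja2 import Template

row = {"name": "alice", "id": "42"}
pattern = "http://example.com/{{name}}/{id}"

rendered = Template(pattern).render(**row)  # Jinja pass -> '.../alice/{id}'
print(rendered.format(**row))               # format pass -> '.../alice/42'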
Example 15
def main(search=None, cache=None, identifiers=[]):

    ns = Namespace("https://data.create.humanities.uva.nl/id/rkd/")

    ds = Dataset()
    ds.bind('rdfs', RDFS)
    ds.bind('schema', schema)
    ds.bind('sem', sem)
    ds.bind('bio', bio)
    ds.bind('foaf', foaf)
    ds.bind('void', void)
    ds.bind('skos', SKOS)
    ds.bind('owl', OWL)
    ds.bind('dc', dc)

    ds.bind('rkdArtist', URIRef("https://data.rkd.nl/artists/"))
    ds.bind('rkdThes', nsThesaurus)
    ds.bind('rkdPerson', nsPerson)
    ds.bind('rkdImage', URIRef("https://rkd.nl/explore/images/"))
    ds.bind('rkdThumb', URIRef("https://images.rkd.nl/rkd/thumb/650x650/"))

    ds.bind('aat', URIRef("http://vocab.getty.edu/aat/"))

    ## First the images

    g = rdfSubject.db = ds.graph(identifier=ns)

    # Load cache thesaurus
    if os.path.isfile('rkdthesaurus.json'):
        with open('rkdthesaurus.json') as infile:
            thesaurusDict = json.load(infile)
    else:
        thesaurusDict = dict()

    # Load cache images
    if os.path.isfile('imagecache.json'):
        with open('imagecache.json') as infile:
            imageCache = json.load(infile)
    else:
        imageCache = dict()

    # to fetch all identifiers from the search
    if search:
        thesaurusDict, imageCache = parseURL(search,
                                             thesaurusDict=thesaurusDict,
                                             imageCache=imageCache)
    elif cache:
        # assume that everything in the thesaurus is also cached
        for doc in cache.values():
            parseData(doc, thesaurusDict=thesaurusDict)
    elif identifiers:
        for i in identifiers:
            thesaurusDict, imageCache = parseURL(APIURL + str(i),
                                                 thesaurusDict=thesaurusDict,
                                                 imageCache=imageCache)

    # Any images without labels?
    # These were not included in the search, but fetch them anyway.
    print("Finding referred images that were not included")
    q = """
    PREFIX schema: <http://schema.org/>
    SELECT ?uri WHERE {
        ?role a schema:Role ; schema:isRelatedTo ?uri .

        FILTER NOT EXISTS { ?uri schema:name ?name }
    }
    """
    images = g.query(q)
    print(f"Found {len(images)}!")
    for i in images:
        identifier = str(i['uri']).replace('https://rkd.nl/explore/images/',
                                           '')
        thesaurusDict, imageCache = parseURL(
            "https://api.rkd.nl/api/record/images/" + str(identifier),
            thesaurusDict=thesaurusDict,
            imageCache=imageCache)

    ## Then the thesaurus
    print("Converting the thesaurus")
    rdfSubject.db = ds.graph(identifier=ns.term('thesaurus/'))

    ids = list(thesaurusDict.keys())
    for i in ids:
        _, thesaurusDict = getThesaurus(i, thesaurusDict, 'concept')

    # Save updated cache
    with open('rkdthesaurus.json', 'w') as outfile:
        json.dump(thesaurusDict, outfile)

    with open('imagecache.json', 'w') as outfile:
        json.dump(imageCache, outfile)

    ## Serialize
    print("Serializing!")
    ds.serialize('rkdportraits14751825.trig', format='trig')
Example 16
def createNanopubs(g):
	ds = Dataset()
	ds.namespace_manager.bind("ddi","http://dbmi-icode-01.dbmi.pitt.edu/mp/")
	ds.namespace_manager.bind("np", "http://www.nanopub.org/nschema#")
	ds.namespace_manager.bind("rdf", "http://www.w3.org/1999/02/22-rdf-syntax-ns#")
	ds.namespace_manager.bind("rdfs", "http://www.w3.org/2000/01/rdf-schema#")
	ds.namespace_manager.bind("owl", "http://www.w3.org/2002/07/owl#")
	ds.namespace_manager.bind("obo", "http://purl.obolibrary.org/obo/")
	ds.namespace_manager.bind("oboInOwl", "http://www.geneontology.org/formats/oboInOwl#")
	ds.namespace_manager.bind("xsd", "http://www.w3.org/2001/XMLSchema#")
	ds.namespace_manager.bind("dc", "http://purl.org/dc/elements/1.1/")
	ds.namespace_manager.bind("mp", "http://purl.org/mp/")

	assertionCount = 1
	enzymeCount = 1

	pddiD = dict([line.split(',',1) for line in open('../../data/np-graphs/processed-dikb-ddis-for-nanopub.csv')])
	cL = dict([line.split('\t') for line in open('../../data/chebi_mapping.txt')])
	pL = dict([line.split('\t') for line in open('../../data/pro_mapping.txt')])
	substrateD = {}
	inhibitorD = {}
			
	bindings = g.query(interactSelect)
	for b in bindings:

		if str(b['c']) in pddiD:  # dict.has_key() was removed in Python 3
			tempClaim = pddiD[str(b['c'])]
			claimInfo = tempClaim.split(',')
			claimSub = claimInfo[1]
			claimObj = claimInfo[2]
			predicateType = claimInfo[0].strip('\n')
				
			if(predicateType == "increases_auc"):

				# format first, then wrap: URIRef("...%s") % n returns a plain str
				aURI = URIRef("http://dbmi-icode-01.dbmi.pitt.edu/mp/ddi-spl-annotation-np-assertion-%s" % assertionCount)
				assertionCount += 1
			
				bn1 = BNode('1')
				bn2 = BNode('2')
				bn3 = BNode('3')
				bn4 = BNode('4')
				bn5 = BNode('5')
				bn6 = BNode('6')
				bn7 = BNode('7')
				bn8 = BNode('8')
				bn9 = BNode('9')
				bn10 = BNode('10')

				assertionLabel = cL[claimSub.strip('\n')].strip('\n') + " - " + cL[claimObj.strip('\n')].strip('\n') + " potential drug-drug interaction"

				a = ds.add_graph((aURI))
				a.add(( aURI, RDF.type, np.assertion))
				a.add(( aURI, RDF.type, owl.Class))
				a.add(( aURI, RDFS.label, (Literal(assertionLabel.lower()))))	 
				a.add(( aURI, RDFS.subClassOf, URIRef("http://purl.obolibrary.org/obo/DIDEO_00000000")))
				a.add(( bn1, RDF.type, owl.Restriction))
				a.add(( bn1, owl.onProperty, URIRef("http://purl.obolibrary.org/obo/IAO_0000136")))
				a.add(( bn2, RDF.type, owl.Class))
				a.add(( bn3, RDF.first, URIRef("http://purl.obolibrary.org/obo/DIDEO_00000012")))
				a.add(( bn5, RDF.first, bn4))
				a.add(( bn3, RDF.rest, bn5))
				a.add(( bn4, RDF.type, owl.Restriction))
				a.add(( bn4, owl.onProperty, URIRef("http://purl.obolibrary.org/obo/BFO_0000052")))
				a.add(( bn4, owl.hasValue, URIRef(claimSub.strip('\n'))))
				a.add(( bn5, RDF.rest, RDF.nil))
				a.add(( bn2, owl.intersectionOf, bn3))
				a.add(( bn1, owl.someValuesFrom, bn2))
				a.add(( aURI, RDFS.subClassOf, bn1))
				a.add(( bn6, RDF.type, owl.Restriction))
				a.add(( bn6, owl.onProperty, URIRef("http://purl.obolibrary.org/obo/IAO_0000136")))
				a.add(( bn7, RDF.type, owl.Class))
				a.add(( bn8, RDF.first, URIRef("http://purl.obolibrary.org/obo/DIDEO_00000013")))
				a.add(( bn10, RDF.first, bn9))
				a.add(( bn8, RDF.rest, bn10))
				a.add(( bn9, RDF.type, owl.Restriction))
				a.add(( bn9, owl.onProperty, URIRef("http://purl.obolibrary.org/obo/BFO_0000052")))
				a.add(( bn9, owl.hasValue, URIRef(claimObj.strip('\n'))))
				a.add(( bn10, RDF.rest, RDF.nil))
				a.add(( bn7, owl.intersectionOf, bn8))
				a.add(( bn6, owl.someValuesFrom, bn7))
				a.add(( aURI, RDFS.subClassOf, bn6))

				ds.add(( aURI, mp.formalizes, b['c']))
				ds.add(( b['c'], mp.formalizedAs, aURI))
				
			elif(predicateType == "substrate_of"):
						
				aURI = URIRef("http://dbmi-icode-01.dbmi.pitt.edu/mp/ddi-spl-annotation-np-assertion-%s" % assertionCount)
				assertionCount += 1
				
				dLabel = cL[claimSub.strip('\n')].strip('\n')
				eLabel = pL[claimObj.strip('\n')].strip('\n')
				assertionLabel = dLabel + " substrate of " + eLabel

				a = ds.add_graph((aURI))
				ds.add(( aURI, RDF.type, np.assertion))
				ds.add(( aURI, RDFS.label, Literal(assertionLabel.lower())))				   
				ds.add(( aURI, mp.formalizes, b['c']))
				ds.add(( b['c'], mp.formalizedAs, aURI))
				
				a.add(( URIRef(claimObj.strip('\n')), RDF.type, URIRef("http://purl.obolibrary.org/obo/OBI_0000427")))
				a.add(( URIRef(claimObj.strip('\n')), RDFS.label, Literal(eLabel.lower())))
				a.add(( URIRef(claimObj.strip('\n')), URIRef("http://purl.obolibrary.org/obo/DIDEO_00000096"), URIRef(claimSub.strip('\n'))))

				a.add(( URIRef(claimSub.strip('\n')), RDF.type, URIRef("http://purl.obolibrary.org/obo/CHEBI_24431")))
				a.add(( URIRef(claimSub.strip('\n')), RDFS.label, Literal(dLabel.lower())))
				
			elif(predicateType == "inhibits"):

				aURI = URIRef("http://dbmi-icode-01.dbmi.pitt.edu/mp/ddi-spl-annotation-np-assertion-%s" % assertionCount)
				assertionCount += 1
				
				dLabel = cL[claimSub.strip('\n')].strip('\n')
				eLabel = pL[claimObj.strip('\n')].strip('\n')
				assertionLabel = dLabel + " inhibits " + eLabel
				
				a = ds.add_graph((aURI))
				ds.add(( aURI, RDF.type, np.assertion))
				ds.add(( aURI, RDFS.label, Literal(assertionLabel.lower())))
				ds.add(( aURI, mp.formalizes, b['c']))
				ds.add(( b['c'], mp.formalizedAs, aURI))
				
				a.add(( URIRef(claimSub.strip('\n')), RDF.type, URIRef("http://purl.obolibrary.org/obo/CHEBI_24431")))
				a.add(( URIRef(claimSub.strip('\n')), RDFS.label, Literal(dLabel.lower())))
				a.add(( URIRef(claimSub.strip('\n')), URIRef("http://purl.obolibrary.org/obo/RO_0002449"), URIRef(claimObj.strip('\n'))))

	print(ds.serialize(format='trig'))
Example 17
class BurstConverter(object):
    """The actual converter, that processes the chunk of lines from the CSV file, and uses the instructions from the ``schema`` graph to produce RDF."""

    def __init__(self, identifier, columns, schema, metadata_graph, encoding, output_format):
        self.ds = Dataset()
        # self.ds = apply_default_namespaces(Dataset())
        self.g = self.ds.graph(URIRef(identifier))

        self.columns = columns
        self.schema = schema
        self.metadata_graph = metadata_graph
        self.encoding = encoding
        self.output_format = output_format

        self.templates = {}

        self.aboutURLSchema = self.schema.csvw_aboutUrl

    def equal_to_null(self, nulls, row):
        """Determines whether a value in a cell matches a 'null' value as specified in the CSVW schema)"""
        for n in nulls:
            n = Item(self.metadata_graph, n)
            col = str(n.csvw_name)
            val = str(n.csvw_null)
            if row[col] == val:
                # logger.debug("Value of column {} ('{}') is equal to specified 'null' value: '{}'".format(col, unicode(row[col]).encode('utf-8'), val))
                # There is a match with null value
                return True
        # There is no match with null value
        return False

    def process(self, count, rows, chunksize):
        """Process the rows fed to the converter. Count and chunksize are used to determine the
        current row number (needed for default observation identifiers)"""
        obs_count = count * chunksize

        # logger.info("Row: {}".format(obs_count)) #removed for readability

        # We iterate row by row, and then column by column, as given by the CSVW mapping file.
        mult_proc_counter = 0
        iter_error_counter= 0
        for row in rows:
            # This fixes issue:10
            if row is None:
                mult_proc_counter += 1
                # logger.debug( #removed for readability
                #     "Skipping empty row caused by multiprocessing (multiple of chunksize exceeds number of rows in file)...")
                continue

            # set the '_row' value in case we need to generate 'default' URIs for each observation
            # logger.debug("row: {}".format(obs_count)) #removed for readability
            row[u'_row'] = obs_count
            count += 1

            # print(row)

            # The self.columns dictionary gives the mapping definition per column in the 'columns'
            # array of the CSVW tableSchema definition.

            for c in self.columns:
                c = Item(self.metadata_graph, c)
                # default about URL
                s = self.expandURL(self.aboutURLSchema, row)

                try:
                    # Can also be used to prevent the triggering of virtual
                    # columns!

                    # Get the raw value from the cell in the CSV file
                    try:
                        # Python 2
                        value = row[unicode(c.csvw_name)]
                    except NameError:
                        # Python 3
                        value = row[str(c.csvw_name)]

                    # This checks whether we should continue parsing this cell, or skip it.
                    if self.isValueNull(value, c):
                        continue

                    # If the null values are specified in an array, we need to parse it as a collection (list)
                    elif isinstance(c.csvw_null, Item):
                        nulls = Collection(self.metadata_graph, BNode(c.csvw_null))

                        if self.equal_to_null(nulls, row):
                            # Continue to next column specification in this row, if the value is equal to (one of) the null values.
                            continue
                except:
                    # No column name specified (virtual) because there clearly was no c.csvw_name key in the row.
                    # logger.debug(traceback.format_exc()) #removed for readability
                    iter_error_counter +=1
                    if isinstance(c.csvw_null, Item):
                        nulls = Collection(self.metadata_graph, BNode(c.csvw_null))
                        if self.equal_to_null(nulls, row):
                            # Continue to next column specification in this row, if the value is equal to (one of) the null values.
                            continue

                try:
                    # This overrides the subject resource 's' that has been created earlier based on the
                    # schema wide aboutURLSchema specification.

                    try:
                        csvw_virtual = unicode(c.csvw_virtual)
                        csvw_name = unicode(c.csvw_name)
                        csvw_value = unicode(c.csvw_value)
                        about_url = unicode(c.csvw_aboutUrl)
                        value_url = unicode(c.csvw_valueUrl)
                    except NameError:
                        csvw_virtual = str(c.csvw_virtual)
                        csvw_name = str(c.csvw_name)
                        csvw_value = str(c.csvw_value)
                        about_url = str(c.csvw_aboutUrl)
                        value_url = str(c.csvw_valueUrl)

                    if csvw_virtual == u'true' and c.csvw_aboutUrl is not None:
                        s = self.expandURL(c.csvw_aboutUrl, row)

                    if c.csvw_valueUrl is not None:
                        # This is an object property, because the value needs to be cast to a URL
                        p = self.expandURL(c.csvw_propertyUrl, row)
                        o = self.expandURL(c.csvw_valueUrl, row)
                        try:
                            if self.isValueNull(os.path.basename(unicode(o)), c):
                                logger.debug("skipping empty value")
                                continue
                        except NameError:
                            if self.isValueNull(os.path.basename(str(o)), c):
                                logger.debug("skipping empty value")
                                continue

                        if csvw_virtual == u'true' and c.csvw_datatype is not None and URIRef(c.csvw_datatype) == XSD.anyURI:
                            # Special case: this is a virtual column with object values that are URIs
                            # For now using a test special property
                            value = row[csvw_name].encode('utf-8')
                            o = URIRef(iribaker.to_iri(value))

                        if csvw_virtual == u'true' and c.csvw_datatype is not None and URIRef(c.csvw_datatype) == XSD.linkURI:
                            about_url = about_url[about_url.find("{"):about_url.find("}")+1]
                            s = self.expandURL(about_url, row)
                            # logger.debug("s: {}".format(s))
                            value_url = value_url[value_url.find("{"):value_url.find("}")+1]
                            o = self.expandURL(value_url, row)
                            # logger.debug("o: {}".format(o))

                        # For coded properties, the collectionUrl can be used to indicate that the
                        # value URL is a concept and a member of a SKOS Collection with that URL.
                        if c.csvw_collectionUrl is not None:
                            collection = self.expandURL(c.csvw_collectionUrl, row)
                            self.g.add((collection, RDF.type, SKOS['Collection']))
                            self.g.add((o, RDF.type, SKOS['Concept']))
                            self.g.add((collection, SKOS['member'], o))

                        # For coded properties, the schemeUrl can be used to indicate that the
                        # value URL is a concept and a member of a SKOS Scheme with that URL.
                        if c.csvw_schemeUrl is not None:
                            scheme = self.expandURL(c.csvw_schemeUrl, row)
                            self.g.add((scheme, RDF.type, SKOS['Scheme']))
                            self.g.add((o, RDF.type, SKOS['Concept']))
                            self.g.add((o, SKOS['inScheme'], scheme))
                    else:
                        # This is a datatype property
                        if c.csvw_value is not None:
                            value = self.render_pattern(csvw_value, row)
                        elif c.csvw_name is not None:
                            value = row[csvw_name].encode('utf-8')
                        else:
                            raise Exception("No 'name' or 'csvw:value' attribute found for this column specification")

                        # If propertyUrl is specified, use it, otherwise use
                        # the column name
                        if c.csvw_propertyUrl is not None:
                            p = self.expandURL(c.csvw_propertyUrl, row)
                        else:
                            if "" in self.metadata_graph.namespaces():
                                propertyUrl = self.metadata_graph.namespaces()[""][
                                    csvw_name]
                            else:
                                propertyUrl = "{}{}".format(get_namespaces()['sdv'],
                                    csvw_name)

                            p = self.expandURL(propertyUrl, row)

                        if c.csvw_datatype is not None:
                            if URIRef(c.csvw_datatype) == XSD.anyURI:
                                # The xsd:anyURI datatype will be cast to a proper IRI resource.
                                o = URIRef(iribaker.to_iri(value))
                            elif URIRef(c.csvw_datatype) == XSD.string and c.csvw_lang is not None:
                                # If it is a string datatype that has a language, we turn it into a
                                # language tagged literal
                                # We also render the lang value in case it is a
                                # pattern.
                                o = Literal(value, lang=self.render_pattern(
                                    c.csvw_lang, row))
                            else:
                                try:
                                    csvw_datatype = unicode(c.csvw_datatype)
                                except NameError:
                                    csvw_datatype = str(c.csvw_datatype).split(')')[0].split('(')[-1]
                                o = Literal(value, datatype=csvw_datatype, normalize=False)
                        else:
                            # It's just a plain literal without datatype.
                            o = Literal(value)


                    # Add the triple to the assertion graph
                    self.g.add((s, p, o))

                    # Add provenance relating the propertyUrl to the column id
                    if '@id' in c:
                        self.g.add((p, PROV['wasDerivedFrom'], URIRef(c['@id'])))

                except Exception:
                    # print row[0], value
                    traceback.print_exc()

            # We increment the observation (row number) by one
            obs_count += 1


        logger.debug(
            "{} row skips caused by multiprocessing (multiple of chunksize exceeds number of rows in file)...".format(mult_proc_counter))
        logger.debug(
            "{} errors encountered while trying to iterate over a NoneType...".format(mult_proc_counter))
        logger.info("... done")
        return self.ds.serialize(format=self.output_format)

    # def serialize(self):
    #     trig_file_name = self.file_name + '.trig'
    #     logger.info("Starting serialization to {}".format(trig_file_name))
    #
    #     with open(trig_file_name, 'w') as f:
    #         self.np.serialize(f, format='trig')
    #     logger.info("... done")

    def render_pattern(self, pattern, row):
        """Takes a Jinja or Python formatted string, and applies it to the row value"""
        # Significant speedup by not re-instantiating Jinja templates for every
        # row.

        if pattern in self.templates:
            template = self.templates[pattern]
        else:
            template = self.templates[pattern] = Template(pattern)

        # TODO This should take into account the special CSVW instructions such as {_row}
        # First we interpret the pattern as a Jinja2 template, and pass all
        # column/value pairs as arguments
        rendered_template = template.render(**row)

        try:
            # We then format the resulting string using standard Python
            # string formatting
            return rendered_template.format(**row)
        except Exception:
            logger.warning(
                u"Could not apply Python string formatting, probably due to mismatched curly brackets. IRI will be '{}'.".format(rendered_template))
            return rendered_template

    def expandURL(self, url_pattern, row, datatype=False):
        """Takes a Jinja or Python formatted string, applies it to the row values, and returns it as a URIRef"""

        try:
            unicode_url_pattern = unicode(url_pattern)
        except NameError:
            unicode_url_pattern = str(url_pattern).split(')')[0].split('(')[-1]
        # print(unicode_url_pattern)

        url = self.render_pattern(unicode_url_pattern, row)

        # DEPRECATED
        # for ns, nsuri in namespaces.items():
        #     if url.startswith(ns):
        #         url = url.replace(ns + ':', nsuri)
        #         break

        try:
            iri = iribaker.to_iri(url)
            rfc3987.parse(iri, rule='IRI')
        except Exception:
            raise Exception(u"Cannot convert `{}` to valid IRI".format(url))

        # print(iri)
        return URIRef(iri)

    def isValueNull(self, value, c):
        """This checks whether we should continue parsing this cell, or skip it because it is empty or a null value."""
        try:
            if len(value) == 0 and unicode(c.csvw_parseOnEmpty) == u"true":
                # An empty value should *not* be skipped when parseOnEmpty is set
                return False
            elif len(value) == 0 or value == unicode(c.csvw_null) or value in [unicode(n) for n in c.csvw_null] or value == unicode(self.schema.csvw_null):
                # Skip the value if it is empty or equal to (one of) the
                # specified 'null' value(s)
                return True
        except Exception:
            # 'null' is not defined or is not a list; this is the case for
            # every cell in a CSV without a defined null value.
            pass
        return False
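
A minimal, self-contained sketch of the two-stage pattern expansion used by render_pattern and expandURL above: the pattern is first rendered as a Jinja2 template against the row, the result is passed through Python's str.format, and the final string is coerced to a valid IRI. The row keys and base URL here are illustrative assumptions, not taken from the snippet.

from jinja2 import Template
from rdflib import URIRef
import iribaker
import rfc3987

row = {'Country': 'Luxembourg', 'Rank': '2', '_row': 1}
pattern = 'https://example.org/country/{{ Country }}/rank/{Rank}'

rendered = Template(pattern).render(**row)  # Jinja2 pass resolves {{ Country }}
expanded = rendered.format(**row)           # str.format pass resolves {Rank}

iri = iribaker.to_iri(expanded)             # percent-encodes invalid characters
rfc3987.parse(iri, rule='IRI')              # raises ValueError if still invalid
print(URIRef(iri))                          # https://example.org/country/Luxembourg/rank/2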
Example n. 18
def test_scenarios() -> None:
    """
    Testing scenarios:
        1. no base set
        2. base set at graph creation
        3. base set at serialization
        4. base set at both graph creation & serialization, serialization overrides
        5. multiple serialization side effect checking
        6. checking results for RDF/XML
        7. checking results for N3
        8. checking results for TriX & TriG
    """

    # variables
    base_one = Namespace("http://one.org/")
    base_two = Namespace("http://two.org/")
    title = Literal("Title", lang="en")
    description = Literal("Test Description", lang="en")
    creator = URIRef("https://creator.com")
    cs = URIRef("")

    # starting graph
    g = Graph()
    g.add((cs, RDF.type, SKOS.ConceptScheme))
    g.add((cs, DCTERMS.creator, creator))
    g.add((cs, DCTERMS.source, URIRef("nick")))
    g.bind("dct", DCTERMS)
    g.bind("skos", SKOS)

    # 1. no base set for graph, no base set for serialization
    g1 = Graph()
    g1 += g
    # @base should not be in output
    assert "@base" not in g1.serialize(format="turtle")

    # 2. base one set for graph, no base set for serialization
    g2 = Graph(base=base_one)
    g2 += g
    # @base should be in output, from Graph (one)
    assert "@base <http://one.org/> ." in g2.serialize(format="turtle")

    # 3. no base set for graph, base two set for serialization
    g3 = Graph()
    g3 += g
    # @base should be in output, from serialization (two)
    assert "@base <http://two.org/> ." in g3.serialize(format="turtle",
                                                       base=base_two)

    # 4. base one set for graph, base two set for serialization; serialization overrides
    g4 = Graph(base=base_one)
    g4 += g
    # @base should be in output, from serialization (two)
    assert "@base <http://two.org/> ." in g4.serialize(format="turtle",
                                                       base=base_two)
    # just checking that the graph setting (one) hasn't snuck through
    assert "@base <http://one.org/> ." not in g4.serialize(format="turtle",
                                                           base=base_two)

    # 5. multiple serialization side effect checking
    g5 = Graph()
    g5 += g
    # @base should be in output, from serialization (two)
    assert "@base <http://two.org/> ." in g5.serialize(format="turtle",
                                                       base=base_two)

    # checking for side effects - no base now set for this serialization
    # @base should not be in output
    assert "@base" not in g5.serialize(format="turtle")

    # 6. checking results for RDF/XML
    g6 = Graph()
    g6 += g
    g6.bind("dct", DCTERMS)
    g6.bind("skos", SKOS)
    assert "@xml:base" not in g6.serialize(format="xml")
    assert 'xml:base="http://one.org/"' in g6.serialize(format="xml",
                                                        base=base_one)
    g6.base = base_two
    assert 'xml:base="http://two.org/"' in g6.serialize(format="xml")
    assert 'xml:base="http://one.org/"' in g6.serialize(format="xml",
                                                        base=base_one)

    # 7. checking results for N3
    g7 = Graph()
    g7 += g
    g7.bind("dct", DCTERMS)
    g7.bind("skos", SKOS)
    assert "@xml:base" not in g7.serialize(format="xml")
    assert "@base <http://one.org/> ." in g7.serialize(format="n3",
                                                       base=base_one)
    g7.base = base_two
    assert "@base <http://two.org/> ." in g7.serialize(format="n3")
    assert "@base <http://one.org/> ." in g7.serialize(format="n3",
                                                       base=base_one)

    # 8. checking results for TriX & TriG
    # TriX can specify a base per graph, but a base can also be set for the
    # whole document via the serialization call; TriG takes a single base
    base_three = Namespace("http://three.org/")
    ds1 = Dataset()
    ds1.bind("dct", DCTERMS)
    ds1.bind("skos", SKOS)
    g8 = ds1.graph(URIRef("http://g8.com/"), base=base_one)
    g9 = ds1.graph(URIRef("http://g9.com/"))
    g8 += g
    g9 += g
    g9.base = base_two
    ds1.base = base_three

    trix = ds1.serialize(format="trix", base=Namespace("http://two.org/"))
    assert '<graph xml:base="http://one.org/">' in trix
    assert '<graph xml:base="http://two.org/">' in trix
    assert '<TriX xml:base="http://two.org/"' in trix

    trig = ds1.serialize(format="trig", base=Namespace("http://two.org/"))
    assert "@base <http://one.org/> ." not in trig
    assert "@base <http://three.org/> ." not in trig
    assert "@base <http://two.org/> ." in trig
Example n. 19
        date=Literal(datetime.datetime.now().isoformat(),
                     datatype=XSD.dateTime),
        created=None,
        issued=None,
        modified=None,
        exampleResource=exampleResource,
        vocabulary=[URIRef("https://schema.org/")],
        triples=sum(1 for i in ds.graph(
            identifier="https://data.create.humanities.uva.nl/id/kohier1674/").
                    subjects()),
        temporalCoverage=Literal("1674", datatype=XSD.gYear, normalize=False),
        licenseprop=URIRef(
            "https://creativecommons.org/licenses/by-nc-sa/4.0/"),
        distribution=download)

    ds.bind('owl', OWL)
    ds.bind('create', create)
    ds.bind('schema', schema)
    ds.bind('void', void)
    ds.bind('foaf', foaf)
    ds.bind('edm', edm)
    ds.bind('pnv', pnv)
    ds.bind('roar', roar)
    ds.bind('dc', dc)
    ds.bind('dcterms', dcterms)
    ds.bind('oa', oa)
    ds.bind('prov', prov)

    print("Serializing!")
    ds.serialize('data/kohier1674.trig', format='trig')
Example n. 20
def visit_sparql(url, format='html', depth=1):
    sparqls = get_sparql_endpoints(url)
    predicates = get_predicates(sparqls, url)

    if format == 'html':
        # Use integer division so the LIMIT rendered into the query is an int
        limit_fraction = QUERY_RESULTS_LIMIT // 3
        if len(predicates) > 1:
            predicate_query_limit_fraction = (
                limit_fraction * 2) // len(predicates)
        else:
            predicate_query_limit_fraction = limit_fraction * 2

        results = []

        def predicate_specific_sparql(sparql, query):
            log.debug(query)

            sparql.setQuery(query)
            res = sparql.query().convert()
            results.extend(
                list(res["results"]["bindings"]))

        threads = []
        local_results = []
        for p in predicates:
            q = u"""SELECT DISTINCT ?s ?p ?o ?g WHERE {{
                {{
                GRAPH ?g {{
                    {{
                        <{url}> <{predicate}> ?o .
                        BIND(<{url}> as ?s)
                        BIND(<{predicate}> as ?p)
                    }} UNION {{
                        ?s <{predicate}> <{url}>.
                        BIND(<{url}> as ?o)
                        BIND(<{predicate}> as ?p)
                    }}
                }}
                }} UNION {{
                    {{
                        <{url}> <{predicate}> ?o .
                        BIND(<{url}> as ?s)
                        BIND(<{predicate}> as ?p)
                    }} UNION {{
                        ?s <{predicate}> <{url}>.
                        BIND(<{url}> as ?o)
                        BIND(<{predicate}> as ?p)
                    }}
                }}
            }} LIMIT {limit}""".format(url=url, predicate=p, limit=predicate_query_limit_fraction)

            for s in sparqls:
                # Start processes for each endpoint, for each predicate query
                process = Thread(target=predicate_specific_sparql, args=[s, q])
                process.start()
                threads.append(process)

        url_is_predicate_query = u"""SELECT DISTINCT ?s ?p ?o ?g WHERE {{
            {{
            GRAPH ?g {{
                ?s <{url}> ?o.
                BIND(<{url}> as ?p)
            }}
            }} UNION {{
                ?s <{url}> ?o.
                BIND(<{url}> as ?p)
            }}
        }} LIMIT {limit}""".format(url=url, limit=limit_fraction)

        for s in sparqls:
            process = Thread(target=predicate_specific_sparql,
                             args=[s, url_is_predicate_query])
            process.start()
            threads.append(process)

        # We now pause execution on the main thread by 'joining' all of our started threads.
        # This ensures that each has finished processing the urls.
        for process in threads:
            process.join()

        if LDF_STATEMENTS_URL is not None:
            retrieve_ldf_results(url)

        # We also add local results (result of dereferencing)
        local_results = list(visit_local(url, format))

        results.extend(local_results)

        # If a Druid statements URL is specified, we try to retrieve results
        # from it as well
        if DRUID_STATEMENTS_URL is not None:
            results.extend(visit_druid(url, format))

        if depth > 1:
            # If depth is larger than 1, we proceed to extend the results with the results of
            # visiting all object resources for every triple in the resultset.
            newresults = []

            objects = set([r['o']['value'] for r in results if r['o']['value'] != url and r['o']['type']=='uri'])

            for o in objects:
                newresults.extend(
                    visit(o, format=format, depth=depth - 1))

            results.extend(newresults)

    else:
        q = u"""
        CONSTRUCT {{
            ?s ?p ?o .
        }} WHERE {{
            {{
            GRAPH ?g {{
                {{
                    <{url}> ?p ?o .
                    BIND(<{url}> as ?s)
                }} UNION {{
                    ?s ?p <{url}>.
                    BIND(<{url}> as ?o)
                }} UNION {{
                    ?s <{url}> ?o.
                    BIND(<{url}> as ?p)
                }}
            }}
            }} UNION {{
                {{
                    <{url}> ?p ?o .
                    BIND(<{url}> as ?s)
                }} UNION {{
                    ?s ?p <{url}>.
                    BIND(<{url}> as ?o)
                }} UNION {{
                    ?s <{url}> ?o.
                    BIND(<{url}> as ?p)
                }}
            }}
        }} LIMIT {limit}""".format(url=url, limit=QUERY_RESULTS_LIMIT)

        result_dataset = Dataset()

        for s in sparqls:
            s.setQuery(q)
            s.setReturnFormat(XML)

            result_dataset += s.query().convert()

        if format == 'jsonld':
            results = result_dataset.serialize(format='json-ld')
        elif format == 'rdfxml':
            results = result_dataset.serialize(format='pretty-xml')
        elif format == 'turtle':
            results = result_dataset.serialize(format='turtle')
        else:
            results = 'Nothing'
        else:
            results = 'Nothing'

    log.debug("Received results")

    return results
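
The threading logic above follows a plain fan-out/join pattern: one worker thread is started per (endpoint, query) pair, each appends into a shared list, and the main thread joins them all before reading the results. A stripped-down sketch of just that pattern; the fetch body and endpoint URLs are placeholders for the SPARQLWrapper calls in the snippet.

from threading import Thread

results = []

def fetch(endpoint, query):
    # stand-in for sparql.setQuery(query); sparql.query().convert()
    results.append((endpoint, query))

threads = []
for endpoint in ['http://a.example/sparql', 'http://b.example/sparql']:
    t = Thread(target=fetch, args=[endpoint, 'SELECT * WHERE { ?s ?p ?o } LIMIT 1'])
    t.start()
    threads.append(t)

for t in threads:
    t.join()  # block until every worker is done, so 'results' is complete

print(results)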
Example n. 21
                                ), Literal(s.strip("' "), lang=lang)))
        # same as
        if not pd.isnull(row['skos_exactMatch']
                         ) and row['skos_exactMatch'].strip() != '-':
            for s in row['skos_exactMatch'].split(';'):
                s = s.strip()
                if s[:4] == 'http':
                    graph.add((uri, SKOS.exactMatch, URIRef(s)))
                elif len(s.split(':')) == 2:
                    p = s.split(':')[0]
                    q = s.split(':')[1]
                    if p in vocabs:
                        graph.add(
                            (uri, SKOS.exactMatch, URIRef(vocabs[p] + q)))
        # format extras
        if sheet == 'formats':
            if not pd.isnull(row['premis_formatVersion']
                             ) and row['premis_formatVersion'].strip() != '':
                graph.add((uri, URIRef(f'{vocabs["premis"]}formatVersion'),
                           Literal(row['premis_formatVersion'].strip())))
            if not pd.isnull(row['ebucore_hasMimeType']
                             ) and row['ebucore_hasMimeType'].strip() != '':
                graph.add((uri, URIRef(f'{vocabs["ebucore"]}hasMimeType'),
                           Literal(row['ebucore_hasMimeType'].strip())))

with open('target/vocabulary.ttl', 'wb') as f:
    f.write(graph.serialize(format='turtle'))

with open('target/vocabulary.nq', 'wb') as f:
    f.write(ds.serialize(format='nquads'))
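
The skos_exactMatch branch above accepts either full http(s) IRIs or prefix:localname pairs that are expanded through the vocabs mapping. A small sketch of that CURIE expansion in isolation; the vocabs entry and term URI are illustrative assumptions.

from rdflib import Graph, URIRef
from rdflib.namespace import SKOS

vocabs = {'aat': 'http://vocab.getty.edu/aat/'}  # assumed prefix map
g = Graph()
uri = URIRef('https://example.org/term/1')

for s in ['http://example.org/match', 'aat:300028569']:
    if s[:4] == 'http':
        g.add((uri, SKOS.exactMatch, URIRef(s)))
    elif len(s.split(':')) == 2 and s.split(':')[0] in vocabs:
        p, q = s.split(':')
        g.add((uri, SKOS.exactMatch, URIRef(vocabs[p] + q)))

print(g.serialize(format='turtle'))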
Example n. 22
def test_hext_dataset():
    """Tests context-aware (multigraph) data"""
    d = Dataset()
    trig_data = """
            PREFIX ex: <http://example.com/>
            PREFIX owl: <http://www.w3.org/2002/07/owl#>
            PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
            PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>

            ex:g1 {
                ex:s1
                    ex:p1 ex:o1 , ex:o2 ;
                    ex:p2 [
                        a owl:Thing ;
                        rdf:value "thingy" ;
                    ] ;
                    ex:p3 "Object 3" , "Object 4 - English"@en ;
                    ex:p4 "2021-12-03"^^xsd:date ;
                    ex:p5 42 ;
                    ex:p6 "42" ;
                .
            }

            ex:g2 {
                ex:s1
                    ex:p1 ex:o1 , ex:o2 ;
                .
                ex:s11 ex:p11 ex:o11 , ex:o12 .
            }

            # default graph triples
            ex:s1 ex:p1 ex:o1 , ex:o2 .
            ex:s21 ex:p21 ex:o21 , ex:o22 .
           """
    d.parse(data=trig_data,
            format="trig",
            publicID=d.default_context.identifier)
    out = d.serialize(format="hext")
    # note: can't test for BNs in the result as they will be different every time
    testing_lines = [
        [
            False,
            '["http://example.com/s21", "http://example.com/p21", "http://example.com/o21", "globalId", "", ""]'
        ],
        [
            False,
            '["http://example.com/s21", "http://example.com/p21", "http://example.com/o22", "globalId", "", ""]'
        ],
        [
            False,
            '["http://example.com/s1", "http://example.com/p1", "http://example.com/o2", "globalId", "", ""]'
        ],
        [
            False,
            '["http://example.com/s1", "http://example.com/p1", "http://example.com/o1", "globalId", "", ""]'
        ],
        [
            False,
            '["http://example.com/s11", "http://example.com/p11", "http://example.com/o12", "globalId", "", "http://example.com/g2"]'
        ],
        [
            False,
            '["http://example.com/s1", "http://example.com/p1", "http://example.com/o2", "globalId", "", "http://example.com/g2"]'
        ],
        [
            False,
            '["http://example.com/s11", "http://example.com/p11", "http://example.com/o11", "globalId", "", "http://example.com/g2"]'
        ],
        [
            False,
            '["http://example.com/s1", "http://example.com/p1", "http://example.com/o1", "globalId", "", "http://example.com/g2"]'
        ],
        [
            False,
            '["http://example.com/s1", "http://example.com/p1", "http://example.com/o2", "globalId", "", "http://example.com/g1"]'
        ],
        [False, '["http://example.com/s1", "http://example.com/p2"'],
        [
            False,
            '"http://www.w3.org/1999/02/22-rdf-syntax-ns#value", "thingy", "http://www.w3.org/2001/XMLSchema#string", "", "http://example.com/g1"]'
        ],
        [
            False,
            '"http://www.w3.org/1999/02/22-rdf-syntax-ns#type", "http://www.w3.org/2002/07/owl#Thing", "globalId", "", "http://example.com/g1"]'
        ],
        [
            False,
            '["http://example.com/s1", "http://example.com/p3", "Object 4 - English", "http://www.w3.org/1999/02/22-rdf-syntax-ns#langString", "en", "http://example.com/g1"]'
        ],
        [
            False,
            '["http://example.com/s1", "http://example.com/p6", "42", "http://www.w3.org/2001/XMLSchema#string", "", "http://example.com/g1"]'
        ],
        [
            False,
            '["http://example.com/s1", "http://example.com/p4", "2021-12-03", "http://www.w3.org/2001/XMLSchema#date", "", "http://example.com/g1"]'
        ],
        [
            False,
            '["http://example.com/s1", "http://example.com/p1", "http://example.com/o1", "globalId", "", "http://example.com/g1"]'
        ],
        [
            False,
            '["http://example.com/s1", "http://example.com/p5", "42", "http://www.w3.org/2001/XMLSchema#integer", "", "http://example.com/g1"]'
        ],
        [
            False,
            '["http://example.com/s1", "http://example.com/p3", "Object 3", "http://www.w3.org/2001/XMLSchema#string", "", "http://example.com/g1"]'
        ],
    ]
    for line in out.splitlines():
        for test in testing_lines:
            if test[1] in line:
                test[0] = True

    assert all([x[0] for x in testing_lines])
Example n. 23
class LongTermMemory(object):

    ONE_TO_ONE_PREDICATES = [
        'age', 'born_in', 'faceID', 'favorite', 'favorite_of', 'id', 'is_from',
        'manufactured_in', 'mother_is', 'name'
    ]

    def __init__(self, address=config.BRAIN_URL_LOCAL):
        """
        Interact with Triple store

        Parameters
        ----------
        address: str
            IP address and port of the Triple store
        """

        self.address = address
        self.namespaces = {}
        self.ontology_paths = {}
        self.format = 'trig'
        self.dataset = Dataset()
        self.query_prefixes = """
                    prefix gaf: <http://groundedannotationframework.org/gaf#> 
                    prefix grasp: <http://groundedannotationframework.org/grasp#> 
                    prefix leolaniInputs: <http://cltl.nl/leolani/inputs/>
                    prefix leolaniFriends: <http://cltl.nl/leolani/friends/> 
                    prefix leolaniTalk: <http://cltl.nl/leolani/talk/> 
                    prefix leolaniTime: <http://cltl.nl/leolani/time/> 
                    prefix leolaniWorld: <http://cltl.nl/leolani/world/> 
                    prefix n2mu: <http://cltl.nl/leolani/n2mu/> 
                    prefix ns1: <urn:x-rdflib:> 
                    prefix owl: <http://www.w3.org/2002/07/owl#> 
                    prefix prov: <http://www.w3.org/ns/prov#> 
                    prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> 
                    prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> 
                    prefix sem: <http://semanticweb.cs.vu.nl/2009/11/sem/> 
                    prefix skos: <http://www.w3.org/2004/02/skos/core#> 
                    prefix time: <http://www.w3.org/TR/owl-time/#> 
                    prefix xml: <http://www.w3.org/XML/1998/namespace> 
                    prefix xml1: <https://www.w3.org/TR/xmlschema-2/#> 
                    prefix xsd: <http://www.w3.org/2001/XMLSchema#>
                    """

        self._define_namespaces()
        self._get_ontology_path()
        self._bind_namespaces()

        self.my_uri = None

        self._log = logger.getChild(self.__class__.__name__)
        self._log.debug("Booted")

    #################################### Main functions to interact with the brain ####################################

    def update(self, capsule):
        """
        Main function to interact with if a statement is coming into the brain. Takes in a structured parsed statement,
        transforms them to triples, and posts them to the triple store
        :param statement: Structured data of a parsed statement
        :return: json response containing the status for posting the triples, and the original statement
        """
        # Case fold
        capsule = casefold_capsule(capsule)

        # Create graphs and triples
        self._model_graphs_(capsule)

        data = self._serialize(config.BRAIN_LOG)

        code = self._upload_to_brain(data)

        # Create JSON output
        capsule["date"] = str(capsule["date"])
        output = {'response': code, 'statement': capsule}

        return output
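
    # A minimal usage sketch of update(), with a hypothetical capsule layout
    # inferred from the graph-building helpers below (treat the field names
    # and values as assumptions, not the canonical schema):
    #
    #   brain = LongTermMemory()
    #   capsule = {
    #       'subject':   {'label': 'bram', 'type': 'person'},
    #       'predicate': {'type': 'is_from'},
    #       'object':    {'label': 'netherlands', 'type': 'location'},
    #       'author':    'piek',
    #       'position':  '0-25',
    #       'date':      datetime.date(2018, 3, 19),
    #   }
    #   response = brain.update(capsule)  # models graphs, posts trig, returns JSON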

    def experience(self, capsule):
        """
        Main function to interact with if an experience is coming into the brain. Takes in a structured parsed experience,
        transforms it into triples, and posts them to the triple store
        :param capsule: Structured data of a parsed statement
        :return: json response containing the status for posting the triples, and the original statement
        """
        # Case fold
        capsule = casefold_capsule(capsule)

        # Create graphs and triples
        self._model_graphs_(capsule, type='Experience')

        data = self._serialize(config.BRAIN_LOG)

        code = self._upload_to_brain(data)

        # Create JSON output
        capsule["date"] = str(capsule["date"])
        output = {'response': code, 'statement': capsule}

        return output

    def query_brain(self, capsule):
        """
        Main function to interact with if a question is coming into the brain. Takes in a structured parsed question,
        transforms it into a query, and queries the triple store for a response
        :param capsule: Structured data of a parsed question
        :return: json response containing the results of the query, and the original question
        """
        # Case fold
        capsule = casefold_capsule(capsule)

        # Generate query
        query = self._create_query(capsule)

        # Perform query
        response = self._submit_query(query)

        # Create JSON output
        if 'date' in capsule.keys():
            capsule["date"] = str(capsule["date"])
        output = {'response': response, 'question': capsule}

        return output

    def process_visual(self, item, exact_only=True):
        """
        Main function to determine whether this item can be recognized by the brain, learned from the web, or neither
        :param item:
        :return:
        """

        if casefold(item) in self.get_classes():
            # If this is in the ontology already, create sensor triples directly
            text = 'I know about %s. I will remember this object' % item
            return item, text

        temp = self.get_labels_and_classes()
        if casefold(item) in temp.keys():
            # If this is in the ontology already, create sensor triples directly
            text = 'I know about %s. It is of type %s. I will remember this object' % (
                item, temp[item])
            return item, text

        # Query the web for information
        class_type, description = self.exact_match_dbpedia(item)
        if class_type is not None:
            # Had to learn it, but I can create triples now
            text = 'I did not know what %s is, but I searched on the web and I found that it is a %s. ' \
                   'I will remember this object' % (item, class_type)
            return casefold(class_type), text

        if not exact_only:
            # Second go at dbpedia, relaxed approach
            class_type, description = self.keyword_match_dbpedia(item)
            if class_type is not None:
                # Had to really search for it to learn it, but I can create triples now
                text = 'I did not know what %s is, but I searched for fuzzy matches on the web and I found that it ' \
                       'is a %s. I will remember this object' % (item, class_type)
                return casefold(class_type), text

        # Failure, nothing found
        text = 'I am sorry, I could not learn anything on %s so I will not remember it' % item
        return None, text

    ########## management system for keeping track of chats and turns ##########
    def get_last_chat_id(self):
        """
        Get the id for the last interaction recorded
        :return: id
        """
        query = read_query('last_chat_id')
        response = self._submit_query(query)

        return int(response[0]['chatid']['value']) if response else 0

    def get_last_turn_id(self, chat_id):
        """
        Get the id for the last turn in the given chat
        :param chat_id: id for chat of interest
        :return:  id
        """
        query = read_query('last_turn_id') % (chat_id)
        response = self._submit_query(query)

        last_turn = 0
        for turn in response:
            turn_uri = turn['s']['value']
            turn_id = turn_uri.split('/')[-1][10:]
            turn_id = int(turn_id)

            if turn_id > last_turn:
                last_turn = turn_id

        return last_turn

    ########## brain structure exploration ##########
    def get_predicates(self):
        """
        Get predicates in social ontology
        :return:
        """
        query = read_query('predicates')
        response = self._submit_query(query)

        return [elem['p']['value'].split('/')[-1] for elem in response]

    def get_classes(self):
        """
        Get classes in social ontology
        :return:
        """
        query = read_query('classes')
        response = self._submit_query(query)

        return [elem['o']['value'].split('/')[-1] for elem in response]

    def get_labels_and_classes(self):
        """
        Get labels and their associated classes in the social ontology
        :return:
        """
        query = read_query('labels_and_classes')
        response = self._submit_query(query)

        temp = dict()
        for r in response:
            temp[r['l']['value']] = r['o']['value'].split('/')[-1]

        return temp

    ########## learned facts exploration ##########
    def count_statements(self):
        """
        Count statements or 'facts' in the brain
        :return:
        """
        query = read_query('count_statements')
        response = self._submit_query(query)
        return response[0]['count']['value']

    def count_friends(self):
        """
        Count number of people I have talked to
        :return:
        """
        query = read_query('count_friends')
        response = self._submit_query(query)
        return response[0]['count']['value']

    def get_my_friends(self):
        """
        Get names of people I have talked to
        :return:
        """
        query = read_query('my_friends')
        response = self._submit_query(query)
        return [elem['name']['value'].split('/')[-1] for elem in response]

    def get_best_friends(self):
        """
        Get names of the 5 people I have talked to the most
        :return:
        """
        query = read_query('best_friends')
        response = self._submit_query(query)
        return [elem['name']['value'] for elem in response]

    def get_instance_of_type(self, instance_type):
        """
        Get instances of a certain class type
        :param instance_type: name of class in ontology
        :return:
        """
        query = read_query('instance_of_type') % (instance_type)
        response = self._submit_query(query)
        return [elem['name']['value'] for elem in response]

    def when_last_chat_with(self, actor_label):
        """
        Get time value for the last time I chatted with this person
        :param actor_label: name of person
        :return:
        """
        query = read_query('when_last_chat_with') % (actor_label)
        response = self._submit_query(query)
        return response[0]['time']['value'].split('/')[-1]

    def get_triples_with_predicate(self, predicate):
        """
        Get triples that contain this predicate
        :param predicate:
        :return:
        """
        query = read_query('triples_with_predicate') % predicate
        response = self._submit_query(query)
        return [(elem['sname']['value'], elem['oname']['value'])
                for elem in response]

    ########## conflicts ##########
    def get_all_conflicts(self):
        """
        Aggregate all conflicts in brain
        :return:
        """
        conflicts = []
        for predicate in self.ONE_TO_ONE_PREDICATES:
            conflicts.extend(self._get_conflicts_with_predicate(predicate))

        return conflicts

    ########## semantic web ##########
    def exact_match_dbpedia(self, item):
        """
        Query dbpedia for information on this item to get its semantic type and description.
        :param item:
        :return:
        """

        # Gather combinations
        combinations = [item, item.lower(), item.capitalize(), item.title()]

        for comb in combinations:
            # Try exact matching query
            query = read_query('dbpedia_type_and_description') % (comb)
            response = self._submit_query(query)

            # break if we have a hit
            if response:
                break

        class_type = response[0]['label_type']['value'] if response else None
        description = response[0]['description']['value'].split(
            '.')[0] if response else None

        return class_type, description

    def keyword_match_dbpedia(self, item):
        # Query API
        r = requests.get(
            'http://lookup.dbpedia.org/api/search.asmx/KeywordSearch',
            params={
                'QueryString': item,
                'MaxHits': '10'
            },
            headers={
                'Accept': 'application/json'
            }).json()['results']

        # Fuzzy match the item against the returned labels
        choices = [e['label'] for e in r]
        best_match = process.extractOne(item, choices)

        # Get best match object
        r = [{
            'label': e['label'],
            'classes': e['classes'],
            'description': e['description']
        } for e in r if e['label'] == best_match[0]]

        if r:
            r = r[0]

            if r['classes']:
                # process dbpedia classes only
                r['classes'] = [
                    c['label'] for c in r['classes'] if 'dbpedia' in c['uri']
                ]

        else:
            r = {'label': None, 'classes': None, 'description': None}

        return r['classes'][0] if r['classes'] else None, r[
            'description'].split('.')[0] if r['description'] else None

    ######################################## Helpers for setting up connection ########################################

    def _define_namespaces(self):
        """
        Define namespaces for different layers (ontology/vocab and resource). Assign them to self
        :return:
        """
        # Namespaces for the instance layer
        instance_vocab = 'http://cltl.nl/leolani/n2mu/'
        self.namespaces['N2MU'] = Namespace(instance_vocab)
        instance_resource = 'http://cltl.nl/leolani/world/'
        self.namespaces['LW'] = Namespace(instance_resource)

        # Namespaces for the mention layer
        mention_vocab = 'http://groundedannotationframework.org/gaf#'
        self.namespaces['GAF'] = Namespace(mention_vocab)
        mention_resource = 'http://cltl.nl/leolani/talk/'
        self.namespaces['LTa'] = Namespace(mention_resource)

        # Namespaces for the attribution layer
        attribution_vocab = 'http://groundedannotationframework.org/grasp#'
        self.namespaces['GRASP'] = Namespace(attribution_vocab)
        attribution_resource_friends = 'http://cltl.nl/leolani/friends/'
        self.namespaces['LF'] = Namespace(attribution_resource_friends)
        attribution_resource_inputs = 'http://cltl.nl/leolani/inputs/'
        self.namespaces['LI'] = Namespace(attribution_resource_inputs)

        # Namespaces for the temporal layer-ish
        time_vocab = 'http://www.w3.org/TR/owl-time/#'
        self.namespaces['TIME'] = Namespace(time_vocab)
        time_resource = 'http://cltl.nl/leolani/time/'
        self.namespaces['LTi'] = Namespace(time_resource)

        # The namespaces of external ontologies
        skos = 'http://www.w3.org/2004/02/skos/core#'
        self.namespaces['SKOS'] = Namespace(skos)

        prov = 'http://www.w3.org/ns/prov#'
        self.namespaces['PROV'] = Namespace(prov)

        sem = 'http://semanticweb.cs.vu.nl/2009/11/sem/'
        self.namespaces['SEM'] = Namespace(sem)

        xml = 'https://www.w3.org/TR/xmlschema-2/#'
        self.namespaces['XML'] = Namespace(xml)

    def _get_ontology_path(self):
        """
        Define ontology paths to key vocabularies
        :return:
        """
        self.ontology_paths[
            'n2mu'] = './../../knowledge_representation/ontologies/leolani.ttl'
        self.ontology_paths[
            'gaf'] = './../../knowledge_representation/ontologies/gaf.rdf'
        self.ontology_paths[
            'grasp'] = './../../knowledge_representation/ontologies/grasp.rdf'
        self.ontology_paths[
            'sem'] = './../../knowledge_representation/ontologies/sem.rdf'

    def _bind_namespaces(self):
        """
        Bind namespaces to the dataset
        :return:
        """
        self.dataset.bind('n2mu', self.namespaces['N2MU'])
        self.dataset.bind('leolaniWorld', self.namespaces['LW'])
        self.dataset.bind('gaf', self.namespaces['GAF'])
        self.dataset.bind('leolaniTalk', self.namespaces['LTa'])
        self.dataset.bind('grasp', self.namespaces['GRASP'])
        self.dataset.bind('leolaniFriends', self.namespaces['LF'])
        self.dataset.bind('leolaniInputs', self.namespaces['LI'])
        self.dataset.bind('time', self.namespaces['TIME'])
        self.dataset.bind('leolaniTime', self.namespaces['LTi'])
        self.dataset.bind('skos', self.namespaces['SKOS'])
        self.dataset.bind('prov', self.namespaces['PROV'])
        self.dataset.bind('sem', self.namespaces['SEM'])
        self.dataset.bind('xml', self.namespaces['XML'])
        self.dataset.bind('owl', OWL)

    ######################################## Helpers for statement processing ########################################

    def create_chat_id(self, actor, date):
        """
        Determine chat id depending on my last conversation with this person
        :param actor:
        :param date:
        :return:
        """
        self._log.debug('Chat with {} on {}'.format(actor, date))

        query = read_query('last_chat_with') % (actor)
        response = self._submit_query(query)

        if response and int(response[0]['day']['value']) == int(date.day) \
                and int(response[0]['month']['value']) == int(date.month) \
                and int(response[0]['year']['value']) == int(date.year):
            # Chatted with this person today so same chat id
            chat_id = int(response[0]['chatid']['value'])
        else:
            # Either have never chatted with this person, or I have but not today. Add one to latest chat
            chat_id = self.get_last_chat_id() + 1

        return chat_id

    def create_turn_id(self, chat_id):
        self._log.debug('Turn in chat {}'.format(chat_id))

        query = read_query('last_turn_in_chat') % (chat_id)
        response = self._submit_query(query)
        return int(response[0]['turnid']['value']) + 1 if response else 1

    def _generate_leolani(self, instance_graph):
        # Create Leolani
        leolani_id = 'leolani'
        leolani_label = 'leolani'

        leolani = URIRef(to_iri(self.namespaces['LW'] + leolani_id))
        leolani_label = Literal(leolani_label)
        leolani_type1 = URIRef(to_iri(self.namespaces['N2MU'] + 'robot'))
        leolani_type2 = URIRef(to_iri(self.namespaces['GRASP'] + 'Instance'))

        instance_graph.add((leolani, RDFS.label, leolani_label))
        instance_graph.add((leolani, RDF.type, leolani_type1))
        instance_graph.add((leolani, RDF.type, leolani_type2))

        self.my_uri = leolani

        return leolani

    def _generate_subject(self, capsule, instance_graph):
        if capsule['subject']['type'] == '':  # We only get the label
            subject_vocab = OWL
            subject_type = 'Thing'
        else:
            subject_vocab = self.namespaces['N2MU']
            subject_type = capsule['subject']['type']

        subject_id = capsule['subject']['label']

        subject = URIRef(to_iri(self.namespaces['LW'] + subject_id))
        subject_label = Literal(subject_id)
        subject_type1 = URIRef(to_iri(subject_vocab + subject_type))
        subject_type2 = URIRef(to_iri(self.namespaces['GRASP'] + 'Instance'))

        instance_graph.add((subject, RDFS.label, subject_label))
        instance_graph.add((subject, RDF.type, subject_type1))
        instance_graph.add((subject, RDF.type, subject_type2))

        return subject, subject_label

    def _create_leolani_world(self, capsule, type='Statement'):
        # Instance graph
        instance_graph_uri = URIRef(to_iri(self.namespaces['LW'] +
                                           'Instances'))
        instance_graph = self.dataset.graph(instance_graph_uri)

        # Subject
        if type == 'Statement':
            subject, subject_label = self._generate_subject(
                capsule, instance_graph)
        elif type == 'Experience':
            subject = self._generate_leolani(
                instance_graph) if self.my_uri is None else self.my_uri
            subject_label = 'leolani'

        # Object
        if capsule['object']['type'] == '':  # We only get the label
            object_vocab = OWL
            object_type = 'Thing'
        else:
            object_vocab = self.namespaces['N2MU']
            object_type = capsule['object']['type']

        object_id = capsule['object']['label']

        object = URIRef(to_iri(self.namespaces['LW'] + object_id))
        object_label = Literal(object_id)
        object_type1 = URIRef(to_iri(object_vocab + object_type))
        object_type2 = URIRef(to_iri(self.namespaces['GRASP'] + 'Instance'))

        instance_graph.add((object, RDFS.label, object_label))
        instance_graph.add((object, RDF.type, object_type1))
        instance_graph.add((object, RDF.type, object_type2))

        if type == 'Statement':
            claim_graph, statement = self._create_claim_graph(
                subject,
                subject_label,
                object,
                object_label,
                capsule['predicate']['type'],
                type='Statement')
        elif type == 'Experience':
            claim_graph, statement = self._create_claim_graph(
                subject,
                subject_label,
                object,
                object_label,
                'sees',
                type='Experience')

        return instance_graph, claim_graph, subject, object, statement

    def _create_claim_graph(self,
                            subject,
                            subject_label,
                            object,
                            object_label,
                            predicate,
                            type='Statement'):
        # Claim graph
        claim_graph_uri = URIRef(to_iri(self.namespaces['LW'] + 'Claims'))
        claim_graph = self.dataset.graph(claim_graph_uri)

        # Statement
        statement_id = hash_statement_id(
            [subject_label, predicate, object_label])

        statement = URIRef(to_iri(self.namespaces['LW'] + statement_id))
        statement_type1 = URIRef(to_iri(self.namespaces['GRASP'] + type))
        statement_type2 = URIRef(to_iri(self.namespaces['GRASP'] + 'Instance'))
        statement_type3 = URIRef(to_iri(self.namespaces['SEM'] + 'Event'))

        # Create graph and add triple
        graph = self.dataset.graph(statement)
        graph.add((subject, self.namespaces['N2MU'][predicate], object))

        claim_graph.add((statement, RDF.type, statement_type1))
        claim_graph.add((statement, RDF.type, statement_type2))
        claim_graph.add((statement, RDF.type, statement_type3))

        return claim_graph, statement

    def _create_leolani_talk(self, capsule, leolani, type='Statement'):
        # Interaction graph
        if type == 'Statement':
            graph_to_write = 'Interactions'
        elif type == 'Experience':
            graph_to_write = 'Sensors'

        interaction_graph_uri = URIRef(
            to_iri(self.namespaces['LTa'] + graph_to_write))
        interaction_graph = self.dataset.graph(interaction_graph_uri)

        # Time
        date = capsule["date"]
        time = URIRef(
            to_iri(self.namespaces['LTi'] + str(capsule["date"].isoformat())))
        time_type = URIRef(
            to_iri(self.namespaces['TIME'] + 'DateTimeDescription'))
        day = Literal(date.day, datatype=self.namespaces['XML']['gDay'])
        month = Literal(date.month,
                        datatype=self.namespaces['XML']['gMonthDay'])
        year = Literal(date.year, datatype=self.namespaces['XML']['gYear'])
        time_unitType = URIRef(to_iri(self.namespaces['TIME'] + 'unitDay'))

        interaction_graph.add((time, RDF.type, time_type))
        interaction_graph.add((time, self.namespaces['TIME']['day'], day))
        interaction_graph.add((time, self.namespaces['TIME']['month'], month))
        interaction_graph.add((time, self.namespaces['TIME']['year'], year))
        interaction_graph.add(
            (time, self.namespaces['TIME']['unitType'], time_unitType))

        # Actor
        actor_id = capsule['author']
        actor_label = capsule['author']

        actor = URIRef(to_iri(self.namespaces['LF'] + actor_id))
        actor_label = Literal(actor_label)
        actor_type1 = URIRef(to_iri(self.namespaces['SEM'] + 'Actor'))
        actor_type2 = URIRef(to_iri(self.namespaces['GRASP'] + 'Instance'))

        if type == 'Statement':
            actor_type3 = URIRef(to_iri(self.namespaces['N2MU'] + 'person'))
        elif type == 'Experience':
            actor_type3 = URIRef(to_iri(self.namespaces['N2MU'] + 'sensor'))

        interaction_graph.add((actor, RDFS.label, actor_label))
        interaction_graph.add((actor, RDF.type, actor_type1))
        interaction_graph.add((actor, RDF.type, actor_type2))
        interaction_graph.add((actor, RDF.type, actor_type3))

        # Add leolani knows/senses actor
        if type == 'Statement':
            predicate = 'knows'
        elif type == 'Experience':
            predicate = 'senses'

        interaction_graph.add(
            (leolani, self.namespaces['N2MU'][predicate], actor))
        _, _ = self._create_claim_graph(leolani, 'leolani', actor, actor_label,
                                        predicate, type)

        # Event and subevent
        event_id = self.create_chat_id(actor_label, date)
        if type == 'Statement':
            event_label = 'chat%s' % event_id
        elif type == 'Experience':
            event_label = 'visual%s' % event_id

        subevent_id = self.create_turn_id(event_id)
        if type == 'Statement':
            subevent_label = event_label + '_turn%s' % subevent_id
        elif type == 'Experience':
            subevent_label = event_label + '_object%s' % subevent_id

        turn = URIRef(to_iri(self.namespaces['LTa'] + subevent_label))
        turn_type1 = URIRef(to_iri(self.namespaces['SEM'] + 'Event'))
        if type == 'Statement':
            turn_type2 = URIRef(to_iri(self.namespaces['GRASP'] + 'Turn'))
        elif type == 'Experience':
            turn_type2 = URIRef(to_iri(self.namespaces['GRASP'] + 'Object'))

        interaction_graph.add((turn, RDF.type, turn_type1))
        interaction_graph.add((turn, RDF.type, turn_type2))
        interaction_graph.add(
            (turn, self.namespaces['N2MU']['id'], Literal(subevent_id)))
        interaction_graph.add(
            (turn, self.namespaces['SEM']['hasActor'], actor))
        interaction_graph.add((turn, self.namespaces['SEM']['hasTime'], time))

        chat = URIRef(to_iri(self.namespaces['LTa'] + event_label))
        chat_type1 = URIRef(to_iri(self.namespaces['SEM'] + 'Event'))
        if type == 'Statement':
            chat_type2 = URIRef(to_iri(self.namespaces['GRASP'] + 'Chat'))
        elif type == 'Experience':
            chat_type2 = URIRef(to_iri(self.namespaces['GRASP'] + 'Visual'))

        interaction_graph.add((chat, RDF.type, chat_type1))
        interaction_graph.add((chat, RDF.type, chat_type2))
        interaction_graph.add(
            (chat, self.namespaces['N2MU']['id'], Literal(event_id)))
        interaction_graph.add(
            (chat, self.namespaces['SEM']['hasActor'], actor))
        interaction_graph.add((chat, self.namespaces['SEM']['hasTime'], time))
        interaction_graph.add(
            (chat, self.namespaces['SEM']['hasSubevent'], turn))

        perspective_graph, mention, attribution = self._create_perspective_graph(
            capsule, subevent_label, type=type)

        # Link interactions and perspectives
        perspective_graph.add(
            (mention, self.namespaces['GRASP']['wasAttributedTo'], actor))
        perspective_graph.add(
            (mention, self.namespaces['GRASP']['hasAttribution'], attribution))
        perspective_graph.add(
            (mention, self.namespaces['PROV']['wasDerivedFrom'], chat))
        perspective_graph.add(
            (mention, self.namespaces['PROV']['wasDerivedFrom'], turn))

        return interaction_graph, perspective_graph, actor, time, mention, attribution

    def _create_perspective_graph(self, capsule, turn_label, type='Statement'):
        # Perspective graph
        perspective_graph_uri = URIRef(
            to_iri(self.namespaces['LTa'] + 'Perspectives'))
        perspective_graph = self.dataset.graph(perspective_graph_uri)

        # Mention
        if type == 'Statement':
            mention_id = turn_label + '_char%s' % capsule['position']
        elif type == 'Experience':
            mention_id = turn_label + '_pixel%s' % capsule['position']
        mention = URIRef(to_iri(self.namespaces['LTa'] + mention_id))
        mention_type = URIRef(to_iri(self.namespaces['GRASP'] + 'Mention'))

        perspective_graph.add((mention, RDF.type, mention_type))

        # Attribution
        attribution_id = mention_id + '_CERTAIN'
        attribution = URIRef(to_iri(self.namespaces['LTa'] + attribution_id))
        attribution_type = URIRef(
            to_iri(self.namespaces['GRASP'] + 'Attribution'))
        attribution_value = URIRef(to_iri(self.namespaces['GRASP'] +
                                          'CERTAIN'))

        perspective_graph.add((attribution, RDF.type, attribution_type))
        perspective_graph.add((attribution, RDF.value, attribution_value))

        return perspective_graph, mention, attribution

    def _serialize(self, file_path):
        """
        Save graph to local file and return the serialized string
        :param file_path: path to where data will be saved
        :return: serialized data as string
        """
        # Save to file but return the python representation
        with open(file_path + '.' + self.format, 'w') as f:
            self.dataset.serialize(f, format=self.format)
        return self.dataset.serialize(format=self.format)

    def _upload_to_brain(self, data):
        """
        Post data to the brain
        :param data: serialized data as string
        :return: response status
        """
        self._log.debug("Posting triples")

        # From serialized string
        post_url = self.address + "/statements"
        response = requests.post(
            post_url,
            data=data,
            headers={'Content-Type': 'application/x-' + self.format})

        return str(response.status_code)

    def _model_graphs_(self, capsule, type='Statement'):
        # Leolani world (includes instance and claim graphs)
        instance_graph, claim_graph, subject, object, instance = self._create_leolani_world(
            capsule, type)

        # Identity
        leolani = self._generate_leolani(
            instance_graph) if self.my_uri is None else self.my_uri

        # Leolani talk (includes interaction and perspective graphs)
        interaction_graph, perspective_graph, actor, time, mention, attribution = self._create_leolani_talk(
            capsule, leolani, type)

        # Interconnections
        instance_graph.add(
            (subject, self.namespaces['GRASP']['denotedIn'], mention))
        instance_graph.add(
            (object, self.namespaces['GRASP']['denotedIn'], mention))

        instance_graph.add(
            (instance, self.namespaces['GRASP']['denotedBy'], mention))
        instance_graph.add(
            (instance, self.namespaces['SEM']['hasActor'], actor))
        instance_graph.add((instance, self.namespaces['SEM']['hasTime'], time))

        perspective_graph.add(
            (mention, self.namespaces['GRASP']['containsDenotation'], subject))
        perspective_graph.add(
            (mention, self.namespaces['GRASP']['containsDenotation'], object))
        perspective_graph.add(
            (mention, self.namespaces['GRASP']['denotes'], instance))

        perspective_graph.add(
            (attribution, self.namespaces['GRASP']['isAttributionFor'],
             mention))

    ######################################### Helpers for question processing #########################################

    def _create_query(self, parsed_question):
        _ = hash_statement_id([
            parsed_question['subject']['label'],
            parsed_question['predicate']['type'],
            parsed_question['object']['label']
        ])

        # Query subject
        if parsed_question['subject']['label'] == "":
            # Case fold
            # object_label = casefold_label(parsed_question['object']['label'])

            query = """
                SELECT ?slabel ?authorlabel
                        WHERE { 
                            ?s n2mu:%s ?o . 
                            ?s rdfs:label ?slabel . 
                            ?o rdfs:label '%s' .  
                            GRAPH ?g {
                                ?s n2mu:%s ?o . 
                            } . 
                            ?g grasp:denotedBy ?m . 
                            ?m grasp:wasAttributedTo ?author . 
                            ?author rdfs:label ?authorlabel .
                        }
                """ % (parsed_question['predicate']['type'],
                       parsed_question['object']['label'],
                       parsed_question['predicate']['type'])

        # Query object
        elif parsed_question['object']['label'] == "":
            query = """
                SELECT ?olabel ?authorlabel
                        WHERE { 
                            ?s n2mu:%s ?o .   
                            ?s rdfs:label '%s' .  
                            ?o rdfs:label ?olabel .  
                            GRAPH ?g {
                                ?s n2mu:%s ?o . 
                            } . 
                            ?g grasp:denotedBy ?m . 
                            ?m grasp:wasAttributedTo ?author . 
                            ?author rdfs:label ?authorlabel .
                        }
                """ % (parsed_question['predicate']['type'],
                       parsed_question['subject']['label'],
                       parsed_question['predicate']['type'])

        # Query existence
        else:
            query = """
                SELECT ?authorlabel ?v
                        WHERE { 
                            ?s n2mu:%s ?o .   
                            ?s rdfs:label '%s' .  
                            ?o rdfs:label '%s' .  
                            GRAPH ?g {
                                ?s n2mu:%s ?o . 
                            } . 
                            ?g grasp:denotedBy ?m . 
                            ?m grasp:wasAttributedTo ?author . 
                            ?author rdfs:label ?authorlabel .
                            ?m grasp:hasAttribution ?att .
                            ?att rdf:value ?v .
                        }
                """ % (parsed_question['predicate']['type'],
                       parsed_question['subject']['label'],
                       parsed_question['object']['label'],
                       parsed_question['predicate']['type'])

        query = self.query_prefixes + query

        return query

    def _submit_query(self, query):
        # Set up connection
        sparql = SPARQLWrapper(self.address)

        # Response parameters
        sparql.setQuery(query)
        sparql.setReturnFormat(JSON)
        sparql.addParameter('Accept', 'application/sparql-results+json')
        response = sparql.query().convert()

        return response["results"]["bindings"]

    ######################################### Helpers for conflict processing #########################################
    def _get_conflicts_with_predicate(self, one_to_one_predicate):
        query = """
            PREFIX n2mu: <http://cltl.nl/leolani/n2mu/>
            PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
            PREFIX grasp: <http://groundedannotationframework.org/grasp#>

            select ?sname 
                    (group_concat(?oname ; separator=";") as ?onames) 
                    (group_concat(?authorlabel ; separator=";") as ?authorlabels) 
            where { 
                GRAPH ?g {
                    ?s n2mu:%s ?o .
                    } .
                ?s rdfs:label ?sname .
                ?o rdfs:label ?oname .

                ?g grasp:denotedBy ?m . 
                ?m grasp:wasAttributedTo ?author . 
                ?author rdfs:label ?authorlabel .

            } group by ?sname having (count(distinct ?oname) > 1)
        """ % one_to_one_predicate

        response = self._submit_query(query)
        conflicts = []
        for item in response:
            conflict = {
                'subject': item['sname']['value'],
                'predicate': one_to_one_predicate,
                'objects': []
            }

            values = item['onames']['value'].split(';')
            authors = item['authorlabels']['value'].split(';')

            for val, auth in zip(values, authors):
                option = {'value': val, 'author': auth}
                conflict['objects'].append(option)

            conflicts.append(conflict)

        return conflicts
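
A minimal standalone sketch of the conflict-detection pattern above, run against a hypothetical SPARQL endpoint (the endpoint address and the n2mu:birthplace predicate are assumptions for illustration, not part of the code above):

from SPARQLWrapper import SPARQLWrapper, JSON

sparql = SPARQLWrapper("http://localhost:7200/repositories/leolani")  # hypothetical endpoint
sparql.setQuery("""
    PREFIX n2mu: <http://cltl.nl/leolani/n2mu/>
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    SELECT ?sname (GROUP_CONCAT(?oname; separator=";") AS ?onames)
    WHERE {
        ?s n2mu:birthplace ?o .
        ?s rdfs:label ?sname .
        ?o rdfs:label ?oname .
    }
    GROUP BY ?sname HAVING (COUNT(DISTINCT ?oname) > 1)
""")
sparql.setReturnFormat(JSON)

# each row names a subject that was given more than one distinct object value
for row in sparql.query().convert()["results"]["bindings"]:
    print(row["sname"]["value"], "->", row["onames"]["value"].split(";"))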
Example No. 24
0
def update_test(t):

    # the update-eval tests refer to graphs on http://example.org
    rdflib_sparql_module.SPARQL_LOAD_GRAPHS = False

    uri, name, comment, data, graphdata, query, res, syntax = t

    if uri in skiptests:
        raise SkipTest()

    try:
        g = Dataset()

        if not res:
            if syntax:
                with bopen(query[7:]) as f:
                    translateUpdate(parseUpdate(f))
            else:
                try:
                    with bopen(query[7:]) as f:
                        translateUpdate(parseUpdate(f))
                    raise AssertionError("Query shouldn't have parsed!")
                except:
                    pass  # negative syntax test
            return

        resdata, resgraphdata = res

        # read input graphs
        if data:
            g.default_context.load(data, format=_fmt(data))

        if graphdata:
            for x, l in graphdata:
                g.load(x, publicID=URIRef(l), format=_fmt(x))

        with bopen(query[7:]) as f:
            req = translateUpdate(parseUpdate(f))
        evalUpdate(g, req)

        # read expected results
        resg = Dataset()
        if resdata:
            resg.default_context.load(resdata, format=_fmt(resdata))

        if resgraphdata:
            for x, l in resgraphdata:
                resg.load(x, publicID=URIRef(l), format=_fmt(x))

        eq(
            set(x.identifier for x in g.contexts() if x != g.default_context),
            set(x.identifier for x in resg.contexts()
                if x != resg.default_context),
            "named graphs in datasets do not match",
        )
        assert isomorphic(
            g.default_context,
            resg.default_context), "Default graphs are not isomorphic"

        for x in g.contexts():
            if x == g.default_context:
                continue
            assert isomorphic(x, resg.get_context(
                x.identifier)), ("Graphs with ID %s are not isomorphic" %
                                 x.identifier)

    except Exception as e:

        if isinstance(e, AssertionError):
            failed_tests.append(uri)
            fails[str(e)] += 1
        else:
            error_tests.append(uri)
            errors[str(e)] += 1

        if (DEBUG_ERROR and not isinstance(e, AssertionError)) or DEBUG_FAIL:
            print("======================================")
            print(uri)
            print(name)
            print(comment)

            if not res:
                if syntax:
                    print("Positive syntax test")
                else:
                    print("Negative syntax test")

            if data:
                print("----------------- DATA --------------------")
                print(">>>", data)
                print(bopen_read_close(data[7:]))
            if graphdata:
                print("----------------- GRAPHDATA --------------------")
                for x, l in graphdata:
                    print(">>>", x, l)
                    print(bopen_read_close(x[7:]))

            print("----------------- Request -------------------")
            print(">>>", query)
            print(bopen_read_close(query[7:]))

            if res:
                if resdata:
                    print("----------------- RES DATA --------------------")
                    print(">>>", resdata)
                    print(bopen_read_close(resdata[7:]))
                if resgraphdata:
                    print(
                        "----------------- RES GRAPHDATA -------------------")
                    for x, l in resgraphdata:
                        print(">>>", x, l)
                        print(bopen_read_close(x[7:]))

            print("------------- MY RESULT ----------")
            print(g.serialize(format="trig"))

            try:
                pq = translateUpdate(parseUpdate(bopen_read_close(query[7:])))
                print("----------------- Parsed ------------------")
                pprintAlgebra(pq)
                # print pq
            except:
                print("(parser error)")

            print(decodeStringEscape(str(e)))

            import pdb

            pdb.post_mortem(sys.exc_info()[2])
        raise
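
A small self-contained sketch of the comparison idiom the test leans on: rdflib.compare.isomorphic matches graphs up to blank-node relabelling, which a naive comparison of triple sets cannot do (the URIs here are illustrative):

from rdflib import BNode, Graph, Literal, URIRef
from rdflib.compare import isomorphic

P = URIRef("http://example.org/p")
g1, g2 = Graph(), Graph()
g1.add((BNode("a"), P, Literal("v")))  # different blank-node labels ...
g2.add((BNode("b"), P, Literal("v")))  # ... but identical graph shape

assert isomorphic(g1, g2)   # structural match despite differing bnode ids
assert set(g1) != set(g2)   # a plain triple-set comparison says they differ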
Example No. 25
0
    # Info on the item
    g.add((item, RDF.type, saa.Item))
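    # saa.term('index') is needed here because rdflib Namespaces subclass str,
    # so plain attribute access (saa.index) would return the built-in str.index
    # method rather than a URIRef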
    g.add((item, saa.term('index'), Literal(record['assigned_item_no'])))

    if record['persistent_uid'] != "":
        g.add((item, saa.identifier, Literal(record['persistent_uid'])))

    g.add((item, RDFS.label, Literal(record['title'], lang='nl')))
    g.add((item, saa.artist, Literal(record['artist_name_1'])))
    g.add((item, saa.transcription, Literal(record['entry'], lang='nl')))
    g.add((item, saa.workType, Literal(record['object_type_1'], lang='nl')))

    if record['room'] != "":
        g.add((item, saa.room, Literal(record['room'], lang='nl')))

    if record['valuation_amount'] != "":
        g.add((item, saa.valuation, Literal(record['valuation_amount'])))

    return g


if __name__ == "__main__":

    ds = Dataset()
    ds.bind('ga', ga)
    ds.bind('saa', saa)

    ds = main(dataset=ds)
    ds.serialize('Dutch_Archival_Descriptions_Getty.trig', format='trig')
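
A toy sketch of the record-to-graph pattern above, with a hypothetical namespace and record (the real code reads these from archival source data); empty fields are guarded so no empty Literals enter the graph:

from rdflib import Dataset, Literal, Namespace, RDFS

saa = Namespace("http://example.org/saa/")  # hypothetical stand-in vocabulary
record = {'title': 'Stilleven', 'room': ''}

ds = Dataset()
item = saa['item1']
ds.add((item, RDFS.label, Literal(record['title'], lang='nl')))
if record['room'] != "":  # skip empty source fields
    ds.add((item, saa.room, Literal(record['room'], lang='nl')))

ds.serialize('toy_items.trig', format='trig')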
Example No. 26
0
    format="n3", base=base_one).decode("utf-8")
g7.base = base_two
assert "@base <http://two.org/> ." in g7.serialize(format="n3").decode("utf-8")
assert "@base <http://one.org/> ." in g7.serialize(
    format="n3", base=base_one).decode("utf-8")

# 8. checking results for TriX & TriG
# TriX can specify a base per graph, but setting a base for the whole dataset is also exercised below
base_three = Namespace("http://three.org/")
ds1 = Dataset()
ds1.bind("dct", DCTERMS)
ds1.bind("skos", SKOS)
g8 = ds1.graph(URIRef("http://g8.com/"), base=base_one)
g9 = ds1.graph(URIRef("http://g9.com/"))
g8 += g
g9 += g
g9.base = base_two
ds1.base = base_three

trix = ds1.serialize(format="trix",
                     base=Namespace("http://two.org/")).decode("utf-8")
assert '<graph xml:base="http://one.org/">' in trix
assert '<graph xml:base="http://two.org/">' in trix
assert '<TriX xml:base="http://two.org/"' in trix

trig = ds1.serialize(format="trig",
                     base=Namespace("http://two.org/")).decode("utf-8")
assert "@base <http://one.org/> ." not in trig
assert "@base <http://three.org/> ." not in trig
assert "@base <http://two.org/> ." in trig
Example No. 27
0
r1.ingredients.append(i1_1)
r1.tags.append(t1)

r1.add_prov("wasDerivedFrom", URIRef("http://recipes.com/r/Foo"))
r1.add_pub_info("wasAttributedTo", Literal("Jeff the Data Guy"))
summed = Dataset()

for quad in r1.__publish__():
    summed.add(quad)

summed.namespace_manager.bind("np", data.NP, True)
summed.namespace_manager.bind("recipe-kb", data.BASE, True)
summed.namespace_manager.bind("prov", data.PROV, True)

print(summed.serialize(format="trig").decode("utf-8"))

u1 = data.USDAEntry(12345, "CHEESE,SERIOUSLY SPICY", [])

l1 = data.Linkage(data.IngredientName(i1_1.name), u1)

summed = Dataset()

for quad in l1.__publish__():
    summed.add(quad)

summed.namespace_manager.bind("np", data.NP, True)
summed.namespace_manager.bind("recipe-kb", data.BASE, True)
summed.namespace_manager.bind("prov", data.PROV, True)

print(summed.serialize(format="trig").decode("utf-8"))
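
A hypothetical sketch of the quad protocol this snippet depends on: __publish__() is assumed to yield (subject, predicate, object, context) 4-tuples, and rdflib's Dataset.add() accepts such a quad, routing the triple into the named graph identified by the fourth element:

from rdflib import Dataset, Literal, URIRef

ASSERTION = URIRef("http://example.org/np1-assertion")  # hypothetical graph name

def publish():  # stand-in for the __publish__() methods used above
    s = URIRef("http://example.org/recipe/Foo")
    yield (s, URIRef("http://purl.org/dc/terms/title"), Literal("Foo"), ASSERTION)

summed = Dataset()
for quad in publish():
    summed.add(quad)  # 4-tuple: triple plus target graph

print(summed.serialize(format="trig").decode("utf-8"))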
Example No. 28
0
def update_test(t):

    # the update-eval tests refer to graphs on http://example.org
    rdflib_sparql_module.SPARQL_LOAD_GRAPHS = False

    uri, name, comment, data, graphdata, query, res, syntax = t

    if uri in skiptests:
        raise SkipTest()

    try:
        g = Dataset()

        if not res:
            if syntax:
                translateUpdate(parseUpdate(open(query[7:])))
            else:
                try:
                    translateUpdate(parseUpdate(open(query[7:])))
                    raise AssertionError("Query shouldn't have parsed!")
                except:
                    pass  # negative syntax test
            return

        resdata, resgraphdata = res

        # read input graphs
        if data:
            g.default_context.load(data, format=_fmt(data))

        if graphdata:
            for x, l in graphdata:
                g.load(x, publicID=URIRef(l), format=_fmt(x))

        req = translateUpdate(parseUpdate(open(query[7:])))
        evalUpdate(g, req)

        # read expected results
        resg = Dataset()
        if resdata:
            resg.default_context.load(resdata, format=_fmt(resdata))

        if resgraphdata:
            for x, l in resgraphdata:
                resg.load(x, publicID=URIRef(l), format=_fmt(x))

        eq(set(x.identifier for x in g.contexts() if x != g.default_context),
           set(x.identifier for x in resg.contexts()
               if x != resg.default_context), 'named graphs in datasets do not match')
        assert isomorphic(g.default_context, resg.default_context), \
            'Default graphs are not isomorphic'

        for x in g.contexts():
            if x == g.default_context:
                continue
            assert isomorphic(x, resg.get_context(x.identifier)), \
                "Graphs with ID %s are not isomorphic" % x.identifier

    except Exception as e:

        if isinstance(e, AssertionError):
            failed_tests.append(uri)
            fails[str(e)] += 1
        else:
            error_tests.append(uri)
            errors[str(e)] += 1

        if (DEBUG_ERROR and not isinstance(e, AssertionError)) or DEBUG_FAIL:
            print("======================================")
            print(uri)
            print(name)
            print(comment)

            if not res:
                if syntax:
                    print("Positive syntax test")
                else:
                    print("Negative syntax test")

            if data:
                print("----------------- DATA --------------------")
                print(">>>", data)
                print(open(data[7:]).read())
            if graphdata:
                print("----------------- GRAPHDATA --------------------")
                for x, l in graphdata:
                    print(">>>", x, l)
                    print(open(x[7:]).read())

            print("----------------- Request -------------------")
            print(">>>", query)
            print(open(query[7:]).read())

            if res:
                if resdata:
                    print("----------------- RES DATA --------------------")
                    print(">>>", resdata)
                    print(open(resdata[7:]).read())
                if resgraphdata:
                    print("----------------- RES GRAPHDATA -------------------")
                    for x, l in resgraphdata:
                        print(">>>", x, l)
                        print(open(x[7:]).read())

            print("------------- MY RESULT ----------")
            print(g.serialize(format='trig'))

            try:
                pq = translateUpdate(parseUpdate(open(query[7:]).read()))
                print("----------------- Parsed ------------------")
                pprintAlgebra(pq)
                # print(pq)
            except:
                print("(parser error)")

            print(decodeStringEscape(str(e)))

            import pdb
            pdb.post_mortem(sys.exc_info()[2])
        raise
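
The _fmt() helper used above is not shown; rdflib ships a helper with the same job, rdflib.util.guess_format, which maps a filename suffix to a parser name:

from rdflib.util import guess_format

assert guess_format("data/example.ttl") == "turtle"
assert guess_format("data/example.trig") == "trig"
assert guess_format("data/example.nq") == "nquads"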
Example No. 29
0
            # create triples containing subject (neurodkg instances), predicate (several are defined above), and object (neurodkg instances) and add them to the dataset
            dataset.add((NEURO_INST[subj], URIRef(predicate_to_uri[pred]),
                         NEURO_INST[obj]))
        # object id: differentiating between the cases of having a disease ID or not
        elif str(obj_id) != 'nan':
            print(obj_id)
            curie = obj_id.replace(' ', '').split(':')
            if len(curie) <= 1:
                print(obj_id)
            prefix = curie[0].lower()
            obj_id = curie[1]
            print(curie)
            # if a disease ID was found, then add the ID and ontology as object of the triple
            #obj_uri  = BASE[prefix+':'+obj_id]
            obj_uri = URIRef(prefix_dict[prefix] + obj_id)
            dataset.add(
                (NEURO_INST[subj], URIRef(predicate_to_uri[pred]), obj_uri))
            # also record the human-readable disease label on the object URI
            dataset.add((obj_uri, RDFS['label'], Literal(obj)))
        else:
            # no disease ID in an ontology: use the disease label (or a mapped URI) as the object
            if obj in object_to_uri:
                obj_uri = object_to_uri[obj]
                dataset.add((NEURO_INST[subj], URIRef(predicate_to_uri[pred]),
                             URIRef(obj_uri)))
            else:
                dataset.add((NEURO_INST[subj], URIRef(predicate_to_uri[pred]),
                             Literal(obj)))
        print("---------", index)
    # saving the dataset as a turtle file
    dataset.serialize('data/output/neuro_dkg.ttl', format='turtle')
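
A toy illustration of the CURIE handling above, with a hypothetical prefix_dict entry (the MONDO base URI follows the standard OBO pattern):

prefix_dict = {"mondo": "http://purl.obolibrary.org/obo/MONDO_"}  # hypothetical mapping

obj_id = "MONDO: 0005090"
prefix, local = obj_id.replace(" ", "").split(":")
obj_uri = prefix_dict[prefix.lower()] + local
assert obj_uri == "http://purl.obolibrary.org/obo/MONDO_0005090"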