Example 1
    def set_member(self, c_id, m_obj):
        if isinstance(m_obj, Model):
            m_obj = [m_obj]
        elif not isinstance(m_obj, list):
            raise ParseError()

        c_ldp_id = self.marmotta.ldp(encoder.encode(c_id))
        collection = self.get_collection(c_id).pop() # 404 if collection not found

        if len(set(m.id for m in m_obj)) != len(m_obj):  # reject duplicate member ids
            raise ForbiddenError()
        if not collection.capabilities.membershipIsMutable:
            raise ForbiddenError()
        if collection.capabilities.restrictedToType:
            for m in m_obj:
                if not(hasattr(m,"datatype") and m.datatype in collection.capabilities.restrictedToType):
                    raise ForbiddenError()
        if collection.capabilities.maxLength >= 0:
            size = self.sparql.size(c_ldp_id).bindings.pop().get(Variable('size'))
            if int(size) > collection.capabilities.maxLength-len(m_obj):
                raise ForbiddenError()  # "Operation forbidden. Collection of maximum size {} is full.".format(collection.capabilities.maxLength))

        ds = Dataset()
        ldp = ds.graph(identifier=LDP.ns)
        for m in m_obj:
            m_id = self.marmotta.ldp(encoder.encode(c_id)+"/member/"+encoder.encode(m.id))
            member = ds.graph(identifier=m_id)
            member += self.RDA.object_to_graph(member.identifier,m)
            ldp += LDP.add_contains(c_ldp_id+"/member",m_id,False)
        res = self.sparql.insert(ds)
        if res.status_code != 200:
            raise DBError()
        return m_obj
Example 2
    def set_service(self, s_obj):
        ds = Dataset()
        service = ds.graph(identifier=self.marmotta.ldp("service"))
        service += self.RDA.object_to_graph(service.identifier, s_obj)
        ldp = ds.graph(identifier=LDP.ns)
        ldp += LDP.add_contains(self.marmotta.ldp(), service.identifier, False)
        response = self.sparql.insert(ds)
        if response.status_code == 200:
            return s_obj
        else:
            raise DBError()
Example 3
    def set_collection(self, c_obj, over_write=False):
        if isinstance(c_obj, Model):
            c_obj = [c_obj]
        elif not isinstance(c_obj, list):
            raise ParseError()
        # create LD collection and declare as ldp:BasicContainer
        ds = Dataset()
        ldp = ds.graph(identifier=LDP.ns)
        for c in c_obj:
            c_id = encoder.encode(c.id)
            collection = ds.graph(identifier=self.marmotta.ldp(c_id))
            collection += self.RDA.object_to_graph(collection.identifier, c)
            ldp += LDP.add_contains(self.marmotta.ldp(), collection.identifier)
            member = ds.graph(identifier=self.marmotta.ldp(c_id + '/member'))
            ldp += LDP.add_contains(collection.identifier, member.identifier)
        ins = self.sparql.insert(ds)
        if ins.status_code == 200:
            return c_obj
        else:
            raise DBError()
Example 4
class Fragment(object):

    HYDRA = Namespace("http://www.w3.org/ns/hydra/core#")
    VOID = Namespace("http://rdfs.org/ns/void#")
    FOAF = Namespace("http://xmlns.com/foaf/0.1/")
    DCTERMS = Namespace("http://purl.org/dc/terms/")

    def __init__(self):
        self.rdf_graph = Dataset()

    def add_data_triple(self, subject, predicate, obj):
        self.rdf_graph.add((subject, predicate, obj))

    def add_graph(self, identifier):
        self.rdf_graph.graph(identifier)

    def add_meta_quad(self, graph, subject, predicate, obj):
        # rdflib expects quads in (subject, predicate, object, graph) order
        self.rdf_graph.add((subject, predicate, obj, graph))

    def add_prefix(self, prefix, uri):
        self.rdf_graph.bind(prefix, uri)

    def serialize(self):
        return self.rdf_graph.serialize(format="trig", encoding="utf-8")
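
A minimal usage sketch for the Fragment class above (the example.org URIs and property names are illustrative, not part of the original code):

from rdflib import Namespace, Literal

EX = Namespace("http://example.org/")

fragment = Fragment()
fragment.add_prefix("ex", EX)
fragment.add_graph(EX.meta)                                     # register a named graph
fragment.add_data_triple(EX.doc, EX.title, Literal("A fragment"))
fragment.add_meta_quad(EX.meta, EX.doc, EX.creator, EX.alice)   # quad lands in the named graph
print(fragment.serialize().decode("utf-8"))                     # TriG output (bytes)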
Example 5
class DatasetTestCase(unittest.TestCase):
    store = 'default'
    slow = True
    tmppath = None

    def setUp(self):
        try:
            self.graph = Dataset(store=self.store)
        except ImportError:
            raise SkipTest(
                "Dependencies for store '%s' not available!" % self.store)
        if self.store == "SQLite":
            _, self.tmppath = mkstemp(
                prefix='test', dir='/tmp', suffix='.sqlite')
        elif self.store == "SPARQLUpdateStore":
            root = HOST + DB
            self.graph.open((root + "sparql", root + "update"))
        else:
            self.tmppath = mkdtemp()

        if self.store != "SPARQLUpdateStore":
            self.graph.open(self.tmppath, create=True)
        self.michel = URIRef(u'urn:michel')
        self.tarek = URIRef(u'urn:tarek')
        self.bob = URIRef(u'urn:bob')
        self.likes = URIRef(u'urn:likes')
        self.hates = URIRef(u'urn:hates')
        self.pizza = URIRef(u'urn:pizza')
        self.cheese = URIRef(u'urn:cheese')

        # Use regular URIs because SPARQL endpoints like Fuseki alter short names
        self.c1 = URIRef(u'urn:context-1')
        self.c2 = URIRef(u'urn:context-2')

        # delete the graph for each test!
        self.graph.remove((None, None, None))
        for c in self.graph.contexts():
            c.remove((None, None, None))
            assert len(c) == 0
            self.graph.remove_graph(c)

    def tearDown(self):
        self.graph.close()
        if self.store == "SPARQLUpdateStore":
            pass
        else:
            if os.path.isdir(self.tmppath):
                shutil.rmtree(self.tmppath)
            else:
                os.remove(self.tmppath)

    def testGraphAware(self):

        if not self.graph.store.graph_aware:
            return

        g = self.graph
        g1 = g.graph(self.c1)

        # Some SPARQL endpoint backends (e.g. TDB) do not consider
        # empty named graphs
        if self.store != "SPARQLUpdateStore":
            # added graph exists
            self.assertEqual(set(x.identifier for x in self.graph.contexts()),
                             set([self.c1, DATASET_DEFAULT_GRAPH_ID]))

        # added graph is empty
        self.assertEqual(len(g1), 0)

        g1.add((self.tarek, self.likes, self.pizza))

        # added graph still exists
        self.assertEqual(set(x.identifier for x in self.graph.contexts()),
                         set([self.c1, DATASET_DEFAULT_GRAPH_ID]))

        # added graph contains one triple
        self.assertEqual(len(g1), 1)

        g1.remove((self.tarek, self.likes, self.pizza))

        # added graph is empty
        self.assertEqual(len(g1), 0)

        # Some SPARQL endpoint backends (e.g. TDB) do not consider
        # empty named graphs
        if self.store != "SPARQLUpdateStore":
            # graph still exists, although empty
            self.assertEqual(set(x.identifier for x in self.graph.contexts()),
                             set([self.c1, DATASET_DEFAULT_GRAPH_ID]))

        g.remove_graph(self.c1)

        # graph is gone
        self.assertEqual(set(x.identifier for x in self.graph.contexts()),
                         set([DATASET_DEFAULT_GRAPH_ID]))

    def testDefaultGraph(self):
        # Sometimes the default graph is read-only (e.g. TDB in union mode)
        if self.store == "SPARQLUpdateStore":
            print("Please make sure updating the default graph "
                  "is supported by your SPARQL endpoint")

        self.graph.add((self.tarek, self.likes, self.pizza))
        self.assertEqual(len(self.graph), 1)
        # only default exists
        self.assertEqual(set(x.identifier for x in self.graph.contexts()),
                         set([DATASET_DEFAULT_GRAPH_ID]))

        # removing default graph removes triples but not actual graph
        self.graph.remove_graph(DATASET_DEFAULT_GRAPH_ID)

        self.assertEqual(len(self.graph), 0)
        # default still exists
        self.assertEqual(set(x.identifier for x in self.graph.contexts()),
                         set([DATASET_DEFAULT_GRAPH_ID]))

    def testNotUnion(self):
        # Union depends on the SPARQL endpoint configuration
        if self.store == "SPARQLUpdateStore":
            print("Please make sure your SPARQL endpoint has not configured "
                  "its default graph as the union of the named graphs")
        g1 = self.graph.graph(self.c1)
        g1.add((self.tarek, self.likes, self.pizza))

        self.assertEqual(list(self.graph.objects(self.tarek, None)),
                         [])
        self.assertEqual(list(g1.objects(self.tarek, None)), [self.pizza])
Example 6
class DatasetTestCase(unittest.TestCase):
    store = 'default'
    slow = True
    tmppath = None

    def setUp(self):
        try:
            self.graph = Dataset(store=self.store)
        except ImportError:
            raise SkipTest(
                "Dependencies for store '%s' not available!" % self.store)
        if self.store == "SQLite":
            _, self.tmppath = mkstemp(
                prefix='test', dir='/tmp', suffix='.sqlite')
        else:
            self.tmppath = mkdtemp()
        self.graph.open(self.tmppath, create=True)
        self.michel = URIRef(u'michel')
        self.tarek = URIRef(u'tarek')
        self.bob = URIRef(u'bob')
        self.likes = URIRef(u'likes')
        self.hates = URIRef(u'hates')
        self.pizza = URIRef(u'pizza')
        self.cheese = URIRef(u'cheese')

        self.c1 = URIRef(u'context-1')
        self.c2 = URIRef(u'context-2')

        # delete the graph for each test!
        self.graph.remove((None, None, None))

    def tearDown(self):
        self.graph.close()
        if os.path.isdir(self.tmppath):
            shutil.rmtree(self.tmppath)
        else:
            os.remove(self.tmppath)


    def testGraphAware(self):
        if not self.graph.store.graph_aware:
            return

        g = self.graph
        g1 = g.graph(self.c1)

        # added graph exists
        self.assertEqual(set(x.identifier for x in self.graph.contexts()),
                         set([self.c1, DATASET_DEFAULT_GRAPH_ID]))

        # added graph is empty
        self.assertEqual(len(g1), 0)

        g1.add((self.tarek, self.likes, self.pizza))

        # added graph still exists
        self.assertEqual(set(x.identifier for x in self.graph.contexts()),
                         set([self.c1, DATASET_DEFAULT_GRAPH_ID]))

        # added graph contains one triple
        self.assertEqual(len(g1), 1)

        g1.remove((self.tarek, self.likes, self.pizza))

        # added graph is empty
        self.assertEqual(len(g1), 0)

        # graph still exists, although empty
        self.assertEqual(set(x.identifier for x in self.graph.contexts()),
                         set([self.c1, DATASET_DEFAULT_GRAPH_ID]))

        g.remove_graph(self.c1)

        # graph is gone
        self.assertEqual(set(x.identifier for x in self.graph.contexts()),
                         set([DATASET_DEFAULT_GRAPH_ID]))

    def testDefaultGraph(self):
        self.graph.add((self.tarek, self.likes, self.pizza))
        self.assertEqual(len(self.graph), 1)
        # only default exists
        self.assertEqual(set(x.identifier for x in self.graph.contexts()),
                         set([DATASET_DEFAULT_GRAPH_ID]))

        # removing default graph removes triples but not actual graph
        self.graph.remove_graph(DATASET_DEFAULT_GRAPH_ID)

        self.assertEqual(len(self.graph), 0)
        # default still exists
        self.assertEqual(set(x.identifier for x in self.graph.contexts()),
                         set([DATASET_DEFAULT_GRAPH_ID]))

    def testNotUnion(self):
        g1 = self.graph.graph(self.c1)
        g1.add((self.tarek, self.likes, self.pizza))

        self.assertEqual(list(self.graph.objects(self.tarek, None)),
                         [])
        self.assertEqual(list(g1.objects(self.tarek, None)), [self.pizza])
Example 7
def main(source, target, geometryfile='data/point2wkt.json'):
    with open(source) as infile:
        data = json.load(infile)

    with open(geometryfile) as infile:
        point2wkt = json.load(infile)

    ds = Dataset()
    dataset = lp.term('')

    g = rdfSubject.db = ds.graph(identifier=lp)

    ### Custom triples / Ontology

    g.add((lpOnt.Adres, OWL.equivalentClass, schema.PostalAddress))

    g.add((lpOnt.Straat, OWL.equivalentClass, hg.Street))
    g.add((lpOnt.Buurt, OWL.equivalentClass, hg.Neighbourhood))

    g.add((lpOnt.adres, OWL.equivalentProperty, schema.address))

    ########
    # Data #
    ########

    adres2locatie = defaultdict(lambda: defaultdict(list))

    for n, adresLabel in enumerate(data, 1):

        if n % 5000 == 0:
            print(f"{n}/{len(data)}", end='\r')
            # break

        # # geometry
        # wkt = point2wkt.get(locatiepunt)

        # wktLiteral = Literal(wkt, datatype=geo.wktLiteral)
        # geometry = Geometry(lpGeo.term(str(locatiepunt)),
        #                     asWKT=wktLiteral,
        #                     label=[str(locatiepunt)])

        addresses = getAdres(data[adresLabel], adresLabel, point2wkt)

        # adres2locatie[adres][year].append(geometry)

        # observations.append(locpdetail)
        # locp.observation = observations

        # addresses.append(
        #     Role(
        #         None,
        #         label=address.label,
        #         address=address,
        #         hasLatestBeginTimeStamp=locpdetail.hasLatestBeginTimeStamp,
        #         hasEarliestEndTimeStamp=locpdetail.hasEarliestEndTimeStamp,
        #         startDate=Literal(year, datatype=XSD.gYear)))

    ds.bind('create', create)
    ds.bind('schema', schema)
    ds.bind('sem', sem)
    ds.bind('geo', geo)
    ds.bind('juso', juso)
    ds.bind('qb', qb)
    ds.bind('void', void)

    print("Serializing!")
    ds.serialize(target, format='trig')
Example 8
    # We initialize a dataset, and bind our namespaces
    dataset = Dataset()
    dataset.bind('g13data', DATA)
    dataset.bind('g13vocab', VOCAB)
    dataset.bind('g13set', SETNAME)
    dataset.bind('geo', GEO)
    dataset.bind('geof', GEOF)
    dataset.bind('dbo', DBO)
    dataset.bind('dbp', DBP)
    dataset.bind('schema', SCHEMA)
    dataset.bind('vcard', VCARD)
    dataset.bind('wgs', WGS)
    dataset.bind('void', VOID)

    # We then get a new dataset object with our URI from the dataset.
    graph = dataset.graph(graph_uri)

    # Load the externally defined schema into the default graph (context) of the dataset
    #dataset.default_context.parse('vocab.ttl', format='turtle')

    # Let's iterate over the dictionary, and create some triples
    # Let's pretend we know exactly what the 'schema' of our CSV file is
    for row in csv_contents:
        thing = ""

        points = ""
        lat = ""
        lng = ""

        name = ""
        lat = ""
Example 9
def data_structure_definition(profile, dataset_name, dataset_base_uri,
                              variables, source_path, source_hash):
    """Converts the dataset + variables to a set of rdflib Graphs (a nanopublication with provenance annotations)
    that contains the data structure definition (from the DataCube vocabulary) and
    the mappings to external datasets.

    Arguments:
    profile          -- the Google signin profile
    dataset_name     -- the name of the dataset
    dataset_base_uri -- the base URI under which the dataset resources are minted
    variables        -- the list of dictionaries with the variables and their mappings to URIs
    source_path      -- the path to the dataset file that was annotated
    source_hash      -- the Git hash of the dataset file version of the dataset

    :returns: an RDF graph store containing a nanopublication
    """
    BASE = Namespace('{}/'.format(dataset_base_uri))
    dataset_uri = URIRef(dataset_base_uri)

    # Initialize a conjunctive graph for the whole lot
    rdf_dataset = Dataset()
    rdf_dataset.bind('qbrv', QBRV)
    rdf_dataset.bind('qbr', QBR)
    rdf_dataset.bind('qb', QB)
    rdf_dataset.bind('skos', SKOS)
    rdf_dataset.bind('prov', PROV)
    rdf_dataset.bind('np', NP)
    rdf_dataset.bind('foaf', FOAF)

    # Initialize the graphs needed for the nanopublication
    timestamp = datetime.datetime.utcnow().strftime("%Y-%m-%dT%H:%M")

    # Shorten the source hash to 8 digits (similar to Github)
    source_hash = source_hash[:8]

    hash_part = source_hash + '/' + timestamp

    # The Nanopublication consists of three graphs
    assertion_graph_uri = BASE['assertion/' + hash_part]
    assertion_graph = rdf_dataset.graph(assertion_graph_uri)

    provenance_graph_uri = BASE['provenance/' + hash_part]
    provenance_graph = rdf_dataset.graph(provenance_graph_uri)

    pubinfo_graph_uri = BASE['pubinfo/' + hash_part]
    pubinfo_graph = rdf_dataset.graph(pubinfo_graph_uri)

    # A URI that represents the author
    author_uri = QBR['person/' + profile['email']]

    rdf_dataset.add((author_uri, RDF.type, FOAF['Person']))
    rdf_dataset.add((author_uri, FOAF['name'], Literal(profile['name'])))
    rdf_dataset.add((author_uri, FOAF['email'], Literal(profile['email'])))
    rdf_dataset.add((author_uri, QBRV['googleId'], Literal(profile['id'])))
    try:
        rdf_dataset.add(
            (author_uri, FOAF['depiction'], URIRef(profile['image'])))
    except KeyError:
        pass

    # A URI that represents the version of the dataset source file
    dataset_version_uri = BASE[source_hash]

    # Some information about the source file used
    rdf_dataset.add((dataset_version_uri, QBRV['path'],
                     Literal(source_path, datatype=XSD.string)))
    rdf_dataset.add((dataset_version_uri, QBRV['sha1_hash'],
                     Literal(source_hash, datatype=XSD.string)))

    # ----
    # The nanopublication itself
    # ----
    nanopublication_uri = BASE['nanopublication/' + hash_part]

    rdf_dataset.add((nanopublication_uri, RDF.type, NP['Nanopublication']))
    rdf_dataset.add(
        (nanopublication_uri, NP['hasAssertion'], assertion_graph_uri))
    rdf_dataset.add((assertion_graph_uri, RDF.type, NP['Assertion']))
    rdf_dataset.add(
        (nanopublication_uri, NP['hasProvenance'], provenance_graph_uri))
    rdf_dataset.add((provenance_graph_uri, RDF.type, NP['Provenance']))
    rdf_dataset.add(
        (nanopublication_uri, NP['hasPublicationInfo'], pubinfo_graph_uri))
    rdf_dataset.add((pubinfo_graph_uri, RDF.type, NP['PublicationInfo']))

    # ----
    # The provenance graph
    # ----

    # Provenance information for the assertion graph (the data structure definition itself)
    provenance_graph.add(
        (assertion_graph_uri, PROV['wasDerivedFrom'], dataset_version_uri))
    provenance_graph.add(
        (dataset_uri, PROV['wasDerivedFrom'], dataset_version_uri))
    provenance_graph.add((assertion_graph_uri, PROV['generatedAtTime'],
                          Literal(timestamp, datatype=XSD.dateTime)))
    provenance_graph.add(
        (assertion_graph_uri, PROV['wasAttributedTo'], author_uri))

    # ----
    # The publication info graph
    # ----

    # The URI of the latest version of QBer
    # TODO: should point to the actual latest commit of this QBer source file.
    # TODO: consider linking to this as the plan of some activity, rather than an activity itself.
    qber_uri = URIRef('https://github.com/CLARIAH/qber.git')

    pubinfo_graph.add((nanopublication_uri, PROV['wasGeneratedBy'], qber_uri))
    pubinfo_graph.add((nanopublication_uri, PROV['generatedAtTime'],
                       Literal(timestamp, datatype=XSD.dateTime)))
    pubinfo_graph.add(
        (nanopublication_uri, PROV['wasAttributedTo'], author_uri))

    # ----
    # The assertion graph
    # ----

    structure_uri = BASE['structure']

    assertion_graph.add((dataset_uri, RDF.type, QB['DataSet']))
    assertion_graph.add((dataset_uri, RDFS.label, Literal(dataset_name)))
    assertion_graph.add(
        (structure_uri, RDF.type, QB['DataStructureDefinition']))

    assertion_graph.add((dataset_uri, QB['structure'], structure_uri))

    for variable_id, variable in variables.items():
        variable_uri = URIRef(variable['original']['uri'])
        variable_label = Literal(variable['original']['label'])
        variable_type = URIRef(variable['type'])

        codelist_uri = URIRef(variable['codelist']['original']['uri'])
        codelist_label = Literal(variable['codelist']['original']['label'])

        # The variable as component of the definition
        component_uri = safe_url(BASE,
                                 'component/' + variable['original']['label'])

        # Add link between the definition and the component
        assertion_graph.add((structure_uri, QB['component'], component_uri))

        # Add label to variable
        # TODO: We may need to do something with a changed label for the variable
        assertion_graph.add((variable_uri, RDFS.label, variable_label))

        if 'description' in variable and variable['description'] != "":
            assertion_graph.add(
                (variable_uri, RDFS.comment, Literal(variable['description'])))

        # If the variable URI is not the same as the original,
        # it is a specialization of a prior variable property.
        if variable['uri'] != str(variable_uri):
            assertion_graph.add(
                (variable_uri, RDFS['subPropertyOf'], URIRef(variable['uri'])))

        if variable_type == QB['DimensionProperty']:
            assertion_graph.add((variable_uri, RDF.type, variable_type))
            assertion_graph.add((component_uri, QB['dimension'], variable_uri))

            # Coded variables are also of type coded property (a subproperty of dimension property)
            if variable['category'] == 'coded':
                assertion_graph.add(
                    (variable_uri, RDF.type, QB['CodedProperty']))

        elif variable_type == QB['MeasureProperty']:
            # The category 'other'
            assertion_graph.add((variable_uri, RDF.type, variable_type))
            assertion_graph.add((component_uri, QB['measure'], variable_uri))
        elif variable_type == QB['AttributeProperty']:
            # Actually never produced by QBer at this stage
            assertion_graph.add((variable_uri, RDF.type, variable_type))
            assertion_graph.add((component_uri, QB['attribute'], variable_uri))

        # If this variable is of category 'coded', we add codelist and URIs for
        # each variable (including mappings between value uris and etc....)
        if variable['category'] == 'coded':
            assertion_graph.add((codelist_uri, RDF.type, SKOS['Collection']))
            assertion_graph.add(
                (codelist_uri, RDFS.label, Literal(codelist_label)))

            # The variable should point to the codelist
            assertion_graph.add((variable_uri, QB['codeList'], codelist_uri))

            # The variable is mapped onto an external code list.
            # If the codelist uri is not the same as the original one, we
            # have a derived codelist.
            if variable['codelist']['uri'] != str(codelist_uri):
                assertion_graph.add((codelist_uri, PROV['wasDerivedFrom'],
                                     URIRef(variable['codelist']['uri'])))

            # Generate a SKOS concept for each of the values and map it to the
            # assigned codelist
            for value in variable['values']:
                value_uri = URIRef(value['original']['uri'])
                value_label = Literal(value['original']['label'])

                assertion_graph.add((value_uri, RDF.type, SKOS['Concept']))
                assertion_graph.add(
                    (value_uri, SKOS['prefLabel'], Literal(value_label)))
                assertion_graph.add((codelist_uri, SKOS['member'], value_uri))

                # The value has been changed, and therefore there is a mapping
                if value['original']['uri'] != value['uri']:
                    assertion_graph.add(
                        (value_uri, SKOS['exactMatch'], URIRef(value['uri'])))
                    assertion_graph.add(
                        (value_uri, RDFS.label, Literal(value['label'])))

        elif variable['category'] == 'identifier':
            # Generate a SKOS concept for each of the values
            for value in variable['values']:
                value_uri = URIRef(value['original']['uri'])
                value_label = Literal(value['original']['label'])

                assertion_graph.add((value_uri, RDF.type, SKOS['Concept']))
                assertion_graph.add(
                    (value_uri, SKOS['prefLabel'], value_label))

                # The value has been changed, and therefore there is a mapping
                if value['original']['uri'] != value['uri']:
                    assertion_graph.add(
                        (value_uri, SKOS['exactMatch'], URIRef(value['uri'])))
                    assertion_graph.add(
                        (value_uri, RDFS.label, Literal(value['label'])))

        elif variable['category'] == 'other':
            # Generate a literal for each of the values when converting the dataset (but not here)
            pass

    return rdf_dataset
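
An illustrative call, as a sketch only: the profile keys and the shape of the variables dictionary mirror what the function reads above, while the URIs, file path and hash are placeholder values (QB is assumed to be the bound DataCube namespace):

profile = {"email": "alice@example.org", "name": "Alice", "id": "1234567890"}
variables = {
    "sex": {
        "original": {"uri": "http://example.org/variable/sex", "label": "sex"},
        "uri": "http://example.org/variable/sex",
        "label": "sex",
        "description": "Sex of the respondent",
        "type": str(QB['DimensionProperty']),
        "category": "coded",
        "codelist": {
            "original": {"uri": "http://example.org/codelist/sex", "label": "sex codes"},
            "uri": "http://example.org/codelist/sex",
        },
        "values": [
            {"original": {"uri": "http://example.org/code/male", "label": "male"},
             "uri": "http://example.org/code/male", "label": "male"},
        ],
    }
}

rdf_dataset = data_structure_definition(
    profile, "example-dataset", "http://example.org/dataset",
    variables, "data/example.csv", "0123456789abcdef")
print(rdf_dataset.serialize(format='trig'))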
Example 10
class RDFtoUmlDiagram():
    """
    Transform a RDF dataset to an UML diagram
    """

    def __init__(self, showObjs, showClasses, namespace):
        self.ds = Dataset()
        self.d = UmlPygraphVizDiagram()
        self.show_objs = showObjs
        self.show_classes = showClasses
        self.namespace = namespace
        self.add_namespaces(self.namespace)

    def load_rdf(self, filename, input_format=None):
        if input_format:
            rdf_format = input_format
        elif filename is not sys.stdin:
            format_list = {'.xml': 'xml',
                           '.rdf': 'xml',
                           '.owl': 'xml',
                           '.n3': 'n3',
                           '.ttl': 'turtle',
                           '.nt': 'nt',
                           '.trig': 'trig',
                           '.nq': 'nquads',
                           '': 'turtle'}
            extension = splitext(filename.name)[1]
            rdf_format = format_list[extension]
        else:
            rdf_format = 'turtle'
        print("using rdf format: " + rdf_format)
        temp = self.ds.graph("file://"+filename.name)
        temp.parse(filename.name, format=rdf_format)

    def add_namespaces(self, namespaces):
        if namespaces:
            for ns in namespaces:
                self.ds.namespace_manager.bind(ns[0],ns[1])
    
    def start_subgraph(self, graph_name):
        self.d.start_subgraph(graph_name.strip('[<>]:_'))


    def add_object_node(self, object_name, classes_name, attributes):
        self.d.add_object_node(self.ds.namespace_manager.qname(object_name), classes_name, attributes)

    def add_class_node(self, class_name, attributes):
        self.d.add_class_node(self.ds.namespace_manager.qname(class_name), attributes)

    def add_edge(self, src, dst, predicate):
        self.d.add_edge(self.ds.namespace_manager.qname(src), self.ds.namespace_manager.qname(dst), self.ds.namespace_manager.qname(predicate))

    def add_subclass_edge(self, src, dst):
        self.d.add_subclass_edge(self.ds.namespace_manager.qname(src), self.ds.namespace_manager.qname(dst))

    def create_namespace_box(self):
        # Create Namespace box
        label = """<
            <table align="left" cellborder="0">
                <tr><td align='center' colspan='2'><b>Namespaces</b></td></tr>"""
        for ns in sorted(self.ds.namespaces()):
            label += "<tr><td align='left'>%s:</td><td align='left'>%s</td></tr>" % (ns[0], ns[1] )
        label += "</table> >"

        self.d.set_label(label)

    def output_dot(self, filename):
        self.d.write_to_file(filename)

    def visualize(self, filename):
        self.d.visualize(filename, self.ds.namespaces())


    def create_diagram(self, object_nodes=True, class_nodes=False):
        # Iterate over all graphs
        for graph in self.ds.contexts():
            graph_name = graph.n3()
            if graph_name == "[<urn:x-rdflib:default>]":
                continue  # skip the default graph but keep iterating the named graphs
            graph = graph.skolemize()
            if len(graph) > 0:
                self.start_subgraph(graph_name)
                if self.show_objs:
                    self.create_object_nodes(graph)
                if self.show_classes:
                    self.create_class_nodes(graph)
        self.d.add_undescribed_nodes()
        self.create_namespace_box()


    def create_object_nodes(self, graph):
        # object nodes
        query_nodes = """PREFIX owl: <http://www.w3.org/2002/07/owl#>
                    SELECT DISTINCT ?node
                    WHERE {
                        ?node a ?class.
                        FILTER (?class not IN (rdfs:Class, owl:Class, owl:Property, owl:ObjectProperty, owl:DatatypeProperty))
                    } ORDER BY ?node"""
        result_nodes = graph.query(query_nodes)
        for row_nodes in result_nodes:
            # adding the classes to the node (can be more than one)
            query_classes = """SELECT DISTINCT ?class
                    WHERE {
                        %s a ?class.
                    } ORDER BY ?class""" % row_nodes['node'].n3()
            result_classes = graph.query(query_classes)
            classes = []
            for row_classes in result_classes:
                if not self.show_classes:
                    classes.append(self.ds.namespace_manager.qname(row_classes['class']))
                else:
                    self.add_edge(row_nodes['node'], row_classes['class'],
                                  "http://www.w3.org/1999/02/22-rdf-syntax-ns#type")
                

            # adding the attributes to the node
            query_attributes = """SELECT DISTINCT ?p ?o
                        WHERE {
                            %s ?p ?o.
                            FILTER (isLiteral(?o))
                        } ORDER BY ?p ?o""" % row_nodes['node'].n3()
            result_attributes = graph.query(query_attributes)
            attributes = []
            for row_attributes in result_attributes:
                attributes.append(
                    self.ds.namespace_manager.qname(row_attributes['p']) + " = " + str(row_attributes['o']))
            self.add_object_node(row_nodes['node'], ", ".join(classes), attributes)

        # object node connections
        query_connections = """SELECT DISTINCT ?c1 ?c2 ?p
                    WHERE {
                        ?c1 ?p ?c2.
                        FILTER (!isLiteral(?c2))
                        FILTER (?p not IN (rdf:type, rdfs:domain, rdfs:range, rdfs:subClassOf))
                    } ORDER BY ?c1 ?p ?c2"""
        result_connections = graph.query(query_connections)
        for row_connections in result_connections:
            self.add_edge(row_connections['c1'], row_connections['c2'], row_connections['p'])


    def create_class_nodes(self, graph):
        # RDFS stuff
        query_classes = """PREFIX owl: <http://www.w3.org/2002/07/owl#>
                    SELECT DISTINCT ?class
                    WHERE {
                        ?class a ?c .
                        FILTER (?c in (rdfs:Class, owl:Class))
                    } ORDER BY ?class"""
        result_classes = graph.query(query_classes)
        for row_classes in result_classes:
            query_datatype_property = """
                PREFIX owl: <http://www.w3.org/2002/07/owl#>
                PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
                SELECT DISTINCT ?property ?range
                        WHERE {
                            ?property rdfs:domain %s;
                                a owl:DatatypeProperty.
                            OPTIONAL{ ?property rdfs:range ?range. }
                        } ORDER BY ?property""" % row_classes['class'].n3()
            result_datatype_property = graph.query(query_datatype_property)
            attributes = []
            for r in result_datatype_property:
                text = self.ds.namespace_manager.qname(r['property'])
                if r['range']:
                    text += " = " + self.ds.namespace_manager.qname(r['range'])
                attributes.append(text)
            self.add_class_node(row_classes['class'], attributes)

        query_object_property = """SELECT DISTINCT ?src ?dest ?property
                    WHERE {
                       ?property a <http://www.w3.org/2002/07/owl#ObjectProperty>;
                            rdfs:domain ?src;
                            rdfs:range ?dest.
                    } ORDER BY ?src ?property ?dest"""
        result_object_property = graph.query(query_object_property)
        for row_object_property in result_object_property:
            self.add_edge(row_object_property['src'], row_object_property['dest'], row_object_property['property'])

        query_subclass = """SELECT DISTINCT ?src ?dest
                    WHERE {
                       ?src rdfs:subClassOf ?dest.
                    } ORDER BY ?src ?dest"""
        result_subclass = graph.query(query_subclass)
        for row_subclass in result_subclass:
            self.add_subclass_edge(row_subclass['src'], row_subclass['dest'])
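
A hedged usage sketch for the converter above; the input file, output paths and namespace prefix are placeholders, and UmlPygraphVizDiagram is assumed to be provided by the surrounding project:

with open("data.trig") as rdf_file:
    diagram = RDFtoUmlDiagram(showObjs=True, showClasses=False,
                              namespace=[("ex", "http://example.org/")])
    diagram.load_rdf(rdf_file)        # format is inferred from the .trig extension
    diagram.create_diagram(object_nodes=True, class_nodes=False)
    diagram.output_dot("diagram.dot")
    diagram.visualize("diagram.png")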
Example 11
def data_structure_definition(profile, dataset_name, dataset_base_uri, variables, source_path, source_hash):
    """Converts the dataset + variables to a set of rdflib Graphs (a nanopublication with provenance annotations)
    that contains the data structure definition (from the DataCube vocabulary) and
    the mappings to external datasets.

    Arguments:
    profile          -- the Google signin profile
    dataset_name     -- the name of the dataset
    dataset_base_uri -- the base URI under which the dataset resources are minted
    variables        -- the list of dictionaries with the variables and their mappings to URIs
    source_path      -- the path to the dataset file that was annotated
    source_hash      -- the Git hash of the dataset file version of the dataset

    :returns: an RDF graph store containing a nanopublication
    """
    BASE = Namespace("{}/".format(dataset_base_uri))
    dataset_uri = URIRef(dataset_base_uri)

    # Initialize a conjunctive graph for the whole lot
    rdf_dataset = Dataset()
    rdf_dataset.bind("qbrv", QBRV)
    rdf_dataset.bind("qbr", QBR)
    rdf_dataset.bind("qb", QB)
    rdf_dataset.bind("skos", SKOS)
    rdf_dataset.bind("prov", PROV)
    rdf_dataset.bind("np", NP)
    rdf_dataset.bind("foaf", FOAF)

    # Initialize the graphs needed for the nanopublication
    timestamp = datetime.datetime.utcnow().strftime("%Y-%m-%dT%H:%M")

    # Shorten the source hash to 8 digits (similar to Github)
    source_hash = source_hash[:8]

    hash_part = source_hash + "/" + timestamp

    # The Nanopublication consists of three graphs
    assertion_graph_uri = BASE["assertion/" + hash_part]
    assertion_graph = rdf_dataset.graph(assertion_graph_uri)

    provenance_graph_uri = BASE["provenance/" + hash_part]
    provenance_graph = rdf_dataset.graph(provenance_graph_uri)

    pubinfo_graph_uri = BASE["pubinfo/" + hash_part]
    pubinfo_graph = rdf_dataset.graph(pubinfo_graph_uri)

    # A URI that represents the author
    author_uri = QBR["person/" + profile["email"]]

    rdf_dataset.add((author_uri, RDF.type, FOAF["Person"]))
    rdf_dataset.add((author_uri, FOAF["name"], Literal(profile["name"])))
    rdf_dataset.add((author_uri, FOAF["email"], Literal(profile["email"])))
    rdf_dataset.add((author_uri, QBRV["googleId"], Literal(profile["id"])))
    try:
        rdf_dataset.add((author_uri, FOAF["depiction"], URIRef(profile["image"])))
    except KeyError:
        pass

    # A URI that represents the version of the dataset source file
    dataset_version_uri = BASE[source_hash]

    # Some information about the source file used
    rdf_dataset.add((dataset_version_uri, QBRV["path"], Literal(source_path, datatype=XSD.string)))
    rdf_dataset.add((dataset_version_uri, QBRV["sha1_hash"], Literal(source_hash, datatype=XSD.string)))

    # ----
    # The nanopublication itself
    # ----
    nanopublication_uri = BASE["nanopublication/" + hash_part]

    rdf_dataset.add((nanopublication_uri, RDF.type, NP["Nanopublication"]))
    rdf_dataset.add((nanopublication_uri, NP["hasAssertion"], assertion_graph_uri))
    rdf_dataset.add((assertion_graph_uri, RDF.type, NP["Assertion"]))
    rdf_dataset.add((nanopublication_uri, NP["hasProvenance"], provenance_graph_uri))
    rdf_dataset.add((provenance_graph_uri, RDF.type, NP["Provenance"]))
    rdf_dataset.add((nanopublication_uri, NP["hasPublicationInfo"], pubinfo_graph_uri))
    rdf_dataset.add((pubinfo_graph_uri, RDF.type, NP["PublicationInfo"]))

    # ----
    # The provenance graph
    # ----

    # Provenance information for the assertion graph (the data structure definition itself)
    provenance_graph.add((assertion_graph_uri, PROV["wasDerivedFrom"], dataset_version_uri))
    provenance_graph.add((dataset_uri, PROV["wasDerivedFrom"], dataset_version_uri))
    provenance_graph.add((assertion_graph_uri, PROV["generatedAtTime"], Literal(timestamp, datatype=XSD.datetime)))
    provenance_graph.add((assertion_graph_uri, PROV["wasAttributedTo"], author_uri))

    # ----
    # The publication info graph
    # ----

    # The URI of the latest version of QBer
    # TODO: should point to the actual latest commit of this QBer source file.
    # TODO: consider linking to this as the plan of some activity, rather than an activity itself.
    qber_uri = URIRef("https://github.com/CLARIAH/qber.git")

    pubinfo_graph.add((nanopublication_uri, PROV["wasGeneratedBy"], qber_uri))
    pubinfo_graph.add((nanopublication_uri, PROV["generatedAtTime"], Literal(timestamp, datatype=XSD.datetime)))
    pubinfo_graph.add((nanopublication_uri, PROV["wasAttributedTo"], author_uri))

    # ----
    # The assertion graph
    # ----

    structure_uri = BASE["structure"]

    assertion_graph.add((dataset_uri, RDF.type, QB["DataSet"]))
    assertion_graph.add((dataset_uri, RDFS.label, Literal(dataset_name)))
    assertion_graph.add((structure_uri, RDF.type, QB["DataStructureDefinition"]))

    assertion_graph.add((dataset_uri, QB["structure"], structure_uri))

    for variable_id, variable in variables.items():
        variable_uri = URIRef(variable["original"]["uri"])
        variable_label = Literal(variable["original"]["label"])
        variable_type = URIRef(variable["type"])

        codelist_uri = URIRef(variable["codelist"]["original"]["uri"])
        codelist_label = Literal(variable["codelist"]["original"]["label"])

        # The variable as component of the definition
        component_uri = safe_url(BASE, "component/" + variable["original"]["label"])

        # Add link between the definition and the component
        assertion_graph.add((structure_uri, QB["component"], component_uri))

        # Add label to variable
        # TODO: We may need to do something with a changed label for the variable
        assertion_graph.add((variable_uri, RDFS.label, variable_label))

        if "description" in variable and variable["description"] != "":
            assertion_graph.add((variable_uri, RDFS.comment, Literal(variable["description"])))

        # If the variable URI is not the same as the original,
        # it is a specialization of a prior variable property.
        if variable["uri"] != str(variable_uri):
            assertion_graph.add((variable_uri, RDFS["subPropertyOf"], URIRef(variable["uri"])))

        if variable_type == QB["DimensionProperty"]:
            assertion_graph.add((variable_uri, RDF.type, variable_type))
            assertion_graph.add((component_uri, QB["dimension"], variable_uri))

            # Coded variables are also of type coded property (a subproperty of dimension property)
            if variable["category"] == "coded":
                assertion_graph.add((variable_uri, RDF.type, QB["CodedProperty"]))

        elif variable_type == QB["MeasureProperty"]:
            # The category 'other'
            assertion_graph.add((variable_uri, RDF.type, variable_type))
            assertion_graph.add((component_uri, QB["measure"], variable_uri))
        elif variable_type == QB["AttributeProperty"]:
            # Actually never produced by QBer at this stage
            assertion_graph.add((variable_uri, RDF.type, variable_type))
            assertion_graph.add((component_uri, QB["attribute"], variable_uri))

        # If this variable is of category 'coded', we add codelist and URIs for
        # each variable (including mappings between value uris and etc....)
        if variable["category"] == "coded":
            assertion_graph.add((codelist_uri, RDF.type, SKOS["Collection"]))
            assertion_graph.add((codelist_uri, RDFS.label, Literal(codelist_label)))

            # The variable should point to the codelist
            assertion_graph.add((variable_uri, QB["codeList"], codelist_uri))

            # The variable is mapped onto an external code list.
            # If the codelist uri is not the same as the original one, we
            # have a derived codelist.
            if variable["codelist"]["uri"] != str(codelist_uri):
                assertion_graph.add((codelist_uri, PROV["wasDerivedFrom"], URIRef(variable["codelist"]["uri"])))

            # Generate a SKOS concept for each of the values and map it to the
            # assigned codelist
            for value in variable["values"]:
                value_uri = URIRef(value["original"]["uri"])
                value_label = Literal(value["original"]["label"])

                assertion_graph.add((value_uri, RDF.type, SKOS["Concept"]))
                assertion_graph.add((value_uri, SKOS["prefLabel"], Literal(value_label)))
                assertion_graph.add((codelist_uri, SKOS["member"], value_uri))

                # The value has been changed, and therefore there is a mapping
                if value["original"]["uri"] != value["uri"]:
                    assertion_graph.add((value_uri, SKOS["exactMatch"], URIRef(value["uri"])))
                    assertion_graph.add((value_uri, RDFS.label, Literal(value["label"])))

        elif variable["category"] == "identifier":
            # Generate a SKOS concept for each of the values
            for value in variable["values"]:
                value_uri = URIRef(value["original"]["uri"])
                value_label = Literal(value["original"]["label"])

                assertion_graph.add((value_uri, RDF.type, SKOS["Concept"]))
                assertion_graph.add((value_uri, SKOS["prefLabel"], value_label))

                # The value has been changed, and therefore there is a mapping
                if value["original"]["uri"] != value["uri"]:
                    assertion_graph.add((value_uri, SKOS["exactMatch"], URIRef(value["uri"])))
                    assertion_graph.add((value_uri, RDFS.label, Literal(value["label"])))

        elif variable["category"] == "other":
            # Generate a literal for each of the values when converting the dataset (but not here)
            pass

    return rdf_dataset
Example 12
class DatasetTestCase(unittest.TestCase):
    store = "default"
    slow = True
    tmppath = None

    def setUp(self):
        try:
            self.graph = Dataset(store=self.store)
        except ImportError:
            raise SkipTest("Dependencies for store '%s' not available!" %
                           self.store)
        if self.store == "SQLite":
            _, self.tmppath = mkstemp(prefix="test",
                                      dir="/tmp",
                                      suffix=".sqlite")
        elif self.store == "SPARQLUpdateStore":
            root = HOST + DB
            self.graph.open((root + "sparql", root + "update"))
        else:
            self.tmppath = mkdtemp()

        if self.store != "SPARQLUpdateStore":
            self.graph.open(self.tmppath, create=True)
        self.michel = URIRef("urn:michel")
        self.tarek = URIRef("urn:tarek")
        self.bob = URIRef("urn:bob")
        self.likes = URIRef("urn:likes")
        self.hates = URIRef("urn:hates")
        self.pizza = URIRef("urn:pizza")
        self.cheese = URIRef("urn:cheese")

        # Use regular URIs because SPARQL endpoints like Fuseki alter short names
        self.c1 = URIRef("urn:context-1")
        self.c2 = URIRef("urn:context-2")

        # delete the graph for each test!
        self.graph.remove((None, None, None))
        for c in self.graph.contexts():
            c.remove((None, None, None))
            assert len(c) == 0
            self.graph.remove_graph(c)

    def tearDown(self):
        self.graph.close()
        if self.store == "SPARQLUpdateStore":
            pass
        else:
            if os.path.isdir(self.tmppath):
                shutil.rmtree(self.tmppath)
            else:
                os.remove(self.tmppath)

    def testGraphAware(self):

        if not self.graph.store.graph_aware:
            return

        g = self.graph
        g1 = g.graph(self.c1)

        # Some SPARQL endpoint backends (e.g. TDB) do not consider
        # empty named graphs
        if self.store != "SPARQLUpdateStore":
            # added graph exists
            self.assertEqual(
                set(x.identifier for x in self.graph.contexts()),
                set([self.c1, DATASET_DEFAULT_GRAPH_ID]),
            )

        # added graph is empty
        self.assertEqual(len(g1), 0)

        g1.add((self.tarek, self.likes, self.pizza))

        # added graph still exists
        self.assertEqual(
            set(x.identifier for x in self.graph.contexts()),
            set([self.c1, DATASET_DEFAULT_GRAPH_ID]),
        )

        # added graph contains one triple
        self.assertEqual(len(g1), 1)

        g1.remove((self.tarek, self.likes, self.pizza))

        # added graph is empty
        self.assertEqual(len(g1), 0)

        # Some SPARQL endpoint backends (e.g. TDB) do not consider
        # empty named graphs
        if self.store != "SPARQLUpdateStore":
            # graph still exists, although empty
            self.assertEqual(
                set(x.identifier for x in self.graph.contexts()),
                set([self.c1, DATASET_DEFAULT_GRAPH_ID]),
            )

        g.remove_graph(self.c1)

        # graph is gone
        self.assertEqual(
            set(x.identifier for x in self.graph.contexts()),
            set([DATASET_DEFAULT_GRAPH_ID]),
        )

    def testDefaultGraph(self):
        # Sometimes the default graph is read-only (e.g. TDB in union mode)
        if self.store == "SPARQLUpdateStore":
            print("Please make sure updating the default graph "
                  "is supported by your SPARQL endpoint")

        self.graph.add((self.tarek, self.likes, self.pizza))
        self.assertEqual(len(self.graph), 1)
        # only default exists
        self.assertEqual(
            set(x.identifier for x in self.graph.contexts()),
            set([DATASET_DEFAULT_GRAPH_ID]),
        )

        # removing default graph removes triples but not actual graph
        self.graph.remove_graph(DATASET_DEFAULT_GRAPH_ID)

        self.assertEqual(len(self.graph), 0)
        # default still exists
        self.assertEqual(
            set(x.identifier for x in self.graph.contexts()),
            set([DATASET_DEFAULT_GRAPH_ID]),
        )

    def testNotUnion(self):
        # Union depends on the SPARQL endpoint configuration
        if self.store == "SPARQLUpdateStore":
            print("Please make sure your SPARQL endpoint has not configured "
                  "its default graph as the union of the named graphs")
        g1 = self.graph.graph(self.c1)
        g1.add((self.tarek, self.likes, self.pizza))

        self.assertEqual(list(self.graph.objects(self.tarek, None)), [])
        self.assertEqual(list(g1.objects(self.tarek, None)), [self.pizza])

    def testIter(self):
        """PR 1382: adds __iter__ to Dataset"""
        d = Dataset()
        uri_a = URIRef("https://example.com/a")
        uri_b = URIRef("https://example.com/b")
        uri_c = URIRef("https://example.com/c")
        uri_d = URIRef("https://example.com/d")

        d.add_graph(URIRef("https://example.com/g1"))
        d.add((uri_a, uri_b, uri_c, URIRef("https://example.com/g1")))
        d.add((uri_a, uri_b, uri_c, URIRef("https://example.com/g1")
               ))  # pointless addition: duplicates above

        d.add_graph(URIRef("https://example.com/g2"))
        d.add((uri_a, uri_b, uri_c, URIRef("https://example.com/g2")))
        d.add((uri_a, uri_b, uri_d,
               URIRef("https://example.com/g1")))  # new, uri_d

        # traditional iterator
        i_trad = 0
        for t in d.quads((None, None, None)):
            i_trad += 1

        # new Dataset.__iter__ iterator
        i_new = 0
        for t in d:
            i_new += 1

        self.assertEqual(i_new, i_trad)  # both should be 3
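
A small sketch of the behaviour this test relies on (rdflib versions that include PR 1382, where iterating a Dataset yields quads); the URIs are illustrative:

from rdflib import Dataset, URIRef

d = Dataset()
g1 = URIRef("https://example.com/g1")
d.add((URIRef("https://example.com/a"),
       URIRef("https://example.com/b"),
       URIRef("https://example.com/c"),
       g1))

for s, p, o, g in d:   # yields the same quads as d.quads((None, None, None))
    print(s, p, o, g)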
Example 13
class BurstConverter(object):
    """The actual converter, that processes the chunk of lines from the CSV file, and uses the instructions from the ``schema`` graph to produce RDF."""

    def __init__(self, identifier, columns, schema, metadata_graph, encoding, output_format):
        self.ds = Dataset()
        # self.ds = apply_default_namespaces(Dataset())
        self.g = self.ds.graph(URIRef(identifier))

        self.columns = columns
        self.schema = schema
        self.metadata_graph = metadata_graph
        self.encoding = encoding
        self.output_format = output_format

        self.templates = {}

        self.aboutURLSchema = self.schema.csvw_aboutUrl

    def equal_to_null(self, nulls, row):
        """Determines whether a value in a cell matches a 'null' value as specified in the CSVW schema)"""
        for n in nulls:
            n = Item(self.metadata_graph, n)
            col = str(n.csvw_name)
            val = str(n.csvw_null)
            if row[col] == val:
                logger.debug("Value of column {} ('{}') is equal to specified 'null' value: '{}'".format(col, unicode(row[col]).encode('utf-8'), val))
                # There is a match with null value
                return True
        # There is no match with null value
        return False

    def process(self, count, rows, chunksize):
        """Process the rows fed to the converter. Count and chunksize are used to determine the
        current row number (needed for default observation identifiers)"""
        obs_count = count * chunksize

        # logger.info("Row: {}".format(obs_count)) #removed for readability

        # We iterate row by row, and then column by column, as given by the CSVW mapping file.
        mult_proc_counter = 0
        iter_error_counter= 0
        for row in rows:
            # This fixes issue:10
            if row is None:
                mult_proc_counter += 1
                # logger.debug( #removed for readability
                #     "Skipping empty row caused by multiprocessing (multiple of chunksize exceeds number of rows in file)...")
                continue

            # set the '_row' value in case we need to generate 'default' URIs for each observation ()
            # logger.debug("row: {}".format(obs_count)) #removed for readability
            row[u'_row'] = obs_count
            count += 1

            # The self.columns dictionary gives the mapping definition per column in the 'columns'
            # array of the CSVW tableSchema definition.
            for c in self.columns:

                c = Item(self.metadata_graph, c)
                # default about URL
                s = self.expandURL(self.aboutURLSchema, row)

                try:
                    # Can also be used to prevent the triggering of virtual
                    # columns!

                    # Get the raw value from the cell in the CSV file
                    value = row[unicode(c.csvw_name)]
                    # This checks whether we should continue parsing this cell, or skip it.
                    if self.isValueNull(value, c):
                        continue

                    # If the null values are specified in an array, we need to parse it as a collection (list)
                    elif isinstance(c.csvw_null, Item):
                        nulls = Collection(self.metadata_graph, BNode(c.csvw_null))

                        if self.equal_to_null(nulls, row):
                            # Continue to next column specification in this row, if the value is equal to (one of) the null values.
                            continue
                except:
                    # No column name specified (virtual) because there clearly was no c.csvw_name key in the row.
                    # logger.debug(traceback.format_exc()) #removed for readability
                    iter_error_counter +=1
                    if isinstance(c.csvw_null, Item):
                        nulls = Collection(self.metadata_graph, BNode(c.csvw_null))
                        if self.equal_to_null(nulls, row):
                            # Continue to next column specification in this row, if the value is equal to (one of) the null values.
                            continue

                try:
                    # This overrides the subject resource 's' that has been created earlier based on the
                    # schema wide aboutURLSchema specification.
                    if unicode(c.csvw_virtual) == u'true' and c.csvw_aboutUrl is not None:
                        s = self.expandURL(c.csvw_aboutUrl, row)

                    if c.csvw_valueUrl is not None:
                        # This is an object property, because the value needs to be cast to a URL
                        p = self.expandURL(c.csvw_propertyUrl, row)
                        o = self.expandURL(c.csvw_valueUrl, row)
                        if self.isValueNull(os.path.basename(unicode(o)), c):
                            logger.debug("skipping empty value")
                            continue

                        if unicode(c.csvw_virtual) == u'true' and c.csvw_datatype is not None and URIRef(c.csvw_datatype) == XSD.anyURI:
                            # Special case: this is a virtual column with object values that are URIs
                            # For now using a test special property
                            value = row[unicode(c.csvw_name)].encode('utf-8')
                            o = URIRef(iribaker.to_iri(value))

                        if unicode(c.csvw_virtual) == u'true' and c.csvw_datatype is not None and URIRef(c.csvw_datatype) == XSD.linkURI:
                            about_url = str(c.csvw_aboutUrl)
                            about_url = about_url[about_url.find("{"):about_url.find("}")+1]
                            s = self.expandURL(about_url, row)
                            # logger.debug("s: {}".format(s))
                            value_url = str(c.csvw_valueUrl)
                            value_url = value_url[value_url.find("{"):value_url.find("}")+1]
                            o = self.expandURL(value_url, row)
                            # logger.debug("o: {}".format(o))

                        # For coded properties, the collectionUrl can be used to indicate that the
                        # value URL is a concept and a member of a SKOS Collection with that URL.
                        if c.csvw_collectionUrl is not None:
                            collection = self.expandURL(c.csvw_collectionUrl, row)
                            self.g.add((collection, RDF.type, SKOS['Collection']))
                            self.g.add((o, RDF.type, SKOS['Concept']))
                            self.g.add((collection, SKOS['member'], o))

                        # For coded properties, the schemeUrl can be used to indicate that the
                        # value URL is a concept and a member of a SKOS Scheme with that URL.
                        if c.csvw_schemeUrl is not None:
                            scheme = self.expandURL(c.csvw_schemeUrl, row)
                            self.g.add((scheme, RDF.type, SKOS['Scheme']))
                            self.g.add((o, RDF.type, SKOS['Concept']))
                            self.g.add((o, SKOS['inScheme'], scheme))
                    else:
                        # This is a datatype property
                        if c.csvw_value is not None:
                            value = self.render_pattern(unicode(c.csvw_value), row)
                        elif c.csvw_name is not None:
                            # print s
                            # print c.csvw_name, self.encoding
                            # print row[unicode(c.csvw_name)], type(row[unicode(c.csvw_name)])
                            # print row[unicode(c.csvw_name)].encode('utf-8')
                            # print '...'
                            value = row[unicode(c.csvw_name)].encode('utf-8')
                        else:
                            raise Exception("No 'name' or 'csvw:value' attribute found for this column specification")

                        # If propertyUrl is specified, use it, otherwise use
                        # the column name
                        if c.csvw_propertyUrl is not None:
                            p = self.expandURL(c.csvw_propertyUrl, row)
                        else:
                            if "" in self.metadata_graph.namespaces():
                                propertyUrl = self.metadata_graph.namespaces()[""][
                                    unicode(c.csvw_name)]
                            else:
                                propertyUrl = "{}{}".format(get_namespaces()['sdv'],
                                    unicode(c.csvw_name))

                            p = self.expandURL(propertyUrl, row)

                        if c.csvw_datatype is not None:
                            if URIRef(c.csvw_datatype) == XSD.anyURI:
                                # The xsd:anyURI datatype will be cast to a proper IRI resource.
                                o = URIRef(iribaker.to_iri(value))
                            elif URIRef(c.csvw_datatype) == XSD.string and c.csvw_lang is not None:
                                # If it is a string datatype that has a language, we turn it into a
                                # language tagged literal
                                # We also render the lang value in case it is a
                                # pattern.
                                o = Literal(value, lang=self.render_pattern(
                                    c.csvw_lang, row))
                            else:
                                o = Literal(value, datatype=c.csvw_datatype, normalize=False)
                        else:
                            # It's just a plain literal without datatype.
                            o = Literal(value)

                    # Add the triple to the assertion graph
                    self.g.add((s, p, o))

                    # Add provenance relating the propertyUrl to the column id
                    if '@id' in c:
                        self.g.add((p, PROV['wasDerivedFrom'], URIRef(c['@id'])))

                except:
                    # print row[0], value
                    traceback.print_exc()

            # We increment the observation (row number) with one
            obs_count += 1

        logger.debug(
            "{} row skips caused by multiprocessing (multiple of chunksize exceeds number of rows in file)...".format(mult_proc_counter))
        logger.debug(
            "{} errors encountered while trying to iterate over a NoneType...".format(iter_error_counter))
        logger.info("... done")
        return self.ds.serialize(format=self.output_format)

    # def serialize(self):
    #     trig_file_name = self.file_name + '.trig'
    #     logger.info("Starting serialization to {}".format(trig_file_name))
    #
    #     with open(trig_file_name, 'w') as f:
    #         self.np.serialize(f, format='trig')
    #     logger.info("... done")

    def render_pattern(self, pattern, row):
        """Takes a Jinja or Python formatted string, and applies it to the row value"""
        # Significant speedup by not re-instantiating Jinja templates for every
        # row.
        if pattern in self.templates:
            template = self.templates[pattern]
        else:
            template = self.templates[pattern] = Template(pattern)

        # TODO This should take into account the special CSVW instructions such as {_row}
        # First we interpret the url_pattern as a Jinja2 template, and pass all
        # column/value pairs as arguments
        rendered_template = template.render(**row)

        try:
            # We then format the resulting string using the standard Python2
            # expressions
            return rendered_template.format(**row)
        except:
            logger.warning(
                u"Could not apply python string formatting, probably due to mismatched curly brackets. IRI will be '{}'. ".format(rendered_template))
            return rendered_template

    def expandURL(self, url_pattern, row, datatype=False):
        """Takes a Jinja or Python formatted string, applies it to the row values, and returns it as a URIRef"""
        url = self.render_pattern(unicode(url_pattern), row)

        # DEPRECATED
        # for ns, nsuri in namespaces.items():
        #     if url.startswith(ns):
        #         url = url.replace(ns + ':', nsuri)
        #         break

        try:
            iri = iribaker.to_iri(url)
            rfc3987.parse(iri, rule='IRI')
        except:
            raise Exception(u"Cannot convert `{}` to valid IRI".format(url))

        # print "Baked: ", iri
        return URIRef(iri)

    def isValueNull(self, value, c):
        """This checks whether we should continue parsing this cell, or skip it because it is empty or a null value."""
        try:
            if len(value) == 0 and unicode(c.csvw_parseOnEmpty) == u"true":
                print("Not skipping empty value")
                return False #because it should not be skipped
            elif len(value) == 0 or value == unicode(c.csvw_null) or value in [unicode(n) for n in c.csvw_null] or value == unicode(self.schema.csvw_null):
                # Skip value if its length is zero or the value equals (one of) the specified 'null' value(s)
                logger.debug(
                    "Length is 0 or value is equal to specified 'null' value")
                return True
        except:
            logger.debug("null does not exist or is not a list.")
        return False
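The render_pattern/expandURL pair above is the heart of the URL minting: the pattern is first rendered as a Jinja2 template with the row values, the result is passed through Python's str.format as a second substitution pass, and the outcome is escaped and validated as an IRI. A minimal, self-contained sketch of that flow, assuming jinja2, iribaker and rfc3987 are installed (the column name and the pattern are made up for illustration):

from jinja2 import Template
from rdflib import URIRef
import iribaker
import rfc3987

row = {'Country': 'Luxembourg', '_row': 1}                  # illustrative row
pattern = 'http://example.org/country/{{Country}}'          # illustrative aboutUrl pattern

rendered = Template(pattern).render(**row)                  # Jinja2 substitution
rendered = rendered.format(**row)                           # Python str.format as a second pass
iri = iribaker.to_iri(rendered)                             # escape characters that are not IRI-safe
rfc3987.parse(iri, rule='IRI')                              # raises if the result is still invalid
print(URIRef(iri))                                          # http://example.org/country/Luxembourg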
Esempio n. 14
0
vocabs['rdavf'] = Namespace('http://rdaregistry.info/termList/videoFormat/')
vocabs['rdagd'] = Namespace('http://rdaregistry.info/termList/gender/')
vocabs['iso639-2'] = Namespace('http://id.loc.gov/vocabulary/iso639-2/')
vocabs['wd'] = Namespace('http://www.wikidata.org/entity/')
vocabs['gnd'] = Namespace('http://d-nb.info/gnd/')
vocabs['rico'] = Namespace('https://www.ica.org/standards/RiC/ontology#')
vocabs['ric-rst'] = Namespace(
    'https://www.ica.org/standards/RiC/vocabularies/recordSetTypes#')
vocabs['ebucore'] = Namespace(
    'https://www.ebu.ch/metadata/ontologies/ebucore#')
vocabs['eclap'] = Namespace('http://www.eclap.eu/schema/eclap/')
vocabs['dct'] = Namespace('http://purl.org/dc/terms/')
vocabs['premis'] = Namespace('http://www.loc.gov/premis/rdf/v1#')

ds = Dataset()
graph = ds.graph(URIRef('http://vocab.performing-arts.ch/container/context'))
graph.bind('rdfs', RDFS)
graph.bind('skos', SKOS)
graph.bind('owl', OWL)
graph.bind('spav', vocab_ns)
for k, v in vocabs.items():
    graph.bind(k, v)

ldp = URIRef('http://vocab.performing-arts.ch/container')
graph.add((ldp, RDF.type, vocabs['prov'].Entity))
graph.add((ldp, RDF.type, vocabs['ldp'].Resource))
graph.add(
    (ldp, vocabs['prov'].generatedAtTime,
     Literal(datetime.datetime.now(pytz.timezone('Europe/Zurich')).isoformat(),
             datatype=XSD.dateTime)))
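The snippet above relies on 'prov' and 'ldp' entries that are presumably added to vocabs earlier in the same script. A minimal sketch of writing the dataset it builds to disk ('container.trig' is an illustrative file name; with rdflib 6+ serialize() also returns the text when no destination is given):

ds.serialize(destination='container.trig', format='trig')
# or keep it in memory:
print(ds.serialize(format='trig'))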
Esempio n. 15
0
        if r['neighbourhood']:
            locationObservation = LocationObservation(
                nsLocationObservation.term(r['neighbourhood']),
                label=[r['neighbourhood']])

            personObservation.hasLocation = [locationObservation]

    DERIVATION.entity = sorted(images)

    return personObservation  # exampleResource


if __name__ == "__main__":

    ds = Dataset()
    g = rdfSubject.db = ds.graph(
        identifier="https://data.create.humanities.uva.nl/id/kohier1674/")

    exampleResource = main('data/records.csv', g)

    rdfSubject.db = ds

    description = """Van dit handgeschreven kohier (SAA inventarisnummer 5028:662) bestaat een getypte index op achternaam (SAA inventarisnummber 5028:662A). Hierin is de naam van een persoon, de relatie tot een andere persoon of groep (e.g. wed. van, of kinderen van), beroep en de woonwijk opgenomen. Ook is genoteerd op welk foliant de persoon beschreven is.

In totaal zijn 60 wijken beschreven in het kohier, aangegeven met cijfers. Daarna volgt een sectie van de 'Magistraten' (M), 'Joodse Natie' (J), 'Paden buijten de Stadt' (P1-P6), 'Officianten' (O), 'Regerende heeren' (R), 'Personen van andere Steden' (AS) en 'Testamenten' (T).

De wijkindeling correspondeert waarschijnlijk met die van een kaart uit 1766, vervaardigd door C. Philips Jacobsz. (1732-1789) en F.W. Greebe en is beschikbaar in de Beeldbank van het Stadsarchief, afbeelding 010001000849.

"""
    contributors = ""

    download = DataDownload(
Esempio n. 16
0
def test_scenarios() -> None:
    """
    Testing scenarios:
        1. no base set
        2. base set at graph creation
        3. base set at serialization
        4. base set at both graph creation & serialization, serialization overrides
        5. multiple serialization side effect checking
        6. checking results for RDF/XML
        7. checking results for N3
        8. checking results for TriX & TriG
    """

    # variables
    base_one = Namespace("http://one.org/")
    base_two = Namespace("http://two.org/")
    title = Literal("Title", lang="en")
    description = Literal("Test Description", lang="en")
    creator = URIRef("https://creator.com")
    cs = URIRef("")

    # starting graph
    g = Graph()
    g.add((cs, RDF.type, SKOS.ConceptScheme))
    g.add((cs, DCTERMS.creator, creator))
    g.add((cs, DCTERMS.source, URIRef("nick")))
    g.bind("dct", DCTERMS)
    g.bind("skos", SKOS)

    # 1. no base set for graph, no base set for serialization
    g1 = Graph()
    g1 += g
    # @base should not be in output
    assert "@base" not in g.serialize(format="turtle")

    # 2. base one set for graph, no base set for serialization
    g2 = Graph(base=base_one)
    g2 += g
    # @base should be in output, from Graph (one)
    assert "@base <http://one.org/> ." in g2.serialize(format="turtle")

    # 3. no base set for graph, base two set for serialization
    g3 = Graph()
    g3 += g
    # @base should be in output, from serialization (two)
    assert "@base <http://two.org/> ." in g3.serialize(format="turtle",
                                                       base=base_two)

    # 4. base one set for graph, base two set for serialization, serialization (two) overrides
    g4 = Graph(base=base_one)
    g4 += g
    # @base should be in output, from serialization (two)
    assert "@base <http://two.org/> ." in g4.serialize(format="turtle",
                                                       base=base_two)
    # just checking that the graph setting (one) hasn't snuck through
    assert "@base <http://one.org/> ." not in g4.serialize(format="turtle",
                                                           base=base_two)

    # 5. multiple serialization side effect checking
    g5 = Graph()
    g5 += g
    # @base should be in output, from serialization (two)
    assert "@base <http://two.org/> ." in g5.serialize(format="turtle",
                                                       base=base_two)

    # checking for side effects - no base now set for this serialization
    # @base should not be in output
    assert "@base" not in g5.serialize(format="turtle")

    # 6. checking results for RDF/XML
    g6 = Graph()
    g6 += g
    g6.bind("dct", DCTERMS)
    g6.bind("skos", SKOS)
    assert "@xml:base" not in g6.serialize(format="xml")
    assert 'xml:base="http://one.org/"' in g6.serialize(format="xml",
                                                        base=base_one)
    g6.base = base_two
    assert 'xml:base="http://two.org/"' in g6.serialize(format="xml")
    assert 'xml:base="http://one.org/"' in g6.serialize(format="xml",
                                                        base=base_one)

    # 7. checking results for N3
    g7 = Graph()
    g7 += g
    g7.bind("dct", DCTERMS)
    g7.bind("skos", SKOS)
    assert "@xml:base" not in g7.serialize(format="xml")
    assert "@base <http://one.org/> ." in g7.serialize(format="n3",
                                                       base=base_one)
    g7.base = base_two
    assert "@base <http://two.org/> ." in g7.serialize(format="n3")
    assert "@base <http://one.org/> ." in g7.serialize(format="n3",
                                                       base=base_one)

    # 8. checking results for TriX & TriG
    # TriX can specify a base per graph, but a base for the whole serialization can
    # also be given; it appears on the TriX root (and in TriG) and overrides ds.base
    base_three = Namespace("http://three.org/")
    ds1 = Dataset()
    ds1.bind("dct", DCTERMS)
    ds1.bind("skos", SKOS)
    g8 = ds1.graph(URIRef("http://g8.com/"), base=base_one)
    g9 = ds1.graph(URIRef("http://g9.com/"))
    g8 += g
    g9 += g
    g9.base = base_two
    ds1.base = base_three

    trix = ds1.serialize(format="trix", base=Namespace("http://two.org/"))
    assert '<graph xml:base="http://one.org/">' in trix
    assert '<graph xml:base="http://two.org/">' in trix
    assert '<TriX xml:base="http://two.org/"' in trix

    trig = ds1.serialize(format="trig", base=Namespace("http://two.org/"))
    assert "@base <http://one.org/> ." not in trig
    assert "@base <http://three.org/> ." not in trig
    assert "@base <http://two.org/> ." in trig
Esempio n. 17
0
class RDFtoUmlDiagram():
    """
    Transform a RDF dataset to an UML diagram
    """
    def __init__(self, output_filename='output.png'):
        self.ds = Dataset()
        #self.d = UmlGraphVizDiagram(output_filename)
        self.d = UmlPygraphVizDiagram(output_filename)

    def load_rdf(self, filename, input_format=None):
        if input_format:
            rdf_format = input_format
        elif filename is not sys.stdin:
            format_list = {'.xml': 'xml',
                           '.rdf': 'xml',
                           '.owl': 'xml',
                           '.n3': 'n3',
                           '.ttl': 'turtle',
                           '.nt': 'nt',
                           '.trig': 'trig',
                           '.nq': 'nquads',
                           '': 'turtle'}
            extension = splitext(filename.name)[1]
            rdf_format = format_list[extension]
        else:
            rdf_format = 'turtle'
        temp = self.ds.graph("file://"+filename.name)
        temp.parse(filename.name, format=rdf_format)

    def add_namespaces(self, namespaces):
        if namespaces:
            for ns in namespaces:
                self.ds.namespace_manager.bind(ns[0],ns[1])
    
    def start_subgraph(self, graph_name):
        self.d.start_subgraph(graph_name.strip('[<>]:_'))
        
    def close_subgraph(self):
        self.d.close_subgraph()

    def add_object_node(self, object_name, classes_name, attributes):
        self.d.add_object_node(self.ds.namespace_manager.qname(object_name), classes_name, attributes)

    def add_class_node(self, class_name, attributes):
        self.d.add_class_node(self.ds.namespace_manager.qname(class_name), attributes)

    def add_edge(self, src, dst, predicate):
        self.d.add_edge(self.ds.namespace_manager.qname(src), self.ds.namespace_manager.qname(dst), self.ds.namespace_manager.qname(predicate))

    def add_subclass_edge(self, src, dst):
        self.d.add_subclass_edge(self.ds.namespace_manager.qname(src), self.ds.namespace_manager.qname(dst))

    def create_namespace_box(self):
        # Create Namespace box
        self.d.add_label("Namespaces:\l")
        for ns in sorted(self.ds.namespaces()):
            self.d.add_label("%s:\t%s \l" % (ns[0], ns[1]))

    def output_dot(self):
        self.d.write_to_file()

    def close(self):
        self.create_namespace_box()
        self.d.close()

    def visualize(self):
        self.d.visualize()
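A hedged driver sketch for the class above; it assumes the UmlPygraphVizDiagram backend it wraps is importable, and the file names are illustrative:

converter = RDFtoUmlDiagram(output_filename='model.png')
with open('model.ttl') as infile:
    converter.load_rdf(infile)                     # format is guessed from the .ttl extension
converter.add_namespaces([('ex', 'http://example.org/')])
# ... add_class_node / add_object_node / add_edge calls would normally go here ...
converter.output_dot()
converter.close()                                  # close() also renders the namespace box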
Esempio n. 18
0
def main(search=None, cache=None, identifiers=[]):

    ns = Namespace("https://data.create.humanities.uva.nl/id/rkd/")

    ds = Dataset()
    ds.bind('rdfs', RDFS)
    ds.bind('schema', schema)
    ds.bind('sem', sem)
    ds.bind('bio', bio)
    ds.bind('foaf', foaf)
    ds.bind('void', void)
    ds.bind('skos', SKOS)
    ds.bind('owl', OWL)
    ds.bind('dc', dc)

    ds.bind('rkdArtist', URIRef("https://data.rkd.nl/artists/"))
    ds.bind('rkdThes', nsThesaurus)
    ds.bind('rkdPerson', nsPerson)
    ds.bind('rkdImage', URIRef("https://rkd.nl/explore/images/"))
    ds.bind('rkdThumb', URIRef("https://images.rkd.nl/rkd/thumb/650x650/"))

    ds.bind('aat', URIRef("http://vocab.getty.edu/aat/"))

    ## First the images

    g = rdfSubject.db = ds.graph(identifier=ns)

    # Load cache thesaurus
    if os.path.isfile('rkdthesaurus.json'):
        with open('rkdthesaurus.json') as infile:
            thesaurusDict = json.load(infile)
    else:
        thesaurusDict = dict()

    # Load cache images
    if os.path.isfile('imagecache.json'):
        with open('imagecache.json') as infile:
            imageCache = json.load(infile)
    else:
        imageCache = dict()

    # to fetch all identifiers from the search
    if search:
        thesaurusDict, imageCache = parseURL(search,
                                             thesaurusDict=thesaurusDict,
                                             imageCache=imageCache)
    elif cache:
        # assume that everything in the thesaurus is also cached
        for doc in cache.values():
            parseData(doc, thesaurusDict=thesaurusDict)
    elif identifiers:
        for i in identifiers:
            thesaurusDict, imageCache = parseURL(APIURL + str(i),
                                                 thesaurusDict=thesaurusDict,
                                                 imageCache=imageCache)

    # Any images without labels?
    # These were not included in the search, but fetch them anyway.
    print("Finding referred images that were not included")
    q = """
    PREFIX schema: <http://schema.org/>
    SELECT ?uri WHERE {
        ?role a schema:Role ; schema:isRelatedTo ?uri .

        FILTER NOT EXISTS { ?uri schema:name ?name }
    }
    """
    images = g.query(q)
    print(f"Found {len(images)}!")
    for i in images:
        identifier = str(i['uri']).replace('https://rkd.nl/explore/images/',
                                           '')
        thesaurusDict, imageCache = parseURL(
            "https://api.rkd.nl/api/record/images/" + str(identifier),
            thesaurusDict=thesaurusDict,
            imageCache=imageCache)

    ## Then the thesaurus
    print("Converting the thesaurus")
    rdfSubject.db = ds.graph(identifier=ns.term('thesaurus/'))

    ids = list(thesaurusDict.keys())
    for i in ids:
        _, thesaurusDict = getThesaurus(i, thesaurusDict, 'concept')

    # Save updated cache
    with open('rkdthesaurus.json', 'w') as outfile:
        json.dump(thesaurusDict, outfile)

    with open('imagecache.json', 'w') as outfile:
        json.dump(imageCache, outfile)

    ## Serialize
    print("Serializing!")
    ds.serialize('rkdportraits14751825.trig', format='trig')
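A small sketch of re-loading the serialized file and re-running the "images without labels" query from main(); default_union=True is assumed so that the query spans all named graphs in the dataset:

from rdflib import Dataset

ds2 = Dataset(default_union=True)
ds2.parse('rkdportraits14751825.trig', format='trig')
q = """
PREFIX schema: <http://schema.org/>
SELECT ?uri WHERE {
    ?role a schema:Role ; schema:isRelatedTo ?uri .
    FILTER NOT EXISTS { ?uri schema:name ?name }
}
"""
for r in ds2.query(q):
    print(r['uri'])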
Esempio n. 19
0
class BurstConverter(object):
    """The actual converter, that processes the chunk of lines from the CSV file, and uses the instructions from the ``schema`` graph to produce RDF."""

    def __init__(self, identifier, columns, schema, metadata_graph, encoding, output_format):
        self.ds = Dataset()
        # self.ds = apply_default_namespaces(Dataset())
        self.g = self.ds.graph(URIRef(identifier))

        self.columns = columns
        self.schema = schema
        self.metadata_graph = metadata_graph
        self.encoding = encoding
        self.output_format = output_format

        self.templates = {}

        self.aboutURLSchema = self.schema.csvw_aboutUrl

    def equal_to_null(self, nulls, row):
        """Determines whether a value in a cell matches a 'null' value as specified in the CSVW schema)"""
        for n in nulls:
            n = Item(self.metadata_graph, n)
            col = str(n.csvw_name)
            val = str(n.csvw_null)
            if row[col] == val:
                # logger.debug("Value of column {} ('{}') is equal to specified 'null' value: '{}'".format(col, unicode(row[col]).encode('utf-8'), val))
                # There is a match with null value
                return True
        # There is no match with null value
        return False

    def process(self, count, rows, chunksize):
        """Process the rows fed to the converter. Count and chunksize are used to determine the
        current row number (needed for default observation identifiers)"""
        obs_count = count * chunksize

        # logger.info("Row: {}".format(obs_count)) #removed for readability

        # We iterate row by row, and then column by column, as given by the CSVW mapping file.
        mult_proc_counter = 0
        iter_error_counter = 0
        for row in rows:
            # This fixes issue:10
            if row is None:
                mult_proc_counter += 1
                # logger.debug( #removed for readability
                #     "Skipping empty row caused by multiprocessing (multiple of chunksize exceeds number of rows in file)...")
                continue

            # set the '_row' value in case we need to generate 'default' URIs for each observation
            # logger.debug("row: {}".format(obs_count)) #removed for readability
            row[u'_row'] = obs_count
            count += 1

            # print(row)

            # The self.columns dictionary gives the mapping definition per column in the 'columns'
            # array of the CSVW tableSchema definition.

            for c in self.columns:
                c = Item(self.metadata_graph, c)
                # default about URL
                s = self.expandURL(self.aboutURLSchema, row)

                try:
                    # Can also be used to prevent the triggering of virtual
                    # columns!

                    # Get the raw value from the cell in the CSV file
                    try:
                        # Python 2
                        value = row[unicode(c.csvw_name)]
                    except NameError:
                        # Python 3
                        value = row[str(c.csvw_name)]

                    # This checks whether we should continue parsing this cell, or skip it.
                    if self.isValueNull(value, c):
                        continue

                    # If the null values are specified in an array, we need to parse it as a collection (list)
                    elif isinstance(c.csvw_null, Item):
                        nulls = Collection(self.metadata_graph, BNode(c.csvw_null))

                        if self.equal_to_null(nulls, row):
                            # Continue to next column specification in this row, if the value is equal to (one of) the null values.
                            continue
                except:
                    # No column name specified (virtual) because there clearly was no c.csvw_name key in the row.
                    # logger.debug(traceback.format_exc()) #removed for readability
                    iter_error_counter +=1
                    if isinstance(c.csvw_null, Item):
                        nulls = Collection(self.metadata_graph, BNode(c.csvw_null))
                        if self.equal_to_null(nulls, row):
                            # Continue to next column specification in this row, if the value is equal to (one of) the null values.
                            continue

                try:
                    # This overrides the subject resource 's' that has been created earlier based on the
                    # schema wide aboutURLSchema specification.

                    try:
                        csvw_virtual = unicode(c.csvw_virtual)
                        csvw_name = unicode(c.csvw_name)
                        csvw_value = unicode(c.csvw_value)
                        about_url = unicode(c.csvw_aboutUrl)
                        value_url = unicode(c.csvw_valueUrl)
                    except NameError:
                        csvw_virtual = str(c.csvw_virtual)
                        csvw_name = str(c.csvw_name)
                        csvw_value = str(c.csvw_value)
                        about_url = str(c.csvw_aboutUrl)
                        value_url = str(c.csvw_valueUrl)

                    if csvw_virtual == u'true' and c.csvw_aboutUrl is not None:
                        s = self.expandURL(c.csvw_aboutUrl, row)

                    if c.csvw_valueUrl is not None:
                        # This is an object property, because the value needs to be cast to a URL
                        p = self.expandURL(c.csvw_propertyUrl, row)
                        o = self.expandURL(c.csvw_valueUrl, row)
                        try:
                            if self.isValueNull(os.path.basename(unicode(o)), c):
                                logger.debug("skipping empty value")
                                continue
                        except NameError:
                            if self.isValueNull(os.path.basename(str(o)), c):
                                logger.debug("skipping empty value")
                                continue

                        if csvw_virtual == u'true' and c.csvw_datatype is not None and URIRef(c.csvw_datatype) == XSD.anyURI:
                            # Special case: this is a virtual column with object values that are URIs
                            # For now using a test special property
                            value = row[unicode(c.csvw_name)].encode('utf-8')
                            o = URIRef(iribaker.to_iri(value))

                        if csvw_virtual == u'true' and c.csvw_datatype is not None and URIRef(c.csvw_datatype) == XSD.linkURI:
                            about_url = about_url[about_url.find("{"):about_url.find("}")+1]
                            s = self.expandURL(about_url, row)
                            # logger.debug("s: {}".format(s))
                            value_url = value_url[value_url.find("{"):value_url.find("}")+1]
                            o = self.expandURL(value_url, row)
                            # logger.debug("o: {}".format(o))

                        # For coded properties, the collectionUrl can be used to indicate that the
                        # value URL is a concept and a member of a SKOS Collection with that URL.
                        if c.csvw_collectionUrl is not None:
                            collection = self.expandURL(c.csvw_collectionUrl, row)
                            self.g.add((collection, RDF.type, SKOS['Collection']))
                            self.g.add((o, RDF.type, SKOS['Concept']))
                            self.g.add((collection, SKOS['member'], o))

                        # For coded properties, the schemeUrl can be used to indicate that the
                        # value URL is a concept and a member of a SKOS Scheme with that URL.
                        if c.csvw_schemeUrl is not None:
                            scheme = self.expandURL(c.csvw_schemeUrl, row)
                            self.g.add((scheme, RDF.type, SKOS['Scheme']))
                            self.g.add((o, RDF.type, SKOS['Concept']))
                            self.g.add((o, SKOS['inScheme'], scheme))
                    else:
                        # This is a datatype property
                        if c.csvw_value is not None:
                            value = self.render_pattern(csvw_value, row)
                        elif c.csvw_name is not None:
                            # print s
                            # print c.csvw_name, self.encoding
                            # print row[unicode(c.csvw_name)], type(row[unicode(c.csvw_name)])
                            # print row[unicode(c.csvw_name)].encode('utf-8')
                            # print '...'
                            value = row[csvw_name].encode('utf-8')
                        else:
                            raise Exception("No 'name' or 'csvw:value' attribute found for this column specification")

                        # If propertyUrl is specified, use it, otherwise use
                        # the column name
                        if c.csvw_propertyUrl is not None:
                            p = self.expandURL(c.csvw_propertyUrl, row)
                        else:
                            if "" in self.metadata_graph.namespaces():
                                propertyUrl = self.metadata_graph.namespaces()[""][
                                    csvw_name]
                            else:
                                propertyUrl = "{}{}".format(get_namespaces()['sdv'],
                                    csvw_name)

                            p = self.expandURL(propertyUrl, row)

                        if c.csvw_datatype is not None:
                            if URIRef(c.csvw_datatype) == XSD.anyURI:
                                # The xsd:anyURI datatype will be cast to a proper IRI resource.
                                o = URIRef(iribaker.to_iri(value))
                            elif URIRef(c.csvw_datatype) == XSD.string and c.csvw_lang is not None:
                                # If it is a string datatype that has a language, we turn it into a
                                # language tagged literal
                                # We also render the lang value in case it is a
                                # pattern.
                                o = Literal(value, lang=self.render_pattern(
                                    c.csvw_lang, row))
                            else:
                                try:
                                    csvw_datatype = unicode(c.csvw_datatype)
                                except NameError:
                                    csvw_datatype = str(c.csvw_datatype).split(')')[0].split('(')[-1]
                                    # csvw_datatype = str(c.csvw_datatype)
                                # print(type(csvw_datatype))
                                # print(csvw_datatype)
                                o = Literal(value, datatype=csvw_datatype, normalize=False)
                        else:
                            # It's just a plain literal without datatype.
                            o = Literal(value)


                    # Add the triple to the assertion graph
                    self.g.add((s, p, o))

                    # Add provenance relating the propertyUrl to the column id
                    if '@id' in c:
                        self.g.add((p, PROV['wasDerivedFrom'], URIRef(c['@id'])))

                except:
                    # print row[0], value
                    traceback.print_exc()

            # We increment the observation (row number) with one
            obs_count += 1

        # for s,p,o in self.g.triples((None,None,None)):
        #     print(s.__repr__,p.__repr__,o.__repr__)

        logger.debug(
            "{} row skips caused by multiprocessing (multiple of chunksize exceeds number of rows in file)...".format(mult_proc_counter))
        logger.debug(
            "{} errors encountered while trying to iterate over a NoneType...".format(iter_error_counter))
        logger.info("... done")
        return self.ds.serialize(format=self.output_format)

    # def serialize(self):
    #     trig_file_name = self.file_name + '.trig'
    #     logger.info("Starting serialization to {}".format(trig_file_name))
    #
    #     with open(trig_file_name, 'w') as f:
    #         self.np.serialize(f, format='trig')
    #     logger.info("... done")

    def render_pattern(self, pattern, row):
        """Takes a Jinja or Python formatted string, and applies it to the row value"""
        # Significant speedup by not re-instantiating Jinja templates for every
        # row.

        if pattern in self.templates:
            template = self.templates[pattern]
        else:
            template = self.templates[pattern] = Template(pattern)

        # TODO This should take into account the special CSVW instructions such as {_row}
        # First we interpret the url_pattern as a Jinja2 template, and pass all
        # column/value pairs as arguments
        # row = {str('Int'): int('104906'), str('Country'): str('Luxembourg'), str('_row'): 1, str('Rank'): str('2')}

        # print(pattern)
        # print(type(pattern))
        # print(row)
        # print(type(row))
        # rendered_template = template.render(Int=120000)

        rendered_template = template.render(**row)

        try:
            # We then format the resulting string using the standard Python2
            # expressions
            return rendered_template.format(**row)
        except:
            logger.warning(
                u"Could not apply python string formatting, probably due to mismatched curly brackets. IRI will be '{}'. ".format(rendered_template))
            return rendered_template

    def expandURL(self, url_pattern, row, datatype=False):
        """Takes a Jinja or Python formatted string, applies it to the row values, and returns it as a URIRef"""

        try:
            unicode_url_pattern = unicode(url_pattern)
        except NameError:
            unicode_url_pattern = str(url_pattern).split(')')[0].split('(')[-1]
        # print(unicode_url_pattern)

        url = self.render_pattern(unicode_url_pattern, row)

        # DEPRECATED
        # for ns, nsuri in namespaces.items():
        #     if url.startswith(ns):
        #         url = url.replace(ns + ':', nsuri)
        #         break

        try:
            iri = iribaker.to_iri(url)
            rfc3987.parse(iri, rule='IRI')
        except:
            raise Exception(u"Cannot convert `{}` to valid IRI".format(url))

        # print(iri)
        return URIRef(iri)

    def isValueNull(self, value, c):
        """This checks whether we should continue parsing this cell, or skip it because it is empty or a null value."""
        try:
            if len(value) == 0 and unicode(c.csvw_parseOnEmpty) == u"true":
                # print("Not skipping empty value")
                return False #because it should not be skipped
            elif len(value) == 0 or value == unicode(c.csvw_null) or value in [unicode(n) for n in c.csvw_null] or value == unicode(self.schema.csvw_null):
                # Skip value if its length is zero or the value equals (one of) the specified 'null' value(s)
                # logger.debug(
                #     "Length is 0 or value is equal to specified 'null' value")
                return True
        except:
            # logger.debug("null does not exist or is not a list.") #this line will print for every cell in a csv without a defined null value.
            pass
        return False
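The datatype handling above reduces to a small set of casting rules for the object term. A standalone restatement of those rules with illustrative values (not part of the original converter):

from rdflib import Literal, URIRef
from rdflib.namespace import XSD
import iribaker

value, datatype, lang = 'Luxembourg', XSD.string, 'en'      # illustrative cell values

if datatype == XSD.anyURI:
    o = URIRef(iribaker.to_iri(value))                      # cast to an IRI resource
elif datatype == XSD.string and lang is not None:
    o = Literal(value, lang=lang)                           # language-tagged literal
elif datatype is not None:
    o = Literal(value, datatype=datatype, normalize=False)  # typed literal
else:
    o = Literal(value)                                      # plain literal
print(o.n3())                                               # "Luxembourg"@en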
Esempio n. 20
0
from oldman import ClientResourceManager, parse_graph_safely, SPARQLDataStore
from oldman.rest.crud import HashLessCRUDer


logging.config.fileConfig(path.join(path.dirname(__file__), 'logging.ini'))



sesame_iri = "http://*****:*****@context": [
        {
            "myvoc": MY_VOC,
Esempio n. 21
0
class LongTermMemory(object):

    ONE_TO_ONE_PREDICATES = [
        'age', 'born_in', 'faceID', 'favorite', 'favorite_of', 'id', 'is_from',
        'manufactured_in', 'mother_is', 'name'
    ]

    def __init__(self, address=config.BRAIN_URL_LOCAL):
        """
        Interact with Triple store

        Parameters
        ----------
        address: str
            IP address and port of the Triple store
        """

        self.address = address
        self.namespaces = {}
        self.ontology_paths = {}
        self.format = 'trig'
        self.dataset = Dataset()
        self.query_prefixes = """
                    prefix gaf: <http://groundedannotationframework.org/gaf#> 
                    prefix grasp: <http://groundedannotationframework.org/grasp#> 
                    prefix leolaniInputs: <http://cltl.nl/leolani/inputs/>
                    prefix leolaniFriends: <http://cltl.nl/leolani/friends/> 
                    prefix leolaniTalk: <http://cltl.nl/leolani/talk/> 
                    prefix leolaniTime: <http://cltl.nl/leolani/time/> 
                    prefix leolaniWorld: <http://cltl.nl/leolani/world/> 
                    prefix n2mu: <http://cltl.nl/leolani/n2mu/> 
                    prefix ns1: <urn:x-rdflib:> 
                    prefix owl: <http://www.w3.org/2002/07/owl#> 
                    prefix prov: <http://www.w3.org/ns/prov#> 
                    prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> 
                    prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> 
                    prefix sem: <http://semanticweb.cs.vu.nl/2009/11/sem/> 
                    prefix skos: <http://www.w3.org/2004/02/skos/core#> 
                    prefix time: <http://www.w3.org/TR/owl-time/#> 
                    prefix xml: <http://www.w3.org/XML/1998/namespace> 
                    prefix xml1: <https://www.w3.org/TR/xmlschema-2/#> 
                    prefix xsd: <http://www.w3.org/2001/XMLSchema#>
                    """

        self._define_namespaces()
        self._get_ontology_path()
        self._bind_namespaces()

        self.my_uri = None

        self._log = logger.getChild(self.__class__.__name__)
        self._log.debug("Booted")

    #################################### Main functions to interact with the brain ####################################

    def update(self, capsule):
        """
        Main function to interact with if a statement is coming into the brain. Takes in a structured parsed statement,
        transforms it into triples, and posts them to the triple store
        :param capsule: Structured data of a parsed statement
        :return: json response containing the status for posting the triples, and the original statement
        """
        # Case fold
        capsule = casefold_capsule(capsule)

        # Create graphs and triples
        self._model_graphs_(capsule)

        data = self._serialize(config.BRAIN_LOG)

        code = self._upload_to_brain(data)

        # Create JSON output
        capsule["date"] = str(capsule["date"])
        output = {'response': code, 'statement': capsule}

        return output

    def experience(self, capsule):
        """
        Main function to interact with if an experience is coming into the brain. Takes in a structured parsed experience,
        transforms it into triples, and posts them to the triple store
        :param capsule: Structured data of a parsed statement
        :return: json response containing the status for posting the triples, and the original statement
        """
        # Case fold
        capsule = casefold_capsule(capsule)

        # Create graphs and triples
        self._model_graphs_(capsule, type='Experience')

        data = self._serialize(config.BRAIN_LOG)

        code = self._upload_to_brain(data)

        # Create JSON output
        capsule["date"] = str(capsule["date"])
        output = {'response': code, 'statement': capsule}

        return output

    def query_brain(self, capsule):
        """
        Main function to interact with if a question is coming into the brain. Takes in a structured parsed question,
        transforms it into a query, and queries the triple store for a response
        :param capsule: Structured data of a parsed question
        :return: json response containing the results of the query, and the original question
        """
        # Case fold
        capsule = casefold_capsule(capsule)

        # Generate query
        query = self._create_query(capsule)

        # Perform query
        response = self._submit_query(query)

        # Create JSON output
        if 'date' in capsule.keys():
            capsule["date"] = str(capsule["date"])
        output = {'response': response, 'question': capsule}

        return output

    def process_visual(self, item, exact_only=True):
        """
        Main function to determine whether this item can be recognized by the brain, learned from the web, or neither
        :param item:
        :return:
        """

        if casefold(item) in self.get_classes():
            # If this is in the ontology already, create sensor triples directly
            text = 'I know about %s. I will remember this object' % item
            return item, text

        temp = self.get_labels_and_classes()
        if casefold(item) in temp.keys():
            # If this is in the ontology already, create sensor triples directly
            text = 'I know about %s. It is of type %s. I will remember this object' % (
                item, temp[item])
            return item, text

        # Query the web for information
        class_type, description = self.exact_match_dbpedia(item)
        if class_type is not None:
            # Had to learn it, but I can create triples now
            text = 'I did not know what %s is, but I searched on the web and I found that it is a %s. ' \
                   'I will remember this object' % (item, class_type)
            return casefold(class_type), text

        if not exact_only:
            # Second go at dbpedia, relaxed approach
            class_type, description = self.keyword_match_dbpedia(item)
            if class_type is not None:
                # Had to really search for it to learn it, but I can create triples now
                text = 'I did not know what %s is, but I searched for fuzzy matches on the web and I found that it ' \
                       'is a %s. I will remember this object' % (item, class_type)
                return casefold(class_type), text

        # Failure, nothing found
        text = 'I am sorry, I could not learn anything on %s so I will not remember it' % item
        return None, text

    ########## management system for keeping track of chats and turns ##########
    def get_last_chat_id(self):
        """
        Get the id for the last interaction recorded
        :return: id
        """
        query = read_query('last_chat_id')
        response = self._submit_query(query)

        return int(response[0]['chatid']['value']) if response else 0

    def get_last_turn_id(self, chat_id):
        """
        Get the id for the last turn in the given chat
        :param chat_id: id for chat of interest
        :return:  id
        """
        query = read_query('last_turn_id') % (chat_id)
        response = self._submit_query(query)

        last_turn = 0
        for turn in response:
            turn_uri = turn['s']['value']
            turn_id = turn_uri.split('/')[-1][10:]
            turn_id = int(turn_id)

            if turn_id > last_turn:
                last_turn = turn_id

        return last_turn

    ########## brain structure exploration ##########
    def get_predicates(self):
        """
        Get predicates in social ontology
        :return:
        """
        query = read_query('predicates')
        response = self._submit_query(query)

        return [elem['p']['value'].split('/')[-1] for elem in response]

    def get_classes(self):
        """
        Get classes in social ontology
        :return:
        """
        query = read_query('classes')
        response = self._submit_query(query)

        return [elem['o']['value'].split('/')[-1] for elem in response]

    def get_labels_and_classes(self):
        """
        Get classes in social ontology
        :return:
        """
        query = read_query('labels_and_classes')
        response = self._submit_query(query)

        temp = dict()
        for r in response:
            temp[r['l']['value']] = r['o']['value'].split('/')[-1]

        return temp

    ########## learned facts exploration ##########
    def count_statements(self):
        """
        Count statements or 'facts' in the brain
        :return:
        """
        query = read_query('count_statements')
        response = self._submit_query(query)
        return response[0]['count']['value']

    def count_friends(self):
        """
        Count number of people I have talked to
        :return:
        """
        query = read_query('count_friends')
        response = self._submit_query(query)
        return response[0]['count']['value']

    def get_my_friends(self):
        """
        Get names of people I have talked to
        :return:
        """
        query = read_query('my_friends')
        response = self._submit_query(query)
        return [elem['name']['value'].split('/')[-1] for elem in response]

    def get_best_friends(self):
        """
        Get names of the 5 people I have talked to the most
        :return:
        """
        query = read_query('best_friends')
        response = self._submit_query(query)
        return [elem['name']['value'] for elem in response]

    def get_instance_of_type(self, instance_type):
        """
        Get instances of a certain class type
        :param instance_type: name of class in ontology
        :return:
        """
        query = read_query('instance_of_type') % (instance_type)
        response = self._submit_query(query)
        return [elem['name']['value'] for elem in response]

    def when_last_chat_with(self, actor_label):
        """
        Get time value for the last time I chatted with this person
        :param actor_label: name of person
        :return:
        """
        query = read_query('when_last_chat_with') % (actor_label)
        response = self._submit_query(query)
        return response[0]['time']['value'].split('/')[-1]

    def get_triples_with_predicate(self, predicate):
        """
        Get triples that contain this predicate
        :param predicate:
        :return:
        """
        query = read_query('triples_with_predicate') % predicate
        response = self._submit_query(query)
        return [(elem['sname']['value'], elem['oname']['value'])
                for elem in response]

    ########## conflicts ##########
    def get_all_conflicts(self):
        """
        Aggregate all conflicts in brain
        :return:
        """
        conflicts = []
        for predicate in self.ONE_TO_ONE_PREDICATES:
            conflicts.extend(self._get_conflicts_with_predicate(predicate))

        return conflicts

    ########## semantic web ##########
    def exact_match_dbpedia(self, item):
        """
        Query dbpedia for information on this item to get its semantic type and description.
        :param item:
        :return:
        """

        # Gather combinations
        combinations = [item, item.lower(), item.capitalize(), item.title()]

        for comb in combinations:
            # Try exact matching query
            query = read_query('dbpedia_type_and_description') % (comb)
            response = self._submit_query(query)

            # break if we have a hit
            if response:
                break

        class_type = response[0]['label_type']['value'] if response else None
        description = response[0]['description']['value'].split(
            '.')[0] if response else None

        return class_type, description

    def keyword_match_dbpedia(self, item):
        # Query API
        r = requests.get(
            'http://lookup.dbpedia.org/api/search.asmx/KeywordSearch',
            params={
                'QueryString': item,
                'MaxHits': '10'
            },
            headers={
                'Accept': 'application/json'
            }).json()['results']

        # Fuzzy match
        choices = [e['label'] for e in r]
        best_match = process.extractOne(item, choices)  # fuzzy-match the item itself, not the literal string "item"

        # Get best match object
        r = [{
            'label': e['label'],
            'classes': e['classes'],
            'description': e['description']
        } for e in r if e['label'] == best_match[0]]

        if r:
            r = r[0]

            if r['classes']:
                # process dbpedia classes only
                r['classes'] = [
                    c['label'] for c in r['classes'] if 'dbpedia' in c['uri']
                ]

        else:
            r = {'label': None, 'classes': None, 'description': None}

        return r['classes'][0] if r['classes'] else None, r[
            'description'].split('.')[0] if r['description'] else None

    ######################################## Helpers for setting up connection ########################################

    def _define_namespaces(self):
        """
        Define namespaces for different layers (ontology/vocab and resource). Assign them to self
        :return:
        """
        # Namespaces for the instance layer
        instance_vocab = 'http://cltl.nl/leolani/n2mu/'
        self.namespaces['N2MU'] = Namespace(instance_vocab)
        instance_resource = 'http://cltl.nl/leolani/world/'
        self.namespaces['LW'] = Namespace(instance_resource)

        # Namespaces for the mention layer
        mention_vocab = 'http://groundedannotationframework.org/gaf#'
        self.namespaces['GAF'] = Namespace(mention_vocab)
        mention_resource = 'http://cltl.nl/leolani/talk/'
        self.namespaces['LTa'] = Namespace(mention_resource)

        # Namespaces for the attribution layer
        attribution_vocab = 'http://groundedannotationframework.org/grasp#'
        self.namespaces['GRASP'] = Namespace(attribution_vocab)
        attribution_resource_friends = 'http://cltl.nl/leolani/friends/'
        self.namespaces['LF'] = Namespace(attribution_resource_friends)
        attribution_resource_inputs = 'http://cltl.nl/leolani/inputs/'
        self.namespaces['LI'] = Namespace(attribution_resource_inputs)

        # Namespaces for the temporal layer-ish
        time_vocab = 'http://www.w3.org/TR/owl-time/#'
        self.namespaces['TIME'] = Namespace(time_vocab)
        time_resource = 'http://cltl.nl/leolani/time/'
        self.namespaces['LTi'] = Namespace(time_resource)

        # The namespaces of external ontologies
        skos = 'http://www.w3.org/2004/02/skos/core#'
        self.namespaces['SKOS'] = Namespace(skos)

        prov = 'http://www.w3.org/ns/prov#'
        self.namespaces['PROV'] = Namespace(prov)

        sem = 'http://semanticweb.cs.vu.nl/2009/11/sem/'
        self.namespaces['SEM'] = Namespace(sem)

        xml = 'https://www.w3.org/TR/xmlschema-2/#'
        self.namespaces['XML'] = Namespace(xml)

    def _get_ontology_path(self):
        """
        Define ontology paths to key vocabularies
        :return:
        """
        self.ontology_paths[
            'n2mu'] = './../../knowledge_representation/ontologies/leolani.ttl'
        self.ontology_paths[
            'gaf'] = './../../knowledge_representation/ontologies/gaf.rdf'
        self.ontology_paths[
            'grasp'] = './../../knowledge_representation/ontologies/grasp.rdf'
        self.ontology_paths[
            'sem'] = './../../knowledge_representation/ontologies/sem.rdf'

    def _bind_namespaces(self):
        """
        Bind namespaces
        :return:
        """
        self.dataset.bind('n2mu', self.namespaces['N2MU'])
        self.dataset.bind('leolaniWorld', self.namespaces['LW'])
        self.dataset.bind('gaf', self.namespaces['GAF'])
        self.dataset.bind('leolaniTalk', self.namespaces['LTa'])
        self.dataset.bind('grasp', self.namespaces['GRASP'])
        self.dataset.bind('leolaniFriends', self.namespaces['LF'])
        self.dataset.bind('leolaniInputs', self.namespaces['LI'])
        self.dataset.bind('time', self.namespaces['TIME'])
        self.dataset.bind('leolaniTime', self.namespaces['LTi'])
        self.dataset.bind('skos', self.namespaces['SKOS'])
        self.dataset.bind('prov', self.namespaces['PROV'])
        self.dataset.bind('sem', self.namespaces['SEM'])
        self.dataset.bind('xml', self.namespaces['XML'])
        self.dataset.bind('owl', OWL)

    ######################################## Helpers for statement processing ########################################

    def create_chat_id(self, actor, date):
        """
        Determine chat id depending on my last conversation with this person
        :param actor:
        :param date:
        :return:
        """
        self._log.debug('Chat with {} on {}'.format(actor, date))

        query = read_query('last_chat_with') % (actor)
        response = self._submit_query(query)

        if response and int(response[0]['day']['value']) == int(date.day) \
                and int(response[0]['month']['value']) == int(date.month) \
                and int(response[0]['year']['value']) == int(date.year):
            # Chatted with this person today so same chat id
            chat_id = int(response[0]['chatid']['value'])
        else:
            # Either have never chatted with this person, or I have but not today. Add one to latest chat
            chat_id = self.get_last_chat_id() + 1

        return chat_id

    def create_turn_id(self, chat_id):
        self._log.debug('Turn in chat {}'.format(chat_id))

        query = read_query('last_turn_in_chat') % (chat_id)
        response = self._submit_query(query)
        return int(response[0]['turnid']['value']) + 1 if response else 1

    def _generate_leolani(self, instance_graph):
        # Create Leolani
        leolani_id = 'leolani'
        leolani_label = 'leolani'

        leolani = URIRef(to_iri(self.namespaces['LW'] + leolani_id))
        leolani_label = Literal(leolani_label)
        leolani_type1 = URIRef(to_iri(self.namespaces['N2MU'] + 'robot'))
        leolani_type2 = URIRef(to_iri(self.namespaces['GRASP'] + 'Instance'))

        instance_graph.add((leolani, RDFS.label, leolani_label))
        instance_graph.add((leolani, RDF.type, leolani_type1))
        instance_graph.add((leolani, RDF.type, leolani_type2))

        self.my_uri = leolani

        return leolani

    def _generate_subject(self, capsule, instance_graph):
        if capsule['subject']['type'] == '':  # We only get the label
            subject_vocab = OWL
            subject_type = 'Thing'
        else:
            subject_vocab = self.namespaces['N2MU']
            subject_type = capsule['subject']['type']

        subject_id = capsule['subject']['label']

        subject = URIRef(to_iri(self.namespaces['LW'] + subject_id))
        subject_label = Literal(subject_id)
        subject_type1 = URIRef(to_iri(subject_vocab + subject_type))
        subject_type2 = URIRef(to_iri(self.namespaces['GRASP'] + 'Instance'))

        instance_graph.add((subject, RDFS.label, subject_label))
        instance_graph.add((subject, RDF.type, subject_type1))
        instance_graph.add((subject, RDF.type, subject_type2))

        return subject, subject_label

    def _create_leolani_world(self, capsule, type='Statement'):
        # Instance graph
        instance_graph_uri = URIRef(to_iri(self.namespaces['LW'] +
                                           'Instances'))
        instance_graph = self.dataset.graph(instance_graph_uri)

        # Subject
        if type == 'Statement':
            subject, subject_label = self._generate_subject(
                capsule, instance_graph)
        elif type == 'Experience':
            subject = self._generate_leolani(
                instance_graph) if self.my_uri is None else self.my_uri
            subject_label = 'leolani'

        # Object
        if capsule['object']['type'] == '':  # We only get the label
            object_vocab = OWL
            object_type = 'Thing'
        else:
            object_vocab = self.namespaces['N2MU']
            object_type = capsule['object']['type']

        object_id = capsule['object']['label']

        object = URIRef(to_iri(self.namespaces['LW'] + object_id))
        object_label = Literal(object_id)
        object_type1 = URIRef(to_iri(object_vocab + object_type))
        object_type2 = URIRef(to_iri(self.namespaces['GRASP'] + 'Instance'))

        instance_graph.add((object, RDFS.label, object_label))
        instance_graph.add((object, RDF.type, object_type1))
        instance_graph.add((object, RDF.type, object_type2))

        if type == 'Statement':
            claim_graph, statement = self._create_claim_graph(
                subject,
                subject_label,
                object,
                object_label,
                capsule['predicate']['type'],
                type='Statement')
        elif type == 'Experience':
            claim_graph, statement = self._create_claim_graph(
                subject,
                subject_label,
                object,
                object_label,
                'sees',
                type='Experience')

        return instance_graph, claim_graph, subject, object, statement

    def _create_claim_graph(self,
                            subject,
                            subject_label,
                            object,
                            object_label,
                            predicate,
                            type='Statement'):
        # Claim graph
        claim_graph_uri = URIRef(to_iri(self.namespaces['LW'] + 'Claims'))
        claim_graph = self.dataset.graph(claim_graph_uri)

        # Statement
        statement_id = hash_statement_id(
            [subject_label, predicate, object_label])

        statement = URIRef(to_iri(self.namespaces['LW'] + statement_id))
        statement_type1 = URIRef(to_iri(self.namespaces['GRASP'] + type))
        statement_type2 = URIRef(to_iri(self.namespaces['GRASP'] + 'Instance'))
        statement_type3 = URIRef(to_iri(self.namespaces['SEM'] + 'Event'))

        # Create graph and add triple
        graph = self.dataset.graph(statement)
        graph.add((subject, self.namespaces['N2MU'][predicate], object))

        claim_graph.add((statement, RDF.type, statement_type1))
        claim_graph.add((statement, RDF.type, statement_type2))
        claim_graph.add((statement, RDF.type, statement_type3))

        return claim_graph, statement
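
    # `hash_statement_id` is an external helper that is not shown in this excerpt;
    # it is assumed to derive a stable identifier for the statement from the three
    # labels, conceptually something along the lines of:
    #
    #   def hash_statement_id(triple):
    #       return '_'.join(str(part) for part in triple)
    #
    # so that the same (subject, predicate, object) always maps to the same named graph.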

    def _create_leolani_talk(self, capsule, leolani, type='Statement'):
        # Interaction graph
        if type == 'Statement':
            graph_to_write = 'Interactions'
        elif type == 'Experience':
            graph_to_write = 'Sensors'

        interaction_graph_uri = URIRef(
            to_iri(self.namespaces['LTa'] + graph_to_write))
        interaction_graph = self.dataset.graph(interaction_graph_uri)

        # Time
        date = capsule["date"]
        time = URIRef(
            to_iri(self.namespaces['LTi'] + str(capsule["date"].isoformat())))
        time_type = URIRef(
            to_iri(self.namespaces['TIME'] + 'DateTimeDescription'))
        day = Literal(date.day, datatype=self.namespaces['XML']['gDay'])
        month = Literal(date.month,
                        datatype=self.namespaces['XML']['gMonth'])
        year = Literal(date.year, datatype=self.namespaces['XML']['gYear'])
        time_unitType = URIRef(to_iri(self.namespaces['TIME'] + 'unitDay'))

        interaction_graph.add((time, RDF.type, time_type))
        interaction_graph.add((time, self.namespaces['TIME']['day'], day))
        interaction_graph.add((time, self.namespaces['TIME']['month'], month))
        interaction_graph.add((time, self.namespaces['TIME']['year'], year))
        interaction_graph.add(
            (time, self.namespaces['TIME']['unitType'], time_unitType))

        # Actor
        actor_id = capsule['author']
        actor_label = capsule['author']

        actor = URIRef(to_iri(self.namespaces['LF'] + actor_id))
        actor_label = Literal(actor_label)
        actor_type1 = URIRef(to_iri(self.namespaces['SEM'] + 'Actor'))
        actor_type2 = URIRef(to_iri(self.namespaces['GRASP'] + 'Instance'))

        if type == 'Statement':
            actor_type3 = URIRef(to_iri(self.namespaces['N2MU'] + 'person'))
        elif type == 'Experience':
            actor_type3 = URIRef(to_iri(self.namespaces['N2MU'] + 'sensor'))

        interaction_graph.add((actor, RDFS.label, actor_label))
        interaction_graph.add((actor, RDF.type, actor_type1))
        interaction_graph.add((actor, RDF.type, actor_type2))
        interaction_graph.add((actor, RDF.type, actor_type3))

        # Add leolani knows/senses actor
        if type == 'Statement':
            predicate = 'knows'
        elif type == 'Experience':
            predicate = 'senses'

        interaction_graph.add(
            (leolani, self.namespaces['N2MU'][predicate], actor))
        _, _ = self._create_claim_graph(leolani, 'leolani', actor, actor_label,
                                        predicate, type)

        # Event and subevent
        event_id = self.create_chat_id(actor_label, date)
        if type == 'Statement':
            event_label = 'chat%s' % event_id
        elif type == 'Experience':
            event_label = 'visual%s' % event_id

        subevent_id = self.create_turn_id(event_id)
        if type == 'Statement':
            subevent_label = event_label + '_turn%s' % subevent_id
        elif type == 'Experience':
            subevent_label = event_label + '_object%s' % subevent_id

        turn = URIRef(to_iri(self.namespaces['LTa'] + subevent_label))
        turn_type1 = URIRef(to_iri(self.namespaces['SEM'] + 'Event'))
        if type == 'Statement':
            turn_type2 = URIRef(to_iri(self.namespaces['GRASP'] + 'Turn'))
        elif type == 'Experience':
            turn_type2 = URIRef(to_iri(self.namespaces['GRASP'] + 'Object'))

        interaction_graph.add((turn, RDF.type, turn_type1))
        interaction_graph.add((turn, RDF.type, turn_type2))
        interaction_graph.add(
            (turn, self.namespaces['N2MU']['id'], Literal(subevent_id)))
        interaction_graph.add(
            (turn, self.namespaces['SEM']['hasActor'], actor))
        interaction_graph.add((turn, self.namespaces['SEM']['hasTime'], time))

        chat = URIRef(to_iri(self.namespaces['LTa'] + event_label))
        chat_type1 = URIRef(to_iri(self.namespaces['SEM'] + 'Event'))
        if type == 'Statement':
            chat_type2 = URIRef(to_iri(self.namespaces['GRASP'] + 'Chat'))
        elif type == 'Experience':
            chat_type2 = URIRef(to_iri(self.namespaces['GRASP'] + 'Visual'))

        interaction_graph.add((chat, RDF.type, chat_type1))
        interaction_graph.add((chat, RDF.type, chat_type2))
        interaction_graph.add(
            (chat, self.namespaces['N2MU']['id'], Literal(event_id)))
        interaction_graph.add(
            (chat, self.namespaces['SEM']['hasActor'], actor))
        interaction_graph.add((chat, self.namespaces['SEM']['hasTime'], time))
        interaction_graph.add(
            (chat, self.namespaces['SEM']['hasSubevent'], turn))

        perspective_graph, mention, attribution = self._create_perspective_graph(
            capsule, subevent_label, type)

        # Link interactions and perspectives
        perspective_graph.add(
            (mention, self.namespaces['GRASP']['wasAttributedTo'], actor))
        perspective_graph.add(
            (mention, self.namespaces['GRASP']['hasAttribution'], attribution))
        perspective_graph.add(
            (mention, self.namespaces['PROV']['wasDerivedFrom'], chat))
        perspective_graph.add(
            (mention, self.namespaces['PROV']['wasDerivedFrom'], turn))

        return interaction_graph, perspective_graph, actor, time, mention, attribution
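
    # The event/turn labels built above follow a simple naming scheme: a statement
    # capsule yields something like 'chat<id>' with a subevent 'chat<id>_turn<id>',
    # while an experience yields 'visual<id>' with 'visual<id>_object<id>'; the
    # concrete ids come from create_chat_id and create_turn_id, which are defined
    # elsewhere in the class.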

    def _create_perspective_graph(self, capsule, turn_label, type='Statement'):
        # Perspective graph
        perspective_graph_uri = URIRef(
            to_iri(self.namespaces['LTa'] + 'Perspectives'))
        perspective_graph = self.dataset.graph(perspective_graph_uri)

        # Mention
        if type == 'Statement':
            mention_id = turn_label + '_char%s' % capsule['position']
        elif type == 'Experience':
            mention_id = turn_label + '_pixel%s' % capsule['position']
        mention = URIRef(to_iri(self.namespaces['LTa'] + mention_id))
        mention_type = URIRef(to_iri(self.namespaces['GRASP'] + 'Mention'))

        perspective_graph.add((mention, RDF.type, mention_type))

        # Attribution
        attribution_id = mention_id + '_CERTAIN'
        attribution = URIRef(to_iri(self.namespaces['LTa'] + attribution_id))
        attribution_type = URIRef(
            to_iri(self.namespaces['GRASP'] + 'Attribution'))
        attribution_value = URIRef(to_iri(self.namespaces['GRASP'] +
                                          'CERTAIN'))

        perspective_graph.add((attribution, RDF.type, attribution_type))
        perspective_graph.add((attribution, RDF.value, attribution_value))

        return perspective_graph, mention, attribution
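
    # Note that the attribution value is hard-coded to grasp:CERTAIN here, so every
    # mention is stored with maximum certainty regardless of the capsule content.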

    def _serialize(self, file_path):
        """
        Save graph to local file and return the serialized string
        :param file_path: path to where data will be saved
        :return: serialized data as string
        """
        # Save to file and also return the serialized string
        with open(file_path + '.' + self.format, 'w') as f:
            self.dataset.serialize(f, format=self.format)
        return self.dataset.serialize(format=self.format)
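
    # `self.format` is assumed to be a quad-aware RDF serialization (e.g. 'trig'),
    # since the dataset contains multiple named graphs and the upload below posts
    # with Content-Type 'application/x-<format>'.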

    def _upload_to_brain(self, data):
        """
        Post data to the brain
        :param data: serialized data as string
        :return: response status
        """
        self._log.debug("Posting triples")

        # From serialized string
        post_url = self.address + "/statements"
        response = requests.post(
            post_url,
            data=data,
            headers={'Content-Type': 'application/x-' + self.format})

        return str(response.status_code)
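
    # A hypothetical usage sketch, e.g. from another method of this class (the file
    # path and the returned status code are assumptions):
    #
    #   data = self._serialize('/tmp/capsule_dump')
    #   status = self._upload_to_brain(data)   # e.g. '204' when the store accepts the POST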

    def _model_graphs_(self, capsule, type='Statement'):
        # Leolani world (includes instance and claim graphs)
        instance_graph, claim_graph, subject, object, instance = self._create_leolani_world(
            capsule, type)

        # Identity
        leolani = self._generate_leolani(
            instance_graph) if self.my_uri is None else self.my_uri

        # Leolani talk (includes interaction and perspective graphs)
        interaction_graph, perspective_graph, actor, time, mention, attribution = self._create_leolani_talk(
            capsule, leolani, type)

        # Interconnections
        instance_graph.add(
            (subject, self.namespaces['GRASP']['denotedIn'], mention))
        instance_graph.add(
            (object, self.namespaces['GRASP']['denotedIn'], mention))

        instance_graph.add(
            (instance, self.namespaces['GRASP']['denotedBy'], mention))
        instance_graph.add(
            (instance, self.namespaces['SEM']['hasActor'], actor))
        instance_graph.add((instance, self.namespaces['SEM']['hasTime'], time))

        perspective_graph.add(
            (mention, self.namespaces['GRASP']['containsDenotation'], subject))
        perspective_graph.add(
            (mention, self.namespaces['GRASP']['containsDenotation'], object))
        perspective_graph.add(
            (mention, self.namespaces['GRASP']['denotes'], instance))

        perspective_graph.add(
            (attribution, self.namespaces['GRASP']['isAttributionFor'],
             mention))
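
    # `_model_graphs_` deliberately returns nothing: all triples are written into the
    # named graphs obtained from `self.dataset`, which is later serialized and
    # uploaded as a whole.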

    ######################################### Helpers for question processing #########################################

    def _create_query(self, parsed_question):
        _ = hash_statement_id([
            parsed_question['subject']['label'],
            parsed_question['predicate']['type'],
            parsed_question['object']['label']
        ])

        # Query subject
        if parsed_question['subject']['label'] == "":
            # Case fold
            # object_label = casefold_label(parsed_question['object']['label'])

            query = """
                SELECT ?slabel ?authorlabel
                        WHERE { 
                            ?s n2mu:%s ?o . 
                            ?s rdfs:label ?slabel . 
                            ?o rdfs:label '%s' .  
                            GRAPH ?g {
                                ?s n2mu:%s ?o . 
                            } . 
                            ?g grasp:denotedBy ?m . 
                            ?m grasp:wasAttributedTo ?author . 
                            ?author rdfs:label ?authorlabel .
                        }
                """ % (parsed_question['predicate']['type'],
                       parsed_question['object']['label'],
                       parsed_question['predicate']['type'])

        # Query object
        elif parsed_question['object']['label'] == "":
            query = """
                SELECT ?olabel ?authorlabel
                        WHERE { 
                            ?s n2mu:%s ?o .   
                            ?s rdfs:label '%s' .  
                            ?o rdfs:label ?olabel .  
                            GRAPH ?g {
                                ?s n2mu:%s ?o . 
                            } . 
                            ?g grasp:denotedBy ?m . 
                            ?m grasp:wasAttributedTo ?author . 
                            ?author rdfs:label ?authorlabel .
                        }
                """ % (parsed_question['predicate']['type'],
                       parsed_question['subject']['label'],
                       parsed_question['predicate']['type'])

        # Query existence
        else:
            query = """
                SELECT ?authorlabel ?v
                        WHERE { 
                            ?s n2mu:%s ?o .   
                            ?s rdfs:label '%s' .  
                            ?o rdfs:label '%s' .  
                            GRAPH ?g {
                                ?s n2mu:%s ?o . 
                            } . 
                            ?g grasp:denotedBy ?m . 
                            ?m grasp:wasAttributedTo ?author . 
                            ?author rdfs:label ?authorlabel .
                            ?m grasp:hasAttribution ?att .
                            ?att rdf:value ?v .
                        }
                """ % (parsed_question['predicate']['type'],
                       parsed_question['subject']['label'],
                       parsed_question['object']['label'],
                       parsed_question['predicate']['type'])

        query = self.query_prefixes + query

        return query
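
    # The shape of `parsed_question` mirrors the capsule structure; an assumed,
    # illustrative example that triggers the "query object" branch above:
    #
    #   parsed_question = {
    #       'subject':   {'label': 'bram'},
    #       'predicate': {'type': 'likes'},
    #       'object':    {'label': ''},
    #   }
    #
    # An empty subject label selects the first branch, and non-empty subject and
    # object labels select the existence query.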

    def _submit_query(self, query):
        # Set up connection
        sparql = SPARQLWrapper(self.address)

        # Response parameters
        sparql.setQuery(query)
        sparql.setReturnFormat(JSON)
        sparql.addParameter('Accept', 'application/sparql-results+json')
        response = sparql.query().convert()

        return response["results"]["bindings"]
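
    # SPARQLWrapper returns the standard SPARQL JSON results; each binding maps a
    # variable name to a dict with 'type' and 'value', e.g. (illustrative values):
    #
    #   [{'olabel': {'type': 'literal', 'value': 'pizza'},
    #     'authorlabel': {'type': 'literal', 'value': 'piek'}}]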

    ######################################### Helpers for conflict processing #########################################
    def _get_conflicts_with_predicate(self, one_to_one_predicate):
        query = """
            PREFIX n2mu: <http://cltl.nl/leolani/n2mu/>
            PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
            PREFIX grasp: <http://groundedannotationframework.org/grasp#>

            select ?sname 
                    (group_concat(?oname ; separator=";") as ?onames) 
                    (group_concat(?authorlabel ; separator=";") as ?authorlabels) 
            where { 
                GRAPH ?g {
                    ?s n2mu:%s ?o .
                    } .
                ?s rdfs:label ?sname .
                ?o rdfs:label ?oname .

                ?g grasp:denotedBy ?m . 
                ?m grasp:wasAttributedTo ?author . 
                ?author rdfs:label ?authorlabel .

            } group by ?sname having (count(distinct ?oname) > 1)
        """ % one_to_one_predicate

        response = self._submit_query(query)
        conflicts = []
        for item in response:
            conflict = {
                'subject': item['sname']['value'],
                'predicate': one_to_one_predicate,
                'objects': []
            }

            values = item['onames']['value'].split(';')
            authors = item['authorlabels']['value'].split(';')

            for val, auth in zip(values, authors):
                option = {'value': val, 'author': auth}
                conflict['objects'].append(option)

            conflicts.append(conflict)

        return conflicts
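
    # Example of the returned structure (illustrative values only):
    #
    #   [{'subject': 'bram',
    #     'predicate': 'lives_in',
    #     'objects': [{'value': 'amsterdam', 'author': 'piek'},
    #                 {'value': 'paris', 'author': 'selene'}]}]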