Example #1
def get_sparql_store(query_endpoint=None,
                     update_endpoint=None,
                     sparql_user=None,
                     sparql_pass=None):
    if not query_endpoint:
        query_endpoint = current_app.config.get("SPARQL_ENDPOINT")

    if not update_endpoint:
        update_endpoint = current_app.config.get("SPARQL_UPDATE_ENDPOINT")

    if not sparql_user:
        sparql_user = current_app.config.get("SPARQL_USER")

    if not sparql_pass:
        sparql_pass = current_app.config.get("SPARQL_PASS")

    auth = HTTPDigestAuth(sparql_user, sparql_pass)

    store = SPARQLUpdateStore(queryEndpoint=query_endpoint,
                              update_endpoint=update_endpoint,
                              auth=auth,
                              context_aware=True,
                              postAsEncoded=False,
                              node_to_sparql=_node_to_sparql)

    store.method = "POST"
    store.formula_aware = True

    return store
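
A minimal usage sketch for the factory above (assuming a Flask application context that supplies the SPARQL_* config keys; the graph IRI is a placeholder):

from rdflib import Graph, URIRef

store = get_sparql_store()
graph = Graph(store, identifier=URIRef("http://example.org/graph"))  # placeholder IRI
# Reads and writes on `graph` are forwarded to the configured query/update endpoints.
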
Example #2
 def __init__(self, endpoint):
     self.endpoint = endpoint
     query_ep = self.endpoint + '/query'
     update_ep = self.endpoint + '/update'
     self.sparql = SPARQLUpdateStore(queryEndpoint=query_ep,
                                     update_endpoint=update_ep,
                                     bNodeAsURI=True)
Example #3
def create_sparql_store(endpoint, update_endpoint=None, use_let_syntax=False):
    if update_endpoint is None:
        update_endpoint = endpoint
    store = SPARQLUpdateStore(queryEndpoint=endpoint,
                              update_endpoint=update_endpoint)
    store.open((endpoint,update_endpoint))
    return store
Example #4
def update_fuseki(config, files):
    """
    The current procedure first dumps the enriched graph to a temporary file in a dir accessible by
    the web server, then loads the file using the SPARQL LOAD operation.

    I first tried pushing the enriched graph directly to the update endpoint
    without writing a temporary file, but that approach failed for two reasons:
     - Using INSERT DATA with "lots" of triples (>> 20k) caused Fuseki to give a 500 response.
     - Using INSERT DATA with chunks of 20k triples worked well... when there were no blank nodes.
       If the same bnode were referenced in two different chunks, it would end up as *two* bnodes.
       Since we're using bnodes in RDF lists, many lists ended up broken. From the SPARQL ref.:

            Variables in QuadDatas are disallowed in INSERT DATA requests (see Notes 8 in the grammar).
            That is, the INSERT DATA statement only allows to insert ground triples. Blank nodes in
            QuadDatas are assumed to be disjoint from the blank nodes in the Graph Store,
            i.e., will be inserted with "fresh" blank nodes.

    Using tdbloader would be another option, but then we would still need a temp file, we would also need
    to put that file on a volume accessible to the docker container, and we would need to shutdown the
    server while loading the file. And it's a solution tied to Fuseki.

    I'm not aware of any limit on how large a graph Fuseki can load with the LOAD operation.
    I guess we'll find out.
    """

    if config["dumps_dir"] is None:
        raise Exception("The 'dumps_dir' option must be set")

    if config["dumps_dir_url"] is None:
        raise Exception("The 'dumps_dir_url' option must be set")

    tmpfile = "{}/import_{}.ttl".format(config["dumps_dir"].rstrip("/"), config["basename"])
    tmpfile_url = "{}/import_{}.ttl".format(config["dumps_dir_url"].rstrip("/"), config["basename"])

    tc = enrich_and_concat(files, tmpfile)

    c0 = get_graph_count(config)

    store = SPARQLUpdateStore("{}/sparql".format(config["fuseki"]), "{}/update".format(config["fuseki"]))
    graph_uri = URIRef(config["graph"])
    graph = Graph(store, graph_uri)

    logger.info("Fuseki: Loading %d triples into <%s> from %s", tc, graph_uri, tmpfile_url)

    # CLEAR GRAPH first to make sure all blank nodes are erased
    # https://github.com/scriptotek/emnesok/issues/70
    store.update("CLEAR GRAPH <{}>".format(graph_uri))

    store.update("LOAD <{}> INTO GRAPH <{}>".format(tmpfile_url, graph_uri))

    c1 = get_graph_count(config)
    if c0 == c1:
        logger.info("Fuseki: Graph <%s> updated, number of concepts unchanged", config["graph"])
    else:
        logger.info("Fuseki: Graph <%s> updated, number of concepts changed from %d to %d.", config["graph"], c0, c1)

    invalidate_varnish_cache(config["basename"])
    logger.info("Invalidated Varnish cache for %s", config["basename"])
Example #5
 def _do_init(self, endpoint, username, password, **kwargs):
     SPARQLUpdateStore.__init__(self, endpoint, endpoint,
                                node_to_sparql=_virtuoso_node_to_sparql,
                                node_from_result=_virtuoso_node_from_result,
                                **kwargs)
     self.setHTTPAuth('digest')
     self.setCredentials(username, password)
     self.setReturnFormat = "json"
     self.opened = True
Example #6
 def __init__(self, endpoint):
     self.endpoint = endpoint
     query_ep = self.endpoint + '/query'
     update_ep = self.endpoint + '/update'
     self.sparql = SPARQLUpdateStore(queryEndpoint=query_ep,
                                     update_endpoint=update_ep,
                                     bNodeAsURI=True)
Example #7
def clear():
    # Configurations
    config = ConfigParser()
    config.read('config.ini')

    endpoint_uri = config['Mandatory']['endpointURI']
    graph_uri = config['Mandatory']['graphURI']

    clean_graph_query = "CLEAR GRAPH <" + graph_uri + ">"

    # Set up endpoint and access to triple store
    sparql = SPARQLWrapper(endpoint_uri)
    sparql.setReturnFormat(JSON)
    sparql.setMethod(POST)
    store = SPARQLUpdateStore(endpoint_uri, endpoint_uri)

    # Specify the (named) graph we're working with
    sparql.addDefaultGraph(graph_uri)

    # Create an in memory graph
    g = Graph(store, identifier=graph_uri)

    # Cleanup the existing triples
    sparql.setQuery(clean_graph_query)
    sparql.query().convert()

    # Cleanup the graph instance
    g.close()
Example #8
 def __init__(self, base_uri, endpoint=None):
     self._base_uri = base_uri
     if endpoint is None:
         self._graph = Graph()  # memory store
     else:
         default_graph = URIRef(base_uri)
         store = SPARQLUpdateStore(endpoint)
         self._graph = Graph(store, identifier=default_graph)
Example #9
def endpoint(endpoint):
    defaultgraph = None
    store = SPARQLUpdateStore(queryEndpoint=endpoint,
                              update_endpoint=endpoint,
                              node_to_sparql=node_to_sparql,
                              node_from_result=node_from_result)
    graph = ConjunctiveGraph(store, defaultgraph)
    return graph
Example #10
    def __init__(self, config=None, create=None):
        """Create ontology database API with SQLAlchemy store.

        Parameters
        ----------
        config : [str, knowl.DBConfig], optional
            The path to a configuration file or the configuration object. By default None,
            which results in a configuration with default parameters (see knowl.DBConfig).
        create : bool, optional
            Whether or not the tables for the ontology (triplestore) should be initialized.
            Set to True if you are creating a new database, by default None.
            As per the SQLAlchemy documentation, the creation operation is idempotent; thus,
            it can be left at True, unless you specifically do not want to create a new database
            if one does not exist.
        """
        # initialize database config
        self.__config = DBConfig.factory(config)

        self.__username = None
        self.__password = None
        self.__create = create
        self.__store_type = self.config["store"]

        # configure database identifier (ontology IRI/base URL)
        self.__identifier = self.config.baseURL

        if self.store_type == "alchemy":
            self.__store = SQLAlchemy(identifier=self.identifier)
            self._graph = Graph(self.__store, identifier=self.identifier)
        elif self.store_type == "fuseki":
            self.__query_endpoint = f'http://{self.config["host"]}:{self.config["port"]}/{self.config["database"]}'
            self.__update_endpoint = f'http://{self.config["host"]}:{self.config["port"]}/{self.config["database"]}/update'
            self.__store = SPARQLUpdateStore(
                queryEndpoint=self.__query_endpoint + '/sparql',
                update_endpoint=self.__update_endpoint,
                context_aware=True,
                postAsEncoded=False,
                node_to_sparql=my_bnode_ext)
            self.__query_endpoint += '/query'
            self.__store.method = 'POST'
        else:
            raise Exception(f"Unknown store type {self.store_type}!")
Example #11
def engine_from_config(config, prefix):
    defaultgraph = None
    if prefix + "defaultGraph" in config:
        defaultgraph = URIRef(config[prefix + "defaultGraph"])
    if prefix + "queryEndpoint" in config:
        store = SPARQLUpdateStore(
            queryEndpoint=config[prefix + "queryEndpoint"],
            update_endpoint=config[prefix + "updateEndpoint"],
            default_query_method=POST,
            returnFormat=JSON,
            node_to_sparql=node_to_sparql)
        store._defaultReturnFormat = JSON
        store.setReturnFormat(JSON)
        graph = ConjunctiveGraph(store, defaultgraph)
    elif prefix + 'store' in config:
        graph = ConjunctiveGraph(store='Sleepycat', identifier=defaultgraph)
        graph.store.batch_unification = False
        graph.store.open(config[prefix + "store"], create=True)
    else:
        graph = ConjunctiveGraph(identifier=defaultgraph)
    return graph
Example #12
def get_ds0():
    update_endpoint = 'http://localhost:8890/sparql-auth'
    # query_endpoint = 'http://localhost:8890/sparql'
    store = SPARQLUpdateStore(update_endpoint,
                              update_endpoint,
                              autocommit=True)
    store.setHTTPAuth(DIGEST)
    store.setCredentials(user='******', passwd='admin')
    return Dataset(store)
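
A short usage sketch for the authenticated dataset above (the named-graph and resource IRIs are placeholders):

from rdflib import URIRef, Literal

ds = get_ds0()
g = ds.graph(URIRef('http://example.org/g'))  # named graph inside the dataset
g.add((URIRef('http://example.org/s'), URIRef('http://example.org/p'), Literal('o')))
# With autocommit=True each add() is sent to the update endpoint immediately.
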
Example #13
 def __init__(self, database, measurement):
     """
     initialize
     :param database:
     :param measurement:
     """
     self.database = database
     self.measurement = measurement
     self.store = SPARQLUpdateStore(
         queryEndpoint="{}:{}/{}/query".format(JF_HOST, JF_PORT,
                                               JF_DATASET),
         update_endpoint="{}:{}/{}/update".format(JF_HOST, JF_PORT,
                                                  JF_DATASET))
     self.graph = Graph(identifier=URIRef(GRAPH_ID), store=self.store)
     self.__init_graph()
Example #14
def engine_from_config(config, prefix):
    defaultgraph = None
    if prefix+"defaultGraph" in config:
        defaultgraph = URIRef(config[prefix+"defaultGraph"])
    if prefix+"queryEndpoint" in config:
        from rdflib.plugins.stores.sparqlstore import SPARQLUpdateStore
        store = SPARQLUpdateStore(queryEndpoint=config[prefix+"queryEndpoint"],
                                  update_endpoint=config[prefix+"updateEndpoint"])
        graph = ConjunctiveGraph(store,defaultgraph)
    elif prefix+'store' in config:
        graph = ConjunctiveGraph(store='Sleepycat',identifier=defaultgraph)
        graph.store.batch_unification = False
        graph.store.open(config[prefix+"store"], create=True)
    else:
        graph = ConjunctiveGraph(identifier=defaultgraph)
    return graph
Example #15
def load(load_resource, resources):
    print('Loading', load_resource.identifier)
    file_graph = Dataset(default_union=True)
    to_disk = False
    for used in load_resource[prov.used]:
        if used[RDF.type:setl.Persisted]:
            to_disk = True
            file_graph = Dataset(store='Sleepycat', default_union=True)
            tempdir = tempfile.mkdtemp()
            print("Gathering", load_resource.identifier, "into", tempdir)
            file_graph.store.open(tempdir, True)
            break
    if len(list(load_resource[prov.used])) == 1:
        file_graph = resources[load_resource.value(prov.used).identifier]
    else:
        for used in load_resource[prov.used]:
            print("Using", used.identifier)
            used_graph = resources[used.identifier]
            file_graph.namespace_manager = used_graph.namespace_manager
            #print used_graph.serialize(format="trig")
            file_graph.addN(used_graph.quads())

    for generated in load_resource.subjects(prov.wasGeneratedBy):
        # TODO: support LDP-based loading
        if generated[RDF.type:pv.File]:
            fmt = generated.value(dc['format'])
            if fmt is not None:
                fmt = fmt.value
            if fmt in formats:
                fmt = formats[fmt]
                print(fmt)
            with open(generated.identifier.replace("file://", ''), 'wb') as o:
                o.write(file_graph.serialize(format=fmt))
                o.close()
        elif generated[RDF.type:sd.Service]:
            from rdflib.plugins.stores.sparqlstore import SPARQLUpdateStore
            endpoint = generated.value(sd.endpoint,
                                       default=generated).identifier
            store = SPARQLUpdateStore(endpoint, endpoint, autocommit=False)
            endpoint_graph = Dataset(store=store,
                                     identifier=generated.identifier,
                                     default_union=True)
            endpoint_graph.addN(file_graph.quads())
            endpoint_graph.commit()
    if to_disk:
        file_graph.close()
Example #16
 def __initStore(self):
     store = SPARQLUpdateStore(queryEndpoint=getattr(settings, 'SPARQL_QUERY'),
                               update_endpoint=getattr(settings, 'SPARQL_UPDATE'),
                               postAsEncoded=False)
     store.bind("rdf", "http://www.w3.org/1999/02/22-rdf-syntax-ns#")
     store.bind("oa", "http://www.w3.org/ns/oa#")
     store.bind("chnode", getattr(settings, 'NODE_URI', 'http://localhost'))
     CharmeMiddleware.__store = store
     
     #Creates a superuser if there is not any
     try:
         users = User.objects.all()
         if len(users) == 0:
             User.objects.create_superuser('admin', '', 'admin')
     except DatabaseError:
         LOGGING.error("Cannot find or create an application superuser")         
Example #17
    def query(self, query, initNs, initBindings, queryGraph, **kwargs):
        prepared_base = None
        if hasattr(query, '_original_args'):
            query, prepared_ns, prepared_base = query._original_args
            if not initNs:
                initNs = prepared_ns
            else:
                prepared_ns = dict(prepared_ns)
                prepared_ns.update(initNs)
                initNs = prepared_ns

        base = kwargs.pop("base", None) or prepared_base
        if base is not None:
            query = '\n'.join([('BASE <%s>' % base), query])

        res = SPARQLUpdateStore.query(self, query, initNs, initBindings, queryGraph, **kwargs)
        if res.bindings is not None:
            res.bindings = ( FrozenBindings(None, i) for i in res.bindings )

        return res
Example #18
def engine_from_config(config, prefix):
    defaultgraph = None
    if prefix + "defaultGraph" in config:
        defaultgraph = URIRef(config[prefix + "defaultGraph"])
    if prefix + "queryEndpoint" in config:
        store = SPARQLUpdateStore(
            queryEndpoint=config[prefix + "queryEndpoint"],
            update_endpoint=config[prefix + "updateEndpoint"],
            default_query_method=POST,
            returnFormat=JSON,
            node_to_sparql=node_to_sparql)

        def publish(data, *graphs):
            s = requests.session()
            s.keep_alive = False
            result = s.post(
                store.endpoint,
                data=data,
                #                            params={"context-uri":graph.identifier},
                headers={'Content-Type': 'application/x-trig'})
            print(store.endpoint, result.content)

        store.publish = publish

        store._defaultReturnFormat = JSON
        store.setReturnFormat(JSON)
        graph = ConjunctiveGraph(store, defaultgraph)
    elif prefix + 'store' in config:
        graph = ConjunctiveGraph(store='Sleepycat', identifier=defaultgraph)
        graph.store.batch_unification = False
        graph.store.open(config[prefix + "store"], create=True)
    else:
        graph = ConjunctiveGraph(identifier=defaultgraph)

        def publish(data, *graphs):
            for nanopub in graphs:
                graph.addN(nanopub.quads())

        graph.store.publish = publish

    return graph
Example #19
def main():
    # Track executing time
    # start_time = time.time()
    headers = {'content-type': 'application/json'}  # HTTP header content type
    # Configurations
    config = ConfigParser()
    config.read('config_3.ini')

    endpoint_uri = config['Mandatory']['endpointURI']
    graph_uri = config['Mandatory']['graphURI']
    pool_uri = (config['Mandatory']['poolURI']).split(',')
    type_uri = (config['Mandatory']['typeURI']).split(',')

    # Set up endpoint and access to triple store
    sparql = SPARQLWrapper(endpoint_uri)
    sparql.setReturnFormat(JSON)
    sparql.setMethod(POST)
    store = SPARQLUpdateStore(endpoint_uri, endpoint_uri)

    # Specify the (named) graph we're working with
    sparql.addDefaultGraph(graph_uri)

    # Create an in memory graph
    g = Graph(store, identifier=graph_uri)

    # Build the RDF from the JSON source data
    # This function is to be called for each URL in the pool to harvest; when the source is JSON, the Estonian mapping is used
    def rdf(urlrdf, f):
        input = Graph()
        input.open("store2", create=True)
        input.parse(urlrdf, format=f)

        for s, p, o in input:
            g.add((s, p, o))

        input.close()

    def rdf_data(rdfobject, f):
        input = ConjunctiveGraph()
        input.open("store2", create=True)
        input.parse(data=rdfobject, format=f)

        #print(input.serialize(format='json-ld', auto_compact=True, indent=4))

        for s, p, o in input:
            g.add((s, p, o))

        input.close()

    # Set counter
    c = 0

    # Loop over all URI in the pool
    while c < len(pool_uri):
        #print(pool_uri[c],type_uri[c])

        if type_uri[c] == 'xlsx':
            url = "http://cpsv-ap.semic.eu/cpsv-ap_harvester/intapi/v1/importSpreadsheetFromURL?spreadsheetURL=" + urllib.parse.quote(
                pool_uri[c])
            text_json = requests.get(url).text
            my_json = json.loads(text_json)
            type_uri[c] = 'json-ld'

#validation
        url = "https://www.itb.ec.europa.eu/shacl/cpsv-ap/api/validate"
        if type_uri[c] == 'xml':
            myobj = {
                "contentSyntax": "application/rdf+xml",
                "contentToValidate": pool_uri[c],
                "embeddingMethod": "URL",
                "reportSyntax": "application/ld+json"
            }
        if type_uri[c] == 'turtle':
            myobj = {
                "contentSyntax": "text/turtle",
                "contentToValidate": pool_uri[c],
                "embeddingMethod": "URL",
                "reportSyntax": "application/ld+json"
            }
        if type_uri[c] == 'nt':
            myobj = {
                "contentSyntax": "application/n-triples",
                "contentToValidate": pool_uri[c],
                "embeddingMethod": "URL",
                "reportSyntax": "application/ld+json"
            }
        if type_uri[c] == 'json-ld':
            data = base64.urlsafe_b64encode(
                text_json.encode("utf-8")).decode('utf-8')
            myobj = {
                "contentSyntax": "application/ld+json",
                "contentToValidate": data,
                "embeddingMethod": "BASE64",
                "reportSyntax": "application/ld+json"
            }


#myobj = { "contentSyntax": "application/ld+json", "contentToValidate": pool_uri[c], "embeddingMethod": "BASE64", "reportSyntax": "application/ld+json" }
#data = base64.urlsafe_b64encode(myobj).encode()).decode()

        result_text_json = requests.post(url, json=myobj).text
        my_json = json.loads(result_text_json)
        #print(result_text_json)
        #print(my_json.get("sh:conforms"))
        if (my_json.get("sh:conforms") or type_uri[c] == 'jsonEstonia'):
            #if 1:
            #print(d2)
            #print(d2['id'])

            #print("with:colon is equal to {d['sh-conforms']}")
            #print({'sh-conforms'}.format(**d))
            #if (json_obj[0]["sh:conforms"] == true):
            #   print(pool_uri[c] + "is conform")
            print("* " + pool_uri[c] +
                  " is conform to CPSV-AP and it is harvested")
            if type_uri[c] == 'jsonEstonia':
                try:
                    # Fetch the JSON data
                    response = requests.get(pool_uri[c],
                                            headers=headers).json()

                    # Process the response
                    configJSON = ConfigParser()
                    configJSON.read('mapping_estonia.ini')
                    json_to_rdf(pool_uri[c], response, g, configJSON)

                except ValueError as e:
                    print(e)

            if type_uri[c] == 'xml' or type_uri[c] == 'turtle' or type_uri[
                    c] == 'nt':
                rdf(pool_uri[c], type_uri[c])

            if type_uri[c] == 'json-ld':
                #print(text_json)
                rdf_data(text_json, type_uri[c])
        else:
            print("* " + pool_uri[c] +
                  " is not conform to CPSV-AP and it is not harvested")

        # Counter update
        c += 1

    # Iterate over triples in store and print them out.
    print('\r\nNumber of triples added: %d' % len(g))

    # Cleanup the graph instance
    g.close()
Example #20
headers = {'content-type': 'application/json'}  # HTTP header content type
# Configurations
config = ConfigParser()
config.read('config_3.ini')

URI = sys.argv[1]
classType = sys.argv[2]

endpoint_uri = config['Mandatory']['endpointURI']
graph_uri = config['Mandatory']['graphURI']

# Set up endpoint and access to triple store
sparql = SPARQLWrapper(endpoint_uri)
sparql.setReturnFormat(JSON)
sparql.setMethod(POST)
store = SPARQLUpdateStore(endpoint_uri, endpoint_uri)

# Specify the (named) graph we're working with
sparql.addDefaultGraph(graph_uri)

# Create an in memory graph
g = Graph(store, identifier=graph_uri)

query = "select ?p ?o where {<"+ URI +"> ?p ?o}"
properties = g.query (query)

# Configurations mappings
mapping = ConfigParser()
mapping.read('mapping_fields.ini')

propURI = ""
Example #21
 def __init__(self, *args, **kwargs):
     self.publish = None
     SPARQLUpdateStore.__init__(self, *args, **kwargs)
Example #22
mu.sameAs.append(rdflib.URIRef("http://d-nb.info/gnd/117726516"))
si.co_author.append(mu)

print(owlready.to_n3(scholars_ontology))

import rdflib



from rdflib.plugins.stores.sparqlstore import NSSPARQLWrapper, SPARQLUpdateStore

#store = SPARQLUpdateStore(queryEndpoint="http://localhost:8890/sparql",
#                        update_endpoint="http://localhost:8890/sparql-auth",
#                       )

store = SPARQLUpdateStore(queryEndpoint="http://localhost:10214/blazegraph/sparql",
                        update_endpoint="http://localhost:10214/blazegraph/sparql",
                       )


store.setNamespaceBindings({"ont":"http://ontologies.mpiwg-berlin.mpg.de/scholarlyRelations",
                          "end":"http://entity.mpiwg-berlin.mpg.de/modernPhysics"})



#g = rdflib.Graph(store=store,identifier="http://entities.mpiwg-berlin.mpg.de/graphs/gr/V14TEST")
#g.store.setCredentials('admin','admin')
#g.store.setHTTPAuth('DIGEST')
#owlready.to_rdflib(scholars_ontology,g)
Example #23
class TruthyUpdater:
    def __init__(self, endpoint, dryrun=False, user=None, passwd=None):
        self.endpoint = SPARQLUpdateStore(endpoint,
                                          default_query_method='POST')
        if user:
            self.endpoint.setCredentials(user, passwd)
        self.dryrun = dryrun

    def build_truthy(self, np_list):
        self.insert_truthy_rank(np_list)
        self.delete_normal_rank(np_list)

    def update(self, query):
        if self.dryrun:
            action = 'INSERT' if 'INSERT' in query else 'DELETE'
            query = query.replace(action, 'CONSTRUCT')
            # print(query)
            try:
                res = self.endpoint.query(query)
                print('### About to {} following triples:'.format(action))
                for row in res:
                    print(' '.join(e.n3() for e in row))
            except ResultException:
                pass
        else:
            self.endpoint.update(query)

    def insert_truthy_rank(self, np_list):
        values = ' '.join(
            '( wd:%(node)s p:%(p)s ps:%(p)s psn:%(p)s wdt:%(p)s wdtn:%(p)s )' %
            {
                'node': n,
                'p': p
            } for n, p in np_list)
        query = '''
        INSERT {
          ?statement a wikibase:BestRank .
          ?node ?wdt ?psv ;
                ?wdtn ?psnv .
        } WHERE {
          %s
            {
                ?node ?p ?statement .
                ?statement wikibase:rank wikibase:PreferredRank ;
                           ?ps ?psv .
                OPTIONAL { ?statement ?psn ?psnv }
                FILTER NOT EXISTS { ?statement a wikibase:BestRank }
            }
            UNION
            {
              ?node ?p ?statement .
              ?statement wikibase:rank wikibase:NormalRank ;
                         ?ps ?psv .
              OPTIONAL { ?statement ?psn ?psnv }
              FILTER NOT EXISTS { ?statement a wikibase:BestRank }
              FILTER NOT EXISTS { ?node ?p [ wikibase:rank wikibase:PreferredRank ] }
            }
        }
        ''' % ('VALUES (?node ?p ?ps ?psn ?wdt ?wdtn ) { %s }' % values)
        self.update(query)

    def delete_normal_rank(self, np_list):
        values = ' '.join('( wd:%(node)s p:%(p)s wdt:%(p)s wdtn:%(p)s )' % {
            'node': n,
            'p': p
        } for n, p in np_list)
        query = '''
          DELETE {
            ?statement a wikibase:BestRank .
            ?node ?wdt ?value ;
              ?wdtn ?no .
          } WHERE {
            %s
            ?node ?p ?statement ;
                  ?p [ wikibase:rank wikibase:PreferredRank ] .
            ?statement a wikibase:BestRank ;
                       wikibase:rank wikibase:NormalRank .
          }
        ''' % ('VALUES (?node ?p ?wdt ?wdtn ) { %s }' % values)
        self.update(query)
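
A hedged usage sketch for the updater above (the endpoint URL and the item/property IDs are placeholders):

updater = TruthyUpdater('http://localhost:9999/bigdata/namespace/wdq/sparql', dryrun=True)
# dryrun=True rewrites INSERT/DELETE into CONSTRUCT and prints the triples that would change
updater.build_truthy([('Q42', 'P31'), ('Q42', 'P106')])
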
Example #24
def _get_store():
    store = SPARQLUpdateStore(queryEndpoint=SPARQL_QUERY,
                              update_endpoint=SPARQL_UPDATE,
                              postAsEncoded=False)
    print("update:" + SPARQL_UPDATE)
    return store
Example #25
 def __init__(self, endpoint, dryrun=False, user=None, passwd=None):
     self.endpoint = SPARQLUpdateStore(endpoint,
                                       default_query_method='POST')
     if user:
         self.endpoint.setCredentials(user, passwd)
     self.dryrun = dryrun
Example #26
def main():
    # Track executing time
    # start_time = time.time()
    headers = {'content-type': 'application/json'}  # HTTP header content type
    # Configurations
    config = ConfigParser()
    config.read('config.ini')
    
    endpoint_uri = config['Mandatory']['endpointURI']
    graph_uri = config['Mandatory']['graphURI']
    pool_uri = (config['Mandatory']['poolURI']).split(',')
    type_uri = (config['Mandatory']['typeURI']).split(',')

    # Set up endpoint and access to triple store
    sparql = SPARQLWrapper(endpoint_uri)
    sparql.setReturnFormat(JSON)
    sparql.setMethod(POST)
    store = SPARQLUpdateStore(endpoint_uri, endpoint_uri)

    # Specify the (named) graph we're working with
    sparql.addDefaultGraph(graph_uri)

    # Create an in memory graph
    g = Graph(store, identifier=graph_uri)

    # Build the RDF from the JSON source data
    # This function is to be called for each URL in the pool to harvest; when the source is JSON, the Estonian mapping is used
    def rdf(urlrdf, f):
        input = Graph()
        input.open("store2", create=True)
        input.parse(urlrdf, format=f)
        
        for s, p, o in input:
            g.add((s, p, o))

        input.close()

    # Set counter
    c = 0

    # Loop over all URI in the pool
    while c < len(pool_uri):
        print(pool_uri[c],type_uri[c])
        if type_uri[c] == 'jsonEstonia':
            try:
                # Fetch the JSON data
                response = requests.get(pool_uri[c], headers=headers).json()

                # Process the response
                configJSON = ConfigParser()
                configJSON.read('mapping_estonia.ini')
                json_to_rdf(pool_uri[c], response, g, configJSON)
    
            except ValueError as e:
                print(e)
                
        if type_uri[c] == 'xml' or type_uri[c] == 'turtle' or type_uri[c] == 'nt':
            rdf(pool_uri[c], type_uri[c])


        # Counter update
        c += 1

    # Iterate over triples in store and print them out.
    print('\r\nNumber of triples added: %d' % len(g))
    
    # Cleanup the graph instance
    g.close()
Example #27
class OntologyDatabase:
    """A front-end of an ontology database. This class provides "safe" access to most of the standard
    operations provided by the rdflib.Graph class. The "safeness" of the methods lies in catching
    exceptions and reconnecting should the connection to the database "die" for whatever reason.
    Additionally, this class sets up the SQLAlchemy (or Fuseki) store for the triples.

    """
    def __init__(self, config=None, create=None):
        """Create ontology database API with SQLAlchemy store.

        Parameters
        ----------
        config : [str, knowl.DBConfig], optional
            The path to a configuration file or the configuration object. By default None,
            which results in a configuration with default parameters (see knowl.DBConfig).
        create : bool, optional
            Whether or not the tables for the ontology (triplestore) should be initialized.
            Set to True if you are creating a new database, by default None.
            As per the SQLAlchemy documentation, the creation operation is idempotent; thus,
            it can be left at True, unless you specifically do not want to create a new database
            if one does not exist.
        """
        # initialize database config
        self.__config = DBConfig.factory(config)

        self.__username = None
        self.__password = None
        self.__create = create
        self.__store_type = self.config["store"]

        # configure database identifier (ontology IRI/base URL)
        self.__identifier = self.config.baseURL

        if self.store_type == "alchemy":
            self.__store = SQLAlchemy(identifier=self.identifier)
            self._graph = Graph(self.__store, identifier=self.identifier)
        elif self.store_type == "fuseki":
            self.__query_endpoint = f'http://{self.config["host"]}:{self.config["port"]}/{self.config["database"]}'
            self.__update_endpoint = f'http://{self.config["host"]}:{self.config["port"]}/{self.config["database"]}/update'
            self.__store = SPARQLUpdateStore(
                queryEndpoint=self.__query_endpoint + '/sparql',
                update_endpoint=self.__update_endpoint,
                context_aware=True,
                postAsEncoded=False,
                node_to_sparql=my_bnode_ext)
            self.__query_endpoint += '/query'
            self.__store.method = 'POST'
        else:
            raise Exception(f"Unknown store type {self.store_type}!")

    def setup(self, create=False, username: str = None, password: str = None):
        """Sets-up a new database connection. Call this to initialize access to the database.

        Parameters
        ----------
        create : bool, optional
            Whether the tables should be created (idempotent). Only set to True if creating a new database, by default False.
            Setting the object property self.create to anything but None will override this value!
        username : str, optional
            Database access credentials. Only set this if you didn't set it before (e.g. in the config file), by default None
        password : str, optional
            Database access credentials. Only set this if you didn't set it before (e.g. in the config file), by default None
        """
        if self.store_type == "alchemy":
            if self.__create is not None:
                create = self.__create
            self._graph.open(self.config.getDB_URI(
                self.__username if username is None else username,
                self.__password if password is None else password),
                             create=create)
        elif self.store_type == "fuseki":
            print(
                f"Query endpoint: {self.__query_endpoint}\nUpdate endpoint: {self.__update_endpoint}\nIndentifier: {self.identifier}"
            )
            self.__store.open((self.__query_endpoint, self.__update_endpoint))
            self._graph = Graph(self.__store, identifier=self.identifier)
        for ns, uri in self.config.namespaces.items():
            self._graph.bind(ns.lower(), uri)

    def closelink(self):
        """Closes the database connection.
        """
        try:
            self._graph.close()
        except Exception as e:
            print("Exception in Closing", e)

    def destroy(self, confirmation: str = None):
        """Destroys the store for the Ontology

        This will erase/destroy the database (triplestore) used to store the data.
        Be very careful when calling this function.

        Parameters
        ----------
        confirmation : str, optional
            [description], by default None
        """
        if confirmation == "I know what I am doing":
            self._graph.destroy(self.identifier)
        else:
            raise ValueError(
                "Destroying the DB attempted but failed - wrong confirmation string!"
            )

    def setCredentials(self, username: str = None, password: str = None):
        """Set access credentials for the database server containing the triplestore.

        Parameters
        ----------
        username : str, optional
            The username, by default None
        password : str, optional
            The password. Warning, this will be visible in the DB URL! By default None
        """
        self.__username = username
        self.__password = password

    @interact_with_db
    def mergeFileIntoDB(self, filepath: str):
        """Merge an existing ontology file into the current database. This could be used to populate
        a new ontology from an existing one stored as a file. The ontology is automatically merged
        and stored in the triplestore database server after calling this function.

        Parameters
        ----------
        filepath : str
            Path to the file containing the ontology. See RDFLib documentation,
            specifically, the function Graph.parse for supported formats.
        """
        tmpGraph = Graph()
        tmpGraph.parse(filepath)
        self._graph += tmpGraph

    @property
    def config(self):
        return self.__config

    @property
    def identifier(self):
        return self.__identifier

    @property
    def store_type(self):
        return self.__store_type

    @interact_with_db
    def bind(self, prefix, namespace, override=True):
        self._graph.bind(prefix.lower(), namespace, override)

    @interact_with_db
    def query(self, *args, **kwargs) -> Generator:
        return self._graph.query(*args, **kwargs)

    @interact_with_db
    def update(self, *args, **kwargs) -> Generator:
        return self._graph.update(*args, **kwargs)

    @interact_with_db
    def add(self, triple: tuple):
        """Adds a triple (s, p, o) into the database.
        This function is only a "safe" re-implementation of the original rdflib graph function.
        See rdflib.Graph documentation for more information.

        Parameters
        ----------
        triple : tuple
            (s, p, o) triple
        """
        self._graph.add(triple)

    @interact_with_db
    def addN(self, triples: list):
        """Adds n-triples into the database. This method is faster than adding
        individual triples one-by-one using the "add" method. This function
        also automatically adds the current graph as the context (unlike
        the original method from rdflib).
        This function is only a "safe" re-implementation of the original rdflib graph function.
        See rdflib.Graph documentation for more information.

        Parameters
        ----------
        triples : list
            list of (s, p, o) triples to be added into the database
        """
        # automatically add self.graph as context if not specified directly
        quads = [t + (self._graph, ) for t in triples if len(t) == 3]
        self._graph.addN(quads)

    @interact_with_db
    def remove(self, triple: tuple):
        """Remove the specified triple or triples from the database.
        Not all fields of the triple need to be specified. Omitted parts
        shall be replaced with the value None.
        In that case, all triples matching the provided pattern
        are removed.
        E.g., (someEntity, None, None) will remove all triples containing "someEntity"
        as the subject.

        This function is only a "safe" re-implementation of the original rdflib graph function.
        See rdflib.Graph documentation for more information.

        Parameters
        ----------
        triple : tuple
            (s, p, o) triple
        """
        self._graph.remove(triple)

    @interact_with_db
    def triples(self, triple: tuple):
        """Returns a generator for triples matching the provided pattern.
        The pattern/template triple can contain concrete values or None
        where the item shall be matched to anything.
        E.g., (None, RDF.type, None) will return all triples containing
        RDF.type as the predicate.

        This function is only a "safe" re-implementation of the original rdflib graph function.
        See rdflib.Graph documentation for more information.

        Parameters
        ----------
        triple : tuple
            (s, p, o) triple

        Returns
        -------
        generator
            generator of matching triples
        """
        return self._graph.triples(triple)

    @interact_with_db
    def subjects(self,
                 predicate: Identifier = None,
                 object: Identifier = None):
        """Returns a (list of) subject(s) matching the values provided as predicate
        and object. Similarly to triples, wildcard items can be replaced with None.

        This function is only a "safe" re-implementation of the original rdflib graph function.
        See rdflib.Graph documentation for more information.

        Parameters
        ----------
        predicate : Identifier, optional
            p, by default None
        object : Identifier, optional
            o, by default None

        Returns
        -------
        generator
            Subjects matching the query
        """
        return self._graph.subjects(predicate, object)

    @interact_with_db
    def subject_objects(self, predicate: Identifier = None):
        """Returns subjects and objects matching the value provided as predicate.

        See "subjects" and "triples" methods for more info.

        This function is only a "safe" re-implementation of the original rdflib graph function.
        See rdflib.Graph documentation for more information.

        Parameters
        ----------
        predicate : Identifier, optional
            p, by default None

        Returns
        -------
        generator
            The subjects and objects matching where predicate is set to the provided value
        """
        return self._graph.subject_objects(predicate)

    @interact_with_db
    def subject_predicates(self, object: Identifier = None):
        """Returns subjects and predicates matching the value provided as object.

        See "subjects" and "triples" methods for more info.

        This function is only a "safe" re-implementation of the original rdflib graph function.
        See rdflib.Graph documentation for more information.

        Parameters
        ----------
        object : Identifier, optional
            o, by default None

        Returns
        -------
        generator
            The subjects and predicates matching the query
        """
        return self._graph.subject_predicates(object)

    @interact_with_db
    def objects(self,
                subject: Identifier = None,
                predicate: Identifier = None):
        """Returns (a list of) object(s) matching the query.

        See "subjects" and "triples" methods for more info.

        This function is only a "safe" re-implementation of the original rdflib graph function.
        See rdflib.Graph documentation for more information.

        Parameters
        ----------
        subject : Identifier, optional
            s, by default None
        predicate : Identifier, optional
            p, by default None

        Returns
        -------
        generator
            The objects matching the query
        """
        return self._graph.objects(subject, predicate)

    @interact_with_db
    def predicates(self,
                   subject: Identifier = None,
                   object: Identifier = None):
        """Returns (a list of) predicate(s) matching the query.

        See "subjects" and "triples" methods for more info.

        This function is only a "safe" re-implementation of the original rdflib graph function.
        See rdflib.Graph documentation for more information.

        Parameters
        ----------
        subject : Identifier, optional
            s, by default None
        object : Identifier, optional
            o, by default None

        Returns
        -------
        generator
            The predicates matching the query.
        """
        return self._graph.predicates(subject, object)

    @interact_with_db
    def predicate_objects(self, subject: Identifier = None):
        """Returns predicates and objects where the subject matches
        the value specified in the function parameter.

        See "subjects" and "triples" methods for more info.

        This function is only a "safe" re-implementation of the original rdflib graph function.
        See rdflib.Graph documentation for more information.

        Parameters
        ----------
        subject : Identifier, optional
            s, by default None

        Returns
        -------
        generator
            The predicates and objects matching the query
        """
        return self._graph.predicate_objects(subject)

    @interact_with_db
    def transitive_subjects(self, predicate: Identifier, object: Identifier):
        """This function transitively generates subjects for the object,
        using only the value specified as predicate as the property.
        I.e., it "walks backwards" using only the predicate.
        E.g., transitive_subjects(parentOf, entity) will generate
        all ancestors of the object "entity".

        See "subjects" and "triples" methods for more info.

        This function is only a "safe" re-implementation of the original rdflib graph function.
        See rdflib.Graph documentation for more information.

        Parameters
        ----------
        predicate : Identifier
            p
        object : Identifier
            o

        Returns
        -------
        generator
            Generator of subjects matching the query.
        """
        return self._graph.transitive_subjects(predicate, object)

    @interact_with_db
    def transitive_objects(self, subject: Identifier, property: Identifier):
        """This function generates objects for the subject using only the property.
        It is the reverse of "transitive_subjects". I.e., it "walks forwards"
        in the ontology, using only the property/predicate.
        E.g., transitive_objects(entity, hasComponent) will generate all objects
        that are part of the entity (i.e., all the components of the entity).

        See "subjects" and "triples" methods for more info.

        This function is only a "safe" re-implementation of the original rdflib graph function.
        See rdflib.Graph documentation for more information.

        Parameters
        ----------
        subject : Identifier
            s
        property : Identifier
            p

        Returns
        -------
        generator
            Objects matching the query
        """
        return self._graph.transitive_objects(subject, property)

    @interact_with_db
    def set(self, triple: set):
        """Convenience function for "set" operations in the database.
        Values set by this function are first removed and then assigned,
        ensuring there is only one record for the specified subject + property.

        This function is only a "safe" re-implementation of the original rdflib graph function.
        See rdflib.Graph documentation for more information.

        Parameters
        ----------
        triple : set
            (s, p, o) triple
        """
        self._graph.set(triple)

    @interact_with_db
    def value(self,
              subject: Identifier = None,
              predicate: Identifier = RDF.value,
              object: Identifier = None,
              default=None,
              any=True):
        """Complementery function for the "set" method. It expects that there is only one value
        matching the subject + predicate combination. Error is risen otherwise!

        This function is only a "safe" re-implementation of the original rdflib graph function.
        See rdflib.Graph documentation for more information.

        Parameters
        ----------
        subject : Identifier, optional
            s, by default None
        predicate : Identifier, optional
            p, by default RDF.value
        object : Identifier, optional
            o, by default None
        default : any, optional
            Default value to be returned if it is not specified in the database, by default None
        any : bool, optional
            See the rdflib documentation, by default True

        Returns
        -------
        any
            The expected value
        """
        return self._graph.value(subject, predicate, object, default, any)

    @interact_with_db
    def compute_qname(self, uri):
        return self._graph.compute_qname(uri)

    @interact_with_db
    def __getitem__(self, item):
        """Convenience function. Allows queries/triples to be specified via the "object[index]"
        notation.

        This function is only a "safe" re-implementation of the original rdflib graph function.
        See rdflib.Graph documentation for more information.
        """
        return self._graph.__getitem__(item)

    @interact_with_db
    def __len__(self):
        """Allows the use of the len(container) function to return the number of entries in the database

        This function is only a "safe" re-implementation of the original rdflib graph function.
        See rdflib.Graph documentation for more information.
        """
        return len(self._graph)

    @interact_with_db
    def __contains__(self, item):
        """Allows the use of "item in container" notation to be used to test if database contains entries
        matching the query. The item shall be an (s, p, o) triple, obeying the standard contrains.

        This function is only a "safe" re-implementation of the original rdflib graph function.
        See rdflib.Graph documentation for more information.
        """
        return item in self._graph

    @property
    def graph(self):
        # REMOVE: temporary debugging property, shall not be present at release
        return self._graph
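
A hedged usage sketch for the class above (the config path, credentials, and IRIs are placeholders, not part of the snippet):

from rdflib import RDF, URIRef

db = OntologyDatabase(config="db_config.yaml", create=True)  # hypothetical config file
db.setup(username="user", password="secret")
robot = URIRef("http://example.org/onto#Robot")
db.add((robot, RDF.type, URIRef("http://www.w3.org/2002/07/owl#Class")))
for s in db.subjects(RDF.type, URIRef("http://www.w3.org/2002/07/owl#Class")):
    print(s)
db.closelink()
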
Example #28
class Store:
    def __init__(self, endpoint):
        self.endpoint = endpoint
        query_ep = self.endpoint + '/query'
        update_ep = self.endpoint + '/update'
        self.sparql = SPARQLUpdateStore(queryEndpoint=query_ep,
                                        update_endpoint=update_ep,
                                        bNodeAsURI=True)

    def store_annotations(self, annotations):
        for annotation in annotations:
            ann = Annotation()
            ann.parse_json(annotation)
            ann.add_to_graph(self.sparql)
        return

    def query_article(self, article):
        escaped = self.escape_sparql(article)
        ret = []
        query = """
        SELECT DISTINCT ?author ?author_fullname ?author_email
         ?date ?label ?type ?body_s ?body_p ?body_o ?body_l
         ?target_start ?target_startoffset ?target_endoffset
        WHERE {
            ?annotation rdf:type oa:Annotation ;
                oa:annotatedAt ?date ;
                oa:annotatedBy ?author .
            OPTIONAL { ?author foaf:name ?author_fullname }
            OPTIONAL { ?author schema:email ?author_email }
            OPTIONAL { ?annotation rdfs:label ?label }
            OPTIONAL { ?annotation ao:type ?type }
            OPTIONAL { ?annotation oa:hasBody ?body }
            OPTIONAL { ?body rdf:subject ?body_s }
            OPTIONAL { ?body rdf:predicate ?body_p }
            OPTIONAL { ?body rdf:object ?body_o }
            OPTIONAL { ?body rdfs:label ?body_l }
            { ?annotation oa:hasTarget ao:""" + escaped + """ }
             UNION
            { ?annotation oa:hasTarget ?bnode .
              ?bnode rdf:type oa:SpecificResource ;
                    oa:hasSource ao:""" + escaped + """ ;
                    oa:hasSelector ?selector .
              ?selector rdf:type oa:FragmentSelector ;
                    rdf:value ?target_start ;
                    oa:start ?target_startoffset ;
                    oa:end ?target_endoffset }
        }
        """
        for row in self.sparql.query(query, initNs=initNS):
            annotation = Annotation()
            annotation.parse_rdf({
                'target': article,
                'author': row[0].encode('utf-8'),
                'author_fullname': row[1].encode('utf-8') if row[1] is not None else None,
                'author_email': row[2].encode('utf-8') if row[2] is not None else None,
                'created': row[3].encode('utf-8') if row[3] is not None else None,
                'label': row[4].encode('utf-8') if row[4] is not None else None,
                'type': row[5].encode('utf-8') if row[5] is not None else None,
                'subject': row[6].encode('utf-8') if row[6] is not None else None,
                'predicate': row[7].encode('utf-8') if row[7] is not None else None,
                'object': row[8].encode('utf-8') if row[8] is not None else None,
                'obj_label': row[9].encode('utf-8') if row[9] is not None else None,
                'target_start': row[10].encode('utf-8') if row[10] is not None else None,
                'target_startoff': int(row[11]) if row[11] is not None else None,
                'target_endoff': int(row[12]) if row[12] is not None else None
            })
            ret.append(annotation.to_dict())
        return ret

    def query_authors(self):
        authors = []
        query = """
        SELECT DISTINCT ?author ?author_fullname ?author_email
        WHERE {
            ?author a foaf:Person ;
                foaf:name ?author_fullname .
            OPTIONAL { ?author schema:email ?author_email }
        }
        """
        for row in self.sparql.query(query, initNs=initNS):
            authors.append({
                'author_id': row[0].encode('utf-8'),
                'author_fullname': row[1].encode('utf-8'),
                'author_email': row[2].encode('utf-8') if row[2] is not None else None,
            })
        return authors

    # Inserts a new author.
    # Expects a dict:
    # {
    # 'author_id': ...,
    # 'author_fullname': ...,
    # 'author_email': ...
    # }
    def insert_author(self, author):
        try:
            parse(author['author_id'], rule='IRI')
            a = author['author_id']
        except ValueError:
            a = AOP[author['author_id']]
        self.sparql.add((a, RDF.type, FOAF.Person))
        self.sparql.add((a, FOAF.name, Literal(author['author_fullname'])))
        if 'author_email' in author:
            self.sparql.add((a, SCHEMA.email, Literal(author['author_email'])))
        return 'OK'

    def query_organization(self):
        ret = []
        query = """
        SELECT DISTINCT ?node ?label
        WHERE {
            ?node a foaf:Organization ;
                foaf:name ?label .
        }
        """
        for row in self.sparql.query(query, initNs=initNS):
            ret.append({
                'id': row[0].encode('utf-8'),
                'label': row[1].encode('utf-8')
            })
        return ret

    def query_place(self):
        ret = []
        query = """
        SELECT DISTINCT ?node ?label
        WHERE {
            ?node a dbpedia:Place ;
                rdfs:label ?label .
        }
        """
        for row in self.sparql.query(query, initNs=initNS):
            ret.append({
                'id': row[0].encode('utf-8'),
                'label': row[1].encode('utf-8')
            })
        return ret

    def query_concept(self):
        ret = []
        query = """
        SELECT DISTINCT ?node ?label
        WHERE {
            ?node a skos:Concept ;
                rdfs:label ?label .
        }
        """
        for row in self.sparql.query(query, initNs=initNS):
            ret.append({
                'id': row[0].encode('utf-8'),
                'label': row[1].encode('utf-8')
            })
        return ret

    def insert_organization(self, data):
        try:
            parse(data['id'], rule='IRI')
            a = URIRef(data['id'])
        except ValueError:
            a = AOP[data['id']]
        self.sparql.add((a, RDF.type, FOAF.Organization))
        self.sparql.add((a, FOAF.name, Literal(data['label'])))
        return 'OK'

    def insert_place(self, data):
        try:
            parse(data['id'], rule='IRI')
            a = URIRef(data['id'])
        except ValueError:
            a = DBPEDIA[data['id']]
        self.sparql.add((a, RDF.type, DBPEDIA.Place))
        self.sparql.add((a, RDFS.label, Literal(data['label'])))
        return 'OK'

    def insert_concept(self, data):
        try:
            parse(data['id'], rule='IRI')
            a = URIRef(data['id'])
        except ValueError:
            a = BNCF[data['id']]
        self.sparql.add((a, RDF.type, SKOS.Concept))
        self.sparql.add((a, RDFS.label, Literal(data['label'])))
        return 'OK'

    @staticmethod
    def init_graph():
        rdf = Graph()
        for ns in initNS:
            rdf.bind(ns, initNS[ns])
        return rdf

    @staticmethod
    def escape_sparql(string):
        return string.replace('(', r'\(').replace(')', r'\)')
Example #29
def update_fuseki(config, files):
    """
    The current procedure first dumps the enriched graph to a temporary file in a dir accessible by
    the web server, then loads the file using the SPARQL LOAD operation.

    I first tried pushing the enriched graph directly to the update endpoint
    without writing a temporary file, but that approach failed for two reasons:
     - Using INSERT DATA with "lots" of triples (>> 20k) caused Fuseki to give a 500 response.
     - Using INSERT DATA with chunks of 20k triples worked well... when there were no blank nodes.
       If the same bnode were referenced in two different chunks, it would end up as *two* bnodes.
       Since we're using bnodes in RDF lists, many lists ended up broken. From the SPARQL ref.:

            Variables in QuadDatas are disallowed in INSERT DATA requests (see Notes 8 in the grammar).
            That is, the INSERT DATA statement only allows to insert ground triples. Blank nodes in
            QuadDatas are assumed to be disjoint from the blank nodes in the Graph Store,
            i.e., will be inserted with "fresh" blank nodes.

    Using tdbloader would be another option, but then we would still need a temp file, we would also need
    to put that file on a volume accessible to the docker container, and we would need to shut down the
    server while loading the file. And it's a solution tied to Fuseki.

    I'm not aware if there is a limit on how large graphs Fuseki can load with the LOAD operation.
    I guess we'll find out.
    """

    if config['dumps_dir'] is None:
        raise Exception("The 'dumps_dir' option must be set")

    if config['dumps_dir_url'] is None:
        raise Exception("The 'dumps_dir_url' option must be set")

    tmpfile = '{}/import_{}.ttl'.format(config['dumps_dir'].rstrip('/'),
                                        config['basename'])
    tmpfile_url = '{}/import_{}.ttl'.format(
        config['dumps_dir_url'].rstrip('/'), config['basename'])

    tc = enrich_and_concat(files, tmpfile)

    c0 = get_graph_count(config)

    store = SPARQLUpdateStore('{}/sparql'.format(config['fuseki']),
                              '{}/update'.format(config['fuseki']))
    graph_uri = URIRef(config['graph'])
    graph = Graph(store, graph_uri)

    logger.info("Fuseki: Loading %d triples into <%s> from %s", tc, graph_uri,
                tmpfile_url)

    # CLEAR GRAPH first to make sure all blank nodes are erased
    # https://github.com/scriptotek/emnesok/issues/70
    store.update('CLEAR GRAPH <{}>'.format(graph_uri))

    store.update('LOAD <{}> INTO GRAPH <{}>'.format(tmpfile_url, graph_uri))

    c1 = get_graph_count(config)
    if c0 == c1:
        logger.info('Fuseki: Graph <%s> updated, number of concepts unchanged',
                    config['graph'])
    else:
        logger.info(
            'Fuseki: Graph <%s> updated, number of concepts changed from %d to %d.',
            config['graph'], c0, c1)

    invalidate_varnish_cache(config['basename'])
    logger.info('Invalidated Varnish cache for %s', config['basename'])
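Note that get_graph_count is referenced but not defined in this example. A plausible sketch, assuming it counts distinct skos:Concept resources in the configured graph (the log messages above speak of a "number of concepts"), might look like this:

from SPARQLWrapper import SPARQLWrapper, JSON


def get_graph_count(config):
    # Hypothetical helper: count distinct skos:Concept resources in the target graph.
    sparql = SPARQLWrapper('{}/sparql'.format(config['fuseki']))
    sparql.setQuery("""
        SELECT (COUNT(DISTINCT ?c) AS ?n)
        WHERE {{ GRAPH <{}> {{ ?c a <http://www.w3.org/2004/02/skos/core#Concept> }} }}
    """.format(config['graph']))
    sparql.setReturnFormat(JSON)
    result = sparql.query().convert()
    return int(result['results']['bindings'][0]['n']['value'])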
Ejemplo n.º 30
0
def __addN(self, quads):
    for batch in ibatch(quads, 100):
        SPARQLUpdateStore.addN(self, batch)
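The ibatch helper is not shown in this snippet; a minimal batching generator matching the call above (the implementation is an assumption) could look like this:

from itertools import islice


def ibatch(iterable, size):
    # Yield successive lists of at most `size` items from `iterable` (assumed semantics).
    it = iter(iterable)
    while True:
        batch = list(islice(it, size))
        if not batch:
            break
        yield batch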
Ejemplo n.º 31
0
class Store:
    def __init__(self, endpoint):
        self.endpoint = endpoint
        query_ep = self.endpoint + '/query'
        update_ep = self.endpoint + '/update'
        self.sparql = SPARQLUpdateStore(queryEndpoint=query_ep,
                                        update_endpoint=update_ep,
                                        bNodeAsURI=True)

    def store_annotations(self, annotations):
        for annotation in annotations:
            ann = Annotation()
            ann.parse_json(annotation)
            ann.add_to_graph(self.sparql)
        return

    def query_article(self, article):
        escaped = self.escape_sparql(article)
        ret = []
        query = """
        SELECT DISTINCT ?author ?author_fullname ?author_email
         ?date ?label ?type ?body_s ?body_p ?body_o ?body_l
         ?target_start ?target_startoffset ?target_endoffset
        WHERE {
            ?annotation rdf:type oa:Annotation ;
                oa:annotatedAt ?date ;
                oa:annotatedBy ?author .
            OPTIONAL { ?author foaf:name ?author_fullname }
            OPTIONAL { ?author schema:email ?author_email }
            OPTIONAL { ?annotation rdfs:label ?label }
            OPTIONAL { ?annotation ao:type ?type }
            OPTIONAL { ?annotation oa:hasBody ?body }
            OPTIONAL { ?body rdf:subject ?body_s }
            OPTIONAL { ?body rdf:predicate ?body_p }
            OPTIONAL { ?body rdf:object ?body_o }
            OPTIONAL { ?body rdfs:label ?body_l }
            { ?annotation oa:hasTarget ao:""" + escaped + """ }
             UNION
            { ?annotation oa:hasTarget ?bnode .
              ?bnode rdf:type oa:SpecificResource ;
                    oa:hasSource ao:""" + escaped + """ ;
                    oa:hasSelector ?selector .
              ?selector rdf:type oa:FragmentSelector ;
                    rdf:value ?target_start ;
                    oa:start ?target_startoffset ;
                    oa:end ?target_endoffset }
        }
        """
        for row in self.sparql.query(query, initNs=initNS):
            annotation = Annotation()
            annotation.parse_rdf({
                'target': article,
                'author': row[0].encode('utf-8'),
                'author_fullname': row[1].encode('utf-8') if row[1] is not None else None,
                'author_email': row[2].encode('utf-8') if row[2] is not None else None,
                'created': row[3].encode('utf-8') if row[3] is not None else None,
                'label': row[4].encode('utf-8') if row[4] is not None else None,
                'type': row[5].encode('utf-8') if row[5] is not None else None,
                'subject': row[6].encode('utf-8') if row[6] is not None else None,
                'predicate': row[7].encode('utf-8') if row[7] is not None else None,
                'object': row[8].encode('utf-8') if row[8] is not None else None,
                'obj_label': row[9].encode('utf-8') if row[9] is not None else None,
                'target_start': row[10].encode('utf-8') if row[10] is not None else None,
                'target_startoff': int(row[11]) if row[11] is not None else None,
                'target_endoff': int(row[12]) if row[12] is not None else None
            })
            ret.append(annotation.to_dict())
        return ret

    def query_authors(self):
        authors = []
        query = """
        SELECT DISTINCT ?author ?author_fullname ?author_email
        WHERE {
            ?author a foaf:Person ;
                foaf:name ?author_fullname .
            OPTIONAL { ?author schema:email ?author_email }
        }
        """
        for row in self.sparql.query(query, initNs=initNS):
            authors.append({
                'author_id': row[0].encode('utf-8'),
                'author_fullname': row[1].encode('utf-8'),
                'author_email': row[2].encode('utf-8') if row[2] is not None else None,
            })
        return authors

    # Inserts a new author.
    # Expects a dict:
    # {
    # 'author_id': ...,
    # 'author_fullname': ...,
    # 'author_email': ...
    # }
    def insert_author(self, author):
        try:
            parse(author['author_id'], rule='IRI')
            a = URIRef(author['author_id'])
        except ValueError:
            a = AOP[author['author_id']]
        self.sparql.add((a, RDF.type, FOAF.Person))
        self.sparql.add((a, FOAF.name, Literal(author['author_fullname'])))
        if 'author_email' in author:
            self.sparql.add((a, SCHEMA.email, Literal(author['author_email'])))
        return 'OK'

    def query_organization(self):
        ret = []
        query = """
        SELECT DISTINCT ?node ?label
        WHERE {
            ?node a foaf:Organization ;
                foaf:name ?label .
        }
        """
        for row in self.sparql.query(query, initNs=initNS):
            ret.append({
                'id': row[0].encode('utf-8'),
                'label': row[1].encode('utf-8')
            })
        return ret

    def query_place(self):
        ret = []
        query = """
        SELECT DISTINCT ?node ?label
        WHERE {
            ?node a dbpedia:Place ;
                rdfs:label ?label .
        }
        """
        for row in self.sparql.query(query, initNs=initNS):
            ret.append({
                'id': row[0].encode('utf-8'),
                'label': row[1].encode('utf-8')
            })
        return ret

    def query_concept(self):
        ret = []
        query = """
        SELECT DISTINCT ?node ?label
        WHERE {
            ?node a skos:Concept ;
                rdfs:label ?label .
        }
        """
        for row in self.sparql.query(query, initNs=initNS):
            ret.append({
                'id': row[0].encode('utf-8'),
                'label': row[1].encode('utf-8')
            })
        return ret

    def insert_organization(self, data):
        try:
            parse(data['id'], rule='IRI')
            a = URIRef(data['id'])
        except ValueError:
            a = AOP[data['id']]
        self.sparql.add((a, RDF.type, FOAF.Organization))
        self.sparql.add((a, FOAF.name, Literal(data['label'])))
        return 'OK'

    def insert_place(self, data):
        try:
            parse(data['id'], rule='IRI')
            a = URIRef(data['id'])
        except ValueError:
            a = DBPEDIA[data['id']]
        self.sparql.add((a, RDF.type, DBPEDIA.Place))
        self.sparql.add((a, RDFS.label, Literal(data['label'])))
        return 'OK'

    def insert_concept(self, data):
        try:
            parse(data['id'], rule='IRI')
            a = URIRef(data['id'])
        except ValueError:
            a = BNCF[data['id']]
        self.sparql.add((a, RDF.type, SKOS.Concept))
        self.sparql.add((a, RDFS.label, Literal(data['label'])))
        return 'OK'

    @staticmethod
    def init_graph():
        rdf = Graph()
        for ns in initNS:
            rdf.bind(ns, initNS[ns])
        return rdf

    @staticmethod
    def escape_sparql(string):
        return string.replace('(', r'\(').replace(')', r'\)')
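A minimal usage sketch of the class above; the endpoint URL and author data are illustrative assumptions, not taken from the snippet:

store = Store('http://localhost:3030/annotations')  # hypothetical Fuseki dataset
store.insert_author({
    'author_id': 'http://example.org/people/jdoe',   # hypothetical IRI
    'author_fullname': 'Jane Doe',
    'author_email': 'jdoe@example.org',
})
print(store.query_authors())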
Ejemplo n.º 32
0
    data_class = "".join(e for e in (numberOfLines + "_" + subject)
                         if e.isalnum())

    timeAndDate = getTimeAndDate(text)
    day = timeAndDate[0]
    date = timeAndDate[1]
    date_class = date.replace(" ", "_")
    time = timeAndDate[2]
    timezone = timeAndDate[3]
    time_class = time.replace(":", "_")
    date_and_time_class = date_class + "_" + time_class

    # FUSEKI
    from rdflib.plugins.stores.sparqlstore import SPARQLUpdateStore
    from rdflib.graph import ConjunctiveGraph

    updateStore = SPARQLUpdateStore("http://localhost:3030/RM/update")
    # updateStore = SPARQLUpdateStore("http://localhost:3030/store/update")
    updateGraph = ConjunctiveGraph(store=updateStore)

    insertPerson(updateGraph, person_class, first_name, last_name, email)
    insertOrganization(updateGraph, organization_class, organization_name,
                       distribution)
    insertData(updateGraph, data_class, subject, summary, numberOfLines)
    insertNewsgroup(updateGraph, newsgroup_class, newsgroup_name)
    insertDate(updateGraph, date_class, date, day)
    insertTime(updateGraph, time_class, time, timezone)
    createNews(updateGraph, news_class, newsgroup_class, person_class,
               organization_class, author_class, data_class, date_class,
               time_class, date_and_time_class)
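The insertPerson/insertOrganization/... helpers are not part of this snippet. A hypothetical insertPerson, assuming a FOAF-style vocabulary and an example namespace, might look like this; adding triples to a graph backed by SPARQLUpdateStore issues the corresponding updates against the endpoint:

from rdflib import Literal, Namespace, URIRef
from rdflib.namespace import FOAF, RDF

EX = Namespace('http://example.org/news/')  # hypothetical base namespace


def insertPerson(graph, person_class, first_name, last_name, email):
    # Sketch only: the vocabulary and property choices are assumptions.
    person = EX[person_class]
    graph.add((person, RDF.type, FOAF.Person))
    graph.add((person, FOAF.firstName, Literal(first_name)))
    graph.add((person, FOAF.surname, Literal(last_name)))
    graph.add((person, FOAF.mbox, URIRef('mailto:' + email)))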
Ejemplo n.º 33
0
import os
from urllib.error import HTTPError

from SPARQLWrapper import DIGEST, POST
from SPARQLWrapper.SPARQLExceptions import EndPointNotFound
from rdflib.graph import Dataset, ReadOnlyGraphAggregate
from rdflib.namespace import RDF, RDFS, OWL, Namespace, URIRef
from rdflib.plugins.stores.sparqlstore import SPARQLUpdateStore
from rdflib.plugins.stores.sparqlstore import SPARQLWrapper

from experiments.ontology.config import config
from experiments.utils import except_safe

ont_config = config['ontology']
endpoint = update_endpoint = ont_config['endpoint']
store = SPARQLUpdateStore(endpoint, update_endpoint, autocommit=True)
# NB: store.commit() may still need to be called explicitly; the autocommit logic in sparqlstore's source is somewhat unreliable (TODO).
store.setHTTPAuth(DIGEST)
store.setCredentials(user=ont_config['endpoint_user'], passwd=ont_config['endpoint_passwd'])
ds = Dataset(store, default_union=False)

iri_dbo = 'http://dbpedia.org/ontology'
iri_dbpedia = 'http://dbpedia.org'
iri_labels = 'http://dbpedia.org/labels'
iri_redirects = 'http://dbpedia.org/redirects'
iri_disamb = 'http://dbpedia.org/disambiguations'
iri_field = 'field'
iri_more = 'field:more'
dbo = Namespace('http://dbpedia.org/ontology/')
dbr = Namespace('http://dbpedia.org/resource/')

# NB: for existing graphs use 'ds.get_context(iri)', for new graphs use 'ds.graph(iri)'
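# A short illustration of that distinction, reusing the IRI constants above
# (purely a sketch; the graph names are only examples of the pattern):
labels_graph = ds.get_context(URIRef(iri_labels))  # bind to a graph already present in the store
more_graph = ds.graph(URIRef(iri_more))            # register a (possibly new) named graph in the dataset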