def store(self, cur_g, base_dir, base_iri, context_path, tmp_dir=None,
          override=False, already_processed=None, store_now=True,
          remove_data=False):
    """Merge, replace or remove the statements of ``cur_g`` in its target file.

    The target directory and file are computed by ``self.dir_and_file_paths``.
    The triples of ``cur_g`` are wrapped as quads using ``cur_g.identifier``
    as context and then, depending on the flags:

    * ``remove_data``: the quads are removed from the graph already cached
      in ``already_processed`` or loaded from the existing file;
    * ``not override``: the quads are added to the cached graph, or the
      existing file is loaded through ``self.load`` (which also receives
      ``cur_g`` -- presumably merging it; confirm against ``load``);
    * otherwise the new quads replace whatever was stored.

    :param cur_g: graph whose statements must be stored.
    :param base_dir: base directory, forwarded to ``dir_and_file_paths``.
    :param base_iri: base IRI, forwarded to ``dir_and_file_paths``.
    :param context_path: forwarded to the file writer.
    :param tmp_dir: forwarded to ``self.load``.
    :param override: when true (and ``remove_data`` is false) stored data
        is not merged.
    :param already_processed: dict mapping file paths to the graphs already
        handled; a new dict is created when omitted.
    :param store_now: write the resulting graph to file immediately.
    :param remove_data: remove the statements instead of adding them.
    :return: the ``already_processed`` dict, or ``None`` when ``cur_g`` is
        empty or when an error occurred (the error is reported through
        ``self.reperr``).
    """
    # The former ``{}`` default was created once at definition time and then
    # mutated below, so every call omitting the argument shared one cache.
    if already_processed is None:
        already_processed = {}

    self.repok.new_article()
    self.reperr.new_article()

    if len(cur_g) == 0:
        return None

    cur_dir_path, cur_file_path = self.dir_and_file_paths(
        cur_g, base_dir, base_iri)

    try:
        if not os.path.exists(cur_dir_path):
            os.makedirs(cur_dir_path)

        final_g = ConjunctiveGraph()
        final_g.addN([item + (cur_g.identifier, ) for item in cur_g])

        if remove_data:
            # Remove the data
            stored_g = None
            if cur_file_path in already_processed:
                stored_g = already_processed[cur_file_path]
            elif os.path.exists(cur_file_path):
                stored_g = self.load(cur_file_path, cur_g, tmp_dir)

            if stored_g is None:
                # Previously this surfaced as an opaque AttributeError on
                # ``None``; it is still reported by the handler below.
                raise ValueError(
                    "there are no stored statements to remove data from")

            for s, p, o, g in final_g.quads((None, None, None, None)):
                stored_g.remove((s, p, o, g))
            final_g = stored_g
        elif not override:
            # Merging the data
            if cur_file_path in already_processed:
                stored_g = already_processed[cur_file_path]
                stored_g.addN(final_g.quads((None, None, None, None)))
                final_g = stored_g
            elif os.path.exists(cur_file_path):
                # This is a conjunctive graph that contains all the triples
                # (and graphs) the file is actually defining - they could
                # be more than those in 'cur_g'.
                final_g = self.load(cur_file_path, cur_g, tmp_dir)

        already_processed[cur_file_path] = final_g

        if store_now:
            self.__store_in_file(final_g, cur_file_path, context_path)

        return already_processed
    except Exception as e:
        self.reperr.add_sentence(
            "[5] It was impossible to store the RDF statements in %s. %s" %
            (cur_file_path, str(e)))

    return None
def test_quad_contexts():
    """Quads yielded by a ConjunctiveGraph expose their context as a Graph.

    One statement is added through a named context and one through
    ``addN``; both must appear as plain triples and every quad's fourth
    element must be a ``Graph`` instance.
    """
    first, second = URIRef('urn:a'), URIRef('urn:b')
    graph = ConjunctiveGraph()

    graph.get_context(first).add((first, first, first))
    graph.addN([(second, second, second, second)])

    expected_triples = {(first,) * 3, (second,) * 3}
    assert set(graph) == expected_triples

    for _s, _p, _o, context in graph.quads():
        assert isinstance(context, Graph)
def test_quad_contexts():
    """Check that ``quads()`` reports contexts as ``Graph`` objects.

    The graph is filled once via ``get_context(...).add`` and once via
    ``addN``; iterating the graph must give both triples, and the context
    slot of every quad must be a ``Graph``.
    """
    store = ConjunctiveGraph()
    node_a = URIRef("urn:a")
    node_b = URIRef("urn:b")

    named = store.get_context(node_a)
    named.add((node_a, node_a, node_a))
    store.addN([(node_b, node_b, node_b, node_b)])

    triples = set(store)
    assert triples == {(node_a, node_a, node_a), (node_b, node_b, node_b)}

    for quad in store.quads():
        context = quad[3]
        assert isinstance(context, Graph)
def graphWithoutMetadata(g: ConjunctiveGraph, ignorePredicates=()):
    """
    graph filter that removes any statements whose subjects are
    contexts in the graph and also any statements with the given
    predicates

    g: the graph to filter; it is only read.
    ignorePredicates: container of predicates whose statements are
      dropped (anything supporting ``in``). Defaults to an immutable empty
      tuple rather than a shared mutable list.

    Returns a new ConjunctiveGraph holding the surviving quads.
    """
    # A set gives constant-time membership tests; the previous list was
    # scanned once per statement.
    ctxs = {ctx.identifier for ctx in g.contexts()}

    out = ConjunctiveGraph()
    out.addN(stmt for stmt in g.quads((None, None, None))
             if stmt[0] not in ctxs and stmt[1] not in ignorePredicates)
    return out
def addTrig(graph, url, timeout=2):
    """Fetch a TriG document from ``url`` and add its quads to ``graph``.

    This is a generator that yields deferreds and finishes with
    ``returnValue`` (presumably wrapped by Twisted's ``inlineCallbacks``;
    the decorator is outside this excerpt -- confirm).

    graph: graph receiving the fetched quads through ``addN``.
    url: address to GET; host names found in ``ipv6Addresses`` are
      replaced by their bracketed address first.
    timeout: passed to ``treq.get``.

    Result (via ``returnValue``): seconds spent fetching and parsing.
    Raises ValueError when the response status is not 200.
    """
    t1 = time.time()
    # workaround for some reason my ipv6 names don't resolve
    # ``items()`` exists on both Python 2 and 3; ``iteritems()`` is
    # Python 2 only and raised AttributeError on Python 3.
    for name, addr in ipv6Addresses.items():
        url = url.replace('/' + name + ':', '/[' + addr + ']:')
    log.debug(' fetching %r', url)
    response = yield treq.get(url,
                              headers={'accept': ['application/trig']},
                              timeout=timeout)
    if response.code != 200:
        raise ValueError("status %s from %s" % (response.code, url))
    g = ConjunctiveGraph()
    g.parse(StringInputSource((yield response.content())), format='trig')
    fetchTime = time.time() - t1
    log.debug(' %r done in %.04f sec', url, fetchTime)
    graph.addN(g.quads())
    returnValue(fetchTime)
def addTrig(graph, url, timeout=2):
    """Download the TriG document at ``url`` into ``graph``.

    Generator yielding deferreds and ending with ``returnValue``
    (presumably driven by Twisted's ``inlineCallbacks``; the decorator is
    not visible here -- confirm).

    graph: target graph; receives the parsed quads via ``addN``.
    url: address to request, after substituting any host name listed in
      ``ipv6Addresses`` with its bracketed address.
    timeout: forwarded to ``treq.get``.

    Result (via ``returnValue``): elapsed seconds for fetch plus parse.
    Raises ValueError on any response code other than 200.
    """
    started = time.time()

    # workaround: for some reason my ipv6 names don't resolve
    for hostName, literalAddr in ipv6Addresses.items():
        url = url.replace('/' + hostName + ':', '/[' + literalAddr + ']:')

    log.debug(' fetching %r', url)
    pendingResponse = treq.get(url,
                               headers={'accept': ['application/trig']},
                               timeout=timeout)
    response = yield pendingResponse
    if response.code != 200:
        raise ValueError("status %s from %s" % (response.code, url))

    fetched = ConjunctiveGraph()
    body = yield response.content()
    fetched.parse(StringInputSource(body), format='trig')

    elapsed = time.time() - started
    log.debug(' %r done in %.04f sec', url, elapsed)

    graph.addN(fetched.quads())
    returnValue(elapsed)
def __init__(self, location, repository, inmemory=False):
    """Open the RDF store at ``location`` and expose it as ``self.graph``.

    :param location: forwarded to the base class; ``self.location`` is then
        used as the store path (presumably set by the base class -- confirm).
    :param repository: forwarded to the base class; ``self.repository`` is
        used to build the graph identifier ``file://<repository>``.
    :param inmemory: when true, all quads are copied into an in-memory
        graph and the on-disk store is closed again.
    """
    super(RDFLibStore, self).__init__(location, repository)
    self.inmemory = inmemory
    self.closed = False

    graph_uri = URIRef("file://" + self.repository)
    disk_graph = ConjunctiveGraph(store=self._storeid(), identifier=graph_uri)
    # Create the store only when nothing exists at the location yet.
    must_create = not os.path.exists(self.location)
    disk_graph.open(self.location, create=must_create)

    log = logging.getLogger(__name__)
    if not inmemory:
        log.debug("Using on-disk store")
        self.graph = disk_graph
        return

    log.debug("Loading store into memory")
    memory_graph = ConjunctiveGraph(identifier=graph_uri)
    memory_graph.addN(disk_graph.quads())
    disk_graph.close()
    self.graph = memory_graph
def store(self, cur_g, base_dir, base_iri, context_path, tmp_dir=None,
          override=False, already_processed=None, store_now=True):
    """Merge or replace the statements of ``cur_g`` in its target file.

    The target directory and file are computed by ``find_paths`` from one
    (arbitrarily chosen) subject of ``cur_g``. The triples of ``cur_g``
    are wrapped as quads using ``cur_g.identifier`` as context. Unless
    ``override`` is set, they are added to the graph cached in
    ``already_processed`` or, if the file exists, the file is loaded
    through ``self.load`` (which also receives ``cur_g`` -- presumably
    merging it; confirm against ``load``).

    :param cur_g: graph whose statements must be stored.
    :param base_dir: base directory, forwarded to ``find_paths``.
    :param base_iri: base IRI, forwarded to ``find_paths``.
    :param context_path: forwarded to the file writer.
    :param tmp_dir: forwarded to ``self.load``.
    :param override: when true the stored data is not merged.
    :param already_processed: dict mapping file paths to the graphs already
        handled; a new dict is created when omitted.
    :param store_now: write the resulting graph to file immediately.
    :return: the ``already_processed`` dict, or ``None`` when ``cur_g`` is
        empty or when an error occurred (the error is reported through
        ``self.reperr``).
    """
    # The former ``{}`` default was created once at definition time and then
    # mutated below, so every call omitting the argument shared one cache.
    if already_processed is None:
        already_processed = {}

    self.repok.new_article()
    self.reperr.new_article()

    if len(cur_g) == 0:
        return None

    cur_subject = set(cur_g.subjects(None, None)).pop()
    cur_dir_path, cur_file_path = find_paths(
        str(cur_subject), base_dir, base_iri, self.dir_split,
        self.n_file_item)

    try:
        if not os.path.exists(cur_dir_path):
            os.makedirs(cur_dir_path)

        final_g = ConjunctiveGraph()
        final_g.addN([item + (cur_g.identifier,) for item in cur_g])

        # Merging the data
        if not override:
            if cur_file_path in already_processed:
                stored_g = already_processed[cur_file_path]
                stored_g.addN(final_g.quads((None, None, None, None)))
                final_g = stored_g
            elif os.path.exists(cur_file_path):
                # This is a conjunctive graph that contains all the triples
                # (and graphs) the file is actually defining - they could
                # be more than those using 'cur_subject' as subject.
                final_g = self.load(cur_file_path, cur_g, tmp_dir)

        already_processed[cur_file_path] = final_g

        if store_now:
            self.__store_in_file(final_g, cur_file_path, context_path)

        return already_processed
    except Exception as e:
        self.reperr.add_sentence(
            "[5] It was impossible to store the RDF statements in %s. %s" %
            (cur_file_path, str(e)))

    return None
def checkrequest(request):
    """Parse the N-Quads body of a POST request.

    Args:
        request: A Flask HTTP request; its ``data`` attribute is parsed.

    Returns:
        The value produced by ``splitinformation`` for the parsed quads and
        the ``rdflib.ConjunctiveGraph`` that holds them.

    Raises:
        Exception: Whatever the parser raises when the data is not valid
            N-Quads; it propagates unchanged.
    """
    payload = request.data
    parsed = ConjunctiveGraph()
    parsed.parse(data=payload, format='nquads')

    every_quad = parsed.quads((None, None, None, None))
    return splitinformation(every_quad, parsed)
def testDeleteRunsBeforeAdd(self):
    """A patch deleting and adding stmt1 leaves (A, B, C) in context CTX1."""
    graph = ConjunctiveGraph()
    # Second argument: quads to delete; third argument: quads to add.
    patchQuads(graph, [stmt1], [stmt1])

    wildcard = (None, None, None)
    found = list(graph.quads(wildcard))
    self.assertEqual(found, [(A, B, C, Graph(identifier=CTX1))])
def testDeletes(self):
    """Adding stmt1 and then deleting it leaves no quads in the graph."""
    graph = ConjunctiveGraph()
    patchQuads(graph, [], [stmt1])  # add only
    patchQuads(graph, [stmt1], [])  # delete only

    wildcard = (None, None, None)
    remaining = list(graph.quads(wildcard))
    self.assertEqual(remaining, [])
def testAddsToNewContext(self):
    """Adding stmt1 to an empty graph yields one quad, in context CTX1."""
    graph = ConjunctiveGraph()
    patchQuads(graph, [], [stmt1])
    self.assertEqual(len(graph), 1)

    wildcard = (None, None, None)
    found = list(graph.quads(wildcard))
    self.assertEqual(found, [(A, B, C, Graph(identifier=CTX1))])
# Demo script (Python 2 syntax: print statements): opens a persistent
# 'Sleepycat' rdflib store in "test.db", lists what it already holds and adds
# a statement to a named graph.
# NOTE(review): pkg_resources, sys and random are used below but not imported
# in this excerpt -- presumably imported earlier in the file; confirm.
from rdflib import URIRef,Literal, Namespace, Graph, ConjunctiveGraph, RDF
import rdflib
pkg_resources.require("rdfextras>=0.1")
from rdfextras import *

# Instantiate the store plugin registered under the name 'Sleepycat' and open
# (creating if needed) the database.
file_store=rdflib.plugin.get('Sleepycat',rdflib.store.Store)()
code = file_store.open("test.db", create=True)
if code != rdflib.store.VALID_STORE:
    print "something went wrong"
    sys.exit(-1)

# A ConjunctiveGraph over the store gives access to the quads of all graphs.
all_graphs = ConjunctiveGraph(store = file_store)
print "things already in the store"
for s,p,o,m in all_graphs.quads((None, None, None)):
    print s,p,o,m

file_store.bind('dc', Namespace("http://purl.org/dc/elements/1.1/"))
dc = Namespace("http://purl.org/dc/elements/1.1/")

# First named graph backed by the same store; the literal carries a random
# number, so each run adds a (most likely) different creator value.
a_graph = Graph(store=file_store, identifier=URIRef("http://example.com/graph1"))
a_graph.add( (URIRef("http://example.com/i1"), dc["creator"], Literal("Author "+str(random.randint(1,1000))) ) )
print "things in a_graph"
for s,p,o in a_graph:
    print s,p,o

# Second named graph on the same store.
b_graph = Graph(store=file_store, identifier=URIRef("http://example.com/graph2"))
def testDeletes(self):
    """A quad that is added and then deleted must no longer be listed."""
    emptyDeletes, emptyAdds = [], []
    g = ConjunctiveGraph()

    # patchQuads takes the quads to delete before the quads to add.
    patchQuads(g, emptyDeletes, [stmt1])
    patchQuads(g, [stmt1], emptyAdds)

    allTriples = (None, None, None)
    self.assertEqual(list(g.quads(allTriples)), [])
def testAddsToNewContext(self):
    """Adding stmt1 to an empty graph yields one quad, in context CTX1."""
    g = ConjunctiveGraph()
    patchQuads(g, [], [stmt1])
    # ``assert_(len(g), 1)`` only checked that len(g) is truthy: its second
    # argument is the failure message. assertEqual compares the length.
    self.assertEqual(len(g), 1)
    quads = list(g.quads((None, None, None)))
    self.assertEqual(quads, [(A, B, C, Graph(identifier=CTX1))])
class SyncedGraph(CurrentStateGraphApi, AutoDepGraphApi, GraphEditApi):
    """
    graph for clients to use. Changes are synced with the master graph
    in the rdfdb process.

    self.patch(p: Patch) is the only way to write to the graph.

    Reading can be done with the AutoDepGraphApi methods which set up
    watchers to call you back when the results of the read have
    changed (like knockoutjs). Or you can read with
    CurrentStateGraphApi which doesn't have watchers, but you have to
    opt into using it so it's clear you aren't in an auto-dep context
    and meant to set up watchers.

    You may want to attach to self.initiallySynced deferred so you
    don't attempt patches before we've heard the initial contents of
    the graph. It would be ok to accumulate some patches of new
    material, but usually you won't correctly remove the existing
    statements unless we have the correct graph.

    If we get out of sync, we abandon our local graph (even any
    pending local changes) and get the data again from the server.
    """

    def __init__(self, rdfdbRoot: URIRef, label: str, receiverHost: Optional[str] = None):
        """
        rdfdbRoot is the base URL of the rdfdb server; the websocket
        URL and the 'prefixes' URL are derived from it

        label is a string that the server will display in association
        with your connection

        receiverHost is the hostname other nodes can use to talk to me

        NOTE(review): label and receiverHost are not used in this body.
        """
        self.isConnected = False
        self.currentClient: Optional[WsClientProtocol] = None
        self.rdfdbRoot = rdfdbRoot
        # NOTE(review): the socket connection is started before
        # initiallySynced and _graph are assigned below.
        self.connectSocket()
        self.initiallySynced: defer.Deferred[None] = defer.Deferred()
        self._graph = ConjunctiveGraph()

        AutoDepGraphApi.__init__(self)
        # this needs more state to track if we're doing a resync (and
        # everything has to error or wait) or if we're live

    def lostRdfdbConnection(self) -> None:
        """
        mark us disconnected, try to clear the local graph, and start
        a new socket connection
        """
        self.isConnected = False
        # NOTE(review): patch() returns early when isConnected is False,
        # so this delete-everything patch looks like it is dropped and
        # the graph is not actually cleared -- confirm.
        self.patch(Patch(delQuads=self._graph.quads()))
        log.info(f'cleared graph to {len(self._graph)}')
        log.error('graph is not updating- you need to restart')
        self.connectSocket()

    def connectSocket(self) -> None:
        """
        open a websocket client connection to '<rdfdbRoot>syncedGraph'
        (with the http:// scheme replaced by ws://)
        """
        factory = autobahn.twisted.websocket.WebSocketClientFactory(
            self.rdfdbRoot.replace('http://', 'ws://') + 'syncedGraph',
            # Don't know if this is required by spec, but
            # cyclone.websocket breaks with no origin header.
            origin='foo')
        factory.protocol = lambda: WsClientProtocol(self)

        rr = urllib.parse.urlparse(self.rdfdbRoot)
        # conn is not used after this line
        conn = reactor.connectTCP(rr.hostname.encode('ascii'), rr.port, factory)
        #WsClientProtocol sets our currentClient. Needs rewrite using agents.

    def resync(self):
        """
        get the whole graph again from the server (e.g. we had a
        conflict while applying a patch and want to return to the
        truth).

        To avoid too much churn, we remember our old graph and diff it
        against the replacement. This way, our callers only see the
        corrections.

        Edits you make during a resync will surely be lost, so I
        should just fail them. There should be a notification back to
        UIs who want to show that we're doing a resync.

        NOTE(review): the body only drops the current connection; the
        diffing described above is not implemented here.
        """
        log.info('resync')
        self.currentClient.dropConnection()

    def _resyncGraph(self, response):
        """
        log and drop the current connection; `response` is unused
        """
        log.warn("new graph in")

        self.currentClient.dropConnection()
        #diff against old entire graph
        #broadcast that change

    def patch(self, p: Patch) -> None:
        """send this patch to the server and apply it to our local
        graph and run handlers

        The patch is dropped (with a warning) when we are not
        connected, and skipped when p.isNoop(). If applying it locally
        raises ValueError, we log the error and resync instead of
        sending it.
        """

        if not self.isConnected or self.currentClient is None:
            log.warn("not currently connected- dropping patch")
            return

        if p.isNoop():
            log.info("skipping no-op patch")
            return

        # these could fail if we're out of sync. One approach:
        # Rerequest the full state from the server, try the patch
        # again after that, then give up.
        debugKey = '[id=%s]' % (id(p) % 1000)
        log.debug("\napply local patch %s %s", debugKey, p)
        try:
            self._applyPatchLocally(p)
        except ValueError as e:
            log.error(e)
            self.resync()
            return
        log.debug('runDepsOnNewPatch')
        self.runDepsOnNewPatch(p)
        log.debug('sendPatch')
        self.currentClient.sendPatch(p)
        log.debug('patch is done %s', debugKey)

    def suggestPrefixes(self, ctx, prefixes):
        """
        when writing files for this ctx, try to use these n3
        prefixes. async, not guaranteed to finish before any
        particular file flush
        """
        # fire-and-forget POST: the result of treq.post is not used
        treq.post(self.rdfdbRoot + 'prefixes',
                  json.dumps({
                      'ctx': ctx,
                      'prefixes': prefixes
                  }).encode('utf8'))

    def _applyPatchLocally(self, p: Patch):
        """
        apply the patch's deletes and adds to the local graph with
        perfect=True
        """
        # .. and disconnect on failure
        patchQuads(self._graph, p.delQuads, p.addQuads, perfect=True)
        log.debug("graph now has %s statements" % len(self._graph))

    def onPatchFromDb(self, p):
        """
        central server has sent us a patch
        """
        if log.isEnabledFor(logging.DEBUG):
            # large patches are logged in summary form only
            if len(p.addQuads) > 50:
                log.debug('server has sent us %s', p.shortSummary())
            else:
                log.debug('server has sent us %s', p)

        self._applyPatchLocally(p)
        try:
            self.runDepsOnNewPatch(p)
        except Exception:
            # don't reflect this error back to the server; we did
            # receive its patch correctly. However, we're in a bad
            # state since some dependencies may not have rerun
            traceback.print_exc()
            log.warn("some graph dependencies may not have completely run")

        # fire the initiallySynced deferred once, then clear it so it is
        # not fired again by later patches
        if self.initiallySynced:
            self.initiallySynced.callback(None)
            self.initiallySynced = None
class RDFCrawler:
    """Crawl linked RDF documents into a 'Sleepycat'-backed ConjunctiveGraph."""

    logger = logging.getLogger(__name__)

    def __init__(self, uri, domains=None):
        """
        :param uri: root URI to start crawling .
        :param domains: iterable of permitted domains to crawl (optional).
            It is copied, so the caller's collection is never modified.
        """
        self.root = uri
        # NOTE(review): hash() of a str is randomised per process on
        # Python 3, so this path may differ between runs -- confirm intent.
        self.graph_route = 'graph_store_%s' % hash(self.root)
        self.graph = ConjunctiveGraph('Sleepycat')
        self.graph.open(self.graph_route, create=True)
        # The former ``domains=set()`` default was a single set shared by
        # all instances and grew with every root URI added below.
        self._filter_domains = set(domains) if domains is not None else set()
        self._filter_domains.add(uri)
        self.last_process_time = 0.0
        self.lock = RLock()

    def filter_uris(self, uri_list):
        """
        :param uri_list: list of URIs to be filtered.
        :return: the URIs containing at least one permitted domain, each
            input URI appearing at most once (it used to be repeated once
            per matching domain).
        """
        return [
            uri for uri in uri_list
            if any(match in str(uri) for match in self._filter_domains)
        ]

    def _has_context(self, graph, subject):
        """
        :param subject: the URIRef or URI to check if it has current context.
        :return: True if the context of the subject holds more than one
            statement.
        """
        return len(graph.get_context(self._get_context_id(subject))) > 1

    @staticmethod
    def _get_context_id(subject):
        """
        :param subject: URIRef or URI from which the get context id.
        :return: context id of the resource.
        Example:
            subject -> http://www.example.org/#fragment
            context_id -> http://www.example.org/
        """
        return str(subject).split('#')[0]

    def start(self):
        """
            start method for crawling.
        """
        # ``with`` releases the lock even when crawling raises.
        with self.lock:
            # Erase old graph. The quads are listed first so that nothing
            # is removed from the store while it is being iterated.
            for q in list(self.graph.quads()):
                self.graph.remove(q)

            # Crawl for data
            logging.info('Start crawling: %s' % self.root)
            start_time = time.time()
            self._crawl([self.root])
            end_time = time.time()

            self.last_process_time = end_time - start_time
            logging.info(
                'Crawling complete after: %s seconds with %s predicates.' %
                (self.last_process_time, len(self.graph)))

    def _crawl(self, uri_list):
        """
        Crawl RDF objects round by round, starting from ``uri_list``.

        Each URI is attempted at most once per call: a URI that fails to
        parse (or whose context stays too small for ``_has_context``) used
        to be retried on every round until the recursion limit was hit.
        :param uri_list: list of URIs to crawl
        """
        pending = list(uri_list)
        attempted = set()
        while pending:
            for uri in pending:
                attempted.add(str(uri))
                try:
                    # A few considerations about parsing params.
                    # publicID = uri due to redirection issues
                    # Format = None due to default params use 'XML'
                    self.graph.parse(uri, publicID=uri, format=None)
                    logging.info('[OK]: %s' % uri)
                except Exception as e:
                    logging.info('[Error]: %s: %s' % (uri, e))

            # Check that there are context that remains without parsing
            objects = set([
                self._get_context_id(o)
                for o in set(self.graph.objects(None, None))
                if isinstance(o, URIRef) and
                not self._has_context(self.graph, o)
            ])
            pending = [
                uri for uri in self.filter_uris(objects)
                if uri not in attempted
            ]
def testPerfectAddAllowsExistingStmtInNewContext(self):
    """Adding stmt2 with perfect=True after stmt1 leaves two quads."""
    graph = ConjunctiveGraph()
    patchQuads(graph, [], [stmt1])
    patchQuads(graph, [], [stmt2], perfect=True)

    wildcard = (None, None, None)
    found = list(graph.quads(wildcard))
    self.assertEqual(len(found), 2)
def testPerfectAddAllowsExistingStmtInNewContext(self):
    """A perfect-mode add of stmt2 is accepted once stmt1 is present."""
    nothingToDelete = []
    g = ConjunctiveGraph()

    patchQuads(g, nothingToDelete, [stmt1])
    patchQuads(g, nothingToDelete, [stmt2], perfect=True)

    allTriples = (None, None, None)
    quadCount = len(list(g.quads(allTriples)))
    self.assertEqual(quadCount, 2)
class RDFCrawler:
    """Crawl linked RDF documents into a 'Sleepycat'-backed ConjunctiveGraph."""

    logger = logging.getLogger(__name__)

    def __init__(self, uri, domains=None):
        """
        :param uri: root URI to start crawling .
        :param domains: iterable of permitted domains to crawl (optional).
            It is copied, so the caller's collection is never modified.
        """
        self.root = uri
        # NOTE(review): hash() of a str is randomised per process on
        # Python 3, so this path may differ between runs -- confirm intent.
        self.graph_route = 'graph_store_%s' % hash(self.root)
        self.graph = ConjunctiveGraph('Sleepycat')
        self.graph.open(self.graph_route, create=True)
        # The former ``domains=set()`` default was a single set shared by
        # all instances and grew with every root URI added below.
        self._filter_domains = set(domains) if domains is not None else set()
        self._filter_domains.add(uri)
        self.last_process_time = 0.0
        self.lock = RLock()

    def filter_uris(self, uri_list):
        """
        :param uri_list: list of URIs to be filtered.
        :return: the URIs containing at least one permitted domain, each
            input URI appearing at most once (it used to be repeated once
            per matching domain).
        """
        return [
            uri for uri in uri_list
            if any(match in str(uri) for match in self._filter_domains)
        ]

    def _has_context(self, graph, subject):
        """
        :param subject: the URIRef or URI to check if it has current context.
        :return: True if the context of the subject holds more than one
            statement.
        """
        return len(graph.get_context(self._get_context_id(subject))) > 1

    @staticmethod
    def _get_context_id(subject):
        """
        :param subject: URIRef or URI from which the get context id.
        :return: context id of the resource.
        Example:
            subject -> http://www.example.org/#fragment
            context_id -> http://www.example.org/
        """
        return str(subject).split('#')[0]

    def start(self):
        """
            start method for crawling.
        """
        # ``with`` releases the lock even when crawling raises.
        with self.lock:
            # Erase old graph. The quads are listed first so that nothing
            # is removed from the store while it is being iterated.
            for q in list(self.graph.quads()):
                self.graph.remove(q)

            # Crawl for data
            logging.info('Start crawling: %s' % self.root)
            start_time = time.time()
            self._crawl([self.root])
            end_time = time.time()

            self.last_process_time = end_time - start_time
            logging.info(
                'Crawling complete after: %s seconds with %s predicates.' %
                (self.last_process_time, len(self.graph)))

    def _crawl(self, uri_list):
        """
        Crawl RDF objects round by round, starting from ``uri_list``.

        Each URI is attempted at most once per call: a URI that fails to
        parse (or whose context stays too small for ``_has_context``) used
        to be retried on every round until the recursion limit was hit.
        :param uri_list: list of URIs to crawl
        """
        pending = list(uri_list)
        attempted = set()
        while pending:
            for uri in pending:
                attempted.add(str(uri))
                try:
                    # A few considerations about parsing params.
                    # publicID = uri due to redirection issues
                    # Format = None due to default params use 'XML'
                    self.graph.parse(uri, publicID=uri, format=None)
                    logging.info('[OK]: %s' % uri)
                except Exception as e:
                    logging.info('[Error]: %s: %s' % (uri, e))

            # Check that there are context that remains without parsing
            objects = set([
                self._get_context_id(o)
                for o in set(self.graph.objects(None, None))
                if isinstance(o, URIRef) and
                not self._has_context(self.graph, o)
            ])
            pending = [
                uri for uri in self.filter_uris(objects)
                if uri not in attempted
            ]
class Db(object):
    """
    the master graph, all the connected clients, all the files we're watching
    """

    def __init__(self, dirUriMap: DirUriMap, addlPrefixes):
        """
        start with an empty graph and no clients, then create the
        WatchedFiles helper, handing it our patch and getSubgraph
        methods as callbacks
        """
        self.clients: List[WebsocketClient] = []
        self.graph = ConjunctiveGraph()
        stats.graphLen = len(self.graph)
        stats.clients = len(self.clients)

        # NOTE(review): if WatchedFiles calls self.patch from its
        # constructor, self.watchedFiles is not assigned yet; patch()
        # guards one of its two uses with hasattr -- confirm.
        self.watchedFiles = WatchedFiles(dirUriMap, self.patch, self.getSubgraph, addlPrefixes)

        self.summarizeToLog()

    @graphStats.patchFps.rate()
    def patch(self, patch: Patch, sender: Optional[str] = None, dueToFileChange: bool = False) -> None:
        """
        apply this patch to the master graph then notify everyone about it

        dueToFileChange if this is a patch describing an edit we read
        *from* the file (such that we shouldn't write it back to the file)

        sender is compared with each client's connectionId so the
        patch is not sent back to the client it came from
        """
        ctx = patch.getContext()
        log.info("patching graph %s -%d +%d" % (ctx, len(patch.delQuads), len(patch.addQuads)))

        if hasattr(self, 'watchedFiles'):  # todo: eliminate this
            self.watchedFiles.aboutToPatch(ctx)

        # an error here needs to drop the sender, and reset everyone
        # else if we can't rollback the failing patch.
        patchQuads(self.graph, patch.delQuads, patch.addQuads, perfect=True)
        stats.graphLen = len(self.graph)

        self._syncPatchToOtherClients(patch, sender)
        if not dueToFileChange:
            # unlike aboutToPatch above, this use is not guarded by hasattr
            self.watchedFiles.dirtyFiles([ctx])
        graphStats.statements = len(self.graph)

    def _syncPatchToOtherClients(self, p: Patch, sender: Optional[str] = None):
        """
        send the patch to every client except the one whose
        connectionId equals sender
        """
        for c in self.clients:
            if sender is not None and c.connectionId == sender:
                # this client has self-applied the patch already
                log.debug("_syncPatchToOtherClients: don't resend to %r", c)
                continue
            log.debug('_syncPatchToOtherClients: send to %r', c)
            c.sendPatch(p)

    def clientErrored(self, err, c) -> None:
        """
        drop client c after a connection error; err.trap is called
        with ConnectError and WebsocketDisconnect
        """
        err.trap(twisted.internet.error.ConnectError, WebsocketDisconnect)
        log.info("%r %r - dropping client", c, err.getErrorMessage())
        if c in self.clients:
            self.clients.remove(c)
        stats.clients = len(self.clients)

    def summarizeToLog(self):
        """
        log the total statement count and the count per context
        """
        log.info("contexts in graph (%s total stmts):" % len(self.graph))
        for c in self.graph.contexts():
            log.info("  %s: %s statements" % (c.identifier, len(self.getSubgraph(c.identifier))))

    def getSubgraph(self, uri: URIRef) -> Graph:
        """
        this is meant to return a live view of the given subgraph, but
        if i'm still working around an rdflib bug, it might return a
        copy

        and it's returning triples, but I think quads would be better
        """
        # this is returning an empty Graph :(
        #return self.graph.get_context(uri)

        # copy the statements of context `uri` into a new Graph
        g = Graph()
        for s in self.graph.triples(ALLSTMTS, uri):
            g.add(s)
        return g

    def addClient(self, newClient: WebsocketClient) -> None:
        """
        send the whole graph to the new client as one add-only patch,
        then register the client
        """
        log.info("new connection: sending all graphs to %r" % newClient)
        newClient.sendPatch(
            Patch(addQuads=self.graph.quads(ALLSTMTS), delQuads=[]))
        self.clients.append(newClient)
        stats.clients = len(self.clients)
def testDeleteRunsBeforeAdd(self):
    """Deleting and adding stmt1 in one patch keeps the quad in the graph."""
    sameQuad = [stmt1]
    g = ConjunctiveGraph()

    # patchQuads takes the quads to delete before the quads to add.
    patchQuads(g, sameQuad, [stmt1])

    allTriples = (None, None, None)
    listed = list(g.quads(allTriples))
    self.assertEqual(listed, [(A, B, C, Graph(identifier=CTX1))])
# Demo script (Python 2 syntax: print statements): opens a persistent
# 'Sleepycat' rdflib store in "test.db", lists what it already holds and adds
# a statement to a named graph.
# NOTE(review): pkg_resources, sys and random are used below but not imported
# in this excerpt -- presumably imported earlier in the file; confirm.
from rdflib import URIRef, Literal, Namespace, Graph, ConjunctiveGraph, RDF
import rdflib
pkg_resources.require("rdfextras>=0.1")
from rdfextras import *

# Instantiate the store plugin registered under the name 'Sleepycat' and open
# (creating if needed) the database.
file_store = rdflib.plugin.get('Sleepycat', rdflib.store.Store)()
code = file_store.open("test.db", create=True)
if code != rdflib.store.VALID_STORE:
    print "something went wrong"
    sys.exit(-1)

# A ConjunctiveGraph over the store gives access to the quads of all graphs.
all_graphs = ConjunctiveGraph(store=file_store)
print "things already in the store"
for s, p, o, m in all_graphs.quads((None, None, None)):
    print s, p, o, m

file_store.bind('dc', Namespace("http://purl.org/dc/elements/1.1/"))
dc = Namespace("http://purl.org/dc/elements/1.1/")

# Named graph backed by the same store; the literal carries a random number,
# so each run adds a (most likely) different creator value.
a_graph = Graph(store=file_store, identifier=URIRef("http://example.com/graph1"))
a_graph.add((URIRef("http://example.com/i1"), dc["creator"], Literal("Author " + str(random.randint(1, 1000)))))
print "things in a_graph"
for s, p, o in a_graph:
    print s, p, o