def analyze_article_citations(self, num_of_articles=20, quiet=False):
    """Prints and returns a list of the num_of_articles (default 20)
    most important articles in the TFEU treaty, as determined by the
    number of citing cases."""

    # Create a mapping of article equivalencies, eg Art 28 TEC == Art 34 TFEU
    sameas = self._sameas()
    equivs = {}
    pred = util.ns['owl'] + "sameAs"
    for (s, o) in sameas.subject_objects(URIRef(pred)):
        equivs[str(o)] = str(s)
    self.log.debug("Defined %s equivalent article references" % len(equivs))

    # Select unique article citations
    store = TripleStore(self.config.storetype,
                        self.config.storelocation,
                        self.config.storerepository)
    sq = """PREFIX eurlex:<http://lagen.nu/eurlex#>
            SELECT DISTINCT ?case ?article WHERE {
                ?case eurlex:cites ?article .
                FILTER (regex(str(?article), "^http://lagen.nu/ext/celex/1"))
            }"""
    cites = store.select(sq, format="python")

    citationcount = {}
    unmapped = {}
    self.log.debug("Going through %s unique citations" % len(cites))
    for cite in cites:
        article = cite['article'].split("-")[0]
        if "12008M" in article:
            pass
        elif article in equivs:
            article = equivs[article]
        else:
            if article in unmapped:
                unmapped[article] += 1
            else:
                unmapped[article] = 1
            article = None

        # Keep track of the number of citing cases
        if article:
            if article in citationcount:
                citationcount[article] += 1
            else:
                citationcount[article] = 1

    # Report the most common cites to older treaty articles that
    # we have no equivalents for in TFEU
    # sorted_unmapped = sorted(unmapped.items(), key=itemgetter(1))[-num_of_articles:]
    # if not quiet:
    #     print("UNMAPPED:")
    #     pprint(sorted_unmapped)

    # Report and return the most cited articles
    sorted_citationcount = sorted(citationcount.items(),
                                  key=itemgetter(1))[-num_of_articles:]
    if not quiet:
        print("CITATION COUNTS:")
        pprint(sorted_citationcount)
    return [x[0] for x in reversed(sorted_citationcount)]
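# A standalone worked example (illustrative only, not from the repo) of the
# top-N idiom used in analyze_article_citations above: sort ascending by
# count, slice off the tail to keep the N largest, then reverse so the most
# cited item comes first. The keys are made-up article labels.
from operator import itemgetter
from pprint import pprint

counts = {"art34": 3, "art101": 7, "art49": 1}
top2 = sorted(counts.items(), key=itemgetter(1))[-2:]  # [('art34', 3), ('art101', 7)]
pprint([x[0] for x in reversed(top2)])                 # ['art101', 'art34']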
def tearDown(self):
    self.setupclass = False  # make sure super.tearDown deletes all files
    super(AdvancedAPI, self).tearDown()
    FulltextIndex.connect(self.indextype, self.indexlocation,
                          [DocumentRepository()]).destroy()
    TripleStore.connect(self.storetype, self.storelocation,
                        self.storerepository).clear()
def temp_analyze(self):
    store = TripleStore(self.config.storetype,
                        self.config.storelocation,
                        self.config.storerepository)
    # sq = self._query_cites('http://lagen.nu/ext/celex/12008E045',
    #                        self._sameas(), False, True, 2012)
    sq = self._query_cites(None, self._sameas(), False, False, 2012)
    print(sq)
    cites = store.select(sq, format="python")
    self.log.debug(" Citation graph contains %s citations" % (len(cites)))

    # remove duplicate citations, self-citations and pinpoints
    # in citations
    citedict = {}
    for cite in cites:
        # print(repr(cite))
        if "-" in cite['obj']:
            cite['obj'] = cite['obj'].split("-")[0]
        if (cite['subj'] != cite['obj']):
            citedict[(cite['subj'], cite['obj'])] = True
    self.log.debug(" Normalized graph contains %s citations" % len(citedict))

    degree = {}
    for citing, cited in list(citedict.keys()):
        if citing not in degree:
            degree[citing] = []
        if cited not in degree:
            degree[cited] = []
        degree[cited].append(citing)
    return
def download_from_triplestore(self):
    # Select every resource that references something which itself
    # references nothing (the leaves of the citation graph). The original
    # query used invalid SPARQL ("... AND NOT ..."); FILTER NOT EXISTS is
    # the standard way to express that negation, assuming that was the
    # intent.
    sq = """SELECT ?something ?references ?uri WHERE {
                ?something ?references ?uri .
                FILTER NOT EXISTS { ?uri ?references ?anything }
            }"""
    store = TripleStore(self.config.storetype,
                        self.config.storelocation,
                        self.config.storerepository)
    with self.store.open_downloaded("biggraph") as fp:
        # format="python" yields one dict per row, keyed by variable
        # name, which the %-formatting below relies on
        for row in store.select(sq, format="python"):
            fp.write("<%(something)s> <%(references)s> <%(uri)s> .\n" % row)
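# A hedged, self-contained demo (not part of the repo) of the FILTER NOT
# EXISTS pattern used in download_from_triplestore above. It runs against
# plain rdflib with a made-up example.org vocabulary, so no triplestore is
# needed:
from rdflib import Graph, URIRef

g = Graph()
cites = URIRef("http://example.org/cites")
a, b, c = (URIRef("http://example.org/" + x) for x in "abc")
g.add((a, cites, b))
g.add((b, cites, c))  # b cites something; c cites nothing

rows = g.query("""SELECT ?s ?o WHERE {
    ?s <http://example.org/cites> ?o .
    FILTER NOT EXISTS { ?o <http://example.org/cites> ?x }
}""")
# only (b, c) survives: c is cited but cites nothing itself
assert list(rows) == [(b, c)]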
def test_sqlite_add_serialized(self, mock_graph):
    store = TripleStore.connect("SQLITE", "", "")
    store.add_serialized("tripledata", "nt")
    self.assertTrue(mock_graph.return_value.parse.called)
    self.assertTrue(mock_graph.return_value.commit.called)
    mock_graph.reset_mock()

    store.add_serialized("tripledata", "nt", "namedgraph")
    self.assertTrue(mock_graph.return_value.get_context.called)
    self.assertTrue(
        mock_graph.return_value.get_context.return_value.parse.called)

    store = TripleStore.connect("SQLITE", "", "", inmemory=True)
    with self.assertRaises(errors.TriplestoreError):
        store.add_serialized("tripledata", "nt")
def test_sqlite_close(self, mock_graph):
    # make sure this weird but harmless sqlite3 exception is caught
    mock_graph.return_value.close.side_effect = sqlite3.ProgrammingError(
        "You made a wrong")
    store = TripleStore.connect("SQLITE", "", "")
    store.close()
def test_fuseki_get_serialized(self, mock_get):
    store = TripleStore.connect("FUSEKI", "", "", curl=False)
    # test 1: a namedgraph (cases with no context are already run by
    # test_fuseki_get_serialized_file)
    want = util.readfile("test/files/triplestore/namedgraph.nt", "rb")
    got = store.get_serialized(context="namedgraph")  # results in single get
    self.assertEqual(want, got)
def test_sesame_select(self, mock_get):
    store = TripleStore.connect("SESAME", "", "")
    rf = util.readfile

    want = rf("test/files/triplestore/select-results.xml").encode()
    got = store.select("the-query")
    self.assertEqual(want, got)
    self.assertEqual(mock_get.call_count, 1)

    want = rf("test/files/triplestore/select-results.json")
    got = store.select("the-query", format="json").decode()
    self.assertEqual(json.loads(want), json.loads(got))
    self.assertEqual(mock_get.call_count, 2)

    want = json.loads(
        rf("test/files/triplestore/select-results-python.json"),
        object_hook=util.make_json_date_object_hook("issued"))
    got = store.select("the-query", format="python")
    self.assertEqual(want, got)
    self.assertEqual(mock_get.call_count, 3)

    with self.assertRaises(errors.TriplestoreError):
        mockresponse = Mock()
        mockresponse.text = "This is the actual error text"
        mock_get.side_effect = requests.exceptions.HTTPError(
            "Server error", response=mockresponse)
        got = store.select("the-query", format="python")
def test_fuseki_get_serialized_file(self, mock_get):
    # Test 1: imagine that server has data in the default graph
    # and in one named graph
    rf = util.readfile
    tmp = mkdtemp()
    try:
        store = TripleStore.connect("FUSEKI", "", "")
        # test 1.1: Get everything, assert that the result is a combo
        store.get_serialized_file(tmp + "/out.nt")  # no ctx, will result in 2 gets
        self.assertEqual(mock_get.call_count, 2)
        self.assertEqual(rf("test/files/triplestore/combinedgraph.nt"),
                         rf(tmp + "/out.nt"))
        # test 1.2: Get only namedgraph, assert that only that is returned
        store.get_serialized_file(tmp + "/out.nt",
                                  context="namedgraph")  # 1 get
        self.assertEqual(rf("test/files/triplestore/namedgraph.nt"),
                         rf(tmp + "/out.nt"))
        self.assertEqual(mock_get.call_count, 3)
        # test 1.3: Get everything in a different format
        store.get_serialized_file(tmp + "/out.ttl",
                                  format="turtle")  # results in 2 gets
        self.assertEqualGraphs("test/files/triplestore/combinedgraph.ttl",
                               tmp + "/out.ttl")
        self.assertEqual(mock_get.call_count, 5)
    finally:
        shutil.rmtree(tmp)
def test_sqlite_add_serialized_file(self, mock_graph):
    store = TripleStore.connect("SQLITE", "", "")
    fd, tmpname = mkstemp()
    fp = os.fdopen(fd, "w")
    fp.write("tripledata")
    fp.close()
    store.add_serialized_file(tmpname, "nt")
    os.unlink(tmpname)
def test_sqlite_clear(self, mock_graph):
    store = TripleStore.connect("SQLITE", "", "")
    g = Graph()
    g.add((URIRef("http://example.org/doc1"), RDFS.comment, Literal("Hey")))
    g.add((URIRef("http://example.org/doc2"), RDFS.comment, Literal("Ho")))
    mock_graph.return_value.get_context.return_value = g
    store.clear("namedgraph")
    self.assertEqual(2, mock_graph.return_value.remove.call_count)
    self.assertEqual(1, mock_graph.return_value.commit.call_count)
def test_sqlite_init(self, mock_graph):
    # create a new db that doesn't exist
    mock_graph.open.return_value = 42
    store = TripleStore.connect("SQLITE", "", "")
    self.assertTrue(mock_graph.return_value.open.called)
    self.assertTrue(mock_graph.return_value.open.call_args[1]['create'])

    # reopen an existing db
    fd, tmpname = mkstemp()
    fp = os.fdopen(fd)
    fp.close()
    store = TripleStore.connect("SQLITE", tmpname, "")
    os.unlink(tmpname)
    self.assertFalse(mock_graph.return_value.open.call_args[1]['create'])

    # make an inmemory db
    store = TripleStore.connect("SQLITE", "", "", inmemory=True)
    self.assertTrue(mock_graph.return_value.quads.called)
    self.assertTrue(mock_graph.return_value.addN.called)
def download(self, basefile=None):
    # Get all "term sets" (used dcterms:subject Objects, wiki pages
    # describing legal concepts, swedish wikipedia pages...)
    terms = defaultdict(dict)

    # 1) Query the triplestore for all dcterms:subject triples (is this
    # semantically sensible for a "download" action -- the content
    # isn't really external?) -- term set "subjects" (these come
    # from both court cases and legal definitions in law text)
    sq = """
        PREFIX dcterms:<http://purl.org/dc/terms/>
        PREFIX rdfs:<http://www.w3.org/2000/01/rdf-schema#>

        SELECT ?uri ?subject ?label
        WHERE {
            {?uri dcterms:subject ?subject . }
            OPTIONAL {?subject rdfs:label ?label . }
        }
    """
    store = TripleStore.connect(self.config.storetype,
                                self.config.storelocation,
                                self.config.storerepository)
    results = store.select(sq, "python")
    for row in results:
        if 'label' in row:
            label = row['label']
        else:
            label = self.basefile_from_uri(row['subject'])
            if label is None:
                self.log.warning("could not determine keyword from %s" %
                                 row['subject'])
                continue
        sanitized = self.sanitize_term(label)
        if sanitized:
            if sanitized not in terms:
                terms[sanitized]['subjects'] = []
            terms[sanitized]['subjects'].append(row['uri'])

    self.log.debug("Retrieved %s subject terms from triplestore" %
                   len(terms))

    for termset_func in self.termset_funcs:
        termset_func(terms)

    for term in terms:
        term = self.sanitize_term(term)
        if not term:
            continue
        oldterms = ""
        termpath = self.store.downloaded_path(term)
        if os.path.exists(termpath):
            oldterms = yaml.load(util.readfile(termpath))
        if terms[term] != oldterms:
            util.ensure_dir(termpath)
            util.writefile(termpath, yaml.dump(terms[term],
                                               default_flow_style=False))
            self.log.info("%s: in %s termsets" % (term, len(terms[term])))
        else:
            self.log.debug("%s: skipped" % term)
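# Illustrative aside (not from the repo): the change-detection step above
# relies on the YAML serialization round-tripping cleanly -- a dict dumped
# with default_flow_style=False and loaded back compares equal, so an
# unchanged termset is detected and the file write is skipped:
import yaml

data = {"subjects": ["http://example.org/doc1", "http://example.org/doc2"]}
text = yaml.dump(data, default_flow_style=False)
assert yaml.safe_load(text) == data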
def test_sesame_add_serialized(self, mock_post):
    store = TripleStore.connect("SESAME", "", "")
    rf = util.readfile
    store.add_serialized(rf("test/files/triplestore/defaultgraph.ttl"),
                         format="turtle")
    self.assertEqual(mock_post.call_count, 1)

    store.add_serialized(rf("test/files/triplestore/namedgraph.nt"),
                         format="nt",
                         context="namedgraph")
    self.assertEqual(mock_post.call_count, 2)
def test_sesame_get_serialized(self, mock_get):
    store = TripleStore.connect("SESAME", "", "")
    want = util.readfile("test/files/triplestore/combinedgraph.nt", "rb")
    got = store.get_serialized()
    self.assertEqual(want, got)
    self.assertEqual(mock_get.call_count, 1)

    want = util.readfile("test/files/triplestore/namedgraph.nt", "rb")
    got = store.get_serialized(context="namedgraph")  # results in single get
    self.assertEqual(want, got)
    self.assertEqual(mock_get.call_count, 2)
def test_curl(self, runcmd_mock):
    # needs to test add_serialized, add_serialized_file, get_serialized
    # and get_serialized_file. We'll patch util.runcmd and make sure that
    # the command line is correct. We should also have util.runcmd return
    # a non-zero return code once.
    # our util.runcmd replacement should, for the get_serialized file,
    # create a suitable temp file
    store = TripleStore.connect("FUSEKI", "", "", curl=True)

    # 1. add_serialized
    runcmd_mock.return_value = (0, "", "")
    store.add_serialized("tripledata", "nt")
    cmdline = runcmd_mock.call_args[0][0]  # first ordered argument
    # replace the temporary file name
    cmdline = re.sub('"@[^"]+"', '"@tempfile.nt"', cmdline)
    self.assertEqual('curl -X POST --data-binary "@tempfile.nt" '
                     '--header "Content-Type:text/plain;charset=UTF-8" '
                     '"//data?default"',
                     cmdline)
    # NB: Mock's method is reset_mock(); the original called the
    # nonexistent mock_reset(), which silently created a child mock
    # instead of resetting anything
    runcmd_mock.reset_mock()

    # 2. add_serialized_file
    runcmd_mock.return_value = (0, "", "")
    store.add_serialized_file("tempfile.nt", "nt")
    cmdline = runcmd_mock.call_args[0][0]  # first ordered argument
    self.assertEqual('curl -X POST --data-binary "@tempfile.nt" '
                     '--header "Content-Type:text/plain;charset=UTF-8" '
                     '"//data?default"',
                     cmdline)
    runcmd_mock.reset_mock()

    # 3. get_serialized
    def create_tempfile(*args, **kwargs):
        filename = re.search('-o "([^"]+)"', args[0]).group(1)
        with open(filename, "wb") as fp:
            fp.write("tripledata\n".encode())
        return (0, "", "")
    runcmd_mock.side_effect = create_tempfile
    res = store.get_serialized("nt")
    self.assertEqual(b"tripledata\ntripledata\n", res)
    cmdline = runcmd_mock.call_args[0][0]  # first ordered argument
    # replace the temporary file name
    cmdline = re.sub('-o "[^"]+"', '-o "tempfile.nt"', cmdline)
    # FIXME is this really right?
    self.assertEqual('curl -o "tempfile.nt" '
                     '--header "Accept:text/plain" '
                     '"//data?graph=urn:x-arq:UnionGraph"',
                     cmdline)
    runcmd_mock.side_effect = None
    runcmd_mock.reset_mock()

    # 4. get_serialized_file
    store.get_serialized_file("triples.xml", "xml")
    cmdline = runcmd_mock.call_args[0][0]  # first ordered argument
    self.assertEqual('curl -o "triples.xml" '
                     '--header "Accept:application/rdf+xml" '
                     '"//data?default"',
                     cmdline)
    runcmd_mock.reset_mock()

    # 5. handle errors
    with self.assertRaises(errors.TriplestoreError):
        runcmd_mock.return_value = (1, "", "Internal error")
        store.get_serialized_file("triples.nt", "nt")
def test_curl(self, runcmd_mock):
    # needs to test add_serialized, add_serialized_file, get_serialized
    # and get_serialized_file. We'll patch util.runcmd and make sure that
    # the command line is correct. We should also have util.runcmd return
    # a non-zero return code once.
    # our util.runcmd replacement should, for the get_serialized file,
    # create a suitable temp file
    store = TripleStore.connect("FUSEKI", "", "", curl=True)

    # 1. add_serialized
    runcmd_mock.return_value = (0, "", "")
    store.add_serialized("tripledata", "nt")
    cmdline = runcmd_mock.call_args[0][0]  # first ordered argument
    # replace the temporary file name
    cmdline = re.sub('"@[^"]+"', '"@tempfile.nt"', cmdline)
    self.assertEqual('curl -X POST --data-binary "@tempfile.nt" '
                     '--header "Content-Type:application/n-triples;charset=UTF-8" '
                     '"//data?default"',
                     cmdline)
    # NB: Mock's method is reset_mock(), not mock_reset() as originally
    # written (which silently created a child mock)
    runcmd_mock.reset_mock()

    # 2. add_serialized_file
    runcmd_mock.return_value = (0, "", "")
    store.add_serialized_file("tempfile.nt", "nt")
    cmdline = runcmd_mock.call_args[0][0]  # first ordered argument
    self.assertEqual('curl -X POST --data-binary "@tempfile.nt" '
                     '--header "Content-Type:application/n-triples;charset=UTF-8" '
                     '"//data?default"',
                     cmdline)
    runcmd_mock.reset_mock()

    # 3. get_serialized
    def create_tempfile(*args, **kwargs):
        filename = re.search('-o "([^"]+)"', args[0]).group(1)
        with open(filename, "wb") as fp:
            fp.write("tripledata\n".encode())
        return (0, "", "")
    runcmd_mock.side_effect = create_tempfile
    res = store.get_serialized("nt")
    self.assertEqual(b"tripledata\ntripledata\n", res)
    cmdline = runcmd_mock.call_args[0][0]  # first ordered argument
    # replace the temporary file name
    cmdline = re.sub('-o "[^"]+"', '-o "tempfile.nt"', cmdline)
    # FIXME is this really right?
    self.assertEqual('curl -o "tempfile.nt" '
                     '--header "Accept:application/n-triples" '
                     '"//data?graph=urn:x-arq:UnionGraph"',
                     cmdline)
    runcmd_mock.side_effect = None
    runcmd_mock.reset_mock()

    # 4. get_serialized_file
    store.get_serialized_file("triples.xml", "xml")
    cmdline = runcmd_mock.call_args[0][0]  # first ordered argument
    self.assertEqual('curl -o "triples.xml" '
                     '--header "Accept:application/rdf+xml" '
                     '"//data?default"',
                     cmdline)
    runcmd_mock.reset_mock()

    # 5. handle errors
    with self.assertRaises(errors.TriplestoreError):
        runcmd_mock.return_value = (1, "", "Internal error")
        store.get_serialized_file("triples.nt", "nt")
def test_sesame_construct(self, mock_get):
    store = TripleStore.connect("SESAME", "", "")
    rf = util.readfile
    want = Graph()
    want.parse(data=rf("test/files/triplestore/construct-results.ttl"),
               format="turtle")
    got = store.construct("the-query")
    self.assertEqualGraphs(want, got)
    self.assertEqual(mock_get.call_count, 1)

    with self.assertRaises(errors.TriplestoreError):
        mock_get.side_effect = requests.exceptions.HTTPError("Server error")
        got = store.construct("the-query")
def eval_get_goldstandard(self, basefile):
    goldstandard = Graph()
    goldstandard_rdf = util.relpath(
        os.path.dirname(__file__) + "/../res/eut/goldstandard.n3")
    goldstandard.load(goldstandard_rdf, format="n3")

    pred = util.ns['ir'] + 'isRelevantFor'
    res = {}
    store = TripleStore(self.config.storetype,
                        self.config.storelocation,
                        self.config.storerepository)
    sq_templ = """PREFIX eurlex:<http://lagen.nu/eurlex#>
                  SELECT ?party ?casenum ?celexnum WHERE {
                      <%s> eurlex:party ?party ;
                           eurlex:casenum ?casenum ;
                           eurlex:celexnum ?celexnum .
                  }"""

    self.log.debug("Loading gold standard relevance judgments for %s" %
                   basefile)
    for article in self._articles(basefile):
        res[article] = []
        for o in goldstandard.objects(URIRef(article), URIRef(pred)):
            res[article].append(str(o))
            # Make sure the case exists and is the case we're looking for
            sq = sq_templ % str(o)
            parties = store.select(sq, format="python")
            if parties:
                pass
                # self.log.debug(" %s: %s (%s)" %
                #                (parties[0]['celexnum'],
                #                 parties[0]['casenum'],
                #                 " v ".join([x['party'] for x in parties])))
            else:
                self.log.warning("Can't find %s in triple store!" % o)
        self.log.debug(" Gold standard for %s: %s relevant docs" %
                       (article, len(res[article])))
        res[article].sort()
    return res
def prep_annotation_file_termsets(self, basefile, main_node):
    dvdataset = self.config.url + "dataset/dv"
    sfsdataset = self.config.url + "dataset/sfs"
    store = TripleStore.connect(self.config.storetype,
                                self.config.storelocation,
                                self.config.storerepository)
    legaldefs = self.time_store_select(store, "sparql/keyword_sfs.rq",
                                       basefile, sfsdataset, "legaldefs")
    rattsfall = self.time_store_select(store, "sparql/keyword_dv.rq",
                                       basefile, dvdataset, "legalcases")

    # compatibility hack to enable lxml to process qnames for
    # namespaces FIXME: this is copied from sfs.py -- but could
    # probably be removed once we rewrite this method to use real
    # RDFLib graphs
    def ns(string):
        if ":" in string:
            prefix, tag = string.split(":", 1)
            return "{%s}%s" % (str(self.ns[prefix]), tag)

    for r in rattsfall:
        subject_node = etree.SubElement(main_node, ns("dcterms:subject"))
        rattsfall_node = etree.SubElement(subject_node, ns("rdf:Description"))
        rattsfall_node.set(ns("rdf:about"), r['uri'])
        id_node = etree.SubElement(rattsfall_node, ns("dcterms:identifier"))
        id_node.text = r['id']
        desc_node = etree.SubElement(rattsfall_node,
                                     ns("dcterms:description"))
        desc_node.text = r['desc']

    for l in legaldefs:
        subject_node = etree.SubElement(main_node, ns("rinfoex:isDefinedBy"))
        legaldef_node = etree.SubElement(subject_node, ns("rdf:Description"))
        legaldef_node.set(ns("rdf:about"), l['uri'])
        id_node = etree.SubElement(legaldef_node, ns("rdfs:label"))
        # id_node.text = "%s %s" % (l['uri'].split("#")[1], l['label'])
        id_node.text = self.sfsrepo.display_title(l['uri'])

    if 'wikipedia\n' in util.readfile(self.store.downloaded_path(basefile)):
        subject_node = etree.SubElement(main_node, ns("rdfs:seeAlso"))
        link_node = etree.SubElement(subject_node, ns("rdf:Description"))
        link_node.set(ns("rdf:about"),
                      'http://sv.wikipedia.org/wiki/' +
                      basefile.replace(" ", "_"))
        label_node = etree.SubElement(link_node, ns("rdfs:label"))
        label_node.text = ("Begreppet %s finns även beskrivet på "
                           "svenska Wikipedia" % basefile)
def test_sqlite_construct(self, mock_graph):
    store = TripleStore.connect("SQLITE", "", "")
    # NB: CONSTRUCT templates need braces; the original query lacked them
    # (harmless here only because the query method is mocked)
    sq = """CONSTRUCT {?s ?p ?o} WHERE {?o ?p ?s . }"""
    g = Graph()
    g.add((URIRef("http://example.org/doc1"), RDFS.comment, Literal("Hey")))
    g.add((URIRef("http://example.org/doc2"), RDFS.comment, Literal("Ho")))
    res = Mock()  # was "res = Mock", which set attributes on the class itself
    res.graph = g
    mock_graph.return_value.query.return_value = res
    self.assertEqual(g, store.construct(sq))

    mock_graph.return_value.query.side_effect = pyparsing.ParseException(
        "Syntax error")
    with self.assertRaises(errors.SparqlError):
        store.construct(sq)
def prep_annotation_file(self, basefile):
    uri = self.canonical_uri(basefile)
    keyword = basefile
    store = TripleStore.connect(self.config.storetype,
                                self.config.storelocation,
                                self.config.storerepository)

    # Use SPARQL queries to create an rdf graph (to be used by the
    # xslt transform) containing the wiki authored
    # dcterms:description for this term. FIXME: This isn't a real
    # RDF graph yet.
    wikidesc = self.time_store_select(store, "sparql/keyword_subjects.rq",
                                      basefile, None, "descriptions")

    # compatibility hack to enable lxml to process qnames for namespaces
    def ns(string):
        if ":" in string:
            prefix, tag = string.split(":", 1)
            return "{%s}%s" % (str(self.ns[prefix]), tag)

    # FIXME: xhv MUST be part of nsmap
    if 'xhtml' not in self.ns:
        self.ns['xhtml'] = "http://www.w3.org/1999/xhtml"

    root_node = etree.Element(ns("rdf:RDF"), nsmap=self.ns)
    main_node = etree.SubElement(root_node, ns("rdf:Description"))
    main_node.set(ns("rdf:about"), uri)

    for d in wikidesc:
        desc_node = etree.SubElement(main_node, ns("dcterms:description"))
        xhtmlstr = ("<div xmlns='http://www.w3.org/1999/xhtml'>%s</div>" %
                    d['desc'])
        # xhtmlstr = xhtmlstr.replace(
        #     ' xmlns="http://www.w3.org/1999/xhtml"', '')
        desc_node.append(etree.fromstring(xhtmlstr.encode('utf-8')))

    # subclasses override this to add extra annotations from other
    # sources
    self.prep_annotation_file_termsets(basefile, main_node)

    treestring = etree.tostring(root_node, encoding="utf-8",
                                pretty_print=True)
    with self.store.open_annotation(basefile, mode="wb") as fp:
        fp.write(treestring)
    return self.store.annotation_path(basefile)
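# Illustrative aside (not from the repo): what the ns() compatibility helper
# above produces. lxml expects namespaced tags in Clark notation,
# "{namespace-uri}localname", which the helper builds from a "prefix:tag"
# qname and a prefix-to-URI mapping. A minimal standalone version:
from lxml import etree

nsmap = {"dcterms": "http://purl.org/dc/terms/"}

def ns(string):
    prefix, tag = string.split(":", 1)
    return "{%s}%s" % (nsmap[prefix], tag)

el = etree.Element(ns("dcterms:description"))
assert el.tag == "{http://purl.org/dc/terms/}description"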
def test_fuseki_clear(self, mock_post, mock_delete):
    store = TripleStore.connect("FUSEKI", "", "")
    store.clear()
    self.assertEqual(mock_delete.call_count, 0)
    self.assertEqual(mock_post.call_count, 1)

    with self.assertRaises(errors.TriplestoreError):
        mock_post.side_effect = requests.exceptions.ConnectionError(
            "Server error")
        got = store.clear()

    with self.assertRaises(errors.TriplestoreError):
        mock_post.side_effect = requests.exceptions.HTTPError("Server error")
        got = store.clear()

    # a "No such graph" error is expected to be swallowed, not raised
    mock_post.side_effect = requests.exceptions.HTTPError("No such graph")
    got = store.clear("namedgraph")
def select(self, template, uri, format="json"):
    sq = util.readfile(template) % {'uri': uri}
    ts = TripleStore.connect(self.config.storetype,
                             self.config.storelocation,
                             self.config.storerepository)
    print("# Constructing the following from %s, repository %s, type %s" %
          (self.config.storelocation,
           self.config.storerepository,
           self.config.storetype))
    print("".join(["# %s\n" % x for x in sq.split("\n")]))
    p = {}
    with util.logtime(print, "# Selected in %(elapsed).3fs", p):
        res = ts.select(sq, format=format)
        print(res.decode('utf-8'))
def dumpstore(self, format="turtle"): """Extract all RDF data from the system triplestore and dump it to stdout using the specified format. :param format: The serialization format for RDF data (same as for :py:meth:`ferenda.TripleStore.get_serialized`). :type format: str Example:: ./ferenda-build.py devel dumpstore nt > alltriples.nt """ # print("Creating store of type %s, location %s, repository %s" % # (self.config.storetype, self.config.storelocation, self.config.storerepository)) store = TripleStore.connect(self.config.storetype, self.config.storelocation, self.config.storerepository) print(store.get_serialized(format=format).decode('utf-8'))
def test_sqlite_select(self, mock_graph):
    store = TripleStore.connect("SQLITE", "", "")
    sq = """SELECT ?p FROM <http://example.org/ctx> WHERE {?s ?p ?o . }"""
    res = mock_graph.return_value.get_context.return_value.query.return_value
    want = [{"s": "http://example.org/doc1",
             "p": "http://www.w3.org/2000/01/rdf-schema#comment",
             "o": "Hello"}]
    res.bindings = want
    self.assertEqual(want, store.select(sq, format="python"))

    mock_graph.reset_mock()
    store.select(sq, "sparql")
    mock_graph.return_value.get_context.return_value.query.return_value.serialize.assert_called_with(format="xml")
    store.select(sq, "json")
    mock_graph.return_value.get_context.return_value.query.return_value.serialize.assert_called_with(format="json")

    mock_graph.return_value.get_context.return_value.query.side_effect = \
        pyparsing.ParseException("Syntax error")
    with self.assertRaises(errors.SparqlError):
        store.select(sq)
def download(self, basefile=None):
    # Get all "term sets" (used dcterms:subject Objects, wiki pages
    # describing legal concepts, swedish wikipedia pages...)
    terms = defaultdict(dict)

    # 1) Query the triplestore for all dcterms:subject triples (is this
    # semantically sensible for a "download" action -- the content
    # isn't really external?) -- term set "subjects" (these come
    # from both court cases and legal definitions in law text)
    sq = """
        PREFIX dcterms:<http://purl.org/dc/terms/>
        PREFIX rdfs:<http://www.w3.org/2000/01/rdf-schema#>

        SELECT DISTINCT ?subject ?label
        WHERE {
            {?uri dcterms:subject ?subject . }
            OPTIONAL {?subject rdfs:label ?label . }
        }
    """
    store = TripleStore.connect(self.config.storetype,
                                self.config.storelocation,
                                self.config.storerepository)
    results = store.select(sq, "python")
    for row in results:
        if 'label' in row:
            label = row['label']
        else:
            label = self.basefile_from_uri(row['subject'])
        # basefile_from_uri may return None; guard before calling len()
        if label and len(label) < 100:  # sanity, no legit keyword is 100 chars
            terms[label]['subjects'] = True

    self.log.debug("Retrieved %s subject terms from triplestore" %
                   len(terms))

    for termset_func in self.termset_funcs:
        termset_func(terms)

    for term in terms:
        if not term:
            continue
        self.log.info("%s: in %s termsets" % (term, len(terms[term])))
        with self.store.open_downloaded(term, "w") as fp:
            for termset in sorted(terms[term]):
                fp.write(termset + "\n")
def test_sesame_select(self, mock_get):
    store = TripleStore.connect("SESAME", "", "")
    rf = util.readfile

    want = rf("test/files/triplestore/select-results.xml")
    got = store.select("the-query")
    self.assertEqual(want, got)
    self.assertEqual(mock_get.call_count, 1)

    want = json.loads(rf("test/files/triplestore/select-results.json"))
    got = store.select("the-query", format="json")
    self.assertEqual(want, got)
    self.assertEqual(mock_get.call_count, 2)

    want = json.loads(
        rf("test/files/triplestore/select-results-python.json"))
    got = store.select("the-query", format="python")
    self.assertEqual(want, got)
    self.assertEqual(mock_get.call_count, 3)

    with self.assertRaises(errors.TriplestoreError):
        mock_get.side_effect = requests.exceptions.HTTPError("Server error")
        got = store.select("the-query", format="python")
def test_invalid_store(self):
    with self.assertRaises(ValueError):
        TripleStore.connect("INVALID", "", "")
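# Hedged companion sketch (not part of the test suite): the storetype
# strings exercised elsewhere in this module. TripleStore.connect()
# dispatches on this string; anything else raises the ValueError that
# test_invalid_store asserts. The empty strings stand in for the location
# and repository arguments -- real use needs actual values for a reachable
# store.
for storetype in ("FUSEKI", "SESAME", "SQLITE", "SLEEPYCAT"):
    store = TripleStore.connect(storetype, "", "")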
def eval_get_ranked_set(self, basefile, algorithm="pagerank",
                        age_compensation=False, restrict_cited=True):
    # * algorithm: can be "indegree", "hits" or "pagerank".
    # * age_compensation: create one graph per year and average to
    #   compensate for newer cases (that have had less time to gain
    #   citations)
    # * restrict_cited: Use only such citations that exist between
    #   two cases that both cite the same TFEU article (otherwise,
    #   use all citations from all cases that cite the TFEU
    #   article, regardless of whether the cited case also cites
    #   the same TFEU article)

    sameas = self._sameas()
    store = TripleStore(self.config.storetype,
                        self.config.storelocation,
                        self.config.storerepository)
    res = {}
    self.log.debug("Creating ranked set (%s,age_compensation=%s,restrict_cited=%s)" %
                   (algorithm, age_compensation, restrict_cited))

    for article in self._articles(basefile):
        article_celex = article.split("/")[-1]
        self.log.debug(" Creating ranking for %s" % (article_celex))
        this_year = datetime.datetime.today().year
        if age_compensation:
            years = list(range(1954, this_year + 1))
            # years = range(this_year - 3, this_year)  # testing
        else:
            years = list(range(this_year, this_year + 1))

        result_by_years = []
        for year in years:
            restrict_citing = True  # always performs better
            if (article, year, restrict_cited) in self._graph_cache:
                # self.log.debug("Reusing cached graph (%s) for %s in %s" %
                #                (restrict_cited, article_celex, year))
                graph = self._graph_cache[(article, year, restrict_cited)]
            else:
                # self.log.debug("Calculating graph for %s in %s" %
                #                (article_celex, year))
                sq = self._query_cites(article, sameas, restrict_citing,
                                       restrict_cited, year)
                links = store.select(sq, format="python")
                graph = self.eval_build_nx_graph(links)
                self._graph_cache[(article, year, restrict_cited)] = graph
            self.log.debug("  Citegraph for %s in %s has %s edges, %s nodes" %
                           (article_celex, year,
                            len(graph.edges()), len(graph.nodes())))
            if len(graph.nodes()) == 0:
                continue

            ranked = self.eval_rank_graph(graph, algorithm)
            result_by_years.append({})
            for result, score in ranked:
                result_by_years[-1][result] = score

        if age_compensation:
            compensated_ranking = {}
            for d, score in ranked:  # the last result set
                # cut out the year part of the URI
                celex = d.split("/")[-1]
                try:
                    # cases decided this year have age 1
                    age = this_year + 1 - int(celex[1:5])
                    # scores = [0,0,0 ... 3,4,8,22]
                    scores = [result_by_year[d]
                              for result_by_year in result_by_years
                              if d in result_by_year]
                    avg_score = sum(scores) / float(age)
                    # self.log.debug("Result %s (age %s, avg score %s) %r" %
                    #                (d, age, avg_score, scores))
                    compensated_ranking[d] = avg_score
                except ValueError:
                    continue

        # return just a list of results, no scores
        if age_compensation:
            res[article] = [result for result in
                            sorted(compensated_ranking,
                                   key=compensated_ranking.__getitem__,
                                   reverse=True)]
        else:
            res[article] = [result[0] for result in ranked]
    return res
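# Worked example (illustrative numbers, not from the repo) of the age
# compensation arithmetic above: a case decided in 2010 and evaluated with
# this_year=2012 has age 3, and its per-year scores are averaged over that
# age so young cases aren't punished for having had fewer citing years.
this_year, decided = 2012, 2010
age = this_year + 1 - decided          # cases decided this year have age 1
scores = [0.0, 0.2, 0.4]               # one rank score per yearly graph
avg_score = sum(scores) / float(age)   # -> ~0.2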
def analyze_citation_graphs(self, articles=None):
    # Basic setup
    # articles = self._articles('tfeu')[-1:]
    if not articles:
        articles = [None]
    if None not in articles:
        articles.append(None)
    this_year = datetime.datetime.today().year
    store = TripleStore(self.config.storetype,
                        self.config.storelocation,
                        self.config.storerepository)
    sameas = self._sameas()
    distributions = []

    # For each article (and also for no article = the entire
    # citation graph)
    for article in articles:
        # Get a list of all eligible cases (needed for proper degree
        # distribution)
        sq = self._query_cases(article, sameas)
        # print(sq)
        cases = {}
        caserows = store.select(sq, format="python")
        for r in caserows:
            cases[r['subj']] = 0
        self.log.info("Creating graphs for %s (%s cases)" %
                      (article, len(cases)))

        # Step 1. SPARQL the graph on the form ?citing ?cited
        # (optionally restricting on citing a particular article)
        if article:
            sq = self._query_cites(article, sameas, True, False,
                                   this_year + 1)
        else:
            sq = self._query_cites(None, sameas, False, False,
                                   this_year + 1)
        cites = store.select(sq, format="python")
        self.log.debug(" Citation graph contains %s citations" %
                       (len(cites)))

        # remove duplicate citations, self-citations and pinpoints
        # in citations
        citedict = {}
        missingcases = {}
        for cite in cites:
            # print(repr(cite))
            if "-" in cite['obj']:
                cite['obj'] = cite['obj'].split("-")[0]
            if not cite['obj'] in cases:
                # print("Case %s (cited in %s) does not exist!\n" %
                #       (cite['obj'], cite['subj']))
                missingcases[cite['obj']] = True
                continue
            if (cite['subj'] != cite['obj']):
                citedict[(cite['subj'], cite['obj'])] = True
        self.log.debug(" Normalized graph contains %s citations "
                       "(%s cited cases not found)" %
                       (len(citedict), len(missingcases)))
        # pprint(list(missingcases.keys())[:10])

        # Step 2. Dotify the list (maybe the direction of arrows from
        # cited to citing can improve results?) to create a citation
        # graph
        self.analyse_citegraph_graphviz(list(citedict.keys()), article)

        # Step 3. Create a degree distribution plot
        degree, distribution = self.analyze_citegraph_degree_distribution(
            cases, list(citedict.keys()), article)
        if article:
            distributions.append([article, distribution])

        # Step 4. Create a citation/age scatterplot (or rather hexbin)
        self.analyze_citegraph_citation_age_plot(
            list(citedict.keys()), degree, distribution, article)

    # Step 5. Create a combined degree distribution graph of the
    # distinct citation networks. Also add the degree distribution
    # of gold standard cases
    self.analyze_citegraph_combined_degree_distribution(distributions)
def test_sleepycat_init(self, mock_graph):
    store = TripleStore.connect("SLEEPYCAT", "", "")
def test_sleepycat_triple_count(self, mock_graph):
    store = TripleStore.connect("SLEEPYCAT", "", "")
    self.assertEqual(0, store.triple_count())
def test_sqlite_remove_repository(self, mock_graph):
    store = TripleStore.connect("SQLITE", "", "")
    store.remove_repository()
    self.assertTrue(mock_graph.return_value.destroy.called)
def test_sqlite_initialize_triplestore(self, mock_graph):
    store = TripleStore.connect("SQLITE", "", "")
    store.initialize_repository()
    self.assertTrue(mock_graph.return_value.open.call_args[1]['create'])
def test_sqlite_triple_count(self, mock_graph):
    store = TripleStore.connect("SQLITE", "", "")
    self.assertEqual(0, store.triple_count())