Example #1
    def analyze_article_citations(self, num_of_articles=20, quiet=False):
        """Prints and returns a list of the top 20 most important articles in the
        TFEU treaty, as determined by the number of citing cases."""

        # Create a mapping of article equivalencies, e.g. Art 28 TEC == Art 34 TFEU
        sameas = self._sameas()
        equivs = {}
        pred = util.ns['owl'] + "sameAs"
        for (s, o) in sameas.subject_objects(URIRef(pred)):
            equivs[str(o)] = str(s)
        self.log.debug("Defined %s equivalent article references" %
                       len(equivs))

        # Select unique article citations
        store = TripleStore(self.config.storetype, self.config.storelocation,
                            self.config.storerepository)
        sq = """PREFIX eurlex:<http://lagen.nu/eurlex#>
                SELECT DISTINCT ?case ?article WHERE {
                    ?case eurlex:cites ?article .
                    FILTER (regex(str(?article), "^http://lagen.nu/ext/celex/1"))
             }"""
        cites = store.select(sq, format="python")

        citationcount = {}
        unmapped = {}
        self.log.debug("Going through %s unique citations" % len(cites))
        for cite in cites:
            article = cite['article'].split("-")[0]
            if "12008M" in article:
                pass
            elif article in equivs:
                article = equivs[article]
            else:
                if article in unmapped:
                    unmapped[article] += 1
                else:
                    unmapped[article] = 1
                article = None

            # Keep track of the number of citing cases
            if article:
                if article in citationcount:
                    citationcount[article] += 1
                else:
                    citationcount[article] = 1

        # Report the most common cites to older treaty articles that
        # we have no equivalents for in TFEU
        # sorted_unmapped = sorted(unmapped.iteritems(), key=itemgetter(1))[-num_of_articles:]
        # if not quiet:
        #    print "UNMAPPED:"
        #    pprint(sorted_unmapped)

        # Report and return the most cited articles
        sorted_citationcount = sorted(citationcount.items(),
                                      key=itemgetter(1))[-num_of_articles:]
        if not quiet:
            print("CITATION COUNTS:")
            pprint(sorted_citationcount)
        return [x[0] for x in reversed(sorted_citationcount)]
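
A note on the counting idiom above: the owl:sameAs mapping plus the two
hand-rolled tally dicts can be expressed more compactly with
collections.Counter. A minimal sketch on invented URIs (not part of the
example's code):

    from collections import Counter

    # toy equivalence map: old-treaty article -> TFEU article (URIs invented)
    equivs = {"http://ex.org/celex/11997E028": "http://ex.org/celex/12008E034"}
    cites = ["http://ex.org/celex/11997E028-P1",  # pinpoint cite, stripped below
             "http://ex.org/celex/11997E028",
             "http://ex.org/celex/12008M006"]     # TEU article, skipped

    citationcount = Counter()
    unmapped = Counter()
    for cite in cites:
        article = cite.split("-")[0]   # drop any pinpoint suffix
        if "12008M" in article:        # TEU articles are outside TFEU scope
            continue
        if article in equivs:
            citationcount[equivs[article]] += 1
        else:
            unmapped[article] += 1     # older article with no TFEU equivalent
    print(citationcount.most_common(20))  # most-cited first
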
Example #2
 def tearDown(self):
     self.setupclass = False # make sure super.tearDown deletes all files
     super(AdvancedAPI, self).tearDown()
     FulltextIndex.connect(self.indextype, self.indexlocation,
                           [DocumentRepository()]).destroy()
     TripleStore.connect(self.storetype, self.storelocation,
                         self.storerepository).clear()
Example #3
    def temp_analyze(self):
        store = TripleStore(self.config.storetype,
                            self.config.storelocation,
                            self.config.storerepository)
        # sq = self._query_cites('http://lagen.nu/ext/celex/12008E045',self._sameas(),False, True, 2012)
        sq = self._query_cites(None, self._sameas(), False, False, 2012)
        print(sq)
        cites = store.select(sq, format="python")
        self.log.debug(
            "    Citation graph contains %s citations" % (len(cites)))

        # remove duplicate citations, self-citations and pinpoints
        # in citations
        citedict = {}
        for cite in cites:
            # print repr(cite)
            if "-" in cite['obj']:
                cite['obj'] = cite['obj'].split("-")[0]

            if (cite['subj'] != cite['obj']):
                citedict[(cite['subj'], cite['obj'])] = True

        self.log.debug(
            "    Normalized graph contains %s citations" % len(citedict))

        degree = {}
        for citing, cited in list(citedict.keys()):
            if citing not in degree:
                degree[citing] = []
            if cited not in degree:
                degree[cited] = []
            degree[cited].append(citing)

        return
Example #4
 def download_from_triplestore(self):
     sq = "SELECT ?something ?references ?uri where ?something ?references ?uri AND NOT ?uri ?references ?anything"
     store = TripleStore(self.config.storetype,
                         self.config.storelocation,
                         self.config.storerepository)
     with self.store.open_downloaded("biggraph") as fp:
         for row in store.select(sq, format="python"):
             fp.write("<%(something)s> <%(references)s> <%(uri)s> .\n" % row)
Example #5
    def test_sqlite_add_serialized(self, mock_graph):
        store = TripleStore.connect("SQLITE", "", "")
        store.add_serialized("tripledata", "nt")
        self.assertTrue(mock_graph.return_value.parse.called)
        self.assertTrue(mock_graph.return_value.commit.called)
        mock_graph.reset_mock()
        
        store.add_serialized("tripledata", "nt", "namedgraph")
        self.assertTrue(mock_graph.return_value.get_context.called)
        self.assertTrue(mock_graph.return_value.get_context.return_value.parse.called)

        store = TripleStore.connect("SQLITE", "", "", inmemory=True)
        with self.assertRaises(errors.TriplestoreError):
            store.add_serialized("tripledata", "nt")
Example #6
 def test_sqlite_close(self, mock_graph):
     # make sure this weird but harmless sqlite3 exception is
     # caught
     mock_graph.return_value.close.side_effect = sqlite3.ProgrammingError(
         "You made a wrong")
     store = TripleStore.connect("SQLITE", "", "")
     store.close()
Example #7
 def test_fuseki_get_serialized(self, mock_get):
     store = TripleStore.connect("FUSEKI", "", "", curl=False)
     # test 1: a namedgraph (cases with no context are already run by
     # test_fuseki_get_serialized_file)
     want = util.readfile("test/files/triplestore/namedgraph.nt", "rb")
     got = store.get_serialized(context="namedgraph") # results in single get
     self.assertEqual(want, got)
Example #8
    def test_sesame_select(self, mock_get):
        store = TripleStore.connect("SESAME", "", "")
        rf = util.readfile
        want = rf("test/files/triplestore/select-results.xml").encode()
        got = store.select("the-query")
        self.assertEqual(want, got)
        self.assertEqual(mock_get.call_count, 1)

        want = rf("test/files/triplestore/select-results.json")
        got = store.select("the-query", format="json").decode()
        self.assertEqual(json.loads(want), json.loads(got))
        self.assertEqual(mock_get.call_count, 2)

        want = json.loads(
            rf("test/files/triplestore/select-results-python.json"),
            object_hook=util.make_json_date_object_hook("issued"))
        got = store.select("the-query", format="python")
        self.assertEqual(want, got)
        self.assertEqual(mock_get.call_count, 3)

        with self.assertRaises(errors.TriplestoreError):
            mockresponse = Mock()
            mockresponse.text = "This is the actual error text"
            mock_get.side_effect = requests.exceptions.HTTPError(
                "Server error", response=mockresponse)
            got = store.select("the-query", format="python")
Example #9
 def test_fuseki_get_serialized_file(self, mock_get):
     # Test 1: imagine that server has data in the default graph
     # and in one named graph
     rf = util.readfile
     tmp = mkdtemp()
     try:
         store = TripleStore.connect("FUSEKI", "", "")
         # test 1.1: Get everything, assert that the result is a combo
         store.get_serialized_file(
             tmp + "/out.nt")  # no ctx, will result in 2 gets
         self.assertEqual(mock_get.call_count, 2)
         self.assertEqual(rf("test/files/triplestore/combinedgraph.nt"),
                          rf(tmp + "/out.nt"))
         # test 1.2: Get only namedgraph, assert that only that is returned
         store.get_serialized_file(tmp + "/out.nt",
                                   context="namedgraph")  # 1 get
         self.assertEqual(rf("test/files/triplestore/namedgraph.nt"),
                          rf(tmp + "/out.nt"))
         self.assertEqual(mock_get.call_count, 3)
         # test 1.3: Get everything in a different format
         store.get_serialized_file(tmp + "/out.ttl",
                                   format="turtle")  # results in 2 gets
         self.assertEqualGraphs("test/files/triplestore/combinedgraph.ttl",
                                tmp + "/out.ttl")
         self.assertEqual(mock_get.call_count, 5)
     finally:
         shutil.rmtree(tmp)
Example #10
 def test_sqlite_add_serialized_file(self, mock_graph):
     store = TripleStore.connect("SQLITE", "", "")
     fd, tmpname = mkstemp()
     fp = os.fdopen(fd, "w")
     fp.write("tripledata")
     fp.close()
     store.add_serialized_file(tmpname, "nt")
     os.unlink(tmpname)
Example #11
 def test_sqlite_clear(self, mock_graph):
     store = TripleStore.connect("SQLITE", "", "")
     g = Graph()
     g.add((URIRef("http://example.org/doc1"), RDFS.comment, Literal("Hey")))
     g.add((URIRef("http://example.org/doc2"), RDFS.comment, Literal("Ho")))
     mock_graph.return_value.get_context.return_value = g
     store.clear("namedgraph")
     self.assertEqual(2, mock_graph.return_value.remove.call_count)
     self.assertEqual(1, mock_graph.return_value.commit.call_count)
Example #12
    def test_sqlite_init(self, mock_graph):
        # create a new db that doesn't exist
        mock_graph.open.return_value = 42
        store = TripleStore.connect("SQLITE", "", "")
        self.assertTrue(mock_graph.return_value.open.called)
        self.assertTrue(mock_graph.return_value.open.call_args[1]['create'])

        # reopen an existing db
        fd, tmpname = mkstemp()
        fp = os.fdopen(fd)
        fp.close()
        store = TripleStore.connect("SQLITE", tmpname, "")
        os.unlink(tmpname)
        self.assertFalse(mock_graph.return_value.open.call_args[1]['create'])

        # make an inmemory db
        store = TripleStore.connect("SQLITE", "", "", inmemory=True)
        self.assertTrue(mock_graph.return_value.quads.called)
        self.assertTrue(mock_graph.return_value.addN.called)
Example #13
    def download(self, basefile=None):
        # Get all "term sets" (used dcterms:subject Objects, wiki pages
        # describing legal concepts, swedish wikipedia pages...)
        terms = defaultdict(dict)

        # 1) Query the triplestore for all dcterms:subject triples (is this
        # semantically sensible for a "download" action -- the content
        # isn't really external?) -- term set "subjects" (these come
        # from both court cases and legal definitions in law text)
        sq = """
        PREFIX dcterms:<http://purl.org/dc/terms/>
        PREFIX rdfs:<http://www.w3.org/2000/01/rdf-schema#>

        SELECT ?uri ?subject ?label
        WHERE { {?uri dcterms:subject ?subject . }
                OPTIONAL {?subject rdfs:label ?label . } }
        """
        store = TripleStore.connect(self.config.storetype,
                                    self.config.storelocation,
                                    self.config.storerepository)
        results = store.select(sq, "python")
        for row in results:
            if 'label' in row:
                label = row['label']
            else:
                label = self.basefile_from_uri(row['subject'])
                if label is None:
                    self.log.warning("could not determine keyword from %s" % row['subject'])
                    continue
            
            sanitized = self.sanitize_term(label)
            if sanitized:
                if sanitized not in terms:
                    terms[sanitized]['subjects'] = []
                terms[sanitized]['subjects'].append(row['uri'])

        self.log.debug("Retrieved %s subject terms from triplestore" % len(terms))

        for termset_func in self.termset_funcs:
            termset_func(terms)

        for term in terms:
            term = self.sanitize_term(term)
            if not term:
                continue
            oldterms = ""
            termpath = self.store.downloaded_path(term)
            if os.path.exists(termpath):
                oldterms = yaml.safe_load(util.readfile(termpath))
            if terms[term] != oldterms:
                util.ensure_dir(termpath)
                util.writefile(termpath, yaml.dump(terms[term], default_flow_style=False))
                self.log.info("%s: in %s termsets" % (term, len(terms[term])))
            else:
                self.log.debug("%s: skipped" % term)
Example #14
    def test_sesame_add_serialized(self, mock_post):
        store = TripleStore.connect("SESAME", "", "")
        rf = util.readfile
        store.add_serialized(rf("test/files/triplestore/defaultgraph.ttl"),
                             format="turtle")
        self.assertEqual(mock_post.call_count, 1)

        store.add_serialized(rf("test/files/triplestore/namedgraph.nt"),
                             format="nt",
                             context="namedgraph")
        self.assertEqual(mock_post.call_count, 2)
Example #15
    def test_sesame_get_serialized(self, mock_get):
        store = TripleStore.connect("SESAME", "", "")
        want = util.readfile("test/files/triplestore/combinedgraph.nt", "rb")
        got = store.get_serialized() 
        self.assertEqual(want, got)
        self.assertEqual(mock_get.call_count, 1)

        want = util.readfile("test/files/triplestore/namedgraph.nt", "rb")
        got = store.get_serialized(context="namedgraph") # results in single get
        self.assertEqual(want, got)
        self.assertEqual(mock_get.call_count, 2)
Example #16
    def test_curl(self, runcmd_mock):
        # needs to test add_serialized, add_serialized_file, get_serialized
        # and get_serialized_file. We'll patch util.runcmd and make sure that
        # the command line is correct. We should also have util.runcmd return
        # a non-zero return code once.
        # our util.runcmd replacement should, for the get_serialized file,
        # create a suitable temp file

        store = TripleStore.connect("FUSEKI", "", "", curl=True)
        # 1. add_serialized
        runcmd_mock.return_value = (0, "", "")
        store.add_serialized("tripledata", "nt")
        cmdline = runcmd_mock.call_args[0][0] # first ordered argument
        # replace the temporary file name
        cmdline = re.sub('"@[^"]+"', '"@tempfile.nt"', cmdline)
        self.assertEqual('curl -X POST --data-binary "@tempfile.nt" --header "Content-Type:text/plain;charset=UTF-8" "//data?default"', cmdline)
        runcmd_mock.reset_mock()

        # 2. add_serialized_file
        runcmd_mock.return_value = (0, "", "")
        store.add_serialized_file("tempfile.nt", "nt")
        cmdline = runcmd_mock.call_args[0][0] # first ordered argument
        self.assertEqual('curl -X POST --data-binary "@tempfile.nt" --header "Content-Type:text/plain;charset=UTF-8" "//data?default"', cmdline)
        runcmd_mock.reset_mock()

        # 3. get_serialized
        def create_tempfile(*args, **kwargs):
            filename = re.search('-o "([^"]+)"', args[0]).group(1)
            with open(filename, "wb") as fp:
                fp.write("tripledata\n".encode())
            return (0, "", "")
        runcmd_mock.side_effect = create_tempfile
        res = store.get_serialized("nt")
        self.assertEqual(b"tripledata\ntripledata\n", res)
        cmdline = runcmd_mock.call_args[0][0] # first ordered argument
        # replace the temporary file name
        cmdline = re.sub('-o "[^"]+"', '-o "tempfile.nt"', cmdline)
        # FIXME is this really right?
        self.assertEqual('curl -o "tempfile.nt" --header "Accept:text/plain" "//data?graph=urn:x-arq:UnionGraph"', cmdline)
        runcmd_mock.side_effect = None
        runcmd_mock.reset_mock()

        # 4. get_serialized_file
        store.get_serialized_file("triples.xml", "xml")
        cmdline = runcmd_mock.call_args[0][0] # first ordered argument
        self.assertEqual('curl -o "triples.xml" --header "Accept:application/rdf+xml" "//data?default"', cmdline)
        runcmd_mock.reset_mock()

        # 5. handle errors
        with self.assertRaises(errors.TriplestoreError):
            runcmd_mock.return_value = (1, "", "Internal error")
            store.get_serialized_file("triples.nt", "nt")
Example #17
    def test_curl(self, runcmd_mock):
        # needs to test add_serialized, add_serialized_file, get_serialized
        # and get_serialized_file. We'll patch util.runcmd and make sure that
        # the command line is correct. We should also have util.runcmd return
        # a non-zero return code once.
        # our util.runcmd replacement should, for the get_serialized file,
        # create a suitable temp file

        store = TripleStore.connect("FUSEKI", "", "", curl=True)
        # 1. add_serialized
        runcmd_mock.return_value = (0, "", "")
        store.add_serialized("tripledata", "nt")
        cmdline = runcmd_mock.call_args[0][0] # first ordered argument
        # replace the temporary file name
        cmdline = re.sub('"@[^"]+"', '"@tempfile.nt"', cmdline)
        self.assertEqual('curl -X POST --data-binary "@tempfile.nt" --header "Content-Type:application/n-triples;charset=UTF-8" "//data?default"', cmdline)
        runcmd_mock.reset_mock()

        # 2. add_serialized_file
        runcmd_mock.return_value = (0, "", "")
        store.add_serialized_file("tempfile.nt", "nt")
        cmdline = runcmd_mock.call_args[0][0] # first ordered argument
        self.assertEqual('curl -X POST --data-binary "@tempfile.nt" --header "Content-Type:application/n-triples;charset=UTF-8" "//data?default"', cmdline)
        runcmd_mock.reset_mock()

        # 3. get_serialized
        def create_tempfile(*args, **kwargs):
            filename = re.search('-o "([^"]+)"', args[0]).group(1)
            with open(filename, "wb") as fp:
                fp.write("tripledata\n".encode())
            return (0, "", "")
        runcmd_mock.side_effect = create_tempfile
        res = store.get_serialized("nt")
        self.assertEqual(b"tripledata\ntripledata\n", res)
        cmdline = runcmd_mock.call_args[0][0] # first ordered argument
        # replace the temporary file name
        cmdline = re.sub('-o "[^"]+"', '-o "tempfile.nt"', cmdline)
        # FIXME is this really right?
        self.assertEqual('curl -o "tempfile.nt" --header "Accept:application/n-triples" "//data?graph=urn:x-arq:UnionGraph"', cmdline)
        runcmd_mock.side_effect = None
        runcmd_mock.reset_mock()

        # 4. get_serialized_file
        store.get_serialized_file("triples.xml", "xml")
        cmdline = runcmd_mock.call_args[0][0] # first ordered argument
        self.assertEqual('curl -o "triples.xml" --header "Accept:application/rdf+xml" "//data?default"', cmdline)
        runcmd_mock.reset_mock()

        # 5. handle errors
        with self.assertRaises(errors.TriplestoreError):
            runcmd_mock.return_value = (1, "", "Internal error")
            store.get_serialized_file("triples.nt", "nt")
Example #18
    def test_sesame_construct(self, mock_get):
        store = TripleStore.connect("SESAME", "", "")
        rf = util.readfile
        want = Graph()
        want.parse(data=rf("test/files/triplestore/construct-results.ttl"),
                   format="turtle")
        got = store.construct("the-query")
        self.assertEqualGraphs(want, got)
        self.assertEqual(mock_get.call_count, 1)

        with self.assertRaises(errors.TriplestoreError):
            mock_get.side_effect = requests.exceptions.HTTPError("Server error")
            got = store.construct("the-query")
Example #19
    def eval_get_goldstandard(self, basefile):
        goldstandard = Graph()
        goldstandard_rdf = util.relpath(
            os.path.dirname(__file__) + "/../res/eut/goldstandard.n3")
        goldstandard.load(goldstandard_rdf, format="n3")

        pred = util.ns['ir'] + 'isRelevantFor'
        res = {}
        store = TripleStore(self.config.storetype,
                            self.config.storelocation,
                            self.config.storerepository)
        sq_templ = """PREFIX eurlex:<http://lagen.nu/eurlex#>
                      SELECT ?party ?casenum ?celexnum WHERE {
                          <%s> eurlex:party ?party ;
                               eurlex:casenum ?casenum ;
                               eurlex:celexnum ?celexnum .
                      }"""

        self.log.debug(
            "Loading gold standard relevance judgments for %s" % basefile)
        for article in self._articles(basefile):
            res[article] = []
            for o in goldstandard.objects(URIRef(article), URIRef(pred)):
                res[article].append(str(o))
                # Make sure the case exists and is the case we're looking for
                sq = sq_templ % str(o)
                parties = store.select(sq, format="python")
                if parties:
                    pass
                    # self.log.debug("   %s: %s (%s)" %
                    #               (parties[0]['celexnum'],
                    #                parties[0]['casenum'],
                    #                " v ".join([x['party'] for x in parties])))
                else:
                    self.log.warning("Can't find %s in triple store!" % o)
            self.log.debug("    Gold standard for %s: %s relevant docs" %
                           (article, len(res[article])))
            res[article].sort()
        return res
Example #20
    def prep_annotation_file_termsets(self, basefile, main_node):
        dvdataset = self.config.url + "dataset/dv"
        sfsdataset = self.config.url + "dataset/sfs"
        store = TripleStore.connect(self.config.storetype,
                                    self.config.storelocation,
                                    self.config.storerepository)
        legaldefs = self.time_store_select(store, "sparql/keyword_sfs.rq",
                                           basefile, sfsdataset, "legaldefs")
        rattsfall = self.time_store_select(store, "sparql/keyword_dv.rq",
                                           basefile, dvdataset, "legalcases")

        # compatibility hack to enable lxml to process qnames for
        # namespaces FIXME: this is copied from sfs.py -- but could
        # probably be removed once we rewrite this method to use real
        # RDFLib graphs
        def ns(string):
            if ":" in string:
                prefix, tag = string.split(":", 1)
                return "{%s}%s" % (str(self.ns[prefix]), tag)

        for r in rattsfall:
            subject_node = etree.SubElement(main_node, ns("dcterms:subject"))
            rattsfall_node = etree.SubElement(subject_node,
                                              ns("rdf:Description"))
            rattsfall_node.set(ns("rdf:about"), r['uri'])
            id_node = etree.SubElement(rattsfall_node,
                                       ns("dcterms:identifier"))
            id_node.text = r['id']
            desc_node = etree.SubElement(rattsfall_node,
                                         ns("dcterms:description"))
            desc_node.text = r['desc']

        for l in legaldefs:
            subject_node = etree.SubElement(main_node,
                                            ns("rinfoex:isDefinedBy"))
            legaldef_node = etree.SubElement(subject_node,
                                             ns("rdf:Description"))
            legaldef_node.set(ns("rdf:about"), l['uri'])
            id_node = etree.SubElement(legaldef_node, ns("rdfs:label"))
            # id_node.text = "%s %s" % (l['uri'].split("#")[1], l['label'])
            id_node.text = self.sfsrepo.display_title(l['uri'])

        if 'wikipedia\n' in util.readfile(
                self.store.downloaded_path(basefile)):
            subject_node = etree.SubElement(main_node, ns("rdfs:seeAlso"))
            link_node = etree.SubElement(subject_node, ns("rdf:Description"))
            link_node.set(
                ns("rdf:about"),
                'http://sv.wikipedia.org/wiki/' + basefile.replace(" ", "_"))
            label_node = etree.SubElement(link_node, ns("rdfs:label"))
            label_node.text = "Begreppet %s finns även beskrivet på svenska Wikipedia" % basefile
Example #21
 def test_sqlite_construct(self, mock_graph):
     store = TripleStore.connect("SQLITE", "", "")
     sq = """CONSTRUCT ?s ?p ?o WHERE {?o ?p ?s . }"""
     g = Graph()
     g.add((URIRef("http://example.org/doc1"), RDFS.comment, Literal("Hey")))
     g.add((URIRef("http://example.org/doc2"), RDFS.comment, Literal("Ho")))
     res = Mock()
     res.graph = g
     mock_graph.return_value.query.return_value = res
     self.assertEqual(g, store.construct(sq))
 
     mock_graph.return_value.query.side_effect = pyparsing.ParseException("Syntax error")
     with self.assertRaises(errors.SparqlError):
         store.construct(sq)
Example #22
    def prep_annotation_file(self, basefile):
        uri = self.canonical_uri(basefile)
        keyword = basefile
        store = TripleStore.connect(self.config.storetype,
                                    self.config.storelocation,
                                    self.config.storerepository)

        # Use SPARQL queries to create a rdf graph (to be used by the
        # xslt transform) containing the wiki authored
        # dcterms:description for this term. FIXME: This isn't a real
        # RDF graph yet.
        wikidesc = self.time_store_select(store,
                                          "sparql/keyword_subjects.rq",
                                          basefile,
                                          None,
                                          "descriptions")

        # compatibility hack to enable lxml to process qnames for namespaces
        def ns(string):
            if ":" in string:
                prefix, tag = string.split(":", 1)
                return "{%s}%s" % (str(self.ns[prefix]), tag)

        # FIXME: xhv MUST be part of nsmap
        if 'xhtml' not in self.ns:
            self.ns['xhtml'] = "http://www.w3.org/1999/xhtml"

        root_node = etree.Element(ns("rdf:RDF"), nsmap=self.ns)

        main_node = etree.SubElement(root_node, ns("rdf:Description"))
        main_node.set(ns("rdf:about"), uri)

        for d in wikidesc:
            desc_node = etree.SubElement(main_node, ns("dcterms:description"))
            xhtmlstr = "<div xmlns='http://www.w3.org/1999/xhtml'>%s</div>" % (d['desc'])
            # xhtmlstr = xhtmlstr.replace(
            #    ' xmlns="http://www.w3.org/1999/xhtml"', '')
            desc_node.append(etree.fromstring(xhtmlstr.encode('utf-8')))

        # subclasses override this to add extra annotations from other
        # sources
        self.prep_annotation_file_termsets(basefile, main_node)

        treestring = etree.tostring(root_node,
                                    encoding="utf-8",
                                    pretty_print=True)
        with self.store.open_annotation(basefile, mode="wb") as fp:
            fp.write(treestring)
        return self.store.annotation_path(basefile)
Example #23
    def test_fuseki_clear(self, mock_post, mock_delete):
        store = TripleStore.connect("FUSEKI", "", "")
        store.clear()
        self.assertEqual(mock_delete.call_count, 0)
        self.assertEqual(mock_post.call_count, 1)
        with self.assertRaises(errors.TriplestoreError):
            mock_post.side_effect = requests.exceptions.ConnectionError("Server error")
            got = store.clear()

        with self.assertRaises(errors.TriplestoreError):
            mock_post.side_effect = requests.exceptions.HTTPError("Server error")
            got = store.clear()

        mock_post.side_effect = requests.exceptions.HTTPError("No such graph")
        got = store.clear("namedgraph")
Example #24
    def select(self, template, uri, format="json"):
        sq = util.readfile(template) % {'uri': uri}
        ts = TripleStore.connect(self.config.storetype,
                                 self.config.storelocation,
                                 self.config.storerepository)

        print("# Constructing the following from %s, repository %s, type %s" %
              (self.config.storelocation,
               self.config.storerepository,
               self.config.storetype))
        print("".join(["# %s\n" % x for x in sq.split("\n")]))
        p = {}
        with util.logtime(print,
                          "# Selected in %(elapsed).3fs",
                          p):
            res = ts.select(sq, format=format)
            print(res.decode('utf-8'))
Example #25
    def dumpstore(self, format="turtle"):
        """Extract all RDF data from the system triplestore and dump
        it to stdout using the specified format.

        :param format: The serialization format for RDF data (same as
                       for :py:meth:`ferenda.TripleStore.get_serialized`).
        :type format: str

        Example::

            ./ferenda-build.py devel dumpstore nt > alltriples.nt
        """
        # print("Creating store of type %s, location %s, repository %s" %
        #       (self.config.storetype, self.config.storelocation, self.config.storerepository))
        store = TripleStore.connect(self.config.storetype,
                                    self.config.storelocation,
                                    self.config.storerepository)
        print(store.get_serialized(format=format).decode('utf-8'))
Example #26
 def test_sqlite_select(self, mock_graph):
     store = TripleStore.connect("SQLITE", "", "")
     sq = """SELECT ?p FROM <http://example.org/ctx> WHERE {?s ?p ?o . }"""
     res = mock_graph.return_value.get_context.return_value.query.return_value
     want = [{"s": "http://example.org/doc1",
              "p": "http://www.w3.org/2000/01/rdf-schema#comment",
              "o": "Hello"}]
     res.bindings = want
     self.assertEqual(want, store.select(sq, format="python"))
     mock_graph.reset_mock()
     store.select(sq, "sparql")
     mock_graph.return_value.get_context.return_value.query.return_value.serialize.assert_called_with(format="xml")
     
     store.select(sq, "json")
     mock_graph.return_value.get_context.return_value.query.return_value.serialize.assert_called_with(format="json")
     
     mock_graph.return_value.get_context.return_value.query.side_effect = pyparsing.ParseException("Syntax error")
     with self.assertRaises(errors.SparqlError):
         store.select(sq)
Example #27
    def download(self, basefile=None):
        # Get all "term sets" (used dcterms:subject Objects, wiki pages
        # describing legal concepts, swedish wikipedia pages...)
        terms = defaultdict(dict)

        # 1) Query the triplestore for all dcterms:subject triples (is this
        # semantically sensible for a "download" action -- the content
        # isn't really external?) -- term set "subjects" (these come
        # from both court cases and legal definitions in law text)
        sq = """
        PREFIX dcterms:<http://purl.org/dc/terms/>
        PREFIX rdfs:<http://www.w3.org/2000/01/rdf-schema#>

        SELECT DISTINCT ?subject ?label
        WHERE { {?uri dcterms:subject ?subject . } 
                OPTIONAL {?subject rdfs:label ?label . } }
        """
        store = TripleStore.connect(self.config.storetype,
                                    self.config.storelocation,
                                    self.config.storerepository)
        results = store.select(sq, "python")
        for row in results:
            if 'label' in row:
                label = row['label']
            else:
                label = self.basefile_from_uri(row['subject'])
            if label and len(label) < 100:  # sanity: no legit keyword is 100 chars
                terms[label]['subjects'] = True

        self.log.debug("Retrieved %s subject terms from triplestore" %
                       len(terms))

        for termset_func in self.termset_funcs:
            termset_func(terms)

        for term in terms:
            if not term:
                continue
            self.log.info("%s: in %s termsets" % (term, len(terms[term])))
            with self.store.open_downloaded(term, "w") as fp:
                for termset in sorted(terms[term]):
                    fp.write(termset + "\n")
Example #28
    def test_sesame_select(self, mock_get):
        store = TripleStore.connect("SESAME", "", "")
        rf = util.readfile
        want = rf("test/files/triplestore/select-results.xml")
        got = store.select("the-query")
        self.assertEqual(want, got)
        self.assertEqual(mock_get.call_count, 1)

        want = json.loads(rf("test/files/triplestore/select-results.json"))
        got = store.select("the-query", format="json")
        self.assertEqual(want, got)
        self.assertEqual(mock_get.call_count, 2)

        want = json.loads(rf("test/files/triplestore/select-results-python.json"))
        got = store.select("the-query", format="python")
        self.assertEqual(want, got)
        self.assertEqual(mock_get.call_count, 3)

        with self.assertRaises(errors.TriplestoreError):
            mock_get.side_effect = requests.exceptions.HTTPError("Server error")
            got = store.select("the-query", format="python")
Example #29
 def test_invalid_store(self):
     with self.assertRaises(ValueError):
         TripleStore.connect("INVALID", "", "")
Example #30
    def eval_get_ranked_set(self, basefile, algorithm="pagerank",
                            age_compensation=False, restrict_cited=True):
        # * algorithm: can be "indegree", "hits" or "pagerank".
        # * age_compensation: create one graph per year and average to
        #   compensate for newer cases (that have had less time to gain
        #   citations)
        # * restrict_cited: Use only such citations that exist between
        #   two cases that both cite the same TFEU article (otherwise,
        #   use all citations from all cases that cite the TFEU
        #   article, regardless of whether the cited case also cites
        #   the same TFEU article)
        sameas = self._sameas()
        store = TripleStore(self.config.storetype,
                            self.config.storelocation,
                            self.config.storerepository)
        res = {}

        self.log.debug("Creating ranked set (%s,age_compensation=%s,restrict_cited=%s)" %
                       (algorithm, age_compensation, restrict_cited))

        for article in self._articles(basefile):
            article_celex = article.split("/")[-1]
            self.log.debug("    Creating ranking for %s" % (article_celex))
            this_year = datetime.datetime.today().year
            if age_compensation:
                years = list(range(1954, this_year + 1))
                # years = range(this_year-3,this_year) # testing
            else:
                years = list(range(this_year, this_year + 1))

            result_by_years = []
            for year in years:
                restrict_citing = True  # always performs better
                if (article, year, restrict_cited) in self._graph_cache:
                    # self.log.debug("Resuing cached graph (%s) for %s in %s" %
                    #               (restrict_cited, article_celex,year))
                    graph = self._graph_cache[(article, year, restrict_cited)]
                else:
                    # self.log.debug("Calculating graph for %s in %s" %
                    #               (article_celex,year))
                    sq = self._query_cites(article, sameas, restrict_citing,
                                           restrict_cited, year)
                    links = store.select(sq, format="python")
                    graph = self.eval_build_nx_graph(links)
                    self._graph_cache[(article, year, restrict_cited)] = graph
                    self.log.debug("      Citegraph for %s in %s has %s edges, %s nodes" %
                                   (article_celex, year, len(graph.edges()),
                                    len(graph.nodes())))

                if len(graph.nodes()) == 0:
                    continue

                ranked = self.eval_rank_graph(graph, algorithm)
                result_by_years.append({})
                for result, score in ranked:
                    result_by_years[-1][result] = score

            if age_compensation:
                compensated_ranking = {}
                for d, score in ranked:  # the last result set
                    # cut out the year part of the URI
                    celex = d.split("/")[-1]
                    try:
                        age = this_year + 1 - int(
                            celex[1:5])  # cases decided this year have age 1
                        # scores = [0,0,0 ... 3,4,8,22]
                        scores = [result_by_year[d]
                                  for result_by_year
                                  in result_by_years
                                  if d in result_by_year]
                        avg_score = sum(scores) / float(age)
                        # self.log.debug("Result %s (age %s, avg score %s) %r" %
                        #               (d,age,avg_score,scores))
                        compensated_ranking[d] = avg_score
                    except ValueError:
                        continue

            # return just a list of results, no scores
            if age_compensation:
                res[article] = [result for result in sorted(
                    compensated_ranking, key=compensated_ranking.__getitem__, reverse=True)]
            else:
                res[article] = [result[0] for result in ranked]

        return res
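
The age compensation arithmetic is easiest to see with numbers: a case's
scores from the per-year graphs are summed and divided by the case's age,
not by the number of graphs it appears in, so young cases are not punished
for missing the early years. A toy calculation with an invented CELEX id:

    this_year = 2012
    d = "http://ex.org/celex/62009CJ0034"          # decided in 2009 (invented)
    result_by_years = [{d: s} for s in (0.0, 0.0, 3.0, 4.0, 8.0, 22.0)]

    celex = d.split("/")[-1]
    age = this_year + 1 - int(celex[1:5])          # 2013 - 2009 = 4
    scores = [r[d] for r in result_by_years if d in r]
    print(sum(scores) / float(age))                # 37.0 / 4 = 9.25
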
Example #31
    def analyze_citation_graphs(self, articles=None):
        # Basic setup
        # articles = self._articles('tfeu')[-1:]
        if not articles:
            articles = [None]
        if None not in articles:
            articles.append(None)
        this_year = datetime.datetime.today().year
        store = TripleStore(self.config.storetype,
                            self.config.storelocation,
                            self.config.storerepository)
        sameas = self._sameas()
        distributions = []

        # For each article (and also for no article = the entire citation graph)
        for article in articles:
            # Get a list of all eligible cases (needed for proper degree distribution)
            sq = self._query_cases(article, sameas)
            # print sq
            cases = {}
            caserows = store.select(sq, format="python")
            for r in caserows:
                cases[r['subj']] = 0

            self.log.info(
                "Creating graphs for %s (%s cases)" % (article, len(cases)))
            # Step 1. SPARQL the graph on the form ?citing ?cited
            # (optionally restricting on citing a particular article)
            if article:
                sq = self._query_cites(
                    article, sameas, True, False, this_year + 1)
            else:
                sq = self._query_cites(
                    None, sameas, False, False, this_year + 1)

            cites = store.select(sq, format="python")
            self.log.debug(
                "    Citation graph contains %s citations" % (len(cites)))

            # remove duplicate citations, self-citations and pinpoints
            # in citations
            citedict = {}
            missingcases = {}
            for cite in cites:
                # print repr(cite)
                if "-" in cite['obj']:
                    cite['obj'] = cite['obj'].split("-")[0]

                if cite['obj'] not in cases:
                    # print "Case %s (cited in %s) does not exist!\n" % (cite['obj'],
                    # cite['subj'])
                    missingcases[cite['obj']] = True
                    continue

                if (cite['subj'] != cite['obj']):
                    citedict[(cite['subj'], cite['obj'])] = True

            self.log.debug(
                "    Normalized graph contains %s citations (%s cited cases not found)" %
                (len(citedict), len(missingcases)))
            # pprint(missingcases.keys()[:10])

            # Step 2. Dotify the list (maybe the direction of arrows from
            # cited to citing can improve results?) to create a citation
            # graph
            self.analyse_citegraph_graphviz(list(citedict.keys()), article)

            # Step 3. Create a degree distribution plot
            degree, distribution = self.analyze_citegraph_degree_distribution(
                cases, list(citedict.keys()), article)
            if article:
                distributions.append([article, distribution])

            # Step 4. Create a citation/age scatterplot (or rather hexbin)
            self.analyze_citegraph_citation_age_plot(
                list(citedict.keys()), degree, distribution, article)

        # Step 5. Create a combined degree distribution graph of the
        # distinct citation networks. Also add the degree distribution
        # of gold standard cases

        self.analyze_citegraph_combined_degree_distribution(distributions)
Example #32
 def test_sleepycat_init(self, mock_graph):
     store = TripleStore.connect("SLEEPYCAT", "", "")
Example #33
 def test_sleepycat_triple_count(self, mock_graph):
     store = TripleStore.connect("SLEEPYCAT", "", "")
     self.assertEqual(0, store.triple_count())
Example #34
 def test_sqlite_remove_repository(self, mock_graph):
     store = TripleStore.connect("SQLITE", "", "")
     store.remove_repository()
     self.assertTrue(mock_graph.return_value.destroy.called)
Example #35
 def test_sqlite_initialize_triplestore(self, mock_graph):
     store = TripleStore.connect("SQLITE", "", "")
     store.initialize_repository()
     self.assertTrue(mock_graph.return_value.open.call_args[1]['create'])
Example #36
 def test_sqlite_triple_count(self, mock_graph):
     store = TripleStore.connect("SQLITE", "", "")
     self.assertEqual(0, store.triple_count())