Example #1
    def analyze_article_citations(self, num_of_articles=20, quiet=False):
        """Prints and returns a list of the top 20 most important articles in the
        TFEU treaty, as determined by the number of citing cases."""

        # Create a mapping of article equivalencies, eg Art 28 TEC == Art 34 TFEU
        sameas = self._sameas()
        equivs = {}
        pred = util.ns['owl'] + "sameAs"
        for (s, o) in sameas.subject_objects(URIRef(pred)):
            equivs[str(o)] = str(s)
        self.log.debug("Defined %s equivalent article references" %
                       len(equivs))

        # Select unique article citations
        store = TripleStore(self.config.storetype, self.config.storelocation,
                            self.config.storerepository)
        sq = """PREFIX eurlex:<http://lagen.nu/eurlex#>
                SELECT DISTINCT ?case ?article WHERE {
                    ?case eurlex:cites ?article .
                    FILTER (regex(str(?article), "^http://lagen.nu/ext/celex/1"))
             }"""
        cites = store.select(sq, format="python")

        citationcount = {}
        unmapped = {}
        self.log.debug("Going through %s unique citations" % len(cites))
        for cite in cites:
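            # strip any pinpoint suffix so the citation counts toward the article itself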
            article = cite['article'].split("-")[0]
            if "12008M" in article:
                pass
            elif article in equivs:
                article = equivs[article]
            else:
                if article in unmapped:
                    unmapped[article] += 1
                else:
                    unmapped[article] = 1
                article = None

            # Keep track of the number of citing cases
            if article:
                if article in citationcount:
                    citationcount[article] += 1
                else:
                    citationcount[article] = 1

        # Report the most common cites to older treaty articles that
        # we have no equivalents for in TFEU
        # sorted_unmapped = sorted(unmapped.items(), key=itemgetter(1))[-num_of_articles:]
        # if not quiet:
        #     print("UNMAPPED:")
        #     pprint(sorted_unmapped)

        # Report and return the most cited articles
        sorted_citationcount = sorted(citationcount.items(),
                                      key=itemgetter(1))[-num_of_articles:]
        if not quiet:
            print("CITATION COUNTS:")
            pprint(sorted_citationcount)
        return [x[0] for x in reversed(sorted_citationcount)]
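
The equivalence mapping above inverts owl:sameAs triples so that a citation of an older (TEC) article is credited to its current TFEU counterpart. A minimal self-contained sketch of that inversion with rdflib, assuming (as the code implies) that the subject of each sameAs triple is the current article; the URIs are invented:

    from rdflib import Graph, URIRef
    from rdflib.namespace import OWL

    g = Graph()
    # hypothetical triple: Art 34 TFEU owl:sameAs Art 28 TEC
    g.add((URIRef("http://example.org/celex/12008E034"),
           OWL.sameAs,
           URIRef("http://example.org/celex/11997E028")))

    # invert the relation, as analyze_article_citations does: the old URI
    # (object) becomes the key, the current URI (subject) the value
    equivs = {str(o): str(s) for (s, o) in g.subject_objects(OWL.sameAs)}
    print(equivs)
    # {'http://example.org/celex/11997E028': 'http://example.org/celex/12008E034'}
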
Example #2
    def temp_analyze(self):
        store = TripleStore(self.config.storetype,
                            self.config.storelocation,
                            self.config.storerepository)
        # sq = self._query_cites('http://lagen.nu/ext/celex/12008E045',self._sameas(),False, True, 2012)
        sq = self._query_cites(None, self._sameas(), False, False, 2012)
        print(sq)
        cites = store.select(sq, format="python")
        self.log.debug(
            "    Citation graph contains %s citations" % (len(cites)))

        # remove duplicate citations, self-citations and pinpoints
        # in citations
        citedict = {}
        for cite in cites:
            # print(repr(cite))
            if "-" in cite['obj']:
                cite['obj'] = cite['obj'].split("-")[0]

            if cite['subj'] != cite['obj']:
                citedict[(cite['subj'], cite['obj'])] = True

        self.log.debug(
            "    Normalized graph contains %s citations" % len(citedict))

        degree = {}
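        # build a reverse adjacency list: cited case -> list of citing cases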
        for citing, cited in list(citedict.keys()):
            if citing not in degree:
                degree[citing] = []
            if cited not in degree:
                degree[cited] = []
            degree[cited].append(citing)

        return
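
The degree mapping built above is a reverse adjacency list that the function never uses before returning. A natural follow-up step would be to collapse it into in-degree counts; a toy sketch of that step, reusing the (citing, cited) key shape of citedict:

    citedict = {("case:A", "case:B"): True,
                ("case:C", "case:B"): True,
                ("case:A", "case:C"): True}

    degree = {}
    for citing, cited in citedict.keys():
        degree.setdefault(citing, [])
        degree.setdefault(cited, []).append(citing)

    # in-degree = number of distinct citing cases per case
    indegree = {case: len(citers) for case, citers in degree.items()}
    print(sorted(indegree.items(), key=lambda kv: kv[1], reverse=True))
    # [('case:B', 2), ('case:C', 1), ('case:A', 0)]
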
Example #3
    def download_from_triplestore(self):
        # select statements whose object does not itself reference anything
        sq = """SELECT ?something ?references ?uri WHERE {
                    ?something ?references ?uri .
                    FILTER NOT EXISTS { ?uri ?references ?anything }
                }"""
        store = TripleStore(self.config.storetype,
                            self.config.storelocation,
                            self.config.storerepository)
        with self.store.open_downloaded("biggraph") as fp:
            for row in store.select(sq, format="python"):
                fp.write("<%(something)s> <%(references)s> <%(uri)s> .\n" % row)
Example #4
    def eval_get_goldstandard(self, basefile):
        goldstandard = Graph()
        goldstandard_rdf = util.relpath(
            os.path.dirname(__file__) + "/../res/eut/goldstandard.n3")
        goldstandard.load(goldstandard_rdf, format="n3")

        pred = util.ns['ir'] + 'isRelevantFor'
        res = {}
        store = TripleStore(self.config.storetype,
                            self.config.storelocation,
                            self.config.storerepository)
        sq_templ = """PREFIX eurlex:<http://lagen.nu/eurlex#>
                      SELECT ?party ?casenum ?celexnum WHERE {
                          <%s> eurlex:party ?party ;
                               eurlex:casenum ?casenum ;
                               eurlex:celexnum ?celexnum .
                      }"""

        self.log.debug(
            "Loading gold standard relevance judgments for %s" % basefile)
        for article in self._articles(basefile):
            res[article] = []
            for o in goldstandard.objects(URIRef(article), URIRef(pred)):
                res[article].append(str(o))
                # Make sure the case exists and is the case we're looking for
                sq = sq_templ % str(o)
                parties = store.select(sq, format="python")
                if parties:
                    pass
                    # self.log.debug("   %s: %s (%s)" %
                    #               (parties[0]['celexnum'],
                    #                parties[0]['casenum'],
                    #                " v ".join([x['party'] for x in parties])))
                else:
                    self.log.warning("Can't find %s in triple store!" % o)
            self.log.debug("    Gold standard for %s: %s relevant docs" %
                           (article, len(res[article])))
            res[article].sort()
        return res
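
eval_get_goldstandard expects goldstandard.n3 to hold triples with the article as subject and each relevant case as object of an ir:isRelevantFor predicate. A miniature of what such a file could look like, parsed with rdflib; the namespace URI and case numbers below are invented (the real prefix comes from util.ns['ir']):

    from rdflib import Graph, URIRef

    n3_data = """
    @prefix ir: <http://example.org/ir#> .
    <http://lagen.nu/ext/celex/12008E034>
        ir:isRelevantFor <http://lagen.nu/ext/celex/61979J0120> ,
                         <http://lagen.nu/ext/celex/61984J0178> .
    """
    g = Graph()
    g.parse(data=n3_data, format="n3")
    article = URIRef("http://lagen.nu/ext/celex/12008E034")
    pred = URIRef("http://example.org/ir#isRelevantFor")
    print(sorted(str(o) for o in g.objects(article, pred)))
    # prints the two case URIs, the same access pattern as the loop above
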
Example #5
    def eval_get_ranked_set(self, basefile, algorithm="pagerank",
                            age_compensation=False, restrict_cited=True):
        # * algorithm: can be "indegree", "hits" or "pagerank".
        # * age_compensation: create one graph per year and average to
        #   compensate for newer cases (that have had less time to gain
        #   citations)
        # * restrict_cited: Use only such citations that exist between
        #   two cases that both cite the same TFEU article (otherwise,
        #   use all citations from all cases that cite the TFEU
        #   article, regardless of whether the cited case also cites
        #   the same TFEU article)
        sameas = self._sameas()
        store = TripleStore(self.config.storetype,
                            self.config.storelocation,
                            self.config.storerepository)
        res = {}

        self.log.debug("Creating ranked set (%s,age_compensation=%s,restrict_cited=%s)" %
                       (algorithm, age_compensation, restrict_cited))

        for article in self._articles(basefile):
            article_celex = article.split("/")[-1]
            self.log.debug("    Creating ranking for %s" % (article_celex))
            this_year = datetime.datetime.today().year
            if age_compensation:
                years = list(range(1954, this_year + 1))
                # years = range(this_year-3,this_year) # testing
            else:
                years = list(range(this_year, this_year + 1))

            result_by_years = []
            ranked = []  # stays empty if every yearly graph turns out empty
            for year in years:
                restrict_citing = True  # always performs better
                if (article, year, restrict_cited) in self._graph_cache:
                    # self.log.debug("Resuing cached graph (%s) for %s in %s" %
                    #               (restrict_cited, article_celex,year))
                    graph = self._graph_cache[(article, year, restrict_cited)]
                else:
                    # self.log.debug("Calculating graph for %s in %s" %
                    #               (article_celex,year))
                    sq = self._query_cites(article, sameas, restrict_citing,
                                           restrict_cited, year)
                    links = store.select(sq, format="python")
                    graph = self.eval_build_nx_graph(links)
                    self._graph_cache[(article, year, restrict_cited)] = graph
                    self.log.debug("      Citegraph for %s in %s has %s edges, %s nodes" %
                                   (article_celex, year, len(graph.edges()),
                                    len(graph.nodes())))

                if len(graph.nodes()) == 0:
                    continue

                ranked = self.eval_rank_graph(graph, algorithm)
                result_by_years.append({})
                for result, score in ranked:
                    result_by_years[-1][result] = score

            if age_compensation:
                compensated_ranking = {}
                for d, score in ranked:  # the last result set
                    # cut out the year part of the URI
                    celex = d.split("/")[-1]
                    try:
                        age = this_year + 1 - int(
                            celex[1:5])  # cases decided this year have age 1
                        # scores = [0,0,0 ... 3,4,8,22]
                        scores = [result_by_year[d]
                                  for result_by_year
                                  in result_by_years
                                  if d in result_by_year]
                        avg_score = sum(scores) / float(age)
                        # self.log.debug("Result %s (age %s, avg score %s) %r" %
                        #               (d,age,avg_score,scores))
                        compensated_ranking[d] = avg_score
                    except ValueError:
                        continue

            # return just a list of results, no scores
            if age_compensation:
                res[article] = sorted(compensated_ranking,
                                      key=compensated_ranking.__getitem__,
                                      reverse=True)
            else:
                res[article] = [result[0] for result in ranked]

        return res
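
The age compensation sums a case's per-year scores and divides by its age, so a recent case is not penalized for having had fewer years to accumulate citations. A toy calculation with invented numbers:

    this_year = 2012
    celex = "61995J0265"                   # hypothetical case decided in 1995
    age = this_year + 1 - int(celex[1:5])  # 18; a case from this_year has age 1

    # per-year scores, one per yearly graph in which the case was ranked
    scores = [0.1, 0.3, 0.8, 2.2]
    avg_score = sum(scores) / float(age)   # 3.4 / 18, roughly 0.19
    print(age, avg_score)
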
Example #6
    def analyze_citation_graphs(self, articles=None):
        # Basic setup
        # articles = self._articles('tfeu')[-1:]
        if not articles:
            articles = [None]
        if None not in articles:
            articles.append(None)
        this_year = datetime.datetime.today().year
        store = TripleStore(self.config.storetype,
                            self.config.storelocation,
                            self.config.storerepository)
        sameas = self._sameas()
        distributions = []

        # For each article (and also for no article = the entire citation graph)
        for article in articles:
            # Get a list of all eligible cases (needed for proper degree distribution)
            sq = self._query_cases(article, sameas)
            # print(sq)
            cases = {}
            caserows = store.select(sq, format="python")
            for r in caserows:
                cases[r['subj']] = 0

            self.log.info(
                "Creating graphs for %s (%s cases)" % (article, len(cases)))
            # Step 1. SPARQL the graph in the form ?citing ?cited
            # (optionally restricted to cases citing a particular article)
            if article:
                sq = self._query_cites(
                    article, sameas, True, False, this_year + 1)
            else:
                sq = self._query_cites(
                    None, sameas, False, False, this_year + 1)

            cites = store.select(sq, format="python")
            self.log.debug(
                "    Citation graph contains %s citations" % (len(cites)))

            # remove duplicate citations, self-citations and pinpoints
            # in citations
            citedict = {}
            missingcases = {}
            for cite in cites:
                # print(repr(cite))
                if "-" in cite['obj']:
                    cite['obj'] = cite['obj'].split("-")[0]

                if cite['obj'] not in cases:
                    # print("Case %s (cited in %s) does not exist!\n" %
                    #       (cite['obj'], cite['subj']))
                    missingcases[cite['obj']] = True
                    continue

                if cite['subj'] != cite['obj']:
                    citedict[(cite['subj'], cite['obj'])] = True

            self.log.debug(
                "    Normalized graph contains %s citations (%s cited cases not found)" %
                (len(citedict), len(missingcases)))
            # pprint(list(missingcases.keys())[:10])

            # Step 2. Dotify the list (maybe the direction of arrows from
            # cited to citing can improve results?) to create a citation
            # graph
            self.analyse_citegraph_graphviz(list(citedict.keys()), article)

            # Step 3. Create a degree distribution plot
            degree, distribution = self.analyze_citegraph_degree_distribution(
                cases, list(citedict.keys()), article)
            if article:
                distributions.append([article, distribution])

            # Step 4. Create a citation/age scatterplot (or rather hexbin)
            self.analyze_citegraph_citation_age_plot(
                list(citedict.keys()), degree, distribution, article)

        # Step 5. Create a combined degree distribution graph of the
        # distinct citation networks. Also add the degree distribution
        # of gold standard cases

        self.analyze_citegraph_combined_degree_distribution(distributions)
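
The implementation of analyze_citegraph_degree_distribution is not shown here. As a rough sketch (not the author's method), an in-degree distribution over the cases dict and the (citing, cited) keys of citedict could be computed like this:

    from collections import Counter

    def indegree_distribution(cases, edges):
        """cases: case URI -> 0; edges: iterable of (citing, cited) pairs."""
        indegree = dict.fromkeys(cases, 0)
        for citing, cited in edges:
            indegree[cited] += 1
        # distribution[k] = number of cases cited exactly k times
        return indegree, Counter(indegree.values())

    cases = {"case:A": 0, "case:B": 0, "case:C": 0}
    edges = [("case:A", "case:B"), ("case:C", "case:B")]
    degree, distribution = indegree_distribution(cases, edges)
    print(distribution)   # Counter({0: 2, 2: 1})
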