# Module-level imports assumed by the methods below (TripleStore and util
# as provided by the surrounding ferenda-based codebase)
import datetime
import os
from operator import itemgetter
from pprint import pprint

from rdflib import Graph, URIRef

from ferenda import TripleStore, util


def analyze_article_citations(self, num_of_articles=20, quiet=False):
    """Prints and returns a list of the most important articles in the
    TFEU treaty (by default the top 20), as determined by the number of
    citing cases."""

    # Create a mapping of article equivalencies, eg Art 28 TEC == Art 34 TFEU
    sameas = self._sameas()
    equivs = {}
    pred = util.ns['owl'] + "sameAs"
    for (s, o) in sameas.subject_objects(URIRef(pred)):
        equivs[str(o)] = str(s)
    self.log.debug("Defined %s equivalent article references" % len(equivs))

    # Select unique article citations
    store = TripleStore(self.config.storetype,
                        self.config.storelocation,
                        self.config.storerepository)
    sq = """PREFIX eurlex:<http://lagen.nu/eurlex#>

SELECT DISTINCT ?case ?article WHERE {
    ?case eurlex:cites ?article .
    FILTER (regex(str(?article), "^http://lagen.nu/ext/celex/1"))
}"""
    cites = store.select(sq, format="python")

    citationcount = {}
    unmapped = {}
    self.log.debug("Going through %s unique citations" % len(cites))
    for cite in cites:
        article = cite['article'].split("-")[0]
        if "12008M" in article:
            pass
        elif article in equivs:
            article = equivs[article]
        else:
            if article in unmapped:
                unmapped[article] += 1
            else:
                unmapped[article] = 1
            article = None

        # Keep track of the number of citing cases
        if article:
            if article in citationcount:
                citationcount[article] += 1
            else:
                citationcount[article] = 1

    # Report the most common cites to older treaty articles that
    # we have no equivalents for in TFEU
    # sorted_unmapped = sorted(unmapped.items(), key=itemgetter(1))[-num_of_articles:]
    # if not quiet:
    #     print("UNMAPPED:")
    #     pprint(sorted_unmapped)

    # Report and return the most cited articles
    sorted_citationcount = sorted(citationcount.items(),
                                  key=itemgetter(1))[-num_of_articles:]
    if not quiet:
        print("CITATION COUNTS:")
        pprint(sorted_citationcount)
    return [x[0] for x in reversed(sorted_citationcount)]
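# The counting loop above is equivalent to the following minimal,
# self-contained sketch using collections.Counter. Everything here
# (function name, sample data) is hypothetical and for illustration only;
# it is not part of the original class.
from collections import Counter

def demo_count_citations(cites, equivs):
    """Count citing cases per TFEU article, mapping older treaty article
    URIs through `equivs` and keeping TEU ("12008M") URIs as-is, mirroring
    the logic of analyze_article_citations."""
    counts = Counter()
    for cite in cites:
        article = cite['article'].split("-")[0]  # strip pinpoint suffix
        if "12008M" not in article:
            if article not in equivs:
                continue  # no TFEU equivalent; tracked as "unmapped" above
            article = equivs[article]
        counts[article] += 1
    return counts

# Hypothetical example (celex URIs invented for illustration):
# demo_count_citations(
#     [{'article': 'http://lagen.nu/ext/celex/12002E028-1'}],
#     {'http://lagen.nu/ext/celex/12002E028':
#      'http://lagen.nu/ext/celex/12008E034'})
# => Counter({'http://lagen.nu/ext/celex/12008E034': 1})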
def temp_analyze(self):
    store = TripleStore(self.config.storetype,
                        self.config.storelocation,
                        self.config.storerepository)
    # sq = self._query_cites('http://lagen.nu/ext/celex/12008E045', self._sameas(), False, True, 2012)
    sq = self._query_cites(None, self._sameas(), False, False, 2012)
    print(sq)
    cites = store.select(sq, format="python")
    self.log.debug(" Citation graph contains %s citations" % (len(cites)))

    # remove duplicate citations, self-citations and pinpoints in citations
    citedict = {}
    for cite in cites:
        # print(repr(cite))
        if "-" in cite['obj']:
            cite['obj'] = cite['obj'].split("-")[0]
        if cite['subj'] != cite['obj']:
            citedict[(cite['subj'], cite['obj'])] = True
    self.log.debug(" Normalized graph contains %s citations" % len(citedict))

    # Build an in-neighbour index: degree[case] lists the cases citing it
    degree = {}
    for citing, cited in list(citedict.keys()):
        if citing not in degree:
            degree[citing] = []
        if cited not in degree:
            degree[cited] = []
        degree[cited].append(citing)
    return
def download_from_triplestore(self):
    # Select all statements whose objects are not themselves subjects of
    # further statements (the "leaves" of the reference graph). The original
    # query was pseudo-SPARQL; rewritten here with FILTER NOT EXISTS.
    sq = """SELECT ?something ?references ?uri WHERE {
    ?something ?references ?uri .
    FILTER NOT EXISTS { ?uri ?references ?anything }
}"""
    store = TripleStore(self.config.storetype,
                        self.config.storelocation,
                        self.config.storerepository)
    with self.store.open_downloaded("biggraph", mode="w") as fp:
        for row in store.select(sq, format="python"):
            # interpolate each result row into an N-Triples statement
            fp.write("<%(something)s> <%(references)s> <%(uri)s> .\n" % row)
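# A hedged sketch (not part of the original code) of reading the dumped
# "biggraph" N-Triples file back into a directed networkx graph for further
# analysis; `load_biggraph` is a hypothetical helper name.
import networkx as nx

def load_biggraph(path):
    g = nx.DiGraph()
    with open(path) as fp:
        for line in fp:
            parts = line.split()
            if len(parts) >= 4 and parts[-1] == ".":
                subj, obj = parts[0], parts[2]
                # edges run from the citing resource to the cited resource
                g.add_edge(subj.strip("<>"), obj.strip("<>"))
    return g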
def eval_get_goldstandard(self, basefile):
    goldstandard = Graph()
    goldstandard_rdf = util.relpath(
        os.path.dirname(__file__) + "/../res/eut/goldstandard.n3")
    goldstandard.parse(goldstandard_rdf, format="n3")
    pred = util.ns['ir'] + 'isRelevantFor'
    res = {}
    store = TripleStore(self.config.storetype,
                        self.config.storelocation,
                        self.config.storerepository)
    sq_templ = """PREFIX eurlex:<http://lagen.nu/eurlex#>

SELECT ?party ?casenum ?celexnum WHERE {
    <%s> eurlex:party ?party ;
         eurlex:casenum ?casenum ;
         eurlex:celexnum ?celexnum .
}"""
    self.log.debug("Loading gold standard relevance judgments for %s" %
                   basefile)
    for article in self._articles(basefile):
        res[article] = []
        for o in goldstandard.objects(URIRef(article), URIRef(pred)):
            res[article].append(str(o))
            # Make sure the case exists and is the case we're looking for
            sq = sq_templ % str(o)
            parties = store.select(sq, format="python")
            if parties:
                pass
                # self.log.debug("  %s: %s (%s)" %
                #                (parties[0]['celexnum'],
                #                 parties[0]['casenum'],
                #                 " v ".join([x['party'] for x in parties])))
            else:
                self.log.warning("Can't find %s in triple store!" % o)
        self.log.debug(" Gold standard for %s: %s relevant docs" %
                       (article, len(res[article])))
        res[article].sort()
    return res
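# For reference, the lookup above implies that goldstandard.n3 contains
# triples of the form (URIs below are illustrative; the ir: prefix stands
# for whatever namespace util.ns['ir'] is bound to):
#
#   <http://lagen.nu/ext/celex/12008E034> ir:isRelevantFor
#       <http://lagen.nu/ext/celex/61990J0006> .
#
# ie each treaty article is linked to the cases judged relevant for it.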
def eval_get_ranked_set(self, basefile, algorithm="pagerank",
                        age_compensation=False, restrict_cited=True):
    # * algorithm: can be "indegree", "hits" or "pagerank".
    # * age_compensation: create one graph per year and average the
    #   scores, to compensate for newer cases (which have had less time
    #   to gain citations)
    # * restrict_cited: use only citations between two cases that both
    #   cite the same TFEU article (otherwise, use all citations from all
    #   cases that cite the TFEU article, regardless of whether the cited
    #   case also cites the same TFEU article)
    sameas = self._sameas()
    store = TripleStore(self.config.storetype,
                        self.config.storelocation,
                        self.config.storerepository)
    res = {}
    self.log.debug("Creating ranked set (%s, age_compensation=%s, "
                   "restrict_cited=%s)" %
                   (algorithm, age_compensation, restrict_cited))
    for article in self._articles(basefile):
        article_celex = article.split("/")[-1]
        self.log.debug(" Creating ranking for %s" % article_celex)
        this_year = datetime.datetime.today().year
        if age_compensation:
            years = list(range(1954, this_year + 1))
            # years = range(this_year - 3, this_year)  # testing
        else:
            years = list(range(this_year, this_year + 1))

        result_by_years = []
        for year in years:
            restrict_citing = True  # always performs better
            if (article, year, restrict_cited) in self._graph_cache:
                # self.log.debug("Reusing cached graph (%s) for %s in %s" %
                #                (restrict_cited, article_celex, year))
                graph = self._graph_cache[(article, year, restrict_cited)]
            else:
                # self.log.debug("Calculating graph for %s in %s" %
                #                (article_celex, year))
                sq = self._query_cites(article, sameas, restrict_citing,
                                       restrict_cited, year)
                links = store.select(sq, format="python")
                graph = self.eval_build_nx_graph(links)
                self._graph_cache[(article, year, restrict_cited)] = graph
            self.log.debug(" Citegraph for %s in %s has %s edges, %s nodes" %
                           (article_celex, year,
                            len(graph.edges()), len(graph.nodes())))
            if len(graph.nodes()) == 0:
                continue

            ranked = self.eval_rank_graph(graph, algorithm)
            result_by_years.append({})
            for result, score in ranked:
                result_by_years[-1][result] = score

        if age_compensation:
            compensated_ranking = {}
            for d, score in ranked:  # the last result set
                # cut out the year part of the URI
                celex = d.split("/")[-1]
                try:
                    # cases decided this year have age 1
                    age = this_year + 1 - int(celex[1:5])
                    # scores = [0, 0, 0, ..., 3, 4, 8, 22]
                    scores = [result_by_year[d]
                              for result_by_year in result_by_years
                              if d in result_by_year]
                    avg_score = sum(scores) / float(age)
                    # self.log.debug("Result %s (age %s, avg score %s) %r" %
                    #                (d, age, avg_score, scores))
                    compensated_ranking[d] = avg_score
                except ValueError:
                    continue

        # return just a list of results, no scores
        if age_compensation:
            res[article] = [result for result in
                            sorted(compensated_ranking,
                                   key=compensated_ranking.__getitem__,
                                   reverse=True)]
        else:
            res[article] = [result[0] for result in ranked]
    return res
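# The two helpers used above, eval_build_nx_graph and eval_rank_graph, are
# not part of this excerpt. The following are hedged sketches of plausible
# networkx-based implementations matching the "indegree"/"hits"/"pagerank"
# options; the actual implementations may differ.
import networkx as nx

def eval_build_nx_graph_sketch(links):
    """Build a directed citation graph from SPARQL result rows (assumed,
    as elsewhere in this module, to carry 'subj' and 'obj' keys)."""
    graph = nx.DiGraph()
    for link in links:
        graph.add_edge(link['subj'], link['obj'])
    return graph

def eval_rank_graph_sketch(graph, algorithm):
    """Return (node, score) pairs, best-scoring node first (assumption)."""
    if algorithm == "pagerank":
        scores = nx.pagerank(graph)
    elif algorithm == "hits":
        scores = nx.hits(graph)[1]  # use the authority scores
    elif algorithm == "indegree":
        scores = dict(graph.in_degree())
    else:
        raise ValueError("Unknown algorithm: %s" % algorithm)
    return sorted(scores.items(), key=lambda kv: kv[1], reverse=True)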
def analyze_citation_graphs(self, articles=None):
    # Basic setup
    # articles = self._articles('tfeu')[-1:]
    if not articles:
        articles = [None]
    if None not in articles:
        articles.append(None)
    this_year = datetime.datetime.today().year
    store = TripleStore(self.config.storetype,
                        self.config.storelocation,
                        self.config.storerepository)
    sameas = self._sameas()
    distributions = []

    # For each article (and also for no article = the entire citation
    # graph)
    for article in articles:
        # Get a list of all eligible cases (needed for a proper degree
        # distribution)
        sq = self._query_cases(article, sameas)
        # print(sq)
        cases = {}
        caserows = store.select(sq, format="python")
        for r in caserows:
            cases[r['subj']] = 0
        self.log.info("Creating graphs for %s (%s cases)" %
                      (article, len(cases)))

        # Step 1. SPARQL the graph in the form (?citing, ?cited),
        # optionally restricted to cases citing a particular article
        if article:
            sq = self._query_cites(article, sameas, True, False,
                                   this_year + 1)
        else:
            sq = self._query_cites(None, sameas, False, False,
                                   this_year + 1)
        cites = store.select(sq, format="python")
        self.log.debug(" Citation graph contains %s citations" % len(cites))

        # remove duplicate citations, self-citations and pinpoints
        # in citations
        citedict = {}
        missingcases = {}
        for cite in cites:
            # print(repr(cite))
            if "-" in cite['obj']:
                cite['obj'] = cite['obj'].split("-")[0]
            if not cite['obj'] in cases:
                # print("Case %s (cited in %s) does not exist!" %
                #       (cite['obj'], cite['subj']))
                missingcases[cite['obj']] = True
                continue
            if cite['subj'] != cite['obj']:
                citedict[(cite['subj'], cite['obj'])] = True
        self.log.debug(
            " Normalized graph contains %s citations (%s cited cases not found)" %
            (len(citedict), len(missingcases)))
        # pprint(list(missingcases.keys())[:10])

        # Step 2. Dotify the list to create a citation graph (maybe
        # pointing the arrows from cited to citing could improve results?)
        self.analyse_citegraph_graphviz(list(citedict.keys()), article)

        # Step 3. Create a degree distribution plot
        degree, distribution = self.analyze_citegraph_degree_distribution(
            cases, list(citedict.keys()), article)
        if article:
            distributions.append([article, distribution])

        # Step 4. Create a citation/age scatterplot (or rather hexbin)
        self.analyze_citegraph_citation_age_plot(
            list(citedict.keys()), degree, distribution, article)

    # Step 5. Create a combined degree distribution graph of the distinct
    # citation networks. Also add the degree distribution of gold standard
    # cases
    self.analyze_citegraph_combined_degree_distribution(distributions)
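# The normalization step above duplicates the one in temp_analyze; a
# hypothetical shared helper (not in the original code) could factor it out:
def normalize_citations_sketch(cites, known_cases=None):
    """Strip pinpoint suffixes, drop self-citations and duplicate edges,
    and optionally drop citations to cases not in `known_cases`.
    Returns (edge list, dict of missing cited cases)."""
    citedict = {}
    missing = {}
    for cite in cites:
        obj = cite['obj'].split("-")[0]
        if known_cases is not None and obj not in known_cases:
            missing[obj] = True
            continue
        if cite['subj'] != obj:
            citedict[(cite['subj'], obj)] = True
    return list(citedict.keys()), missing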