def analyze_citegraph_citation_age_plot(self, cites, degree, distribution, article):
    """Write a two-panel plot relating case age to citation counts.

    The upper panel is a hexbin of (case age, number of citations) for
    every case in ``degree`` that has at most ``maxcites`` citations.
    The lower panel is a bar chart with one count per citation in
    ``cites``, grouped by the age of the citing case.

    :param cites: iterable of ``(citing, cited)`` URI pairs
    :param degree: mapping of case URI to number of citations
    :param distribution: unused here; kept for interface compatibility
    :param article: article URI used to name the output file, or a
                    false value for the whole citation graph
    """
    self.log.debug(" Writing citation age plot")
    this_year = datetime.datetime.today().year
    maxcites = 40
    maxage = this_year - 1954
    cited_by_age = []
    citations = []
    for case in sorted(degree.keys()):
        # Characters 27-30 of the URI are read as the four-digit year.
        try:
            year = int(case[27:31])
            caseage = this_year - year
            if year < 1954:
                continue
        except ValueError:
            # some malformed URIs/Celexnos
            continue
        if degree[case] <= maxcites:
            cited_by_age.append(caseage)
            citations.append(degree[case])

    cases_by_age = [0] * (maxage + 1)
    for citing, cited in cites:
        # Skip malformed citing URIs the same way the loop above does
        # instead of aborting the whole plot.
        try:
            year = int(citing[27:31])
        except ValueError:
            continue
        caseage = this_year - year
        if year < 1954:
            continue
        if caseage < 0:
            # Year in the future: no slot in cases_by_age.
            continue
        cases_by_age[caseage] += 1

    fig = plt.figure()
    fig.set_size_inches(8, 5)
    plt.axis([0, maxage, 0, maxcites])
    ax = plt.subplot(211)
    plt.hexbin(cited_by_age, citations, gridsize=maxcites,
               bins='log', cmap=cm.hot_r)
    # plt.scatter(age,citations)
    ax.set_title("Distribution of citations by age")
    ax.set_ylabel("# of citations")
    # cb = plt.colorbar()
    # cb.set_label('log(# of cases with # of citations)')
    ax = plt.subplot(212)
    ax.set_title("Distribution of cases by age")
    plt.axis([0, maxage, 0, max(cases_by_age)])
    plt.bar(na.array(list(range(len(cases_by_age)))) + 0.5, cases_by_age)

    filetype = self.graph_filetype
    if article:
        filename = "citation_age_plot_%s" % (article.split("/")[-1])
    else:
        filename = "citation_age_plot_all"
    filename = self.generic_path(filename, "analyzed", "." + filetype)
    plt.savefig(filename)
    plt.close()
    self.log.debug(" Created %s" % filename)
def set_image_palette(self, r, g, b):
    """Build ``self.pal``, one packed 24-bit integer per palette entry.

    A fresh, empty ``array.array('l')`` is stored in ``self.imagebuffer``
    and ``self.clear_cal_display()`` is called before the palette is
    rebuilt.  The number of entries is ``len(r)``.

    Each entry is packed as ``(b[i] << 16) | (g[i] << 8) | r[i]``.
    NOTE(review): the value from ``b`` ends up in bits 16-23 and the
    value from ``r`` in bits 0-7; presumably this matches the byte order
    the display expects -- confirm before "correcting" it.
    """
    self.imagebuffer = array.array('l')
    self.clear_cal_display()
    count = len(r)
    self.pal = []
    for idx in range(count):
        high = int(b[idx])
        middle = int(g[idx])
        low = int(r[idx])
        packed = (high << 16) | (middle << 8) | low
        self.pal.append(packed)
def set_image_palette(self, r,g,b):
    """Recreate the image buffer and the packed colour palette.

    ``self.imagebuffer`` becomes an empty ``array.array('l')``, the
    calibration display is cleared through ``self.clear_cal_display()``,
    and ``self.pal`` is rebuilt with ``len(r)`` integers of the form
    ``(b[i] << 16) | (g[i] << 8) | r[i]``.

    NOTE(review): ``b`` supplies the most significant byte and ``r`` the
    least significant one; this looks deliberate (byte order of the
    target surface) -- verify against the drawing code.
    """
    self.imagebuffer = array.array('l')
    self.clear_cal_display()
    entries = range(len(r))
    palette = self.pal = []
    for k in entries:
        palette.append(int(b[k]) << 16 | int(g[k]) << 8 | int(r[k]))
def test_load_buffer_array(self):
    """Test loading a Sound from an ``array.array`` buffer object.

    The raw bytes read back from the Sound must equal the bytes the
    array was filled with.
    """
    mixer.init()
    try:
        import array
        samples = b'\x00\xff' * 24
        arsample = array.array('b')
        if hasattr(arsample, 'frombytes'):  # Python 3
            arsample.frombytes(samples)
        else:
            arsample.fromstring(samples)
        # Hand the array itself to Sound; the previous version built the
        # array but then passed a bytearray, so the array path named in
        # this test was never exercised.
        snd = mixer.Sound(arsample)
        raw = snd.get_raw()
        self.assertTrue(isinstance(raw, bytes_))
        self.assertEqual(raw, samples)
    finally:
        # Always shut the mixer down, even when an assertion fails.
        mixer.quit()
# Scratch notes: three ways of reading raw float64 image data from a binary
# file, followed by a minimal astropy table example.
# NOTE(review): file_name, bytes_per_element, imshow, struct and array are
# expected to be defined/imported elsewhere -- confirm before running.
number_of_images_to_read = 3
ROI_shape = (64, 64)
mat_shape = np.append(number_of_images_to_read, ROI_shape)
read_size = np.prod(ROI_shape) * number_of_images_to_read

# Read the raw bytes; the context manager closes the file afterwards.
with open(file_name, 'rb') as fid_read:
    bla = fid_read.read(read_size * bytes_per_element)

# Using struct: one 'd' (8-byte double) per element.
unpack_length = len(bla) // 8
bla2 = struct.unpack('d' * unpack_length, bla)  # returns a tuple...not an array :(
bla2_array = np.array(bla2, dtype=np.double)
bla2_array_image = bla2_array.reshape(mat_shape, order='C')
imshow(bla2_array_image[0])

# read using numpy.fromfile:
bla_np = np.fromfile(file_name, 'd', count=read_size)
# read_size covers all images, so reshape to the full stack and show the
# first frame (reshaping straight to ROI_shape would raise).
imshow(bla_np.reshape(mat_shape)[0])

# using array.array
bla_double_array = array.array('d', bla)

######################################################################################################################################################################################################
######################################################################################################################################################################################################
# ASTROPY:
import astropy.table
import astropy.units as u
import numpy as np

# Create table from scratch
ra = np.random.random(5)
t = astropy.table.Table()
t.add_column(astropy.table.Column(name='ra', data=ra, unit=u.degree))

# Write out to file
t.write('myfile.fits')  # also support HDF5, ASCII, etc.
class GraphAnalyze(object):
    """Citation-graph analysis: SPARQL queries, RDF annotations and plots.

    The methods use attributes and helpers that are not defined in this
    class (``self.log``, ``self.config``, ``self.ns``, ``parsed_path``,
    ``generic_path``, ``graph_to_annotation_file`` and the ``eval_*``
    methods).
    NOTE(review): presumably this class is mixed into a document
    repository class that provides them -- confirm.
    """

    def prep_annotation_file(self, basefile):
        """Build an RDF graph of related-content sets for each article.

        For every ``eurlex:Article`` div in the parsed ``basefile`` the
        baseline, gold standard and PageRank result sets are added as
        RDF collections.  Returns whatever
        ``self.graph_to_annotation_file`` returns for that graph.
        """
        goldstandard = self.eval_get_goldstandard(basefile)
        baseline_set = self.eval_get_ranked_set_baseline(basefile)
        baseline_map = self.eval_calc_map(
            self.eval_calc_aps(baseline_set, goldstandard))
        print("Baseline MAP %f" % baseline_map)
        self.log.info("Calculating ranked set (pagerank, unrestricted)")
        pagerank_set = self.eval_get_ranked_set(basefile,
                                                "pagerank",
                                                age_compensation=False,
                                                restrict_cited=False)
        pagerank_map = self.eval_calc_map(
            self.eval_calc_aps(pagerank_set, goldstandard))
        print("Pagerank MAP %f" % pagerank_map)
        sets = [{'label': 'Baseline', 'data': baseline_set},
                {'label': 'Gold standard', 'data': goldstandard},
                {'label': 'PageRank', 'data': pagerank_set}]

        g = Graph()
        g.bind('dcterms', self.ns['dcterms'])
        g.bind('rinfoex', self.ns['rinfoex'])

        XHT_NS = "{http://www.w3.org/1999/xhtml}"
        tree = ET.parse(self.parsed_path(basefile))
        els = tree.findall("//" + XHT_NS + "div")
        articles = []
        for el in els:
            if 'typeof' in el.attrib and el.attrib['typeof'] == "eurlex:Article":
                # The id attribute minus its first character is used as
                # the article number.
                article = str(el.attrib['id'][1:])
                articles.append(article)

        for article in articles:
            self.log.info("Results for article %s" % article)
            articlenode = URIRef(
                "http://lagen.nu/ext/celex/12008E%03d" % int(article))
            resultsetcollectionnode = BNode()
            g.add((resultsetcollectionnode, RDF.type, RDF.List))
            rc = Collection(g, resultsetcollectionnode)
            g.add((articlenode, DCTERMS["relation"],
                   resultsetcollectionnode))
            for s in sets:
                resultsetnode = BNode()
                listnode = BNode()
                rc.append(resultsetnode)
                g.add((resultsetnode, RDF.type,
                       RINFOEX["RelatedContentCollection"]))
                g.add((resultsetnode, DCTERMS["title"], Literal(s["label"])))
                g.add((resultsetnode, DCTERMS["hasPart"], listnode))
                c = Collection(g, listnode)
                g.add((listnode, RDF.type, RDF.List))
                if article in s['data']:
                    print((" Set %s" % s['label']))
                    for result in s['data'][article]:
                        resnode = BNode()
                        g.add((resnode, DCTERMS["references"],
                               Literal(result[0])))
                        g.add((resnode, DCTERMS["title"],
                               Literal(result[1])))
                        c.append(resnode)
                        print((" %s" % result[1]))

        return self.graph_to_annotation_file(g, basefile)

    def graph_to_image(self, graph, imageformat, filename):
        """Render an rdflib graph to ``filename`` through pydot.

        :param graph: graph providing ``subject_objects()`` and ``triples()``
        :param imageformat: output format passed on to ``dot.write``
        :param filename: destination path; its directory is ensured first
        """
        import pydot
        import rdflib
        dot = pydot.Dot()
        # dot.progs = {"dot": "c:/Program Files/Graphviz2.26.3/bin/dot.exe"}

        # code from rdflib.util.graph_to_dot, but adjusted to handle unicode
        nodes = {}
        for s, o in graph.subject_objects():
            for i in s, o:
                # Direct dict membership instead of scanning a key list.
                if i not in nodes:
                    # The slices strip the class-name wrapper from repr().
                    if isinstance(i, rdflib.BNode):
                        nodes[i] = repr(i)[7:]
                    elif isinstance(i, rdflib.Literal):
                        nodes[i] = repr(i)[16:-1]
                    elif isinstance(i, rdflib.URIRef):
                        nodes[i] = repr(i)[22:-2]
        for s, p, o in graph.triples((None, None, None)):
            dot.add_edge(pydot.Edge(nodes[s], nodes[o], label=repr(p)[22:-2]))

        self.log.debug("Writing %s format to %s" % (imageformat, filename))
        util.ensure_dir(filename)
        dot.write(path=filename, prog="dot", format=imageformat)
        self.log.debug("Wrote %s" % filename)

    # Class-level defaults: cached article list and plot file extension.
    top_articles = []
    graph_filetype = "png"

    def _articles(self, basefile):
        """Return the list of article URIs to evaluate.

        ``basefile`` is not used.  The list is hard-coded to the articles
        that have gold standard sets.
        """
        # Those articles we have gold standard sets for now
        self.top_articles = [
            'http://lagen.nu/ext/celex/12008E263',
            'http://lagen.nu/ext/celex/12008E101',
            'http://lagen.nu/ext/celex/12008E267',
            'http://lagen.nu/ext/celex/12008E107',
            'http://lagen.nu/ext/celex/12008E108',
            'http://lagen.nu/ext/celex/12008E296',
            'http://lagen.nu/ext/celex/12008E258',
            'http://lagen.nu/ext/celex/12008E045',
            'http://lagen.nu/ext/celex/12008E288',
            'http://lagen.nu/ext/celex/12008E034',
        ]
        # For evaluation, only return the 20 top cited articles (which
        # analyze_article_citations incidentally compute for us). For
        # full-scale generation, use commented-out code below.
        # NOTE(review): with the hard-coded list above this branch can
        # never run; kept so that emptying the list restores it.
        if not self.top_articles:
            self.top_articles = self.analyze_article_citations(quiet=True)
        return self.top_articles
        # For full-scale processing, return all articles present in e.g. TFEU:
        # XHT_NS = "{http://www.w3.org/1999/xhtml}"
        # tree = ET.parse(self.parsed_path(basefile))
        # els = tree.findall("//"+XHT_NS+"div")
        # for el in els:
        #     if 'typeof' in el.attrib and el.attrib['typeof'] == "eurlex:Article":
        #         yield el.attrib['about']

    def _sameas(self):
        """Load ``res/eut/sameas.n3`` (relative to this file) into a Graph."""
        sameas = Graph()
        sameas_rdf = util.relpath(
            os.path.dirname(__file__) + "/../res/eut/sameas.n3")
        sameas.load(sameas_rdf, format="n3")
        return sameas

    def _query_cases(self, article, sameas):
        """Return a SPARQL query string selecting case URIs.

        When ``article`` is given, the query is restricted to subjects
        citing that article or any of its ``owl:sameAs`` objects found
        in ``sameas``.
        """
        pred = util.ns['owl'] + "sameAs"
        q = ""
        if article:
            q += "{ ?subj eurlex:cites <%s> }\n" % article
            for equiv in sameas.objects(URIRef(article), URIRef(pred)):
                q += " UNION { ?subj eurlex:cites <%s> }\n" % equiv
        template = (
            ' PREFIX eurlex:<http://lagen.nu/eurlex#>'
            ' PREFIX dcterms:<http://purl.org/dc/terms/>'
            ' SELECT DISTINCT ?subj WHERE { ?subj ?pred ?obj .'
            ' %s'
            ' FILTER (regex(str(?subj), "^http://lagen.nu/ext/celex/6")) } '
        )
        return template % (q)

    def _query_cites(self, article, sameas, restrict_citing, restrict_cited,
                     year=None):
        """Return a SPARQL query string selecting case-to-case citations.

        :param article: article URI used by the restrictions below
        :param sameas: graph with ``owl:sameAs`` equivalences
        :param restrict_citing: only include citing cases that cite
                                ``article`` (or an equivalent)
        :param restrict_cited: only include cited cases that cite
                               ``article`` (or an equivalent)
        :param year: upper bound used in the celexnum filter; defaults
                     to the current year
        """
        if not year:
            year = datetime.datetime.today().year
        pred = util.ns['owl'] + "sameAs"
        q = ""
        if restrict_citing:
            q += "{ ?subj eurlex:cites <%s> }\n" % article
            for equiv in sameas.objects(URIRef(article), URIRef(pred)):
                q += " UNION { ?subj eurlex:cites <%s> }\n" % equiv
        if restrict_cited:
            if q:
                q += ".\n"
            # Append rather than assign, so that a citing restriction
            # built above is not thrown away.
            q += "{?obj eurlex:cites <%s>}\n" % article
            for equiv in sameas.objects(URIRef(article), URIRef(pred)):
                q += " UNION { ?obj eurlex:cites <%s> }\n" % equiv
        template = (
            ' PREFIX eurlex:<http://lagen.nu/eurlex#>'
            ' PREFIX dcterms:<http://purl.org/dc/terms/>'
            ' SELECT DISTINCT ?subj ?pred ?obj ?celexnum'
            ' WHERE { ?subj ?pred ?obj .'
            ' ?subj eurlex:celexnum ?celexnum.\n'
            '%s'
            ' FILTER (regex(str(?obj), "^http://lagen.nu/ext/celex/6")'
            ' && ?pred = eurlex:cites'
            ' && str(?celexnum) < str("6%s"@en)) } '
        )
        return template % (q, year)

    def temp_analyze(self):
        """Scratch helper: fetch all citations up to 2012 and normalize them.

        Prints the query, logs the sizes and builds a cited -> [citing]
        mapping that is discarded; returns ``None``.
        """
        store = TripleStore(self.config.storetype,
                            self.config.storelocation,
                            self.config.storerepository)
        # sq = self._query_cites('http://lagen.nu/ext/celex/12008E045',self._sameas(),False, True, 2012)
        sq = self._query_cites(None, self._sameas(), False, False, 2012)
        print(sq)
        cites = store.select(sq, format="python")
        self.log.debug(
            " Citation graph contains %s citations" % (len(cites)))

        # remove duplicate citations, self-citations and pinpoints
        # in citations
        citedict = {}
        for cite in cites:
            # print repr(cite)
            if "-" in cite['obj']:
                cite['obj'] = cite['obj'].split("-")[0]
            if (cite['subj'] != cite['obj']):
                citedict[(cite['subj'], cite['obj'])] = True

        self.log.debug(
            " Normalized graph contains %s citations" % len(citedict))

        degree = {}
        for citing, cited in list(citedict.keys()):
            if citing not in degree:
                degree[citing] = []
            if cited not in degree:
                degree[cited] = []
            degree[cited].append(citing)
        return

    def analyze(self):
        """Run the baseline-query and citation-graph analyses for the
        ten most cited articles."""
        articles = self.analyze_article_citations(num_of_articles=10)
        # articles = self._articles('tfeu')
        self.analyze_baseline_queries(articles)
        self.analyze_citation_graphs(articles)

    def analyze_article_citations(self, num_of_articles=20, quiet=False):
        """Return the ``num_of_articles`` most cited TFEU article URIs.

        Citations of older treaty articles are mapped to their TFEU
        equivalents through the sameAs graph; citations that cannot be
        mapped are counted separately and not ranked.  The list is
        ordered from most to least cited.  Unless ``quiet`` is set the
        counts are also printed.
        """
        # Create a mapping of article equivalencies, eg Art 28 TEC == Art 34 TFEU
        sameas = self._sameas()
        equivs = {}
        pred = util.ns['owl'] + "sameAs"
        for (s, o) in sameas.subject_objects(URIRef(pred)):
            equivs[str(o)] = str(s)
        self.log.debug(
            "Defined %s equivalent article references" % len(equivs))

        # Select unique articles citings
        store = TripleStore(self.config.storetype,
                            self.config.storelocation,
                            self.config.storerepository)
        sq = ('PREFIX eurlex:<http://lagen.nu/eurlex#>'
              ' SELECT DISTINCT ?case ?article WHERE { ?case eurlex:cites\n'
              '?article .'
              ' FILTER (regex(str(?article), "^http://lagen.nu/ext/celex/1")) }')
        cites = store.select(sq, format="python")

        citationcount = {}
        unmapped = {}
        self.log.debug("Going through %s unique citations" % len(cites))
        for cite in cites:
            article = cite['article'].split("-")[0]
            # Direct TFEU citations (celex prefix 12008E) need no mapping.
            # This used to test for "12008M", which counted every direct
            # TFEU citation as unmapped and let through URIs that the
            # later split("E")[1] label code cannot handle.
            if "12008E" in article:
                pass
            elif article in equivs:
                article = equivs[article]
            else:
                unmapped[article] = unmapped.get(article, 0) + 1
                article = None

            # Keep track of the number of citing cases
            if article:
                citationcount[article] = citationcount.get(article, 0) + 1

        # Report the most common cites to older treaty articles that
        # we have no equivalents for in TFEU
        # sorted_unmapped = sorted(unmapped.iteritems(), key=itemgetter(1))[-num_of_articles:]
        # if not quiet:
        #     print "UNMAPPED:"
        #     pprint(sorted_unmapped)

        # Report and return the most cited articles
        sorted_citationcount = sorted(citationcount.items(),
                                      key=itemgetter(1))[-num_of_articles:]
        if not quiet:
            print("CITATION COUNTS:")
            pprint(sorted_citationcount)
        return [x[0] for x in reversed(sorted_citationcount)]

    def analyze_baseline_queries(self, analyzed_articles, num_of_keyterms=5):
        """Write the most distinctive terms of each analyzed article.

        Indexes every ``eurlex:Article`` div of the parsed "tfeu" file
        in a temporary in-memory whoosh index, then for each URI in
        ``analyzed_articles`` writes up to ``num_of_keyterms``
        non-numeric key terms to a LaTeX table ("keyterms" ``.tex``)
        and as ``ir:keyterm`` triples to an N3 file ("keyterms" ``.n3``).
        """
        basefile = "tfeu"

        # Helper from http://effbot.org/zone/element-lib.htm
        def flatten(elem, include_tail=0):
            text = elem.text or ""
            for e in elem:
                text += flatten(e, 1)
            if include_tail and elem.tail:
                text += elem.tail
            return text

        # step 1: Create a temporary whoosh index in order to find out
        # the most significant words for each article

        # ana = analysis.StandardAnalyzer()
        ana = analysis.StemmingAnalyzer()
        # vectorformat = formats.Frequency(ana)
        schema = fields.Schema(article=fields.ID(unique=True),
                               content=fields.TEXT(analyzer=ana,
                                                   stored=True))

        st = RamStorage()
        tmpidx = st.create_index(schema)
        w = tmpidx.writer()

        XHT_NS = "{http://www.w3.org/1999/xhtml}"
        tree = ET.parse(self.parsed_path(basefile))
        els = tree.findall("//" + XHT_NS + "div")
        articles = []
        for el in els:
            if 'typeof' in el.attrib and el.attrib['typeof'] == "eurlex:Article":
                text = util.normalize_space(flatten(el))
                article = str(el.attrib['about'])
                articles.append(article)
                w.update_document(article=article, content=text)
        w.commit()
        self.log.info("Indexed %d articles" % len(articles))

        # Step 2: Open the large whoosh index containing the text of
        # all cases. Then, for each article, use the 5 most distinctive terms
        # (filtering away numbers) to create a query against that index
        tempsearch = tmpidx.searcher()
        g = Graph()
        g.bind('celex', 'http://lagen.nu/ext/celex/')
        g.bind('ir', 'http://lagen.nu/informationretrieval#')
        IR = Namespace('http://lagen.nu/informationretrieval#')
        # celex:12008E264 ir:keyterm "blahonga"@en.

        outfile = self.generic_path("keyterms", "analyzed", ".tex")
        util.ensure_dir(outfile)
        # The context manager closes the file even if a search fails.
        with open(outfile, "w") as fp:
            fp.write((" \\begin{tabular}{r|%s} \\hline \\textbf{Art.} & "
                      "\\multicolumn{%s}{l}{\\textbf{Terms}} \\\\ \\hline ")
                     % ("l" * num_of_keyterms, num_of_keyterms))

            for article in analyzed_articles:
                fp.write(str(int(article.split("E")[1])))
                r = tempsearch.search(query.Term("article", article))
                # Ask for one extra term so that dropping a numeric one
                # can still leave num_of_keyterms terms.
                terms = r.key_terms("content", numterms=num_of_keyterms + 1)
                terms = [t[0] for t in terms
                         if not t[0].isdigit()][:num_of_keyterms]
                for term in terms:
                    fp.write(" & " + term)
                    g.add((URIRef(article), IR["keyterm"],
                           Literal(term, lang="en")))
                self.log.debug("Article %s:%r" % (article, terms))
                fp.write("\\\\\n")
            fp.write(" \\hline \\end{tabular} ")

        outfile = self.generic_path("keyterms", "analyzed", ".n3")
        util.ensure_dir(outfile)
        with open(outfile, "w") as fp:
            fp.write(g.serialize(format="n3"))

    def analyze_citation_graphs(self, articles=None):
        """Create citation graphs and plots per article and for all cases.

        For every article in ``articles``, and once more with no article
        (the entire citation graph), this writes a dot file, a degree
        distribution plot and a citation/age plot; finally a combined
        degree distribution plot is written.  The caller's list is left
        unmodified.
        """
        # Basic setup
        # articles = self._articles('tfeu')[-1:]
        # Work on a copy: appending None to the argument would leak into
        # the caller's list.
        articles = list(articles) if articles else []
        if None not in articles:
            articles.append(None)

        this_year = datetime.datetime.today().year
        store = TripleStore(self.config.storetype,
                            self.config.storelocation,
                            self.config.storerepository)
        sameas = self._sameas()
        distributions = []

        # For each article (and also for no article = the entire citation graph)
        for article in articles:
            # Get a list of all eligble cases (needed for proper degree distribution)
            sq = self._query_cases(article, sameas)
            # print sq
            cases = {}
            caserows = store.select(sq, format="python")
            for r in caserows:
                cases[r['subj']] = 0

            self.log.info(
                "Creating graphs for %s (%s cases)" % (article, len(cases)))

            # Step 1. SPARQL the graph on the form ?citing ?cited
            # (optionally restricting on citing a particular article)
            if article:
                sq = self._query_cites(article, sameas, True, False,
                                       this_year + 1)
            else:
                sq = self._query_cites(None, sameas, False, False,
                                       this_year + 1)

            cites = store.select(sq, format="python")
            self.log.debug(
                " Citation graph contains %s citations" % (len(cites)))

            # remove duplicate citations, self-citations and pinpoints
            # in citations
            citedict = {}
            missingcases = {}
            for cite in cites:
                # print repr(cite)
                if "-" in cite['obj']:
                    cite['obj'] = cite['obj'].split("-")[0]

                if not cite['obj'] in cases:
                    # print "Case %s (cited in %s) does not exist!\n" % (cite['obj'],
                    # cite['subj'])
                    missingcases[cite['obj']] = True
                    continue

                if (cite['subj'] != cite['obj']):
                    citedict[(cite['subj'], cite['obj'])] = True

            self.log.debug(
                " Normalized graph contains %s citations (%s cited cases not found)"
                % (len(citedict), len(missingcases)))
            # pprint(missingcases.keys()[:10])

            # Step 2. Dotify the list (maybe the direction of arrows from
            # cited to citing can improve results?) to create a citation
            # graph
            self.analyse_citegraph_graphviz(list(citedict.keys()), article)

            # Step 3. Create a degree distribution plot
            degree, distribution = self.analyze_citegraph_degree_distribution(
                cases, list(citedict.keys()), article)
            if article:
                distributions.append([article, distribution])

            # Step 4. Create a citation/age scatterplot (or rather hexbin)
            self.analyze_citegraph_citation_age_plot(list(citedict.keys()),
                                                     degree, distribution,
                                                     article)

        # Step 5. Create a combined degree distribution graph of the
        # distinct citation networks. Also add the degree distribution
        # of gold standard cases
        self.analyze_citegraph_combined_degree_distribution(distributions)

    def analyse_citegraph_graphviz(self, cites, article,
                                   generate_graph=False):
        """Create a dot file (that can later be processed with dot or gephi)

        :param cites: iterable of ``(citing, cited)`` URI pairs
        :param article: article URI used to name the file, or a false
                        value for the whole graph
        :param generate_graph: when true, also run ``dot`` on the written
                               file to produce an image
        """
        from time import time
        filetype = self.graph_filetype
        if article:
            filename = "citegraph_%s" % article.split("/")[-1]
        else:
            filename = "citegraph_all"

        dot_filename = self.generic_path(filename, "analyzed", ".dot")
        self.log.debug(" Writing graphwiz citation graph for %s" % article)
        with open(dot_filename, "w") as fp:
            fp.write("digraph G { graph [ ]; ")
            for citing, cited in cites:
                citing = citing.split("/")[-1]
                cited = cited.split("/")[-1]
                try:
                    fp.write(" \"%s\" -> \"%s\" ;\n" % (citing, cited))
                except UnicodeError:
                    # Best effort: skip edges that cannot be encoded
                    # instead of silently swallowing every error.
                    self.log.debug(
                        " Skipped edge %r -> %r" % (citing, cited))
            fp.write("}")

        if generate_graph:
            # Derive the image path from the base name, as is done for
            # the dot file, and feed dot the file that was just written
            # (the command used to read a hard-coded "tmp.dot").
            graph_filename = self.generic_path(filename, "analyzed",
                                               "." + filetype)
            engine = "dot"
            start = time()
            cmd = [engine, "-T%s" % filetype, "-o%s" % graph_filename,
                   dot_filename]
            self.log.debug("Running %s" % " ".join(cmd))
            # Argument list, no shell: file names are never re-parsed.
            p = subprocess.Popen(cmd)
            p.wait()
            self.log.info("Graph %s created in %.3f sec" % (graph_filename,
                                                            time() - start))

    def analyze_citegraph_degree_distribution(self, cases, cites, article):
        """Plot how many cases are cited a given number of times.

        :param cases: mapping of case URI to initial count; not modified
        :param cites: iterable of ``(citing, cited)`` URI pairs
        :param article: article URI used to name the file, or a false
                        value for the whole graph
        :returns: ``(degree, distribution)`` where ``degree`` maps case
                  URI to the number of citing cases and
                  ``distribution[n]`` is the number of cases cited
                  ``n`` times
        """
        self.log.debug(" Writing degree distribution graph")
        # Copy so that counting does not mutate the caller's dict.
        degree = dict(cases)
        # self.log.debug(" %s cases, first elements %r" % (len(cases),cases.values()[:5]))
        # this_year = datetime.datetime.today().year
        maxcites = 40
        # maxage = this_year - 1954

        for citing, cited in cites:
            if citing not in degree:
                degree[citing] = 0
            if cited not in degree:
                degree[cited] = 0
            degree[cited] += 1

        # An empty graph yields the single-bucket distribution [0].
        highest = max(degree.values()) if degree else 0
        distribution = [0] * (highest + 1)
        for value in degree.values():
            distribution[value] += 1

        fig = plt.figure()
        fig.set_size_inches(8, 4)
        ax = plt.subplot(111)
        ax.set_ylabel('Number of cases being cited <x> times')
        ax.set_xlabel('Number of citing cases (max %s)' % maxcites)
        ax.set_title('Degree distribution of case citations')

        filetype = self.graph_filetype
        if article:
            filename = "degree_distribution_%s" % (article.split("/")[-1])
        else:
            filename = "degree_distribution_all"
        filename = self.generic_path(filename, "analyzed", "." + filetype)

        plt.plot(distribution[:maxcites])
        plt.savefig(filename)
        plt.close()
        self.log.debug(" Created %s" % filename)
        return (degree, distribution)

    def analyze_citegraph_combined_degree_distribution(self, distributions):
        """Plot several degree distributions in one figure.

        :param distributions: iterable of ``(article URI, distribution)``
                              pairs; each gets its own line style
        """
        self.log.debug(" Writing combined degree distribution graph")
        # this_year = datetime.datetime.today().year
        maxcites = 40
        # maxnumber = 1000
        # maxage = this_year - 1954

        fig = plt.figure()
        fig.set_size_inches(8, 4)
        ax = plt.subplot(111)
        ax.set_ylabel('Number of cases being cited <x> times')
        ax.set_xlabel('Number of citing cases (max %s)' % maxcites)
        ax.set_title(
            'Degree distribution of case citations concering specific articles'
        )

        filetype = self.graph_filetype
        filename = "degree_distribution_combined"
        filename = self.generic_path(filename, "analyzed", "." + filetype)

        # 16 (line width, line style) combinations.
        styles = []
        for i in range(1, 5):
            for j in (['-', '--', '-.', ':']):
                # for j in (['-','-','-','-','-']):
                styles.append((i, j))

        for cnt, (article, distribution) in enumerate(distributions):
            label = article.split("/")[-1].split("E")[1]
            self.log.debug(
                " Plotting %s %r" % (label, distribution[:4]))
            if label.isdigit():
                label = "Art. %s" % int(label)
            # label += " (%s uncited)" % distribution[0]
            # Advance through the styles (the counter used to stay at 0,
            # giving every curve the same look) and wrap around when
            # there are more curves than styles.
            lw, ls = styles[cnt % len(styles)]
            plt.plot(distribution[:maxcites], label=label,
                     linestyle=ls, linewidth=lw)

        # plt.axis([0,maxcites,0,maxnumber])
        plt.legend(loc='best',
                   markerscale=4,
                   prop={'size': 'x-small'},
                   ncol=int(len(distributions) / 6) + 1)

        plt.savefig(filename)
        plt.close()
        self.log.debug(" Created %s" % filename)

    def analyze_citegraph_citation_age_plot(self, cites, degree,
                                            distribution, article):
        """Write a two-panel plot relating case age to citation counts.

        The upper panel is a hexbin of (case age, number of citations)
        for every case in ``degree`` with at most ``maxcites``
        citations.  The lower panel is a bar chart with one count per
        citation in ``cites``, grouped by the age of the citing case.

        :param cites: iterable of ``(citing, cited)`` URI pairs
        :param degree: mapping of case URI to number of citations
        :param distribution: unused here; kept for interface compatibility
        :param article: article URI used to name the file, or a false
                        value for the whole graph
        """
        self.log.debug(" Writing citation age plot")
        this_year = datetime.datetime.today().year
        maxcites = 40
        maxage = this_year - 1954

        cited_by_age = []
        citations = []
        for case in sorted(degree.keys()):
            # Characters 27-30 of the URI are read as the four-digit year.
            try:
                year = int(case[27:31])
                caseage = this_year - year
                if year < 1954:
                    continue
            except ValueError:
                # some malformed URIs/Celexnos
                continue
            if degree[case] <= maxcites:
                cited_by_age.append(caseage)
                citations.append(degree[case])

        cases_by_age = [0] * (maxage + 1)
        for citing, cited in cites:
            # Skip malformed citing URIs like the loop above does,
            # instead of aborting the whole plot.
            try:
                year = int(citing[27:31])
            except ValueError:
                continue
            caseage = this_year - year
            if year < 1954:
                continue
            if caseage < 0:
                # Year in the future: no slot in cases_by_age.
                continue
            cases_by_age[caseage] += 1

        fig = plt.figure()
        fig.set_size_inches(8, 5)
        plt.axis([0, maxage, 0, maxcites])
        ax = plt.subplot(211)
        plt.hexbin(cited_by_age, citations, gridsize=maxcites,
                   bins='log', cmap=cm.hot_r)
        # plt.scatter(age,citations)
        ax.set_title("Distribution of citations by age")
        ax.set_ylabel("# of citations")
        # cb = plt.colorbar()
        # cb.set_label('log(# of cases with # of citations)')
        ax = plt.subplot(212)
        ax.set_title("Distribution of cases by age")
        plt.axis([0, maxage, 0, max(cases_by_age)])
        plt.bar(na.array(list(range(len(cases_by_age)))) + 0.5, cases_by_age)

        filetype = self.graph_filetype
        if article:
            filename = "citation_age_plot_%s" % (article.split("/")[-1])
        else:
            filename = "citation_age_plot_all"
        filename = self.generic_path(filename, "analyzed", "." + filetype)
        plt.savefig(filename)
        plt.close()
        self.log.debug(" Created %s" % filename)