def add_pub_concepts(year):
    # This is all too slow; used the neo4j CSV import instead:
    # USING PERIODIC COMMIT 10000
    # LOAD CSV FROM "file:///pub_concept_2008.txt" AS row FIELDTERMINATOR '\t'
    # MATCH (p:Publication {pub_id: toInt(row[0])})
    # MATCH (c:Concept {name: row[1]})
    # MERGE (p)-[:CONCEPT {year: 2008}]-(c)
    # RETURN count(p);
    print "Adding pub-concept data"
    session = neo4j_functions.connect()

    #read data
    pubConceptDic = readPubConcepts(year)
    counter = 0
    for p in pubConceptDic:
        for c in pubConceptDic[p]:
            c = c.replace("'", "\\'")
            if counter % 10000 == 0:
                t = strftime("%H:%M:%S", gmtime())
                print t, counter
                session.close()
                session = neo4j_functions.connect()
            com = "match (p:Publication {pub_id:" + str(p) + "}) match (c:Concept {name:'" + c + "'}) merge (p)-[:CONCEPT{year:" + str(year) + "}]-(c);"
            session.run(com)
            counter += 1
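# Hedged sketch of a faster in-driver alternative to the per-pair loop above:
# batch the pairs through one parameterised UNWIND statement instead of building
# a string per relationship. Illustrative only; the function name and batch size
# are assumptions, and the $-parameter syntax assumes Neo4j 3.x or later.
# Parameters also remove the need for the manual quote-escaping done above.
def add_pub_concepts_batched(year, batch_size=10000):
    session = neo4j_functions.connect()
    pubConceptDic = readPubConcepts(year)
    rows = [{'pid': int(p), 'name': c}
            for p in pubConceptDic for c in pubConceptDic[p]]
    com = ("UNWIND $rows AS row "
           "MATCH (p:Publication {pub_id: row.pid}) "
           "MATCH (c:Concept {name: row.name}) "
           "MERGE (p)-[:CONCEPT {year: $year}]-(c)")
    for i in range(0, len(rows), batch_size):
        session.run(com, rows=rows[i:i + batch_size], year=year)
    session.close()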
def pubs_per_org():
    session = neo4j_functions.connect()
    com = 'match (o:Org)-[:MEMBER_OF]-(s:Staff)-[:PUBLISHED]-(p:Publication) return o.code as o, count(distinct(p)) as c;'
    oDic = {}
    for res in session.run(com):
        oDic[res['o']] = res['c']
    return oDic
def staff_per_org():
    session = neo4j_functions.connect()
    com = 'match (o:Org)-[:MEMBER_OF]-(s:Staff) return o.code as o, count(s) as c;'
    sDic = {}
    for res in session.run(com):
        sDic[res['o']] = res['c']
    return sDic
def pubs_per_person():
    session = neo4j_functions.connect()
    com = 'match (s:Staff)-[:PUBLISHED]-(p:Publication) return s.person_id as p, count(p) as c;'
    pDic = {}
    for res in session.run(com):
        pDic[res['p']] = res['c']
    return pDic
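# Illustrative helper (an assumption, not part of the pipeline) showing how the
# two org-level counts above combine into mean publications per staff member.
def mean_pubs_per_staff():
    pubs = pubs_per_org()
    staff = staff_per_org()
    return dict((org, float(pubs[org]) / staff[org])
                for org in pubs if staff.get(org))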
def add_metrics_to_graph():
    session = neo4j_functions.connect()
    with open('output/pubMetrics.txt', 'rb') as p:
        for line in p:
            line = line.rstrip()
            (pid, cf, ld, ts_flesch_reading_ease, ts_smog_index,
             ts_flesch_kincaid_grade, ts_coleman_liau_index,
             ts_automated_readability_index, ts_dale_chall_readability_score,
             ts_difficult_words, ts_linsear_write_formula, ts_gunning_fog,
             ts_text_standard) = line.split('\t')
            # fields come back from split() as strings, so cast to float before "%.4f" formatting
            com = "match (p:Publication) where p.pub_id = " + str(pid) \
                + " set p.cf = " + str("%.4f" % float(cf)) \
                + ", p.ld = " + str("%.4f" % float(ld)) \
                + ", p.ts_flesch_reading_ease = " + str("%.4f" % float(ts_flesch_reading_ease)) \
                + ", p.ts_smog_index = " + str("%.4f" % float(ts_smog_index)) \
                + ", p.ts_flesch_kincaid_grade = " + str("%.4f" % float(ts_flesch_kincaid_grade)) \
                + ", p.ts_coleman_liau_index = " + str("%.4f" % float(ts_coleman_liau_index)) \
                + ", p.ts_automated_readability_index = " + str("%.4f" % float(ts_automated_readability_index)) \
                + ", p.ts_dale_chall_readability_score = " + str("%.4f" % float(ts_dale_chall_readability_score)) \
                + ", p.ts_difficult_words = " + str("%.4f" % float(ts_difficult_words)) \
                + ", p.ts_linsear_write_formula = " + str("%.4f" % float(ts_linsear_write_formula)) \
                + ", p.ts_gunning_fog = " + str("%.4f" % float(ts_gunning_fog)) \
                + ", p.ts_text_standard = '" + str(ts_text_standard) + "';"
            #print com
            session.run(com)
    session.close()
def get_metrics():
    # check for previous results so an interrupted run can be resumed
    pubDic = {}
    if os.path.exists('output/pubMetrics.txt'):
        with open('output/pubMetrics.txt', 'rb') as p:
            for line in p:
                if len(line.split('\t')) == 13:
                    (pid, cf, ld, ts_flesch_reading_ease, ts_smog_index,
                     ts_flesch_kincaid_grade, ts_coleman_liau_index,
                     ts_automated_readability_index, ts_dale_chall_readability_score,
                     ts_difficult_words, ts_linsear_write_formula, ts_gunning_fog,
                     ts_text_standard) = line.split('\t')
                    pubDic[int(pid)] = ''
    print "Found " + str(len(pubDic)) + " entries already processed"
    session = neo4j_functions.connect()
    com = "match (p:Publication) return distinct p.abstract as a, p.pub_id as pid;"
    counter = 0
    o = open('output/pubMetrics.txt', 'a')
    for res in session.run(com):
        if counter % 1000 == 0:
            print counter, strftime("%Y-%m-%d %H:%M:%S", gmtime())
        counter += 1
        abstract = res['a']  # renamed from 'abs' to avoid shadowing the builtin
        pid = res['pid']
        if pid not in pubDic:
            if len(abstract) > 0:
                # content fraction, lexical diversity and textstat readability scores
                cf = nltk_functions.content_fraction(abstract)
                ld = nltk_functions.lexical_diversity(abstract)
                ts = nltk_functions.run_textstat(abstract)
                (ts_flesch_reading_ease, ts_smog_index, ts_flesch_kincaid_grade,
                 ts_coleman_liau_index, ts_automated_readability_index,
                 ts_dale_chall_readability_score, ts_difficult_words,
                 ts_linsear_write_formula, ts_gunning_fog, ts_text_standard) = ts
                o.write(str(pid) + "\t" + str("%.4f" % cf) + "\t" + str("%.4f" % ld)
                        + "\t" + str("%.4f" % ts_flesch_reading_ease)
                        + "\t" + str("%.4f" % ts_smog_index)
                        + "\t" + str("%.4f" % ts_flesch_kincaid_grade)
                        + "\t" + str("%.4f" % ts_coleman_liau_index)
                        + "\t" + str("%.4f" % ts_automated_readability_index)
                        + "\t" + str("%.4f" % ts_dale_chall_readability_score)
                        + "\t" + str("%.4f" % ts_difficult_words)
                        + "\t" + str("%.4f" % ts_linsear_write_formula)
                        + "\t" + str("%.4f" % ts_gunning_fog)
                        + "\t" + str(ts_text_standard) + "\n")
    o.close()
    session.close()
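# Hedged sketch of the nltk_functions.run_textstat helper called above (defined
# elsewhere in this repo), shown only to document the ten-value tuple that
# get_metrics unpacks; the body is an assumption based on the textstat API.
def run_textstat_sketch(text):
    from textstat.textstat import textstat
    return (textstat.flesch_reading_ease(text),
            textstat.smog_index(text),
            textstat.flesch_kincaid_grade(text),
            textstat.coleman_liau_index(text),
            textstat.automated_readability_index(text),
            textstat.dale_chall_readability_score(text),
            textstat.difficult_words(text),
            textstat.linsear_write_formula(text),
            textstat.gunning_fog(text),
            textstat.text_standard(text))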
def distance_metrics(year):
    session = neo4j_functions.connect()

    #staff
    print "Creating staff distance data"
    com = "match (p1:Staff)-[e1:ENRICHED]->(s), (p2:Staff)-[e2:ENRICHED]->(s) " \
          "where id(p1) < id(p2) and e1.year = " + str(year) + " and e2.year = " + str(year) + " " \
          "with sqrt(sum((-log10(e1.cpval) - -log10(e2.cpval))^2)) as euc, p1, p2 " \
          "merge (p1)-[d:DISTANCE]-(p2) set d.euclidean_" + str(year) + " = euc return count(euc);"
    print com
    for res in session.run(com):
        print res

    #orgs
    print "Creating org distance data"
    com = "match (p1:Org)-[e1:ENRICHED]->(s), (p2:Org)-[e2:ENRICHED]->(s) " \
          "where id(p1) < id(p2) and e1.year = " + str(year) + " and e2.year = " + str(year) + " " \
          "with sqrt(sum((-log10(e1.cpval) - -log10(e2.cpval))^2)) as euc, p1, p2 " \
          "merge (p1)-[d:DISTANCE]-(p2) set d.euclidean_" + str(year) + " = euc return count(euc);"
    print com
    for res in session.run(com):
        print res
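# Hedged plain-Python restatement of the metric computed in Cypher above, useful
# as a sanity check: the DISTANCE edge stores the Euclidean distance between two
# entities' -log10(corrected p-value) vectors over the concepts they share.
# The dict-of-cpvals input shape is hypothetical, not a pipeline structure.
import math

def euclidean_enrichment_distance(e1, e2):
    shared = set(e1) & set(e2)  # concepts enriched in both entities
    return math.sqrt(sum((-math.log10(e1[c]) - -math.log10(e2[c])) ** 2
                         for c in shared))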
def copy_graph_to_mysql():
    session = neo4j_functions.connect()
    cnx = connect()
    curA = cnx.cursor(buffered=True)

    #add orgs
    print "Adding orgs"
    neo4j_com = "match (o:Org) return o.code as c, o.short_name as s, o.full_name as f, o.url as u, o.type as t order by o.code;"
    mysql_com = ("INSERT IGNORE INTO browser_org (code, short_name, full_name, url, type) "
                 "VALUES (%s, %s, %s, %s, %s)")
    for res in session.run(neo4j_com):
        code = res['c']
        short = res['s']
        full = res['f']
        url = res['u']
        if len(url) == 0:
            url = 'n/a'
        type = res['t']
        try:
            curA.execute(mysql_com, (code, short, full, url, type))
        except mysql.connector.Error as err:
            print("failed to insert values %s: %s" % (code, err))

    #person table
    #name,user_name,institute,position,sex
    print "Adding people"
    neo4j_com = "match (o:Org)--(s:Staff) return s.published_name as n, s.person_id as pid, o.code as s;"
    mysql_com = ("INSERT IGNORE INTO browser_person (org_id, user_name, name, position, sex) "
                 "VALUES ((SELECT id from browser_org where code = %s), %s, %s, %s, %s)")
    for res in session.run(neo4j_com):
        name = res['n']
        user_name = res['pid']
        org = res['s']
        pos = 'n/a'
        sex = 'n/a'
        curA.execute(mysql_com, (org, user_name, name, pos, sex))

    #concepts
    #name,type
    print "Adding concepts"
    neo4j_com = "match (c:Concept) return c.name as n, c.type as t"
    mysql_com = ("INSERT IGNORE INTO browser_concept (name, type) "
                 "VALUES (%s, %s)")
    for res in session.run(neo4j_com):
        name = res['n']
        type = res['t']
        curA.execute(mysql_com, (name, type))

    #enrichments
    #person_id,globalCount,cpval,concept_id,globalTotal,localCount,localTotal,year
    print "Adding people-concepts"
    neo4j_com = "match (s:Staff)-[e]-(c:Concept) return s.person_id as pid, c.name, e.localCount, e.localTotal, e.globalCount, e.globalTotal, e.year, e.cpval;"
    mysql_com = ("INSERT IGNORE INTO browser_enrichedp (person_id, concept_id, localCount, localTotal, globalCount, globalTotal, year, cpval) "
                 "VALUES ((SELECT id from browser_person where user_name = %s), (SELECT id from browser_concept where name = %s), %s, %s, %s, %s, %s, %s)")
    for res in session.run(neo4j_com):
        pid = res['pid']
        cName = res['c.name']
        localCount = res['e.localCount']
        localTotal = res['e.localTotal']
        globalCount = res['e.globalCount']
        globalTotal = res['e.globalTotal']
        year = res['e.year']
        cpval = res['e.cpval']
        curA.execute(mysql_com, (pid, cName, localCount, localTotal, globalCount, globalTotal, year, cpval))

    print "Adding org-concepts"
    neo4j_com = "match (o:Org)-[e]-(c:Concept) return o.code as c, c.name, e.localCount, e.localTotal, e.globalCount, e.globalTotal, e.year, e.cpval;"
    mysql_com = ("INSERT IGNORE INTO browser_enrichedo (org_id, concept_id, localCount, localTotal, globalCount, globalTotal, year, cpval) "
                 "VALUES ((SELECT id from browser_org where code = %s), (SELECT id from browser_concept where name = %s), %s, %s, %s, %s, %s, %s)")
    for res in session.run(neo4j_com):
        code = res['c']  # org code, not a person id
        cName = res['c.name']
        localCount = res['e.localCount']
        localTotal = res['e.localTotal']
        globalCount = res['e.globalCount']
        globalTotal = res['e.globalTotal']
        year = res['e.year']
        cpval = res['e.cpval']
        curA.execute(mysql_com, (code, cName, localCount, localTotal, globalCount, globalTotal, year, cpval))

    # commit before closing; mysql.connector does not autocommit by default
    cnx.commit()
    cnx.close()
def add_enriched_to_graph(year):
    print "Adding enrichment data to graph..."
    session = neo4j_functions.connect()
    for concept_type in ['type', 'bigram', 'trigram']:
        print "Adding data for data type '" + concept_type + "'"
        counter = 0
        num_lines = sum(1 for line in open(outDir + '/person_' + concept_type + '_enriched_' + str(year) + '.txt'))

        #create concept nodes and index
        with open(outDir + '/person_' + concept_type + '_enriched_' + str(year) + '.txt', 'rb') as p:
            next(p)
            for line in p:
                line = line.rstrip()
                if counter % 10000 == 0:
                    print str(counter) + '/' + str(num_lines)
                    session.close()
                    session = neo4j_functions.connect()
                counter += 1
                name, type, a1, a2, b1, b2, odds, pval, cor_pval = line.split('\t')
                type = type.replace("'", "\\'")
                person_id = name.split(':')[1]
                com = "MERGE (c:Concept {name:'" + type + "', type:'" + concept_type + "'})"
                session.run(com)
        i = "CREATE index on :Concept(name);"
        session.run(i)
        #i = "CREATE index on :Concept(type);"
        #session.run(i)

        counter = 0
        #create person-concept relationships
        print "Creating person-concept relationships..."
        with open(outDir + '/person_' + concept_type + '_enriched_' + str(year) + '.txt', 'rb') as p:
            next(p)
            for line in p:
                line = line.rstrip()
                if counter % 10000 == 0:
                    print str(counter) + '/' + str(num_lines)
                    session.close()
                    session = neo4j_functions.connect()
                counter += 1
                name, type, a1, a2, b1, b2, odds, pval, cor_pval = line.split('\t')
                if float(cor_pval) > 0:
                    cor_pval = ("%03.02e" % float(cor_pval))
                else:
                    cor_pval = 0.0
                type = type.replace("'", "\\'")
                person_id = name.split(':')[1]
                com = "MATCH (s:Staff {person_id: " + person_id + "}) " \
                      "MATCH (c:Concept {name:'" + type + "', type:'" + concept_type + "'}) " \
                      "MERGE (s)-[:ENRICHED{type:'pure-comp', year:" + str(year) + ", localCount:" + str(a1) + "," \
                      " localTotal:" + str(a2) + ", globalCount:" + str(b1) + ", globalTotal:" + str(b2) + ", cpval:" + str(cor_pval) + "}]-(c);"
                session.run(com)

        counter = 0
        #create org-concept relationships
        print "Creating org-concept relationships..."
        with open(outDir + '/org_' + concept_type + '_enriched_' + str(year) + '.txt', 'rb') as p:
            next(p)
            for line in p:
                if counter % 10000 == 0:
                    print counter
                    session.close()
                    session = neo4j_functions.connect()
                counter += 1
                line = line.rstrip()
                name, type, a1, a2, b1, b2, odds, pval, cor_pval = line.split('\t')
                if float(cor_pval) > 0:
                    cor_pval = ("%03.02e" % float(cor_pval))
                else:
                    cor_pval = 0.0
                type = type.replace("'", "\\'")
                code = name.split(':')[1]
                com = "MATCH (o:Org {code: '" + code + "'}) " \
                      "MATCH (c:Concept {name:'" + type + "', type:'" + concept_type + "'}) " \
                      "MERGE (o)-[:ENRICHED{type:'pure-comp', year:" + str(year) + ", localCount:" + str(a1) + "," \
                      " localTotal:" + str(a2) + ", globalCount:" + str(b1) + ", globalTotal:" + str(b2) + ", cpval:" + str(cor_pval) + "}]-(c);"
                #print com
                session.run(com)
    session.close()
def background_frequencies(year):
    print "Getting background frequencies"
    session = neo4j_functions.connect()
    o1 = open(outDir + '/background_type_frequencies_' + str(year) + '.txt', 'w')
    o1.write('type\tcount\n')
    o2 = open(outDir + '/background_bigram_frequencies_' + str(year) + '.txt', 'w')
    o2.write('bigram\tcount\n')
    o3 = open(outDir + '/background_trigram_frequencies_' + str(year) + '.txt', 'w')
    o3.write('trigram\tcount\n')
    com = "match (s:Staff)--(p:Publication) where p.pub_year <= " + str(year) + " return distinct p.abstract as a, p.title as t, p.pub_id as pid;"
    print com
    typeDic = {}
    bigramDic = {}
    trigramDic = {}
    pubConceptDic = defaultdict(dict)
    counter = 0
    ignoreList = ignore_tokens()
    for res in session.run(com):
        if counter % 1000 == 0:
            print counter
        counter += 1
        if len(res['a']) > config.minAbsLength:
            #combine title and abstract
            title_abs = res['t'] + res['a']
            pid = res['pid']
            types = nltk_functions.tokenise_and_lemm(title_abs)
            bigrams, trigrams = nltk_functions.bigrams_and_trigrams(" ".join(types))
            for s in set(types):
                if s not in ignoreList:
                    #add to pub-concept dic
                    pubConceptDic[pid][s] = 'type'
                    if s in typeDic:
                        typeDic[s] += 1
                    else:
                        typeDic[s] = 1
            for s in set(bigrams):
                if len(set(s).intersection(ignoreList)) == 0:
                    ss = s[0] + ':' + s[1]
                    #add to pub-concept dic
                    pubConceptDic[pid][ss] = 'bigram'
                    if ss in bigramDic:
                        bigramDic[ss] += 1
                    else:
                        bigramDic[ss] = 1
            for s in set(trigrams):
                if len(set(s).intersection(ignoreList)) == 0:
                    ss = s[0] + ':' + s[1] + ':' + s[2]
                    #add to pub-concept dic
                    pubConceptDic[pid][ss] = 'trigram'
                    if ss in trigramDic:
                        trigramDic[ss] += 1
                    else:
                        trigramDic[ss] = 1
    for t in sorted(typeDic, key=typeDic.get, reverse=True):
        o1.write(t + '\t' + str(typeDic[t]) + '\n')
    for t in sorted(bigramDic, key=bigramDic.get, reverse=True):
        o2.write(t + '\t' + str(bigramDic[t]) + '\n')
    for t in sorted(trigramDic, key=trigramDic.get, reverse=True):
        o3.write(t + '\t' + str(trigramDic[t]) + '\n')
    o4 = open(outDir + '/pub_concept_' + str(year) + '.txt', 'w')
    l = len(pubConceptDic)
    counter = 0
    for p in pubConceptDic:
        if counter % 1000 == 0:
            print counter, l
        counter += 1
        for c in pubConceptDic[p]:
            o4.write(str(p) + '\t' + c + "\t" + pubConceptDic[p][c] + '\n')
    session.close()
def org_frequencies(year):
    print "Getting org frequencies"
    session = neo4j_functions.connect()
    o1 = open(outDir + '/org_type_frequencies_' + str(year) + '.txt', 'w')
    o1.write('org\ttype\tcount\n')
    o2 = open(outDir + '/org_bigram_frequencies_' + str(year) + '.txt', 'w')
    o2.write('org\tbigram\tcount\n')
    o3 = open(outDir + '/org_trigram_frequencies_' + str(year) + '.txt', 'w')
    o3.write('org\ttrigram\tcount\n')
    com = "match (o:Org)--(s:Staff)--(p:Publication) where p.pub_year <= " + str(year) + " return distinct o.short_name as o1, o.code as o2, p.pub_id as pid;"
    typeDic = defaultdict(dict)
    bigramDic = defaultdict(dict)
    trigramDic = defaultdict(dict)
    counter = 0
    ignoreList = ignore_tokens()
    #read in type freqs for filtering
    typeConceptDic = typeFreqs(year)
    #get pubConceptDic
    pubConceptDic = readPubConcepts(year)
    for res in session.run(com):
        if counter % 1000 == 0:
            print counter
        counter += 1
        org = res['o1'] + ':' + res['o2']
        pid = str(res['pid'])
        for s in pubConceptDic[pid]:
            if s in ignoreList:
                continue
            if pubConceptDic[pid][s] == 'type':
                #skip types above the background frequency cutoff
                if int(typeConceptDic[s]) > freqCutoff:
                    pass
                else:
                    if s in typeDic[org]:
                        typeDic[org][s] += 1
                    else:
                        typeDic[org][s] = 1
            elif pubConceptDic[pid][s] == 'bigram':
                if s in bigramDic[org]:
                    bigramDic[org][s] += 1
                else:
                    bigramDic[org][s] = 1
            elif pubConceptDic[pid][s] == 'trigram':
                if s in trigramDic[org]:
                    trigramDic[org][s] += 1
                else:
                    trigramDic[org][s] = 1
    for p in sorted(typeDic, key=typeDic.get, reverse=True):
        for t in sorted(typeDic[p], key=typeDic[p].get, reverse=True):
            o1.write(str(p) + '\t' + t + '\t' + str(typeDic[p][t]) + '\n')
    for p in sorted(bigramDic, key=bigramDic.get, reverse=True):
        for t in sorted(bigramDic[p], key=bigramDic[p].get, reverse=True):
            o2.write(str(p) + '\t' + t + '\t' + str(bigramDic[p][t]) + '\n')
    for p in sorted(trigramDic, key=trigramDic.get, reverse=True):
        for t in sorted(trigramDic[p], key=trigramDic[p].get, reverse=True):
            o3.write(str(p) + '\t' + t + '\t' + str(trigramDic[p][t]) + '\n')
    session.close()
def person_frequencies(year):
    print "Getting person frequencies"
    session = neo4j_functions.connect()
    o1 = open(outDir + '/person_type_frequencies_' + str(year) + '.txt', 'w')
    o1.write('person\ttype\tcount\n')
    o2 = open(outDir + '/person_bigram_frequencies_' + str(year) + '.txt', 'w')
    o2.write('person\tbigram\tcount\n')
    o3 = open(outDir + '/person_trigram_frequencies_' + str(year) + '.txt', 'w')
    o3.write('person\ttrigram\tcount\n')
    #com = "match (s:Staff)--(p:Publication) where p.pub_id = 2942913 return distinct s.published_name as p1, s.person_id as p2, p.pub_id as pid;"
    com = "match (s:Staff)--(p:Publication) where p.pub_year <= " + str(year) + " return distinct s.published_name as p1, s.person_id as p2, p.pub_id as pid;"
    typeDic = defaultdict(dict)
    bigramDic = defaultdict(dict)
    trigramDic = defaultdict(dict)
    ignoreList = ignore_tokens()
    counter = 0
    #read in type freqs for filtering
    typeConceptDic = typeFreqs(year)
    #get pubConceptDic
    pubConceptDic = readPubConcepts(year)
    for res in session.run(com):
        if counter % 1000 == 0:
            print counter
        counter += 1
        person_name = res['p1']
        person_id = res['p2']
        pid = str(res['pid'])
        person = person_name + ':' + str(person_id)
        for s in pubConceptDic[pid]:
            if s in ignoreList:
                continue
            if pubConceptDic[pid][s] == 'type':
                #skip types above the background frequency cutoff
                if int(typeConceptDic[s]) > freqCutoff:
                    pass
                else:
                    if s in typeDic[person]:
                        typeDic[person][s] += 1
                    else:
                        typeDic[person][s] = 1
            elif pubConceptDic[pid][s] == 'bigram':
                if s in bigramDic[person]:
                    bigramDic[person][s] += 1
                else:
                    bigramDic[person][s] = 1
            elif pubConceptDic[pid][s] == 'trigram':
                if s in trigramDic[person]:
                    trigramDic[person][s] += 1
                else:
                    trigramDic[person][s] = 1
    for p in sorted(typeDic, key=typeDic.get, reverse=True):
        for t in sorted(typeDic[p], key=typeDic[p].get, reverse=True):
            o1.write(str(p) + '\t' + t + '\t' + str(typeDic[p][t]) + '\n')
    for p in sorted(bigramDic, key=bigramDic.get, reverse=True):
        for t in sorted(bigramDic[p], key=bigramDic[p].get, reverse=True):
            o2.write(str(p) + '\t' + t + '\t' + str(bigramDic[p][t]) + '\n')
    for p in sorted(trigramDic, key=trigramDic.get, reverse=True):
        for t in sorted(trigramDic[p], key=trigramDic[p].get, reverse=True):
            o3.write(str(p) + '\t' + t + '\t' + str(trigramDic[p][t]) + '\n')
    session.close()
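# The three frequency functions above repeat the same "if key in dic" counting
# pattern; as an illustrative aside (an assumption, not a drop-in replacement),
# collections.Counter expresses the same bookkeeping more compactly.
from collections import Counter, defaultdict

def count_concepts(pairs):
    # pairs: iterable of (entity, concept) tuples -> {entity: Counter}
    counts = defaultdict(Counter)
    for entity, concept in pairs:
        counts[entity][concept] += 1
    return counts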
#import matplotlib
#matplotlib.style.use('ggplot')
from csv import reader
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import wordnet
from textstat.textstat import textstat
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english")
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()
bigram_measures = nltk.collocations.BigramAssocMeasures()
trigram_measures = nltk.collocations.TrigramAssocMeasures()
import neo4j_functions
session = neo4j_functions.connect()

#analyse output
# awk -F '\t' '$2>50' output/nltk_counts.txt | sort -t$'\t' -k4 -nr | less
#http://www.nltk.org/book/ch01.html

def content_fraction(text):
    stopwords = nltk.corpus.stopwords.words('english')
    content = [w for w in text if w.lower() not in stopwords]
    # float() avoids Python 2 integer division, which would round the fraction to 0
    return len(content) / float(len(text))

def filter_stopwords_and_length(text, wordLength):
    stopwords = nltk.corpus.stopwords.words('english')