def collect_data(d):
    """Assemble a per-topic feature DataFrame for domain `d`.

    Joins topic labels, lifetime (max/min year), weighted/unweighted
    citations, top keywords, paper/venue/author counts, median author
    h-index and median cosine similarities, then attaches fixed 3-D
    plotting coordinates per domain.

    Args:
        d: domain key (e.g. "se", "ai", "db", "os") or "all".

    Returns:
        pandas.DataFrame with one row per topic in `d`.
    """
    conn, cursor = db.dbConnect()
    quartiles = u.quartile_calculation(d)
    threshold = quartiles[2]  # use the 3rd quartile as probability cutoff

    # topic_id, label, domain
    rows = db.dbExecute(cursor, db.get_topic_labels(d))
    df1 = pd.DataFrame({'id': u.col(rows, 0),
                        'label': u.col(rows, 1),
                        'domain': [d] * c.domain_topics[d]})

    # topic max/min years and lifetime
    rows = db.dbExecute(cursor, db.get_maxmin_years(d, threshold))
    df2 = pd.DataFrame({'id': u.col(rows, 0),
                        'max.yr': u.col(rows, 1),
                        'min.yr': u.col(rows, 2)})
    df2['lifetime'] = df2['max.yr'] - df2['min.yr']
    df1 = pd.merge(df1, df2, how='inner')

    # weighted citations
    rows = db.dbExecute(cursor, db.get_weighted_citations(d, threshold))
    df2 = pd.DataFrame({'id': u.col(rows, 0), 'w.cites': u.col(rows, 1)})
    df1 = pd.merge(df1, df2, how='inner')

    # unweighted citations
    rows = db.dbExecute(cursor, db.get_unweighted_citations(d, threshold))
    df2 = pd.DataFrame({'id': u.col(rows, 0), 'nw.cites': u.col(rows, 1)})
    df1 = pd.merge(df1, df2, how='inner')

    # topic keywords (comma replaced by '-' in collect_kws)
    rows = collect_kws(d)
    df2 = pd.DataFrame({'id': rows[0], 'ten.kwd': rows[1]})
    df1 = pd.merge(df1, df2, how='inner')

    # topic paper counts
    rows = db.dbExecute(cursor, db.get_topic_papers(d, threshold))
    df2 = pd.DataFrame({'id': u.col(rows, 0), 'papers': u.col(rows, 1)})
    df1 = pd.merge(df1, df2, how='inner')

    # topic venue counts
    rows = db.dbExecute(cursor, db.get_topic_venues(d, threshold))
    df2 = pd.DataFrame({'id': u.col(rows, 0), 'venues': u.col(rows, 1)})
    df1 = pd.merge(df1, df2, how='inner')

    # topic author counts
    rows = db.dbExecute(cursor, db.get_topic_authors(d, threshold))
    df2 = pd.DataFrame({'id': u.col(rows, 0), 'authors': u.col(rows, 1)})
    df1 = pd.merge(df1, df2, how='inner')

    # median author h-index per topic
    rows = collect_hi_med(d, threshold)
    df2 = pd.DataFrame({'id': rows[0], 'med.hindex': rows[1]})
    df1 = pd.merge(df1, df2, how='inner')

    # median cosine similarity: intra always; inter only per-domain runs
    rows = collect_cosim_med(d)
    # BUG FIX: the original tested the undefined name `domain`; the
    # function's parameter is `d`.
    if d != "all":
        df2 = pd.DataFrame({'id': rows[0],
                            'med.cosim.intra': rows[1],
                            'med.cosim.inter': rows[2]})  # WITH INTER
    else:
        df2 = pd.DataFrame({'id': rows[0],
                            'med.cosim.intra': rows[1]})  # ALL TOPICS
    df1 = pd.merge(df1, df2, how="inner")

    # Fixed per-domain coordinates for 3-D plotting
    df1["d.x"] = df1.apply(lambda row: c.domain_cord[row.domain][0], axis=1)
    df1["d.y"] = df1.apply(lambda row: c.domain_cord[row.domain][1], axis=1)
    df1["d.z"] = df1.apply(lambda row: c.domain_cord[row.domain][2], axis=1)
    return df1
def topic_domain_analysis():
    '''
    Analyze topic-domain distributions of papers in soup approach.

    Builds a per-topic count of papers in each domain (se/db/ai/os) above a
    probability threshold, computes Gini coefficients per domain and overall,
    renders box/dist plots per numeric column, and writes the table plus the
    gini row to a CSV under stats_data/.
    '''
    conn, cursor = db.dbConnect()
    quartiles = u.paper_topic_percentile("all", p=10)
    threshold = quartiles[c.threshold_quartile]
    sql = db.get_topic_domain_dist("all", threshold)
    cursor.execute(sql)
    t1 = time.time()
    rows = cursor.fetchall()

    topic_lists = {"se": [], "db": [], "ai": [], "os": []}
    topics = []
    # Rows are grouped by topic id: accumulate per-domain counts for the
    # current topic and flush them whenever the topic id changes.
    current = {"se": 0, "db": 0, "ai": 0, "os": 0}
    for row in rows:
        if not topics:
            topics.append(row[0])
        if topics[-1] != row[0] and topics:
            for d in current:
                topic_lists[d].append(current[d])
            topics.append(row[0])
            current = {"se": 0, "db": 0, "ai": 0, "os": 0}
        current[row[1]] = int(row[2])
    for d in current:
        topic_lists[d].append(current[d])  # flush the last topic

    # Convert int to 3 digit int
    topics = u.n_digit_list(topics)

    df = pd.DataFrame({'topic_id': topics,
                       'se': topic_lists["se"],
                       'os': topic_lists["os"],
                       'db': topic_lists["db"],
                       'ai': topic_lists["ai"]})
    df["total"] = df.sum(axis=1)

    gini = {}
    for d in ["se", "ai", "db", "os"]:
        gini[d] = u.gini(topic_lists[d])
    gini["total"] = u.gini(df["total"].tolist())
    print(gini)

    # Generate dist/box plots for each numeric column
    for col in df.select_dtypes(include=[np.number]).columns:
        data = df[col].tolist()
        u.box_plot(data, "stats_data/desc/" + col + "_soup_boxplot.png")
        u.dist_plot(data, "stats_data/desc/" + col + "_soup_distplot.png")

    # Save distribution for each topic in csv
    df = df[['topic_id', 'se', 'ai', 'db', 'os', 'total']]
    out_path = ('stats_data/TopicDomain_' + c.query_name + '_Q'
                + str(c.threshold_quartile) + '.csv')
    print("saving domain topic analysis results in " + out_path)
    df.to_csv(out_path, sep=',', index=False)
    # BUG FIX: gini values are numeric; the original concatenated them to a
    # str and raised TypeError. Also close the file via a with-block.
    with open(out_path, "a") as f:
        f.write("\ngini," + ",".join(
            str(gini[k]) for k in ["se", "ai", "db", "os", "total"]) + "\n")
def generate_network():
    """For each domain, build the co-authorship graph restricted to authors
    that have topic data, compute graph metrics, and save the merged
    dataframe to 'dataframe_<domain>_<ntopics>.csv'.
    """
    for domain in c.domains:
        file = ('sterling_no_' + domain + '_'
                + str(c.domain_topics[domain]) + '.csv')
        if not path.exists(file):
            print("Generating topics ans sterling numbers")
            sterling()
        df = pd.read_csv(file)
        authors = np.array(df['author_id'])
        print(len(authors))

        # Get graph edges.
        # BUG FIX: the original called dbcon.dbConnect() twice, leaking the
        # first (discarded) connection; connect once.
        con, cur = dbcon.dbConnect()
        print('Connection status:', con.is_connected())
        # NOTE(review): the query is hard-coded to the _SE tables even though
        # we iterate over all domains — confirm whether it should use `domain`.
        sql = "SELECT p1.paper_ID, p1.author_ID a1, p2.author_ID a2 \
            FROM Paper_Author_Affiliations_SE p1 \
            INNER JOIN Paper_Author_Affiliations_SE p2 ON p1.paper_ID = p2.paper_ID where p1.author_ID <> p2.author_ID;"
        graph = dbcon.dbExecute(cur, sql)
        graph = pd.DataFrame(graph, columns=['paper_ID', 'auth1', 'auth2'])

        # Remove authors with topics < 1
        graph = graph[graph.auth1.isin(authors) & graph.auth2.isin(authors)]

        # Collapse repeated co-authorships into weighted edges; normalize
        # each edge to a sorted tuple so (a, b) and (b, a) match.
        graph = graph.groupby(by=['auth1', 'auth2']).size().reset_index(
            name='count')
        graph['edges'] = list(zip(graph['auth1'], graph['auth2']))
        graph[['edges']] = graph['edges'].apply(sorted)
        graph[['edges']] = graph['edges'].apply(tuple)
        # For undirected graph drop duplicate (mirrored) edges
        undir_graph = graph.drop_duplicates(subset='edges', keep='first')
        print(len(graph), len(undir_graph))
        print(undir_graph.head())

        print("Creating graph")
        metrics = get_graph_metrics(authors, undir_graph)
        df = df.merge(metrics, on=('author_id'))
        df.to_csv('dataframe_' + domain + '_'
                  + str(c.domain_topics[domain]) + '.csv', index=False)
def DocTopic_dist():
    '''Generate a distribution plot of topic-paper probabilities for the
    entire corpus.

    Fetches every topic-paper probability, computes decile percentiles,
    and writes a distribution plot under stats_data/desc/.
    '''
    connection, cur = db.dbConnect()
    query = db.get_topic_paper_probs("all")
    cur.execute(query)
    probs = cur.fetchall()
    # Deciles (computed as before; the values themselves are not consumed)
    u.percentile(probs, p=10)
    u.dist_plot(probs,
                "stats_data/desc/DocTopic_" + c.query_name + "_distplot.png")
def get_topic_labels():
    """Return a flat list of topic labels across all domains.

    The list is indexed by u.sim_mat_indexer(domain, topic_id), i.e. it is
    aligned with the similarity matrix produced by generate_sim_matrix().
    """
    conn, cursor = db.dbConnect()
    # BUG FIX: the original first built a dict-of-dicts and immediately
    # overwrote it with this list; the dead assignment is removed.
    labels = [""] * sum(c.domain_topics[domain] for domain in c.domains)
    for domain in c.domains:
        sql = db.get_topic_labels(domain)
        cursor.execute(sql)
        rows = cursor.fetchall()
        for row in rows:
            labels[u.sim_mat_indexer(domain, u.fmt_tid(row[0]))] = row[1]
    conn.close()
    return labels
def collect_hi_med(domain, threshold):
    """Collect, per topic of `domain`, the median h-index of its authors.

    Returns a two-element list: [topic ids, median h-index values],
    parallel by position.
    """
    topic_ids = []
    medians = []
    conn, cursor = db.dbConnect()
    for topic in range(c.domain_topics[domain]):
        cursor.execute(db.get_author_hi(domain, threshold, topic))
        rows = cursor.fetchall()
        topic_ids.append(rows[0][0])
        medians.append(np.median(u.col(rows, 1)))
    cursor.close()
    conn.close()
    return [topic_ids, medians]
def collect_kws(domain):
    """Collect, per topic of `domain`, its keyword string.

    Commas inside the keyword string are replaced with '-' so the value is
    safe to embed in a CSV cell. Returns [topic ids, keyword strings],
    parallel by position.
    """
    topic_ids = []
    keywords = []
    conn, cursor = db.dbConnect()
    for topic in range(c.domain_topics[domain]):
        cursor.execute(db.get_topic_kws(domain, topic))
        rows = cursor.fetchall()
        topic_ids.append(rows[0][0])
        keywords.append(rows[0][1].replace(",", "-"))
    cursor.close()
    conn.close()
    return [topic_ids, keywords]
def sterling():
    """For each domain: gather per-author stats, find (or load cached)
    author-topic assignments, compute versatility (Stirling) numbers, merge
    citation totals, and save authors with >0 topics to
    'sterling_no_<domain>_<ntopics>.csv'.

    Uses the module-level `thresh` as the topic-probability cutoff.
    """
    global thresh
    # BUG FIX: the original opened a connection, discarded it (leak), then
    # opened a second one; connect exactly once.
    con, cur = dbcon.dbConnect()
    print('Connection status:', con.is_connected())
    for domain in c.domains:
        # Per-author paper/active-year/venue counts plus h-index
        sql = 'select pa.author_ID, count(distinct p.paper_ID) count, count(distinct paper_published_year) active_years, count(distinct paper_venue_ID) venue,a.author_h_index from \
            Paper_Author_Affiliations_' + domain + ' pa,Papers_' + domain + ' p, Authors_' + domain + ' a \
            where p.paper_ID =pa.paper_ID and a.author_ID = pa.author_ID group by pa.author_ID;'
        authors = dbcon.dbExecute(cur, sql)

        file = ('author_topic_' + domain + '_'
                + str(c.domain_topics[domain]) + '.json')
        if not path.exists(file):
            print('Finding topics for %d authors' % len(authors))
            author_topic = get_author_topics(authors, cur, str(thresh))
            # BUG FIX: removed the redundant f.close() inside the with-block.
            with open(file, 'w') as f:
                json.dump(author_topic, f)
        with open(file, 'r') as f:
            author_topic = json.load(f)

        sterling_data = calc_versetaility(cur, authors, author_topic, domain)
        df = pd.DataFrame(sterling_data)

        # Author citation totals
        sql = 'select author_ID,sum(cites) from Paper_Author_Affiliations_' + domain + ' p, \
            (select paper_ID, count(*) cites from Paper_Citations_' + domain + ' group by paper_ID) t \
            where t.paper_ID = p.paper_ID \
            group by author_ID;'
        cites = dbcon.dbExecute(cur, sql)
        cites = pd.DataFrame(cites, columns=['author_id', 'total_cites'])
        df = df.merge(cites, on='author_id', how='left').fillna(0)

        df[df['total_topics'] > 0].to_csv(
            'sterling_no_' + domain + '_'
            + str(c.domain_topics[domain]) + '.csv', index=False)
        print(df.corr(method='spearman'))
def quartile_calculation(d, threshold=0):
    '''
    Calculates quartiles from probability distributions of paper topic
    relation for domain `d`.
    Return: array of the three quartile values
    '''
    connection, cur = db.dbConnect()
    query = db.get_topic_paper_probs(d, threshold)
    print(query)
    cur.execute(query)
    prob_rows = cur.fetchall()
    quartiles = percentile(prob_rows, 25)  # 25/50/75th percentiles
    print(d, "quartiles:", quartiles)
    connection.close()
    return quartiles
def paper_topic_percentile(d, p=25, threshold=0): ''' Calculates quartiles from probability distributions of paper topic relation for. Return: array of the percentile values ''' conn, cursor = db.dbConnect() sql = db.get_topic_paper_probs(d, threshold) print sql cursor.execute(sql) rows = cursor.fetchall() #print rows percentiles = percentile(rows, p=p) # percentiles print d, "percentiles:", percentiles conn.close() return percentiles
def topic_paper_analysis(): ''' Analyze topic paper distribution in partitioned approach ''' conn,cursor = db.dbConnect() threshold = 0.001 tcount = [] for d in ["se","os","ai","db"]: sql = db.get_topic_domain_dist(d,threshold) cursor.execute(sql) t1 = time.time() rows = cursor.fetchall() dcount = [] for row in rows: dcount.append(int(row[2])) tcount.append(int(row[2])) print "Gini for",d,"is",u.gini(dcount) print "Gini for journal:",u.gini(tcount)
def generate_sim_matrix():
    """Build the (n x n) cosine-similarity matrix between all topics across
    all domains, where n is the total number of topics.

    Phase 1 accumulates keyword-vector dot products for every unordered
    topic pair (mirroring into the symmetric cell); phase 2 divides each
    row/column by the L2 norm of its topic's keyword vector.

    Returns:
        numpy.ndarray of shape (n, n).
    """
    start_time = time.time()
    n = sum(c.domain_topics[domain] for domain in c.domains)
    sim_matrix = np.zeros((n, n))

    conn, cursor = db.dbConnect()
    # Keyword-vector payloads can exceed the default MySQL packet size.
    cursor.execute('set global max_allowed_packet=671088640000')

    # Dot products: iterate over the upper triangle of domain pairs; each
    # cell is filled at most once (the `== 0` guard skips work already done).
    # CLEANUP: removed an unused `dot` dict, a commented-out duplicate of
    # this loop ("old way"), and a dead triple-quoted "OLD METHOD" block.
    for d1 in c.domains:
        for d2 in c.domains[c.domains.index(d1):]:
            print("calculating ", d1, "X", d2)
            for t1 in range(0, c.domain_topics[d1]):
                for t2 in range(0, c.domain_topics[d2]):
                    i1 = u.sim_mat_indexer(d1, t1)
                    i2 = u.sim_mat_indexer(d2, t2)
                    if sim_matrix[i1][i2] == 0:
                        sql = db.keyword_dot(d1, "Topic_" + str(t1),
                                             d2, "Topic_" + str(t2))
                        cursor.execute(sql)
                        for row in cursor.fetchall():
                            mul = float(row[0])
                            sim_matrix[i1][i2] += mul
                            if i1 != i2:
                                sim_matrix[i2][i1] += mul
    # BUG FIX: close the cursor before its connection (the original closed
    # the connection first, then the cursor of a dead connection).
    cursor.close()
    conn.close()

    conn, cursor = db.dbConnect()
    # Normalize: divide each topic's row and column by its vector's L2 norm.
    for d1 in c.domains:
        sql = db.get_topic_l2(d1)
        t2 = time.time()
        cursor.execute(sql)
        print("SQL query takes ", (time.time() - t2) / 60, "mins")
        rows = cursor.fetchall()
        for row in rows:
            t = u.sim_mat_indexer(d1, u.fmt_tid(row[0]))
            mod = np.sqrt(float(row[1]))
            if mod != 0:
                sim_matrix[t, :] /= mod
                sim_matrix[:, t] /= mod
            else:
                print("ZERO!!!")
    conn.close()

    print(
        "--- time for sim matrix", sim_matrix.shape, " generation " + str(
            (time.time() - start_time) / 60) + " minutes ---")
    return sim_matrix