Code Example #1

# Shared imports for the snippets below; db/dbcon, u, and c are assumed
# project-local helpers (database access, utilities, and configuration).
import json
import time
from os import path
import numpy as np
import pandas as pd
def collect_data(d):
	conn,cursor = db.dbConnect()
	quartiles = u.quartile_calculation(d)
	threshold = quartiles[2]
	#topic_id,label,domain
	rows = db.dbExecute(cursor,db.get_topic_labels(d))
	df1 = pd.DataFrame({'id':u.col(rows,0),'label':u.col(rows,1),'domain':[d]*c.domain_topics[d]})
	#topic max min years
	rows = db.dbExecute(cursor,db.get_maxmin_years(d,threshold))
	df2 = pd.DataFrame({'id':u.col(rows,0),'max.yr':u.col(rows,1),'min.yr':u.col(rows,2)})
	df2['lifetime']=df2['max.yr']-df2['min.yr']
	df1 = pd.merge(df1,df2, how='inner')
	#print "2:",df1
	#topic citations
	rows = db.dbExecute(cursor,db.get_weighted_citations(d,threshold))
	df2 = pd.DataFrame({'id':u.col(rows,0),'w.cites':u.col(rows,1) })
	df1 = pd.merge(df1,df2, how='inner')
	#unweighted
	rows = db.dbExecute(cursor,db.get_unweighted_citations(d,threshold))
	df2 = pd.DataFrame({'id':u.col(rows,0),'nw.cites':u.col(rows,1) })
	df1 = pd.merge(df1,df2, how='inner')
	#topic kws
	rows = collect_kws(d)
	df2 = pd.DataFrame({'id':rows[0],'ten.kwd':rows[1]})
	df1 = pd.merge(df1,df2, how='inner')
	#topic papers
	rows = db.dbExecute(cursor,db.get_topic_papers(d,threshold))
	df2 = pd.DataFrame({'id':u.col(rows,0),'papers':u.col(rows,1) })
	df1 = pd.merge(df1,df2, how='inner')
	#topic venues
	rows = db.dbExecute(cursor,db.get_topic_venues(d,threshold))
	df2 = pd.DataFrame({'id':u.col(rows,0),'venues':u.col(rows,1) })
	df1 = pd.merge(df1,df2,how="inner")
	#topic authors
	rows = db.dbExecute(cursor,db.get_topic_authors(d,threshold))
	df2 = pd.DataFrame({'id':u.col(rows,0),'authors':u.col(rows,1) })
	df1 = pd.merge(df1,df2,how="inner")
	#topic authors median hi
	rows = collect_hi_med(d,threshold)
	df2 = pd.DataFrame({'id':rows[0],'med.hindex':rows[1]})
	df1 = pd.merge(df1,df2,how="inner")
	#cosim intra,inter
	rows = collect_cosim_med(d)
	if d != "all":
		df2 = pd.DataFrame({'id':rows[0],'med.cosim.intra':rows[1] ,'med.cosim.inter':rows[2]}) # WITH INTER
	else:
		df2 = pd.DataFrame({'id':rows[0],'med.cosim.intra':rows[1] })#,'cosim.mat.indx':rows[3]}) # ALL TOPICS
	df1 = pd.merge(df1,df2,how="inner")
	#RHL cit,pub
	#df2 = collect_rhl(d)
	#df1 = pd.merge(df1,df2,how="inner")
	#df2 = collect_net_intra(d)
	#df1 = pd.merge(df1,df2,how="left",on=['domain','label'])
	#Domain coordinates
	df1["d.x"] = df1.apply(lambda row: c.domain_cord[row.domain][0], axis=1)
	df1["d.y"] = df1.apply(lambda row: c.domain_cord[row.domain][1], axis=1)
	df1["d.z"] = df1.apply(lambda row: c.domain_cord[row.domain][2], axis=1)
	return df1
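A plausible usage sketch for collect_data, assuming c.domains lists the domain codes used above; the output filename is hypothetical:

frames = [collect_data(d) for d in c.domains]
features = pd.concat(frames, ignore_index=True)
features.to_csv('stats_data/topic_features.csv', index=False)  # hypothetical path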
def topic_domain_analysis():
	'''
	Analyze topic-domain distributions of papers in the soup approach
	'''
	conn,cursor = db.dbConnect()
	percentiles = u.paper_topic_percentile("all",p=10)
	threshold = percentiles[c.threshold_quartile]
	
	#threshold = 0.001

	#print str(c.threshold_quartile+1),"th threshold is",threshold
	
	sql = db.get_topic_domain_dist("all",threshold)
	#print "sql",sql
	cursor.execute(sql)
	t1 = time.time()
	rows = cursor.fetchall()
	topic_lists = {"se":[],"db":[],"ai":[],"os":[]}
	topics = []
	# generate domain wise columns 
	current = {"se":0,"db":0,"ai":0,"os":0}
	for row in rows:
		if not topics:
			topics.append(row[0])
		if topics[-1]!=row[0] and topics:
			for d in current: 
				topic_lists[d].append(current[d])
			topics.append(row[0])
			current = {"se":0,"db":0,"ai":0,"os":0}
		current[row[1]]=int(row[2])
	for d in current: topic_lists[d].append(current[d]) #last topic

	#Zero-pad topic ids to 3-digit strings
	topics = u.n_digit_list(topics)
	#Make dataframe
	df = pd.DataFrame({'topic_id':topics,'se':topic_lists["se"],'os':topic_lists["os"],'db':topic_lists["db"],'ai':topic_lists["ai"]})
	
	df["total"] = df.sum(axis=1)
	gini = {}
	for d in ["se","ai","db","os"]:
		gini[d]=u.gini(topic_lists[d])
	gini["total"] = u.gini(df["total"].tolist())

	print(gini)
	#Generate dist/box plots for each domain
	for col in df.select_dtypes(include=[np.number]).columns:
		data = df[col].tolist()
		u.box_plot(data,"stats_data/desc/"+col+"_soup_boxplot.png")
		u.dist_plot(data,"stats_data/desc/"+col+"_soup_distplot.png")

	#Save distribution for each topic in csv
	df = df[['topic_id','se','ai','db','os','total']]
	print "saving domain topic analysis results in ",'stats_data/TopicDomain_'+c.query_name+'_Q'+str(c.threshold_quartile)+'.csv'
	df.to_csv('stats_data/TopicDomain_'+c.query_name+'_Q'+str(c.threshold_quartile)+'.csv',sep=',',index=False)
	f = open('stats_data/TopicDomain_'+c.query_name+'_Q'+str(c.threshold_quartile)+'.csv', "a")
	f.write("\ngini,"+gini["se"]+","+gini["ai"]+","+gini["db"]+","+gini["os"]+","+gini["total"]+"\n")
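u.gini is not shown in these snippets; for reference, a minimal sketch of a standard Gini coefficient over a non-negative count vector (the helper's actual implementation may differ):

import numpy as np

def gini(values):
    # Standard Gini coefficient: 0 = perfectly even distribution,
    # values near 1 = counts concentrated in a few topics.
    x = np.sort(np.asarray(values, dtype=float))
    n = len(x)
    if n == 0 or x.sum() == 0:
        return 0.0
    ranks = np.arange(1, n + 1)
    return (2 * (ranks * x).sum() - (n + 1) * x.sum()) / (n * x.sum())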
def generate_network():

    for domain in c.domains:
        file = 'sterling_no_' + domain + '_' + str(
            c.domain_topics[domain]) + '.csv'
        if not path.exists(file):
            print("Generating topics ans sterling numbers")
            sterling()
        df = pd.read_csv(file)
        authors = np.array(df['author_id'])
        print(len(authors))

        # Get graph edges
        con, cur = dbcon.dbConnect()
        print('Connection status:', con.is_connected())
        sql = "SELECT p1.paper_ID, p1.author_ID a1, p2.author_ID a2 \
            FROM Paper_Author_Affiliations_SE p1 \
            INNER JOIN Paper_Author_Affiliations_SE p2 ON p1.paper_ID = p2.paper_ID where p1.author_ID <> p2.author_ID;"

        graph = dbcon.dbExecute(cur, sql)
        graph = pd.DataFrame(graph, columns=['paper_ID', 'auth1', 'auth2'])

        # Remove authors with topics < 1
        graph = graph[graph.auth1.isin(authors) & graph.auth2.isin(authors)]

        # Create edges
        graph = graph.groupby(by=['auth1', 'auth2']).size().reset_index(
            name='count')
        graph['edges'] = list(zip(graph['auth1'], graph['auth2']))
        graph['edges'] = graph['edges'].apply(lambda e: tuple(sorted(e)))

        # For an undirected graph, drop duplicate edges
        undir_graph = graph.drop_duplicates(subset='edges', keep='first')
        print(len(graph), len(undir_graph))
        print(undir_graph.head())
        print("Creating graph")
        metrics = get_graph_metrics(authors, undir_graph)
        df = df.merge(metrics, on='author_id')
        df.to_csv('dataframe_' + domain + '_' + str(c.domain_topics[domain]) +
                  '.csv',
                  index=False)
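get_graph_metrics is defined elsewhere in the repository; a minimal networkx sketch of the kind of per-author metrics it might return (the metric set and column names other than author_id are assumptions):

import networkx as nx
import pandas as pd

def get_graph_metrics(authors, undir_graph):
    # Undirected co-authorship graph; 'count' (papers co-authored)
    # becomes the edge weight.
    G = nx.Graph()
    G.add_nodes_from(authors)
    G.add_weighted_edges_from(
        zip(undir_graph['auth1'], undir_graph['auth2'], undir_graph['count']))
    degree = dict(G.degree())
    clustering = nx.clustering(G)
    return pd.DataFrame({
        'author_id': list(G.nodes()),
        'degree': [degree[a] for a in G.nodes()],
        'clustering': [clustering[a] for a in G.nodes()],
    })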
def DocTopic_dist():
	'''
	This function generates a box plot and other descriptive metrics of the topic-paper probability distribution for the entire corpus
	'''
	conn,cursor = db.dbConnect()
	sql = db.get_topic_paper_probs("all")
	cursor.execute(sql)
	rows = cursor.fetchall()
	percentiles = u.percentile(rows,p=10)
	print("deciles:", percentiles)
	u.dist_plot(rows,"stats_data/desc/DocTopic_"+c.query_name+"_distplot.png")
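u.box_plot and u.dist_plot are used throughout to write descriptive plots to disk; a plausible matplotlib sketch, assuming the helpers take (data, output_path):

import matplotlib
matplotlib.use('Agg')  # render to files; no display needed
import matplotlib.pyplot as plt

def box_plot(data, out_path):
    fig, ax = plt.subplots()
    ax.boxplot(data)
    fig.savefig(out_path)
    plt.close(fig)

def dist_plot(data, out_path, bins=50):
    # A plain histogram stands in for a full KDE distribution plot.
    fig, ax = plt.subplots()
    ax.hist(data, bins=bins)
    fig.savefig(out_path)
    plt.close(fig)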
Code Example #5
def get_topic_labels():
    conn, cursor = db.dbConnect()
    labels = [""] * sum(c.domain_topics[domain] for domain in c.domains)
    for domain in c.domains:
        sql = db.get_topic_labels(domain)
        cursor.execute(sql)
        rows = cursor.fetchall()
        for row in rows:
            labels[u.sim_mat_indexer(domain, u.fmt_tid(row[0]))] = row[1]
    conn.close()
    return labels
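u.sim_mat_indexer maps a (domain, topic) pair to a unique position in the flattened label list (and, in Code Example #12, the similarity matrix); a plausible sketch, assuming domains are laid out contiguously in c.domains order:

def sim_mat_indexer(domain, topic_id):
    # Offset each domain's topics by the topic counts of all domains
    # preceding it in c.domains (c as in the snippets above).
    offset = 0
    for d in c.domains:
        if d == domain:
            return offset + topic_id
        offset += c.domain_topics[d]
    raise ValueError("unknown domain: %s" % domain)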
Code Example #6
def collect_hi_med(domain,threshold):
	data = [[],[]]
	conn,cursor = db.dbConnect()
	for i in range(c.domain_topics[domain]):
		sql = db.get_author_hi(domain,threshold,i)
		cursor.execute(sql)
		rows = cursor.fetchall()
		data[0].append(rows[0][0])
		data[1].append(np.median(u.col(rows,1)))
	cursor.close()
	conn.close()
	return data
Code Example #7
def collect_kws(domain):
	data = [[],[]]
	conn,cursor = db.dbConnect()
	for i in range(c.domain_topics[domain]):
		sql = db.get_topic_kws(domain,i)
		cursor.execute(sql)
		rows = cursor.fetchall()
		#print rows
		data[0].append(rows[0][0])
		data[1].append(rows[0][1].replace(",","-"))
	cursor.close()
	conn.close()
	return data
Code Example #8
def sterling():
    global thresh
    con, cur = dbcon.dbConnect()
    print('Connection status:', con.is_connected())
    for domain in c.domains:
        #Get list of all authors and details
        sql = 'select pa.author_ID, count(distinct p.paper_ID) count, count(distinct paper_published_year) active_years, count(distinct paper_venue_ID) venue,a.author_h_index from \
            Paper_Author_Affiliations_' + domain + ' pa,Papers_' + domain + ' p, Authors_' + domain + ' a \
	        where p.paper_ID =pa.paper_ID and a.author_ID = pa.author_ID group by pa.author_ID;'

        authors = dbcon.dbExecute(cur, sql)
        file = 'author_topic_' + domain + '_' + str(
            c.domain_topics[domain]) + '.json'
        if not path.exists(file):
            print('Finding topics for %d authors' % len(authors))
            author_topic = get_author_topics(authors, cur, str(thresh))
            with open(file, 'w') as f:
                json.dump(author_topic, f)
        with open(file, 'r') as f:
            author_topic = json.load(f)

        sterling_data = calc_versetaility(cur, authors, author_topic, domain)
        df = pd.DataFrame(sterling_data)
        # Author data on citations
        sql = 'select author_ID,sum(cites) from Paper_Author_Affiliations_' + domain + ' p, \
            (select paper_ID, count(*) cites from Paper_Citations_' + domain + ' group by paper_ID) t \
            where t.paper_ID = p.paper_ID \
            group by author_ID;'

        cites = dbcon.dbExecute(cur, sql)
        cites = pd.DataFrame(cites, columns=['author_id', 'total_cites'])
        df = df.merge(cites, on='author_id', how='left').fillna(0)
        df[df['total_topics'] > 0].to_csv('sterling_no_' + domain + '_' +
                                          str(c.domain_topics[domain]) +
                                          '.csv',
                                          index=False)
        print(df.corr(method='spearman'))
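calc_versetaility is defined elsewhere; the "sterling" naming suggests a versatility measure built on Stirling numbers of the second kind, S(n, k): the number of ways to partition n items into k non-empty groups. A minimal sketch of that combinatorial building block via the standard recurrence (whether the repository uses exactly this is an assumption):

from functools import lru_cache

@lru_cache(maxsize=None)
def stirling2(n, k):
    # S(n, k) = k * S(n-1, k) + S(n-1, k-1), with S(0, 0) = 1.
    if n == k:
        return 1
    if k == 0 or k > n:
        return 0
    return k * stirling2(n - 1, k) + stirling2(n - 1, k - 1)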
Code Example #9
def quartile_calculation(d,threshold=0):
	'''
	Calculates quartiles from the probability distribution of paper-topic relations for domain d.
	Return: array of the three quartile values
	'''
	conn,cursor = db.dbConnect()
	sql = db.get_topic_paper_probs(d,threshold)
	print(sql)
	cursor.execute(sql)
	rows = cursor.fetchall()
	#print rows
	quartiles = percentile(rows, 25) # quartiles
	print(d, "quartiles:", quartiles)
	conn.close()
	return quartiles
Code Example #10
def paper_topic_percentile(d, p=25, threshold=0):
    '''
    Calculates percentiles from the probability distribution of paper-topic relations for domain d.
    Return: array of the percentile values
    '''
    conn, cursor = db.dbConnect()
    sql = db.get_topic_paper_probs(d, threshold)
    print(sql)
    cursor.execute(sql)
    rows = cursor.fetchall()
    #print rows
    percentiles = percentile(rows, p=p)  # percentiles
    print(d, "percentiles:", percentiles)
    conn.close()
    return percentiles
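The percentile helper itself is not shown; a minimal numpy sketch consistent with how it is called above (p=25 yields the three quartiles, p=10 the nine deciles):

import numpy as np

def percentile(rows, p=25):
    # Rows come back from the cursor as 1-tuples; flatten to floats first.
    probs = np.array([float(r[0]) for r in rows])
    # Cut points at p, 2p, ... below 100, e.g. p=25 -> [25, 50, 75].
    return np.percentile(probs, np.arange(p, 100, p))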
def topic_paper_analysis():
	'''
	Analyze topic-paper distribution in the partitioned approach
	'''
	conn,cursor = db.dbConnect() 
	threshold = 0.001
	tcount = []
	for d in ["se","os","ai","db"]:
		sql = db.get_topic_domain_dist(d,threshold)
		cursor.execute(sql)
		t1 = time.time()
		rows = cursor.fetchall()
		dcount = []
		for row in rows:
			dcount.append(int(row[2]))
			tcount.append(int(row[2]))
		print "Gini for",d,"is",u.gini(dcount)

	print "Gini for journal:",u.gini(tcount)
Code Example #12
def generate_sim_matrix():

    start_time = time.time()
    n = sum(c.domain_topics[domain] for domain in c.domains)
    sim_matrix = np.zeros((n, n))
    conn, cursor = db.dbConnect()
    cursor.execute('set global max_allowed_packet=671088640000')
    #dot product
    for d1 in c.domains:
        for d2 in c.domains[c.domains.index(d1):]:
            print("calculating ", d1, "X", d2)
            for t1 in range(0, c.domain_topics[d1]):
                for t2 in range(0, c.domain_topics[d2]):
                    i1 = u.sim_mat_indexer(d1, t1)
                    i2 = u.sim_mat_indexer(d2, t2)

                    if sim_matrix[i1][i2] == 0:
                        sql = db.keyword_dot(d1, "Topic_" + str(t1), d2,
                                             "Topic_" + str(t2))
                        cursor.execute(sql)
                        rows = cursor.fetchall()
                        for row in rows:
                            sim_matrix[i1][i2] += float(row[0])
                        if i1 != i2:
                            # mirror the full accumulated dot product
                            sim_matrix[i2][i1] = sim_matrix[i1][i2]
    '''
	#OLD METHOD	
	sql = db.get_topic_cross(d1,d2)
	print sql
	
	cursor.execute(sql)
	rows = cursor.fetchall()
	print "rows:",len(rows)
	#print "SQL query takes ",(time.time()-t1)/60,"mins"
	
	for row in rows:
		i1 = u.sim_mat_indexer(d1,u.fmt_tid(row[0]))
		i2 = u.sim_mat_indexer(d2,u.fmt_tid(row[1]))
		mul = float(row[2])
		sim_matrix[i1][i2] = sim_matrix[i1][i2] + mul
		if i1!=i2: 
			sim_matrix[i2][i1] = sim_matrix[i2][i1] + mul
	'''
    cursor.close()
    conn.close()
    conn, cursor = db.dbConnect()
    # divide by mod
    for d1 in c.domains:
        sql = db.get_topic_l2(d1)
        t2 = time.time()
        cursor.execute(sql)
        print("SQL query takes ", (time.time() - t2) / 60, "mins")
        rows = cursor.fetchall()
        for row in rows:
            t = u.sim_mat_indexer(d1, u.fmt_tid(row[0]))
            mod = np.sqrt(float(row[1]))

            if mod != 0:
                sim_matrix[t, :] /= mod
                sim_matrix[:, t] /= mod
            else:
                print("ZERO!!!")
    conn.close()

    print(
        "--- time for sim matrix", sim_matrix.shape, " generation " + str(
            (time.time() - start_time) / 60) + " minutes ---")
    return sim_matrix
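The two phases above (accumulate keyword dot products, then divide each topic's row and column by its L2 norm) together compute cosine similarity between topic keyword vectors. An equivalent dense numpy sketch, useful for checking the result on small inputs:

import numpy as np

def cosine_sim_matrix(topic_vectors):
    # topic_vectors: (n_topics, n_keywords) array of keyword weights.
    dots = topic_vectors @ topic_vectors.T
    norms = np.linalg.norm(topic_vectors, axis=1)
    norms[norms == 0] = 1.0  # guard against zero-norm topics flagged above
    return dots / np.outer(norms, norms)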