def community_clustering():
    # Spectrally cluster the per-year communities into 200 merged clusters,
    # using the number of shared members between communities as affinity.
    path = settings.COMMUNITY_PATH
    index = 0
    communities = []
    merged_communities = {}
    for root, dirs, files in os.walk(path):
        for year in files:
            # one community file per year: "<author_id> <community_id>" per line
            merged_communities[int(year)] = [[] for i in range(200)]
            comm_dict = {}
            comm_file = open(os.path.join(path, year))
            for line in comm_file:
                x = line.strip().split(' ')
                author = int(x[0])
                id = int(x[1])
                if not comm_dict.has_key(id):
                    comm_dict[id] = Community(int(year), id, index)
                    index += 1
                comm_dict[id].append_member(author)
            for id in comm_dict.keys():
                communities.append(comm_dict[id])
    verbose.debug("num of communities: " + str(len(communities)))
    # symmetric affinity matrix; np.zeros so the untouched diagonal is defined
    adjacency = np.zeros(shape=(len(communities), len(communities)), dtype=int)
    for i in range(len(communities)):
        for j in range(i + 1, len(communities)):
            affinity = communities[i].intersect(communities[j])
            adjacency[i, j] = affinity
            adjacency[j, i] = affinity
    labels = spectral_clustering(adjacency, n_clusters=200)
    verbose.debug("clustering finished")
    for i in range(len(labels)):
        merged_communities[communities[i].year][labels[i]].extend(communities[i].members)
    for year in merged_communities.keys():
        cluster_file = open(settings.DATA_PATH + "\\clusters\\" + str(year), 'w')
        for i in range(len(merged_communities[year])):
            for member in merged_communities[year][i]:
                cluster_file.write(str(member) + ',')
            cluster_file.write('\n')  # one line per merged cluster
        cluster_file.close()
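# The Community class is not defined in this file. Below is a minimal sketch
# of the interface assumed by community_clustering() above and
# community_clustering_modularity() further down; the names and bodies are
# inferred from usage, not taken from the original implementation.
class _CommunitySketch(object):
    def __init__(self, year, id, index):
        self.year = year        # snapshot year the community was detected in
        self.id = id            # community id within that year's file
        self.index = index      # global running index across all years
        self.members = []       # author ids belonging to this community

    def append_member(self, author):
        self.members.append(author)

    def intersect(self, other):
        # affinity used above: number of members shared with another community
        return len(set(self.members) & set(other.members))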
def split_linkedin_dump():
    # Split the raw LinkedIn dump into one file per profile, skipping the
    # first `skip` records (already processed in an earlier run).
    skip = 1000000
    count = 0
    log = codecs.open("H:\\data\\log" + str(skip) + ".txt", 'w', encoding="utf-8")
    id_map = codecs.open("H:\\data\\idmap" + str(skip) + ".txt", 'w', encoding="utf-8")
    linkedin_dump = codecs.open('D:\\result.csv', encoding="utf-8")
    out = ""
    linkedin_dump.next()  # skip the header row
    for line in linkedin_dump:
        x = 0
        if count < skip:
            count += 1
            if count % 10000 == 0:
                print count
            continue
        print str(count) + ':' + str(len(line))
        log.write(str(count) + ' ' + str(len(line)) + '\n')
        if line[0] == '"':
            # a new record: the quoted id field ends at '",'
            x = line.find('",')
            log.write(str(count) + ' ' + line[1:x] + '\n')
            verbose.debug(str(count) + ' ' + line[1:x])
            id_map.write(str(count) + ' ' + line[1:x] + '\n')
            count += 1
            try:
                out = codecs.open(
                    "H:\\data\\linkedin1\\" + line[1:x].strip().replace('"', " ").split('?')[0],
                    'w', encoding="utf-8")
            except Exception, e:
                print e
        else:
            # line does not start a new record; log it and keep appending to the current file
            log.write("[EXCEPTION]" + str(count) + ":" + line + '\n')
        out.write(line[x:])
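# Illustration only: split_linkedin_dump() assumes each record in result.csv
# starts with a quoted id field terminated by '",'. The value between the
# quotes (minus any '?...' query string) becomes the output file name, and the
# rest of the row is written to that file. A hypothetical line:
_sample = '"12345?ref=abc",John,Doe,Software Engineer\n'
_x = _sample.find('",')
assert _sample[1:_x].split('?')[0] == '12345'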
def get_person_papers():
    # Fetch each sampled author's publications from MySQL and pickle the
    # result; `authors` and `mysql` are assumed to be module-level globals.
    author_papers = {}
    for author in authors:
        verbose.debug(author)
        papers = mysql.get_person_publications(author)
        author_papers[author] = papers
    dump_file = open("author_paper_sample", "w")
    pickle.dump(author_papers, dump_file)
    dump_file.close()
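# Illustration only: based on how author_papers is consumed in
# get_paper_topic() below, mysql.get_person_publications(author) is assumed
# to return a mapping of year -> list of paper ids, roughly:
_example_papers = {
    2005: [101, 102],
    2006: [103],
}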
def get_paper_topic():
    # Load the pickled author -> year -> papers mapping and look up each
    # paper's topic distribution in MongoDB.
    dump_file = open("E:\\My Projects\\Eclipse Workspace\\ringnet\\preprocess\\author_paper_sample")
    author_papers = pickle.load(dump_file)
    papers = set()
    for a in author_papers.keys():
        verbose.debug(a)
        for year in author_papers[a]:
            verbose.debug(year)
            for p in author_papers[a][year]:
                papers.add(p)
    paper_topic = {}
    top_topic = {}
    for p in papers:
        try:
            res = mongo.db["doc_topic200"].find({"_id": p}).next()
            paper_topic[p] = res["topics"]
            top_topic[p] = res["top_topics"]
        except Exception, e:
            print e
            print p
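# Illustration only (shape inferred from the lookups above, not from the
# actual database): documents in mongo.db["doc_topic200"] are assumed to be
# keyed by paper id, with a 200-dimensional topic distribution and a shorter
# list of dominant topics.
_example_doc = {
    "_id": 123456,              # paper id
    "topics": [0.0] * 200,      # weight per topic (200-topic model)
    "top_topics": [17, 42, 3],  # hypothetical: ids of the strongest topics
}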
def community_clustering_modularity():
    # Same setup as community_clustering(), but cluster the community graph
    # with Louvain modularity maximization instead of spectral clustering.
    import networkx as nx
    import louvain
    path = settings.COMMUNITY_PATH
    verbose.debug(path)
    index = 0
    communities = []
    merged_communities = {}
    for root, dirs, files in os.walk(path):
        for year in files:
            verbose.debug(year)
            merged_communities[int(year)] = [[] for i in range(200)]
            comm_dict = {}
            comm_file = open(os.path.join(path, year))
            for line in comm_file:
                x = line.strip().split(' ')
                author = int(x[0])
                id = int(x[1])
                if not comm_dict.has_key(id):
                    comm_dict[id] = Community(int(year), id, index)
                    index += 1
                comm_dict[id].append_member(author)
            for id in comm_dict.keys():
                communities.append(comm_dict[id])
    verbose.debug("num of communities: " + str(len(communities)))
    # weighted graph over communities; edge weight = number of shared members
    g = nx.Graph()
    for i in range(len(communities)):
        for j in range(i + 1, len(communities)):
            affinity = communities[i].intersect(communities[j])
            if affinity != 0:
                g.add_edge(i, j, weight=affinity)
    louvain.detect(g, settings.COMMUNITY_PATH + "\\modularity_clusters")
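# The `louvain` module imported above is a local helper whose code is not in
# this file. A minimal sketch of what detect() is assumed to do, using the
# python-louvain package (import name `community`): run modularity-based
# clustering on the weighted graph and write "node cluster" pairs to the
# given path. This is an assumption about the helper, not its actual code.
def _louvain_detect_sketch(g, out_path):
    import community
    partition = community.best_partition(g)  # {node: cluster label}
    out = open(out_path, 'w')
    for node in partition:
        out.write(str(node) + ' ' + str(partition[node]) + '\n')
    out.close()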
def doc_to_bag_of_words():
    # Count word occurrences in every document against the topic-model
    # vocabulary and write per-document and corpus-wide counts.
    from sklearn.feature_extraction.text import CountVectorizer
    voc = open(settings.TOPICMODEL_PATH + "\\alphabet.txt")
    print settings.TOPICMODEL_PATH + "\\alphabet.txt"
    vocabulary = {}
    line_count = 0
    for line in voc:
        vocabulary[line.strip()] = line_count
        line_count += 1
    print line_count
    vectorizer = CountVectorizer(min_df=1, vocabulary=vocabulary)
    docids = []
    docs = []
    for root, dirs, files in os.walk(settings.DOC_PATH):
        for file in files:
            verbose.debug(file)
            docids.append(int(file))
            # UnicodeDammit (BeautifulSoup) is assumed to be imported at module level
            docs.append(UnicodeDammit(open(os.path.join(settings.DOC_PATH, file)).read()).markup)
    counts = vectorizer.fit_transform(docs)
    feature_names = vectorizer.get_feature_names()
    out_counts = open(settings.DATA_PATH + "\\bag_of_words", 'w')
    out_sum_counts = open(settings.DATA_PATH + "\\sum_word_count", 'w')
    arr_counts = counts.toarray()
    sum_counts = counts.sum(axis=0)
    for i in range(len(docs)):
        out_counts.write(str(docids[i]) + ':')  # docids[i] is an int, convert before writing
        verbose.debug(docids[i])
        for j in range(len(feature_names)):
            if arr_counts[i, j] != 0:
                verbose.debug(feature_names[j] + ',' + str(arr_counts[i, j]))
                out_counts.write(feature_names[j] + ',' + str(arr_counts[i, j]) + '.')
        out_counts.write('\n')  # one line per document
    for i in range(len(arr_counts[0])):
        out_sum_counts.write(feature_names[i] + ' ' + str(sum_counts[0, i]) + '\n')
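# Illustration only: with a fixed vocabulary, CountVectorizer keeps the
# supplied term -> column mapping, so column j of the count matrix always
# corresponds to line j of alphabet.txt. A tiny hypothetical example:
from sklearn.feature_extraction.text import CountVectorizer
_vec = CountVectorizer(min_df=1, vocabulary={"graph": 0, "topic": 1})
_counts = _vec.fit_transform(["topic topic graph", "graph"])
# _counts.toarray() == [[1, 2], [1, 0]]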
def get_author_topics(author_papers, paper_topic):
    # Hypothetical helper (the name and parameters are introduced here): this
    # block continues the processing started in get_paper_topic(), summing
    # each author's topic weights per year (years 2000-2009, 200 topics).
    author_topic = {}
    for a in author_papers.keys():
        verbose.debug("author")
        verbose.debug(a)
        topics = {}
        for i in range(2000, 2010):
            topics[i] = {}
            for j in range(200):
                topics[i][j] = 0.0
        for year in author_papers[a].keys():
            for p in author_papers[a][year]:
                verbose.debug("paper")
                verbose.debug(p)
                if paper_topic.has_key(p):
                    ts = paper_topic[p]
                    for t in range(len(ts)):
                        topics[year][t] += ts[t]
                    print "found"