Example 1
def community_clustering():
    import numpy as np
    from sklearn.cluster import spectral_clustering
    # os, settings, verbose and the Community class are assumed to be imported
    # at module level (a sketch of Community follows this example).
    path = settings.COMMUNITY_PATH
    index = 0
    communities = []
    merged_communities = {}
    # Each file under COMMUNITY_PATH is named after a year and holds one
    # "<author_id> <community_id>" pair per line.
    for root, dirs, files in os.walk(path):
        for year in files:
            merged_communities[int(year)] = [[] for i in range(200)]
            comm_dict = {}
            comm_file = open(os.path.join(path, year))
            for line in comm_file:
                x = line.strip().split(' ')
                author = int(x[0])
                comm_id = int(x[1])
                if comm_id not in comm_dict:
                    comm_dict[comm_id] = Community(int(year), comm_id, index)
                    index += 1
                comm_dict[comm_id].append_member(author)
            comm_file.close()
            for comm_id in comm_dict:
                communities.append(comm_dict[comm_id])
    verbose.debug("num of communities: " + str(len(communities)))
    # Symmetric affinity matrix; np.zeros (instead of an uninitialized
    # np.ndarray) keeps the unset diagonal at 0.
    adjacency = np.zeros((len(communities), len(communities)), dtype=int)
    for i in range(len(communities)):
        for j in range(i + 1, len(communities)):
            affinity = communities[i].intersect(communities[j])
            adjacency[i, j] = affinity
            adjacency[j, i] = affinity
    labels = spectral_clustering(adjacency, n_clusters=200)
    verbose.debug("clustering finished")
    # Merge the per-year communities that fell into the same cluster.
    for i in range(len(labels)):
        merged_communities[communities[i].year][labels[i]].extend(communities[i].members)
    for year in merged_communities:
        cluster_file = open(settings.DATA_PATH + "\\clusters\\" + str(year), 'w')
        for cluster in merged_communities[year]:
            for member in cluster:
                cluster_file.write(str(member) + ',')
            cluster_file.write('\n')  # one cluster per line
        cluster_file.close()
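Examples 1 and 5 depend on a Community class that is not included in this listing. The following is only a minimal sketch, reconstructed from how the class is used (constructor arguments, append_member, intersect, members, year); the project's actual class is not shown and may carry more state.

class Community(object):
    """Minimal stand-in for the project's Community class (assumed shape)."""

    def __init__(self, year, comm_id, index):
        self.year = year        # year of the file the community was read from
        self.id = comm_id       # community id within that year
        self.index = index      # running index across all years
        self.members = []       # author ids belonging to the community

    def append_member(self, author):
        self.members.append(author)

    def intersect(self, other):
        # Affinity between two communities = number of shared members (assumed).
        return len(set(self.members) & set(other.members))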
Example 2
def split_linkedin_dump():
    import codecs
    # verbose is assumed to be a project module imported at module level.
    skip = 1000000
    count = 0
    log = codecs.open("H:\\data\\log" + str(skip) + ".txt", 'w', encoding="utf-8")
    id_map = codecs.open("H:\\data\\idmap" + str(skip) + ".txt", 'w', encoding="utf-8")
    linkedin_dump = codecs.open('D:\\result.csv', encoding="utf-8")
    out = None
    next(linkedin_dump)  # skip the CSV header line
    for line in linkedin_dump:
        x = 0
        if count < skip:
            count += 1
            if count % 10000 == 0:
                print(count)
            continue
        print(str(count) + ':' + str(len(line)))
        log.write(str(count) + ' ' + str(len(line)) + '\n')
        if line[0] == '"':
            # A new record starts with a quoted profile id, terminated by '",'.
            x = line.find('",')
            log.write(str(count) + ' ' + line[1:x] + '\n')
            verbose.debug(str(count) + ' ' + line[1:x])
            id_map.write(str(count) + ' ' + line[1:x] + '\n')
            count += 1
            try:
                out = codecs.open("H:\\data\\linkedin1\\" +
                                  line[1:x].strip().replace('"', " ").split('?')[0],
                                  'w', encoding="utf-8")
            except Exception as e:
                print(e)
        else:
            # Continuation of the previous record: append it to the current file.
            log.write("[EXCEPTION]" + str(count) + ":" + line + '\n')
        if out is not None:
            out.write(line[x:])
Example 3
def get_person_papers():
    import pickle
    # authors, mysql and verbose are assumed to be defined/imported at module level.
    author_papers = {}
    for author in authors:
        verbose.debug(author)
        papers = mysql.get_person_publications(author)
        author_papers[author] = papers
    dump_file = open("author_paper_sample", "wb")
    pickle.dump(author_papers, dump_file)
    dump_file.close()
Example 4
def get_paper_topic():
    import pickle
    # mongo and verbose are assumed to be project modules imported at module level.
    dump_file = open("E:\\My Projects\\Eclipse Workspace\\ringnet\\preprocess\\author_paper_sample", "rb")
    author_papers = pickle.load(dump_file)
    dump_file.close()
    # Collect every distinct paper id referenced by any author in any year.
    papers = set()
    for a in author_papers:
        verbose.debug(a)
        for year in author_papers[a]:
            verbose.debug(year)
            for p in author_papers[a][year]:
                papers.add(p)
    # Look up the per-paper topic distributions stored in MongoDB.
    paper_topic = {}
    top_topic = {}
    for p in papers:
        try:
            res = next(mongo.db["doc_topic200"].find({"_id": p}))
            paper_topic[p] = res["topics"]
            top_topic[p] = res["top_topics"]
        except Exception as e:
            print(e)
            print(p)
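The lookups above assume that each document in the doc_topic200 collection carries the paper id as _id plus "topics" and "top_topics" fields. The exact schema is not shown in this listing; based on how the fields are read in Example 4 and its continuation (Example 7), a document is assumed to look roughly like:

# Hypothetical shape of a doc_topic200 document (field names taken from the
# lookups above; values are illustrative only).
example_doc = {
    "_id": 123456,                 # paper id
    "topics": [0.005] * 200,       # full distribution over 200 topics
    "top_topics": [17, 42, 3],     # ids of the highest-weighted topics
}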
Example 5
def community_clustering_modularity():
    import networkx as nx
    import louvain  # assumed to be a project-local module exposing detect()
    # os, settings, verbose and Community are assumed to be imported at module level.
    path = settings.COMMUNITY_PATH
    verbose.debug(path)
    index = 0
    communities = []
    merged_communities = {}
    # Same per-year community loading as in Example 1.
    for root, dirs, files in os.walk(path):
        for year in files:
            verbose.debug(year)
            merged_communities[int(year)] = [[] for i in range(200)]
            comm_dict = {}
            comm_file = open(os.path.join(path, year))
            for line in comm_file:
                x = line.strip().split(' ')
                author = int(x[0])
                comm_id = int(x[1])
                if comm_id not in comm_dict:
                    comm_dict[comm_id] = Community(int(year), comm_id, index)
                    index += 1
                comm_dict[comm_id].append_member(author)
            comm_file.close()
            for comm_id in comm_dict:
                communities.append(comm_dict[comm_id])
    verbose.debug("num of communities: " + str(len(communities)))
    # Build a weighted graph whose nodes are communities and whose edge weights
    # are the number of shared members, then cluster it by modularity.
    g = nx.Graph()
    for i in range(len(communities)):
        for j in range(i + 1, len(communities)):
            affinity = communities[i].intersect(communities[j])
            if affinity != 0:
                g.add_edge(i, j, weight=affinity)
    louvain.detect(g, settings.COMMUNITY_PATH + "\\modularity_clusters")
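louvain.detect() is not part of a public package shown here, so it is treated as a project-local helper. For reference only, a roughly equivalent step with the public python-louvain package might look like the sketch below; the function name detect_and_dump and the "node community" output format are assumptions, not the project's actual code.

import networkx as nx
import community as community_louvain  # python-louvain package (pip install python-louvain)


def detect_and_dump(g, out_path):
    # Louvain modularity optimisation: returns a dict mapping node -> community id.
    partition = community_louvain.best_partition(g, weight='weight')
    with open(out_path, 'w') as out:
        for node, comm in sorted(partition.items()):
            out.write(str(node) + ' ' + str(comm) + '\n')


# Usage on a toy weighted graph:
g = nx.Graph()
g.add_edge(0, 1, weight=5)
g.add_edge(1, 2, weight=1)
detect_and_dump(g, "modularity_clusters.txt")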
Example 6
def doc_to_bag_of_words():
    from sklearn.feature_extraction.text import CountVectorizer
    # os, settings, verbose and UnicodeDammit (from BeautifulSoup) are assumed
    # to be imported at module level.
    voc = open(settings.TOPICMODEL_PATH + "\\alphabet.txt")
    print(settings.TOPICMODEL_PATH + "\\alphabet.txt")
    # The vocabulary file maps one word per line to its column index.
    vocabulary = {}
    line_count = 0
    for line in voc:
        vocabulary[line.strip()] = line_count
        line_count += 1
    voc.close()
    print(line_count)
    vectorizer = CountVectorizer(min_df=1, vocabulary=vocabulary)
    docids = []
    docs = []
    for root, dirs, files in os.walk(settings.DOC_PATH):
        for file in files:
            verbose.debug(file)
            docids.append(int(file))
            docs.append(UnicodeDammit(open(os.path.join(settings.DOC_PATH, file)).read()).markup)
    counts = vectorizer.fit_transform(docs)
    feature_names = vectorizer.get_feature_names()
    out_counts = open(settings.DATA_PATH + "\\bag_of_words", 'w')
    out_sum_counts = open(settings.DATA_PATH + "\\sum_word_count", 'w')
    arr_counts = counts.toarray()
    sum_counts = counts.sum(axis=0)
    # One line per document: "<docid>:<word>,<count>.<word>,<count>. ..."
    for i in range(len(docs)):
        out_counts.write(str(docids[i]) + ':')
        verbose.debug(docids[i])
        for j in range(len(feature_names)):
            if arr_counts[i, j] != 0:
                verbose.debug(feature_names[j] + ',' + str(arr_counts[i, j]))
                out_counts.write(feature_names[j] + ',' + str(arr_counts[i, j]) + '.')
        out_counts.write('\n')
    # Corpus-wide count of each vocabulary word.
    for i in range(len(feature_names)):
        out_sum_counts.write(feature_names[i] + ' ' + str(sum_counts[0, i]) + '\n')
    out_counts.close()
    out_sum_counts.close()
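As a point of reference, the CountVectorizer call above with a fixed vocabulary behaves like this small, self-contained example (toy documents and vocabulary, not the project's data):

from sklearn.feature_extraction.text import CountVectorizer

docs = ["topic model of research papers", "papers about topic clustering"]
vocabulary = {"topic": 0, "model": 1, "papers": 2, "clustering": 3}

vectorizer = CountVectorizer(vocabulary=vocabulary)
counts = vectorizer.fit_transform(docs)   # sparse matrix, shape (2, 4)

print(counts.toarray())
# [[1 1 1 0]
#  [1 0 1 1]]
print(counts.sum(axis=0))                 # corpus-wide count per word, as in
                                          # the sum_word_count output above

Note that recent scikit-learn releases rename get_feature_names() to get_feature_names_out(), so the call in doc_to_bag_of_words() may need adjusting depending on the installed version.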
Example 7 (continuation of get_paper_topic from Example 4)
            for p in author_papers[a][year]:
                papers.add(p)
    paper_topic = {}
    top_topic = {}
    for p in papers:
        try:
            res = next(mongo.db["doc_topic200"].find({"_id": p}))
            paper_topic[p] = res["topics"]
            top_topic[p] = res["top_topics"]
        except Exception as e:
            print(e)
            print(p)

    # Aggregate the topic distributions of each author's papers per year.
    author_topic = {}
    for a in author_papers:
        verbose.debug("author")
        verbose.debug(a)
        # topics[year][topic_id] accumulates the topic weights of the papers
        # the author published in that year (years 2000-2009, 200 topics).
        topics = {}
        for i in range(2000, 2010):
            topics[i] = {}
            for j in range(200):
                topics[i][j] = 0.0
        for year in author_papers[a]:
            for p in author_papers[a][year]:
                verbose.debug("paper")
                verbose.debug(p)
                if p in paper_topic:
                    ts = paper_topic[p]
                    for t in range(len(ts)):
                        topics[year][t] += ts[t]
                    print("found")