def network(dbname, colname, netname):
    '''Get users' friendship network'''
    # # ed_usersd = ed_user(dbname, colname)
    # # pickle.dump(ed_usersd, open('data/ed_users.pick', 'w'))
    # ed_usersd = pickle.load(open('data/ed_users.pick', 'r'))
    #
    # # rec_usersd = rec_user(dbname, colname)
    # # pickle.dump(rec_usersd, open('data/rec_users.pick', 'w'))
    # rec_usersd = pickle.load(open('data/rec_users.pick', 'r'))
    #
    # # inlist = list(set(ed_usersd).union(set(rec_usersd)))
    # # print len(inlist)
    # g = gt.load_network_subset(inlist, dbname, netname)
    # g.vs['rec'] = 0
    # for uid in rec_usersd:
    #     exist = True
    #     try:
    #         v = g.vs.find(name=str(uid))
    #     except ValueError:
    #         exist = False
    #     if exist:
    #         v['rec'] = 1
    # pickle.dump(g, open('data/rec_friendship.pick', 'w'))
    rg = pickle.load(open('data/rec_friendship.pick', 'r'))
    # g.write_gml('data/rec_friendship.GML')
    # g.write_dot('data/rec_friendship.DOT')
    gc = gt.giant_component(rg, 'WEAK')
    comm = gt.fast_community(gc, False)
    fclus = comm.as_clustering(2)
    communit_topinflu(fclus, None)

def tag_activity(dbname, colname):
    # Record the usage timestamps of each hashtag in the filtered tag network
    g = gt.Graph.Read_GraphML('data/pro_mention_tag_undir.graphml')
    vs = g.vs(weight_gt=3, user_gt=3)
    sg = g.subgraph(vs)
    gc = gt.giant_component(sg)
    tag_time = {}
    for v in gc.vs:
        tag_time[v['name']] = []
    time = dbt.db_connect_col(dbname, colname)
    filter = {}
    filter['$where'] = 'this.entities.hashtags.length>0'
    filter['retweeted_status'] = {'$exists': False}
    for tweet in time.find(filter, no_cursor_timeout=True):
        # if 'retweeted_status' in row:
        #     continue
        created_at = datetime.strptime(tweet['created_at'],
                                       '%a %b %d %H:%M:%S +0000 %Y')
        hashtags = tweet['entities']['hashtags']
        for hash in hashtags:
            # need no .encode('utf-8')
            tag = (hash['text'].encode('utf-8').lower()
                   .replace('_', '').replace('-', ''))
            if tag in tag_time:
                datelist = tag_time.get(tag, [])
                datelist.append(created_at)
                tag_time[tag] = datelist
    pickle.dump(tag_time, open('tag_activity.pick', 'w'))

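# A minimal sketch for consuming the pickle written above: bin one tag's
# timestamps into daily counts (the helper name and the tag passed in are
# illustrative, not part of the original module).
def tag_daily_counts(tag, path='tag_activity.pick'):
    tag_time = pickle.load(open(path, 'r'))
    counts = {}
    for ts in tag_time.get(tag, []):
        day = ts.date()  # collapse each datetime to its calendar day
        counts[day] = counts.get(day, 0) + 1
    return counts  # {datetime.date: number of uses on that day}
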
def community(g=None):
    '''
    Detect communities in the hashtag co-occurrence network.
    Use Infomap to detect communities.
    Only keep communities whose sizes exceed a threshold (10 nodes).
    :param g: (currently ignored; the graph is loaded from file below)
    :return: hash_com: {hashtag: community_index}
    '''
    g = gt.Graph.Read_GraphML('ed_tag.graphml')
    gc = gt.giant_component(g)
    com = gc.community_infomap(edge_weights='weight', vertex_weights='weight')
    comclus = com.subgraphs()
    print len(comclus), com.modularity
    index = 0
    hash_com = {}
    for comclu in comclus:
        if comclu.vcount() > 10:
            # print 'Com', index, '==================================='
            # else:
            # print '==================================='
            tag_weight = {}
            for v in comclu.vs:
                hash_com[v['name']] = index
                tag_weight[v['name']] = v['weight']
            index += 1
            sort_list = list(sorted(tag_weight, key=tag_weight.get,
                                    reverse=True))
            for key in sort_list:
                print key, tag_weight[key]
    print len(hash_com)
    print len(set(hash_com.values()))
    print set(hash_com.values())
    return hash_com

def test_user_cluster_assign_stable():
    # Test how stable the final user clustering assignments (k=2) are
    core = gt.Graph.Read_GraphML('alled_tag_undir_filter.graphml')
    communication = gt.Graph.Read_GraphML(
        'communication-only-fed-filter.graphml')
    gt.summary(communication)
    communication = gt.giant_component(communication)
    gt.summary(communication)
    users = [(v['name']) for v in communication.vs]
    print len(users)
    # user_hashtag_vector('fed', 'ed_tag', users)
    separations = []
    for i in xrange(100):
        print i
        user_hashtag_profile(core, users)
        # data += user_cluster_hashtag()
        clusters, ids = user_cluster_hashtag()
        separations.append(clusters)
    # Pairwise adjusted Rand index over the 100 runs (1.0 = identical
    # partitions, ~0.0 = agreement expected by chance)
    aRI = []
    for i in xrange(100):
        for j in xrange(i + 1, 100):
            aRI.append(
                metrics.adjusted_rand_score(separations[i], separations[j]))
    print len(aRI)
    print '%.3f, %.3f, %.3f, %.3f' % (min(aRI), max(aRI), np.mean(aRI),
                                      np.std(aRI))

def test_significant(file_path):
    # Rewire the network (preserving the degree sequence) and test whether
    # the observed two-community modularity is significantly higher than in
    # the randomized null model
    g = gt.Graph.Read_GraphML(file_path)
    gt.summary(g)
    g = g.as_undirected(combine_edges=dict(weight="sum"))
    g = gt.giant_component(g)
    gt.summary(g)
    # print g.es['weight']
    fast = g.community_fastgreedy(weights='weight')
    fast_com = fast.as_clustering(n=2)
    orig_mod = fast_com.modularity
    mod_list = []
    for i in xrange(1000):
        weights = g.es["weight"]
        g.rewire()  # randomize topology; degree sequence is preserved
        g.es["weight"] = weights
        # gt.net_stat(g)
        # print g.es['weight']
        fast = g.community_fastgreedy(weights='weight')
        fast_com = fast.as_clustering()
        mod_list.append(fast_com.modularity)
    amean, astd = np.mean(mod_list), np.std(mod_list)
    print 'simulated values: %.3f +- (%.3f)' % (amean, astd)
    # absobserved = abs(raw_assort)
    # pval = (np.sum(ass_list >= absobserved) +
    #         np.sum(ass_list <= -absobserved))/float(len(ass_list))
    zscore = (orig_mod - amean) / astd
    print 'z-score: %.3f' % zscore

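# The commented-out block above hints at an empirical p-value; a minimal
# one-sided version, assuming mod_list and orig_mod as computed in
# test_significant() (the +1 terms keep the estimate away from exactly zero):
def empirical_pvalue(orig_mod, mod_list):
    mods = np.array(mod_list)
    # fraction of null (rewired) modularities at least as large as observed
    return (np.sum(mods >= orig_mod) + 1.0) / (len(mods) + 1.0)
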
def tfidf_tag_cluster(btype='retweet'):
    # Calculate a TF-IDF-like score for tags across the two clusters:
    # each tag's weight share relative to its weight in the other cluster
    cluster0 = gt.Graph.Read_GraphML('ed_' + btype +
                                     '_fed_cluster0_tag_undir.graphml')
    cluster1 = gt.Graph.Read_GraphML('ed_' + btype +
                                     '_fed_cluster1_tag_undir.graphml')
    gt.summary(cluster0)
    vs = cluster0.vs(weight_gt=3, user_gt=3)
    cluster0 = cluster0.subgraph(vs)
    cluster0 = gt.giant_component(cluster0)
    gt.summary(cluster0)
    gt.summary(cluster1)
    vs = cluster1.vs(weight_gt=3, user_gt=3)
    cluster1 = cluster1.subgraph(vs)
    cluster1 = gt.giant_component(cluster1)
    gt.summary(cluster1)
    for v in cluster0.vs:
        exist = True
        count_ov = 0.0
        try:
            ov = cluster1.vs.find(name=v['name'])
        except ValueError:
            exist = False
        if exist:
            count_ov = ov['weight']
        v['tfidf'] = float(v['weight']) / (v['weight'] + count_ov)
    for v in cluster1.vs:
        exist = True
        count_ov = 0.0
        try:
            ov = cluster0.vs.find(name=v['name'])
        except ValueError:
            exist = False
        if exist:
            count_ov = ov['weight']
        v['tfidf'] = float(v['weight']) / (v['weight'] + count_ov)
    cluster0.write_graphml('ed_' + btype +
                           '_fed_cluster0_tfidf_tag_undir.graphml')
    cluster1.write_graphml('ed_' + btype +
                           '_fed_cluster1_tfidf_tag_undir.graphml')

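# Worked example of the score above (numbers illustrative): a tag with
# weight 30 in cluster0 and 10 in cluster1 gets tfidf = 30.0 / (30 + 10) =
# 0.75 in cluster0 and 10.0 / (10 + 30) = 0.25 in cluster1; a tag absent
# from the other cluster scores 1.0. This is a relative-frequency share
# rather than classic TF-IDF.
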
def community(g=None):
    '''
    Detect communities in the hashtag co-occurrence network.
    Use Infomap to detect communities (the multilevel variant is commented
    out below).
    Only select communities whose sizes are larger than a threshold.
    :param g: hashtag co-occurrence graph
    :return: hash_com: {hashtag: community_index}
             com_size: {community_index: community_size}
    '''
    gt.summary(g)
    vs = g.vs(weight_gt=100, user_gt=10)
    g = g.subgraph(vs)
    g = g.subgraph_edges(g.es.select(rWeight_gt=0, rWeight_lt=float('Inf')))
    gt.summary(g)
    gc = gt.giant_component(g)
    gt.summary(gc)
    # g.write_graphml('fed_tag_undir_over3.graphml')
    # com = g.community_multilevel(weights='rWeight', return_levels=False)
    # Note: detection runs on the filtered graph g, not its giant component gc
    com = g.community_infomap(edge_weights='rWeight', vertex_weights=None)
    # com = louvain.find_partition(gc, method='Significance', weight=None)
    comclus = com.subgraphs()
    print 'Community stats: #communities, modularity', len(comclus), com.modularity
    index = 0
    nonsingle = 0
    hash_com = {}
    com_size = {}
    for comclu in comclus:
        print '---------- Community ', index, '-----------------'
        if comclu.vcount() > 1:
            nonsingle += 1
            tag_weight = {}
            for v in comclu.vs:
                if v['weight'] > 5:
                    hash_com[v['name']] = index
                    tag_weight[v['name']] = v['weight']
                    count = com_size.get(index, 0)
                    com_size[index] = v['weight'] + count
            sort_list = list(sorted(tag_weight, key=tag_weight.get,
                                    reverse=True))
            for key in sort_list:
                print key, tag_weight[key]
            # .get guards against communities where no tag passes the
            # weight > 5 filter
            print '-------------Community size: ', com_size.get(index, 0), \
                '---------------------'
            print
        index += 1
    # print len(hash_com)
    # print len(set(hash_com.values()))
    # print set(hash_com.values())
    print '------------------all size:', sum(com_size.values()), \
        '---------------------'
    print '------------------non single clusters:', nonsingle, \
        '---------------------'
    return hash_com, com_size

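# A minimal sketch for inspecting the output of community() above: list the
# n communities with the largest accumulated hashtag weight (the helper name
# is illustrative; g is assumed to be the same co-occurrence graph passed to
# community()).
def top_communities(g, n=5):
    hash_com, com_size = community(g)
    ranked = sorted(com_size.items(), key=lambda x: x[1], reverse=True)
    for index, size in ranked[:n]:
        print 'community', index, 'accumulated weight', size
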
def test_stable_infomap_kmean():
    # Test the stability of the whole pipeline: Infomap clustering of
    # hashtags followed by k-means clustering of users
    import tag_network
    core = gt.Graph.Read_GraphML('alled_tag_undir_filter.graphml')
    communication = gt.Graph.Read_GraphML(
        'communication-only-fed-filter.graphml')
    gt.summary(communication)
    communication = gt.giant_component(communication)
    gt.summary(communication)
    users = [(v['name']) for v in communication.vs]
    print len(users)
    tag_network.user_hashtag_profile(core, users)

def test_user_cluster_stable():
    # Test the stability of Infomap and find the best k for k-means
    core = gt.Graph.Read_GraphML('data/alled_tag_undir_filter.graphml')
    communication = gt.Graph.Read_GraphML(
        'data/communication-only-fed-filter.graphml')
    gt.summary(communication)
    communication = gt.giant_component(communication)
    gt.summary(communication)
    users = [(v['name']) for v in communication.vs]
    print len(users)
    # user_hashtag_vector('fed', 'ed_tag', users)
    data = []
    for i in xrange(100):
        user_hashtag_profile(core, users, i)  # ###### Run by python
        data += user_cluster_hashtag()
    df = pd.DataFrame(data, columns=['cluster', 'silhouette_avg'])
    df.to_csv('user-kmeans-hashtag.csv')

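# A minimal sketch (assuming the CSV written above holds one row per run,
# with the candidate k in 'cluster' and its average silhouette score) for
# picking the k with the highest mean silhouette across runs:
def best_k_from_csv(path='user-kmeans-hashtag.csv'):
    df = pd.read_csv(path)
    means = df.groupby('cluster')['silhouette_avg'].mean()
    return means.idxmax(), means.max()
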
def cluster_nodes(btype='communication'):
    # Cluster users in the network
    g = gt.Graph.Read_GraphML('communication-only-fed-filter.graphml')
    g = gt.giant_component(g)
    gt.summary(g)
    clusters, ids = tn.user_cluster_hashtag('ed-' + btype + '.data')
    # ids = []
    # with open('ed-'+btype+'.data', 'r') as fo:
    #     for line in fo.readlines():
    #         ids.append(line.split(' ')[0])
    g.vs['cluster'] = -1
    for i in xrange(len(clusters)):
        uid = ids[i]
        v = g.vs.find(name=uid)
        v['cluster'] = clusters[i]
    g.write_graphml('communication-only-fed-filter-hashtag-cluster.graphml')

def rank_feature(gc, dbname, comname, db_field_names, directed=True):
    g = gt.giant_component(gc, 'WEAK')
    g.vs['nt'] = g.degree(type="in")
    netatt = g.vs['nt']
    # ranks = g.pagerank(weights='weight')
    # g.vs['rank'] = ranks
    # cor = st.tau_coef(g.degree(type="in"), g.vs['rank'])
    # print 'Indegree' + '\t' + str(cor[0]) + '\t' + str(cor[1])
    # cor = st.tau_coef(g.degree(type="out"), g.vs['rank'])
    # print 'Outdegree' + '\t' + str(cor[0]) + '\t' + str(cor[1])
    for db_field_name in db_field_names:
        # print 'Processing ' + db_field_name
        g = gt.add_attribute(g, 'foi', dbname, comname, db_field_name)
        raw_values = np.array(g.vs['foi'])
        values = drop_initials(raw_values)
        if len(values) > 100:
            # maxv, minv = max(values), min(values)
            maxv, minv = np.percentile(values, 97.5), np.percentile(values, 2.5)
            vs = g.vs(foi_ge=minv, foi_le=maxv)
            sg = g.subgraph(vs)
            maxd, mind = np.percentile(netatt, 97.5), np.percentile(netatt, 2.5)
            vs = sg.vs(nt_ge=mind, nt_le=maxd)
            sg = sg.subgraph(vs)
            # cor = st.tau_coef(sg.vs['foi'], sg.vs['nt'])
            # print db_field_name + '\t' + str(len(sg.vs)) + '\t' + str(len(sg.es)) + '\t'\
            #     + str(min(netatt)) + '\t' + str(max(netatt)) + '\t' + str(mind) + '\t'\
            #     + str(maxd) + '\t' \
            #     + str(min(values)) + '\t' + str(max(values)) + '\t' + str(minv) + '\t'\
            #     + str(maxv) + '\t'\
            #     + str(cor[0]) + '\t' + str(cor[1])
            pt.correlation(sg.vs['nt'], sg.vs['foi'], 'Indegree', 'Feature',
                           'data/' + db_field_name + '.pdf')

def two_community(file_path):
    # Split a network into communities (Infomap); note that Infomap chooses
    # the number of communities itself, unlike the n=2 variants commented out
    g = gt.Graph.Read_GraphML(file_path)
    gt.summary(g)
    # g = g.as_undirected(combine_edges=dict(weight="sum"))
    g = gt.giant_component(g)
    gt.summary(g)
    # ml = g.community_multilevel(weights='weight', return_levels=True)
    # fast = g.community_fastgreedy(weights='weight')
    # fast_com = fast.as_clustering(n=2)
    # walk = g.community_walktrap(weights='weight')
    # walk_com = walk.as_clustering(n=2)
    infor = g.community_infomap(edge_weights='weight', vertex_weights=None,
                                trials=2)
    # eigen = g.community_leading_eigenvector(clusters=2, weights='weight')
    # label_pro = g.community_label_propagation(weights='weight', initial=eigen.membership)
    # betweet = g.community_edge_betweenness(weights='weight')
    # bet_com = betweet.as_clustering(n=2)
    g.vs['community'] = infor.membership
    g.write_graphml('com-' + file_path)
    return infor.subgraphs()

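# Example usage (a sketch; the helper name and file path are illustrative
# only): report the size of each community returned by two_community().
def demo_two_community(file_path='communication-only-fed-filter.graphml'):
    subgraphs = two_community(file_path)
    for i, sg in enumerate(subgraphs):
        print 'community %d: %d nodes, %d edges' % (i, sg.vcount(), sg.ecount())
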
def user_statis():
    groups = [
        ('ED', 'fed', 'com',
         {'liwc_anal.result.WC': {'$exists': True}, 'level': 1}),
        ('RD', 'random', 'scom',
         {'liwc_anal.result.WC': {'$exists': True}, 'level': 1}),
        ('YG', 'younger', 'scom',
         {'liwc_anal.result.WC': {'$exists': True}, 'level': 1})
    ]
    data = []
    for tag, dbname, comname, filter_values in groups:
        com = dbt.db_connect_col(dbname, comname)
        network1 = gt.Graph.Read_GraphML(tag.lower() + '-net.graphml')
        gt.summary(network1)
        network1_gc = gt.giant_component(network1)
        gt.summary(network1_gc)
        users_time = iot.get_values_one_field(dbname, comname, 'id_str',
                                              filter_values)
        # The rest of this function is incomplete in the source; the loop
        # header below restores the friend-lookup pattern used throughout
        # this module so the fragment at least parses.
        for uid in users_time:
            exist = True
            try:
                v = network1.vs.find(name=str(uid))
            except ValueError:
                exist = False
            if exist:
                friends = set(network1.successors(str(uid)))

def benetwork(dbname, type, netname):
    '''Get users' behavior networks'''
    # ed_usersd = pickle.load(open('data/ed_users.pick', 'r'))
    # rec_usersd = pickle.load(open('data/rec_users.pick', 'r'))
    # inlist = list(set(ed_usersd).union(set(rec_usersd)))
    # g = gt.load_beh_network_subset(inlist, dbname, netname, type)
    # g.vs['rec'] = 0
    # for uid in rec_usersd:
    #     exist = True
    #     try:
    #         v = g.vs.find(name=str(uid))
    #     except ValueError:
    #         exist = False
    #     if exist:
    #         v['rec'] = 1
    # pickle.dump(g, open('data/rec_'+type+'.pick', 'w'))
    rg = pickle.load(open('data/rec_' + type + '.pick', 'r'))
    # plot_graph(g)
    gc = gt.giant_component(rg, 'WEAK')
    comm = gt.fast_community(gc, True)
    fclus = comm.as_clustering(2)
    communit_topinflu(fclus, 'weight')

def friendship_community_vis(dbname, colname, filename, ctype):
    '''Output graph for vis.js visualization'''
    ed_users = iot.get_values_one_field(dbname, 'scom', 'id')
    # fed_users = iot.get_values_one_field(dbname, 'com', 'id')
    dbcom = dbt.db_connect_col(dbname, 'com')
    fg = gt.load_network(dbname, colname)
    # fg = gt.load_beh_network_subset(ed_users, dbname, colname, 'retweet')
    gt.net_stat(fg)
    # fg = fg.as_undirected(mode="mutual")
    # gt.net_stat(fg)
    fg = gt.giant_component(fg, 'WEAK')
    gt.net_stat(fg)
    if ctype == 'ml':
        com = fg.community_multilevel(weights='weight', return_levels=False)
    elif ctype == 'lp':
        fgu = fg.as_undirected(combine_edges=sum)
        init = fgu.community_leading_eigenvector(clusters=2, weights='weight')
        print init.membership
        com = fg.community_label_propagation(weights='weight',
                                             initial=init.membership)
        print com.membership
    else:
        com = fg.community_infomap(edge_weights='weight', trials=2)
    fg.vs['group'] = com.membership
    # edges = fg.es.select(weight_gt=3)
    # print 'Filtered edges: %d' % len(edges)
    # fg = fg.subgraph_edges(edges)
    # gt.net_stat(fg)
    # fg.vs['degree'] = fg.degree(mode="all")
    # nodes = fg.vs.select(degree_gt=10)
    # fg = fg.subgraph(nodes)
    # gt.net_stat(fg)
    Coo = {}
    for x in fg.vs['group']:
        Coo[x] = (rand.randint(-1000, 1000), rand.randint(-1000, 1000))
    with open('data/' + ctype + '_' + filename + '_net_follow.js', 'w') as fw:
        fw.write('var nodes = [\n')
        for idv, v in enumerate(fg.vs):
            user = dbcom.find_one({'id': int(fg.vs[idv]['name'])})
            desc = ' '.join(user['description'].replace('\'', '').replace(
                '\"', '').split())
            fw.write('{id: ' + str(idv + 1) + ', ' +
                     'label: \'' + user['screen_name'] + '\', ' +
                     'value: ' + str(fg.degree(idv, mode="in")) + ', ' +
                     'title: \'UID: ' + str(fg.vs[idv]['name']) +
                     '<br> Screen Name: ' + user['screen_name'] +
                     '<br> Followers: ' + str(user['followers_count']) +
                     '<br> Followees: ' + str(user['friends_count']) +
                     '<br> Tweets: ' + str(user['statuses_count']) +
                     '<br> Description: ' + str(desc.encode('utf-8')) +
                     '<br> Group: ' + str(fg.vs[idv]['group']) + '\', ' +
                     'x: ' + str(Coo[fg.vs[idv]['group']][0] +
                                 rand.randint(0, 300)) + ', ' +
                     'y: ' + str(Coo[fg.vs[idv]['group']][1] +
                                 rand.randint(0, 300)) + ', ' +
                     'group: ' + str(fg.vs[idv]['group']) + ', ')
            # if int(fg.vs[idv]['name']) in ed_users:
            #     fw.write('shape: ' + '\'triangle\'')
            # else:
            #     fw.write('shape: ' + '\'circle\'')
            fw.write('}, \n')
        fw.write('];\n var edges = [\n')
        for ide, e in enumerate(fg.es):
            fw.write('{from: ' + str(e.source + 1) + ', ' +
                     'to: ' + str(e.target + 1) + ', ' +
                     'arrows: ' + '\'to\'' + ', ' +
                     'title: \' Tags: ' + fg.vs[e.source]['name'] + ' ' +
                     fg.vs[e.target]['name'] +
                     '<br> Co-occurrence: ' + str(fg.es[ide]['weight']) +
                     '\', ' +
                     'value: ' + str(fg.es[ide]['weight']) +
                     '},\n')  # str(fg.es[ide]['weight'])
        fw.write('];\n')

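# For reference, each node entry written above comes out roughly as
# (values illustrative):
#   {id: 1, label: 'screen_name', value: 12, title: 'UID: ...', x: -523, y: 210, group: 3, },
# and each edge entry as:
#   {from: 1, to: 2, arrows: 'to', title: ' Tags: ...<br> Co-occurrence: 5', value: 5},
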
def read_user_time_iv(filename):
    # fields = iot.read_fields()
    fields = [
        # # 'liwc_anal.result.posemo',
        # # 'liwc_anal.result.negemo',
        # # 'liwc_anal.result.ingest',
        # # 'liwc_anal.result.bio',
        # # 'liwc_anal.result.body',
        # # 'liwc_anal.result.health',
        # # 'liwc_anal.result.death'
        # # 'liwc_anal.result.anx',
        # # 'liwc_anal.result.anger',
        # # 'liwc_anal.result.sad',
        # # 'liwc_anal.result.i',
        # # 'liwc_anal.result.we',
        # # 'liwc_anal.result.negate',
        # # 'liwc_anal.result.swear',
        # # 'liwc_anal.result.social',
        # # 'liwc_anal.result.family',
        # # 'liwc_anal.result.friend',
        # # 'liwc_anal.result.affect',
        # 'senti.result.whole.posm',
        # # 'senti.result.whole.posstd',
        # 'senti.result.whole.negm',
        # # 'senti.result.whole.negstd',
        # 'senti.result.whole.scalem',
        # # 'senti.result.whole.scalestd',
        # 'senti.result.whole.N',
        # 'senti.result.prior.scalem',
        # 'senti.result.post.scalem'
        'senti'
    ]
    prof_names = ['friends_count', 'statuses_count', 'followers_count',
                  'friends_day', 'statuses_day', 'followers_day', 'days']
    trimed_fields = ['-'.join(field.split('.')[-2:]) for field in fields]
    print trimed_fields
    groups = [
        ('ED', 'fed', 'com', 'fed', 'com_survival', {
            'liwc_anal.result.WC': {'$exists': True},
            'level': 1,
            'senti.result.whole.N': {'$gt': 10}}),
        ('RD', 'random', 'scom', 'random', 'com_survival', {
            'liwc_anal.result.WC': {'$exists': True},
            'senti.result.whole.N': {'$gt': 10}}),
        ('YG', 'younger', 'scom', 'younger', 'com_survival', {
            'liwc_anal.result.WC': {'$exists': True},
            'senti.result.whole.N': {'$gt': 10}})
    ]
    data = []
    for tag, dbname, comname, dbname2, comname2, filter_values in groups:
        com = dbt.db_connect_col(dbname, comname)
        com2 = dbt.db_connect_col(dbname2, comname2)
        sentims = pickle.load(open(tag.lower() + '.sentis', 'r'))
        print len(sentims)
        network1 = gt.Graph.Read_GraphML(tag.lower() + '-net-all-active.graphml')
        gt.summary(network1)
        network1_gc = gt.giant_component(network1)
        gt.summary(network1_gc)
        '''Centralities Calculation'''
        eigen = network1_gc.eigenvector_centrality()
        pageranks = network1_gc.pagerank()
        indegree = network1_gc.authority_score()
        outdegree = network1_gc.hub_score()
        nodes = [int(v['name']) for v in network1_gc.vs]
        eigen_map = dict(zip(nodes, eigen))
        pagerank_map = dict(zip(nodes, pageranks))
        indegree_map = dict(zip(nodes, indegree))
        outdegree_map = dict(zip(nodes, outdegree))
        frialive, friduration = {}, {}
        for v in network1.vs:
            friends = set(network1.successors(str(v['name'])))
            if len(friends) > 0:
                falive, fduration = [], []
                for vi in friends:
                    falive.append(network1.vs[vi]['alive'])
                    fduration.append(network1.vs[vi]['duration'])
                frialive[int(v['name'])] = np.mean(falive)
                friduration[int(v['name'])] = np.mean(fduration)
        # print 'load liwc 2 batches: ' + tag.lower()+'-liwc2stage.csv'
        # liwc_df = pd.read_pickle(tag.lower()+'-liwc2stage.csv'+'.pick')
        network1 = gt.Graph.Read_GraphML(tag.lower() + '-net.graphml')
        for user in com.find(filter_values, no_cursor_timeout=True):
            first_scraped_at = user['_id'].generation_time.replace(tzinfo=None)
            if 'status' in user:
                uid = user['id']
                u2 = com2.find_one({'id': uid})
                first_last_post = datetime.strptime(user['status']['created_at'],
                                                    '%a %b %d %H:%M:%S +0000 %Y')
                last_post = first_last_post
                # guard: u2 may be absent or lack a status
                second_last_post, second_scraped_at = None, None
                first_statuses_count = user['statuses_count']
                second_statuses_count = first_statuses_count
                drop = 1
                if u2:
                    second_scraped_at = u2['_id'].generation_time.replace(tzinfo=None)
                    second_statuses_count = u2['statuses_count']
                    if 'status' in u2:
                        second_last_post = datetime.strptime(
                            u2['status']['created_at'],
                            '%a %b %d %H:%M:%S +0000 %Y')
                        if first_scraped_at < second_last_post < second_scraped_at:
                            drop = 0
                            last_post = second_last_post
                created_at = datetime.strptime(user['created_at'],
                                               '%a %b %d %H:%M:%S +0000 %Y')
                longest_tweet_intervalb = user['longest_tweet_interval']
                u_timeline_count = user['timeline_count']
                # values = iot.get_fields_one_doc(user, fields)
                values = [sentims[uid]]
                level = user['level']
                u_centrality = eigen_map.get(user['id'], 0)
                u_pagerank = pagerank_map.get(user['id'], 0)
                u_indegree = indegree_map.get(user['id'], 0)
                u_outdegree = outdegree_map.get(user['id'], 0)
                # values.extend(liwc_changes)
                values.extend(active_days(user))
                '''Get friends' profiles'''
                exist = True
                try:
                    v = network1.vs.find(name=str(uid))
                except ValueError:
                    exist = False
                if exist:
                    # friends = set(network1.neighbors(str(uid)))  # id or name
                    friends = set(network1.successors(str(uid)))
                    if len(friends) > 0:
                        friend_ids = [int(network1.vs[vi]['name'])
                                      for vi in friends]  # return id
                        print uid in friend_ids
                        print len(friend_ids)
                        fatts = []
                        alive = 0
                        ffatts = []
                        for fid in friend_ids:
                            if fid in sentims:
                                fatt = [sentims[fid]]
                                fatt.extend([eigen_map.get(fid, 0),
                                             pagerank_map.get(fid, 0),
                                             indegree_map.get(fid, 0),
                                             outdegree_map.get(fid, 0)])
                                fatts.append(fatt)
                                friendfriends = set(network1.successors(str(fid)))
                                if len(friendfriends) > 0:
                                    friendfriends_ids = [
                                        int(network1.vs[vi]['name'])
                                        for vi in friendfriends]  # return id
                                    for ffid in friendfriends_ids:
                                        if ffid in sentims:
                                            ffatt = [sentims[ffid]]
                                            ffatts.append(ffatt)
                        if (len(fatts) > 0) and (len(ffatts) > 0):
                            fatts = np.array(fatts)
                            fmatts = np.mean(fatts, axis=0)
                            ffatts = np.array(ffatts)
                            ffmatts = np.mean(ffatts, axis=0)
                            values.extend(fmatts)
                            # paliv = float(alive)/len(fatts)
                            paliv = frialive.get(uid)
                            fdays = friduration.get(uid)
                            data.append([user['id_str'], level, drop, created_at,
                                         first_last_post, second_last_post,
                                         last_post, first_scraped_at,
                                         second_scraped_at,
                                         first_statuses_count,
                                         second_statuses_count,
                                         longest_tweet_intervalb, tag,
                                         u_centrality, u_pagerank, u_indegree,
                                         u_outdegree, u_timeline_count] +
                                        values + [len(fatts), paliv, fdays] +
                                        ffmatts.tolist())
    df = pd.DataFrame(data, columns=['uid', 'level', 'dropout', 'created_at',
                                     'first_last_post', 'second_last_post',
                                     'last_post', 'first_scraped_at',
                                     'second_scraped_at',
                                     'first_statuses_count',
                                     'second_statuses_count',
                                     'longest_time_interval', 'group',
                                     'u_eigenvector', 'u_pagerank',
                                     'u_authority', 'u_hub',
                                     'u_timeline_count'] +
                      ['u_' + field for field in trimed_fields] +
                      # ['u_prior_'+field for field in trimed_fields] +
                      # ['u_post_'+field for field in trimed_fields] +
                      # ['u_change_'+field for field in trimed_fields] +
                      ['u_' + field for field in prof_names] +
                      ['f_' + tf for tf in trimed_fields] +
                      ['f_eigenvector', 'f_pagerank', 'f_authority', 'f_hub',
                       'f_num', 'f_palive', 'f_days'] +
                      ['ff_' + field for field in trimed_fields])
    df.to_csv(filename)

def emotion_dropout_IV_following(filepath):
    '''
    Only use following stats.
    :param filepath: path of the output CSV
    :return:
    '''
    fields = [
        'senti.result.whole.posm',
        'senti.result.whole.posstd',
        'senti.result.whole.negm',
        'senti.result.whole.negstd',
        'senti.result.whole.scalem',
        'senti.result.whole.scalestd',
        'senti.result.whole.N',
        'senti.result.prior.scalem',
        'senti.result.post.scalem',
        # 'liwc_anal.result.posemo',
        # 'liwc_anal.result.negemo',
        # 'liwc_anal.result.ingest',
        # 'liwc_anal.result.bio',
        # 'liwc_anal.result.body',
        # 'liwc_anal.result.health',
        # 'liwc_anal.result.death'
        # 'liwc_anal.result.anx',
        # 'liwc_anal.result.anger',
        # 'liwc_anal.result.sad'
    ]
    trimed_fields = ['-'.join(field.split('.')[-2: -1]) for field in fields]
    prof_names = ['friends_count', 'statuses_count', 'followers_count',
                  'friends_day', 'statuses_day', 'followers_day', 'days',
                  'eigenvector', 'pagerank', 'authority', 'hub']
    attr_names = ['uid', 'group', 'attr', 'level']
    attr_names.extend(['u_' + field for field in trimed_fields])
    # attr_names.extend(['u_prior_'+field for field in trimed_fields])
    # attr_names.extend(['u_post_'+field for field in trimed_fields])
    # attr_names.extend(['u_change_'+field for field in trimed_fields])
    attr_names.extend(['u_' + field for field in prof_names])
    attr_names.extend([
        # 'u_recovery_tweets',
        'u_timeline_count'])
    attr_names.extend(['f_' + field.split('.')[-1] for field in fields])
    attr_names.extend(['f_' + field for field in prof_names])
    attr_names.extend(['f_timeline_count', 'f_num', 'f_palive'])
    print attr_names
    data = []
    name_map = {
        'ed': ('fed', 'fed_sur', 'com', 'com',
               {'level': 1, 'liwc_anal.result.WC': {'$exists': True}}),
        'yg': ('younger', 'younger_sur', 'scom', 'com',
               {'liwc_anal.result.WC': {'$exists': True}}),
        'rd': ('random', 'random_sur', 'scom', 'com',
               {'liwc_anal.result.WC': {'$exists': True}})
    }
    for groupname in ['yg', 'rd', 'ed']:
        dbname1, dbname2, comname1, comname2, filter_que = name_map[groupname]
        print 'Centrality Calculate .........'
        # users = iot.get_values_one_field('fed', 'com', 'id', {'level': {'$lt': 3}})
        # print 'Number of users', len(users)
        # network1 = gt.load_network_subset('fed', 'net', {'user': {'$in': users}, 'follower': {'$in': users}})
        # network1 = gt.load_network('fed', 'net')
        # pickle.dump(network1, open('net.pick', 'w'))
        print 'load network: ' + groupname + '-net.graphml'
        network1 = gt.Graph.Read_GraphML(groupname + '-net.graphml')
        # network1 = pickle.load(open('net.pick', 'r'))
        gt.summary(network1)
        network1_gc = gt.giant_component(network1)
        gt.summary(network1_gc)
        '''Centralities Calculation'''
        eigen = network1_gc.eigenvector_centrality()
        pageranks = network1_gc.pagerank()
        indegree = network1_gc.authority_score()
        outdegree = network1_gc.hub_score()
        # closeness = network.closeness()
        # betweenness = network.betweenness()
        # print len(eigen), len(closeness), len(betweenness)
        nodes = [int(v['name']) for v in network1_gc.vs]
        # print len(nodes), len(eigen)
        # print type(nodes), type(eigen)
        eigen_map = dict(zip(nodes, eigen))
        pagerank_map = dict(zip(nodes, pageranks))
        indegree_map = dict(zip(nodes, indegree))
        outdegree_map = dict(zip(nodes, outdegree))
        # print eigen_map.get(nodes[1]), type(eigen_map.get(nodes[1]))
        # closeness_map = dict(zip(nodes, closeness))
        # betweenness_map = dict(zip(nodes, betweenness))
        print 'Centrality Calculate .........'
        # print 'load liwc 2 batches: ' + groupname+'-liwc2stage.csv'
        # df = pd.read_pickle(groupname+'-liwc2stage.csv'+'.pick')
        user1 = iot.get_values_one_field(dbname1, comname1, 'id', filter_que)
        print 'load db1: ', dbname1, comname1
        com1 = dbt.db_connect_col(dbname1, comname1)
        print 'load db2: ', dbname2, comname2
        com2 = dbt.db_connect_col(dbname2, comname2)
        for uid in user1:
            # set uid
            row = [uid, groupname]
            # set attrition state
            u1 = com1.find_one({'id': uid})
            u2 = com2.find_one({'id': uid})
            u1_time = u1['_id'].generation_time.replace(tzinfo=None)
            # if u2 is None or u2['timeline_count'] == 0:
            drop = 1
            if u2:
                u2_time = u2['_id'].generation_time.replace(tzinfo=None)
                if 'status' in u2:
                    second_last_post = datetime.strptime(
                        u2['status']['created_at'],
                        '%a %b %d %H:%M:%S +0000 %Y')
                    if u1_time < second_last_post < u2_time:
                        drop = 0
            row.append(drop)
            row.append(u1['level'])
            # set user's LIWC features
            uatt = iot.get_fields_one_doc(u1, fields)
            row.extend(uatt)
            # # set user's LIWC changes
            # uvs = df[df.user_id == str(uid)].loc[:, trimed_fields]
            # # print uvs
            # if len(uvs) == 2:
            #     changes, priors, posts = [], [], []
            #     for name in trimed_fields:
            #         old = uvs.iloc[0][name]
            #         new = uvs.iloc[1][name]
            #         priors.append(old)
            #         posts.append(new)
            #         changes.append(new - old)
            #     row.extend(priors)
            #     row.extend(posts)
            #     row.extend(changes)
            # else:
            #     row.extend([None]*(len(trimed_fields)*3))
            # set profile, active days and centralities
            print u1['id']
            row.extend(active_days(u1))
            row.extend([eigen_map.get(u1['id'], 0)])
            row.extend([pagerank_map.get(u1['id'], 0)])
            row.extend([indegree_map.get(u1['id'], 0)])
            row.extend([outdegree_map.get(u1['id'], 0)])
            row.extend([
                # u1['recovery_tweets'],
                u1['timeline_count']])
            exist = True
            try:
                v = network1.vs.find(name=str(uid))
            except ValueError:
                exist = False
            if exist:
                # friends = set(network1.neighbors(str(uid)))  # id or name
                friends = set(network1.successors(str(uid)))
                if len(friends) > 0:
                    friend_ids = [int(network1.vs[vi]['name'])
                                  for vi in friends]  # return id
                    print uid in friend_ids
                    print len(friend_ids)
                    fatts = []
                    alive = 0
                    for fid in friend_ids:
                        fu = com1.find_one({
                            'id': fid,
                            'liwc_anal.result.WC': {'$exists': True}
                        })
                        fu2 = com2.find_one({'id': fid})
                        if fu:
                            f1_time = fu['_id'].generation_time.replace(tzinfo=None)
                            # if eigen_map.get(fu['id'], 0) > 0.0001:
                            if True:
                                fatt = iot.get_fields_one_doc(fu, fields)
                                factive = active_days(fu)
                                if fu2:
                                    f2_time = fu2['_id'].generation_time.replace(tzinfo=None)
                                    if 'status' in fu2:
                                        fsecond_last_post = datetime.strptime(
                                            fu2['status']['created_at'],
                                            '%a %b %d %H:%M:%S +0000 %Y')
                                        if f1_time < fsecond_last_post < f2_time:
                                            alive += 1
                                            factive = active_days(fu2)
                                fatt.extend(factive)
                                fatt.extend([eigen_map.get(fu['id'], 0)])
                                fatt.extend([pagerank_map.get(fu['id'], 0)])
                                fatt.extend([indegree_map.get(fu['id'], 0)])
                                fatt.extend([outdegree_map.get(fu['id'], 0)])
                                fatt.extend([fu['timeline_count']])
                                fatts.append(fatt)
                    if len(fatts) > 0:
                        fatts = np.array(fatts)
                        fmatts = np.mean(fatts, axis=0)
                        row.extend(fmatts)
                        row.append(len(fatts))
                        paliv = float(alive) / len(fatts)
                        print 'Alive %d %d %.3f' % (alive, len(fatts), paliv)
                        row.append(paliv)
            # print row
            data.append(row)
    df = pd.DataFrame(data, columns=attr_names)
    df.to_csv(filepath, index=False)

def network_users(file_path):
    # get the user list of a network
    g = gt.Graph.Read_GraphML(file_path)
    g = gt.giant_component(g)
    gt.summary(g)
    return g.vs['name']

def community_net(rec_g, ped_g):
    # Construct a network between the communities of two networks, linked by
    # the overlap (Jaccard-style similarity) of their member hashtags
    gc_rec_g = gt.giant_component(rec_g)
    com_rec_g = gc_rec_g.community_multilevel(weights='weight',
                                              return_levels=False)
    comclus_rec_g = com_rec_g.subgraphs()
    print 'Community stats: #communities, modularity', \
        len(comclus_rec_g), com_rec_g.modularity
    gc_ped_g = gt.giant_component(ped_g)
    com_ped_g = gc_ped_g.community_multilevel(weights='weight',
                                              return_levels=False)
    comclus_ped_g = com_ped_g.subgraphs()
    print 'Community stats: #communities, modularity', \
        len(comclus_ped_g), com_ped_g.modularity
    name_map, edges, node_weight = {}, {}, {}
    for i in xrange(len(comclus_rec_g)):
        comclu_rec_g = comclus_rec_g[i]
        rec_nodes = set([v['name'] for v in comclu_rec_g.vs])
        max_fre_rec = max(comclu_rec_g.vs['weight'])
        max_fre_rec_tag = comclu_rec_g.vs.find(weight_eq=max_fre_rec)['name']
        n1 = 'rec_' + str(i) + '_' + max_fre_rec_tag
        for j in xrange(len(comclus_ped_g)):
            comclu_ped_g = comclus_ped_g[j]
            max_fre = max(comclu_ped_g.vs['weight'])
            ed_nodes = set([v['name'] for v in comclu_ped_g.vs])
            max_fre_tag = comclu_ped_g.vs.find(weight_eq=max_fre)['name']
            n2 = 'ped_' + str(j) + '_' + max_fre_tag
            n1id = name_map.get(n1, len(name_map))
            name_map[n1] = n1id
            node_weight[n1id] = sum(comclu_rec_g.vs['weight'])
            n2id = name_map.get(n2, len(name_map))
            name_map[n2] = n2id
            node_weight[n2id] = sum(comclu_ped_g.vs['weight'])
            # raw overlap size; the Jaccard denominator
            # /len(rec_nodes.union(ed_nodes)) is disabled
            similarity = float(len(rec_nodes.intersection(ed_nodes)))
            if similarity > 10:
                edges[(n1id, n2id)] = similarity
    g = gt.Graph(len(name_map), directed=False)
    g.vs["name"] = list(sorted(name_map, key=name_map.get))
    g.vs['weight'] = [node_weight[i] for i in xrange(len(node_weight))]
    g.add_edges(edges.keys())
    g.es["weight"] = edges.values()
    for v in g.vs:
        tokens = v['name'].split('_')
        v['set'] = tokens[0]
        v['tag'] = tokens[2]
    g.write_graphml('hashtag_inter_net.graphml')
    gc = gt.giant_component(g)
    targets_communities = {}
    for v in gc.vs:
        tokens = v['name'].split('_')
        com_list = targets_communities.get(tokens[0], [])
        com_list.append(int(tokens[1]))
        targets_communities[tokens[0]] = com_list
    return targets_communities

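# A minimal sketch for the return value of community_net(): it maps each
# network label ('rec'/'ped') to the community indices that survive in the
# giant component of the inter-community graph (the helper name is
# illustrative only).
def demo_community_net(rec_g, ped_g):
    targets = community_net(rec_g, ped_g)
    for label, com_ids in targets.items():
        print label, len(com_ids), 'communities retained'
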
def read_user_time_iv(filename):
    # fields = iot.read_fields()
    fields = [
        'liwc_anal.result.posemo',
        'liwc_anal.result.negemo',
        'liwc_anal.result.ingest',
        'liwc_anal.result.bio',
        'liwc_anal.result.body',
        'liwc_anal.result.health',
        'liwc_anal.result.death',
        'liwc_anal.result.anx',
        'liwc_anal.result.anger',
        'liwc_anal.result.sad'
    ]
    prof_names = ['friends_count', 'statuses_count', 'followers_count',
                  'friends_day', 'statuses_day', 'followers_day', 'days']
    trimed_fields = [field.split('.')[-1] for field in fields]
    groups = [
        ('ED', 'fed', 'com', 'fed_sur', 'com',
         '2017-06-21 14:57:39+00:00', {
             'liwc_anal.result.WC': {'$exists': True},
             'level': 1}),
        ('RD', 'random', 'scom', 'random_sur', 'com',
         '2017-06-21 14:57:39+00:00', {
             'liwc_anal.result.WC': {'$exists': True}}),
        ('YG', 'younger', 'scom', 'younger_sur', 'com',
         '2017-06-21 14:57:39+00:00', {
             'liwc_anal.result.WC': {'$exists': True}})
    ]
    data = []
    for tag, dbname, comname, dbname2, comname2, second_time, filter_values in groups:
        com = dbt.db_connect_col(dbname, comname)
        com2 = dbt.db_connect_col(dbname2, comname2)
        network1 = gt.Graph.Read_GraphML(tag.lower() + '-net.graphml')
        gt.summary(network1)
        network1_gc = gt.giant_component(network1)
        gt.summary(network1_gc)
        '''Centralities Calculation'''
        eigen = network1_gc.eigenvector_centrality()
        pageranks = network1_gc.pagerank()
        indegree = network1_gc.authority_score()
        outdegree = network1_gc.hub_score()
        nodes = [int(v['name']) for v in network1_gc.vs]
        eigen_map = dict(zip(nodes, eigen))
        pagerank_map = dict(zip(nodes, pageranks))
        indegree_map = dict(zip(nodes, indegree))
        outdegree_map = dict(zip(nodes, outdegree))
        print 'load liwc 2 batches: ' + tag.lower() + '-liwc2stage.csv'
        liwc_df = pd.read_pickle(tag.lower() + '-liwc2stage.csv' + '.pick')
        for user in com.find(filter_values, no_cursor_timeout=True):
            first_scraped_at = user['_id'].generation_time.replace(tzinfo=None)
            if 'status' in user:
                uid = user['id']
                u2 = com2.find_one({'id': uid})
                first_last_post = datetime.strptime(
                    user['status']['created_at'],
                    '%a %b %d %H:%M:%S +0000 %Y')
                last_post = first_last_post
                # guard: u2 may be absent or lack a status
                second_last_post, second_scraped_at = None, None
                drop = 1
                if u2:
                    second_scraped_at = u2['_id'].generation_time.replace(
                        tzinfo=None)
                    if 'status' in u2:
                        second_last_post = datetime.strptime(
                            u2['status']['created_at'],
                            '%a %b %d %H:%M:%S +0000 %Y')
                        if first_scraped_at < second_last_post < second_scraped_at:
                            drop = 0
                            last_post = second_last_post
                created_at = datetime.strptime(user['created_at'],
                                               '%a %b %d %H:%M:%S +0000 %Y')
                life_time = diff_day(last_post, created_at)
                # max(1, ...) avoids division by zero for users with no statuses
                average_time = float(life_time) / max(1, user['statuses_count'])
                longest_tweet_intervalb = user['longest_tweet_interval']
                u_timeline_count = user['timeline_count']
                values = iot.get_fields_one_doc(user, fields)
                level = user['level']
                # set user's LIWC changes between the two batches
                uvs = liwc_df[liwc_df.user_id == str(uid)].loc[:, trimed_fields]
                # print uvs
                if len(uvs) == 2:
                    changes, priors, posts = [], [], []
                    for name in trimed_fields:
                        old = uvs.iloc[0][name]
                        new = uvs.iloc[1][name]
                        priors.append(old)
                        posts.append(new)
                        changes.append(new - old)
                    liwc_changes = priors + posts + changes
                else:
                    liwc_changes = [None] * (len(trimed_fields) * 3)
                u_centrality = eigen_map.get(user['id'], 0)
                u_pagerank = pagerank_map.get(user['id'], 0)
                u_indegree = indegree_map.get(user['id'], 0)
                u_outdegree = outdegree_map.get(user['id'], 0)
                values.extend(liwc_changes)
                values.extend(active_days(user))
                '''Get friends' profiles'''
                exist = True
                try:
                    v = network1.vs.find(name=str(uid))
                except ValueError:
                    exist = False
                if exist:
                    # friends = set(network1.neighbors(str(uid)))  # id or name
                    friends = set(network1.successors(str(uid)))
                    if len(friends) > 0:
                        friend_ids = [int(network1.vs[vi]['name'])
                                      for vi in friends]  # return id
                        print uid in friend_ids
                        print len(friend_ids)
                        fatts = []
                        alive = 0
                        for fid in friend_ids:
                            fu = com.find_one({
                                'id': fid,
                                'liwc_anal.result.WC': {'$exists': True}
                            })
                            fu2 = com2.find_one({'id': fid})
                            if fu:
                                f1_time = fu['_id'].generation_time.replace(
                                    tzinfo=None)
                                # if eigen_map.get(fu['id'], 0) > 0.0001:
                                if True:
                                    fatt = iot.get_fields_one_doc(fu, fields)
                                    factive = active_days(fu)
                                    if fu2:
                                        f2_time = fu2['_id'].generation_time.replace(
                                            tzinfo=None)
                                        if 'status' in fu2:
                                            fsecond_last_post = datetime.strptime(
                                                fu2['status']['created_at'],
                                                '%a %b %d %H:%M:%S +0000 %Y')
                                            if f1_time < fsecond_last_post < f2_time:
                                                alive += 1
                                                factive = active_days(fu2)
                                    fatt.extend(factive)
                                    fatt.extend([eigen_map.get(fu['id'], 0),
                                                 pagerank_map.get(fu['id'], 0),
                                                 indegree_map.get(fu['id'], 0),
                                                 outdegree_map.get(fu['id'], 0)])
                                    fatts.append(fatt)
                        # thredhold = user['friends_count']*0.5
                        if len(fatts) > 0:
                            fatts = np.array(fatts)
                            fmatts = np.mean(fatts, axis=0)
                            values.extend(fmatts)
                            paliv = float(alive) / len(fatts)
                            data.append([user['id_str'], level, drop,
                                         created_at, first_last_post,
                                         second_last_post, last_post,
                                         first_scraped_at, second_scraped_at,
                                         average_time,
                                         longest_tweet_intervalb, tag,
                                         u_centrality, u_pagerank, u_indegree,
                                         u_outdegree, u_timeline_count] +
                                        values + [len(fatts), paliv])
    df = pd.DataFrame(
        data,
        columns=['uid', 'level', 'dropout', 'created_at', 'first_last_post',
                 'second_last_post', 'last_post', 'first_scraped_at',
                 'second_scraped_at', 'average_time', 'longest_time_interval',
                 'group', 'u_eigenvector', 'u_pagerank', 'u_authority',
                 'u_hub', 'u_timeline_count'] +
        ['u_' + field for field in trimed_fields] +
        ['u_prior_' + field for field in trimed_fields] +
        ['u_post_' + field for field in trimed_fields] +
        ['u_change_' + field for field in trimed_fields] +
        ['u_' + field for field in prof_names] +
        ['f_' + tf for tf in trimed_fields] +
        ['f_' + field for field in prof_names] +
        ['f_eigenvector', 'f_pagerank', 'f_authority', 'f_hub', 'f_num',
         'f_palive'])
    df.to_csv(filename)
