Example #1
def network(dbname, colname, netname):
    '''Get users' friendship network'''
    # # ed_usersd = ed_user(dbname, colname)
    # # pickle.dump(ed_usersd, open('data/ed_users.pick', 'w'))
    # ed_usersd = pickle.load(open('data/ed_users.pick', 'r'))
    #
    # # rec_usersd = rec_user(dbname, colname)
    # # pickle.dump(rec_usersd, open('data/rec_users.pick', 'w'))
    # rec_usersd = pickle.load(open('data/rec_users.pick', 'r'))
    #
    #
    # inlist = list(set(ed_usersd).union(set(rec_usersd)))
    #
    # print len(inlist)
    # g = gt.load_network_subset(inlist, dbname, netname)
    # g.vs['rec'] = 0
    # for uid in rec_usersd:
    #     exist = True
    #     try:
    #         v = g.vs.find(name=str(uid))
    #     except ValueError:
    #         exist = False
    #     if exist:
    #         v['rec'] = 1
    # pickle.dump(g, open('data/rec_friendship.pick', 'w'))
    rg = pickle.load(open('data/rec_friendship.pick', 'r'))
    # g.write_gml('data/rec_friendship.GML')
    # g.write_dot('data/rec_friendship.DOT')

    gc = gt.giant_component(rg, 'WEAK')
    comm = gt.fast_community(gc, False)
    fclus = comm.as_clustering(2)
    communit_topinflu(fclus, None)
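
Note: gt appears to be a project-local wrapper around python-igraph. A minimal sketch of what giant_component presumably does under that assumption (the real helper may differ):

import igraph

def giant_component(g, mode='WEAK'):
    # hypothetical sketch: decompose into connected components
    # ('WEAK' ignores edge direction) and keep the largest as a subgraph
    m = igraph.WEAK if mode == 'WEAK' else igraph.STRONG
    return g.components(mode=m).giant()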
Example #2
def tag_activity(dbname, colname):
    # record each hashtag's activity timeline

    g = gt.Graph.Read_GraphML('data/pro_mention_tag_undir.graphml')
    vs = g.vs(weight_gt=3, user_gt=3)
    sg = g.subgraph(vs)
    gc = gt.giant_component(sg)
    tag_time = {}
    for v in gc.vs:
        tag_time[v['name']] = []

    col = dbt.db_connect_col(dbname, colname)
    query = {}  # renamed from `filter` to avoid shadowing the builtin
    query['$where'] = 'this.entities.hashtags.length>0'
    query['retweeted_status'] = {'$exists': False}  # originals only, no retweets
    for tweet in col.find(query, no_cursor_timeout=True):
        created_at = datetime.strptime(tweet['created_at'],
                                       '%a %b %d %H:%M:%S +0000 %Y')
        for hashtag in tweet['entities']['hashtags']:
            # normalize: lowercase and strip '_' and '-'
            tag = (hashtag['text'].encode('utf-8').lower()
                   .replace('_', '').replace('-', ''))
            if tag in tag_time:
                tag_time[tag].append(created_at)
    pickle.dump(tag_time, open('tag_activity.pick', 'w'))
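
A hedged follow-up sketch (assumed usage, not part of the source): reload the pickle and bin each hashtag's timestamps by day to get an activity series.

import pickle
from collections import Counter

tag_time = pickle.load(open('tag_activity.pick', 'r'))
daily = {}
for tag, dates in tag_time.items():
    daily[tag] = Counter(d.date() for d in dates)  # tweets per calendar day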
Example #3
def community(g=None):
    '''
    Detect communities in the co-occurrence network of hashtag
    Use InfoMap to detect communities
    Only select communities whose sizes are larger than a threshold
    :param g:
    :return:
    '''
    if g is None:
        g = gt.Graph.Read_GraphML('ed_tag.graphml')
    gc = gt.giant_component(g)
    com = gc.community_infomap(edge_weights='weight', vertex_weights='weight')
    comclus = com.subgraphs()
    print len(comclus), com.modularity
    index = 0
    hash_com = {}
    for comclu in comclus:
        if comclu.vcount() > 10:
            tag_weight = {}
            for v in comclu.vs:
                hash_com[v['name']] = index
                tag_weight[v['name']] = v['weight']
            index += 1
            sort_list = list(sorted(tag_weight, key=tag_weight.get, reverse=True))
            for key in sort_list:
                print key, tag_weight[key]
    print len(hash_com)
    print len(set(hash_com.values()))
    print set(hash_com.values())
    return hash_com
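
hash_com maps each retained hashtag to its community index. A hedged usage sketch that writes the assignment back onto the graph (output file name is illustrative):

g = gt.Graph.Read_GraphML('ed_tag.graphml')
hash_com = community(g)
g.vs['community'] = [hash_com.get(v['name'], -1) for v in g.vs]  # -1: below threshold
g.write_graphml('ed_tag_community.graphml')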
Example #4
def test_user_cluster_assign_stable():
    # Test how stable the final user-clustering assignments are (k=2)
    core = gt.Graph.Read_GraphML('alled_tag_undir_filter.graphml')
    communication = gt.Graph.Read_GraphML(
        'communication-only-fed-filter.graphml')
    gt.summary(communication)
    communication = gt.giant_component(communication)
    gt.summary(communication)
    users = [(v['name']) for v in communication.vs]
    print len(users)
    # user_hashtag_vector('fed', 'ed_tag', users)
    separations = []
    for i in xrange(100):
        print i
        user_hashtag_profile(core, users)
        # data += user_cluster_hashtag()
        clusters, ids = user_cluster_hashtag()
        separations.append(clusters)
    aRI = []
    for i in xrange(100):
        for j in xrange(i + 1, 100):
            aRI.append(
                metrics.adjusted_rand_score(separations[i], separations[j]))
    print len(aRI)
    print '%.3f, %.3f, %.3f, %.3f' % (min(aRI), max(aRI), np.mean(aRI),
                                      np.std(aRI))
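
For reference, the nested loop compares every unordered pair of the 100 runs exactly once, so len(aRI) should be C(100, 2) = 4950:

from itertools import combinations

runs = 100  # matches the xrange(100) loops above
print len(list(combinations(range(runs), 2)))  # 4950 pairwise ARI scores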
Example #5
def test_significant(file_path):
    # randomly rewire the network (reusing the original weights) and test
    # whether its community structure is statistically significant
    g = gt.Graph.Read_GraphML(file_path)
    gt.summary(g)
    g = g.as_undirected(combine_edges=dict(weight="sum"))
    g = gt.giant_component(g)
    gt.summary(g)
    # print g.es['weight']
    fast = g.community_fastgreedy(weights='weight')
    fast_com = fast.as_clustering(n=2)
    orig_mod = fast_com.modularity
    mod_list = []

    for i in xrange(1000):
        weights = g.es["weight"]
        g.rewire()
        g.es["weight"] = weights
        # gt.net_stat(g)
        # print g.es['weight']
        fast = g.community_fastgreedy(weights='weight')
        fast_com = fast.as_clustering()
        mod_list.append(fast_com.modularity)


    amean, astd = np.mean(mod_list), np.std(mod_list)
    print 'simulated values: %.3f +- (%.3f)' % (amean, astd)
    # absobserved = abs(raw_assort)
    # pval = (np.sum(ass_list >= absobserved) +
    #         np.sum(ass_list <= -absobserved))/float(len(ass_list))
    zscore = (orig_mod - amean) / astd
    print 'z-score: %.3f' % zscore
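
Assuming the rewired (null-model) modularities are roughly normal, the z-score can be converted into a two-sided p-value:

from scipy import stats

pval = 2 * (1 - stats.norm.cdf(abs(zscore)))  # two-sided tail under normality
print 'two-sided p-value: %.4f' % pval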
Example #6
def tfidf_tag_cluster(btype='retweet'):
    # Calculate the TFIDF of tags in two clusters
    cluster0 = gt.Graph.Read_GraphML('ed_' + btype +
                                     '_fed_cluster0_tag_undir.graphml')
    cluster1 = gt.Graph.Read_GraphML('ed_' + btype +
                                     '_fed_cluster1_tag_undir.graphml')

    gt.summary(cluster0)
    vs = cluster0.vs(weight_gt=3, user_gt=3)
    cluster0 = cluster0.subgraph(vs)
    cluster0 = gt.giant_component(cluster0)
    gt.summary(cluster0)

    gt.summary(cluster1)
    vs = cluster1.vs(weight_gt=3, user_gt=3)
    cluster1 = cluster1.subgraph(vs)
    cluster1 = gt.giant_component(cluster1)
    gt.summary(cluster1)

    for v in cluster0.vs:
        exist = True
        count_ov = 0.0
        try:
            ov = cluster1.vs.find(name=v['name'])
        except ValueError:
            exist = False
        if exist:
            count_ov = ov['weight']
        v['tfidf'] = float(v['weight']) / (v['weight'] + count_ov)
    for v in cluster1.vs:
        exist = True
        count_ov = 0.0
        try:
            ov = cluster0.vs.find(name=v['name'])
        except ValueError:
            exist = False
        if exist:
            count_ov = ov['weight']
        v['tfidf'] = float(v['weight']) / (v['weight'] + count_ov)
    cluster0.write_graphml('ed_' + btype +
                           '_fed_cluster0_tfidf_tag_undir.graphml')
    cluster1.write_graphml('ed_' + btype +
                           '_fed_cluster1_tfidf_tag_undir.graphml')
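
The two loops above duplicate the cross-cluster lookup. A hedged refactor with identical behavior (helper name is illustrative):

def relative_weight(src, other):
    # score = w_src / (w_src + w_other); equals 1.0 when the tag
    # does not occur in the other cluster
    for v in src.vs:
        try:
            w_other = other.vs.find(name=v['name'])['weight']
        except ValueError:
            w_other = 0.0
        v['tfidf'] = float(v['weight']) / (v['weight'] + w_other)

relative_weight(cluster0, cluster1)
relative_weight(cluster1, cluster0)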
Example #7
def community(g=None):
    '''
    Detect communities in the hashtag co-occurrence network
    Use InfoMap to detect communities (multilevel kept as a commented-out alternative)
    Only assign tags whose weight exceeds a threshold to communities
    :param g:
    :return:
    hash_com: {hashtag: community_index}
    com_size: {community_index: community_size}
    '''
    gt.summary(g)
    vs = g.vs(weight_gt=100, user_gt=10)
    g = g.subgraph(vs)
    g = g.subgraph_edges(g.es.select(rWeight_gt=0, rWeight_lt=float('Inf')))
    gt.summary(g)
    gc = gt.giant_component(g)
    gt.summary(gc)
    # g.write_graphml('fed_tag_undir_over3.graphml')
    # com = g.community_multilevel(weights='rWeight', return_levels=False)
    com = g.community_infomap(edge_weights='rWeight', vertex_weights=None)
    # com = louvain.find_partition(gc, method='Significance', weight=None)
    comclus = com.subgraphs()
    print 'Community stats: #communities, modularity', len(
        comclus), com.modularity
    index = 0
    nonsingle = 0
    hash_com = {}
    com_size = {}
    for comclu in comclus:
        print '---------- Community ', index, '-----------------'
        if comclu.vcount() > 1:
            nonsingle += 1
        tag_weight = {}
        for v in comclu.vs:
            if v['weight'] > 5:
                hash_com[v['name']] = index
            tag_weight[v['name']] = v['weight']
            count = com_size.get(index, 0)
            com_size[index] = v['weight'] + count
        sort_list = list(sorted(tag_weight, key=tag_weight.get, reverse=True))
        for key in sort_list:
            print key, tag_weight[key]
        print '-------------Community size: ', com_size[
            index], '---------------------'
        print
        index += 1
    # print len(hash_com)
    # print len(set(hash_com.values()))
    # print set(hash_com.values())
    print '------------------all size:', sum(
        com_size.values()), '---------------------'
    print '------------------non single clusters:', nonsingle, '---------------------'

    return hash_com, com_size
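
A hedged usage sketch: rank the detected communities by aggregate tag weight with the returned com_size map (g as loaded elsewhere):

hash_com, com_size = community(g)
top = sorted(com_size.items(), key=lambda kv: kv[1], reverse=True)
for index, size in top[:5]:
    print 'community', index, 'weight', size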
Example #8
def test_stable_infomap_kmean():
    # Test the stability of the whole pipeline: InfoMap clustering of hashtags
    # followed by k-means clustering of users
    import tag_network
    core = gt.Graph.Read_GraphML('alled_tag_undir_filter.graphml')
    communication = gt.Graph.Read_GraphML('communication-only-fed-filter.graphml')
    gt.summary(communication)
    communication = gt.giant_component(communication)
    gt.summary(communication)
    users = [(v['name']) for v in communication.vs]
    print len(users)
    tag_network.user_hashtag_profile(core, users)
Example #9
def test_user_cluster_stable():
    # Test the stability of InfoMap and find the best k for k-means
    core = gt.Graph.Read_GraphML('data/alled_tag_undir_filter.graphml')
    communication = gt.Graph.Read_GraphML(
        'data/communication-only-fed-filter.graphml')
    gt.summary(communication)
    communication = gt.giant_component(communication)
    gt.summary(communication)
    users = [(v['name']) for v in communication.vs]
    print len(users)
    # user_hashtag_vector('fed', 'ed_tag', users)
    data = []
    for i in xrange(100):
        user_hashtag_profile(core, users, i)
        ###### Run by python
        data += user_cluster_hashtag()
    df = pd.DataFrame(data, columns=['cluster', 'silhouette_avg'])
    df.to_csv('user-kmeans-hashtag.csv')
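
A hedged follow-up, assuming the 'cluster' column of the CSV records k: the best k is the one with the highest mean silhouette across the 100 runs.

import pandas as pd

df = pd.read_csv('user-kmeans-hashtag.csv')
print df.groupby('cluster')['silhouette_avg'].mean().idxmax()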
Example #10
def cluster_nodes(btype='communication'):
    # cluster users in networks
    g = gt.Graph.Read_GraphML('communication-only-fed-filter.graphml')
    g = gt.giant_component(g)
    gt.summary(g)

    clusters, ids = tn.user_cluster_hashtag('ed-' + btype + '.data')

    # ids = []
    # with open('ed-'+btype+'.data', 'r') as fo:
    #     for line in fo.readlines():
    #         ids.append(line.split(' ')[0])

    g.vs['cluster'] = -1
    for i in xrange(len(clusters)):
        uid = ids[i]  # renamed from `id` to avoid shadowing the builtin
        v = g.vs.find(name=uid)
        v['cluster'] = clusters[i]
    g.write_graphml('communication-only-fed-filter-hashtag-cluster.graphml')
Example #11
def rank_feature(gc, dbname, comname, db_field_names, directed=True):
    g = gt.giant_component(gc, 'WEAK')

    g.vs['nt'] = g.degree(mode="in")
    netatt = g.vs['nt']

    # ranks = g.pagerank(weights='weight')
    # g.vs['rank'] = ranks

    # cor = st.tau_coef(g.degree(type="in"), g.vs['rank'])
    # print 'Indegree' + '\t' + str(cor[0]) + '\t' + str(cor[1])
    # cor = st.tau_coef(g.degree(type="out"), g.vs['rank'])
    # print 'Outdegree' + '\t' + str(cor[0]) + '\t' + str(cor[1])

    for db_field_name in db_field_names:
        # print 'Processing ' + db_field_name
        g = gt.add_attribute(g, 'foi', dbname, comname, db_field_name)
        raw_values = np.array(g.vs['foi'])
        values = drop_initials(raw_values)

        if len(values) > 100:
            # maxv, minv = max(values), min(values)
            maxv, minv = np.percentile(values,
                                       97.5), np.percentile(values, 2.5)
            vs = g.vs(foi_ge=minv, foi_le=maxv)
            sg = g.subgraph(vs)

            maxd, mind = np.percentile(netatt,
                                       97.5), np.percentile(netatt, 2.5)
            vs = sg.vs(nt_ge=mind, nt_le=maxd)
            sg = sg.subgraph(vs)

            # cor = st.tau_coef(sg.vs['foi'], sg.vs['nt'])
            # print db_field_name + '\t' + str(len(sg.vs)) + '\t' + str(len(sg.es)) + '\t'\
            #       + str(min(netatt)) + '\t' + str(max(netatt)) + '\t' + str(mind) + '\t'\
            #       +str(maxd) + '\t' \
            #       + str(min(values)) + '\t' + str(max(values)) + '\t' + str(minv) + '\t'\
            #       +str(maxv) + '\t'\
            #       + str(cor[0]) + '\t' + str(cor[1])
            pt.correlation(sg.vs['nt'], sg.vs['foi'], 'Indegree', 'Feature',
                           'data/' + db_field_name + '.pdf')
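
drop_initials is project-local and not shown here. A hypothetical stand-in consistent with how it is used above (the real helper may apply a different rule):

import numpy as np

def drop_initials(raw_values):
    # hypothetical: keep only finite, non-negative observations
    vals = raw_values[np.isfinite(raw_values)]
    return vals[vals >= 0]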
Example #12
def two_community(file_path):
    # get two communities from the network
    g = gt.Graph.Read_GraphML(file_path)
    gt.summary(g)
    # g = g.as_undirected(combine_edges=dict(weight="sum"))
    g = gt.giant_component(g)
    gt.summary(g)
    # ml = g.community_multilevel(weights='weight', return_levels=True)
    # fast = g.community_fastgreedy(weights='weight')
    # fast_com = fast.as_clustering(n=2)
    # walk = g.community_walktrap(weights='weight')
    # walk_com = walk.as_clustering(n=2)
    infor = g.community_infomap(edge_weights='weight', vertex_weights=None, trials=2)
    # eigen = g.community_leading_eigenvector(clusters=2, weights='weight')
    # label_pro = g.community_label_propagation(weights='weight', initial=eigen.membership)
    # betweet = g.community_edge_betweenness(weights='weight')
    # bet_com = betweet.as_clustering(n=2)
    g.vs['community'] = infor.membership
    g.write_graphml('com-'+file_path)

    return infor.subgraphs()
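
Note that community_infomap cannot be forced to return exactly two clusters; the commented-out hierarchical methods can. A hedged sketch of a strict two-way split via the fastgreedy dendrogram (fastgreedy requires an undirected graph, hence the commented as_undirected line above):

fast = g.community_fastgreedy(weights='weight')
two_com = fast.as_clustering(n=2)  # cut the dendrogram at exactly two groups
g.vs['community'] = two_com.membership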
Example #13
def user_statis():
    groups = [
         ('ED', 'fed', 'com', {'liwc_anal.result.WC': {'$exists': True}, 'level': 1}),
         ('RD', 'random', 'scom', {'liwc_anal.result.WC': {'$exists': True}, 'level': 1}),
         ('YG', 'younger', 'scom', {'liwc_anal.result.WC': {'$exists': True}, 'level': 1})
    ]

    data = []
    for tag, dbname, comname, filter_values in groups:
        com = dbt.db_connect_col(dbname, comname)
        network1 = gt.Graph.Read_GraphML(tag.lower()+'-net.graphml')
        gt.summary(network1)
        network1_gc = gt.giant_component(network1)
        gt.summary(network1_gc)

        users_time = iot.get_values_one_field(dbname, comname, 'id_str', filter_values)
        for uid in users_time:
            exist = True
            try:
                v = network1.vs.find(name=str(uid))
            except ValueError:
                exist = False
            if exist:
                friends = set(network1.successors(str(uid)))
Example #14
def benetwork(dbname, btype, netname):
    '''Get users' behavior networks'''
    # ed_usersd = pickle.load(open('data/ed_users.pick', 'r'))
    # rec_usersd = pickle.load(open('data/rec_users.pick', 'r'))
    # inlist = list(set(ed_usersd).union(set(rec_usersd)))
    # g = gt.load_beh_network_subset(inlist, dbname, netname, btype)
    # g.vs['rec'] = 0
    # for uid in rec_usersd:
    #     exist = True
    #     try:
    #         v = g.vs.find(name=str(uid))
    #     except ValueError:
    #         exist = False
    #     if exist:
    #         v['rec'] = 1
    # pickle.dump(g, open('data/rec_' + btype + '.pick', 'w'))
    rg = pickle.load(open('data/rec_' + btype + '.pick', 'r'))
    # plot_graph(g)

    gc = gt.giant_component(rg, 'WEAK')
    comm = gt.fast_community(gc, True)
    fclus = comm.as_clustering(2)
    communit_topinflu(fclus, 'weight')
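
gt.fast_community appears to be a thin wrapper whose boolean toggles weighted clustering (False in Example #1, True here). A minimal sketch under that assumption:

def fast_community(g, weighted=False):
    # hypothetical wrapper: run fastgreedy with or without edge weights
    return g.community_fastgreedy(weights='weight' if weighted else None)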
Example #15
def friendship_community_vis(dbname, colname, filename, ctype):
    '''Out graph for vis.js visualization'''
    ed_users = iot.get_values_one_field(dbname, 'scom', 'id')
    # fed_users = iot.get_values_one_field(dbname, 'com', 'id')
    dbcom = dbt.db_connect_col(dbname, 'com')
    fg = gt.load_network(dbname, colname)
    # fg = gt.load_beh_network_subset(ed_users, dbname, colname, 'retweet')
    gt.net_stat(fg)
    # fg = fg.as_undirected(mode="mutual")
    # gt.net_stat(fg)

    fg = gt.giant_component(fg, 'WEAK')
    gt.net_stat(fg)

    if ctype == 'ml':
        com = fg.community_multilevel(weights='weight', return_levels=False)
    elif ctype == 'lp':
        fgu = fg.as_undirected(combine_edges=sum)
        init = fgu.community_leading_eigenvector(clusters=2, weights='weight')
        print init.membership
        com = fg.community_label_propagation(weights='weight',
                                             initial=init.membership)
        print com.membership
    else:
        com = fg.community_infomap(edge_weights='weight', trials=2)
    fg.vs['group'] = com.membership

    # edges = fg.es.select(weight_gt=3)
    # print 'Filtered edges: %d' %len(edges)
    # fg = fg.subgraph_edges(edges)
    # gt.net_stat(fg)

    # fg.vs['degree'] = fg.degree(mode="all")
    # nodes = fg.vs.select(degree_gt=10)
    # fg = fg.subgraph(nodes)
    # gt.net_stat(fg)

    Coo = {}
    for x in set(fg.vs['group']):
        Coo[x] = (rand.randint(-1000, 1000), rand.randint(-1000, 1000))

    with open('data/' + ctype + '_' + filename + '_net_follow.js', 'w') as fw:
        fw.write('var nodes = [\n')
        for idv, v in enumerate(fg.vs):
            user = dbcom.find_one({'id': int(fg.vs[idv]['name'])})
            desc = ' '.join(user['description'].replace('\'', '').replace(
                '\"', '').split())
            fw.write('{id: ' + str(idv + 1) + ', ' + 'label: \'' +
                     user['screen_name'] + '\', ' + 'value: ' +
                     str(fg.degree(idv, mode="in")) + ', ' + 'title: \'UID: ' +
                     str(fg.vs[idv]['name']) + '<br> Screen Name: ' +
                     user['screen_name'] + '<br> Followers: ' +
                     str(user['followers_count']) + '<br> Followees: ' +
                     str(user['friends_count']) + '<br> Tweets: ' +
                     str(user['statuses_count']) + '<br> Description: ' +
                     str(desc.encode('utf-8')) + '<br> Group: ' +
                     str(fg.vs[idv]['group']) + '\', ' + 'x: ' +
                     str(Coo[fg.vs[idv]['group']][0] + rand.randint(0, 300)) +
                     ', ' + 'y: ' +
                     str(Coo[fg.vs[idv]['group']][1] + rand.randint(0, 300)) +
                     ', ' + 'group: ' + str(fg.vs[idv]['group']) + ', ')
            # if int(fg.vs[idv]['name']) in ed_users:
            #     fw.write('shape: ' + '\'triangle\'')
            # else:
            #     fw.write('shape: ' + '\'circle\'')
            fw.write('}, \n')
        fw.write('];\n var edges = [\n')
        for ide, e in enumerate(fg.es):
            fw.write('{from: ' + str(e.source + 1) + ', ' + 'to: ' +
                     str(e.target + 1) + ', ' + 'arrows: ' + '\'to\'' + ', ' +
                     'title: \' Users: ' + fg.vs[e.source]['name'] + ' ' +
                     fg.vs[e.target]['name'] + '<br> Weight: ' +
                     str(fg.es[ide]['weight']) + '\', ' + 'value: ' +
                     str(fg.es[ide]['weight']) +
                     '},\n')  #str(fg.es[ide]['weight'])
        fw.write('];\n')
Example #16
def read_user_time_iv(filename):
    # fields = iot.read_fields()

    fields = [
            #   #   'liwc_anal.result.posemo',
            #   # 'liwc_anal.result.negemo',
            #   # 'liwc_anal.result.ingest',
            #   # 'liwc_anal.result.bio',
            #   # 'liwc_anal.result.body',
            #   # 'liwc_anal.result.health',
            #   # 'liwc_anal.result.death'
            #   # 'liwc_anal.result.anx',
            #   # 'liwc_anal.result.anger',
            #   # 'liwc_anal.result.sad',
            #   # 'liwc_anal.result.i',
            #   # 'liwc_anal.result.we',
            #   # 'liwc_anal.result.negate',
            #   # 'liwc_anal.result.swear',
            #   # 'liwc_anal.result.social',
            #   # 'liwc_anal.result.family',
            #   # 'liwc_anal.result.friend',
            #   # 'liwc_anal.result.affect',
            # 'senti.result.whole.posm',
            # # 'senti.result.whole.posstd',
            # 'senti.result.whole.negm',
            # # 'senti.result.whole.negstd',
            # 'senti.result.whole.scalem',
            # # 'senti.result.whole.scalestd',
            # 'senti.result.whole.N',
            # 'senti.result.prior.scalem',
            # 'senti.result.post.scalem'
            'senti'
              ]
    prof_names = ['friends_count', 'statuses_count', 'followers_count',
        'friends_day', 'statuses_day', 'followers_day', 'days']

    trimed_fields = ['-'.join(field.split('.')[-2:]) for field in fields]
    print trimed_fields
    groups = [
         ('ED', 'fed', 'com', 'fed', 'com_survival', {
                                                        'liwc_anal.result.WC': {'$exists': True},
                                                        'level': 1,
                                                        'senti.result.whole.N': {'$gt': 10}}),
         ('RD', 'random', 'scom', 'random', 'com_survival', {
                                                        'liwc_anal.result.WC': {'$exists': True},
                                                        'senti.result.whole.N': {'$gt': 10}}),
         ('YG', 'younger', 'scom', 'younger', 'com_survival', {
                                                            'liwc_anal.result.WC': {'$exists': True},
                                                            'senti.result.whole.N': {'$gt': 10}})
    ]

    data = []
    for tag, dbname, comname, dbname2, comname2, filter_values in groups:
        com = dbt.db_connect_col(dbname, comname)
        com2 = dbt.db_connect_col(dbname2, comname2)

        sentims = pickle.load(open(tag.lower() + '.sentis', 'r'))
        print len(sentims)

        network1 = gt.Graph.Read_GraphML(tag.lower()+'-net-all-active.graphml')
        gt.summary(network1)
        network1_gc = gt.giant_component(network1)
        gt.summary(network1_gc)
        '''Centralities Calculation'''
        eigen = network1_gc.eigenvector_centrality()
        pageranks = network1_gc.pagerank()
        indegree = network1_gc.authority_score()
        outdegree = network1_gc.hub_score()

        nodes = [int(v['name']) for v in network1_gc.vs]
        eigen_map = dict(zip(nodes, eigen))
        pagerank_map = dict(zip(nodes, pageranks))
        indegree_map = dict(zip(nodes, indegree))
        outdegree_map = dict(zip(nodes, outdegree))

        frialive, friduration = {}, {}
        for v in network1.vs:
            friends = set(network1.successors(str(v['name'])))
            if len(friends) > 0:
                falive, fduration = [], []
                for vi in friends:
                    falive.append(network1.vs[vi]['alive'])
                    fduration.append(network1.vs[vi]['duration'])
                frialive[int(v['name'])] = np.mean(falive)
                friduration[int(v['name'])] = np.mean(fduration)

        # print 'load liwc 2 batches: ' + tag.lower()+'-liwc2stage.csv'
        # liwc_df = pd.read_pickle(tag.lower()+'-liwc2stage.csv'+'.pick')

        network1 = gt.Graph.Read_GraphML(tag.lower()+'-net.graphml')
        for user in com.find(filter_values, no_cursor_timeout=True):
            first_scraped_at = user['_id'].generation_time.replace(tzinfo=None)
            if 'status' in user:
                uid = user['id']
                u2 = com2.find_one({'id': uid})

                first_last_post = datetime.strptime(user['status']['created_at'], '%a %b %d %H:%M:%S +0000 %Y')
                last_post = first_last_post
                first_statuses_count = user['statuses_count']
                second_statuses_count = first_statuses_count
                # initialize per user so values can never leak from a previous iteration
                second_last_post, second_scraped_at = None, None
                drop = 1
                if u2:
                    second_scraped_at = u2['_id'].generation_time.replace(tzinfo=None)
                    second_statuses_count = u2['statuses_count']
                    if 'status' in u2:
                        second_last_post = datetime.strptime(u2['status']['created_at'], '%a %b %d %H:%M:%S +0000 %Y')
                        if first_scraped_at < second_last_post < second_scraped_at:
                            drop = 0
                            last_post = second_last_post


                created_at = datetime.strptime(user['created_at'], '%a %b %d %H:%M:%S +0000 %Y')

                longest_tweet_intervalb = user['longest_tweet_interval']
                u_timeline_count = user['timeline_count']

                # values = iot.get_fields_one_doc(user, fields)
                values = [sentims[uid]]
                level = user['level']


                u_centrality = eigen_map.get(user['id'], 0)
                u_pagerank = pagerank_map.get(user['id'], 0)
                u_indegree = indegree_map.get(user['id'], 0)
                u_outdegree = outdegree_map.get(user['id'], 0)

                # values.extend(liwc_changes)
                values.extend(active_days(user))

                '''Get friends' profiles'''
                exist = True
                try:
                    v = network1.vs.find(name=str(uid))
                except ValueError:
                    exist = False
                if exist:
                    # friends = set(network1.neighbors(str(uid))) # id or name
                    friends = set(network1.successors(str(uid)))
                    if len(friends) > 0:
                        friend_ids = [int(network1.vs[vi]['name']) for vi in friends] # return id
                        print uid in friend_ids
                        print len(friend_ids)
                        fatts = []
                        alive = 0
                        ffatts = []

                        for fid in friend_ids:
                            if fid in sentims:
                                fatt  = [sentims[fid]]
                                fatt.extend([eigen_map.get(fid, 0), pagerank_map.get(fid, 0),
                                             indegree_map.get(fid, 0), outdegree_map.get(fid, 0)])
                                fatts.append(fatt)

                                friendfriends = set(network1.successors(str(fid)))
                                if len(friendfriends) > 0:
                                    friendfriends_ids = [int(network1.vs[vi]['name']) for vi in friendfriends] # return id
                                    for ffid in friendfriends_ids:
                                        if ffid in sentims:
                                            ffatt = [sentims[ffid]]
                                            ffatts.append(ffatt)


                        if (len(fatts) > 0) and (len(ffatts)>0):
                            fatts = np.array(fatts)
                            fmatts = np.mean(fatts, axis=0)
                            ffatts = np.array(ffatts)
                            ffmatts = np.mean(ffatts, axis=0)
                            values.extend(fmatts)
                            # paliv = float(alive)/len(fatts)
                            paliv = frialive.get(uid)
                            fdays = friduration.get(uid)
                            data.append([user['id_str'], level, drop, created_at, first_last_post, second_last_post, last_post,
                                         first_scraped_at, second_scraped_at, first_statuses_count, second_statuses_count,
                             longest_tweet_intervalb, tag, u_centrality, u_pagerank,
                                         u_indegree, u_outdegree, u_timeline_count] +
                                        values + [len(fatts), paliv, fdays] + ffmatts.tolist())

    df = pd.DataFrame(data, columns=['uid', 'level', 'dropout', 'created_at', 'first_last_post', 'second_last_post', 'last_post', 'first_scraped_at', 'second_scraped_at',
                                     'first_statuses_count', 'second_statuses_count','longest_time_interval',
                                     'group', 'u_eigenvector', 'u_pagerank', 'u_authority', 'u_hub',
                                     'u_timeline_count'] +
                                    ['u_'+field for field in trimed_fields]  +
                                    # ['u_prior_'+field for field in trimed_fields] +
                                    # ['u_post_'+field for field in trimed_fields] +
                                    # ['u_change_'+field for field in trimed_fields] +
                                    ['u_'+field for field in prof_names] +
                                    ['f_'+tf for tf in trimed_fields]  +
                                    ['f_eigenvector', 'f_pagerank', 'f_authority', 'f_hub', 'f_num', 'f_palive', 'f_days'] + ['ff_'+field for field in trimed_fields] )
    df.to_csv(filename)
Example #17
def emotion_dropout_IV_following(filepath):
    '''
    Build the dropout IV table using only following (friend) statistics
    :param filepath: path of the output CSV
    :return:
    '''

    fields = [
            'senti.result.whole.posm',
            'senti.result.whole.posstd',
            'senti.result.whole.negm',
            'senti.result.whole.negstd',
            'senti.result.whole.scalem',
            'senti.result.whole.scalestd',
            'senti.result.whole.N',
            'senti.result.prior.scalem',
            'senti.result.post.scalem',
              # 'liwc_anal.result.posemo',
              # 'liwc_anal.result.negemo',
              # 'liwc_anal.result.ingest',
              # 'liwc_anal.result.bio',
              # 'liwc_anal.result.body',
              # 'liwc_anal.result.health',
              # 'liwc_anal.result.death'
              # 'liwc_anal.result.anx',
              # 'liwc_anal.result.anger',
              # 'liwc_anal.result.sad'
              ]
    trimed_fields = ['-'.join(field.split('.')[-2:]) for field in fields]
    prof_names = ['friends_count', 'statuses_count', 'followers_count',
        'friends_day', 'statuses_day', 'followers_day', 'days', 'eigenvector', 'pagerank', 'authority', 'hub']
    attr_names = ['uid', 'group', 'attr', 'level']
    attr_names.extend(['u_'+field for field in trimed_fields])
    # attr_names.extend(['u_prior_'+field for field in trimed_fields])
    # attr_names.extend(['u_post_'+field for field in trimed_fields])
    # attr_names.extend(['u_change_'+field for field in trimed_fields])
    attr_names.extend(['u_'+field for field in prof_names])
    attr_names.extend([
        # 'u_recovery_tweets',
                       'u_timeline_count'])
    attr_names.extend(['f_'+field.split('.')[-1] for field in fields])
    attr_names.extend(['f_'+field for field in prof_names])
    attr_names.extend(['f_timeline_count', 'f_num', 'f_palive'])
    print attr_names

    data = []
    name_map = {
        'ed': ('fed', 'fed_sur', 'com', 'com', {'level': 1, 'liwc_anal.result.WC': {'$exists': True}}),
        'yg': ('younger', 'younger_sur', 'scom', 'com', {'liwc_anal.result.WC': {'$exists': True}}),
        'rd': ('random', 'random_sur', 'scom', 'com', {'liwc_anal.result.WC': {'$exists': True}})
    }
    for groupname in [
        'yg', 'rd',
        'ed']:
        dbname1, dbname2, comname1, comname2, filter_que = name_map[groupname]
        print 'Centrality calculation .........'
        # users = iot.get_values_one_field('fed', 'com', 'id', {'level': {'$lt': 3}})

        # print 'Number of users', len(users)
        # network1 = gt.load_network_subset('fed', 'net', {'user': {'$in': users}, 'follower': {'$in': users}})
        # network1 = gt.load_network('fed', 'net')
        # pickle.dump(network1, open('net.pick', 'w'))

        print 'load network: ' + groupname+'-net.graphml'
        network1= gt.Graph.Read_GraphML(groupname+'-net.graphml')
        # network1 = pickle.load(open('net.pick', 'r'))
        gt.summary(network1)
        network1_gc = gt.giant_component(network1)
        gt.summary(network1_gc)

        '''Centralities Calculation'''
        eigen = network1_gc.eigenvector_centrality()
        pageranks = network1_gc.pagerank()
        indegree = network1_gc.authority_score()
        outdegree = network1_gc.hub_score()
        # closeness = network.closeness()
        # betweenness = network.betweenness()
        # print len(eigen), len(closeness), len(betweenness)

        nodes = [int(v['name']) for v in network1_gc.vs]
        # print len(nodes), len(eigen)
        # print type(nodes), type(eigen)

        eigen_map = dict(zip(nodes, eigen))
        pagerank_map = dict(zip(nodes, pageranks))
        indegree_map = dict(zip(nodes, indegree))
        outdegree_map = dict(zip(nodes, outdegree))
        # print eigen_map.get(nodes[1]), type(eigen_map.get(nodes[1]))

        # closeness_map = dict(zip(nodes, closeness))
        # betweenness_map = dict(zip(nodes, betweenness))
        print 'Centrality calculation done .........'

        # print 'load liwc 2 batches: ' + groupname+'-liwc2stage.csv'
        # df = pd.read_pickle(groupname+'-liwc2stage.csv'+'.pick')

        user1 = iot.get_values_one_field(dbname1, comname1, 'id', filter_que)

        print 'load db1: ', dbname1, comname1
        com1 = dbt.db_connect_col(dbname1, comname1)
        print 'load db2: ', dbname2, comname2
        com2 = dbt.db_connect_col(dbname2, comname2)


        for uid in user1:
            # set uid
            row = [uid, groupname]
            # set attrition states
            u1 = com1.find_one({'id': uid})
            u2 = com2.find_one({'id': uid})
            u1_time = u1['_id'].generation_time.replace(tzinfo=None)

            # if u2 is None or u2['timeline_count'] == 0:
            drop = 1
            if u2:
                u2_time = u2['_id'].generation_time.replace(tzinfo=None)
                if 'status' in u2:
                    second_last_post = datetime.strptime(u2['status']['created_at'], '%a %b %d %H:%M:%S +0000 %Y')
                    if u1_time < second_last_post < u2_time:
                        drop = 0
            row.append(drop)
            row.append(u1['level'])
            # set users liwc feature
            uatt = iot.get_fields_one_doc(u1, fields)
            row.extend(uatt)
            # # set users liwc changes
            # uvs = df[df.user_id == str(uid)].loc[:, trimed_fields]
            # # print uvs
            # if len(uvs) == 2:
            #     changes, priors, posts = [], [], []
            #     for name in trimed_fields:
            #         old = uvs.iloc[0][name]
            #         new = uvs.iloc[1][name]
            #         priors.append(old)
            #         posts.append(new)
            #         changes.append(new - old)
            #     row.extend(priors)
            #     row.extend(posts)
            #     row.extend(changes)
            # else:
            #     row.extend([None]*(len(trimed_fields)*3))

            # set profile, active days and eigenvector centrality
            print u1['id']
            row.extend(active_days(u1))
            row.extend([eigen_map.get(u1['id'], 0)])
            row.extend([pagerank_map.get(u1['id'], 0)])
            row.extend([indegree_map.get(u1['id'], 0)])
            row.extend([outdegree_map.get(u1['id'], 0)])
            row.extend([
                # u1['recovery_tweets'],
                u1['timeline_count']])

            exist = True
            try:
                v = network1.vs.find(name=str(uid))
            except ValueError:
                exist = False
            if exist:
                # friends = set(network1.neighbors(str(uid))) # id or name
                friends = set(network1.successors(str(uid)))
                if len(friends) > 0:
                    friend_ids = [int(network1.vs[vi]['name']) for vi in friends] # return id
                    print uid in friend_ids
                    print len(friend_ids)
                    fatts = []
                    alive = 0
                    for fid in friend_ids:
                        fu = com1.find_one({'id': fid, 'liwc_anal.result.WC':{'$exists':True}})
                        fu2 = com2.find_one({'id': fid})

                        if fu:
                            f1_time = fu['_id'].generation_time.replace(tzinfo=None)
                            # if eigen_map.get(fu['id'], 0) > 0.0001:
                            if True:
                                fatt = iot.get_fields_one_doc(fu, fields)
                                factive = active_days(fu)
                                if fu2:
                                    f2_time = fu2['_id'].generation_time.replace(tzinfo=None)
                                    if 'status' in fu2:
                                        fsecond_last_post = datetime.strptime(fu2['status']['created_at'], '%a %b %d %H:%M:%S +0000 %Y')
                                        if f1_time < fsecond_last_post < f2_time:
                                            alive += 1
                                            factive = active_days(fu2)

                                fatt.extend(factive)
                                fatt.extend([eigen_map.get(fu['id'], 0)])
                                fatt.extend([pagerank_map.get(fu['id'], 0)])
                                fatt.extend([indegree_map.get(fu['id'], 0)])
                                fatt.extend([outdegree_map.get(fu['id'], 0)])
                                fatt.extend([fu['timeline_count']])
                                fatts.append(fatt)

                    if len(fatts) > 0:
                        fatts = np.array(fatts)
                        fmatts = np.mean(fatts, axis=0)
                        row.extend(fmatts)
                        row.append(len(fatts))
                        paliv = float(alive)/len(fatts)
                        print 'Alive %d %d %.3f' % (alive, len(fatts), paliv)
                        row.append(paliv)
            # print row
            data.append(row)
    df = pd.DataFrame(data, columns=attr_names)
    df.to_csv(filepath, index=False)
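
A hedged downstream check (file path is illustrative): load the CSV and compare dropout rates across groups; the 'attr' column holds the dropout flag appended above.

import pandas as pd

df = pd.read_csv('emotion_dropout_iv.csv')  # hypothetical filepath
print df.groupby('group')['attr'].mean()  # share of dropouts per group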
Example #18
def network_users(file_path):
    # return the user list of the network's giant component
    g = gt.Graph.Read_GraphML(file_path)
    g = gt.giant_component(g)
    gt.summary(g)
    return g.vs['name']
Example #19
def community_net(rec_g, ped_g):
    # build a community-level network linking two networks' communities
    # by their overlap (based on Jaccard-style similarity)
    gc_rec_g = gt.giant_component(rec_g)
    com_rec_g = gc_rec_g.community_multilevel(weights='weight',
                                              return_levels=False)
    comclus_rec_g = com_rec_g.subgraphs()
    print 'Community stats: #communities, modularity', len(
        comclus_rec_g), com_rec_g.modularity

    gc_ped_g = gt.giant_component(ped_g)
    com_ped_g = gc_ped_g.community_multilevel(weights='weight',
                                              return_levels=False)
    comclus_ped_g = com_ped_g.subgraphs()
    print 'Community stats: #communities, modularity', len(
        comclus_ped_g), com_ped_g.modularity
    name_map, edges, node_weight = {}, {}, {}

    for i in xrange(len(comclus_rec_g)):
        comclu_rec_g = comclus_rec_g[i]
        rec_nodes = set([v['name'] for v in comclu_rec_g.vs])
        max_fre_rec = max(comclu_rec_g.vs['weight'])
        max_fre_rec_tag = comclu_rec_g.vs.find(weight_eq=max_fre_rec)['name']
        n1 = 'rec_' + str(i) + '_' + max_fre_rec_tag
        for j in xrange(len(comclus_ped_g)):
            comclu_ped_g = comclus_ped_g[j]
            max_fre = max(comclu_ped_g.vs['weight'])
            ed_nodes = set([v['name'] for v in comclu_ped_g.vs])
            max_fre_tag = comclu_ped_g.vs.find(weight_eq=max_fre)['name']
            n2 = 'ped_' + str(j) + '_' + max_fre_tag

            n1id = name_map.get(n1, len(name_map))
            name_map[n1] = n1id
            node_weight[n1id] = sum(comclu_rec_g.vs['weight'])

            n2id = name_map.get(n2, len(name_map))
            name_map[n2] = n2id
            node_weight[n2id] = sum(comclu_ped_g.vs['weight'])

            similarity = float(len(rec_nodes.intersection(ed_nodes)))
            # /len(rec_nodes.union(ed_nodes))
            if similarity > 10:
                edges[(n1id, n2id)] = similarity
    g = gt.Graph(len(name_map), directed=False)
    g.vs["name"] = list(sorted(name_map, key=name_map.get))
    g.vs['weight'] = [node_weight[i] for i in xrange(len(node_weight))]
    g.add_edges(edges.keys())
    g.es["weight"] = edges.values()
    for v in g.vs:
        tokens = v['name'].split('_')
        v['set'] = tokens[0]
        v['tag'] = tokens[2]
    g.write_graphml('hashtag_inter_net.graphml')

    gc = gt.giant_component(g)
    targets_communities = {}
    for v in gc.vs:
        tokens = v['name'].split('_')
        com_list = targets_communities.get(tokens[0], [])
        com_list.append(int(tokens[1]))
        targets_communities[tokens[0]] = com_list
    return targets_communities
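
The commented-out denominator suggests true Jaccard similarity was intended. A hedged variant of the loop body that normalizes by the union (the raw-count threshold of 10 would then become a fraction):

inter = len(rec_nodes.intersection(ed_nodes))
union = len(rec_nodes.union(ed_nodes))
similarity = float(inter) / union if union else 0.0
if similarity > 0.1:  # fraction threshold; replaces the raw count of 10
    edges[(n1id, n2id)] = similarity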
Example #20
def read_user_time_iv(filename):
    # fields = iot.read_fields()
    fields = [
        'liwc_anal.result.posemo', 'liwc_anal.result.negemo',
        'liwc_anal.result.ingest', 'liwc_anal.result.bio',
        'liwc_anal.result.body', 'liwc_anal.result.health',
        'liwc_anal.result.death',
        'liwc_anal.result.anx', 'liwc_anal.result.anger',
        'liwc_anal.result.sad'
    ]
    prof_names = [
        'friends_count', 'statuses_count', 'followers_count', 'friends_day',
        'statuses_day', 'followers_day', 'days'
    ]

    trimed_fields = [field.split('.')[-1] for field in fields]
    groups = [('ED', 'fed', 'com', 'fed_sur', 'com',
               '2017-06-21 14:57:39+00:00', {
                   'liwc_anal.result.WC': {
                       '$exists': True
                   },
                   'level': 1
               }),
              ('RD', 'random', 'scom', 'random_sur', 'com',
               '2017-06-21 14:57:39+00:00', {
                   'liwc_anal.result.WC': {
                       '$exists': True
                   }
               }),
              ('YG', 'younger', 'scom', 'younger_sur', 'com',
               '2017-06-21 14:57:39+00:00', {
                   'liwc_anal.result.WC': {
                       '$exists': True
                   }
               })]

    data = []
    for tag, dbname, comname, dbname2, comname2, second_time, filter_values in groups:
        com = dbt.db_connect_col(dbname, comname)
        com2 = dbt.db_connect_col(dbname2, comname2)
        network1 = gt.Graph.Read_GraphML(tag.lower() + '-net.graphml')
        gt.summary(network1)
        network1_gc = gt.giant_component(network1)
        gt.summary(network1_gc)
        '''Centralities Calculation'''
        eigen = network1_gc.eigenvector_centrality()
        pageranks = network1_gc.pagerank()
        indegree = network1_gc.authority_score()
        outdegree = network1_gc.hub_score()

        nodes = [int(v['name']) for v in network1_gc.vs]
        eigen_map = dict(zip(nodes, eigen))
        pagerank_map = dict(zip(nodes, pageranks))
        indegree_map = dict(zip(nodes, indegree))
        outdegree_map = dict(zip(nodes, outdegree))

        print 'load liwc 2 batches: ' + tag.lower() + '-liwc2stage.csv'
        liwc_df = pd.read_pickle(tag.lower() + '-liwc2stage.csv' + '.pick')

        for user in com.find(filter_values, no_cursor_timeout=True):
            first_scraped_at = user['_id'].generation_time.replace(tzinfo=None)
            if 'status' in user:
                uid = user['id']
                u2 = com2.find_one({'id': uid})

                first_last_post = datetime.strptime(
                    user['status']['created_at'], '%a %b %d %H:%M:%S +0000 %Y')
                last_post = first_last_post
                # initialize per user so values can never leak from a previous iteration
                second_last_post, second_scraped_at = None, None
                drop = 1
                if u2:
                    second_scraped_at = u2['_id'].generation_time.replace(
                        tzinfo=None)
                    if 'status' in u2:
                        second_last_post = datetime.strptime(
                            u2['status']['created_at'],
                            '%a %b %d %H:%M:%S +0000 %Y')
                        if first_scraped_at < second_last_post < second_scraped_at:
                            drop = 0
                            last_post = second_last_post

                created_at = datetime.strptime(user['created_at'],
                                               '%a %b %d %H:%M:%S +0000 %Y')
                life_time = diff_day(last_post, created_at)
                average_time = float(life_time) / max(1,
                                                      user['statuses_count'])
                longest_tweet_intervalb = user['longest_tweet_interval']
                u_timeline_count = user['timeline_count']

                values = iot.get_fields_one_doc(user, fields)
                level = user['level']

                # set users liwc changes
                uvs = liwc_df[liwc_df.user_id == str(uid)].loc[:,
                                                               trimed_fields]
                # print uvs
                if len(uvs) == 2:
                    changes, priors, posts = [], [], []
                    for name in trimed_fields:
                        old = uvs.iloc[0][name]
                        new = uvs.iloc[1][name]
                        priors.append(old)
                        posts.append(new)
                        changes.append(new - old)
                    liwc_changes = priors + posts + changes
                else:
                    liwc_changes = [None] * (len(trimed_fields) * 3)
                u_centrality = eigen_map.get(user['id'], 0)
                u_pagerank = pagerank_map.get(user['id'], 0)
                u_indegree = indegree_map.get(user['id'], 0)
                u_outdegree = outdegree_map.get(user['id'], 0)

                values.extend(liwc_changes)
                values.extend(active_days(user))
                '''Get friends' profiles'''
                exist = True
                try:
                    v = network1.vs.find(name=str(uid))
                except ValueError:
                    exist = False
                if exist:
                    # friends = set(network1.neighbors(str(uid))) # id or name
                    friends = set(network1.successors(str(uid)))
                    if len(friends) > 0:
                        friend_ids = [
                            int(network1.vs[vi]['name']) for vi in friends
                        ]  # return id
                        print uid in friend_ids
                        print len(friend_ids)
                        fatts = []
                        alive = 0
                        for fid in friend_ids:
                            fu = com.find_one({
                                'id': fid,
                                'liwc_anal.result.WC': {
                                    '$exists': True
                                }
                            })
                            fu2 = com2.find_one({'id': fid})

                            if fu:
                                f1_time = fu['_id'].generation_time.replace(
                                    tzinfo=None)
                                # if eigen_map.get(fu['id'], 0) > 0.0001:
                                if True:
                                    fatt = iot.get_fields_one_doc(fu, fields)
                                    factive = active_days(fu)
                                    if fu2:
                                        f2_time = fu2[
                                            '_id'].generation_time.replace(
                                                tzinfo=None)
                                        if 'status' in fu2:
                                            fsecond_last_post = datetime.strptime(
                                                fu2['status']['created_at'],
                                                '%a %b %d %H:%M:%S +0000 %Y')
                                            if f1_time < fsecond_last_post < f2_time:
                                                alive += 1
                                                factive = active_days(fu2)

                                    fatt.extend(factive)
                                    fatt.extend([
                                        eigen_map.get(fu['id'], 0),
                                        pagerank_map.get(fu['id'], 0),
                                        indegree_map.get(fu['id'], 0),
                                        outdegree_map.get(fu['id'], 0)
                                    ])
                                    fatts.append(fatt)

                        # thredhold = user['friends_count']*0.5

                        if len(fatts) > 0:
                            fatts = np.array(fatts)
                            fmatts = np.mean(fatts, axis=0)
                            values.extend(fmatts)
                            paliv = float(alive) / len(fatts)
                            data.append([
                                user['id_str'], level, drop, created_at,
                                first_last_post, second_last_post, last_post,
                                first_scraped_at, second_scraped_at,
                                average_time, longest_tweet_intervalb, tag,
                                u_centrality, u_pagerank, u_indegree,
                                u_outdegree, u_timeline_count
                            ] + values + [len(fatts), paliv])

    df = pd.DataFrame(
        data,
        columns=[
            'uid', 'level', 'dropout', 'created_at', 'first_last_post',
            'second_last_post', 'last_post', 'first_scraped_at',
            'second_scraped_at', 'average_time', 'longest_time_interval',
            'group', 'u_eigenvector', 'u_pagerank', 'u_authority', 'u_hub',
            'u_timeline_count'
        ] + ['u_' + field for field in trimed_fields] +
        ['u_prior_' + field for field in trimed_fields] +
        ['u_post_' + field for field in trimed_fields] +
        ['u_change_' + field
         for field in trimed_fields] + ['u_' + field for field in prof_names] +
        ['f_' + tf
         for tf in trimed_fields] + ['f_' + field for field in prof_names] + [
             'f_eigenvector', 'f_pagerank', 'f_authority', 'f_hub', 'f_num',
             'f_palive'
         ])
    df.to_csv(filename)