Ejemplo n.º 1
0
def friend_user_change(dbname1, dbname2, comname1, comname2):
    """Correlate each user's LIWC feature change with their friends' change.

    dbname1/comname1 is the earlier snapshot, dbname2/comname2 the later
    one.  For every level-1 user present in both network snapshots, the
    per-field delta of the user's LIWC score is plotted against the delta
    of the mean score of the user's followees.
    """
    # Only level-1 (seed) users whose LIWC analysis produced a word count.
    filter_que = {'level': 1, 'liwc_anal.result.WC':{'$exists':True}}
    user2 = iot.get_values_one_field(dbname2, comname2, 'id', filter_que)
    fields = ['liwc_anal.result.posemo', 'liwc_anal.result.negemo', 'liwc_anal.result.body', 'liwc_anal.result.ingest']
    network1 = gt.load_network(dbname1, 'net')
    network2 = gt.load_network(dbname2, 'net')
    for field in fields:
        print '-----------------%s----------------' %field
        user_changes, friends_changes = [], []
        for uid in user2:
            user_feature_old = iot.get_values_one_field(dbname1, comname1, field, {'id': uid})
            user_feature_new = iot.get_values_one_field(dbname2, comname2, field, {'id': uid})
            if len(user_feature_old) != len(user_feature_new) and len(user_feature_new) != 1:
                print 'User feature value length %d, %d' %(len(user_feature_old), len(user_feature_new))
            user_change = np.mean(user_feature_new) - np.mean(user_feature_old)
            exist = True
            try:
                # Existence check only: vs.find raises ValueError when the
                # vertex name is absent from either snapshot.
                v = network1.vs.find(name=str(uid))
                v = network2.vs.find(name=str(uid))
            except ValueError:
                exist = False
            if exist:
                # successors() = followees (outgoing edges) in each snapshot.
                friends_old = network1.successors(str(uid))
                friends_new = network2.successors(str(uid))
                old_friend_ids = [int(network1.vs[v]['name']) for v in friends_old]
                new_friend_ids = [int(network2.vs[v]['name']) for v in friends_new]
                if len(old_friend_ids) != len(new_friend_ids):
                    print 'Friend feature value length %d, %d' % (len(old_friend_ids), len(new_friend_ids))
                friends_feature_old = iot.get_values_one_field(dbname1, comname1, field, {'id': {'$in': old_friend_ids}})
                friends_feature_new = iot.get_values_one_field(dbname2, comname2, field, {'id': {'$in': new_friend_ids}})
                friend_change = np.mean(friends_feature_new) - np.mean(friends_feature_old)
                friends_changes.append(friend_change)
                user_changes.append(user_change)
        # Scatter/correlation plot: friend delta (x) vs. user delta (y).
        pltt.correlation(friends_changes, user_changes, r'$\Delta$(F_'+field+')', r'$\Delta$(U_'+field+')', field+'-friend-user.pdf')
Ejemplo n.º 2
0
def friend_user_change(dbname1, dbname2, comname1, comname2):
    filter_que = {'level': 1, 'liwc_anal.result.WC':{'$exists':True}}
    user2 = iot.get_values_one_field(dbname2, comname2, 'id', filter_que)
    fields = ['liwc_anal.result.posemo', 'liwc_anal.result.negemo', 'liwc_anal.result.body', 'liwc_anal.result.ingest']
    network1 = gt.load_network(dbname1, 'net')
    network2 = gt.load_network(dbname2, 'net')
    for field in fields:
        print '-----------------%s----------------' %field
        user_changes, friends_changes = [], []
        for uid in user2:
            user_feature_old = iot.get_values_one_field(dbname1, comname1, field, {'id': uid})
            user_feature_new = iot.get_values_one_field(dbname2, comname2, field, {'id': uid})
            if len(user_feature_old) != len(user_feature_new) and len(user_feature_new) != 1:
                print 'User feature value length %d, %d' %(len(user_feature_old), len(user_feature_new))
            user_change = np.mean(user_feature_new) - np.mean(user_feature_old)
            exist = True
            try:
                v = network1.vs.find(name=str(uid))
                v = network2.vs.find(name=str(uid))
            except ValueError:
                exist = False
            if exist:
                friends_old = network1.successors(str(uid))
                friends_new = network2.successors(str(uid))
                old_friend_ids = [int(network1.vs[v]['name']) for v in friends_old]
                new_friend_ids = [int(network2.vs[v]['name']) for v in friends_new]
                if len(old_friend_ids) != len(new_friend_ids):
                    print 'Friend feature value length %d, %d' % (len(old_friend_ids), len(new_friend_ids))
                friends_feature_old = iot.get_values_one_field(dbname1, comname1, field, {'id': {'$in': old_friend_ids}})
                friends_feature_new = iot.get_values_one_field(dbname2, comname2, field, {'id': {'$in': new_friend_ids}})
                friend_change = np.mean(friends_feature_new) - np.mean(friends_feature_old)
                friends_changes.append(friend_change)
                user_changes.append(user_change)
        pltt.correlation(friends_changes, user_changes, r'$\Delta$(F_'+field+')', r'$\Delta$(U_'+field+')', field+'-friend-user.pdf')
Ejemplo n.º 3
0
def plot_distribution(dbname='fed', comname='scom'):
    """Plot KDE distributions of LIWC scores of retweeted vs. liked tweets.

    For every LIWC field, reads the retweet_liwc / like_liwc variants of
    that field and saves a density comparison plot to data/<field>.pdf.
    """
    fields = iot.read_fields()
    for field in fields:
        tokens = field.split('.')
        retweet_key = field.replace('liwc_anal', 'retweet_liwc')
        like_key = field.replace('liwc_anal', 'like_liwc')
        retweets = iot.get_values_one_field(dbname, comname, retweet_key)
        likes = iot.get_values_one_field(dbname, comname, like_key)
        pt.plot_config()
        # BUGFIX: use raw strings so \mu and \pm reach matplotlib's mathtext
        # intact; the non-raw originals only worked because '\m' and '\p'
        # happen not to be recognized escape sequences.
        sns.distplot(retweets,
                     hist=False,
                     kde_kws={
                         "color": "r",
                         "lw": 2,
                         "marker": 'o'
                     },
                     label=r'RT ($\mu=%0.2f \pm %0.2f$)' %
                     (np.mean(retweets), np.std(retweets)))
        sns.distplot(likes,
                     hist=False,
                     kde_kws={
                         "color": "g",
                         "lw": 2,
                         "marker": 's'
                     },
                     label=r'Like ($\mu=%0.2f \pm %0.2f$)' %
                     (np.mean(likes), np.std(likes)))
        plt.legend(loc="best")
        plt.xlabel(tokens[-1])
        plt.ylabel('P')
        plt.savefig('data/' + tokens[-1] + '.pdf', bbox_inches='tight')
        plt.clf()
Ejemplo n.º 4
0
def profile_feature_stat():
    """Print summary statistics and KS tests for profile count features.

    For each of the three count fields, report common statistics (as LaTeX
    table rows) for the ED, random and young cohorts, run pairwise KS
    tests, and plot the probability density of the three samples.
    """
    # 'favourites_count'
    fields = ['friends_count', 'followers_count', 'statuses_count']
    names = ['following', 'follower', 'tweet']

    filter = {}
    # Per-field fit ranges for the power-law fits, one triple per cohort.
    fitranges = [[(200, 100000), (1000, 100000000), (800, 10000000)],
                 [(700, 10000), (800, 10000000), (800, 1000000)],
                 [(800, 100000), (20000, 10000000), (10000, 10000000)]]
    for i in range(len(fields)):
        field = fields[i]
        print '=====================', field
        # +1 so zero counts survive on logarithmic axes.
        feds = np.array(io.get_values_one_field('fed', 'scom', field,
                                                filter)) + 1
        randoms = np.array(
            io.get_values_one_field('random', 'scom', field, filter)) + 1
        youngs = np.array(
            io.get_values_one_field('young', 'scom', field, filter)) + 1

        # One LaTeX table row per cohort.
        comm = statis_util.comm_stat(feds)
        print 'ED & ' + str(comm[0]) + ' & ' + str(comm[1]) \
              + ' & ' + str(comm[2])+ ' & ' + str(comm[3]) + '\\\\'
        comm = statis_util.comm_stat(randoms)
        print 'Random &' + str(comm[0]) + ' & ' + str(comm[1]) \
              + ' & ' + str(comm[2])+ ' & ' + str(comm[3])+ '\\\\'
        comm = statis_util.comm_stat(youngs)
        print 'Younger &' + str(comm[0]) + ' & ' + str(comm[1]) \
              + ' & ' + str(comm[2])+ ' & ' + str(comm[3])+ '\\\\'
        print '\\hline'

        # z = statis_util.z_test(randoms, feds)
        # print 'z-test(Random, ED): & $n_1$: ' + str(z[0]) + ' & $n_2$: ' + str(z[1]) \
        #       + ' & z-value: ' + str(z[2])+ ' & p-value: ' + str(z[3])+ '\\\\'
        # z = statis_util.z_test(youngs, feds)
        # print 'z-test(Younger, ED): & $n_1$: ' + str(z[0]) + ' & $n_2$:' + str(z[1]) \
        #       + ' & z-value: ' + str(z[2])+ ' & p-value: ' + str(z[3])+ '\\\\'
        # z = statis_util.z_test(youngs, randoms)
        # print 'z-test(Younger, Random): & $n_1$: ' + str(z[0]) + ' & $n_2$: ' + str(z[1]) \
        #       + ' & z-value: ' + str(z[2])+ ' & p-value: ' + str(z[3])+ '\\\\'

        # Pairwise two-sample KS tests between cohorts.
        z = statis_util.ks_test(randoms, feds)
        print 'ks-test(Random, ED): & $n_1$: ' + str(z[0]) + ' & $n_2$: ' + str(z[1]) \
              + ' & ks-value: ' + str(z[2])+ ' & p-value: ' + str(z[3])+ '\\\\'
        z = statis_util.ks_test(youngs, feds)
        print 'ks-test(Younger, ED): & $n_1$: ' + str(z[0]) + ' & $n_2$: ' + str(z[1]) \
              + ' & ks-value: ' + str(z[2])+ ' & p-value: ' + str(z[3])+ '\\\\'
        z = statis_util.ks_test(youngs, randoms)
        print 'ks-test(Younger, Random): & $n_1$: ' + str(z[0]) + ' & $n_2$: ' + str(z[1]) \
              + ' & ks-value: ' + str(z[2])+ ' & p-value: ' + str(z[3])+ '\\\\'

        plot.plot_pdf_mul_data([feds, randoms, youngs],
                               names[i], ['g', 'b', 'r'], ['s', 'o', '^'],
                               ['ED', 'Random', 'Younger'],
                               linear_bins=False,
                               central=False,
                               fit=True,
                               fitranges=fitranges[i],
                               savefile=field + '.pdf')
Ejemplo n.º 5
0
def profile_feature_dependence():
    fields = ['friends_count', 'statuses_count', 'followers_count']
    names = ['following', 'tweet', 'follower']

    for i in xrange(len(fields)):
        fi = fields[i]
        ni = names[i]
        for j in xrange(i + 1, len(fields)):
            fj = fields[j]
            nj = names[j]
            print '=========================Dependence :', fi, fj
            plt.rcParams['legend.fontsize'] = 20
            plt.rcParams['axes.labelsize'] = 20
            ax = plt.gca()
            i = 0
            for db, color, mark, label in [('fed', 'g', 's', 'ED'),
                                           ('random', 'b', 'o', 'Random'),
                                           ('young', 'r', '^', 'Younger')]:
                print '++++++++++++++++++++++++++Dependence :', fi, fj, db
                fivalue = np.array(io.get_values_one_field(db, 'scom', fi))
                fjvalue = np.array(io.get_values_one_field(db, 'scom', fj))
                fivalue += 1
                fjvalue += 1
                xmeans, ymeans = plot.mean_bin(fivalue, fjvalue)
                ax.scatter(xmeans,
                           ymeans,
                           s=50,
                           c=color,
                           marker=mark,
                           label=label)
                fit_start = min(fivalue)
                fit_end = max(fivalue)
                # fit_start = np.percentile(fivalue, 2.5)
                # fit_end = np.percentile(fivalue, 97.5)
                xfit, yfit, cof = plot.lr_ls(xmeans, ymeans, fit_start,
                                             fit_end)
                ax.plot(xfit, yfit, c=color, linewidth=2, linestyle='--')
                ax.annotate(r'$k_y \propto {k_x}^{' + str(round(cof, 2)) +
                            '}$',
                            xy=(xfit[-15], yfit[-15]),
                            xycoords='data',
                            xytext=(28 + (i) * 10, -30 - (i) * 10),
                            textcoords='offset points',
                            fontsize=20,
                            arrowprops=dict(arrowstyle="->"))
                i += 1
            ax.set_xscale("log")
            ax.set_yscale("log")
            ax.set_ylabel('k(' + nj + ')')
            ax.set_xlabel('k(' + ni + ')')
            ax.set_xlim(xmin=1)
            ax.set_ylim(ymin=1)
            handles, labels = ax.get_legend_handles_labels()
            leg = ax.legend(handles, labels, loc=4)
            leg.draw_frame(True)
            plt.savefig(fi + '-' + fj + '.pdf')
            plt.clf()
Ejemplo n.º 6
0
def pro_tag_user():
    # get users with pro-ed and pro-recovery hashtags
    proed = set(iot.get_values_one_field('fed', 'proed_tag', 'user.id'))
    prorec = set(iot.get_values_one_field('fed', 'prorec_tag', 'user.id'))
    print len(proed), len(prorec), len(proed.intersection(prorec))
    print len(proed-prorec), len(prorec-proed)
    print prorec-proed
    return ([str(i) for i in proed-prorec],
            [str(i) for i in prorec-proed],
            [str(i) for i in proed.intersection(prorec)])
Ejemplo n.º 7
0
def feature_stat(dumped=False):
    fields = io.read_fields()
    print len(fields)
    assert isinstance(fields, object)
    for field in fields:
        keys = field.split('.')
        filter = {field: {'$exists': True}}
        eds = io.get_values_one_field('fed', 'scom', field, filter)
        randoms = io.get_values_one_field('random', 'scom', field, filter)
        youngs = io.get_values_one_field('young', 'scom', field, filter)
        compore_distribution(keys[-1], eds, randoms, youngs)
Ejemplo n.º 8
0
def load_net():
    """Export the follow networks of the three cohorts to GraphML files."""
    ed_graph = gt.load_network('fed', 'net')
    ed_graph.write_graphml('ed-net.graphml')

    # For the comparison cohorts, keep only edges between sampled users.
    for db, outfile in (('random', 'rd-net.graphml'),
                        ('younger', 'yg-net.graphml')):
        ids = iot.get_values_one_field(db, 'scom', 'id')
        graph = gt.load_network_subset(db, 'net', {'user': {'$in': ids}, 'follower': {'$in': ids}})
        graph.write_graphml(outfile)
Ejemplo n.º 9
0
def overlap():
    # overlap between two data
    core_ed = set(iot.get_values_one_field('fed', 'scom', 'id'))
    ian_ed = set()
    with open('uid.txt', 'r') as fo:
        for line in fo.readlines():
            ian_ed.add(int(line.strip()))
    print len(core_ed), len(ian_ed), len(core_ed.intersection(ian_ed))

    fed = set(iot.get_values_one_field('fed', 'com', 'id'))
    ian_all = set(iot.get_values_one_field('TwitterProAna', 'users', 'id'))
    print len(fed), len(ian_all), len(fed.intersection(ian_all))
    print len(fed), len(ian_ed), len(fed.intersection(ian_ed))
Ejemplo n.º 10
0
def compare_weights():
    """Compare CW and GW weight distributions for pro-rec vs. pro-ED users."""
    prorec, proed = edrelatedcom.rec_proed()  # groups derived from profiles
    for users in [prorec, proed]:
        weights = {}
        for key in ('cw', 'gw'):
            field = 'text_anal.' + key + '.value'
            weights[key] = iot.get_values_one_field(
                'fed', 'scom', field,
                {'id_str': {'$in': users}, field: {'$exists': True}})
        sns.distplot(weights['cw'], hist=False, label='CW')
        sns.distplot(weights['gw'], hist=False, label='GW')
        plt.show()
Ejemplo n.º 11
0
def hashtag_users():
    """Copy profiles of hashtag-using users into the fed.tag_com collection.

    Collects the distinct authors of pro-ED and pro-recovery tagged tweets
    and inserts each user's profile document from fed.com into
    fed.tag_com, skipping users that are already present.
    """
    com = dbt.db_connect_col('fed', 'com')
    times_ped = list(set(iot.get_values_one_field('fed', 'proed_tag', 'user.id')))
    times_rec = list(set(iot.get_values_one_field('fed', 'prorec_tag', 'user.id')))
    newtime = dbt.db_connect_col('fed', 'tag_com')
    # Unique index makes re-runs idempotent: duplicates raise and are skipped.
    newtime.create_index([('id', pymongo.ASCENDING)], unique=True)

    for users in [times_ped, times_rec]:
        for uid in users:
            user = com.find_one({'id': uid})
            try:
                newtime.insert(user)
            except pymongo.errors.DuplicateKeyError:
                # Already copied (user appears in both tag sets or a re-run).
                pass
Ejemplo n.º 12
0
def distribution_change(dbname, colname):
    """Compare LIWC feature distributions of pro-recovery vs. pro-ED users.

    Loads the two pickled user-id lists, gathers the selected LIWC fields
    for each group, prints mean/std and a two-sample KS test per feature,
    and shows a grouped box plot of all features.
    """
    rec_users1 = pickle.load(open('data/pro-recovery.pick', 'r'))
    pro_ed = pickle.load(open('data/pro_ed.pick', 'r'))
    print len(rec_users1)
    print len(pro_ed)
    features = [
        'liwc_anal.result.i',
        'liwc_anal.result.we',
        'liwc_anal.result.bio',
        'liwc_anal.result.body',
        'liwc_anal.result.health',
        'liwc_anal.result.posemo',
        'liwc_anal.result.negemo',
        'liwc_anal.result.ingest',
        'liwc_anal.result.anx',
        'liwc_anal.result.anger',
        'liwc_anal.result.sad'
                ]
    # Display names aligned index-by-index with `features`.
    names = ['I', 'We', 'Bio', 'Body', 'Health', 'Posemo', 'Negemo', 'Ingest', 'Anx', 'Anger', 'Sad']
    df = pd.DataFrame()
    pltt.plot_config()
    for i in xrange(len(features)):
        feature = features[i]
        old_values = iot.get_values_one_field(dbname, colname, feature, {'id':{'$in': rec_users1}})
        df1 = pd.DataFrame({'Feature': names[i], 'Group': 'Pro-Recovery', 'Values': old_values})
        new_values = iot.get_values_one_field(dbname, colname, feature, {'id':{'$in': pro_ed}})
        df2 = pd.DataFrame({'Feature': names[i], 'Group': 'Pro-ED', 'Values': new_values})
        df1 = df1.append(df2)
        # Accumulate all per-feature frames into one long-format DataFrame.
        if len(df) == 0:
            df = df1
        else:
            df = df.append(df1)
        '''Plot Individual'''
        # sns.distplot(old_values, hist=False, label='Before')
        # sns.distplot(new_values, hist=False, label='After')
        d, p = stats.ks_2samp(old_values, new_values)
        print (names[i] + ', %.3f(%.3f), %.3f(%.3f), %.3f(%.3f)' %((np.mean(old_values)), (np.std(old_values)),
                                                 (np.mean(new_values)), (np.std(new_values)), d, p))
        # plt.xlabel(feature)
        # plt.ylabel('PDF')
        # # plt.show()
        # plt.savefig(dbname+'_'+feature+'_time.pdf')
        # plt.clf()
    sns.set(style="whitegrid", palette="pastel", color_codes=True)
    # sns.violinplot(x="Feature", y="Values", hue="Time", data=df, split=True,
    #                inner="quart", palette={"Before": "b", "After": "y"})
    # sns.despine(left=True)
    sns.boxplot(x="Feature", y="Values", hue="Group", data=df, palette="PRGn")
    sns.despine(offset=10, trim=True)
    plt.show()
Ejemplo n.º 13
0
def distribution_change(dbname, colname):
    rec_users1 = pickle.load(open('data/pro-recovery.pick', 'r'))
    pro_ed = pickle.load(open('data/pro_ed.pick', 'r'))
    print len(rec_users1)
    print len(pro_ed)
    features = [
        'liwc_anal.result.i',
        'liwc_anal.result.we',
        'liwc_anal.result.bio',
        'liwc_anal.result.body',
        'liwc_anal.result.health',
        'liwc_anal.result.posemo',
        'liwc_anal.result.negemo',
        'liwc_anal.result.ingest',
        'liwc_anal.result.anx',
        'liwc_anal.result.anger',
        'liwc_anal.result.sad'
                ]
    names = ['I', 'We', 'Bio', 'Body', 'Health', 'Posemo', 'Negemo', 'Ingest', 'Anx', 'Anger', 'Sad']
    df = pd.DataFrame()
    pltt.plot_config()
    for i in xrange(len(features)):
        feature = features[i]
        old_values = iot.get_values_one_field(dbname, colname, feature, {'id':{'$in': rec_users1}})
        df1 = pd.DataFrame({'Feature': names[i], 'Group': 'Pro-Recovery', 'Values': old_values})
        new_values = iot.get_values_one_field(dbname, colname, feature, {'id':{'$in': pro_ed}})
        df2 = pd.DataFrame({'Feature': names[i], 'Group': 'Pro-ED', 'Values': new_values})
        df1 = df1.append(df2)
        if len(df) == 0:
            df = df1
        else:
            df = df.append(df1)
        '''Plot Individual'''
        # sns.distplot(old_values, hist=False, label='Before')
        # sns.distplot(new_values, hist=False, label='After')
        d, p = stats.ks_2samp(old_values, new_values)
        print (names[i] + ', %.3f(%.3f), %.3f(%.3f), %.3f(%.3f)' %((np.mean(old_values)), (np.std(old_values)),
                                                 (np.mean(new_values)), (np.std(new_values)), d, p))
        # plt.xlabel(feature)
        # plt.ylabel('PDF')
        # # plt.show()
        # plt.savefig(dbname+'_'+feature+'_time.pdf')
        # plt.clf()
    sns.set(style="whitegrid", palette="pastel", color_codes=True)
    # sns.violinplot(x="Feature", y="Values", hue="Time", data=df, split=True,
    #                inner="quart", palette={"Before": "b", "After": "y"})
    # sns.despine(left=True)
    sns.boxplot(x="Feature", y="Values", hue="Group", data=df, palette="PRGn")
    sns.despine(offset=10, trim=True)
    plt.show()
Ejemplo n.º 14
0
def profile_feature_stat():
    # 'favourites_count'
    fields = ['friends_count', 'followers_count', 'statuses_count']
    names = ['following', 'follower', 'tweet']

    filter = {}
    fitranges = [[(200, 100000), (1000, 100000000), (800, 10000000)],
                     [(700, 10000), (800, 10000000), (800, 1000000)],
                     [(800, 100000), (20000, 10000000), (10000, 10000000)]]
    for i in range(len(fields)):
        field = fields[i]
        print '=====================', field
        feds = np.array(io.get_values_one_field('fed', 'scom', field, filter))+1
        randoms = np.array(io.get_values_one_field('random', 'scom', field, filter))+1
        youngs = np.array(io.get_values_one_field('young', 'scom', field, filter))+1

        comm = statis_util.comm_stat(feds)
        print 'ED & ' + str(comm[0]) + ' & ' + str(comm[1]) \
              + ' & ' + str(comm[2])+ ' & ' + str(comm[3]) + '\\\\'
        comm = statis_util.comm_stat(randoms)
        print 'Random &' + str(comm[0]) + ' & ' + str(comm[1]) \
              + ' & ' + str(comm[2])+ ' & ' + str(comm[3])+ '\\\\'
        comm = statis_util.comm_stat(youngs)
        print 'Younger &' + str(comm[0]) + ' & ' + str(comm[1]) \
              + ' & ' + str(comm[2])+ ' & ' + str(comm[3])+ '\\\\'
        print '\\hline'

        # z = statis_util.z_test(randoms, feds)
        # print 'z-test(Random, ED): & $n_1$: ' + str(z[0]) + ' & $n_2$: ' + str(z[1]) \
        #       + ' & z-value: ' + str(z[2])+ ' & p-value: ' + str(z[3])+ '\\\\'
        # z = statis_util.z_test(youngs, feds)
        # print 'z-test(Younger, ED): & $n_1$: ' + str(z[0]) + ' & $n_2$:' + str(z[1]) \
        #       + ' & z-value: ' + str(z[2])+ ' & p-value: ' + str(z[3])+ '\\\\'
        # z = statis_util.z_test(youngs, randoms)
        # print 'z-test(Younger, Random): & $n_1$: ' + str(z[0]) + ' & $n_2$: ' + str(z[1]) \
        #       + ' & z-value: ' + str(z[2])+ ' & p-value: ' + str(z[3])+ '\\\\'

        z = statis_util.ks_test(randoms, feds)
        print 'ks-test(Random, ED): & $n_1$: ' + str(z[0]) + ' & $n_2$: ' + str(z[1]) \
              + ' & ks-value: ' + str(z[2])+ ' & p-value: ' + str(z[3])+ '\\\\'
        z = statis_util.ks_test(youngs, feds)
        print 'ks-test(Younger, ED): & $n_1$: ' + str(z[0]) + ' & $n_2$: ' + str(z[1]) \
              + ' & ks-value: ' + str(z[2])+ ' & p-value: ' + str(z[3])+ '\\\\'
        z = statis_util.ks_test(youngs, randoms)
        print 'ks-test(Younger, Random): & $n_1$: ' + str(z[0]) + ' & $n_2$: ' + str(z[1]) \
              + ' & ks-value: ' + str(z[2])+ ' & p-value: ' + str(z[3])+ '\\\\'

        plot.plot_pdf_mul_data([feds, randoms, youngs], names[i], ['g', 'b', 'r'], ['s', 'o', '^'], ['ED', 'Random', 'Younger'],
                               linear_bins=False, central=False, fit=True, fitranges=fitranges[i], savefile=field+'.pdf')
Ejemplo n.º 15
0
def process_tweet(dbname, comname, timename, label, filename):
    db = dbt.db_connect_no_auth(dbname)
    times = db[timename]
    user_list = iot.get_values_one_field(dbname, comname, 'id', {
        "timeline_count": {
            '$gt': 0
        },
        'lang': 'en'
    })
    target_users = []
    for user in user_list:
        context = ''
        for time in times.find({'user.id': user}).sort([('id', 1)]):
            # print time['created_at']
            if 'retweeted_status' in time:
                continue
            elif 'quoted_status' in time:
                continue
            else:
                text = process(time['text'])
                if text:
                    # print user, time['id'], text, '<-------', time['text']
                    context += text + ' '
                else:
                    continue
                    # print user, time['id'], 'None', '<-------', time['text']
        if len(context.split()) > 50:
            target_users.append(user)
            print '__label__' + label + ' , ' + context
    pickle.dump(target_users, open('data/' + filename + '.pick', 'w'))
Ejemplo n.º 16
0
def re_snowball_friends(olddbname, oldcomname, newdbname, newcomname):
    """Re-crawl the follow network starting from the old DB's seed users.

    Creates the indexed target collections, looks up the level-1 (seed) ED
    users from the old database in batches, and then snowballs followings
    and followers level by level until neither direction yields new work.
    """
    newdb = dbt.db_connect_no_auth(newdbname)
    newcom = newdb[newcomname]
    newnet = newdb['net']
    # Indexes: unique user ids, crawl bookkeeping, and unique follow edges.
    newcom.create_index("id", unique=True)
    newcom.create_index([('level', pymongo.ASCENDING),
                         ('following_prelevel_node', pymongo.ASCENDING)],
                        unique=False)
    newcom.create_index([('level', pymongo.ASCENDING),
                         ('follower_prelevel_node', pymongo.ASCENDING)],
                        unique=False)
    newnet.create_index([("user", pymongo.ASCENDING),
                         ("follower", pymongo.ASCENDING)],
                        unique=True)

    '''Reteive ED core users'''
    ed_users = iot.get_values_one_field(olddbname, oldcomname, 'id', {'level': 1})
    # Twitter's users/lookup accepts at most 100 ids per call, so batch.
    list_size = len(ed_users)
    length = int(math.ceil(list_size/100.0))
    for index in xrange(length):
        index_begin = index*100
        index_end = min(list_size, index_begin+100)
        lookup.lookup_user_list(ed_users[index_begin:index_end], newcom, 1, 'N')

    level = 1
    while True:
        # Each call of snowball_following and snowball_follower only process up to 200 users
        print datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S"), 'Snowball followings of seeds for sample db', level
        following_flag = following.snowball_following(newcom, newnet, level, 'N')
        print datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S"), 'Snowball followees of seeds for sample db', level
        follower_flag = follower.snowball_follower(newcom, newnet, level, 'N')
        # Stop only when both directions report no remaining work.
        if following_flag == False and follower_flag == False:
            break
        else:
            continue
Ejemplo n.º 17
0
def data_4_opinionfinder(dbname, comname, timename, outpath, filter=None):
    """Export users' tweets as OpinionFinder input documents.

    Strips RT markers, @mentions, #hashtags and URLs from each tweet,
    ensures sentence-final punctuation, writes one <outpath>/<user>.data
    file per user with tweets, and records the produced files in
    <outpath>.doclist.

    :param filter: optional Mongo query restricting the user set.
    """
    # BUGFIX: `filter={}` was a shared mutable default argument; use None
    # as the sentinel and create a fresh dict per call.
    if filter is None:
        filter = {}
    db = dbt.db_connect_no_auth(dbname)
    time = db[timename]

    rtgrex = re.compile(r'RT (?<=^|(?<=[^a-zA-Z0-9-\.]))@([A-Za-z0-9_]+):')  # for Retweet
    mgrex = re.compile(r'(?<=^|(?<=[^a-zA-Z0-9-\.]))@([A-Za-z0-9_]+)')  # for mention
    hgrex = re.compile(r'(?<=^|(?<=[^a-zA-Z0-9]))#([A-Za-z0-9_]+)')  # for hashtags
    ugrex = re.compile(r'(https?://[^\s]+)')  # for url

    users = io.get_values_one_field(dbname, comname, 'id_str', filter)
    userlist = list()
    for user in users:
        documents = list()
        for tweet in time.find({'user.id': int(user)}):
            text = tweet['text'].encode('utf8')
            # replace RT, @, # and Http://
            text = rtgrex.sub('', text)
            text = mgrex.sub('', text)
            text = hgrex.sub('', text)
            text = ugrex.sub('', text)
            text = text.strip()
            # OpinionFinder expects sentence-terminating punctuation.
            if not text.endswith(('.', '?', '!')):
                text += '.'
            words = text.split()
            if len(words) > 0:
                documents.append(' '.join(words))
        if len(documents) > 0:
            with open(outpath+'/'+user+'.data', 'w') as fo:
                for document in documents:
                    fo.write(document+'\t\n')
            userlist.append(user)
    with open(outpath+'.doclist', 'w') as fo:
        for user in userlist:
            fo.write('database/'+outpath+'/'+ user+'.data\n')
Ejemplo n.º 18
0
def avg_liwc(dbname):
    """Plot per-period central LIWC values for five time-sliced collections.

    For every LIWC field, reads the field from the collections
    <dbname>com_t1 .. <dbname>com_t5 (one per time period), keeps the
    central values, and saves an errorbar + violin plot to data/<field>.pdf.
    """
    fields = iot.read_fields()
    for field in fields:
        filters = {field: {'$exists': True}}
        results = list()
        N = 5
        for i in range(1, N+1):
            result = iot.get_values_one_field(dbname, dbname+'com_t'+str(i), field, filters)
            # Trim outliers; central_values is defined elsewhere in this module.
            result = central_values(result)
            results.append(result)
        ax = plt.gca()
        # x positions 1..N, one per time period.
        ind = [y+1 for y in range(len(results))]
        means = [np.mean(result) for result in results]
        stds = [np.std(result) for result in results]
        ax.errorbar(ind, means, stds, fmt='--o--', capthick=3)
        ax.violinplot(results, showmeans=False, showextrema=True)
        ax.set_xticks(ind)
        # for i in ind:
        #     ax.text(i, means[i-1]+0.5,
        #         str(round(means[i-1], 2))+ '$\pm$'+ str(round(stds[i-1], 2)),
        #         ha='center', va='bottom', )
        ax.set_xticklabels(('Before 2012', '2012', '2013', '2014', 'After 2014'))
        ax.set_xlabel('Time Series')
        tokens = field.split('.')
        # Label the y-axis with the most specific meaningful token.
        if tokens[-1] == 'value':
            ax.set_ylabel(tokens[-2].upper())
        else:
            ax.set_ylabel(tokens[-1])
        ax.grid(True)
        plt.savefig('data/'+field+'.pdf')
        plt.clf()
Ejemplo n.º 19
0
def feature_stat(dumped=False):
    fields = io.read_fields()
    print len(fields)
    assert isinstance(fields, object)
    for field in fields:
        keys = field.split('.')
        # filter = {field: {'$exists': True}}
        # eds = io.get_values_one_field('fed', 'scom', field, filter)
        # randoms = io.get_values_one_field('random', 'scom', field, filter)
        # youngs = io.get_values_one_field('young', 'scom', field, filter)
        # compore_distribution(keys[-1], eds, randoms, youngs)

        positive = io.get_values_one_field('depression', 'com', field, {field: {'$exists': True}, 'checked': True})
        negative = io.get_values_one_field('depression', 'neg_com', field, {field: {'$exists': True}})
        # print len(positive), len(negative)
        compore_distribution(keys[-1], positive, negative)
Ejemplo n.º 20
0
def network_analysis():
    """Export the follow, retweet and communication networks of depressive users.

    Restricts each network to edges where both endpoints are in the
    depressive user set, prints summary statistics, and writes GraphML
    files under data/.
    """
    # output network among depression users
    # user1 = iot.get_values_one_field('depression', 'users1', 'id')
    # user2 = iot.get_values_one_field('depression', 'users2', 'id')
    # print len(user1), len(user2)
    # alluser = user1 + user2
    alluser = iot.get_values_one_field('depression', 'depressive', 'id')
    # Follow edges with both endpoints inside the user set.
    follow_net = gt.load_network_subset('depression', 'net', {
        'user': {
            '$in': alluser
        },
        'follower': {
            '$in': alluser
        }
    })
    gt.net_stat(follow_net)
    follow_net.write_graphml('data/follow_net.graphml')

    # Behaviour networks: one export per interaction type.
    for beh in ['retweet', 'communication']:
        print beh
        bnetwork = gt.load_beh_network_subset(userlist=alluser,
                                              db_name='depression',
                                              collection='bnet',
                                              btype=beh)
        gt.net_stat(bnetwork)
        bnetwork.write_graphml('data/' + beh + '_net.graphml')
Ejemplo n.º 21
0
def refine_recovery(dbname, netname):
    '''
    refine the users who have use hashtag #recovery
    :param dbname:
    :param netname:
    :return:
    '''
    network = dbutil.db_connect_col(dbname, netname)
    proed = set([
        'proed', 'proana', 'promia', 'proanorexia', 'proanamia', 'proanatips',
        'proanatip'
    ])
    proedrel = proed
    for link in network.find(no_cursor_timeout=True):
        tags = set(link['tags'])
        if len(proed.intersection(tags)) > 0:
            proedrel = proedrel.union(tags)
    print len(proedrel)
    users = iot.get_values_one_field(dbname, netname, 'id0')
    print len(users)
    for user in users:
        # print user
        utags = set()
        for link in network.find({'id0': user}):
            utags.add(tag for tag in link['tags'])
        if len(utags.intersection(proedrel)) == 0:
            network.delete_many({'id0': user})
Ejemplo n.º 22
0
def user_hashtag_profile(dbname, hash_com):
    '''
    Map the hashtags that a user has used to communities of the hashtag
    network and build each user's <community: proportion> profile vector.
    :param dbname: database name
    :param hash_com: dict mapping normalized hashtag -> community id
    :return: None (pickles the profiles to data/user-hash-profile.pick)
    '''
    ed_users = iot.get_values_one_field(dbname, 'scom', 'id')
    db = dbt.db_connect_no_auth(dbname)
    com_length = len(set(hash_com.values()))
    times = db['timeline']
    user_hash_profile = {}
    for uid in ed_users:
        counter = {}
        for tweet in times.find({'user.id': uid, '$where': 'this.entities.hashtags.length>0'}):
            hashtags = tweet['entities']['hashtags']
            # Normalize tags (lowercase, drop '_'/'-') and count each
            # distinct tag once per tweet.  `tag` (was `hash`) avoids
            # shadowing the builtin hash().
            tag_set = set()
            for entity in hashtags:
                tag_set.add(entity['text'].encode('utf-8').lower().replace('_', '').replace('-', ''))
            for tag in tag_set:
                counter[tag] = counter.get(tag, 0) + 1
        vector = [0.0]*com_length
        for tag in counter:
            if tag in hash_com:
                vector[hash_com[tag]] += counter[tag]
        total = sum(vector)
        if total == 0:
            # No known hashtags: keep the all-zero vector unnormalized.
            user_hash_profile[uid] = np.array(vector)
        else:
            user_hash_profile[uid] = np.array(vector)/total

    # BUGFIX: close the pickle file; the original leaked the open() handle.
    with open('data/user-hash-profile.pick', 'w') as fout:
        pickle.dump(user_hash_profile, fout)
Ejemplo n.º 23
0
def ed_follow_net():
    # construct ED and their followee network
    # Filters the 'follownet' graph to core ED users plus well-followed
    # accounts, writes it to GraphML, then restricts each behavior network
    # (retweet/reply/mention) to those same nodes.
    g = gt.load_network('fed', 'follownet')
    g.vs['deg'] = g.indegree()
    users = set(iot.get_values_one_field('fed', 'scom', 'id'))
    nodes = []
    # Keep a node if it is a core ED user, or if it has more than 5 followers
    for v in g.vs:
        if int(v['name']) in users:
            nodes.append(v)
        elif v['deg'] > 5:
            nodes.append(v)
        else:
            pass
    print 'Filtered nodes: %d' %len(nodes)
    g = g.subgraph(nodes)
    gt.summary(g)
    g.write_graphml('ed-friend'+'.graphml')

    # sbnet have extended all interactions posted by ED users
    edusers = set(g.vs['name'])
    for btype in ['retweet', 'reply', 'mention']:
        gb = gt.load_beh_network('fed', 'sbnet', btype)
        gt.summary(gb)
        # Keep only behavior-network nodes that survived the follow-net filter
        nodes = []
        for v in gb.vs:
            if v['name'] in edusers:
                nodes.append(v)
        gb = gb.subgraph(nodes)
        gt.summary(gb)
        gb.write_graphml('ed-'+btype+'-follow.graphml')
Ejemplo n.º 24
0
def label_ed_recovery(hash_com, com_size, idx=[18, 102]):
    # select users in prorec that have more ed-related hashtags
    # `idx` lists the community ids treated as recovery-related.
    # NOTE: mutable default `idx` is shared across calls; it is only read here.
    times = dbt.db_connect_col('fed', 'prorec_tag')
    com = dbt.db_connect_col('fed', 'tag_com')
    # Baseline threshold = overall share of the target communities among all tags
    threshold = float(sum([com_size[i] for i in idx])) / sum(com_size.values())
    print 'threshold: ', threshold
    users = list(set(iot.get_values_one_field('fed', 'prorec_tag', 'user.id')))
    for uid in users:
        taget_count, all_count = 0.0, 0.0
        for tweet in times.find({'user.id': uid}):
            hashtags = tweet['entities']['hashtags']
            hash_set = set()
            for hash in hashtags:
                # need no .encode('utf-8')
                # Normalize tags the same way the community mapping was built
                hash_set.add(hash['text'].encode('utf-8').lower().replace(
                    '_', '').replace('-', ''))
            for tag in hash_set:
                com_id = hash_com.get(tag, -1)
                if com_id > -1:
                    all_count += 1
                    if com_id in idx:
                        taget_count += 1

        # Flag users whose recovery-tag share beats the corpus-wide baseline.
        # NOTE(review): stored key 'rec_tageted' (sic) is kept as-is because
        # other code presumably queries that exact spelling -- confirm.
        if all_count and taget_count / all_count > threshold:
            com.update({'id': uid}, {'$set': {
                'rec_tageted': True
            }},
                       upsert=False)
Ejemplo n.º 25
0
def hashtag_users_label_proed():
    """Mark every user appearing in 'proed_tag' as pro-ED targeted in 'tag_com'."""
    tag_collection = dbt.db_connect_col('fed', 'tag_com')
    proed_uids = set(iot.get_values_one_field('fed', 'proed_tag', 'user.id'))
    for user_id in proed_uids:
        tag_collection.update({'id': user_id},
                              {'$set': {'ped_tageted': True}},
                              upsert=False)
Ejemplo n.º 26
0
def avg_liwc(dbname):
    # For every LIWC field, plot its distribution (violin + error bars of
    # mean/std) across five time cohorts (<2012 ... >2014) and save one PDF
    # per field under data/.
    fields = iot.read_fields()
    for field in fields:
        filters = {field: {'$exists': True}}
        results = list()
        N = 5
        # Cohort collections are named <dbname>com_t1 .. <dbname>com_t5
        for i in range(1, N + 1):
            result = iot.get_values_one_field(dbname,
                                              dbname + 'com_t' + str(i), field,
                                              filters)
            # presumably trims outliers to central values -- TODO confirm
            result = central_values(result)
            results.append(result)
        ax = plt.gca()
        ind = [y + 1 for y in range(len(results))]
        means = [np.mean(result) for result in results]
        stds = [np.std(result) for result in results]
        ax.errorbar(ind, means, stds, fmt='--o--', capthick=3)
        ax.violinplot(results, showmeans=False, showextrema=True)
        ax.set_xticks(ind)
        # for i in ind:
        #     ax.text(i, means[i-1]+0.5,
        #         str(round(means[i-1], 2))+ '$\pm$'+ str(round(stds[i-1], 2)),
        #         ha='center', va='bottom', )
        ax.set_xticklabels(
            ('Before 2012', '2012', '2013', '2014', 'After 2014'))
        ax.set_xlabel('Time Series')
        # Y label: use the last meaningful token of the dotted field path
        tokens = field.split('.')
        if tokens[-1] == 'value':
            ax.set_ylabel(tokens[-2].upper())
        else:
            ax.set_ylabel(tokens[-1])
        ax.grid(True)
        plt.savefig('data/' + field + '.pdf')
        plt.clf()
Ejemplo n.º 27
0
def profile_feature_dependence():
    fields = ['friends_count', 'statuses_count', 'followers_count']
    names = ['following', 'tweet', 'follower']

    for i in xrange(len(fields)):
        fi = fields[i]
        ni = names[i]
        for j in xrange(i+1, len(fields)):
            fj = fields[j]
            nj = names[j]
            print '=========================Dependence :', fi, fj
            plt.rcParams['legend.fontsize'] = 20
            plt.rcParams['axes.labelsize'] = 20
            ax = plt.gca()
            i = 0
            for db, color, mark, label in [('fed', 'g', 's', 'ED'),
                                           ('random', 'b', 'o', 'Random'),
                                           ('young', 'r', '^', 'Younger')]:
                print '++++++++++++++++++++++++++Dependence :', fi, fj, db
                fivalue = np.array(io.get_values_one_field(db, 'scom', fi))
                fjvalue = np.array(io.get_values_one_field(db, 'scom', fj))
                fivalue += 1
                fjvalue += 1
                xmeans, ymeans = plot.mean_bin(fivalue, fjvalue)
                ax.scatter(xmeans, ymeans, s=50, c=color, marker=mark, label=label)
                fit_start = min(fivalue)
                fit_end = max(fivalue)
                # fit_start = np.percentile(fivalue, 2.5)
                # fit_end = np.percentile(fivalue, 97.5)
                xfit, yfit, cof = plot.lr_ls(xmeans, ymeans, fit_start, fit_end)
                ax.plot(xfit, yfit, c=color, linewidth=2, linestyle='--')
                ax.annotate(r'$k_y \propto {k_x}^{'+str(round(cof, 2))+'}$',
                 xy=(xfit[-15], yfit[-15]),  xycoords='data',
                 xytext=(28+(i)*10, -30-(i)*10), textcoords='offset points', fontsize=20,
                 arrowprops=dict(arrowstyle="->"))
                i += 1
            ax.set_xscale("log")
            ax.set_yscale("log")
            ax.set_ylabel('k('+nj+')')
            ax.set_xlabel('k('+ni+')')
            ax.set_xlim(xmin=1)
            ax.set_ylim(ymin=1)
            handles, labels = ax.get_legend_handles_labels()
            leg = ax.legend(handles, labels, loc=4)
            leg.draw_frame(True)
            plt.savefig(fi+'-'+fj+'.pdf')
            plt.clf()
Ejemplo n.º 28
0
def compare_distribute(dbname, comname):
    # Compare prior- vs post-treatment LIWC feature distributions with a
    # grouped boxplot, for users who have both prior and post LIWC results.
    user = iot.get_values_one_field(dbname, comname, 'id', {'prior-liwc.result.WC':{'$exists': True},
                                                                'post-liwc.result.WC':{'$exists': True}})
    print len(user)
    print user
    # Dotted suffixes appended to 'prior-liwc' / 'post-liwc' field prefixes
    features = [
        '.result.i',
        '.result.we',
        '.result.bio',
        '.result.body',
        '.result.health',
        '.result.posemo',
        '.result.negemo',
        '.result.ingest',
        '.result.anx',
        '.result.anger',
        '.result.sad'
        # '.result.work'
        # '.result.future'
                ]
    # Display names, index-aligned with `features`
    names = [
        'I', 'We',
             'Bio', 'Body', 'Health', 'Posemo', 'Negemo', 'Ingest', 'Anx', 'Anger', 'Sad',
        # 'Work',
        # 'Future'
    ]
    df = []
    for i in xrange(len(features)):
        feature = features[i]
        prior_values = iot.get_values_one_field(dbname, comname, 'prior-liwc'+feature, {'id':{'$in': user}})
        post_values = iot.get_values_one_field(dbname, comname, 'post-liwc'+feature, {'id':{'$in': user}})
        # sns.kdeplot(np.array(prior_values), label="Prior")
        # sns.kdeplot(np.array(post_values), label="Post")
        # plt.legend()
        # sns.plt.title(feature)
        # plt.show()
        # plt.clf()
        # Long-format rows: one frame per feature/group for the boxplot below
        df_prior = pd.DataFrame({'Feature': names[i], 'Group': 'Prior', 'Values': prior_values})
        df_post = pd.DataFrame({'Feature': names[i], 'Group': 'Post', 'Values': post_values})

        df.append(df_prior)
        df.append(df_post)
    df = pd.concat(df)
    sns.set(style="whitegrid", palette="pastel", color_codes=True)
    sns.boxplot(x="Feature", y="Values", hue="Group", data=df, palette="PRGn")
    # sns.despine(offset=10, trim=True)
    plt.show()
Ejemplo n.º 29
0
def diversity_db(dbname, comname, behavior):
    userlist = iot.get_values_one_field(dbname, comname, 'id_str',
                                        {'timeline_count': {'$gt': 0}})
    g = bahavior_net(dbname, comname, 'bnet', behavior)
    # pickle.dump(g, open('data/'+dbname+'_'+behavior+'.pick', 'w'))
    print dbname, behavior
    # g = pickle.load(open('data/' + dbname + '_' + behavior + '.pick', 'r'))
    return netstatis(dbname, behavior, g, userlist)
Ejemplo n.º 30
0
def control_users():
    """Copy ED users who never appear in 'recover' into the 'control_com' collection."""
    source = dbt.db_connect_col('fed', 'scom')
    recovered = set(iot.get_values_one_field('fed', 'recover', 'user.id'))
    target = dbt.db_connect_col('fed', 'control_com')
    target.create_index("id", unique=True)
    for doc in source.find():
        if doc['id'] not in recovered:
            target.insert(doc)
Ejemplo n.º 31
0
def image_color_compare():
    # Compare dominant profile-banner colors across the ED, random and
    # younger datasets: fetch banner URLs, extract image colors, bucket them
    # against standard colors, and plot per-dataset color bars.
    ed_urls = io.get_values_one_field('fed', 'com', 'profile_banner_url', {
        'level': 1,
        'profile_banner_url': {
            '$exists': True
        }
    })
    rd_urls = io.get_values_one_field('random', 'com', 'profile_banner_url', {
        'level': 1,
        'profile_banner_url': {
            '$exists': True
        }
    })
    yg_urls = io.get_values_one_field('young', 'com', 'profile_banner_url', {
        'level': 1,
        'profile_banner_url': {
            '$exists': True
        }
    })

    # Cache the URL lists so reruns can skip the DB queries (see the
    # commented loads below)
    pickle.dump(ed_urls, open("data/edimage.pick", "wb"))
    pickle.dump(rd_urls, open("data/rdimage.pick", "wb"))
    pickle.dump(yg_urls, open("data/ygimage.pick", "wb"))
    standers, rgbstan = color_standers()

    # ed_urls = pickle.load(open("data/edimage.pick", "rb"))
    # rd_urls = pickle.load(open("data/rdimage.pick", "rb"))
    # yg_urls = pickle.load(open("data/ygimage.pick", "rb"))

    ed_cs = get_image_color(ed_urls)
    pickle.dump(ed_cs, open("data/edics.pick", "wb"))
    ed_cs = pickle.load(open("data/edics.pick", "rb"))
    edi = cate_color(ed_cs, standers, 'lab')
    plot.color_bars(rgbstan, edi)

    rd_cs = get_image_color(rd_urls)
    pickle.dump(rd_cs, open("data/rdics.pick", "wb"))
    rd_cs = pickle.load(open("data/rdics.pick", "rb"))
    rdi = cate_color(rd_cs, standers, 'lab')
    plot.color_bars(rgbstan, rdi)

    # NOTE(review): unlike ed_cs/rd_cs, yg_cs is not round-tripped through its
    # pickle before use -- likely harmless, but confirm it was intentional.
    yg_cs = get_image_color(yg_urls)
    pickle.dump(yg_cs, open("data/ygics.pick", "wb"))
    ygi = cate_color(yg_cs, standers, 'lab')
    plot.color_bars(rgbstan, ygi)
Ejemplo n.º 32
0
def remove_random_users(dbname, comname, netname):
    """Delete all level-3 users and every follow edge touching them.

    :param dbname: database name
    :param comname: community (user) collection name
    :param netname: follow-edge collection name
    """
    com = dbt.db_connect_col(dbname, comname)
    # PERF FIX: membership is tested twice per edge below; a list made the
    # scan O(edges * users), so build a set once for O(1) lookups.
    users = set(iot.get_values_one_field(dbname, comname, 'id', {'level': 3}))
    net = dbt.db_connect_col(dbname, netname)
    for row in net.find(no_cursor_timeout=True):
        uid = row['user']
        fid = row['follower']
        # Drop the edge if either endpoint is a level-3 user
        if uid in users or fid in users:
            net.delete_one({'_id': row['_id']})
    com.delete_many({'level': 3})
Ejemplo n.º 33
0
def calculate_extenal_user():
    # Calculate how many users have been retweeted by ED but do not exist in ED users
    users = set(iot.get_values_one_field('fed', 'com', 'id'))
    print len(users)
    net = dbt.db_connect_col('fed', 'sbnet')
    i, count = 0, 0
    for record in net.find():
        if (record['id0'] not in users) or (record['id1'] not in users):
            i = +1
        count += 1
    print i, count, float(i) / count
Ejemplo n.º 34
0
def ED_followee():
    # put all ED's followees in follownet
    net = dbt.db_connect_col('fed', 'net2')
    users = set(iot.get_values_one_field('fed', 'scom', 'id'))
    print len(users)
    tem = dbt.db_connect_col('fed', 'follownet')
    for re in net.find():
        if re['follower'] in users:
            try:
                tem.insert(re)
            except pymongo.errors.DuplicateKeyError:
                pass
Ejemplo n.º 35
0
def diversity_db(dbname, comname, behavior, netname):
    userlist = iot.get_values_one_field(dbname, comname, 'id',
                                        # {'timeline_count': {'$gt': 0}}
                                        )
    g = gt.load_beh_network_subset(userlist, dbname, netname, behavior)
    gt.summary(g)

    # g = bahavior_net(dbname, comname, netname, behavior)
    # pickle.dump(g, open('data/'+dbname+'_'+behavior+'.pick', 'w'))
    print dbname, behavior
    # g = pickle.load(open('data/' + dbname + '_' + behavior + '.pick', 'r'))
    return netstatis(dbname, behavior, g, [str(i) for i in userlist], comname)
Ejemplo n.º 36
0
def ed_tweet_normal_tweet_count():
    user_ids = set(iot.get_values_one_field('fed', 'ed_tag', 'user.id'))
    print len(user_ids)
    com = dbt.db_connect_col('fed', 'com')
    tags = dbt.db_connect_col('fed', 'ed_tag')
    data = []
    for uid in user_ids:
        ed_count = tags.count({'user.id': uid})
        all_count = com.find_one({'id': uid})['timeline_count']
        data.append([uid, ed_count, all_count])
    df = pd.DataFrame(data, columns=['id', 'ed_tweet_count', 'all_tweet_count'])
    df.to_csv('user-ed-stats.csv')
Ejemplo n.º 37
0
def filter_user():
    # Find users who have tweets both before and after treatment; print the
    # cohort sizes and the mean proportion of prior-period tweets.
    prior = dbt.db_connect_col('fed', 'prior_treat')
    post = dbt.db_connect_col('fed', 'post_treat')
    # NOTE(review): `com` is connected but never used below -- confirm whether
    # a scom lookup was dropped.
    com = dbt.db_connect_col('fed', 'scom')

    # NOTE(review): treat_com is set up with a unique index but nothing is
    # inserted in this function -- presumably filled elsewhere; verify.
    treat_com = dbt.db_connect_col('fed', 'treat_com')
    treat_com.create_index("id", unique=True)

    prior_user = iot.get_values_one_field('fed', 'prior_treat', 'user.id')
    post_user = iot.get_values_one_field('fed', 'post_treat', 'user.id')
    print len(set(prior_user)), len(set(post_user)), len(set(prior_user).intersection(set(post_user)))
    users = list()
    propotions = list()
    # Keep only users with at least one tweet in each period
    for uid in set(prior_user).intersection(set(post_user)):
        count_prior = prior.count({'user.id': uid})
        count_post = post.count({'user.id': uid})
        if count_prior > 0 and count_post > 0:
            users.append(uid)
            propotions.append(float(count_prior)/(count_prior + count_post))

    print len(users)
    print np.mean(propotions)
Ejemplo n.º 38
0
def out_data():
    # Export LIWC features for control vs. treatment users, prior vs. post
    # treatment, into treatment.csv (columns: id, <liwc fields>, treated, time).
    control = dbt.db_connect_col('fed', 'control_com')
    treat = dbt.db_connect_col('fed', 'treat_com')
    # Only users with both prior and post LIWC word counts
    control_user = iot.get_values_one_field('fed', 'control_com', 'id', {'prior_liwc.result.WC':{'$exists': True},
                                                                'post_liwc.result.WC':{'$exists': True}})
    treat_user = iot.get_values_one_field('fed', 'treat_com', 'id', {'prior_liwc.result.WC':{'$exists': True},
                                                                'post_liwc.result.WC':{'$exists': True}})
    data = []
    fields = iot.read_fields()
    prefix = ['prior_liwc', 'post_liwc']
    for i in xrange(2):  # i: 0 = control group, 1 = treatment group
        uids = [control_user, treat_user][i]
        for uid in uids:
            user = [control, treat][i].find_one({'id': uid})
            for j in xrange(2):  # j: 0 = prior period, 1 = post period
                # Swap the generic 'liwc_anal' prefix for the period-specific one
                fields_new = ['id_str']+[field.replace('liwc_anal', prefix[j]) for field in fields]
                values = iot.get_fields_one_doc(user, fields_new)
                data.append(values+[i, j])

    df = pd.DataFrame(data, columns=['id']+[field.split('.')[-1] for field in fields]+['treated', 'time'])

    df.to_csv('treatment.csv')
Ejemplo n.º 39
0
def image_color_compare():
    # Compare dominant profile-banner colors across the ED, random and
    # younger datasets: fetch banner URLs, extract image colors, bucket them
    # against standard colors (LAB space), and plot per-dataset color bars.
    ed_urls = io.get_values_one_field(
        "fed", "com", "profile_banner_url", {"level": 1, "profile_banner_url": {"$exists": True}}
    )
    rd_urls = io.get_values_one_field(
        "random", "com", "profile_banner_url", {"level": 1, "profile_banner_url": {"$exists": True}}
    )
    yg_urls = io.get_values_one_field(
        "young", "com", "profile_banner_url", {"level": 1, "profile_banner_url": {"$exists": True}}
    )

    # Cache the URL lists so reruns can skip the DB queries (see the
    # commented loads below)
    pickle.dump(ed_urls, open("data/edimage.pick", "wb"))
    pickle.dump(rd_urls, open("data/rdimage.pick", "wb"))
    pickle.dump(yg_urls, open("data/ygimage.pick", "wb"))
    standers, rgbstan = color_standers()

    # ed_urls = pickle.load(open("data/edimage.pick", "rb"))
    # rd_urls = pickle.load(open("data/rdimage.pick", "rb"))
    # yg_urls = pickle.load(open("data/ygimage.pick", "rb"))

    ed_cs = get_image_color(ed_urls)
    pickle.dump(ed_cs, open("data/edics.pick", "wb"))
    ed_cs = pickle.load(open("data/edics.pick", "rb"))
    edi = cate_color(ed_cs, standers, "lab")
    plot.color_bars(rgbstan, edi)

    rd_cs = get_image_color(rd_urls)
    pickle.dump(rd_cs, open("data/rdics.pick", "wb"))
    rd_cs = pickle.load(open("data/rdics.pick", "rb"))
    rdi = cate_color(rd_cs, standers, "lab")
    plot.color_bars(rgbstan, rdi)

    # NOTE(review): unlike ed_cs/rd_cs, yg_cs is not round-tripped through its
    # pickle before use -- likely harmless, but confirm it was intentional.
    yg_cs = get_image_color(yg_urls)
    pickle.dump(yg_cs, open("data/ygics.pick", "wb"))
    ygi = cate_color(yg_cs, standers, "lab")
    plot.color_bars(rgbstan, ygi)
Ejemplo n.º 40
0
def copy_net(dbname, comname, netname):
    # Move networks among two-level users in net2
    # Moves follow edges whose endpoints are both among level-1/level-2 users
    # (excluding level2->level2 pairs) from `netname` into 'net', deleting
    # them from the source as they are copied.
    net = dbt.db_connect_col(dbname, netname)
    # NOTE(review): target collection name 'net' is hard-coded while the
    # source is parameterized -- confirm that asymmetry is intended.
    netn = dbt.db_connect_col(dbname, 'net')
    # netn.create_index([("user", pymongo.ASCENDING),
    #              ("follower", pymongo.ASCENDING),
    #              ("type", pymongo.ASCENDING)],
    #             unique=True)
    eduset_list = set(
        iot.get_values_one_field(dbname, comname, 'id', {'level': 1}))
    oneuser_list = set(
        iot.get_values_one_field(dbname, comname, 'id', {'level': 2}))
    print(len(eduset_list))
    for row in net.find(no_cursor_timeout=True):
        uid = row['user']
        fid = row['follower']
        # Copy edge if it links core<->level2 or core<->core users
        if (uid in eduset_list and fid in oneuser_list) \
                or (uid in oneuser_list and fid in eduset_list) \
                or (uid in eduset_list and fid in eduset_list):
            try:
                netn.insert(row)
                net.delete_one({'_id': row['_id']})
            except pymongo.errors.DuplicateKeyError:
                # Edge already present in the target; leave source row in place
                pass
Ejemplo n.º 41
0
def users_with_collected_friends(dbname, comname, netname):
    # get network from random and younger datasets
    # Count how many level-1 users appear in the saved graphml with at least
    # one edge; prints node count and the overlap size.
    users = iot.get_values_one_field(dbname, comname, 'id', {'level':1})
    # net = gt.load_network_subset(dbname, netname, {
    #     'user': {'$in': users}, 'follower': {'$in': users}
    # })
    # net.write_graphml(dbname+'-net.graphml')

    g = gt.Graph.Read_GraphML(dbname+'-net.graphml')
    gt.summary(g)
    # NOTE(review): attribute is named 'outk' but is filled with indegree --
    # confirm which degree direction was intended.
    g.vs['outk'] = g.indegree()
    nodes = g.vs.select(outk_gt=0)
    print len(nodes)
    user_ids = [int(v['name']) for v in nodes]
    print len(set(users).intersection(set(user_ids)))
Ejemplo n.º 42
0
def communtiy_feature(dbname, typename):
    # Detect communities in the <typename> network via fast greedy modularity,
    # then plot per-community LIWC feature distributions for large communities.
    fg = ntt.loadnet(dbname, typename)

    fcoms = gt.fast_community(fg)
    # Cache the dendrogram, then immediately reload it (keeps reruns cheap)
    pickle.dump(fcoms, open('data/'+dbname+typename+'com.pick', 'w'))
    fcoms = pickle.load(open('data/'+dbname+typename+'com.pick', 'r'))
    fclus = fcoms.as_clustering()
    gt.summary(fclus)

    """Compare difference of features in cummunities"""
    features = [
        'liwc_anal.result.i',
        'liwc_anal.result.we',
        'liwc_anal.result.bio',
        'liwc_anal.result.body',
        'liwc_anal.result.health',
        'liwc_anal.result.posemo',
        'liwc_anal.result.negemo',
        'liwc_anal.result.ingest',
        'liwc_anal.result.anx',
        'liwc_anal.result.anger',
        'liwc_anal.result.sad'
                ]
    # Only communities holding more than 10% of the nodes are plotted
    therh = 0.1 * fg.vcount()
    for feature in features:
        data = []
        for clu in fclus:
            if len(clu) > therh:
                ulist = set()
                for v in clu:
                    ulist.add(int(fg.vs[v]['name']))
                ulist = list(ulist)
                clu_values = iot.get_values_one_field(dbname, 'com', feature, {'id': {'$in': ulist}})
                data.append(clu_values)

        plot.plot_config()
        # Label each curve with its community index and member count
        for i in xrange(len(data)):
            sns.distplot(data[i], hist=False, label=str(i)+':'+str(len(data[i])))
        plt.xlabel(feature)
        plt.ylabel('PDF')
        # plt.show()
        plt.savefig(feature+typename+'_com.pdf')
        plt.clf()
Ejemplo n.º 43
0
def refine_recovery(dbname, netname):
    '''
    refine the users who have use hashtag #recovery
    :param dbname:
    :param netname:
    :return:
    '''
    network = dbutil.db_connect_col(dbname, netname)
    proed = set(['proed', 'proana', 'promia', 'proanorexia', 'proanamia', 'proanatips', 'proanatip'])
    proedrel = proed
    for link in network.find(no_cursor_timeout=True):
        tags = set(link['tags'])
        if len(proed.intersection(tags)) > 0:
            proedrel = proedrel.union(tags)
    print len(proedrel)
    users = iot.get_values_one_field(dbname, netname, 'id0')
    print len(users)
    for user in users:
        # print user
        utags = set()
        for link in network.find({'id0': user}):
            utags.add(tag for tag in link['tags'])
        if len(utags.intersection(proedrel)) == 0:
            network.delete_many({'id0': user})
Ejemplo n.º 44
0
def hashtag_net(dbname, comname, timename):
    # Build the user-hashtag network from the timeline collection and pickle it.
    # NOTE(review): `userlist` is queried but never used below -- the network
    # is built over the whole timeline; confirm whether a subset filter was
    # intended before removing the query (it is a live DB read).
    userlist = iot.get_values_one_field(dbname, comname, 'id_str',
                                    {'timeline_count': {'$gt': 0}})
    g = gt.load_user_hashtag_network(dbname, timename)
    pickle.dump(g, open('data/'+dbname+'_hashtag.pick', 'w'))
Ejemplo n.º 45
0
def plot_bio(dbname, colname, fields, names):
    """Plot the PDFs of each existing field's values in a single figure (bmi.pdf)."""
    series = [iot.get_values_one_field(dbname, colname, f, {f: {'$exists': True}})
              for f in fields]
    plot.plot_pdf_mul_data(series, 'Age', ['g-', 'b-', 'r-', 'k-'],
                           ['s', 'o', '^', '*'], names,
                           linear_bins=True, central=True, fit=False,
                           fitranges=None, savefile='bmi' + '.pdf')
Ejemplo n.º 46
0
def emotion_dropout_IV_split(dbname1, dbname2, comname1, comname2):
    '''
    Split followees and followers as different variables
    Builds a per-user row of LIWC emotion features and activity profile for
    the user plus three neighbor groups (pure followees `fr_`, pure
    followers `fo_`, reciprocal `co_`), with a dropout indicator ('attr'),
    and writes data-attr-split.csv.
    :param dbname1: snapshot-1 database (features, network)
    :param dbname2: snapshot-2 database (used to detect dropout)
    :param comname1: user collection in dbname1
    :param comname2: user collection in dbname2
    :return: None; writes data-attr-split.csv
    '''
    filter_que = {'level': 1, 'liwc_anal.result.WC':{'$exists': True}}
    user1 = iot.get_values_one_field(dbname1, comname1, 'id', filter_que)
    com1 = dbt.db_connect_col(dbname1, comname1)
    com2 = dbt.db_connect_col(dbname2, comname2)
    fields = ['liwc_anal.result.posemo',
              'liwc_anal.result.negemo',
              'liwc_anal.result.anx',
              'liwc_anal.result.anger',
              'liwc_anal.result.sad']
    prof_names = ['friends_count', 'statuses_count', 'followers_count',
        'friends_day', 'statuses_day', 'followers_day', 'days']
    # Column layout: uid, attr, then u_/fr_/fo_/co_ blocks of
    # (LIWC fields + profile fields [+ num, palive for neighbor groups])
    attr_names = ['uid', 'attr']
    attr_names.extend(['u_'+field.split('.')[-1] for field in fields])
    attr_names.extend(['u_'+field for field in prof_names])
    attr_names.extend(['fr_'+field.split('.')[-1] for field in fields])
    attr_names.extend(['fr_'+field for field in prof_names])
    attr_names.extend(['fr_num', 'fr_palive'])
    attr_names.extend(['fo_'+field.split('.')[-1] for field in fields])
    attr_names.extend(['fo_'+field for field in prof_names])
    attr_names.extend(['fo_num', 'fo_palive'])
    attr_names.extend(['co_'+field.split('.')[-1] for field in fields])
    attr_names.extend(['co_'+field for field in prof_names])
    attr_names.extend(['co_num', 'co_palive'])
    print attr_names
    # Width of one neighbor-group block (LIWC + profile + num + palive)
    attr_length = len(fields) + len(prof_names) + 2
    network1 = gt.load_network(dbname1, 'net')
    data = []
    for uid in user1:
        row = [uid]
        u1 = com1.find_one({'id': uid})
        u2 = com2.find_one({'id': uid})
        # attr = 1 means the user dropped out by snapshot 2
        if u2 is None or u2['timeline_count'] == 0:
            row.append(1)
        else:
            row.append(0)
        uatt = iot.get_fields_one_doc(u1, fields)
        row.extend(uatt)
        row.extend(active_days(u1))
        exist = True
        try:
            v = network1.vs.find(name=str(uid))
        except ValueError:
            exist = False
        if exist:
            print '--------------------user %d---------------' %uid
            # Partition neighbors into pure followees, pure followers and
            # reciprocal ('common') connections
            followees = set([int(network1.vs[v]['name']) for v in network1.successors(str(uid))])
            followers = set([int(network1.vs[v]['name']) for v in network1.predecessors(str(uid))])
            common = followees.intersection(followers)
            followees = followees - common
            followers = followers - common
            for friend_ids in [followees, followers, common]:
                if len(friend_ids) > 0:
                    # friend_ids = [int(network1.vs[v]['name']) for v in friends]
                    print uid in friend_ids
                    print len(friend_ids)
                    fatts = []
                    alive = 0
                    for fid in friend_ids:
                        # Only neighbors with LIWC results and status in snapshot 1
                        fu = com1.find_one({'id': fid, 'liwc_anal.result.WC':{'$exists':True}, 'status':{'$exists':True}})
                        fu2 = com2.find_one({'id': fid})
                        if fu != None:
                            fatt = iot.get_fields_one_doc(fu, fields) # Friends' LIWC
                            fatt.extend(active_days(fu))
                            fatts.append(fatt)
                            # Neighbor counts as alive if still posting in snapshot 2
                            if fu2 is None or fu2['timeline_count'] == 0:
                                alive += 0
                            else:
                                alive += 1
                    if len(fatts) > 0:
                        # Average the group's features into one block of columns
                        fatts = np.array(fatts)
                        fmatts = np.mean(fatts, axis=0)
                        row.extend(fmatts)
                        row.append(len(fatts))
                        paliv = float(alive)/len(fatts)
                        print 'Alive %d %d %.3f' % (alive, len(fatts), paliv)
                        row.append(paliv)
                else:
                    # Empty group: pad with None to keep the column layout
                    row.extend([None] * attr_length)
            # friends = followers # followers
            # if len(friends) > 0:
            #     friend_ids = [int(network1.vs[v]['name']) for v in friends]
            #     print uid in friend_ids
            #     print len(friend_ids)
            #     fatts = []
            #     alive = 0
            #     for fid in friend_ids:
            #         fu = com1.find_one({'id': fid, 'liwc_anal.result.WC':{'$exists':True}, 'status':{'$exists':True}})
            #         fu2 = com2.find_one({'id': fid})
            #         if fu != None:
            #             fatt = iot.get_fields_one_doc(fu, fields)
            #             fatt.extend(active_days(fu))
            #             fatts.append(fatt)
            #             if fu2 is None or fu2['timeline_count'] == 0:
            #                 alive += 0
            #             else:
            #                 alive += 1
            #     if len(fatts) > 0:
            #         fatts = np.array(fatts)
            #         fmatts = np.mean(fatts, axis=0)
            #         row.extend(fmatts)
            #         row.append(len(fatts))
            #         paliv = float(alive)/len(fatts)
            #         print 'Alive %d %d %.3f' % (alive, len(fatts), paliv)
            #         row.append(paliv)
        # print row
        data.append(row)
    df = pd.DataFrame(data, columns=attr_names)
    df.to_csv('data-attr-split.csv', index = False)
Ejemplo n.º 47
0
def emotion_dropout_IV_combine(dbname1, dbname2, comname1, comname2):
    '''
    Combine followees and follower together as variables
    Builds a per-user row of LIWC emotion features and activity profile for
    the user plus all neighbors combined (`f_` block), with a dropout
    indicator ('attr'), and writes data-attr-combine.csv.
    :param dbname1: snapshot-1 database (features, network)
    :param dbname2: snapshot-2 database (used to detect dropout)
    :param comname1: user collection in dbname1
    :param comname2: user collection in dbname2
    :return: None; writes data-attr-combine.csv
    '''
    filter_que = {'level': 1, 'liwc_anal.result.WC':{'$exists': True}}
    user1 = iot.get_values_one_field(dbname1, comname1, 'id', filter_que)
    com1 = dbt.db_connect_col(dbname1, comname1)
    com2 = dbt.db_connect_col(dbname2, comname2)
    fields = ['liwc_anal.result.posemo',
              'liwc_anal.result.negemo',
              'liwc_anal.result.anx',
              'liwc_anal.result.anger',
              'liwc_anal.result.sad']
    prof_names = ['friends_count', 'statuses_count', 'followers_count',
        'friends_day', 'statuses_day', 'followers_day', 'days']
    # Column layout: uid, attr, user block (u_), then combined friends block (f_)
    attr_names = ['uid', 'attr']
    attr_names.extend(['u_'+field.split('.')[-1] for field in fields])
    attr_names.extend(['u_'+field for field in prof_names])
    attr_names.extend(['f_'+field.split('.')[-1] for field in fields])
    attr_names.extend(['f_'+field for field in prof_names])
    attr_names.extend(['f_num', 'f_palive'])
    print attr_names
    network1 = gt.load_network(dbname1, 'net')
    data = []
    for uid in user1:
        row = [uid]
        u1 = com1.find_one({'id': uid})
        u2 = com2.find_one({'id': uid})
        # attr = 1 means the user dropped out by snapshot 2
        if u2 is None or u2['timeline_count'] == 0:
            row.append(1)
        else:
            row.append(0)
        uatt = iot.get_fields_one_doc(u1, fields)
        row.extend(uatt)
        row.extend(active_days(u1))
        exist = True
        try:
            v = network1.vs.find(name=str(uid))
        except ValueError:
            exist = False
        if exist:
            # Neighbors in either direction, combined
            friends = set(network1.neighbors(str(uid))) # id or name
            if len(friends) > 0:
                friend_ids = [int(network1.vs[v]['name']) for v in friends] # return id
                print uid in friend_ids
                print len(friend_ids)
                fatts = []
                alive = 0
                for fid in friend_ids:
                    # Only neighbors with LIWC results and status in snapshot 1
                    fu = com1.find_one({'id': fid, 'liwc_anal.result.WC':{'$exists':True}, 'status':{'$exists':True}})
                    fu2 = com2.find_one({'id': fid})
                    if fu != None:
                        fatt = iot.get_fields_one_doc(fu, fields)
                        fatt.extend(active_days(fu))
                        fatts.append(fatt)
                        # Neighbor counts as alive if still posting in snapshot 2
                        if fu2 is None or fu2['timeline_count'] == 0:
                            alive += 0
                        else:
                            alive += 1
                if len(fatts) > 0:
                    # Average neighbor features into the f_ column block
                    fatts = np.array(fatts)
                    fmatts = np.mean(fatts, axis=0)
                    row.extend(fmatts)
                    row.append(len(fatts))
                    paliv = float(alive)/len(fatts)
                    print 'Alive %d %d %.3f' % (alive, len(fatts), paliv)
                    row.append(paliv)
        # print row
        data.append(row)
    df = pd.DataFrame(data, columns=attr_names)
    df.to_csv('data-attr-combine.csv', index = False)
Ejemplo n.º 48
0
def compare_difference():
    ed_ids = ioutil.get_values_one_field('fed', 'com', 'id', {'level':1})
    rd_ids = ioutil.get_values_one_field('random', 'com', 'id', {'level':1})
    print list(set(ed_ids).intersection(rd_ids))
Ejemplo n.º 49
0
def friendship_community_vis(dbname, colname, filename, ctype):
    '''Out graph for vis.js visualization
    Detects communities on the giant component of the follow network and
    writes a vis.js nodes/edges JavaScript file with per-group coordinates.
    :param dbname: database name
    :param colname: network collection name
    :param filename: infix for the output .js file
    :param ctype: community algorithm: 'ml' (multilevel), 'lp' (label
                  propagation seeded by leading eigenvector), else infomap
    '''
    ed_users = iot.get_values_one_field(dbname, 'scom', 'id')
    # fed_users = iot.get_values_one_field(dbname, 'com', 'id')
    dbcom = dbt.db_connect_col(dbname, 'com')
    fg = gt.load_network(dbname, colname)
    # fg = gt.load_beh_network_subset(ed_users, dbname, colname, 'retweet')
    gt.net_stat(fg)
    # fg = fg.as_undirected(mode="mutual")
    # gt.net_stat(fg)

    fg = gt.giant_component(fg, 'WEAK')
    gt.net_stat(fg)

    if ctype == 'ml':
        com = fg.community_multilevel(weights='weight', return_levels=False)
    elif ctype == 'lp':
        # Seed label propagation with a two-cluster eigenvector partition
        fgu = fg.as_undirected(combine_edges=sum)
        init = fgu.community_leading_eigenvector(clusters=2, weights='weight')
        print init.membership
        com = fg.community_label_propagation(weights='weight', initial=init.membership)
        print com.membership
    else:
        com = fg.community_infomap(edge_weights='weight', trials=2)
    fg.vs['group'] = com.membership

    # edges = fg.es.select(weight_gt=3)
    # print 'Filtered edges: %d' %len(edges)
    # fg = fg.subgraph_edges(edges)
    # gt.net_stat(fg)

    # fg.vs['degree'] = fg.degree(mode="all")
    # nodes = fg.vs.select(degree_gt=10)
    # fg = fg.subgraph(nodes)
    # gt.net_stat(fg)

    # One random anchor coordinate per community; members are jittered around it
    Coo={}
    for x in fg.vs['group']:
        Coo[x]=(rand.randint(-1000, 1000), rand.randint(-1000, 1000))

    with open('data/' + ctype + '_' +filename+'_net_follow.js', 'w') as fw:
        fw.write('var nodes = [\n')
        for idv, v in enumerate(fg.vs):
            user = dbcom.find_one({'id': int(fg.vs[idv]['name'])})
            # Strip quotes/whitespace so the description is safe inside JS strings
            desc = ' '.join(user['description'].replace('\'', '').replace('\"', '').split())
            fw.write('{id: ' + str(idv+1) + ', '+
                     'label: \'' + user['screen_name'] +'\', ' +
                     'value: ' + str(fg.degree(idv, mode="in")) + ', ' +
                     'title: \'UID: ' + str(fg.vs[idv]['name']) +
                     '<br> Screen Name: ' + user['screen_name'] +
                     '<br> Followers: ' + str(user['followers_count']) +
                     '<br> Followees: ' + str(user['friends_count']) +
                     '<br> Tweets: ' + str(user['statuses_count']) +
                     '<br> Description: ' + str(desc.encode('utf-8')) +
                     '<br> Group: ' + str(fg.vs[idv]['group']) + '\', ' +
                     'x: ' + str(Coo[fg.vs[idv]['group']][0]+rand.randint(0, 300)) + ', ' +
                     'y: ' + str(Coo[fg.vs[idv]['group']][1]+rand.randint(0, 300)) + ', ' +
                     'group: ' + str(fg.vs[idv]['group']) + ', ')
            # if int(fg.vs[idv]['name']) in ed_users:
            #     fw.write('shape: ' + '\'triangle\'')
            # else:
            #     fw.write('shape: ' + '\'circle\'')
            fw.write('}, \n')
        fw.write('];\n var edges = [\n')
        # vis.js edge ids are 1-based to match the node ids written above
        for ide, e in enumerate(fg.es):
            fw.write('{from: ' + str(e.source+1) + ', ' +
                     'to: ' + str(e.target+1) + ', ' +
                     'arrows: ' + '\'to\'' + ', ' +
                     'title: \' Tags: ' + fg.vs[e.source]['name'] + ' ' + fg.vs[e.target]['name'] +
                     '<br> Co-occurrence: ' + str(fg.es[ide]['weight']) + '\', ' +
                     'value: ' + str(fg.es[ide]['weight']) +
                     '},\n') #str(fg.es[ide]['weight'])
        fw.write('];\n')
Ejemplo n.º 50
0
def bahavior_net(dbname, comname, bnetname, btype):
    """Load the <btype> behavior network restricted to users with at least one tweet."""
    active_ids = iot.get_values_one_field(dbname, comname, 'id',
                                          {'timeline_count': {'$gt': 0}})
    return gt.load_beh_network_subset(active_ids, dbname, bnetname, btype)