def friend_user_change(dbname1, dbname2, comname1, comname2):
    """Correlate each user's LIWC-feature change between two snapshots with
    the average change of their friends (followees), one scatter/correlation
    plot per feature.

    :param dbname1: database of the earlier snapshot
    :param dbname2: database of the later snapshot
    :param comname1: profile collection in dbname1
    :param comname2: profile collection in dbname2
    """
    # only core (level-1) users with LIWC results
    filter_que = {'level': 1, 'liwc_anal.result.WC': {'$exists': True}}
    user2 = iot.get_values_one_field(dbname2, comname2, 'id', filter_que)
    fields = ['liwc_anal.result.posemo',
              'liwc_anal.result.negemo',
              'liwc_anal.result.body',
              'liwc_anal.result.ingest']
    network1 = gt.load_network(dbname1, 'net')
    network2 = gt.load_network(dbname2, 'net')
    for field in fields:
        print '-----------------%s----------------' %field
        user_changes, friends_changes = [], []
        for uid in user2:
            user_feature_old = iot.get_values_one_field(dbname1, comname1,
                                                        field, {'id': uid})
            user_feature_new = iot.get_values_one_field(dbname2, comname2,
                                                        field, {'id': uid})
            # debug print when the per-user value lists look inconsistent
            if len(user_feature_old) != len(user_feature_new) and len(user_feature_new) != 1:
                print 'User feature value length %d, %d' %(len(user_feature_old),
                                                           len(user_feature_new))
            user_change = np.mean(user_feature_new) - np.mean(user_feature_old)
            # the user must exist as a vertex in BOTH snapshots; vs.find
            # raises ValueError when the name is absent
            exist = True
            try:
                v = network1.vs.find(name=str(uid))
                v = network2.vs.find(name=str(uid))
            except ValueError:
                exist = False
            if exist:
                friends_old = network1.successors(str(uid))
                friends_new = network2.successors(str(uid))
                old_friend_ids = [int(network1.vs[v]['name']) for v in friends_old]
                new_friend_ids = [int(network2.vs[v]['name']) for v in friends_new]
                if len(old_friend_ids) != len(new_friend_ids):
                    print 'Friend feature value length %d, %d' % (len(old_friend_ids),
                                                                  len(new_friend_ids))
                friends_feature_old = iot.get_values_one_field(
                    dbname1, comname1, field, {'id': {'$in': old_friend_ids}})
                friends_feature_new = iot.get_values_one_field(
                    dbname2, comname2, field, {'id': {'$in': new_friend_ids}})
                friend_change = np.mean(friends_feature_new) - np.mean(friends_feature_old)
                friends_changes.append(friend_change)
                user_changes.append(user_change)
        pltt.correlation(friends_changes, user_changes,
                         r'$\Delta$(F_'+field+')', r'$\Delta$(U_'+field+')',
                         field+'-friend-user.pdf')
def plot_distribution(dbname='fed', comname='scom'):
    """Plot KDE distributions of each LIWC feature for retweeted vs. liked
    tweets and save one PDF per feature."""
    fields = iot.read_fields()
    for field in fields:
        tokens = field.split('.')
        retweet_key = field.replace('liwc_anal', 'retweet_liwc')
        like_key = field.replace('liwc_anal', 'like_liwc')
        retwets = iot.get_values_one_field(dbname, comname, retweet_key)
        likes = iot.get_values_one_field(dbname, comname, like_key)
        pt.plot_config()
        # draw the two KDE curves with identical settings except color/marker
        series = (
            (retwets, {"color": "r", "lw": 2, "marker": 'o'},
             'RT ($\mu=%0.2f \pm %0.2f$)'),
            (likes, {"color": "g", "lw": 2, "marker": 's'},
             'Like ($\mu=%0.2f \pm %0.2f$)'),
        )
        for values, kde_style, label_fmt in series:
            sns.distplot(values, hist=False, kde_kws=kde_style,
                         label=label_fmt % (np.mean(values), np.std(values)))
        plt.legend(loc="best")
        plt.xlabel(tokens[-1])
        plt.ylabel('P')
        plt.savefig('data/' + tokens[-1] + '.pdf', bbox_inches='tight')
        plt.clf()
def profile_feature_stat():  # 'favourites_count'
    """Print LaTeX-formatted summary statistics and KS tests for profile
    count features (following/follower/tweet) of the ED, Random and Younger
    groups, then plot their PDFs with power-law fits."""
    fields = ['friends_count', 'followers_count', 'statuses_count']
    names = ['following', 'follower', 'tweet']
    filter = {}
    # per-field fit ranges, one (min, max) per group (ED, Random, Younger)
    fitranges = [[(200, 100000), (1000, 100000000), (800, 10000000)],
                 [(700, 10000), (800, 10000000), (800, 1000000)],
                 [(800, 100000), (20000, 10000000), (10000, 10000000)]]
    for i in range(len(fields)):
        field = fields[i]
        print '=====================', field
        # +1 so zero counts survive log-scale plotting/fitting
        feds = np.array(io.get_values_one_field('fed', 'scom', field, filter)) + 1
        randoms = np.array(
            io.get_values_one_field('random', 'scom', field, filter)) + 1
        youngs = np.array(
            io.get_values_one_field('young', 'scom', field, filter)) + 1
        comm = statis_util.comm_stat(feds)
        print 'ED & ' + str(comm[0]) + ' & ' + str(comm[1]) \
            + ' & ' + str(comm[2])+ ' & ' + str(comm[3]) + '\\\\'
        comm = statis_util.comm_stat(randoms)
        print 'Random &' + str(comm[0]) + ' & ' + str(comm[1]) \
            + ' & ' + str(comm[2])+ ' & ' + str(comm[3])+ '\\\\'
        comm = statis_util.comm_stat(youngs)
        print 'Younger &' + str(comm[0]) + ' & ' + str(comm[1]) \
            + ' & ' + str(comm[2])+ ' & ' + str(comm[3])+ '\\\\'
        print '\\hline'
        # pairwise z-tests kept for reference:
        # z = statis_util.z_test(randoms, feds)
        # print 'z-test(Random, ED): & $n_1$: ' + str(z[0]) + ' & $n_2$: ' + str(z[1]) \
        #     + ' & z-value: ' + str(z[2])+ ' & p-value: ' + str(z[3])+ '\\\\'
        # z = statis_util.z_test(youngs, feds)
        # print 'z-test(Younger, ED): & $n_1$: ' + str(z[0]) + ' & $n_2$:' + str(z[1]) \
        #     + ' & z-value: ' + str(z[2])+ ' & p-value: ' + str(z[3])+ '\\\\'
        # z = statis_util.z_test(youngs, randoms)
        # print 'z-test(Younger, Random): & $n_1$: ' + str(z[0]) + ' & $n_2$: ' + str(z[1]) \
        #     + ' & z-value: ' + str(z[2])+ ' & p-value: ' + str(z[3])+ '\\\\'
        z = statis_util.ks_test(randoms, feds)
        print 'ks-test(Random, ED): & $n_1$: ' + str(z[0]) + ' & $n_2$: ' + str(z[1]) \
            + ' & ks-value: ' + str(z[2])+ ' & p-value: ' + str(z[3])+ '\\\\'
        z = statis_util.ks_test(youngs, feds)
        print 'ks-test(Younger, ED): & $n_1$: ' + str(z[0]) + ' & $n_2$: ' + str(z[1]) \
            + ' & ks-value: ' + str(z[2])+ ' & p-value: ' + str(z[3])+ '\\\\'
        z = statis_util.ks_test(youngs, randoms)
        print 'ks-test(Younger, Random): & $n_1$: ' + str(z[0]) + ' & $n_2$: ' + str(z[1]) \
            + ' & ks-value: ' + str(z[2])+ ' & p-value: ' + str(z[3])+ '\\\\'
        plot.plot_pdf_mul_data([feds, randoms, youngs], names[i],
                               ['g', 'b', 'r'], ['s', 'o', '^'],
                               ['ED', 'Random', 'Younger'],
                               linear_bins=False, central=False, fit=True,
                               fitranges=fitranges[i], savefile=field + '.pdf')
def profile_feature_dependence(): fields = ['friends_count', 'statuses_count', 'followers_count'] names = ['following', 'tweet', 'follower'] for i in xrange(len(fields)): fi = fields[i] ni = names[i] for j in xrange(i + 1, len(fields)): fj = fields[j] nj = names[j] print '=========================Dependence :', fi, fj plt.rcParams['legend.fontsize'] = 20 plt.rcParams['axes.labelsize'] = 20 ax = plt.gca() i = 0 for db, color, mark, label in [('fed', 'g', 's', 'ED'), ('random', 'b', 'o', 'Random'), ('young', 'r', '^', 'Younger')]: print '++++++++++++++++++++++++++Dependence :', fi, fj, db fivalue = np.array(io.get_values_one_field(db, 'scom', fi)) fjvalue = np.array(io.get_values_one_field(db, 'scom', fj)) fivalue += 1 fjvalue += 1 xmeans, ymeans = plot.mean_bin(fivalue, fjvalue) ax.scatter(xmeans, ymeans, s=50, c=color, marker=mark, label=label) fit_start = min(fivalue) fit_end = max(fivalue) # fit_start = np.percentile(fivalue, 2.5) # fit_end = np.percentile(fivalue, 97.5) xfit, yfit, cof = plot.lr_ls(xmeans, ymeans, fit_start, fit_end) ax.plot(xfit, yfit, c=color, linewidth=2, linestyle='--') ax.annotate(r'$k_y \propto {k_x}^{' + str(round(cof, 2)) + '}$', xy=(xfit[-15], yfit[-15]), xycoords='data', xytext=(28 + (i) * 10, -30 - (i) * 10), textcoords='offset points', fontsize=20, arrowprops=dict(arrowstyle="->")) i += 1 ax.set_xscale("log") ax.set_yscale("log") ax.set_ylabel('k(' + nj + ')') ax.set_xlabel('k(' + ni + ')') ax.set_xlim(xmin=1) ax.set_ylim(ymin=1) handles, labels = ax.get_legend_handles_labels() leg = ax.legend(handles, labels, loc=4) leg.draw_frame(True) plt.savefig(fi + '-' + fj + '.pdf') plt.clf()
def pro_tag_user(): # get users with pro-ed and pro-recovery hashtags proed = set(iot.get_values_one_field('fed', 'proed_tag', 'user.id')) prorec = set(iot.get_values_one_field('fed', 'prorec_tag', 'user.id')) print len(proed), len(prorec), len(proed.intersection(prorec)) print len(proed-prorec), len(prorec-proed) print prorec-proed return ([str(i) for i in proed-prorec], [str(i) for i in prorec-proed], [str(i) for i in proed.intersection(prorec)])
def feature_stat(dumped=False):
    """Compare the distribution of every LIWC field across the ED, Random
    and Younger user groups (only documents where the field exists).

    :param dumped: unused here — NOTE(review): presumably a cache flag in a
        related variant of this function; confirm before removing.
    """
    fields = io.read_fields()
    print len(fields)
    assert isinstance(fields, object)  # always true; leftover sanity check
    for field in fields:
        keys = field.split('.')
        filter = {field: {'$exists': True}}
        eds = io.get_values_one_field('fed', 'scom', field, filter)
        randoms = io.get_values_one_field('random', 'scom', field, filter)
        youngs = io.get_values_one_field('young', 'scom', field, filter)
        # plot/compare the three samples, labeled by the last key token
        compore_distribution(keys[-1], eds, randoms, youngs)
def load_net():
    """Export follow networks to GraphML: the full ED network, and the
    random/younger networks restricted to edges among sampled users."""
    g = gt.load_network('fed', 'net')
    g.write_graphml('ed-net.graphml')
    for dbname, outfile in (('random', 'rd-net.graphml'),
                            ('younger', 'yg-net.graphml')):
        users = iot.get_values_one_field(dbname, 'scom', 'id')
        edge_filter = {'user': {'$in': users}, 'follower': {'$in': users}}
        subnet = gt.load_network_subset(dbname, 'net', edge_filter)
        subnet.write_graphml(outfile)
def overlap(): # overlap between two data core_ed = set(iot.get_values_one_field('fed', 'scom', 'id')) ian_ed = set() with open('uid.txt', 'r') as fo: for line in fo.readlines(): ian_ed.add(int(line.strip())) print len(core_ed), len(ian_ed), len(core_ed.intersection(ian_ed)) fed = set(iot.get_values_one_field('fed', 'com', 'id')) ian_all = set(iot.get_values_one_field('TwitterProAna', 'users', 'id')) print len(fed), len(ian_all), len(fed.intersection(ian_all)) print len(fed), len(ian_ed), len(fed.intersection(ian_ed))
def compare_weights():
    """Compare CW and GW weight distributions for pro-recovery and pro-ED
    users (groups derived from profiles)."""
    prorec, proed = edrelatedcom.rec_proed()  # based on profiles
    for users in [prorec, proed]:
        cw_field = 'text_anal.cw.value'
        cw = iot.get_values_one_field('fed', 'scom', cw_field,
                                      {'id_str': {'$in': users},
                                       cw_field: {'$exists': True}})
        gw_field = 'text_anal.gw.value'
        gw = iot.get_values_one_field('fed', 'scom', gw_field,
                                      {'id_str': {'$in': users},
                                       gw_field: {'$exists': True}})
        sns.distplot(cw, hist=False, label='CW')
        sns.distplot(gw, hist=False, label='GW')
        plt.show()
def hashtag_users():
    """Copy profiles of users who posted pro-ED or pro-recovery hashtags
    into fed.tag_com (unique on id; duplicates silently skipped)."""
    com = dbt.db_connect_col('fed', 'com')
    times_ped = list(set(iot.get_values_one_field('fed', 'proed_tag', 'user.id')))
    times_rec = list(set(iot.get_values_one_field('fed', 'prorec_tag', 'user.id')))
    newtime = dbt.db_connect_col('fed', 'tag_com')
    newtime.create_index([('id', pymongo.ASCENDING)], unique=True)
    # a user can appear in both lists; the unique index deduplicates
    for uid in times_ped + times_rec:
        user = com.find_one({'id': uid})
        try:
            newtime.insert(user)
        except pymongo.errors.DuplicateKeyError:
            pass
def distribution_change(dbname, colname):
    """Compare LIWC feature distributions between pro-recovery and pro-ED
    user groups (ids loaded from pickles), print per-feature mean/std and
    KS-test results, and draw grouped boxplots."""
    rec_users1 = pickle.load(open('data/pro-recovery.pick', 'r'))
    pro_ed = pickle.load(open('data/pro_ed.pick', 'r'))
    print len(rec_users1)
    print len(pro_ed)
    features = [
        'liwc_anal.result.i', 'liwc_anal.result.we', 'liwc_anal.result.bio',
        'liwc_anal.result.body', 'liwc_anal.result.health',
        'liwc_anal.result.posemo', 'liwc_anal.result.negemo',
        'liwc_anal.result.ingest', 'liwc_anal.result.anx',
        'liwc_anal.result.anger', 'liwc_anal.result.sad'
    ]
    names = ['I', 'We', 'Bio', 'Body', 'Health', 'Posemo', 'Negemo',
             'Ingest', 'Anx', 'Anger', 'Sad']
    df = pd.DataFrame()
    pltt.plot_config()
    for i in xrange(len(features)):
        feature = features[i]
        old_values = iot.get_values_one_field(dbname, colname, feature,
                                              {'id': {'$in': rec_users1}})
        df1 = pd.DataFrame({'Feature': names[i], 'Group': 'Pro-Recovery',
                            'Values': old_values})
        new_values = iot.get_values_one_field(dbname, colname, feature,
                                              {'id': {'$in': pro_ed}})
        df2 = pd.DataFrame({'Feature': names[i], 'Group': 'Pro-ED',
                            'Values': new_values})
        df1 = df1.append(df2)
        # accumulate the long-format frame for the boxplot below
        if len(df) == 0:
            df = df1
        else:
            df = df.append(df1)
        '''Plot Individual'''
        # sns.distplot(old_values, hist=False, label='Before')
        # sns.distplot(new_values, hist=False, label='After')
        d, p = stats.ks_2samp(old_values, new_values)
        # mean(std) per group plus KS statistic and p-value
        print (names[i] + ', %.3f(%.3f), %.3f(%.3f), %.3f(%.3f)'
               %((np.mean(old_values)), (np.std(old_values)),
                 (np.mean(new_values)), (np.std(new_values)), d, p))
        # plt.xlabel(feature)
        # plt.ylabel('PDF')
        # # plt.show()
        # plt.savefig(dbname+'_'+feature+'_time.pdf')
        # plt.clf()
    sns.set(style="whitegrid", palette="pastel", color_codes=True)
    # split-violin alternative kept for reference:
    # sns.violinplot(x="Feature", y="Values", hue="Time", data=df, split=True,
    #                inner="quart", palette={"Before": "b", "After": "y"})
    # sns.despine(left=True)
    sns.boxplot(x="Feature", y="Values", hue="Group", data=df, palette="PRGn")
    sns.despine(offset=10, trim=True)
    plt.show()
def profile_feature_stat():  # 'favourites_count'
    """Duplicate of the earlier profile_feature_stat: print LaTeX-formatted
    summary statistics and KS tests for profile count features of the ED,
    Random and Younger groups, then plot their PDFs with power-law fits."""
    fields = ['friends_count', 'followers_count', 'statuses_count']
    names = ['following', 'follower', 'tweet']
    filter = {}
    # per-field fit ranges, one (min, max) per group (ED, Random, Younger)
    fitranges = [[(200, 100000), (1000, 100000000), (800, 10000000)],
                 [(700, 10000), (800, 10000000), (800, 1000000)],
                 [(800, 100000), (20000, 10000000), (10000, 10000000)]]
    for i in range(len(fields)):
        field = fields[i]
        print '=====================', field
        # +1 so zero counts survive log-scale plotting/fitting
        feds = np.array(io.get_values_one_field('fed', 'scom', field, filter))+1
        randoms = np.array(io.get_values_one_field('random', 'scom', field, filter))+1
        youngs = np.array(io.get_values_one_field('young', 'scom', field, filter))+1
        comm = statis_util.comm_stat(feds)
        print 'ED & ' + str(comm[0]) + ' & ' + str(comm[1]) \
            + ' & ' + str(comm[2])+ ' & ' + str(comm[3]) + '\\\\'
        comm = statis_util.comm_stat(randoms)
        print 'Random &' + str(comm[0]) + ' & ' + str(comm[1]) \
            + ' & ' + str(comm[2])+ ' & ' + str(comm[3])+ '\\\\'
        comm = statis_util.comm_stat(youngs)
        print 'Younger &' + str(comm[0]) + ' & ' + str(comm[1]) \
            + ' & ' + str(comm[2])+ ' & ' + str(comm[3])+ '\\\\'
        print '\\hline'
        # pairwise z-tests kept for reference:
        # z = statis_util.z_test(randoms, feds)
        # print 'z-test(Random, ED): & $n_1$: ' + str(z[0]) + ' & $n_2$: ' + str(z[1]) \
        #     + ' & z-value: ' + str(z[2])+ ' & p-value: ' + str(z[3])+ '\\\\'
        # z = statis_util.z_test(youngs, feds)
        # print 'z-test(Younger, ED): & $n_1$: ' + str(z[0]) + ' & $n_2$:' + str(z[1]) \
        #     + ' & z-value: ' + str(z[2])+ ' & p-value: ' + str(z[3])+ '\\\\'
        # z = statis_util.z_test(youngs, randoms)
        # print 'z-test(Younger, Random): & $n_1$: ' + str(z[0]) + ' & $n_2$: ' + str(z[1]) \
        #     + ' & z-value: ' + str(z[2])+ ' & p-value: ' + str(z[3])+ '\\\\'
        z = statis_util.ks_test(randoms, feds)
        print 'ks-test(Random, ED): & $n_1$: ' + str(z[0]) + ' & $n_2$: ' + str(z[1]) \
            + ' & ks-value: ' + str(z[2])+ ' & p-value: ' + str(z[3])+ '\\\\'
        z = statis_util.ks_test(youngs, feds)
        print 'ks-test(Younger, ED): & $n_1$: ' + str(z[0]) + ' & $n_2$: ' + str(z[1]) \
            + ' & ks-value: ' + str(z[2])+ ' & p-value: ' + str(z[3])+ '\\\\'
        z = statis_util.ks_test(youngs, randoms)
        print 'ks-test(Younger, Random): & $n_1$: ' + str(z[0]) + ' & $n_2$: ' + str(z[1]) \
            + ' & ks-value: ' + str(z[2])+ ' & p-value: ' + str(z[3])+ '\\\\'
        plot.plot_pdf_mul_data([feds, randoms, youngs], names[i],
                               ['g', 'b', 'r'], ['s', 'o', '^'],
                               ['ED', 'Random', 'Younger'],
                               linear_bins=False, central=False, fit=True,
                               fitranges=fitranges[i], savefile=field+'.pdf')
def process_tweet(dbname, comname, timename, label, filename):
    """Build a fastText-style labeled corpus from user timelines.

    For each English user with a non-empty timeline, concatenate processed
    original tweets (retweets and quoted tweets skipped); users with more
    than 50 tokens are printed as '__label__<label> , <text>' lines and
    their ids pickled to data/<filename>.pick.
    """
    db = dbt.db_connect_no_auth(dbname)
    times = db[timename]
    user_list = iot.get_values_one_field(dbname, comname, 'id', {
        "timeline_count": {
            '$gt': 0
        },
        'lang': 'en'
    })
    target_users = []
    for user in user_list:
        context = ''
        # iterate tweets in chronological order (ascending id)
        for time in times.find({'user.id': user}).sort([('id', 1)]):
            # print time['created_at']
            if 'retweeted_status' in time:
                continue
            elif 'quoted_status' in time:
                continue
            else:
                text = process(time['text'])
                if text:
                    # print user, time['id'], text, '<-------', time['text']
                    context += text + ' '
                else:
                    continue
                    # print user, time['id'], 'None', '<-------', time['text']
        # only keep users with enough content (> 50 tokens)
        if len(context.split()) > 50:
            target_users.append(user)
            print '__label__' + label + ' , ' + context
    pickle.dump(target_users, open('data/' + filename + '.pick', 'w'))
def re_snowball_friends(olddbname, oldcomname, newdbname, newcomname):
    """Re-crawl the friend network of core (level-1) ED users into a new db.

    Sets up unique/compound indexes, looks up core profiles in batches of
    100, then alternates snowballing followings and followers level by
    level until neither produces new users.
    """
    newdb = dbt.db_connect_no_auth(newdbname)
    newcom = newdb[newcomname]
    newnet = newdb['net']
    newcom.create_index("id", unique=True)
    newcom.create_index([('level', pymongo.ASCENDING),
                         ('following_prelevel_node', pymongo.ASCENDING)],
                        unique=False)
    newcom.create_index([('level', pymongo.ASCENDING),
                         ('follower_prelevel_node', pymongo.ASCENDING)],
                        unique=False)
    newnet.create_index([("user", pymongo.ASCENDING),
                         ("follower", pymongo.ASCENDING)],
                        unique=True)
    '''Reteive ED core users'''
    ed_users = iot.get_values_one_field(olddbname, oldcomname, 'id', {'level': 1})
    list_size = len(ed_users)
    # Twitter's user-lookup endpoint accepts at most 100 ids per call
    length = int(math.ceil(list_size/100.0))
    for index in xrange(length):
        index_begin = index*100
        index_end = min(list_size, index_begin+100)
        lookup.lookup_user_list(ed_users[index_begin:index_end], newcom, 1, 'N')
    level = 1
    while True:
        # Each call of snowball_following and snowball_follower only process up to 200 users
        print datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S"), \
            'Snowball followings of seeds for sample db', level
        following_flag = following.snowball_following(newcom, newnet, level, 'N')
        print datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S"), \
            'Snowball followees of seeds for sample db', level
        follower_flag = follower.snowball_follower(newcom, newnet, level, 'N')
        # stop once both directions report no more work
        if following_flag == False and follower_flag == False:
            break
        else:
            continue
def data_4_opinionfinder(dbname, comname, timename, outpath, filter=None):
    """Dump each user's cleaned tweets into per-user .data files for
    OpinionFinder, plus a .doclist index file.

    Tweets are stripped of RT markers, @mentions, #hashtags and URLs; each
    document is terminated with sentence punctuation.

    Fix: the default for ``filter`` was a mutable ``{}`` shared across
    calls; replaced with the ``None`` sentinel (behavior unchanged).

    :param filter: optional Mongo query restricting which users to export.
    """
    if filter is None:
        filter = {}
    db = dbt.db_connect_no_auth(dbname)
    time = db[timename]
    rtgrex = re.compile(r'RT (?<=^|(?<=[^a-zA-Z0-9-\.]))@([A-Za-z0-9_]+):')  # for Retweet
    mgrex = re.compile(r'(?<=^|(?<=[^a-zA-Z0-9-\.]))@([A-Za-z0-9_]+)')  # for mention
    hgrex = re.compile(r'(?<=^|(?<=[^a-zA-Z0-9]))#([A-Za-z0-9_]+)')  # for hashtags
    ugrex = re.compile(r'(https?://[^\s]+)')  # for url
    users = io.get_values_one_field(dbname, comname, 'id_str', filter)
    userlist = list()
    for user in users:
        documents = list()
        for tweet in time.find({'user.id': int(user)}):
            text = tweet['text'].encode('utf8')
            # replace RT, @, # and Http://
            text = rtgrex.sub('', text)
            text = mgrex.sub('', text)
            text = hgrex.sub('', text)
            text = ugrex.sub('', text)
            text = text.strip()
            # OpinionFinder expects sentence-terminated documents
            if not(text.endswith('.') or text.endswith('?') or text.endswith('!')):
                text += '.'
            words = text.split()
            if len(words) > 0:
                documents.append(' '.join(words))
        if len(documents) > 0:
            with open(outpath+'/'+user+'.data', 'w') as fo:
                for document in documents:
                    fo.write(document+'\t\n')
            userlist.append(user)
    # index file listing every exported document
    with open(outpath+'.doclist', 'w') as fo:
        for user in userlist:
            fo.write('database/'+outpath+'/'+ user+'.data\n')
def avg_liwc(dbname):
    """Plot per-period mean/std error bars plus violin plots of each LIWC
    feature over five timeline slices; one PDF per feature."""
    fields = iot.read_fields()
    for field in fields:
        filters = {field: {'$exists': True}}
        results = list()
        N = 5  # five time-sliced collections: <2012, 2012, 2013, 2014, >2014
        for i in range(1, N+1):
            result = iot.get_values_one_field(dbname, dbname+'com_t'+str(i),
                                              field, filters)
            result = central_values(result)
            results.append(result)
        ax = plt.gca()
        ind = [y+1 for y in range(len(results))]
        means = [np.mean(result) for result in results]
        stds = [np.std(result) for result in results]
        ax.errorbar(ind, means, stds, fmt='--o--', capthick=3)
        ax.violinplot(results, showmeans=False, showextrema=True)
        ax.set_xticks(ind)
        # inline value labels kept for reference:
        # for i in ind:
        #     ax.text(i, means[i-1]+0.5,
        #             str(round(means[i-1], 2))+ '$\pm$'+ str(round(stds[i-1], 2)),
        #             ha='center', va='bottom', )
        ax.set_xticklabels(('Before 2012', '2012', '2013', '2014', 'After 2014'))
        ax.set_xlabel('Time Series')
        tokens = field.split('.')
        # '...<name>.value' fields are labeled by the upper-cased parent key
        if tokens[-1] == 'value':
            ax.set_ylabel(tokens[-2].upper())
        else:
            ax.set_ylabel(tokens[-1])
        ax.grid(True)
        plt.savefig('data/'+field+'.pdf')
        plt.clf()
def feature_stat(dumped=False):
    """Compare LIWC feature distributions between manually checked positive
    (depressive) users and the negative control group.

    NOTE(review): same name as an earlier feature_stat variant in this file;
    the later definition wins at import time — confirm which is intended.

    :param dumped: unused here.
    """
    fields = io.read_fields()
    print len(fields)
    assert isinstance(fields, object)  # always true; leftover sanity check
    for field in fields:
        keys = field.split('.')
        # earlier three-group (ED/Random/Younger) comparison kept for reference:
        # filter = {field: {'$exists': True}}
        # eds = io.get_values_one_field('fed', 'scom', field, filter)
        # randoms = io.get_values_one_field('random', 'scom', field, filter)
        # youngs = io.get_values_one_field('young', 'scom', field, filter)
        # compore_distribution(keys[-1], eds, randoms, youngs)
        positive = io.get_values_one_field('depression', 'com', field,
                                           {field: {'$exists': True},
                                            'checked': True})
        negative = io.get_values_one_field('depression', 'neg_com', field,
                                           {field: {'$exists': True}})
        # print len(positive), len(negative)
        compore_distribution(keys[-1], positive, negative)
def network_analysis(): # output network among depression users # user1 = iot.get_values_one_field('depression', 'users1', 'id') # user2 = iot.get_values_one_field('depression', 'users2', 'id') # print len(user1), len(user2) # alluser = user1 + user2 alluser = iot.get_values_one_field('depression', 'depressive', 'id') follow_net = gt.load_network_subset('depression', 'net', { 'user': { '$in': alluser }, 'follower': { '$in': alluser } }) gt.net_stat(follow_net) follow_net.write_graphml('data/follow_net.graphml') for beh in ['retweet', 'communication']: print beh bnetwork = gt.load_beh_network_subset(userlist=alluser, db_name='depression', collection='bnet', btype=beh) gt.net_stat(bnetwork) bnetwork.write_graphml('data/' + beh + '_net.graphml')
def refine_recovery(dbname, netname): ''' refine the users who have use hashtag #recovery :param dbname: :param netname: :return: ''' network = dbutil.db_connect_col(dbname, netname) proed = set([ 'proed', 'proana', 'promia', 'proanorexia', 'proanamia', 'proanatips', 'proanatip' ]) proedrel = proed for link in network.find(no_cursor_timeout=True): tags = set(link['tags']) if len(proed.intersection(tags)) > 0: proedrel = proedrel.union(tags) print len(proedrel) users = iot.get_values_one_field(dbname, netname, 'id0') print len(users) for user in users: # print user utags = set() for link in network.find({'id0': user}): utags.add(tag for tag in link['tags']) if len(utags.intersection(proedrel)) == 0: network.delete_many({'id0': user})
def user_hashtag_profile(dbname, hash_com):
    """Map each user's hashtags onto hashtag-network communities.

    Builds a <community: proportion> vector per user from the hashtags in
    their timeline and pickles the resulting {uid: vector} dict to
    data/user-hash-profile.pick.

    :param dbname: database name
    :param hash_com: dict mapping normalized hashtag -> community id
    """
    ed_users = iot.get_values_one_field(dbname, 'scom', 'id')
    db = dbt.db_connect_no_auth(dbname)
    com_length = len(set(hash_com.values()))
    times = db['timeline']
    user_hash_profile = {}
    for uid in ed_users:
        counter = {}
        for tweet in times.find({'user.id': uid,
                                 '$where': 'this.entities.hashtags.length>0'}):
            hashtags = tweet['entities']['hashtags']
            hash_set = set()
            # normalize: lowercase, strip '_' and '-'; set() dedupes per tweet
            for hash in hashtags:
                hash_set.add(hash['text'].encode('utf-8').lower()
                             .replace('_', '').replace('-', ''))
            hash_list = list(hash_set)
            for hash in hash_list:
                v = counter.get(hash, 0)
                counter[hash] = v+1
        vector = [0.0]*com_length
        for hash in counter:
            if hash in hash_com:
                comid = hash_com[hash]
                vector[comid] += counter[hash]
        # guard against division by zero when no tag maps to a community
        if sum(vector) == 0:
            user_hash_profile[uid] = np.array(vector)
        else:
            user_hash_profile[uid] = np.array(vector)/sum(vector)
    pickle.dump(user_hash_profile, open('data/user-hash-profile.pick', 'w'))
def ed_follow_net():
    """Export the ED + followee follow network, and the retweet/reply/
    mention behavior networks restricted to the same node set, to GraphML."""
    # construct ED and their followee network
    g = gt.load_network('fed', 'follownet')
    g.vs['deg'] = g.indegree()
    users = set(iot.get_values_one_field('fed', 'scom', 'id'))
    nodes = []
    # keep core ED users, plus non-ED accounts followed by more than 5 EDs
    for v in g.vs:
        if int(v['name']) in users:
            nodes.append(v)
        elif v['deg'] > 5:
            nodes.append(v)
        else:
            pass
    print 'Filtered nodes: %d' %len(nodes)
    g = g.subgraph(nodes)
    gt.summary(g)
    g.write_graphml('ed-friend'+'.graphml')
    # sbnet have extended all interactions posted by ED users
    edusers = set(g.vs['name'])
    for btype in ['retweet', 'reply', 'mention']:
        gb = gt.load_beh_network('fed', 'sbnet', btype)
        gt.summary(gb)
        nodes = []
        # restrict each behavior network to nodes of the follow network
        for v in gb.vs:
            if v['name'] in edusers:
                nodes.append(v)
        gb = gb.subgraph(nodes)
        gt.summary(gb)
        gb.write_graphml('ed-'+btype+'-follow.graphml')
def label_ed_recovery(hash_com, com_size, idx=[18, 102]):
    """Flag pro-recovery users whose share of hashtags in the target
    communities exceeds those communities' global share.

    :param hash_com: dict mapping normalized hashtag -> community id
    :param com_size: dict mapping community id -> community size
    :param idx: target (ED-related) community ids.  NOTE(review): mutable
        default argument, but it is only read, never mutated, so benign.
    """
    # select users in prorec that have more ed-related hashtags
    times = dbt.db_connect_col('fed', 'prorec_tag')
    com = dbt.db_connect_col('fed', 'tag_com')
    # global fraction of hashtag usage falling into the target communities
    threshold = float(sum([com_size[i] for i in idx])) / sum(com_size.values())
    print 'threshold: ', threshold
    users = list(set(iot.get_values_one_field('fed', 'prorec_tag', 'user.id')))
    for uid in users:
        taget_count, all_count = 0.0, 0.0
        for tweet in times.find({'user.id': uid}):
            hashtags = tweet['entities']['hashtags']
            hash_set = set()
            for hash in hashtags:
                # need no .encode('utf-8')
                hash_set.add(hash['text'].encode('utf-8').lower().replace(
                    '_', '').replace('-', ''))
            for tag in hash_set:
                com_id = hash_com.get(tag, -1)
                if com_id > -1:
                    all_count += 1
                    if com_id in idx:
                        taget_count += 1
        # mark users whose target-community share beats the global share
        if all_count and taget_count / all_count > threshold:
            com.update({'id': uid}, {'$set': {
                'rec_tageted': True
            }}, upsert=False)
def hashtag_users_label_proed():
    """Mark every user who posted a pro-ED hashtag as selected in
    fed.tag_com (sets ped_tageted=True; no upsert)."""
    com = dbt.db_connect_col('fed', 'tag_com')
    proed_uids = set(iot.get_values_one_field('fed', 'proed_tag', 'user.id'))
    for uid in proed_uids:
        com.update({'id': uid}, {'$set': {'ped_tageted': True}}, upsert=False)
def avg_liwc(dbname):
    """Duplicate of the earlier avg_liwc: plot per-period mean/std error
    bars plus violin plots of each LIWC feature over five timeline slices;
    one PDF per feature."""
    fields = iot.read_fields()
    for field in fields:
        filters = {field: {'$exists': True}}
        results = list()
        N = 5  # five time-sliced collections: <2012, 2012, 2013, 2014, >2014
        for i in range(1, N + 1):
            result = iot.get_values_one_field(dbname, dbname + 'com_t' + str(i),
                                              field, filters)
            result = central_values(result)
            results.append(result)
        ax = plt.gca()
        ind = [y + 1 for y in range(len(results))]
        means = [np.mean(result) for result in results]
        stds = [np.std(result) for result in results]
        ax.errorbar(ind, means, stds, fmt='--o--', capthick=3)
        ax.violinplot(results, showmeans=False, showextrema=True)
        ax.set_xticks(ind)
        # inline value labels kept for reference:
        # for i in ind:
        #     ax.text(i, means[i-1]+0.5,
        #             str(round(means[i-1], 2))+ '$\pm$'+ str(round(stds[i-1], 2)),
        #             ha='center', va='bottom', )
        ax.set_xticklabels(
            ('Before 2012', '2012', '2013', '2014', 'After 2014'))
        ax.set_xlabel('Time Series')
        tokens = field.split('.')
        # '...<name>.value' fields are labeled by the upper-cased parent key
        if tokens[-1] == 'value':
            ax.set_ylabel(tokens[-2].upper())
        else:
            ax.set_ylabel(tokens[-1])
        ax.grid(True)
        plt.savefig('data/' + field + '.pdf')
        plt.clf()
def profile_feature_dependence():
    """Duplicate of the earlier profile_feature_dependence: plot pairwise
    dependence between profile count features for ED/Random/Younger groups
    with least-squares power-law fits and exponent annotations."""
    fields = ['friends_count', 'statuses_count', 'followers_count']
    names = ['following', 'tweet', 'follower']
    for i in xrange(len(fields)):
        fi = fields[i]
        ni = names[i]
        for j in xrange(i+1, len(fields)):
            fj = fields[j]
            nj = names[j]
            print '=========================Dependence :', fi, fj
            plt.rcParams['legend.fontsize'] = 20
            plt.rcParams['axes.labelsize'] = 20
            ax = plt.gca()
            # NOTE(review): this reuses (clobbers) the outer loop index `i`
            # as an annotation-offset counter; harmless only because `for`
            # rebinds `i` each iteration and fi/ni were captured above.
            i = 0
            for db, color, mark, label in [('fed', 'g', 's', 'ED'),
                                           ('random', 'b', 'o', 'Random'),
                                           ('young', 'r', '^', 'Younger')]:
                print '++++++++++++++++++++++++++Dependence :', fi, fj, db
                fivalue = np.array(io.get_values_one_field(db, 'scom', fi))
                fjvalue = np.array(io.get_values_one_field(db, 'scom', fj))
                # +1 so zero counts survive the log-log axes below
                fivalue += 1
                fjvalue += 1
                xmeans, ymeans = plot.mean_bin(fivalue, fjvalue)
                ax.scatter(xmeans, ymeans, s=50, c=color, marker=mark,
                           label=label)
                fit_start = min(fivalue)
                fit_end = max(fivalue)
                # fit_start = np.percentile(fivalue, 2.5)
                # fit_end = np.percentile(fivalue, 97.5)
                xfit, yfit, cof = plot.lr_ls(xmeans, ymeans, fit_start, fit_end)
                ax.plot(xfit, yfit, c=color, linewidth=2, linestyle='--')
                ax.annotate(r'$k_y \propto {k_x}^{'+str(round(cof, 2))+'}$',
                            xy=(xfit[-15], yfit[-15]), xycoords='data',
                            xytext=(28+(i)*10, -30-(i)*10),
                            textcoords='offset points', fontsize=20,
                            arrowprops=dict(arrowstyle="->"))
                i += 1
            ax.set_xscale("log")
            ax.set_yscale("log")
            ax.set_ylabel('k('+nj+')')
            ax.set_xlabel('k('+ni+')')
            ax.set_xlim(xmin=1)
            ax.set_ylim(ymin=1)
            handles, labels = ax.get_legend_handles_labels()
            leg = ax.legend(handles, labels, loc=4)
            leg.draw_frame(True)
            plt.savefig(fi+'-'+fj+'.pdf')
            plt.clf()
def compare_distribute(dbname, comname):
    """Boxplot prior- vs post-period LIWC features for users having both
    prior-liwc and post-liwc results."""
    user = iot.get_values_one_field(dbname, comname, 'id',
                                    {'prior-liwc.result.WC': {'$exists': True},
                                     'post-liwc.result.WC': {'$exists': True}})
    print len(user)
    print user
    # LIWC field suffixes; prefixed with 'prior-liwc'/'post-liwc' below
    features = [
        '.result.i', '.result.we', '.result.bio', '.result.body',
        '.result.health', '.result.posemo', '.result.negemo',
        '.result.ingest', '.result.anx', '.result.anger', '.result.sad'
        # '.result.work'
        # '.result.future'
    ]
    names = [
        'I', 'We', 'Bio', 'Body', 'Health', 'Posemo', 'Negemo',
        'Ingest', 'Anx', 'Anger', 'Sad',
        # 'Work',
        # 'Future'
    ]
    df = []
    for i in xrange(len(features)):
        feature = features[i]
        prior_values = iot.get_values_one_field(dbname, comname,
                                                'prior-liwc'+feature,
                                                {'id': {'$in': user}})
        post_values = iot.get_values_one_field(dbname, comname,
                                               'post-liwc'+feature,
                                               {'id': {'$in': user}})
        # per-feature KDE plots kept for reference:
        # sns.kdeplot(np.array(prior_values), label="Prior")
        # sns.kdeplot(np.array(post_values), label="Post")
        # plt.legend()
        # sns.plt.title(feature)
        # plt.show()
        # plt.clf()
        df_prior = pd.DataFrame({'Feature': names[i], 'Group': 'Prior',
                                 'Values': prior_values})
        df_post = pd.DataFrame({'Feature': names[i], 'Group': 'Post',
                                'Values': post_values})
        df.append(df_prior)
        df.append(df_post)
    df = pd.concat(df)
    sns.set(style="whitegrid", palette="pastel", color_codes=True)
    sns.boxplot(x="Feature", y="Values", hue="Group", data=df, palette="PRGn")
    # sns.despine(offset=10, trim=True)
    plt.show()
def diversity_db(dbname, comname, behavior): userlist = iot.get_values_one_field(dbname, comname, 'id_str', {'timeline_count': {'$gt': 0}}) g = bahavior_net(dbname, comname, 'bnet', behavior) # pickle.dump(g, open('data/'+dbname+'_'+behavior+'.pick', 'w')) print dbname, behavior # g = pickle.load(open('data/' + dbname + '_' + behavior + '.pick', 'r')) return netstatis(dbname, behavior, g, userlist)
def control_users():
    """Copy ED users who never posted a recovery tweet into
    fed.control_com (unique on id)."""
    com = dbt.db_connect_col('fed', 'scom')
    recovery_user = set(iot.get_values_one_field('fed', 'recover', 'user.id'))
    control_com = dbt.db_connect_col('fed', 'control_com')
    control_com.create_index("id", unique=True)
    for user in com.find():
        if user['id'] in recovery_user:
            continue
        control_com.insert(user)
def image_color_compare():
    """Fetch profile-banner URLs of level-1 ED/random/young users, extract
    dominant image colors, categorize them against standard colors in Lab
    space, and plot color bars per group.  Intermediate results are pickled
    so later runs can skip the expensive steps."""
    ed_urls = io.get_values_one_field('fed', 'com', 'profile_banner_url', {
        'level': 1,
        'profile_banner_url': {
            '$exists': True
        }
    })
    rd_urls = io.get_values_one_field('random', 'com', 'profile_banner_url', {
        'level': 1,
        'profile_banner_url': {
            '$exists': True
        }
    })
    yg_urls = io.get_values_one_field('young', 'com', 'profile_banner_url', {
        'level': 1,
        'profile_banner_url': {
            '$exists': True
        }
    })
    # cache the url lists
    pickle.dump(ed_urls, open("data/edimage.pick", "wb"))
    pickle.dump(rd_urls, open("data/rdimage.pick", "wb"))
    pickle.dump(yg_urls, open("data/ygimage.pick", "wb"))
    standers, rgbstan = color_standers()
    # reload-from-cache alternative kept for reference:
    # ed_urls = pickle.load(open("data/edimage.pick", "rb"))
    # rd_urls = pickle.load(open("data/rdimage.pick", "rb"))
    # yg_urls = pickle.load(open("data/ygimage.pick", "rb"))
    ed_cs = get_image_color(ed_urls)
    pickle.dump(ed_cs, open("data/edics.pick", "wb"))
    ed_cs = pickle.load(open("data/edics.pick", "rb"))
    edi = cate_color(ed_cs, standers, 'lab')
    plot.color_bars(rgbstan, edi)
    rd_cs = get_image_color(rd_urls)
    pickle.dump(rd_cs, open("data/rdics.pick", "wb"))
    rd_cs = pickle.load(open("data/rdics.pick", "rb"))
    rdi = cate_color(rd_cs, standers, 'lab')
    plot.color_bars(rgbstan, rdi)
    yg_cs = get_image_color(yg_urls)
    pickle.dump(yg_cs, open("data/ygics.pick", "wb"))
    ygi = cate_color(yg_cs, standers, 'lab')
    plot.color_bars(rgbstan, ygi)
def remove_random_users(dbname, comname, netname):
    """Delete all level-3 users and every network edge touching them.

    Fix: membership tests ran against a *list* (O(n) per edge, O(n*m)
    overall); the ids are now collected into a set once, making each test
    O(1) with identical behavior.

    :param dbname: database name
    :param comname: profile collection
    :param netname: follow-edge collection
    """
    com = dbt.db_connect_col(dbname, comname)
    users = set(iot.get_values_one_field(dbname, comname, 'id', {'level': 3}))
    net = dbt.db_connect_col(dbname, netname)
    for row in net.find(no_cursor_timeout=True):
        # drop the edge if either endpoint is a level-3 user
        if row['user'] in users or row['follower'] in users:
            net.delete_one({'_id': row['_id']})
    com.delete_many({'level': 3})
def calculate_extenal_user(): # Calculate how many users have been retweeted by ED but do not exist in ED users users = set(iot.get_values_one_field('fed', 'com', 'id')) print len(users) net = dbt.db_connect_col('fed', 'sbnet') i, count = 0, 0 for record in net.find(): if (record['id0'] not in users) or (record['id1'] not in users): i = +1 count += 1 print i, count, float(i) / count
def ED_followee(): # put all ED's followees in follownet net = dbt.db_connect_col('fed', 'net2') users = set(iot.get_values_one_field('fed', 'scom', 'id')) print len(users) tem = dbt.db_connect_col('fed', 'follownet') for re in net.find(): if re['follower'] in users: try: tem.insert(re) except pymongo.errors.DuplicateKeyError: pass
def diversity_db(dbname, comname, behavior, netname):
    """Compute network statistics of a behavior network over all users.

    NOTE(review): same name as the earlier three-argument diversity_db;
    this four-argument variant wins at import time — confirm intent.
    Ids are converted to str to match graph vertex names.
    """
    userlist = iot.get_values_one_field(dbname, comname, 'id',
                                        # {'timeline_count': {'$gt': 0}}
                                        )
    g = gt.load_beh_network_subset(userlist, dbname, netname, behavior)
    gt.summary(g)
    # cached/alternative network construction kept for reference:
    # g = bahavior_net(dbname, comname, netname, behavior)
    # pickle.dump(g, open('data/'+dbname+'_'+behavior+'.pick', 'w'))
    print dbname, behavior
    # g = pickle.load(open('data/' + dbname + '_' + behavior + '.pick', 'r'))
    return netstatis(dbname, behavior, g, [str(i) for i in userlist], comname)
def ed_tweet_normal_tweet_count(): user_ids = set(iot.get_values_one_field('fed', 'ed_tag', 'user.id')) print len(user_ids) com = dbt.db_connect_col('fed', 'com') tags = dbt.db_connect_col('fed', 'ed_tag') data = [] for uid in user_ids: ed_count = tags.count({'user.id': uid}) all_count = com.find_one({'id': uid})['timeline_count'] data.append([uid, ed_count, all_count]) df = pd.DataFrame(data, columns=['id', 'ed_tweet_count', 'all_tweet_count']) df.to_csv('user-ed-stats.csv')
def filter_user(): prior = dbt.db_connect_col('fed', 'prior_treat') post = dbt.db_connect_col('fed', 'post_treat') com = dbt.db_connect_col('fed', 'scom') treat_com = dbt.db_connect_col('fed', 'treat_com') treat_com.create_index("id", unique=True) prior_user = iot.get_values_one_field('fed', 'prior_treat', 'user.id') post_user = iot.get_values_one_field('fed', 'post_treat', 'user.id') print len(set(prior_user)), len(set(post_user)), len(set(prior_user).intersection(set(post_user))) users = list() propotions = list() for uid in set(prior_user).intersection(set(post_user)): count_prior = prior.count({'user.id': uid}) count_post = post.count({'user.id': uid}) if count_prior > 0 and count_post > 0: users.append(uid) propotions.append(float(count_prior)/(count_prior + count_post)) print len(users) print np.mean(propotions)
def out_data():
    # Export prior/post LIWC features for control and treatment cohorts
    # into long-format 'treatment.csv': one row per user per time period,
    # flagged with `treated` (0 control / 1 treat) and `time` (0 prior / 1 post).
    control = dbt.db_connect_col('fed', 'control_com')
    treat = dbt.db_connect_col('fed', 'treat_com')
    has_liwc = {'prior_liwc.result.WC': {'$exists': True},
                'post_liwc.result.WC': {'$exists': True}}
    control_user = iot.get_values_one_field('fed', 'control_com', 'id', has_liwc)
    treat_user = iot.get_values_one_field('fed', 'treat_com', 'id', has_liwc)
    fields = iot.read_fields()
    prefix = ['prior_liwc', 'post_liwc']
    data = []
    for treated, (col, uids) in enumerate([(control, control_user),
                                           (treat, treat_user)]):
        for uid in uids:
            user = col.find_one({'id': uid})
            for period in xrange(2):
                keys = ['id_str'] + [f.replace('liwc_anal', prefix[period])
                                     for f in fields]
                values = iot.get_fields_one_doc(user, keys)
                data.append(values + [treated, period])
    df = pd.DataFrame(data,
                      columns=['id'] + [f.split('.')[-1] for f in fields]
                              + ['treated', 'time'])
    df.to_csv('treatment.csv')
def image_color_compare():
    # Compare dominant banner-image colors across the ED ('fed'), random
    # and young cohorts: fetch banner URLs, pickle them, extract image
    # colors (also pickled as checkpoints), classify against the reference
    # palette in Lab space, and draw one color-bar plot per cohort.
    banner_filter = {"level": 1, "profile_banner_url": {"$exists": True}}
    cohorts = [("fed", "ed"), ("random", "rd"), ("young", "yg")]
    urls = {}
    for db, tag in cohorts:
        urls[tag] = io.get_values_one_field(db, "com", "profile_banner_url",
                                            banner_filter)
    for _, tag in cohorts:
        pickle.dump(urls[tag], open("data/%simage.pick" % tag, "wb"))
    standers, rgbstan = color_standers()
    # ed_urls = pickle.load(open("data/edimage.pick", "rb"))
    # rd_urls = pickle.load(open("data/rdimage.pick", "rb"))
    # yg_urls = pickle.load(open("data/ygimage.pick", "rb"))
    for _, tag in cohorts:
        colors = get_image_color(urls[tag])
        pickle.dump(colors, open("data/%sics.pick" % tag, "wb"))
        if tag != "yg":
            # original code round-tripped ed/rd results through the pickle
            # checkpoint (but not yg); kept for an identical execution trace
            colors = pickle.load(open("data/%sics.pick" % tag, "rb"))
        categorized = cate_color(colors, standers, "lab")
        plot.color_bars(rgbstan, categorized)
def copy_net(dbname, comname, netname):
    # Move follow edges connecting the two-level user sets out of `netname`
    # into 'net': an edge qualifies when both endpoints are known users and
    # at least one of them is level 1. Qualifying edges are inserted into
    # 'net' and removed from the source; duplicates are skipped.
    src = dbt.db_connect_col(dbname, netname)
    dst = dbt.db_connect_col(dbname, 'net')
    # netn.create_index([("user", pymongo.ASCENDING),
    #                    ("follower", pymongo.ASCENDING),
    #                    ("type", pymongo.ASCENDING)],
    #                   unique=True)
    level1 = set(iot.get_values_one_field(dbname, comname, 'id', {'level': 1}))
    level2 = set(iot.get_values_one_field(dbname, comname, 'id', {'level': 2}))
    print(len(level1))
    for row in src.find(no_cursor_timeout=True):
        u, f = row['user'], row['follower']
        keep = ((u in level1 and f in level2)
                or (u in level2 and f in level1)
                or (u in level1 and f in level1))
        if keep:
            try:
                dst.insert(row)
                src.delete_one({'_id': row['_id']})
            except pymongo.errors.DuplicateKeyError:
                pass
def users_with_collected_friends(dbname, comname, netname): # get network from random and younger datasets users = iot.get_values_one_field(dbname, comname, 'id', {'level':1}) # net = gt.load_network_subset(dbname, netname, { # 'user': {'$in': users}, 'follower': {'$in': users} # }) # net.write_graphml(dbname+'-net.graphml') g = gt.Graph.Read_GraphML(dbname+'-net.graphml') gt.summary(g) g.vs['outk'] = g.indegree() nodes = g.vs.select(outk_gt=0) print len(nodes) user_ids = [int(v['name']) for v in nodes] print len(set(users).intersection(set(user_ids)))
def communtiy_feature(dbname, typename):
    """Detect fast-greedy communities in the network and, for each LIWC
    feature, plot per-community value distributions (communities holding
    more than 10% of all vertices only)."""
    fg = ntt.loadnet(dbname, typename)
    coms = gt.fast_community(fg)
    cache_path = 'data/' + dbname + typename + 'com.pick'
    pickle.dump(coms, open(cache_path, 'w'))
    coms = pickle.load(open(cache_path, 'r'))
    clusters = coms.as_clustering()
    gt.summary(clusters)
    # Features compared across communities.
    features = ['liwc_anal.result.i', 'liwc_anal.result.we',
                'liwc_anal.result.bio', 'liwc_anal.result.body',
                'liwc_anal.result.health', 'liwc_anal.result.posemo',
                'liwc_anal.result.negemo', 'liwc_anal.result.ingest',
                'liwc_anal.result.anx', 'liwc_anal.result.anger',
                'liwc_anal.result.sad']
    min_size = 0.1 * fg.vcount()
    for feature in features:
        data = []
        for clu in clusters:
            if len(clu) > min_size:
                member_ids = list(set(int(fg.vs[v]['name']) for v in clu))
                clu_values = iot.get_values_one_field(
                    dbname, 'com', feature, {'id': {'$in': member_ids}})
                data.append(clu_values)
        plot.plot_config()
        for i in xrange(len(data)):
            sns.distplot(data[i], hist=False,
                         label=str(i) + ':' + str(len(data[i])))
        plt.xlabel(feature)
        plt.ylabel('PDF')
        plt.savefig(feature + typename + '_com.pdf')
        plt.clf()
def refine_recovery(dbname, netname): ''' refine the users who have use hashtag #recovery :param dbname: :param netname: :return: ''' network = dbutil.db_connect_col(dbname, netname) proed = set(['proed', 'proana', 'promia', 'proanorexia', 'proanamia', 'proanatips', 'proanatip']) proedrel = proed for link in network.find(no_cursor_timeout=True): tags = set(link['tags']) if len(proed.intersection(tags)) > 0: proedrel = proedrel.union(tags) print len(proedrel) users = iot.get_values_one_field(dbname, netname, 'id0') print len(users) for user in users: # print user utags = set() for link in network.find({'id0': user}): utags.add(tag for tag in link['tags']) if len(utags.intersection(proedrel)) == 0: network.delete_many({'id0': user})
def hashtag_net(dbname, comname, timename):
    """Build the user-hashtag network from the timeline collection and
    pickle it to data/<dbname>_hashtag.pick.

    :param dbname: database name
    :param comname: unused; retained for signature compatibility with callers
    :param timename: timeline collection name
    """
    # DEAD CODE REMOVED: the original queried active user ids from
    # `comname` into a local that was never used — a wasted DB round-trip.
    g = gt.load_user_hashtag_network(dbname, timename)
    pickle.dump(g, open('data/' + dbname + '_hashtag.pick', 'w'))
def plot_bio(dbname, colname, fields, names):
    # Overlay the PDFs of several numeric profile fields and save to bmi.pdf.
    series = [iot.get_values_one_field(dbname, colname, f, {f: {'$exists': True}})
              for f in fields]
    plot.plot_pdf_mul_data(series, 'Age',
                           ['g-', 'b-', 'r-', 'k-'],
                           ['s', 'o', '^', '*'],
                           names, linear_bins=True, central=True,
                           fit=False, fitranges=None,
                           savefile='bmi' + '.pdf')
def emotion_dropout_IV_split(dbname1, dbname2, comname1, comname2):
    '''
    Split followees and followers as different variables.

    Builds one row per level-1 user of snapshot 1: a dropout flag `attr`
    (1 if the user is gone or silent in snapshot 2), the user's own LIWC
    emotion and profile-activity features, then mean features, count and
    survival proportion for three neighbour groups — followees-only,
    followers-only and mutual ('co') — and writes data-attr-split.csv.

    :param dbname1: database of the earlier snapshot
    :param dbname2: database of the later snapshot
    :param comname1: user collection in dbname1
    :param comname2: user collection in dbname2
    :return: None (writes data-attr-split.csv)
    '''
    # Level-1 users of snapshot 1 with a non-empty LIWC analysis.
    filter_que = {'level': 1, 'liwc_anal.result.WC':{'$exists': True}}
    user1 = iot.get_values_one_field(dbname1, comname1, 'id', filter_que)
    com1 = dbt.db_connect_col(dbname1, comname1)
    com2 = dbt.db_connect_col(dbname2, comname2)
    # Emotion-related LIWC categories used as features.
    fields = ['liwc_anal.result.posemo',
              'liwc_anal.result.negemo',
              'liwc_anal.result.anx',
              'liwc_anal.result.anger',
              'liwc_anal.result.sad']
    # Profile/activity features; presumably produced by active_days() in
    # this order — TODO confirm against its definition.
    prof_names = ['friends_count', 'statuses_count', 'followers_count',
                  'friends_day', 'statuses_day', 'followers_day', 'days']
    # Column names: user block, then fr_ (followees), fo_ (followers),
    # co_ (mutual) blocks, each with per-group size and alive proportion.
    attr_names = ['uid', 'attr']
    attr_names.extend(['u_'+field.split('.')[-1] for field in fields])
    attr_names.extend(['u_'+field for field in prof_names])
    attr_names.extend(['fr_'+field.split('.')[-1] for field in fields])
    attr_names.extend(['fr_'+field for field in prof_names])
    attr_names.extend(['fr_num', 'fr_palive'])
    attr_names.extend(['fo_'+field.split('.')[-1] for field in fields])
    attr_names.extend(['fo_'+field for field in prof_names])
    attr_names.extend(['fo_num', 'fo_palive'])
    attr_names.extend(['co_'+field.split('.')[-1] for field in fields])
    attr_names.extend(['co_'+field for field in prof_names])
    attr_names.extend(['co_num', 'co_palive'])
    print attr_names
    # Width of one neighbour-group block (features + profile + num + palive).
    attr_length = len(fields) + len(prof_names) + 2
    network1 = gt.load_network(dbname1, 'net')
    data = []
    for uid in user1:
        row = [uid]
        u1 = com1.find_one({'id': uid})
        u2 = com2.find_one({'id': uid})
        # attr = 1 marks dropout: absent from snapshot 2 or no tweets there.
        if u2 is None or u2['timeline_count'] == 0:
            row.append(1)
        else:
            row.append(0)
        uatt = iot.get_fields_one_doc(u1, fields)
        row.extend(uatt)
        row.extend(active_days(u1))
        # Users without a vertex in the follow network get only the user block.
        exist = True
        try:
            v = network1.vs.find(name=str(uid))
        except ValueError:
            exist = False
        if exist:
            print '--------------------user %d---------------' %uid
            # Partition the neighbourhood into exclusive followees,
            # exclusive followers and mutual friends.
            followees = set([int(network1.vs[v]['name']) for v in network1.successors(str(uid))])
            followers = set([int(network1.vs[v]['name']) for v in network1.predecessors(str(uid))])
            common = followees.intersection(followers)
            followees = followees - common
            followers = followers - common
            for friend_ids in [followees, followers, common]:
                if len(friend_ids) > 0:
                    # friend_ids = [int(network1.vs[v]['name']) for v in friends]
                    print uid in friend_ids
                    print len(friend_ids)
                    fatts = []
                    alive = 0
                    for fid in friend_ids:
                        # Only neighbours with LIWC results and a last status.
                        fu = com1.find_one({'id': fid, 'liwc_anal.result.WC':{'$exists':True}, 'status':{'$exists':True}})
                        fu2 = com2.find_one({'id': fid})
                        if fu != None:
                            fatt = iot.get_fields_one_doc(fu, fields)  # Friends' LIWC
                            fatt.extend(active_days(fu))
                            fatts.append(fatt)
                            # alive counts neighbours still active in snapshot 2.
                            if fu2 is None or fu2['timeline_count'] == 0:
                                alive += 0
                            else:
                                alive += 1
                    if len(fatts) > 0:
                        fatts = np.array(fatts)
                        # Column-wise mean over this neighbour group.
                        fmatts = np.mean(fatts, axis=0)
                        row.extend(fmatts)
                        row.append(len(fatts))
                        paliv = float(alive)/len(fatts)
                        print 'Alive %d %d %.3f' % (alive, len(fatts), paliv)
                        row.append(paliv)
                    # NOTE(review): when friend_ids is non-empty but no
                    # neighbour passes the filter (fatts empty), nothing is
                    # appended for this group and the row comes up short of
                    # attr_names — verify DataFrame alignment is intended.
                else:
                    # Empty group: pad with None to keep columns aligned.
                    row.extend([None] * attr_length)
        # friends = followers # followers
        # if len(friends) > 0:
        #     friend_ids = [int(network1.vs[v]['name']) for v in friends]
        #     print uid in friend_ids
        #     print len(friend_ids)
        #     fatts = []
        #     alive = 0
        #     for fid in friend_ids:
        #         fu = com1.find_one({'id': fid, 'liwc_anal.result.WC':{'$exists':True}, 'status':{'$exists':True}})
        #         fu2 = com2.find_one({'id': fid})
        #         if fu != None:
        #             fatt = iot.get_fields_one_doc(fu, fields)
        #             fatt.extend(active_days(fu))
        #             fatts.append(fatt)
        #             if fu2 is None or fu2['timeline_count'] == 0:
        #                 alive += 0
        #             else:
        #                 alive += 1
        #     if len(fatts) > 0:
        #         fatts = np.array(fatts)
        #         fmatts = np.mean(fatts, axis=0)
        #         row.extend(fmatts)
        #         row.append(len(fatts))
        #         paliv = float(alive)/len(fatts)
        #         print 'Alive %d %d %.3f' % (alive, len(fatts), paliv)
        #         row.append(paliv)
        # print row
        data.append(row)
    df = pd.DataFrame(data, columns=attr_names)
    df.to_csv('data-attr-split.csv', index = False)
def emotion_dropout_IV_combine(dbname1, dbname2, comname1, comname2):
    '''
    Combine followees and followers together as variables.

    Builds one row per level-1 user of snapshot 1: a dropout flag `attr`
    (1 if the user is gone or silent in snapshot 2), the user's own LIWC
    emotion and profile-activity features, then mean features, count and
    survival proportion over ALL network neighbours (followees and
    followers combined), and writes data-attr-combine.csv.

    :param dbname1: database of the earlier snapshot
    :param dbname2: database of the later snapshot
    :param comname1: user collection in dbname1
    :param comname2: user collection in dbname2
    :return: None (writes data-attr-combine.csv)
    '''
    # Level-1 users of snapshot 1 with a non-empty LIWC analysis.
    filter_que = {'level': 1, 'liwc_anal.result.WC':{'$exists': True}}
    user1 = iot.get_values_one_field(dbname1, comname1, 'id', filter_que)
    com1 = dbt.db_connect_col(dbname1, comname1)
    com2 = dbt.db_connect_col(dbname2, comname2)
    # Emotion-related LIWC categories used as features.
    fields = ['liwc_anal.result.posemo',
              'liwc_anal.result.negemo',
              'liwc_anal.result.anx',
              'liwc_anal.result.anger',
              'liwc_anal.result.sad']
    # Profile/activity features; presumably produced by active_days() in
    # this order — TODO confirm against its definition.
    prof_names = ['friends_count', 'statuses_count', 'followers_count',
                  'friends_day', 'statuses_day', 'followers_day', 'days']
    # Column names: user block then one combined friend (f_) block.
    attr_names = ['uid', 'attr']
    attr_names.extend(['u_'+field.split('.')[-1] for field in fields])
    attr_names.extend(['u_'+field for field in prof_names])
    attr_names.extend(['f_'+field.split('.')[-1] for field in fields])
    attr_names.extend(['f_'+field for field in prof_names])
    attr_names.extend(['f_num', 'f_palive'])
    print attr_names
    network1 = gt.load_network(dbname1, 'net')
    data = []
    for uid in user1:
        row = [uid]
        u1 = com1.find_one({'id': uid})
        u2 = com2.find_one({'id': uid})
        # attr = 1 marks dropout: absent from snapshot 2 or no tweets there.
        if u2 is None or u2['timeline_count'] == 0:
            row.append(1)
        else:
            row.append(0)
        uatt = iot.get_fields_one_doc(u1, fields)
        row.extend(uatt)
        row.extend(active_days(u1))
        # Users without a vertex in the follow network keep only the user block.
        exist = True
        try:
            v = network1.vs.find(name=str(uid))
        except ValueError:
            exist = False
        if exist:
            # Both directions combined (igraph neighbors = in + out).
            friends = set(network1.neighbors(str(uid)))  # id or name
            if len(friends) > 0:
                friend_ids = [int(network1.vs[v]['name']) for v in friends]  # return id
                print uid in friend_ids
                print len(friend_ids)
                fatts = []
                alive = 0
                for fid in friend_ids:
                    # Only neighbours with LIWC results and a last status.
                    fu = com1.find_one({'id': fid, 'liwc_anal.result.WC':{'$exists':True}, 'status':{'$exists':True}})
                    fu2 = com2.find_one({'id': fid})
                    if fu != None:
                        fatt = iot.get_fields_one_doc(fu, fields)
                        fatt.extend(active_days(fu))
                        fatts.append(fatt)
                        # alive counts neighbours still active in snapshot 2.
                        if fu2 is None or fu2['timeline_count'] == 0:
                            alive += 0
                        else:
                            alive += 1
                if len(fatts) > 0:
                    fatts = np.array(fatts)
                    # Column-wise mean over all qualifying neighbours.
                    fmatts = np.mean(fatts, axis=0)
                    row.extend(fmatts)
                    row.append(len(fatts))
                    paliv = float(alive)/len(fatts)
                    print 'Alive %d %d %.3f' % (alive, len(fatts), paliv)
                    row.append(paliv)
                # NOTE(review): if no neighbour qualifies, the row is shorter
                # than attr_names — verify DataFrame alignment is intended.
        # print row
        data.append(row)
    df = pd.DataFrame(data, columns=attr_names)
    df.to_csv('data-attr-combine.csv', index = False)
def compare_difference(): ed_ids = ioutil.get_values_one_field('fed', 'com', 'id', {'level':1}) rd_ids = ioutil.get_values_one_field('random', 'com', 'id', {'level':1}) print list(set(ed_ids).intersection(rd_ids))
def friendship_community_vis(dbname, colname, filename, ctype):
    '''Out graph for vis.js visualization.

    Loads the follow network, keeps its weakly-connected giant component,
    detects communities with the algorithm selected by `ctype` ('ml'
    multilevel, 'lp' label propagation seeded by leading-eigenvector, else
    infomap), and writes a vis.js nodes/edges JavaScript file where each
    community is scattered around its own random anchor point.

    :param dbname: database name (user profiles read from its 'com' collection)
    :param colname: network collection to load
    :param filename: infix for the output file name
    :param ctype: community algorithm selector ('ml' | 'lp' | other=infomap)
    '''
    ed_users = iot.get_values_one_field(dbname, 'scom', 'id')
    # fed_users = iot.get_values_one_field(dbname, 'com', 'id')
    dbcom = dbt.db_connect_col(dbname, 'com')
    fg = gt.load_network(dbname, colname)
    # fg = gt.load_beh_network_subset(ed_users, dbname, colname, 'retweet')
    gt.net_stat(fg)
    # fg = fg.as_undirected(mode="mutual")
    # gt.net_stat(fg)
    fg = gt.giant_component(fg, 'WEAK')
    gt.net_stat(fg)
    if ctype == 'ml':
        com = fg.community_multilevel(weights='weight', return_levels=False)
    elif ctype == 'lp':
        # Label propagation needs a seed; use a 2-cluster leading-eigenvector
        # partition of the undirected projection as the initial membership.
        fgu = fg.as_undirected(combine_edges=sum)
        init = fgu.community_leading_eigenvector(clusters=2, weights='weight')
        print init.membership
        com = fg.community_label_propagation(weights='weight', initial=init.membership)
        print com.membership
    else:
        com = fg.community_infomap(edge_weights='weight', trials=2)
    fg.vs['group'] = com.membership
    # edges = fg.es.select(weight_gt=3)
    # print 'Filtered edges: %d' %len(edges)
    # fg = fg.subgraph_edges(edges)
    # gt.net_stat(fg)
    # fg.vs['degree'] = fg.degree(mode="all")
    # nodes = fg.vs.select(degree_gt=10)
    # fg = fg.subgraph(nodes)
    # gt.net_stat(fg)
    # One random anchor coordinate per community; nodes are jittered around it.
    Coo={}
    for x in fg.vs['group']:
        Coo[x]=(rand.randint(-1000, 1000), rand.randint(-1000, 1000))
    with open('data/' + ctype + '_' +filename+'_net_follow.js', 'w') as fw:
        # Emit the vis.js nodes array: size by in-degree, tooltip with
        # profile stats, position near the community anchor.
        fw.write('var nodes = [\n')
        for idv, v in enumerate(fg.vs):
            user = dbcom.find_one({'id': int(fg.vs[idv]['name'])})
            # Strip quotes so the description cannot break the JS string literal.
            desc = ' '.join(user['description'].replace('\'', '').replace('\"', '').split())
            fw.write('{id: ' + str(idv+1) + ', '+
                     'label: \'' + user['screen_name'] +'\', ' +
                     'value: ' + str(fg.degree(idv, mode="in")) + ', ' +
                     'title: \'UID: ' + str(fg.vs[idv]['name']) +
                     '<br> Screen Name: ' + user['screen_name'] +
                     '<br> Followers: ' + str(user['followers_count']) +
                     '<br> Followees: ' + str(user['friends_count']) +
                     '<br> Tweets: ' + str(user['statuses_count']) +
                     '<br> Description: ' + str(desc.encode('utf-8')) +
                     '<br> Group: ' + str(fg.vs[idv]['group']) + '\', ' +
                     'x: ' + str(Coo[fg.vs[idv]['group']][0]+rand.randint(0, 300)) + ', ' +
                     'y: ' + str(Coo[fg.vs[idv]['group']][1]+rand.randint(0, 300)) + ', ' +
                     'group: ' + str(fg.vs[idv]['group']) + ', ')
            # if int(fg.vs[idv]['name']) in ed_users:
            #     fw.write('shape: ' + '\'triangle\'')
            # else:
            #     fw.write('shape: ' + '\'circle\'')
            fw.write('}, \n')
        # Emit the vis.js edges array; arrows point follower -> followee,
        # width/value from the edge weight.
        fw.write('];\n var edges = [\n')
        for ide, e in enumerate(fg.es):
            fw.write('{from: ' + str(e.source+1) + ', ' +
                     'to: ' + str(e.target+1) + ', ' +
                     'arrows: ' + '\'to\'' + ', ' +
                     'title: \' Tags: ' + fg.vs[e.source]['name'] + ' ' + fg.vs[e.target]['name'] +
                     '<br> Co-occurrence: ' + str(fg.es[ide]['weight']) + '\', ' +
                     'value: ' + str(fg.es[ide]['weight']) +
                     '},\n')
            #str(fg.es[ide]['weight'])
        fw.write('];\n')
def bahavior_net(dbname, comname, bnetname, btype):
    # Behaviour network restricted to users with collected timelines.
    active_users = iot.get_values_one_field(dbname, comname, 'id',
                                            {'timeline_count': {'$gt': 0}})
    return gt.load_beh_network_subset(active_users, dbname, bnetname, btype)