def plot_rfecvs(rfecvs, labels):
    # Plot number of features vs. cross-validation scores for each RFECV result
    plu.plot_config()
    marker = itertools.cycle((',', 'x', 'o', '.', '*'))
    plt.xlabel("#Features")
    plt.ylabel("Cross validation ROC_AUC")
    for i in xrange(len(rfecvs)):
        rfecv = rfecvs[i]
        label = labels[i]
        # c = np.random.rand(3)
        plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_,
                 label=label,
                 # c=c,
                 marker=marker.next(),
                 lw=2)
        plt.axvline(rfecv.n_features_, linestyle='dashdot',
                    # c=c,
                    lw=4)
        plt.annotate('Best: (' + str(rfecv.n_features_) + ', ' +
                     str(round(rfecv.grid_scores_[rfecv.n_features_ - 1] * 100, 2)) + '%)',
                     xy=(rfecv.n_features_, rfecv.grid_scores_[rfecv.n_features_ - 1]),
                     xycoords='data',
                     xytext=(-30, -30 * (i + 1)), textcoords='offset points',
                     fontsize=20,
                     arrowprops=dict(arrowstyle="->"))
        # plt.annotate(str(rfecv.n_features_)+', '+str(rfecv.grid_scores_[rfecv.n_features_-1]),
        #              xy=(rfecv.n_features_, rfecv.grid_scores_[rfecv.n_features_-1]),
        #              xytext=(rfecv.n_features_, rfecv.grid_scores_[rfecv.n_features_-1]-0.2)
        #              )
    plt.legend(loc="best")
    plt.grid(True)
    # Save before show(): with interactive backends the figure is cleared after show(),
    # so saving afterwards would write a blank file.
    plt.savefig('refcv.pdf')
    plt.show()
    plt.clf()

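# Illustrative sketch (not part of the original pipeline): one way to produce the
# `rfecvs` objects consumed by plot_rfecvs() above, assuming an older scikit-learn
# release that still exposes RFECV.grid_scores_. The linear-SVM estimator and the
# 5-fold CV below are placeholders, not the settings used elsewhere in this module.
def _example_plot_rfecvs(X, y):
    from sklearn.feature_selection import RFECV
    from sklearn.svm import SVC

    # Recursively eliminate features, scoring each subset size by cross-validated ROC-AUC
    rfecv = RFECV(estimator=SVC(kernel='linear'), step=1, cv=5, scoring='roc_auc')
    rfecv.fit(X, y)
    plot_rfecvs([rfecv], ['Linear SVM'])
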
def lifetime(dbname, comname, timename):
    # Account lifetime: days between account creation and the latest tweet in the timeline
    db = dbt.db_connect_no_auth(dbname)
    com = db[comname]
    time = db[timename]
    during = []
    for user in com.find({"timeline_count": {'$gt': 0}}):
        newtweet = time.find({'user.id': user['id']},
                             no_cursor_timeout=True).sort([('id', -1)]).limit(1)[0]
        last = datetime.strptime(newtweet['created_at'], '%a %b %d %H:%M:%S +0000 %Y')
        account = datetime.strptime(user['created_at'], '%a %b %d %H:%M:%S +0000 %Y')
        print user['id'], last, account, (last.date() - account.date()).days + 1
        during.append((last.date() - account.date()).days + 1)
    pt.plot_config()
    plt.figure(1)
    plt.subplot(211)
    pt.sns.distplot(during)
    print np.mean(during), np.std(during)
    plt.axvline(np.mean(during), linestyle='--', color='k', label='Mean')
    plt.ylabel('PDF')
    plt.xlim(0, 2700)
    plt.legend()
    plt.subplot(212)
    pt.sns.boxplot(x=during)
    plt.ylabel('Quartile')
    plt.xlabel('Day')
    plt.xlim(0, 2700)
    plt.show()

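# Worked example of the timestamp parsing used in lifetime() (assumes Twitter's
# legacy created_at string format):
#   >>> from datetime import datetime
#   >>> datetime.strptime('Wed Aug 27 13:08:45 +0000 2008', '%a %b %d %H:%M:%S +0000 %Y')
#   datetime.datetime(2008, 8, 27, 13, 8, 45)
# The '+0000' offset is matched literally, so the result is a naive UTC datetime;
# (last.date() - account.date()).days + 1 then gives the lifetime in days,
# counting the creation day itself.
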
def plot_distribution(dbname='fed', comname='scom'):
    # Plot the distribution difference between retweeted and liked tweets per LIWC field
    fields = iot.read_fields()
    for field in fields:
        tokens = field.split('.')
        retweet_key = field.replace('liwc_anal', 'retweet_liwc')
        like_key = field.replace('liwc_anal', 'like_liwc')
        retweets = iot.get_values_one_field(dbname, comname, retweet_key)
        likes = iot.get_values_one_field(dbname, comname, like_key)
        pt.plot_config()
        sns.distplot(retweets, hist=False,
                     kde_kws={"color": "r", "lw": 2, "marker": 'o'},
                     label='RT ($\mu=%0.2f \pm %0.2f$)' % (np.mean(retweets), np.std(retweets)))
        sns.distplot(likes, hist=False,
                     kde_kws={"color": "g", "lw": 2, "marker": 's'},
                     label='Like ($\mu=%0.2f \pm %0.2f$)' % (np.mean(likes), np.std(likes)))
        plt.legend(loc="best")
        plt.xlabel(tokens[-1])
        plt.ylabel('P')
        plt.savefig('data/' + tokens[-1] + '.pdf', bbox_inches='tight')
        plt.clf()

def distribution_change(dbname, colname):
    # Compare LIWC feature distributions between pro-recovery and pro-ED users
    rec_users1 = pickle.load(open('data/pro-recovery.pick', 'r'))
    pro_ed = pickle.load(open('data/pro_ed.pick', 'r'))
    print len(rec_users1)
    print len(pro_ed)
    features = ['liwc_anal.result.i',
                'liwc_anal.result.we',
                'liwc_anal.result.bio',
                'liwc_anal.result.body',
                'liwc_anal.result.health',
                'liwc_anal.result.posemo',
                'liwc_anal.result.negemo',
                'liwc_anal.result.ingest',
                'liwc_anal.result.anx',
                'liwc_anal.result.anger',
                'liwc_anal.result.sad']
    names = ['I', 'We', 'Bio', 'Body', 'Health', 'Posemo',
             'Negemo', 'Ingest', 'Anx', 'Anger', 'Sad']
    df = pd.DataFrame()
    pltt.plot_config()
    for i in xrange(len(features)):
        feature = features[i]
        old_values = iot.get_values_one_field(dbname, colname, feature,
                                              {'id': {'$in': rec_users1}})
        df1 = pd.DataFrame({'Feature': names[i], 'Group': 'Pro-Recovery', 'Values': old_values})
        new_values = iot.get_values_one_field(dbname, colname, feature,
                                              {'id': {'$in': pro_ed}})
        df2 = pd.DataFrame({'Feature': names[i], 'Group': 'Pro-ED', 'Values': new_values})
        df1 = df1.append(df2)
        if len(df) == 0:
            df = df1
        else:
            df = df.append(df1)

        '''Plot Individual'''
        # sns.distplot(old_values, hist=False, label='Before')
        # sns.distplot(new_values, hist=False, label='After')
        d, p = stats.ks_2samp(old_values, new_values)
        print (names[i] + ', %.3f(%.3f), %.3f(%.3f), %.3f(%.3f)' %
               ((np.mean(old_values)), (np.std(old_values)),
                (np.mean(new_values)), (np.std(new_values)), d, p))
        # plt.xlabel(feature)
        # plt.ylabel('PDF')
        # # plt.show()
        # plt.savefig(dbname+'_'+feature+'_time.pdf')
        # plt.clf()
    sns.set(style="whitegrid", palette="pastel", color_codes=True)
    # sns.violinplot(x="Feature", y="Values", hue="Time", data=df, split=True,
    #                inner="quart", palette={"Before": "b", "After": "y"})
    # sns.despine(left=True)
    sns.boxplot(x="Feature", y="Values", hue="Group", data=df, palette="PRGn")
    sns.despine(offset=10, trim=True)
    plt.show()

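# Reference for the test used in distribution_change(): scipy.stats.ks_2samp(a, b)
# returns the two-sample Kolmogorov-Smirnov statistic d and its two-sided p-value p,
# so a small p above indicates that the pro-recovery and pro-ED samples of a LIWC
# feature are unlikely to come from the same distribution.
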
def plot_boxplot(filename='user-kmeans-hashtag.csv'):
    import ohsn.util.plot_util as plu
    plu.plot_config()
    df = pd.read_csv(filename, index_col=0)
    ax = sns.boxplot(x="cluster", y="silhouette_avg", data=df, color="lightblue")
    # sns.pointplot(x="cluster", y="silhouette_avg", data=df, errcolor='red')
    # ax.set_xticklabels([t.get_text() if int(t.get_text())/2==0 else '' for t in ax.get_xticklabels()])
    sns.despine(offset=10, trim=True)
    plt.xlabel('K')
    plt.ylabel('Average Silhouette')
    plt.ylim(0.38, 0.81)
    plt.show()

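# Usage sketch for plot_boxplot() (illustrative only): the CSV is expected to hold
# one row per clustering run with the columns 'cluster' (the K tried) and
# 'silhouette_avg' read above. The toy values below are made up.
def _example_plot_boxplot():
    toy = pd.DataFrame({'cluster': [2, 2, 3, 3, 4, 4],
                        'silhouette_avg': [0.45, 0.50, 0.62, 0.66, 0.55, 0.58]})
    toy.to_csv('toy-silhouette.csv')  # read_csv(..., index_col=0) expects the index column
    plot_boxplot('toy-silhouette.csv')
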
def communtiy_feature(dbname, typename):
    fg = ntt.loadnet(dbname, typename)
    fcoms = gt.fast_community(fg)
    pickle.dump(fcoms, open('data/' + dbname + typename + 'com.pick', 'w'))
    fcoms = pickle.load(open('data/' + dbname + typename + 'com.pick', 'r'))
    fclus = fcoms.as_clustering()
    gt.summary(fclus)

    """Compare difference of features in communities"""
    features = ['liwc_anal.result.i',
                'liwc_anal.result.we',
                'liwc_anal.result.bio',
                'liwc_anal.result.body',
                'liwc_anal.result.health',
                'liwc_anal.result.posemo',
                'liwc_anal.result.negemo',
                'liwc_anal.result.ingest',
                'liwc_anal.result.anx',
                'liwc_anal.result.anger',
                'liwc_anal.result.sad']
    therh = 0.1 * fg.vcount()
    for feature in features:
        data = []
        for clu in fclus:
            if len(clu) > therh:
                ulist = set()
                for v in clu:
                    ulist.add(int(fg.vs[v]['name']))
                ulist = list(ulist)
                clu_values = iot.get_values_one_field(dbname, 'com', feature,
                                                      {'id': {'$in': ulist}})
                data.append(clu_values)
        plot.plot_config()
        for i in xrange(len(data)):
            sns.distplot(data[i], hist=False, label=str(i) + ':' + str(len(data[i])))
        plt.xlabel(feature)
        plt.ylabel('PDF')
        # plt.show()
        plt.savefig(feature + typename + '_com.pdf')
        plt.clf()

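# Note on the objects above (an assumption about the gt/ntt helpers, which live
# elsewhere in this package): fast_community() appears to wrap python-igraph's
# fast-greedy modularity optimisation, returning a dendrogram whose .as_clustering()
# cuts it at the maximum-modularity level. Iterating over that clustering yields
# lists of vertex indices, which is why each cluster `clu` is mapped back to user
# ids through fg.vs[v]['name'] before querying MongoDB.
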
def profile_change(dbname, colname, timename):
    # db = dbt.db_connect_no_auth(dbname)
    # com = db[colname]
    # time = db[timename]
    #
    # followee, follower, tweets, users, olddate, newdate, during = [], [], [], [], [], [], []
    # filter = {'liwc_anal.result.i': {'$exists': True}, 'new_liwc_anal.result.i': {'$exists': True}}
    #
    # for user in com.find(filter):
    #     newtweet = time.find({'user.id': user['id']}, no_cursor_timeout=True).sort([('id', -1)]).limit(1)[0]
    #     oldtweet = time.find({'user.id': user['id']}, no_cursor_timeout=True).sort([('id', 1)]).limit(1)[0]
    #     print user['id'], oldtweet['created_at'], newtweet['created_at'], \
    #         (newtweet['created_at'].date() - oldtweet['created_at'].date()).days + 1
    #     users.append(user['id'])
    #     olddate.append(oldtweet['created_at'])
    #     newdate.append(newtweet['created_at'])
    #     during.append((newtweet['created_at'].date() - oldtweet['created_at'].date()).days + 1)
    #     follower.append(newtweet['user']['followers_count'] - oldtweet['user']['followers_count'])
    #     followee.append(newtweet['user']['friends_count'] - oldtweet['user']['friends_count'])
    #     tweets.append(newtweet['user']['statuses_count'] - oldtweet['user']['statuses_count'])
    # df = pd.DataFrame({'User': users,
    #                    'OldDate': olddate,
    #                    'NewDate': newdate,
    #                    'Follower': follower,
    #                    'Followee': followee,
    #                    'Tweet': tweets,
    #                    'ActiveTime': during})
    # pickle.dump(df, open('data/df.pick', 'w'))

    df = pickle.load(open('data/df.pick', 'r'))
    pt.plot_config()
    df['Followee/Day'] = (df.Followee / df.ActiveTime)
    df['Follower/Day'] = (df.Follower / df.ActiveTime)
    df['Tweet/Day'] = (df.Tweet / df.ActiveTime)
    print df.describe()
    df.to_csv('profiles.csv')
    sns.boxplot(data=df.loc[:, ['Followee', 'Follower', 'Tweet', 'ActiveTime']])
    # sns.boxplot(data=df.loc[:, ['Followee/Day', 'Follower/Day', 'Tweet/Day']])
    plt.ylim(-300, 400)
    plt.show()

def compare_post_time():
    # prec = tsplit.timeline('fed', 'prorec_tag_refine')
    # ped = tsplit.timeline('fed', 'proed_tag_refine')
    # pickle.dump((prec, ped), open('tweets_dates.pick', 'w'))
    prec, ped = pickle.load(open('tweets_dates.pick', 'r'))
    print len(prec), len(ped)

    '''Get index'''
    mind = min(min(prec), min(ped))
    maxd = max(max(prec), max(ped))
    print mind, maxd
    indices = pd.date_range(mind, maxd, freq='M')

    plu.plot_config()
    fig, ax = plt.subplots()

    '''counting'''
    df_rec = pd.DataFrame(prec, columns=['Recovery'])
    df_rec['year'] = df_rec["Recovery"].dt.year
    df_rec['month'] = df_rec["Recovery"].dt.month
    rec_counts = df_rec.groupby([df_rec["year"], df_rec["month"]]).count()

    '''Get count per month'''
    rec_cs = [0.0] * len(indices)
    for i in xrange(len(indices)):
        year = indices[i].year
        month = indices[i].month
        count = rec_counts.loc[(rec_counts.index.get_level_values('year') == year) &
                               (rec_counts.index.get_level_values('month') == month)]
        if not count.empty:
            rec_cs[i] = count.iloc[0, 1]

    '''Plot series'''
    rec_s = pd.Series(rec_cs, index=indices, name='Pro-recovery')
    rec_s.plot(kind="line", marker='s', ax=ax)
    ax.legend(loc='best')

    df_ped = pd.DataFrame(ped, columns=['Pro-ED'])
    df_ped['year'] = df_ped['Pro-ED'].dt.year
    df_ped['month'] = df_ped['Pro-ED'].dt.month
    ped_counts = df_ped.groupby([df_ped["year"], df_ped["month"]]).count()

    ped_cs = [0.0] * len(indices)
    for i in xrange(len(indices)):
        year = indices[i].year
        month = indices[i].month
        count = ped_counts.loc[(ped_counts.index.get_level_values('year') == year) &
                               (ped_counts.index.get_level_values('month') == month)]
        if not count.empty:
            ped_cs[i] = count.iloc[0, 1]
    ped_s = pd.Series(ped_cs, index=indices, name='Pro-ED')
    ped_s.plot(kind="line", marker='o', ax=ax)
    ax.legend(loc='best')
    ax.set_ylabel('Number of tweets')
    ax.set_xlabel('Date')

    print len(rec_cs), len(ped_cs), len(indices)
    s, p = stats.kendalltau(rec_cs, ped_cs)
    print s, p
    print ('kendalltau test: %.2f, p-value: %.5f' % (s, p))
    s, p = stats.spearmanr(rec_cs, ped_cs)
    print s, p
    print ('spearmanr test: %.2f, p-value: %.5f' % (s, p))
    plt.show()
    return rec_cs, ped_cs

def roc_plot(datafile, savename, pca_num=10):
    X, y = load_scale_data(datafile)
    print X.shape
    plu.plot_config()
    # plt.rcParams['axes.labelsize'] = 20
    # plt.rcParams['xtick.labelsize'] = 15
    # plt.rcParams['ytick.labelsize'] = 15
    # plt.rcParams['legend.fontsize'] = 20
    # plt.rcParams['lines.markersize'] = 50
    # plt.rcParams['pdf.fonttype'] = 42
    # plt.rcParams['ps.fonttype'] = 42
    ax = plt.gca()
    ax.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6))

    '''Social status features'''
    mean_fpr, mean_tpr, mean_auc = cross_val_roc(X[:, 0:6], y)
    # pickle.dump((mean_fpr, mean_tpr, mean_auc), open(datafile+'soc-short.pick', 'w'))
    # mean_fpr, mean_tpr, mean_auc = pickle.load(open(datafile+'soc-short.pick', 'r'))
    ax.plot(mean_fpr[0:100:5], mean_tpr[0:100:5], 'r--^',
            label='Soc. (AUC = %0.2f)' % mean_auc, lw=3, ms=10)

    '''Behavioral pattern features'''
    mean_fpr, mean_tpr, mean_auc = cross_val_roc(X[:, 6:17], y)
    # pickle.dump((mean_fpr, mean_tpr, mean_auc), open(datafile+'beh.pick', 'w'))
    # mean_fpr, mean_tpr, mean_auc = pickle.load(open(datafile+'beh.pick', 'r'))
    ax.plot(mean_fpr[0:100:5], mean_tpr[0:100:5], 'g--d',
            label='Beh. (AUC = %0.2f)' % mean_auc, lw=3, ms=10)

    '''LIWC features'''
    mean_fpr, mean_tpr, mean_auc = cross_val_roc(X[:, 17:], y)
    # pickle.dump((mean_fpr, mean_tpr, mean_auc), open(datafile+'liwc.pick', 'w'))
    # mean_fpr, mean_tpr, mean_auc = pickle.load(open(datafile+'liwc.pick', 'r'))
    ax.plot(mean_fpr[0:100:5], mean_tpr[0:100:5], 'b--o',
            label='Psy. (AUC = %0.2f)' % mean_auc, lw=3, ms=10)

    # '''Plus Hashtag features'''
    # mean_fpr, mean_tpr, mean_auc = cross_val_roc(X[:, 21:], y)
    # pickle.dump((mean_fpr, mean_tpr, mean_auc), open(datafile+'liwc-hash.pick', 'w'))
    # mean_fpr, mean_tpr, mean_auc = pickle.load(open(datafile+'liwc-hash.pick', 'r'))
    # ax.plot(mean_fpr[0:100:5], mean_tpr[0:100:5], 'c--o', label='L+H. (area = %0.2f)' % mean_auc, lw=3, ms=10)

    '''All features'''
    '''Remove social impact features'''
    # X_short = np.delete(X, [6, 7, 8, 9], 1)
    mean_fpr, mean_tpr, mean_auc = cross_val_roc(X, y)
    # pickle.dump((mean_fpr, mean_tpr, mean_auc), open(datafile+'all-short.pick', 'w'))
    # mean_fpr, mean_tpr, mean_auc = pickle.load(open(datafile+'all-short.pick', 'r'))
    ax.plot(mean_fpr[0:100:5], mean_tpr[0:100:5], 'k--*',
            label='All. (AUC = %0.2f)' % mean_auc, lw=3, ms=10)

    '''PCA'''
    # from sklearn import decomposition
    # pca = decomposition.PCA(n_components=pca_num)
    # X = pca.fit_transform(X)
    # mean_fpr, mean_tpr, mean_auc = cross_val_roc(X, y)
    # pickle.dump((mean_fpr, mean_tpr, mean_auc), open(datafile+'red.pick', 'w'))
    # mean_fpr, mean_tpr, mean_auc = pickle.load(open(datafile+'red.pick', 'r'))
    # ax.plot(mean_fpr, mean_tpr, 'c--*', label='Red. (area = %0.2f)' % mean_auc, lw=2, ms=10)

    ax.set_xlim([0, 1])
    ax.set_ylim([0, 1])
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.legend(loc="lower right")
    ax.grid(True)
    # plt.gca().set_aspect('equal')
    plt.savefig(savename)
    plt.clf()

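# cross_val_roc() is defined elsewhere in this module. For readers of roc_plot(), a
# minimal stand-in with the same return shape (a 100-point mean ROC curve plus its AUC)
# might look like the sketch below; the linear SVM and 5 folds are assumptions, not
# the module's actual settings.
def _example_cross_val_roc(X, y, n_folds=5):
    import numpy as np
    from sklearn.model_selection import StratifiedKFold
    from sklearn.svm import SVC
    from sklearn.metrics import roc_curve, auc

    mean_fpr = np.linspace(0, 1, 100)
    tprs = []
    for train, test in StratifiedKFold(n_splits=n_folds).split(X, y):
        clf = SVC(kernel='linear', probability=True).fit(X[train], y[train])
        probas = clf.predict_proba(X[test])[:, 1]
        fpr, tpr, _ = roc_curve(y[test], probas)
        tprs.append(np.interp(mean_fpr, fpr, tpr))  # resample each fold's TPR onto the common FPR grid
        tprs[-1][0] = 0.0
    mean_tpr = np.mean(tprs, axis=0)
    mean_tpr[-1] = 1.0
    return mean_fpr, mean_tpr, auc(mean_fpr, mean_tpr)
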
def roc_plot_feature(datafile):
    X, y = load_scale_data(datafile)
    fields = iot.read_fields()
    trim_files = [f.split('.')[-1] for f in fields]
    print len(trim_files)
    select_f = [
        'friend_count', 'status_count', 'follower_count',
        'friends_day', 'statuses_day', 'followers_day',
        'retweet_pro', 'dmention_pro', 'reply_pro',
        # 'hashtag_pro',
        # 'url_pro',
        'retweet_div', 'mention_div', 'reply_div',
        'i', 'we', 'swear', 'negate',
        'body', 'health', 'ingest', 'social',
        'posemo', 'negemo'
    ]
    indices = [trim_files.index(f) for f in select_f]
    print indices
    X = X[:, indices]

    # '''Calculate positive emotion ratio'''
    # # print X.shape
    # X[:, -2] /= (X[:, -2] + X[:, -1])
    # X = X[:, :-1]
    # X[:, -1][~np.isfinite(X[:, -1])] = 0
    # min_max_scaler = preprocessing.MinMaxScaler()
    # X = min_max_scaler.fit_transform(X)

    X = preprocessing.scale(X)
    print X.shape, y.shape
    # Z = np.append(X, y.reshape((len(y), 1)), axis=1)
    # df = pd.DataFrame(Z, columns=select_f + ['label'])
    # affair_mod = logit("label ~ " + '+'.join(select_f[:-1]), df).fit()
    # print(affair_mod.summary())
    # df.to_csv('scaling-clsuter-feature.csv', index=False)

    print X.shape
    plu.plot_config()
    ax = plt.gca()
    ax.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6))
    mean_fpr, mean_tpr, mean_auc = cross_val_roc(X[:, 0:12], y)
    ax.plot(mean_fpr[0:100:5], mean_tpr[0:100:5], 'r--^',
            label='Soc. (AUC = %0.2f)' % mean_auc, lw=3, ms=10)
    mean_fpr, mean_tpr, mean_auc = cross_val_roc(X[:, 12:22], y)
    ax.plot(mean_fpr[0:100:5], mean_tpr[0:100:5], 'g--d',
            label='Lin. (AUC = %0.2f)' % mean_auc, lw=3, ms=10)
    mean_fpr, mean_tpr, mean_auc = cross_val_roc(X, y)
    ax.plot(mean_fpr[0:100:5], mean_tpr[0:100:5], 'b--o',
            label='All. (AUC = %0.2f)' % mean_auc, lw=3, ms=10)
    ax.set_xlim([0, 1])
    ax.set_ylim([0, 1])
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.legend(loc="lower right")
    ax.grid(True)
    plt.show()

    data = []
    result = svm_cv(X[:, 0:12], y)
    for i, v in enumerate(result):
        data.append(['Social Activities', i, v])
    result = svm_cv(X[:, 12:22], y)
    for i, v in enumerate(result):
        data.append(['Linguistic Constructs', i, v])
    result = svm_cv(X, y)
    for i, v in enumerate(result):
        data.append(['All', i, v])
    df = pd.DataFrame(data, columns=['Feature', 'Metric', 'Value'])

    plu.plot_config()
    g = sns.factorplot(x="Metric", y="Value", hue="Feature", data=df,
                       kind="bar", legend=False,
                       palette={"Social Activities": "#e9a3c9",
                                "Linguistic Constructs": "#91bfdb",
                                'All': '#a1d76a'})
    g.set_xticklabels(["Accuracy", "Micro-F1", 'Macro-F1'])
    g.set_ylabels('Index')
    g.set_xlabels('Metric')
    annots = df['Value']
    print annots
    hatches = ['/', '/', '/', '', '', '', '\\', '\\', '\\']
    ax = g.ax  # annotate each bar on the underlying seaborn axis
    for i, p in enumerate(ax.patches):
        ax.annotate("%.2f" % (annots[i]),
                    (p.get_x() + p.get_width() / 2., p.get_height()),
                    ha='center', va='center', fontsize=25, color='black',
                    rotation=0, xytext=(0, 20), textcoords='offset points')
        p.set_hatch(hatches[i])
    plt.legend(bbox_to_anchor=(1, 1.2), ncol=6)
    plt.ylim(0.5, 1)
    plt.show()
