def msepath(X, y):
    print X.shape, y.shape
    # Compute paths
    print("Computing regularization path using the coordinate descent lasso...")
    model = LassoCV(cv=10, max_iter=3000).fit(X, y)

    # Display results
    m_log_alphas = -np.log10(model.alphas_)
    plt.figure()
    plt.plot(m_log_alphas, model.mse_path_, ':')
    plt.plot(m_log_alphas, model.mse_path_.mean(axis=-1), 'k',
             label='Average across the folds', linewidth=2)
    plt.axvline(-np.log10(model.alpha_), linestyle='--', color='k',
                label='alpha: CV estimate')
    plt.legend()
    plt.xlabel('-log(alpha)')
    plt.ylabel('Mean square error')
    plt.title('Mean square error on each fold: coordinate descent')
    plt.axis('tight')
    plt.show()

    fields = iot.read_fields()
    for i in xrange(len(fields)):
        print str(fields[i]) + '\t' + str(model.coef_[i])
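# Usage sketch (an assumption, not the original call site): msepath expects a dense
# feature matrix X and a numeric target y. load_scale_data and the file name below
# are borrowed from the other snippets purely for illustration.
X_demo, y_demo = load_scale_data('data/ed-random.data')
msepath(X_demo, y_demo)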
def parameter_select(X, y):
    print X.shape, y.shape
    ##############################################################################
    # LassoLarsIC: least angle regression with BIC/AIC criterion
    # model_bic = LassoLarsIC(criterion='bic')
    # model_bic.fit(X, y)
    # alpha_bic_ = model_bic.alpha_
    model_aic = LassoLarsIC(criterion='aic', max_iter=100000000)
    model_aic.fit(X, y)
    alpha_aic_ = model_aic.alpha_
    print alpha_aic_

    def plot_ic_criterion(model, name, color):
        alpha_ = model.alpha_
        alphas_ = model.alphas_
        criterion_ = model.criterion_
        plt.plot(-np.log10(alphas_), criterion_, '--', color=color,
                 linewidth=3, label='%s criterion' % name)
        plt.axvline(-np.log10(alpha_), color=color, linewidth=3,
                    label='alpha: %s estimate' % name)
        plt.xlabel('-log(alpha)')
        plt.ylabel('criterion')

    plt.figure()
    plot_ic_criterion(model_aic, 'AIC', 'b')
    # plot_ic_criterion(model_bic, 'BIC', 'r')
    plt.legend()
    plt.title('Information-criterion for model selection')
    plt.show()

    fields = iot.read_fields()
    for i in xrange(len(fields)):
        print str(fields[i]) + '\t' + str(model_aic.coef_[i])
def avg_liwc(dbname):
    fields = iot.read_fields()
    for field in fields:
        filters = {field: {'$exists': True}}
        results = list()
        N = 5
        for i in range(1, N + 1):
            result = iot.get_values_one_field(dbname, dbname + 'com_t' + str(i),
                                              field, filters)
            result = central_values(result)
            results.append(result)
        ax = plt.gca()
        ind = [y + 1 for y in range(len(results))]
        means = [np.mean(result) for result in results]
        stds = [np.std(result) for result in results]
        # fmt takes a single linestyle/marker spec ('--o'); '--o--' is not a valid format string
        ax.errorbar(ind, means, stds, fmt='--o', capthick=3)
        ax.violinplot(results, showmeans=False, showextrema=True)
        ax.set_xticks(ind)
        # for i in ind:
        #     ax.text(i, means[i-1]+0.5,
        #             str(round(means[i-1], 2)) + '$\pm$' + str(round(stds[i-1], 2)),
        #             ha='center', va='bottom')
        ax.set_xticklabels(('Before 2012', '2012', '2013', '2014', 'After 2014'))
        ax.set_xlabel('Time Series')
        tokens = field.split('.')
        if tokens[-1] == 'value':
            ax.set_ylabel(tokens[-2].upper())
        else:
            ax.set_ylabel(tokens[-1])
        ax.grid(True)
        plt.savefig('data/' + field + '.pdf')
        plt.clf()
def classification_subfeature(train, test, outclss):
    fields = iot.read_fields()
    print len(fields)
    foi = ['liwc_anal.result.i',
           'liwc_anal.result.we',
           'liwc_anal.result.affect',
           'liwc_anal.result.posemo',
           'liwc_anal.result.negemo',
           'liwc_anal.result.bio',
           'liwc_anal.result.body',
           'liwc_anal.result.health',
           'liwc_anal.result.ingest']
    indeces = [np.where(fields == f)[0][0] for f in foi]
    print fields[indeces]

    '''Load Training data'''
    X_train, y_train = load_svmlight_file(train)
    X_train = X_train.toarray()[:, indeces]
    scaler = preprocessing.StandardScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    print X_train.shape

    '''Load Test data'''
    X_test, y_test = load_svmlight_file(test)
    X_test = X_test.toarray()[:, indeces]
    X_test = scaler.transform(X_test)
    print X_test.shape

    svc_lin = SVC(kernel='linear', class_weight='balanced')
    y_lin = svc_lin.fit(X_train, y_train).predict(X_test)
    # pickle.dump(y_test, open(outid, 'w'))
    pickle.dump(y_lin, open(outclss, 'w'))
def common_features():
    '''Need no scoring metrics'''
    LIWC = iot.read_fields()
    LIWC = [line.strip().split('.')[-1] for line in LIWC]
    X1, y1 = load_scale_data('data/ed-random.data')
    X2, y2 = load_scale_data('data/ed-young.data')

    '''Feature rankings'''
    ref1 = ref(X1, y1)
    support1, ranking1 = ref1.support_, ref1.ranking_
    convert_fields(LIWC, ranking1)
    ref2 = ref(X2, y2)
    support2, ranking2 = ref2.support_, ref2.ranking_
    convert_fields(LIWC, ranking2)
    # # X3, y3 = load_scale_data('data/ed-all-liwc.data')
    # # ref3 = ref(X3, y3, 69)
    # # support3, ranking3 = ref3.support_, ref3.ranking_
    # # convert_fields(LIWC, ranking3)

    comm = np.logical_and(support1, support2)
    convert_fields(LIWC, comm)
    pickle.dump(comm, open('data/ed-random-young-common.pick', 'w'))
    # svm_cv(X1[:, support1], y1)
    # svm_cv(X2[:, support2], y2)
    # # svm_cv(X3[:, support3], y3)
    # svm_cv(X1[:, comm], y1)
    # svm_cv(X2[:, comm], y2)
    # svm_cv(X3[:, comm], y3)
    '''Classify with common features'''
def read_user_time(filename):
    fields = iot.read_fields()
    trimed_fields = [field.split('.')[-1] for field in fields]
    groups = [
        ('ED', 'fed', 'com', {'liwc_anal.result.WC': {'$exists': True}, 'level': 1}),
        ('RD', 'random', 'scom', {'liwc_anal.result.WC': {'$exists': True}}),
        ('YG', 'younger', 'scom', {'liwc_anal.result.WC': {'$exists': True}})
    ]

    data = []
    for tag, dbname, comname, filter_values in groups:
        com = dbt.db_connect_col(dbname, comname)
        for user in com.find(filter_values, no_cursor_timeout=True):
            if 'status' in user:
                created_at = datetime.strptime(user['created_at'],
                                               '%a %b %d %H:%M:%S +0000 %Y')
                scraped_at = user['scrape_timeline_at']
                last_post = datetime.strptime(user['status']['created_at'],
                                              '%a %b %d %H:%M:%S +0000 %Y')
                life_time = diff_day(last_post, created_at)
                # Guard against zero statuses_count when averaging
                average_time = float(life_time) / max(1, user['statuses_count'])
                longest_tweet_intervalb = user['longest_tweet_interval']
                observation_interval = diff_day(scraped_at, last_post)
                if (observation_interval - longest_tweet_intervalb) > 30:
                    death = 1
                else:
                    death = 0
                values = iot.get_fields_one_doc(user, fields)
                data.append([user['id_str'], created_at, last_post, scraped_at,
                             average_time, longest_tweet_intervalb,
                             observation_interval, tag, death] + values)
    df = pd.DataFrame(data, columns=['uid', 'created_at', 'last_post', 'scraped_at',
                                     'average_time', 'longest_time_interval',
                                     'observation_interval', 'group', 'event']
                      + trimed_fields)
    df.to_csv(filename)
def liwc_color_bar(fieldname):
    X, y = load_scale_data('data/ygcolor.data', True)
    group = 10
    # print X.shape
    y = np.array(y).ravel()
    LIWC = iot.read_fields()
    T = X[:, np.argwhere(LIWC == fieldname).ravel()]
    T = np.repeat(T, 3)
    # fig, ax = plt.subplots()
    print T.shape
    print y.shape
    yhist, ybin_edges = np.histogram(y, [1, 2, 3, 4])
    # print yhist
    xhist, xbin_edges = np.histogram(T, group,
                                     range=(np.percentile(T, 2.5),
                                            np.percentile(T, 97.5)))
    H = np.histogram2d(T, y, bins=[xbin_edges, ybin_edges])
    print xbin_edges

    ind = np.arange(group)  # the x locations for the groups
    width = 0.35  # the width of the bars: can also be len(x) sequence
    # print H[0][:, 0]
    # print H[0][:, 1]
    # print H[0][:, 2]
    # print H[0][:, 0]+H[0][:, 1]+H[0][:, 2]
    # print xhist
    # print H[0][:, 0]/xhist
    p1 = plt.bar(ind, H[0][:, 0]/xhist, width, color='r', hatch="\\\\")
    p2 = plt.bar(ind, H[0][:, 1]/xhist, width, color='g', hatch="//",
                 bottom=H[0][:, 0]/xhist)
    p3 = plt.bar(ind, H[0][:, 2]/xhist, width, color='b', hatch="--",
                 bottom=(H[0][:, 0] + H[0][:, 1])/xhist)

    plt.xticks(ind + width/2.,
               np.around(0.5*(xbin_edges[1:] + xbin_edges[:-1]), decimals=4))
    # [::3] choose one every three items
    # plt.xticks(ind + width / 2., ind)
    plt.ylabel('Ratio')
    plt.xlabel('Value')
    plt.title('Sentiment class counts of colors by LIWC field ' + fieldname)
    plt.legend((p1[0], p2[0], p3[0]), ('Positive', 'Neutral', 'Negative'))
    plt.show()
def liwc_color_sig():
    X, y = load_scale_data('data/ygcolor.data', True)
    LIWC = iot.read_fields()
    flags = list()
    for yi in y:
        if yi[0] == yi[1] and yi[1] == yi[2]:
            flags.append(True)
        else:
            flags.append(False)
    y = np.array([(b, c, d) for (b, c, d) in y])
    flags = np.array(flags)
    y = y[flags][:, 0]
    yhist, ybin_edges = np.histogram(y, [1, 2, 3, 4])
    print yhist
    y[np.where(y < 3)] = +1
    y[np.where(y == 3)] = -1
    print y.shape
    print len(y[np.where(y == 1)])
    print len(y[np.where(y == -1)])
    X = X[flags, :]
    print X.shape
    # rfecv1 = rfecv(X, y)
    # pickle.dump(rfecv1, open('data/allrfcv.p', 'w'))
    # rfecv1 = pickle.load(open('data/allrfcv.p', 'r'))
    # scores = list()
    # scores.append(rfecv1)
    # plot_rfecvs(scores, ['All Negative or All Non-negative'])
    ref2 = ref(X, y, 1)
    support2, ranking2 = ref2.support_, ref2.ranking_
    print ranking2
    convert_fields(LIWC, ranking2)
def friend_network_hashtag_weight(dbname, netname):
    '''
    Weight the friendship network edges by hashtag/LIWC profile similarity
    (used for community detection).
    :param dbname:
    :param netname:
    :return:
    '''
    user_hash_profile = pickle.load(open('data/user-hash-profile.pick', 'r'))
    net = gt.load_network(dbname, netname)
    fields = iot.read_fields()
    com = dbt.db_connect_col(dbname, 'scom')
    for edge in net.es:
        source_vertex_id = edge.source
        target_vertex_id = edge.target
        source_uid = int(net.vs[source_vertex_id]['name'])
        target_uid = int(net.vs[target_vertex_id]['name'])
        source_user = com.find_one({'id': source_uid})
        target_user = com.find_one({'id': target_uid})
        source_user_liwc = iot.get_fields_one_doc(source_user, fields)
        target_user_liwc = iot.get_fields_one_doc(target_user, fields)
        source_user_liwc.extend(user_hash_profile[source_uid])
        target_user_liwc.extend(user_hash_profile[target_uid])
        print len(target_user_liwc)
        dis = spatial.distance.euclidean(source_user_liwc, target_user_liwc)
        edge['weight'] = 1.0 / (1.0 + dis)
    net.write_graphml('ed_weighted_follow.graphml')
def plot_distribution(dbname='fed', comname='scom'):
    # Plot difference between retweeted and liked tweets
    fields = iot.read_fields()
    for field in fields:
        tokens = field.split('.')
        retweet_key = field.replace('liwc_anal', 'retweet_liwc')
        like_key = field.replace('liwc_anal', 'like_liwc')
        retwets = iot.get_values_one_field(dbname, comname, retweet_key)
        likes = iot.get_values_one_field(dbname, comname, like_key)
        pt.plot_config()
        sns.distplot(retwets, hist=False,
                     kde_kws={"color": "r", "lw": 2, "marker": 'o'},
                     label='RT ($\mu=%0.2f \pm %0.2f$)'
                           % (np.mean(retwets), np.std(retwets)))
        sns.distplot(likes, hist=False,
                     kde_kws={"color": "g", "lw": 2, "marker": 's'},
                     label='Like ($\mu=%0.2f \pm %0.2f$)'
                           % (np.mean(likes), np.std(likes)))
        plt.legend(loc="best")
        plt.xlabel(tokens[-1])
        plt.ylabel('P')
        plt.savefig('data/' + tokens[-1] + '.pdf', bbox_inches='tight')
        plt.clf()
def feature_rank(file_path):
    # Ranking feature usefulness
    LIWC = iot.read_fields()[17:]
    LIWC = [line.strip().split('.')[-1] for line in LIWC]
    X1, y1 = load_scale_data(file_path)
    ref1 = ref(X1, y1)
    support1, ranking1 = ref1.support_, ref1.ranking_
    convert_fields(LIWC, ranking1)
def feature_stat(dumped=False):
    fields = io.read_fields()
    print len(fields)
    assert isinstance(fields, object)
    for field in fields:
        keys = field.split('.')
        filter = {field: {'$exists': True}}
        eds = io.get_values_one_field('fed', 'scom', field, filter)
        randoms = io.get_values_one_field('random', 'scom', field, filter)
        youngs = io.get_values_one_field('young', 'scom', field, filter)
        compore_distribution(keys[-1], eds, randoms, youngs)
def coreness_features(g):
    """Correlation of K-core and feature values"""
    g = g.as_undirected(mode="collapse")
    all_coreness = g.shell_index(mode='ALL')
    g.vs['core'] = all_coreness
    fields = iot.read_fields()
    for field in fields:
        gt.add_attribute(g, 'pof', 'fed', 'com', field)
        vlist = g.vs.select(pof_ne=-1000000000.0)['core']
        flist = g.vs.select(pof_ne=-1000000000.0)['pof']
        pt.correlation(vlist, flist, 'K-Core', 'Feature',
                       'data/corerel/' + field + '.pdf')
def network_stats(dbname, com, fnet, bnet):
    fields = iot.read_fields()
    # print ('Feature, #Nodes, #Edges, %Nodes, %Edges, D_assort, F_assort, F_assort, Mean, STD, z_sore, p_value')
    print('Network_Feature \t #Nodes \t #Edges \t X_Min \t X_Max \t X_P2.5 \t X_P97.5 \t Y_Min \t Y_Max \t Y_P2.5 \t Y_P97.5 \t Tau_coef \t p_value')
    print 'Following'
    fnetwork = gt.load_network(dbname, fnet)
    '''Output file for Gephi'''
    # fnetwork.write_dot('friendship.DOT')
    gt.net_stat(fnetwork)
    # outputs = feature_assort_friend(fnetwork, dbname, com, fields, directed=True)
    outputs = rank_feature(fnetwork, dbname, com, fields, directed=True)
def fs_svm(X, y):
    # Feature selection with an L1-penalised linear SVM
    lsvc = LinearSVC(C=0.001, penalty="l1", dual=False).fit(X, y)
    model = SelectFromModel(lsvc, prefit=True)
    X_new = model.transform(X)
    LIWC = iot.read_fields()
    print 'Original feature size', X.shape
    print 'New feature size', X_new.shape
    sample_X = X[0]
    sample_X_new = X_new[0]
    print 'Original feature length of sample', len(set(sample_X))
    print 'New feature length of sample', len(set(sample_X_new))
    for i in xrange(len(sample_X)):
        if sample_X[i] in sample_X_new:
            print i + 1, LIWC[i]
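# Alternative sketch (an assumption, not the original helper): SelectFromModel can
# report the selected columns directly via get_support(indices=True), which avoids
# matching features by value as the loop in fs_svm does and so cannot confuse two
# features that happen to share the same value in the sample row.
def fs_svm_indices(X, y):
    # L1-penalised linear SVM; columns with non-zero coefficients are kept
    lsvc = LinearSVC(C=0.001, penalty="l1", dual=False).fit(X, y)
    model = SelectFromModel(lsvc, prefit=True)
    LIWC = iot.read_fields()
    selected = model.get_support(indices=True)
    for i in selected:
        print i + 1, LIWC[i]
    return selected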
def feature_stat(dumped=False):
    fields = io.read_fields()
    print len(fields)
    assert isinstance(fields, object)
    for field in fields:
        keys = field.split('.')
        # filter = {field: {'$exists': True}}
        # eds = io.get_values_one_field('fed', 'scom', field, filter)
        # randoms = io.get_values_one_field('random', 'scom', field, filter)
        # youngs = io.get_values_one_field('young', 'scom', field, filter)
        # compore_distribution(keys[-1], eds, randoms, youngs)
        positive = io.get_values_one_field('depression', 'com', field,
                                           {field: {'$exists': True}, 'checked': True})
        negative = io.get_values_one_field('depression', 'neg_com', field,
                                           {field: {'$exists': True}})
        # print len(positive), len(negative)
        compore_distribution(keys[-1], positive, negative)
def out_data():
    control = dbt.db_connect_col('fed', 'control_com')
    treat = dbt.db_connect_col('fed', 'treat_com')
    control_user = iot.get_values_one_field('fed', 'control_com', 'id',
                                            {'prior_liwc.result.WC': {'$exists': True},
                                             'post_liwc.result.WC': {'$exists': True}})
    treat_user = iot.get_values_one_field('fed', 'treat_com', 'id',
                                          {'prior_liwc.result.WC': {'$exists': True},
                                           'post_liwc.result.WC': {'$exists': True}})
    data = []
    fields = iot.read_fields()
    prefix = ['prior_liwc', 'post_liwc']
    for i in xrange(2):
        uids = [control_user, treat_user][i]
        for uid in uids:
            user = [control, treat][i].find_one({'id': uid})
            for j in xrange(2):
                fields_new = ['id_str'] + [field.replace('liwc_anal', prefix[j])
                                           for field in fields]
                values = iot.get_fields_one_doc(user, fields_new)
                data.append(values + [i, j])
    df = pd.DataFrame(data, columns=['id']
                      + [field.split('.')[-1] for field in fields]
                      + ['treated', 'time'])
    df.to_csv('treatment.csv')
def bmi_regreesion(dbname, colname, filename):
    # Regress BMI with features
    fields = iot.read_fields()
    poi_fields = fields[-9:-1]
    print poi_fields
    trimed_fields = [(field.split('.')[-1]) for field in fields]
    trimed_fields[-10:] = ['sentiment', 'age', 'gender', 'height', 'cw', 'gw',
                           'cbmi', 'gbmi', 'edword', 'level']

    com = dbutil.db_connect_col(dbname, colname)
    data = []
    # for user in com.find({'$or': [{'text_anal.cbmi.value': {'$exists': True}},
    #                               {'text_anal.gbmi.value': {'$exists': True}}],
    #                       'liwc_anal.result.WC': {'$exists': True}}, no_cursor_timeout=True):
    com2 = dbutil.db_connect_col('fed2', colname)
    com3 = dbutil.db_connect_col('fed3', colname)
    for user in com.find({'liwc_anal.result.WC': {'$exists': True}},
                         no_cursor_timeout=True):
        values = iot.get_fields_one_doc(user, fields)
        user2 = com2.find_one({'id': user['id']})
        if user2:
            values.extend(iot.get_fields_one_doc(user2, poi_fields))
        else:
            values.extend([0] * len(poi_fields))
        user3 = com3.find_one({'id': user['id']})
        if user3:
            values.extend(iot.get_fields_one_doc(user3, poi_fields))
        else:
            values.extend([0] * len(poi_fields))
        data.append(values)
    df = pd.DataFrame(data, columns=trimed_fields
                      + [(field.split('.')[-2] + '_p2') for field in poi_fields]
                      + [(field.split('.')[-2] + '_p3') for field in poi_fields])
    df.to_csv(filename)
# ed_bio_sta('fed', 'scom')
# fields = [
#     # 'text_anal.gw.value',
#     # 'text_anal.cw.value',
#     # 'text_anal.edword_count.value',
#     # 'text_anal.h.value',
#     # 'text_anal.a.value',
#     # 'text_anal.bmi.value',
#     'text_anal.cbmi.value',
#     'text_anal.gbmi.value',
#     # 'text_anal.lw.value',
#     # 'text_anal.hw.value'
# ]
# plot_bio('fed', 'scom', fields, ['CBMI', 'GBMI'])
# bmi_regreesion('fed', 'com', 'data/bmi_reg.csv')

fields = iot.read_fields()
poi_fields = fields[-9:-1]
print poi_fields
trimed_fields = [(field.split('.')[-1]) for field in fields]
trimed_fields[-10:] = ['sentiment', 'age', 'gender', 'height', 'cw', 'gw',
                       'cbmi', 'gbmi', 'edword', 'level']
df = pd.read_csv('data/bmi_reg.csv', index_col=0)
df.columns = (trimed_fields
              + [(field.split('.')[-2] + '_p2') for field in poi_fields]
              + [(field.split('.')[-2] + '_p3') for field in poi_fields])
df.to_csv('data/bmi_reg.csv')
def liwc_feature():
    fields = iot.read_fields()
    for field in fields:
        values = iot.get_values_one_field('depression', 'users1', field)
        print field, np.mean(values), np.std(values)
def network_assort():
    # Test network assortativity for each LIWC feature
    gs = ['edfollow', 'follow', 'retweet', 'communication']
    fields = iot.read_fields()
    # print len(fields)
    for gf in gs[1:]:
        g = gt.Graph.Read_GraphML('data/' + gf + '_net.graphml')
        # g = gt.giant_component(g)
        # gt.net_stat(g)
        sigs = []
        for filed in fields:
            g = gt.add_attribute(g, 'foi', 'depression', 'com', filed)
            raw_values = np.array(g.vs['foi'])
            values = drop_initials(raw_values)
            if len(values) > 100:
                output = gf + ',' + filed.split('.')[-1] + ','
                # maxv, minv = np.percentile(values, 97.5), np.percentile(values, 2.5)
                maxv, minv = max(values), min(values)
                vs = g.vs.select(foi_ge=minv, foi_le=maxv)
                sg = g.subgraph(vs)
                raw_assort = sg.assortativity('foi', 'foi', directed=True)
                ass_list = []
                for i in xrange(1000):
                    np.random.shuffle(raw_values)
                    g.vs["foi"] = raw_values
                    vs = g.vs.select(foi_ge=minv, foi_le=maxv)
                    sg = g.subgraph(vs)
                    ass_list.append(sg.assortativity('foi', 'foi', directed=True))
                ass_list = np.array(ass_list)
                amean, astd = np.mean(ass_list), np.std(ass_list)
                absobserved = abs(raw_assort)
                pval = (np.sum(ass_list >= absobserved) +
                        np.sum(ass_list <= -absobserved)) / float(len(ass_list))
                zscore = (raw_assort - amean) / astd
                output += format(raw_assort, '.2f') + ',' + format(amean, '.2f') + ',' + \
                          format(astd, '.2f') + ',' + format(zscore, '.2f') + ',' + \
                          format(pval, '.3f') + ','
                if pval < 0.001:
                    output += '***'
                    if raw_assort > 0:
                        sigs.append('***')
                    print output
                    continue
                if pval < 0.01:
                    output += '**'
                    if raw_assort > 0:
                        sigs.append('**')
                    print output
                    continue
                if pval < 0.05:
                    output += '*'
                    if raw_assort > 0:
                        sigs.append('*')
                    print output
                    continue
                else:
                    sigs.append('N')
                    print output
                    continue
        c = Counter(sigs)
        print c
        for sig, cou in c.items():
            print sig, 1.0 * cou / len(fields)
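# Minimal sketch of the permutation test used in network_assort (assumption: `graph`
# is an igraph Graph whose vertices already carry a numeric 'foi' attribute; the
# function name is hypothetical). The attribute is shuffled n_perm times to build a
# null distribution for the assortativity coefficient.
def assort_permutation_test(graph, n_perm=1000):
    observed = graph.assortativity('foi', 'foi', directed=True)
    values = np.array(graph.vs['foi'])
    original = list(values)
    null = []
    for _ in xrange(n_perm):
        np.random.shuffle(values)
        graph.vs['foi'] = values
        null.append(graph.assortativity('foi', 'foi', directed=True))
    graph.vs['foi'] = original  # restore the original attribute values
    null = np.array(null)
    # Two-sided p-value: how often a shuffled coefficient is at least as extreme
    pval = (np.sum(null >= abs(observed)) +
            np.sum(null <= -abs(observed))) / float(n_perm)
    zscore = (observed - np.mean(null)) / np.std(null)
    return observed, zscore, pval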
# ygimage = pickle.load(open('data/ygimage.pick', 'r'))
# print len(ygimage)
# labels = map_color_label(ygimage)
# pickle.dump(labels, open('data/yglabels.pick', 'w'))
# labels = pickle.load(open('data/yglabels.pick', 'r'))
# print labels
# senti = map_label_senti(labels)
# pickle.dump(senti, open('data/ygsentis.pick', 'w'))
# senti = pickle.load(open('data/ygsentis.pick', 'r'))
# LIWC = io.read_fields()
# print len(LIWC)
# print len(senti)
# print senti
# color_classify(senti, LIWC, 'data/ygcolor', 'young')

"""Generate Data for user classification"""
fields = io.read_fields()
print len(fields)
# common = pickle.load(open('data/common.pick', 'r'))
# fields = LIWC[common]
# print len(LIWC[common])
# print fields
#
# # common users in random and young = set([4319191638L, 2627223434L, 2976822286L,
# #     4788248335L, 3289264086L, 520847919, 439647015, 947539758, 617442479,
# #     2481703728L, 2913311029L, 3760687289L, 2303011905L, 1712561862, 2882255303L,
# #     261549132, 982895821, 2849269327L, 312684498, 160044558, 774072534, 330611545,
# #     430569947, 1275228253, 3399616094L, 2924322143L, 457692129, 3006221026L,
# #     2837359399L, 18942418, 2848241137L, 273768180, 235857269, 3315086840L])
#
# # fed, random, young
# users = potential_users('fed', 'com')
# triangle = pickle.load(open('data/triangle.pick', 'r'))
# print triangle
# feature_output(fields, 'data/random-younger', 'younger', '-1', False, [])
def roc_plot_feature(datafile):
    X, y = load_scale_data(datafile)
    fields = iot.read_fields()
    trim_files = [f.split('.')[-1] for f in fields]
    print len(trim_files)
    select_f = ['friend_count', 'status_count', 'follower_count',
                'friends_day', 'statuses_day', 'followers_day',
                'retweet_pro', 'dmention_pro', 'reply_pro',
                # 'hashtag_pro',
                # 'url_pro',
                'retweet_div', 'mention_div', 'reply_div',
                'i', 'we', 'swear', 'negate', 'body', 'health',
                'ingest', 'social', 'posemo', 'negemo']
    indecs = [trim_files.index(f) for f in select_f]
    print indecs
    X = X[:, indecs]
    # '''Calculate positive emotion ratio'''
    # # print X.shape
    # X[:, -2] /= (X[:, -2] + X[:, -1])
    # X = X[:, :-1]
    # X[:, -1][~np.isfinite(X[:, -1])] = 0
    # min_max_scaler = preprocessing.MinMaxScaler()
    # X = min_max_scaler.fit_transform(X)
    X = preprocessing.scale(X)
    print X.shape, y.shape
    # Z = np.append(X, y.reshape((len(y), 1)), axis=1)
    # df = pd.DataFrame(Z, columns=select_f + ['label'])
    # affair_mod = logit("label ~ " + '+'.join(select_f[:-1]), df).fit()
    # print(affair_mod.summary())
    # df.to_csv('scaling-clsuter-feature.csv', index=False)
    print X.shape

    plu.plot_config()
    ax = plt.gca()
    ax.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6))
    mean_fpr, mean_tpr, mean_auc = cross_val_roc(X[:, 0:12], y)
    ax.plot(mean_fpr[0:100:5], mean_tpr[0:100:5], 'r--^',
            label='Soc. (AUC = %0.2f)' % mean_auc, lw=3, ms=10)
    mean_fpr, mean_tpr, mean_auc = cross_val_roc(X[:, 12:22], y)
    ax.plot(mean_fpr[0:100:5], mean_tpr[0:100:5], 'g--d',
            label='Lin. (AUC = %0.2f)' % mean_auc, lw=3, ms=10)
    mean_fpr, mean_tpr, mean_auc = cross_val_roc(X, y)
    ax.plot(mean_fpr[0:100:5], mean_tpr[0:100:5], 'b--o',
            label='All. (AUC = %0.2f)' % mean_auc, lw=3, ms=10)
    ax.set_xlim([0, 1])
    ax.set_ylim([0, 1])
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.legend(loc="lower right")
    ax.grid(True)
    plt.show()

    data = []
    result = svm_cv(X[:, 0:12], y)
    for i, v in enumerate(result):
        data.append(['Social Activities', i, v])
    result = svm_cv(X[:, 12:22], y)
    for i, v in enumerate(result):
        data.append(['Linguistic Constructs', i, v])
    result = svm_cv(X, y)
    for i, v in enumerate(result):
        data.append(['All', i, v])
    df = pd.DataFrame(data, columns=['Feature', 'Metric', 'Value'])

    plu.plot_config()
    g = sns.factorplot(x="Metric", y="Value", hue="Feature", data=df,
                       kind="bar", legend=False,
                       palette={"Social Activities": "#e9a3c9",
                                "Linguistic Constructs": "#91bfdb",
                                'All': '#a1d76a'})
    g.set_xticklabels(["Accuracy", "Micro-F1", 'Macro-F1'])
    g.set_ylabels('Index')
    g.set_xlabels('Metric')
    annots = df['Value']
    print annots
    hatches = ['/', '/', '/', '', '', '', '\\', '\\', '\\']
    ax = g.ax  # annotate axis = seaborn axis
    for i, p in enumerate(ax.patches):
        ax.annotate("%.2f" % (annots[i]),
                    (p.get_x() + p.get_width() / 2., p.get_height()),
                    ha='center', va='center', fontsize=25, color='black',
                    rotation=0, xytext=(0, 20), textcoords='offset points')
        p.set_hatch(hatches[i])
    plt.legend(bbox_to_anchor=(1, 1.2), ncol=6)
    plt.ylim(0.5, 1)
    plt.show()
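# Sketch of what the cross_val_roc helper could look like (an assumption: the actual
# helper is defined elsewhere in the repository, and the old sklearn.cross_validation
# API is assumed to match the Python 2 era of this code). It averages ROC curves over
# stratified folds on a fixed 100-point FPR grid, matching the
# mean_fpr/mean_tpr/mean_auc triple consumed by roc_plot_feature.
def cross_val_roc_sketch(X, y, n_folds=10):
    from sklearn.cross_validation import StratifiedKFold
    from sklearn.metrics import roc_curve, auc
    mean_fpr = np.linspace(0, 1, 100)
    mean_tpr = np.zeros_like(mean_fpr)
    folds = StratifiedKFold(y, n_folds=n_folds)
    for train_idx, test_idx in folds:
        clf = SVC(kernel='linear', class_weight='balanced')
        scores = clf.fit(X[train_idx], y[train_idx]).decision_function(X[test_idx])
        fpr, tpr, _ = roc_curve(y[test_idx], scores)
        mean_tpr += np.interp(mean_fpr, fpr, tpr)  # interpolate onto the common grid
    mean_tpr /= n_folds
    mean_tpr[0], mean_tpr[-1] = 0.0, 1.0
    return mean_fpr, mean_tpr, auc(mean_fpr, mean_tpr)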
def user_profiles(dbname, comname, userfile='data/actor.uid'):
    # Get profile information for regression
    uids = pickle.load(open(userfile))
    print len(uids)
    com = dbt.db_connect_col(dbname, comname)
    newcom = dbt.db_connect_col(dbname, 'pro_mention_miss_com')
    # newcom.create_index("id", unique=True)

    # # Collect data for missing users
    # missuids, taguids = [], []
    # for uid in uids:
    #     user = com.find_one({'id': int(uid)})
    #     if user is None:
    #         missuids.append(int(uid))
    #     else:
    #         taguids.append(int(uid))
    # list_size = len(missuids)
    # print '%d users to process' % list_size
    # length = int(math.ceil(list_size/100.0))
    # for index in xrange(length):
    #     index_begin = index*100
    #     index_end = min(list_size, index_begin+100)
    #     userlook.lookup_user_list(missuids[index_begin:index_end], newcom, 1, 'N')

    # # Collect tweets for missing users
    # converstream = dbt.db_connect_col(dbname, 'pro_mention_timeline')
    # most_recenty = converstream.find().sort([('id', -1)]).limit(1)
    # oldest = converstream.find().sort([('id', 1)]).limit(1)
    # max_id = most_recenty[0]['id']
    # since_id = oldest[0]['id']
    # print most_recenty[0]
    # print oldest[0]
    # com = dbt.db_connect_col(dbname, 'pro_mention_miss_com')
    # timeline = dbt.db_connect_col(dbname, 'pro_mention_miss_timeline')
    # com.create_index([('timeline_scraped_times', pymongo.ASCENDING)])
    # timeline.create_index([('user.id', pymongo.ASCENDING),
    #                        ('id', pymongo.DESCENDING)])
    # timeline.create_index([('id', pymongo.ASCENDING)], unique=True)
    # print datetime.now().strftime("%Y-%m-%d-%H-%M-%S") + "\t" + 'Connect Twitter.com'
    # timelines.retrieve_timeline(com, timeline, max_id)
    # print datetime.now().strftime("%Y-%m-%d-%H-%M-%S"), 'finish timeline for sample users'

    data = []
    fields = iot.read_fields()
    miss_count = 0
    print fields
    for uid in uids:
        user = com.find_one({'id': int(uid)})
        if user is not None:
            row = iot.get_fields_one_doc(user, fields)
            data.append(row)
        else:
            user = newcom.find_one({'id': int(uid)})
            if user is not None:
                row = iot.get_fields_one_doc(user, fields)
                data.append(row)
            else:
                miss_count += 1
    print miss_count, miss_count * 1.0 / len(uids)
    df = pd.DataFrame(data=data, columns=['uid', 'posemo', 'negemo', 'senti'])
    df.to_csv('data/emotions.csv')
def data_split(dbname='TwitterProAna', colname='tweets'):
    # https://stackoverflow.com/questions/8136652/query-mongodb-on-month-day-year-of-a-datetime
    # # Label tweets with week indices
    # tweets = dbt.db_connect_col(dbname, colname)
    # basedate = datetime(1970, 1, 1)
    # tweets.create_index([('date_week', pymongo.ASCENDING)])
    # for tweet in tweets.find({}, no_cursor_timeout=True):
    #     creat = tweet['created_at']
    #     detal = creat - basedate
    #     datestr = detal.days // 7 + 1
    #     tweets.update_one({'id': tweet['id']}, {'$set': {"date_week": datestr}}, upsert=False)
    #
    # # Index tweets by week
    # date_index = {}
    # for tweet in tweets.find({}, ['id', 'date_week'], no_cursor_timeout=True):
    #     tid, date = tweet['id'], tweet['date_week']
    #     tlist = date_index.get(date, [])
    #     tlist.append(tid)
    #     date_index[date] = tlist
    # pickle.dump(date_index, open('date_tid_list_week.pick', 'w'))
    #
    # # Bunch the tweets of each week together to produce LIWC results
    # tweets = dbt.db_connect_col(dbname, colname)
    # date_index = pickle.load(open('date_tid_list_week.pick', 'r'))
    # timeseries = dbt.db_connect_col(dbname, 'weekseries')
    # for key in date_index.keys():
    #     tlist = date_index[key]
    #     textmass = ''
    #     for tid in tlist:
    #         tweet = tweets.find_one({'id': tid})
    #         text = tweet['text'].encode('utf8')
    #         # Replace RT, @, # and http://
    #         match = rtgrex.search(text)
    #         if match is None:
    #             text = mgrex.sub('', text)
    #             text = hgrex.sub('', text)
    #             text = ugrex.sub('', text)
    #             text = text.strip()
    #             if not (text.endswith('.') or text.endswith('?') or text.endswith('!')):
    #                 text += '.'
    #             textmass += " " + text.lower()
    #     words = textmass.split()
    #     # Any text with fewer than 50 words should be looked at with a certain degree of skepticism.
    #     if len(words) > 50:
    #         liwc_result = liwc.summarize_document(' '.join(words))
    #         timeseries.insert({'date': key, 'liwc': liwc_result})

    timeseries = dbt.db_connect_col(dbname, 'weekseries')
    fields = iot.read_fields()
    fields_trim = [f.replace('liwc_anal.result.', '') for f in fields]
    fields = [f.replace('_anal.result', '') for f in fields]
    print len(fields)
    data = []
    basedate = datetime(1970, 1, 1)
    for entry in timeseries.find():
        time = entry['date']
        # date = datetime.strptime(time, '%Y-%m')
        # date = datetime.date(year=int(time[0]), month=int(time[1]))
        # detal = creat - basedate
        # datestr = detal.days // 7 + 1
        days = (time - 1) * 7
        # Convert the week index back to a date; assumes `from datetime import datetime, timedelta`
        date = basedate + timedelta(days=days)
        features = iot.get_fields_one_doc(entry, fields)
        data.append([date] + features)
    df = pd.DataFrame(data=data, columns=['date'] + fields_trim)
    df.to_csv('ian-liwc-tweets-week.csv')