def main():
    """Plot example frequency time series for the best-fitting growth,
    logistic-decline and piecewise-decline words, writing one PDF per
    word category to --out_dir.

    "Best-fitting" means: highest Spearman correlation for growth words,
    highest R2 for the two decline fits.
    """
    parser = ArgumentParser()
    parser.add_argument('--out_dir', default='../../output/results/')
    parser.add_argument('--tf_file', default='../../data/frequency/2013_2016_tf_norm_log.tsv')
    parser.add_argument('--growth_score_file', default='../../data/frequency/growth_scores.tsv')
    args = parser.parse_args()
    out_dir = args.out_dir
    tf_file = args.tf_file
    growth_score_file = args.growth_score_file
    ## load data
    tf = pd.read_csv(tf_file, sep='\t', index_col=0)
    growth_params = pd.read_csv(growth_score_file, sep='\t', index_col=0)
    growth_words = get_growth_words()
    decline_words, decline_params = get_growth_decline_words_and_params()
    logistic_decline_words, logistic_params = get_logistic_decline_words()
    piecewise_decline_words, piecewise_params = get_piecewise_decline_words()
    # keep only confirmed decline words, excluding anything also tagged as growth
    logistic_decline_words = list(set(logistic_decline_words) & set(decline_words) - set(growth_words))
    piecewise_decline_words = list(set(piecewise_decline_words) & set(decline_words) - set(growth_words))
    ## sort scores (descending => best fits first)
    growth_scores = growth_params.loc[growth_words, 'spearman'].sort_values(inplace=False, ascending=False)
    decline_logistic_scores = logistic_params.loc[logistic_decline_words, 'R2'].sort_values(inplace=False, ascending=False)
    decline_piecewise_scores = piecewise_params.loc[piecewise_decline_words, 'R2'].sort_values(inplace=False, ascending=False)
    ## get example words
    top_k = 5
    example_growth_words = growth_scores.index.tolist()[:top_k]
    example_logistic_words = decline_logistic_scores.index.tolist()[:top_k]
    example_piecewise_words = decline_piecewise_scores.index.tolist()[:top_k]
    ## plot!! and write to file
    word_categories = ['growth', 'logistic_decline', 'piecewise_decline']
    word_lists = [example_growth_words, example_logistic_words, example_piecewise_words]
    for word_category, word_list in izip(word_categories, word_lists):
        plot_time_series(tf, sorted(word_list))
        out_file = os.path.join(out_dir, '%s_best_fit.pdf'%(word_category))
        # save to file
        plt.savefig(out_file)
        # FIX: close the figure so the next category does not get drawn on
        # top of this one (assumes plot_time_series draws on the active
        # pyplot figure -- TODO confirm against plot_time_series)
        plt.close('all')
def main():
    """Predict word success (growth=1 vs. fail/decline=0) from POS-tag dummy
    variables, with and without the first k months of log-frequency, using
    cross-validated logistic regression; write per-fold results to TSV.
    """
    parser = ArgumentParser()
    parser.add_argument('--tf_file', default='../../data/frequency/2013_2016_tf_norm_log.tsv')
    parser.add_argument('--POS_file', default='../../data/frequency/2013_2016_tag_estimates.tsv')
    parser.add_argument('--k', default=1, type=int)
    parser.add_argument('--out_dir', default='../../output')
    args = parser.parse_args()
    tf_file = args.tf_file
    POS_file = args.POS_file
    k = args.k
    out_dir = args.out_dir
    ## load data
    k_range = pd.np.arange(1, k + 1)
    # first k frequency columns, renamed f_1..f_k
    tf = pd.read_csv(tf_file, sep='\t', index_col=0).iloc[:, k_range - 1]
    tf.columns = map(lambda x: 'f_%d' % (x), k_range)
    POS_tags = pd.read_csv(POS_file, sep='\t', index_col=0).iloc[:, 0]
    # convert to dummy vars
    POS_tags = POS_tags.str.get_dummies()
    POS_tag_list = list(POS_tags.columns)
    # combine frequency and POS features over the shared vocabulary
    shared_vocab = tf.index & POS_tags.index
    data = pd.concat([tf.loc[shared_vocab], POS_tags.loc[shared_vocab]], axis=1)
    ## restrict to success/fail words
    success_words = get_growth_words()
    fail_words, _ = get_growth_decline_words_and_params()
    fail_words = list(set(fail_words))
    # restrict to shared vocab
    success_words = list(set(success_words) & set(shared_vocab))
    fail_words = list(set(fail_words) & set(shared_vocab))
    change_words = success_words + fail_words
    data = data.loc[change_words, :]
    # add success condition (1 = growth word, 0 = fail word)
    y_var = 'success'
    data.loc[:, 'success'] = map(lambda x: int(x in success_words), data.index.tolist())
    ## organize feature sets to compare
    data_sets = [
        data.loc[:, [y_var] + POS_tag_list],  # just POS
        data,  # f+POS
    ]
    data_set_names = ['POS', 'f+POS']
    results = pd.DataFrame()
    n_folds = 10
    for data_set, data_set_name in izip(data_sets, data_set_names):
        # FIX: loop variable renamed to k_ so it no longer clobbers the --k
        # argument (matches the sibling prediction scripts' convention).
        # NOTE(review): predict_LR receives identical arguments for every k_,
        # so the appended rows differ only in the 'k' label -- confirm whether
        # data_set should be restricted to the first k_ months here.
        for k_ in k_range:
            feat_results = predict_LR(data_set, y_var, n_folds)
            feat_results.loc[:, 'k'] = k_
            feat_results.loc[:, 'feat_names'] = data_set_name
            results = results.append(feat_results)
    ## write to file!!
    k_range_str = '%d_%d' % (min(k_range), max(k_range))
    out_file = os.path.join(out_dir, 'success_%s_window_POS.tsv' % (k_range_str))
    results.to_csv(out_file, sep='\t', index=False)
def main():
    """Predict word growth (growth=1 vs. decline=0) from semantic word-category
    dummy variables, with and without frequency and linguistic-dissemination
    (DL) features, using cross-validated logistic regression; write per-fold
    results to TSV.
    """
    parser = ArgumentParser()
    parser.add_argument('--tf_file', default='../../data/frequency/2013_2016_tf_norm_log.tsv')
    parser.add_argument('--DL_file', default='../../data/frequency/2013_2016_3gram_residuals.tsv')
    parser.add_argument('--word_category_file', default='../../data/frequency/word_lists/2013_2016_word_categories.csv')
    parser.add_argument('--k', default=1, type=int)
    parser.add_argument('--out_dir', default='../../output')
    args = parser.parse_args()
    tf_file = args.tf_file
    DL_file = args.DL_file
    word_category_file = args.word_category_file
    k = args.k
    out_dir = args.out_dir
    ## load data
    k_range = pd.np.arange(1, k + 1)
    # first k frequency columns, renamed f_1..f_k
    tf = pd.read_csv(tf_file, sep='\t', index_col=0).iloc[:, k_range - 1]
    tf.columns = map(lambda x: 'f_%d' % (x), k_range)
    DL = pd.read_csv(DL_file, sep='\t', index_col=0).iloc[:, k_range - 1]
    # FIX: label DL columns DL_1..DL_k instead of reusing the f_%d names;
    # with duplicate labels, data.loc[:, list(tf.columns)] below would
    # silently select the DL columns as well, leaking DL into the
    # frequency-only ('f+CAT') feature set
    DL.columns = map(lambda x: 'DL_%d' % (x), k_range)
    word_categories = pd.read_csv(word_category_file, sep=',', index_col=0)
    word_categories = word_categories.loc[:, 'category']
    # eliminate partials ("X/Y" => "X")
    word_categories = word_categories.apply(lambda x: x.split('/')[0])
    # convert to dummy vars
    word_categories = word_categories.str.get_dummies()
    category_list = list(word_categories.columns)
    # combine all features over the shared vocabulary
    shared_vocab = list(set(tf.index) & set(word_categories.index))
    data = pd.concat([
        tf.loc[shared_vocab], word_categories.loc[shared_vocab],
        DL.loc[shared_vocab]
    ], axis=1)
    ## restrict to growth/decline words
    growth_words = get_growth_words()
    decline_words, _ = get_growth_decline_words_and_params()
    decline_words = list(set(decline_words))
    # restrict to shared vocab
    growth_words = list(set(growth_words) & set(shared_vocab))
    decline_words = list(set(decline_words) & set(shared_vocab))
    change_words = growth_words + decline_words
    data = data.loc[change_words, :]
    # add growth condition (1 = growth word, 0 = decline word)
    y_var = 'growth'
    data.loc[:, 'growth'] = map(lambda x: int(x in growth_words), data.index.tolist())
    ## organize feature sets to compare
    data_sets = [
        data.loc[:, [y_var] + category_list],  # just categories
        data.loc[:, [y_var] + category_list + list(tf.columns)],  # f+categories
        data,  # f+categories+DL
    ]
    data_set_names = ['CAT', 'f+CAT', 'f+DL+CAT']
    results = pd.DataFrame()
    n_folds = 10
    for data_set, data_set_name in izip(data_sets, data_set_names):
        # FIX: loop variable renamed to k_ so it no longer clobbers the --k
        # argument (matches the sibling prediction scripts' convention).
        # NOTE(review): predict_LR receives identical arguments for every k_;
        # confirm whether data_set should be restricted to the first k_ months.
        for k_ in k_range:
            feat_results = predict_LR(data_set, y_var, n_folds)
            feat_results.loc[:, 'k'] = k_
            feat_results.loc[:, 'feat_names'] = data_set_name
            results = results.append(feat_results)
    ## write to file!!
    k_range_str = '%d_%d' % (min(k_range), max(k_range))
    out_file = os.path.join(out_dir, 'growth_%s_window_CAT.tsv' % (k_range_str))
    results.to_csv(out_file, sep='\t', index=False)
def main():
    """Predict word success (growth=1 vs. fail/decline=0) over growing time
    windows, comparing feature sets: frequency alone (f), frequency +
    linguistic dissemination (f+L), frequency + social dissemination across
    users/subreddits/threads (f+S), and all combined (f+L+S). Writes
    cross-validated logistic-regression results to TSV.
    """
    parser = ArgumentParser()
    parser.add_argument('--tf_file', default='../../data/frequency/2013_2016_tf_norm_log.tsv')
    parser.add_argument('--DL_file', default='../../data/frequency/2013_2016_3gram_residuals.tsv')
    parser.add_argument('--DU_file', default='../../data/frequency/2013_2016_user_diffusion.tsv')
    parser.add_argument('--DS_file', default='../../data/frequency/2013_2016_subreddit_diffusion.tsv')
    parser.add_argument('--DT_file', default='../../data/frequency/2013_2016_thread_diffusion.tsv')
    parser.add_argument('--k', default=12, type=int)
    parser.add_argument('--out_dir', default='../../output')
    args = parser.parse_args()
    tf_file = args.tf_file
    DL_file = args.DL_file
    DU_file = args.DU_file
    DS_file = args.DS_file
    DT_file = args.DT_file
    k = args.k
    out_dir = args.out_dir
    ## load data
    # tf = frequency; DL = linguistic dissemination (3-gram residuals);
    # DU/DS/DT = user/subreddit/thread diffusion (word x time-step matrices)
    tf = pd.read_csv(tf_file, sep='\t', index_col=0)
    DL = pd.read_csv(DL_file, sep='\t', index_col=0)
    DU = pd.read_csv(DU_file, sep='\t', index_col=0)
    DS = pd.read_csv(DS_file, sep='\t', index_col=0)
    DT = pd.read_csv(DT_file, sep='\t', index_col=0)
    all_stats = [tf, DL, DU, DS, DT]
    all_stats = [s.fillna(0, inplace=False) for s in all_stats]
    # vocabulary present in every statistic (index intersection)
    shared_vocab = list(reduce(lambda x,y : x&y, map(lambda y: y.index, all_stats)))
    k_start = 0
    # window end points: 1..k time steps after k_start
    k_range = range(k_start+1,k_start+k+1)
    n_folds = 10
    all_time_steps = tf.columns.tolist()
    ## restrict to success/fail words
    success_words = get_growth_words()
    fail_words, _ = get_growth_decline_words_and_params()
    # restrict to shared vocab
    success_words = list(set(success_words) & set(shared_vocab))
    fail_words = list(set(fail_words) & set(shared_vocab))
    change_words = success_words + fail_words
    all_stats = [s.loc[change_words, :] for s in all_stats]
    # add success condition column (1 = growth word, 0 = fail word) to each
    # statistic in-place; feat_sets below alias these same frames
    y_var = 'success'
    for s in all_stats:
        s.loc[:, y_var] = map(lambda x: int(x in success_words), s.index.tolist())
    ## organize: each feature set is a list of the statistic frames it uses
    feat_sets = [
        [all_stats[0]],
        [all_stats[0], all_stats[1]],
        [all_stats[0], all_stats[2], all_stats[3], all_stats[4]],
        all_stats
    ]
    feat_name_lists = [
        ['f'],
        ['f','DL'],
        ['f','DU','DS','DT'],
        ['f','DL','DU','DS','DT']
    ]
    feat_set_names = ['f', 'f+L', 'f+S', 'f+L+S']
    results = pd.DataFrame()
    use_mean = False
    for feat_set, feat_set_name, feat_name_list in izip(feat_sets, feat_set_names, feat_name_lists):
        for k_ in k_range:
            # window = first k_ time steps
            time_steps = all_time_steps[k_start:k_]
            feat_results = predict_LR(feat_set, feat_name_list, y_var, time_steps, n_folds, use_mean=use_mean)
            feat_results.loc[:, 'k'] = k_
            feat_results.loc[:, 'feat_names'] = feat_set_name
            results = results.append(feat_results)
    ## write to file!!
    k_range_str = '%s_%s'%(min(k_range), max(k_range))
    if(use_mean):
        out_file = os.path.join(out_dir, 'success_%s_window_mean.tsv'%(k_range_str))
    else:
        out_file = os.path.join(out_dir, 'success_%s_window.tsv'%(k_range_str))
    results.to_csv(out_file, sep='\t', index=False)
def main():
    """Plot a survivor curve for the growth+decline word population: starting
    from N = growth + decline words, each decline word "dies" at its split
    point; the curve flattens at the number of pure-growth words G. Saves the
    annotated figure as a PDF.
    """
    parser = ArgumentParser()
    parser.add_argument('--out_dir', default='../../output')
    args = parser.parse_args()
    out_dir = args.out_dir
    # load data: split points rounded up to whole months
    growth_decline_words, split_points = get_growth_decline_words_and_params()
    split_points = split_points.apply(lambda x: int(ceil(x)))
    # drop bad split points (outside the observed 36-month window)
    T = 36
    split_points = split_points[(split_points > 0) & (split_points < T)]
    growth_words = get_growth_words()
    GD = len(growth_decline_words)
    G = len(growth_words)
    N = G + GD
    survivors = pd.np.repeat(N, T)
    # deaths per month = number of split points falling on that month
    deaths = pd.Series(pd.np.zeros(T), index=pd.np.arange(T))
    deaths = (deaths + split_points.value_counts()).fillna(0, inplace=False)
    deaths_cumulative = deaths.cumsum()
    survivors -= deaths_cumulative
    timesteps = pd.np.arange(T)
    t_0 = '2013-06'
    t_0 = datetime.strptime(t_0, '%Y-%m')
    time_labels = [datetime.strftime(t_0 + relativedelta(months=+d), '%Y-%m') for d in range(T)]
    time_interval = 8
    # keep every 8th tick/label (Py2: zip returns a sliceable list)
    time_ticks, time_labels = zip(*zip(timesteps, time_labels)[::time_interval])
    # make curve
    x_buffer = 0.5
    y_buffer = 50
    xlabel = 'Date'
    ylabel = 'Survivors'
    label_size = 20
    tick_size = 14
    survivor_marker_size = 10
    survivor_color = 'k'
    survivor_linestyle = '-'
    fill_hatch = '//'
    # use light-blue as fill color
    fill_color = (117, 117, 255)
    # FIX: float division; under Python 2, integer division (117/255 == 0)
    # silently collapsed the light blue to black
    fill_color = tuple(c / 255. for c in fill_color)
    xlim = [min(timesteps)-x_buffer, max(timesteps)+x_buffer]
    # cutoff at y=0
    ylim = [0, max(survivors)+y_buffer]
    plt.plot(timesteps, survivors, color=survivor_color, linestyle=survivor_linestyle, zorder=2)
    # add markers
    plt.scatter(timesteps, survivors, color=survivor_color, s=survivor_marker_size, zorder=3)
    # add dotted line at lower bound (pure-growth words never die)
    lower_bound_x = [0 - x_buffer, max(timesteps) + x_buffer]
    lower_bound_y = [G, G]
    plt.plot(lower_bound_x, lower_bound_y, color='k', linestyle='--')
    # fill between survivor curve and lower bound
    plt.fill_between(timesteps, survivors, facecolor='none', hatch=fill_hatch, edgecolor=fill_color, linewidth=0.0)
    # fix ticks
    plt.xticks(time_ticks, time_labels, fontsize=tick_size)
    plt.yticks(fontsize=tick_size)
    plt.xlabel(xlabel, fontsize=label_size)
    plt.ylabel(ylabel, fontsize=label_size)
    plt.xlim(xlim)
    plt.ylim(ylim)
    # add bracket annotation for growth/failure regions of the curve
    def bracket_text(x, y1, y2, text, fraction=0.2, text_x_offset=2., text_y_offset=20):
        # vertical bar-style bracket from (x,y1) to (x,y2) with rotated label
        connection_style = 'bar, fraction=%.2f'%(fraction)
        plt.annotate('', xy=(x,y1), xycoords='data', xytext=(x,y2), textcoords='data',
                     arrowprops=dict(arrowstyle='-', connectionstyle=connection_style))
        text_x = x + text_x_offset
        text_y = (y1 + y2) / 2. + text_y_offset
        plt.text(text_x, text_y, text, rotation=270.)
    growth_bracket_x = max(timesteps) + .5
    # growth bracket
    growth_bracket_y1 = G * .75
    growth_bracket_y2 = G * .25
    growth_text = 'growth'
    text_y_offset = 110
    bracket_text(growth_bracket_x, growth_bracket_y1, growth_bracket_y2, growth_text, text_y_offset=text_y_offset)
    # failure bracket
    failure_bracket_x = max(timesteps) + .5
    failure_bracket_y1 = N * .95
    failure_bracket_y2 = G * 1.05
    failure_text = 'decline'
    text_y_offset = 35
    bracket_text(failure_bracket_x, failure_bracket_y1, failure_bracket_y2, failure_text, fraction=0.3, text_y_offset=text_y_offset)
    # squeeze layout
    plt.tight_layout()
    # write to file
    out_file = os.path.join(out_dir, 'split_point_survivor_curve.pdf')
    plt.savefig(out_file, bbox_inches='tight')
def main():
    """Compare linguistic dissemination (DL) between decline words and
    frequency-matched growth words, grouped by POS tag. Runs a one-sided
    Welch t-test per tag, writes the test table to TSV, and saves paired
    boxplots (with significance brackets) as a PDF.
    """
    parser = ArgumentParser()
    parser.add_argument('--data_dir', default='../../data/frequency')
    parser.add_argument('--match_stat', default='../../data/frequency/2013_2016_tf_norm_log.tsv')
    parser.add_argument('--plot_stat', default='../../data/frequency/2013_2016_3gram_residuals.tsv')
    parser.add_argument('--tag_pcts', default='../../data/frequency/2013_2016_tag_pcts.tsv')
    parser.add_argument('--out_dir', default='../../output')
    args = parser.parse_args()
    data_dir = args.data_dir
    match_stat_file = args.match_stat
    plot_stat_file = args.plot_stat
    tag_pct_file = args.tag_pcts
    out_dir = args.out_dir
    growth_words = get_growth_words()
    decline_words, split_points = get_growth_decline_words_and_params()
    split_points = split_points.apply(lambda x: int(ceil(x)))
    vocab = get_default_vocab()
    match_stat = pd.read_csv(match_stat_file, sep='\t', index_col=0)
    DL = pd.read_csv(plot_stat_file, sep='\t', index_col=0)
    min_diff_pct = 0
    # match decline words to growth words on the first k months of frequency
    k = 12
    match_diffs = match_word_diffs_all_pairs(decline_words, growth_words, match_stat, k, min_diff_pct=min_diff_pct)
    # most likely POS tag per word, excluding proper nouns ('^')
    tag_estimates = pd.read_csv(tag_pct_file, sep='\t', index_col=0).drop(
        '^', inplace=False, axis=1).apply(lambda x: x.argmax(), axis=1)
    decline_words_matched = match_diffs.loc[:, 'word'].tolist()
    growth_words_matched = match_diffs.loc[:, 'match'].tolist()
    # NOTE(review): split_points_growth is computed but never used below --
    # kept for parity with the original; confirm whether it can be dropped
    split_points_ordered = split_points.loc[decline_words_matched]
    split_points_growth = pd.Series(split_points_ordered)
    split_points_growth.index = growth_words_matched
    combined_words = decline_words_matched + growth_words_matched
    tag_estimates_combined = tag_estimates.loc[combined_words]
    tag_list = []
    growth_vals = []
    decline_vals = []
    ttest_results = pd.DataFrame()
    # only test tags with at least min_count words on each side
    min_count = 5
    # NOTE(review): iloc[:, 1:k] takes columns 2..k (k-1 months, skipping the
    # first) -- confirm this off-by-one is intentional
    DL_k = DL.iloc[:, 1:k]
    for t, group in tag_estimates_combined.groupby(tag_estimates_combined):
        decline_relevant = list(group.index & set(decline_words_matched))
        growth_relevant = list(group.index & set(growth_words_matched))
        if ((len(decline_relevant) >= min_count) and (len(growth_relevant) >= min_count)):
            tag_list.append(t)
            # get mean DL value per word over the window
            decline_DL = DL_k.loc[decline_relevant, :].mean(axis=1)
            growth_DL = DL_k.loc[growth_relevant, :].mean(axis=1)
            decline_vals.append(decline_DL)
            growth_vals.append(growth_DL)
            # Welch t-test for significance
            tval, pval = ttest_ind(growth_DL, decline_DL, equal_var=False)
            # divide by two because one-sided
            # NOTE(review): valid only when tval has the hypothesized sign
            pval /= 2
            # track means, t-val, p-val
            # FIX: the decline statistics were stored under the growth_DL_*
            # keys, silently overwriting the growth statistics in the dict
            # literal; decline stats are now recorded under decline_DL_*
            ttest_results_ = pd.Series({
                'POS_tag': t,
                'growth_DL_mean': growth_DL.mean(),
                'growth_DL_sd': growth_DL.std(),
                'growth_DL_N': len(growth_DL),
                'decline_DL_mean': decline_DL.mean(),
                'decline_DL_sd': decline_DL.std(),
                'decline_DL_N': len(decline_DL),
                't': tval,
                'p': pval,
            })
            ttest_results = ttest_results.append(ttest_results_, ignore_index=True)
    name_1 = 'growth'
    name_2 = 'decline'
    xlabel = 'POS tag'
    ylabel = '$D^{L}$'
    ylim = (-1., 0.5)
    # TACL size
    tick_size = 15
    # save ttest to file first
    ttest_out_file = os.path.join(
        out_dir, '%s_vs_%s_matched_pos_DL_distribution_1_%d.tsv' % (name_1, name_2, k))
    ttest_results.to_csv(ttest_out_file, sep='\t', index=False)
    out_file = os.path.join(
        out_dir, '%s_vs_%s_matched_pos_DL_distribution_1_%d.pdf' % (name_1, name_2, k))
    # convert tag list to human-readable meanings (line-broken for the axis)
    tag_meanings = pd.read_csv(
        '../../data/metadata/tag_meaning.tsv', sep='\t', index_col=0).applymap(
            lambda x: x.split('/')[0].replace(' ', '\n'))
    tag_list = [tag_meanings.loc[t, 'meaning'] for t in tag_list]
    # plot boxes
    color_1 = 'b'
    color_2 = 'r'
    linestyle_1 = '--'
    linestyle_2 = '-'
    # NWAV size
    label_size = 28
    compare_boxplots(growth_vals, decline_vals, tag_list, xlabel, ylabel,
                     name_1, name_2, color_1=color_1, color_2=color_2,
                     linestyle_1=linestyle_1, linestyle_2=linestyle_2,
                     label_size=label_size, tick_size=tick_size, ylim=ylim)
    # add xticks
    x_offset = 0.25
    x_positions = pd.np.arange(len(tag_list)) + x_offset
    plt.xticks(x_positions, tag_list, fontsize=tick_size)
    # add significance stars as brackets between boxes
    def bracket_text(x1_bracket, x2_bracket, y_bracket, x_txt, y_txt, text,
                     fraction=0.2, textsize=12, bracket_color='black'):
        # horizontal bar-style bracket between the two boxes with a label above
        connection_style = 'bar, fraction=%.2f' % (fraction)
        arrowprops = dict(arrowstyle='-', ec=bracket_color, connectionstyle=connection_style)
        plt.annotate('', xy=(x1_bracket, y_bracket), xycoords='data',
                     xytext=(x2_bracket, y_bracket), textcoords='data',
                     arrowprops=arrowprops)
        plt.text(x_txt, y_txt, text, rotation=0., size=textsize, weight='bold')
    pval_upper = 0.05
    # ttest_results rows are in the same order as tag_list / x_positions
    x_positions_significant = [
        x_positions[i] for i in range(len(x_positions))
        if ttest_results.iloc[i, :].loc['p'] < pval_upper
    ]
    bracket_y = max(max(map(max, growth_vals)), max(map(max, decline_vals)))
    bracket_x_offset = 0.25
    text_x_offset = -0.025
    text_y_offset = 0.1
    fraction = 0.3
    annotate_txt = '*'
    annotate_txt_size = 15
    for x_position in x_positions_significant:
        bracket_x1 = x_position - bracket_x_offset
        bracket_x2 = x_position + bracket_x_offset
        x_txt = (bracket_x1 + bracket_x2) / 2. + text_x_offset
        y_txt = bracket_y + text_y_offset
        bracket_text(bracket_x1, bracket_x2, bracket_y, x_txt, y_txt,
                     annotate_txt, fraction=fraction, textsize=annotate_txt_size)
    # update xlim to fit labels and boxes
    xmin = x_positions.min() - x_offset * 2.
    xmax = x_positions.max() + x_offset * 2.
    plt.xlim(xmin, xmax)
    plt.tight_layout()
    # remove border but keep axes
    plt.axis('on')
    plt.savefig(out_file)