def get_user_groups(delta, category=None):
  seeds = Util.load_seeds()
  log('Finding basic user groups for delta %s and category %s...'
      % (delta, category))
  (num_users, newsaholics,
   active_users, common_users) = basic_groups.group_users(delta, category)
  log('Finding precision experts for delta %s and category %s...'
      % (delta, category))
  experts_p = experts.select_experts_precision(
      newsaholics.union(active_users), num_users, delta, .02, category)
  log('Finding ground truths...')
  gt_rankings = ground_truths.get_gt_rankings(seeds, DataSet.TESTING,
                                              category)
  log('Finding target news...')
  target_news = ground_truths.find_target_news(gt_rankings, .02)
  size_target_news = len(target_news)
  log('Finding fscore experts for delta %s and category %s...'
      % (delta, category))
  experts_f = experts.select_experts_fscore(size_target_news, num_users,
                                            delta, .02, category)
  log('Finding ci experts for delta %s and category %s...'
      % (delta, category))
  experts_ci = experts.select_experts_ci(num_users, delta, .02, category)
  experts_all = experts_p.union(experts_f).union(experts_ci)
  return experts_all, newsaholics, active_users, common_users
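# Example usage (a minimal sketch; it assumes the FolkWisdom data files read
# by Util.load_seeds() and basic_groups.group_users() are already on disk):
#
#   experts_all, newsaholics, active_users, common_users = get_user_groups(4)
#   log('Num experts (all): %s' % len(experts_all))
#   log('Num newsaholics: %s' % len(newsaholics))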
def run(): """Main logic for this analysis.""" if _MAIN_ANALYSIS: seeds = Util.load_seeds() gt_ranks = ground_truths.get_gt_rankings(seeds, DataSet.ALL) target_news = ground_truths.find_target_news(gt_ranks, _SIZE_TOP_NEWS) for delta in _DELTAS: log('Performing analysis for delta %s' % delta) param_str = 'd%s' % delta Util.ensure_dir_exist(_GRAPH_DIR + '%s/' % param_str) Util.ensure_dir_exist(_GRAPH_DIR + '%s/info/' % param_str) (counts, news_nyt_participant, news_nyt_not_participant, when_nyt_tweeted) = find_counts(target_news, delta) agg_counts = aggregate_counts(counts, delta) with open(_GRAPH_DIR + '%s/info/stats.txt' % param_str, 'w') as out_file: out_file.write('Num stories total: %s\n' % len(target_news)) out_file.write('Num NYT Participant: %s\n' % len(news_nyt_participant)) out_file.write('Num NYT Not Participant: %s\n' % len(news_nyt_not_participant)) with open(_GRAPH_DIR + '%s/info/legend.tsv' % param_str, 'w') as out_file: for i in range( min( 50, min(len(news_nyt_participant), len(news_nyt_not_participant)))): log('Outputting graph %s...' % i) url_nyt = news_nyt_participant.pop() url_not_nyt = news_nyt_not_participant.pop() nyt_tweeted_min = when_nyt_tweeted[url_nyt] out_file.write('%s\t%s\t%s\n' % (i, url_nyt, url_not_nyt)) draw_graph(agg_counts[url_nyt], agg_counts[url_not_nyt], (nyt_tweeted_min, agg_counts[url_nyt][nyt_tweeted_min]), i, param_str) if _SECONDARY_ANALYSIS: url_str = ('http://thelede.blogs.nytimes.com/2011/08/21/' 'latest-updates-on-the-battle-for-tripoli/') user_info = crawl_users.load_user_info() for url, delta, legend_num in [(url_str, 8, 28)]: additional_info = find_additional_info(url, user_info, delta) log('Outputting additional info to disk...') with open( _GRAPH_DIR + 'additional_info_%s_%s.tsv' % (delta, legend_num), 'w') as out_file: for user_id, (num_followers, screen_name, minutes) in additional_info: out_file.write( '%s\t%s\t%s\t%s\n' % (user_id, screen_name, num_followers, minutes)) log('Analysis complete!')
def run(): """Main logic for this analysis.""" if _MAIN_ANALYSIS: seeds = Util.load_seeds() gt_ranks = ground_truths.get_gt_rankings(seeds, DataSet.ALL) target_news = ground_truths.find_target_news(gt_ranks, _SIZE_TOP_NEWS) for delta in _DELTAS: log('Performing analysis for delta %s' % delta) param_str = 'd%s' % delta Util.ensure_dir_exist(_GRAPH_DIR + '%s/' % param_str) Util.ensure_dir_exist(_GRAPH_DIR + '%s/info/' % param_str) (counts, news_nyt_participant, news_nyt_not_participant, when_nyt_tweeted) = find_counts(target_news, delta) agg_counts = aggregate_counts(counts, delta) with open(_GRAPH_DIR + '%s/info/stats.txt' % param_str, 'w') as out_file: out_file.write('Num stories total: %s\n' % len(target_news)) out_file.write('Num NYT Participant: %s\n' % len(news_nyt_participant)) out_file.write('Num NYT Not Participant: %s\n' % len(news_nyt_not_participant)) with open(_GRAPH_DIR + '%s/info/legend.tsv' % param_str, 'w') as out_file: for i in range(min(50, min(len(news_nyt_participant), len(news_nyt_not_participant)))): log('Outputting graph %s...' % i) url_nyt = news_nyt_participant.pop() url_not_nyt = news_nyt_not_participant.pop() nyt_tweeted_min = when_nyt_tweeted[url_nyt] out_file.write('%s\t%s\t%s\n' % (i, url_nyt, url_not_nyt)) draw_graph(agg_counts[url_nyt], agg_counts[url_not_nyt], (nyt_tweeted_min, agg_counts[url_nyt][nyt_tweeted_min]), i, param_str) if _SECONDARY_ANALYSIS: url_str = ('http://thelede.blogs.nytimes.com/2011/08/21/' 'latest-updates-on-the-battle-for-tripoli/') user_info = crawl_users.load_user_info() for url, delta, legend_num in [(url_str, 8, 28)]: additional_info = find_additional_info(url, user_info, delta) log('Outputting additional info to disk...') with open(_GRAPH_DIR + 'additional_info_%s_%s.tsv' % (delta, legend_num), 'w') as out_file: for user_id, (num_followers, screen_name, minutes) in additional_info: out_file.write('%s\t%s\t%s\t%s\n' % (user_id, screen_name, num_followers, minutes)) log('Analysis complete!')
def run(): """Main logic. Outputs data in format for further analysis.""" global _OUT_DIR cache = Util.load_cache() seeds = Util.load_seeds() # Set up params appropriately. data_set = DataSet.TRAINING months = _TRAINING_SET_MONTHS if _SWITCHED: data_set = DataSet.TESTING months = _TESTING_SET_MONTHS _OUT_DIR += 'switched/' retweets = set() if _EXCLUDE_RETWEETS: retweets = ground_truths.find_retweets(months) _OUT_DIR += 'no_retweets/' Util.ensure_dir_exist(_OUT_DIR) log('Output dir: %s' % _OUT_DIR) for delta in _DELTAS: for category in _CATEGORIES: gt_rankings = ground_truths.get_gt_rankings(seeds, data_set, category, exclude_tweets_within_delta=_EXCLUDE_TWEETS_WITHIN_DELTA, retweets=retweets) sort_users_by_tweet_count(months, seeds, cache, delta, category) target_news = ground_truths.find_target_news(gt_rankings, _SIZE_TOP_NEWS) find_hits_and_mises(months, target_news, seeds, cache, delta, category) # if _SWITCHED: # gt_rankings = ground_truths.get_gt_rankings(seeds, DataSet.TESTING, # category) # sort_users_by_tweet_count(_TESTING_SET_MONTHS, seeds, cache, # delta, category) # target_news = ground_truths.find_target_news(gt_rankings, .02) # find_hits_and_mises(_TESTING_SET_MONTHS, target_news, seeds, cache, # delta, category) # else: # gt_rankings = ground_truths.get_gt_rankings(seeds, DataSet.TRAINING, # category) # sort_users_by_tweet_count(_TRAINING_SET_MONTHS, seeds, cache, # delta, category) # target_news = ground_truths.find_target_news(gt_rankings, .02) # find_hits_and_mises(_TRAINING_SET_MONTHS, target_news, seeds, cache, # delta, category) log('Finished outputting data!')
def run(): """Main logic for this analysis.""" seeds = Util.load_seeds() gt_ranks = ground_truths.get_gt_rankings(seeds, DataSet.ALL) target_news = ground_truths.find_target_news(gt_ranks, _SIZE_TOP_NEWS) # for delta in _DELTAS: for delta in [8]: log('Performing analysis for delta %s' % delta) param_str = 'd%s' % delta Util.ensure_dir_exist(_GRAPH_DIR + '%s/' % param_str) Util.ensure_dir_exist(_GRAPH_DIR + '%s/info/' % param_str) (counts, news_nyt_participant, news_nyt_not_participant, when_nyt_tweeted) = find_counts(target_news, delta) agg_counts = aggregate_counts(counts, delta) with open(_GRAPH_DIR + '%s/info/stats.txt' % param_str, 'w') as out_file: out_file.write('Num stories total: %s\n' % len(target_news)) out_file.write('Num NYT Participant: %s\n' % len(news_nyt_participant)) out_file.write('Num NYT Not Participant: %s\n' % len(news_nyt_not_participant)) with open(_GRAPH_DIR + '%s/info/legend.tsv' % param_str, 'w') as out_file: log('Outputting graph...') nyt_tweeted_min = when_nyt_tweeted[_STORY_NYT] annotations = [] annotations.append( (nyt_tweeted_min, agg_counts[_STORY_NYT][nyt_tweeted_min], '@nytimes')) annotations.append((204, agg_counts[_STORY_NYT][204], '@evertuts')) annotations.append((193, agg_counts[_STORY_NYT][193], '@nytjim')) annotations.append( (194, agg_counts[_STORY_NYT][194], '@nytimesglobal')) annotations.append( (222, agg_counts[_STORY_NYT][222], '@Larryferlazzo')) draw_graph(agg_counts[_STORY_NYT], agg_counts[_STORY_NOT_NYT], annotations, param_str) log('Analysis complete!')
def run(): """Main logic for this analysis.""" FileLog.set_log_dir() Util.ensure_dir_exist(_OUTPUT_DIR) if _REGENERATE_DATA: deltas = find_deltas() cache = Util.load_cache() seeds = Util.load_seeds() # Find top news param_str = '_t%s' % (int(_SIZE_TOP_NEWS * 100)) gts = ground_truths.get_gt_rankings(seeds, DataSet.ALL) top_news = ground_truths.find_target_news(gts, _SIZE_TOP_NEWS) # Do analysis for all delta, including sys.max to do analysis with no delta. for delta in [sys.maxint] + _DELTAS: param_str = _get_param_str(delta) (all_counts, original_counts, retweet_counts, top_counts) = find_device_counts(delta, deltas, top_news, cache) (sorted_all_counts, sorted_original_counts, sorted_retweet_counts, sorted_top_counts) = sort_data(all_counts, original_counts, retweet_counts, top_counts) output_data(sorted_all_counts, sorted_original_counts, sorted_retweet_counts, sorted_top_counts, param_str) if _REDRAW_GRAPH: for delta in [sys.maxint] + _DELTAS: param_str = _get_param_str(delta) (top, original_dict, retweet_dict) = load_data(param_str) log('Drawing graph for delta %s...' % delta) draw_graph(top, original_dict, retweet_dict, param_str) log('Analysis complete.')
def run(): """Main logic for this analysis.""" seeds = Util.load_seeds() gt_ranks = ground_truths.get_gt_rankings(seeds, DataSet.ALL) target_news = ground_truths.find_target_news(gt_ranks, _SIZE_TOP_NEWS) # for delta in _DELTAS: for delta in [8] : log('Performing analysis for delta %s' % delta) param_str = 'd%s' % delta Util.ensure_dir_exist(_GRAPH_DIR + '%s/' % param_str) Util.ensure_dir_exist(_GRAPH_DIR + '%s/info/' % param_str) (counts, news_nyt_participant, news_nyt_not_participant, when_nyt_tweeted) = find_counts(target_news, delta) agg_counts = aggregate_counts(counts, delta) with open(_GRAPH_DIR + '%s/info/stats.txt' % param_str, 'w') as out_file: out_file.write('Num stories total: %s\n' % len(target_news)) out_file.write('Num NYT Participant: %s\n' % len(news_nyt_participant)) out_file.write('Num NYT Not Participant: %s\n' % len(news_nyt_not_participant)) with open(_GRAPH_DIR + '%s/info/legend.tsv' % param_str, 'w') as out_file: log('Outputting graph...') nyt_tweeted_min = when_nyt_tweeted[_STORY_NYT] annotations = [] annotations.append((nyt_tweeted_min, agg_counts[_STORY_NYT][nyt_tweeted_min], '@nytimes')) annotations.append((204, agg_counts[_STORY_NYT][204], '@evertuts')) annotations.append((193, agg_counts[_STORY_NYT][193], '@nytjim')) annotations.append((194, agg_counts[_STORY_NYT][194], '@nytimesglobal')) annotations.append((222, agg_counts[_STORY_NYT][222], '@Larryferlazzo')) draw_graph(agg_counts[_STORY_NYT], agg_counts[_STORY_NOT_NYT], annotations, param_str) log('Analysis complete!')
def run(): """Contains the main logic for this analysis.""" global _SIZE_TOP_NEWS FileLog.set_log_dir() seeds = Util.load_seeds() for category in _CATEGORIES: log('Preforming analysis for category: %s' % category) if category: _SIZE_TOP_NEWS = .10 else: _SIZE_TOP_NEWS = .02 gt_rankings = ground_truths.get_gt_rankings(seeds, DataSet.TESTING, category) log('Num ground_truth_rankings: %s' % len(gt_rankings)) target_news = ground_truths.find_target_news(gt_rankings, _SIZE_TOP_NEWS) log('Size target_news: %s' % len(target_news)) # for delta in _DELTAS: for delta in [4]: run_params_str = 'd%s_t%s_e%s_%s' % (delta, int(_SIZE_TOP_NEWS * 100), int(_SIZE_EXPERTS * 100), category) output_dir = '../graph/CrowdWisdomDef/%s/' % run_params_str Util.ensure_dir_exist(output_dir) info_output_dir = '../graph/CrowdWisdomDef/%s/info/' % run_params_str Util.ensure_dir_exist(info_output_dir) output_dir = '../graph/CrowdWisdomDef/%s/' % run_params_str Util.ensure_dir_exist(output_dir) (num_users, newsaholics, active_users, common_users) = basic_groups.group_users(delta, category) log('Num newsaholics: %s' % len(newsaholics)) log('Num active: %s' % len(active_users)) log('Num common: %s' % len(common_users)) common_user_buckets = common_user_groups.group_users(common_users, _NUM_GROUPS) for i, common_user_bucket in enumerate(common_user_buckets): print 'Number users in common user bucket %s: %s' % (i, len(common_user_bucket)) experts_precision = experts.select_experts_precision( newsaholics.union(active_users), num_users, delta, _SIZE_EXPERTS, category) experts_fscore = experts.select_experts_fscore(len(target_news), num_users, delta, _SIZE_EXPERTS, category) experts_ci = experts.select_experts_ci(num_users, delta, _SIZE_EXPERTS, category) super_experts = experts.select_super_experts(experts_precision, experts_fscore, experts_ci) log('Num experts (precision): %s' % len(experts_precision)) log('Num experts (fscore): %s' % len(experts_fscore)) log('Num experts (ci): %s' % len(experts_ci)) log('Finding rankings with an %s hour delta.' 
% delta) (market_rankings, newsaholic_rankings, active_rankings, common_rankings) = basic_groups.get_rankings(delta, seeds, newsaholics, active_users, category) (expert_precision_rankings, expert_fscore_rankings, expert_ci_rankings, expert_s_rankings) = experts.get_rankings(delta, seeds, experts_precision, experts_fscore, experts_ci, super_experts, category) common_groups_rankings = common_user_groups.get_rankings(delta, seeds, common_user_buckets, category) num_votes_common = 0 for url, count in common_rankings: num_votes_common += count log('Num common_rankings: %s' % len(common_rankings)) log('Num common votes: %s' % num_votes_common) num_votes_expert_precision = 0 for url, count in expert_precision_rankings: num_votes_expert_precision += count log('Num expert_precision rankings: %s' % len(expert_precision_rankings)) log('Num expert_precision votes: %s' % num_votes_expert_precision) num_votes_expert_fscore = 0 for url, count in expert_fscore_rankings: num_votes_expert_fscore += count log('Num expert_fscore rankings: %s' % len(expert_fscore_rankings)) log('Num expert_fscore votes: %s' % num_votes_expert_fscore) num_votes_expert_ci = 0 for url, count in expert_ci_rankings: num_votes_expert_ci += count log('Num expert_ci rankings: %s' % len(expert_ci_rankings)) log('Num expert_ci votes: %s' % num_votes_expert_ci) num_votes_buckets = [] for i, common_group_rankings in enumerate(common_groups_rankings): num_votes = 0 for url, count in common_group_rankings: num_votes += count num_votes_buckets.append(num_votes) log('Num common rankings (%s buckets): %s' % (i, len(common_group_rankings))) log('Num expert_ci votes (%s buckets): %s' % (i, num_votes)) with open('%suser_demographics_%s.txt' % (info_output_dir, run_params_str), 'w') as output_file: output_file.write('Number of Common Users: %s\n' % len(common_users)) output_file.write('\n'); output_file.write('Number of Precision Experts: %s\n' % len(experts_precision)) output_file.write('Number of F-Score Experts: %s\n' % len(experts_fscore)) output_file.write('Number of CI Experts: %s\n' % len(experts_ci)) output_file.write('Number users per common user bucket: %s\n' %len(common_user_buckets[0])) output_file.write('Number of Precision and F-Score Experts: %s\n' % len(experts_precision.intersection(experts_fscore))) output_file.write('Number of Precision and CI Experts: %s\n' % len(experts_precision.intersection(experts_ci))) output_file.write('Number of F-Score and CI Experts: %s\n' % len(experts_fscore.intersection(experts_ci))) output_file.write('\n'); output_file.write('Number of Users (Total): %s\n' % (len(newsaholics) + len(active_users) + len(common_users))) output_file.write('\n') output_file.write('Number of votes by Common Users: %s\n' % num_votes_common) output_file.write('\n'); output_file.write('Number of votes by Expert (Precision) Users: %s\n' % num_votes_expert_precision) output_file.write('Number of votes by Expert (fscore) Users: %s\n' % num_votes_expert_fscore) output_file.write('Number of votes by Expert (ci) Users: %s\n' % num_votes_expert_ci) output_file.write('Number of votes per bucket: %s\n' % num_votes_buckets) output_file.write('\n') output_file.write('Total Number of Good News: %s\n' % len(target_news)) log('Ground Truth Top 5') for i in range(min(len(gt_rankings), 5)): url, count = gt_rankings[i] log('[%s] %s\t%s' %(i, url.strip(), count)) log('-----------------------------------') log('Common Top 5') for i in range(min(len(common_rankings), 5)): url, count = common_rankings[i] log('[%s] %s\t%s' %(i, url.strip(), 
count)) log('-----------------------------------') log('Expert (Precision) Top 5') for i in range(min(len(expert_precision_rankings), 5)): url, count = expert_precision_rankings[i] log('[%s] %s\t%s' %(i, url.strip(), count)) log('-----------------------------------') log('Expert (fscore) Top 5') for i in range(min(len(expert_fscore_rankings), 5)): url, count = expert_fscore_rankings[i] log('[%s] %s\t%s' %(i, url.strip(), count)) log('-----------------------------------') log('Expert (ci) Top 5') for i in range(min(len(expert_ci_rankings), 5)): url, count = expert_ci_rankings[i] log('[%s] %s\t%s' %(i, url.strip(), count)) log('-----------------------------------') common_precisions, common_recalls = calc_precision_recall(gt_rankings, common_rankings) (expert_p_precisions, expert_p_recalls) = calc_precision_recall(gt_rankings, expert_precision_rankings) (expert_f_precisions, expert_f_recalls) = calc_precision_recall(gt_rankings, expert_fscore_rankings) (expert_c_precisions, expert_c_recalls) = calc_precision_recall(gt_rankings, expert_ci_rankings) common_group_ps = [] common_group_rs = [] for common_group_ranking in common_groups_rankings: common_group_p, common_group_r = calc_precision_recall(gt_rankings, common_group_ranking) common_group_ps.append(common_group_p) common_group_rs.append(common_group_r) log('Drawing common group model precision-recall graph...') common_user_groups.draw_precision_recall(common_group_ps, common_group_rs, expert_p_precisions, expert_p_recalls, expert_f_precisions, expert_f_recalls, expert_c_precisions, expert_c_recalls, run_params_str) log('Drawing common group model precision graph...') common_user_groups.draw_precision(common_group_ps, expert_p_precisions, expert_f_precisions, expert_c_precisions, run_params_str)
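# calc_precision_recall is defined elsewhere in this module. A minimal
# sketch of the standard precision/recall-at-rank-k computation it is
# assumed to perform (a hypothetical reimplementation, shown only to make
# the graphs above easier to interpret):
#
#   def calc_precision_recall_sketch(gt_rankings, other_rankings):
#     relevant = set(url for url, _ in gt_rankings)
#     precisions, recalls = [], []
#     hits = 0
#     for k, (url, _) in enumerate(other_rankings, start=1):
#       if url in relevant:
#         hits += 1
#       precisions.append(100.0 * hits / k)
#       recalls.append(100.0 * hits / len(relevant))
#     return precisions, recalls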
def find_counts(seeds, category=None):
  num_0_1 = 0
  num_1_4 = 0
  num_4_8 = 0
  num_after_8 = 0
  num_total = 0

  log('Finding common users delta 1...')
  (num_users_1, newsaholics_1,
   active_users_1, common_users_1) = basic_groups.group_users(1, category)
  log('Finding common users delta 4...')
  (num_users_4, newsaholics_4,
   active_users_4, common_users_4) = basic_groups.group_users(4, category)
  log('Finding common users delta 8...')
  (num_users_8, newsaholics_8,
   active_users_8, common_users_8) = basic_groups.group_users(8, category)

  copy_common_users_1 = set(common_users_1)
  common_users_1_1 = set()
  common_users_1_2 = set()
  common_users_1_3 = set()
  count = 0
  while len(copy_common_users_1) > 0:
    if count % 3 == 0:
      common_users_1_1.add(copy_common_users_1.pop())
    elif count % 3 == 1:
      common_users_1_2.add(copy_common_users_1.pop())
    elif count % 3 == 2:
      common_users_1_3.add(copy_common_users_1.pop())
    count += 1

  copy_common_users_4 = set(common_users_4)
  common_users_4_1 = set()
  common_users_4_2 = set()
  common_users_4_3 = set()
  count = 0
  while len(copy_common_users_4) > 0:
    if count % 3 == 0:
      common_users_4_1.add(copy_common_users_4.pop())
    elif count % 3 == 1:
      common_users_4_2.add(copy_common_users_4.pop())
    elif count % 3 == 2:
      common_users_4_3.add(copy_common_users_4.pop())
    count += 1

  copy_common_users_8 = set(common_users_8)
  common_users_8_1 = set()
  common_users_8_2 = set()
  common_users_8_3 = set()
  count = 0
  while len(copy_common_users_8) > 0:
    if count % 3 == 0:
      common_users_8_1.add(copy_common_users_8.pop())
    elif count % 3 == 1:
      common_users_8_2.add(copy_common_users_8.pop())
    elif count % 3 == 2:
      common_users_8_3.add(copy_common_users_8.pop())
    count += 1

  log('Size Common Users 1 (delta 1): %s' % len(common_users_1_1))
  log('Size Common Users 2 (delta 1): %s' % len(common_users_1_2))
  log('Size Common Users 3 (delta 1): %s' % len(common_users_1_3))
  log('Size Common Users 1 (delta 4): %s' % len(common_users_4_1))
  log('Size Common Users 2 (delta 4): %s' % len(common_users_4_2))
  log('Size Common Users 3 (delta 4): %s' % len(common_users_4_3))
  log('Size Common Users 1 (delta 8): %s' % len(common_users_8_1))
  log('Size Common Users 2 (delta 8): %s' % len(common_users_8_2))
  log('Size Common Users 3 (delta 8): %s' % len(common_users_8_3))

  log('Finding precision experts delta 1...')
  experts_p_1 = experts.select_experts_precision(
      newsaholics_1.union(active_users_1), num_users_1, 1, _SIZE_EXPERTS,
      category)
  # Fixed copy-paste log labels below: the messages said "delta 1" for the
  # delta 4 and delta 8 selections.
  log('Finding precision experts delta 4...')
  experts_p_4 = experts.select_experts_precision(
      newsaholics_4.union(active_users_4), num_users_4, 4, _SIZE_EXPERTS,
      category)
  log('Finding precision experts delta 8...')
  experts_p_8 = experts.select_experts_precision(
      newsaholics_8.union(active_users_8), num_users_8, 8, _SIZE_EXPERTS,
      category)

  log('Finding ground truths...')
  gt_rankings = ground_truths.get_gt_rankings(seeds, DataSet.TESTING,
                                              category)
  log('Finding target news...')
  target_news = ground_truths.find_target_news(gt_rankings, _SIZE_EXPERTS)
  size_target_news = len(target_news)

  log('Finding fscore experts delta 1...')
  experts_f_1 = experts.select_experts_fscore(size_target_news, num_users_1,
                                              1, _SIZE_EXPERTS, category)
  log('Finding fscore experts delta 4...')
  experts_f_4 = experts.select_experts_fscore(size_target_news, num_users_4,
                                              4, _SIZE_EXPERTS, category)
  log('Finding fscore experts delta 8...')
  experts_f_8 = experts.select_experts_fscore(size_target_news, num_users_8,
                                              8, _SIZE_EXPERTS, category)

  log('Finding ci experts delta 1...')
  experts_ci_1 = experts.select_experts_ci(num_users_1, 1, _SIZE_EXPERTS,
                                           category)
  log('Finding ci experts delta 4...')
  experts_ci_4 = experts.select_experts_ci(num_users_4, 4, _SIZE_EXPERTS,
                                           category)
  log('Finding ci experts delta 8...')
  experts_ci_8 = experts.select_experts_ci(num_users_8, 8, _SIZE_EXPERTS,
                                           category)

  experts_all_1 = experts_p_1.union(experts_f_1).union(experts_ci_1)
  experts_all_4 = experts_p_4.union(experts_f_4).union(experts_ci_4)
  experts_all_8 = experts_p_8.union(experts_f_8).union(experts_ci_8)

  num_0_1_common = 0
  num_1_4_common = 0
  num_4_8_common = 0
  num_cu_1_1 = 0
  num_cu_1_2 = 0
  num_cu_1_3 = 0
  num_cu_4_1 = 0
  num_cu_4_2 = 0
  num_cu_4_3 = 0
  num_cu_8_1 = 0
  num_cu_8_2 = 0
  num_cu_8_3 = 0
  num_0_1_experts_p = 0
  num_1_4_experts_p = 0
  num_4_8_experts_p = 0
  num_0_1_experts_f = 0
  num_1_4_experts_f = 0
  num_4_8_experts_f = 0
  num_0_1_experts_ci = 0
  num_1_4_experts_ci = 0
  num_4_8_experts_ci = 0
  num_0_1_experts_all = 0
  num_1_4_experts_all = 0
  num_4_8_experts_all = 0

  log('Finding counts...')
  with open('../data/FolkWisdom/time_deltas.tsv') as input_file:
    for line in input_file:
      # Parse line.
      tokens = line.split('\t')
      url = tokens[_TIMEDELTAS_FILE_URL_INDEX]
      time_delta = timedelta(
          seconds=int(tokens[_TIMEDELTAS_FILE_DELTA_INDEX]))
      tweet_category = tokens[_TIMEDELTAS_FILE_CATEGORY_INDEX].strip()
      user_id = tokens[_TIMEDELTAS_FILE_USER_ID_INDEX]
      if url in seeds:
        (seed_tweet_id, seed_user_id, seed_time) = seeds[url]
        if (Util.is_in_testing_set(seed_time)
            and category_matches(category, tweet_category)):
          num_total += 1
          if time_delta < timedelta(hours=1):
            num_0_1 += 1
            if user_id in common_users_1:
              num_0_1_common += 1
            if user_id in experts_p_1:
              num_0_1_experts_p += 1
            if user_id in experts_f_1:
              num_0_1_experts_f += 1
            if user_id in experts_ci_1:
              num_0_1_experts_ci += 1
            if user_id in experts_all_1:
              num_0_1_experts_all += 1
            if user_id in common_users_1_1:
              num_cu_1_1 += 1
            if user_id in common_users_1_2:
              num_cu_1_2 += 1
            if user_id in common_users_1_3:
              num_cu_1_3 += 1
          elif (time_delta >= timedelta(hours=1)
                and time_delta < timedelta(hours=4)):
            num_1_4 += 1
            if user_id in common_users_4:
              num_1_4_common += 1
            if user_id in experts_p_4:
              num_1_4_experts_p += 1
            if user_id in experts_f_4:
              num_1_4_experts_f += 1
            if user_id in experts_ci_4:
              num_1_4_experts_ci += 1
            if user_id in experts_all_4:
              num_1_4_experts_all += 1
            if user_id in common_users_4_1:
              num_cu_4_1 += 1
            if user_id in common_users_4_2:
              num_cu_4_2 += 1
            if user_id in common_users_4_3:
              num_cu_4_3 += 1
          elif (time_delta >= timedelta(hours=4)
                and time_delta < timedelta(hours=8)):
            num_4_8 += 1
            if user_id in common_users_8:
              num_4_8_common += 1
            if user_id in experts_p_8:
              num_4_8_experts_p += 1
            if user_id in experts_f_8:
              num_4_8_experts_f += 1
            if user_id in experts_ci_8:
              num_4_8_experts_ci += 1
            if user_id in experts_all_8:
              num_4_8_experts_all += 1
            if user_id in common_users_8_1:
              num_cu_8_1 += 1
            if user_id in common_users_8_2:
              num_cu_8_2 += 1
            if user_id in common_users_8_3:
              num_cu_8_3 += 1
          else:
            num_after_8 += 1

  return (num_0_1, num_0_1_common, num_0_1_experts_p, num_0_1_experts_f,
          num_0_1_experts_ci, num_0_1_experts_all,
          num_1_4, num_1_4_common, num_1_4_experts_p, num_1_4_experts_f,
          num_1_4_experts_ci, num_1_4_experts_all,
          num_4_8, num_4_8_common, num_4_8_experts_p, num_4_8_experts_f,
          num_4_8_experts_ci, num_4_8_experts_all,
          num_cu_1_1, num_cu_1_2, num_cu_1_3,
          num_cu_4_1, num_cu_4_2, num_cu_4_3,
          num_cu_8_1, num_cu_8_2, num_cu_8_3,
          num_after_8, num_total)
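# The three identical while-loops in find_counts deal users into three sets
# round-robin style. A generic helper capturing the same pattern
# (hypothetical; not used by this module):

def _split_round_robin_sketch(users, num_groups=3):
  """Deals users one at a time into num_groups sets."""
  groups = [set() for _ in range(num_groups)]
  for i, user in enumerate(users):
    groups[i % num_groups].add(user)
  return groups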
def get_all_user_groups(delta=4, category=None):
  seeds = Util.load_seeds()

  # Set up params appropriately.
  data_set = DataSet.TRAINING
  months = _TRAINING_SET_MONTHS
  if _SWITCHED:
    data_set = DataSet.TESTING
    months = _TESTING_SET_MONTHS
  retweets = set()
  if _EXCLUDE_RETWEETS:
    retweets = ground_truths.find_retweets(months)

  gt_rankings = ground_truths.get_gt_rankings(
      seeds, data_set, category,
      exclude_tweets_within_delta=_EXCLUDE_TWEETS_WITHIN_DELTA,
      retweets=retweets)
  target_news = ground_truths.find_target_news(gt_rankings, _SIZE_TOP_NEWS)

  groups = UserGroups()

  (num_users, groups.newsaholics,
   groups.active_users, groups.common_users) = basic_groups.group_users(
      delta, category)
  groups.population = groups.newsaholics.union(groups.active_users).union(
      groups.common_users)

  num_users_eg, groups.even_groups = even_groups.group_users(
      delta, _NUM_GROUPS, _SIZE_OF_GROUP_IN_PERCENT, category)

  groups.precision = experts.select_experts_precision(
      groups.newsaholics.union(groups.active_users), num_users, delta,
      _SIZE_EXPERTS, category)
  groups.fscore = experts.select_experts_fscore(len(target_news), num_users,
                                                delta, _SIZE_EXPERTS,
                                                category)
  groups.ci = experts.select_experts_ci(num_users, delta, _SIZE_EXPERTS,
                                        category)
  groups.super_experts = experts.select_super_experts(groups.precision,
                                                      groups.fscore,
                                                      groups.ci)

  groups.ci_hi, groups.ci_li = experts.split_ci_experts_by_followers(
      groups.ci)

  groups.ci_1 = set()
  groups.ci_2 = set()
  groups.ci_3 = set()
  counter = 0
  for ci_expert in groups.ci:
    if counter % 3 == 0:
      groups.ci_1.add(ci_expert)
    elif counter % 3 == 1:
      groups.ci_2.add(ci_expert)
    elif counter % 3 == 2:
      groups.ci_3.add(ci_expert)
    counter += 1

  groups.social_bias, d_num_followers = experts.select_experts_social_bias(
      num_users, _SIZE_EXPERTS)
  groups.all_experts = experts.select_all_experts(groups.precision,
                                                  groups.fscore, groups.ci)
  groups.non_experts = groups.population.difference(groups.all_experts)

  # Note: the sample-size variable names do not match the fractions drawn
  # (_25 draws 5%, _1 draws 2%); the precision-recall plot labels elsewhere
  # use the actual percentages.
  sample_size = int(len(groups.non_experts) * _NON_EXPERTS_SAMPLE_SIZE)
  sample_size_25 = int(len(groups.non_experts) * 0.05)
  sample_size_10 = int(len(groups.non_experts) * 0.10)
  sample_size_1 = int(len(groups.non_experts) * 0.02)
  groups.non_experts_sampled = set(random.sample(groups.non_experts,
                                                 sample_size))
  groups.non_experts_25 = set(random.sample(groups.non_experts,
                                            sample_size_25))
  groups.non_experts_10 = set(random.sample(groups.non_experts,
                                            sample_size_10))
  groups.non_experts_1 = set(random.sample(groups.non_experts,
                                           sample_size_1))

  return groups, d_num_followers
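# Note on the sampled crowds above: random.sample draws without replacement,
# and each call draws independently, so the sampled crowds are not nested
# subsets of one another. For reproducible runs one could seed the module
# RNG first (hypothetical; this module does not set a seed):
#
#   import random
#   random.seed(0)
#   sampled = set(random.sample(groups.non_experts, sample_size))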
def run(): """Contains the main logic for this analysis.""" FileLog.set_log_dir() seeds = Util.load_seeds() for category in _CATEGORIES: log('Preforming analysis for category: %s' % category) size_top_news = _SIZE_TOP_NEWS if category: size_top_news = .10 data_set = DataSet.TESTING retweets = set() if _SWITCHED: data_set = DataSet.TRAINING if _EXCLUDE_RETWEETS: retweets = ground_truths.find_retweets(_TESTING_SET_MONTHS) log('Num retweets to exclude: %s' % len(retweets)) gt_rankings = ground_truths.get_gt_rankings(seeds, data_set, category, exclude_tweets_within_delta=_EXCLUDE_TWEETS_WITHIN_DELTA, retweets=retweets) log('Num ground_truth_rankings: %s' % len(gt_rankings)) # Format for use later. ground_truth_url_to_rank = {} for rank, (url, count) in enumerate(gt_rankings): ground_truth_url_to_rank[url] = rank target_news = ground_truths.find_target_news(gt_rankings, size_top_news) log('Size target_news: %s' % len(target_news)) for delta in _DELTAS: run_params_str = 'd%s_t%s_e%s_%s' % (delta, int(size_top_news * 100), int(_SIZE_EXPERTS * 100), category) info_output_dir = '../graph/FolkWisdom/%s/info/' % run_params_str Util.ensure_dir_exist(info_output_dir) groups, d_num_followers = user_groups.get_all_user_groups(delta, category) log('Num experts (precision): %s' % len(groups.precision)) log('Num experts (fscore): %s' % len(groups.fscore)) log('Num experts (ci): %s' % len(groups.ci)) log('Num Super Experts: %s' %len(groups.super_experts)) log('Num Social Bias Experts: %s' % len(groups.social_bias)) log('Finding rankings with an %s hour delta.' % delta) ranks = rankings.get_rankings(delta, seeds, groups, category, d_num_followers) # Output some interesting info to file size_market_unfiltered = '0' with open('../data/FolkWisdom/size_of_market_unfiltered.txt') as in_file: size_market_unfiltered = in_file.readline().strip() with open('%suser_demographics_%s.txt' % (info_output_dir, run_params_str), 'w') as output_file: output_file.write('Number of Newsaholics: %s\n' % len(groups.newsaholics)) output_file.write('Number of Active Users: %s\n' % len(groups.active_users)) output_file.write('Number of Common Users: %s\n' % len(groups.common_users)) output_file.write('\n'); output_file.write('Number of Precision Experts: %s\n' % len(groups.precision)) output_file.write('Number of F-Score Experts: %s\n' % len(groups.fscore)) output_file.write('Number of CI Experts: %s\n' % len(groups.ci)) output_file.write('Number of Social Bias Experts: %s\n' % len(groups.social_bias)) output_file.write('Total number of unique experts: %s\n' % len(groups.all_experts)) output_file.write('Number of Precision and F-Score Experts: %s\n' % len(groups.precision.intersection(groups.fscore))) output_file.write('Number of Precision and CI Experts: %s\n' % len(groups.precision.intersection(groups.ci))) output_file.write('Number of F-Score and CI Experts: %s\n' % len(groups.fscore.intersection(groups.ci))) output_file.write('Number of Super Experts: %s\n' % len(groups.super_experts)) output_file.write('\n'); output_file.write('Number of Users (Total): %s\n' % (len(groups.newsaholics) + len(groups.active_users) + len(groups.common_users))) output_file.write('Size of market (unfiltered): %s\n' % size_market_unfiltered) output_file.write('\n') # output_file.write('Number of votes by Newsaholics: %s\n' # % num_votes_newsaholics) # output_file.write('Number of votes by Market: %s\n' % num_votes_market) # output_file.write('Number of votes by Active Users: %s\n' # % num_votes_active) # output_file.write('Number of votes by Common 
Users: %s\n' # % num_votes_common) # output_file.write('\n'); # output_file.write('Number of votes by Expert (Precision) Users: %s\n' # % num_votes_expert_precision) # output_file.write('Number of votes by Expert (fscore) Users: %s\n' # % num_votes_expert_fscore) # output_file.write('Number of votes by Expert (ci) Users: %s\n' # % num_votes_expert_ci) # output_file.write('Number of votes by Super Experts: %s\n' # % num_votes_expert_s) # output_file.write('Number of votes by Social Bias Experts: %s\n' # % num_votes_expert_sb) # output_file.write('\n') # output_file.write('Total Number of votes cast: %s\n' # % (num_votes_newsaholics + num_votes_active # + num_votes_common)) # output_file.write('\n') output_file.write('Total Number of Good News: %s\n' % len(target_news)) log('Ground Truth Top 50') for i in range(min(len(gt_rankings), 50)): url, count = gt_rankings[i] log('[%s] %s\t%s' %(i, url.strip(), count)) log('-----------------------------------') log('Newsaholic Top 5') for i in range(min(len(ranks.newsaholics), 5)): url, count = ranks.newsaholics[i] log('[%s] %s\t%s' %(i, url.strip(), count)) log('-----------------------------------') log('Active Top 5') for i in range(min(len(ranks.active_users), 5)): url, count = ranks.active_users[i] log('[%s] %s\t%s' %(i, url.strip(), count)) log('-----------------------------------') log('Common Top 5') for i in range(min(len(ranks.common_users), 5)): url, count = ranks.common_users[i] log('[%s] %s\t%s' %(i, url.strip(), count)) log('-----------------------------------') log('nonexpert Top 5') for i in range(min(len(ranks.non_experts), 5)): url, count = ranks.non_experts[i] log('[%s] %s\t%s' %(i, url.strip(), count)) log('-----------------------------------') log('Expert (Precision) Top 5') for i in range(min(len(ranks.precision), 5)): url, count = ranks.precision[i] log('[%s] %s\t%s' %(i, url.strip(), count)) log('-----------------------------------') log('Expert (fscore) Top 5') for i in range(min(len(ranks.fscore), 5)): url, count = ranks.fscore[i] log('[%s] %s\t%s' %(i, url.strip(), count)) log('-----------------------------------') log('Expert (ci) Top 5') for i in range(min(len(ranks.ci), 5)): url, count = ranks.ci[i] log('[%s] %s\t%s' %(i, url.strip(), count)) log('-----------------------------------') log('Super Expert Top 5') for i in range(min(len(ranks.super_experts), 5)): url, count = ranks.super_experts[i] log('[%s] %s\t%s' %(i, url.strip(), count)) log('-----------------------------------') log('Social Bias Expert Top 5') for i in range(min(len(ranks.social_bias), 5)): url, count = ranks.social_bias[i] log('[%s] %s\t%s' %(i, url.strip(), count)) market_rank_to_url = {} newsaholic_rank_to_url = {} active_rank_to_url = {} common_rank_to_url = {} expert_p_rank_to_url = {} expert_f_rank_to_url = {} expert_c_rank_to_url = {} expert_s_rank_to_url = {} for rank, (url, count) in enumerate(ranks.newsaholics): newsaholic_rank_to_url[rank] = url for rank, (url, count) in enumerate(ranks.population): market_rank_to_url[rank] = url for rank, (url, count) in enumerate(ranks.active_users): active_rank_to_url[rank] = url for rank, (url, count) in enumerate(ranks.common_users): common_rank_to_url[rank] = url for rank, (url, count) in enumerate(ranks.precision): expert_p_rank_to_url[rank] = url for rank, (url, count) in enumerate(ranks.fscore): expert_f_rank_to_url[rank] = url for rank, (url, count) in enumerate(ranks.ci): expert_c_rank_to_url[rank] = url for rank, (url, count) in enumerate(ranks.super_experts): expert_s_rank_to_url[rank] = url 
population_url_to_rank = {} market_url_to_rank = {} precision_url_to_rank = {} fscore_url_to_rank = {} ci_url_to_rank = {} ci_1_url_to_rank = {} ci_2_url_to_rank = {} ci_3_url_to_rank = {} common_url_to_rank = {} for rank, (url, count) in enumerate(ranks.population): population_url_to_rank[url] = rank for rank, (url, count) in enumerate(ranks.non_experts): market_url_to_rank[url] = rank for rank, (url, count) in enumerate(ranks.precision): precision_url_to_rank[url] = rank for rank, (url, count) in enumerate(ranks.fscore): fscore_url_to_rank[url] = rank for rank, (url, count) in enumerate(ranks.ci): ci_url_to_rank[url] = rank for rank, (url, count) in enumerate(ranks.ci_1): ci_1_url_to_rank[url] = rank for rank, (url, count) in enumerate(ranks.ci_2): ci_2_url_to_rank[url] = rank for rank, (url, count) in enumerate(ranks.ci_3): ci_3_url_to_rank[url] = rank for rank, (url, count) in enumerate(ranks.common_users): common_url_to_rank[url] = rank precisions, recalls = precision_recall.get_precision_recalls(gt_rankings, ranks) mixed_rankings = mixed_model.get_mixed_rankings(market_url_to_rank, precisions.non_experts, precision_url_to_rank, precisions.precision, fscore_url_to_rank, precisions.fscore, ci_url_to_rank, precisions.ci, ground_truth_url_to_rank) mixed_inact_rankings = mixed_model.get_mixed_rankings(common_url_to_rank, precisions.common_users, precision_url_to_rank, precisions.precision, fscore_url_to_rank, precisions.fscore, ci_url_to_rank, precisions.ci, ground_truth_url_to_rank) mixed_ci_rankings = mixed_model.get_mixed_rankings(market_url_to_rank, precisions.non_experts, ci_1_url_to_rank, precisions.ci_1, ci_2_url_to_rank, precisions.ci_2, ci_3_url_to_rank, precisions.ci_3, ground_truth_url_to_rank) mixed_precisions, mixed_recalls = precision_recall.calc_precision_recall(gt_rankings, mixed_rankings) mixed_inact_precisions, mixed_inact_recalls = precision_recall.calc_precision_recall(gt_rankings, mixed_inact_rankings) mixed_ci_precisions, mixed_ci_recalls = precision_recall.calc_precision_recall(gt_rankings, mixed_ci_rankings) log('-----------------------------------') log('Mixed (min) Top 5') for i in range(min(len(mixed_rankings), 5)): url, count = mixed_rankings[i] log('[%s] %s\t%s' %(i + 1, url, count)) log('-----------------------------------') with open('%sranking_comparisons_%s.tsv' % (info_output_dir, run_params_str), 'w') as out_file: for gt_rank, (gt_url, _) in enumerate(gt_rankings): market_rank = 0 precision_rank = 0 ci_rank = 0 fscore_rank = 0 inactive_crowd_rank = 0 if gt_url in market_url_to_rank: market_rank = market_url_to_rank[gt_url] + 1 if gt_url in precision_url_to_rank: precision_rank = precision_url_to_rank[gt_url] + 1 if gt_url in ci_url_to_rank: ci_rank = ci_url_to_rank[gt_url] + 1 if gt_url in fscore_url_to_rank: fscore_rank = fscore_url_to_rank[gt_url] + 1 if gt_url in common_url_to_rank: inactive_crowd_rank = common_url_to_rank[gt_url] + 1 line = '%s\t%s\t%s\t%s\t%s\t%s\t%s\n' % (gt_url, gt_rank + 1, market_rank, inactive_crowd_rank, precision_rank, ci_rank, fscore_rank) out_file.write(line) with open('%sground_truth_rankings_%s.tsv' % (info_output_dir, run_params_str), 'w') as output_file: for url, count in gt_rankings: output_file.write('%s\t%s\n' % (url.strip(), count)) with open('%smarket_rankings_%s.tsv' % (info_output_dir, run_params_str), 'w') as output_file: for rank, (url, count) in enumerate(ranks.common_users): output_file.write('%s\t%s\t(%s,%s)\n' % (url.strip(), count, rank, ground_truth_url_to_rank[url])) with 
open('%snewsaholic_rankings_%s.tsv' % (info_output_dir, run_params_str), 'w') as output_file: for rank, (url, count) in enumerate(ranks.newsaholics): output_file.write('%s\t%s\t(%s,%s)\n' % (url.strip(), count, rank, ground_truth_url_to_rank[url])) with open('%sactive_user_rankings_%s.tsv' % (info_output_dir, run_params_str), 'w') as output_file: for rank, (url, count) in enumerate(ranks.active_users): output_file.write('%s\t%s\t(%s,%s)\n' % (url.strip(), count, rank, ground_truth_url_to_rank[url])) with open('%scommon_user_rankings_%s.tsv' % (info_output_dir, run_params_str), 'w') as output_file: for rank, (url, count) in enumerate(ranks.common_users): output_file.write('%s\t%s\t(%s,%s)\n' % (url.strip(), count, rank, ground_truth_url_to_rank[url])) with open('%snonexpert_user_rankings_%s.tsv' % (info_output_dir, run_params_str), 'w') as output_file: for rank, (url, count) in enumerate(ranks.non_experts): output_file.write('%s\t%s\t(%s,%s)\n' % (url.strip(), count, rank, ground_truth_url_to_rank[url])) with open('%sexpert_p_user_rankings_%s.tsv' % (info_output_dir, run_params_str), 'w') as output_file: for rank, (url, count) in enumerate(ranks.precision): output_file.write('%s\t%s\t(%s,%s)\n' % (url.strip(), count, rank, ground_truth_url_to_rank[url])) with open('%sexpert_f_user_rankings_%s.tsv' % (info_output_dir, run_params_str), 'w') as output_file: for rank, (url, count) in enumerate(ranks.fscore): output_file.write('%s\t%s\t(%s,%s)\n' % (url.strip(), count, rank, ground_truth_url_to_rank[url])) with open('%sexpert_c_user_rankings_%s.tsv' % (info_output_dir, run_params_str), 'w') as output_file: for rank, (url, count) in enumerate(ranks.ci): output_file.write('%s\t%s\t(%s,%s)\n' % (url.strip(), count, rank, ground_truth_url_to_rank[url])) with open('%sexpert_s_user_rankings_%s.tsv' % (info_output_dir, run_params_str), 'w') as output_file: for rank, (url, count) in enumerate(ranks.super_experts): output_file.write('%s\t%s\t(%s,%s)\n' % (url.strip(), count, rank, ground_truth_url_to_rank[url])) with open('%smixed_rankings_%s.tsv' % (info_output_dir, run_params_str), 'w') as output_file: for rank, (url, count) in enumerate(mixed_rankings): output_file.write('%s\t%s\t(%s,%s)\n' % (url.strip(), count, rank, ground_truth_url_to_rank[url])) with open('../data/FolkWisdom/market_precisions_%s.txt' % run_params_str, 'w') as out_file: for precision in precisions.common_users: out_file.write('%s\n' % precision) with open('../data/FolkWisdom/nonexpert_precisions_%s.txt' % run_params_str, 'w') as out_file: for precision in precisions.non_experts: out_file.write('%s\n' % precision) with open('../data/FolkWisdom/expert_p_precisions_%s.txt' % run_params_str, 'w') as out_file: for precision in precisions.precision: out_file.write('%s\n' % precision) with open('../data/FolkWisdom/expert_f_precisions_%s.txt' % run_params_str, 'w') as out_file: for precision in precisions.fscore: out_file.write('%s\n' % precision) with open('../data/FolkWisdom/expert_c_precisions_%s.txt' % run_params_str, 'w') as out_file: for precision in precisions.ci: out_file.write('%s\n' % precision) log('Drawing summary precision-recall graphs...') # draw_precision_recall_graph(market_precisions, market_recalls, precision_recall.draw([precisions.newsaholics, precisions.active_users, precisions.common_users, precisions.precision, precisions.fscore, precisions.ci, precisions.super_experts], [recalls.newsaholics, recalls.active_users, recalls.common_users, recalls.precision, recalls.fscore, recalls.ci, recalls.super_experts], 
['Newsaholics', 'Active', 'Common', 'Precision', 'F-score', 'CI', 'Super Experts'], 'precision_recall_all', run_params_str) # Draw via old method because it has fancy markings. experts.draw_precision_recall_experts(precisions.non_experts, recalls.non_experts, precisions.precision, recalls.precision, precisions.fscore, recalls.fscore, precisions.ci, recalls.ci, run_params_str) log('Drawing experts precision-recall graph...') # precision_recall.draw_with_markers([precisions.population, precisions.non_experts, precisions.precision, # precisions.fscore, precisions.ci], # [recalls.population, recalls.non_experts, recalls.precision, # recalls.fscore, recalls.ci], # ['Population', 'Crowd', 'Precision', 'F-score', 'CI'], # 'precision_recall_experts', # 0, run_params_str) log('Drawing mixed + inact graph...') precision_recall.draw_with_markers([precisions.non_experts, precisions.common_users, mixed_inact_precisions], [recalls.non_experts, recalls.common_users, mixed_inact_recalls], ['Crowd', 'Inactive Crowd', 'Mixed + Inactive'], 'precision_recall_mixed_and_inactive', 3, run_params_str, zoom=True) log('Drawing ci breakdown by followers precisions-recall graph...') precision_recall.draw([precisions.non_experts, precisions.ci, precisions.ci_hi, precisions.ci_li], [recalls.non_experts, recalls.ci, recalls.ci_hi, recalls.ci_li], ['Crowd', 'CI', 'CI High', 'CI Low'], 'precision_recall_ci_followers_breakdown', run_params_str) log('Drawing social bias precision-recall graph...') precision_recall.draw([precisions.non_experts, precisions.social_bias, precisions.precision, precisions.fscore, precisions.ci], [recalls.non_experts, recalls.social_bias, recalls.precision, recalls.fscore, recalls.ci], ['Crowd', 'Influence Experts', 'Precision', 'F-score', 'CI'], 'precision_recall_social_bias', run_params_str) log('Drawing basic groups precision-recall graph...') precision_recall.draw([precisions.newsaholics, precisions.active_users, precisions.common_users], [recalls.newsaholics, recalls.active_users, recalls.common_users], ['Newsaholics', 'Active Users', 'Common Users'], 'precision_recall_basic_groups', run_params_str) log('Drawing crowd def precision-recall graph...') precision_recall.draw([precisions.non_experts, precisions.common_users], [recalls.non_experts, recalls.common_users], ['Crowd', 'Inactive Crowd'], 'precision_recall_crowd_def', run_params_str, zoom=True) log('Drawing non_expert_sampling precision-recall graph...') precision_recall.draw_with_markers([precisions.non_experts, precisions.non_experts_sampled, precisions.non_experts_10, precisions.non_experts_25, precisions.non_experts_1, precisions.ci], [recalls.non_experts, recalls.non_experts_sampled, recalls.non_experts_10, recalls.non_experts_25, recalls.non_experts_1, recalls.ci], ['Crowd', 'Crowd (33% sample)', 'Crowd (10% sample)', 'Crowd (5% sample)', 'Crowd (2% sample)', 'Experts (CI)'], 'precision_recall_non_expert_sampling', 3, run_params_str, ncol=2) # TODO: Replace with new method. 
log('Drawing mixed model precision-recall graph...') mixed_model.draw_precision_recall_mixed(precisions.non_experts, recalls.non_experts, mixed_precisions, mixed_recalls, run_params_str, zoom=True) log('Drawing mixed ci model precision-recall graph...') precision_recall.draw([precisions.non_experts, mixed_ci_precisions], [recalls.non_experts, mixed_ci_recalls], ['Crowd', 'Mixed'], 'precision_recall_mixed_ci', run_params_str) log('Drawing weighted followers precision-recall graph...') precision_recall.draw([precisions.non_experts, precisions.weighted_followers, precisions.ci], [recalls.non_experts, recalls.weighted_followers, recalls.ci], ['Crowd', 'Weighted Followers', 'CI'], 'precision_recall_weighted_followers', run_params_str) log('Drawing ci weighted graph...') precision_recall.draw([precisions.population, precisions.ci, precisions.ci_weighted], [recalls.population, recalls.ci, recalls.ci_weighted], ['Crowd', 'CI', 'CI (Weighted)'], 'precision_recall_ci_weighted', run_params_str) log('Drawing weighted graph...') precision_recall.draw([precisions.population, precisions.weighted], [recalls.population, recalls.weighted], ['Crowd', 'Crowd (Weighted)'], 'precision_recall_weighted', run_params_str) log('Drawing weighted both graph...') precision_recall.draw([precisions.population, precisions.weighted, precisions.weighted_both], [recalls.population, recalls.weighted, recalls.weighted_both], ['Crowd', 'Crowd (Weighted)', 'Crowd (Weighted Both)'], 'precision_recall_weighted_both', run_params_str)
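# The rank-map loops in run() above all follow one pattern; a small helper
# (hypothetical; not part of this module) expresses the same idea:

def _build_url_to_rank_sketch(ranking):
  """Maps each url in a (url, count) ranking list to its 0-based rank."""
  return dict((url, rank) for rank, (url, _) in enumerate(ranking))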
def run():
  Util.ensure_dir_exist(_DATA_DIR)
  category = None
  seeds = Util.load_seeds()

  # Read Twitter data.
  gt_rankings = ground_truths.get_gt_rankings(seeds, DataSet.TESTING,
                                              category)
  log('Num ground_truth_rankings: %s' % len(gt_rankings))
  target_news = ground_truths.find_target_news(gt_rankings, _SIZE_TOP_NEWS)
  log('Size target_news: %s' % len(target_news))

  for delta in _DELTAS:
    (num_users, newsaholics,
     active_users, common_users) = basic_groups.group_users(delta, category)
    population = newsaholics.union(active_users).union(common_users)
    log('Num newsaholics: %s' % len(newsaholics))
    log('Num active: %s' % len(active_users))
    log('Num common: %s' % len(common_users))
    log('Num users (population): %s' % len(population))

    # -- Get experts --
    ExpertGroup.precision = experts.select_experts_precision(
        newsaholics.union(active_users), num_users, delta, _SIZE_EXPERTS,
        category)
    ExpertGroup.fscore = experts.select_experts_fscore(
        len(target_news), num_users, delta, _SIZE_EXPERTS, category)
    ExpertGroup.ci = experts.select_experts_ci(num_users, delta,
                                               _SIZE_EXPERTS, category)
    ExpertGroup.union = experts.select_all_experts(ExpertGroup.precision,
                                                   ExpertGroup.fscore,
                                                   ExpertGroup.ci)
    log('Num experts (precision): %s' % len(ExpertGroup.precision))
    log('Num experts (fscore): %s' % len(ExpertGroup.fscore))
    log('Num experts (ci): %s' % len(ExpertGroup.ci))
    log('Num all experts: %s' % len(ExpertGroup.union))

    non_experts = population.difference(ExpertGroup.union)
    log('Num non_experts: %s' % len(non_experts))
    # other_users = population.difference(all_experts).difference(
    #     common_users)

    # -- Counting --
    total_num_tweets = 0
    hour_to_num_tweets = {}
    with open('../data/FolkWisdom/time_deltas.tsv') as in_file:
      for line in in_file:
        tokens = line.split('\t')
        time_delta_in_sec = int(tokens[_TIMEDELTAS_FILE_DELTA_INDEX])
        url = tokens[_TIMEDELTAS_FILE_URL_INDEX].strip()
        user_id = tokens[_TIMEDELTAS_FILE_USER_ID_INDEX]
        if time_delta_in_sec > 0 and url in target_news:
          current_hour = time_delta_in_sec / _NUM_SEC_PER_HOUR
          total_num_tweets += 1
          if current_hour not in hour_to_num_tweets:
            hour_to_num_tweets[current_hour] = GroupCount()
          gcount = hour_to_num_tweets[current_hour]
          gcount.population += 1
          if user_id in ExpertGroup.union:
            gcount.union += 1
            if user_id in ExpertGroup.precision:
              gcount.precision += 1
            if user_id in ExpertGroup.fscore:
              gcount.fscore += 1
            if user_id in ExpertGroup.ci:
              gcount.ci += 1
          else:
            gcount.non_experts += 1
            if user_id in common_users:
              gcount.common += 1
          # print >> sys.stderr, ('Error, a user in expert union but not '
          #                       'belongs to any expert group')
          # elif user_id in common_users:
          #   gcount.common += 1
          # else:
          #   gcount.other += 1
          # if user_id in non_experts:
          #   gcount.non_experts += 1

    gcount = GroupCount()
    with open(_DATA_DIR + 'hour_thresholds_%s.tsv' % delta, 'w') as out_file:
      # Iterate hours in order so the cumulative counts (gcount.add)
      # accumulate correctly; plain dict iteration order is arbitrary here.
      for hour in sorted(hour_to_num_tweets.keys()):
        gc = hour_to_num_tweets[hour]
        gcount.add(gc)
        percentage = (gcount.population / float(total_num_tweets)) * 100.0
        percentage_common = (gcount.common / float(total_num_tweets)) * 100.0
        percentage_other = (gcount.other / float(total_num_tweets)) * 100.0
        percentage_experts = (gcount.union / float(total_num_tweets)) * 100.0
        percentage_non_experts = (gcount.non_experts
                                  / float(total_num_tweets)) * 100.0
        out_file.write(
            '%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n'
            % (hour, percentage, percentage_non_experts, percentage_experts,
               percentage_common,
               (gcount.precision / float(total_num_tweets)) * 100.0,
               (gcount.fscore / float(total_num_tweets)) * 100.0,
               (gcount.ci / float(total_num_tweets)) * 100.0))
  log('hour\tpopulation\tnon_experts\texperts\tcommon\tprecision\tfscore\tci')
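# Note: current_hour above relies on Python 2 integer (floor) division, so
# each tweet is bucketed by whole hours since the seed tweet. For example,
# assuming _NUM_SEC_PER_HOUR = 3600 (the constant is defined elsewhere in
# this module):
#
#   >>> 5400 / 3600   # Python 2: 1.5 hours falls into hour bucket 1
#   1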
def run(): """Contains the main logic for this analysis.""" FileLog.set_log_dir() seeds = Util.load_seeds() for category in _CATEGORIES: log('Preforming analysis for category: %s' % category) size_top_news = _SIZE_TOP_NEWS if category: size_top_news = .10 data_set = DataSet.TESTING retweets = set() if _SWITCHED: data_set = DataSet.TRAINING if _EXCLUDE_RETWEETS: retweets = ground_truths.find_retweets(_TESTING_SET_MONTHS) log('Num retweets to exclude: %s' % len(retweets)) gt_rankings = ground_truths.get_gt_rankings( seeds, data_set, category, exclude_tweets_within_delta=_EXCLUDE_TWEETS_WITHIN_DELTA, retweets=retweets) log('Num ground_truth_rankings: %s' % len(gt_rankings)) # Format for use later. ground_truth_url_to_rank = {} for rank, (url, count) in enumerate(gt_rankings): ground_truth_url_to_rank[url] = rank target_news = ground_truths.find_target_news(gt_rankings, size_top_news) log('Size target_news: %s' % len(target_news)) for delta in _DELTAS: run_params_str = 'd%s_t%s_e%s_%s' % (delta, int( size_top_news * 100), int(_SIZE_EXPERTS * 100), category) info_output_dir = '../graph/FolkWisdom/%s/info/' % run_params_str Util.ensure_dir_exist(info_output_dir) groups, d_num_followers = user_groups.get_all_user_groups( delta, category) log('Num experts (precision): %s' % len(groups.precision)) log('Num experts (fscore): %s' % len(groups.fscore)) log('Num experts (ci): %s' % len(groups.ci)) log('Num Super Experts: %s' % len(groups.super_experts)) log('Num Social Bias Experts: %s' % len(groups.social_bias)) log('Finding rankings with an %s hour delta.' % delta) ranks = rankings.get_rankings(delta, seeds, groups, category, d_num_followers) # Output some interesting info to file size_market_unfiltered = '0' with open('../data/FolkWisdom/size_of_market_unfiltered.txt' ) as in_file: size_market_unfiltered = in_file.readline().strip() with open( '%suser_demographics_%s.txt' % (info_output_dir, run_params_str), 'w') as output_file: output_file.write('Number of Newsaholics: %s\n' % len(groups.newsaholics)) output_file.write('Number of Active Users: %s\n' % len(groups.active_users)) output_file.write('Number of Common Users: %s\n' % len(groups.common_users)) output_file.write('\n') output_file.write('Number of Precision Experts: %s\n' % len(groups.precision)) output_file.write('Number of F-Score Experts: %s\n' % len(groups.fscore)) output_file.write('Number of CI Experts: %s\n' % len(groups.ci)) output_file.write('Number of Social Bias Experts: %s\n' % len(groups.social_bias)) output_file.write('Total number of unique experts: %s\n' % len(groups.all_experts)) output_file.write( 'Number of Precision and F-Score Experts: %s\n' % len(groups.precision.intersection(groups.fscore))) output_file.write( 'Number of Precision and CI Experts: %s\n' % len(groups.precision.intersection(groups.ci))) output_file.write('Number of F-Score and CI Experts: %s\n' % len(groups.fscore.intersection(groups.ci))) output_file.write('Number of Super Experts: %s\n' % len(groups.super_experts)) output_file.write('\n') output_file.write( 'Number of Users (Total): %s\n' % (len(groups.newsaholics) + len(groups.active_users) + len(groups.common_users))) output_file.write('Size of market (unfiltered): %s\n' % size_market_unfiltered) output_file.write('\n') # output_file.write('Number of votes by Newsaholics: %s\n' # % num_votes_newsaholics) # output_file.write('Number of votes by Market: %s\n' % num_votes_market) # output_file.write('Number of votes by Active Users: %s\n' # % num_votes_active) # output_file.write('Number of votes by 
      log('Ground Truth Top 50')
      for i in range(min(len(gt_rankings), 50)):
        url, count = gt_rankings[i]
        log('[%s] %s\t%s' % (i, url.strip(), count))
      log('-----------------------------------')
      log('Newsaholic Top 5')
      for i in range(min(len(ranks.newsaholics), 5)):
        url, count = ranks.newsaholics[i]
        log('[%s] %s\t%s' % (i, url.strip(), count))
      log('-----------------------------------')
      log('Active Top 5')
      for i in range(min(len(ranks.active_users), 5)):
        url, count = ranks.active_users[i]
        log('[%s] %s\t%s' % (i, url.strip(), count))
      log('-----------------------------------')
      log('Common Top 5')
      for i in range(min(len(ranks.common_users), 5)):
        url, count = ranks.common_users[i]
        log('[%s] %s\t%s' % (i, url.strip(), count))
      log('-----------------------------------')
      log('Non-Expert Top 5')
      for i in range(min(len(ranks.non_experts), 5)):
        url, count = ranks.non_experts[i]
        log('[%s] %s\t%s' % (i, url.strip(), count))
      log('-----------------------------------')
      log('Expert (Precision) Top 5')
      for i in range(min(len(ranks.precision), 5)):
        url, count = ranks.precision[i]
        log('[%s] %s\t%s' % (i, url.strip(), count))
      log('-----------------------------------')
      log('Expert (fscore) Top 5')
      for i in range(min(len(ranks.fscore), 5)):
        url, count = ranks.fscore[i]
        log('[%s] %s\t%s' % (i, url.strip(), count))
      log('-----------------------------------')
      log('Expert (ci) Top 5')
      for i in range(min(len(ranks.ci), 5)):
        url, count = ranks.ci[i]
        log('[%s] %s\t%s' % (i, url.strip(), count))
      log('-----------------------------------')
      log('Super Expert Top 5')
      for i in range(min(len(ranks.super_experts), 5)):
        url, count = ranks.super_experts[i]
        log('[%s] %s\t%s' % (i, url.strip(), count))
      log('-----------------------------------')
      log('Social Bias Expert Top 5')
      for i in range(min(len(ranks.social_bias), 5)):
        url, count = ranks.social_bias[i]
        log('[%s] %s\t%s' % (i, url.strip(), count))

      # Rank -> url maps for each group's ranking.
      market_rank_to_url = {}
      newsaholic_rank_to_url = {}
      active_rank_to_url = {}
      common_rank_to_url = {}
      expert_p_rank_to_url = {}
      expert_f_rank_to_url = {}
      expert_c_rank_to_url = {}
      expert_s_rank_to_url = {}
      for rank, (url, count) in enumerate(ranks.newsaholics):
        newsaholic_rank_to_url[rank] = url
      for rank, (url, count) in enumerate(ranks.population):
        market_rank_to_url[rank] = url
      for rank, (url, count) in enumerate(ranks.active_users):
        active_rank_to_url[rank] = url
      for rank, (url, count) in enumerate(ranks.common_users):
        common_rank_to_url[rank] = url
      for rank, (url, count) in enumerate(ranks.precision):
        expert_p_rank_to_url[rank] = url
      for rank, (url, count) in enumerate(ranks.fscore):
        expert_f_rank_to_url[rank] = url
      for rank, (url, count) in enumerate(ranks.ci):
        expert_c_rank_to_url[rank] = url
      for rank, (url, count) in enumerate(ranks.super_experts):
        expert_s_rank_to_url[rank] = url
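
      # Invert each ranking into a url -> rank map; the mixed model and the
      # ranking-comparison dump below need constant-time rank lookups.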
      population_url_to_rank = {}
      market_url_to_rank = {}
      precision_url_to_rank = {}
      fscore_url_to_rank = {}
      ci_url_to_rank = {}
      ci_1_url_to_rank = {}
      ci_2_url_to_rank = {}
      ci_3_url_to_rank = {}
      common_url_to_rank = {}
      for rank, (url, count) in enumerate(ranks.population):
        population_url_to_rank[url] = rank
      for rank, (url, count) in enumerate(ranks.non_experts):
        market_url_to_rank[url] = rank
      for rank, (url, count) in enumerate(ranks.precision):
        precision_url_to_rank[url] = rank
      for rank, (url, count) in enumerate(ranks.fscore):
        fscore_url_to_rank[url] = rank
      for rank, (url, count) in enumerate(ranks.ci):
        ci_url_to_rank[url] = rank
      for rank, (url, count) in enumerate(ranks.ci_1):
        ci_1_url_to_rank[url] = rank
      for rank, (url, count) in enumerate(ranks.ci_2):
        ci_2_url_to_rank[url] = rank
      for rank, (url, count) in enumerate(ranks.ci_3):
        ci_3_url_to_rank[url] = rank
      for rank, (url, count) in enumerate(ranks.common_users):
        common_url_to_rank[url] = rank

      precisions, recalls = precision_recall.get_precision_recalls(gt_rankings,
                                                                   ranks)

      mixed_rankings = mixed_model.get_mixed_rankings(
          market_url_to_rank, precisions.non_experts, precision_url_to_rank,
          precisions.precision, fscore_url_to_rank, precisions.fscore,
          ci_url_to_rank, precisions.ci, ground_truth_url_to_rank)

      mixed_inact_rankings = mixed_model.get_mixed_rankings(
          common_url_to_rank, precisions.common_users, precision_url_to_rank,
          precisions.precision, fscore_url_to_rank, precisions.fscore,
          ci_url_to_rank, precisions.ci, ground_truth_url_to_rank)

      mixed_ci_rankings = mixed_model.get_mixed_rankings(
          market_url_to_rank, precisions.non_experts, ci_1_url_to_rank,
          precisions.ci_1, ci_2_url_to_rank, precisions.ci_2, ci_3_url_to_rank,
          precisions.ci_3, ground_truth_url_to_rank)

      mixed_precisions, mixed_recalls = precision_recall.calc_precision_recall(
          gt_rankings, mixed_rankings)
      (mixed_inact_precisions,
       mixed_inact_recalls) = precision_recall.calc_precision_recall(
          gt_rankings, mixed_inact_rankings)
      (mixed_ci_precisions,
       mixed_ci_recalls) = precision_recall.calc_precision_recall(
          gt_rankings, mixed_ci_rankings)

      log('-----------------------------------')
      log('Mixed (min) Top 5')
      for i in range(min(len(mixed_rankings), 5)):
        url, count = mixed_rankings[i]
        log('[%s] %s\t%s' % (i + 1, url, count))
      log('-----------------------------------')

      with open('%sranking_comparisons_%s.tsv'
                % (info_output_dir, run_params_str), 'w') as out_file:
        for gt_rank, (gt_url, _) in enumerate(gt_rankings):
          # Rank 0 means the group did not rank the url at all.
          market_rank = 0
          precision_rank = 0
          ci_rank = 0
          fscore_rank = 0
          inactive_crowd_rank = 0
          if gt_url in market_url_to_rank:
            market_rank = market_url_to_rank[gt_url] + 1
          if gt_url in precision_url_to_rank:
            precision_rank = precision_url_to_rank[gt_url] + 1
          if gt_url in ci_url_to_rank:
            ci_rank = ci_url_to_rank[gt_url] + 1
          if gt_url in fscore_url_to_rank:
            fscore_rank = fscore_url_to_rank[gt_url] + 1
          if gt_url in common_url_to_rank:
            inactive_crowd_rank = common_url_to_rank[gt_url] + 1
          line = '%s\t%s\t%s\t%s\t%s\t%s\t%s\n' % (gt_url, gt_rank + 1,
                                                   market_rank,
                                                   inactive_crowd_rank,
                                                   precision_rank, ci_rank,
                                                   fscore_rank)
          out_file.write(line)
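
      # Dump each group's full ranking as a TSV, annotating every url with its
      # (group rank, ground-truth rank) pair for offline inspection.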
      with open('%sground_truth_rankings_%s.tsv'
                % (info_output_dir, run_params_str), 'w') as output_file:
        for url, count in gt_rankings:
          output_file.write('%s\t%s\n' % (url.strip(), count))
      with open('%smarket_rankings_%s.tsv'
                % (info_output_dir, run_params_str), 'w') as output_file:
        # NOTE: this dumps ranks.common_users, duplicating the common-user
        # file below; ranks.population may have been intended here.
        for rank, (url, count) in enumerate(ranks.common_users):
          output_file.write('%s\t%s\t(%s,%s)\n'
                            % (url.strip(), count, rank,
                               ground_truth_url_to_rank[url]))
      with open('%snewsaholic_rankings_%s.tsv'
                % (info_output_dir, run_params_str), 'w') as output_file:
        for rank, (url, count) in enumerate(ranks.newsaholics):
          output_file.write('%s\t%s\t(%s,%s)\n'
                            % (url.strip(), count, rank,
                               ground_truth_url_to_rank[url]))
      with open('%sactive_user_rankings_%s.tsv'
                % (info_output_dir, run_params_str), 'w') as output_file:
        for rank, (url, count) in enumerate(ranks.active_users):
          output_file.write('%s\t%s\t(%s,%s)\n'
                            % (url.strip(), count, rank,
                               ground_truth_url_to_rank[url]))
      with open('%scommon_user_rankings_%s.tsv'
                % (info_output_dir, run_params_str), 'w') as output_file:
        for rank, (url, count) in enumerate(ranks.common_users):
          output_file.write('%s\t%s\t(%s,%s)\n'
                            % (url.strip(), count, rank,
                               ground_truth_url_to_rank[url]))
      with open('%snonexpert_user_rankings_%s.tsv'
                % (info_output_dir, run_params_str), 'w') as output_file:
        for rank, (url, count) in enumerate(ranks.non_experts):
          output_file.write('%s\t%s\t(%s,%s)\n'
                            % (url.strip(), count, rank,
                               ground_truth_url_to_rank[url]))
      with open('%sexpert_p_user_rankings_%s.tsv'
                % (info_output_dir, run_params_str), 'w') as output_file:
        for rank, (url, count) in enumerate(ranks.precision):
          output_file.write('%s\t%s\t(%s,%s)\n'
                            % (url.strip(), count, rank,
                               ground_truth_url_to_rank[url]))
      with open('%sexpert_f_user_rankings_%s.tsv'
                % (info_output_dir, run_params_str), 'w') as output_file:
        for rank, (url, count) in enumerate(ranks.fscore):
          output_file.write('%s\t%s\t(%s,%s)\n'
                            % (url.strip(), count, rank,
                               ground_truth_url_to_rank[url]))
      with open('%sexpert_c_user_rankings_%s.tsv'
                % (info_output_dir, run_params_str), 'w') as output_file:
        for rank, (url, count) in enumerate(ranks.ci):
          output_file.write('%s\t%s\t(%s,%s)\n'
                            % (url.strip(), count, rank,
                               ground_truth_url_to_rank[url]))
      with open('%sexpert_s_user_rankings_%s.tsv'
                % (info_output_dir, run_params_str), 'w') as output_file:
        for rank, (url, count) in enumerate(ranks.super_experts):
          output_file.write('%s\t%s\t(%s,%s)\n'
                            % (url.strip(), count, rank,
                               ground_truth_url_to_rank[url]))
      with open('%smixed_rankings_%s.tsv'
                % (info_output_dir, run_params_str), 'w') as output_file:
        for rank, (url, count) in enumerate(mixed_rankings):
          output_file.write('%s\t%s\t(%s,%s)\n'
                            % (url.strip(), count, rank,
                               ground_truth_url_to_rank[url]))

      with open('../data/FolkWisdom/market_precisions_%s.txt'
                % run_params_str, 'w') as out_file:
        for precision in precisions.common_users:
          out_file.write('%s\n' % precision)
      with open('../data/FolkWisdom/nonexpert_precisions_%s.txt'
                % run_params_str, 'w') as out_file:
        for precision in precisions.non_experts:
          out_file.write('%s\n' % precision)
      with open('../data/FolkWisdom/expert_p_precisions_%s.txt'
                % run_params_str, 'w') as out_file:
        for precision in precisions.precision:
          out_file.write('%s\n' % precision)
      with open('../data/FolkWisdom/expert_f_precisions_%s.txt'
                % run_params_str, 'w') as out_file:
        for precision in precisions.fscore:
          out_file.write('%s\n' % precision)
      with open('../data/FolkWisdom/expert_c_precisions_%s.txt'
                % run_params_str, 'w') as out_file:
        for precision in precisions.ci:
          out_file.write('%s\n' % precision)
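
      # Plot precision-recall curves for every grouping; each draw call
      # produces one figure, named by its string tag, under the run's
      # graph directory.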
      log('Drawing summary precision-recall graphs...')
      # draw_precision_recall_graph(market_precisions, market_recalls,
      precision_recall.draw(
          [precisions.newsaholics, precisions.active_users,
           precisions.common_users, precisions.precision, precisions.fscore,
           precisions.ci, precisions.super_experts],
          [recalls.newsaholics, recalls.active_users, recalls.common_users,
           recalls.precision, recalls.fscore, recalls.ci,
           recalls.super_experts],
          ['Newsaholics', 'Active', 'Common', 'Precision', 'F-score', 'CI',
           'Super Experts'],
          'precision_recall_all', run_params_str)

      # Draw via old method because it has fancy markings.
      experts.draw_precision_recall_experts(
          precisions.non_experts, recalls.non_experts,
          precisions.precision, recalls.precision,
          precisions.fscore, recalls.fscore,
          precisions.ci, recalls.ci, run_params_str)
      log('Drawing experts precision-recall graph...')
      # precision_recall.draw_with_markers(
      #     [precisions.population, precisions.non_experts,
      #      precisions.precision, precisions.fscore, precisions.ci],
      #     [recalls.population, recalls.non_experts, recalls.precision,
      #      recalls.fscore, recalls.ci],
      #     ['Population', 'Crowd', 'Precision', 'F-score', 'CI'],
      #     'precision_recall_experts', 0, run_params_str)

      log('Drawing mixed + inact graph...')
      precision_recall.draw_with_markers(
          [precisions.non_experts, precisions.common_users,
           mixed_inact_precisions],
          [recalls.non_experts, recalls.common_users, mixed_inact_recalls],
          ['Crowd', 'Inactive Crowd', 'Mixed + Inactive'],
          'precision_recall_mixed_and_inactive', 3, run_params_str, zoom=True)

      log('Drawing ci breakdown by followers precision-recall graph...')
      precision_recall.draw(
          [precisions.non_experts, precisions.ci, precisions.ci_hi,
           precisions.ci_li],
          [recalls.non_experts, recalls.ci, recalls.ci_hi, recalls.ci_li],
          ['Crowd', 'CI', 'CI High', 'CI Low'],
          'precision_recall_ci_followers_breakdown', run_params_str)

      log('Drawing social bias precision-recall graph...')
      precision_recall.draw(
          [precisions.non_experts, precisions.social_bias,
           precisions.precision, precisions.fscore, precisions.ci],
          [recalls.non_experts, recalls.social_bias, recalls.precision,
           recalls.fscore, recalls.ci],
          ['Crowd', 'Influence Experts', 'Precision', 'F-score', 'CI'],
          'precision_recall_social_bias', run_params_str)

      log('Drawing basic groups precision-recall graph...')
      precision_recall.draw(
          [precisions.newsaholics, precisions.active_users,
           precisions.common_users],
          [recalls.newsaholics, recalls.active_users, recalls.common_users],
          ['Newsaholics', 'Active Users', 'Common Users'],
          'precision_recall_basic_groups', run_params_str)

      log('Drawing crowd def precision-recall graph...')
      precision_recall.draw(
          [precisions.non_experts, precisions.common_users],
          [recalls.non_experts, recalls.common_users],
          ['Crowd', 'Inactive Crowd'],
          'precision_recall_crowd_def', run_params_str, zoom=True)

      log('Drawing non_expert_sampling precision-recall graph...')
      precision_recall.draw_with_markers(
          [precisions.non_experts, precisions.non_experts_sampled,
           precisions.non_experts_10, precisions.non_experts_25,
           precisions.non_experts_1, precisions.ci],
          [recalls.non_experts, recalls.non_experts_sampled,
           recalls.non_experts_10, recalls.non_experts_25,
           recalls.non_experts_1, recalls.ci],
          ['Crowd', 'Crowd (33% sample)', 'Crowd (10% sample)',
           'Crowd (5% sample)', 'Crowd (2% sample)', 'Experts (CI)'],
          'precision_recall_non_expert_sampling', 3, run_params_str, ncol=2)

      # TODO: Replace with new method.
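      # Mixed-model and weighting variants, each compared against the plain
      # crowd baseline in the graphs below.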
      log('Drawing mixed model precision-recall graph...')
      mixed_model.draw_precision_recall_mixed(
          precisions.non_experts, recalls.non_experts,
          mixed_precisions, mixed_recalls, run_params_str, zoom=True)

      log('Drawing mixed ci model precision-recall graph...')
      precision_recall.draw(
          [precisions.non_experts, mixed_ci_precisions],
          [recalls.non_experts, mixed_ci_recalls],
          ['Crowd', 'Mixed'],
          'precision_recall_mixed_ci', run_params_str)

      log('Drawing weighted followers precision-recall graph...')
      precision_recall.draw(
          [precisions.non_experts, precisions.weighted_followers,
           precisions.ci],
          [recalls.non_experts, recalls.weighted_followers, recalls.ci],
          ['Crowd', 'Weighted Followers', 'CI'],
          'precision_recall_weighted_followers', run_params_str)

      log('Drawing ci weighted graph...')
      precision_recall.draw(
          [precisions.population, precisions.ci, precisions.ci_weighted],
          [recalls.population, recalls.ci, recalls.ci_weighted],
          ['Crowd', 'CI', 'CI (Weighted)'],
          'precision_recall_ci_weighted', run_params_str)

      log('Drawing weighted graph...')
      precision_recall.draw(
          [precisions.population, precisions.weighted],
          [recalls.population, recalls.weighted],
          ['Crowd', 'Crowd (Weighted)'],
          'precision_recall_weighted', run_params_str)

      log('Drawing weighted both graph...')
      precision_recall.draw(
          [precisions.population, precisions.weighted,
           precisions.weighted_both],
          [recalls.population, recalls.weighted, recalls.weighted_both],
          ['Crowd', 'Crowd (Weighted)', 'Crowd (Weighted Both)'],
          'precision_recall_weighted_both', run_params_str)
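

# Minimal entry point: a sketch assuming this script is invoked directly like
# the other analysis scripts in this package (hypothetical guard; drop it if
# the pipeline already calls run() from elsewhere).
if __name__ == '__main__':
  run()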