def run():
    """Summarize precision stats for expert users and plot followers vs. precision.

    Loads crawled user info, the expert user groups, and per-user precision
    scores, then:
      * collects followers_count / precision for every expert with a score,
      * prints the "awesome people" (high precision, enough tweets, many
        followers) and summary counts,
      * draws a followers-vs-precision graph via draw().

    Fix: the original used Python-2-only `print` statements. Every print here
    takes a single expression, so the parenthesized form below behaves
    identically on Python 2 and is also valid Python 3.
    """
    users = crawl_users.load_user_info()
    groups = user_groups.get_all_user_groups()
    user_id_to_precision = get_user_precisions()
    num_followers = []
    precision_scores = []
    awesome_people = []
    missing = 0
    awesome_people_votes = 0
    for user in users.values():
        # Only experts with a computed precision contribute data points.
        if user.id in groups.all_experts and user.id in user_id_to_precision:
            num_followers.append(user.followers_count)
            precision, num_tweets = user_id_to_precision[user.id]
            precision_scores.append(precision)
            # Thresholds (>= .5 precision, > 7 scored tweets, > 1254
            # followers) are hand-tuned magic numbers — TODO confirm intent.
            if precision >= .5 and num_tweets > 7 and user.followers_count > 1254:
                awesome_people.append(user)
                awesome_people_votes += num_tweets
        else:
            missing += 1
    for user in awesome_people:
        print(user.screen_name)
    print('Number of awesome people: %s' % len(awesome_people))
    print('Number of votes awesome people: %s' % awesome_people_votes)
    print('Missing: %s' % missing)
    draw(num_followers, precision_scores)
def run():
    """Main logic for this analysis."""
    if _MAIN_ANALYSIS:
        seeds = Util.load_seeds()
        gt_ranks = ground_truths.get_gt_rankings(seeds, DataSet.ALL)
        target_news = ground_truths.find_target_news(gt_ranks, _SIZE_TOP_NEWS)
        for delta in _DELTAS:
            log('Performing analysis for delta %s' % delta)
            param_str = 'd%s' % delta
            # All per-delta output lives under <_GRAPH_DIR>/d<delta>/.
            base_dir = _GRAPH_DIR + '%s/' % param_str
            Util.ensure_dir_exist(base_dir)
            Util.ensure_dir_exist(base_dir + 'info/')
            (counts, news_nyt_participant, news_nyt_not_participant,
             when_nyt_tweeted) = find_counts(target_news, delta)
            agg_counts = aggregate_counts(counts, delta)
            with open(base_dir + 'info/stats.txt', 'w') as stats_file:
                stats_file.write('Num stories total: %s\n' % len(target_news))
                stats_file.write('Num NYT Participant: %s\n'
                                 % len(news_nyt_participant))
                stats_file.write('Num NYT Not Participant: %s\n'
                                 % len(news_nyt_not_participant))
            # Graph at most 50 story pairs, limited by the smaller group.
            num_graphs = min(50, len(news_nyt_participant),
                             len(news_nyt_not_participant))
            with open(base_dir + 'info/legend.tsv', 'w') as legend_file:
                for graph_num in range(num_graphs):
                    log('Outputting graph %s...' % graph_num)
                    url_nyt = news_nyt_participant.pop()
                    url_not_nyt = news_nyt_not_participant.pop()
                    nyt_tweeted_min = when_nyt_tweeted[url_nyt]
                    legend_file.write('%s\t%s\t%s\n'
                                      % (graph_num, url_nyt, url_not_nyt))
                    draw_graph(agg_counts[url_nyt], agg_counts[url_not_nyt],
                               (nyt_tweeted_min,
                                agg_counts[url_nyt][nyt_tweeted_min]),
                               graph_num, param_str)
    if _SECONDARY_ANALYSIS:
        # One hand-picked story, with delta / legend number chosen to match
        # a graph produced by the main analysis.
        url_str = ('http://thelede.blogs.nytimes.com/2011/08/21/'
                   'latest-updates-on-the-battle-for-tripoli/')
        user_info = crawl_users.load_user_info()
        for url, delta, legend_num in [(url_str, 8, 28)]:
            additional_info = find_additional_info(url, user_info, delta)
            log('Outputting additional info to disk...')
            out_path = (_GRAPH_DIR
                        + 'additional_info_%s_%s.tsv' % (delta, legend_num))
            with open(out_path, 'w') as info_file:
                for user_id, (num_followers, screen_name,
                              minutes) in additional_info:
                    info_file.write('%s\t%s\t%s\t%s\n'
                                    % (user_id, screen_name, num_followers,
                                       minutes))
    log('Analysis complete!')
def run():
    """Main logic for this analysis."""
    if _MAIN_ANALYSIS:
        _do_main_analysis()
    if _SECONDARY_ANALYSIS:
        _do_secondary_analysis()
    log('Analysis complete!')


def _do_main_analysis():
    """Writes per-delta stats, legend, and story graphs for target news."""
    seeds = Util.load_seeds()
    gt_ranks = ground_truths.get_gt_rankings(seeds, DataSet.ALL)
    target_news = ground_truths.find_target_news(gt_ranks, _SIZE_TOP_NEWS)
    for delta in _DELTAS:
        log('Performing analysis for delta %s' % delta)
        param_str = 'd%s' % delta
        Util.ensure_dir_exist(_GRAPH_DIR + '%s/' % param_str)
        Util.ensure_dir_exist(_GRAPH_DIR + '%s/info/' % param_str)
        (counts, news_nyt_participant, news_nyt_not_participant,
         when_nyt_tweeted) = find_counts(target_news, delta)
        agg_counts = aggregate_counts(counts, delta)
        with open(_GRAPH_DIR + '%s/info/stats.txt' % param_str,
                  'w') as out_file:
            out_file.write('Num stories total: %s\n' % len(target_news))
            out_file.write('Num NYT Participant: %s\n'
                           % len(news_nyt_participant))
            out_file.write('Num NYT Not Participant: %s\n'
                           % len(news_nyt_not_participant))
        with open(_GRAPH_DIR + '%s/info/legend.tsv' % param_str,
                  'w') as out_file:
            # Cap at 50 graphs; also bounded by the smaller of the two groups.
            for i in range(min(50, len(news_nyt_participant),
                               len(news_nyt_not_participant))):
                log('Outputting graph %s...' % i)
                url_nyt = news_nyt_participant.pop()
                url_not_nyt = news_nyt_not_participant.pop()
                nyt_tweeted_min = when_nyt_tweeted[url_nyt]
                out_file.write('%s\t%s\t%s\n' % (i, url_nyt, url_not_nyt))
                draw_graph(agg_counts[url_nyt], agg_counts[url_not_nyt],
                           (nyt_tweeted_min,
                            agg_counts[url_nyt][nyt_tweeted_min]),
                           i, param_str)


def _do_secondary_analysis():
    """Dumps follower / screen-name / minute info for one hand-picked story."""
    url_str = ('http://thelede.blogs.nytimes.com/2011/08/21/'
               'latest-updates-on-the-battle-for-tripoli/')
    user_info = crawl_users.load_user_info()
    for url, delta, legend_num in [(url_str, 8, 28)]:
        additional_info = find_additional_info(url, user_info, delta)
        log('Outputting additional info to disk...')
        with open(_GRAPH_DIR + 'additional_info_%s_%s.tsv'
                  % (delta, legend_num), 'w') as out_file:
            for user_id, (num_followers, screen_name,
                          minutes) in additional_info:
                out_file.write('%s\t%s\t%s\t%s\n'
                               % (user_id, screen_name, num_followers,
                                  minutes))
def find_users_to_crawl():
    """Return the set of user ids that still need to be crawled.

    Starts from all experts plus equal-sized samples of the other three
    groups, then removes known-bad users and anyone already crawled.
    """
    already_crawled = crawl_users.load_user_info().keys()
    bad_users = load_bad_users()
    experts, newsaholics, active_users, common_users = get_user_groups(_DELTA)
    # Sample size is a fixed fraction of the combined non-expert population.
    non_experts = newsaholics.union(active_users).union(common_users)
    sample_size = round(len(non_experts) * _SAMPLE_SIZE)
    samples = [sample_user_group(group, sample_size)
               for group in (newsaholics, active_users, common_users)]
    users_to_crawl = experts.union(*samples)
    return users_to_crawl.difference(bad_users).difference(already_crawled)
def find_users_to_crawl():
    """Determine which users we should crawl next.

    Takes every expert and a fixed-fraction sample from each of the
    newsaholic / active / common groups, then filters out bad users and
    users whose info is already on disk.
    """
    user_info = crawl_users.load_user_info()
    crawled_ids = user_info.keys()
    bad_users = load_bad_users()
    experts, newsaholics, active_users, common_users = get_user_groups(_DELTA)
    # One shared sample size, derived from the pooled non-expert groups.
    sampling_pool = newsaholics.union(active_users, common_users)
    sample_size = round(len(sampling_pool) * _SAMPLE_SIZE)
    users_to_crawl = experts
    for group in (newsaholics, active_users, common_users):
        users_to_crawl = users_to_crawl.union(
            sample_user_group(group, sample_size))
    return users_to_crawl.difference(bad_users).difference(crawled_ids)