Example #1
def run():
  """Plots precision vs. follower count and lists high-precision experts."""
  users = crawl_users.load_user_info()
  groups = user_groups.get_all_user_groups()
  user_id_to_precision = get_user_precisions()

  num_followers = []
  precision_scores = []
  awesome_people = []
  missing = 0
  awesome_people_votes = 0
  for user in users.values():
    if user.id in groups.all_experts and user.id in user_id_to_precision:
      num_followers.append(user.followers_count)
      precision, num_tweets = user_id_to_precision[user.id]
      precision_scores.append(precision)
      if precision >= .5 and num_tweets > 7 and user.followers_count > 1254:
        awesome_people.append(user)
        awesome_people_votes += num_tweets
    else:
      missing += 1

  for user in awesome_people:
    print(user.screen_name)
  print('Number of awesome people: %s' % len(awesome_people))
  print('Number of votes by awesome people: %s' % awesome_people_votes)
  print('Missing: %s' % missing)

  draw(num_followers, precision_scores)
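The draw helper called above is not defined in this listing. A minimal sketch of what it might look like, assuming a plain matplotlib scatter of precision against follower count (only the name and argument order come from the call site; the body and output path are illustrative):

import matplotlib.pyplot as plt


def draw(num_followers, precision_scores):
  """Hypothetical sketch: scatter precision against follower count."""
  plt.figure()
  plt.scatter(num_followers, precision_scores, alpha=0.5)
  plt.xscale('log')  # follower counts typically span several orders of magnitude
  plt.xlabel('Number of followers')
  plt.ylabel('Precision')
  plt.savefig('precision_vs_followers.png')  # illustrative output path
  plt.close()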
def run():
    """Main logic for this analysis."""
    if _MAIN_ANALYSIS:
        seeds = Util.load_seeds()
        gt_ranks = ground_truths.get_gt_rankings(seeds, DataSet.ALL)
        target_news = ground_truths.find_target_news(gt_ranks, _SIZE_TOP_NEWS)
        for delta in _DELTAS:
            log('Performing analysis for delta %s' % delta)
            param_str = 'd%s' % delta
            Util.ensure_dir_exist(_GRAPH_DIR + '%s/' % param_str)
            Util.ensure_dir_exist(_GRAPH_DIR + '%s/info/' % param_str)

            (counts, news_nyt_participant, news_nyt_not_participant,
             when_nyt_tweeted) = find_counts(target_news, delta)
            agg_counts = aggregate_counts(counts, delta)

            with open(_GRAPH_DIR + '%s/info/stats.txt' % param_str,
                      'w') as out_file:
                out_file.write('Num stories total: %s\n' % len(target_news))
                out_file.write('Num NYT Participant: %s\n' %
                               len(news_nyt_participant))
                out_file.write('Num NYT Not Participant: %s\n' %
                               len(news_nyt_not_participant))

            with open(_GRAPH_DIR + '%s/info/legend.tsv' % param_str,
                      'w') as out_file:
                for i in range(
                        min(
                            50,
                            min(len(news_nyt_participant),
                                len(news_nyt_not_participant)))):
                    log('Outputting graph %s...' % i)
                    url_nyt = news_nyt_participant.pop()
                    url_not_nyt = news_nyt_not_participant.pop()
                    nyt_tweeted_min = when_nyt_tweeted[url_nyt]
                    out_file.write('%s\t%s\t%s\n' % (i, url_nyt, url_not_nyt))
                    draw_graph(agg_counts[url_nyt], agg_counts[url_not_nyt],
                               (nyt_tweeted_min,
                                agg_counts[url_nyt][nyt_tweeted_min]), i,
                               param_str)

    if _SECONDARY_ANALYSIS:
        url_str = ('http://thelede.blogs.nytimes.com/2011/08/21/'
                   'latest-updates-on-the-battle-for-tripoli/')
        user_info = crawl_users.load_user_info()
        for url, delta, legend_num in [(url_str, 8, 28)]:
            additional_info = find_additional_info(url, user_info, delta)
            log('Outputting additional info to disk...')
            with open(_GRAPH_DIR + 'additional_info_%s_%s.tsv' %
                      (delta, legend_num), 'w') as out_file:
                for user_id, (num_followers, screen_name,
                              minutes) in additional_info:
                    out_file.write(
                        '%s\t%s\t%s\t%s\n' %
                        (user_id, screen_name, num_followers, minutes))

    log('Analysis complete!')
def run():
  """Main logic for this analysis."""
  if _MAIN_ANALYSIS:
    seeds = Util.load_seeds()
    gt_ranks = ground_truths.get_gt_rankings(seeds, DataSet.ALL)
    target_news = ground_truths.find_target_news(gt_ranks, _SIZE_TOP_NEWS)
    for delta in _DELTAS:
      log('Performing analysis for delta %s' % delta)
      param_str = 'd%s' % delta
      Util.ensure_dir_exist(_GRAPH_DIR + '%s/' % param_str)
      Util.ensure_dir_exist(_GRAPH_DIR + '%s/info/' % param_str)

      (counts, news_nyt_participant,
       news_nyt_not_participant, when_nyt_tweeted) = find_counts(target_news,
                                                                 delta)
      agg_counts = aggregate_counts(counts, delta)

      with open(_GRAPH_DIR + '%s/info/stats.txt' % param_str, 'w') as out_file:
        out_file.write('Num stories total: %s\n' % len(target_news))
        out_file.write('Num NYT Participant: %s\n' % len(news_nyt_participant))
        out_file.write('Num NYT Not Participant: %s\n'
                       % len(news_nyt_not_participant))

      with open(_GRAPH_DIR + '%s/info/legend.tsv' % param_str, 'w') as out_file:
        for i in range(min(50, min(len(news_nyt_participant),
                                   len(news_nyt_not_participant)))):
          log('Outputting graph %s...' % i)
          url_nyt = news_nyt_participant.pop()
          url_not_nyt = news_nyt_not_participant.pop()
          nyt_tweeted_min = when_nyt_tweeted[url_nyt]
          out_file.write('%s\t%s\t%s\n' % (i, url_nyt, url_not_nyt))
          draw_graph(agg_counts[url_nyt], agg_counts[url_not_nyt],
                     (nyt_tweeted_min, agg_counts[url_nyt][nyt_tweeted_min]), i,
                     param_str)

  if _SECONDARY_ANALYSIS:
    url_str = ('http://thelede.blogs.nytimes.com/2011/08/21/'
               'latest-updates-on-the-battle-for-tripoli/')
    user_info = crawl_users.load_user_info()
    for url, delta, legend_num in [(url_str, 8, 28)]:
      additional_info = find_additional_info(url, user_info, delta)
      log('Outputting additional info to disk...')
      with open(_GRAPH_DIR + 'additional_info_%s_%s.tsv' % (delta, legend_num),
                'w') as out_file:
        for user_id, (num_followers, screen_name, minutes) in additional_info:
          out_file.write('%s\t%s\t%s\t%s\n' % (user_id, screen_name,
                                               num_followers, minutes))

  log('Analysis complete!')
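Util.ensure_dir_exist and log are used throughout run() but are not part of this listing. A minimal sketch of standard-library stand-ins they could plausibly map to (the names come from the call sites; the bodies are assumptions):

import os
import sys
from datetime import datetime


def ensure_dir_exist(directory):
  """Hypothetical sketch: create directory (and missing parents) if absent."""
  if not os.path.exists(directory):
    os.makedirs(directory)


def log(message):
  """Hypothetical sketch: write a timestamped progress message to stdout."""
  sys.stdout.write('[%s] %s\n' % (datetime.now(), message))
  sys.stdout.flush()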
def find_users_to_crawl():
  """Selects which user ids still need to be crawled."""
  user_info = crawl_users.load_user_info()
  user_ids_already_crawled = user_info.keys()
  bad_users = load_bad_users()
  experts, newsaholics, active_users, common_users = get_user_groups(_DELTA)

  sample_size = round(len(newsaholics.union(active_users).union(common_users)) * _SAMPLE_SIZE)

  newsaholics_sample = sample_user_group(newsaholics, sample_size)
  active_users_sample = sample_user_group(active_users, sample_size)
  common_users_sample = sample_user_group(common_users, sample_size)

  users_to_crawl = experts.union(newsaholics_sample).union(active_users_sample).union(common_users_sample)
  users_to_crawl = users_to_crawl.difference(bad_users).difference(user_ids_already_crawled)

  return users_to_crawl
def find_users_to_crawl():
    """Selects which user ids still need to be crawled."""
    user_info = crawl_users.load_user_info()
    user_ids_already_crawled = user_info.keys()
    bad_users = load_bad_users()
    experts, newsaholics, active_users, common_users = get_user_groups(_DELTA)

    sample_size = round(
        len(newsaholics.union(active_users).union(common_users)) *
        _SAMPLE_SIZE)

    newsaholics_sample = sample_user_group(newsaholics, sample_size)
    active_users_sample = sample_user_group(active_users, sample_size)
    common_users_sample = sample_user_group(common_users, sample_size)

    users_to_crawl = experts.union(newsaholics_sample).union(
        active_users_sample).union(common_users_sample)
    users_to_crawl = users_to_crawl.difference(bad_users).difference(
        user_ids_already_crawled)

    return users_to_crawl
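sample_user_group is also not shown here. A plausible sketch, assuming it draws a uniform random sample without replacement, capped at the size of the group (only the name and arguments come from the call sites; the body is an assumption):

import random


def sample_user_group(user_group, sample_size):
    """Hypothetical sketch: up to sample_size user ids drawn uniformly from user_group."""
    num_to_draw = min(int(sample_size), len(user_group))
    return set(random.sample(sorted(user_group), num_to_draw))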