def get_user_groups(delta, category=None):
  seeds = Util.load_seeds()

  log('Finding basic user groups for delta %s and category %s...' % (delta, category))
  (num_users, newsaholics, active_users, common_users) = basic_groups.group_users(delta, category)

  log('Finding precision experts for delta %s and category %s...' % (delta, category))
  experts_p = experts.select_experts_precision(newsaholics.union(active_users),
                                               num_users, delta, .02, category)

  log('Finding ground truths...')
  gt_rankings = ground_truths.get_gt_rankings(seeds, DataSet.TESTING, category)
  log('Finding target news...')
  target_news = ground_truths.find_target_news(gt_rankings, .02)
  size_target_news = len(target_news)

  log('Finding fscore experts for delta %s and category %s...' % (delta, category))
  experts_f = experts.select_experts_fscore(size_target_news, num_users,
                                            delta, .02, category)

  log('Finding ci experts for delta %s and category %s...' % (delta, category))
  experts_ci = experts.select_experts_ci(num_users, delta, .02, category)

  experts_all = experts_p.union(experts_f).union(experts_ci)

  return experts_all, newsaholics, active_users, common_users
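A minimal usage sketch; the 4-hour delta and 'world' category below are illustrative values, not taken from the project:

# Hypothetical call: get_user_groups returns the union of the three
# expert selections plus the activity-based user groups.
experts_all, newsaholics, active_users, common_users = get_user_groups(
    4, category='world')
log('Num experts (all): %s' % len(experts_all))
log('Num common users: %s' % len(common_users))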
def get_user_groups(delta, category=None):
    seeds = Util.load_seeds()

    log('Finding basic user groups for delta %s and category %s...' %
        (delta, category))
    (num_users, newsaholics, active_users,
     common_users) = basic_groups.group_users(delta, category)

    log('Finding precision experts for delta %s and category %s...' %
        (delta, category))
    experts_p = experts.select_experts_precision(
        newsaholics.union(active_users), num_users, delta, .02, category)

    log('Finding ground truths...')
    gt_rankings = ground_truths.get_gt_rankings(seeds, DataSet.TESTING,
                                                category)
    log('Finding target news...')
    target_news = ground_truths.find_target_news(gt_rankings, .02)
    size_target_news = len(target_news)

    log('Finding fscore experts for delta %s and category %s...' %
        (delta, category))
    experts_f = experts.select_experts_fscore(size_target_news, num_users,
                                              delta, .02, category)

    log('Finding ci experts for delta %s and category %s...' %
        (delta, category))
    experts_ci = experts.select_experts_ci(num_users, delta, .02, category)

    experts_all = experts_p.union(experts_f).union(experts_ci)

    return experts_all, newsaholics, active_users, common_users
def run():
    """Main logic for this analysis."""
    if _MAIN_ANALYSIS:
        seeds = Util.load_seeds()
        gt_ranks = ground_truths.get_gt_rankings(seeds, DataSet.ALL)
        target_news = ground_truths.find_target_news(gt_ranks, _SIZE_TOP_NEWS)
        for delta in _DELTAS:
            log('Performing analysis for delta %s' % delta)
            param_str = 'd%s' % delta
            Util.ensure_dir_exist(_GRAPH_DIR + '%s/' % param_str)
            Util.ensure_dir_exist(_GRAPH_DIR + '%s/info/' % param_str)

            (counts, news_nyt_participant, news_nyt_not_participant,
             when_nyt_tweeted) = find_counts(target_news, delta)
            agg_counts = aggregate_counts(counts, delta)

            with open(_GRAPH_DIR + '%s/info/stats.txt' % param_str,
                      'w') as out_file:
                out_file.write('Num stories total: %s\n' % len(target_news))
                out_file.write('Num NYT Participant: %s\n' %
                               len(news_nyt_participant))
                out_file.write('Num NYT Not Participant: %s\n' %
                               len(news_nyt_not_participant))

            with open(_GRAPH_DIR + '%s/info/legend.tsv' % param_str,
                      'w') as out_file:
                for i in range(min(50, len(news_nyt_participant),
                                   len(news_nyt_not_participant))):
                    log('Outputting graph %s...' % i)
                    url_nyt = news_nyt_participant.pop()
                    url_not_nyt = news_nyt_not_participant.pop()
                    nyt_tweeted_min = when_nyt_tweeted[url_nyt]
                    out_file.write('%s\t%s\t%s\n' % (i, url_nyt, url_not_nyt))
                    draw_graph(agg_counts[url_nyt], agg_counts[url_not_nyt],
                               (nyt_tweeted_min,
                                agg_counts[url_nyt][nyt_tweeted_min]), i,
                               param_str)

    if _SECONDARY_ANALYSIS:
        url_str = ('http://thelede.blogs.nytimes.com/2011/08/21/'
                   'latest-updates-on-the-battle-for-tripoli/')
        user_info = crawl_users.load_user_info()
        for url, delta, legend_num in [(url_str, 8, 28)]:
            additional_info = find_additional_info(url, user_info, delta)
            log('Outputting additional info to disk...')
            with open(
                    _GRAPH_DIR + 'additional_info_%s_%s.tsv' %
                (delta, legend_num), 'w') as out_file:
                for user_id, (num_followers, screen_name,
                              minutes) in additional_info:
                    out_file.write(
                        '%s\t%s\t%s\t%s\n' %
                        (user_id, screen_name, num_followers, minutes))

    log('Analysis complete!')
def run():
  """Main logic for this analysis."""
  if _MAIN_ANALYSIS:
    seeds = Util.load_seeds()
    gt_ranks = ground_truths.get_gt_rankings(seeds, DataSet.ALL)
    target_news = ground_truths.find_target_news(gt_ranks, _SIZE_TOP_NEWS)
    for delta in _DELTAS:
      log('Performing analysis for delta %s' % delta)
      param_str = 'd%s' % delta
      Util.ensure_dir_exist(_GRAPH_DIR + '%s/' % param_str)
      Util.ensure_dir_exist(_GRAPH_DIR + '%s/info/' % param_str)

      (counts, news_nyt_participant,
       news_nyt_not_participant, when_nyt_tweeted) = find_counts(target_news,
                                                                 delta)
      agg_counts = aggregate_counts(counts, delta)

      with open(_GRAPH_DIR + '%s/info/stats.txt' % param_str, 'w') as out_file:
        out_file.write('Num stories total: %s\n' % len(target_news))
        out_file.write('Num NYT Participant: %s\n' % len(news_nyt_participant))
        out_file.write('Num NYT Not Participant: %s\n'
                       % len(news_nyt_not_participant))

      with open(_GRAPH_DIR + '%s/info/legend.tsv' % param_str, 'w') as out_file:
        for i in range(min(50, len(news_nyt_participant),
                               len(news_nyt_not_participant))):
          log('Outputting graph %s...' % i)
          url_nyt = news_nyt_participant.pop()
          url_not_nyt = news_nyt_not_participant.pop()
          nyt_tweeted_min = when_nyt_tweeted[url_nyt]
          out_file.write('%s\t%s\t%s\n' % (i, url_nyt, url_not_nyt))
          draw_graph(agg_counts[url_nyt], agg_counts[url_not_nyt],
                     (nyt_tweeted_min, agg_counts[url_nyt][nyt_tweeted_min]), i,
                     param_str)

  if _SECONDARY_ANALYSIS:
    url_str = ('http://thelede.blogs.nytimes.com/2011/08/21/'
               'latest-updates-on-the-battle-for-tripoli/')
    user_info = crawl_users.load_user_info()
    for url, delta, legend_num in [(url_str, 8, 28)]:
      additional_info = find_additional_info(url, user_info, delta)
      log('Outputting additional info to disk...')
      with open(_GRAPH_DIR + 'additional_info_%s_%s.tsv' % (delta, legend_num),
                'w') as out_file:
        for user_id, (num_followers, screen_name, minutes) in additional_info:
          out_file.write('%s\t%s\t%s\t%s\n' % (user_id, screen_name,
                                               num_followers, minutes))

  log('Analysis complete!')
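draw_graph is defined elsewhere in the project; below is a plausible matplotlib sketch, assuming each agg_counts value is a per-minute count list (an assumption, not the project's actual implementation):

def draw_graph(counts_nyt, counts_not_nyt, nyt_annotation, legend_num,
               param_str):
  """Hypothetical sketch: plot both aggregate series and mark the minute
  at which @nytimes tweeted. Assumes each series is indexed by minute."""
  import matplotlib.pyplot as plt
  plt.figure()
  plt.plot(counts_nyt, label='NYT participant')
  plt.plot(counts_not_nyt, label='NYT not participant')
  minute, count = nyt_annotation
  plt.annotate('@nytimes', xy=(minute, count))
  plt.xlabel('Minutes since seed tweet')
  plt.ylabel('Aggregate tweet count')
  plt.legend()
  plt.savefig(_GRAPH_DIR + '%s/graph_%s.png' % (param_str, legend_num))
  plt.close()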
Example #5
def run():
  """Main logic. Outputs data in format for further analysis."""
  global _OUT_DIR
  cache = Util.load_cache()
  seeds = Util.load_seeds()

  # Set up params appropriately.
  data_set = DataSet.TRAINING
  months = _TRAINING_SET_MONTHS
  if _SWITCHED:
    data_set = DataSet.TESTING
    months = _TESTING_SET_MONTHS
    _OUT_DIR += 'switched/'
  retweets = set()
  if _EXCLUDE_RETWEETS:
    retweets = ground_truths.find_retweets(months)
    _OUT_DIR += 'no_retweets/'

  Util.ensure_dir_exist(_OUT_DIR)
  log('Output dir: %s' % _OUT_DIR)

  for delta in _DELTAS:
    for category in _CATEGORIES:
      gt_rankings = ground_truths.get_gt_rankings(
          seeds, data_set, category,
          exclude_tweets_within_delta=_EXCLUDE_TWEETS_WITHIN_DELTA,
          retweets=retweets)
      sort_users_by_tweet_count(months, seeds, cache, delta, category)
      target_news = ground_truths.find_target_news(gt_rankings, _SIZE_TOP_NEWS)
      find_hits_and_mises(months, target_news, seeds, cache, delta, category)
#      if _SWITCHED:
#        gt_rankings = ground_truths.get_gt_rankings(seeds, DataSet.TESTING,
#                                                    category)
#        sort_users_by_tweet_count(_TESTING_SET_MONTHS, seeds, cache,
#                                  delta, category)
#        target_news = ground_truths.find_target_news(gt_rankings, .02)
#        find_hits_and_mises(_TESTING_SET_MONTHS, target_news, seeds, cache,
#                            delta, category)
#      else:
#        gt_rankings = ground_truths.get_gt_rankings(seeds, DataSet.TRAINING,
#                                                    category)
#        sort_users_by_tweet_count(_TRAINING_SET_MONTHS, seeds, cache,
#                                  delta, category)
#        target_news = ground_truths.find_target_news(gt_rankings, .02)
#        find_hits_and_mises(_TRAINING_SET_MONTHS, target_news, seeds, cache,
#                            delta, category)

  log('Finished outputting data!')
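The DataSet constants referenced throughout these examples are assumed to be a simple enum-like holder along the following lines (a sketch; the project's real definition may differ):

class DataSet(object):
  # Hypothetical reconstruction of the constants used above.
  TRAINING = 'training'
  TESTING = 'testing'
  ALL = 'all'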
Example #6
def run():
    """Main logic for this analysis."""
    seeds = Util.load_seeds()
    gt_ranks = ground_truths.get_gt_rankings(seeds, DataSet.ALL)
    target_news = ground_truths.find_target_news(gt_ranks, _SIZE_TOP_NEWS)
    # for delta in _DELTAS:
    for delta in [8]:
        log('Performing analysis for delta %s' % delta)
        param_str = 'd%s' % delta
        Util.ensure_dir_exist(_GRAPH_DIR + '%s/' % param_str)
        Util.ensure_dir_exist(_GRAPH_DIR + '%s/info/' % param_str)

        (counts, news_nyt_participant, news_nyt_not_participant,
         when_nyt_tweeted) = find_counts(target_news, delta)
        agg_counts = aggregate_counts(counts, delta)

        with open(_GRAPH_DIR + '%s/info/stats.txt' % param_str,
                  'w') as out_file:
            out_file.write('Num stories total: %s\n' % len(target_news))
            out_file.write('Num NYT Participant: %s\n' %
                           len(news_nyt_participant))
            out_file.write('Num NYT Not Participant: %s\n' %
                           len(news_nyt_not_participant))

        with open(_GRAPH_DIR + '%s/info/legend.tsv' % param_str,
                  'w') as out_file:
            log('Outputting graph...')
            nyt_tweeted_min = when_nyt_tweeted[_STORY_NYT]
            annotations = []
            annotations.append(
                (nyt_tweeted_min, agg_counts[_STORY_NYT][nyt_tweeted_min],
                 '@nytimes'))
            annotations.append((204, agg_counts[_STORY_NYT][204], '@evertuts'))
            annotations.append((193, agg_counts[_STORY_NYT][193], '@nytjim'))
            annotations.append(
                (194, agg_counts[_STORY_NYT][194], '@nytimesglobal'))
            annotations.append(
                (222, agg_counts[_STORY_NYT][222], '@Larryferlazzo'))
            draw_graph(agg_counts[_STORY_NYT], agg_counts[_STORY_NOT_NYT],
                       annotations, param_str)

    log('Analysis complete!')
def run():
  """Main logic for this analysis."""
  FileLog.set_log_dir()
  Util.ensure_dir_exist(_OUTPUT_DIR)
  if _REGENERATE_DATA:
    deltas = find_deltas()
    cache = Util.load_cache()
    seeds = Util.load_seeds()

    # Find top news
    param_str = '_t%s' % (int(_SIZE_TOP_NEWS * 100))
    gts = ground_truths.get_gt_rankings(seeds, DataSet.ALL)
    top_news = ground_truths.find_target_news(gts, _SIZE_TOP_NEWS)

    # Do analysis for every delta, including sys.maxint, which serves as
    # "no delta limit".
    for delta in [sys.maxint] + _DELTAS:
      param_str = _get_param_str(delta)

      (all_counts, original_counts,
       retweet_counts, top_counts) = find_device_counts(delta, deltas, top_news,
                                                        cache)

      (sorted_all_counts, sorted_original_counts,
       sorted_retweet_counts, sorted_top_counts) = sort_data(all_counts,
                                                             original_counts,
                                                             retweet_counts,
                                                             top_counts)

      output_data(sorted_all_counts, sorted_original_counts,
                  sorted_retweet_counts, sorted_top_counts, param_str)

  if _REDRAW_GRAPH:
    for delta in [sys.maxint] + _DELTAS:
      param_str = _get_param_str(delta)

      (top, original_dict, retweet_dict) = load_data(param_str)
      log('Drawing graph for delta %s...' % delta)
      draw_graph(top, original_dict, retweet_dict, param_str)

  log('Analysis complete.')
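_get_param_str is not shown in these examples; a plausible reconstruction, assuming it mirrors the 'd%s' and '_t%s' naming patterns used elsewhere (an assumption, not project code):

def _get_param_str(delta):
  # Hypothetical: fold the delta and top-news fraction into the
  # file/directory suffix, e.g. 'd4_t2' for delta=4, _SIZE_TOP_NEWS=.02.
  return 'd%s_t%s' % (delta, int(_SIZE_TOP_NEWS * 100))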
def run():
  """Main logic for this analysis."""
  seeds = Util.load_seeds()
  gt_ranks = ground_truths.get_gt_rankings(seeds, DataSet.ALL)
  target_news = ground_truths.find_target_news(gt_ranks, _SIZE_TOP_NEWS)
  # for delta in _DELTAS:
  for delta in [8]:
    log('Performing analysis for delta %s' % delta)
    param_str = 'd%s' % delta
    Util.ensure_dir_exist(_GRAPH_DIR + '%s/' % param_str)
    Util.ensure_dir_exist(_GRAPH_DIR + '%s/info/' % param_str)

    (counts, news_nyt_participant,
     news_nyt_not_participant, when_nyt_tweeted) = find_counts(target_news,
                                                               delta)
    agg_counts = aggregate_counts(counts, delta)

    with open(_GRAPH_DIR + '%s/info/stats.txt' % param_str, 'w') as out_file:
      out_file.write('Num stories total: %s\n' % len(target_news))
      out_file.write('Num NYT Participant: %s\n' % len(news_nyt_participant))
      out_file.write('Num NYT Not Participant: %s\n'
                     % len(news_nyt_not_participant))

    with open(_GRAPH_DIR + '%s/info/legend.tsv' % param_str, 'w') as out_file:
      log('Outputting graph...')
      nyt_tweeted_min = when_nyt_tweeted[_STORY_NYT]
      annotations = []
      annotations.append((nyt_tweeted_min,
                          agg_counts[_STORY_NYT][nyt_tweeted_min], '@nytimes'))
      annotations.append((204, agg_counts[_STORY_NYT][204], '@evertuts'))
      annotations.append((193, agg_counts[_STORY_NYT][193], '@nytjim'))
      annotations.append((194, agg_counts[_STORY_NYT][194], '@nytimesglobal'))
      annotations.append((222, agg_counts[_STORY_NYT][222], '@Larryferlazzo'))
      draw_graph(agg_counts[_STORY_NYT], agg_counts[_STORY_NOT_NYT],
                 annotations, param_str)

  log('Analysis complete!')
def run():
    """Main logic for this analysis."""
    FileLog.set_log_dir()
    Util.ensure_dir_exist(_OUTPUT_DIR)
    if _REGENERATE_DATA:
        deltas = find_deltas()
        cache = Util.load_cache()
        seeds = Util.load_seeds()

        # Find top news
        param_str = '_t%s' % (int(_SIZE_TOP_NEWS * 100))
        gts = ground_truths.get_gt_rankings(seeds, DataSet.ALL)
        top_news = ground_truths.find_target_news(gts, _SIZE_TOP_NEWS)

        # Do analysis for every delta, including sys.maxint, which serves as
        # "no delta limit".
        for delta in [sys.maxint] + _DELTAS:
            param_str = _get_param_str(delta)

            (all_counts, original_counts, retweet_counts,
             top_counts) = find_device_counts(delta, deltas, top_news, cache)

            (sorted_all_counts, sorted_original_counts, sorted_retweet_counts,
             sorted_top_counts) = sort_data(all_counts, original_counts,
                                            retweet_counts, top_counts)

            output_data(sorted_all_counts, sorted_original_counts,
                        sorted_retweet_counts, sorted_top_counts, param_str)

    if _REDRAW_GRAPH:
        for delta in [sys.maxint] + _DELTAS:
            param_str = _get_param_str(delta)

            (top, original_dict, retweet_dict) = load_data(param_str)
            log('Drawing graph for delta %s...' % delta)
            draw_graph(top, original_dict, retweet_dict, param_str)

    log('Analysis complete.')
def run():
  """Contains the main logic for this analysis."""
  global _SIZE_TOP_NEWS
  FileLog.set_log_dir()

  seeds = Util.load_seeds()
  for category in _CATEGORIES:
    log('Performing analysis for category: %s' % category)
    if category:
      _SIZE_TOP_NEWS = .10
    else:
      _SIZE_TOP_NEWS = .02

    gt_rankings = ground_truths.get_gt_rankings(seeds, DataSet.TESTING,
                                                category)
    log('Num ground_truth_rankings: %s' % len(gt_rankings))

    target_news = ground_truths.find_target_news(gt_rankings, _SIZE_TOP_NEWS)
    log('Size target_news: %s' % len(target_news))

    # for delta in _DELTAS:
    for delta in [4]:
      run_params_str = 'd%s_t%s_e%s_%s' % (delta, int(_SIZE_TOP_NEWS * 100),
                                           int(_SIZE_EXPERTS * 100), category)
      output_dir = '../graph/CrowdWisdomDef/%s/' % run_params_str
      Util.ensure_dir_exist(output_dir)

      info_output_dir = '../graph/CrowdWisdomDef/%s/info/' % run_params_str
      Util.ensure_dir_exist(info_output_dir)

      (num_users, newsaholics,
       active_users, common_users) = basic_groups.group_users(delta, category)
      log('Num newsaholics: %s' % len(newsaholics))
      log('Num active: %s' % len(active_users))
      log('Num common: %s' % len(common_users))

      common_user_buckets = common_user_groups.group_users(common_users,
                                                           _NUM_GROUPS)
      for i, common_user_bucket in enumerate(common_user_buckets):
        print 'Number users in common user bucket %s: %s' % (
            i, len(common_user_bucket))

      experts_precision = experts.select_experts_precision(
          newsaholics.union(active_users), num_users, delta, _SIZE_EXPERTS,
          category)
      experts_fscore = experts.select_experts_fscore(len(target_news),
                                                     num_users,
                                                     delta, _SIZE_EXPERTS,
                                                     category)
      experts_ci = experts.select_experts_ci(num_users, delta, _SIZE_EXPERTS,
                                             category)
      super_experts = experts.select_super_experts(experts_precision,
                                                   experts_fscore,
                                                   experts_ci)

      log('Num experts (precision): %s' % len(experts_precision))
      log('Num experts (fscore): %s' % len(experts_fscore))
      log('Num experts (ci): %s' % len(experts_ci))

      log('Finding rankings with an %s hour delta.' % delta)
      (market_rankings, newsaholic_rankings,
       active_rankings,
       common_rankings) = basic_groups.get_rankings(delta, seeds, newsaholics,
                                                    active_users, category)
      (expert_precision_rankings, expert_fscore_rankings,
       expert_ci_rankings,
       expert_s_rankings) = experts.get_rankings(delta, seeds,
                                                 experts_precision,
                                                 experts_fscore,
                                                 experts_ci,
                                                 super_experts,
                                                 category)

      common_groups_rankings = common_user_groups.get_rankings(delta, seeds,
                                                               common_user_buckets,
                                                               category)

      num_votes_common = 0
      for url, count in common_rankings:
        num_votes_common += count
      log('Num common_rankings: %s' % len(common_rankings))
      log('Num common votes: %s' % num_votes_common)
      num_votes_expert_precision = 0
      for url, count in expert_precision_rankings:
        num_votes_expert_precision += count
      log('Num expert_precision rankings: %s' % len(expert_precision_rankings))
      log('Num expert_precision votes: %s' % num_votes_expert_precision)
      num_votes_expert_fscore = 0
      for url, count in expert_fscore_rankings:
        num_votes_expert_fscore += count
      log('Num expert_fscore rankings: %s' % len(expert_fscore_rankings))
      log('Num expert_fscore votes: %s' % num_votes_expert_fscore)
      num_votes_expert_ci = 0
      for url, count in expert_ci_rankings:
        num_votes_expert_ci += count
      log('Num expert_ci rankings: %s' % len(expert_ci_rankings))
      log('Num expert_ci votes: %s' % num_votes_expert_ci)
      num_votes_buckets = []
      for i, common_group_rankings in enumerate(common_groups_rankings):
        num_votes = 0
        for url, count in common_group_rankings:
          num_votes += count
        num_votes_buckets.append(num_votes)
        log('Num common rankings (bucket %s): %s'
            % (i, len(common_group_rankings)))
        log('Num common votes (bucket %s): %s' % (i, num_votes))

      with open('%suser_demographics_%s.txt'
                % (info_output_dir, run_params_str), 'w') as output_file:
        output_file.write('Number of Common Users: %s\n' % len(common_users))
        output_file.write('\n')
        output_file.write('Number of Precision Experts: %s\n'
                          % len(experts_precision))
        output_file.write('Number of F-Score Experts: %s\n'
                          % len(experts_fscore))
        output_file.write('Number of CI Experts: %s\n' % len(experts_ci))
        output_file.write('Number users per common user bucket: %s\n'
                          % len(common_user_buckets[0]))
        output_file.write('Number of Precision and F-Score Experts: %s\n'
                          % len(experts_precision.intersection(experts_fscore)))
        output_file.write('Number of Precision and CI Experts: %s\n'
                          % len(experts_precision.intersection(experts_ci)))
        output_file.write('Number of F-Score and CI Experts: %s\n'
                          % len(experts_fscore.intersection(experts_ci)))
        output_file.write('\n')
        output_file.write('Number of Users (Total): %s\n'
                          % (len(newsaholics) + len(active_users)
                             + len(common_users)))
        output_file.write('\n')
        output_file.write('Number of votes by Common Users: %s\n'
                          % num_votes_common)
        output_file.write('\n')
        output_file.write('Number of votes by Expert (Precision) Users: %s\n'
                          % num_votes_expert_precision)
        output_file.write('Number of votes by Expert (fscore) Users: %s\n'
                          % num_votes_expert_fscore)
        output_file.write('Number of votes by Expert (ci) Users: %s\n'
                          % num_votes_expert_ci)
        output_file.write('Number of votes per bucket: %s\n'
                          % num_votes_buckets)
        output_file.write('\n')
        output_file.write('Total Number of Good News: %s\n' % len(target_news))

      log('Ground Truth Top 5')
      for i in range(min(len(gt_rankings), 5)):
        url, count = gt_rankings[i]
        log('[%s] %s\t%s' %(i, url.strip(), count))
      log('-----------------------------------')
      log('Common Top 5')
      for i in range(min(len(common_rankings), 5)):
        url, count = common_rankings[i]
        log('[%s] %s\t%s' %(i, url.strip(), count))
      log('-----------------------------------')
      log('Expert (Precision) Top 5')
      for i in range(min(len(expert_precision_rankings), 5)):
        url, count = expert_precision_rankings[i]
        log('[%s] %s\t%s' %(i, url.strip(), count))
      log('-----------------------------------')
      log('Expert (fscore) Top 5')
      for i in range(min(len(expert_fscore_rankings), 5)):
        url, count = expert_fscore_rankings[i]
        log('[%s] %s\t%s' %(i, url.strip(), count))
      log('-----------------------------------')
      log('Expert (ci) Top 5')
      for i in range(min(len(expert_ci_rankings), 5)):
        url, count = expert_ci_rankings[i]
        log('[%s] %s\t%s' %(i, url.strip(), count))
      log('-----------------------------------')

      common_precisions, common_recalls = calc_precision_recall(gt_rankings,
                                                                common_rankings)
      (expert_p_precisions,
       expert_p_recalls) = calc_precision_recall(gt_rankings,
                                                 expert_precision_rankings)
      (expert_f_precisions,
       expert_f_recalls) = calc_precision_recall(gt_rankings,
                                                 expert_fscore_rankings)
      (expert_c_precisions,
       expert_c_recalls) = calc_precision_recall(gt_rankings,
                                                 expert_ci_rankings)

      common_group_ps = []
      common_group_rs = []
      for common_group_ranking in common_groups_rankings:
        common_group_p, common_group_r = calc_precision_recall(
            gt_rankings, common_group_ranking)
        common_group_ps.append(common_group_p)
        common_group_rs.append(common_group_r)

      log('Drawing common group model precision-recall graph...')
      common_user_groups.draw_precision_recall(common_group_ps, common_group_rs,
                                               expert_p_precisions, expert_p_recalls,
                                               expert_f_precisions, expert_f_recalls,
                                               expert_c_precisions, expert_c_recalls,
                                               run_params_str)

      log('Drawing common group model precision graph...')
      common_user_groups.draw_precision(common_group_ps, expert_p_precisions,
                                        expert_f_precisions, expert_c_precisions,
                                        run_params_str)
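calc_precision_recall is imported from elsewhere; a minimal sketch of what it plausibly computes, assuming both arguments are (url, count) lists sorted best-first (hypothetical, for orientation only):

def calc_precision_recall(gt_rankings, other_rankings):
  # Hypothetical sketch: precision@k and recall@k against the ground
  # truth urls, for every prefix of other_rankings.
  gt_urls = set(url for url, _ in gt_rankings)
  precisions = []
  recalls = []
  hits = 0
  for k, (url, _) in enumerate(other_rankings, 1):
    if url in gt_urls:
      hits += 1
    precisions.append(float(hits) / k)
    recalls.append(float(hits) / len(gt_urls))
  return precisions, recalls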
def find_counts(seeds, category=None):
  num_0_1 = 0
  num_1_4 = 0
  num_4_8 = 0
  num_after_8 = 0
  num_total = 0

  log('Finding common users delta 1...')
  (num_users_1, newsaholics_1, active_users_1,
   common_users_1) = basic_groups.group_users(1, category)
  log('Finding common users delta 4...')
  (num_users_4, newsaholics_4, active_users_4,
   common_users_4) = basic_groups.group_users(4, category)
  log('Finding common users delta 8...')
  (num_users_8, newsaholics_8, active_users_8,
   common_users_8) = basic_groups.group_users(8, category)

  copy_common_users_1 = set(common_users_1)
  common_users_1_1 = set()
  common_users_1_2 = set()
  common_users_1_3 = set()
  count = 0
  while len(copy_common_users_1) > 0:
    if count % 3 == 0:
      common_users_1_1.add(copy_common_users_1.pop())
    elif count % 3 == 1:
      common_users_1_2.add(copy_common_users_1.pop())
    elif count % 3 == 2:
      common_users_1_3.add(copy_common_users_1.pop())
    count += 1

  copy_common_users_4 = set(common_users_4)
  common_users_4_1 = set()
  common_users_4_2 = set()
  common_users_4_3 = set()
  count = 0
  while len(copy_common_users_4) > 0:
    if count % 3 == 0:
      common_users_4_1.add(copy_common_users_4.pop())
    elif count % 3 == 1:
      common_users_4_2.add(copy_common_users_4.pop())
    elif count % 3 == 2:
      common_users_4_3.add(copy_common_users_4.pop())
    count += 1

  copy_common_users_8 = set(common_users_8)
  common_users_8_1 = set()
  common_users_8_2 = set()
  common_users_8_3 = set()
  count = 0
  while len(copy_common_users_8) > 0:
    if count % 3 == 0:
      common_users_8_1.add(copy_common_users_8.pop())
    elif count % 3 == 1:
      common_users_8_2.add(copy_common_users_8.pop())
    elif count % 3 == 2:
      common_users_8_3.add(copy_common_users_8.pop())
    count += 1

  log('Size Common Users 1 (delta 1): %s' % len(common_users_1_1))
  log('Size Common Users 2 (delta 1): %s' % len(common_users_1_2))
  log('Size Common Users 3 (delta 1): %s' % len(common_users_1_3))
  log('Size Common Users 1 (delta 4): %s' % len(common_users_4_1))
  log('Size Common Users 2 (delta 4): %s' % len(common_users_4_2))
  log('Size Common Users 3 (delta 4): %s' % len(common_users_4_3))
  log('Size Common Users 1 (delta 8): %s' % len(common_users_8_1))
  log('Size Common Users 2 (delta 8): %s' % len(common_users_8_2))
  log('Size Common Users 3 (delta 8): %s' % len(common_users_8_3))

  log('Finding precision experts delta 1...')
  experts_p_1 = experts.select_experts_precision(
      newsaholics_1.union(active_users_1), num_users_1, 1, _SIZE_EXPERTS,
      category)
  log('Finding precision experts delta 4...')
  experts_p_4 = experts.select_experts_precision(
      newsaholics_4.union(active_users_4), num_users_4, 4, _SIZE_EXPERTS,
      category)
  log('Finding precision experts delta 8...')
  experts_p_8 = experts.select_experts_precision(
      newsaholics_8.union(active_users_8), num_users_8, 8, _SIZE_EXPERTS,
      category)

  log('Finding ground truths...')
  gt_rankings = ground_truths.get_gt_rankings(seeds, DataSet.TESTING, category)
  log('Finding target news...')
  target_news = ground_truths.find_target_news(gt_rankings, _SIZE_EXPERTS)
  size_target_news = len(target_news)

  log('Finding fscore experts delta 1...')
  experts_f_1 = experts.select_experts_fscore(size_target_news, num_users_1,
                                              1, _SIZE_EXPERTS, category)
  log('Finding fscore experts delta 4...')
  experts_f_4 = experts.select_experts_fscore(size_target_news, num_users_4,
                                              4, _SIZE_EXPERTS, category)
  log('Finding fscore experts delta 8...')
  experts_f_8 = experts.select_experts_fscore(size_target_news, num_users_8,
                                              8, _SIZE_EXPERTS, category)

  log('Finding ci experts delta 1...')
  experts_ci_1 = experts.select_experts_ci(num_users_1, 1, _SIZE_EXPERTS,
                                           category)
  log('Finding ci experts delta 4...')
  experts_ci_4 = experts.select_experts_ci(num_users_4, 4, _SIZE_EXPERTS,
                                           category)
  log('Finding ci experts delta 8...')
  experts_ci_8 = experts.select_experts_ci(num_users_8, 8, _SIZE_EXPERTS,
                                           category)

  experts_all_1 = experts_p_1.union(experts_f_1).union(experts_ci_1)
  experts_all_4 = experts_p_4.union(experts_f_4).union(experts_ci_4)
  experts_all_8 = experts_p_8.union(experts_f_8).union(experts_ci_8)

  num_0_1_common = 0
  num_1_4_common = 0
  num_4_8_common = 0

  num_cu_1_1 = 0
  num_cu_1_2 = 0
  num_cu_1_3 = 0

  num_cu_4_1 = 0
  num_cu_4_2 = 0
  num_cu_4_3 = 0

  num_cu_8_1 = 0
  num_cu_8_2 = 0
  num_cu_8_3 = 0

  num_0_1_experts_p = 0
  num_1_4_experts_p = 0
  num_4_8_experts_p = 0

  num_0_1_experts_f = 0
  num_1_4_experts_f = 0
  num_4_8_experts_f = 0

  num_0_1_experts_ci = 0
  num_1_4_experts_ci = 0
  num_4_8_experts_ci = 0

  num_0_1_experts_all = 0
  num_1_4_experts_all = 0
  num_4_8_experts_all = 0

  log('Finding counts...')
  with open('../data/FolkWisdom/time_deltas.tsv') as input_file:
    for line in input_file:

      # parse line
      tokens = line.split('\t')
      url = tokens[_TIMEDELTAS_FILE_URL_INDEX]
      time_delta = timedelta(seconds=int(tokens[_TIMEDELTAS_FILE_DELTA_INDEX]))
      tweet_category = tokens[_TIMEDELTAS_FILE_CATEGORY_INDEX].strip()
      user_id = tokens[_TIMEDELTAS_FILE_USER_ID_INDEX]

      if url in seeds:
        (seed_tweet_id, seed_user_id, seed_time) = seeds[url]

        if Util.is_in_testing_set(seed_time) and category_matches(category,
                                                                  tweet_category):
            num_total += 1
            if time_delta < timedelta(hours=1):
              num_0_1 += 1
              if user_id in common_users_1:
                num_0_1_common += 1
              if user_id in experts_p_1:
                num_0_1_experts_p += 1
              if user_id in experts_f_1:
                num_0_1_experts_f += 1
              if user_id in experts_ci_1:
                num_0_1_experts_ci += 1
              if user_id in experts_all_1:
                num_0_1_experts_all += 1
              if user_id in common_users_1_1:
                num_cu_1_1 += 1
              if user_id in common_users_1_2:
                num_cu_1_2 += 1
              if user_id in common_users_1_3:
                num_cu_1_3 += 1
            elif timedelta(hours=1) <= time_delta < timedelta(hours=4):
              num_1_4 += 1
              if user_id in common_users_4:
                num_1_4_common += 1
              if user_id in experts_p_4:
                num_1_4_experts_p += 1
              if user_id in experts_f_4:
                num_1_4_experts_f += 1
              if user_id in experts_ci_4:
                num_1_4_experts_ci += 1
              if user_id in experts_all_4:
                num_1_4_experts_all += 1
              if user_id in common_users_4_1:
                num_cu_4_1 += 1
              if user_id in common_users_4_2:
                num_cu_4_2 += 1
              if user_id in common_users_4_3:
                num_cu_4_3 += 1
            elif timedelta(hours=4) <= time_delta < timedelta(hours=8):
              num_4_8 += 1
              if user_id in common_users_8:
                num_4_8_common += 1
              if user_id in experts_p_8:
                num_4_8_experts_p += 1
              if user_id in experts_f_8:
                num_4_8_experts_f += 1
              if user_id in experts_ci_8:
                num_4_8_experts_ci += 1
              if user_id in experts_all_8:
                num_4_8_experts_all += 1
              if user_id in common_users_8_1:
                num_cu_8_1 += 1
              if user_id in common_users_8_2:
                num_cu_8_2 += 1
              if user_id in common_users_8_3:
                num_cu_8_3 += 1
            else:
              num_after_8 += 1

  return (num_0_1, num_0_1_common, num_0_1_experts_p, num_0_1_experts_f,
          num_0_1_experts_ci, num_0_1_experts_all,
          num_1_4, num_1_4_common, num_1_4_experts_p, num_1_4_experts_f,
          num_1_4_experts_ci, num_1_4_experts_all,
          num_4_8, num_4_8_common, num_4_8_experts_p, num_4_8_experts_f,
          num_4_8_experts_ci, num_4_8_experts_all,
          num_cu_1_1, num_cu_1_2, num_cu_1_3,
          num_cu_4_1, num_cu_4_2, num_cu_4_3,
          num_cu_8_1, num_cu_8_2, num_cu_8_3,
          num_after_8, num_total)
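The modulo-3 round-robin split above is written out three times, once per delta; the same logic as a reusable helper (a refactoring sketch, not part of the project):

def split_three_ways(users):
  # Round-robin the users into three disjoint buckets, matching the
  # modulo-3 loops in find_counts above.
  buckets = (set(), set(), set())
  for count, user in enumerate(users):
    buckets[count % 3].add(user)
  return buckets

# e.g.: common_users_1_1, common_users_1_2, common_users_1_3 = \
#     split_three_ways(common_users_1)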
Example #12
def get_all_user_groups(delta=4, category=None):
  seeds = Util.load_seeds()

  # Set up params appropriately.
  data_set = DataSet.TRAINING
  months = _TRAINING_SET_MONTHS
  if _SWITCHED:
    data_set = DataSet.TESTING
    months = _TESTING_SET_MONTHS
  retweets = set()
  if _EXCLUDE_RETWEETS:
    retweets = ground_truths.find_retweets(months)

  gt_rankings = ground_truths.get_gt_rankings(
      seeds, data_set, category,
      exclude_tweets_within_delta=_EXCLUDE_TWEETS_WITHIN_DELTA,
      retweets=retweets)
  target_news = ground_truths.find_target_news(gt_rankings, _SIZE_TOP_NEWS)

  groups = UserGroups()

  (num_users, groups.newsaholics,
   groups.active_users,
   groups.common_users) = basic_groups.group_users(delta, category)
  groups.population = groups.newsaholics.union(groups.active_users).union(
      groups.common_users)

  num_users_eg, groups.even_groups = even_groups.group_users(delta,
                                                             _NUM_GROUPS,
                                                             _SIZE_OF_GROUP_IN_PERCENT,
                                                             category)

  groups.precision = experts.select_experts_precision(
      groups.newsaholics.union(groups.active_users), num_users, delta,
      _SIZE_EXPERTS, category)
  groups.fscore = experts.select_experts_fscore(len(target_news),
                                                num_users,
                                                delta, _SIZE_EXPERTS,
                                                category)
  groups.ci = experts.select_experts_ci(num_users, delta, _SIZE_EXPERTS,
                                        category)
  groups.super_experts = experts.select_super_experts(groups.precision,
                                                      groups.fscore,
                                                      groups.ci)

  groups.ci_hi, groups.ci_li = experts.split_ci_experts_by_followers(groups.ci)

  groups.ci_1 = set()
  groups.ci_2 = set()
  groups.ci_3 = set()
  counter = 0
  for ci_expert in groups.ci:
    if counter % 3 == 0:
      groups.ci_1.add(ci_expert)
    elif counter % 3 == 1:
      groups.ci_2.add(ci_expert)
    elif counter % 3 == 2:
      groups.ci_3.add(ci_expert)
    counter += 1

  groups.social_bias, d_num_followers = experts.select_experts_social_bias(
      num_users, _SIZE_EXPERTS)
  groups.all_experts = experts.select_all_experts(groups.precision,
                                                  groups.fscore,
                                                  groups.ci)
  groups.non_experts = groups.population.difference(groups.all_experts)
  sample_size = int(len(groups.non_experts) * _NON_EXPERTS_SAMPLE_SIZE)
  sample_size_25 = int(len(groups.non_experts) * 0.05)
  sample_size_10 = int(len(groups.non_experts) * 0.10)
  sample_size_1 = int(len(groups.non_experts) * 0.02)
  groups.non_experts_sampled = set(random.sample(groups.non_experts,
                                                 sample_size))
  groups.non_experts_25 = set(random.sample(groups.non_experts,
                                            sample_size_25))
  groups.non_experts_10 = set(random.sample(groups.non_experts,
                                            sample_size_10))
  groups.non_experts_1 = set(random.sample(groups.non_experts,
                                           sample_size_1))

  return groups, d_num_followers
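One portability note: random.sample is called on sets above, which the Python 2 era this code targets accepts; Python 3.11+ requires a sequence. A small shim (an assumption, not project code):

def sample_users(users, fraction):
  # Hypothetical helper: draw a random sample of the given fraction of a
  # user set. list(...) keeps random.sample working on Python 3, where
  # set populations are no longer accepted.
  return set(random.sample(list(users), int(len(users) * fraction)))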
Example #13
def run():
  """Contains the main logic for this analysis."""
  FileLog.set_log_dir()

  seeds = Util.load_seeds()
  for category in _CATEGORIES:
    log('Performing analysis for category: %s' % category)
    size_top_news = _SIZE_TOP_NEWS
    if category:
      size_top_news = .10

    data_set = DataSet.TESTING
    retweets = set()
    if _SWITCHED:
      data_set = DataSet.TRAINING
    if _EXCLUDE_RETWEETS:
      retweets = ground_truths.find_retweets(_TESTING_SET_MONTHS)
    log('Num retweets to exclude: %s' % len(retweets))
    gt_rankings = ground_truths.get_gt_rankings(
        seeds, data_set, category,
        exclude_tweets_within_delta=_EXCLUDE_TWEETS_WITHIN_DELTA,
        retweets=retweets)
    log('Num ground_truth_rankings: %s' % len(gt_rankings))

    # Format for use later.
    ground_truth_url_to_rank = {}
    for rank, (url, count) in enumerate(gt_rankings):
      ground_truth_url_to_rank[url] = rank

    target_news = ground_truths.find_target_news(gt_rankings, size_top_news)
    log('Size target_news: %s' % len(target_news))

    for delta in _DELTAS:
      run_params_str = 'd%s_t%s_e%s_%s' % (delta, int(size_top_news * 100),
                                           int(_SIZE_EXPERTS * 100), category)
      info_output_dir = '../graph/FolkWisdom/%s/info/' % run_params_str
      Util.ensure_dir_exist(info_output_dir)

      groups, d_num_followers = user_groups.get_all_user_groups(delta,
                                                                category)
      log('Num experts (precision): %s' % len(groups.precision))
      log('Num experts (fscore): %s' % len(groups.fscore))
      log('Num experts (ci): %s' % len(groups.ci))
      log('Num Super Experts: %s' % len(groups.super_experts))
      log('Num Social Bias Experts: %s' % len(groups.social_bias))

      log('Finding rankings with an %s hour delta.' % delta)
      ranks = rankings.get_rankings(delta, seeds, groups, category,
                                    d_num_followers)

      # Output some interesting info to file
      size_market_unfiltered = '0'
      with open('../data/FolkWisdom/size_of_market_unfiltered.txt') as in_file:
        size_market_unfiltered = in_file.readline().strip()

      with open('%suser_demographics_%s.txt'
                % (info_output_dir, run_params_str), 'w') as output_file:
        output_file.write('Number of Newsaholics: %s\n'
                          % len(groups.newsaholics))
        output_file.write('Number of Active Users: %s\n'
                          % len(groups.active_users))
        output_file.write('Number of Common Users: %s\n'
                          % len(groups.common_users))
        output_file.write('\n')
        output_file.write('Number of Precision Experts: %s\n'
                          % len(groups.precision))
        output_file.write('Number of F-Score Experts: %s\n'
                          % len(groups.fscore))
        output_file.write('Number of CI Experts: %s\n' % len(groups.ci))
        output_file.write('Number of Social Bias Experts: %s\n'
                          % len(groups.social_bias))
        output_file.write('Total number of unique experts: %s\n'
                          % len(groups.all_experts))
        output_file.write('Number of Precision and F-Score Experts: %s\n'
                          % len(groups.precision.intersection(groups.fscore)))
        output_file.write('Number of Precision and CI Experts: %s\n'
                          % len(groups.precision.intersection(groups.ci)))
        output_file.write('Number of F-Score and CI Experts: %s\n'
                          % len(groups.fscore.intersection(groups.ci)))
        output_file.write('Number of Super Experts: %s\n'
                          % len(groups.super_experts))
        output_file.write('\n')
        output_file.write('Number of Users (Total): %s\n'
                          % (len(groups.newsaholics) + len(groups.active_users)
                             + len(groups.common_users)))
        output_file.write('Size of market (unfiltered): %s\n'
                          % size_market_unfiltered)
        output_file.write('\n')
        # output_file.write('Number of votes by Newsaholics: %s\n'
        #                   % num_votes_newsaholics)
        # output_file.write('Number of votes by Market: %s\n' % num_votes_market)
        # output_file.write('Number of votes by Active Users: %s\n'
        #                   % num_votes_active)
        # output_file.write('Number of votes by Common Users: %s\n'
        #                   % num_votes_common)
        # output_file.write('\n');
        # output_file.write('Number of votes by Expert (Precision) Users: %s\n'
        #         % num_votes_expert_precision) 
        # output_file.write('Number of votes by Expert (fscore) Users: %s\n'
        #         % num_votes_expert_fscore) 
        # output_file.write('Number of votes by Expert (ci) Users: %s\n'
        #         % num_votes_expert_ci) 
        # output_file.write('Number of votes by Super Experts: %s\n'
        #                   % num_votes_expert_s)
        # output_file.write('Number of votes by Social Bias Experts: %s\n'
        #                   % num_votes_expert_sb)
        # output_file.write('\n')
        # output_file.write('Total Number of votes cast: %s\n'
        #                   % (num_votes_newsaholics + num_votes_active
        #                      + num_votes_common))
        # output_file.write('\n')
        output_file.write('Total Number of Good News: %s\n' % len(target_news))

      log('Ground Truth Top 50')
      for i in range(min(len(gt_rankings), 50)):
        url, count = gt_rankings[i]
        log('[%s] %s\t%s' %(i, url.strip(), count))
      log('-----------------------------------')
      log('Newsaholic Top 5')
      for i in range(min(len(ranks.newsaholics), 5)):
        url, count = ranks.newsaholics[i]
        log('[%s] %s\t%s' %(i, url.strip(), count))
      log('-----------------------------------')
      log('Active Top 5')
      for i in range(min(len(ranks.active_users), 5)):
        url, count = ranks.active_users[i]
        log('[%s] %s\t%s' %(i, url.strip(), count))
      log('-----------------------------------')
      log('Common Top 5')
      for i in range(min(len(ranks.common_users), 5)):
        url, count = ranks.common_users[i]
        log('[%s] %s\t%s' %(i, url.strip(), count))
      log('-----------------------------------')
      log('nonexpert Top 5')
      for i in range(min(len(ranks.non_experts), 5)):
        url, count = ranks.non_experts[i]
        log('[%s] %s\t%s' %(i, url.strip(), count))
      log('-----------------------------------')
      log('Expert (Precision) Top 5')
      for i in range(min(len(ranks.precision), 5)):
        url, count = ranks.precision[i]
        log('[%s] %s\t%s' %(i, url.strip(), count))
      log('-----------------------------------')
      log('Expert (fscore) Top 5')
      for i in range(min(len(ranks.fscore), 5)):
        url, count = ranks.fscore[i]
        log('[%s] %s\t%s' %(i, url.strip(), count))
      log('-----------------------------------')
      log('Expert (ci) Top 5')
      for i in range(min(len(ranks.ci), 5)):
        url, count = ranks.ci[i]
        log('[%s] %s\t%s' %(i, url.strip(), count))
      log('-----------------------------------')
      log('Super Expert Top 5')
      for i in range(min(len(ranks.super_experts), 5)):
        url, count = ranks.super_experts[i]
        log('[%s] %s\t%s' %(i, url.strip(), count))
      log('-----------------------------------')
      log('Social Bias Expert Top 5')
      for i in range(min(len(ranks.social_bias), 5)):
        url, count = ranks.social_bias[i]
        log('[%s] %s\t%s' %(i, url.strip(), count))

      market_rank_to_url = {}
      newsaholic_rank_to_url = {}
      active_rank_to_url = {}
      common_rank_to_url = {}
      expert_p_rank_to_url = {}
      expert_f_rank_to_url = {}
      expert_c_rank_to_url = {}
      expert_s_rank_to_url = {}
      for rank, (url, count) in enumerate(ranks.newsaholics):
        newsaholic_rank_to_url[rank] = url
      for rank, (url, count) in enumerate(ranks.population):
        market_rank_to_url[rank] = url
      for rank, (url, count) in enumerate(ranks.active_users):
        active_rank_to_url[rank] = url
      for rank, (url, count) in enumerate(ranks.common_users):
        common_rank_to_url[rank] = url
      for rank, (url, count) in enumerate(ranks.precision):
        expert_p_rank_to_url[rank] = url
      for rank, (url, count) in enumerate(ranks.fscore):
        expert_f_rank_to_url[rank] = url
      for rank, (url, count) in enumerate(ranks.ci):
        expert_c_rank_to_url[rank] = url
      for rank, (url, count) in enumerate(ranks.super_experts):
        expert_s_rank_to_url[rank] = url

      population_url_to_rank = {}
      market_url_to_rank = {}
      precision_url_to_rank = {}
      fscore_url_to_rank = {}
      ci_url_to_rank = {}
      ci_1_url_to_rank = {}
      ci_2_url_to_rank = {}
      ci_3_url_to_rank = {}
      common_url_to_rank = {}
      for rank, (url, count) in enumerate(ranks.population):
        population_url_to_rank[url] = rank
      for rank, (url, count) in enumerate(ranks.non_experts):
        market_url_to_rank[url] = rank
      for rank, (url, count) in enumerate(ranks.precision):
        precision_url_to_rank[url] = rank
      for rank, (url, count) in enumerate(ranks.fscore):
        fscore_url_to_rank[url] = rank
      for rank, (url, count) in enumerate(ranks.ci):
        ci_url_to_rank[url] = rank
      for rank, (url, count) in enumerate(ranks.ci_1):
        ci_1_url_to_rank[url] = rank
      for rank, (url, count) in enumerate(ranks.ci_2):
        ci_2_url_to_rank[url] = rank
      for rank, (url, count) in enumerate(ranks.ci_3):
        ci_3_url_to_rank[url] = rank
      for rank, (url, count) in enumerate(ranks.common_users):
        common_url_to_rank[url] = rank

      precisions, recalls = precision_recall.get_precision_recalls(
          gt_rankings, ranks)

      mixed_rankings = mixed_model.get_mixed_rankings(market_url_to_rank,
                                                      precisions.non_experts,
                                                      precision_url_to_rank,
                                                      precisions.precision,
                                                      fscore_url_to_rank,
                                                      precisions.fscore,
                                                      ci_url_to_rank,
                                                      precisions.ci,
                                                      ground_truth_url_to_rank)

      mixed_inact_rankings = mixed_model.get_mixed_rankings(common_url_to_rank,
                                                            precisions.common_users,
                                                            precision_url_to_rank,
                                                            precisions.precision,
                                                            fscore_url_to_rank,
                                                            precisions.fscore,
                                                            ci_url_to_rank,
                                                            precisions.ci,
                                                            ground_truth_url_to_rank)

      mixed_ci_rankings = mixed_model.get_mixed_rankings(
          market_url_to_rank, precisions.non_experts,
          ci_1_url_to_rank, precisions.ci_1,
          ci_2_url_to_rank, precisions.ci_2,
          ci_3_url_to_rank, precisions.ci_3,
          ground_truth_url_to_rank)

      mixed_precisions, mixed_recalls = precision_recall.calc_precision_recall(
          gt_rankings, mixed_rankings)

      (mixed_inact_precisions,
       mixed_inact_recalls) = precision_recall.calc_precision_recall(
           gt_rankings, mixed_inact_rankings)

      (mixed_ci_precisions,
       mixed_ci_recalls) = precision_recall.calc_precision_recall(
           gt_rankings, mixed_ci_rankings)

      log('-----------------------------------')
      log('Mixed (min) Top 5')
      for i in range(min(len(mixed_rankings), 5)):
        url, count = mixed_rankings[i]
        log('[%s] %s\t%s' %(i + 1, url, count))
      log('-----------------------------------')

      with open('%sranking_comparisons_%s.tsv'
                % (info_output_dir, run_params_str), 'w') as out_file:
        for gt_rank, (gt_url, _) in enumerate(gt_rankings):
          market_rank = 0
          precision_rank = 0
          ci_rank = 0
          fscore_rank = 0
          inactive_crowd_rank = 0
          if gt_url in market_url_to_rank:
            market_rank = market_url_to_rank[gt_url] + 1
          if gt_url in precision_url_to_rank:
            precision_rank = precision_url_to_rank[gt_url] + 1
          if gt_url in ci_url_to_rank:
            ci_rank = ci_url_to_rank[gt_url] + 1
          if gt_url in fscore_url_to_rank:
            fscore_rank = fscore_url_to_rank[gt_url] + 1
          if gt_url in common_url_to_rank:
            inactive_crowd_rank = common_url_to_rank[gt_url] + 1
          line = '%s\t%s\t%s\t%s\t%s\t%s\t%s\n' % (gt_url, gt_rank + 1,
                                                   market_rank,
                                                   inactive_crowd_rank,
                                                   precision_rank, ci_rank,
                                                   fscore_rank)
          out_file.write(line)

      with open('%sground_truth_rankings_%s.tsv'
                % (info_output_dir, run_params_str), 'w') as output_file:
        for url, count in gt_rankings:
          output_file.write('%s\t%s\n' % (url.strip(), count))
      with open('%smarket_rankings_%s.tsv'
                % (info_output_dir, run_params_str), 'w') as output_file:
        for rank, (url, count) in enumerate(ranks.common_users):
          output_file.write('%s\t%s\t(%s,%s)\n'
                            % (url.strip(), count, rank,
                               ground_truth_url_to_rank[url]))
      with open('%snewsaholic_rankings_%s.tsv'
                % (info_output_dir, run_params_str), 'w') as output_file:
        for rank, (url, count) in enumerate(ranks.newsaholics):
          output_file.write('%s\t%s\t(%s,%s)\n'
                            % (url.strip(), count, rank,
                               ground_truth_url_to_rank[url]))
      with open('%sactive_user_rankings_%s.tsv'
                % (info_output_dir, run_params_str), 'w') as output_file:
        for rank, (url, count) in enumerate(ranks.active_users):
          output_file.write('%s\t%s\t(%s,%s)\n'
                            % (url.strip(), count, rank,
                               ground_truth_url_to_rank[url]))
      with open('%scommon_user_rankings_%s.tsv'
                % (info_output_dir, run_params_str), 'w') as output_file:
        for rank, (url, count) in enumerate(ranks.common_users):
          output_file.write('%s\t%s\t(%s,%s)\n'
                            % (url.strip(), count, rank,
                               ground_truth_url_to_rank[url]))
      with open('%snonexpert_user_rankings_%s.tsv'
                % (info_output_dir, run_params_str), 'w') as output_file:
        for rank, (url, count) in enumerate(ranks.non_experts):
          output_file.write('%s\t%s\t(%s,%s)\n'
                            % (url.strip(), count, rank,
                               ground_truth_url_to_rank[url]))
      with open('%sexpert_p_user_rankings_%s.tsv'
                % (info_output_dir, run_params_str), 'w') as output_file:
        for rank, (url, count) in enumerate(ranks.precision):
          output_file.write('%s\t%s\t(%s,%s)\n'
                            % (url.strip(), count, rank,
                               ground_truth_url_to_rank[url]))
      with open('%sexpert_f_user_rankings_%s.tsv'
                % (info_output_dir, run_params_str), 'w') as output_file:
        for rank, (url, count) in enumerate(ranks.fscore):
          output_file.write('%s\t%s\t(%s,%s)\n'
                            % (url.strip(), count, rank,
                               ground_truth_url_to_rank[url]))
      with open('%sexpert_c_user_rankings_%s.tsv'
                % (info_output_dir, run_params_str), 'w') as output_file:
        for rank, (url, count) in enumerate(ranks.ci):
          output_file.write('%s\t%s\t(%s,%s)\n'
                            % (url.strip(), count, rank,
                               ground_truth_url_to_rank[url]))
      with open('%sexpert_s_user_rankings_%s.tsv'
                % (info_output_dir, run_params_str), 'w') as output_file:
        for rank, (url, count) in enumerate(ranks.super_experts):
          output_file.write('%s\t%s\t(%s,%s)\n'
                            % (url.strip(), count, rank,
                            ground_truth_url_to_rank[url]))
      with open('%smixed_rankings_%s.tsv'
                % (info_output_dir, run_params_str), 'w') as output_file:
        for rank, (url, count) in enumerate(mixed_rankings):
          output_file.write('%s\t%s\t(%s,%s)\n'
                            % (url.strip(), count, rank,
                            ground_truth_url_to_rank[url]))

      with open('../data/FolkWisdom/market_precisions_%s.txt'
                % run_params_str, 'w') as out_file:
        for precision in precisions.common_users:
          out_file.write('%s\n' % precision)

      with open('../data/FolkWisdom/nonexpert_precisions_%s.txt'
                % run_params_str, 'w') as out_file:
        for precision in precisions.non_experts:
          out_file.write('%s\n' % precision)

      with open('../data/FolkWisdom/expert_p_precisions_%s.txt'
                % run_params_str, 'w') as out_file:
        for precision in precisions.precision:
          out_file.write('%s\n' % precision)

      with open('../data/FolkWisdom/expert_f_precisions_%s.txt'
                % run_params_str, 'w') as out_file:
        for precision in precisions.fscore:
          out_file.write('%s\n' % precision)

      with open('../data/FolkWisdom/expert_c_precisions_%s.txt'
                % run_params_str, 'w') as out_file:
        for precision in precisions.ci:
          out_file.write('%s\n' % precision)

      log('Drawing summary precision-recall graphs...')
      # draw_precision_recall_graph(market_precisions, market_recalls,
      precision_recall.draw([precisions.newsaholics, precisions.active_users,
                             precisions.common_users, precisions.precision,
                             precisions.fscore, precisions.ci,
                             precisions.super_experts],
                            [recalls.newsaholics, recalls.active_users,
                             recalls.common_users, recalls.precision,
                             recalls.fscore, recalls.ci, recalls.super_experts],
                            ['Newsaholics', 'Active', 'Common', 'Precision',
                             'F-score', 'CI', 'Super Experts'],
                            'precision_recall_all',
                            run_params_str)

      # Draw via old method because it has fancy markings.
      experts.draw_precision_recall_experts(precisions.non_experts, recalls.non_experts,
                                            precisions.precision, recalls.precision,
                                            precisions.fscore, recalls.fscore,
                                            precisions.ci, recalls.ci,
                                            run_params_str)

      log('Drawing experts precision-recall graph...')
      # precision_recall.draw_with_markers([precisions.population, precisions.non_experts, precisions.precision,
      #                                     precisions.fscore, precisions.ci],
      #                                    [recalls.population, recalls.non_experts, recalls.precision,
      #                                     recalls.fscore, recalls.ci],
      #                                    ['Population', 'Crowd', 'Precision', 'F-score', 'CI'],
      #                                    'precision_recall_experts',
      #                                    0, run_params_str)

      log('Drawing mixed + inact graph...')
      precision_recall.draw_with_markers([precisions.non_experts, precisions.common_users, mixed_inact_precisions],
                                         [recalls.non_experts, recalls.common_users, mixed_inact_recalls],
                                         ['Crowd', 'Inactive Crowd', 'Mixed + Inactive'],
                                         'precision_recall_mixed_and_inactive',
                                         3, run_params_str, zoom=True)

      log('Drawing ci breakdown by followers precision-recall graph...')
      precision_recall.draw([precisions.non_experts, precisions.ci,
                             precisions.ci_hi, precisions.ci_li],
                            [recalls.non_experts, recalls.ci,
                             recalls.ci_hi, recalls.ci_li],
                            ['Crowd', 'CI', 'CI High', 'CI Low'],
                            'precision_recall_ci_followers_breakdown',
                            run_params_str)

      log('Drawing social bias precision-recall graph...')
      precision_recall.draw([precisions.non_experts, precisions.social_bias,
                             precisions.precision, precisions.fscore,
                             precisions.ci],
                            [recalls.non_experts, recalls.social_bias,
                             recalls.precision, recalls.fscore,
                             recalls.ci],
                            ['Crowd', 'Influence Experts', 'Precision',
                             'F-score', 'CI'],
                            'precision_recall_social_bias',
                            run_params_str)

      log('Drawing basic groups precision-recall graph...')
      precision_recall.draw([precisions.newsaholics, precisions.active_users,
                             precisions.common_users],
                            [recalls.newsaholics, recalls.active_users,
                             recalls.common_users],
                            ['Newsaholics', 'Active Users', 'Common Users'],
                            'precision_recall_basic_groups',
                            run_params_str)

      log('Drawing crowd def precision-recall graph...')
      precision_recall.draw([precisions.non_experts, precisions.common_users],
                            [recalls.non_experts, recalls.common_users],
                            ['Crowd', 'Inactive Crowd'],
                            'precision_recall_crowd_def',
                            run_params_str, zoom=True)

      log('Drawing non_expert_sampling precision-recall graph...')
      precision_recall.draw_with_markers([precisions.non_experts, precisions.non_experts_sampled,
                                          precisions.non_experts_10, precisions.non_experts_25,
                                          precisions.non_experts_1, precisions.ci],
                                          [recalls.non_experts, recalls.non_experts_sampled,
                                           recalls.non_experts_10, recalls.non_experts_25,
                                           recalls.non_experts_1, recalls.ci],
                                          ['Crowd', 'Crowd (33% sample)', 'Crowd (10% sample)',
                                           'Crowd (5% sample)', 'Crowd (2% sample)', 'Experts (CI)'],
                                          'precision_recall_non_expert_sampling',
                                          3, run_params_str, ncol=2)

      # TODO: Replace with new method.
      log('Drawing mixed model precision-recall graph...')
      mixed_model.draw_precision_recall_mixed(precisions.non_experts, recalls.non_experts,
                                              mixed_precisions, mixed_recalls,
                                              run_params_str, zoom=True)

      log('Drawing mixed ci model precision-recall graph...')
      precision_recall.draw([precisions.non_experts, mixed_ci_precisions],
                            [recalls.non_experts, mixed_ci_recalls],
                            ['Crowd', 'Mixed'],
                            'precision_recall_mixed_ci',
                            run_params_str)

      log('Drawing weighted followers precision-recall graph...')
      precision_recall.draw([precisions.non_experts, precisions.weighted_followers, precisions.ci],
                            [recalls.non_experts, recalls.weighted_followers, recalls.ci],
                            ['Crowd', 'Weighted Followers', 'CI'],
                            'precision_recall_weighted_followers',
                            run_params_str)

      log('Drawing ci weighted graph...')
      precision_recall.draw([precisions.population, precisions.ci, precisions.ci_weighted],
                            [recalls.population, recalls.ci, recalls.ci_weighted],
                            ['Crowd', 'CI', 'CI (Weighted)'],
                            'precision_recall_ci_weighted',
                            run_params_str)

      log('Drawing weighted graph...')
      precision_recall.draw([precisions.population, precisions.weighted],
                            [recalls.population, recalls.weighted],
                            ['Crowd', 'Crowd (Weighted)'],
                            'precision_recall_weighted',
                            run_params_str)

      log('Drawing weighted both graph...')
      precision_recall.draw([precisions.population, precisions.weighted, precisions.weighted_both],
                            [recalls.population, recalls.weighted, recalls.weighted_both],
                            ['Crowd', 'Crowd (Weighted)', 'Crowd (Weighted Both)'],
                            'precision_recall_weighted_both',
                            run_params_str)
def run():
    """Computes cumulative hourly tweet shares per user group."""

    Util.ensure_dir_exist(_DATA_DIR)
    category = None
    seeds = Util.load_seeds()  # Read the Twitter seed data.

    gt_rankings = ground_truths.get_gt_rankings(seeds, DataSet.TESTING,
                                                category)
    log('Num ground_truth_rankings: %s' % len(gt_rankings))
    target_news = ground_truths.find_target_news(gt_rankings, _SIZE_TOP_NEWS)
    log('Size target_news: %s' % len(target_news))

    for delta in _DELTAS:
        (num_users, newsaholics, active_users,
         common_users) = basic_groups.group_users(delta, category)
        population = newsaholics.union(active_users).union(common_users)
        log('Num newsaholics: %s' % len(newsaholics))
        log('Num active: %s' % len(active_users))
        log('Num common: %s' % len(common_users))
        log('Num users (population): %s' % len(population))

        # -- Get experts --
        ExpertGroup.precision = experts.select_experts_precision(
            newsaholics.union(active_users), num_users, delta, _SIZE_EXPERTS,
            category)
        ExpertGroup.fscore = experts.select_experts_fscore(
            len(target_news), num_users, delta, _SIZE_EXPERTS, category)
        ExpertGroup.ci = experts.select_experts_ci(num_users, delta,
                                                   _SIZE_EXPERTS, category)
        ExpertGroup.union = experts.select_all_experts(ExpertGroup.precision,
                                                       ExpertGroup.fscore,
                                                       ExpertGroup.ci)

        log('Num experts (precision): %s' % len(ExpertGroup.precision))
        log('Num experts (fscore): %s' % len(ExpertGroup.fscore))
        log('Num experts (ci): %s' % len(ExpertGroup.ci))
        log('Num all experts: %s' % len(ExpertGroup.union))

        non_experts = population.difference(ExpertGroup.union)
        log('Num non_experts: %s' % len(non_experts))

        # other_users = population.difference(all_experts).difference(common_users)

        # -- counting --

        total_num_tweets = 0
        hour_to_num_tweets = {}
        with open('../data/FolkWisdom/time_deltas.tsv') as in_file:
            for line in in_file:
                tokens = line.split('\t')
                time_delta_in_sec = int(tokens[_TIMEDELTAS_FILE_DELTA_INDEX])
                url = tokens[_TIMEDELTAS_FILE_URL_INDEX].strip()
                user_id = tokens[_TIMEDELTAS_FILE_USER_ID_INDEX]

                if time_delta_in_sec > 0 and url in target_news:
                    # Integer division (Python 2) buckets seconds into hours.
                    current_hour = time_delta_in_sec / _NUM_SEC_PER_HOUR
                    total_num_tweets += 1

                    if current_hour not in hour_to_num_tweets:
                        hour_to_num_tweets[current_hour] = GroupCount()
                    gcount = hour_to_num_tweets[current_hour]
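                    # The membership check above is the setdefault pattern; a
                    # collections.defaultdict(GroupCount) would remove it,
                    # e.g. (hypothetical):
                    #   hour_to_num_tweets = collections.defaultdict(GroupCount)
                    #   gcount = hour_to_num_tweets[current_hour]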

                    gcount.population += 1
                    if user_id in ExpertGroup.union:
                        gcount.union += 1
                        if user_id in ExpertGroup.precision:
                            gcount.precision += 1
                        if user_id in ExpertGroup.fscore:
                            gcount.fscore += 1
                        if user_id in ExpertGroup.ci:
                            gcount.ci += 1
                    else:
                        gcount.non_experts += 1
                        if user_id in common_users:
                            gcount.common += 1

                        #  print >> sys.stderr, 'Error: user in expert union does not belong to any expert group'

                    # elif user_id in common_users:
                    #   gcount.common += 1
                    # else :
                    #   gcount.other += 1

                    # if user_id in non_experts:
                    #   gcount.non_experts += 1

        gcount = GroupCount()  # reused as a running cumulative counter
        with open(_DATA_DIR + 'hour_thresholds_%s.tsv' % delta,
                  'w') as out_file:
            # Hours must be visited in order for the cumulative counts below.
            for hour in sorted(hour_to_num_tweets.keys()):
                gc = hour_to_num_tweets[hour]
                gcount.add(gc)
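                # gcount accumulates across hours, so every percentage below
                # is cumulative; the final row's population share reaches 100%.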
                percentage = (gcount.population /
                              float(total_num_tweets)) * 100.0
                percentage_common = (gcount.common /
                                     float(total_num_tweets)) * 100.0
                percentage_other = (gcount.other /
                                    float(total_num_tweets)) * 100.0
                percentage_experts = (gcount.union /
                                      float(total_num_tweets)) * 100.0
                percentage_non_experts = (gcount.non_experts /
                                          float(total_num_tweets)) * 100.0

                out_file.write(
                    '%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' %
                    (hour, percentage, percentage_non_experts,
                     percentage_experts, percentage_common,
                     (gcount.precision / float(total_num_tweets)) * 100.0,
                     (gcount.fscore / float(total_num_tweets)) * 100.0,
                     (gcount.ci / float(total_num_tweets)) * 100.0))
        log('hour\tpopulation\tnon_experts\texperts\tcommon\tprecision'
            '\tfscore\tci')
def run():
  """Computes cumulative hourly tweet shares per user group."""

  Util.ensure_dir_exist(_DATA_DIR)
  category = None
  seeds = Util.load_seeds()  # Read the Twitter seed data.

  gt_rankings = ground_truths.get_gt_rankings(seeds, DataSet.TESTING,
                                              category)
  log('Num ground_truth_rankings: %s' % len(gt_rankings))
  target_news = ground_truths.find_target_news(gt_rankings, _SIZE_TOP_NEWS)
  log('Size target_news: %s' % len(target_news))

  for delta in _DELTAS:
    (num_users, newsaholics,
     active_users, common_users) = basic_groups.group_users(delta, category)
    population = newsaholics.union(active_users).union(common_users)
    log('Num newsaholics: %s' % len(newsaholics))
    log('Num active: %s' % len(active_users))
    log('Num common: %s' % len(common_users))
    log('Num users (population): %s' % len(population))

    # -- Get experts --
    ExpertGroup.precision = experts.select_experts_precision(
        newsaholics.union(active_users), num_users, delta, _SIZE_EXPERTS,
        category)
    ExpertGroup.fscore = experts.select_experts_fscore(len(target_news),
                                                       num_users,
                                                       delta, _SIZE_EXPERTS,
                                                       category)
    ExpertGroup.ci = experts.select_experts_ci(num_users, delta, _SIZE_EXPERTS,
                                               category)
    ExpertGroup.union = experts.select_all_experts(ExpertGroup.precision,
                                                   ExpertGroup.fscore,
                                                   ExpertGroup.ci)

    log('Num experts (precision): %s' % len(ExpertGroup.precision))
    log('Num experts (fscore): %s' % len(ExpertGroup.fscore))
    log('Num experts (ci): %s' % len(ExpertGroup.ci))
    log('Num all experts: %s' % len(ExpertGroup.union))

    non_experts = population.difference(ExpertGroup.union)
    log('Num non_experts: %s' % len(non_experts))

    # other_users = population.difference(all_experts).difference(common_users)


    # -- counting --

    total_num_tweets = 0 
    hour_to_num_tweets = {}
    with open('../data/FolkWisdom/time_deltas.tsv') as in_file:
      for line in in_file:
        tokens = line.split('\t')
        time_delta_in_sec = int(tokens[_TIMEDELTAS_FILE_DELTA_INDEX])
        url = tokens[_TIMEDELTAS_FILE_URL_INDEX].strip()
        user_id = tokens[_TIMEDELTAS_FILE_USER_ID_INDEX]
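        # time_deltas.tsv is assumed tab-separated; the _TIMEDELTAS_FILE_*
        # constants (module-level, not shown in this excerpt) give the
        # column offsets.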

        if time_delta_in_sec > 0 and url in target_news:
          # Integer division (Python 2) buckets seconds into hours.
          current_hour = time_delta_in_sec / _NUM_SEC_PER_HOUR
          total_num_tweets += 1

          if current_hour not in hour_to_num_tweets:
            hour_to_num_tweets[current_hour] = GroupCount()
          gcount = hour_to_num_tweets[current_hour]

          gcount.population += 1
          if user_id in ExpertGroup.union:
            gcount.union += 1
            if user_id in ExpertGroup.precision:
              gcount.precision += 1
            if user_id in ExpertGroup.fscore:
              gcount.fscore += 1
            if user_id in ExpertGroup.ci:
              gcount.ci += 1
          else:
            gcount.non_experts += 1
            if user_id in common_users:
              gcount.common += 1
            
            #  print >> sys.stderr, 'Error: user in expert union does not belong to any expert group'

          # elif user_id in common_users:
          #   gcount.common += 1
          # else :
          #   gcount.other += 1

          # if user_id in non_experts:
          #   gcount.non_experts += 1

    gcount = GroupCount()  # reused as a running cumulative counter
    with open(_DATA_DIR + 'hour_thresholds_%s.tsv' % delta, 'w') as out_file:
      # Hours must be visited in order for the cumulative counts below.
      for hour in sorted(hour_to_num_tweets.keys()):
        gc = hour_to_num_tweets[hour]
        gcount.add(gc)
        percentage = (gcount.population / float(total_num_tweets)) * 100.0
        percentage_common = (gcount.common / float(total_num_tweets)) * 100.0
        percentage_other = (gcount.other / float(total_num_tweets)) * 100.0
        percentage_experts = (gcount.union / float(total_num_tweets)) * 100.0
        percentage_non_experts = (gcount.non_experts / float(total_num_tweets)) * 100.0
        
        out_file.write('%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' % (hour, percentage,
                                                             percentage_non_experts,
                                                             percentage_experts,
                                                             percentage_common,
                                                             (gcount.precision / float(total_num_tweets)) * 100.0,
                                                             (gcount.fscore / float(total_num_tweets)) * 100.0,
                                                             (gcount.ci / float(total_num_tweets)) * 100.0))
    log('hour\tpopulation\tnon_experts\texperts\tcommon\tprecision\tfscore\tci')
def run():
    """Contains the main logic for this analysis."""
    FileLog.set_log_dir()

    seeds = Util.load_seeds()
    for category in _CATEGORIES:
        log('Performing analysis for category: %s' % category)
        size_top_news = _SIZE_TOP_NEWS
        if category:
            size_top_news = .10

        data_set = DataSet.TESTING
        retweets = set()
        if _SWITCHED:
            data_set = DataSet.TRAINING
        if _EXCLUDE_RETWEETS:
            retweets = ground_truths.find_retweets(_TESTING_SET_MONTHS)
        log('Num retweets to exclude: %s' % len(retweets))
        gt_rankings = ground_truths.get_gt_rankings(
            seeds,
            data_set,
            category,
            exclude_tweets_within_delta=_EXCLUDE_TWEETS_WITHIN_DELTA,
            retweets=retweets)
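        # Ground truths come from the testing set unless _SWITCHED selects
        # training; retweets and tweets within the delta are optionally
        # excluded via the flags above.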
        log('Num ground_truth_rankings: %s' % len(gt_rankings))

        # Invert ground truths into a url -> rank map for later lookups.
        ground_truth_url_to_rank = {}
        for rank, (url, count) in enumerate(gt_rankings):
            ground_truth_url_to_rank[url] = rank

        target_news = ground_truths.find_target_news(gt_rankings,
                                                     size_top_news)
        log('Size target_news: %s' % len(target_news))

        for delta in _DELTAS:
            run_params_str = 'd%s_t%s_e%s_%s' % (delta,
                                                 int(size_top_news * 100),
                                                 int(_SIZE_EXPERTS * 100),
                                                 category)
            info_output_dir = '../graph/FolkWisdom/%s/info/' % run_params_str
            Util.ensure_dir_exist(info_output_dir)

            groups, d_num_followers = user_groups.get_all_user_groups(
                delta, category)
            log('Num experts (precision): %s' % len(groups.precision))
            log('Num experts (fscore): %s' % len(groups.fscore))
            log('Num experts (ci): %s' % len(groups.ci))
            log('Num Super Experts: %s' % len(groups.super_experts))
            log('Num Social Bias Experts: %s' % len(groups.social_bias))

            log('Finding rankings with an %s hour delta.' % delta)
            ranks = rankings.get_rankings(delta, seeds, groups, category,
                                          d_num_followers)

            # Output some interesting info to file
            size_market_unfiltered = '0'
            with open('../data/FolkWisdom/size_of_market_unfiltered.txt'
                      ) as in_file:
                size_market_unfiltered = in_file.readline().strip()

            with open(
                    '%suser_demographics_%s.txt' %
                (info_output_dir, run_params_str), 'w') as output_file:
                output_file.write('Number of Newsaholics: %s\n' %
                                  len(groups.newsaholics))
                output_file.write('Number of Active Users: %s\n' %
                                  len(groups.active_users))
                output_file.write('Number of Common Users: %s\n' %
                                  len(groups.common_users))
                output_file.write('\n')
                output_file.write('Number of Precision Experts: %s\n' %
                                  len(groups.precision))
                output_file.write('Number of F-Score Experts: %s\n' %
                                  len(groups.fscore))
                output_file.write('Number of CI Experts: %s\n' %
                                  len(groups.ci))
                output_file.write('Number of Social Bias Experts: %s\n' %
                                  len(groups.social_bias))
                output_file.write('Total number of unique experts: %s\n' %
                                  len(groups.all_experts))
                output_file.write(
                    'Number of Precision and F-Score Experts: %s\n' %
                    len(groups.precision.intersection(groups.fscore)))
                output_file.write(
                    'Number of Precision and CI Experts: %s\n' %
                    len(groups.precision.intersection(groups.ci)))
                output_file.write('Number of F-Score and CI Experts: %s\n' %
                                  len(groups.fscore.intersection(groups.ci)))
                output_file.write('Number of Super Experts: %s\n' %
                                  len(groups.super_experts))
                output_file.write('\n')
                output_file.write(
                    'Number of Users (Total): %s\n' %
                    (len(groups.newsaholics) + len(groups.active_users) +
                     len(groups.common_users)))
                output_file.write('Size of market (unfiltered): %s\n' %
                                  size_market_unfiltered)
                output_file.write('\n')
                # output_file.write('Number of votes by Newsaholics: %s\n'
                #                   % num_votes_newsaholics)
                # output_file.write('Number of votes by Market: %s\n' % num_votes_market)
                # output_file.write('Number of votes by Active Users: %s\n'
                #                   % num_votes_active)
                # output_file.write('Number of votes by Common Users: %s\n'
                #                   % num_votes_common)
                # output_file.write('\n');
                # output_file.write('Number of votes by Expert (Precision) Users: %s\n'
                #         % num_votes_expert_precision)
                # output_file.write('Number of votes by Expert (fscore) Users: %s\n'
                #         % num_votes_expert_fscore)
                # output_file.write('Number of votes by Expert (ci) Users: %s\n'
                #         % num_votes_expert_ci)
                # output_file.write('Number of votes by Super Experts: %s\n'
                #                   % num_votes_expert_s)
                # output_file.write('Number of votes by Social Bias Experts: %s\n'
                #                   % num_votes_expert_sb)
                # output_file.write('\n')
                # output_file.write('Total Number of votes cast: %s\n'
                #                   % (num_votes_newsaholics + num_votes_active
                #                      + num_votes_common))
                # output_file.write('\n')
                output_file.write('Total Number of Good News: %s\n' %
                                  len(target_news))

            log('Ground Truth Top 50')
            for i in range(min(len(gt_rankings), 50)):
                url, count = gt_rankings[i]
                log('[%s] %s\t%s' % (i, url.strip(), count))
            log('-----------------------------------')
            log('Newsaholic Top 5')
            for i in range(min(len(ranks.newsaholics), 5)):
                url, count = ranks.newsaholics[i]
                log('[%s] %s\t%s' % (i, url.strip(), count))
            log('-----------------------------------')
            log('Active Top 5')
            for i in range(min(len(ranks.active_users), 5)):
                url, count = ranks.active_users[i]
                log('[%s] %s\t%s' % (i, url.strip(), count))
            log('-----------------------------------')
            log('Common Top 5')
            for i in range(min(len(ranks.common_users), 5)):
                url, count = ranks.common_users[i]
                log('[%s] %s\t%s' % (i, url.strip(), count))
            log('-----------------------------------')
            log('nonexpert Top 5')
            for i in range(min(len(ranks.non_experts), 5)):
                url, count = ranks.non_experts[i]
                log('[%s] %s\t%s' % (i, url.strip(), count))
            log('-----------------------------------')
            log('Expert (Precision) Top 5')
            for i in range(min(len(ranks.precision), 5)):
                url, count = ranks.precision[i]
                log('[%s] %s\t%s' % (i, url.strip(), count))
            log('-----------------------------------')
            log('Expert (fscore) Top 5')
            for i in range(min(len(ranks.fscore), 5)):
                url, count = ranks.fscore[i]
                log('[%s] %s\t%s' % (i, url.strip(), count))
            log('-----------------------------------')
            log('Expert (ci) Top 5')
            for i in range(min(len(ranks.ci), 5)):
                url, count = ranks.ci[i]
                log('[%s] %s\t%s' % (i, url.strip(), count))
            log('-----------------------------------')
            log('Super Expert Top 5')
            for i in range(min(len(ranks.super_experts), 5)):
                url, count = ranks.super_experts[i]
                log('[%s] %s\t%s' % (i, url.strip(), count))
            log('-----------------------------------')
            log('Social Bias Expert Top 5')
            for i in range(min(len(ranks.social_bias), 5)):
                url, count = ranks.social_bias[i]
                log('[%s] %s\t%s' % (i, url.strip(), count))
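            # The Top-5 blocks above share one shape; a hypothetical helper
            # (not in the original module) could express it once:
            #
            #   def log_top(label, ranking, n=5):
            #       log('%s Top %s' % (label, n))
            #       for i in range(min(len(ranking), n)):
            #           url, count = ranking[i]
            #           log('[%s] %s\t%s' % (i, url.strip(), count))
            #       log('-----------------------------------')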

            market_rank_to_url = {}
            newsaholic_rank_to_url = {}
            active_rank_to_url = {}
            common_rank_to_url = {}
            expert_p_rank_to_url = {}
            expert_f_rank_to_url = {}
            expert_c_rank_to_url = {}
            expert_s_rank_to_url = {}
            for rank, (url, count) in enumerate(ranks.newsaholics):
                newsaholic_rank_to_url[rank] = url
            for rank, (url, count) in enumerate(ranks.population):
                market_rank_to_url[rank] = url
            for rank, (url, count) in enumerate(ranks.active_users):
                active_rank_to_url[rank] = url
            for rank, (url, count) in enumerate(ranks.common_users):
                common_rank_to_url[rank] = url
            for rank, (url, count) in enumerate(ranks.precision):
                expert_p_rank_to_url[rank] = url
            for rank, (url, count) in enumerate(ranks.fscore):
                expert_f_rank_to_url[rank] = url
            for rank, (url, count) in enumerate(ranks.ci):
                expert_c_rank_to_url[rank] = url
            for rank, (url, count) in enumerate(ranks.super_experts):
                expert_s_rank_to_url[rank] = url

            population_url_to_rank = {}
            market_url_to_rank = {}
            precision_url_to_rank = {}
            fscore_url_to_rank = {}
            ci_url_to_rank = {}
            ci_1_url_to_rank = {}
            ci_2_url_to_rank = {}
            ci_3_url_to_rank = {}
            common_url_to_rank = {}
            for rank, (url, count) in enumerate(ranks.population):
                population_url_to_rank[url] = rank
            for rank, (url, count) in enumerate(ranks.non_experts):
                market_url_to_rank[url] = rank
            for rank, (url, count) in enumerate(ranks.precision):
                precision_url_to_rank[url] = rank
            for rank, (url, count) in enumerate(ranks.fscore):
                fscore_url_to_rank[url] = rank
            for rank, (url, count) in enumerate(ranks.ci):
                ci_url_to_rank[url] = rank
            for rank, (url, count) in enumerate(ranks.ci_1):
                ci_1_url_to_rank[url] = rank
            for rank, (url, count) in enumerate(ranks.ci_2):
                ci_2_url_to_rank[url] = rank
            for rank, (url, count) in enumerate(ranks.ci_3):
                ci_3_url_to_rank[url] = rank
            for rank, (url, count) in enumerate(ranks.common_users):
                common_url_to_rank[url] = rank
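            # The rank -> url maps built further above are not referenced
            # again in this excerpt; the url -> rank maps are what the mixed
            # models below consume. A hypothetical helper could build each:
            #   def to_url_rank_map(ranking):
            #       return dict((url, rank)
            #                   for rank, (url, _) in enumerate(ranking))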

            precisions, recalls = precision_recall.get_precision_recalls(
                gt_rankings, ranks)

            mixed_rankings = mixed_model.get_mixed_rankings(
                market_url_to_rank, precisions.non_experts,
                precision_url_to_rank, precisions.precision,
                fscore_url_to_rank, precisions.fscore, ci_url_to_rank,
                precisions.ci, ground_truth_url_to_rank)

            mixed_inact_rankings = mixed_model.get_mixed_rankings(
                common_url_to_rank, precisions.common_users,
                precision_url_to_rank, precisions.precision,
                fscore_url_to_rank, precisions.fscore, ci_url_to_rank,
                precisions.ci, ground_truth_url_to_rank)

            mixed_ci_rankings = mixed_model.get_mixed_rankings(
                market_url_to_rank, precisions.non_experts, ci_1_url_to_rank,
                precisions.ci_1, ci_2_url_to_rank, precisions.ci_2,
                ci_3_url_to_rank, precisions.ci_3, ground_truth_url_to_rank)

            mixed_precisions, mixed_recalls = precision_recall.calc_precision_recall(
                gt_rankings, mixed_rankings)

            mixed_inact_precisions, mixed_inact_recalls = precision_recall.calc_precision_recall(
                gt_rankings, mixed_inact_rankings)

            mixed_ci_precisions, mixed_ci_recalls = precision_recall.calc_precision_recall(
                gt_rankings, mixed_ci_rankings)
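            # The 'Mixed (min)' label logged below hints that
            # mixed_model.get_mixed_rankings keeps each url's best (minimum)
            # rank across the contributing groups; this is inferred from the
            # label, not confirmed by this excerpt.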

            log('-----------------------------------')
            log('Mixed (min) Top 5')
            for i in range(min(len(mixed_rankings), 5)):
                url, count = mixed_rankings[i]
                log('[%s] %s\t%s' % (i + 1, url, count))
            log('-----------------------------------')

            with open(
                    '%sranking_comparisons_%s.tsv' %
                (info_output_dir, run_params_str), 'w') as out_file:
                for gt_rank, (gt_url, _) in enumerate(gt_rankings):
                    market_rank = 0
                    precision_rank = 0
                    ci_rank = 0
                    fscore_rank = 0
                    inactive_crowd_rank = 0
                    if gt_url in market_url_to_rank:
                        market_rank = market_url_to_rank[gt_url] + 1
                    if gt_url in precision_url_to_rank:
                        precision_rank = precision_url_to_rank[gt_url] + 1
                    if gt_url in ci_url_to_rank:
                        ci_rank = ci_url_to_rank[gt_url] + 1
                    if gt_url in fscore_url_to_rank:
                        fscore_rank = fscore_url_to_rank[gt_url] + 1
                    if gt_url in common_url_to_rank:
                        inactive_crowd_rank = common_url_to_rank[gt_url] + 1
                    line = '%s\t%s\t%s\t%s\t%s\t%s\t%s\n' % (
                        gt_url, gt_rank + 1, market_rank, inactive_crowd_rank,
                        precision_rank, ci_rank, fscore_rank)
                    out_file.write(line)

            with open(
                    '%sground_truth_rankings_%s.tsv' %
                (info_output_dir, run_params_str), 'w') as output_file:
                for url, count in gt_rankings:
                    output_file.write('%s\t%s\n' % (url.strip(), count))
            with open(
                    '%smarket_rankings_%s.tsv' %
                (info_output_dir, run_params_str), 'w') as output_file:
                for rank, (url, count) in enumerate(ranks.common_users):
                    output_file.write('%s\t%s\t(%s,%s)\n' %
                                      (url.strip(), count, rank,
                                       ground_truth_url_to_rank[url]))
            with open(
                    '%snewsaholic_rankings_%s.tsv' %
                (info_output_dir, run_params_str), 'w') as output_file:
                for rank, (url, count) in enumerate(ranks.newsaholics):
                    output_file.write('%s\t%s\t(%s,%s)\n' %
                                      (url.strip(), count, rank,
                                       ground_truth_url_to_rank[url]))
            with open(
                    '%sactive_user_rankings_%s.tsv' %
                (info_output_dir, run_params_str), 'w') as output_file:
                for rank, (url, count) in enumerate(ranks.active_users):
                    output_file.write('%s\t%s\t(%s,%s)\n' %
                                      (url.strip(), count, rank,
                                       ground_truth_url_to_rank[url]))
            with open(
                    '%scommon_user_rankings_%s.tsv' %
                (info_output_dir, run_params_str), 'w') as output_file:
                for rank, (url, count) in enumerate(ranks.common_users):
                    output_file.write('%s\t%s\t(%s,%s)\n' %
                                      (url.strip(), count, rank,
                                       ground_truth_url_to_rank[url]))
            with open(
                    '%snonexpert_user_rankings_%s.tsv' %
                (info_output_dir, run_params_str), 'w') as output_file:
                for rank, (url, count) in enumerate(ranks.non_experts):
                    output_file.write('%s\t%s\t(%s,%s)\n' %
                                      (url.strip(), count, rank,
                                       ground_truth_url_to_rank[url]))
            with open(
                    '%sexpert_p_user_rankings_%s.tsv' %
                (info_output_dir, run_params_str), 'w') as output_file:
                for rank, (url, count) in enumerate(ranks.precision):
                    output_file.write('%s\t%s\t(%s,%s)\n' %
                                      (url.strip(), count, rank,
                                       ground_truth_url_to_rank[url]))
            with open(
                    '%sexpert_f_user_rankings_%s.tsv' %
                (info_output_dir, run_params_str), 'w') as output_file:
                for rank, (url, count) in enumerate(ranks.fscore):
                    output_file.write('%s\t%s\t(%s,%s)\n' %
                                      (url.strip(), count, rank,
                                       ground_truth_url_to_rank[url]))
            with open(
                    '%sexpert_c_user_rankings_%s.tsv' %
                (info_output_dir, run_params_str), 'w') as output_file:
                for rank, (url, count) in enumerate(ranks.ci):
                    output_file.write('%s\t%s\t(%s,%s)\n' %
                                      (url.strip(), count, rank,
                                       ground_truth_url_to_rank[url]))
            with open(
                    '%sexpert_s_user_rankings_%s.tsv' %
                (info_output_dir, run_params_str), 'w') as output_file:
                for rank, (url, count) in enumerate(ranks.super_experts):
                    output_file.write('%s\t%s\t(%s,%s)\n' %
                                      (url.strip(), count, rank,
                                       ground_truth_url_to_rank[url]))
            with open(
                    '%smixed_rankings_%s.tsv' %
                (info_output_dir, run_params_str), 'w') as output_file:
                for rank, (url, count) in enumerate(mixed_rankings):
                    output_file.write('%s\t%s\t(%s,%s)\n' %
                                      (url.strip(), count, rank,
                                       ground_truth_url_to_rank[url]))

            with open(
                    '../data/FolkWisdom/market_precisions_%s.txt' %
                    run_params_str, 'w') as out_file:
                for precision in precisions.common_users:
                    out_file.write('%s\n' % precision)

            with open(
                    '../data/FolkWisdom/nonexpert_precisions_%s.txt' %
                    run_params_str, 'w') as out_file:
                for precision in precisions.non_experts:
                    out_file.write('%s\n' % precision)

            with open(
                    '../data/FolkWisdom/expert_p_precisions_%s.txt' %
                    run_params_str, 'w') as out_file:
                for precision in precisions.precision:
                    out_file.write('%s\n' % precision)

            with open(
                    '../data/FolkWisdom/expert_f_precisions_%s.txt' %
                    run_params_str, 'w') as out_file:
                for precision in precisions.fscore:
                    out_file.write('%s\n' % precision)

            with open(
                    '../data/FolkWisdom/expert_c_precisions_%s.txt' %
                    run_params_str, 'w') as out_file:
                for precision in precisions.ci:
                    out_file.write('%s\n' % precision)

            log('Drawing summary precision-recall graphs...')
            # draw_precision_recall_graph(market_precisions, market_recalls,
            precision_recall.draw([
                precisions.newsaholics, precisions.active_users,
                precisions.common_users, precisions.precision,
                precisions.fscore, precisions.ci, precisions.super_experts
            ], [
                recalls.newsaholics, recalls.active_users,
                recalls.common_users, recalls.precision, recalls.fscore,
                recalls.ci, recalls.super_experts
            ], [
                'Newsaholics', 'Active', 'Common', 'Precision', 'F-score',
                'CI', 'Super Experts'
            ], 'precision_recall_all', run_params_str)

            # Draw via old method because it has fancy markings.
            experts.draw_precision_recall_experts(
                precisions.non_experts, recalls.non_experts,
                precisions.precision, recalls.precision, precisions.fscore,
                recalls.fscore, precisions.ci, recalls.ci, run_params_str)

            log('Drawing experts precision-recall graph...')
            # precision_recall.draw_with_markers([precisions.population, precisions.non_experts, precisions.precision,
            #                                     precisions.fscore, precisions.ci],
            #                                    [recalls.population, recalls.non_experts, recalls.precision,
            #                                     recalls.fscore, recalls.ci],
            #                                    ['Population', 'Crowd', 'Precision', 'F-score', 'CI'],
            #                                    'precision_recall_experts',
            #                                    0, run_params_str)

            log('Drawing mixed + inact graph...')
            precision_recall.draw_with_markers(
                [
                    precisions.non_experts, precisions.common_users,
                    mixed_inact_precisions
                ], [
                    recalls.non_experts, recalls.common_users,
                    mixed_inact_recalls
                ], ['Crowd', 'Inactive Crowd', 'Mixed + Inactive'],
                'precision_recall_mixed_and_inactive',
                3,
                run_params_str,
                zoom=True)

            log('Drawing ci breakdown by followers precision-recall graph...')
            precision_recall.draw([
                precisions.non_experts, precisions.ci, precisions.ci_hi,
                precisions.ci_li
            ], [recalls.non_experts, recalls.ci, recalls.ci_hi, recalls.ci_li],
                                  ['Crowd', 'CI', 'CI High', 'CI Low'],
                                  'precision_recall_ci_followers_breakdown',
                                  run_params_str)

            log('Drawing social bias precision-recall graph...')
            precision_recall.draw([
                precisions.non_experts, precisions.social_bias,
                precisions.precision, precisions.fscore, precisions.ci
            ], [
                recalls.non_experts, recalls.social_bias, recalls.precision,
                recalls.fscore, recalls.ci
            ], ['Crowd', 'Influence Experts', 'Precision', 'F-score', 'CI'],
                                  'precision_recall_social_bias',
                                  run_params_str)

            log('Drawing basic groups precision-recall graph...')
            precision_recall.draw([
                precisions.newsaholics, precisions.active_users,
                precisions.common_users
            ], [
                recalls.newsaholics, recalls.active_users, recalls.common_users
            ], ['Newsaholics', 'Active Users', 'Common Users'],
                                  'precision_recall_basic_groups',
                                  run_params_str)

            log('Drawing crowd def precision-recall graph...')
            precision_recall.draw(
                [precisions.non_experts, precisions.common_users],
                [recalls.non_experts, recalls.common_users],
                ['Crowd', 'Inactive Crowd'],
                'precision_recall_crowd_def',
                run_params_str,
                zoom=True)

            log('Drawing non_expert_sampling precision-recall graph...')
            precision_recall.draw_with_markers(
                [
                    precisions.non_experts, precisions.non_experts_sampled,
                    precisions.non_experts_10, precisions.non_experts_25,
                    precisions.non_experts_1, precisions.ci
                ], [
                    recalls.non_experts, recalls.non_experts_sampled,
                    recalls.non_experts_10, recalls.non_experts_25,
                    recalls.non_experts_1, recalls.ci
                ], [
                    'Crowd', 'Crowd (33% sample)', 'Crowd (10% sample)',
                    'Crowd (5% sample)', 'Crowd (2% sample)', 'Experts (CI)'
                ],
                'precision_recall_non_expert_sampling',
                3,
                run_params_str,
                ncol=2)

            # TODO: Replace with new method.
            log('Drawing mixed model precision-recall graph...')
            mixed_model.draw_precision_recall_mixed(precisions.non_experts,
                                                    recalls.non_experts,
                                                    mixed_precisions,
                                                    mixed_recalls,
                                                    run_params_str,
                                                    zoom=True)

            log('Drawing mixed ci model precision-recall graph...')
            precision_recall.draw(
                [precisions.non_experts, mixed_ci_precisions],
                [recalls.non_experts, mixed_ci_recalls], ['Crowd', 'Mixed'],
                'precision_recall_mixed_ci', run_params_str)

            log('Drawing weighted followers precision-recall graph...')
            precision_recall.draw([
                precisions.non_experts, precisions.weighted_followers,
                precisions.ci
            ], [recalls.non_experts, recalls.weighted_followers, recalls.ci],
                                  ['Crowd', 'Weighted Followers', 'CI'],
                                  'precision_recall_weighted_followers',
                                  run_params_str)

            log('Drawing ci weighted graph...')
            precision_recall.draw(
                [precisions.population, precisions.ci, precisions.ci_weighted],
                [recalls.population, recalls.ci, recalls.ci_weighted],
                ['Crowd', 'CI', 'CI (Weighted)'],
                'precision_recall_ci_weighted', run_params_str)

            log('Drawing weighted graph...')
            precision_recall.draw([precisions.population, precisions.weighted],
                                  [recalls.population, recalls.weighted],
                                  ['Crowd', 'Crowd (Weighted)'],
                                  'precision_recall_weighted', run_params_str)

            log('Drawing weighted both graph...')
            precision_recall.draw(
                [
                    precisions.population, precisions.weighted,
                    precisions.weighted_both
                ],
                [recalls.population, recalls.weighted, recalls.weighted_both],
                ['Crowd', 'Crowd (Weighted)', 'Crowd (Weighted Both)'],
                'precision_recall_weighted_both', run_params_str)