コード例 #1
0
def run():
    """Compute per-hour tweet-volume breakdowns for each user group.

    For every delta in _DELTAS, partitions users into basic groups, selects
    expert groups (precision / fscore / ci and their union), then scans the
    time-deltas TSV counting tweets on target news per hour. Writes, for each
    hour, the *cumulative* percentage of total tweets contributed by each
    group to hour_thresholds_<delta>.tsv.

    Side effects: creates _DATA_DIR if missing, reads
    ../data/FolkWisdom/time_deltas.tsv, writes one TSV per delta, and
    mutates the class-level attributes of ExpertGroup.
    """
    Util.ensure_dir_exist(_DATA_DIR)
    category = None
    seeds = Util.load_seeds()  # read twitter data

    gt_rankings = ground_truths.get_gt_rankings(seeds, DataSet.TESTING,
                                                category)
    log('Num ground_truth_rankings: %s' % len(gt_rankings))
    target_news = ground_truths.find_target_news(gt_rankings, _SIZE_TOP_NEWS)
    log('Size target_news: %s' % len(target_news))

    for delta in _DELTAS:
        (num_users, newsaholics, active_users,
         common_users) = basic_groups.group_users(delta, category)
        population = newsaholics.union(active_users).union(common_users)
        log('Num newsaholics: %s' % len(newsaholics))
        log('Num active: %s' % len(active_users))
        log('Num common: %s' % len(common_users))
        log('Num users (population): %s' % len(population))

        # -- Get experts --
        ExpertGroup.precision = experts.select_experts_precision(
            newsaholics.union(active_users), num_users, delta, _SIZE_EXPERTS,
            category)
        ExpertGroup.fscore = experts.select_experts_fscore(
            len(target_news), num_users, delta, _SIZE_EXPERTS, category)
        ExpertGroup.ci = experts.select_experts_ci(num_users, delta,
                                                   _SIZE_EXPERTS, category)
        ExpertGroup.union = experts.select_all_experts(ExpertGroup.precision,
                                                       ExpertGroup.fscore,
                                                       ExpertGroup.ci)

        log('Num experts (precision): %s' % len(ExpertGroup.precision))
        log('Num experts (fscore): %s' % len(ExpertGroup.fscore))
        log('Num experts (ci): %s' % len(ExpertGroup.ci))
        log('Num all experts: %s' % len(ExpertGroup.union))

        non_experts = population.difference(ExpertGroup.union)
        log('Num non_experts: %s' % len(non_experts))

        # -- counting --

        total_num_tweets = 0
        hour_to_num_tweets = {}
        with open('../data/FolkWisdom/time_deltas.tsv') as in_file:
            for line in in_file:
                tokens = line.split('\t')
                time_delta_in_sec = int(tokens[_TIMEDELTAS_FILE_DELTA_INDEX])
                url = tokens[_TIMEDELTAS_FILE_URL_INDEX].strip()
                user_id = tokens[_TIMEDELTAS_FILE_USER_ID_INDEX]

                if time_delta_in_sec > 0 and url in target_news:
                    # Floor division: bucket by whole hours. Using // keeps
                    # Python 2 semantics and stays correct under Python 3.
                    current_hour = time_delta_in_sec // _NUM_SEC_PER_HOUR
                    total_num_tweets += 1

                    if current_hour not in hour_to_num_tweets:
                        hour_to_num_tweets[current_hour] = GroupCount()
                    gcount = hour_to_num_tweets[current_hour]

                    gcount.population += 1
                    if user_id in ExpertGroup.union:
                        gcount.union += 1
                        if user_id in ExpertGroup.precision:
                            gcount.precision += 1
                        if user_id in ExpertGroup.fscore:
                            gcount.fscore += 1
                        if user_id in ExpertGroup.ci:
                            gcount.ci += 1
                    else:
                        gcount.non_experts += 1
                        if user_id in common_users:
                            gcount.common += 1

        # Accumulate counts in ascending hour order so each row is the
        # cumulative share of tweets seen *up to* that hour. (Iterating the
        # dict in arbitrary order would make the cumulative columns wrong.)
        gcount = GroupCount()
        with open(_DATA_DIR + 'hour_thresholds_%s.tsv' % delta,
                  'w') as out_file:
            for hour in sorted(hour_to_num_tweets.keys()):
                gc = hour_to_num_tweets[hour]
                gcount.add(gc)
                percentage = (gcount.population /
                              float(total_num_tweets)) * 100.0
                percentage_common = (gcount.common /
                                     float(total_num_tweets)) * 100.0
                percentage_experts = (gcount.union /
                                      float(total_num_tweets)) * 100.0
                percentage_non_experts = (gcount.non_experts /
                                          float(total_num_tweets)) * 100.0

                out_file.write(
                    '%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' %
                    (hour, percentage, percentage_non_experts,
                     percentage_experts, percentage_common,
                     (gcount.precision / float(total_num_tweets)) * 100.0,
                     (gcount.fscore / float(total_num_tweets)) * 100.0,
                     (gcount.ci / float(total_num_tweets)) * 100.0))
        log('hour\tpopulation\tnon_experts\texperts\tcommon\tprecision\tfscore\tci'
            )
コード例 #2
0
def get_all_user_groups(delta=4, category=None):
  """Build and return every user grouping used by the experiments.

  Args:
    delta: time-delta threshold (hours) passed to the grouping helpers.
    category: optional news category filter (None = all categories).

  Returns:
    (groups, d_num_followers): a populated UserGroups instance and the
    follower-count dict produced by experts.select_experts_social_bias.

  Note: sampling uses the module-level `random` state, so results vary
  between runs unless the caller seeds it.
  """
  seeds = Util.load_seeds()

  # Set up params appropriately.
  data_set = DataSet.TRAINING
  months = _TRAINING_SET_MONTHS
  if _SWITCHED:
    data_set = DataSet.TESTING
    months = _TESTING_SET_MONTHS
  retweets = set()
  if _EXCLUDE_RETWEETS:
    retweets = ground_truths.find_retweets(months)

  gt_rankings = ground_truths.get_gt_rankings(seeds, data_set, category,
                                              exclude_tweets_within_delta=_EXCLUDE_TWEETS_WITHIN_DELTA,
                                              retweets=retweets)
  target_news = ground_truths.find_target_news(gt_rankings, _SIZE_TOP_NEWS)

  groups = UserGroups()

  (num_users, groups.newsaholics,
   groups.active_users,
   groups.common_users) = basic_groups.group_users(delta, category)
  groups.population = groups.newsaholics.union(groups.active_users).union(groups.common_users)

  num_users_eg, groups.even_groups = even_groups.group_users(delta,
                                                             _NUM_GROUPS,
                                                             _SIZE_OF_GROUP_IN_PERCENT,
                                                             category)

  groups.precision = experts.select_experts_precision(
      groups.newsaholics.union(groups.active_users), num_users, delta,
      _SIZE_EXPERTS, category)
  groups.fscore = experts.select_experts_fscore(len(target_news),
                                                num_users,
                                                delta, _SIZE_EXPERTS,
                                                category)
  groups.ci = experts.select_experts_ci(num_users, delta, _SIZE_EXPERTS,
                                        category)
  groups.super_experts = experts.select_super_experts(groups.precision,
                                                      groups.fscore,
                                                      groups.ci)

  groups.ci_hi, groups.ci_li = experts.split_ci_experts_by_followers(groups.ci)

  # Round-robin the ci experts into three equal-sized subgroups.
  # NOTE(review): set iteration order is arbitrary, so the three subgroups
  # are not reproducible across runs — confirm this is acceptable.
  groups.ci_1 = set()
  groups.ci_2 = set()
  groups.ci_3 = set()
  ci_subgroups = (groups.ci_1, groups.ci_2, groups.ci_3)
  for counter, ci_expert in enumerate(groups.ci):
    ci_subgroups[counter % 3].add(ci_expert)

  groups.social_bias, d_num_followers  = experts.select_experts_social_bias(num_users,
                                                                            _SIZE_EXPERTS)
  groups.all_experts = experts.select_all_experts(groups.precision,
                                                  groups.fscore,
                                                  groups.ci)
  groups.non_experts = groups.population.difference(groups.all_experts)
  # NOTE(review): the names below do not match the fractions used
  # (_25 -> 0.05, _1 -> 0.02) — confirm which is intended before renaming.
  sample_size = int(len(groups.non_experts) * _NON_EXPERTS_SAMPLE_SIZE)
  sample_size_25 = int(len(groups.non_experts) * 0.05)
  sample_size_10 = int(len(groups.non_experts) * 0.10)
  sample_size_1 = int(len(groups.non_experts) * 0.02)
  # random.sample requires a sequence (sets rejected since Python 3.11),
  # so materialize the non-expert pool once.
  non_expert_pool = list(groups.non_experts)
  groups.non_experts_sampled = set(random.sample(non_expert_pool, sample_size))
  groups.non_experts_25 = set(random.sample(non_expert_pool, sample_size_25))
  groups.non_experts_10 = set(random.sample(non_expert_pool, sample_size_10))
  groups.non_experts_1 = set(random.sample(non_expert_pool, sample_size_1))

  return groups, d_num_followers
コード例 #3
0
def run():
  """Compute per-hour tweet-volume breakdowns for each user group.

  For every delta in _DELTAS, partitions users into basic groups, selects
  expert groups (precision / fscore / ci and their union), then scans the
  time-deltas TSV counting tweets on target news per hour. Writes, for each
  hour, the *cumulative* percentage of total tweets contributed by each
  group to hour_thresholds_<delta>.tsv.

  Side effects: creates _DATA_DIR if missing, reads
  ../data/FolkWisdom/time_deltas.tsv, writes one TSV per delta, and
  mutates the class-level attributes of ExpertGroup.
  """
  Util.ensure_dir_exist(_DATA_DIR)
  category = None
  seeds = Util.load_seeds()  # read twitter data

  gt_rankings = ground_truths.get_gt_rankings(seeds, DataSet.TESTING,
                                              category)
  log('Num ground_truth_rankings: %s' % len(gt_rankings))
  target_news = ground_truths.find_target_news(gt_rankings, _SIZE_TOP_NEWS)
  log('Size target_news: %s' % len(target_news))

  for delta in _DELTAS:
    (num_users, newsaholics,
     active_users, common_users) = basic_groups.group_users(delta, category)
    population = newsaholics.union(active_users).union(common_users)
    log('Num newsaholics: %s' % len(newsaholics))
    log('Num active: %s' % len(active_users))
    log('Num common: %s' % len(common_users))
    log('Num users (population): %s' % len(population))

    # -- Get experts --
    ExpertGroup.precision = experts.select_experts_precision(
        newsaholics.union(active_users), num_users, delta, _SIZE_EXPERTS,
        category)
    ExpertGroup.fscore = experts.select_experts_fscore(len(target_news),
                                                       num_users,
                                                       delta, _SIZE_EXPERTS,
                                                       category)
    ExpertGroup.ci = experts.select_experts_ci(num_users, delta,
                                               _SIZE_EXPERTS, category)
    ExpertGroup.union = experts.select_all_experts(ExpertGroup.precision,
                                                   ExpertGroup.fscore,
                                                   ExpertGroup.ci)

    log('Num experts (precision): %s' % len(ExpertGroup.precision))
    log('Num experts (fscore): %s' % len(ExpertGroup.fscore))
    log('Num experts (ci): %s' % len(ExpertGroup.ci))
    log('Num all experts: %s' % len(ExpertGroup.union))

    non_experts = population.difference(ExpertGroup.union)
    log('Num non_experts: %s' % len(non_experts))

    # -- counting --

    total_num_tweets = 0
    hour_to_num_tweets = {}
    with open('../data/FolkWisdom/time_deltas.tsv') as in_file:
      for line in in_file:
        tokens = line.split('\t')
        time_delta_in_sec = int(tokens[_TIMEDELTAS_FILE_DELTA_INDEX])
        url = tokens[_TIMEDELTAS_FILE_URL_INDEX].strip()
        user_id = tokens[_TIMEDELTAS_FILE_USER_ID_INDEX]

        if time_delta_in_sec > 0 and url in target_news:
          # Floor division: bucket by whole hours. Using // keeps
          # Python 2 semantics and stays correct under Python 3.
          current_hour = time_delta_in_sec // _NUM_SEC_PER_HOUR
          total_num_tweets += 1

          if current_hour not in hour_to_num_tweets:
            hour_to_num_tweets[current_hour] = GroupCount()
          gcount = hour_to_num_tweets[current_hour]

          gcount.population += 1
          if user_id in ExpertGroup.union:
            gcount.union += 1
            if user_id in ExpertGroup.precision:
              gcount.precision += 1
            if user_id in ExpertGroup.fscore:
              gcount.fscore += 1
            if user_id in ExpertGroup.ci:
              gcount.ci += 1
          else:
            gcount.non_experts += 1
            if user_id in common_users:
              gcount.common += 1

    # Accumulate counts in ascending hour order so each row is the
    # cumulative share of tweets seen *up to* that hour. (Iterating the
    # dict in arbitrary order would make the cumulative columns wrong.)
    gcount = GroupCount()
    with open(_DATA_DIR + 'hour_thresholds_%s.tsv' % delta, 'w') as out_file:
      for hour in sorted(hour_to_num_tweets.keys()):
        gc = hour_to_num_tweets[hour]
        gcount.add(gc)
        percentage = (gcount.population / float(total_num_tweets)) * 100.0
        percentage_common = (gcount.common / float(total_num_tweets)) * 100.0
        percentage_experts = (gcount.union / float(total_num_tweets)) * 100.0
        percentage_non_experts = (gcount.non_experts / float(total_num_tweets)) * 100.0

        out_file.write('%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' % (hour, percentage,
                                                             percentage_non_experts,
                                                             percentage_experts,
                                                             percentage_common,
                                                             (gcount.precision / float(total_num_tweets)) * 100.0,
                                                             (gcount.fscore / float(total_num_tweets)) * 100.0,
                                                             (gcount.ci / float(total_num_tweets)) * 100.0))
    log('hour\tpopulation\tnon_experts\texperts\tcommon\tprecision\tfscore\tci')