def find_hits_and_misses(months, target_news, seeds, cache, delta,
                          category=None):
  """Finds the hit and miss count for each user.

  Keyword Arguments:
  months -- The months over which to calculate hits and misses.
  target_news -- A set of urls that is the set of known target news.
  seeds -- Dictionary mapping url to (tweet_id, user_id, seed_time).
  cache -- A dictionary of short url to long url.
  delta -- Number of hours after a url's seed time within which tweets count.
  category -- The category to find hits and misses for, None for all news.
  """
  hits_and_misses = {}
  for month in months:
    log('Finding hits and misses for users from %s for delta %s and category %s'
        % (month, delta, category))
    dir_name = Util.get_data_dir_name_for(month)
    for filename in os.listdir(dir_name):
      if '.tweet' in filename and 'http_nyti_ms' in filename:
        data_file = '%s/%s' % (dir_name, filename)
        with open(data_file) as input_file:
          for line in input_file:
            tokens = line.split('\t')
            user_id = tokens[_TWEETFILE_USER_ID_INDEX]
            tweet_text = tokens[_TWEETFILE_TWEET_TEXT_INDEX]
            urls = URLUtil.parse_urls(tweet_text, cache)
            for url in urls:
              _, _, seed_time = seeds[url]
              created = datetime.strptime(tokens[_TWEETFILE_CREATED_AT_INDEX],
                                          _DATETIME_FORMAT)
              time_delta = created - seed_time
              if time_delta < timedelta(hours=delta):
                category_matches = True
                if category:
                  category_matches = False
                  url_category = URLUtil.extract_category(url)
                  if category == url_category:
                    category_matches = True
                if url in target_news and category_matches:
                  if user_id in hits_and_misses:
                    (user_hits, user_misses) = hits_and_misses[user_id]
                    hits_and_misses[user_id] = (user_hits + 1, user_misses)
                  else:
                    hits_and_misses[user_id] = (1, 0)
                elif category_matches:
                  if user_id in hits_and_misses:
                    (user_hits, user_misses) = hits_and_misses[user_id]
                    hits_and_misses[user_id] = (user_hits, user_misses + 1)
                  else:
                    hits_and_misses[user_id] = (0, 1)

  output_file = (_OUT_DIR + 'user_hits_and_misses_%s_%s.tsv'
                 % (delta, category))
  with open(output_file, 'w') as out_file:
    for user_id, (hits, misses) in hits_and_misses.items():
      out_file.write('%s\t%s\t%s\n' % (user_id, hits, misses))
  log('Wrote hits and misses to disk.')
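
# A minimal, self-contained sketch of the (hits, misses) accumulation performed
# above, using made-up ids instead of the real tweet files. The helper name
# _tally_hit_or_miss is hypothetical and exists only for illustration.
def _tally_hit_or_miss(hits_and_misses, user_id, is_hit):
  """Increments the hit or miss count for user_id in place."""
  user_hits, user_misses = hits_and_misses.get(user_id, (0, 0))
  if is_hit:
    hits_and_misses[user_id] = (user_hits + 1, user_misses)
  else:
    hits_and_misses[user_id] = (user_hits, user_misses + 1)

# Example usage with a hypothetical target set:
#   counts = {}
#   _tally_hit_or_miss(counts, '42', True)   # user 42 tweeted a target url
#   _tally_hit_or_miss(counts, '42', False)  # user 42 tweeted a non-target url
#   counts == {'42': (1, 1)}
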
def find_delta_times(months, seeds, cache):
  """Finds the delta times for every url.
  
  Looks at every url, and calculates the time delta from previously calculated
  seed times.
  
  Keyword Arguments:
  months -- The months over which to look at urls.
  seeds -- Dictionary mapping url to (tweet_id, user_id, seed_time).
  cache -- Dictionary mapping short-url to long-url.
  """
  time_deltas = {}
  for month in months:
    log('Finding delta times from %s' % month)
    dir_name = Util.get_data_dir_name_for(month) 
    for filename in os.listdir(dir_name):
      if '.tweet' in filename and 'http_nyti_ms' in filename:
        data_file = '%s/%s' % (dir_name, filename)
        with open(data_file) as input_file:
          for line in input_file:
            tokens = line.split('\t')
            user_id = tokens[_TWEETFILE_USER_ID_INDEX]
            tweet_id = tokens[_TWEETFILE_TWEET_ID_INDEX]
            tweet_text = tokens[_TWEETFILE_TWEET_TEXT_INDEX]
            urls = URLUtil.parse_urls(tweet_text, cache)
            source = tokens[_TWEETFILE_SOURCE_INDEX]
            for url in urls:
              seed_tweet_id, _, seed_time = seeds[url]
              category = URLUtil.extract_category(url)
              if tweet_id == seed_tweet_id:
                time_deltas[tweet_id] = (user_id, 0, url, category, source)
              else:
                created = datetime.strptime(tokens[_TWEETFILE_CREATED_AT_INDEX],
                                            _DATETIME_FORMAT)
                time_delta = created - seed_time
                # Convert time delta to seconds to make it easy to read from
                # file later.
                time_delta_in_seconds = (time_delta.days * 86400
                                         + time_delta.seconds)
                time_deltas[tweet_id] = (user_id, time_delta_in_seconds, url,
                                         category, source)
  sorted_deltas = sorted(time_deltas.items(), key=lambda x: x[1][1])
  # Sanity check: every entry should be a full
  # (user_id, delta, url, category, source) tuple.
  for (tweet_id, tp) in sorted_deltas:
    if len(tp) < 5:
      print tp
  with open('../data/FolkWisdom/time_deltas.tsv', 'w') as output_file:
    for (tweet_id, (user_id, time_delta, url,
                    category, source)) in sorted_deltas:
      output_file.write('%s\t%s\t%s\t%s\t%s\t%s\n' % (tweet_id, user_id,
                                                      time_delta, url, category,
                                                      source))
  log('Wrote time deltas to disk')
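
# A small sketch of the timedelta-to-seconds conversion used above, with
# made-up timestamps. On Python 2.7+ timedelta.total_seconds() would give the
# same value (plus fractional microseconds); the manual form mirrors the code
# above.
def _delta_in_seconds(created, seed_time):
  time_delta = created - seed_time
  return time_delta.days * 86400 + time_delta.seconds

# Example:
#   _delta_in_seconds(datetime(2011, 9, 2, 14, 30), datetime(2011, 9, 1, 12, 0))
#   returns 95400, the value that would be written to time_deltas.tsv.
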
def sort_users_by_tweet_count(months, seeds, cache, delta, category=None):
  """Sorts users by their tweet activity.
  
  Keyword Arguments:
  months -- The months over which to count user activity.
  seeds -- Dictionary mapping url to (tweet_id, user_id, seed_time).
  cache -- Dictionary of short url to long url.
  delta -- Number of hours after a url's seed time within which tweets count.
  category -- The category to go by, None for all news.
  """
  user_id_to_tweet_count = {}
  for month in months:
    log('Gathering count information for users from %s for delta %s '
        'and category %s' % (month, delta, category))
    dir_name = Util.get_data_dir_name_for(month)
    for filename in os.listdir(dir_name):
      if '.tweet' in filename and 'http_nyti_ms' in filename:
        data_file = '%s/%s' % (dir_name, filename)
        with open(data_file) as input_file:
          for line in input_file:
            tokens = line.split('\t')
            user_id = tokens[_TWEETFILE_USER_ID_INDEX]
            tweet_text = tokens[_TWEETFILE_TWEET_TEXT_INDEX]
            urls = URLUtil.parse_urls(tweet_text, cache)
            for url in urls:
              _, _, seed_time = seeds[url]
              created = datetime.strptime(tokens[_TWEETFILE_CREATED_AT_INDEX],
                                          _DATETIME_FORMAT)
              time_delta = created - seed_time
              if time_delta < timedelta(hours=delta):
                if category:
                  url_category = URLUtil.extract_category(url)
                  if url_category == category:
                    if user_id in user_id_to_tweet_count:
                      user_id_to_tweet_count[user_id] += 1
                    else:
                      user_id_to_tweet_count[user_id] = 1
                else:
                  if user_id in user_id_to_tweet_count:
                    user_id_to_tweet_count[user_id] += 1
                  else:
                    user_id_to_tweet_count[user_id] = 1
                
  user_ids_sorted_by_tweet_count = sorted(user_id_to_tweet_count.items(),
                                          key=lambda x: x[1], reverse=True)
  
  log("Size of users for category %s (total): %s"
      % (str(len(user_id_to_tweet_count.keys())), category))

  output_file = _OUT_DIR + 'user_activity_%s_%s.tsv' % (delta, category)
  with open(output_file, 'w') as out_file:
    for user_id, count in user_ids_sorted_by_tweet_count:
      out_file.write('%s\t%s\n' % (user_id, count))
  log('Wrote users (sorted by activity) to disk') 
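
# The manual "if key in dict: += 1 else: = 1" counting above could also be
# written with collections.Counter; a tiny sketch with made-up user ids. This
# is an alternative phrasing, not what the function above actually uses.
from collections import Counter

_example_counts = Counter(['1', '2', '2', '3', '2'])
# _example_counts['2'] == 3, and _example_counts.most_common() returns the
# (user_id, count) pairs in descending count order, matching the
# sorted(..., key=lambda x: x[1], reverse=True) step above.
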
def find_seed_times(months, cache):
  """Finds the time at which each url was seen.
  
  Keyword Arguments:
  months -- The months over which to look at urls.
  cache -- Dictionary mapping short-url to long-url.
  """
  seed_times = {}
  for month in months:
    log('Finding seed times from %s' % month)
    dir_name = Util.get_data_dir_name_for(month) 
    for filename in os.listdir(dir_name):
      if '.tweet' in filename and 'http_nyti_ms' in filename:
        data_file = '%s/%s' % (dir_name, filename)
        with open(data_file) as input_file:
          for line in input_file:
            tokens = line.split('\t')
            user_id = tokens[_TWEETFILE_USER_ID_INDEX]
            tweet_id = tokens[_TWEETFILE_TWEET_ID_INDEX]
            seed_time = datetime.strptime(tokens[_TWEETFILE_CREATED_AT_INDEX],
                                          _DATETIME_FORMAT)
            tweet_text = tokens[_TWEETFILE_TWEET_TEXT_INDEX]
            urls = URLUtil.parse_urls(tweet_text, cache)
            for url in urls:
              if url not in seed_times:
                seed_times[url] = (tweet_id, user_id, seed_time)
              else:
                (_, _, previous_seed_time) = seed_times[url]
                if seed_time < previous_seed_time:
                  seed_times[url] = (tweet_id, user_id, seed_time) 
  with open('../data/FolkWisdom/seed_times.tsv', 'w') as output_file:
    for url, (tweet_id, user_id, seed_time) in seed_times.items():
      output_file.write('%s\t%s\t%s\t%s\n' %(tweet_id, user_id, seed_time, url))
  log('Wrote seed times to disk')
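
# A self-contained sketch of the "keep the earliest sighting per url" update
# above, with hypothetical tweets in place of the on-disk tweet files. The
# helper name _update_seed is made up for illustration.
def _update_seed(seed_times, url, tweet_id, user_id, seen_at):
  if url not in seed_times:
    seed_times[url] = (tweet_id, user_id, seen_at)
  else:
    _, _, previous_seed_time = seed_times[url]
    if seen_at < previous_seed_time:
      seed_times[url] = (tweet_id, user_id, seen_at)

# Example:
#   seeds = {}
#   _update_seed(seeds, 'http://nyti.ms/abc', 't2', 'u2', datetime(2011, 9, 2))
#   _update_seed(seeds, 'http://nyti.ms/abc', 't1', 'u1', datetime(2011, 9, 1))
#   seeds['http://nyti.ms/abc'] == ('t1', 'u1', datetime(2011, 9, 1))
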
def get_gt_rankings(seeds, dataset, category=None, delta=4,
                    exclude_tweets_within_delta=False, retweets=set()):
  """Generate the ground truth rankings.
  
  Keyword Arguments:
  seeds -- Dictionary mapping url to (tweet_id, user_id, seed_time).
  dataset -- DataSet.TRAINING or DataSet.TESTING to restrict to that window;
             any other value uses the full window.
  category -- The category to get gt's for, None for all news.
  delta -- Delta in hours used by the exclude_tweets_within_delta filter.
  exclude_tweets_within_delta -- If True, only count tweets posted within delta
                                 hours of the url's seed time.
  retweets -- Set of tweet ids to skip as known retweets.

  Returns:
  gt_rankings -- A list of (url, count) pairs in ranked order.
  """
  gt_tweet_counts = {}
  with open('../data/FolkWisdom/time_deltas.tsv') as input_file:
    for line in input_file:
      tokens = line.split('\t')
      source = tokens[_TIMEDELTAS_FILE_SOURCE_INDEX].strip()
      url = tokens[_TIMEDELTAS_FILE_URL_INDEX]
      tweet_id = tokens[_TIMEDELTAS_FILE_TWEET_ID_INDEX]
      tweet_delta = float(tokens[_TIMEDELTAS_FILE_DELTA_INDEX]) / 3600
      include_tweet = True
      if exclude_tweets_within_delta:
        if tweet_delta > delta:
          include_tweet = False
      if url in seeds and include_tweet and tweet_id not in retweets:
        _, _, seed_time = seeds[url]
        is_in_window = False
        if dataset == DataSet.TRAINING:
          is_in_window = Util.is_in_training_set(seed_time)
        elif dataset == DataSet.TESTING:
          is_in_window = Util.is_in_testing_set(seed_time)
        else:
          is_in_window = True
        if is_in_window:
          category_matches = True
          if category:
            category_matches = False
            url_category = URLUtil.extract_category(url)
            if url_category == category:
              category_matches = True
          if category_matches:
            if url in gt_tweet_counts:
              gt_tweet_counts[url] += 1
            else:
              gt_tweet_counts[url] = 1

  gt_rankings = sorted(gt_tweet_counts.items(), key=lambda x: x[1],
                       reverse=True)
  return gt_rankings
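
# Minimal sketch of the ranking step above: count tweets per url, then sort the
# (url, count) pairs by count, descending. The urls are made up and the window,
# delta, category, and retweet filtering is omitted.
def _rank_by_count(urls):
  counts = {}
  for url in urls:
    counts[url] = counts.get(url, 0) + 1
  return sorted(counts.items(), key=lambda x: x[1], reverse=True)

# _rank_by_count(['a', 'b', 'a', 'a', 'c', 'b']) returns
# [('a', 3), ('b', 2), ('c', 1)], the same shape as gt_rankings.
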
def find_device_counts(max_delta, deltas, top_news, cache):
  """Finds the number of tweets by each source device.

  * To achieve no filtering by delta, pass in sys.maxint.

  Returns:
  Dictionary of source device string to pair of (count, percentage).
  e.g. {'Twitter for iPhone': (1100, 10.0) ...} for all, top, origin,
  and retweets.
  """
  device_counts = {}
  all_count = 0
  device_counts_top = {}
  top_count = 0
  device_counts_original = {}
  original_count = 0
  device_counts_retweets = {}
  retweet_count = 0
  for month in _WINDOW_MONTHS:
    log('Finding device counts for month %s and delta %s.' % (month, max_delta))
    dir_name = Util.get_data_dir_name_for(month) 
    for filename in os.listdir(dir_name):
      if '.tweet' in filename and 'http_nyti_ms' in filename:
        data_file = '%s/%s' % (dir_name, filename)
        with open(data_file) as input_file:
          for line in input_file:
            tokens = line.split('\t')
            created = datetime.strptime(tokens[_TWEETFILE_CREATED_AT_INDEX],
                                        _DATETIME_FORMAT)
            if Util.is_in_window(created):
              tweet_id = tokens[_TWEETFILE_TWEET_ID_INDEX]
              source_device = tokens[_TWEETFILE_SOURCE_INDEX]
              retweet = bool(int(tokens[_TWEETFILE_RETWEET_COUNT_INDEX]))

              # If the url is in the top news, increment the count. Note
              # we do not limit this by delta.
              tweet_text = tokens[_TWEETFILE_TWEET_TEXT_INDEX]
              urls = URLUtil.parse_urls(tweet_text, cache)
              for url in urls:
                if url in top_news:
                  top_count += 1
                  if source_device in device_counts_top:
                    device_counts_top[source_device] += 1
                  else:
                    device_counts_top[source_device] = 1

              # If we don't see the tweet_id in the timedeltas file, we weren't
              # able to parse a url from the tweet text, so let's ignore it by
              # setting default delta to sys.maxint
              delta = sys.maxint
              if tweet_id in deltas:
                delta = deltas[tweet_id]
              if delta < max_delta: 
                all_count += 1
                if source_device in device_counts:
                  device_counts[source_device] += 1
                else:
                  device_counts[source_device] = 1
                if retweet:
                  retweet_count += 1
                  if source_device in device_counts_retweets:
                    device_counts_retweets[source_device] += 1
                  else:
                    device_counts_retweets[source_device] = 1
                else:
                  original_count += 1
                  if source_device in device_counts_original:
                    device_counts_original[source_device] += 1
                  else:
                    device_counts_original[source_device] = 1

  for device, count in device_counts_original.items():
    device_total = device_counts[device]
    device_counts_original[device] = (count,
                                      (float(count) / original_count) * 100,
                                      (float(count) / device_total) * 100)
  for device, count in device_counts_retweets.items():
    device_total = device_counts[device]
    device_counts_retweets[device] = (count,
                                      (float(count) / retweet_count) * 100,
                                      (float(count) / device_total) * 100)
  for device, count in device_counts.items():
    device_counts[device] = (count, (float(count) / all_count) * 100)
  for device, count in device_counts_top.items():
    device_counts_top[device] = (count, (float(count) / top_count) * 100)

  return (device_counts, device_counts_original, device_counts_retweets,
          device_counts_top)
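
# Sketch of the percentage computation above with made-up numbers: each
# device's original (or retweet) count is expressed both as a share of all
# original (or retweet) tweets and as a share of that device's own total.
_example_device_total = 200      # all counted tweets from one device
_example_device_originals = 150  # original (non-retweet) tweets from that device
_example_all_originals = 1000    # original tweets across every device

_example_share_of_originals = (float(_example_device_originals)
                               / _example_all_originals) * 100  # 15.0
_example_share_of_device = (float(_example_device_originals)
                            / _example_device_total) * 100  # 75.0
# The tuple stored in device_counts_original above would then be
# (150, 15.0, 75.0).
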