def find_hits_and_mises(months, target_news, seeds, cache, delta,
                        category=None):
  """Finds the hit and miss count for each user.

  Keyword Arguments:
  months -- The months over which to calculate hits and misses.
  target_news -- A set of urls that is the set of known target news.
  seeds -- Dictionary mapping url to (tweet_id, user_id, seed_time).
  cache -- A dictionary of short url to long url.
  delta -- The time window, in hours, measured from each url's seed time.
  category -- The category to find hits and misses for, None for all news.
  """
  hits_and_misses = {}
  for month in months:
    log('Finding hits and misses for users from %s for delta %s '
        'and category %s' % (month, delta, category))
    dir_name = Util.get_data_dir_name_for(month)
    for filename in os.listdir(dir_name):
      if '.tweet' in filename and 'http_nyti_ms' in filename:
        data_file = '%s/%s' % (dir_name, filename)
        with open(data_file) as input_file:
          for line in input_file:
            tokens = line.split('\t')
            user_id = tokens[_TWEETFILE_USER_ID_INDEX]
            tweet_text = tokens[_TWEETFILE_TWEET_TEXT_INDEX]
            urls = URLUtil.parse_urls(tweet_text, cache)
            for url in urls:
              _, _, seed_time = seeds[url]
              created = datetime.strptime(tokens[_TWEETFILE_CREATED_AT_INDEX],
                                          _DATETIME_FORMAT)
              time_delta = created - seed_time
              if time_delta < timedelta(hours=delta):
                category_matches = True
                if category:
                  category_matches = False
                  url_category = URLUtil.extract_category(url)
                  if category == url_category:
                    category_matches = True
                if url in target_news and category_matches:
                  if user_id in hits_and_misses:
                    (user_hits, user_misses) = hits_and_misses[user_id]
                    hits_and_misses[user_id] = (user_hits + 1, user_misses)
                  else:
                    hits_and_misses[user_id] = (1, 0)
                elif category_matches:
                  if user_id in hits_and_misses:
                    (user_hits, user_misses) = hits_and_misses[user_id]
                    hits_and_misses[user_id] = (user_hits, user_misses + 1)
                  else:
                    hits_and_misses[user_id] = (0, 1)
  output_file = (_OUT_DIR
                 + 'user_hits_and_misses_%s_%s.tsv' % (delta, category))
  with open(output_file, 'w') as out_file:
    for user_id, (hits, misses) in hits_and_misses.items():
      out_file.write('%s\t%s\t%s\n' % (user_id, hits, misses))
  log('Wrote hits and misses to disk.')
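
# A minimal sketch (not part of the original pipeline) of how the
# user_hits_and_misses file written above could be read back and turned into
# per-user hit rates. The helper name load_hit_rates is an assumption made for
# illustration; it reuses the module-level _OUT_DIR constant.
def load_hit_rates(delta, category=None):
  """Loads hits/misses written by find_hits_and_mises and returns hit rates."""
  hit_rates = {}
  file_name = _OUT_DIR + 'user_hits_and_misses_%s_%s.tsv' % (delta, category)
  with open(file_name) as input_file:
    for line in input_file:
      user_id, hits, misses = line.strip().split('\t')
      total = int(hits) + int(misses)
      if total > 0:
        hit_rates[user_id] = float(hits) / total
  return hit_rates
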
def find_delta_times(months, seeds, cache):
  """Finds the delta times for every url.

  Looks at every url and calculates the time delta from the previously
  calculated seed times.

  Keyword Arguments:
  months -- The months over which to look at urls.
  seeds -- Dictionary mapping url to (tweet_id, user_id, seed_time).
  cache -- Dictionary mapping short-url to long-url.
  """
  time_deltas = {}
  for month in months:
    log('Finding delta times from %s' % month)
    dir_name = Util.get_data_dir_name_for(month)
    for filename in os.listdir(dir_name):
      if '.tweet' in filename and 'http_nyti_ms' in filename:
        data_file = '%s/%s' % (dir_name, filename)
        with open(data_file) as input_file:
          for line in input_file:
            tokens = line.split('\t')
            user_id = tokens[_TWEETFILE_USER_ID_INDEX]
            tweet_id = tokens[_TWEETFILE_TWEET_ID_INDEX]
            tweet_text = tokens[_TWEETFILE_TWEET_TEXT_INDEX]
            urls = URLUtil.parse_urls(tweet_text, cache)
            source = tokens[_TWEETFILE_SOURCE_INDEX]
            for url in urls:
              seed_tweet_id, _, seed_time = seeds[url]
              category = URLUtil.extract_category(url)
              if tweet_id == seed_tweet_id:
                time_deltas[tweet_id] = (user_id, 0, url, category, source)
              else:
                created = datetime.strptime(
                    tokens[_TWEETFILE_CREATED_AT_INDEX], _DATETIME_FORMAT)
                time_delta = created - seed_time
                # Convert time delta to seconds to make it easy to read from
                # file later.
                time_delta_in_seconds = (time_delta.days * 86400
                                         + time_delta.seconds)
                time_deltas[tweet_id] = (user_id, time_delta_in_seconds, url,
                                         category, source)
  sorted_deltas = sorted(time_deltas.items(), key=lambda x: x[1][1])
  # Sanity check: every entry should have all five fields.
  for (tweet_id, tp) in sorted_deltas:
    if len(tp) < 5:
      print tp
  with open('../data/FolkWisdom/time_deltas.tsv', 'w') as output_file:
    for (tweet_id, (user_id, time_delta, url, category,
                    source)) in sorted_deltas:
      output_file.write('%s\t%s\t%s\t%s\t%s\t%s\n'
                        % (tweet_id, user_id, time_delta, url, category,
                           source))
  log('Wrote time deltas to disk')
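
# A minimal sketch (not part of the original code) of loading time_deltas.tsv
# back into the tweet_id -> delta dictionary that find_device_counts expects.
# The helper name load_time_deltas, and the assumption that deltas should be
# expressed in hours (mirroring the / 3600 conversion in get_gt_rankings
# below), are illustrative only; the _TIMEDELTAS_FILE_* index constants are
# the ones used by get_gt_rankings.
def load_time_deltas():
  deltas = {}
  with open('../data/FolkWisdom/time_deltas.tsv') as input_file:
    for line in input_file:
      tokens = line.split('\t')
      tweet_id = tokens[_TIMEDELTAS_FILE_TWEET_ID_INDEX]
      # Deltas are stored in seconds; convert to hours.
      deltas[tweet_id] = float(tokens[_TIMEDELTAS_FILE_DELTA_INDEX]) / 3600
  return deltas
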
def sort_users_by_tweet_count(months, seeds, cache, delta, category=None):
  """Sorts users by their tweet activity.

  Keyword Arguments:
  months -- The months over which to count user activity.
  seeds -- Dictionary mapping url to (tweet_id, user_id, seed_time).
  cache -- Dictionary of short url to long url.
  delta -- The time window, in hours, measured from each url's seed time.
  category -- The category to go by, None for all news.
  """
  user_id_to_tweet_count = {}
  for month in months:
    log('Gathering count information for users from %s for delta %s '
        'and category %s' % (month, delta, category))
    dir_name = Util.get_data_dir_name_for(month)
    for filename in os.listdir(dir_name):
      if '.tweet' in filename and 'http_nyti_ms' in filename:
        data_file = '%s/%s' % (dir_name, filename)
        with open(data_file) as input_file:
          for line in input_file:
            tokens = line.split('\t')
            user_id = tokens[_TWEETFILE_USER_ID_INDEX]
            tweet_text = tokens[_TWEETFILE_TWEET_TEXT_INDEX]
            urls = URLUtil.parse_urls(tweet_text, cache)
            for url in urls:
              _, _, seed_time = seeds[url]
              created = datetime.strptime(tokens[_TWEETFILE_CREATED_AT_INDEX],
                                          _DATETIME_FORMAT)
              time_delta = created - seed_time
              if time_delta < timedelta(hours=delta):
                category_matches = True
                if category:
                  url_category = URLUtil.extract_category(url)
                  category_matches = (url_category == category)
                if category_matches:
                  if user_id in user_id_to_tweet_count:
                    user_id_to_tweet_count[user_id] += 1
                  else:
                    user_id_to_tweet_count[user_id] = 1
  user_ids_sorted_by_tweet_count = sorted(user_id_to_tweet_count.items(),
                                          key=lambda x: x[1], reverse=True)
  log('Size of users for category %s (total): %s'
      % (category, len(user_id_to_tweet_count)))
  output_file = _OUT_DIR + 'user_activity_%s_%s.tsv' % (delta, category)
  with open(output_file, 'w') as out_file:
    for user_id, count in user_ids_sorted_by_tweet_count:
      out_file.write('%s\t%s\n' % (user_id, count))
  log('Wrote users (sorted by activity) to disk')
def find_seed_times(months, cache):
  """Finds the time at which each url was first seen.

  Keyword Arguments:
  months -- The months over which to look at urls.
  cache -- Dictionary mapping short-url to long-url.
  """
  seed_times = {}
  for month in months:
    log('Finding seed times from %s' % month)
    dir_name = Util.get_data_dir_name_for(month)
    for filename in os.listdir(dir_name):
      if '.tweet' in filename and 'http_nyti_ms' in filename:
        data_file = '%s/%s' % (dir_name, filename)
        with open(data_file) as input_file:
          for line in input_file:
            tokens = line.split('\t')
            user_id = tokens[_TWEETFILE_USER_ID_INDEX]
            tweet_id = tokens[_TWEETFILE_TWEET_ID_INDEX]
            seed_time = datetime.strptime(tokens[_TWEETFILE_CREATED_AT_INDEX],
                                          _DATETIME_FORMAT)
            tweet_text = tokens[_TWEETFILE_TWEET_TEXT_INDEX]
            urls = URLUtil.parse_urls(tweet_text, cache)
            for url in urls:
              if url not in seed_times:
                seed_times[url] = (tweet_id, user_id, seed_time)
              else:
                (_, _, previous_seed_time) = seed_times[url]
                if seed_time < previous_seed_time:
                  seed_times[url] = (tweet_id, user_id, seed_time)
  with open('../data/FolkWisdom/seed_times.tsv', 'w') as output_file:
    for url, (tweet_id, user_id, seed_time) in seed_times.items():
      output_file.write('%s\t%s\t%s\t%s\n'
                        % (tweet_id, user_id, seed_time, url))
  log('Wrote seed times to disk')
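
# A minimal sketch (not part of the original code) of loading seed_times.tsv
# back into the url -> (tweet_id, user_id, seed_time) dictionary that the
# other functions in this module take as their 'seeds' argument. The helper
# name load_seeds and the '%Y-%m-%d %H:%M:%S' parse format (the default str()
# rendering of the datetime written above) are assumptions for illustration.
def load_seeds():
  seeds = {}
  with open('../data/FolkWisdom/seed_times.tsv') as input_file:
    for line in input_file:
      tweet_id, user_id, seed_time_str, url = line.strip().split('\t')
      seed_time = datetime.strptime(seed_time_str, '%Y-%m-%d %H:%M:%S')
      seeds[url] = (tweet_id, user_id, seed_time)
  return seeds
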
def get_gt_rankings(seeds, dataset, category=None, delta=4,
                    exclude_tweets_within_delta=False, retweets=set()):
  """Generates the ground truth rankings.

  Keyword Arguments:
  seeds -- Dictionary mapping url to (tweet_id, user_id, seed_time).
  dataset -- The DataSet window (TRAINING or TESTING) to restrict seed times
             to; any other value uses all seed times.
  category -- The category to get gt's for, None for all news.
  delta -- The time window, in hours, measured from each url's seed time.
  exclude_tweets_within_delta -- If True, drop tweets whose delta exceeds
                                 delta hours (i.e. keep only tweets within
                                 the delta window).
  retweets -- A set of tweet ids to exclude as retweets.

  Returns:
  gt_rankings -- A list of (url, count) pairs in ranked order.
  """
  gt_tweet_counts = {}
  with open('../data/FolkWisdom/time_deltas.tsv') as input_file:
    for line in input_file:
      tokens = line.split('\t')
      source = tokens[_TIMEDELTAS_FILE_SOURCE_INDEX].strip()
      url = tokens[_TIMEDELTAS_FILE_URL_INDEX]
      tweet_id = tokens[_TIMEDELTAS_FILE_TWEET_ID_INDEX]
      tweet_delta = float(tokens[_TIMEDELTAS_FILE_DELTA_INDEX]) / 3600
      include_tweet = True
      if exclude_tweets_within_delta:
        if tweet_delta > delta:
          include_tweet = False
      if url in seeds and include_tweet and tweet_id not in retweets:
        _, _, seed_time = seeds[url]
        is_in_window = False
        if dataset == DataSet.TRAINING:
          is_in_window = Util.is_in_training_set(seed_time)
        elif dataset == DataSet.TESTING:
          is_in_window = Util.is_in_testing_set(seed_time)
        else:
          is_in_window = True
        if is_in_window:
          category_matches = True
          if category:
            category_matches = False
            url_category = URLUtil.extract_category(url)
            if url_category == category:
              category_matches = True
          if category_matches:
            if url in gt_tweet_counts:
              gt_tweet_counts[url] += 1
            else:
              gt_tweet_counts[url] = 1
  gt_rankings = sorted(gt_tweet_counts.items(), key=lambda x: x[1],
                       reverse=True)
  return gt_rankings
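
# Example usage (a sketch): ranking urls seeded in the testing window for a
# single category and keeping the top 100 as the target news set. The
# category value 'world', the cutoff of 100, and the load_seeds helper
# sketched above are illustrative assumptions, not part of the original code.
#
#   seeds = load_seeds()
#   gt_rankings = get_gt_rankings(seeds, DataSet.TESTING, category='world')
#   top_news = set(url for url, _ in gt_rankings[:100])
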
def find_device_counts(max_delta, deltas, top_news, cache):
  """Finds the number of tweets sent from each source device.

  * To achieve no filtering by delta, pass in sys.maxint.

  Returns:
  A 4-tuple of dictionaries (all, original, retweets, top news), each keyed
  by source device string. The 'all' and 'top news' dictionaries map to
  (count, percentage of group), e.g. {'Twitter for iPhone': (1100, 10.0) ...};
  the 'original' and 'retweets' dictionaries map to (count, percentage of
  group, percentage of that device's total).
  """
  device_counts = {}
  all_count = 0
  device_counts_top = {}
  top_count = 0
  device_counts_original = {}
  original_count = 0
  device_counts_retweets = {}
  retweet_count = 0
  for month in _WINDOW_MONTHS:
    log('Finding device counts for month %s and delta %s.'
        % (month, max_delta))
    dir_name = Util.get_data_dir_name_for(month)
    for filename in os.listdir(dir_name):
      if '.tweet' in filename and 'http_nyti_ms' in filename:
        data_file = '%s/%s' % (dir_name, filename)
        with open(data_file) as input_file:
          for line in input_file:
            tokens = line.split('\t')
            created = datetime.strptime(tokens[_TWEETFILE_CREATED_AT_INDEX],
                                        _DATETIME_FORMAT)
            if Util.is_in_window(created):
              tweet_id = tokens[_TWEETFILE_TWEET_ID_INDEX]
              source_device = tokens[_TWEETFILE_SOURCE_INDEX]
              retweet = bool(int(tokens[_TWEETFILE_RETWEET_COUNT_INDEX]))
              # If the url is in the top news, increment the count. Note
              # that we do not limit this by delta.
              tweet_text = tokens[_TWEETFILE_TWEET_TEXT_INDEX]
              urls = URLUtil.parse_urls(tweet_text, cache)
              for url in urls:
                if url in top_news:
                  top_count += 1
                  if source_device in device_counts_top:
                    device_counts_top[source_device] += 1
                  else:
                    device_counts_top[source_device] = 1
              # If we don't see the tweet_id in the time deltas file, we
              # weren't able to parse a url from the tweet text, so let's
              # ignore it by setting the default delta to sys.maxint.
              delta = sys.maxint
              if tweet_id in deltas:
                delta = deltas[tweet_id]
              if delta < max_delta:
                all_count += 1
                if source_device in device_counts:
                  device_counts[source_device] += 1
                else:
                  device_counts[source_device] = 1
                if retweet:
                  retweet_count += 1
                  if source_device in device_counts_retweets:
                    device_counts_retweets[source_device] += 1
                  else:
                    device_counts_retweets[source_device] = 1
                else:
                  original_count += 1
                  if source_device in device_counts_original:
                    device_counts_original[source_device] += 1
                  else:
                    device_counts_original[source_device] = 1
  for device, count in device_counts_original.items():
    device_total = device_counts[device]
    device_counts_original[device] = (count,
                                      (float(count) / original_count) * 100,
                                      (float(count) / device_total) * 100)
  for device, count in device_counts_retweets.items():
    device_total = device_counts[device]
    device_counts_retweets[device] = (count,
                                      (float(count) / retweet_count) * 100,
                                      (float(count) / device_total) * 100)
  for device, count in device_counts.items():
    device_counts[device] = (count, (float(count) / all_count) * 100)
  for device, count in device_counts_top.items():
    device_counts_top[device] = (count, (float(count) / top_count) * 100)
  return (device_counts, device_counts_original, device_counts_retweets,
          device_counts_top)