def file_filter():
    """ This function removes null bytes and duplicate tweets. """
    str_file_with_screenames_to_filter = 'mídia.csv'
    set_media_usernames = set(
        load_filter_list(str_file_with_screenames_to_filter))
    filter_tweets_without_RT('tweets_FIXED.csv')
    if not set_media_usernames:
        return
    filter_dataset('tweets_FIXED.csv', set_media_usernames)
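
# Pipeline overview (inferred from the calls below): file_fix('tweets.csv')
# repairs the raw export, file_filter() removes duplicates and, presumably,
# tweets from the screen names listed in 'mídia.csv', and main() then reads
# the filtered 'tweets_FIXED_NO_DUPLICATES.csv' and writes the analysis CSVs.
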
def main(input_file='tweets_FIXED_NO_DUPLICATES.csv'):
    """ The input file defaults to 'tweets_FIXED_NO_DUPLICATES.csv' because it
    is the output of the file_fix()/file_filter() cleanup steps called below. """
    file_fix('tweets.csv')
    file_filter()
    dict_users_relations = load_user_relations('user_relations.csv')
    user_relations_file_found = bool(dict_users_relations)
    set_cluster_usernames = set(load_filter_list('cluster_usernames.csv'))
    try:
        str_target_hashtag = load_filter_list('specific_hashtags.csv')[0]
    except IndexError:
        str_target_hashtag = None
    terminal_options = options_parser(sys.argv)

    # Dictionary of URLs where each entry contains a set of distinct
    # usernames that tweeted this URL.
    # Entry example: 'http://www.google.com' => ['Mary', 'John', 'Ronaldo']
    dict_set_urls = {}
    # Dictionary of hashtags where each entry contains a set of distinct
    # usernames that commented on this hashtag.
    # Entry example: 'chocolate' => ['johnDoe85', 'barack0', '_b0btables', ...]
    dict_set_hashtags = {}
    dict_set_hashtags_without_accents = {}
    # Dictionary of mentions where each entry contains a set of distinct
    # usernames that mentioned a profile.
    # Entry example: 'uFulano2128_' => ['johnDoe85', 'barack0', '_b0btables', ...]
    dict_set_mentions = {}
    # Dictionary of users where each entry contains their last recorded geo-coordinates.
    # Entry example: 'random_Person' => (latitude, longitude)
    dict_tuple_users_positions = {}
    # Dictionary of distinct usernames by date.
    # Entry example: '04/05/2013' => ['ronaLDO', 'Rivaldo', 'RobertoCarlos_']
    dict_int_users_by_date = {}
    # Dictionary of words where each entry contains the number of times
    # they were mentioned.
    # Entry example: 'chocolate' => 9001
    dict_int_words = defaultdict(int)
    # Dictionary with the number of tweets on a given date.
    # Entry example: '02/08/2013' => 1234
    dates = defaultdict(int)
    # Dictionary with the number of distinct users that tweeted a hashtag.
    # Entry example: 'beliebers' => 12
    dict_int_hashtags = defaultdict(int)
    dict_int_hashtags_without_accents = defaultdict(int)
    # Dictionary with the number of distinct users that mentioned a profile.
    # Entry example: '0bama' => 789
    dict_int_mentions = defaultdict(int)
    # Dictionary with the number of tweets by a user.
    # Entry example: 'ronald0' => 11
    dict_int_users_activity = defaultdict(int)
    # Dictionary with the influence ratio of each user
    # (mentions received / tweets posted).
    # Entry example: 'ronald0' => 11
    dict_int_user_influence = {}
    # Dictionary with the number of times each tweet text appears.
    # Entry example: 'a nice tweet example #creativity' => 11
    tweets_count = defaultdict(int)
    # List of hashtag relation tuples.
    # Entry example: (#salt, #pepper)
    list_tuple_hashtags_relations = []
    list_tuple_hashtags_relations_without_accents = []
    # Counter for the number of incorrect timestamps in the dataset.
    int_incorrect_timestamps = 0
    # Counter for the number of corrupted lines.
    int_corrupted_lines = 0
    # The "Words timeline" feature is neither finished nor documented.
    timestamp_list = []
    words_per_time = {}
    number_of_topwords = terminal_options['number_of_words']
    # Set of tweets that have links.
    set_tup_str_tweets_with_links = set()
    # Set of tweets with only the specified hashtag.
    set_tup_tweets_specific_hashtag = set()
    # Set of tweets that don't have hashtags.
    set_tup_str_tweets_without_hashtags = set()

    with open(input_file, 'rt', encoding="utf8") as csvfile:
        try:
            csv_in = csv.reader(csvfile,
                                delimiter=DEFAULT_INPUT_DELIMITER,
                                quoting=csv.QUOTE_NONE)
            # Skip the line with the column titles (kept for the output files).
            lis_column_titles = next(csv_in)
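            # Column layout assumed by the parsing loop below (only the indices
            # that are actually used are listed; the input is a 13-column
            # YourTwapperKeeper export):
            #   line[0]  -> tweet text
            #   line[2]  -> screen name of the author
            #   line[8]  -> geolocation type ('Point' when coordinates exist)
            #   line[9]  -> latitude, line[10] -> longitude
            #   line[12] -> unix timestamp of the tweet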
            try:
                for line in csv_in:
                    if len(line) == 13:
                        str_username = line[2].lower()
                        if (not set_cluster_usernames) or (
                                str_username in set_cluster_usernames):
                            tweet_text = line[0]
                            if (not str_target_hashtag) or (
                                    is_the_only_hashtag_in_text(
                                        str_target_hashtag, tweet_text)):
                                # Saving the tweet if it has a link.
                                if has_links(line[0]):
                                    set_tup_str_tweets_with_links.add(
                                        tuple(line))
                                if not contains_hashtags(line[0]):
                                    set_tup_str_tweets_without_hashtags.add(
                                        tuple(line))
                                # Saving the tweet if only the specified
                                # hashtag is being tracked.
                                if str_target_hashtag:
                                    set_tup_tweets_specific_hashtag.add(
                                        tuple(line))
                                tweets_count[tweet_text] += 1
                                dict_int_users_activity[str_username] += 1
                                try:
                                    # Sometimes this data is corrupted by
                                    # YourTwapperKeeper, which is why this
                                    # clause is in a "try" block.
                                    timestamp = line[12]
                                    # Append the relations between the hashtags
                                    # found in the tweet to a list.
                                    list_tuple_hashtags_relations += (
                                        process_hashtags_relations(tweet_text))
                                    list_tuple_hashtags_relations_without_accents += (
                                        process_hashtags_relations_without_accents(
                                            tweet_text))
                                    if timestamp:
                                        # Date string in the format DD/MM/YYYY.
                                        str_date = datetime.datetime.fromtimestamp(
                                            int(timestamp)).strftime('%d/%m/%Y')
                                        count_users_by_date(
                                            dict_int_users_by_date, str_date,
                                            str_username)
                                        dates[str_date] += 1
                                        timestamp = datetime.datetime.fromtimestamp(
                                            int(timestamp))
                                        timestamp_list.append(timestamp)
                                except ValueError:
                                    timestamp = ''
                                    int_incorrect_timestamps += 1
                                # Lines where column 8 is 'Point' have
                                # geographical data in columns 9 (latitude)
                                # and 10 (longitude). Sometimes this data is
                                # corrupted by YourTwapperKeeper as well.
                                if line[8] == 'Point':
                                    dict_tuple_users_positions[str_username] = (
                                        line[9], line[10])
                                read_tweet_text(
                                    tweet_text, str_username, dict_int_words,
                                    dict_set_urls, dict_set_hashtags,
                                    dict_set_hashtags_without_accents,
                                    dict_set_mentions, words_per_time,
                                    timestamp)
                    else:
                        int_corrupted_lines += 1
            except (UnicodeDecodeError, IndexError):
                print(line)
                error_parsing(csv_in.line_num)
        except (IOError, StopIteration):
            print("Error opening some necessary files.")
            print("Make sure you have a 'tweets.csv' file in this folder.")
            print("Please ensure that you are not running the script as root.")
    int_total_line_num = csv_in.line_num

    dict_int_hashtags = dict_of_int_from_dict_of_lists(dict_set_hashtags)
    dict_int_hashtags_without_accents = dict_of_int_from_dict_of_lists(
        dict_set_hashtags_without_accents)
    dict_int_mentions = dict_of_int_from_dict_of_lists(dict_set_mentions)

    # Writing the CSVs of everything that was calculated.
    locations_to_csv(dict_tuple_users_positions)
    hashtags_relations_to_csv(list_tuple_hashtags_relations)
    hashtags_relations_to_csv(list_tuple_hashtags_relations_without_accents,
                              'hashtags_network_without_accents.csv')
    top_something_to_csv(dict_set_urls, 'top_urls.csv',
                         ['url', 'distinct_users'],
                         reverse=True,
                         sort_key_function=lambda t: t[1],
                         value_format_function=lambda t: len(t))
    top_something_to_csv(dict_int_users_by_date, 'users_by_date.csv',
                         ['date', 'distinct_users'],
                         reverse=False,
                         sort_key_function=lambda t: (t[0:2], t[3:5], t[6:8]),
                         value_format_function=lambda t: len(t))
    top_something_to_csv(dates, 'dates.csv', ['date', 'number_of_tweets'],
                         reverse=False,
                         sort_key_function=lambda t: datetime.date(
                             int(t[0][6:]), int(t[0][3:5]), int(t[0][:2])))
    top_something_to_csv(dict_int_hashtags, 'hashtags.csv',
                         ['hashtag', 'distinct_users_commenting'],
                         reverse=True,
                         sort_key_function=lambda t: t[1],
                         value_format_function=lambda t: t)
    top_something_to_csv(dict_int_hashtags_without_accents,
                         'hashtags_without_accents.csv',
                         ['hashtag', 'distinct_users_commenting'],
                         reverse=True,
                         sort_key_function=lambda t: t[1],
                         value_format_function=lambda t: t)
    if user_relations_file_found:
        top_something_to_csv_with_relations(
            'mentions.csv', dict_int_mentions, dict_users_relations, [
                'username', 'distinct_users_mentioning', 'followers',
                'friends_count'
            ])
        top_something_to_csv_with_relations(
            'users_activity.csv', dict_int_users_activity,
            dict_users_relations,
            ['username', 'total_tweets', 'followers', 'friends_count'])
    else:
        top_something_to_csv(dict_int_mentions, 'mentions.csv',
                             ['mentions', 'distinct_users_mentioning'],
                             reverse=True,
                             sort_key_function=lambda t: t[1],
                             value_format_function=lambda t: t)
        top_something_to_csv(dict_int_users_activity, 'users_activity.csv',
                             ['user', 'total_tweets'],
                             reverse=True,
                             sort_key_function=lambda t: t[1],
                             value_format_function=lambda t: t)
    top_something_to_csv(tweets_count, 'top_tweets.csv',
                         ['tweet', 'times_tweeted'],
                         reverse=True,
                         sort_key_function=lambda t: t[1],
                         value_format_function=lambda t: t)
    top_something_to_csv(dict_int_words, 'top_words.csv',
                         ['word', 'times_mentioned'],
                         reverse=True,
                         sort_key_function=lambda t: t[1],
                         value_format_function=lambda t: t)

    # Calculating the user influence metric
    # (mentions of the user / number of tweets by this user).
    # Example: a user mentioned 30 times who posted 10 tweets gets influence 3.0.
    for username, num_of_tweets in dict_int_users_activity.items():
        try:
            dict_int_user_influence[username] = (
                dict_int_mentions[username] / num_of_tweets)
        except ZeroDivisionError:
            pass

    # Writing the user influence CSV.
    top_something_to_csv(
        dict_int_user_influence, 'user_influence.csv',
        ['username', 'influence(mentions/number_of_tweets_by_this_user)'],
        reverse=True,
        sort_key_function=lambda t: t[1],
        value_format_function=lambda t: t)

    # Writing the TXT files for the wordclouds.
    dict_to_txt_for_wordle(dict_int_words, 'top_words_wordle.txt',
                           sort_key=lambda t: t[1])
    dict_to_txt_for_wordle(dict_int_hashtags, 'top_hashtags_wordle.txt',
                           sort_key=lambda t: t[1])
    dict_to_txt_for_wordle(dict_int_hashtags_without_accents,
                           'top_hashtags_without_accents_wordle.txt',
                           sort_key=lambda t: t[1])

    # Writing the word timeline.
    timeline(words_per_time, get_N_first(dict_int_words, number_of_topwords),
             timestamp_list)

    # Writing the tweets that have links.
    write_set_of_tuples(set_tup_str_tweets_with_links,
                        'tweets_with_links.csv',
                        column_titles=lis_column_titles)
    # Writing the tweets that contain only the specified hashtag.
    write_set_of_tuples(set_tup_tweets_specific_hashtag,
                        'tweets_of_a_specific_hashtag.csv',
                        column_titles=lis_column_titles)
    # Writing the tweets that have no hashtags.
    write_set_of_tuples(set_tup_str_tweets_without_hashtags,
                        'tweets_without_hashtags.csv',
                        column_titles=lis_column_titles)

    print(str(int_total_line_num) + "\t lines read.")
    print(str(len(dict_tuple_users_positions)) +
          "\t users with geolocation data.")
    print(str(int_corrupted_lines) + "\t corrupted lines in this dataset.")
    cleanup()
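
# Minimal entry-point sketch: this assumes the module does not already define
# a guard elsewhere; main() reads its command-line options from sys.argv
# through options_parser(), so no arguments need to be passed here.
if __name__ == '__main__':
    main()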