def print_user_archive():
    """
    Fetch all available tweets for one user and print them, line by line.
    """
    archive_generator = rest.fetch_user_archive("lessig")
    for page in archive_generator:
        for tweet in page:
            print_tweet(tweet)
def save_user_archive_to_database():
    """
    Fetch all available tweets for one user and save them to the database.
    """
    archive_generator = rest.fetch_user_archive("lessig")
    for page in archive_generator:
        for tweet in page:
            database.create_tweet_from_dict(tweet)
    logging.warning(u"Wrote tweets from @lessig to database")
def save_user_archive_to_file():
    """
    Fetch all available tweets for one user and save them to a text file,
    one tweet per line. (This is approximately the format that GNIP uses.)
    """
    with open("lessig-tweets.json", "w") as f:
        archive_generator = rest.fetch_user_archive("lessig")
        for page in archive_generator:
            for tweet in page:
                f.write(json.dumps(tweet) + "\n")
    logging.warning(u"Wrote tweets from @lessig to file lessig-tweets.json")
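# Since save_user_archive_to_file writes one JSON-encoded tweet per line, the resulting file can be read back
# lazily without loading everything at once. This is only an illustrative sketch (the helper name and default
# path are not part of the original code); it assumes the standard-library json module already imported above.
def read_tweets_from_file(path="lessig-tweets.json"):
    """Yield tweet dicts from a file that stores one JSON-encoded tweet per line."""
    with open(path, "r") as f:
        for line in f:
            line = line.strip()
            if line:  # skip blank lines
                yield json.loads(line)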
def main():
    archive = rest.fetch_user_archive(sys.argv[1])
    processed_word_list = []
    for page in archive:
        for tweet in page:
            # print_tweet(tweet)
            print(u"{0}: {1}".format(tweet["user"]["screen_name"], tweet["text"]))
            print("Length of the tweets: ", len(tweet["text"]))
            processed_word_list = filter_stop_words(processed_word_list, tweet["text"])
        plot_tweets_by_wordlist(processed_word_list)
        break
def main():
    archive = rest.fetch_user_archive(sys.argv[1])
    processed_word_list = []
    for page in archive:
        for tweet in page:
            # print_tweet(tweet)
            cur_text = tweet["text"]
            # print(u"{0}: {1}".format(tweet["user"]["screen_name"], cur_text))
            # print("Length of the tweets: ", len(cur_text))
            cur_text = cleanse_sentence(cur_text)
            processed_word_list = filter_stop_words(processed_word_list, cur_text)
        plot_tweets_by_wordlist(processed_word_list, True)
        break
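# Both versions of main() rely on a filter_stop_words helper that is not shown in this section. The sketch below
# is a hypothetical stand-in, assuming the helper tokenizes the tweet text, drops common stop words, and extends
# the running word list; the actual implementation in the repository may differ.
STOP_WORDS = {"the", "a", "an", "and", "or", "to", "of", "in", "is", "it", "rt"}  # illustrative stop-word set


def filter_stop_words(processed_word_list, text):
    """Append the non-stop-word tokens of `text` to the running word list and return it."""
    words = [w.lower().strip(".,!?:;\"'") for w in text.split()]
    processed_word_list.extend(w for w in words if w and w not in STOP_WORDS)
    return processed_word_list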
def save_user_archive_list(unfiltered_user_path, months_back, lang, capital):
    """
    Fetch tweets (at max 200x200) for a list of users from a csv, filter them, if they still exist, according to
    their activity and the selection criteria: user["interface"] / user["time_zone"] / tweet["language"] /
    tweet["place"], and then save them in a csv, 'filtered_users.csv', plus some other analysis csv files.
    :param unfiltered_user_path: (user_id, retrieval_date) rows read from the file with the list of unique users,
                                 "unique_users.csv"
    :param months_back: (int) if the user has not tweeted in the last months_back months, the user is not active
    :param lang: (str) language of the nation of analysis ("it")
    :param capital: (str) capital of the nation of analysis ("rome")
    :return: 1) 'filtered_users.csv' with the ids of the users that satisfy the activity and selection criteria
             2) 'filtered_time_users.csv' with the ids of the selected users and their retrieval date
             3) 'unique_active_users_time.csv' with the number of saved users per retrieval date
             4) 'nationality_users.csv' with the selection parameters: n_tweets, user_interface, user_time_zone,
                tweet_language, tweet_place
             5) 'perc_overlap.csv' / 'abs_overlap.csv' with the impact of each selection criterion
             6) 'frequency_users.csv' with the frequency of the users
    """
    with open('filtered_users.csv', "w") as f, open('filtered_time_users.csv', "w") as ft, \
            open('unique_active_users_time.csv', "w") as t, open('nationality_users.csv', "w") as n, \
            open('abs_overlap.csv', "w") as a, open('perc_overlap.csv', "w") as p, \
            open('frequency_users.csv', "w") as freq:
        dict_retrival_date = {}
        unique_active_users = []
        frequency = []
        perc_users = {}
        u_interface = ['user_interface', 0, 0, 0, 0]  # user_interface, &user_time_zone, &tweet_language, &tweet_place
        u_time_zone = ['user_time_zone', 0, 0, 0, 0]  # &user_interface, user_time_zone, &tweet_language, &tweet_place
        u_language = ['tweet_language', 0, 0, 0, 0]   # &user_interface, &user_time_zone, tweet_language, &tweet_place
        u_place = ['tweet_place', 0, 0, 0, 0]         # &user_interface, &user_time_zone, &tweet_language, tweet_place
        n.write("user_id, n_tweets, user_interface, user_time_zone, tweet_language, tweet_place\n")
        freq.write("days interval, users' count\n")
        for user, retrival_date in unfiltered_user_path:
            perc_users[user] = [0, 0, 0, 0, 0]
            archive_first_page = rest.fetch_user_archive(user, 1, 200)  # get the first page of the archive
            n_tweets_user = 0
            # filter
            for tweets in archive_first_page:
                # if the profile still exists
                if "errors" not in archive_first_page:
                    date_first_tweet = parser.parse(tweets[0]['created_at']).strftime("%Y-%m-%d")
                    date_first_tweet_str = datetime.datetime.strptime(date_first_tweet, "%Y-%m-%d").date()
                    month_ago = (datetime.datetime.today() - relativedelta(months=months_back)).date()
                    for tweet in tweets:
                        # if the user is still active
                        if date_first_tweet_str >= month_ago:
                            n_tweets_user += 1
                            language = tweet["lang"]
                            if tweet["place"] is not None:
                                place = tweet["place"]["country_code"].lower()
                            else:
                                place = None
                            if tweet["user"]["lang"] is not None:
                                interface = tweet["user"]["lang"].lower()
                            else:
                                interface = None
                            if tweet["user"]["time_zone"] is not None:
                                time_zone = tweet["user"]["time_zone"].lower()
                            else:
                                time_zone = None
                            # analyse all their tweets
                            if interface == lang:
                                perc_users[user][0] += 1
                            if time_zone == capital:
                                perc_users[user][1] += 1
                            if language == lang:
                                perc_users[user][2] += 1
                            if place == lang:
                                perc_users[user][3] += 1
                            # if the national criteria are still valid
                            if interface == lang or time_zone == capital or language == lang or place == lang:
                                # selection phase: unique users
                                if user not in unique_active_users:
                                    f.write("{0}\n".format(user))
                                    ft.write("{0},{1}\n".format(user, retrival_date))
                                    if retrival_date in dict_retrival_date:
                                        dict_retrival_date[retrival_date] += 1
                                    else:
                                        dict_retrival_date[retrival_date] = 1
                                    if interface == lang:
                                        u_interface[1] += 1
                                        if time_zone == capital:
                                            u_interface[2] += 1
                                        if language == lang:
                                            u_interface[3] += 1
                                        if place == lang:
                                            u_interface[4] += 1
                                    if time_zone == capital:
                                        u_time_zone[2] += 1
                                        if interface == lang:
                                            u_time_zone[1] += 1
                                        if language == lang:
                                            u_time_zone[3] += 1
                                        if place == lang:
                                            u_time_zone[4] += 1
                                    if language == lang:
                                        u_language[3] += 1
                                        if interface == lang:
                                            u_language[1] += 1
                                        if time_zone == capital:
                                            u_language[2] += 1
                                        if place == lang:
                                            u_language[4] += 1
                                    if place == lang:
                                        u_place[4] += 1
                                        if interface == lang:
                                            u_place[1] += 1
                                        if time_zone == capital:
                                            u_place[2] += 1
                                        if language == lang:
                                            u_place[3] += 1
                                    unique_active_users.append(user)
                        else:
                            break
                    # consider all the tweets for the nationality consistency of the user, but print just the selected
                    if user in unique_active_users:
                        date_first_tweet = parser.parse(tweets[0]['created_at'])
                        date_last_tweet = parser.parse(tweets[199]['created_at'])
                        time_interval = date_first_tweet - date_last_tweet  # in days
                        frequency.append(time_interval.days)
                        perc_users[user] = [l / n_tweets_user for l in perc_users[user]]
                        perc_users[user][4] = n_tweets_user
                        n.write("{0},{1},{2},{3},{4},{5}\n".format(user, perc_users[user][4], perc_users[user][0],
                                                                   perc_users[user][1], perc_users[user][2],
                                                                   perc_users[user][3]))
        print(dict_retrival_date)
        u_tot = 0
        for key in dict_retrival_date:
            u_tot += dict_retrival_date[key]
            t.write("{0},{1}\n".format(key, dict_retrival_date[key]))
        for item in Counter(frequency).items():
            freq.write("{0},{1}\n".format(item[0], item[1]))
        # summary: overlap tables
        a.write("{0}, user_interface, user_time_zone, tweet_language, tweet_place\n".format(u_tot))
        a.write("{0},{1},{2},{3},{4}\n".format(u_interface[0], u_interface[1], u_interface[2], u_interface[3],
                                               u_interface[4]))
        a.write("{0},{1},{2},{3},{4}\n".format(u_time_zone[0], u_time_zone[1], u_time_zone[2], u_time_zone[3],
                                               u_time_zone[4]))
        a.write("{0},{1},{2},{3},{4}\n".format(u_language[0], u_language[1], u_language[2], u_language[3],
                                               u_language[4]))
        a.write("{0},{1},{2},{3},{4}\n".format(u_place[0], u_place[1], u_place[2], u_place[3], u_place[4]))
        p.write(" , user_interface, user_time_zone, tweet_language, tweet_place\n")
        if u_interface[1] != 0:
            p.write("{0},{1},{2},{3},{4}\n".format(u_interface[0], u_interface[1] / u_tot,
                                                   u_interface[2] / u_interface[1], u_interface[3] / u_interface[1],
                                                   u_interface[4] / u_interface[1]))
        else:
            p.write("{0},{1},{2},{3},{4}\n".format(u_interface[0], u_interface[1] / u_tot, None, None, None))
        if u_time_zone[2] != 0:
            p.write("{0},{1},{2},{3},{4}\n".format(u_time_zone[0], u_time_zone[1] / u_time_zone[2],
                                                   u_time_zone[2] / u_tot, u_time_zone[3] / u_time_zone[2],
                                                   u_time_zone[4] / u_time_zone[2]))
        else:
            p.write("{0},{1},{2},{3},{4}\n".format(u_time_zone[0], None, u_time_zone[2] / u_tot, None, None))
        if u_language[3] != 0:
            p.write("{0},{1},{2},{3},{4}\n".format(u_language[0], u_language[1] / u_language[3],
                                                   u_language[2] / u_language[3], u_language[3] / u_tot,
                                                   u_language[4] / u_language[3]))
        else:
            p.write("{0},{1},{2},{3},{4}\n".format(u_language[0], None, None, u_language[3] / u_tot, None))
        if u_place[4] != 0:
            p.write("{0},{1},{2},{3},{4}\n".format(u_place[0], u_place[1] / u_place[4], u_place[2] / u_place[4],
                                                   u_place[3] / u_place[4], u_place[4] / u_tot))
        else:
            p.write("{0},{1},{2},{3},{4}\n".format(u_place[0], None, None, None, u_place[4] / u_tot))
def download_archive_json(filtered_user_path, archive_name, min_tweet_date, max_tweet_date, disk_out, last_user):
    # download_archive.download_archive_json('unique_frequent_users.csv', 'archive_frequent_users_2018', "2018-01-01",
    #                                        "2018-12-31", '', None)
    """
    Function to just download the archive as json.
    :param filtered_user_path: (str) path of the csv file with the list of filtered selected users
    :param archive_name: (str) name of the compressed file with the archive
    :param min_tweet_date: (str) lower limit to save the tweets (to be changed each month to speed up the code)
    :param max_tweet_date: (str) upper limit to save the tweets
    :param disk_out: (str) path of the disk_out to save the json
    :param last_user: (int) plan B if there is no json file. Insert a user id to start retrieving data from there
    :return: json with the tweets between min_tweet_date and max_tweet_date
    """
    no_retrieved_users = [
        row[0] for row in list_no_retrieved_users(filtered_user_path, archive_name + '.json', disk_out, last_user)
    ]
    processed_users, too_frequent_users = 0, 0
    user_archive_list = None
    min_tweet_date = parser.parse(min_tweet_date)
    max_tweet_date = parser.parse(max_tweet_date)
    tot_users = len(no_retrieved_users)
    with open(os.path.join(disk_out, "too_frequent_users.csv"), "a") as tf:
        f_json = gzip.open(os.path.join(disk_out, archive_name + '.json.gz'), 'at', encoding='utf-8')
        for user_str in no_retrieved_users:
            processed_users += 1
            user = int(user_str)
            print('User: '******'created_at']).replace(tzinfo=None)
            if len(user_archive_list) >= 16:
                # too frequent users are written in a file
                tf.write("{0},{1}\n".format(user_str, date_last_tweet))
                too_frequent_users += 1
            for page in user_archive_list:  # for each page, from the most recent to the oldest
                n_tweets_pag = len(page)
                if n_tweets_pag > 0 and "error" not in page:  # if no errors in pages
                    date_first_tweet = parser.parse(page[0]['created_at']).replace(tzinfo=None)
                    date_last_tweet = parser.parse(page[n_tweets_pag - 1]['created_at']).replace(tzinfo=None)
                    # if the page has tweets in the desired time interval (no time zone)
                    # pages with some tweets in the interval
                    if date_first_tweet >= min_tweet_date and date_last_tweet <= max_tweet_date:
                        # if all the tweets are in the interval
                        if date_last_tweet >= min_tweet_date and date_first_tweet <= max_tweet_date:
                            # no extra checks on the tweets; save the tweets of the page in the json
                            for tweet in page:
                                # save tweets in a json file
                                json_str = '{}\n'.format(json.dumps(tweet))
                                f_json.write(json_str)
                        else:
                            for tweet in page:
                                # check one by one and save the tweets of the page in the json
                                date_tweet = parser.parse(tweet['created_at']).replace(tzinfo=None)
                                if min_tweet_date <= date_tweet <= max_tweet_date:
                                    # save tweets in a json file
                                    json_str = '{}\n'.format(json.dumps(tweet))
                                    f_json.write(json_str)
            # timer to see what I am doing
            if processed_users % 10 == 0:
                print("---------------------------------------------------------------------------------")
                print("user:"******"percentage of processed users:" + str(processed_users / tot_users))
                print("processed users:" + str(processed_users))
                print("too frequent users:" + str(too_frequent_users))
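# download_archive_json writes the archive as gzip-compressed, newline-delimited JSON under disk_out. A minimal
# sketch of reading it back (the helper name is an assumption; it uses only the gzip, json and os modules already
# used above, and builds the path the same way the writer does):
def iter_downloaded_archive(disk_out, archive_name):
    """Yield tweet dicts from the gzipped, newline-delimited JSON archive produced above."""
    path = os.path.join(disk_out, archive_name + '.json.gz')
    with gzip.open(path, 'rt', encoding='utf-8') as f_json:
        for line in f_json:
            line = line.strip()
            if line:
                yield json.loads(line)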
def download_new_user(unfiltered_user_path, archive_name, min_tweet_date, max_tweet_date, months_back, lang, capital,
                      threshold_lang, disk_out, last_user, chunk_users):
    # download_archive.download_new_user('unique_users.csv', 'archive_01_08_2018', "2018-01-01", "2018-12-31", 1, "it",
    #                                    "rome", 2, '', None, 0, 50)
    """
    For each user, check if it still exists, if the selection criteria are still satisfied in the first page of the
    archive (last 200 tweets), and if the user has been active in the last month. If all 3 criteria are satisfied,
    the user gets the field selected = True and all of its tweets in the time interval are saved in a json file.
    :param unfiltered_user_path: (str) path of the csv file with the list of unfiltered selected users
    :param archive_name: (str) name of the compressed file with the archive
    :param min_tweet_date: (str) lower limit to save the tweets (to be changed each month to speed up the code)
    :param max_tweet_date: (str) upper limit to save the tweets
    :param months_back: (int) number of months to go backwards before considering the user inactive (1)
    :param lang: (str) selection parameter ("it")
    :param capital: (str) selection parameter ("rome")
    :param threshold_lang: (int) selection parameter: minimum level of consistency required for the language
    :param disk_out: (str) path of the disk_out to save the json.gz
    :param last_user: (int) plan B if there is no csv file. Insert a user id (as int) to start retrieving data from
                      there
    :param chunk_users: (int) number of users of the chunk files
    :return: json and csv.gz files with the tweets and the tables of the selected users between min_tweet_date and
             max_tweet_date
    """
    unfiltered_users_retrieval_date = list_no_retrieved_users(unfiltered_user_path, archive_name + '.json',
                                                              disk_out, last_user)
    unfiltered_users = [row[0] for row in unfiltered_users_retrieval_date]
    processed_users = 0
    unique_active_users = 0
    min_tweet_date = parser.parse(min_tweet_date)
    max_tweet_date = parser.parse(max_tweet_date)
    n_unfiltered_users = len(unfiltered_users)
    f_json = open(os.path.join(disk_out, archive_name + '.json'), 'a', encoding='utf-8')
    filename = archive_name + '_user.csv'
    data_folder = os.path.join(disk_out, archive_name + '_' + "data")
    user_csv_file = open_file(data_folder, filename, 'w', 'csv')
    user_csv = csv.writer(user_csv_file, delimiter=',')
    for user_str in unfiltered_users_retrieval_date:
        selected_criteria = False  # user not yet selected for criteria
        saved_user = False  # user saved in csv
        processed_users += 1
        user = int(user_str[0])
        retrieval_date = user_str[1]
        print('User: '******'created_at'])  # as date
        date_first_tweet_str = date_first_tweet_date.strftime("%Y-%m-%d")
        date_first_tweet = datetime.datetime.strptime(date_first_tweet_str, "%Y-%m-%d").date()  # as str
        month_ago = (datetime.datetime.today() - relativedelta(months=months_back)).date()  # as str
        date_last_tweet = parser.parse(archive_first_page[n_tweets_first_page - 1]['created_at'])
        if date_first_tweet >= month_ago:
            time_interval = date_first_tweet_date - date_last_tweet  # in days
            if time_interval.total_seconds() != 0:  # avoid division by zero
                frequency_user = n_tweets_first_page / time_interval.total_seconds()
            # check first page
            first_full_page = False
            for page in archive:  # for each page, from the most recent to the oldest
                n_tweets_pag = len(page)
                if n_tweets_pag > 0 and "error" not in page:  # if no errors in pages
                    if first_full_page is False:
                        # consistency
                        nationality_user = [0, 0, 0, 0]  # percentage of the satisfied criteria in time
                        # characteristics of each tweet
                        for tweet in archive_first_page:
                            language = tweet["lang"]
                            if tweet["place"] is not None:
                                place = tweet["place"]["country_code"].lower()
                            else:
                                place = None
                            if tweet["user"]["lang"] is not None:
                                interface = tweet["user"]["lang"].lower()
                            else:
                                interface = None
                            if tweet["user"]["time_zone"] is not None:
                                time_zone = tweet["user"]["time_zone"].lower()
                            else:
                                time_zone = None
                            # if the national criteria are still valid
                            if interface == lang or time_zone == capital or language == lang or place == lang:
                                if interface == lang:
                                    nationality_user[0] += 1
                                if time_zone == capital:
                                    nationality_user[1] += 1
                                if language == lang:
                                    nationality_user[2] += 1
                                if place == lang:
                                    nationality_user[3] += 1
                        if nationality_user[0] > 0 or nationality_user[1] > 0 or \
                                nationality_user[2] > threshold_lang or nationality_user[3] > 0:
                            selected_criteria = True  # user selected: criteria satisfied
                        # save the array as str
                        nationality_user = str([l / n_tweets_first_page for l in nationality_user])
                        retrieval_date = datetime.datetime.strptime(retrieval_date, "%Y-%m-%d")
                        first_full_page = True
                    # save user in csv, if criteria satisfied
                    if selected_criteria is True:
                        # save user in the csv
                        if saved_user is False:
                            unique_active_users += 1
                            user_l = user_from_tweet(tweet_1, retrieval_date, frequency_user, nationality_user)
                            user_csv.writerow(user_l)
                            saved_user = True
                        # save also its tweets
                        # if the page has tweets in the desired time interval (no time zone)
                        date_first_tweet = parser.parse(page[0]['created_at']).replace(tzinfo=None)
                        date_last_tweet = parser.parse(page[n_tweets_pag - 1]['created_at']).replace(tzinfo=None)
                        # pages with some tweets in the interval
                        if date_first_tweet >= min_tweet_date and date_last_tweet <= max_tweet_date:
                            # if all the tweets are in the interval
                            if date_last_tweet >= min_tweet_date and date_first_tweet <= max_tweet_date:
                                # no extra checks on the tweets; save the tweets of the page in the json
                                for tweet in page:
                                    # save tweets in a json file
                                    write_tweet_in_json(tweet, f_json)
                            else:
                                for tweet in page:
                                    # check one by one and save the tweets of the page in the json
                                    date_tweet = parser.parse(tweet['created_at']).replace(tzinfo=None)
                                    if min_tweet_date <= date_tweet <= max_tweet_date:
                                        # save tweets in a json file
                                        write_tweet_in_json(tweet, f_json)
        # timer to see what I am doing
        if processed_users % 10 == 0:
            print("---------------------------------------------------------------------------------")
            print("user:"******"percentage of processed users:" + str(processed_users / n_unfiltered_users))
            print("processed users:" + str(processed_users))
        # do chunk backup copies every tot users: split
        if processed_users % chunk_users == 0:
            user_csv_file.close()  # close file
            os.rename(filename, 'backup_' + filename)  # do a backup copy
            # open the file and start writing on it again
            user_csv_file = open_file(data_folder, filename, 'w', 'csv')
            user_csv = csv.writer(user_csv_file, delimiter=',')
    user_csv_file.close()
    f_json.close()
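# write_tweet_in_json is referenced above but not defined in this section. Based on how download_archive_json
# serializes tweets (one JSON object per line), a minimal stand-in could look like the sketch below; the exact
# signature and behaviour in the repository are assumptions.
def write_tweet_in_json(tweet, f_json):
    """Write a single tweet dict to the open file handle as one JSON-encoded line."""
    f_json.write('{}\n'.format(json.dumps(tweet)))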