def process_day_files(t_day, file_l):
    """
    Thin the tweets in the files for day t_day, accumulating the thinned
    tweets and a per-user post dictionary, then pickle both.
    """
    print('Processing day ' + t_day)
    start_t = time.time()
    user_d = {}
    thin_l = []
    id_s = set()
    for f in file_l:
        add_to_list(f, id_s, thin_l, user_d)
    write_pkl('.'.join([t_day, 'pkl']), thin_l)
    write_pkl(''.join([t_day, 'id_post_d.pkl']), user_d)
    end_t = time.time()
    print('elapsed time for', t_day, 'is', end_t - start_t)
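
# A hypothetical driver for process_day_files, sketched here for
# illustration; the day-to-files grouping (day_file_d) is an assumption
# about how the raw captures are organized, not part of this repo:

def process_all_days(day_file_d):
    # day_file_d: maps a day string (e.g. '03_01') to the list of raw
    # tweet files captured on that day
    for t_day, file_l in day_file_d.items():
        process_day_files(t_day, file_l)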
def generate_to_n(n, out_f_name):
    """
    Generate a dictionary, keyed by user twitter id, whose values are lists
    of (number of tweets, date) pairs for the days on which that user was
    among the top n tweeters. This is meant to be run over the
    combined_tweet files for a month.

    :param n: number of top tweeters to keep for each day
    :param out_f_name: name of the file in which a pickle of the dictionary
        will be written
    """
    f_list = [f for f in os.listdir('.') if 'combined' in f]
    top_n_d = {}
    for f in f_list:
        print('Reading file', f)
        tweet_l = gt.read_pkl(f)
        # f[-6:-4] is the two characters just before the '.pkl' extension,
        # used as the date label for this day's file
        top_n_d = build_top_n_d(tweet_l, n, top_n_d, f[-6:-4])
    gt.write_pkl(out_f_name, top_n_d)
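
# build_top_n_d is defined elsewhere in the repo. The sketch below is
# illustrative only: one way to produce the structure the docstring above
# describes, assuming each tweet object exposes the poster's id as an
# `id_str` attribute (an assumption, not the repo's actual field name):

from collections import Counter

def build_top_n_d_sketch(tweet_l, n, top_n_d, date_str):
    counts = Counter(t.id_str for t in tweet_l)
    # each of the day's top n tweeters accumulates a (count, date) pair
    for user_id, n_tweets in counts.most_common(n):
        top_n_d.setdefault(user_id, []).append((n_tweets, date_str))
    return top_n_d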
def distill_counts(t_l, fname_date):
    """
    Build the count, id-to-name, count-to-id, and count-to-posters
    dictionaries from the tweet list t_l, and pickle each of them with
    fname_date embedded in the file name.
    """
    count_d = build_count_d(t_l)
    id_name_d = build_id_name_d(t_l)
    count_id = build_count_id_d(count_d)
    count_posters = build_count_posters(count_id)
    gt.write_pkl('count_d_' + fname_date + '.pkl', count_d)
    gt.write_pkl('id_name_d_' + fname_date + '.pkl', id_name_d)
    gt.write_pkl('count_id_' + fname_date + '.pkl', count_id)
    gt.write_pkl('count_posters_' + fname_date + '.pkl', count_posters)
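
# Example use, with a hypothetical combined file for March 1st:
#
#     t_l = gt.read_pkl('combined_tweets-03_01.pkl')
#     distill_counts(t_l, '03_01')
#
# which leaves count_d_03_01.pkl, id_name_d_03_01.pkl, count_id_03_01.pkl,
# and count_posters_03_01.pkl in the working directory.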
def stream_writer():
    global tweets
    global done_sampling
    global total_tweets
    global start_time
    global write_lock
    global write_dir
    while True:
        if len(tweets) >= 10000 or done_sampling:
            # make the output directory for today if it doesn't exist yet
            datestr = time.strftime("%Y_%m_%d")
            day_dir = '/'.join([write_dir, datestr])
            if not os.path.isdir(day_dir):
                os.mkdir(day_dir)
            timestr = time.strftime("%Y_%m_%d-%H_%M_%S")
            outstr = day_dir + '/' + 'political_' + timestr + '.pkl'
            # write tweets to file, holding the lock so the sampler
            # can't append to the buffer while it is being flushed
            write_lock.acquire()
            total_tweets += len(tweets)
            gt.write_pkl(outstr, tweets)
            print("wrote " + timestr + " to file with " + str(len(tweets)) + " tweets.")
            print(str(total_tweets) + " total tweets downloaded since " + start_time + ".")
            tweets = []
            write_lock.release()
        if done_sampling:
            return
        else:
            time.sleep(1)
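
# One way stream_writer might be wired up, sketched for illustration;
# the sampler name `run_sampler` and the initial values are assumptions:
#
#     import threading
#
#     tweets = []
#     done_sampling = False
#     total_tweets = 0
#     start_time = time.strftime("%Y_%m_%d-%H_%M_%S")
#     write_lock = threading.Lock()
#     write_dir = 'raw_tweets'
#
#     writer = threading.Thread(target=stream_writer)
#     writer.start()
#     run_sampler()  # appends to `tweets`, sets done_sampling when finished
#     writer.join()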
def build_tweet_list(f_list):
    """
    :return a list of the thinned tweets in the files supplied
    """
    tweet_list = []
    tweet_set = set()
    for f_name in f_list:
        t_l = ut.read_pkl(f_name)
        for t in t_l:
            if t.tweet_id not in tweet_set:
                tweet_list.append(t)
                tweet_set.add(t.tweet_id)
    return tweet_list


if __name__ == '__main__':
    if len(sys.argv) < 3:
        print('Usage: python merge_tweet_files.py target_directory file_pattern')
        sys.exit(1)
    f_list = get_flist(sys.argv[1], sys.argv[2])
    out_fname = '-'.join(['combined_tweets', sys.argv[2]])
    out_fname = '.'.join([out_fname, 'pkl'])
    out_name = '/'.join([sys.argv[1], out_fname])
    if os.path.exists(out_name):
        print('Output file already exists...Continue? [y/n]')
        if input() != 'y':
            sys.exit()
    comb_tweet_l = build_tweet_list(f_list)
    ut.write_pkl(out_name, comb_tweet_l)
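
# get_flist is defined elsewhere; a minimal sketch under the assumption
# that it returns the paths in target_directory whose names match the
# supplied pattern as a substring:

import fnmatch

def get_flist_sketch(target_dir, pattern):
    return ['/'.join([target_dir, f]) for f in os.listdir(target_dir)
            if fnmatch.fnmatch(f, '*' + pattern + '*')]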