Example #1
0
def process_day_files(t_day, file_l):
    """Thin the tweets for one day's files and pickle the results.

    :param t_day: day label (string) used to build the output file names
    :param file_l: list of file names to process for that day
    """
    print('Processing day ' + t_day)
    start_t = time.time()
    user_d = {}   # presumably maps an id to post data; filled by add_to_list — confirm there
    thin_l = []   # accumulated thinned tweets
    id_s = set()  # ids seen so far, for de-duplication across files
    for f in file_l:
        add_to_list(f, id_s, thin_l, user_d)
    # BUG FIX: '.'.join([t_day, '.pkl']) produced '<day>..pkl' (double dot
    # before the extension); build the name with a single separator instead.
    write_pkl(t_day + '.pkl', thin_l)
    write_pkl(t_day + 'id_post_d.pkl', user_d)
    end_t = time.time()
    print('elapsed time for ', t_day, 'is', end_t - start_t)
Example #2
0
def generate_to_n(n, out_f_name):
    """Build and pickle a dict of the top n tweeters per day.

    The dictionary is keyed by user twitter id, with values a list of
    number of tweets and the date of that count.  Meant to be run over
    the combined_tweet files for a month, found in the current directory.

    :param n: keep the top n tweeters for each day
    :param out_f_name: file in which a pickle of the dictionary is written
    """
    # Every file in the working directory whose name mentions 'combined'.
    combined_files = [name for name in os.listdir('.') if 'combined' in name]

    top_10_d = {}
    for fname in combined_files:
        print('Reading file', fname)
        tweet_l = gt.read_pkl(fname)
        # fname[-6:-4] is assumed to be the day portion of the file name
        # (the two characters before '.pkl') — verify against the naming scheme.
        top_10_d = build_top_n_d(tweet_l, n, top_10_d, fname[-6:-4])

    gt.write_pkl(out_f_name, top_10_d)
Example #3
0
def distill_counts(t_l, fname_date):
    """Derive count/id dictionaries from a tweet list and pickle each one.

    :param t_l: list of tweets to distill
    :param fname_date: date string appended to each output file name
    """
    count_d = build_count_d(t_l)
    id_name_d = build_id_name_d(t_l)
    count_id = build_count_id_d(count_d)
    count_posters = build_count_posters(count_id)

    # One pickle per derived structure, named '<prefix><date>.pkl'.
    outputs = (
        ('count_d_', count_d),
        ('id_name_d_', id_name_d),
        ('count_id_', count_id),
        ('count_posters_', count_posters),
    )
    for prefix, obj in outputs:
        gt.write_pkl(prefix + fname_date + '.pkl', obj)
Example #4
0
def stream_writer():
    """Periodically flush the global tweet buffer to a dated pickle file.

    Loops until ``done_sampling`` is set, writing whenever at least 10000
    tweets are buffered (or on shutdown).  Output goes into a per-day
    subdirectory of ``write_dir``, created on first use.
    """
    global tweets
    global done_sampling
    global total_tweets
    global start_time
    global write_lock
    global write_dir

    while True:
        if len(tweets) >= 10000 or done_sampling:

            # Per-day output directory: <write_dir>/<YYYY_MM_DD>
            day_dir = '/'.join([write_dir, time.strftime("%Y_%m_%d")])
            if not os.path.isdir(day_dir):
                os.mkdir(day_dir)
            timestr = time.strftime("%Y_%m_%d-%H_%M_%S")
            # BUG FIX: the old code prepended write_dir a second time
            # (write_dir + "/" + datestr, where datestr already began with
            # write_dir), yielding a path in a directory that was never
            # created.
            outstr = day_dir + "/" + 'political_' + timestr + ".pkl"

            # BUG FIX: removed the redundant `with open(outstr, 'wb')`
            # wrapper.  gt.write_pkl is called with a file *name* everywhere
            # else in this file, so it opens the file itself; the extra open
            # only truncated the file and left an unused handle.
            # Using `with write_lock:` also guarantees the lock is released
            # if write_pkl raises.
            with write_lock:
                total_tweets += len(tweets)
                gt.write_pkl(outstr, tweets)
                print("wrote " + timestr + " to file with " +
                      str(len(tweets)) + " tweets.")
                print(
                    str(total_tweets) + " total tweets downloaded since " +
                    start_time + ".")
                tweets = []

            if done_sampling:
                return
        else:
            time.sleep(1)
Example #5
0
    :return a list of the thinned tweets in the files supplied
    """
    tweet_list = []
    tweet_set = set()
    for f_name in f_list:
        t_l = ut.read_pkl(f_name)
        for t in t_l:
            if t.tweet_id not in tweet_set:
                tweet_list.append(t)
                tweet_set.add(t)

    return tweet_list


if __name__ == '__main__':
    # Merge per-file tweet pickles in target_directory matching file_pattern
    # into one combined pickle.
    if len(sys.argv) < 3:
        print(
            'Usage: python merge_tweet_files.py target_directory file_pattern')
        sys.exit(1)

    f_list = get_flist(sys.argv[1], sys.argv[2])
    out_fname = '-'.join(['combined_tweets', sys.argv[2]])
    out_fname = '.'.join([out_fname, 'pkl'])
    out_name = '/'.join([sys.argv[1], out_fname])
    if os.path.exists(out_name):
        print('Output file already exists...Continue? [y/n]')
        if input() != 'y':
            # CONSISTENCY FIX: use sys.exit() like the usage branch above;
            # bare exit() is a site-module convenience that is not
            # guaranteed to exist when scripts run without site.
            sys.exit()
    comb_tweet_l = build_tweet_list(f_list)
    ut.write_pkl(out_name, comb_tweet_l)