def main(argv):
    """
    Creates n-grams from the file New Years Resolution_merged.tsv.
    Output in JSON format
    """
    if validate_argv(argv) is False:
        print "Usage: tokenizeTweets.py <file name> <file directory> <n (for n gram)>"
        sys.exit()

    file_name = argv[0]
    input_directory_name = argv[1]
    n_for_ngrams = int(argv[2])

    input_path = fp.get_file_path(file_name, input_directory_name)
    tweets = extract_tweets(input_path)
    tweets_deduped = dedupe_and_tokenize(tweets)

    # for creating an ngram dictionary
    ngrams = dg.create_ngrams(tweets_deduped, n_for_ngrams)
    ngram_dict = dg.create_ngram_dict(ngrams)
    output_path = fp.set_output_file_path(
        'New Years Resolution_ngram_' + str(n_for_ngrams) + '.json', 'ngrams')
    output_ngram(ngram_dict, output_path)

    # for creating a list of tokens, removing the words "New Years Resolution" as well
    tokens = break_down_sentences(tweets_deduped)
    tokens_cleaned = remove_tokens(tokens, ['new', 'years', 'resolution', ':'])
    output_path2 = fp.set_output_file_path('New Years Resolution_tokens.tsv', 'tokens')
    output_tokens(tokens_cleaned, output_path2)
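# The dg n-gram helpers called above are defined elsewhere in the project and
# are not shown here. Below is a minimal sketch of what create_ngrams and
# create_ngram_dict could look like, assuming the deduped tweets are lists of
# tokens; the real signatures and return types in dg are assumptions.
def create_ngrams_sketch(tokenized_tweets, n):
    """Hypothetical: build n-grams from a list of token lists."""
    ngrams = []
    for tokens in tokenized_tweets:
        for i in range(len(tokens) - n + 1):
            # join n consecutive tokens into one space-separated n-gram
            ngrams.append(' '.join(tokens[i:i + n]))
    return ngrams

def create_ngram_dict_sketch(ngrams):
    """Hypothetical: count how often each n-gram occurs."""
    counts = {}
    for gram in ngrams:
        counts[gram] = counts.get(gram, 0) + 1
    return counts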
def main(argv):
    """
    Merges files of the format <search_term>_####.tsv in the data_raw
    directory and outputs into the "merged" directory
    """
    if validate_argv(argv) is False:
        print "Usage: mergeFiles.py <search_term>"
        sys.exit()

    input_directory_name = 'data_raw'
    search_term = argv[0]
    output_file_name = search_term + '_merged.tsv'
    output_directory_name = 'merged'
    output_path = fp.set_output_file_path(output_file_name, output_directory_name)
    output = open(output_path, 'a')

    # build every candidate file name of the form <search_term>_####.tsv
    # and append the contents of the ones that actually exist
    for h1 in range(3):
        for h2 in range(10):
            for m1 in range(6):
                for m2 in range(10):
                    file_name = (search_term + '_' + str(h1) + str(h2) +
                                 str(m1) + str(m2) + '.tsv')
                    file_path = fp.get_file_path(file_name, input_directory_name)
                    if fp.filename_exists(file_path):
                        file = open(file_path, 'r')
                        file.next()  # skip the header row
                        for line in file:
                            output.write(line)
                        file.close()
    output.close()
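# The fp path helpers used in both functions live elsewhere in the project.
# A minimal sketch of what they might do is below, assuming the input and
# output directories sit next to the scripts; the actual helper names,
# signatures, and directory layout are assumptions.
import os

def get_file_path_sketch(file_name, directory_name):
    """Hypothetical: build an input path like data_raw/<file_name>."""
    return os.path.join(directory_name, file_name)

def set_output_file_path_sketch(file_name, directory_name):
    """Hypothetical: build an output path, creating the directory if needed."""
    if not os.path.isdir(directory_name):
        os.makedirs(directory_name)
    return os.path.join(directory_name, file_name)

def filename_exists_sketch(file_path):
    """Hypothetical: check whether a candidate input file exists."""
    return os.path.isfile(file_path)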