# Shared imports for the snippets below. ioutils is the repo's own I/O
# helper module; load_pickle is assumed to be its pickle loader.
import sys
from Queue import Empty

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from scipy.stats import bernoulli

import ioutils
from ioutils import load_pickle


def get_common_vocab(in_dir, out_dir, out_file_name, in_suffix, years,
                     n_vocab, donor_path, receptor_path):
    # Intersect the vocabularies of all years, then subsample roughly
    # n_vocab of the surviving words with independent Bernoulli draws.
    common_vocab = None
    for year in years:
        # Only the first column (the words) is needed here.
        col1, _ = ioutils.load_word_pairs(in_dir + str(year) + in_suffix)
        file_vocab = set(col1)
        if common_vocab is None:
            common_vocab = file_vocab
        else:
            common_vocab = common_vocab & file_vocab
    # Clamp p in case n_vocab exceeds the size of the common vocabulary.
    p = min(1.0, float(n_vocab) / len(common_vocab))
    data_bern = bernoulli.rvs(size=len(common_vocab), p=p)
    common_vocab_list = list(common_vocab)
    random_common_vocab = set()
    for idx, draw in enumerate(data_bern):
        if draw == 1:
            random_common_vocab.add(common_vocab_list[idx])
    # The donor and receptor words are always kept, whether or not they
    # survived the subsampling.
    random_common_vocab |= set(ioutils.load_word_list(donor_path))
    random_common_vocab |= set(ioutils.load_word_list(receptor_path))
    ioutils.write_list(out_dir + out_file_name, list(random_common_vocab))
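# Example invocation (a sketch; every path, suffix, the year range, and the
# vocabulary size below are hypothetical, not taken from the repo):
#
#     get_common_vocab("data/pairs/", "data/vocab/", "sampled_vocab.txt",
#                      "-pairs.txt", range(1900, 2000, 10), n_vocab=10000,
#                      donor_path="data/donor.txt",
#                      receptor_path="data/receptor.txt")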
def numpy2text(vec_path, year, extension):
    # Dump a year's embedding matrix in word2vec's plain-text format:
    # a "<vocab_size> <dim>" header, then one "<word> <vector>" line per word.
    vocab_list = load_pickle(vec_path + str(year) + "-vocab.pkl")
    w_mat = np.load(vec_path + str(year) + "-w.npy")
    vocab_size = len(vocab_list)
    dim = w_mat.shape[1]
    ioutils.write_list(vec_path + str(year) + ".vocab", vocab_list)
    with open(vec_path + str(year) + "-w" + extension, "w") as fp:
        print >> fp, str(vocab_size), str(dim)
        for i, w in enumerate(vocab_list):
            print >> fp, w.encode("utf-8"), " ".join(map(str, w_mat[i, :]))
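# Since the header and per-word lines follow the word2vec text format, the
# output can be read back with gensim; a sketch of the round trip (the
# ".txt" extension here is an assumption about what is passed in):
#
#     from gensim.models import KeyedVectors
#     vectors = KeyedVectors.load_word2vec_format(
#         vec_path + str(year) + "-w.txt", binary=False)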
def worker(proc_num, queue, out_dir, input_dir, out_suffix):
    # Pull years off the shared queue until it is drained. A non-blocking
    # get() replaces the original empty()/get() pair, which is racy when
    # several workers share the queue. VOCAB_FILE and remove_non_alph are
    # module-level definitions.
    while True:
        try:
            year = queue.get(block=False)
        except Empty:
            break
        print proc_num, "Cleaning vocab of year", year
        vocab_list = ioutils.load_pickle(input_dir + VOCAB_FILE.format(year=year))
        cleaned_vocab_list = remove_non_alph(vocab_list)
        ioutils.write_list(out_dir + str(year) + out_suffix, cleaned_vocab_list)
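# A worker like this is normally driven by filling the queue with years and
# spawning one process per slot. A minimal sketch; the function name and the
# default process count are assumptions, not the repo's driver:
from multiprocessing import Process, Queue


def run_workers_sketch(out_dir, input_dir, out_suffix, years, num_procs=4):
    queue = Queue()
    for year in years:
        queue.put(year)
    procs = [Process(target=worker,
                     args=(i, queue, out_dir, input_dir, out_suffix))
             for i in range(num_procs)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()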
def prepare_donor_receptor_lists(raw_data_file, words_pos_file, donor_out_file,
                                 receptor_out_file, n, same_pos):
    print "Reading words and POS tags"
    word_pos_df = pd.read_csv(words_pos_file)
    donor_list, receptor_candidate_list = get_donor_receptor_candidate(
        raw_data_file, n)
    if same_pos:
        receptor_list = get_receptor_same_pos(donor_list,
                                              receptor_candidate_list,
                                              word_pos_df)
    else:
        receptor_list = get_receptor_no_pos(donor_list, receptor_candidate_list)
    print "Dumping donor and receptor lists"
    ioutils.write_list(donor_out_file, donor_list)
    ioutils.write_list(receptor_out_file, receptor_list)
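# get_donor_receptor_candidate and the two get_receptor_* helpers live
# elsewhere in the repo. A minimal sketch of the no-POS variant, under the
# assumption that receptors are simply candidates not already used as donors
# (the _sketch suffix marks it as hypothetical):
def get_receptor_no_pos_sketch(donor_list, receptor_candidate_list):
    donor_set = set(donor_list)
    return [w for w in receptor_candidate_list if w not in donor_set]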
def get_common_vocab(in_dir, out_dir, out_file_name, in_suffix, years):
    # Simpler variant of get_common_vocab: build each year's vocabulary
    # straight from the corpus text and keep only the words that appear in
    # every year.
    common_vocab = None
    for year in years:
        file_vocab = set(read_corpus_to_list(in_dir + str(year) + in_suffix))
        if common_vocab is None:
            common_vocab = file_vocab
        else:
            common_vocab = common_vocab & file_vocab
    ioutils.write_list(out_dir + out_file_name, list(common_vocab))
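# read_corpus_to_list is defined elsewhere in the repo; judging from the
# commented-out NLTK tokenization loop it replaced, a minimal sketch could
# look like this (an assumption, not the repo's code):
def read_corpus_to_list_sketch(path):
    words = []
    with open(path) as f:
        for line in f:
            for sent in nltk.sent_tokenize(line):
                words.extend(nltk.word_tokenize(sent))
    return words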
def main(args):
    # Count word frequencies in args.filename and write "<word> <count>"
    # lines, most frequent first, skipping English stopwords. The file is
    # opened with a context manager so it is closed even on error.
    encoding = sys.stdout.encoding or 'utf-8'
    fd = nltk.FreqDist()
    stop_words = set(stopwords.words('english'))
    with open(args.filename) as f:
        for line in f:
            for sent in nltk.sent_tokenize(line):
                for word in nltk.word_tokenize(sent):
                    fd[word] += 1
    words_freq = []
    for w, count in fd.most_common():
        if w not in stop_words:
            words_freq.append(u"{} {}".format(w, count).encode(encoding))
    ioutils.write_list(args.filename + "_freq", words_freq)
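# main only needs an object with a .filename attribute; a minimal argparse
# wiring sketch (the description and help strings are assumptions):
if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser(
        description="Write word frequencies for a text file.")
    parser.add_argument("filename", help="path to the input text file")
    main(parser.parse_args())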
def create_common_vocab(in_dir, ngram_file_suffix, out_dir, out_file_name,
                        years, lang):
    # Intersect per-year vocabularies, keeping only lowercase alphabetic
    # words longer than two characters that are not stopwords.
    stop_set = set(stopwords.words(lang))
    common_vocab_set = set()
    for year in years:
        file_content_list = ioutils.load_word_list(
            in_dir + str(year) + ngram_file_suffix)
        words_set = set()
        for line in file_content_list:
            for w in line.split():
                w_lower = w.lower()
                if not w_lower.isalpha() or w_lower in stop_set or len(w) <= 2:
                    continue
                words_set.add(w_lower)
        if year == years[0]:
            common_vocab_set = words_set
        else:
            common_vocab_set = common_vocab_set.intersection(words_set)
    ioutils.write_list(out_dir + out_file_name, list(common_vocab_set))
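# Example invocation (a sketch; the paths, suffix, year range, and language
# are hypothetical):
#
#     create_common_vocab("data/ngrams/", "-ngrams.txt", "data/vocab/",
#                         "common_vocab.txt", range(1800, 2001, 10), "english")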