def main(path_all, path_train, path_val, size):
    """Split a corpus into a training file and a validation file.

    A fixed-seed (1234) random permutation selects ``size`` sentences for
    the validation split; every other sentence goes to the training split.
    The split is therefore reproducible across runs.

    :param path_all: path to the full corpus (one sentence per line)
    :param path_train: output path for the training split
    :param path_val: output path for the validation split
    :param size: number of sentences assigned to the validation split
    """
    print("[nlppreprocess.split_corpus] Processing ...")
    iterator = utils.read_sentences(path_all)

    # Count the corpus size; the context manager closes the handle
    # (the original counting loop leaked an open file object).
    with open(path_all) as f:
        N = sum(1 for _ in f)
    print("[nlppreprocess.split_corpus] Total size: %d" % N)

    perm = np.random.RandomState(1234).permutation(N)
    # Use a set for O(1) membership tests; the original tested
    # ``i in val_index`` against a numpy array, which is O(size) per line.
    val_index = set(perm[-size:])

    print("[nlppreprocess.split_corpus] Writing ...")
    with open(path_train, "w") as f_train, open(path_val, "w") as f_val:
        for i, s in enumerate(iterator):
            line = " ".join(s)
            if i in val_index:
                f_val.write("%s\n" % line.encode("utf-8"))
            else:
                f_train.write("%s\n" % line.encode("utf-8"))
def main(path, char):
    """Build a gensim dictionary over the corpus at ``path``.

    The dictionary is saved next to the corpus as ``<path>.dictionary``
    (word level) or ``<path>.char.dictionary`` (character level).

    :param path: path to the corpus (one sentence per line)
    :param char: if truthy, build a character-level dictionary
    """
    assert os.path.exists(path)

    if char:
        print("[nlppreprocess.create_dictionary] NOTE: char-level mode!")
        iterator = CharIterator(path)
    else:
        iterator = utils.read_sentences(path)

    print("[nlppreprocess.create_dictionary] Processing ...")
    dictionary = gensim.corpora.Dictionary(iterator, prune_at=None)
    vocab = dictionary.token2id
    print("[nlppreprocess.create_dictionary] Vocabulary size: %d" % len(vocab))

    # Word-level and char-level dictionaries are written to different files.
    path_dict = path + (".char.dictionary" if char else ".dictionary")
    dictionary.save_as_text(path_dict)
    print("[nlppreprocess.create_dictionary] Saved the dictionary to %s" % path_dict)
def main(path_in, path_out):
    """Tokenize the corpus at ``path_in`` with NLTK and write the result
    to ``path_out``.
    """
    print("[nlppreprocess.tokenizer] Processing ...")
    sentences = Tokenizer_with_nltk(utils.read_sentences(path_in))
    utils.write_sentences(sentences, path_out)
def main(path_in, path_out, prune_at, min_count):
    """Replace rare words in the corpus at ``path_in`` and write the
    result to ``path_out``.

    :param prune_at: vocabulary pruning threshold passed to ReplaceRareWords
    :param min_count: minimum frequency for a word to be kept
    """
    print("[nlppreprocess.replace_rare_words] Processing ...")
    sentences = ReplaceRareWords(utils.read_sentences(path_in), prune_at, min_count)
    # NOTE(review): count_UNK_rate iterates over ``sentences`` before they
    # are written out below; this assumes ReplaceRareWords is a restartable
    # iterable (not a one-shot generator) — confirm, otherwise
    # write_sentences would receive an exhausted iterator.
    count_UNK_rate(sentences)
    utils.write_sentences(sentences, path_out)
def main(path_in, path_out):
    """Apply the ReplaceDigits transform to the corpus at ``path_in`` and
    write the result to ``path_out``.
    """
    print("[nlppreprocess.replace_digits] Processing ...")
    sentences = ReplaceDigits(utils.read_sentences(path_in))
    utils.write_sentences(sentences, path_out)
def main(path_in, path_out):
    """Apply the AppendEdgeOfSent transform to the corpus at ``path_in``
    and write the result to ``path_out``.
    """
    # NOTE(review): the log tag says "generate_counts" but the transform is
    # AppendEdgeOfSent — one of the two names looks stale; confirm which.
    print("[nlppreprocess.generate_counts] Processing ...")
    sentences = AppendEdgeOfSent(utils.read_sentences(path_in))
    utils.write_sentences(sentences, path_out)
def main(path_in, path_out):
    """Lowercase the corpus at ``path_in`` and write the result to
    ``path_out``.
    """
    print("[nlppreprocess.lowercase] Processing ...")
    sentences = Lowercase(utils.read_sentences(path_in))
    utils.write_sentences(sentences, path_out)
def main(path_in, path_out):
    """Append an end-of-sentence token to each sentence of the corpus at
    ``path_in`` and write the result to ``path_out``.
    """
    print("[nlppreprocess.append_eos] Processing ...")
    sentences = AppendEOS(utils.read_sentences(path_in))
    utils.write_sentences(sentences, path_out)