def create_vocab(vocab_file_eng, vocab_file_fra):
    """Build English/French vocabularies from ./data/eng-fra.txt, or load cached ones.

    Args:
        vocab_file_eng: path of the JSON cache for the English vocab.
        vocab_file_fra: path of the JSON cache for the French vocab.

    Returns:
        (vocab_eng, vocab_fra) as Vocab instances.

    BUG FIX: originally the vocabs were only bound inside the "files do not
    exist" branch, so calling this with both cache files present raised
    UnboundLocalError at the return. Now cached vocabs are loaded with
    Vocab.from_json, matching the caching pattern used elsewhere in this file.
    """
    if not os.path.exists(vocab_file_eng) or not os.path.exists(vocab_file_fra):
        vocab_eng = Vocab("eng")
        vocab_fra = Vocab("fra")
        logger.info('Creating vocab.')
        with open("./data/eng-fra.txt", "r", encoding="utf-8") as f:
            print("Create Vocab")
            for line in tqdm(f.readlines()):
                line = line.split('\t')
                line = normalizePair(line)
                # Drop pairs rejected by the filtering heuristic.
                if not pair_is_simple(line):
                    continue
                vocab_eng.add_sentence(line[0], to_lower=True, remove_punc=True)
                vocab_fra.add_sentence(line[1], to_lower=True, remove_punc=True)
        logger.info(vocab_eng)
        logger.info(vocab_fra)
        logger.info('Storing vocab.')
        vocab_eng.to_json(vocab_file_eng)
        vocab_fra.to_json(vocab_file_fra)
    else:
        # Both caches exist: load them instead of returning unbound locals.
        logger.info('Loading vocab...')
        vocab_eng = Vocab.from_json(vocab_file_eng)
        vocab_fra = Vocab.from_json(vocab_file_fra)
    return vocab_eng, vocab_fra
def create_vocab(vocab_file_eng):
    """Build a single English vocabulary from the Cornell movie-dialogs corpus,
    or load the cached one.

    NOTE(review): this redefines `create_vocab` from earlier in the file with a
    different arity — the two variants likely belong to different experiments;
    consider renaming one. Name kept here to avoid breaking callers.

    Args:
        vocab_file_eng: path of the JSON cache for the vocab.

    Returns:
        vocab_eng as a Vocab instance.

    BUG FIX: originally `vocab_eng` was only bound inside the "file does not
    exist" branch, so calling this with the cache present raised
    UnboundLocalError at the return. Now the cached vocab is loaded with
    Vocab.from_json, matching the caching pattern used elsewhere in this file.
    """
    if not os.path.exists(vocab_file_eng):
        vocab_eng = Vocab("eng")
        logger.info('Creating vocab.')
        with open(
                "./data/cornell movie-dialogs corpus/formatted_movie_lines.txt",
                "r", encoding="utf-8") as f:
            print("Create Vocab")
            for line in tqdm(f.readlines()):
                line = line.split('\t')
                line = normalizePair(line)
                # Drop pairs rejected by the filtering heuristic.
                if not pair_is_simple(line):
                    continue
                # Both sides of the pair feed the same (English) vocab.
                vocab_eng.add_sentence(line[0], to_lower=True, remove_punc=False)
                vocab_eng.add_sentence(line[1], to_lower=True, remove_punc=False)
        logger.info(vocab_eng)
        logger.info('Storing vocab.')
        vocab_eng.to_json(vocab_file_eng)
    else:
        # Cache exists: load it instead of returning an unbound local.
        logger.info('Loading vocab...')
        vocab_eng = Vocab.from_json(vocab_file_eng)
    return vocab_eng
# IMDB binary-sentiment pipeline: build (or load) a 50k-word vocab over the
# train/valid/test splits, then extract a GloVe-100d embedding matrix for it.
# NOTE(review): this block was whitespace-mangled; nesting below is the
# reconstruction consistent with the if/else and loop structure — confirm
# against the original file.
datalist = [
    "./imdb-binary_sentiment_classification-preprocessed_data/imdb.binary_sentiment_classification.train.txt",
    "./imdb-binary_sentiment_classification-preprocessed_data/imdb.binary_sentiment_classification.valid.txt",
    "./imdb-binary_sentiment_classification-preprocessed_data/imdb.binary_sentiment_classification.test.txt"
]
vocab_file = "my_vocab.json"
if not os.path.exists(vocab_file):
    my_vocab = Vocab("my_vocab")
    for data in datalist:
        with open(data, "r", encoding="utf-8") as f:
            for line in f:
                # Each line is "<label> <text>"; split once on the first space.
                line = line.split(' ', 1)
                tgt, inp = line[0], line[1]
                # Only the text side contributes to the vocab; labels excluded.
                my_vocab.add_sentence(inp, to_lower=True, remove_punc=False)
    logger.info("vocab size: {}".format(len(my_vocab)))
    # Cap the vocab at the 50k most frequent tokens before caching.
    my_vocab.keep_most_frequent_k(k=50000)
    my_vocab.to_json("my_vocab.json")
else:
    logger.info('Loading vocab...')
    my_vocab = Vocab.from_json(vocab_file)
# Build an embedding matrix from pretrained 100-dim GloVe vectors restricted
# to the vocab's tokens.
pretrain_embedding = my_vocab.extract_pretrain_embedding(
    "./glove.6B.100d.txt", 100)
def prepare_data(data_path, vocab):
    # NOTE(review): definition is truncated in this view — body continues past
    # the visible source; do not assume its behavior beyond these lines.
    data_ids = []
    with open(data_path, "r", encoding="utf-8") as f:
        print("Prepare Data")