def get_vocab(vocab_file_eng="vocab_eng.json", vocab_file_fra="vocab_fra.json"):
    """Return the English and French vocabularies, building them if needed.

    If either cache file is missing, both vocabularies are (re)built via
    ``create_vocab``; otherwise they are loaded from their JSON caches.

    Args:
        vocab_file_eng: Path of the English vocab JSON cache (new optional
            parameter; defaults to the previously hard-coded filename).
        vocab_file_fra: Path of the French vocab JSON cache.

    Returns:
        Tuple ``(vocab_eng, vocab_fra)`` of ``Vocab`` instances.
    """
    # Rebuild both when either cache is absent so the two stay in sync.
    if not os.path.exists(vocab_file_eng) or not os.path.exists(vocab_file_fra):
        return create_vocab(vocab_file_eng, vocab_file_fra)

    logger.info('Loading vocab.')
    vocab_eng = Vocab.from_json(vocab_file_eng)
    vocab_fra = Vocab.from_json(vocab_file_fra)
    logger.info(vocab_eng)
    logger.info(vocab_fra)
    return vocab_eng, vocab_fra
def create_vocab(vocab_file_eng):
    """Build an English ``Vocab`` from the formatted Cornell movie-dialogs
    corpus and cache it as JSON.

    Fix over the original: when the cache file already existed, the original
    skipped the build but still executed ``return vocab_eng`` with the name
    never bound (UnboundLocalError / undefined result). The cached vocab is
    now loaded and returned in that case.

    Args:
        vocab_file_eng: Path of the vocab JSON cache.

    Returns:
        The newly built (or cache-loaded) ``Vocab`` instance.
    """
    if os.path.exists(vocab_file_eng):
        # Cache hit: reuse the previously stored vocabulary.
        logger.info('Loading vocab.')
        return Vocab.from_json(vocab_file_eng)

    vocab_eng = Vocab("eng")
    logger.info('Creating vocab.')
    with open(
            "./data/cornell movie-dialogs corpus/formatted_movie_lines.txt",
            "r", encoding="utf-8") as f:
        print("Create Vocab")
        # readlines() materializes the file so tqdm can show a total count.
        for line in tqdm(f.readlines()):
            line = line.split('\t')
            line = normalizePair(line)
            # Skip pairs rejected by the project's filter — presumably
            # length/complexity based; see pair_is_simple.
            if not pair_is_simple(line):
                continue
            vocab_eng.add_sentence(line[0], to_lower=True, remove_punc=False)
            vocab_eng.add_sentence(line[1], to_lower=True, remove_punc=False)
    logger.info(vocab_eng)
    logger.info('Storing vocab.')
    vocab_eng.to_json(vocab_file_eng)
    return vocab_eng
def create_vocab(vocab_file_eng, vocab_file_fra):
    """Build English and French ``Vocab`` objects from the eng-fra parallel
    corpus and cache both as JSON.

    Fix over the original: when both cache files already existed, the
    original skipped the build but still executed
    ``return vocab_eng, vocab_fra`` with neither name bound
    (UnboundLocalError / undefined result). Both cached vocabs are now
    loaded and returned in that case.

    Args:
        vocab_file_eng: Path of the English vocab JSON cache.
        vocab_file_fra: Path of the French vocab JSON cache.

    Returns:
        Tuple ``(vocab_eng, vocab_fra)``.
    """
    if os.path.exists(vocab_file_eng) and os.path.exists(vocab_file_fra):
        # Both caches present: reuse them instead of falling through.
        logger.info('Loading vocab.')
        return Vocab.from_json(vocab_file_eng), Vocab.from_json(vocab_file_fra)

    vocab_eng = Vocab("eng")
    vocab_fra = Vocab("fra")
    logger.info('Creating vocab.')
    with open("./data/eng-fra.txt", "r", encoding="utf-8") as f:
        for line in f:
            # Each line is "<english>\t<french>".
            line = line.split('\t')
            line = normalizePair(line)
            # Skip pairs rejected by the project's filter — presumably
            # length/complexity based; see pair_is_simple.
            if not pair_is_simple(line):
                continue
            vocab_eng.add_sentence(line[0], to_lower=True, remove_punc=True)
            vocab_fra.add_sentence(line[1], to_lower=True, remove_punc=True)
    logger.info(vocab_eng)
    logger.info(vocab_fra)
    logger.info('Storing vocab.')
    vocab_eng.to_json(vocab_file_eng)
    vocab_fra.to_json(vocab_file_fra)
    return vocab_eng, vocab_fra
# Evaluation setup: read the JSON config, require the cached vocabs and
# validation data to exist, then optionally note a checkpoint to load.
config_file = args.config
with open(config_file, "r", encoding="utf-8") as f:
    config = json.load(f)
logger.info("Use GPU: {}.".format(use_gpu))
logger.info("Configurations:\n{}".format(str(config)))

vocab_file_eng = "vocab_eng.json"
vocab_file_fra = "vocab_fra.json"
# Fail fast if either cached vocab is missing (English is checked first,
# matching the original raise order).
for _vocab_path in (vocab_file_eng, vocab_file_fra):
    if not os.path.exists(_vocab_path):
        raise FileNotFoundError(_vocab_path)
logger.info('Loading vocab.')
vocab_eng = Vocab.from_json(vocab_file_eng)
vocab_fra = Vocab.from_json(vocab_file_fra)
logger.info(vocab_eng)
logger.info(vocab_fra)

logger.info('Preparing data.')
val_data_path = 'val_data.json'
if not os.path.exists(val_data_path):
    raise FileNotFoundError(val_data_path)
with open(val_data_path, "r", encoding="utf-8") as f:
    val_data = json.load(f)

# Load model if a args.checkpoint is provided
if args.checkpoint is not None:
    logger.info('Loading checkpoint file [{}].'.format(args.checkpoint))
# Script setup: parse CLI args, pick CPU/GPU, load the JSON config, and
# load the pre-built vocabulary used by prepare_data below.
parser = argparse.ArgumentParser()
parser.add_argument("--config", default="./config.json", type=str)
parser.add_argument("--checkpoint", default=None, type=str)
args = parser.parse_args()
use_gpu = torch.cuda.is_available()
device = torch.device("cuda" if use_gpu else "cpu")
config_file = args.config
with open(config_file) as f:
    config = json.load(f)
logger.info("Use GPU: {}.".format(use_gpu))
logger.info("Configurations:\n{}".format(str(config)))
my_vocab = Vocab.from_json("my_vocab.json")


def prepare_data(data_path, vocab):
    """Read a labeled text file and encode each example with *vocab*.

    Each line is expected to be "<integer label> <text>" — split once so
    the text itself may contain spaces. Text is lowercased before index
    lookup.
    """
    data_ids = []
    with open(data_path, "r", encoding="utf-8") as f:
        print("Prepare Data")
        for line in f:
            line = line.split(' ', 1)
            tgt, inp = line[0], line[1]
            tgt = int(tgt)
            inp = inp.lower()
            data_ids.append({
                "input": vocab.indexes_from_sentence(inp, add_eos=False),
                "target":
                # NOTE(review): SOURCE is truncated here, mid dict literal —
                # the "target" value and the remainder of prepare_data are
                # not visible in this chunk; confirm against the full file.
# Setup for the IMDB binary sentiment task: load config, build (or load)
# a 50k-token vocabulary over all three data splits, then extract GloVe
# embeddings for it.
config_file = args.config
with open(config_file) as f:
    config = json.load(f)
logger.info("Use GPU: {}.".format(use_gpu))
logger.info("Configurations:\n{}".format(str(config)))

# All three splits feed the vocabulary so no split yields unseen tokens.
datalist = [
    "./imdb-binary_sentiment_classification-preprocessed_data/imdb.binary_sentiment_classification.train.txt",
    "./imdb-binary_sentiment_classification-preprocessed_data/imdb.binary_sentiment_classification.valid.txt",
    "./imdb-binary_sentiment_classification-preprocessed_data/imdb.binary_sentiment_classification.test.txt"
]
vocab_file = "my_vocab.json"
if not os.path.exists(vocab_file):
    my_vocab = Vocab("my_vocab")
    for data in datalist:
        with open(data, "r", encoding="utf-8") as f:
            for line in f:
                # Lines are "<label> <text>"; only the text feeds the vocab
                # (the label was previously bound to a local but never used).
                inp = line.split(' ', 1)[1]
                my_vocab.add_sentence(inp, to_lower=True, remove_punc=False)
    logger.info("vocab size: {}".format(len(my_vocab)))
    # Keep only the 50k most frequent tokens.
    my_vocab.keep_most_frequent_k(k=50000)
    # Use the variable instead of repeating the "my_vocab.json" literal.
    my_vocab.to_json(vocab_file)
else:
    logger.info('Loading vocab...')
    my_vocab = Vocab.from_json(vocab_file)
pretrain_embedding = my_vocab.extract_pretrain_embedding(
    "./glove.6B.100d.txt", 100)
# Evaluation setup: choose device, read the JSON config, require the cached
# English vocab and validation data, then optionally load a checkpoint.
use_gpu = torch.cuda.is_available()
device = torch.device("cuda" if use_gpu else "cpu")
config_file = args.config
with open(config_file, "r", encoding="utf-8") as f:
    config = json.load(f)
logger.info("Use GPU: {}.".format(use_gpu))
logger.info("Configurations:\n{}".format(str(config)))

vocab_file_eng = "vocab_eng.json"
if not os.path.exists(vocab_file_eng):
    raise FileNotFoundError(vocab_file_eng)
logger.info('Loading vocab.')
vocab_eng = Vocab.from_json(vocab_file_eng)
logger.info(vocab_eng)

logger.info('Preparing data.')
val_data_path = 'val_data.json'
if not os.path.exists(val_data_path):
    raise FileNotFoundError(val_data_path)
with open(val_data_path, "r", encoding="utf-8") as f:
    val_data = json.load(f)

# Load model if a args.checkpoint is provided
if args.checkpoint is not None:
    logger.info('Loading checkpoint file [{}].'.format(args.checkpoint))
    # If loading on same machine the model was trained on
    checkpoint = torch.load(args.checkpoint)