def test_NMT_3_read_train(self):
    """With cut_threshold=1, words seen only once are replaced by UNK on both sides.

    Only "i"/"am" (source) and the particles/copula (target) occur twice
    across the toy corpus, so every other token must map to unk_id(), and
    every sentence must be terminated with eos_id().
    """
    source_sents = ["I am Philip", "I am a student"]
    target_sents = ["私 は フィリップ です", "私 は 学生 です"]
    SRC, TRG, data = load_nmt_train_data(source_sents, target_sents, cut_threshold=1)

    # Expected vocabularies hold only the words that survive the frequency cut.
    expected_src = Vocabulary(unk=True, eos=True)
    expected_trg = Vocabulary(unk=True, eos=True)
    for word in ["i", "am"]:
        expected_src[word]
    for word in ["私", "は", "です"]:
        expected_trg[word]

    # "philip", "a", "student" are singletons -> UNK; each sentence ends with EOS.
    expected_src_ids = [
        [expected_src["i"], expected_src["am"], expected_src.unk_id(),
         expected_src.eos_id()],
        [expected_src["i"], expected_src["am"], expected_src.unk_id(),
         expected_src.unk_id(), expected_src.eos_id()],
    ]
    # "フィリップ" and "学生" are singletons -> UNK on the target side.
    expected_trg_ids = [
        [expected_trg["私"], expected_trg["は"], expected_trg.unk_id(),
         expected_trg["です"], expected_trg.eos_id()],
        [expected_trg["私"], expected_trg["は"], expected_trg.unk_id(),
         expected_trg["です"], expected_trg.eos_id()],
    ]
    expected_pairs = list(zip(expected_src_ids, expected_trg_ids))

    self.assertVocEqual(SRC, expected_src)
    self.assertVocEqual(TRG, expected_trg)
    self.assertEqual(data, expected_pairs)
# NOTE(review): this `raise` is the body of a conditional whose test lies before
# this chunk (presumably `if args.dict and not <dict-attn flag>:`) — original
# indentation was lost; confirm against the full file before relying on layout.
raise ValueError("When not using dict attn, you do not need to specify the dictionary.")

# CPU mode is expressed as gpu id -1 downstream.
if args.use_cpu:
    args.gpu = -1
# When snapshotting models, keep only the most recent one.
if args.save_models:
    args.save_len = 1

""" Training """
trainer = ParallelTrainer(args.seed, args.gpu)

# data
UF.trace("Loading corpus + dictionary")
with open(args.src) as src_fp:
    with open(args.trg) as trg_fp:
        # Build source/target vocabularies and the numberized parallel corpus;
        # words below args.unk_cut occurrences become UNK.
        SRC, TRG, train_data = load_nmt_train_data(src_fp, trg_fp, cut_threshold=args.unk_cut)
        # Materialize batches up front (batch_generator is presumably lazy).
        train_data = list(batch_generator(train_data, (SRC, TRG), args.batch))
        UF.trace("SRC size:", len(SRC))
        UF.trace("TRG size:", len(TRG))
        UF.trace("Data loaded.")

# dev data — optional; both sides must be given for it to be used.
dev_data = None
if args.src_dev and args.trg_dev:
    with open(args.src_dev) as src_fp:
        with open(args.trg_dev) as trg_fp:
            UF.trace("Loading dev data")
            # Reuse the training vocabularies so dev ids are consistent.
            _, _, dev_data = load_nmt_train_data(src_fp, trg_fp, SRC, TRG)
            dev_data = list(batch_generator(dev_data, (SRC, TRG), args.batch))
            UF.trace("Dev data loaded")
def setUp(self):
    """Prepare a tiny parallel corpus, an Attentional model, and batches of size 1.

    cut_threshold=0 keeps every word in both vocabularies, so no token is
    mapped to UNK in the fixture data.
    """
    source_sents = ["I am Philip .", "I am a student ."]
    target_sents = ["私 は フィリップ です .", "私 は 学生 です ."]
    SRC, TRG, corpus = load_nmt_train_data(source_sents, target_sents, cut_threshold=0)
    self.model = Attentional(SRC, TRG, Args(SRC, TRG))
    self.data = batch_generator(corpus, (SRC, TRG), 1)