Example #1
0
    def test_NMT_3_read_train(self):
        """load_nmt_train_data with cut_threshold=1 should keep only words seen
        more than once, mapping the rest to UNK, and append EOS to every sentence."""
        src = ["I am Philip", "I am a student"]
        trg = ["私 は フィリップ です", "私 は 学生 です"]
        SRC, TRG, data = load_nmt_train_data(src, trg, cut_threshold=1)

        # Expected vocabularies: lookups register the surviving words.
        x_exp = Vocabulary(unk=True, eos=True)
        y_exp = Vocabulary(unk=True, eos=True)
        for word in ("i", "am"):
            x_exp[word]
        for word in ("私", "は", "です"):
            y_exp[word]

        # Expected id sequences ("philip"/"a"/"student" and "フィリップ"/"学生" become UNK).
        x_data_exp = [
            [x_exp["i"], x_exp["am"], x_exp.unk_id(), x_exp.eos_id()],
            [x_exp["i"], x_exp["am"], x_exp.unk_id(), x_exp.unk_id(), x_exp.eos_id()],
        ]
        y_data_exp = [
            [y_exp["私"], y_exp["は"], y_exp.unk_id(), y_exp["です"], y_exp.eos_id()],
            [y_exp["私"], y_exp["は"], y_exp.unk_id(), y_exp["です"], y_exp.eos_id()],
        ]
        data_exp = list(zip(x_data_exp, y_data_exp))

        self.assertVocEqual(SRC, x_exp)
        self.assertVocEqual(TRG, y_exp)
        self.assertEqual(data, data_exp)
Example #2
0
        raise ValueError("When not using dict attn, you do not need to specify the dictionary.")

if args.use_cpu:
    args.gpu = -1

if args.save_models:
    args.save_len = 1

# --- Training setup ---
trainer = ParallelTrainer(args.seed, args.gpu)

# Training data: read the parallel corpus, build vocabularies, and batch it.
UF.trace("Loading corpus + dictionary")
with open(args.src) as src_fp, open(args.trg) as trg_fp:
    SRC, TRG, train_data = load_nmt_train_data(src_fp, trg_fp, cut_threshold=args.unk_cut)
    train_data = list(batch_generator(train_data, (SRC, TRG), args.batch))
UF.trace("SRC size:", len(SRC))
UF.trace("TRG size:", len(TRG))
UF.trace("Data loaded.")

# Dev data (optional): loaded with the training vocabularies so ids line up.
dev_data = None
if args.src_dev and args.trg_dev:
    with open(args.src_dev) as src_fp, open(args.trg_dev) as trg_fp:
        UF.trace("Loading dev data")
        _, _, dev_data = load_nmt_train_data(src_fp, trg_fp, SRC, TRG)
        dev_data = list(batch_generator(dev_data, (SRC, TRG), args.batch))
        UF.trace("Dev data loaded")
 def setUp(self):
     """Build a tiny parallel corpus, an Attentional model, and a batch iterator."""
     source = ["I am Philip .", "I am a student ."]
     target = ["私 は フィリップ です .", "私 は 学生 です ."]
     # cut_threshold=0 keeps every word in both vocabularies.
     src_voc, trg_voc, corpus = load_nmt_train_data(source, target, cut_threshold=0)
     self.model = Attentional(src_voc, trg_voc, Args(src_voc, trg_voc))
     self.data = batch_generator(corpus, (src_voc, trg_voc), 1)