def test_NMT_3_read_train(self):
    """Check the vocabularies and id sequences from ``load_nmt_train_data``.

    The expectation encoded below (with ``cut_threshold=1``): only the
    lower-cased source words "i"/"am" and the target words "私"/"は"/"です"
    keep their own ids, every other token becomes the unknown id, and each
    sentence is terminated by the end-of-sentence id.
    """
    source_corpus = ["I am Philip", "I am a student"]
    target_corpus = ["私 は フィリップ です", "私 は 学生 です"]
    SRC, TRG, data = load_nmt_train_data(
        source_corpus, target_corpus, cut_threshold=1
    )

    # Expected vocabularies. The bare lookups are kept for their side effect
    # (presumably Vocabulary assigns an id on first access -- confirm).
    x_exp = Vocabulary(unk=True, eos=True)
    y_exp = Vocabulary(unk=True, eos=True)
    for src_word in "i am".split():
        x_exp[src_word]
    for trg_word in "私 は です".split():
        y_exp[trg_word]

    # Expected source-side id rows, one per sentence.
    first_src = [x_exp["i"], x_exp["am"], x_exp.unk_id(), x_exp.eos_id()]
    second_src = [
        x_exp["i"], x_exp["am"], x_exp.unk_id(), x_exp.unk_id(), x_exp.eos_id(),
    ]

    # Expected target-side id rows, one per sentence.
    first_trg = [
        y_exp["私"], y_exp["は"], y_exp.unk_id(), y_exp["です"], y_exp.eos_id(),
    ]
    second_trg = [
        y_exp["私"], y_exp["は"], y_exp.unk_id(), y_exp["です"], y_exp.eos_id(),
    ]

    # Pair each source row with its target row.
    data_exp = [(first_src, first_trg), (second_src, second_trg)]

    self.assertVocEqual(SRC, x_exp)
    self.assertVocEqual(TRG, y_exp)
    self.assertEqual(data, data_exp)
def test_read_train(self):
    """Check the vocabularies and (word ids, tag ids) pairs from ``load_pos_train_data``.

    The input sentences are ``word_TAG`` tokens. The expectation encoded
    below: "I" and "am" keep their own word ids, the remaining word of each
    sentence becomes the unknown id, and every tag gets an id from a label
    vocabulary built with ``unk=False``.
    """
    tagged_sentences = ["I_NNP am_VBZ Philip_NNP", "I_NNP am_VBZ student_NN"]
    X, Y, data = load_pos_train_data(tagged_sentences)
    data = list(data)

    # Check Vocabulary
    x_exp = Vocabulary()
    y_exp = Vocabulary(unk=False)
    # Bare lookups are kept for their side effect (presumably Vocabulary
    # assigns an id on first access -- confirm); order matches the original.
    for word in ("I", "am"):
        x_exp[word]
    for tag in ("NNP", "VBZ", "NNP", "NN"):
        y_exp[tag]
    self.assertVocEqual(X, x_exp)
    self.assertVocEqual(Y, y_exp)

    # Check data
    first_words = [x_exp["I"], x_exp["am"], x_exp.unk_id()]
    second_words = [x_exp["I"], x_exp["am"], x_exp.unk_id()]
    first_labels = [y_exp["NNP"], y_exp["VBZ"], y_exp["NNP"]]
    second_labels = [y_exp["NNP"], y_exp["VBZ"], y_exp["NN"]]
    data_exp = [(first_words, first_labels), (second_words, second_labels)]
    self.assertEqual(data, data_exp)
def test_read_test(self):
    """Check that ``load_pos_test_data`` encodes a sentence with a given vocabulary.

    The vocabulary holds "I", "live" and "in"; the expectation encoded below
    is that the remaining word "Japan" comes back as the unknown id.
    """
    sentences = ["I live in Japan"]

    vocab = Vocabulary()
    # Bare lookups are kept for their side effect (presumably Vocabulary
    # assigns an id on first access -- confirm).
    for word in ("I", "live", "in"):
        vocab[word]

    # Only the first element of the first loaded item is compared.
    loaded = list(load_pos_test_data(sentences, vocab))
    first_item = loaded[0]
    data = first_item[0]

    data_exp = [vocab[word] for word in ("I", "live", "in")] + [vocab.unk_id()]
    self.assertEqual(data, data_exp)
def test_read_train(self):
    """Check the vocabulary and (input ids, next-word ids) pairs from ``load_lm_data``.

    The expectation encoded below (with ``cut_threshold=1``): the vocabulary
    holds "<s>", "</s>", "i" and "am"; each input row starts with "<s>", each
    next-word row is the same sentence shifted by one and ends with "</s>",
    and the last word of each sentence becomes the unknown id.
    """
    corpus = ["I am Philip", "I am student"]
    X, data = load_lm_data(corpus, cut_threshold=1)

    x_exp = Vocabulary()
    # Bare lookups are kept for their side effect (presumably Vocabulary
    # assigns an id on first access -- confirm).
    for token in "<s> </s> i am".split():
        x_exp[token]

    # Input rows, one per sentence.
    first_inputs = [x_exp["<s>"], x_exp["i"], x_exp["am"], x_exp.unk_id()]
    second_inputs = [x_exp["<s>"], x_exp["i"], x_exp["am"], x_exp.unk_id()]

    # Next-word rows, one per sentence.
    first_targets = [x_exp["i"], x_exp["am"], x_exp.unk_id(), x_exp["</s>"]]
    second_targets = [x_exp["i"], x_exp["am"], x_exp.unk_id(), x_exp["</s>"]]

    data_exp = [(first_inputs, first_targets), (second_inputs, second_targets)]

    self.assertVocEqual(X, x_exp)
    self.assertEqual(data, data_exp)