def load_nmt_train_data(src, trg, SRC=None, TRG=None, cut_threshold=1):
    """Tokenize a parallel corpus and hand it to ``load_train_data``.

    Each source/target line is stripped, lower-cased, split on whitespace
    and terminated with the corresponding vocabulary's EOS token. Token
    frequencies are collected per side and forwarded together with the
    tokenized pairs.

    Args:
        src: Iterable of source-side sentence strings.
        trg: Iterable of target-side sentence strings, paired with ``src``
            by position (``zip`` stops at the shorter of the two).
        SRC: Existing source vocabulary; a new
            ``Vocabulary(unk=True, eos=True)`` is created when ``None``.
        TRG: Existing target vocabulary; same default as ``SRC``.
        cut_threshold: Forwarded to ``load_train_data`` as both ``x_cut``
            and ``y_cut``.

    Returns:
        A ``(SRC, TRG, data)`` tuple, where ``data`` is whatever
        ``load_train_data`` returns for the tokenized pairs.
    """
    # Unknown replacement is requested only when the caller supplied BOTH
    # vocabularies; this must be decided before the defaults are filled in.
    rep_unk = SRC is not None and TRG is not None
    if SRC is None:
        SRC = Vocabulary(unk=True, eos=True)
    if TRG is None:
        TRG = Vocabulary(unk=True, eos=True)

    src_count = defaultdict(int)
    trg_count = defaultdict(int)
    data = []

    # Reading in data
    for src_line, trg_line in zip(src, trg):
        src_words = src_line.strip().lower().split() + [SRC.eos()]
        trg_words = trg_line.strip().lower().split() + [TRG.eos()]
        for word in src_words:
            src_count[word] += 1
        for word in trg_words:
            trg_count[word] += 1
        data.append((src_words, trg_words))

    # Data generator
    data = load_train_data(
        data, SRC, TRG,
        src_count=src_count, trg_count=trg_count,
        x_cut=cut_threshold, y_cut=cut_threshold,
        replace_unknown=rep_unk,
    )
    return SRC, TRG, data
def test_read_train(self):
    """Check the vocabularies and id sequences from ``load_pos_train_data``.

    The expected data maps "Philip" and "student" to the unknown id on the
    word side, while every tag keeps its own id.
    """
    corpus = ["I_NNP am_VBZ Philip_NNP", "I_NNP am_VBZ student_NN"]
    X, Y, data = load_pos_train_data(corpus)
    data = list(data)

    # Check Vocabulary
    # NOTE(review): indexing a Vocabulary appears to register the entry as a
    # side effect -- the lookups below are kept in the original order.
    x_exp = Vocabulary()
    y_exp = Vocabulary(unk=False)
    for word in ("I", "am"):
        x_exp[word]
    for tag in ("NNP", "VBZ", "NNP", "NN"):
        y_exp[tag]
    self.assertVocEqual(X, x_exp)
    self.assertVocEqual(Y, y_exp)

    # Check data
    word_exp = [
        [x_exp["I"], x_exp["am"], x_exp.unk_id()]
        for _ in range(2)
    ]
    label_exp = [
        [y_exp[tag] for tag in tags]
        for tags in (("NNP", "VBZ", "NNP"), ("NNP", "VBZ", "NN"))
    ]
    data_exp = list(zip(word_exp, label_exp))
    self.assertEqual(data, data_exp)
def test_read_test(self):
    """Check that ``load_pos_test_data`` encodes a sentence with a given vocabulary.

    "Japan" is not placed in the vocabulary beforehand, so the expected
    sequence ends with the unknown id.
    """
    sentences = ["I live in Japan"]
    known_words = ("I", "live", "in")

    X = Vocabulary()
    # NOTE(review): indexing appears to register each word -- confirm
    # against Vocabulary.
    for word in known_words:
        X[word]

    # Materialize everything, then take the first field of the first item.
    loaded = list(load_pos_test_data(sentences, X))
    data = loaded[0][0]

    data_exp = [X[word] for word in known_words]
    data_exp.append(X.unk_id())
    self.assertEqual(data, data_exp)
def test_NMT_3_read_train(self):
    """Check the vocabularies and encoded pairs from ``load_nmt_train_data``.

    With ``cut_threshold=1`` the expected output keeps only the words that
    occur in both sentences; the remaining words are expected as the
    unknown id, and every sequence ends with the EOS id.
    """
    src = ["I am Philip", "I am a student"]
    trg = ["私 は フィリップ です", "私 は 学生 です"]
    SRC, TRG, data = load_nmt_train_data(src, trg, cut_threshold=1)

    # Expected vocabularies.
    # NOTE(review): indexing appears to register each word -- confirm
    # against Vocabulary.
    x_exp = Vocabulary(unk=True, eos=True)
    y_exp = Vocabulary(unk=True, eos=True)
    for w in "i am".split():
        x_exp[w]
    for w in "私 は です".split():
        y_exp[w]

    # Source side: the two sentences differ only in the number of unknowns.
    x_i, x_am = x_exp["i"], x_exp["am"]
    x_unk, x_eos = x_exp.unk_id(), x_exp.eos_id()
    x_data_exp = [
        [x_i, x_am, x_unk, x_eos],
        [x_i, x_am, x_unk, x_unk, x_eos],
    ]

    # Target side: both sentences encode to the same id sequence.
    y_row = [y_exp["私"], y_exp["は"], y_exp.unk_id(), y_exp["です"], y_exp.eos_id()]
    y_data_exp = [list(y_row), list(y_row)]

    data_exp = list(zip(x_data_exp, y_data_exp))
    self.assertVocEqual(SRC, x_exp)
    self.assertVocEqual(TRG, y_exp)
    self.assertEqual(data, data_exp)
def test_read_train(self):
    """Check the vocabulary and (word, next word) pairs from ``load_lm_data``.

    Each expected input sequence starts with ``<s>`` and each expected
    output sequence is the same sentence shifted by one, ending in
    ``</s>``. "Philip" and "student" are expected as the unknown id.
    """
    train = ["I am Philip", "I am student"]
    X, data = load_lm_data(train, cut_threshold=1)

    # NOTE(review): indexing appears to register each token -- confirm
    # against Vocabulary.
    x_exp = Vocabulary()
    for w in "<s> </s> i am".split():
        x_exp[w]

    bos_id, eos_id = x_exp["<s>"], x_exp["</s>"]
    i_id, am_id = x_exp["i"], x_exp["am"]
    unk = x_exp.unk_id()

    # Both sentences encode to the same id sequences.
    current_row = [bos_id, i_id, am_id, unk]
    next_row = [i_id, am_id, unk, eos_id]
    word_exp = [list(current_row) for _ in range(2)]
    next_word_exp = [list(next_row) for _ in range(2)]

    data_exp = list(zip(word_exp, next_word_exp))
    self.assertVocEqual(X, x_exp)
    self.assertEqual(data, data_exp)
def _load_vocabulary(fp):
    """Load one vocabulary from ``fp`` and return it as a pair.

    The same ``Vocabulary`` object is returned in both positions, so the
    two returned names refer to a single shared instance.
    """
    vocab = Vocabulary.load(fp)
    return vocab, vocab