Exemple #1
0
def load_nmt_train_data(src, trg, SRC=None, TRG=None, cut_threshold=1):
    """Read a parallel corpus and hand it to ``load_train_data``.

    Each source/target line is stripped, lower-cased, split on whitespace
    and terminated with the end-of-sentence symbol of its vocabulary.
    Token frequencies are collected per side and forwarded together with
    the tokenized sentences.

    Args:
        src: Iterable of source-side sentences (strings).
        trg: Iterable of target-side sentences (strings); paired with
            ``src`` via ``zip``, so extra lines on the longer side are
            ignored.
        SRC: Source vocabulary. A new ``Vocabulary(unk=True, eos=True)``
            is created when omitted.
        TRG: Target vocabulary. A new ``Vocabulary(unk=True, eos=True)``
            is created when omitted.
        cut_threshold: Passed to ``load_train_data`` as both ``x_cut`` and
            ``y_cut``. NOTE(review): presumably a frequency cut-off below
            which words become unknown -- confirm in ``load_train_data``.

    Returns:
        A ``(SRC, TRG, data)`` tuple, where ``data`` is whatever
        ``load_train_data`` returns.
    """
    # Unknown-word replacement is requested only when the caller supplied
    # BOTH vocabularies, so decide this before the defaults are filled in.
    rep_unk = not (SRC is None or TRG is None)
    if SRC is None:
        SRC = Vocabulary(unk=True, eos=True)
    if TRG is None:
        TRG = Vocabulary(unk=True, eos=True)

    src_count = defaultdict(int)
    trg_count = defaultdict(int)
    corpus = []

    # Tokenize every sentence pair and accumulate per-side token counts.
    for src_sent, trg_sent in zip(src, trg):
        src_tokens = src_sent.strip().lower().split()
        src_tokens.append(SRC.eos())
        trg_tokens = trg_sent.strip().lower().split()
        trg_tokens.append(TRG.eos())

        for token in src_tokens:
            src_count[token] += 1
        for token in trg_tokens:
            trg_count[token] += 1

        corpus.append((src_tokens, trg_tokens))

    # Conversion of the tokenized corpus is delegated to load_train_data.
    converted = load_train_data(
        corpus,
        SRC,
        TRG,
        src_count=src_count,
        trg_count=trg_count,
        x_cut=cut_threshold,
        y_cut=cut_threshold,
        replace_unknown=rep_unk,
    )

    return SRC, TRG, converted
Exemple #2
0
    def test_read_train(self):
        """Check vocabularies and id sequences produced from POS-tagged text."""
        sentences = ["I_NNP am_VBZ Philip_NNP", "I_NNP am_VBZ student_NN"]
        X, Y, data = load_pos_train_data(sentences)
        data = list(data)

        # Expected vocabularies. The bare index expressions are evaluated
        # only for their effect on the vocabulary (presumably registering
        # the key), in the same order the words/tags are listed.
        x_exp = Vocabulary()
        y_exp = Vocabulary(unk=False)
        for word in ("I", "am"):
            x_exp[word]
        for tag in ("NNP", "VBZ", "NNP", "NN"):
            y_exp[tag]

        self.assertVocEqual(X, x_exp)
        self.assertVocEqual(Y, y_exp)

        # Expected data: the last word of each sentence is expected to come
        # back as the unknown id.
        word_exp = [
            [x_exp["I"], x_exp["am"], x_exp.unk_id()],
            [x_exp["I"], x_exp["am"], x_exp.unk_id()],
        ]
        label_exp = [
            [y_exp[tag] for tag in tags]
            for tags in (("NNP", "VBZ", "NNP"), ("NNP", "VBZ", "NN"))
        ]
        data_exp = list(zip(word_exp, label_exp))

        self.assertEqual(data, data_exp)
Exemple #3
0
    def test_read_test(self):
        """Check that an unregistered word is expected as the unknown id."""
        sentences = ["I live in Japan"]

        # Register every word except "Japan"; the index expressions are
        # evaluated only for their effect on the vocabulary.
        X = Vocabulary()
        for word in ("I", "live", "in"):
            X[word]

        loaded = list(load_pos_test_data(sentences, X))
        first_words = loaded[0][0]

        expected = [X["I"], X["live"], X["in"], X.unk_id()]
        self.assertEqual(first_words, expected)
Exemple #4
0
    def test_NMT_3_read_train(self):
        """Check vocabularies and id sequences built from a parallel corpus."""
        src = ["I am Philip", "I am a student"]
        trg = ["私 は フィリップ です", "私 は 学生 です"]
        SRC, TRG, data = load_nmt_train_data(src, trg, cut_threshold=1)

        # Expected vocabularies hold only the words that occur in both
        # sentences of their side; indexing is done for its effect on the
        # vocabulary.
        x_exp = Vocabulary(unk=True, eos=True)
        y_exp = Vocabulary(unk=True, eos=True)
        for word in ("i", "am"):
            x_exp[word]
        for word in ("私", "は", "です"):
            y_exp[word]

        def encode(voc, tokens):
            # None marks a position expected to be the unknown id; every
            # sentence is expected to end with the end-of-sentence id.
            ids = [voc.unk_id() if tok is None else voc[tok] for tok in tokens]
            ids.append(voc.eos_id())
            return ids

        x_data_exp = [
            encode(x_exp, ["i", "am", None]),
            encode(x_exp, ["i", "am", None, None]),
        ]
        y_data_exp = [
            encode(y_exp, ["私", "は", None, "です"]),
            encode(y_exp, ["私", "は", None, "です"]),
        ]
        data_exp = list(zip(x_data_exp, y_data_exp))

        self.assertVocEqual(SRC, x_exp)
        self.assertVocEqual(TRG, y_exp)
        self.assertEqual(data, data_exp)
Exemple #5
0
    def test_read_train(self):
        """Check the vocabulary and (word, next word) id pairs for LM data."""
        sentences = ["I am Philip", "I am student"]
        X, data = load_lm_data(sentences, cut_threshold=1)

        # Expected vocabulary; indexing is done for its effect on it.
        x_exp = Vocabulary()
        for word in ("<s>", "</s>", "i", "am"):
            x_exp[word]

        # Inputs start with "<s>"; the final word of each sentence is
        # expected as the unknown id.
        word_exp = [
            [x_exp["<s>"], x_exp["i"], x_exp["am"], x_exp.unk_id()],
            [x_exp["<s>"], x_exp["i"], x_exp["am"], x_exp.unk_id()],
        ]
        # Targets are the inputs shifted by one position, ending in "</s>".
        next_word_exp = [
            [x_exp["i"], x_exp["am"], x_exp.unk_id(), x_exp["</s>"]],
            [x_exp["i"], x_exp["am"], x_exp.unk_id(), x_exp["</s>"]],
        ]
        data_exp = [(words, nexts) for words, nexts in zip(word_exp, next_word_exp)]

        self.assertVocEqual(X, x_exp)
        self.assertEqual(data, data_exp)
Exemple #6
0
 def _load_vocabulary(fp):
     """Load one vocabulary from ``fp`` and return it twice.

     Both elements of the returned pair refer to the same object obtained
     from ``Vocabulary.load(fp)``.
     """
     vocabulary = Vocabulary.load(fp)
     return vocabulary, vocabulary