Esempio n. 1
0
    def test_train(self):
        """Run the trainer on a 70/30 split of ``dataset.tsv``.

        The preprocessor is built from the complete dataset rather than from
        the training portion alone, then saved next to the model file path
        that is handed to the trainer.
        """
        net_config = ModelConfig()
        fit_config = TrainingConfig()

        dataset_path = os.path.join(DATA_ROOT, 'dataset.tsv')
        sentences, tags = load_data_and_labels(dataset_path)

        # Fixed random_state keeps the split reproducible between runs.
        from sklearn.model_selection import train_test_split
        splits = train_test_split(sentences, tags,
                                  test_size=0.3, random_state=42)
        x_train, x_valid, y_train, y_valid = splits

        # Built from all sentences/tags, not only from x_train/y_train.
        preprocessor = prepare_preprocessor(sentences, tags)
        preprocessor.save(os.path.join(SAVE_ROOT, 'preprocessor.pkl'))
        weights_path = os.path.join(SAVE_ROOT, 'model.h5')

        ner_model = CharNER(net_config,
                            preprocessor.vocab_size(),
                            preprocessor.tag_size())

        trainer = namaco.Trainer(
            ner_model,
            ner_model.loss,
            fit_config,
            log_dir=LOG_ROOT,
            save_path=weights_path,
            preprocessor=preprocessor,
        )
        trainer.train(x_train, y_train, x_valid, y_valid)
Esempio n. 2
0
    def test_predict(self):
        """Build a CharNER model from the fixture data and call ``predict``.

        Nothing is asserted about the prediction itself; the test passes as
        long as the calls complete without raising.
        """
        sentences, tags = load_data_and_labels(self.filename)
        preprocessor = prepare_preprocessor(sentences, tags)
        self.model_config.vocab_size = len(preprocessor.vocab_char)

        ner_model = CharNER(
            self.model_config,
            preprocessor.vocab_size(),
            preprocessor.tag_size(),
        )
        encoded = preprocessor.transform(sentences)
        ner_model.predict(encoded)
 def test_unknown_word(self):
     """Check that ``'$unknownword$'`` is encoded as ``vocab_char[UNK]``.

     The token is presumably absent from the fixture file, so the first id
     of the transformed input is expected to be the UNK entry.
     """
     sentences, tags = reader.load_data_and_labels(self.filename)
     fitted = Preprocessor(padding=False, return_lengths=False).fit(sentences, tags)

     encoded = fitted.transform([['$unknownword$']])
     first_id = encoded[0][0]
     self.assertEqual(first_id, fitted.vocab_char[UNK])
Esempio n. 4
0
 def test_batch_iter(self):
     """Pull ``steps`` batches from ``batch_iter`` and expect no error."""
     sentences, tags = load_data_and_labels(self.filename)
     preprocessor = prepare_preprocessor(sentences, tags)

     # Batch size of 32 is passed positionally, as the third argument.
     n_steps, generator = batch_iter(sentences, tags, 32,
                                     preprocessor=preprocessor)
     for _step in range(n_steps):
         next(generator)
 def test_transform(self):
     """Without padding, the first char id and tag id are plain ``int``."""
     raw_x, raw_y = reader.load_data_and_labels(self.filename)
     fitted = Preprocessor(padding=False, return_lengths=False).fit(raw_x, raw_y)
     encoded_x, encoded_y = fitted.transform(raw_x, raw_y)

     # First element of the first sequence on each side.
     first_char_id = encoded_x[0][0]
     first_tag_id = encoded_y[0][0]
     for value in (first_char_id, first_tag_id):
         self.assertIsInstance(value, int)
 def test_fit(self):
     """Check the vocabularies produced by ``Preprocessor.fit``.

     PAD and UNK must be present in ``vocab_char``, PAD must be present in
     ``vocab_tag``, and every other ``vocab_char`` entry must have length 1.
     """
     X, y = reader.load_data_and_labels(self.filename)
     p = Preprocessor(padding=False)
     p = p.fit(X, y)

     # assertIn reports both operands on failure, unlike assertTrue(a in b).
     self.assertIn(PAD, p.vocab_char)
     self.assertIn(UNK, p.vocab_char)
     self.assertIn(PAD, p.vocab_tag)

     # Everything except the two special entries must be a single character.
     char_set = set(p.vocab_char) - {PAD, UNK}
     for ch in char_set:
         self.assertEqual(len(ch), 1,
                          'vocab_char entry is not a single character: %r' % (ch,))
Esempio n. 7
0
    def test_eval(self):
        """Run ``namaco.Evaluator`` on ``conll.txt`` with the saved artifacts.

        Both the model file and the pickled preprocessor are read from the
        ``models`` directory that sits next to this test file.
        """
        here = os.path.dirname(__file__)
        data_dir = os.path.join(here, 'data')
        save_dir = os.path.join(here, 'models')

        conll_file = os.path.join(data_dir, 'conll.txt')
        weights_file = os.path.join(save_dir, 'model.h5')

        x_test, y_test = load_data_and_labels(conll_file)
        preprocessor = Preprocessor.load(
            os.path.join(save_dir, 'preprocessor.pkl'))

        namaco.Evaluator(weights_file, preprocessor=preprocessor).eval(x_test, y_test)
    def test_return_lengths(self):
        """With ``return_lengths=True`` the input side is a (chars, lengths) pair.

        The first char id and tag id must be ``int``, and each reported
        length must equal the length of its sequence.
        """
        raw_x, raw_y = reader.load_data_and_labels(self.filename)
        fitted = Preprocessor(padding=False, return_lengths=True).fit(raw_x, raw_y)

        # Input side unpacks into the sequences and their reported lengths.
        (chars, lengths), tag_ids = fitted.transform(raw_x, raw_y)

        first_char_id = chars[0][0]
        first_tag_id = tag_ids[0][0]
        self.assertIsInstance(first_char_id, int)
        self.assertIsInstance(first_tag_id, int)

        for sequence, reported in zip(chars, lengths):
            self.assertEqual(len(sequence), reported)
Esempio n. 9
0
    def test_train(self):
        """Run the trainer using ``conll.txt`` for both training and validation.

        The preprocessor is built from the training data and saved under
        SAVE_ROOT before the trainer is started.
        """
        net_config = ModelConfig()
        fit_config = TrainingConfig()

        # The same file is loaded twice: once as train set, once as valid set.
        train_file = os.path.join(DATA_ROOT, 'conll.txt')
        valid_file = os.path.join(DATA_ROOT, 'conll.txt')
        x_train, y_train = load_data_and_labels(train_file)
        x_valid, y_valid = load_data_and_labels(valid_file)

        preprocessor = prepare_preprocessor(x_train, y_train)
        preprocessor.save(os.path.join(SAVE_ROOT, 'preprocessor.pkl'))
        weights_path = os.path.join(SAVE_ROOT, 'model.h5')

        ner_model = CharNER(net_config,
                            preprocessor.vocab_size(),
                            preprocessor.tag_size())

        trainer = namaco.Trainer(
            ner_model,
            ner_model.loss,
            fit_config,
            log_dir=LOG_ROOT,
            save_path=weights_path,
            preprocessor=preprocessor,
        )
        trainer.train(x_train, y_train, x_valid, y_valid)
Esempio n. 10
0
    def test_transform_with_padding(self):
        """With ``padding=True`` ids are ``np.int32`` and rows share one length."""
        raw_x, raw_y = reader.load_data_and_labels(self.filename)
        fitted = Preprocessor(padding=True, return_lengths=False).fit(raw_x, raw_y)
        padded_x, padded_y = fitted.transform(raw_x, raw_y)

        first_char_id = padded_x[0][0]
        first_tag_id = padded_y[0][0]
        self.assertIsInstance(first_char_id, np.int32)
        self.assertIsInstance(first_tag_id, np.int32)

        # Exactly one distinct length means every sequence has the same length.
        x_lengths = {len(row) for row in padded_x}
        y_lengths = {len(row) for row in padded_y}
        self.assertEqual(len(x_lengths), 1)
        self.assertEqual(len(y_lengths), 1)
Esempio n. 11
0
    def test_load(self):
        """Save a fitted preprocessor, reload it, and compare transforms.

        The original and the reloaded preprocessor must produce equal output
        for the same data. The pickle written to ``data/preprocessor.pkl``
        (next to this test file) is removed even when an assertion fails, so
        a failing run does not leave the file behind.
        """
        X, y = reader.load_data_and_labels(self.filename)
        p = Preprocessor()
        p.fit(X, y)
        filepath = os.path.join(os.path.dirname(__file__),
                                'data/preprocessor.pkl')
        try:
            p.save(filepath)
            self.assertTrue(os.path.exists(filepath))

            loaded_p = Preprocessor.load(filepath)
            x_test1, y_test1 = p.transform(X, y)
            x_test2, y_test2 = loaded_p.transform(X, y)
            # The first two components of the transformed input are compared
            # separately (labelled "word" and "char" by the original author).
            np.testing.assert_array_equal(x_test1[0], x_test2[0])
            np.testing.assert_array_equal(x_test1[1], x_test2[1])
            np.testing.assert_array_equal(y_test1, y_test2)
        finally:
            # Always clean up; the file may not exist if save() itself failed.
            if os.path.exists(filepath):
                os.remove(filepath)
Esempio n. 12
0
 def test_extract(self):
     """The loader must return as many label sequences as sentences."""
     sents, labels = load_data_and_labels(self.filename)
     n_sents = len(sents)
     n_labels = len(labels)
     self.assertTrue(n_sents == n_labels)