예제 #1
0
 def test_save(self):
     """Saving a preprocessor creates a file at the requested path.

     The file is written to ``data/preprocessor.pkl`` next to this test
     module and is removed again afterwards, whether or not the test passes.
     """
     preprocessor = WordPreprocessor()
     filepath = os.path.join(os.path.dirname(__file__), 'data/preprocessor.pkl')
     try:
         preprocessor.save(filepath)
         self.assertTrue(os.path.exists(filepath))
     finally:
         # Clean up unconditionally so a failing save/assertion cannot leave
         # a stale artifact on disk for later test runs.
         if os.path.exists(filepath):
             os.remove(filepath)
예제 #2
0
 def test_unknown_word(self):
     """Smoke test: transform tokens meant to be out-of-vocabulary.

     There are no assertions; the test passes as long as ``transform``
     returns a pair without raising.
     """
     train_sents, train_tags = reader.load_data_and_labels(self.filename)
     fitted = WordPreprocessor(padding=False).fit(train_sents, train_tags)
     # presumably neither token occurs in the fixture file -- TODO confirm
     probe_sents = [['$unknownword$', 'あ']]
     probe_tags = [['O', 'O']]
     _encoded_sents, _encoded_tags = fitted.transform(probe_sents, probe_tags)
예제 #3
0
 def test_calc_sequence_lengths(self):
     """Compute per-sequence argmin positions from padded label output.

     No assertions are made; the test only checks that the computation runs.
     """
     sentences, labels = reader.load_data_and_labels(self.filename)
     fitted = WordPreprocessor(padding=True).fit(sentences, labels)
     _, encoded_labels = fitted.transform(sentences, labels)
     # Reduce the last axis to the index of its maximum, as int32.
     label_ids = np.argmax(encoded_labels, -1).astype(np.int32)
     # NOTE(review): presumably padding maps to index 0, so the first
     # minimum marks the sequence length -- confirm against the preprocessor.
     sequence_lengths = np.argmin(label_ids, -1)
예제 #4
0
 def test_transform_with_padding(self):
     """Padded transform output unpacks into word and char ids.

     Only the features are transformed (no labels are passed).
     """
     sentences, labels = reader.load_data_and_labels(self.filename)
     fitted = WordPreprocessor(padding=True).fit(sentences, labels)
     words, chars = fitted.transform(sentences)
     first_word = words[0][0]
     first_char = chars[0][0][0]
     # NOTE(review): int(...) always yields an int, so these assertions only
     # prove the first word/char ids are convertible to int.
     self.assertIsInstance(int(first_word), int)
     self.assertIsInstance(int(first_char), int)
예제 #5
0
 def test_transform_only_words(self):
     """Unpadded transform without labels yields plain-int word/char ids."""
     sentences, labels = reader.load_data_and_labels(self.filename)
     fitted = WordPreprocessor(padding=False).fit(sentences, labels)
     words, chars = fitted.transform(sentences)
     # First character id of the first word, then the first word id.
     first_char = chars[0][0][0]
     first_word = words[0][0]
     self.assertIsInstance(first_word, int)
     self.assertIsInstance(first_char, int)
예제 #6
0
 def test_preprocessor(self):
     """Unpadded fit/transform yields int ids, and tags map back to strings."""
     sentences, labels = reader.load_data_and_labels(self.filename)
     fitted = WordPreprocessor(padding=False).fit(sentences, labels)
     features, tags = fitted.transform(sentences, labels)
     words, chars = features
     first_char = chars[0][0][0]
     first_word = words[0][0]
     first_tag = tags[0][0]
     for encoded in (first_word, first_char, first_tag):
         self.assertIsInstance(encoded, int)
     # Decoding the first sentence's tag ids should give back string labels.
     decoded = fitted.inverse_transform(tags[0])
     self.assertIsInstance(decoded[0], str)
예제 #7
0
    def setUp(self):
        """Create ``self.tagger`` from the saved preprocessor and weights.

        Both artifacts are read from ``SAVE_ROOT``; ``self.sent`` holds the
        sample sentence.
        """
        preprocessor = WordPreprocessor.load(
            os.path.join(SAVE_ROOT, 'preprocessor.pkl'))

        # Vocabulary sizes come from the loaded preprocessor.
        cfg = ModelConfig()
        cfg.vocab_size = len(preprocessor.vocab_word)
        cfg.char_vocab_size = len(preprocessor.vocab_char)

        weights_path = os.path.join(SAVE_ROOT, 'model_weights.h5')
        network = SeqLabeling(cfg, ntags=len(preprocessor.vocab_tag))
        network.load(filepath=weights_path)

        self.tagger = anago.Tagger(network, preprocessor=preprocessor)
        self.sent = 'President Obama is speaking at the White House.'
예제 #8
0
    def test_eval(self):
        """Run the evaluator over ``test.txt`` with the saved model.

        No assertions are made; the test passes if evaluation completes.
        """
        x_test, y_test = load_data_and_labels(
            os.path.join(DATA_ROOT, 'test.txt'))

        preprocessor = WordPreprocessor.load(
            os.path.join(SAVE_ROOT, 'preprocessor.pkl'))
        # Vocabulary sizes come from the loaded preprocessor.
        cfg = ModelConfig()
        cfg.vocab_size = len(preprocessor.vocab_word)
        cfg.char_vocab_size = len(preprocessor.vocab_char)

        weights_path = os.path.join(SAVE_ROOT, 'model_weights.h5')
        network = SeqLabeling(cfg, ntags=len(preprocessor.vocab_tag))
        network.load(filepath=weights_path)

        anago.Evaluator(network, preprocessor=preprocessor).eval(x_test, y_test)
예제 #9
0
    def test_load(self):
        """A saved-then-loaded preprocessor transforms data identically.

        The fitted preprocessor is written to ``data/preprocessor.pkl`` next
        to this test module, loaded back, and both instances must produce
        equal word ids, char ids and labels for the same input. The file is
        removed afterwards, whether or not the assertions pass.
        """
        X, y = reader.load_data_and_labels(self.filename)
        p = WordPreprocessor()
        p.fit(X, y)
        filepath = os.path.join(os.path.dirname(__file__), 'data/preprocessor.pkl')
        try:
            p.save(filepath)
            self.assertTrue(os.path.exists(filepath))

            loaded_p = WordPreprocessor.load(filepath)
            x_test1, y_test1 = p.transform(X, y)
            x_test2, y_test2 = loaded_p.transform(X, y)
            np.testing.assert_array_equal(x_test1[0], x_test2[0])  # word
            np.testing.assert_array_equal(x_test1[1], x_test2[1])  # char
            np.testing.assert_array_equal(y_test1, y_test2)
        finally:
            # Clean up unconditionally: a failing comparison above would
            # otherwise skip the removal and leave the file on disk.
            if os.path.exists(filepath):
                os.remove(filepath)
예제 #10
0
    def setUp(self):
        """Create ``self.tagger`` from a config, weights file and preprocessor.

        Artifacts are read from ``../models`` relative to this module;
        ``self.sent`` holds the sample sentence.
        """
        models_dir = os.path.join(os.path.dirname(__file__), '../models')

        cfg = ModelConfig()

        preprocessor = WordPreprocessor.load(
            os.path.join(models_dir, 'preprocessor.pkl'))
        # Vocabulary sizes come from the loaded preprocessor.
        cfg.vocab_size = len(preprocessor.vocab_word)
        cfg.char_vocab_size = len(preprocessor.vocab_char)

        weights_name = 'model_weights.h5'

        self.tagger = anago.Tagger(
            cfg, weights_name, save_path=models_dir, preprocessor=preprocessor)
        self.sent = 'President Obama is speaking at the White House.'
예제 #11
0
    def test_eval(self):
        """Run the evaluator over the CoNLL-2003 ``test.txt`` file.

        Paths are resolved relative to this module. No assertions are made;
        the test passes if evaluation completes.
        """
        here = os.path.dirname(__file__)
        data_dir = os.path.join(here, '../data/conll2003/en/tagging')
        models_dir = os.path.join(here, '../models')

        cfg = ModelConfig()

        x_test, y_test = load_data_and_labels(os.path.join(data_dir, 'test.txt'))

        preprocessor = WordPreprocessor.load(
            os.path.join(models_dir, 'preprocessor.pkl'))
        # Vocabulary sizes come from the loaded preprocessor.
        cfg.vocab_size = len(preprocessor.vocab_word)
        cfg.char_vocab_size = len(preprocessor.vocab_char)

        weights_name = 'model_weights.h5'

        scorer = anago.Evaluator(
            cfg, weights_name, save_path=models_dir, preprocessor=preprocessor)
        scorer.eval(x_test, y_test)
예제 #12
0
    def test_vocab_init(self):
        """``vocab_init`` keeps a seeded word from encoding as ``UNK``.

        The same single-token probe is encoded twice: once by a preprocessor
        fitted without ``vocab_init`` (expected to equal the UNK id) and once
        by one seeded with the probe word (expected to differ from it).
        """
        X, y = reader.load_data_and_labels(self.filename)
        unknown_word = 'unknownword'
        X_test, y_test = [[unknown_word]], [['O']]

        def fit_and_encode(candidate):
            # Fit on the fixture data, encode the probe, and return the
            # fitted preprocessor together with the entry under comparison.
            fitted = candidate.fit(X, y)
            encoded, _ = fitted.transform(X_test, y_test)
            return fitted, encoded[0][1]

        p, words = fit_and_encode(WordPreprocessor(padding=False))
        self.assertEqual(words, [p.vocab_word[UNK]])

        seeded = WordPreprocessor(vocab_init={unknown_word}, padding=False)
        p, words = fit_and_encode(seeded)
        self.assertNotEqual(words, [p.vocab_word[UNK]])
예제 #13
0
import json
import os
import tornado.ioloop
import tornado.web

import anago
from anago.config import ModelConfig
from anago.data.preprocess import WordPreprocessor

# Module-level setup: everything below runs at import time and builds the
# single `tagger` instance used by the request handler.

# Directory holding the saved artifacts, resolved relative to this file.
SAVE_ROOT = os.path.join(os.path.dirname(__file__), '../../models')
model_config = ModelConfig()
# NOTE(review): the '.pkl' name suggests a pickle file -- only load trusted
# artifacts here; confirm against WordPreprocessor.load.
p = WordPreprocessor.load(os.path.join(SAVE_ROOT, 'preprocessor.pkl'))
# Vocabulary sizes are taken from the loaded preprocessor and must be set
# before the tagger is constructed from this config.
model_config.vocab_size = len(p.vocab_word)
model_config.char_vocab_size = len(p.vocab_char)
# Weights file name, passed to the tagger together with save_path=SAVE_ROOT.
weights = 'model_weights.h5'
tagger = anago.Tagger(model_config,
                      weights,
                      save_path=SAVE_ROOT,
                      preprocessor=p)


class MainHandler(tornado.web.RequestHandler):
    """Request handler that serves the index page and tags submitted text."""

    def get(self):
        """Render ``index.html`` with an empty ``sent`` value."""
        self.render('index.html', sent='')

    def post(self):
        """Tag the ``sent`` argument and write the entities as JSON text.

        Nothing is written when the tagger returns a falsy result.
        """
        sentence = self.get_argument('sent')
        found = tagger.get_entities(sentence)
        if not found:
            return
        payload = json.dumps(dict(found))
        self.write(payload)
예제 #14
0
 def test_pad_sequences(self):
     """Smoke test: fit and transform with ``padding=True`` do not raise.

     No assertions are made on the transformed output.
     """
     sentences, labels = reader.load_data_and_labels(self.filename)
     fitted = WordPreprocessor(padding=True).fit(sentences, labels)
     _features, _tags = fitted.transform(sentences, labels)
예제 #15
0
 def test_to_numpy_array(self):
     """Smoke test: unpadded label output can be passed to ``np.asarray``.

     No assertions are made on the resulting array.
     """
     sentences, labels = reader.load_data_and_labels(self.filename)
     fitted = WordPreprocessor(padding=False).fit(sentences, labels)
     _features, encoded_labels = fitted.transform(sentences, labels)
     encoded_labels = np.asarray(encoded_labels)
예제 #16
0
 def test_fit(self):
     """Smoke test: fitting a default preprocessor on the fixture data runs.

     No assertions are made on the fitted object.
     """
     sentences, labels = reader.load_data_and_labels(self.filename)
     WordPreprocessor().fit(sentences, labels)