def test_save(self):
    """Saving an unfitted preprocessor must create a file at the target path."""
    unfitted = WordPreprocessor()
    here = os.path.dirname(__file__)
    pickle_path = os.path.join(here, 'data/preprocessor.pkl')

    unfitted.save(pickle_path)
    self.assertTrue(os.path.exists(pickle_path))

    # Remove the artifact so it does not linger between test runs.
    if os.path.exists(pickle_path):
        os.remove(pickle_path)
def test_unknown_word(self):
    """Transform a hand-written sentence after fitting on the dataset.

    The tokens are presumably absent from the fitted vocabulary (per the
    test name); the test only checks that ``transform`` completes and
    returns a pair -- no values are asserted.
    """
    train_sents, train_tags = reader.load_data_and_labels(self.filename)
    fitted = WordPreprocessor(padding=False).fit(train_sents, train_tags)

    sents = [['$unknownword$', 'あ']]
    tags = [['O', 'O']]
    sents, tags = fitted.transform(sents, tags)
def test_calc_sequence_lengths(self):
    """Exercise the argmax/argmin length computation on padded labels.

    Nothing is asserted; the test passes as long as every step runs.
    """
    sents, tags = reader.load_data_and_labels(self.filename)
    fitted = WordPreprocessor(padding=True).fit(sents, tags)
    _, encoded_tags = fitted.transform(sents, tags)

    # Collapse the last axis to class indices, then cast to int32.
    tag_ids = np.argmax(encoded_tags, -1)
    tag_ids = tag_ids.astype(np.int32)

    # Index of the smallest id along the last axis; presumably the first
    # padding position and hence the sequence length -- TODO confirm.
    sequence_lengths = np.argmin(tag_ids, -1)
def test_transform_with_padding(self):
    """With padding on, the first word id and char id must convert to int."""
    sents, tags = reader.load_data_and_labels(self.filename)
    fitted = WordPreprocessor(padding=True).fit(sents, tags)

    # Without labels, transform returns only the (words, chars) pair.
    words, chars = fitted.transform(sents)
    first_word = words[0][0]
    first_char = chars[0][0][0]

    # Values are passed through int() before the type check.
    self.assertIsInstance(int(first_word), int)
    self.assertIsInstance(int(first_char), int)
def test_transform_only_words(self):
    """Transforming sentences without labels yields int word and char ids."""
    sents, tags = reader.load_data_and_labels(self.filename)
    fitted = WordPreprocessor(padding=False).fit(sents, tags)

    # No labels are passed, so the result is just the (words, chars) pair.
    words, chars = fitted.transform(sents)
    first_char = chars[0][0][0]
    first_word = words[0][0]

    self.assertIsInstance(first_word, int)
    self.assertIsInstance(first_char, int)
def test_preprocessor(self):
    """Fit and transform without padding, then check element types.

    Word, char and tag ids must be ``int``; ``inverse_transform`` on the
    first encoded tag sequence must give back ``str`` elements.
    """
    sents, tags = reader.load_data_and_labels(self.filename)
    fitted = WordPreprocessor(padding=False).fit(sents, tags)

    features, encoded_tags = fitted.transform(sents, tags)
    words, chars = features
    first_char = chars[0][0][0]
    first_word = words[0][0]
    first_tag = encoded_tags[0][0]

    self.assertIsInstance(first_word, int)
    self.assertIsInstance(first_char, int)
    self.assertIsInstance(first_tag, int)

    decoded = fitted.inverse_transform(encoded_tags[0])
    self.assertIsInstance(decoded[0], str)
def setUp(self):
    """Build the tagger and sample sentence shared by the tests.

    Loads the pickled preprocessor and the saved model weights from
    ``SAVE_ROOT`` and sizes the model from the preprocessor's vocabularies.
    """
    preprocessor_path = os.path.join(SAVE_ROOT, 'preprocessor.pkl')
    weights_path = os.path.join(SAVE_ROOT, 'model_weights.h5')

    preprocessor = WordPreprocessor.load(preprocessor_path)

    cfg = ModelConfig()
    cfg.vocab_size = len(preprocessor.vocab_word)
    cfg.char_vocab_size = len(preprocessor.vocab_char)

    net = SeqLabeling(cfg, ntags=len(preprocessor.vocab_tag))
    net.load(filepath=weights_path)

    self.tagger = anago.Tagger(net, preprocessor=preprocessor)
    self.sent = 'President Obama is speaking at the White House.'
def test_eval(self):
    """Run the evaluator on the test split with the saved model.

    Nothing is asserted; the test passes if evaluation completes.
    """
    x_test, y_test = load_data_and_labels(os.path.join(DATA_ROOT, 'test.txt'))

    preprocessor = WordPreprocessor.load(
        os.path.join(SAVE_ROOT, 'preprocessor.pkl'))

    # Size the model from the vocabularies stored in the preprocessor.
    cfg = ModelConfig()
    cfg.vocab_size = len(preprocessor.vocab_word)
    cfg.char_vocab_size = len(preprocessor.vocab_char)

    net = SeqLabeling(cfg, ntags=len(preprocessor.vocab_tag))
    net.load(filepath=os.path.join(SAVE_ROOT, 'model_weights.h5'))

    anago.Evaluator(net, preprocessor=preprocessor).eval(x_test, y_test)
def test_load(self):
    """A saved-then-loaded preprocessor must transform data identically.

    Fits a preprocessor, pickles it, loads it back and compares the word
    features, char features and labels produced by both instances.
    The pickle file is removed even when an assertion fails, so a failing
    run does not leave an artifact behind in the data directory.
    """
    X, y = reader.load_data_and_labels(self.filename)
    p = WordPreprocessor()
    p.fit(X, y)
    filepath = os.path.join(os.path.dirname(__file__), 'data/preprocessor.pkl')

    try:
        p.save(filepath)
        self.assertTrue(os.path.exists(filepath))

        loaded_p = WordPreprocessor.load(filepath)
        x_test1, y_test1 = p.transform(X, y)
        x_test2, y_test2 = loaded_p.transform(X, y)

        np.testing.assert_array_equal(x_test1[0], x_test2[0])  # word
        np.testing.assert_array_equal(x_test1[1], x_test2[1])  # char
        np.testing.assert_array_equal(y_test1, y_test2)
    finally:
        # Always clean up: previously a failed comparison skipped removal.
        if os.path.exists(filepath):
            os.remove(filepath)
def setUp(self):
    """Build the tagger and sample sentence shared by the tests.

    The tagger is constructed from a model config, a weights file name and
    the directory holding the saved preprocessor and weights.
    """
    save_root = os.path.join(os.path.dirname(__file__), '../models')

    cfg = ModelConfig()
    preprocessor = WordPreprocessor.load(
        os.path.join(save_root, 'preprocessor.pkl'))
    cfg.vocab_size = len(preprocessor.vocab_word)
    cfg.char_vocab_size = len(preprocessor.vocab_char)

    weights_file = 'model_weights.h5'
    self.tagger = anago.Tagger(
        cfg, weights_file, save_path=save_root, preprocessor=preprocessor)
    self.sent = 'President Obama is speaking at the White House.'
def test_eval(self):
    """Run the evaluator on the CoNLL-2003 test file with saved weights.

    Nothing is asserted; the test passes if evaluation completes.
    """
    here = os.path.dirname(__file__)
    data_root = os.path.join(here, '../data/conll2003/en/tagging')
    save_root = os.path.join(here, '../models')

    cfg = ModelConfig()
    x_test, y_test = load_data_and_labels(os.path.join(data_root, 'test.txt'))

    # Size the config from the vocabularies stored in the preprocessor.
    preprocessor = WordPreprocessor.load(
        os.path.join(save_root, 'preprocessor.pkl'))
    cfg.vocab_size = len(preprocessor.vocab_word)
    cfg.char_vocab_size = len(preprocessor.vocab_char)

    weights_file = 'model_weights.h5'
    evaluator = anago.Evaluator(
        cfg, weights_file, save_path=save_root, preprocessor=preprocessor)
    evaluator.eval(x_test, y_test)
def test_vocab_init(self):
    """``vocab_init`` must keep a seeded word from mapping to ``UNK``.

    First a plain preprocessor is expected to encode the probe word as the
    ``UNK`` id; then a preprocessor seeded with that word through
    ``vocab_init`` is expected to encode it as something else.
    """
    train_sents, train_tags = reader.load_data_and_labels(self.filename)
    unknown_word = 'unknownword'
    probe_sents = [[unknown_word]]
    probe_tags = [['O']]

    # Baseline: no initial vocabulary.
    baseline = WordPreprocessor(padding=False).fit(train_sents, train_tags)
    encoded, _ = baseline.transform(probe_sents, probe_tags)
    word_ids = encoded[0][1]
    self.assertEqual(word_ids, [baseline.vocab_word[UNK]])

    # Seeded: the probe word is supplied up front.
    seed_vocab = {unknown_word}
    seeded = WordPreprocessor(vocab_init=seed_vocab, padding=False).fit(
        train_sents, train_tags)
    encoded, _ = seeded.transform(probe_sents, probe_tags)
    word_ids = encoded[0][1]
    self.assertNotEqual(word_ids, [seeded.vocab_word[UNK]])
import json
import os

import tornado.ioloop
import tornado.web

import anago
from anago.config import ModelConfig
from anago.data.preprocess import WordPreprocessor

# The preprocessor and tagger are created once, at import time, and shared
# by every request handled below.
SAVE_ROOT = os.path.join(os.path.dirname(__file__), '../../models')
model_config = ModelConfig()
p = WordPreprocessor.load(os.path.join(SAVE_ROOT, 'preprocessor.pkl'))
model_config.vocab_size = len(p.vocab_word)
model_config.char_vocab_size = len(p.vocab_char)
weights = 'model_weights.h5'
tagger = anago.Tagger(
    model_config,
    weights,
    save_path=SAVE_ROOT,
    preprocessor=p,
)


class MainHandler(tornado.web.RequestHandler):
    """Serve the input page and return entities for a posted sentence."""

    def get(self):
        """Render ``index.html`` with an empty ``sent`` value."""
        self.render('index.html', sent='')

    def post(self):
        """Tag the ``sent`` argument and write the entities as JSON.

        Nothing is written when the tagger returns a falsy result.
        """
        text = self.get_argument('sent')
        found = tagger.get_entities(text)
        if not found:
            return
        self.write(json.dumps(dict(found)))
def test_pad_sequences(self):
    """Fit and transform with padding on; only checks that a pair comes back."""
    sents, tags = reader.load_data_and_labels(self.filename)
    fitted = WordPreprocessor(padding=True).fit(sents, tags)

    # Unpacking fails unless transform returns exactly two items.
    sents, tags = fitted.transform(sents, tags)
def test_to_numpy_array(self):
    """Unpadded labels from ``transform`` must be accepted by ``np.asarray``.

    Nothing is asserted; the test passes if the conversion runs.
    """
    sents, tags = reader.load_data_and_labels(self.filename)
    fitted = WordPreprocessor(padding=False).fit(sents, tags)

    sents, tags = fitted.transform(sents, tags)
    tags = np.asarray(tags)
def test_fit(self):
    """Fitting a default preprocessor on the dataset must not raise."""
    sents, tags = reader.load_data_and_labels(self.filename)
    unfitted = WordPreprocessor()

    # The return value is kept but not inspected.
    fitted = unfitted.fit(sents, tags)