Example #1
    def test_train(self):
        model_config = ModelConfig()
        training_config = TrainingConfig()

        train_path = os.path.join(DATA_ROOT, 'train.txt')
        valid_path = os.path.join(DATA_ROOT, 'valid.txt')
        x_train, y_train = load_data_and_labels(train_path)
        x_valid, y_valid = load_data_and_labels(valid_path)

        p = prepare_preprocessor(x_train, y_train)
        p.save(os.path.join(SAVE_ROOT, 'preprocessor.pkl'))
        embeddings = load_word_embeddings(p.vocab_word, EMBEDDING_PATH,
                                          model_config.word_embedding_size)
        model_config.char_vocab_size = len(p.vocab_char)

        model = SeqLabeling(model_config, embeddings, len(p.vocab_tag))

        trainer = anago.Trainer(model,
                                training_config,
                                checkpoint_path=LOG_ROOT,
                                save_path=SAVE_ROOT,
                                preprocessor=p,
                                embeddings=embeddings)
        trainer.train(x_train, y_train, x_valid, y_valid)

        model.save(os.path.join(SAVE_ROOT, 'model_weights.h5'))
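For reference, load_data_and_labels reads a CoNLL-style file: one token and its tag per line, with a blank line between sentences. A minimal sketch of building such a file (the tab separator and the toy tags are assumptions about the reader's expected format, not taken from this snippet):

from anago.reader import load_data_and_labels

# Two toy sentences in CoNLL-style "token<TAB>tag" layout.
sample = ('EU\tB-ORG\nrejects\tO\nGerman\tB-MISC\ncall\tO\n.\tO\n'
          '\n'
          'Peter\tB-PER\nBlackburn\tI-PER\n')
with open('train.sample.txt', 'w') as f:
    f.write(sample)

x_sample, y_sample = load_data_and_labels('train.sample.txt')
# x_sample: [['EU', 'rejects', 'German', 'call', '.'], ['Peter', 'Blackburn']]
# y_sample: [['B-ORG', 'O', 'B-MISC', 'O', 'O'], ['B-PER', 'I-PER']]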
Example #2
    def train(self,
              x_train,
              y_train,
              x_valid=None,
              y_valid=None,
              vocab_init=None,
              verbose=1):
        self.p = prepare_preprocessor(x_train, y_train, vocab_init=vocab_init)
        embeddings = filter_embeddings(self.embeddings, self.p.vocab_word,
                                       self.model_config.word_embedding_size)
        self.model_config.vocab_size = len(self.p.vocab_word)
        self.model_config.char_vocab_size = len(self.p.vocab_char)

        self.model = SeqLabeling(self.model_config, embeddings,
                                 len(self.p.vocab_tag))

        if not os.path.exists(self.log_dir):
            os.mkdir(self.log_dir)
            print('Successfully made a directory: {}'.format(self.log_dir))
        self.p.save(os.path.join(self.log_dir, self.preprocessor_file))
        self.model_config.save(os.path.join(self.log_dir, self.config_file))
        print('Successfully saved config and preprocessor files')

        trainer = Trainer(self.model,
                          self.training_config,
                          checkpoint_path=self.log_dir,
                          preprocessor=self.p)
        return trainer.train(x_train, y_train, x_valid, y_valid, verbose)
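The vocab_init parameter seeds the word vocabulary before filter_embeddings runs, so pretrained vectors for words outside the training split are kept. A hedged usage sketch; the wrapper object and the test split are hypothetical, not part of this snippet:

# Hypothetical: seed the vocabulary with test-set tokens so their
# pretrained embeddings survive filter_embeddings.
test_vocab = {w for sent in x_test for w in sent}
wrapper.train(x_train, y_train, x_valid, y_valid, vocab_init=test_vocab)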
Example #3
    def test_predict(self):
        X, y = load_data_and_labels(self.filename)
        X, y = X[:100], y[:100]
        p = prepare_preprocessor(X, y)
        self.model_config.char_vocab_size = len(p.vocab_char)

        model = SeqLabeling(self.model_config, self.embeddings, ntags=len(p.vocab_tag))
        model.predict(p.transform(X))
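Judging from how Example #6 consumes the output, predict returns per-token tag scores for the padded batch. A hedged sanity check, assuming the last axis indexes the tag vocabulary:

# Assumption: predict returns one score vector per padded token position,
# sized to the tag vocabulary.
preds = model.predict(p.transform(X))
assert preds.shape[-1] == len(p.vocab_tag)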
Example #4
    def test_batch_iter(self):
        sents, labels = load_data_and_labels(self.filename)
        batch_size = 32
        p = prepare_preprocessor(sents, labels)
        steps, batches = batch_iter(list(zip(sents, labels)),
                                    batch_size,
                                    preprocessor=p)
        self.assertEqual(len([_ for _ in batches]),
                         steps)  # Todo: infinite loop
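The Todo comment flags that the generator returned by batch_iter may cycle indefinitely (the usual contract for a Keras fit_generator input), in which case the list comprehension above never terminates. A hedged way to consume exactly one epoch is to slice out `steps` batches explicitly:

from itertools import islice

# Take exactly one epoch's worth of batches from a possibly infinite generator.
one_epoch = list(islice(batches, steps))
assert len(one_epoch) == steps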
Example #5
    def train(self,
              x_train,
              y_train,
              x_valid=None,
              y_valid=None,
              vocab_init=None):
        self.p = prepare_preprocessor(x_train, y_train, vocab_init=vocab_init)
        embeddings = filter_embeddings(self.embeddings, self.p.vocab_word,
                                       self.model_config.word_embedding_size)
        self.model_config.vocab_size = len(self.p.vocab_word)
        self.model_config.char_vocab_size = len(self.p.vocab_char)

        self.model = SeqLabeling(self.model_config, embeddings,
                                 len(self.p.vocab_tag))

        trainer = Trainer(self.model,
                          self.training_config,
                          checkpoint_path=self.log_dir,
                          preprocessor=self.p,
                          save_path='./models')
        trainer.train(x_train, y_train, x_valid, y_valid)
Example #6
File: tag.py Project: janjve/anago
import os

import numpy as np

from anago.config import ModelConfig, TrainingConfig
from anago.models import SeqLabeling
from anago.preprocess import prepare_preprocessor
from anago.reader import load_word_embeddings, load_data_and_labels

DATA_ROOT = 'data/conll2003/en/ner'
LOAD_ROOT = './models'  # trained model
LOG_ROOT = './logs'  # checkpoint, tensorboard
embedding_path = '/media/jan/OS/Dataset/WordEmbeddings/wiki.en.vec'
model_config = ModelConfig()

test_path = os.path.join(DATA_ROOT, 'train.small.txt')
x_test, y_test = load_data_and_labels(test_path)

p = prepare_preprocessor(x_test, y_test)

embeddings = load_word_embeddings(p.vocab_word, embedding_path,
                                  model_config.word_embedding_size)
model_config.vocab_size = len(p.vocab_word)
model_config.char_vocab_size = len(p.vocab_char)

model_path = os.path.join(LOAD_ROOT, 'mymodel.h5')
model = SeqLabeling(model_config, embeddings, len(p.vocab_tag))
model.load(model_path)
X, y = p.transform(x_test, y_test)
predictions = model.predict(X)

# X[2] holds the true (unpadded) length of each sentence
for words, prediction, sentence_length in zip(x_test, predictions, X[2]):
    nopad_prediction = prediction[:sentence_length.item()]
    label_indices = [np.argmax(x) for x in nopad_prediction]
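The indices in label_indices point into the tag vocabulary. A minimal follow-up sketch, not in the original snippet, assuming p.vocab_tag is a dict from tag string to index:

# Invert the tag vocabulary and recover tag strings per token.
index_to_tag = {i: t for t, i in p.vocab_tag.items()}
predicted_tags = [index_to_tag[i] for i in label_indices]
print(list(zip(words, predicted_tags)))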
Example #7
import os

import anago
from anago.config import ModelConfig, TrainingConfig
from anago.models import SeqLabeling
from anago.preprocess import prepare_preprocessor
from anago.reader import load_word_embeddings, load_data_and_labels

DATA_ROOT = 'data/conll2003/en/ner'
SAVE_ROOT = './models'  # trained model
LOG_ROOT = './logs'  # checkpoint, tensorboard
embedding_path = '/media/jan/OS/Dataset/WordEmbeddings/wiki.en.vec'
model_config = ModelConfig()
training_config = TrainingConfig()

model_path = os.path.join(SAVE_ROOT, 'mymodel.h5')

train_path = os.path.join(DATA_ROOT, 'train.small.txt')
valid_path = os.path.join(DATA_ROOT, 'valid.small.txt')

x_train, y_train = load_data_and_labels(train_path)
x_valid, y_valid = load_data_and_labels(valid_path)

p = prepare_preprocessor(x_train, y_train)
embeddings = load_word_embeddings(p.vocab_word, embedding_path,
                                  model_config.word_embedding_size)
model_config.vocab_size = len(p.vocab_word)
model_config.char_vocab_size = len(p.vocab_char)

model = SeqLabeling(model_config, embeddings, len(p.vocab_tag))
trainer = anago.Trainer(model,
                        training_config,
                        checkpoint_path=LOG_ROOT,
                        save_path=SAVE_ROOT,
                        preprocessor=p)
trainer.train(x_train, y_train, x_valid, y_valid)
evaluator = anago.Evaluator(model, preprocessor=p)
model.save(model_path)
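The evaluator is constructed but never run in this snippet. A hedged follow-up, assuming Evaluator exposes an eval(x, y) method as in anago's published examples; 'test.small.txt' is a hypothetical filename chosen to match the naming above:

# Score the trained model on a held-out split (reports F1 in anago's examples).
x_test, y_test = load_data_and_labels(os.path.join(DATA_ROOT, 'test.small.txt'))
evaluator.eval(x_test, y_test)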