예제 #1
0
    def test_train(self):
        """Train a SeqLabeling model and persist the preprocessor and weights.

        Reads the train/valid splits from DATA_ROOT, builds a preprocessor
        from the training split, trains through ``anago.Trainer`` using the
        valid split for validation, and writes the artifacts to SAVE_ROOT.
        """
        cfg_model = ModelConfig()
        cfg_train = TrainingConfig()

        # Dataset splits live next to each other under DATA_ROOT.
        x_train, y_train = load_data_and_labels(
            os.path.join(DATA_ROOT, 'train.txt'))
        x_valid, y_valid = load_data_and_labels(
            os.path.join(DATA_ROOT, 'valid.txt'))

        # The preprocessor is saved before training so later runs can
        # reload the same vocabularies.
        preprocessor = prepare_preprocessor(x_train, y_train)
        preprocessor.save(os.path.join(SAVE_ROOT, 'preprocessor.pkl'))

        word_vectors = load_word_embeddings(preprocessor.vocab_word,
                                            EMBEDDING_PATH,
                                            cfg_model.word_embedding_size)
        cfg_model.char_vocab_size = len(preprocessor.vocab_char)

        tag_count = len(preprocessor.vocab_tag)
        model = SeqLabeling(cfg_model, word_vectors, tag_count)

        trainer = anago.Trainer(
            model,
            cfg_train,
            checkpoint_path=LOG_ROOT,
            save_path=SAVE_ROOT,
            preprocessor=preprocessor,
            embeddings=word_vectors,
        )
        trainer.train(x_train, y_train, x_valid, y_valid)

        model.save(os.path.join(SAVE_ROOT, 'model_weights.h5'))
예제 #2
0
    def __init__(self,
                 char_emb_size=25,
                 word_emb_size=100,
                 char_lstm_units=25,
                 word_lstm_units=100,
                 dropout=0.5,
                 char_feature=True,
                 crf=True,
                 batch_size=1024,
                 optimizer='adam',
                 learning_rate=0.001,
                 lr_decay=0.9,
                 clip_gradients=5.0,
                 max_epoch=15,
                 early_stopping=True,
                 patience=3,
                 train_embeddings=True,
                 max_checkpoints_to_keep=5,
                 log_dir=None,
                 embeddings=()):
        """Bundle the hyperparameters into config objects; build no model yet.

        ``char_emb_size`` through ``crf`` are forwarded positionally, in
        order, to ``ModelConfig``; ``batch_size`` through
        ``max_checkpoints_to_keep`` are forwarded the same way to
        ``TrainingConfig``. ``log_dir`` and ``embeddings`` are stored as-is.
        """
        # Positional order matters: it mirrors the two config constructors.
        model_args = (char_emb_size, word_emb_size, char_lstm_units,
                      word_lstm_units, dropout, char_feature, crf)
        training_args = (batch_size, optimizer, learning_rate, lr_decay,
                         clip_gradients, max_epoch, early_stopping, patience,
                         train_embeddings, max_checkpoints_to_keep)

        self.model_config = ModelConfig(*model_args)
        self.training_config = TrainingConfig(*training_args)

        # Model and preprocessor start out unset.
        self.model = None
        self.p = None
        self.log_dir = log_dir
        self.embeddings = embeddings
예제 #3
0
    def test_train(self):
        """Train through ``anago.Trainer`` and validate on the valid split.

        The preprocessor is built from the inputs of all three splits (for
        vocabulary expansion) but only from the training labels. Training is
        validated against the valid split; the test labels are never used, so
        the test split does not influence validation.
        """
        here = os.path.dirname(__file__)
        DATA_ROOT = os.path.join(here, '../data/conll2003/en/tagging')
        SAVE_ROOT = os.path.join(here, '../models')  # trained model
        LOG_ROOT = os.path.join(here, '../logs')  # checkpoint, tensorboard
        embedding_path = os.path.join(here,
                                      '../data/glove.6B/glove.6B.100d.txt')

        model_config = ModelConfig()
        training_config = TrainingConfig()

        train_path = os.path.join(DATA_ROOT, 'train.txt')
        valid_path = os.path.join(DATA_ROOT, 'valid.txt')
        test_path = os.path.join(DATA_ROOT, 'test.txt')
        x_train, y_train = load_data_and_labels(train_path)
        x_valid, y_valid = load_data_and_labels(valid_path)
        # Only the test inputs are needed (vocabulary expansion below).
        x_test, _ = load_data_and_labels(test_path)

        p = prepare_preprocessor(np.r_[x_train, x_valid, x_test],
                                 y_train)  # np.r_ is for vocabulary expansion.
        p.save(os.path.join(SAVE_ROOT, 'preprocessor.pkl'))
        embeddings = load_word_embeddings(p.vocab_word, embedding_path,
                                          model_config.word_embedding_size)
        model_config.char_vocab_size = len(p.vocab_char)

        trainer = anago.Trainer(model_config,
                                training_config,
                                checkpoint_path=LOG_ROOT,
                                save_path=SAVE_ROOT,
                                preprocessor=p,
                                embeddings=embeddings)
        # Validate on the valid split; it was previously loaded but the test
        # split was passed here instead.
        trainer.train(x_train, y_train, x_valid, y_valid)
예제 #4
0
    def setUp(self):
        """Build the tagger fixture from the artifacts saved under SAVE_ROOT."""
        preprocessor = WordPreprocessor.load(
            os.path.join(SAVE_ROOT, 'preprocessor.pkl'))

        # Vocabulary sizes come from the restored preprocessor.
        model_config = ModelConfig()
        model_config.vocab_size = len(preprocessor.vocab_word)
        model_config.char_vocab_size = len(preprocessor.vocab_char)

        tag_count = len(preprocessor.vocab_tag)
        model = SeqLabeling(model_config, ntags=tag_count)
        weights_path = os.path.join(SAVE_ROOT, 'model_weights.h5')
        model.load(filepath=weights_path)

        self.tagger = anago.Tagger(model, preprocessor=preprocessor)
        self.sent = 'President Obama is speaking at the White House.'
예제 #5
0
    def test_eval(self):
        """Evaluate the saved model on the test split found under DATA_ROOT."""
        x_test, y_test = load_data_and_labels(
            os.path.join(DATA_ROOT, 'test.txt'))

        preprocessor = WordPreprocessor.load(
            os.path.join(SAVE_ROOT, 'preprocessor.pkl'))

        # Vocabulary sizes come from the restored preprocessor.
        model_config = ModelConfig()
        model_config.vocab_size = len(preprocessor.vocab_word)
        model_config.char_vocab_size = len(preprocessor.vocab_char)

        tag_count = len(preprocessor.vocab_tag)
        model = SeqLabeling(model_config, ntags=tag_count)
        weights_path = os.path.join(SAVE_ROOT, 'model_weights.h5')
        model.load(filepath=weights_path)

        anago.Evaluator(model, preprocessor=preprocessor).eval(x_test, y_test)
예제 #6
0
    def setUp(self):
        """Build the tagger fixture from the config, weights name and preprocessor.

        This variant hands ``anago.Tagger`` a config and a weights file name
        (resolved via ``save_path``) instead of an already-built model.
        """
        models_dir = os.path.join(os.path.dirname(__file__), '../models')

        config = ModelConfig()

        preprocessor = WordPreprocessor.load(
            os.path.join(models_dir, 'preprocessor.pkl'))
        config.vocab_size = len(preprocessor.vocab_word)
        config.char_vocab_size = len(preprocessor.vocab_char)

        self.tagger = anago.Tagger(
            config,
            'model_weights.h5',
            save_path=models_dir,
            preprocessor=preprocessor,
        )
        self.sent = 'President Obama is speaking at the White House.'
예제 #7
0
 def setUp(self):
     """Prepare default configs, a zero embedding matrix and dataset paths."""
     self.model_config = ModelConfig()
     self.training_config = TrainingConfig()
     self.model_config.char_vocab_size = 80

     # All-zero embeddings: 10000 rows, one column per embedding dimension.
     n_words = 10000
     emb_dim = self.model_config.word_embedding_size
     self.embeddings = np.zeros((n_words, emb_dim))

     here = os.path.dirname(__file__)
     self.filename = os.path.join(here, '../data/conll2003/en/ner/test.txt')
     self.valid_file = os.path.join(here, '../data/conll2003/en/ner/valid.txt')
예제 #8
0
    def test_eval(self):
        """Evaluate the saved weights on the test split via ``anago.Evaluator``.

        This variant hands the evaluator a config and a weights file name
        (resolved via ``save_path``) instead of an already-built model.
        """
        here = os.path.dirname(__file__)
        data_dir = os.path.join(here, '../data/conll2003/en/tagging')
        models_dir = os.path.join(here, '../models')

        config = ModelConfig()

        x_test, y_test = load_data_and_labels(
            os.path.join(data_dir, 'test.txt'))

        preprocessor = WordPreprocessor.load(
            os.path.join(models_dir, 'preprocessor.pkl'))
        config.vocab_size = len(preprocessor.vocab_word)
        config.char_vocab_size = len(preprocessor.vocab_char)

        evaluator = anago.Evaluator(
            config,
            'model_weights.h5',
            save_path=models_dir,
            preprocessor=preprocessor,
        )
        evaluator.eval(x_test, y_test)
예제 #9
0
    def load(cls, dir_path):
        """Rebuild an instance from the files stored in ``dir_path``.

        Reads the preprocessor, the model config and the model weights using
        the file names held in the class attributes ``preprocessor_file``,
        ``config_file`` and ``weight_file``.

        Returns:
            A new instance with ``p`` and ``model`` populated.
        """
        restored = cls()
        restored.p = WordPreprocessor.load(
            os.path.join(dir_path, cls.preprocessor_file))

        config = ModelConfig.load(os.path.join(dir_path, cls.config_file))

        # Zero-filled placeholder sized from the config; presumably replaced
        # by the values in the weights file loaded below.
        embedding_shape = (config.vocab_size, config.word_embedding_size)
        placeholder = np.zeros(embedding_shape, dtype=np.float32)

        restored.model = SeqLabeling(config,
                                     placeholder,
                                     ntags=len(restored.p.vocab_tag))
        weights_path = os.path.join(dir_path, cls.weight_file)
        restored.model.load(filepath=weights_path)

        return restored
예제 #10
0
파일: tag.py 프로젝트: janjve/anago
import os
import anago
from anago.preprocess import prepare_preprocessor
from anago.config import ModelConfig, TrainingConfig
from anago.models import SeqLabeling
import numpy as np
from anago.reader import load_word_embeddings, load_data_and_labels

# Script setup: load a dataset, rebuild the model from saved weights and
# predict tags for every sentence in the dataset.
DATA_ROOT = 'data/conll2003/en/ner'
LOAD_ROOT = './models'  # directory holding the trained model
LOG_ROOT = './logs'  # checkpoint, tensorboard (not referenced in the lines below)
# NOTE(review): machine-specific absolute path; adjust per environment.
embedding_path = '/media/jan/OS/Dataset/WordEmbeddings/wiki.en.vec'
model_config = ModelConfig()

# NOTE(review): despite the *_test names, the file read is 'train.small.txt'.
test_path = os.path.join(DATA_ROOT, 'train.small.txt')
x_test, y_test = load_data_and_labels(test_path)

# The preprocessor is built from the data being tagged, not loaded from disk.
# NOTE(review): presumably it must match the vocabularies used when
# 'mymodel.h5' was trained; confirm.
p = prepare_preprocessor(x_test, y_test)

embeddings = load_word_embeddings(p.vocab_word, embedding_path,
                                  model_config.word_embedding_size)
# Vocabulary sizes are taken from the preprocessor built above.
model_config.vocab_size = len(p.vocab_word)
model_config.char_vocab_size = len(p.vocab_char)

# Build the model, then load the stored weights into it.
model_path = os.path.join(LOAD_ROOT, 'mymodel.h5')
model = SeqLabeling(model_config, embeddings, len(p.vocab_tag))
model.load(model_path)
# X is indexed as X[2] by the loop that follows, so it is a sequence with at
# least three entries.
X, y = p.transform(x_test, y_test)
predictions = model.predict(X)

for words, prediction, sentence_length in zip(x_test, predictions, X[2]):
예제 #11
0
파일: eval.py 프로젝트: pilehvar/phenebank
import os

from anago.config import ModelConfig
from anago.tagger import Tagger as tag

import anago
from CRF.anago.data import load_data_and_labels
from CRF.anago.data import prepare_preprocessor

# Script setup: build a preprocessor from the training data and create a
# tagger backed by the saved weights file.
DATA_ROOT = 'data/phenebank/'
train_path = os.path.join(DATA_ROOT, 'train.txt')

x_train, y_train = load_data_and_labels(train_path)

# The preprocessor is rebuilt from the training file rather than loaded.
# NOTE(review): presumably it must match the one used when the weights were
# trained; confirm.
p = prepare_preprocessor(x_train, y_train)
# NOTE(review): model_config keeps its defaults; vocab_size/char_vocab_size
# are not set here. Confirm Tagger derives them from the preprocessor.
model_config = ModelConfig()
SAVE_ROOT = './models'  # directory holding the trained model
weights = 'model_weights.h5'
tagger = anago.Tagger(model_config,
                      weights,
                      save_path=SAVE_ROOT,
                      preprocessor=p)

# Hard-coded rather than joined with DATA_ROOT; same directory as train_path.
test_path = "data/phenebank/test.txt"

with open(test_path) as ifile:
    this_sentence = []
    all_sentences = []
    this_output = []
    all_outputs = []
    for line in ifile:
예제 #12
0
class Sequence(object):
    """Facade bundling preprocessing, training, evaluation and tagging.

    An instance holds a ``ModelConfig``, a ``TrainingConfig`` and, once
    ``train`` or ``load`` has run, a preprocessor (``p``) and a model.
    """

    # File names used by ``train``, ``save`` and ``load`` inside a directory.
    config_file = 'config.json'
    weight_file = 'model_weights.h5'
    preprocessor_file = 'preprocessor.pkl'

    def __init__(self,
                 char_emb_size=25,
                 word_emb_size=100,
                 char_lstm_units=25,
                 word_lstm_units=100,
                 dropout=0.5,
                 char_feature=True,
                 crf=True,
                 batch_size=1024,
                 optimizer='adam',
                 learning_rate=0.001,
                 lr_decay=0.9,
                 clip_gradients=5.0,
                 max_epoch=15,
                 early_stopping=True,
                 patience=3,
                 train_embeddings=True,
                 max_checkpoints_to_keep=5,
                 log_dir=None,
                 embeddings=()):
        """Store the hyperparameters as config objects; build no model yet.

        ``char_emb_size`` through ``crf`` are forwarded positionally to
        ``ModelConfig``; ``batch_size`` through ``max_checkpoints_to_keep``
        are forwarded positionally to ``TrainingConfig``.

        Args:
            log_dir: Directory for the preprocessor/config files written by
                ``train`` and passed to the trainer as checkpoint path.
                ``None`` disables writing those files.
            embeddings: Embeddings handed to ``filter_embeddings`` in
                ``train``.
        """
        self.model_config = ModelConfig(char_emb_size, word_emb_size,
                                        char_lstm_units, word_lstm_units,
                                        dropout, char_feature, crf)
        self.training_config = TrainingConfig(batch_size, optimizer,
                                              learning_rate, lr_decay,
                                              clip_gradients, max_epoch,
                                              early_stopping, patience,
                                              train_embeddings,
                                              max_checkpoints_to_keep)
        self.model = None
        self.p = None
        self.log_dir = log_dir
        self.embeddings = embeddings

    def train(self,
              x_train,
              y_train,
              x_valid=None,
              y_valid=None,
              vocab_init=None,
              verbose=1):
        """Build the preprocessor and model, then run the trainer.

        Args:
            x_train: Training inputs.
            y_train: Training labels.
            x_valid: Optional validation inputs.
            y_valid: Optional validation labels.
            vocab_init: Forwarded to ``prepare_preprocessor``.
            verbose: Forwarded to ``Trainer.train``.

        Returns:
            Whatever ``Trainer.train`` returns.
        """
        self.p = prepare_preprocessor(x_train, y_train, vocab_init=vocab_init)
        embeddings = filter_embeddings(self.embeddings, self.p.vocab_word,
                                       self.model_config.word_embedding_size)
        self.model_config.vocab_size = len(self.p.vocab_word)
        self.model_config.char_vocab_size = len(self.p.vocab_char)

        self.model = SeqLabeling(self.model_config, embeddings,
                                 len(self.p.vocab_tag))

        # ``log_dir`` defaults to None; os.path.exists/os.path.join would
        # raise TypeError on it, so files are only written when it is set.
        if self.log_dir is not None:
            if not os.path.exists(self.log_dir):
                # makedirs also creates missing parents (mkdir does not);
                # report success only after the directory exists.
                os.makedirs(self.log_dir)
                print('Successfully made a directory: {}'.format(self.log_dir))
            self.p.save(os.path.join(self.log_dir, self.preprocessor_file))
            self.model_config.save(
                os.path.join(self.log_dir, self.config_file))
            print('Successfully save config and preprocess files')

        trainer = Trainer(self.model,
                          self.training_config,
                          checkpoint_path=self.log_dir,
                          preprocessor=self.p)
        return trainer.train(x_train, y_train, x_valid, y_valid, verbose)

    def eval(self, x_test, y_test):
        """Evaluate the current model on ``x_test`` / ``y_test``.

        Raises:
            OSError: If no model has been trained or loaded.
        """
        if not self.model:
            raise OSError('Could not find a model. Call load(dir_path).')
        evaluator = Evaluator(self.model, preprocessor=self.p)
        evaluator.eval(x_test, y_test)

    def analyze(self, words):
        """Tag ``words`` with the current model and return the tagger output.

        Raises:
            OSError: If no model has been trained or loaded.
        """
        if not self.model:
            raise OSError('Could not find a model. Call load(dir_path).')
        tagger = Tagger(self.model, preprocessor=self.p)
        return tagger.analyze(words)

    def save(self, dir_path):
        """Write the preprocessor, config and model weights into ``dir_path``.

        Raises:
            OSError: If there is no trained or loaded model to save.
        """
        # Fail with the same exception type as eval/analyze instead of an
        # AttributeError on None.
        if self.model is None or self.p is None:
            raise OSError('Could not find a model. '
                          'Call train(...) or load(dir_path) first.')
        self.p.save(os.path.join(dir_path, self.preprocessor_file))
        self.model_config.save(os.path.join(dir_path, self.config_file))
        self.model.save(os.path.join(dir_path, self.weight_file))

    @classmethod
    def load(cls, dir_path):
        """Rebuild an instance from the files previously written to ``dir_path``.

        Returns:
            A new instance with ``p`` and ``model`` populated.
        """
        self = cls()
        self.p = WordPreprocessor.load(
            os.path.join(dir_path, cls.preprocessor_file))
        config = ModelConfig.load(os.path.join(dir_path, cls.config_file))
        # Zero-filled placeholder sized from the config; presumably replaced
        # by the values in the weights file loaded below.
        dummy_embeddings = np.zeros(
            (config.vocab_size, config.word_embedding_size), dtype=np.float32)
        self.model = SeqLabeling(config,
                                 dummy_embeddings,
                                 ntags=len(self.p.vocab_tag))
        self.model.load(filepath=os.path.join(dir_path, cls.weight_file))
        # NOTE(review): private model API; presumably prepares the predict
        # function ahead of the first prediction. Confirm.
        self.model._make_predict_function()

        return self