def test_train(self):
    """Train a SeqLabeling model end to end and persist its artifacts.

    Reads the train/valid splits under DATA_ROOT, fits a preprocessor on
    the training split and pickles it under SAVE_ROOT, loads word
    embeddings from EMBEDDING_PATH, trains through ``anago.Trainer`` and
    finally writes the model weights under SAVE_ROOT.
    """
    model_config = ModelConfig()
    training_config = TrainingConfig()

    # Load both splits; the paths are only needed for these two calls.
    x_train, y_train = load_data_and_labels(
        os.path.join(DATA_ROOT, 'train.txt'))
    x_valid, y_valid = load_data_and_labels(
        os.path.join(DATA_ROOT, 'valid.txt'))

    # Fit the preprocessor on the training split and store it for later use.
    preprocessor = prepare_preprocessor(x_train, y_train)
    preprocessor.save(os.path.join(SAVE_ROOT, 'preprocessor.pkl'))

    embeddings = load_word_embeddings(
        preprocessor.vocab_word,
        EMBEDDING_PATH,
        model_config.word_embedding_size,
    )
    model_config.char_vocab_size = len(preprocessor.vocab_char)

    model = SeqLabeling(model_config, embeddings, len(preprocessor.vocab_tag))

    trainer_options = {
        'checkpoint_path': LOG_ROOT,
        'save_path': SAVE_ROOT,
        'preprocessor': preprocessor,
        'embeddings': embeddings,
    }
    trainer = anago.Trainer(model, training_config, **trainer_options)
    trainer.train(x_train, y_train, x_valid, y_valid)

    model.save(os.path.join(SAVE_ROOT, 'model_weights.h5'))
def train(self, x_train, y_train, x_valid=None, y_valid=None, vocab_init=None, verbose=1):
    """Fit a preprocessor and a SeqLabeling model on the training data.

    The preprocessor and the model configuration are written to
    ``self.log_dir`` before training starts.

    Args:
        x_train: training inputs, passed to ``prepare_preprocessor`` and
            to the trainer.
        y_train: training labels, same usage as ``x_train``.
        x_valid: optional validation inputs forwarded to the trainer.
        y_valid: optional validation labels forwarded to the trainer.
        vocab_init: forwarded to ``prepare_preprocessor``.
        verbose: forwarded to ``Trainer.train``.

    Returns:
        Whatever ``Trainer.train`` returns.
    """
    self.p = prepare_preprocessor(x_train, y_train, vocab_init=vocab_init)
    embeddings = filter_embeddings(self.embeddings, self.p.vocab_word,
                                   self.model_config.word_embedding_size)
    self.model_config.vocab_size = len(self.p.vocab_word)
    self.model_config.char_vocab_size = len(self.p.vocab_char)

    self.model = SeqLabeling(self.model_config, embeddings, len(self.p.vocab_tag))

    if not os.path.exists(self.log_dir):
        # makedirs also creates missing parent directories; report success
        # only once the directory really exists.
        os.makedirs(self.log_dir)
        print('Successfully made a directory: {}'.format(self.log_dir))

    self.p.save(os.path.join(self.log_dir, self.preprocessor_file))
    self.model_config.save(os.path.join(self.log_dir, self.config_file))
    print('Successfully save config and preprocess files')

    trainer = Trainer(self.model, self.training_config,
                      checkpoint_path=self.log_dir, preprocessor=self.p)
    return trainer.train(x_train, y_train, x_valid, y_valid, verbose)
def test_predict(self):
    """Smoke-test ``SeqLabeling.predict`` on the first 100 loaded samples."""
    sentences, labels = load_data_and_labels(self.filename)
    sentences = sentences[:100]
    labels = labels[:100]

    preprocessor = prepare_preprocessor(sentences, labels)
    self.model_config.char_vocab_size = len(preprocessor.vocab_char)

    tag_count = len(preprocessor.vocab_tag)
    model = SeqLabeling(self.model_config, self.embeddings, ntags=tag_count)

    # Only checks that prediction runs; the output is not inspected.
    features = preprocessor.transform(sentences)
    model.predict(features)
def setUp(self):
    """Restore the saved preprocessor and weights and build a Tagger.

    Also stores a sample sentence in ``self.sent`` for the tests.
    """
    preprocessor = WordPreprocessor.load(
        os.path.join(SAVE_ROOT, 'preprocessor.pkl'))

    # Size the model from the vocabularies of the restored preprocessor.
    model_config = ModelConfig()
    model_config.vocab_size = len(preprocessor.vocab_word)
    model_config.char_vocab_size = len(preprocessor.vocab_char)

    tag_count = len(preprocessor.vocab_tag)
    model = SeqLabeling(model_config, ntags=tag_count)
    weights_path = os.path.join(SAVE_ROOT, 'model_weights.h5')
    model.load(filepath=weights_path)

    self.tagger = anago.Tagger(model, preprocessor=preprocessor)
    self.sent = 'President Obama is speaking at the White House.'
def load(cls, dir_path):
    """Build an instance from the artifacts stored in ``dir_path``.

    Reads the preprocessor, the model config and the weights using the
    file names held on ``cls``.

    Args:
        dir_path: directory containing the three saved files.

    Returns:
        A new instance with ``p`` and ``model`` populated.
    """
    instance = cls()

    preprocessor_path = os.path.join(dir_path, cls.preprocessor_file)
    instance.p = WordPreprocessor.load(preprocessor_path)

    config_path = os.path.join(dir_path, cls.config_file)
    config = ModelConfig.load(config_path)

    # Zero-filled matrix used only to construct the model; the weights
    # are loaded from disk right after.
    embedding_shape = (config.vocab_size, config.word_embedding_size)
    dummy_embeddings = np.zeros(embedding_shape, dtype=np.float32)

    tag_count = len(instance.p.vocab_tag)
    instance.model = SeqLabeling(config, dummy_embeddings, ntags=tag_count)
    instance.model.load(filepath=os.path.join(dir_path, cls.weight_file))
    return instance
def __init__(self, config, weights, save_path='', preprocessor=None, tokenizer=str.split):
    """Store the helpers and restore a SeqLabeling model from disk.

    Args:
        config: model configuration passed to ``SeqLabeling``.
        weights: weights file name, joined onto ``save_path``.
        save_path: directory that holds the weights file.
        preprocessor: object exposing ``vocab_tag``. NOTE(review): the
            default ``None`` cannot work because ``vocab_tag`` is read
            unconditionally below.
        tokenizer: callable stored on the instance as ``self.tokenizer``.
    """
    self.preprocessor = preprocessor
    self.tokenizer = tokenizer

    # Build the network first, then load its trained weights.
    tag_count = len(self.preprocessor.vocab_tag)
    self.model = SeqLabeling(config, ntags=tag_count)
    weights_path = os.path.join(save_path, weights)
    self.model.load(filepath=weights_path)
def test_eval(self):
    """Evaluate the saved model on the test split under DATA_ROOT."""
    x_test, y_test = load_data_and_labels(os.path.join(DATA_ROOT, 'test.txt'))

    preprocessor = WordPreprocessor.load(
        os.path.join(SAVE_ROOT, 'preprocessor.pkl'))

    # Size the model from the vocabularies of the restored preprocessor.
    model_config = ModelConfig()
    model_config.vocab_size = len(preprocessor.vocab_word)
    model_config.char_vocab_size = len(preprocessor.vocab_char)

    tag_count = len(preprocessor.vocab_tag)
    model = SeqLabeling(model_config, ntags=tag_count)
    weights_path = os.path.join(SAVE_ROOT, 'model_weights.h5')
    model.load(filepath=weights_path)

    anago.Evaluator(model, preprocessor=preprocessor).eval(x_test, y_test)
def train(self, x_train, y_train, x_valid=None, y_valid=None):
    """Build, compile, fit and save a SeqLabeling model.

    Args:
        x_train: training inputs.
        y_train: training labels.
        x_valid: validation inputs.
        y_valid: validation labels.

    NOTE(review): despite the ``None`` defaults, the validation data is
    zipped unconditionally, so it appears to be required in practice.
    """
    batch_size = self.training_config.batch_size

    # Turn each split into a (steps, generator) pair.
    train_pairs = list(zip(x_train, y_train))
    train_steps, train_batches = batch_iter(
        train_pairs, batch_size, preprocessor=self.preprocessor)
    valid_pairs = list(zip(x_valid, y_valid))
    valid_steps, valid_batches = batch_iter(
        valid_pairs, batch_size, preprocessor=self.preprocessor)

    # Build the network and attach the CRF loss.
    tag_count = len(self.preprocessor.vocab_tag)
    model = SeqLabeling(self.model_config, self.embeddings, tag_count)
    optimizer = Adam(lr=self.training_config.learning_rate)
    model.compile(loss=model.crf.loss, optimizer=optimizer)

    # 'eary_stopping' is spelled as the original call spells the keyword.
    validation = (valid_steps, valid_batches, self.preprocessor)
    callbacks = get_callbacks(
        log_dir=self.checkpoint_path,
        tensorboard=self.tensorboard,
        eary_stopping=self.training_config.early_stopping,
        valid=validation,
    )

    model.fit_generator(
        generator=train_batches,
        steps_per_epoch=train_steps,
        epochs=self.training_config.max_epoch,
        callbacks=callbacks,
    )

    # Persist the trained weights.
    model.save(os.path.join(self.save_path, 'model_weights.h5'))
def eval(self, x_test, y_test):
    """Score the saved model on test data through the F1score callback.

    Args:
        x_test: test inputs.
        y_test: test labels.
    """
    # Turn the test split into a (steps, generator) pair.
    test_pairs = list(zip(x_test, y_test))
    test_steps, test_batches = batch_iter(
        test_pairs, self.config.batch_size, preprocessor=self.preprocessor)

    # Rebuild the network and restore its weights.
    tag_count = len(self.preprocessor.vocab_tag)
    model = SeqLabeling(self.config, ntags=tag_count)
    weights_path = os.path.join(self.save_path, self.weights)
    model.load(filepath=weights_path)

    # Drive the callback by hand; the original notes any integer epoch works.
    scorer = F1score(test_steps, test_batches, self.preprocessor)
    scorer.model = model
    scorer.on_epoch_end(epoch=-1)
def train(self, x_train, y_train, x_valid=None, y_valid=None, vocab_init=None):
    """Fit a preprocessor and a SeqLabeling model on the training data.

    Args:
        x_train: training inputs.
        y_train: training labels.
        x_valid: validation inputs forwarded to the trainer.
        y_valid: validation labels forwarded to the trainer.
        vocab_init: forwarded to ``prepare_preprocessor``.
    """
    self.p = prepare_preprocessor(x_train, y_train, vocab_init=vocab_init)
    preprocessor = self.p
    config = self.model_config

    # Restrict the embeddings to the words the preprocessor knows.
    embeddings = filter_embeddings(
        self.embeddings, preprocessor.vocab_word, config.word_embedding_size)
    config.vocab_size = len(preprocessor.vocab_word)
    config.char_vocab_size = len(preprocessor.vocab_char)

    self.model = SeqLabeling(config, embeddings, len(preprocessor.vocab_tag))

    trainer_options = {
        'checkpoint_path': self.log_dir,
        'preprocessor': preprocessor,
        'save_path': './models',
    }
    trainer = Trainer(self.model, self.training_config, **trainer_options)
    trainer.train(x_train, y_train, x_valid, y_valid)
# Script: load a trained SeqLabeling model and print one "word<TAB>label"
# line per token for every sentence in the small training file.
DATA_ROOT = 'data/conll2003/en/ner'
LOAD_ROOT = './models'  # trained model
LOG_ROOT = './logs'  # checkpoint, tensorboard
embedding_path = '/media/jan/OS/Dataset/WordEmbeddings/wiki.en.vec'
model_config = ModelConfig()

test_path = os.path.join(DATA_ROOT, 'train.small.txt')
x_test, y_test = load_data_and_labels(test_path)

# The preprocessor is rebuilt from the same data that is then tagged.
p = prepare_preprocessor(x_test, y_test)

embeddings = load_word_embeddings(p.vocab_word, embedding_path,
                                  model_config.word_embedding_size)
model_config.vocab_size = len(p.vocab_word)
model_config.char_vocab_size = len(p.vocab_char)

model_path = os.path.join(LOAD_ROOT, 'mymodel.h5')
model = SeqLabeling(model_config, embeddings, len(p.vocab_tag))
model.load(model_path)

X, y = p.transform(x_test, y_test)
predictions = model.predict(X)

# X[2] is iterated alongside the predictions as per-sentence lengths.
for words, prediction, sentence_length in zip(x_test, predictions, X[2]):
    # Drop the padded positions before decoding.
    nopad_prediction = prediction[:sentence_length.item()]
    label_indices = [np.argmax(x) for x in nopad_prediction]
    labels = p.inverse_transform(label_indices)

    # Single-argument print(...) gives the same output on Python 2 and 3.
    print("\n".join(["{}\t{}".format(w, l) for w, l in zip(words, labels)]))
    print('')
class Tagger(object):
    """Tag sentences with a SeqLabeling model restored from disk."""

    def __init__(self, config, weights, save_path='', preprocessor=None, tokenizer=str.split):
        """Store the helpers and restore the model.

        Args:
            config: model configuration passed to ``SeqLabeling``.
            weights: weights file name, joined onto ``save_path``.
            save_path: directory that holds the weights file.
            preprocessor: object exposing ``vocab_tag``, ``transform`` and
                ``inverse_transform``. NOTE(review): the default ``None``
                cannot work because ``vocab_tag`` is read unconditionally.
            tokenizer: callable that splits a sentence into words.
        """
        self.preprocessor = preprocessor
        self.tokenizer = tokenizer

        # Build the network first, then load its trained weights.
        tag_count = len(self.preprocessor.vocab_tag)
        self.model = SeqLabeling(config, ntags=tag_count)
        weights_path = os.path.join(save_path, weights)
        self.model.load(filepath=weights_path)

    def predict(self, words):
        """Return the decoded labels for one tokenized sentence.

        Args:
            words: sequence of words forming a single sentence.

        Returns:
            The result of ``preprocessor.inverse_transform`` applied to the
            highest-scoring label index at each position.
        """
        lengths = [len(words)]
        batch = self.preprocessor.transform([words])
        scores = self.model.predict(batch, lengths)
        best_indices = np.argmax(scores, -1)
        # The batch holds one sentence, so keep only its row.
        return self.preprocessor.inverse_transform(best_indices[0])

    def tag(self, sent):
        """Tag each word of a sentence.

        Args:
            sent: a sentence (``str``).

        Return:
            list of (word, tag) pairs; each tag keeps only the text after
            its last '-' (e.g. 'B-Person' -> 'Person', 'O' -> 'O').

        Example:
            >>> sent = 'President Obama is speaking at the White House.'
            >>> print(self.tag(sent))
        """
        assert isinstance(sent, str)

        tokens = self.tokenizer(sent)
        stripped = []
        for label in self.predict(tokens):
            # remove prefix: e.g. B-Person -> Person
            stripped.append(label.split('-')[-1])
        return list(zip(tokens, stripped))

    def get_entities(self, sent):
        """Extract the entities found in a sentence.

        Args:
            sent: a sentence (``str``).

        Return:
            dict-like mapping from entity type to a list of entity
            strings, as produced by ``_get_chunks``.
        """
        assert isinstance(sent, str)

        tokens = self.tokenizer(sent)
        labels = self.predict(tokens)
        return self._get_chunks(tokens, labels)

    def _get_chunks(self, words, tags):
        """Group the words of each chunk under the chunk's type.

        Args:
            words: sequence of words.
            tags: sequence of labels, one per word.

        Returns:
            ``defaultdict(list)`` mapping chunk type to the space-joined
            words of every chunk of that type.
        """
        grouped = defaultdict(list)
        # Uses the module-level ``get_entities`` helper, not the method.
        # NOTE(review): the slice assumes an exclusive chunk end - confirm.
        for kind, start, end in get_entities(tags):
            phrase = ' '.join(words[start:end])  # todo delimiter changeable
            grouped[kind].append(phrase)
        return grouped
def test_compile(self):
    """Check that a SeqLabeling model compiles with its CRF loss and Adam."""
    model = SeqLabeling(self.model_config, self.embeddings, ntags=10)
    optimizer = Adam(lr=self.training_config.learning_rate)
    crf_loss = model.crf.loss
    model.compile(loss=crf_loss, optimizer=optimizer)
def test_build(self):
    """Check that a SeqLabeling model can be constructed without error."""
    # Construction is the whole test; the instance is not used afterwards.
    model = SeqLabeling(
        self.model_config,
        self.embeddings,
        ntags=10,
    )
# Script: train a SeqLabeling model on the small CoNLL-2003 splits and save
# its weights under SAVE_ROOT.

# --- Locations -------------------------------------------------------------
DATA_ROOT = 'data/conll2003/en/ner'
SAVE_ROOT = './models'  # trained model
LOG_ROOT = './logs'  # checkpoint, tensorboard
embedding_path = '/media/jan/OS/Dataset/WordEmbeddings/wiki.en.vec'

# --- Configuration ---------------------------------------------------------
model_config = ModelConfig()
training_config = TrainingConfig()

model_path = os.path.join(SAVE_ROOT, 'mymodel.h5')
train_path = os.path.join(DATA_ROOT, 'train.small.txt')
valid_path = os.path.join(DATA_ROOT, 'valid.small.txt')

# --- Data and preprocessing ------------------------------------------------
x_train, y_train = load_data_and_labels(train_path)
x_valid, y_valid = load_data_and_labels(valid_path)

p = prepare_preprocessor(x_train, y_train)
embeddings = load_word_embeddings(
    p.vocab_word,
    embedding_path,
    model_config.word_embedding_size,
)
model_config.vocab_size = len(p.vocab_word)
model_config.char_vocab_size = len(p.vocab_char)

# --- Training --------------------------------------------------------------
model = SeqLabeling(model_config, embeddings, len(p.vocab_tag))
trainer = anago.Trainer(
    model,
    training_config,
    checkpoint_path=LOG_ROOT,
    save_path=SAVE_ROOT,
    preprocessor=p,
)
trainer.train(x_train, y_train, x_valid, y_valid)

# NOTE(review): the evaluator is constructed but never used in this script.
evaluator = anago.Evaluator(model, preprocessor=p)

model.save(model_path)
class Sequence(object):
    """High-level wrapper that trains, evaluates, saves and loads a tagger."""

    # File names used inside a model directory by train(), save() and load().
    config_file = 'config.json'
    weight_file = 'model_weights.h5'
    preprocessor_file = 'preprocessor.pkl'

    def __init__(self, char_emb_size=25, word_emb_size=100, char_lstm_units=25,
                 word_lstm_units=100, dropout=0.5, char_feature=True, crf=True,
                 batch_size=1024, optimizer='adam', learning_rate=0.001, lr_decay=0.9,
                 clip_gradients=5.0, max_epoch=15, early_stopping=True, patience=3,
                 train_embeddings=True, max_checkpoints_to_keep=5, log_dir=None,
                 embeddings=()):
        """Collect the hyper-parameters into model and training configs.

        The first seven arguments are passed positionally to
        ``ModelConfig`` and the next ten to ``TrainingConfig``.

        Args:
            log_dir: directory where ``train`` writes the preprocessor and
                config files and that it passes as the checkpoint path.
            embeddings: pre-trained embeddings handed to
                ``filter_embeddings`` during ``train``.
        """
        self.model_config = ModelConfig(char_emb_size, word_emb_size, char_lstm_units,
                                        word_lstm_units, dropout, char_feature, crf)
        self.training_config = TrainingConfig(batch_size, optimizer, learning_rate,
                                              lr_decay, clip_gradients, max_epoch,
                                              early_stopping, patience, train_embeddings,
                                              max_checkpoints_to_keep)
        self.model = None
        self.p = None
        self.log_dir = log_dir
        self.embeddings = embeddings

    @staticmethod
    def _ensure_dir(path):
        """Create ``path`` (with missing parents) if absent and report it."""
        if not os.path.exists(path):
            os.makedirs(path)
            # Report success only after the directory really exists.
            print('Successfully made a directory: {}'.format(path))

    def train(self, x_train, y_train, x_valid=None, y_valid=None, vocab_init=None, verbose=1):
        """Fit a preprocessor and a SeqLabeling model on the training data.

        The preprocessor and the model configuration are written to
        ``self.log_dir`` before training starts, so ``log_dir`` must be a
        usable path by then.

        Args:
            x_train: training inputs.
            y_train: training labels.
            x_valid: optional validation inputs forwarded to the trainer.
            y_valid: optional validation labels forwarded to the trainer.
            vocab_init: forwarded to ``prepare_preprocessor``.
            verbose: forwarded to ``Trainer.train``.

        Returns:
            Whatever ``Trainer.train`` returns.
        """
        self.p = prepare_preprocessor(x_train, y_train, vocab_init=vocab_init)
        embeddings = filter_embeddings(self.embeddings, self.p.vocab_word,
                                       self.model_config.word_embedding_size)
        self.model_config.vocab_size = len(self.p.vocab_word)
        self.model_config.char_vocab_size = len(self.p.vocab_char)

        self.model = SeqLabeling(self.model_config, embeddings, len(self.p.vocab_tag))

        self._ensure_dir(self.log_dir)
        self.p.save(os.path.join(self.log_dir, self.preprocessor_file))
        self.model_config.save(os.path.join(self.log_dir, self.config_file))
        print('Successfully save config and preprocess files')

        trainer = Trainer(self.model, self.training_config,
                          checkpoint_path=self.log_dir, preprocessor=self.p)
        return trainer.train(x_train, y_train, x_valid, y_valid, verbose)

    def eval(self, x_test, y_test):
        """Evaluate the current model on test data.

        Raises:
            OSError: if no model has been trained or loaded yet.
        """
        if not self.model:
            raise OSError('Could not find a model. Call load(dir_path).')
        evaluator = Evaluator(self.model, preprocessor=self.p)
        evaluator.eval(x_test, y_test)

    def analyze(self, words):
        """Run the tagger's ``analyze`` on ``words`` and return its result.

        Raises:
            OSError: if no model has been trained or loaded yet.
        """
        if not self.model:
            raise OSError('Could not find a model. Call load(dir_path).')
        tagger = Tagger(self.model, preprocessor=self.p)
        return tagger.analyze(words)

    def save(self, dir_path):
        """Write the preprocessor, model config and weights into ``dir_path``."""
        self.p.save(os.path.join(dir_path, self.preprocessor_file))
        self.model_config.save(os.path.join(dir_path, self.config_file))
        self.model.save(os.path.join(dir_path, self.weight_file))

    @classmethod
    def load(cls, dir_path):
        """Build an instance from the artifacts stored in ``dir_path``.

        Returns:
            A new instance with ``p`` and ``model`` populated.
        """
        self = cls()
        self.p = WordPreprocessor.load(
            os.path.join(dir_path, cls.preprocessor_file))
        config = ModelConfig.load(os.path.join(dir_path, cls.config_file))

        # Zero-filled matrix used only to construct the model; the weights
        # are loaded from disk right after.
        dummy_embeddings = np.zeros(
            (config.vocab_size, config.word_embedding_size), dtype=np.float32)
        self.model = SeqLabeling(config, dummy_embeddings,
                                 ntags=len(self.p.vocab_tag))
        self.model.load(filepath=os.path.join(dir_path, cls.weight_file))
        self.model._make_predict_function()
        return self