def load_and_predict():
    test = pd.read_csv(config.data_folder + "test.csv",
                       converters={"pos": literal_eval})
    x_test = [x.split() for x in test['sentence'].tolist()]

    p = IndexTransformer(use_char=True)
    p = p.load('../models/best_transform.it')

    model = BiLSTMCRF(char_vocab_size=p.char_vocab_size,
                      word_vocab_size=p.word_vocab_size,
                      num_labels=p.label_size,
                      word_embedding_dim=300,
                      char_embedding_dim=100,
                      word_lstm_size=100,
                      char_lstm_size=50,
                      fc_dim=100,
                      dropout=0.5,
                      embeddings=None,
                      use_char=True,
                      use_crf=True)
    model, loss = model.build()
    model.load_weights('../models/' + 'best_model.h5')

    predict(model, p, x_test)
def main(args):
    print('Loading dataset...')
    x_train, y_train = load_data_and_labels(args.train_data)
    x_valid, y_valid = load_data_and_labels(args.valid_data)
    x_test, y_test = load_data_and_labels(args.test_data)
    x_train = np.r_[x_train, x_valid, x_test]
    y_train = np.r_[y_train, y_valid, y_test]

    print('Transforming datasets...')
    p = IndexTransformer(use_char=args.no_char_feature)
    p.fit(x_train, y_train)

    print('Building a model.')
    model = BiLSTMCRF(char_embedding_dim=args.char_emb_size,
                      word_embedding_dim=args.word_emb_size,
                      char_lstm_size=args.char_lstm_units,
                      word_lstm_size=args.word_lstm_units,
                      char_vocab_size=p.char_vocab_size,
                      word_vocab_size=p.word_vocab_size,
                      num_labels=p.label_size,
                      dropout=args.dropout,
                      use_char=args.no_char_feature,
                      use_crf=args.no_use_crf)
    model, loss = model.build()
    model.compile(loss=loss, optimizer='adam')

    print('Training the model...')
    trainer = Trainer(model, preprocessor=p)
    trainer.train(x_train, y_train, x_valid, y_valid, epochs=args.max_epoch)

    print('Saving the model...')
    save_model(model, args.weights_file, args.params_file)
    p.save(args.preprocessor_file)
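# A hypothetical argparse driver for the training entry point above. The flag names
# mirror the attributes read from `args`; the default values are illustrative only and
# not taken from any particular source. Note that --no_char_feature and --no_use_crf
# use action='store_false', which is why their values are passed straight through to
# use_char and use_crf.
import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Train a BiLSTM-CRF sequence tagger.')
    parser.add_argument('--train_data', required=True, help='path to training data')
    parser.add_argument('--valid_data', required=True, help='path to validation data')
    parser.add_argument('--test_data', required=True, help='path to test data')
    parser.add_argument('--weights_file', default='weights.h5')
    parser.add_argument('--params_file', default='params.json')
    parser.add_argument('--preprocessor_file', default='preprocessor.pickle')
    parser.add_argument('--char_emb_size', type=int, default=25)
    parser.add_argument('--word_emb_size', type=int, default=100)
    parser.add_argument('--char_lstm_units', type=int, default=25)
    parser.add_argument('--word_lstm_units', type=int, default=100)
    parser.add_argument('--dropout', type=float, default=0.5)
    parser.add_argument('--max_epoch', type=int, default=15)
    parser.add_argument('--no_char_feature', action='store_false', help='disable character features')
    parser.add_argument('--no_use_crf', action='store_false', help='disable the CRF layer')
    main(parser.parse_args())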
def fit(self, x_train, y_train, x_valid=None, y_valid=None,
        epochs=1, batch_size=32, verbose=1, callbacks=None, shuffle=True):
    """Fit the model for a fixed number of epochs.

    Args:
        x_train: list of training data.
        y_train: list of training target (label) data.
        x_valid: list of validation data.
        y_valid: list of validation target (label) data.
        batch_size: Integer. Number of samples per gradient update.
            If unspecified, `batch_size` will default to 32.
        epochs: Integer. Number of epochs to train the model.
        verbose: Integer. 0, 1, or 2. Verbosity mode.
            0 = silent, 1 = progress bar, 2 = one line per epoch.
        callbacks: List of `keras.callbacks.Callback` instances.
            List of callbacks to apply during training.
        shuffle: Boolean (whether to shuffle the training data
            before each epoch). `shuffle` will default to True.
    """
    p = IndexTransformer(initial_vocab=self.initial_vocab, use_char=self.use_char)
    p.fit(x_train, y_train)
    embeddings = filter_embeddings(self.embeddings, p._word_vocab.vocab,
                                   self.word_embedding_dim)

    model = BiLSTMCRF(char_vocab_size=p.char_vocab_size,
                      word_vocab_size=p.word_vocab_size,
                      num_labels=p.label_size,
                      word_embedding_dim=self.word_embedding_dim,
                      char_embedding_dim=self.char_embedding_dim,
                      word_lstm_size=self.word_lstm_size,
                      char_lstm_size=self.char_lstm_size,
                      fc_dim=self.fc_dim,
                      dropout=self.dropout,
                      embeddings=embeddings,
                      use_char=self.use_char,
                      use_crf=self.use_crf)
    model, loss = model.build()
    model.compile(loss=loss, optimizer=self.optimizer)

    trainer = Trainer(model, preprocessor=p)
    trainer.train(x_train, y_train, x_valid, y_valid,
                  epochs=epochs, batch_size=batch_size,
                  verbose=verbose, callbacks=callbacks,
                  shuffle=shuffle)

    self.p = p
    self.model = model
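# Minimal usage sketch for the fit() wrapper above, assuming it is a method of an
# anago Sequence-style wrapper object. The toy sentences and tags are illustrative;
# x_* and y_* are parallel lists of token lists and label lists.
x_train = [['EU', 'rejects', 'German', 'call'], ['Peter', 'Blackburn']]
y_train = [['B-ORG', 'O', 'B-MISC', 'O'], ['B-PER', 'I-PER']]

model = Sequence()                     # hypothetical wrapper exposing the fit() above
model.fit(x_train, y_train, epochs=1)  # fits the preprocessor and trains BiLSTMCRF internally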
def main(args):
    print('Loading objects...')
    model = BiLSTMCRF.load(args.weights_file, args.params_file)
    it = IndexTransformer.load(args.preprocessor_file)
    tagger = Tagger(model, preprocessor=it)

    print('Tagging a sentence...')
    res = tagger.analyze(args.sent)
    pprint(res)
def test_save_and_load(self):
    char_vocab_size = 100
    word_vocab_size = 10000
    num_labels = 10

    model = BiLSTMCRF(char_vocab_size=char_vocab_size,
                      word_vocab_size=word_vocab_size,
                      num_labels=num_labels)
    model, loss = model.build()

    self.assertFalse(os.path.exists(self.weights_file))
    self.assertFalse(os.path.exists(self.params_file))

    save_model(model, self.weights_file, self.params_file)

    self.assertTrue(os.path.exists(self.weights_file))
    self.assertTrue(os.path.exists(self.params_file))

    model = load_model(self.weights_file, self.params_file)
def main(args):
    print('Loading dataset...')
    x_train, y_train = load_data_and_labels(args.train_data)
    x_valid, y_valid = load_data_and_labels(args.valid_data)

    print('Transforming datasets...')
    p = IndexTransformer(use_char=args.no_char_feature)
    p.fit(x_train, y_train)

    print('Building a model.')
    model = BiLSTMCRF(char_embedding_dim=args.char_emb_size,
                      word_embedding_dim=args.word_emb_size,
                      char_lstm_size=args.char_lstm_units,
                      word_lstm_size=args.word_lstm_units,
                      char_vocab_size=p.char_vocab_size,
                      word_vocab_size=p.word_vocab_size,
                      num_labels=p.label_size,
                      dropout=args.dropout,
                      use_char=args.no_char_feature,
                      use_crf=args.no_use_crf)
    model.build()

    print('Training the model...')
    trainer = Trainer(model, preprocessor=p)
    trainer.train(x_train, y_train, x_valid, y_valid)

    print('Saving the model...')
    model.save(args.weights_file, args.params_file)
    p.save(args.preprocessor_file)
def main(args):
    print('Loading datasets...')
    X, y = load_data_and_labels(args.data_path)
    x_train, x_valid, y_train, y_valid = train_test_split(X, y,
                                                          test_size=0.1,
                                                          random_state=42)
    embeddings = KeyedVectors.load(args.embedding_path).wv

    print('Transforming datasets...')
    p = IndexTransformer()
    p.fit(X, y)
    embeddings = filter_embeddings(embeddings, p._word_vocab, embeddings.vector_size)

    print('Building a model...')
    model = BiLSTMCRF(char_vocab_size=p.char_vocab_size,
                      word_vocab_size=p.word_vocab_size,
                      num_labels=p.label_size,
                      embeddings=embeddings,
                      char_embedding_dim=50)
    model.build()

    print('Training the model...')
    trainer = Trainer(model, preprocessor=p)
    trainer.train(x_train, y_train, x_valid, y_valid)

    print('Saving the model...')
    model.save(args.weights_file, args.params_file)
    p.save(args.preprocessor_file)
@classmethod
def setUpClass(cls):
    weights_file = os.path.join(SAVE_ROOT, 'weights.h5')
    params_file = os.path.join(SAVE_ROOT, 'params.json')
    preprocessor_file = os.path.join(SAVE_ROOT, 'preprocessor.pickle')

    # Load preprocessor.
    p = IndexTransformer.load(preprocessor_file)

    # Load the model.
    model = BiLSTMCRF.load(weights_file, params_file)

    # Build a tagger.
    cls.tagger = anago.Tagger(model, preprocessor=p)
    cls.sent = 'President Obama is speaking at the White House.'
def test_save_and_load(self):
    char_vocab_size = 100
    word_vocab_size = 10000
    num_labels = 10

    model = BiLSTMCRF(char_vocab_size=char_vocab_size,
                      word_vocab_size=word_vocab_size,
                      num_labels=num_labels)
    model.build()

    self.assertFalse(os.path.exists(self.weights_file))
    self.assertFalse(os.path.exists(self.params_file))

    model.save(self.weights_file, self.params_file)

    self.assertTrue(os.path.exists(self.weights_file))
    self.assertTrue(os.path.exists(self.params_file))

    model = BiLSTMCRF.load(self.weights_file, self.params_file)

    self.assertEqual(model._char_vocab_size, char_vocab_size)
    self.assertEqual(model._word_vocab_size, word_vocab_size)
    self.assertEqual(model._num_labels, num_labels)
def test_build_model(self):
    char_vocab_size = 100
    word_vocab_size = 10000
    num_labels = 10

    # Normal.
    model = BiLSTMCRF(char_vocab_size=char_vocab_size,
                      word_vocab_size=word_vocab_size,
                      num_labels=num_labels)
    model.build()

    # No CRF.
    model = BiLSTMCRF(char_vocab_size=char_vocab_size,
                      word_vocab_size=word_vocab_size,
                      num_labels=num_labels,
                      use_crf=False)
    model.build()

    # No character feature.
    model = BiLSTMCRF(char_vocab_size=char_vocab_size,
                      word_vocab_size=word_vocab_size,
                      num_labels=num_labels,
                      use_char=False)
    model.build()
def training(train, test):
    x_train = [x.split() for x in train['sentence'].tolist()]
    y_train = train['tag'].tolist()
    x_train, x_val, y_train, y_val = train_test_split(x_train, y_train,
                                                      train_size=0.8,
                                                      random_state=233)

    print('Transforming datasets...')
    p = IndexTransformer(use_char=True)
    p.fit(x_train, y_train)

    embeddings = load_glove(config.glove_file)
    embeddings = filter_embeddings(embeddings, p._word_vocab.vocab, config.glove_size)

    model = BiLSTMCRF(char_vocab_size=p.char_vocab_size,
                      word_vocab_size=p.word_vocab_size,
                      num_labels=p.label_size,
                      word_embedding_dim=300,
                      char_embedding_dim=100,
                      word_lstm_size=100,
                      char_lstm_size=50,
                      fc_dim=100,
                      dropout=0.5,
                      embeddings=embeddings,
                      use_char=True,
                      use_crf=True)
    opt = Adam(lr=0.001)
    model, loss = model.build()
    model.compile(loss=loss, optimizer=opt, metrics=[crf_viterbi_accuracy])

    filepath = '../models/' + 'best_model'
    ckp = ModelCheckpoint(filepath + '.h5',
                          monitor='val_crf_viterbi_accuracy',
                          verbose=1, save_best_only=True, mode='max',
                          save_weights_only=True)
    es = EarlyStopping(monitor='val_crf_viterbi_accuracy',
                       min_delta=0.00001, patience=3, verbose=1, mode='max')
    rlr = ReduceLROnPlateau(monitor='val_crf_viterbi_accuracy',
                            factor=0.2, patience=2, verbose=1, mode='max',
                            min_delta=0.0001)
    callbacks = [ckp, es, rlr]

    train_seq = NERSequence(x_train, y_train, config.batch_size, p.transform)

    if x_val and y_val:
        valid_seq = NERSequence(x_val, y_val, config.batch_size, p.transform)
        f1 = F1score(valid_seq, preprocessor=p)
        callbacks.append(f1)

    model.fit_generator(generator=train_seq,
                        validation_data=valid_seq,
                        epochs=config.nepochs,
                        callbacks=callbacks,
                        verbose=True,
                        shuffle=True,
                        use_multiprocessing=True,
                        workers=42)
class TestTrainer(unittest.TestCase):

    @classmethod
    def setUpClass(cls):
        if not os.path.exists(LOG_ROOT):
            os.mkdir(LOG_ROOT)
        if not os.path.exists(SAVE_ROOT):
            os.mkdir(SAVE_ROOT)
        cls.weights_file = os.path.join(SAVE_ROOT, 'weights.h5')
        cls.params_file = os.path.join(SAVE_ROOT, 'params.json')
        cls.preprocessor_file = os.path.join(SAVE_ROOT, 'preprocessor.pickle')

    def setUp(self):
        # Load datasets.
        train_path = os.path.join(DATA_ROOT, 'train.txt')
        valid_path = os.path.join(DATA_ROOT, 'valid.txt')
        self.x_train, self.y_train = load_data_and_labels(train_path)
        self.x_valid, self.y_valid = load_data_and_labels(valid_path)

        # Fit transformer.
        self.p = IndexTransformer()
        self.p.fit(self.x_train, self.y_train)

        # Build a model.
        self.model = BiLSTMCRF(char_vocab_size=self.p.char_vocab_size,
                               word_vocab_size=self.p.word_vocab_size,
                               num_labels=self.p.label_size)
        self.model, loss = self.model.build()
        self.model.compile(loss=loss, optimizer='adam')

    def test_train(self):
        trainer = Trainer(self.model, preprocessor=self.p)
        trainer.train(self.x_train, self.y_train,
                      x_valid=self.x_valid, y_valid=self.y_valid)

    def test_train_no_valid(self):
        trainer = Trainer(self.model, preprocessor=self.p)
        trainer.train(self.x_train, self.y_train)

    def test_train_no_crf(self):
        model = BiLSTMCRF(char_vocab_size=self.p.char_vocab_size,
                          word_vocab_size=self.p.word_vocab_size,
                          num_labels=self.p.label_size,
                          use_crf=False)
        model, loss = model.build()
        model.compile(loss=loss, optimizer='adam')
        trainer = Trainer(model, preprocessor=self.p)
        trainer.train(self.x_train, self.y_train,
                      x_valid=self.x_valid, y_valid=self.y_valid)

    def test_train_no_character(self):
        p = IndexTransformer(use_char=False)
        p.fit(self.x_train, self.y_train)
        model = BiLSTMCRF(word_vocab_size=p.word_vocab_size,
                          num_labels=p.label_size,
                          use_crf=False,
                          use_char=False)
        model, loss = model.build()
        model.compile(loss=loss, optimizer='adam')
        trainer = Trainer(model, preprocessor=p)
        trainer.train(self.x_train, self.y_train,
                      x_valid=self.x_valid, y_valid=self.y_valid)

    def test_save(self):
        # Train the model.
        trainer = Trainer(self.model, preprocessor=self.p)
        trainer.train(self.x_train, self.y_train)

        # Save the model.
        save_model(self.model, self.weights_file, self.params_file)
        self.p.save(self.preprocessor_file)
class BiLstmCrfNER(NERModel):

    def __init__(self,
                 word_embedding_dim=100,
                 char_embedding_dim=25,
                 word_lstm_size=100,
                 char_lstm_size=25,
                 fc_dim=100,
                 dropout=0.5,
                 embeddings=None,
                 use_char=True,
                 use_crf=True,
                 batch_size=16,
                 learning_rate=0.001,
                 max_iter=10):
        """ Construct a BiLSTM-CRF NER model. The model is augmented with
            character-level embeddings as well as word embeddings by default.
            The implementation is provided by the Anago project.

            Parameters
            ----------
            word_embedding_dim : int, optional, default 100
                word embedding dimensions.
            char_embedding_dim : int, optional, default 25
                character embedding dimensions.
            word_lstm_size : int, optional, default 100
                output dimensions of the word-level tagger LSTM.
            char_lstm_size : int, optional, default 25
                output dimensions of the character-level LSTM feature extractor.
            fc_dim : int, optional, default 100
                output fully-connected layer size.
            dropout : float, optional, default 0.5
                dropout rate.
            embeddings : numpy array
                word embedding matrix.
            use_char : bool, optional, default True
                add character features.
            use_crf : bool, optional, default True
                use a CRF as the last layer.
            batch_size : int, optional, default 16
                training batch size.
            learning_rate : float, optional, default 0.001
                learning rate for the Adam optimizer.
            max_iter : int
                number of training epochs.

            Attributes
            ----------
            preprocessor_ : reference to the preprocessor
            model_ : reference to the generated model
            trainer_ : internal reference to the Anago Trainer (model)
            tagger_ : internal reference to the Anago Tagger (predictor)
        """
        super().__init__()
        self.word_embedding_dim = word_embedding_dim
        self.char_embedding_dim = char_embedding_dim
        self.word_lstm_size = word_lstm_size
        self.char_lstm_size = char_lstm_size
        self.fc_dim = fc_dim
        self.dropout = dropout
        # assign the constructor arguments rather than hard-coded defaults
        self.embeddings = embeddings
        self.use_char = use_char
        self.use_crf = use_crf
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.max_iter = max_iter
        # populated by fit() and load(), expected by save() and transform()
        self.preprocessor_ = None
        self.model_ = None
        self.trainer_ = None
        self.tagger_ = None

    def fit(self, X, y):
        """ Trains the NER model. Input is a list of list of tokens and tags.

            Parameters
            ----------
            X : list(list(str))
                list of list of tokens
            y : list(list(str))
                list of list of BIO tags

            Returns
            -------
            self
        """
        log.info("Preprocessing dataset...")
        self.preprocessor_ = IndexTransformer(use_char=self.use_char)
        self.preprocessor_.fit(X, y)

        log.info("Building model...")
        self.model_ = BiLSTMCRF(
            char_embedding_dim=self.char_embedding_dim,
            word_embedding_dim=self.word_embedding_dim,
            char_lstm_size=self.char_lstm_size,
            word_lstm_size=self.word_lstm_size,
            char_vocab_size=self.preprocessor_.char_vocab_size,
            word_vocab_size=self.preprocessor_.word_vocab_size,
            num_labels=self.preprocessor_.label_size,
            dropout=self.dropout,
            use_char=self.use_char,
            use_crf=self.use_crf)
        self.model_, loss = self.model_.build()
        optimizer = Adam(lr=self.learning_rate)
        self.model_.compile(loss=loss, optimizer=optimizer)
        self.model_.summary()

        log.info('Training the model...')
        self.trainer_ = Trainer(self.model_, preprocessor=self.preprocessor_)
        x_train, x_valid, y_train, y_valid = train_test_split(X, y,
                                                              test_size=0.1,
                                                              random_state=42)
        self.trainer_.train(x_train, y_train,
                            x_valid=x_valid, y_valid=y_valid,
                            batch_size=self.batch_size,
                            epochs=self.max_iter)

        self.tagger_ = Tagger(self.model_, preprocessor=self.preprocessor_)
        return self

    def predict(self, X):
        """ Predicts using the NER model.

            Parameters
            ----------
            X : list(list(str))
                list of list of tokens.

            Returns
            -------
            y : list(list(str))
                list of list of predicted BIO tags.
        """
        if self.tagger_ is None:
            raise ValueError("No tagger found, either run fit() to train or load() a trained model")
        log.info("Predicting from model...")
        ypreds = [self.tagger_.predict(" ".join(x)) for x in X]
        return ypreds

    def save(self, dirpath):
        """ Saves the model to local disk, given a dirpath.

            Parameters
            ----------
            dirpath : str
                a directory where model artifacts will be saved. The model saves a
                weights.h5 weights file, a params.json parameter file, and a
                preprocessor.pkl preprocessor file.

            Returns
            -------
            None
        """
        if self.model_ is None or self.preprocessor_ is None:
            raise ValueError("No model artifacts to save, either run fit() to train or load() a trained model")
        if not os.path.exists(dirpath):
            os.makedirs(dirpath)
        weights_file = os.path.join(dirpath, "weights.h5")
        params_file = os.path.join(dirpath, "params.json")
        preprocessor_file = os.path.join(dirpath, "preprocessor.pkl")
        save_model(self.model_, weights_file, params_file)
        self.preprocessor_.save(preprocessor_file)
        write_param_file(self.get_params(), os.path.join(dirpath, "params.yaml"))

    def load(self, dirpath):
        """ Loads a trained model from local disk, given the dirpath.

            Parameters
            ----------
            dirpath : str
                a directory where model artifacts are saved.

            Returns
            -------
            self
        """
        if not os.path.exists(dirpath):
            raise ValueError("Model directory not found: {:s}".format(dirpath))
        weights_file = os.path.join(dirpath, "weights.h5")
        params_file = os.path.join(dirpath, "params.json")
        preprocessor_file = os.path.join(dirpath, "preprocessor.pkl")
        # all three artifacts must exist for the model to be loadable
        if not (os.path.exists(weights_file) and os.path.exists(params_file)
                and os.path.exists(preprocessor_file)):
            raise ValueError("Model files may be corrupted, exiting")
        self.model_ = load_model(weights_file, params_file)
        self.preprocessor_ = IndexTransformer.load(preprocessor_file)
        self.tagger_ = Tagger(self.model_, preprocessor=self.preprocessor_)
        return self
@classmethod
def load(cls, weights_file, params_file, preprocessor_file):
    self = cls()
    self.p = IndexTransformer.load(preprocessor_file)
    self.model = BiLSTMCRF.load(weights_file, params_file)
    return self
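# Usage sketch for the load() classmethod above, assuming it belongs to the same
# Sequence-style wrapper as the fit() shown earlier; the file names are illustrative.
model = Sequence.load('weights.h5', 'params.json', 'preprocessor.pickle')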
def training(train, test, fold):
    x_train = [x.split() for x in train['sentence'].tolist()]
    y_train = train['tag'].tolist()
    x_test = [x.split() for x in test['sentence'].tolist()]

    print('Transforming datasets...')
    p = IndexTransformer(use_char=True)
    p.fit(x_train + x_test, y_train)

    skf = KFold(n_splits=config.nfolds, random_state=config.seed, shuffle=True)

    embeddings = load_glove(config.glove_file)
    # embeddings_fast = load_glove(config.glove_file)
    embeddings_wang = load_glove(config.wang_file)
    embeddings = filter_embeddings(embeddings, p._word_vocab.vocab, config.glove_size)
    # embeddings_fast = filter_embeddings(embeddings_fast, p._word_vocab.vocab, config.fasttext_size)
    embeddings_wang = filter_embeddings(embeddings_wang, p._word_vocab.vocab, config.wang_size)
    embeddings = np.concatenate((embeddings, embeddings_wang), axis=1)

    for n_fold, (train_indices, val_indices) in enumerate(skf.split(x_train)):
        if n_fold >= fold:
            print("Training fold: ", n_fold)
            x_val = list(np.array(x_train)[val_indices])
            y_val = list(np.array(y_train)[val_indices])
            x_train_spl = list(np.array(x_train)[train_indices])
            y_train_spl = list(np.array(y_train)[train_indices])

            model = BiLSTMCRF(char_vocab_size=p.char_vocab_size,
                              word_vocab_size=p.word_vocab_size,
                              num_labels=p.label_size,
                              word_embedding_dim=1200,
                              char_embedding_dim=50,
                              word_lstm_size=300,
                              char_lstm_size=300,
                              fc_dim=50,
                              dropout=0.5,
                              embeddings=embeddings,
                              use_char=True,
                              use_crf=True)
            opt = Adam(lr=0.001)
            model, loss = model.build()
            model.compile(loss=loss, optimizer=opt, metrics=[crf_viterbi_accuracy])

            es = EarlyStopping(monitor='val_crf_viterbi_accuracy',
                               patience=3, verbose=1, mode='max',
                               restore_best_weights=True)
            rlr = ReduceLROnPlateau(monitor='val_crf_viterbi_accuracy',
                                    factor=0.2, patience=2, verbose=1, mode='max')
            callbacks = [es, rlr]

            train_seq = NERSequence(x_train_spl, y_train_spl, config.batch_size, p.transform)

            if x_val and y_val:
                valid_seq = NERSequence(x_val, y_val, config.batch_size, p.transform)
                f1 = F1score(valid_seq, preprocessor=p, fold=n_fold)
                callbacks.append(f1)

            model.fit_generator(generator=train_seq,
                                validation_data=valid_seq,
                                epochs=config.nepochs,
                                callbacks=callbacks,
                                verbose=True,
                                shuffle=True,
                                use_multiprocessing=True,
                                workers=12)

            p.save('../models/best_transform.it')
            model.load_weights('../models/best_model_' + str(n_fold) + '.h5')
            predict(model, p, x_test, n_fold)