def main(args):
    print('Loading dataset...')
    x_train, y_train = load_data_and_labels(args.train_data)
    x_valid, y_valid = load_data_and_labels(args.valid_data)
    x_test, y_test = load_data_and_labels(args.test_data)
    x_train = np.r_[x_train, x_valid]
    y_train = np.r_[y_train, y_valid]

    print('Transforming datasets...')
    p = ELMoTransformer()
    p.fit(x_train, y_train)

    print('Loading word embeddings...')
    embeddings = load_glove(EMBEDDING_PATH)
    embeddings = filter_embeddings(embeddings, p._word_vocab.vocab, 100)

    print('Building a model.')
    model = ELModel(char_embedding_dim=args.char_emb_size,
                    word_embedding_dim=args.word_emb_size,
                    char_lstm_size=args.char_lstm_units,
                    word_lstm_size=args.word_lstm_units,
                    char_vocab_size=p.char_vocab_size,
                    word_vocab_size=p.word_vocab_size,
                    num_labels=p.label_size,
                    embeddings=embeddings,
                    dropout=args.dropout)
    model, loss = model.build()
    model.compile(loss=loss, optimizer='adam')

    print('Training the model...')
    trainer = Trainer(model, preprocessor=p)
    trainer.train(x_train, y_train, x_test, y_test)

    print('Saving the model...')
    model.save(args.weights_file, args.params_file)
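# A hypothetical command-line entry point for main() above; the flag names simply
# mirror the attributes read from `args`, and the default values are illustrative,
# not taken from the original script.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(description='Train the ELMo-based NER model.')
    parser.add_argument('--train_data', required=True, help='path to the training file')
    parser.add_argument('--valid_data', required=True, help='path to the validation file')
    parser.add_argument('--test_data', required=True, help='path to the test file')
    parser.add_argument('--char_emb_size', type=int, default=25)
    parser.add_argument('--word_emb_size', type=int, default=100)
    parser.add_argument('--char_lstm_units', type=int, default=25)
    parser.add_argument('--word_lstm_units', type=int, default=100)
    parser.add_argument('--dropout', type=float, default=0.5)
    parser.add_argument('--weights_file', default='weights.h5')
    parser.add_argument('--params_file', default='params.json')
    main(parser.parse_args())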
@classmethod
def setUpClass(cls):
    if not os.path.exists(LOG_ROOT):
        os.mkdir(LOG_ROOT)
    if not os.path.exists(SAVE_ROOT):
        os.mkdir(SAVE_ROOT)

    train_path = os.path.join(DATA_ROOT, 'train.txt')
    valid_path = os.path.join(DATA_ROOT, 'valid.txt')
    test_path = os.path.join(DATA_ROOT, 'test.txt')

    x_train, y_train = load_data_and_labels(train_path)
    x_valid, y_valid = load_data_and_labels(valid_path)
    cls.x_test, cls.y_test = load_data_and_labels(test_path)
    # Fold the validation split into the training split.
    cls.x_train = np.r_[x_train, x_valid]
    cls.y_train = np.r_[y_train, y_valid]

    cls.embeddings = load_glove(EMBEDDING_PATH)
    cls.text = 'President Obama is speaking at the White House.'
    cls.dir_path = 'models'
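# A hypothetical test method built on the fixture above: it trains anago's Sequence
# wrapper on the merged training data, then exercises scoring and analysis. The epoch
# count and the assertions are placeholders, not thresholds from the original tests.
def test_train_and_analyze(self):
    model = anago.Sequence(embeddings=self.embeddings)
    model.fit(self.x_train, self.y_train, self.x_test, self.y_test, epochs=1)
    f1 = model.score(self.x_test, self.y_test)   # entity-level F1 on the test split
    self.assertGreaterEqual(f1, 0.0)
    res = model.analyze(self.text)               # tags the example sentence
    self.assertIn('entities', res)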
def __init__(self, emb_path, emb_dim, vocab, n_clusters):
    logger.info('Loading embeddings from: ' + emb_path)

    # Load pretrained vectors; the result is a word -> vector (GloVe) dictionary.
    self.embeddings = load_glove(emb_path)
    self.vector_size = len(self.embeddings)
    self.emb_dim = emb_dim
    if emb_dim is not None:
        assert emb_dim == len(self.embeddings['nice'])
    logger.info(' #vectors: %i, #dimensions: %i' % (self.vector_size, self.emb_dim))

    # get_emb_matrix_given_vocab: build a (vocab_size, emb_dim) matrix;
    # rows for out-of-vocabulary words stay zero.
    emb_matrix = np.zeros((len(vocab), emb_dim))
    counter = 0
    for word, index in vocab.items():
        try:
            emb_matrix[index] = self.embeddings[word]
            counter += 1
        except KeyError:
            pass
    logger.info('%i/%i word vectors initialized (hit rate: %.2f%%)' %
                (counter, len(vocab), 100 * counter / len(vocab)))

    # L2 normalization
    self.norm_emb_matrix = emb_matrix / np.linalg.norm(emb_matrix, axis=-1, keepdims=True)

    # get_aspect_matrix: KMeans-clustered word embeddings, needed to initialize
    # the aspect matrix.
    km = KMeans(n_clusters=n_clusters)
    km.fit(self.norm_emb_matrix)
    clusters = km.cluster_centers_

    # L2 normalization
    self.norm_aspect_matrix = clusters / np.linalg.norm(clusters, axis=-1, keepdims=True)
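# Hypothetical usage of the reader above. The enclosing class is not shown in this
# snippet, so `EmbReader` is a placeholder name; the vocabulary and file path are
# illustrative. `norm_emb_matrix` can initialize a word-embedding layer and
# `norm_aspect_matrix` an aspect-embedding layer.
vocab = {'good': 0, 'bad': 1, 'service': 2, 'food': 3}
reader = EmbReader(emb_path='glove.6B.200d.txt', emb_dim=200, vocab=vocab, n_clusters=2)
word_init = reader.norm_emb_matrix       # shape: (len(vocab), emb_dim)
aspect_init = reader.norm_aspect_matrix  # shape: (n_clusters, emb_dim)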
def training(train, test):
    x_train = [x.split() for x in train['sentence'].tolist()]
    y_train = train['tag'].tolist()
    x_train, x_val, y_train, y_val = train_test_split(x_train, y_train,
                                                      train_size=0.8,
                                                      random_state=233)

    print('Transforming datasets...')
    p = IndexTransformer(use_char=True)
    p.fit(x_train, y_train)

    embeddings = load_glove(config.glove_file)
    embeddings = filter_embeddings(embeddings, p._word_vocab.vocab, config.glove_size)

    model = BiLSTMCRF(char_vocab_size=p.char_vocab_size,
                      word_vocab_size=p.word_vocab_size,
                      num_labels=p.label_size,
                      word_embedding_dim=300,
                      char_embedding_dim=100,
                      word_lstm_size=100,
                      char_lstm_size=50,
                      fc_dim=100,
                      dropout=0.5,
                      embeddings=embeddings,
                      use_char=True,
                      use_crf=True)
    opt = Adam(lr=0.001)
    model, loss = model.build()
    model.compile(loss=loss, optimizer=opt, metrics=[crf_viterbi_accuracy])

    filepath = '../models/' + 'best_model'
    ckp = ModelCheckpoint(filepath + '.h5',
                          monitor='val_crf_viterbi_accuracy',
                          verbose=1,
                          save_best_only=True,
                          mode='max',
                          save_weights_only=True)
    es = EarlyStopping(monitor='val_crf_viterbi_accuracy',
                       min_delta=0.00001,
                       patience=3,
                       verbose=1,
                       mode='max')
    rlr = ReduceLROnPlateau(monitor='val_crf_viterbi_accuracy',
                            factor=0.2,
                            patience=2,
                            verbose=1,
                            mode='max',
                            min_delta=0.0001)
    callbacks = [ckp, es, rlr]

    train_seq = NERSequence(x_train, y_train, config.batch_size, p.transform)

    if x_val and y_val:
        valid_seq = NERSequence(x_val, y_val, config.batch_size, p.transform)
        f1 = F1score(valid_seq, preprocessor=p)
        callbacks.append(f1)

    model.fit_generator(generator=train_seq,
                        validation_data=valid_seq,
                        epochs=config.nepochs,
                        callbacks=callbacks,
                        verbose=True,
                        shuffle=True,
                        use_multiprocessing=True,
                        workers=42)
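# A small illustration of the input training() expects: a DataFrame whose 'sentence'
# column holds whitespace-joined tokens and whose 'tag' column holds the matching BIO
# tag sequences. The rows below are made up; note that the `test` argument is accepted
# but not used inside this version of training().
import pandas as pd

train_df = pd.DataFrame({
    'sentence': ['John lives in London', 'I like tea'],
    'tag': [['B-PER', 'O', 'O', 'B-LOC'], ['O', 'O', 'O']],
})
test_df = pd.DataFrame({
    'sentence': ['Mary visited Paris'],
    'tag': [['B-PER', 'O', 'B-LOC']],
})
training(train_df, test_df)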
import pandas as pd
from pathlib import Path

import anago
from anago.utils import load_glove

import utils

train_path = Path.cwd().joinpath('data/semeval-2016/train.csv')
test_path = Path.cwd().joinpath('data/semeval-2016/test.csv')

# Read data
data_train = pd.read_csv(train_path)
data_test = pd.read_csv(test_path)
x_train, y_train = utils.df2data(data_train)
x_test, y_test = utils.df2data(data_test)

# Load GloVe embeddings
EMBEDDING_PATH = '../embedding_weights/glove.840B.300d.txt'
embeddings = load_glove(EMBEDDING_PATH)

# Use pre-trained word embeddings to train
model = anago.Sequence(embeddings=embeddings, word_embedding_dim=300)
model.fit(x_train, y_train, x_test, y_test, epochs=10)
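# After fitting, the anago Sequence wrapper can be evaluated and applied directly:
# score() reports entity-level F1 and analyze() tags a raw sentence. The example
# sentence below is illustrative.
print(model.score(x_test, y_test))
print(model.analyze('The hotel staff was friendly but the room was noisy.'))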
def training(train, test, fold):
    x_train = [x.split() for x in train['sentence'].tolist()]
    y_train = train['tag'].tolist()
    x_test = [x.split() for x in test['sentence'].tolist()]

    print('Transforming datasets...')
    p = IndexTransformer(use_char=True)
    p.fit(x_train + x_test, y_train)

    skf = KFold(n_splits=config.nfolds, random_state=config.seed, shuffle=True)

    embeddings = load_glove(config.glove_file)
    # embeddings_fast = load_glove(config.glove_file)
    embeddings_wang = load_glove(config.wang_file)
    embeddings = filter_embeddings(embeddings, p._word_vocab.vocab, config.glove_size)
    # embeddings_fast = filter_embeddings(embeddings_fast, p._word_vocab.vocab, config.fasttext_size)
    embeddings_wang = filter_embeddings(embeddings_wang, p._word_vocab.vocab, config.wang_size)
    embeddings = np.concatenate((embeddings, embeddings_wang), axis=1)

    for n_fold, (train_indices, val_indices) in enumerate(skf.split(x_train)):
        if n_fold >= fold:
            print("Training fold: ", n_fold)
            x_val = list(np.array(x_train)[val_indices])
            y_val = list(np.array(y_train)[val_indices])
            x_train_spl = list(np.array(x_train)[train_indices])
            y_train_spl = list(np.array(y_train)[train_indices])

            model = BiLSTMCRF(char_vocab_size=p.char_vocab_size,
                              word_vocab_size=p.word_vocab_size,
                              num_labels=p.label_size,
                              word_embedding_dim=1200,
                              char_embedding_dim=50,
                              word_lstm_size=300,
                              char_lstm_size=300,
                              fc_dim=50,
                              dropout=0.5,
                              embeddings=embeddings,
                              use_char=True,
                              use_crf=True)
            opt = Adam(lr=0.001)
            model, loss = model.build()
            model.compile(loss=loss, optimizer=opt, metrics=[crf_viterbi_accuracy])

            es = EarlyStopping(monitor='val_crf_viterbi_accuracy',
                               patience=3,
                               verbose=1,
                               mode='max',
                               restore_best_weights=True)
            rlr = ReduceLROnPlateau(monitor='val_crf_viterbi_accuracy',
                                    factor=0.2,
                                    patience=2,
                                    verbose=1,
                                    mode='max')
            callbacks = [es, rlr]

            train_seq = NERSequence(x_train_spl, y_train_spl, config.batch_size, p.transform)

            if x_val and y_val:
                valid_seq = NERSequence(x_val, y_val, config.batch_size, p.transform)
                f1 = F1score(valid_seq, preprocessor=p, fold=n_fold)
                callbacks.append(f1)

            model.fit_generator(generator=train_seq,
                                validation_data=valid_seq,
                                epochs=config.nepochs,
                                callbacks=callbacks,
                                verbose=True,
                                shuffle=True,
                                use_multiprocessing=True,
                                workers=12)

            p.save('../models/best_transform.it')
            model.load_weights('../models/best_model_' + str(n_fold) + '.h5')
            predict(model, p, x_test, n_fold)
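# The per-fold predict() helper called above is not defined in this snippet. A minimal
# hypothetical version, using only anago's IndexTransformer.transform /
# inverse_transform and the compiled Keras model, might look like this; the output
# path is illustrative and `pd` is assumed to be pandas.
def predict(model, p, x_test, n_fold):
    lengths = [len(s) for s in x_test]
    features = p.transform(x_test)
    y_pred = model.predict(features, batch_size=config.batch_size)
    tags = p.inverse_transform(y_pred, lengths)
    out = pd.DataFrame({'sentence': [' '.join(s) for s in x_test], 'tag': tags})
    out.to_csv('../models/pred_fold_{}.csv'.format(n_fold), index=False)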
def fit(self, X, y):
    """Trains the NER model on tokenized sentences and their BIO tags.

    Parameters
    ----------
    X : list(list(str))
        list of list of tokens
    y : list(list(str))
        list of list of BIO tags

    Returns
    -------
    self
    """
    if self.embeddings is None and self.embeddings_file is None:
        raise ValueError(
            "Either embeddings or embeddings_file should be provided, exiting.")

    log.info("Preprocessing dataset...")
    self.preprocessor_ = ELMoTransformer()
    self.preprocessor_.fit(X, y)

    if self.embeddings is None:
        self.embeddings = load_glove(self.embeddings_file)
        # Infer the embedding dimensionality from any vector in the loaded dictionary.
        embeddings_dim = self.embeddings[list(self.embeddings.keys())[0]].shape[0]
        self.embeddings = filter_embeddings(self.embeddings,
                                            self.preprocessor_._word_vocab.vocab,
                                            embeddings_dim)

    log.info("Building model...")
    self.model_ = ELModel(
        char_embedding_dim=self.char_embedding_dim,
        word_embedding_dim=self.word_embedding_dim,
        char_lstm_size=self.char_lstm_size,
        word_lstm_size=self.word_lstm_size,
        char_vocab_size=self.preprocessor_.char_vocab_size,
        word_vocab_size=self.preprocessor_.word_vocab_size,
        num_labels=self.preprocessor_.label_size,
        embeddings=self.embeddings,
        dropout=self.dropout)
    self.model_, loss = self.model_.build()
    optimizer = Adam(lr=self.learning_rate)
    self.model_.compile(loss=loss, optimizer=optimizer)
    self.model_.summary()

    log.info('Training the model...')
    self.trainer_ = Trainer(self.model_, preprocessor=self.preprocessor_)
    x_train, x_valid, y_train, y_valid = train_test_split(X, y,
                                                          test_size=0.1,
                                                          random_state=42)
    self.trainer_.train(x_train, y_train,
                        x_valid=x_valid, y_valid=y_valid,
                        batch_size=self.batch_size,
                        epochs=self.max_iter)

    self.tagger_ = Tagger(self.model_, preprocessor=self.preprocessor_)

    return self
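# Hypothetical usage of fit() above. The enclosing estimator class is not shown in
# this snippet, so `ELMoNER` is a placeholder name, and the constructor arguments only
# mirror attributes that fit() reads (embeddings_file, batch_size, max_iter); the data
# variables are placeholders for tokenized sentences and their BIO tag lists.
ner = ELMoNER(embeddings_file='glove.6B.100d.txt', batch_size=16, max_iter=5)
ner.fit(x_tokens, y_bio_tags)
print(ner.tagger_.analyze('Barack Obama visited the White House'))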