def main(args):
    print('Loading dataset...')
    x_train, y_train = load_data_and_labels(args.train_data)
    x_valid, y_valid = load_data_and_labels(args.valid_data)
    x_test, y_test = load_data_and_labels(args.test_data)
    # Merge the training and validation sets; the test set is used for validation below.
    x_train = np.r_[x_train, x_valid]
    y_train = np.r_[y_train, y_valid]

    print('Transforming datasets...')
    p = ELMoTransformer()
    p.fit(x_train, y_train)

    print('Loading word embeddings...')
    embeddings = load_glove(EMBEDDING_PATH)
    # Keep only the 100-dimensional vectors for words in the fitted vocabulary.
    embeddings = filter_embeddings(embeddings, p._word_vocab.vocab, 100)

    print('Building a model...')
    model = ELModel(char_embedding_dim=args.char_emb_size,
                    word_embedding_dim=args.word_emb_size,
                    char_lstm_size=args.char_lstm_units,
                    word_lstm_size=args.word_lstm_units,
                    char_vocab_size=p.char_vocab_size,
                    word_vocab_size=p.word_vocab_size,
                    num_labels=p.label_size,
                    embeddings=embeddings,
                    dropout=args.dropout)
    model, loss = model.build()
    model.compile(loss=loss, optimizer='adam')

    print('Training the model...')
    trainer = Trainer(model, preprocessor=p)
    trainer.train(x_train, y_train, x_test, y_test)

    print('Saving the model...')
    model.save(args.weights_file, args.params_file)
def main(args):
    print('Loading dataset...')
    x_train, y_train = load_data_and_labels(args.train_data)
    x_valid, y_valid = load_data_and_labels(args.valid_data)

    print('Transforming datasets...')
    # `no_char_feature` and `no_use_crf` are assumed to be argparse flags declared
    # with action='store_false', so both are True unless the flag is passed.
    p = IndexTransformer(use_char=args.no_char_feature)
    p.fit(x_train, y_train)

    print('Building a model...')
    model = BiLSTMCRF(char_embedding_dim=args.char_emb_size,
                      word_embedding_dim=args.word_emb_size,
                      char_lstm_size=args.char_lstm_units,
                      word_lstm_size=args.word_lstm_units,
                      char_vocab_size=p.char_vocab_size,
                      word_vocab_size=p.word_vocab_size,
                      num_labels=p.label_size,
                      dropout=args.dropout,
                      use_char=args.no_char_feature,
                      use_crf=args.no_use_crf)
    model, loss = model.build()
    model.compile(loss=loss, optimizer='adam')

    print('Training the model...')
    trainer = Trainer(model, preprocessor=p)
    trainer.train(x_train, y_train, x_valid, y_valid)

    print('Saving the model...')
    model.save(args.weights_file, args.params_file)
    p.save(args.preprocessor_file)
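# For completeness, a minimal command-line entry point for main() might look
# like the sketch below. The flag names are assumptions inferred from the
# attributes main() reads; --no_char_feature and --no_use_crf are assumed to be
# declared with action='store_false' so both features stay on by default.
import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Train a BiLSTM-CRF tagger.')
    parser.add_argument('--train_data', required=True, help='path to train.txt')
    parser.add_argument('--valid_data', required=True, help='path to valid.txt')
    parser.add_argument('--weights_file', default='weights.h5')
    parser.add_argument('--params_file', default='params.json')
    parser.add_argument('--preprocessor_file', default='preprocessor.pickle')
    parser.add_argument('--char_emb_size', type=int, default=25)
    parser.add_argument('--word_emb_size', type=int, default=100)
    parser.add_argument('--char_lstm_units', type=int, default=25)
    parser.add_argument('--word_lstm_units', type=int, default=100)
    parser.add_argument('--dropout', type=float, default=0.5)
    # store_false: passing the flag turns the feature off.
    parser.add_argument('--no_char_feature', action='store_false')
    parser.add_argument('--no_use_crf', action='store_false')
    main(parser.parse_args())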
def main(args):
    print('Loading datasets...')
    X, y = load_data_and_labels(args.data_path)
    x_train, x_valid, y_train, y_valid = train_test_split(X, y,
                                                          test_size=0.1,
                                                          random_state=42)
    embeddings = KeyedVectors.load(args.embedding_path).wv

    print('Transforming datasets...')
    p = IndexTransformer()
    p.fit(X, y)
    # Keep only vectors for words in the fitted vocabulary, as in the example above.
    embeddings = filter_embeddings(embeddings, p._word_vocab.vocab,
                                   embeddings.vector_size)

    print('Building a model...')
    model = BiLSTMCRF(char_vocab_size=p.char_vocab_size,
                      word_vocab_size=p.word_vocab_size,
                      num_labels=p.label_size,
                      embeddings=embeddings,
                      char_embedding_dim=50)
    model, loss = model.build()
    model.compile(loss=loss, optimizer='adam')

    print('Training the model...')
    trainer = Trainer(model, preprocessor=p)
    trainer.train(x_train, y_train, x_valid, y_valid)

    print('Saving the model...')
    model.save(args.weights_file, args.params_file)
    p.save(args.preprocessor_file)
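# Once the three artifacts are saved, the tagger can be restored for inference.
# A minimal sketch, assuming the default file names and the Sequence.load()
# call used by the evaluation script later in this section:
import anago

model = anago.Sequence.load('weights.h5', 'params.json', 'preprocessor.pickle')
print(model.analyze('President Obama is speaking at the White House.'))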
def setUp(self):
    # Load datasets.
    train_path = os.path.join(DATA_ROOT, 'train.txt')
    valid_path = os.path.join(DATA_ROOT, 'valid.txt')
    self.x_train, self.y_train = load_data_and_labels(train_path)
    self.x_valid, self.y_valid = load_data_and_labels(valid_path)

    # Fit transformer.
    self.p = IndexTransformer()
    self.p.fit(self.x_train, self.y_train)

    # Build a model.
    self.model = BiLSTMCRF(char_vocab_size=self.p.char_vocab_size,
                           word_vocab_size=self.p.word_vocab_size,
                           num_labels=self.p.label_size)
    self.model, loss = self.model.build()
    self.model.compile(loss=loss, optimizer='adam')
def setUpClass(cls):
    if not os.path.exists(LOG_ROOT):
        os.mkdir(LOG_ROOT)
    if not os.path.exists(SAVE_ROOT):
        os.mkdir(SAVE_ROOT)

    train_path = os.path.join(DATA_ROOT, 'train.txt')
    valid_path = os.path.join(DATA_ROOT, 'valid.txt')
    test_path = os.path.join(DATA_ROOT, 'test.txt')
    x_train, y_train = load_data_and_labels(train_path)
    x_valid, y_valid = load_data_and_labels(valid_path)
    cls.x_test, cls.y_test = load_data_and_labels(test_path)
    cls.x_train = np.r_[x_train, x_valid]
    cls.y_train = np.r_[y_train, y_valid]

    cls.embeddings = load_glove(EMBEDDING_PATH)
    cls.text = 'President Obama is speaking at the White House.'
    cls.dir_path = 'models'
def test_batch_iter(self):
    X, y = load_data_and_labels(self.filename)
    batch_size = 32
    p = IndexTransformer()
    p.fit(X, y)

    gen = NERSequence(X, y, batch_size, preprocess=p.transform)
    y_gen = []
    for i in range(len(gen)):
        x1, y1 = gen[i]
        y_gen.extend(y1)

    self.assertEqual(len(y_gen), len(y))
def test_batch_iter(self):
    X, y = load_data_and_labels(self.filename)
    batch_size = 32
    p = IndexTransformer()
    p.fit(X, y)

    steps, generator = batch_iter(X, y, batch_size,
                                  shuffle=False, preprocessor=p)
    y_gen = []
    for _ in range(steps):
        x1, y1 = next(generator)
        y_gen.extend(y1)

    self.assertEqual(len(y_gen), len(y))
import os

import anago
from anago.utils import download, load_data_and_labels

if __name__ == '__main__':
    dir_path = 'test_dir'
    url = 'https://s3-ap-northeast-1.amazonaws.com/dev.tech-sketch.jp/chakki/public/conll2003_en.zip'
    DATA_ROOT = os.path.join(os.path.dirname(__file__), '../data/conll2003/en/ner')

    test_path = os.path.join(DATA_ROOT, 'test.txt')
    x_test, y_test = load_data_and_labels(test_path)

    download(url, dir_path)
    model = anago.Sequence.load('weights.h5', 'params.json', 'preprocessor.pickle')
    # Report the evaluation result instead of discarding the return value.
    print(model.score(x_test, y_test))
import anago
from anago.utils import load_data_and_labels

x_train, y_train = load_data_and_labels('data/conll2003/en/ner/train.txt')
x_dev, y_dev = load_data_and_labels('data/conll2003/en/ner/valid.txt')
x_test, y_test = load_data_and_labels('data/conll2003/en/ner/test.txt')

model = anago.Sequence()
model.fit(x_train, y_train, x_dev, y_dev, epochs=15)
print(model.score(x_test, y_test))
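# After training, the same Sequence object can tag raw text directly, as the
# commented quickstart at the end of this section also shows:
text = 'President Obama is speaking at the White House.'
print(model.analyze(text))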
import gensim
import numpy as np
from gensim.models.keyedvectors import KeyedVectors

import anago
from anago.preprocessing import IndexTransformer
from anago.tagger import Tagger
from anago.utils import load_data_and_labels, filter_embeddings

if __name__ == "__main__":
    wv_model = gensim.models.Word2Vec.load("wiki_cbow_100/wikipedia_cbow_100").wv

    train_path = '../../data/collected/NER/train.txt'
    valid_path = '../../data/collected/NER/valid.txt'

    print('Loading data...')
    x_train, y_train = load_data_and_labels(train_path)
    x_valid, y_valid = load_data_and_labels(valid_path)
    print("got", len(x_train), "entries for training and",
          len(x_valid), "entries for testing")

    # Collect the label set from the training data.
    entities = set()
    for s in y_train:
        for w in s:
            entities.add(w)
    print("Defined entities are:", entities)

    preprocessor = IndexTransformer(use_char=True)
    x = x_train + x_valid
    y = y_train + y_valid
    preprocessor.fit(x, y)
    print(len(x_train), 'train sequences')
    print(len(x_valid), 'valid sequences')
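# The snippet above stops after fitting the preprocessor. A continuation that
# mirrors the earlier training scripts might look like the following sketch;
# the output file names are placeholders, and filter_embeddings() is applied
# to the word2vec vectors the same way the GloVe examples above do.
from anago.models import BiLSTMCRF
from anago.trainer import Trainer

embeddings = filter_embeddings(wv_model, preprocessor._word_vocab.vocab,
                               wv_model.vector_size)

model = BiLSTMCRF(char_vocab_size=preprocessor.char_vocab_size,
                  word_vocab_size=preprocessor.word_vocab_size,
                  num_labels=preprocessor.label_size,
                  embeddings=embeddings)
model, loss = model.build()
model.compile(loss=loss, optimizer='adam')

trainer = Trainer(model, preprocessor=preprocessor)
trainer.train(x_train, y_train, x_valid, y_valid)

model.save('weights.h5', 'params.json')
preprocessor.save('preprocessor.pickle')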
def test_extract(self):
    X, y = load_data_and_labels(self.filename)
    self.assertEqual(len(X), len(y))
# Quickstart (commented out): train and use an anago Sequence model.
#
#     import anago
#     from anago.utils import load_data_and_labels
#
#     x_train, y_train = load_data_and_labels('train.txt')
#     x_test, y_test = load_data_and_labels('test.txt')
#
#     model = anago.Sequence()
#     model.fit(x_train, y_train, epochs=15)
#     model.score(x_test, y_test)
#
#     text = 'President Obama is speaking at the White House.'
#     text_tags = model.analyze(text)

# Example trainer using anago.
import argparse
import os

from anago.utils import load_data_and_labels
from anago.models import BiLSTMCRF
from anago.preprocessing import IndexTransformer
from anago.trainer import Trainer


def main(args):
    print('Loading dataset...')
    x_train, y_train = load_data_and_labels(args.train_data)
    x_valid, y_valid = load_data_and_labels(args.valid_data)

    print('Transforming datasets...')
    p = IndexTransformer(use_char=args.no_char_feature)