def test_train(self):
    # Do not use pre-trained word embeddings
    model = anago.Sequence(max_epoch=1)
    model.train(self.x_train, self.y_train, self.x_valid, self.y_valid)

    # Use pre-trained word embeddings
    model = anago.Sequence(max_epoch=1, embeddings=self.embeddings)
    model.train(self.x_train, self.y_train, self.x_valid, self.y_valid)
def train(self):
    self.model = anago.Sequence()
    self.model.fit(self.x_train, self.y_train, self.x_test, self.y_test,
                   epochs=self.epoch)
def test_train_vocab_init(self):
    vocab = set()
    for words in np.r_[self.x_train, self.x_test, self.x_test]:
        for word in words:
            vocab.add(word)

    model = anago.Sequence(initial_vocab=vocab, embeddings=self.embeddings)
    model.fit(self.x_train, self.y_train, self.x_test, self.y_test)
def test_analyze(self):
    model = anago.Sequence()
    model.fit(self.x_train, self.y_train)
    res = model.analyze(self.text)
    pprint(res)
    self.assertIn('words', res)
    self.assertIn('entities', res)
def test_train_vocab_init(self):
    vocab = set()
    for words in np.r_[self.x_train, self.x_valid, self.x_test]:
        for word in words:
            vocab.add(word)

    model = anago.Sequence(max_epoch=15, embeddings=self.embeddings,
                           log_dir='logs')
    model.train(self.x_train, self.y_train, self.x_test, self.y_test,
                vocab_init=vocab)
    model.save(dir_path=self.dir_path)
def test_load(self):
    model = anago.Sequence(max_epoch=1, embeddings=self.embeddings)
    model.train(self.x_train, self.y_train, self.x_valid, self.y_valid)
    model.eval(self.x_test, self.y_test)
    model.save(dir_path=self.dir_path)

    model = anago.Sequence.load(self.dir_path)
    model.eval(self.x_test, self.y_test)
def test_analyze(self):
    model = anago.Sequence(max_epoch=1, embeddings=self.embeddings)
    model.train(self.x_train, self.y_train, self.x_valid, self.y_valid)
    res = model.analyze(self.words)
    pprint(res)
    self.assertIn('words', res)
    self.assertIn('entities', res)
def get_intonator():
    model = anago.Sequence()
    model = model.load("char_model/best_model")
    # Finalize the underlying Keras predict function so the model can be
    # called safely from serving threads.
    model.model.model._make_predict_function()

    def intonator(sentence):
        # The model operates on characters, not word tokens.
        chars = list(sentence)
        resp = model.analyze(chars)
        return marked_output(resp)

    return intonator
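# A brief usage sketch of the closure returned above; the call pattern is an
# assumption and the sentence is purely illustrative:
intonate = get_intonator()
print(intonate("an example sentence to mark up"))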
def test_save(self):
    model = anago.Sequence(max_epoch=1, embeddings=self.embeddings)
    model.train(self.x_train, self.y_train, self.x_valid, self.y_valid)
    model.save(dir_path=self.dir_path)

    config_file = os.path.join(self.dir_path, model.config_file)
    weight_file = os.path.join(self.dir_path, model.weight_file)
    preprocessor_file = os.path.join(self.dir_path, model.preprocessor_file)
    self.assertTrue(os.path.exists(config_file))
    self.assertTrue(os.path.exists(weight_file))
    self.assertTrue(os.path.exists(preprocessor_file))
def fit(self, X, y=None, char_emb_size=32, word_emb_size=128,
        char_lstm_units=32, word_lstm_units=128, dropout=0.1,
        batch_size=16, learning_rate=0.001, num_epochs=10):
    """
    Trains the NER model. The input is a list of `AnnotatedDocument`
    instances.
    """
    # Anago splits the BIO tags on the dash "-", so if the label contains
    # a dash, it corrupts it. This is a workaround for this behavior.
    for annotated_document in X:
        for annotation in annotated_document.annotations:
            if "-" in annotation.label:
                self._label_map[annotation.label.split("-")[-1]] = annotation.label
            else:
                self._label_map["B_" + annotation.label] = annotation.label
                self._label_map["I_" + annotation.label] = annotation.label

    self.model = anago.Sequence(char_emb_size=char_emb_size,
                                word_emb_size=word_emb_size,
                                char_lstm_units=char_lstm_units,
                                word_lstm_units=word_lstm_units,
                                dropout=dropout,
                                batch_size=batch_size,
                                learning_rate=learning_rate,
                                max_epoch=num_epochs)

    log.info("Transforming {} items to BIO format...".format(len(X)))
    training_data = transform_annotated_documents_to_bio_format(X)

    BIO_X = np.asarray([x_i for x_i in training_data[0] if len(x_i) > 0])
    BIO_y = np.asarray([y_i for y_i in training_data[1] if len(y_i) > 0])

    log.info("Training the BiLSTM...")
    X_train, X_valid, y_train, y_valid = train_test_split(BIO_X, BIO_y,
                                                          test_size=0.1)
    self.model.train(X_train, y_train, X_valid, y_valid)

    return self
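# The inverse lookup at prediction time is not shown in the snippet above.
# A minimal sketch, assuming anago reports an entity type that appears as a
# key of self._label_map (restore_label is a hypothetical helper):
def restore_label(predicted_type, label_map):
    # Map anago's dash-free entity type back to the original label,
    # falling back to the prediction itself for unmapped types.
    return label_map.get(predicted_type, predicted_type)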
def test_save_and_load(self):
    weights_file = os.path.join(SAVE_ROOT, 'weights.h5')
    params_file = os.path.join(SAVE_ROOT, 'params.json')
    preprocessor_file = os.path.join(SAVE_ROOT, 'preprocessor.pickle')

    model = anago.Sequence()
    model.fit(self.x_train, self.y_train)
    model.save(weights_file, params_file, preprocessor_file)
    score1 = model.score(self.x_test, self.y_test)

    self.assertTrue(os.path.exists(weights_file))
    self.assertTrue(os.path.exists(params_file))
    self.assertTrue(os.path.exists(preprocessor_file))

    model = anago.Sequence.load(weights_file, params_file, preprocessor_file)
    score2 = model.score(self.x_test, self.y_test)

    self.assertEqual(score1, score2)
def test_train_callbacks(self):
    weights_file = os.path.join(SAVE_ROOT, 'weights.h5')
    params_file = os.path.join(SAVE_ROOT, 'params.json')
    preprocessor_file = os.path.join(SAVE_ROOT, 'preprocessor.pickle')
    log_dir = os.path.join(os.path.dirname(__file__), 'logs')
    file_name = '_'.join(['model_weights', '{epoch:02d}', '{f1:2.4f}']) + '.h5'
    callback = ModelCheckpoint(os.path.join(log_dir, file_name),
                               monitor='f1',
                               save_weights_only=True)
    vocab = set()
    for words in np.r_[self.x_train, self.x_test, self.x_test]:
        for word in words:
            vocab.add(word)

    model = anago.Sequence(initial_vocab=vocab, embeddings=self.embeddings)
    model.fit(self.x_train, self.y_train, self.x_test, self.y_test,
              epochs=30, callbacks=[callback])
    model.save(weights_file, params_file, preprocessor_file)
def train_base_model(batch_size: int, max_epoch: int, log_dir: str,
                     patience: int, no_log: bool) -> None:
    """Train a base NER model (Note: not optimized for web parsing)

    Args:
        batch_size (int): number of samples per training batch
        max_epoch (int): number of epochs to train for; early stopping is on
            by default
        patience (int): number of epochs to wait before stopping early
        log_dir (str): path to save TensorBoard log information
        no_log (bool): don't log training data
    """
    if not os.path.exists(log_dir):
        os.mkdir(log_dir)
    if not os.path.exists(SAVE_DIR):
        os.mkdir(SAVE_DIR)
    if not os.path.exists(BASE_MODEL_PATH):
        os.mkdir(BASE_MODEL_PATH)

    train_path = os.path.join(DATA_TRAIN, 'train.txt')
    valid_path = os.path.join(DATA_TRAIN, 'valid.txt')

    print('Loading data...')
    x_train, y_train = load_data_and_labels(train_path)
    x_valid, y_valid = load_data_and_labels(valid_path)
    print(len(x_train), 'train sequences')
    print(len(x_valid), 'valid sequences')

    embeddings = load_glove(EMBEDDING_PATH)

    if no_log:
        log_dir = None

    model = anago.Sequence(batch_size=batch_size, max_epoch=max_epoch,
                           log_dir=log_dir, embeddings=embeddings,
                           patience=patience)
    model.train(x_train, y_train, x_valid, y_valid)
    model.save(BASE_MODEL_PATH)
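# A hypothetical invocation of the trainer above; the hyperparameter values
# are illustrative, not tuned:
train_base_model(batch_size=32, max_epoch=15, log_dir='logs',
                 patience=3, no_log=False)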
def train(log_dir: str) -> None:
    """Fine-tune the base model

    Args:
        log_dir (str): path to save TensorBoard log information
    """
    if not os.path.exists(log_dir):
        os.mkdir(log_dir)

    x_train, y_train, x_valid, y_valid = train_test_split_from_queries()
    print(len(x_train), 'train sequences')
    print(len(x_valid), 'valid sequences')

    embeddings = load_glove(EMBEDDING_PATH)

    model = anago.Sequence(log_dir=log_dir, embeddings=embeddings)
    model.load(BASE_MODEL_PATH)
    model.train(x_train, y_train, x_valid, y_valid)
    model.save(CUSTOM_MODEL_PATH)
def load_model(path):
    model_path = os.path.realpath(__file__).replace('main.py', '') + path
    print("loading model from {}".format(model_path))
    return anago.Sequence().load(model_path)
import anago
from anago.utils import load_data_and_labels

x_train, y_train = load_data_and_labels('data/conll2003/en/ner/train.txt')
x_test, y_test = load_data_and_labels('data/conll2003/en/ner/test.txt')
x_dev, y_dev = load_data_and_labels('data/conll2003/en/ner/valid.txt')

model = anago.Sequence()
model.fit(x_train, y_train, x_dev, y_dev, epochs=15)
model.score(x_test, y_test)
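# Once trained, the same model can tag new text. A minimal sketch using the
# analyze() call seen in the other snippets here; the sentence and its
# pre-tokenization are illustrative assumptions:
words = 'President Obama is speaking at the White House .'.split()
print(model.analyze(words))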
def test_eval(self):
    model = anago.Sequence(max_epoch=1, embeddings=self.embeddings)
    model.train(self.x_train, self.y_train, self.x_valid, self.y_valid)
    model.eval(self.x_test, self.y_test)
MODEL_DIRPATH = "pretrained/ner_tagger/"
WEIGHT_FILENAME = "nerweight.h5"
PARAMS_FILENAME = "nerparams.json"
PREPROCESSOR_FILENAME = "nerprepro.pkl"

MODEL_NAME = "model_ner_12514_softmax_v5_w2v_100_POS_LSTM_EmbNotTrainable_OOV"
loaded_model = load_model(f"pretrained/{MODEL_NAME}")
with open(f"pretrained/{MODEL_NAME}/dict.pickle", 'rb') as handle:
    idx2token_loaded = pickle.load(handle)
    idx2tag_loaded = pickle.load(handle)
    token2idx_loaded = pickle.load(handle)
    tag2idx_loaded = pickle.load(handle)

model = anago.Sequence().load(
    weights_file=MODEL_DIRPATH + WEIGHT_FILENAME,
    params_file=MODEL_DIRPATH + PARAMS_FILENAME,
    preprocessor_file=MODEL_DIRPATH + PREPROCESSOR_FILENAME)

def __get_ner_tags(result):
    ner_tags = ["O" for _ in result['words']]
    for entity in result['entities']:
        ner_tags[entity['beginOffset']] = entity['type']
    return ner_tags

def get_entities(sentence):
    result = model.analyze(sentence)
    return __get_ner_tags(result)

def get_entities_tf(sentence, pad_token=np.nan, n_timesteps=48):
    sentence = sentence.split()
    padded_sentence = sentence + [pad_token] * (n_timesteps - len(sentence))
def test_train_all(self):
    x_train = np.r_[self.x_train, self.x_valid, self.x_test]
    y_train = np.r_[self.y_train, self.y_valid, self.y_test]

    model = anago.Sequence(max_epoch=15, embeddings=self.embeddings,
                           log_dir='logs')
    model.train(x_train, y_train, self.x_test, self.y_test)
    model.save(dir_path=self.dir_path)
import os

from gensim.models.keyedvectors import KeyedVectors

import anago
from anago.utils import load_data_and_labels

if __name__ == '__main__':
    DATA_ROOT = os.path.join(os.path.dirname(__file__),
                             '../data/conll2003/en/ner')
    EMBEDDING_PATH = 'model.txt'

    train_path = os.path.join(DATA_ROOT, 'train.txt')
    valid_path = os.path.join(DATA_ROOT, 'valid.txt')

    print('Loading data...')
    x_train, y_train = load_data_and_labels(train_path)
    x_valid, y_valid = load_data_and_labels(valid_path)
    print(len(x_train), 'train sequences')
    print(len(x_valid), 'valid sequences')

    embeddings = KeyedVectors.load_word2vec_format(EMBEDDING_PATH).wv

    # Use pre-trained word embeddings
    model = anago.Sequence(embeddings=embeddings)
    model.fit(x_train, y_train, x_valid, y_valid)
import os

from gensim.models.keyedvectors import KeyedVectors

import anago
from anago.reader import load_data_and_labels

DATA_ROOT = os.path.join(os.path.dirname(__file__),
                         './data/conll2003/en/ner')
EMBEDDING_PATH = 'model.txt'

train_path = os.path.join(DATA_ROOT, 'train.txt')
valid_path = os.path.join(DATA_ROOT, 'valid.txt')

print('Loading data...')
x_train, y_train = load_data_and_labels(train_path)
x_valid, y_valid = load_data_and_labels(valid_path)
print(len(x_train), 'train sequences')
print(len(x_valid), 'valid sequences')

embeddings = KeyedVectors.load_word2vec_format(EMBEDDING_PATH).wv

# Use pre-trained word embeddings
model = anago.Sequence(max_epoch=1, embeddings=embeddings)
model.train(x_train, y_train, x_valid, y_valid)
def test_score(self):
    model = anago.Sequence()
    model.fit(self.x_train, self.y_train)
    score = model.score(self.x_test, self.y_test)
    self.assertIsInstance(score, float)
def test_train_with_pretrained_embedding(self):
    model = anago.Sequence(embeddings=self.embeddings)
    model.fit(self.x_train, self.y_train, self.x_test, self.y_test)
import random as rn

import tensorflow as tf
from keras import backend as K

import anago

rn.seed(12345)
# tf.set_random_seed(1234)

# Set the model parameters here
model = anago.Sequence(char_emb_size=25,
                       word_emb_size=100,
                       char_lstm_units=25,
                       word_lstm_units=100,
                       dropout=0.5,
                       char_feature=True,
                       crf=True,
                       batch_size=20,
                       optimizer='adam',
                       learning_rate=0.001,
                       lr_decay=0.9,
                       clip_gradients=5.0,
                       max_epoch=30,
                       early_stopping=True,
                       patience=3,
                       train_embeddings=True,
                       max_checkpoints_to_keep=5,
                       log_dir=None)
model.train(x_train, y_train, x_valid, y_valid)

print("\n\nTest evaluation:")
model.eval(x_test, y_test)
model.save('model/ModelAnagoIndo')
import anago
from anago.reader import load_data_and_labels

x_train, y_train = load_data_and_labels('./data/train.txt')
x_valid, y_valid = load_data_and_labels('./data/dev.txt')
x_test, y_test = load_data_and_labels('./data/test.txt')

model = anago.Sequence()  # .load('./models')
model.train(x_train, y_train, x_valid, y_valid)
model.save(dir_path='./models')
model.eval(x_test, y_test)
import anago
from anago.reader import load_data_and_labels, load_glove

x_train, y_train = load_data_and_labels('train.txt')
x_valid, y_valid = load_data_and_labels('valid.txt')
x_test, y_test = load_data_and_labels('test.txt')

EMBEDDING_PATH = 'vectors-ind.txt'
embeddings = load_glove(EMBEDDING_PATH)

# model = anago.Sequence()
model = anago.Sequence(char_emb_size=100, word_emb_size=50,
                       char_lstm_units=25, word_lstm_units=100,
                       dropout=0.5, char_feature=True, crf=True,
                       batch_size=3, optimizer='adam', learning_rate=0.005,
                       lr_decay=0.7, clip_gradients=5.0,
                       embeddings=embeddings)
model.train(x_train, y_train, x_valid, y_valid)
model.eval(x_test, y_test)

# Run analyze() over the test set and collect the predicted entities.
matres = []
for sent in x_test:
    res = model.analyze(sent)['entities']
    matres.append(res)

# Convert the entity spans back into BIO-tagged sequences.
y_resu = []
for i, sent in enumerate(matres):
    sent_pred = ['O'] * len(y_test[i])
    for enti in sent:
        bo = enti['beginOffset']
        sent_pred[bo] = 'B-' + enti['type']
        for x in range(bo + 1, enti['endOffset']):
            sent_pred[x] = 'I-' + enti['type']
    y_resu.append(sent_pred)
def fit(self, X, y=None, char_emb_size=32, word_emb_size=128,
        char_lstm_units=32, word_lstm_units=128, dropout=0.5,
        batch_size=8, num_epochs=10):
    """
    Trains the NER model. The input is a list of `AnnotatedDocument`
    instances. We should be careful with the batch size: it must satisfy
    len(X) % batch_size == 0, otherwise Anago crashes with an error from
    time to time. A training example here is a sequence of tokens, each
    assigned a tag (the BIO scheme).
    """
    log.info("Checking parameters...")
    self.config.set_parameters({
        "num_epochs": num_epochs,
        "dropout": dropout,
        "batch_size": batch_size,
        "char_emb_size": char_emb_size,
        "word_emb_size": word_emb_size,
        "char_lstm_units": char_lstm_units,
        "word_lstm_units": word_lstm_units
    })
    self.config.validate()

    # Anago splits the BIO tags on the dash "-", so if the label contains
    # a dash, it corrupts it. This is a workaround for this behavior.
    label_map = {}
    for annotated_document in X:
        for annotation in annotated_document.annotations:
            if "-" in annotation.label:
                label_map[annotation.label.split("-")[-1]] = annotation.label
            else:
                label_map["B_" + annotation.label] = annotation.label
                label_map["I_" + annotation.label] = annotation.label
    self.config.set_parameter("label_map", label_map)

    self.model = anago.Sequence(
        char_embedding_dim=self.config.get_parameter("char_emb_size"),
        word_embedding_dim=self.config.get_parameter("word_emb_size"),
        char_lstm_size=self.config.get_parameter("char_lstm_units"),
        word_lstm_size=self.config.get_parameter("word_lstm_units"),
        dropout=self.config.get_parameter("dropout"))

    log.info("Transforming {} items to BIO format...".format(len(X)))
    train_data = transform_annotated_documents_to_bio_format(
        X, entity_labels=self.entity_labels)

    # The new version does not use numpy arrays as arguments:
    # BIO_X = np.asarray([x_i for x_i in training_data[0] if len(x_i) > 0])
    # BIO_y = np.asarray([y_i for y_i in training_data[1] if len(y_i) > 0])

    # Validation is not necessary, as we normally use Optimizer:
    # X_train, X_valid, y_train, y_valid = train_test_split(
    #     BIO_X, BIO_y, test_size=0.1)

    X_train = [x_i for x_i in train_data[0]]
    y_train = [y_i for y_i in train_data[1]]

    # Check sizes
    if len(X_train) != len(y_train):
        log.error("Got {} feature vectors but {} labels, cannot train!".format(
            len(X_train), len(y_train)))
        return self

    # The number of examples must be divisible by batch_size,
    # so skip examples at the end if needed.
    exm_num = len(X_train)
    X_train = X_train[:exm_num - exm_num % batch_size]
    y_train = y_train[:exm_num - exm_num % batch_size]

    log.info("Training BiLSTM...")
    self.model.fit(X_train, y_train,
                   epochs=self.config.get_parameter("num_epochs"),
                   batch_size=self.config.get_parameter("batch_size"))

    return self
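# A quick arithmetic check of the batch_size trimming above: with 103
# training examples and batch_size=8, 103 % 8 == 7, so the 7 trailing
# examples are dropped and 96 (twelve full batches) remain.
exm_num, batch_size = 103, 8
assert exm_num - exm_num % batch_size == 96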
import pandas as pd
from pathlib import Path

import anago
from anago.utils import load_glove

import utils

train_path = Path.cwd().joinpath('data/semeval-2016/train.csv')
test_path = Path.cwd().joinpath('data/semeval-2016/test.csv')

# Read data
data_train = pd.read_csv(train_path)
data_test = pd.read_csv(test_path)
x_train, y_train = utils.df2data(data_train)
x_test, y_test = utils.df2data(data_test)

# Load GloVe embeddings
EMBEDDING_PATH = '../embedding_weights/glove.840B.300d.txt'
embeddings = load_glove(EMBEDDING_PATH)

# Use pre-trained word embeddings to train
model = anago.Sequence(embeddings=embeddings, word_embedding_dim=300)
model.fit(x_train, y_train, x_test, y_test, epochs=10)