def setUpClass(cls):
    if not os.path.exists(LOG_ROOT):
        os.mkdir(LOG_ROOT)
    if not os.path.exists(SAVE_ROOT):
        os.mkdir(SAVE_ROOT)

    train_path = os.path.join(DATA_ROOT, 'train.txt')
    valid_path = os.path.join(DATA_ROOT, 'valid.txt')
    test_path = os.path.join(DATA_ROOT, 'test.txt')

    cls.x_train, cls.y_train = load_data_and_labels(train_path)
    cls.x_valid, cls.y_valid = load_data_and_labels(valid_path)
    cls.x_test, cls.y_test = load_data_and_labels(test_path)
    cls.embeddings = load_glove(EMBEDDING_PATH)

    cls.words = 'President Obama is speaking at the White House.'.split()
    cls.dir_path = 'models'
def train_base_model(batch_size: int, max_epoch: int, log_dir: str, patience: int, no_log: bool) -> None:
    """Train a base NER model (Note: Not optimized for web parsing)

    Args:
        batch_size (int): number of samples per training batch
        max_epoch (int): number of epochs to train for; early stopping is on by default
        patience (int): number of epochs to wait before stopping early
        log_dir (str): path to save tensorboard log information
        no_log (bool): don't log training data
    """
    if not os.path.exists(log_dir):
        os.mkdir(log_dir)
    if not os.path.exists(SAVE_DIR):
        os.mkdir(SAVE_DIR)
    if not os.path.exists(BASE_MODEL_PATH):
        os.mkdir(BASE_MODEL_PATH)

    train_path = os.path.join(DATA_TRAIN, 'train.txt')
    valid_path = os.path.join(DATA_TRAIN, 'valid.txt')

    print('Loading data...')
    x_train, y_train = load_data_and_labels(train_path)
    x_valid, y_valid = load_data_and_labels(valid_path)
    print(len(x_train), 'train sequences')
    print(len(x_valid), 'valid sequences')

    embeddings = load_glove(EMBEDDING_PATH)

    if no_log:
        log_dir = None

    model = anago.Sequence(batch_size=batch_size,
                           max_epoch=max_epoch,
                           log_dir=log_dir,
                           embeddings=embeddings,
                           patience=patience)
    model.train(x_train, y_train, x_valid, y_valid)
    model.save(BASE_MODEL_PATH)
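# A minimal sketch of how train_base_model might be invoked from the command
# line. The flag names and default values below are assumptions for
# illustration and are not part of the original script.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(description='Train the base NER model.')
    parser.add_argument('--batch-size', type=int, default=32)
    parser.add_argument('--max-epoch', type=int, default=15)
    parser.add_argument('--log-dir', type=str, default='logs')
    parser.add_argument('--patience', type=int, default=3)
    parser.add_argument('--no-log', action='store_true')
    args = parser.parse_args()

    train_base_model(batch_size=args.batch_size,
                     max_epoch=args.max_epoch,
                     log_dir=args.log_dir,
                     patience=args.patience,
                     no_log=args.no_log)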
def train(log_dir: str) -> None:
    """Fine-tune the base model

    Args:
        log_dir (str): path to save tensorboard log information
    """
    if not os.path.exists(log_dir):
        os.mkdir(log_dir)

    x_train, y_train, x_valid, y_valid = train_test_split_from_queries()
    print(len(x_train), 'train sequences')
    print(len(x_valid), 'valid sequences')

    embeddings = load_glove(EMBEDDING_PATH)

    model = anago.Sequence(log_dir=log_dir, embeddings=embeddings)
    model.load(BASE_MODEL_PATH)
    model.train(x_train, y_train, x_valid, y_valid)
    model.save(CUSTOM_MODEL_PATH)
import anago
from anago.reader import load_data_and_labels, load_glove

x_train, y_train = load_data_and_labels('train.txt')
x_valid, y_valid = load_data_and_labels('valid.txt')
x_test, y_test = load_data_and_labels('test.txt')

EMBEDDING_PATH = 'vectors-ind.txt'
embeddings = load_glove(EMBEDDING_PATH)

# model = anago.Sequence()
model = anago.Sequence(char_emb_size=100, word_emb_size=50, char_lstm_units=25,
                       word_lstm_units=100, dropout=0.5, char_feature=True, crf=True,
                       batch_size=3, optimizer='adam', learning_rate=0.005,
                       lr_decay=0.7, clip_gradients=5.0, embeddings=embeddings)
model.train(x_train, y_train, x_valid, y_valid)
model.eval(x_test, y_test)

# Run the trained model over the test sentences and collect the predicted entities.
matres = []
for sent in x_test:
    res = model.analyze(sent)['entities']
    matres.append(res)

# Convert the predicted entity spans back into per-token IOB2 tags so they
# line up with the gold labels in y_test.
y_resu = []
for i, sent in enumerate(matres):
    sent_pred = ['O'] * len(y_test[i])
    for enti in sent:
        bo = enti['beginOffset']
        sent_pred[bo] = 'B-' + enti['type']
        for x in range(bo + 1, enti['endOffset']):
            sent_pred[x] = 'I-' + enti['type']
    y_resu.append(sent_pred)
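# Optional follow-up (an assumption, not part of the original snippet): since
# y_resu now holds IOB2 tags aligned with y_test, the two can be scored with
# seqeval, the same library anago uses for its own evaluation.
from seqeval.metrics import classification_report, f1_score

print(classification_report(y_test, y_resu))
print('entity-level F1:', f1_score(y_test, y_resu))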