def setUpClass(cls):
        """One-time fixture setup shared by all tests in this class.

        Creates the log/save directories if missing, loads the train/valid/test
        CoNLL-style splits, loads GloVe embeddings, and prepares a sample
        sentence plus a model directory path as class attributes.

        NOTE(review): the 8-space body indentation suggests this was pasted out
        of a TestCase class body; LOG_ROOT, SAVE_ROOT, DATA_ROOT and
        EMBEDDING_PATH are module-level constants defined elsewhere.
        """
        # Ensure output directories exist (single-level mkdir, parents assumed).
        if not os.path.exists(LOG_ROOT):
            os.mkdir(LOG_ROOT)

        if not os.path.exists(SAVE_ROOT):
            os.mkdir(SAVE_ROOT)

        train_path = os.path.join(DATA_ROOT, 'train.txt')
        valid_path = os.path.join(DATA_ROOT, 'valid.txt')
        test_path = os.path.join(DATA_ROOT, 'test.txt')

        # Token sequences (x_*) and label sequences (y_*) for each split.
        cls.x_train, cls.y_train = load_data_and_labels(train_path)
        cls.x_valid, cls.y_valid = load_data_and_labels(valid_path)
        cls.x_test, cls.y_test = load_data_and_labels(test_path)

        # Pre-trained word vectors shared across tests.
        cls.embeddings = load_glove(EMBEDDING_PATH)

        # Sample tokenized sentence for prediction/analyze tests.
        cls.words = 'President Obama is speaking at the White House.'.split()

        # Directory used when tests save/load a model.
        cls.dir_path = 'models'
def train_base_model(batch_size: int, max_epoch: int, log_dir: str,
                     patience: int, no_log: bool) -> None:
    """Train a base NER model and save it to BASE_MODEL_PATH.

    (Note: Not optimized for web parsing)

    Args:
        batch_size (int): number of batches to train on
        max_epoch (int): number of epochs to train the data on, early stopping
            is on by default
        log_dir (str): path to save tensorboard log information
        patience (int): number of epochs to wait before stopping early
        no_log (bool): don't log training data

    """
    # Resolve logging first so we never create a log directory that the
    # run will not use (previously the dir was made even when no_log=True).
    if no_log:
        log_dir = None
    elif not os.path.exists(log_dir):
        os.mkdir(log_dir)
    if not os.path.exists(SAVE_DIR):
        os.mkdir(SAVE_DIR)
    if not os.path.exists(BASE_MODEL_PATH):
        os.mkdir(BASE_MODEL_PATH)

    # CoNLL-style token/label files under the module-level data root.
    train_path = os.path.join(DATA_TRAIN, 'train.txt')
    valid_path = os.path.join(DATA_TRAIN, 'valid.txt')

    print('Loading data...')
    x_train, y_train = load_data_and_labels(train_path)
    x_valid, y_valid = load_data_and_labels(valid_path)
    print(len(x_train), 'train sequences')
    print(len(x_valid), 'valid sequences')

    embeddings = load_glove(EMBEDDING_PATH)

    model = anago.Sequence(batch_size=batch_size, max_epoch=max_epoch,
                           log_dir=log_dir, embeddings=embeddings,
                           patience=patience)
    model.train(x_train, y_train, x_valid, y_valid)
    model.save(BASE_MODEL_PATH)
def train(log_dir: str) -> None:
    """Fine-tune the base model on query-derived data.

    Args:
        log_dir (str): path to save tensorboard log information

    """
    # Single existence check (the original repeated this identical check
    # twice back-to-back — the duplicate was dead code).
    if not os.path.exists(log_dir):
        os.mkdir(log_dir)

    x_train, y_train, x_valid, y_valid = train_test_split_from_queries()
    print(len(x_train), 'train sequences')
    print(len(x_valid), 'valid sequences')

    embeddings = load_glove(EMBEDDING_PATH)

    # Honor the caller-supplied log_dir (the original created log_dir but
    # then passed the module constant LOG_DIR, ignoring the argument).
    model = anago.Sequence(log_dir=log_dir, embeddings=embeddings)
    model.load(BASE_MODEL_PATH)
    model.train(x_train, y_train, x_valid, y_valid)
    model.save(CUSTOM_MODEL_PATH)
# Beispiel #4 (Example #4) — standalone anago training script follows.
import anago
from anago.reader import load_data_and_labels, load_glove

# Load CoNLL-style token/label splits from the working directory.
x_train, y_train = load_data_and_labels('train.txt')
x_valid, y_valid = load_data_and_labels('valid.txt')
x_test, y_test = load_data_and_labels('test.txt')

# Pre-trained word vectors (GloVe text format; Indonesian, per the filename).
EMBEDDING_PATH = 'vectors-ind.txt'
embeddings = load_glove(EMBEDDING_PATH)

# model = anago.Sequence()
# BiLSTM-CRF sequence tagger with both character and word features.
model = anago.Sequence(char_emb_size=100,word_emb_size=50,char_lstm_units=25,word_lstm_units=100,dropout=0.5,char_feature=True,crf=True,batch_size=3,optimizer='adam', learning_rate=0.005,lr_decay=0.7,clip_gradients=5.0, embeddings=embeddings)
model.train(x_train, y_train, x_valid, y_valid)

# Report evaluation metrics on the held-out test split.
model.eval(x_test, y_test)

# Predicted entity spans for every test sentence. model.analyze returns a
# dict whose 'entities' value is a list of spans, each carrying
# 'beginOffset', 'endOffset' and a 'type' label.
matres = [model.analyze(sent)['entities'] for sent in x_test]

# Convert the predicted spans back into per-token BIO tags, aligned with
# y_test, so predictions can be compared label-for-label.
y_resu = []
for i, entities in enumerate(matres):
    # Start from all-'O' and overwrite the tokens covered by each entity.
    sent_pred = ['O'] * len(y_test[i])
    for entity in entities:
        begin = entity['beginOffset']
        sent_pred[begin] = 'B-' + entity['type']
        # Tokens after the first, up to (but excluding) endOffset, are I- tags
        # — endOffset appears to be exclusive here.
        for pos in range(begin + 1, entity['endOffset']):
            sent_pred[pos] = 'I-' + entity['type']
    y_resu.append(sent_pred)