Example #1
import numpy as np

from anago.models import ELModel
from anago.preprocessing import ELMoTransformer
from anago.trainer import Trainer
from anago.utils import load_data_and_labels, load_glove, filter_embeddings

# EMBEDDING_PATH is a module-level constant in the original script.

def main(args):
    print('Loading dataset...')
    x_train, y_train = load_data_and_labels(args.train_data)
    x_valid, y_valid = load_data_and_labels(args.valid_data)
    x_test, y_test = load_data_and_labels(args.test_data)
    x_train = np.r_[x_train, x_valid]
    y_train = np.r_[y_train, y_valid]

    print('Transforming datasets...')
    p = ELMoTransformer()
    p.fit(x_train, y_train)

    print('Loading word embeddings...')
    embeddings = load_glove(EMBEDDING_PATH)
    embeddings = filter_embeddings(embeddings, p._word_vocab.vocab, 100)

    print('Building a model.')
    model = ELModel(char_embedding_dim=args.char_emb_size,
                    word_embedding_dim=args.word_emb_size,
                    char_lstm_size=args.char_lstm_units,
                    word_lstm_size=args.word_lstm_units,
                    char_vocab_size=p.char_vocab_size,
                    word_vocab_size=p.word_vocab_size,
                    num_labels=p.label_size,
                    embeddings=embeddings,
                    dropout=args.dropout)
    model, loss = model.build()
    model.compile(loss=loss, optimizer='adam')

    print('Training the model...')
    trainer = Trainer(model, preprocessor=p)
    trainer.train(x_train, y_train, x_test, y_test)

    print('Saving the model...')
    model.save(args.weights_file, args.params_file)
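None of these `main(args)` trainers shows how `args` is built. A minimal argparse wrapper consistent with the attributes accessed above; the flag names are taken from those attributes, but every default value is an assumption:

import argparse

if __name__ == '__main__':
    # Hypothetical CLI; defaults are illustrative, not anago's.
    parser = argparse.ArgumentParser(description='Train an ELMo-based tagger.')
    parser.add_argument('--train_data', required=True)
    parser.add_argument('--valid_data', required=True)
    parser.add_argument('--test_data', required=True)
    parser.add_argument('--weights_file', default='weights.h5')
    parser.add_argument('--params_file', default='params.json')
    parser.add_argument('--char_emb_size', type=int, default=25)
    parser.add_argument('--word_emb_size', type=int, default=100)
    parser.add_argument('--char_lstm_units', type=int, default=25)
    parser.add_argument('--word_lstm_units', type=int, default=100)
    parser.add_argument('--dropout', type=float, default=0.5)
    main(parser.parse_args())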
Example #2
from anago.models import BiLSTMCRF
from anago.preprocessing import IndexTransformer
from anago.trainer import Trainer
from anago.utils import load_data_and_labels


def main(args):
    print('Loading dataset...')
    x_train, y_train = load_data_and_labels(args.train_data)
    x_valid, y_valid = load_data_and_labels(args.valid_data)

    print('Transforming datasets...')
    p = IndexTransformer(use_char=args.no_char_feature)
    p.fit(x_train, y_train)

    print('Building a model.')
    model = BiLSTMCRF(char_embedding_dim=args.char_emb_size,
                      word_embedding_dim=args.word_emb_size,
                      char_lstm_size=args.char_lstm_units,
                      word_lstm_size=args.word_lstm_units,
                      char_vocab_size=p.char_vocab_size,
                      word_vocab_size=p.word_vocab_size,
                      num_labels=p.label_size,
                      dropout=args.dropout,
                      use_char=args.no_char_feature,
                      use_crf=args.no_use_crf)
    model, loss = model.build()
    model.compile(loss=loss, optimizer='adam')

    print('Training the model...')
    trainer = Trainer(model, preprocessor=p)
    trainer.train(x_train, y_train, x_valid, y_valid)

    print('Saving the model...')
    model.save(args.weights_file, args.params_file)
    p.save(args.preprocessor_file)
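With the weights, the hyperparameters, and the preprocessor saved, the model can later be restored without retraining. A sketch, assuming anago's `load_model` helper in `anago.models`, the `IndexTransformer.load` classmethod, and the file names used in Example #8:

from anago.models import load_model
from anago.preprocessing import IndexTransformer
from anago.tagger import Tagger

# Restore the network and the fitted preprocessor, then wrap them
# in a Tagger for inference. File names here are hypothetical.
model = load_model('weights.h5', 'params.json')
p = IndexTransformer.load('preprocessor.pickle')
tagger = Tagger(model, preprocessor=p)
print(tagger.analyze('President Obama is speaking at the White House.'))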
Example #3
from gensim.models.keyedvectors import KeyedVectors
from sklearn.model_selection import train_test_split

from anago.models import BiLSTMCRF
from anago.preprocessing import IndexTransformer
from anago.trainer import Trainer
from anago.utils import load_data_and_labels, filter_embeddings


def main(args):
    print('Loading datasets...')
    X, y = load_data_and_labels(args.data_path)
    x_train, x_valid, y_train, y_valid = train_test_split(X,
                                                          y,
                                                          test_size=0.1,
                                                          random_state=42)
    embeddings = KeyedVectors.load(args.embedding_path).wv

    print('Transforming datasets...')
    p = IndexTransformer()
    p.fit(X, y)
    embeddings = filter_embeddings(embeddings, p._word_vocab,
                                   embeddings.vector_size)

    print('Building a model...')
    model = BiLSTMCRF(char_vocab_size=p.char_vocab_size,
                      word_vocab_size=p.word_vocab_size,
                      num_labels=p.label_size,
                      embeddings=embeddings,
                      char_embedding_dim=50)
    model.build()

    print('Training the model...')
    trainer = Trainer(model, preprocessor=p)
    trainer.train(x_train, y_train, x_valid, y_valid)

    print('Saving the model...')
    model.save(args.weights_file, args.params_file)
    p.save(args.preprocessor_file)
Example #4
    def setUp(self):
        # Load datasets.
        train_path = os.path.join(DATA_ROOT, 'train.txt')
        valid_path = os.path.join(DATA_ROOT, 'valid.txt')
        self.x_train, self.y_train = load_data_and_labels(train_path)
        self.x_valid, self.y_valid = load_data_and_labels(valid_path)

        # Fit transformer.
        self.p = IndexTransformer()
        self.p.fit(self.x_train, self.y_train)

        # Build a model.
        self.model = BiLSTMCRF(char_vocab_size=self.p.char_vocab_size,
                               word_vocab_size=self.p.word_vocab_size,
                               num_labels=self.p.label_size)
        self.model, loss = self.model.build()
        self.model.compile(loss=loss, optimizer='adam')
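A test method that would naturally follow this fixture, sketched here as an assumption (a one-epoch smoke run through Trainer, mirroring the training examples above):

    def test_train(self):
        # Smoke test: one epoch over the fixture data should run end to end.
        trainer = Trainer(self.model, preprocessor=self.p)
        trainer.train(self.x_train, self.y_train,
                      self.x_valid, self.y_valid, epochs=1)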
Example #5
    def setUpClass(cls):
        if not os.path.exists(LOG_ROOT):
            os.mkdir(LOG_ROOT)

        if not os.path.exists(SAVE_ROOT):
            os.mkdir(SAVE_ROOT)

        train_path = os.path.join(DATA_ROOT, 'train.txt')
        valid_path = os.path.join(DATA_ROOT, 'valid.txt')
        test_path = os.path.join(DATA_ROOT, 'test.txt')

        x_train, y_train = load_data_and_labels(train_path)
        x_valid, y_valid = load_data_and_labels(valid_path)
        cls.x_test, cls.y_test = load_data_and_labels(test_path)
        cls.x_train = np.r_[x_train, x_valid]
        cls.y_train = np.r_[y_train, y_valid]

        cls.embeddings = load_glove(EMBEDDING_PATH)
        cls.text = 'President Obama is speaking at the White House.'
        cls.dir_path = 'models'
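A test built on this fixture might look like the following sketch (assumed, not from the source; it uses anago's high-level Sequence API with a single epoch as a smoke test):

    def test_fit_and_analyze(self):
        # Train briefly on the merged train+valid data, then tag the
        # sample sentence prepared in setUpClass.
        model = anago.Sequence(embeddings=self.embeddings)
        model.fit(self.x_train, self.y_train,
                  self.x_test, self.y_test, epochs=1)
        print(model.analyze(self.text))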
Example #6
    def test_batch_iter(self):
        X, y = load_data_and_labels(self.filename)
        batch_size = 32
        p = IndexTransformer()
        p.fit(X, y)
        gen = NERSequence(X, y, batch_size, preprocess=p.transform)

        y_gen = []
        for i in range(len(gen)):
            x1, y1 = gen[i]
            y_gen.extend(y1)
        self.assertEqual(len(y_gen), len(y))
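The test treats `NERSequence` as a Keras `Sequence` that applies the preprocessor batch by batch. A minimal sketch consistent with that usage (an assumption, not necessarily the library's exact implementation):

import math
from keras.utils import Sequence

class NERSequenceSketch(Sequence):
    # Slices (x, y) into batches and runs an optional preprocess on each.
    def __init__(self, x, y, batch_size=1, preprocess=None):
        self.x, self.y = x, y
        self.batch_size = batch_size
        self.preprocess = preprocess or (lambda a, b: (a, b))

    def __getitem__(self, idx):
        batch_x = self.x[idx * self.batch_size:(idx + 1) * self.batch_size]
        batch_y = self.y[idx * self.batch_size:(idx + 1) * self.batch_size]
        return self.preprocess(batch_x, batch_y)

    def __len__(self):
        return math.ceil(len(self.x) / self.batch_size)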
Example #7
    def test_batch_iter(self):
        X, y = load_data_and_labels(self.filename)
        batch_size = 32
        p = IndexTransformer()
        p.fit(X, y)
        steps, generator = batch_iter(X,
                                      y,
                                      batch_size,
                                      shuffle=False,
                                      preprocessor=p)

        y_gen = []
        for _ in range(steps):
            x1, y1 = next(generator)
            y_gen.extend(y1)
        self.assertEqual(len(y_gen), len(y))
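`batch_iter` is the generator-based counterpart of `NERSequence`: it returns the number of steps per epoch together with an endless batch generator. A sketch consistent with how the test consumes it (assumed, not the exact library code):

import numpy as np

def batch_iter_sketch(data, labels, batch_size, shuffle=True, preprocessor=None):
    num_batches_per_epoch = int((len(data) - 1) / batch_size) + 1

    def data_generator():
        while True:  # loop forever; the caller bounds iteration by steps
            indices = (np.random.permutation(len(data))
                       if shuffle else np.arange(len(data)))
            for batch_num in range(num_batches_per_epoch):
                start = batch_num * batch_size
                end = min(start + batch_size, len(data))
                X = [data[i] for i in indices[start:end]]
                y = [labels[i] for i in indices[start:end]]
                yield preprocessor.transform(X, y) if preprocessor else (X, y)

    return num_batches_per_epoch, data_generator()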
Example #8
import os

import anago
from anago.utils import download, load_data_and_labels

if __name__ == '__main__':
    dir_path = 'test_dir'
    url = 'https://s3-ap-northeast-1.amazonaws.com/dev.tech-sketch.jp/chakki/public/conll2003_en.zip'
    DATA_ROOT = os.path.join(os.path.dirname(__file__),
                             '../data/conll2003/en/ner')

    test_path = os.path.join(DATA_ROOT, 'test.txt')
    x_test, y_test = load_data_and_labels(test_path)

    download(url, dir_path)

    model = anago.Sequence.load('weights.h5', 'params.json',
                                'preprocessor.pickle')
    print(model.score(x_test, y_test))
Example #9
import anago
from anago.utils import load_data_and_labels

x_train, y_train = load_data_and_labels('data/conll2003/en/ner/train.txt')
x_test, y_test = load_data_and_labels('data/conll2003/en/ner/test.txt')
x_dev, y_dev = load_data_and_labels('data/conll2003/en/ner/valid.txt')
model = anago.Sequence()
model.fit(x_train, y_train, x_dev, y_dev, epochs=15)
print(model.score(x_test, y_test))
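A natural follow-up is to persist the trained model with the same trio of files that Example #8 loads, then tag a new sentence. The save signature is assumed to mirror `Sequence.load`:

model.save('weights.h5', 'params.json', 'preprocessor.pickle')

text = 'President Obama is speaking at the White House.'
print(model.analyze(text))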
Example #10
import gensim
import numpy as np
from gensim.models.keyedvectors import KeyedVectors

import anago
from anago.preprocessing import IndexTransformer
from anago.tagger import Tagger
from anago.utils import load_data_and_labels, filter_embeddings

if __name__ == "__main__":
    wv_model = gensim.models.Word2Vec.load(
        "wiki_cbow_100/wikipedia_cbow_100").wv
    train_path = '../../data/collected/NER/train.txt'
    valid_path = '../../data/collected/NER/valid.txt'

    print('Loading data...')
    x_train, y_train = load_data_and_labels(train_path)
    x_valid, y_valid = load_data_and_labels(valid_path)
    print("got ", len(x_train), " entries for training and ", len(x_valid),
          " entries for testing")
    entities = set()
    for s in y_train:
        for w in s:
            entities.add(w)
    print("Defined entities are :", entities)

    preprocessor = IndexTransformer(use_char=True)
    x = x_train + x_valid
    y = y_train + y_valid
    preprocessor.fit(x, y)
    print(len(x_train), 'train sequences')
    print(len(x_valid), 'valid sequences')
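The example stops after fitting the preprocessor. A plausible continuation, following the same pattern as Example #3 (the missing imports are added inline; everything else reuses names defined above):

    from anago.models import BiLSTMCRF
    from anago.trainer import Trainer

    # Restrict the pretrained vectors to the fitted vocabulary.
    embeddings = filter_embeddings(wv_model, preprocessor._word_vocab,
                                   wv_model.vector_size)

    model = BiLSTMCRF(char_vocab_size=preprocessor.char_vocab_size,
                      word_vocab_size=preprocessor.word_vocab_size,
                      num_labels=preprocessor.label_size,
                      embeddings=embeddings)
    model.build()

    trainer = Trainer(model, preprocessor=preprocessor)
    trainer.train(x_train, y_train, x_valid, y_valid)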
Example #11
    def test_extract(self):
        X, y = load_data_and_labels(self.filename)
        self.assertEqual(len(X), len(y))
Example #12
"""Example usage of the high-level API:

    import anago
    from anago.utils import load_data_and_labels

    x_train, y_train = load_data_and_labels('train.txt')
    x_test, y_test = load_data_and_labels('test.txt')

    model = anago.Sequence()
    model.fit(x_train, y_train, epochs=15)
    model.score(x_test, y_test)

    text = 'President Obama is speaking at the White House.'
    text_tags = model.analyze(text)
"""

# Example trainer using anago.

import argparse
import os

from anago.utils import load_data_and_labels
from anago.models import BiLSTMCRF
from anago.preprocessing import IndexTransformer
from anago.trainer import Trainer


def main(args):
    print('Loading dataset...')
    x_train, y_train = load_data_and_labels(args.train_data)
    x_valid, y_valid = load_data_and_labels(args.valid_data)

    print('Transforming datasets...')
    p = IndexTransformer(use_char=args.no_char_feature)