Example #1
def main(args):
    print('Loading dataset...')
    x_train, y_train = load_data_and_labels(args.train_data)
    x_valid, y_valid = load_data_and_labels(args.valid_data)
    x_test, y_test = load_data_and_labels(args.test_data)
    x_train = np.r_[x_train, x_valid]
    y_train = np.r_[y_train, y_valid]

    print('Transforming datasets...')
    p = ELMoTransformer()
    p.fit(x_train, y_train)

    print('Loading word embeddings...')
    embeddings = load_glove(EMBEDDING_PATH)
    embeddings = filter_embeddings(embeddings, p._word_vocab.vocab, 100)

    print('Building a model...')
    model = ELModel(char_embedding_dim=args.char_emb_size,
                    word_embedding_dim=args.word_emb_size,
                    char_lstm_size=args.char_lstm_units,
                    word_lstm_size=args.word_lstm_units,
                    char_vocab_size=p.char_vocab_size,
                    word_vocab_size=p.word_vocab_size,
                    num_labels=p.label_size,
                    embeddings=embeddings,
                    dropout=args.dropout)
    model, loss = model.build()
    model.compile(loss=loss, optimizer='adam')

    print('Training the model...')
    trainer = Trainer(model, preprocessor=p)
    trainer.train(x_train, y_train, x_test, y_test)

    print('Saving the model...')
    model.save(args.weights_file, args.params_file)
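
The snippet reads several attributes from args. Below is a minimal sketch of the CLI wiring that could supply them; every flag name and default is an assumption inferred from the attributes main() reads, not taken from the original project.

import argparse

# Hypothetical entry point; flag names mirror the attributes main() reads.
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Train an ELMo-based sequence tagger.')
    parser.add_argument('--train_data', required=True)
    parser.add_argument('--valid_data', required=True)
    parser.add_argument('--test_data', required=True)
    parser.add_argument('--weights_file', default='weights.h5')
    parser.add_argument('--params_file', default='params.json')
    parser.add_argument('--char_emb_size', type=int, default=25)
    parser.add_argument('--word_emb_size', type=int, default=100)
    parser.add_argument('--char_lstm_units', type=int, default=25)
    parser.add_argument('--word_lstm_units', type=int, default=100)
    parser.add_argument('--dropout', type=float, default=0.5)
    main(parser.parse_args())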
Example #2
    @classmethod
    def setUpClass(cls):
        if not os.path.exists(LOG_ROOT):
            os.mkdir(LOG_ROOT)

        if not os.path.exists(SAVE_ROOT):
            os.mkdir(SAVE_ROOT)

        train_path = os.path.join(DATA_ROOT, 'train.txt')
        valid_path = os.path.join(DATA_ROOT, 'valid.txt')
        test_path = os.path.join(DATA_ROOT, 'test.txt')

        x_train, y_train = load_data_and_labels(train_path)
        x_valid, y_valid = load_data_and_labels(valid_path)
        cls.x_test, cls.y_test = load_data_and_labels(test_path)
        cls.x_train = np.r_[x_train, x_valid]
        cls.y_train = np.r_[y_train, y_valid]

        cls.embeddings = load_glove(EMBEDDING_PATH)
        cls.text = 'President Obama is speaking at the White House.'
        cls.dir_path = 'models'
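
For context, a minimal sketch of a test method that could consume these fixtures; the body is an assumption modeled on the fit() calls in the other examples, not part of the original suite.

    def test_fit(self):
        # Hypothetical test using the class-level fixtures prepared above.
        model = anago.Sequence(embeddings=self.embeddings)
        model.fit(self.x_train, self.y_train, self.x_test, self.y_test, epochs=1)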
Example #3
    def __init__(self, emb_path, emb_dim, vocab, n_clusters):

        logger.info('Loading embeddings from: ' + emb_path)
        # Load the pretrained vectors; load_glove returns a word -> vector dict.
        self.embeddings = load_glove(emb_path)
        self.vector_size = len(self.embeddings)  # number of word vectors
        # Derive the dimension from an arbitrary vector instead of assuming a
        # particular word is present, then check it against the requested size.
        self.emb_dim = len(next(iter(self.embeddings.values())))
        if emb_dim is not None:
            assert emb_dim == self.emb_dim

        logger.info('  #vectors: %i, #dimensions: %i' % (self.vector_size, self.emb_dim))

        # get_emb_matrix_given_vocab
        # Pre-allocate the matrix so rows can be assigned by vocab index
        # (indexing into an empty list would raise an IndexError).
        emb_matrix = np.zeros((len(vocab), self.emb_dim))
        counter = 0
        for word, index in vocab.items():
            try:
                emb_matrix[index] = self.embeddings[word]
                counter += 1
            except KeyError:
                pass

        logger.info(
            '%i/%i word vectors initialized (hit rate: %.2f%%)' % (counter, len(vocab), 100 * counter / len(vocab)))
        # L2 normalization; guard against all-zero rows for words missing
        # from the pretrained vectors.
        norms = np.linalg.norm(emb_matrix, axis=-1, keepdims=True)
        norms[norms == 0] = 1.0
        self.norm_emb_matrix = emb_matrix / norms

        # get_aspect_matrix: cluster the normalized embeddings with KMeans;
        # the centroids are used to initialize the aspect matrix.
        km = KMeans(n_clusters=n_clusters)
        km.fit(self.norm_emb_matrix)
        clusters = km.cluster_centers_

        # L2 normalization
        self.norm_aspect_matrix = clusters / np.linalg.norm(clusters, axis=-1, keepdims=True)
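
A short usage sketch of the constructor above; the class name EmbReader, the embedding path, and the toy vocab are hypothetical placeholders, since the snippet does not show the enclosing class.

# Hypothetical usage; 'EmbReader' stands in for the class that owns __init__ above.
vocab = {'nice': 0, 'food': 1, 'service': 2}
reader = EmbReader(emb_path='glove.6B.100d.txt', emb_dim=100,
                   vocab=vocab, n_clusters=2)
print(reader.norm_aspect_matrix.shape)  # (2, 100): one normalized centroid per cluster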
Example #4
File: anago.py  Project: IgorMunizS/SeqTag
def training(train, test):
    x_train = [x.split() for x in train['sentence'].tolist()]
    y_train = train['tag'].tolist()

    x_train, x_val, y_train, y_val = train_test_split(x_train,
                                                      y_train,
                                                      train_size=0.8,
                                                      random_state=233)

    print('Transforming datasets...')
    p = IndexTransformer(use_char=True)
    p.fit(x_train, y_train)

    embeddings = load_glove(config.glove_file)

    embeddings = filter_embeddings(embeddings, p._word_vocab.vocab,
                                   config.glove_size)

    model = BiLSTMCRF(char_vocab_size=p.char_vocab_size,
                      word_vocab_size=p.word_vocab_size,
                      num_labels=p.label_size,
                      word_embedding_dim=300,
                      char_embedding_dim=100,
                      word_lstm_size=100,
                      char_lstm_size=50,
                      fc_dim=100,
                      dropout=0.5,
                      embeddings=embeddings,
                      use_char=True,
                      use_crf=True)

    opt = Adam(lr=0.001)
    model, loss = model.build()
    model.compile(loss=loss, optimizer=opt, metrics=[crf_viterbi_accuracy])

    filepath = '../models/best_model'
    ckp = ModelCheckpoint(filepath + '.h5',
                          monitor='val_crf_viterbi_accuracy',
                          verbose=1,
                          save_best_only=True,
                          mode='max',
                          save_weights_only=True)

    es = EarlyStopping(monitor='val_crf_viterbi_accuracy',
                       min_delta=0.00001,
                       patience=3,
                       verbose=1,
                       mode='max')
    rlr = ReduceLROnPlateau(monitor='val_crf_viterbi_accuracy',
                            factor=0.2,
                            patience=2,
                            verbose=1,
                            mode='max',
                            min_delta=0.0001)

    callbacks = [ckp, es, rlr]

    train_seq = NERSequence(x_train, y_train, config.batch_size, p.transform)

    valid_seq = None
    if x_val and y_val:
        valid_seq = NERSequence(x_val, y_val, config.batch_size, p.transform)
        f1 = F1score(valid_seq, preprocessor=p)
        callbacks.append(f1)

    model.fit_generator(generator=train_seq,
                        validation_data=valid_seq,
                        epochs=config.nepochs,
                        callbacks=callbacks,
                        verbose=True,
                        shuffle=True,
                        use_multiprocessing=True,
                        workers=42)
Example #5
import pandas as pd
from pathlib import Path
import anago
from anago.utils import load_glove
import utils

train_path = Path.cwd().joinpath('data/semeval-2016/train.csv')
test_path = Path.cwd().joinpath('data/semeval-2016/test.csv')

# Read data
data_train = pd.read_csv(train_path)
data_test = pd.read_csv(test_path)

x_train, y_train = utils.df2data(data_train)
x_test, y_test = utils.df2data(data_test)

# Load glove embedding
EMBEDDING_PATH = '../embedding_weights/glove.840B.300d.txt'
embeddings = load_glove(EMBEDDING_PATH)

# Use pre-trained word embeddings to train
model = anago.Sequence(embeddings=embeddings, word_embedding_dim=300)
model.fit(x_train, y_train, x_test, y_test, epochs=10)
Example #6
def training(train, test, fold):
    x_train = [x.split() for x in train['sentence'].tolist()]
    y_train = train['tag'].tolist()

    x_test = [x.split() for x in test['sentence'].tolist()]

    print('Transforming datasets...')
    p = IndexTransformer(use_char=True)
    p.fit(x_train + x_test, y_train)

    skf = KFold(n_splits=config.nfolds, random_state=config.seed, shuffle=True)

    embeddings = load_glove(config.glove_file)
    # embeddings_fast = load_glove(config.glove_file)
    embeddings_wang = load_glove(config.wang_file)

    embeddings = filter_embeddings(embeddings, p._word_vocab.vocab,
                                   config.glove_size)
    # embeddings_fast = filter_embeddings(embeddings_fast, p._word_vocab.vocab, config.fasttext_size)
    embeddings_wang = filter_embeddings(embeddings_wang, p._word_vocab.vocab,
                                        config.wang_size)

    embeddings = np.concatenate((embeddings, embeddings_wang), axis=1)

    for n_fold, (train_indices, val_indices) in enumerate(skf.split(x_train)):

        if n_fold >= fold:
            print("Training fold: ", n_fold)
            x_val = list(np.array(x_train)[val_indices])
            y_val = list(np.array(y_train)[val_indices])

            x_train_spl = list(np.array(x_train)[train_indices])
            y_train_spl = list(np.array(y_train)[train_indices])

            model = BiLSTMCRF(char_vocab_size=p.char_vocab_size,
                              word_vocab_size=p.word_vocab_size,
                              num_labels=p.label_size,
                              word_embedding_dim=1200,
                              char_embedding_dim=50,
                              word_lstm_size=300,
                              char_lstm_size=300,
                              fc_dim=50,
                              dropout=0.5,
                              embeddings=embeddings,
                              use_char=True,
                              use_crf=True)

            opt = Adam(lr=0.001)
            model, loss = model.build()
            model.compile(loss=loss,
                          optimizer=opt,
                          metrics=[crf_viterbi_accuracy])

            es = EarlyStopping(monitor='val_crf_viterbi_accuracy',
                               patience=3,
                               verbose=1,
                               mode='max',
                               restore_best_weights=True)

            rlr = ReduceLROnPlateau(monitor='val_crf_viterbi_accuracy',
                                    factor=0.2,
                                    patience=2,
                                    verbose=1,
                                    mode='max')

            callbacks = [es, rlr]

            train_seq = NERSequence(x_train_spl, y_train_spl,
                                    config.batch_size, p.transform)

            valid_seq = None
            if x_val and y_val:
                valid_seq = NERSequence(x_val, y_val, config.batch_size,
                                        p.transform)
                f1 = F1score(valid_seq, preprocessor=p, fold=n_fold)
                callbacks.append(f1)

            model.fit_generator(generator=train_seq,
                                validation_data=valid_seq,
                                epochs=config.nepochs,
                                callbacks=callbacks,
                                verbose=True,
                                shuffle=True,
                                use_multiprocessing=True,
                                workers=12)

            p.save('../models/best_transform.it')
            model.load_weights('../models/best_model_' + str(n_fold) + '.h5')
            predict(model, p, x_test, n_fold)
Example #7
    def fit(self, X, y):
        """ Trains the NER model. Input is list of AnnotatedDocuments.

            Parameters
            ----------
            X : list(list(str))
                list of list of tokens
            y : list(list(str))
                list of list of BIO tags

            Returns
            -------
            self
        """
        if self.embeddings is None and self.embeddings_file is None:
            raise ValueError(
                "Either embeddings or embeddings_file should be provided, exiting."
            )

        log.info("Preprocessing dataset...")
        self.preprocessor_ = ELMoTransformer()
        self.preprocessor_.fit(X, y)

        if self.embeddings is None:
            self.embeddings = load_glove(self.embeddings_file)
            # Infer the embedding dimension from an arbitrary loaded vector.
            embeddings_dim = self.embeddings[list(
                self.embeddings.keys())[0]].shape[0]
            self.embeddings = filter_embeddings(
                self.embeddings, self.preprocessor_._word_vocab.vocab,
                embeddings_dim)

        log.info("Building model...")
        self.model_ = ELModel(
            char_embedding_dim=self.char_embedding_dim,
            word_embedding_dim=self.word_embedding_dim,
            char_lstm_size=self.char_lstm_size,
            word_lstm_size=self.word_lstm_size,
            char_vocab_size=self.preprocessor_.char_vocab_size,
            word_vocab_size=self.preprocessor_.word_vocab_size,
            num_labels=self.preprocessor_.label_size,
            embeddings=self.embeddings,
            dropout=self.dropout)

        self.model_, loss = self.model_.build()
        optimizer = Adam(lr=self.learning_rate)
        self.model_.compile(loss=loss, optimizer=optimizer)
        self.model_.summary()

        log.info('Training the model...')
        self.trainer_ = Trainer(self.model_, preprocessor=self.preprocessor_)

        x_train, x_valid, y_train, y_valid = train_test_split(X,
                                                              y,
                                                              test_size=0.1,
                                                              random_state=42)
        self.trainer_.train(x_train,
                            y_train,
                            x_valid=x_valid,
                            y_valid=y_valid,
                            batch_size=self.batch_size,
                            epochs=self.max_iter)

        self.tagger_ = Tagger(self.model_, preprocessor=self.preprocessor_)

        return self
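
A brief usage sketch, assuming the surrounding class is a scikit-learn-style estimator; the class name ElmoNER and its constructor argument are placeholders, not taken from the source.

# Hypothetical: 'ElmoNER' stands in for the estimator class that defines fit().
ner = ElmoNER(embeddings_file='glove.6B.100d.txt')
X = [['President', 'Obama', 'is', 'speaking', '.']]  # token lists
y = [['O', 'B-PER', 'O', 'O', 'O']]                  # BIO tag lists
# A real call needs enough sentences for the internal 90/10 split.
ner.fit(X, y)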