Example 1
    conll_cb = ConllCallback(test_x,
                             test_y,
                             dataset.tags_vocab.vocab,
                             batch_size=args.b)

    # train model
    model.fit(x=train_x,
              y=train_y,
              batch_size=args.b,
              epochs=args.e,
              validation=(test_x, test_y),
              callbacks=[conll_cb])
    print('Training done.')

    print('Saving model')
    model.save(args.model_path)
    with open(args.model_info_path, 'wb') as fp:
        info = {
            'type': 'seq2seq',
            'tags_vocab': dataset.tags_vocab.vocab,
            'word_vocab': dataset.word_vocab.vocab,
            'char_vocab': dataset.char_vocab.vocab,
            'intent_vocab': dataset.intents_vocab.vocab,
        }
        pickle.dump(info, fp)

    # test performance
    predictions = model.predict(test_x, batch_size=args.b)
    eval = get_conll_scores(predictions, test_y,
                            {v: k for k, v in dataset.tags_vocab.vocab.items()})
    print(eval)
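
A counterpart loading step is not shown in this example. A minimal sketch of restoring the pickled model info for inference, assuming only what the save code above establishes (the key names mirror the info dict):

import pickle

# Restore the vocabularies saved above; keys mirror the 'info' dict.
with open(args.model_info_path, 'rb') as fp:
    info = pickle.load(fp)

# Invert the tag vocabulary for decoding predictions,
# exactly as done for evaluation at the end of this example.
id_to_tag = {v: k for k, v in info['tags_vocab'].items()}
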
Example 2
    def on_epoch_end(self, epoch, logs=None):
        predictions = self.model.predict(self.x, batch_size=self.bsz)
        f1 = get_conll_scores(predictions, self.y, self.y_vocab)[0][-1]
        print()
        print('Conll eval F1: {}'.format(f1))
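
The hook above assumes a callback object that stores the evaluation data at construction time. A minimal sketch of such a class, following the ConllCallback(x, y, y_vocab, batch_size=...) signature used throughout these examples (the class name is hypothetical; get_conll_scores is assumed to be in scope as in the snippets):

from tensorflow.keras.callbacks import Callback

class ConllF1Callback(Callback):
    # Sketch of the callback the on_epoch_end hook above belongs to.
    def __init__(self, x, y, y_vocab, batch_size=1):
        super().__init__()
        self.x = x              # held-out inputs
        self.y = y              # held-out labels
        self.y_vocab = y_vocab  # id -> tag mapping passed to get_conll_scores
        self.bsz = batch_size

    def on_epoch_end(self, epoch, logs=None):
        # self.model is set by Keras when the callback is registered with fit()
        predictions = self.model.predict(self.x, batch_size=self.bsz)
        f1 = get_conll_scores(predictions, self.y, self.y_vocab)[0][-1]
        print()
        print('Conll eval F1: {}'.format(f1))
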
Example 3
    if char_train is not None:  # assumed guard; the original condition is elided in this snippet
        train_features = [words_train, char_train]
        test_features = [words_test, char_test]
    else:
        train_features = words_train
        test_features = words_test
    train_labels = [pos_train, chunk_train]
    test_labels = [pos_test, chunk_test]
    chunk_f1_cb = ConllCallback(test_features,
                                chunk_test,
                                dataset.chunk_vocab.vocab,
                                batch_size=64)
    model.fit(train_features,
              train_labels,
              epochs=args.e,
              batch_size=args.b,
              validation_data=(test_features, test_labels),
              callbacks=[chunk_f1_cb])

    # save model
    _save_model()

    # load model
    model = SequenceChunker(use_cudnn=args.use_cudnn)
    model.load(model_path)

    # print evaluation metric
    chunk_pred = model.predict(test_features, 64)
    res = get_conll_scores(chunk_pred, chunk_test,
                           dataset.chunk_vocab.reverse_vocab())
    print(res)
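
This example recovers the id-to-tag mapping with dataset.chunk_vocab.reverse_vocab(), while the other examples invert the vocabulary inline. Assuming reverse_vocab() simply swaps keys and values, the two are interchangeable:

# Two equivalent ways to map label ids back to tag strings
# (assuming reverse_vocab() swaps keys and values):
id_to_tag = dataset.chunk_vocab.reverse_vocab()
id_to_tag = {v: k for k, v in dataset.chunk_vocab.vocab.items()}
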
Example 4
    def on_epoch_end(self, epoch, logs=None):
        predictions = self.model.predict(self.x, batch_size=self.bsz)
        stats = get_conll_scores(predictions, self.y, self.y_vocab)
        print()
        print("Conll eval: \n{}".format(stats))
Example 5
def run_aspect_sequence_tagging(
    train_file,
    test_file,
    models_path: str,
    logs_path: Path,
    augment_data: bool,
    embedding_model: str,
    word_embedding_dims: int,
    character_embedding_dims: int,
    char_features_lstm_dims: int,
    entity_tagger_lstm_dims: int,
    tagger_fc_dims: int,
    batch_size=10,
    epoch=50,
    tag_num=2,
    sentence_length=30,
    word_length=20,
    dropout=0.2,
    bilstm_layer: bool = True,
    crf_layer: bool = False,
    word_embedding_flag: bool = True,
    char_embedding_flag: bool = True,
    similarity_threshold: float = 0.8,
):
    network_params = [
        ('char', char_embedding_flag),
        ('word', word_embedding_flag),
        ('bilstm', bilstm_layer),
        ('lstm', not bilstm_layer),
        ('crf', crf_layer),
        (str(epoch) + 'epochs', True),
        (str(similarity_threshold) + 'augmented', augment_data),
    ]

    network_params_string = '-'.join(
        [param for param, flag in network_params if flag])

    trained_models_path = Path('trained', models_path)
    trained_models_path.mkdir(exist_ok=True, parents=True)
    logs_path = logs_path / models_path
    logs_path.mkdir(exist_ok=True, parents=True)

    # load dataset and parameters
    model_name = 'model-info-{}-{}.info'.format(network_params_string,
                                                basename(train_file))
    models_path = join(models_path, model_name)
    if Path(models_path).exists():
        click.echo('Model has been already computed and saved!')
        return

    dataset = SequentialTaggingDataset(
        train_file=train_file,
        test_file=test_file,
        augment_data=augment_data,
        similarity_threshold=similarity_threshold,
        max_sentence_length=sentence_length,
        max_word_length=word_length,
        tag_field_no=tag_num)

    # get the train and test data sets
    x_train, x_char_train, y_train = dataset.train
    x_test, x_char_test, y_test = dataset.test

    if word_embedding_flag and char_embedding_flag:
        x_train = [x_train, x_char_train]
        x_test = [x_test, x_char_test]
    elif word_embedding_flag and not char_embedding_flag:
        pass  # word features only: inputs already hold the word matrices
    elif not word_embedding_flag and char_embedding_flag:
        x_train = x_char_train
        x_test = x_char_test
    else:
        raise ValueError('at least one of word_embedding_flag and '
                         'char_embedding_flag must be True')

    num_y_labels = len(dataset.y_labels) + 1
    vocabulary_size = dataset.word_vocab_size + 1
    char_vocabulary_size = dataset.char_vocab_size + 1

    y_test = to_categorical(y_test, num_y_labels)
    y_train = to_categorical(y_train, num_y_labels)

    aspect_model = AspectExtraction()
    aspect_model.build(
        sentence_length,
        word_length,
        num_y_labels,
        dataset.word_vocab,
        vocabulary_size,
        char_vocabulary_size,
        word_embedding_dims=word_embedding_dims,
        char_embedding_dims=character_embedding_dims,
        word_lstm_dims=char_features_lstm_dims,
        tagger_lstm_dims=entity_tagger_lstm_dims,
        tagger_fc_dims=tagger_fc_dims,
        dropout=dropout,
        external_embedding_model=embedding_model,
        bilstm_layer=bilstm_layer,
        crf_layer=crf_layer,
        word_embedding_flag=word_embedding_flag,
        char_embedding_flag=char_embedding_flag,
    )

    # Set callback functions to early stop training and save the best model so far
    tensorboard_path = (logs_path / ('tensorboard-' + model_name)).as_posix()
    print('Tensorboard: ' + tensorboard_path)

    callbacks = [
        ConllCallback(x_test, y_test, dataset.y_labels, batch_size=batch_size),
        TensorBoard(log_dir=tensorboard_path),
        EarlyStopping(monitor='val_loss', patience=2),
        # ModelCheckpoint(
        #     filepath=(trained_models_path / '{}-best_model.h5'.format(model_name)).as_posix(),
        #     monitor='val_loss',
        #     save_best_only=True
        # )
    ]

    aspect_model.fit(x=x_train,
                     y=y_train,
                     batch_size=batch_size,
                     epochs=epoch,
                     callbacks=callbacks,
                     validation=(x_test, y_test))

    # running predictions
    predictions = aspect_model.predict(x=x_test, batch_size=1)
    eval = get_conll_scores(predictions, y_test,
                            {v: k for k, v in dataset.y_labels.items()})
    pp = pprint.PrettyPrinter(indent=4)
    pp.pprint(eval)

    # saving model
    with open(models_path, 'wb') as fp:
        info = {
            'sentence_len': sentence_length,
            'word_len': word_length,
            'num_of_labels': num_y_labels,
            'labels_id_to_word': {v: k for k, v in dataset.y_labels.items()},
            'epoch': epoch,
            # 'word_vocab': dataset.word_vocab,
            'vocab_size': vocabulary_size,
            'char_vocab_size': char_vocabulary_size,
            'char_vocab': dataset.char_vocab,
            'word_embedding_dims': word_embedding_dims,
            'char_embedding_dims': character_embedding_dims,
            'word_lstm_dims': char_features_lstm_dims,
            'tagger_lstm_dims': entity_tagger_lstm_dims,
            'dropout': dropout,
            'external_embedding_model': embedding_model,
            'train_file': train_file,
            'test_file': test_file,
            # 'test_raw_sentences': dataset.test_raw_sentences,
            'eval': eval,
            # 'data_augmentation': dataset.data_augmentation,
            # 'augment_data': augment_data,
            'similarity_threshold': similarity_threshold,
            'bilstm_layer': bilstm_layer,
            'crf_layer': crf_layer,
            'word_embedding_layer': word_embedding_flag,
            'char_embedding_layer': char_embedding_flag,
            # 'predictions': predictions,
            # 'y_test': y_test,
            # 'y_labels': dataset.y_labels
        }
        print('Save model in: ' + models_path)
        pickle.dump(info, fp)
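
A hypothetical invocation of run_aspect_sequence_tagging, with placeholder file names and illustrative dimensions (none of these values come from the source):

from pathlib import Path

run_aspect_sequence_tagging(
    train_file='train.conll',    # placeholder path
    test_file='test.conll',      # placeholder path
    models_path='aspect-models',
    logs_path=Path('logs'),
    augment_data=False,
    embedding_model=None,        # or a path to a pretrained embedding model
    word_embedding_dims=100,     # illustrative dimensions
    character_embedding_dims=25,
    char_features_lstm_dims=25,
    entity_tagger_lstm_dims=100,
    tagger_fc_dims=100,
)
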
Example 6
    conll_cb = ConllCallback(test_inputs,
                             y_test,
                             dataset.y_labels.vocab,
                             batch_size=args.b)
    ner_model.fit(
        x=train_inputs,
        y=y_train,
        batch_size=args.b,
        epochs=args.e,
        callbacks=[conll_cb],
        validation=(test_inputs, y_test),
    )

    # saving model
    ner_model.save(args.model_path)
    with open(args.model_info_path, "wb") as fp:
        info = {
            "y_vocab": dataset.y_labels.vocab,
            "word_vocab": dataset.word_vocab.vocab,
            "char_vocab": dataset.char_vocab.vocab,
        }
        pickle.dump(info, fp)

    # running predictions
    predictions = ner_model.predict(x=test_inputs, batch_size=args.b)
    eval = get_conll_scores(predictions, y_test,
                            {v: k for k, v in dataset.y_labels.vocab.items()})
    print(eval)
Example 7
        print('Creating new model, starting to train from scratch')
        model.build(args.sentence_length,
                    dataset.vocab_size,
                    dataset.label_vocab_size,
                    args.token_emb_size,
                    args.encoder_depth,
                    args.decoder_depth,
                    args.lstm_hidden_size,
                    args.encoder_dropout,
                    args.decoder_dropout,
                    args.embedding_model)

    conll_cb = ConllCallback(test_x, test_y, dataset.labels_vocab, batch_size=args.b)
    cp_cb = ModelCheckpoint(model_path, verbose=1, period=args.save_epochs)

    # train model
    model.fit(x=train_x, y=train_y,
              batch_size=args.b, epochs=args.e,
              validation=(test_x, test_y),
              callbacks=[conll_cb, cp_cb])
    print('Training done.')

    # test performance
    predictions = model.predict(test_x, batch_size=args.b)
    eval = get_conll_scores(predictions, test_y,
                            {v: k for k, v in dataset.labels_vocab.items()})
    if args.full_eval is True:
        print(eval)
    else:
        print(eval[0])
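
The full_eval switch relies on the shape of the value returned by get_conll_scores. Judging from its usages in these examples ([0][-1] as the F1 in the epoch-end hooks, eval[1]['NP'] in Example 10), it appears to be a pair of overall statistics and per-label statistics; a hedged sketch of unpacking it under that assumption:

# Assumed structure, inferred from the surrounding examples:
#   scores[0] -- overall statistics, ending with the F1 (cf. [0][-1] above)
#   scores[1] -- per-label statistics, indexable by tag (cf. eval[1]['NP'])
scores = get_conll_scores(predictions, test_y,
                          {v: k for k, v in dataset.labels_vocab.items()})
overall, per_label = scores[0], scores[1]
print('F1: {}'.format(overall[-1]))
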
Example 8
    ner_model.fit(x=[x_train, x_char_train],
                  y=y_train,
                  batch_size=args.b,
                  epochs=args.e,
                  callbacks=[conll_cb],
                  validation=([x_test, x_char_test], y_test))

    # saving model
    ner_model.save(args.model_path)
    with open(args.model_info_path, 'wb') as fp:
        info = {
            'sentence_len': args.sentence_length,
            'word_len': args.word_length,
            'num_of_labels': num_y_labels,
            'labels_id_to_word': {v: k for k, v in dataset.y_labels.items()},
            'word_vocab': dataset.word_vocab,
            'vocab_size': vocabulary_size,
            'char_vocab_size': char_vocabulary_size,
            'char_vocab': dataset.char_vocab,
            'word_embedding_dims': args.word_embedding_dims,
            'char_embedding_dims': args.character_embedding_dims,
            'word_lstm_dims': args.char_features_lstm_dims,
            'tagger_lstm_dims': args.entity_tagger_lstm_dims,
            'dropout': args.dropout,
            'external_embedding_model': args.embedding_model
        }
        pickle.dump(info, fp)

    # running predictions
    predictions = ner_model.predict(x=[x_test, x_char_test], batch_size=1)
    eval = get_conll_scores(predictions, y_test, {v: k for k, v in dataset.y_labels.items()})
    pp = pprint.PrettyPrinter(indent=4)
    pp.pprint(eval)
Example 9
    def on_epoch_end(self, epoch, logs=None):
        predictions = self.model.predict(self.x, batch_size=self.bsz)
        f1 = get_conll_scores(predictions, self.y, self.y_vocab)[0][-1]
        print()
        print('Conll eval F1: {}'.format(f1))
Example 10
    model.fit(train_set,
              optimizer=optimizer,
              epochs=args.epochs,
              cost=cost,
              callbacks=callbacks)

    # save model
    model_settings = {
        'sentence_len': args.sentence_len,
        'use_embeddings': args.embedding_model is not None,
        'pos': args.use_pos,
        'char_rnn': args.use_char_rnn,
        'y_vocab': dataset.y_vocab,
        'vocabs': dataset.vocabs,
    }

    with open(settings_path + '.dat', 'wb') as fp:
        pickle.dump(model_settings, fp)
    model.save(model_path)

    # tagging accuracy
    y_preds = model.predict(test_set)
    predictions = y_preds.argmax(2)
    truth_labels = test_set.y.reshape(-1, args.sentence_len)

    eval = get_conll_scores(predictions, truth_labels,
                            {v + 1: k for k, v in dataset.y_vocab.items()})
    if args.print_np_perf is True:
        print('NP performance: {}'.format(eval[1]['NP']))
    else:
        print('Global performance: {}'.format(eval[0]))
Example 11
    model.fit(train_set,
              optimizer=optimizer,
              epochs=args.epochs,
              cost=cost,
              callbacks=callbacks)

    # save model
    model_settings = {'sentence_len': args.sentence_len,
                      'use_embeddings': args.embedding_model is not None,
                      'pos': args.use_pos,
                      'char_rnn': args.use_char_rnn,
                      'y_vocab': dataset.y_vocab,
                      'vocabs': dataset.vocabs,
                      }

    with open(settings_path + '.dat', 'wb') as fp:
        pickle.dump(model_settings, fp)
    model.save(model_path)

    # tagging accuracy
    y_preds = model.predict(test_set)
    predictions = y_preds.argmax(2)
    truth_labels = test_set.y.reshape(-1, args.sentence_len)

    eval = get_conll_scores(predictions, truth_labels,
                            {v + 1: k for k, v in dataset.y_vocab.items()})
    if args.print_np_perf is True:
        print('NP performance: {}'.format(eval[1]['NP']))
    else:
        print('Global performance: {}'.format(eval[0]))
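
The decoding step above turns per-token class probabilities into label ids with argmax over axis 2. A self-contained numpy illustration of that reduction (shapes are illustrative only):

import numpy as np

# Fake model output: (batch, sentence_len, num_labels) scores.
y_preds = np.random.rand(2, 5, 4)

# argmax over axis 2 picks the most probable label id per token,
# yielding a (batch, sentence_len) array of label ids.
predictions = y_preds.argmax(2)
print(predictions.shape)  # (2, 5)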