Code example #1
import numpy as np
from keras.models import Model
from keras.layers import Input, Embedding, BatchNormalization, GRU, Dense
from keras import optimizers
from keras.callbacks import ModelCheckpoint
# helper loaders (assumed to live in the project's utils module, as in the
# validation scripts below)
from utils import load_encoder_inputs, load_decoder_inputs, load_text_processor


def main(emb_file, datasource, n_epochs, emb_type, learning_rate):

    encoder_input_data, doc_length = load_encoder_inputs(
        'data/{}/train_body_vecs.npy'.format(datasource))
    decoder_input_data, decoder_target_data = load_decoder_inputs(
        'data/{}/train_title_vecs.npy'.format(datasource))

    num_encoder_tokens, body_pp = load_text_processor(
        'data/{}/body_pp.dpkl'.format(datasource))
    num_decoder_tokens, title_pp = load_text_processor(
        'data/{}/title_pp.dpkl'.format(datasource))

    vocabulary = np.load('data/{}/words.dat'.format(datasource))

    # arbitrarily set latent dimension for embedding and hidden units
    latent_dim = 300
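    # note: latent_dim must match the dimensionality of the pretrained
    # vectors (e.g. 300 for a 300d GloVe file), since it sizes both
    # embedding matrices; it also sets the GRU hidden-state size.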

    # load pretrained embeddings into a {word: vector} lookup
    embeddings_index = {}
    with open(emb_file) as f:
        for line in f:
            values = line.split()
            word = values[0]
            try:
                coefs = np.asarray(values[1:], dtype='float32')
                embeddings_index[word] = coefs
            except ValueError:
                # skip malformed lines (e.g. multi-token entries)
                print(values)

    # build encoder embedding matrix
    encoder_embedding_matrix = np.zeros((num_encoder_tokens, latent_dim))
    not_found = 0
    print('Found %s word vectors.' % len(embeddings_index))
    for i, word in body_pp.id2token.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None and word in vocabulary:
            # copy the pretrained vector; words without one stay all-zeros
            encoder_embedding_matrix[i] = embedding_vector
        else:
            not_found += 1
            print('%s is out of the vocab.' % word)

    print('Found %s words out of the vocab.' % str(not_found))
    ##### Define Model Architecture ######

    ########################
    #### Encoder Model ####
    encoder_inputs = Input(shape=(doc_length, ), name='Encoder-Input')

    # Word embedding for encoder (ex: Issue Body)
    x = Embedding(num_encoder_tokens,
                  latent_dim,
                  name='Body-Word-Embedding',
                  mask_zero=False,
                  weights=[encoder_embedding_matrix],
                  trainable=False)(encoder_inputs)
    x = BatchNormalization(name='Encoder-Batchnorm-1')(x)

    # Intermediate GRU layer (optional)
    #x = GRU(latent_dim, name='Encoder-Intermediate-GRU', return_sequences=True)(x)
    #x = BatchNormalization(name='Encoder-Batchnorm-2')(x)

    # We do not need the `encoder_output` just the hidden state.
    _, state_h = GRU(latent_dim, return_state=True, name='Encoder-Last-GRU')(x)

    # Encapsulate the encoder as a separate entity so we can just
    #  encode without decoding if we want to.
    encoder_model = Model(inputs=encoder_inputs,
                          outputs=state_h,
                          name='Encoder-Model')

    seq2seq_encoder_out = encoder_model(encoder_inputs)
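    # seq2seq_encoder_out is the encoder's final hidden state; it seeds
    # the decoder GRU's initial state below.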

    # build decoder embedding matrix (same procedure as the encoder's)
    decoder_embedding_matrix = np.zeros((num_decoder_tokens, latent_dim))
    for i, word in title_pp.id2token.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None and word in vocabulary:
            # copy the pretrained vector; words without one stay all-zeros
            decoder_embedding_matrix[i] = embedding_vector

    ########################
    #### Decoder Model ####
    decoder_inputs = Input(shape=(None, ),
                           name='Decoder-Input')  # for teacher forcing

    # Word Embedding For Decoder (ex: Issue Titles)
    dec_emb = Embedding(num_decoder_tokens,
                        latent_dim,
                        name='Decoder-Word-Embedding',
                        mask_zero=False,
                        weights=[decoder_embedding_matrix],
                        trainable=False)(decoder_inputs)
    dec_bn = BatchNormalization(name='Decoder-Batchnorm-1')(dec_emb)

    # Set up the decoder GRU, using the encoder's final state as its initial state.
    decoder_gru = GRU(latent_dim,
                      return_state=True,
                      return_sequences=True,
                      name='Decoder-GRU')
    decoder_gru_output, _ = decoder_gru(dec_bn,
                                        initial_state=seq2seq_encoder_out)
    x = BatchNormalization(name='Decoder-Batchnorm-2')(decoder_gru_output)

    # Dense layer for prediction
    decoder_dense = Dense(num_decoder_tokens,
                          activation='softmax',
                          name='Final-Output-Dense')
    decoder_outputs = decoder_dense(x)
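    # decoder_outputs has shape (batch, timesteps, num_decoder_tokens):
    # a softmax over the title vocabulary at each decoding step.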

    ########################
    #### Seq2Seq Model ####

    seq2seq_Model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

    seq2seq_Model.compile(optimizer=optimizers.Nadam(lr=learning_rate),
                          loss='sparse_categorical_crossentropy')

    script_name_base = 'tutorial_seq2seq'

    model_checkpoint = ModelCheckpoint(
        'data/{}/{}.epoch{{epoch:02d}}-val{{val_loss:.5f}}_{}.hdf5'.format(
            datasource, script_name_base, emb_type),
        save_best_only=True)

    batch_size = 1024
    epochs = n_epochs
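    # sparse_categorical_crossentropy expects integer targets with an
    # explicit trailing axis, hence np.expand_dims on the targets below.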
    history = seq2seq_Model.fit([encoder_input_data, decoder_input_data],
                                np.expand_dims(decoder_target_data, -1),
                                batch_size=batch_size,
                                epochs=epochs,
                                validation_split=0.12,
                                callbacks=[model_checkpoint])

    #save model
    seq2seq_Model.save('data/{}/seq2seq_model_tutorial_{}.hdf5'.format(
        datasource, emb_type))
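
A minimal invocation sketch (the embedding path and the hyperparameter values below are illustrative assumptions, not taken from the original script):

if __name__ == '__main__':
    main(emb_file='data/glove.840B.300d.txt',  # hypothetical embedding file
         datasource='kp20k',
         n_epochs=16,
         emb_type='glove',
         learning_rate=0.001)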
Code example #2
File: validation_kp20k.py Project: menajosep/seq2seq
from keras.models import load_model
from sklearn.model_selection import train_test_split
from utils import load_text_processor
from utils_recipes import Seq2Seq_Inference
import pandas as pd

# read in the held-out test set
testdf = pd.read_pickle('data/kp20k/test.pd')
body_text = testdf.body.tolist()
title_text = testdf.title.tolist()
seq2seq_Model_glove = load_model('data/kp20k/seq2seq_model_tutorial_glove.hdf5')
seq2seq_Model_fasttext = load_model('data/kp20k/seq2seq_model_tutorial_fasttext.hdf5')
seq2seq_Model_word2vec = load_model('data/kp20k/seq2seq_model_tutorial_word2vec.hdf5')
num_encoder_tokens, body_pp = load_text_processor('data/kp20k/body_pp.dpkl')
num_decoder_tokens, title_pp = load_text_processor('data/kp20k/title_pp.dpkl')


seq2seq_inf_glove = Seq2Seq_Inference(encoder_preprocessor=body_pp,
                                      decoder_preprocessor=title_pp,
                                      seq2seq_model=seq2seq_Model_glove)
# this method displays the predictions on random rows of the holdout set
#seq2seq_inf_glove.demo_model_predictions(n=10, issue_df=testdf)


bleu, rouge1_f, rouge1_p, rouge1_r, rouge2_f, rouge2_p, rouge2_r, rougel_f, rougel_p, rougel_r = \
    seq2seq_inf_glove.evaluate_model(body_text[:10000], title_text[:10000])
print("\n****** Glove BLEU scrore ******: %s" % str(bleu))
print("\n****** Glove ROUGE 1 f scrore ******: %s" % str(rouge1_f))
print("\n****** Glove ROUGE 1 precission scrore ******: %s" % str(rouge1_p))
print("\n****** Glove ROUGE 1 recall scrore ******: %s" % str(rouge1_r))
print("\n****** Glove ROUGE 2 f scrore ******: %s" % str(rouge2_f))
Code example #3
from keras.models import load_model
from utils import load_text_processor  # assumed, mirroring validation_kp20k.py above
from utils_economics import Seq2Seq_Inference
import pandas as pd

# read in the held-out test set
testdf = pd.read_pickle('data/economics/test.pd')

body_text = testdf.text.tolist()
title_text = testdf.headline.tolist()
seq2seq_Model_glove = load_model(
    'data/economics/seq2seq_model_tutorial_glove.hdf5')
seq2seq_Model_fasttext = load_model(
    'data/economics/seq2seq_model_tutorial_fasttext.hdf5')
seq2seq_Model_word2vec = load_model(
    'data/economics/seq2seq_model_tutorial_word2vec.hdf5')
#seq2seq_Model_custom = load_model('data/economics/seq2seq_model_tutorial_custom.hdf5')
num_encoder_tokens, body_pp = load_text_processor(
    'data/economics/body_pp.dpkl')
num_decoder_tokens, title_pp = load_text_processor(
    'data/economics/title_pp.dpkl')

seq2seq_inf_glove = Seq2Seq_Inference(encoder_preprocessor=body_pp,
                                      decoder_preprocessor=title_pp,
                                      seq2seq_model=seq2seq_Model_glove)
# this method displays the predictions on random rows of the holdout set
#seq2seq_inf_glove.demo_model_predictions(n=10, issue_df=testdf)


bleu, rouge1_f, rouge1_p, rouge1_r, rouge2_f, rouge2_p, rouge2_r, rougel_f, rougel_p, rougel_r = \
    seq2seq_inf_glove.evaluate_model(body_text[:10000], title_text[:10000])
print("\n****** Glove BLEU scrore ******: %s" % str(bleu))
print("\n****** Glove ROUGE 1 f scrore ******: %s" % str(rouge1_f))
print("\n****** Glove ROUGE 1 precission scrore ******: %s" % str(rouge1_p))