import numpy as np
from keras import optimizers
from keras.callbacks import ModelCheckpoint
from keras.layers import Input, Embedding, BatchNormalization, GRU, Dense
from keras.models import Model

# project helpers; load_encoder_inputs and load_decoder_inputs are assumed to
# live in the same utils module as load_text_processor (see the evaluation
# scripts below)
from utils import load_encoder_inputs, load_decoder_inputs, load_text_processor


def main(emb_file, datasource, n_epochs, emb_type, learning_rate):
    encoder_input_data, doc_length = load_encoder_inputs(
        'data/{}/train_body_vecs.npy'.format(datasource))
    decoder_input_data, decoder_target_data = load_decoder_inputs(
        'data/{}/train_title_vecs.npy'.format(datasource))
    num_encoder_tokens, body_pp = load_text_processor(
        'data/{}/body_pp.dpkl'.format(datasource))
    num_decoder_tokens, title_pp = load_text_processor(
        'data/{}/title_pp.dpkl'.format(datasource))
    vocabulary = np.load('data/{}/words.dat'.format(datasource))

    # arbitrarily set latent dimension for embedding and hidden units
    latent_dim = 300

    # load the pre-trained embeddings
    embeddings_index = {}
    f = open(emb_file)
    for line in f:
        values = line.split()
        word = values[0]
        try:
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
        except ValueError:
            print(values)
    f.close()

    # build encoder embedding matrix
    encoder_embedding_matrix = np.zeros((num_encoder_tokens, latent_dim))
    not_found = 0
    print('Found %s word vectors.' % len(embeddings_index))
    for i, word in body_pp.id2token.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None and word in vocabulary:
            encoder_embedding_matrix[i] = embedding_vector
        else:
            # words not found in the embedding index stay all-zeros
            not_found += 1
            print('%s is out of the vocab.' % word)
    print('Found %s words out of the vocab.' % str(not_found))

    ##### Define Model Architecture ######

    ########################
    #### Encoder Model  ####
    encoder_inputs = Input(shape=(doc_length,), name='Encoder-Input')

    # Word embedding for the encoder (e.g. the document body)
    x = Embedding(num_encoder_tokens, latent_dim, name='Body-Word-Embedding',
                  mask_zero=False, weights=[encoder_embedding_matrix],
                  trainable=False)(encoder_inputs)
    x = BatchNormalization(name='Encoder-Batchnorm-1')(x)

    # Intermediate GRU layer (optional)
    # x = GRU(latent_dim, name='Encoder-Intermediate-GRU', return_sequences=True)(x)
    # x = BatchNormalization(name='Encoder-Batchnorm-2')(x)

    # We do not need the `encoder_output`, just the hidden state.
    _, state_h = GRU(latent_dim, return_state=True, name='Encoder-Last-GRU')(x)

    # Encapsulate the encoder as a separate entity so we can just
    # encode without decoding if we want to.
    encoder_model = Model(inputs=encoder_inputs, outputs=state_h,
                          name='Encoder-Model')
    seq2seq_encoder_out = encoder_model(encoder_inputs)

    # build decoder embedding matrix
    decoder_embedding_matrix = np.zeros((num_decoder_tokens, latent_dim))
    print('Found %s word vectors.' % len(embeddings_index))
    for i, word in title_pp.id2token.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None and word in vocabulary:
            # words not found in the embedding index stay all-zeros
            decoder_embedding_matrix[i] = embedding_vector

    ########################
    #### Decoder Model  ####
    decoder_inputs = Input(shape=(None,), name='Decoder-Input')  # for teacher forcing

    # Word embedding for the decoder (e.g. the title)
    dec_emb = Embedding(num_decoder_tokens, latent_dim,
                        name='Decoder-Word-Embedding', mask_zero=False,
                        weights=[decoder_embedding_matrix],
                        trainable=False)(decoder_inputs)
    dec_bn = BatchNormalization(name='Decoder-Batchnorm-1')(dec_emb)

    # Set up the decoder, using the encoder's final hidden state as its initial state.
    decoder_gru = GRU(latent_dim, return_state=True, return_sequences=True,
                      name='Decoder-GRU')
    decoder_gru_output, _ = decoder_gru(dec_bn, initial_state=seq2seq_encoder_out)
    x = BatchNormalization(name='Decoder-Batchnorm-2')(decoder_gru_output)

    # Dense layer for prediction
    decoder_dense = Dense(num_decoder_tokens, activation='softmax',
                          name='Final-Output-Dense')
    decoder_outputs = decoder_dense(x)

    ########################
    #### Seq2Seq Model  ####
    # seq2seq_decoder_out = decoder_model([decoder_inputs, seq2seq_encoder_out])
    seq2seq_Model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
    seq2seq_Model.compile(optimizer=optimizers.Nadam(lr=learning_rate),
                          loss='sparse_categorical_crossentropy')

    script_name_base = 'tutorial_seq2seq'
    model_checkpoint = ModelCheckpoint(
        'data/{}/{:}.epoch{{epoch:02d}}-val{{val_loss:.5f}}_{}.hdf5'.format(
            datasource, script_name_base, emb_type),
        save_best_only=True)

    batch_size = 1024
    epochs = n_epochs
    history = seq2seq_Model.fit([encoder_input_data, decoder_input_data],
                                np.expand_dims(decoder_target_data, -1),
                                batch_size=batch_size,
                                epochs=epochs,
                                validation_split=0.12,
                                callbacks=[model_checkpoint])

    # save the trained model
    seq2seq_Model.save('data/{}/seq2seq_model_tutorial_{}.hdf5'.format(
        datasource, emb_type))
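
# Minimal sketch of a command-line entry point for the training function above.
# The flag names and default values here are assumptions for illustration, not
# part of the original script.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(
        description='Train the seq2seq model with pre-trained embeddings.')
    parser.add_argument('--emb-file', required=True,
                        help='path to the embedding text file (e.g. GloVe vectors)')
    parser.add_argument('--datasource', default='kp20k',
                        help='dataset folder under data/ (e.g. kp20k or economics)')
    parser.add_argument('--emb-type', default='glove',
                        choices=['glove', 'fasttext', 'word2vec'])
    parser.add_argument('--n-epochs', type=int, default=7)
    parser.add_argument('--learning-rate', type=float, default=0.001)
    args = parser.parse_args()

    main(args.emb_file, args.datasource, args.n_epochs,
         args.emb_type, args.learning_rate)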
from keras.models import load_model

from utils import load_text_processor
from utils_recipes import Seq2Seq_Inference
import pandas as pd

# read in the held-out test set; only the first 10,000 rows are scored below,
# for the speed of the tutorial
testdf = pd.read_pickle('data/kp20k/test.pd')
body_text = testdf.body.tolist()
title_text = testdf.title.tolist()

seq2seq_Model_glove = load_model('data/kp20k/seq2seq_model_tutorial_glove.hdf5')
seq2seq_Model_fasttext = load_model('data/kp20k/seq2seq_model_tutorial_fasttext.hdf5')
seq2seq_Model_word2vec = load_model('data/kp20k/seq2seq_model_tutorial_word2vec.hdf5')

num_encoder_tokens, body_pp = load_text_processor('data/kp20k/body_pp.dpkl')
num_decoder_tokens, title_pp = load_text_processor('data/kp20k/title_pp.dpkl')

seq2seq_inf_glove = Seq2Seq_Inference(encoder_preprocessor=body_pp,
                                      decoder_preprocessor=title_pp,
                                      seq2seq_model=seq2seq_Model_glove)

# this method displays the predictions on random rows of the holdout set
# seq2seq_inf_glove.demo_model_predictions(n=10, issue_df=testdf)

bleu, rouge1_f, rouge1_p, rouge1_r, rouge2_f, rouge2_p, rouge2_r, rougel_f, rougel_p, rougel_r = \
    seq2seq_inf_glove.evaluate_model(body_text[:10000], title_text[:10000])

print("\n****** GloVe BLEU score ******: %s" % str(bleu))
print("\n****** GloVe ROUGE 1 f score ******: %s" % str(rouge1_f))
print("\n****** GloVe ROUGE 1 precision score ******: %s" % str(rouge1_p))
print("\n****** GloVe ROUGE 1 recall score ******: %s" % str(rouge1_r))
print("\n****** GloVe ROUGE 2 f score ******: %s" % str(rouge2_f))
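
# The fastText and word2vec models loaded above can be scored the same way as
# the GloVe model. A sketch is shown below, assuming Seq2Seq_Inference.evaluate_model
# returns the same ten metrics in the same order as in the unpacking above;
# the loop variables are hypothetical.
for name, model in [('fastText', seq2seq_Model_fasttext),
                    ('word2vec', seq2seq_Model_word2vec)]:
    inf = Seq2Seq_Inference(encoder_preprocessor=body_pp,
                            decoder_preprocessor=title_pp,
                            seq2seq_model=model)
    scores = inf.evaluate_model(body_text[:10000], title_text[:10000])
    # scores[0] is BLEU, following the ordering used for the GloVe model
    print("\n****** %s BLEU score ******: %s" % (name, str(scores[0])))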
from keras.models import load_model

from utils import load_text_processor
from utils_economics import Seq2Seq_Inference
import pandas as pd

# read in the held-out test set; only the first 10,000 rows are scored below,
# for the speed of the tutorial
testdf = pd.read_pickle('data/economics/test.pd')
body_text = testdf.text.tolist()
title_text = testdf.headline.tolist()

seq2seq_Model_glove = load_model(
    'data/economics/seq2seq_model_tutorial_glove.hdf5')
seq2seq_Model_fasttext = load_model(
    'data/economics/seq2seq_model_tutorial_fasttext.hdf5')
seq2seq_Model_word2vec = load_model(
    'data/economics/seq2seq_model_tutorial_word2vec.hdf5')
# seq2seq_Model_custom = load_model('data/economics/seq2seq_model_tutorial_custom.hdf5')

num_encoder_tokens, body_pp = load_text_processor(
    'data/economics/body_pp.dpkl')
num_decoder_tokens, title_pp = load_text_processor(
    'data/economics/title_pp.dpkl')

seq2seq_inf_glove = Seq2Seq_Inference(encoder_preprocessor=body_pp,
                                      decoder_preprocessor=title_pp,
                                      seq2seq_model=seq2seq_Model_glove)

# this method displays the predictions on random rows of the holdout set
# seq2seq_inf_glove.demo_model_predictions(n=10, issue_df=testdf)

bleu, rouge1_f, rouge1_p, rouge1_r, rouge2_f, rouge2_p, rouge2_r, rougel_f, rougel_p, rougel_r = \
    seq2seq_inf_glove.evaluate_model(body_text[:10000], title_text[:10000])

print("\n****** GloVe BLEU score ******: %s" % str(bleu))
print("\n****** GloVe ROUGE 1 f score ******: %s" % str(rouge1_f))
print("\n****** GloVe ROUGE 1 precision score ******: %s" % str(rouge1_p))
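
# For a side-by-side comparison of the embeddings, the ten metrics returned by
# evaluate_model can be collected into a DataFrame. This is an illustrative
# sketch: the helper, metric names, and result table below are assumptions,
# not part of the original script.
METRIC_NAMES = ['bleu', 'rouge1_f', 'rouge1_p', 'rouge1_r',
                'rouge2_f', 'rouge2_p', 'rouge2_r',
                'rougel_f', 'rougel_p', 'rougel_r']


def score_model(model, label):
    # build an inference wrapper for the given model and score the same sample
    inf = Seq2Seq_Inference(encoder_preprocessor=body_pp,
                            decoder_preprocessor=title_pp,
                            seq2seq_model=model)
    scores = inf.evaluate_model(body_text[:10000], title_text[:10000])
    return pd.Series(scores, index=METRIC_NAMES, name=label)


# e.g. results = pd.concat([score_model(seq2seq_Model_glove, 'glove'),
#                           score_model(seq2seq_Model_fasttext, 'fasttext'),
#                           score_model(seq2seq_Model_word2vec, 'word2vec')], axis=1)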