import sys
import time
from datetime import datetime

import numpy as np

# Project-local helpers (DataConnector, SequenceProcessing, Preprocessing,
# TrueKeyphrases, the model classes, and the decoding utilities) come from
# this repository's own modules; their import paths are not shown here.


def pair_test(params):
    print("\n=========\n")
    sys.stdout.flush()
    print(str(datetime.now()))
    sys.stdout.flush()

    t0 = time.time()
    print("Pairing test set into sequences of inputs - outputs...")
    sys.stdout.flush()

    data_path = params['data_path']

    '''
    read test set
    '''
    X_test_connector = DataConnector(data_path, 'X_test.npy', data=None)
    X_test_connector.read_numpys()
    X_test = X_test_connector.read_file

    y_test_in_connector = DataConnector(data_path, 'y_test_in.npy', data=None)
    y_test_in_connector.read_numpys()
    y_test_in = y_test_in_connector.read_file

    y_test_out_connector = DataConnector(data_path, 'y_test_out.npy', data=None)
    y_test_out_connector.read_numpys()
    y_test_out = y_test_out_connector.read_file

    # pair each document with every one of its keyphrases
    sequences_processing = SequenceProcessing(indices_words=None, words_indices=None,
                                              encoder_length=None, decoder_length=None)
    doc_pair, x_pair_test, y_pair_test_in, y_pair_test_out = \
        sequences_processing.pairing_data(X_test, y_test_in, y_test_out)

    # drop the singleton middle axis: (n_pairs, 1, seq_len) -> (n_pairs, seq_len)
    x_pair_test = np.array(x_pair_test)
    x_pair_test = x_pair_test.reshape((x_pair_test.shape[0], x_pair_test.shape[2]))
    y_pair_test_in = np.array(y_pair_test_in)
    y_pair_test_in = y_pair_test_in.reshape((y_pair_test_in.shape[0], y_pair_test_in.shape[2]))
    y_pair_test_out = np.array(y_pair_test_out)
    y_pair_test_out = y_pair_test_out.reshape((y_pair_test_out.shape[0], y_pair_test_out.shape[2]))

    print("\nshape of x_pair in test set: %s\n" % str(x_pair_test.shape))
    print("\nshape of y_pair_in in test set: %s\n" % str(y_pair_test_in.shape))
    print("\nshape of y_pair_out in test set: %s\n" % str(y_pair_test_out.shape))

    doc_in_connector = DataConnector(data_path, 'doc_pair_test.npy', doc_pair)
    doc_in_connector.save_numpys()
    x_in_connector = DataConnector(data_path, 'x_pair_test.npy', x_pair_test)
    x_in_connector.save_numpys()
    y_in_connector = DataConnector(data_path, 'y_pair_test_in.npy', y_pair_test_in)
    y_in_connector.save_numpys()
    y_out_connector = DataConnector(data_path, 'y_pair_test_out.npy', y_pair_test_out)
    y_out_connector.save_numpys()

    t1 = time.time()
    print("Pairing test set into sequences of inputs - outputs done in %.3fsec" % (t1 - t0))
    sys.stdout.flush()
def pair_sent_all(params):
    print("\n=========\n")
    sys.stdout.flush()
    print(str(datetime.now()))
    sys.stdout.flush()

    t0 = time.time()
    print("Pairing data to train the model...")
    sys.stdout.flush()

    data_path = params['data_path']
    max_sents = params['max_sents']
    kp20k_path = params['kp20k_path']
    encoder_length = params['encoder_length']
    decoder_length = params['decoder_length']

    '''
    read training set
    '''
    x_connector = DataConnector(data_path, 'X_sent.npy', data=None)
    x_connector.read_numpys()
    X = x_connector.read_file

    y_in_connector = DataConnector(data_path, 'y_sent_in.npy', data=None)
    y_in_connector.read_numpys()
    y_in_ = y_in_connector.read_file

    y_out_connector = DataConnector(data_path, 'y_sent_out.npy', data=None)
    y_out_connector.read_numpys()
    y_out_ = y_out_connector.read_file

    sequences_processing = SequenceProcessing(indices_words=None, words_indices=None,
                                              encoder_length=None, decoder_length=None)
    doc_pair, x_pair, y_pair_in, y_pair_out = sequences_processing.pairing_data(X, y_in_, y_out_)

    print("\nshape of x_pair in training set: %s\n" % str(np.array(x_pair).shape))
    print("\nshape of y_pair_in in training set: %s\n" % str(np.array(y_pair_in).shape))
    print("\nshape of y_pair_out in training set: %s\n" % str(np.array(y_pair_out).shape))

    # TODO: use a different storage routine for large files
    doc_in_connector = DataConnector(data_path, 'doc_pair_sent.npy', doc_pair)
    doc_in_connector.save_numpys()
    x_in_connector = DataConnector(data_path, 'x_pair_sent.npy', x_pair)
    x_in_connector.save_numpys()
    y_in_connector = DataConnector(data_path, 'y_pair_in_sent.npy', y_pair_in)
    y_in_connector.save_numpys()
    y_out_connector = DataConnector(data_path, 'y_pair_out_sent.npy', y_pair_out)
    y_out_connector.save_numpys()

    t1 = time.time()
    print("Pairing into sequences of inputs - outputs done in %.3fsec" % (t1 - t0))
    sys.stdout.flush()
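# Illustration (assumption, not the repository's code): what `pairing_data`
# is inferred to do, based on how its outputs are reshaped in `pair_test`
# above. A document with N target keyphrases is expanded into N
# (document, keyphrase) pairs, so the encoder input is repeated once per
# keyphrase; the extra singleton axis matches the reshape in `pair_test`.
def _pairing_data_sketch(X, y_in, y_out):
    doc_ids, x_pair, y_pair_in, y_pair_out = [], [], [], []
    for doc_id, (doc, kps_in, kps_out) in enumerate(zip(X, y_in, y_out)):
        for kp_in, kp_out in zip(kps_in, kps_out):
            doc_ids.append(doc_id)
            x_pair.append([doc])         # shape (1, encoder_length) per pair
            y_pair_in.append([kp_in])    # shape (1, decoder_length) per pair
            y_pair_out.append([kp_out])
    return doc_ids, x_pair, y_pair_in, y_pair_out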
def transform_test(params):
    print("\n=========\n")
    sys.stdout.flush()
    print(str(datetime.now()))
    sys.stdout.flush()

    t0 = time.time()
    print("Transforming test set into integer sequences")
    sys.stdout.flush()

    data_path = params['data_path']
    encoder_length = params['encoder_length']
    decoder_length = params['decoder_length']

    '''
    read stored vocabulary index
    '''
    vocab = DataConnector(data_path, 'all_indices_words.pkl', data=None)
    vocab.read_pickle()
    indices_words = vocab.read_file

    reversed_vocab = DataConnector(data_path, 'all_words_indices.pkl', data=None)
    reversed_vocab.read_pickle()
    words_indices = reversed_vocab.read_file

    '''
    read tokenized data set
    '''
    test_in_tokens_connector = DataConnector(data_path, 'in_test.npy', data=None)
    test_in_tokens_connector.read_numpys()
    test_in_tokens = test_in_tokens_connector.read_file

    test_out_tokens_connector = DataConnector(data_path, 'out_test.npy', data=None)
    test_out_tokens_connector.read_numpys()
    test_out_tokens = test_out_tokens_connector.read_file

    '''
    transforming texts into integer sequences
    '''
    sequences_processing = SequenceProcessing(indices_words, words_indices, encoder_length, decoder_length)
    X_test = sequences_processing.intexts_to_integers(test_in_tokens)
    y_test_in, y_test_out = sequences_processing.outtexts_to_integers(test_out_tokens)

    x_in_connector = DataConnector(data_path, 'X_test.npy', X_test)
    x_in_connector.save_numpys()
    y_in_connector = DataConnector(data_path, 'y_test_in.npy', y_test_in)
    y_in_connector.save_numpys()
    y_out_connector = DataConnector(data_path, 'y_test_out.npy', y_test_out)
    y_out_connector.save_numpys()

    t1 = time.time()
    print("Transforming test set into integer sequences of inputs - outputs done in %.3fsec" % (t1 - t0))
    sys.stdout.flush()
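# Illustration (assumption): `intexts_to_integers` is used above as a plain
# vocabulary lookup on the input side. A minimal sketch, assuming a
# hypothetical '<unk>' entry for out-of-vocabulary tokens and truncation to
# the encoder length:
def _texts_to_integers_sketch(token_docs, words_indices, max_len):
    unk_id = words_indices.get('<unk>', 0)  # '<unk>' index is an assumption
    return [[words_indices.get(tok, unk_id) for tok in doc[:max_len]]
            for doc in token_docs]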
# Inference runner: flat attention-based seq2seq with full softmax,
# trained without pretrained embeddings; decodes the test split.
def decoder(params):
    data_path = params['data_path']
    preprocessed_v2 = params['preprocessed_v2']
    preprocessed_data = params['preprocessed_data']
    decode_path = params['decode_path']
    model_path = params['model_path']
    result_path = params['result_path']
    file_name = params['file_name']
    weights = params['weights']
    encoder_length = params['encoder_length']
    decoder_length = params['decoder_length']
    embedding_dim = params['embedding_dim']
    birnn_dim = params['birnn_dim']
    rnn_dim = params['rnn_dim']
    vocab_size = params['vocab_size']

    '''
    Reading vocabulary dictionaries
    '''
    indices_words_connector = DataConnector(preprocessed_v2, 'all_indices_words_fsoftmax.pkl', data=None)
    indices_words_connector.read_pickle()
    indices_words = indices_words_connector.read_file

    words_indices_connector = DataConnector(preprocessed_v2, 'all_words_indices_fsoftmax.pkl', data=None)
    words_indices_connector.read_pickle()
    words_indices = words_indices_connector.read_file

    y_test_true_connector = DataConnector(data_path, 'test_output_tokens.npy', data=None)
    y_test_true_connector.read_numpys()
    y_test_true = y_test_true_connector.read_file

    # paired data set
    X_pair_test_connector = DataConnector(preprocessed_data, 'x_pair_test_fsoftmax.npy', data=None)
    X_pair_test_connector.read_numpys()
    X_pair_test = X_pair_test_connector.read_file

    y_pair_test_in_connector = DataConnector(preprocessed_data, 'y_pair_test_in_fsoftmax.npy', data=None)
    y_pair_test_in_connector.read_numpys()
    y_pair_test_in = y_pair_test_in_connector.read_file

    y_pair_test_out_connector = DataConnector(preprocessed_data, 'y_pair_test_out_fsoftmax.npy', data=None)
    y_pair_test_out_connector.read_numpys()
    y_pair_test_out = y_pair_test_out_connector.read_file

    # non-paired data set
    X_test_connector = DataConnector(preprocessed_data, 'X_test_pad_fsoftmax.npy', data=None)
    X_test_connector.read_numpys()
    X_test = X_test_connector.read_file

    y_test_in_connector = DataConnector(preprocessed_data, 'y_test_in_fsoftmax.npy', data=None)
    y_test_in_connector.read_numpys()
    y_test_in = y_test_in_connector.read_file

    y_test_out_connector = DataConnector(preprocessed_data, 'y_test_out_fsoftmax.npy', data=None)
    y_test_out_connector.read_numpys()
    y_test_out = y_test_out_connector.read_file

    print("\n Non-paired test set: \n")
    sys.stdout.flush()
    print("X (input for encoder) shape: %s" % str(X_test.shape))       # input for encoder
    sys.stdout.flush()
    print("y_in (input for decoder) shape: %s" % str(y_test_in.shape))  # input for decoder
    sys.stdout.flush()
    print("y_out (output for decoder) shape: %s\n\n" % str(y_test_out.shape))  # output for decoder
    sys.stdout.flush()

    '''
    Decoder model for inference stage
    Return: generated keyphrases
    '''
    full_softmax = AttentionFullSoftmax(encoder_length=encoder_length,
                                        decoder_length=decoder_length,
                                        embedding_dim=embedding_dim,
                                        birnn_dim=birnn_dim,
                                        rnn_dim=rnn_dim,
                                        vocab_size=vocab_size,
                                        filepath=result_path,
                                        filename=file_name,
                                        batch_train_iter=None,
                                        batch_val_iter=None,
                                        batch_size=None,
                                        steps_epoch=None,
                                        val_steps=None,
                                        epochs=None)

    # skeleton of the model architecture
    full_softmax.train_att_seq2seq()
    predict_softmax_model = full_softmax.predict_att_seq2seq(weights)
    encoder_model = full_softmax.encoder_model

    '''
    Inference stage
    Model: layers from prediction model and decoder model
    Inference (text generation) approaches:
    1. One-best (greedy) search decoding:
       returns the single most probable word sequence, from the joint
       probability of words across the decoder time steps (decoder sequence length).
    2. N-beam search decoding:
       returns the N best most probable word sequences, using a beam tree
       search per time step and the joint probability across the decoder
       time steps (decoder sequence length).
    '''
    decoder_model = full_softmax.create_decoder_model()

    # transform tokenized y_true (ground truth of keyphrases) into full keyphrases
    keyphrases_transform = TrueKeyphrases(y_test_true)
    keyphrases_transform.get_true_keyphrases()
    keyphrases_transform.get_stat_keyphrases()
    y_true = keyphrases_transform.y_true
    max_kp_num = keyphrases_transform.max_kp_num
    mean_kp_num = keyphrases_transform.mean_kp_num
    std_kp_num = keyphrases_transform.std_kp_num

    print("Maximum number of key phrases per document in corpus: %s" % max_kp_num)
    sys.stdout.flush()
    print("Average number of key phrases per document in corpus: %s" % mean_kp_num)
    sys.stdout.flush()
    print("Standard deviation of number of key phrases per document in corpus: %s" % std_kp_num)
    sys.stdout.flush()

    # round up to the nearest multiple of 5 when computing the beam width,
    # e.g. mean 5.3, std 2.4 -> roundup(5.3 + 7.2) = roundup(12.5) = 15
    def roundup(x):
        return x if x % 5 == 0 else x + 5 - x % 5

    beam_width = int(roundup(mean_kp_num + (3 * std_kp_num)))
    print("\nBeam width: %s\n" % beam_width)
    sys.stdout.flush()
    num_hypotheses = beam_width

    s0_test = np.zeros((len(X_test), rnn_dim))               # initial decoder state
    att0_test = np.zeros((len(X_test), encoder_length, 1))   # initial attention weights

    print(str(datetime.now()))
    sys.stdout.flush()

    inference_mode = Decoding(encoder_model=encoder_model,
                              decoder_model=decoder_model,
                              indices_words=indices_words,
                              words_indices=words_indices,
                              enc_in_seq=None,
                              states=None,
                              attentions=None,
                              decoder_length=decoder_length,
                              rnn_dim=rnn_dim,
                              beam_width=beam_width,
                              num_hypotheses=num_hypotheses,
                              filepath=decode_path,
                              filename=file_name)

    t0_1 = time.time()
    print("Start beam decoding...")
    sys.stdout.flush()
    beam_keyphrases = inference_mode.beam_decoder(X_test[:500], s0_test[:500], att0_test[:500])

    beam_decode_connector = DataConnector(decode_path, 'beam_kp-%s.npy' % (file_name), beam_keyphrases)
    beam_decode_connector.save_numpys()

    t1_1 = time.time()
    print("Beam decoding is done in %.3fsec" % (t1_1 - t0_1))
    sys.stdout.flush()
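# Illustration (assumption): the scoring rule behind `beam_decoder` as
# described in the docstring above. At every decoder time step each partial
# hypothesis is extended by candidate next words, hypotheses are ranked by
# their joint (summed log) probability, and only the `beam_width` best
# survive. A minimal sketch of one beam step over hypothetical inputs:
def _beam_step_sketch(hypotheses, next_word_logprobs, beam_width):
    # hypotheses: list of (token_ids, score);
    # next_word_logprobs: one {word_id: logprob} dict per hypothesis
    candidates = []
    for (tokens, score), logprobs in zip(hypotheses, next_word_logprobs):
        for word_id, logprob in logprobs.items():
            candidates.append((tokens + [word_id], score + logprob))
    # keep the beam_width hypotheses with the best joint scores
    candidates.sort(key=lambda hyp: hyp[1], reverse=True)
    return candidates[:beam_width]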
# Inference runner: flat attention-based seq2seq with full softmax,
# initialized with GloVe + OOV embeddings; decodes the merged
# train/valid/test inputs.
def decoder(params):
    data_path = params['data_path']
    glove_embed = params['glove_embedding']
    oov_embed = params['oov_embedding']
    preprocessed_v2 = params['preprocessed_v2']
    preprocessed_data = params['preprocessed_data']
    decode_path = params['decode_path']
    model_path = params['model_path']
    result_path = params['result_path']
    result_kp20k = params['result_kp20k']
    file_name = params['file_name']
    weights = params['weights']
    encoder_length = params['encoder_length']
    decoder_length = params['decoder_length']
    embedding_dim = params['embedding_dim']
    birnn_dim = params['birnn_dim']
    rnn_dim = params['rnn_dim']
    vocab_size = params['vocab_size']
    batch_size = params['batch_size']
    epoch = params['epoch']

    '''
    Reading vocabulary dictionaries
    '''
    indices_words_connector = DataConnector(preprocessed_v2, 'all_idxword_vocabulary_fsoftmax.pkl', data=None)
    indices_words_connector.read_pickle()
    indices_words = indices_words_connector.read_file

    words_indices_connector = DataConnector(preprocessed_v2, 'all_wordidx_vocabulary_fsoftmax.pkl', data=None)
    words_indices_connector.read_pickle()
    words_indices = words_indices_connector.read_file

    # merge all sets into one test set for the trained model
    train_outputs_conn = DataConnector(data_path, 'train_output_tokens.npy', data=None)
    train_outputs_conn.read_numpys()
    train_outputs = train_outputs_conn.read_file

    valid_outputs_conn = DataConnector(data_path, 'val_output_tokens.npy', data=None)
    valid_outputs_conn.read_numpys()
    valid_outputs = valid_outputs_conn.read_file

    test_outputs_conn = DataConnector(data_path, 'test_output_tokens.npy', data=None)
    test_outputs_conn.read_numpys()
    test_outputs = test_outputs_conn.read_file

    y_test_true = np.concatenate((train_outputs, valid_outputs, test_outputs))
    print("Ground truth of keyphrases shape: %s" % str(y_test_true.shape))
    sys.stdout.flush()

    # non-paired data set
    X_train_connector = DataConnector(preprocessed_data, 'X_train_pad_fsoftmax.npy', data=None)
    X_train_connector.read_numpys()
    X_train = X_train_connector.read_file

    X_valid_connector = DataConnector(preprocessed_data, 'X_valid_pad_fsoftmax.npy', data=None)
    X_valid_connector.read_numpys()
    X_valid = X_valid_connector.read_file

    X_test_connector = DataConnector(preprocessed_data, 'X_test_pad_fsoftmax.npy', data=None)
    X_test_connector.read_numpys()
    X_test = X_test_connector.read_file

    X_in = np.concatenate((X_train, X_valid, X_test))

    glove_embedding_conn = DataConnector(preprocessed_v2, glove_embed, data=None)
    glove_embedding_conn.read_pickle()
    pretrained_embedding = glove_embedding_conn.read_file
    print("pretrained_embedding shape: %s" % str(pretrained_embedding.shape))
    print("pretrained_embedding [0][:10]: %s" % str(pretrained_embedding[0, :10]))
    print("pretrained_embedding [1][:10]: %s" % str(pretrained_embedding[1, :10]))

    oov_embedding_conn = DataConnector(preprocessed_v2, oov_embed, data=None)
    oov_embedding_conn.read_pickle()
    oov_embedding = oov_embedding_conn.read_file
    print("oov_embedding shape: %s" % str(oov_embedding.shape))
    print("oov_embedding [0][:10]: %s" % str(oov_embedding[0, :10]))
    print("oov_embedding [1][:10]: %s" % str(oov_embedding[1, :10]))
    print("oov_embedding [2][:10]: %s" % str(oov_embedding[2, :10]))

    full_softmax = AttentionFullSoftmax(encoder_length=encoder_length,
                                        decoder_length=decoder_length,
                                        embedding_dim=embedding_dim,
                                        birnn_dim=birnn_dim,
                                        rnn_dim=rnn_dim,
                                        vocab_size=vocab_size,
                                        filepath=result_kp20k,
                                        filename=file_name,
                                        batch_train_iter=None,
                                        batch_val_iter=None,
                                        batch_size=None,
                                        steps_epoch=None,
                                        val_steps=None,
                                        epochs=None)

    full_softmax.train_att_seq2seq(pretrained_embedding, oov_embedding)
    full_softmax.predict_att_seq2seq(weights)
    encoder_model = full_softmax.encoder_model
    # prediction model after being trained in the full softmax setting
    predict_softmax_model = full_softmax.prediction_model

    '''
    Inference stage
    Model: layers from prediction model and decoder model
    Inference (text generation) approaches:
    1. One-best (greedy) search decoding:
       returns the single most probable word sequence, from the joint
       probability of words across the decoder time steps (decoder sequence length).
    2. N-beam search decoding:
       returns the N best most probable word sequences, using a beam tree
       search per time step and the joint probability across the decoder
       time steps (decoder sequence length).
    '''

    '''
    Decoder model for inference stage
    Return: generated keyphrases
    '''
    decoder_model = full_softmax.create_decoder_model()

    # transform tokenized y_true (ground truth of keyphrases) into full keyphrases
    keyphrases_transform = TrueKeyphrases(y_test_true)
    keyphrases_transform.get_true_keyphrases()
    keyphrases_transform.get_stat_keyphrases()
    y_true = keyphrases_transform.y_true
    max_kp_num = keyphrases_transform.max_kp_num
    mean_kp_num = keyphrases_transform.mean_kp_num
    std_kp_num = keyphrases_transform.std_kp_num

    print("Maximum number of key phrases per document in corpus: %s" % max_kp_num)
    sys.stdout.flush()
    print("Average number of key phrases per document in corpus: %s" % mean_kp_num)
    sys.stdout.flush()
    print("Standard deviation of number of key phrases per document in corpus: %s" % std_kp_num)
    sys.stdout.flush()

    # round up to the nearest multiple of 5 when computing the beam width
    def roundup(x):
        return x if x % 5 == 0 else x + 5 - x % 5

    beam_width = int(roundup(mean_kp_num + (3 * std_kp_num)))
    num_hypotheses = beam_width
    print("\nBeam width: %s\n" % beam_width)
    sys.stdout.flush()

    s0_test = np.zeros((len(X_in), rnn_dim))               # initial decoder state
    att0_test = np.zeros((len(X_in), encoder_length, 1))   # initial attention weights

    print(str(datetime.now()))
    sys.stdout.flush()

    inference_mode = Decoding(encoder_model=encoder_model,
                              decoder_model=decoder_model,
                              indices_words=indices_words,
                              words_indices=words_indices,
                              enc_in_seq=None,
                              states=None,
                              attentions=None,
                              decoder_length=decoder_length,
                              rnn_dim=rnn_dim,
                              beam_width=beam_width,
                              num_hypotheses=num_hypotheses,
                              filepath=decode_path,
                              filename=file_name)

    t0_1 = time.time()
    print("Start beam decoding...")
    sys.stdout.flush()
    beam_keyphrases = inference_mode.beam_decoder(X_in[:500], s0_test[:500], att0_test[:500])

    beam_decode_connector = DataConnector(decode_path, 'beam_kp-%s.npy' % (file_name), beam_keyphrases)
    beam_decode_connector.save_numpys()

    t1_1 = time.time()
    print("Beam decoding is done in %.3fsec" % (t1_1 - t0_1))
    sys.stdout.flush()
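# Illustration (assumption): one plausible way the two matrices passed to
# `train_att_seq2seq` could initialize a single embedding layer; the actual
# layout inside AttentionFullSoftmax is not shown here. The GloVe matrix
# covers in-vocabulary rows, and the OOV matrix supplies rows for special
# tokens and words missing from GloVe.
def _merge_embeddings_sketch(pretrained_embedding, oov_embedding):
    # hypothetical layout: OOV rows stacked after the pretrained rows
    return np.vstack([pretrained_embedding, oov_embedding])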
# Inference runner: hierarchical (sentence-level) seq2seq with sampled
# softmax, trained without pretrained embeddings.
def decoder(params):
    data_path = params['data_path']
    preprocessed_v2 = params['preprocessed_v2']
    preprocessed_data = params['preprocessed_data']
    decode_path = params['decode_path']
    model_path = params['model_path']
    result_path = params['result_path']
    result_kp20k = params['result_kp20k']
    file_name = params['file_name']
    weights = params['weights']
    encoder_length = params['encoder_length']
    decoder_length = params['decoder_length']
    max_sents = params['max_sents']
    embedding_dim = params['embedding_dim']
    birnn_dim = params['birnn_dim']
    rnn_dim = params['rnn_dim']
    vocab_size = params['vocab_size']
    num_samples = params['num_samples']
    batch_size = params['batch_size']
    epoch = params['epoch']

    '''
    Reading vocabulary dictionaries
    '''
    indices_words_connector = DataConnector(preprocessed_v2, 'all_indices_words_sent.pkl', data=None)
    indices_words_connector.read_pickle()
    indices_words = indices_words_connector.read_file

    words_indices_connector = DataConnector(preprocessed_v2, 'all_words_indices_sent.pkl', data=None)
    words_indices_connector.read_pickle()
    words_indices = words_indices_connector.read_file

    # merge all sets into one test set for the trained model
    outputs_conn = DataConnector(data_path, 'output_sent_tokens.npy', data=None)
    outputs_conn.read_numpys()
    outputs = outputs_conn.read_file
    y_test_true = outputs
    print("Ground truth of keyphrases shape: %s" % str(y_test_true.shape))
    sys.stdout.flush()

    # non-paired data set
    X_connector = DataConnector(data_path, 'X_sent_pad.npy', data=None)
    X_connector.read_numpys()
    X_in = X_connector.read_file

    print("\n Non-paired test set: \n")
    sys.stdout.flush()
    print("X (input for encoder) shape: %s" % str(X_in.shape))  # input for encoder
    sys.stdout.flush()

    '''
    Decoder model for inference stage
    Return: generated keyphrases
    '''
    sampled_softmax = HierarchySampledSoftmax(encoder_length=encoder_length,
                                              decoder_length=decoder_length,
                                              max_sents=max_sents,
                                              embedding_dim=embedding_dim,
                                              birnn_dim=birnn_dim,
                                              rnn_dim=rnn_dim,
                                              vocab_size=vocab_size,
                                              num_samples=num_samples,
                                              filepath=result_kp20k,
                                              filename=file_name,
                                              batch_train_iter=None,
                                              batch_val_iter=None,
                                              batch_size=None,
                                              steps_epoch=None,
                                              val_steps=None,
                                              epochs=None)

    # skeleton of the model architecture
    sampled_softmax.train_hier_sampled_softmax()

    '''
    Model for retrieving softmax probability
    Return: softmax probability of prediction layer
    '''
    sampled_softmax.predict_sampled_softmax(weights)
    encoder_model = sampled_softmax.encoder_model
    # prediction model after being trained in the sampled softmax setting
    predict_softmax_model = sampled_softmax.prediction_model

    '''
    Inference stage
    Model: layers from prediction model and decoder model
    Inference (text generation) approaches:
    1. One-best (greedy) search decoding:
       returns the single most probable word sequence, from the joint
       probability of words across the decoder time steps (decoder sequence length).
    2. N-beam search decoding:
       returns the N best most probable word sequences, using a beam tree
       search per time step and the joint probability across the decoder
       time steps (decoder sequence length).
    '''
    decoder_model = sampled_softmax.create_decoder_model()

    # transform tokenized y_true (ground truth of keyphrases) into full keyphrases
    keyphrases_transform = TrueKeyphrases(y_test_true)
    keyphrases_transform.get_true_keyphrases()
    keyphrases_transform.get_stat_keyphrases()
    y_true = keyphrases_transform.y_true
    max_kp_num = keyphrases_transform.max_kp_num
    mean_kp_num = keyphrases_transform.mean_kp_num
    std_kp_num = keyphrases_transform.std_kp_num

    print("Maximum number of key phrases per document in corpus: %s" % max_kp_num)
    sys.stdout.flush()
    print("Average number of key phrases per document in corpus: %s" % mean_kp_num)
    sys.stdout.flush()
    print("Standard deviation of number of key phrases per document in corpus: %s" % std_kp_num)
    sys.stdout.flush()

    # round up to the nearest multiple of 5 when computing the beam width
    def roundup(x):
        return x if x % 5 == 0 else x + 5 - x % 5

    beam_width = int(roundup(mean_kp_num + (3 * std_kp_num)))
    num_hypotheses = beam_width
    print("\nBeam width: %s\n" % beam_width)
    sys.stdout.flush()

    # dummy decoder labels: only the shape matters at inference time
    y_dummy_test = np.zeros((len(X_in), decoder_length + 1, 1))

    inference_mode = Decoding(encoder_model=encoder_model,
                              decoder_model=decoder_model,
                              indices_words=indices_words,
                              words_indices=words_indices,
                              enc_in_seq=None,
                              labels=None,
                              decoder_length=decoder_length,
                              rnn_dim=rnn_dim,
                              beam_width=beam_width,
                              num_hypotheses=num_hypotheses,
                              filepath=decode_path,
                              filename=file_name)

    t0_1 = time.time()
    print("Start beam decoding...")
    sys.stdout.flush()
    beam_keyphrases = inference_mode.beam_decoder(X_in[:500], y_dummy_test[:500])

    beam_decode_connector = DataConnector(decode_path, 'beam_kp-hier-%s.npy' % (file_name), beam_keyphrases)
    beam_decode_connector.save_numpys()

    t1_1 = time.time()
    print("Beam decoding is done in %.3fsec" % (t1_1 - t0_1))
    sys.stdout.flush()
# Inference runner: hierarchical (sentence-level) seq2seq with sampled
# softmax, initialized with GloVe + OOV embeddings; decodes the
# sentence-level test split.
def decoder(params):
    data_path = params['data_path']
    preprocessed_data = params['preprocessed_data']
    glove_embed = params['glove_embedding']
    oov_embed = params['oov_embedding']
    model_path = params['model_path']
    result_path = params['result_path']
    decode_path = params['decode_path']
    file_name = params['file_name']
    weights = params['weights']
    encoder_length = params['encoder_length']
    decoder_length = params['decoder_length']
    max_sents = params['max_sents']
    embedding_dim = params['embedding_dim']
    birnn_dim = params['birnn_dim']
    rnn_dim = params['rnn_dim']
    vocab_size = params['vocab_size']
    num_samples = params['num_samples']
    batch_size = params['batch_size']
    epoch = params['epoch']

    '''
    Reading vocabulary dictionaries
    '''
    indices_words_connector = DataConnector(preprocessed_data, 'all_idxword_vocabulary_sent.pkl', data=None)
    indices_words_connector.read_pickle()
    indices_words = indices_words_connector.read_file

    words_indices_connector = DataConnector(preprocessed_data, 'all_wordidx_vocabulary_sent.pkl', data=None)
    words_indices_connector.read_pickle()
    words_indices = words_indices_connector.read_file

    y_test_true_connector = DataConnector(data_path, 'test_sent_output_tokens.npy', data=None)
    y_test_true_connector.read_numpys()
    y_test_true = y_test_true_connector.read_file

    # paired data set
    X_pair_test_connector = DataConnector(preprocessed_data, 'x_pair_test_sent.npy', data=None)
    X_pair_test_connector.read_numpys()
    X_pair_test = X_pair_test_connector.read_file

    y_pair_test_in_connector = DataConnector(preprocessed_data, 'y_pair_test_in_sent.npy', data=None)
    y_pair_test_in_connector.read_numpys()
    y_pair_test_in = y_pair_test_in_connector.read_file

    y_pair_test_out_connector = DataConnector(preprocessed_data, 'y_pair_test_out_sent.npy', data=None)
    y_pair_test_out_connector.read_numpys()
    y_pair_test_out = y_pair_test_out_connector.read_file

    # non-paired data set
    X_test_connector = DataConnector(preprocessed_data, 'X_test_pad_sent.npy', data=None)
    X_test_connector.read_numpys()
    X_test = X_test_connector.read_file

    y_test_in_connector = DataConnector(preprocessed_data, 'y_test_in_sent.npy', data=None)
    y_test_in_connector.read_numpys()
    y_test_in = y_test_in_connector.read_file

    y_test_out_connector = DataConnector(preprocessed_data, 'y_test_out_sent.npy', data=None)
    y_test_out_connector.read_numpys()
    y_test_out = y_test_out_connector.read_file

    print("\n Non-paired test set: \n")
    sys.stdout.flush()
    print("X (input for encoder) shape: %s" % str(X_test.shape))       # input for encoder
    sys.stdout.flush()
    print("y_in (input for decoder) shape: %s" % str(y_test_in.shape))  # input for decoder
    sys.stdout.flush()
    print("y_out (output for decoder) shape: %s\n\n" % str(y_test_out.shape))  # output for decoder
    sys.stdout.flush()

    '''
    Decoder model for inference stage
    Return: generated keyphrases
    '''
    glove_embedding_conn = DataConnector(preprocessed_data, glove_embed, data=None)
    glove_embedding_conn.read_pickle()
    pretrained_embedding = glove_embedding_conn.read_file
    print("pretrained_embedding shape: %s" % str(pretrained_embedding.shape))
    print("pretrained_embedding [0][:10]: %s" % str(pretrained_embedding[0, :10]))
    print("pretrained_embedding [1][:10]: %s" % str(pretrained_embedding[1, :10]))

    oov_embedding_conn = DataConnector(preprocessed_data, oov_embed, data=None)
    oov_embedding_conn.read_pickle()
    oov_embedding = oov_embedding_conn.read_file
    print("oov_embedding shape: %s" % str(oov_embedding.shape))
    print("oov_embedding [0][:10]: %s" % str(oov_embedding[0, :10]))
    print("oov_embedding [1][:10]: %s" % str(oov_embedding[1, :10]))
    print("oov_embedding [2][:10]: %s" % str(oov_embedding[2, :10]))

    sampled_softmax = HierarchySampledSoftmax(encoder_length=encoder_length,
                                              decoder_length=decoder_length,
                                              max_sents=max_sents,
                                              embedding_dim=embedding_dim,
                                              birnn_dim=birnn_dim,
                                              rnn_dim=rnn_dim,
                                              vocab_size=vocab_size,
                                              num_samples=num_samples,
                                              filepath=result_path,
                                              filename=file_name,
                                              batch_train_iter=None,
                                              batch_val_iter=None,
                                              batch_size=None,
                                              steps_epoch=None,
                                              val_steps=None,
                                              epochs=None)

    # skeleton of the model architecture
    sampled_softmax.train_hier_sampled_softmax(pretrained_embedding, oov_embedding)

    '''
    Model for retrieving softmax probability
    Return: softmax probability of prediction layer
    '''
    sampled_softmax.predict_sampled_softmax(weights)
    encoder_model = sampled_softmax.encoder_model
    # prediction model after being trained in the sampled softmax setting
    predict_softmax_model = sampled_softmax.prediction_model

    '''
    Compute softmax loss on validation set
    Model: 'Eval' mode sampled softmax
    Return: Loss on validation set
    '''
    # the evaluation block below is disabled; it expects an eval_softmax_model
    """
    t0 = time.time()
    print("Evaluate model with full softmax setting...")
    sys.stdout.flush()

    y_pair_test = y_pair_test_out.reshape((y_pair_test_out.shape[0], y_pair_test_out.shape[1], 1))  # as true labels
    outputs_test = list(y_pair_test.swapaxes(0, 1))
    m_test = X_pair_test.shape[0]
    s0_test = np.zeros((m_test, rnn_dim))
    score = eval_softmax_model.evaluate([X_pair_test, y_pair_test_in, s0_test, y_pair_test], outputs_test, batch_size=64)

    print("average loss: %s" % str(score[0] / (decoder_length + 1)))
    sys.stdout.flush()
    print("all time steps loss: %s" % score)
    sys.stdout.flush()
    avg_loss = score[0] / (decoder_length + 1)
    perplex = np.exp(avg_loss)
    print("average perplexity score: %s" % perplex)
    sys.stdout.flush()
    print("all time steps perplexity score: %s" % (np.exp(score)))
    sys.stdout.flush()

    t1 = time.time()
    print("Full softmax evaluation is done in %.3fsec" % (t1 - t0))
    sys.stdout.flush()
    """

    '''
    Inference stage
    Model: layers from prediction model and decoder model
    Inference (text generation) approaches:
    1. One-best (greedy) search decoding:
       returns the single most probable word sequence, from the joint
       probability of words across the decoder time steps (decoder sequence length).
    2. N-beam search decoding:
       returns the N best most probable word sequences, using a beam tree
       search per time step and the joint probability across the decoder
       time steps (decoder sequence length).
    '''
    decoder_model = sampled_softmax.create_decoder_model()

    # transform tokenized y_true (ground truth of keyphrases) into full keyphrases
    keyphrases_transform = TrueKeyphrases(y_test_true)
    keyphrases_transform.get_true_keyphrases()
    keyphrases_transform.get_stat_keyphrases()
    y_true = keyphrases_transform.y_true
    max_kp_num = keyphrases_transform.max_kp_num
    mean_kp_num = keyphrases_transform.mean_kp_num
    std_kp_num = keyphrases_transform.std_kp_num

    print("Maximum number of key phrases per document in corpus: %s" % max_kp_num)
    sys.stdout.flush()
    print("Average number of key phrases per document in corpus: %s" % mean_kp_num)
    sys.stdout.flush()
    print("Standard deviation of number of key phrases per document in corpus: %s" % std_kp_num)
    sys.stdout.flush()

    # round up to the nearest multiple of 5 when computing the beam width
    def roundup(x):
        return x if x % 5 == 0 else x + 5 - x % 5

    beam_width = int(roundup(mean_kp_num + (3 * std_kp_num)))
    print("\nBeam width: %s\n" % beam_width)
    sys.stdout.flush()
    num_hypotheses = beam_width

    # dummy decoder labels: only the shape matters at inference time
    y_dummy_test = np.zeros((len(X_test), decoder_length + 1, 1))

    print(str(datetime.now()))
    sys.stdout.flush()
    t0 = time.time()
    print("Start decoding...")
    sys.stdout.flush()

    inference_mode = Decoding(encoder_model=encoder_model,
                              decoder_model=decoder_model,
                              indices_words=indices_words,
                              words_indices=words_indices,
                              enc_in_seq=None,
                              labels=None,
                              decoder_length=decoder_length,
                              rnn_dim=rnn_dim,
                              beam_width=beam_width,
                              num_hypotheses=num_hypotheses,
                              filepath=decode_path,
                              filename=file_name)

    t0_1 = time.time()
    print("Start beam decoding...")
    sys.stdout.flush()
    beam_keyphrases = inference_mode.beam_decoder(X_test[:500], y_dummy_test[:500])

    beam_decode_connector = DataConnector(decode_path, 'beam_kp-%s.npy' % (file_name), beam_keyphrases)
    beam_decode_connector.save_numpys()

    t1_1 = time.time()
    print("Beam decoding is done in %.3fsec" % (t1_1 - t0_1))
    sys.stdout.flush()
# Inference runner: hierarchical (sentence-level) seq2seq with full softmax,
# initialized with GloVe + OOV embeddings.
def decoder(params):
    data_path = params['data_path']
    glove_embed = params['glove_embedding']
    oov_embed = params['oov_embedding']
    preprocessed_data = params['preprocessed_data']
    decode_path = params['decode_path']
    model_path = params['model_path']
    result_path = params['result_path']
    file_name = params['file_name']
    weights = params['weights']
    encoder_length = params['encoder_length']
    decoder_length = params['decoder_length']
    max_sents = params['max_sents']
    embedding_dim = params['embedding_dim']
    birnn_dim = params['birnn_dim']
    rnn_dim = params['rnn_dim']
    vocab_size = params['vocab_size']
    batch_size = params['batch_size']
    epoch = params['epoch']

    '''
    Reading vocabulary dictionaries
    '''
    indices_words_connector = DataConnector(preprocessed_data, 'all_idxword_vocabulary_sent_fsoftmax.pkl', data=None)
    indices_words_connector.read_pickle()
    indices_words = indices_words_connector.read_file

    words_indices_connector = DataConnector(preprocessed_data, 'all_wordidx_vocabulary_sent_fsoftmax.pkl', data=None)
    words_indices_connector.read_pickle()
    words_indices = words_indices_connector.read_file

    y_test_true_connector = DataConnector(data_path, 'test_sent_output_tokens.npy', data=None)
    y_test_true_connector.read_numpys()
    y_test_true = y_test_true_connector.read_file

    # non-paired data set
    X_test_connector = DataConnector(preprocessed_data, 'X_test_pad_sent_fsoftmax.npy', data=None)
    X_test_connector.read_numpys()
    X_test = X_test_connector.read_file

    '''
    Decoder model for inference stage
    Return: generated keyphrases
    '''
    glove_embedding_conn = DataConnector(preprocessed_data, glove_embed, data=None)
    glove_embedding_conn.read_pickle()
    pretrained_embedding = glove_embedding_conn.read_file
    print("pretrained_embedding shape: %s" % str(pretrained_embedding.shape))
    print("pretrained_embedding [0][:10]: %s" % str(pretrained_embedding[0, :10]))
    print("pretrained_embedding [1][:10]: %s" % str(pretrained_embedding[1, :10]))

    oov_embedding_conn = DataConnector(preprocessed_data, oov_embed, data=None)
    oov_embedding_conn.read_pickle()
    oov_embedding = oov_embedding_conn.read_file
    print("oov_embedding shape: %s" % str(oov_embedding.shape))
    print("oov_embedding [0][:10]: %s" % str(oov_embedding[0, :10]))
    print("oov_embedding [1][:10]: %s" % str(oov_embedding[1, :10]))
    print("oov_embedding [2][:10]: %s" % str(oov_embedding[2, :10]))

    full_softmax = HierarchyFullSoftmax(encoder_length=encoder_length,
                                        decoder_length=decoder_length,
                                        max_sents=max_sents,
                                        embedding_dim=embedding_dim,
                                        birnn_dim=birnn_dim,
                                        rnn_dim=rnn_dim,
                                        vocab_size=vocab_size,
                                        filepath=result_path,
                                        filename=file_name,
                                        batch_train_iter=None,
                                        batch_val_iter=None,
                                        batch_size=None,
                                        steps_epoch=None,
                                        val_steps=None,
                                        epochs=None)

    # skeleton of the model architecture
    full_softmax.train_hier_seq2seq(pretrained_embedding, oov_embedding)
    encoder_model = full_softmax.encoder_model
    predict_softmax_model = full_softmax.predict_seq2seq(weights)
    decoder_model = full_softmax.create_decoder_model()

    # transform tokenized y_true (ground truth of keyphrases) into full keyphrases
    keyphrases_transform = TrueKeyphrases(y_test_true)
    keyphrases_transform.get_true_keyphrases()
    keyphrases_transform.get_stat_keyphrases()
    y_true = keyphrases_transform.y_true
    max_kp_num = keyphrases_transform.max_kp_num
    mean_kp_num = keyphrases_transform.mean_kp_num
    std_kp_num = keyphrases_transform.std_kp_num

    print("Maximum number of key phrases per document in corpus: %s" % max_kp_num)
    sys.stdout.flush()
    print("Average number of key phrases per document in corpus: %s" % mean_kp_num)
    sys.stdout.flush()
    print("Standard deviation of number of key phrases per document in corpus: %s" % std_kp_num)
    sys.stdout.flush()

    # round up to the nearest multiple of 5 when computing the beam width
    def roundup(x):
        return x if x % 5 == 0 else x + 5 - x % 5

    beam_width = int(roundup(mean_kp_num + (3 * std_kp_num)))
    print("\nBeam width: %s\n" % beam_width)
    sys.stdout.flush()
    num_hypotheses = beam_width

    print(str(datetime.now()))
    sys.stdout.flush()
    t0 = time.time()
    print("Start decoding...")
    sys.stdout.flush()

    inference_mode = DecodingSoftmax(encoder_model=encoder_model,
                                     decoder_model=decoder_model,
                                     indices_words=indices_words,
                                     words_indices=words_indices,
                                     enc_in_seq=None,
                                     decoder_length=decoder_length,
                                     rnn_dim=rnn_dim,
                                     beam_width=beam_width,
                                     num_hypotheses=num_hypotheses,
                                     filepath=decode_path,
                                     filename=file_name)

    t0_1 = time.time()
    print("Start beam decoding...")
    sys.stdout.flush()
    beam_keyphrases = inference_mode.beam_decoder(X_test[:500])

    beam_decode_connector = DataConnector(decode_path, 'beam_kp-%s.npy' % (file_name), beam_keyphrases)
    beam_decode_connector.save_numpys()

    t1_1 = time.time()
    print("Beam decoding is done in %.3fsec" % (t1_1 - t0_1))
    sys.stdout.flush()
# Inference runner: flat seq2seq with full softmax (no pretrained
# embeddings); decodes the merged train/valid/test inputs.
def decoder(params):
    data_path = params['data_path']
    preprocessed_v2 = params['preprocessed_v2']
    preprocessed_data = params['preprocessed_data']
    decode_path = params['decode_path']
    model_path = params['model_path']
    result_path = params['result_path']
    result_kp20k = params['result_kp20k']
    file_name = params['file_name']
    weights = params['weights']
    encoder_length = params['encoder_length']
    decoder_length = params['decoder_length']
    embedding_dim = params['embedding_dim']
    birnn_dim = params['birnn_dim']
    rnn_dim = params['rnn_dim']
    vocab_size = params['vocab_size']
    batch_size = params['batch_size']
    epoch = params['epoch']

    '''
    Reading vocabulary dictionaries
    '''
    indices_words_connector = DataConnector(preprocessed_v2, 'all_indices_words_fsoftmax.pkl', data=None)
    indices_words_connector.read_pickle()
    indices_words = indices_words_connector.read_file

    words_indices_connector = DataConnector(preprocessed_v2, 'all_words_indices_fsoftmax.pkl', data=None)
    words_indices_connector.read_pickle()
    words_indices = words_indices_connector.read_file

    # merge all sets into one test set for the trained model
    train_outputs_conn = DataConnector(data_path, 'train_output_tokens.npy', data=None)
    train_outputs_conn.read_numpys()
    train_outputs = train_outputs_conn.read_file

    valid_outputs_conn = DataConnector(data_path, 'val_output_tokens.npy', data=None)
    valid_outputs_conn.read_numpys()
    valid_outputs = valid_outputs_conn.read_file

    test_outputs_conn = DataConnector(data_path, 'test_output_tokens.npy', data=None)
    test_outputs_conn.read_numpys()
    test_outputs = test_outputs_conn.read_file

    y_test_true = np.concatenate((train_outputs, valid_outputs, test_outputs))
    print("Ground truth of keyphrases shape: %s" % str(y_test_true.shape))
    sys.stdout.flush()

    # non-paired data set
    X_train_connector = DataConnector(preprocessed_data, 'X_train_pad_fsoftmax.npy', data=None)
    X_train_connector.read_numpys()
    X_train = X_train_connector.read_file

    X_valid_connector = DataConnector(preprocessed_data, 'X_valid_pad_fsoftmax.npy', data=None)
    X_valid_connector.read_numpys()
    X_valid = X_valid_connector.read_file

    X_test_connector = DataConnector(preprocessed_data, 'X_test_pad_fsoftmax.npy', data=None)
    X_test_connector.read_numpys()
    X_test = X_test_connector.read_file

    X_in = np.concatenate((X_train, X_valid, X_test))

    print("\n Non-paired test set: \n")
    sys.stdout.flush()
    print("X (input for encoder) shape: %s" % str(X_in.shape))  # input for encoder
    sys.stdout.flush()

    full_softmax = FullSoftmax(encoder_length=encoder_length,
                               decoder_length=decoder_length,
                               embedding_dim=embedding_dim,
                               birnn_dim=birnn_dim,
                               rnn_dim=rnn_dim,
                               vocab_size=vocab_size,
                               filepath=result_kp20k,
                               filename=file_name,
                               batch_train_iter=None,
                               batch_val_iter=None,
                               batch_size=None,
                               steps_epoch=None,
                               val_steps=None,
                               epochs=None)

    full_softmax.train_seq2seq()
    predict_softmax_model = full_softmax.predict_seq2seq(weights)
    encoder_model = full_softmax.encoder_model
    decoder_model = full_softmax.create_decoder_model()

    # transform tokenized y_true (ground truth of keyphrases) into full keyphrases
    keyphrases_transform = TrueKeyphrases(y_test_true)
    keyphrases_transform.get_true_keyphrases()
    keyphrases_transform.get_stat_keyphrases()
    y_true = keyphrases_transform.y_true
    max_kp_num = keyphrases_transform.max_kp_num
    mean_kp_num = keyphrases_transform.mean_kp_num
    std_kp_num = keyphrases_transform.std_kp_num

    print("Maximum number of key phrases per document in corpus: %s" % max_kp_num)
    sys.stdout.flush()
    print("Average number of key phrases per document in corpus: %s" % mean_kp_num)
    sys.stdout.flush()
    print("Standard deviation of number of key phrases per document in corpus: %s" % std_kp_num)
    sys.stdout.flush()

    # round up to the nearest multiple of 5 when computing the beam width
    def roundup(x):
        return x if x % 5 == 0 else x + 5 - x % 5

    beam_width = int(roundup(mean_kp_num + (3 * std_kp_num)))
    num_hypotheses = beam_width
    print("\nBeam width: %s\n" % beam_width)
    sys.stdout.flush()

    inference_mode = DecodingSoftmax(encoder_model=encoder_model,
                                     decoder_model=decoder_model,
                                     indices_words=indices_words,
                                     words_indices=words_indices,
                                     enc_in_seq=None,
                                     decoder_length=decoder_length,
                                     rnn_dim=rnn_dim,
                                     beam_width=beam_width,
                                     num_hypotheses=num_hypotheses,
                                     filepath=decode_path,
                                     filename=file_name)

    t0_1 = time.time()
    print("Start beam decoding...")
    sys.stdout.flush()
    beam_keyphrases = inference_mode.beam_decoder(X_in[:500])

    beam_decode_connector = DataConnector(decode_path, 'beam_kp-%s.npy' % (file_name), beam_keyphrases)
    beam_decode_connector.save_numpys()

    t1_1 = time.time()
    print("Beam decoding is done in %.3fsec" % (t1_1 - t0_1))
    sys.stdout.flush()
def transform_sent_all(params):
    print("\n=========\n")
    print(str(datetime.now()))
    sys.stdout.flush()

    t0 = time.time()
    print("Transforming all data set into integer sequences")
    sys.stdout.flush()

    data_path = params['data_path']
    kp20k_path = params['kp20k_path']
    max_sents = params['max_sents']
    encoder_length = params['encoder_length']
    decoder_length = params['decoder_length']

    '''
    read stored vocabulary index
    '''
    vocab = DataConnector(kp20k_path, 'all_indices_words_sent_r3.pkl', data=None)
    vocab.read_pickle()
    indices_words = vocab.read_file

    reversed_vocab = DataConnector(kp20k_path, 'all_words_indices_sent_r3.pkl', data=None)
    reversed_vocab.read_pickle()
    words_indices = reversed_vocab.read_file

    '''
    read tokenized data set
    '''
    in_connector = DataConnector(data_path, 'input_sent_tokens.npy', data=None)
    in_connector.read_numpys()
    input_tokens = in_connector.read_file

    out_connector = DataConnector(data_path, 'output_sent_tokens.npy', data=None)
    out_connector.read_numpys()
    output_tokens = out_connector.read_file

    '''
    transforming texts into integer sequences
    '''
    sequences_processing = SequenceProcessing(indices_words, words_indices, encoder_length, decoder_length)
    x_in = sequences_processing.in_sents_to_integers(in_texts=input_tokens, max_sents=max_sents)
    x_in_pad = sequences_processing.pad_sequences_sent_in(max_len=encoder_length, max_sents=max_sents, sequences=x_in)
    y_in, y_out = sequences_processing.outtexts_to_integers(out_texts=output_tokens)

    x_in_connector = DataConnector(data_path, 'X_sent_r3.npy', x_in)
    x_in_connector.save_numpys()
    x_in_pad_connector = DataConnector(data_path, 'X_sent_pad_r3.npy', x_in_pad)
    x_in_pad_connector.save_numpys()
    y_in_connector = DataConnector(data_path, 'y_sent_in_r3.npy', y_in)
    y_in_connector.save_numpys()
    y_out_connector = DataConnector(data_path, 'y_sent_out_r3.npy', y_out)
    y_out_connector.save_numpys()

    t1 = time.time()
    print("Transforming data set into integer sequences of inputs - outputs done in %.3fsec" % (t1 - t0))
    sys.stdout.flush()
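# Illustration (assumption): the hierarchical padding performed by
# `pad_sequences_sent_in`. Each document is a list of sentences; every
# sentence is padded or truncated to `max_len` tokens and every document to
# `max_sents` sentences, giving a fixed (n_docs, max_sents, max_len) input
# for the sentence-level (hierarchical) encoder. A minimal sketch:
def _pad_sequences_sent_sketch(docs, max_len, max_sents, pad_id=0):
    padded_docs = []
    for doc in docs:
        sents = [list(sent[:max_len]) + [pad_id] * max(0, max_len - len(sent))
                 for sent in doc[:max_sents]]
        # pad the document itself with empty sentences up to max_sents
        sents += [[pad_id] * max_len] * (max_sents - len(sents))
        padded_docs.append(sents)
    return np.array(padded_docs)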
def transform_v2_fsoftmax(params):
    print("\n=========\n")
    print(str(datetime.now()))
    sys.stdout.flush()

    t0 = time.time()
    print("Transforming all data set into integer sequences")
    sys.stdout.flush()

    data_path = params['data_path']
    preprocessed_data = params['preprocessed_data']
    preprocessed_v2 = params['preprocessed_v2']
    encoder_length = params['encoder_length']
    decoder_length = params['decoder_length']

    '''
    read stored vocabulary index
    '''
    vocab = DataConnector(preprocessed_v2, 'all_idxword_vocabulary_fsoftmax.pkl', data=None)
    vocab.read_pickle()
    indices_words = vocab.read_file

    reversed_vocab = DataConnector(preprocessed_v2, 'all_wordidx_vocabulary_fsoftmax.pkl', data=None)
    reversed_vocab.read_pickle()
    words_indices = reversed_vocab.read_file

    '''
    read tokenized data set
    '''
    in_connector = DataConnector(data_path, 'input_tokens.npy', data=None)
    in_connector.read_numpys()
    input_tokens = in_connector.read_file

    out_connector = DataConnector(data_path, 'output_tokens.npy', data=None)
    out_connector.read_numpys()
    output_tokens = out_connector.read_file

    '''
    transforming texts into integer sequences
    '''
    sequences_processing = SequenceProcessing(indices_words, words_indices, encoder_length, decoder_length)
    x_in = sequences_processing.intexts_to_integers(input_tokens)
    x_in_pad = sequences_processing.pad_sequences_in(encoder_length, x_in)
    y_in, y_out = sequences_processing.outtexts_to_integers(output_tokens)

    x_in_connector = DataConnector(preprocessed_data, 'X_fsoftmax.npy', x_in)
    x_in_connector.save_numpys()
    x_in_pad_connector = DataConnector(preprocessed_data, 'X_pad_fsoftmax.npy', x_in_pad)
    x_in_pad_connector.save_numpys()
    y_in_connector = DataConnector(preprocessed_data, 'y_in_fsoftmax.npy', y_in)
    y_in_connector.save_numpys()
    y_out_connector = DataConnector(preprocessed_data, 'y_out_fsoftmax.npy', y_out)
    y_out_connector.save_numpys()

    t1 = time.time()
    print("Transforming data set into integer sequences of inputs - outputs done in %.3fsec" % (t1 - t0))
    sys.stdout.flush()
def preprocessing_(params):
    data_path = params['data_path']

    print("\n=========\n")
    sys.stdout.flush()
    print(str(datetime.now()))
    sys.stdout.flush()

    t0 = time.time()
    print("Reading raw test data...")
    sys.stdout.flush()

    # this data set consists of title, abstract, main text, and the list of
    # topics (keyphrases) of scientific articles;
    # title + abstract is used as the model input
    data_connector = DataConnector(data_path, 'krapivin_doc_keyphrases.pkl', data=None)
    data_connector.read_pickle()
    data = data_connector.read_file

    in_text = []
    out_keyphrases = []
    for k, v in data.items():
        title = v[0]
        abstract = v[1]
        text = title + " . " + abstract  # v[2] (main text) is not used
        kps = v[3]
        in_text.append(text)
        out_keyphrases.append(kps)

    print("\nnumber of examples in raw data inputs: %s\n" % (len(in_text)))
    sys.stdout.flush()
    print("\nnumber of examples in raw data outputs: %s\n" % (len(out_keyphrases)))
    sys.stdout.flush()
    print("\n in_text[0]: %s\n" % (in_text[0]))
    sys.stdout.flush()
    print("\n out_keyphrases[0]: %s\n" % (out_keyphrases[0]))
    sys.stdout.flush()

    prep = Preprocessing()
    prep_inputs = prep.preprocess_in(in_text)
    prep_outputs = prep.preprocess_out(out_keyphrases)
    input_tokens = prep.tokenize_in(prep_inputs)
    output_tokens = prep.tokenize_out(prep_outputs)
    # all tokens, before splitting the data into training and test sets
    all_tokens = prep.get_all_tokens(input_tokens, output_tokens)

    print("\nnumber of examples in preprocessed data inputs: %s\n" % (len(input_tokens)))
    sys.stdout.flush()
    print("\nnumber of examples in preprocessed data outputs: %s\n" % (len(output_tokens)))
    sys.stdout.flush()
    print("\n input_tokens[0]: %s\n" % (input_tokens[0]))
    sys.stdout.flush()
    print("\n output_tokens[0]: %s\n" % (output_tokens[0]))
    sys.stdout.flush()

    in_connector = DataConnector(data_path, 'input_tokens.npy', input_tokens)
    in_connector.save_numpys()
    out_connector = DataConnector(data_path, 'output_tokens.npy', output_tokens)
    out_connector.save_numpys()
    tokens_connector = DataConnector(data_path, 'all_tokens.npy', all_tokens)
    tokens_connector.save_numpys()

    # splitting into training (80%) and test (20%) sets
    n_train = int(0.8 * len(input_tokens))
    in_train = input_tokens[:n_train]
    out_train = output_tokens[:n_train]
    in_test = input_tokens[n_train:]
    out_test = output_tokens[n_train:]

    print("\nnumber of examples in training set: %s\n" % (len(in_train)))
    sys.stdout.flush()
    print("\nnumber of examples in test set: %s\n" % (len(in_test)))
    sys.stdout.flush()

    in_train_connector = DataConnector(data_path, 'in_train.npy', in_train)
    in_train_connector.save_numpys()
    out_train_connector = DataConnector(data_path, 'out_train.npy', out_train)
    out_train_connector.save_numpys()
    in_test_connector = DataConnector(data_path, 'in_test.npy', in_test)
    in_test_connector.save_numpys()
    out_test_connector = DataConnector(data_path, 'out_test.npy', out_test)
    out_test_connector.save_numpys()

    t1 = time.time()
    print("Reading raw test data done in %.3fsec" % (t1 - t0))
    sys.stdout.flush()