Example #1
def transform_test(params):

	print("\n=========\n")
	sys.stdout.flush()

	print(str(datetime.now()))
	sys.stdout.flush()

	t0 = time.time()

	print("Transforming test set into integer sequences")
	sys.stdout.flush()

	data_path = params['data_path']

	encoder_length = params['encoder_length']
	decoder_length = params['decoder_length']

	'''
	read stored vocabulary index
	'''

	vocab = DataConnector(data_path, 'all_indices_words.pkl', data=None)
	vocab.read_pickle()
	indices_words = vocab.read_file

	reversed_vocab = DataConnector(data_path, 'all_words_indices.pkl', data=None)
	reversed_vocab.read_pickle()
	words_indices = reversed_vocab.read_file

	'''
	read tokenized data set
	'''

	test_in_tokens_connector = DataConnector(data_path, 'in_test.npy', data=None)
	test_in_tokens_connector.read_numpys()
	test_in_tokens = test_in_tokens_connector.read_file
	test_out_tokens_connector = DataConnector(data_path, 'out_test.npy', data=None)
	test_out_tokens_connector.read_numpys()
	test_out_tokens = test_out_tokens_connector.read_file

	'''
	transforming texts into integer sequences
	'''
	sequences_processing = SequenceProcessing(indices_words, words_indices, encoder_length, decoder_length)
	X_test = sequences_processing.intexts_to_integers(test_in_tokens)
	y_test_in, y_test_out = sequences_processing.outtexts_to_integers(test_out_tokens)


	x_in_connector = DataConnector(data_path, 'X_test.npy', X_test)
	x_in_connector.save_numpys()
	y_in_connector = DataConnector(data_path, 'y_test_in.npy', y_test_in)
	y_in_connector.save_numpys()
	y_out_connector = DataConnector(data_path, 'y_test_out.npy', y_test_out)
	y_out_connector.save_numpys()

	t1 = time.time()
	print("Transforming test set into integer sequences of inputs - outputs done in %.3fsec" % (t1 - t0))
	sys.stdout.flush()
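SequenceProcessing.intexts_to_integers is not shown in this example; below is a minimal sketch of the token-to-integer step it performs. The function name, padding and unknown-word conventions are assumptions for illustration, not the project's API.

def tokens_to_integers(token_docs, words_indices, max_len, unk_id=1, pad_id=0):
	# Map each token to its vocabulary index, falling back to unk_id for out-of-vocabulary words,
	# then pad / truncate every document to max_len (e.g. the encoder_length above).
	sequences = []
	for tokens in token_docs:
		ids = [words_indices.get(t, unk_id) for t in tokens][:max_len]
		sequences.append(ids + [pad_id] * (max_len - len(ids)))
	return sequences

The real implementation may additionally insert start- and end-of-sequence markers for the decoder targets (y_test_in / y_test_out).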
Example #2
def decoder(params):

    data_path = params['data_path']
    preprocessed_v2 = params['preprocessed_v2']
    preprocessed_data = params['preprocessed_data']
    decode_path = params['decode_path']
    model_path = params['model_path']
    result_path = params['result_path']
    file_name = params['file_name']
    weights = params['weights']

    encoder_length = params['encoder_length']
    decoder_length = params['decoder_length']
    embedding_dim = params['embedding_dim']
    birnn_dim = params['birnn_dim']
    rnn_dim = params['rnn_dim']
    vocab_size = params['vocab_size']
    '''
	Reading vocabulary dictionaries

	'''
    indices_words_connector = DataConnector(preprocessed_v2,
                                            'all_indices_words_fsoftmax.pkl',
                                            data=None)
    indices_words_connector.read_pickle()
    indices_words = indices_words_connector.read_file

    words_indices_connector = DataConnector(preprocessed_v2,
                                            'all_words_indices_fsoftmax.pkl',
                                            data=None)
    words_indices_connector.read_pickle()
    words_indices = words_indices_connector.read_file

    y_test_true_connector = DataConnector(data_path,
                                          'test_output_tokens.npy',
                                          data=None)
    y_test_true_connector.read_numpys()
    y_test_true = y_test_true_connector.read_file

    # paired data set

    X_pair_test_connector = DataConnector(preprocessed_data,
                                          'x_pair_test_fsoftmax.npy',
                                          data=None)
    X_pair_test_connector.read_numpys()
    X_pair_test = X_pair_test_connector.read_file

    y_pair_test_in_connector = DataConnector(preprocessed_data,
                                             'y_pair_test_in_fsoftmax.npy',
                                             data=None)
    y_pair_test_in_connector.read_numpys()
    y_pair_test_in = y_pair_test_in_connector.read_file

    y_pair_test_out_connector = DataConnector(preprocessed_data,
                                              'y_pair_test_out_fsoftmax.npy',
                                              data=None)
    y_pair_test_out_connector.read_numpys()
    y_pair_test_out = y_pair_test_out_connector.read_file

    # non-paired data set

    X_test_connector = DataConnector(preprocessed_data,
                                     'X_test_pad_fsoftmax.npy',
                                     data=None)
    X_test_connector.read_numpys()
    X_test = X_test_connector.read_file

    y_test_in_connector = DataConnector(preprocessed_data,
                                        'y_test_in_fsoftmax.npy',
                                        data=None)
    y_test_in_connector.read_numpys()
    y_test_in = y_test_in_connector.read_file

    y_test_out_connector = DataConnector(preprocessed_data,
                                         'y_test_out_fsoftmax.npy',
                                         data=None)
    y_test_out_connector.read_numpys()
    y_test_out = y_test_out_connector.read_file

    print("\n Non-paired test set: \n")
    sys.stdout.flush()
    print("X (input for encoder) shape: %s" %
          str(X_test.shape))  # input for encoder
    sys.stdout.flush()
    print("y_in (input for decoder) shape: %s" %
          str(y_test_in.shape))  # input for decoder
    sys.stdout.flush()
    print("y_out (output for decoder) shape: %s\n\n" %
          str(y_test_out.shape))  # output for decoder
    sys.stdout.flush()
    '''
	Decoder model for inference stage
	Return: generated keyphrases
	'''

    full_softmax = AttentionFullSoftmax(encoder_length=encoder_length,
                                        decoder_length=decoder_length,
                                        embedding_dim=embedding_dim,
                                        birnn_dim=birnn_dim,
                                        rnn_dim=rnn_dim,
                                        vocab_size=vocab_size,
                                        filepath=result_path,
                                        filename=file_name,
                                        batch_train_iter=None,
                                        batch_val_iter=None,
                                        batch_size=None,
                                        steps_epoch=None,
                                        val_steps=None,
                                        epochs=None)

    # skeleton of model architecture
    full_softmax.train_att_seq2seq()

    predict_softmax_model = full_softmax.predict_att_seq2seq(weights)
    encoder_model = full_softmax.encoder_model
    '''
	Inference stage
	Model: layers from the prediction model and the decoder model
	Inference (text generation) approaches:
	1. One-best search decoding (greedy search):
	   returns the single most probable word sequence, based on the joint probability of words over the decoder time steps (decoder sequence length)
	2. N-beam search decoding:
	   returns the N most probable word sequences, using a beam tree search at each time step and the joint probability over the decoder time steps (decoder sequence length)
	'''

    decoder_model = full_softmax.create_decoder_model()

    # transform tokenized y_true (ground truth of keyphrases) into full sentences / keyphrases
    keyphrases_transform = TrueKeyphrases(y_test_true)
    keyphrases_transform.get_true_keyphrases()
    keyphrases_transform.get_stat_keyphrases()
    y_true = keyphrases_transform.y_true
    max_kp_num = keyphrases_transform.max_kp_num
    mean_kp_num = keyphrases_transform.mean_kp_num
    std_kp_num = keyphrases_transform.std_kp_num

    print("Maximum number of key phrases per document in corpus: %s" %
          max_kp_num)
    sys.stdout.flush()
    print("Average number of key phrases per document in corpus: %s" %
          mean_kp_num)
    sys.stdout.flush()
    print(
        "Standard Deviation of number of key phrases per document in corpus: %s"
        % std_kp_num)
    sys.stdout.flush()

    # round up function for computing beam width
    def roundup(x):
        return x if x % 5 == 0 else x + 5 - x % 5
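    # Illustration (values assumed, not from the corpus): with mean_kp_num = 7.2 and
    # std_kp_num = 2.5, mean + 3*std = 14.7 and roundup(14.7) = 14.7 + 5 - (14.7 % 5) = 15.0,
    # so the int() cast below gives a beam width of 15.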

    beam_width = int(roundup(mean_kp_num + (3 * std_kp_num)))
    print("\nBeam width: %s\n" % beam_width)
    sys.stdout.flush()
    num_hypotheses = beam_width

    s0_test = np.zeros((len(X_test), rnn_dim))
    att0_test = np.zeros((len(X_test), encoder_length, 1))

    print(str(datetime.now()))
    sys.stdout.flush()

    inference_mode = Decoding(encoder_model=encoder_model,
                              decoder_model=decoder_model,
                              indices_words=indices_words,
                              words_indices=words_indices,
                              enc_in_seq=None,
                              states=None,
                              attentions=None,
                              decoder_length=decoder_length,
                              rnn_dim=rnn_dim,
                              beam_width=beam_width,
                              num_hypotheses=num_hypotheses,
                              filepath=decode_path,
                              filename=file_name)

    t0_1 = time.time()
    print("Start beam decoding...")
    sys.stdout.flush()

    beam_keyphrases = inference_mode.beam_decoder(X_test[:500], s0_test[:500],
                                                  att0_test[:500])

    beam_decode_connector = DataConnector(decode_path,
                                          'beam_kp-%s.npy' % (file_name),
                                          beam_keyphrases)
    beam_decode_connector.save_numpys()

    t1_1 = time.time()
    print("Beam decoding is done in %.3fsec" % (t1_1 - t0_1))
    sys.stdout.flush()
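The Decoding class used above is project-specific. As a generic illustration of the "one-best (greedy) search" mentioned in the inference docstring, not the project's implementation, a greedy loop over an assumed per-step callback might look like this:

import numpy as np

def greedy_decode(step_fn, start_id, end_id, max_len):
    # step_fn(prev_id, state) -> (probability vector over the vocabulary, new state);
    # an assumed callback wrapping one decoder time step.
    seq, state = [start_id], None
    for _ in range(max_len):
        probs, state = step_fn(seq[-1], state)
        next_id = int(np.argmax(probs))  # keep only the single most probable word
        if next_id == end_id:
            break
        seq.append(next_id)
    return seq[1:]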
Example #3
def evaluator(params):

    title = params['title']
    data_path = params['data_path']
    idx_words = params['idx_words']
    words_idx = params['words_idx']
    preprocessed_v2 = params['preprocessed_v2']
    preprocessed_data = params['preprocessed_data']
    decode_path = params['decode_path']
    result_path = params['result_path']
    decoded = params['decoded_files']
    y_true = params['y_true']
    '''
    Reading vocabulary dictionaries

    '''

    indices_words_connector = DataConnector(preprocessed_v2,
                                            idx_words,
                                            data=None)
    indices_words_connector.read_pickle()
    indices_words = indices_words_connector.read_file

    words_indices_connector = DataConnector(preprocessed_v2,
                                            words_idx,
                                            data=None)
    words_indices_connector.read_pickle()
    words_indices = words_indices_connector.read_file
    '''

    indices_words_connector = DataConnector(preprocessed_v2, 'all_indices_words_fsoftmax.pkl', data=None)
    indices_words_connector.read_pickle()
    indices_words = indices_words_connector.read_file

    words_indices_connector = DataConnector(preprocessed_v2, 'all_words_indices_fsoftmax.pkl', data=None)
    words_indices_connector.read_pickle()
    words_indices = words_indices_connector.read_file

    '''

    # y_true (true keyphrases) from test set

    y_test_true_connector = DataConnector(data_path, y_true, data=None)
    y_test_true_connector.read_numpys()
    y_test_true = y_test_true_connector.read_file
    '''
    Reading generated keyphrases

    '''
    # read N-generated keyphrases
    #kp_paths = ['keyphrases-beam-sts-kp20k-v1-%s' % (i) for i in range(500)]
    #kp_paths = ['keyphrases-beam-sts-kp20k-att-v1-%s' % (i) for i in range(500)]
    #kp_paths = ['keyphrases-beam-sts-kp20k-fsoftmax-v1-%s' % (i) for i in range(500)]
    kp_paths = ['%s-%s' % (decoded, i) for i in range(500)]

    dataconn = DataConnector(filepath=None, filename=None)

    # uncomment this to read all generated hypotheses into one flat list
    # hypotheses = dataconn.read_pickles_all(result_path, kp_paths)

    # uncomment this to read all generated hypotheses as a list of per-document arrays
    hypotheses = dataconn.read_pickles_doc(decode_path, kp_paths)
    '''
    print("length of hypotheses: %s" % (len(hypotheses)))
    print("len hypotheses[0]: %s" % len(hypotheses[0]))
    print("len hypotheses[1]: %s" % len(hypotheses[1]))
    print("len hypotheses[2]: %s" % len(hypotheses[2]))
    print("len hypotheses[3]: %s" % len(hypotheses[3]))
    '''

    # on average
    n_rank = [1, 5, 10, 15, 20]
    all_rank_prediction = []
    for n in range(len(n_rank)):
        beam_predicted_keyphrases = []
        for keyphrase_list in hypotheses:
            stemmed_kps = []
            for keyphrases in keyphrase_list:

                beam_decoded = BeamDecoded(keyphrases, words_indices,
                                           indices_words, result_path)
                keyphrase = beam_decoded.decript_hypotheses()

                tokenized_keyphrase = keyphrase.split()
                #stemmed_kps = set()
                for kp in tokenized_keyphrase:
                    stemmed = sno.stem(kp)  # 'sno' is assumed to be a module-level stemmer (e.g. an NLTK SnowballStemmer) defined outside this snippet
                    if stemmed not in stemmed_kps:
                        stemmed_kps.append(stemmed)

            decoded_kps = stemmed_kps[:n_rank[n]]

            beam_predicted_keyphrases.append(decoded_kps)
            #beam_predicted_keyphrases.extend(decoded_kps)
        all_rank_prediction.append(beam_predicted_keyphrases)
    '''
    Evaluating generated keyphrases of sampled softmax model + beam search decoding approach
    '''

    all_rank_acc = []
    all_rank_precision = []
    all_rank_recall = []
    all_rank_fscore = []

    all_rank_tps = []
    all_rank_fns = []
    all_rank_fps = []

    print("******************")
    print("Model: %s" % (title))
    print("******************")

    for i, beam_predicted in enumerate(all_rank_prediction):
        evaluate_beam = Evaluate(beam_predicted, y_test_true, result_path)

        evaluate_beam.get_true_label_list()

        y_true = evaluate_beam.y_true

        y_pred = evaluate_beam.y_pred
        '''
        print("length of y_true: %s"%(len(y_true)))
        print("y_true[0]: %s"%str(y_true[0]))
        print("y_pred[0]: %s"%str(y_pred[0]))
        print("=============================")
        print("y_true[1]: %s"%str(y_true[1]))
        print("y_pred[1]: %s"%str(y_pred[1]))
        print("=============================")
        print("y_true[2]: %s"%str(y_true[2]))
        print("y_pred[2]: %s"%str(y_pred[2]))
        print("=============================")
        print("y_true[3]: %s"%str(y_true[3]))
        print("y_pred[3]: %s"%str(y_pred[3]))
        print("=============================")
        print("y_true[4]: %s"%str(y_true[4]))
        print("y_pred[4]: %s"%str(y_pred[4]))
        print("=============================")
        print("y_true[5]: %s"%str(y_true[5]))
        print("y_pred[5]: %s"%str(y_pred[5]))
        print("=============================")
        print("y_true[6]: %s"%str(y_true[6]))
        print("y_pred[6]: %s"%str(y_pred[6]))
        print("=============================")
        print("y_true[7]: %s"%str(y_true[7]))
        print("y_pred[7]: %s"%str(y_pred[7]))
        print("=============================")
        '''

        evaluate_beam.compute_true_positive()
        evaluate_beam.compute_false_negative()
        evaluate_beam.compute_false_positive()

        evaluate_beam.compute_accuracy()
        evaluate_beam.compute_precision()
        evaluate_beam.compute_recall()
        evaluate_beam.compute_fscore()

        mean_acc, mean_precision, mean_recall, mean_fscore = evaluate_beam.compute_mean_evals(
        )
        all_rank_acc.append(mean_acc)
        all_rank_precision.append(mean_precision)
        all_rank_recall.append(mean_recall)
        all_rank_fscore.append(mean_fscore)

        mean_tps, mean_fns, mean_fps = evaluate_beam.compute_mean_cm()
        all_rank_tps.append(mean_tps)
        all_rank_fns.append(mean_fns)
        all_rank_fps.append(mean_fps)

        print("===================")

        print("N-Rank: %s" % (n_rank[i]))

        evaluate_beam.print_mean_evals()
        evaluate_beam.print_mean_cm()
        '''

        
        tps, tp_list = evaluate_beam.compute_true_positive_all()
        fns, fn_list = evaluate_beam.compute_false_negative_all()
        fps, fp_list = evaluate_beam.compute_false_positive_all()

        all_rank_tps.append(tps)
        all_rank_fns.append(fns)
        all_rank_fps.append(fps)

        print("===================")
        print("N-Rank: %s"%(n_rank[i]))

        acc = evaluate_beam.compute_accuracy_all()
        precision = evaluate_beam.compute_precision_all()
        recall = evaluate_beam.compute_recall_all()
        fscore = evaluate_beam.compute_fscore_all()

        all_rank_acc.append(acc)
        all_rank_precision.append(precision)
        all_rank_recall.append(recall)
        all_rank_fscore.append(fscore)
        
        '''

    all_rank_acc_conn = DataConnector(result_path, 'all_rank_acc_unigrams',
                                      all_rank_acc)
    all_rank_acc_conn.save_pickle()

    all_rank_precision_conn = DataConnector(result_path,
                                            'all_rank_precision_unigrams',
                                            all_rank_precision)
    all_rank_precision_conn.save_pickle()

    all_rank_recall_conn = DataConnector(result_path,
                                         'all_rank_recall_unigrams',
                                         all_rank_recall)
    all_rank_recall_conn.save_pickle()

    all_rank_fscore_conn = DataConnector(result_path,
                                         'all_rank_fscore_unigrams',
                                         all_rank_fscore)
    all_rank_fscore_conn.save_pickle()

    all_rank_tps_conn = DataConnector(result_path, 'all_rank_tps_unigrams',
                                      all_rank_tps)
    all_rank_tps_conn.save_pickle()

    all_rank_fns_conn = DataConnector(result_path, 'all_rank_fns_unigrams',
                                      all_rank_fns)
    all_rank_fns_conn.save_pickle()

    all_rank_fps_conn = DataConnector(result_path, 'all_rank_fps_unigrams',
                                      all_rank_fps)
    all_rank_fps_conn.save_pickle()

    plot_metrics = Plotting(all_rank_acc, all_rank_precision, all_rank_recall,
                            all_rank_fscore, all_rank_tps, all_rank_fps,
                            all_rank_fns, result_path)
    plot_metrics.plot_acc_fscore('plot_acc_fscore_unigrams.png')
    plot_metrics.plot_metrics('plot_metrics_unigrams.png')
    plot_metrics.plot_confusion_matrix('plot_confusion_matrix_unigrams.png')
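The Evaluate class itself is not shown here; per-document precision, recall and F1 at rank N over stemmed keyphrases are conventionally computed along these lines (a minimal sketch, names assumed):

def prf_at_n(pred_kps, true_kps):
    # pred_kps: top-N predicted (stemmed) keyphrases; true_kps: ground-truth (stemmed) keyphrases
    tp = len(set(pred_kps) & set(true_kps))
    precision = tp / len(pred_kps) if pred_kps else 0.0
    recall = tp / len(true_kps) if true_kps else 0.0
    fscore = 2 * precision * recall / (precision + recall) if precision + recall else 0.0
    return precision, recall, fscore

The per-document scores would then be averaged over the corpus, which is presumably what compute_mean_evals() does above.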
Example #4
def trainer(params):

	try:

		data_path = params['data_path']
		preprocessed_data = params['preprocessed_data']
		preprocessed_v2 = params['preprocessed_v2']
		model_path = params['model_path']
		result_path = params['result_path']
		file_name = params['file_name']

		encoder_length = params['encoder_length']
		decoder_length = params['decoder_length']
		max_sents = params['max_sents']
		embedding_dim = params['embedding_dim']
		birnn_dim = params['birnn_dim']
		rnn_dim = params['rnn_dim']
		vocab_size = params['vocab_size']
		batch_size = params['batch_size']
		epoch = params['epoch']

		'''
		Reading vocabulary dictionaries

		'''
		indices_words_connector = DataConnector(preprocessed_v2, 'all_indices_words_sent_fsoftmax.pkl', data=None)
		indices_words_connector.read_pickle()
		indices_words = indices_words_connector.read_file

		words_indices_connector = DataConnector(preprocessed_v2, 'all_words_indices_sent_fsoftmax.pkl', data=None)
		words_indices_connector.read_pickle()
		words_indices = words_indices_connector.read_file


		'''
		Reading X, y pair data set for training and validating model

		'''
		# 1. training set

		X_train_connector = DataConnector(preprocessed_data, 'x_pair_train_sent_fsoftmax.npy', data=None)
		X_train_connector.read_numpys()
		X_train = X_train_connector.read_file

		y_train_in_connector = DataConnector(preprocessed_data, 'y_pair_train_in_sent_fsoftmax.npy', data=None)
		y_train_in_connector.read_numpys()
		y_train_in = y_train_in_connector.read_file

		y_train_out_connector = DataConnector(preprocessed_data, 'y_pair_train_out_sent_fsoftmax.npy', data=None)
		y_train_out_connector.read_numpys()
		y_train_out = y_train_out_connector.read_file

		print("\n X,y pair of training set: \n")
		sys.stdout.flush()
		print("X (input for encoder) shape: %s"%str(X_train.shape)) # input for encoder
		sys.stdout.flush()
		print("y_in (input for decoder) shape: %s"%str(y_train_in.shape)) # input for decoder
		sys.stdout.flush()
		print("y_out (output for decoder) shape: %s\n\n"%str(y_train_out.shape)) # output for decoder
		sys.stdout.flush()

		# 2. validation set

		# pair data set

		X_valid_pair_connector = DataConnector(preprocessed_data, 'x_pair_valid_sent_fsoftmax.npy', data=None)
		X_valid_pair_connector.read_numpys()
		X_valid_pair = X_valid_pair_connector.read_file

		y_valid_in_pair_connector = DataConnector(preprocessed_data, 'y_pair_valid_in_sent_fsoftmax.npy', data=None)
		y_valid_in_pair_connector.read_numpys()
		y_valid_in_pair = y_valid_in_pair_connector.read_file

		y_valid_out_pair_connector = DataConnector(preprocessed_data, 'y_pair_valid_out_sent_fsoftmax.npy', data=None)
		y_valid_out_pair_connector.read_numpys()
		y_valid_out_pair = y_valid_out_pair_connector.read_file

		print("\n X, y pair of validation set: \n")
		sys.stdout.flush()
		print("X (input for encoder) shape: %s"%str(X_valid_pair.shape)) # input for encoder
		sys.stdout.flush()
		print("y_in (input for decoder) shape: %s"%str(y_valid_in_pair.shape)) # input for decoder
		sys.stdout.flush()
		print("y_out (output for decoder) shape: %s\n\n"%str(y_valid_out_pair.shape)) # output for decoder
		sys.stdout.flush()


		steps_epoch = len(X_train) // batch_size  # integer division: steps per epoch must be a whole number
		batch_train_iter = DataiteratorAttention(X_train, y_train_in, y_train_out, vocab_size, decoder_dim=rnn_dim, batch_size=batch_size)

		val_steps = len(X_valid_pair) // batch_size  # integer division: validation steps must be a whole number
		batch_val_iter = DataiteratorAttention(X_valid_pair, y_valid_in_pair, y_valid_out_pair, vocab_size, decoder_dim=rnn_dim, batch_size=batch_size)


	except:
		raise

	'''
	1. Initiate model for training Seq2Seq with sampled softmax layer
	2. Compile with sampled softmax training loss, as an underestimate of full softmax loss
	3. Train with per-batch samples

	'''

	full_softmax = HierarchyAttFullSoftmax(encoder_length, decoder_length, max_sents, embedding_dim, birnn_dim, rnn_dim, vocab_size, result_path, file_name, batch_train_iter, batch_val_iter, batch_size, steps_epoch, val_steps, epoch)

	'''
	Train model with sampled softmax layer 
	Return: LOSS in training stage (an underestimate of full softmax)
	'''

	print(str(datetime.now()))
	sys.stdout.flush()

	full_softmax.train_hier_att_seq2seq()
	full_softmax.compile_()

	print(str(datetime.now()))
	sys.stdout.flush()

	t0 = time.time()
	print("Training hierarchical model with approximate softmax + attention...")
	sys.stdout.flush()

	full_softmax.train_()

	t1 = time.time()
	print("training is done in %.3fsec" % (t1 - t0))
	sys.stdout.flush()

	full_softmax.plot_()
Example #5
def decoder(params):

    data_path = params['data_path']
    glove_embed = params['glove_embedding']
    oov_embed = params['oov_embedding']
    preprocessed_v2 = params['preprocessed_v2']
    preprocessed_data = params['preprocessed_data']
    decode_path = params['decode_path']
    model_path = params['model_path']
    result_path = params['result_path']
    result_kp20k = params['result_kp20k']
    file_name = params['file_name']
    weights = params['weights']

    encoder_length = params['encoder_length']
    decoder_length = params['decoder_length']
    embedding_dim = params['embedding_dim']
    birnn_dim = params['birnn_dim']
    rnn_dim = params['rnn_dim']
    vocab_size = params['vocab_size']
    batch_size = params['batch_size']
    epoch = params['epoch']
    '''
	Reading vocabulary dictionaries

	'''
    indices_words_connector = DataConnector(
        preprocessed_v2, 'all_idxword_vocabulary_fsoftmax.pkl', data=None)
    indices_words_connector.read_pickle()
    indices_words = indices_words_connector.read_file

    words_indices_connector = DataConnector(
        preprocessed_v2, 'all_wordidx_vocabulary_fsoftmax.pkl', data=None)
    words_indices_connector.read_pickle()
    words_indices = words_indices_connector.read_file

    ## merge all sets into one test set for the trained model

    train_outputs_conn = DataConnector(data_path,
                                       'train_output_tokens.npy',
                                       data=None)
    train_outputs_conn.read_numpys()
    train_outputs = train_outputs_conn.read_file

    valid_outputs_conn = DataConnector(data_path,
                                       'val_output_tokens.npy',
                                       data=None)
    valid_outputs_conn.read_numpys()
    valid_outputs = valid_outputs_conn.read_file

    test_outputs_conn = DataConnector(data_path,
                                      'test_output_tokens.npy',
                                      data=None)
    test_outputs_conn.read_numpys()
    test_outputs = test_outputs_conn.read_file

    y_test_true = np.concatenate((train_outputs, valid_outputs, test_outputs))

    print("Ground truth of keyphrases shape: %s" %
          str(y_test_true.shape))  # ground-truth keyphrases
    sys.stdout.flush()

    # non-paired data set

    X_train_connector = DataConnector(preprocessed_data,
                                      'X_train_pad_fsoftmax.npy',
                                      data=None)
    X_train_connector.read_numpys()
    X_train = X_train_connector.read_file

    X_valid_connector = DataConnector(preprocessed_data,
                                      'X_valid_pad_fsoftmax.npy',
                                      data=None)
    X_valid_connector.read_numpys()
    X_valid = X_valid_connector.read_file

    X_test_connector = DataConnector(preprocessed_data,
                                     'X_test_pad_fsoftmax.npy',
                                     data=None)
    X_test_connector.read_numpys()
    X_test = X_test_connector.read_file

    X_in = np.concatenate((X_train, X_valid, X_test))

    glove_embedding_conn = DataConnector(preprocessed_v2,
                                         glove_embed,
                                         data=None)
    glove_embedding_conn.read_pickle()
    pretrained_embedding = glove_embedding_conn.read_file

    print("pretrained_embedding shape: %s" % str(pretrained_embedding.shape))
    print("pretrained_embedding [0][:10]: %s" %
          str(pretrained_embedding[0, :10]))
    print("pretrained_embedding [1][:10]: %s" %
          str(pretrained_embedding[1, :10]))

    oov_embedding_conn = DataConnector(preprocessed_v2, oov_embed, data=None)
    oov_embedding_conn.read_pickle()
    oov_embedding = oov_embedding_conn.read_file

    print("oov_embedding shape: %s" % str(oov_embedding.shape))
    print("oov_embedding [0][:10]: %s" % str(oov_embedding[0, :10]))
    print("oov_embedding [1][:10]: %s" % str(oov_embedding[1, :10]))
    print("oov_embedding [2][:10]: %s" % str(oov_embedding[2, :10]))

    full_softmax = AttentionFullSoftmax(encoder_length=encoder_length,
                                        decoder_length=decoder_length,
                                        embedding_dim=embedding_dim,
                                        birnn_dim=birnn_dim,
                                        rnn_dim=rnn_dim,
                                        vocab_size=vocab_size,
                                        filepath=result_kp20k,
                                        filename=file_name,
                                        batch_train_iter=None,
                                        batch_val_iter=None,
                                        batch_size=None,
                                        steps_epoch=None,
                                        val_steps=None,
                                        epochs=None)

    full_softmax.train_att_seq2seq(pretrained_embedding, oov_embedding)

    full_softmax.predict_att_seq2seq(weights)
    encoder_model = full_softmax.encoder_model
    # 1. Prediction model after training (full softmax setting)
    predict_softmax_model = full_softmax.prediction_model
    '''
	Inference stage
	Model: layers from the prediction model and the decoder model
	Inference (text generation) approaches:
	1. One-best search decoding (greedy search):
	   returns the single most probable word sequence, based on the joint probability of words over the decoder time steps (decoder sequence length)
	2. N-beam search decoding:
	   returns the N most probable word sequences, using a beam tree search at each time step and the joint probability over the decoder time steps (decoder sequence length)
	'''
    '''
	Decoder model for inference stage
	Return: generated keyphrases
	'''

    decoder_model = full_softmax.create_decoder_model()

    # transform tokenized y_true (ground truth of keyphrases) into full sentences / keyphrases
    keyphrases_transform = TrueKeyphrases(y_test_true)
    keyphrases_transform.get_true_keyphrases()
    keyphrases_transform.get_stat_keyphrases()
    y_true = keyphrases_transform.y_true
    max_kp_num = keyphrases_transform.max_kp_num
    mean_kp_num = keyphrases_transform.mean_kp_num
    std_kp_num = keyphrases_transform.std_kp_num

    print("Maximum number of key phrases per document in corpus: %s" %
          max_kp_num)
    sys.stdout.flush()
    print("Average number of key phrases per document in corpus: %s" %
          mean_kp_num)
    sys.stdout.flush()
    print(
        "Standard Deviation of number of key phrases per document in corpus: %s"
        % std_kp_num)
    sys.stdout.flush()

    # round up function for computing beam width
    def roundup(x):
        return x if x % 5 == 0 else x + 5 - x % 5

    beam_width = int(roundup(mean_kp_num + (3 * std_kp_num)))
    num_hypotheses = beam_width
    print("\nBeam width: %s\n" % beam_width)
    sys.stdout.flush()

    s0_test = np.zeros((len(X_in), rnn_dim))
    att0_test = np.zeros((len(X_in), encoder_length, 1))

    print(str(datetime.now()))
    sys.stdout.flush()

    inference_mode = Decoding(encoder_model=encoder_model,
                              decoder_model=decoder_model,
                              indices_words=indices_words,
                              words_indices=words_indices,
                              enc_in_seq=None,
                              states=None,
                              attentions=None,
                              decoder_length=decoder_length,
                              rnn_dim=rnn_dim,
                              beam_width=beam_width,
                              num_hypotheses=num_hypotheses,
                              filepath=decode_path,
                              filename=file_name)

    t0_1 = time.time()
    print("Start beam decoding...")
    sys.stdout.flush()

    beam_keyphrases = inference_mode.beam_decoder(X_in[:500], s0_test[:500],
                                                  att0_test[:500])

    beam_decode_connector = DataConnector(decode_path,
                                          'beam_kp-%s.npy' % (file_name),
                                          beam_keyphrases)
    beam_decode_connector.save_numpys()

    t1_1 = time.time()
    print("Beam decoding is done in %.3fsec" % (t1_1 - t0_1))
    sys.stdout.flush()
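Complementing the greedy sketch after Example #2, here is a generic sketch of the N-beam search described in this example's inference docstring, again over an assumed per-step callback rather than the project's Decoding.beam_decoder:

import numpy as np

def beam_decode(step_fn, start_id, end_id, max_len, beam_width):
    # step_fn(prev_id, state) -> (log-probability vector over the vocabulary, new state); assumed callback.
    beams = [([start_id], None, 0.0)]          # (sequence, decoder state, cumulative log prob)
    finished = []
    for _ in range(max_len):
        candidates = []
        for seq, state, score in beams:
            log_probs, new_state = step_fn(seq[-1], state)
            for wid in np.argsort(log_probs)[-beam_width:]:
                candidates.append((seq + [int(wid)], new_state, score + float(log_probs[wid])))
        candidates.sort(key=lambda c: c[2], reverse=True)
        beams = []
        for cand in candidates[:beam_width]:
            (finished if cand[0][-1] == end_id else beams).append(cand)
        if not beams:
            break
    return sorted(finished + beams, key=lambda c: c[2], reverse=True)  # hypotheses ranked by joint log prob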
Example #6
def evaluator(params):

    data_path = params['data_path']
    preprocessed_v2 = params['preprocessed_v2']
    preprocessed_data = params['preprocessed_data']
    decode_path = params['decode_path']
    model_path = params['model_path']
    result_path = params['result_path']
    file_name = params['file_name']
    weights = params['weights']

    idx_words = params['idx_words']
    words_idx = params['words_idx']
    decoded = params['decoded_files']
    y_1 = params['y1']
    y_2 = params['y2']
    y_3 = params['y3']
    '''
    Reading vocabulary dictionaries

    '''
    indices_words_connector = DataConnector(preprocessed_v2,
                                            idx_words,
                                            data=None)
    indices_words_connector.read_pickle()
    indices_words = indices_words_connector.read_file

    words_indices_connector = DataConnector(preprocessed_v2,
                                            words_idx,
                                            data=None)
    words_indices_connector.read_pickle()
    words_indices = words_indices_connector.read_file

    ## merge all sets into one test set for the trained model

    train_outputs_conn = DataConnector(data_path, y_1, data=None)
    train_outputs_conn.read_numpys()
    train_outputs = train_outputs_conn.read_file

    valid_outputs_conn = DataConnector(data_path, y_2, data=None)
    valid_outputs_conn.read_numpys()
    valid_outputs = valid_outputs_conn.read_file

    test_outputs_conn = DataConnector(data_path, y_3, data=None)
    test_outputs_conn.read_numpys()
    test_outputs = test_outputs_conn.read_file

    y_test_true = np.concatenate((train_outputs, valid_outputs, test_outputs))

    print("Ground truth of keyphrases shape: %s" %
          str(y_test_true.shape))  # ground-truth keyphrases
    sys.stdout.flush()
    '''
    Reading generated keyphrases

    '''

    # number of documents in the Inspec dataset
    kp_paths = ['%s-%s' % (decoded, i) for i in range(500)]

    #kp_paths = ['keyphrases-beam-decode-inspec-%s'%(i) for i in range(2000)]
    dataconn = DataConnector(filepath=None, filename=None)
    # uncomment this to read all generated hypotheses into one flat list
    #hypotheses = dataconn.read_pickles_all(result_path, kp_paths)

    # uncomment this to read all generated hypotheses as a list of per-document arrays

    hypotheses = dataconn.read_pickles_doc(decode_path, kp_paths)

    print("length of hypotheses: %s" % (len(hypotheses)))
    print("len hypotheses[0]: %s" % len(hypotheses[0]))
    print("len hypotheses[1]: %s" % len(hypotheses[1]))
    print("len hypotheses[2]: %s" % len(hypotheses[2]))
    print("len hypotheses[3]: %s" % len(hypotheses[3]))

    # on average
    n_rank = [1, 5, 10, 15, 20]

    all_rank_prediction = []
    for n in range(len(n_rank)):

        beam_predicted_keyphrases = []

        for keyphrase_list in hypotheses:

            i = 0

            decoded_kps = []
            for keyphrases in keyphrase_list:

                beam_decoded = BeamDecoded(keyphrases, words_indices,
                                           indices_words, result_path)
                keyphrase = beam_decoded.decript_hypotheses()
                if i < n_rank[n]:
                    decoded_kps.append(keyphrase)
                    i += 1
                continue

            # print("len(decoded_kps): %s"%len(decoded_kps))

            beam_predicted_keyphrases.append(decoded_kps)
            #beam_predicted_keyphrases.extend(decoded_kps)
        all_rank_prediction.append(beam_predicted_keyphrases)
    '''
    Evaluating generated keyphrases of sampled softmax model + beam search decoding approach
    '''

    all_rank_acc = []
    all_rank_precision = []
    all_rank_recall = []
    all_rank_fscore = []

    all_rank_tps = []
    all_rank_fns = []
    all_rank_fps = []

    for i, beam_predicted in enumerate(all_rank_prediction):
        evaluate_beam = Evaluate(beam_predicted, y_test_true, result_path)

        evaluate_beam.get_true_keyphrases()
        #evaluate_beam.get_true_keyphrases_all()
        y_true = evaluate_beam.y_true
        print("length of y_true: %s" % (len(y_true)))
        '''

        print("length of y_true: %s"%(len(y_true)))
        print("y_true[0]: %s"%str(y_true[0]))
        print("y_true[1]: %s"%str(y_true[1]))
        print("y_true[2]: %s"%str(y_true[2]))
        print("y_true[3]: %s"%str(y_true[3]))
        print("y_true[4]: %s"%str(y_true[4]))
        print("y_true[5]: %s"%str(y_true[5]))
        print("y_true[6]: %s"%str(y_true[6]))
        print("y_true[7]: %s"%str(y_true[7]))

        '''

        evaluate_beam.compute_true_positive()
        evaluate_beam.compute_false_negative()
        evaluate_beam.compute_false_positive()

        evaluate_beam.compute_accuracy()
        evaluate_beam.compute_precision()
        evaluate_beam.compute_recall()
        evaluate_beam.compute_fscore()

        mean_acc, mean_precision, mean_recall, mean_fscore = evaluate_beam.compute_mean_evals(
        )
        all_rank_acc.append(mean_acc)
        all_rank_precision.append(mean_precision)
        all_rank_recall.append(mean_recall)
        all_rank_fscore.append(mean_fscore)

        mean_tps, mean_fns, mean_fps = evaluate_beam.compute_mean_cm()
        all_rank_tps.append(mean_tps)
        all_rank_fns.append(mean_fns)
        all_rank_fps.append(mean_fps)

        print("===================")
        print("N-Rank: %s" % (n_rank[i]))

        evaluate_beam.print_mean_evals()
        evaluate_beam.print_mean_cm()
        '''

        
        tps, tp_list = evaluate_beam.compute_true_positive_all()
        fns, fn_list = evaluate_beam.compute_false_negative_all()
        fps, fp_list = evaluate_beam.compute_false_positive_all()

        all_rank_tps.append(tps)
        all_rank_fns.append(fns)
        all_rank_fps.append(fps)

        print("===================")
        print("N-Rank: %s"%(n_rank[i]))

        acc = evaluate_beam.compute_accuracy_all()
        precision = evaluate_beam.compute_precision_all()
        recall = evaluate_beam.compute_recall_all()
        fscore = evaluate_beam.compute_fscore_all()

        all_rank_acc.append(acc)
        all_rank_precision.append(precision)
        all_rank_recall.append(recall)
        all_rank_fscore.append(fscore)
        
        '''

    all_rank_acc_conn = DataConnector(result_path, 'all_rank_acc',
                                      all_rank_acc)
    all_rank_acc_conn.save_pickle()

    all_rank_precision_conn = DataConnector(result_path, 'all_rank_precision',
                                            all_rank_precision)
    all_rank_precision_conn.save_pickle()

    all_rank_recall_conn = DataConnector(result_path, 'all_rank_recall',
                                         all_rank_recall)
    all_rank_recall_conn.save_pickle()

    all_rank_fscore_conn = DataConnector(result_path, 'all_rank_fscore',
                                         all_rank_fscore)
    all_rank_fscore_conn.save_pickle()

    all_rank_tps_conn = DataConnector(result_path, 'all_rank_tps',
                                      all_rank_tps)
    all_rank_tps_conn.save_pickle()

    all_rank_fns_conn = DataConnector(result_path, 'all_rank_fns',
                                      all_rank_fns)
    all_rank_fns_conn.save_pickle()

    all_rank_fps_conn = DataConnector(result_path, 'all_rank_fps',
                                      all_rank_fps)
    all_rank_fps_conn.save_pickle()

    plot_metrics = Plotting(all_rank_acc, all_rank_precision, all_rank_recall,
                            all_rank_fscore, all_rank_tps, all_rank_fps,
                            all_rank_fns, result_path)
    plot_metrics.plot_acc_fscore()
    plot_metrics.plot_metrics()
    plot_metrics.plot_confusion_matrix()
Example #7
def decoder(params):

    data_path = params['data_path']
    preprocessed_v2 = params['preprocessed_v2']
    preprocessed_data = params['preprocessed_data']
    decode_path = params['decode_path']
    model_path = params['model_path']
    result_path = params['result_path']
    result_kp20k = params['result_kp20k']
    file_name = params['file_name']
    weights = params['weights']

    encoder_length = params['encoder_length']
    decoder_length = params['decoder_length']
    max_sents = params['max_sents']
    embedding_dim = params['embedding_dim']
    birnn_dim = params['birnn_dim']
    rnn_dim = params['rnn_dim']
    vocab_size = params['vocab_size']
    num_samples = params['num_samples']
    batch_size = params['batch_size']
    epoch = params['epoch']
    '''
	Reading vocabulary dictionaries

	'''
    indices_words_connector = DataConnector(preprocessed_v2,
                                            'all_indices_words_sent.pkl',
                                            data=None)
    indices_words_connector.read_pickle()
    indices_words = indices_words_connector.read_file

    words_indices_connector = DataConnector(preprocessed_v2,
                                            'all_words_indices_sent.pkl',
                                            data=None)
    words_indices_connector.read_pickle()
    words_indices = words_indices_connector.read_file

    ## ground-truth output tokens for the trained model

    outputs_conn = DataConnector(data_path,
                                 'output_sent_tokens.npy',
                                 data=None)
    outputs_conn.read_numpys()
    outputs = outputs_conn.read_file

    y_test_true = outputs

    print("Ground truth of keyphrases shape: %s" %
          str(y_test_true.shape))  # ground-truth keyphrases
    sys.stdout.flush()

    # non-paired data set

    X_connector = DataConnector(data_path, 'X_sent_pad.npy', data=None)
    X_connector.read_numpys()
    X_in = X_connector.read_file

    print("\n Non-paired test set: \n")
    sys.stdout.flush()
    print("X (input for encoder) shape: %s" %
          str(X_in.shape))  # input for encoder
    sys.stdout.flush()
    '''
	Decoder model for inference stage
	Return: generated keyphrases
	'''

    sampled_softmax = HierarchySampledSoftmax(encoder_length=encoder_length,
                                              decoder_length=decoder_length,
                                              max_sents=max_sents,
                                              embedding_dim=embedding_dim,
                                              birnn_dim=birnn_dim,
                                              rnn_dim=rnn_dim,
                                              vocab_size=vocab_size,
                                              num_samples=num_samples,
                                              filepath=result_kp20k,
                                              filename=file_name,
                                              batch_train_iter=None,
                                              batch_val_iter=None,
                                              batch_size=None,
                                              steps_epoch=None,
                                              val_steps=None,
                                              epochs=None)

    # skeleton of model architecture
    sampled_softmax.train_hier_sampled_softmax()
    '''
	Model for retrieving softmax probability
	Return: softmax probability of prediction layer
	'''

    sampled_softmax.predict_sampled_softmax(weights)
    encoder_model = sampled_softmax.encoder_model
    # 1. Prediction model after being trained on sampled softmax setting
    predict_softmax_model = sampled_softmax.prediction_model
    '''
	Inference stage
	Model: layers from the prediction model and the decoder model
	Inference (text generation) approaches:
	1. One-best search decoding (greedy search):
	   returns the single most probable word sequence, based on the joint probability of words over the decoder time steps (decoder sequence length)
	2. N-beam search decoding:
	   returns the N most probable word sequences, using a beam tree search at each time step and the joint probability over the decoder time steps (decoder sequence length)
	'''

    decoder_model = sampled_softmax.create_decoder_model()

    # transform tokenized y_true (ground truth of keyphrases) into full sentences / keyphrases
    keyphrases_transform = TrueKeyphrases(y_test_true)
    keyphrases_transform.get_true_keyphrases()
    keyphrases_transform.get_stat_keyphrases()
    y_true = keyphrases_transform.y_true
    max_kp_num = keyphrases_transform.max_kp_num
    mean_kp_num = keyphrases_transform.mean_kp_num
    std_kp_num = keyphrases_transform.std_kp_num

    print("Maximum number of key phrases per document in corpus: %s" %
          max_kp_num)
    sys.stdout.flush()
    print("Average number of key phrases per document in corpus: %s" %
          mean_kp_num)
    sys.stdout.flush()
    print(
        "Standard Deviation of number of key phrases per document in corpus: %s"
        % std_kp_num)
    sys.stdout.flush()

    # round up function for computing beam width
    def roundup(x):
        return x if x % 5 == 0 else x + 5 - x % 5

    beam_width = int(roundup(mean_kp_num + (3 * std_kp_num)))
    num_hypotheses = beam_width
    print("\nBeam width: %s\n" % beam_width)
    sys.stdout.flush()

    y_dummy_test = np.zeros((len(X_in), decoder_length + 1, 1))

    inference_mode = Decoding(encoder_model=encoder_model,
                              decoder_model=decoder_model,
                              indices_words=indices_words,
                              words_indices=words_indices,
                              enc_in_seq=None,
                              labels=None,
                              decoder_length=decoder_length,
                              rnn_dim=rnn_dim,
                              beam_width=beam_width,
                              num_hypotheses=num_hypotheses,
                              filepath=decode_path,
                              filename=file_name)

    t0_1 = time.time()
    print("Start beam decoding...")
    sys.stdout.flush()

    beam_keyphrases = inference_mode.beam_decoder(X_in[:500],
                                                  y_dummy_test[:500])

    beam_decode_connector = DataConnector(decode_path,
                                          'beam_kp-hier-%s.npy' % (file_name),
                                          beam_keyphrases)
    beam_decode_connector.save_numpys()

    t1_1 = time.time()
    print("Beam decoding is done in %.3fsec" % (t1_1 - t0_1))
    sys.stdout.flush()
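The internals of HierarchySampledSoftmax are not part of this example. For orientation only, sampled-softmax training losses are commonly built on TensorFlow's tf.nn.sampled_softmax_loss, roughly as below; this is a hedged TF 1.x-style sketch with assumed argument names, not the project's code:

import tensorflow as tf

def sampled_loss(output_weights, output_biases, labels, decoder_outputs, num_samples, vocab_size):
    # labels: (batch, 1) integer word ids; decoder_outputs: (batch, rnn_dim) pre-projection states.
    # Only num_samples negative classes are evaluated per step instead of the full vocab_size.
    return tf.reduce_mean(
        tf.nn.sampled_softmax_loss(weights=output_weights, biases=output_biases,
                                   labels=labels, inputs=decoder_outputs,
                                   num_sampled=num_samples, num_classes=vocab_size))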
Example #8
def trainer(params):

	try:

		data_path = params['data_path']
		preprocessed_data = params['preprocessed_data']
		glove_path = params['glove_path']
		glove_embed = params['glove_embedding']
		oov_embed = params['oov_embedding']
		model_path = params['model_path']
		result_path = params['result_path']
		file_name = params['file_name']

		encoder_length = params['encoder_length']
		decoder_length = params['decoder_length']
		embedding_dim = params['embedding_dim']
		birnn_dim = params['birnn_dim']
		rnn_dim = params['rnn_dim']
		vocab_size = params['vocab_size']
		batch_size = params['batch_size']
		epoch = params['epoch']

		'''
		Reading vocabulary dictionaries

		'''
		indices_words_connector = DataConnector(preprocessed_data, 'all_idxword_vocabulary_fsoftmax.pkl', data=None)
		indices_words_connector.read_pickle()
		indices_words = indices_words_connector.read_file

		words_indices_connector = DataConnector(preprocessed_data, 'all_wordidx_vocabulary_fsoftmax.pkl', data=None)
		words_indices_connector.read_pickle()
		words_indices = words_indices_connector.read_file


		'''
		Reading X, y pair data set for training and validating model

		'''
		# 1. training set

		X_train_connector = DataConnector(preprocessed_data, 'x_pair_train_fsoftmax.npy', data=None)
		X_train_connector.read_numpys()
		X_train = X_train_connector.read_file

		y_train_in_connector = DataConnector(preprocessed_data, 'y_pair_train_in_fsoftmax.npy', data=None)
		y_train_in_connector.read_numpys()
		y_train_in = y_train_in_connector.read_file

		y_train_out_connector = DataConnector(preprocessed_data, 'y_pair_train_out_fsoftmax.npy', data=None)
		y_train_out_connector.read_numpys()
		y_train_out = y_train_out_connector.read_file

		print("\n X,y pair of training set: \n")
		sys.stdout.flush()
		print("X (input for encoder) shape: %s"%str(X_train.shape)) # input for encoder
		sys.stdout.flush()
		print("y_in (input for decoder) shape: %s"%str(y_train_in.shape)) # input for decoder
		sys.stdout.flush()
		print("y_out (output for decoder) shape: %s\n\n"%str(y_train_out.shape)) # output for decoder
		sys.stdout.flush()

		# 2. validation set

		# pair data set

		X_valid_pair_connector = DataConnector(preprocessed_data, 'x_pair_valid_fsoftmax.npy', data=None)
		X_valid_pair_connector.read_numpys()
		X_valid_pair = X_valid_pair_connector.read_file

		y_valid_in_pair_connector = DataConnector(preprocessed_data, 'y_pair_valid_in_fsoftmax.npy', data=None)
		y_valid_in_pair_connector.read_numpys()
		y_valid_in_pair = y_valid_in_pair_connector.read_file

		y_valid_out_pair_connector = DataConnector(preprocessed_data, 'y_pair_valid_out_fsoftmax.npy', data=None)
		y_valid_out_pair_connector.read_numpys()
		y_valid_out_pair = y_valid_out_pair_connector.read_file

		print("\n X, y pair of validation set: \n")
		sys.stdout.flush()
		print("X (input for encoder) shape: %s"%str(X_valid_pair.shape)) # input for encoder
		sys.stdout.flush()
		print("y_in (input for decoder) shape: %s"%str(y_valid_in_pair.shape)) # input for decoder
		sys.stdout.flush()
		print("y_out (output for decoder) shape: %s\n\n"%str(y_valid_out_pair.shape)) # output for decoder
		sys.stdout.flush()

		# non-pair data set

		X_valid_connector = DataConnector(preprocessed_data, 'X_valid_pad_fsoftmax.npy', data=None)
		X_valid_connector.read_numpys()
		X_valid = X_valid_connector.read_file

		y_valid_in_connector = DataConnector(preprocessed_data, 'y_valid_in_fsoftmax.npy', data=None)
		y_valid_in_connector.read_numpys()
		y_valid_in = y_valid_in_connector.read_file

		y_valid_out_connector = DataConnector(preprocessed_data, 'y_valid_out_fsoftmax.npy', data=None)
		y_valid_out_connector.read_numpys()
		y_valid_out = y_valid_out_connector.read_file

		print("\n Non-paired validation set: \n")
		sys.stdout.flush()
		print("X (input for encoder) shape: %s"%str(X_valid.shape)) # input for encoder
		sys.stdout.flush()
		print("y_in (input for decoder) shape: %s"%str(y_valid_in.shape)) # input for decoder
		sys.stdout.flush()
		print("y_out (output for decoder) shape: %s\n\n"%str(y_valid_out.shape)) # output for decoder
		sys.stdout.flush()


		# 3. test set

		# paired data set

		X_pair_test_connector = DataConnector(preprocessed_data, 'x_pair_test_fsoftmax.npy', data=None)
		X_pair_test_connector.read_numpys()
		X_pair_test = X_pair_test_connector.read_file

		y_pair_test_in_connector = DataConnector(preprocessed_data, 'y_pair_test_in_fsoftmax.npy', data=None)
		y_pair_test_in_connector.read_numpys()
		y_pair_test_in = y_pair_test_in_connector.read_file

		y_pair_test_out_connector = DataConnector(preprocessed_data, 'y_pair_test_out_fsoftmax.npy', data=None)
		y_pair_test_out_connector.read_numpys()
		y_pair_test_out = y_pair_test_out_connector.read_file

		# non-paired data set


		X_test_connector = DataConnector(preprocessed_data, 'X_test_pad_fsoftmax.npy', data=None)
		X_test_connector.read_numpys()
		X_test = X_test_connector.read_file

		y_test_in_connector = DataConnector(preprocessed_data, 'y_test_in_fsoftmax.npy', data=None)
		y_test_in_connector.read_numpys()
		y_test_in = y_test_in_connector.read_file

		y_test_out_connector = DataConnector(preprocessed_data, 'y_test_out_fsoftmax.npy', data=None)
		y_test_out_connector.read_numpys()
		y_test_out = y_test_out_connector.read_file

		print("\n Non-paired test set: \n")
		sys.stdout.flush()
		print("X (input for encoder) shape: %s"%str(X_test.shape)) # input for encoder
		sys.stdout.flush()
		print("y_in (input for decoder) shape: %s"%str(y_test_in.shape)) # input for decoder
		sys.stdout.flush()
		print("y_out (output for decoder) shape: %s\n\n"%str(y_test_out.shape)) # output for decoder
		sys.stdout.flush()


		# 4. y_true (true keyphrases) from test set

		y_test_true_connector = DataConnector(data_path, 'test_output_tokens.npy', data=None)
		y_test_true_connector.read_numpys()
		y_test_true = y_test_true_connector.read_file

		'''
		Data iterator: preparing per batch training set
		
		INPUTS: 

		x_train (sequence in encoder). dimension shape ( #examples, encoder_length )

		y_train_in (y_true labels to be exposed to the decoder layer as a part of "teacher forcing" method). dimension shape ( #examples, decoder_length )

		y_train_out (y_true labels as the output projection of decoder layer -- need to be provided to calculate entropy of prediction vs. ground truth)

		OUTPUTS: 
		Per batch training examples through data iterator and generator

		x_train, y_train_in, states, labels, y_output

		states : numpy zeros array with dimension (#examples, #decoder dimension) --> will be used as the initial state of decoder

		labels: 3D shape of y_train_out. Needed as input for sampled softmax layer. dimension shape ( #examples, decoder_length, 1 )

		y_output: list format of labels --> since we use one-step decoder and sampled softmax projection. ( decoder_length, #examples, 1 )


		'''

		steps_epoch = len(X_train) // batch_size  # integer division: steps per epoch must be a whole number
		batch_train_iter = DataiteratorAttention(X_train, y_train_in, y_train_out, vocab_size, decoder_dim=rnn_dim, batch_size=batch_size)

		val_steps = len(X_valid_pair) // batch_size  # integer division: validation steps must be a whole number
		batch_val_iter = DataiteratorAttention(X_valid_pair, y_valid_in_pair, y_valid_out_pair, vocab_size, decoder_dim=rnn_dim, batch_size=batch_size)


	except:
		raise

	'''
	1. Initiate the Seq2Seq model with attention and a full softmax output layer
	2. Compile with the full softmax training loss
	3. Train with per-batch samples
	'''

	glove_embedding_conn = DataConnector(preprocessed_data, glove_embed, data=None)
	glove_embedding_conn.read_pickle()
	pretrained_embedding = glove_embedding_conn.read_file

	print("pretrained_embedding shape: %s"%str(pretrained_embedding.shape))
	print("pretrained_embedding [0][:10]: %s"%str(pretrained_embedding[0,:10]))
	print("pretrained_embedding [1][:10]: %s"%str(pretrained_embedding[1,:10]))

	oov_embedding_conn = DataConnector(preprocessed_data, oov_embed, data=None)
	oov_embedding_conn.read_pickle()
	oov_embedding = oov_embedding_conn.read_file

	print("oov_embedding shape: %s"%str(oov_embedding.shape))
	print("oov_embedding [0][:10]: %s"%str(oov_embedding[0,:10]))
	print("oov_embedding [1][:10]: %s"%str(oov_embedding[1,:10]))
	print("oov_embedding [2][:10]: %s"%str(oov_embedding[2,:10]))

	full_softmax = AttentionFullSoftmax(encoder_length, decoder_length, embedding_dim, birnn_dim, rnn_dim, vocab_size, result_path, file_name, batch_train_iter, batch_val_iter, batch_size, steps_epoch, val_steps, epoch)

	'''
	Train the model with the full softmax layer
	Return: LOSS during the training stage
	'''

	print(str(datetime.now()))
	sys.stdout.flush()

	full_softmax.train_att_seq2seq(pretrained_embedding, oov_embedding)
	full_softmax.compile_()

	print(str(datetime.now()))
	sys.stdout.flush()

	t0 = time.time()
	print("Training model with attention + full softmax...")
	sys.stdout.flush()

	full_softmax.train_()

	t1 = time.time()
	print("training is done in %.3fsec" % (t1 - t0))
	sys.stdout.flush()

	full_softmax.plot_()
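DataiteratorAttention is project-specific; a minimal generator matching the inputs/outputs described in the docstring above could look like the sketch below. Shapes are assumed numpy arrays, and the exact tuple the real iterator yields may differ.

import numpy as np

def batch_iter(X, y_in, y_out, decoder_dim, batch_size):
	# Endlessly yields ([x, y_in, s0], labels): s0 is the zero initial decoder state and
	# labels is y_out reshaped to (batch, decoder_length, 1), as the docstring describes.
	n = len(X)
	while True:
		for start in range(0, n, batch_size):
			end = min(start + batch_size, n)
			s0 = np.zeros((end - start, decoder_dim))
			labels = y_out[start:end][..., np.newaxis]
			yield [X[start:end], y_in[start:end], s0], labels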
Example #9
def decoder(params):

    data_path = params['data_path']
    preprocessed_data = params['preprocessed_data']
    glove_embed = params['glove_embedding']
    oov_embed = params['oov_embedding']
    model_path = params['model_path']
    result_path = params['result_path']
    decode_path = params['decode_path']
    file_name = params['file_name']
    weights = params['weights']

    encoder_length = params['encoder_length']
    decoder_length = params['decoder_length']
    max_sents = params['max_sents']
    embedding_dim = params['embedding_dim']
    birnn_dim = params['birnn_dim']
    rnn_dim = params['rnn_dim']
    vocab_size = params['vocab_size']
    num_samples = params['num_samples']
    batch_size = params['batch_size']
    epoch = params['epoch']
    '''
	Reading vocabulary dictionaries

	'''
    indices_words_connector = DataConnector(preprocessed_data,
                                            'all_idxword_vocabulary_sent.pkl',
                                            data=None)
    indices_words_connector.read_pickle()
    indices_words = indices_words_connector.read_file

    words_indices_connector = DataConnector(preprocessed_data,
                                            'all_wordidx_vocabulary_sent.pkl',
                                            data=None)
    words_indices_connector.read_pickle()
    words_indices = words_indices_connector.read_file

    y_test_true_connector = DataConnector(data_path,
                                          'test_sent_output_tokens.npy',
                                          data=None)
    y_test_true_connector.read_numpys()
    y_test_true = y_test_true_connector.read_file

    # paired data set

    X_pair_test_connector = DataConnector(preprocessed_data,
                                          'x_pair_test_sent.npy',
                                          data=None)
    X_pair_test_connector.read_numpys()
    X_pair_test = X_pair_test_connector.read_file

    y_pair_test_in_connector = DataConnector(preprocessed_data,
                                             'y_pair_test_in_sent.npy',
                                             data=None)
    y_pair_test_in_connector.read_numpys()
    y_pair_test_in = y_pair_test_in_connector.read_file

    y_pair_test_out_connector = DataConnector(preprocessed_data,
                                              'y_pair_test_out_sent.npy',
                                              data=None)
    y_pair_test_out_connector.read_numpys()
    y_pair_test_out = y_pair_test_out_connector.read_file

    # non-paired data set

    X_test_connector = DataConnector(preprocessed_data,
                                     'X_test_pad_sent.npy',
                                     data=None)
    X_test_connector.read_numpys()
    X_test = X_test_connector.read_file

    y_test_in_connector = DataConnector(preprocessed_data,
                                        'y_test_in_sent.npy',
                                        data=None)
    y_test_in_connector.read_numpys()
    y_test_in = y_test_in_connector.read_file

    y_test_out_connector = DataConnector(preprocessed_data,
                                         'y_test_out_sent.npy',
                                         data=None)
    y_test_out_connector.read_numpys()
    y_test_out = y_test_out_connector.read_file

    print("\n Non-paired test set: \n")
    sys.stdout.flush()
    print("X (input for encoder) shape: %s" %
          str(X_test.shape))  # input for encoder
    sys.stdout.flush()
    print("y_in (input for decoder) shape: %s" %
          str(y_test_in.shape))  # input for decoder
    sys.stdout.flush()
    print("y_out (output for decoder) shape: %s\n\n" %
          str(y_test_out.shape))  # output for decoder
    sys.stdout.flush()
    '''
	Decoder model for inference stage
	Return: generated keyphrases
	'''

    glove_embedding_conn = DataConnector(preprocessed_data,
                                         glove_embed,
                                         data=None)
    glove_embedding_conn.read_pickle()
    pretrained_embedding = glove_embedding_conn.read_file

    print("pretrained_embedding shape: %s" % str(pretrained_embedding.shape))
    print("pretrained_embedding [0][:10]: %s" %
          str(pretrained_embedding[0, :10]))
    print("pretrained_embedding [1][:10]: %s" %
          str(pretrained_embedding[1, :10]))

    oov_embedding_conn = DataConnector(preprocessed_data, oov_embed, data=None)
    oov_embedding_conn.read_pickle()
    oov_embedding = oov_embedding_conn.read_file

    print("oov_embedding shape: %s" % str(oov_embedding.shape))
    print("oov_embedding [0][:10]: %s" % str(oov_embedding[0, :10]))
    print("oov_embedding [1][:10]: %s" % str(oov_embedding[1, :10]))
    print("oov_embedding [2][:10]: %s" % str(oov_embedding[2, :10]))

    sampled_softmax = HierarchySampledSoftmax(encoder_length=encoder_length,
                                              decoder_length=decoder_length,
                                              max_sents=max_sents,
                                              embedding_dim=embedding_dim,
                                              birnn_dim=birnn_dim,
                                              rnn_dim=rnn_dim,
                                              vocab_size=vocab_size,
                                              num_samples=num_samples,
                                              filepath=result_path,
                                              filename=file_name,
                                              batch_train_iter=None,
                                              batch_val_iter=None,
                                              batch_size=None,
                                              steps_epoch=None,
                                              val_steps=None,
                                              epochs=None)

    # skeleton of model architecture
    sampled_softmax.train_hier_sampled_softmax(pretrained_embedding,
                                               oov_embedding)
    '''
	Model for retrieving softmax probability
	Return: softmax probability of prediction layer
	'''

    sampled_softmax.predict_sampled_softmax(weights)
    encoder_model = sampled_softmax.encoder_model
    # 1. Prediction model after being trained on sampled softmax setting
    predict_softmax_model = sampled_softmax.prediction_model
    '''
	Compute softmax loss on validation set
	Model: 'Eval' mode sampled softmax
	Return: Loss on validation set
	'''
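    # NOTE (assumption): the evaluation block below is left commented out; enabling it would require an
    # eval-mode model (`eval_softmax_model`) compiled with the full-softmax loss, which this function
    # does not build.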
    """

	t0 = time.time()
	print("Evaluate model with full softmax setting...")
	sys.stdout.flush()

	y_pair_test = y_pair_test_out.reshape((y_pair_test_out.shape[0], y_pair_test_out.shape[1], 1)) # as true labels
	outputs_test = list(y_pair_test.swapaxes(0,1))
	m_test = X_pair_test.shape[0]
	s0_test = np.zeros((m_test, rnn_dim))

	score = eval_softmax_model.evaluate([X_pair_test, y_pair_test_in, s0_test, y_pair_test], outputs_test, batch_size=64)
	print("average loss: %s"%str(score[0]/(decoder_length+1)))
	sys.stdout.flush()
	print("all time steps loss: %s"%score)
	sys.stdout.flush()

	avg_loss = score[0]/(decoder_length+1)
	perplex = np.exp(avg_loss)

	print("average perplexity score: %s"%perplex)
	sys.stdout.flush()
	print("all time steps perplexity score: %s"%(np.exp(score)))
	sys.stdout.flush()

	t1 = time.time()
	print("Full softmax evaluation is done in %.3fsec" % (t1 - t0))
	sys.stdout.flush()

	"""
    '''

	Inference stage
	Model: layers from the prediction model and the decoder model
	Inference (text generation) approaches:
	1. One-best (greedy) search decoding:
	   returns the single most probable word sequence, maximizing the joint probability of the words over the decoder time steps (decoder sequence length)
	2. N-beam search decoding:
	   returns the N most probable word sequences, keeping a beam of partial hypotheses at each time step and scoring them by their joint probability over the decoder time steps
	A standalone sketch of both decoding strategies follows this example.

	'''

    decoder_model = sampled_softmax.create_decoder_model()

    # transform tokenized y_true (ground truth of keyphrases) into full sentences / keyphrases
    keyphrases_transform = TrueKeyphrases(y_test_true)
    keyphrases_transform.get_true_keyphrases()
    keyphrases_transform.get_stat_keyphrases()
    y_true = keyphrases_transform.y_true
    max_kp_num = keyphrases_transform.max_kp_num
    mean_kp_num = keyphrases_transform.mean_kp_num
    std_kp_num = keyphrases_transform.std_kp_num

    print("Maximum number of key phrases per document in corpus: %s" %
          max_kp_num)
    sys.stdout.flush()
    print("Average number of key phrases per document in corpus: %s" %
          mean_kp_num)
    sys.stdout.flush()
    print(
        "Standard Deviation of number of key phrases per document in corpus: %s"
        % std_kp_num)
    sys.stdout.flush()

    # round up function for computing beam width
    def roundup(x):
        return x if x % 5 == 0 else x + 5 - x % 5
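    # e.g. roundup(7) -> 10, roundup(10) -> 10, roundup(11) -> 15 (illustrative values); the beam width
    # below covers roughly mean + 3 standard deviations of keyphrases per document, rounded up to a
    # multiple of 5.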

    beam_width = int(roundup(mean_kp_num + (3 * std_kp_num)))
    print("\nBeam width: %s\n" % beam_width)
    sys.stdout.flush()
    num_hypotheses = beam_width

    y_dummy_test = np.zeros((len(X_test), decoder_length + 1, 1))
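    # assumption: the zero array above provides dummy decoder targets; inference needs no ground-truth
    # labels, but the prediction model's inputs appear to expect a (batch, decoder_length + 1, 1) tensor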

    print(str(datetime.now()))
    sys.stdout.flush()

    t0 = time.time()
    print("Start decoding...")
    sys.stdout.flush()

    inference_mode = Decoding(encoder_model=encoder_model,
                              decoder_model=decoder_model,
                              indices_words=indices_words,
                              words_indices=words_indices,
                              enc_in_seq=None,
                              labels=None,
                              decoder_length=decoder_length,
                              rnn_dim=rnn_dim,
                              beam_width=beam_width,
                              num_hypotheses=num_hypotheses,
                              filepath=decode_path,
                              filename=file_name)

    t0_1 = time.time()
    print("Start beam decoding...")
    sys.stdout.flush()

    beam_keyphrases = inference_mode.beam_decoder(X_test[:500],
                                                  y_dummy_test[:500])

    beam_decode_connector = DataConnector(decode_path,
                                          'beam_kp-%s.npy' % (file_name),
                                          beam_keyphrases)
    beam_decode_connector.save_numpys()

    t1_1 = time.time()
    print("Beam decoding is done in %.3fsec" % (t1_1 - t0_1))
    sys.stdout.flush()
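# --- Standalone sketch referenced from the inference docstring above: greedy (one-best) and beam-search
# decoding over an abstract per-step scorer. `step_log_probs`, the token ids, and the beam width are
# illustrative assumptions; this is not the repository's Decoding class.
import numpy as np

def greedy_decode(step_log_probs, bos_id, eos_id, max_len):
	# keep only the single most probable token at every decoder time step
	seq = [bos_id]
	for _ in range(max_len):
		next_id = int(np.argmax(step_log_probs(seq)))
		seq.append(next_id)
		if next_id == eos_id:
			break
	return seq

def beam_decode(step_log_probs, bos_id, eos_id, max_len, beam_width):
	# keep the beam_width partial sequences with the highest joint (summed) log-probability
	beams = [([bos_id], 0.0)]
	for _ in range(max_len):
		candidates = []
		for seq, score in beams:
			if seq[-1] == eos_id:              # finished hypotheses are carried over unchanged
				candidates.append((seq, score))
				continue
			log_probs = step_log_probs(seq)
			for tok in np.argsort(log_probs)[-beam_width:]:
				candidates.append((seq + [int(tok)], score + float(log_probs[tok])))
		beams = sorted(candidates, key=lambda c: c[1], reverse=True)[:beam_width]
	return beams

if __name__ == '__main__':
	rng = np.random.default_rng(1)
	def toy_step(seq):                         # toy stand-in for one step of the decoder model
		logits = rng.normal(size=20)
		return logits - np.log(np.exp(logits).sum())
	print(greedy_decode(toy_step, bos_id=1, eos_id=2, max_len=5))
	print(beam_decode(toy_step, bos_id=1, eos_id=2, max_len=5, beam_width=3)[0])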
Exemple #10
0
def decoder(params):

	data_path = params['data_path']
	glove_embed = params['glove_embedding']
	oov_embed = params['oov_embedding']
	preprocessed_data = params['preprocessed_data']
	decode_path = params['decode_path']
	model_path = params['model_path']
	result_path = params['result_path']
	file_name = params['file_name']
	weights = params['weights']

	encoder_length = params['encoder_length']
	decoder_length = params['decoder_length']
	max_sents = params['max_sents']
	embedding_dim = params['embedding_dim']
	birnn_dim = params['birnn_dim']
	rnn_dim = params['rnn_dim']
	vocab_size = params['vocab_size']
	batch_size = params['batch_size']
	epoch = params['epoch']
	

	'''
	Reading vocabulary dictionaries

	'''
	indices_words_connector = DataConnector(preprocessed_data, 'all_idxword_vocabulary_sent_fsoftmax.pkl', data=None)
	indices_words_connector.read_pickle()
	indices_words = indices_words_connector.read_file

	words_indices_connector = DataConnector(preprocessed_data, 'all_wordidx_vocabulary_sent_fsoftmax.pkl', data=None)
	words_indices_connector.read_pickle()
	words_indices = words_indices_connector.read_file

	y_test_true_connector = DataConnector(data_path, 'test_sent_output_tokens.npy', data=None)
	y_test_true_connector.read_numpys()
	y_test_true = y_test_true_connector.read_file


	# non-paired data set


	X_test_connector = DataConnector(preprocessed_data, 'X_test_pad_sent_fsoftmax.npy', data=None)
	X_test_connector.read_numpys()
	X_test = X_test_connector.read_file

	


	'''
	Decoder model for inference stage
	Return: generated keyphrases
	'''

	glove_embedding_conn = DataConnector(preprocessed_data, glove_embed, data=None)
	glove_embedding_conn.read_pickle()
	pretrained_embedding = glove_embedding_conn.read_file

	print("pretrained_embedding shape: %s"%str(pretrained_embedding.shape))
	print("pretrained_embedding [0][:10]: %s"%str(pretrained_embedding[0,:10]))
	print("pretrained_embedding [1][:10]: %s"%str(pretrained_embedding[1,:10]))

	oov_embedding_conn = DataConnector(preprocessed_data, oov_embed, data=None)
	oov_embedding_conn.read_pickle()
	oov_embedding = oov_embedding_conn.read_file

	print("oov_embedding shape: %s"%str(oov_embedding.shape))
	print("oov_embedding [0][:10]: %s"%str(oov_embedding[0,:10]))
	print("oov_embedding [1][:10]: %s"%str(oov_embedding[1,:10]))
	print("oov_embedding [2][:10]: %s"%str(oov_embedding[2,:10]))

	full_softmax = HierarchyFullSoftmax(encoder_length=encoder_length, decoder_length=decoder_length, max_sents=max_sents, embedding_dim=embedding_dim, birnn_dim=birnn_dim, rnn_dim=rnn_dim, vocab_size=vocab_size, filepath=result_path, filename=file_name, batch_train_iter=None, batch_val_iter=None, batch_size=None, steps_epoch=None, val_steps=None, epochs=None)

	# skeleton of model architecture
	full_softmax.train_hier_seq2seq(pretrained_embedding, oov_embedding)
	encoder_model = full_softmax.encoder_model

	predict_softmax_model = full_softmax.predict_seq2seq(weights)
	
	decoder_model = full_softmax.create_decoder_model()

	
	# transform tokenized y_true (ground truth of keyphrases) into full sentences / keyphrases
	keyphrases_transform =  TrueKeyphrases(y_test_true)
	keyphrases_transform.get_true_keyphrases()
	keyphrases_transform.get_stat_keyphrases()
	y_true = keyphrases_transform.y_true
	max_kp_num = keyphrases_transform.max_kp_num
	mean_kp_num = keyphrases_transform.mean_kp_num
	std_kp_num = keyphrases_transform.std_kp_num

	print("Maximum number of key phrases per document in corpus: %s" %max_kp_num)
	sys.stdout.flush()
	print("Average number of key phrases per document in corpus: %s" %mean_kp_num)
	sys.stdout.flush()
	print("Standard Deviation of number of key phrases per document in corpus: %s" %std_kp_num)
	sys.stdout.flush()

	# round up function for computing beam width 
	def roundup(x):
		return x if x % 5 == 0 else x + 5 - x % 5

	beam_width = int(roundup(mean_kp_num + (3 * std_kp_num)))
	print("\nBeam width: %s\n" %beam_width)
	sys.stdout.flush()
	num_hypotheses = beam_width

	print(str(datetime.now()))
	sys.stdout.flush()

	t0 = time.time()
	print("Start decoding...")
	sys.stdout.flush()

	inference_mode = DecodingSoftmax(encoder_model=encoder_model, decoder_model=decoder_model, indices_words=indices_words, words_indices=words_indices, enc_in_seq=None, decoder_length=decoder_length, rnn_dim=rnn_dim, beam_width=beam_width, num_hypotheses=num_hypotheses, filepath=decode_path, filename=file_name)

	
	t0_1 = time.time()
	print("Start beam decoding...")
	sys.stdout.flush()

	beam_keyphrases = inference_mode.beam_decoder(X_test[:500])
	
	beam_decode_connector = DataConnector(decode_path, 'beam_kp-%s.npy'%(file_name), beam_keyphrases)
	beam_decode_connector.save_numpys()

	t1_1 = time.time()
	print("Beam decoding is done in %.3fsec" % (t1_1 - t0_1))
	sys.stdout.flush()
Exemple #11
0
def decoder(params):

	data_path = params['data_path']
	preprocessed_v2 = params['preprocessed_v2']
	preprocessed_data = params['preprocessed_data']
	decode_path = params['decode_path']
	model_path = params['model_path']
	result_path = params['result_path']
	result_kp20k = params['result_kp20k']
	file_name = params['file_name']
	weights = params['weights']

	encoder_length = params['encoder_length']
	decoder_length = params['decoder_length']
	embedding_dim = params['embedding_dim']
	birnn_dim = params['birnn_dim']
	rnn_dim = params['rnn_dim']
	vocab_size = params['vocab_size']
	batch_size = params['batch_size']
	epoch = params['epoch']

	'''
	Reading vocabulary dictionaries

	'''
	indices_words_connector = DataConnector(preprocessed_v2, 'all_indices_words_fsoftmax.pkl', data=None)
	indices_words_connector.read_pickle()
	indices_words = indices_words_connector.read_file

	words_indices_connector = DataConnector(preprocessed_v2, 'all_words_indices_fsoftmax.pkl', data=None)
	words_indices_connector.read_pickle()
	words_indices = words_indices_connector.read_file

	## merge all splits (train / valid / test) into one evaluation set for the trained model

	train_outputs_conn = DataConnector(data_path, 'train_output_tokens.npy', data=None)
	train_outputs_conn.read_numpys()
	train_outputs = train_outputs_conn.read_file

	valid_outputs_conn = DataConnector(data_path, 'val_output_tokens.npy', data=None)
	valid_outputs_conn.read_numpys()
	valid_outputs = valid_outputs_conn.read_file

	test_outputs_conn = DataConnector(data_path, 'test_output_tokens.npy', data=None)
	test_outputs_conn.read_numpys()
	test_outputs = test_outputs_conn.read_file

	y_test_true = np.concatenate((train_outputs, valid_outputs, test_outputs))

	print("Ground truth of keyphrases shape: %s"%str(y_test_true.shape)) # input for encoder
	sys.stdout.flush()


	# non-paired data set

	X_train_connector = DataConnector(preprocessed_data, 'X_train_pad_fsoftmax.npy', data=None)
	X_train_connector.read_numpys()
	X_train = X_train_connector.read_file

	
	X_valid_connector = DataConnector(preprocessed_data, 'X_valid_pad_fsoftmax.npy', data=None)
	X_valid_connector.read_numpys()
	X_valid = X_valid_connector.read_file

	
	X_test_connector = DataConnector(preprocessed_data, 'X_test_pad_fsoftmax.npy', data=None)
	X_test_connector.read_numpys()
	X_test = X_test_connector.read_file


	X_in = np.concatenate((X_train, X_valid, X_test))
	
	print("\n Non-paired test set: \n")
	sys.stdout.flush()
	print("X (input for encoder) shape: %s"%str(X_in.shape)) # input for encoder
	sys.stdout.flush()

	full_softmax = FullSoftmax(encoder_length=encoder_length, decoder_length=decoder_length, embedding_dim=embedding_dim, birnn_dim=birnn_dim, rnn_dim=rnn_dim, vocab_size=vocab_size, filepath=result_kp20k, filename=file_name, batch_train_iter=None, batch_val_iter=None, batch_size=None, steps_epoch=None, val_steps=None, epochs=None)

	full_softmax.train_seq2seq()
	predict_softmax_model = full_softmax.predict_seq2seq(weights)
	encoder_model = full_softmax.encoder_model
	

	decoder_model = full_softmax.create_decoder_model()
	
	# transform tokenized y_true (ground truth of keyphrases) into full sentences / keyphrases
	keyphrases_transform =  TrueKeyphrases(y_test_true)
	keyphrases_transform.get_true_keyphrases()
	keyphrases_transform.get_stat_keyphrases()
	y_true = keyphrases_transform.y_true
	max_kp_num = keyphrases_transform.max_kp_num
	mean_kp_num = keyphrases_transform.mean_kp_num
	std_kp_num = keyphrases_transform.std_kp_num

	print("Maximum number of key phrases per document in corpus: %s" %max_kp_num)
	sys.stdout.flush()
	print("Average number of key phrases per document in corpus: %s" %mean_kp_num)
	sys.stdout.flush()
	print("Standard Deviation of number of key phrases per document in corpus: %s" %std_kp_num)
	sys.stdout.flush()

	# round up function for computing beam width 
	def roundup(x):
		return x if x % 5 == 0 else x + 5 - x % 5

	beam_width = int(roundup(mean_kp_num + (3 * std_kp_num)))
	num_hypotheses = beam_width
	print("\nBeam width: %s\n" %beam_width)
	sys.stdout.flush()


	inference_mode = DecodingSoftmax(encoder_model=encoder_model, decoder_model=decoder_model, indices_words=indices_words, words_indices=words_indices, enc_in_seq=None, decoder_length=decoder_length, rnn_dim=rnn_dim, beam_width=beam_width, num_hypotheses=num_hypotheses, filepath=decode_path, filename=file_name)


	t0_1 = time.time()
	print("Start beam decoding...")
	sys.stdout.flush()

	beam_keyphrases = inference_mode.beam_decoder(X_in[:500])
	
	beam_decode_connector = DataConnector(decode_path, 'beam_kp-%s.npy'%(file_name), beam_keyphrases)
	beam_decode_connector.save_numpys()

	t1_1 = time.time()
	print("Beam decoding is done in %.3fsec" % (t1_1 - t0_1))
	sys.stdout.flush()
def trainer(params):

	try:

		data_path = params['data_path']
		preprocessed_data = params['preprocessed_data']
		glove_path = params['glove_path']
		#glove_name = params['glove_name']
		glove_embed = params['glove_embedding']
		oov_embed = params['oov_embedding']
		model_path = params['model_path']
		result_path = params['result_path']
		file_name = params['file_name']

		encoder_length = params['encoder_length']
		decoder_length = params['decoder_length']
		embedding_dim = params['embedding_dim']
		birnn_dim = params['birnn_dim']
		rnn_dim = params['rnn_dim']
		vocab_size = params['vocab_size']
		num_samples= params['num_samples']
		batch_size = params['batch_size']
		epoch = params['epoch']

		'''
		Reading vocabulary dictionaries

		'''
		indices_words_connector = DataConnector(preprocessed_data, 'all_idxword_vocabulary.pkl', data=None)
		indices_words_connector.read_pickle()
		indices_words = indices_words_connector.read_file

		words_indices_connector = DataConnector(preprocessed_data, 'all_wordidx_vocabulary.pkl', data=None)
		words_indices_connector.read_pickle()
		words_indices = words_indices_connector.read_file


		'''
		Reading X, y pair data set for training and validating model

		'''
		# 1. training set

		X_train_connector = DataConnector(preprocessed_data, 'x_pair_train.npy', data=None)
		X_train_connector.read_numpys()
		X_train = X_train_connector.read_file

		y_train_in_connector = DataConnector(preprocessed_data, 'y_pair_train_in.npy', data=None)
		y_train_in_connector.read_numpys()
		y_train_in = y_train_in_connector.read_file

		y_train_out_connector = DataConnector(preprocessed_data, 'y_pair_train_out.npy', data=None)
		y_train_out_connector.read_numpys()
		y_train_out = y_train_out_connector.read_file

		print("\n X,y pair of training set: \n")
		sys.stdout.flush()
		print("X (input for encoder) shape: %s"%str(X_train.shape)) # input for encoder
		sys.stdout.flush()
		print("y_in (input for decoder) shape: %s"%str(y_train_in.shape)) # input for decoder
		sys.stdout.flush()
		print("y_out (output for decoder) shape: %s\n\n"%str(y_train_out.shape)) # output for decoder
		sys.stdout.flush()

		# 2. validation set

		# pair data set

		X_valid_pair_connector = DataConnector(preprocessed_data, 'x_pair_valid.npy', data=None)
		X_valid_pair_connector.read_numpys()
		X_valid_pair = X_valid_pair_connector.read_file

		y_valid_in_pair_connector = DataConnector(preprocessed_data, 'y_pair_valid_in.npy', data=None)
		y_valid_in_pair_connector.read_numpys()
		y_valid_in_pair = y_valid_in_pair_connector.read_file

		y_valid_out_pair_connector = DataConnector(preprocessed_data, 'y_pair_valid_out.npy', data=None)
		y_valid_out_pair_connector.read_numpys()
		y_valid_out_pair = y_valid_out_pair_connector.read_file

		print("\n X, y pair of validation set: \n")
		sys.stdout.flush()
		print("X (input for encoder) shape: %s"%str(X_valid_pair.shape)) # input for encoder
		sys.stdout.flush()
		print("y_in (input for decoder) shape: %s"%str(y_valid_in_pair.shape)) # input for decoder
		sys.stdout.flush()
		print("y_out (output for decoder) shape: %s\n\n"%str(y_valid_out_pair.shape)) # output for decoder
		sys.stdout.flush()

		# non-pair data set

		X_valid_connector = DataConnector(preprocessed_data, 'X_valid_pad.npy', data=None)
		X_valid_connector.read_numpys()
		X_valid = X_valid_connector.read_file

		y_valid_in_connector = DataConnector(preprocessed_data, 'y_valid_in.npy', data=None)
		y_valid_in_connector.read_numpys()
		y_valid_in = y_valid_in_connector.read_file

		y_valid_out_connector = DataConnector(preprocessed_data, 'y_valid_out.npy', data=None)
		y_valid_out_connector.read_numpys()
		y_valid_out = y_valid_out_connector.read_file

		print("\n Non-paired validation set: \n")
		sys.stdout.flush()
		print("X (input for encoder) shape: %s"%str(X_valid.shape)) # input for encoder
		sys.stdout.flush()
		print("y_in (input for decoder) shape: %s"%str(y_valid_in.shape)) # input for decoder
		sys.stdout.flush()
		print("y_out (output for decoder) shape: %s\n\n"%str(y_valid_out.shape)) # output for decoder
		sys.stdout.flush()


		

		steps_epoch = len(X_train) // batch_size
		batch_train_iter = Dataiterator(X_train, y_train_in, y_train_out, decoder_dim=rnn_dim, batch_size=batch_size)

		val_steps = len(X_valid_pair) // batch_size
		batch_val_iter = Dataiterator(X_valid_pair, y_valid_in_pair, y_valid_out_pair, decoder_dim=rnn_dim, batch_size=batch_size)
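		# steps_epoch / val_steps are the number of mini-batches per epoch; e.g. 10,000 training pairs
		# with batch_size = 32 give 312 generator steps per epoch (integer division).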


	except:
		raise

	'''
	1. Initiate model for training Seq2Seq with sampled softmax layer
	2. Compile with sampled softmax training loss, as an underestimate of full softmax loss
	3. Train with per-batch samples

	'''

	glove_embedding_conn = DataConnector(preprocessed_data, glove_embed, data=None)
	glove_embedding_conn.read_pickle()
	pretrained_embedding = glove_embedding_conn.read_file

	print("pretrained_embedding shape: %s"%str(pretrained_embedding.shape))
	print("pretrained_embedding [0][:10]: %s"%str(pretrained_embedding[0,:10]))
	print("pretrained_embedding [1][:10]: %s"%str(pretrained_embedding[1,:10]))

	oov_embedding_conn = DataConnector(preprocessed_data, oov_embed, data=None)
	oov_embedding_conn.read_pickle()
	oov_embedding = oov_embedding_conn.read_file

	print("oov_embedding shape: %s"%str(oov_embedding.shape))
	print("oov_embedding [0][:10]: %s"%str(oov_embedding[0,:10]))
	print("oov_embedding [1][:10]: %s"%str(oov_embedding[1,:10]))
	print("oov_embedding [2][:10]: %s"%str(oov_embedding[2,:10]))

	sampled_softmax = AttentionSampledSoftmax(encoder_length, decoder_length, embedding_dim, birnn_dim, rnn_dim, vocab_size, num_samples, result_path, file_name, batch_train_iter, batch_val_iter, batch_size, steps_epoch, val_steps, epoch)

	'''
	Train model with sampled softmax layer 
	Return: LOSS in training stage (an underestimate of full softmax)
	'''

	print(str(datetime.now()))
	sys.stdout.flush()

	sampled_softmax.train_att_sampled_softmax(pretrained_embedding, oov_embedding)
	sampled_softmax.compile_()

	print(str(datetime.now()))
	sys.stdout.flush()

	t0 = time.time()
	print("Training model with approximate softmax...")
	sys.stdout.flush()

	sampled_softmax.train_()

	t1 = time.time()
	print("training is done in %.3fsec" % (t1 - t0))
	sys.stdout.flush()

	sampled_softmax.plot_()
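# --- Toy numpy illustration (an assumption-level sketch, not the repository's loss code) of why the
# sampled-softmax training loss is an underestimate of the full-softmax cross-entropy: normalizing the
# logits over a small sampled subset of classes that contains the true class can only make the true
# class look more probable, so the per-token loss can only decrease.
import numpy as np

rng = np.random.default_rng(0)
vocab_size, num_samples, true_idx = 10000, 256, 42
logits = rng.normal(size=vocab_size)

def cross_entropy(logit_subset, target_pos):
	# numerically stable softmax cross-entropy for a single token
	probs = np.exp(logit_subset - logit_subset.max())
	probs /= probs.sum()
	return -np.log(probs[target_pos])

full_loss = cross_entropy(logits, true_idx)

# draw negative classes uniformly and keep the true class at position 0 of the subset
negatives = rng.choice(np.delete(np.arange(vocab_size), true_idx), size=num_samples, replace=False)
sampled_loss = cross_entropy(logits[np.concatenate(([true_idx], negatives))], 0)

print("full softmax loss:    %.4f" % full_loss)
print("sampled softmax loss: %.4f (never larger than the full loss)" % sampled_loss)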
Exemple #13
0
def transform_sent_all(params):

	print("\n=========\n")

	print(str(datetime.now()))
	sys.stdout.flush()

	t0 = time.time()

	print("Transforming all data set into integer sequences")
	sys.stdout.flush()

	data_path = params['data_path']
	kp20k_path = params['kp20k_path']
	max_sents= params['max_sents']

	encoder_length = params['encoder_length']
	decoder_length = params['decoder_length']

	'''
	read stored vocabulary index
	'''

	vocab = DataConnector(kp20k_path, 'all_indices_words_sent_r3.pkl', data=None)
	vocab.read_pickle()
	indices_words = vocab.read_file

	reversed_vocab = DataConnector(kp20k_path, 'all_words_indices_sent_r3.pkl', data=None)
	reversed_vocab.read_pickle()
	words_indices = reversed_vocab.read_file

	'''
	read tokenized data set
	'''

	in_connector = DataConnector(data_path, 'input_sent_tokens.npy', data=None)
	in_connector.read_numpys()
	input_tokens = in_connector.read_file
	out_connector = DataConnector(data_path, 'output_sent_tokens.npy', data=None)
	out_connector.read_numpys()
	output_tokens = out_connector.read_file

	
	'''
	transforming texts into integer sequences
	'''

	
	sequences_processing = SequenceProcessing(indices_words, words_indices, encoder_length, decoder_length)
	x_in = sequences_processing.in_sents_to_integers(in_texts=input_tokens, max_sents=max_sents)
	x_in_pad = sequences_processing.pad_sequences_sent_in(max_len=encoder_length, max_sents=max_sents,sequences=x_in)
	y_in, y_out = sequences_processing.outtexts_to_integers(out_texts=output_tokens)


	x_in_connector = DataConnector(data_path, 'X_sent_r3.npy', x_in)
	x_in_connector.save_numpys()
	x_in_pad_connector = DataConnector(data_path, 'X_sent_pad_r3.npy', x_in_pad)
	x_in_pad_connector.save_numpys()
	y_in_connector = DataConnector(data_path, 'y_sent_in_r3.npy', y_in)
	y_in_connector.save_numpys()
	y_out_connector = DataConnector(data_path, 'y_sent_out_r3.npy', y_out)
	y_out_connector.save_numpys()

	t1 = time.time()
	print("Transforming data set into integer sequences of inputs - outputs done in %.3fsec" % (t1 - t0))
	sys.stdout.flush()
Exemple #14
0
def transform_v2_fsoftmax(params):

	print("\n=========\n")

	print(str(datetime.now()))
	sys.stdout.flush()

	t0 = time.time()

	print("Transforming all data set into integer sequences")
	sys.stdout.flush()

	data_path = params['data_path']
	preprocessed_data = params['preprocessed_data']
	preprocessed_v2 = params['preprocessed_v2']

	encoder_length = params['encoder_length']
	decoder_length = params['decoder_length']

	'''
	read stored vocabulary index
	'''

	vocab = DataConnector(preprocessed_v2, 'all_idxword_vocabulary_fsoftmax.pkl', data=None)
	vocab.read_pickle()
	indices_words = vocab.read_file

	reversed_vocab = DataConnector(preprocessed_v2, 'all_wordidx_vocabulary_fsoftmax.pkl', data=None)
	reversed_vocab.read_pickle()
	words_indices = reversed_vocab.read_file

	'''
	read tokenized data set
	'''

	in_connector = DataConnector(data_path, 'input_tokens.npy', data=None)
	in_connector.read_numpys()
	input_tokens = in_connector.read_file
	out_connector = DataConnector(data_path, 'output_tokens.npy', data=None)
	out_connector.read_numpys()
	output_tokens = out_connector.read_file

	
	'''
	transforming texts into integer sequences
	'''

	
	sequences_processing = SequenceProcessing(indices_words, words_indices, encoder_length, decoder_length)
	x_in = sequences_processing.intexts_to_integers(input_tokens)
	x_in_pad = sequences_processing.pad_sequences_in(encoder_length, x_in)
	y_in, y_out = sequences_processing.outtexts_to_integers(output_tokens)


	x_in_connector = DataConnector(preprocessed_data, 'X_fsoftmax.npy', x_in)
	x_in_connector.save_numpys()
	x_in_pad_connector = DataConnector(preprocessed_data, 'X_pad_fsoftmax.npy', x_in_pad)
	x_in_pad_connector.save_numpys()
	y_in_connector = DataConnector(preprocessed_data, 'y_in_fsoftmax.npy', y_in)
	y_in_connector.save_numpys()
	y_out_connector = DataConnector(preprocessed_data, 'y_out_fsoftmax.npy', y_out)
	y_out_connector.save_numpys()

	t1 = time.time()
	print("Transforming data set into integer sequences of inputs - outputs done in %.3fsec" % (t1 - t0))
	sys.stdout.flush()
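# --- Minimal sketch of the tokens -> integer sequences -> padded matrix step above. The toy vocabulary,
# the <pad>/<unk> indices, and the helper names are illustrative assumptions, not the SequenceProcessing
# implementation itself.
import numpy as np

toy_words_indices = {'<pad>': 0, '<unk>': 1, 'neural': 2, 'keyphrase': 3, 'generation': 4}

def texts_to_integers(token_lists, words_indices):
	# map each token to its vocabulary index, falling back to <unk> for out-of-vocabulary words
	return [[words_indices.get(tok, words_indices['<unk>']) for tok in doc] for doc in token_lists]

def pad_to_length(max_len, sequences, pad_idx=0):
	# truncate / right-pad every sequence to max_len so the encoder input becomes a dense matrix
	padded = np.full((len(sequences), max_len), pad_idx, dtype=np.int32)
	for i, seq in enumerate(sequences):
		trimmed = seq[:max_len]
		padded[i, :len(trimmed)] = trimmed
	return padded

toy_docs = [['neural', 'keyphrase', 'generation'], ['keyphrase', 'extraction']]
print(pad_to_length(4, texts_to_integers(toy_docs, toy_words_indices)))
# -> [[2 3 4 0]
#     [3 1 0 0]]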
Exemple #15
0
def preprocessing_(params):

	data_path = params['data_path']

	print("\n=========\n")
	sys.stdout.flush()

	print(str(datetime.now()))
	sys.stdout.flush()

	t0 = time.time()
	print("Reading raw test data...")
	sys.stdout.flush()

	# this data set consists of:
	# title, abstract, main text, and a list of keyphrases for each scientific article
	# we use title + abstract as the model input

	data_connector = DataConnector(data_path, 'krapivin_doc_keyphrases.pkl', data=None)
	data_connector.read_pickle()
	data = data_connector.read_file

	in_text = []
	out_keyphrases = []

	for k,v in data.items():
		title = v[0]
		abstract = v[1]
		text = title + " . " + abstract
		kps = v[3]

		in_text.append(text)
		out_keyphrases.append(kps)

	print("\nnumber of examples in raw data inputs: %s\n"%(len(in_text)))
	sys.stdout.flush()
	print("\nnumber of examples in raw data outputs: %s\n"%(len(out_keyphrases)))
	sys.stdout.flush()

	print("\n in_text[0]: %s\n"%(in_text[0]))
	sys.stdout.flush()
	print("\n out_keyphrases[0]: %s\n"%(out_keyphrases[0]))
	sys.stdout.flush()

	prep = Preprocessing()
	prep_inputs = prep.preprocess_in(in_text)
	prep_outputs = prep.preprocess_out(out_keyphrases)
	input_tokens = prep.tokenize_in(prep_inputs)
	output_tokens = prep.tokenize_out(prep_outputs)
	all_tokens = prep.get_all_tokens(input_tokens, output_tokens)

	
	# statistics of the full data set, before splitting into training and test sets
	print("\nnumber of examples in preprocessed data inputs: %s\n"%(len(input_tokens)))
	sys.stdout.flush()

	print("\nnumber of examples in preprocessed data outputs: %s\n"%(len(output_tokens)))
	sys.stdout.flush()

	print("\n input_tokens[0]: %s\n"%(input_tokens[0]))
	sys.stdout.flush()
	print("\n output_tokens[0]: %s\n"%(output_tokens[0]))
	sys.stdout.flush()

	in_connector = DataConnector(data_path, 'input_tokens.npy', input_tokens)
	in_connector.save_numpys()
	out_connector = DataConnector(data_path, 'output_tokens.npy', output_tokens)
	out_connector.save_numpys()
	tokens_connector = DataConnector(data_path, 'all_tokens.npy', all_tokens)
	tokens_connector.save_numpys()

	# splitting into training and test set
	n_train = int(0.8 * len(input_tokens))
	in_train = input_tokens[:n_train]
	out_train = output_tokens[:n_train]
	in_test = input_tokens[n_train:len(input_tokens)]
	out_test = output_tokens[n_train:len(input_tokens)]
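	# 80/20 split kept in corpus order; e.g. 2,000 documents would yield 1,600 training and 400 test examples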

	print("\nnumber of examples in training set: %s\n"%(len(in_train)))
	sys.stdout.flush()
	print("\nnumber of examples in test set: %s\n"%(len(in_test)))
	sys.stdout.flush()

	in_train_connector = DataConnector(data_path, 'in_train.npy', in_train)
	in_train_connector.save_numpys()
	out_train_connector = DataConnector(data_path, 'out_train.npy', out_train)
	out_train_connector.save_numpys()
	in_test_connector = DataConnector(data_path, 'in_test.npy', in_test)
	in_test_connector.save_numpys()
	out_test_connector = DataConnector(data_path, 'out_test.npy', out_test)
	out_test_connector.save_numpys()

	t1 = time.time()
	print("Reading raw training data done in %.3fsec" % (t1 - t0))
	sys.stdout.flush()