def test_step_report(logger, session, PadZeroBegin, max_length, test_path,
                     dropout_keep_prob, step, out_dir, char_alphabet,
                     label_alphabet, word_alphabet, word_column, label_column,
                     char_embedd_dim, max_char_per_word):
    # Evaluate a restored graph on the test set and write Viterbi-decoded
    # predictions; tensors are looked up by name in the default graph.
    graph = tf.get_default_graph()

    # Read the test data.
    logger.info("Reading data from test set...")
    word_sentences_test, _, word_index_sentences_test, label_index_sentences_test = \
        dp.read_conll_sequence_labeling(
            test_path, word_alphabet, label_alphabet, word_column, label_column)

    logger.info("Padding test text and labels ...")
    word_index_sentences_test_pad, test_seq_length = utils.padSequence(
        word_index_sentences_test, max_length, beginZero=PadZeroBegin)
    label_index_sentences_test_pad, _ = utils.padSequence(
        label_index_sentences_test, max_length, beginZero=PadZeroBegin)

    logger.info("Creating character set from test set ...")
    char_index_test, _ = dp.generate_character_data(
        word_sentences_test, char_alphabet=char_alphabet, setType="Test")

    logger.info("Padding test set ...")
    char_index_test_pad = dp.construct_padded_char(
        char_index_test, char_alphabet,
        max_sent_length=max_length, max_char_per_word=max_char_per_word)
    print(type(char_index_test_pad))  # debug: check padded input types
    print(type(word_index_sentences_test_pad))

    feed_dict = create_feed_Dict_Eval(
        graph, PadZeroBegin=PadZeroBegin, max_length=max_length,
        x_batch=word_index_sentences_test_pad, act_seq_lengths=test_seq_length,
        dropout_keep_prob=dropout_keep_prob, char_batch=char_index_test_pad)

    # Fetch the emission scores and CRF transition parameters by tensor name,
    # then decode the best label sequence per sentence and write it out.
    logit_op = graph.get_tensor_by_name('output/logits:0')
    transition_params_op = graph.get_tensor_by_name('transitions:0')
    logits, transition_params = session.run(
        [logit_op, transition_params_op], feed_dict)
    viterbi_decode(logits=logits, transition_params=transition_params,
                   seq_length=test_seq_length,
                   x_batch=word_index_sentences_test_pad,
                   word_alphabet=word_alphabet, label_alphabet=label_alphabet,
                   prefix_filename="test", beginZero=PadZeroBegin)
    return
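# Hedged usage sketch (not in the original file): test_step_report expects the
# model graph to already be loaded into the default graph. One way to get
# there with the TF 1.x API is to restore a Saver checkpoint together with its
# .meta graph definition. `checkpoint_file` and `test_kwargs` are illustrative
# names, not part of this repo.
def run_test_report_from_checkpoint(checkpoint_file, logger, **test_kwargs):
    with tf.Session() as session:
        # import_meta_graph rebuilds the graph; restore() loads the weights.
        saver = tf.train.import_meta_graph(checkpoint_file + ".meta")
        saver.restore(session, checkpoint_file)
        test_step_report(logger, session, **test_kwargs)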
def test_step(logger, session, BiLSTM, PadZeroBegin, max_length, test_path,
              dropout_keep_prob, step, out_dir, char_alphabet, label_alphabet,
              word_alphabet, word_column, label_column, char_embedd_dim,
              max_char_per_word):
    # Read the test data.
    logger.info("Reading data from test set...")
    word_sentences_test, _, word_index_sentences_test, label_index_sentences_test = \
        dp.read_conll_sequence_labeling(
            test_path, word_alphabet, label_alphabet, word_column, label_column)

    logger.info("Padding test text and labels ...")
    word_index_sentences_test_pad, test_seq_length = utils.padSequence(
        word_index_sentences_test, max_length, beginZero=PadZeroBegin)
    label_index_sentences_test_pad, _ = utils.padSequence(
        label_index_sentences_test, max_length, beginZero=PadZeroBegin)

    logger.info("Creating character set from test set ...")
    char_index_test, _ = dp.generate_character_data(
        word_sentences_test, char_alphabet=char_alphabet, setType="Test")

    logger.info("Padding test set ...")
    char_index_test_pad = dp.construct_padded_char(
        char_index_test, char_alphabet,
        max_sent_length=max_length, max_char_per_word=max_char_per_word)

    # Test summaries (currently unused):
    # test_summary_op = tf.summary.merge([loss_summary])
    # test_summary_dir = os.path.join(out_dir, "summaries", "test")
    # test_summary_writer = tf.summary.FileWriter(test_summary_dir, sess.graph)

    feed_dict = create_feed_Dict_Test(
        BiLSTM, PadZeroBegin=PadZeroBegin, max_length=max_length,
        x_batch=word_index_sentences_test_pad,
        y_batch=label_index_sentences_test_pad,
        act_seq_lengths=test_seq_length,
        dropout_keep_prob=dropout_keep_prob,
        char_batch=char_index_test_pad)

    # Fetch logits (a list of numpy.ndarray) and the CRF transition_params
    # (ndarray), plus intermediate tensors for offline inspection.
    logits, transition_params, embedded_char, embedded_words, char_pool_flat, input_x_test = \
        session.run([BiLSTM.logits, BiLSTM.transition_params, BiLSTM.W_char,
                     BiLSTM.W_word, BiLSTM.char_pool_flat, BiLSTM.input_x],
                    feed_dict)

    accuracy, accuracy_low_classes = predictAccuracyAndWrite(
        logits, transition_params, test_seq_length,
        label_index_sentences_test_pad, step, word_index_sentences_test_pad,
        word_alphabet, label_alphabet,
        prefix_filename="test", beginZero=PadZeroBegin)
    # test_summary_writer.add_summary(summaries, step)
    print("step {}, accuracy on test set {:g}, "
          "accuracy for classes except Others: {:g}".format(
              step, accuracy, accuracy_low_classes))

    # Pickle the inputs and intermediate tensors so they can be analysed offline.
    checkpoint_dir_test = os.path.abspath(os.path.join(out_dir, "checkpoints_test"))
    if not os.path.exists(checkpoint_dir_test):
        os.makedirs(checkpoint_dir_test)
    fname_data = "input_x_test_" + str(step) + ".pkl"
    fname_conv_out = "char_pool_flat_" + str(step) + ".pkl"
    fname_seqLength = "act_seq_len_" + str(step) + ".pkl"
    fname_embedded_char = "embedded_char_" + str(step) + ".pkl"
    fname_embedded_words = "embedded_words_" + str(step) + ".pkl"
    pickle.dump(input_x_test, open(os.path.join(checkpoint_dir_test, fname_data), 'wb'))
    pickle.dump(char_pool_flat, open(os.path.join(checkpoint_dir_test, fname_conv_out), 'wb'))
    pickle.dump(test_seq_length, open(os.path.join(checkpoint_dir_test, fname_seqLength), 'wb'))
    pickle.dump(embedded_char, open(os.path.join(checkpoint_dir_test, fname_embedded_char), 'wb'))
    pickle.dump(embedded_words, open(os.path.join(checkpoint_dir_test, fname_embedded_words), 'wb'))
    print("Saved test data checkpoint to {}\n".format(checkpoint_dir_test))

    return accuracy, accuracy_low_classes
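# Hedged helper sketch (not in the original file): reload the tensors that
# test_step pickles above for offline analysis. The file-name pattern mirrors
# the fname_* variables; `load_test_dump` and its signature are illustrative.
# Relies on the os/pickle imports already used in this file.
def load_test_dump(checkpoint_dir_test, step):
    names = ["input_x_test", "char_pool_flat", "act_seq_len",
             "embedded_char", "embedded_words"]
    dump = {}
    for name in names:
        path = os.path.join(checkpoint_dir_test, "%s_%s.pkl" % (name, step))
        with open(path, 'rb') as f:
            dump[name] = pickle.load(f)  # same pickle module used for dumping
    return dump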
char_index_dev, max_char_per_word_dev = dp.generate_character_data(
    word_sentences_dev, char_alphabet=char_alphabet, setType="Dev")
logger.info("character alphabet size: %d" % (char_alphabet.size() - 1))
max_char_per_word = min(dp.MAX_CHAR_PER_WORD,
                        max_char_per_word_train, max_char_per_word_dev)
logger.info("Maximum character length is %d" % max_char_per_word)

logger.info("Constructing embedding table ...")
# TODO: modify network to use this
char_embedd_table = dp.build_char_embedd_table(
    char_alphabet, char_embedd_dim=FLAGS.char_embedd_dim)

logger.info("Padding Training set ...")
char_index_train_pad = dp.construct_padded_char(
    char_index_train, char_alphabet,
    max_sent_length=max_length, max_char_per_word=max_char_per_word)
logger.info("Padding Dev set ...")
char_index_dev_pad = dp.construct_padded_char(
    char_index_dev, char_alphabet,
    max_sent_length=max_length, max_char_per_word=max_char_per_word)

# logger.info("Generating data with fine tuning...")
embedd_dict, embedd_dim, caseless = utils.load_word_embedding_dict(
    embedding, embedding_path, logger)
logger.info("Dimension of embedding is %d, Caseless: %s" % (embedd_dim, caseless))
# Create an embedding table: words from the train/dev sets that appear in
# GloVe get their GloVe vectors; all other words get random vectors.
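# Hedged sketch of the word embedding table described in the comment above
# (not in the original file). It assumes embedd_dict maps a token to a numpy
# vector of length embedd_dim, and that the alphabet exposes size() and
# iteritems() over (word, index) pairs; adjust to the real alphabet API.
def build_word_embedd_table(word_alphabet, embedd_dict, embedd_dim, caseless):
    import numpy as np  # local import so the sketch is self-contained
    scale = np.sqrt(3.0 / embedd_dim)  # a common uniform init range
    table = np.empty([word_alphabet.size(), embedd_dim], dtype=np.float32)
    table[0, :] = np.random.uniform(-scale, scale, [1, embedd_dim])  # padding row
    for word, index in word_alphabet.iteritems():
        key = word.lower() if caseless else word
        if key in embedd_dict:
            table[index, :] = embedd_dict[key]  # pretrained GloVe vector
        else:
            table[index, :] = np.random.uniform(-scale, scale, [1, embedd_dim])
    return table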