def dev_step(sess, model, target_loss_weight):
    """Evaluate the model on the validation set and report ranking metrics."""
    results = defaultdict(list)
    num_test = 0
    num_correct = 0.0
    test_batches = data_helpers.batch_iter(valid_dataset, FLAGS.batch_size, 1,
                                           target_loss_weight,
                                           FLAGS.max_sequence_length,
                                           charVocab, FLAGS.max_word_length,
                                           shuffle=True)
    for test_batch in test_batches:
        x_question, x_answer, x_question_len, x_answer_len, x_target, \
            x_target_weight, id_pairs, x_q_char, x_q_len, x_a_char, x_a_len = test_batch
        feed_dict = {
            model.question: x_question,
            model.answer: x_answer,
            model.question_len: x_question_len,
            model.answer_len: x_answer_len,
            model.target: x_target,
            model.target_loss_weight: x_target_weight,
            model.dropout_keep_prob: 1.0,  # no dropout at evaluation time
            model.q_charVec: x_q_char,
            model.q_charLen: x_q_len,
            model.a_charVec: x_a_char,
            model.a_charLen: x_a_len
        }
        batch_accuracy, predicted_prob = sess.run(
            [model.accuracy, model.probs], feed_dict)

        num_test += len(predicted_prob)
        if num_test % 1000 == 0:
            print(num_test)
        num_correct += len(predicted_prob) * batch_accuracy

        # Group predictions by question id for the ranking metrics below.
        for i, prob_score in enumerate(predicted_prob):
            question_id, answer_id, label = id_pairs[i]
            results[question_id].append((answer_id, label, prob_score))

    # Calculate top-1 precision and related ranking metrics.
    print('num_test_samples: {} test_accuracy: {}'.format(
        num_test, num_correct / num_test))
    accu, precision, recall, f1, loss = metrics.classification_metrics(results)
    print('Accuracy: {}, Precision: {} Recall: {} F1: {} Loss: {}'.format(
        accu, precision, recall, f1, loss))

    mrr = metrics.mean_reciprocal_rank(results)
    top_1_precision = metrics.top_k_precision(results, k=1)
    top_2_precision = metrics.top_k_precision(results, k=2)
    top_5_precision = metrics.top_k_precision(results, k=5)
    top_10_precision = metrics.top_k_precision(results, k=10)
    total_valid_query = metrics.get_num_valid_query(results)
    print('MRR (mean reciprocal rank): {}\tTop-1 precision: {}\tNum_query: {}\n'.format(
        mrr, top_1_precision, total_valid_query))
    print('Top-2 precision: {}'.format(top_2_precision))
    print('Top-5 precision: {}'.format(top_5_precision))
    print('Top-10 precision: {}'.format(top_10_precision))

    return mrr
def dev_step():
    """Evaluate the ESIM model on the test set and report ranking metrics."""
    results = defaultdict(list)
    num_test = 0
    num_correct = 0.0
    test_batches = data_helpers.batch_iter(test_dataset, FLAGS.batch_size, 1,
                                           target_loss_weight, idf, SEQ_LEN,
                                           charVocab, FLAGS.max_word_length,
                                           shuffle=True)
    for test_batch in test_batches:
        x_question, x_answer, x_question_len, x_answer_len, x_target, \
            x_target_weight, id_pairs, extra_feature, q_feature, a_feature, \
            x_q_char, x_q_len, x_a_char, x_a_len = test_batch
        feed_dict = {
            esim.question: x_question,
            esim.answer: x_answer,
            esim.question_len: x_question_len,
            esim.answer_len: x_answer_len,
            esim.target: x_target,
            esim.target_loss_weight: x_target_weight,
            esim.dropout_keep_prob: 1.0,  # no dropout at evaluation time
            esim.extra_feature: extra_feature,
            esim.q_word_feature: q_feature,
            esim.a_word_feature: a_feature,
            esim.q_charVec: x_q_char,
            esim.q_charLen: x_q_len,
            esim.a_charVec: x_a_char,
            esim.a_charLen: x_a_len
        }
        batch_accuracy, predicted_prob = sess.run(
            [esim.accuracy, esim.probs], feed_dict)

        num_test += len(predicted_prob)
        if num_test % 1000 == 0:
            print(num_test)
        num_correct += len(predicted_prob) * batch_accuracy

        # Group predictions by question id for the ranking metrics below.
        for i, prob_score in enumerate(predicted_prob):
            question_id, answer_id, label = id_pairs[i]
            results[question_id].append((answer_id, label, prob_score))

    # Calculate top-1 precision and related ranking metrics.
    print('num_test_samples: {} test_accuracy: {}'.format(
        num_test, num_correct / num_test))
    accu, precision, recall, f1, loss = metrics.classification_metrics(results)
    print('Accuracy: {}, Precision: {} Recall: {} F1: {} Loss: {}'.format(
        accu, precision, recall, f1, loss))

    mvp = metrics.mean_average_precision(results)
    mrr = metrics.mean_reciprocal_rank(results)
    top_1_precision = metrics.top_1_precision(results)
    total_valid_query = metrics.get_num_valid_query(results)
    print('MAP (mean average precision): {}\tMRR (mean reciprocal rank): {}\t'
          'Top-1 precision: {}\tNum_query: {}'.format(
              mvp, mrr, top_1_precision, total_valid_query))

    return mrr
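# ---------------------------------------------------------------------------
# For reference, a minimal sketch (not the repo's actual metrics module) of
# what metrics.mean_reciprocal_rank and metrics.top_k_precision are assumed to
# compute over the `results` dict built above: question_id -> list of
# (answer_id, label, prob_score) tuples, where label > 0 marks a relevant
# answer and prob_score is the model's positive-class score. The _sketch_*
# helper names are illustrative only.
def _sketch_mean_reciprocal_rank(results):
    reciprocal_ranks = []
    for candidates in results.values():
        # Rank candidate answers by predicted score, highest first.
        ranked = sorted(candidates, key=lambda c: c[2], reverse=True)
        rr = 0.0
        for rank, (_, label, _) in enumerate(ranked, start=1):
            if label > 0:
                rr = 1.0 / rank
                break
        reciprocal_ranks.append(rr)
    return sum(reciprocal_ranks) / max(len(reciprocal_ranks), 1)


def _sketch_top_k_precision(results, k=1):
    # Fraction of queries whose top-k ranked candidates contain a relevant answer.
    hits = 0
    for candidates in results.values():
        ranked = sorted(candidates, key=lambda c: c[2], reverse=True)
        if any(label > 0 for _, label, _ in ranked[:k]):
            hits += 1
    return hits / max(len(results), 1)
# ---------------------------------------------------------------------------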
                q_char_len: x_q_len,
                a_char_feature: x_a_char,
                a_char_len: x_a_len
            }
            predicted_prob = sess.run(prob, feed_dict)
            num_test += len(predicted_prob)
            print('num_test_sample={}'.format(num_test))

            # Group predictions by question id for the ranking metrics below.
            for i, prob_score in enumerate(predicted_prob):
                qid, aid, label = batch_id_pairs[i]
                results[qid].append((aid, label, prob_score))

        accu, precision, recall, f1, loss = metrics.classification_metrics(results)
        print('Accuracy: {}, Precision: {} Recall: {} F1: {} Loss: {}'.format(
            accu, precision, recall, f1, loss))

        mrr = metrics.mean_reciprocal_rank(results)
        top_1_precision = metrics.top_k_precision(results, k=1)
        top_2_precision = metrics.top_k_precision(results, k=2)
        top_5_precision = metrics.top_k_precision(results, k=5)
        top_10_precision = metrics.top_k_precision(results, k=10)
        total_valid_query = metrics.get_num_valid_query(results)
        print('MRR (mean reciprocal rank): {}\tTop-1 precision: {}\tNum_query: {}'.format(
            mrr, top_1_precision, total_valid_query))
        print('Top-2 precision: {}'.format(top_2_precision))
        print('Top-5 precision: {}'.format(top_5_precision))
        print('Top-10 precision: {}'.format(top_10_precision))

        out_path = FLAGS.output_file
        print("Saving evaluation to {}".format(out_path))
        with open(out_path, 'w') as f:
            f.write("query_id\tdocument_id\tscore\trank\trelevance\n")