def dev_step():
    """Evaluate the dim model on the validation set and return the MRR."""
    results = defaultdict(list)
    num_test = 0
    num_correct = 0.0
    valid_batches = data_helpers.batch_iter(valid_dataset, FLAGS.batch_size, 1,
                                            FLAGS.max_utter_num, FLAGS.max_utter_len,
                                            FLAGS.max_response_num, FLAGS.max_response_len,
                                            FLAGS.max_persona_num, FLAGS.max_persona_len,
                                            charVocab, FLAGS.max_word_length, shuffle=True)
    for valid_batch in valid_batches:
        x_utterances, x_utterances_len, x_response, x_response_len, \
            x_utters_num, x_target, x_ids, \
            x_u_char, x_u_char_len, x_r_char, x_r_char_len, \
            x_personas, x_personas_len, x_p_char, x_p_char_len, x_personas_num = valid_batch

        feed_dict = {
            dim.utterances: x_utterances,
            dim.utterances_len: x_utterances_len,
            dim.responses: x_response,
            dim.responses_len: x_response_len,
            dim.utters_num: x_utters_num,
            dim.target: x_target,
            dim.dropout_keep_prob: 1.0,
            dim.u_charVec: x_u_char,
            dim.u_charLen: x_u_char_len,
            dim.r_charVec: x_r_char,
            dim.r_charLen: x_r_char_len,
            dim.personas: x_personas,
            dim.personas_len: x_personas_len,
            dim.p_charVec: x_p_char,
            dim.p_charLen: x_p_char_len,
            dim.personas_num: x_personas_num
        }
        batch_accuracy, predicted_prob = sess.run([dim.accuracy, dim.probs], feed_dict)

        num_test += len(predicted_prob)
        if num_test % 1000 == 0:
            print(num_test)
        num_correct += len(predicted_prob) * batch_accuracy

        # predicted_prob has shape [batch_size, max_response_num]
        for i in range(len(predicted_prob)):
            probs = predicted_prob[i]
            us_id = x_ids[i]
            label = x_target[i]
            labels = np.zeros(FLAGS.max_response_num)
            labels[label] = 1
            for r_id, prob in enumerate(probs):
                results[us_id].append((str(r_id), labels[r_id], prob))

    # calculate top-1 precision
    print('num_test_samples: {} test_accuracy: {}'.format(num_test, num_correct / num_test))
    accu, precision, recall, f1, loss = metrics.classification_metrics(results)
    print('Accuracy: {}, Precision: {} Recall: {} F1: {} Loss: {}'.format(accu, precision, recall, f1, loss))
    mvp = metrics.mean_average_precision(results)
    mrr = metrics.mean_reciprocal_rank(results)
    top_1_precision = metrics.top_1_precision(results)
    total_valid_query = metrics.get_num_valid_query(results)
    print('MAP (mean average precision): {}\tMRR (mean reciprocal rank): {}\tTop-1 precision: {}\tNum_query: {}'.format(
        mvp, mrr, top_1_precision, total_valid_query))
    return mrr
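# Illustrative sketch: `results` maps a query id to a list of
# (response_id, label, probability) tuples, and the `metrics` helpers rank
# those candidates by probability. The two self-contained functions below
# are a minimal sketch of what mean reciprocal rank and top-1 precision
# compute over that structure; they are assumptions for illustration, not
# the repository's actual `metrics` implementation.

def mrr_sketch(results):
    """Mean reciprocal rank of the first relevant candidate per query."""
    total = 0.0
    for candidates in results.values():
        ranked = sorted(candidates, key=lambda rec: rec[2], reverse=True)
        for rank, (_, label, _) in enumerate(ranked, start=1):
            if label > 0:
                total += 1.0 / rank
                break
    return total / len(results)

def top_1_precision_sketch(results):
    """Fraction of queries whose highest-scoring candidate is relevant."""
    hits = sum(1 for candidates in results.values()
               if max(candidates, key=lambda rec: rec[2])[1] > 0)
    return hits / len(results)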
def dev_step():
    """Evaluate the imn model (no persona inputs) on the validation set and return the MRR."""
    results = defaultdict(list)
    num_test = 0
    num_correct = 0.0
    valid_batches = data_helpers.batch_iter(valid_dataset, FLAGS.batch_size, 1, target_loss_weight,
                                            FLAGS.max_utter_len, FLAGS.max_utter_num,
                                            FLAGS.max_response_len, charVocab,
                                            FLAGS.max_word_length, shuffle=True)
    for valid_batch in valid_batches:
        x_utterances, x_response, x_utterances_len, x_response_len, x_utters_num, \
            x_target, x_target_weight, id_pairs, \
            x_u_char, x_u_char_len, x_r_char, x_r_char_len = valid_batch

        feed_dict = {
            imn.utterances: x_utterances,
            imn.response: x_response,
            imn.utterances_len: x_utterances_len,
            imn.response_len: x_response_len,
            imn.utters_num: x_utters_num,
            imn.target: x_target,
            imn.target_loss_weight: x_target_weight,
            imn.dropout_keep_prob: 1.0,
            imn.u_charVec: x_u_char,
            imn.u_charLen: x_u_char_len,
            imn.r_charVec: x_r_char,
            imn.r_charLen: x_r_char_len,
        }
        batch_accuracy, predicted_prob = sess.run([imn.accuracy, imn.probs], feed_dict)

        num_test += len(predicted_prob)
        if num_test % 1000 == 0:
            print(num_test)
        num_correct += len(predicted_prob) * batch_accuracy

        for i, prob_score in enumerate(predicted_prob):
            question_id, response_id, label = id_pairs[i]
            results[question_id].append((response_id, label, prob_score))

    # calculate top-1 precision
    print('num_test_samples: {} test_accuracy: {}'.format(num_test, num_correct / num_test))
    accu, precision, recall, f1, loss = metrics.classification_metrics(results)
    print('Accuracy: {}, Precision: {} Recall: {} F1: {} Loss: {}'.format(accu, precision, recall, f1, loss))
    mvp = metrics.mean_average_precision(results)
    mrr = metrics.mean_reciprocal_rank(results)
    top_1_precision = metrics.top_1_precision(results)
    total_valid_query = metrics.get_num_valid_query(results)
    print('MAP (mean average precision): {}\tMRR (mean reciprocal rank): {}\tTop-1 precision: {}\tNum_query: {}'.format(
        mvp, mrr, top_1_precision, total_valid_query))
    return mrr
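# Illustrative sketch: both dev_step() variants return the validation MRR,
# so a typical training loop evaluates every few steps and keeps the
# checkpoint with the best MRR. The helper below is a minimal, self-contained
# sketch of that pattern; `evaluate_every` and `save_checkpoint` are assumed
# names for illustration, not the training script's actual variables.

def maybe_validate_sketch(step, evaluate_every, best_mrr, dev_step, save_checkpoint):
    """Run dev_step() every `evaluate_every` steps; save when the MRR improves."""
    if step % evaluate_every == 0:
        valid_mrr = dev_step()
        if valid_mrr > best_mrr:
            best_mrr = valid_mrr
            save_checkpoint(step)
    return best_mrr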
# Evaluation epilogue: fold the batch predictions into `results`, report the
# ranking metrics, and write the ranked candidates to FLAGS.output_file.
print('num_test_sample={}'.format(num_test))
for i in range(len(predicted_prob)):
    probs = predicted_prob[i]
    us_id = x_ids[i]
    label = x_target[i]
    labels = np.zeros(FLAGS.max_response_num)
    labels[label] = 1
    for r_id, prob in enumerate(probs):
        results[us_id].append((str(r_id), labels[r_id], prob))

accu, precision, recall, f1, loss = metrics.classification_metrics(results)
print('Accuracy: {}, Precision: {} Recall: {} F1: {} Loss: {}'.format(accu, precision, recall, f1, loss))
mvp = metrics.mean_average_precision(results)
mrr = metrics.mean_reciprocal_rank(results)
top_1_precision = metrics.top_1_precision(results)
total_valid_query = metrics.get_num_valid_query(results)
print('MAP (mean average precision): {}\tMRR (mean reciprocal rank): {}\tTop-1 precision: {}\tNum_query: {}'.format(
    mvp, mrr, top_1_precision, total_valid_query))

out_path = FLAGS.output_file
print("Saving evaluation to {}".format(out_path))
with open(out_path, 'w') as f:
    f.write("query_id\tdocument_id\tscore\trank\trelevance\n")
    for us_id, v in results.items():
        # rank each query's candidates by predicted probability, best first
        v.sort(key=operator.itemgetter(2), reverse=True)
        for i, rec in enumerate(v):
            r_id, label, prob_score = rec
            rank = i + 1
            f.write('{}\t{}\t{}\t{}\t{}\n'.format(us_id, r_id, prob_score, rank, label))
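# Illustrative sketch: the file written above is a tab-separated ranking with
# one candidate per line (query_id, document_id, score, rank, relevance),
# already sorted by score within each query. The function below is a minimal
# sketch of reading that file back and recomputing top-1 precision; it is an
# assumption for illustration, not part of the evaluation script.

import csv

def top1_from_ranking_file(path):
    """Fraction of queries whose rank-1 candidate has relevance > 0."""
    rank1_relevance = {}
    with open(path) as f:
        reader = csv.DictReader(f, delimiter='\t')
        for row in reader:
            if int(row['rank']) == 1:
                rank1_relevance[row['query_id']] = float(row['relevance'])
    return sum(1 for rel in rank1_relevance.values() if rel > 0) / len(rank1_relevance)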