def test_step():
    results = defaultdict(list)
    num_test = 0
    num_correct = 0.0
    test_batches = data_helpers.batch_iter(test_dataset,
                                           FLAGS.batch_size,
                                           1,
                                           target_loss_weight,
                                           FLAGS.max_utter_len,
                                           FLAGS.max_utter_num,
                                           FLAGS.max_response_len,
                                           shuffle=False)
    for test_batch in test_batches:
        x_utterances, x_response, x_utterances_len, x_response_len, \
            x_utters_num, x_target, x_target_weight, id_pairs = test_batch
        feed_dict = {
            imn.utterances: x_utterances,
            imn.response: x_response,
            imn.utterances_len: x_utterances_len,
            imn.response_len: x_response_len,
            imn.utters_num: x_utters_num,
            imn.target: x_target,
            imn.target_loss_weight: x_target_weight,
            imn.dropout_keep_prob: 1.0
        }
        batch_accuracy, predicted_prob = sess.run([imn.accuracy, imn.probs], feed_dict)
        num_test += len(predicted_prob)
        if num_test % 1000 == 0:
            print(num_test)
        num_correct += len(predicted_prob) * batch_accuracy
        # Group predictions by query id for the ranking metrics below.
        for i, prob_score in enumerate(predicted_prob):
            question_id, response_id, label = id_pairs[i]
            results[question_id].append((response_id, label, prob_score))

    # Calculate classification and ranking metrics, including top-1 precision.
    print('num_test_samples: {} test_accuracy: {}'.format(num_test, num_correct / num_test))
    accu, precision, recall, f1, loss = metrics.classification_metrics(results)
    print('Accuracy: {}, Precision: {} Recall: {} F1: {} Loss: {}'.format(
        accu, precision, recall, f1, loss))
    mvp = metrics.mean_average_precision(results)
    mrr = metrics.mean_reciprocal_rank(results)
    top_1_precision = metrics.top_1_precision(results)
    total_valid_query = metrics.get_num_valid_query(results)
    print('MAP (mean average precision): {}\tMRR (mean reciprocal rank): {}\tTop-1 precision: {}\tNum_query: {}'.format(
        mvp, mrr, top_1_precision, total_valid_query))
    return mrr
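# The metrics module is not shown here. As a rough reference, mean reciprocal rank over
# the `results` structure used above (query_id -> [(doc_id, label, score)]) could be
# computed along the following lines; this is an illustrative sketch, not necessarily
# how metrics.mean_reciprocal_rank is implemented in this repository.
def _reference_mean_reciprocal_rank(results):
    reciprocal_ranks = []
    for _, candidates in results.items():
        # Rank candidates by predicted score, highest first.
        ranked = sorted(candidates, key=lambda rec: rec[2], reverse=True)
        rr = 0.0
        for rank, (_, label, _) in enumerate(ranked, start=1):
            if int(label) == 1:
                rr = 1.0 / rank
                break
        reciprocal_ranks.append(rr)
    return sum(reciprocal_ranks) / len(reciprocal_ranks) if reciprocal_ranks else 0.0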
def run_test(dir_path, op_name, sess, training, accuracy, prob, pair_ids, output_layer):
    results = defaultdict(list)
    num_test = 0
    num_correct = 0.0
    n_updates = 0
    mrr = 0
    t0 = time()
    try:
        # Consume the (already initialized) evaluation dataset until it is exhausted.
        while True:
            n_updates += 1
            batch_accuracy, predicted_prob, pair_ = sess.run(
                [accuracy, prob, pair_ids], feed_dict={training: False})
            question_id, answer_id, label = pair_
            num_test += len(predicted_prob)
            num_correct += len(predicted_prob) * batch_accuracy
            for i, prob_score in enumerate(predicted_prob):
                results[question_id[i]].append((answer_id[i], label[i], prob_score[0]))
            if n_updates % 2000 == 0:
                tf.logging.info("n_update %d , %s: Mins Used: %.2f" %
                                (n_updates, op_name, (time() - t0) / 60.0))
    except tf.errors.OutOfRangeError:
        # Calculate classification and ranking metrics, including top-1 precision.
        print('num_test_samples: {} test_accuracy: {}'.format(num_test, num_correct / num_test))
        accu, precision, recall, f1, loss = metrics.classification_metrics(results)
        print('Accuracy: {}, Precision: {} Recall: {} F1: {} Loss: {}'.format(
            accu, precision, recall, f1, loss))
        mvp = metrics.mean_average_precision(results)
        mrr = metrics.mean_reciprocal_rank(results)
        top_1_precision = metrics.top_1_precision(results)
        total_valid_query = metrics.get_num_valid_query(results)
        print('MAP (mean average precision): {}\tMRR (mean reciprocal rank): {}\tTop-1 precision: {}\tNum_query: {}'.format(
            mvp, mrr, top_1_precision, total_valid_query))

        # Write the ranked candidate list of every query to a TSV file.
        out_path = os.path.join(dir_path, "output_test.txt")
        print("Saving evaluation to {}".format(out_path))
        with open(out_path, 'w') as f:
            f.write("query_id\tdocument_id\tscore\trank\trelevance\n")
            for us_id, v in results.items():
                v.sort(key=operator.itemgetter(2), reverse=True)
                for i, rec in enumerate(v):
                    r_id, label, prob_score = rec
                    rank = i + 1
                    f.write('{}\t{}\t{}\t{}\t{}\n'.format(us_id, r_id, prob_score, rank, label))
    return mrr
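# For reference only: the TSV written above can be consumed offline, e.g. to compute
# recall@k per query. `_recall_at_k_from_tsv` is an illustrative helper, not part of
# this repository, and assumes the rank and relevance columns hold plain numbers.
def _recall_at_k_from_tsv(path, k=1):
    per_query = defaultdict(list)
    with open(path) as f:
        next(f)  # skip the "query_id\tdocument_id\t..." header line
        for line in f:
            query_id, _doc_id, _score, rank, relevance = line.rstrip('\n').split('\t')
            per_query[query_id].append((int(rank), int(float(relevance))))
    hits = 0
    for rows in per_query.values():
        rows.sort()  # order candidates by the rank they were written with
        hits += int(any(rel == 1 for _rank, rel in rows[:k]))
    return hits / len(per_query) if per_query else 0.0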
            response_len: x_response_len,
            utterances_num: x_utters_num,
            dropout_keep_prob: 1.0,
            u_char_feature: x_u_char,
            u_char_len: x_u_char_len,
            r_char_feature: x_r_char,
            r_char_len: x_r_char_len
        }
        predicted_prob = sess.run(prob, feed_dict)
        num_test += len(predicted_prob)
        print('num_test_sample={}'.format(num_test))
        # Group predictions by query id for the ranking metrics below.
        for i, prob_score in enumerate(predicted_prob):
            us_id, r_id, label = id_pairs[i]
            results[us_id].append((r_id, label, prob_score))

    accu, precision, recall, f1, loss = metrics.classification_metrics(results)
    print('Accuracy: {}, Precision: {} Recall: {} F1: {} Loss: {}'.format(
        accu, precision, recall, f1, loss))
    mvp = metrics.mean_average_precision(results)
    mrr = metrics.mean_reciprocal_rank(results)
    top_1_precision = metrics.top_1_precision(results)
    total_valid_query = metrics.get_num_valid_query(results)
    print('MAP (mean average precision): {}\tMRR (mean reciprocal rank): {}\tTop-1 precision: {}\tNum_query: {}'.format(
        mvp, mrr, top_1_precision, total_valid_query))

    out_path = FLAGS.output_file
    print("Saving evaluation to {}".format(out_path))
    with open(out_path, 'w') as f:
        f.write("query_id\tdocument_id\tscore\trank\trelevance\n")
def run_test(epoch_no, dir_path, op_name, sess, training, accuracy, prob, pair_ids):
    results = defaultdict(list)
    num_test = 0
    num_correct = 0.0
    n_updates = 0
    mrr = 0
    t0 = time()
    try:
        # Consume the (already initialized) evaluation dataset until it is exhausted.
        while True:
            n_updates += 1
            batch_accuracy, predicted_prob, pair_ = sess.run(
                [accuracy, prob, pair_ids], feed_dict={training: False})
            question_id, answer_id, label = pair_
            num_test += len(predicted_prob)
            num_correct += len(predicted_prob) * batch_accuracy
            for i, prob_score in enumerate(predicted_prob):
                results[question_id[i]].append((answer_id[i], label[i], prob_score[0]))
            if n_updates % 2000 == 0:
                tf.logging.info("epoch: %i n_update %d , %s: Mins Used: %.2f" %
                                (epoch_no, n_updates, op_name, (time() - t0) / 60.0))
    except tf.errors.OutOfRangeError:
        # Append a synthetic "no answer" candidate (none_id) to every query: it is
        # relevant when the query has no positive candidate, irrelevant otherwise,
        # and is scored at the decision threshold.
        threshold = 0.95
        none_id = 10000000
        print("threshold: {}".format(threshold))
        for q_id, a_list in results.items():
            correct_flag = 0
            for (a_id, label, score) in a_list:
                if int(label) == 1:
                    correct_flag = 1
            if correct_flag == 0:
                results[q_id].append((none_id, 1, threshold))
            else:
                results[q_id].append((none_id, 0, threshold))

        # Calculate classification and ranking metrics, including top-1 precision.
        print('num_test_samples: {} test_accuracy: {}'.format(num_test, num_correct / num_test))
        accu, precision, recall, f1, loss = metrics.classification_metrics(results)
        print('Accuracy: {}, Precision: {} Recall: {} F1: {} Loss: {}'.format(
            accu, precision, recall, f1, loss))
        mvp = metrics.mean_average_precision(results)
        mrr = metrics.mean_reciprocal_rank(results)
        top_1_precision = metrics.top_1_precision(results)
        total_valid_query = metrics.get_num_valid_query(results)
        print('MAP (mean average precision): {}\tMRR (mean reciprocal rank): {}\tTop-1 precision: {}\tNum_query: {}'.format(
            mvp, mrr, top_1_precision, total_valid_query))

        # Write the ranked candidate list of every query to a TSV file.
        out_path = os.path.join(dir_path, "ubuntu_output_epoch_{}.txt".format(epoch_no))
        print("Saving evaluation to {}".format(out_path))
        with open(out_path, 'w') as f:
            f.write("query_id\tdocument_id\tscore\trank\trelevance\n")
            for us_id, v in results.items():
                v.sort(key=operator.itemgetter(2), reverse=True)
                for i, rec in enumerate(v):
                    r_id, label, prob_score = rec
                    rank = i + 1
                    f.write('{}\t{}\t{}\t{}\t{}\n'.format(us_id, r_id, prob_score, rank, label))

        # Keep a checkpoint whenever the validation MRR improves.
        global best_score
        if op_name == 'valid' and mrr > best_score:
            best_score = mrr
            saver = tf.train.Saver()
            dir_path = os.path.join(dir_path, "epoch {}".format(epoch_no))
            if not os.path.exists(dir_path):
                os.makedirs(dir_path)
            saver.save(sess, dir_path)
            tf.logging.info(">> save model!")
    return mrr
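# For reference, the "no answer" augmentation above transforms `results` as follows
# (the ids and scores in this example are made up):
#   {'q1': [(3, 0, 0.40), (7, 0, 0.12)]}              # no positive candidate
#     -> {'q1': [(3, 0, 0.40), (7, 0, 0.12), (10000000, 1, 0.95)]}
#   {'q2': [(5, 1, 0.80), (9, 0, 0.33)]}              # has a positive candidate
#     -> {'q2': [(5, 1, 0.80), (9, 0, 0.33), (10000000, 0, 0.95)]}
# A model that keeps every real candidate below the threshold is therefore credited
# in top-1 precision for correctly predicting "no answer".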
def dev_step():
    results = defaultdict(list)
    num_test = 0
    num_correct = 0.0
    valid_batches = data_helpers.batch_iter(valid_dataset,
                                            FLAGS.batch_size,
                                            1,
                                            target_loss_weight,
                                            FLAGS.max_utter_len,
                                            FLAGS.max_utter_num,
                                            FLAGS.max_response_len,
                                            charVocab,
                                            FLAGS.max_word_length,
                                            shuffle=False)
    for valid_batch in valid_batches:
        x_utterances, x_response, x_utterances_len, x_response_len, x_utters_num, \
            x_responses_num, x_dist, x_target, x_target_weight, id_pairs, \
            x_u_char, x_u_char_len, x_r_char, x_r_char_len = valid_batch
        feed_dict = {
            u2u_imn.utterances: x_utterances,
            u2u_imn.response: x_response,
            u2u_imn.utterances_len: x_utterances_len,
            u2u_imn.response_len: x_response_len,
            u2u_imn.utters_num: x_utters_num,
            u2u_imn.responses_num: x_responses_num,
            u2u_imn.distance: x_dist,
            u2u_imn.target: x_target,
            u2u_imn.target_loss_weight: x_target_weight,
            u2u_imn.dropout_keep_prob: 1.0,
            u2u_imn.u_charVec: x_u_char,
            u2u_imn.u_charLen: x_u_char_len,
            u2u_imn.r_charVec: x_r_char,
            u2u_imn.r_charLen: x_r_char_len
        }
        batch_accuracy, predicted_prob = sess.run([u2u_imn.accuracy, u2u_imn.probs], feed_dict)
        num_test += len(predicted_prob)
        if num_test % 1000 == 0:
            print(num_test)
        num_correct += len(predicted_prob) * batch_accuracy
        # Group predictions by query id for the ranking metrics below.
        for i, prob_score in enumerate(predicted_prob):
            question_id, response_id, label = id_pairs[i]
            results[question_id].append((response_id, label, prob_score))

    # Calculate classification and ranking metrics, including top-1 precision.
    print('num_test_samples: {} test_accuracy: {}'.format(num_test, num_correct / num_test))
    accu, precision, recall, f1, loss = metrics.classification_metrics(results)
    print('Accuracy: {}, Precision: {} Recall: {} F1: {} Loss: {}'.format(
        accu, precision, recall, f1, loss))
    mvp = metrics.mean_average_precision(results)
    mrr = metrics.mean_reciprocal_rank(results)
    top_1_precision = metrics.top_1_precision(results)
    total_valid_query = metrics.get_num_valid_query(results)
    print('MAP (mean average precision): {}\tMRR (mean reciprocal rank): {}\tTop-1 precision: {}\tNum_query: {}'.format(
        mvp, mrr, top_1_precision, total_valid_query))

    # Collect per-query prediction scores; query ids are assumed to be "0".."N-1".
    all_preds = []
    for i in range(len(results)):
        all_preds.append([r[2] for r in results[str(i)]])
    df = pd.DataFrame(all_preds,
                      columns=['prediction_' + str(i) for i in range(len(all_preds[0]))])

    # Dump the run configuration next to the predictions.
    if not os.path.isdir(FLAGS.output_predictions_folder):
        os.makedirs(FLAGS.output_predictions_folder)
    with open(os.path.join(FLAGS.output_predictions_folder, 'config.json'), 'w') as f:
        conf = {}
        for k, v in FLAGS.__dict__['__flags'].items():
            conf[k] = v
        conf['ranker'] = "U2U"
        conf['seed'] = str(conf['random_seed'])
        args_dict = {}
        args_dict['args'] = conf
        f.write(json.dumps(args_dict, indent=4, sort_keys=True))
    df.to_csv(os.path.join(FLAGS.output_predictions_folder, "predictions.csv"), index=False)
    return mrr
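# For reference only: predictions.csv written above has one row per query and one
# 'prediction_i' column per candidate. A hypothetical downstream script could recover
# the per-query score lists like this:
#
#   import pandas as pd
#   df = pd.read_csv(os.path.join(FLAGS.output_predictions_folder, "predictions.csv"))
#   per_query_scores = df.values.tolist()   # row i -> candidate scores for query str(i)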