def test(test_set, model):
    print("starting testing...")
    start_time = time.time()
    model.eval()
    predictions, references = [], []
    with torch.no_grad():
        for i in range(len(test_set)):
            Y, T, data = test_set.get_candidate(i)
            Y = Y.to(device)
            T = T.to(device)
            # ranked indices into the candidate comment set
            ids = model.ranking(Y, T).data
            candidate = []
            comments = list(data['candidate'].keys())
            for idx in ids:
                candidate.append(comments[idx])
            predictions.append(candidate)
            references.append(data['candidate'])
            if i % 100 == 0:
                print(i)
    recall_1 = recall(predictions, references, 1)
    recall_5 = recall(predictions, references, 5)
    recall_10 = recall(predictions, references, 10)
    mr = mean_rank(predictions, references)
    mrr = mean_reciprocal_rank(predictions, references)
    s = "r1={}, r5={}, r10={}, mr={}, mrr={}"
    print(s.format(recall_1, recall_5, recall_10, mr, mrr))
    print("testing time:", time.time() - start_time)
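# A minimal sketch of the `mean_reciprocal_rank` helper the snippet above
# assumes: each prediction is a ranked list of candidate keys, and the gold
# item is taken (by assumption) to be the first key of each reference dict.
# This illustrates the calling convention only; it is not the original helper.

def mean_reciprocal_rank(predictions, references):
    total = 0.0
    for ranked, ref in zip(predictions, references):
        gold = next(iter(ref))  # assumed: first reference key is the gold item
        if gold in ranked:
            total += 1.0 / (ranked.index(gold) + 1)
    return total / len(predictions)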
def test_step():
    results = defaultdict(list)
    num_test = 0
    num_correct = 0.0
    test_batches = data_helpers.batch_iter(test_dataset, FLAGS.batch_size, 1,
                                           target_loss_weight, FLAGS.max_utter_len,
                                           FLAGS.max_utter_num, FLAGS.max_response_len,
                                           shuffle=False)
    for test_batch in test_batches:
        x_utterances, x_response, x_utterances_len, x_response_len, \
            x_utters_num, x_target, x_target_weight, id_pairs = test_batch
        feed_dict = {
            imn.utterances: x_utterances,
            imn.response: x_response,
            imn.utterances_len: x_utterances_len,
            imn.response_len: x_response_len,
            imn.utters_num: x_utters_num,
            imn.target: x_target,
            imn.target_loss_weight: x_target_weight,
            imn.dropout_keep_prob: 1.0
        }
        batch_accuracy, predicted_prob = sess.run([imn.accuracy, imn.probs], feed_dict)
        num_test += len(predicted_prob)
        if num_test % 1000 == 0:
            print(num_test)
        num_correct += len(predicted_prob) * batch_accuracy
        for i, prob_score in enumerate(predicted_prob):
            question_id, response_id, label = id_pairs[i]
            results[question_id].append((response_id, label, prob_score))
    # calculate top-1 precision
    print('num_test_samples: {} test_accuracy: {}'.format(num_test, num_correct / num_test))
    accu, precision, recall, f1, loss = metrics.classification_metrics(results)
    print('Accuracy: {}, Precision: {} Recall: {} F1: {} Loss: {}'.format(
        accu, precision, recall, f1, loss))
    mvp = metrics.mean_average_precision(results)
    mrr = metrics.mean_reciprocal_rank(results)
    top_1_precision = metrics.top_1_precision(results)
    total_valid_query = metrics.get_num_valid_query(results)
    print('MAP (mean average precision): {}\tMRR (mean reciprocal rank): {}\tTop-1 precision: {}\tNum_query: {}'
          .format(mvp, mrr, top_1_precision, total_valid_query))
    return mrr
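# Several snippets in this section call metrics.mean_reciprocal_rank(results)
# on a mapping {query_id: [(doc_id, label, score), ...]}. A minimal sketch of
# that helper, assuming label > 0 marks a relevant document; this is an
# illustration of the convention, not the actual `metrics` module.

def mean_reciprocal_rank(results):
    total, n_queries = 0.0, 0
    for _, docs in results.items():
        # rank candidates by predicted score, highest first
        ranked = sorted(docs, key=lambda d: d[2], reverse=True)
        for rank, (_, label, _) in enumerate(ranked, start=1):
            if int(label) > 0:
                total += 1.0 / rank
                break
        n_queries += 1
    return total / n_queries if n_queries else 0.0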
def test_eval_metric(self):
    scores = torch.tensor([[1., 3., 2.], [1., 2., 3.], [3., 1., 2.]])
    labels = torch.tensor([[0., 0., 1.], [0., 1., 2.], [0., 1., 0.]])
    weights = torch.tensor([[1., 2., 3.], [4., 5., 6.], [7., 8., 9.]])
    gain_fn = lambda rel: rel
    rank_discount_fn = lambda rank: 1. / rank
    self._check_metrics([
        (metrics_lib.mean_reciprocal_rank(labels, scores),
         metrics_lib.eval_metric(
             metric_fn=metrics_lib.mean_reciprocal_rank,
             labels=labels,
             predictions=scores)),
        (metrics_lib.mean_reciprocal_rank(labels, scores, topn=1),
         metrics_lib.eval_metric(
             metric_fn=metrics_lib.mean_reciprocal_rank,
             labels=labels,
             predictions=scores,
             topn=1)),
        (metrics_lib.mean_reciprocal_rank(labels, scores, weights),
         metrics_lib.eval_metric(
             metric_fn=metrics_lib.mean_reciprocal_rank,
             labels=labels,
             predictions=scores,
             weights=weights)),
        (metrics_lib.discounted_cumulative_gain(
            labels, scores, gain_fn=gain_fn, rank_discount_fn=rank_discount_fn),
         metrics_lib.eval_metric(
             metric_fn=metrics_lib.discounted_cumulative_gain,
             labels=labels,
             predictions=scores,
             gain_fn=gain_fn,
             rank_discount_fn=rank_discount_fn)),
    ])
def run_test(dir_path, op_name, sess, training, accuracy, prob, pair_ids, output_layer):
    results = defaultdict(list)
    num_test = 0
    num_correct = 0.0
    n_updates = 0
    mrr = 0
    t0 = time()
    try:
        while True:
            n_updates += 1
            batch_accuracy, predicted_prob, pair_ = sess.run(
                [accuracy, prob, pair_ids], feed_dict={training: False})
            question_id, answer_id, label = pair_
            num_test += len(predicted_prob)
            num_correct += len(predicted_prob) * batch_accuracy
            for i, prob_score in enumerate(predicted_prob):
                results[question_id[i]].append((answer_id[i], label[i], prob_score[0]))
            if n_updates % 2000 == 0:
                tf.logging.info("n_update %d , %s: Mins Used: %.2f"
                                % (n_updates, op_name, (time() - t0) / 60.0))
    except tf.errors.OutOfRangeError:
        # the input queue is exhausted; calculate top-1 precision and the rest
        print('num_test_samples: {} test_accuracy: {}'.format(num_test, num_correct / num_test))
        accu, precision, recall, f1, loss = metrics.classification_metrics(results)
        print('Accuracy: {}, Precision: {} Recall: {} F1: {} Loss: {}'.format(
            accu, precision, recall, f1, loss))
        mvp = metrics.mean_average_precision(results)
        mrr = metrics.mean_reciprocal_rank(results)
        top_1_precision = metrics.top_1_precision(results)
        total_valid_query = metrics.get_num_valid_query(results)
        print('MAP (mean average precision): {}\tMRR (mean reciprocal rank): {}\tTop-1 precision: {}\tNum_query: {}'.format(
            mvp, mrr, top_1_precision, total_valid_query))
        out_path = os.path.join(dir_path, "output_test.txt")
        print("Saving evaluation to {}".format(out_path))
        with open(out_path, 'w') as f:
            f.write("query_id\tdocument_id\tscore\trank\trelevance\n")
            for us_id, v in results.items():
                v.sort(key=operator.itemgetter(2), reverse=True)
                for i, rec in enumerate(v):
                    r_id, label, prob_score = rec
                    rank = i + 1
                    f.write('{}\t{}\t{}\t{}\t{}\n'.format(us_id, r_id, prob_score, rank, label))
    return mrr
def evaluate(query_list, label_list, corpus, args):
    top_pred_list = []
    top_inds_list = []
    thre_pred_list = []
    f1_score_list = []
    cos_scores_matrix = cosine_similarity(corpus)
    p = progressbar.ProgressBar()
    start = time.time()
    for i in p(range(len(query_list))):
        query, label = query_list[i], label_list[i]
        top_inds, thresh_preds = evaluate_iter(cos_scores_matrix[i, :], args)
        ground_true = 1 * (label_list == label)
        if not args.include_self:
            # remove the query itself from the corpus
            thresh_preds = np.delete(thresh_preds, i)
            ground_true = np.delete(ground_true, i)
            top_inds = top_inds[top_inds != i]
            # keep at most the top 10 neighbours
            top_inds = top_inds[:10]
        f1 = f1_score(ground_true, thresh_preds)
        top_labels = label_list[top_inds]
        # binary relevance vector: 1 where a retrieved neighbour shares the query label
        rs = 1 * (top_labels == label)
        top_pred_list.append(rs)
        top_inds_list.append(top_inds)
        f1_score_list.append(f1)
        thre_pred_list.append(thresh_preds)
    duration = time.time() - start
    print("Execution time: {:.2f}ms".format(duration * 1000 / len(query_list)))
    mAP = mean_average_precision(top_pred_list)
    mrr = mean_reciprocal_rank(top_pred_list)
    total_f1 = np.mean(f1_score_list)
    if args.save_result:
        save_file_name = "pca_pred%s.pickle" % ("_include_self" if args.include_self else "")
        with open(os.path.join(args.save_dir, save_file_name), "wb") as f:
            pickle.dump(thre_pred_list, f)
        print("save pca prediction result over.")
    return mAP, mrr, total_f1
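# `mean_reciprocal_rank(top_pred_list)` above receives a list of binary
# relevance vectors ordered by rank. A minimal sketch of that convention
# (the classic rank-metrics formulation), given as an assumption rather
# than this repository's own definition:

import numpy as np

def mean_reciprocal_rank(rs):
    # for each ranked relevance vector, find the position of the first hit
    first_hits = (np.asarray(r).nonzero()[0] for r in rs)
    return np.mean([1.0 / (hits[0] + 1) if hits.size else 0.0 for hits in first_hits])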
def evaluate_on_df(self, predictor, k: int = 5):
    """Evaluate a predictor on the eval dataset."""
    # instantiate the predictor class, which exposes a `name` attribute
    # and a `predict` method
    p = predictor(self.titles)
    # eval procedure
    scores = defaultdict(list)
    # loop over the eval dataset
    for line in tqdm(self._eval_data, desc="Scoring Documents"):
        dcg, mrr, recall, precision = [], [], [], []
        # loop over the query variants of the document
        for example in line["examples"]:
            # perform prediction using the predictor and the query
            y_pred = p.predict(example["query"], self.slugs, n=k)
            y_true = example["y_true"]
            y_score = example["y_score"]
            # compute metrics from the returned slugs and the ground truth
            dcg.append(discounted_cumulative_gain(y_score, y_true, y_pred, k=k))
            mrr.append(mean_reciprocal_rank(y_pred, y_true))
            precision.append(find_precision_k(y_pred, y_true, k=k))
            recall.append(find_recall_k(y_pred, y_true, k=k))
        # average scores over the variants per document
        scores["dcg"].append(np.mean(dcg))
        scores["mrr"].append(np.mean(mrr))
        scores["precision"].append(np.mean(precision))
        scores["recall"].append(np.mean(recall))
    # average the scores over all documents
    print("#" * 50)
    print("evaluation for {}".format(p.name))
    print("--" * 25)
    print("dcg:", np.nanmean(scores["dcg"]))
    print("mrr:", np.nanmean(scores["mrr"]))
    print("precision:", np.nanmean(scores["precision"]))
    print("recall:", np.nanmean(scores["recall"]))
    print("#" * 50)
    return scores
def compute_metrics(sess, logits_op, placeholders, data_file, exporter=None):
    """Compute the MAP and MRR metrics over a dataset.

    :param sess: TensorFlow session
    :param logits_op: an operation that returns the scores for a given set of sentences
    :param placeholders: placeholders defined for `logits_op`
    :param data_file: an HDF5 file object holding the dataset
    :returns: the values of MAP and MRR as a tuple: (MAP, MRR)
    """
    questions_ph, sentences_ph, keep_prob_ph = placeholders
    if exporter is None:
        exporter = dataio.no_op()
    next(exporter)  # prime the coroutine
    total_avep = 0.0
    total_mrr = 0.0
    n_questions = 0
    for batch in dataio.question_batches(data_file):
        feed_dict = {
            questions_ph: batch.questions,
            sentences_ph: batch.sentences,
            keep_prob_ph: 1.0
        }
        scores = logits_op.eval(session=sess, feed_dict=feed_dict)
        exporter.send(scores)
        n_questions += 1
        avep = average_precision(batch.labels, scores)
        total_avep += avep
        mrr = mean_reciprocal_rank(batch.labels, scores)
        total_mrr += mrr
    exporter.close()
    mean_avep = total_avep / n_questions
    mean_mrr = total_mrr / n_questions
    return mean_avep, mean_mrr
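# The helpers above take parallel label and score arrays for one question's
# candidate sentences. A hedged sketch of `mean_reciprocal_rank` in that
# convention (0/1 labels, higher score = better), returning the reciprocal
# rank for a single question; an assumption, not the repository's own code.

import numpy as np

def mean_reciprocal_rank(labels, scores):
    order = np.argsort(scores)[::-1]          # best-scored sentence first
    ranked_labels = np.asarray(labels)[order]
    hits = np.nonzero(ranked_labels)[0]
    return 1.0 / (hits[0] + 1) if hits.size else 0.0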
def metrics(*args, **kwargs):
    model_recommender, x_df, cols = args
    top_n = kwargs.get(_KWARG_TOP_N, _TOP_N_DEFAULT)
    y_true = []
    y_pred = {n: [] for n in top_n}
    top_n_max = np.max(top_n)
    for _, (_, files, reviewers) in x_df.iterrows():
        _y_pred = model_recommender.recommend((files, files), N=top_n_max)
        y_true.append(reviewers)
        for n in top_n:
            y_pred[n].append(_y_pred[:n] if not pd.isna(_y_pred).any() else [])
    acc = [accuracy(y_true, _y_pred) for n, _y_pred in y_pred.items()]
    mrr = [mean_reciprocal_rank(y_true, _y_pred) for n, _y_pred in y_pred.items()]
    metrics_df = pd.DataFrame([acc, mrr], index=['acc', 'mrr'],
                              columns=[f'top-{n}' for n in top_n])
    return metrics_df
        r_char_feature: x_r_char,
        r_char_len: x_r_char_len
    }
    predicted_prob = sess.run(prob, feed_dict)
    num_test += len(predicted_prob)
    print('num_test_sample={}'.format(num_test))
    for i, prob_score in enumerate(predicted_prob):
        us_id, r_id, label = id_pairs[i]
        results[us_id].append((r_id, label, prob_score))
    accu, precision, recall, f1, loss = metrics.classification_metrics(results)
    print('Accuracy: {}, Precision: {} Recall: {} F1: {} Loss: {}'.format(
        accu, precision, recall, f1, loss))
    mvp = metrics.mean_average_precision(results)
    mrr = metrics.mean_reciprocal_rank(results)
    top_1_precision = metrics.top_1_precision(results)
    total_valid_query = metrics.get_num_valid_query(results)
    print('MAP (mean average precision): {}\tMRR (mean reciprocal rank): {}\tTop-1 precision: {}\tNum_query: {}'
          .format(mvp, mrr, top_1_precision, total_valid_query))
    out_path = FLAGS.output_file
    print("Saving evaluation to {}".format(out_path))
    with open(out_path, 'w') as f:
        f.write("query_id\tdocument_id\tscore\trank\trelevance\n")
        for us_id, v in results.items():
            v.sort(key=operator.itemgetter(2), reverse=True)
            for i, rec in enumerate(v):
                r_id, label, prob_score = rec
                rank = i + 1
def score(self, X, y):
    return mean_reciprocal_rank(y, self.predict(X))
def run_test(epoch_no, dir_path, op_name, sess, training, accuracy, prob, pair_ids):
    results = defaultdict(list)
    num_test = 0
    num_correct = 0.0
    n_updates = 0
    mrr = 0
    t0 = time()
    try:
        while True:
            n_updates += 1
            batch_accuracy, predicted_prob, pair_ = sess.run(
                [accuracy, prob, pair_ids], feed_dict={training: False})
            question_id, answer_id, label = pair_
            num_test += len(predicted_prob)
            num_correct += len(predicted_prob) * batch_accuracy
            for i, prob_score in enumerate(predicted_prob):
                results[question_id[i]].append(
                    (answer_id[i], label[i], prob_score[0]))
            if n_updates % 2000 == 0:
                tf.logging.info(
                    "epoch: %i n_update %d , %s: Mins Used: %.2f"
                    % (epoch_no, n_updates, op_name, (time() - t0) / 60.0))
    except tf.errors.OutOfRangeError:
        # append a "none" candidate at a fixed threshold, so that queries
        # without a positive response are scored as answerable-by-none
        threshold = 0.95
        none_id = 10000000
        print("threshold: {}".format(threshold))
        for q_id, a_list in results.items():
            correct_flag = 0
            for (a_id, label, score) in a_list:
                if int(label) == 1:
                    correct_flag = 1
            if correct_flag == 0:
                results[q_id].append((none_id, 1, threshold))
            else:
                results[q_id].append((none_id, 0, threshold))
        # calculate top-1 precision
        print('num_test_samples: {} test_accuracy: {}'.format(
            num_test, num_correct / num_test))
        accu, precision, recall, f1, loss = metrics.classification_metrics(results)
        print('Accuracy: {}, Precision: {} Recall: {} F1: {} Loss: {}'.format(
            accu, precision, recall, f1, loss))
        mvp = metrics.mean_average_precision(results)
        mrr = metrics.mean_reciprocal_rank(results)
        top_1_precision = metrics.top_1_precision(results)
        total_valid_query = metrics.get_num_valid_query(results)
        print('MAP (mean average precision): {}\tMRR (mean reciprocal rank): {}\tTop-1 precision: {}\tNum_query: {}'
              .format(mvp, mrr, top_1_precision, total_valid_query))
        out_path = os.path.join(dir_path, "ubuntu_output_epoch_{}.txt".format(epoch_no))
        print("Saving evaluation to {}".format(out_path))
        with open(out_path, 'w') as f:
            f.write("query_id\tdocument_id\tscore\trank\trelevance\n")
            for us_id, v in results.items():
                v.sort(key=operator.itemgetter(2), reverse=True)
                for i, rec in enumerate(v):
                    r_id, label, prob_score = rec
                    rank = i + 1
                    f.write('{}\t{}\t{}\t{}\t{}\n'.format(
                        us_id, r_id, prob_score, rank, label))
        # keep a checkpoint whenever validation MRR improves
        global best_score
        if op_name == 'valid' and mrr > best_score:
            best_score = mrr
            saver = tf.train.Saver()
            dir_path = os.path.join(dir_path, "epoch {}".format(epoch_no))
            if not os.path.exists(dir_path):
                os.makedirs(dir_path)
            saver.save(sess, dir_path)
            tf.logging.info(">> save model!")
    return mrr
def plot_single_number_metric_helper(dataset, dsmetric, models, rs, true_result,
                                     metric, norm, ds_kernel, thresh_pos, thresh_neg,
                                     thresh_pos_sim, thresh_neg_sim, plot_results,
                                     extra_dir):
    # dsmetric: distance/similarity metric, e.g. ged, mcs, etc.
    # metric: eval metric.
    print_ids = []
    rtn = {}
    val_list = []
    for model in models:
        if metric == 'mrr':
            val = mean_reciprocal_rank(true_result, rs[model], norm, print_ids)
        elif metric == 'mse':
            val = mean_squared_error(true_result, rs[model], ds_kernel, norm)
        elif metric == 'dev':
            val = mean_deviation(true_result, rs[model], ds_kernel, norm)
        elif metric == 'time':
            val = average_time(rs[model])
        elif 'acc' in metric:
            val = accuracy(true_result, rs[model], thresh_pos, thresh_neg,
                           thresh_pos_sim, thresh_neg_sim, norm)
            pos_acc, neg_acc, acc = val
            if metric == 'pos_acc':
                val = pos_acc
            elif metric == 'neg_acc':
                val = neg_acc
            elif metric == 'acc':
                val = acc  # only the overall acc
            else:
                assert metric == 'accall'
        elif metric == 'kendalls_tau':
            val = kendalls_tau(true_result, rs[model], norm)
        elif metric == 'spearmans_rho':
            val = spearmans_rho(true_result, rs[model], norm)
        else:
            raise RuntimeError('Unknown {}'.format(metric))
        # print('{} {}: {}'.format(metric, model, mrr_mse_time))
        rtn[model] = val
        val_list.append(val)
    rtn = {'{}{}'.format(metric, get_norm_str(norm)): rtn}
    if not plot_results:
        return rtn
    plt = plot_multiple_bars(val_list, models, metric)
    if metric == 'time':
        ylabel = 'time (msec)'
        norm = None
    elif metric == 'pos_acc':
        ylabel = 'pos_recall'
    elif metric == 'neg_acc':
        ylabel = 'neg_recall'
    elif metric == 'kendalls_tau':
        ylabel = 'Kendall\'s $\\tau$'
    elif metric == 'spearmans_rho':
        ylabel = 'Spearman\'s $\\rho$'
    else:
        ylabel = metric
    plt.ylabel(ylabel)
    if metric == 'time':
        plt.yscale('log')
    metric_addi_info = ''
    bfn = '{}_{}{}_{}_{}{}'.format(dsmetric, metric, metric_addi_info, dataset,
                                   '_'.join(models), get_norm_str(norm))
    sp = get_result_path() + '/{}/{}/'.format(dataset, metric)
    save_fig(plt, sp, bfn)
    if extra_dir:
        save_fig(plt, extra_dir, bfn)
    print(metric, 'plotted')
    return rtn
def test(experiment_name, task, gpu_num=0, pretrained='', margin=0.4, losstype='deepcca'):
    cosined = False
    embed_dim = 1024
    gpu_num = int(gpu_num)
    margin = float(margin)
    # Set up the results directory and device.
    results_dir = setup_dirs(experiment_name)
    if not os.path.exists(results_dir + 'test_results/'):
        os.makedirs(results_dir + 'test_results/')
    test_results_dir = results_dir + 'test_results/'
    device = setup_device(gpu_num)
    #### Hyperparameters ####
    # Initialize wandb (disabled):
    # import wandb
    # wandb.init(project=experiment_name)
    # config = wandb.config
    with open(results_dir + 'hyperparams_test.txt', 'w') as f:
        f.write('Command used to run: python ')
        f.write(' '.join(sys.argv))
        f.write('\n')
        f.write('device in use: ' + str(device))
        f.write('\n')
        f.write('--experiment_name ' + str(experiment_name))
        f.write('\n')
    # Set up data loaders and models based on the task.
    if task == 'cifar10':
        train_loader, test_loader = cifar10_loaders()
        model_A = CIFAREmbeddingNet()
        model_B = CIFAREmbeddingNet()
    elif task == 'mnist':
        train_loader, test_loader = mnist_loaders()
        model_A = MNISTEmbeddingNet()
        model_B = MNISTEmbeddingNet()
    elif task == 'uw':
        uw_data = 'bert'
        train_loader, test_loader = uw_loaders(uw_data)
        if uw_data == 'bert':
            model_A = RowNet(3072, embed_dim=1024)  # Language.
            model_B = RowNet(4096, embed_dim=1024)  # Vision.
    # Finish model setup.
    model_A.load_state_dict(torch.load(results_dir + 'train_results/model_A_state.pt'))
    model_B.load_state_dict(torch.load(results_dir + 'train_results/model_B_state.pt'))
    model_A.to(device)
    model_B.to(device)
    # Put models into evaluation mode.
    model_A.eval()
    model_B.eval()
    # For UW data: use the train data to calculate the threshold for distance.
    a_train = []
    b_train = []
    # Load saved embeddings when available, to be faster.
    a_train = load_embeddings(test_results_dir + 'lang_embeds_train.npy')
    b_train = load_embeddings(test_results_dir + 'img_embeds_train.npy')
    # Iterate through the train data.
    if a_train is None or b_train is None:
        a_train = []
        b_train = []
        print("Computing embeddings for train data to calculate threshold for distance")
        for data in train_loader:
            anchor_data = data[0].to(device)
            positive_data = data[1].to(device)
            label = data[2]
            a_train.append(model_A(anchor_data.to(device)).cpu().detach().numpy())
            b_train.append(model_B(positive_data.to(device)).cpu().detach().numpy())
        print("Finished computing embeddings for train data")
        # Save embeddings if not already saved.
        save_embeddings(test_results_dir + 'lang_embeds_train.npy', a_train)
        save_embeddings(test_results_dir + 'img_embeds_train.npy', b_train)
    a_train = np.concatenate(a_train, axis=0)
    b_train = np.concatenate(b_train, axis=0)
    # Test data. Accumulate predictions to check the embedding visually on the
    # test set: a is embeddings from domain A, b from domain B, ys their labels.
    a = []
    b = []
    ys = []
    instance_data = []
    # Load saved embeddings when available, to be faster.
    a = load_embeddings(test_results_dir + 'lang_embeds.npy')
    b = load_embeddings(test_results_dir + 'img_embeds.npy')
    compute_test_embeddings = False  # initialize so the loop below is well-defined
    if a is None or b is None:
        compute_test_embeddings = True
        a = []
        b = []
    # Iterate through the test data.
    print("computing embeddings for test data")
    for data in test_loader:
        language_data, vision_data, object_name, instance_name = data
        language_data = language_data.to(device)
        vision_data = vision_data.to(device)
        instance_data.extend(instance_name)
        if compute_test_embeddings:
            a.append(model_A(language_data).cpu().detach().numpy())  # Language.
            b.append(model_B(vision_data).cpu().detach().numpy())  # Vision.
        ys.extend(object_name)
    print("finished computing embeddings for test data")
    # Convert string labels to ints.
    labelencoder = LabelEncoder()
    labelencoder.fit(ys)
    ys = labelencoder.transform(ys)
    # Save embeddings if not already saved.
    save_embeddings(test_results_dir + 'lang_embeds.npy', a)
    save_embeddings(test_results_dir + 'img_embeds.npy', b)
    # Concatenate predictions.
    a = np.concatenate(a, axis=0)
    b = np.concatenate(b, axis=0)
    ab = np.concatenate((a, b), axis=0)
    ground_truth, predicted, distance = object_identification_task_classifier(
        a, b, ys, a_train, b_train, lamb_std=1, cosine=cosined)
    # Retrieval task: given an image, find the closest word descriptions.
    ground_truth_word, predicted_word, distance_word = object_identification_task_classifier(
        b, a, ys, b_train, a_train, lamb_std=1, cosine=cosined)
    with open('retrieval_non_pro.csv', mode='w') as retrieval_non_pro:
        csv_file_writer = csv.writer(retrieval_non_pro, delimiter=',',
                                     quotechar='"', quoting=csv.QUOTE_MINIMAL)
        csv_file_writer.writerow(['image', 'language', 'predicted', 'ground truth'])
        for i in range(50):
            csv_file_writer.writerow([instance_data[0], instance_data[i],
                                      predicted_word[0][i], ground_truth_word[0][i]])
    precisions = []
    recalls = []
    f1s = []
    precisions_pos = []
    recalls_pos = []
    f1s_pos = []
    # print(classification_report(oit_res[i], 1/np.arange(1,len(oit_res[i])+1) > 0.01))
    for i in range(len(ground_truth)):
        p, r, f, s = precision_recall_fscore_support(ground_truth[i], predicted[i],
                                                     warn_for=(), average='micro')
        precisions.append(p)
        recalls.append(r)
        f1s.append(f)
        p, r, f, s = precision_recall_fscore_support(ground_truth[i], predicted[i],
                                                     warn_for=(), average='binary')
        precisions_pos.append(p)
        recalls_pos.append(r)
        f1s_pos.append(f)
    print('\n ')
    print(experiment_name + '_' + str(embed_dim))
    print('MRR, KNN, Corr, Mean F1, Mean F1 (pos only)')
    print('%.3g & %.3g & %.3g & %.3g & %.3g' % (
        mean_reciprocal_rank(a, b, ys, cosine=cosined),
        knn(a, b, ys, k=5, cosine=cosined),
        corr_between(a, b, cosine=cosined),
        np.mean(f1s),
        np.mean(f1s_pos)))
    # Plot per-query ROC curves.
    plt.figure(figsize=(14, 7))
    for i in range(len(ground_truth)):
        fpr, tpr, thres = roc_curve(ground_truth[i], [1 - e for e in distance[i]],
                                    drop_intermediate=True)
        plt.plot(fpr, tpr, alpha=0.08, color='r')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.savefig(test_results_dir + '_' + str(embed_dim) + '_ROC.svg')
    # Pick random pairs, then plot distance in A vs distance in B. These should be correlated.
    a_dists = []
    b_dists = []
    for _ in range(3000):
        i1 = random.randrange(len(a))
        i2 = random.randrange(len(a))
        a_dists.append(euclidean(a[i1], a[i2]))
        b_dists.append(euclidean(b[i1], b[i2]))
        # a_dists.append(cosine(a[i1], a[i2]))
        # b_dists.append(cosine(b[i1], b[i2]))
    # Plot.
    plt.figure(figsize=(14, 14))
    plt.xlim([0, 3])
    plt.ylim([0, 3])
    plt.xlabel('Distance in Language Domain')
    plt.ylabel('Distance in Vision Domain')
    plt.plot(a_dists, b_dists, 'o', alpha=0.5)
    plt.plot(np.arange(0, 600), np.arange(0, 600), 'k--', lw=3, alpha=0.5)
    plt.savefig(test_results_dir + '_' + str(embed_dim) + '_CORR.svg')
    # Inspect embedding distances.
    clas = 5  # Base class.
    i_clas = [i for i in range(len(ys)) if ys[i].item() == clas]
    i_clas_2 = np.random.choice(i_clas, len(i_clas), replace=False)
    clas_ref = 4  # Comparison class.
    i_clas_ref = [i for i in range(len(ys)) if ys[i].item() == clas_ref]
    ac = np.array([a[i] for i in i_clas])
    bc = np.array([b[i] for i in i_clas])
    ac2 = np.array([a[i] for i in i_clas_2])
    bc2 = np.array([b[i] for i in i_clas_2])
    ac_ref = np.array([a[i] for i in i_clas_ref])
    n_ref = min(len(ac), len(ac_ref))
    aa_diff_ref = norm(ac[:n_ref] - ac_ref[:n_ref], ord=2, axis=1)
    ab_diff = norm(ac - bc2, ord=2, axis=1)
    aa_diff = norm(ac - ac2, ord=2, axis=1)
    bb_diff = norm(bc - bc2, ord=2, axis=1)
    bins = np.linspace(0, 0.1, 100)
    plt.figure(figsize=(14, 7))
    plt.hist(ab_diff, bins, alpha=0.5, label='between embeddings')
    plt.hist(aa_diff, bins, alpha=0.5, label='within embedding A')
    plt.hist(bb_diff, bins, alpha=0.5, label='within embedding B')
    plt.hist(aa_diff_ref, bins, alpha=0.5,
             label='embedding A, from class ' + str(clas_ref))
    plt.title('Embedding Distances - Class: ' + str(clas))
    plt.xlabel('L2 Distance')
    plt.ylabel('Count')
    plt.legend()
    # Keep a subset of classes for the UMAP plots.
    classes_to_keep = [36, 6, 9, 46, 15, 47, 50, 22, 26, 28]
    print(labelencoder.inverse_transform(classes_to_keep))
    ab_norm = [e for i, e in enumerate(ab) if ys[i % len(ys)] in classes_to_keep]
    ys_norm = [e for e in ys if e in classes_to_keep]
    color_index = {list(set(ys_norm))[i]: i for i in range(len(set(ys_norm)))}
    markers = ["o", "v", "^", "s", "*", "+", "x", "D", "h", "4"]
    marker_index = {list(set(ys_norm))[i]: markers[i] for i in range(len(set(ys_norm)))}
    embedding = umap.UMAP(n_components=2).fit_transform(ab_norm)  # metric='cosine'
    # Plot the UMAP embedding of the embeddings for the kept classes.
    f, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 10))
    mid = len(ys_norm)
    ax1.set_title('Language UMAP')
    for e in list(set(ys_norm)):
        x1 = [embedding[:mid, 0][i] for i in range(len(ys_norm)) if ys_norm[i] == e]
        x2 = [embedding[:mid, 1][i] for i in range(len(ys_norm)) if ys_norm[i] == e]
        ax1.scatter(x1, x2, marker=marker_index[int(e)], alpha=0.5,
                    c=[sns.color_palette("colorblind", 10)[color_index[int(e)]]],
                    label=labelencoder.inverse_transform([int(e)])[0])
    ax1.set_xlim([min(embedding[:, 0]) - 4, max(embedding[:, 0]) + 4])
    ax1.set_ylim([min(embedding[:, 1]) - 4, max(embedding[:, 1]) + 4])
    ax1.grid(True)
    ax1.legend(loc='upper center', bbox_to_anchor=(1.1, -0.08),
               fancybox=True, shadow=True, ncol=5)
    ax2.set_title('Vision UMAP')
    for e in list(set(ys_norm)):
        x1 = [embedding[mid:, 0][i] for i in range(len(ys_norm)) if ys_norm[i] == e]
        x2 = [embedding[mid:, 1][i] for i in range(len(ys_norm)) if ys_norm[i] == e]
        ax2.scatter(x1, x2, marker=marker_index[int(e)], alpha=0.5,
                    c=[sns.color_palette("colorblind", 10)[color_index[int(e)]]])
    ax2.set_xlim([min(embedding[:, 0]) - 4, max(embedding[:, 0]) + 4])
    ax2.set_ylim([min(embedding[:, 1]) - 4, max(embedding[:, 1]) + 4])
    ax2.grid(True)
    plt.savefig(test_results_dir + '_' + str(embed_dim) + '_UMAP_wl.svg',
                bbox_inches='tight')
# MAGIC * The example external evaluations may hold out and consider multiple items as ground truth, while the internal evaluations only hold out the last item in each user-history as the ground truth. There is no absolute preference as to how many items should be held out; we recommend designing evaluation methods that are similar to the actual use case.

# COMMAND ----------

relevance = []
for user_id, true_items in tqdm_notebook(holdout.groupby('USER_ID').ITEM_ID):
    rec_response = personalize_runtime.get_recommendations(
        campaignArn=campaign_arn,
        userId=str(user_id)
    )
    rec_items = [int(x['itemId']) for x in rec_response['itemList']]
    relevance.append([int(x in true_items.values) for x in rec_items])

# COMMAND ----------

print('mean_reciprocal_rank', np.mean([mean_reciprocal_rank(r) for r in relevance]))
print('precision_at_5', np.mean([precision_at_k(r, 5) for r in relevance]))
print('precision_at_10', np.mean([precision_at_k(r, 10) for r in relevance]))
print('precision_at_25', np.mean([precision_at_k(r, 25) for r in relevance]))
print('normalized_discounted_cumulative_gain_at_5', np.mean([ndcg_at_k(r, 5) for r in relevance]))
print('normalized_discounted_cumulative_gain_at_10', np.mean([ndcg_at_k(r, 10) for r in relevance]))
print('normalized_discounted_cumulative_gain_at_25', np.mean([ndcg_at_k(r, 25) for r in relevance]))

# COMMAND ----------

# MAGIC %md
# MAGIC ### Optional: slightly better results after deduplicating previous purchase histories

# COMMAND ----------

rel_dedup = []
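# COMMAND ----------

# MAGIC %md
# MAGIC A minimal sketch of the binary-relevance helpers the metrics cell above assumes (`precision_at_k`, `ndcg_at_k`), following the common rank-metrics conventions; these are illustrative assumptions, not the notebook's own definitions.

# COMMAND ----------

def precision_at_k(r, k):
    # fraction of the top-k recommendations that are relevant
    r = np.asarray(r)[:k]
    return np.mean(r) if r.size else 0.0

def dcg_at_k(r, k):
    # binary-gain DCG with a log2 position discount
    r = np.asarray(r, dtype=float)[:k]
    if r.size == 0:
        return 0.0
    return np.sum(r / np.log2(np.arange(2, r.size + 2)))

def ndcg_at_k(r, k):
    # normalize by the DCG of the ideal (sorted) ranking
    ideal = dcg_at_k(sorted(r, reverse=True), k)
    return dcg_at_k(r, k) / ideal if ideal > 0 else 0.0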
def dev_step():
    results = defaultdict(list)
    num_test = 0
    num_correct = 0.0
    valid_batches = data_helpers.batch_iter(valid_dataset, FLAGS.batch_size, 1,
                                            target_loss_weight, FLAGS.max_utter_len,
                                            FLAGS.max_utter_num, FLAGS.max_response_len,
                                            charVocab, FLAGS.max_word_length,
                                            shuffle=False)
    for valid_batch in valid_batches:
        x_utterances, x_response, x_utterances_len, x_response_len, \
            x_utters_num, x_responses_num, x_dist, x_target, x_target_weight, \
            id_pairs, x_u_char, x_u_char_len, x_r_char, x_r_char_len = valid_batch
        feed_dict = {
            u2u_imn.utterances: x_utterances,
            u2u_imn.response: x_response,
            u2u_imn.utterances_len: x_utterances_len,
            u2u_imn.response_len: x_response_len,
            u2u_imn.utters_num: x_utters_num,
            u2u_imn.responses_num: x_responses_num,
            u2u_imn.distance: x_dist,
            u2u_imn.target: x_target,
            u2u_imn.target_loss_weight: x_target_weight,
            u2u_imn.dropout_keep_prob: 1.0,
            u2u_imn.u_charVec: x_u_char,
            u2u_imn.u_charLen: x_u_char_len,
            u2u_imn.r_charVec: x_r_char,
            u2u_imn.r_charLen: x_r_char_len
        }
        batch_accuracy, predicted_prob = sess.run([u2u_imn.accuracy, u2u_imn.probs], feed_dict)
        num_test += len(predicted_prob)
        if num_test % 1000 == 0:
            print(num_test)
        num_correct += len(predicted_prob) * batch_accuracy
        for i, prob_score in enumerate(predicted_prob):
            question_id, response_id, label = id_pairs[i]
            results[question_id].append((response_id, label, prob_score))
    # calculate top-1 precision
    print('num_test_samples: {} test_accuracy: {}'.format(num_test, num_correct / num_test))
    accu, precision, recall, f1, loss = metrics.classification_metrics(results)
    print('Accuracy: {}, Precision: {} Recall: {} F1: {} Loss: {}'.format(
        accu, precision, recall, f1, loss))
    mvp = metrics.mean_average_precision(results)
    mrr = metrics.mean_reciprocal_rank(results)
    top_1_precision = metrics.top_1_precision(results)
    total_valid_query = metrics.get_num_valid_query(results)
    print('MAP (mean average precision): {}\tMRR (mean reciprocal rank): {}\tTop-1 precision: {}\tNum_query: {}'
          .format(mvp, mrr, top_1_precision, total_valid_query))
    # dump per-query predictions and the run configuration
    all_preds = []
    for i in range(len(results)):
        all_preds.append([r[2] for r in results[str(i)]])
    df = pd.DataFrame(all_preds, columns=[
        'prediction_' + str(i) for i in range(len(all_preds[0]))
    ])
    if not os.path.isdir(FLAGS.output_predictions_folder):
        os.makedirs(FLAGS.output_predictions_folder)
    with open(os.path.join(FLAGS.output_predictions_folder, 'config.json'), 'w') as f:
        conf = {}
        for k, v in FLAGS.__dict__['__flags'].items():
            conf[k] = v
        conf['ranker'] = "U2U"
        conf['seed'] = str(conf['random_seed'])
        args_dict = {}
        args_dict['args'] = conf
        f.write(json.dumps(args_dict, indent=4, sort_keys=True))
    df.to_csv(FLAGS.output_predictions_folder + "/predictions.csv", index=False)
    return mrr