Example 1
File: run.py Project: zingp/cbot
def test(test_set, model):
    print("starting testing...")
    start_time = time.time()
    model.eval()
    predictions, references = [], []
    with torch.no_grad():
        for i in range(len(test_set)):
            Y, T, data = test_set.get_candidate(i)
            Y = Y.to(device)
            T = T.to(device)
            ids = model.ranking(Y, T).data

            candidate = []
            comments = list(data['candidate'].keys())
            for id in ids:
                candidate.append(comments[id])
            predictions.append(candidate)
            references.append(data['candidate'])
            if i % 100 == 0:
                print(i)

    recall_1 = recall(predictions, references, 1)
    recall_5 = recall(predictions, references, 5)
    recall_10 = recall(predictions, references, 10)
    mr = mean_rank(predictions, references)
    mrr = mean_reciprocal_rank(predictions, references)
    s = "r1={}, r5={}, r10={}, mr={}, mrr={}"
    print(s.format(recall_1, recall_5, recall_10, mr, mrr))

    print("testing time:", time.time() - start_time)
Example 2
        def test_step():
            results = defaultdict(list)
            num_test = 0
            num_correct = 0.0
            test_batches = data_helpers.batch_iter(test_dataset,
                                                   FLAGS.batch_size,
                                                   1,
                                                   target_loss_weight,
                                                   FLAGS.max_utter_len,
                                                   FLAGS.max_utter_num,
                                                   FLAGS.max_response_len,
                                                   shuffle=False)
            for test_batch in test_batches:
                x_utterances, x_response, x_utterances_len, x_response_len, x_utters_num, x_target, x_target_weight, id_pairs = test_batch
                feed_dict = {
                    imn.utterances: x_utterances,
                    imn.response: x_response,
                    imn.utterances_len: x_utterances_len,
                    imn.response_len: x_response_len,
                    imn.utters_num: x_utters_num,
                    imn.target: x_target,
                    imn.target_loss_weight: x_target_weight,
                    imn.dropout_keep_prob: 1.0
                }
                batch_accuracy, predicted_prob = sess.run(
                    [imn.accuracy, imn.probs], feed_dict)
                num_test += len(predicted_prob)
                if num_test % 1000 == 0:
                    print(num_test)

                num_correct += len(predicted_prob) * batch_accuracy
                for i, prob_score in enumerate(predicted_prob):
                    question_id, response_id, label = id_pairs[i]
                    results[question_id].append(
                        (response_id, label, prob_score))

            #calculate top-1 precision
            print('num_test_samples: {}  test_accuracy: {}'.format(
                num_test, num_correct / num_test))
            accu, precision, recall, f1, loss = metrics.classification_metrics(
                results)
            print('Accuracy: {}, Precision: {}  Recall: {}  F1: {} Loss: {}'.
                  format(accu, precision, recall, f1, loss))

            mvp = metrics.mean_average_precision(results)
            mrr = metrics.mean_reciprocal_rank(results)
            top_1_precision = metrics.top_1_precision(results)
            total_valid_query = metrics.get_num_valid_query(results)
            print(
                'MAP (mean average precision): {}\tMRR (mean reciprocal rank): {}\tTop-1 precision: {}\tNum_query: {}'
                .format(mvp, mrr, top_1_precision, total_valid_query))

            return mrr
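Here results maps each question_id to a list of (response_id, label, prob_score) tuples, and the ranking metrics come from an external metrics module; the same convention appears in several examples below. A minimal sketch of what such a metrics.mean_reciprocal_rank plausibly does is given here; it is a guess at the module's behaviour, not its actual source.

def mean_reciprocal_rank(results):
    # results: query_id -> list of (candidate_id, label, score) tuples.
    # Rank each query's candidates by score (descending) and average the
    # reciprocal rank of the first candidate whose label is 1, over the
    # queries that have at least one positive candidate.
    reciprocal_ranks = []
    for candidates in results.values():
        ranked = sorted(candidates, key=lambda c: c[2], reverse=True)
        for rank, (_, label, _) in enumerate(ranked, start=1):
            if int(label) == 1:
                reciprocal_ranks.append(1.0 / rank)
                break
    return sum(reciprocal_ranks) / len(reciprocal_ranks) if reciprocal_ranks else 0.0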
Example 3
 def test_eval_metric(self):
     scores = torch.tensor([[1., 3., 2.], [1., 2., 3.], [3., 1., 2.]])
     labels = torch.tensor([[0., 0., 1.], [0., 1., 2.], [0., 1., 0.]])
     weights = torch.tensor([[1., 2., 3.], [4., 5., 6.], [7., 8., 9.]])
     gain_fn = lambda rel: rel
     rank_discount_fn = lambda rank: 1. / rank
     self._check_metrics([
         (metrics_lib.mean_reciprocal_rank(labels, scores),
          metrics_lib.eval_metric(
              metric_fn=metrics_lib.mean_reciprocal_rank,
              labels=labels,
              predictions=scores)),
         (metrics_lib.mean_reciprocal_rank(labels, scores, topn=1),
          metrics_lib.eval_metric(
              metric_fn=metrics_lib.mean_reciprocal_rank,
              labels=labels,
              predictions=scores,
              topn=1)),
         (metrics_lib.mean_reciprocal_rank(labels, scores, weights),
          metrics_lib.eval_metric(
              metric_fn=metrics_lib.mean_reciprocal_rank,
              labels=labels,
              predictions=scores,
              weights=weights)),
         (metrics_lib.discounted_cumulative_gain(
             labels,
             scores,
             gain_fn=gain_fn,
             rank_discount_fn=rank_discount_fn),
          metrics_lib.eval_metric(
              metric_fn=metrics_lib.discounted_cumulative_gain,
              labels=labels,
              predictions=scores,
              gain_fn=gain_fn,
              rank_discount_fn=rank_discount_fn)),
     ])
Example 4
def run_test(dir_path, op_name, sess, training, accuracy, prob, pair_ids, output_layer):
    results = defaultdict(list)
    num_test = 0
    num_correct = 0.0
    n_updates = 0
    mrr = 0
    t0 = time()
    try:
        while True:
            n_updates += 1

            batch_accuracy, predicted_prob, pair_ = sess.run([accuracy, prob, pair_ids], feed_dict={training: False})
            question_id, answer_id, label = pair_
            
            num_test += len(predicted_prob)
            num_correct += len(predicted_prob) * batch_accuracy
            for i, prob_score in enumerate(predicted_prob):
                # question_id, answer_id, label = pair_id[i]
                results[question_id[i]].append((answer_id[i], label[i], prob_score[0]))

            if n_updates%2000 == 0:
                tf.logging.info("n_update %d , %s: Mins Used: %.2f" %
                                (n_updates, op_name, (time() - t0) / 60.0))

    except tf.errors.OutOfRangeError:
        # calculate top-1 precision
        print('num_test_samples: {}  test_accuracy: {}'.format(num_test, num_correct / num_test))
        accu, precision, recall, f1, loss = metrics.classification_metrics(results)
        print('Accuracy: {}, Precision: {}  Recall: {}  F1: {} Loss: {}'.format(accu, precision, recall, f1, loss))

        mvp = metrics.mean_average_precision(results)
        mrr = metrics.mean_reciprocal_rank(results)
        top_1_precision = metrics.top_1_precision(results)
        total_valid_query = metrics.get_num_valid_query(results)
        print('MAP (mean average precision): {}\tMRR (mean reciprocal rank): {}\tTop-1 precision: {}\tNum_query: {}'.format(
            mvp, mrr, top_1_precision, total_valid_query))

        out_path = os.path.join(dir_path, "output_test.txt")
        print("Saving evaluation to {}".format(out_path))
        with open(out_path, 'w') as f:
          f.write("query_id\tdocument_id\tscore\trank\trelevance\n")
          for us_id, v in results.items():
            v.sort(key=operator.itemgetter(2), reverse=True)
            for i, rec in enumerate(v):
              r_id, label, prob_score = rec
              rank = i+1
              f.write('{}\t{}\t{}\t{}\t{}\n'.format(us_id, r_id, prob_score, rank, label))
    return mrr
Example 5
def evaluate(query_list, label_list, corpus, args):
    top_pred_list = []
    top_inds_list = []
    thre_pred_list = []
    f1_score_list = []
    cos_scores_matrix = cosine_similarity(corpus)
    p = progressbar.ProgressBar()
    start = time.time()
    for i in p(range(len(query_list))):
        query, label = query_list[i], label_list[i]
        top_inds, thresh_preds = evaluate_iter(cos_scores_matrix[i, :], args)
        ground_true = 1 * (label_list == label)
        if not args.include_self:
            # remove query itself from the corpus
            thresh_preds = np.delete(thresh_preds, i)
            ground_true = np.delete(ground_true, i)
            top_inds = top_inds[top_inds != i]
        # keep at most the top-10 indices
        top_inds = top_inds[:10]

        f1 = f1_score(ground_true, thresh_preds)
        top_labels = label_list[top_inds]
        rs = 1 * (top_labels == label)

        top_pred_list.append(rs)
        top_inds_list.append(top_inds)
        f1_score_list.append(f1)
        thre_pred_list.append(thresh_preds)

    duration = time.time() - start
    print("Execution time: {:.2f}ms".format(duration * 1000 / len(query_list)))
    mAP = mean_average_precision(top_pred_list)
    mrr = mean_reciprocal_rank(top_pred_list)
    total_f1 = np.mean(f1_score_list)

    if args.save_result:
        save_file_name = "pca_pred%s.pickle" % ("_include_self"
                                                if args.include_self else "")
        with open(os.path.join(args.save_dir, save_file_name), "wb") as f:
            pickle.dump(thre_pred_list, f)
        print("save pca prediction result over.")

    return mAP, mrr, total_f1
Example 6
    def evaluate_on_df(self, predictor, k: int = 5):
        """evaluate on datafiller"""

        # instantiate the predictor class, which exposes a name attribute and a predict method
        p = predictor(self.titles)
        # eval procedure

        scores = defaultdict(list)
        # loop over the eval dataset
        for line in tqdm(self._eval_data, desc="Scoring Documents"):
            dcg, mrr, recall, precision = [], [], [], []
            # loop over the query variants of the document
            for example in line["examples"]:
                ### perform prediction using the predictor and the query
                y_pred = p.predict(example["query"], self.slugs, n=k)
                ###
                y_true = example["y_true"]
                y_score = example["y_score"]

                ## compute metrics based on the slugs returned and the ground truth
                dcg.append(
                    discounted_cumulative_gain(y_score, y_true, y_pred, k=k))
                mrr.append(mean_reciprocal_rank(y_pred, y_true))
                precision.append(find_precision_k(y_pred, y_true, k=k))
                recall.append(find_recall_k(y_pred, y_true, k=k))

            # average scores over the variants per document
            scores["dcg"].append(np.mean(dcg))
            scores["mrr"].append(np.mean(mrr))
            scores["precision"].append(np.mean(precision))
            scores["recall"].append(np.mean(recall))

        # average the scores over all documents
        print("#" * 50)
        print("evaluation for {}".format(p.name))
        print("--" * 25)
        print("dcg:", np.nanmean(scores["dcg"]))
        print("mrr:", np.nanmean(scores["mrr"]))
        print("precision:", np.nanmean(scores["precision"]))
        print("recall:", np.nanmean(scores["recall"]))
        print("#" * 50)
        return scores
Example 7
def compute_metrics(sess, logits_op, placeholders, data_file, exporter=None):
    """Compute metrics MAP and MRR over a dataset.

    :param sess: TensorFlow session
    :param logits_op: an operation that returns the scores for a given set of
    sentences
    :param placeholders: placeholders defined for `logits_op`
    :data_file: a HDF5 file object holding the dataset

    :returns: the values of MAP and MRR as a tuple: (MAP, MRR)
    """
    questions_ph, sentences_ph, keep_prob_ph = placeholders

    if exporter is None:
        exporter = dataio.no_op()
    next(exporter)  # priming the coroutine

    total_avep = 0.0
    total_mrr = 0.0
    n_questions = 0
    for batch in dataio.question_batches(data_file):
        feed_dict = {
            questions_ph: batch.questions,
            sentences_ph: batch.sentences,
            keep_prob_ph: 1.0
        }
        scores = logits_op.eval(session=sess, feed_dict=feed_dict)
        exporter.send(scores)

        n_questions += 1
        avep = average_precision(batch.labels, scores)
        total_avep += avep
        mrr = mean_reciprocal_rank(batch.labels, scores)
        total_mrr += mrr
    exporter.close()

    mean_avep = total_avep / n_questions
    mean_mrr = total_mrr / n_questions
    return mean_avep, mean_mrr
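In this example mean_reciprocal_rank (and average_precision) is called once per question with that question's 0/1 sentence labels and the corresponding model scores, and the per-question values are averaged outside the loop. A plausible per-question variant is sketched below; it is an assumption about the helper, not the repository's actual code.

import numpy as np

def mean_reciprocal_rank(labels, scores):
    # labels: 0/1 relevance flags for one question's candidate sentences.
    # scores: model scores for the same candidates, in the same order.
    labels = np.asarray(labels).ravel()
    order = np.argsort(np.asarray(scores).ravel())[::-1]  # best score first
    for rank, idx in enumerate(order, start=1):
        if labels[idx] > 0:
            return 1.0 / rank
    return 0.0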
Example 8
def metrics(*args, **kwargs):
    model_recommender, x_df, cols = args
    top_n = kwargs.get(_KWARG_TOP_N, _TOP_N_DEFAULT)

    y_true = []
    y_pred = {n: [] for n in top_n}
    top_n_max = np.max(top_n)
    for _, (_, files, reviewers) in x_df.iterrows():
        _y_pred = model_recommender.recommend((files, files), N=top_n_max)

        y_true.append(reviewers)
        for n in top_n:
            y_pred[n].append(_y_pred[:n] if not pd.isna(_y_pred).any() else [])

    acc = [accuracy(y_true, _y_pred) for n, _y_pred in y_pred.items()]
    mrr = [
        mean_reciprocal_rank(y_true, _y_pred) for n, _y_pred in y_pred.items()
    ]

    metrics_df = pd.DataFrame([acc, mrr],
                              index=['acc', 'mrr'],
                              columns=[f'top-{n}' for n in top_n])

    return metrics_df
Example 9
                r_char_feature: x_r_char,
                r_char_len: x_r_char_len
            }
            predicted_prob = sess.run(prob, feed_dict)
            num_test += len(predicted_prob)
            print('num_test_sample={}'.format(num_test))
            for i, prob_score in enumerate(predicted_prob):
                us_id, r_id, label = id_pairs[i]
                results[us_id].append((r_id, label, prob_score))

accu, precision, recall, f1, loss = metrics.classification_metrics(results)
print('Accuracy: {}, Precision: {}  Recall: {}  F1: {} Loss: {}'.format(
    accu, precision, recall, f1, loss))

mvp = metrics.mean_average_precision(results)
mrr = metrics.mean_reciprocal_rank(results)
top_1_precision = metrics.top_1_precision(results)
total_valid_query = metrics.get_num_valid_query(results)
print(
    'MAP (mean average precision): {}\tMRR (mean reciprocal rank): {}\tTop-1 precision: {}\tNum_query: {}'
    .format(mvp, mrr, top_1_precision, total_valid_query))

out_path = FLAGS.output_file
print("Saving evaluation to {}".format(out_path))
with open(out_path, 'w') as f:
    f.write("query_id\tdocument_id\tscore\trank\trelevance\n")
    for us_id, v in results.items():
        v.sort(key=operator.itemgetter(2), reverse=True)
        for i, rec in enumerate(v):
            r_id, label, prob_score = rec
            rank = i + 1
Example 10
def score(self, X, y):
    return mean_reciprocal_rank(y, self.predict(X))
Example 11
def run_test(epoch_no, dir_path, op_name, sess, training, accuracy, prob,
             pair_ids):
    results = defaultdict(list)
    num_test = 0
    num_correct = 0.0
    n_updates = 0
    mrr = 0
    t0 = time()
    try:
        while True:
            n_updates += 1

            batch_accuracy, predicted_prob, pair_ = sess.run(
                [accuracy, prob, pair_ids], feed_dict={training: False})
            question_id, answer_id, label = pair_

            # question_id = question_id.eval()
            # answer_id = answer_id.eval()
            # label = label.eval()
            num_test += len(predicted_prob)
            # if num_test % 1000 == 0:
            #     print(num_test)

            num_correct += len(predicted_prob) * batch_accuracy
            for i, prob_score in enumerate(predicted_prob):
                # question_id, answer_id, label = pair_id[i]
                results[question_id[i]].append(
                    (answer_id[i], label[i], prob_score[0]))

            if n_updates % 2000 == 0:
                tf.logging.info(
                    "epoch: %i  n_update %d , %s: Mins Used: %.2f" %
                    (epoch_no, n_updates, op_name, (time() - t0) / 60.0))

    except tf.errors.OutOfRangeError:

        threshold = 0.95
        none_id = 10000000
        print("threshold: {}".format(threshold))
        for q_id, a_list in results.items():
            correct_flag = 0
            for (a_id, label, score) in a_list:
                if int(label) == 1:
                    correct_flag = 1
            if correct_flag == 0:
                results[q_id].append((none_id, 1, threshold))
            else:
                results[q_id].append((none_id, 0, threshold))
        # calculate top-1 precision
        print('num_test_samples: {}  test_accuracy: {}'.format(
            num_test, num_correct / num_test))
        accu, precision, recall, f1, loss = metrics.classification_metrics(
            results)
        print(
            'Accuracy: {}, Precision: {}  Recall: {}  F1: {} Loss: {}'.format(
                accu, precision, recall, f1, loss))

        mvp = metrics.mean_average_precision(results)
        mrr = metrics.mean_reciprocal_rank(results)
        top_1_precision = metrics.top_1_precision(results)
        total_valid_query = metrics.get_num_valid_query(results)
        print(
            'MAP (mean average precision): {}\tMRR (mean reciprocal rank): {}\tTop-1 precision: {}\tNum_query: {}'
            .format(mvp, mrr, top_1_precision, total_valid_query))

        out_path = os.path.join(dir_path,
                                "ubuntu_output_epoch_{}.txt".format(epoch_no))
        print("Saving evaluation to {}".format(out_path))
        with open(out_path, 'w') as f:
            f.write("query_id\tdocument_id\tscore\trank\trelevance\n")
            for us_id, v in results.items():
                v.sort(key=operator.itemgetter(2), reverse=True)
                for i, rec in enumerate(v):
                    r_id, label, prob_score = rec
                    rank = i + 1
                    f.write('{}\t{}\t{}\t{}\t{}\n'.format(
                        us_id, r_id, prob_score, rank, label))

        global best_score
        if op_name == 'valid' and mrr > best_score:
            best_score = mrr
            saver = tf.train.Saver()
            dir_path = os.path.join(dir_path, "epoch {}".format(epoch_no))
            if not os.path.exists(dir_path):
                os.makedirs(dir_path)
            saver.save(sess, dir_path)
            tf.logging.info(">> save model!")

    return mrr
Example 12
def plot_single_number_metric_helper(dataset, dsmetric, models, rs, true_result,
                                     metric, norm,
                                     ds_kernel, thresh_pos, thresh_neg,
                                     thresh_pos_sim, thresh_neg_sim,
                                     plot_results, extra_dir):
    # dsmetric: distance/similarity metric, e.g. ged, mcs, etc.
    # metric: eval metric.
    print_ids = []
    rtn = {}
    val_list = []
    for model in models:
        if metric == 'mrr':
            val = mean_reciprocal_rank(
                true_result, rs[model], norm, print_ids)
        elif metric == 'mse':
            val = mean_squared_error(
                true_result, rs[model], ds_kernel, norm)
        elif metric == 'dev':
            val = mean_deviation(
                true_result, rs[model], ds_kernel, norm)
        elif metric == 'time':
            val = average_time(rs[model])
        elif 'acc' in metric:
            val = accuracy(
                true_result, rs[model], thresh_pos, thresh_neg,
                thresh_pos_sim, thresh_neg_sim, norm)
            pos_acc, neg_acc, acc = val
            if metric == 'pos_acc':
                val = pos_acc
            elif metric == 'neg_acc':
                val = neg_acc
            elif metric == 'acc':
                val = acc  # only the overall acc
            else:
                assert (metric == 'accall')
        elif metric == 'kendalls_tau':
            val = kendalls_tau(true_result, rs[model], norm)
        elif metric == 'spearmans_rho':
            val = spearmans_rho(true_result, rs[model], norm)
        else:
            raise RuntimeError('Unknown {}'.format(metric))
        # print('{} {}: {}'.format(metric, model, mrr_mse_time))
        rtn[model] = val
        val_list.append(val)
    rtn = {'{}{}'.format(metric, get_norm_str(norm)): rtn}
    if not plot_results:
        return rtn
    plt = plot_multiple_bars(val_list, models, metric)
    if metric == 'time':
        ylabel = 'time (msec)'
        norm = None
    elif metric == 'pos_acc':
        ylabel = 'pos_recall'
    elif metric == 'neg_acc':
        ylabel = 'neg_recall'
    elif metric == 'kendalls_tau':
        ylabel = 'Kendall\'s $\\tau$'
    elif metric == 'spearmans_rho':
        ylabel = 'Spearman\'s $\\rho$'
    else:
        ylabel = metric
    plt.ylabel(ylabel)
    if metric == 'time':
        plt.yscale('log')
    metric_addi_info = ''
    bfn = '{}_{}{}_{}_{}{}'.format(
        dsmetric, metric, metric_addi_info,
        dataset, '_'.join(models),
        get_norm_str(norm))
    sp = get_result_path() + '/{}/{}/'.format(dataset, metric)
    save_fig(plt, sp, bfn)
    if extra_dir:
        save_fig(plt, extra_dir, bfn)
    print(metric, 'plotted')
    return rtn
Example 13
def score(self, X, y):
    return mean_reciprocal_rank(y, self.predict(X))
Example 14
def test(experiment_name,
         task,
         gpu_num=0,
         pretrained='',
         margin=0.4,
         losstype='deepcca'):
    cosined = False
    embed_dim = 1024
    gpu_num = int(gpu_num)
    margin = float(margin)

    # Setup the results and device.
    results_dir = setup_dirs(experiment_name)
    if not os.path.exists(results_dir + 'test_results/'):
        os.makedirs(results_dir + 'test_results/')
    test_results_dir = results_dir + 'test_results/'

    device = setup_device(gpu_num)

    #### Hyperparameters #####
    #Initialize wandb
    #import wandb
    #wandb.init(project=experiment_name)
    #config = wandb.config

    with open(results_dir + 'hyperparams_test.txt', 'w') as f:
        f.write('Command used to run: python ')
        f.write(' '.join(sys.argv))
        f.write('\n')
        f.write('device in use: ' + str(device))
        f.write('\n')
        f.write('--experiment_name ' + str(experiment_name))
        f.write('\n')

    # Setup data loaders and models based on task.
    if task == 'cifar10':
        train_loader, test_loader = cifar10_loaders()
        model_A = CIFAREmbeddingNet()
        model_B = CIFAREmbeddingNet()
    elif task == 'mnist':
        train_loader, test_loader = mnist_loaders()
        model_A = MNISTEmbeddingNet()
        model_B = MNISTEmbeddingNet()
    elif task == 'uw':
        uw_data = 'bert'
        train_loader, test_loader = uw_loaders(uw_data)
        if uw_data == 'bert':
            model_A = RowNet(3072, embed_dim=1024)  # Language.
            model_B = RowNet(4096, embed_dim=1024)  # Vision.

    # Finish model setup.
    model_A.load_state_dict(
        torch.load(results_dir + 'train_results/model_A_state.pt'))
    model_B.load_state_dict(
        torch.load(results_dir + 'train_results/model_B_state.pt'))
    model_A.to(device)
    model_B.to(device)
    # Put models into evaluation mode.
    model_A.eval()
    model_B.eval()
    """For UW data."""
    # We use the train data to calculate the distance threshold.
    a_train = []
    b_train = []
    # loading saved embeddings to be faster
    a_train = load_embeddings(test_results_dir + 'lang_embeds_train.npy')
    b_train = load_embeddings(test_results_dir + 'img_embeds_train.npy')

    # Iterate through the train data.
    if a_train is None or b_train is None:
        a_train = []
        b_train = []
        print(
            "Computing embeddings for train data to calculate the distance threshold"
        )
        for data in train_loader:
            anchor_data = data[0].to(device)
            positive_data = data[1].to(device)
            label = data[2]
            a_train.append(
                model_A(anchor_data.to(device)).cpu().detach().numpy())
            b_train.append(
                model_B(positive_data.to(device)).cpu().detach().numpy())
        print("Finished Computing embeddings for train data")
    #saving embeddings if not already saved
    save_embeddings(test_results_dir + 'lang_embeds_train.npy', a_train)
    save_embeddings(test_results_dir + 'img_embeds_train.npy', b_train)

    a_train = np.concatenate(a_train, axis=0)
    b_train = np.concatenate(b_train, axis=0)

    # Test data
    # For accumulating predictions to check embedding visually using test set.
    # a is embeddings from domain A, b is embeddings from domain B, ys is their labels
    a = []
    b = []
    ys = []
    instance_data = []

    # loading saved embeddings to be faster
    compute_test_embeddings = False  # must default to False so the check below works when cached embeddings are loaded
    a = load_embeddings(test_results_dir + 'lang_embeds.npy')
    b = load_embeddings(test_results_dir + 'img_embeds.npy')
    if a is None or b is None:
        compute_test_embeddings = True
        a = []
        b = []

    # Iterate through the test data.
    print("computing embeddings for test data")
    for data in test_loader:
        language_data, vision_data, object_name, instance_name = data
        language_data = language_data.to(device)
        vision_data = vision_data.to(device)
        instance_data.extend(instance_name)
        if compute_test_embeddings:
            a.append(
                model_A(language_data).cpu().detach().numpy())  # Language.
            b.append(model_B(vision_data).cpu().detach().numpy())  # Vision.
        ys.extend(object_name)
    print("finished computing embeddings for test data")
    # Convert string labels to ints.
    labelencoder = LabelEncoder()
    labelencoder.fit(ys)
    ys = labelencoder.transform(ys)

    #saving embeddings if not already saved
    save_embeddings(test_results_dir + 'lang_embeds.npy', a)
    save_embeddings(test_results_dir + 'img_embeds.npy', b)

    # Concatenate predictions.
    a = np.concatenate(a, axis=0)
    b = np.concatenate(b, axis=0)
    ab = np.concatenate((a, b), axis=0)

    ground_truth, predicted, distance = object_identification_task_classifier(
        a, b, ys, a_train, b_train, lamb_std=1, cosine=cosined)

    #### Retrieval task by giving an image and finding the closest word descriptions ####
    ground_truth_word, predicted_word, distance_word = object_identification_task_classifier(
        b, a, ys, b_train, a_train, lamb_std=1, cosine=cosined)
    with open('retrieval_non_pro.csv', mode='w') as retrieval_non_pro:
        csv_file_writer = csv.writer(retrieval_non_pro,
                                     delimiter=',',
                                     quotechar='"',
                                     quoting=csv.QUOTE_MINIMAL)
        csv_file_writer.writerow(
            ['image', 'language', 'predicted', 'ground truth'])
        for i in range(50):
            csv_file_writer.writerow([
                instance_data[0], instance_data[i], predicted_word[0][i],
                ground_truth_word[0][i]
            ])

    precisions = []
    recalls = []
    f1s = []
    precisions_pos = []
    recalls_pos = []
    f1s_pos = []
    #print(classification_report(oit_res[i], 1/np.arange(1,len(oit_res[i])+1) > 0.01))
    for i in range(len(ground_truth)):
        p, r, f, s = precision_recall_fscore_support(ground_truth[i],
                                                     predicted[i],
                                                     warn_for=(),
                                                     average='micro')
        precisions.append(p)
        recalls.append(r)
        f1s.append(f)
        p, r, f, s = precision_recall_fscore_support(ground_truth[i],
                                                     predicted[i],
                                                     warn_for=(),
                                                     average='binary')
        precisions_pos.append(p)
        recalls_pos.append(r)
        f1s_pos.append(f)

    print('\n ')
    print(experiment_name + '_' + str(embed_dim))
    print('MRR,    KNN,    Corr,   Mean F1,    Mean F1 (pos only)')
    print('%.3g & %.3g & %.3g & %.3g & %.3g' %
          (mean_reciprocal_rank(
              a, b, ys, cosine=cosined), knn(a, b, ys, k=5, cosine=cosined),
           corr_between(a, b, cosine=cosined), np.mean(f1s), np.mean(f1s_pos)))

    plt.figure(figsize=(14, 7))
    for i in range(len(ground_truth)):
        fpr, tpr, thres = roc_curve(ground_truth[i],
                                    [1 - e for e in distance[i]],
                                    drop_intermediate=True)
        plt.plot(fpr, tpr, alpha=0.08, color='r')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.savefig(test_results_dir + '_' + str(embed_dim) + '_ROC.svg')

    # Pick a pair, plot distance in A vs distance in B. Should be correlated.
    a_dists = []
    b_dists = []
    for _ in range(3000):
        i1 = random.randrange(len(a))
        i2 = random.randrange(len(a))
        a_dists.append(euclidean(a[i1], a[i2]))
        b_dists.append(euclidean(b[i1], b[i2]))
    #     a_dists.append(cosine(a[i1], a[i2]))
    #     b_dists.append(cosine(b[i1], b[i2]))

    # Plot.
    plt.figure(figsize=(14, 14))
    #plt.title('Check Distance Correlation Between Domains')
    plt.xlim([0, 3])
    plt.ylim([0, 3])
    # plt.xlim([0,max(a_dists)])
    # plt.ylim([0,max(b_dists)])
    # plt.xlabel('Distance in Domain A')
    # plt.ylabel('Distance in Domain B')
    plt.xlabel('Distance in Language Domain')
    plt.ylabel('Distance in Vision Domain')
    #plt.plot(a_dists_norm[0],b_dists_norm[0],'.')
    #plt.plot(np.arange(0,2)/20,np.arange(0,2)/20,'k-',lw=3)
    plt.plot(a_dists, b_dists, 'o', alpha=0.5)
    plt.plot(np.arange(0, 600), np.arange(0, 600), 'k--', lw=3, alpha=0.5)
    #plt.text(-0.001, -0.01, 'Corr: %.3f'%(pearsonr(a_dists,b_dists)[0]),  fontsize=20)
    plt.savefig(test_results_dir + '_' + str(embed_dim) + '_CORR.svg')

    # Inspect embedding distances.
    clas = 5  # Base class.
    i_clas = [i for i in range(len(ys)) if ys[i].item() == clas]
    i_clas_2 = np.random.choice(i_clas, len(i_clas), replace=False)

    clas_ref = 4  # Comparison class.
    i_clas_ref = [i for i in range(len(ys)) if ys[i].item() == clas_ref]

    ac = np.array([a[i] for i in i_clas])
    bc = np.array([b[i] for i in i_clas])

    ac2 = np.array([a[i] for i in i_clas_2])
    bc2 = np.array([b[i] for i in i_clas_2])

    ac_ref = np.array([a[i] for i in i_clas_ref])
    aa_diff_ref = norm(ac[:min(len(ac), len(ac_ref))] -
                       ac_ref[:min(len(ac), len(ac_ref))],
                       ord=2,
                       axis=1)

    ab_diff = norm(ac - bc2, ord=2, axis=1)
    aa_diff = norm(ac - ac2, ord=2, axis=1)
    bb_diff = norm(bc - bc2, ord=2, axis=1)

    # aa_diff_ref = [cosine(ac[:min(len(ac),len(ac_ref))][i],ac_ref[:min(len(ac),len(ac_ref))][i]) for i in range(len(ac[:min(len(ac),len(ac_ref))]))]

    # ab_diff = [cosine(ac[i],bc2[i]) for i in range(len(ac))]
    # aa_diff = [cosine(ac[i],ac2[i]) for i in range(len(ac))]
    # bb_diff = [cosine(bc[i],bc2[i]) for i in range(len(ac))]

    bins = np.linspace(0, 0.1, 100)

    plt.figure(figsize=(14, 7))
    plt.hist(ab_diff, bins, alpha=0.5, label='between embeddings')
    plt.hist(aa_diff, bins, alpha=0.5, label='within embedding A')
    plt.hist(bb_diff, bins, alpha=0.5, label='within embedding B')

    plt.hist(aa_diff_ref,
             bins,
             alpha=0.5,
             label='embedding A, from class ' + str(clas_ref))

    plt.title('Embedding Distances - Class: ' + str(clas))
    plt.xlabel('L2 Distance')
    plt.ylabel('Count')
    plt.legend()

    #labelencoder.classes_
    classes_to_keep = [36, 6, 9, 46, 15, 47, 50, 22, 26, 28]
    print(labelencoder.inverse_transform(classes_to_keep))

    ab_norm = [
        e for i, e in enumerate(ab) if ys[i % len(ys)] in classes_to_keep
    ]
    ys_norm = [e for e in ys if e in classes_to_keep]

    color_index = {list(set(ys_norm))[i]: i
                   for i in range(len(set(ys_norm)))}  #set(ys_norm)
    markers = ["o", "v", "^", "s", "*", "+", "x", "D", "h", "4"]
    marker_index = {
        list(set(ys_norm))[i]: markers[i]
        for i in range(len(set(ys_norm)))
    }

    embedding = umap.UMAP(n_components=2).fit_transform(
        ab_norm)  # metric='cosine'
    # Plot UMAP embedding of embeddings for all classes.
    f, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 10))

    mid = len(ys_norm)

    ax1.set_title('Language UMAP')
    for e in list(set(ys_norm)):
        x1 = [
            embedding[:mid, 0][i] for i in range(len(ys_norm))
            if ys_norm[i] == e
        ]
        x2 = [
            embedding[:mid, 1][i] for i in range(len(ys_norm))
            if ys_norm[i] == e
        ]
        ax1.scatter(
            x1,
            x2,
            marker=marker_index[int(e)],
            alpha=0.5,
            c=[sns.color_palette("colorblind", 10)[color_index[int(e)]]],
            label=labelencoder.inverse_transform([int(e)])[0])
    ax1.set_xlim([min(embedding[:, 0]) - 4, max(embedding[:, 0]) + 4])
    ax1.set_ylim([min(embedding[:, 1]) - 4, max(embedding[:, 1]) + 4])
    ax1.grid(True)
    ax1.legend(loc='upper center',
               bbox_to_anchor=(1.1, -0.08),
               fancybox=True,
               shadow=True,
               ncol=5)

    ax2.set_title('Vision UMAP')
    for e in list(set(ys_norm)):
        x1 = [
            embedding[mid::, 0][i] for i in range(len(ys_norm))
            if ys_norm[i] == e
        ]
        x2 = [
            embedding[mid::, 1][i] for i in range(len(ys_norm))
            if ys_norm[i] == e
        ]
        ax2.scatter(
            x1,
            x2,
            marker=marker_index[int(e)],
            alpha=0.5,
            c=[sns.color_palette("colorblind", 10)[color_index[int(e)]]])
    ax2.set_xlim([min(embedding[:, 0]) - 4, max(embedding[:, 0]) + 4])
    ax2.set_ylim([min(embedding[:, 1]) - 4, max(embedding[:, 1]) + 4])
    ax2.grid(True)

    plt.savefig(test_results_dir + '_' + str(embed_dim) + '_UMAP_wl.svg',
                bbox_inches='tight')
Example 15
# MAGIC * The example external evaluations may hold out and consider multiple items as ground truth, while the internal evaluations only hold out the last item in each user-history as the ground truth. There is no absolute preference as to how many items should be held out; we recommend designing the evaluation methods that are similar to the actual use case.

# COMMAND ----------

relevance = []
for user_id, true_items in tqdm_notebook(holdout.groupby('USER_ID').ITEM_ID):
    rec_response = personalize_runtime.get_recommendations(
        campaignArn = campaign_arn,
        userId = str(user_id)
    )
    rec_items = [int(x['itemId']) for x in rec_response['itemList']]
    relevance.append([int(x in true_items.values) for x in rec_items])

# COMMAND ----------

print('mean_reciprocal_rank', np.mean([mean_reciprocal_rank(r) for r in relevance]))
print('precision_at_5', np.mean([precision_at_k(r, 5) for r in relevance]))
print('precision_at_10', np.mean([precision_at_k(r, 10) for r in relevance]))
print('precision_at_25', np.mean([precision_at_k(r, 25) for r in relevance]))
print('normalized_discounted_cumulative_gain_at_5', np.mean([ndcg_at_k(r, 5) for r in relevance]))
print('normalized_discounted_cumulative_gain_at_10', np.mean([ndcg_at_k(r, 10) for r in relevance]))
print('normalized_discounted_cumulative_gain_at_25', np.mean([ndcg_at_k(r, 25) for r in relevance]))

# COMMAND ----------

# MAGIC %md
# MAGIC ### Optional: slightly better results after deduplicating previous purchase histories

# COMMAND ----------

rel_dedup = []
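The notebook above scores each user with a binary relevance list (1 where a recommended item appears in the held-out purchases) and averages per-user metrics such as mean_reciprocal_rank and precision_at_k over those lists. The notebook's metric imports are not shown; a minimal sketch of per-list helpers consistent with those calls might look like this (an assumption, not the notebook's actual definitions).

import numpy as np

def mean_reciprocal_rank(r):
    # r: 0/1 relevance flags for one user's recommendations, best rank first.
    hits = np.asarray(r).nonzero()[0]
    return 1.0 / (hits[0] + 1) if hits.size else 0.0

def precision_at_k(r, k):
    # Fraction of relevant items among the top-k recommendations.
    return float(np.mean(np.asarray(r)[:k]))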
Example 16
        def dev_step():
            results = defaultdict(list)
            num_test = 0
            num_correct = 0.0
            valid_batches = data_helpers.batch_iter(valid_dataset,
                                                    FLAGS.batch_size,
                                                    1,
                                                    target_loss_weight,
                                                    FLAGS.max_utter_len,
                                                    FLAGS.max_utter_num,
                                                    FLAGS.max_response_len,
                                                    charVocab,
                                                    FLAGS.max_word_length,
                                                    shuffle=False)
            for valid_batch in valid_batches:
                x_utterances, x_response, x_utterances_len, x_response_len, x_utters_num, x_responses_num, x_dist, x_target, x_target_weight, id_pairs, x_u_char, x_u_char_len, x_r_char, x_r_char_len = valid_batch
                feed_dict = {
                    u2u_imn.utterances: x_utterances,
                    u2u_imn.response: x_response,
                    u2u_imn.utterances_len: x_utterances_len,
                    u2u_imn.response_len: x_response_len,
                    u2u_imn.utters_num: x_utters_num,
                    u2u_imn.responses_num: x_responses_num,
                    u2u_imn.distance: x_dist,
                    u2u_imn.target: x_target,
                    u2u_imn.target_loss_weight: x_target_weight,
                    u2u_imn.dropout_keep_prob: 1.0,
                    u2u_imn.u_charVec: x_u_char,
                    u2u_imn.u_charLen: x_u_char_len,
                    u2u_imn.r_charVec: x_r_char,
                    u2u_imn.r_charLen: x_r_char_len
                }
                batch_accuracy, predicted_prob = sess.run(
                    [u2u_imn.accuracy, u2u_imn.probs], feed_dict)
                num_test += len(predicted_prob)
                if num_test % 1000 == 0:
                    print(num_test)

                num_correct += len(predicted_prob) * batch_accuracy
                for i, prob_score in enumerate(predicted_prob):
                    question_id, response_id, label = id_pairs[i]
                    results[question_id].append(
                        (response_id, label, prob_score))

            #calculate top-1 precision
            print('num_test_samples: {}  test_accuracy: {}'.format(
                num_test, num_correct / num_test))
            accu, precision, recall, f1, loss = metrics.classification_metrics(
                results)
            print('Accuracy: {}, Precision: {}  Recall: {}  F1: {} Loss: {}'.
                  format(accu, precision, recall, f1, loss))

            mvp = metrics.mean_average_precision(results)
            mrr = metrics.mean_reciprocal_rank(results)
            top_1_precision = metrics.top_1_precision(results)
            total_valid_query = metrics.get_num_valid_query(results)
            print(
                'MAP (mean average precision): {}\tMRR (mean reciprocal rank): {}\tTop-1 precision: {}\tNum_query: {}'
                .format(mvp, mrr, top_1_precision, total_valid_query))

            all_preds = []
            for i in range(len(results)):
                all_preds.append([r[2] for r in results[str(i)]])
            df = pd.DataFrame(all_preds,
                              columns=[
                                  'prediction_' + str(i)
                                  for i in range(len(all_preds[0]))
                              ])
            if not os.path.isdir(FLAGS.output_predictions_folder):
                os.makedirs(FLAGS.output_predictions_folder)
            with open(
                    os.path.join(FLAGS.output_predictions_folder,
                                 'config.json'), 'w') as f:
                conf = {}
                for k, v in FLAGS.__dict__['__flags'].items():
                    conf[k] = v
                conf['ranker'] = "U2U"
                conf['seed'] = str(conf['random_seed'])
                args_dict = {}
                args_dict['args'] = conf

                f.write(json.dumps(args_dict, indent=4, sort_keys=True))
            df.to_csv(FLAGS.output_predictions_folder + "/predictions.csv",
                      index=False)

            return mrr