def test_paired_euclidean_distances():
    # Check the paired Euclidean distances computation
    X = [[0], [0]]
    Y = [[1], [2]]
    D = paired_euclidean_distances(X, Y)
    assert_array_almost_equal(D, [1., 2.])
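The test above only covers two 1-D points. As a quick illustration of what the paired_* helpers actually compute (one distance per row pair, rather than the full distance matrix that euclidean_distances returns), here is a minimal sketch that cross-checks the result against a plain NumPy norm:

import numpy as np
from sklearn.metrics.pairwise import paired_euclidean_distances

X = np.array([[0.0], [0.0]])
Y = np.array([[1.0], [2.0]])

# paired_euclidean_distances(X, Y)[i] is the L2 distance between X[i] and Y[i],
# so it matches the row-wise norm of the difference.
manual = np.linalg.norm(X - Y, axis=1)
assert np.allclose(paired_euclidean_distances(X, Y), manual)  # both are [1., 2.]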
Example #2
    def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1) -> float:
        model.eval()

        if epoch != -1:
            if steps == -1:
                out_txt = " after epoch {}:".format(epoch)
            else:
                out_txt = " in epoch {} after {} steps:".format(epoch, steps)
        else:
            out_txt = ":"

        logging.info("Evaluation the model on "+self.name+" dataset"+out_txt)

        num_triplets = 0
        num_correct_cos_triplets, num_correct_manhattan_triplets, num_correct_euclidean_triplets = 0, 0, 0

        self.dataloader.collate_fn = model.smart_batching_collate
        for step, batch in enumerate(tqdm(self.dataloader, desc="Evaluating")):
            features, label_ids = batch_to_device(batch, self.device)
            with torch.no_grad():
                emb1, emb2, emb3 = [model(sent_features)['sentence_embedding'].to("cpu").numpy() for sent_features in features]

            #Cosine distance
            pos_cos_distance = paired_cosine_distances(emb1, emb2)
            neg_cos_distances = paired_cosine_distances(emb1, emb3)

            # Manhattan
            pos_manhattan_distance = paired_manhattan_distances(emb1, emb2)
            neg_manhattan_distances = paired_manhattan_distances(emb1, emb3)

            # Euclidean
            pos_euclidean_distance = paired_euclidean_distances(emb1, emb2)
            neg_euclidean_distances = paired_euclidean_distances(emb1, emb3)

            for idx in range(len(pos_cos_distance)):
                num_triplets += 1

                if pos_cos_distance[idx] < neg_cos_distances[idx]:
                    num_correct_cos_triplets += 1

                if pos_manhattan_distance[idx] < neg_manhattan_distances[idx]:
                    num_correct_manhattan_triplets += 1

                if pos_euclidean_distance[idx] < neg_euclidean_distances[idx]:
                    num_correct_euclidean_triplets += 1



        accuracy_cos = num_correct_cos_triplets / num_triplets
        accuracy_manhattan = num_correct_manhattan_triplets / num_triplets
        accuracy_euclidean = num_correct_euclidean_triplets / num_triplets

        logging.info("Accuracy Cosine Distance:\t{:.4f}".format(accuracy_cos))
        logging.info("Accuracy Manhatten Distance:\t{:.4f}".format(accuracy_manhatten))
        logging.info("Accuracy Euclidean Distance:\t{:.4f}\n".format(accuracy_euclidean))

        if output_path is not None:
            csv_path = os.path.join(output_path, self.csv_file)
            if not os.path.isfile(csv_path):
                with open(csv_path, mode="w", encoding="utf-8") as f:
                    writer = csv.writer(f)
                    writer.writerow(self.csv_headers)
                    writer.writerow([epoch, steps, accuracy_cos, accuracy_manhattan, accuracy_euclidean])

            else:
                store = []
                with open(csv_path, mode="r", encoding="utf-8") as f:
                    read = csv.reader(f)
                    for obj in read:
                        if obj!=[]:
                            store.append(obj)

                with open(csv_path, mode="w", encoding="utf-8") as f:
                    writer = csv.writer(f)
                    for row in store:
                        writer.writerow(row)
                    writer.writerow([epoch, steps, accuracy_cos, accuracy_manhattan, accuracy_euclidean])

        if self.main_distance_function == SimilarityFunction.COSINE:
            return accuracy_cos
        if self.main_distance_function == SimilarityFunction.MANHATTAN:
            return accuracy_manhattan
        if self.main_distance_function == SimilarityFunction.EUCLIDEAN:
            return accuracy_euclidean

        return max(accuracy_cos, accuracy_manhattan, accuracy_euclidean)
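Stripped of the model and batching code, the evaluation above boils down to one rule: a triplet counts as correct when the anchor is closer to the positive than to the negative. A self-contained sketch of that step with made-up embeddings (the helper name and data here are illustrative, not part of the evaluator's API):

import numpy as np
from sklearn.metrics.pairwise import paired_cosine_distances, paired_manhattan_distances, paired_euclidean_distances

rng = np.random.default_rng(0)
anchors = rng.normal(size=(8, 4))
positives = anchors + 0.1 * rng.normal(size=(8, 4))  # perturbed copies, close to the anchors
negatives = rng.normal(size=(8, 4))                   # unrelated vectors

def triplet_accuracy(dist_fn, a, p, n):
    # Correct when the anchor-positive distance is smaller than the anchor-negative distance.
    return float(np.mean(dist_fn(a, p) < dist_fn(a, n)))

for dist_fn in (paired_cosine_distances, paired_manhattan_distances, paired_euclidean_distances):
    print(dist_fn.__name__, triplet_accuracy(dist_fn, anchors, positives, negatives))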
Example #3
    def __call__(self,
                 model,
                 output_path: str = None,
                 epoch: int = -1,
                 steps: int = -1) -> float:
        model.eval()
        embeddings1 = []
        embeddings2 = []
        labels = []

        if epoch != -1:
            if steps == -1:
                out_txt = f" after epoch {epoch}:"
            else:
                out_txt = f" in epoch {epoch} after {steps} steps:"
        else:
            out_txt = ":"

        logging.info("Evaluation the model on " + self.name + " dataset" +
                     out_txt)
        self.dataloader.collate_fn = model.smart_batching_collate
        for step, batch in enumerate(tqdm(self.dataloader, desc="Evaluating")):
            features, label_ids = batch_to_device(batch, self.device)
            with torch.no_grad():
                emb1, emb2 = [
                    model(sent_features)['sentence_embedding'].to(
                        "cpu").numpy() for sent_features in features
                ]

            labels.extend(label_ids.to("cpu").numpy())
            embeddings1.extend(emb1)
            embeddings2.extend(emb2)
        cosine_scores = 1 - paired_cosine_distances(embeddings1, embeddings2)
        manhattan_distances = paired_manhattan_distances(
            embeddings1, embeddings2)
        euclidean_distances = paired_euclidean_distances(
            embeddings1, embeddings2)

        # Ensure labels are just 0 or 1
        for label in labels:
            assert (label == 0 or label == 1)

        labels = np.asarray(labels)
        cosine_acc, cosine_threshold = self.find_best_acc_and_threshold(
            cosine_scores, labels, True)
        manhattan_acc, manhattan_threshold = self.find_best_acc_and_threshold(
            manhattan_distances, labels, False)
        euclidean_acc, euclidean_threshold = self.find_best_acc_and_threshold(
            euclidean_distances, labels, False)

        logging.info(
            "Accuracy with Cosine-Similarity:\t{:.2f}\t(Threshold: {:.4f})".
            format(cosine_acc * 100, cosine_threshold))
        logging.info(
            "Accuracy with Manhattan-Distance:\t{:.2f}\t(Threshold: {:.4f})".
            format(manhattan_acc * 100, manhattan_threshold))
        logging.info(
            "Accuracy with Euclidean-Distance:\t{:.2f}\t(Threshold: {:.4f})\n".
            format(euclidean_acc * 100, euclidean_threshold))

        if output_path is not None:
            csv_path = os.path.join(output_path, self.csv_file)
            if not os.path.isfile(csv_path):
                with open(csv_path, mode="w", encoding="utf-8") as f:
                    writer = csv.writer(f)
                    writer.writerow(self.csv_headers)
                    writer.writerow([
                        epoch, steps, cosine_acc, euclidean_acc, manhattan_acc
                    ])
            else:
                with open(csv_path, mode="a", encoding="utf-8") as f:
                    writer = csv.writer(f)
                    writer.writerow([
                        epoch, steps, cosine_acc, euclidean_acc, manhattan_acc
                    ])

        if self.main_similarity == SimilarityFunction.COSINE:
            return cosine_acc
        elif self.main_similarity == SimilarityFunction.EUCLIDEAN:
            return euclidean_acc
        elif self.main_similarity == SimilarityFunction.MANHATTAN:
            return manhattan_acc
        else:
            raise ValueError("Unknown main_similarity value")
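find_best_acc_and_threshold is not shown in this snippet. A plausible stand-in (an assumption, not the evaluator's actual code) brute-forces the threshold: it tries the midpoint between each pair of consecutive sorted scores and keeps the one with the highest accuracy. The third argument (called high_score_more_similar below, a name assumed here) is True for cosine scores, where larger values indicate a positive pair, and False for distances, where smaller values do:

import numpy as np

def find_best_acc_and_threshold(scores, labels, high_score_more_similar):
    # Hypothetical helper: exhaustively search the threshold that maximizes accuracy.
    scores = np.asarray(scores, dtype=float)
    labels = np.asarray(labels)
    order = np.argsort(scores)
    best_acc, best_threshold = 0.0, float(scores[order[0]])
    for i in range(len(scores) - 1):
        threshold = (scores[order[i]] + scores[order[i + 1]]) / 2
        if high_score_more_similar:
            predictions = scores > threshold   # high score => predict "same" pair
        else:
            predictions = scores < threshold   # low distance => predict "same" pair
        acc = float(np.mean(predictions == (labels == 1)))
        if acc > best_acc:
            best_acc, best_threshold = acc, float(threshold)
    return best_acc, best_threshold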
Example #4
    def __call__(self, model: 'SequentialSentenceEmbedder', output_path: str = None, epoch: int = -1, steps: int = -1) -> float:
        model.eval()
        embeddings1 = []
        embeddings2 = []
        labels = []

        if epoch != -1:
            if steps == -1:
                out_txt = " after epoch {}:".format(epoch)
            else:
                out_txt = " in epoch {} after {} steps:".format(epoch, steps)
        else:
            out_txt = ":"

        logging.info("Evaluation the model on "+self.name+" dataset"+out_txt)

        self.dataloader.collate_fn = model.smart_batching_collate

        iterator = self.dataloader
        if self.show_progress_bar:
            iterator = tqdm(iterator, desc="Convert Evaluating")

        for step, batch in enumerate(iterator):
            features, label_ids = batch_to_device(batch, self.device)
            with torch.no_grad():
                emb1, emb2 = [model(sent_features)['sentence_embedding'].to("cpu").numpy() for sent_features in features]

            labels.extend(label_ids.to("cpu").numpy())
            embeddings1.extend(emb1)
            embeddings2.extend(emb2)

        try:
            cosine_scores = 1 - (paired_cosine_distances(embeddings1, embeddings2))
        except Exception:
            # Print the collected embeddings to help debug shape mismatches, then re-raise.
            print(embeddings1)
            print(embeddings2)
            raise

        manhattan_distances = -paired_manhattan_distances(embeddings1, embeddings2)
        euclidean_distances = -paired_euclidean_distances(embeddings1, embeddings2)
        dot_products = [np.dot(emb1, emb2) for emb1, emb2 in zip(embeddings1, embeddings2)]


        eval_pearson_cosine, _ = pearsonr(labels, cosine_scores)
        eval_spearman_cosine, _ = spearmanr(labels, cosine_scores)

        eval_pearson_manhattan, _ = pearsonr(labels, manhattan_distances)
        eval_spearman_manhattan, _ = spearmanr(labels, manhattan_distances)

        eval_pearson_euclidean, _ = pearsonr(labels, euclidean_distances)
        eval_spearman_euclidean, _ = spearmanr(labels, euclidean_distances)

        eval_pearson_dot, _ = pearsonr(labels, dot_products)
        eval_spearman_dot, _ = spearmanr(labels, dot_products)

        logging.info("Cosine-Similarity :\tPearson: {:.4f}\tSpearman: {:.4f}".format(
            eval_pearson_cosine, eval_spearman_cosine))
        logging.info("Manhattan-Distance:\tPearson: {:.4f}\tSpearman: {:.4f}".format(
            eval_pearson_manhattan, eval_spearman_manhattan))
        logging.info("Euclidean-Distance:\tPearson: {:.4f}\tSpearman: {:.4f}".format(
            eval_pearson_euclidean, eval_spearman_euclidean))
        logging.info("Dot-Product-Similarity:\tPearson: {:.4f}\tSpearman: {:.4f}".format(
            eval_pearson_dot, eval_spearman_dot))

        if output_path is not None:
            csv_path = os.path.join(output_path, self.csv_file)
            output_file_exists = os.path.isfile(csv_path)
            with open(csv_path, mode="a" if output_file_exists else 'w', encoding="utf-8") as f:
                writer = csv.writer(f)
                if not output_file_exists:
                    writer.writerow(self.csv_headers)

                writer.writerow([epoch, steps, eval_pearson_cosine, eval_spearman_cosine, eval_pearson_euclidean,
                                 eval_spearman_euclidean, eval_pearson_manhattan, eval_spearman_manhattan, eval_pearson_dot, eval_spearman_dot])


        if self.main_similarity == SimilarityFunction.COSINE:
            return eval_spearman_cosine
        elif self.main_similarity == SimilarityFunction.EUCLIDEAN:
            return eval_spearman_euclidean
        elif self.main_similarity == SimilarityFunction.MANHATTAN:
            return eval_spearman_manhattan
        elif self.main_similarity == SimilarityFunction.DOT_PRODUCT:
            return eval_spearman_dot
        elif self.main_similarity is None:
            return max(eval_spearman_cosine, eval_spearman_manhattan, eval_spearman_euclidean, eval_spearman_dot)
        else:
            raise ValueError("Unknown main_similarity value")
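Note that the Manhattan and Euclidean distances above are negated before the correlations are computed, so that for every score higher means more similar, matching the cosine scores and dot products and keeping the sign of the Pearson/Spearman values comparable. A small self-contained sketch of that metric step on toy data (random embeddings and labels, purely illustrative):

import numpy as np
from scipy.stats import pearsonr, spearmanr
from sklearn.metrics.pairwise import paired_cosine_distances, paired_euclidean_distances

rng = np.random.default_rng(0)
embeddings1 = rng.normal(size=(16, 8))
embeddings2 = rng.normal(size=(16, 8))
gold_scores = rng.uniform(0, 5, size=16)   # stand-in for STS-style similarity labels

cosine_scores = 1 - paired_cosine_distances(embeddings1, embeddings2)
euclidean_scores = -paired_euclidean_distances(embeddings1, embeddings2)  # negated: higher = closer

for name, scores in [("cosine", cosine_scores), ("euclidean", euclidean_scores)]:
    pearson, _ = pearsonr(gold_scores, scores)
    spearman, _ = spearmanr(gold_scores, scores)
    print("{}: Pearson={:.4f} Spearman={:.4f}".format(name, pearson, spearman))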
Example #6
    def __call__(self,
                 model,
                 output_path: str = None,
                 epoch: int = -1,
                 steps: int = -1) -> float:

        if epoch != -1:
            if steps == -1:
                out_txt = f" after epoch {epoch}:"
            else:
                out_txt = f" in epoch {epoch} after {steps} steps:"
        else:
            out_txt = ":"

        logger.info("Binary Accuracy Evaluation of the model on " + self.name +
                    " dataset" + out_txt)
        embeddings1 = model.encode(self.sentences1,
                                   batch_size=self.batch_size,
                                   show_progress_bar=self.show_progress_bar,
                                   convert_to_numpy=True)
        embeddings2 = model.encode(self.sentences2,
                                   batch_size=self.batch_size,
                                   show_progress_bar=self.show_progress_bar,
                                   convert_to_numpy=True)

        cosine_scores = 1 - paired_cosine_distances(embeddings1, embeddings2)
        manhattan_distances = paired_manhattan_distances(
            embeddings1, embeddings2)
        euclidean_distances = paired_euclidean_distances(
            embeddings1, embeddings2)

        labels = np.asarray(self.labels)

        file_output_data = [epoch, steps]

        main_score = None
        for name, scores, reverse in [
                ['Cosine-Similarity', cosine_scores, True],
                ['Manhattan-Distance', manhattan_distances, False],
                ['Euclidean-Distance', euclidean_distances, False]]:
            acc, acc_threshold = self.find_best_acc_and_threshold(
                scores, labels, reverse)
            f1, precision, recall, f1_threshold = self.find_best_f1_and_threshold(
                scores, labels, reverse)
            ap = average_precision_score(labels,
                                         scores * (1 if reverse else -1))

            logger.info(
                "Accuracy with {}:           {:.2f}\t(Threshold: {:.4f})".
                format(name, acc * 100, acc_threshold))
            logger.info(
                "F1 with {}:                 {:.2f}\t(Threshold: {:.4f})".
                format(name, f1 * 100, f1_threshold))
            logger.info("Precision with {}:          {:.2f}".format(
                name, precision * 100))
            logger.info("Recall with {}:             {:.2f}".format(
                name, recall * 100))
            logger.info("Average Precision with {}:  {:.2f}\n".format(
                name, ap * 100))

            file_output_data.extend(
                [acc, acc_threshold, f1, precision, recall, f1_threshold, ap])

            if main_score is None:  #Use AveragePrecision with Cosine-Similarity as main score
                main_score = ap

        if output_path is not None:
            csv_path = os.path.join(output_path, self.csv_file)
            if not os.path.isfile(csv_path):
                with open(csv_path, mode="w", encoding="utf-8") as f:
                    writer = csv.writer(f)
                    writer.writerow(self.csv_headers)
                    writer.writerow(file_output_data)
            else:
                with open(csv_path, mode="a", encoding="utf-8") as f:
                    writer = csv.writer(f)
                    writer.writerow(file_output_data)

        return main_score
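average_precision_score expects scores where larger means "more likely a positive pair", which is why the distance-based scores are multiplied by -1 above (the 1 if reverse else -1 factor). A toy check of that sign handling (illustrative values only):

import numpy as np
from sklearn.metrics import average_precision_score

labels = np.array([1, 0, 1, 0])
similarities = np.array([0.9, 0.8, 0.4, 0.1])  # higher = more similar
distances = 1 - similarities                    # higher = less similar

# Negating the distances restores the similarity ranking, so the average precision
# matches the one computed from the similarities directly.
assert average_precision_score(labels, similarities) == average_precision_score(labels, -distances)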