Example 1
    def __call__(self,
                 model,
                 output_path: str = None,
                 epoch: int = -1,
                 steps: int = -1) -> float:
        if epoch != -1:
            if steps == -1:
                out_txt = " after epoch {}:".format(epoch)
            else:
                out_txt = " in epoch {} after {} steps:".format(epoch, steps)
        else:
            out_txt = ":"

        # logging.info("Evaluation the model on " + self.name + " dataset" + out_txt)

        embeddings1 = model.encode(self.sentences1,
                                   batch_size=self.batch_size,
                                   show_progress_bar=self.show_progress_bar,
                                   convert_to_tensor=True)
        embeddings2 = model.encode(self.sentences2,
                                   batch_size=self.batch_size,
                                   show_progress_bar=self.show_progress_bar,
                                   convert_to_tensor=True)
        num_pairs = embeddings1.shape[0]

        embed = []
        meta_list = []
        num = 200
        if self.summary_path:
            meta_list.extend([
                "idx-{}<S1>{}".format(i, s1) for (i, s1) in zip(
                    range(min(num_pairs, num)), self.sentences1[:num])
            ])
            embed.append(embeddings1[:num, :])
            meta_list.extend([
                "idx-{}<S2>{}".format(i, s2) for (i, s2) in zip(
                    range(min(num_pairs, num)), self.sentences2[:num])
            ])
            embed.append(embeddings2[:num, :])
        embeddings = whitening_torch_final(
            torch.cat([embeddings1, embeddings2], dim=0))
        embeddings1 = embeddings[:num_pairs, :]
        embeddings2 = embeddings[num_pairs:, :]
        if self.summary_path:
            meta_list.extend([
                "white-idx-{}<WS1>{}".format(i, s1) for (i, s1) in zip(
                    range(min(num_pairs, num)), self.sentences1[:num])
            ])
            embed.append(embeddings1[:num, :])
            meta_list.extend([
                "white-idx-{}<WS2>{}".format(i, s2) for (i, s2) in zip(
                    range(min(num_pairs, num)), self.sentences2[:num])
            ])
            embed.append(embeddings2[:num, :])
            embed = torch.cat(embed, dim=0)
            self.writer.add_embedding(embed,
                                      metadata=meta_list,
                                      tag="all{}".format(num * 4))
        embeddings1 = embeddings1[:self.measure_data_num, :self.embed_dim]
        embeddings2 = embeddings2[:self.measure_data_num, :self.embed_dim]
        labels = self.scores[:self.measure_data_num]

        if self.intra_diversity:
            intra_div = self.compute_intra_diversity(embeddings1, embeddings2)
            logging.info("IntraDiversity on " + self.name + out_txt +
                         ": {:.4f}".format(intra_div))
            return intra_div

        cosine_scores = 1 - (paired_cosine_distances(embeddings1, embeddings2))
        manhattan_distances = -paired_manhattan_distances(
            embeddings1, embeddings2)
        euclidean_distances = -paired_euclidean_distances(
            embeddings1, embeddings2)
        dot_products = [
            np.dot(emb1, emb2) for emb1, emb2 in zip(embeddings1, embeddings2)
        ]

        eval_pearson_cosine, _ = pearsonr(labels, cosine_scores)
        eval_spearman_cosine, _ = spearmanr(labels, cosine_scores)

        eval_pearson_manhattan, _ = pearsonr(labels, manhattan_distances)
        eval_spearman_manhattan, _ = spearmanr(labels, manhattan_distances)

        eval_pearson_euclidean, _ = pearsonr(labels, euclidean_distances)
        eval_spearman_euclidean, _ = spearmanr(labels, euclidean_distances)

        eval_pearson_dot, _ = pearsonr(labels, dot_products)
        eval_spearman_dot, _ = spearmanr(labels, dot_products)

        logging.info("Eval on " + self.name + out_txt +
                     "Cosine :\tPearson: {:.4f}\tSpearman: {:.4f}".format(
                         eval_pearson_cosine, eval_spearman_cosine))
        # logging.info("Manhattan-Distance:\tPearson: {:.4f}\tSpearman: {:.4f}".format(
        #     eval_pearson_manhattan, eval_spearman_manhattan))
        # logging.info("Euclidean-Distance:\tPearson: {:.4f}\tSpearman: {:.4f}".format(
        #     eval_pearson_euclidean, eval_spearman_euclidean))
        # logging.info("Dot-Product-Similarity:\tPearson: {:.4f}\tSpearman: {:.4f}".format(
        #     eval_pearson_dot, eval_spearman_dot))
        # logging.info("Eval on "+self.name+out_txt+"Cosine3 :\tPearson: {:.4f}\tSpearman: {:.4f}".format(
        #     eval_pearson_cosine3, eval_spearman_cosine3))

        if output_path is not None:
            csv_path = os.path.join(output_path, self.csv_file)
            output_file_exists = os.path.isfile(csv_path)
            with open(csv_path,
                      mode="a" if output_file_exists else 'w',
                      encoding="utf-8") as f:
                writer = csv.writer(f)
                if not output_file_exists:
                    writer.writerow(self.csv_headers)

                writer.writerow([
                    epoch, steps, eval_pearson_cosine, eval_spearman_cosine,
                    eval_pearson_euclidean, eval_spearman_euclidean,
                    eval_pearson_manhattan, eval_spearman_manhattan,
                    eval_pearson_dot, eval_spearman_dot
                ])

        if self.main_similarity == SimilarityFunction.COSINE:
            return eval_spearman_cosine
        elif self.main_similarity == SimilarityFunction.EUCLIDEAN:
            return eval_spearman_euclidean
        elif self.main_similarity == SimilarityFunction.MANHATTAN:
            return eval_spearman_manhattan
        elif self.main_similarity == SimilarityFunction.DOT_PRODUCT:
            return eval_spearman_dot
        elif self.main_similarity is None:
            return max(eval_spearman_cosine, eval_spearman_manhattan,
                       eval_spearman_euclidean, eval_spearman_dot)
        else:
            raise ValueError("Unknown main_similarity value")
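The snippet above depends on an external `whitening_torch_final` helper that is not shown. As a point of reference, below is a minimal sketch of the standard whitening transform for sentence embeddings (mean removal followed by an SVD-based rotation of the covariance); the name `whitening_torch_sketch` is hypothetical and the original helper may differ in details such as covariance normalization or dimensionality truncation.

import torch

def whitening_torch_sketch(embeddings: torch.Tensor) -> torch.Tensor:
    # Hypothetical stand-in for whitening_torch_final: centre the embeddings,
    # estimate the covariance, and map it towards the identity using
    # W = U @ diag(1 / sqrt(S)) from the SVD of the covariance matrix.
    mu = embeddings.mean(dim=0, keepdim=True)
    centered = embeddings - mu
    cov = centered.t() @ centered
    u, s, _ = torch.linalg.svd(cov)
    w = u @ torch.diag(1.0 / torch.sqrt(s))
    return centered @ w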
Example 2
    def __call__(self,
                 model,
                 output_path: str = None,
                 epoch: int = -1,
                 steps: int = -1) -> float:
        if epoch != -1:
            if steps == -1:
                out_txt = " after epoch {}:".format(epoch)
            else:
                out_txt = " in epoch {} after {} steps:".format(epoch, steps)
        else:
            out_txt = ":"

        logging.info("TripletEvaluator: Evaluating the model on " + self.name +
                     " dataset" + out_txt)

        num_triplets = 0
        num_correct_cos_triplets, num_correct_manhatten_triplets, num_correct_euclidean_triplets = 0, 0, 0

        embeddings_anchors = model.encode(
            self.anchors,
            batch_size=self.batch_size,
            show_progress_bar=self.show_progress_bar,
            convert_to_numpy=True)
        embeddings_positives = model.encode(
            self.positives,
            batch_size=self.batch_size,
            show_progress_bar=self.show_progress_bar,
            convert_to_numpy=True)
        embeddings_negatives = model.encode(
            self.negatives,
            batch_size=self.batch_size,
            show_progress_bar=self.show_progress_bar,
            convert_to_numpy=True)

        #Cosine distance
        pos_cos_distance = paired_cosine_distances(embeddings_anchors,
                                                   embeddings_positives)
        neg_cos_distances = paired_cosine_distances(embeddings_anchors,
                                                    embeddings_negatives)

        # Manhattan
        pos_manhatten_distance = paired_manhattan_distances(
            embeddings_anchors, embeddings_positives)
        neg_manhatten_distances = paired_manhattan_distances(
            embeddings_anchors, embeddings_negatives)

        # Euclidean
        pos_euclidean_distance = paired_euclidean_distances(
            embeddings_anchors, embeddings_positives)
        neg_euclidean_distances = paired_euclidean_distances(
            embeddings_anchors, embeddings_negatives)

        for idx in range(len(pos_cos_distance)):
            num_triplets += 1

            if pos_cos_distance[idx] < neg_cos_distances[idx]:
                num_correct_cos_triplets += 1

            if pos_manhatten_distance[idx] < neg_manhatten_distances[idx]:
                num_correct_manhatten_triplets += 1

            if pos_euclidean_distance[idx] < neg_euclidean_distances[idx]:
                num_correct_euclidean_triplets += 1

        accuracy_cos = num_correct_cos_triplets / num_triplets
        accuracy_manhatten = num_correct_manhatten_triplets / num_triplets
        accuracy_euclidean = num_correct_euclidean_triplets / num_triplets

        logging.info("Accuracy Cosine Distance:   \t{:.2f}".format(
            accuracy_cos * 100))
        logging.info("Accuracy Manhatten Distance:\t{:.2f}".format(
            accuracy_manhatten * 100))
        logging.info("Accuracy Euclidean Distance:\t{:.2f}\n".format(
            accuracy_euclidean * 100))

        if output_path is not None:
            csv_path = os.path.join(output_path, self.csv_file)
            if not os.path.isfile(csv_path):
                with open(csv_path, mode="w", encoding="utf-8") as f:
                    writer = csv.writer(f)
                    writer.writerow(self.csv_headers)
                    writer.writerow([
                        epoch, steps, accuracy_cos, accuracy_manhatten,
                        accuracy_euclidean
                    ])

            else:
                with open(csv_path, mode="a", encoding="utf-8") as f:
                    writer = csv.writer(f)
                    writer.writerow([
                        epoch, steps, accuracy_cos, accuracy_manhatten,
                        accuracy_euclidean
                    ])

        if self.main_distance_function == SimilarityFunction.COSINE:
            return accuracy_cos
        if self.main_distance_function == SimilarityFunction.MANHATTAN:
            return accuracy_manhatten
        if self.main_distance_function == SimilarityFunction.EUCLIDEAN:
            return accuracy_euclidean

        return max(accuracy_cos, accuracy_manhatten, accuracy_euclidean)
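The per-index loop above simply counts, for each distance type, how often the anchor is closer to the positive than to the negative. A vectorized sketch of the same computation, assuming the paired distance arrays are NumPy arrays:

import numpy as np

def triplet_accuracy(pos_distances: np.ndarray, neg_distances: np.ndarray) -> float:
    # Fraction of triplets where the anchor-positive distance is smaller
    # than the anchor-negative distance.
    return float(np.mean(pos_distances < neg_distances))

# e.g. accuracy_cos = triplet_accuracy(pos_cos_distance, neg_cos_distances)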
Example 3
    def __call__(self,
                 model,
                 output_path: str = None,
                 epoch: int = -1,
                 steps: int = -1) -> float:

        if epoch != -1:
            if steps == -1:
                out_txt = f" after epoch {epoch}:"
            else:
                out_txt = f" in epoch {epoch} after {steps} steps:"
        else:
            out_txt = ":"

        logging.info("Binary Accuracy Evaluation of the model on " +
                     self.name + " dataset" + out_txt)
        embeddings1 = model.encode(self.sentences1,
                                   batch_size=self.batch_size,
                                   show_progress_bar=self.show_progress_bar,
                                   convert_to_numpy=True)
        embeddings2 = model.encode(self.sentences2,
                                   batch_size=self.batch_size,
                                   show_progress_bar=self.show_progress_bar,
                                   convert_to_numpy=True)

        cosine_scores = 1 - paired_cosine_distances(embeddings1, embeddings2)
        manhattan_distances = paired_manhattan_distances(
            embeddings1, embeddings2)
        euclidean_distances = paired_euclidean_distances(
            embeddings1, embeddings2)
        dot_products = [
            np.dot(emb1, emb2) for emb1, emb2 in zip(embeddings1, embeddings2)
        ]

        labels = np.asarray(self.labels)
        cosine_acc, cosine_threshold = self.find_best_acc_and_threshold(
            cosine_scores, labels, True)
        manhattan_acc, manhatten_threshold = self.find_best_acc_and_threshold(
            manhattan_distances, labels, False)
        euclidean_acc, euclidean_threshold = self.find_best_acc_and_threshold(
            euclidean_distances, labels, False)
        dot_acc, dot_threshold = self.find_best_acc_and_threshold(
            dot_products, labels, False)

        logging.info(
            "Accuracy with Cosine-Similarity:\t{:.2f}\t(Threshold: {:.4f})".
            format(cosine_acc * 100, cosine_threshold))
        logging.info(
            "Accuracy with Manhattan-Distance:\t{:.2f}\t(Threshold: {:.4f})".
            format(manhattan_acc * 100, manhatten_threshold))
        logging.info(
            "Accuracy with Euclidean-Distance:\t{:.2f}\t(Threshold: {:.4f})".
            format(euclidean_acc * 100, euclidean_threshold))
        logging.info(
            "Accuracy with Dot-Product:\t{:.2f}\t(Threshold: {:.4f})\n".format(
                dot_acc * 100, dot_threshold))

        if output_path is not None:
            csv_path = os.path.join(output_path, self.csv_file)
            if not os.path.isfile(csv_path):
                with open(csv_path, mode="w", encoding="utf-8") as f:
                    writer = csv.writer(f)
                    writer.writerow(self.csv_headers)
                    writer.writerow([
                        epoch, steps, cosine_acc, cosine_threshold,
                        manhattan_acc, manhatten_threshold, euclidean_acc,
                        euclidean_threshold, dot_acc, dot_threshold
                    ])
            else:
                with open(csv_path, mode="a", encoding="utf-8") as f:
                    writer = csv.writer(f)
                    writer.writerow([
                        epoch, steps, cosine_acc, cosine_threshold,
                        manhattan_acc, manhatten_threshold, euclidean_acc,
                        euclidean_threshold, dot_acc, dot_threshold
                    ])

        if self.main_similarity == SimilarityFunction.COSINE:
            return cosine_acc
        elif self.main_similarity == SimilarityFunction.EUCLIDEAN:
            return euclidean_acc
        elif self.main_similarity == SimilarityFunction.MANHATTAN:
            return manhattan_acc
        else:
            raise ValueError("Unknown main_similarity value")
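The evaluator above relies on `self.find_best_acc_and_threshold`, which is not shown. A sketch of how such a helper is commonly implemented: sweep candidate thresholds at the midpoints of consecutive sorted scores and keep the most accurate one. This is only an illustration; the actual method on this class may differ.

import numpy as np

def find_best_acc_and_threshold(scores, labels, high_score_more_similar: bool):
    # Sort pairs so that "predicted similar" is a prefix of the list,
    # then try every cut point and keep the most accurate threshold.
    rows = sorted(zip(scores, labels), key=lambda r: r[0], reverse=high_score_more_similar)
    labels = np.asarray(labels)
    total = len(rows)
    total_positives = int(labels.sum())

    best_acc, best_threshold = 0.0, -1.0
    positives_seen = 0
    for i in range(total - 1):
        _, label = rows[i]
        positives_seen += int(label)
        # Items 0..i are predicted "similar", the rest "dissimilar".
        correct = positives_seen + (total - i - 1) - (total_positives - positives_seen)
        acc = correct / total
        if acc > best_acc:
            best_acc = acc
            best_threshold = (rows[i][0] + rows[i + 1][0]) / 2
    return best_acc, best_threshold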
Example 4
def get_tfidf_count_hash_features(ql, qr, vectorModels, signature=""):
    texts_ql = []
    texts_qr = []
    
    res = ["%s" %(token.term) for token in ql.basic_words]
    texts_ql.append(" ".join(res))
    
    res = ["%s" %(token.term) for token in qr.basic_words]
    texts_qr.append(" ".join(res))

    feature_dict = {}
    
    (ql_tfidf1, ql_count1, ql_hash1_18, ql_hash1_20, ql_hash2_18, ql_hash2_20,
     qr_tfidf1, qr_count1, qr_hash1_18, qr_hash1_20, qr_hash2_18,
     qr_hash2_20) = get_count_tfidf_hash(texts_ql, texts_qr, vectorModels)
    
    if signature:
        signature = signature + "_"
        
    tfidf1_PED = paired_euclidean_distances(ql_tfidf1, qr_tfidf1)
    feature_dict[signature + 'tfidf1_PED'] = float(tfidf1_PED[0])

    count1_PED = paired_euclidean_distances(ql_count1, qr_count1)
    feature_dict[signature + 'count1_PED'] = float(count1_PED[0])

    hash1_18_PED = paired_euclidean_distances(ql_hash1_18, qr_hash1_18)
    feature_dict[signature + 'hash1_18_PED'] = float(hash1_18_PED[0])

    hash1_20_PED = paired_euclidean_distances(ql_hash1_20, qr_hash1_20)
    feature_dict[signature + 'hash1_20_PED'] = float(hash1_20_PED[0])
    # ------------------------------------------------------------------

    tfidf1_PCD = paired_cosine_distances(ql_tfidf1, qr_tfidf1)
    feature_dict[signature + 'tfidf1_PCD'] = float(tfidf1_PCD[0])

    count1_PCD = paired_cosine_distances(ql_count1, qr_count1)
    feature_dict[signature + 'count1_PCD'] = float(count1_PCD[0])

    hash1_18_PCD = paired_cosine_distances(ql_hash1_18, qr_hash1_18)
    feature_dict[signature + 'hash1_18_PCD'] = float(hash1_18_PCD[0])

    hash1_20_PCD = paired_cosine_distances(ql_hash1_20, qr_hash1_20)
    feature_dict[signature + 'hash1_20_PCD'] = float(hash1_20_PCD[0])

    hash2_18_PCD = paired_cosine_distances(ql_hash2_18, qr_hash2_18)
    feature_dict[signature + 'hash2_18_PCD'] = float(hash2_18_PCD[0])

    hash2_20_PCD = paired_cosine_distances(ql_hash2_20, qr_hash2_20)
    feature_dict[signature + 'hash2_20_PCD'] = float(hash2_20_PCD[0])
    # ------------------------------------------------------------------

    tfidf1_PMD = paired_manhattan_distances(ql_tfidf1, qr_tfidf1)
    feature_dict[signature + 'tfidf1_PMD'] = float(tfidf1_PMD[0])

    count1_PMD = paired_manhattan_distances(ql_count1, qr_count1)
    feature_dict[signature + 'count1_PMD'] = float(count1_PMD[0])

    hash1_18_PMD = paired_manhattan_distances(ql_hash1_18, qr_hash1_18)
    feature_dict[signature + 'hash1_18_PMD'] = float(hash1_18_PMD[0])

    hash1_20_PMD = paired_manhattan_distances(ql_hash1_20, qr_hash1_20)
    feature_dict[signature + 'hash1_20_PMD'] = float(hash1_20_PMD[0])

    hash2_18_PMD = paired_manhattan_distances(ql_hash2_18, qr_hash2_18)
    feature_dict[signature + 'hash2_18_PMD'] = float(hash2_18_PMD[0])

    hash2_20_PMD = paired_manhattan_distances(ql_hash2_20, qr_hash2_20)
    feature_dict[signature + 'hash2_20_PMD'] = float(hash2_20_PMD[0])
    
    return feature_dict
Example 5
    def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1) -> float:
        if epoch != -1:
            if steps == -1:
                out_txt = " after epoch {}:".format(epoch)
            else:
                out_txt = " in epoch {} after {} steps:".format(epoch, steps)
        else:
            out_txt = ":"

        logger.info("EmbeddingSimilarityEvaluator: Evaluating the model on " + self.name + " dataset" + out_txt)

        embeddings1 = model.encode(self.sentences1, batch_size=self.batch_size, show_progress_bar=self.show_progress_bar, convert_to_numpy=True)
        embeddings2 = model.encode(self.sentences2, batch_size=self.batch_size, show_progress_bar=self.show_progress_bar, convert_to_numpy=True)
        labels = self.scores

        cosine_scores = 1 - (paired_cosine_distances(embeddings1, embeddings2))
        manhattan_distances = -paired_manhattan_distances(embeddings1, embeddings2)
        euclidean_distances = -paired_euclidean_distances(embeddings1, embeddings2)
        dot_products = [np.dot(emb1, emb2) for emb1, emb2 in zip(embeddings1, embeddings2)]


        eval_pearson_cosine, _ = pearsonr(labels, cosine_scores)
        eval_spearman_cosine, _ = spearmanr(labels, cosine_scores)

        eval_pearson_manhattan, _ = pearsonr(labels, manhattan_distances)
        eval_spearman_manhattan, _ = spearmanr(labels, manhattan_distances)

        eval_pearson_euclidean, _ = pearsonr(labels, euclidean_distances)
        eval_spearman_euclidean, _ = spearmanr(labels, euclidean_distances)

        eval_pearson_dot, _ = pearsonr(labels, dot_products)
        eval_spearman_dot, _ = spearmanr(labels, dot_products)

        logger.info("Cosine-Similarity :\tPearson: {:.4f}\tSpearman: {:.4f}".format(
            eval_pearson_cosine, eval_spearman_cosine))
        logger.info("Manhattan-Distance:\tPearson: {:.4f}\tSpearman: {:.4f}".format(
            eval_pearson_manhattan, eval_spearman_manhattan))
        logger.info("Euclidean-Distance:\tPearson: {:.4f}\tSpearman: {:.4f}".format(
            eval_pearson_euclidean, eval_spearman_euclidean))
        logger.info("Dot-Product-Similarity:\tPearson: {:.4f}\tSpearman: {:.4f}".format(
            eval_pearson_dot, eval_spearman_dot))

        if output_path is not None:
            csv_path = os.path.join(output_path, self.csv_file)
            output_file_exists = os.path.isfile(csv_path)
            with open(csv_path, mode="a" if output_file_exists else 'w', encoding="utf-8") as f:
                writer = csv.writer(f)
                if not output_file_exists:
                    writer.writerow(self.csv_headers)

                writer.writerow([epoch, steps, eval_pearson_cosine, eval_spearman_cosine, eval_pearson_euclidean,
                                 eval_spearman_euclidean, eval_pearson_manhattan, eval_spearman_manhattan, eval_pearson_dot, eval_spearman_dot])


        if self.main_similarity == SimilarityFunction.COSINE:
            return eval_spearman_cosine
        elif self.main_similarity == SimilarityFunction.EUCLIDEAN:
            return eval_spearman_euclidean
        elif self.main_similarity == SimilarityFunction.MANHATTAN:
            return eval_spearman_manhattan
        elif self.main_similarity == SimilarityFunction.DOT_PRODUCT:
            return eval_spearman_dot
        elif self.main_similarity is None:
            return max(eval_spearman_cosine, eval_spearman_manhattan, eval_spearman_euclidean, eval_spearman_dot)
        else:
            raise ValueError("Unknown main_similarity value")
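For context, this snippet matches the shape of the standard sentence-transformers `EmbeddingSimilarityEvaluator.__call__` (its log line names that class). Below is a hedged usage sketch, assuming the evaluator is constructed from parallel sentence lists and gold similarity scores as in the upstream library; the sentences, scores, and model name are illustrative.

from sentence_transformers import SentenceTransformer
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator

# Illustrative data: parallel sentence pairs with gold similarity scores.
sentences1 = ["A man is eating food.", "A plane is taking off."]
sentences2 = ["A man is eating a meal.", "An airplane is departing."]
gold_scores = [0.9, 0.95]

model = SentenceTransformer("all-MiniLM-L6-v2")
evaluator = EmbeddingSimilarityEvaluator(sentences1, sentences2, gold_scores, name="sts-dev")
main_score = evaluator(model)  # returns the Spearman correlation of the main similarity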
Example 6
ft = FastTextKeyedVectors.load("D:/fasttext_300_3_polish.bin")
models[f"CBOW-FT"] = Average(ft, lang_freq="pl")
models[f"SIF-FT"] = SIF(ft, components=10)
models[f"uSIF-FT"] = uSIF(ft, length=11)


s=models[f"uSIF-W2V"]
s.sv[0]

cs, md, ed = [],[],[]
for i, j in zip(range(task_length), range(task_length, 2*task_length)):
    temp1 = s.sv[i].reshape(1, -1)
    temp2 = s.sv[j].reshape(1, -1)
    cs.append((1 - (paired_cosine_distances(temp1, temp2)))[0])
    md.append(-paired_manhattan_distances(temp1, temp2)[0])
    ed.append(-paired_euclidean_distances(temp1, temp2)[0])


eval_pearson_cosine, _ = pearsonr(similarities, cs)
eval_spearman_cosine, _ = spearmanr(similarities, cs)
eval_pearson_manhattan, _ = pearsonr(similarities, md)
eval_spearman_manhattan, _ = spearmanr(similarities, md)
eval_pearson_euclidean, _ = pearsonr(similarities, ed)
eval_spearman_euclidean, _ = spearmanr(similarities, ed)

def compute_similarities(task_length, model):
    sims = []
    for i, j in zip(range(task_length), range(task_length, 2*task_length)):
        sims.append(model.sv.similarity(i,j))
    print(sims)
Example 7
    def __call__(self,
                 model,
                 output_path: str = None,
                 epoch: int = -1,
                 steps: int = -1) -> float:
        model.eval()

        if epoch != -1:
            if steps == -1:
                out_txt = f" after epoch {epoch}:"
            else:
                out_txt = f" in epoch {epoch} after {steps} steps:"
        else:
            out_txt = ":"

        logging.info("Evaluation the model on " + self.name + " dataset" +
                     out_txt)

        num_triplets = 0
        num_correct_cos_triplets, num_correct_manhatten_triplets, num_correct_euclidean_triplets = 0, 0, 0

        self.dataloader.collate_fn = model.smart_batching_collate
        for step, batch in enumerate(tqdm(self.dataloader, desc="Evaluating")):
            features, label_ids = batch_to_device(batch, model.device)
            with torch.no_grad():
                emb1, emb2, emb3 = [
                    model(sent_features)['sentence_embedding'].to(
                        "cpu").numpy() for sent_features in features
                ]

            #Cosine distance
            pos_cos_distance = paired_cosine_distances(emb1, emb2)
            neg_cos_distances = paired_cosine_distances(emb1, emb3)

            # Manhattan
            pos_manhatten_distance = paired_manhattan_distances(emb1, emb2)
            neg_manhatten_distances = paired_manhattan_distances(emb1, emb3)

            # Euclidean
            pos_euclidean_distance = paired_euclidean_distances(emb1, emb2)
            neg_euclidean_distances = paired_euclidean_distances(emb1, emb3)

            for idx in range(len(pos_cos_distance)):
                num_triplets += 1

                if pos_cos_distance[idx] < neg_cos_distances[idx]:
                    num_correct_cos_triplets += 1

                if pos_manhatten_distance[idx] < neg_manhatten_distances[idx]:
                    num_correct_manhatten_triplets += 1

                if pos_euclidean_distance[idx] < neg_euclidean_distances[idx]:
                    num_correct_euclidean_triplets += 1

        accuracy_cos = num_correct_cos_triplets / num_triplets
        accuracy_manhatten = num_correct_manhatten_triplets / num_triplets
        accuracy_euclidean = num_correct_euclidean_triplets / num_triplets

        logging.info("Accuracy Cosine Distance:\t{:.4f}".format(accuracy_cos))
        logging.info(
            "Accuracy Manhatten Distance:\t{:.4f}".format(accuracy_manhatten))
        logging.info("Accuracy Euclidean Distance:\t{:.4f}\n".format(
            accuracy_euclidean))

        if output_path is not None:
            csv_path = os.path.join(output_path, self.csv_file)
            if not os.path.isfile(csv_path):
                with open(csv_path, mode="w", encoding="utf-8") as f:
                    writer = csv.writer(f)
                    writer.writerow(self.csv_headers)
                    writer.writerow([
                        epoch, steps, accuracy_cos, accuracy_manhatten,
                        accuracy_euclidean
                    ])

            else:
                with open(csv_path, mode="a", encoding="utf-8") as f:
                    writer = csv.writer(f)
                    writer.writerow([
                        epoch, steps, accuracy_cos, accuracy_manhatten,
                        accuracy_euclidean
                    ])

        if self.main_distance_function == SimilarityFunction.COSINE:
            return accuracy_cos
        if self.main_distance_function == SimilarityFunction.MANHATTAN:
            return accuracy_manhatten
        if self.main_distance_function == SimilarityFunction.EUCLIDEAN:
            return accuracy_euclidean

        return max(accuracy_cos, accuracy_manhatten, accuracy_euclidean)
Example 8
def test_paired_manhattan_distances():
    # Check the paired manhattan distances computation
    X = [[0], [0]]
    Y = [[1], [2]]
    D = paired_manhattan_distances(X, Y)
    assert_array_almost_equal(D, [1., 2.])
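Note that the `paired_*` functions compute one distance per row pair (X[i], Y[i]), unlike the full pairwise `manhattan_distances`, which compares every row of X with every row of Y. A small illustration of the difference (the arrays are the same as in the test above):

import numpy as np
from sklearn.metrics.pairwise import manhattan_distances, paired_manhattan_distances

X = np.array([[0.0], [0.0]])
Y = np.array([[1.0], [2.0]])

print(paired_manhattan_distances(X, Y))  # [1. 2.]  -> one value per (X[i], Y[i]) pair
print(manhattan_distances(X, Y))         # [[1. 2.]
                                         #  [1. 2.]] -> all (X[i], Y[j]) combinations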
Example 9
    def __call__(self,
                 model: 'SequentialSentenceEmbedder',
                 output_path: str = None,
                 epoch: int = -1,
                 steps: int = -1,
                 additional_evaluator: Callable[[], float] = None) -> float:
        model.eval()

        if epoch != -1:
            if steps == -1:
                out_txt = f" after epoch {epoch}:"
            else:
                out_txt = f" in epoch {epoch} after {steps} steps:"
        else:
            out_txt = ":"

        logging.info("Evaluation the model on " + self.name + " dataset" +
                     out_txt)

        self.dataloader.collate_fn = model.smart_batching_collate

        iterator = self.dataloader
        if self.show_progress_bar:
            iterator = tqdm(iterator, desc="Convert Evaluating")

        embeddings1, embeddings2, labels = paired_embeddings_for_dataloader(
            self.device, model, iterator)

        try:
            cosine_scores = 1 - (paired_cosine_distances(
                embeddings1, embeddings2))
        except Exception as e:
            print(embeddings1)
            print(embeddings2)
            raise e

        manhattan_distances = -paired_manhattan_distances(
            embeddings1, embeddings2)
        euclidean_distances = -paired_euclidean_distances(
            embeddings1, embeddings2)
        dot_products = [
            np.dot(emb1, emb2) for emb1, emb2 in zip(embeddings1, embeddings2)
        ]

        eval_pearson_cosine, _ = pearsonr(labels, cosine_scores)
        eval_spearman_cosine, _ = spearmanr(labels, cosine_scores)

        eval_pearson_manhattan, _ = pearsonr(labels, manhattan_distances)
        eval_spearman_manhattan, _ = spearmanr(labels, manhattan_distances)

        eval_pearson_euclidean, _ = pearsonr(labels, euclidean_distances)
        eval_spearman_euclidean, _ = spearmanr(labels, euclidean_distances)

        eval_pearson_dot, _ = pearsonr(labels, dot_products)
        eval_spearman_dot, _ = spearmanr(labels, dot_products)

        logging.info(
            "Cosine-Similarity :\tPearson: {:.4f}\tSpearman: {:.4f}".format(
                eval_pearson_cosine, eval_spearman_cosine))
        logging.info(
            "Manhattan-Distance:\tPearson: {:.4f}\tSpearman: {:.4f}".format(
                eval_pearson_manhattan, eval_spearman_manhattan))
        logging.info(
            "Euclidean-Distance:\tPearson: {:.4f}\tSpearman: {:.4f}".format(
                eval_pearson_euclidean, eval_spearman_euclidean))
        logging.info(
            "Dot-Product-Similarity:\tPearson: {:.4f}\tSpearman: {:.4f}".
            format(eval_pearson_dot, eval_spearman_dot))

        self.csv_headers = [
            "epoch", "steps", "cosine_pearson", "cosine_spearman",
            "euclidean_pearson", "euclidean_spearman", "manhattan_pearson",
            "manhattan_spearman", "dot_pearson", "dot_spearman"
        ]

        if additional_evaluator is not None:
            additional_val = additional_evaluator()
            self.csv_headers += [self.trec_metric]
            logging.info("Additional metric: " + self.trec_metric +
                         " value: {:.4f}".format(additional_val))

        if output_path is not None:
            csv_path = os.path.join(output_path, self.csv_file)
            output_file_exists = os.path.isfile(csv_path)
            with open(csv_path,
                      mode="a" if output_file_exists else 'w',
                      encoding="utf-8") as f:
                writer = csv.writer(f)
                if not output_file_exists:
                    writer.writerow(self.csv_headers)
                out_row = [
                    epoch, steps, eval_pearson_cosine, eval_spearman_cosine,
                    eval_pearson_euclidean, eval_spearman_euclidean,
                    eval_pearson_manhattan, eval_spearman_manhattan,
                    eval_pearson_dot, eval_spearman_dot
                ]
                if additional_evaluator is not None:
                    out_row += [additional_val]
                writer.writerow(out_row)
        if additional_evaluator is not None:
            return additional_val

        if self.main_similarity == SimilarityFunction.COSINE:
            return eval_spearman_cosine
        elif self.main_similarity == SimilarityFunction.EUCLIDEAN:
            return eval_spearman_euclidean
        elif self.main_similarity == SimilarityFunction.MANHATTAN:
            return eval_spearman_manhattan
        elif self.main_similarity == SimilarityFunction.DOT_PRODUCT:
            return eval_spearman_dot
        elif self.main_similarity is None:
            return max(eval_spearman_cosine, eval_spearman_manhattan,
                       eval_spearman_euclidean, eval_spearman_dot)
        else:
            raise ValueError("Unknown main_similarity value")
Example 10
#%%
man = np.abs(a - b).sum()

#%%
help(cityblock)

#%%
np.isclose(man, cityblock(a, b))

#%%
help(paired_manhattan_distances)

#%%
np.isclose(man,
           paired_manhattan_distances(a.reshape((1, -1)), b.reshape((1, -1))))

#%% [markdown]
# ## Cosine similarity
# ---
# The cosine of the angle between two vectors, used to express how similar the two vectors are.
# $
# \displaystyle \begin{aligned}
#     \cos\theta & = \frac{a \cdot b}{\| a \| \ \| b \|} \\
#                & = \frac{\displaystyle \sum ^{n}_{i=1} a_{i} b_{i}}
#                         {\sqrt{\displaystyle \sum ^{n}_{i=1} a^{2}_{i}}\ \sqrt{\displaystyle \sum ^{n}_{i=1} b^{2}_{i}}}
# \end{aligned}
# $
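
#%% [markdown]
# A minimal check of the formula above (added for illustration, not part of the
# original notebook): compute the cosine similarity directly with NumPy and compare
# it with scikit-learn's paired_cosine_distances, which returns 1 - cosine similarity.

#%%
import numpy as np
from sklearn.metrics.pairwise import paired_cosine_distances

va = np.array([0., 1., 2., 3.])
vb = np.array([1., 2., 3., 4.])

cos_sim = va.dot(vb) / (np.linalg.norm(va) * np.linalg.norm(vb))
sk_cos_sim = 1 - paired_cosine_distances(va.reshape(1, -1), vb.reshape(1, -1))[0]

np.isclose(cos_sim, sk_cos_sim)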
Example 11
def get_sentvec_features(word2vec,
                         word_weights,
                         model,
                         sent_l,
                         sent_r,
                         signature=""):
    feature_dict = {}

    ql = ["%s" % (word.term.decode('utf8')) for word in sent_l.basic_words]
    qr = ["%s" % (word.term.decode('utf8')) for word in sent_r.basic_words]

    dim = word2vec.vectors.shape[1]

    sif_vec_ql_weight, naive_vec_ql_weight = calc_naive_sif_embedding(
        word2vec, dim, word_weights, model, ql)
    sif_vec_qr_weight, naive_vec_qr_weight = calc_naive_sif_embedding(
        word2vec, dim, word_weights, model, qr)

    if signature:
        signature = signature + "_"

    #calculate sif feature
    if np.isnan(sif_vec_ql_weight[0][0]) or np.isnan(sif_vec_qr_weight[0][0]):
        feature_dict[signature + 'sif_sentvec_weight_PED'] = 1.0
        feature_dict[signature + 'sif_sentvec_weight_PCD'] = 1.0
        feature_dict[signature + 'sif_sentvec_weight_PMD'] = 1000.0
    else:
        sif_sentvec_weight_PED = paired_euclidean_distances(
            sif_vec_ql_weight, sif_vec_qr_weight)
        feature_dict[signature + 'sif_sentvec_weight_PED'] = float(
            sif_sentvec_weight_PED[0])

        sif_sentvec_weight_PCD = paired_cosine_distances(
            sif_vec_ql_weight, sif_vec_qr_weight)
        feature_dict[signature + 'sif_sentvec_weight_PCD'] = float(
            sif_sentvec_weight_PCD[0])

        sif_sentvec_weight_PMD = paired_manhattan_distances(
            sif_vec_ql_weight, sif_vec_qr_weight)
        feature_dict[signature + 'sif_sentvec_weight_PMD'] = float(
            sif_sentvec_weight_PMD[0])

    #calculate naive feature
    if np.isnan(naive_vec_ql_weight[0][0]) or np.isnan(
            naive_vec_qr_weight[0][0]):
        feature_dict[signature + 'avg_sentvec_weight_PED'] = 1.0
        feature_dict[signature + 'avg_sentvec_weight_PCD'] = 1.0
        feature_dict[signature + 'avg_sentvec_weight_PMD'] = 1000.0
    else:
        naive_sentvec_weight_PED = paired_euclidean_distances(
            naive_vec_ql_weight, naive_vec_qr_weight)
        feature_dict[signature + 'avg_sentvec_weight_PED'] = float(
            naive_sentvec_weight_PED[0])

        naive_sentvec_weight_PCD = paired_cosine_distances(
            naive_vec_ql_weight, naive_vec_qr_weight)
        feature_dict[signature + 'avg_sentvec_weight_PCD'] = float(
            naive_sentvec_weight_PCD[0])

        naive_sentvec_weight_PMD = paired_manhattan_distances(
            naive_vec_ql_weight, naive_vec_qr_weight)
        feature_dict[signature + 'avg_sentvec_weight_PMD'] = float(
            naive_sentvec_weight_PMD[0])

    #calculate wmdistance
    feature_dict[signature + 'WMD'] = word2vec.wmdistance(ql, qr)

    return feature_dict
Example 12
    def __call__(self,
                 model: TransformerModel,
                 output_path: str = None,
                 epoch: int = -1,
                 steps: int = -1) -> float:
        model.eval()
        embeddings1 = []
        embeddings2 = []
        labels = []

        if epoch != -1:
            if steps == -1:
                out_txt = f" after epoch {epoch}:"
            else:
                out_txt = f" in epoch {epoch} after {steps} steps:"
        else:
            out_txt = ":"

        logging.info("Evaluation the model on " + self.name + " dataset" +
                     out_txt)
        for step, batch in enumerate(tqdm(self.dataloader, desc="Evaluating")):
            batch = batch_to_device(batch, self.device)
            input_ids, segment_ids, input_masks, label_ids = batch
            with torch.no_grad():
                emb1 = model.get_sentence_representation(
                    input_ids[0], segment_ids[0],
                    input_masks[0]).to("cpu").numpy()
                emb2 = model.get_sentence_representation(
                    input_ids[1], segment_ids[1],
                    input_masks[1]).to("cpu").numpy()
            labels.extend(label_ids.to("cpu").numpy())
            embeddings1.extend(emb1)
            embeddings2.extend(emb2)
        cosine_scores = 1 - (paired_cosine_distances(embeddings1, embeddings2))
        manhattan_distances = -paired_manhattan_distances(
            embeddings1, embeddings2)
        euclidean_distances = -paired_euclidean_distances(
            embeddings1, embeddings2)

        eval_pearson_cosine, _ = pearsonr(labels, cosine_scores)
        eval_spearman_cosine, _ = spearmanr(labels, cosine_scores)

        eval_pearson_manhattan, _ = pearsonr(labels, manhattan_distances)
        eval_spearman_manhattan, _ = spearmanr(labels, manhattan_distances)

        eval_pearson_euclidean, _ = pearsonr(labels, euclidean_distances)
        eval_spearman_euclidean, _ = spearmanr(labels, euclidean_distances)

        logging.info(
            "Cosine-Similarity :\tPearson: {:.4f}\tSpearman: {:.4f}".format(
                eval_pearson_cosine, eval_spearman_cosine))
        logging.info(
            "Manhattan-Distance:\tPearson: {:.4f}\tSpearman: {:.4f}".format(
                eval_pearson_manhattan, eval_spearman_manhattan))
        logging.info(
            "Euclidean-Distance:\tPearson: {:.4f}\tSpearman: {:.4f}".format(
                eval_pearson_euclidean, eval_spearman_euclidean))

        if output_path is not None:
            csv_path = os.path.join(output_path, self.csv_file)
            if not os.path.isfile(csv_path):
                with open(csv_path, mode="w", encoding="utf-8") as f:
                    writer = csv.writer(f)
                    writer.writerow(self.csv_headers)
                    writer.writerow([
                        epoch, steps, eval_pearson_cosine,
                        eval_spearman_cosine, eval_pearson_euclidean,
                        eval_spearman_euclidean, eval_pearson_manhattan,
                        eval_spearman_manhattan
                    ])
            else:
                with open(csv_path, mode="a", encoding="utf-8") as f:
                    writer = csv.writer(f)
                    writer.writerow([
                        epoch, steps, eval_pearson_cosine,
                        eval_spearman_cosine, eval_pearson_euclidean,
                        eval_spearman_euclidean, eval_pearson_manhattan,
                        eval_spearman_manhattan
                    ])

        if self.main_similarity == EmbeddingSimilarity.COSINE:
            return eval_spearman_cosine
        elif self.main_similarity == EmbeddingSimilarity.EUCLIDEAN:
            return eval_spearman_euclidean
        elif self.main_similarity == EmbeddingSimilarity.MANHATTAN:
            return eval_spearman_manhattan
        elif self.main_similarity is None:
            return max(eval_spearman_cosine, eval_spearman_manhattan,
                       eval_spearman_euclidean)
        else:
            raise ValueError("Unknown main_similarity value")
Example 13
def manhattan_distances(embeddings1, embeddings2):
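    """Return negated Manhattan distances so that larger values correspond to more similar pairs."""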
    return -paired_manhattan_distances(embeddings1, embeddings2)
Example 14
    def __call__(self,
                 model: 'SequentialSentenceEmbedder',
                 output_path: str = None,
                 epoch: int = -1,
                 steps: int = -1) -> float:
        model.eval()
        embeddings1 = []
        embeddings2 = []
        labels = []

        if epoch != -1:
            if steps == -1:
                out_txt = f" after epoch {epoch}:"
            else:
                out_txt = f" in epoch {epoch} after {steps} steps:"
        else:
            out_txt = ":"

        logging.info("Evaluation the model on " + self.name + " dataset" +
                     out_txt)

        self.dataloader.collate_fn = model.smart_batching_collate

        iterator = self.dataloader
        if self.show_progress_bar:
            iterator = tqdm(iterator, desc="Convert Evaluating")

        for step, batch in enumerate(iterator):
            features, label_ids = batch_to_device(batch, self.device)
            with torch.no_grad():
                emb1, emb2 = [
                    model(sent_features)['sentence_embedding'].to(
                        "cpu").numpy() for sent_features in features
                ]

            labels.extend(label_ids.to("cpu").numpy())
            embeddings1.extend(emb1)
            embeddings2.extend(emb2)

        try:
            cosine_scores = 1 - (paired_cosine_distances(
                embeddings1, embeddings2))
        except Exception as e:
            print(embeddings1)
            print(embeddings2)
            raise e

        manhattan_distances = -paired_manhattan_distances(
            embeddings1, embeddings2)
        euclidean_distances = -paired_euclidean_distances(
            embeddings1, embeddings2)
        dot_products = [
            np.dot(emb1, emb2) for emb1, emb2 in zip(embeddings1, embeddings2)
        ]

        eval_pearson_cosine, _ = pearsonr(labels, cosine_scores)
        eval_spearman_cosine, _ = spearmanr(labels, cosine_scores)

        eval_pearson_manhattan, _ = pearsonr(labels, manhattan_distances)
        eval_spearman_manhattan, _ = spearmanr(labels, manhattan_distances)

        eval_pearson_euclidean, _ = pearsonr(labels, euclidean_distances)
        eval_spearman_euclidean, _ = spearmanr(labels, euclidean_distances)

        eval_pearson_dot, _ = pearsonr(labels, dot_products)
        eval_spearman_dot, _ = spearmanr(labels, dot_products)

        logging.info(
            "Cosine-Similarity :\tPearson: {:.4f}\tSpearman: {:.4f}".format(
                eval_pearson_cosine, eval_spearman_cosine))
        logging.info(
            "Manhattan-Distance:\tPearson: {:.4f}\tSpearman: {:.4f}".format(
                eval_pearson_manhattan, eval_spearman_manhattan))
        logging.info(
            "Euclidean-Distance:\tPearson: {:.4f}\tSpearman: {:.4f}".format(
                eval_pearson_euclidean, eval_spearman_euclidean))
        logging.info(
            "Dot-Product-Similarity:\tPearson: {:.4f}\tSpearman: {:.4f}".
            format(eval_pearson_dot, eval_spearman_dot))

        if output_path is not None:
            csv_path = os.path.join(output_path, self.csv_file)
            output_file_exists = os.path.isfile(csv_path)
            with open(csv_path,
                      mode="a" if output_file_exists else 'w',
                      encoding="utf-8") as f:
                writer = csv.writer(f)
                if not output_file_exists:
                    writer.writerow(self.csv_headers)

                writer.writerow([
                    epoch, steps, eval_pearson_cosine, eval_spearman_cosine,
                    eval_pearson_euclidean, eval_spearman_euclidean,
                    eval_pearson_manhattan, eval_spearman_manhattan,
                    eval_pearson_dot, eval_spearman_dot
                ])

        if self.main_similarity == SimilarityFunction.COSINE:
            return eval_spearman_cosine
        elif self.main_similarity == SimilarityFunction.EUCLIDEAN:
            return eval_spearman_euclidean
        elif self.main_similarity == SimilarityFunction.MANHATTAN:
            return eval_spearman_manhattan
        elif self.main_similarity == SimilarityFunction.DOT_PRODUCT:
            return eval_spearman_dot
        elif self.main_similarity is None:
            return max(eval_spearman_cosine, eval_spearman_manhattan,
                       eval_spearman_euclidean, eval_spearman_dot)
        else:
            raise ValueError("Unknown main_similarity value")
Example 15
# Original file is located at
#     https://colab.research.google.com/drive/1UTrfj1JfPSMo52T6tTXPs3V9Q67zJ7OD

from sklearn.metrics.pairwise import paired_euclidean_distances
X = [[0, 1, 2, 3]]
Y = [[1, 2, 3, 4]]

paired_euclidean_distances(X, Y)

from sklearn.metrics.pairwise import paired_manhattan_distances
X = [[0, 1, 2, 3]]
Y = [[1, 2, 3, 4]]

paired_manhattan_distances(X, Y)

movie_a = [0, 2, 1, 3]  # user_id’s who bought the movie a
movie_b = [0, 1, 2, 3]  # user_id’s who bought the movie b


def jaccard_similarity(list1, list2):
    intersection = len(list(set(list1).intersection(list2)))
    union = (len(list1) + len(list2)) - intersection
    return float(intersection / union)


movie_a = [0, 2, 1, 3]  # user_id’s who bought the movie a
movie_b = [0, 1, 2, 3]  # user_id’s who bought the movie b

print(jaccard_similarity(movie_a, movie_b))
Example 16
    def __call__(self,
                 model,
                 output_path: str = None,
                 epoch: int = -1,
                 steps: int = -1) -> float:
        model.eval()
        embeddings1 = []
        embeddings2 = []
        labels = []

        if epoch != -1:
            if steps == -1:
                out_txt = " after epoch {}:".format(epoch)
            else:
                out_txt = " in epoch {} after {} steps:".format(epoch, steps)
        else:
            out_txt = ":"

        logging.info("Evaluation the model on " + self.name + " dataset" +
                     out_txt)
        self.dataloader.collate_fn = model.smart_batching_collate
        for step, batch in enumerate(tqdm(self.dataloader, desc="Evaluating")):
            features, label_ids = batch_to_device(batch, self.device)
            with torch.no_grad():
                emb1, emb2 = [
                    model(sent_features)['sentence_embedding'].to(
                        "cpu").numpy() for sent_features in features
                ]

            labels.extend(label_ids.to("cpu").numpy())
            embeddings1.extend(emb1)
            embeddings2.extend(emb2)
        cosine_scores = 1 - (paired_cosine_distances(embeddings1, embeddings2))
        manhattan_distances = -paired_manhattan_distances(
            embeddings1, embeddings2)
        euclidean_distances = -paired_euclidean_distances(
            embeddings1, embeddings2)

        # Ensure labels are just 0 or 1
        for label in labels:
            assert (label == 0 or label == 1)

        cosine_middle = np.median(cosine_scores)
        cosine_acc = 0
        for label, score in zip(labels, cosine_scores):
            if (label == 1
                    and score > cosine_middle) or (label == 0
                                                   and score <= cosine_middle):
                cosine_acc += 1
        cosine_acc /= len(labels)

        manhattan_middle = np.median(manhattan_distances)
        manhattan_acc = 0
        for label, score in zip(labels, manhattan_distances):
            if (label == 1 and score > manhattan_middle) or (
                    label == 0 and score <= manhattan_middle):
                manhattan_acc += 1
        manhattan_acc /= len(labels)

        euclidean_middle = np.median(euclidean_distances)
        euclidean_acc = 0
        for label, score in zip(labels, euclidean_distances):
            if (label == 1 and score > euclidean_middle) or (
                    label == 0 and score <= euclidean_middle):
                euclidean_acc += 1
        euclidean_acc /= len(labels)

        logging.info("Cosine-Classification:\t{:4f}".format(cosine_acc))
        logging.info("Manhattan-Classification:\t{:4f}".format(manhattan_acc))
        logging.info(
            "Euclidean-Classification:\t{:4f}\n".format(euclidean_acc))

        if output_path is not None:
            csv_path = os.path.join(output_path, self.csv_file)
            if not os.path.isfile(csv_path):
                with open(csv_path, mode="w", encoding="utf-8") as f:
                    writer = csv.writer(f)
                    writer.writerow(self.csv_headers)
                    writer.writerow([
                        epoch, steps, cosine_acc, euclidean_acc, manhattan_acc
                    ])
            else:
                with open(csv_path, mode="a", encoding="utf-8") as f:
                    writer = csv.writer(f)
                    writer.writerow([
                        epoch, steps, cosine_acc, euclidean_acc, manhattan_acc
                    ])

        if self.main_similarity == SimilarityFunction.COSINE:
            return cosine_acc
        elif self.main_similarity == SimilarityFunction.EUCLIDEAN:
            return euclidean_acc
        elif self.main_similarity == SimilarityFunction.MANHATTAN:
            return manhattan_acc
        else:
            raise ValueError("Unknown main_similarity value")
Esempio n. 17
0
cs99 = paired_euclidean_distances(x.values.reshape(1, -1), [df1])
cs100 = paired_euclidean_distances(x1.values.reshape(1, -1), [df2])
print(cs99)  ### [34.21532517]
print(cs100)  ### [28.53037718]
print(np.argmax(cs99))  ## 0
print(np.argmax(cs100))  ### 0
r = [0, 0]
cs_49 = [[34.21532517], [28.53037718]]
print(cs_49)  #############  [[34.21532517],[28.53037718]]
np.argmax(cs_49)  ### 0
from sklearn.metrics.pairwise import paired_manhattan_distances
x = ratings.iloc[100:250, 1]
df1 = ratings.iloc[1, :150]
x1 = ratings.iloc[100:175, 2]
df2 = ratings.iloc[2, :75]
cs101 = paired_manhattan_distances(x.values.reshape(1, -1), [df1])
cs102 = paired_manhattan_distances(x1.values.reshape(1, -1), [df2])
print(cs101)  ### [126.03125]
print(cs102)  ### array[167.625]
print(np.argmax(cs101))  ## 0
print(np.argmax(cs102))  ### 0
r = [0, 0]
cs_50 = [[126.03125], [167.625]]
print(cs_50)  #############  [[126.03125],[167.625]]
np.argmax(cs_50)  ### 1
from sklearn.metrics.pairwise import paired_cosine_distances
x = ratings.iloc[100:250, 1]
df1 = ratings.iloc[1, :150]
x1 = ratings.iloc[100:175, 2]
df2 = ratings.iloc[2, :75]
cs103 = paired_cosine_distances(x.values.reshape(1, -1), [df1])
Esempio n. 18
0
    def __call__(self,
                 model,
                 output_path: str = None,
                 epoch: int = -1,
                 steps: int = -1) -> float:

        if epoch != -1:
            if steps == -1:
                out_txt = f" after epoch {epoch}:"
            else:
                out_txt = f" in epoch {epoch} after {steps} steps:"
        else:
            out_txt = ":"

        logging.info("Binary Accuracy Evaluation of the model on " +
                     self.name + " dataset" + out_txt)
        embeddings1 = model.encode(self.sentences1,
                                   batch_size=self.batch_size,
                                   show_progress_bar=self.show_progress_bar,
                                   convert_to_numpy=True)
        embeddings2 = model.encode(self.sentences2,
                                   batch_size=self.batch_size,
                                   show_progress_bar=self.show_progress_bar,
                                   convert_to_numpy=True)

        cosine_scores = 1 - paired_cosine_distances(embeddings1, embeddings2)
        manhattan_distances = paired_manhattan_distances(
            embeddings1, embeddings2)
        euclidean_distances = paired_euclidean_distances(
            embeddings1, embeddings2)

        labels = np.asarray(self.labels)

        file_output_data = [epoch, steps]

        main_score = None
        for name, scores, reverse in [
                ['Cosine-Similarity', cosine_scores, True],
                ['Manhattan-Distance', manhattan_distances, False],
                ['Euclidean-Distance', euclidean_distances, False]]:
            acc, acc_threshold = self.find_best_acc_and_threshold(
                scores, labels, reverse)
            f1, precision, recall, f1_threshold = self.find_best_f1_and_threshold(
                scores, labels, reverse)
            ap = average_precision_score(labels,
                                         scores * (1 if reverse else -1))

            logging.info(
                "Accuracy with {}:           {:.2f}\t(Threshold: {:.4f})".
                format(name, acc * 100, acc_threshold))
            logging.info(
                "F1 with {}:                 {:.2f}\t(Threshold: {:.4f})".
                format(name, f1 * 100, f1_threshold))
            logging.info("Precision with {}:          {:.2f}".format(
                name, precision * 100))
            logging.info("Recall with {}:             {:.2f}".format(
                name, recall * 100))
            logging.info("Average Precision with {}:  {:.2f}\n".format(
                name, ap * 100))

            file_output_data.extend(
                [acc, acc_threshold, f1, precision, recall, f1_threshold, ap])

            if main_score is None:  #Use AveragePrecision with Cosine-Similarity as main score
                main_score = ap

        if output_path is not None:
            csv_path = os.path.join(output_path, self.csv_file)
            if not os.path.isfile(csv_path):
                with open(csv_path, mode="w", encoding="utf-8") as f:
                    writer = csv.writer(f)
                    writer.writerow(self.csv_headers)
                    writer.writerow(file_output_data)
            else:
                with open(csv_path, mode="a", encoding="utf-8") as f:
                    writer = csv.writer(f)
                    writer.writerow(file_output_data)

        return main_score
Example 19
    def _get_vectors(self, df):
        def mean_vector(lemmatized_text):
            res = [np.zeros(self.word2vec_vector_length)]
            for word in lemmatized_text:
                try:
                    res.append(self.word2vec_model[word])
                except KeyError:
                    pass
                    # self.logger.warning('There is no "%s" in vocabulary of the given model; omitted' % word)
            mean = sum(np.array(res)) / (len(res) - 1 + 1e-25)
            return mean

        if not self.word2vec_stopwords:
            df.lemmas_x = df.lemmas_x.map(self._remove_stop_words)
            df.lemmas_y = df.lemmas_y.map(self._remove_stop_words)

        # Add the required UPoS postags (as in the rusvectores word2vec model's vocabulary)
        if self.word2vec_tag_required:
            df.lemmas_x = df.snippet_x_locs.map(self._tag_postags)
            df.lemmas_y = df.snippet_y_locs.map(self._tag_postags)

        # Make two dataframes with average vectors for x and y,
        # merge them with the original dataframe
        df_embed_x = df.lemmas_x.apply(mean_vector).values.tolist()
        df_embed_y = df.lemmas_y.apply(mean_vector).values.tolist()

        embeddings = pd.DataFrame(df_embed_x).merge(pd.DataFrame(df_embed_y),
                                                    left_index=True,
                                                    right_index=True)

        embeddings['cos_embed_dist'] = paired_cosine_distances(
            df_embed_x, df_embed_y)
        embeddings['eucl_embed_dist'] = paired_euclidean_distances(
            df_embed_x, df_embed_y)
        embeddings['manh_embed_dist'] = paired_manhattan_distances(
            df_embed_x, df_embed_y)

        df = pd.concat(
            [df.reset_index(drop=True),
             embeddings.reset_index(drop=True)],
            axis=1)

        return df
Example 20
def test_paired_manhattan_distances():
    # Check the paired manhattan distances computation
    X = [[0], [0]]
    Y = [[1], [2]]
    D = paired_manhattan_distances(X, Y)
    assert_array_almost_equal(D, [1., 2.])
    def __call__(self,
                 model,
                 output_path: str = None,
                 epoch: int = -1,
                 steps: int = -1) -> float:
        model.eval()
        embeddings1 = []
        embeddings2 = []
        labels = []

        if epoch != -1:
            if steps == -1:
                out_txt = f" after epoch {epoch}:"
            else:
                out_txt = f" in epoch {epoch} after {steps} steps:"
        else:
            out_txt = ":"

        logging.info("Evaluation the model on " + self.name + " dataset" +
                     out_txt)
        self.dataloader.collate_fn = model.smart_batching_collate
        for step, batch in enumerate(tqdm(self.dataloader, desc="Evaluating")):
            features, label_ids = batch_to_device(batch, self.device)
            with torch.no_grad():
                emb1, emb2 = [
                    model(sent_features)['sentence_embedding'].to(
                        "cpu").numpy() for sent_features in features
                ]

            labels.extend(label_ids.to("cpu").numpy())
            embeddings1.extend(emb1)
            embeddings2.extend(emb2)
        cosine_scores = 1 - paired_cosine_distances(embeddings1, embeddings2)
        manhattan_distances = paired_manhattan_distances(
            embeddings1, embeddings2)
        euclidean_distances = paired_euclidean_distances(
            embeddings1, embeddings2)

        # Ensure labels are just 0 or 1
        for label in labels:
            assert (label == 0 or label == 1)

        labels = np.asarray(labels)
        cosine_acc, cosine_threshold = self.find_best_acc_and_threshold(
            cosine_scores, labels, True)
        manhattan_acc, manhatten_threshold = self.find_best_acc_and_threshold(
            manhattan_distances, labels, False)
        euclidean_acc, euclidean_threshold = self.find_best_acc_and_threshold(
            euclidean_distances, labels, False)

        logging.info(
            "Accuracy with Cosine-Similarity:\t{:.2f}\t(Threshold: {:.4f})".
            format(cosine_acc * 100, cosine_threshold))
        logging.info(
            "Accuracy with Manhattan-Distance:\t{:.2f}\t(Threshold: {:.4f})".
            format(manhattan_acc * 100, manhatten_threshold))
        logging.info(
            "Accuracy with Euclidean-Distance:\t{:.2f}\t(Threshold: {:.4f})\n".
            format(euclidean_acc * 100, euclidean_threshold))

        if output_path is not None:
            csv_path = os.path.join(output_path, self.csv_file)
            if not os.path.isfile(csv_path):
                with open(csv_path, mode="w", encoding="utf-8") as f:
                    writer = csv.writer(f)
                    writer.writerow(self.csv_headers)
                    writer.writerow([
                        epoch, steps, cosine_acc, euclidean_acc, manhattan_acc
                    ])
            else:
                with open(csv_path, mode="a", encoding="utf-8") as f:
                    writer = csv.writer(f)
                    writer.writerow([
                        epoch, steps, cosine_acc, euclidean_acc, manhattan_acc
                    ])

        if self.main_similarity == SimilarityFunction.COSINE:
            return cosine_acc
        elif self.main_similarity == SimilarityFunction.EUCLIDEAN:
            return euclidean_acc
        elif self.main_similarity == SimilarityFunction.MANHATTAN:
            return manhattan_acc
        else:
            raise ValueError("Unknown main_similarity value")