Esempio n. 1
0
    def get_glove_similarity(self, num_hits=25, method='cosine'):
        """ Calculate GloVe based similarities (all-vs-all)

        Runs functions.calculate_similarities on self.vectors_glove and stores
        the outcome in self.list_similars_glove_idx and self.list_similars_glove.
        If the GloVe model or the GloVe document vectors are missing, a hint is
        printed and nothing is stored.

        Args:
        -------
        num_hits: int
            Function will store the num_hits closest matches. Default is 25.
        method: str
            See scipy spatial.distance.cdist for options. Default is 'cosine'.
        """

        # The similarity calculation needs the document vectors, not only the
        # trained model, so both must be present before continuing.
        if self.model_glove is None or self.vectors_glove is None:
            print("No GloVe document vectors found.")
            print(
                "Please first train model using 'build_model_glove' function.")
            print(
                "Then create document vectors using 'get_vectors_glove' function."
            )
            return

        # The third return value (mean similarity) is not used here.
        list_similars_idx, list_similars, _ = functions.calculate_similarities(
            self.vectors_glove, num_hits, method=method)
        print("Calculated distances between ", list_similars.shape[0],
              " documents.")

        self.list_similars_glove_idx = list_similars_idx
        self.list_similars_glove = list_similars
Esempio n. 2
0
    def get_doc2vec_similarity(self, num_hits=25, method='cosine'):
        """ Calculate Doc2Vec based similarities (all-vs-all)

        Collects one document vector per corpus entry from self.model_doc2vec,
        runs functions.calculate_similarities on them and stores the outcome on
        the instance. If no Doc2Vec model is present, a hint is printed and
        nothing is stored.

        Args:
        -------
        num_hits: int
            Function will store the num_hits closest matches. Default is 25.
        method: str
            See scipy spatial.distance.cdist for options. Default is 'cosine'.
        """

        # Guard on the Doc2Vec model itself (this used to test the unrelated
        # GloVe vectors, so a missing Doc2Vec model was not detected).
        if self.model_doc2vec is None:
            print("No trained Doc2Vec model found.")
            print(
                "Please first train model using 'build_model_doc2vec' function."
            )
            return

        model = self.model_doc2vec
        num_documents = len(self.corpus)

        # One row per document, in corpus order; document vectors are looked
        # up by their integer position.
        vectors = np.zeros((num_documents, model.vector_size))
        for i in range(num_documents):
            vectors[i, :] = model.docvecs[i]

        # The third return value (mean similarity) is not used here.
        list_similars_idx, list_similars, _ = functions.calculate_similarities(
            vectors, num_hits, method=method)

        # NOTE(review): results are written to the "_ctr" attributes, which
        # get_centroid_similarity also writes -- confirm whether dedicated
        # Doc2Vec attributes were intended before renaming.
        self.list_similars_ctr_idx = list_similars_idx
        self.list_similars_ctr = list_similars
def test_calculate_similarities():
    """Check calculate_similarities on hand-made vectors with known outcome."""
    # Test with test-vectors and known outcome
    testvectors = np.array([[0, 0, 0, 0, 1], [1, 0, 0, 0, 1], [1, 1, 1, 0, 0],
                            [0.5, 0.5, 0.5, 0, 0]])

    # Run function (the mean similarity is not checked here):
    list_similars_ids, list_similars, _ = calculate_similarities(
        testvectors, num_hits=4, method='cosine')

    # Rows 0 and 1 share the same second-best score, which exceeds 0.7.
    assert np.isclose(list_similars[0][1], list_similars[1][1])
    assert list_similars[1][1] > 0.7

    # Rows 2 and 3 are parallel vectors: second-best score is 1.
    assert np.isclose(list_similars[2][1], 1)
    assert np.isclose(list_similars[3][1], 1)

    # Worst hit for rows 0 and 2 is an orthogonal vector: score 0.
    assert np.isclose(list_similars[0][3], 0)
    assert np.isclose(list_similars[2][3], 0)

    # Compare index rows one by one. Chaining "==" across arrays
    # (a == b == c) evaluates "(a == b) and (b == c)" and raises ValueError
    # for multi-element arrays, so each row is checked explicitly.
    assert np.array_equal(list_similars_ids[1, :], [1, 0, 2, 3])
    assert np.array_equal(list_similars_ids[2, :], [2, 3, 1, 0])
    assert np.array_equal(list_similars_ids[3, :], [2, 3, 1, 0])
Esempio n. 4
0
    def get_pca_similarity(self, num_hits=25, method='cosine'):
        """ Calculate PCA similarities (all-versus-all --> matrix)

        Passes self.vectors_pca to functions.calculate_similarities and keeps
        the first two return values as self.list_similars_pca_idx and
        self.list_similars_pca.

        Args:
        -------
        num_hits: int
            Function will store the num_hits closest matches. Default is 25.
        method: str
            See scipy spatial.distance.cdist for options. Default is 'cosine'.

        """
        # Third return value (mean similarity) is computed but not kept.
        hit_indices, hit_scores, _ = functions.calculate_similarities(
            self.vectors_pca,
            num_hits,
            method=method,
        )

        self.list_similars_pca_idx = hit_indices
        self.list_similars_pca = hit_scores
Esempio n. 5
0
 def get_centroid_similarity(self, num_hits=25, method='cosine'):
     """ Calculate centroid similarities (all-versus-all --> matrix)

     Passes self.vectors_centroid to functions.calculate_similarities, prints
     how many documents were compared and keeps the first two return values
     as self.list_similars_ctr_idx and self.list_similars_ctr.

     Args:
     -------
     num_hits: int
         Function will store the num_hits closest matches. Default is 25.
     method: str
         See scipy spatial.distance.cdist for options. Default is 'cosine'.

     """
     # Third return value (mean similarity) is computed but not kept.
     hit_indices, hit_scores, _ = functions.calculate_similarities(
         self.vectors_centroid,
         num_hits,
         method=method,
     )

     num_documents = hit_scores.shape[0]
     print("Calculated distances between ", num_documents, " documents.")

     self.list_similars_ctr_idx = hit_indices
     self.list_similars_ctr = hit_scores
Esempio n. 6
0
    def get_autoencoder_similarity(self, num_hits=25, method='cosine'):
        """ Calculate autoencoder similarities (all-versus-all --> matrix)

        Runs self.encoder.predict on self.X_data, keeps the result as
        self.vectors_ae, passes it to functions.calculate_similarities and
        keeps the first two return values as self.list_similars_ae_idx and
        self.list_similars_ae.

        Args:
        -------
        num_hits: int
            Function will store the num_hits closest matches. Default is 25.
        method: str
            See scipy spatial.distance.cdist for options. Default is 'cosine'.

        """
        # Encode the input data first; the vectors stay available on the
        # instance afterwards.
        self.vectors_ae = self.encoder.predict(self.X_data)

        # Third return value (mean similarity) is computed but not kept.
        hit_indices, hit_scores, _ = functions.calculate_similarities(
            self.vectors_ae,
            num_hits,
            method=method,
        )

        self.list_similars_ae_idx = hit_indices
        self.list_similars_ae = hit_scores