Beispiel #1
0
def cluster_score(W, W_true, cost_dist='euclidean'):
    """
    Align the estimated loading matrix to the true one and compare clusters.

    The columns of ``W`` are matched to ``W_true`` through ``get_alignment``.
    Afterwards the position of the largest entry along axis 1 is taken for
    both matrices and the two resulting index vectors are compared.

    Parameters:
    ----------
    W: np.ndarray
        Estimated loading matrix
    W_true: np.ndarray
        True loading matrix
    cost_dist: str
        Passed through unchanged to ``get_alignment``.

    Returns:
    ----------
    distances: dict
        The "jaccard", "hamming" and "kulsinski" distances between the
        argmax vectors of the aligned estimate and of the true matrix.
    score: float
        The optimal assignment cost, as returned by ``get_alignment``.
    W_aligned: np.ndarray
        A copy of W with its columns rearranged according to the alignment.
    alignment: tuple of (np.ndarray, np.ndarray)
        The row-idx and column-idx of the optimal alignment.
    """
    alignment, score = get_alignment(W, W_true, cost_dist)

    # the second entry of the alignment holds the column order for W
    column_order = alignment[1]
    W_aligned = W.copy()[:, column_order]

    estimated_labels = W_aligned.argmax(1)
    true_labels = W_true.argmax(1)

    distances = {
        'jaccard': spd.jaccard(estimated_labels, true_labels),
        'hamming': spd.hamming(estimated_labels, true_labels),
        'kulsinski': spd.kulsinski(estimated_labels, true_labels),
    }
    return distances, score, W_aligned, alignment
Beispiel #2
0
    def get_nearest_neighbor(self, x_test, k, sample_class):
        """
        Return the indices of the samples closest to ``x_test``.

        Samples that compare element-wise equal to ``x_test`` are skipped.
        The remaining samples are ranked by the metric selected through
        ``self.distance_calculator`` (ties are broken by the lower index).

        Parameters
        ----------
        x_test : array-like
            Query vector.
        k : int
            Maximum number of neighbours to return.
        sample_class : indexable of array-like
            Candidate samples; ``sample_class[i]`` is compared to ``x_test``.

        Returns
        -------
        list of int
            Indices into ``sample_class`` ordered from nearest to farthest.
            Holds fewer than ``k`` entries when fewer candidates exist.

        Raises
        ------
        ValueError
            If ``self.distance_calculator`` is not a supported metric name.
        """
        # Configured name -> attribute of ``dis``. The attribute is fetched
        # with getattr so only the requested metric has to exist in ``dis``.
        metric_attributes = {
            'jaccard': 'jaccard',
            'dice': 'dice',
            'correlation': 'correlation',
            'yule': 'yule',
            'russelo-rao': 'russellrao',
            'sokal-michener': 'sokalmichener',
            'rogers-tanimoto': 'rogerstanimoto',
            'kulzinsky': 'kulsinski',
        }
        attribute = metric_attributes.get(self.distance_calculator)
        if attribute is None:
            # Previously an unknown name left ``distance`` unassigned and
            # surfaced as an UnboundLocalError inside the loop.
            raise ValueError(
                'Unsupported distance_calculator: {!r}'.format(
                    self.distance_calculator))
        metric = getattr(dis, attribute)

        ranked = []
        for i in range(len(sample_class)):
            sample = sample_class[i][:]
            # skip candidates that are identical to the query itself
            if (sample != x_test).any():
                ranked.append((metric(x_test, sample), i))

        ranked.sort()
        # slicing (instead of indexing k times) tolerates k > len(ranked)
        return [index for _, index in ranked[:max(k, 0)]]
 def test_kulsinski(self):
     """
     Compare ``kulsinski(self.data)`` with pairwise ``spd.kulsinski`` values.

     The reference matrix is filled from every pair of rows of
     ``BOOL_MATRIX.T``; the value under test is ``1 - kulsinski(...)``
     converted to a dense array.
     """
     size = MATRIX.shape[1]
     columns = BOOL_MATRIX.T
     expected = np.zeros((size, size))
     for row in range(size):
         for col in range(size):
             expected[row, col] = spd.kulsinski(columns[row], columns[col])
     # NOTE(review): the ``1 -`` suggests the function under test returns a
     # similarity rather than a distance - confirm against its definition.
     observed = 1 - kulsinski(self.data).toarray()
     self.assertTrue(np.allclose(expected, observed))
Beispiel #4
0
    def kulsinski(self, x=None, y=None, w=None):
        """
        Kulsinski dissimilarity.

        Every argument left as ``None`` is replaced by the instance attribute
        of the same name (``self.x``, ``self.y``, ``self.w``) before the call
        is forwarded to ``distance.kulsinski``.

        Example inputs:

        x = [1, 0, 0]
        y = [0, 1, 0]
        """
        # Test for None explicitly: ``x or self.x`` raises ValueError for
        # NumPy arrays with more than one element and silently swaps out
        # legitimately falsy arguments such as an empty sequence.
        if x is None:
            x = self.x
        if y is None:
            y = self.y
        if w is None:
            w = self.w
        return distance.kulsinski(x, y, w)
Beispiel #5
0
def calc_kulczynski(query_vec, num_of_docs):  # smaller better!
    """
    Return the ``num_of_docs`` rows of ``data`` closest to ``query_vec``.

    Each value of the ``'text'`` column of the module-level ``data`` frame is
    compared with the densified query through ``kulsinski``; rows are sorted
    ascending by that value, so the smallest distances come first.

    Parameters
    ----------
    query_vec : object exposing ``toarray()``
        Query vector (presumably a SciPy sparse row - confirm with caller).
    num_of_docs : int
        Number of rows to keep.

    Returns
    -------
    DataFrame
        A copy of the best-ranked rows without the temporary score column.
    """
    # densify once instead of once per row
    query_dense = query_vec.toarray()
    scores = [kulsinski(query_dense, text) for text in data['text']]

    ranked = data.copy()
    ranked['kulsinski'] = scores
    ranked = ranked.sort_values(by=['kulsinski'])  # default: asc

    # non in-place drop: avoids mutating a frame derived via head()
    return ranked.head(num_of_docs).drop('kulsinski', axis=1)
Beispiel #6
0
def distances(W, W_true, G, G_true, alignment=None):
    """
    Collect cluster and covariance distances between estimate and truth.

    ``W`` and ``G`` are first rearranged with ``align_from_permutation``;
    when no ``alignment`` is given, ``np.arange(W.shape[1])`` is used.

    Returns a dict with the keys ``'jaccard'``, ``'hamming'`` and
    ``'kulsinski'`` (computed on the axis-1 argmax of the aligned ``W`` and
    of ``W_true``), plus ``'cov_mse'`` (the result of ``covariance_mse``)
    together with its mean and maximum.
    """
    if alignment is None:
        alignment = np.arange(W.shape[1])
    Wal, Gal = align_from_permutation(W, G, alignment)

    estimated_labels = Wal.argmax(1)
    true_labels = W_true.argmax(1)
    result = {
        'jaccard': spd.jaccard(estimated_labels, true_labels),
        'hamming': spd.hamming(estimated_labels, true_labels),
        'kulsinski': spd.kulsinski(estimated_labels, true_labels),
    }

    mse = covariance_mse(Gal, G_true)
    result['cov_mse'] = mse
    result['cov_mse_mean'] = np.mean(mse)
    result['cov_mse_max'] = np.max(mse)
    return result
def test_kulsinski_similarity():
    """
    Check ``tmetrics.classification.kulsinski_similarity`` against the
    reference ``kulsinski`` on two random 0/1 vectors of length 10.
    """
    true = np.double(np.random.binomial(n=1, p=.5, size=10))
    predicted = np.double(np.round(np.random.random(10)))
    refscore = kulsinski(true, predicted)
    yt = T.vector('yt')
    yp = T.vector('yp')
    f = theano.function([yt, yp], tmetrics.classification.kulsinski_similarity(yt, yp), allow_input_downcast=True)
    score = f(true.astype('float32'), predicted.astype('float32'))
    # Single-argument print(...) calls produce the same output on Python 2
    # and are valid syntax on Python 3, unlike the print statement.
    print('true')
    print(true)
    print('predicted')
    print(predicted)
    print('refscore {}'.format(refscore))
    print('score {}'.format(score))
    assert np.allclose(refscore, score)
Beispiel #8
0
def distances(v1, v2):
    """
    Return cosine, dice, hamming and kulsinski distances between two vectors.

    When either vector sums to zero no distance is computed: ``_NEAR`` is
    returned if both sums are equal, ``_FAR`` otherwise. In every other case
    the inputs are densified with ``toarray()``; cosine uses the raw values,
    the three boolean measures use the ``> 0`` masks.
    """
    total1 = v1.sum()
    total2 = v2.sum()
    if total1 == 0 or total2 == 0:
        return _NEAR if total1 == total2 else _FAR

    dense1 = v1.toarray()
    dense2 = v2.toarray()
    mask1 = dense1 > 0
    mask2 = dense2 > 0

    measures = [
        sp_dist.cosine(dense1, dense2),
        sp_dist.dice(mask1, mask2),
        sp_dist.hamming(mask1, mask2),
        sp_dist.kulsinski(mask1, mask2),
    ]
    return np.asarray(measures)
def cross_channel_boolean_distance_features(mask):
    """Compute boolean distance features for every pair of channels.

    Each channel of the mask is flattened to 1D and compared with every
    later channel using a fixed list of functions from ``dist``.

    Parameters
    ----------
    mask : 3D array, shape (M, N, C)
        The input mask with multiple channels.

    Returns
    -------
    features :  dict
        Maps ``"<metric>_distance_Ch<i>_Ch<j>"`` (channels numbered from 1,
        ``i < j``) to the value returned by ``dist.<metric>``.

    """
    # evaluated in this order for every channel pair
    metric_names = (
        "dice",
        "hamming",
        "jaccard",
        "kulsinski",
        "rogerstanimoto",
        "russellrao",
        "sokalmichener",
        "sokalsneath",
        "yule",
    )

    n_channels = mask.shape[2]
    features = {}
    for first in range(n_channels):
        for second in range(first + 1, n_channels):
            flat_first = mask[:, :, first].ravel()
            flat_second = mask[:, :, second].ravel()

            # human-readable, 1-based channel pair label
            suffix = "_Ch" + str(first + 1) + "_Ch" + str(second + 1)

            for metric in metric_names:
                # looked up lazily, one metric at a time
                compute = getattr(dist, metric)
                features[metric + "_distance" + suffix] = compute(
                    flat_first, flat_second)

    return features
Beispiel #10
0
 def calculate_pss(self,
                   profile,
                   ignore=None,
                   method="pairwise"):
     """
     Calculate Profiles Similarity Score.

     Args:
         profile: Profile to compare with; must have the same length as
             this one and expose ``query`` and ``profile`` attributes.
         ignore: Optional iterable of query elements. The profile value at
             the position of each element (found with ``query.index``) is
             left out of the comparison for both profiles.
         method: ``"pairwise"`` counts equal positions; ``"jaccard"``,
             ``"yule"``, ``"dice"``, ``"hamming"``, ``"kulsinski"``,
             ``"rogerstanimoto"``, ``"russellrao"`` and ``"sokalmichener"``
             call the function of that name in ``dist``.

     Returns:
         The score for the chosen method, or ``None`` when ``method`` is
         not one of the names listed above.

     Raises:
         ProfileError: If the lengths differ or an ignored element cannot
             be located in a profile.
     """
     if len(self) != len(profile):
         raise ProfileError("Different profiles' lengths")

     ignored = list(ignore) if ignore else []

     def _kept_values(prof):
         # Return prof.profile without the positions of ignored elements.
         values = prof.profile
         if not ignored:
             return values
         try:
             # Resolve every position before dropping anything: ``query``
             # is never shortened, so deleting one value at a time would
             # shift the positions of the remaining ignored elements.
             skipped = set(prof.query.index(element) for element in ignored)
         except (ValueError, IndexError):
             # list/tuple/str ``index`` signals a miss with ValueError
             raise ProfileError("Element to ignore not in profile")
         if max(skipped) >= len(values):
             raise ProfileError("Element to ignore not in profile")
         return tuple(value for position, value in enumerate(values)
                      if position not in skipped)

     # Work on filtered copies so neither ``self`` nor ``profile`` is
     # modified by computing a score.
     values_1 = _kept_values(self)
     values_2 = _kept_values(profile)

     if method == "pairwise":
         return sum(a == b for a, b in zip(values_1, values_2))

     dist_metrics = ("jaccard", "yule", "dice", "hamming", "kulsinski",
                     "rogerstanimoto", "russellrao", "sokalmichener")
     if method in dist_metrics:
         # looked up lazily so only the requested metric must exist
         return getattr(dist, method)(values_1, values_2)
     return None
Beispiel #11
0
def kulsinski(app1SyscallsVector, app2SyscallsVector):
    """Return ``spDist.kulsinski`` of the two syscall vectors."""
    dissimilarity = spDist.kulsinski(app1SyscallsVector, app2SyscallsVector)
    return dissimilarity
Beispiel #12
0
def exec_similarity(dct, algorithm):
    """
    Score every answer in ``dct['answers']`` against ``dct['tf_idf']``.

    For a supported ``algorithm`` the function of the same name is applied
    to ``ndarray_dict(dct['tf_idf'])`` and ``ndarray_dict(answer['tf_idf'])``
    and the result is stored in each answer under the key ``algorithm``.

    Returns
    -------
    ``{}`` when ``validate_similarity_algorithms(dct, algorithm)`` is truthy;
    ``None`` when ``algorithm`` is not supported; otherwise a list with the
    return value of ``answer.update(...)`` for each answer (``None`` for
    plain dicts). The scores themselves are found in the answers.
    """
    if validate_similarity_algorithms(dct, algorithm):
        return {}

    # One entry per supported algorithm. Lambdas keep the lookup of each
    # distance function lazy, so only the requested one has to be in scope.
    # NOTE(review): mahalanobis, seuclidean and wminkowski receive only two
    # arguments here; if these are SciPy's functions they need additional
    # parameters and presumably raise TypeError - confirm and fix upstream.
    # 'minkowski' was disabled in the original code and stays unsupported.
    metrics = {
        'braycurtis': lambda u, v: braycurtis(u, v),
        'canberra': lambda u, v: canberra(u, v),
        'chebyshev': lambda u, v: chebyshev(u, v),
        'cityblock': lambda u, v: cityblock(u, v),
        'correlation': lambda u, v: correlation(u, v),
        'cosine': lambda u, v: cosine(u, v),
        'euclidean': lambda u, v: euclidean(u, v),
        'mahalanobis': lambda u, v: mahalanobis(u, v),
        'seuclidean': lambda u, v: seuclidean(u, v),
        'sqeuclidean': lambda u, v: sqeuclidean(u, v),
        'wminkowski': lambda u, v: wminkowski(u, v),
        'dice': lambda u, v: dice(u, v),
        'hamming': lambda u, v: hamming(u, v),
        'jaccard': lambda u, v: jaccard(u, v),
        'kulsinski': lambda u, v: kulsinski(u, v),
        'rogerstanimoto': lambda u, v: rogerstanimoto(u, v),
        'russellrao': lambda u, v: russellrao(u, v),
        'sokalmichener': lambda u, v: sokalmichener(u, v),
        'sokalsneath': lambda u, v: sokalsneath(u, v),
        'yule': lambda u, v: yule(u, v),
    }
    metric = metrics.get(algorithm)
    if metric is None:
        return None

    answers = dct['answers']
    # The query vector does not depend on the answer; build it once.
    # assumes ndarray_dict has no side effects - TODO confirm
    query_vector = ndarray_dict(dct['tf_idf'])

    results = []
    for answer in answers:
        score = metric(query_vector, ndarray_dict(answer['tf_idf']))
        results.append(answer.update({algorithm: score}))
    return results
 def distance(self, vector1, vector2, type_):
     """
     Calculate distance between two vectors.

     Args:
         vector1 (list of int/float/bool): Vector in vector space
         vector2 (list of int/float/bool): Vector in vector space
         type_ (str): Type of distance calculation. Allowed types are:
             * For numeric vectors *
             - braycurtis: Computes the Bray-Curtis distance between two arrays.
             - canberra: Computes the Canberra distance between two arrays.
             - chebyshev: Computes the Chebyshev distance.
             - cityblock: Computes the City Block (Manhattan) distance.
             - correlation: Computes the correlation distance between two arrays.
             - cosine: Computes the Cosine distance between arrays.
             - euclidean: Computes the Euclidean distance between two arrays.
             - sqeuclidean: Computes the squared Euclidean distance between two arrays.

             * For boolean vectors *
             - dice: Computes the Dice dissimilarity between two boolean arrays.
             - hamming: Computes the Hamming distance between two arrays.
             - jaccard: Computes the Jaccard-Needham dissimilarity between two boolean arrays.
             - kulsinski: Computes the Kulsinski dissimilarity between two boolean arrays.
             - rogerstanimoto: Computes the Rogers-Tanimoto dissimilarity between two boolean arrays.
             - russellrao: Computes the Russell-Rao dissimilarity between two boolean arrays.
             - sokalmichener: Computes the Sokal-Michener dissimilarity between two boolean arrays.
             - sokalsneath: Computes the Sokal-Sneath dissimilarity between two boolean arrays.
             - yule: Computes the Yule dissimilarity between two boolean arrays.

     Returns:
         float: Distance between vectors.

     Raises:
         ValueError: If ``type_`` is not one of the allowed types.
     """
     supported_types = (
         "braycurtis", "canberra", "chebyshev", "cityblock", "correlation",
         "cosine", "euclidean", "sqeuclidean",
         "dice", "hamming", "jaccard", "kulsinski", "rogerstanimoto",
         "russellrao", "sokalmichener", "sokalsneath", "yule",
     )
     if type_ not in supported_types:
         raise ValueError(
             """Wrong value for type_. Please enter one of supported values.
                          Type help(distance) to see supported values.""")
     # Inside the method body the bare name ``distance`` is the global of
     # that name (presumably scipy.spatial.distance - confirm against the
     # file's imports), not this method. Every allowed type is also the name
     # of the function to call, which is fetched only when requested.
     return getattr(distance, type_)(vector1, vector2)
Beispiel #14
0
def main():
    """
    Print every metric of the local ``Distance`` class next to the result
    of the matching ``scipy.spatial.distance`` function for two fixed
    sample vectors, with a rule line between the comparisons.
    """
    from scipy.spatial import distance

    a = np.array([1, 2, 43])
    b = np.array([3, 2, 1])
    iv = np.array([[1, 0.5, 0.5], [0.5, 1, 0.5], [0.5, 0.5, 1]])

    d = Distance()
    rule = '-----------------------------------------------------------------'

    # (printed label, attribute name,
    #  extra arguments for Distance, extra arguments for SciPy)
    comparisons = [
        ('braycurtis', 'braycurtis', (), ()),
        ('canberra', 'canberra', (), ()),
        ('chebyshev', 'chebyshev', (), ()),
        ('cityblock', 'cityblock', (), ()),
        ('correlation', 'correlation', (), ()),
        ('euclidean', 'euclidean', (), ()),
        ('hamming', 'hamming', (), ()),
        ('jaccard', 'jaccard', (), ()),
        # "manhattan" is reported through the cityblock implementation
        ('manhattan', 'cityblock', (), ()),
        ('cosine', 'cosine', (), ()),
        ('dice', 'dice', (), ()),
        ('kulsinski', 'kulsinski', (), ()),
        ('mahalanobis', 'mahalanobis', (iv,), (iv,)),
        ('seuclidean', 'seuclidean',
         (np.array([0.1, 0.1, 0.1]),), ([0.1, 0.1, 0.1],)),
        ('sokalmichener', 'sokalmichener', (), ()),
        ('sokal_sneath', 'sokalsneath', (), ()),
        ('sqeuclidean', 'sqeuclidean', (), ()),
        ('minkowski', 'minkowski', (2,), (2,)),
        ('rogerstanimoto', 'rogerstanimoto', (), ()),
        ('russellrao', 'russellrao', (), ()),
        ('wminkowski', 'wminkowski', (2, np.ones(3)), (2, np.ones(3))),
        ('yule', 'yule', (), ()),
    ]

    print(rule)
    for label, attribute, own_extra, scipy_extra in comparisons:
        own_value = getattr(d, attribute)(a, b, *own_extra)
        print('My       {}: {}'.format(label, own_value))
        scipy_value = getattr(distance, attribute)(a, b, *scipy_extra)
        print('SciPy    {}: {}'.format(label, scipy_value))
        print(rule)