def setUpMod(self, mode='rnd'):
        """Populate the fixture attributes used by the mutual-proximity tests.

        The global NumPy RNG is always re-seeded with 626 first.

        mode='rnd': ``self.vector`` is a 50 x 500 uniform random matrix,
            ``self.label`` holds random integers in [0, 5) and ``self.dist``
            is ``euclidean_distance(self.vector)`` rescaled to [0, 1).
        mode='toy': ``self.dist`` is a fixed 5 x 5 distance matrix and
            ``self.mp_dist_truth`` the hand-computed empiric MP result for it;
            ``self.vector`` and ``self.label`` are set to None.

        Any other mode leaves the attributes untouched.
        """
        np.random.seed(626)
        if mode == 'toy':
            # MP empiric ground truth calculated by hand for this toy example
            toy_condensed = [.2, .1, .8, .4, .3, .5, .7, 1., .6, .9]
            self.dist = squareform(toy_condensed)

            # Ground truth for MP with div/(n-0).
            # Alternative normalisations, kept for reference only:
            #   div/(n-1): [.5, .25, 1., .75, .5, .75, 1., 1., .75, 1.]
            #   div/(n-2): [1/3, 0., 1., 2/3, 1/3, 2/3, 1., 1., 2/3, 1.]
            truth_condensed = [.6, .4, 1., .8, .6, .8, 1., 1., .8, 1.]
            self.mp_dist_truth = squareform(truth_condensed)

            self.vector = None
            self.label = None
        elif mode == 'rnd':
            n_points, n_dims = 50, 500
            self.vector = 99. * (np.random.rand(n_points, n_dims) - 0.5)
            self.label = np.random.randint(0, 5, n_points)
            self.dist = euclidean_distance(self.vector)
            # Scale to [0, 1) while avoiding exactly 1: otherwise
            # sparseMP != denseMP (by design).
            upper = self.dist.max() + 1e-12
            self.dist /= upper
Ejemplo n.º 2
0
 def test_hubness_return_values_are_self_consistent(self):
     """Check that the three values returned by ``hubness`` fit together.

     For a seeded 200 x 500 random data set and k=10 the test verifies:

     * ``Dk`` has shape (points, k);
     * ``Nk`` equals the k-occurrence recounted directly from ``Dk``;
     * ``Sk`` equals the (biased) skewness recomputed from ``Nk``.

     Each property gets its own assertion so a failure names the
     inconsistent value. The skewness is compared with a tolerance because
     it is recomputed by a different formula and exact float equality is
     not guaranteed.
     """
     np.random.seed(626)
     points = 200
     dim = 500
     vector = 99. * (np.random.rand(points, dim) - 0.5)
     dist = euclidean_distance(vector)
     k = 10
     Sk10, Dk10, Nk10 = hubness(dist, k=k)

     # Dk is only checked for its shape.
     self.assertEqual(Dk10.shape, (points, k))

     # Count k-occurrence (different method than in module).
     flat_neighbors = Dk10.ravel()
     Nk10_true = np.array(
         [(flat_neighbors == i).sum() for i in range(points)], dtype=int)
     np.testing.assert_array_equal(Nk10, Nk10_true)

     # Calculate skewness (different method than in module):
     # third central moment over variance**1.5.
     centered = Nk10 - Nk10.mean()
     second_moment = (centered**2).mean()
     third_moment = (centered**3).mean()
     Sk10_true = third_moment / (second_moment**1.5)
     np.testing.assert_allclose(Sk10, Sk10_true, rtol=1e-10)
Ejemplo n.º 3
0
 def setUp(self):
     """Create an unseeded random fixture and the set of method names.

     ``self.vector`` is a 100 x 10 uniform random matrix, ``self.label``
     holds random integers in [0, 5), ``self.dist`` is
     ``euclidean_distance(self.vector)`` and ``self.SEC_DIST`` is the set
     of secondary-distance identifiers used by the tests.
     """
     n_points, n_dims = 100, 10
     self.vector = 99. * (np.random.rand(n_points, n_dims) - 0.5)
     self.label = np.random.randint(0, 5, n_points)
     self.dist = euclidean_distance(self.vector)
     self.SEC_DIST = {
         'mp', 'mp_gaussi', 'mp_gammai', 'ls', 'nicdm', 'snn',
         'cent', 'wcent', 'lcent', 'dsg', 'dsl', 'orig',
     }
Ejemplo n.º 4
0
def calculate_AUC(X, embed, k_high=20, k_low_max=100):
    """Area under the averaged precision/recall curve of neighbourhood overlap.

    For every sample the ``k_high`` neighbours in the original space are
    treated as the relevant set. For each low-dimensional neighbourhood
    size 1..``k_low_max`` the overlap with that set gives a precision
    (overlap / low-dim size) and a recall (overlap / ``k_high``). Both are
    averaged over all samples and the resulting curve is integrated with
    ``metrics.auc``.

    Parameters
    ----------
    X : array with ``shape[0]`` samples
        Data in the original (high-dimensional) space.
    embed : array
        Embedding of the same samples, row-aligned with ``X``.
    k_high : int, default 20
        Neighbourhood size in the original space.
    k_low_max : int, default 100
        Largest neighbourhood size evaluated in the embedding.

    Returns
    -------
    The value returned by ``metrics.auc(average_recall, average_precision)``.

    Notes
    -----
    ``sort_D`` is assumed to return ``(sorted_distances, neighbour_indices)``
    per row in ascending order with the self-distance removed — TODO confirm
    against its definition.
    """
    n = X.shape[0]

    D_X = euclidean_distance(X)
    D_embed = euclidean_distance(embed)

    _, high_idx = sort_D(D_X, k=k_high)
    _, low_idx = sort_D(D_embed, k=k_low_max)  # n x k_low_max index matrix

    n_precision = np.zeros((n, k_low_max), dtype=float)
    n_recall = np.zeros((n, k_low_max), dtype=float)
    for i in range(n):
        for j in range(k_low_max):
            # Number of true neighbours among the first j + 1 embedded ones.
            n_tp = len(np.intersect1d(high_idx[i, :], low_idx[i, :j + 1]))
            n_precision[i, j] = n_tp / (j + 1.)
            n_recall[i, j] = n_tp / float(k_high)

    average_precision = np.mean(n_precision, axis=0)
    average_recall = np.mean(n_recall, axis=0)

    return metrics.auc(average_recall, average_precision)
Ejemplo n.º 5
0
def create_knngraph(X, k):
    """Return the k nearest neighbours of every row of X.

    Distances come from ``euclidean_distance(X)``. For each row the
    entries are ranked in ascending order, the first ranked entry is
    skipped (taken to be the point itself) and the next ``k`` are kept.

    Returns
    -------
    neigh_dist : (n, k) float array of distances to the kept neighbours.
    neigh_idx : (n, k) int array of their indices.
    """
    n = X.shape[0]
    pairwise = euclidean_distance(X)
    neigh_dist = np.zeros((n, k))
    neigh_idx = np.zeros((n, k), dtype=int)

    for row in range(n):
        row_dist = pairwise[row, :]
        # Indices that sort this row in ascending order.
        ranked = np.argsort(row_dist)
        # Skip position 0 (the point itself), keep the k closest others.
        neigh_idx[row, :] = ranked[1:k + 1]
        neigh_dist[row, :] = row_dist[neigh_idx[row, :]]

    return neigh_dist, neigh_idx
Ejemplo n.º 6
0
 def setUpMod(self, mode='rnd'):
     """Populate the fixture attributes used by the shared-neighbour tests.

     The global NumPy RNG is always re-seeded with 626 first.

     mode='rnd': ``self.vector`` is a 200 x 500 uniform random matrix,
         ``self.label`` holds random integers in [0, 5) and ``self.dist``
         is ``euclidean_distance(self.vector)`` (left unscaled).
     mode='toy': ``self.dist`` is a fixed 5 x 5 distance matrix and
         ``self.snn_dist_truth`` the hand-computed SNN (k=2) result for
         it; ``self.vector`` and ``self.label`` are set to None.

     Any other mode leaves the attributes untouched.
     """
     np.random.seed(626)
     if mode == 'toy':
         # SNN (k=2) ground truth calculated by hand for this toy example
         toy_condensed = [.2, .1, .8, .4, .3, .5, .7, 1., .6, .9]
         snn_condensed = [.5, .5, .5, .5, .5, .5, 0., 0., .5, .5]
         self.dist = squareform(toy_condensed)
         self.snn_dist_truth = squareform(snn_condensed)
         self.vector = None
         self.label = None
     elif mode == 'rnd':
         n_points, n_dims = 200, 500
         self.vector = 99. * (np.random.rand(n_points, n_dims) - 0.5)
         self.label = np.random.randint(0, 5, n_points)
         # The distances are intentionally not rescaled to [0, 1) here.
         self.dist = euclidean_distance(self.vector)
 def setUpMod(self, mode='rnd'):
     """Populate the fixture attributes used by the LS / NICDM tests.

     The global NumPy RNG is always re-seeded with 626 first.

     mode='rnd': ``self.vector`` is a 200 x 500 uniform random matrix,
         ``self.label`` holds random integers in [0, 5) and ``self.dist``
         is ``euclidean_distance(self.vector)``.
     mode='toy': ``self.dist`` is a fixed 5 x 5 distance matrix;
         ``self.ls_dist_truth`` and ``self.nicdm_dist_truth`` hold the
         reference results for it; ``self.vector`` and ``self.label`` are
         set to None.

     Any other mode leaves the attributes untouched.
     """
     np.random.seed(626)
     if mode == 'toy':
         # LS/NICDM ground truth calculated in spreadsheet for toy example
         toy_condensed = [.2, .1, .8, .4, .3, .5, .7, 1., .6, .9]
         ls_condensed = [
             0.486582881, 0.1535182751, 0.9816843611, 0.7364028619,
             0.6321205588, 0.6471339185, 0.9342714714, 0.9844961464,
             0.8646647168, 0.8150186001,
         ]
         nicdm_condensed = [
             0.310029690448236, 0.173311865721368, 0.769089007390428,
             0.438448192970227, 0.402740381783397, 0.37233361467179,
             0.594335892341949, 0.832563272714335, 0.569560910033398,
             0.473903322836619,
         ]
         self.dist = squareform(toy_condensed)
         self.ls_dist_truth = squareform(ls_condensed)
         self.nicdm_dist_truth = squareform(nicdm_condensed)
         self.vector = None
         self.label = None
     elif mode == 'rnd':
         n_points, n_dims = 200, 500
         self.vector = 99. * (np.random.rand(n_points, n_dims) - 0.5)
         self.label = np.random.randint(0, 5, n_points)
         self.dist = euclidean_distance(self.vector)
Ejemplo n.º 8
0
def _hubness_graph_neighbors(X, n_extra, **graph_kwargs):
    """Build a kNN graph with ``kneighbors_graph`` and return it together
    with its neighbour indices reshaped to (n_samples, n_extra)."""
    graph = kneighbors_graph(X, n_neighbors=n_extra, **graph_kwargs)
    nbrs = graph.indices.astype(int).reshape((X.shape[0], n_extra))
    return graph, nbrs


def _euclidean_to_neighbors(X, nbrs):
    """Return the plain Euclidean distance from each row of X to each of
    the neighbours listed for it in ``nbrs`` (same shape as ``nbrs``)."""
    D = euclidean_distance(X)
    rows = np.arange(D.shape[0])[:, None]
    return D[rows, nbrs]


def generate_triplets(X,
                      n_inlier,
                      n_outlier,
                      n_random,
                      fast_trimap=True,
                      weight_adj=True,
                      verbose=True,
                      hub='mp'):
    """Sample triplets and triplet weights, optionally with hubness reduction.

    Steps: (1) reduce X to 100 dimensions with TruncatedSVD when it has
    more, (2) find ``n_extra = min(max(n_inlier, 150), n)`` neighbours per
    point using the strategy selected by ``hub``, (3) turn neighbour
    distances into similarities ``P``, (4) sample kNN triplets and weight
    them, (5) optionally append random triplets, (6) normalise the weights.

    Parameters
    ----------
    X : 2-D array, one row per sample.
    n_inlier, n_outlier : forwarded to ``sample_knn_triplets``.
    n_random : forwarded to ``sample_random_triplets``; 0 disables random
        triplets.
    fast_trimap : when no hub mode matches and n > 10000, selects the
        Annoy-only search (True) or the mixed brute-force + Annoy search
        (False).
    weight_adj : falsy disables the log transform of the weights;
        otherwise ``log(1 + weight_adj * weights)`` is applied.
        NOTE(review): ``True`` is an ``int`` instance, so the 400.0
        fallback is not applied to it and the factor used is 1 — confirm
        whether that is intended.
    verbose : print progress and debug information.
    hub : neighbour / weighting strategy.
        'mp1'       neighbours from a mutual-proximity kNN graph, Euclidean
                    distances to them (hubness reduction for triplet
                    selection only)
        'mp2'       Gaussian mutual proximity; similarity is 1 - D_mp
        'mp3_gauss' Gaussian mutual proximity used as secondary distance
        'mp3_emp'   empiric mutual proximity used as secondary distance
        'mp4'       neighbours and distances from the mutual-proximity graph
        'ls1'       neighbours from a local-scaling kNN graph, Euclidean
                    distances to them
        'ls2'       local scaling (k=10); similarity is 1 - D_ls
        'dsl'       neighbours from a 'dsl' kNN graph, Euclidean distances
        'mutual'    exact kNN post-processed by ``make_mutual``
        'SNN1'      shared nearest neighbours; similarity is 1 - D_snn
        'SNN2'      shared nearest neighbours used as secondary distance
        'weight'    plain neighbour search, weights rescaled per point by
                    ``exp(-deg / median(deg))``
        Any other value, including the default 'mp', uses the plain
        neighbour search (exact kNN for n <= 10000, Annoy otherwise).

    Returns
    -------
    (triplets, weights) : the integer triplet array and one weight per
        triplet, scaled so the maximum weight is 1 when ``weight_adj`` is
        truthy.
    """
    n, dim = X.shape
    if dim > 100:
        X = TruncatedSVD(n_components=100, random_state=0).fit_transform(X)
        dim = 100
    exact = n <= 10000
    n_extra = min(max(n_inlier, 150), n)

    # Modes whose triplet weights are read directly from 1 - D_secondary.
    direct_weight_modes = ('mp2', 'SNN1', 'ls2')
    # Modes that announce themselves when verbose.
    announced_modes = ('mp1', 'mp2', 'mp3_gauss', 'mp3_emp', 'mp4', 'ls1',
                       'ls2', 'dsl', 'SNN1', 'SNN2')

    # ---- 1. nearest neighbours -------------------------------------------
    if hub == 'mp1':
        # Hubness reduction is used for triplet selection only.
        _, nbrs = _hubness_graph_neighbors(
            X, n_extra,
            mode='distance',
            hubness='mutual_proximity',
            hubness_params={'method': 'normal'})
        distances = _euclidean_to_neighbors(X, nbrs)

    elif hub == 'mp2':
        # Similarity P will be 1 - D_mp.
        D = euclidean_distance(X)
        D_mp = hub_toolbox.global_scaling.mutual_proximity_gaussi(
            D=D, metric='distance')
        distances, nbrs = KNN_Info(D_mp, n_extra)

    elif hub == 'mp3_gauss':
        # Similarity is computed from the secondary distance.
        D = euclidean_distance(X)
        D_mp = hub_toolbox.global_scaling.mutual_proximity_gaussi(
            D=D, metric='distance')
        # The primary distance matrix is no longer needed; free it early.
        del D
        gc.collect()
        distances, nbrs = KNN_Info(D_mp, n_extra)

    elif hub == 'mp3_emp':
        # Similarity is computed from the secondary distance.
        D = euclidean_distance(X)
        D_mp = hub_toolbox.global_scaling._mutual_proximity_empiric_full(
            D=D, metric='distance')
        # Build the kNN information from the secondary distances, exactly
        # as the 'mp3_gauss' mode does.
        distances, nbrs = KNN_Info(D_mp, n_extra)

    elif hub == 'mp4':
        # Purpose of this variant is undocumented in the original code.
        graph, nbrs = _hubness_graph_neighbors(
            X, n_extra,
            mode='distance',
            hubness='mutual_proximity',
            hubness_params={'method': 'normal'})
        distances = graph.data.reshape((X.shape[0], n_extra))

    elif hub == 'ls1':
        _, nbrs = _hubness_graph_neighbors(
            X, n_extra, mode='distance', hubness='local_scaling')
        distances = _euclidean_to_neighbors(X, nbrs)

    elif hub == 'ls2':
        D = euclidean_distance(X)
        D_ls = hub_toolbox.local_scaling.local_scaling(D=D,
                                                       k=10,
                                                       metric='distance')
        distances, nbrs = KNN_Info(D_ls, n_extra)

    elif hub == 'dsl':
        _, nbrs = _hubness_graph_neighbors(
            X, n_extra, mode='connectivity', hubness='dsl')
        distances = _euclidean_to_neighbors(X, nbrs)

    elif hub == 'mutual':
        knn_tree = knn(n_neighbors=n_extra, algorithm='auto').fit(X)
        distances, nbrs = knn_tree.kneighbors(X)
        nbrs = make_mutual(nbrs)

    elif hub == 'SNN1' or hub == 'SNN2':
        D = euclidean_distance(X)
        D_snn = hub_toolbox.shared_neighbors.shared_nearest_neighbors(
            D=D, metric='distance')
        distances, nbrs = KNN_Info(D_snn, n_extra)

    elif exact:  # do exact knn search
        knn_tree = knn(n_neighbors=n_extra, algorithm='auto').fit(X)
        distances, nbrs = knn_tree.kneighbors(X)

    elif fast_trimap:  # use annoy
        tree = AnnoyIndex(dim, metric='euclidean')
        for i in range(n):
            tree.add_item(i, X[i, :])
        tree.build(10)
        nbrs = np.empty((n, n_extra), dtype=np.int64)
        distances = np.empty((n, n_extra), dtype=np.float64)
        dij = np.empty(n_extra, dtype=np.float64)
        for i in range(n):
            nbrs[i, :] = tree.get_nns_by_item(i, n_extra)
            for j in range(n_extra):
                dij[j] = euclid_dist(X[i, :], X[nbrs[i, j], :])
            # Re-sort the approximate neighbours by exact distance.
            sort_indices = np.argsort(dij)
            nbrs[i, :] = nbrs[i, sort_indices]
            distances[i, :] = dij[sort_indices]

    else:
        # Mixed search: n_bf brute-force neighbours plus Annoy candidates.
        n_bf = 10
        n_extra += n_bf
        knn_tree = knn(n_neighbors=n_bf, algorithm='auto').fit(X)
        _, nbrs_bf = knn_tree.kneighbors(X)
        nbrs = np.empty((n, n_extra), dtype=np.int64)
        nbrs[:, :n_bf] = nbrs_bf
        tree = AnnoyIndex(dim, metric='euclidean')
        for i in range(n):
            tree.add_item(i, X[i, :])
        tree.build(100)
        distances = np.empty((n, n_extra), dtype=np.float64)
        dij = np.empty(n_extra, dtype=np.float64)
        for i in range(n):
            nbrs[i, n_bf:] = tree.get_nns_by_item(i, n_extra - n_bf)
            # Both sources may return the same point; keep each one once.
            unique_nn = np.unique(nbrs[i, :])
            n_unique = len(unique_nn)
            nbrs[i, :n_unique] = unique_nn
            for j in range(n_unique):
                dij[j] = euclid_dist(X[i, :], X[nbrs[i, j], :])
            sort_indices = np.argsort(dij[:n_unique])
            nbrs[i, :n_unique] = nbrs[i, sort_indices]
            distances[i, :n_unique] = dij[sort_indices]

    if verbose and hub in announced_modes:
        print("hubness reduction with {}".format(hub))
    if verbose:
        print("found nearest neighbors")

    # ---- 2. similarities and kNN triplets --------------------------------
    if hub == 'mp2':
        P = 1 - distances  # (n, k)
    else:
        # Per-point scale parameter taken from neighbour columns 10..19.
        sig = np.maximum(np.mean(distances[:, 10:20], axis=1), 1e-20)
        P = find_p(distances, sig, nbrs)

    triplets = sample_knn_triplets(P, nbrs, n_inlier, n_outlier)
    if verbose:
        print("tri_shape", triplets[0], triplets[0][2])
    n_triplets = triplets.shape[0]

    # ---- 3. distance from each anchor to its outlier ---------------------
    # Left uninitialised for the direct-weight modes, which do not use it.
    outlier_dist = np.empty(n_triplets, dtype=np.float64)

    if hub in direct_weight_modes:
        pass

    elif hub == 'mp3_gauss' or hub == 'mp3_emp':
        outlier_dist[:] = D_mp[triplets[:, 0], triplets[:, 2]]

    elif hub == 'SNN2':
        outlier_dist[:] = D_snn[triplets[:, 0], triplets[:, 2]]

    elif exact or not fast_trimap:
        for t in range(n_triplets):
            outlier_dist[t] = np.sqrt(
                np.sum((X[triplets[t, 0], :] - X[triplets[t, 2], :])**2))
    else:
        for t in range(n_triplets):
            outlier_dist[t] = euclid_dist(X[triplets[t, 0], :],
                                          X[triplets[t, 2], :])

    # ---- 4. triplet weights ----------------------------------------------
    if hub in direct_weight_modes:
        if hub == 'SNN1':
            D_secondary = D_snn
        elif hub == 'ls2':
            D_secondary = D_ls
        else:
            D_secondary = D_mp

        if verbose:
            print("P and triplets' shape", triplets)
        P = 1 - D_secondary  # (n, n)
        anchors = triplets[:, 0]
        p_sim = P[anchors, triplets[:, 1]]
        # Clamp to avoid division by zero.
        p_out = np.maximum(P[anchors, triplets[:, 2]], 1e-20)
        weights = np.asarray(p_sim / p_out, dtype=np.float64)
    else:
        weights = find_weights(triplets, P, nbrs, outlier_dist, sig)

    if hub == 'weight':
        deg, _, _ = calculate_deg(nbrs)
        hs_med = np.median(deg)
        hub_weights = np.exp(-deg / hs_med)

        # Scale each point's group of triplet weights exactly once.
        # Assumes the kNN triplets are stored in consecutive groups of
        # n_inlier * n_outlier per anchor point — TODO confirm against
        # sample_knn_triplets.
        per_point = n_inlier * n_outlier
        for i in range(hub_weights.shape[0]):
            weights[i * per_point:(i + 1) * per_point] *= hub_weights[i]

    if verbose:
        print('out_dist: ', outlier_dist)

    # ---- 5. optional random triplets -------------------------------------
    if n_random > 0:
        if hub in direct_weight_modes:
            rand_triplets = sample_random_triplets(X, n_random,
                                                   P=P)  # P: (n, n)
        else:
            rand_triplets = sample_random_triplets(X, n_random, sig=sig)

        # The last column carries the weight, the rest are the indices.
        rand_weights = rand_triplets[:, -1]
        rand_triplets = rand_triplets[:, :-1].astype(np.int64)
        triplets = np.vstack((triplets, rand_triplets))
        weights = np.hstack((weights, rand_weights))

    # ---- 6. normalisation ------------------------------------------------
    weights /= np.max(weights)
    weights += 0.0001
    if weight_adj:
        if not isinstance(weight_adj, (int, float)):
            weight_adj = 400.0
        weights = np.log(1 + weight_adj * weights)
        weights /= np.max(weights)
    return (triplets, weights)
Ejemplo n.º 9
0
def mantel_test(X, L, embed, describe=True, n_splits=50, test_size=1000,
                perms=10000, random_state=0):
    """Mantel test between class-centroid distances before and after embedding.

    For each stratified random subsample, the per-label centroids are
    computed in the original space (``X``) and in the embedding
    (``embed``). The two centroid distance matrices are then compared with
    ``Mantel.test`` (Pearson, upper tail).

    Parameters
    ----------
    X : array, one row per sample, original data.
    L : 1-D array of labels, aligned with ``X``.
    embed : array, embedding of the same samples, aligned with ``X``.
    describe : when truthy, print the p-values and summary statistics of
        the correlation coefficients.
    n_splits : number of stratified subsamples (default 50).
    test_size : size of each subsample, as accepted by
        ``StratifiedShuffleSplit`` (default 1000).
    perms : number of permutations per Mantel test (default 10000).
    random_state : seed for the subsampling (default 0).

    Returns
    -------
    r_lst, p_lst : float arrays with one correlation coefficient and one
        p-value per subsample.
    """
    sss = StratifiedShuffleSplit(n_splits=n_splits,
                                 test_size=test_size,
                                 random_state=random_state)

    label_type = list(set(L))
    r_values = []
    p_values = []
    for _, idx in sss.split(X, L):
        X_high, L_hl = X[idx], L[idx]
        X_low = embed[idx]

        # Row positions of every label inside this subsample.
        label_rows = [np.flatnonzero(L_hl == label) for label in label_type]

        # One centroid per label, in the same label order for both spaces
        # so the two distance matrices are comparable entry by entry.
        X_high_lst = [np.mean(X_high[rows], axis=0) for rows in label_rows]
        X_low_lst = [np.mean(X_low[rows], axis=0) for rows in label_rows]

        D_high = euclidean_distance(X_high_lst)
        D_low = euclidean_distance(X_low_lst)

        # The third return value is not used here.
        r, p, _ = Mantel.test(D_high,
                              D_low,
                              perms=perms,
                              method='pearson',
                              tail='upper')
        r_values.append(r)
        p_values.append(p)

    r_lst = np.array(r_values, dtype=float)
    p_lst = np.array(p_values, dtype=float)

    if describe:
        print(p_lst)
        print(pd.DataFrame(pd.Series(r_lst.ravel()).describe()).transpose())

    return r_lst, p_lst
Ejemplo n.º 10
0
# Run UMAP several times with different random seeds and plot the result.
# For every seed two embeddings are fitted: one on the raw data with the
# Euclidean metric and one on the precomputed Euclidean distance matrix.
# NOTE(review): only the first embedding (u_org) is plotted; u_hub is
# computed but not displayed, and D is a plain Euclidean distance matrix
# with no hubness reduction applied — confirm this is intended.
iter_n = 5
seed_lst = random.sample(range(100), k=iter_n)
print(seed_lst)

# Settings shared by both UMAP fits.
_umap_common = dict(init='random', n_neighbors=k, n_epochs=2000, min_dist=0.5)

for i, seed in enumerate(seed_lst):
    fit = umap.UMAP(metric='euclidean', random_state=seed, **_umap_common)
    u_org = fit.fit_transform(data)

    D = euclidean_distance(data)
    fit = umap.UMAP(metric='precomputed', random_state=seed, **_umap_common)
    u_hub = fit.fit_transform(D)

    plt.scatter(u_org[:, 0], u_org[:, 1], c=labels, cmap="Spectral", s=10)
    plt.show()
Ejemplo n.º 11
0
 def test_euclidean_dist_equal_to_scipy_cdist_eucl(self):
     """``euclidean_distance`` must agree with SciPy's ``cdist`` on
     ``self.vectors`` to 7 decimal places."""
     computed = euclidean_distance(self.vectors)
     reference = cdist(self.vectors, self.vectors, 'euclidean')
     np.testing.assert_array_almost_equal(computed, reference, decimal=7)
Ejemplo n.º 12
0
 def setUp(self):
     """Seeded fixture for the hubness tests.

     Hubness truth: S_k=5, skewness calculated with bias.
     Sets ``self.X`` (100 x 50 uniform random data, seed 123),
     ``self.D`` (its ``euclidean_distance`` matrix) and ``self.verbose``.
     """
     np.random.seed(123)
     n_samples, n_features = 100, 50
     self.X = np.random.rand(n_samples, n_features)
     self.D = euclidean_distance(self.X)
     self.verbose = 1