Ejemplo n.º 1
0
 def test_mp_dissim(self):
     """Test that mp_dissim reduces hubness and improves kNN accuracy.

     A 5-NN classifier on precomputed distances is evaluated twice on the
     same test/train split: once with Euclidean test-train distances and
     once with the dissimilarities returned by ``mp_dissim``. The test
     passes if the ``mp_dissim`` variant has strictly lower hubness and
     strictly higher accuracy.
     """
     # With metric='precomputed', scikit-learn expects a square
     # (train x train) distance matrix in fit(); the raw feature matrix
     # is not a valid input there. predict() then takes the
     # (test x train) matrix, so a single fitted classifier can be
     # reused for both distance measures.
     D_train = cdist(self.X_train, self.X_train, 'euclidean')
     knn = KNeighborsClassifier(n_neighbors=5,
                                metric='precomputed',
                                n_jobs=4)
     knn.fit(D_train, self.y_train)

     # Baseline: Euclidean test-train distances.
     D_part = cdist(self.X_test, self.X_train, 'euclidean')
     y_pred = knn.predict(D_part)
     acc_eucl = accuracy_score(self.y_test, y_pred)
     h_eucl = hubness(D_part, k=5, metric='distance', n_jobs=4)[0]

     # Same evaluation on the mp_dissim test-train dissimilarities.
     D_part_mp = mp_dissim(X=self.X_test,
                           Y=self.X_train,
                           p=0,
                           n_bins=10,
                           bin_size='r',
                           verbose=1,
                           n_jobs=-1)
     y_pred_mp = knn.predict(D_part_mp)
     acc_mp = accuracy_score(self.y_test, y_pred_mp)
     h_mp = hubness(D_part_mp, k=5, metric='distance', n_jobs=4)[0]

     self.assertLess(h_mp, h_eucl)
     self.assertGreater(acc_mp, acc_eucl)
Ejemplo n.º 2
0
 def test_dis_sim_global(self):
     """Check that ``dis_sim_global`` lowers hubness and raises k-NN accuracy.

     Passes only if the hubness of ``self.distance`` is more than twice
     the hubness of the rescaled distances AND the accuracy gain exceeds
     0.07. (The original author notes the fixture data is dexter.)
     """
     # Baseline measurements on the primary distances.
     hubness_before = hubness(self.distance)[0]
     accuracy_before = score(self.distance, self.target)[0][0, 0]

     # Same measurements on the rescaled distances.
     rescaled = dis_sim_global(self.vectors)
     hubness_after = hubness(rescaled)[0]
     accuracy_after = score(rescaled, self.target)[0][0, 0]

     hubness_reduced = hubness_before / hubness_after > 2
     accuracy_gained = accuracy_after - accuracy_before > 0.07
     return self.assertTrue(hubness_reduced & accuracy_gained)
Ejemplo n.º 3
0
 def test_localized_centering(self):
     """Check that ``localized_centering`` lowers hubness, raises accuracy.

     The baseline is measured on ``self.distance``; the centered result
     is evaluated as a similarity matrix. Passes only if the hubness
     ratio (baseline / centered) exceeds 1.5 AND the accuracy gain
     exceeds 0.03. (The original author notes the fixture data is dexter.)
     """
     # Baseline measurements on the primary distances.
     hubness_before = hubness(self.distance)[0]
     accuracy_before = score(self.distance, self.target)[0][0, 0]

     # Localized centering yields similarities, hence metric='similarity'.
     centered = localized_centering(self.vectors, kappa=20, gamma=1.)
     hubness_after = hubness(centered, metric='similarity')[0]
     accuracy_after = score(
         centered, self.target, metric='similarity')[0][0, 0]

     hubness_reduced = hubness_before / hubness_after > 1.5
     accuracy_gained = accuracy_after - accuracy_before > 0.03
     return self.assertTrue(hubness_reduced & accuracy_gained)
Ejemplo n.º 4
0
 def test_dis_sim_local(self):
     """Check that ``dis_sim_local`` lowers hubness and raises k-NN accuracy.

     Passes only if the hubness of ``self.distance`` is more than ten
     times the hubness of the rescaled distances AND the accuracy gain
     exceeds 0.03. (The original author notes the fixture data is dexter.)
     """
     # Baseline measurements on the primary distances.
     hubness_before = hubness(self.distance)[0]
     accuracy_before = score(self.distance, self.target)[0][0, 0]

     # Same measurements on the locally rescaled distances.
     rescaled = dis_sim_local(self.vectors, k=50)
     hubness_after = hubness(rescaled)[0]
     accuracy_after = score(rescaled, self.target)[0][0, 0]

     hubness_reduced = hubness_before / hubness_after > 10
     accuracy_gained = accuracy_after - accuracy_before > 0.03
     return self.assertTrue(hubness_reduced & accuracy_gained)
 def test_ls_dist_equals_sim(self):
     """Check that local scaling ranks alike for distances and similarities.

     The scaled matrices themselves differ (LS_dist != 1 - LS_sim), so
     hubness and k-NN accuracy are compared instead, as proxies for
     equal neighbor ranks.
     """
     self.setUpMod('rnd')
     scaled_dist = local_scaling(self.dist, metric='distance')
     scaled_sim = local_scaling(1 - self.dist, metric='similarity')

     skew_dist = hubness(scaled_dist, metric='distance')[0]
     skew_sim = hubness(scaled_sim, metric='similarity')[0]
     accuracy_dist = score(scaled_dist, self.label, metric='distance')[0]
     accuracy_sim = score(scaled_sim, self.label, metric='similarity')[0]

     same_hubness_and_accuracy = (
         np.allclose(skew_dist, skew_sim)
         and np.allclose(accuracy_dist, accuracy_sim))
     return self.assertTrue(same_hubness_and_accuracy)
Ejemplo n.º 6
0
 def test_parallel_hubness_equal_serial_hubness_distance_based(self):
     """Parallel and serial ``hubness`` must agree on distance input.

     Runs ``hubness`` on ``self.dist`` with ``n_jobs=-1`` and with
     ``n_jobs=1`` and compares each returned value to 7 decimals.
     """
     shared = dict(k=5, metric='distance')
     parallel = hubness(self.dist, verbose=True, n_jobs=-1, **shared)
     serial = hubness(self.dist, verbose=False, n_jobs=1, **shared)
     # Compare the returned values pairwise, in return order.
     for from_parallel, from_serial in zip(parallel, serial):
         np.testing.assert_array_almost_equal(
             from_parallel, from_serial, decimal=7)
Ejemplo n.º 7
0
 def test_parallel_hubness_equal_serial_hubness_similarity_based(self):
     """Parallel and serial ``hubness`` must agree on similarity input.

     Runs ``hubness`` on a random sparse similarity matrix with
     ``n_jobs=-1`` and with ``n_jobs=1`` and compares each returned
     value to 7 decimals.
     """
     similarity = random_sparse_matrix(size=1000)
     shared = dict(k=5, metric='similarity', verbose=False)
     parallel = hubness(similarity, n_jobs=-1, **shared)
     serial = hubness(similarity, n_jobs=1, **shared)
     # Compare the returned values pairwise, in return order.
     for from_parallel, from_serial in zip(parallel, serial):
         np.testing.assert_array_almost_equal(
             from_parallel, from_serial, decimal=7)
Ejemplo n.º 8
0
 def test_hubness_return_values_are_self_consistent(self):
     """Test that the three values returned by ``hubness`` fit together.

     For seeded random data, checks that:

     * the k-neighbor array has shape ``(points, k)``,
     * the k-occurrence equals an independent count of how often each
       index appears in the k-neighbor array, and
     * the skewness equals the skewness recomputed from the returned
       k-occurrence, up to floating-point tolerance.

     Each property is asserted separately so that a failure names the
     value that is inconsistent.
     """
     np.random.seed(626)
     points = 200
     dim = 500
     vector = 99. * (np.random.rand(points, dim) - 0.5)
     dist = euclidean_distance(vector)
     k = 10
     Sk10, Dk10, Nk10 = hubness(dist, k=k)

     # Dk is just checked for correct shape
     self.assertEqual(Dk10.shape, (points, k))

     # Count k-occurrence by scanning the flattened neighbor lists once
     # per point (deliberately a different method than in the module).
     neighbors = Dk10.ravel()
     Nk10_true = np.array([(neighbors == i).sum() for i in range(points)],
                          dtype=int)
     np.testing.assert_array_equal(Nk10, Nk10_true)

     # Recompute skewness from central moments (again a different method
     # than in the module).
     x0 = Nk10 - Nk10.mean()
     s2 = (x0**2).mean()
     m3 = (x0**3).mean()
     Sk10_true = m3 / (s2**1.5)
     # Two different floating-point computations of the same quantity
     # may differ in the last bits, so compare with a tolerance rather
     # than with exact equality.
     np.testing.assert_almost_equal(Sk10, Sk10_true, decimal=10)
Ejemplo n.º 9
0
 def test_hubness_against_distance(self):
     """Compare the ``Hubness`` class with the distance-based function.

     The reference values come from ``hubness(self.D, k=10)``. The class
     is checked twice: fitted on ``self.X`` with its default metric, and
     fitted on ``self.D`` with ``metric='precomputed'``.
     """
     ref_skew, ref_neighbors, ref_occurrence = hubness(self.D, k=10)

     def assert_matches_reference(estimator):
         # Read all three fitted attributes before asserting anything.
         skew = estimator.k_skewness_
         neighbors = estimator.k_neighbors_
         occurrence = estimator.k_occurrence_
         np.testing.assert_almost_equal(skew, ref_skew, decimal=10)
         np.testing.assert_array_equal(neighbors, ref_neighbors)
         np.testing.assert_array_equal(occurrence, ref_occurrence)

     shared = dict(k=10,
                   return_k_neighbors=True,
                   return_k_occurrence=True,
                   verbose=self.verbose)

     # Variant 1: fitted on self.X with the default metric.
     from_vectors = Hubness(**shared)
     from_vectors.fit_transform(self.X)
     assert_matches_reference(from_vectors)

     # Variant 2: fitted on the precomputed matrix self.D.
     from_distances = Hubness(metric='precomputed', **shared)
     from_distances.fit_transform(self.D, has_self_distances=True)
     assert_matches_reference(from_distances)
    def _calc_hubness(self, k: int = 5):
        """Compute hubness statistics for neighborhood size `k`.

        Calls ``hubness`` on ``self.secondary_distance`` with
        ``self.metric`` and stores, keyed by `k`:

        * ``self.hubness``: the skewness of the `k`-occurrence,
        * ``self.anti_hubs``: the percentage (relative to ``self.n``) of
          anti-hubs, i.e. entries whose `k`-occurrence is zero,
        * ``self.max_hub_k_occurence``: the largest `k`-occurrence as a
          percentage of ``self.n``.

        Returns
        -------
        self
            To allow call chaining.
        """
        skewness, _, k_occurrence = hubness(D=self.secondary_distance,
                                            metric=self.metric,
                                            k=k)
        self.hubness[k] = skewness
        anti_hub_count = (k_occurrence == 0).sum()
        self.anti_hubs[k] = 100 * anti_hub_count / self.n
        largest_occurrence = k_occurrence.max()
        self.max_hub_k_occurence[k] = 100 * largest_occurrence / self.n
        return self
        return D_shi
    else:
        # only return test-train-distances (there are no self distances here)
        return D_shi[test_ind]


if __name__ == '__main__':
    # Demo: compare hubness and k-NN accuracy of shared nearest neighbors
    # (SNN) against simhub without labels (SHI) and with labels (SH) on
    # the data returned by io.load_dexter().
    from hub_toolbox.hubness import hubness
    from hub_toolbox.knn_classification import score
    D, y, X = io.load_dexter()
    for label, data in (("D", D), ("y", y), ("X", X)):
        print(label, data.shape)

    # simhub without labels, and shared nearest neighbors.
    D_shi = simhub(D, y=None)
    D_snn = shared_nearest_neighbors(D, k=50)
    skew_shi = hubness(D_shi, k=5)[0]
    skew_snn = hubness(D_snn, k=5)[0]
    knn_shi = score(D_shi, y, 5)[0][0, 0]
    knn_snn = score(D_snn, y, 5)[0][0, 0]

    # simhub with labels.
    D_sh = simhub(D=D, y=y)
    skew_sh = hubness(D_sh, k=5)[0]
    knn_sh = score(D_sh, y, 5)[0][0, 0]

    report = (
        ("hubness SNN:", skew_snn),
        ("hubness SHI:", skew_shi),
        ("hubness SH :", skew_sh),
        ("kNN SNN:", knn_snn),
        ("kNN SHI:", knn_shi),
        ("kNN SH :", knn_sh),
    )
    for caption, value in report:
        print(caption, value)
Ejemplo n.º 12
0
 def test_hubness(self):
     """Compare ``hubness`` skewness with the spreadsheet ground truth.

     The skewness for ``k=2`` on ``self.dist`` must match
     ``self.hubness_truth`` to 10 decimal places.
     """
     skewness = hubness(self.dist, k=2, verbose=1)[0]
     return self.assertAlmostEqual(skewness, self.hubness_truth, places=10)