def test_mp_dissim(self):
    """Check that mp_dissim lowers hubness and raises 5-NN accuracy.

    Euclidean test-to-train distances are the baseline; the same
    classifier then predicts from the ``mp_dissim`` distances, and both
    the hubness value and the accuracy are compared.
    """
    hubness_args = dict(k=5, metric='distance', n_jobs=4)

    # Baseline: plain Euclidean distances between test and train objects.
    dist_eucl = cdist(self.X_test, self.X_train, 'euclidean')
    classifier = KNeighborsClassifier(
        n_neighbors=5, metric='precomputed', n_jobs=4)
    # NOTE(review): the classifier is fitted on self.X_train although it is
    # configured with metric='precomputed' -- confirm that the scikit-learn
    # version in use accepts this.
    classifier.fit(self.X_train, self.y_train)
    acc_eucl = accuracy_score(self.y_test, classifier.predict(dist_eucl))
    h_eucl = hubness(dist_eucl, **hubness_args)[0]

    # Same evaluation on the mp_dissim distances.
    dist_mp = mp_dissim(
        X=self.X_test, Y=self.X_train, p=0, n_bins=10, bin_size='r',
        verbose=1, n_jobs=-1)
    acc_mp = accuracy_score(self.y_test, classifier.predict(dist_mp))
    h_mp = hubness(dist_mp, **hubness_args)[0]

    self.assertLess(h_mp, h_eucl)
    self.assertGreater(acc_mp, acc_eucl)
def test_dis_sim_global(self):
    """Test whether hubness and k-NN accuracy improve with dis_sim_global.

    Passes when hubness of the fixture distances drops by more than a
    factor of 2 and k-NN accuracy rises by more than 0.07.
    """
    h_orig = hubness(self.distance)[0]
    acc_orig = score(self.distance, self.target)[0][0, 0]

    dist_dsg = dis_sim_global(self.vectors)
    h_dsg = hubness(dist_dsg)[0]
    acc_dsg = score(dist_dsg, self.target)[0][0, 0]

    # Two separate assertions so that a failure names the criterion that
    # was missed and shows the measured value.
    self.assertGreater(h_orig / h_dsg, 2,
                       msg='hubness reduction factor too small')
    self.assertGreater(acc_dsg - acc_orig, 0.07,
                       msg='k-NN accuracy gain too small')
def test_localized_centering(self):
    """Test whether hubness and k-NN accuracy improve with localized centering.

    Passes when hubness of the fixture distances drops by more than a
    factor of 1.5 and k-NN accuracy rises by more than 0.03. The
    centered result is evaluated as a similarity matrix.
    """
    h_orig = hubness(self.distance)[0]
    acc_orig = score(self.distance, self.target)[0][0, 0]

    sim_lcent = localized_centering(self.vectors, kappa=20, gamma=1.)
    h_lcent = hubness(sim_lcent, metric='similarity')[0]
    acc_lcent = score(sim_lcent, self.target, metric='similarity')[0][0, 0]

    # Two separate assertions so that a failure names the criterion that
    # was missed and shows the measured value.
    self.assertGreater(h_orig / h_lcent, 1.5,
                       msg='hubness reduction factor too small')
    self.assertGreater(acc_lcent - acc_orig, 0.03,
                       msg='k-NN accuracy gain too small')
def test_dis_sim_local(self):
    """Test whether hubness and k-NN accuracy improve with dis_sim_local.

    Passes when hubness of the fixture distances drops by more than a
    factor of 10 and k-NN accuracy rises by more than 0.03.
    """
    h_orig = hubness(self.distance)[0]
    acc_orig = score(self.distance, self.target)[0][0, 0]

    dist_dsl = dis_sim_local(self.vectors, k=50)
    h_dsl = hubness(dist_dsl)[0]
    acc_dsl = score(dist_dsl, self.target)[0][0, 0]

    # Two separate assertions so that a failure names the criterion that
    # was missed and shows the measured value.
    self.assertGreater(h_orig / h_dsl, 10,
                       msg='hubness reduction factor too small')
    self.assertGreater(acc_dsl - acc_orig, 0.03,
                       msg='k-NN accuracy gain too small')
def test_ls_dist_equals_sim(self):
    """Test for equal RANKS using dist. vs. sim. (LS_dist != 1-LS_sim).

    Local scaling is applied once to the distances and once to
    ``1 - distances`` treated as similarities. Hubness and k-NN accuracy
    serve as proxies for rank equality.
    """
    self.setUpMod('rnd')
    ls_dist = local_scaling(self.dist, metric='distance')
    ls_sim = local_scaling(1 - self.dist, metric='similarity')

    h_dist, _, _ = hubness(ls_dist, metric='distance')
    h_sim, _, _ = hubness(ls_sim, metric='similarity')
    acc_dist, _, _ = score(ls_dist, self.label, metric='distance')
    acc_sim, _, _ = score(ls_sim, self.label, metric='similarity')

    # Checked separately (same np.allclose tolerance as before) so a
    # failure tells which proxy disagrees.
    self.assertTrue(
        np.allclose(h_dist, h_sim),
        msg='hubness differs: {} vs. {}'.format(h_dist, h_sim))
    self.assertTrue(
        np.allclose(acc_dist, acc_sim),
        msg='k-NN accuracy differs: {} vs. {}'.format(acc_dist, acc_sim))
def test_parallel_hubness_equal_serial_hubness_distance_based(self):
    """Check that hubness() on distances gives the same three results
    with ``n_jobs=-1`` as with ``n_jobs=1``."""
    skew_par, neigh_par, occ_par = hubness(
        self.dist, k=5, metric='distance', verbose=True, n_jobs=-1)
    skew_ser, neigh_ser, occ_ser = hubness(
        self.dist, k=5, metric='distance', verbose=False, n_jobs=1)

    result_pairs = (
        (skew_par, skew_ser),
        (neigh_par, neigh_ser),
        (occ_par, occ_ser),
    )
    for from_parallel, from_serial in result_pairs:
        np.testing.assert_array_almost_equal(
            from_parallel, from_serial, decimal=7)
def test_parallel_hubness_equal_serial_hubness_similarity_based(self):
    """Check that hubness() on a random sparse similarity matrix gives
    the same three results with ``n_jobs=-1`` as with ``n_jobs=1``."""
    similarity = random_sparse_matrix(size=1000)
    skew_par, neigh_par, occ_par = hubness(
        similarity, k=5, metric='similarity', verbose=False, n_jobs=-1)
    skew_ser, neigh_ser, occ_ser = hubness(
        similarity, k=5, metric='similarity', verbose=False, n_jobs=1)

    result_pairs = (
        (skew_par, skew_ser),
        (neigh_par, neigh_ser),
        (occ_par, occ_ser),
    )
    for from_parallel, from_serial in result_pairs:
        np.testing.assert_array_almost_equal(
            from_parallel, from_serial, decimal=7)
def test_hubness_return_values_are_self_consistent(self):
    """Test that the three values returned by hubness() fit together.

    Random vectors (fixed seed) are turned into a Euclidean distance
    matrix. The k-occurrence and its skewness are then recomputed from
    the returned neighbor array and compared with the returned values.
    """
    np.random.seed(626)
    points = 200
    dim = 500
    vector = 99. * (np.random.rand(points, dim) - 0.5)
    dist = euclidean_distance(vector)
    k = 10
    Sk10, Dk10, Nk10 = hubness(dist, k=k)

    # Dk is just checked for correct shape
    self.assertEqual(Dk10.shape, (points, k))

    # Count k-occurrence (different method than in module): how often
    # each index appears anywhere in the neighbor array.
    neighbor_ids = Dk10.ravel()
    Nk10_true = np.array(
        [(neighbor_ids == i).sum() for i in range(points)], dtype=int)
    np.testing.assert_array_equal(Nk10, Nk10_true)

    # Calculate skewness (different method than in module): third central
    # moment divided by the 1.5th power of the second central moment.
    x0 = Nk10 - Nk10.mean()
    s2 = (x0**2).mean()
    m3 = (x0**3).mean()
    Sk10_true = m3 / (s2**1.5)
    # Two independent float computations need not agree bit for bit, so
    # compare with a tolerance instead of exact equality.
    self.assertAlmostEqual(Sk10, Sk10_true, places=10)
def test_hubness_against_distance(self):
    """Test hubness class against distance-based methods.

    The results of the ``hubness`` function on ``self.D`` are the
    reference. The ``Hubness`` class must reproduce them when fitted on
    ``self.X`` and when fitted on ``self.D`` with ``metric='precomputed'``.
    """
    Sk_dist, Dk_dist, Nk_dist = hubness(self.D, k=10)

    # (extra constructor arguments, input data, extra fit arguments)
    configurations = (
        ({}, self.X, {}),
        ({'metric': 'precomputed'}, self.D, {'has_self_distances': True}),
    )
    for init_extra, data, fit_extra in configurations:
        hub = Hubness(k=10, return_k_neighbors=True,
                      return_k_occurrence=True, verbose=self.verbose,
                      **init_extra)
        hub.fit_transform(data, **fit_extra)

        np.testing.assert_almost_equal(
            hub.k_skewness_, Sk_dist, decimal=10)
        np.testing.assert_array_equal(hub.k_neighbors_, Dk_dist)
        np.testing.assert_array_equal(hub.k_occurrence_, Nk_dist)
def _calc_hubness(self, k: int = 5):
    """Compute hubness statistics for neighborhood size `k`.

    Calls ``hubness`` on ``self.secondary_distance`` with ``self.metric``
    and stores three values under key `k`:

    * ``self.hubness[k]`` -- the hubness value (skewness of `k`-occurrence),
    * ``self.anti_hubs[k]`` -- percentage of objects whose
      `k`-occurrence is zero,
    * ``self.max_hub_k_occurence[k]`` -- the largest `k`-occurrence as a
      percentage of ``self.n``.

    Returns ``self``.
    """
    skewness, _, k_occurrence = hubness(
        D=self.secondary_distance, metric=self.metric, k=k)

    n_anti_hubs = (k_occurrence == 0).sum()
    largest_k_occurrence = k_occurrence.max()

    self.hubness[k] = skewness
    self.anti_hubs[k] = 100 * n_anti_hubs / self.n
    self.max_hub_k_occurence[k] = 100 * largest_k_occurrence / self.n
    return self
return D_shi else: # only return test-train-distances (there are no self distances here) return D_shi[test_ind] if __name__ == '__main__': from hub_toolbox.hubness import hubness from hub_toolbox.knn_classification import score D, y, X = io.load_dexter() print("D", D.shape) print("y", y.shape) print("X", X.shape) D_shi = simhub(D, y=None) D_snn = shared_nearest_neighbors(D, k=50) h = hubness(D_shi, k=5) h_snn = hubness(D_snn, k=5) acc = score(D_shi, y, 5) acc_snn = score(D_snn, y, 5) D_sh = simhub(D=D, y=y) h_sh = hubness(D_sh, k=5) acc_sh = score(D_sh, y, 5) print("hubness SNN:", h_snn[0]) print("hubness SHI:", h[0]) print("hubness SH :", h_sh[0]) print("kNN SNN:", acc_snn[0][0, 0]) print("kNN SHI:", acc[0][0, 0]) print("kNN SH :", acc_sh[0][0, 0])
def test_hubness(self):
    """Test hubness against ground truth calc on spreadsheet.

    The hubness value for ``k=2`` on ``self.dist`` must match
    ``self.hubness_truth`` to 10 decimal places.
    """
    skewness = hubness(self.dist, k=2, verbose=1)[0]
    self.assertAlmostEqual(skewness, self.hubness_truth, places=10)