def test_dis_sim_local(self):
    """Test whether hubness and k-NN accuracy improve for dexter."""
    # Baseline figures on the original distance matrix.
    hub_before = hubness(self.distance)[0]
    knn_before = score(self.distance, self.target)[0][0, 0]
    # Apply local DisSim and re-measure.
    secondary = dis_sim_local(self.vectors, k=50)
    hub_after = hubness(secondary)[0]
    knn_after = score(secondary, self.target)[0][0, 0]
    # Expect a strong hubness reduction and a modest accuracy gain.
    hubness_reduced = hub_before / hub_after > 10
    accuracy_gained = knn_after - knn_before > 0.03
    self.assertTrue(hubness_reduced & accuracy_gained)
def test_dis_sim_global(self):
    """Test whether hubness and k-NN accuracy improve for dexter."""
    # Baseline figures on the original distance matrix.
    hub_before = hubness(self.distance)[0]
    knn_before = score(self.distance, self.target)[0][0, 0]
    # Apply global DisSim and re-measure.
    secondary = dis_sim_global(self.vectors)
    hub_after = hubness(secondary)[0]
    knn_after = score(secondary, self.target)[0][0, 0]
    # Expect hubness to at least halve and accuracy to improve notably.
    hubness_reduced = hub_before / hub_after > 2
    accuracy_gained = knn_after - knn_before > 0.07
    self.assertTrue(hubness_reduced & accuracy_gained)
def test_localized_centering(self):
    """Test whether hubness and k-NN accuracy improve for dexter."""
    # Baseline figures on the original distance matrix.
    hub_before = hubness(self.distance)[0]
    knn_before = score(self.distance, self.target)[0][0, 0]
    # Localized centering yields a SIMILARITY matrix, hence the
    # metric="similarity" flag for the downstream measurements.
    centered = localized_centering(self.vectors, "cosine", 20, 1)
    hub_after = hubness(centered, metric="similarity")[0]
    knn_after = score(centered, self.target, metric="similarity")[0][0, 0]
    # Expect a moderate hubness reduction and an accuracy gain.
    hubness_reduced = hub_before / hub_after > 1.5
    accuracy_gained = knn_after - knn_before > 0.03
    self.assertTrue(hubness_reduced & accuracy_gained)
def test_ls_dist_equals_sim(self):
    """Test for equal RANKS using dist. vs. sim. (LS_dist != 1-LS_sim).

    Using hubness and k-NN accuracy as proxy.
    """
    self.setUpMod('rnd')
    # Run local scaling once on the distances and once on the
    # complementary similarities.
    scaled_dist = local_scaling(self.dist, metric='distance')
    scaled_sim = local_scaling(1 - self.dist, metric='similarity')
    # Measure hubness and k-NN accuracy in both representations.
    hub_d, _, _ = hubness(scaled_dist, metric='distance')
    acc_d, _, _ = score(scaled_dist, self.label, metric='distance')
    hub_s, _, _ = hubness(scaled_sim, metric='similarity')
    acc_s, _, _ = score(scaled_sim, self.label, metric='similarity')
    # Both rank-based measures must agree between the two views.
    ranks_agree = np.allclose(hub_d, hub_s) and np.allclose(acc_d, acc_s)
    self.assertTrue(ranks_agree)
def test_knn_score_equal_sklearn_loocv_score(self):
    """Hub-toolbox k-NN LOO-CV accuracy must match sklearn's result.

    Compares `score` (k=5, precomputed distances) against sklearn's
    KNeighborsClassifier under leave-one-out cross-validation. If the
    accuracies differ, fail with the numeric comparison; otherwise
    additionally verify that the per-point predictions agree.
    """
    # The confusion matrix returned by `score` is not needed here.
    acc, correct, _ = \
        score(self.distance, self.label, k=5, metric='distance')
    # Scoring only one k value, so take just the first elements:
    acc = acc[0, 0]
    correct = correct[0]
    # Using the precomputed distance matrix is much faster than letting
    # sklearn recompute cosine distances from the raw vectors.
    knclassifier = KNeighborsClassifier(n_neighbors=5, algorithm='brute',
                                        metric='precomputed')
    n = self.distance.shape[0]  # number of samples, for LOO-CV
    # NOTE(review): `LeaveOneOut(n)` is the pre-0.18 sklearn API; newer
    # versions take no argument -- confirm the pinned sklearn version.
    loo_cv = LeaveOneOut(n)
    predicted_sklearn = cross_val_predict(
        knclassifier, self.distance, self.label, cv=loo_cv)
    acc_sklearn = accuracy_score(self.label, predicted_sklearn)
    if not np.allclose(acc, acc_sklearn):
        return self.assertAlmostEqual(acc, acc_sklearn, places=7)
    else:
        correct_sklearn = predicted_sklearn == self.label
        equal_prediction = np.all(correct == correct_sklearn)
        msg = """Accuracies of hub toolbox k-NN and sklearn-kNN are
                almost equal, but the predictions per data point are not."""
        return self.assertTrue(equal_prediction, msg)
def test_knn_score_matches_correct_prediction_fraction(self):
    """Accuracy from `score` must equal the fraction of correct predictions."""
    ks = np.array([1, 5, 20])
    accuracy, correct, _ = score(self.distance, self.label, k=ks)
    # For each k, the reported accuracy must equal
    # (number of correctly classified points) / (number of points).
    matches = [np.allclose(accuracy[i], correct[i].sum() / self.n)
               for i in range(len(ks))]
    self.assertTrue(all(matches))
def test_knn_score_matches_confusion_matrix(self):
    """Accuracy from `score` must equal (TP+TN)/total from its cmat."""
    ks = np.array([1, 5, 20])
    accuracy, _, cmat = score(self.distance, self.label, k=ks)
    all_match = True
    for i in range(len(ks)):
        # Binary confusion-matrix layout: rows = actual, cols = predicted.
        tp, fn = cmat[i][0, 0], cmat[i][0, 1]
        fp, tn = cmat[i][1, 0], cmat[i][1, 1]
        acc_from_cmat = (tp + tn) / (tp + fn + fp + tn)
        all_match = all_match and np.allclose(accuracy[i], acc_from_cmat)
    self.assertTrue(all_match)
from hub_toolbox.Hubness import hubness
from hub_toolbox.HubnessAnalysis import load_dexter
from hub_toolbox.KnnClassification import score

# Ad-hoc comparison script: measures hubness and k-NN accuracy on either
# random sparse data or the dexter dataset, before and after applying
# empiric mutual proximity.
# NOTE(review): `triu`, `rand`, `csr_matrix`, `np` and
# `mutual_proximity_empiric` are not imported in this chunk -- presumably
# imported earlier in the file; verify before running standalone.
#do = 'random'
do = 'dexter'
if do == 'random':
    print("RANDOM DATA:")
    print("------------")
    # Sparse random symmetric similarity matrix (fixed seed 43),
    # converted to a dense distance matrix.
    S = triu(rand(1000, 1000, 0.05, 'csr', np.float32, 43), 1)
    S += S.T
    D = 1. - S.toarray()
elif do == 'dexter':
    print("DEXTER:")
    print("-------")
    D, c, v = load_dexter()
    # Baseline 5-NN accuracy on distances and on 1-D similarities.
    acc_d, _, _ = score(D, c, [5], 'distance')
    S = csr_matrix(1 - D)
    acc_s, _, _ = score(S, c, [5], 'similarity')
# Baseline hubness (skewness of 5-occurrence) in both representations.
Sn_d, _, _ = hubness(D, 5, 'distance')
Sn_s, _, _ = hubness(S, 5, 'similarity')
print("Orig. dist. hubness:", Sn_d)
print("Orig. sim. hubness:", Sn_s)
if do == 'dexter':
    # Accuracies exist only for dexter (labels `c` are loaded there).
    print("Orig. dist. k-NN accuracy:", acc_d)
    print('Orig. sim. k-NN accuracy:', acc_s)
# Secondary distances/similarities via empiric mutual proximity,
# and the resulting hubness values.
D_mp_emp_d = mutual_proximity_empiric(D)
D_mp_emp_s = mutual_proximity_empiric(S, 'similarity')
Sn_mp_emp_d, _, _ = hubness(D_mp_emp_d, 5)
Sn_mp_emp_s, _, _ = hubness(D_mp_emp_s, 5, 'similarity')
def _calc_knn_accuracy(self, k: int = 5):
    """Compute the `k`-NN accuracy and cache it under key `k`.

    Returns self, so calls can be chained fluently.
    """
    result = score(D=self.secondary_distance, target=self.classes,
                   k=k, metric=self.metric)
    # score() returns (accuracy, correct, cmat); keep only the accuracy.
    self.knn_accuracy[k] = result[0]
    return self