def test_sparse_equal_dense_if_variable_hits_per_row(shuffle_equal):
    X, _ = make_classification(random_state=123)
    dist = euclidean_distances(X)
    dist[0, 1:3] = 999
    dist[1:3, 0] = 999
    dist[1, 1:5] = 999
    dist[1:5, 1] = 999
    sparse = dist.copy()
    sparse[0, 1:3] = 0
    sparse[1:3, 0] = 0
    sparse[1, 1:5] = 0
    sparse[1:5, 1] = 0
    sparse = csr_matrix(sparse)

    hub = Hubness(metric='precomputed', shuffle_equal=shuffle_equal,
                  random_state=123)
    hub.fit(dist)
    skew_dense = hub.score(has_self_distances=True)

    hub = Hubness(metric='precomputed', shuffle_equal=shuffle_equal,
                  random_state=123)
    hub.fit(sparse)
    skew_sparse = hub.score(has_self_distances=True)

    np.testing.assert_almost_equal(skew_dense, skew_sparse, decimal=2)
def test_hubness(verbose):
    """Test hubness against ground truth calc on spreadsheet"""
    HUBNESS_TRUE = -0.2561204163  # Hubness truth: S_k=5, skewness calculated with bias
    hub = Hubness(k=2, metric='precomputed', verbose=verbose)
    hub.fit(DIST)
    Sk2 = hub.score(has_self_distances=True)
    np.testing.assert_almost_equal(Sk2, HUBNESS_TRUE, decimal=10)
def test_parallel_hubness_equal_serial_hubness_distance_based(dist, n_jobs):
    # Parallel
    hub = Hubness(k=4, metric='precomputed', store_k_occurrence=True,
                  store_k_neighbors=True, n_jobs=n_jobs)
    hub.fit(dist)
    skew_p = hub.score(has_self_distances=True)
    neigh_p = hub.k_neighbors
    occ_p = hub.k_occurrence

    # Sequential
    hub = Hubness(k=4, metric='precomputed', store_k_occurrence=True,
                  store_k_neighbors=True, n_jobs=1)
    hub.fit(dist)
    skew_s = hub.score(has_self_distances=True)
    neigh_s = hub.k_neighbors
    occ_s = hub.k_occurrence

    np.testing.assert_array_almost_equal(skew_p, skew_s, decimal=7)
    np.testing.assert_array_almost_equal(neigh_p, neigh_s, decimal=7)
    np.testing.assert_array_almost_equal(occ_p, occ_s, decimal=7)
def test_hubness_return_values_are_self_consistent(n_samples, n_features, k, seed):
    """Test that the three returned values fit together"""
    np.random.seed(seed)
    vectors = 99. * (np.random.rand(n_samples, n_features) - 0.5)
    hub = Hubness(k=k, metric='euclidean',
                  store_k_neighbors=True, store_k_occurrence=True)
    hub.fit(vectors)
    skew = hub.score()
    neigh = hub.k_neighbors
    occ = hub.k_occurrence

    # Neighbors are just checked for correct shape
    assert neigh.shape == (n_samples, k)

    # Count k-occurrence (different method than in module)
    neigh = neigh.ravel()
    occ_true = np.zeros(n_samples, dtype=int)
    for i in range(n_samples):
        occ_true[i] = (neigh == i).sum()
    np.testing.assert_array_equal(occ, occ_true)

    # Calculate skewness (different method than in module)
    x0 = occ - occ.mean()
    s2 = (x0 ** 2).mean()
    m3 = (x0 ** 3).mean()
    skew_true = m3 / (s2 ** 1.5)
    np.testing.assert_equal(skew, skew_true)
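# Cross-check (a sketch, not part of the original suite): the manual skewness
# above is the biased sample skewness g1 = m3 / m2**1.5, which is exactly what
# scipy.stats.skew computes with bias=True (its default). The Poisson draw
# below is an arbitrary stand-in for a k-occurrence distribution.
def test_skewness_matches_scipy_biased_estimator():
    from scipy.stats import skew as scipy_skew
    rng = np.random.RandomState(123)
    occ = rng.poisson(lam=5., size=50).astype(float)
    x0 = occ - occ.mean()
    skew_manual = (x0 ** 3).mean() / ((x0 ** 2).mean() ** 1.5)
    np.testing.assert_allclose(scipy_skew(occ, bias=True), skew_manual)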
def test_hubness_against_distance(has_self_distances):
    """Test hubness class against distance-based methods."""
    np.random.seed(123)
    X = np.random.rand(100, 50)
    D = euclidean_distances(X)
    verbose = 1

    hub = Hubness(k=10, metric='precomputed',
                  store_k_occurrence=True, store_k_neighbors=True)
    hub.fit(D)
    skew_d = hub.score(has_self_distances=has_self_distances)
    neigh_d = hub.k_neighbors
    occ_d = hub.k_occurrence

    hub = Hubness(k=10, metric='euclidean', store_k_neighbors=True,
                  store_k_occurrence=True, verbose=verbose)
    hub.fit(X)
    skew_v = hub.score(X if not has_self_distances else None)
    neigh_v = hub.k_neighbors
    occ_v = hub.k_occurrence

    np.testing.assert_allclose(skew_d, skew_v, atol=1e-7)
    np.testing.assert_array_equal(neigh_d, neigh_v)
    np.testing.assert_array_equal(occ_d, occ_v)
def test_atkinson():
    X, _ = make_classification(random_state=123)
    hub = Hubness(return_value='k_occurrence').fit(X)
    k_occ = hub.score()
    # The Atkinson index should be continuous at eps = 1,
    # so eps = 0.999 must yield nearly the same value
    atkinson_0999 = hub._calc_atkinson_index(k_occ, eps=.999)
    atkinson_1000 = hub._calc_atkinson_index(k_occ, eps=1)
    np.testing.assert_almost_equal(atkinson_0999, atkinson_1000, decimal=3)
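# For context, a minimal sketch of the textbook Atkinson inequality index
# (an assumption about the math, not necessarily the exact private
# implementation in Hubness._calc_atkinson_index): for eps != 1 it compares a
# generalized mean to the arithmetic mean; at eps = 1 the limit uses the
# geometric mean, which is why the two calls above should (nearly) agree.
def atkinson_index_sketch(x, eps=0.5):
    x = np.asarray(x, dtype=float)
    if eps == 1:
        # Limit case: 1 - geometric mean / arithmetic mean.
        # Note: any zero entry drives the geometric mean (and thus the
        # ratio) to zero, so the index degenerates to 1.
        return 1. - np.exp(np.log(x).mean()) / x.mean()
    return 1. - np.mean(x ** (1. - eps)) ** (1. / (1. - eps)) / x.mean()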
def test_all_params_none():
    X, _ = make_classification()
    hub = Hubness(k=None, return_value=None, hub_size=None, metric=None,
                  store_k_neighbors=None, store_k_occurrence=None,
                  algorithm_params=None, hubness=None, hubness_params=None,
                  verbose=None, n_jobs=None, random_state=None,
                  shuffle_equal=None)
    hub.fit(X)
    _ = hub.score()
def test_all_but_gini():
    X, _ = make_classification()
    hub = Hubness(store_k_occurrence=True, return_value='all_but_gini',
                  store_k_neighbors=True)
    hub.fit(X)
    measures = hub.score()

    for m in VALID_HUBNESS_MEASURES:
        if m in ['all', 'all_but_gini']:
            continue
        elif m == 'gini':
            assert m not in measures
        else:
            assert m in measures
def test_shuffle_equal(verbose):
    # For this data set there shouldn't be any equal distances,
    # and shuffle should make no difference
    X, _ = make_classification(random_state=12354)
    dist = euclidean_distances(X)
    skew_shuffle, skew_no_shuffle = [
        Hubness(metric='precomputed', shuffle_equal=v, verbose=verbose)
        .fit(dist)
        .score()
        for v in [True, False]
    ]
    assert skew_no_shuffle == skew_shuffle
def test_hubness_independent_on_data_set_size(hubness_measure):
    """New measures should pass, traditional skewness should fail."""
    thousands = 3
    n_objects = thousands * 1_000
    rng = np.random.RandomState(1247)
    X = rng.rand(n_objects, 128)
    N_SAMPLES_LIST = np.arange(1, thousands + 1) * 1_000
    value = np.empty(N_SAMPLES_LIST.size)

    for i, n_samples in enumerate(N_SAMPLES_LIST):
        ind = rng.permutation(n_objects)[:n_samples]
        X_sample = X[ind, :]
        hub = Hubness(return_value='all')
        hub.fit(X_sample)
        measures = hub.score()
        if hubness_measure == 'k_skewness':
            value[i] = hub.k_skewness
        elif hubness_measure == 'k_skewness_truncnorm':
            value[i] = hub.k_skewness_truncnorm
        elif hubness_measure == 'robinhood':
            value[i] = hub.robinhood_index
        elif hubness_measure == 'gini':
            value[i] = hub.gini_index
        elif hubness_measure == 'atkinson':
            value[i] = hub.atkinson_index
        assert value[i] == measures[hubness_measure]
        if i > 0:
            if hubness_measure == 'k_skewness':
                with np.testing.assert_raises(AssertionError):
                    np.testing.assert_allclose(value[i], value[i - 1], rtol=0.1)
            else:
                np.testing.assert_allclose(
                    value[i], value[i - 1], rtol=2e-1,
                    err_msg=(f'Hubness measure is too dependent on data set '
                             f'size with S({N_SAMPLES_LIST[i]}) = x '
                             f'and S({N_SAMPLES_LIST[i-1]}) = y.'))

    if hubness_measure == 'k_skewness':
        with np.testing.assert_raises(AssertionError):
            np.testing.assert_allclose(value[-1], value[0], rtol=0.1)
    else:
        np.testing.assert_allclose(value[-1], value[0], rtol=2e-1)
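# Why a measure like the Robin Hood index is (roughly) size-independent while
# skewness is not: it is the fraction of the total k-occurrence "mass" that
# would have to be redistributed to equalize all counts, a ratio that does not
# grow with the sample. A minimal sketch, assuming the standard Hoover/Robin
# Hood definition (hub.robinhood_index should agree, but that is an assumption
# about the library's internals):
def robinhood_index_sketch(k_occurrence):
    k_occurrence = np.asarray(k_occurrence, dtype=float)
    redistribute = np.abs(k_occurrence - k_occurrence.mean()).sum() / 2.
    return redistribute / k_occurrence.sum()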
def test_limiting_factor():
    """Different implementations of Gini index calculation should give the same result."""
    X, _ = make_classification()
    hub = Hubness(store_k_occurrence=True, return_value='k_occurrence')
    hub.fit(X)
    k_occ = hub.score()
    gini_space = hub._calc_gini_index(k_occ, limiting='space')
    gini_time = hub._calc_gini_index(k_occ, limiting='time')
    gini_naive = hub._calc_gini_index(k_occ, limiting=None)
    assert gini_space == gini_time == gini_naive
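# A naive O(n^2) Gini index for comparison, assuming the usual
# mean-absolute-difference definition; the 'space'/'time' variants above
# presumably trade memory for runtime while computing the same quantity:
def gini_index_naive(x):
    x = np.asarray(x, dtype=float)
    n = x.size
    # Mean absolute difference over all pairs, normalized by twice the mean
    mad = np.abs(x[:, None] - x[None, :]).sum() / (n * n)
    return mad / (2. * x.mean())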
def test_return_k_occurrence(store_k_occurrence):
    X, _ = make_classification()
    hub = Hubness(return_value='k_occurrence',
                  store_k_occurrence=store_k_occurrence)
    if store_k_occurrence:
        hub.fit(X)
    else:
        with pytest.warns(UserWarning):
            hub.fit(X)
    _ = hub.score()
def test_return_k_neighbors(store_k_neighbors):
    X, _ = make_classification()
    hub = Hubness(return_value='k_neighbors',
                  store_k_neighbors=store_k_neighbors)
    if store_k_neighbors:
        hub.fit(X)
    else:
        with pytest.warns(UserWarning):
            hub.fit(X)
    _ = hub.score()
def hub_eval(input_points,
             methods={'nothing': (None, None),
                      'mp_normal': ('mp', {'method': 'normal'}),
                      'ls': ('ls', None),
                      'ls_nicdm': ('ls', {'method': 'nicdm'}),
                      'dsl': ('dsl', None)},
             k=k):  # default k is taken from the enclosing scope
    # Score k-skewness once per hubness-reduction method; the mutable default
    # dict is safe here because it is only read, never modified.
    skewness = []
    for method_name, (hubness, hubness_params) in tqdm(methods.items()):
        hub = Hubness(k=k, hubness=hubness, hubness_params=hubness_params)
        hub.fit(input_points)
        skewness.append(hub.score())
    return skewness
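# Hypothetical usage (illustrative only; 'points' is a placeholder array):
#
#     points = np.random.rand(500, 64)
#     skews = hub_eval(points, k=10)
#     for name, s in zip(['nothing', 'mp_normal', 'ls', 'ls_nicdm', 'dsl'], skews):
#         print(f'{name}: {s:.3f}')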
def test_handle_negative_neighbor_indices():
    def mock_kneighbors(X_test):
        nn = np.zeros((len(X_test), 2), dtype=int)
        nn[:, 1] = -1  # negative index, e.g. a sentinel for a missing neighbor
        return nn

    X, _ = make_classification(n_samples=10)
    hub = Hubness()
    hub.fit(X[:8, :])
    hub._k_neighbors = mock_kneighbors
    hub.score(X[8:, :])
def test_sparse_equal_dense(verbose, shuffle_equal):
    X, _ = make_classification()
    dist_dense = euclidean_distances(X)
    dist_sparse = csr_matrix(dist_dense)

    hub = Hubness(metric='precomputed', shuffle_equal=shuffle_equal,
                  verbose=verbose)
    hub.fit(dist_dense)
    skew_dense = hub.score(has_self_distances=True)

    hub.fit(dist_sparse)
    skew_sparse = hub.score(has_self_distances=True)

    np.testing.assert_almost_equal(skew_dense, skew_sparse)
def measure_hubness(n_tracks, output_file, metric, projection, dimensions,
                    n_jobs, random):
    from skhubness import Hubness

    tracks = Track.get_all(limit=n_tracks, random=random)
    models = get_models()
    models_iter = (models.get_combinations() if projection is None
                   else models.get_offline_projections(projection))

    results = []
    for model in list(models_iter):
        for _dimensions in tqdm(range(2, dimensions + 1), desc=str(model)):
            embeddings = model.get_embeddings(tracks, dimensions=slice(_dimensions))
            embeddings_stacked = np.vstack(embeddings)
            hub = Hubness(k=10, metric=metric, return_value='all', n_jobs=n_jobs)
            hub.fit(embeddings_stacked[:, :_dimensions])
            result = {key: value for key, value in hub.score().items()
                      if key in RETURN_VALUES}
            result.update({
                'model': f'{model.dataset}-{model.architecture}',
                'layer': model.layer,
                'dimensions': _dimensions,
            })
            results.append(result)

    results_df = pd.DataFrame(results)
    results_df.to_csv(output_file, float_format=FLOAT_FORMAT)
from sklearn.datasets import olivetti_faces
from sklearn.model_selection import StratifiedKFold
from skhubness import Hubness
from skhubness.neighbors import KNeighborsClassifier

# Fetch data and have a look
d = olivetti_faces.fetch_olivetti_faces()
X, y = d['data'], d['target']
print(f'Data shape: {X.shape}')
print(f'Label shape: {y.shape}')
# (400, 4096)
# (400,)

# The data are embedded in a high-dimensional space.
# Is there hubness, and can we reduce it?
for hubness in [None, 'dsl', 'ls', 'mp']:
    hub = Hubness(k=10, hubness=hubness, return_value='k_skewness')
    hub.fit(X)
    score = hub.score()
    print(f'Hubness (10-skew): {score:.3f} with hubness reduction: {hubness}')
# Hubness (10-skew): 1.972 with hubness reduction: None
# Hubness (10-skew): 1.526 with hubness reduction: dsl
# Hubness (10-skew): 0.943 with hubness reduction: ls
# Hubness (10-skew): 0.184 with hubness reduction: mp

# There is some hubness, and all reduction methods reduce it (to varying degrees).
# Let's assess the best kNN strategy and its estimated performance.
cv_perf = StratifiedKFold(n_splits=5, shuffle=True, random_state=7263)
cv_select = StratifiedKFold(n_splits=5, shuffle=True, random_state=32634)
knn = KNeighborsClassifier(algorithm_params={'n_candidates': 100})
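# A possible continuation (a sketch, assuming skhubness's KNeighborsClassifier
# accepts a 'hubness' parameter analogous to the Hubness class): nested
# cross-validation that selects the hubness-reduction method on the inner
# folds (cv_select) and estimates generalization on the outer folds (cv_perf).
from sklearn.model_selection import GridSearchCV, cross_val_score

param_grid = {'hubness': [None, 'dsl', 'ls', 'mp'],
              'n_neighbors': [5, 10]}
search = GridSearchCV(knn, param_grid, cv=cv_select, n_jobs=-1)
acc = cross_val_score(search, X, y, cv=cv_perf)
print(f'Accuracy: {acc.mean():.3f} +/- {acc.std():.3f}')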
# Legacy comparison against the old hub_toolbox API:
# d_mle = hub_toolbox.intrinsic_dimension.intrinsic_dimension(vectors)
# vectors = vectors[:10000, :]
# d_mle = hub_toolbox.intrinsic_dimension.intrinsic_dimension(vectors)
# vectors = mnist.data
# vectors = vectors[:10000, :]
# d_mle = hub_toolbox.intrinsic_dimension.intrinsic_dimension(vectors)
# D = euclidean_distance(vectors)
# S_k, _, _ = hub_toolbox.hubness.hubness(D=D, k=5, metric='distance')
# D_mp = hub_toolbox.global_scaling.mutual_proximity_empiric(
#     D=D, metric='distance')
# S_k_mp, _, _ = hub_toolbox.hubness.hubness(D=D_mp, k=5, metric='distance')
# print(S_k, S_k_mp)

from skhubness.data import load_dexter
X, y = load_dexter()

from skhubness import Hubness
hub = Hubness(k=10, metric='cosine')
hub.fit(X)
k_skew = hub.score()
print(f'Skewness = {k_skew:.3f}')

from skhubness.neighbors import kneighbors_graph
k = 5
# neighbor_graph = kneighbors_graph(X, n_neighbors=k, hubness='mutual_proximity')
neighbor_graph = kneighbors_graph(X, n_neighbors=k, hubness=None)
neighbor_matrix = neighbor_graph.indices.reshape((X.shape[0], k))
print(neighbor_matrix)
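# To see the effect of hubness reduction on dexter directly, the same Hubness
# API used earlier accepts a 'hubness' argument ('mp' = mutual proximity):
hub_mp = Hubness(k=10, metric='cosine', hubness='mp')
hub_mp.fit(X)
print(f'Skewness after mutual proximity = {hub_mp.score():.3f}')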