import numpy as np
import pytest

from skhubness import Hubness


@pytest.mark.parametrize('hubness_measure', ['k_skewness', 'k_skewness_truncnorm',
                                             'robinhood', 'gini', 'atkinson'])
def test_hubness_independent_on_data_set_size(hubness_measure):
    """New measures should pass, traditional skewness should fail."""
    thousands = 3
    n_objects = thousands * 1_000
    rng = np.random.RandomState(1247)
    X = rng.rand(n_objects, 128)
    N_SAMPLES_LIST = np.arange(1, thousands + 1) * 1_000
    value = np.empty(N_SAMPLES_LIST.size)
    for i, n_samples in enumerate(N_SAMPLES_LIST):
        ind = rng.permutation(n_objects)[:n_samples]
        X_sample = X[ind, :]
        hub = Hubness(return_value='all')
        hub.fit(X_sample)
        measures = hub.score()
        if hubness_measure == 'k_skewness':
            value[i] = hub.k_skewness
        elif hubness_measure == 'k_skewness_truncnorm':
            value[i] = hub.k_skewness_truncnorm
        elif hubness_measure == 'robinhood':
            value[i] = hub.robinhood_index
        elif hubness_measure == 'gini':
            value[i] = hub.gini_index
        elif hubness_measure == 'atkinson':
            value[i] = hub.atkinson_index
        assert value[i] == measures[hubness_measure]
        if i > 0:
            if hubness_measure == 'k_skewness':
                # Plain skewness is expected to drift with sample size
                with np.testing.assert_raises(AssertionError):
                    np.testing.assert_allclose(value[i], value[i - 1], rtol=0.1)
            else:
                np.testing.assert_allclose(
                    value[i], value[i - 1], rtol=2e-1,
                    err_msg=(f'Hubness measure is too dependent on data set size '
                             f'with S({N_SAMPLES_LIST[i]}) = {value[i]} '
                             f'and S({N_SAMPLES_LIST[i - 1]}) = {value[i - 1]}.'))
    # Also compare the largest against the smallest sample
    if hubness_measure == 'k_skewness':
        with np.testing.assert_raises(AssertionError):
            np.testing.assert_allclose(value[-1], value[0], rtol=0.1)
    else:
        np.testing.assert_allclose(value[-1], value[0], rtol=2e-1)
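# The measures above differ in how they are normalized. As a point of
# reference, the Robin Hood (Hoover) index can be computed directly from the
# k-occurrence counts. This is a minimal sketch assuming the standard Hoover
# definition, not necessarily the library's exact implementation:
def robinhood_from_k_occurrence(k_occurrence):
    """Share of k-occurrence that must be redistributed to reach equality."""
    occ = np.asarray(k_occurrence, dtype=float)
    # Equivalent to 0.5 * np.abs(occ - occ.mean()).sum() / occ.sum()
    return np.maximum(occ - occ.mean(), 0).sum() / occ.sum()

# e.g. robinhood_from_k_occurrence([0, 0, 1, 9]) -> 0.65 (heavily skewed);
# because the index is normalized by the total occurrence, it stays nearly
# constant as the sample grows, which is what the test above checks.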
# Illustrative parameter grid; the original fixture values are not shown here
@pytest.mark.parametrize('n_samples', [50, 200])
@pytest.mark.parametrize('n_features', [2, 50])
@pytest.mark.parametrize('k', [1, 5, 10])
@pytest.mark.parametrize('seed', [None, 123])
def test_hubness_return_values_are_self_consistent(n_samples, n_features, k, seed):
    """Test that the three returned values fit together."""
    np.random.seed(seed)
    vectors = 99. * (np.random.rand(n_samples, n_features) - 0.5)
    hub = Hubness(k=k, metric='euclidean',
                  store_k_neighbors=True, store_k_occurrence=True)
    hub.fit(vectors)
    skew = hub.score()
    neigh = hub.k_neighbors
    occ = hub.k_occurrence

    # Neighbors are just checked for correct shape
    assert neigh.shape == (n_samples, k)

    # Count k-occurrence (different method than in module)
    neigh = neigh.ravel()
    occ_true = np.zeros(n_samples, dtype=int)
    for i in range(n_samples):
        occ_true[i] = (neigh == i).sum()
    np.testing.assert_array_equal(occ, occ_true)

    # Calculate skewness (different method than in module)
    x0 = occ - occ.mean()
    s2 = (x0 ** 2).mean()
    m3 = (x0 ** 3).mean()
    skew_true = m3 / (s2 ** 1.5)
    np.testing.assert_equal(skew, skew_true)
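# The manual computation above is the biased (population) third standardized
# moment, m3 / m2**1.5, which is exactly what scipy returns with bias=True.
# A sketch of the equivalence (not part of the original test):
from scipy.stats import skew as scipy_skew

def manual_skew_matches_scipy(k_occurrence):
    """m3 / s2**1.5 coincides with scipy.stats.skew(..., bias=True)."""
    x0 = k_occurrence - k_occurrence.mean()
    manual = (x0 ** 3).mean() / ((x0 ** 2).mean() ** 1.5)
    return np.isclose(manual, scipy_skew(k_occurrence, bias=True))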
import numpy as np
import pandas as pd
from tqdm import tqdm


def measure_hubness(n_tracks, output_file, metric, projection, dimensions, n_jobs, random):
    from skhubness import Hubness

    # Track, get_models, RETURN_VALUES and FLOAT_FORMAT are project-level
    # helpers/constants defined elsewhere in this repository
    tracks = Track.get_all(limit=n_tracks, random=random)
    models = get_models()
    models_iter = (models.get_combinations() if projection is None
                   else models.get_offline_projections(projection))

    results = []
    for model in models_iter:
        for _dimensions in tqdm(range(2, dimensions + 1), desc=str(model)):
            embeddings = model.get_embeddings(tracks, dimensions=slice(_dimensions))
            embeddings_stacked = np.vstack(embeddings)

            hub = Hubness(k=10, metric=metric, return_value='all', n_jobs=n_jobs)
            hub.fit(embeddings_stacked[:, :_dimensions])
            result = {key: value for key, value in hub.score().items()
                      if key in RETURN_VALUES}
            result.update({
                'model': f'{model.dataset}-{model.architecture}',
                'layer': model.layer,
                'dimensions': _dimensions,
            })
            results.append(result)

    results_df = pd.DataFrame(results)
    results_df.to_csv(output_file, float_format=FLOAT_FORMAT)
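# A hypothetical invocation (all argument values are illustrative; Track,
# get_models, RETURN_VALUES and FLOAT_FORMAT must exist in the project):
if __name__ == '__main__':
    measure_hubness(
        n_tracks=1_000,             # how many tracks to sample
        output_file='hubness.csv',  # hypothetical output path
        metric='euclidean',
        projection=None,            # None -> iterate over all model combinations
        dimensions=32,              # evaluate embeddings truncated to 2..32 dims
        n_jobs=4,
        random=True,                # draw a random sample of tracks
    )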
from sklearn.datasets import fetch_olivetti_faces
from sklearn.model_selection import StratifiedKFold

from skhubness import Hubness
from skhubness.neighbors import KNeighborsClassifier

# Fetch data and have a look
d = fetch_olivetti_faces()
X, y = d['data'], d['target']
print(f'Data shape: {X.shape}')
print(f'Label shape: {y.shape}')
# Data shape: (400, 4096)
# Label shape: (400,)

# The data is embedded in a high-dimensional space.
# Is there hubness, and can we reduce it?
for hubness in [None, 'dsl', 'ls', 'mp']:
    hub = Hubness(k=10, hubness=hubness, return_value='k_skewness')
    hub.fit(X)
    score = hub.score()
    print(f'Hubness (10-skew): {score:.3f} with hubness reduction: {hubness}')
# Hubness (10-skew): 1.972 with hubness reduction: None
# Hubness (10-skew): 1.526 with hubness reduction: dsl
# Hubness (10-skew): 0.943 with hubness reduction: ls
# Hubness (10-skew): 0.184 with hubness reduction: mp

# There is some hubness, and all reduction methods reduce it to varying degrees.
# Let's assess the best kNN strategy and its estimated performance.
cv_perf = StratifiedKFold(n_splits=5, shuffle=True, random_state=7263)
cv_select = StratifiedKFold(n_splits=5, shuffle=True, random_state=32634)

knn = KNeighborsClassifier(algorithm_params={'n_candidates': 100})

# Specify parameters and distributions to sample from
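# A sketch of how the search could proceed from here (hyperparameter ranges
# and CV wiring below are illustrative, not the original example's values):
from sklearn.model_selection import RandomizedSearchCV, cross_val_score

param_dist = {'n_neighbors': list(range(1, 26)),
              'weights': ['uniform', 'distance'],
              'hubness': [None, 'dsl', 'ls', 'mp']}

# Inner CV (cv_select) picks hyperparameters, incl. the hubness reduction;
# outer CV (cv_perf) estimates the performance of the whole selection pipeline
search = RandomizedSearchCV(estimator=knn,
                            param_distributions=param_dist,
                            n_iter=50,
                            cv=cv_select,
                            random_state=2345)
score = cross_val_score(search, X, y, cv=cv_perf)
print(f'Mean accuracy: {score.mean():.3f} +/- {score.std():.3f}')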