def test_hubness_independent_on_data_set_size(hubness_measure):
    """Check how much a hubness measure varies with the data set size.

    New measures should pass, traditional skewness should fail.

    Random subsets of growing size (1000, 2000, 3000 rows) are drawn from
    one fixed random matrix. For every measure except ``'k_skewness'`` the
    score must stay within a relative tolerance of 0.2 between consecutive
    subset sizes and between the smallest and the largest subset. For
    ``'k_skewness'`` the opposite is asserted: the scores must differ by
    more than a relative tolerance of 0.1.

    Parameters
    ----------
    hubness_measure : str
        One of ``'k_skewness'``, ``'k_skewness_truncnorm'``,
        ``'robinhood'``, ``'gini'`` or ``'atkinson'``.

    Raises
    ------
    ValueError
        If ``hubness_measure`` is not one of the names listed above.
    """
    # Name of the measure in the score dict -> attribute on the fitted object.
    measure_attributes = {
        'k_skewness': 'k_skewness',
        'k_skewness_truncnorm': 'k_skewness_truncnorm',
        'robinhood': 'robinhood_index',
        'gini': 'gini_index',
        'atkinson': 'atkinson_index',
    }
    if hubness_measure not in measure_attributes:
        # Fail loudly instead of comparing uninitialised np.empty memory.
        raise ValueError(f'Unknown hubness measure: {hubness_measure!r}')
    attribute = measure_attributes[hubness_measure]
    expect_size_dependence = hubness_measure == 'k_skewness'

    thousands = 3
    n_objects = thousands * 1_000
    rng = np.random.RandomState(1247)
    X = rng.rand(n_objects, 128)
    n_samples_list = np.arange(1, thousands + 1) * 1_000
    value = np.empty(n_samples_list.size)

    for i, n_samples in enumerate(n_samples_list):
        ind = rng.permutation(n_objects)[:n_samples]
        X_sample = X[ind, :]
        hub = Hubness(return_value='all')
        hub.fit(X_sample)
        measures = hub.score()
        value[i] = getattr(hub, attribute)
        # The attribute and the entry of the score dict must agree.
        assert value[i] == measures[hubness_measure]

        if i == 0:
            # Nothing to compare the first subset against yet.
            continue
        if expect_size_dependence:
            with np.testing.assert_raises(AssertionError):
                np.testing.assert_allclose(value[i], value[i - 1], rtol=0.1)
        else:
            np.testing.assert_allclose(
                value[i], value[i - 1], rtol=2e-1,
                err_msg=(f'Hubness measure is too dependent on data set '
                         f'size with S({n_samples_list[i]}) = {value[i]:.4f} '
                         f'and S({n_samples_list[i - 1]}) = {value[i - 1]:.4f}.'))

    # Finally compare the two extremes: smallest vs. largest subset.
    if expect_size_dependence:
        with np.testing.assert_raises(AssertionError):
            np.testing.assert_allclose(value[-1], value[0], rtol=0.1)
    else:
        np.testing.assert_allclose(value[-1], value[0], rtol=2e-1)
def test_hubness_return_values_are_self_consistent(n_samples, n_features, k, seed):
    """Test that the three returned values fit together.

    Fits ``Hubness`` on random vectors and cross-checks the score, the
    stored k-neighbors and the stored k-occurrence against values that are
    recomputed here independently.

    Parameters
    ----------
    n_samples : int
        Number of random vectors to generate.
    n_features : int
        Dimensionality of the random vectors.
    k : int
        Neighborhood size passed to ``Hubness``. It is used as given; it is
        no longer overwritten with a hard-coded 10.
    seed : int
        Seed for NumPy's global random number generator.
    """
    np.random.seed(seed)
    vectors = 99. * (np.random.rand(n_samples, n_features) - 0.5)
    hub = Hubness(k=k, metric='euclidean', store_k_neighbors=True, store_k_occurrence=True)
    hub.fit(vectors)
    skew = hub.score()
    neigh = hub.k_neighbors
    occ = hub.k_occurrence

    # Neighbors are just checked for correct shape
    assert neigh.shape == (n_samples, k)

    # Count k-occurrence (different method than in module):
    # how often does each index appear in anybody's neighbor list?
    flat_neigh = neigh.ravel()
    occ_true = np.zeros(n_samples, dtype=int)
    for i in range(n_samples):
        occ_true[i] = (flat_neigh == i).sum()
    np.testing.assert_array_equal(occ, occ_true)

    # Calculate skewness (different method than in module):
    # third central moment divided by the variance to the power 1.5
    x0 = occ - occ.mean()
    s2 = (x0 ** 2).mean()
    m3 = (x0 ** 3).mean()
    skew_true = m3 / (s2 ** 1.5)
    np.testing.assert_equal(skew, skew_true)
# Example #3
# 0
def measure_hubness(n_tracks, output_file, metric, projection, dimensions, n_jobs, random):
    """Score the hubness of model embeddings and write the results as CSV.

    For every model and every dimensionality from 2 up to and including
    ``dimensions``, the embeddings of the selected tracks are stacked, a
    ``Hubness`` estimator with ``k=10`` is fitted on them, and the scores
    whose keys are listed in ``RETURN_VALUES`` are collected in one row.

    Parameters
    ----------
    n_tracks
        Passed as ``limit`` to ``Track.get_all``.
    output_file
        Destination handed to ``DataFrame.to_csv``.
    metric
        Distance metric forwarded to ``Hubness``.
    projection
        If ``None``, iterate ``models.get_combinations()``; otherwise
        iterate ``models.get_offline_projections(projection)``.
    dimensions : int
        Largest number of embedding dimensions to evaluate (inclusive).
    n_jobs
        Forwarded to ``Hubness``.
    random
        Passed as ``random`` to ``Track.get_all``.
    """
    from skhubness import Hubness

    tracks = Track.get_all(limit=n_tracks, random=random)

    models = get_models()
    if projection is None:
        models_iter = models.get_combinations()
    else:
        models_iter = models.get_offline_projections(projection)

    rows = []
    for model in list(models_iter):
        progress = tqdm(range(2, dimensions + 1), desc=str(model))
        for n_dims in progress:
            stacked = np.vstack(
                model.get_embeddings(tracks, dimensions=slice(n_dims)))

            hub = Hubness(k=10, metric=metric, return_value='all', n_jobs=n_jobs)
            hub.fit(stacked[:, :n_dims])

            # Keep only the requested scores, then tag the row with its origin.
            row = {}
            for key, value in hub.score().items():
                if key in RETURN_VALUES:
                    row[key] = value
            row['model'] = f'{model.dataset}-{model.architecture}'
            row['layer'] = model.layer
            row['dimensions'] = n_dims
            rows.append(row)

    pd.DataFrame(rows).to_csv(output_file, float_format=FLOAT_FORMAT)
# Example #4
# 0
from skhubness import Hubness
from skhubness.neighbors import KNeighborsClassifier

# Fetch the Olivetti faces data and have a look at its dimensions.
# NOTE(review): the import of `olivetti_faces` is not visible in this chunk.
d = olivetti_faces.fetch_olivetti_faces()
X, y = d['data'], d['target']
print(f'Data shape: {X.shape}')
print(f'Label shape: {y.shape}')
# Expected output:
# Data shape: (400, 4096)
# Label shape: (400,)

# The data is embedded in a high-dimensional space.
# Is there hubness, and can we reduce it?
# Measure the k-skewness (k=10) once without hubness reduction (None)
# and once for each of the reduction methods 'dsl', 'ls' and 'mp'.
for hubness in [None, 'dsl', 'ls', 'mp']:
    hub = Hubness(k=10, hubness=hubness, return_value='k_skewness')
    hub.fit(X)
    score = hub.score()
    print(f'Hubness (10-skew): {score:.3f} with hubness reduction: {hubness}')
# Expected output:
# Hubness (10-skew): 1.972 with hubness reduction: None
# Hubness (10-skew): 1.526 with hubness reduction: dsl
# Hubness (10-skew): 0.943 with hubness reduction: ls
# Hubness (10-skew): 0.184 with hubness reduction: mp

# There is some hubness, and all hubness reduction methods can reduce it (to varying degrees).
# Let's assess the best kNN strategy and its estimated performance.
# Two independent 5-fold stratified splitters with fixed seeds.
# NOTE(review): presumably `cv_perf` estimates performance and `cv_select`
# selects the model (nested CV); their use is not visible in this chunk.
cv_perf = StratifiedKFold(n_splits=5, shuffle=True, random_state=7263)
cv_select = StratifiedKFold(n_splits=5, shuffle=True, random_state=32634)

# kNN classifier from skhubness; 'n_candidates' is handed to the neighbor search algorithm.
knn = KNeighborsClassifier(algorithm_params={'n_candidates': 100})

# specify parameters and distributions to sample from