Esempio n. 1
0
def test_sparse_equal_dense_if_variable_hits_per_row(shuffle_equal):
    """Dense and sparse precomputed distances must yield similar skewness.

    A few entries are set to 999 in the dense matrix, while the same entries
    are zeroed in the sparse copy (csr_matrix does not store zeros), so rows
    of the sparse matrix hold different numbers of stored distances.

    ``shuffle_equal`` is forwarded unchanged to ``Hubness``; presumably it is
    supplied by pytest parametrization (decorators are not visible here).
    """
    X, _ = make_classification(random_state=123)
    dist = euclidean_distances(X)

    # Index regions that are overwritten in both representations.
    regions = (
        (0, slice(1, 3)),
        (slice(1, 3), 0),
        (1, slice(1, 5)),
        (slice(1, 5), 1),
    )
    for region in regions:
        dist[region] = 999

    sparse = dist.copy()
    for region in regions:
        sparse[region] = 0
    sparse = csr_matrix(sparse)

    def skewness_of(distances):
        # A fresh, identically seeded estimator per input.
        estimator = Hubness(metric='precomputed',
                            shuffle_equal=shuffle_equal,
                            random_state=123)
        estimator.fit(distances)
        return estimator.score(has_self_distances=True)

    skew_dense = skewness_of(dist)
    skew_sparse = skewness_of(sparse)

    np.testing.assert_almost_equal(skew_dense, skew_sparse, decimal=2)
Esempio n. 2
0
def test_parallel_hubness_equal_serial_hubness_distance_based(dist, n_jobs):
    """Hubness results must not depend on the number of jobs.

    The estimator is run once with ``n_jobs`` and once with a single job on
    the same precomputed distances ``dist``; score, stored neighbors and
    stored k-occurrence are compared to 7 decimals.
    """
    def estimate(jobs):
        estimator = Hubness(k=4,
                            metric='precomputed',
                            store_k_occurrence=True,
                            store_k_neighbors=True,
                            n_jobs=jobs)
        estimator.fit(dist)
        skew = estimator.score(has_self_distances=True)
        return skew, estimator.k_neighbors, estimator.k_occurrence

    parallel = estimate(n_jobs)
    sequential = estimate(1)

    # Compare (skewness, neighbors, k-occurrence) pairwise.
    for from_parallel, from_sequential in zip(parallel, sequential):
        np.testing.assert_array_almost_equal(
            from_parallel, from_sequential, decimal=7)
Esempio n. 3
0
def test_hubness_against_distance(has_self_distances):
    """Vector-based and distance-based hubness estimation must agree.

    The same random data is analysed once via a precomputed Euclidean
    distance matrix and once directly from the vectors.
    """
    np.random.seed(123)
    vectors = np.random.rand(100, 50)
    distances = euclidean_distances(vectors)

    # Distance-based estimation.
    from_distances = Hubness(k=10, metric='precomputed',
                             store_k_occurrence=True,
                             store_k_neighbors=True)
    from_distances.fit(distances)
    skew_d = from_distances.score(has_self_distances=has_self_distances)
    neigh_d = from_distances.k_neighbors
    occ_d = from_distances.k_occurrence

    # Vector-based estimation; the vectors are passed as query data only
    # when self distances are not expected.
    from_vectors = Hubness(k=10, metric='euclidean',
                           store_k_neighbors=True,
                           store_k_occurrence=True,
                           verbose=1)
    from_vectors.fit(vectors)
    query = None if has_self_distances else vectors
    skew_v = from_vectors.score(query)
    neigh_v = from_vectors.k_neighbors
    occ_v = from_vectors.k_occurrence

    np.testing.assert_allclose(skew_d, skew_v, atol=1e-7)
    np.testing.assert_array_equal(neigh_d, neigh_v)
    np.testing.assert_array_equal(occ_d, occ_v)
Esempio n. 4
0
def test_handle_negative_neighbor_indices():
    """Scoring must not raise when the neighbor lookup yields index -1.

    The estimator's ``_k_neighbors`` is replaced by a stub that returns the
    pair ``(0, -1)`` for every query row; the test only checks that
    ``score`` completes.
    """
    def mock_kneighbors(X_test):
        # One (0, -1) integer row per query object.
        return np.tile(np.array([0, -1], dtype=int), (len(X_test), 1))

    X, _ = make_classification(n_samples=10)
    train, query = X[:8, :], X[8:, :]

    hub = Hubness()
    hub.fit(train)
    hub._k_neighbors = mock_kneighbors
    hub.score(query)
Esempio n. 5
0
def test_sparse_equal_dense(verbose, shuffle_equal):
    """The same distances in dense and CSR form must give the same skewness.

    A single estimator is fitted and scored twice, first on the dense matrix
    and then on its sparse conversion.
    """
    X, _ = make_classification()
    dense = euclidean_distances(X)
    sparse = csr_matrix(dense)

    estimator = Hubness(metric='precomputed',
                        shuffle_equal=shuffle_equal,
                        verbose=verbose)

    skews = []
    for distances in (dense, sparse):
        estimator.fit(distances)
        skews.append(estimator.score(has_self_distances=True))

    skew_dense, skew_sparse = skews
    np.testing.assert_almost_equal(skew_dense, skew_sparse)
Esempio n. 6
0
def test_hubness(verbose):
    """Compare the hubness score on DIST with a spreadsheet-derived value."""
    # Reference skewness (calculated with bias, per the original note).
    # NOTE(review): the original note said "S_k=5" although k=2 is used
    # below -- confirm which is intended.
    expected = -0.2561204163
    estimator = Hubness(k=2, metric='precomputed', verbose=verbose)
    estimator.fit(DIST)
    observed = estimator.score(has_self_distances=True)
    np.testing.assert_almost_equal(observed, expected, decimal=10)
Esempio n. 7
0
def test_hubness_return_values_are_self_consistent(n_samples, n_features, k,
                                                   seed):
    """Test that the three returned values fit together.

    Skewness, stored neighbors and stored k-occurrence of one fitted
    estimator are cross-checked against independent recomputations.

    Parameters
    ----------
    n_samples, n_features : int
        Shape of the random data matrix.
    k : int
        Neighborhood size passed to ``Hubness``. Previously this argument was
        overwritten with the constant 10, so other values were never tested.
    seed : int
        Seed for NumPy's global random generator.
    """
    np.random.seed(seed)
    vectors = 99. * (np.random.rand(n_samples, n_features) - 0.5)
    hub = Hubness(k=k,
                  metric='euclidean',
                  store_k_neighbors=True,
                  store_k_occurrence=True)
    hub.fit(vectors)
    skew = hub.score()
    neigh = hub.k_neighbors
    occ = hub.k_occurrence
    # Neighbors are just checked for correct shape
    assert neigh.shape == (n_samples, k)
    # Count k-occurrence (different method than in module)
    neigh = neigh.ravel()
    occ_true = np.zeros(n_samples, dtype=int)
    for i in range(n_samples):
        occ_true[i] = (neigh == i).sum()
    np.testing.assert_array_equal(occ, occ_true)
    # Calculate skewness (different method than in module):
    # third central moment over the 1.5th power of the (biased) variance.
    x0 = occ - occ.mean()
    s2 = (x0**2).mean()
    m3 = (x0**3).mean()
    skew_true = m3 / (s2**1.5)
    np.testing.assert_equal(skew, skew_true)
Esempio n. 8
0
def test_return_k_neighbors(store_k_neighbors):
    """Requesting 'k_neighbors' without storing them must warn on fit.

    With ``store_k_neighbors`` falsy, ``fit`` is expected to emit a
    ``UserWarning``; otherwise it is called plainly. ``score`` must run in
    both cases.
    """
    X, _ = make_classification()
    hub = Hubness(return_value='k_neighbors',
                  store_k_neighbors=store_k_neighbors)
    if not store_k_neighbors:
        with pytest.warns(UserWarning):
            hub.fit(X)
    else:
        hub.fit(X)
    hub.score()
Esempio n. 9
0
def test_atkinson():
    """The Atkinson index at eps=.999 must be close to its value at eps=1."""
    X, _ = make_classification(random_state=123)
    hub = Hubness(return_value='k_occurrence').fit(X)
    k_occ = hub.score()

    # Evaluate just below and exactly at eps = 1.
    near_one, at_one = (hub._calc_atkinson_index(k_occ, eps=eps)
                        for eps in (.999, 1))

    np.testing.assert_almost_equal(near_one, at_one, decimal=3)
Esempio n. 10
0
def test_return_k_occurrence(store_k_occurrence):
    """Requesting 'k_occurrence' without storing it must warn on fit.

    With ``store_k_occurrence`` falsy, ``fit`` is expected to emit a
    ``UserWarning``; otherwise it is called plainly. ``score`` must run in
    both cases.
    """
    X, _ = make_classification()
    hub = Hubness(return_value='k_occurrence',
                  store_k_occurrence=store_k_occurrence)
    if not store_k_occurrence:
        with pytest.warns(UserWarning):
            hub.fit(X)
    else:
        hub.fit(X)
    hub.score()
Esempio n. 11
0
def test_all_params_none():
    """Fitting and scoring must work when every parameter is passed as None."""
    X, _ = make_classification()
    # Every constructor argument is explicitly set to None.
    all_none = dict.fromkeys(
        ('k', 'return_value', 'hub_size', 'metric',
         'store_k_neighbors', 'store_k_occurrence',
         'algorithm_params',
         'hubness', 'hubness_params',
         'verbose', 'n_jobs', 'random_state',
         'shuffle_equal'),
        None,
    )
    hub = Hubness(**all_none)
    hub.fit(X)
    hub.score()
Esempio n. 12
0
def test_limiting_factor():
    """All Gini index code paths must return exactly the same value.

    ``_calc_gini_index`` is called with ``limiting`` set to 'space', 'time'
    and None on the same k-occurrence.
    """
    X, _ = make_classification()
    hub = Hubness(store_k_occurrence=True, return_value='k_occurrence')
    hub.fit(X)
    k_occ = hub.score()

    by_space, by_time, naive = [
        hub._calc_gini_index(k_occ, limiting=mode)
        for mode in ('space', 'time', None)
    ]

    assert by_space == by_time and by_time == naive
Esempio n. 13
0
def test_all_but_gini():
    """'all_but_gini' must report every individual measure except 'gini'."""
    X, _ = make_classification()
    hub = Hubness(store_k_occurrence=True, return_value='all_but_gini',
                  store_k_neighbors=True)
    hub.fit(X)
    measures = hub.score()

    # The two collective selectors are not measures themselves.
    collective = ('all', 'all_but_gini')
    for measure in VALID_HUBNESS_MEASURES:
        if measure in collective:
            continue
        should_be_present = measure != 'gini'
        assert (measure in measures) is should_be_present
Esempio n. 14
0
def test_hubness_independent_on_data_set_size(hubness_measure):
    """New measures should pass, traditional skewness should fail.

    Random subsamples of 1000, 2000 and 3000 objects are drawn from one
    uniform data set. For every measure except 'k_skewness', the values for
    consecutive sizes (and for the smallest vs. largest size) must agree
    within a relative tolerance of 0.2. For 'k_skewness' the test asserts
    the opposite: the values must differ by more than a relative tolerance
    of 0.1.

    Parameters
    ----------
    hubness_measure : str
        Key into the dict returned by ``score`` with ``return_value='all'``;
        one of the keys of ``attribute_of`` below.
    """
    # Estimator attribute that holds each measure after scoring.
    attribute_of = {
        'k_skewness': 'k_skewness',
        'k_skewness_truncnorm': 'k_skewness_truncnorm',
        'robinhood': 'robinhood_index',
        'gini': 'gini_index',
        'atkinson': 'atkinson_index',
    }
    attribute = attribute_of[hubness_measure]

    thousands = 3
    n_objects = thousands * 1_000
    rng = np.random.RandomState(1247)
    X = rng.rand(n_objects, 128)
    N_SAMPLES_LIST = np.arange(1, thousands + 1) * 1_000
    value = np.empty(N_SAMPLES_LIST.size)
    for i, n_samples in enumerate(N_SAMPLES_LIST):
        ind = rng.permutation(n_objects)[:n_samples]
        X_sample = X[ind, :]
        hub = Hubness(return_value='all')
        hub.fit(X_sample)
        measures = hub.score()
        value[i] = getattr(hub, attribute)
        # The attribute and the returned dict entry must be identical.
        assert value[i] == measures[hubness_measure]
        if i > 0:
            if hubness_measure == 'k_skewness':
                # Skewness is expected to change with data set size.
                with np.testing.assert_raises(AssertionError):
                    np.testing.assert_allclose(value[i],
                                               value[i - 1],
                                               rtol=0.1)
            else:
                np.testing.assert_allclose(
                    value[i],
                    value[i - 1],
                    rtol=2e-1,
                    err_msg=(f'Hubness measure is too dependent on data set '
                             f'size with S({N_SAMPLES_LIST[i]}) = {value[i]} '
                             f'and S({N_SAMPLES_LIST[i-1]}) = {value[i - 1]}.'))
    # Finally compare the largest against the smallest sample.
    if hubness_measure == 'k_skewness':
        with np.testing.assert_raises(AssertionError):
            np.testing.assert_allclose(value[-1], value[0], rtol=0.1)
    else:
        np.testing.assert_allclose(value[-1], value[0], rtol=2e-1)
Esempio n. 15
0
def hub_eval(input_points, methods=None, k=k):
    """Score ``input_points`` once per hubness-reduction configuration.

    Parameters
    ----------
    input_points
        Data passed to ``Hubness.fit``.
    methods : dict, optional
        Maps a label to a ``(hubness, hubness_params)`` pair forwarded to
        ``Hubness``. When omitted (or None), a fixed set of five
        configurations is used: no reduction, 'mp' with method 'normal',
        'ls', 'ls' with method 'nicdm', and 'dsl'.
    k
        Neighborhood size forwarded to ``Hubness``. The default is the value
        of the module-level ``k`` at the time this function is defined.

    Returns
    -------
    list
        One ``Hubness.score()`` result per entry of ``methods``, in the
        dict's iteration order.
    """
    if methods is None:
        # Built per call so that no mutable default is shared across calls.
        methods = {
            'nothing': (None, None),
            'mp_normal': ('mp', {
                'method': 'normal'
            }),
            'ls': ('ls', None),
            'ls_nicdm': ('ls', {
                'method': 'nicdm'
            }),
            'dsl': ('dsl', None)
        }
    skewness = []
    # Only the configuration pairs are needed; the labels are not used.
    for hubness, hubness_params in tqdm(methods.values()):
        hub = Hubness(k=k, hubness=hubness, hubness_params=hubness_params)
        hub.fit(input_points)
        skewness.append(hub.score())
    return skewness
Esempio n. 16
0
def measure_hubness(n_tracks, output_file, metric, projection, dimensions, n_jobs, random):
    """Compute hubness measures of track embeddings and write them to CSV.

    For every model and every dimensionality from 2 up to ``dimensions``
    (inclusive), the stacked embeddings are analysed with
    ``Hubness(k=10, return_value='all')``. Scores whose key is in
    ``RETURN_VALUES`` are kept, tagged with model, layer and dimensionality,
    and the collected rows are written to ``output_file``.

    ``projection`` selects the models: None uses ``get_combinations()``,
    anything else is passed to ``get_offline_projections``. ``n_tracks`` and
    ``random`` are forwarded to ``Track.get_all``; ``metric`` and ``n_jobs``
    are forwarded to ``Hubness``.
    """
    from skhubness import Hubness

    tracks = Track.get_all(limit=n_tracks, random=random)

    models = get_models()
    if projection is None:
        models_iter = models.get_combinations()
    else:
        models_iter = models.get_offline_projections(projection)

    rows = []
    for model in list(models_iter):
        for n_dims in tqdm(range(2, dimensions + 1), desc=str(model)):
            stacked = np.vstack(
                model.get_embeddings(tracks, dimensions=slice(n_dims)))

            hub = Hubness(k=10, metric=metric, return_value='all', n_jobs=n_jobs)
            hub.fit(stacked[:, :n_dims])

            # Keep only the requested measures, then add identifying columns.
            row = {name: score
                   for name, score in hub.score().items()
                   if name in RETURN_VALUES}
            row['model'] = f'{model.dataset}-{model.architecture}'
            row['layer'] = model.layer
            row['dimensions'] = n_dims
            rows.append(row)

    pd.DataFrame(rows).to_csv(output_file, float_format=FLOAT_FORMAT)
Esempio n. 17
0
from skhubness.neighbors import KNeighborsClassifier

# Fetch data and have a look
d = olivetti_faces.fetch_olivetti_faces()
X, y = d['data'], d['target']
print(f'Data shape: {X.shape}')
print(f'Label shape: {y.shape}')
# Recorded shapes from a previous run:
# (400, 4096)
# (400,)

# The data is embedded in a high-dimensional space.
# Is there hubness, and can we reduce it?
# Score the data once without hubness reduction (None) and once per
# reduction method, always using 10-nearest-neighbor skewness.
for hubness in [None, 'dsl', 'ls', 'mp']:
    hub = Hubness(k=10, hubness=hubness, return_value='k_skewness')
    hub.fit(X)
    score = hub.score()
    print(f'Hubness (10-skew): {score:.3f} with hubness reduction: {hubness}')
# Recorded output from a previous run:
# Hubness (10-skew): 1.972 with hubness reduction: None
# Hubness (10-skew): 1.526 with hubness reduction: dsl
# Hubness (10-skew): 0.943 with hubness reduction: ls
# Hubness (10-skew): 0.184 with hubness reduction: mp

# There is some hubness, and all hubness reduction methods can reduce it (to varying degrees).
# Let's assess the best kNN strategy and its estimated performance.
# Two independently seeded, shuffled 5-fold stratified splitters.
# NOTE(review): the names suggest an outer loop for performance estimation
# and an inner loop for model selection; their use is outside this excerpt.
cv_perf = StratifiedKFold(n_splits=5, shuffle=True, random_state=7263)
cv_select = StratifiedKFold(n_splits=5, shuffle=True, random_state=32634)

# skhubness classifier configured with 100 candidates via algorithm_params.
knn = KNeighborsClassifier(algorithm_params={'n_candidates': 100})

# specify parameters and distributions to sample from
param_dist = {"n_neighbors": np.arange(1, 26),
Esempio n. 18
0
# d_mle = hub_toolbox.intrinsic_dimension.intrinsic_dimension(vectors)
# # vectors = vectors[:10000, :]
# # d_mle = hub_toolbox.intrinsic_dimension.intrinsic_dimension(vectors)
# # vectors = mnist.data
# # vectors = vectors[:10000, :]
# # d_mle = hub_toolbox.intrinsic_dimension.intrinsic_dimension(vectors)
# D = euclidean_distance(vectors)
#
# S_k, _, _ = hub_toolbox.hubness.hubness(D=D, k=5, metric='distance')
# D_mp = hub_toolbox.global_scaling.mutual_proximity_empiric(
#         D=D, metric='distance')
# S_k_mp, _, _ = hub_toolbox.hubness.hubness(D=D_mp, k=5, metric='distance')
#
# print(S_k, S_k_mp)

from skhubness.data import load_dexter

# Load data and labels through skhubness's load_dexter helper.
X, y = load_dexter()

from skhubness import Hubness
# Estimate hubness with k=10 neighbors and the cosine metric.
hub = Hubness(k=10, metric='cosine')
hub.fit(X)
# The score is formatted as a single float below.
k_skew = hub.score()
print(f'Skewness = {k_skew:.3f}')

from skhubness.neighbors import kneighbors_graph
k = 5
# Variant with mutual-proximity hubness reduction, kept for reference:
# neigbor_graph = kneighbors_graph(X, n_neighbors=k, hubness='mutual_proximity')
# NOTE(review): `neigbor_graph` is misspelled ("neighbor"); rename once it is
# confirmed that no code outside this excerpt uses the current name.
neigbor_graph = kneighbors_graph(X, n_neighbors=k, hubness=None)
# Turn the flat index array into one row of k neighbor indices per sample.
# Assumes the graph stores exactly k entries per row -- TODO confirm.
neighbor_matrix = neigbor_graph.indices.reshape((X.shape[0], k))
print(neighbor_matrix)