def test_load_dexter():
    """Check that ``load_dexter`` returns ``X`` and ``y`` with the expected shapes."""
    n_samples = 300
    n_features = 20_000
    expected_X_shape = (n_samples, n_features)
    expected_y_shape = (n_samples,)

    X, y = load_dexter()

    assert X.shape == expected_X_shape, \
        f'Wrong shape: X.shape = {X.shape}, should be (300, 20_000).'
    assert y.shape == expected_y_shape, \
        f'Wrong shape: y.shape = {y.shape}, should be (300, ).'
def dexter():
    """Load the dexter data, remap the label ``-1`` to ``0`` and return ``(X, y)``.

    The raw data, the converted labels and the per-label counts are printed
    along the way.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` is returned exactly as loaded and ``y`` is an
        ``np.int64`` copy of the loaded labels with every ``-1`` replaced by ``0``.
    """
    features, raw_labels = load_dexter()
    print(features)
    print(raw_labels)

    # np.bincount only accepts non-negative integers, so cast first and
    # then move the -1 label to 0.
    labels = raw_labels.astype(np.int64)
    is_negative = labels == -1
    labels[is_negative] = 0
    print(labels)
    print('Dexter bin counts', np.bincount(labels))

    # TODO:
    # pd.to_csv()
    return features, labels
def test_neighbors_dexter(hubness_param, metric):
    """Check that hubness reduction lowers the hubness score on dexter.

    Parameters
    ----------
    hubness_param
        Pair ``(hubness, param)``; both are forwarded to ``kneighbors_graph``
        as ``hubness`` and ``hubness_params`` respectively.
    metric
        Metric passed to the baseline ``Hubness`` estimator and to
        ``kneighbors_graph``.

    The test fails unless the score computed on the hubness-reduced neighbor
    graph is below 8/10 of the score computed on the original data.
    """
    hubness, param = hubness_param
    X, _ = load_dexter()
    n_neighbors = 10

    # Baseline: hubness measured directly on the original data.
    baseline = Hubness(k=n_neighbors, metric=metric)
    baseline.fit(X)
    k_skew_orig = baseline.score()

    # Hubness measured on the secondary distances (after hubness reduction).
    graph = kneighbors_graph(
        X,
        n_neighbors=n_neighbors,
        metric=metric,
        hubness=hubness,
        hubness_params=param,
    )
    reduced = Hubness(k=n_neighbors, metric='precomputed')
    reduced.fit(graph)
    k_skew_hr = reduced.score(has_self_distances=True)

    threshold = k_skew_orig * 8 / 10
    assert k_skew_hr < threshold, (
        f'k-occurrence skewness was not reduced by at least 20% for dexter with {hubness}'
    )
# Scratch script: measure hubness on the dexter data and print its
# k-nearest-neighbor index matrix.
from skhubness import Hubness
from skhubness.data import load_dexter
from skhubness.neighbors import kneighbors_graph

# Earlier hub_toolbox-based experiment, kept commented out for reference.
# d_mle = hub_toolbox.intrinsic_dimension.intrinsic_dimension(vectors)
#
# vectors = vectors[:10000, :]
#
# d_mle = hub_toolbox.intrinsic_dimension.intrinsic_dimension(vectors)
#
# vectors = mnist.data
#
# vectors = vectors[:10000, :]
#
# d_mle = hub_toolbox.intrinsic_dimension.intrinsic_dimension(vectors)
# D = euclidean_distance(vectors)
#
# S_k, _, _ = hub_toolbox.hubness.hubness(D=D, k=5, metric='distance')
# D_mp = hub_toolbox.global_scaling.mutual_proximity_empiric(
#     D=D, metric='distance')
# S_k_mp, _, _ = hub_toolbox.hubness.hubness(D=D_mp, k=5, metric='distance')
#
# print(S_k, S_k_mp)

X, y = load_dexter()

# Hubness score of the raw data under the cosine metric.
hub = Hubness(k=10, metric='cosine')
hub.fit(X)
k_skew = hub.score()
print(f'Skewness = {k_skew:.3f}')

# Neighbor graph without hubness reduction; the alternative call with
# mutual proximity is kept below for quick switching.
k = 5
# neighbor_graph = kneighbors_graph(X, n_neighbors=k, hubness='mutual_proximity')
neighbor_graph = kneighbors_graph(X, n_neighbors=k, hubness=None)

# NOTE(review): the reshape assumes the graph stores exactly k indices per
# sample -- confirm against kneighbors_graph's output format.
n_rows = X.shape[0]
neighbor_matrix = neighbor_graph.indices.reshape((n_rows, k))
print(neighbor_matrix)