def test_kernel_density_sampling(n_samples=100, n_features=3):
    """Check ``KernelDensity.sample`` for supported and unsupported kernels.

    For the 'gaussian' and 'tophat' kernels, samples are drawn from a model
    fitted on random data, and each sample must lie close to its nearest
    training point. All other kernels listed below must raise
    ``NotImplementedError`` when sampling. Finally, a non-regression check
    ensures that ``sample()`` without arguments returns a 2-d array.

    Parameters
    ----------
    n_samples : int, default=100
        Number of training points, and number of samples drawn per kernel.
    n_features : int, default=3
        Dimensionality of the random training data.
    """
    rng = np.random.RandomState(0)
    X = rng.randn(n_samples, n_features)

    bandwidth = 0.2

    for kernel in ['gaussian', 'tophat']:
        # Draw as many samples as there are training points, so that the
        # shape comparison holds for any ``n_samples``. Seeding with ``rng``
        # keeps the range check below deterministic.
        kde = KernelDensity(bandwidth=bandwidth, kernel=kernel).fit(X)
        samp = kde.sample(n_samples, random_state=rng)
        assert X.shape == samp.shape

        # Check that samples are in the right range: query the index built
        # on the training data with the *samples*. Querying with X itself
        # would only yield zero self-distances and check nothing.
        nbrs = NearestNeighbors(n_neighbors=1).fit(X)
        dist, _ = nbrs.kneighbors(samp, return_distance=True)

        if kernel == 'tophat':
            assert np.all(dist < bandwidth)
        elif kernel == 'gaussian':
            # 5 standard deviations is safe for the default 100 samples.
            assert np.all(dist < 5 * bandwidth)

    # check unsupported kernels
    for kernel in ['epanechnikov', 'exponential', 'linear', 'cosine']:
        kde = KernelDensity(bandwidth=bandwidth, kernel=kernel).fit(X)
        assert_raises(NotImplementedError, kde.sample, 100)

    # non-regression test: used to return a scalar
    X = rng.randn(4, 1)
    kde = KernelDensity(kernel="gaussian").fit(X)
    assert kde.sample().shape == (1, 1)
def test_invalid_method(method):
    """Fitting ``MutualProximity`` with the given ``method`` must raise ValueError.

    ``method`` is presumably supplied by a test parametrization that is not
    visible in this block.
    """
    data, labels = make_classification(n_samples=10)

    # Neighbor graph of the data set itself (no query points passed).
    index = NearestNeighbors()
    index.fit(data, labels)
    distances, indices = index.kneighbors()

    reducer = MutualProximity(method=method)
    with assert_raises(ValueError):
        reducer.fit(distances, indices, data, assume_sorted=True)
def test_mp_runs_without_error(method, verbose):
    """Smoke test: ``MutualProximity`` fit and transform complete without raising.

    ``method`` and ``verbose`` are forwarded unchanged to the constructor;
    the transformed result is discarded.
    """
    data, labels = make_classification(n_samples=20, n_features=10)

    index = NearestNeighbors()
    index.fit(data, labels)
    distances, indices = index.kneighbors()

    reducer = MutualProximity(method=method, verbose=verbose)
    # Transform is invoked on whatever ``fit`` returns, as a chained call would.
    fitted = reducer.fit(distances, indices, data, assume_sorted=True)
    fitted.transform(distances, indices, data, assume_sorted=True)
def test_correct_mp_empiric():
    """Compare empiric ``MutualProximity`` against a naive loop-based reference.

    The distances and indices returned by ``MutualProximity.transform`` for
    the test queries must match the values computed below with plain Python
    loops and sets.
    """
    features, labels = make_classification(
        n_samples=120,
        n_features=10,
        random_state=1234,
    )
    X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=20)

    nn = NearestNeighbors(n_neighbors=20)
    nn.fit(X_train, y_train)
    neigh_dist_train, neigh_ind_train = nn.kneighbors()
    neigh_dist_test, neigh_ind_test = nn.kneighbors(X_test)

    # Mutual proximity via the library implementation
    mp = MutualProximity(method='empiric')
    mp.fit(neigh_dist_train, neigh_ind_train, X=None, assume_sorted=True)
    mp_dist_test, mp_ind_test = mp.transform(
        neigh_dist_test, neigh_ind_test, X=None, assume_sorted=True,
    )

    # Mutual proximity via slow, naive loops (reference values)
    expected_dist = np.empty_like(neigh_dist_test, dtype=float)
    expected_ind = np.empty_like(neigh_ind_test, dtype=int)
    n_queries, n_candidates = neigh_ind_test.shape

    for q in range(n_queries):
        candidates = neigh_ind_test[q, :]
        candidate_dists = neigh_dist_test[q, :]

        for c in range(n_candidates):
            idx = neigh_ind_test[q, c]
            d_xy = neigh_dist_test[q, c]

            # P(X > d_xy): neighbors j of the query whose distance to the
            # query exceeds the distance between the query and candidate c.
            farther_from_query = {
                j for j, d_xj in zip(candidates, candidate_dists) if d_xj > d_xy
            }

            # P(Y > d_yx): neighbors j of the query whose distance to the
            # indexed object ``idx`` exceeds d_xy.
            farther_from_candidate = set()
            for j in candidates:
                pos = np.argwhere(neigh_ind_train[idx] == j).ravel()
                if pos.size:
                    d_yj = neigh_dist_train[idx, pos]
                else:
                    # Not all pairwise distances between indexed objects are
                    # stored. Distances to non-neighbors are approximated by
                    # the distance to the k-th neighbor plus some epsilon.
                    d_yj = neigh_dist_train[idx, -1] + 1e-6
                if d_yj > d_xy:
                    farther_from_candidate.add(j)

            n_both = len(farther_from_query & farther_from_candidate)
            expected_dist[q, c] = 1 - (n_both / n_candidates)
            expected_ind[q, c] = idx

    np.testing.assert_array_almost_equal(mp_dist_test, expected_dist)
    np.testing.assert_array_equal(mp_ind_test, expected_ind)
def test_invalid_method(method):
    """``LocalScaling.transform`` with the given ``method`` must raise ValueError.

    Fitting happens outside the ``assert_raises`` context, so the error is
    expected from ``transform`` rather than from ``fit``. ``method`` is
    presumably supplied by a test parametrization not visible in this block.
    """
    data, labels = make_classification(n_samples=10)

    index = NearestNeighbors()
    index.fit(data, labels)
    distances, indices = index.kneighbors()

    scaler = LocalScaling(method=method)
    scaler.fit(distances, indices, data, assume_sorted=True)

    with assert_raises(ValueError):
        scaler.transform(distances, indices, data, assume_sorted=True)
def test_sparse_and_hubness_reduction_disables_hr_and_warns(hr):
    """Fitting on sparse data with hubness reduction ``hr`` warns and still works.

    ``fit`` must emit a ``UserWarning`` matching the message below, and the
    subsequent 1-nearest-neighbor query must return the expected indices.
    """
    sparse_data = csr_matrix([[0, 0], [0, 1], [0, 3]])
    expected_nn = [1, 0, 1]
    expected_warning = 'cannot use hubness reduction with sparse data: disabling hubness reduction.'

    nn = NearestNeighbors(
        n_neighbors=1,
        hubness=hr,
        algorithm_params={'n_candidates': 1},
    )
    with pytest.warns(UserWarning, match=expected_warning):
        nn.fit(sparse_data)

    predicted_nn = nn.kneighbors(n_neighbors=1, return_distance=False)
    np.testing.assert_array_equal(expected_nn, predicted_nn.ravel())
def test_snn(method):
    """The hubness reduction class ``method`` is expected to be unimplemented.

    Instantiating ``method`` must work, ``fit`` must raise
    ``NotImplementedError``, and ``transform`` must then raise
    ``NotFittedError``.
    """
    data, labels = make_classification()

    index = NearestNeighbors()
    index.fit(data, labels)
    distances, indices = index.kneighbors()

    reducer = method()

    with assert_raises(NotImplementedError):
        reducer.fit(distances, indices, data, assume_sorted=True)

    with assert_raises(NotFittedError):
        reducer.transform(distances, indices, data, assume_sorted=True)
def _k_neighbors(self, X_test: np.ndarray = None, X_train: np.ndarray = None) -> np.ndarray:
    """Return indices of nearest neighbors in X_train for each vector in X_test.

    A new ``NearestNeighbors`` index is built from ``X_train`` on every call,
    configured from this object's ``k``, ``metric``, ``algorithm``,
    ``algorithm_params``, ``hubness`` and ``hubness_params`` attributes.

    Parameters
    ----------
    X_test : np.ndarray, optional
        Query vectors. If None, self distances are ignored.
    X_train : np.ndarray
        Vectors to index.

    Returns
    -------
    np.ndarray
        Neighbor indices as returned by ``NearestNeighbors.kneighbors`` with
        ``return_distance=False``.
    """
    nn = NearestNeighbors(
        n_neighbors=self.k,
        metric=self.metric,
        algorithm=self.algorithm,
        algorithm_params=self.algorithm_params,
        hubness=self.hubness,
        hubness_params=self.hubness_params,
    )
    nn.fit(X_train)
    # if X_test is None, self distances are ignored
    return nn.kneighbors(X_test, return_distance=False)
def test_fit_sorted(method, verbose):
    """``LocalScaling`` must give the same result with and without ``assume_sorted``.

    The same neighbor graph is fitted and transformed twice, once with
    ``assume_sorted=True`` and once with ``assume_sorted=False``; distances
    must agree approximately and indices exactly.
    """
    data, labels = make_classification()

    index = NearestNeighbors()
    index.fit(data, labels)
    distances, indices = index.kneighbors()

    scaler = LocalScaling(method=method, verbose=verbose)

    # Transform is invoked on whatever ``fit`` returns, as a chained call would.
    fitted = scaler.fit(distances, indices, data, assume_sorted=True)
    dist_sorted, ind_sorted = fitted.transform(distances, indices, data, assume_sorted=True)

    fitted = scaler.fit(distances, indices, data, assume_sorted=False)
    dist_unsorted, ind_unsorted = fitted.transform(distances, indices, data, assume_sorted=False)

    assert_array_almost_equal(dist_sorted, dist_unsorted)
    assert_array_equal(ind_sorted, ind_unsorted)
def test_same_indices():
    """``NoHubnessReduction`` must return the neighbor indices unchanged.

    The indices from ``fit_transform`` must equal the input indices, whether
    or not distances are requested alongside them.
    """
    data, labels = make_classification()

    index = NearestNeighbors()
    index.fit(data, labels)
    distances, indices = index.kneighbors()

    reducer = NoHubnessReduction()
    _, indices_with_dist = reducer.fit_transform(
        distances, indices, data, return_distance=True,
    )
    indices_only = reducer.fit_transform(
        distances, indices, data, return_distance=False,
    )

    assert_array_equal(indices, indices_with_dist)
    assert_array_equal(indices_with_dist, indices_only)
# Create the nearest neighbor index
nn_plain = NearestNeighbors(
    n_neighbors=100,
    algorithm='nng',
    algorithm_params={
        'n_candidates': 1_000,
        'index_dir': 'auto',
        'n_jobs': 8,
    },
    verbose=2,
)
nn_plain.fit(X_train)

# Note that NNG must save its index. By setting `index_dir='auto'`,
# NNG will try to save it to shared memory, if available, otherwise to $TMP.
# This index is NOT removed automatically, as one will typically want to
# build an index once and use it often.

# Retrieve nearest neighbors for each test object
neigh_pred_plain = nn_plain.kneighbors(X_test,
                                       n_neighbors=100,
                                       return_distance=False)

# Calculate the recall per test object.
# Each query's true neighbors are intersected with the predictions for that
# same query only. `np.intersect1d` flattens its inputs, so passing the whole
# prediction matrix would count hits among the neighbors of all queries.
recalled_plain = [np.intersect1d(true_i, pred_i)
                  for true_i, pred_i in zip(neigh_true, neigh_pred_plain)]
recall_plain = np.array([hits.size / neigh_true.shape[1]
                         for hits in recalled_plain])

# Statistics
print(f'Mean = {recall_plain.mean():.4f}, '
      f'stdev = {recall_plain.std():.4f}')

# ANN with HUBNESS REDUCTION
# Here we set `n_candidates=1000`, so that for each query,