Example #1
0
def test_kernel_density_sampling(n_samples=100, n_features=3):
    """Check that ``KernelDensity.sample`` draws plausible, well-shaped samples.

    For the kernels that support sampling ('gaussian', 'tophat') the drawn
    samples must have the same shape as the training data and must lie close
    to at least one training point.  Kernels without sampling support must
    raise ``NotImplementedError``, and a default ``sample()`` call must return
    a 2-D array rather than a scalar.

    Parameters
    ----------
    n_samples : int
        Number of training points, and number of samples drawn per kernel.
    n_features : int
        Dimensionality of the training points.
    """
    rng = np.random.RandomState(0)
    X = rng.randn(n_samples, n_features)

    bandwidth = 0.2

    for kernel in ['gaussian', 'tophat']:
        # Pass bandwidth by keyword: newer scikit-learn releases reject
        # positional constructor arguments.
        kde = KernelDensity(bandwidth=bandwidth, kernel=kernel).fit(X)
        # Draw as many samples as training points so the shape check holds
        # for any n_samples, not only the default of 100.
        samp = kde.sample(n_samples)
        assert X.shape == samp.shape

        # check that samples are in the right range: query the index with the
        # drawn samples (querying with X itself would only measure the
        # distance of each training point to the training set).
        nbrs = NearestNeighbors(n_neighbors=1).fit(X)
        dist, _ = nbrs.kneighbors(samp, return_distance=True)

        if kernel == 'tophat':
            assert np.all(dist < bandwidth)
        elif kernel == 'gaussian':
            # 5 standard deviations is safe for ~100 samples, but there's a
            # very small chance this test could fail.
            assert np.all(dist < 5 * bandwidth)

    # check unsupported kernels
    for kernel in ['epanechnikov', 'exponential', 'linear', 'cosine']:
        kde = KernelDensity(bandwidth=bandwidth, kernel=kernel).fit(X)
        assert_raises(NotImplementedError, kde.sample, 100)

    # non-regression test: used to return a scalar
    X = rng.randn(4, 1)
    kde = KernelDensity(kernel="gaussian").fit(X)
    assert kde.sample().shape == (1, 1)
Example #2
0
def test_invalid_method(method):
    """Fitting ``MutualProximity`` with the given ``method`` must raise ``ValueError``."""
    data, labels = make_classification(n_samples=10)

    # Build the neighbor graph that serves as input to hubness reduction.
    index = NearestNeighbors()
    index.fit(data, labels)
    distances, indices = index.kneighbors()

    reducer = MutualProximity(method=method)
    with assert_raises(ValueError):
        reducer.fit(distances, indices, data, assume_sorted=True)
Example #3
0
def test_mp_runs_without_error(method, verbose):
    """Smoke test: ``MutualProximity`` fit followed by transform must not raise."""
    data, labels = make_classification(n_samples=20, n_features=10)

    # Build the neighbor graph that serves as input to hubness reduction.
    index = NearestNeighbors()
    index.fit(data, labels)
    distances, indices = index.kneighbors()

    reducer = MutualProximity(method=method, verbose=verbose)
    fitted = reducer.fit(distances, indices, data, assume_sorted=True)
    # The result is intentionally discarded; only successful completion matters.
    fitted.transform(distances, indices, data, assume_sorted=True)
Example #4
0
def test_correct_mp_empiric():
    """Compare ``MutualProximity(method='empiric')`` against a naive reference.

    A neighbor graph is built on a train split and queried with a test split.
    The distances returned by ``MutualProximity.transform`` are then compared
    with values recomputed by explicit Python loops over every (query,
    neighbor) pair; the neighbor indices are expected to come back unchanged.
    """
    X, y = make_classification(n_samples=120, n_features=10, random_state=1234, )
    # NOTE(review): the split is not seeded, so the train/test partition
    # differs between runs.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=20)
    nn = NearestNeighbors(n_neighbors=20)
    nn.fit(X_train, y_train)
    neigh_dist_train, neigh_ind_train = nn.kneighbors()
    neigh_dist_test, neigh_ind_test = nn.kneighbors(X_test)

    # Calculate MP with fast vectorized routines
    mp = MutualProximity(method='empiric')
    mp.fit(neigh_dist_train, neigh_ind_train, X=None, assume_sorted=True)
    mp_dist_test, mp_ind_test = mp.transform(neigh_dist_test, neigh_ind_test, X=None, assume_sorted=True)

    # Calculate MP in slow, naive loops
    mp_dist_test_correct = np.empty_like(neigh_dist_test, dtype=float)
    mp_ind_test_correct = np.empty_like(neigh_ind_test, dtype=int)
    # NOTE(review): `n_train` is the second dimension of the test neighbor
    # array, i.e. presumably the number of neighbors per query (n_neighbors=20
    # above), not the number of training objects -- confirm.
    n_test, n_train = neigh_ind_test.shape

    # Loop over all test distances
    # (the loop variable `y` rebinds the label array `y` from above, which is
    # no longer needed at this point)
    for x in range(n_test):
        for y in range(n_train):
            idx = neigh_ind_test[x, y]
            d_xy = neigh_dist_test[x, y]
            set1 = set()
            set2 = set()
            # P(X > d_xy), i.e. how many distances from query x to indexed objects j
            # are greater than distance between x and y?
            for j, d_xj in zip(neigh_ind_test[x, :], neigh_dist_test[x, :]):
                if d_xj > d_xy:
                    set1.add(j)
            # P(Y > d_yx), i.e. how many distances from indexed object y to other indexed objects j
            # are greater than distance between y and x?
            for j in neigh_ind_test[x, :]:
                # Position(s) of j within the stored neighbor list of idx (empty if j is not a neighbor)
                k = np.argwhere(neigh_ind_train[idx] == j).ravel()
                # Since we don't store all distances between all pairs of indexed objects,
                # this is approximated by setting all distance to not-nearest neighbors
                # to the distance to the k-th neighbor plus some epsilon
                d_yj = neigh_dist_train[idx, k] if k.size else neigh_dist_train[idx, -1] + 1e-6
                if d_yj > d_xy:
                    set2.add(j)
            # Reference value: one minus the fraction of neighbors j that are
            # farther than d_xy from both the query and the indexed object.
            mp_dist_test_correct[x, y] = 1 - (len(set1.intersection(set2)) / n_train)
            mp_ind_test_correct[x, y] = idx
    np.testing.assert_array_almost_equal(mp_dist_test, mp_dist_test_correct)
    np.testing.assert_array_equal(mp_ind_test, mp_ind_test_correct)
def test_invalid_method(method):
    """``LocalScaling.transform`` with the given ``method`` must raise ``ValueError``.

    ``fit`` is called outside the ``assert_raises`` context, so it is expected
    to succeed; only ``transform`` is expected to fail.
    """
    data, labels = make_classification(n_samples=10)

    # Build the neighbor graph that serves as input to hubness reduction.
    index = NearestNeighbors()
    index.fit(data, labels)
    distances, indices = index.kneighbors()

    scaler = LocalScaling(method=method)
    scaler.fit(distances, indices, data, assume_sorted=True)
    with assert_raises(ValueError):
        scaler.transform(distances, indices, data, assume_sorted=True)
Example #6
0
def test_sparse_and_hubness_reduction_disables_hr_and_warns(hr):
    """Fitting on sparse input with hubness reduction ``hr`` must warn and still find neighbors.

    ``fit`` is expected to emit a ``UserWarning`` matching the message below;
    the subsequent 1-nearest-neighbor query must return the expected indices.
    """
    sparse_points = csr_matrix([[0, 0], [0, 1], [0, 3]])
    expected_neighbors = [1, 0, 1]
    expected_warning = 'cannot use hubness reduction with sparse data: disabling hubness reduction.'

    index = NearestNeighbors(
        n_neighbors=1,
        hubness=hr,
        algorithm_params={'n_candidates': 1},
    )
    with pytest.warns(UserWarning, match=expected_warning):
        index.fit(sparse_points)

    found = index.kneighbors(n_neighbors=1, return_distance=False)
    np.testing.assert_array_equal(expected_neighbors, found.ravel())
def test_snn(method):
    """The reducer built by ``method()`` must refuse to fit and report being unfitted.

    ``fit`` is expected to raise ``NotImplementedError`` and a subsequent
    ``transform`` is expected to raise ``NotFittedError``.
    """
    data, labels = make_classification()

    # Build the neighbor graph that serves as input to hubness reduction.
    index = NearestNeighbors()
    index.fit(data, labels)
    distances, indices = index.kneighbors()

    reducer = method()

    with assert_raises(NotImplementedError):
        reducer.fit(distances, indices, data, assume_sorted=True)

    with assert_raises(NotFittedError):
        reducer.transform(distances, indices, data, assume_sorted=True)
Example #8
0
 def _k_neighbors(self,
                  X_test: np.ndarray = None,
                  X_train: np.ndarray = None) -> np.ndarray:
     """ Return indices of nearest neighbors in X_train for each vector in X_test.

     A fresh ``NearestNeighbors`` index is configured from this object's
     ``k``, ``metric``, ``algorithm``, ``algorithm_params``, ``hubness`` and
     ``hubness_params`` attributes, fitted on ``X_train`` and queried with
     ``X_test``.

     Parameters
     ----------
     X_test : np.ndarray, optional
         Query vectors, passed on to ``kneighbors``.
     X_train : np.ndarray, optional
         Vectors to index, passed on to ``fit``.

     Returns
     -------
     np.ndarray
         Neighbor indices as returned by ``kneighbors(..., return_distance=False)``.
     """
     nn = NearestNeighbors(n_neighbors=self.k,
                           metric=self.metric,
                           algorithm=self.algorithm,
                           algorithm_params=self.algorithm_params,
                           hubness=self.hubness,
                           hubness_params=self.hubness_params)
     nn.fit(X_train)
     # if X_test is None, self distances are ignored
     indices = nn.kneighbors(X_test, return_distance=False)
     return indices
def test_fit_sorted(method, verbose):
    """``LocalScaling`` must give the same result with and without ``assume_sorted``."""
    data, labels = make_classification()

    # Build the neighbor graph that serves as input to hubness reduction.
    index = NearestNeighbors()
    index.fit(data, labels)
    distances, indices = index.kneighbors()

    scaler = LocalScaling(method=method, verbose=verbose)

    # Run fit + transform once per flag value, sorted variant first.
    outcome = {}
    for flag in (True, False):
        fitted = scaler.fit(distances, indices, data, assume_sorted=flag)
        outcome[flag] = fitted.transform(distances, indices, data, assume_sorted=flag)
    dist_sorted, ind_sorted = outcome[True]
    dist_unsorted, ind_unsorted = outcome[False]

    assert_array_almost_equal(dist_sorted, dist_unsorted)
    assert_array_equal(ind_sorted, ind_unsorted)
def test_same_indices():
    """``NoHubnessReduction`` must hand back the neighbor indices unchanged.

    The indices are compared for both ``return_distance=True`` and
    ``return_distance=False``.
    """
    data, labels = make_classification()

    # Build the neighbor graph that serves as input to hubness reduction.
    index = NearestNeighbors()
    index.fit(data, labels)
    distances, indices = index.kneighbors()

    passthrough = NoHubnessReduction()
    _, ind_with_dist = passthrough.fit_transform(
        distances, indices, data, return_distance=True)
    ind_only = passthrough.fit_transform(
        distances, indices, data, return_distance=False)

    assert_array_equal(indices, ind_with_dist)
    assert_array_equal(ind_with_dist, ind_only)
Example #11
0
# Create the nearest neighbor index
nn_plain = NearestNeighbors(n_neighbors=100,
                            algorithm='nng',
                            algorithm_params={'n_candidates': 1_000,
                                              'index_dir': 'auto',
                                              'n_jobs': 8},
                            verbose=2,
                            )
nn_plain.fit(X_train)

# Note that NNG must save its index. By setting `index_dir='auto'`,
# NNG will try to save it to shared memory, if available, otherwise to $TMP.
# This index is NOT removed automatically, as one will typically want to build
# an index once and use it often.
# Retrieve nearest neighbors for each test object
neigh_pred_plain = nn_plain.kneighbors(X_test,
                                       n_neighbors=100,
                                       return_distance=False)

# Calculate the recall per test object.
# Each test object's true neighbors are intersected with the neighbors
# predicted for that same object (row i). `np.intersect1d` flattens its
# inputs, so passing the whole prediction matrix would count a hit whenever
# a true neighbor was predicted for *any* test object.
recalled_plain = [np.intersect1d(neigh_true[i], neigh_pred_plain[i])
                  for i in range(len(X_test))]
recall_plain = np.array([hits.size / neigh_true.shape[1]
                         for hits in recalled_plain])

# Statistics
print(f'Mean = {recall_plain.mean():.4f}, '
      f'stdev = {recall_plain.std():.4f}')


# ANN with HUBNESS REDUCTION
# Here we set `n_candidates=1000`, so that for each query,