Ejemplo n.º 1
0
def test_kernel_density_sampling(n_samples=100, n_features=3):
    """Check ``KernelDensity.sample`` for supported and unsupported kernels.

    For the 'gaussian' and 'tophat' kernels, ``n_samples`` points are drawn
    from a density fitted on random data, and each drawn point must lie
    close to its nearest training point. All other kernels must raise
    ``NotImplementedError`` when sampling. Finally, a call to ``sample()``
    without arguments must return a 2D array of shape ``(1, 1)`` for
    one-dimensional training data.

    Parameters
    ----------
    n_samples : int, default=100
        Number of training points, and number of points drawn per kernel.
    n_features : int, default=3
        Dimensionality of the training data.
    """
    rng = np.random.RandomState(0)
    X = rng.randn(n_samples, n_features)

    bandwidth = 0.2

    for kernel in ['gaussian', 'tophat']:
        # Draw as many samples as there are training points, so that the
        # shape comparison below holds for any ``n_samples``.
        kde = KernelDensity(bandwidth=bandwidth, kernel=kernel).fit(X)
        samp = kde.sample(n_samples, random_state=0)
        assert X.shape == samp.shape

        # check that samples are in the right range: query the drawn samples
        # (not the training data itself) against the training index
        nbrs = NearestNeighbors(n_neighbors=1).fit(X)
        dist, _ = nbrs.kneighbors(samp, return_distance=True)

        if kernel == 'tophat':
            assert np.all(dist < bandwidth)
        elif kernel == 'gaussian':
            # 5 standard deviations is a generous bound for 100 samples;
            # the fixed ``random_state`` keeps the outcome reproducible.
            assert np.all(dist < 5 * bandwidth)

    # check unsupported kernels
    for kernel in ['epanechnikov', 'exponential', 'linear', 'cosine']:
        kde = KernelDensity(bandwidth=bandwidth, kernel=kernel).fit(X)
        assert_raises(NotImplementedError, kde.sample, 100)

    # non-regression test: used to return a scalar
    X = rng.randn(4, 1)
    kde = KernelDensity(kernel="gaussian").fit(X)
    assert kde.sample().shape == (1, 1)
Ejemplo n.º 2
0
def test_invalid_method(method):
    """``MutualProximity.fit`` must raise ``ValueError`` for the given ``method``.

    ``method`` is supplied from outside this function (presumably through
    test parametrization with unsupported method names).
    """
    X, y = make_classification(n_samples=10)

    # Build the neighbor graph that the hubness reduction consumes.
    index = NearestNeighbors()
    index.fit(X, y)
    neigh_dist, neigh_ind = index.kneighbors()

    reducer = MutualProximity(method=method)
    with assert_raises(ValueError):
        reducer.fit(neigh_dist, neigh_ind, X, assume_sorted=True)
Ejemplo n.º 3
0
def test_mp_runs_without_error(method, verbose):
    """Smoke test: ``MutualProximity`` fit followed by transform must not raise.

    ``method`` and ``verbose`` are supplied from outside this function
    (presumably through test parametrization).
    """
    X, y = make_classification(n_samples=20, n_features=10)

    index = NearestNeighbors()
    index.fit(X, y)
    neigh_dist, neigh_ind = index.kneighbors()

    reducer = MutualProximity(method=method, verbose=verbose)
    fitted = reducer.fit(neigh_dist, neigh_ind, X, assume_sorted=True)
    # Only successful execution matters; the transformed result is discarded.
    fitted.transform(neigh_dist, neigh_ind, X, assume_sorted=True)
Ejemplo n.º 4
0
def test_invalid_method(method):
    """``LocalScaling.transform`` must raise ``ValueError`` for the given ``method``.

    Fitting happens outside the ``assert_raises`` context, so it is expected
    to succeed; only the subsequent ``transform`` call has to raise.
    """
    X, y = make_classification(n_samples=10)

    # Build the neighbor graph that the hubness reduction consumes.
    index = NearestNeighbors()
    index.fit(X, y)
    neigh_dist, neigh_ind = index.kneighbors()

    scaler = LocalScaling(method=method)
    scaler.fit(neigh_dist, neigh_ind, X, assume_sorted=True)
    with assert_raises(ValueError):
        scaler.transform(neigh_dist, neigh_ind, X, assume_sorted=True)
Ejemplo n.º 5
0
def test_same_neighbors_as_with_exact_nn_search():
    """``RandomProjectionTree`` must reproduce the ``NearestNeighbors`` result.

    Both indices are fitted on the same 10 x 2 random matrix; distances have
    to agree to 5 decimals and neighbor indices to 0 decimals.
    """
    X = np.random.RandomState(42).randn(10, 2)

    # Reference result
    exact_index = NearestNeighbors().fit(X)
    exact_dist, exact_ind = exact_index.kneighbors(return_distance=True)

    # Result of the index under test
    approx_index = RandomProjectionTree().fit(X)
    approx_dist, approx_ind = approx_index.kneighbors(return_distance=True)

    assert_array_almost_equal(approx_dist, exact_dist, decimal=5)
    assert_array_almost_equal(approx_ind, exact_ind, decimal=0)
Ejemplo n.º 6
0
def test_sparse_and_hubness_reduction_disables_hr_and_warns(hr):
    """Fitting sparse data with hubness reduction ``hr`` must warn, then still work.

    ``fit`` has to emit a ``UserWarning`` with the message below, and the
    subsequent 1-nearest-neighbor query must return the expected indices.
    """
    expected_neighbors = [1, 0, 1]
    X = csr_matrix([[0, 0], [0, 1], [0, 3]])
    nn = NearestNeighbors(
        n_neighbors=1,
        hubness=hr,
        algorithm_params={'n_candidates': 1},
    )

    expected_warning = ('cannot use hubness reduction with sparse data: '
                        'disabling hubness reduction.')
    with pytest.warns(UserWarning, match=expected_warning):
        nn.fit(X)

    found_neighbors = nn.kneighbors(n_neighbors=1, return_distance=False).ravel()
    np.testing.assert_array_equal(expected_neighbors, found_neighbors)
def test_snn(method):
    """The class passed as ``method`` must be unimplemented and stay unfitted.

    ``fit`` has to raise ``NotImplementedError``, and the subsequent
    ``transform`` on the same instance has to raise ``NotFittedError``.
    """
    X, y = make_classification()
    index = NearestNeighbors()
    index.fit(X, y)
    neigh_dist, neigh_ind = index.kneighbors()

    snn = method()
    graph_args = (neigh_dist, neigh_ind, X)

    with assert_raises(NotImplementedError):
        snn.fit(*graph_args, assume_sorted=True)

    with assert_raises(NotFittedError):
        snn.transform(*graph_args, assume_sorted=True)
Ejemplo n.º 8
0
 def _k_neighbors(self,
                  X_test: np.ndarray = None,
                  X_train: np.ndarray = None) -> np.ndarray:
     """ Return indices of nearest neighbors in X_train for each vector in X_test.

     A new ``NearestNeighbors`` index is built from ``X_train`` on every
     call, configured from this instance's ``k``, ``metric``, ``algorithm``,
     ``algorithm_params``, ``hubness`` and ``hubness_params`` attributes.

     Parameters
     ----------
     X_test:
         Query vectors, forwarded unchanged to ``kneighbors``. May be None.
     X_train:
         Vectors to index, forwarded unchanged to ``NearestNeighbors.fit``.

     Returns
     -------
     indices:
         Result of ``kneighbors(X_test, return_distance=False)``.
     """
     nn = NearestNeighbors(n_neighbors=self.k,
                           metric=self.metric,
                           algorithm=self.algorithm,
                           algorithm_params=self.algorithm_params,
                           hubness=self.hubness,
                           hubness_params=self.hubness_params)
     nn.fit(X_train)
     # if X_test is None, self distances are ignored
     return nn.kneighbors(X_test, return_distance=False)
Ejemplo n.º 9
0
def test_fit_sorted(method, verbose):
    """``LocalScaling`` must give the same result with and without ``assume_sorted``.

    The same neighbor graph is fitted and transformed twice, first with
    ``assume_sorted=True`` and then with ``assume_sorted=False``; distances
    must agree almost exactly and indices exactly.
    """
    X, y = make_classification()
    index = NearestNeighbors()
    index.fit(X, y)
    neigh_dist, neigh_ind = index.kneighbors()

    ls = LocalScaling(method=method, verbose=verbose)

    # Run the sorted variant first, then the unsorted one, on the same instance.
    outcomes = []
    for flag in (True, False):
        fitted = ls.fit(neigh_dist, neigh_ind, X, assume_sorted=flag)
        outcomes.append(
            fitted.transform(neigh_dist, neigh_ind, X, assume_sorted=flag))
    (nd_sorted, ni_sorted), (nd_unsort, ni_unsort) = outcomes

    assert_array_almost_equal(nd_sorted, nd_unsort)
    assert_array_equal(ni_sorted, ni_unsort)
def test_same_indices():
    """``NoHubnessReduction`` must hand back the neighbor indices unchanged.

    The indices returned with ``return_distance=True`` and with
    ``return_distance=False`` must both equal the input indices.
    """
    X, y = make_classification()
    index = NearestNeighbors()
    index.fit(X, y)
    neigh_dist, neigh_ind = index.kneighbors()

    hr = NoHubnessReduction()
    graph_args = (neigh_dist, neigh_ind, X)
    _, ind_with_dist = hr.fit_transform(*graph_args, return_distance=True)
    ind_without_dist = hr.fit_transform(*graph_args, return_distance=False)

    assert_array_equal(neigh_ind, ind_with_dist)
    assert_array_equal(ind_with_dist, ind_without_dist)
Ejemplo n.º 11
0
def test_correct_mp_empiric():
    """Compare ``MutualProximity(method='empiric')`` against a naive reference.

    The transformed test distances and indices produced by ``MutualProximity``
    are compared with values recomputed below in explicit Python loops.
    """
    X, y = make_classification(n_samples=120, n_features=10, random_state=1234, )
    # NOTE(review): no random_state is passed to the split, so the train/test
    # partition presumably differs between runs — confirm this is intended.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=20)
    nn = NearestNeighbors(n_neighbors=20)
    nn.fit(X_train, y_train)
    neigh_dist_train, neigh_ind_train = nn.kneighbors()
    neigh_dist_test, neigh_ind_test = nn.kneighbors(X_test)

    # Calculate MP with fast vectorized routines
    mp = MutualProximity(method='empiric')
    mp.fit(neigh_dist_train, neigh_ind_train, X=None, assume_sorted=True)
    mp_dist_test, mp_ind_test = mp.transform(neigh_dist_test, neigh_ind_test, X=None, assume_sorted=True)

    # Calculate MP in slow, naive loops
    mp_dist_test_correct = np.empty_like(neigh_dist_test, dtype=float)
    mp_ind_test_correct = np.empty_like(neigh_ind_test, dtype=int)
    # NOTE(review): despite its name, n_train is the number of columns of
    # neigh_ind_test (neighbors per query), not the number of training samples.
    n_test, n_train = neigh_ind_test.shape

    # Loop over all test distances
    # (x is the query row, y the neighbor column; y shadows the label array
    # created above, which is not used again below)
    for x in range(n_test):
        for y in range(n_train):
            idx = neigh_ind_test[x, y]
            d_xy = neigh_dist_test[x, y]
            set1 = set()
            set2 = set()
            # P(X > d_xy), i.e. how many distances from query x to indexed objects j
            # are greater than distance between x and y?
            for j, d_xj in zip(neigh_ind_test[x, :], neigh_dist_test[x, :]):
                if d_xj > d_xy:
                    set1.add(j)
            # P(Y > d_yx), i.e. how many distances from indexed object y to other indexed objects j
            # are greater than distance between y and x?
            for j in neigh_ind_test[x, :]:
                # Position(s) of j within the stored neighbor list of idx (empty if absent)
                k = np.argwhere(neigh_ind_train[idx] == j).ravel()
                # Since we don't store all distances between all pairs of indexed objects,
                # this is approximated by setting all distances to not-nearest neighbors
                # to the distance to the k-th neighbor plus some epsilon
                d_yj = neigh_dist_train[idx, k] if k.size else neigh_dist_train[idx, -1] + 1e-6
                if d_yj > d_xy:
                    set2.add(j)
            # Reference value: one minus the fraction of objects in both sets
            mp_dist_test_correct[x, y] = 1 - (len(set1.intersection(set2)) / n_train)
            mp_ind_test_correct[x, y] = idx
    np.testing.assert_array_almost_equal(mp_dist_test, mp_dist_test_correct)
    np.testing.assert_array_equal(mp_ind_test, mp_ind_test_correct)
Ejemplo n.º 12
0
    def fit(self, X, y=None):
        """ Fit indexed objects.

        Validates the parameters stored on the instance, replaces ``None``
        values by their defaults, writes the validated values back to the
        corresponding attributes, and finally fits a ``NearestNeighbors``
        index on ``X``.

        Defaults applied for ``None``: ``k=10``, ``store_k_neighbors=False``,
        ``store_k_occurrence=False``, ``return_value='k_skewness'``,
        ``hub_size=2.``, ``metric='euclidean'``, ``n_jobs=1``, ``verbose=0``,
        ``shuffle_equal=False``. ``n_jobs=-1`` is resolved to ``cpu_count()``
        and a negative ``verbose`` is clipped to 0.

        Parameters
        ----------
        X: {array-like, sparse matrix}, shape (n_samples, n_features) or (n_query, n_indexed) if metric=='precomputed'
            Training data vectors or distance matrix, if metric == 'precomputed'.

        y: ignored

        Returns
        -------
        self:
            Fitted instance of :mod:Hubness. The attributes ``X_train_``,
            ``nn_index_`` and ``_random_state`` are set.

        Raises
        ------
        ValueError
            If ``k < 1``; if ``store_k_neighbors``, ``store_k_occurrence``
            or ``shuffle_equal`` is neither None nor a bool; if
            ``return_value`` is not in ``VALID_HUBNESS_MEASURES``; if
            ``hub_size <= 0``; if ``metric`` is not in ``VALID_METRICS``;
            or if ``n_jobs`` is 0 or smaller than -1.

        Warns
        -----
        A warning is issued when ``return_value`` requires neighbors or
        occurrences to be stored while the matching ``store_*`` flag is
        False; the flag is then switched on.
        """
        X = check_array(X, accept_sparse=True)

        # Making sure parameters have sensible values
        k = self.k
        if k is None:
            k = 10
        elif k < 1:
            raise ValueError(f"Neighborhood size 'k' must "
                             f"be >= 1, but is {k}.")
        self.k = k

        store_k_neighbors = self.store_k_neighbors
        if store_k_neighbors is None:
            store_k_neighbors = False
        elif not isinstance(store_k_neighbors, bool):
            raise ValueError("k_neighbors must be True or False.")
        self.store_k_neighbors = store_k_neighbors

        store_k_occurrence = self.store_k_occurrence
        if store_k_occurrence is None:
            store_k_occurrence = False
        elif not isinstance(store_k_occurrence, bool):
            raise ValueError("k_occurrence must be True or False.")
        self.store_k_occurrence = store_k_occurrence

        # The store_* flags were validated above, so they are proper bools
        # by the time they are consulted (and possibly overridden) here.
        return_value = self.return_value
        if return_value is None:
            return_value = 'k_skewness'
        elif return_value not in VALID_HUBNESS_MEASURES:
            raise ValueError(
                f'Unknown return value: {return_value}. '
                f'Allowed hubness measures: {VALID_HUBNESS_MEASURES}.')
        elif return_value == 'k_neighbors' and not self.store_k_neighbors:
            warnings.warn(
                f'Incompatible parameters return_value={return_value} '
                f'and store_k_neighbors={self.store_k_neighbors}. '
                f'Overriding store_k_neighbor=True.')
            self.store_k_neighbors = True
        elif return_value == 'k_occurrence' and not self.store_k_occurrence:
            warnings.warn(
                f'Incompatible parameters return_value={return_value} '
                f'and store_k_occurrence={self.store_k_occurrence}. '
                f'Overriding store_k_occurrence=True.')
            self.store_k_occurrence = True
        self.return_value = return_value

        hub_size = self.hub_size
        if hub_size is None:
            hub_size = 2.
        elif hub_size <= 0:
            raise ValueError("Hub size must be greater than zero.")
        self.hub_size = hub_size

        metric = self.metric
        if metric is None:
            metric = 'euclidean'
        if metric not in VALID_METRICS:
            raise ValueError(f"Unknown metric '{metric}'. "
                             f"Must be one of {VALID_METRICS}.")
        self.metric = metric

        n_jobs = self.n_jobs
        if n_jobs is None:
            n_jobs = 1
        elif n_jobs == -1:
            # Resolve "all cores" into the local variable; assigning to
            # self.n_jobs here would be overwritten by the write-back below.
            n_jobs = cpu_count()
        elif n_jobs < -1 or n_jobs == 0:
            raise ValueError(f"Number of parallel processes 'n_jobs' must be "
                             f"a positive integer, or ``-1`` to use all local"
                             f" CPU cores. Was {n_jobs} instead.")
        self.n_jobs = n_jobs

        # Missing or negative verbosity both mean "silent"
        verbose = self.verbose
        if verbose is None or verbose < 0:
            verbose = 0
        self.verbose = verbose

        # check random state
        self._random_state = check_random_state(self.random_state)

        shuffle_equal = self.shuffle_equal
        if shuffle_equal is None:
            shuffle_equal = False
        elif not isinstance(shuffle_equal, bool):
            raise ValueError(f'Parameter shuffle_equal must be True or False, '
                             f'but was {shuffle_equal}.')
        self.shuffle_equal = shuffle_equal

        # Fit Hubness to training data: store as indexed objects
        self.X_train_ = X
        nn = NearestNeighbors(
            n_neighbors=self.k,
            metric=self.metric,
            algorithm=self.algorithm,
            algorithm_params=self.algorithm_params,
            hubness=self.hubness,
            hubness_params=self.hubness_params,
            n_jobs=self.n_jobs,
            verbose=self.verbose,
        )
        self.nn_index_ = nn.fit(X)

        return self
Ejemplo n.º 13
0
# `f` is created earlier in the script (outside this excerpt); it is used
# here as a mapping from dataset names to array-like objects with a `.shape`.
neigh_true = f['neighbors']
dist = f['distances']

# How many objects do we have?
for k in f.keys():
    print(f'{k}: shape = {f[k].shape}')

# APPROXIMATE NEAREST NEIGHBOR SEARCH
# In order to retrieve the most similar words from the GloVe embeddings,
# we use the unsupervised `skhubness.neighbors.NearestNeighbors` class.
# The (approximate) nearest neighbor algorithm is set to NNG by passing `algorithm='nng'`.
# We can pass additional parameters to `NNG` via the `algorithm_params` dict.
# Here we set `n_jobs=8` to enable parallelism.
# Create the nearest neighbor index
nn_plain = NearestNeighbors(n_neighbors=100,
                            algorithm='nng',
                            algorithm_params={'n_candidates': 1_000,
                                              'index_dir': 'auto',
                                              'n_jobs': 8},
                            verbose=2,
                            )
nn_plain.fit(X_train)

# Note that NNG must save its index. By setting `index_dir='auto'`,
# NNG will try to save it to shared memory, if available, otherwise to $TMP.
# This index is NOT removed automatically, as one will typically want to build
# an index once and use it often.
# Retrieve the nearest neighbors for each test object
neigh_pred_plain = nn_plain.kneighbors(X_test,
                                       n_neighbors=100,
                                       return_distance=False)