def test_kernel_density_sampling(n_samples=100, n_features=3):
    rng = np.random.RandomState(0)
    X = rng.randn(n_samples, n_features)

    bandwidth = 0.2

    for kernel in ['gaussian', 'tophat']:
        # draw a sample with this kernel
        kde = KernelDensity(bandwidth=bandwidth, kernel=kernel).fit(X)
        samp = kde.sample(100)
        assert X.shape == samp.shape

        # check that samples are in the right range
        nbrs = NearestNeighbors(n_neighbors=1).fit(X)
        dist, ind = nbrs.kneighbors(samp, return_distance=True)

        if kernel == 'tophat':
            assert np.all(dist < bandwidth)
        elif kernel == 'gaussian':
            # 5 standard deviations is safe for 100 samples, but there's a
            # very small chance this test could fail.
            assert np.all(dist < 5 * bandwidth)

    # check unsupported kernels
    for kernel in ['epanechnikov', 'exponential', 'linear', 'cosine']:
        kde = KernelDensity(bandwidth=bandwidth, kernel=kernel).fit(X)
        assert_raises(NotImplementedError, kde.sample, 100)

    # non-regression test: used to return a scalar
    X = rng.randn(4, 1)
    kde = KernelDensity(kernel="gaussian").fit(X)
    assert kde.sample().shape == (1, 1)
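# Why 5 standard deviations is safe above (illustrative reasoning, not part of
# the test): a Gaussian-kernel sample is a training point plus N(0, bandwidth^2 * I)
# noise, so with the default n_features=3 its distance to that point is the
# bandwidth times a chi-distributed variable with 3 degrees of freedom. The
# distance to the *nearest* training point can only be smaller, so the failure
# probability over 100 samples is bounded by roughly:
from scipy.stats import chi2

print(100 * chi2.sf(5 ** 2, df=3))  # ~1.5e-3, hence "very small chance"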
@pytest.mark.parametrize('method', ['invalid', None])  # assumed invalid values, for illustration
def test_invalid_method(method):
    X, y = make_classification(n_samples=10)
    nn = NearestNeighbors()
    nn.fit(X, y)
    neigh_dist, neigh_ind = nn.kneighbors()
    mp = MutualProximity(method=method)
    with assert_raises(ValueError):
        mp.fit(neigh_dist, neigh_ind, X, assume_sorted=True)
@pytest.mark.parametrize('method', ['normal', 'empiric'])  # assumed valid MP methods
@pytest.mark.parametrize('verbose', [0, 1])  # assumed verbosity levels
def test_mp_runs_without_error(method, verbose):
    X, y = make_classification(n_samples=20, n_features=10)
    nn = NearestNeighbors()
    nn.fit(X, y)
    neigh_dist, neigh_ind = nn.kneighbors()
    mp = MutualProximity(method=method, verbose=verbose)
    _ = mp.fit(neigh_dist, neigh_ind, X, assume_sorted=True)\
          .transform(neigh_dist, neigh_ind, X, assume_sorted=True)
@pytest.mark.parametrize('method', ['invalid', None])  # assumed invalid values, for illustration
def test_invalid_method(method):
    X, y = make_classification(n_samples=10)
    nn = NearestNeighbors()
    nn.fit(X, y)
    neigh_dist, neigh_ind = nn.kneighbors()
    ls = LocalScaling(method=method)
    ls.fit(neigh_dist, neigh_ind, X, assume_sorted=True)
    with assert_raises(ValueError):
        _ = ls.transform(neigh_dist, neigh_ind, X, assume_sorted=True)
def test_same_neighbors_as_with_exact_nn_search():
    X = np.random.RandomState(42).randn(10, 2)

    nn = NearestNeighbors()
    nn_dist, nn_neigh = nn.fit(X).kneighbors(return_distance=True)

    ann = RandomProjectionTree()
    ann_dist, ann_neigh = ann.fit(X).kneighbors(return_distance=True)

    assert_array_almost_equal(ann_dist, nn_dist, decimal=5)
    assert_array_almost_equal(ann_neigh, nn_neigh, decimal=0)
@pytest.mark.parametrize('hr', ['mp', 'ls', 'dsl'])  # assumed hubness reduction flags, for illustration
def test_sparse_and_hubness_reduction_disables_hr_and_warns(hr):
    X = csr_matrix([[0, 0], [0, 1], [0, 3]])
    nn_true = [1, 0, 1]
    nn = NearestNeighbors(n_neighbors=1, hubness=hr,
                          algorithm_params={'n_candidates': 1})
    msg = 'cannot use hubness reduction with sparse data: disabling hubness reduction.'
    with pytest.warns(UserWarning, match=msg):
        nn.fit(X)
    nn_pred = nn.kneighbors(n_neighbors=1, return_distance=False).ravel()
    np.testing.assert_array_equal(nn_true, nn_pred)
def test_snn(method):
    X, y = make_classification()
    nn = NearestNeighbors()
    nn.fit(X, y)
    neigh_dist, neigh_ind = nn.kneighbors()
    snn = method()
    with assert_raises(NotImplementedError):
        snn.fit(neigh_dist, neigh_ind, X, assume_sorted=True)
    with assert_raises(NotFittedError):
        snn.transform(neigh_dist, neigh_ind, X, assume_sorted=True)
def _k_neighbors(self, X_test: np.ndarray = None, X_train: np.ndarray = None) -> np.ndarray:
    """ Return indices of nearest neighbors in X_train for each vector in X_test. """
    nn = NearestNeighbors(n_neighbors=self.k,
                          metric=self.metric,
                          algorithm=self.algorithm,
                          algorithm_params=self.algorithm_params,
                          hubness=self.hubness,
                          hubness_params=self.hubness_params)
    nn.fit(X_train)
    # if X_test is None, self distances are ignored
    indices = nn.kneighbors(X_test, return_distance=False)
    return indices
@pytest.mark.parametrize('method', ['standard', 'nicdm'])  # assumed LS methods
@pytest.mark.parametrize('verbose', [0, 1])  # assumed verbosity levels
def test_fit_sorted(method, verbose):
    X, y = make_classification()
    nn = NearestNeighbors()
    nn.fit(X, y)
    neigh_dist, neigh_ind = nn.kneighbors()

    ls = LocalScaling(method=method, verbose=verbose)

    nd_sorted, ni_sorted = ls.fit(neigh_dist, neigh_ind, X, assume_sorted=True)\
                             .transform(neigh_dist, neigh_ind, X, assume_sorted=True)
    nd_unsort, ni_unsort = ls.fit(neigh_dist, neigh_ind, X, assume_sorted=False)\
                             .transform(neigh_dist, neigh_ind, X, assume_sorted=False)

    assert_array_almost_equal(nd_sorted, nd_unsort)
    assert_array_equal(ni_sorted, ni_unsort)
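# For reference, a minimal sketch of 'standard' local scaling on a full
# distance matrix, following Zelnik-Manor & Perona's formulation. This is an
# illustration of the technique, not the skhubness implementation; the names
# `local_scaling`, `D`, and `k` are chosen here for the sketch only.
import numpy as np

def local_scaling(D, k=5):
    """Rescale distances by each point's distance to its k-th neighbor."""
    sigma = np.sort(D, axis=1)[:, k]  # column 0 holds the self-distance 0
    return 1. - np.exp(-D ** 2 / (sigma[:, None] * sigma[None, :]))

rng = np.random.RandomState(0)
P = rng.rand(10, 2)
D = np.sqrt(((P[:, None, :] - P[None, :, :]) ** 2).sum(-1))
print(local_scaling(D, k=3).round(2))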
def test_same_indices():
    X, y = make_classification()
    nn = NearestNeighbors()
    nn.fit(X, y)
    neigh_dist, neigh_ind = nn.kneighbors()
    hr = NoHubnessReduction()
    _, neigh_ind_hr = hr.fit_transform(neigh_dist, neigh_ind, X, return_distance=True)
    neigh_ind_hr_no_dist = hr.fit_transform(neigh_dist, neigh_ind, X, return_distance=False)
    assert_array_equal(neigh_ind, neigh_ind_hr)
    assert_array_equal(neigh_ind_hr, neigh_ind_hr_no_dist)
def test_correct_mp_empiric():
    X, y = make_classification(n_samples=120, n_features=10, random_state=1234)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=20)
    nn = NearestNeighbors(n_neighbors=20)
    nn.fit(X_train, y_train)
    neigh_dist_train, neigh_ind_train = nn.kneighbors()
    neigh_dist_test, neigh_ind_test = nn.kneighbors(X_test)

    # Calculate MP with fast vectorized routines
    mp = MutualProximity(method='empiric')
    mp.fit(neigh_dist_train, neigh_ind_train, X=None, assume_sorted=True)
    mp_dist_test, mp_ind_test = mp.transform(neigh_dist_test, neigh_ind_test,
                                             X=None, assume_sorted=True)

    # Calculate MP in slow, naive loops
    mp_dist_test_correct = np.empty_like(neigh_dist_test, dtype=float)
    mp_ind_test_correct = np.empty_like(neigh_ind_test, dtype=int)
    n_test, n_train = neigh_ind_test.shape

    # Loop over all test distances
    for x in range(n_test):
        for y in range(n_train):
            idx = neigh_ind_test[x, y]
            d_xy = neigh_dist_test[x, y]
            set1 = set()
            set2 = set()
            # P(X > d_xy): how many distances from query x to indexed objects j
            # are greater than the distance between x and y?
            for j, d_xj in zip(neigh_ind_test[x, :], neigh_dist_test[x, :]):
                if d_xj > d_xy:
                    set1.add(j)
            # P(Y > d_yx): how many distances from indexed object y to other
            # indexed objects j are greater than the distance between y and x?
            for j in neigh_ind_test[x, :]:
                k = np.argwhere(neigh_ind_train[idx] == j).ravel()
                # Since we don't store all distances between all pairs of indexed
                # objects, this is approximated by setting all distances to
                # non-nearest neighbors to the distance to the k-th neighbor
                # plus some epsilon.
                d_yj = neigh_dist_train[idx, k] if k.size else neigh_dist_train[idx, -1] + 1e-6
                if d_yj > d_xy:
                    set2.add(j)
            mp_dist_test_correct[x, y] = 1 - (len(set1.intersection(set2)) / n_train)
            mp_ind_test_correct[x, y] = idx

    np.testing.assert_array_almost_equal(mp_dist_test, mp_dist_test_correct)
    np.testing.assert_array_equal(mp_ind_test, mp_ind_test_correct)
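# The definition being approximated above is easier to see on a full distance
# matrix: MP(x, y) is the fraction of other objects z that are farther from
# *both* x and y than x and y are from each other, and the MP distance is
# 1 - MP(x, y). A minimal, self-contained sketch of that definition (names
# `mp_empiric_full` and `D` are illustrative only; this is not the skhubness API):
import numpy as np

def mp_empiric_full(D):
    """Empiric Mutual Proximity distances for a square distance matrix D."""
    n = D.shape[0]
    mp_dist = np.zeros_like(D, dtype=float)
    for x in range(n):
        for y in range(n):
            if x == y:
                continue
            d_xy = D[x, y]
            others = np.setdiff1d(np.arange(n), [x, y])
            # Fraction of objects z farther from both x and y than d(x, y)
            joint = np.mean((D[x, others] > d_xy) & (D[y, others] > d_xy))
            mp_dist[x, y] = 1. - joint
    return mp_dist

rng = np.random.RandomState(0)
P = rng.rand(5, 2)
D = np.sqrt(((P[:, None, :] - P[None, :, :]) ** 2).sum(-1))
print(mp_empiric_full(D).round(2))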
def fit(self, X, y=None):
    """ Fit indexed objects.

    Parameters
    ----------
    X: {array-like, sparse matrix}, shape (n_samples, n_features) or (n_query, n_indexed) if metric == 'precomputed'
        Training data vectors or distance matrix, if metric == 'precomputed'.

    y: ignored

    Returns
    -------
    self: Fitted instance of :class:`Hubness`
    """
    X = check_array(X, accept_sparse=True)

    # Make sure parameters have sensible values
    k = self.k
    if k is None:
        k = 10
    elif k < 1:
        raise ValueError(f"Neighborhood size 'k' must be >= 1, but is {k}.")
    self.k = k

    store_k_neighbors = self.store_k_neighbors
    if store_k_neighbors is None:
        store_k_neighbors = False
    elif not isinstance(store_k_neighbors, bool):
        raise ValueError("store_k_neighbors must be True or False.")
    self.store_k_neighbors = store_k_neighbors

    store_k_occurrence = self.store_k_occurrence
    if store_k_occurrence is None:
        store_k_occurrence = False
    elif not isinstance(store_k_occurrence, bool):
        raise ValueError("store_k_occurrence must be True or False.")
    self.store_k_occurrence = store_k_occurrence

    return_value = self.return_value
    if return_value is None:
        return_value = 'k_skewness'
    elif return_value not in VALID_HUBNESS_MEASURES:
        raise ValueError(
            f'Unknown return value: {return_value}. '
            f'Allowed hubness measures: {VALID_HUBNESS_MEASURES}.')
    elif return_value == 'k_neighbors' and not self.store_k_neighbors:
        warnings.warn(
            f'Incompatible parameters return_value={return_value} '
            f'and store_k_neighbors={self.store_k_neighbors}. '
            f'Overriding store_k_neighbors=True.')
        self.store_k_neighbors = True
    elif return_value == 'k_occurrence' and not self.store_k_occurrence:
        warnings.warn(
            f'Incompatible parameters return_value={return_value} '
            f'and store_k_occurrence={self.store_k_occurrence}. '
            f'Overriding store_k_occurrence=True.')
        self.store_k_occurrence = True
    self.return_value = return_value

    hub_size = self.hub_size
    if hub_size is None:
        hub_size = 2.
    elif hub_size <= 0:
        raise ValueError("Hub size must be greater than zero.")
    self.hub_size = hub_size

    metric = self.metric
    if metric is None:
        metric = 'euclidean'
    if metric not in VALID_METRICS:
        raise ValueError(f"Unknown metric '{metric}'. "
                         f"Must be one of {VALID_METRICS}.")
    self.metric = metric

    n_jobs = self.n_jobs
    if n_jobs is None:
        n_jobs = 1
    elif n_jobs == -1:
        n_jobs = cpu_count()
    elif n_jobs < -1 or n_jobs == 0:
        raise ValueError(f"Number of parallel processes 'n_jobs' must be "
                         f"a positive integer, or ``-1`` to use all local"
                         f" CPU cores. Was {n_jobs} instead.")
    self.n_jobs = n_jobs

    verbose = self.verbose
    if verbose is None or verbose < 0:
        verbose = 0
    self.verbose = verbose

    # check random state
    self._random_state = check_random_state(self.random_state)

    shuffle_equal = self.shuffle_equal
    if shuffle_equal is None:
        shuffle_equal = False
    elif not isinstance(shuffle_equal, bool):
        raise ValueError(f'Parameter shuffle_equal must be True or False, '
                         f'but was {shuffle_equal}.')
    self.shuffle_equal = shuffle_equal

    # Fit Hubness to training data: store as indexed objects
    self.X_train_ = X
    nn = NearestNeighbors(
        n_neighbors=self.k,
        metric=self.metric,
        algorithm=self.algorithm,
        algorithm_params=self.algorithm_params,
        hubness=self.hubness,
        hubness_params=self.hubness_params,
        n_jobs=self.n_jobs,
        verbose=self.verbose,
    )
    self.nn_index_ = nn.fit(X)

    return self
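# A minimal usage sketch for the estimator whose ``fit`` is shown above,
# following the public skhubness ``Hubness`` API. The data and parameter
# values here are illustrative assumptions, not canonical settings.
import numpy as np
from skhubness import Hubness

X = np.random.RandomState(0).randn(100, 50)
hub = Hubness(k=10, metric='euclidean', return_value='k_skewness')
hub.fit(X)
# Skewness of the k-occurrence histogram; larger values indicate more hubness
print(hub.score())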
neigh_true = f['neighbors']
dist = f['distances']

# How many objects have we got?
for k in f.keys():
    print(f'{k}: shape = {f[k].shape}')

# APPROXIMATE NEAREST NEIGHBOR SEARCH
# In order to retrieve the most similar words from the GLOVE embeddings,
# we use the unsupervised `skhubness.neighbors.NearestNeighbors` class.
# The (approximate) nearest neighbor algorithm is set to NNG by passing `algorithm='nng'`.
# We can pass additional parameters to `NNG` via the `algorithm_params` dict.
# Here we set `n_jobs=8` to enable parallelism.

# Create the nearest neighbor index
nn_plain = NearestNeighbors(n_neighbors=100,
                            algorithm='nng',
                            algorithm_params={'n_candidates': 1_000,
                                              'index_dir': 'auto',
                                              'n_jobs': 8},
                            verbose=2,
                            )
nn_plain.fit(X_train)

# Note that NNG must save its index. By setting `index_dir='auto'`,
# NNG will try to save it to shared memory, if available, otherwise to $TMP.
# This index is NOT removed automatically, as one will typically want to
# build an index once and use it often.

# Retrieve nearest neighbors for each test object
neigh_pred_plain = nn_plain.kneighbors(X_test,
                                       n_neighbors=100,
                                       return_distance=False)
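# With the ground-truth neighbors (`neigh_true`) at hand, a natural next step
# is to measure the recall of the approximate search. A sketch under the
# assumption that `neigh_true` and `neigh_pred_plain` are (n_test, 100) index
# arrays over the same training set; `recall_at_k` is a helper defined here,
# not part of skhubness:
import numpy as np

def recall_at_k(neigh_true, neigh_pred, k=100):
    """Mean fraction of true k-nearest neighbors found by the ANN search."""
    hits = [np.intersect1d(t[:k], p[:k]).size
            for t, p in zip(neigh_true, neigh_pred)]
    return np.mean(hits) / k

print(f'Recall@100: {recall_at_k(np.asarray(neigh_true), neigh_pred_plain):.3f}')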