def test_kernel_density_sampling(n_samples=100, n_features=3):
    """Check the shape and range of samples drawn by ``KernelDensity.sample``.

    For the 'gaussian' and 'tophat' kernels, draws ``n_samples`` points from
    an estimator fitted on random normal data and checks that:

    * the drawn array has the same shape as the training data, and
    * every drawn point is close to its nearest training point (strictly
      within ``bandwidth`` for 'tophat', within ``5 * bandwidth`` for
      'gaussian').

    Also checks that the remaining kernels raise ``NotImplementedError`` from
    ``sample`` and that a default ``sample()`` call returns a 2-D array of
    shape ``(1, 1)`` for one-feature data.

    Parameters
    ----------
    n_samples : int, default=100
        Number of training points, and number of points drawn per kernel.
    n_features : int, default=3
        Dimensionality of the training data.
    """
    rng = np.random.RandomState(0)
    X = rng.randn(n_samples, n_features)

    bandwidth = 0.2

    for kernel in ['gaussian', 'tophat']:
        # Draw as many points as there are training points so that the shape
        # comparison below holds for any value of n_samples.
        kde = KernelDensity(bandwidth=bandwidth, kernel=kernel).fit(X)
        samp = kde.sample(n_samples, random_state=0)
        assert X.shape == samp.shape

        # Check that samples are in the right range: measure the distance
        # from each *drawn sample* to its nearest training point. Querying
        # with X itself would always yield zero distances and make the
        # assertions below vacuous.
        nbrs = NearestNeighbors(n_neighbors=1).fit(X)
        dist, _ = nbrs.kneighbors(samp, return_distance=True)

        if kernel == 'tophat':
            assert np.all(dist < bandwidth)
        elif kernel == 'gaussian':
            # 5 standard deviations is a safe bound for this many samples;
            # the draw is seeded above so the outcome is reproducible.
            assert np.all(dist < 5 * bandwidth)

    # check unsupported kernels
    for kernel in ['epanechnikov', 'exponential', 'linear', 'cosine']:
        kde = KernelDensity(bandwidth=bandwidth, kernel=kernel).fit(X)
        assert_raises(NotImplementedError, kde.sample, 100)

    # non-regression test: used to return a scalar
    X = rng.randn(4, 1)
    kde = KernelDensity(kernel="gaussian").fit(X)
    assert kde.sample().shape == (1, 1)
def test_kde_sample_weights():
    """Exercise ``sample_weight`` handling in ``KernelDensity.fit``.

    For several dimensionalities and every tested algorithm/metric pair
    (kd_tree is skipped for metrics not in ``KDTree.valid_metrics``), this
    checks that:

    * a constant sample weight gives the same scores and samples as no
      weight at all;
    * integer weights give the same scores and samples as repeating each
      row that many times;
    * weighting changes the scores by a non-trivial amount;
    * rescaling all weights by a common factor leaves the scores unchanged.
    """
    n_samples = 400
    size_test = 20
    sampling_seed = 1234
    constant_weights = np.full(n_samples, 3.)

    def score_then_sample(estimator, points):
        # Score the query points first, then draw one seeded sample.
        log_dens = estimator.score_samples(points)
        drawn = estimator.sample(random_state=sampling_seed)
        return log_dens, drawn

    for n_dims in (1, 2, 10):
        rng = np.random.RandomState(0)
        X = rng.rand(n_samples, n_dims)
        int_weights = 1 + (10 * X.sum(axis=1)).astype(np.int8)
        X_repeated = np.repeat(X, int_weights, axis=0)
        queries = rng.rand(size_test // n_dims, n_dims)

        for algorithm in ('auto', 'ball_tree', 'kd_tree'):
            for metric in ('euclidean', 'minkowski', 'manhattan',
                           'chebyshev'):
                unsupported = (algorithm == 'kd_tree'
                               and metric not in KDTree.valid_metrics)
                if unsupported:
                    continue

                kde = KernelDensity(algorithm=algorithm, metric=metric)

                # A constant sample weight must behave like no weight.
                kde.fit(X, sample_weight=constant_weights)
                const_scores, const_draw = score_then_sample(kde, queries)
                kde.fit(X)
                plain_scores, plain_draw = score_then_sample(kde, queries)
                assert_allclose(const_scores, plain_scores)
                assert_allclose(const_draw, plain_draw)

                # Integer weights must behave like repeated rows.
                kde.fit(X, sample_weight=int_weights)
                weighted_scores, weighted_draw = score_then_sample(
                    kde, queries)
                kde.fit(X_repeated)
                repeated_scores, repeated_draw = score_then_sample(
                    kde, queries)
                assert_allclose(weighted_scores, repeated_scores)
                assert_allclose(weighted_draw, repeated_draw)

                # Weighting must actually change the estimated density.
                largest_gap = np.abs(plain_scores - weighted_scores).max()
                assert largest_gap > 0.001

                # Scores must be invariant to a global rescaling of weights.
                scale_factor = rng.rand()
                rescaled_weights = scale_factor * int_weights
                kde.fit(X, sample_weight=rescaled_weights)
                rescaled_scores = kde.score_samples(queries)
                assert_allclose(rescaled_scores, weighted_scores)