def test_additivity_with_weights(data, split_index):
    """
    Check that a weighted KDE is additive over a partition of the data.

    The estimate fitted on the full, weighted data set must equal the sum of
    the estimates fitted on the two parts, where each partial estimate is
    scaled by the total weight carried by its part.

    TODO: Parameterize this test w.r.t implementation.
    """
    grid = np.linspace(-10, 15)

    # Linearly increasing weights, normalized so that they sum to one
    raw_weights = np.arange(len(data)) + 1
    normalized = raw_weights / np.sum(raw_weights)

    # Reference: a single fit on everything
    full_estimate = FFTKDE("epa").fit(data, normalized).evaluate(grid)

    # Work on plain lists so both sequences can be sliced the same way
    points = list(data)
    point_weights = list(normalized)

    # One rescaled estimate per part: [:split_index] and [split_index:]
    partial_estimates = []
    for part in (slice(None, split_index), slice(split_index, None)):
        chunk = points[part]
        chunk_weights = point_weights[part]
        density = FFTKDE("epa").fit(chunk, chunk_weights).evaluate(grid)
        # Compensate for the part carrying only a share of the total weight
        partial_estimates.append(density * sum(chunk_weights))

    first_part, second_part = partial_estimates
    assert np.allclose(full_estimate, first_part + second_part)
def test_additivity(data, split_index):
    """
    Check that the KDE is additive over a partition of the data.

    The estimate on the full data set must equal the sum of the estimates on
    the two parts, each scaled by the fraction of samples it was fitted on.
    """
    grid = np.linspace(-10, 10)

    # Reference: a single fit on everything
    reference = FFTKDE('epa').fit(data).evaluate(grid)

    # First part, scaled by its share of the samples
    head_share = split_index / len(data)
    head_density = FFTKDE('epa').fit(data[:split_index]).evaluate(grid)
    head_term = head_density * head_share

    # Second part, scaled by the remaining share
    tail_share = (len(data) - split_index) / len(data)
    tail_density = FFTKDE('epa').fit(data[split_index:]).evaluate(grid)
    tail_term = tail_density * tail_share

    assert np.allclose(reference, head_term + tail_term)
def test_additivity(data, split_index):
    """
    Check that the KDE is additive over a partition of the data.

    Fitting on all of the data must give the same curve as fitting on the
    two parts separately and adding the results, with every partial curve
    weighted by the fraction of samples in its part.

    TODO: Parameterize this test w.r.t implementation.
    """
    # NOTE(review): this source contains another function with this exact
    # name; if both live in the same module, the later definition shadows
    # the earlier one and only one of them runs - confirm.
    grid = np.linspace(-10, 12)

    def density_of(sample):
        # Epanechnikov-kernel FFTKDE of `sample`, evaluated on the shared grid
        return FFTKDE("epa").fit(sample).evaluate(grid)

    expected = density_of(data)

    # Scale each partial estimate to compensate for the smaller sample
    fraction_before = split_index / len(data)
    contribution_before = density_of(data[:split_index]) * fraction_before

    fraction_after = (len(data) - split_index) / len(data)
    contribution_after = density_of(data[split_index:]) * fraction_after

    assert np.allclose(expected, contribution_before + contribution_after)
def test_against_naive_KDE(data, bw):
    """
    Test the FFTKDE against a naive KDE without weights.
    """
    # Evaluate one bandwidth beyond the data on both sides.
    # Higher accuracy when the number of grid points gets larger.
    lower = min(data) - bw
    upper = max(data) + bw
    grid = np.linspace(lower, upper, num=2**10)

    # Same kernel, bandwidth and data for both implementations
    naive_estimate, fft_estimate = [
        estimator("epa", bw=bw).fit(data, weights=None).evaluate(grid)
        for estimator in (NaiveKDE, FFTKDE)
    ]

    # 1e-4 is the same value the literal 10e-5 denotes
    assert np.allclose(naive_estimate, fft_estimate, atol=1e-4)
def FFTKDE_test_grid_inside_data_2D():
    """
    When using a custom grid, an error should be raised if the data is not
    contained in the grid. The linear binning routine will crash if this is
    not the case.

    See Issue: https://github.com/tommyod/KDEpy/issues/7
    """
    # NOTE(review): the name does not start with `test_`, so pytest's default
    # discovery rules will not collect this function - confirm this is intended.
    corners = [[0, 0], [0, 1], [1, 0], [1, 1]]

    # Fit on the unit-square corners only to obtain an automatically built grid
    grid, _ = FFTKDE().fit(np.array(corners)).evaluate()

    # Adding a single point away from the corners must make that grid invalid
    for outlier in ([0, 6], [0, -4], [0, 100]):
        extended = np.array(corners + [outlier])
        with pytest.raises(ValueError):
            FFTKDE().fit(extended).evaluate(grid)
def FFTKDE_test_grid_inside_data_1D():
    """
    When using a custom grid, an error should be raised if the data is not
    contained in the grid. The linear binning routine will crash if this is
    not the case.

    See Issue: https://github.com/tommyod/KDEpy/issues/7
    """
    # NOTE(review): the name does not start with `test_`, so pytest's default
    # discovery rules will not collect this function - confirm this is intended.
    samples = np.array([0, 1, 2, 3, 4, 5])

    # A grid extending past the data on both sides should cause no problem
    enclosing_grid = np.linspace(-1, 6, num=2**6)
    FFTKDE().fit(samples).evaluate(enclosing_grid)

    # Grids cutting off the left side, cutting off the right side, and ending
    # exactly on the data extremes are all expected to be rejected
    for lower, upper in ((2, 6), (-2, 4), (0, 5)):
        bad_grid = np.linspace(lower, upper, num=2**6)
        with pytest.raises(ValueError):
            FFTKDE().fit(samples).evaluate(bad_grid)
def test_against_naive_KDE_w_weights(data, bw):
    """
    Test the FFTKDE against a naive KDE with weights.
    """
    # Evaluate one bandwidth beyond the data on both sides.
    # Higher accuracy when the number of grid points gets larger.
    lower = min(data) - bw
    upper = max(data) + bw
    grid = np.linspace(lower, upper, num=2**10)

    # Unnormalized, linearly increasing weights: 1, 2, ..., len(data)
    sample_weights = np.arange(len(data)) + 1

    naive_model = NaiveKDE('epa', bw=bw).fit(data, weights=sample_weights)
    naive_estimate = naive_model.evaluate(grid)

    fft_model = FFTKDE('epa', bw=bw).fit(data, weights=sample_weights)
    fft_estimate = fft_model.evaluate(grid)

    # 1e-3 is the same value the literal 10e-4 denotes
    assert np.allclose(naive_estimate, fft_estimate, atol=1e-3)
def _compute_kde(data, bw, weights, standardize, n_grid_points, return_pdf_at=None):
    """Compute a Gaussian FFT-based KDE and look up its value at given points.

    The KDE of `data` is evaluated on an equidistant grid, and every query
    point is assigned the density of the grid point closest to it along each
    dimension (nearest-neighbour lookup, no interpolation).

    NOTE: despite what earlier documentation of this function stated, the
    returned values are plain densities; no logarithm is taken here. Callers
    that need log densities must apply the log themselves.

    :param data: [numpy array] The data on which to fit the KDE. It is of
        shape [n_samples x n_dimensions].
    :param bw: The bandwidth parameter, forwarded unchanged to `FFTKDE`.
    :param weights: [numpy array or None] Per-sample weights. The array is
        flattened before being handed to `FFTKDE.fit`; None is forwarded
        as-is.
    :param standardize: [bool] Whether to standardize the data with a
        `StandardScaler` (centering and scaling enabled) fitted on `data`.
        The same fitted scaler is applied to `return_pdf_at`.
    :param n_grid_points: Forwarded as `grid_points` to `FFTKDE.evaluate`;
        controls the equidistant grid on which the KDE is evaluated.
    :param return_pdf_at: [numpy array] The points at which to look up the
        density, of shape [n_points x n_dimensions]. If None, a copy of
        `data` is used. In the context of leave-one-out KDE, it is the
        left-out sample.

    :return: [numpy array] 1-D array holding, for each row of
        `return_pdf_at`, the KDE value at its nearest grid point.
    """
    # Default to evaluating at the training samples themselves
    if return_pdf_at is None:
        return_pdf_at = data.copy()

    # Standardize data and query points with the same scaler, if requested
    if standardize:
        scaler = StandardScaler(with_mean=True, with_std=True, copy=True).fit(data)
        data = scaler.transform(data)
        return_pdf_at = scaler.transform(return_pdf_at)

    # Flatten the weights when given; leave None untouched
    flat_weights = None if weights is None else np.ravel(weights)

    # Estimate KDE over a grid
    kde_estimator = FFTKDE(kernel='gaussian', bw=bw).fit(data, weights=flat_weights)
    kde_grid, kde = kde_estimator.evaluate(grid_points=n_grid_points)

    # Make the grid 2-D so that the per-dimension indexing below also works
    # for one-dimensional data
    n_dims = data.shape[1]
    if n_dims == 1:
        kde_grid = kde_grid.reshape(-1, 1)

    # Unique (sorted) grid coordinates along each dimension, as column vectors
    grid_axes = [
        np.atleast_2d(np.unique(kde_grid[:, d])).T
        for d in range(n_dims)
    ]
    # Number of grid points along each dimension
    num_grid_points = [len(axis) for axis in grid_axes]

    # For every query point, the index of the nearest grid coordinate along
    # each dimension: a (n_grid x n_points) distance table reduced over axis 0
    nearest = tuple(
        np.abs(grid_axes[d] - np.atleast_2d(return_pdf_at[:, d])).argmin(axis=0)
        for d in range(n_dims)
    )

    # Convert the per-dimension indices into positions in the flat KDE array
    bin_indices = np.ravel_multi_index(multi_index=nearest, dims=num_grid_points)

    # Density (not log density) at those positions
    values = kde[bin_indices].ravel()

    return values
def kde(self, num=2**10):
    """Evaluate a default FFTKDE of `self.samples` on a `num` x `num` grid.

    The first two columns of the returned grid points and the estimated
    values are each reshaped to `(num, num)`.
    NOTE(review): assumes `self.samples` is two-dimensional data - confirm.

    :param num: Number of grid points requested along each of the two axes.
    :return: Tuple of three `(num, num)` arrays: first grid coordinate,
        second grid coordinate, and the KDE values.
    """
    shape = (num, num)
    grid_points, density = FFTKDE().fit(self.samples).evaluate(shape)

    first_coordinate = grid_points[:, 0].reshape(shape)
    second_coordinate = grid_points[:, 1].reshape(shape)
    return (first_coordinate, second_coordinate, density.reshape(shape))