def test_cython_binning(self, dims, use_weights, eq_grid):
    """
    Compare the fast N-dimensional linear binning against the naive Python
    implementation.

    The comparison is run with and without weights, and on grids that have
    either the same or a different number of grid points in each direction.
    """
    num_obs = 1000
    data = np.random.randn(dims * num_obs).reshape(num_obs, dims) / 7

    weights = np.random.randn(num_obs) if use_weights else None

    # Number of grid points along every dimension
    if eq_grid:
        grid_shape = (16,) * dims
    else:
        grid_shape = tuple(random.randint(8, 16) for _ in range(dims))

    grid_points = autogrid(np.array([[0] * dims]), num_points=grid_shape)

    fast = linear_binning(data, grid_points, weights=weights)
    reference = linbin_Ndim_python(data, grid_points, weights=weights)
    assert np.allclose(fast, reference)
def test_binning_correctness_single_point(self, dims):
    """
    Shift a single point with integer coordinates by a small `eps`, and make
    sure that the grid point lying (almost) on top of it is weighted highly
    by the linear binning.
    """
    eps = 10e-6
    for _ in range(25):
        data = np.random.randint(-2, 2, size=(1, dims)) - eps
        grid_points = autogrid(np.array([[0] * dims]), num_points=(7,) * dims)
        answer = linear_binning(data, grid_points)

        point = data.ravel()
        for grid_point, weight in zip(grid_points, answer):
            # Squared Euclidean distance from the grid point to the data point
            sq_dist = np.sum((grid_point - point) ** 2)
            if sq_dist < eps:
                # Only the grid point next to the data point is checked
                assert np.allclose(weight, (1 - eps) ** dims)
def test_binning_correctness_single_point(self, dims):
    """
    Create a single data point that is close to a grid point on an integer
    grid. Compute the linear binning, and test that the grid point that is
    close to the data point has a lot of weight assigned to it.
    """
    eps = 10e-6
    expected_weight = (1 - eps) ** dims

    for seed in range(25):
        # Seed per subtest, then draw integer coordinates from [-2, 2) and
        # move the point `eps` away from them
        np.random.seed(seed)
        data = np.random.randint(-2, 2, size=(1, dims)) - eps

        # Grid points [-3, -2, -1, 0, 1, 2, 3]^dims
        grid_points = autogrid(np.array([[0] * dims]), num_points=(7,) * dims)

        # Linear binning of the single data point onto the grid
        answer = linear_binning(data, grid_points)

        # The grid point where the data point is placed should receive
        # a large weight from the linear binning
        point = data.ravel()
        for grid_point, weight in zip(grid_points, answer):
            sq_dist = np.sum((grid_point - point) ** 2)
            if sq_dist < eps:
                assert np.allclose(weight, expected_weight)
def improved_sheather_jones(data):
    """
    The Improved Sheather Jones (ISJ) algorithm from the paper by Botev et al.
    This algorithm computes the optimal bandwidth for a gaussian kernel,
    and works very well for bimodal data (unlike other rules). The
    disadvantage of this algorithm is longer computation time, and the fact
    that this implementation does not always converge if very few data
    points are supplied.

    Understanding this algorithm is difficult, see:
    https://books.google.no/books?id=Trj9HQ7G8TUC&pg=PA328&lpg=PA328&dq=
    sheather+jones+why+use+dct&source=bl&ots=1ETdKd_6EF&sig=jZk4R515GB1xsn-
    VZVnjr-JfjSI&hl=en&sa=X&ved=2ahUKEwi1_czNncTcAhVGhqYKHaPiBtcQ6AEwA3oEC
    AcQAQ#v=onepage&q=sheather%20jones%20why%20use%20dct&f=false

    Parameters
    ----------
    data: array-like
        The data, of shape (obs, 1).

    Returns
    -------
    bandwidth: float
        The square root of the optimal `t`, scaled by the range of the data.

    Raises
    ------
    ValueError
        If the data does not have exactly one dimension (column).
    """
    _, dims = data.shape
    if dims != 1:
        raise ValueError('ISJ is only available for 1D data.')

    n = 2**10

    # Equidistant grid around the data, built before the data is flattened.
    # NOTE(review): the generous boundaries are presumably there to decrease
    # the chance of overflow - confirm
    xmesh = autogrid(data, boundary_abs=6, num_points=n, boundary_rel=0.5).ravel()
    data = data.ravel()

    # Range of the data, and the number of distinct observations
    R = np.max(data) - np.min(data)
    N = len(np.unique(data))

    # Use linear binning to bin the data on the equidistant grid, this is a
    # prerequisite for the transform below (evenly spaced samples)
    initial_data = linear_binning(data.reshape(-1, 1), xmesh)
    assert np.allclose(initial_data.sum(), 1)

    # Type 2 Discrete Cosine Transform (DCT) of the binned data
    a = fftpack.dct(initial_data)

    # Quantities entering the fixed point equation
    integers = np.arange(1, n, dtype=FLOAT)
    I_sq = np.power(integers, 2)
    a2 = a[1:] ** 2 / 4

    # Solve for the optimal (in the AMISE sense) t
    t_star = _root(_fixed_point, N, args=(N, I_sq, a2))

    # The remainder of the algorithm would compute the actual density
    # estimate (smoothing the binned data in the frequency domain and
    # transforming back). It is left out on purpose: this function is only
    # used to compute the bandwidth, since the bandwidth may be used for
    # other kernels apart from the Gaussian kernel
    return np.sqrt(t_star) * R
def evaluate(self, grid_points=None):
    """
    Evaluate on equidistant grid points.

    Parameters
    ----------
    grid_points: array-like, int, tuple or None
        A grid (mesh) to evaluate on. High dimensional grids must have shape
        (obs, dims). If an integer is passed, it's the number of grid points
        on an equidistant grid. If a tuple is passed, it's the number of grid
        points in each dimension. If None, a grid will be automatically
        created.

    Returns
    -------
    y: array-like
        If a grid is supplied, `y` is returned. If no grid is supplied,
        a tuple (`x`, `y`) is returned.

    Raises
    ------
    ValueError
        If the grid is not sorted, if the bandwidth is not a positive number,
        or if some data point is not strictly inside of the grid.

    Examples
    --------
    >>> kde = FFTKDE().fit([1, 3, 4, 7])
    >>> # Three ways to evaluate a fitted KDE object:
    >>> x, y = kde.evaluate()  # (1) Auto grid
    >>> x, y = kde.evaluate(256)  # (2) Auto grid with 256 points
    >>> # (3) Use a custom grid (make sure it's wider than the data)
    >>> x_grid = np.linspace(-10, 25, num=2**10)  # <- Must be equidistant
    >>> y = kde.evaluate(x_grid)  # Notice that only y is returned
    """
    # The parent method sets self.grid_points and verifies it
    super().evaluate(grid_points)

    # Extra verification for FFTKDE (checking the sorting property)
    if not grid_is_sorted(self.grid_points):
        raise ValueError("The grid must be sorted.")

    # Only positive numbers are accepted as bandwidth
    bandwidth = self.bw
    if not (isinstance(bandwidth, numbers.Number) and bandwidth > 0):
        raise ValueError("The bw must be a number.")
    self.bw = bandwidth

    # Step 0 - Make sure data points are inside of the grid
    min_grid = np.min(self.grid_points, axis=0)
    max_grid = np.max(self.grid_points, axis=0)
    min_data = np.min(self.data, axis=0)
    max_data = np.max(self.data, axis=0)
    below_ok = (min_grid < min_data).all()
    above_ok = (max_grid > max_data).all()
    if not (below_ok and above_ok):
        raise ValueError("Every data point must be inside of the grid.")

    # Step 1 - Obtaining the grid counts
    # TODO: Consider moving this to the fitting phase instead
    data = linear_binning(
        self.data, grid_points=self.grid_points, weights=self.weights
    )

    # Step 2 - Computing kernel weights
    num_dims = self.grid_points.shape[1]
    num_grid_points = np.array(
        [len(np.unique(self.grid_points[:, i])) for i in range(num_dims)]
    )
    num_intervals = num_grid_points - 1
    dx = (max_grid - min_grid) / num_intervals

    # Find the real bandwidth, the support times the desired bw factor
    if self.kernel.finite_support:
        real_bw = self.kernel.support * self.bw
    else:
        # The parent class should compute this already. If not, compute
        # it again. This optimization only dominates a little bit with
        # few data points
        try:
            real_bw = self._kernel_practical_support
        except AttributeError:
            real_bw = self.kernel.practical_support(self.bw)

    # Compute L, the number of dx'es to move out from 0 in kernel
    L = np.minimum(np.floor(real_bw / dx), num_intervals + 1)
    assert (dx * L <= real_bw).all()

    # Evaluate the kernel once, on a symmetric grid around the origin with
    # the same spacing as the evaluation grid
    grids = [
        np.linspace(-step * steps_out, step * steps_out, int(steps_out * 2 + 1))
        for step, steps_out in zip(dx, L)
    ]
    kernel_grid = cartesian(grids)
    kernel_weights = self.kernel(kernel_grid, bw=self.bw, norm=self.norm)

    # Reshape both arrays to N-dimensional form in preparation for the
    # convolution
    kernel_shape = [int(steps_out * 2 + 1) for steps_out in L]
    kernel_weights = kernel_weights.reshape(*kernel_shape)
    data = data.reshape(*num_grid_points)

    # Step 3 - Performing the convolution
    # Every warning raised inside this block is ignored. This was added to
    # suppress the following one:
    # anaconda3/lib/python3.6/site-packages/mkl_fft/_numpy_fft.py:
    # FutureWarning: Using a non-tuple sequence for multidimensional ...
    # output = mkl_fft.rfftn_numpy(a, s, axes)
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        ans = convolve(data, kernel_weights, mode="same").reshape(-1, 1)

    return self._evalate_return_logic(ans, self.grid_points)
def evaluate(self, grid_points=None):
    """
    Evaluate on equidistant grid points.

    Parameters
    ----------
    grid_points: array-like, int, tuple or None
        A grid (mesh) to evaluate on. High dimensional grids must have shape
        (obs, dims). If an integer is passed, it's the number of grid points
        on an equidistant grid. If a tuple is passed, it's the number of grid
        points in each dimension. If None, a grid will be automatically
        created.

    Returns
    -------
    y: array-like
        If a grid is supplied, `y` is returned. If no grid is supplied,
        a tuple (`x`, `y`) is returned.

    Raises
    ------
    ValueError
        If the bandwidth is neither a callable nor a positive number.

    Examples
    --------
    >>> kde = FFTKDE().fit([1, 3, 4, 7])
    >>> # Two ways to evaluate, either with a grid or without
    >>> x, y = kde.evaluate()
    >>> x, y = kde.evaluate(256)
    >>> y = kde.evaluate(x)
    """
    # The parent method sets self.grid_points and verifies it
    super().evaluate(grid_points)

    # Resolve the bandwidth: a callable is applied to the data, a positive
    # number is used directly. The result is stored back on the instance
    bw_spec = self.bw
    if callable(bw_spec):
        resolved_bw = bw_spec(self.data)
    elif isinstance(bw_spec, numbers.Number) and bw_spec > 0:
        resolved_bw = bw_spec
    else:
        raise ValueError('The bw must be a callable or a number.')
    self.bw = resolved_bw

    # Step 1 - Obtaining the grid counts
    data = linear_binning(
        self.data, grid_points=self.grid_points, weights=self.weights
    )

    # Step 2 - Computing kernel weights
    min_grid = np.min(self.grid_points, axis=0)
    max_grid = np.max(self.grid_points, axis=0)
    num_dims = self.grid_points.shape[1]
    num_grid_points = np.array(
        [len(np.unique(self.grid_points[:, i])) for i in range(num_dims)]
    )
    num_intervals = num_grid_points - 1
    dx = (max_grid - min_grid) / num_intervals

    # Find the real bandwidth, the support times the desired bw factor
    if self.kernel.finite_support:
        real_bw = self.kernel.support * self.bw
    else:
        # The parent class should compute this already. If not, compute
        # it again. This optimization only dominates a little bit with
        # few data points
        try:
            real_bw = self._kernel_practical_support
        except AttributeError:
            real_bw = self.kernel.practical_support(self.bw)

    # Compute L, the number of dx'es to move out from 0 in kernel
    L = np.minimum(np.floor(real_bw / dx), num_intervals + 1)
    assert (dx * L <= real_bw).all()

    # Evaluate the kernel once, on a symmetric grid around the origin with
    # the same spacing as the evaluation grid
    grids = [
        np.linspace(-step * steps_out, step * steps_out, int(steps_out * 2 + 1))
        for step, steps_out in zip(dx, L)
    ]
    kernel_grid = cartesian(grids)
    kernel_weights = self.kernel(kernel_grid, bw=self.bw, norm=self.norm)

    # Reshape both arrays to N-dimensional form in preparation for the
    # convolution
    kernel_shape = [int(steps_out * 2 + 1) for steps_out in L]
    kernel_weights = kernel_weights.reshape(*kernel_shape)
    data = data.reshape(*num_grid_points)

    # Step 3 - Performing the convolution
    evaluated = convolve(data, kernel_weights, mode='same').reshape(-1, 1)

    return self._evalate_return_logic(evaluated, self.grid_points)