Example #1
0
    def test_cython_binning(self, dims, use_weights, eq_grid):
        """
        Compare the fast N-dimensional linear binning against the naive
        Python implementation.

        The comparison is run with and without weights, and with either the
        same number of grid points in every direction or a random number of
        grid points per direction.
        """
        n_obs = 1000

        # Random data of shape (n_obs, dims), scaled down by a factor of 7
        data = np.random.randn(dims * n_obs).reshape(n_obs, dims) / 7

        # Optional random weights, one per observation
        weights = np.random.randn(n_obs) if use_weights else None

        # Number of grid points in each dimension
        if eq_grid:
            grid_shape = (16,) * dims
        else:
            grid_shape = tuple(random.randint(8, 16) for _ in range(dims))

        grid_points = autogrid(np.array([[0] * dims]), num_points=grid_shape)

        fast = linear_binning(data, grid_points, weights=weights)
        slow = linbin_Ndim_python(data, grid_points, weights=weights)

        assert np.allclose(fast, slow)
Example #2
0
 def test_binning_correctness_single_point(self, dims):
     """
     Perturb a single integer-valued point by a small amount and make sure
     that the grid point it was perturbed from is weighted highly by the
     linear binning.
     """
     eps = 10e-6
     grid_shape = (7,) * dims
     # Weight expected at the grid point next to the data point
     expected = (1 - eps)**dims

     for subtest in range(25):
         # Integer-valued point (randint excludes the upper bound), moved
         # down by eps in every coordinate
         data = np.random.randint(-2, 2, size=(1, dims)) - eps
         grid_points = autogrid(np.array([[0] * dims]),
                                num_points=grid_shape)
         answer = linear_binning(data, grid_points)

         point = data.ravel()
         for grid_point, weight in zip(grid_points, answer):
             # Only the grid point within eps (squared distance) of the
             # data point is checked
             squared_distance = np.sum((grid_point - point)**2)
             if squared_distance < eps:
                 assert np.allclose(weight, expected)
Example #3
0
    def test_binning_correctness_single_point(self, dims):
        """
        Place a single data point just below an integer-valued point,
        compute the linear binning on a 7-points-per-dimension grid, and
        check that the grid point closest to the data point receives the
        weight (1 - eps) ** dims.
        """
        eps = 10e-6
        grid_shape = (7,) * dims
        expected_weight = (1 - eps) ** dims

        for subtest in range(25):
            # Seed per subtest so that every subtest is reproducible
            np.random.seed(subtest)

            # Random integers in {-2, -1, 0, 1} (the upper bound of randint
            # is exclusive), shifted down by eps in every coordinate
            data = np.random.randint(-2, 2, size=(1, dims)) - eps

            # Grid centered on the origin with 7 points in each dimension
            origin = np.array([[0] * dims])
            grid_points = autogrid(origin, num_points=grid_shape)

            answer = linear_binning(data, grid_points)

            # Only the grid point whose squared distance to the data point
            # is below eps is checked for a large weight
            location = data.ravel()
            for grid_point, weight in zip(grid_points, answer):
                squared_distance = np.sum((grid_point - location) ** 2)
                if squared_distance < eps:
                    assert np.allclose(weight, expected_weight)
Example #4
0
def improved_sheather_jones(data):
    """
    Compute a bandwidth with the Improved Sheather Jones (ISJ) algorithm
    from the paper by Botev et al.

    This algorithm computes the optimal bandwidth for a gaussian kernel,
    and works very well for bimodal data (unlike other rules). The
    disadvantage of this algorithm is longer computation time, and the fact
    that this implementation does not always converge if very few data
    points are supplied.

    Understanding this algorithm is difficult, see:
    https://books.google.no/books?id=Trj9HQ7G8TUC&pg=PA328&lpg=PA328&dq=
    sheather+jones+why+use+dct&source=bl&ots=1ETdKd_6EF&sig=jZk4R515GB1xsn-
    VZVnjr-JfjSI&hl=en&sa=X&ved=2ahUKEwi1_czNncTcAhVGhqYKHaPiBtcQ6AEwA3oEC
    AcQAQ#v=onepage&q=sheather%20jones%20why%20use%20dct&f=false

    Parameters
    ----------
    data : array-like
        Two-dimensional array of shape (obs, 1).

    Returns
    -------
    bandwidth
        The square root of the solved `t_star`, multiplied by the range
        (max - min) of the data.

    Raises
    ------
    ValueError
        If the second dimension of `data` is not 1.
    """
    _, dims = data.shape
    if dims != 1:
        raise ValueError('ISJ is only available for 1D data.')

    # Number of points on the equidistant grid
    n = 2**10
    xmesh = autogrid(data, boundary_abs=6, num_points=n,
                     boundary_rel=0.5).ravel()

    flat = data.ravel()
    data_range = np.max(flat) - np.min(flat)
    num_unique = len(np.unique(flat))

    # The data must be binned on an equidistant grid before a DCT can be
    # applied (evenly spaced samples)
    binned = linear_binning(flat.reshape(-1, 1), xmesh)
    assert np.allclose(binned.sum(), 1)

    # Type 2 Discrete Cosine Transform (DCT) of the binned data
    dct_coeffs = fftpack.dct(binned)

    # Quantities entering the fixed point equation
    I_sq = np.power(np.arange(1, n, dtype=FLOAT), 2)
    a2 = dct_coeffs[1:]**2 / 4

    # Solve for the optimal (in the AMISE sense) t
    t_star = _root(_fixed_point, num_unique, args=(num_unique, I_sq, a2))

    # The full algorithm would continue by smoothing the binned data to get
    # a density estimate. Only the bandwidth is computed here, since the
    # bandwidth may be used for other kernels apart from the Gaussian kernel.
    return np.sqrt(t_star) * data_range
Example #5
0
    def evaluate(self, grid_points=None):
        """
        Evaluate on equidistant grid points.

        The data is binned linearly onto the grid, the kernel is evaluated
        once on a small grid around the origin, and the two are convolved.

        Parameters
        ----------
        grid_points: array-like, int, tuple or None
            A grid (mesh) to evaluate on. High dimensional grids must have
            shape (obs, dims). If an integer is passed, it's the number of grid
            points on an equidistant grid. If a tuple is passed, it's the
            number of grid points in each dimension. If None, a grid will be
            automatically created.

        Returns
        -------
        y: array-like
            If a grid is supplied, `y` is returned. If no grid is supplied,
            a tuple (`x`, `y`) is returned.

        Raises
        ------
        ValueError
            If the grid is not sorted, if `bw` is not a positive number, or
            if some data point is not strictly inside of the grid.

        Examples
        --------
        >>> kde = FFTKDE().fit([1, 3, 4, 7])
        >>> # Three ways to evaluate a fitted KDE object:
        >>> x, y = kde.evaluate()  # (1) Auto grid
        >>> x, y = kde.evaluate(256)  # (2) Auto grid with 256 points
        >>> # (3) Use a custom grid (make sure it's wider than the data)
        >>> x_grid = np.linspace(-10, 25, num=2**10)  # <- Must be equidistant
        >>> y = kde.evaluate(x_grid)  # Notice that only y is returned
        """

        # The parent method sets self.grid_points and verifies it
        super().evaluate(grid_points)

        # FFTKDE additionally requires the grid to be sorted
        if not grid_is_sorted(self.grid_points):
            raise ValueError("The grid must be sorted.")

        # The bandwidth must be a positive number at this point
        bw = self.bw
        if not (isinstance(bw, numbers.Number) and bw > 0):
            raise ValueError("The bw must be a number.")
        self.bw = bw

        # Step 0 - Every data point must lie strictly inside of the grid
        min_grid = np.min(self.grid_points, axis=0)
        max_grid = np.max(self.grid_points, axis=0)
        min_data = np.min(self.data, axis=0)
        max_data = np.max(self.data, axis=0)

        lower_ok = (min_grid < min_data).all()
        if not lower_ok or not (max_grid > max_data).all():
            raise ValueError("Every data point must be inside of the grid.")

        # Step 1 - Obtain the grid counts by linear binning
        # TODO: Consider moving this to the fitting phase instead
        binned = linear_binning(self.data,
                                grid_points=self.grid_points,
                                weights=self.weights)

        # Step 2 - Compute the kernel weights
        # Number of unique grid values along each dimension
        num_dims = self.grid_points.shape[1]
        num_grid_points = np.array([
            len(np.unique(self.grid_points[:, i])) for i in range(num_dims)
        ])

        # Grid spacing along each dimension
        num_intervals = num_grid_points - 1
        dx = (max_grid - min_grid) / num_intervals

        # Find the real bandwidth, the support times the desired bw factor
        if self.kernel.finite_support:
            real_bw = self.kernel.support * self.bw
        else:
            # The parent class should have computed this already. If the
            # attribute is missing, compute it here instead.
            try:
                real_bw = self._kernel_practical_support
            except AttributeError:
                real_bw = self.kernel.practical_support(self.bw)

        # L is the number of dx'es to move out from 0 in the kernel,
        # capped at the number of grid points in each dimension
        L = np.minimum(np.floor(real_bw / dx), num_intervals + 1)
        assert (dx * L <= real_bw).all()

        # Evaluate the kernel once, on a symmetric grid around the origin
        # with 2 * L + 1 points along each dimension
        grids = [
            np.linspace(-step * reach, step * reach, int(reach * 2 + 1))
            for step, reach in zip(dx, L)
        ]
        kernel_grid = cartesian(grids)
        kernel_weights = self.kernel(kernel_grid, bw=self.bw, norm=self.norm)

        # Reshape both arrays to N-dimensional grids before convolving
        kernel_shape = [int(reach * 2 + 1) for reach in L]
        kernel_weights = kernel_weights.reshape(*kernel_shape)
        binned = binned.reshape(*tuple(num_grid_points))

        # Step 3 - Perform the convolution

        # Warnings are suppressed here because of the following one:
        #        anaconda3/lib/python3.6/site-packages/mkl_fft/_numpy_fft.py:
        #            FutureWarning: Using a non-tuple sequence for multidimensional ...
        #        output = mkl_fft.rfftn_numpy(a, s, axes)
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            convolved = convolve(binned, kernel_weights, mode="same")
            ans = convolved.reshape(-1, 1)

        return self._evalate_return_logic(ans, self.grid_points)
Example #6
0
    def evaluate(self, grid_points=None):
        """
        Evaluate on equidistant grid points.

        The data is binned linearly onto the grid, the kernel is evaluated
        once on a small grid around the origin, and the two are convolved.

        Parameters
        ----------
        grid_points: array-like, int, tuple or None
            A grid (mesh) to evaluate on. High dimensional grids must have
            shape (obs, dims). If an integer is passed, it's the number of grid
            points on an equidistant grid. If a tuple is passed, it's the
            number of grid points in each dimension. If None, a grid will be
            automatically created.

        Returns
        -------
        y: array-like
            If a grid is supplied, `y` is returned. If no grid is supplied,
            a tuple (`x`, `y`) is returned.

        Raises
        ------
        ValueError
            If `bw` is neither a callable nor a positive number.

        Examples
        --------
        >>> kde = FFTKDE().fit([1, 3, 4, 7])
        >>> # Two ways to evaluate, either with a grid or without
        >>> x, y = kde.evaluate()
        >>> x, y = kde.evaluate(256)
        >>> y = kde.evaluate(x)
        """

        # The parent method sets self.grid_points and verifies it
        super().evaluate(grid_points)

        # Resolve the bandwidth: a callable is applied to the data, while a
        # positive number is used as it is. The result replaces self.bw.
        bw = self.bw
        if callable(bw):
            bw = bw(self.data)
        elif not (isinstance(bw, numbers.Number) and bw > 0):
            raise ValueError('The bw must be a callable or a number.')
        self.bw = bw

        # Step 1 - Obtain the grid counts by linear binning
        binned = linear_binning(self.data,
                                grid_points=self.grid_points,
                                weights=self.weights)

        # Step 2 - Compute the kernel weights
        # Number of unique grid values along each dimension
        num_dims = self.grid_points.shape[1]
        num_grid_points = np.array([
            len(np.unique(self.grid_points[:, i])) for i in range(num_dims)
        ])

        # Grid spacing along each dimension
        min_grid = np.min(self.grid_points, axis=0)
        max_grid = np.max(self.grid_points, axis=0)
        num_intervals = num_grid_points - 1
        dx = (max_grid - min_grid) / num_intervals

        # Find the real bandwidth, the support times the desired bw factor
        if self.kernel.finite_support:
            real_bw = self.kernel.support * self.bw
        else:
            # The parent class should have computed this already. If the
            # attribute is missing, compute it here instead.
            try:
                real_bw = self._kernel_practical_support
            except AttributeError:
                real_bw = self.kernel.practical_support(self.bw)

        # L is the number of dx'es to move out from 0 in the kernel,
        # capped at the number of grid points in each dimension
        L = np.minimum(np.floor(real_bw / dx), num_intervals + 1)
        assert (dx * L <= real_bw).all()

        # Evaluate the kernel once, on a symmetric grid around the origin
        # with 2 * L + 1 points along each dimension
        grids = [
            np.linspace(-step * reach, step * reach, int(reach * 2 + 1))
            for step, reach in zip(dx, L)
        ]
        kernel_grid = cartesian(grids)
        kernel_weights = self.kernel(kernel_grid, bw=self.bw, norm=self.norm)

        # Reshape both arrays to N-dimensional grids before convolving
        kernel_shape = [int(reach * 2 + 1) for reach in L]
        kernel_weights = kernel_weights.reshape(*kernel_shape)
        binned = binned.reshape(*tuple(num_grid_points))

        # Step 3 - Perform the convolution
        convolved = convolve(binned, kernel_weights, mode='same')
        evaluated = convolved.reshape(-1, 1)
        return self._evalate_return_logic(evaluated, self.grid_points)