Beispiel #1
0
 def KDE_statsmodels(data, kernel='gaussian'):
     """
     Evaluate a 2D statsmodels KDE on a fixed 64 x 64 grid over [-7, 7]^2.

     The input is turned into two variables: the data as a column, and the
     same column scaled by 0.5.

     Parameters
     ----------
     data : numpy.ndarray
         Observations; reshaped to a single column internally.
     kernel : str
         Not used by this function.

     Returns
     -------
     y : array-like
         The result of ``kde.pdf`` on the grid, with 64 * 64 entries.
     """
     column = data.reshape(-1, 1)
     samples = [column, column * 0.5]
     kde = sm.nonparametric.KDEMultivariate(samples, var_type='cc')

     # The same equidistant axis is used for both dimensions of the grid
     axis = np.linspace(-7, 7, num=64)
     grid = cartesian([axis, axis])

     y = kde.pdf(grid)
     assert len(y) == 64 * 64
     return y
Beispiel #2
0
 def KDE_sklearn(data, kernel='gaussian'):
     """
     Evaluate a 2D scikit-learn KDE on a fixed 64 x 64 grid over [-7, 7]^2.

     The model is fitted on two columns: the data, and the data scaled by 0.5.

     Parameters
     ----------
     data : numpy.ndarray
         Observations; reshaped to a single column internally.
     kernel : str
         Kernel name passed to ``KernelDensity``. The value 'epa' is
         translated to 'epanechnikov'.

     Returns
     -------
     y : numpy.ndarray
         Exponentiated ``score_samples`` values on the grid, with 64 * 64
         entries.
     """
     # 'epa' is accepted as an alias for the longer kernel name
     kernel_name = 'epanechnikov' if kernel == 'epa' else kernel

     # Instantiate the model, then fit it on the two-column data
     estimator = KernelDensity(bandwidth=1.0, kernel=kernel_name, rtol=1E-4)
     column = data.reshape(-1, 1)
     samples = np.concatenate((column, column * 0.5), axis=1)
     estimator.fit(samples)

     # score_samples returns the log of the probability density
     axis = np.linspace(-7, 7, num=64)
     grid = cartesian([axis, axis])
     y = np.exp(estimator.score_samples(grid))

     assert len(y) == 64 * 64
     return y
Beispiel #3
0
    def evaluate(self, grid_points=None):
        """
        Evaluate the estimate on equidistant grid points.

        Parameters
        ----------
        grid_points: array-like, int, tuple or None
            A grid (mesh) to evaluate on. High dimensional grids must have
            shape (obs, dims). If an integer is passed, it's the number of grid
            points on an equidistant grid. If a tuple is passed, it's the
            number of grid points in each dimension. If None, a grid will be
            automatically created.

        Returns
        -------
        y: array-like
            If a grid is supplied, `y` is returned. If no grid is supplied,
            a tuple (`x`, `y`) is returned.

        Raises
        ------
        ValueError
            If the grid is not sorted, if `bw` is not a positive number, or
            if the data does not lie strictly inside the grid.

        Examples
        --------
        >>> kde = FFTKDE().fit([1, 3, 4, 7])
        >>> # Three ways to evaluate a fitted KDE object:
        >>> x, y = kde.evaluate()  # (1) Auto grid
        >>> x, y = kde.evaluate(256)  # (2) Auto grid with 256 points
        >>> # (3) Use a custom grid (make sure it's wider than the data)
        >>> x_grid = np.linspace(-10, 25, num=2**10)  # <- Must be equidistant
        >>> y = kde.evaluate(x_grid)  # Notice that only y is returned
        """

        # The parent method sets self.grid_points and verifies it
        super().evaluate(grid_points)

        # Additional FFTKDE verification: the grid has to be sorted
        if not grid_is_sorted(self.grid_points):
            raise ValueError("The grid must be sorted.")

        # Only a strictly positive number is accepted as bandwidth
        bandwidth = self.bw
        if not (isinstance(bandwidth, numbers.Number) and bandwidth > 0):
            raise ValueError("The bw must be a number.")
        self.bw = bandwidth

        # Step 0 - The data must lie strictly inside the grid, per dimension
        grid_lo = np.min(self.grid_points, axis=0)
        grid_hi = np.max(self.grid_points, axis=0)
        data_lo = np.min(self.data, axis=0)
        data_hi = np.max(self.data, axis=0)

        data_inside_grid = ((grid_lo < data_lo).all()
                            and (grid_hi > data_hi).all())
        if not data_inside_grid:
            raise ValueError("Every data point must be inside of the grid.")

        # Step 1 - Obtain the grid counts via linear binning
        # TODO: Consider moving this to the fitting phase instead
        binned = linear_binning(self.data,
                                grid_points=self.grid_points,
                                weights=self.weights)

        # Step 2 - Compute the kernel weights
        # Count the distinct grid coordinates along every dimension
        n_dims = self.grid_points.shape[1]
        points_per_dim = np.array(
            [len(np.unique(self.grid_points[:, d])) for d in range(n_dims)])

        intervals_per_dim = points_per_dim - 1
        spacing = (grid_hi - grid_lo) / intervals_per_dim

        # The real bandwidth: the kernel support times the desired bw factor
        if self.kernel.finite_support:
            support_width = self.kernel.support * self.bw
        else:
            # Prefer the value stored on the instance; compute it only when
            # that attribute does not exist
            try:
                support_width = self._kernel_practical_support
            except AttributeError:
                support_width = self.kernel.practical_support(self.bw)

        # Number of grid steps to move out from 0 in the kernel, per dimension
        half_widths = np.minimum(np.floor(support_width / spacing),
                                 intervals_per_dim + 1)
        assert (spacing * half_widths <= support_width).all()

        # Evaluate the kernel once, on a grid centered at the origin
        kernel_axes = [
            np.linspace(-step * reach, step * reach, int(reach * 2 + 1))
            for step, reach in zip(spacing, half_widths)
        ]
        kernel_grid = cartesian(kernel_axes)
        kernel_weights = self.kernel(kernel_grid, bw=self.bw, norm=self.norm)

        # Reshape both operands to one axis per dimension for the convolution
        kernel_shape = [int(reach * 2 + 1) for reach in half_widths]
        kernel_weights = kernel_weights.reshape(*kernel_shape)
        binned = binned.reshape(*tuple(points_per_dim))

        # Step 3 - Perform the convolution

        # All warnings are silenced here. The original motivation was this one:
        #        anaconda3/lib/python3.6/site-packages/mkl_fft/_numpy_fft.py:
        #            FutureWarning: Using a non-tuple sequence for multidimensional ...
        #        output = mkl_fft.rfftn_numpy(a, s, axes)
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            convolved = convolve(binned, kernel_weights, mode="same")
            ans = convolved.reshape(-1, 1)

        return self._evalate_return_logic(ans, self.grid_points)
Beispiel #4
0
def linbin_Ndim(data, grid_points, weights=None):
    """
    d-dimensional linear binning, when d >= 2.

    With :math:`N` data points, and :math:`n` grid points in each dimension
    :math:`d`, the running time is :math:`O(N2^d)`. For each point the
    algorithm finds the nearest points, of which there are two in each
    dimension. Approximately 200 times faster than pure Python implementation.

    Parameters
    ----------
    data : array-like
        The data must be of shape (obs, dims).
    grid_points : array-like
        Grid, where cartesian product is already performed.
    weights : array-like
        Must have shape (obs,). The weights are normalized to sum to one.

    Returns
    -------
    result : numpy.ndarray
        One value per row of `grid_points`. The values sum to one.

    Raises
    ------
    ValueError
        If the length of `weights` does not match the number of observations,
        or if any input contains non-finite values.

    Examples
    --------
    >>> from KDEpy.utils import autogrid
    >>> grid_points = autogrid(np.array([[0, 0, 0]]), num_points=(3, 3, 3))
    >>> d = linbin_Ndim(np.array([[1.0, 0, 0]]), grid_points, None)
    """
    # Convert the data and grid points. The builtin `float` is used because
    # the `np.float` alias was removed in NumPy 1.24. Converting before the
    # shape is read lets nested sequences be passed as well.
    data = np.asarray_chkfinite(data, dtype=float)
    grid_points = np.asarray_chkfinite(grid_points, dtype=float)

    data_obs, data_dims = data.shape
    assert len(grid_points.shape) == 2
    assert data_dims >= 2

    if weights is not None:
        weights = np.asarray_chkfinite(weights, dtype=float)
        # Validate the length before normalizing
        if data_obs != len(weights):
            raise ValueError('Length of data must match length of weights.')
        weights = weights / np.sum(weights)

    obs_tot, dims = grid_points.shape

    # Compute the number of grid points for each dimension in the grid
    grid_num = np.array(
        [len(np.unique(grid_points[:, i])) for i in range(dims)])

    # Scale the data to the grid, so that one unit equals one grid spacing
    min_grid = np.min(grid_points, axis=0)
    max_grid = np.max(grid_points, axis=0)
    num_intervals = (grid_num - 1)
    dx = (max_grid - min_grid) / num_intervals
    data = (data - min_grid) / dx

    # Create results
    result = np.zeros(obs_tot, dtype=float)

    # Call the Cython implementation. Loops are unrolled if d=1 or d=2,
    # and if d >= 3 a more general routine is called. It's a bit slower since
    # the loops are not unrolled.
    if data_dims >= 3:
        # All 0/1 combinations, one flag per dimension, for the general routine
        binary_flgs = cartesian(([0, 1], ) * dims)

    # Weighted data has two specific routines
    if weights is not None:
        if data_dims >= 3:
            result = cutils.iterate_data_ND_weighted(data, weights, result,
                                                     grid_num, obs_tot,
                                                     binary_flgs)
        else:
            result = cutils.iterate_data_2D_weighted(data, weights, result,
                                                     grid_num, obs_tot)
        result = np.asarray_chkfinite(result, dtype=float)

    # Unweighted data has two specific routines too. This is because creating
    # uniform weights takes relatively long time. It's faster to have a
    # specialized routine for this case.
    else:
        if data_dims >= 3:
            result = cutils.iterate_data_ND(data, result, grid_num, obs_tot,
                                            binary_flgs)
        else:
            result = cutils.iterate_data_2D(data, result, grid_num, obs_tot)
        result = np.asarray_chkfinite(result, dtype=float)
        # The unweighted result is divided by the number of observations
        result = result / data_obs

    assert np.allclose(np.sum(result), 1)
    return result
Beispiel #5
0
def linbin_Ndim(data, grid_points, weights=None):
    """
    d-dimensional linear binning, for d >= 2.

    With :math:`N` data points, and :math:`n` grid points in each dimension
    :math:`d`, the running time is :math:`O(N2^d)`. For each point the
    algorithm finds the nearest points, of which there are two in each
    dimension.

    Approximately 200 times faster than pure python implementation.

    Parameters
    ----------
    data : array-like
        The data must be of shape (obs, dims).
    grid_points : array-like
        Grid, where cartesian product is already performed.
    weights : array-like
        Must have shape (obs,). The weights are normalized to sum to one.

    Returns
    -------
    result : numpy.ndarray
        One value per row of `grid_points`. The values sum to one.

    Raises
    ------
    ValueError
        If the length of `weights` does not match the number of observations,
        or if any input contains non-finite values.

    Examples
    --------
    >>> 1 + 1
    2
    """
    # Convert the data and grid points. `np.float` no longer exists in
    # NumPy >= 1.24, so the builtin `float` is used as dtype. The conversion
    # happens before the shape is read so that nested sequences work too.
    data = np.asarray_chkfinite(data, dtype=float)
    grid_points = np.asarray_chkfinite(grid_points, dtype=float)

    data_obs, data_dims = data.shape
    assert len(grid_points.shape) == 2
    assert data_dims >= 2

    if weights is not None:
        weights = np.asarray_chkfinite(weights, dtype=float)
        # Check the length first, then normalize
        if data_obs != len(weights):
            raise ValueError('Length of data must match length of weights.')
        weights = weights / np.sum(weights)

    obs_tot, dims = grid_points.shape

    # Compute the number of grid points for each dimension in the grid
    grid_num = np.array(
        [len(np.unique(grid_points[:, i])) for i in range(dims)])

    # Scale the data to the grid, so that one unit equals one grid spacing
    min_grid = np.min(grid_points, axis=0)
    max_grid = np.max(grid_points, axis=0)
    num_intervals = (grid_num - 1)
    dx = (max_grid - min_grid) / num_intervals
    data = (data - min_grid) / dx

    # Create results
    result = np.zeros(obs_tot, dtype=float)

    # The routines for three or more dimensions take every 0/1 combination,
    # one flag per dimension, as an extra argument
    if data_dims >= 3:
        binary_flgs = cartesian(([0, 1], ) * dims)

    # Call the Cython implementation, weighted or unweighted
    if weights is not None:
        if data_dims >= 3:
            result = cutils.iterate_data_ND_weighted(data, weights, result,
                                                     grid_num, obs_tot,
                                                     binary_flgs)
        else:
            result = cutils.iterate_data_2D_weighted(data, weights, result,
                                                     grid_num, obs_tot)
        result = np.asarray_chkfinite(result, dtype=float)
    else:
        if data_dims >= 3:
            result = cutils.iterate_data_ND(data, result, grid_num, obs_tot,
                                            binary_flgs)
        else:
            result = cutils.iterate_data_2D(data, result, grid_num, obs_tot)
        result = np.asarray_chkfinite(result, dtype=float)
        # The unweighted result is divided by the number of observations
        result = result / data_obs

    assert np.allclose(np.sum(result), 1)
    return result
Beispiel #6
0
    def evaluate(self, grid_points=None):
        """
        Evaluate the estimate on equidistant grid points.

        Parameters
        ----------
        grid_points: array-like, int, tuple or None
            A grid (mesh) to evaluate on. High dimensional grids must have
            shape (obs, dims). If an integer is passed, it's the number of grid
            points on an equidistant grid. If a tuple is passed, it's the
            number of grid points in each dimension. If None, a grid will be
            automatically created.

        Returns
        -------
        y: array-like
            If a grid is supplied, `y` is returned. If no grid is supplied,
            a tuple (`x`, `y`) is returned.

        Raises
        ------
        ValueError
            If `bw` is neither a callable nor a positive number.

        Examples
        --------
        >>> kde = FFTKDE().fit([1, 3, 4, 7])
        >>> # Two ways to evaluate, either with a grid or without
        >>> x, y = kde.evaluate()
        >>> x, y = kde.evaluate(256)
        >>> y = kde.evaluate(x)
        """

        # The parent method sets self.grid_points and verifies it
        super().evaluate(grid_points)

        # Resolve the bandwidth: a callable is applied to the data, anything
        # else must be a strictly positive number. The resolved value is
        # stored back on the instance.
        bandwidth = self.bw
        if callable(bandwidth):
            bandwidth = bandwidth(self.data)
        elif not (isinstance(bandwidth, numbers.Number) and bandwidth > 0):
            raise ValueError('The bw must be a callable or a number.')
        self.bw = bandwidth

        # Step 1 - Obtain the grid counts via linear binning
        binned = linear_binning(self.data,
                                grid_points=self.grid_points,
                                weights=self.weights)

        # Step 2 - Compute the kernel weights
        # Count the distinct grid coordinates along every dimension
        n_dims = self.grid_points.shape[1]
        points_per_dim = np.array(
            [len(np.unique(self.grid_points[:, d])) for d in range(n_dims)])

        grid_lo = np.min(self.grid_points, axis=0)
        grid_hi = np.max(self.grid_points, axis=0)
        intervals_per_dim = points_per_dim - 1
        spacing = (grid_hi - grid_lo) / intervals_per_dim

        # The real bandwidth: the kernel support times the desired bw factor
        if self.kernel.finite_support:
            support_width = self.kernel.support * self.bw
        else:
            # Prefer the value stored on the instance; compute it only when
            # that attribute does not exist
            try:
                support_width = self._kernel_practical_support
            except AttributeError:
                support_width = self.kernel.practical_support(self.bw)

        # Number of grid steps to move out from 0 in the kernel, per dimension
        half_widths = np.minimum(np.floor(support_width / spacing),
                                 intervals_per_dim + 1)
        assert (spacing * half_widths <= support_width).all()

        # Evaluate the kernel once, on a grid centered at the origin
        kernel_axes = [
            np.linspace(-step * reach, step * reach, int(reach * 2 + 1))
            for step, reach in zip(spacing, half_widths)
        ]
        kernel_grid = cartesian(kernel_axes)
        kernel_weights = self.kernel(kernel_grid, bw=self.bw, norm=self.norm)

        # Reshape both operands to one axis per dimension for the convolution
        kernel_shape = [int(reach * 2 + 1) for reach in half_widths]
        kernel_weights = kernel_weights.reshape(*kernel_shape)
        binned = binned.reshape(*tuple(points_per_dim))

        # Step 3 - Perform the convolution and flatten to a column
        convolved = convolve(binned, kernel_weights, mode='same')
        evaluated = convolved.reshape(-1, 1)
        return self._evalate_return_logic(evaluated, self.grid_points)