Example #1
    def test_cython_binning(self, dims, use_weights, eq_grid):
        """
        Test the fast N-dimensional binning against the naive Python
        implementation - with weights, without weights, and with equal and
        unequal numbers of grid points in each direction.
        """
        num_points = 1000
        data = np.random.randn(dims * num_points).reshape(num_points, dims) / 7

        if use_weights:
            weights = np.random.randn(num_points)
        else:
            weights = None

        # Number of grid points per dimension (the name `num_points` is re-used)
        if eq_grid:
            num_points = (16,) * dims
        else:
            num_points = tuple(random.randint(8, 16) for i in range(dims))

        grid_points = autogrid(np.array([[0] * dims]), num_points=num_points)

        # The fast Cython implementation and the naive Python implementation
        # must agree on every grid point
        result = linear_binning(data, grid_points, weights=weights)
        result_slow = linbin_Ndim_python(data, grid_points, weights=weights)

        assert np.allclose(result, result_slow)
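For intuition about what `linear_binning` computes: each data point's unit of mass is split among its 2**dims neighboring grid points, in proportion to proximity. Below is a minimal 1-D sketch, assuming a sorted, equidistant grid that strictly covers the data; `linear_binning_1d_naive` is a hypothetical name for illustration, not a KDEpy function:

import numpy as np

def linear_binning_1d_naive(data, grid_points):
    """Assign each data point's mass linearly to its two nearest grid points."""
    dx = grid_points[1] - grid_points[0]
    weights = np.zeros_like(grid_points, dtype=float)
    for x in data:
        # Index of the grid point immediately to the left of x
        i = int(np.floor((x - grid_points[0]) / dx))
        frac = (x - grid_points[i]) / dx  # Fractional distance, in [0, 1)
        weights[i] += 1 - frac  # The closer grid point receives more mass
        weights[i + 1] += frac
    return weights / weights.sum()  # KDEpy normalizes the result to sum to 1

grid = np.linspace(-3, 3, num=7)  # [-3, -2, ..., 3]
print(linear_binning_1d_naive(np.array([0.25]), grid))
# Grid points 0 and 1 receive weights 0.75 and 0.25; all others receive 0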
Example #2
File: BaseKDE.py Project: bz2/KDEpy
    def evaluate(self, grid_points=None, bw_to_scalar=True):
        """
        Evaluate the kernel density estimator on the grid points.

        Parameters
        ----------
        grid_points : integer, tuple or array-like
            If an integer, the number of equidistant grid points in every
            dimension. If a tuple, the number of grid points in each
            dimension. If array-like, grid points of shape (obs, dims).
        """
        if not hasattr(self, 'data'):
            raise ValueError('Must call fit before evaluating.')

        # -------------- Set up the bandwidth depending on inputs -------------
        if isinstance(self.bw, (np.ndarray, Sequence)):
            if bw_to_scalar:
                bw = max(self.bw)
            else:
                bw = self.bw
        elif callable(self.bw):
            bw = self.bw(self.data)
        else:
            bw = self.bw
        self.bw = bw

        # -------------- Set up the grid depending on input -------------------
        # If the grid is None, an integer, or a tuple, use the autogrid method
        types_for_autogrid = (numbers.Integral, tuple)
        if grid_points is None or isinstance(grid_points, types_for_autogrid):
            self._user_supplied_grid = False
            bw_grid = self.kernel.practical_support(bw)
            grid_points = autogrid(self.data, bw_grid, grid_points)
            # Set it here, so as not to call kernel.practical_support(bw) again
            self._kernel_practical_support = bw_grid
        else:
            self._user_supplied_grid = True
            grid_points = self._process_sequence(grid_points)

        obs, dims = grid_points.shape
        if not obs > 0:
            raise ValueError('Grid must contain at least one data point.')
        self.grid_points = grid_points

        # Quickly test that the method has done what it was supposed to do
        if bw_to_scalar:
            assert isinstance(self.bw, numbers.Number)
            assert self.bw > 0
        assert len(self.grid_points.shape) == 2
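The three accepted forms of `grid_points` map onto three call styles. A sketch of typical usage through KDEpy's public FFTKDE class, assuming the fit/evaluate chaining shown in the project README; the data here is made up. When the grid is auto-generated, evaluate returns both the grid and the density; when a grid is supplied, it returns only the density:

import numpy as np
from KDEpy import FFTKDE

np.random.seed(123)
data = np.random.randn(100)

# Integer: 2**6 equidistant grid points in every dimension, grid auto-generated
x, y = FFTKDE(kernel='gaussian', bw='silverman').fit(data).evaluate(2**6)

# Array-like: a user-supplied equidistant grid covering the data
grid = np.linspace(-5, 5, num=2**6)
y = FFTKDE(kernel='gaussian', bw='silverman').fit(data).evaluate(grid)

# Tuple: per-dimension grid point counts, e.g. (32, 32) for 2-D data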
Example #3
    def test_binning_correctness_single_point(self, dims):
        """
        Perturb a single grid point and make sure that the same point is
        weighted highly.
        """
        eps = 10e-6
        for subtest in range(25):
            data = np.random.randint(-2, 2, size=(1, dims)) - eps
            grid_points = autogrid(np.array([[0] * dims]),
                                   num_points=(7,) * dims)
            answer = linear_binning(data, grid_points)

            for grid_point, a in zip(grid_points, answer):
                diff = np.sum((grid_point - data.ravel())**2)
                if diff < eps:
                    assert np.allclose(a, (1 - eps)**dims)
Example #4
    def test_binning_correctness_single_point(self, dims):
        """
        Create a single data point that is close to a grid point on an integer
        grid. Compute linear binning, and test that the grid point that is
        close to the data point has a lot of weight assigned to it.
        """

        eps = 10e-6
        for subtest in range(25):
            # Create a data point just below a random integer in [-2, 2)
            np.random.seed(subtest)
            data = np.random.randint(-2, 2, size=(1, dims)) - eps
            # Create grid points [-3, -2, -1, 0, 1, 2, 3]^dims
            grid_points = autogrid(np.array([[0] * dims]), num_points=(7,) * dims)
            # Compute the linear binning
            answer = linear_binning(data, grid_points)

            # At the grid point where data point is placed,
            # a large weight should be placed by the linear binning
            for grid_point, a in zip(grid_points, answer):
                diff = np.sum((grid_point - data.ravel()) ** 2)
                if diff < eps:
                    assert np.allclose(a, (1 - eps) ** dims)
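Why the expected weight is (1 - eps)**dims: in each dimension the data coordinate lies eps below a grid coordinate, linear binning splits that dimension's mass as 1 - eps to the near grid point and eps to the far one, and the per-dimension weights multiply. A hand check of the 1-D split, under that convention:

import numpy as np

eps = 10e-6
grid = np.arange(-3, 4, dtype=float)  # [-3, -2, ..., 3], spacing 1
x = 1 - eps  # A data point just below the grid point at 1

# Linear binning splits x's mass between its neighbors 0 and 1
frac = x - 0.0  # Fractional distance from the left neighbor
print(1 - frac, frac)  # eps goes to grid point 0, 1 - eps to grid point 1
# In dims dimensions the per-dimension weights multiply: (1 - eps) ** dims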
Example #5
def improved_sheather_jones(data):
    """
    The Improved Sheather-Jones (ISJ) algorithm from the paper by Botev et al.
    This algorithm computes the optimal bandwidth for a Gaussian kernel,
    and works very well for bimodal data (unlike other rules). The
    disadvantage of this algorithm is longer computation time, and the fact
    that this implementation does not always converge if very few data
    points are supplied.

    Understanding this algorithm is difficult, see:
    https://books.google.no/books?id=Trj9HQ7G8TUC&pg=PA328&lpg=PA328&dq=
    sheather+jones+why+use+dct&source=bl&ots=1ETdKd_6EF&sig=jZk4R515GB1xsn-
    VZVnjr-JfjSI&hl=en&sa=X&ved=2ahUKEwi1_czNncTcAhVGhqYKHaPiBtcQ6AEwA3oEC
    AcQAQ#v=onepage&q=sheather%20jones%20why%20use%20dct&f=false
    """
    obs, dims = data.shape
    if dims != 1:
        raise ValueError('ISJ is only available for 1D data.')

    n = 2**10
    # Generous grid boundaries decrease the chance of overflow
    xmesh = autogrid(data, boundary_abs=6, num_points=n, boundary_rel=0.5)
    data = data.ravel()
    xmesh = xmesh.ravel()

    # The range of the data, used to scale the optimal t back to a bandwidth
    R = np.max(data) - np.min(data)
    # dx = R / (n - 1)
    N = len(np.unique(data))

    # Use linear binning to bin the data on an equidistant grid, this is a
    # prerequisite for using the FFT (evenly spaced samples)
    initial_data = linear_binning(data.reshape(-1, 1), xmesh)
    assert np.allclose(initial_data.sum(), 1)

    # Compute the type 2 Discrete Cosine Transform (DCT) of the data
    a = fftpack.dct(initial_data)

    # Compute the bandwidth
    I_sq = np.power(np.arange(1, n, dtype=FLOAT), 2)
    a2 = a[1:]**2 / 4

    # Solve for the optimal (in the AMISE sense) t
    t_star = _root(_fixed_point, N, args=(N, I_sq, a2))

    # The remainder of the algorithm computes the actual density
    # estimate, but this function is only used to compute the
    # bandwidth, since the bandwidth may be used for other kernels
    # apart from the Gaussian kernel

    # Smooth the initial data using the computed optimal t
    # Multiplication in frequency domain is convolution
    # integers = np.arange(n, dtype=np.float)
    # a_t = a * np.exp(-integers**2 * np.pi ** 2 * t_star / 2)

    # Dividing by 2 is done because of the implementation of fftpack.idct
    # density = fftpack.idct(a_t) / (2 * R)

    # Due to overflow, some values might be smaller than zero, correct it
    # density[density < 0] = 0.
    bandwidth = np.sqrt(t_star) * R
    return bandwidth
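A usage sketch: calling the function directly on bimodal data, which is where ISJ shines relative to simpler rules. It assumes the function is importable and, per the shape unpacking above, that data arrives with shape (obs, 1):

import numpy as np

# Bimodal data: a mixture of two well-separated Gaussians
np.random.seed(0)
data = np.concatenate([np.random.randn(500) - 3, np.random.randn(500) + 3])

bw = improved_sheather_jones(data.reshape(-1, 1))
print(bw)  # The AMISE-optimal bandwidth for a Gaussian kernel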