def loo_likelihood(self, bw, func=lambda x: x):
        r"""
        Returns the leave-one-out likelihood function.

        The leave-one-out likelihood function for the unconditional KDE.

        Parameters
        ----------
        bw: array_like
            The value for the bandwidth parameter(s).
        func: callable, optional
            Function to transform the likelihood values (before summing); for
            the log likelihood, use ``func=np.log``.  Default is ``f(x) = x``.

        Notes
        -----
        The leave-one-out kernel estimator of :math:`f_{-i}` is:

        .. math:: f_{-i}(X_{i})=\frac{1}{(n-1)h}
                    \sum_{j=1,j\neq i}K_{h}(X_{i},X_{j})

        where :math:`K_{h}` represents the generalized product kernel
        estimator:

        .. math:: K_{h}(X_{i},X_{j}) =
            \prod_{s=1}^{q}h_{s}^{-1}k\left(\frac{X_{is}-X_{js}}{h_{s}}\right)
        """
        LOO = LeaveOneOut(self.data)
        L = 0
        for i, X_not_i in enumerate(LOO):
            f_i = gpke(bw, data=-X_not_i, data_predict=-self.data[i, :],
                       var_type=self.var_type)
            L += func(f_i)

        return -L
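
The Notes above define the leave-one-out estimator only in symbols; the following is a minimal NumPy sketch of the same quantity for a single continuous variable with a Gaussian kernel. The helper name ``loo_log_likelihood_1d`` and the toy data are illustrative only, not part of the class this method belongs to.

import numpy as np

def loo_log_likelihood_1d(x, h):
    # f_{-i}(x_i) = 1/((n-1)h) * sum_{j != i} K((x_i - x_j) / h),
    # accumulated on the log scale (i.e. func=np.log in the method above).
    x = np.asarray(x, dtype=float)
    n = x.size
    u = (x[:, None] - x[None, :]) / h
    K = np.exp(-0.5 * u**2) / np.sqrt(2.0 * np.pi)
    np.fill_diagonal(K, 0.0)                 # drop the j == i term
    f_loo = K.sum(axis=1) / ((n - 1) * h)
    return np.log(f_loo).sum()

rng = np.random.default_rng(0)
sample = rng.normal(size=200)
# Maximum-likelihood cross-validation picks the h that maximizes this value;
# the method above returns the negative so it can be minimized directly.
print(loo_log_likelihood_1d(sample, h=0.4))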
    def cv_loo(self, bw, func):
        """
        The cross-validation function with leave-one-out
        estimator

        Parameters
        ----------
        bw: array_like
            Vector of bandwidth values
        func: callable
            Returns the estimator of g(x).
            Can be either ``_est_loc_constant`` (local constant) or
            ``_est_loc_linear`` (local linear).

        Returns
        -------
        L: float
            The value of the CV function

        Notes
        -----
        Calculates the cross-validation least-squares
        function. This function is minimized by compute_bw
        to calculate the optimal value of bw

        For details see p. 35 in [2].

        .. math:: CV(h)=n^{-1}\sum_{i=1}^{n}(Y_{i}-g_{-i}(X_{i}))^{2}

        where :math:`g_{-i}(X_{i})` is the leave-one-out estimator of g(X)
        and :math:`h` is the vector of bandwidths.

        """
        LOO_X = LeaveOneOut(self.exog)
        LOO_Y = LeaveOneOut(self.endog).__iter__()
        LOO_W = LeaveOneOut(self.W_in).__iter__()
        L = 0
        for ii, X_not_i in enumerate(LOO_X):
            Y = next(LOO_Y)
            w = next(LOO_W)
            G = func(bw, endog=Y, exog=-X_not_i,
                     data_predict=-self.exog[ii, :], W=w)[0]
            L += (self.endog[ii] - G) ** 2

        # Note: There might be a way to vectorize this. See p.72 in [1]
        return L / self.nobs
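
To make the CV(h) objective in the Notes concrete, here is a small NumPy sketch of the same criterion for a 1-D local-constant (Nadaraya-Watson) regression with a Gaussian kernel. The censoring weights ``W`` used by the method above are omitted, and ``cv_loo_nw`` is a made-up name for the example.

import numpy as np

def cv_loo_nw(y, x, h):
    # CV(h) = n^{-1} * sum_i (y_i - g_{-i}(x_i))^2, where g_{-i} is the
    # leave-one-out local-constant (Nadaraya-Watson) estimator.
    y = np.asarray(y, dtype=float)
    x = np.asarray(x, dtype=float)
    u = (x[:, None] - x[None, :]) / h
    K = np.exp(-0.5 * u**2)                  # unnormalized Gaussian kernel
    np.fill_diagonal(K, 0.0)                 # leave-one-out: drop j == i
    g_loo = K @ y / K.sum(axis=1)
    return np.mean((y - g_loo) ** 2)

rng = np.random.default_rng(1)
x = rng.uniform(-2, 2, 300)
y = np.sin(np.pi * x) + rng.normal(scale=0.3, size=x.size)
# Scan a few bandwidths by hand; compute_bw does this with an optimizer.
for h in (0.05, 0.15, 0.5):
    print(h, cv_loo_nw(y, x, h))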
    def loo_likelihood(self, bw, func=lambda x: x):
        """
        Returns the leave-one-out conditional likelihood of the data.

        If `func` is not equal to the default, what's calculated is a function
        of the leave-one-out conditional likelihood.

        Parameters
        ----------
        bw: array_like
            The bandwidth parameter(s).
        func: callable, optional
            Function to transform the likelihood values (before summing); for
            the log likelihood, use ``func=np.log``.  Default is ``f(x) = x``.

        Returns
        -------
        L: float
            The value of the leave-one-out function for the data.

        Notes
        -----
        Similar to ``KDE.loo_likelihood``, but substitutes
        ``f(y|x) = f(x, y) / f(x)`` for ``f(x)``.
        """
        yLOO = LeaveOneOut(self.data)
        xLOO = LeaveOneOut(self.exog).__iter__()
        L = 0
        for i, Y_j in enumerate(yLOO):
            X_not_i = next(xLOO)
            f_yx = gpke(bw,
                        data=-Y_j,
                        data_predict=-self.data[i, :],
                        var_type=(self.dep_type + self.indep_type))
            f_x = gpke(bw[self.k_dep:],
                       data=-X_not_i,
                       data_predict=-self.exog[i, :],
                       var_type=self.indep_type)
            f_i = f_yx / f_x
            L += func(f_i)

        return -L
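
To spell out the substitution ``f(y|x) = f(x, y) / f(x)``, the sketch below computes the leave-one-out conditional log-likelihood for one continuous ``y`` and one continuous ``x`` with Gaussian kernels. ``loo_conditional_loglik`` is a hypothetical helper, not the method above.

import numpy as np

def loo_conditional_loglik(y, x, bw_y, bw_x):
    y = np.asarray(y, dtype=float)
    x = np.asarray(x, dtype=float)
    n = x.size
    ky = np.exp(-0.5 * ((y[:, None] - y[None, :]) / bw_y) ** 2) / (np.sqrt(2 * np.pi) * bw_y)
    kx = np.exp(-0.5 * ((x[:, None] - x[None, :]) / bw_x) ** 2) / (np.sqrt(2 * np.pi) * bw_x)
    np.fill_diagonal(ky, 0.0)                # leave-one-out: drop j == i
    np.fill_diagonal(kx, 0.0)
    f_xy = (kx * ky).sum(axis=1) / (n - 1)   # joint density f_{-i}(x_i, y_i)
    f_x = kx.sum(axis=1) / (n - 1)           # marginal density f_{-i}(x_i)
    return np.log(f_xy / f_x).sum()          # sum of log f(y_i | x_i)

rng = np.random.default_rng(2)
x = rng.normal(size=250)
y = 2.0 * x + rng.normal(scale=0.5, size=250)
print(loo_conditional_loglik(y, x, bw_y=0.3, bw_x=0.3))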
    def cv_loo(self, bw, func):
        """
        The cross-validation function with leave-one-out estimator.

        Parameters
        ----------
        bw: array_like
            Vector of bandwidth values.
        func: callable
            Returns the estimator of g(x).  Can be either ``_est_loc_constant``
            (local constant) or ``_est_loc_linear`` (local linear).

        Returns
        -------
        L: float
            The value of the CV function.

        Notes
        -----
        Calculates the cross-validation least-squares function. This function
        is minimized by compute_bw to calculate the optimal value of `bw`.

        For details see p. 35 in [2].

        .. math:: CV(h)=n^{-1}\sum_{i=1}^{n}(Y_{i}-g_{-i}(X_{i}))^{2}

        where :math:`g_{-i}(X_{i})` is the leave-one-out estimator of g(X)
        and :math:`h` is the vector of bandwidths.

        """
        LOO_X = LeaveOneOut(self.exog)
        LOO_Y = LeaveOneOut(self.endog).__iter__()
        L = 0
        for ii, X_not_i in enumerate(LOO_X):
            Y = next(LOO_Y)
            G = func(bw, endog=Y, exog=-X_not_i,
                     data_predict=-self.exog[ii, :])[0]
            L += (self.endog[ii] - G) ** 2

        # Note: There might be a way to vectorize this. See p.72 in [1]
        return L / self.nobs
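
Assuming these methods come from statsmodels' nonparametric kernel regression code (the ``gpke``/``LeaveOneOut`` helpers and ``var_type`` strings match it), a usage sketch of the least-squares CV criterion might look like this; the toy data are made up.

import numpy as np
from statsmodels.nonparametric.kernel_regression import KernelReg

rng = np.random.default_rng(3)
x = rng.uniform(-2, 2, 200)
y = np.sin(np.pi * x) + rng.normal(scale=0.3, size=x.size)

# bw='cv_ls' has compute_bw minimize the cv_loo objective above;
# reg_type='lc' uses _est_loc_constant, 'll' uses _est_loc_linear.
model = KernelReg(endog=y, exog=x, var_type='c', reg_type='lc', bw='cv_ls')
print(model.bw)                # bandwidth chosen by least-squares CV
y_hat, marg_fx = model.fit()   # fitted conditional mean at the sample points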
    def imse(self, bw):
        r"""
        The integrated mean square error for the conditional KDE.

        Parameters
        ----------
        bw: array_like
            The bandwidth parameter(s).

        Returns
        -------
        CV: float
            The cross-validation objective function.

        Notes
        -----
        For more details see pp. 156-166 in [1].
        For details on how to handle the mixed variable types see [3].

        The formula for the cross-validation objective function for mixed
        variable types is:

        .. math:: CV(h,\lambda)=\frac{1}{n}\sum_{l=1}^{n}
            \frac{G_{-l}(X_{l})}{\left[\mu_{-l}(X_{l})\right]^{2}}-
            \frac{2}{n}\sum_{l=1}^{n}\frac{f_{-l}(X_{l},Y_{l})}{\mu_{-l}(X_{l})}

        where

        .. math:: G_{-l}(X_{l}) = n^{-2}\sum_{i\neq l}\sum_{j\neq l}
                        K_{X_{i},X_{l}} K_{X_{j},X_{l}}K_{Y_{i},Y_{j}}^{(2)}

        where :math:`K_{X_{i},X_{l}}` is the multivariate product kernel and
        :math:`\mu_{-l}(X_{l})` is the leave-one-out estimator of the pdf.

        :math:`K_{Y_{i},Y_{j}}^{(2)}` is the convolution kernel.

        The value of the function is minimized by the ``_cv_ls`` method of the
        `GenericKDE` class to return the bw estimates that minimize the
        distance between the estimated and "true" probability density.
        """
        zLOO = LeaveOneOut(self.data)
        CV = 0
        nobs = float(self.nobs)
        expander = np.ones((self.nobs - 1, 1))
        for ii, Z in enumerate(zLOO):
            X = Z[:, self.k_dep:]
            Y = Z[:, :self.k_dep]
            Ye_L = np.kron(Y, expander)
            Ye_R = np.kron(expander, Y)
            Xe_L = np.kron(X, expander)
            Xe_R = np.kron(expander, X)
            K_Xi_Xl = gpke(bw[self.k_dep:], data=Xe_L,
                           data_predict=self.exog[ii, :],
                           var_type=self.indep_type, tosum=False)
            K_Xj_Xl = gpke(bw[self.k_dep:], data=Xe_R,
                           data_predict=self.exog[ii, :],
                           var_type=self.indep_type, tosum=False)
            K2_Yi_Yj = gpke(bw[0:self.k_dep], data=Ye_L,
                            data_predict=Ye_R, var_type=self.dep_type,
                            ckertype='gauss_convolution',
                            okertype='wangryzin_convolution',
                            ukertype='aitchisonaitken_convolution',
                            tosum=False)
            G = (K_Xi_Xl * K_Xj_Xl * K2_Yi_Yj).sum() / nobs**2
            f_X_Y = gpke(bw, data=-Z, data_predict=-self.data[ii, :],
                         var_type=(self.dep_type + self.indep_type)) / nobs
            m_x = gpke(bw[self.k_dep:], data=-X,
                       data_predict=-self.exog[ii, :],
                       var_type=self.indep_type) / nobs
            CV += (G / m_x ** 2) - 2 * (f_X_Y / m_x)

        return CV / nobs
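
In the continuous case the convolution kernel :math:`K^{(2)}` has a simple closed form: convolving two standard Gaussian kernels gives the N(0, 2) density, which is what ``ckertype='gauss_convolution'`` requests above. The check below verifies this numerically; the helper names are illustrative.

import numpy as np
from scipy.integrate import quad

def gauss(u):
    # standard Gaussian kernel K(u)
    return np.exp(-0.5 * u**2) / np.sqrt(2.0 * np.pi)

def gauss_convolution(u):
    # closed form of K^{(2)}(u) = \int K(v) K(u - v) dv: the N(0, 2) density
    return np.exp(-u**2 / 4.0) / (2.0 * np.sqrt(np.pi))

u = 0.7
numeric, _ = quad(lambda v: gauss(v) * gauss(u - v), -np.inf, np.inf)
print(numeric, gauss_convolution(u))   # the two values agree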
    def imse(self, bw):
        r"""
        Returns the Integrated Mean Square Error for the unconditional KDE.

        Parameters
        ----------
        bw: array_like
            The bandwidth parameter(s).

        Returns
        -------
        CV: float
            The cross-validation objective function.

        Notes
        -----
        See p. 27 in [1].  For details on how to handle multivariate
        estimation with mixed data types, see p. 6 in [3].

        The formula for the cross-validation objective function is:

        .. math:: CV=\frac{1}{n^{2}}\sum_{i=1}^{n}\sum_{j=1}^{n}
            \bar{K}_{h}(X_{i},X_{j})-\frac{2}{n(n-1)}\sum_{i=1}^{n}
            \sum_{j=1,j\neq i}^{n}K_{h}(X_{i},X_{j})

        Where :math:`\bar{K}_{h}` is the multivariate product convolution
        kernel (consult [3] for mixed data types).
        """
        #F = 0
        #for i in range(self.nobs):
        #    k_bar_sum = gpke(bw, data=-self.data,
        #                     data_predict=-self.data[i, :],
        #                     var_type=self.var_type,
        #                     ckertype='gauss_convolution',
        #                     okertype='wangryzin_convolution',
        #                     ukertype='aitchisonaitken_convolution')
        #    F += k_bar_sum
        ## there is a + because loo_likelihood returns the negative
        #return (F / self.nobs**2 + self.loo_likelihood(bw) * \
        #        2 / ((self.nobs) * (self.nobs - 1)))

        # The code below is equivalent to the commented-out code above.  It's
        # about 20% faster due to some code being moved outside the for-loops
        # and shared by gpke() and loo_likelihood().
        F = 0
        kertypes = dict(c=kernels.gaussian_convolution,
                        o=kernels.wang_ryzin_convolution,
                        u=kernels.aitchison_aitken_convolution)
        nobs = self.nobs
        data = -self.data
        var_type = self.var_type
        ix_cont = np.array([c == 'c' for c in var_type])
        _bw_cont_product = bw[ix_cont].prod()
        Kval = np.empty(data.shape)
        for i in range(nobs):
            for ii, vtype in enumerate(var_type):
                Kval[:, ii] = kertypes[vtype](bw[ii],
                                              data[:, ii],
                                              data[i, ii])

            dens = Kval.prod(axis=1) / _bw_cont_product
            k_bar_sum = dens.sum(axis=0)
            F += k_bar_sum  # sum of prod kernel over nobs

        kertypes = dict(c=kernels.gaussian,
                        o=kernels.wang_ryzin,
                        u=kernels.aitchison_aitken)
        LOO = LeaveOneOut(self.data)
        L = 0   # leave-one-out likelihood
        Kval = np.empty((data.shape[0]-1, data.shape[1]))
        for i, X_not_i in enumerate(LOO):
            for ii, vtype in enumerate(var_type):
                Kval[:, ii] = kertypes[vtype](bw[ii],
                                              -X_not_i[:, ii],
                                              data[i, ii])
            dens = Kval.prod(axis=1) / _bw_cont_product
            L += dens.sum(axis=0)

        # CV objective function, eq. (2.4) of Ref. [3]
        return (F / nobs**2 - 2 * L / (nobs * (nobs - 1)))
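
Assuming this is statsmodels' ``KDEMultivariate``, the two cross-validation criteria in this section are selected through the ``bw`` argument; a short usage sketch with made-up data:

import numpy as np
from statsmodels.nonparametric.kernel_density import KDEMultivariate

rng = np.random.default_rng(4)
data = np.column_stack([rng.normal(size=200), rng.exponential(size=200)])

# bw='cv_ls' minimizes the imse objective above; bw='cv_ml' minimizes the
# negative leave-one-out log-likelihood from loo_likelihood instead.
kde_ls = KDEMultivariate(data=data, var_type='cc', bw='cv_ls')
kde_ml = KDEMultivariate(data=data, var_type='cc', bw='cv_ml')
print(kde_ls.bw, kde_ml.bw)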