Example 1
def mad(a, c=Gaussian.ppf(3 / 4.), axis=0, center=np.median):
    # c \approx .6745
    """
    The Median Absolute Deviation along a given axis of an array

    Parameters
    ----------
    a : array-like
        Input array.
    c : float, optional
        The normalization constant.  Defined as scipy.stats.norm.ppf(3/4.),
        which is approximately .6745.
    axis : int, optional
        The default is 0. Can also be None.
    center : callable or float
        If a callable is provided, such as the default `np.median`, then it
        is expected to be called center(a). The axis argument will be applied
        via np.apply_over_axes. Otherwise, provide a float.

    Returns
    -------
    mad : float
        `mad` = median(abs(`a` - center))/`c`
    """
    a = np.asarray(a)
    if callable(center):
        center = np.apply_over_axes(center, a, axis)
    return np.median((np.fabs(a - center)) / c, axis=axis)
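A minimal usage sketch (assuming, as the default argument suggests, that `Gaussian` is `scipy.stats.norm` and `np` is numpy): the constant `c` makes the estimate consistent with the standard deviation for Gaussian data, so one outlier barely moves it.

# Usage sketch; assumes Gaussian = scipy.stats.norm, as the default
# argument suggests.
import numpy as np
from scipy.stats import norm as Gaussian

x = np.array([1.0, 2.0, 3.0, 4.0, 100.0])   # one gross outlier
scale = np.median(np.fabs(x - np.median(x))) / Gaussian.ppf(3 / 4.)
print(scale)   # ~1.4826, while np.std(x) is ~39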
Example 2
    def __init__(self, shape, h=1.0, domain=None, norm=None):
        """
        shape should be a function taking and returning a numeric type.

        For sanity it should always return a positive or zero value, but this
        isn't enforced, in case you want to do weird things. Bear in mind that
        the statistical tests etc. may not be valid for non-positive kernels.

        The bandwidth of the kernel is supplied as h.

        You may specify a domain as a list of 2 values [min, max], in which
        case the kernel will be treated as zero outside these values. This
        will speed up calculation.

        You may also specify the normalisation constant for the supplied
        kernel. If you do, this number will be stored and used as the
        normalisation without calculation. It is recommended you do this if
        you know the constant, to speed up calculation. In particular, if the
        shape function provided is already normalised, you should provide
        norm = 1.0.

        Warning: several calculations appear to assume that the kernel is
        normalized; there are no tests for non-normalized kernels.
        """
        self._normconst = norm   # a value or None, if None, then calculate
        self.domain = domain
        self.weights = None
        if callable(shape):
            self._shape = shape
        else:
            raise TypeError("shape must be a callable object/function")
        self._h = h
        self._L2Norm = None
        self._kernel_var = None
        self._normal_reference_constant = None
        self._order = None
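The docstring says the normalisation constant is calculated when `norm` is None; a sketch of what that calculation could look like (numerical integration via `scipy.integrate.quad` is an assumption here, not necessarily what the class does):

# Sketch only: computing a normalisation constant for an unnormalised
# shape over its domain. quad() is an assumed approach, not the
# class's actual method.
import numpy as np
from scipy.integrate import quad

shape = lambda x: np.exp(-x ** 2 / 2.)      # unnormalised Gaussian shape
domain = [-4.0, 4.0]
norm, _ = quad(shape, domain[0], domain[1])
print(norm)   # ~2.5066, i.e. sqrt(2*pi)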
Example 3
def date_parser(timestr, parserinfo=None, **kwargs):
    """
    Uses dateutil.parser.parse, but also handles monthly dates of the form
    1999m4, 1999:m4, 1999:mIV, 1999mIV and the same for quarterly data
    with q instead of m. It is not case sensitive. The default for annual
    data is the end of the year, which also differs from dateutil.
    """
    flags = re.IGNORECASE | re.VERBOSE
    if re.search(_q_pattern, timestr, flags):
        y, q = timestr.replace(":", "").lower().split('q')
        month, day = _quarter_to_day[q.upper()]
        year = int(y)
    elif re.search(_m_pattern, timestr, flags):
        y, m = timestr.replace(":", "").lower().split('m')
        month, day = _month_to_day[m.upper()]
        year = int(y)
        if _is_leap(y) and month == 2:
            day += 1
    elif re.search(_y_pattern, timestr, flags):
        month, day = 12, 31
        year = int(timestr)
    else:
        if (hasattr(pandas_datetools, 'parser')
                and not callable(pandas_datetools.parser)):
            # exists in 0.8.0 pandas, but it's the class not the module
            return pandas_datetools.parser.parse(timestr, parserinfo, **kwargs)
        else:  # 0.8.1 pandas version didn't import this into namespace
            from dateutil import parser
            return parser.parse(timestr, parserinfo, **kwargs)

    return datetime.datetime(year, month, day)
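`_quarter_to_day` itself is not shown; here is a hypothetical reconstruction of the quarterly branch, assuming quarterly dates resolve to quarter ends in line with the end-of-year convention the docstring states for annual data:

# Hypothetical reconstruction of the quarterly branch; the real
# _quarter_to_day mapping is not shown in the source.
import datetime

_quarter_to_day = {'1': (3, 31), '2': (6, 30), '3': (9, 30), '4': (12, 31),
                   'I': (3, 31), 'II': (6, 30), 'III': (9, 30), 'IV': (12, 31)}

timestr = "1999:q3"
y, q = timestr.replace(":", "").lower().split('q')
month, day = _quarter_to_day[q.upper()]
print(datetime.datetime(int(y), month, day))   # 1999-09-30 00:00:00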
Example 4
    def get_columns(self, *args, **kw):
        """
        Calling function for a factor instance.
        """

        v = self.namespace[self._name]
        while True:
            if callable(v):
                if isinstance(v, (Term, Formula)):
                    v = copy.copy(v)
                    v.namespace = self.namespace
                v = v(*args, **kw)
            else:
                break

        n = len(v)

        if self.ordinal:
            col = [float(self.keys.index(v[i])) for i in range(n)]
            return np.array(col)

        else:
            value = []
            for key in self.keys:
                col = [float((v[i] == key)) for i in range(n)]
                value.append(col)
            return np.array(value)
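The `else` branch emits one indicator (dummy) column per factor level; the same encoding as a standalone sketch with hypothetical keys and observations:

# Standalone sketch of the indicator encoding in the else-branch above,
# with hypothetical keys and observations.
import numpy as np

keys = ['a', 'b', 'c']
v = ['b', 'a', 'c', 'a']
value = [[float(obs == key) for obs in v] for key in keys]
print(np.array(value))
# [[0. 1. 0. 1.]
#  [1. 0. 0. 0.]
#  [0. 0. 1. 0.]]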
Example 5
    def __call__(self, *args, **kw):
        """
        Return the columns associated with self in a design matrix.
        If the term has no 'func' attribute, it returns
        ``self.namespace[self.termname]``;
        otherwise, it returns
        ``self.func(*args, **kw)``
        """

        if not hasattr(self, 'func'):
            val = self.namespace[self.termname]
        else:
            val = self.func
        if callable(val):
            if isinstance(val, (Term, Formula)):
                val = copy.copy(val)
                val.namespace = self.namespace
            val = val(*args, **kw)

        val = np.asarray(val)
        return np.squeeze(val)
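The core pattern here, a namespace value that may be either raw data or a callable producing it, reduced to a standalone sketch (names are illustrative):

# Minimal standalone sketch of the dispatch pattern above: a value may
# be raw data or a callable producing it. Names are illustrative.
import numpy as np

def resolve(val, *args, **kw):
    if callable(val):
        val = val(*args, **kw)
    return np.squeeze(np.asarray(val))

print(resolve([1, 2, 3]))                  # raw data passes through
print(resolve(lambda n: np.arange(n), 4))  # callables are evaluated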
Example 6
    def __init__(self, rvs, cdf, args=(), N=20):
        if isinstance(rvs, string_types):
            #cdf = getattr(stats, rvs).cdf
            if (not cdf) or (cdf == rvs):
                cdf = getattr(distributions, rvs).cdf
                rvs = getattr(distributions, rvs).rvs
            else:
                raise AttributeError(
                    'if rvs is string, cdf has to be the same distribution')

        if isinstance(cdf, string_types):
            cdf = getattr(distributions, cdf).cdf
        if callable(rvs):
            kwds = {'size': N}
            vals = np.sort(rvs(*args, **kwds))
        else:
            vals = np.sort(rvs)
            N = len(vals)
        cdfvals = cdf(vals, *args)

        self.nobs = N
        self.vals_sorted = vals
        self.cdfvals = cdfvals
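A sketch of the string dispatch with scipy.stats standing in for the module-local `distributions` alias (an assumption):

# Sketch; scipy.stats.distributions stands in for the local alias.
import numpy as np
from scipy.stats import distributions

rvs = getattr(distributions, 'norm').rvs
cdf = getattr(distributions, 'norm').cdf
vals = np.sort(rvs(size=20))
cdfvals = cdf(vals)   # sorted sample mapped through the cdf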
Example 7
    def _updateloglike(self, params, xi10, ntrain, penalty, upperbounds,
                       lowerbounds, F, A, H, Q, R, history):
        """
        Update the Kalman filter loglikelihood, applying a quadratic
        penalty if the parameter bounds are binding.
        """
        paramsorig = params
        # are the bounds binding?
        if penalty:
            params = np.min((np.max((lowerbounds, params), axis=0),
                             upperbounds), axis=0)
        #TODO: does it make sense for all of these to be allowed to be None?
        if F is not None and callable(F):
            F = F(params)
        elif F is None:
            F = 0
        if A is not None and callable(A):
            A = A(params)
        elif A is None:
            A = 0
        if H is not None and callable(H):
            H = H(params)
        elif H is None:
            H = 0
        if Q is not None and callable(Q):
            Q = Q(params)
        elif Q is None:
            Q = 0
        if R is not None and callable(R):
            R = R(params)
        elif R is None:
            R = 0
        X = self.exog
        if X is None:
            X = 0
        y = self.endog
        loglike = kalmanfilter(F, A, H, Q, R, y, X, xi10, ntrain, history)
        # use a quadratic penalty function to move away from bounds
        if penalty:
            loglike += penalty * np.sum((paramsorig - params)**2)
        return loglike
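The nested min/max that projects `params` onto the bounds is equivalent to `np.clip`, as this small check shows:

# The bounds projection above is equivalent to np.clip.
import numpy as np

params = np.array([-2.0, 0.5, 3.0])
lowerbounds = np.array([-1.0, -1.0, -1.0])
upperbounds = np.array([1.0, 1.0, 1.0])

a = np.min((np.max((lowerbounds, params), axis=0), upperbounds), axis=0)
b = np.clip(params, lowerbounds, upperbounds)
assert np.array_equal(a, b)   # both give [-1., 0.5, 1.]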
Example 8
def margeff_cov_params(model, params, exog, cov_params, at, derivative,
                       dummy_ind, count_ind, method, J):
    """
    Computes the variance-covariance of marginal effects by the delta method.

    Parameters
    ----------
    model : model instance
        The model that returned the fitted results. Its pdf method is used
        for computing the Jacobian of discrete variables in dummy_ind and
        count_ind.
    params : array-like
        estimated model parameters
    exog : array-like
        exogenous variables at which to calculate the derivative
    cov_params : array-like
        The variance-covariance of the parameters
    at : str
       Options are:

        - 'overall', The average of the marginal effects at each
          observation.
        - 'mean', The marginal effects at the mean of each regressor.
        - 'median', The marginal effects at the median of each regressor.
        - 'zero', The marginal effects at zero for each regressor.
        - 'all', The marginal effects at each observation.

        Only 'overall' has any effect here.

    derivative : function or array-like
        If a function, it returns the marginal effects of the model with
        respect to the exogenous variables evaluated at exog. Expected to be
        called derivative(params, exog). This will be numerically
        differentiated. Otherwise, it can be the Jacobian of the marginal
        effects with respect to the parameters.
    dummy_ind : array-like
        Indices of the columns of exog that contain dummy variables
    count_ind : array-like
        Indices of the columns of exog that contain count variables

    Notes
    -----
    For continuous regressors, the variance-covariance is given by

    Asy. Var[MargEff] = [d margeff / d params] V [d margeff / d params]'

    where V is the parameter variance-covariance.

    The outer Jacobians are computed via numerical differentiation if
    derivative is a function.
    """
    if callable(derivative):
        from statsmodels.tools.numdiff import approx_fprime_cs
        params = params.ravel('F')  # for Multinomial
        try:
            jacobian_mat = approx_fprime_cs(params,
                                            derivative,
                                            args=(exog, method))
        except TypeError:  # norm.cdf doesn't take complex values
            from statsmodels.tools.numdiff import approx_fprime
            jacobian_mat = approx_fprime(params,
                                         derivative,
                                         args=(exog, method))
        if at == 'overall':
            jacobian_mat = np.mean(jacobian_mat, axis=1)
        else:
            jacobian_mat = jacobian_mat.squeeze()  # exog was 2d row vector
        if dummy_ind is not None:
            jacobian_mat = _margeff_cov_params_dummy(model, jacobian_mat,
                                                     params, exog, dummy_ind,
                                                     method, J)
        if count_ind is not None:
            jacobian_mat = _margeff_cov_params_count(model, jacobian_mat,
                                                     params, exog, count_ind,
                                                     method, J)
    else:
        jacobian_mat = derivative

    #NOTE: this won't go through for at == 'all'
    return np.dot(np.dot(jacobian_mat, cov_params), jacobian_mat.T)
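The Notes formula, Asy. Var[MargEff] = J V J', as a standalone numerical sketch with made-up values:

# Standalone sketch of the delta-method formula from the Notes,
# Asy. Var[MargEff] = J V J', with made-up numbers.
import numpy as np

V = np.array([[0.04, 0.01],
              [0.01, 0.09]])    # parameter variance-covariance
J = np.array([[1.0, 0.5],
              [0.2, 1.0]])      # d margeff / d params
margeff_cov = J.dot(V).dot(J.T)
margeff_se = np.sqrt(np.diag(margeff_cov))   # delta-method std. errors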
Example 9
def kstest(rvs, cdf, args=(), N=20, alternative='two_sided', mode='approx',
           **kwds):
    """
    Perform the Kolmogorov-Smirnov test for goodness of fit

    This performs a test of the distribution G(x) of an observed
    random variable against a given distribution F(x). Under the null
    hypothesis the two distributions are identical, G(x)=F(x). The
    alternative hypothesis can be either 'two_sided' (default), 'less'
    or 'greater'. The KS test is only valid for continuous distributions.

    Parameters
    ----------
    rvs : string or array or callable
        string: name of a distribution in scipy.stats

        array: 1-D observations of random variables

        callable: function to generate random variables, requires keyword
        argument `size`

    cdf : string or callable
        string: name of a distribution in scipy.stats, if rvs is a string then
        cdf can evaluate to `False` or be the same as rvs
        callable: function to evaluate cdf

    args : tuple, sequence
        distribution parameters, used if rvs or cdf are strings
    N : int
        sample size if rvs is string or callable
    alternative : 'two_sided' (default), 'less' or 'greater'
        defines the alternative hypothesis (see explanation)

    mode : 'approx' (default) or 'asymp'
        defines the distribution used for calculating p-value

        'approx' : use approximation to exact distribution of test statistic

        'asymp' : use asymptotic distribution of test statistic


    Returns
    -------
    D : float
        KS test statistic, either D, D+ or D-
    p-value :  float
        one-tailed or two-tailed p-value

    Notes
    -----

    In the one-sided test, the alternative is that the empirical
    cumulative distribution function of the random variable is "less"
    or "greater" than the cumulative distribution function F(x) of the
    hypothesis, G(x)<=F(x), resp. G(x)>=F(x).

    Examples
    --------

    >>> from scipy import stats
    >>> import numpy as np
    >>> from scipy.stats import kstest

    >>> x = np.linspace(-15,15,9)
    >>> kstest(x,'norm')
    (0.44435602715924361, 0.038850142705171065)

    >>> np.random.seed(987654321) # set random seed to get the same result
    >>> kstest('norm','',N=100)
    (0.058352892479417884, 0.88531190944151261)

    is equivalent to this

    >>> np.random.seed(987654321)
    >>> kstest(stats.norm.rvs(size=100),'norm')
    (0.058352892479417884, 0.88531190944151261)

    Test against one-sided alternative hypothesis:

    >>> np.random.seed(987654321)

    Shift distribution to larger values, so that cdf_dgp(x)< norm.cdf(x):

    >>> x = stats.norm.rvs(loc=0.2, size=100)
    >>> kstest(x,'norm', alternative = 'less')
    (0.12464329735846891, 0.040989164077641749)

    Reject equal distribution against alternative hypothesis: less

    >>> kstest(x,'norm', alternative = 'greater')
    (0.0072115233216311081, 0.98531158590396395)

    Don't reject equal distribution against alternative hypothesis: greater

    >>> kstest(x,'norm', mode='asymp')
    (0.12464329735846891, 0.08944488871182088)


    Testing t distributed random variables against normal distribution:

    With 100 degrees of freedom the t distribution looks close to the normal
    distribution, and the kstest does not reject the hypothesis that the sample
    came from the normal distribution

    >>> np.random.seed(987654321)
    >>> stats.kstest(stats.t.rvs(100,size=100),'norm')
    (0.072018929165471257, 0.67630062862479168)

    With 3 degrees of freedom the t distribution looks sufficiently different
    from the normal distribution that we can reject the hypothesis that the
    sample came from the normal distribution at an alpha=10% level

    >>> np.random.seed(987654321)
    >>> stats.kstest(stats.t.rvs(3,size=100),'norm')
    (0.131016895759829, 0.058826222555312224)

    """
    if isinstance(rvs, string_types):
        #cdf = getattr(stats, rvs).cdf
        if (not cdf) or (cdf == rvs):
            cdf = getattr(distributions, rvs).cdf
            rvs = getattr(distributions, rvs).rvs
        else:
            raise AttributeError(
                'if rvs is string, cdf has to be the same distribution')

    if isinstance(cdf, string_types):
        cdf = getattr(distributions, cdf).cdf
    if callable(rvs):
        kwds = {'size': N}
        vals = np.sort(rvs(*args, **kwds))
    else:
        vals = np.sort(rvs)
        N = len(vals)
    cdfvals = cdf(vals, *args)

    if alternative in ['two_sided', 'greater']:
        Dplus = (np.arange(1.0, N + 1) / N - cdfvals).max()
        if alternative == 'greater':
            return Dplus, distributions.ksone.sf(Dplus, N)

    if alternative in ['two_sided', 'less']:
        Dmin = (cdfvals - np.arange(0.0, N) / N).max()
        if alternative == 'less':
            return Dmin, distributions.ksone.sf(Dmin, N)

    if alternative == 'two_sided':
        D = np.max([Dplus, Dmin])
        if mode == 'asymp':
            return D, distributions.kstwobign.sf(D * np.sqrt(N))
        if mode == 'approx':
            pval_two = distributions.kstwobign.sf(D * np.sqrt(N))
            if N > 2666 or pval_two > 0.80 - N * 0.3 / 1000.0:
                return D, distributions.kstwobign.sf(D * np.sqrt(N))
            else:
                return D, distributions.ksone.sf(D, N) * 2
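The D+ and D- formulas can be cross-checked against scipy's own kstest (a sketch; scipy.stats stands in for the module-local names):

# Cross-check of the Dplus/Dmin formulas against scipy.stats.kstest
# (a sketch; scipy.stats stands in for the local aliases).
import numpy as np
from scipy import stats

np.random.seed(987654321)
vals = np.sort(stats.norm.rvs(size=100))
N = len(vals)
cdfvals = stats.norm.cdf(vals)
Dplus = (np.arange(1.0, N + 1) / N - cdfvals).max()
Dmin = (cdfvals - np.arange(0.0, N) / N).max()
D = max(Dplus, Dmin)
print(D, stats.kstest(vals, 'norm')[0])   # the two statistics agree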