def BrownForsytheTest(x, y):
    """
    Brown-Forsythe test for equality of variances between two samples.

    Each sample is transformed to absolute deviations from its own median and
    a one-way ANOVA F statistic is computed on the transformed values.

    :param x:
        First sample, a one-dimensional sequence of numbers.

    :param y:
        Second sample, a one-dimensional sequence of numbers.  It may have a
        different length than ``x``.

    :returns:
        ``(W, p)``: the test statistic and the upper-tail probability of ``W``
        under an F distribution with ``(K - 1, N - K)`` degrees of freedom,
        evaluated with the module-level ``f`` distribution object.
    """
    # Keep the groups as separate 1-D arrays.  Stacking them into one 2-D
    # array only works when both samples have the same length; NumPy rejects
    # ragged input.
    groups = [np.asarray(x, dtype=float), np.asarray(y, dtype=float)]

    # Data analysis
    K = len(groups)
    Ni = np.array([len(g) for g in groups])
    N = Ni.sum()

    # Transform data: absolute deviation from each group's median
    Zi = [np.abs(g - np.median(g)) for g in groups]
    Z_Bar = np.concatenate(Zi).mean()
    Zj = np.array([z.mean() for z in Zi])

    # Compute test result
    numerator = np.sum(Ni * (Zj - Z_Bar) ** 2)
    denominator = sum(np.sum((z - z_mean) ** 2) for z, z_mean in zip(Zi, Zj))

    W = (N - K) / (K - 1) * numerator / denominator

    # Compute p value; the survival function avoids the cancellation in
    # ``1 - cdf`` when the p value is very small.
    p = f.sf(W, K - 1, N - K)

    return W, p
Example #2
0
def gelman_rubin(chains, return_cdf=False):
    """
    Compute the Gelman-Rubin R-statistic from an ensemble of chains.  `chains`
    is expected to have shape `(nsteps, nchains)` if samples are one dimensional,
    or `(nsteps, nchains, ndim)` if multidimensional.  For multidimensional samples
    R-statistics will be computed for each dimension.

    :param chains:
        An `(nsteps, nchains)` or `(nsteps, nchains, ndim)`-shaped array.

    :param return_cdf: (optional)
        If ``True``, the CDF of the R-statistic(s), assuming an F-distribution, are
        returned in addition to the R-statistic(s).

    :returns:
        For two-dimensional input, the R-statistic, or ``(R, cdf)`` when
        `return_cdf` is ``True``.  For higher-dimensional input, a list with
        one result per trailing index, or, when `return_cdf` is ``True``, a
        tuple ``(Rs, cdfs)`` of two per-dimension tuples.
    """
    if len(chains.shape) > 2:
        # Recurse over the last axis, one parameter at a time.
        results = [
            gelman_rubin(chains[..., param], return_cdf=return_cdf)
            for param in range(chains.shape[-1])
        ]
        if return_cdf:
            # Transpose the (R, cdf) pairs and materialize them: a bare
            # ``zip`` object is a one-shot iterator on Python 3.
            return tuple(zip(*results))
        return results

    nchains, nsteps = chains.shape[1], chains.shape[0]

    chain_means = np.mean(chains, axis=0)

    # between-chain variance
    interchain_var = np.sum((chain_means - np.mean(chains))**2) / (nchains - 1)

    # within-chain variances: per-sample squared deviations scaled by
    # (nsteps - 1), then summed and averaged over the chains
    intrachain_vars = (chains - chain_means)**2 / (nsteps - 1)
    intrachain_var = np.sum(intrachain_vars) / nchains

    var = intrachain_var * (nsteps - 1) / nsteps + interchain_var
    post_var = var + interchain_var / nchains

    # The Statistic
    R = np.sqrt(post_var / intrachain_var)

    if return_cdf:
        # R should be F-distributed
        dof1 = nchains - 1
        # NOTE(review): np.var runs over every per-sample term of
        # `intrachain_vars`, not over per-chain variances -- confirm this is
        # the intended estimate.
        dof2 = 2 * intrachain_var**2 * nchains / np.var(intrachain_vars)
        return R, f.cdf(R, dof1, dof2)
    return R
Example #3
0
def gelman_rubin(chains, return_cdf=False):
    """
    Compute the Gelman-Rubin R-statistic from an ensemble of chains.  `chains`
    is expected to have shape `(nsteps, nchains)` if samples are one dimensional,
    or `(nsteps, nchains, ndim)` if multidimensional.  For multidimensional samples
    R-statistics will be computed for each dimension.

    :param chains:
        An `(nsteps, nchains)` or `(nsteps, nchains, ndim)`-shaped array.

    :param return_cdf: (optional)
        If ``True``, the CDF of the R-statistic(s), assuming an F-distribution, are
        returned in addition to the R-statistic(s).

    :returns:
        For two-dimensional input, the R-statistic, or ``(R, cdf)`` when
        `return_cdf` is ``True``.  For higher-dimensional input, a list with
        one result per trailing index, or, when `return_cdf` is ``True``, a
        tuple ``(Rs, cdfs)`` of two per-dimension tuples.
    """
    if len(chains.shape) > 2:
        # Recurse over the last axis, one parameter at a time.
        results = [gelman_rubin(chains[..., param], return_cdf=return_cdf)
                   for param in range(chains.shape[-1])]
        if return_cdf:
            # Transpose the (R, cdf) pairs and materialize them: a bare
            # ``zip`` object is a one-shot iterator on Python 3.
            return tuple(zip(*results))
        return results

    nchains, nsteps = chains.shape[1], chains.shape[0]

    chain_means = np.mean(chains, axis=0)

    # between-chain variance
    interchain_var = np.sum((chain_means - np.mean(chains)) ** 2) / (nchains - 1)

    # within-chain variances: per-sample squared deviations scaled by
    # (nsteps - 1), then summed and averaged over the chains
    intrachain_vars = (chains - chain_means)**2 / (nsteps - 1)
    intrachain_var = np.sum(intrachain_vars)/nchains

    var = intrachain_var * (nsteps - 1) / nsteps + interchain_var
    post_var = var + interchain_var / nchains

    # The Statistic
    R = np.sqrt(post_var / intrachain_var)

    if return_cdf:
        # R should be F-distributed
        dof1 = nchains - 1
        # NOTE(review): np.var runs over every per-sample term of
        # `intrachain_vars`, not over per-chain variances -- confirm this is
        # the intended estimate.
        dof2 = 2*intrachain_var**2*nchains/np.var(intrachain_vars)
        return R, f.cdf(R, dof1, dof2)
    return R
Example #4
0
def ANOVA(tab):
    """
    One-way ANOVA test for table tab (tab should be a list of lists, one
    inner list per group).
     Values returned are in order:
       Fs: Value for the variable F (F of Snedecor).
       glentre: degrees of freedom between groups.
       gldentro: degrees of freedom within groups.
       1-fsnede: right tail of the F distribution at Fs (p-value for the test).
    """
    r = len(tab)
    ni = [len(ele) for ele in tab]
    # The first element returned by stats() is used as the group mean.
    # NOTE(review): presumably the arithmetic mean -- confirm against stats().
    xbi = [stats(ele)[0] for ele in tab]
    N = sum(ni)
    # Grand mean as the size-weighted average of the group means.
    XB = sum(n * xb for n, xb in zip(ni, xbi)) / N
    # Within-group sums of squares, one per group.
    ssi = [sum((ele - xb) ** 2 for ele in grupo) for grupo, xb in zip(tab, xbi)]
    SSdentro = sum(ssi)
    gldentro = N - r
    MSdentro = SSdentro / gldentro
    # Between-group sum of squares.
    SSentre = sum(n * (xb - XB) ** 2 for n, xb in zip(ni, xbi))
    glentre = r - 1
    MSentre = SSentre / glentre
    Fs = MSentre / MSdentro
    return [Fs, glentre, gldentro, 1.-fsnede.cdf(Fs, glentre, gldentro)]
Example #5
0
def predict_functional(result,
                       focus_var,
                       summaries=None,
                       values=None,
                       summaries2=None,
                       values2=None,
                       alpha=0.05,
                       ci_method="pointwise",
                       linear=True,
                       num_points=10,
                       exog=None,
                       exog2=None,
                       **kwargs):
    """
    Predict along `focus_var` and compute a confidence band.

    Parameters
    ----------
    result : fitted results object
        Must provide `model`, `predict` and `t_test`; `family.link` is also
        read when `linear` is False.
    focus_var : str
        Key of the focus variable; used to index `exog` and passed to
        `_make_exog`.
    summaries, values, summaries2, values2 : optional
        Passed through `_check_args` and `_make_exog` to build the design
        points when `exog` is not given.  Supplying `summaries2` or
        `values2` requests a contrast between the two designs.
    alpha : float
        ``1 - alpha`` is the coverage level of the band.
    ci_method : {"pointwise", "scheffe", "simultaneous"}
        Method used to build the confidence band.
    linear : bool
        If False, the prediction and band are mapped through
        ``result.family.link.inverse`` before being returned.
    num_points : int
        Number of design points requested from `_make_exog`.
    exog : optional
        Explicit data at which to predict.  Must not be combined with
        `summaries`/`values` arguments.
    exog2 : optional
        Second explicit data set.  When given together with `exog`, the
        contrast ``exog - exog2`` is computed.  Ignored if `exog` is None.
    **kwargs
        Forwarded to ``result.predict``.

    Returns
    -------
    pred : array_like
        Predicted values (or contrasts).
    cb : array_like
        Two-column array with the lower and upper band limits.
    fvals : array_like
        Values of the focus variable at the prediction points.

    Raises
    ------
    ValueError
        For an unknown `ci_method`, for a contrast with ``linear=False``,
        or when `exog` is combined with `summaries`/`values` arguments.
    """

    if ci_method not in ("pointwise", "scheffe", "simultaneous"):
        raise ValueError('confidence band method must be one of '
                         '`pointwise`, `scheffe`, and `simultaneous`.')

    # A contrast is requested either through the second set of
    # summaries/values or through an explicit second data set.
    contrast = ((values2 is not None) or (summaries2 is not None)
                or (exog is not None and exog2 is not None))

    if contrast and not linear:
        raise ValueError("`linear` must be True for computing contrasts")

    model = result.model
    if exog is not None:

        if any(x is not None
               for x in [summaries, summaries2, values, values2]):
            raise ValueError("if `exog` is provided then do not "
                             "provide `summaries` or `values`")

        fexog = exog
        dexog = patsy.dmatrix(model.data.design_info,
                              fexog,
                              return_type='dataframe')
        fvals = exog[focus_var]

        if exog2 is not None:
            # Build the second design from `exog2` itself; reusing `exog`
            # here would make every contrast identically zero.
            fexog2 = exog2
            dexog2 = patsy.dmatrix(model.data.design_info,
                                   fexog2,
                                   return_type='dataframe')

    else:

        values, summaries, values2, summaries2 = _check_args(
            values, summaries, values2, summaries2)

        dexog, fexog, fvals = _make_exog(result, focus_var, summaries, values,
                                         num_points)

        if len(summaries2) + len(values2) > 0:
            dexog2, fexog2, _ = _make_exog(result, focus_var, summaries2,
                                           values2, num_points)

    from statsmodels.genmod.generalized_linear_model import GLM
    from statsmodels.genmod.generalized_estimating_equations import GEE
    if isinstance(result.model, (GLM, GEE)):
        # Always predict on the linear-predictor scale for these models;
        # the inverse link is applied at the end when `linear` is False.
        kwargs_pred = kwargs.copy()
        kwargs_pred.update({"linear": True})
    else:
        kwargs_pred = kwargs

    pred = result.predict(exog=fexog, **kwargs_pred)
    if contrast:
        pred2 = result.predict(exog=fexog2, **kwargs_pred)
        pred = pred - pred2
        dexog = dexog - dexog2

    if ci_method == 'pointwise':

        t_test = result.t_test(dexog)
        cb = t_test.conf_int(alpha=alpha)

    elif ci_method == 'scheffe':

        t_test = result.t_test(dexog)
        sd = t_test.sd
        # One row per design point (equals `num_points` only when the
        # design was generated by `_make_exog`).
        cb = np.zeros((dexog.shape[0], 2))

        # Scheffe's method: the band half-width is
        # sd * sqrt(df1 * F_{1 - alpha; df1, df2}), which needs the F
        # quantile (ppf), not the CDF evaluated at 1 - alpha.
        from scipy.stats.distributions import f as fdist
        df1 = result.model.exog.shape[1]
        df2 = result.model.exog.shape[0] - df1
        qf = fdist.ppf(1 - alpha, df1, df2)
        fx = sd * np.sqrt(df1 * qf)
        cb[:, 0] = pred - fx
        cb[:, 1] = pred + fx

    elif ci_method == 'simultaneous':

        sigma, c = _glm_basic_scr(result, dexog, alpha)
        cb = np.zeros((dexog.shape[0], 2))
        cb[:, 0] = pred - c * sigma
        cb[:, 1] = pred + c * sigma

    if not linear:
        # May need to support other models with link-like functions.
        link = result.family.link
        pred = link.inverse(pred)
        cb = link.inverse(cb)

    return pred, cb, fvals
def predict_functional(result, focus_var, summaries=None, values=None,
                       summaries2=None, values2=None, alpha=0.05,
                       ci_method="pointwise", linear=True, num_points=10,
                       exog=None, exog2=None, **kwargs):
    # docstring attached below
    #
    # Predicts along `focus_var` and returns `(pred, cb, fvals)`: the
    # predictions (or contrasts), a two-column array of lower/upper band
    # limits, and the focus-variable values at the prediction points.
    # The design points come either from `exog` (optionally contrasted with
    # `exog2`) or from `_make_exog` using the summaries/values arguments.
    # Raises ValueError for an unknown `ci_method`, for a contrast with
    # `linear=False`, or when `exog` is combined with summaries/values.

    if ci_method not in ("pointwise", "scheffe", "simultaneous"):
        raise ValueError('confidence band method must be one of `pointwise`, `scheffe`, and `simultaneous`.')

    # A contrast is requested either through the second set of
    # summaries/values or through an explicit second data set.
    contrast = ((values2 is not None) or (summaries2 is not None)
                or (exog is not None and exog2 is not None))

    if contrast and not linear:
        raise ValueError("`linear` must be True for computing contrasts")

    model = result.model
    if exog is not None:

        if any(x is not None for x in [summaries, summaries2, values, values2]):
            raise ValueError("if `exog` is provided then do not provide `summaries` or `values`")

        fexog = exog
        dexog = patsy.dmatrix(model.data.design_info.builder,
                              fexog, return_type='dataframe')
        fvals = exog[focus_var]

        if exog2 is not None:
            # Build the second design from `exog2` itself; reusing `exog`
            # here would make every contrast identically zero.
            fexog2 = exog2
            dexog2 = patsy.dmatrix(model.data.design_info.builder,
                                   fexog2, return_type='dataframe')

    else:

        values, summaries, values2, summaries2 = _check_args(values,
                             summaries, values2, summaries2)

        dexog, fexog, fvals = _make_exog(result, focus_var, summaries, values, num_points)

        if len(summaries2) + len(values2) > 0:
            dexog2, fexog2, _ = _make_exog(result, focus_var, summaries2, values2, num_points)

    from statsmodels.genmod.generalized_linear_model import GLM
    from statsmodels.genmod.generalized_estimating_equations import GEE
    if isinstance(result.model, (GLM, GEE)):
        # Always predict on the linear-predictor scale for these models;
        # the inverse link is applied at the end when `linear` is False.
        kwargs_pred = kwargs.copy()
        kwargs_pred.update({"linear": True})
    else:
        kwargs_pred = kwargs

    pred = result.predict(exog=fexog, **kwargs_pred)
    if contrast:
        pred2 = result.predict(exog=fexog2, **kwargs_pred)
        pred = pred - pred2
        dexog = dexog - dexog2

    if ci_method == 'pointwise':

        t_test = result.t_test(dexog)
        cb = t_test.conf_int(alpha=alpha)

    elif ci_method == 'scheffe':

        t_test = result.t_test(dexog)
        sd = t_test.sd
        # One row per design point (equals `num_points` only when the
        # design was generated by `_make_exog`).
        cb = np.zeros((dexog.shape[0], 2))

        # Scheffe's method: the band half-width is
        # sd * sqrt(df1 * F_{1 - alpha; df1, df2}), which needs the F
        # quantile (ppf), not the CDF evaluated at 1 - alpha.
        from scipy.stats.distributions import f as fdist
        df1 = result.model.exog.shape[1]
        df2 = result.model.exog.shape[0] - df1
        qf = fdist.ppf(1 - alpha, df1, df2)
        fx = sd * np.sqrt(df1 * qf)
        cb[:, 0] = pred - fx
        cb[:, 1] = pred + fx

    elif ci_method == 'simultaneous':

        sigma, c = _glm_basic_scr(result, dexog, alpha)
        cb = np.zeros((dexog.shape[0], 2))
        cb[:, 0] = pred - c*sigma
        cb[:, 1] = pred + c*sigma

    if not linear:
        # May need to support other models with link-like functions.
        link = result.family.link
        pred = link.inverse(pred)
        cb = link.inverse(cb)

    return pred, cb, fvals