Beispiel #1
0
def get_peruser_diff_zstat(df,
                           nobs_name,
                           mean_name,
                           std_name,
                           test_group_column='test_group',
                           test='TEST',
                           control='CONTROL',
                           alpha=0.05,
                           alternative='two-sided'):
    '''Z-test and confidence interval for the test-vs-control difference in means.

    Works on pre-aggregated data: ``df`` is indexed by ``test_group_column``
    and the rows labelled ``test`` and ``control`` are looked up; each row
    supplies the observation count, mean and standard deviation.

    Similar to https://www.statsmodels.org/dev/generated/statsmodels.stats.weightstats.CompareMeans.zconfint_diff.html#statsmodels.stats.weightstats.CompareMeans.zconfint_diff

    Example
    -------
    get_peruser_diff_zstat(pre, 'units', 'buyers mean', 'buyers std')

    Parameters
    ----------
    df : DataFrame
        Aggregated statistics containing ``test_group_column`` and the three
        statistic columns named below.
    nobs_name : str
        Column holding the number of observations.
    mean_name : str
        Column holding the mean.
    std_name : str
        Column holding the standard deviation.
    test_group_column : str
        Column whose values identify the groups.
    test, control : str
        Labels of the test and control groups in ``test_group_column``.
    alpha : float
        significance level for the confidence interval, coverage is
        ``1-alpha``
    alternative : string
        This specifies the alternative hypothesis for the test that
        corresponds to the confidence interval.
        The alternative hypothesis, H1, has to be one of the following :
        'two-sided': H1: difference in means not equal to value (default)
        'larger' :   H1: difference in means larger than value
        'smaller' :  H1: difference in means smaller than value

    Returns
    -------
    test_mean, control_mean, relative_diff, relative_confint, zstat, pvalue
        The two group means, the difference ``test - control`` divided by the
        control mean, the confidence interval of the difference divided by
        the control mean, and the z statistic with its p-value.
    '''
    by_group = df.set_index(test_group_column)

    test_row = by_group.loc[test, :]
    control_row = by_group.loc[control, :]

    test_mean = test_row[mean_name]
    control_mean = control_row[mean_name]
    mean_diff = test_mean - control_mean

    # Standard error of the difference without pooling the variances
    # (unequal-variance assumption); each term uses nobs - 1 as denominator.
    test_term = test_row[std_name] ** 2 / (test_row[nobs_name] - 1)
    control_term = control_row[std_name] ** 2 / (control_row[nobs_name] - 1)
    se_diff = np.sqrt(test_term + control_term)

    interval = wstats._zconfint_generic(mean_diff,
                                        se_diff,
                                        alpha=alpha,
                                        alternative=alternative)
    z_value, p_value = wstats._zstat_generic2(mean_diff, se_diff, alternative)

    # Difference and interval are reported relative to the control mean.
    relative_diff = mean_diff / control_mean
    relative_interval = interval / control_mean
    return (test_mean, control_mean, relative_diff, relative_interval,
            z_value, p_value)
Beispiel #2
0
def proportions_ztest(count, nobs, value=None, alternative='two-sided',
                      prop_var=False):
    '''Test for proportions based on the normal (z) test.

    Parameters
    ----------
    count : integer or array_like
        the number of successes in nobs trials. If this is array_like, then
        the assumption is that this represents the number of successes for
        each independent sample
    nobs : integer or array_like
        the number of trials or observations, with the same length as
        count. A single value is broadcast to the shape of ``count``.
    value : None or float or array_like
        This is the value of the null hypothesis equal to the proportion in the
        case of a one sample test. In the case of a two-sample test, the
        null hypothesis is that prop[0] - prop[1] = value, where prop is the
        proportion in the two samples. If None, 0 is used in the two-sample
        case; the one-sample case requires an explicit value.
    alternative : string in ['two-sided', 'smaller', 'larger']
        The alternative hypothesis can be either two-sided or one of the one-
        sided tests, smaller means that the alternative hypothesis is
        ``prop < value`` and larger means ``prop > value``, or the
        corresponding inequality for the two sample test.
    prop_var : False or float in (0, 1)
        If prop_var is false, then the variance of the proportion estimate is
        calculated based on the sample proportion. Alternatively, a proportion
        can be specified to calculate this variance. Common use case is to
        use the proportion under the Null hypothesis to specify the variance
        of the proportion estimate.
        TODO: change options similar to proportions_ztost ?

    Returns
    -------
    zstat : float
        test statistic for the z-test
    p-value : float
        p-value for the z-test

    Raises
    ------
    ValueError
        If ``value`` is None in the one-sample case.
    NotImplementedError
        If the number of samples is neither one nor two.

    Notes
    -----
    This uses a simple normal test for proportions. It should be the same as
    running the mean z-test on the data encoded 1 for event and 0 for no event,
    so that the sum corresponds to count.

    In the one and two sample cases with two-sided alternative, this test
    produces the same p-value as ``proportions_chisquare``, since the
    chisquare is the distribution of the square of a standard normal
    distribution.
    (TODO: verify that this really holds)

    TODO: add continuity correction or other improvements for small samples.

    '''
    # Convert up front so plain Python lists/tuples work like arrays.
    count = np.asarray(count)
    nobs = np.asarray(nobs)

    # A scalar nobs must count once per sample, otherwise the pooled
    # proportion and the sum of 1/nobs below are computed for one sample only.
    if nobs.size == 1:
        nobs = nobs * np.ones_like(count)

    prop = count * 1. / nobs
    k_sample = np.size(prop)

    # The signature defaults to value=None; resolve it before any arithmetic.
    if value is None:
        if k_sample == 1:
            raise ValueError('value must be provided for a 1-sample test')
        value = 0

    if k_sample == 1:
        diff = prop - value
    elif k_sample == 2:
        diff = prop[0] - prop[1] - value
    else:
        msg = 'more than two samples are not implemented yet'
        raise NotImplementedError(msg)

    p_pooled = np.sum(count) * 1. / np.sum(nobs)

    nobs_fact = np.sum(1. / nobs)
    if prop_var:
        # A user-supplied proportion replaces the pooled sample estimate.
        p_pooled = prop_var
    var_ = p_pooled * (1 - p_pooled) * nobs_fact
    std_diff = np.sqrt(var_)
    # Imported lazily, as in the original implementation.
    from statsmodels.stats.weightstats import _zstat_generic2
    return _zstat_generic2(diff, std_diff, alternative)
Beispiel #3
0
def test_poisson_2indep(count1,
                        exposure1,
                        count2,
                        exposure2,
                        ratio_null=1,
                        method='score',
                        alternative='two-sided',
                        etest_kwds=None):
    '''Test for the ratio of two sample Poisson intensities.

    With Poisson rates g1 and g2, the Null hypothesis is

    - H0: g1 / g2 = ratio_null

    tested against one of the alternatives

    - H1_2-sided: g1 / g2 != ratio_null
    - H1_larger: g1 / g2 > ratio_null
    - H1_smaller: g1 / g2 < ratio_null

    Parameters
    ----------
    count1 : int
        Number of events in first sample.
    exposure1 : float
        Total exposure (time * subjects) in first sample.
    count2 : int
        Number of events in second sample.
    exposure2 : float
        Total exposure (time * subjects) in second sample.
    ratio_null : float
        Ratio of the two Poisson rates under the Null hypothesis.
        Default is 1.
    method : string
        Method for the test statistic and the p-value. Defaults to `'score'`.
        Current Methods are based on Gu et. al 2008.
        Implemented are 'wald', 'score' and 'sqrt' based on the asymptotic
        normal distribution, the exact conditional test 'exact-cond' and its
        mid-point version 'cond-midp'. Any method starting with 'etest'
        delegates to `etest_poisson_2indep`, using the wald statistic when
        the name ends in 'wald' and the score statistic otherwise.
        See Notes.
    alternative : string
        The alternative hypothesis, H1, has to be one of the following

        - 'two-sided': H1: ratio of rates is not equal to ratio_null (default)
        - 'larger' :   H1: ratio of rates is larger than ratio_null
        - 'smaller' :  H1: ratio of rates is smaller than ratio_null
    etest_kwds : dictionary
        Additional keyword arguments forwarded to `etest_poisson_2indep`,
        namely ygrid.

    Returns
    -------
    results : instance of HolderTuple class
        The two main attributes are test statistic `statistic` and p-value
        `pvalue`. Also carries `distribution`, `method`, `alternative`,
        `rates`, `ratio` and `ratio_null`.

    Raises
    ------
    ValueError
        If ``method`` matches none of the implemented options.

    Notes
    -----
    - 'wald': method W1A, wald test, variance based on separate estimates
    - 'score': method W2A, score test, variance based on estimate under Null
    - 'sqrt': W5A, based on variance stabilizing square root transformation
    - 'exact-cond': exact conditional test based on binomial distribution
    - 'cond-midp': midpoint-pvalue of exact conditional test
    - 'etest': etest with score test statistic
    - 'etest-wald': etest with wald test statistic

    The log-based variants 'wald-log' (W3A) and 'score-log' (W4A) have no
    branch in this function and are rejected as unrecognized.

    References
    ----------
    Gu, Ng, Tang, Schucany 2008: Testing the Ratio of Two Poisson Rates,
    Biometrical Journal 50 (2008) 2, 2008

    See Also
    --------
    tost_poisson_2indep
    etest_poisson_2indep
    '''

    # Null ratio rescaled by the relative exposure of the two samples.
    exposure_ratio = exposure2 / exposure1
    null_scaled = ratio_null / exposure_ratio

    if method == 'score':
        numerator = count1 - count2 * null_scaled
        stat = numerator / np.sqrt((count1 + count2) * null_scaled)
        dist = 'normal'
    elif method == 'wald':
        numerator = count1 - count2 * null_scaled
        stat = numerator / np.sqrt(count1 + count2 * null_scaled**2)
        dist = 'normal'
    elif method == 'sqrt':
        root_diff = (np.sqrt(count1 + 3 / 8.) -
                     np.sqrt((count2 + 3 / 8.) * null_scaled))
        stat = 2 * root_diff
        stat /= np.sqrt(1 + null_scaled)
        dist = 'normal'
    elif method in ('exact-cond', 'cond-midp'):
        from statsmodels.stats import proportion
        # Conditional on the total count, count1 is tested as a binomial
        # outcome with this success probability.
        binom_prob = null_scaled / (1 + null_scaled)
        total_events = count1 + count2
        stat = None
        # TODO (carried over): check that testing count1 here agrees with
        # the definition of H1 "larger".
        pvalue = proportion.binom_test(count1,
                                       total_events,
                                       prop=binom_prob,
                                       alternative=alternative)
        if method == 'cond-midp':
            # mid-p: subtract half the probability of the observed count
            pvalue = pvalue - 0.5 * stats.binom.pmf(count1, total_events,
                                                    binom_prob)

        dist = 'binomial'
    elif method.startswith('etest'):
        method_etest = 'wald' if method.endswith('wald') else 'score'
        if etest_kwds is None:
            etest_kwds = {}

        stat, pvalue = etest_poisson_2indep(count1,
                                            exposure1,
                                            count2,
                                            exposure2,
                                            ratio_null=ratio_null,
                                            method=method_etest,
                                            alternative=alternative,
                                            **etest_kwds)

        dist = 'poisson'
    else:
        raise ValueError('method not recognized')

    if dist == 'normal':
        # The statistic is already standardized, hence the unit std.
        stat, pvalue = _zstat_generic2(stat, 1, alternative)

    rates = (count1 / exposure1, count2 / exposure2)
    observed_ratio = rates[0] / rates[1]
    return HolderTuple(statistic=stat,
                       pvalue=pvalue,
                       distribution=dist,
                       method=method,
                       alternative=alternative,
                       rates=rates,
                       ratio=observed_ratio,
                       ratio_null=ratio_null)
Beispiel #4
0
def proportions_ztest(count, nobs, value=None, alternative='two-sided',
                      prop_var=False):
    """
    Normal-approximation (z) test for one or two proportions.

    Parameters
    ----------
    count : integer or array_like
        Number of successes in ``nobs`` trials. An array_like is read as the
        success counts of independent samples.
    nobs : integer or array-like
        Number of trials or observations, with the same length as ``count``.
        A single value is repeated for every entry of ``count``.
    value : float, array_like or None, optional
        Null-hypothesis value. For one sample it is the hypothesized
        proportion. For two samples the null hypothesis is
        ``prop[0] - prop[1] = value``. If omitted, ``value = 0`` is used for
        two samples (null: ``prop[0] = prop[1]``); one sample requires it.
    alternative : string in ['two-sided', 'smaller', 'larger']
        Either two-sided or one of the one-sided tests: 'smaller' means the
        alternative hypothesis is ``prop < value`` and 'larger' means
        ``prop > value``, or the corresponding inequality for two samples.
    prop_var : False or float in (0, 1)
        If false, the variance of the proportion estimate is based on the
        sample proportion. Otherwise the given proportion is used for the
        variance, e.g. the proportion under the Null hypothesis.

    Returns
    -------
    zstat : float
        test statistic for the z-test
    p-value : float
        p-value for the z-test

    Raises
    ------
    ValueError
        If ``value`` is None for a one-sample test.
    NotImplementedError
        If the number of samples is neither one nor two.

    Examples
    --------
    >>> count = 5
    >>> nobs = 83
    >>> value = .05
    >>> stat, pval = proportions_ztest(count, nobs, value)
    >>> print('{0:0.3f}'.format(pval))
    0.695

    >>> import numpy as np
    >>> from statsmodels.stats.proportion import proportions_ztest
    >>> count = np.array([5, 12])
    >>> nobs = np.array([83, 99])
    >>> stat, pval = proportions_ztest(count, nobs)
    >>> print('{0:0.3f}'.format(pval))
    0.159

    Notes
    -----
    This is a simple normal test for proportions. It should match the mean
    z-test on data encoded 1 for event and 0 for no event, so that the sum
    corresponds to the count.

    In the one and two sample cases with two-sided alternative, this test
    produces the same p-value as ``proportions_chisquare``, since the
    chisquare is the distribution of the square of a standard normal
    distribution.
    """
    # TODO: verify the chisquare equivalence stated in the Notes
    # TODO: add continuity correction or other improvements for small samples
    # TODO: change options similar to proportions_ztost ?

    successes = np.asarray(count)
    trials = np.asarray(nobs)

    # repeat a single nobs for every sample
    if trials.size == 1:
        trials = trials * np.ones_like(successes)

    sample_props = successes * 1. / trials
    n_samples = np.size(sample_props)

    if value is None:
        if n_samples == 1:
            raise ValueError('value must be provided for a 1-sample test')
        value = 0

    if n_samples not in (1, 2):
        raise NotImplementedError(
            'more than two samples are not implemented yet')
    if n_samples == 1:
        diff = sample_props - value
    else:
        diff = sample_props[0] - sample_props[1] - value

    pooled = np.sum(successes) * 1. / np.sum(trials)
    inv_nobs_total = np.sum(1. / trials)

    # an explicit prop_var takes the place of the pooled sample proportion
    p_for_var = prop_var if prop_var else pooled
    std_diff = np.sqrt(p_for_var * (1 - p_for_var) * inv_nobs_total)

    # kept as a late import so argument errors surface before it runs
    from statsmodels.stats.weightstats import _zstat_generic2
    return _zstat_generic2(diff, std_diff, alternative)
Beispiel #5
0
def proportions_ztest(count,
                      nobs,
                      value=None,
                      alternative='two-sided',
                      prop_var=False):
    '''Test for proportions based on the normal (z) test.

    Parameters
    ----------
    count : integer or array_like
        the number of successes in nobs trials. If this is array_like, then
        the assumption is that this represents the number of successes for
        each independent sample
    nobs : integer or array_like
        the number of trials or observations, with the same length as
        count. A single value is broadcast to the shape of ``count``.
    value : None or float or array_like
        This is the value of the null hypothesis equal to the proportion in the
        case of a one sample test. In the case of a two-sample test, the
        null hypothesis is that prop[0] - prop[1] = value, where prop is the
        proportion in the two samples. If None, 0 is used in the two-sample
        case; the one-sample case requires an explicit value.
    alternative : string in ['two-sided', 'smaller', 'larger']
        The alternative hypothesis can be either two-sided or one of the one-
        sided tests, smaller means that the alternative hypothesis is
        ``prop < value`` and larger means ``prop > value``, or the
        corresponding inequality for the two sample test.
    prop_var : False or float in (0, 1)
        If prop_var is false, then the variance of the proportion estimate is
        calculated based on the sample proportion. Alternatively, a proportion
        can be specified to calculate this variance. Common use case is to
        use the proportion under the Null hypothesis to specify the variance
        of the proportion estimate.
        TODO: change options similar to proportions_ztost ?

    Returns
    -------
    zstat : float
        test statistic for the z-test
    p-value : float
        p-value for the z-test

    Raises
    ------
    ValueError
        If ``value`` is None in the one-sample case.
    NotImplementedError
        If the number of samples is neither one nor two.

    Notes
    -----
    This uses a simple normal test for proportions. It should be the same as
    running the mean z-test on the data encoded 1 for event and 0 for no event,
    so that the sum corresponds to count.

    In the one and two sample cases with two-sided alternative, this test
    produces the same p-value as ``proportions_chisquare``, since the
    chisquare is the distribution of the square of a standard normal
    distribution.
    (TODO: verify that this really holds)

    TODO: add continuity correction or other improvements for small samples.

    '''
    # Array conversion lets lists and tuples be passed directly.
    count = np.asarray(count)
    nobs = np.asarray(nobs)

    # Without broadcasting, a scalar nobs would enter the pooled proportion
    # and the sum of 1/nobs only once even when there are two samples.
    if nobs.size == 1:
        nobs = nobs * np.ones_like(count)

    prop = count * 1. / nobs
    k_sample = np.size(prop)

    # value defaults to None, which cannot be subtracted from a proportion.
    if value is None:
        if k_sample == 1:
            raise ValueError('value must be provided for a 1-sample test')
        value = 0

    if k_sample == 1:
        diff = prop - value
    elif k_sample == 2:
        diff = prop[0] - prop[1] - value
    else:
        msg = 'more than two samples are not implemented yet'
        raise NotImplementedError(msg)

    p_pooled = np.sum(count) * 1. / np.sum(nobs)

    nobs_fact = np.sum(1. / nobs)
    if prop_var:
        # Use the caller's proportion instead of the pooled estimate.
        p_pooled = prop_var
    var_ = p_pooled * (1 - p_pooled) * nobs_fact
    std_diff = np.sqrt(var_)
    # Imported lazily, as in the original implementation.
    from statsmodels.stats.weightstats import _zstat_generic2
    return _zstat_generic2(diff, std_diff, alternative)
Beispiel #6
0
    for Id in IdLs:
        BranchScoreDf.loc[Indx, Id] = abs(row[f"{Id}_Subs"] -
                                          AvgLen[f"{Id}_Subs"])  #**2

# Add a 'TreeFull' column holding the row-wise sum of the IdLs columns.
BranchScoreDf['TreeFull'] = BranchScoreDf[IdLs].sum(axis=1)
# Write the scores as a tab-separated file; the index column is labelled 'Id'.
BranchScoreDf.to_csv('BranchScoreDf.tsv',
                     sep='\t',
                     header=True,
                     index=True,
                     index_label='Id')

# Starts empty and is filled one cell at a time by the loops below, so it
# ends up with the same row labels and columns as BranchScoreDf.
BraScStatSigDf = pd.DataFrame(dtype=np.float64)
ColLs = list(BranchScoreDf.columns)
for Indx, row in BranchScoreDf.iterrows():
    for Col in ColLs:
        # Each score is passed to _zstat_generic2 together with the entry of
        # BsDf in the matching "SE_<column>" column (same row label).
        # Only the last element of the result is kept, and it is halved --
        # presumably the two-sided p-value turned into a one-sided one;
        # TODO confirm against weightstats._zstat_generic2.
        BraScStatSigDf.at[Indx, Col] = weightstats._zstat_generic2(
            row[Col],
            BsDf.at[Indx, f"SE_{Col}"],
            alternative='two-sided',
        )[-1] / 2
# Values as computed above, before FdrCorrection is applied.
BraScStatSigDf.to_csv('BraScStatSigDf.tsv',
                      sep='\t',
                      header=True,
                      index=True,
                      index_label='Id')
# The return value is discarded, so FdrCorrection (defined elsewhere)
# presumably adjusts BraScStatSigDf in place -- verify before relying on it.
FdrCorrection(BraScStatSigDf)
# Same table written again after the FdrCorrection call, to a separate file.
BraScStatSigDf.to_csv('FdrBraScStatSigDf.tsv',
                      sep='\t',
                      header=True,
                      index=True,
                      index_label='Id')