Ejemplo n.º 1
1
def SBDFP_Calc(DF, FP="ECFP4", FORMAT="RDKit"):
    """Compute a statistical-based database fingerprint (SB-DFP) for a dataset.

    For every bit position, the number of molecules in ``DF`` with that bit
    set is compared against a reference count with a one-sided two-proportion
    z-test (``alternative='smaller'``, reference first). The output bit is 1
    when the p-value is below 0.01 and 0 otherwise.

    Parameters
    ----------
    DF : dataframe with a column "ECFP4FP" or "MACCSFP" holding the respective
        fingerprints as RDKit objects (e.g. as produced by the
        LoadDatasetFromCSV function).
    FP : "ECFP4" or "MACCS", selecting the fingerprint column and the
        reference file ("ECFP4.counts" or "MACCS.counts"). The reference file
        holds the comma-separated "1" bit counts on its first line and is
        opened relative to the current working directory.
    FORMAT : "RDKit" to get an RDKit bit vector, "TEXT" to get the bit string.

    Returns
    -------
    The SB-DFP in the requested format; ``None`` for any other ``FORMAT``.

    Raises
    ------
    ValueError
        If ``FP`` is not one of the supported fingerprints (previously this
        surfaced as an UnboundLocalError at the return statement).
    """
    # Fingerprint column and reference-count file for each supported FP.
    sources = {
        "ECFP4": ("ECFP4FP", "ECFP4.counts"),
        "MACCS": ("MACCSFP", "MACCS.counts"),
    }
    if FP not in sources:
        raise ValueError('FP must be "ECFP4" or "MACCS", got {!r}'.format(FP))
    column, counts_file = sources[FP]

    # Number of observations paired with the reference counts in the z-test.
    reference_size = 15403690

    # Per-bit "1" counts over the dataset.
    fps_text = [DataStructs.BitVectToText(fp) for fp in DF[column]]
    bit_counts = [0] * len(fps_text[0])
    for bits in fps_text:
        bit_counts = [total + int(bit) for total, bit in zip(bit_counts, bits)]

    # Only the first line of the reference file is used; the context manager
    # guarantees the handle is closed.
    with open(counts_file) as ref_file:
        ref_counts = [int(x) for x in ref_file.readline().split(",")]

    n_rows = DF.shape[0]
    flags = []
    for i, ref_count in enumerate(ref_counts):
        stat, pval = proportions_ztest([ref_count, bit_counts[i]],
                                       [reference_size, n_rows],
                                       alternative='smaller')
        flags.append("1" if pval < 0.01 else "0")
    SBDFP = "".join(flags)

    if FORMAT == "RDKit":
        return DataStructs.CreateFromBitString(SBDFP)
    elif FORMAT == "TEXT":
        return SBDFP
Ejemplo n.º 2
0
def perform_z_test(m1, n1, m2, n2, delta, alpha):
    """Test whether two proportions differ by more than ``delta``.

    The larger observed proportion is always passed first to
    ``proportions_ztest`` with ``value=delta`` and ``alternative='larger'``
    (original note: null hypothesis |p1 - p2| < delta).

    Parameters
    ----------
    m1, n1 : successes and sample size of the first group.
    m2, n2 : successes and sample size of the second group.
    delta : difference used as the null value of the test.
    alpha : significance level the p-value is compared against.

    Returns
    -------
    (significant, info) where ``significant`` is ``p-value < alpha`` and
    ``info`` is a dict with the inputs, the proportions, the test statistic
    and the p-value. If the test itself raises, ``significant`` is False and
    the statistics in ``info`` are NaN.
    """
    # A group with no observations is given proportion 0.
    p1 = 0 if n1 == 0 else m1 / n1
    p2 = 0 if n2 == 0 else m2 / n2
    # Guard the pooled proportion the same way so two empty groups do not
    # raise ZeroDivisionError before the test is even attempted.
    n_total = n1 + n2
    p_hat = np.nan if n_total == 0 else (m1 + m2) / n_total
    if __VERBOSE__:
        print('p1, p2, abs(p1 - p2), delta, se:', p1, p2, abs(p1 - p2), delta)

    info = {M1_ATTR_NAME: m1, N1_ATTR_NAME: n1, M2_ATTR_NAME: m2, N2_ATTR_NAME: n2,
            P1_ATTR_NAME: p1, P2_ATTR_NAME: p2, 'diff': abs(p1 - p2)}

    try:
        if p1 > p2:
            stat_test = proportions_ztest([m1, m2], [n1, n2], value=delta, alternative='larger')
        else:
            stat_test = proportions_ztest([m2, m1], [n2, n1], value=delta, alternative='larger')
    except Exception:
        # Narrowed from a bare ``except:`` so KeyboardInterrupt / SystemExit
        # are no longer swallowed.
        print('z_test assumption not met')
        info.update({'p_hat': np.nan, 'se': np.nan,
                     STATISTICS_ATTR_NAME: np.nan, PVALUE_ATTR_NAME: np.nan,
                     'delta': delta, 'alpha': alpha, 'significant_diff': False})
        return False, info

    significant = stat_test[1] < alpha
    info.update({'p_hat': p_hat,
                 STATISTICS_ATTR_NAME: stat_test[0], PVALUE_ATTR_NAME: stat_test[1],
                 'delta': delta, 'alpha': alpha, 'significant_diff': significant})
    return significant, info
def t_test(curDict, sub):
    """Run pairwise two-proportion z-tests between populations.

    Despite the name, the test performed is ``proportions_ztest``.

    Parameters
    ----------
    curDict : mapping from population name to a sequence of values; the sum
        of the sequence is used as the success count and its length as the
        number of observations. Nothing is computed when
        ``curDict['Africa']`` is empty.
    sub : if truthy, compare every pair from the module-level
        ``subpopulations``; otherwise every pair from ``superpopulations``.

    Returns
    -------
    dict mapping each p-value to the list of ``[pop1, pop2]`` pairs that
    produced it.
    """
    from itertools import combinations

    td = {}
    if len(curDict['Africa']) == 0:
        return td

    populations = subpopulations if sub else superpopulations
    # combinations() yields each unordered pair once, in the same order as
    # the former copy-and-pop(0) loops.
    for pop1, pop2 in combinations(populations, 2):
        count = np.asarray([sum(curDict[pop1]), sum(curDict[pop2])])
        nobs = np.asarray([len(curDict[pop1]), len(curDict[pop2])])
        stat, P = proportions_ztest(count, nobs)
        td.setdefault(P, []).append([pop1, pop2])
    return td
Ejemplo n.º 4
0
def test_proportion_ztests():
    # The z-test is only checked for consistency with the chi-square test on
    # proportions; only p-values are compared.
    # Note: alternative handling is generic

    # One sample, null proportion 0.5.
    z_res = smprop.proportions_ztest(15, 20., value=0.5, prop_var=0.5)
    chi2_res = smprop.proportions_chisquare(15, 20., value=0.5)
    assert_almost_equal(z_res[1], chi2_res[1], decimal=13)

    # Two samples with float nobs.
    successes = [15, 10]
    float_nobs = [20., 20]
    z_res = smprop.proportions_ztest(np.asarray(successes),
                                     np.asarray(float_nobs),
                                     value=0,
                                     prop_var=None)
    chi2_res = smprop.proportions_chisquare(np.asarray(successes),
                                            np.asarray(float_nobs))
    assert_almost_equal(z_res[1], chi2_res[1], decimal=13)

    # Two samples with integer nobs, issue #7603.
    int_nobs = [20, 50000]
    z_res = smprop.proportions_ztest(np.asarray(successes),
                                     np.asarray(int_nobs),
                                     value=0,
                                     prop_var=None)
    chi2_res = smprop.proportions_chisquare(np.asarray(successes),
                                            np.asarray(int_nobs))
    assert_almost_equal(z_res[1], chi2_res[1], decimal=13)
    # expected should be positive
    assert_array_less(0, chi2_res[-1][1])
def test_proportion_ztests():
    # The z-test is only checked for consistency with the chi-square test on
    # proportions; only p-values are compared.
    # Note: alternative handling is generic

    # One sample, null proportion 0.5.
    z_res = smprop.proportions_ztest(15, 20., value=0.5, prop_var=0.5)
    chi2_res = smprop.proportions_chisquare(15, 20., value=0.5)
    assert_almost_equal(z_res[1], chi2_res[1], decimal=13)

    # Two samples.
    successes = [15, 10]
    trials = [20., 20]
    z_res = smprop.proportions_ztest(np.asarray(successes), np.asarray(trials),
                                     value=0, prop_var=None)
    chi2_res = smprop.proportions_chisquare(np.asarray(successes),
                                            np.asarray(trials))
    assert_almost_equal(z_res[1], chi2_res[1], decimal=13)
Ejemplo n.º 6
0
def test_proportion_ztests():
    # The z-test is only checked for consistency with the chi-square test on
    # proportions; only p-values are compared.
    # Note: alternative handling is generic

    # One sample, null proportion 0.5.
    z_res = smprop.proportions_ztest(15, 20., value=0.5, prop_var=0.5)
    chi2_res = smprop.proportions_chisquare(15, 20., value=0.5)
    assert_almost_equal(z_res[1], chi2_res[1], decimal=13)

    # Two samples.
    successes = [15, 10]
    trials = [20., 20]
    z_res = smprop.proportions_ztest(np.asarray(successes), np.asarray(trials),
                                     value=0, prop_var=None)
    chi2_res = smprop.proportions_chisquare(np.asarray(successes),
                                            np.asarray(trials))
    assert_almost_equal(z_res[1], chi2_res[1], decimal=13)
def test_hypothesis_proportions():
    # Worked example, source:
    # https://sonalake.com/latest/hypothesis-testing-of-proportion-based-samples/
    alpha = 0.05

    # Observed sample: 1367 good items out of 1520 (about 90%).
    n_good, n_total = 1367, 1520

    # H0: the true proportion is 85%; Ha: it is larger.
    # (Use alternative='smaller' for Ha < H0, 'two-sided' for Ha != H0.)
    p_null = 0.85
    stat, p_value = proportions_ztest(count=n_good,
                                      nobs=n_total,
                                      value=p_null,
                                      alternative='larger')

    print('z_stat: %0.3f, p_value: %0.3f' % (stat, p_value))
    verdict = (
        "Fail to reject the null hypothesis - we have nothing else to say"
        if p_value > alpha else
        "Reject the null hypothesis - suggest the alternative hypothesis is true"
    )
    print(verdict)
def compare_proportion(p_clu, vocab, behr):
    """Compare per-cluster term proportions with pairwise z-tests.

    Parameters
    ----------
    p_clu : mapping from an id to its cluster label. Labels are converted
        with ``int()`` and used as list positions, so they are expected to be
        0..k-1 for k clusters.
    vocab : mapping whose keys are terms to include in the comparison.
    behr : mapping from the same ids to the list of terms for that id.

    Returns
    -------
    dict mapping each term to ``{'pval': array, 'count': list}`` where
    ``count[c]`` is ``(occurrences of the term in cluster c, total terms in
    cluster c)`` and ``pval`` is a (k-1, k-1) array holding, at
    ``[c, j - 1]``, the p-value for clusters ``c < j`` multiplied by the
    number of comparisons performed for that term (Bonferroni correction).
    Pairs where the term is absent from both clusters are skipped and stay 0.
    """
    from collections import Counter

    term_cl = {}
    set_terms = set(vocab.keys())
    for p_id, cl in p_clu.items():
        term_cl.setdefault(cl, list()).extend(behr[p_id])
        set_terms.update(behr[p_id])

    n_clusters = len(term_cl)
    term_prop = {t: [None] * n_clusters for t in set_terms}
    for cl, terms in term_cl.items():
        # Tally once per cluster instead of calling list.count per term.
        tally = Counter(terms)
        n_terms = len(terms)
        slot = int(cl)
        for t in set_terms:
            term_prop[t][slot] = (tally[t], n_terms)

    # ztest of proportion with bonferroni-corrected p-values
    shape = (n_clusters - 1, n_clusters - 1)
    result_ztest = {t: {'pval': np.zeros(shape), 'count': term_prop[t]} for t in term_prop}
    for t, prop_count in term_prop.items():
        count_comp = 0
        for cl in range(len(prop_count)):
            for idx in range(cl + 1, len(prop_count)):
                if prop_count[cl][0] != 0 or prop_count[idx][0] != 0:
                    count = np.array([prop_count[cl][0], prop_count[idx][0]])
                    nobs = np.array([prop_count[cl][1], prop_count[idx][1]])
                    stat, pval = proportions_ztest(count, nobs)
                    result_ztest[t]['pval'][cl][idx - 1] = pval
                    count_comp += 1
        result_ztest[t]['pval'] = result_ztest[t]['pval'] * count_comp
    return result_ztest
Ejemplo n.º 9
0
def test_ztest_ztost():
    # Cross-check weightstats against the separately tested proportion
    # ztest / ztost functions.
    import statsmodels.stats.proportion as smprop

    values = [0, 1]
    weights_a = [5, 15]

    prop_z = smprop.proportions_ztest(15, 20., value=0.5)
    stats_a = DescrStatsW(values, weights_a)
    assert_allclose(stats_a.ztest_mean(0.5), prop_z, rtol=0.03, atol=0.003)

    # Same sample with the weights rescaled by 21/20.
    stats_scaled = DescrStatsW(values, np.array(weights_a)*21./20)
    assert_almost_equal(stats_scaled.ztest_mean(0.5), prop_z, decimal=12)

    tost_w = stats_scaled.ztost_mean(0.4, 0.6)
    tost_p = smprop.proportions_ztost(15, 20., 0.4, 0.6)
    assert_almost_equal(tost_w[0], tost_p[0], decimal=12)

    # Two-sample comparison against the chi-square test on proportions.
    stats_b = DescrStatsW([0, 1], [10, 10])
    two_sample = ztest(stats_a.asrepeats(), stats_b.asrepeats())
    chi2 = smprop.proportions_chisquare(np.asarray([15, 10]),
                                        np.asarray([20., 20]))
    # TODO: check this is this difference expected?, see test_proportion
    assert_allclose(two_sample[1], chi2[1], rtol=0.03)

    compared = CompareMeans(stats_a, stats_b).ztest_ind()
    assert_allclose(compared[1], chi2[1], rtol=0.03)
    assert_almost_equal(compared, two_sample, decimal=12)
Ejemplo n.º 10
0
def pval_cal_withoutrep(df):
    """Score each row by methylation difference and z-test p-value.

    The columns mc2_rep1/h2_rep1 are treated as the case counts and
    mc1_rep1/h1_rep1 as the control counts. The returned frame is the renamed
    input with the added columns meth_diff, meth_case, meth_cont, ztestpval
    and val, plus a min-max scaled ``exp_val`` and a min-max scaled
    ``smooth_val`` obtained from the external ``smoothing`` helper.
    """
    column_names = {'mc2_rep1': 'mc_case', 'mc1_rep1': 'mc_cont',
                    'h2_rep1': 'h_case', 'h1_rep1': 'h_cont'}
    df = df.rename(columns=column_names)

    case_ratio = df.mc_case.divide(df.h_case)
    control_ratio = df.mc_cont.divide(df.h_cont)
    df['meth_diff'] = case_ratio.subtract(control_ratio)
    df['meth_case'] = case_ratio
    df['meth_cont'] = control_ratio

    def row_pvalue(r):
        # Two-sided test of control vs. case proportions for one row.
        counts = np.array([r.mc_cont, r.mc_case])
        totals = np.array([r.h_cont, r.h_case])
        return sm.proportions_ztest(counts, totals, value=0, alternative='two-sided')[1]

    df['ztestpval'] = df.apply(row_pvalue, axis=1)
    # Missing values (e.g. from zero denominators) become 0 before scoring.
    df = df.fillna(0)

    abs_diff = df.meth_diff.abs()
    one_minus_p = 1 - df.ztestpval
    df['val'] = abs_diff.multiply(one_minus_p)

    exp_val = abs_diff.multiply(one_minus_p.apply(np.exp))
    lowest = exp_val.min()
    scaled_exp_val = (exp_val.subtract(lowest)).divide(exp_val.max() - lowest)

    smooth_exp_val = smoothing(*exp_val)
    smooth_low = min(smooth_exp_val)
    scaled_smooth_exp_val = (smooth_exp_val - smooth_low) / (max(smooth_exp_val) - smooth_low)

    extra_columns = [pd.DataFrame({"exp_val": scaled_exp_val}),
                     pd.DataFrame({"smooth_val": scaled_smooth_exp_val})]
    return pd.concat([df] + extra_columns, axis=1)
Ejemplo n.º 11
0
    def get_node_info(self):
        '''Pivots data and extracts information from it.

        Groups ``self.data`` by the option column and aggregates the click
        column into opens (count), clicks (sum) and mean.

        Returns a tuple ``(choice, incremental_clicks, p_value, pivot)``:
        the option label with the highest mean, the rounded click gain of
        that option over the overall mean, the p-value of a two-sided
        proportions z-test across options, and the pivot as a dict keyed by
        option.
        '''
        # Named aggregation replaces the dict-renaming form of agg(), which
        # raises SpecificationError on pandas >= 1.0.
        clicks_by_option = self.data.groupby(
            [self.option_column_name])[self.click_column_name]
        pivot = clicks_by_option.agg(opens='count', clicks='sum', mean='mean')

        # 1000 Beta draws per option; the share of draws in which an option
        # has the largest value becomes its probability of being chosen.
        # NOTE(review): the second Beta parameter is 1 + opens, not
        # 1 + (opens - clicks) -- confirm this is intended.
        sims = pivot.apply(lambda x: [
            np.random.beta(1 + x['clicks'], 1 + x['opens'])
            for i in range(1000)
        ],
                           axis=1)
        pivot['prob_of_choice'] = pd.DataFrame(
            list(zip(*(sims.tolist()))), columns=sims.index).idxmax(
                axis=1, skipna=True).value_counts() / 1000
        # Options that never had the largest draw get probability 0.
        pivot = pivot.fillna(0)
        incremental_clicks = np.round(
            (pivot['mean'].max() - self.data[self.click_column_name].mean()) *
            pivot['opens'].sum(), 0)
        z, p_value = proportions_ztest(count=pivot['clicks'],
                                       nobs=pivot['opens'],
                                       value=0,
                                       alternative='two-sided')
        # idxmax() returns the option label; Series.argmax() returns a
        # positional integer on pandas >= 1.0.
        choice = pivot['mean'].idxmax()
        pivot = pivot.to_dict(orient='index')

        return (choice, incremental_clicks, p_value, pivot)
Ejemplo n.º 12
0
def hypothesis_testing_5():
    """One-sample proportion test on the PCW alarms of a single bus.

    H0: the proportion of PCW alarms raised above 30 km/h is <= 0.4.
    Ha: that proportion is > 0.4.
    Prints the p-value and a verdict at the 0.05 level.
    """
    alerts = pd.read_excel(xls, 'AlertData')
    # The FuelInfo sheet is loaded but not used further in this function.
    fuel_info = pd.read_excel(xls, 'FuelInfo')

    # Pre-requirement: dump the second device group to '<deviceId>.csv'.
    device_groups = list(alerts.groupby(['deviceId'], as_index=False))
    second_id, second_frame = device_groups[1][0], device_groups[1][1]
    second_frame.to_csv(second_id + '.csv')

    # The analysis reads back a fixed file name
    # (bus id = 12DF03C6:19523068255842304686).
    bus = pd.read_csv('12DF03C6:19523068255842304686.csv')
    bus = bus.loc[bus['alarmType'] == 'PCW']
    no_of_trials = len(bus)
    no_of_success = len(bus.loc[bus['speed'] > 30])

    stat, pval = proportions_ztest(no_of_success,
                                   no_of_trials,
                                   0.4,
                                   alternative='larger')
    print(pval)
    message = (
        'Null hypothesis is rejected and hence the bus is driven by careless drivers'
        if (pval < 0.05) else 'Bus drivers are careful')
    print(message)
Ejemplo n.º 13
0
    def simulate(self):
        """Simulate one experiment and test it for an `effect`.

        Bernoulli outcomes are drawn for the treatment group (rate
        ``natural_rate + absolute_effect``) and then for the control group
        (rate ``natural_rate``); the two success counts are compared with a
        proportions z-test.

        Returns
        -------
        z_stat : float
            The z statistic from z-test in StatsModels
        p_value : float [0, 1]
            The p-value from z-test in StatsModels
        effect_point_estimate : float
            Observed treatment rate minus observed control rate, rounded to
            4 decimals
        """
        n_treat = int(np.ceil(self.sample_size * self.test_split))
        n_control = int(self.sample_size - n_treat)

        # Treatment is drawn before control so the random stream is consumed
        # in a fixed order.
        treated_rate = self.natural_rate + self.absolute_effect
        treat_successes = np.random.binomial(1, treated_rate, n_treat).sum()
        control_successes = np.random.binomial(1, self.natural_rate,
                                               n_control).sum()

        observed_gap = (treat_successes / float(n_treat) -
                        control_successes / float(n_control))
        effect_point_estimate = round(observed_gap, 4)

        z_stat, p_value = proportions_ztest(
            [treat_successes, control_successes], [n_treat, n_control])
        return z_stat, p_value, effect_point_estimate
Ejemplo n.º 14
0
def test_ztest_ztost():
    # Cross-check weightstats against the separately tested proportion
    # ztest / ztost functions.
    import statsmodels.stats.proportion as smprop

    values = [0, 1]
    weights_a = [5, 15]

    prop_z = smprop.proportions_ztest(15, 20., value=0.5)
    stats_a = DescrStatsW(values, weights_a)
    assert_allclose(stats_a.ztest_mean(0.5), prop_z, rtol=0.03, atol=0.003)

    # Same sample with the weights rescaled by 21/20.
    stats_scaled = DescrStatsW(values, np.array(weights_a) * 21. / 20)
    assert_almost_equal(stats_scaled.ztest_mean(0.5), prop_z, decimal=12)

    tost_w = stats_scaled.ztost_mean(0.4, 0.6)
    tost_p = smprop.proportions_ztost(15, 20., 0.4, 0.6)
    assert_almost_equal(tost_w[0], tost_p[0], decimal=12)

    # Two-sample comparison against the chi-square test on proportions.
    stats_b = DescrStatsW([0, 1], [10, 10])
    two_sample = ztest(stats_a.asrepeats(), stats_b.asrepeats())
    chi2 = smprop.proportions_chisquare(np.asarray([15, 10]),
                                        np.asarray([20., 20]))
    # TODO: check this is this difference expected?, see test_proportion
    assert_allclose(two_sample[1], chi2[1], rtol=0.03)

    compared = CompareMeans(stats_a, stats_b).ztest_ind()
    assert_allclose(compared[1], chi2[1], rtol=0.03)
    assert_almost_equal(compared, two_sample, decimal=12)
Ejemplo n.º 15
0
def z_proportions(successes_1,
                  trials_1,
                  successes_2,
                  trials_2,
                  h1='two-sided'):
    """
    Two-sample test for proportions based on the normal (z) test.

    Parameters
    ----------
    successes_1, successes_2 : Number of successes in each of the two
        independent samples.
    trials_1, trials_2 : Number of trials or observations in each of the two
        independent samples.
    h1 : str in ['two-sided', 'smaller', 'larger']
        Alternative hypothesis. ``smaller`` means ``p1 < p2`` and ``larger``
        means ``p1 > p2``, where ``p1`` is the proportion of the first
        sample and ``p2`` of the second one.

    Returns
    -------
    p_value : The p-value for the test.
    """
    counts = np.array([successes_1, successes_2])
    totals = np.array([trials_1, trials_2])
    # Only the p-value is reported; the z statistic is discarded.
    _, p_value = proportions_ztest(count=counts, nobs=totals, alternative=h1)
    return p_value
Ejemplo n.º 16
0
def z_test(df: pd.DataFrame) -> pd.DataFrame:
    """Build a significance-letter table for a table of counts.

    Every cell is compared with every other cell in the same row using a
    two-proportion z-test, with the column sums as the numbers of
    observations. When a cell is significantly higher than another
    (p < 0.05 and larger value), the letter label of the other column is
    appended to that cell in the result.

    Parameters
    ----------
    df : table of counts; at most 13 columns are supported because column
        labels are taken from the string "abcdefghjklmn".

    Returns
    -------
    DataFrame with the same index as ``df``, column names suffixed with their
    letter label, and cells holding the concatenated labels ("" when there
    are none).
    """
    # Empty (all-NaN) frame shaped like df; object dtype so that string
    # labels can be stored without dtype coercion.
    df_sig = pd.DataFrame().reindex_like(df).astype(object)
    labels = "abcdefghjklmn"  # letter label for each column position

    # Column totals do not change inside the loops, so compute them once.
    totals = {col: df[col].sum() for col in df.columns}

    for rowIndex, row in df.iterrows():  # every row
        for col1, value1 in row.items():  # every column
            for col2, value2 in row.items():  # every column again
                stat, p = proportions_ztest(
                    np.array([value1, value2]),
                    np.array([totals[col1], totals[col2]]),
                )
                if p < 0.05 and value1 > value2:
                    tag = labels[df.columns.get_loc(col2)]
                    current = df_sig.loc[rowIndex, col1]
                    # The first label for a cell is assigned directly; later
                    # labels are appended. An explicit NaN check replaces the
                    # former bare ``except:`` around the concatenation.
                    df_sig.loc[rowIndex, col1] = (
                        tag if pd.isna(current) else current + tag)

    # Append the letter label to each column name.
    for j, col in enumerate(df_sig.columns):
        df_sig.rename(columns={col: col + labels[j]}, inplace=True)

    return df_sig.fillna("")
Ejemplo n.º 17
0
def run_proportion_Z_test(feature):
    """Two-proportion z-test of ``feature`` between non-churned and churned rows.

    Uses the module-level ``df``. Rows with ``churned == 'False.'`` form the
    first group and rows with ``churned == 'True.'`` the second; the sum of
    ``feature`` is the success count and the group size the number of
    observations.

    Returns ``(z_score, p_value)``. The function previously computed these
    values and discarded them.
    """
    dist1 = df.loc[df.churned == 'False.', feature]
    dist2 = df.loc[df.churned == 'True.', feature]
    n1 = len(dist1)
    p1 = dist1.sum()
    n2 = len(dist2)
    p2 = dist2.sum()
    z_score, p_value = proportions_ztest([p1, p2], [n1, n2])
    return z_score, p_value
Ejemplo n.º 18
0
def proportiontestResult():
    """Run a one-sample proportion z-test on submitted form values and render it.

    Reads 'count', 'nob' and 'value' from the request form, converts them to
    float and passes the resulting statistic and p-value to the result
    template as ``z`` and ``p``.
    """
    form = request.form
    count = form.get('count')
    nob = form.get('nob')
    value = form.get('value')

    stat, pval = proportions_ztest(float(count), float(nob), float(value))

    template = 'one-sample-tests/proportion-test/proportion-test-result.html'
    return render_template(template, z=stat, p=pval)
    def test_scalar(self):
        # One-sample z-test on scalar inputs, checked against fixed
        # reference values.
        successes, trials, p_null = 5, 83, 0.05
        z_stat, p_value = smprop.proportions_ztest(successes, trials,
                                                   value=p_null)
        assert_almost_equal(z_stat, 0.392126026314)
        assert_almost_equal(p_value, 0.694965098115)

        # Passing value=None for these inputs must raise ValueError.
        assert_raises(ValueError, smprop.proportions_ztest, successes, trials,
                      value=None)
Ejemplo n.º 20
0
def two_sided_z_test(count, nobs):
    """
    Run a two-sided z-test for proportions.

    - ``count`` and ``nobs`` are forwarded unchanged to ``proportions_ztest``
      with ``alternative="two-sided"``, and its result is returned as-is.
    - Use case: a fair AB test where the control group got 486 clicks out of
      5000 impressions and the experiment group got 527 clicks out of 5000
      impressions; the test helps decide whether the experiment group
      really won.
    """
    outcome = proportions_ztest(count, nobs, alternative="two-sided")
    return outcome
Ejemplo n.º 21
0
 def ztest(self):
     """Two-sample proportions z-test between ``self.d1`` and ``self.d2``.

     For each sample, ``sum(data)`` is the success count and ``nobs`` the
     number of observations. ``self.test_direction`` is used as the
     alternative and ``self.pooled_variance`` as ``prop_var``.
     """
     variance = self.pooled_variance
     successes, trials = [], []
     for sample in (self.d1, self.d2):
         trials.append(sample.nobs)
         successes.append(sum(sample.data))
     return proportions_ztest(successes, trials,
                              alternative=self.test_direction,
                              prop_var=variance)
Ejemplo n.º 22
0
    def test_scalar(self):
        # One-sample z-test on scalar inputs, checked against fixed
        # reference values.
        successes, trials, p_null = 5, 83, 0.05
        z_stat, p_value = smprop.proportions_ztest(successes, trials,
                                                   value=p_null)
        assert_almost_equal(z_stat, 0.392126026314)
        assert_almost_equal(p_value, 0.694965098115)

        # Passing value=None for these inputs must raise ValueError.
        assert_raises(ValueError, smprop.proportions_ztest, successes, trials,
                      value=None)
def get_ztest(data1, data2, g_var, hyp_type):
    '''Print the p-value and z-statistic of a two-proportion z-test.

    For each frame, the success count is the sum of ``g_var`` over the rows
    where it equals 1, and the number of observations is the frame length.
    ``hyp_type`` is passed on as the alternative. Nothing is returned.
    '''
    def positives(frame):
        # Sum of the flag column over rows where the flag is 1.
        return frame[frame[g_var] == 1][g_var].sum()

    count = [positives(data1), positives(data2)]
    nobs = [len(data1), len(data2)]
    stat, pval = proportions_ztest(count, nobs, value=0, alternative=hyp_type)
    print('p-value: {0:0.3f}'.format(pval))
    print('z-statistic: {}'.format(round(stat, 3)))
Ejemplo n.º 24
0
 def test1(self):
     # Arrange: 5 successes in 83 trials, null proportion 0.05, two-sided.
     arguments = (5, 83, 0.05, 'two-sided')
     # Act
     outcome = prop.proportions_ztest(*arguments)
     # Assert: statistic and p-value agree to three decimal places.
     expected = (0.392, 0.695)
     self.assertAlmostEqual(outcome[0], expected[0], places=3)
     self.assertAlmostEqual(outcome[1], expected[1], places=3)
Ejemplo n.º 25
0
def proportionsztestResult():
    """Run a two-sample proportions z-test on submitted form values and render it.

    Reads 'count1', 'count2', 'nob1' and 'nob2' from the request form,
    converts them to float and passes the resulting statistic and p-value to
    the result template as ``z`` and ``p``.
    """
    field_names = ('count1', 'count2', 'nob1', 'nob2')
    raw_values = [request.form.get(field) for field in field_names]
    c1, c2, n1, n2 = [float(raw) for raw in raw_values]

    count = np.array([c1, c2])
    nobs = np.array([n1, n2])
    stat, pval = proportions_ztest(count, nobs)

    template = 'two-sample-tests/proportions-z-test/proportions-z-test-result.html'
    return render_template(template, z=stat, p=pval)
Ejemplo n.º 26
0
def run_proportion_z_test(feature,
                          col='loan_status',
                          value1='Fully Paid',
                          value2='Default'):
    '''Return the z-score and p-value of a two-proportion z-test as text.

    The module-level ``df`` is split into the rows where ``col`` equals
    ``value1`` and those where it equals ``value2``. For each group the sum
    of ``feature`` is the success count and the group size the number of
    observations.
    '''
    successes, sizes = [], []
    for label in (value1, value2):
        group = df.loc[df[col] == label, feature]
        sizes.append(len(group))
        successes.append(group.sum())

    z_score, p_value = proportions_ztest(successes, sizes)
    return 'z-score = {}; p-value = {}'.format(z_score, p_value)
Ejemplo n.º 27
0
def ztest(data: pd.DataFrame,
          factors: np.ndarray,
          levels: np.ndarray,
          y: str,
          name: str,
          alpha=0.05) -> dict:
    '''
    For each factor, test whether the proportion of rows with ``y == 1``
    differs between the factor's levels: a two-proportion z-test when there
    are two groups, a chi-square test on proportions when there are more.

    Level columns are looked up as ``<factor>_<level>`` for every level
    except the first; the first level is the baseline, i.e. the rows where
    all those columns are 0.

    Parameters
    ----------
    data (pandas.DataFrame): df containing data of the experiment.
    factors (numpy.ndarray): the list of the independent variables.
    levels (numpy.ndarray): the matrix of factor x level. Each row represents a
    factor and each element in a row represents a level.
    y (str): the name of the dependent variable.
    name (str): the version of the experiment.
    alpha (float): ignored.

    Return
    -------
    result_dict (dict): keys are factors and values are the rounded test
    statistic followed by the p-value formatted with ``p_to_string``.
    "Experiment" key is used as an index.
    '''
    result_dict = {'Experiment': name.replace('_', ' ').title()}
    positives = data[data[y] == 1]

    for factor, factor_levels in zip(factors, levels):
        count, nobs = [], []
        # Narrowed level by level down to the baseline rows.
        baseline_all, baseline_pos = data, positives
        for level in factor_levels[1:]:
            column = factor + '_' + level
            baseline_all = baseline_all[baseline_all[column] == 0]
            baseline_pos = baseline_pos[baseline_pos[column] == 0]
            count.append(len(positives[positives[column] == 1]))
            nobs.append(len(data[data[column] == 1]))
        # The baseline group goes last.
        count.append(len(baseline_pos))
        nobs.append(len(baseline_all))

        test = proportions_chisquare if len(count) > 2 else proportions_ztest
        result = test(np.array(count), np.array(nobs))

        p = p_to_string(result[1])
        result_dict[factor] = str(round(result[0], 2)) + ' ' + p
    return result_dict
Ejemplo n.º 28
0
    def find_best_split_for_binary_variable(self, variable):
        '''Searches for best value split based on p-value and incremental clicks.
           Optimized for binary type variables.

           Groups ``self.data`` by option and ``variable``, keeps the result
           only if every option has at least 1000 opens for every value of
           ``variable``, and keeps the values whose two-sided proportions
           z-test p-value is at most ``self.max_p_value``.

           Returns ``(split_value, total_incremental_clicks)`` with
           ``split_value`` fixed at 1, or ``(None, 0)`` when nothing
           qualifies.
        '''
        # Named aggregation replaces the dict-renaming form of agg(), which
        # raises SpecificationError on pandas >= 1.0.
        grouped_clicks = self.data.groupby(
            [self.option_column_name, variable])[self.click_column_name]
        pivot = grouped_clicks.agg(
            opens='count', clicks='sum',
            mean='mean').unstack(self.option_column_name)
        pivot = pivot.fillna(0)
        # Flatten the (aggregate, option) columns to e.g. 'opens_<option>'.
        pivot.columns = ['_'.join(col).strip() for col in pivot.columns.values]
        pivot.reset_index(inplace=True)

        opens_cols = [s for s in pivot.columns if "opens_" in s]
        clicks_cols = [s for s in pivot.columns if "clicks_" in s]
        mean_cols = [s for s in pivot.columns if "mean_" in s]

        # A single flag for the whole table: every option must have at least
        # 1000 opens in every row.
        pivot['can_use'] = np.all(np.all(pivot[opens_cols] >= 1000, axis=1))
        # copy() so the column assignments below do not write to a slice.
        pivot = pivot[pivot['can_use'] == True].copy()

        if pivot.empty:
            return None, 0

        pivot['p_value'] = pivot.apply(lambda row: proportions_ztest(
            count=np.array(row[clicks_cols]),
            nobs=np.array(row[opens_cols]),
            value=0,
            alternative='two-sided')[1],
                                       axis=1)
        pivot = pivot[pivot['p_value'] <= self.max_p_value].copy()
        if pivot.empty:
            return None, 0

        pivot['total_opens'] = pivot[opens_cols].sum(axis=1)
        pivot['total_clicks'] = pivot[clicks_cols].sum(axis=1)
        pivot['total_mean'] = pivot['total_clicks'] / pivot['total_opens']
        # Gain from sending everyone the best option instead of the mix.
        pivot['total_incremental_clicks'] = (
            pivot[mean_cols].max(axis=1) -
            pivot['total_mean']) * pivot['total_opens']
        total_incremental_clicks = np.round(
            pivot['total_incremental_clicks'].sum(), 0)

        split_value = 1

        return split_value, total_incremental_clicks
Ejemplo n.º 29
0
def applying_prop_test(dataframe, reference_var, outcome_var='y', alternative='two-sided'):
    """
    Test the difference between the proportions (means of a binary variable)
    of two samples, using "proportions_ztest" from the "statsmodels" library.

    "reference_var" splits the dataset into two samples (value 0 and value
    1) and "outcome_var" is the binary variable of interest. With
    p0 = P(outcome_var = 1|reference_var = 0) and
    p1 = P(outcome_var = 1|reference_var = 1), the hypotheses are:
        H0: p0 = p1
        H1: p0 != p1 (or p0 < p1 / p0 > p1 for the one-sided alternatives)

    :param dataframe: dataframe with samples for implementing the test.
    :type dataframe: dataframe.

    :param reference_var: binary variable that splits the samples across two subsets.
    :type reference_var: string.

    :param outcome_var: binary variable whose difference in proportion should be assessed.
    :type outcome_var: string.

    :param alternative: "two-sided" (p0 != p1), "smaller" (p0 < p1) or
    "larger" (p0 > p1).
    :type alternative: string.

    :return: test statistic, p-value of the test, hypotheses being tested, relevant frequencies.
    :rtype: dictionary.
    """
    # Any value other than 'smaller' / 'larger' is reported as '!='.
    oper = {'smaller': '<', 'larger': '>'}.get(alternative, '!=')

    reference = dataframe[reference_var]
    d0 = len(reference) - reference.sum()
    d1 = reference.sum()

    outcome_is_one = dataframe[outcome_var] == 1
    d0_y1 = len(dataframe[(reference == 0) & outcome_is_one])
    d1_y1 = len(dataframe[(reference == 1) & outcome_is_one])

    stat, pval = proportions_ztest(np.array([d0_y1, d1_y1]),
                                   np.array([d0, d1]),
                                   alternative=alternative)

    hypotheses = (
        f'H0: P({outcome_var}=1|{reference_var}=0) = P({outcome_var}=1|{reference_var}=1)\n'
        f'H1: P({outcome_var}=1|{reference_var}=0) {oper} P({outcome_var}=1|{reference_var}=1)'
    )
    frequencies = {f'freq({reference_var}=0)': d0,
                   f'freq({reference_var}=1)': d1,
                   f'freq({reference_var}=0&{outcome_var}=1)': d0_y1,
                   f'freq({reference_var}=1&{outcome_var}=1)': d1_y1}

    return {'test_stat': stat, 'p_value': pval,
            'hypotheses': hypotheses,
            'frequencies': frequencies}
Ejemplo n.º 30
0
    def tek_orneklem_oran_testi(self):
        """One-sample proportion test against a null proportion of 0.04.

        The success count is read interactively; the number of observations
        comes from ``self.choice_array``. Returns a message containing the
        p-value, chosen by comparing the p-value with 0.05.
        """
        successes = int(input("Gözlenmiş başarı sayısı(exp=25) kişi yorum yaptı"))
        n_observations = stats.describe(self.choice_array).nobs  # number of observations
        null_value = 0.04  # null-hypothesis proportion to test
        p_value = float(proportions_ztest(successes, n_observations, null_value)[1])

        prefix = "(P_value=" + str(p_value)
        if p_value < 0.05:
            # H0 is rejected
            suffix = ") 0.04 oranında web sitemize dönüşüm yoktur(Verilen ilk degerle aralarında farklılık var demektir)"
        else:
            suffix = ") 0.04 oranında web sitemize dönüş vardır(Verilen ilk degerle aralarında farklılık yok demektir)"
        return prefix + suffix
Ejemplo n.º 31
0
    def iki_orneklem_oran_testi(self):
        """Two-sample proportion test on fixed example counts.

        Compares 500 of 1700 against 600 of 1800 and returns a message
        containing the p-value, chosen by comparing the p-value with 0.05.
        """
        successes = np.array([500, 600])
        observations = np.array([1700, 1800])

        p_value = float(proportions_ztest(successes, observations)[1])

        prefix = "P_value değeri= " + str(p_value)
        if p_value < 0.05:
            suffix = " olduğundan dolayı HO hipotezi(İki oran arasında anlamlı bir farklılık yoktur) red edilir "
        else:
            suffix = " olduğundan dolayı HO hipotezi(İki oran arasında anlamlı bir farklılık yoktur) kabul edilir"
        return prefix + suffix
Ejemplo n.º 32
0
def plot_power(min_diff, prob_b, size_a, size_b, significance=0.05):
    """Plot the null and alternative z-distributions and display the power.

    The z-score of the minimum detectable difference is obtained from
    ``proportions_ztest``. The null hypothesis is drawn as a standard normal
    and the alternative as a unit-variance normal centred on that z-score.
    Both tails beyond the two-sided critical value are shaded for each curve.
    The reported power is the area of the alternative distribution above the
    upper critical value, rounded to two decimals.

    Args:
        min_diff: Minimum difference in proportions to detect; group A's
            proportion is ``prob_b + min_diff``.
        prob_b: Proportion assumed for group B.
        size_a: Sample size of group A.
        size_b: Sample size of group B.
        significance: Two-sided significance level.
    """
    prob_a = prob_b + min_diff
    successes = np.array([size_a * prob_a, size_b * prob_b])
    sample_sizes = np.array([size_a, size_b])
    zscore = proportions_ztest(successes, sample_sizes, alternative='two-sided')[0]

    null_dist = stats.norm(loc=0, scale=1)
    alt_dist = stats.norm(loc=zscore, scale=1)

    grid = np.linspace(-5, 6, num=100)
    critical = null_dist.ppf(1 - significance / 2)
    # Upper and lower rejection regions of the two-sided test, in draw order.
    rejection_regions = (grid > critical, grid < -critical)

    power = np.round(1 - alt_dist.cdf(critical), 2)

    curves = ((alt_dist, '$H_1$ is true'), (null_dist, '$H_0$ is true'))
    for dist, label in curves:
        density = dist.pdf(grid)
        drawn = plt.plot(grid, density, label=label)
        shade = drawn[0].get_color()  # shade the tails in the curve's own colour
        for region in rejection_regions:
            plt.fill_between(x=grid[region],
                             y1=0.0,
                             y2=density[region],
                             alpha=0.2,
                             color=shade)

    plt.title(
        f'p1: {prob_a}, p2: {prob_b}, size1: {size_a}, size2: {size_b}, power: {power}',
        fontdict={'fontsize': 15})
    plt.ylabel('Probability')
    plt.xlabel('')
    plt.legend()
    plt.tight_layout()
    plt.show()
Ejemplo n.º 33
0
def expansion_stats():
    """Write expansion-rate statistics for human and model data as TeX snippets.

    Loads the ``expand`` column for the 'Human' and 'OptimalPlusPure' data
    sets (constant variance, non-terminal rows only), writes each mean as a
    percentage, compares the two proportions with a z-test, and finally
    reports the mean jump rate and a logistic regression of ``jump`` on
    ``gain_z``.

    NOTE(review): ``expansion`` is not defined in this function; it is read
    from an enclosing scope — confirm it is available at module level.
    """
    row_filter = 'variance == "constant" and not is_term'
    human = load_cf('Human').query(row_filter).expand
    model = load_cf('OptimalPlusPure').query(row_filter).expand

    write_tex('expansion_human', f'{100*human.mean():.1f}\\%')
    # write_tex(f'expansion_human', mean_std(100*human.groupby('wid').mean(), fmt='pct', digits=0))

    write_tex('expansion_optimal', f'{100*model.mean():.1f}\\%')

    z, p = proportions_ztest([human.sum(), model.sum()], [len(human), len(model)])
    write_tex("expansion_test", rf"$z={z:.1f},\ {pval(p)}$")

    # '\\%' (not '\%'): same output, but avoids the invalid-escape warning
    # and matches the other percentage snippets above.
    write_tex("jump", f'{expansion.jump.mean()*100:.1f}\\%')
    # write_tex("jump", mean_std(expansion.groupby('wid').jump.mean()*100, fmt='pct'))

    m = logit('jump.astype(int) ~ gain_z', data=expansion).fit()
    write_tex('expansion_logistic', rf'$\beta = {m.params.gain_z:.3f},\ {pval(m.pvalues.gain_z)}$')
    def get_pvals(self, col, target):
        """Score each category level by how its target rate differs from the baseline.

        Despite the method name, the value stored per level is the z-statistic
        (element ``[0]`` of ``proportions_ztest``), not the p-value.

        Side effects:
            * ``col`` is modified in place: levels whose count is below
              ``self.thresh`` are replaced with ``'Other'``.
            * The binned levels are recorded in ``self.to_bin[col.name]``.

        Args:
            col: Categorical pandas Series (its ``name`` is used as the group key).
            target: pandas Series of outcomes; it must be named ``'target'``
                because the grouped column is selected by that label.

        Returns:
            A ``defaultdict(int)`` mapping each level to its z-statistic;
            levels not present yield 0.
        """
        target_prop = target.mean()  # Baseline
        level_counts = col.value_counts()  # counts

        # Bin category levels whose raw count is below <thresh>
        to_bin = level_counts.index[level_counts < self.thresh]
        col[col.isin(to_bin)] = 'Other'

        # Record the bins
        self.to_bin[col.name] = to_bin

        # Per-level successes and observations. Built from explicit sum/count
        # because passing a dict of new names to SeriesGroupBy.aggregate was
        # removed in pandas 1.0 (raises SpecificationError).
        df = pd.concat([col, target], axis=1)
        grouped = df.groupby(col.name)['target']
        agg = pd.DataFrame({'count': grouped.sum(), 'nobs': grouped.count()})

        # z-statistic of each level's proportion against the overall baseline
        zscores = agg.apply(lambda x: proportions_ztest(x['count'], x['nobs'], target_prop)[0], axis=1)

        return defaultdict(int, zscores)
Ejemplo n.º 35
0
# Count accepted offers per segment and overall.
_accepted_rows = data.Accept == 1
ConferenceAccepted = len(data[(data.Segment == "Conference") & _accepted_rows])
OtherAccepted = len(data[(data.Segment == "Other") & _accepted_rows])
VacationAccepted = len(data[(data.Segment == "Vacation") & _accepted_rows])
TotalAccepted = len(data[_accepted_rows])

import statsmodels.stats.proportion as sm

# Overall acceptance rate, used as the null value for every per-segment test.
TotalAcceptedPercent = TotalAccepted / Total


def _report_segment(label, accepted, total, baseline):
    """Print a segment's acceptance rate, confidence interval and z-test p-value.

    Returns the confidence interval so the caller can keep it.
    """
    print(label)
    print("Average: " + str(accepted / total))
    interval = sm.proportion_confint(accepted, total)
    lower, upper = interval[0], interval[1]
    print("Lower: " + str(lower))
    print("Upper: " + str(upper))
    print(sm.proportions_ztest(accepted, total, baseline)[1])
    return interval


BusinessLongCI = _report_segment(
    "BusinessLong", BusinessLongAccepted, BusinessLong, TotalAcceptedPercent)

BusinessShortCI = _report_segment(
    "BusinessShort", BusinessShortAccepted, BusinessShort, TotalAcceptedPercent)

ConferenceCI = _report_segment(
    "Conference", ConferenceAccepted, Conference, TotalAcceptedPercent)
Ejemplo n.º 36
0
def z_test(driver_1, n1, driver_2, n2):
    """Print the z-statistic and p-value of a one-sided two-proportion z-test.

    Tests whether the first proportion (``driver_1 / n1``) is larger than the
    second (``driver_2 / n2``), with a null difference of 0.

    Args:
        driver_1: Success count in the first sample.
        n1: Number of observations in the first sample.
        driver_2: Success count in the second sample.
        n2: Number of observations in the second sample.
    """
    successes = np.array([driver_1, driver_2])
    trials = np.array([n1, n2])
    z, p = proportions_ztest(successes, trials, value=0, alternative='larger')
    print(f'z-stat = {z} \n p-value = {p}')
Ejemplo n.º 37
0
 def test_default_values(self):
     """Check the two-sample proportions z-test when ``value=None`` is passed explicitly."""
     # Success counts and sample sizes for the two groups.
     count = np.array([5, 12])
     nobs = np.array([83, 99])
     stat, pval = smprop.proportions_ztest(count, nobs, value=None)
     # Pinned reference values for the statistic and p-value.
     assert_almost_equal(stat, -1.4078304151258787)
     assert_almost_equal(pval, 0.15918129181156992)