def test_tukeyRangeTest_pResult(self):
    x1, x2, x3 = [1, 2, 3, 4, 5], [6, 7, 8, 9, 10], [11, 12, 13, 14, 15]
    results = tukey_range_test(x1, x2, x3)
    model = pairwise_tukeyhsd(x1 + x2 + x3, groups=[0] * 5 + [1] * 5 + [2] * 5)
    p_vals = psturng(np.abs(model.meandiffs / model.std_pairs),
                     len(model.groupsunique), model.df_total)
    for i in range(3):
        assert pytest.approx(p_vals[i]) == results[i][2]
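This q-to-p recipe recurs throughout this page: the Tukey-adjusted p-values are recovered from the studentized range distribution via psturng. A minimal standalone sketch using the same toy groups as the test above:

import numpy as np
from statsmodels.stats.libqsturng import psturng
from statsmodels.stats.multicomp import pairwise_tukeyhsd

values = list(range(1, 16))
groups = [0] * 5 + [1] * 5 + [2] * 5
res = pairwise_tukeyhsd(values, groups)
# q statistic per pair, then its p-value from the studentized range
q_stats = np.abs(res.meandiffs / res.std_pairs)
print(psturng(q_stats, len(res.groupsunique), res.df_total))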
Example #2
    def _calc_firing_rate(self,
                          num_peaks: pd.DataFrame,
                          epoch: str = "All_cells"):
        """
        Sum the peak indices to estimate the average firing rate of the cells
        in the three epochs, then compare the epochs with Tukey's HSD.
        """
        # Remove silent cells from comparison
        split_data = num_peaks.stack()
        mc = MultiComparison(split_data.values,
                             split_data.index.get_level_values(1).values)
        try:
            res = mc.tukeyhsd()
        except ValueError:
            aprint("<yellow>Failed during the p-value calculation.</yellow>")
        else:
            print(res)
            print(
                f"P-values ({epoch}, number of cells: {split_data.shape[0] // 3}):",
                psturng(
                    np.abs(res.meandiffs / res.std_pairs),
                    len(res.groupsunique),
                    res.df_total,
                ),
            )
        finally:
            # Series.mean(level=...) was removed in pandas 2.0; group explicitly
            print(split_data.groupby(level=1).mean())
Example #3
    def test_handful_to_known_values(self):
        cases = [
            (0.71499578726111435, 67, 956.70742488392386, 5.0517658443070692),
            (0.42974234855067672, 16, 723.50261736502318, 3.3303582093701354),
            (0.94936429359548424, 2, 916.1867328010926, 2.7677975546417244),
            (0.85357381770725038, 66, 65.67055060832368, 5.5647438108270109),
            (0.87372108021900929, 74, 626.42369474993632, 5.5355540570701107),
            (0.53891960564713726, 49, 862.63799438485785, 4.5108645923377146),
            (0.98818659555664567, 18, 36.269686711464274, 6.0906643750886156),
            (0.53031994896037626, 50, 265.29558652727917, 4.5179640079726795),
            (0.7318857887397332, 59, 701.41497552251201, 4.9980139875409915),
            (0.65332019368982697, 61, 591.01183664195912, 4.8706581766706893),
            (0.55403221657248558, 77, 907.34156725405194, 4.8786135917984632),
            (0.30783916857266003, 83, 82.446923487980882, 4.4396401242858294),
            (0.29321720242415661, 16, 709.64382575553009, 3.0304277540702729),
            (0.27146478168880306, 31, 590.00594683574172, 3.5870031664477215),
            (0.67348796958433776, 81, 608.02706111127657, 5.1096199974432936),
            (0.32774393945968938, 18, 17.706224399250839, 3.2119038163765432),
            (0.7081637474795982, 72, 443.10678914889695, 5.0990030889410649),
            (0.33354939276757861, 47, 544.0772192199048, 4.0613352964193279),
            (0.60412143947363051, 36, 895.83526933271548, 4.381717596850172),
            (0.88739052300665977, 77, 426.03665511558262, 5.6333929480341309)
        ]

        for p, r, v, q in cases:
            assert_almost_equal(1. - p, psturng(q, r, v), 5)
Example #4
def analyze_tukey_dict(tukey_dict):
    metric_names = list(tukey_dict.keys())
    model_names = list(tukey_dict[metric_names[0]].groupsunique)
    tukey_analysis_dict = {}

    for metric in metric_names:
        tukey_res = tukey_dict[metric]
        tukey_df = pd.DataFrame(data=tukey_res._results_table.data[1:],
                                columns=tukey_res._results_table.data[0])

        fwer = tukey_res._results_table.title[tukey_res._results_table.title.
                                              find('FWER'):]
        tukey_df['p value' + fwer] = psturng(
            np.abs(tukey_res.meandiffs / tukey_res.std_pairs),
            len(tukey_res.groupsunique), tukey_res.df_total)

        reject_df = pd.DataFrame(0, index=model_names, columns=model_names)
        comp_df = pd.DataFrame(0, index=model_names, columns=model_names)
        for _, row in tukey_df.iterrows():
            if row['reject']:
                if row['meandiff'] < 0:
                    # use .loc[row, col] rather than chained indexing, which
                    # can silently assign to a copy
                    comp_df.loc[row['group1'], row['group2']] = 1
                else:
                    comp_df.loc[row['group2'], row['group1']] = 1

            reject_df.loc[row['group1'], row['group2']] = row['reject']

        tukey_df.columns.name = metric
        reject_df.columns.name = metric
        comp_df.columns.name = metric
        tukey_analysis_dict[metric] = (tukey_df, reject_df, comp_df)

    return tukey_analysis_dict
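A hedged sketch of how the tukey_dict input could be assembled; the metric names, model labels, and scores are invented for illustration:

import numpy as np
from statsmodels.stats.multicomp import pairwise_tukeyhsd

rng = np.random.default_rng(0)
scores = {'accuracy': rng.random(30), 'f1': rng.random(30)}  # hypothetical metrics
models = ['m1'] * 10 + ['m2'] * 10 + ['m3'] * 10
tukey_dict = {metric: pairwise_tukeyhsd(vals, models)
              for metric, vals in scores.items()}
tukey_df, reject_df, comp_df = analyze_tukey_dict(tukey_dict)['accuracy']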
Example #5
def get_tukey(exp, df_all, measure):
    # Tukey posthoc analysis
    # See https://jpktd.blogspot.com/2013/03/multiple-comparison-and-tukey-hsd-or_25.html
    # And https://code.google.com/archive/p/qsturng-py/
    # And https://stackoverflow.com/questions/48200699/how-can-i-get-p-values-of-each-group-comparison-when-applying-the-tukey-s-hones
    # q, res_table, std_pairs, etc. can be found with print(dir(result)), which lists all available attributes

    if len(df_all.groupby('strain').count()) >= 3:
        df_tukey = df_all[np.isfinite(df_all[measure])]
        mc = MultiComparison(df_tukey[measure], df_tukey['strain'])
        result = mc.tukeyhsd()
        p = psturng(np.abs(result.meandiffs / result.std_pairs),
                    len(result.groupsunique), result.df_total)
        # The original hard-coded exactly three pairwise rows, which breaks
        # for more than three strains; read all rows from the results table.
        rows = result._results_table.data[1:]  # skip the header row
        df_pairs = pd.DataFrame({
            'group1': [row[0] for row in rows],
            'group2': [row[1] for row in rows],
            'p_value': np.around(np.atleast_1d(p), 4),
        })
    else:
        df_pairs = pd.DataFrame({'group1': [], 'group2': [], 'p_value': []})

    file_out = exp.name + '_coupling_' + measure + '_' + '_tukey_' + '.csv'
    pfiles.save_csv(df_pairs, file_out, exp.dir_tukey, False)
    return df_pairs
Example #6
def pairwise_tuckey(aov, categories):
    # NOTE: the `categories` argument is overwritten here and effectively
    # ignored; group labels are taken from the ANOVA result itself.
    categories = np.array(aov.group_stats.index)
    n_groups = len(categories)
    gnobs = aov.group_stats["count"].to_numpy()
    gmeans = (aov.group_stats["sum"] / aov.group_stats["count"]).to_numpy()
    gvar = aov.table.at["Residual", "mean_sq"] / gnobs
    g1, g2 = np.array(list(itertools.combinations(np.arange(n_groups), 2))).T
    mn = gmeans[g1] - gmeans[g2]
    se = np.sqrt(gvar[g1] + gvar[g2])
    tval = mn / se
    df = aov.table.at["Residual", "df"]
    pval = psturng(np.sqrt(2) * np.abs(tval), n_groups, df)
    thsd = pd.DataFrame(
        columns=[
            "A",
            "B",
            "mean(A)",
            "mean(B)",
            "diff",
            "Std.Err.",
            "t value",
            "Pr(>|t|)",
        ],
        index=range(n_groups * (n_groups - 1) // 2),
    )
    thsd["A"] = categories[g1]
    thsd["B"] = categories[g2]
    thsd["mean(A)"] = gmeans[g1]
    thsd["mean(B)"] = gmeans[g2]
    thsd["diff"] = mn
    thsd["Std.Err."] = se
    thsd["t value"] = tval
    thsd["Pr(>|t|)"] = pval
    return thsd
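The np.sqrt(2) factor in the psturng call above reflects the identity q = sqrt(2) * |t| between the studentized range statistic and the pairwise t statistic. A quick numeric sanity check with arbitrarily chosen values:

import numpy as np
from statsmodels.stats.libqsturng import psturng, qsturng

t, k, df = 2.5, 3, 20.0                 # arbitrary t statistic, groups, residual df
p = psturng(np.sqrt(2.0) * abs(t), k, df)
q = qsturng(1.0 - p, k, df)             # inverting should roughly recover sqrt(2) * |t|
print(p, q, np.sqrt(2.0) * t)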
Example #7
def get_tukey(df_groups, measure, groups):

    # Tukey posthoc analysis
    # See https://jpktd.blogspot.com/2013/03/multiple-comparison-and-tukey-hsd-or_25.html
    # And https://code.google.com/archive/p/qsturng-py/
    # And https://stackoverflow.com/questions/48200699/how-can-i-get-p-values-of-each-group-comparison-when-applying-the-tukey-s-hones
    # q, res_table, std_pairs, etc. can be found with print(dir(result)), which lists all available attributes
    df_tukey = df_groups[3][np.isfinite(df_groups[3][measure])]
    #print(df_tukey)
    mc = MultiComparison(df_tukey[measure], df_tukey['strain'])
    #result = pairwise_tukeyhsd(mc.data,mc.groups,0.05)
    result = mc.tukeyhsd()
    p = psturng(np.abs(result.meandiffs / result.std_pairs),
                len(result.groupsunique), result.df_total)
    # As in the earlier get_tukey example, read all pairwise rows from the
    # results table rather than hard-coding three comparisons.
    rows = result._results_table.data[1:]  # skip the header row
    df_pairs = pd.DataFrame({
        'group1': [row[0] for row in rows],
        'group2': [row[1] for row in rows],
        'p_value': np.around(np.atleast_1d(p), 4),
    })

    for index, row in df_pairs.iterrows():
        stars = get_stars(row['p_value'])
        df_pairs.loc[index, 'significance'] = stars

    return df_pairs
Example #9
    def test_vector(self):
        "vector input -> vector output"
        assert_array_almost_equal(
            np.array([0.10679889, 0.06550009, 0.01730145]),
            psturng([3.98832389, 4.56835318, 6.26400894], [4, 4, 4], [6, 6, 6]),
            5,
        )
Example #10
def getOWANOVAmultiComp(data, labels, verbose=False):
    # repeat each group's label once per observation in that group
    tlabels = np.concatenate([[labels[j]] * len(x)
                              for j, x in enumerate(data)])
    res = pairwise_tukeyhsd(np.concatenate(data), tlabels)
    if verbose:
        print(res.summary())
    return psturng(np.abs(res.meandiffs / res.std_pairs),
                   len(res.groupsunique), res.df_total)
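A minimal usage sketch for getOWANOVAmultiComp; the data and labels are invented:

data = [[1, 2, 3, 4, 5], [6, 7, 8, 9, 10], [11, 12, 13, 14, 15]]
pvals = getOWANOVAmultiComp(data, labels=['a', 'b', 'c'], verbose=True)
print(pvals)   # one Tukey-adjusted p-value per pair of groups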
Example #11
def posthoc_turron_by_first_time_tasting(melted, variable):
    df = melted.copy()
    df = df[df['variable'] == variable].dropna()
    df['turron:first_time_tasting'] = df['turron'] + "_" + df['first_time_tasting']
    mc = MultiComparison(df['value'], df['turron:first_time_tasting'])
    res = mc.tukeyhsd()
    from statsmodels.stats.libqsturng import psturng
    p_values = psturng(np.abs(res.meandiffs / res.std_pairs), len(res.groupsunique), res.df_total)
    print(p_values)
    print(res)
Example #12
    def _games_howell_test(self):
        combs = list(combinations(np.unique(self.group), 2))
        sample_stats = self._group_sample_statistics()

        means_d = dict(sample_stats['Group Means'])
        obs_d = dict(sample_stats['Group Observations'])
        var_d = dict(sample_stats['Group Variance'])

        group_comps = []
        mean_differences = []
        degrees_freedom = []
        t_values = []
        p_values = []
        std_err = []
        up_conf = []
        low_conf = []

        for comb in combs:

            diff = means_d[comb[1]] - means_d[comb[0]]

            t_val = np.absolute(diff) / np.sqrt((var_d[comb[0]] / obs_d[comb[0]]) + (var_d[comb[1]] / obs_d[comb[1]]))

            df_num = (var_d[comb[0]] / obs_d[comb[0]] + var_d[comb[1]] / obs_d[comb[1]]) ** 2
            df_denom = ((var_d[comb[0]] / obs_d[comb[0]]) ** 2 / (obs_d[comb[0]] - 1) +
                        (var_d[comb[1]] / obs_d[comb[1]]) ** 2 / (obs_d[comb[1]] - 1))

            df = df_num / df_denom

            p_val = psturng(t_val * np.sqrt(2), sample_stats['Number of Groups'], df)

            se = np.sqrt(0.5 * (var_d[comb[0]] / obs_d[comb[0]] + var_d[comb[1]] / obs_d[comb[1]]))

            # The Games-Howell interval is diff +/- q * se; the original code
            # omitted the standard-error factor.
            q_crit = qsturng(1 - self.alpha, sample_stats['Number of Groups'], df)
            upper_conf = diff + q_crit * se
            lower_conf = diff - q_crit * se

            mean_differences.append(diff)
            degrees_freedom.append(df)
            t_values.append(t_val)
            p_values.append(p_val)
            std_err.append(se)
            up_conf.append(upper_conf)
            low_conf.append(lower_conf)
            group_comps.append(str(comb[0]) + ' : ' + str(comb[1]))

        result_df = pd.DataFrame({'groups': group_comps,
                                  'mean_difference': mean_differences,
                                  'std_error': std_err,
                                  't_value': t_values,
                                  'p_value': p_values,
                                  'upper_limit': up_conf,
                                  'lower_limit': low_conf})

        return result_df
Example #13
    def test_1000_random_values(self):
        n = 1000
        ps = np.random.random(n) * (.999 - .1) + .1
        rs = np.random.randint(2, 101, n)  # random_integers was removed from numpy
        vs = np.random.random(n) * 998. + 2.
        qs = qsturng(ps, rs, vs)
        estimates = psturng(qs, rs, vs)
        actuals = 1. - ps
        errors = estimates - actuals

        assert_equal(np.array([]), np.where(errors > 1e-5)[0])
Example #14
def tukeyTest(data, groups, alpha=0.05):
    '''Perform pairwise Tukey test for data by groups
    '''
    # pairwise comparisons using Tukey's test, calculating p-values
    res = pairwise_tukeyhsd(data, groups, alpha)
    print('Summary of test:\n', res)
    # print(dir(res))  # lists all attributes of the results object
    pVal = psturng(np.abs(res.meandiffs / res.std_pairs), len(res.groupsunique), res.df_total)
    print('p values of all pair-wise tests:\n', pVal)

    return res
Example #16
    def test_100_random_values(self):
        n = 100
        ps = np.random.random(n)*(.999 - .1) + .1
        rs = np.random.randint(2, 101, n)  # random_integers was removed from numpy
        vs = np.random.random(n)*998. + 2.
        qs = qsturng(ps, rs, vs)
        estimates = psturng(qs, rs, vs)
        actuals = 1. - ps
        errors = estimates - actuals

        assert_equal(np.array([]), np.where(errors > 1e-5)[0])
Example #17
def test_qstrung():
    rows = [   5,    6,    7,    8,    9,   10,   11,   12,   13,   14,   15,
              16,   17,   18,   19,   20,   24,   30,   40,   60,  120, 9999]
    cols = np.arange(2,11)

    for alpha in [0.01, 0.05]:
        for k in cols:
            c1 = get_tukeyQcrit(k, rows, alpha=alpha)
            c2 = qsturng(1-alpha, k, rows)
            assert_almost_equal(c1, c2, decimal=2)
            #roundtrip
            assert_almost_equal(psturng(qsturng(1-alpha, k, rows), k, rows), alpha, 5)
Example #18
    def test_100_random_values(self, reset_randomstate):
        n = 100
        random_state = np.random.RandomState(12345)
        ps = random_state.random_sample(n) * (.999 - .1) + .1
        rs = random_state.randint(2, 101, n)
        vs = random_state.random_sample(n) * 998. + 2.
        qs = qsturng(ps, rs, vs)
        estimates = psturng(qs, rs, vs)
        actuals = 1. - ps
        errors = estimates - actuals

        assert_equal(np.array([]), np.where(errors > 1e-5)[0])
Example #19
def tukey_hsd(args, ddof=1):
    """
    Usage
    ------
    tukey_hsd(args, ddof=1)

    Parameters
    --------
    args : pandas DataFrame of datasets
            pd.DataFrame({'A':[x11,x12,x13,,,,x1i],
                   'B':[x21,x22,x23,,,,x2j],
                   'K':[..................],
                   'N':[xn1,xn2,xn3,,,,xnk]})
    ddof : delta degrees of freedom, default is 1.

    Returns
    -------
    dict of 'summary', 't', 'p', 'var_e'
    summary : mean and variance within each group, and the size of each group.
    p       : DataFrame of p-values between each pair of groups.
    t       : DataFrame of t-values between each pair of groups.
    var_e   : df (degrees of freedom of the error variance) and the error
              variance between groups.

    Comments
    -------
    Multiple comparison with the Tukey-Kramer range test.
    This program is a python port of an R script called tukey, originally
    coded by Dr Shigeyuki Aoki.
    The psturng and qsturng functions are imported from statsmodels.

    This program was written by Kyoichiro Higashi ([email protected]).
    """
    data = args
    alldata = args.unstack().values
    groups = len(args.columns)       # number of groups
    n_wig = args.notna().sum(0)      # cases within groups
    phi_e = (n_wig - ddof).sum()     # sum of degrees of freedom
    mean_wig = data.mean()           # mean of each group
    var_wig = data.var()             # variance of each group
    idx_group = args.columns
    summary = pd.DataFrame({'size': n_wig, 'mean': mean_wig, 'var': var_wig},
                           index=idx_group)  # statistics of each group

    var_e = sum((n_wig - ddof) * var_wig) / phi_e  # error variance (variance within groups)
    # t statistics of all paired comparisons (.iloc: positional access on a
    # label-indexed Series is deprecated in modern pandas)
    t = np.array([[abs(mean_wig.iloc[i] - mean_wig.iloc[j]) /
                   np.sqrt(var_e * (1. / n_wig.iloc[i] + 1. / n_wig.iloc[j]))
                   for i in range(groups)] for j in range(groups)])
    prob = psturng(t * np.sqrt(2.0), groups, phi_e)

    t = pd.DataFrame(data = t,index = idx_group, columns = idx_group)
    prob = pd.DataFrame(data = prob,index = idx_group, columns = idx_group)
    variance = pd.DataFrame(data = [phi_e, var_e], index = ['df','var'], columns = ['error'])

    return {'summary':summary, 't':t, 'p':prob, 'var_e':variance}
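A hedged usage sketch matching the docstring's expected input; the numbers are invented:

import pandas as pd

df = pd.DataFrame({'A': [1, 2, 3, 4, 5],
                   'B': [6, 7, 8, 9, 10],
                   'C': [11, 12, 13, 14, 15]})
out = tukey_hsd(df)
print(out['summary'])
print(out['p'])    # pairwise Tukey-Kramer p-values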
Example #21
def test_qstrung(alpha, k):
    rows = [
        5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 24, 30, 40,
        60, 120, 9999
    ]

    c1 = get_tukeyQcrit(k, rows, alpha=alpha)
    c2 = qsturng(1 - alpha, k, rows)
    assert_almost_equal(c1, c2, decimal=2)
    # roundtrip
    assert_almost_equal(psturng(qsturng(1 - alpha, k, rows), k, rows), alpha,
                        5)
Example #22
    def _group_comparison(self):
        r"""
        Constructs a pandas DataFrame containing the test results and group comparisons as found by Tukey's HSD test.

        Returns
        -------
        groups : array-like
            pandas DataFrame of group comparison results.

        """
        group_means = npi.group_by(self.design_matrix[:, 0],
                                   self.design_matrix[:, 1], np.mean)

        group_means = [i for _, i in group_means]

        mean_pairs = np.array(list(combinations(group_means, 2)))
        group_mean_differences = mean_pairs[:, 0] - mean_pairs[:, 1]

        group_sd = npi.group_by(self.design_matrix[:, 0],
                                self.design_matrix[:, 1], std_dev)
        group_sd = [i for _, i in group_sd]

        group_names = np.unique(self.design_matrix[:, 0])

        groups = pd.DataFrame(np.array(list(combinations(group_names, 2))))

        groups['groups'] = groups[0] + ' - ' + groups[1]
        groups['group means'] = group_means
        groups['mean difference'] = group_mean_differences

        groups['std_dev'] = group_sd

        groups['significant difference'] = (
            np.abs(groups['mean difference']) >= self.hsd)

        groups['upper interval'] = groups[
            'mean difference'] + self.tukey_q_value * np.sqrt(
                self.mse / 2. * (2. / (self.n / self.k)))

        groups['lower interval'] = groups[
            'mean difference'] - self.tukey_q_value * np.sqrt(
                self.mse / 2. * (2. / (self.n / self.k)))

        q_values = groups['mean difference'] / group_sd

        # psturng's second argument is the number of groups (cf. the other
        # examples, which pass len(groupsunique)); the original passed the
        # group size n / k.
        groups['p_adjusted'] = psturng(np.absolute(q_values), self.k,
                                       self.dof)

        del groups[0]
        del groups[1]

        return groups
Example #23
def get_pairwise_comparison_data(df, independent_variables_names, dependent_variables_names, significance_cutoff=0.05, NUM_GROUPS_CUTOFF=15):
    '''
        datasetId
        independentVariables - list names, must be categorical
        dependentVariables - list names, must be numerical
        numBins - number of bins for the independent quantitative variables (if they exist)
    '''
    considered_independent_variable_name = independent_variables_names[0]
    considered_dependent_variable_name = dependent_variables_names[0]

    # Only return pairwise comparison data if number of groups < THRESHOLD
    num_groups = len(get_unique(df[considered_independent_variable_name]))
    if num_groups > NUM_GROUPS_CUTOFF:
        return None

    hsd_result = pairwise_tukeyhsd(df[considered_dependent_variable_name], df[considered_independent_variable_name], alpha=significance_cutoff)
    hsd_raw_data = hsd_result.summary().data[1:]
    st_range = np.abs(hsd_result.meandiffs) / hsd_result.std_pairs
    p_values = psturng(st_range, len(hsd_result.groupsunique), hsd_result.df_total)

    hsd_headers = [
        'Group 1',
        'Group 2',
        'Group Mean Difference (2 - 1)',
        'Lower Bound',
        'Upper Bound',
        'p-value',
        'Distinct (p < %s)' % significance_cutoff
    ]
    hsd_data = []
    for i in range(0, len(hsd_raw_data)):
        if isinstance(p_values, float):
            p_value = p_values
        else:
            p_value = p_values[i] if i < len(p_values) else None
        hsd_data_row = [
            hsd_raw_data[i][0],
            hsd_raw_data[i][1],
            hsd_result.meandiffs[i],
            hsd_result.confint[i][0],
            hsd_result.confint[i][1],
            p_value,
            ('True' if (p_value is not None and p_value <= significance_cutoff) else 'False')
        ]
        hsd_data.append(hsd_data_row)

    return {
        'column_headers': hsd_headers,
        'rows': hsd_data
    }
Example #25
class Generator:
    enabled = False
    display_name = "statsmodel"

    @staticmethod
    def init_parser(parser):
        parser.add_argument('--statsmodel', action="store_true")

    @classmethod
    def init_args(cls, arg):
        cls.enabled = arg.statsmodel

    @staticmethod
    def process(case, dop):
        q, k, nu = case.q, case.k, case.v
        atol = 10**(-dop)
        return 1 - psturng(q, k, nu)
Example #26
def tukey_range_test(*args):
    """Found in statsmodels as pairwise_tukeyhsd
    This test compares all possible pairs of means and determines if there are any differences in these pairs.

    Parameters
    ----------
    args: list or numpy arrays, 1-D
        The observed measurements for each group, organized into lists or numpy arrays

    Return
    ------
    results: list
        A list of lists containing 3 attributes:
            1) The groups being compared
            2) The Q Statistic
            3) p, the probability of a difference at least this large arising by chance when the group means are equal
    """
    k = len(args)
    if k < 2:
        raise AttributeError("Need at least two groups to perform Tukey Range Test")
    results = []
    mean_i = np.mean(args, axis=1)
    groups = np.arange(len(args))
    n_i = [len(arg) for arg in args]
    sum_data = np.sum(args, axis=1)
    square_data = np.power(args, 2)
    df = sum(n_i) - k
    sse = _sse(sum_data, square_data, n_i)
    for group in groups:
        group = int(group)
        for next_group in range(group + 1, len(groups)):
            mean_a, mean_b = mean_i[group], mean_i[next_group]
            n_a, n_b = n_i[group], n_i[next_group]
            difference = abs(mean_a - mean_b)
            std_group = sqrt(sse / df / min(n_a, n_b))
            q = difference / std_group
            p = psturng(q, k, df)
            results.append(['group {} - group {}'.format(group, next_group), q, p])
    return results
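A brief usage sketch, reusing the toy groups from the first example on this page:

x1, x2, x3 = [1, 2, 3, 4, 5], [6, 7, 8, 9, 10], [11, 12, 13, 14, 15]
for pair, q, p in tukey_range_test(x1, x2, x3):
    print(pair, q, p)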
Example #27
def Tukey_p_value(result_from_tukey):
    P_value = psturng(
        np.abs(result_from_tukey.meandiffs / result_from_tukey.std_pairs),
        len(result_from_tukey.groupsunique), result_from_tukey.df_total)
    print(P_value)
    return P_value
Example #28
def calculate_tukey_posthoc(df,
                            column,
                            type_column='type',
                            verbose=True,
                            write=False,
                            name=None,
                            output_dir=None):
    """Computes p-values using ANOVA with post-hoc Tukey HSD for a given DataFrame.

    Estimates p-values for a given DataFrame assuming that the sample
    type is named as `type_column`.

    Parameters
    ----------
    df : pandas DataFrame
        Contains the table of values and corresponding types or classes.

    column : str
        Indicates the column of values.

    type_column : str
        Indicates the column of sample kind.

    verbose : boolean
        Specifies if the output should be printed into a terminal.

    write : boolean
        Specifies if the output should be written into a text file.

    name : str
        Indicates the name of the output file.

    output_dir : str
        Indicates the output dir where the file will be written.

    Returns
    -------
    (dict, tuple) : sample types and p-values
        A dict of sample-type pairs and corresponding p-values, plus the
        pair with the smallest p-value.
    """
    mc = MultiComparison(df[column], df[type_column])
    tt = mc.tukeyhsd()
    st_range = np.abs(tt.meandiffs) / tt.std_pairs

    fout = None
    if write and output_dir:
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        fout = open(os.path.join(output_dir, name + '.txt'), 'w')
        print(os.path.join(output_dir, name + '.txt'))

    if write:
        print('Tukey post-hoc ({0})'.format(column), end="", file=fout)
        print(tt, end="", file=fout)
        print(mc.groupsunique, end="", file=fout)

    if verbose:
        print('Tukey post-hoc ({0})'.format(column))  # mixing {0} with % was a TypeError
        print(tt)
        print(mc.groupsunique)

    pvals = psturng(st_range, len(tt.groupsunique), tt.df_total)

    out = {}
    groups = mc.groupsunique
    g1idxs, g2idxs = mc.pairindices

    for g1i, g2i, p in zip(g1idxs, g2idxs, pvals):
        gname = '{}-{}'.format(groups[g1i], groups[g2i])
        out[gname] = p

    min_item = min(out.items(), key=operator.itemgetter(1))  # dict.iteritems() is Python 2 only

    for grp, p in out.items():
        if fout and write:
            print('{}: {}'.format(grp, p), file=fout)  # Python 3 print function
        if verbose:
            print(grp, ': ', p)

    if fout:
        fout.close()

    return out, min_item
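A minimal usage sketch; the column values and type labels below are invented:

import pandas as pd

df = pd.DataFrame({'value': [1, 2, 3, 6, 7, 8, 11, 12, 13],
                   'type': ['a'] * 3 + ['b'] * 3 + ['c'] * 3})
out, min_item = calculate_tukey_posthoc(df, column='value')
print(min_item)    # the pair with the smallest adjusted p-value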
Example #29
def kw_nemenyi(groups, to_compare=None, alpha=0.05, method='tukey'):
    """

    Kruskal-Wallis 1-way ANOVA with Nemenyi's multiple comparison test

    Arguments:
    ---------------
    groups: sequence
        arrays corresponding to k mutually independent samples from
        continuous populations

    to_compare: sequence
        tuples specifying the indices of pairs of groups to compare, e.g.
        [(0, 1), (0, 2)] would compare group 0 with 1 & 2. by default, all
        possible pairwise comparisons between groups are performed.

    alpha: float
        family-wise error rate used for correcting for multiple comparisons
        (see statsmodels.stats.multitest.multipletests for details)

    method: string
        the null distribution of the test statistic used to determine the
        corrected p-values for each pair of groups, can be either "tukey"
        (studentized range) or "chisq" (Chi-squared). the "chisq" method will
        correct for tied ranks.

    Returns:
    ---------------
    H: float
        Kruskal-Wallis H-statistic

    p_omnibus: float
        p-value corresponding to the global null hypothesis that the medians of
        the groups are all equal

    p_corrected: float array
        corrected p-values for each pairwise comparison, corresponding to the
        null hypothesis that the pair of groups has equal medians. note that
        these are only meaningful if the global null hypothesis is rejected.

    reject: bool array
        True for pairs where the null hypothesis can be rejected for the given
        alpha

    Reference:
    ---------------

    """

    # omnibus test (K-W ANOVA)
    # -------------------------------------------------------------------------

    if method is None:
        method = 'chisq'
    elif method not in ('tukey', 'chisq'):
        raise ValueError('method must be either "tukey" or "chisq"')

    groups = [np.array(gg) for gg in groups]

    k = len(groups)

    n = np.array([len(gg) for gg in groups])
    if np.any(n < 5):
        warnings.warn("Sample sizes < 5 are not recommended (K-W test assumes "
                      "a chi square distribution)")

    allgroups = np.concatenate(groups)
    N = len(allgroups)
    ranked = stats.rankdata(allgroups)

    # correction factor for ties
    T = stats.tiecorrect(ranked)
    if T == 0:
        raise ValueError('All numbers are identical in kruskal')

    # sum of ranks for each group
    j = np.insert(np.cumsum(n), 0, 0)
    R = np.empty(k, dtype=float)  # np.float was removed from numpy
    for ii in range(k):
        R[ii] = ranked[j[ii]:j[ii + 1]].sum()

    # the Kruskal-Wallis H-statistic
    H = (12. / (N * (N + 1.))) * ((R**2.) / n).sum() - 3 * (N + 1)

    # apply correction factor for ties
    H /= T

    df_omnibus = k - 1
    p_omnibus = stats.chi2.sf(H, df_omnibus)  # stats.chisqprob was removed from scipy

    # multiple comparisons
    # -------------------------------------------------------------------------

    # by default we compare every possible pair of groups
    if to_compare is None:
        to_compare = tuple(combinations(range(k), 2))

    ncomp = len(to_compare)

    dif = np.empty(ncomp, dtype=float)
    B = np.empty(ncomp, dtype=float)

    Rmean = R / n
    A = N * (N + 1) / 12.

    for pp, (ii, jj) in enumerate(to_compare):

        # absolute difference of mean ranks
        dif[pp] = np.abs(Rmean[ii] - Rmean[jj])
        B[pp] = (1. / n[ii]) + (1. / n[jj])

    if method == 'tukey':

        # p-values obtained from the upper quantiles of the studentized range
        # distribution
        qval = dif / np.sqrt(A * B)
        p_corrected = psturng(qval * np.sqrt(2), k, 1E6)

    elif method == 'chisq':

        # p-values obtained from the upper quantiles of the chi-squared
        # distribution
        chi2 = (dif**2.) / (A * B)
        p_corrected = stats.chi2.sf(chi2 * T, k - 1)

    reject = p_corrected <= alpha

    return H, p_omnibus, p_corrected, reject
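A brief usage sketch with three invented samples:

a = [1.1, 2.0, 3.2, 2.5, 1.8]
b = [4.5, 5.1, 6.0, 5.5, 4.9]
c = [8.0, 9.2, 7.5, 8.8, 9.0]
H, p_omnibus, p_corrected, reject = kw_nemenyi([a, b, c])
print(H, p_omnibus)
print(p_corrected, reject)    # one adjusted p-value per pair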
Example #30
def get_group_measures(preds_all, low_ms=None, norm_ms=None, U1=None, U2=None, U3=None, U4=None):
    predictions = defaultdict(list)
    maes = defaultdict(list)
    predictions["All"] = preds_all
    for user_id, item_id, r, r_, details in preds_all:
        mae = np.abs(r - r_)
        maes["All"].append(mae)

    if low_ms is not None and norm_ms is not None:
        for user_id, item_id, r, r_, details in preds_all:
            mae = np.abs(r - r_)
            if user_id in low_ms:
                maes["LowMs"].append(mae)
                predictions["LowMs"].append((user_id, item_id, r, r_, details))
            elif user_id in norm_ms:
                maes["NormMs"].append(mae)
                predictions["NormMs"].append((user_id, item_id, r, r_, details))


        # H0: MAE(NormMs) >= MAE(LowMs)
        t_statistic, p_val = ttest_ind(maes["NormMs"], maes["LowMs"])
        print(p_val)
        mae_ms_p = p_val / 2.0
        print("[t-TEST] t-statistic: %f, p-value: %f" % (t_statistic, mae_ms_p))

    if U1 is not None and U2 is not None and U3 is not None and U4 is not None:
        for user_id, item_id, r, r_, details in preds_all:
            mae = np.abs(r - r_)
            if user_id in U1:
                predictions["U1"].append((user_id, item_id, r, r_, details))
                maes["U1"].append(mae)
            elif user_id in U2:
                predictions["U2"].append((user_id, item_id, r, r_, details))
                maes["U2"].append(mae)
            elif user_id in U3:
                predictions["U3"].append((user_id, item_id, r, r_, details))
                maes["U3"].append(mae)
            elif user_id in U4:
                predictions["U4"].append((user_id, item_id, r, r_, details))
                maes["U4"].append(mae)

        # H0: all means are the same
        f_val, p_val = f_oneway(maes["U1"], maes["U2"], maes["U3"], maes["U4"])
        print("[ANOVA] p-value: %f" % p_val)


        df = pd.DataFrame(data={"MAE": maes["U1"], "Usergroup": "U1"})
        df = df.append(pd.DataFrame(data={"MAE": maes["U2"], "Usergroup": "U2"}))
        df = df.append(pd.DataFrame(data={"MAE": maes["U3"], "Usergroup": "U3"}))
        df = df.append(pd.DataFrame(data={"MAE": maes["U4"], "Usergroup": "U4"}))
        df = df.append(pd.DataFrame(data={"MAE": maes["All"], "Usergroup": "All"}))

        tukeyhsd_results = pairwise_tukeyhsd(df["MAE"], df["Usergroup"])
        print(tukeyhsd_results.summary())

        # from https://stackoverflow.com/questions/48200699/how-can-i-get-p-values-of-each-group-comparison-when-applying-the-tukey-s-hones
        p_values = psturng(np.abs(tukeyhsd_results.meandiffs / tukeyhsd_results.std_pairs), len(tukeyhsd_results.groupsunique), tukeyhsd_results.df_total)
        print("[TukeyHSD] p-values : " + str(p_values))

    results = ResultDict()
    for group in predictions.keys():
        results.add_result(usergroup=group, metric="MAE", value=accuracy.mae(predictions[group], verbose=False))
        """results.add_result(usergroup=group, metric="RMSE", value=accuracy.rmse(predictions[group], verbose=False))
        results.add_result(usergroup=group, metric="FCP", value=accuracy.fcp(predictions[group], verbose=False))"""

        """for k in [5, 10, 20]:
            _, _, f1, mrr, map, ndcg = top_k_measures(predictions[group], k=k)
            #results.add_result(usergroup=group, metric="P@" + str(k), value=p)
            #results.add_result(usergroup=group, metric="R@" + str(k), value=r)
            results.add_result(usergroup=group, metric="F1@" + str(k), value=f1)
            results.add_result(usergroup=group, metric="MRR@" + str(k), value=mrr)
            results.add_result(usergroup=group, metric="MAP@" + str(k), value=map)
            results.add_result(usergroup=group, metric="nDCG@" + str(k), value=ndcg)
        for k in range(1, 21):
            p, r, _, _, _, _ = top_k_measures(predictions[group], k=k)
            results.add_result(usergroup=group, metric="P@" + str(k), value=p)
            results.add_result(usergroup=group, metric="R@" + str(k), value=r)"""

    return results
Example #31
                  jitter=True,
                  data=final_df[(final_df.Event == 'Cells') |
                                (final_df.Event == treatments[ind])],
                  ax=panel)

plt.savefig('respiration.svg')
plt.show()

#statistical analysis

final_df['Slope'] = final_df.Slope.astype(float)  # np.float was removed from numpy

#fit linear model
model = ols('Slope ~ Event + Experiment', data=final_df).fit()
print(model.summary())

print('ANOVA analysis')

aov_table = sm.stats.anova_lm(model, typ=2)  # the keyword is `typ`; `type` was silently ignored
print(aov_table)

print('Post hoc tukey')

mc = MultiComparison(final_df['Slope'], final_df['Event'])
mc_results = mc.tukeyhsd()
print(mc_results.summary())

p_values = psturng(np.abs(mc_results.meandiffs / mc_results.std_pairs),
                   len(mc_results.groupsunique), mc_results.df_total)

print('p_values: ', p_values)
Example #32
    def test_v_equal_one(self):
        assert_almost_equal(.1, psturng(.2, 5, 1), 5)
Example #33
def test_osrt(data: Union[List, np.ndarray, DataFrame],
              val_col: str = None,
              group_col: str = None,
              sort: bool = False) -> Tuple[float, float, int]:
    '''Hayter's one-sided studentised range test (OSRT)

    Tests a hypothesis against an ordered alternative for normal data with
    equal variances [1]_.

    Parameters
    ----------
    data : Union[List, numpy.ndarray, DataFrame]
        An array, any object exposing the array interface or a pandas
        DataFrame with data values.

    val_col : str = None
        Name of a DataFrame column that contains dependent variable values
        (test or response variable). Values should have a non-nominal scale.
        Must be specified if ``data`` is a pandas DataFrame object.

    group_col : str = None
        Name of a DataFrame column that contains independent variable values
        (grouping or predictor variable). Values should have a nominal scale
        (categorical). Must be specified if ``data`` is a pandas DataFrame object.

    sort : bool = False
        If True, sort data by block and group columns.

    Returns
    -------
    Tuple[float, float, int]
        P value, statistic, and number of degrees of freedom.

    Notes
    -----
    P values are computed from the Tukey distribution.

    References
    ----------
    .. [1] Hayter, A.J.(1990) A One-Sided Studentised Range Test for Testing
        Against a Simple Ordered Alternative, Journal of the American
        Statistical Association, 85, 778-785.

    Examples
    --------
    >>> import scikit_posthocs as sp
    >>> import pandas as pd
    >>> x = pd.DataFrame({"a": [1,2,3,5,1], "b": [12,31,54,62,12], "c": [10,12,6,74,11]})
    >>> x = x.melt(var_name='groups', value_name='values')
    >>> sp.test_osrt(x, val_col='values', group_col='groups')
    '''
    x, _val_col, _group_col = __convert_to_df(data, val_col, group_col)

    if not sort:
        x[_group_col] = Categorical(x[_group_col],
                                    categories=x[_group_col].unique(),
                                    ordered=True)

    x.sort_values(by=[_group_col], ascending=True, inplace=True)
    groups = np.unique(x[_group_col])
    x_grouped = x.groupby(_group_col)[_val_col]

    xi = x_grouped.mean()
    ni = x_grouped.count()
    k = groups.size
    n = len(x.index)
    df = n - k

    sigma2 = 0
    c = -1

    for i in range(k):
        for j in range(ni.iloc[i]):
            c += 1
            sigma2 += (x[_val_col].iat[c] - xi.iloc[i])**2. / df  # positional access needs .iloc in modern pandas

    sigma = np.sqrt(sigma2)

    def compare(i, j):
        dif = xi.loc[groups[j]] - xi.loc[groups[i]]
        A = sigma / np.sqrt(2.) * np.sqrt(1. / ni[groups[j]] +
                                          1. / ni[groups[i]])
        qval = np.abs(dif) / A
        return qval

    vs = np.zeros((k, k), dtype=float)
    combs = it.combinations(range(k), 2)

    for i, j in combs:
        vs[i, j] = compare(i, j)

    stat = np.max(vs)
    pval = psturng(stat, k, df)
    return pval, stat, df
Example #34
    def test_scalar(self):
        "scalar input -> scalar output"
        assert_almost_equal(.1, psturng(4.43645545899562, 5, 6), 5)
Example #35
    'P-PG', 'P-TD', 'P-RMSPROP', 'NEAT', 'NEAT-EM-P-PG', 'NEAT-EM-P-TD',
    'NEAT-EM-P-RMSPROP'
]
group_names = []
for header in headers:
    group_names += list(itertools.repeat(header, min_length))

mc = MultiComparison(np.asarray(stripped_groups).flatten(), group_names)
result = mc.tukeyhsd()

from statsmodels.stats.libqsturng import psturng

print(result)
print(mc.groupsunique)
print(
    psturng(np.abs(result.meandiffs / result.std_pairs),
            len(result.groupsunique), result.df_total))


def mean_confidence_interval(data, confidence=0.95):
    a = 1.0 * np.array(data)
    n = len(a)
    m, se = np.mean(a), stats.sem(a)
    h = se * stats.t.ppf((1 + confidence) / 2., n - 1)  # use the public ppf, not the private _ppf
    return h


count = 0
for stripped_group in stripped_groups:
    print(headers[count])
    print(np.mean(stripped_group), mean_confidence_interval(stripped_group))
    count += 1