Beispiel #1
0
def compute_anova_rev_restrict_t(topdir: str, m: int):
    """Print a two-way ANOVA and Tukey HSD tests of REV by tolerance and neighbor count.

    For every (neighbor, tolerance) combination the stored runs are loaded
    from ``<topdir>/nn_<tol>_<n>`` through ``ac.compute_stored_runs``, tagged
    with numeric ``TOL`` and ``NNN`` columns and concatenated into one table.
    Rows with ``TOL == 1.0`` are excluded before any statistics are computed.

    Parameters
    ----------
    topdir : str
        Directory that contains one sub-directory per case.
    m : int
        Passed through unchanged to ``ac.compute_stored_runs``.

    Returns
    -------
    None
        All results (OLS summary, ANOVA table, both Tukey tables) are printed.
    """
    neighbors = ["5", "10", "15", "20"]
    tolerances = ['0.0', '0.2', '0.4', '0.6', '0.8', '1.0']
    case_tables = []

    for n in neighbors:
        for tol in tolerances:
            casedir = topdir + '/nn_' + tol + '_' + n
            casetable = ac.compute_stored_runs(casedir, m, None)
            # A scalar is broadcast to every row, so the case table is no
            # longer required to contain exactly five rows.
            casetable['TOL'] = float(tol)
            casetable['NNN'] = float(n)
            case_tables.append(casetable)

    all_cases = pd.concat(case_tables).reset_index(drop=True)
    df = all_cases[all_cases['TOL'] != 1.0]

    # Two-way model with interaction, both factors treated as categorical.
    results = ols('REV ~ C(TOL) + C(NNN) + C(TOL):C(NNN)', data=df).fit()
    print(results.summary())
    print('\n\n\n')
    aov_table = sm.stats.anova_lm(results, typ=2)
    print(aov_table)
    print('\n\n\n')

    # Pairwise comparisons for each factor separately.
    for factor in ('TOL', 'NNN'):
        print(MultiComparison(df['REV'], df[factor]).tukeyhsd())
Beispiel #2
0
    def test_incorrect_output(self):
        """Check MultiComparison input validation and ``group_order`` subsetting."""
        # Group label sequences expected to be rejected for ten observations:
        # too few labels, too many labels, and a single group.
        for bad_groups in ([1, 2] * 4, [1, 2] * 6, [1] * 10):
            assert_raises(ValueError, MultiComparison, np.array([1] * 10),
                          bad_groups)

        # group_order keeps only one of the two groups
        assert_raises(ValueError, MultiComparison, np.array([1] * 10),
                      [1, 2] * 5, group_order=[1])

        # Restricting to groups 1 and 2 through group_order must match
        # building the comparison from only the first ten observations.
        observations = np.arange(15)
        labels = np.repeat([1, 2, 3], 5)
        mod1 = MultiComparison(np.array(observations), labels,
                               group_order=[1, 2])
        res1 = mod1.tukeyhsd(alpha=0.01)
        mod2 = MultiComparison(np.array(observations[:10]), labels[:10])
        res2 = mod2.tukeyhsd(alpha=0.01)

        result_attributes = ['confint', 'data', 'df_total', 'groups',
                             'groupsunique', 'meandiffs', 'q_crit', 'reject',
                             'reject2', 'std_pairs', 'variance']
        model_attributes = ['data', 'datali', 'groupintlab', 'groups',
                            'groupsunique', 'ngroups', 'nobs', 'pairindices']

        checks = ((res1, res2, result_attributes),
                  (mod1, mod2, model_attributes))
        for actual, desired, names in checks:
            for att in names:
                assert_allclose(getattr(actual, att), getattr(desired, att),
                                rtol=1e-14, err_msg=att + 'failed')
Beispiel #3
0
    def ANOVA_TimePoints(self, combine_sexes=True):
        """
        Fit a one-way OLS model and run a Tukey HSD test for every time point.

        The model is hard-coded to ``Count ~ C(Treatment)``: ``Count`` is the
        measured parameter and ``Treatment`` the grouping identifier, because
        the formula is given as a string. Rows are taken from
        ``self.df_dropna`` and time points from ``self.pr_columns``. The OLS
        summary and the Tukey table are printed for every fit.

        Parameters
        ----------
        combine_sexes : bool
            When truthy, all treatments are analysed together. Otherwise the
            analysis is run first on ``self.male_treatment_labels`` and then
            on ``self.female_treatment_labels``.

        Returns
        -------
        tuple of (list, list)
            Fitted OLS results and the Tukey HSD results, in the order the
            fits were performed. Both branches return this tuple.
        """
        summary_to_return = []
        mc_results_to_return = []
        treatments = self.df_dropna['Treatment']

        def analyse_timepoints(treatment_filter=None):
            # Runs the per-time-point analysis on rows passing the optional
            # boolean treatment filter and appends to the result lists.
            for timepoint in self.pr_columns:
                print("Time point: " + timepoint)
                selected = self.df_dropna['Time Point'] == timepoint
                if treatment_filter is not None:
                    selected = selected & treatment_filter
                df_timepoint = self.df_dropna.loc[selected]
                results = ols('Count ~ C(Treatment)', data=df_timepoint).fit()
                print(results.summary())
                mc = MultiComparison(df_timepoint['Count'],
                                     df_timepoint['Treatment'])
                mc_results = mc.tukeyhsd()
                print(mc_results)
                summary_to_return.append(results)
                mc_results_to_return.append(mc_results)

        if combine_sexes:
            analyse_timepoints()
        else:
            analyse_timepoints(treatments.isin(self.male_treatment_labels))
            analyse_timepoints(treatments.isin(self.female_treatment_labels))

        return summary_to_return, mc_results_to_return
Beispiel #4
0
    def test_incorrect_output(self):
        """Check invalid inputs raise and that ``group_order`` subsetting warns once."""
        # too few groups, too many groups, just one group
        invalid_group_labels = ([1, 2] * 4, [1, 2] * 6, [1] * 10)
        for group_labels in invalid_group_labels:
            assert_raises(ValueError, MultiComparison, np.array([1] * 10),
                          group_labels)

        # group_order doesn't select all observations, only one group left
        with warnings.catch_warnings(record=True):
            warnings.simplefilter('always')
            assert_raises(ValueError, MultiComparison, np.array([1] * 10),
                          [1, 2] * 5, group_order=[1])

        # group_order doesn't select all observations: the comparison is
        # built on the reduced set and exactly one UserWarning is recorded.
        data = np.arange(15)
        groups = np.repeat([1, 2, 3], 5)
        with warnings.catch_warnings(record=True) as caught:
            warnings.simplefilter('always')
            mod1 = MultiComparison(np.array(data), groups, group_order=[1, 2])
            assert_equal(len(caught), 1)
            assert issubclass(caught[0].category, UserWarning)

        res1 = mod1.tukeyhsd(alpha=0.01)
        mod2 = MultiComparison(np.array(data[:10]), groups[:10])
        res2 = mod2.tukeyhsd(alpha=0.01)

        def assert_same_attributes(first, second, names):
            # Compare the named attributes of two objects one by one.
            for att in names:
                assert_allclose(getattr(first, att), getattr(second, att),
                                rtol=1e-14, err_msg=att + 'failed')

        assert_same_attributes(res1, res2, [
            'confint', 'data', 'df_total', 'groups', 'groupsunique',
            'meandiffs', 'q_crit', 'reject', 'reject2', 'std_pairs', 'variance'
        ])
        assert_same_attributes(mod1, mod2, [
            'data', 'datali', 'groupintlab', 'groups', 'groupsunique',
            'ngroups', 'nobs', 'pairindices'
        ])
Beispiel #5
0
def tukey(structure, alpha, valutation):
    """Run a Tukey HSD test, save its confidence-interval plot and text report.

    The data and group labels come from ``make_datagroup(structure,
    valutation)``. The plot is written to
    ``<path>results/run/plot/Tukey_HSD_test_<suffix>.png`` and the result
    table to ``<path>results/run/plot/tukey_HSD_<suffix>.txt``, where ``path``
    is a module-level prefix. The result table is also printed.

    Parameters
    ----------
    structure :
        Passed unchanged to ``make_datagroup``.
    alpha :
        Significance level passed to ``tukeyhsd``.
    valutation :
        Metric selector: ``'ap'`` and ``'p_10'`` have their own axis label
        and file suffix; any other value is reported as R-precision.
    """
    # Only the x-axis label and the file suffix depend on the metric.
    if valutation == 'ap':
        axis_label, suffix = "Average Precision (AP)", "ap"
    elif valutation == 'p_10':
        axis_label, suffix = "P(10)", "p10"
    else:
        axis_label, suffix = "Rprec", "rprec"

    data, group = make_datagroup(structure, valutation)
    mc = MultiComparison(data, group)
    result = mc.tukeyhsd(alpha)

    fig = result.plot_simultaneous()    # Plot group confidence intervals
    fig.set_figwidth(30)
    fig.set_figheight(20)
    axes = fig.gca()
    fig.suptitle('Tukey_HSD test', fontsize=40)
    axes.set_xlabel(axis_label, fontsize=30)
    axes.tick_params(labelsize=30)
    fileplot = path + "results/run/plot/Tukey_HSD_test_" + suffix + ".png"
    fig.savefig(fileplot, dpi=300)

    # The context manager closes the report even if writing fails.
    with open(path + "results/run/plot/tukey_HSD_" + suffix + ".txt", "w") as fw:
        fw.write(str(result))
    print(result)
Beispiel #6
0
    def tukey_test(self):
        """Run Tukey's multi-comparison test on the rows, then the columns, of ``self.df``.

        Each summary is printed and also written to ``self.results``.
        Tukey's test assumes normally distributed data; see
        https://en.wikipedia.org/wiki/Tukey%27s_range_test for background.

        Returns a tuple ``(row_summary, column_summary)``.
        """
        by_rows = MultiComparison(self.df.values, self.df.index)
        row_summary = by_rows.tukeyhsd().summary()
        row_report = "\nTukey test for rows\n" + str(row_summary) + "\n"
        print(row_report)
        self.results.write(row_report)

        by_columns = MultiComparison(self.df.T.values, self.df.columns)
        column_summary = by_columns.tukeyhsd().summary()
        column_report = ("\nTukey test for columns\n" + str(column_summary)
                         + "\n")
        print(column_report)
        self.results.write(column_report)

        return (row_summary, column_summary)
Beispiel #7
0
def tukeyhsd(statistics_table: pandas.DataFrame,
             column: str) -> Dict[str, TukeyHSDResults]:
    """
    Perform Tukey multiple-comparison tests of `column` over several groupings.

    Parameters
    ----------
    statistics_table: Table holding `column` together with the 'plate',
        'strain' and 'condition' columns. A 'condition:strain' column is
        added to this table in place.
    column: Name of the column with the values to compare.

    Returns
    -------
    Mapping from grouping name to its Tukey result. 'plate', 'strain' and
    (only when more than one condition is present) 'condition' appear only
    if they have more than two distinct values; 'condition_strain' is
    always computed.
    """
    subjects = ['plate', 'strain']
    if statistics_table['condition'].nunique() != 1:
        # Several conditions present: compare across conditions as well.
        subjects.append('condition')

    tukey_results = {}
    for subject in subjects:
        logger.debug(f"tukey subject: '{subject}'")
        logger.debug(
            f"tukey subject values: {statistics_table[subject].unique()}")
        # Per the original author's note, MultiComparison does not work with
        # only two groups, so groupings with at most two categories are skipped.
        if statistics_table[subject].nunique() <= 2:
            continue
        comparison = MultiComparison(statistics_table[column],
                                     statistics_table[subject])
        tukey_results[subject] = comparison.tukeyhsd()

    combined_label = statistics_table['condition'] + "-" + statistics_table['strain']
    statistics_table['condition:strain'] = combined_label
    combined = MultiComparison(statistics_table[column],
                               statistics_table['condition:strain'])
    tukey_results['condition_strain'] = combined.tukeyhsd()

    return tukey_results
Beispiel #8
0
def get_tukey(exp, df_all, measure):
    """Run a Tukey HSD post-hoc test of `measure` across strains and save the p-values.

    The test only runs when `df_all` contains at least three strains;
    otherwise an empty table is saved. Rows whose `measure` is not finite are
    dropped before the test. Every pairwise comparison reported by the test
    is included, not just the first three.

    Background on obtaining per-pair p-values:
    https://jpktd.blogspot.com/2013/03/multiple-comparison-and-tukey-hsd-or_25.html
    https://code.google.com/archive/p/qsturng-py/
    https://stackoverflow.com/questions/48200699/how-can-i-get-p-values-of-each-group-comparison-when-applying-the-tukey-s-hones

    Parameters
    ----------
    exp :
        Experiment object; ``exp.name`` and ``exp.dir_tukey`` are used to
        build the output location.
    df_all :
        Table with a 'strain' column and the `measure` column.
    measure : str
        Name of the column to compare.

    Returns
    -------
    DataFrame with columns 'group1', 'group2' and 'p_value' (rounded to four
    decimals), one row per compared pair.
    """
    if len(df_all.groupby('strain').count()) >= 3:
        df_tukey = df_all[np.isfinite(df_all[measure])]
        mc = MultiComparison(df_tukey[measure], df_tukey['strain'])
        result = mc.tukeyhsd()
        # Studentized-range p-value for each pair; atleast_1d guards against
        # a scalar return when there is only a single pair.
        p = np.atleast_1d(
            psturng(np.abs(result.meandiffs / result.std_pairs),
                    len(result.groupsunique), result.df_total))
        # Row 0 of the results table is the header; rows 1..len(p) are the pairs.
        pair_rows = [result._results_table[i] for i in range(1, len(p) + 1)]
        df_pairs = pd.DataFrame({
            'group1': [row[0] for row in pair_rows],
            'group2': [row[1] for row in pair_rows],
            'p_value': [np.around(value, 4) for value in p],
        })
    else:
        df_pairs = pd.DataFrame({'group1': [], 'group2': [], 'p_value': []})

    file_out = exp.name + '_coupling_' + measure + '_' + '_tukey_' + '.csv'
    pfiles.save_csv(df_pairs, file_out, exp.dir_tukey, False)
    return df_pairs
def scipy_anova_post_hoc_tests(df=None,
                               flight_status_col='flight status new',
                               sig_test=stats.f_oneway):
    """
    For each aberration type, test pre/mid/post-flight counts and follow up with Tukey HSD.

    `df` should be melted by aberration type: it needs the columns
    'aberration type', 'count per cell' and `flight_status_col`. `sig_test`
    is called with the three flight-status samples and must return
    ``(statistic, p_value)``. Results are printed; nothing is returned.
    """
    flight_statuses = ('Pre-Flight', 'Mid-Flight', 'Post-Flight')

    for aberr in list(df['aberration type'].unique()):
        is_aberr = df['aberration type'] == aberr
        samples = [
            df[(df[flight_status_col] == status) & is_aberr]['count per cell']
            for status in flight_statuses
        ]
        statistic, p_value = sig_test(*samples)
        print(aberr, p_value)

        # Post-hoc tests only when the omnibus test is significant.
        if not p_value <= 0.05:
            continue

        of_type = df[is_aberr]
        mc_results = MultiComparison(of_type['count per cell'],
                                     of_type[flight_status_col]).tukeyhsd()
        print(mc_results)
        pair_p_values = list(
            psturng(np.abs(mc_results.meandiffs / mc_results.std_pairs),
                    len(mc_results.groupsunique), mc_results.df_total))
        print(f'pvalues: {pair_p_values}')
        print('\n')
Beispiel #10
0
def get_tukey(df_groups, measure, groups):
    """Run a Tukey HSD post-hoc test of `measure` across strains with significance stars.

    The table at ``df_groups[3]`` is used, restricted to rows with a finite
    `measure`. Every pairwise comparison reported by the test is included,
    not just the first three.

    Background on obtaining per-pair p-values:
    https://jpktd.blogspot.com/2013/03/multiple-comparison-and-tukey-hsd-or_25.html
    https://code.google.com/archive/p/qsturng-py/
    https://stackoverflow.com/questions/48200699/how-can-i-get-p-values-of-each-group-comparison-when-applying-the-tukey-s-hones

    Parameters
    ----------
    df_groups :
        Indexable container; element 3 must have 'strain' and `measure` columns.
    measure : str
        Name of the column to compare.
    groups :
        Unused; kept so existing callers keep working.

    Returns
    -------
    DataFrame with columns 'group1', 'group2', 'p_value' (rounded to four
    decimals) and 'significance' (the output of ``get_stars``).
    """
    df_tukey = df_groups[3][np.isfinite(df_groups[3][measure])]
    mc = MultiComparison(df_tukey[measure], df_tukey['strain'])
    result = mc.tukeyhsd()
    # Studentized-range p-value for each pair; atleast_1d guards against a
    # scalar return when there is only a single pair.
    p = np.atleast_1d(
        psturng(np.abs(result.meandiffs / result.std_pairs),
                len(result.groupsunique), result.df_total))
    # Row 0 of the results table is the header; rows 1..len(p) are the pairs.
    pair_rows = [result._results_table[i] for i in range(1, len(p) + 1)]
    df_pairs = pd.DataFrame({
        'group1': [row[0] for row in pair_rows],
        'group2': [row[1] for row in pair_rows],
        'p_value': [np.around(value, 4) for value in p],
    })

    df_pairs['significance'] = [
        get_stars(p_value) for p_value in df_pairs['p_value']
    ]

    return df_pairs
def anova_all( type, file_output, *datas ):
    """Append a one-way ANOVA over the 'om', 'opr' and 'ompr' strategies to a report file.

    All tables in `datas` are concatenated; each must provide 'strategy' and
    'result' columns. The line ``<type>:<p-value>`` is appended to
    `file_output`. When the p-value is below 0.05 the rounded p-value and
    the Tukey HSD table over all strategies are appended as well.

    Parameters
    ----------
    type :
        Label written in front of the p-value.
    file_output :
        Path of the report file, opened in append mode.
    *datas :
        Tables to combine before testing.
    """
    # One concat call; the leading empty frame keeps the no-input case
    # producing an empty table as before.
    together = pd.concat( [pd.DataFrame(), *datas] )

    # Group by a scalar key so get_group accepts plain strategy names on
    # every pandas version.
    groups = together.groupby('strategy')

    fvalue, pvalue = stats.f_oneway(
        groups.get_group("om")["result"],
        groups.get_group("opr")["result"],
        groups.get_group("ompr")["result"]
    )

    # The context manager guarantees the report is flushed and closed.
    with open( file_output, "a" ) as f:
        f.write( type + ":" + str( pvalue ) + "\n" )

        if pvalue < 0.05:
            mc = MultiComparison(
                together["result"],
                together["strategy"]
            )
            mc_results = mc.tukeyhsd()
            print( str( round( pvalue, 4 ) ), file=f )
            print( mc_results, file=f )
Beispiel #12
0
class CheckTuckeyHSD(object):
    """Shared Tukey HSD checks.

    Reads ``endog``, ``groups``, ``alpha`` and the reference values
    ``meandiff2``, ``confint2`` and ``reject2`` from the class; none of them
    are defined here.
    """

    @classmethod
    def setup_class_(cls):
        # Build the comparison and its Tukey result once for all tests.
        cls.mc = MultiComparison(cls.endog, cls.groups)
        cls.res = cls.mc.tukeyhsd(alpha=cls.alpha)

    def test_multicomptukey(self):
        # Mean differences, confidence intervals and reject flags must match
        # the reference values.
        assert_almost_equal(self.res[1][2], self.meandiff2, decimal=14)
        assert_almost_equal(self.res[1][4], self.confint2, decimal=2)
        assert_equal(self.res[1][1], self.reject2)

    def test_group_tukey(self):
        group_result = get_thsd(self.mc, alpha=self.alpha)
        assert_almost_equal(group_result[4], self.confint2, decimal=2)

    def test_shortcut_function(self):
        # The wrapper function must agree with the class-based result.
        shortcut = pairwise_tukeyhsd(self.endog, self.groups,
                                     alpha=self.alpha)
        assert_almost_equal(shortcut[1][4], self.res[1][4], decimal=14)
Beispiel #13
0
def create_df_with_all_post_hoc(df, hue='value'):
    """Collect the Tukey post-hoc comparisons of every measured variable in one table.

    For each variable a one-way OLS on ``level`` is fitted and a Tukey HSD
    test is run on the data returned by ``get_data_for_anova``. When the OLS
    F-test p-value is below 0.05 the three post-hoc rows are stored;
    otherwise the variable's rows are filled with ``'anova ns'``.

    Returns a DataFrame indexed by (variable, comparison) with the columns
    'group1', 'group2', 'meandiff', 'lower', 'upper' and 'reject'.
    """
    variables = [
        "start_cm", "rel_pt", "amp_max_cop", 'amp_max_pel', 'amp_max_c7',
        "vel_max_cop", 'vel_max_pel', 'vel_max_c7', "overshoot", "dcm", "dtml",
        "rcm"
    ]
    comparisons = ["comparaison 0", "comparaison 1", "comparaison 2"]
    columns = ['group1', 'group2', 'meandiff', 'lower', 'upper', 'reject']

    # Start from an all-NaN table with one row per (variable, comparison).
    blank = np.full((len(variables) * len(comparisons), len(columns)), np.nan)
    full_df = pd.DataFrame(
        blank,
        index=pd.MultiIndex.from_product([variables, comparisons]),
        columns=columns)

    for v in variables:
        data = get_data_for_anova(df, v, hue)
        results = ols("{} ~ C(level)".format(v), data=data).fit()
        post_hoc = MultiComparison(data[v], data['level']).tukeyhsd()
        df_ph = create_df_post_hoc(post_hoc, v)

        significant = results.f_pvalue < 0.05
        full_df.loc[(v)] = df_ph.values if significant else 'anova ns'

    return full_df
Beispiel #14
0
    def print_hpcontrast(self, data: list, labels: list, alpha: float = 0.05):
        """Contrast the hypotheses of the given scores and print the outcome in the report.

        A Kruskal-Wallis test is run first; only when its p-value is at most
        `alpha` is a Tukey HSD test run on the stacked scores.

        :param data: List containing the metric results of the models
        :type data: list
        :param labels: List containing the model tags
        :type labels: list
        :param alpha: Significance threshold of the tests, defaults to 0.05
        :type alpha: float, optional
        """
        _, p_value = stats.kruskal(*data)
        report = f"p-valor KrusW:{p_value}\n"

        if not p_value <= alpha:
            self.print_noformat(
                report + "Hypotheses are being accepted: the models are equal")
            return

        report += "Hypotheses are being rejected: the models are different\n"
        scores = np.vstack(data).ravel()
        # Every model contributes as many scores as the first one.
        n_scores = len(data[0])
        model_tags = np.vstack(
            [np.repeat("model" + tag, n_scores) for tag in labels]).ravel()
        comparison = MultiComparison(scores, model_tags).tukeyhsd(alpha=alpha)
        report += str(comparison)
        self.print_noformat(report)
Beispiel #15
0
    def Tukey(self, Categorical, Continuous):
        """
        Run Tukey's Honest Significant Difference (HSD) test on ``self.df``.

        Rows with missing values are dropped first. `Continuous` names the
        value column and `Categorical` the grouping column. Returns a
        DataFrame with one row per compared pair and the columns 'Group 1',
        'Group 2', 'Mean Difference' (rounded to three decimals) and 'Reject'.
        """
        temp_df = self.df.dropna()
        start = time.time()

        mc = MultiComparison(temp_df[Continuous], temp_df[Categorical])
        result = mc.tukeyhsd()
        labels = mc.groupsunique

        # pairindices holds, for each comparison, the positions of its two groups.
        stacked = np.column_stack((
            [labels[position] for position in mc.pairindices[0]],
            [labels[position] for position in mc.pairindices[1]],
            [round(float(diff), 3) for diff in result.meandiffs],
            result.reject,
        ))
        TukeyResult = pd.DataFrame(
            stacked,
            columns=['Group 1', "Group 2", "Mean Difference", "Reject"])

        end = time.time()
        if self.debug == 'YES':
            print('Tukey', end - start)
        return TukeyResult
def anova_analysis(df):
    """
    Run a one-way ANOVA and Tukey HSD test of weekly seats sold across holiday periods.

    `df` is summed per ('week_ending', 'Holiday'). The F statistic and
    p-value, the group means and the Tukey result are printed, and a box
    plot is saved to 'ANOVA.png'.
    """
    time_periods = df.groupby(['week_ending', 'Holiday'],
                              as_index=False)[['seats_sold']].sum()

    holidays = ('ThanksGiving', 'WinterBreak', 'SummerBreak', 'Not Holiday')
    samples = [
        time_periods.loc[time_periods['Holiday'] == holiday, 'seats_sold']
        for holiday in holidays
    ]
    f, p = stats.f_oneway(*samples)
    print('The f and p of ANOVA analysis are:')
    print(f, p)

    # Box plot of seats sold for each group.
    time_periods.boxplot('seats_sold', by='Holiday', figsize=(12, 8))
    plt.savefig('ANOVA.png')

    print("The mean seats sold of each time periods:")
    print(time_periods.groupby('Holiday')['seats_sold'].mean())

    pairwise = MultiComparison(time_periods['seats_sold'],
                               time_periods['Holiday'])
    result = pairwise.tukeyhsd()
    print(pairwise)
    print(result)
def posthoc_turron_by_gender(melted, variable):
    """Print a Tukey HSD test of 'value' across turron/gender combinations.

    Only the rows of `melted` whose 'variable' equals `variable` are used;
    `melted` itself is not modified.
    """
    subset = melted.copy()
    is_variable = subset['variable'] == variable
    subset = subset[is_variable]
    subset['turron:gender'] = subset['turron'] + "_" + subset['gender']
    comparison = MultiComparison(subset['value'], subset['turron:gender'])
    print(comparison.tukeyhsd())
def tukey_multi_metrics(gather_df,
                        col_indices=list(range(10)) + list(range(15, 20)) +
                        list(range(25, 65)) + list(range(145, 149)) +
                        list(range(150, 180)),
                        alpha=0.05):
    """Run a Tukey HSD test between models for each selected metric column.

    Parameters
    ----------
    gather_df :
        Table with a MultiIndex that has 'model', 'set' and 'fold' levels;
        the model names are read from index level 0. Only the
        'test_metrics' rows are used and the 'Folds Mean' / 'Folds Median'
        summary rows are dropped.
    col_indices :
        Positions of the metric columns to analyse. The default list is
        never mutated.
    alpha :
        Significance level passed to ``tukeyhsd``.

    Returns
    -------
    dict mapping each metric name to its Tukey HSD result.
    """
    metric_names = list(gather_df.columns.values[col_indices])
    model_names = list(gather_df.index.levels[0])

    # drop fold means and medians
    gather_df = gather_df[metric_names]
    gather_df = gather_df.xs('test_metrics', level='set')
    gather_df = gather_df.drop('Folds Mean', level='fold')
    gather_df = gather_df.drop('Folds Median', level='fold')

    # One group label per remaining fold row, in model order.
    model_names_rep = []
    for m in model_names:
        k = gather_df.xs(m, level='model').shape[0]
        model_names_rep.extend([m] * k)

    tukey_dict = {}
    for metric in metric_names:
        # Sort a copy rather than sorting the column selection in place,
        # then order the rows to line up with model_names_rep.
        m_df = gather_df[metric].sort_index().loc[model_names]

        # `.values` replaces `as_matrix()`, which was removed in pandas 1.0.
        m_df_mat = np.around(m_df.values, decimals=4)
        mc_obj = MultiComparison(m_df_mat, model_names_rep)
        tukey_dict[metric] = mc_obj.tukeyhsd(alpha=alpha)

    return tukey_dict
def tukey_hsd(data, groups, metric):
    """Tukey HSD post-hoc comparison from statsmodels, returned as a dataframe.

    The test is run with the statsmodels default significance level. In the
    result, group1 and group2 are the groups being compared, meandiff is the
    difference in their means, lower/upper are the confidence-interval
    bounds and reject states whether the null hypothesis should be rejected.

    Args:
        data (Dataframe): Table holding the `groups` and `metric` columns.
        groups (string): Name of the column whose values define the groups, i.e. label_type.
        metric (string): Name of the column with the result values, i.e. values/times.

    Returns:
        tukey_frame (Dataframe): One row per compared pair, with the 'p-adj'
        column renamed to 'p-value', for example:

            group1   group2  meandiff  p-value   lower   upper  reject
        0       ap  ap type   -0.1167   0.8059 -0.5825  0.3492   False
        1       ap       da    0.1000   0.8543 -0.3659  0.5659   False
        2  ap type       da    0.2167   0.5158 -0.2492  0.6825   False
    """
    tukey_results = MultiComparison(data=data[metric],
                                    groups=data[groups]).tukeyhsd()

    # The first row of the table data is the header, the rest are the pairs.
    table_rows = tukey_results._results_table.data
    tukey_frame = pd.DataFrame(data=table_rows[1:], columns=table_rows[0])
    tukey_frame = tukey_frame.rename(columns={'p-adj': 'p-value'})

    return tukey_frame
Beispiel #20
0
 def _calc_firing_rate(self,
                       num_peaks: pd.DataFrame,
                       epoch: str = "All_cells"):
     """
     Compare the values of `num_peaks` between its columns with a Tukey HSD test.

     The table is stacked into one long series, grouped by the second index
     level of the stacked result (the original column labels). The Tukey
     table and the per-pair p-values are printed; when the test raises a
     ValueError a notice is printed instead. The per-group mean is printed
     in every case.

     :param num_peaks: table with one column per group to compare
     :param epoch: label used in the printed p-value header
     :return: None
     """
     # NOTE(review): stack() drops NaN entries by default on the pandas
     # versions this was written for - presumably that is how silent cells
     # are removed from the comparison; confirm against the caller.
     split_data = num_peaks.stack()
     mc = MultiComparison(split_data.values,
                          split_data.index.get_level_values(1).values)
     try:
         res = mc.tukeyhsd()
     except ValueError:
         aprint("<yellow>Failed during the p-value calculation.</yellow>")
     else:
         print(res)
         print(
             f"P-values ({epoch}, number of cells: {split_data.shape[0] // 3}):",
             psturng(
                 np.abs(res.meandiffs / res.std_pairs),
                 len(res.groupsunique),
                 res.df_total,
             ),
         )
     finally:
         # Series.mean(level=...) was removed in pandas 2.0; the groupby form
         # works on old and new versions. sort=False keeps the groups in
         # order of first appearance.
         print(split_data.groupby(level=1, sort=False).mean())
def preform_anova(df_anova, post_hoc=False):
    """Fit ``err ~ u + m + n + h + q + prob``, print the type-2 ANOVA table and save it.

    An ``r_sq`` effect-size column, ``sum_sq / (sum_sq + residual sum_sq)``,
    is added for every term except the last row of the table. The table is
    written to 'analysis/anova_results.csv'.

    Parameters
    ----------
    df_anova :
        Table with the columns 'err', 'u', 'm', 'n', 'h', 'q' and 'prob'.
    post_hoc : bool
        When true, a Tukey HSD test of ``err`` grouped by the term is printed
        for every term whose p-value is below 0.05.
    """
    formula = 'err ~ u + m + n + h + q + prob'

    model = ols(formula, data=df_anova).fit()
    aov_table = sm.stats.anova_lm(model,
                                  typ=2)  # Type 2 is for ANOVA DataFrame

    # Effect size per term; the last row supplies the residual sum of
    # squares. `.iloc` makes the positional access explicit on the
    # label-indexed column.
    sum_sq = aov_table['sum_sq']
    esq_sm = sum_sq.iloc[:-1] / (sum_sq.iloc[:-1] + sum_sq.iloc[-1])
    aov_table.loc[:, 'r_sq'] = esq_sm

    print(aov_table)

    if post_hoc:
        for name, row in aov_table.iterrows():
            # A NaN p-value compares false here, so such rows are skipped.
            if row['PR(>F)'] < 5e-2:
                # MultiComparison takes (data, groups): the response is the
                # data and the significant factor defines the groups.
                mc = MultiComparison(df_anova['err'], df_anova[name])
                mc_results = mc.tukeyhsd()
                print('post_hoc')
                print(mc_results)
    aov_table.to_csv('analysis/anova_results.csv')
def calc_Tukey(f):
    '''
    Run a Tukey HSD test for one feature and return the result.

    f = name of the feature; its table is looked up in the module-level
    ``data.grouped_features`` and compared on 'value' grouped by 'KD'.
    '''
    feature_table = data.grouped_features[f]
    comparison = MultiComparison(feature_table['value'], feature_table['KD'])
    return comparison.tukeyhsd()
def posthoc_turron_by_first_time_tasting(melted, variable):
    """Print per-pair p-values and the Tukey HSD table across turron/first-time-tasting groups.

    Only the rows of `melted` whose 'variable' equals `variable` and that
    have no missing values are used; `melted` itself is not modified.
    """
    from statsmodels.stats.libqsturng import psturng

    subset = melted.copy()
    subset = subset[subset['variable'] == variable].dropna()
    group_label = subset['turron'] + "_" + subset['first_time_tasting']
    subset['turron:first_time_tasting'] = group_label

    comparison = MultiComparison(subset['value'],
                                 subset['turron:first_time_tasting'])
    outcome = comparison.tukeyhsd()
    studentized = np.abs(outcome.meandiffs / outcome.std_pairs)
    p_values = psturng(studentized, len(outcome.groupsunique),
                       outcome.df_total)
    print(p_values)
    print(outcome)
Beispiel #24
0
def tukey(prediction: 'array of ints', model: 'array of string'):
    """
    Run a Tukey HSD test on predictions grouped by model name.

    input: array(int(predictions)), array(str(model_names))
    The result table, which shows whether the classes differ significantly,
    is printed and returned.
    """
    comparison = MultiComparison(prediction, model)
    outcome = comparison.tukeyhsd()
    print(outcome)
    return outcome
Beispiel #25
0
def perform_post_hoc_tukey(data, factor1, factor2, factor3, factor2_idx):
    """Print a Tukey HSD test of `factor1` across `factor3` groups for each `factor2` level.

    Levels are taken to be the integers 1..len(factor2_idx). Both the values
    and the group labels are read from `data`.

    Parameters
    ----------
    data :
        Table containing the `factor1`, `factor2` and `factor3` columns.
    factor1 : str
        Column with the values to compare.
    factor2 : str
        Column whose levels split the analysis.
    factor3 : str
        Column with the group labels.
    factor2_idx :
        Sized collection; only its length is used.
    """
    for level in range(1, len(factor2_idx) + 1):
        print(factor2, ': ', level)
        # Take values and labels from the same rows of the table passed in,
        # not from a module-level table.
        at_level = data[data[factor2] == level]
        mc = MultiComparison(at_level[factor1], at_level[factor3])
        result = mc.tukeyhsd()

        print(result)
        print(mc.groupsunique)
def tukey(data, names):
    """Flatten the nested `data`, run a Tukey HSD test by `names` and print the result.

    `data` is an iterable of iterables of values; `names` supplies the group
    label for each flattened value. The result table and the unique group
    labels are printed.
    """
    labels = np.array(names)
    flattened = np.array([value for item in data for value in item])
    comparison = MultiComparison(flattened, labels)
    print(comparison.tukeyhsd())
    print(comparison.groupsunique)
Beispiel #27
0
def tukey(exp):
    """Print a Tukey HSD test of `exp` grouped by the module-level ``groups``."""
    comparison = MultiComparison(exp, groups)
    outcome = comparison.tukeyhsd()

    print(outcome)
    print(comparison.groupsunique)


# calc_p_value(exp2__saccade_count, exp3__saccade_count)
# mean_calcs(exp3__saccade_count)
# anova(exp2__fix_dur)
 def tukey_hsd(self, stacked_df, colname):
     '''
     Tukey HSD post-hoc test for use after a significant ANOVA.

     input:
     stacked_df: stacked dataframe with a 'state' column
     colname: string, the column with the values to compare
     return:
     summary table of the Tukey HSD test of `colname` grouped by 'state'
     '''
     comparison = MultiComparison(stacked_df[colname], stacked_df['state'])
     outcome = comparison.tukeyhsd()
     return outcome.summary()
Beispiel #29
0
	def GroupTukeyHSD(self,continuous, categorical):
		"""Group categories whose pairwise Tukey HSD comparisons were not rejected.

		A Tukey HSD test is run on `continuous` grouped by `categorical`.
		Pairs whose 'Reject' entry equals the string 'False' are treated as
		having a similar distribution and are collected, greedily and in
		table order, into lists. Categories that end up in no list get a
		list of their own.

		Returns a DataFrame with the columns 'list_name', 'lists' and
		'length', sorted by 'length' in descending order. Any ordinary
		exception raised along the way yields an empty DataFrame.
		"""
		try:
			mc = MultiComparison(continuous, categorical)
			result = mc.tukeyhsd()
			UniqueGroup = mc.groupsunique
			first_groups = [UniqueGroup[index] for index in mc.pairindices[0]]
			second_groups = [UniqueGroup[index] for index in mc.pairindices[1]]
			meandiffs = [round(float(meandiff),3) for meandiff in result.meandiffs]
			columns = ['Group 1', "Group 2", "Mean Difference", "Reject"]
			TukeyResult = pd.DataFrame(np.column_stack((first_groups, second_groups, meandiffs, result.reject)), columns=columns)

			# Keep only the pairs that were not rejected. The comparison is
			# against the text 'False' because column_stack turns every
			# column into text when the group labels are strings.
			TukeyResult_false = TukeyResult[TukeyResult['Reject']=='False']
			overall_distribution_list = []
			same_distribution_list = []
			if len(TukeyResult_false) > 0:
				for anchor in TukeyResult_false['Group 1'].unique():
					if anchor not in overall_distribution_list:
						members = [anchor]
						partners = TukeyResult_false[TukeyResult_false['Group 1']== anchor]
						overall_distribution_list.append(anchor)
						for entry in list(partners['Group 2'].unique()):
							if entry not in overall_distribution_list:
								overall_distribution_list.append(entry)
								members.append(entry)
						same_distribution_list.append(dict(list_name=anchor, lists=members, length=len(members)))

				# Categories that were never placed become singleton lists.
				missing_categories = list(set(categorical.unique())-set(overall_distribution_list))
				for category in missing_categories:
					same_distribution_list.append(dict(list_name=category, lists=[category], length=1))

			else:
				for category in categorical.unique():
					same_distribution_list.append(dict(list_name=category, lists=[category], length=1))

			g1 = pd.DataFrame(same_distribution_list).sort_values('length',ascending=False)
		except Exception:
			# Best effort: fall back to an empty frame on ordinary errors,
			# but let KeyboardInterrupt and SystemExit propagate.
			g1 = pd.DataFrame()
		return g1
def tukey_test(score_array):
    """Run a pairwise Tukey HSD test (alpha 0.05) over groups of scores.

    Args:
        score_array: Sequence of groups, each itself a sequence of numeric
            scores. Groups are allowed to have different lengths.

    Returns:
        The object returned by ``MultiComparison.tukeyhsd(0.05)``. Groups
        are labelled with their 1-based position in ``score_array``.
    """
    # Flatten all scores into one vector (raises ValueError on empty input,
    # exactly like the concatenate call it replaces).
    values = np.concatenate(
        [np.asarray(scores) for scores in score_array], axis=0)

    # Label each score with the 1-based INTEGER index of its group.
    # np.repeat copes with groups of unequal size, whereas stacking the
    # per-group label arrays into one ndarray fails for ragged input.
    group_sizes = [len(scores) for scores in score_array]
    indexes = np.repeat(np.arange(1, len(group_sizes) + 1), group_sizes)

    # Perform the pairwise Tukey test once; reuse it for printing and return.
    multi_comp = MultiComparison(values, indexes)
    result = multi_comp.tukeyhsd(0.05)
    print(result.summary())
    return result
class TukeyHSD(Difference):
    """Tukey HSD multiple-comparison test.

    Assumes normality, IID, one-to-one data pairing.

    Source:
    https://cleverowl.uk/2015/07/01/using-one-way-anova-and-tukeys-test-to-compare-data-sets/
    """

    name = "tukeyhsd"

    def run(self, data, groups):
        """Run the Tukey HSD test and remember the outcome on the instance.

        Args:
            data: Observations, passed straight to ``MultiComparison``.
            groups: Group label for each observation, passed straight to
                ``MultiComparison``.

        Returns:
            A tuple ``(result, groupsunique)``: the ``tukeyhsd()`` result and
            the ``groupsunique`` attribute of the comparison object.
        """
        self.mc = MultiComparison(data, groups)
        self.result = self.mc.tukeyhsd()
        return self.result, self.mc.groupsunique

    def report(self):
        """Log the stored result and unique groups via ``info``.

        Reads ``self.result`` and ``self.mc``, which are set by ``run``.
        """
        info("{} results:".format(self.name))
        info("result: {}".format(str(self.result)))
        info("Unique groups: {}".format(self.mc.groupsunique))
 def test_table_names_custom_group_order(self):
     """Check that ``group_order`` controls the order of reported pairs."""
     # When group_order is given, the pairs in the results table should
     # follow that order rather than the default one.
     comparison = MultiComparison(
         self.endog, self.groups,
         group_order=[b'physical', b'medical', b'mental'])
     tukey = comparison.tukeyhsd(alpha=self.alpha)
     table = tukey._results_table
     expected_pairs = [(b'physical', b'medical'),
                       (b'physical', b'mental'),
                       (b'medical', b'mental')]
     # Data rows are read starting at index 1 (presumably row 0 is the
     # table header -- confirm against the results-table layout).
     for row_number, expected_pair in enumerate(expected_pairs, start=1):
         row = table[row_number]
         observed_pair = (row[0].data, row[1].data)
         assert_(observed_pair == expected_pair)
Beispiel #33
0
( 30,  'medical',  1 )], dtype=[('idx', '<i4'),
                                ('Treatment', '|S8'),
                                ('StressReduction', '<i4')])

# First, do a one-way ANOVA
df = pd.DataFrame(dta2)
model = ols('StressReduction ~ C(Treatment)', df).fit()

anovaResults = anova_lm(model)
# print() with a single argument behaves the same on Python 2 and Python 3,
# whereas the bare print statement is a syntax error on Python 3.
print(anovaResults)
if anovaResults['PR(>F)'][0] < 0.05:
    print('One of the groups is different.')

# Then, do the multiple testing
mod = MultiComparison(dta2['StressReduction'], dta2['Treatment'])
print(mod.tukeyhsd()[0])

# The following code produces the same printout
res2 = pairwise_tukeyhsd(dta2['StressReduction'], dta2['Treatment'])
# print(res2[0])

# Show the group names
print(mod.groupsunique)

# Generate a plot of the pairwise mean differences.
# NOTE(review): the res2[1][...] indexing and ls='o' below are kept exactly as
# written; they look like they target an older statsmodels/matplotlib API --
# confirm against the versions in use.
import matplotlib.pyplot as plt
plt.plot([0,1,2], res2[1][2], 'o')
plt.errorbar([0,1,2], res2[1][2], yerr=np.abs(res2[1][4].T-res2[1][2]), ls='o')
xlim = -0.5, 2.5
plt.hlines(0, *xlim)
plt.xlim(*xlim)
                                        spectraTransform[np.where(dominant == listDominant[11])[0], w],
                                        spectraTransform[np.where(dominant == listDominant[12])[0], w],
                                        spectraTransform[np.where(dominant == listDominant[13])[0], w],
                                        spectraTransform[np.where(dominant == listDominant[14])[0], w],
                                        spectraTransform[np.where(dominant == listDominant[15])[0], w],
                                        spectraTransform[np.where(dominant == listDominant[16])[0], w],
                                        spectraTransform[np.where(dominant == listDominant[17])[0], w],
                                        spectraTransform[np.where(dominant == listDominant[18])[0], w],
                                        spectraTransform[np.where(dominant == listDominant[19])[0], w],
                                        spectraTransform[np.where(dominant == listDominant[20])[0], w],
                                        spectraTransform[np.where(dominant == listDominant[21])[0], w],
                                        spectraTransform[np.where(dominant == listDominant[22])[0], w])
        # If the anova turns back a pvalue < 0.05, do multicomparison to figure out what samples are different
        if anovaResults[w, 1] < 0.05:
            mc = MultiComparison(spectraTransform[:, w], dominant)  # http://statsmodels.sourceforge.net/0.6.0/_modules/statsmodels/stats/multicomp.html
            result = mc.tukeyhsd()  # http://statsmodels.sourceforge.net/devel/generated/statsmodels.sandbox.stats.multicomp.MultiComparison.tukeyhsd.html
            inResults = np.array([mc.groupsunique[mc.pairindices[0]], mc.groupsunique[mc.pairindices[1]], result.meandiffs, result.confint[:, 0], result.confint[:, 1], result.std_pairs, result.reject]).T
            inResults = np.column_stack((np.repeat(wavelengths[w], len(result.reject)), inResults))
            tukeyResults = np.vstack((tukeyResults, inResults))

# Assemble the rows of the statistics CSV: one label followed by the
# per-wavelength values for each statistic.
row1 = np.hstack(('normal distribution p value for original spectra', normalStats))
row2 = np.hstack(('kurtosis p value for original spectra', kurtosisStats))
row3 = np.hstack(('skew p value for original spectra', skewStats))
row4 = np.hstack(('normal distribution p value for transformed spectra', normalTransformStats))
row5 = np.hstack(('kurtosis p value for transformed spectra', kurtosisTransformStats))
row6 = np.hstack(('skew p value for transformed spectra', skewTransformStats))
row7 = np.hstack(('anova results for transformed spectra', anovaResults[:, 1]))
inRows = np.vstack((row1, row2, row3, row4, row5, row6, row7))

# Write the CSV. 'wb' truncates any existing file (it is not append mode).
# open() replaces the Python 2-only file() builtin, and the context manager
# guarantees the handle is flushed and closed.
with open(outLocation + dateTag + '_statistical_analysis.csv', 'wb') as outStats:
    np.savetxt(outStats, inRows, fmt='%s', delimiter=',')
Beispiel #35
0
def main():
    """Demonstrate one-way ANOVA followed by multiple-comparison tests.

    Builds a small stress-reduction data set, runs a one-way ANOVA, a Tukey
    HSD test, plots the pairwise mean differences (saved to 'MultComp.png'
    and shown on screen), and then runs pairwise t-tests with Holm and
    Bonferroni corrections.

    Returns:
        The ``variance`` attribute of the ``pairwise_tukeyhsd`` result.
    """
    # Note: the statsmodels module is required here.
    from statsmodels.stats.multicomp import (pairwise_tukeyhsd,
                                             MultiComparison)
    from statsmodels.formula.api import ols
    from statsmodels.stats.anova import anova_lm
    from statsmodels.stats.multitest import multipletests
    import matplotlib.pyplot as plt

    # Set up the data, as a structured array.
    # The first and last field are 32-bit integers; the second field is an
    # 8-byte string. Note that here we can also give names to the individual
    # fields!
    dta2 = np.rec.array([
    (  1,   'mental',  2 ),
    (  2,   'mental',  2 ),
    (  3,   'mental',  3 ),
    (  4,   'mental',  4 ),
    (  5,   'mental',  4 ),
    (  6,   'mental',  5 ),
    (  7,   'mental',  3 ),
    (  8,   'mental',  4 ),
    (  9,   'mental',  4 ),
    ( 10,   'mental',  4 ),
    ( 11, 'physical',  4 ),
    ( 12, 'physical',  4 ),
    ( 13, 'physical',  3 ),
    ( 14, 'physical',  5 ),
    ( 15, 'physical',  4 ),
    ( 16, 'physical',  1 ),
    ( 17, 'physical',  1 ),
    ( 18, 'physical',  2 ),
    ( 19, 'physical',  3 ),
    ( 20, 'physical',  3 ),
    ( 21,  'medical',  1 ),
    ( 22,  'medical',  2 ),
    ( 23,  'medical',  2 ),
    ( 24,  'medical',  2 ),
    ( 25,  'medical',  3 ),
    ( 26,  'medical',  2 ),
    ( 27,  'medical',  3 ),
    ( 28,  'medical',  1 ),
    ( 29,  'medical',  3 ),
    ( 30,  'medical',  1 )], dtype=[('idx', '<i4'),
                                    ('Treatment', '|S8'),
                                    ('StressReduction', '<i4')])

    # First, do a one-way ANOVA
    df = pd.DataFrame(dta2)
    model = ols('StressReduction ~ C(Treatment)', df).fit()

    anovaResults = anova_lm(model)
    print(anovaResults)
    # The ANOVA table is indexed by term name, so take the first row by
    # position explicitly instead of relying on integer-key fallback.
    if anovaResults['PR(>F)'].iloc[0] < 0.05:
        print('One of the groups is different.')

    # Then, do the multiple testing
    mod = MultiComparison(dta2['StressReduction'], dta2['Treatment'])
    print(mod.tukeyhsd().summary())

    # The following call yields the same comparison as mod.tukeyhsd()
    res2 = pairwise_tukeyhsd(dta2['StressReduction'], dta2['Treatment'])

    # Show the group names
    print(mod.groupsunique)

    # Plot the pairwise mean differences with their confidence half-widths.
    xvals = np.arange(3)
    plt.plot(xvals, res2.meandiffs, 'o')
    errors = np.ravel(np.diff(res2.confint)/2)
    # 'o' is a marker, not a line style, so it belongs in fmt rather than ls.
    plt.errorbar(xvals, res2.meandiffs, yerr=errors, fmt='o')
    xlim = -0.5, 2.5
    plt.hlines(0, *xlim)
    plt.xlim(*xlim)
    pair_labels = mod.groupsunique[np.column_stack(res2._multicomp.pairindices)]
    plt.xticks(xvals, pair_labels)
    plt.title('Multiple Comparison of Means - Tukey HSD, FWER=0.05' +
              '\n Pairwise Mean Differences')

    # Save to outfile
    outFile = 'MultComp.png'
    plt.savefig(outFile, dpi=200)
    print('Figure written to {0}'.format(outFile))

    plt.show()

    # Instead of the Tukey's test, we can do pairwise t-test
    # First, with the "Holm" correction
    rtp = mod.allpairtest(stats.ttest_rel, method='Holm')
    print(rtp[0])

    # and then with the Bonferroni correction
    print(mod.allpairtest(stats.ttest_rel, method='b')[0])

    # Done this way, the variance is calculated at each comparison.
    # If you want the joint variance across all samples, you have to
    # use a few tricks:(http://jpktd.blogspot.co.at/2013/03/multiple-comparison-and-tukey-hsd-or_25.html)
    # The Tukey result computed above is reused here; it is deterministic.
    studentized_mean = res2.meandiffs
    studentized_variance = res2.variance

    # NOTE(review): the denominator is res2.variance, as in the original;
    # confirm this is the intended scale for the t statistic.
    t_stat = (studentized_mean / studentized_variance) / np.sqrt(2)
    dof = len(dta2) - len(mod.groupsunique)
    my_pvalues = stats.t.sf(np.abs(t_stat), dof) * 2  # two-sided

    # Now with the Bonferroni correction (computed for demonstration only;
    # the result is not returned).
    res_b = multipletests(my_pvalues, method='b')

    return res2.variance
def run_stats(experiment):
    '''Run independent T-test or one-way ANOVA dependent on number of groups.

    Sample columns are assigned to a group when their name matches
    ``<group>_<digits>``. One test is run per row of ``experiment.df``, the
    p values are adjusted with the Benjamini-Hochberg FDR method, and, when
    there are more than two groups, a Tukey HSD post-hoc test is run per row.

    Args:
        experiment (Experiment instance): An instance of the Experiment class.
            This function uses ``get_groups()``, ``get_sampleids()`` and the
            ``df`` attribute.

    Returns:
        A new Pandas data frame holding the first five columns of
        ``experiment.df`` plus ``p_val``, ``p_val_adj`` and, if there are
        > 2 groups, one ``significant_<g1>_<g2>`` column per group pair with
        the Tukey HSD reject flags.

    '''

    groups = experiment.get_groups()
    samples = experiment.get_sampleids()
    df = experiment.df
    all_vals = []

    ## Get values for each group, ready for T-test or ANOVA.
    for group in groups:
        # Raw string so that \d is a regex digit class, not a string escape.
        sample_re = re.compile(group + r"_\d+$")
        ids = [sample for sample in samples if sample_re.match(sample)]
        all_vals.append(list(map(list, df[ids].values)))

    ## Decide whether to use T-test or ANOVA dependent on number of groups.
    n_rows = len(all_vals[0])
    if len(groups) == 2:
        p_vals = [ttest_ind(all_vals[0][i], all_vals[1][i])[1]
                  for i in range(n_rows)]
    else:
        p_vals = []
        for i in range(n_rows):
            row_vals = [group_vals[i] for group_vals in all_vals]
            p_vals.append(f_oneway(*row_vals)[1])

    ## Adjust the p values and create a new data frame with them in.
    p_val_adj = list(multipletests(p_vals, method='fdr_bh')[1])
    # .iloc replaces the removed DataFrame.ix indexer (positional slice).
    new_df = df.iloc[:, :5].copy()
    new_df['p_val'] = pd.Series(p_vals, index=new_df.index)
    new_df['p_val_adj'] = pd.Series(p_val_adj, index=new_df.index)

    ## Post-hoc test: only meaningful if there are more than 2 groups.
    if len(groups) > 2:
        vals_df = df[samples]
        group_ids = [sample.split('_')[0] for sample in vals_df.columns.values]
        posthoc_results = {}

        ## Run the post-hoc test on each row.
        for row in range(len(vals_df)):
            # Positional access: `row` counts rows, it is not an index label.
            row_vals = vals_df.iloc[row]
            mc = MultiComparison(row_vals, group_ids)
            mc_groups = mc.groupsunique
            significant = mc.tukeyhsd().reject
            pairs = zip(*[x.tolist() for x in mc.pairindices])

            ## Record the reject flag of each pair under a stable pair name.
            for flag, pair in zip(significant, pairs):
                first, second = sorted(pair)
                pair_name = str(mc_groups[first]) + '_' + str(mc_groups[second])
                posthoc_results.setdefault(pair_name, []).append(flag)

        ## Add the post-hoc results to the data frame.
        for pair_name, flags in posthoc_results.items():
            new_df['significant_' + pair_name] = flags

    return new_df
Beispiel #37
-1
('Pat', 9),
('Pat', 4),
('Jack', 4),
('Jack', 8),
('Jack', 7),
('Jack', 5),
('Jack', 1),
('Jack', 5),
('Alex', 9),
('Alex', 8),
('Alex', 8),
('Alex', 10),
('Alex', 5),
('Alex', 10)], dtype = [('Archer','|U5'),('Score', '<i8')])

# One-way ANOVA over the scores of the three archers.
f, p = stats.f_oneway(*[data[data['Archer'] == archer].Score
                        for archer in ('Pat', 'Jack', 'Alex')])

print('One-way ANOVA\n=============')

print('F value:', f)
print('P value:', p, '\n')

# Tukey HSD post-hoc comparison between the archers.
mc = MultiComparison(data['Score'], data['Archer'])
result = mc.tukeyhsd()

print(result)
print(mc.groupsunique)