Beispiel #1
0
def compute_anova_rev_restrict_t(topdir: str, m: int):
    """Print a two-way ANOVA and Tukey HSD comparisons of REV over TOL and NNN.

    One table is loaded per (neighbors, tolerance) case from the directory
    ``<topdir>/nn_<tol>_<n>`` through ``ac.compute_stored_runs``. Each table
    is tagged with its ``TOL`` and ``NNN`` values and all tables are stacked.
    Rows with ``TOL == 1.0`` are dropped before the model is fitted.

    Parameters
    ----------
    topdir : str
        Directory containing the per-case sub-directories.
    m : int
        Forwarded unchanged to ``ac.compute_stored_runs``.

    Returns
    -------
    None
        All results are printed.
    """
    # Assemble a large experiment table with all data
    neighbors = ["5", "10", "15", "20"]
    tolerances = ['0.0', '0.2', '0.4', '0.6', '0.8', '1.0']
    dfs = []

    for n in neighbors:
        for tol in tolerances:
            casedir = topdir + '/' + 'nn' + '_' + tol + '_' + n
            casetable = ac.compute_stored_runs(casedir, m, None)
            # A scalar broadcasts to every row, so the table no longer has
            # to contain exactly five runs for the assignment to succeed.
            casetable['TOL'] = float(tol)
            casetable['NNN'] = float(n)
            dfs.append(casetable)

    dfa = pd.concat(dfs).reset_index(drop=True)
    df = dfa[dfa['TOL'] != 1.0]

    # Perform a regression with the data
    results = ols('REV ~ C(TOL) + C(NNN) + C(TOL):C(NNN)', data=df).fit()
    print(results.summary())
    print('\n\n\n')
    aov_table = sm.stats.anova_lm(results, typ=2)
    print(aov_table)
    print('\n\n\n')

    # Pairwise Tukey HSD, first across tolerances, then across neighbor counts
    mct_results = MultiComparison(df['REV'], df['TOL']).tukeyhsd()
    print(mct_results)

    mcn_results = MultiComparison(df['REV'], df['NNN']).tukeyhsd()
    print(mcn_results)
Beispiel #2
0
def tukeyhsd(statistics_table: pandas.DataFrame,
             column: str) -> Dict[str, TukeyHSDResults]:
    """
    Run Tukey HSD multiple-comparison tests on a statistics table.

    A test is run for each of the grouping columns ``plate`` and ``strain``
    (plus ``condition`` when the table holds more than one condition),
    skipping any grouping column with two or fewer distinct values. A final
    test is always run on the combined ``condition``-``strain`` label.

    Note: the combined label is stored in the passed table as a new
    ``condition:strain`` column, i.e. the input is modified in place.

    Parameters
    ----------
    statistics_table: Table with ``plate``, ``strain`` and ``condition``
        columns in addition to the value column.
    column: The column with the relevant values. Should be identical to the
        `y` variable used when generating figures.

    Returns
    -------
    Mapping from grouping name (``plate``, ``strain``, ``condition``,
    ``condition_strain``) to the corresponding Tukey HSD result.
    """
    grouping_columns = ['plate', 'strain']
    if statistics_table['condition'].nunique() != 1:
        grouping_columns.append('condition')

    tukey_results = {}
    for grouping in grouping_columns:
        logger.debug(f"tukey subject: '{grouping}'")
        logger.debug(
            f"tukey subject values: {statistics_table[grouping].unique()}")
        # The per-column comparison is skipped unless there are more than
        # two categories.
        if statistics_table[grouping].nunique() <= 2:
            continue
        comparison = MultiComparison(statistics_table[column],
                                     statistics_table[grouping])
        tukey_results[grouping] = comparison.tukeyhsd()

    combined_label = (statistics_table['condition'] + "-" +
                      statistics_table['strain'])
    statistics_table['condition:strain'] = combined_label
    combined = MultiComparison(statistics_table[column],
                               statistics_table['condition:strain'])
    tukey_results['condition_strain'] = combined.tukeyhsd()

    return tukey_results
def analyzeData(results2):
    """Print repeated-measures ANOVA and pairwise tests per measure.

    For each of the measures ``Accuracy`` and ``Reaction Time`` this prints
    the measure name, a repeated-measures ANOVA (subject ``Subject``, within
    factor ``Condition``, cells aggregated by mean) and the summary table of
    Bonferroni-corrected paired t-tests between conditions.

    :param results2: table with ``Subject``, ``Condition``, ``Accuracy`` and
        ``Reaction Time`` columns.
    """
    for measure in ('Accuracy', 'Reaction Time'):
        print(measure)
        anova = AnovaRM(data=results2,
                        depvar=measure,
                        subject='Subject',
                        within=['Condition'],
                        aggregate_func='mean')
        print(anova.fit())

        pairwise = MultiComparison(results2[measure], results2['Condition'])
        outcome = pairwise.allpairtest(sci.ttest_rel, method='bonf')
        # First element of the returned tuple is the printable summary table
        print(outcome[0])
Beispiel #4
0
    def ANOVA_TimePoints(self, combine_sexes=True):
        """
        Fit a one-way OLS model and run Tukey HSD for every time point.

        Hardcoded to use ``Count`` as the dependent variable and
        ``Treatment`` as the grouping identifier, because both names are
        embedded in the formula string. This could be lifted later by
        building the formula from parameters.

        Parameters
        ----------
        combine_sexes : bool
            If truthy, every time point in ``self.pr_columns`` is analysed
            once over all treatments. Otherwise every time point is analysed
            for the treatments in ``self.male_treatment_labels`` first and
            then again for those in ``self.female_treatment_labels``.

        Returns
        -------
        tuple of (list, list)
            The fitted OLS results and the Tukey HSD results, both in the
            order the analyses were run. Both modes return this tuple.
        """
        # ``None`` stands for "no treatment filter"
        if combine_sexes:
            label_sets = [None]
        else:
            label_sets = [
                self.male_treatment_labels, self.female_treatment_labels
            ]

        summary_to_return = []
        mc_results_to_return = []
        for labels in label_sets:
            for timepoint in self.pr_columns:
                print("Time point: " + timepoint)
                mask = self.df_dropna['Time Point'] == timepoint
                if labels is not None:
                    mask = mask & self.df_dropna['Treatment'].isin(labels)
                df_timepoint = self.df_dropna.loc[mask]

                results = ols('Count ~ C(Treatment)', data=df_timepoint).fit()
                print(results.summary())
                mc = MultiComparison(df_timepoint['Count'],
                                     df_timepoint['Treatment'])
                mc_results = mc.tukeyhsd()
                print(mc_results)
                summary_to_return.append(results)
                mc_results_to_return.append(mc_results)

        return summary_to_return, mc_results_to_return
Beispiel #5
0
    def test_incorrect_output(self):
        """Check MultiComparison input validation and ``group_order`` subsetting."""
        # too few group labels, too many group labels, just one group
        for bad_groups in ([1, 2] * 4, [1, 2] * 6, [1] * 10):
            assert_raises(ValueError, MultiComparison, np.array([1] * 10),
                          bad_groups)

        # group_order doesn't select all observations, only one group left
        with warnings.catch_warnings(record=True):
            warnings.simplefilter('always')
            assert_raises(ValueError,
                          MultiComparison,
                          np.array([1] * 10), [1, 2] * 5,
                          group_order=[1])

        # group_order doesn't select all observations,
        # we do tukey_hsd with reduced set of observations
        data = np.arange(15)
        groups = np.repeat([1, 2, 3], 5)
        with warnings.catch_warnings(record=True) as caught:
            warnings.simplefilter('always')
            mod1 = MultiComparison(np.array(data), groups, group_order=[1, 2])
            assert_equal(len(caught), 1)
            assert issubclass(caught[0].category, UserWarning)

        res1 = mod1.tukeyhsd(alpha=0.01)
        # Reference: the same analysis on the explicitly truncated input
        mod2 = MultiComparison(np.array(data[:10]), groups[:10])
        res2 = mod2.tukeyhsd(alpha=0.01)

        result_attributes = [
            'confint', 'data', 'df_total', 'groups', 'groupsunique',
            'meandiffs', 'q_crit', 'reject', 'reject2', 'std_pairs', 'variance'
        ]
        model_attributes = [
            'data', 'datali', 'groupintlab', 'groups', 'groupsunique',
            'ngroups', 'nobs', 'pairindices'
        ]
        # Results first, then the model objects themselves
        for actual, expected, names in ((res1, res2, result_attributes),
                                        (mod1, mod2, model_attributes)):
            for att in names:
                assert_allclose(getattr(actual, att),
                                getattr(expected, att),
                                rtol=1e-14,
                                err_msg=att + 'failed')
Beispiel #6
0
def tukey(structure, alpha, valutation):
    """Run Tukey HSD for one metric, save the plot and the text report.

    The data and group labels come from ``make_datagroup(structure,
    valutation)``. The simultaneous confidence-interval plot is saved as
    ``Tukey_HSD_test_<suffix>.png`` and the textual result as
    ``tukey_HSD_<suffix>.txt`` under ``path + "results/run/plot/"``; the
    result is also printed.

    :param structure: forwarded to ``make_datagroup``.
    :param alpha: significance level passed to ``tukeyhsd``.
    :param valutation: metric selector; ``'ap'`` and ``'p_10'`` are
        recognised, any other value is treated as R-precision.
    """
    # Only the axis label and the file-name suffix differ between metrics
    if valutation == 'ap':
        xlabel, suffix = "Average Precision (AP)", "ap"
    elif valutation == 'p_10':
        xlabel, suffix = "P(10)", "p10"
    else:
        xlabel, suffix = "Rprec", "rprec"

    data, group = make_datagroup(structure, valutation)
    mc = MultiComparison(data, group)
    result = mc.tukeyhsd(alpha)

    fig = result.plot_simultaneous()    # Plot group confidence intervals
    fig.set_figwidth(30)
    fig.set_figheight(20)
    axes = fig.gca()
    fig.suptitle('Tukey_HSD test', fontsize=40)
    axes.set_xlabel(xlabel, fontsize=30)
    axes.tick_params(labelsize=30)
    fileplot = path + "results/run/plot/Tukey_HSD_test_" + suffix + ".png"
    fig.savefig(fileplot, dpi=300)

    # The context manager closes the report even if writing fails
    with open(path + "results/run/plot/tukey_HSD_" + suffix + ".txt",
              "w") as fw:
        fw.write(str(result))
    print(result)
Beispiel #7
0
    def tukey_test(self):
        """Run Tukey's HSD on the dataframe, grouped by rows and by columns.

        Tukey is a multi-comparison method; see
        https://en.wikipedia.org/wiki/Tukey%27s_range_test .
        Be sure you are working with a normal distribution.

        Each summary is printed and also written to ``self.results``.

        :return: tuple ``(row_summary, column_summary)``.
        """

        def report(header, values, labels):
            # Run one comparison, then echo it to stdout and the results sink
            summary = MultiComparison(values, labels).tukeyhsd().summary()
            text = header + str(summary) + "\n"
            print(text)
            self.results.write(text)
            return summary

        by_rows = report("\nTukey test for rows\n", self.df.values,
                         self.df.index)
        by_columns = report("\nTukey test for columns\n", self.df.T.values,
                            self.df.columns)
        return (by_rows, by_columns)
def tukey_multi_metrics(gather_df,
                        col_indices=list(range(10)) + list(range(15, 20)) +
                        list(range(25, 65)) + list(range(145, 149)) +
                        list(range(150, 180)),
                        alpha=0.05):
    """Run a Tukey HSD comparison between models for each selected metric.

    Parameters
    ----------
    gather_df : DataFrame
        Frame whose index has ``model``, ``set`` and ``fold`` levels
        (``model`` being the first level) and whose columns are metrics.
        Only the ``test_metrics`` set is used; the ``Folds Mean`` and
        ``Folds Median`` summary rows are dropped.
    col_indices : list of int
        Positional indices of the metric columns to analyse.
    alpha : float
        Significance level passed to ``tukeyhsd``.

    Returns
    -------
    dict
        Metric name -> Tukey HSD result comparing the models on that metric.
    """
    metric_names = list(gather_df.columns.values[col_indices])
    model_names = list(gather_df.index.levels[0])

    # drop fold means and medians
    gather_df = gather_df[metric_names]
    gather_df = gather_df.xs('test_metrics', level='set')
    gather_df = gather_df.drop('Folds Mean', level='fold')
    gather_df = gather_df.drop('Folds Median', level='fold')

    # One group label per remaining fold row, in model order
    model_names_rep = []
    for m in model_names:
        k = gather_df.xs(m, level='model').shape[0]
        model_names_rep.extend([m] * k)

    tukey_dict = {}
    for metric in metric_names:
        # sort_index() returns a new Series, so the column of gather_df is
        # never modified through a slice; .loc then orders the rows like
        # model_names_rep.
        m_df = gather_df[metric].sort_index().loc[model_names]

        # ``.values`` replaces ``as_matrix()``, which pandas 1.0 removed
        m_df_mat = np.around(m_df.values, decimals=4)
        mc_obj = MultiComparison(m_df_mat, model_names_rep)
        tukey_dict[metric] = mc_obj.tukeyhsd(alpha=alpha)

    return tukey_dict
def anova_all( type, file_output, *datas ):
    """Append a one-way ANOVA p-value (and Tukey HSD if significant) to a file.

    All frames in ``datas`` are stacked and a one-way ANOVA of ``result`` is
    run across the strategies ``om``, ``opr`` and ``ompr``. The line
    ``<type>:<pvalue>`` is appended to ``file_output``. When the p-value is
    below 0.05, the rounded p-value and a Tukey HSD table over every strategy
    present in the data are appended as well.

    :param type: label written in front of the p-value.
    :param file_output: path of the report file, opened in append mode.
    :param datas: frames with ``strategy`` and ``result`` columns.
    """
    # Single concat instead of growing a frame inside a loop
    together = pd.concat( list( datas ) ) if datas else pd.DataFrame()

    # Grouping by a plain column name keeps scalar keys valid in get_group
    groups = together.groupby( 'strategy' )

    # Strategies "m", "pr", "mpr" and "ols" are deliberately left out here
    _, pvalue = stats.f_oneway(
        groups.get_group( "om" )["result"],
        groups.get_group( "opr" )["result"],
        groups.get_group( "ompr" )["result"]
    )

    # The context manager guarantees the report is flushed and closed
    with open( file_output, "a" ) as f:
        f.write( type + ":" + str( pvalue ) + "\n" )

        if pvalue < 0.05:
            mc = MultiComparison(
                together["result"],
                together["strategy"]
            )
            mc_results = mc.tukeyhsd()
            print( str( round( pvalue, 4 ) ), file=f )
            print( mc_results, file=f )
Beispiel #10
0
def get_tukey(exp, df_all, measure):
    """Run a Tukey HSD post-hoc test across strains and save the p-values.

    The test is only run when ``df_all`` holds at least three strains;
    otherwise an empty table is produced. Rows whose ``measure`` value is not
    finite are excluded. The table is written through ``pfiles.save_csv``
    into ``exp.dir_tukey`` and also returned.

    Background:
    https://jpktd.blogspot.com/2013/03/multiple-comparison-and-tukey-hsd-or_25.html
    https://code.google.com/archive/p/qsturng-py/
    https://stackoverflow.com/questions/48200699/how-can-i-get-p-values-of-each-group-comparison-when-applying-the-tukey-s-hones

    :param exp: experiment object providing ``name`` and ``dir_tukey``.
    :param df_all: table with a ``strain`` column and the ``measure`` column.
    :param measure: name of the column holding the values to compare.
    :return: DataFrame with ``group1``, ``group2`` and ``p_value`` columns,
        one row per pair of strains.
    """
    if len(df_all.groupby('strain').count()) >= 3:
        df_tukey = df_all[np.isfinite(df_all[measure])]
        mc = MultiComparison(df_tukey[measure], df_tukey['strain'])
        result = mc.tukeyhsd()
        # p-values from the studentized range distribution, one per pair
        p = np.atleast_1d(
            psturng(np.abs(result.meandiffs / result.std_pairs),
                    len(result.groupsunique), result.df_total))
        # pairindices lists every (group1, group2) combination in the same
        # order as meandiffs, so all pairs are reported instead of only the
        # first three rows of the printed table.
        labels = np.asarray(mc.groupsunique)
        first_idx, second_idx = mc.pairindices
        df_pairs = pd.DataFrame({
            'group1': labels[first_idx],
            'group2': labels[second_idx],
            'p_value': np.around(p, 4),
        })
    else:
        df_pairs = pd.DataFrame({'group1': [], 'group2': [], 'p_value': []})

    # File name kept exactly as before (including the doubled underscore)
    file_out = exp.name + '_coupling_' + measure + '_' + '_tukey_' + '.csv'
    pfiles.save_csv(df_pairs, file_out, exp.dir_tukey, False)
    return df_pairs
def scipy_anova_post_hoc_tests(df=None,
                               flight_status_col='flight status new',
                               sig_test=stats.f_oneway):
    """
    Per aberration type, test pre/mid/post-flight counts and run post-hocs.

    df should be melted by aberration type, with ``aberration type`` and
    ``count per cell`` columns plus the flight-status column. For every
    aberration type, ``sig_test`` is called with the Pre-Flight, Mid-Flight
    and Post-Flight counts and the p-value is printed. When the p-value is
    at most 0.05, a Tukey HSD table and its pairwise p-values are printed.
    """
    phases = ('Pre-Flight', 'Mid-Flight', 'Post-Flight')

    for aberr in list(df['aberration type'].unique()):
        # Restrict to this aberration once, then split by flight status
        subset = df[df['aberration type'] == aberr]
        counts = subset['count per cell']
        status = subset[flight_status_col]
        samples = [counts[status == phase] for phase in phases]

        statistic, p_value = sig_test(*samples)
        print(aberr, p_value)

        # if the omnibus test detects a difference, perform post-hoc tests
        if p_value <= 0.05:
            res = MultiComparison(counts, status).tukeyhsd()
            print(res)
            print(
                f'pvalues: {list(psturng(np.abs(res.meandiffs / res.std_pairs), len(res.groupsunique), res.df_total))}'
            )
            print('\n')
Beispiel #12
0
def get_tukey(df_groups, measure, groups):
    """Run a Tukey HSD post-hoc test across strains and annotate significance.

    The data is taken from ``df_groups[3]``; rows whose ``measure`` value is
    not finite are excluded.

    Background:
    https://jpktd.blogspot.com/2013/03/multiple-comparison-and-tukey-hsd-or_25.html
    https://code.google.com/archive/p/qsturng-py/
    https://stackoverflow.com/questions/48200699/how-can-i-get-p-values-of-each-group-comparison-when-applying-the-tukey-s-hones

    :param df_groups: indexable collection; element 3 must be a table with a
        ``strain`` column and the ``measure`` column.
    :param measure: name of the column holding the values to compare.
    :param groups: not used by this function.
    :return: DataFrame with ``group1``, ``group2``, ``p_value`` and
        ``significance`` columns, one row per pair of strains.
    """
    source = df_groups[3]
    df_tukey = source[np.isfinite(source[measure])]
    mc = MultiComparison(df_tukey[measure], df_tukey['strain'])
    result = mc.tukeyhsd()
    # p-values from the studentized range distribution, one per pair
    p = np.atleast_1d(
        psturng(np.abs(result.meandiffs / result.std_pairs),
                len(result.groupsunique), result.df_total))
    # pairindices enumerates every (group1, group2) combination in the same
    # order as meandiffs, so this works for any number of strains rather
    # than exactly three pairs.
    labels = np.asarray(mc.groupsunique)
    first_idx, second_idx = mc.pairindices
    df_pairs = pd.DataFrame({
        'group1': labels[first_idx],
        'group2': labels[second_idx],
        'p_value': np.around(p, 4),
    })

    df_pairs['significance'] = [
        get_stars(p_value) for p_value in df_pairs['p_value']
    ]

    return df_pairs
def posthoc_turron_by_gender(melted, variable):
    """Print Tukey HSD results for one variable across turron/gender groups.

    :param melted: long-format table with ``variable``, ``value``,
        ``turron`` and ``gender`` columns; it is copied, not modified.
    :param variable: the ``variable`` entry whose values are compared.
    """
    df = melted.copy()
    is_selected = df['variable'] == variable
    df = df[is_selected]
    # Each turron/gender combination forms one comparison group
    df['turron:gender'] = df['turron'] + "_" + df['gender']
    comparison = MultiComparison(df['value'], df['turron:gender'])
    print(comparison.tukeyhsd())
Beispiel #14
0
 def calculate_test(self):
     """Run Holm-corrected paired t-tests on the dataframe, by rows and by columns.

     Holm-Bonferroni is a multi-comparison method; see
     https://en.wikipedia.org/wiki/Holm%E2%80%93Bonferroni_method .
     Be sure you are working with a normal distribution.

     Each outcome is printed and also written to ``self.results``.

     :return: tuple ``(row_outcome, column_outcome)`` holding the values
         returned by ``allpairtest``.
     """

     def report(header, values, labels):
         # Run one set of pairwise tests, echo it to stdout and the results sink
         outcome = MultiComparison(values, labels).allpairtest(
             stats.ttest_rel, method='Holm')
         text = header + str(outcome) + "\n"
         print(text)
         self.results.write(text)
         return outcome

     by_rows = report("\nHolm-Bonferroni test for rows\n", self.data.values,
                      self.data.index)
     by_columns = report("\nHolm-Bonferroni test for columns\n",
                         self.data.T.values, self.data.columns)

     return (by_rows, by_columns)
def tukey_hsd(data, groups, metric):
    """Tukey HSD post-hoc comparison from statsmodels, returned as a dataframe.

    The Tukey HSD post-hoc comparison test controls for type I error and maintains the familywise error rate at 0.05.
    The group1 and group2 columns are the groups being compared,
    meandiff column is the difference in means of the two groups being calculated as group2 – group1,
    lower/upper columns are the lower/upper boundaries of the 95% confidence interval,
    reject column states whether or not the null hypothesis should be rejected.

    Args:
        data (Dataframe): Dataframe holding the group labels and the result values.
        groups (string): Name of the column whose values define the groups to compare i.e. label_type.
        metric (string): Name of the column holding the result values i.e. values/times.

    Returns:
        tukey_frame (Dataframe): One row per compared pair of groups, with the
            'p-adj' column renamed to 'p-value'.

            group1   group2  meandiff  p-value   lower   upper  reject
        0       ap  ap type   -0.1167   0.8059 -0.5825  0.3492   False
        1       ap       da    0.1000   0.8543 -0.3659  0.5659   False
        2  ap type       da    0.2167   0.5158 -0.2492  0.6825   False
    """
    comparison = MultiComparison(data=data[metric], groups=data[groups])
    tukey_results = comparison.tukeyhsd()

    # First row of the results table is the header, the rest are the pairs
    header, *rows = tukey_results._results_table.data
    tukey_frame = pd.DataFrame(data=rows, columns=header)

    return tukey_frame.rename(columns={'p-adj': 'p-value'})
Beispiel #16
0
    def print_hpcontrast(self, data: list, labels: list, alpha: float = 0.05):
        """Contrast the hypotheses of the given scores and print the results in the report.

        A Kruskal-Wallis test is run over all score samples. If its p-value
        is at most ``alpha``, a Tukey HSD comparison between the models is
        appended. The text is emitted through ``self.print_noformat``.

        :param data: List containing the metric results of the models, one
            sequence of scores per model (lengths may differ)
        :type data: list
        :param labels: List containing the models tags, one per entry of
            ``data``
        :type labels: list
        :param alpha: Significance threshold of the tests, defaults to 0.05
        :type alpha: float, optional
        """
        _, pVal = stats.kruskal(*data)
        str_toprint = f"p-valor KrusW:{pVal}\n"
        if pVal <= alpha:
            str_toprint += (
                "Hypotheses are being rejected: the models are different\n")
            # Flatten per model and repeat each tag by its own sample size,
            # so models with different numbers of scores are supported.
            samples = [np.asarray(scores).ravel() for scores in data]
            stacked_data = np.concatenate(samples)
            # str() lets non-string tags (e.g. ints) be used as labels
            stacked_model = np.repeat(["model" + str(tag) for tag in labels],
                                      [sample.size for sample in samples])
            multi_comp = MultiComparison(stacked_data, stacked_model)
            comp = multi_comp.tukeyhsd(alpha=alpha)
            str_toprint += str(comp)
        else:
            str_toprint += (
                "Hypotheses are being accepted: the models are equal")
        self.print_noformat(str_toprint)
Beispiel #17
0
def pairwise_ttest(val_vec, cnf):
    """Print Bonferroni-corrected pairwise t-tests between subclusters.

    Also prints per-subcluster descriptive statistics. Note that this sets
    the global pandas float display format to three decimals.

    :param val_vec: mapping subcluster -> mapping configuration -> scores.
    :param cnf: configuration key whose scores are compared.
    """
    cluster = []
    score = []
    for subc, dic_conf in val_vec.items():
        values = dic_conf[cnf]
        # One label per score so both columns stay aligned
        cluster.extend([str(subc)] * len(values))
        score.extend(values)
    df = pd.DataFrame({'subcluster': cluster, 'score': score})

    # allpairtest replaces an earlier hand-written loop over all pairs
    # (Welch t-test per pair followed by a Bonferroni correction).
    pairwise = MultiComparison(df['score'], df['subcluster'])
    print(pairwise.allpairtest(ttest_ind, method='bonf')[0])

    pd.options.display.float_format = '{:.3f}'.format
    print(df.groupby(['subcluster']).describe())
def anova_analysis(df):
    """
    Run a one-way ANOVA of weekly seats sold across holiday periods.

    Seats sold are summed per (week_ending, Holiday); the four periods
    ThanksGiving, WinterBreak, SummerBreak and Not Holiday are compared.
    Prints the F statistic and p-value, the per-period means and a Tukey HSD
    table, and saves a box plot to ``ANOVA.png``.
    """
    time_periods = df.groupby(['week_ending', 'Holiday'],
                              as_index=False)[['seats_sold']].sum()

    periods = ('ThanksGiving', 'WinterBreak', 'SummerBreak', 'Not Holiday')
    samples = [
        time_periods.loc[time_periods['Holiday'] == period, 'seats_sold']
        for period in periods
    ]
    f, p = stats.f_oneway(*samples)
    print('The f and p of ANOVA analysis are:')
    print(f, p)

    ## plot the distribution of each group
    time_periods.boxplot('seats_sold', by='Holiday', figsize=(12, 8))
    plt.savefig('ANOVA.png')

    print("The mean seats sold of each time periods:")
    print(time_periods.groupby('Holiday')['seats_sold'].mean())

    pairwise = MultiComparison(time_periods['seats_sold'],
                               time_periods['Holiday'])
    result = pairwise.tukeyhsd()
    print(pairwise)
    print(result)
Beispiel #19
0
def get_significance_booleans(data):
    '''
    Perform multiple comparisons (pairwise t-tests with Holm correction).

    Parameters
    ----------
    data: Series
        must have a single level index containing the group labels (id).
        values are the results to be compared

    Returns
    -------
    booleans: Series
         boolean values indicating significance between the groups,
         indexed by (group1, group2).
    '''
    group_labels = data.index.values
    observations = data.values

    comparison = MultiComparison(observations, group_labels)
    # Third element of the allpairtest result is the per-pair record array
    pair_records = comparison.allpairtest(ttest_ind, method='holm')[2]

    return DataFrame(pair_records).set_index(['group1', 'group2'])['reject']
Beispiel #20
0
def create_df_with_all_post_hoc(df, hue='value'):
    """Collect the Tukey HSD post-hoc results of every measure in one table.

    For each measure a one-way model ``<measure> ~ C(level)`` is fitted on
    the data returned by ``get_data_for_anova``. If the model's F-test is
    significant (p < 0.05), the three rows of that measure receive the
    post-hoc table built by ``create_df_post_hoc``; otherwise they are
    filled with the string ``'anova ns'``.

    :param df: source data, forwarded to ``get_data_for_anova``.
    :param hue: forwarded to ``get_data_for_anova``.
    :return: DataFrame indexed by (measure, comparison) with the columns
        group1, group2, meandiff, lower, upper and reject.
    """
    measures = [
        "start_cm", "rel_pt", "amp_max_cop", 'amp_max_pel', 'amp_max_c7',
        "vel_max_cop", 'vel_max_pel', 'vel_max_c7', "overshoot", "dcm", "dtml",
        "rcm"
    ]
    comparisons = ["comparaison 0", "comparaison 1", "comparaison 2"]
    columns = ['group1', 'group2', 'meandiff', 'lower', 'upper', 'reject']

    # Start from an all-NaN frame with one row per (measure, comparison)
    full_df = pd.DataFrame(np.nan,
                           index=pd.MultiIndex.from_product(
                               [measures, comparisons]),
                           columns=columns)

    for v in measures:
        measure_data = get_data_for_anova(df, v, hue)
        results = ols("{} ~ C(level)".format(v), data=measure_data).fit()
        post_hoc = MultiComparison(measure_data[v],
                                   measure_data['level']).tukeyhsd()
        df_ph = create_df_post_hoc(post_hoc, v)

        significant = results.f_pvalue < 0.05
        full_df.loc[v] = df_ph.values if significant else 'anova ns'

    return full_df
Beispiel #21
0
 def _calc_firing_rate(self,
                       num_peaks: pd.DataFrame,
                       epoch: str = "All_cells"):
     """
     Compare the number of peaks between the columns of ``num_peaks``.

     The table is stacked into one long series, a Tukey HSD test is run with
     the original column labels as groups, and the result and its pairwise
     p-values are printed. The per-column mean is printed in every case,
     including when the test fails.

     :param num_peaks: table of peak counts; presumably one row per cell and
         one column per epoch - TODO confirm against the caller.
     :param epoch: label used in the printed p-value header.
     :return: None
     """
     split_data = num_peaks.stack()
     mc = MultiComparison(split_data.values,
                          split_data.index.get_level_values(1).values)
     try:
         res = mc.tukeyhsd()
     except ValueError:
         aprint("<yellow>Failed during the p-value calculation.</yellow>")
     else:
         print(res)
         print(
             f"P-values ({epoch}, number of cells: {split_data.shape[0] // 3}):",
             psturng(
                 np.abs(res.meandiffs / res.std_pairs),
                 len(res.groupsunique),
                 res.df_total,
             ),
         )
     finally:
         # groupby(level=1) replaces Series.mean(level=1), which pandas 2.0
         # removed; the printed per-column means are the same.
         print(split_data.groupby(level=1).mean())
Beispiel #22
0
    def Tukey(self, Categorical, Continuous):
        """
        Calculate Tukey Honest Significance Difference (HSD) Test, to identify the groups whose
        distributions are significantly different.

        Rows of ``self.df`` containing missing values are dropped first.

        :param Categorical: name of the column holding the group labels.
        :param Continuous: name of the column holding the values.
        :return: DataFrame with the columns 'Group 1', 'Group 2',
            'Mean Difference' (rounded to 3 decimals) and 'Reject', one row
            per pair of groups.
        """
        temp_df = self.df.dropna()
        start = time.time()

        mc = MultiComparison(temp_df[Continuous], temp_df[Categorical])
        result = mc.tukeyhsd()

        # pairindices holds the positions of both groups of every pair
        labels = mc.groupsunique
        first_idx, second_idx = mc.pairindices[0], mc.pairindices[1]
        table = np.column_stack((
            [labels[position] for position in first_idx],
            [labels[position] for position in second_idx],
            [round(float(diff), 3) for diff in result.meandiffs],
            result.reject,
        ))
        TukeyResult = pd.DataFrame(
            table,
            columns=['Group 1', "Group 2", "Mean Difference", "Reject"])

        end = time.time()
        if self.debug == 'YES':
            print('Tukey', end - start)
        return TukeyResult
def preform_anova(df_anova, post_hoc=False):
    """Run a type-2 ANOVA of ``err`` on u, m, n, h, q and prob.

    The ANOVA table is extended with an ``r_sq`` column holding each term's
    effect size SS_term / (SS_term + SS_residual), printed, and written to
    ``analysis/anova_results.csv``.

    :param df_anova: table with the columns err, u, m, n, h, q and prob.
    :param post_hoc: if true, a Tukey HSD comparison of ``err`` across the
        levels of every significant term (p < 0.05) is printed as well.
    """
    # Alternative formulas that treat the factors as categorical:
    #   'err ~ C(u) + C(m) + C(n) + C(h) + C(q) + C(prob)'   (no interaction)
    #   'err ~ C(u) * C(m) * C(n) * C(h) * C(q) * C(prob)'   (all interactions)
    formula = 'err ~ u + m + n + h + q + prob'

    model = ols(formula, data=df_anova).fit()
    aov_table = sm.stats.anova_lm(model,
                                  typ=2)  # Type 2 is for ANOVA DataFrame

    ### effect size per term; the last row of the table is the residual.
    # .iloc makes the positional access explicit (the index holds labels).
    sum_sq = aov_table['sum_sq']
    effect_ss = sum_sq.iloc[:-1]
    residual_ss = sum_sq.iloc[-1]
    # Assignment aligns on the index, so the residual row gets NaN
    aov_table.loc[:, 'r_sq'] = effect_ss / (effect_ss + residual_ss)

    print(aov_table)

    if post_hoc:
        for name, row in aov_table.iterrows():
            # The residual row has a NaN p-value and is skipped here
            if row['PR(>F)'] < 5e-2:
                # MultiComparison takes (data, groups): compare the error
                # values between the levels of the significant factor.
                mc = MultiComparison(df_anova['err'], df_anova[name])
                mc_results = mc.tukeyhsd()
                print('post_hoc')
                print(mc_results)
    aov_table.to_csv('analysis/anova_results.csv')
def calc_Tukey(f):
    '''
    Run a Tukey HSD test on one feature of the module-level ``data`` object.

    f= name of feature; ``data.grouped_features[f]`` must provide a
    ``value`` column (the observations) and a ``KD`` column (the groups).

    Returns the Tukey HSD result.
    '''
    feature_table = data.grouped_features[f]
    comparison = MultiComparison(feature_table['value'], feature_table['KD'])
    return comparison.tukeyhsd()
def posthoc_turron_by_first_time_tasting(melted, variable):
    """Print Tukey HSD p-values and table across turron/first-time groups.

    :param melted: long-format table with ``variable``, ``value``,
        ``turron`` and ``first_time_tasting`` columns; it is copied, not
        modified. Rows with missing values are dropped.
    :param variable: the ``variable`` entry whose values are compared.
    """
    df = melted.copy()
    is_selected = df['variable'] == variable
    df = df[is_selected].dropna()
    # Each turron/first-time-tasting combination forms one comparison group
    df['turron:first_time_tasting'] = (df['turron'] + "_" +
                                       df['first_time_tasting'])
    res = MultiComparison(df['value'],
                          df['turron:first_time_tasting']).tukeyhsd()

    from statsmodels.stats.libqsturng import psturng
    # p-values from the studentized range distribution, one per pair
    studentized = np.abs(res.meandiffs / res.std_pairs)
    print(psturng(studentized, len(res.groupsunique), res.df_total))
    print(res)
Beispiel #26
0
def tukey(prediction: 'array of ints', model: 'array of string'):
    """
        Perform a Tukey HSD test and print its result.

        input: array(int(predictions)), array(str(model_names))
        The test says whether there are significant differences between
        the classes.

        Returns the Tukey HSD result.
    """
    hsd_result = MultiComparison(prediction, model).tukeyhsd()
    print(hsd_result)
    return hsd_result
Beispiel #27
0
def perform_post_hoc_tukey(data, factor1, factor2, factor3, factor2_idx):
    """Print a Tukey HSD test of ``factor1`` across ``factor3`` per level of ``factor2``.

    Levels of ``factor2`` are assumed to be coded 1..len(factor2_idx).

    :param data: table holding the three factor columns.
    :param factor1: name of the column with the values to compare.
    :param factor2: name of the column used to split the data into levels.
    :param factor3: name of the column with the group labels.
    :param factor2_idx: sized collection; only its length is used, as the
        number of ``factor2`` levels.
    """
    for level in range(1, len(factor2_idx) + 1):
        print(factor2, ': ', level)
        # Values and group labels are taken from the same rows of ``data``
        # rather than from a module-level table.
        subset = data[data[factor2] == level]
        mc = MultiComparison(subset[factor1], subset[factor3])
        result = mc.tukeyhsd()

        print(result)
        print(mc.groupsunique)
 def tukey_hsd(self, stacked_df, colname):
     '''
     Tukey HSD across states, for use after a significant ANOVA.

     input:
     stacked_df: from table_transform, a stacked df with a 'state' column
     colname: string, the column holding the values to compare
     return:
     summary table of the Tukey HSD result
     '''
     values = stacked_df[colname]
     states = stacked_df['state']
     return MultiComparison(values, states).tukeyhsd().summary()
Beispiel #29
0
 def get_multiplecomparisons(self, dataframe, test):
     """Print Bonferroni-corrected pairwise comparisons between the columns.

     Rows with missing values are dropped and the cleaned frame is printed.
     Every column is then treated as one group ('Bin').

     :param dataframe: wide table, one column per bin.
     :param test: 'ttest' selects the paired t-test; any other value selects
         the Wilcoxon test.
     """
     # If distributions are different then do multiple comparisons
     dataframe = dataframe.dropna()
     print(dataframe)
     cleanbin = dataframe.melt(var_name='Bin', value_name='Value')
     MultiComp = MultiComparison(cleanbin['Value'], cleanbin['Bin'])
     paired_test = (scipy.stats.ttest_rel
                    if test == 'ttest' else scipy.stats.wilcoxon)
     comp = MultiComp.allpairtest(paired_test, method='Bonf')
     print(comp[0])
def tukey(data, names):
    """Print a Tukey HSD test over flattened samples and the group labels.

    :param data: iterable of iterables of values; it is flattened in order.
    :param names: one group label per flattened value.
    """
    names = np.array(names)
    flattened = np.array([value for sample in data for value in sample])
    mc = MultiComparison(flattened, names)
    print(mc.tukeyhsd())
    print(mc.groupsunique)