Example #1
def types_diff(data):
    """
    Statistical significance testing with Bonferroni correction.
    Arguments:
    ---------
    data: neuro_data + clinical_data
    Returns:
    -------
    pandas DataFrame with the features that pass the Bonferroni-corrected test
    """
    conn_stat = pd.DataFrame(columns=['ROI', 'pvalue'])
    feats = data.iloc[:, :-1].columns.to_list()
    for connections in feats:
        stat, p = shapiro(data[connections])
        alpha = 0.05
        sample1 = data.loc[data["controls_ms"] == 0, connections]
        sample2 = data.loc[data["controls_ms"] == 1, connections]
        if p > alpha:
            stat, p = bartlett(sample1, sample2)
            homovar = True
            if (p <= 0.05):
                homovar = False
            stat, p = ttest_ind(sample1, sample2, equal_var=homovar)
        else:
            stat, p = mannwhitneyu(data.loc[data["controls_ms"] == 0,
                                            connections],
                                   data.loc[data["controls_ms"] == 1,
                                            connections],
                                   alternative='two-sided')

        if p <= 0.05:
            # DataFrame.append was removed in pandas 2.0; concatenate a one-row frame instead.
            conn_stat = pd.concat(
                [conn_stat, pd.DataFrame([{'ROI': connections, 'pvalue': p}])],
                ignore_index=True)

    print('Statistically significant differences in %d connections' % len(conn_stat))

    diff = conn_stat.copy()

    p_corr = multipletests(diff["pvalue"],
                           alpha=0.05,
                           method="bonferroni",
                           is_sorted=False)
    diff["p_corr"] = p_corr[1]  #Added the bonferroni correction
    diff_fdr = diff[diff["p_corr"] < 0.05]  #FDR correction with lowest pvalue
    print('Statistically diferences in %d of connections with FDR' %
          len(diff_fdr))
    diff_fa = diff_fdr["ROI"].tolist()
    fa_clinic = data.loc[:, ["age", "sex", "dd", "edss", "controls_ms"]]
    # Adding index fa_har
    # fa_har = data.set_index(data.index)  # why???
    fa_har_bonferroni = data[diff_fa]
    fa_har_corr = pd.merge(fa_har_bonferroni,
                           fa_clinic,
                           left_index=True,
                           right_index=True)

    return fa_har_corr
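
A minimal usage sketch (not part of the original example), assuming the column layout the function expects: feature columns first, then the clinical columns (age, sex, dd, edss) and the controls_ms group label last, with pandas, the scipy.stats functions (shapiro, bartlett, ttest_ind, mannwhitneyu) and statsmodels' multipletests already imported as above.

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
n = 60
grp = rng.integers(0, 2, n)                           # 0 = controls, 1 = patients
demo = pd.DataFrame({
    'roi_1': rng.normal(0.50, 0.05, n) + 0.10 * grp,  # feature with a true group difference
    'roi_2': rng.normal(0.40, 0.05, n),               # feature with no group difference
    'age': rng.normal(40.0, 10.0, n),
    'sex': rng.integers(0, 2, n).astype(float),
    'dd': rng.normal(5.0, 2.0, n),
    'edss': rng.normal(2.0, 1.0, n),
    'controls_ms': grp,
})
result = types_diff(demo)
print(result.head())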
Example #2
def anova_by_group(data_df, resp_var, group_var):
    """One way anova."""
    model = ols(resp_var + ' ~ ' + group_var, data=data_df).fit()

    anova_df = sm.stats.anova_lm(model, typ=2)
    anova_df['mean_sq'] = anova_df['sum_sq'] / anova_df['df']

    args = []
    describe_df = pd.DataFrame()

    for group in data_df[group_var].unique():
        grouped_data_df = data_df.loc[data_df[group_var] == group]
        group_describe_df = grouped_data_df.describe().T.rename(
            {resp_var: group})
        describe_df = pd.concat([
            describe_df,
            group_describe_df.loc[group_describe_df.index == group]
        ])

        args.append(grouped_data_df[resp_var])

    markdown('#### Groups description')
    display(describe_df)

    markdown('#### ANOVA')
    display(anova_df[['sum_sq', 'df', 'mean_sq', 'F',
                      'PR(>F)']].replace({np.NaN: ''}))

    markdown('#### Bartlett\'s test of same variance')
    display(stats.bartlett(*args))
Example #3
def tTestEqlVar(tValue1, tValue2, tTitle, rConclusion, frConclusion, yLabel,
                figName):
    # Bartlett's test for equal variances
    alpha = .05
    tvar, p_valvar = stats.bartlett(tValue1, tValue2)

    print(tTitle)
    print(
        f"The Bartlett test statistic is {round(tvar,3)} and the p-value is {round(p_valvar,4)}"
    )
    if p_valvar < alpha:
        print(rConclusion)
        tEqVar = False
        ttype = 'Welch (unequal variances) Two-Sample t test'
    else:
        print(frConclusion)
        tEqVar = True
        ttype = 'Two-Sample t test (assuming equal variances)'

    # Create the boxplot
    y = [tValue1, tValue2]
    plt.boxplot(y)
    plt.title(f't: {round(tvar,3)}, p-val: {round(p_valvar,4)}', size=10)
    plt.suptitle(ttype, size=10)
    plt.xticks(range(1, 3), [
        f"4 Bed rooms: {round(tValue1.mean(),2)}",
        f"5 Bed rooms: {round(tValue2.mean(),2)}"
    ])
    plt.ylabel(yLabel)
    plt.savefig(figName, bbox_inches='tight')
    plt.show()
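
A usage sketch with synthetic price data (not from the source); it assumes pandas, scipy.stats and matplotlib.pyplot are imported as in the example (pd, stats, plt).

import numpy as np
import pandas as pd

rng = np.random.default_rng(1)
price_4bed = pd.Series(rng.normal(400_000, 50_000, 100))
price_5bed = pd.Series(rng.normal(450_000, 80_000, 100))

tTestEqlVar(price_4bed, price_5bed,
            "Sale price: 4-bedroom vs 5-bedroom homes",
            "Reject H0: the variances differ, use Welch's t test",
            "Fail to reject H0: assume equal variances",
            "Sale price", "price_variance_boxplot.png")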
Example #4
    def _homogeneity_tests(self):
        df = self.__df
        homogeneityTests = pd.DataFrame(
            {
                "Test Statistic": [
                    stats.levene(df.iloc[:, 0], df.iloc[:, 1])[0],
                    stats.bartlett(df.iloc[:, 0], df.iloc[:, 1])[0]
                ],
                "P-value": [
                    stats.levene(df.iloc[:, 0], df.iloc[:, 1])[1],
                    stats.bartlett(df.iloc[:, 0], df.iloc[:, 1])[1]
                ]
            },
            index=["Levene", "Bartlett"])

        return round(homogeneityTests, 3)
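
A small refactor sketch (not from the source) that runs each test once and unpacks statistic and p-value, instead of calling stats.levene and stats.bartlett twice each as above:

import pandas as pd
from scipy import stats

def homogeneity_tests(df):
    # Compute each test once on the first two columns of df.
    levene_stat, levene_p = stats.levene(df.iloc[:, 0], df.iloc[:, 1])
    bartlett_stat, bartlett_p = stats.bartlett(df.iloc[:, 0], df.iloc[:, 1])
    return pd.DataFrame(
        {"Test Statistic": [levene_stat, bartlett_stat],
         "P-value": [levene_p, bartlett_p]},
        index=["Levene", "Bartlett"]).round(3)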
Example #5
def bartlett(tamannoMuestras, poblacion):
    results = st.bartlett(muestra(poblacion, tamannoMuestras),
                          muestra(poblacion, tamannoMuestras),
                          muestra(poblacion, tamannoMuestras),
                          muestra(poblacion, tamannoMuestras))
    print("Bartlett Valor Estadistico %f" % results[0])
    print("Bartlett Valor p %f" % results[1])
def bartlett(data):
    """Description of bartlett
https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.bartlett.html
    """
    if len(data) == 3:
        statistic, pvalue = stats.bartlett(data[0], data[1], data[2])
    elif len(data) == 4:
        statistic, pvalue = stats.bartlett(data[0], data[1], data[2], data[3])
    else:
        utils.print_error("TODO bartlett: handle more than four groups")
        return False  # avoid a NameError on unsupported group counts
    print("Bartlett Statistic " + str(statistic) + " and p-value " +
          str(pvalue))
    if pvalue > 0.05:
        return True
    else:
        return False
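
Since scipy.stats.bartlett accepts any number of sample arrays, the branching on len(data) above can be avoided by unpacking the groups; a minimal sketch with a hypothetical helper name:

from scipy import stats

def bartlett_any(groups):
    # groups: iterable of 1-D sample arrays; bartlett takes them as *args.
    statistic, pvalue = stats.bartlett(*groups)
    print("Bartlett statistic " + str(statistic) + " and p-value " + str(pvalue))
    return pvalue > 0.05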
Example #7
 def test_data(self):
     args = []
     for k in range(1,11):
         args.append(eval('g%d'%k))
     T, pval = stats.bartlett(*args)
     assert_almost_equal(T,20.78587342806484,7)
     assert_almost_equal(pval,0.0136358632781,7)
def kmo_Bartlett(x):
    x = x.astype(float)
    dataset_corr = x.corr()
    list = [dataset_corr.iloc[:, i] for i in range(dataset_corr.shape[1])]
    statistic, pvalue = bartlett(*list)
    corr_inv = np.linalg.inv(dataset_corr)
    nrow_inv_corr, ncol_inv_corr = dataset_corr.shape
    A = np.ones((nrow_inv_corr, ncol_inv_corr))
    for i in range(0, nrow_inv_corr, 1):
        for j in range(i, ncol_inv_corr, 1):
            A[i, j] = -(corr_inv[i, j]) / (math.sqrt(
                corr_inv[i, i] * corr_inv[j, j]))
            A[j, i] = A[i, j]
    dataset_corr = np.asarray(dataset_corr)
    kmo_num = np.sum(np.square(dataset_corr)) - np.sum(
        np.square(np.diagonal(A)))
    kmo_denom = kmo_num + np.sum(np.square(A)) - np.sum(
        np.square(np.diagonal(A)))
    kmo_value = kmo_num / kmo_denom
    # kmo_value = int(kmo_value)
    # statistic = int(statistic)
    # pvalue = int(pvalue)
    res = []
    res.append([
        "{:.4f}".format(kmo_value), "{:.4f}".format(statistic),
        "{:.4f}".format(pvalue)
    ])

    col = ["KMO检验统计量", "Bartlett's球状检验统计量", "Bartlett's球状检验显著性"]
    title = "KMO检验和Bartlett's球状检验"
    return {'title': title, 'col': col, 'data': res}
Example #9
def fifth(data, data1, data2, data3):
    print(f_oneway(data1, data2, data3))
    print(ttest_ind(data1, data2))
    print(ttest_ind(data1, data3))
    print(ttest_ind(data2, data3))
    print(ttest_ind(data1, data2, equal_var=False))
    print(ttest_ind(data1, data3, equal_var=False))
    print(ttest_ind(data2, data3, equal_var=False))

    print(bartlett(data1, data2, data3))
    print(bartlett(data1, data2))
    print(bartlett(data1, data3))
    print(bartlett(data2, data3))

    z = f.ppf(0.975, N - 1, N - 1)
    print(z)
Example #10
def bartletts_test(table, response_cols, factor_col):
    groups = table[factor_col].unique()
    
    data_list = []
    stat_list = []
    p_list = []
    for response_col in response_cols:
        response = table[response_col]
        stat_bart, p_bart = bartlett(*[response[table[factor_col] == group] for group in groups])
        data = '{response_col} by {factor_col}'.format(response_col=response_col, factor_col=factor_col)
        data_list.append(data)
        stat_list.append(stat_bart)
        p_list.append(p_bart)
        
    # pd.DataFrame.from_items was removed in recent pandas; build the frame directly.
    result_table = pd.DataFrame({
        'data': data_list,
        'estimate': stat_list,
        'p_value': p_list
    }, columns=['data', 'estimate', 'p_value'])
    
    result = dict()
    result['result_table'] = result_table
    
    rb = ReportBuilder()
    rb.addMD(strip_margin("""
    ## Bartlett's Test Result
    | - H0: k population variances are equal.
    | - H1: at least two variances are different.
    |
    | {result_table}
    """.format(result_table=pandasDF2MD(result_table))))
    
    result['report'] = rb.get()
        
    return {'result': result}
Example #11
    def _anova_assumptions(self, cl):
        arrays = [['Normality (Shapiro-Wilk)', 'Normality (Shapiro-Wilk)', 'Variance', 'Variance'],
                  ['test stats', 'p-value', 'test stats', 'p-value']]

        temp = np.zeros((4, 1+len(self.indep_var)))

        index = [self.dep_var]

        # Experimental errors are normally distributed
        temp[0,0], temp[1,0] = ss.shapiro(self.ols_model.resid)

        if temp[1,0] > cl: # test for equal variances using Bartlett's test
            for i in range(len(self.indep_var)):
                index.append(self.indep_var[i])
                list_unique = self.df[self.indep_var[i]].unique()
                args = [self.df.loc[self.df[self.indep_var[i]]== x].accuracy for x in list_unique]
                temp[2,i+1], temp[3,i+1] = ss.bartlett(*args)

            arrays[0][2] = arrays[0][2] + ' (Bartlett)'
            arrays[0][3] = arrays[0][3] + ' (Bartlett)'

        else: # test for equal variances using Levene's test
            for i in range(len(self.indep_var)):
                list_unique = self.df[self.indep_var[i]].unique()
                args = [self.df.loc[self.df[self.indep_var[i]]== x].accuracy for x in list_unique]
                temp[2,i+1], temp[3,i+1] = ss.levene(*args)

            arrays[0][2] = arrays[0][2] + ' (Levene)'
            arrays[0][3] = arrays[0][3] + ' (Levene)'

        self.anova_assump_df = pd.DataFrame(temp, index=arrays, columns=index)

        if self.print_output==True: print(' ------------------\n', 'ANOVA assumptions', '\n ------------------'),\
                                    print(self.anova_assump_df, '\n')
        return
Example #12
    def check_homogene_variances(self,
                                 value_col,
                                 group_col,
                                 condition=False,
                                 display_result=True):
        # collect data
        data = self.__get_condition_sets(self.df, value_col, group_col,
                                         condition)

        # perform test
        stat, p = bartlett(*data)
        if display_result:
            print("### Homogeneity of Variances ###")
            if condition is False:
                print("{0:} between {1:}: stat={2:.5}, p={3:.5}".format(
                    value_col, group_col, stat, p))
            else:
                print(
                    "{0:} in {1:} between {2:}: stat={3:.5}, p={4:.5}".format(
                        value_col, condition, group_col, stat, p))
            if p > self.alpha:
                print('--> Homogeneous variances')
            else:
                print('--> Non-homogeneous variances')
            print("")

        return stat, p
Example #14
def check_homegenity(col1,col2,verbose=False):

    """
    Check whether distances computed for 2 models
    are from the same distribution

    """
    if check_normality(col1) == True and check_normality(col2) == True:
        # Check homogeneity of variances -- Bartlett
        if verbose is True:
            print('Performing bartlett test for equal variances')
        _,p = bartlett(col1,col2)
        if p > 0.05: # Variances equal
            if verbose is True:
                print('T-test with equal variances')
            _,p = ttest_ind(col1,col2,equal_var=True)
        else:
            if verbose is True:
                print('T-test with unequal variances')
            _,p = ttest_ind(col1,col2,equal_var=False)

        if p > 0.05:
            if verbose is True:
                print('Distributions are homogenous')
            return True
        else:
            if verbose is True:
                print('Distributions are not homogenous')
            return False


    else:
        # Check homogeneity of variances -- Levene
        if verbose is True:
            print('Performing levene test for equal variances')
        _,p = levene(col1,col2)
        if p > 0.05:
            if verbose is True:
                print('Performing Mann-Whitney U test for equality of medians')
            _,p = mannwhitneyu(col1,col2)
            if p > 0.05:
                if verbose is True:
                    print('Distributions are homogenous')
                return True
            else:
                if verbose is True:
                    print('Distributions are not homogenous')
                return False
        else:
            if verbose is True:
                print('Variances for non-normal data are not equal')
            _,p = mannwhitneyu(col1,col2)
            if p > 0.05:
                if verbose is True:
                    print('Distributions are homogenous')
                return True
            else:
                if verbose is True:
                    print('Distributions are not homogenous')
                return False
Example #15
def test_rankings(ranks_df,num_times):
    '''Runs an independent t-test num_times to determine if there is a difference in rank across platforms.

    args:
    ranks_df: pd.DataFrame, merged DataFrame of podcasts that have both an Apple and a Spotify rank
    num_times: int, number of times to run the t-test to determine differences'''

    #create empty list for p-values of each T-test
    p_values =[]
    for i in range(num_times):
        ranks_sample = ranks_df.sample(50,replace = True) #collect 50 podcasts randomly w/ replacement
        
        #if variances different, T-test will use if statement. Otherwise, T-test will use else statement
        bartletts, bart_p = stats.bartlett(ranks_sample['apple_rank'], ranks_sample['spotify_rank'])
        T, T_p = stats.ttest_ind(ranks_sample['apple_rank'],ranks_sample['spotify_rank'],equal_var=bart_p >.05)
        # if bart_p <.05:
        #     T, T_p =  stats.ttest_ind(ranks_sample['apple_rank'],ranks_sample['spotify_rank'],equal_var=False)
        # elif bart_p >.05:
        #     T, T_p = stats.ttest_ind(ranks_sample['apple_rank'],ranks_sample['spotify_rank'],equal_var=True)
        
        p_values.append(T_p)

    #calculate the average p-value of the tests conducted
    final_p = np.mean(p_values)
    if final_p >0.05:
        print('No significant difference in rank across platforms!')
    else:
        print('Significant difference in rank across platforms!')
    return final_p
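
A usage sketch with synthetic rank data (not from the source); the apple_rank / spotify_rank column names are taken from the function body, and scipy.stats, numpy and pandas are assumed imported as stats, np and pd.

import numpy as np
import pandas as pd

rng = np.random.default_rng(2)
ranks_df = pd.DataFrame({
    'apple_rank': rng.integers(1, 201, 500),
    'spotify_rank': rng.integers(1, 201, 500),
})
final_p = test_rankings(ranks_df, num_times=100)
print(final_p)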
Example #16
def get_stats_Emmanuelle(stats_type, groups, *data):
    '''
        author: version adjusted by Emmanuelle Mazur-Lainé 202206
    Args: type of stats, data
    Return: float
    '''

    data = data[0]
    nbr_gr = len(groups)

    if stats_type == 'mean':
        res_stats = ()
        for group in data:
            res = stats.tmean(group)
            res_stats += (res, )
        return (res_stats), 'mean'
    if stats_type == 'std':
        res_stats = ()
        for group in data:
            res = stats.tstd(group)
            res_stats += (res, )
        return res_stats, 'std'
    elif stats_type == 'kurtosis':
        res_stats = ()
        for group in data:
            res = stats.kurtosis(group)
            res_stats += (res, )
        return res_stats, 'kurtosis'
    elif stats_type == 'skewness':
        res_stats = ()
        for group in data:
            res = stats.skew(group)
            res_stats += (res, )
        return res_stats, 'skewness'

    elif stats_type == 'TTest':
        return stats.ttest_ind(data[0], data[1], equal_var=True), ('t', 'p')
    elif stats_type == 'Welch':
        return stats.ttest_ind(data[0], data[1], equal_var=False), ('t', 'p')
    elif stats_type == 'MannWhitneyu':
        try:
            return stats.mannwhitneyu(data[0], data[1]), ('u', 'p')
        except ValueError:
            return (0, 0), ('h', 'p')

    # TODO: still need to figure out how to pass all the groups
    # as parameters to the Bartlett, Kruskal and ANOVA tests

    elif stats_type == 'Bartlett':
        # Bartlett's test: null hypothesis of equal variances across groups
        return stats.bartlett(*data), ('t', 'p')
    elif stats_type == 'Kruskal':
        try:
            return stats.kruskal(*data), ('h', 'p')
        except ValueError:
            return (0, 0), ('h', 'p')
    elif stats_type == 'ANOVA':
        # one-way ANOVA: tests whether the group means are equal
        return stats.f_oneway(*data), ('t', 'p')
Example #17
def homo_variance(data, val_col, group_col, result=False):
    """
    data : pd.DataFrame

    val_col : str
            The name of the column to test.

    group_col : str or list
            The name of the column(s) used to divide the data into groups.

    **kwargs : bool

    result = True -> Show p-value and result

    """

    one_d_data = [
        data.loc[ids, val_col].values
        for ids in data.groupby(group_col).groups.values()
    ]
    pw_normal_dist = normal_dist(data, val_col, group_col, result=result)
    if pw_normal_dist > 0.05:
        statistic, p_value = ss.bartlett(*one_d_data)
        if p_value > 0.05:
            print("Equal variance")
            if result == True:
                print("""
===========================================================
p-value is {}. Null hypothesis is not rejected.
The variances are not significantly different. / from Bartlett test
===========================================================""".format(p_value))
        if p_value < 0.05:
            print("Unequal variance")
            if result == True:
                print("""
===========================================================
p-value is {}. Null hypothesis is rejected.
The variances are significantly different. / from Bartlett test
===========================================================""".format(p_value))
        return pw_normal_dist, p_value
    if pw_normal_dist < 0.05:
        statistic, p_value = ss.levene(*one_d_data)
        if p_value > 0.05:
            print("Equal variance")
            if result == True:
                print("""
===========================================================
p-value is {}. Null hypothesis is not rejected.
The variances are not significantly different. / from Levene test
===========================================================""".format(p_value))
        if p_value < 0.05:
            print("Unequal variance")
            if result == True:
                print("""
===========================================================
p-value is {}. Null hypothesis is rejected.
The variances are significantly different. / from Levene test
===========================================================""".format(p_value))
        return pw_normal_dist, p_value
Example #18
def levene_test(data):
    s1, p1 = levene(data.iloc[:, 0], data.iloc[:, 1], data.iloc[:, 2])
    s2, p2 = bartlett(data.iloc[:, 0], data.iloc[:, 1], data.iloc[:, 2])

    res = [s1, p1, s2, p2]
    res = [round(x, 2) for x in res]

    return res
Example #19
 def test_barlettTest_xResult(self):
     data_1 = randint(0, 100, 10)
     data_2 = randint(500, 550, 10)
     data_3 = randint(0, 10, 10)
     data_4 = randint(0, 50, 10)
     x1, p1 = bartlett_test(data_1, data_2, data_3, data_4)
     x2, p2 = bartlett(data_1, data_2, data_3, data_4)
     assert pytest.approx(x2) == x1
Example #20
    def full_test(self, groups, alpha):
        list_of_data = [sample.data() for sample in groups.groups()]
        (criterion_value, p_value) = bartlett(*list_of_data)

        crit_left, crit_right = self.critical_values(alpha)

        return criterion_value, (
            crit_left, crit_right), p_value, criterion_value < crit_right
    def homo_cal(self, x, norm_res, skews):
        '''Calculate and return the homoscedasticity test result.
        
        Parameters:
        ----------
        x : list of numpy.ndarray
            The variables to test on
        norm_res : dict 
            Results of normality test
        skews : list
            Skewness values of variables of x
        
        Returns:
        -------
        homo_res : dict
            'Variables': number of variables tested
            'Statistic': statistic value calculated by the test
            'Pvalue': p-value calculated by the test
            'Test': name of the test used
            'Result': True if homogeneous, False otherwise
            
        Notes:
        ------
        Neither the Bartlett test nor the Levene test requires
        equal sample sizes.'''

        homo_res = {}
        if sum(norm_res.values()) == len(x) and all(abs(np.array(skews)) < .5):
            # All normal, use the Bartlett test
            homo_res['Variables'] = len(x)
            homo_res['Statistic'] = ss.bartlett(*x)[0]
            homo_res['Pvalue'] = ss.bartlett(*x)[1]
            homo_res['Test'] = 'Bartlett Test'
        else:  # Not all normal, use the non-parametric Levene test
            c = self.get_center(skews)
            homo_res['Variables'] = len(x)
            homo_res['Statistic'] = ss.levene(*x, center=c)[0]
            homo_res['Pvalue'] = ss.levene(*x, center=c)[1]
            homo_res['Test'] = 'Levene Test'
            # Also Fligner-Killeen Test is an option
        if homo_res['Pvalue'] >= .05:
            homo_res['Result'] = True
        else:
            homo_res['Result'] = False

        return homo_res
Example #22
def equal_variance_test(df):
    """ test for heteroskedasticity """
    """ to test the entire data set, pass argument as pd.concat([train,test]) """
    all_samples = df[[
        'ProductCategory', 'MasterSKU', 'month', 'new_product', 'price_change',
        'cluster', 'monthly_sum_order_qty'
    ]].values
    return bartlett(*all_samples)
Example #23
def is_homoscedastic(residuals, y, ha_threshold=0.05, verbose=False):
    print_verbose(f"Testing for homoscedasticity with an alpha of: {str(ha_threshold)}. The null hypothesis is that the errors are homoscedastic.", verbose=verbose)
    result = bartlett(residuals, y)
    if ha_threshold >= result[1]:
        print_verbose(f"P-value for Bartlett test is {str(result[1])} which is at or below the threshold. We therefore reject the null hypothesis and accept the errors are heteroscedastic.", verbose=verbose)
        return False
    
    print_verbose(f"P-value for Bartlett test is {str(result[1])} which is greater than the threshold. We therefore do not reject the null hypothesis and accept the errors are homoscedastic.", verbose=verbose)
    return True
Example #24
 def fit(self, *args):
     """Perform Bartlett’s test for equal variances
     
     Parameters
     -------
     sample1, sample2,... : array_like
         arrays of sample data. May be different lengths.
     """
     self._statistic, self._p = bartlett(*args)
Example #25
def homoscedasticity_test(df, X, Y, test_type='levene'):
    groups = [df[df[X] == cls][Y] for cls in df[X].unique()]
    if test_type == 'levene':
        levene, p_value = st.levene(*groups)
    elif test_type == 'bartlett':
        bartlett, p_value = st.bartlett(*groups)
    else:
        raise Exception('{} not valid'.format(test_type))
    return p_value
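
A usage sketch, assuming homoscedasticity_test and its scipy.stats import (st) are already in scope; the group/value column names are illustrative.

import numpy as np
import pandas as pd

rng = np.random.default_rng(3)
df = pd.DataFrame({
    'group': ['a'] * 30 + ['b'] * 30,
    'value': np.concatenate([rng.normal(0, 1, 30), rng.normal(0, 2, 30)]),
})
print(homoscedasticity_test(df, 'group', 'value', test_type='bartlett'))
print(homoscedasticity_test(df, 'group', 'value', test_type='levene'))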
Example #26
 def levene(self):
     test = stats.bartlett(x[self.var], y[self.var])
     print('< Homogeneity of variance test (Bartlett) >', end='\n  ')
     if test.pvalue < 0.05:
         print(str(self.var) + ': variances differ between the two groups => heteroscedastic')
         self.Ttest(False)
     else:
         print(str(self.var) + ': variances are equal between the two groups => homoscedastic')
         self.Ttest(True)
def getSamples(df, val, survivaldict):
    if val == 0:
        col = 'subtypes'
    elif val == 1:
        col = 'novel'
    else:
        col = 'sklearn'
    clist = df.loc[df[col] == 0, 'samples']
    mlist = df.loc[df[col] == 1, 'samples']
    nlist = df.loc[df[col] == 2, 'samples']
    plist = df.loc[df[col] == 3, 'samples']

    cvals = [
        float(survivaldict[i]) for i in clist if (float(survivaldict[i]) > 0)
    ]
    mvals = [
        float(survivaldict[i]) for i in mlist if (float(survivaldict[i]) > 0)
    ]
    nvals = [
        float(survivaldict[i]) for i in nlist if (float(survivaldict[i]) > 0)
    ]
    pvals = [
        float(survivaldict[i]) for i in plist if (float(survivaldict[i]) > 0)
    ]

    # print(min(cvals), max(cvals), median(cvals))
    # print(min(mvals), max(mvals), median(mvals))
    # print(min(nvals), max(nvals), median(nvals))
    # print(min(pvals), max(pvals), median(pvals))

    print("----Signficance tests----")
    # bartlett for equal variance
    bartf, bartp = stats.bartlett(cvals, mvals, nvals, pvals)
    print("Bartlett's test: p-value = ", bartp)
    # one-way anova for significant differences in mean -> ASSUMES equal variance
    anovaf, anovap = stats.f_oneway(cvals, mvals, nvals, pvals)
    print("ANOVA test: p-value = ", anovap)
    # Kruskal-Wallis doesn't assume equal variance
    krusf, krusp = stats.kruskal(cvals, mvals, nvals, pvals)
    print("Kruskal test: p-value = ", krusp)

    plt.clf()
    plt.boxplot([cvals, mvals, nvals, pvals])
    plt.show()
    # name = col + "_bp.png"
    # plt.savefig(name)

    cmean = mean(cvals)
    mmean = mean(mvals)
    nmean = mean(nvals)
    pmean = mean(pvals)
    cstd = variance(cvals)
    mstd = variance(mvals)
    nstd = variance(nvals)
    pstd = variance(pvals)

    return cmean, mmean, nmean, pmean, cstd, mstd, nstd, pstd
Example #28
def bartlett(df, treatment_name_list, treatment_name, value_name):
    """
    Equal variances (Bartlett's test)
    """
    data = []
    for i, name in enumerate(treatment_name_list):
        data.append(df[value_name][df[treatment_name] == name])
    stat = stats.bartlett(*data)
    print(f'p-value: {stat[1]}')
    return stat
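
A usage sketch with long-format data (not from the source); it assumes pandas as pd, numpy, and scipy.stats as stats are imported as in the example, and calls the wrapper defined above.

import numpy as np
import pandas as pd

rng = np.random.default_rng(4)
df = pd.DataFrame({
    'treatment': ['A'] * 20 + ['B'] * 20 + ['C'] * 20,
    'response': np.concatenate([rng.normal(0, 1, 20),
                                rng.normal(0, 2, 20),
                                rng.normal(0, 1, 20)]),
})
stat = bartlett(df, ['A', 'B', 'C'], 'treatment', 'response')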
Example #29
def compare(data_old: pd.DataFrame, data_new: pd.DataFrame, t_test_var: str):
    # T test
    _, p = ttest_ind(data_old[t_test_var],
                     data_new[t_test_var],
                     equal_var=False)
    print(f'p val for {t_test_var} mean: {p}')

    # Variance test
    _, p = bartlett(data_old[t_test_var], data_new[t_test_var])
    print(f'p val {t_test_var} variance: {p}')
Example #30
def check_homoscedasticy():
	X, _, y, _ = generate_year_06_dataset()

	statistic, p_value = levene(*list(X.T.to_numpy()))

	print("levene: statistic = {}, p_value = {}".format(statistic, p_value))

	statistic, p_value = bartlett(*list(X.T.to_numpy()))

	print("bartlett: statistic = {}, p_value = {}".format(statistic, p_value))
Example #31
def bartlett(a, b, c):
    print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")
    print("Kiểm định Bartlett:")
    stat, pvalue = stats.bartlett(a, b, c)
    print("Statistic =", stat, "\n", "p value =", pvalue)
    if pvalue > 0.05:
        print("The features have homogeneous variances")
    else:
        print("The features do not have homogeneous variances")
    print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")
Example #32
def bartlett_pandas(group_vec, response_vec, min_size=5):
    '''
    Wrapper to run Bartlett's test for equal variances on pandas Series
    ------------------------------------------------
    group_vec: Series of labels
    response_vec: Series of measurements
    '''
    if group_vec.value_counts().min() < min_size:
        return np.nan
    group_vec, response_vec = _match_series(group_vec, response_vec)
    res = stats.bartlett(*[response_vec[group_vec == num] for num in 
                     group_vec.unique()])
    return pd.Series(res, index=['T','p'])
Example #33
 def run(self):
     if len(self._data) < self._min_size:
         pass
     if len(self._data.groups.values()) <= 1:
         raise NoDataError("Equal variance test requires at least two numeric vectors.")
     if NormTest(self._data, display=False, alpha=self._alpha).p_value > self._alpha:
         statistic, p_value = bartlett(*self._data.groups.values())
         r = 'Bartlett'
         self._results.update({'p value': p_value, self._statistic_name[r]: statistic, 'alpha': self._alpha})
     else:
         statistic, p_value = levene(*self._data.groups.values())
         r = 'Levene'
         self._results.update({'p value': p_value, self._statistic_name[r]: statistic, 'alpha': self._alpha})
     self._test = r
     self._name = self._names[r]
Example #34
def is_equal_variance(mesa1, mesa2):
    """ Determine if two sets of values have equal variance.

    This uses the Bartlett test to determine whether or not the values are of
    equal variance. This test only holds for a normal distribution. The caller
    should have checked this for us.
    TODO: Implement Levene’s test for non-normally distributed data
    """

    # scipy really doesn't like 0 variance
    if np.var(mesa1) == 0 and np.var(mesa1) == np.var(mesa2):
        return True

    # http://www.itl.nist.gov/div898/handbook/eda/section3/eda357.htm
    T, _p = stats.bartlett(mesa1, mesa2)
    x2 = chisquare_critical(BARTLETT_CI, len(mesa1))
    return T <= x2
Example #35
def anova1(data,alpha):
    '''
    Returns list of anova-table data.
    Inputs:
        data - list of numpy arrays, possibly of different lengths.
        It is assumed that all arrays have the same variance and
        are normally distributed.
    Computational steps:
        1. All 
    '''
    IsAllNormalDistributed=True
    for k in data:
        res=st.shapiro(k)
        if alpha>res[1]:
            IsAllNormalDistributed=False
            break  # no need to keep looping once one array fails
    
    if IsAllNormalDistributed==False:
        res=st.kruskal(*data)
        res=list(res)
        res.append('kr')
    else:
        w=st.bartlett(*data) 
        if alpha<w[1]:
            # All variances are equal: perform the standard one-way ANOVA.
            res=st.f_oneway(*data)
            res=list(res)
            res.append('fs')
        else:
            # All the data are normally distributed, but the variances differ.
            # Do pairwise comparisons of columns using the Cochran-Cox (Welch) criterion
            print 'All data normally distributed'
            wta=[]
            for k in data:
                for j in [x for x in data if np.all(x!=k)]:
                    wta.append(welch_test(k,j)[1])
                    res=[0,min(wta),'wl']
    return res
def plotExpBox_Main(inputFiles,headers,valcols,outputFile,sep,startRow,showIndPoints,mark,markMean,showMean,notch,whisker,outliers,plotPvalueCluster,outputClusterPrefix,methodCluster,xlegendrotation,xlabe,ylabe,figsz,titl,showSampleSizes,trimToMinSize,relabels,logb,plotHistogramToFile,plotMedianForGroups,botta,showViolin,showBox,firstColAnnot,plotTrend,showLegend,makePzfxFile,makeBinMatrix,writeDataSummaryStat,summaryStatRange,minuslog10pvalue,minNDataToKeep,vfacecolor,valpha,outXYZPvalues,dividePlots):

	#if plotPvalueCluster:
		#if pvalue cluster is needed:
	#	from Bio.Cluster.cluster import *
	#	from Bio.Cluster import *
		#endif


	
	#the real deal!
	plotData=[]	
	xtickLabels=[]
	
	trendData={}
	annot={}
	
	minSize=-1

	for inputFile,header,cols in zip(inputFiles,headers,valcols):
		fin=generic_istream(inputFile)
		
		startIdx=len(plotData)
		
		if firstColAnnot:
			colAnnot=cols[0]
			cols=cols[1:]
			annotThisFile=[]
			annot[startIdx]=annotThisFile
		else:
			colAnnot=-1
			annotThisFile=None
			
		for col in cols:
			plotData.append([])
			xtickLabels.append(header[col])

		colIndices=range(startIdx,startIdx+len(cols))
		
		if plotTrend:
			#print >> stderr,"plotTrend"
			trendDataThisFile=[]
			trendData[startIdx]=trendDataThisFile
		else:
			trendDataThisFile=None
			
			
		lino=0
		for lin in fin:
			lino+=1
			if lino<startRow:
				continue		
			fields=lin.rstrip("\r\n").split(sep)
			
			if plotTrend:
				#print >> stderr,"a"
				trendDataThisLine=[]
			else:
				trendDataThisLine=None
			
			allDataOKThisLine=True
			
			if colAnnot>=0:
				annotThisFile.append(fields[colAnnot])
			
			for idx,col in zip(colIndices,cols):
				try:
					value=float(fields[col])
					if logb!=0:
						if value==0.0:
							raise ValueError
						value=log(value)/logb							
					plotData[idx].append(value)
					
					if plotTrend:
						trendDataThisLine.append(value)
						#print >> stderr,"value:",value
					
				except:
					allDataOKThisLine=False	
				
			if plotTrend:
				if allDataOKThisLine:
					trendDataThisFile.append(trendDataThisLine)
				else:
					trendDataThisFile.append(None)
			
		fin.close()
	
		
		if minSize==-1:
			minSize=len(plotData[idx]) #or startIDX?
		else:
			minSize=min([minSize,len(plotData[idx])])
		

	if trimToMinSize:
		print >> stderr,"trimming to min size =",minSize
		trimData(plotData,minSize)

	if len(relabels)>0:
		#if len(relabels)!=len(xtickLabels):
		#	print >> stderr,"relabels doesn't have the same length as original label vectors",xtickLabels,"=>",relabels
		#	exit()
		print >> stderr,xtickLabels
		print >> stderr,relabels
		for i,relabel in zip(range(0,len(relabels)),relabels):
			xtickLabels[i]=relabel
		
	
	for i in range(0,len(plotMedianForGroups)):
		plotMedianForGroups[i]=getCol0ListFromCol1ListStringAdv(xtickLabels,plotMedianForGroups[i])
			
	
	#drawing medians:
	medianToDraw=[]
	for mediangrouper in plotMedianForGroups:
		curD=[]		
		for c in mediangrouper:
			curD.extend(plotData[c])
		medianToDraw.append(median(curD))


	for c in range(len(plotData)-1,-1,-1):
		if len(plotData[c])<minNDataToKeep:
			print >> stderr,xtickLabels[c],"discarded because has only",len(plotData[c]),"data points <",minNDataToKeep
			del plotData[c]
			del xtickLabels[c]

	if not skipStat:
		print >> stdout,"student t-test (1 sample; mean=0)"
		print >> stdout,"sample","mean","p-val","median"
	
		if writeDataSummaryStat:
			fDSS=open(writeDataSummaryStat,"w")
			print >> fDSS,"sample\tmean\tvar\tsd\tmin\tmax\tN\tNInRange["+str(summaryStatRange[0])+","+str(summaryStatRange[1])+"]\t%NInRange\tNbelowRange\t%Nbelow\tNAboveRange\t%NAbove"
			
		for x in range(0,len(plotData)):
			#print >> stderr, len(plotData[x])
			try:
				print >> stdout, xtickLabels[x],mean(plotData[x]),ttest_1samp(plotData[x],0)[1],median(plotData[x])
			except:
				print >> stdout, xtickLabels[x],mean(plotData[x]),"NA",median(plotData[x])
			
			if writeDataSummaryStat:
				sumData,N,NIN,NBelow,NAbove=filterDataInRangeInclusive(plotData[x],summaryStatRange[0],summaryStatRange[1])
				
				if NIN>1:
					#print >> stderr,"sumData=",sumData
					#print >> stderr,mean
					mea=mean2(sumData)
					DDOF=1
					sd=std(sumData,ddof=DDOF)
					var=sd*sd
					mi=min(sumData)
					ma=max(sumData)
				else:
					mea="NA"
					sd="NA"
					var="NA"
					mi="NA"
					ma="NA"
				
			
					
				print >> fDSS,xtickLabels[x]+"\t"+str(mea)+"\t"+str(var)+"\t"+str(sd)+"\t"+str(mi)+"\t"+str(ma)+"\t"+str(N)+"\t"+str(NIN)+"\t"+str(float(NIN)*100/N)+"\t"+str(NBelow)+"\t"+str(float(NBelow)*100/N)+"\t"+str(NAbove)+"\t"+str(float(NAbove)*100/N)
			
	
		pvalueM=[]
		
		if writeDataSummaryStat:
			fDSS.close()
		
		print >> stdout,""
		
		print >> stdout,"student t-test (2 samples)"
		print >> stdout,"p-val",
		for x in range(0,len(plotData)):
			print >> stdout,xtickLabels[x],
		
		print >> stdout,""
	
		for x in range(0,len(plotData)):
			pvalueRow=[]
			pvalueM.append(pvalueRow)
			print >> stdout, xtickLabels[x],
			for y in range(0,len(plotData)):
				if y<=x:
					print >> stdout, "",
					if x==y:
						if minuslog10pvalue:
							pvalueRow.append(0.0)
						else:
							pvalueRow.append(1.0)
					else:
						pvalueRow.append(pvalueM[y][x])
				else:
					try:
						pvalue=ttest_ind(plotData[x],plotData[y])[1]
					except:
						pvalue=1.0
					
					if minuslog10pvalue and str(pvalue)!="NA":
						try:
							pvalue=-1*log(pvalue,10)
						except:
							pvalue=-1000.0
					
					print >> stdout, str(pvalue),
					pvalueRow.append(pvalue)
			print >> stdout,"";	
	
		
		print >> stdout,""
	
		
	
	
		if plotPvalueCluster:
			makePValueRawPlot(outputClusterPrefix+"_t_raw",xtickLabels,pvalueM)
			makePValueClusterPlot(outputClusterPrefix+"_t",xtickLabels,pvalueM,methodCluster)
	
	
			
		pvalueM=[]
	
		print >> stdout,"welch t-test"
		print >> stdout,"p-val",
		for x in range(0,len(plotData)):
			print >> stdout,xtickLabels[x],
		
		print >> stdout,""
		for x in range(0,len(plotData)):
			pvalueRow=[]
			pvalueM.append(pvalueRow)
			print >> stdout, xtickLabels[x],
			for y in range(0,len(plotData)):
				if y<=x:
					print >> stdout, "",
					if x==y:
						if minuslog10pvalue:
							pvalueRow.append(0.0)
						else:
							pvalueRow.append(1.0)
					else:
						pvalueRow.append(pvalueM[y][x])
						
				else:
					try:
						pvalue=welchs_approximate_ttest_arr(plotData[x],plotData[y])[3]
					except:
						pvalue=1.0
	
					if minuslog10pvalue and str(pvalue)!="NA":
						try:
							pvalue=-1*log(pvalue,10)
						except:
							pvalue=-1000.0
	
					
					print >> stdout, str(pvalue),
					pvalueRow.append(pvalue)
			print >> stdout,"";
	
		if outXYZPvalues:
			writeXYZPvalues(outXYZPvalues+"_Welch.xyz",xtickLabels,pvalueM)
	
		if plotPvalueCluster:
			makePValueRawPlot(outputClusterPrefix+"_Welch_raw",xtickLabels,pvalueM)
			makePValueClusterPlot(outputClusterPrefix+"_Welch",xtickLabels,pvalueM,methodCluster)
	
		
		print >> stdout,""
		print >> stdout,"non-parametric (Mann-Whitney U)" #"non-parametric (Mann-Whitney U if larger n<=20 else Wilcoxon)"
		print >> stdout,"p-val",
		for x in range(0,len(plotData)):
			print >> stdout,xtickLabels[x],
		
	
		pvalueM=[]
	
		print >> stdout,""
		for x in range(0,len(plotData)):
			pvalueRow=[]
			pvalueM.append(pvalueRow)
			print >> stdout, xtickLabels[x],
			for y in range(0,len(plotData)):
				if y<=x:
					print >> stdout, "",
					if x==y:
						if minuslog10pvalue:
							pvalueRow.append(0.0)
						else:
							pvalueRow.append(1.0)
					else:
						pvalueRow.append(pvalueM[y][x])
				else:
					#if max(len(plotData[x]),len(plotData[y]))<=20:
					try:
						pvalue=mannwhitneyu(plotData[x],plotData[y])[1]*2				
					except:
						pvalue=1.0
	
					if minuslog10pvalue and str(pvalue)!="NA":
						try:
							pvalue=-1*log(pvalue,10)
						except:
							pvalue=-1000.0
	
	
					print >> stdout,pvalue, #mann-whiteney need to mul by 2 (one tail to two tail)
					pvalueRow.append(pvalue)
					#else:
					#	print >>  stdout,wilcoxon(plotData[x],plotData[y])[1], # this is two-tailed already stdout, "", #
			print >> stdout,"";	
	
		if outXYZPvalues:
			writeXYZPvalues(outXYZPvalues+"_U.xyz",xtickLabels,pvalueM)
		
	
		if plotPvalueCluster:
			makePValueRawPlot(outputClusterPrefix+"_U_raw",xtickLabels,pvalueM)
			makePValueClusterPlot(outputClusterPrefix+"_U",xtickLabels,pvalueM,methodCluster)
		
		#####now the variance tests
		
		print >> stdout,""
		print >> stdout,"Ansari-Bradley Two-sample Test for difference in scale parameters " 
		print >> stdout,"p-val",
		
		
		for x in range(0,len(plotData)):
			print >> stdout,xtickLabels[x],
		
	
		pvalueM=[]
	
		print >> stdout,""
		for x in range(0,len(plotData)):
			pvalueRow=[]
			pvalueM.append(pvalueRow)
			print >> stdout, xtickLabels[x],
			for y in range(0,len(plotData)):
				if y<=x:
					print >> stdout, "",
					if x==y:
						if minuslog10pvalue:
							pvalueRow.append(0.0)
						else:
							pvalueRow.append(1.0)
					else:
						pvalueRow.append(pvalueM[y][x])
				else:
					#if max(len(plotData[x]),len(plotData[y]))<=20:
					try:
						pvalue=ansari(plotData[x],plotData[y])[1]		
					except:
						pvalue="NA"
	
					if minuslog10pvalue and str(pvalue)!="NA":
						try:
							pvalue=-1*log(pvalue,10)
						except:
							pvalue=-1000.0
	
	
						#pvalue=1.0
					print >> stdout,pvalue,
					pvalueRow.append(pvalue)
					#else:
					#	print >>  stdout,wilcoxon(plotData[x],plotData[y])[1], # this is two-tailed already stdout, "", #
			print >> stdout,"";	
		
		if plotPvalueCluster:
			makePValueRawPlot(outputClusterPrefix+"_Ansari_raw",xtickLabels,pvalueM)
			makePValueClusterPlot(outputClusterPrefix+"_Ansari",xtickLabels,pvalueM,methodCluster)	
		
		
		#####
	
		#####now the variance tests
		
		print >> stdout,""
		print >> stdout,"Fligner's Two-sample Test for equal variance (non-parametrics)" 
		print >> stdout,"p-val",
		
		
		for x in range(0,len(plotData)):
			print >> stdout,xtickLabels[x],
		
	
		pvalueM=[]
	
		print >> stdout,""
		for x in range(0,len(plotData)):
			pvalueRow=[]
			pvalueM.append(pvalueRow)
			print >> stdout, xtickLabels[x],
			for y in range(0,len(plotData)):
				if y<=x:
					print >> stdout, "",
					if x==y:
						if minuslog10pvalue:
							pvalueRow.append(0.0)
						else:
							pvalueRow.append(1.0)
					else:
						pvalueRow.append(pvalueM[y][x])
				else:
					#if max(len(plotData[x]),len(plotData[y]))<=20:
					try:
						pvalue=fligner(plotData[x],plotData[y])[1]		
					except:
						pvalue="NA"
						#pvalue=1.0
						
					if minuslog10pvalue and str(pvalue)!="NA":
						try:
							pvalue=-1*log(pvalue,10)
						except:
							pvalue=-1000.0
	
	
					print >> stdout,pvalue,
					pvalueRow.append(pvalue)
					#else:
					#	print >>  stdout,wilcoxon(plotData[x],plotData[y])[1], # this is two-tailed already stdout, "", #
			print >> stdout,"";	
		
		if plotPvalueCluster:
			makePValueRawPlot(outputClusterPrefix+"_fligner_raw",xtickLabels,pvalueM)
			makePValueClusterPlot(outputClusterPrefix+"_fligner",xtickLabels,pvalueM,methodCluster)	
		
		
		#####
	
		#####now the variance tests
		
		print >> stdout,""
		print >> stdout,"Levene's Two-sample Test for equal variance" 
		print >> stdout,"p-val",
		
		
		for x in range(0,len(plotData)):
			print >> stdout,xtickLabels[x],
		
	
		pvalueM=[]
	
		print >> stdout,""
		for x in range(0,len(plotData)):
			pvalueRow=[]
			pvalueM.append(pvalueRow)
			print >> stdout, xtickLabels[x],
			for y in range(0,len(plotData)):
				if y<=x:
					print >> stdout, "",
					if x==y:
						if minuslog10pvalue:
							pvalueRow.append(0.0)
						else:
							pvalueRow.append(1.0)
					else:
						pvalueRow.append(pvalueM[y][x])
				else:
					#if max(len(plotData[x]),len(plotData[y]))<=20:
					try:
						pvalue=levene(plotData[x],plotData[y])[1]		
					except:
						pvalue="NA"
						#pvalue=1.0
						
					if minuslog10pvalue and str(pvalue)!="NA":
						try:
							pvalue=-1*log(pvalue,10)
						except:
							pvalue=-1000.0
	
	
					print >> stdout,pvalue,
					pvalueRow.append(pvalue)
					#else:
					#	print >>  stdout,wilcoxon(plotData[x],plotData[y])[1], # this is two-tailed already stdout, "", #
			print >> stdout,"";	
		
		if plotPvalueCluster:
			makePValueRawPlot(outputClusterPrefix+"_levene_raw",xtickLabels,pvalueM)
			makePValueClusterPlot(outputClusterPrefix+"_levene",xtickLabels,pvalueM,methodCluster)	
		
		
		#####
	
		#####now the variance tests
		
		print >> stdout,""
		print >> stdout,"Bartlett's Two-sample Test for equal variance (for normal distributions)" 
		print >> stdout,"p-val",
		
		
		for x in range(0,len(plotData)):
			print >> stdout,xtickLabels[x],
		
	
		pvalueM=[]
	
		print >> stdout,""
		for x in range(0,len(plotData)):
			pvalueRow=[]
			pvalueM.append(pvalueRow)
			print >> stdout, xtickLabels[x],
			for y in range(0,len(plotData)):
				if y<=x:
					print >> stdout, "",
					if x==y:
						if minuslog10pvalue:
							pvalueRow.append(0.0)
						else:
							pvalueRow.append(1.0)
					else:
						pvalueRow.append(pvalueM[y][x])
				else:
					#if max(len(plotData[x]),len(plotData[y]))<=20:
					try:
						pvalue=bartlett(plotData[x],plotData[y])[1]		
					except:
						pvalue="NA"
						#pvalue=1.0
	
					if minuslog10pvalue and str(pvalue)!="NA":
						try:
							pvalue=-1*log(pvalue,10)
						except:
							pvalue=-1000.0
	
	
					print >> stdout,pvalue,
					pvalueRow.append(pvalue)
					#else:
					#	print >>  stdout,wilcoxon(plotData[x],plotData[y])[1], # this is two-tailed already stdout, "", #
			print >> stdout,"";	
		
		if plotPvalueCluster:
			makePValueRawPlot(outputClusterPrefix+"_bartlett_raw",xtickLabels,pvalueM)
			makePValueClusterPlot(outputClusterPrefix+"_bartlett",xtickLabels,pvalueM,methodCluster)	
		
		
		#####

	figure(figsize=figsz)
	subplots_adjust(top=0.9, bottom=botta, left=0.2, right=0.8)
	
	if len(titl)==0:
		titl=outputFile


	plotExpBox(plotData,xtickLabels,showIndPoints,mark,markMean,showMean,notch,whisker,outliers,xlegendrotation,xlabe,ylabe,titl,showSampleSizes,showViolin,showBox,annot,trendData,showLegend,makePzfxFile,makeBinMatrix,dividePlots)
	
	#ylim([0,200])
	for m in medianToDraw:
		axhline(y=m,linestyle=':',color='gray')

	savefig(outputFile,bbox_inches="tight")

	if len(plotHistogramToFile)>0:
		drawHistogram(plotHistogramToFile,plotData,xtickLabels)
		drawDensigram(plotHistogramToFile+".density.png",plotData,xtickLabels)
def create_stats_dict(df, group_var, continuous_measures=None, discrete_measures=None):
    '''
    This function first groups the data frame (df) by the grouping variable (group_var)
    
    It then loops through the discrete and continuous measures and tests the two groups
    for equality of means and variances. 

    Next, the continuous measures and discrete measures are all compared
    for each group separately and the whole group together.

    All data is stored in the returned dictionary (stats_dict).
    For the discrete measures there are the following entries:
        * KEY: Number of participants, eg: Meds_male_n 
          VALUE: Contingency table of the number of people
                 in each combination of group_var and discrete measure
                 
        * KEY: Output of fisher's exact test, eg: Meds_male_fisher
          VALUE: odds_ratio and p_value tuple 
    
    For the continuous measures there are the following entries:
        * KEY: Number of observations, eg: Age_mean
          VALUE: Number of observations in each group of group_var
        
        * KEY: Mean value, eg: Age_mean
          VALUE: Mean value for the continuous measure
                 for each group of group_var

        * KEY: Standard deviation, eg: Age_std
          VALUE: Standard deviation for the continuous measure
                 for each group of group_var
                 
        * KEY: Percentile_values, eg: Age_perc25, Age_perc50, Age_perc75
          VALUE: 25th, 50th and 75th percentile values 
                 for the continuous measure for each group of group_var
                 
        * KEY: Median value, eg: Age_perc50
          VALUE: Median value for the continuous measure
                 for each group of group_var

        * KEY: Test of equal variance output, eg: Age_eqvar
          VALUE: t and p values for bartlett test of equal variance

        * KEY: Test of normality, eg: Age_normal
          VALUE: k2 and p values for omnibus test of normality for *ungrouped* data
          
        * KEY: Test of equal means given equal variance output, eg: Age_ttest_eqvar
          VALUE: t and p values for student's t-test

        * KEY: Test of equal means given unequal variance output, eg: Age_ttest_uneqvar
          VALUE: t and p values for Welch's t-test

        * KEY: Test of equal medians, eg: Age_mannwhitneyu
          VALUE: U and p values for Mann Whitney U test
    '''
    
    # Import what you need
    import numpy as np
    from scipy.stats import ttest_ind, bartlett, fisher_exact, pearsonr, mannwhitneyu, normaltest
    import itertools as it

    # Create the stats dictionary that we're going to fill in
    stats_dict = {}
    
    # Group the data frame by the grouping variable
    grouped = df.groupby(group_var)
    
    # Loop first through the discrete measures (if there are any)
    if discrete_measures:
        
        for measure in discrete_measures[0]:
            
            # Group the data frame by BOTH the grouping variable 
            # and the discrete measure
            grouped_again = df.groupby([group_var, measure])

            # Define the key for "n" entry to the stats dictionary
            key = '_'.join([group_var, measure, 'n'])
            
            # Add the number of members of each group into the stats_dict
            stats_dict[key] = grouped_again[measure].count().values[:]
            
            # If you have members in each of the four groups then
            # calculate the Fisher's exact test on this contingency table
            if len(np.array(grouped_again[measure].count())) == 4:
                
                # The n_array is the contingency table
                n_array = np.array(grouped_again[measure].count()).reshape([2,2])
                
                # Define the key for the dictionary entry
                key = '_'.join([group_var, measure, 'fisher'])
                
                # And add the fisher's exact output to the stats_dict
                stats_dict[key] = fisher_exact(n_array)
               
    # Loop through the continuous measures
    if continuous_measures:
        for measure in continuous_measures:
            
            # Save some basic stats for each measure by group
            key = '_'.join([measure, 'n'])
            stats_dict[key] = grouped[measure].count().values[:]
            
            key = '_'.join([measure, 'mean'])
            stats_dict[key] = grouped[measure].mean().values[:]
            
            key = '_'.join([measure, 'std'])
            stats_dict[key] = grouped[measure].std().values[:]
            
            key = '_'.join([measure, 'perc25'])
            stats_dict[key] = grouped[measure].quantile(0.25).values[:]
            
            key = '_'.join([measure, 'perc50'])
            stats_dict[key] = grouped[measure].quantile(0.5).values[:] 

            key = '_'.join([measure, 'median'])
            stats_dict[key] = grouped[measure].quantile(0.5).values[:] 
            
            key = '_'.join([measure, 'perc75'])
            stats_dict[key] = grouped[measure].quantile(0.75).values[:] 
            
            # Now save the output of tests of equal variance
            # and equal means
            
            # Use this snazzy little list manipulation to get the 
            # group values
            values = [ g.values for n, g in grouped[measure] ]
            
            # If there are two groups
            if len(values) == 2:

                # Mask out the not a numbers
                values[0] = [ x for x in values[0] if not np.isnan(x) ]
                values[1] = [ x for x in values[1] if not np.isnan(x) ]
        
                # Conduct test for equal variance
                key = '_'.join([measure, 'eqvar'])
                stats_dict[key] = bartlett(values[0], values[1])
                
                # Conduct test for normality
                key = '_'.join([measure, 'normal'])
                stats_dict[key] = normaltest(np.hstack([values[0], values[1]]))
                
                # When you test for equal means (ttest) you have different options
                # depending on if you have equal variances or not. You can also
                # run the non-parametric Mann Whitney U test
                
                # All three will be entered in the stats_dict
                
                # Conduct Welch's t-test (unequal variances)
                key = '_'.join([measure, 'ttest_uneqvar'])
                stats_dict[key] = ttest_ind(values[1], values[0], equal_var = False)
            
                # Conduct standard student's t-test (equal variances)
                key = '_'.join([measure, 'ttest_eqvar'])
                stats_dict[key] = ttest_ind(values[1], values[0], equal_var = True)

                # Conduct mann whitney U test (non-parametric test of medians)
                key = '_'.join([measure, 'mannwhitneyu'])
                u, p = mannwhitneyu(values[1], values[0])
                stats_dict[key] = (u, p*2)        
                
        # For two continuous measures we can calculate
        # PAIRWISE CORRELATIONS
        if continuous_measures:
            for a, b in it.combinations(continuous_measures,2):
                # First look at the whole group
                # mask out participants who don't have both measures
                mask = (df[a].notnull()) * (df[b].notnull())
                
                # Enter the number of participants that were included for the
                # regression into your stats dict
                key = '_'.join([group_var, 'all', a, b, 'n'])
                stats_dict[key] = np.sum(mask)
    
                # Figure out the pairwise correlation for this pair of measures
                # and add it to your stats_dict
                a_values = df[a][mask].values
                b_values = df[b][mask].values
                key = '_'.join([group_var, 'all', a, b, 'pwcorr'])
                stats_dict[key] = pearsonr(a_values, b_values)
                
                # Then do the same thing for the groups individually
                for name, group in grouped:
                    mask = (group[a].notnull()) * (group[b].notnull())
                    
                    # Save the number of members of the group who were 
                    # included in the regression
                    key = '_'.join([group_var, str(name), a, b, 'n'])
                    stats_dict[key] = np.sum(mask)
    
                    # And save the pairwise correlation
                    a_values = group[a][mask].values
                    b_values = group[b][mask].values
                    key = '_'.join([group_var, str(name), a, b, 'pwcorr'])
                    stats_dict[key] = pearsonr(a_values, b_values)
            
            # For a combination of a continuous measure and a discrete measure
            # we can conduct TTESTS 
            if discrete_measures:
                for discrete, a in it.product(discrete_measures[0], continuous_measures):
                    # First look at the whole group
                    grouped_discrete = df.groupby(discrete)
                    
                    values = [ g.values for n, g in grouped_discrete[a] ]

                    if len(values) == 2:

                        # Mask out the not a numbers
                        values[0] = [ x for x in values[0] if not np.isnan(x) ]
                        values[1] = [ x for x in values[1] if not np.isnan(x) ]
                
                        # Conduct test for equal variance
                        key = '_'.join([group_var, 'all', discrete, a, 'eqvar'])
                        stats_dict[key] = bartlett(values[0], values[1])
                        
                        # Conduct test for normality
                        key = '_'.join([group_var, 'all', discrete, a, 'normal'])
                        stats_dict[key] = normaltest(np.hstack([values[0], values[1]]))
                        
                        # When you test for equal means (ttest) you have different options
                        # depending on if you have equal variances or not. You can also
                        # run the non-parametric Mann Whitney U test
                        
                        # All three will be entered in the stats_dict
                        
                        # Conduct Welch's t-test (unequal variances)
                        key = '_'.join([group_var, 'all', discrete, a, 'ttest_uneqvar'])
                        stats_dict[key] = ttest_ind(values[1], values[0], equal_var = False)
                    
                        # Conduct standard student's t-test (equal variances)
                        key = '_'.join([group_var, 'all', discrete, a, 'ttest_eqvar'])
                        stats_dict[key] = ttest_ind(values[1], values[0], equal_var = True)
        
                        # Conduct mann whitney U test (non-parametric test of medians)
                        key = '_'.join([group_var, 'all', discrete, a, 'mannwhitneyu'])
                        u, p = mannwhitneyu(values[1], values[0])
                        stats_dict[key] = (u, p*2)
                            
                    # Next look at the two groups separately:
                    for name, group in grouped:
                        grouped_discrete = group.groupby(discrete)
                                                
                        values = [ g.values for n, g in grouped_discrete[a] ]
                        
                        if len(values) == 2:
                            
                            # Mask out the not a numbers
                            values[0] = [ x for x in values[0] if not np.isnan(x) ]
                            values[1] = [ x for x in values[1] if not np.isnan(x) ]
                    
                            # Conduct test for equal variance
                            key = '_'.join([group_var, str(name), discrete, a, 'eqvar'])
                            stats_dict[key] = bartlett(values[0], values[1])
                            
                            # Conduct test for normality
                            key = '_'.join([group_var, str(name), discrete, a, 'normal'])
                            stats_dict[key] = normaltest(np.hstack([values[0], values[1]]))
                            
                            # When you test for equal means (ttest) you have different options
                            # depending on if you have equal variances or not. You can also
                            # run the non-parametric Mann Whitney U test
                            
                            # All three will be entered in the stats_dict
                            
                            # Conduct Welch's t-test (unequal variances)
                            key = '_'.join([group_var, str(name), discrete, a, 'ttest_uneqvar'])
                            stats_dict[key] = ttest_ind(values[1], values[0], equal_var = False)
                        
                            # Conduct standard student's t-test (equal variances)
                            key = '_'.join([group_var, str(name), discrete, a, 'ttest_eqvar'])
                            stats_dict[key] = ttest_ind(values[1], values[0], equal_var = True)
            
                            # Conduct mann whitney U test (non-parametric test of medians)
                            # NOTE that this returns a 1 tailed p value so we multiply it here
                            
                            key = '_'.join([group_var, str(name), discrete, a, 'mannwhitneyu'])
                            u, p = mannwhitneyu(values[1], values[0])
                            stats_dict[key] = (u, p*2)

        # For combinations of discrete measures you can conduct
        # FISHER EXACT tests
        if discrete_measures:
            if len(discrete_measures) > 1:
                for a, b in it.combinations(discrete_measures[0], 2):
                    # Look first at the whole group
                    grouped_again = df.groupby([a,b])
                    
                    if len(np.array(grouped_again[b].count())) == 4:
                    
                        key = '_'.join([group_var, 'all', a, b, 'n'])
                        stats_dict[key] = grouped_again[b].count().values[:]
                        
                        # Now calculate the Fisher's exact test on this contingency table
                        n_array = np.array(grouped_again[b].count()).reshape([2,2])
                    
                        key = '_'.join([group_var, 'all', a, b, 'fisher'])
                        stats_dict[key] = fisher_exact(n_array)
        
                        # Now loop through the two groups separately
                        for name, group in grouped:
                            grouped_again = group.groupby([a,b])
                            
                            if len(np.array(grouped_again[b].count())) == 4:
                                
                                key = '_'.join([group_var, str(name), a, b, 'n'])
                                stats_dict[key] = grouped_again[b].count().values[:]
                                
                                # Now calculate the Fisher's exact test on this contingency table
                                n_array = np.array(grouped_again[b].count()).reshape([2,2])
                                
                                key = '_'.join([group_var, str(name), a, b, 'fisher'])
                                stats_dict[key] = fisher_exact(n_array)
                    
    return stats_dict
Example #38
0
def bartlett(xy):
    # Unpack the (x, y) pair explicitly; tuple parameters in a def are invalid in Python 3
    x, y = xy
    return stats.bartlett(x, y)
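# A minimal usage sketch (an addition, not part of the original tool). It assumes the
# module-level `from scipy import stats` import that the wrapper above relies on; the
# pair list below is made up purely for illustration.
def _demo_bartlett_pairs():
    import numpy as np
    rng = np.random.default_rng(0)
    pairs = [(rng.normal(size=30), rng.normal(size=30)) for _ in range(3)]
    # Each element of the result is a (statistic, pvalue) pair from scipy.stats.bartlett
    return [bartlett(pair) for pair in pairs]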
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--infile", required=True, help="Tabular file.")
    parser.add_argument("-o", "--outfile", required=True, help="Path to the output file.")
    parser.add_argument("--sample_one_cols", help="Input format, like smi, sdf, inchi")
    parser.add_argument("--sample_two_cols", help="Input format, like smi, sdf, inchi")
    parser.add_argument("--sample_cols", help="Input format, like smi, sdf, inchi,separate arrays using ;")
    parser.add_argument("--test_id", help="statistical test method")
    parser.add_argument(
        "--mwu_use_continuity",
        action="store_true",
        default=False,
        help="Whether a continuity correction (1/2.) should be taken into account.",
    )
    parser.add_argument(
        "--equal_var",
        action="store_true",
        default=False,
        help="If set perform a standard independent 2 sample test that assumes equal population variances. If not set, perform Welch's t-test, which does not assume equal population variance.",
    )
    parser.add_argument(
        "--reta", action="store_true", default=False, help="Whether or not to return the internally computed a values."
    )
    parser.add_argument("--fisher", action="store_true", default=False, help="if true then Fisher definition is used")
    parser.add_argument(
        "--bias",
        action="store_true",
        default=False,
        help="if false,then the calculations are corrected for statistical bias",
    )
    parser.add_argument("--inclusive1", action="store_true", default=False, help="if false,lower_limit will be ignored")
    parser.add_argument(
        "--inclusive2", action="store_true", default=False, help="if false,higher_limit will be ignored"
    )
    parser.add_argument("--inclusive", action="store_true", default=False, help="if false,limit will be ignored")
    parser.add_argument(
        "--printextras",
        action="store_true",
        default=False,
        help="If True, if there are extra points a warning is raised saying how many of those points there are",
    )
    parser.add_argument(
        "--initial_lexsort",
        action="store_true",
        default="False",
        help="Whether to use lexsort or quicksort as the sorting method for the initial sort of the inputs.",
    )
    parser.add_argument("--correction", action="store_true", default=False, help="continuity correction ")
    parser.add_argument(
        "--axis",
        type=int,
        default=0,
        help="Axis can equal None (ravel array first), or an integer (the axis over which to operate on a and b)",
    )
    parser.add_argument(
        "--n",
        type=int,
        default=0,
        help="the number of trials. This is ignored if x gives both the number of successes and failures",
    )
    parser.add_argument("--b", type=int, default=0, help="The number of bins to use for the histogram")
    parser.add_argument("--N", type=int, default=0, help="Score that is compared to the elements in a.")
    parser.add_argument("--ddof", type=int, default=0, help="Degrees of freedom correction")
    parser.add_argument("--score", type=int, default=0, help="Score that is compared to the elements in a.")
    parser.add_argument("--m", type=float, default=0.0, help="limits")
    parser.add_argument("--mf", type=float, default=2.0, help="lower limit")
    parser.add_argument("--nf", type=float, default=99.9, help="higher_limit")
    parser.add_argument(
        "--p",
        type=float,
        default=0.5,
        help="The hypothesized probability of success. 0 <= p <= 1. The default value is p = 0.5",
    )
    parser.add_argument("--alpha", type=float, default=0.9, help="probability")
    parser.add_argument("--new", type=float, default=0.0, help="Value to put in place of values in a outside of bounds")
    parser.add_argument(
        "--proportiontocut",
        type=float,
        default=0.0,
        help="Proportion (in range 0-1) of total data set to trim of each end.",
    )
    parser.add_argument(
        "--lambda_",
        type=float,
        default=1.0,
        help="lambda_ gives the power in the Cressie-Read power divergence statistic",
    )
    parser.add_argument(
        "--imbda",
        type=float,
        default=0,
        help="If lmbda is not None, do the transformation for that value.If lmbda is None, find the lambda that maximizes the log-likelihood function and return it as the second output argument.",
    )
    parser.add_argument("--base", type=float, default=1.6, help="The logarithmic base to use, defaults to e")
    parser.add_argument("--dtype", help="dtype")
    parser.add_argument("--med", help="med")
    parser.add_argument("--cdf", help="cdf")
    parser.add_argument("--zero_method", help="zero_method options")
    parser.add_argument("--dist", help="dist options")
    parser.add_argument("--ties", help="ties options")
    parser.add_argument("--alternative", help="alternative options")
    parser.add_argument("--mode", help="mode options")
    parser.add_argument("--method", help="method options")
    parser.add_argument("--md", help="md options")
    parser.add_argument("--center", help="center options")
    parser.add_argument("--kind", help="kind options")
    parser.add_argument("--tail", help="tail options")
    parser.add_argument("--interpolation", help="interpolation options")
    parser.add_argument("--statistic", help="statistic options")

    args = parser.parse_args()
    infile = args.infile
    outfile = open(args.outfile, "w+")
    test_id = args.test_id
    nf = args.nf
    mf = args.mf
    imbda = args.imbda
    inclusive1 = args.inclusive1
    inclusive2 = args.inclusive2
    sample0 = 0
    sample1 = 0
    sample2 = 0
    if args.sample_cols != None:
        sample0 = 1
        barlett_samples = []
        for sample in args.sample_cols.split(";"):
            barlett_samples.append(map(int, sample.split(",")))
    if args.sample_one_cols != None:
        sample1 = 1
        sample_one_cols = args.sample_one_cols.split(",")
    if args.sample_two_cols != None:
        sample_two_cols = args.sample_two_cols.split(",")
        sample2 = 1
    for line in open(infile):
        sample_one = []
        sample_two = []
        cols = line.strip().split("\t")
        if sample0 == 1:
            b_samples = columns_to_values(barlett_samples, line)
        if sample1 == 1:
            for index in sample_one_cols:
                sample_one.append(cols[int(index) - 1])
        if sample2 == 1:
            for index in sample_two_cols:
                sample_two.append(cols[int(index) - 1])
        if test_id.strip() == "describe":
            size, min_max, mean, uv, bs, bk = stats.describe(map(float, sample_one))
            cols.append(size)
            cols.append(min_max)
            cols.append(mean)
            cols.append(uv)
            cols.append(bs)
            cols.append(bk)
        elif test_id.strip() == "mode":
            vals, counts = stats.mode(map(float, sample_one))
            cols.append(vals)
            cols.append(counts)
        elif test_id.strip() == "nanmean":
            m = stats.nanmean(map(float, sample_one))
            cols.append(m)
        elif test_id.strip() == "nanmedian":
            m = stats.nanmedian(map(float, sample_one))
            cols.append(m)
        elif test_id.strip() == "kurtosistest":
            z_value, p_value = stats.kurtosistest(map(float, sample_one))
            cols.append(z_value)
            cols.append(p_value)
        elif test_id.strip() == "variation":
            ra = stats.variation(map(float, sample_one))
            cols.append(ra)
        elif test_id.strip() == "itemfreq":
            freq = stats.itemfreq(map(float, sample_one))
            for list in freq:
                elements = ",".join(map(str, list))
                cols.append(elements)
        elif test_id.strip() == "nanmedian":
            m = stats.nanmedian(map(float, sample_one))
            cols.append(m)
        elif test_id.strip() == "variation":
            ra = stats.variation(map(float, sample_one))
            cols.append(ra)
        elif test_id.strip() == "boxcox_llf":
            IIf = stats.boxcox_llf(imbda, map(float, sample_one))
            cols.append(IIf)
        elif test_id.strip() == "tiecorrect":
            fa = stats.tiecorrect(map(float, sample_one))
            cols.append(fa)
        elif test_id.strip() == "rankdata":
            r = stats.rankdata(map(float, sample_one), method=args.md)
            cols.append(r)
        elif test_id.strip() == "nanstd":
            s = stats.nanstd(map(float, sample_one), bias=args.bias)
            cols.append(s)
        elif test_id.strip() == "anderson":
            A2, critical, sig = stats.anderson(map(float, sample_one), dist=args.dist)
            cols.append(A2)
            for list in critical:
                cols.append(list)
            cols.append(",")
            for list in sig:
                cols.append(list)
        elif test_id.strip() == "binom_test":
            p_value = stats.binom_test(map(float, sample_one), n=args.n, p=args.p)
            cols.append(p_value)
        elif test_id.strip() == "gmean":
            gm = stats.gmean(map(float, sample_one), dtype=args.dtype)
            cols.append(gm)
        elif test_id.strip() == "hmean":
            hm = stats.hmean(map(float, sample_one), dtype=args.dtype)
            cols.append(hm)
        elif test_id.strip() == "kurtosis":
            k = stats.kurtosis(map(float, sample_one), axis=args.axis, fisher=args.fisher, bias=args.bias)
            cols.append(k)
        elif test_id.strip() == "moment":
            n_moment = stats.moment(map(float, sample_one), n=args.n)
            cols.append(n_moment)
        elif test_id.strip() == "normaltest":
            k2, p_value = stats.normaltest(map(float, sample_one))
            cols.append(k2)
            cols.append(p_value)
        elif test_id.strip() == "skew":
            skewness = stats.skew(map(float, sample_one), bias=args.bias)
            cols.append(skewness)
        elif test_id.strip() == "skewtest":
            z_value, p_value = stats.skewtest(map(float, sample_one))
            cols.append(z_value)
            cols.append(p_value)
        elif test_id.strip() == "sem":
            s = stats.sem(map(float, sample_one), ddof=args.ddof)
            cols.append(s)
        elif test_id.strip() == "zscore":
            z = stats.zscore(map(float, sample_one), ddof=args.ddof)
            for list in z:
                cols.append(list)
        elif test_id.strip() == "signaltonoise":
            s2n = stats.signaltonoise(map(float, sample_one), ddof=args.ddof)
            cols.append(s2n)
        elif test_id.strip() == "percentileofscore":
            p = stats.percentileofscore(map(float, sample_one), score=args.score, kind=args.kind)
            cols.append(p)
        elif test_id.strip() == "bayes_mvs":
            c_mean, c_var, c_std = stats.bayes_mvs(map(float, sample_one), alpha=args.alpha)
            cols.append(c_mean)
            cols.append(c_var)
            cols.append(c_std)
        elif test_id.strip() == "sigmaclip":
            c, c_low, c_up = stats.sigmaclip(map(float, sample_one), low=args.m, high=args.n)
            cols.append(c)
            cols.append(c_low)
            cols.append(c_up)
        elif test_id.strip() == "kstest":
            d, p_value = stats.kstest(
                map(float, sample_one), cdf=args.cdf, N=args.N, alternative=args.alternative, mode=args.mode
            )
            cols.append(d)
            cols.append(p_value)
        elif test_id.strip() == "chi2_contingency":
            chi2, p, dof, ex = stats.chi2_contingency(
                map(float, sample_one), correction=args.correction, lambda_=args.lambda_
            )
            cols.append(chi2)
            cols.append(p)
            cols.append(dof)
            cols.append(ex)
        elif test_id.strip() == "tmean":
            if nf == 0 and mf == 0:
                mean = stats.tmean(map(float, sample_one))
            else:
                mean = stats.tmean(map(float, sample_one), (mf, nf), (inclusive1, inclusive2))
            cols.append(mean)
        elif test_id.strip() == "tmin":
            if mf == 0:
                min = stats.tmin(map(float, sample_one))
            else:
                min = stats.tmin(map(float, sample_one), lowerlimit=mf, inclusive=args.inclusive)
            cols.append(min)
        elif test_id.strip() == "tmax":
            if nf == 0:
                max = stats.tmax(map(float, sample_one))
            else:
                max = stats.tmax(map(float, sample_one), upperlimit=nf, inclusive=args.inclusive)
            cols.append(max)
        elif test_id.strip() == "tvar":
            if nf == 0 and mf == 0:
                var = stats.tvar(map(float, sample_one))
            else:
                var = stats.tvar(map(float, sample_one), (mf, nf), (inclusive1, inclusive2))
            cols.append(var)
        elif test_id.strip() == "tstd":
            if nf == 0 and mf == 0:
                std = stats.tstd(map(float, sample_one))
            else:
                std = stats.tstd(map(float, sample_one), (mf, nf), (inclusive1, inclusive2))
            cols.append(std)
        elif test_id.strip() == "tsem":
            if nf == 0 and mf == 0:
                s = stats.tsem(map(float, sample_one))
            else:
                s = stats.tsem(map(float, sample_one), (mf, nf), (inclusive1, inclusive2))
            cols.append(s)
        elif test_id.strip() == "scoreatpercentile":
            if nf == 0 and mf == 0:
                s = stats.scoreatpercentile(
                    map(float, sample_one), map(float, sample_two), interpolation_method=args.interpolation
                )
            else:
                s = stats.scoreatpercentile(
                    map(float, sample_one), map(float, sample_two), (mf, nf), interpolation_method=args.interpolation
                )
            for list in s:
                cols.append(list)
        elif test_id.strip() == "relfreq":
            if nf == 0 and mf == 0:
                rel, low_range, binsize, ex = stats.relfreq(map(float, sample_one), args.b)
            else:
                rel, low_range, binsize, ex = stats.relfreq(map(float, sample_one), args.b, (mf, nf))
            for list in rel:
                cols.append(list)
            cols.append(low_range)
            cols.append(binsize)
            cols.append(ex)
        elif test_id.strip() == "binned_statistic":
            if nf == 0 and mf == 0:
                st, b_edge, b_n = stats.binned_statistic(
                    map(float, sample_one), map(float, sample_two), statistic=args.statistic, bins=args.b
                )
            else:
                st, b_edge, b_n = stats.binned_statistic(
                    map(float, sample_one),
                    map(float, sample_two),
                    statistic=args.statistic,
                    bins=args.b,
                    range=(mf, nf),
                )
            cols.append(st)
            cols.append(b_edge)
            cols.append(b_n)
        elif test_id.strip() == "threshold":
            if nf == 0 and mf == 0:
                o = stats.threshold(map(float, sample_one), newval=args.new)
            else:
                o = stats.threshold(map(float, sample_one), mf, nf, newval=args.new)
            for list in o:
                cols.append(list)
        elif test_id.strip() == "trimboth":
            o = stats.trimboth(map(float, sample_one), proportiontocut=args.proportiontocut)
            for list in o:
                cols.append(list)
        elif test_id.strip() == "trim1":
            t1 = stats.trim1(map(float, sample_one), proportiontocut=args.proportiontocut, tail=args.tail)
            for list in t1:
                cols.append(list)
        elif test_id.strip() == "histogram":
            if nf == 0 and mf == 0:
                hi, low_range, binsize, ex = stats.histogram(map(float, sample_one), args.b)
            else:
                hi, low_range, binsize, ex = stats.histogram(map(float, sample_one), args.b, (mf, nf))
            cols.append(hi)
            cols.append(low_range)
            cols.append(binsize)
            cols.append(ex)
        elif test_id.strip() == "cumfreq":
            if nf == 0 and mf == 0:
                cum, low_range, binsize, ex = stats.cumfreq(map(float, sample_one), args.b)
            else:
                cum, low_range, binsize, ex = stats.cumfreq(map(float, sample_one), args.b, (mf, nf))
            cols.append(cum)
            cols.append(low_range)
            cols.append(binsize)
            cols.append(ex)
        elif test_id.strip() == "boxcox_normmax":
            if nf == 0 and mf == 0:
                ma = stats.boxcox_normmax(map(float, sample_one))
            else:
                ma = stats.boxcox_normmax(map(float, sample_one), (mf, nf), method=args.method)
            cols.append(ma)
        elif test_id.strip() == "boxcox":
            if imbda == 0:
                box, ma, ci = stats.boxcox(map(float, sample_one), alpha=args.alpha)
                cols.append(box)
                cols.append(ma)
                cols.append(ci)
            else:
                box = stats.boxcox(map(float, sample_one), imbda, alpha=args.alpha)
                cols.append(box)
        elif test_id.strip() == "histogram2":
            h2 = stats.histogram2(map(float, sample_one), map(float, sample_two))
            for list in h2:
                cols.append(list)
        elif test_id.strip() == "ranksums":
            z_statistic, p_value = stats.ranksums(map(float, sample_one), map(float, sample_two))
            cols.append(z_statistic)
            cols.append(p_value)
        elif test_id.strip() == "ttest_1samp":
            t, prob = stats.ttest_1samp(map(float, sample_one), map(float, sample_two))
            for list in t:
                cols.append(list)
            for list in prob:
                cols.append(list)
        elif test_id.strip() == "ansari":
            AB, p_value = stats.ansari(map(float, sample_one), map(float, sample_two))
            cols.append(AB)
            cols.append(p_value)
        elif test_id.strip() == "linregress":
            slope, intercept, r_value, p_value, stderr = stats.linregress(
                map(float, sample_one), map(float, sample_two)
            )
            cols.append(slope)
            cols.append(intercept)
            cols.append(r_value)
            cols.append(p_value)
            cols.append(stderr)
        elif test_id.strip() == "pearsonr":
            cor, p_value = stats.pearsonr(map(float, sample_one), map(float, sample_two))
            cols.append(cor)
            cols.append(p_value)
        elif test_id.strip() == "pointbiserialr":
            r, p_value = stats.pointbiserialr(map(float, sample_one), map(float, sample_two))
            cols.append(r)
            cols.append(p_value)
        elif test_id.strip() == "ks_2samp":
            d, p_value = stats.ks_2samp(map(float, sample_one), map(float, sample_two))
            cols.append(d)
            cols.append(p_value)
        elif test_id.strip() == "mannwhitneyu":
            mw_stats_u, p_value = stats.mannwhitneyu(
                map(float, sample_one), map(float, sample_two), use_continuity=args.mwu_use_continuity
            )
            cols.append(mw_stats_u)
            cols.append(p_value)
        elif test_id.strip() == "zmap":
            z = stats.zmap(map(float, sample_one), map(float, sample_two), ddof=args.ddof)
            for list in z:
                cols.append(list)
        elif test_id.strip() == "ttest_ind":
            mw_stats_u, p_value = stats.ttest_ind(
                map(float, sample_one), map(float, sample_two), equal_var=args.equal_var
            )
            cols.append(mw_stats_u)
            cols.append(p_value)
        elif test_id.strip() == "ttest_rel":
            t, prob = stats.ttest_rel(map(float, sample_one), map(float, sample_two), axis=args.axis)
            cols.append(t)
            cols.append(prob)
        elif test_id.strip() == "mood":
            z, p_value = stats.mood(map(float, sample_one), map(float, sample_two), axis=args.axis)
            cols.append(z)
            cols.append(p_value)
        elif test_id.strip() == "shapiro":
            W, p_value, a = stats.shapiro(map(float, sample_one), map(float, sample_two), args.reta)
            cols.append(W)
            cols.append(p_value)
            for list in a:
                cols.append(list)
        elif test_id.strip() == "kendalltau":
            k, p_value = stats.kendalltau(
                map(float, sample_one), map(float, sample_two), initial_lexsort=args.initial_lexsort
            )
            cols.append(k)
            cols.append(p_value)
        elif test_id.strip() == "entropy":
            s = stats.entropy(map(float, sample_one), map(float, sample_two), base=args.base)
            cols.append(s)
        elif test_id.strip() == "spearmanr":
            if sample2 == 1:
                rho, p_value = stats.spearmanr(map(float, sample_one), map(float, sample_two))
            else:
                rho, p_value = stats.spearmanr(map(float, sample_one))
            cols.append(rho)
            cols.append(p_value)
        elif test_id.strip() == "wilcoxon":
            if sample2 == 1:
                T, p_value = stats.wilcoxon(
                    map(float, sample_one),
                    map(float, sample_two),
                    zero_method=args.zero_method,
                    correction=args.correction,
                )
            else:
                T, p_value = stats.wilcoxon(
                    map(float, sample_one), zero_method=args.zero_method, correction=args.correction
                )
            cols.append(T)
            cols.append(p_value)
        elif test_id.strip() == "chisquare":
            if sample2 == 1:
                rho, p_value = stats.chisquare(map(float, sample_one), map(float, sample_two), ddof=args.ddof)
            else:
                rho, p_value = stats.chisquare(map(float, sample_one), ddof=args.ddof)
            cols.append(rho)
            cols.append(p_value)
        elif test_id.strip() == "power_divergence":
            if sample2 == 1:
                stat, p_value = stats.power_divergence(
                    map(float, sample_one), map(float, sample_two), ddof=args.ddof, lambda_=args.lambda_
                )
            else:
                stat, p_value = stats.power_divergence(map(float, sample_one), ddof=args.ddof, lambda_=args.lambda_)
            cols.append(stat)
            cols.append(p_value)
        elif test_id.strip() == "theilslopes":
            if sample2 == 1:
                mpe, met, lo, up = stats.theilslopes(map(float, sample_one), map(float, sample_two), alpha=args.alpha)
            else:
                mpe, met, lo, up = stats.theilslopes(map(float, sample_one), alpha=args.alpha)
            cols.append(mpe)
            cols.append(met)
            cols.append(lo)
            cols.append(up)
        elif test_id.strip() == "combine_pvalues":
            if sample2 == 1:
                stat, p_value = stats.combine_pvalues(
                    map(float, sample_one), method=args.med, weights=map(float, sample_two)
                )
            else:
                stat, p_value = stats.combine_pvalues(map(float, sample_one), method=args.med)
            cols.append(stat)
            cols.append(p_value)
        elif test_id.strip() == "obrientransform":
            ob = stats.obrientransform(*b_samples)
            for list in ob:
                elements = ",".join(map(str, list))
                cols.append(elements)
        elif test_id.strip() == "f_oneway":
            f_value, p_value = stats.f_oneway(*b_samples)
            cols.append(f_value)
            cols.append(p_value)
        elif test_id.strip() == "kruskal":
            h, p_value = stats.kruskal(*b_samples)
            cols.append(h)
            cols.append(p_value)
        elif test_id.strip() == "friedmanchisquare":
            fr, p_value = stats.friedmanchisquare(*b_samples)
            cols.append(fr)
            cols.append(p_value)
        elif test_id.strip() == "fligner":
            xsq, p_value = stats.fligner(center=args.center, proportiontocut=args.proportiontocut, *b_samples)
            cols.append(xsq)
            cols.append(p_value)
        elif test_id.strip() == "bartlett":
            T, p_value = stats.bartlett(*b_samples)
            cols.append(T)
            cols.append(p_value)
        elif test_id.strip() == "levene":
            w, p_value = stats.levene(center=args.center, proportiontocut=args.proportiontocut, *b_samples)
            cols.append(w)
            cols.append(p_value)
        elif test_id.strip() == "median_test":
            stat, p_value, m, table = stats.median_test(
                ties=args.ties, correction=args.correction, lambda_=args.lambda_, *b_samples
            )
            cols.append(stat)
            cols.append(p_value)
            cols.append(m)
            cols.append(table)
            for list in table:
                elements = ",".join(map(str, list))
                cols.append(elements)
        outfile.write("%s\n" % "\t".join(map(str, cols)))
    outfile.close()
Example #40
0
# bartlett(*args): This function performs Bartlett's test for equal variances
# levene(args, *kwds): This function performs Levene's test for equal variances
# shapiro(x[, a, reta]): This function performs the Shapiro-Wilk test for normality
# anderson(x[, dist]): This function performs the Anderson-Darling test for data coming from a particular distribution
# anderson_ksamp(samples[, midrank]): This function performs the Anderson-Darling test for k samples
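## A minimal sketch (added for illustration, using made-up synthetic samples) of the
## variance and normality tests listed in the comments above.
import numpy as np
from scipy import stats

_rng = np.random.default_rng(42)
_x = _rng.normal(loc=0.0, scale=1.0, size=200)
_y = _rng.normal(loc=0.0, scale=1.5, size=200)

print(stats.bartlett(_x, _y))           # equal variances, assumes normality
print(stats.levene(_x, _y))             # equal variances, robust to non-normality
print(stats.shapiro(_x))                # normality of a single sample
print(stats.anderson(_x, dist='norm'))  # Anderson-Darling against a normal distribution
print(stats.anderson_ksamp([_x, _y]))   # k-sample Anderson-Darling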

## one sample t_test
t_statistic, p_value = stats.ttest_1samp(a= engineering_breaks, popmean= breaks.mean())

## Bartlett for equal variance and then t_test
fig, axes = plt.subplots(1, 2, figsize=(14, 4))
sns.boxplot(x='acl', y='Grade', data=student, ax=axes[0])
sns.pointplot(x='acl', y='Grade', data=student, ax=axes[1])
grades_low = student.loc[student['acl'] == 'low', 'Grade']
grades_high = student.loc[student['acl'] == 'High', 'Grade']
stats.bartlett(grades_low, grades_high)
stats.ttest_ind(grades_low, grades_high, equal_var=True)
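## A sketch of the gating step implied by the heading above (an addition, not in the
## original snippet): choose the t-test's equal_var flag from Bartlett's p-value.
bart_stat, bart_p = stats.bartlett(grades_low, grades_high)
t_stat, t_p = stats.ttest_ind(grades_low, grades_high, equal_var=(bart_p > 0.05))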

A = np.random.normal(25.0, 5.0, 100000) 
B = np.random.normal(25.0, 5.0, 100000) 
stats.ttest_ind(A, B) 

%%R
confint(lm(booking_successful ~ assignment-1, data=tot_success))

## chi-square test for goodness of fit
observed = [102, 178, 186, 34] 
expected = [156, 165.5, 147, 31.5] 
chi_squared, p_value = stats.chisquare(f_obs= observed, f_exp= expected)  
chi_squared, p_value
athelete = data[data['Athlete'] == 1]['MileMinDur']
nonathelete = data[data['Athlete'] == 0]['MileMinDur']
# Converting the dataset from hh:mm:ss format to a numerical value: running time in minutes
athelete=athelete.astype(str).values.reshape(athelete.size,1)
nonathelete=nonathelete.astype(str).values.reshape(nonathelete.size,1)
athelete=athelete[numpy.where(athelete!=[' '])]
nonathelete=nonathelete[numpy.where(nonathelete!=[' '])]
for i in range(numpy.shape(athelete)[0]) :
	h,m,s=athelete[i].split(':')
	athelete[i]=int(h)*60+int(m)+(int(s)/60.)
for j in range(numpy.shape(nonathelete)[0]) :
	h,m,s=nonathelete[j].split(':')
	nonathelete[j]=int(h)*60+int(m)+(int(s)/60.)	
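# A vectorized alternative (an added sketch, not part of the original script): pandas can
# parse hh:mm:ss strings directly, avoiding the per-element split loops above. Blank
# entries are coerced to NaT and dropped, mirroring the filtering done earlier.
import pandas as pd
athelete_minutes = (pd.to_timedelta(data[data['Athlete'] == 1]['MileMinDur'], errors='coerce')
                    .dropna().dt.total_seconds() / 60.0)
nonathelete_minutes = (pd.to_timedelta(data[data['Athlete'] == 0]['MileMinDur'], errors='coerce')
                       .dropna().dt.total_seconds() / 60.0)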
#Defining significance level 
alpha=0.05
#Performing Bartlett's Test. This tests whether the population variances are equal
t, p = stats.bartlett(athelete,nonathelete)
print 'Bartlett test statistic is', t, 'and p-value', p
# Decision rule: reject or fail to reject the null hypothesis based on the p-value and significance level
if p <= alpha:
	print 'p-value is too small, null hypothesis is rejected. Variances of the data sets are not equal'
else:
	print 'P-value is greater than significance level',alpha,', Bartlett test fails to reject the null hypothesis. Variances are similar'
#Performing Levene's Test. This tests whether the population variances are equal
t, p = stats.levene(athelete,nonathelete)
print 'Levene test statistic is', t, 'and p-value', p
# Decision rule: reject or fail to reject the null hypothesis based on the p-value and significance level
if p <= alpha:
	print 'p-value is too small, null hypothesis is rejected. Variances of the data sets are not equal'
else:
	print 'P-value is greater than significance level',alpha,', Levene test fails to reject the null hypothesis. Variances are similar'
#Performing Welch's Test. This tests whether the population means are equal
Example #42
0
def analysis_3(df_Coredata, setumei, mokuteki):
	"""一元配置分散分析 : 等分散検定 バートレットの検定 """
	# http://lang.sist.chukyo-u.ac.jp/classes/PythonProbStat/Python-statistics6.html
	# https://py3math.hatenablog.com/entry/oneway-anovatests1

	# The data used come from the textbook "すぐできる生物統計".
	#
	# One-way ANOVA:
	# Overview:
	# Whether the two groups differ significantly from each other.
	#
	# Null hypothesis:
	# The two groups have the same ***.


	#List of unique levels
	u = df_Coredata[setumei].unique()
	print( "Number of levels : " + str(len(u)) )
	
	#listNumb = range(10)
	#print(listNumb)	

	#Align to the smallest group size; using the largest causes problems with np.sum.
	valList = []
	j = 0
	for i in u:
		valList.append( len(df_Coredata[ (df_Coredata[setumei] == i) ] ) )
		

	print( "最小サンプル数 : " + str(min(valList)) )
	minVal = min(valList)

	df_temp = pd.DataFrame(index = range(minVal))
	
	#Define a DataFrame whose index is a sequence starting from 0
	for i in u:
		#print(df_Coredata[ (df_Coredata[setumei] == i) ])
		df_temp_2 = df_Coredata[ (df_Coredata[setumei] == i) ]
		df_temp_2.index = range(len(df_temp_2))
		df_temp[i] = df_temp_2[mokuteki]

	print(df_temp)

	GroupAverageMatrix =np.ones(df_temp.shape)
	for i in range(df_temp.shape[1]):
		GroupAverageMatrix[:,i] = df_temp.mean().iloc[i]

	InGroup = np.array(df_temp - GroupAverageMatrix)

	InGroupSquareSum = np.sum(InGroup**2)

	OverallMean = np.sum(df_temp.mean())/len(df_temp.columns)

	InterGroup =GroupAverageMatrix - np.ones(df_temp.shape)*OverallMean

	InterGroupSquareSum = np.sum(InterGroup**2)

	Dividend = InterGroupSquareSum / (len(df_temp.columns) - 1.0)

	Divider = InGroupSquareSum /( ( len(df_temp.index) -1.0)*len(df_temp.columns))

	print(Divider)
	
	
	#### Derive the F value
	print( df_temp[ u[0:len(u)] ] )

	res = f_oneway( df_temp[ u[0] ] , df_temp[ u[1] ], df_temp[ u[2] ], df_temp[ u[3] ] )

	print("f_oneway : " + str( res ))

	F_value, p_value = res

	# Check whether the null hypothesis is rejected.
	if p_value < 0.05:
		print('p-value: {} < 0.05'.format(p_value))
		print('The null hypothesis is rejected.')
	else:
		print('p-value: {} > 0.05'.format(p_value))
		print('The null hypothesis is not rejected.')

	#Bartlett's test
	bt_results = stats.bartlett( df_temp[ u[0] ] , df_temp[ u[1] ], df_temp[ u[2] ], df_temp[ u[3] ] )
	print(bt_results)
def calc_ttest_dict(a, b, paired=False):
    '''
    Calculate the comparison between the two sets of data
    
    Importantly, although the stars will be the same, this code
    accurately applies either a Student's t, Welch's t, or Mann Whitney U
    test
    '''
    # Import what you need
    import numpy as np
    from scipy.stats import ttest_ind, ttest_rel, bartlett, mannwhitneyu, normaltest, wilcoxon
    
    stats_dict = {}
    
    # Mask out the not a numbers
    a = [ x for x in a if not np.isnan(x) ]
    b = [ x for x in b if not np.isnan(x) ]

    # Save number of people in each group
    stats_dict['n'] = (len(a), len(b))
    
    # Conduct test for equal variance
    stats_dict['eqvar'] = bartlett(a, b)
    
    # Conduct test for normality
    stats_dict['normal'] = normaltest(np.hstack([a, b]))
    
    # When you test for equal means (ttest) you have different options
    # depending on if you have equal variances or not. You can also
    # run the non-parametric Mann Whitney U test
    # Alternatively these data may be paired so there's also the
    # paired t-test and the Wilcoxon signed rank test
    
    # All five will be entered in the stats_dict
    
    # Conduct Welch's t-test (unequal variances)
    stats_dict['ttest_uneqvar'] = ttest_ind(a, b, equal_var = False)

    # Conduct standard student's t-test (equal variances)
    stats_dict['ttest_eqvar'] = ttest_ind(a, b, equal_var = True)

    # Conduct mann whitney U test (non-parametric test of medians)
    stats_dict['mannwhitneyu'] = mannwhitneyu(a, b)
    
    if paired:
        # Conduct the paired student's t-test
        stats_dict['ttest_paired'] = ttest_rel(a, b)
    
        # Conduct Wilcoxon signed rank test (non-parametric *paired* test of medians)
        stats_dict['wilcoxon'] = wilcoxon(a, b)

    # Save in the stats dict the various other measures you might
    # want to report
    stats_dict['medians'] = [np.percentile(a, 50), np.percentile(b, 50)]
    stats_dict['percentile25'] = [np.percentile(a, 25), np.percentile(b, 25)]
    stats_dict['percentile75'] = [np.percentile(a, 75), np.percentile(b, 75)]
    stats_dict['means'] = [np.mean(a), np.mean(b)]
    stats_dict['stds'] = [np.std(a), np.std(b)]
    stats_dict['dfs'] = [(float(stats_dict['n'][0])-1), (float(stats_dict['n'][1])-1)]
    stats_dict['pooled_std'] = np.sqrt( (float(stats_dict['dfs'][0])*(float(stats_dict['stds'][0])**2)
                                     + float(stats_dict['dfs'][1])*(float(stats_dict['stds'][1])**2))
                                     / (float(stats_dict['dfs'][0]) + float(stats_dict['dfs'][1])))
    
    if paired:
        stats_dict['mean_difference'] = np.mean(np.array(b)-np.array(a))
        stats_dict['std_difference'] = np.std(np.array(b)-np.array(a))
        stats_dict['median_difference'] = np.percentile(np.array(b)-np.array(a), 50) 
        stats_dict['percentile25_difference'] = np.percentile(np.array(b)-np.array(a), 25) 
        stats_dict['percentile75_difference'] = np.percentile(np.array(b)-np.array(a), 75)
        stats_dict['cohens_d'] = float(stats_dict['mean_difference']) / float(stats_dict['pooled_std'])
        stats_dict['cohens_d_paired'] = float(stats_dict['mean_difference']) / float(stats_dict['std_difference'])

    return stats_dict
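# A minimal usage sketch (an addition, not from the original module), using made-up
# synthetic samples to show the kind of dictionary calc_ttest_dict returns.
if __name__ == '__main__':
    import numpy as np
    rng = np.random.default_rng(1)
    group_a = rng.normal(0.0, 1.0, size=40)
    group_b = rng.normal(0.5, 1.0, size=40)
    demo_stats = calc_ttest_dict(group_a, group_b)
    # demo_stats['ttest_uneqvar'] holds the (t, p) pair from Welch's t-test
    print(demo_stats['n'], demo_stats['ttest_uneqvar'], demo_stats['mannwhitneyu'])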
    cur.append(stats.mstats.normaltest(Y)[1])
    #plt.hist(Y)
    #stats.probplot(Y, dist="norm", plot=pylab)
    
    # F-test, strong normality required
    F = np.var(Xr)/np.var(Y)
    df1 = len(Xr) - 1; df2 = len(Y) - 1
    alpha = 0.05 #Or whatever you want your alpha to be.
    p_value = stats.f.sf(F, df1, df2) # p-value = 1-CDF
    cur.append(p_value)    
    if p_value < alpha:
        print "Reject the null hypothesis that Var(X) == Var(Y)"
    else:
        print "equal variance !"
    
    cur.append(stats.bartlett(Xr,Y)[1]) # require normal
    cur.append(stats.levene(Xr,Y,center='median')[1]) # for non-normal samples
    
    # t-test, after equal variance, test for mean
    cur.append(stats.ttest_ind(Xr, Y)[1])
    cur.append(stats.ttest_ind(Xr, Y, equal_var=False)[1])
    cur.append(stats.mannwhitneyu(Xr, Y)[1])
    out.append(cur)

alpha = .5
outary = np.array(out)
col = 0
nrej = sum(outary[:,col]<alpha)
print 'Fraction of reject normaltest on Xr: %.2f' % (nrej*1./nbootstrapp)
col = 1
nrej = sum(outary[:,col]<alpha)
Example #45
0

lexperiments = ['e140515']

for expname in lexperiments:
    datainfo = experiments[expname]
    f = h5py.File(datainfo.dpath + datainfo.name + '.hdf5', 'r')

    rslt = 2400000
    step = 2400000
    d = f[datainfo.datafiles[0] + '/' + 'RawFiltered']

    datainit = d[0:step, 5]
#    print datainfo.datafiles[0], np.mean(datainit), np.std(datainit)
    for dfile in range(1,len(datainfo.datafiles)):
        d = f[datainfo.datafiles[dfile] + '/' + 'RawFiltered']
        datanext = d[0:step, 5]
        bval, pval = bartlett(datainit, datanext)
        print datainfo.datafiles[dfile], np.mean(datanext), np.std(datanext)
        print datainfo.datafiles[dfile], bval, pval, '*'
        datainit = datanext

        # for s in range(len(datainfo.sensors)):
        #     print dfile, datainfo.sensors[s]
        #     for pos in range(1, length):
        #         pvar = np.mean(d[pos*step:(pos*step)+rslt, s])
        #         pmean = np.std(d[pos*step:(pos*step)+rslt, s])
        #         #print pmean, pvar
                    # show_signal(d[(pos-1)*step:((pos-1)*step)+rslt, s])
                    # show_signal(d[pos*step:(pos*step)+rslt, s])
from scipy import stats


#let's first try Bartlett's test:
angles = np.load('/Users/cyrilrocke/Documents/c_elegans/data/test1/data/angles.npy')

#check the variances:
Vars = []
for i in range(48):
    Vars.append(np.var(angles[:,i]))

features = []
for i in range(48):
    features.append(angles[:,i])

if stats.bartlett(*features)[1] < 0.05:
    print("We can't use KMeans")

#now let's test the assumption that cluster sizes are approximately uniformly
#distributed. This test isn't as important as the first. 
all_postures = np.load('/Users/cyrilrocke/Documents/c_elegans/data/arrays/all_postures.npy')
    
ALL = []
for i in range(39):
    ALL+=all_postures[i].split(' ')
    ALL.remove('')

ALL = [int(i) for i in ALL]

if stats.chisquare(ALL)[1] < 0.05:
    print('chisquare fail')
Example #47
0
 def test_data(self):
     args = [g1, g2, g3, g4, g5, g6, g7, g8, g9, g10]
     T, pval = stats.bartlett(*args)
     assert_almost_equal(T,20.78587342806484,7)
     assert_almost_equal(pval,0.0136358632781,7)
# 2samplepaired_braindata.py:  
import pandas
from scipy import stats
import numpy
# reading data file
data = pandas.read_csv('http://www.scipy-lectures.org/_downloads/brain_size.csv', sep=';', na_values=".")
#Defining significance level 
alpha=0.05
#Performing Bartlett's Test. This tests whether the population variances are equal
t, p = stats.bartlett(data['FSIQ'], data['PIQ'])
print 'Bartlett test statistic is', t, 'and p-value', p
# Decision rule: reject or fail to reject the null hypothesis based on the p-value and significance level
if p <= alpha:
	print 'p-value is too small, null hypothesis is rejected. Variances of the data sets are not equal'
else:
	print 'P-value is greater than significance level',alpha,', Bartlett test fails to reject the null hypothesis. Variances are similar'
#Performing Levene's Test. This tests whether the population variances are equal
t, p = stats.levene(data['FSIQ'], data['PIQ']) 
print 'Levene test statistic is', t, 'and p-value', p
# Decision rule: reject or fail to reject the null hypothesis based on the p-value and significance level
if p <= alpha:
	print 'p-value is too small, null hypothesis is rejected. Variances of the data sets are not equal'
else:
	print 'P-value is greater than significance level',alpha,', Levene test fails to reject the null hypothesis. Variances are similar'
#Performing Paired Samples t-test. 
# Null Hypothesis: Mean Full Scale IQ (FSIQ) and Mean Performance IQ (PIQ), measured on the same individuals, are equal. " 
t, p=stats.ttest_rel(data['FSIQ'], data['PIQ'])  
print 'Paired t-test statistic is', t, 'and p-value', p
#Calculation of difference between mean point estimates:
diffmean=abs(numpy.mean(data['FSIQ']) - numpy.mean(data['PIQ']))
# Decision rule: reject or fail to reject the null hypothesis based on the p-value and significance level
Example #49
0
def statdesc(data, missing='NaN', labels=[], alpha=.05, show=2):
    """
    Descriptive statistics of data.

    This function calculates the following statistics for each column
    (variable) of the input: mean and unbiased standard deviation [1]_, 95%
    confidence interval (confidence limits for the mean) with unknown
    population STD [2]_, minimum and maximum, median, 25th and 75th percentiles
    [3]_, test for normality (Shapiro-Wilk's test) [4]_, and a test for
    equality of variances for all columns (Levene's or Bartlett's test) [5]_.

    This function also generates plots (if matplotlib is available) to
    visualize the data and shows the calculated statistics on screen.

    Parameters
    ----------
    data : array_like
        1D or 2D (column oriented) numerical data with possible missing values

    missing : string ('nan') or number (int or float), optional
        option to enter a number representing missing values (default = 'nan')

    labels : list of strings, optional
        labels for each column (variable) in data

    alpha : float, optional
        statistical significance level (to decide which test for equality of
        variances to use)

    show : integer (0 or 1 or 2), optional
        option to show plots with some descriptive statistics (0: don't show
        any plot; 1: show plots only for the grouped data; 2: show plots for
        individual data as well as for the grouped data (default))

    Returns
    -------
    m_sd : array
        mean and unbiased standard deviation of each column (variable) in data

    ci : array
        95% confidence interval (confidence limits for the mean) with unknown
        population STD for each column (variable) in data

    min_max : array
        minimum and maximum of each column (variable) in data

    quartiles : array
        median, 25th and 75th percentiles of each column (variable) in data

    normality : array
        test for normality of each column (variable) in data (Shapiro-Wilk's
        test)

    eq_var : array
        test for equality of variances for all columns (variables) in data
        (Levene's or Bartlett's test)

    References
    ----------
    .. [1] http://www.itl.nist.gov/div898/handbook/eda/section3/eda356.htm
    .. [2] http://www.itl.nist.gov/div898/handbook/prc/section1/prc14.htm.
    .. [3] http://www.itl.nist.gov/div898/handbook/prc/section2/prc252.htm.
    .. [4] http://www.itl.nist.gov/div898/handbook/prc/section2/prc213.htm.
    .. [5] http://www.itl.nist.gov/div898/handbook/eda/section3/eda35a.htm.

    Examples
    --------
    >>> import numpy as np
    >>> from statdesc import statdesc
    >>> y = np.random.randn(20,3)
    >>> statdesc(y)                # use the default options
    >>> y[8:12,1] = np.NaN         # add a missing value
    >>> y[12,1] = 2                # add another missing value
    >>> statdesc(y, 2, ['A','B'], .01)        # set arguments (treat 2 as missing)
    >>> m_sd,ci,minmax,quartiles,normality,eq_var = statdesc(y)

    See Also
    --------
    scipy.stats.describe : Computes several descriptive statistics of the
    passed array

    """

    data = np.asarray(data)  # convert the input to array
    if len(data.shape) == 1:
        data = data.reshape(data.shape[0], 1)
    # missing data: don't use a masked array, some functions don't handle that
    if isinstance(missing, (int, float)) and ~np.isnan(missing):
        # if the missing option is a string it must be 'NaN', in which case data already contains NaN
        data[data == missing] = np.NaN

    m_sd = np.zeros((data.shape[1], 2)) * np.NaN
    ci = np.zeros((data.shape[1], 2)) * np.NaN
    min_max = np.zeros((data.shape[1], 2)) * np.NaN
    quartiles = np.zeros((data.shape[1], 3)) * np.NaN
    normality = np.zeros((data.shape[1], 2)) * np.NaN
    eq_var = np.zeros((1, 2)) * np.NaN
    x = []
    nmiss = 0
    min_len = 0

    for i in range(data.shape[1]):
        # due to missing data, each column can have different length;
        # use list of arrays
        x.append(data[~np.isnan(data[:, i]), i])
        nmiss += data.shape[0] - x[i].shape[0]  # total number of missing values
        # skip empty array (data column with missing data only)
        if x[i].shape[0] == 0:
            print('Skipping column %d, only missing data' % (i + 1))
            continue
        # at least 2 sets with 3 points to test for equality of variances
        if x[i].shape[0] > 2:
            min_len += 1
        # handle labels
        if len(labels) > i and labels[i]:
            pass
        else:
            if len(labels) > i:
                labels[i] = str(i+1)
            else:
                labels.append(str(i+1))
        # summary statistics
        m_sd[i], ci[i], min_max[i], quartiles[i], normality[i] = summary(x[i])
        if show > 1 and plt:  # PLOT
            #plot for each variable
            plot1var(data[:, i], x[i], m_sd[i], min_max[i], normality[i],
                     labels[i], alpha, data.shape[1])

    # remove empty arrays (data columns with missing data only)
    i = 0
    while i < len(x):
        if x[i].size == 0:
            x.pop(i)
        else:
            i += 1

    # test for equality of variances
    if len(x) > 1 and min_len > 1:
        # at least 2 sets with 3 points to run this function
        # Levene's test is an alternative to the Bartlett test. The Levene test
        # is less sensitive than the Bartlett test to departures from normality.
        # For data with a normal distribution, Bartlett's test has better
        # performance.
        if np.all(normality[:, 1] > .05):
            eq_var[0] = stats.bartlett(*x)
        else:
            eq_var[0] = stats.levene(*x, center='median')

    if show and plt:  # PLOT
        if data.shape[1] > 1:
            #summary plot
            plotallvar(data, x, min_max, eq_var, min_len, alpha, labels)
            #scatterplot matrix
            scatterplot(data, x, label=labels)

    #print results on screen
    statprint(m_sd, ci, min_max, quartiles, normality, eq_var,
              labels, alpha, data.shape[0], data.shape[1], nmiss, len(x))

    return m_sd, ci, min_max, quartiles, normality, eq_var
Example #50
0
    for fiber in fiber_list:
        mod = Model(lambda x, a, b: a * x + b)
        slope_displ = mod.fit(fiber.binned_exp['static_fr_mean'],
                              x=fiber.binned_exp['displ_mean'],
                              a=1, b=1).best_values['a']
        slope_force = mod.fit(fiber.binned_exp['static_fr_mean'],
                              x=fiber.binned_exp['force_mean'],
                              a=1, b=1).best_values['a']
        slope_displ_list.append(slope_displ)
        slope_force_list.append(slope_force)
    slope_displ_arr = np.array(slope_displ_list)
    slope_force_arr = np.array(slope_force_list)
    sensitivity_df = pd.DataFrame(
        np.c_[slope_displ_arr, slope_force_arr],
        index=['#' + str(i+1) for i in range(slope_displ_arr.size)],
        columns=['Displacement sensitivity (Hz/mm)',
                 'Force sensitivity (Hz/mN)'])
    for column in sensitivity_df.columns:
        sensitivity_df[column[:5] + '_normalized'] = sensitivity_df[column] /\
            sensitivity_df[column].median()
    sensitivity_df.transpose().to_excel('./csvs/sensitivity.xlsx')
    print(sensitivity_df.var())
    from scipy.stats import f, bartlett, levene
    print(f.cdf(sensitivity_df['Displ_normalized'].var() /
                sensitivity_df['Force_normalized'].var(),
          sensitivity_df.shape[0], sensitivity_df.shape[0]))
    print(bartlett(sensitivity_df['Displ_normalized'],
                   sensitivity_df['Force_normalized']))
    print(levene(sensitivity_df['Displ_normalized'],
                 sensitivity_df['Force_normalized']))
Example #51
0
 def test_result_attributes(self):
     args = [g1, g2, g3, g4, g5, g6, g7, g8, g9, g10]
     res = stats.bartlett(*args)
     attributes = ('statistic', 'pvalue')
     check_named_results(res, attributes)
Example #52
0
 def test_empty_arg(self):
     args = (g1, g2, g3, g4, g5, g6, g7, g8, g9, g10, [])
     assert_equal((np.nan, np.nan), stats.bartlett(*args))
def main():
    def n_digits(num):
        if num <= 1:
            return 1
        return math.ceil(math.log(num) / math.log(10))

    db = sqlite.connect(db_fn)
    dbc = db.cursor()
    rows = []
    integer_digits = {'best': 0,
                      'best_time': 0,
                      'mean': 0,
                      'stddev': 0}
    allvals = []
    allvals_dict = {}
    for variant in VARIANTS:
        query = ("select tw from (select min(treewidth) as tw from validationresults where variant='%(variant)s' and instance='%(instance)s' group by seed)")
        result = dbc.execute(query % {'variant': variant, 'instance': instance})
        vals = NP.array([row[0] for row in result])
        min, mean, stddev = vals.min(), vals.mean(), vals.std()
        # print('%s: vals=%r' % (variant, vals), file=sys.stderr)
        W, p = STATS.shapiro(vals)
        print('%s: normal distribution? shapiro-wilk: W=%s (p=%s) %s@5%% %s@2%%' % (variant, W, p, 'no' if W <= .905 else 'yes', 'no' if W <= .884 else 'yes'), file=sys.stderr)
        z, p = STATS.skewtest(vals)
        print('%s: normal distribution? skew test: (z=%s) p=%s => %s' % (variant, z, p, 'no' if p < .5 else 'yes'), file=sys.stderr)
        allvals.append(vals)
        allvals_dict[variant] = vals

        query = ("select min(runtime_s)"
                 " from validationresults"
                 " where variant='%(variant)s' and instance='%(instance)s' and treewidth='%(treewidth)s'")
        result = dbc.execute(query % {'variant': variant, 'instance': instance, 'treewidth': min})
        best_time = [row[0] for row in result][0]
        # print("%s: best=%s @ %ss, avg=%s +- %s" % (variant, min, best_time, mean, stddev), file=sys.stderr)
        row = {'variant': variant,
               'best': min,
               'best_time': round(best_time, 1),
               'mean': round(mean, 1),
               'stddev': round(stddev, 1)}
        rows.append(row)
        integer_digits['best'] = max(integer_digits['best'], n_digits(row['best']))
        integer_digits['best_time'] = max(integer_digits['best_time'], n_digits(row['best_time']))
        integer_digits['mean'] = max(integer_digits['mean'], n_digits(row['mean']))
        integer_digits['stddev'] = max(integer_digits['stddev'], n_digits(row['stddev']))
    db.close()
    T, p = STATS.bartlett(*allvals)
    print('equal variances? bartlett: T=%s (p=%s) [vs Chi-Quadrat_{k-1=%s, alpha=.5}]' % (T, p, len(allvals) - 1), file=sys.stderr)
    W, p = STATS.levene(*allvals, center='mean')
    print('equal variances? levene (mean): (W=%s) p=%s' % (W, p), file=sys.stderr)
    W, p = STATS.levene(*allvals, center='median')
    print('equal variances? levene (median): (W=%s) p=%s' % (W, p), file=sys.stderr)
    F, p = STATS.f_oneway(*allvals)
    print('equal means? one-way ANOVA: F=%s, p=%s [vs F_{k-1=%s,n-k=%s}]' % (F, p, len(allvals) - 1, sum([len(x) for x in allvals]) - len(allvals)), file=sys.stderr)
    try:
        W, p = STATS.kruskal(*allvals)
        print('equal means? kruskal wallis: W=%s, p=%s' % (W, p), file=sys.stderr)
    except Exception as e:
        print(e)
    lsd = LSD.LSD(allvals, .05)
    print('LSD: %r' % lsd, file=sys.stderr)
    print(statsmodels.stats.multicomp.pairwise_tukeyhsd(NP.array(allvals).ravel(), NP.array([[x] * 20 for x in VARIANTS]).ravel(), alpha=.10), file=sys.stderr)
    print(statsmodels.stats.multicomp.pairwise_tukeyhsd(NP.array(allvals).ravel(), NP.array([[x] * 20 for x in VARIANTS]).ravel(), alpha=.05), file=sys.stderr)

    def welch(var1, var2):
        res = STATS.ttest_ind(allvals_dict[var1], allvals_dict[var2], equal_var=False)
        print('%4s vs %s  t,p=%r => \t%s @a=10%%, %s @a=5%%'
              % (var1, var2, res, 'NE' if res[1] < .01116 else ' E', 'NE' if res[1] < .00568 else ' E'), file=sys.stderr)

    print('pairwise Welch\'s t-test with Bonferroni correction:', file=sys.stderr)
    welch('IHA', 'MA1')
    welch('IHA', 'MA2')
    welch('IHA', 'MA3')
    welch('GAtw', 'MA1')
    welch('GAtw', 'MA2')
    welch('GAtw', 'MA3')
    welch('MA1', 'MA2')
    welch('MA1', 'MA3')
    welch('MA2', 'MA3')

    def mannwhitneyu(var1, var2):
        try:
            res = STATS.mannwhitneyu(allvals_dict[var1], allvals_dict[var2])
            print('%4s vs %s  u,p=%r => \t%s @a=10%%, %s @a=5%%'
                  % (var1, var2, res, 'NE' if res[1] < .01116 else ' E', 'NE' if res[1] < .00568 else ' E'), file=sys.stderr)
        except Exception as e:
            print('%4s vs %s  failed: %r' % (var1, var2, e))

    print('pairwise Mann-Whitney U test with Bonferroni correction:', file=sys.stderr)
    mannwhitneyu('IHA', 'MA1')
    mannwhitneyu('IHA', 'MA2')
    mannwhitneyu('IHA', 'MA3')
    mannwhitneyu('GAtw', 'MA1')
    mannwhitneyu('GAtw', 'MA2')
    mannwhitneyu('GAtw', 'MA3')
    mannwhitneyu('MA1', 'MA2')
    mannwhitneyu('MA1', 'MA3')
    mannwhitneyu('MA2', 'MA3')
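    # Sketch (not part of the original script): assuming VARIANTS holds the five
    # variant names used above, the nine hard-coded pairs passed to welch() and
    # mannwhitneyu() (every pairing except IHA vs GAtw) could be generated
    # instead of listed, e.g.:
    #
    #   import itertools
    #   for var1, var2 in itertools.combinations(VARIANTS, 2):
    #       if {var1, var2} == {'IHA', 'GAtw'}:
    #           continue  # this pairing is not tested above
    #       welch(var1, var2)
    #       mannwhitneyu(var1, var2)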

    #latex = [r'\begin{sidefigure}{caption={Results for instance \Instance{%(instanceTexEsc)s}},label={fig:%(instanceFileEsc)s-results},place={htbp}}''\n'
             #r'   \begin{center}''\n'
    latex = [r'\begin{table}[hbtp]''\n'
             r'   \caption{Results for instance \Instance{%(instanceTexEsc)s}}''\n'
             r'   \label{fig:%(instanceFileEsc)s-results}''\n'
             r'   \centering\small''\n'
             r'      \begin{tabular}{l S[table-format=%(best)s] S[table-format=%(best_time)s.1]%%''\n'
             r'                      S[table-format=%(mean)s.1,table-number-alignment=right] @{$\,\pm\,$} S[table-format=%(stddev)s.1,table-number-alignment=left]''\n'
             r'                      S[table-format=2]} \toprule''\n'
             r'         & \multicolumn{2}{c}{\header{Best}} & \multicolumn{2}{c}{\header{Average}} & \\ \cmidrule(lr){2-3}\cmidrule(lr){4-5}''\n'
             r'         & \header{treewidth} & \header{seconds} & \multicolumn{2}{c}{\header{treewidth}} & \header{samples} \\ \midrule'
             % dict(integer_digits.items() | dict(instanceTexEsc=instance.replace('_', r'\textunderscore{}'), instanceFileEsc=instance.replace('_', '-')).items())]
    for row in rows:
        latex.append(' ' * (3 * 3) + ' & '.join([row['variant'], str(row['best']), str(row['best_time']), str(row['mean']), str(row['stddev']), "20"]) + r'\\')
    latex.append(r'         \bottomrule''\n'
                 r'      \end{tabular}''\n'
                 r'\end{table}')
                 #r'   \end{center}''\n'
                 #r'\end{sidefigure}')

    with open('validation-validationset-%s-results.tex' % instance.replace('_', '-'), 'w') as f:
        print('\n'.join(latex), file=f)
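    # Each generated body row of the tabular has the shape
    # "<variant> & <best> & <best_time> & <mean> & <stddev> & 20\\"
    # (the trailing 20 is the hard-coded sample count per variant).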
Example #54
0
def ttest(data, dataLabel=None, paired=False, decimals=4):
    """
    Perform a t-test using Scipy.stats
    
    Parameters
    ----------
    data : dict
        Data in the format {'group1': [dataset], 'group2': [dataset]}.
    dataLabel : string (default: None)
        Title to use for the printed report.
    paired : Boolean (default: False)
        Set True for a paired t-test, False for an unpaired (independent samples) test.
    decimals : int (default: 4)
        Number of decimals in the formatted printout.

    Returns
    -------
    (df, t, p) : tuple
        df : degrees of freedom calculated assuming unequal variance (Welch-Satterthwaite)
        t : t statistic for the difference
        p : p value
    """
    
    # test calling values
    if data is None or not isinstance(data, dict) or len(data) != 2:
        raise ValueError('RStats.ttest: data must be a dictionary with exactly 2 keys' +
            '\nUse KW (anova) for more than 2 groups')
            
    k = list(data.keys())
    g1 = data[k[0]]
    g2 = data[k[1]]
    n1 = len(g1)
    n2 = len(g2)
    (w1, p1) = Stats.shapiro(g1)  # Shapiro-Wilk normality test for each group
    (w2, p2) = Stats.shapiro(g2)
    Tb, pb = Stats.bartlett(g1, g2)  # Bartlett's test for equal variances
    equalVar = pb > 0.05  # fail to reject equal variances at alpha = 0.05
    if paired:
        (t, p) = Stats.ttest_rel(g1, g2)  # paired test; equal_var does not apply
    else:
        (t, p) = Stats.ttest_ind(g1, g2, equal_var=equalVar)
    g1mean = np.mean(g1)
    g1std = np.std(g1, ddof=1)  # sample standard deviation
    g2mean = np.mean(g2)
    g2std = np.std(g2, ddof=1)
    # Welch-Satterthwaite approximation of the degrees of freedom:
    # df = (s1^2/n1 + s2^2/n2)^2 / ((s1^2/n1)^2/(n1-1) + (s2^2/n2)^2/(n2-1))
    df = (g1std**2 / n1 + g2std**2 / n2)**2 / ((g1std**2 / n1)**2 / (n1 - 1) + (g2std**2 / n2)**2 / (n2 - 1))
    if dataLabel is not None:
        testtype = 'Independent'
        if paired:
            testtype = 'Paired'
        n = max([len(label) for label in k])  # field width for the group labels
        print('\n%s T-test, data set = %s' % (testtype, dataLabel))
        if p1 >= 0.05 and p2 >= 0.05:  # Shapiro-Wilk: fail to reject normality
            print(u'  Both data sets appear normally distributed: Shapiro-Wilk Group 1 p = {:6.3f}, Group2 p = {:6.3f}'.format(p1, p2))
        else:
            print(u'  ****At least one data set is NOT normally distributed****\n      Shapiro-Wilk Group 1 p = {:6.3f}, Group2 p = {:6.3f}'.format(p1, p2))
            print(u'    (performing test anyway, as requested)')
        if equalVar:
            print(u"  Variances are equivalent (Bartlett's test, p = {:.3f})".format(pb))
        else:
            print(u"  Variances are unequal (Bartlett's test, p = {:.3f}); not assuming equal variances".format(pb))
        print(u'  {:s}={:8.{pc}f}\u00B1{:.{pc}f}  (mean, SD)'.format(k[0].rjust(n), g1mean, g1std, pc=decimals))
        print(u'  {:s}={:8.{pc}f}\u00B1{:.{pc}f}  (mean, SD)'.format(k[1].rjust(n), g2mean, g2std, pc=decimals))
        print(u'  t({:6.2f})={:8.4f}   p={:8.6f}\n'.format(df, float(t), float(p)))
    return (df, float(t), float(p))
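
# Minimal usage sketch (not part of the original example); the group names and
# values are made up for illustration:
#
#   groups = {'control': [0.8, 1.1, 0.9, 1.2, 1.0, 1.3],
#             'treated': [1.5, 1.7, 1.4, 1.9, 1.6, 1.8]}
#   df, t, p = ttest(groups, dataLabel='illustrative data', decimals=3)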