Esempio n. 1
0
 def test_equal_mean_median(self):
     x = np.linspace(-1, 1, 21)
     y = x ** 3
     W1, pval1 = stats.levene(x, y, center="mean")
     W2, pval2 = stats.levene(x, y, center="median")
     assert_almost_equal(W1, W2)
     assert_almost_equal(pval1, pval2)
Esempio n. 2
0
 def test_trimmed1(self):
     """``center='trimmed'`` with ``proportiontocut=0`` must match ``center='mean'``."""
     samples = (g1, g2, g3)
     mean_stat, mean_p = stats.levene(*samples, center='mean')
     # Cutting 0% of the data leaves every sample untouched.
     trimmed_stat, trimmed_p = stats.levene(
         *samples, center='trimmed', proportiontocut=0.0)
     assert_almost_equal(mean_stat, trimmed_stat)
     assert_almost_equal(mean_p, trimmed_p)
Esempio n. 3
0
 def test_equal_mean_median(self):
     x = np.linspace(-1,1,21)
     np.random.seed(1234)
     x2 = np.random.permutation(x)
     y = x**3
     W1, pval1 = stats.levene(x, y, center='mean')
     W2, pval2 = stats.levene(x2, y, center='median')
     assert_almost_equal(W1, W2)
     assert_almost_equal(pval1, pval2)
Esempio n. 4
0
 def test_trimmed2(self):
     x = [1.2, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 100.0]
     y = [0.0, 3.0, 3.5, 4.0, 4.5, 5.0, 5.5, 200.0]
     # Use center='trimmed'
     W1, pval1 = stats.levene(x, y, center="trimmed", proportiontocut=0.125)
     # Trim the data here, and use center='mean'
     W2, pval2 = stats.levene(x[1:-1], y[1:-1], center="mean")
     # Result should be the same.
     assert_almost_equal(W1, W2)
     assert_almost_equal(pval1, pval2)
Esempio n. 5
0
 def test_trimmed1(self):
     """Trimming nothing (``proportiontocut=0``) reproduces ``center='mean'``."""
     by_mean = stats.levene(g1, g2, g3, center='mean')
     by_trimmed = stats.levene(
         g1, g2, g3, center='trimmed', proportiontocut=0.0)
     # Index 0 is the statistic, index 1 the p-value.
     assert_almost_equal(by_mean[0], by_trimmed[0])
     assert_almost_equal(by_mean[1], by_trimmed[1])
def main():
    """Compare weekday and weekend comment counts and print the p-values.

    Reads line-delimited JSON from the path given in ``sys.argv[1]``, splits
    it with ``filterAndTransform`` and runs, on the ``comment_count`` column:

    * the initial t-test, normality tests and Levene test,
    * "Fix 1": the same diagnostics after a square-root transform,
    * "Fix 2": the diagnostics and a t-test on per-(year, week) means,
    * a two-sided Mann-Whitney U test on the raw counts.

    The results are printed through ``OUTPUT_TEMPLATE``.
    """
    df = pd.read_json(sys.argv[1], lines=True)
    # The first value returned by filterAndTransform is not used here.
    _, weekends_df, weekdays_df = filterAndTransform(df)
    weekend_counts = weekends_df['comment_count']
    weekday_counts = weekdays_df['comment_count']

    # T-test, normality test and variance test
    initial_ttest_p = stats.ttest_ind(weekend_counts, weekday_counts).pvalue
    initial_weekday_normality_p = stats.normaltest(weekday_counts).pvalue
    initial_weekend_normality_p = stats.normaltest(weekend_counts).pvalue
    initial_levene_p = stats.levene(weekday_counts, weekend_counts).pvalue

    # Fix 1: square-root transform
    transformed_weekday_counts = np.sqrt(weekday_counts)
    transformed_weekday_normality_p = stats.normaltest(
        transformed_weekday_counts).pvalue
    transformed_weekend_counts = np.sqrt(weekend_counts)
    transformed_weekend_normality_p = stats.normaltest(
        transformed_weekend_counts).pvalue
    transformed_levene_p = stats.levene(transformed_weekend_counts,
                                        transformed_weekday_counts).pvalue

    # Fix 2: per-week means.  Select the column *before* aggregating so that
    # only 'comment_count' is averaged; averaging the whole frame first fails
    # when any other column is non-numeric.
    week_keys = ['year', 'week']
    weekly_weekday_counts = weekdays_df.groupby(week_keys)['comment_count'].mean()
    weekly_weekday_normality_p = stats.normaltest(weekly_weekday_counts).pvalue
    weekly_weekend_counts = weekends_df.groupby(week_keys)['comment_count'].mean()
    weekly_weekend_normality_p = stats.normaltest(weekly_weekend_counts).pvalue
    weekly_levene_p = stats.levene(weekly_weekday_counts,
                                   weekly_weekend_counts).pvalue
    weekly_ttest_p = stats.ttest_ind(weekly_weekday_counts,
                                     weekly_weekend_counts).pvalue
    utest_p = stats.mannwhitneyu(weekday_counts,
                                 weekend_counts,
                                 alternative='two-sided').pvalue
    # ...

    print(
        OUTPUT_TEMPLATE.format(
            initial_ttest_p=initial_ttest_p,
            initial_weekday_normality_p=initial_weekday_normality_p,
            initial_weekend_normality_p=initial_weekend_normality_p,
            initial_levene_p=initial_levene_p,
            transformed_weekday_normality_p=transformed_weekday_normality_p,
            transformed_weekend_normality_p=transformed_weekend_normality_p,
            transformed_levene_p=transformed_levene_p,
            weekly_weekday_normality_p=weekly_weekday_normality_p,
            weekly_weekend_normality_p=weekly_weekend_normality_p,
            weekly_levene_p=weekly_levene_p,
            weekly_ttest_p=weekly_ttest_p,
            utest_p=utest_p,
        ))
Esempio n. 7
0
def stats_tests():
    """Run four equal-variance tests for every (security, indicator) pair.

    For each cell of the module-level ``container`` the 'all' and 'signal'
    samples are coerced to numeric and compared with Brown-Forsythe (Levene
    centered on the median), Bartlett, Levene (centered on the mean) and
    Fligner-Killeen.  A failing test is recorded in the module-level
    ``errors`` list as ``[security, indicator, test_name]``.  The p-values of
    the rows with all four results are pickled to 'p_values_full' and, in
    melted form, to 'p_values_container_full'.
    """
    global errors
    tests = ['Brown-Forsythe', 'Bartlett', 'Levene', 'Fligner-Killeen']
    # One callable per entry of ``tests``, in the same order.
    runners = [
        lambda full, sig: stats.levene(full, sig, center='median'),
        stats.bartlett,
        lambda full, sig: stats.levene(full, sig, center='mean'),
        stats.fligner,
    ]
    securities = list(container.index)
    indicators = list(container.columns)

    output = pd.DataFrame(index=pd.MultiIndex.from_product([securities, indicators]),
                          columns=tests)

    for security in securities:
        for indicator in indicators:
            cell = container.loc[security][indicator]
            # Non-numeric entries become NaN instead of raising.
            full_sample = pd.to_numeric(pd.Series(cell['all']), errors='coerce')
            signal_sample = pd.to_numeric(pd.Series(cell['signal']), errors='coerce')

            for test_name, run in zip(tests, runners):
                try:
                    # NOTE(review): chained assignment kept from the original;
                    # it may not write through to ``output`` when pandas
                    # copy-on-write is enabled - confirm before upgrading.
                    output.loc[security, indicator][test_name] = run(
                        full_sample, signal_sample)
                except Exception:
                    # Best effort: remember the failure and keep going, but do
                    # not swallow KeyboardInterrupt/SystemExit.
                    errors.append([security, indicator, test_name])

    p_values = output.dropna().applymap(lambda x: x.pvalue).unstack()
    p_values_container = p_values.melt()
    p_values.to_pickle('p_values_full')
    p_values_container.to_pickle('p_values_container_full')
Esempio n. 8
0
def _varAnalysis(df, labels):
    """Test every column of ``df`` for a difference between label groups.

    For each column a Levene test (threshold 0.05) decides which test is used:

    * two groups: Student's t-test, or Welch's t-test when variances differ;
    * more than two groups: one-way ANOVA, or ``pingouin.welch_anova`` when
      variances differ.

    Parameters
    ----------
    df : DataFrame-like with one row per sample; columns are the features.
    labels : group label for every row of ``df``.

    Returns
    -------
    (F, P) : two DataFrames indexed by ``df.columns`` holding the test
        statistic and the p-value of each feature.

    Raises
    ------
    ValueError
        If ``labels`` does not match the number of rows, or fewer than two
        distinct labels are present.
    """
    from scipy.stats import levene
    if df.shape[0] != len(labels):
        raise ValueError(
            "The number of input samples is not equal to labels size")

    label_ = np.unique(labels)
    groups = _split(df, labels)
    n_groups = len(label_)
    if n_groups < 2:
        raise ValueError("Groups for comparison are less than 2!")

    two_groups = n_groups == 2
    if two_groups:
        print('Performing t-test analysis...')
        from scipy.stats import ttest_ind
    else:
        print('Performing anova analysis...')

    F, P = [], []
    for i in range(df.shape[1]):
        # Column i of every group; groups are indexed like 2-D arrays.
        sample = [item[:, i] for item in groups]
        _, p_levene = levene(*sample)
        unequal_var = p_levene < 0.05
        if two_groups:
            f, p = ttest_ind(*sample, equal_var=not unequal_var)
        elif unequal_var:
            # Imported lazily so pingouin is only needed when actually used.
            from pingouin import welch_anova
            meta = pd.DataFrame(df.iloc[:, i])
            meta.columns = ['feature']
            meta['labels'] = labels
            result = welch_anova(data=meta, dv='feature', between='labels')
            f = result['F'].values[0]
            p = result['p-unc'].values[0]
        else:
            from scipy.stats import f_oneway
            f, p = f_oneway(*sample)
        F.append(f)
        P.append(p)

    F = pd.DataFrame(F)
    P = pd.DataFrame(P)
    F.index = df.columns
    P.index = df.columns
    return F, P
Esempio n. 9
0
    def test_trimmed2(self):
        x = [1.2, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 100.0]
        y = [0.0, 3.0, 3.5, 4.0, 4.5, 5.0, 5.5, 200.0]
        np.random.seed(1234)
        x2 = np.random.permutation(x)

        # Use center='trimmed'
        W0, pval0 = stats.levene(x, y, center='trimmed', proportiontocut=0.125)
        W1, pval1 = stats.levene(x2, y, center='trimmed', proportiontocut=0.125)
        # Trim the data here, and use center='mean'
        W2, pval2 = stats.levene(x[1:-1], y[1:-1], center='mean')
        # Result should be the same.
        assert_almost_equal(W0, W2)
        assert_almost_equal(W1, W2)
        assert_almost_equal(pval1, pval2)
Esempio n. 10
0
def anova_oneway():
    ''' One-way ANOVA: test if results from 3 groups are equal.

    Loads 'altman_910.txt' through ``getData``, splits the values in column 0
    into three groups by the code in column 1, warns when Levene's test
    rejects equal variances, prints the ANOVA result, and finally prints the
    same analysis done with a fitted ``ols`` model and ``anova_lm``.
    '''

    # Get the data
    data = getData('altman_910.txt')

    # Sort them into groups, according to column 1
    group1, group2, group3 = (data[data[:, 1] == code, 0] for code in (1, 2, 3))

    # First, check if the variances are equal, with the "Levene"-test
    (W, p) = stats.levene(group1, group2, group3)
    if p < 0.05:
        print('Warning: the p-value of the Levene test is <0.05: p={0}'.format(p))

    # Do the one-way ANOVA
    F_statistic, pVal = stats.f_oneway(group1, group2, group3)

    # Print the results (print() calls throughout: the former print
    # statements were a SyntaxError on Python 3)
    print('Altman 910:')
    print((F_statistic, pVal))
    if pVal < 0.05:
        print('One of the groups is significantly different.')

    # Elegant alternative implementation, with pandas & statsmodels
    df = pd.DataFrame(data, columns=['value', 'treatment'])
    model = ols('value ~ C(treatment)', df).fit()
    print(anova_lm(model))
Esempio n. 11
0
def apply_test(a1, a2, type):
    """Run the named test on two samples at the 95% confidence level.

    Returns whether the p-value is above 0.05, i.e. the null hypothesis is
    not rejected.  For "shapiro" both samples must pass.  An unknown ``type``
    prints a message and returns -1.
    """
    threshold = 0.05

    if type == "shapiro":
        # Normality is checked on each sample separately.
        first_p = stats.shapiro(a1)[1]
        second_p = stats.shapiro(a2)[1]
        return (first_p > threshold and second_p > threshold)

    if type == "levene":
        return stats.levene(a1, a2)[1] > threshold

    if type == "anova":
        return stats.f_oneway(a1, a2)[1] > threshold

    if type == "welch":
        return stats.ttest_ind(a1, a2, equal_var=False)[1] > threshold

    if type == "kruskal":
        return stats.kruskal(a1, a2)[1] > threshold

    print("Test no identificado.")
    return -1
Esempio n. 12
0
def cep_anova(samples_dict):
    '''
    Run an ANOVA, and post-hoc tests when significant, on the samples in
    samples_dict.

    A mean-centered Levene test picks the procedure: a standard one-way ANOVA
    followed by Tukey when variances look equal, otherwise Welch's ANOVA
    followed by Games-Howell.  The returned dict holds 'test', and when a
    test was run 'anova_p', and when that is significant 'posthoc' plus one
    entry per pair of sample keys with the translated post-hoc result.
    '''
    samples_list = samples_dict.values()
    result_dict = {}

    # Homogeneity of variance; element 1 of the result is the p-value.
    equal_var_test_sig = levene(*samples_list, center='mean')[1]
    if pd.isnull(equal_var_test_sig):
        # No usable Levene result: stop here.
        result_dict['test'] = 'N/A'
        return result_dict

    if equal_var_test_sig >= SIG_LEVEL:
        result_dict['test'] = 'Standard'
        anova_result = f_oneway(*samples_list)
    else:
        result_dict['test'] = 'Welch'
        anova_result = welch_anova(*samples_list)

    anova_result_sig = anova_result[1]
    result_dict['anova_p'] = anova_result_sig
    if not anova_result_sig < SIG_LEVEL:
        # Not significant: no post-hoc comparisons.
        return result_dict

    # msw, r and df feed the post-hoc test; only Tukey takes msw and df.
    msw, r, df = get_msw_et_al(*samples_list)
    kwargs_dict = {'r': r}
    if result_dict['test'] == 'Standard':
        result_dict['posthoc'] = 'Tukey'
        posthoc = tukey
        kwargs_dict['msw'] = msw
        kwargs_dict['df'] = df
    else:
        result_dict['posthoc'] = 'Games-Howell'
        posthoc = gh

    # One post-hoc comparison per unordered pair of samples, stored under the
    # tuple of the two sample keys.
    for (key_a, sample_a), (key_b, sample_b) in combinations(samples_dict.items(), 2):
        mean_diff, pval = posthoc(sample_a, sample_b, **kwargs_dict)
        result_dict[(key_a, key_b)] = translate_result(
            pval, mean_diff, sample_a, sample_b)
    return result_dict
Esempio n. 13
0
def get_p_value_by_feature(pd_train, pd_test, feature_name):
    """
    Test whether a feature differs between two groups, so that features
    showing no difference can be dropped.

    Features with more than two distinct values in both groups are treated as
    continuous: a t-test is used when both groups pass a KS normality test
    and Levene's equal-variance test, otherwise a Mann-Whitney U test.
    Features with exactly two distinct values in both groups are compared
    with a chi-square test on the class-count table.

    :param pd_train: first group (e.g. the training set, or the rows with label 1)
    :param pd_test: second group (e.g. the test set, or the rows with label 0)
    :param feature_name: name of the feature column
    :return: p-value rounded to 3 decimals (below 0.05: the groups differ;
        above 0.05: no difference); None when neither case above applies
    """
    train_feature = pd_train[feature_name]
    test_feature = pd_test[feature_name]

    train_feature_class = len(set(train_feature))
    test_feature_class = len(set(test_feature))

    if train_feature_class > 2 and test_feature_class > 2:
        # Continuous variable: use a t-test or a U test.
        train_feature_mean = np.mean(train_feature)
        test_feature_mean = np.mean(test_feature)

        train_feature_std = np.std(train_feature)
        test_feature_std = np.std(test_feature)

        # Check equal variances and normality of each group.
        _, p_value = levene(train_feature, test_feature)
        _, p_value_train = stats.kstest(
            train_feature, "norm", (train_feature_mean, train_feature_std))
        _, p_value_test = stats.kstest(
            test_feature, "norm", (test_feature_mean, test_feature_std))

        if p_value_train >= 0.05 and p_value_test >= 0.05 and p_value >= 0.05:
            _, pvalue_t = ttest_ind(train_feature, test_feature)
            return round(pvalue_t, 3)

        _, p_m_value = mannwhitneyu(train_feature, test_feature)
        return round(p_m_value, 3)

    if train_feature_class == 2 and test_feature_class == 2:
        # Chi-square test.  Rows are aligned by class *label*; pairing the
        # groups by frequency rank would put different classes in one row
        # whenever the majority class differs between the two groups.
        train_counts = Counter(train_feature)
        test_counts = Counter(test_feature)
        categories = list(train_counts)
        categories.extend(c for c in test_counts if c not in train_counts)

        # A Counter returns 0 for a class that is absent from one group.
        kf_data = np.array(
            [[train_counts[c], test_counts[c]] for c in categories])
        _, p_value, _, _ = chi2_contingency(kf_data)
        return round(p_value, 3)
def apply_anova(data):
    """Run a one-way ANOVA per gene column and report how many are significant.

    Every column of ``data`` except 'group' is split with
    ``split_into_groups``; alongside the ANOVA p-value, a Shapiro-Wilk
    p-value (on the pooled values) and a mean-centered Levene p-value are
    stored.  Returns a DataFrame with columns 'gene', 'p_value',
    'shapiro_p_value' and 'levene_p_value'.
    """
    genes = data.drop(columns=['group']).columns
    n_genes = genes.shape[0]

    rows = []
    for position, gene in enumerate(genes):
        _, groups = split_into_groups(data, gene)
        anova = stats.f_oneway(*groups)
        normality = stats.shapiro(np.concatenate(groups))  # normality test
        equal_var = stats.levene(*groups, center='mean')  # homoscedasticity test
        rows.append((gene, anova.pvalue, normality[1], equal_var.pvalue))

        # Refresh the progress line every 100 genes.
        if position % 100 == 0:
            print('Progress {:2.0%}'.format((position / n_genes)), end='\r')

    anova_table = pd.DataFrame(
        rows,
        columns=['gene', 'p_value', 'shapiro_p_value', 'levene_p_value'])

    significant = anova_table['p_value'] < ALPHA
    print(
        'Found {} genes that influence'.format(significant.sum()),
        'health conditions according to ANOVA tests.')

    # Significant genes whose normality and equal-variance checks also pass.
    assumptions_hold = ((anova_table['shapiro_p_value'] > ALPHA) &
                        (anova_table['levene_p_value'] > ALPHA))
    print(
        'Found {} genes that influence'.format(
            (significant & assumptions_hold).sum()),
        'health conditions according to ANOVA Shapiro-Wilks and Levene tests.')

    return anova_table
Esempio n. 15
0
    def _homogeneity_tests(self):
        """Return Levene and Bartlett results for the first two columns.

        The result is a DataFrame indexed by "Levene" and "Bartlett" with the
        columns "Test Statistic" and "P-value", rounded to 3 decimals.
        """
        df = self.__df
        first, second = df.iloc[:, 0], df.iloc[:, 1]
        # Run each test once and reuse both of its outputs.
        levene_stat, levene_p = stats.levene(first, second)
        bartlett_stat, bartlett_p = stats.bartlett(first, second)

        homogeneityTests = pd.DataFrame(
            {
                "Test Statistic": [levene_stat, bartlett_stat],
                "P-value": [levene_p, bartlett_p],
            },
            index=["Levene", "Bartlett"])

        return round(homogeneityTests, 3)
Esempio n. 16
0
def ttestForTwoChoiceQuestions(xValues, yValues):
	"""Compare two samples with a t-test or, failing its assumptions, Kruskal-Wallis.

	The parametric test is used only when both samples pass ``isNormal`` and
	Levene's test does not reject equal variances (p >= 0.05).

	Returns ``(parametric, t, tp)``: whether the t-test was used, the test
	statistic and its p-value.  The Kruskal-Wallis statistic is divided by 5
	before being returned.
	"""
	npArrayX = np.array(xValues)
	npArrayY = np.array(yValues)

	# http://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.normaltest.html#scipy.stats.normaltest
	xIsNormal = isNormal(npArrayX)
	yIsNormal = isNormal(npArrayY)

	if xIsNormal and yIsNormal:
		# Levene test for equal variances
		# http://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.levene.html#scipy.stats.levene
		l, lp = stats.levene(npArrayX, npArrayY)
		# Was `lp >- 0.05`, i.e. `lp > -0.05`, which is true for every p-value.
		parametric = lp >= 0.05
	else:
		parametric = False

	if parametric:
		# if levene test comes out well and samples are normal, can use standard t-test for independent samples
		# http://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.ttest_ind.html#scipy.stats.ttest_ind
		t, tp = stats.ttest_ind(xValues, yValues, axis=0)
	else:
		# if not, use Kruskal-Wallis H-test instead
		# http://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.kruskal.html#scipy.stats.kruskal
		t, tp = stats.kruskal(npArrayX, npArrayY)
		t = t / 5.0 # these come out bigger than the t-test stats

	return parametric, t, tp
Esempio n. 17
0
    def Simulate(self):
        """Generate trees, sample clades and compare original vs surrogate W2.

        For each of ``self.nb_tree`` generated trees, ``self.nb_clades`` nodes
        are drawn at random from those whose position in the age-ordered
        iteration lies in ``[self.cladeThrMin, self.cladeThrMax]``.  For every
        drawn clade the original W2 and a surrogate value are stored.  The
        returned ``Results`` also carries summary statistics: Spearman
        correlation, means, standard deviations and the Levene p-value.
        """
        res = Results()
        res.origW2 = []
        res.surrW2 = []
        res.stats = {}

        for _ in range(self.nb_tree):
            tree = self.treeGenerator.generate(self.tree_size)

            # Keep only the nodes whose rank falls inside the threshold window.
            candidates = []
            ordered_nodes = tree.ageorder_node_iter(include_leaves=True,
                                                    descending=True)
            for rank, node in enumerate(ordered_nodes):
                if self.cladeThrMin <= rank <= self.cladeThrMax:
                    candidates.append(node)

            for clade in random.sample(candidates, self.nb_clades):
                res.origW2.append(computeW2(tree, clade))
                res.surrW2.append(self.surrogateStrat.generate(tree, clade))

        # Summary statistics over all sampled clades.
        correl, correl_pval = spearmanr(res.origW2, res.surrW2)
        res.stats['correl'] = correl
        res.stats['correlPval'] = correl_pval
        res.stats['origW2Mean'] = np.mean(res.origW2)
        res.stats['origW2Std'] = np.std(res.origW2)
        res.stats['surrW2Mean'] = np.mean(res.surrW2)
        res.stats['surrW2Std'] = np.std(res.surrW2)
        # Only the p-value of the Levene test is kept.
        _, res.stats['LevenePVal'] = levene(res.origW2, res.surrW2)

        return res
Esempio n. 18
0
def calc_ttest(data, exp_set, control_set, tags=()):
    """Run per-row Welch and Student t-tests plus a Levene diagnostic.

    For every row label in ``data.index`` the values in the columns named by
    ``exp_set.filenames`` are compared with those named by
    ``control_set.filenames``.  The returned DataFrame (indexed like
    ``data``) holds, for both t-test flavours, the statistic, the nominal
    p-value and the Bonferroni and Benjamini-Hochberg adjusted p-values,
    followed by the Levene p-value.  Column names are built by ``tm.e`` from
    ``tags`` plus descriptive (key, value) pairs.
    """
    exp_cols = list(exp_set.filenames)
    control_cols = list(control_set.filenames)
    multipletests = statsmodels.sandbox.stats.multicomp.multipletests

    def per_probeset(test, **kwargs):
        # Apply ``test`` to the experiment/control values of every row.
        # ``.loc`` replaces the ``.ix`` indexer, which pandas removed in 1.0;
        # all keys used here are labels.
        return [test(data.loc[probeset, exp_cols],
                     data.loc[probeset, control_cols], **kwargs)
                for probeset in data.index]

    # NOTE(review): each column name is now built once and reused; this
    # assumes tm.e is a pure function of its argument - confirm.

    # Welch t-test
    welch_t = tm.e(tags + (("st", "t"), ("tt", "welch ttest")))
    welch_nominal = tm.e(tags + (("st", "pval"), ("tt", "welch ttest"), ("mc", "nominal")))
    rs = pandas.DataFrame(index=data.index,
                          data=per_probeset(st.ttest_ind, equal_var=False),
                          columns=[welch_t, welch_nominal])
    rs[tm.e(tags + (("tt", "welch ttest"), ("st", "pval"), ("mc", "bonf")))] = multipletests(
        rs.loc[:, welch_nominal], method="bonferroni")[1]
    rs[tm.e(tags + (("tt", "welch ttest"), ("st", "pval"), ("mc", "bh")))] = multipletests(
        rs.loc[:, welch_nominal], method="fdr_bh")[1]

    # Student t-test
    d = per_probeset(st.ttest_ind, equal_var=True)
    student_nominal = tm.e(tags + (("st", "pval"), ("tt", "student ttest"), ("mc", "nominal")))
    rs[tm.e(tags + (("st", "t"), ("tt", "student ttest")))] = [v[0] for v in d]
    rs[student_nominal] = [v[1] for v in d]
    rs[tm.e(tags + (("st", "pval"), ("tt", "student ttest"), ("mc", "bonf")))] = multipletests(
        rs.loc[:, student_nominal], method="bonferroni")[1]
    rs[tm.e(tags + (("st", "pval"), ("tt", "student ttest"), ("mc", "bh")))] = multipletests(
        rs.loc[:, student_nominal], method="fdr_bh")[1]

    # do diagnostic tests for heteroskedasticity
    d = per_probeset(st.levene)
    rs[tm.e(tags + (("tt", "levene"), ("st", "pval")))] = [z[1] for z in d]

    # An omnibus normality test (st.normaltest per group, columns tagged
    # ("tt", "d-p omnibus")) used to follow here and is currently disabled.

    return rs
Esempio n. 19
0
 def test_data(self):
     """Levene's test on the ten module-level samples g1..g10 matches stored values."""
     # Look the samples up by name instead of calling eval() on built strings.
     module_globals = globals()
     args = [module_globals['g%d' % k] for k in range(1, 11)]
     W, pval = stats.levene(*args)
     assert_almost_equal(W, 1.7059176930008939, 7)
     assert_almost_equal(pval, 0.0990829755522, 7)
Esempio n. 20
0
    def test(self, arr1, arr2):
        p_value = 0
        if self.statistics == "auto":
            # проверяем Левеном на равенство дисперсий. Если равны
            if stats.levene(arr1, arr2)[1] > 0.05:
                # Шапир на нормальность выборок. Если нормальные
                if stats.shapiro(arr1)[1] > 0.05 and stats.shapiro(arr2)[1] > 0.05:
                    # p = Student
                    p_value = stats.ttest_ind(arr1, arr2)[1]
                else:
                    # p = Mann
                    if equal(arr1, arr2):
                        p_value = 1
                    else:
                        p_value = stats.mannwhitneyu(arr1, arr2)[1]
            else:
                p_value = stats.ttest_ind(arr1, arr2, False)[1]

        elif self.statistics == "student":
            p_value = stats.ttest_ind(arr1, arr2)[1]
        elif self.statistics == "welch":
            p_value = stats.ttest_ind(arr1, arr2, False)[1]
        elif self.statistics == "mann":
            if equal(arr1, arr2):
                p_value = 1
            else:
                p_value = stats.mannwhitneyu(arr1, arr2)[1]
        return p_value
Esempio n. 21
0
def check_different_feature_with_anova_limit2():
    """Print Levene and one-way ANOVA results for every pair of clusters.

    For each pair of entries in the module-level ``cluster_result_transferred``
    and each feature column (index 1 onward, skipping "unknownCount"), the
    feature values of the uids in both clusters are collected and compared;
    one comma-separated line is printed per feature.
    """
    _student_data, headerArray = loadFeatureData()
    cluster_count = len(cluster_result_transferred)

    for i in range(cluster_count - 1):
        for j in range(i + 1, cluster_count):
            print(i, j)
            print("feature,w,p,f,p_f")
            for feature_index in range(1, len(headerArray)):
                feature_name = headerArray[feature_index]
                if feature_name == "unknownCount":
                    continue

                # Column 0 of every data row is the uid.
                feature_by_uid = {
                    item[0]: item[feature_index] for item in _student_data
                }

                # Feature values of each cluster in the pair; uids without
                # data are left out.
                cluster_with_feature = [
                    [feature_by_uid[uid]
                     for uid in cluster_result_transferred[cluster_index]
                     if uid in feature_by_uid]
                    for cluster_index in (i, j)
                ]

                w, p = stats.levene(*cluster_with_feature)
                f, p_f = stats.f_oneway(*cluster_with_feature)

                print(feature_name, ",", w, ",", p, ",", f, ",", p_f)
Esempio n. 22
0
def levene(tamannoMuestras, poblacion):
    """Print the Levene statistic and p-value for four samples.

    Each sample is obtained by calling ``muestra(poblacion, tamannoMuestras)``.
    """
    muestras = [muestra(poblacion, tamannoMuestras) for _ in range(4)]
    estadistico, valor_p = st.levene(*muestras)
    print("Levene Valor Estadistico %f" % estadistico)
    print("Levene Valor p %f" % valor_p)
Esempio n. 23
0
def homogeneity_class_covariances(X_train, y_train):
	"""Summarise per-feature Levene tests across the classes in ``y_train``.

	For every column of ``X_train`` a Levene test compares that column across
	the class subsets.  Returns a 3-tuple:

	* self-weighted average of the statistics with p < 0.05 (NaN if none),
	* self-weighted average of all p-values,
	* fraction of columns with p < 0.05.
	"""
	dims = X_train.shape[1]
	# Rows of X_train belonging to each distinct class.
	per_class = [X_train[y_train == cls] for cls in np.unique(y_train)]

	levene_pval = []
	significant_stats = []
	for j in range(dims):
		l_stat, l_pval = levene(*[rows[:, j] for rows in per_class])
		levene_pval.append(l_pval)
		if l_pval < 0.05:
			significant_stats.append(l_stat)
	levene_success = len(significant_stats)

	# Each value is used as its own weight in the averages below.
	if levene_success > 0:
		levene_stat_avg = np.average(significant_stats, weights=significant_stats)
	else:
		levene_stat_avg = np.nan
	levene_pval_avg = np.average(levene_pval, weights=levene_pval)
	levene_success_ratio = levene_success / dims

	return levene_stat_avg, levene_pval_avg, levene_success_ratio
Esempio n. 24
0
def test_for_side_levene(df_side, lvl=3, hue='value'):
    """Levene test of 'left' vs 'right' for every level and measure.

    Returns a DataFrame indexed by (level name, measure) with the columns
    'statistic' and 'p-value'.  The data for each cell come from
    ``get_data_group_by_player_mean(df_side, level, measure, hue=hue)``.
    """
    from scipy import stats

    level_names = ["lvl_{}".format(n) for n in range(lvl)]
    measures = [
        "start_cm", "rel_pt", "amp_max_cop", 'amp_max_pel', 'amp_max_c7',
        "vel_max_cop", 'vel_max_pel', 'vel_max_c7', "overshoot", "dcm", "dtml",
        "rcm"
    ]
    columns = ['statistic', 'p-value']

    index = pd.MultiIndex.from_product([level_names, measures])
    # Start from an all-NaN table and fill it row by row.
    blank = np.full((len(level_names) * len(measures), len(columns)), np.nan)
    df = pd.DataFrame(blank, index=index, columns=columns)

    for level, level_name in enumerate(level_names):
        for measure in measures:
            v_df = get_data_group_by_player_mean(df_side, level, measure, hue=hue)
            df.loc[(level_name, measure)] = stats.levene(
                v_df['left'], v_df['right'])
    return df
def main():
    """Run Levene's test on the groups read from a CSV file and print the p-value.

    Command-line options: ``--input`` (path to the CSV file, default
    'testData.csv') and ``--level`` (significance level, default 0.05).  The
    printed p-value is followed by '*' when it is below the level.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--input',
        type=str,
        default='testData.csv',
        help='path to the input csv file (default: testData.csv)')
    parser.add_argument('--level',
                        type=float,
                        default=0.05,
                        help='level of significance (default: 0.05)')
    args = parser.parse_args()

    group_list = data_reader(args.input)

    # Pass every group's data straight to levene().  Stacking the groups
    # into one 2-D array first only works when all groups have the same
    # number of observations.
    samples = [group_xi.data_array for group_xi in group_list]
    _, p_value = stats.levene(*samples)

    if p_value < args.level:
        print('p-value = ' + str(p_value) + '*')
    else:
        print('p-value = ' + str(p_value))
Esempio n. 26
0
 def levene(cls, xa, xb):
     """Run a mean-centered Levene test on two samples and print the verdict.

     With p >= 0.05 the null hypothesis is kept (unequal variances cannot be
     claimed); otherwise it is rejected.
     """
     print('2群間: 母平均の95%ルビーン検定による等分散性の検定-------------------start')
     p = st.levene(xa, xb, center='mean')[1]
     verdict = (
         f'p値 = {p:.3f} // 検定結果: 帰無仮説を採択して、2つの標本には等分散性なしとは言えない'
         if p >= 0.05 else
         f'p値 = {p:.3f} // 検定結果: 帰無仮説を棄却して、2つの標本には等分散性なし'
     )
     print(verdict)
Esempio n. 27
0
def find_feature_Ttest(Data, label, use_all=True):
    """Compare a feature between the label == 1 and label == 0 samples.

    ``Data`` and ``label`` are sequences of equal length.  When both groups
    pass ``check_normality`` (p > 0.05) a t-test is used: Student's when
    Levene's test gives p >= 0.05 ('ttest'), Welch's otherwise ('ttes_adj').
    When normality fails and ``use_all`` is true, Mann-Whitney U is used
    ('mannwhitneyu').

    Returns ``[statistic, p_value, method]``; all three are None when
    normality fails and ``use_all`` is false.
    """
    Data = np.array(Data)
    label = np.array(label)

    positiver_f = Data[label == 1]
    negative_f = Data[label == 0]
    ksresult1 = check_normality(positiver_f)
    ksresult2 = check_normality(negative_f)

    # Defaults so the return below is always defined, even if no test runs.
    static_value = p_value = method = None

    if (ksresult1[1] > 0.05) and (ksresult2[1] > 0.05):
        # Check homogeneity of variance.  A NaN p-value compares false and
        # therefore falls back to Welch's test.
        equal_var = stats.levene(positiver_f, negative_f)[1] >= 0.05
        ttestresult = stats.ttest_ind(positiver_f,
                                      negative_f,
                                      equal_var=equal_var)
        static_value = ttestresult[0]
        p_value = ttestresult[1]
        method = 'ttest' if equal_var else 'ttes_adj'
    elif use_all:
        nontestresult = stats.mannwhitneyu(positiver_f, negative_f)
        static_value = nontestresult[0]
        p_value = nontestresult[1]
        method = 'mannwhitneyu'

    return [static_value, p_value, method]
def oneway_anova(df, x, y, W, H, use_hsd=True, plot=True):
    """One-way ANOVA of column ``y`` grouped by column ``x``.

    Prints the per-group means, optionally draws a violin plot of size
    ``(W, H)``, prints whether Levene's test accepts equal variances, prints
    the ANOVA p-value and, when ``use_hsd`` is true, the Tukey HSD summary.

    Returns the table of per-group means.
    """
    # mean_compare_table
    mean_compare_table = df.groupby(x, as_index=False)[[y]].mean()
    print(mean_compare_table)
    if plot:
        plt.figure(figsize=(W, H))
        # x and y are passed by keyword: recent seaborn releases no longer
        # accept them positionally.
        sns.violinplot(x=x, y=y, data=df)

    # One list of y-values per distinct value of x.
    groups = [df.loc[df[x] == val, y].tolist() for val in set(df[x])]

    # Levene only decides which message is printed; f_oneway is used either
    # way, as in the original.
    levene_test = levene(*groups)
    if levene_test.pvalue >= 0.05:
        print("方差齐")
    else:
        print("方差不齐")
    f_value, p_value = f_oneway(*groups)

    # Conclusion
    print(p_value)
    if use_hsd:
        hsd = pairwise_tukeyhsd(endog=df[y], groups=df[x], alpha=0.05)
        print(hsd.summary())
    return mean_compare_table
Esempio n. 29
0
    def _anova_assumptions(self, cl):
        """Build ``self.anova_assump_df`` with normality and variance checks.

        Column ``self.dep_var`` holds the Shapiro-Wilk statistic and p-value
        of the OLS residuals.  One further column per entry of
        ``self.indep_var`` holds an equal-variance test across that factor's
        levels: Bartlett's test when the residuals look normal
        (Shapiro p > ``cl``), Levene's test otherwise.  The table is printed
        when ``self.print_output`` is True.
        """
        arrays = [['Normality (Shapiro-Wilk)', 'Normality (Shapiro-Wilk)', 'Variance', 'Variance'],
                  ['test stats', 'p-value', 'test stats', 'p-value']]

        temp = np.zeros((4, 1+len(self.indep_var)))

        index = [self.dep_var]

        # Experimental errors are normally distributed
        temp[0,0], temp[1,0] = ss.shapiro(self.ols_model.resid)

        # Bartlett's test assumes normality; fall back to Levene's otherwise.
        if temp[1,0] > cl:
            variance_test, test_label = ss.bartlett, ' (Bartlett)'
        else:
            variance_test, test_label = ss.levene, ' (Levene)'

        for i, factor in enumerate(self.indep_var):
            # Every factor needs a column label, whichever test is used; the
            # Levene path used to skip this, leaving fewer labels than
            # columns.
            index.append(factor)
            list_unique = self.df[factor].unique()
            # NOTE(review): the 'accuracy' column is hard-coded here rather
            # than taken from self.dep_var - confirm this is intended.
            args = [self.df.loc[self.df[factor] == x].accuracy for x in list_unique]
            temp[2,i+1], temp[3,i+1] = variance_test(*args)

        arrays[0][2] = arrays[0][2] + test_label
        arrays[0][3] = arrays[0][3] + test_label

        self.anova_assump_df = pd.DataFrame(temp, index=arrays, columns=index)

        # Comparison with True kept exactly as before.
        if self.print_output == True:
            print(' ------------------\n', 'ANOVA assumptions', '\n ------------------')
            print(self.anova_assump_df, '\n')
        return
def ttest(ds1, ds2, p = 0.05):
    """Return the p-value of an independent two-sample t-test.

    Levene's test decides which variant is run: when its p-value is above
    ``p`` the variances are treated as equal (Student's t-test), otherwise
    Welch's t-test is used.
    """
    levene_pvalue = sp.levene(ds1, ds2)[1]
    variances_equal = levene_pvalue > p
    # equal_var=False makes scipy perform Welch's t-test.
    return sp.ttest_ind(ds1, ds2, equal_var = variances_equal)[1]
Esempio n. 31
0
def ANOVA_assumptions_test(R, N, H):
    """Compare three groups with ANOVA, or Kruskal-Wallis if assumptions fail.

    Each group is tested for normality (Shapiro-Wilk) and the three groups
    together for equal variances (Levene). One-way ANOVA is used only when
    every one of those p-values is above 0.05.

    Returns:
        (statistic, p-value, test) where ``test`` is 'ANOVA' or 'kruskal'.
    """
    # Normality p-value per group, followed by the equal-variance p-value.
    assumption_pvalues = [stats.shapiro(sample).pvalue for sample in (R, N, H)]
    assumption_pvalues.append(levene(R, N, H)[1])

    if (np.array(assumption_pvalues) > 0.05).all():
        # f_oneway takes the groups as input and returns F and the p-value.
        fvalue, pvalue = stats.f_oneway(R, N, H)
        return fvalue, pvalue, 'ANOVA'

    fvalue, pvalue = stats.kruskal(R, N, H)
    return fvalue, pvalue, 'kruskal'
Esempio n. 32
0
 def compute(self, model):
     """Score ``model`` with Levene's test on flattened covariances.

     The flattened ``model.covar`` is compared against the flattened
     cross-covariance of ``sts_exp``; the resulting p-value is wrapped in a
     ``LeveneScore``, stored on ``self.score`` and returned.
     """
     # ``sts_exp`` and ``ms`` are not defined in this method; they are
     # presumably module-level names -- confirm.
     experimental_cov = ad.cross_covariance(sts_exp, binsize = 150*ms)
     levene_result = levene(model.covar.ravel(), experimental_cov.ravel())
     self.score = LeveneScore(levene_result.pvalue)
     return self.score
Esempio n. 33
0
def checkParametricConditions(accuracies,alpha):
    """Report whether parametric-test conditions hold for ``accuracies``.

    Independence is taken for granted, normality is checked with
    Shapiro-Wilk on ``accuracies`` as a whole, and equality of variances
    with Levene's test on the unpacked groups. Progress is printed.

    Args:
        accuracies: Sequence of samples; passed whole to ``shapiro`` and
            unpacked as separate groups for ``levene``.
        alpha: Significance level both p-values are compared against.

    Returns:
        True when neither null hypothesis is rejected, False otherwise.
    """
    print("Checking independence ")
    print("Ok")
    independence = True

    # Report the significance level actually used instead of a fixed 0.05.
    print("Checking normality using Shapiro-Wilk's test for normality, alpha=%s" % alpha)
    (W, p) = shapiro(accuracies)
    print("W: %f, p:%f" % (W, p))
    if p < alpha:
        print("The null hypothesis (normality) is rejected")
        normality = False
    else:
        print("The null hypothesis (normality) is accepted")
        normality = True

    # Levene's null hypothesis is equal variances, i.e. homoscedasticity.
    print("Checking heteroscedasticity using Levene's test, alpha=%s" % alpha)
    (W, p) = levene(*accuracies)
    print("W: %f, p:%f" % (W, p))
    if p < alpha:
        print("The null hypothesis (homoscedasticity) is rejected")
        homoscedasticity = False
    else:
        print("The null hypothesis (homoscedasticity) is accepted")
        homoscedasticity = True

    parametric = independence and normality and homoscedasticity
    return parametric
Esempio n. 34
0
def compute_anova(df, clusters):
	"""Run Levene's test on the 'SMSin' column across clusters.

	``clusters['labels']`` is handed to ``set_cluster`` together with ``df``;
	the 'SMSin' values are then grouped by each distinct label found in the
	'#Cluster' column and compared with ``stats.levene``.

	NOTE(review): despite the name, this returns a Levene result, not an ANOVA.
	"""
	labels = clusters['labels']
	set_cluster(df, labels)

	samples = []
	for label in set(labels.values()):
		samples.append(df[df['#Cluster'] == label]['SMSin'])
	return stats.levene(*samples)
Esempio n. 35
0
def levene(data):
    """Levene's test of equal variance (H0: all groups share one variance).

    ``data`` is an iterable of samples, one per group.

    Returns a ``(W, pval)`` tuple: the test statistic and its p-value.
    """
    result = st.levene(*data)
    return (result[0], result[1])
def levene_by_column(df, dummy):
    """Run Levene's equal-variance test on every column of a DataFrame.

    Rows are split into two groups by the value (0 or 1) of the *dummy*
    column; rows with any other value, including missing ones, are ignored.
    Missing values are dropped per column before testing.

    Args:
        df (pd.DataFrame): The data to test.
        dummy (string): Name of the 0/1 grouping column (e.g. "Treatment").

    Returns:
        pd.DataFrame: One row per tested column, holding the Levene test
            statistic and p-value.

    """
    treated = df[df[dummy] == 1].drop(dummy, axis=1)
    control = df[df[dummy] == 0].drop(dummy, axis=1)
    outcomes = [
        stats.levene(control[name].dropna(), treated[name].dropna())
        for name in treated.columns
    ]
    return pd.DataFrame(outcomes,
                        index=treated.columns,
                        columns=["test stat.", "p-value"])
Esempio n. 37
0
def levenes_test(target, feature):
    '''
    Run Levene's test on a response split by a categorical feature.

    PARAMETERS
    ----------
    target: {pandas.Series} the response variable

    feature: {pandas.Series} the categorical feature used to form the groups

    RETURNS
    -------
    results: {pandas.DataFrame} one-row dataframe with the 'Statistic' and
        'p-value' of the test
    '''
    # One sample of the response per distinct category, in order of appearance.
    samples = [target[feature == level] for level in feature.unique()]

    stat, pval = levene(*samples)

    return pd.DataFrame({'Statistic': [stat], 'p-value': [pval]})
Esempio n. 38
0
 def variance_test(self, group_a, group_b):
     """Print Levene's test statistic and p-value for two columns of ``self.df``.

     ``group_a`` and ``group_b`` are the keys used to select the two samples
     from ``self.df``. Nothing is returned.
     """
     banner_open = '-----Variance test--------------------------------------------------'
     banner_close = '-----END Variance test----------------------------------------------\n'
     print(banner_open)
     outcome = levene(self.df[group_a], self.df[group_b])
     print('Statistic: {} and p-value: {} of variance comparison'.format(outcome[0], outcome[1]))
     print(banner_close)
def two_sample_ttest(df, x, y, val_1, val_2, W, H, plot=True):
    """Compare ``y`` between two levels of ``x`` with a two-sample t-test.

    Levene's test picks the variant: the standard equal-variance t-test when
    its p-value is at least 0.05, Welch's t-test otherwise. The t-test
    p-value and a conclusion (at the 0.05 level) are printed.

    Args:
        df: DataFrame holding the data.
        x: Name of the grouping column.
        y: Name of the numeric column being compared.
        val_1, val_2: The two values of ``x`` that define the samples.
        W, H: Figure width and height used for the plots.
        plot: When true, draw a violin plot and per-group density plots.

    Returns:
        DataFrame with the mean of ``y`` for every level of ``x``.
    """
    mean_compare_table = df.groupby(x, as_index=False)[[y]].mean()
    print(mean_compare_table)
    a = df.loc[df[x] == val_1, y].tolist()
    b = df.loc[df[x] == val_2, y].tolist()
    if plot:
        # plot-1
        plt.figure(figsize=(W, H))
        # x and y are passed by keyword: recent seaborn releases no longer
        # accept them positionally.
        sns.violinplot(x=x, y=y, data=df)
        # plot-2
        plt.figure(figsize=(W, H))
        # NOTE(review): ``shade=`` is deprecated in newer seaborn in favour of
        # ``fill=``; kept as-is for compatibility with older releases.
        sns.kdeplot(a, shade=True, label=val_1)
        sns.kdeplot(b, shade=True, label=val_2)
    # T-test: standard independent two-sample test when variances look equal,
    # Welch's t-test otherwise.
    levene_test = levene(a, b)
    variances_equal = bool(levene_test.pvalue >= 0.05)
    t_test = ttest_ind(a, b, equal_var=variances_equal)
    p_value = t_test.pvalue
    # Conclusion
    print(p_value)
    if p_value <= 0.05:
        print("%s 在 %s 上存在显著性差异" % (y, x))
    else:
        print("%s 在 %s 上不存在显著性差异" % (y, x))
    return mean_compare_table
def tTest(data, checking, group, group1, group2, nameGroup1, nameGroup2, x,
          output):
    """Fill ``output[x]`` with Levene and t-test results for two groups.

    The ``checking`` column of ``data`` is split by the ``group`` column into
    the rows equal to ``group1`` and those equal to ``group2``. Written to
    ``output[x]``: the variable name, per-group N / Mean / SD / SE (rounded
    to 2 decimals), the Levene p-value (suffixed with "****" when below
    0.05), the t-test p-value and the Cohen effect size. Nothing is returned.
    """
    def sample_for(level):
        # Values of the checked column for the rows belonging to ``level``.
        return data[checking][data[group] == level]

    output[x]['Variable'] = checking
    leveneResult = stats.levene(sample_for(group1), sample_for(group2),
                                center='mean')

    summary, results = rp.ttest(group1=sample_for(group1),
                                group1_name=nameGroup1,
                                group2=sample_for(group2),
                                group2_name=nameGroup2)

    # Summary row 0 belongs to the first group, row 1 to the second.
    for stat_name in ('N', 'Mean', 'SD', 'SE'):
        for row, label in enumerate((nameGroup1, nameGroup2)):
            output[x][label + ' ' + stat_name] = round(summary.iloc[row][stat_name], 2)

    levene_text = str(round(leveneResult.pvalue, 2))
    if leveneResult.pvalue < 0.05:
        levene_text += "****"
    output[x]['Leneve Value'] = levene_text

    values = results.results
    output[x]["T-Test P Value"] = signifiant(float(values.loc[[3]]))
    output[x]["Cohen Effect Size"] = effectSize(float(values.loc[[6]]))
Esempio n. 41
0
    def test_equal_var():
        '''Levene test for equal variances of ``self.d1`` and ``self.d2``.

        Returns the result of ``stats.levene`` on the ``data`` attribute of
        both objects.
        '''
        # NOTE(review): ``self`` is not a parameter of this function; it is
        # presumably taken from an enclosing scope -- confirm.
        # TODO: replace with an own implementation; scipy.stats is used for now.
        return stats.levene(self.d1.data, self.d2.data)
Esempio n. 42
0
    def return_test_results(self, arr1, arr2):
        """Compare two samples with the test selected by ``self.statistics``.

        ``self.statistics`` may be "student", "welch", "mann" or "auto". In
        "auto" mode Levene's test is consulted first: with equal variances
        (p > 0.05) Student's t-test is used if both samples pass Shapiro-Wilk
        (p > 0.05) and Mann-Whitney otherwise; with unequal variances Welch's
        t-test is used. Any other setting leaves the defaults ("", 0, 0).

        Args:
            arr1: First sample.
            arr2: Second sample.

        Returns:
            list: ``[test_name, statistic, p_value, df, levene_p_value]`` where
            ``df`` is ``len(arr1) + len(arr2) - 2``.
        """
        def student():
            result = stats.ttest_ind(arr1, arr2)
            return "Student", result[0], result[1]

        def welch():
            # equal_var must be passed by keyword: the third positional
            # parameter of ttest_ind is ``axis``, so a positional False never
            # selected Welch's test.
            result = stats.ttest_ind(arr1, arr2, equal_var=False)
            return "Welch", result[0], result[1]

        def mann():
            # Identical samples are reported as "no difference" without
            # calling mannwhitneyu.
            if equal(arr1, arr2):
                return "Mann", None, 1
            result = stats.mannwhitneyu(arr1, arr2)
            return "Mann", result[0], result[1]

        test_name, t_value, p_value = "", 0, 0
        levene = stats.levene(arr1, arr2)[1]

        if self.statistics == "auto":
            # Levene's test checks the equality of variances.
            if levene > 0.05:
                # Shapiro-Wilk checks the normality of both samples.
                both_normal = (stats.shapiro(arr1)[1] > 0.05
                               and stats.shapiro(arr2)[1] > 0.05)
                test_name, t_value, p_value = student() if both_normal else mann()
            else:
                test_name, t_value, p_value = welch()
        elif self.statistics == "student":
            test_name, t_value, p_value = student()
        elif self.statistics == "welch":
            test_name, t_value, p_value = welch()
        elif self.statistics == "mann":
            test_name, t_value, p_value = mann()

        df = len(arr1) + len(arr2) - 2

        return [test_name, t_value, p_value, df, levene]
def anova_oneway():
    ''' One-way ANOVA: test if results from 3 groups are equal.
    
    Twenty-two patients undergoing cardiac bypass surgery were randomized to one of three ventilation groups:
    
    Group I: Patients received 50% nitrous oxide and 50% oxygen mixture continuously for 24 h.
    Group II: Patients received a 50% nitrous oxide and 50% oxygen mixture only during the operation.
    Group III: Patients received no nitrous oxide but received 35-50% oxygen for 24 h.
    
    The data show red cell folate levels for the three groups after 24h' ventilation.
    
    The data are read from 'altman_910.txt' (comma separated; column 0 holds
    the value, column 1 the group number).
    
    Returns:
        (F_statistic, pVal) from scipy's one-way ANOVA.
    '''
    
    # Get the data
    print('One-way ANOVA: -----------------')
    inFile = 'altman_910.txt'
    data = np.genfromtxt(inFile, delimiter=',')
    
    # Sort them into groups, according to column 1
    group1 = data[data[:,1]==1,0]
    group2 = data[data[:,1]==2,0]
    group3 = data[data[:,1]==3,0]
    
    # --- >>> START stats <<< ---
    # First, check if the variances are equal, with the "Levene"-test
    (W,p) = stats.levene(group1, group2, group3)
    if p<0.05:
        print('Warning: the p-value of the Levene test is <0.05: p={0}'.format(p))
    
    # Do the one-way ANOVA
    F_statistic, pVal = stats.f_oneway(group1, group2, group3)
    # --- >>> STOP stats <<< ---
    
    # Print the results
    print('Data form Altman 910:')
    print((F_statistic, pVal))
    if pVal < 0.05:
        print('One of the groups is significantly different.')
        
    # Elegant alternative implementation, with pandas & statsmodels
    df = pd.DataFrame(data, columns=['value', 'treatment'])    
    model = ols('value ~ C(treatment)', df).fit()
    anovaResults = anova_lm(model)
    print(anovaResults)
    
    # Check if the two results are equal. If they are, there is no output.
    # The 'F' column is indexed by term name, so the first entry is taken by
    # position with .iloc rather than with an integer key.
    np.testing.assert_almost_equal(F_statistic, anovaResults['F'].iloc[0])
    
    return (F_statistic, pVal) # should be (3.711335988266943, 0.043589334959179327)
Esempio n. 44
0
 def run(self):
     """Run an equal-variance test on the groups of ``self._data``.

     Bartlett's test is used when ``NormTest`` reports a p-value above
     ``self._alpha``, Levene's test otherwise. The statistic, p-value and
     alpha are merged into ``self._results``; ``self._test`` and
     ``self._name`` record which test ran.

     Raises:
         NoDataError: when there are fewer than two groups.
     """
     # NOTE(review): this size check has no effect (the branch only passes);
     # it looks like an early exit was intended -- confirm.
     if len(self._data) < self._min_size:
         pass
     if len(self._data.groups.values()) <= 1:
         raise NoDataError("Equal variance test requires at least two numeric vectors.")

     looks_normal = NormTest(self._data, display=False, alpha=self._alpha).p_value > self._alpha
     r = 'Bartlett' if looks_normal else 'Levene'
     variance_test = bartlett if looks_normal else levene
     statistic, p_value = variance_test(*self._data.groups.values())
     self._results.update({'p value': p_value, self._statistic_name[r]: statistic, 'alpha': self._alpha})
     self._test = r
     self._name = self._names[r]
Esempio n. 45
0
def cep_ttest(sample_a, sample_b):
    '''
    Run a two-sample t-test whose variant is chosen by Levene's test.

    Sample A and Sample B are array-like data stores; they must offer
    ``mean()`` and ``count()`` (e.g. pandas Series).
    The function returns a dictionary with the following entries:
        "test": "Standard" (equal variance), "Welch" (not equal variance),
                or "N/A" when Levene's test yields no p-value
        "pval": P-value of the test performed
        "verdict": verdict produced by translate_result
        "cohen": Cohen's d value
        "sign": significance marker produced by translate_result
        "g1_n": response count in sample_a
        "g2_n": response count in sample_b
        "g1_mean": mean of sample_a
        "g2_mean": mean of sample_b
    When "test" is "N/A" no other entry is present.
    '''
    result_dict = {}
    # Levene's test (centred on the mean) decides between the two variants;
    # its p-value is the second element of the result.
    levene_pvalue = levene(sample_a, sample_b, center='mean')[1]

    # Without a usable p-value there is nothing more to compute.
    if pd.isnull(levene_pvalue):
        result_dict['test'] = 'N/A'
        return result_dict

    if levene_pvalue >= SIG_LEVEL:
        result_dict['test'] = 'Standard'
        equal_var_arg = True
    elif levene_pvalue < SIG_LEVEL:
        result_dict['test'] = 'Welch'
        equal_var_arg = False

    pvalue = ttest_ind(sample_a, sample_b, axis=0, equal_var=equal_var_arg)[1]
    result_dict['pval'] = pvalue

    # Translate the p-value and mean difference into verdict / sign / effect size.
    verdict, sign, cohens_d = translate_result(
        pvalue, sample_a.mean() - sample_b.mean(), sample_a, sample_b)
    result_dict['cohen'] = cohens_d
    result_dict['verdict'] = verdict
    result_dict['sign'] = sign
    result_dict['g1_n'] = sample_a.count()
    result_dict['g2_n'] = sample_b.count()
    result_dict['g1_mean'] = sample_a.mean()
    result_dict['g2_mean'] = sample_b.mean()
    return result_dict
Esempio n. 46
0
def apply_test(data, group, test):
    '''Apply a two-group test to every column of ``data``.

    data - 2d data array (rows are observations)
    group - group identity of each row; rows equal to 0 and rows equal to 1
            form the two samples
    test - name of the test; only 'levene' is supported

    Returns two arrays (statistic, p-value) with one entry per column.
    Raises NotImplementedError for any other ``test``.
    '''
    n_columns = data.shape[1]
    if test != 'levene':
        raise NotImplementedError('Only levene is implemented currently...')

    statistics = np.zeros(n_columns)
    pvalues = np.zeros(n_columns)
    first, second = group == 0, group == 1
    for col in range(n_columns):
        outcome = stats.levene(data[first, col], data[second, col])
        statistics[col] = outcome[0]
        pvalues[col] = outcome[1]
    return statistics, pvalues
Esempio n. 47
0
def anova_oneway():
    ''' One-way ANOVA: test if results from 3 groups are equal.

    The data are loaded with ``getData`` from 'altman_910.txt'; column 0
    holds the value and column 1 the group number (1, 2 or 3).

    Returns:
        (F_statistic, pVal) from scipy's one-way ANOVA.
    '''

    # Get the data
    print('One-way ANOVA: -----------------')
    # Raw string: the backslashes are path separators, not escape sequences.
    data = getData('altman_910.txt', subDir=r'..\Data\data_altman')

    # Sort them into groups, according to column 1
    group1 = data[data[:, 1] == 1, 0]
    group2 = data[data[:, 1] == 2, 0]
    group3 = data[data[:, 1] == 3, 0]

    # First, check if the variances are equal, with the "Levene"-test
    (W, p) = stats.levene(group1, group2, group3)
    if p < 0.05:
        print('Warning: the p-value of the Levene test is <0.05: p={0}'.format(
            p))

    # Do the one-way ANOVA
    F_statistic, pVal = stats.f_oneway(group1, group2, group3)

    # Print the results
    print('Data form Altman 910:')
    print((F_statistic, pVal))
    if pVal < 0.05:
        print('One of the groups is significantly different.')

    # Elegant alternative implementation, with pandas & statsmodels
    df = pd.DataFrame(data, columns=['value', 'treatment'])
    model = ols('value ~ C(treatment)', df).fit()
    anovaResults = anova_lm(model)
    print(anovaResults)

    # Check if the two results are equal. If they are, there is no output.
    # The 'F' column is indexed by term name, so the first entry is taken by
    # position with .iloc rather than with an integer key.
    np.testing.assert_almost_equal(F_statistic, anovaResults['F'].iloc[0])

    return (F_statistic,
            pVal)  # should be (3.711335988266943, 0.043589334959179327)
Esempio n. 48
0
import os
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--infile", required=True, help="Tabular file.")
    parser.add_argument("-o", "--outfile", required=True, help="Path to the output file.")
    parser.add_argument("--sample_one_cols", help="Input format, like smi, sdf, inchi")
    parser.add_argument("--sample_two_cols", help="Input format, like smi, sdf, inchi")
    parser.add_argument("--sample_cols", help="Input format, like smi, sdf, inchi,separate arrays using ;")
    parser.add_argument("--test_id", help="statistical test method")
    parser.add_argument(
        "--mwu_use_continuity",
        action="store_true",
        default=False,
        help="Whether a continuity correction (1/2.) should be taken into account.",
    )
    parser.add_argument(
        "--equal_var",
        action="store_true",
        default=False,
        help="If set perform a standard independent 2 sample test that assumes equal population variances. If not set, perform Welch's t-test, which does not assume equal population variance.",
    )
    parser.add_argument(
        "--reta", action="store_true", default=False, help="Whether or not to return the internally computed a values."
    )
    parser.add_argument("--fisher", action="store_true", default=False, help="if true then Fisher definition is used")
    parser.add_argument(
        "--bias",
        action="store_true",
        default=False,
        help="if false,then the calculations are corrected for statistical bias",
    )
    parser.add_argument("--inclusive1", action="store_true", default=False, help="if false,lower_limit will be ignored")
    parser.add_argument(
        "--inclusive2", action="store_true", default=False, help="if false,higher_limit will be ignored"
    )
    parser.add_argument("--inclusive", action="store_true", default=False, help="if false,limit will be ignored")
    parser.add_argument(
        "--printextras",
        action="store_true",
        default=False,
        help="If True, if there are extra points a warning is raised saying how many of those points there are",
    )
    parser.add_argument(
        "--initial_lexsort",
        action="store_true",
        default="False",
        help="Whether to use lexsort or quicksort as the sorting method for the initial sort of the inputs.",
    )
    parser.add_argument("--correction", action="store_true", default=False, help="continuity correction ")
    parser.add_argument(
        "--axis",
        type=int,
        default=0,
        help="Axis can equal None (ravel array first), or an integer (the axis over which to operate on a and b)",
    )
    parser.add_argument(
        "--n",
        type=int,
        default=0,
        help="the number of trials. This is ignored if x gives both the number of successes and failures",
    )
    parser.add_argument("--b", type=int, default=0, help="The number of bins to use for the histogram")
    parser.add_argument("--N", type=int, default=0, help="Score that is compared to the elements in a.")
    parser.add_argument("--ddof", type=int, default=0, help="Degrees of freedom correction")
    parser.add_argument("--score", type=int, default=0, help="Score that is compared to the elements in a.")
    parser.add_argument("--m", type=float, default=0.0, help="limits")
    parser.add_argument("--mf", type=float, default=2.0, help="lower limit")
    parser.add_argument("--nf", type=float, default=99.9, help="higher_limit")
    parser.add_argument(
        "--p",
        type=float,
        default=0.5,
        help="The hypothesized probability of success. 0 <= p <= 1. The default value is p = 0.5",
    )
    parser.add_argument("--alpha", type=float, default=0.9, help="probability")
    parser.add_argument("--new", type=float, default=0.0, help="Value to put in place of values in a outside of bounds")
    parser.add_argument(
        "--proportiontocut",
        type=float,
        default=0.0,
        help="Proportion (in range 0-1) of total data set to trim of each end.",
    )
    parser.add_argument(
        "--lambda_",
        type=float,
        default=1.0,
        help="lambda_ gives the power in the Cressie-Read power divergence statistic",
    )
    parser.add_argument(
        "--imbda",
        type=float,
        default=0,
        help="If lmbda is not None, do the transformation for that value.If lmbda is None, find the lambda that maximizes the log-likelihood function and return it as the second output argument.",
    )
    parser.add_argument("--base", type=float, default=1.6, help="The logarithmic base to use, defaults to e")
    parser.add_argument("--dtype", help="dtype")
    parser.add_argument("--med", help="med")
    parser.add_argument("--cdf", help="cdf")
    parser.add_argument("--zero_method", help="zero_method options")
    parser.add_argument("--dist", help="dist options")
    parser.add_argument("--ties", help="ties options")
    parser.add_argument("--alternative", help="alternative options")
    parser.add_argument("--mode", help="mode options")
    parser.add_argument("--method", help="method options")
    parser.add_argument("--md", help="md options")
    parser.add_argument("--center", help="center options")
    parser.add_argument("--kind", help="kind options")
    parser.add_argument("--tail", help="tail options")
    parser.add_argument("--interpolation", help="interpolation options")
    parser.add_argument("--statistic", help="statistic options")

    args = parser.parse_args()
    infile = args.infile
    outfile = open(args.outfile, "w+")
    test_id = args.test_id
    nf = args.nf
    mf = args.mf
    imbda = args.imbda
    inclusive1 = args.inclusive1
    inclusive2 = args.inclusive2
    sample0 = 0
    sample1 = 0
    sample2 = 0
    if args.sample_cols != None:
        sample0 = 1
        barlett_samples = []
        for sample in args.sample_cols.split(";"):
            barlett_samples.append(map(int, sample.split(",")))
    if args.sample_one_cols != None:
        sample1 = 1
        sample_one_cols = args.sample_one_cols.split(",")
    if args.sample_two_cols != None:
        sample_two_cols = args.sample_two_cols.split(",")
        sample2 = 1
    for line in open(infile):
        sample_one = []
        sample_two = []
        cols = line.strip().split("\t")
        if sample0 == 1:
            b_samples = columns_to_values(barlett_samples, line)
        if sample1 == 1:
            for index in sample_one_cols:
                sample_one.append(cols[int(index) - 1])
        if sample2 == 1:
            for index in sample_two_cols:
                sample_two.append(cols[int(index) - 1])
        if test_id.strip() == "describe":
            size, min_max, mean, uv, bs, bk = stats.describe(map(float, sample_one))
            cols.append(size)
            cols.append(min_max)
            cols.append(mean)
            cols.append(uv)
            cols.append(bs)
            cols.append(bk)
        elif test_id.strip() == "mode":
            vals, counts = stats.mode(map(float, sample_one))
            cols.append(vals)
            cols.append(counts)
        elif test_id.strip() == "nanmean":
            m = stats.nanmean(map(float, sample_one))
            cols.append(m)
        elif test_id.strip() == "nanmedian":
            m = stats.nanmedian(map(float, sample_one))
            cols.append(m)
        elif test_id.strip() == "kurtosistest":
            z_value, p_value = stats.kurtosistest(map(float, sample_one))
            cols.append(z_value)
            cols.append(p_value)
        elif test_id.strip() == "variation":
            ra = stats.variation(map(float, sample_one))
            cols.append(ra)
        elif test_id.strip() == "itemfreq":
            freq = stats.itemfreq(map(float, sample_one))
            for list in freq:
                elements = ",".join(map(str, list))
                cols.append(elements)
        elif test_id.strip() == "nanmedian":
            m = stats.nanmedian(map(float, sample_one))
            cols.append(m)
        elif test_id.strip() == "variation":
            ra = stats.variation(map(float, sample_one))
            cols.append(ra)
        elif test_id.strip() == "boxcox_llf":
            IIf = stats.boxcox_llf(imbda, map(float, sample_one))
            cols.append(IIf)
        elif test_id.strip() == "tiecorrect":
            fa = stats.tiecorrect(map(float, sample_one))
            cols.append(fa)
        elif test_id.strip() == "rankdata":
            r = stats.rankdata(map(float, sample_one), method=args.md)
            cols.append(r)
        elif test_id.strip() == "nanstd":
            s = stats.nanstd(map(float, sample_one), bias=args.bias)
            cols.append(s)
        elif test_id.strip() == "anderson":
            A2, critical, sig = stats.anderson(map(float, sample_one), dist=args.dist)
            cols.append(A2)
            for list in critical:
                cols.append(list)
            cols.append(",")
            for list in sig:
                cols.append(list)
        elif test_id.strip() == "binom_test":
            p_value = stats.binom_test(map(float, sample_one), n=args.n, p=args.p)
            cols.append(p_value)
        elif test_id.strip() == "gmean":
            gm = stats.gmean(map(float, sample_one), dtype=args.dtype)
            cols.append(gm)
        elif test_id.strip() == "hmean":
            hm = stats.hmean(map(float, sample_one), dtype=args.dtype)
            cols.append(hm)
        elif test_id.strip() == "kurtosis":
            k = stats.kurtosis(map(float, sample_one), axis=args.axis, fisher=args.fisher, bias=args.bias)
            cols.append(k)
        elif test_id.strip() == "moment":
            n_moment = stats.moment(map(float, sample_one), n=args.n)
            cols.append(n_moment)
        elif test_id.strip() == "normaltest":
            k2, p_value = stats.normaltest(map(float, sample_one))
            cols.append(k2)
            cols.append(p_value)
        elif test_id.strip() == "skew":
            skewness = stats.skew(map(float, sample_one), bias=args.bias)
            cols.append(skewness)
        elif test_id.strip() == "skewtest":
            z_value, p_value = stats.skewtest(map(float, sample_one))
            cols.append(z_value)
            cols.append(p_value)
        elif test_id.strip() == "sem":
            s = stats.sem(map(float, sample_one), ddof=args.ddof)
            cols.append(s)
        elif test_id.strip() == "zscore":
            z = stats.zscore(map(float, sample_one), ddof=args.ddof)
            for list in z:
                cols.append(list)
        elif test_id.strip() == "signaltonoise":
            s2n = stats.signaltonoise(map(float, sample_one), ddof=args.ddof)
            cols.append(s2n)
        elif test_id.strip() == "percentileofscore":
            p = stats.percentileofscore(map(float, sample_one), score=args.score, kind=args.kind)
            cols.append(p)
        elif test_id.strip() == "bayes_mvs":
            c_mean, c_var, c_std = stats.bayes_mvs(map(float, sample_one), alpha=args.alpha)
            cols.append(c_mean)
            cols.append(c_var)
            cols.append(c_std)
        elif test_id.strip() == "sigmaclip":
            c, c_low, c_up = stats.sigmaclip(map(float, sample_one), low=args.m, high=args.n)
            cols.append(c)
            cols.append(c_low)
            cols.append(c_up)
        elif test_id.strip() == "kstest":
            d, p_value = stats.kstest(
                map(float, sample_one), cdf=args.cdf, N=args.N, alternative=args.alternative, mode=args.mode
            )
            cols.append(d)
            cols.append(p_value)
        elif test_id.strip() == "chi2_contingency":
            chi2, p, dof, ex = stats.chi2_contingency(
                map(float, sample_one), correction=args.correction, lambda_=args.lambda_
            )
            cols.append(chi2)
            cols.append(p)
            cols.append(dof)
            cols.append(ex)
        elif test_id.strip() == "tmean":
            if nf is 0 and mf is 0:
                mean = stats.tmean(map(float, sample_one))
            else:
                mean = stats.tmean(map(float, sample_one), (mf, nf), (inclusive1, inclusive2))
            cols.append(mean)
        elif test_id.strip() == "tmin":
            if mf is 0:
                min = stats.tmin(map(float, sample_one))
            else:
                min = stats.tmin(map(float, sample_one), lowerlimit=mf, inclusive=args.inclusive)
            cols.append(min)
        elif test_id.strip() == "tmax":
            if nf is 0:
                max = stats.tmax(map(float, sample_one))
            else:
                max = stats.tmax(map(float, sample_one), upperlimit=nf, inclusive=args.inclusive)
            cols.append(max)
        elif test_id.strip() == "tvar":
            if nf is 0 and mf is 0:
                var = stats.tvar(map(float, sample_one))
            else:
                var = stats.tvar(map(float, sample_one), (mf, nf), (inclusive1, inclusive2))
            cols.append(var)
        elif test_id.strip() == "tstd":
            if nf is 0 and mf is 0:
                std = stats.tstd(map(float, sample_one))
            else:
                std = stats.tstd(map(float, sample_one), (mf, nf), (inclusive1, inclusive2))
            cols.append(std)
        elif test_id.strip() == "tsem":
            if nf is 0 and mf is 0:
                s = stats.tsem(map(float, sample_one))
            else:
                s = stats.tsem(map(float, sample_one), (mf, nf), (inclusive1, inclusive2))
            cols.append(s)
        elif test_id.strip() == "scoreatpercentile":
            if nf is 0 and mf is 0:
                s = stats.scoreatpercentile(
                    map(float, sample_one), map(float, sample_two), interpolation_method=args.interpolation
                )
            else:
                s = stats.scoreatpercentile(
                    map(float, sample_one), map(float, sample_two), (mf, nf), interpolation_method=args.interpolation
                )
            for list in s:
                cols.append(list)
        elif test_id.strip() == "relfreq":
            if nf is 0 and mf is 0:
                rel, low_range, binsize, ex = stats.relfreq(map(float, sample_one), args.b)
            else:
                rel, low_range, binsize, ex = stats.relfreq(map(float, sample_one), args.b, (mf, nf))
            for list in rel:
                cols.append(list)
            cols.append(low_range)
            cols.append(binsize)
            cols.append(ex)
        elif test_id.strip() == "binned_statistic":
            if nf is 0 and mf is 0:
                st, b_edge, b_n = stats.binned_statistic(
                    map(float, sample_one), map(float, sample_two), statistic=args.statistic, bins=args.b
                )
            else:
                st, b_edge, b_n = stats.binned_statistic(
                    map(float, sample_one),
                    map(float, sample_two),
                    statistic=args.statistic,
                    bins=args.b,
                    range=(mf, nf),
                )
            cols.append(st)
            cols.append(b_edge)
            cols.append(b_n)
        elif test_id.strip() == "threshold":
            if nf is 0 and mf is 0:
                o = stats.threshold(map(float, sample_one), newval=args.new)
            else:
                o = stats.threshold(map(float, sample_one), mf, nf, newval=args.new)
            for list in o:
                cols.append(list)
        elif test_id.strip() == "trimboth":
            o = stats.trimboth(map(float, sample_one), proportiontocut=args.proportiontocut)
            for list in o:
                cols.append(list)
        elif test_id.strip() == "trim1":
            t1 = stats.trim1(map(float, sample_one), proportiontocut=args.proportiontocut, tail=args.tail)
            for list in t1:
                cols.append(list)
        elif test_id.strip() == "histogram":
            if nf is 0 and mf is 0:
                hi, low_range, binsize, ex = stats.histogram(map(float, sample_one), args.b)
            else:
                hi, low_range, binsize, ex = stats.histogram(map(float, sample_one), args.b, (mf, nf))
            cols.append(hi)
            cols.append(low_range)
            cols.append(binsize)
            cols.append(ex)
        elif test_id.strip() == "cumfreq":
            if nf is 0 and mf is 0:
                cum, low_range, binsize, ex = stats.cumfreq(map(float, sample_one), args.b)
            else:
                cum, low_range, binsize, ex = stats.cumfreq(map(float, sample_one), args.b, (mf, nf))
            cols.append(cum)
            cols.append(low_range)
            cols.append(binsize)
            cols.append(ex)
        elif test_id.strip() == "boxcox_normmax":
            if nf is 0 and mf is 0:
                ma = stats.boxcox_normmax(map(float, sample_one))
            else:
                ma = stats.boxcox_normmax(map(float, sample_one), (mf, nf), method=args.method)
            cols.append(ma)
        elif test_id.strip() == "boxcox":
            if imbda is 0:
                box, ma, ci = stats.boxcox(map(float, sample_one), alpha=args.alpha)
                cols.append(box)
                cols.append(ma)
                cols.append(ci)
            else:
                box = stats.boxcox(map(float, sample_one), imbda, alpha=args.alpha)
                cols.append(box)
        elif test_id.strip() == "histogram2":
            h2 = stats.histogram2(map(float, sample_one), map(float, sample_two))
            for list in h2:
                cols.append(list)
        elif test_id.strip() == "ranksums":
            z_statistic, p_value = stats.ranksums(map(float, sample_one), map(float, sample_two))
            cols.append(z_statistic)
            cols.append(p_value)
        elif test_id.strip() == "ttest_1samp":
            t, prob = stats.ttest_1samp(map(float, sample_one), map(float, sample_two))
            for list in t:
                cols.append(list)
            for list in prob:
                cols.append(list)
        elif test_id.strip() == "ansari":
            AB, p_value = stats.ansari(map(float, sample_one), map(float, sample_two))
            cols.append(AB)
            cols.append(p_value)
        elif test_id.strip() == "linregress":
            slope, intercept, r_value, p_value, stderr = stats.linregress(
                map(float, sample_one), map(float, sample_two)
            )
            cols.append(slope)
            cols.append(intercept)
            cols.append(r_value)
            cols.append(p_value)
            cols.append(stderr)
        elif test_id.strip() == "pearsonr":
            cor, p_value = stats.pearsonr(map(float, sample_one), map(float, sample_two))
            cols.append(cor)
            cols.append(p_value)
        elif test_id.strip() == "pointbiserialr":
            r, p_value = stats.pointbiserialr(map(float, sample_one), map(float, sample_two))
            cols.append(r)
            cols.append(p_value)
        elif test_id.strip() == "ks_2samp":
            d, p_value = stats.ks_2samp(map(float, sample_one), map(float, sample_two))
            cols.append(d)
            cols.append(p_value)
        elif test_id.strip() == "mannwhitneyu":
            mw_stats_u, p_value = stats.mannwhitneyu(
                map(float, sample_one), map(float, sample_two), use_continuity=args.mwu_use_continuity
            )
            cols.append(mw_stats_u)
            cols.append(p_value)
        elif test_id.strip() == "zmap":
            z = stats.zmap(map(float, sample_one), map(float, sample_two), ddof=args.ddof)
            for list in z:
                cols.append(list)
        elif test_id.strip() == "ttest_ind":
            mw_stats_u, p_value = stats.ttest_ind(
                map(float, sample_one), map(float, sample_two), equal_var=args.equal_var
            )
            cols.append(mw_stats_u)
            cols.append(p_value)
        elif test_id.strip() == "ttest_rel":
            t, prob = stats.ttest_rel(map(float, sample_one), map(float, sample_two), axis=args.axis)
            cols.append(t)
            cols.append(prob)
        elif test_id.strip() == "mood":
            z, p_value = stats.mood(map(float, sample_one), map(float, sample_two), axis=args.axis)
            cols.append(z)
            cols.append(p_value)
        elif test_id.strip() == "shapiro":
            W, p_value, a = stats.shapiro(map(float, sample_one), map(float, sample_two), args.reta)
            cols.append(W)
            cols.append(p_value)
            for list in a:
                cols.append(list)
        elif test_id.strip() == "kendalltau":
            k, p_value = stats.kendalltau(
                map(float, sample_one), map(float, sample_two), initial_lexsort=args.initial_lexsort
            )
            cols.append(k)
            cols.append(p_value)
        elif test_id.strip() == "entropy":
            s = stats.entropy(map(float, sample_one), map(float, sample_two), base=args.base)
            cols.append(s)
        elif test_id.strip() == "spearmanr":
            if sample2 == 1:
                rho, p_value = stats.spearmanr(map(float, sample_one), map(float, sample_two))
            else:
                rho, p_value = stats.spearmanr(map(float, sample_one))
            cols.append(rho)
            cols.append(p_value)
        elif test_id.strip() == "wilcoxon":
            if sample2 == 1:
                T, p_value = stats.wilcoxon(
                    map(float, sample_one),
                    map(float, sample_two),
                    zero_method=args.zero_method,
                    correction=args.correction,
                )
            else:
                T, p_value = stats.wilcoxon(
                    map(float, sample_one), zero_method=args.zero_method, correction=args.correction
                )
            cols.append(T)
            cols.append(p_value)
        elif test_id.strip() == "chisquare":
            if sample2 == 1:
                rho, p_value = stats.chisquare(map(float, sample_one), map(float, sample_two), ddof=args.ddof)
            else:
                rho, p_value = stats.chisquare(map(float, sample_one), ddof=args.ddof)
            cols.append(rho)
            cols.append(p_value)
        elif test_id.strip() == "power_divergence":
            if sample2 == 1:
                stat, p_value = stats.power_divergence(
                    map(float, sample_one), map(float, sample_two), ddof=args.ddof, lambda_=args.lambda_
                )
            else:
                stat, p_value = stats.power_divergence(map(float, sample_one), ddof=args.ddof, lambda_=args.lambda_)
            cols.append(stat)
            cols.append(p_value)
        elif test_id.strip() == "theilslopes":
            if sample2 == 1:
                mpe, met, lo, up = stats.theilslopes(map(float, sample_one), map(float, sample_two), alpha=args.alpha)
            else:
                mpe, met, lo, up = stats.theilslopes(map(float, sample_one), alpha=args.alpha)
            cols.append(mpe)
            cols.append(met)
            cols.append(lo)
            cols.append(up)
        elif test_id.strip() == "combine_pvalues":
            if sample2 == 1:
                stat, p_value = stats.combine_pvalues(
                    map(float, sample_one), method=args.med, weights=map(float, sample_two)
                )
            else:
                stat, p_value = stats.combine_pvalues(map(float, sample_one), method=args.med)
            cols.append(stat)
            cols.append(p_value)
        elif test_id.strip() == "obrientransform":
            ob = stats.obrientransform(*b_samples)
            for list in ob:
                elements = ",".join(map(str, list))
                cols.append(elements)
        elif test_id.strip() == "f_oneway":
            f_value, p_value = stats.f_oneway(*b_samples)
            cols.append(f_value)
            cols.append(p_value)
        elif test_id.strip() == "kruskal":
            h, p_value = stats.kruskal(*b_samples)
            cols.append(h)
            cols.append(p_value)
        elif test_id.strip() == "friedmanchisquare":
            fr, p_value = stats.friedmanchisquare(*b_samples)
            cols.append(fr)
            cols.append(p_value)
        elif test_id.strip() == "fligner":
            xsq, p_value = stats.fligner(center=args.center, proportiontocut=args.proportiontocut, *b_samples)
            cols.append(xsq)
            cols.append(p_value)
        elif test_id.strip() == "bartlett":
            T, p_value = stats.bartlett(*b_samples)
            cols.append(T)
            cols.append(p_value)
        elif test_id.strip() == "levene":
            w, p_value = stats.levene(center=args.center, proportiontocut=args.proportiontocut, *b_samples)
            cols.append(w)
            cols.append(p_value)
        elif test_id.strip() == "median_test":
            stat, p_value, m, table = stats.median_test(
                ties=args.ties, correction=args.correction, lambda_=args.lambda_, *b_samples
            )
            cols.append(stat)
            cols.append(p_value)
            cols.append(m)
            cols.append(table)
            for list in table:
                elements = ",".join(map(str, list))
                cols.append(elements)
        outfile.write("%s\n" % "\t".join(map(str, cols)))
    outfile.close()
Esempio n. 50
0
			
			# Mean and standard deviation of the current RHV sample, followed by
			# two normality p-values: a Kolmogorov-Smirnov test against a normal
			# distribution parameterised with that mean/std, and Shapiro-Wilk.
			# Both p-values are stored as 0 when the standard deviation is falsy.
			ejecucion['medRHV'] = numpy.mean(RHVs)
			ejecucion['stdRHV'] = numpy.std(RHVs)			
			ejecucion['ksRHVpval'] = stats.kstest(RHVs,'norm', args=(ejecucion['medRHV'],ejecucion['stdRHV'])).pvalue if ejecucion['stdRHV'] else 0
			ejecucion['shapiroRHVpval'] = stats.shapiro(RHVs)[1] if ejecucion['stdRHV'] else 0
			
			# Same four values for the GD sample.
			ejecucion['medGD'] = numpy.mean(gds)
			ejecucion['stdGD'] = numpy.std(gds)
			ejecucion['ksGDpval'] = stats.kstest(gds,'norm', args=(ejecucion['medGD'],ejecucion['stdGD'])).pvalue if ejecucion['stdGD'] else 0
			ejecucion['shapiroGDpval'] = stats.shapiro(gds)[1] if ejecucion['stdGD'] else 0
			
			
			
			# Compare with the previous RHV sample when there is one: t-test with
			# equal_var=False and Levene's test. "-" is stored otherwise.
			if RHVsAnteriores:
				ejecucion["tstudentRHV"] = stats.ttest_ind(RHVsAnteriores, RHVs, equal_var=False).pvalue
				ejecucion["leveneRHV"] = stats.levene(RHVsAnteriores, RHVs).pvalue
			else:
				ejecucion["tstudentRHV"] = "-"
				ejecucion["leveneRHV"] = "-"
				
			# Same comparison for the GD sample.
			if gdsAnteriores:
				ejecucion["tstudentGD"] = stats.ttest_ind(gdsAnteriores, gds, equal_var=False).pvalue
				ejecucion["leveneGD"] = stats.levene(gdsAnteriores, gds).pvalue
			else:
				ejecucion["tstudentGD"] = "-"
				ejecucion["leveneGD"] = "-"
				
			# Remember the current samples as the "previous" ones
			# (presumably for the next iteration of the enclosing loop).
			RHVsAnteriores = RHVs
			gdsAnteriores = gds
		
	with open(PATH_JSON_EJECUCION,'w') as f:
Esempio n. 51
0
def levene(xy):
    """Run ``stats.levene`` on a pair of samples passed as one argument.

    Args:
        xy: A two-element sequence ``(x, y)`` holding the two samples.

    Returns:
        Whatever ``stats.levene(x, y)`` returns.

    Raises:
        ValueError: If ``xy`` does not unpack into exactly two items.
    """
    # Tuple parameters in the signature (``def f((x, y))``) were removed by
    # PEP 3113; unpacking in the body keeps the same one-argument call shape
    # and works on both Python 2 and Python 3.
    x, y = xy
    return stats.levene(x, y)
Esempio n. 52
0
 def test_result_attributes(self):
     # Hand the levene result for samples g1..g10 to check_named_results
     # together with the attribute names it is expected to carry.
     samples = (g1, g2, g3, g4, g5, g6, g7, g8, g9, g10)
     outcome = stats.levene(*samples)
     check_named_results(outcome, ('statistic', 'pvalue'))
  #hfmt = dates.DateFormatter('%H:%M')
  #ax.xaxis.set_major_formatter(hfmt)
#  y_formatter = mpl.ticker.ScalarFormatter(useOffset=False)
#  ax.yaxis.set_major_formatter(y_formatter)
#  ax.grid(True)

  f.suptitle("Dichte der Leistungsgradienten")
  f.autofmt_xdate()
  plt.savefig("images/sonnenfinsternis-dichte-gradienten.png")#, bbox_inches='tight')

  # Start a fresh figure and draw the empirical CDF of the momentum values
  # for the Friday data (blue) and for the eclipse data (red).
  plt.clf()
  friday_series, friday_vals = ecdf.get_ecdf(friday_momentum_df.momentum)
  ecdf.plot_ecdf_curve(friday_series, friday_vals, color="b", label="Typischer Freitag")
  eclipse_series, eclipse_vals = ecdf.get_ecdf(eclipse_momentum_df.momentum)
  ecdf.plot_ecdf_curve(eclipse_series, eclipse_vals, color="r", label="Sonnenfinsternis")
  # NOTE(review): the label says "Mittelwert" (mean) but the value printed is
  # the median -- confirm which one is intended.
  # Parenthesised single-argument prints behave the same on Python 2 and 3.
  print("Mittelwert alle Freitage: %f" % np.median(friday_momentum_df.momentum))
  print("Mittelwert Sonnenfinsternis: %f" % np.median(eclipse_momentum_df.momentum))
  # http://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.levene.html#scipy.stats.levene
  # Keep the Levene p-value under its own name: the Fligner test below
  # rebinds p_val, and the figure title must report the Levene result.
  levene_W, levene_p_val = stats.levene(friday_momentum_df.momentum,
      eclipse_momentum_df.momentum, center='median')
  # NOTE(review): Levene's null hypothesis is equal variances, so a small p
  # argues against equality; the wording of this message looks inverted.
  print("Levenes Test auf Gleichheit der Varianz: P=%s (gleiche Varianz für p<=0.05)" % levene_p_val)


  W, p_val = stats.fligner(friday_momentum_df.momentum, eclipse_momentum_df.momentum)
  print("Fliegners Test auf Gleichheit der Varianz: P=%s" % p_val)

  # The title is labelled "Levene", so it gets the Levene p-value; the
  # original interpolated the Fligner p-value here.
  f.suptitle("ECDF der Leistungsgradienten: Ungleiche Varianzen (Levene, p=%f)" % levene_p_val)
  plt.savefig("images/sonnenfinsternis-ecdf-gradienten.png")#, bbox_inches='tight')

 
Esempio n. 54
0
 def test_data(self):
     # Levene statistic and p-value for samples g1..g10 must match the stored
     # reference values to 7 decimal places.
     samples = (g1, g2, g3, g4, g5, g6, g7, g8, g9, g10)
     statistic, p_value = stats.levene(*samples)
     reference = (
         (statistic, 1.7059176930008939),
         (p_value, 0.0990829755522),
     )
     for actual, expected in reference:
         assert_almost_equal(actual, expected, 7)
Esempio n. 55
0
# NOTE(review): this script uses Python 2 print statements and does not
# parse under Python 3. `stroop` is not defined in this span; the code reads
# its `Congruent` and `Incongruent` attributes.

# Report some descriptive statistics
print stroop.describe()

# Visualizations (with seaborn style)
import matplotlib.pyplot as plot
import seaborn as sns

# Do the boxplot
plot.show(sns.boxplot(stroop))

# Do the violinplot
plot.show(sns.violinplot(stroop, widths = 0.5))

# The string literal below holds a disabled distribution plot; as a bare
# expression statement it has no effect.
"""
# Do the distribution plot
sns.distplot(stroop['Congruent'],kde=False, color= "b")
"""

# Levene's test on the two columns
from scipy.stats import levene
print levene(stroop.Congruent, stroop.Incongruent)

# Kolmogorov-Smirnov test of each column against scipy's 'norm'
# distribution (no distribution args are passed here)
from scipy.stats import kstest
print 'ks_con', kstest(stroop.Congruent, 'norm')
print 'ks_inc', kstest(stroop.Incongruent, 'norm')

# Two-sample Kolmogorov-Smirnov test between the columns
# NOTE(review): the result of this call is neither printed nor stored.
from scipy.stats import ks_2samp
ks_2samp(stroop.Congruent, stroop.Incongruent)
# Do the t-test
import scipy.stats as ss
print ss.ttest_ind(stroop.Congruent, stroop.Incongruent)
 def f_test(self, test_series):
     """Compare the variance of ``self.series`` with that of ``test_series``.

     Despite the method name, the check that is run is ``stats.levene``.

     Returns:
         Whatever ``stats.levene`` returns for the two samples.
     """
     own_series = self.series
     outcome = stats.levene(own_series, test_series)
     return outcome
Esempio n. 57
0
def main():
    """Summarise the validation results of every variant for one instance.

    For each entry of ``VARIANTS`` the per-seed best treewidths are read from
    the ``validationresults`` table of the database ``db_fn``. Normality,
    equal-variance and equal-mean diagnostics are printed (mostly to stderr),
    and a LaTeX results table is written to
    ``validation-validationset-<instance>-results.tex``.

    ``db_fn``, ``instance`` and ``VARIANTS`` are not defined in this function;
    they are read from the enclosing scope.
    """

    def n_digits(num):
        """Return the number of digits in the integer part of ``num`` (>= 1)."""
        if num <= 1:
            return 1
        # Counting the digits of the truncated value is exact. The previous
        # ceil(log(num) / log(10)) undercounted at powers of ten (10 -> 1,
        # 1000 -> 3), giving a too-narrow siunitx table-format.
        return len(str(int(num)))

    # (var1, var2) pairs compared by both pairwise tests, in report order.
    pairs = [('IHA', 'MA1'), ('IHA', 'MA2'), ('IHA', 'MA3'),
             ('GAtw', 'MA1'), ('GAtw', 'MA2'), ('GAtw', 'MA3'),
             ('MA1', 'MA2'), ('MA1', 'MA3'), ('MA2', 'MA3')]

    rows = []
    # Widest integer part seen per column; used for the table-format specs.
    integer_digits = {'best': 0,
                      'best_time': 0,
                      'mean': 0,
                      'stddev': 0}
    allvals = []
    allvals_dict = {}

    db = sqlite.connect(db_fn)
    try:
        dbc = db.cursor()
        for variant in VARIANTS:
            # NOTE(review): the queries are built by string interpolation;
            # switch to bound parameters if variant/instance can ever come
            # from untrusted input.
            query = ("select tw from (select min(treewidth) as tw from validationresults where variant='%(variant)s' and instance='%(instance)s' group by seed)")
            result = dbc.execute(query % {'variant': variant, 'instance': instance})
            vals = NP.array([rec[0] for rec in result])
            # ``best`` rather than ``min`` so the builtin is not shadowed.
            best, mean, stddev = vals.min(), vals.mean(), vals.std()
            W, p = STATS.shapiro(vals)
            print('%s: normal distribution? shapiro-wilk: W=%s (p=%s) %s@5%% %s@2%%' % (variant, W, p, 'no' if W <= .905 else 'yes', 'no' if W <= .884 else 'yes'), file=sys.stderr)
            z, p = STATS.skewtest(vals)
            # NOTE(review): the .5 cut-off is kept from the original; confirm
            # it was not meant to be .05.
            print('%s: normal distribution? skew test: (z=%s) p=%s => %s' % (variant, z, p, 'no' if p < .5 else 'yes'), file=sys.stderr)
            allvals.append(vals)
            allvals_dict[variant] = vals

            query = ("select min(runtime_s)"
                     " from validationresults"
                     " where variant='%(variant)s' and instance='%(instance)s' and treewidth='%(treewidth)s'")
            result = dbc.execute(query % {'variant': variant, 'instance': instance, 'treewidth': best})
            best_time = [rec[0] for rec in result][0]
            row = {'variant': variant,
                   'best': best,
                   'best_time': round(best_time, 1),
                   'mean': round(mean, 1),
                   'stddev': round(stddev, 1)}
            rows.append(row)
            for key in integer_digits:
                integer_digits[key] = max(integer_digits[key], n_digits(row[key]))
    finally:
        # Release the connection even when a query or a test above raises.
        db.close()

    T, p = STATS.bartlett(*allvals)
    print('equal variances? bartlett: T=%s (p=%s) [vs Chi-Quadrat_{k-1=%s, alpha=.5}]' % (T, p, len(allvals) - 1), file=sys.stderr)
    W, p = STATS.levene(*allvals, center='mean')
    print('equal variances? levene (mean): (W=%s) p=%s' % (W, p), file=sys.stderr)
    W, p = STATS.levene(*allvals, center='median')
    print('equal variances? levene (median): (W=%s) p=%s' % (W, p), file=sys.stderr)
    F, p = STATS.f_oneway(*allvals)
    print('equal means? one-way ANOVA: F=%s, p=%s [vs F_{k-1=%s,n-k=%s}]' % (F, p, len(allvals) - 1, sum([len(x) for x in allvals]) - len(allvals)), file=sys.stderr)
    try:
        W, p = STATS.kruskal(*allvals)
        print('equal means? kruskal wallis: W=%s, p=%s' % (W, p), file=sys.stderr)
    except Exception as e:
        print(e)
    lsd = LSD.LSD(allvals, .05)
    print('LSD: %r' % lsd, file=sys.stderr)
    # The group labels hard-code 20 samples per variant, as does the last
    # column of the LaTeX table below.
    for alpha in (.10, .05):
        print(statsmodels.stats.multicomp.pairwise_tukeyhsd(NP.array(allvals).ravel(), NP.array([[x] * 20 for x in VARIANTS]).ravel(), alpha=alpha), file=sys.stderr)

    def welch(var1, var2):
        """Print the unequal-variance t-test result for two variants."""
        res = STATS.ttest_ind(allvals_dict[var1], allvals_dict[var2], equal_var=False)
        print('%4s vs %s  t,p=%r => \t%s @a=10%%, %s @a=5%%'
              % (var1, var2, res, 'NE' if res[1] < .01116 else ' E', 'NE' if res[1] < .00568 else ' E'), file=sys.stderr)

    print('pairwise Welch\'s t-test with Bonferroni correction:', file=sys.stderr)
    for var1, var2 in pairs:
        welch(var1, var2)

    def mannwhitneyu(var1, var2):
        """Print the Mann-Whitney U result for two variants, or the failure."""
        try:
            res = STATS.mannwhitneyu(allvals_dict[var1], allvals_dict[var2])
            print('%4s vs %s  u,p=%r => \t%s @a=10%%, %s @a=5%%'
                  % (var1, var2, res, 'NE' if res[1] < .01116 else ' E', 'NE' if res[1] < .00568 else ' E'), file=sys.stderr)
        except Exception as e:
            print('%4s vs %s  failed: %r' % (var1, var2, e))

    print('pairwise Mann-Whitney U test with Bonferroni correction:', file=sys.stderr)
    for var1, var2 in pairs:
        mannwhitneyu(var1, var2)

    # Values interpolated into the table header: the column widths collected
    # above plus the instance name escaped for TeX text and for a label.
    header_fields = dict(integer_digits,
                         instanceTexEsc=instance.replace('_', r'\textunderscore{}'),
                         instanceFileEsc=instance.replace('_', '-'))
    latex = [r'\begin{table}[hbtp]''\n'
             r'   \caption{Results for instance \Instance{%(instanceTexEsc)s}}''\n'
             r'   \label{fig:%(instanceFileEsc)s-results}''\n'
             r'   \centering\small''\n'
             r'      \begin{tabular}{l S[table-format=%(best)s] S[table-format=%(best_time)s.1]%%''\n'
             r'                      S[table-format=%(mean)s.1,table-number-alignment=right] @{$\,\pm\,$} S[table-format=%(stddev)s.1,table-number-alignment=left]''\n'
             r'                      S[table-format=2]} \toprule''\n'
             r'         & \multicolumn{2}{c}{\header{Best}} & \multicolumn{2}{c}{\header{Average}} & \\ \cmidrule(lr){2-3}\cmidrule(lr){4-5}''\n'
             r'         & \header{treewidth} & \header{seconds} & \multicolumn{2}{c}{\header{treewidth}} & \header{samples} \\ \midrule'
             % header_fields]
    for row in rows:
        latex.append(' ' * (3 * 3) + ' & '.join([row['variant'], str(row['best']), str(row['best_time']), str(row['mean']), str(row['stddev']), "20"]) + r'\\')
    latex.append(r'         \bottomrule''\n'
                 r'      \end{tabular}''\n'
                 r'\end{table}')

    with open('validation-validationset-%s-results.tex' % instance.replace('_', '-'), 'w') as f:
        print('\n'.join(latex), file=f)
Esempio n. 58
0
def get_levene(group1, group2):
    """Run ``levene`` on two samples and return ``(p_value, statistic)``.

    Note the order: the p-value comes first, the statistic second, which is
    the reverse of the order in which ``levene`` yields them.
    """
    outcome = levene(group1, group2)
    return (outcome[1], outcome[0])
Esempio n. 59
0
    for fiber in fiber_list:
        mod = Model(lambda x, a, b: a * x + b)
        slope_displ = mod.fit(fiber.binned_exp['static_fr_mean'],
                              x=fiber.binned_exp['displ_mean'],
                              a=1, b=1).best_values['a']
        slope_force = mod.fit(fiber.binned_exp['static_fr_mean'],
                              x=fiber.binned_exp['force_mean'],
                              a=1, b=1).best_values['a']
        slope_displ_list.append(slope_displ)
        slope_force_list.append(slope_force)
    slope_displ_arr = np.array(slope_displ_list)
    slope_force_arr = np.array(slope_force_list)
    sensitivity_df = pd.DataFrame(
        np.c_[slope_displ_arr, slope_force_arr],
        index=['#' + str(i+1) for i in range(slope_displ_arr.size)],
        columns=['Displacement sensitivity (Hz/mm)',
                 'Force sensitivity (Hz/mN)'])
    for column in sensitivity_df.columns:
        sensitivity_df[column[:5] + '_normalized'] = sensitivity_df[column] /\
            sensitivity_df[column].median()
    sensitivity_df.transpose().to_excel('./csvs/sensitivity.xlsx')
    print(sensitivity_df.var())
    from scipy.stats import f, bartlett, levene
    print(f.cdf(sensitivity_df['Displ_normalized'].var() /
                sensitivity_df['Force_normalized'].var(),
          sensitivity_df.shape[0], sensitivity_df.shape[0]))
    print(bartlett(sensitivity_df['Displ_normalized'],
                   sensitivity_df['Force_normalized']))
    print(levene(sensitivity_df['Displ_normalized'],
                 sensitivity_df['Force_normalized']))