Example #1
 def test_trimmed1(self):
     # Test that center='trimmed' gives the same result as center='mean'
     # when proportiontocut=0.
     Xsq1, pval1 = stats.fligner(g1, g2, g3, center='mean')
     Xsq2, pval2 = stats.fligner(g1, g2, g3, center='trimmed', proportiontocut=0.0)
     assert_almost_equal(Xsq1, Xsq2)
     assert_almost_equal(pval1, pval2)
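A self-contained sketch of the same equivalence (a minimal demo; g1, g2, g3 here are arbitrary synthetic samples, not the ones used in the test suite):

import numpy as np
from scipy import stats

rng = np.random.default_rng(0)
g1, g2, g3 = (rng.normal(0, s, size=30) for s in (1.0, 1.5, 2.0))

# With proportiontocut=0.0 nothing is trimmed, so the trimmed mean
# reduces to the ordinary mean and both calls should agree.
res_mean = stats.fligner(g1, g2, g3, center='mean')
res_trim = stats.fligner(g1, g2, g3, center='trimmed', proportiontocut=0.0)
assert np.isclose(res_mean.statistic, res_trim.statistic)
assert np.isclose(res_mean.pvalue, res_trim.pvalue)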
Example #2
 def test_trimmed2(self):
     x = [1.2, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 100.0]
     y = [0.0, 3.0, 3.5, 4.0, 4.5, 5.0, 5.5, 200.0]
     # Use center='trimmed'
     Xsq1, pval1 = stats.fligner(x, y, center='trimmed', proportiontocut=0.125)
     # Trim the data here, and use center='mean'
     Xsq2, pval2 = stats.fligner(x[1:-1], y[1:-1], center='mean')
     # Result should be the same.
     assert_almost_equal(Xsq1, Xsq2)
     assert_almost_equal(pval1, pval2)
Example #3
    def _scipy_fk_test(self, mode='median', alpha=0.01):
        """
        Fligner-Killeen test for differences in data variances.
        The SciPy implementation uses the chi-squared approximation to
        compute the FK statistic.

        Parameters
        ----------
        mode : str
            Central tendency measure passed to scipy.stats.fligner as ``center``.
        alpha : float
            Significance level for the break decision.

        Returns
        -------
        h : int
            0 if no break found, 1 if break was found
        stats_fk : dict
            Fligner test statistics
        """
        q0 = self.get_group_data(0, self.df_test_resampled, ['Q'])
        q1 = self.get_group_data(1, self.df_test_resampled, ['Q'])

        with warnings.catch_warnings():  # suppress scipy warnings
            warnings.filterwarnings('ignore')
            fstats, pval = fligner(q0, q1, center=mode)

        stats_fk = {'z': fstats, 'pval': pval}

        if stats_fk['pval'] <= alpha:  # With CHI2 approximation
            h = 1
        else:
            h = 0

        return h, stats_fk
Example #4
def test_significance(y1, y2):
    # Test the normality hypothesis
    y1_shapiro = stats.shapiro(y1)
    print(y1_shapiro)
    y2_shapiro = stats.shapiro(y2)
    print(y2_shapiro)

    if y1_shapiro[1] >= 0.05 and y2_shapiro[1] >= 0.05:
        print('Distributions of quantities are normal')
        # Test the hypothesis of equal variances
        fligner_test = stats.fligner(y1, y2)
        print(fligner_test)

        # T-test (only if the distributions are normal)
        if fligner_test[1] < 0.05:
            print('Variances are not equal')
            ttest_result = stats.ttest_ind(y1, y2, equal_var=False)
        else:
            print('Variances are equal')
            ttest_result = stats.ttest_ind(y1, y2, equal_var=True)

        print(ttest_result)
        if ttest_result[1] >= 0.05:
            print('Differences in predictions are not significant.')
        else:
            print('Differences in predictions are significant.')
    else:
        print('Distributions of quantities are not normal')
        # Wilcoxon test (if the distributions are not normal)
        wilcoxon_result = stats.wilcoxon(y1, y2)
        print(wilcoxon_result)
        if wilcoxon_result[1] >= 0.05:
            print('Differences in predictions are not significant.')
        else:
            print('Differences in predictions are significant.')
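A hypothetical usage sketch for the function above (the arrays stand in for two sets of model predictions; names and sizes are illustrative):

import numpy as np
from scipy import stats

rng = np.random.default_rng(42)
y1 = rng.normal(10.0, 2.0, size=50)  # predictions from model 1 (illustrative)
y2 = rng.normal(10.5, 2.0, size=50)  # predictions from model 2 (illustrative)
test_significance(y1, y2)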
Example #5
 def test_flignerKileenTest_xResult(self):
     data_1 = [51, 87, 50, 48, 79, 61, 53, 54]
     data_2 = [82, 91, 92, 80, 52, 79, 73, 74]
     data_4 = [85, 80, 65, 71, 67, 51, 63, 93]
     data_3 = [79, 84, 74, 98, 63, 83, 85, 58]
     x1, p1 = fligner_kileen_test(data_1,
                                  data_2,
                                  data_3,
                                  data_4,
                                  center='median')
     x2, p2 = fligner(data_1, data_2, data_3, data_4, center='median')
     assert pytest.approx(x2) == x1
Example #6
def stats_tests():
    global errors
    tests = ['Brown-Forsythe', 'Bartlett', 'Levene', 'Fligner-Killeen']
    securities = list(container.index)
    indicators = list(container.columns)

    output = pd.DataFrame(index=pd.MultiIndex.from_product([securities, indicators]),
                          columns=tests)

    for security in securities:
        for indicator in indicators:
            all_vals = pd.Series(container.loc[security][indicator]['all'])
            signal = pd.Series(container.loc[security][indicator]['signal'])
            all_vals = pd.to_numeric(all_vals, errors='coerce')
            signal = pd.to_numeric(signal, errors='coerce')

            try:
                output.loc[security, indicator][tests[0]] = stats.levene(
                    all_vals, signal,
                    center='median'
                )
            except Exception:
                errors.append([security, indicator, tests[0]])

            try:
                output.loc[security, indicator][tests[1]] = stats.bartlett(
                    all_vals, signal
                )
            except Exception:
                errors.append([security, indicator, tests[1]])

            try:
                output.loc[security, indicator][tests[2]] = stats.levene(
                    all_vals, signal,
                    center='mean'
                )
            except Exception:
                errors.append([security, indicator, tests[2]])

            try:
                output.loc[security, indicator][tests[3]] = stats.fligner(
                    all_vals, signal
                )
            except Exception:
                errors.append([security, indicator, tests[3]])

    p_values = output.dropna().applymap(lambda x: x.pvalue).unstack()
    p_values_container = output.dropna().applymap(lambda x: x.pvalue).unstack().melt()
    p_values.to_pickle('p_values_full')
    p_values_container.to_pickle('p_values_container_full')
Example #7
def print_parametric_info(dfs, df_valid, key):
    values = []
    for df in dfs:
        print(df.conditionType.iloc[0])
        print(stats.shapiro(df[key]))

        plt.figure(len(values))
        plt.hist(df[key])
        #stats.probplot(df[key], plot=plt)

        values.append(df[key])
        print('')

    print('general')
    plt.figure(len(values))
    plt.hist(df_valid[key])

    print(stats.shapiro(df_valid[key]))
    print(stats.fligner(*values))
Example #8
def vector_hypotheses(a, b):

    dict_stat = {}
    dict_pval = {}
    pea = pearsonr(a, b)
    dict_stat["pearsonr"], dict_pval["pearsonr"] = pea[0], pea[1]
    ran = ranksums(a, b)
    dict_stat["ranksums"], dict_pval["ranksums"] = ran[0], ran[1]
    moo = mood(a, b)
    dict_stat["mood"], dict_pval["mood"] = moo[0], moo[1]
    fli = fligner(a, b)
    dict_stat["fligner"], dict_pval["fligner"] = fli[0], fli[1]
    ans = ansari(a, b)
    dict_stat["ansari"], dict_pval["ansari"] = ans[0], ans[1]
    bar = bartlett(a, b)
    dict_stat["bartlett"], dict_pval["bartlett"] = bar[0], bar[1]
    lev = levene(a, b)
    dict_stat["levene"], dict_pval["levene"] = lev[0], lev[1]
    man = mannwhitneyu(a, b)
    dict_stat["mannwhitneyu"], dict_pval["mannwhitneyu"] = man[0], man[1]
    return dict_stat, dict_pval
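The same battery of two-sample tests can be written more compactly by looping over (name, function) pairs; a sketch assuming the same scipy.stats imports as the function above:

from scipy.stats import (pearsonr, ranksums, mood, fligner, ansari,
                         bartlett, levene, mannwhitneyu)

def vector_hypotheses_compact(a, b):
    tests = {"pearsonr": pearsonr, "ranksums": ranksums, "mood": mood,
             "fligner": fligner, "ansari": ansari, "bartlett": bartlett,
             "levene": levene, "mannwhitneyu": mannwhitneyu}
    dict_stat, dict_pval = {}, {}
    for name, test in tests.items():
        # Each of these scipy tests returns a (statistic, pvalue) pair
        stat, pval = test(a, b)
        dict_stat[name], dict_pval[name] = stat, pval
    return dict_stat, dict_pval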
Example #9
def isHomogeneous(df, alpha, levene=True):

    print "\nChecking if all the columns are homogeneous by Levene or Fligner-Killeen test...\n"

    #colums to list
    h = list(df.columns.values)[:-1]
    #columns values to list
    col1 = df[h[0]].tolist()
    col2 = df[h[1]].tolist()
    col3 = df[h[2]].tolist()
    col4 = df[h[3]].tolist()
    col5 = df[h[4]].tolist()
    col6 = df[h[5]].tolist()
    col7 = df[h[6]].tolist()
    col8 = df[h[7]].tolist()
    col9 = df[h[8]].tolist()
    col10 = df[h[9]].tolist()
    col11 = df[h[10]].tolist()

    L, p_val = ss.levene(col1, col2, col3, col4, col5, col6, col7, col8, col9,
                         col10, col11)
    F, p = ss.fligner(col1, col2, col3, col4, col5, col6, col7, col8, col9,
                      col10, col11)

    if (levene):
        if p_val < alpha:

            print "\n   It is not an homegeneous dataset (Levene)\n"

        else:
            print "\n   It is an homogeneneous dataset (Levene)\n"

    else:
        if p < alpha:

            print "\n   It is not an homegeneous dataset (Fligner-Killeen) \n"

        else:
            print "\n   It is an homogeneneous dataset (Fligner-Killen)\n"
Example #10
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--infile", required=True, help="Tabular file.")
    parser.add_argument("-o",
                        "--outfile",
                        required=True,
                        help="Path to the output file.")
    parser.add_argument("--sample_one_cols",
                        help="Input format, like smi, sdf, inchi")
    parser.add_argument("--sample_two_cols",
                        help="Input format, like smi, sdf, inchi")
    parser.add_argument(
        "--sample_cols",
        help="Input format, like smi, sdf, inchi,separate arrays using ;",
    )
    parser.add_argument("--test_id", help="statistical test method")
    parser.add_argument(
        "--mwu_use_continuity",
        action="store_true",
        default=False,
        help=
        "Whether a continuity correction (1/2.) should be taken into account.",
    )
    parser.add_argument(
        "--equal_var",
        action="store_true",
        default=False,
        help=
        "If set perform a standard independent 2 sample test that assumes equal population variances. If not set, perform Welch's t-test, which does not assume equal population variance.",
    )
    parser.add_argument(
        "--reta",
        action="store_true",
        default=False,
        help="Whether or not to return the internally computed a values.",
    )
    parser.add_argument(
        "--fisher",
        action="store_true",
        default=False,
        help="if true then Fisher definition is used",
    )
    parser.add_argument(
        "--bias",
        action="store_true",
        default=False,
        help=
        "if false,then the calculations are corrected for statistical bias",
    )
    parser.add_argument(
        "--inclusive1",
        action="store_true",
        default=False,
        help="if false,lower_limit will be ignored",
    )
    parser.add_argument(
        "--inclusive2",
        action="store_true",
        default=False,
        help="if false,higher_limit will be ignored",
    )
    parser.add_argument(
        "--inclusive",
        action="store_true",
        default=False,
        help="if false,limit will be ignored",
    )
    parser.add_argument(
        "--printextras",
        action="store_true",
        default=False,
        help=
        "If True, if there are extra points a warning is raised saying how many of those points there are",
    )
    parser.add_argument(
        "--initial_lexsort",
        action="store_true",
        default="False",
        help=
        "Whether to use lexsort or quicksort as the sorting method for the initial sort of the inputs.",
    )
    parser.add_argument(
        "--correction",
        action="store_true",
        default=False,
        help="continuity correction ",
    )
    parser.add_argument(
        "--axis",
        type=int,
        default=0,
        help=
        "Axis can equal None (ravel array first), or an integer (the axis over which to operate on a and b)",
    )
    parser.add_argument(
        "--n",
        type=int,
        default=0,
        help=
        "the number of trials. This is ignored if x gives both the number of successes and failures",
    )
    parser.add_argument("--b",
                        type=int,
                        default=0,
                        help="The number of bins to use for the histogram")
    parser.add_argument("--N",
                        type=int,
                        default=0,
                        help="Score that is compared to the elements in a.")
    parser.add_argument("--ddof",
                        type=int,
                        default=0,
                        help="Degrees of freedom correction")
    parser.add_argument(
        "--score",
        type=int,
        default=0,
        help="Score that is compared to the elements in a.",
    )
    parser.add_argument("--m", type=float, default=0.0, help="limits")
    parser.add_argument("--mf", type=float, default=2.0, help="lower limit")
    parser.add_argument("--nf", type=float, default=99.9, help="higher_limit")
    parser.add_argument(
        "--p",
        type=float,
        default=0.5,
        help=
        "The hypothesized probability of success. 0 <= p <= 1. The default value is p = 0.5",
    )
    parser.add_argument("--alpha", type=float, default=0.9, help="probability")
    parser.add_argument(
        "--new",
        type=float,
        default=0.0,
        help="Value to put in place of values in a outside of bounds",
    )
    parser.add_argument(
        "--proportiontocut",
        type=float,
        default=0.0,
        help="Proportion (in range 0-1) of total data set to trim of each end.",
    )
    parser.add_argument(
        "--lambda_",
        type=float,
        default=1.0,
        help=
        "lambda_ gives the power in the Cressie-Read power divergence statistic",
    )
    parser.add_argument(
        "--imbda",
        type=float,
        default=0,
        help=
        "If lmbda is not None, do the transformation for that value.If lmbda is None, find the lambda that maximizes the log-likelihood function and return it as the second output argument.",
    )
    parser.add_argument(
        "--base",
        type=float,
        default=1.6,
        help="The logarithmic base to use, defaults to e",
    )
    parser.add_argument("--dtype", help="dtype")
    parser.add_argument("--med", help="med")
    parser.add_argument("--cdf", help="cdf")
    parser.add_argument("--zero_method", help="zero_method options")
    parser.add_argument("--dist", help="dist options")
    parser.add_argument("--ties", help="ties options")
    parser.add_argument("--alternative", help="alternative options")
    parser.add_argument("--mode", help="mode options")
    parser.add_argument("--method", help="method options")
    parser.add_argument("--md", help="md options")
    parser.add_argument("--center", help="center options")
    parser.add_argument("--kind", help="kind options")
    parser.add_argument("--tail", help="tail options")
    parser.add_argument("--interpolation", help="interpolation options")
    parser.add_argument("--statistic", help="statistic options")

    args = parser.parse_args()
    infile = args.infile
    outfile = open(args.outfile, "w+")
    test_id = args.test_id
    nf = args.nf
    mf = args.mf
    imbda = args.imbda
    inclusive1 = args.inclusive1
    inclusive2 = args.inclusive2
    sample0 = 0
    sample1 = 0
    sample2 = 0
    if args.sample_cols is not None:
        sample0 = 1
        barlett_samples = []
        for sample in args.sample_cols.split(";"):
            barlett_samples.append(map(int, sample.split(",")))
    if args.sample_one_cols is not None:
        sample1 = 1
        sample_one_cols = args.sample_one_cols.split(",")
    if args.sample_two_cols is not None:
        sample_two_cols = args.sample_two_cols.split(",")
        sample2 = 1
    for line in open(infile):
        sample_one = []
        sample_two = []
        cols = line.strip().split("\t")
        if sample0 == 1:
            b_samples = columns_to_values(barlett_samples, line)
        if sample1 == 1:
            for index in sample_one_cols:
                sample_one.append(cols[int(index) - 1])
        if sample2 == 1:
            for index in sample_two_cols:
                sample_two.append(cols[int(index) - 1])
        if test_id.strip() == "describe":
            size, min_max, mean, uv, bs, bk = stats.describe(
                map(float, sample_one))
            cols.append(size)
            cols.append(min_max)
            cols.append(mean)
            cols.append(uv)
            cols.append(bs)
            cols.append(bk)
        elif test_id.strip() == "mode":
            vals, counts = stats.mode(map(float, sample_one))
            cols.append(vals)
            cols.append(counts)
        elif test_id.strip() == "nanmean":
            m = stats.nanmean(map(float, sample_one))
            cols.append(m)
        elif test_id.strip() == "nanmedian":
            m = stats.nanmedian(map(float, sample_one))
            cols.append(m)
        elif test_id.strip() == "kurtosistest":
            z_value, p_value = stats.kurtosistest(map(float, sample_one))
            cols.append(z_value)
            cols.append(p_value)
        elif test_id.strip() == "variation":
            ra = stats.variation(map(float, sample_one))
            cols.append(ra)
        elif test_id.strip() == "itemfreq":
            freq = stats.itemfreq(map(float, sample_one))
            for list in freq:
                elements = ",".join(map(str, list))
                cols.append(elements)
        elif test_id.strip() == "nanmedian":
            m = stats.nanmedian(map(float, sample_one))
            cols.append(m)
        elif test_id.strip() == "variation":
            ra = stats.variation(map(float, sample_one))
            cols.append(ra)
        elif test_id.strip() == "boxcox_llf":
            IIf = stats.boxcox_llf(imbda, map(float, sample_one))
            cols.append(IIf)
        elif test_id.strip() == "tiecorrect":
            fa = stats.tiecorrect(map(float, sample_one))
            cols.append(fa)
        elif test_id.strip() == "rankdata":
            r = stats.rankdata(map(float, sample_one), method=args.md)
            cols.append(r)
        elif test_id.strip() == "nanstd":
            s = stats.nanstd(map(float, sample_one), bias=args.bias)
            cols.append(s)
        elif test_id.strip() == "anderson":
            A2, critical, sig = stats.anderson(map(float, sample_one),
                                               dist=args.dist)
            cols.append(A2)
            for list in critical:
                cols.append(list)
            cols.append(",")
            for list in sig:
                cols.append(list)
        elif test_id.strip() == "binom_test":
            p_value = stats.binom_test(map(float, sample_one),
                                       n=args.n,
                                       p=args.p)
            cols.append(p_value)
        elif test_id.strip() == "gmean":
            gm = stats.gmean(map(float, sample_one), dtype=args.dtype)
            cols.append(gm)
        elif test_id.strip() == "hmean":
            hm = stats.hmean(map(float, sample_one), dtype=args.dtype)
            cols.append(hm)
        elif test_id.strip() == "kurtosis":
            k = stats.kurtosis(
                map(float, sample_one),
                axis=args.axis,
                fisher=args.fisher,
                bias=args.bias,
            )
            cols.append(k)
        elif test_id.strip() == "moment":
            n_moment = stats.moment(map(float, sample_one), n=args.n)
            cols.append(n_moment)
        elif test_id.strip() == "normaltest":
            k2, p_value = stats.normaltest(map(float, sample_one))
            cols.append(k2)
            cols.append(p_value)
        elif test_id.strip() == "skew":
            skewness = stats.skew(map(float, sample_one), bias=args.bias)
            cols.append(skewness)
        elif test_id.strip() == "skewtest":
            z_value, p_value = stats.skewtest(map(float, sample_one))
            cols.append(z_value)
            cols.append(p_value)
        elif test_id.strip() == "sem":
            s = stats.sem(map(float, sample_one), ddof=args.ddof)
            cols.append(s)
        elif test_id.strip() == "zscore":
            z = stats.zscore(map(float, sample_one), ddof=args.ddof)
            for list in z:
                cols.append(list)
        elif test_id.strip() == "signaltonoise":
            s2n = stats.signaltonoise(map(float, sample_one), ddof=args.ddof)
            cols.append(s2n)
        elif test_id.strip() == "percentileofscore":
            p = stats.percentileofscore(map(float, sample_one),
                                        score=args.score,
                                        kind=args.kind)
            cols.append(p)
        elif test_id.strip() == "bayes_mvs":
            c_mean, c_var, c_std = stats.bayes_mvs(map(float, sample_one),
                                                   alpha=args.alpha)
            cols.append(c_mean)
            cols.append(c_var)
            cols.append(c_std)
        elif test_id.strip() == "sigmaclip":
            c, c_low, c_up = stats.sigmaclip(map(float, sample_one),
                                             low=args.m,
                                             high=args.n)
            cols.append(c)
            cols.append(c_low)
            cols.append(c_up)
        elif test_id.strip() == "kstest":
            d, p_value = stats.kstest(
                map(float, sample_one),
                cdf=args.cdf,
                N=args.N,
                alternative=args.alternative,
                mode=args.mode,
            )
            cols.append(d)
            cols.append(p_value)
        elif test_id.strip() == "chi2_contingency":
            chi2, p, dof, ex = stats.chi2_contingency(
                map(float, sample_one),
                correction=args.correction,
                lambda_=args.lambda_)
            cols.append(chi2)
            cols.append(p)
            cols.append(dof)
            cols.append(ex)
        elif test_id.strip() == "tmean":
            if nf == 0 and mf == 0:
                mean = stats.tmean(map(float, sample_one))
            else:
                mean = stats.tmean(map(float, sample_one), (mf, nf),
                                   (inclusive1, inclusive2))
            cols.append(mean)
        elif test_id.strip() == "tmin":
            if mf == 0:
                min = stats.tmin(map(float, sample_one))
            else:
                min = stats.tmin(map(float, sample_one),
                                 lowerlimit=mf,
                                 inclusive=args.inclusive)
            cols.append(min)
        elif test_id.strip() == "tmax":
            if nf == 0:
                max = stats.tmax(map(float, sample_one))
            else:
                max = stats.tmax(map(float, sample_one),
                                 upperlimit=nf,
                                 inclusive=args.inclusive)
            cols.append(max)
        elif test_id.strip() == "tvar":
            if nf == 0 and mf == 0:
                var = stats.tvar(map(float, sample_one))
            else:
                var = stats.tvar(map(float, sample_one), (mf, nf),
                                 (inclusive1, inclusive2))
            cols.append(var)
        elif test_id.strip() == "tstd":
            if nf == 0 and mf == 0:
                std = stats.tstd(map(float, sample_one))
            else:
                std = stats.tstd(map(float, sample_one), (mf, nf),
                                 (inclusive1, inclusive2))
            cols.append(std)
        elif test_id.strip() == "tsem":
            if nf == 0 and mf == 0:
                s = stats.tsem(map(float, sample_one))
            else:
                s = stats.tsem(map(float, sample_one), (mf, nf),
                               (inclusive1, inclusive2))
            cols.append(s)
        elif test_id.strip() == "scoreatpercentile":
            if nf == 0 and mf == 0:
                s = stats.scoreatpercentile(
                    map(float, sample_one),
                    map(float, sample_two),
                    interpolation_method=args.interpolation,
                )
            else:
                s = stats.scoreatpercentile(
                    map(float, sample_one),
                    map(float, sample_two),
                    (mf, nf),
                    interpolation_method=args.interpolation,
                )
            for list in s:
                cols.append(list)
        elif test_id.strip() == "relfreq":
            if nf == 0 and mf == 0:
                rel, low_range, binsize, ex = stats.relfreq(
                    map(float, sample_one), args.b)
            else:
                rel, low_range, binsize, ex = stats.relfreq(
                    map(float, sample_one), args.b, (mf, nf))
            for list in rel:
                cols.append(list)
            cols.append(low_range)
            cols.append(binsize)
            cols.append(ex)
        elif test_id.strip() == "binned_statistic":
            if nf == 0 and mf == 0:
                st, b_edge, b_n = stats.binned_statistic(
                    map(float, sample_one),
                    map(float, sample_two),
                    statistic=args.statistic,
                    bins=args.b,
                )
            else:
                st, b_edge, b_n = stats.binned_statistic(
                    map(float, sample_one),
                    map(float, sample_two),
                    statistic=args.statistic,
                    bins=args.b,
                    range=(mf, nf),
                )
            cols.append(st)
            cols.append(b_edge)
            cols.append(b_n)
        elif test_id.strip() == "threshold":
            if nf == 0 and mf == 0:
                o = stats.threshold(map(float, sample_one), newval=args.new)
            else:
                o = stats.threshold(map(float, sample_one),
                                    mf,
                                    nf,
                                    newval=args.new)
            for list in o:
                cols.append(list)
        elif test_id.strip() == "trimboth":
            o = stats.trimboth(map(float, sample_one),
                               proportiontocut=args.proportiontocut)
            for list in o:
                cols.append(list)
        elif test_id.strip() == "trim1":
            t1 = stats.trim1(
                map(float, sample_one),
                proportiontocut=args.proportiontocut,
                tail=args.tail,
            )
            for list in t1:
                cols.append(list)
        elif test_id.strip() == "histogram":
            if nf == 0 and mf == 0:
                hi, low_range, binsize, ex = stats.histogram(
                    map(float, sample_one), args.b)
            else:
                hi, low_range, binsize, ex = stats.histogram(
                    map(float, sample_one), args.b, (mf, nf))
            cols.append(hi)
            cols.append(low_range)
            cols.append(binsize)
            cols.append(ex)
        elif test_id.strip() == "cumfreq":
            if nf == 0 and mf == 0:
                cum, low_range, binsize, ex = stats.cumfreq(
                    map(float, sample_one), args.b)
            else:
                cum, low_range, binsize, ex = stats.cumfreq(
                    map(float, sample_one), args.b, (mf, nf))
            cols.append(cum)
            cols.append(low_range)
            cols.append(binsize)
            cols.append(ex)
        elif test_id.strip() == "boxcox_normmax":
            if nf == 0 and mf == 0:
                ma = stats.boxcox_normmax(map(float, sample_one))
            else:
                ma = stats.boxcox_normmax(map(float, sample_one), (mf, nf),
                                          method=args.method)
            cols.append(ma)
        elif test_id.strip() == "boxcox":
            if imbda == 0:
                box, ma, ci = stats.boxcox(map(float, sample_one),
                                           alpha=args.alpha)
                cols.append(box)
                cols.append(ma)
                cols.append(ci)
            else:
                box = stats.boxcox(map(float, sample_one),
                                   imbda,
                                   alpha=args.alpha)
                cols.append(box)
        elif test_id.strip() == "histogram2":
            h2 = stats.histogram2(map(float, sample_one),
                                  map(float, sample_two))
            for list in h2:
                cols.append(list)
        elif test_id.strip() == "ranksums":
            z_statistic, p_value = stats.ranksums(map(float, sample_one),
                                                  map(float, sample_two))
            cols.append(z_statistic)
            cols.append(p_value)
        elif test_id.strip() == "ttest_1samp":
            t, prob = stats.ttest_1samp(map(float, sample_one),
                                        map(float, sample_two))
            for list in t:
                cols.append(list)
            for list in prob:
                cols.append(list)
        elif test_id.strip() == "ansari":
            AB, p_value = stats.ansari(map(float, sample_one),
                                       map(float, sample_two))
            cols.append(AB)
            cols.append(p_value)
        elif test_id.strip() == "linregress":
            slope, intercept, r_value, p_value, stderr = stats.linregress(
                map(float, sample_one), map(float, sample_two))
            cols.append(slope)
            cols.append(intercept)
            cols.append(r_value)
            cols.append(p_value)
            cols.append(stderr)
        elif test_id.strip() == "pearsonr":
            cor, p_value = stats.pearsonr(map(float, sample_one),
                                          map(float, sample_two))
            cols.append(cor)
            cols.append(p_value)
        elif test_id.strip() == "pointbiserialr":
            r, p_value = stats.pointbiserialr(map(float, sample_one),
                                              map(float, sample_two))
            cols.append(r)
            cols.append(p_value)
        elif test_id.strip() == "ks_2samp":
            d, p_value = stats.ks_2samp(map(float, sample_one),
                                        map(float, sample_two))
            cols.append(d)
            cols.append(p_value)
        elif test_id.strip() == "mannwhitneyu":
            mw_stats_u, p_value = stats.mannwhitneyu(
                map(float, sample_one),
                map(float, sample_two),
                use_continuity=args.mwu_use_continuity,
            )
            cols.append(mw_stats_u)
            cols.append(p_value)
        elif test_id.strip() == "zmap":
            z = stats.zmap(map(float, sample_one),
                           map(float, sample_two),
                           ddof=args.ddof)
            for list in z:
                cols.append(list)
        elif test_id.strip() == "ttest_ind":
            mw_stats_u, p_value = stats.ttest_ind(map(float, sample_one),
                                                  map(float, sample_two),
                                                  equal_var=args.equal_var)
            cols.append(mw_stats_u)
            cols.append(p_value)
        elif test_id.strip() == "ttest_rel":
            t, prob = stats.ttest_rel(map(float, sample_one),
                                      map(float, sample_two),
                                      axis=args.axis)
            cols.append(t)
            cols.append(prob)
        elif test_id.strip() == "mood":
            z, p_value = stats.mood(map(float, sample_one),
                                    map(float, sample_two),
                                    axis=args.axis)
            cols.append(z)
            cols.append(p_value)
        elif test_id.strip() == "shapiro":
            W, p_value, a = stats.shapiro(map(float, sample_one),
                                          map(float, sample_two), args.reta)
            cols.append(W)
            cols.append(p_value)
            for list in a:
                cols.append(list)
        elif test_id.strip() == "kendalltau":
            k, p_value = stats.kendalltau(
                map(float, sample_one),
                map(float, sample_two),
                initial_lexsort=args.initial_lexsort,
            )
            cols.append(k)
            cols.append(p_value)
        elif test_id.strip() == "entropy":
            s = stats.entropy(map(float, sample_one),
                              map(float, sample_two),
                              base=args.base)
            cols.append(s)
        elif test_id.strip() == "spearmanr":
            if sample2 == 1:
                rho, p_value = stats.spearmanr(map(float, sample_one),
                                               map(float, sample_two))
            else:
                rho, p_value = stats.spearmanr(map(float, sample_one))
            cols.append(rho)
            cols.append(p_value)
        elif test_id.strip() == "wilcoxon":
            if sample2 == 1:
                T, p_value = stats.wilcoxon(
                    map(float, sample_one),
                    map(float, sample_two),
                    zero_method=args.zero_method,
                    correction=args.correction,
                )
            else:
                T, p_value = stats.wilcoxon(
                    map(float, sample_one),
                    zero_method=args.zero_method,
                    correction=args.correction,
                )
            cols.append(T)
            cols.append(p_value)
        elif test_id.strip() == "chisquare":
            if sample2 == 1:
                rho, p_value = stats.chisquare(map(float, sample_one),
                                               map(float, sample_two),
                                               ddof=args.ddof)
            else:
                rho, p_value = stats.chisquare(map(float, sample_one),
                                               ddof=args.ddof)
            cols.append(rho)
            cols.append(p_value)
        elif test_id.strip() == "power_divergence":
            if sample2 == 1:
                stat, p_value = stats.power_divergence(
                    map(float, sample_one),
                    map(float, sample_two),
                    ddof=args.ddof,
                    lambda_=args.lambda_,
                )
            else:
                stat, p_value = stats.power_divergence(map(float, sample_one),
                                                       ddof=args.ddof,
                                                       lambda_=args.lambda_)
            cols.append(stat)
            cols.append(p_value)
        elif test_id.strip() == "theilslopes":
            if sample2 == 1:
                mpe, met, lo, up = stats.theilslopes(map(float, sample_one),
                                                     map(float, sample_two),
                                                     alpha=args.alpha)
            else:
                mpe, met, lo, up = stats.theilslopes(map(float, sample_one),
                                                     alpha=args.alpha)
            cols.append(mpe)
            cols.append(met)
            cols.append(lo)
            cols.append(up)
        elif test_id.strip() == "combine_pvalues":
            if sample2 == 1:
                stat, p_value = stats.combine_pvalues(
                    map(float, sample_one),
                    method=args.med,
                    weights=map(float, sample_two),
                )
            else:
                stat, p_value = stats.combine_pvalues(map(float, sample_one),
                                                      method=args.med)
            cols.append(stat)
            cols.append(p_value)
        elif test_id.strip() == "obrientransform":
            ob = stats.obrientransform(*b_samples)
            for list in ob:
                elements = ",".join(map(str, list))
                cols.append(elements)
        elif test_id.strip() == "f_oneway":
            f_value, p_value = stats.f_oneway(*b_samples)
            cols.append(f_value)
            cols.append(p_value)
        elif test_id.strip() == "kruskal":
            h, p_value = stats.kruskal(*b_samples)
            cols.append(h)
            cols.append(p_value)
        elif test_id.strip() == "friedmanchisquare":
            fr, p_value = stats.friedmanchisquare(*b_samples)
            cols.append(fr)
            cols.append(p_value)
        elif test_id.strip() == "fligner":
            xsq, p_value = stats.fligner(center=args.center,
                                         proportiontocut=args.proportiontocut,
                                         *b_samples)
            cols.append(xsq)
            cols.append(p_value)
        elif test_id.strip() == "bartlett":
            T, p_value = stats.bartlett(*b_samples)
            cols.append(T)
            cols.append(p_value)
        elif test_id.strip() == "levene":
            w, p_value = stats.levene(center=args.center,
                                      proportiontocut=args.proportiontocut,
                                      *b_samples)
            cols.append(w)
            cols.append(p_value)
        elif test_id.strip() == "median_test":
            stat, p_value, m, table = stats.median_test(
                ties=args.ties,
                correction=args.correction,
                lambda_=args.lambda_,
                *b_samples)
            cols.append(stat)
            cols.append(p_value)
            cols.append(m)
            cols.append(table)
            for list in table:
                elements = ",".join(map(str, list))
                cols.append(elements)
        outfile.write("%s\n" % "\t".join(map(str, cols)))
    outfile.close()
Example #11
print('Anderson-Darling test result')
AD_test(df_high_quality)
AD_test(df_normal_quality)

######################
##  Variance Test   ##
######################
print('\n-- Variance Test Section --')
# The Variance Homogeneity Test will be done using the Fligner-Killeen test

matrix_fligner = [['Dataset', 'Statistic', 'p-value']]

statistic, p_value = stats.fligner(
    df_high_quality.iloc[:, 0], df_high_quality.iloc[:, 1],
    df_high_quality.iloc[:, 2], df_high_quality.iloc[:, 3],
    df_high_quality.iloc[:, 4], df_high_quality.iloc[:, 5],
    df_high_quality.iloc[:, 6], df_high_quality.iloc[:, 7],
    df_high_quality.iloc[:, 8], df_high_quality.iloc[:, 9],
    df_high_quality.iloc[:, 10], df_high_quality.iloc[:, 11])

matrix_fligner.append(['df_high_quality', statistic, p_value])

statistic, p_value = stats.fligner(
    df_normal_quality.iloc[:, 0], df_normal_quality.iloc[:, 1],
    df_normal_quality.iloc[:, 2], df_normal_quality.iloc[:, 3],
    df_normal_quality.iloc[:, 4], df_normal_quality.iloc[:, 5],
    df_normal_quality.iloc[:, 6], df_normal_quality.iloc[:, 7],
    df_normal_quality.iloc[:, 8], df_normal_quality.iloc[:, 9],
    df_normal_quality.iloc[:, 10], df_normal_quality.iloc[:, 11])

matrix_fligner.append(['df_normal_quality', statistic, p_value])
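Both twelve-argument calls can be shortened by unpacking a generator of columns, since stats.fligner accepts the samples as *args (a sketch over the same DataFrames):

statistic, p_value = stats.fligner(
    *(df_high_quality.iloc[:, i] for i in range(12)))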
Example #12
 def test_data(self):
     # numbers from R: fligner.test in package stats
     x1 = np.arange(5)
     assert_array_almost_equal(stats.fligner(x1, x1**2),
                               (3.2282229927203536, 0.072379187848207877), 11)
Example #13
import matplotlib.pyplot as plt
import seaborn as sns

sns.distplot(sco1, kde=False, fit=stats.norm)
sns.distplot(sco2, kde=False, fit=stats.norm)
plt.show()

# Check normality

print(stats.shapiro(sco1), '\n')  # 0.36799 > 0.05, so the distribution is normal
print(stats.shapiro(sco2), '\n')  # 0.67141 > 0.05, so the distribution is normal

# Homogeneity of variance
print(stats.levene(sco1, sco2).pvalue,
      '\n')  # most common test; 0.45684 > 0.05, so variances are equal
print(stats.fligner(sco1, sco2).pvalue, '\n')
print(stats.bartlett(sco1, sco2).pvalue, '\n')

print(stats.ttest_ind(sco1, sco2), '\n')  # when equal variances hold (the default)

#Ttest_indResult(statistic=-0.19649386929539883, pvalue=0.8450532207209545)
#Interpretation: p-value (0.8450) > 0.05, so we fail to reject the null hypothesis

print(stats.ttest_ind(sco1, sco2, equal_var=False))  # when equal variances do not hold

#If normality is not satisfied

#stats.wilcoxon(sco1,sco2)
#stats.kruskal()
#stats.mannwhitneyu()
Example #14
  #hfmt = dates.DateFormatter('%H:%M')
  #ax.xaxis.set_major_formatter(hfmt)
#  y_formatter = mpl.ticker.ScalarFormatter(useOffset=False)
#  ax.yaxis.set_major_formatter(y_formatter)
#  ax.grid(True)

  f.suptitle("Dichte der Leistungsgradienten")
  f.autofmt_xdate()
  plt.savefig("images/sonnenfinsternis-dichte-gradienten.png")#, bbox_inches='tight')

  plt.clf()
  friday_series, friday_vals = ecdf.get_ecdf(friday_momentum_df.momentum)
  ecdf.plot_ecdf_curve(friday_series, friday_vals, color="b", label="Typical Friday")
  eclipse_series, eclipse_vals = ecdf.get_ecdf(eclipse_momentum_df.momentum)
  ecdf.plot_ecdf_curve(eclipse_series, eclipse_vals, color="r", label="Solar eclipse")
  print "Mittelwert alle Freitage: %f" % np.median(friday_momentum_df.momentum)
  print "Mittelwert Sonnenfinsternis: %f" % np.median(eclipse_momentum_df.momentum)
  # http://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.levene.html#scipy.stats.levene
  W, p_val = stats.levene(friday_momentum_df.momentum,
      eclipse_momentum_df.momentum, center='median')
  print ("Levenes Test auf Gleichheit der Varianz: P=%s (gleiche Varianz für p<=0.05)" % p_val)


  W, p_val = stats.fligner(friday_momentum_df.momentum, eclipse_momentum_df.momentum)
  print "Fligner's test for equality of variance: p=%s" % p_val

  f.suptitle("ECDF der Leistungsgradienten: Ungleiche Varianzen (Levene, p=%f)" % p_val)
  plt.savefig("images/sonnenfinsternis-ecdf-gradienten.png")#, bbox_inches='tight')

 
Example #15
# =============================================================================
# ~ Analysis - Homoscedasticity
# =============================================================================
sep_("Homoscedasticity Analysis")

# We take a significance level of 0.05 to evaluate homoscedasticity between
# the two groups with the Fligner-Killeen test, since the data are not
# normal. The only variable showing homoscedasticity is SibSp.
print('\n--Fligner-Killeen test:',
      '\tH0: the variance is equal in both groups (homoscedasticity)',
      '\tH1: the variance differs between the groups (heteroscedasticity)',
      sep='\n')
alpha = 0.05
res = pd.DataFrame(columns=['Variable', 'Statistic', 'p-value', 'H0'])
for var in ['Age', 'Fare', 'SibSp', 'Parch']:
    stat, p = fligner(X0[var], X1[var])
    res = res.append(
        {
            'Variable': var,
            'Statistic': stat,
            'p-value': round(p, 6),
            'H0': p >= alpha
        },
        ignore_index=True)
print(res)
res_homo = res.copy()

# =============================================================================
# ~ Analysis - Central Tendency
# =============================================================================
sep_("Central Tendency Analysis")
Example #16
def fligner(pair):
    # Python 3 removed tuple parameter unpacking (PEP 3113)
    x, y = pair
    return stats.fligner(x, y)
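With the tuple argument made explicit, the function still works as a drop-in for map over pairs of samples, e.g. (x1, y1, x2, y2 are hypothetical sample arrays):

pairs = [(x1, y1), (x2, y2)]          # hypothetical sample pairs
results = list(map(fligner, pairs))   # one FlignerResult per pair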
Example #17
print("test mean: %s std: %s" % (test_mean, sqrt(test_var)))

# percent uplifts in test mean over control mean
percent_uplift_mean = ((test_mean - control_mean)/control_mean)*100
print("percent uplift in %s mean over %s mean: %s" % (test_group, control_group, percent_uplift_mean))

#########################  Hypothesis Testing
# compute pearsonr test for h_A:r_test != r_control
pearsonr_obj = stats.pearsonr(control_metric, test_metric)
#print("correlation coef: %s p-value: %s" % (pearsonr_obj[0], pearsonr_obj[1]))
p_rtest = pearsonr_obj[1]
is_correlated = p_rtest <= alpha
print("are groups correlated? %s" % is_correlated)

# compute Fligner's test for h_A: sig^2_test != sig^2_control
fligner_obj = stats.fligner(control_metric, test_metric, center="mean")
p_fligner = fligner_obj[1]
is_var_equal = p_fligner > alpha
print("is variance of groups equal? %s" % is_var_equal)

# compute student t test for h_A: mu_test != mu_control
if is_correlated:
    p_ttest = stats.ttest_rel(control_metric, test_metric)[1]
else:
    p_ttest = stats.ttest_ind(control_metric, test_metric, equal_var=is_var_equal)[1]

print("t test p value: %s" % p_ttest)

# output test results
if p_ttest <= alpha:
  print("reject null hypothesis, means are not equal")
Example #18
def plotExpBox_Main(inputFiles, headers, valcols, outputFile, sep, startRow,
                    showIndPoints, mark, markMean, showMean, notch, whisker,
                    outliers, plotPvalueCluster, outputClusterPrefix,
                    methodCluster, xlegendrotation, xlabe, ylabe, figsz, titl,
                    showSampleSizes, trimToMinSize, relabels, logb,
                    plotHistogramToFile, plotMedianForGroups, botta,
                    showViolin, showBox, firstColAnnot, plotTrend, showLegend,
                    makePzfxFile, makeBinMatrix, writeDataSummaryStat,
                    summaryStatRange, minuslog10pvalue, minNDataToKeep,
                    vfacecolor, valpha, outXYZPvalues, dividePlots):

    #if plotPvalueCluster:
    #if pvalue cluster is needed:
    #	from Bio.Cluster.cluster import *
    #	from Bio.Cluster import *
    #endif

    #the real deal!
    plotData = []
    xtickLabels = []

    trendData = {}
    annot = {}

    minSize = -1

    for inputFile, header, cols in zip(inputFiles, headers, valcols):
        fin = generic_istream(inputFile)

        startIdx = len(plotData)

        if firstColAnnot:
            colAnnot = cols[0]
            cols = cols[1:]
            annotThisFile = []
            annot[startIdx] = annotThisFile
        else:
            colAnnot = -1
            annotThisFile = None

        for col in cols:
            plotData.append([])
            xtickLabels.append(header[col])

        colIndices = range(startIdx, startIdx + len(cols))

        if plotTrend:
            #print >> stderr,"plotTrend"
            trendDataThisFile = []
            trendData[startIdx] = trendDataThisFile
        else:
            trendDataThisFile = None

        lino = 0
        for lin in fin:
            lino += 1
            if lino < startRow:
                continue
            fields = lin.rstrip("\r\n").split(sep)

            if plotTrend:
                #print >> stderr,"a"
                trendDataThisLine = []
            else:
                trendDataThisLine = None

            allDataOKThisLine = True

            if colAnnot >= 0:
                annotThisFile.append(fields[colAnnot])

            for idx, col in zip(colIndices, cols):
                try:
                    value = float(fields[col])
                    if logb != 0:
                        if value == 0.0:
                            raise ValueError
                        value = log(value) / logb
                    plotData[idx].append(value)

                    if plotTrend:
                        trendDataThisLine.append(value)
                        #print >> stderr,"value:",value

                except:
                    allDataOKThisLine = False

            if plotTrend:
                if allDataOKThisLine:
                    trendDataThisFile.append(trendDataThisLine)
                else:
                    trendDataThisFile.append(None)

        fin.close()

        if minSize == -1:
            minSize = len(plotData[idx])  #or startIDX?
        else:
            minSize = min([minSize, len(plotData[idx])])

    if trimToMinSize:
        print >> stderr, "trimming to min size =", minSize
        trimData(plotData, minSize)

    if len(relabels) > 0:
        #if len(relabels)!=len(xtickLabels):
        #	print >> stderr,"relabels doesn't have the same length as original label vectors",xtickLabels,"=>",relabels
        #	exit()
        print >> stderr, xtickLabels
        print >> stderr, relabels
        for i, relabel in zip(range(0, len(relabels)), relabels):
            xtickLabels[i] = relabel

    for i in range(0, len(plotMedianForGroups)):
        plotMedianForGroups[i] = getCol0ListFromCol1ListStringAdv(
            xtickLabels, plotMedianForGroups[i])

    #drawing medians:
    medianToDraw = []
    for mediangrouper in plotMedianForGroups:
        curD = []
        for c in mediangrouper:
            curD.extend(plotData[c])
        medianToDraw.append(median(curD))

    for c in range(len(plotData) - 1, -1, -1):
        if len(plotData[c]) < minNDataToKeep:
            print >> stderr, xtickLabels[c], "discarded because has only", len(
                plotData[c]), "data points <", minNDataToKeep
            del plotData[c]
            del xtickLabels[c]

    if not skipStat:
        print >> stdout, "student t-test (1 sample; mean=0)"
        print >> stdout, "sample", "mean", "p-val", "median"

        if writeDataSummaryStat:
            fDSS = open(writeDataSummaryStat, "w")
            print >> fDSS, "sample\tmean\tvar\tsd\tmin\tmax\tN\tNInRange[" + str(
                summaryStatRange[0]) + "," + str(
                    summaryStatRange[1]
                ) + "]\t%NInRange\tNbelowRange\t%Nbelow\tNAboveRange\t%NAbove"

        for x in range(0, len(plotData)):
            #print >> stderr, len(plotData[x])
            try:
                print >> stdout, xtickLabels[x], mean(
                    plotData[x]), ttest_1samp(plotData[x],
                                              0)[1], median(plotData[x])
            except:
                print >> stdout, xtickLabels[x], mean(
                    plotData[x]), "NA", median(plotData[x])

            if writeDataSummaryStat:
                sumData, N, NIN, NBelow, NAbove = filterDataInRangeInclusive(
                    plotData[x], summaryStatRange[0], summaryStatRange[1])

                if NIN > 1:
                    #print >> stderr,"sumData=",sumData
                    #print >> stderr,mean
                    mea = mean2(sumData)
                    DDOF = 1
                    sd = std(sumData, ddof=DDOF)
                    var = sd * sd
                    mi = min(sumData)
                    ma = max(sumData)
                else:
                    mea = "NA"
                    sd = "NA"
                    var = "NA"
                    mi = "NA"
                    ma = "NA"

                print >> fDSS, xtickLabels[x] + "\t" + str(mea) + "\t" + str(
                    var) + "\t" + str(sd) + "\t" + str(mi) + "\t" + str(
                        ma) + "\t" + str(N) + "\t" + str(NIN) + "\t" + str(
                            float(NIN) * 100 /
                            N) + "\t" + str(NBelow) + "\t" + str(
                                float(NBelow) * 100 /
                                N) + "\t" + str(NAbove) + "\t" + str(
                                    float(NAbove) * 100 / N)

        pvalueM = []

        if writeDataSummaryStat:
            fDSS.close()

        print >> stdout, ""

        print >> stdout, "student t-test (2 samples)"
        print >> stdout, "p-val",
        for x in range(0, len(plotData)):
            print >> stdout, xtickLabels[x],

        print >> stdout, ""

        for x in range(0, len(plotData)):
            pvalueRow = []
            pvalueM.append(pvalueRow)
            print >> stdout, xtickLabels[x],
            for y in range(0, len(plotData)):
                if y <= x:
                    print >> stdout, "",
                    if x == y:
                        if minuslog10pvalue:
                            pvalueRow.append(0.0)
                        else:
                            pvalueRow.append(1.0)
                    else:
                        pvalueRow.append(pvalueM[y][x])
                else:
                    try:
                        pvalue = ttest_ind(plotData[x], plotData[y])[1]
                    except:
                        pvalue = 1.0

                    if minuslog10pvalue and str(pvalue) != "NA":
                        try:
                            pvalue = -1 * log(pvalue, 10)
                        except:
                            pvalue = -1000.0

                    print >> stdout, str(pvalue),
                    pvalueRow.append(pvalue)
            print >> stdout, ""

        print >> stdout, ""

        if plotPvalueCluster:
            makePValueRawPlot(outputClusterPrefix + "_t_raw", xtickLabels,
                              pvalueM)
            makePValueClusterPlot(outputClusterPrefix + "_t", xtickLabels,
                                  pvalueM, methodCluster)

        pvalueM = []

        print >> stdout, "welch t-test"
        print >> stdout, "p-val",
        for x in range(0, len(plotData)):
            print >> stdout, xtickLabels[x],

        print >> stdout, ""
        for x in range(0, len(plotData)):
            pvalueRow = []
            pvalueM.append(pvalueRow)
            print >> stdout, xtickLabels[x],
            for y in range(0, len(plotData)):
                if y <= x:
                    print >> stdout, "",
                    if x == y:
                        if minuslog10pvalue:
                            pvalueRow.append(0.0)
                        else:
                            pvalueRow.append(1.0)
                    else:
                        pvalueRow.append(pvalueM[y][x])

                else:
                    try:
                        pvalue = welchs_approximate_ttest_arr(
                            plotData[x], plotData[y])[3]
                    except:
                        pvalue = 1.0

                    if minuslog10pvalue and str(pvalue) != "NA":
                        try:
                            pvalue = -1 * log(pvalue, 10)
                        except:
                            pvalue = -1000.0

                    print >> stdout, str(pvalue),
                    pvalueRow.append(pvalue)
            print >> stdout, ""

        if outXYZPvalues:
            writeXYZPvalues(outXYZPvalues + "_Welch.xyz", xtickLabels, pvalueM)

        if plotPvalueCluster:
            makePValueRawPlot(outputClusterPrefix + "_Welch_raw", xtickLabels,
                              pvalueM)
            makePValueClusterPlot(outputClusterPrefix + "_Welch", xtickLabels,
                                  pvalueM, methodCluster)

        print >> stdout, ""
        print >> stdout, "non-parametric (Mann-Whitney U)"  #"non-parametric (Mann-Whitney U if larger n<=20 else Wilcoxon)"
        print >> stdout, "p-val",
        for x in range(0, len(plotData)):
            print >> stdout, xtickLabels[x],

        pvalueM = []

        print >> stdout, ""
        for x in range(0, len(plotData)):
            pvalueRow = []
            pvalueM.append(pvalueRow)
            print >> stdout, xtickLabels[x],
            for y in range(0, len(plotData)):
                if y <= x:
                    print >> stdout, "",
                    if x == y:
                        if minuslog10pvalue:
                            pvalueRow.append(0.0)
                        else:
                            pvalueRow.append(1.0)
                    else:
                        pvalueRow.append(pvalueM[y][x])
                else:
                    #if max(len(plotData[x]),len(plotData[y]))<=20:
                    try:
                        pvalue = mannwhitneyu(plotData[x], plotData[y])[1] * 2
                    except:
                        pvalue = 1.0

                    if minuslog10pvalue and str(pvalue) != "NA":
                        try:
                            pvalue = -1 * log(pvalue, 10)
                        except:
                            pvalue = -1000.0

                    print >> stdout, pvalue,  #mann-whiteney need to mul by 2 (one tail to two tail)
                    pvalueRow.append(pvalue)
                    #else:
                    #	print >>  stdout,wilcoxon(plotData[x],plotData[y])[1], # this is two-tailed already stdout, "", #
            print >> stdout, ""

        if outXYZPvalues:
            writeXYZPvalues(outXYZPvalues + "_U.xyz", xtickLabels, pvalueM)

        if plotPvalueCluster:
            makePValueRawPlot(outputClusterPrefix + "_U_raw", xtickLabels,
                              pvalueM)
            makePValueClusterPlot(outputClusterPrefix + "_U", xtickLabels,
                                  pvalueM, methodCluster)

        #####now the variance tests

        print >> stdout, ""
        print >> stdout, "Ansari-Bradley Two-sample Test for difference in scale parameters "
        print >> stdout, "p-val",

        for x in range(0, len(plotData)):
            print >> stdout, xtickLabels[x],

        pvalueM = []

        print >> stdout, ""
        for x in range(0, len(plotData)):
            pvalueRow = []
            pvalueM.append(pvalueRow)
            print >> stdout, xtickLabels[x],
            for y in range(0, len(plotData)):
                if y <= x:
                    print >> stdout, "",
                    if x == y:
                        if minuslog10pvalue:
                            pvalueRow.append(0.0)
                        else:
                            pvalueRow.append(1.0)
                    else:
                        pvalueRow.append(pvalueM[y][x])
                else:
                    #if max(len(plotData[x]),len(plotData[y]))<=20:
                    try:
                        pvalue = ansari(plotData[x], plotData[y])[1]
                    except:
                        pvalue = "NA"

                    if minuslog10pvalue and str(pvalue) != "NA":
                        try:
                            pvalue = -1 * log(pvalue, 10)
                        except:
                            pvalue = -1000.0

                        #pvalue=1.0
                    print >> stdout, pvalue,
                    pvalueRow.append(pvalue)
                    #else:
                    #    print >> stdout, wilcoxon(plotData[x], plotData[y])[1],  # wilcoxon is already two-tailed
            print >> stdout, ""

        if plotPvalueCluster:
            makePValueRawPlot(outputClusterPrefix + "_Ansari_raw", xtickLabels,
                              pvalueM)
            makePValueClusterPlot(outputClusterPrefix + "_Ansari", xtickLabels,
                                  pvalueM, methodCluster)

        #####

        #####now the variance tests

        print >> stdout, ""
        print >> stdout, "Fligner's Two-sample Test for equal variance (non-parametrics)"
        print >> stdout, "p-val",

        for x in range(0, len(plotData)):
            print >> stdout, xtickLabels[x],

        pvalueM = []

        print >> stdout, ""
        for x in range(0, len(plotData)):
            pvalueRow = []
            pvalueM.append(pvalueRow)
            print >> stdout, xtickLabels[x],
            for y in range(0, len(plotData)):
                if y <= x:
                    print >> stdout, "",
                    if x == y:
                        if minuslog10pvalue:
                            pvalueRow.append(0.0)
                        else:
                            pvalueRow.append(1.0)
                    else:
                        pvalueRow.append(pvalueM[y][x])
                else:
                    #if max(len(plotData[x]),len(plotData[y]))<=20:
                    try:
                        pvalue = fligner(plotData[x], plotData[y])[1]
                    except:
                        pvalue = "NA"
                        #pvalue=1.0

                    if minuslog10pvalue and str(pvalue) != "NA":
                        try:
                            pvalue = -1 * log(pvalue, 10)
                        except:
                            pvalue = -1000.0

                    print >> stdout, pvalue,
                    pvalueRow.append(pvalue)
                    #else:
                    #    print >> stdout, wilcoxon(plotData[x], plotData[y])[1],  # wilcoxon is already two-tailed
            print >> stdout, ""

        if plotPvalueCluster:
            makePValueRawPlot(outputClusterPrefix + "_fligner_raw",
                              xtickLabels, pvalueM)
            makePValueClusterPlot(outputClusterPrefix + "_fligner",
                                  xtickLabels, pvalueM, methodCluster)

        #####

        #####now the variance tests

        print >> stdout, ""
        print >> stdout, "Levene's Two-sample Test for equal variance"
        print >> stdout, "p-val",

        for x in range(0, len(plotData)):
            print >> stdout, xtickLabels[x],

        pvalueM = []

        print >> stdout, ""
        for x in range(0, len(plotData)):
            pvalueRow = []
            pvalueM.append(pvalueRow)
            print >> stdout, xtickLabels[x],
            for y in range(0, len(plotData)):
                if y <= x:
                    print >> stdout, "",
                    if x == y:
                        if minuslog10pvalue:
                            pvalueRow.append(0.0)
                        else:
                            pvalueRow.append(1.0)
                    else:
                        pvalueRow.append(pvalueM[y][x])
                else:
                    #if max(len(plotData[x]),len(plotData[y]))<=20:
                    try:
                        pvalue = levene(plotData[x], plotData[y])[1]
                    except:
                        pvalue = "NA"
                        #pvalue=1.0

                    if minuslog10pvalue and str(pvalue) != "NA":
                        try:
                            pvalue = -1 * log(pvalue, 10)
                        except:
                            pvalue = -1000.0

                    print >> stdout, pvalue,
                    pvalueRow.append(pvalue)
                    #else:
                    #    print >> stdout, wilcoxon(plotData[x], plotData[y])[1],  # wilcoxon is already two-tailed
            print >> stdout, ""

        if plotPvalueCluster:
            makePValueRawPlot(outputClusterPrefix + "_levene_raw", xtickLabels,
                              pvalueM)
            makePValueClusterPlot(outputClusterPrefix + "_levene", xtickLabels,
                                  pvalueM, methodCluster)

        #####

        #####now the variance tests

        print >> stdout, ""
        print >> stdout, "Bartlett's Two-sample Test for equal variance (for normal distributions)"
        print >> stdout, "p-val",

        for x in range(0, len(plotData)):
            print >> stdout, xtickLabels[x],

        pvalueM = []

        print >> stdout, ""
        for x in range(0, len(plotData)):
            pvalueRow = []
            pvalueM.append(pvalueRow)
            print >> stdout, xtickLabels[x],
            for y in range(0, len(plotData)):
                if y <= x:
                    print >> stdout, "",
                    if x == y:
                        if minuslog10pvalue:
                            pvalueRow.append(0.0)
                        else:
                            pvalueRow.append(1.0)
                    else:
                        pvalueRow.append(pvalueM[y][x])
                else:
                    #if max(len(plotData[x]),len(plotData[y]))<=20:
                    try:
                        pvalue = bartlett(plotData[x], plotData[y])[1]
                    except:
                        pvalue = "NA"
                        #pvalue=1.0

                    if minuslog10pvalue and str(pvalue) != "NA":
                        try:
                            pvalue = -1 * log(pvalue, 10)
                        except:
                            pvalue = -1000.0

                    print >> stdout, pvalue,
                    pvalueRow.append(pvalue)
                    #else:
                    #    print >> stdout, wilcoxon(plotData[x], plotData[y])[1],  # wilcoxon is already two-tailed
            print >> stdout, ""

        if plotPvalueCluster:
            makePValueRawPlot(outputClusterPrefix + "_bartlett_raw",
                              xtickLabels, pvalueM)
            makePValueClusterPlot(outputClusterPrefix + "_bartlett",
                                  xtickLabels, pvalueM, methodCluster)

        #####

    figure(figsize=figsz)
    subplots_adjust(top=0.9, bottom=botta, left=0.2, right=0.8)

    if len(titl) == 0:
        titl = outputFile

    plotExpBox(plotData, xtickLabels, showIndPoints, mark, markMean, showMean,
               notch, whisker, outliers, xlegendrotation, xlabe, ylabe, titl,
               showSampleSizes, showViolin, showBox, annot, trendData,
               showLegend, makePzfxFile, makeBinMatrix, dividePlots)

    #ylim([0,200])
    for m in medianToDraw:
        axhline(y=m, linestyle=':', color='gray')

    savefig(outputFile, bbox_inches="tight")

    if len(plotHistogramToFile) > 0:
        drawHistogram(plotHistogramToFile, plotData, xtickLabels)
        drawDensigram(plotHistogramToFile + ".density.png", plotData,
                      xtickLabels)
Esempio n. 19
0
# When the data do not meet the normality condition, as we saw in the previous section, we use the Fligner-Killeen test
#
# The null hypothesis assumes equal variances across the different groups of data, so p-values below the significance level (0.05) indicate heteroscedasticity.
#
# In this case we apply it to all the variables, comparing the attributes (data_wine) against the target variable (quality)

# In[25]:

# Apply the Fligner test
from scipy.stats import fligner

names = list(data_wine.columns)

# Take each variable/attribute and compare it against quality
for i in range(len(names)):
    stat, p = fligner(data_wine.iloc[:, i], wine['quality'])
    print(names[i], ': p = %.3f' % (p))

# We therefore find heteroscedasticity

# ## 4.3. Application of statistical tests to compare the groups of data.
# #### Depending on the data and the goal of the study, apply hypothesis tests, correlations, regressions, etc. Apply at least three different analysis methods.

# ### ***Hypothesis testing***

# * #### **Mann-Whitney test**

# For the case where we consider 2 levels for the 'rating' variable, or we can also use the 'quality' variable directly

# In[26]:
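
# The original cell is missing from this excerpt. Below is a minimal sketch of
# the Mann-Whitney test described above, assuming a binary 'rating' derived
# from 'quality' and the 'alcohol' attribute as an illustration; everything
# except wine['quality'] is an assumption, not code from the source.
from scipy.stats import mannwhitneyu

rating = (wine['quality'] >= 6).astype(int)  # assumed low/high split
low = wine.loc[rating == 0, 'alcohol']
high = wine.loc[rating == 1, 'alcohol']
u_stat, p = mannwhitneyu(low, high, alternative='two-sided')
print('Mann-Whitney U: U = %.1f, p = %.4f' % (u_stat, p))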
plotting.plot_stat_map(roi_score_img, title='Linear model',
                       cut_coords=cut_coords)
plt.savefig('lm.png')

plotting.plot_roi(atlas, title="Harvard Oxford atlas", cut_coords=cut_coords)
# print labels

from scipy.stats import fligner
X = roi_masker.transform(func_filename)
y, session = np.loadtxt(haxby_dataset.session_target).astype('int').T
conditions = np.recfromtxt(haxby_dataset.conditions_target)['f0']
non_rest = conditions != b'rest'
conditions = conditions[non_rest]
y, session = y[non_rest], session[non_rest]
y = y[session < 4]

var_stat = np.zeros(X.shape[1])
for j, x in enumerate(X.T):
    _, var_stat[j] = fligner(
        x[y == 8], x[y == 1], x[y == 2], x[y == 3],
        x[y == 4], x[y == 5], x[y == 6], x[y == 7])

var_img = roi_masker.inverse_transform(
    - np.hstack((0, np.log10(var_stat)))[np.newaxis])
plotting.plot_stat_map(var_img, cut_coords=cut_coords, title='Fligner test',
                       vmax=4)
plt.savefig('var_stat.png')


plt.show()
Esempio n. 22
0
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import f as ftest  # imports inferred from the calls below
from scipy.stats import (bartlett, fligner, levene, mannwhitneyu,
                         median_test, ttest_ind, ttest_rel, wilcoxon)


def starplot(df=[],
             x='',
             y='',
             data=[],
             index=[],
             columns=[],
             fold=False,
             foldcol=0,
             mode=3,
             errorbar=True,
             plottype='barplot',
             stats='independent t test',
             test_var=False,
             stats_var='f test',
             crit_var=0.05,
             equal_var=True,
             rotate=0,
             elinewidth=0.5,
             fontsize=14,
             capsize=4,
             noffset_ylim=35,
             noffset_fst=10,
             noffset_diff=10,
             star_size=3,
             linewidth=1,
             crit=[0.05, 0.01, 0.001, 0.0001]):
    # data: list of data matrices (or DataFrames) for comparison (rows: obs, columns: var)
    # index: var, columns: obs
    # adjacent: annotate stars between adjacent bars
    # control: annotate stars between all other bars and a selective control bar
    # mix: mixed mode
    # 3: annotate stars for all combinations of bars (only 3 bars available)
    # (a usage sketch follows the function body below)

    crit = np.array(crit)
    plt.rcParams['font.family'] = 'Times New Roman'
    fig, ax = plt.subplots()
    star = ['*', '**', '***', '****']
    n = len(data)
    m = data[0].shape[1]
    test = pd.DataFrame()
    for i, j in enumerate(data):
        if type(test) == type(j):
            data[i] = j.values.reshape(len(j.index), len(j.columns))
    if plottype == 'barplot':
        error = pd.DataFrame()
        mean = pd.DataFrame()
        for i in range(m):
            error[i] = [data[j][:, i].std() for j in range(n)]
            mean[i] = [data[j][:, i].mean() for j in range(n)]
        error = error.transpose()
        mean = mean.transpose()
        if len(index) != 0:
            error.index = index
            mean.index = index
        if len(columns) != 0:
            error.columns = columns
            mean.columns = columns
        if fold == True:
            oldmean = mean.copy()
            olderror = error.copy()
            for i in range(len(mean.columns)):
                mean.iloc[:, i] = oldmean.iloc[:, i] / oldmean.iloc[:, foldcol]
                error.iloc[:,
                           i] = olderror.iloc[:, i] / oldmean.iloc[:, foldcol]
        if errorbar == True:
            plot = mean.plot.bar(yerr=error,
                                 ax=ax,
                                 rot=rotate,
                                 capsize=capsize,
                                 error_kw=dict(elinewidth=elinewidth),
                                 fontsize=fontsize)
            max_bar = [[mean.iloc[j, i] + error.iloc[j, i] for i in range(n)]
                       for j in range(m)]
            min_bar = [
                mean.iloc[j, i] - error.iloc[j, i] for i in range(n)
                for j in range(m)
            ]
        else:
            plot = mean.plot.bar(ax=ax,
                                 rot=rotate,
                                 capsize=capsize,
                                 error_kw=dict(elinewidth=elinewidth),
                                 fontsize=fontsize)
            max_bar = [[mean.iloc[j, i] for i in range(n)] for j in range(m)]
            min_bar = [mean.iloc[j, i] for i in range(n) for j in range(m)]
    elif plottype == 'boxplot':
        print("under buiding")
    ylim = 0
    offset = max([max_bar[i][j] for i in range(m) for j in range(n)]) / 100
    blank = []
    if mode == 3:
        for j in range(m):
            level = np.zeros(n)
            for i in range(n):
                if i < n - 1:
                    k = i + 1
                else:
                    k = 0
                if test_var == True:
                    if stats_var == 'f test':
                        f = 0.5 - abs(0.5 - ftest.sf(
                            data[i][:, j].var() / data[k][:, j].var(),
                            len(data[i][:, j]) - 1,
                            len(data[k][:, j]) - 1))
                        if crit_var / 2 > f:
                            equal_var = False
                        else:
                            equal_var = True
                    else:
                        if stats_var == 'bartlett':
                            f = bartlett(data[i][:, j], data[k][:, j])[1]
                        elif stats_var == 'levene':
                            f = levene(data[i][:, j], data[k][:, j])[1]
                        elif stats_var == 'fligner':
                            f = fligner(data[i][:, j], data[k][:, j])[1]
                        if crit_var > f:
                            equal_var = False
                        else:
                            equal_var = True
                if stats == 'independent t test':
                    p = ttest_ind(data[i][:, j],
                                  data[k][:, j],
                                  equal_var=equal_var)[1]
                elif stats == 'paired t test':
                    if equal_var == True:
                        p = ttest_rel(data[i][:, j], data[k][:, j])[1]
                    else:
                        p = 0
                elif stats == 'median test':
                    p = median_test(data[i][:, j], data[k][:, j])[1]
                elif stats == 'mannwhitneyu':
                    if equal_var == True:
                        p = mannwhitneyu(data[i][:, j], data[k][:, j])[1]
                    else:
                        p = 0
                elif stats == 'wilcoxon':
                    if equal_var == True:
                        p = wilcoxon(data[i][:, j], data[k][:, j])[1]
                    else:
                        p = 0
                level[i] = len(crit) - len(crit.compress(p > crit))
            for k in range(n):
                height = 0
                if level[k] != 0 and k != n - 1:
                    center = [
                        plot.patches[k * m + j].get_x(),
                        plot.patches[k * m + m + j].get_x()
                    ]
                    height = max([max_bar[j][k], max_bar[j][k + 1]])
                    h1 = max_bar[j][k]
                    h2 = max_bar[j][k + 1]
                    width = plot.patches[k * m + j].get_width()
                    blank.append(
                        (center[0] + width / 2,
                         height + noffset_fst * offset + (-1)**k * 2 * offset))
                    blank.append(
                        (center[1] + width / 2,
                         height + noffset_fst * offset + (-1)**k * 2 * offset))
                    ax.vlines(x=center[0] + width / 2,
                              ymin=h1 + offset * 2,
                              ymax=height + noffset_fst * offset +
                              (-1)**k * 2 * offset,
                              lw=linewidth)
                    ax.vlines(x=center[1] + width / 2,
                              ymin=h2 + offset * 2,
                              ymax=height + noffset_fst * offset +
                              (-1)**k * 2 * offset,
                              lw=linewidth)
                    ax.annotate(star[int(level[k] - 1)],
                                xy=((center[0] + center[1]) / 2 + width / 2,
                                    height + (noffset_fst + 1) * offset +
                                    (-1)**k * 2 * offset),
                                ha='center',
                                size=star_size)
                elif level[k] != 0 and k == n - 1:
                    center = [
                        plot.patches[j].get_x(),
                        plot.patches[k * m + j].get_x()
                    ]
                    height = max(max_bar[j])
                    h1 = max_bar[j][0]
                    h2 = max_bar[j][k]
                    blank.append(
                        (center[0] + width / 2,
                         height + (noffset_fst + noffset_diff) * offset))
                    blank.append((center[1] + width / 2, height + 20 * offset))
                    ax.vlines(x=center[0] + width / 2,
                              ymin=h1 + offset * 2,
                              ymax=height +
                              (noffset_fst + noffset_diff) * offset,
                              lw=linewidth)
                    ax.vlines(x=center[1] + width / 2,
                              ymin=h2 + offset * 2,
                              ymax=height +
                              (noffset_fst + noffset_diff) * offset,
                              lw=linewidth)
                    ax.annotate(star[int(level[k] - 1)],
                                xy=((center[0] + center[1]) / 2 + width / 2,
                                    height +
                                    (noffset_fst + noffset_diff + 1) * offset),
                                ha='center',
                                size=star_size)
                if height > ylim:
                    ylim = height
    if mode == 'adjacent':
        for j in range(m):
            level = np.zeros(n - 1)
            for i in range(n - 1):
                k = i + 1
                if test_var == True:
                    if stats_var == 'f test':
                        f = 0.5 - abs(0.5 - ftest.sf(
                            data[i][:, j].var() / data[k][:, j].var(),
                            len(data[i][:, j]) - 1,
                            len(data[k][:, j]) - 1))
                        if crit_var / 2 > f:
                            equal_var = False
                        else:
                            equal_var = True
                    else:
                        if stats_var == 'bartlett':
                            f = bartlett(data[i][:, j], data[k][:, j])[1]
                        elif stats_var == 'levene':
                            f = levene(data[i][:, j], data[k][:, j])[1]
                        elif stats_var == 'fligner':
                            f = fligner(data[i][:, j], data[k][:, j])[1]
                        if crit_var > f:
                            equal_var = False
                        else:
                            equal_var = True
                if stats == 'independent t test':
                    p = ttest_ind(data[i][:, j],
                                  data[k][:, j],
                                  equal_var=equal_var)[1]
                elif stats == 'paired t test':
                    if equal_var == True:
                        p = ttest_rel(data[i][:, j], data[k][:, j])[1]
                    else:
                        p = 0
                elif stats == 'median test':
                    p = median_test(data[i][:, j], data[k][:, j])[1]
                elif stats == 'mannwhitneyu':
                    if equal_var == True:
                        p = mannwhitneyu(data[i][:, j], data[k][:, j])[1]
                    else:
                        p = 0
                elif stats == 'wilcoxon':
                    if equal_var == True:
                        p = wilcoxon(data[i][:, j], data[k][:, j])[1]
                    else:
                        p = 0
                level[i] = len(crit) - len(crit.compress(p > crit))
            for k in range(n - 1):
                height = 0
                if level[k] != 0:
                    center = [
                        plot.patches[k * m + j].get_x(),
                        plot.patches[k * m + m + j].get_x()
                    ]
                    height = max([max_bar[j][k], max_bar[j][k + 1]])
                    h1 = max_bar[j][k]
                    h2 = max_bar[j][k + 1]
                    width = plot.patches[k * m + j].get_width()
                    blank.append(
                        (center[0] + width / 2,
                         height + noffset_fst * offset + (-1)**k * 2 * offset))
                    blank.append(
                        (center[1] + width / 2,
                         height + noffset_fst * offset + (-1)**k * 2 * offset))
                    ax.vlines(x=center[0] + width / 2,
                              ymin=h1 + offset * 2,
                              ymax=height + noffset_fst * offset +
                              (-1)**k * 2 * offset,
                              lw=linewidth)
                    ax.vlines(x=center[1] + width / 2,
                              ymin=h2 + offset * 2,
                              ymax=height + noffset_fst * offset +
                              (-1)**k * 2 * offset,
                              lw=linewidth)
                    ax.annotate(star[int(level[k] - 1)],
                                xy=((center[0] + center[1]) / 2 + width / 2,
                                    height + (noffset_fst + 1) * offset +
                                    (-1)**k * 2 * offset),
                                ha='center',
                                size=star_size)
                if height > ylim:
                    ylim = height
    ax.set_ylim(min(0,
                    min(min_bar) - 10 * offset), ylim + noffset_ylim * offset)
    for j, i in enumerate(blank):
        ax.vlines(x=i[0],
                  ymin=i[1],
                  ymax=i[1] + offset * 2,
                  color='white',
                  lw=1.2 * linewidth)
        if j % 2 == 1:
            ax.hlines(y=i[1], xmin=blank[j - 1][0], xmax=blank[j][0], lw=linewidth)
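

# A hypothetical invocation of starplot (not from the source): synthetic data,
# adjacent-bar annotation, and Fligner-Killeen variance pre-testing. All names
# and values below are illustrative assumptions.
if __name__ == '__main__':
    rng = np.random.RandomState(0)
    # three groups of 12 observations over 3 variables
    groups = [rng.normal(loc, 1.0, size=(12, 3)) for loc in (0.0, 0.5, 1.5)]
    starplot(data=groups,
             index=['var1', 'var2', 'var3'],
             columns=['ctrl', 'low', 'high'],
             mode='adjacent',
             stats='independent t test',
             test_var=True,
             stats_var='fligner')
    plt.show()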
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--infile", required=True, help="Tabular file.")
    parser.add_argument("-o", "--outfile", required=True, help="Path to the output file.")
    parser.add_argument("--sample_one_cols", help="Input format, like smi, sdf, inchi")
    parser.add_argument("--sample_two_cols", help="Input format, like smi, sdf, inchi")
    parser.add_argument("--sample_cols", help="Input format, like smi, sdf, inchi,separate arrays using ;")
    parser.add_argument("--test_id", help="statistical test method")
    parser.add_argument(
        "--mwu_use_continuity",
        action="store_true",
        default=False,
        help="Whether a continuity correction (1/2.) should be taken into account.",
    )
    parser.add_argument(
        "--equal_var",
        action="store_true",
        default=False,
        help="If set perform a standard independent 2 sample test that assumes equal population variances. If not set, perform Welch's t-test, which does not assume equal population variance.",
    )
    parser.add_argument(
        "--reta", action="store_true", default=False, help="Whether or not to return the internally computed a values."
    )
    parser.add_argument("--fisher", action="store_true", default=False, help="if true then Fisher definition is used")
    parser.add_argument(
        "--bias",
        action="store_true",
        default=False,
        help="if false,then the calculations are corrected for statistical bias",
    )
    parser.add_argument("--inclusive1", action="store_true", default=False, help="if false,lower_limit will be ignored")
    parser.add_argument(
        "--inclusive2", action="store_true", default=False, help="if false,higher_limit will be ignored"
    )
    parser.add_argument("--inclusive", action="store_true", default=False, help="if false,limit will be ignored")
    parser.add_argument(
        "--printextras",
        action="store_true",
        default=False,
        help="If True, if there are extra points a warning is raised saying how many of those points there are",
    )
    parser.add_argument(
        "--initial_lexsort",
        action="store_true",
        default="False",
        help="Whether to use lexsort or quicksort as the sorting method for the initial sort of the inputs.",
    )
    parser.add_argument("--correction", action="store_true", default=False, help="continuity correction ")
    parser.add_argument(
        "--axis",
        type=int,
        default=0,
        help="Axis can equal None (ravel array first), or an integer (the axis over which to operate on a and b)",
    )
    parser.add_argument(
        "--n",
        type=int,
        default=0,
        help="the number of trials. This is ignored if x gives both the number of successes and failures",
    )
    parser.add_argument("--b", type=int, default=0, help="The number of bins to use for the histogram")
    parser.add_argument("--N", type=int, default=0, help="Score that is compared to the elements in a.")
    parser.add_argument("--ddof", type=int, default=0, help="Degrees of freedom correction")
    parser.add_argument("--score", type=int, default=0, help="Score that is compared to the elements in a.")
    parser.add_argument("--m", type=float, default=0.0, help="limits")
    parser.add_argument("--mf", type=float, default=2.0, help="lower limit")
    parser.add_argument("--nf", type=float, default=99.9, help="higher_limit")
    parser.add_argument(
        "--p",
        type=float,
        default=0.5,
        help="The hypothesized probability of success. 0 <= p <= 1. The default value is p = 0.5",
    )
    parser.add_argument("--alpha", type=float, default=0.9, help="probability")
    parser.add_argument("--new", type=float, default=0.0, help="Value to put in place of values in a outside of bounds")
    parser.add_argument(
        "--proportiontocut",
        type=float,
        default=0.0,
        help="Proportion (in range 0-1) of total data set to trim of each end.",
    )
    parser.add_argument(
        "--lambda_",
        type=float,
        default=1.0,
        help="lambda_ gives the power in the Cressie-Read power divergence statistic",
    )
    parser.add_argument(
        "--imbda",
        type=float,
        default=0,
        help="If lmbda is not None, do the transformation for that value.If lmbda is None, find the lambda that maximizes the log-likelihood function and return it as the second output argument.",
    )
    parser.add_argument("--base", type=float, default=1.6, help="The logarithmic base to use, defaults to e")
    parser.add_argument("--dtype", help="dtype")
    parser.add_argument("--med", help="med")
    parser.add_argument("--cdf", help="cdf")
    parser.add_argument("--zero_method", help="zero_method options")
    parser.add_argument("--dist", help="dist options")
    parser.add_argument("--ties", help="ties options")
    parser.add_argument("--alternative", help="alternative options")
    parser.add_argument("--mode", help="mode options")
    parser.add_argument("--method", help="method options")
    parser.add_argument("--md", help="md options")
    parser.add_argument("--center", help="center options")
    parser.add_argument("--kind", help="kind options")
    parser.add_argument("--tail", help="tail options")
    parser.add_argument("--interpolation", help="interpolation options")
    parser.add_argument("--statistic", help="statistic options")

    args = parser.parse_args()
    infile = args.infile
    outfile = open(args.outfile, "w+")
    test_id = args.test_id
    nf = args.nf
    mf = args.mf
    imbda = args.imbda
    inclusive1 = args.inclusive1
    inclusive2 = args.inclusive2
    sample0 = 0
    sample1 = 0
    sample2 = 0
    if args.sample_cols != None:
        sample0 = 1
        bartlett_samples = []
        for sample in args.sample_cols.split(";"):
            bartlett_samples.append(map(int, sample.split(",")))
    if args.sample_one_cols != None:
        sample1 = 1
        sample_one_cols = args.sample_one_cols.split(",")
    if args.sample_two_cols != None:
        sample_two_cols = args.sample_two_cols.split(",")
        sample2 = 1
    for line in open(infile):
        sample_one = []
        sample_two = []
        cols = line.strip().split("\t")
        if sample0 == 1:
            b_samples = columns_to_values(bartlett_samples, line)
        if sample1 == 1:
            for index in sample_one_cols:
                sample_one.append(cols[int(index) - 1])
        if sample2 == 1:
            for index in sample_two_cols:
                sample_two.append(cols[int(index) - 1])
        if test_id.strip() == "describe":
            size, min_max, mean, uv, bs, bk = stats.describe(map(float, sample_one))
            cols.append(size)
            cols.append(min_max)
            cols.append(mean)
            cols.append(uv)
            cols.append(bs)
            cols.append(bk)
        elif test_id.strip() == "mode":
            vals, counts = stats.mode(map(float, sample_one))
            cols.append(vals)
            cols.append(counts)
        elif test_id.strip() == "nanmean":
            m = stats.nanmean(map(float, sample_one))
            cols.append(m)
        elif test_id.strip() == "nanmedian":
            m = stats.nanmedian(map(float, sample_one))
            cols.append(m)
        elif test_id.strip() == "kurtosistest":
            z_value, p_value = stats.kurtosistest(map(float, sample_one))
            cols.append(z_value)
            cols.append(p_value)
        elif test_id.strip() == "variation":
            ra = stats.variation(map(float, sample_one))
            cols.append(ra)
        elif test_id.strip() == "itemfreq":
            freq = stats.itemfreq(map(float, sample_one))
            for list in freq:
                elements = ",".join(map(str, list))
                cols.append(elements)
        elif test_id.strip() == "nanmedian":
            m = stats.nanmedian(map(float, sample_one))
            cols.append(m)
        elif test_id.strip() == "variation":
            ra = stats.variation(map(float, sample_one))
            cols.append(ra)
        elif test_id.strip() == "boxcox_llf":
            IIf = stats.boxcox_llf(imbda, map(float, sample_one))
            cols.append(IIf)
        elif test_id.strip() == "tiecorrect":
            fa = stats.tiecorrect(map(float, sample_one))
            cols.append(fa)
        elif test_id.strip() == "rankdata":
            r = stats.rankdata(map(float, sample_one), method=args.md)
            cols.append(r)
        elif test_id.strip() == "nanstd":
            s = stats.nanstd(map(float, sample_one), bias=args.bias)
            cols.append(s)
        elif test_id.strip() == "anderson":
            A2, critical, sig = stats.anderson(map(float, sample_one), dist=args.dist)
            cols.append(A2)
            for list in critical:
                cols.append(list)
            cols.append(",")
            for list in sig:
                cols.append(list)
        elif test_id.strip() == "binom_test":
            p_value = stats.binom_test(map(float, sample_one), n=args.n, p=args.p)
            cols.append(p_value)
        elif test_id.strip() == "gmean":
            gm = stats.gmean(map(float, sample_one), dtype=args.dtype)
            cols.append(gm)
        elif test_id.strip() == "hmean":
            hm = stats.hmean(map(float, sample_one), dtype=args.dtype)
            cols.append(hm)
        elif test_id.strip() == "kurtosis":
            k = stats.kurtosis(map(float, sample_one), axis=args.axis, fisher=args.fisher, bias=args.bias)
            cols.append(k)
        elif test_id.strip() == "moment":
            n_moment = stats.moment(map(float, sample_one), n=args.n)
            cols.append(n_moment)
        elif test_id.strip() == "normaltest":
            k2, p_value = stats.normaltest(map(float, sample_one))
            cols.append(k2)
            cols.append(p_value)
        elif test_id.strip() == "skew":
            skewness = stats.skew(map(float, sample_one), bias=args.bias)
            cols.append(skewness)
        elif test_id.strip() == "skewtest":
            z_value, p_value = stats.skewtest(map(float, sample_one))
            cols.append(z_value)
            cols.append(p_value)
        elif test_id.strip() == "sem":
            s = stats.sem(map(float, sample_one), ddof=args.ddof)
            cols.append(s)
        elif test_id.strip() == "zscore":
            z = stats.zscore(map(float, sample_one), ddof=args.ddof)
            for list in z:
                cols.append(list)
        elif test_id.strip() == "signaltonoise":
            s2n = stats.signaltonoise(map(float, sample_one), ddof=args.ddof)
            cols.append(s2n)
        elif test_id.strip() == "percentileofscore":
            p = stats.percentileofscore(map(float, sample_one), score=args.score, kind=args.kind)
            cols.append(p)
        elif test_id.strip() == "bayes_mvs":
            c_mean, c_var, c_std = stats.bayes_mvs(map(float, sample_one), alpha=args.alpha)
            cols.append(c_mean)
            cols.append(c_var)
            cols.append(c_std)
        elif test_id.strip() == "sigmaclip":
            c, c_low, c_up = stats.sigmaclip(map(float, sample_one), low=args.m, high=args.n)
            cols.append(c)
            cols.append(c_low)
            cols.append(c_up)
        elif test_id.strip() == "kstest":
            d, p_value = stats.kstest(
                map(float, sample_one), cdf=args.cdf, N=args.N, alternative=args.alternative, mode=args.mode
            )
            cols.append(d)
            cols.append(p_value)
        elif test_id.strip() == "chi2_contingency":
            chi2, p, dof, ex = stats.chi2_contingency(
                map(float, sample_one), correction=args.correction, lambda_=args.lambda_
            )
            cols.append(chi2)
            cols.append(p)
            cols.append(dof)
            cols.append(ex)
        elif test_id.strip() == "tmean":
            if nf == 0 and mf == 0:
                mean = stats.tmean(map(float, sample_one))
            else:
                mean = stats.tmean(map(float, sample_one), (mf, nf), (inclusive1, inclusive2))
            cols.append(mean)
        elif test_id.strip() == "tmin":
            if mf == 0:
                min = stats.tmin(map(float, sample_one))
            else:
                min = stats.tmin(map(float, sample_one), lowerlimit=mf, inclusive=args.inclusive)
            cols.append(min)
        elif test_id.strip() == "tmax":
            if nf == 0:
                max = stats.tmax(map(float, sample_one))
            else:
                max = stats.tmax(map(float, sample_one), upperlimit=nf, inclusive=args.inclusive)
            cols.append(max)
        elif test_id.strip() == "tvar":
            if nf == 0 and mf == 0:
                var = stats.tvar(map(float, sample_one))
            else:
                var = stats.tvar(map(float, sample_one), (mf, nf), (inclusive1, inclusive2))
            cols.append(var)
        elif test_id.strip() == "tstd":
            if nf == 0 and mf == 0:
                std = stats.tstd(map(float, sample_one))
            else:
                std = stats.tstd(map(float, sample_one), (mf, nf), (inclusive1, inclusive2))
            cols.append(std)
        elif test_id.strip() == "tsem":
            if nf == 0 and mf == 0:
                s = stats.tsem(map(float, sample_one))
            else:
                s = stats.tsem(map(float, sample_one), (mf, nf), (inclusive1, inclusive2))
            cols.append(s)
        elif test_id.strip() == "scoreatpercentile":
            if nf == 0 and mf == 0:
                s = stats.scoreatpercentile(
                    map(float, sample_one), map(float, sample_two), interpolation_method=args.interpolation
                )
            else:
                s = stats.scoreatpercentile(
                    map(float, sample_one), map(float, sample_two), (mf, nf), interpolation_method=args.interpolation
                )
            for list in s:
                cols.append(list)
        elif test_id.strip() == "relfreq":
            if nf == 0 and mf == 0:
                rel, low_range, binsize, ex = stats.relfreq(map(float, sample_one), args.b)
            else:
                rel, low_range, binsize, ex = stats.relfreq(map(float, sample_one), args.b, (mf, nf))
            for list in rel:
                cols.append(list)
            cols.append(low_range)
            cols.append(binsize)
            cols.append(ex)
        elif test_id.strip() == "binned_statistic":
            if nf == 0 and mf == 0:
                st, b_edge, b_n = stats.binned_statistic(
                    map(float, sample_one), map(float, sample_two), statistic=args.statistic, bins=args.b
                )
            else:
                st, b_edge, b_n = stats.binned_statistic(
                    map(float, sample_one),
                    map(float, sample_two),
                    statistic=args.statistic,
                    bins=args.b,
                    range=(mf, nf),
                )
            cols.append(st)
            cols.append(b_edge)
            cols.append(b_n)
        elif test_id.strip() == "threshold":
            if nf == 0 and mf == 0:
                o = stats.threshold(map(float, sample_one), newval=args.new)
            else:
                o = stats.threshold(map(float, sample_one), mf, nf, newval=args.new)
            for list in o:
                cols.append(list)
        elif test_id.strip() == "trimboth":
            o = stats.trimboth(map(float, sample_one), proportiontocut=args.proportiontocut)
            for list in o:
                cols.append(list)
        elif test_id.strip() == "trim1":
            t1 = stats.trim1(map(float, sample_one), proportiontocut=args.proportiontocut, tail=args.tail)
            for list in t1:
                cols.append(list)
        elif test_id.strip() == "histogram":
            if nf == 0 and mf == 0:
                hi, low_range, binsize, ex = stats.histogram(map(float, sample_one), args.b)
            else:
                hi, low_range, binsize, ex = stats.histogram(map(float, sample_one), args.b, (mf, nf))
            cols.append(hi)
            cols.append(low_range)
            cols.append(binsize)
            cols.append(ex)
        elif test_id.strip() == "cumfreq":
            if nf == 0 and mf == 0:
                cum, low_range, binsize, ex = stats.cumfreq(map(float, sample_one), args.b)
            else:
                cum, low_range, binsize, ex = stats.cumfreq(map(float, sample_one), args.b, (mf, nf))
            cols.append(cum)
            cols.append(low_range)
            cols.append(binsize)
            cols.append(ex)
        elif test_id.strip() == "boxcox_normmax":
            if nf == 0 and mf == 0:
                ma = stats.boxcox_normmax(map(float, sample_one))
            else:
                ma = stats.boxcox_normmax(map(float, sample_one), (mf, nf), method=args.method)
            cols.append(ma)
        elif test_id.strip() == "boxcox":
            if imbda == 0:
                box, ma, ci = stats.boxcox(map(float, sample_one), alpha=args.alpha)
                cols.append(box)
                cols.append(ma)
                cols.append(ci)
            else:
                box = stats.boxcox(map(float, sample_one), imbda, alpha=args.alpha)
                cols.append(box)
        elif test_id.strip() == "histogram2":
            h2 = stats.histogram2(map(float, sample_one), map(float, sample_two))
            for list in h2:
                cols.append(list)
        elif test_id.strip() == "ranksums":
            z_statistic, p_value = stats.ranksums(map(float, sample_one), map(float, sample_two))
            cols.append(z_statistic)
            cols.append(p_value)
        elif test_id.strip() == "ttest_1samp":
            t, prob = stats.ttest_1samp(map(float, sample_one), map(float, sample_two))
            for list in t:
                cols.append(list)
            for list in prob:
                cols.append(list)
        elif test_id.strip() == "ansari":
            AB, p_value = stats.ansari(map(float, sample_one), map(float, sample_two))
            cols.append(AB)
            cols.append(p_value)
        elif test_id.strip() == "linregress":
            slope, intercept, r_value, p_value, stderr = stats.linregress(
                map(float, sample_one), map(float, sample_two)
            )
            cols.append(slope)
            cols.append(intercept)
            cols.append(r_value)
            cols.append(p_value)
            cols.append(stderr)
        elif test_id.strip() == "pearsonr":
            cor, p_value = stats.pearsonr(map(float, sample_one), map(float, sample_two))
            cols.append(cor)
            cols.append(p_value)
        elif test_id.strip() == "pointbiserialr":
            r, p_value = stats.pointbiserialr(map(float, sample_one), map(float, sample_two))
            cols.append(r)
            cols.append(p_value)
        elif test_id.strip() == "ks_2samp":
            d, p_value = stats.ks_2samp(map(float, sample_one), map(float, sample_two))
            cols.append(d)
            cols.append(p_value)
        elif test_id.strip() == "mannwhitneyu":
            mw_stats_u, p_value = stats.mannwhitneyu(
                map(float, sample_one), map(float, sample_two), use_continuity=args.mwu_use_continuity
            )
            cols.append(mw_stats_u)
            cols.append(p_value)
        elif test_id.strip() == "zmap":
            z = stats.zmap(map(float, sample_one), map(float, sample_two), ddof=args.ddof)
            for list in z:
                cols.append(list)
        elif test_id.strip() == "ttest_ind":
            mw_stats_u, p_value = stats.ttest_ind(
                map(float, sample_one), map(float, sample_two), equal_var=args.equal_var
            )
            cols.append(mw_stats_u)
            cols.append(p_value)
        elif test_id.strip() == "ttest_rel":
            t, prob = stats.ttest_rel(map(float, sample_one), map(float, sample_two), axis=args.axis)
            cols.append(t)
            cols.append(prob)
        elif test_id.strip() == "mood":
            z, p_value = stats.mood(map(float, sample_one), map(float, sample_two), axis=args.axis)
            cols.append(z)
            cols.append(p_value)
        elif test_id.strip() == "shapiro":
            W, p_value, a = stats.shapiro(map(float, sample_one), map(float, sample_two), args.reta)
            cols.append(W)
            cols.append(p_value)
            for list in a:
                cols.append(list)
        elif test_id.strip() == "kendalltau":
            k, p_value = stats.kendalltau(
                map(float, sample_one), map(float, sample_two), initial_lexsort=args.initial_lexsort
            )
            cols.append(k)
            cols.append(p_value)
        elif test_id.strip() == "entropy":
            s = stats.entropy(map(float, sample_one), map(float, sample_two), base=args.base)
            cols.append(s)
        elif test_id.strip() == "spearmanr":
            if sample2 == 1:
                rho, p_value = stats.spearmanr(map(float, sample_one), map(float, sample_two))
            else:
                rho, p_value = stats.spearmanr(map(float, sample_one))
            cols.append(rho)
            cols.append(p_value)
        elif test_id.strip() == "wilcoxon":
            if sample2 == 1:
                T, p_value = stats.wilcoxon(
                    map(float, sample_one),
                    map(float, sample_two),
                    zero_method=args.zero_method,
                    correction=args.correction,
                )
            else:
                T, p_value = stats.wilcoxon(
                    map(float, sample_one), zero_method=args.zero_method, correction=args.correction
                )
            cols.append(T)
            cols.append(p_value)
        elif test_id.strip() == "chisquare":
            if sample2 == 1:
                rho, p_value = stats.chisquare(map(float, sample_one), map(float, sample_two), ddof=args.ddof)
            else:
                rho, p_value = stats.chisquare(map(float, sample_one), ddof=args.ddof)
            cols.append(rho)
            cols.append(p_value)
        elif test_id.strip() == "power_divergence":
            if sample2 == 1:
                stat, p_value = stats.power_divergence(
                    map(float, sample_one), map(float, sample_two), ddof=args.ddof, lambda_=args.lambda_
                )
            else:
                stat, p_value = stats.power_divergence(map(float, sample_one), ddof=args.ddof, lambda_=args.lambda_)
            cols.append(stat)
            cols.append(p_value)
        elif test_id.strip() == "theilslopes":
            if sample2 == 1:
                mpe, met, lo, up = stats.theilslopes(map(float, sample_one), map(float, sample_two), alpha=args.alpha)
            else:
                mpe, met, lo, up = stats.theilslopes(map(float, sample_one), alpha=args.alpha)
            cols.append(mpe)
            cols.append(met)
            cols.append(lo)
            cols.append(up)
        elif test_id.strip() == "combine_pvalues":
            if sample2 == 1:
                stat, p_value = stats.combine_pvalues(
                    map(float, sample_one), method=args.med, weights=map(float, sample_two)
                )
            else:
                stat, p_value = stats.combine_pvalues(map(float, sample_one), method=args.med)
            cols.append(stat)
            cols.append(p_value)
        elif test_id.strip() == "obrientransform":
            ob = stats.obrientransform(*b_samples)
            for list in ob:
                elements = ",".join(map(str, list))
                cols.append(elements)
        elif test_id.strip() == "f_oneway":
            f_value, p_value = stats.f_oneway(*b_samples)
            cols.append(f_value)
            cols.append(p_value)
        elif test_id.strip() == "kruskal":
            h, p_value = stats.kruskal(*b_samples)
            cols.append(h)
            cols.append(p_value)
        elif test_id.strip() == "friedmanchisquare":
            fr, p_value = stats.friedmanchisquare(*b_samples)
            cols.append(fr)
            cols.append(p_value)
        elif test_id.strip() == "fligner":
            xsq, p_value = stats.fligner(center=args.center, proportiontocut=args.proportiontocut, *b_samples)
            cols.append(xsq)
            cols.append(p_value)
        elif test_id.strip() == "bartlett":
            T, p_value = stats.bartlett(*b_samples)
            cols.append(T)
            cols.append(p_value)
        elif test_id.strip() == "levene":
            w, p_value = stats.levene(center=args.center, proportiontocut=args.proportiontocut, *b_samples)
            cols.append(w)
            cols.append(p_value)
        elif test_id.strip() == "median_test":
            stat, p_value, m, table = stats.median_test(
                ties=args.ties, correction=args.correction, lambda_=args.lambda_, *b_samples
            )
            cols.append(stat)
            cols.append(p_value)
            cols.append(m)
            cols.append(table)
            for list in table:
                elements = ",".join(map(str, list))
                cols.append(elements)
        outfile.write("%s\n" % "\t".join(map(str, cols)))
    outfile.close()
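

# Hypothetical command line for the fligner branch above (the script and file
# names are illustrative assumptions, not from the source):
#
#   python stats_tool.py -i data.tab -o out.tab --test_id fligner \
#       --sample_cols "1,2,3;4,5,6" --center median --proportiontocut 0.05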
Esempio n. 24
0
import numpy as np
import pandas as pd
from scipy import stats  # imports inferred from the calls below


def fligner_test(ts):
    values = pd.Series(index=np.arange(10, 60, 10))
    for sample_size in values.index:
        values[sample_size] = stats.fligner(*chunks(ts, sample_size)).pvalue
    return values.mean()
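

# `chunks` is not defined in this excerpt. A plausible helper, assuming the
# series is split into consecutive, equally sized slices (an assumption, not
# the source's definition):
def chunks(ts, size):
    values = np.asarray(ts)
    n_full = len(values) // size  # drop any incomplete trailing slice
    return [values[i * size:(i + 1) * size] for i in range(n_full)]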
Esempio n. 25
0
# For each column of data we apply the Anderson-Darling test
for column in medium_quality_df:
    print(column)
    print(anderson(medium_quality_df[column], dist="norm"))

# For each column of data we apply the Anderson-Darling test
for column in high_quality_df:
    print(column)
    print(anderson(high_quality_df[column], dist="norm"))

# Apply the Fligner test
for column in high_quality_df:
    print(column)
    print(
        fligner(low_quality_df[column], medium_quality_df[column],
                high_quality_df[column]))

# CORRELATION
# Correlations between variables
plt.figure(figsize=(10, 6)).subplots_adjust(bottom=0.25)
sns.heatmap(df.corr(), annot=True, fmt='.0%')
plt.show()

# Calculate and order correlations
plt.figure(figsize=(10, 6)).subplots_adjust(bottom=0.25)
df.corr()['quality'].sort_values(ascending=False).plot(kind='bar')
plt.show()

# Matrix correlation between all variables
plt.figure(figsize=(10, 6))
sns.stripplot(data=df, x="quality", y="alcohol", jitter=True)
Esempio n. 26
0
# f, axes = plt.subplots(1, 3, figsize=(100, 100))
# axes[0].hist(A, bins = 10)
# axes[1].hist(B, bins = 10)
# axes[2].hist(C, bins = 5)
# plt.axis("equal")
# plt.show()
#
# stat_a, p_a = stats.shapiro(A)
# stat_b, p_b = stats.shapiro(B)
# stat_c, p_c = stats.shapiro(C)
#
# print("p-vlaues for shapiro-wilk test are\nA: %f\nB: %f\nC: %f"%(p_a,p_b,p_c))
#
print("p-value for homogeneity of variance test is"
      ": %f"%(stats.fligner(B,C)[1]))
#
# k = []
# # k.append((np.std(A)**2)/np.mean(A)**2)
# k.append(((np.std(A))**2)/np.mean(A))
# # k.append(((np.std(B))**2)/(np.mean(B))**2)
# k.append(((np.std(A))**2)/np.mean(B))
# # k.append(((np.std(C))**2)/(np.mean(C))**2)
# k.append(((np.std(A))**2)/np.mean(C))
# print(k[:])
#
# _, p_anova = stats.f_oneway(A, B, C)
# print(p_anova)
#
# print("p-value for kruskal-wallis test between B, C: %f"%stats.kruskal(B,C)[1])
Esempio n. 27
0
# Check normality
print('Normality satisfied:',
      stats.shapiro(data.score)[1])  # 0.2986918091773987 > 0.05, so normality holds

# Check homogeneity of variance
result = data[['method', 'score']]
m1 = result[result['method'] == 1]
m2 = result[result['method'] == 2]
m3 = result[result['method'] == 3]

score1 = m1['score']
score2 = m2['score']
score3 = m3['score']

print('Equal variance check:', stats.levene(score1, score2, score3).pvalue)  # parametric (robust) test
print('Equal variance check:', stats.fligner(score1, score2, score3).pvalue)  # non-parametric test
print('Equal variance check:', stats.bartlett(score1, score2, score3).pvalue)  # parametric test (assumes normality)

# Equal variance check: 0.11322850654055751 > 0.05, so equal variances hold and ANOVA can be used; otherwise use Welch's ANOVA
# Equal variance check: 0.10847180733221087
# Equal variance check: 0.15251432724222921
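
# The variances are homogeneous (p > 0.05), so a standard one-way ANOVA applies
# here; a minimal sketch using the score groups defined above:
f_stat, p_anova = stats.f_oneway(score1, score2, score3)
print('one-way ANOVA p-value:', p_anova)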

print('\n ANOVA on the prepared data --------')
# Crosstab: counts per teaching method
data2 = pd.crosstab(index=data['method'], columns='count')
data2.index = ['method1', 'method2', 'method3']
print(data2)

# Crosstab: satisfaction (survey) by teaching method
data3 = pd.crosstab(data['method'], data['survey'])
data3.index = ['method1', 'method2', 'method3']
def plotExpBox_Main(inputFiles,headers,valcols,outputFile,sep,startRow,showIndPoints,mark,markMean,showMean,notch,whisker,outliers,plotPvalueCluster,outputClusterPrefix,methodCluster,xlegendrotation,xlabe,ylabe,figsz,titl,showSampleSizes,trimToMinSize,relabels,logb,plotHistogramToFile,plotMedianForGroups,botta,showViolin,showBox,firstColAnnot,plotTrend,showLegend,makePzfxFile,makeBinMatrix,writeDataSummaryStat,summaryStatRange,minuslog10pvalue,minNDataToKeep,vfacecolor,valpha,outXYZPvalues,dividePlots):

	#if plotPvalueCluster:
		#if pvalue cluster is needed:
	#	from Bio.Cluster.cluster import *
	#	from Bio.Cluster import *
		#endif


	
	#the real deal!
	plotData=[]	
	xtickLabels=[]
	
	trendData={}
	annot={}
	
	minSize=-1

	for inputFile,header,cols in zip(inputFiles,headers,valcols):
		fin=generic_istream(inputFile)
		
		startIdx=len(plotData)
		
		if firstColAnnot:
			colAnnot=cols[0]
			cols=cols[1:]
			annotThisFile=[]
			annot[startIdx]=annotThisFile
		else:
			colAnnot=-1
			annotThisFile=None
			
		for col in cols:
			plotData.append([])
			xtickLabels.append(header[col])

		colIndices=range(startIdx,startIdx+len(cols))
		
		if plotTrend:
			#print >> stderr,"plotTrend"
			trendDataThisFile=[]
			trendData[startIdx]=trendDataThisFile
		else:
			trendDataThisFile=None
			
			
		lino=0
		for lin in fin:
			lino+=1
			if lino<startRow:
				continue		
			fields=lin.rstrip("\r\n").split(sep)
			
			if plotTrend:
				#print >> stderr,"a"
				trendDataThisLine=[]
			else:
				trendDataThisLine=None
			
			allDataOKThisLine=True
			
			if colAnnot>=0:
				annotThisFile.append(fields[colAnnot])
			
			for idx,col in zip(colIndices,cols):
				try:
					value=float(fields[col])
					if logb!=0:
						if value==0.0:
							raise ValueError
						value=log(value)/logb							
					plotData[idx].append(value)
					
					if plotTrend:
						trendDataThisLine.append(value)
						#print >> stderr,"value:",value
					
				except:
					allDataOKThisLine=False	
				
			if plotTrend:
				if allDataOKThisLine:
					trendDataThisFile.append(trendDataThisLine)
				else:
					trendDataThisFile.append(None)
			
		fin.close()
	
		
		if minSize==-1:
			minSize=len(plotData[idx]) #or startIDX?
		else:
			minSize=min([minSize,len(plotData[idx])])
		

	if trimToMinSize:
		print >> stderr,"trimming to min size =",minSize
		trimData(plotData,minSize)

	if len(relabels)>0:
		#if len(relabels)!=len(xtickLabels):
		#	print >> stderr,"relabels doesn't have the same length as original label vectors",xtickLabels,"=>",relabels
		#	exit()
		print >> stderr,xtickLabels
		print >> stderr,relabels
		for i,relabel in zip(range(0,len(relabels)),relabels):
			xtickLabels[i]=relabel
		
	
	for i in range(0,len(plotMedianForGroups)):
		plotMedianForGroups[i]=getCol0ListFromCol1ListStringAdv(xtickLabels,plotMedianForGroups[i])
			
	
	#drawing medians:
	medianToDraw=[]
	for mediangrouper in plotMedianForGroups:
		curD=[]		
		for c in mediangrouper:
			curD.extend(plotData[c])
		medianToDraw.append(median(curD))


	for c in range(len(plotData)-1,-1,-1):
		if len(plotData[c])<minNDataToKeep:
			print >> stderr,xtickLabels[c],"discarded because has only",len(plotData[c]),"data points <",minNDataToKeep
			del plotData[c]
			del xtickLabels[c]

	if not skipStat:
		print >> stdout,"student t-test (1 sample; mean=0)"
		print >> stdout,"sample","mean","p-val","median"
	
		if writeDataSummaryStat:
			fDSS=open(writeDataSummaryStat,"w")
			print >> fDSS,"sample\tmean\tvar\tsd\tmin\tmax\tN\tNInRange["+str(summaryStatRange[0])+","+str(summaryStatRange[1])+"]\t%NInRange\tNbelowRange\t%Nbelow\tNAboveRange\t%NAbove"
			
		for x in range(0,len(plotData)):
			#print >> stderr, len(plotData[x])
			try:
				print >> stdout, xtickLabels[x],mean(plotData[x]),ttest_1samp(plotData[x],0)[1],median(plotData[x])
			except:
				print >> stdout, xtickLabels[x],mean(plotData[x]),"NA",median(plotData[x])
			
			if writeDataSummaryStat:
				sumData,N,NIN,NBelow,NAbove=filterDataInRangeInclusive(plotData[x],summaryStatRange[0],summaryStatRange[1])
				
				if NIN>1:
					#print >> stderr,"sumData=",sumData
					#print >> stderr,mean
					mea=mean2(sumData)
					DDOF=1
					sd=std(sumData,ddof=DDOF)
					var=sd*sd
					mi=min(sumData)
					ma=max(sumData)
				else:
					mea="NA"
					sd="NA"
					var="NA"
					mi="NA"
					ma="NA"
				
			
					
				print >> fDSS,xtickLabels[x]+"\t"+str(mea)+"\t"+str(var)+"\t"+str(sd)+"\t"+str(mi)+"\t"+str(ma)+"\t"+str(N)+"\t"+str(NIN)+"\t"+str(float(NIN)*100/N)+"\t"+str(NBelow)+"\t"+str(float(NBelow)*100/N)+"\t"+str(NAbove)+"\t"+str(float(NAbove)*100/N)
			
	
		pvalueM=[]
		
		if writeDataSummaryStat:
			fDSS.close()
		
		print >> stdout,""
		
		print >> stdout,"student t-test (2 samples)"
		print >> stdout,"p-val",
		for x in range(0,len(plotData)):
			print >> stdout,xtickLabels[x],
		
		print >> stdout,""
	
		for x in range(0,len(plotData)):
			pvalueRow=[]
			pvalueM.append(pvalueRow)
			print >> stdout, xtickLabels[x],
			for y in range(0,len(plotData)):
				if y<=x:
					print >> stdout, "",
					if x==y:
						if minuslog10pvalue:
							pvalueRow.append(0.0)
						else:
							pvalueRow.append(1.0)
					else:
						pvalueRow.append(pvalueM[y][x])
				else:
					try:
						pvalue=ttest_ind(plotData[x],plotData[y])[1]
					except:
						pvalue=1.0
					
					if minuslog10pvalue and str(pvalue)!="NA":
						try:
							pvalue=-1*log(pvalue,10)
						except:
							pvalue=-1000.0
					
					print >> stdout, str(pvalue),
					pvalueRow.append(pvalue)
			print >> stdout,"";	
	
		
		print >> stdout,""
	
		
	
	
		if plotPvalueCluster:
			makePValueRawPlot(outputClusterPrefix+"_t_raw",xtickLabels,pvalueM)
			makePValueClusterPlot(outputClusterPrefix+"_t",xtickLabels,pvalueM,methodCluster)
	
	
			
		pvalueM=[]
	
		print >> stdout,"welch t-test"
		print >> stdout,"p-val",
		for x in range(0,len(plotData)):
			print >> stdout,xtickLabels[x],
		
		print >> stdout,""
		for x in range(0,len(plotData)):
			pvalueRow=[]
			pvalueM.append(pvalueRow)
			print >> stdout, xtickLabels[x],
			for y in range(0,len(plotData)):
				if y<=x:
					print >> stdout, "",
					if x==y:
						if minuslog10pvalue:
							pvalueRow.append(0.0)
						else:
							pvalueRow.append(1.0)
					else:
						pvalueRow.append(pvalueM[y][x])
						
				else:
					try:
						pvalue=welchs_approximate_ttest_arr(plotData[x],plotData[y])[3]
					except:
						pvalue=1.0
	
					if minuslog10pvalue and str(pvalue)!="NA":
						try:
							pvalue=-1*log(pvalue,10)
						except:
							pvalue=-1000.0
	
					
					print >> stdout, str(pvalue),
					pvalueRow.append(pvalue)
			print >> stdout,"";
	
		if outXYZPvalues:
			writeXYZPvalues(outXYZPvalues+"_Welch.xyz",xtickLabels,pvalueM)
	
		if plotPvalueCluster:
			makePValueRawPlot(outputClusterPrefix+"_Welch_raw",xtickLabels,pvalueM)
			makePValueClusterPlot(outputClusterPrefix+"_Welch",xtickLabels,pvalueM,methodCluster)
	
		
		print >> stdout,""
		print >> stdout,"non-parametric (Mann-Whitney U)" #"non-parametric (Mann-Whitney U if larger n<=20 else Wilcoxon)"
		print >> stdout,"p-val",
		for x in range(0,len(plotData)):
			print >> stdout,xtickLabels[x],
		
	
		pvalueM=[]
	
		print >> stdout,""
		for x in range(0,len(plotData)):
			pvalueRow=[]
			pvalueM.append(pvalueRow)
			print >> stdout, xtickLabels[x],
			for y in range(0,len(plotData)):
				if y<=x:
					print >> stdout, "",
					if x==y:
						if minuslog10pvalue:
							pvalueRow.append(0.0)
						else:
							pvalueRow.append(1.0)
					else:
						pvalueRow.append(pvalueM[y][x])
				else:
					#if max(len(plotData[x]),len(plotData[y]))<=20:
					try:
						pvalue=mannwhitneyu(plotData[x],plotData[y])[1]*2				
					except:
						pvalue=1.0
	
					if minuslog10pvalue and str(pvalue)!="NA":
						try:
							pvalue=-1*log(pvalue,10)
						except:
							pvalue=-1000.0
	
	
					print >> stdout,pvalue, # older SciPy mannwhitneyu returns a one-tailed p-value, hence the factor of 2 above
					pvalueRow.append(pvalue)
					#else:
					#	print >> stdout, wilcoxon(plotData[x],plotData[y])[1], # wilcoxon is already two-tailed
			print >> stdout,"";	
	
		if outXYZPvalues:
			writeXYZPvalues(outXYZPvalues+"_U.xyz",xtickLabels,pvalueM)
		
	
		if plotPvalueCluster:
			makePValueRawPlot(outputClusterPrefix+"_U_raw",xtickLabels,pvalueM)
			makePValueClusterPlot(outputClusterPrefix+"_U",xtickLabels,pvalueM,methodCluster)
		
		#####now the variance tests
		
		print >> stdout,""
		print >> stdout,"Ansari-Bradley Two-sample Test for difference in scale parameters " 
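		# note: the Ansari-Bradley test assumes the two samples share a common
		# median; if they differ, subtract each sample's median before testing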
		print >> stdout,"p-val",
		
		
		for x in range(0,len(plotData)):
			print >> stdout,xtickLabels[x],
		
	
		pvalueM=[]
	
		print >> stdout,""
		for x in range(0,len(plotData)):
			pvalueRow=[]
			pvalueM.append(pvalueRow)
			print >> stdout, xtickLabels[x],
			for y in range(0,len(plotData)):
				if y<=x:
					print >> stdout, "",
					if x==y:
						if minuslog10pvalue:
							pvalueRow.append(0.0)
						else:
							pvalueRow.append(1.0)
					else:
						pvalueRow.append(pvalueM[y][x])
				else:
					#if max(len(plotData[x]),len(plotData[y]))<=20:
					try:
						pvalue=ansari(plotData[x],plotData[y])[1]		
					except:
						pvalue="NA"
	
					if minuslog10pvalue and str(pvalue)!="NA":
						try:
							pvalue=-1*log(pvalue,10)
						except:
							pvalue=-1000.0
	
	
						#pvalue=1.0
					print >> stdout,pvalue,
					pvalueRow.append(pvalue)
					#else:
					#	print >> stdout, wilcoxon(plotData[x],plotData[y])[1], # wilcoxon is already two-tailed
			print >> stdout,"";	
		
		if plotPvalueCluster:
			makePValueRawPlot(outputClusterPrefix+"_Ansari_raw",xtickLabels,pvalueM)
			makePValueClusterPlot(outputClusterPrefix+"_Ansari",xtickLabels,pvalueM,methodCluster)	
		
		
		#####
	
		#####now the variance tests
		
		print >> stdout,""
		print >> stdout,"Fligner's Two-sample Test for equal variance (non-parametric)"
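		# Fligner-Killeen (scipy.stats.fligner, default center='median') is
		# robust to departures from normality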
		print >> stdout,"p-val",
		
		
		for x in range(0,len(plotData)):
			print >> stdout,xtickLabels[x],
		
	
		pvalueM=[]
	
		print >> stdout,""
		for x in range(0,len(plotData)):
			pvalueRow=[]
			pvalueM.append(pvalueRow)
			print >> stdout, xtickLabels[x],
			for y in range(0,len(plotData)):
				if y<=x:
					print >> stdout, "",
					if x==y:
						if minuslog10pvalue:
							pvalueRow.append(0.0)
						else:
							pvalueRow.append(1.0)
					else:
						pvalueRow.append(pvalueM[y][x])
				else:
					#if max(len(plotData[x]),len(plotData[y]))<=20:
					try:
						pvalue=fligner(plotData[x],plotData[y])[1]		
					except:
						pvalue="NA"
						#pvalue=1.0
						
					if minuslog10pvalue and str(pvalue)!="NA":
						try:
							pvalue=-1*log(pvalue,10)
						except:
							pvalue=-1000.0
	
	
					print >> stdout,pvalue,
					pvalueRow.append(pvalue)
					#else:
					#	print >> stdout, wilcoxon(plotData[x],plotData[y])[1], # wilcoxon is already two-tailed
			print >> stdout,"";	
		
		if plotPvalueCluster:
			makePValueRawPlot(outputClusterPrefix+"_fligner_raw",xtickLabels,pvalueM)
			makePValueClusterPlot(outputClusterPrefix+"_fligner",xtickLabels,pvalueM,methodCluster)	
		
		
		#####
	
		#####now the variance tests
		
		print >> stdout,""
		print >> stdout,"Levene's Two-sample Test for equal variance" 
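		# SciPy's levene defaults to center='median' (the Brown-Forsythe
		# variant); pass center='mean' for the classical Levene test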
		print >> stdout,"p-val",
		
		
		for x in range(0,len(plotData)):
			print >> stdout,xtickLabels[x],
		
	
		pvalueM=[]
	
		print >> stdout,""
		for x in range(0,len(plotData)):
			pvalueRow=[]
			pvalueM.append(pvalueRow)
			print >> stdout, xtickLabels[x],
			for y in range(0,len(plotData)):
				if y<=x:
					print >> stdout, "",
					if x==y:
						if minuslog10pvalue:
							pvalueRow.append(0.0)
						else:
							pvalueRow.append(1.0)
					else:
						pvalueRow.append(pvalueM[y][x])
				else:
					#if max(len(plotData[x]),len(plotData[y]))<=20:
					try:
						pvalue=levene(plotData[x],plotData[y])[1]		
					except:
						pvalue="NA"
						#pvalue=1.0
						
					if minuslog10pvalue and str(pvalue)!="NA":
						try:
							pvalue=-1*log(pvalue,10)
						except:
							pvalue=-1000.0
	
	
					print >> stdout,pvalue,
					pvalueRow.append(pvalue)
					#else:
					#	print >> stdout, wilcoxon(plotData[x],plotData[y])[1], # wilcoxon is already two-tailed
			print >> stdout,"";	
		
		if plotPvalueCluster:
			makePValueRawPlot(outputClusterPrefix+"_levene_raw",xtickLabels,pvalueM)
			makePValueClusterPlot(outputClusterPrefix+"_levene",xtickLabels,pvalueM,methodCluster)	
		
		
		#####
	
		#####now the variance tests
		
		print >> stdout,""
		print >> stdout,"Bartlett's Two-sample Test for equal variance (for normal distributions)" 
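		# Bartlett's test assumes normality and is sensitive to departures
		# from it; prefer Levene or Fligner-Killeen for non-normal data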
		print >> stdout,"p-val",
		
		
		for x in range(0,len(plotData)):
			print >> stdout,xtickLabels[x],
		
	
		pvalueM=[]
	
		print >> stdout,""
		for x in range(0,len(plotData)):
			pvalueRow=[]
			pvalueM.append(pvalueRow)
			print >> stdout, xtickLabels[x],
			for y in range(0,len(plotData)):
				if y<=x:
					print >> stdout, "",
					if x==y:
						if minuslog10pvalue:
							pvalueRow.append(0.0)
						else:
							pvalueRow.append(1.0)
					else:
						pvalueRow.append(pvalueM[y][x])
				else:
					#if max(len(plotData[x]),len(plotData[y]))<=20:
					try:
						pvalue=bartlett(plotData[x],plotData[y])[1]		
					except:
						pvalue="NA"
						#pvalue=1.0
	
					if minuslog10pvalue and str(pvalue)!="NA":
						try:
							pvalue=-1*log(pvalue,10)
						except:
							pvalue=-1000.0
	
	
					print >> stdout,pvalue,
					pvalueRow.append(pvalue)
					#else:
					#	print >> stdout, wilcoxon(plotData[x],plotData[y])[1], # wilcoxon is already two-tailed
			print >> stdout,"";	
		
		if plotPvalueCluster:
			makePValueRawPlot(outputClusterPrefix+"_bartlett_raw",xtickLabels,pvalueM)
			makePValueClusterPlot(outputClusterPrefix+"_bartlett",xtickLabels,pvalueM,methodCluster)	
		
		
		#####

	figure(figsize=figsz)
	subplots_adjust(top=0.9, bottom=botta, left=0.2, right=0.8)
	
	if len(titl)==0:
		titl=outputFile


	plotExpBox(plotData,xtickLabels,showIndPoints,mark,markMean,showMean,notch,whisker,outliers,xlegendrotation,xlabe,ylabe,titl,showSampleSizes,showViolin,showBox,annot,trendData,showLegend,makePzfxFile,makeBinMatrix,dividePlots)
	
	#ylim([0,200])
	for m in medianToDraw:
		axhline(y=m,linestyle=':',color='gray')

	savefig(outputFile,bbox_inches="tight")

	if len(plotHistogramToFile)>0:
		drawHistogram(plotHistogramToFile,plotData,xtickLabels)
		drawDensigram(plotHistogramToFile+".density.png",plotData,xtickLabels)
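Each matrix block above repeats the same upper-triangle pattern: compute each two-sample p-value once, mirror it across the diagonal, and fall back to a placeholder when the test fails. A compact Python 3 / modern-SciPy sketch of that pattern (the helper name pairwise_pvalues is ours, not from the source):

from itertools import combinations

import numpy as np
from scipy.stats import fligner, levene

def pairwise_pvalues(groups, test=fligner):
    # Symmetric matrix of two-sample p-values; the diagonal is 1.0.
    n = len(groups)
    pmat = np.ones((n, n))
    for i, j in combinations(range(n), 2):
        try:
            pmat[i, j] = pmat[j, i] = test(groups[i], groups[j])[1]
        except ValueError:  # e.g. a group with too few data points
            pmat[i, j] = pmat[j, i] = np.nan
    return pmat

# groups = [np.random.randn(30), 2 * np.random.randn(25), np.random.randn(40) + 1]
# print(pairwise_pvalues(groups, test=levene))

Note also that the doubling applied to mannwhitneyu above reflects older SciPy, whose default was a one-sided p-value; newer releases accept alternative='two-sided' directly.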
Esempio n. 29
0
        #print(normParams)
        normQual = stats.kstest( newdata, stats.norm( loc=normParams[0], scale=normParams[1] ).cdf ).statistic
        normScores.append( normQual)


    elif( r == 1 ):
        v += 1

        (_, _, MFEscores) = convertLineFields(data)

        newdata = pd.DataFrame(MFEscores)
        testData = pd.concat([testData, newdata])  # DataFrame.append was removed in pandas 2.0

# Test for equal variances across all training samples (each row is one sample)
assert len(trainData.values[0]) == 50
print(stats.fligner( *trainData.values ))

fig, ax = plt.subplots()
quants = np.arange(0.0, 1.0, 0.002)

dfgs = pd.DataFrame(gammaScores)
qdfgs = dfgs.quantile(q = quants)

dfgs2 = pd.DataFrame(gammaScores2)
qdfgs2 = dfgs2.quantile(q = quants)


dfns = pd.DataFrame(normScores) 
qdfns = dfns.quantile(q = quants)

#dflo = dfgs / dfns
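stats.fligner is a k-sample test: each positional argument is one sample, which is why the 2-D array above can be star-unpacked row by row. A minimal sketch with made-up data:

import numpy as np
from scipy.stats import fligner

rows = np.random.randn(10, 50)   # 10 samples of 50 observations each
stat, pval = fligner(*rows)      # same as fligner(rows[0], rows[1], ...)
print(stat, pval)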
Esempio n. 30
0
 def test_empty_arg(self):
     x = np.arange(5)
     assert_equal((np.nan, np.nan), stats.fligner(x, x**2, []))
Esempio n. 31
0
x = df[df['COR'] == 1]['PRICE']
y = df[df['COR'] == 0]['PRICE']
x.name, y.name = 'corner', 'not corner'
two_histograms(x, y)
res = stats.mannwhitneyu(x, y)
print('p-value: ', res[1])

df = pd.read_csv("./Shad_Python_06_2/agedeath.dat.txt",
                 sep=r'\s+',
                 header=None,
                 names=['group', 'age', 'index'])
print(df.head())
x = df[df['group'] == 'sovr']['age']
y = df[df['group'] == 'aris']['age']
two_histograms(x, y)
res = stats.fligner(x, y)
print('p-value: ', res[1])
res = stats.ttest_ind(x, y, equal_var=False)
print('p-value: ', res[1])

df = pd.read_csv("./Shad_Python_06_2/interference.csv")
print(df.head())
x = df['DiffCol']
y = df['Black']
x.name, y.name = 'DiffCol', 'Black'
two_histograms(x, y)
res = stats.fligner(x, y)
print('p-value: ', res[1])
res = stats.ttest_ind(x, y, equal_var=False)
print('p-value: ', res[1])
Esempio n. 32
0
 def get_statistic_and_pvalue(self, y):
     return fligner(*y, center="mean")
Esempio n. 33
0
 def pval(grp):
     if grp.size < min_count:
         return np.nan
     return fligner(grp.values, y.values)[1]
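The pval helper above closes over min_count and y from an enclosing scope that is not shown. A hypothetical usage with a pandas groupby (the data and names below are assumptions, not from the source):

import numpy as np
import pandas as pd
from scipy.stats import fligner

y = pd.Series(np.random.randn(200))   # reference sample (assumed)
min_count = 10                        # smallest group worth testing (assumed)
df = pd.DataFrame({'g': np.random.choice(list('abc'), 500),
                   'v': np.random.randn(500)})

def pval(grp):
    if grp.size < min_count:
        return np.nan
    return fligner(grp.values, y.values)[1]

print(df.groupby('g')['v'].apply(pval))  # one Fligner p-value per group vs. y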
Esempio n. 34
0
sns.distplot(other_prices, fit = norm);
fig = plt.figure()
res = stats.probplot(other_prices, plot = plt)

# Shapiro-Wilk test
stat, p = shapiro(other_prices)
print(f"Stat: {round(stat,3)}")
print(f"p-value: {round(p,3)}")

"""Based on these results, we cannot guarantee normality in the distribution of the price samples, either for Apple or for the other brands, since the Shapiro-Wilk test rejects the null hypothesis of normality. Moreover, for the Apple subset the central limit theorem cannot be invoked either, as there are fewer than 30 samples.

Therefore, to check whether we can assume homoscedasticity (equal variances across samples), we apply the non-parametric Fligner-Killeen test, since normality cannot be assumed:
"""

# Fligner-Killeen test
fligner_test = stats.fligner(apple_prices, other_prices, center='median')
fligner_test

"""Given these results (p-value >> 0.05), we cannot reject the null hypothesis, so homoscedasticity is confirmed. However, since we could not establish that the samples follow normal distributions, a parametric test (Student's t) is not applicable; we must use a non-parametric one (Mann-Whitney) instead:"""

# Mann-Whitney test
mannwhitneyu_test = stats.mannwhitneyu(apple_prices, other_prices, alternative="greater")
mannwhitneyu_test

"""Based on the Mann-Whitney test results (p-value << 0.05), we can reject the null hypothesis in favor of the alternative, which in this case states that Apple products are, on average, more expensive than those of the other brands.

## **Numeric variable analysis**

We first build a correlation matrix, which indicates which variables are most
strongly related to price:
"""
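The correlation-matrix code that this text announces is not part of the excerpt; a sketch of a typical version (the DataFrame and its columns here are made up, not from the source):

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# toy stand-in for the notebook's DataFrame (assumed columns)
df = pd.DataFrame(np.random.rand(100, 4),
                  columns=['price', 'ram_gb', 'weight_kg', 'screen_in'])

corr = df.corr()                                # pairwise Pearson correlations
sns.heatmap(corr, annot=True, cmap='coolwarm')  # highlights what tracks price
plt.show()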
Esempio n. 36
0
print("ansari")
data['ansari'] = [
    ansari(x, y)[0] for (x, y) in zip(np.nan_to_num(question1_vectors),
                                      np.nan_to_num(question2_vectors))
]

#==============================================================================
# print("mannwhitneyu")
# data['mannwhitneyu'] = [mannwhitneyu(x, y)[0] for (x, y) in zip(np.nan_to_num(question1_vectors),
#                                                           np.nan_to_num(question2_vectors))]
#==============================================================================

print("fligner")
data['fligner'] = [
    fligner(x, y)[0] for (x, y) in zip(np.nan_to_num(question1_vectors),
                                       np.nan_to_num(question2_vectors))
]

print("mood")
data['mood'] = [
    mood(x, y)[0] for (x, y) in zip(np.nan_to_num(question1_vectors),
                                    np.nan_to_num(question2_vectors))
]

print("ks_2samp")
data['ks_2samp'] = [
    ks_2samp(x, y)[0] for (x, y) in zip(np.nan_to_num(question1_vectors),
                                        np.nan_to_num(question2_vectors))
]
Esempio n. 37
0
 def test_data(self):
     # numbers from R: fligner.test in package stats
     x1 = np.arange(5)
     assert_array_almost_equal(stats.fligner(x1, x1**2),
                               (3.2282229927203536, 0.072379187848207877),
                               11)
Esempio n. 38
0
# Visualize the distributions to check normality
import matplotlib.pyplot as plt
import seaborn as sns
sns.distplot(sco1, kde=False, fit=stats.norm)
sns.distplot(sco2, kde=False, fit=stats.norm)
plt.show()

# Normality check - shapiro: p > 0.05 means the data can be treated as normal.
print(stats.shapiro(sco1))  # (0.965552806854248, 0.3679903745651245)    0.3679 > 0.05: normally distributed
print(stats.shapiro(sco2))  # (0.9621098637580872, 0.6714189648628235)    0.6714 > 0.05: normally distributed

# Equal variances: p > 0.05 means the variances can be treated as equal.
print(stats.levene(sco1, sco2))  # the most common equal-variance test; 0.4568 > 0.05, so variances are equal
# print(stats.levene(sco1, sco2).pvalue)
print(stats.fligner(sco1, sco2))
print(stats.bartlett(sco1, sco2))

print(stats.ttest_ind(sco1, sco2))
print(stats.ttest_ind(sco1, sco2, equal_var=True))      # equal_var=True: equal variances assumed (the default).
# Ttest_indResult(statistic=-0.19649386929539883, pvalue=0.8450532207209545)
# Interpretation: p-value (0.8450) > 0.05, so the null hypothesis is retained: the two teaching methods show no difference in mean test scores.

print(stats.ttest_ind(sco1, sco2, equal_var=False))  # equal_var=False: equal variances not assumed.

# If normality is not satisfied:
# stats.wilcoxon()    # use this, but both samples must have the same length.
# stats.kruskal()
# stats.mannwhitneyu()
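A small sketch (with made-up data) of the fallbacks listed above:

import numpy as np
from scipy import stats

a = np.random.randn(30)
b = np.random.randn(30) + 0.2

print(stats.wilcoxon(a, b))      # paired test; a and b must have equal length
print(stats.mannwhitneyu(a, b))  # independent samples; lengths may differ
print(stats.kruskal(a, b))       # k independent samples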