def test_weightstats_ddof_tests(self):
        """Check that one-sample inference does not depend on the user ddof.

        Three ``DescrStatsW`` instances are built from the same data and
        weights with ``ddof`` 0, 1 and 2. ``ttest_mean`` (default null value
        and 0.5) and ``tconfint_mean`` must agree with the ``ddof=0`` results
        to 14 decimals.
        """
        data = self.x1_2d
        weights = self.w1

        # Same data and weights everywhere; only ddof differs.
        by_ddof = [DescrStatsW(data, weights=weights, ddof=k)
                   for k in (0, 1, 2)]

        # t-test: first with the default null value, then against 0.5
        for null_args in ((), (0.5,)):
            base, with_d1, with_d2 = [d.ttest_mean(*null_args)
                                      for d in by_ddof]
            # np.r_ concatenates the returned values into one array
            assert_almost_equal(np.r_[with_d1], np.r_[base], 14)
            assert_almost_equal(np.r_[with_d2], np.r_[base], 14)

        # confidence interval of the mean
        base, with_d1, with_d2 = [d.tconfint_mean() for d in by_ddof]
        assert_almost_equal(with_d1, base, 14)
        assert_almost_equal(with_d2, base, 14)
    def test_weightstats_ddof_tests(self):
        # One-sample case: ttest_mean and tconfint_mean must give the same
        # answer whatever ddof the user passed to DescrStatsW.
        x1_2d, w1 = self.x1_2d, self.w1

        reference = DescrStatsW(x1_2d, weights=w1, ddof=0)
        variants = (
            DescrStatsW(x1_2d, weights=w1, ddof=1),
            DescrStatsW(x1_2d, weights=w1, ddof=2),
        )

        # t-test with the default null value
        expected = reference.ttest_mean()
        observed = [v.ttest_mean() for v in variants]
        for got in observed:
            # np.r_ joins the returned values into a single array
            assert_almost_equal(np.r_[got], np.r_[expected], 14)

        # t-test against a null value of 0.5
        expected = reference.ttest_mean(0.5)
        observed = [v.ttest_mean(0.5) for v in variants]
        for got in observed:
            assert_almost_equal(np.r_[got], np.r_[expected], 14)

        # t confidence interval of the mean
        expected = reference.tconfint_mean()
        observed = [v.tconfint_mean() for v in variants]
        for got in observed:
            assert_almost_equal(got, expected, 14)
def summarize(name, test=False, space_creator=space_optimized):
    """Summarize the runs stored under ``output/{name}/`` and plot their times.

    Every ``*_results.txt`` file is paired with the ``*_log.txt`` file of the
    same prefix. A run counts as failed when its log contains
    "out of memory", or when the sixth line of its results file starts with
    "False" (reported as timed out). For every other run the integer on the
    fourth line of the results file is divided by 1000 and by 60 (minutes)
    and recorded.

    Parameters
    ----------
    name : str
        Sub-directory of ``output/`` to scan; also used as the prefix of every
        printed line and in the names of the saved figures.
    test : bool
        When true, ``check`` is called on the log file of every successful
        run with ``stop_on_error=True``.
    space_creator
        Forwarded unchanged to ``check``.

    Returns
    -------
    list
        Times (in minutes) of the successful runs. When no result file is
        found at all, the counters are printed and the empty list is returned
        without computing the success-rate interval or drawing figures.
    """
    print("---------------------")
    print(f"{name}> summarizing")
    files = glob.glob(f"output/{name}/*_results.txt")
    print(f"{name}> {len(files)} files found")

    failed = 0
    timed_out = 0
    memory_error = 0

    results = []
    for file in files:
        log_file = file.replace("_results.txt", "_log.txt")
        with open(log_file) as f:
            # The marker contains no newline, so searching the raw text is
            # equivalent to searching the re-joined lines.
            out_of_memory = "out of memory" in f.read()
        if out_of_memory:
            failed += 1
            memory_error += 1
            print(f"{name}> {log_file} out of memory")
            continue

        with open(file) as f:
            lines = f.readlines()
        time_taken = int(lines[3]) / 1000 / 60  # minutes
        if lines[5].startswith("False"):
            print(f"{name}> {file} timed out")
            failed += 1
            timed_out += 1
            continue
        results.append(time_taken)

        if test:
            check(log_file, stop_on_error=True, space_creator=space_creator)

    success_count = len(results)
    total = success_count + failed

    print(f"{name}> {success_count} successes")
    if success_count != 0:
        d = DescrStatsW(results)
        print(f"{name}> mean={d.mean}, std_mean={d.std_mean}")
        print(f"{name}> confidence interval", d.tconfint_mean())

    # With no result files the total is 0; report 0.0% instead of dividing
    # by zero.
    failed_pct = failed / total * 100 if total else 0.0
    print(
        f"{name}> {failed} failed, {failed_pct}%")
    print(f"{name}> {timed_out} timed out")
    print(f"{name}> {memory_error} memory error")
    if total == 0:
        # Nothing was processed: there is no proportion to estimate and
        # nothing to plot.
        return results
    print(
        f"{name}> binomial success ci={proportion_confint(success_count, total, method='wilson')}"
    )

    frame = frame_of(results, name)
    plt.figure()
    sns.boxplot(data=frame, y="time")
    plt.savefig(f"output/fig{name}.png")

    # NOTE(review): seaborn's displot is figure-level and presumably opens its
    # own figure, which would leave the one created here empty -- confirm.
    plt.figure()
    sns.displot(frame["time"])
    plt.ylim(0, 350)
    plt.savefig(f"output/fig{name}-dist.png")

    return results
Example #4
0
def grouped_weights_statsdf(df, statscols, groupbycol, weightscol):
    """Generate weighted means and confidence bounds by group for several columns.

    Parameters
    ----------
    df : df
        df to be weighted; it is not modified
    statscols : list
        cols/outcomes for weighted stats
    groupbycol : str
        column name in df that defines groups
    weightscol : str
        column name in df with weights

    Returns
    -------
    df
        multi-indexed df with outcome and groups as index
        stats generated: weighted mean, lower bound and upper bound of the
        interval returned by ``DescrStatsW.tconfint_mean()`` with its default
        arguments (documented here as the 95% CI), weighted n by group,
        total n unweighted.
        An empty df is returned when ``statscols`` is empty.

    """
    per_outcome = []
    for c in statscols:
        # Rows with a missing outcome are dropped separately for each outcome.
        cdf = df.dropna(subset=[c])
        nrobs = len(cdf)
        grouped = cdf.groupby(groupbycol)
        means = []
        lower = []
        upper = []
        nrobs_gr = []
        groups = list(grouped.groups.keys())
        for gr in groups:
            # Fetch the group and the interval once instead of twice each.
            group_df = grouped.get_group(gr)
            stats = DescrStatsW(group_df[c],
                                weights=group_df[weightscol],
                                ddof=0)
            ci_low, ci_high = stats.tconfint_mean()
            means.append(stats.mean)
            lower.append(ci_low)
            upper.append(ci_high)
            nrobs_gr.append(stats.nobs)
        weightedstats = pd.DataFrame([means, lower, upper, nrobs_gr],
                                     columns=groups,
                                     index=[
                                         'weighted mean', 'lower bound',
                                         'upper bound', 'wei_n__group'
                                     ]).T
        weightedstats['tot_n_unweigthed'] = nrobs
        weightedstats['outcome'] = c
        weightedstats.index.name = 'groups'
        colstats = weightedstats.reset_index()
        colstats = colstats.set_index(['outcome', 'groups'])
        per_outcome.append(colstats)

    if not per_outcome:
        # pd.concat rejects an empty list; keep returning an empty frame.
        return pd.DataFrame()
    # One concatenation at the end instead of re-copying the result per column.
    return pd.concat(per_outcome)
Example #5
0
def one_t_test(pdf, data_measlevs, var_name, test_value=0):
    """One sample t-test

    arguments:
    pdf:
        Data indexed by variable name; pdf[var_name] must support dropna()
        (presumably a pandas DataFrame -- confirm against callers).
    data_measlevs:
        Mapping of variable name to measurement level. The test is run only
        for the levels 'int' and 'unk'.
    var_name (str):
        Name of the variable to test.
    test_value (numeric):
        Test against this value.

    return:
    ci_text (str):
        Confidence interval of the mean formatted as '[low, high]', a notice
        when statsmodels is too old, or '' when the test was not run.
    text_result (html str):
        Result in APA format.
    image (matplotlib):
        Bar chart with mean and confidence interval, or None when the test
        was not run.
    """
    # Bound before any branch so every return path yields the same 3-tuple;
    # the non-interval path used to fail on an unbound ci_text.
    ci_text = ''
    text_result = ''
    data = pdf[var_name].dropna()
    if data_measlevs[var_name] in ['int', 'unk']:
        if data_measlevs[var_name] == 'unk':
            text_result += warn_unknown_variable
        if len(set(data)) == 1:
            # Same shape as the regular return: (ci_text, text_result, image).
            return ci_text, _(
                'One sample t-test cannot be run for constant variable.\n'
            ), None

        descr = DescrStatsW(data)
        t, p, df = descr.ttest_mean(float(test_value))
        if LooseVersion(csc.versions['statsmodels']) >= LooseVersion('0.5'):
            # Or we could use confidence_interval_t
            cil, cih = descr.tconfint_mean()
            ci = (cih - cil) / 2  # half-width of the interval, used by the chart
            prec = cs_util.precision(data) + 1
            ci_text = '[%0.*f, %0.*f]' % (prec, cil, prec, cih)
        else:
            ci = 0  # only with statsmodels
            ci_text = _(
                'Sorry, newer statsmodels module is required for confidence interval.\n'
            )
        text_result += _('One sample t-test against %g') % float(
            test_value) + ': <i>t</i>(%d) = %0.3g, %s\n' % (df, t,
                                                            cs_util.print_p(p))

        # Graph
        image = cs_chart.create_variable_population_chart(data, var_name, ci)
    else:
        text_result += _(
            'One sample t-test is computed only for interval variables.')
        image = None
    return ci_text, text_result, image
Example #6
0
def grouped_weights_statscol (df, statscol, groupbycol, weightscol):
    """Generate the weighted mean and confidence bounds of one column by group.

    Parameters
    ----------
    df : df
        Data to be weighted. Rows with a missing ``statscol`` are ignored;
        the caller's df is left unchanged.
    statscol : str
        Column/outcome for the weighted stats.
    groupbycol : str
        Column name in df that defines the groups.
    weightscol : str
        Column name in df with the weights.

    Returns
    -------
    df
        One row per group with the columns 'weighted mean', 'lower bound',
        'upper bound' (bounds as returned by ``DescrStatsW.tconfint_mean()``
        with its default arguments) and 'numberofobs', the number of rows
        with a non-missing ``statscol``.
    """
    # Work on a filtered copy instead of dropping rows from the caller's frame.
    cdf = df.dropna(subset=[statscol])
    nrobs = len(cdf)
    grouped = cdf.groupby(groupbycol)
    means = []
    lower = []
    upper = []
    groups = list(grouped.groups.keys())
    for gr in groups:
        # Fetch the group and the interval once instead of twice each.
        group_df = grouped.get_group(gr)
        stats = DescrStatsW(group_df[statscol], weights=group_df[weightscol], ddof=0)
        ci_low, ci_high = stats.tconfint_mean()
        means.append(stats.mean)
        lower.append(ci_low)
        upper.append(ci_high)
    weightedstats = pd.DataFrame([means, lower, upper], columns=groups, index=['weighted mean', 'lower bound', 'upper bound']).T
    weightedstats['numberofobs'] = nrobs
    return weightedstats
Example #7
0
def confidence_interval_t(data, ci_only=True):
    """95%, two-sided CI based on t-distribution
    http://statsmodels.sourceforge.net/stable/_modules/statsmodels/stats/weightstats.html#DescrStatsW.tconfint_mean

    arguments:
    data:
        Values handed to DescrStatsW; a pandas Series or DataFrame in the
        cases handled explicitly below.
    ci_only (bool):
        If True, return only the half-width of the interval; otherwise also
        return the lower and upper limits.

    return:
    ci when ci_only is True (wrapped in a Series indexed by the column names
    for DataFrame input), else the tuple (ci, cil, cih).
    """
    # FIXME is this solution slow? Should we write our own CI function?
    if LooseVersion(csc.versions['statsmodels']) >= LooseVersion('0.5'):
        descr = DescrStatsW(data)
        cil, cih = descr.tconfint_mean()
        ci = (cih - cil) / 2
    else:
        cil = cih = ci = [None
                          for i in data]  # FIXME maybe this one is not correct
    if not ci_only:
        return ci, cil, cih
    if isinstance(data, pd.DataFrame):
        # without var names the call from comp_group_graph_cum fails
        return pd.Series(ci, index=data.columns)
    # Series and every other input get the bare value; previously anything
    # that was neither a Series nor a DataFrame silently produced None.
    return ci  # FIXME this one is for series? The other is for dataframes?
Example #8
0
# Leave-one-out cross-validation of a 5-nearest-neighbours regressor; the
# scores are negated squared errors (scoring='neg_mean_squared_error').
my_knn_socres = cross_val_score(KNeighborsRegressor(n_neighbors=5),
                                X,
                                y,
                                cv=LeaveOneOut(),
                                scoring='neg_mean_squared_error')

# The scores are negated, so flip the sign before taking the square root.
(-my_lm_scores.mean())**0.5
#> 15.697306009399101 # linear regression

(-my_knn_socres.mean())**0.5
#> 16.07308308943869 # k-nearest neighbours

# Per-fold squared errors of both models, side by side.
my_df = pd.DataFrame({'lm': -my_lm_scores, 'knn': -my_knn_socres})
my_df.head()
#>            lm     knn
#> 0   18.913720  108.16
#> 1  179.215044    0.64
#> 2   41.034336   64.00
#> 3  168.490212  184.96
#> 4    5.085308    0.00

# NOTE(review): the plotted values are squared errors, yet the axis label
# reads r^2 -- confirm the label is intended.
my_df.boxplot().set_ylabel("$r^2$")

from statsmodels.stats.weightstats import DescrStatsW
# One-sample statistics on the per-fold differences between the two models.
d = DescrStatsW(my_df.lm - my_df.knn)
d.ttest_mean()[1]  # p-value
#> 0.6952755720536115

d.tconfint_mean(alpha=0.05, alternative='two-sided')  # confidence interval
#> (-72.8275283312228, 48.95036023665703)
Example #9
0
    def __call__(self, alpha, df):
        """Return a ``Band`` built from the t confidence interval of ``df``.

        ``alpha`` is forwarded to ``DescrStatsW.tconfint_mean``; the two
        values it returns become the lower and upper arguments of the band.
        """
        bounds = DescrStatsW(df).tconfint_mean(alpha=alpha)
        low, high = bounds
        return Band(low, high)
Example #10
0
# Seed NumPy's global generator before sampling (presumably so that the
# shuffle below is reproducible -- confirm).
np.random.seed(75243)
# frac=1 draws a sample as large as the input, i.e. a reordering of it.
temp = nota_media_dos_filmes_com_pelo_menos_10_votos.sample(frac=1)

# Mean of the first i items for i = 1 .. len(temp) - 1; the mean of the
# complete sample is not part of the list.
medias = [temp[0:i].mean() for i in range(1, len(temp))]

plt.plot(medias)

from statsmodels.stats.weightstats import zconfint

# z-based confidence interval of the mean.
zconfint(nota_media_dos_filmes_com_pelo_menos_10_votos)

from statsmodels.stats.weightstats import DescrStatsW

# t-based confidence interval of the mean for the same data.
descr_todos_com_10_votos = DescrStatsW(nota_media_dos_filmes_com_pelo_menos_10_votos)
descr_todos_com_10_votos.tconfint_mean()

"""# Vamos ver o filme 1..."""

# Look up the movie with movieId 1 and its ratings.
filmes = pd.read_csv("movies.csv")
filmes.query("movieId==1")

notas1 = notas.query("movieId == 1")
notas1.head()

# Distribution of the ratings of that movie.
ax = sns.distplot(notas1.rating)
ax.set(xlabel="Nota", ylabel="Densidade")
ax.set_title("Distribuição das notas para o Toy Story")

# Box plot of the same ratings.
ax = sns.boxplot(notas1.rating)
ax.set(xlabel="Nota")
Example #11
0
]
# Second sample; X is defined before this point.
Y = [
    35.4, 34.6, 31.1, 32.4, 33.3, 34.7, 35.3, 34.3, 32.1, 28.3, 33.3, 30.5,
    32.6, 33.3, 32.2
]

a = 0.05  # significance level (default) = 1 - confidence level
alt = 'two-sided'  # two-sided test (default)
# for a left one-sided test use 'smaller'
# for a right one-sided test use 'larger'

d = DescrStatsW(np.array(X) - np.array(Y))  # paired samples
d.ttest_mean(alternative=alt)[1]  # p-value
#> 0.0006415571512322235

d.tconfint_mean(alpha=a, alternative=alt)  # confidence interval
#> (-3.9955246743198867, -1.3644753256801117)

c = CompareMeans(DescrStatsW(X), DescrStatsW(Y))  # samples that are not paired

ve = 'pooled'  # assume equal variances (default); use 'unequal' otherwise
c.ttest_ind(alternative=alt, usevar=ve)[1]  # p-value
#> 0.000978530937238609

c.tconfint_diff(alpha=a, alternative=alt, usevar=ve)  # confidence interval
#> (-4.170905570517185, -1.1890944294828283)

### 4.4.4 独立性の検定(カイ2乗検定)

import pandas as pd
my_url = ('https://raw.githubusercontent.com/taroyabuki'
# Rows whose diagnosis is 'B'.
diagnostico_b = df.query("diagnosis == 'B'")


# Run the z-test for the mean (comparing the results): group M against its
# own mean, then group M against the mean of group B.
ztest(diagnostico_m['mean_radius'], value = diagnostico_m['mean_radius'].mean())
ztest(diagnostico_m['mean_radius'], value = diagnostico_b['mean_radius'].mean())


# Compute the confidence interval for each group
zconfint(diagnostico_m['mean_radius'])
zconfint(diagnostico_b['mean_radius'])


"""----------------------------------------------------------------------------
        T Test
"""
# Split the data by diagnosis again for the t-based analysis.
diagnostico_m = df.query("diagnosis == 'M'")
diagnostico_b = df.query("diagnosis == 'B'")

# Apply the test: build the descriptive statistics of each group
resultados_m = DescrStatsW(diagnostico_m['mean_radius'])
resultados_b = DescrStatsW(diagnostico_b['mean_radius'])

# Compute the t confidence interval for each group
resultados_m.tconfint_mean()
resultados_b.tconfint_mean()