Example no. 1
def fit_poissons(X, alpha=0.05, min_dist=0.2, min_zscore=1):
    if np.mean(X) < 5:  # Can't really form a good statistic
        meanbounds = sms.DescrStatsW(X).tconfint_mean(alpha=alpha)
        return {"n": 1, "coeffs": [meanbounds[1]]}
    shift = np.min(X) - 1  # Needed later to shift back
    Xarr = np.log(X - shift)
    res = one_or_two_mixtures(Xarr.tolist(),
                              alpha=0.05,
                              min_dist=min_dist,
                              min_zscore=min_zscore)
    numcomponents = len(res["low_means"])
    if numcomponents == 2:
        mean1 = 0.5 * (res["low_means"][0] + res["high_means"][0])
        mean2 = 0.5 * (res["low_means"][1] + res["high_means"][1])
        mean1 = np.exp(mean1) + shift
        mean2 = np.exp(mean2) + shift
        sz1 = res["n"][0]
        sz2 = res["n"][1]
        alpha = sz1 / sz2
        # Now optimize with estimates
        # coeffs = fit_data_two_poissons(X, [alpha, mean1, mean2])
        coeffs_fm = fit_poissons_fixed_means(X, mean1, mean2)
        print("Optimality fm = {}".format(coeffs_fm.cost))
        coeffs_2 = fit_data_two_poissons(X, [alpha, mean1, mean2])
        print("Optimality 2 = {}".format(coeffs_2.cost))
        coeffs_1 = fit_data_one_poisson(X, [np.mean(X)])
        print("Optimality 1 = {}".format(coeffs_1.cost))
        if coeffs_2.cost < coeffs_fm.cost:
            coeffs = coeffs_2
        else:
            coeffs = coeffs_fm
        if coeffs.x[0] > 0.0 and 2 * coeffs.cost < coeffs_1.cost:
            return {"n": 2, "coeffs": coeffs}

    print("Only have one!")
    Xarr = np.array(X)
    mean1 = np.mean(Xarr)
    mean2 = mean1 + min_zscore * np.sqrt(mean1)
    mean1 = np.mean(Xarr[Xarr < mean2 - np.sqrt(mean2) / 2.0])

    coeffs = fit_poissons_fixed_means(X, mean1, mean2)
    print("Alpha = {}".format(coeffs.x[0]))
    # coeffs = fit_data_one_poisson(X, [mean1])
    return {"n": 2, "coeffs": coeffs}
Example no. 2
def main():

    a = analysis.run(force_fit=False, use_backup_file=True)
    parameters = a.class_model.param_labels

    neutral = {"distortion": 1, "risk_aversion": 0, "side_bias": 0}

    monkey_list = a.monkeys.copy()
    monkey_list.remove("Havane")
    monkey_list.remove("Gladys")
    monkey_list = ["Havane", "Gladys"] + monkey_list

    for p in parameters:

        row_list = []

        for m in monkey_list:

            if m == "Havane":
                m_name = "Hav"
            elif m == "Gladys":
                m_name = "Gla"
            else:
                m_name = m

            row = {"ID": m_name}

            for cond in GAIN, LOSS:

                x = a.cpt_fit[cond][m][p]
                mean = np.mean(x)
                ic = sms.DescrStatsW(x).tconfint_mean()

                # print(f"{p} {m} {mean:.2f} [{ic[0]:.2f}, {ic[1]:.2f}])
                row[f"{cond.capitalize()} - Mean [CI]"] = f"{mean:.2f} [{ic[0]:.2f}, {ic[1]:.2f}]"

                if p in neutral.keys():
                    could_be_neutral = "Yes" if ic[0] <= neutral[p] <= ic[1] else "No"
                    row[f"{cond.capitalize()} - Neutral"] = could_be_neutral
            row_list.append(row)

        df = pd.DataFrame(row_list)
        df.to_csv(os.path.join(TABLE_FOLDER, f"table_{p}.csv"), index=False)
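The table-building pattern above (a "mean [CI]" string per condition and subject) can be exercised on toy data without the analysis module; the condition labels and values below are made up:

import numpy as np
import pandas as pd
import statsmodels.stats.api as sms

rng = np.random.default_rng(1)
row_list = []
for m_name in ["Hav", "Gla"]:
    row = {"ID": m_name}
    for cond in ("gain", "loss"):  # hypothetical condition labels
        x = rng.normal(loc=0.5, scale=0.2, size=30)
        mean = np.mean(x)
        ic = sms.DescrStatsW(x).tconfint_mean()
        row[f"{cond.capitalize()} - Mean [CI]"] = f"{mean:.2f} [{ic[0]:.2f}, {ic[1]:.2f}]"
    row_list.append(row)
print(pd.DataFrame(row_list))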
Example no. 3
def calculate_confidence_interval_for_weight_mean(dataset,
                                                  confidence_level: float):
    """
    arguments:
    confidence_level -- In plain English, a Confidence Interval is a range of values we are fairly sure our true value lies in.
     The level of "fair surety" is called confidence level significance level (alpha) + confidence level = 1
     alpha is also the threshold of pvalue.
    """

    assert not np.isnan(dataset).any()
    assert confidence_level > 0.8

    ci_lower_bound, ci_upper_bound = sms.DescrStatsW(dataset).tconfint_mean(
        alpha=(1 - confidence_level))
    print('Assuming that the population is normally distributed, ', end='')
    print('C.I. with {}% confidence: [{}, {}]'.format(
        confidence_level * 100, round(ci_lower_bound, 10),
        round(ci_upper_bound, 10)))
    return ci_lower_bound, ci_upper_bound
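A possible usage sketch, assuming the function above is in scope together with its imports (numpy as np, statsmodels.stats.api as sms):

import numpy as np

rng = np.random.default_rng(42)
weights = rng.normal(loc=70.0, scale=10.0, size=200)  # synthetic weights
lower, upper = calculate_confidence_interval_for_weight_mean(weights, confidence_level=0.95)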
Example no. 4
def confint_mean(var, alpha=0.05, alternative='two-sided'):
    '''
    Confidence interval for the mean
    :param var: pandas Series (or DataFrame column)
    :param alpha: significance level
    :param alternative : h1 != val (two-sided)
                         h1 > val (larger)
                         h1 < val (smaller)
    :return: (lower, upper) confidence bounds
    '''
    s = smstats.DescrStatsW(var)
    ci = None
    if s.std:  # nonzero sample std -> normal (z) interval
        ci = s.zconfint_mean(alpha, alternative)
    else:
        ci = s.tconfint_mean(alpha, alternative)
    print("{0}Confidence Interval - Compare Means{0}".format("=" * 5))
    print("=" * 50)
    print(pd.DataFrame({'Mean': [var.mean()], 'Lower CI': [ci[0]], 'Upper CI': [ci[1]]}))
    print("=" * 50)
    return ci
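A quick usage sketch, assuming confint_mean above and its imports (pandas as pd, statsmodels.stats.api as smstats) are available:

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
var = pd.Series(rng.normal(loc=10.0, scale=2.0, size=100), name="measurement")
ci = confint_mean(var, alpha=0.05, alternative='two-sided')
print(ci)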
Example no. 5
def make_pdf_and_cdf_plot(z, outfile='histogram.png'):
    fig, ax1 = plt.subplots()

    nbins = 21
    bins = np.linspace(RIRANGE[0], RIRANGE[1], nbins)
    pdf, _, _ = ax1.hist(z, bins=bins, density=True, color=cmap(0.5))
    plt.suptitle('Hydrotrend: recurrence interval distribution', fontsize=20)

    ax1.set_ylim(0.0, 0.4)
    ax1.set_xlabel('RI [yr]', fontsize=18)
    ax1.set_ylabel('pdf', fontsize=18)

    cdf = np.cumsum(pdf)
    cdf /= cdf.max()
    ax2 = ax1.twinx()
    ax2.plot(bins[:-1], cdf, color='b', lw=1.5)

    ax2.set_ylabel('cdf', fontsize=18)

    ri_median = np.median(z)
    ri_mean = z.mean()
    ri_stdv = z.std()
    ri_ci = sms.DescrStatsW(z).tconfint_mean()
    top = ax2.get_ylim()[-1]
    right = ax2.get_xlim()[-1]
    ymrk = 0.95 * top
    ax2.plot([ri_mean - ri_stdv, ri_mean + ri_stdv], [ymrk, ymrk],
             color=cmap(0.5),
             lw=0.75)
    ax2.plot(ri_ci, [ymrk, ymrk], '|', color=cmap(0.5), ms=10, mew=1)
    ax2.plot(ri_mean, ymrk, 's', color=cmap(0.5), ms=5)
    ax2.plot(ri_median, ymrk, 'D', color=cmap(0.5), ms=5)

    print('mean = {}'.format(ri_mean))
    print('median = {}'.format(ri_median))
    print('std = {}'.format(ri_stdv))
    print('ci = {}'.format(ri_ci))

    plt.savefig(outfile, dpi=150)
    plt.close()
Example no. 6
def conn_highvariance(allcovdata):

    """ 
    -Identify windows with high variance in connectivity for each subject, 
    -calculate the average connectitivy (average of all edges)
    -Define a 95% confidence interval on this average 
    -Select data points outside (higher values)
    
    Parameters
    ----------
    allcovdata:{array-like} , Connectivity matrices of all subjects , 
                shape =(Subjects X NWindows X Mfeatures X Mfeatures )
    
    Returns
    ----------
    
    mtd_allsubj_highvar:{array-like} , All windows of High Variance , shape= (Windows X Mfeatures X Mfeatures)

    """
    mtd_allsubj_highvar = []
    
    # High variance windows for each subject 
    var_mtd_allsubj =[]
    
    for curcov in allcovdata:

        # calculate variance of connectivity intra subject
        var_mtd_allsubj.append([a.mean() for a in curcov])

    # Extract points with high variance ( > 95 % confidence interval )
    for cur_i, curvarmtd in enumerate(var_mtd_allsubj):

        a = sms.DescrStatsW(curvarmtd)
        _, high = a.tconfint_mean()
        ind_highvar = np.argwhere(np.asarray(curvarmtd) > high)

        # select the covdata for these points only
        curcov = allcovdata[cur_i]
        mtd_allsubj_highvar.append(curcov[ind_highvar])
    return np.vstack(mtd_allsubj_highvar)
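A minimal smoke test with random connectivity matrices (shapes are illustrative only), assuming numpy and statsmodels.stats.api are imported as np and sms:

import numpy as np

rng = np.random.default_rng(0)
allcovdata = rng.normal(size=(4, 20, 10, 10))  # 4 subjects, 20 windows, 10x10 matrices
highvar_windows = conn_highvariance(allcovdata)
print(highvar_windows.shape)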
Example no. 7
def evaluate(dataset, predictions):
    exact_match = total = 0
    f1_scores = []
    for article in dataset:
        for paragraph in article['paragraphs']:
            for qa in paragraph['qas']:
                total += 1
                if qa['id'] not in predictions:
                    message = 'Unanswered question ' + qa['id'] + \
                              ' will receive score 0.'
                    print(message, file=sys.stderr)
                    continue
                ground_truths = list(map(lambda x: x['text'], qa['answers']))
                prediction = predictions[qa['id']]
                exact_match += metric_max_over_ground_truths(
                    exact_match_score, prediction, ground_truths)
                f1_scores.append(
                    metric_max_over_ground_truths(f1_score, prediction,
                                                  ground_truths))

    # Exact-match is binary, so use binomial CI's
    exact_match_mean = 100.0 * exact_match / total
    lower, upper = sms.proportion_confint(exact_match,
                                          total,
                                          alpha=0.05,
                                          method="beta")
    exact_match_ci = (100.0 * lower, 100.0 * upper)

    # F1 scores are continuous in [0, 1]. We could use fancy bounded random
    # variable CI, but for now, we settle for the normal approximation.
    f1_mean = 100.0 * sum(f1_scores) / total
    lower, upper = sms.DescrStatsW(f1_scores).tconfint_mean()
    f1_ci = (100.0 * lower, 100.0 * upper)

    return {
        'exact_match': exact_match_mean,
        'exact_match_ci': exact_match_ci,
        'f1': f1_mean,
        'f1_ci': f1_ci
    }
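For clarity, the two interval styles used above (binomial for the binary exact-match score, t-based for the continuous F1 score) side by side on toy numbers:

import numpy as np
import statsmodels.stats.api as sms

rng = np.random.default_rng(0)
exact_match, total = 412, 500                 # toy counts
f1_scores = rng.beta(a=8, b=2, size=total)    # toy continuous scores in [0, 1]

em_lower, em_upper = sms.proportion_confint(exact_match, total, alpha=0.05, method="beta")
f1_lower, f1_upper = sms.DescrStatsW(f1_scores).tconfint_mean()
print((100.0 * em_lower, 100.0 * em_upper), (100.0 * f1_lower, 100.0 * f1_upper))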
Example no. 8
def getSpikeStats(data, groups):

    groupIDs = np.unique(groups).astype(int)
    nGroups = len(groupIDs)
    stats = {}
    stats['mean'] = np.zeros(nGroups)
    stats['sem'] = np.zeros(nGroups)
    stats['conf_Int'] = np.zeros((nGroups, 2))
    stats['N'] = np.zeros(nGroups)
    stats['MWUz'] = np.zeros(nGroups)

    for i in groupIDs:
        g1 = groups == i
        g2 = groups != i
        x_stats = sms.DescrStatsW(data[g1])
        stats['mean'][i] = x_stats.mean
        stats['sem'][i] = x_stats.std_mean
        stats['N'][i] = x_stats.nobs
        stats['conf_Int'][i] = x_stats.tconfint_mean()
        stats['MWUz'][i], _, _ = getMWUz(data, g1, g2)

    return stats
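getSpikeStats relies on a getMWUz helper that is not shown. Assuming it returns a z-score derived from the Mann-Whitney U statistic (plus the U statistic and p-value), a rough sketch could look like this; the normal approximation below ignores tie correction:

import numpy as np
from scipy.stats import mannwhitneyu

def getMWUz(data, g1, g2):
    """Hypothetical helper: z-score of the Mann-Whitney U statistic for data[g1] vs data[g2]."""
    x, y = data[g1], data[g2]
    u, p = mannwhitneyu(x, y, alternative='two-sided')
    n1, n2 = len(x), len(y)
    mu = n1 * n2 / 2.0
    sigma = np.sqrt(n1 * n2 * (n1 + n2 + 1) / 12.0)  # no tie correction
    return (u - mu) / sigma, u, p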
Example no. 9
def main():
    df = pd.read_csv(FILE, index_col=0)

    # SD, CI, effect size
    desc = sms.DescrStatsW(df)
    dsim_values = desc.mean
    sd = desc.std_ddof(1)
    lower_ci, _ = desc.tconfint_mean(alternative='larger')
    cohens_d = dsim_values / sd

    # t test
    raw_tstats = ttest_1samp(df, popmean=0, axis=0)
    column_bools = ~np.isnan(raw_tstats.pvalue) & \
                   list(map(lambda c: not any(ex in c for ex in EXCLUDE), df.columns))
    all_tstats = [
        np.array(dsim_values[column_bools]), sd[column_bools],
        lower_ci[column_bools], cohens_d[column_bools],
        raw_tstats.statistic[column_bools], raw_tstats.pvalue[column_bools],
        raw_tstats.pvalue[column_bools] / 2
    ]  # one tailed p

    # multiple comparison correction (applied to the one-tailed p-values)
    multi_corrections = ['fdr_bh']
    for method in multi_corrections:
        corrected = multipletests(all_tstats[6], alpha=ALPHA, method=method)
        all_tstats.append(corrected[1])

    p_df = pd.DataFrame(all_tstats,
                        columns=df.columns[column_bools],
                        index=[
                            'delta_sim', 'sd', 'lower_ci', 'cohens_d',
                            'tstats', 'raw_p_2tailed', 'raw_p_1tailed'
                        ] + ['p_' + m for m in multi_corrections])
    p_df = p_df.T.sort_values('raw_p_1tailed')
    p_df.to_csv(OUTCSV)
    print('Output to: ' + OUTCSV)
Example no. 10
def add_data(frame, group, key, algs, opt_gat, domain_path):
    new_frame = DataFrame()
    means = []
    bounds = []
    for k, action_group in (DataFrame(group).groupby(['actionDuration'])):
        seeds = []

        for j, seed_group in action_group.groupby('domainSeed'):
            cur_opt_gat = opt_gat[(k, domain_path, j)]
            actual_gat = action_group['goalAchievementTime'].iloc(0)[0]
            seeds.append(actual_gat / cur_opt_gat)

        bound = sms.DescrStatsW(seeds).tconfint_mean()
        mean = statistics.mean(seeds)
        bounds.append((abs(mean - bound[0]), abs(mean - bound[1])))
        means.append(statistics.mean(seeds))

        # mean_gat = action_group.mean()['goalAchievementTime']
        # means.append(mean_gat / opt_gat[(k, domain_path)])
    new_frame[key] = means
    new_frame[key + "_" + "lerror"] = [i[0] for i in bounds]
    new_frame[key + "_" + "rerror"] = [i[1] for i in bounds]
    algs.append(key)
    return pd.concat([frame, new_frame], axis=1)
#489756&326584 itemlerinin fiyat ortalamaları arasında istatistiksel açıdan fark vardır!
#489756&675201 itemlerinin fiyat ortalamaları arasında istatistiksel açıdan fark vardır!
#489756&201436 itemlerinin fiyat ortalamaları arasında istatistiksel açıdan fark vardır!
#361254&874521 itemlerinin fiyat ortalamaları arasında istatistiksel açıdan fark yoktur!
#361254&326584 itemlerinin fiyat ortalamaları arasında istatistiksel açıdan fark vardır!
#361254&675201 itemlerinin fiyat ortalamaları arasında istatistiksel açıdan fark yoktur!
#361254&201436 itemlerinin fiyat ortalamaları arasında istatistiksel açıdan fark yoktur!
#874521&326584 itemlerinin fiyat ortalamaları arasında istatistiksel açıdan fark vardır!
#874521&675201 itemlerinin fiyat ortalamaları arasında istatistiksel açıdan fark yoktur!
#874521&201436 itemlerinin fiyat ortalamaları arasında istatistiksel açıdan fark yoktur!
#326584&675201 itemlerinin fiyat ortalamaları arasında istatistiksel açıdan fark vardır!
#326584&201436 itemlerinin fiyat ortalamaları arasında istatistiksel açıdan fark vardır!
#675201&201436 itemlerinin fiyat ortalamaları arasında istatistiksel açıdan fark yoktur!

#Confidence Interval
sms.DescrStatsW(df["price"]).tconfint_mean()
#(38.334045331672925, 39.216612629527816)
#Of the items with category_id {201436, 326584, 361254, 489756, 675201, 874521},
#the prices of items 489756, 326584 and 675201
#should fall within the confidence interval.

#simulation
#for the minimum revenue
minfreq = len(df[df['price'] >= 35.693170])
income_min = minfreq * 35.6931
income_min

#average revenue
meanfreq = len(df[df['price'] >= 37.443592])
income_mean = meanfreq * 37.4435
income_mean
Example no. 12
#missing for media opti
frames['OPTI_f4d']['cctrust_media'] = np.nan

vizframes = {}
# make a set with all items to visualise -->data
for cntry in cntrs:
    vizframes[cntry] = frames[cntry][vizitems].astype('float64')
    vizframes[cntry]['country'] = cntry
    print(vizframes[cntry])
data = pd.concat(vizframes.values(), ignore_index=True)

# make a set with totals and 95 cis

totals_m = data.groupby('country').agg([
    'mean', 'count', 'sem',
    lambda lb: sms.DescrStatsW(lb.dropna()).tconfint_mean(alpha=0.05)[0],
    lambda ub: sms.DescrStatsW(ub.dropna()).tconfint_mean(alpha=0.05)[1]
])
# add a better column label
totals_m.columns = totals_m.columns.set_levels(
    ['mean', 'count', 'sem', 'lowerbound', 'upperbound'], level=1)

# sortorder items/overall totals (by mean)
totals_institutions = data[vizitems].agg(['mean', 'count', 'sem'
                                          ]).T.sort_values(by='mean',
                                                           ascending=False)
totals_countries = data.groupby('country').mean().mean(axis=1).sort_values(
    ascending=True)
# labels for countries
newlabels = dict(
    zip(list(totals_m.index),
Example no. 13
def t_distribution_ci(df,
                      metric='post_sales_temp',
                      control='Control',
                      test='Test_1',
                      test_flag='test_flag',
                      alpha=0.05):

    signi = []
    p_value = []

    test_data_A = df[df[test_flag] == control]
    test_data_B = df[df[test_flag] == test]
    test_data_A[metric] = test_data_A[metric].astype('float')
    test_data_B[metric] = test_data_B[metric].astype('float')
    print(test_data_A[metric].quantile(.995))
    #test_data_A_clean = test_data_A[(test_data_A[metric]>0) & (test_data_A[metric]<test_data_A[metric].quantile(.995))]
    test_data_A_clean = test_data_A
    print(test_data_B[metric].quantile(.995))
    #test_data_B_clean = test_data_B[(test_data_B[metric]>0) & (test_data_B[metric]<test_data_B[metric].quantile(.995))]
    test_data_B_clean = test_data_B
    #Combine the cleaned data sets as one
    test_data_clean = pd.concat([test_data_A_clean, test_data_B_clean])
    #Summarize the metrics:- Calculating totals
    test_summary1 = test_data_clean.groupby(test_flag).agg({metric: 'sum'})
    #Summarize the metrics:- Calculating means
    test_summary2 = test_data_clean.groupby(test_flag).agg({metric: 'mean'})
    #Transposing the summaries
    test_summary1 = test_summary1.T
    test_summary2 = test_summary2.T

    #Initialize a dataframe with test stats
    test_stats = pd.DataFrame(
        columns=['pct_lft', 'conf_int_lb', 'conf_int_ub', 'p-value'])
    #Concatenate the test stats with both the summaries
    test_summary1 = pd.concat([test_summary1, test_stats],
                              axis=1,
                              ignore_index=False,
                              sort=False)
    #Calculate pct_lift for all the metrics
    test_summary1['pct_lft'] = (test_summary1[test] - test_summary1[control]
                                ) / test_summary1[control] * 100
    test_summary2 = pd.concat([test_summary2, test_stats],
                              axis=1,
                              ignore_index=False,
                              sort=False)
    #Calculate pct_lift for all the metrics
    test_summary2['pct_lft'] = (test_summary2[test] - test_summary2[control]
                                ) / test_summary2[control] * 100

    cm = sms.CompareMeans(
        sms.DescrStatsW(
            test_data_A_clean[metric][test_data_A_clean[metric].notnull()]),
        sms.DescrStatsW(
            test_data_B_clean[metric][test_data_B_clean[metric].notnull()]))
    lb, rb = cm.tconfint_diff(usevar='unequal',
                              alternative='two-sided',
                              alpha=0.10)

    test_summary2['conf_int_lb'] = (rb * -1) / test_data_A_clean[metric].mean()
    test_summary2['conf_int_ub'] = (lb * -1) / test_data_A_clean[metric].mean()

    t_stat, test_summary2['p-value'] = sc.ttest_ind(
        test_data_A_clean[metric][test_data_A_clean[metric].notnull()],
        test_data_B_clean[metric][test_data_B_clean[metric].notnull()],
        equal_var=False)

    if (test_summary2['p-value'].iloc[0] <
            alpha) and (test_summary2['pct_lft'].iloc[0] > 0):
        signi.append('Significant with lift')
    elif (test_summary2['p-value'].iloc[0] <
          alpha) and (test_summary2['pct_lft'].iloc[0] < 0):
        signi.append('Significant, control performance better than test')
    elif (test_summary2['p-value'].iloc[0] >
          alpha) and (test_summary2['pct_lft'].iloc[0] < 0):
        signi.append('Not significant with negative lift')
    elif (test_summary2['p-value'].iloc[0] >
          alpha) and (test_summary2['pct_lft'].iloc[0] > 0):
        signi.append('Not significant with positive lift')
    else:
        signi.append('Nothing')

    print(signi)

    test_summary2['significance'] = signi
    return test_summary2
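A usage sketch on a toy A/B frame, assuming t_distribution_ci above is in scope along with the module-level imports it relies on (pandas as pd, numpy as np, scipy.stats as sc, statsmodels.stats.api as sms):

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
df = pd.DataFrame({
    "test_flag": ["Control"] * 500 + ["Test_1"] * 500,
    "post_sales_temp": np.concatenate([rng.normal(100, 20, 500),
                                       rng.normal(104, 20, 500)]),
})
summary = t_distribution_ci(df, metric="post_sales_temp",
                            control="Control", test="Test_1",
                            test_flag="test_flag", alpha=0.05)
print(summary)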
Example no. 14
def run_benchmark(benchmark, max_runs, timeout_hours, output_file, min_runs=3):
    def run_benchmark_once():
        print('Running benchmark... ', end='', flush=True)
        result = benchmark.run()
        print(result)
        for dimension, value in result.items():
            results_by_dimension[dimension] += [value]

    results_by_dimension = defaultdict(lambda: [])
    print('Preparing for benchmark... ', end='', flush=True)
    benchmark.prepare()
    print('Done.')

    start_time = timer()

    # Run at least min_runs times
    for i in range(min_runs):
        run_benchmark_once()

    # Then consider running a few more times to get the desired precision.
    while True:
        if timer() - start_time > timeout_hours * 3600:
            print(
                "Warning: timed out, couldn't determine a result with the desired precision."
            )
            break

        for dimension, results in results_by_dimension.items():
            if all(result == results[0] for result in results):
                # If all results are exactly the same the code below misbehaves. We don't need to run again in this case.
                continue
            confidence_interval = stats.DescrStatsW(results).tconfint_mean(
                0.05)
            confidence_interval_2dig = (round_to_significant_digits(
                confidence_interval[0],
                2), round_to_significant_digits(confidence_interval[1], 2))
            if abs(confidence_interval_2dig[0] -
                   confidence_interval_2dig[1]) > numpy.finfo(float).eps * 10:
                if len(results) < max_runs:
                    print(
                        "Running again to get more precision on the metric %s. Current confidence interval: [%.3g, %.3g]"
                        % (dimension, confidence_interval[0],
                           confidence_interval[1]))
                    break
                else:
                    print(
                        "Warning: couldn't determine a precise result for the metric %s. Confidence interval: [%.3g, %.3g]"
                        % (dimension, confidence_interval[0],
                           confidence_interval[1]))
        else:
            # We've reached sufficient precision in all metrics, or we've reached the max number of runs.
            break

        run_benchmark_once()

    # We've reached the desired precision in all dimensions or reached the maximum number of runs. Record the results.
    rounded_confidence_intervals_by_dimension = {}
    confidence_intervals_by_dimension = {}
    for dimension, results in results_by_dimension.items():
        confidence_interval = stats.DescrStatsW(results).tconfint_mean(0.05)
        confidence_interval_2dig = (round_to_significant_digits(
            confidence_interval[0],
            2), round_to_significant_digits(confidence_interval[1], 2))
        rounded_confidence_intervals_by_dimension[
            dimension] = confidence_interval_2dig
        confidence_intervals_by_dimension[dimension] = (
            confidence_interval, confidence_interval_2dig)
    with open(output_file, 'a') as f:
        json.dump(
            {
                "benchmark": benchmark.describe(),
                "results": confidence_intervals_by_dimension
            }, f)
        print(file=f)
    print('Benchmark finished. Result: ',
          rounded_confidence_intervals_by_dimension)
    print()
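run_benchmark assumes a round_to_significant_digits helper that is not part of the snippet; a plausible implementation (an assumption, not necessarily the author's) is:

import math

def round_to_significant_digits(value, num_digits):
    """Round value to num_digits significant digits (0.0 is returned unchanged)."""
    if value == 0.0:
        return 0.0
    magnitude = math.floor(math.log10(abs(value)))
    return round(value, -magnitude + (num_digits - 1))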
Example no. 15
def plot_hvi(parameters_file, output_hvi_file_name, list_of_dirs):
    """
    Plot the hypervolume indicator (HVI) results of the design space exploration.
    In this plot specifically we plot the HVI of HyperMapper's DSE against the HVI of a competing approach.
    On the x axis we plot time in seconds and on the y axis the HVI.
    HVI to be computed needs a real Pareto or at least a Pareto that is the best found by the results concatenation of
    HyperMapper and the competing approach.

    ######################################################
    ######### Input of this script ######################
    # 1) a file that is the real Pareto or the best Pareto found
    #    (supposing the we are comparing several approaches for example the best Pareto is the result of all these approaches combined).
    # 2) a file containing all the samples of the exploration (not only the Pareto).
    #    From this file we can compute the Pareto at time t and then the hvi at time t
    """
    try:
        import statsmodels.stats.api as sms
    except ImportError:
        # TODO: Long-term: move this import to the top.
        raise ImportError(
            "Failed to import statsmodels. Statsmodels is required for plot_hvi."
        )
    xlabel = "Time (sec)"
    ylabel = "HyperVolume Indicator (HVI)"
    number_of_bins = 20

    filename, file_extension = os.path.splitext(parameters_file)
    if file_extension != ".json":
        print(
            "Error: invalid file name. \nThe input file has to be a .json file not a %s"
            % file_extension
        )
        exit(1)
    with open(parameters_file, "r") as f:
        config = json.load(f)

    schema = json.load(resource_stream("hypermapper", "schema.json"))

    DefaultValidatingDraft4Validator = extend_with_default(Draft4Validator)
    DefaultValidatingDraft4Validator(schema).validate(config)

    if "application_name" in config:
        application_name = config["application_name"]
    else:
        application_name = ""

    print("########## plot_hvi.py #########################")
    print("### Parameters file is %s" % parameters_file)
    print("### Application name is %s" % application_name)
    print("### The input directories data are %s" % str(list_of_dirs))
    print("################################################")

    param_space = space.Space(config)
    optimization_metrics = param_space.get_optimization_parameters()

    ###################################################################################################################
    ########### Compute the hypervolume of all the input files concatenated as a reference for the HVI metric.
    ###################################################################################################################
    input_files = {}

    # y_data_mean is a dict keyed by directory; each entry holds, for every x point, the mean over the repeated files in that directory. lower and upper hold the confidence-interval bounds.
    y_data_mean = defaultdict(list)
    y_data_median = defaultdict(list)
    y_data_min = defaultdict(list)
    y_data_max = defaultdict(list)
    y_data_lower = defaultdict(list)
    y_data_upper = defaultdict(list)
    bin_array_X = {}
    number_or_runs_in_bins = {}

    for dir in list_of_dirs:
        input_files[dir] = [f for f in listdir(dir) if isfile(join(dir, f))]

    for dir in list_of_dirs:
        files_to_remove = []
        for file in input_files[dir]:
            filename, file_extension = os.path.splitext(file)
            if file_extension != ".csv":
                print(
                    "Warning: file %s is not a csv file, it will not be considered in the HVI plot. "
                    % file
                )
                files_to_remove.append(file)
        # Don't move this for loop inside the previous identical one, otherwise you will remove the elements before they get processed, because of overlapping references.
        for file in files_to_remove:
            input_files[dir].remove(file)

    for dir in list_of_dirs:
        if len(input_files[dir]) == 0:
            print(
                "Warning: directory %s is empty, it will not be considered in the HVI plot."
                % dir
            )
            del input_files[dir]

    if len(input_files) == 0:
        print("Error: there no input files to compute the HVI.")

    print("The files used as a input are: ")
    for i, dir in enumerate(input_files.keys()):
        print(
            "Directory "
            + str(i)
            + ": "
            + dir
            + ", # of files: "
            + str(len(input_files[dir]))
            + ", list of files: "
            + str(input_files[dir])
        )

    all_data_files = []
    for dir in input_files.keys():
        for file in input_files[dir]:
            all_data_files += [dir + "/" + file]

    selection_keys = (
        param_space.get_output_parameters() + param_space.get_timestamp_parameter()
    )
    feasible_flag = True if (param_space.get_feasible_parameter() != [None]) else False
    concatenated_all_data_array = param_space.load_data_files(
        all_data_files, selection_keys_list=selection_keys, only_valid=feasible_flag
    )

    if len(next(iter(concatenated_all_data_array.values()))) == 0:
        return return_empty_images(
            application_name,
            input_files,
            number_of_bins,
            output_hvi_file_name,
            xlabel,
            ylabel,
        )

    bounds = {}
    max_point = []
    standard_deviation_optimization_metrics = []
    max_min_difference = []
    # Get bounds of objective space
    for metric in optimization_metrics:
        X = np.array(concatenated_all_data_array[metric])

        standard_deviation = np.std(X, axis=0)
        standard_deviation_optimization_metrics.append(standard_deviation)
        X /= standard_deviation

        concatenated_all_data_array[metric] = X
        bounds[metric] = (
            min(concatenated_all_data_array[metric]),
            max(concatenated_all_data_array[metric]),
        )
        max_point.append(bounds[metric][1])
        max_min_difference.append(bounds[metric][1] - bounds[metric][0])
        print(
            "(min, max) = (%f, %f) for the metric %s. This is to compute the hypervolume."
            % (bounds[metric][0], bounds[metric][1], metric)
        )

    total_volume = prod(max_min_difference)
    list_of_objectives = [
        concatenated_all_data_array[objective]
        for objective in param_space.get_optimization_parameters()
    ]
    reformatted_all_data = list(zip(*list_of_objectives))

    # Get dominated hypervolume for Pareto of all data observed
    hv_all_data = H(reformatted_all_data, max_point)
    print("The hypervolume of all the files concatenated: %d" % hv_all_data)

    ###################################################################################################################
    ########### Compute the HVI for each directory.
    ###################################################################################################################
    hvi = {}
    for dir in input_files:
        print("Compute HVI for %s" % dir)
        convert_in_seconds = 1000.0
        hvi[dir], bin_array_X[dir], number_or_runs_in_bins[dir] = compute_hvi(
            standard_deviation_optimization_metrics,
            input_files[dir],
            dir,
            total_volume,
            max_point,
            hv_all_data,
            param_space,
            convert_in_seconds,
            number_of_bins,
        )

        # Round the floating point numbers to 1 decimal for clarity of visualization.
        bin_array_X[dir] = [round(float(i), 1) for i in bin_array_X[dir]]
        for file in hvi[dir]:
            for bin in hvi[dir][file]:
                hvi[dir][file][bin] = round(float(hvi[dir][file][bin]), 1)

    ###################################################################################################################
    ########### Plot all the HVIs (using box plots bin_array_X and hvi)
    ###################################################################################################################

    for dir in input_files:
        hvi_list_of_lists = []
        each_bin = defaultdict(list)
        for file in hvi[dir]:
            for bin in hvi[dir][file]:
                each_bin[bin].append(hvi[dir][file][bin])
        for bin in hvi[dir][file]:
            hvi_list_of_lists.append(
                each_bin[bin]
            )  # This is a list of bins and for each bin there is a list of hvi values for each file in that directory.

        # Print boxplot (one figure per directory).
        boxplot(
            bin_array_X[dir],
            hvi_list_of_lists,
            application_name,
            number_of_bins,
            xlabel,
            ylabel,
            str(dir + "/" + os.path.basename(dir) + "_boxplot" + ".pdf"),
        )

        # Print lineplot (only one figure comparing all the directories).
        for hvi_list in hvi_list_of_lists:
            hvi_list_array = np.array(hvi_list)
            y_data_mean[dir].append(hvi_list_array.mean())
            y_data_median[dir].append(np.median(hvi_list_array))
            y_data_min[dir].append(np.min(hvi_list_array))
            y_data_max[dir].append(np.max(hvi_list_array))
            low, up = sms.DescrStatsW(hvi_list_array).tconfint_mean()
            y_data_lower[dir].append(low)
            y_data_upper[dir].append(up)

        for bin_number, bin_value in enumerate(y_data_lower[dir]):
            if not math.isnan(bin_value) and bin_value < 0:
                y_data_lower[dir][bin_number] = 0
        for bin_number, bin_value in enumerate(y_data_upper[dir]):
            if not math.isnan(bin_value) and bin_value < 0:
                y_data_upper[dir][bin_number] = 0

        print_stats_on_a_txt(
            dir,
            str(dir + "/" + os.path.basename(dir) + "_stats" + ".txt"),
            bin_array_X,
            number_or_runs_in_bins,
            y_data_mean,
            y_data_median,
            y_data_min,
            y_data_max,
            y_data_lower,
            y_data_upper,
        )

    # Call the function to create plot
    lineplotCI(
        input_files,
        application_name,
        x_data=bin_array_X,
        y_data=y_data_mean,
        low_CI=y_data_lower,
        upper_CI=y_data_upper,
        xlabel=xlabel,
        ylabel=ylabel,
        title="Line plot with 95% confidence intervals",
        output_filename=output_hvi_file_name,
    )
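In isolation, the per-bin mean / 95% interval bookkeeping from the plotting loop above reduces to the following sketch (toy HVI values, illustrative bin labels):

import numpy as np
import statsmodels.stats.api as sms

# Toy data: for each time bin, one HVI value per repetition file.
each_bin = {0.5: [12.0, 10.5, 11.2], 1.0: [7.1, 6.4, 6.9], 1.5: [2.0, 2.4, 1.8]}

y_mean, y_lower, y_upper = [], [], []
for bin_label, hvi_values in each_bin.items():
    hvi_array = np.array(hvi_values)
    y_mean.append(hvi_array.mean())
    low, up = sms.DescrStatsW(hvi_array).tconfint_mean()
    y_lower.append(max(low, 0.0))  # clamp negative bounds to zero, as the plotting code does
    y_upper.append(max(up, 0.0))
print(y_mean, y_lower, y_upper)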
Example no. 16
df_control_group = data_control.copy()

df_testing_group.head()
df_control_group.head()

df_testing_group.shape
df_control_group.shape

# na checked
df_testing_group.isnull().sum()
df_control_group.isnull().sum()

# Confidence Interval

# Testing group
sms.DescrStatsW(df_testing_group["Purchase"]).tconfint_mean()

# Control group
sms.DescrStatsW(df_control_group["Purchase"]).tconfint_mean()

############################
# Testing of Assumptions
############################

# Assumptions of normality

# H0: The normality assumption holds.
# H1: The normality assumption does not hold.

test_stat, pvalue = shapiro(df_testing_group["Purchase"])
print('Test statistics = %.4f, p-value = %.4f' % (test_stat, pvalue))
Example no. 17
def believeCase():
    df = readDatasetNinformantions()
    sms.DescrStatsW(df["Hava Sıcaklığı ( °C )"]).tconfint_mean()
    return df["Hava Sıcaklığı ( °C )"].mean()
Example no. 18
def sim_stats(series):
    stats = {
        'mean': series.mean(),
        'ci': series.mean() - sms.DescrStatsW(series).tconfint_mean(0.05)[0]
    }
    return stats
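sim_stats reports the interval as a half-width around the mean; a tiny usage sketch, assuming sim_stats above and its statsmodels.stats.api import (sms) are in scope:

import numpy as np
import pandas as pd

series = pd.Series(np.random.default_rng(0).normal(5.0, 1.0, size=40))
print(sim_stats(series))  # {'mean': ..., 'ci': half-width of the 95% interval}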
Example no. 19
def two_population(a,
                   b,
                   alpha=.05,
                   consistency='equal',
                   option='right',
                   show_table=False,
                   stages=[1, 2, 3],
                   show=True,
                   precision=4,
                   matched_pairs=False):
    """
+ [First stage]: F Statistics - consistency: equal, left (1 is more consistent than 2), right (2 is more consistent than 1)
+ [Second stage]: t Test
+ [Third stage]: Confidence Interval

Will return a result_dict regardless of stages.
    """
    opt = option.lower()[0]
    results = ""

    const = consistency.lower()[0]

    result_dict = dict()

    df_1 = len(a) - 1
    df_2 = len(b) - 1
    if 1 in stages:

        varall = [stats.describe(a).variance, stats.describe(b).variance]
        f_value = varall[0] / varall[1]

        result_dict['varall'] = varall
        result_dict['f_value'] = f_value

        ptmp = stats.f.cdf(f_value, df_1, df_2)

        if const == 'e':
            if ptmp > 0.5:
                ptmp = 1 - ptmp
            p_value = ptmp * 2
            rej_upper = stats.f.ppf(1 - alpha / 2, df_1, df_2)
            rej_lower = stats.f.ppf(alpha / 2, df_1, df_2)
            result_dict['f_rej_upper'] = rej_upper
            result_dict['f_rej_lower'] = rej_lower
            if f_value < rej_lower or f_value > rej_upper:
                flag = True
            else:
                flag = False
            text = 'unequal variances'
        else:
            rej_upper = stats.f.ppf(1 - alpha, df_1, df_2)
            rej_lower = stats.f.ppf(alpha, df_1, df_2)
            if const == 'r':
                result_dict['f_rej_upper'] = rej_upper
                p_value = 1 - ptmp
                if f_value > rej_upper:
                    flag = True
                else:
                    flag = False
                text = 'σ_1/σ_2 > 1'
            else:
                result_dict['f_rej_lower'] = rej_lower
                p_value = ptmp
                if f_value < rej_lower:
                    flag = True
                else:
                    flag = False
                text = 'σ_1/σ_2 < 1'

        result_dict['p_value'] = p_value

        results = f"""          F Statistics
===================================
F statistic = {f_value:.{precision}f}
p-value = {p_value:.{precision}f} ({inter_p_value(p_value)})
Reject H_0 ({text}) → {flag}
"""
    if 2 in stages:
        if matched_pairs:
            samp_diff = a - b
            nobs = samp_diff.shape[0]
            df = nobs - 1

            tmpdesc = stats.describe(samp_diff)
            t_value = tmpdesc.mean / (tmpdesc.variance**0.5) * (nobs**0.5)

            # p-values
            ptmp = stats.t.cdf(t_value, df)
            if opt == 'r':
                text = 'one-tail'
                tcv = stats.t.ppf(1 - alpha, df=df)
                p_value = 1 - ptmp
            elif opt == 'l':
                text = 'one-tail'
                p_value = ptmp
                tcv = stats.t.ppf(alpha, df=df)
            else:
                text = 'two-tail'
                tcv = stats.t.ppf(1 - alpha / 2, df=df)
                if ptmp > 0.5:
                    ptmp = 1 - ptmp
                p_value = ptmp * 2

            flag = p_value < alpha
            results += f"""
           t Test      
===================================
t (Observed value) = {t_value:.{precision}f}
p-value ({text}) = {p_value:.{precision}f} ({inter_p_value(p_value)})
t (Critical, ({text})) = {tcv:.{precision}f}
DF = {(df):.{precision}f}
Reject H_0 → {flag}
"""
            result_dict['t_p_value'] = p_value
            result_dict['t_critical_value'] = tcv
            result_dict['t_observed_value'] = t_value
            t_alpha = stats.t.ppf(1 - alpha / 2, df)
            std_xbar = (tmpdesc.variance / nobs)**0.5
            LCL = tmpdesc.mean - t_alpha * std_xbar
            UCL = tmpdesc.mean + t_alpha * std_xbar
            con_coef = 1 - alpha
            conf_interval = [LCL, UCL]
            result_dict['conf_interval'] = conf_interval
            results += f"""
           Confidence Interval      
===================================
{con_coef * 100:.1f}% Confidence Interval: [{LCL:.{precision}f}, {UCL:.{precision}f}]
"""
        else:
            if flag:  # True == unequal variance
                ttest_result = stats.ttest_ind(a, b, equal_var=False)
                t_summary = list(ttest_result)
                t_critical_two = stats.t.ppf(1 - alpha / 2, df=(df_1 + df_2))
                if opt == 'r':
                    t_critical_one = stats.t.ppf(1 - alpha, df=(df_1 + df_2))
                    result_dict['t_critical_one'] = t_critical_one
                elif opt == 'l':
                    t_critical_one = stats.t.ppf(alpha, df=(df_1 + df_2))
                    result_dict['t_critical_one'] = t_critical_one

                if opt == 't':
                    flag = t_summary[1] < alpha
                    result_dict['t_critical_two'] = t_critical_two
                    result_dict['t_observed_value'] = t_summary[0]
                    result_dict['t_p_value'] = t_summary[1]
                    result_dict['df'] = df_1 + df_2
                    results += f"""
           t Test      
===================================
t (Observed value) = {t_summary[0]:.{precision}f}
p-value (two-tail) = {t_summary[1]:.{precision}f} ({inter_p_value(t_summary[1])})
t (Critical, two-tail) = {t_critical_two:.{precision}f}
DF = {(df_1 + df_2):.{precision}f}
Reject H_0 → {flag}
"""
                else:
                    flag = t_summary[1] / 2 < alpha
                    result_dict['t_observed_value'] = t_summary[0]
                    result_dict['t_p_value'] = t_summary[1] / 2
                    result_dict['df'] = df_1 + df_2
                    results += f"""
           t Test      
===================================
t (Observed value) = {t_summary[0]:.{precision}f}
p-value (one-tail) = {(t_summary[1] / 2):.{precision}f} ({inter_p_value(t_summary[1] / 2)})
t (Critical, one-tail) = {t_critical_one:.{precision}f}
DF = {(df_1 + df_2):.{precision}f}
Reject H_0 → {flag}
"""
                if 3 in stages:
                    cm_result = sms.CompareMeans(sms.DescrStatsW(a),
                                                 sms.DescrStatsW(b))
                    conf_table = cm_result.summary(usevar='unequal',
                                                   alpha=alpha)
                    conf_interval = list(
                        map(float,
                            conf_table.as_text().split('\n')[4].split()[6:]))
                    con_coef = 1 - alpha

                    # record result
                    result_dict['conf_interval'] = conf_interval
                    results += f"""
           Confidence Interval      
===================================
{con_coef * 100:.1f}% Confidence Interval: [{conf_interval[0]:.{precision}f}, {conf_interval[1]:.{precision}f}]
"""
            else:
                ttest_result = stats.ttest_ind(a, b, equal_var=True)
                t_summary = list(ttest_result)
                t_critical_two = stats.t.ppf(1 - alpha / 2, df=(df_1 + df_2))
                if opt == 'r':
                    t_critical_one = stats.t.ppf(1 - alpha, df=(df_1 + df_2))
                    result_dict['t_critical_one'] = t_critical_one
                elif opt == 'l':
                    t_critical_one = stats.t.ppf(alpha, df=(df_1 + df_2))
                    result_dict['t_critical_one'] = t_critical_one

                if opt == 't':
                    flag = t_summary[1] < alpha
                    result_dict['t_critical_two'] = t_critical_two
                    result_dict['t_observed_value'] = t_summary[0]
                    result_dict['t_p_value'] = t_summary[1]
                    result_dict['df'] = df_1 + df_2

                    results += f"""
           t Test      
===================================
t (Observed value) = {t_summary[0]:.{precision}f}
p-value (two-tail) = {t_summary[1]:.{precision}f} ({inter_p_value(t_summary[1])})
t (Critical, two-tail) = {t_critical_two:.{precision}f}
DF = {(df_1 + df_2):.{precision}f}
Reject H_0 → {flag}
"""
                else:
                    flag = t_summary[1] / 2 < alpha
                    result_dict['t_observed_value'] = t_summary[0]
                    result_dict['t_p_value'] = t_summary[1] / 2
                    result_dict['df'] = df_1 + df_2

                    results += f"""
           t Test      
===================================
t (Observed value) = {t_summary[0]:.{precision}f}
p-value (one-tail) = {(t_summary[1] / 2):.{precision}f} ({inter_p_value(t_summary[1] / 2)})
t (Critical, one-tail) = {t_critical_one:.{precision}f}
DF = {(df_1 + df_2):.{precision}f}
Reject H_0 → {flag}
"""
                if 3 in stages:
                    cm_result = sms.CompareMeans(sms.DescrStatsW(a),
                                                 sms.DescrStatsW(b))
                    conf_table = cm_result.summary(usevar='pooled',
                                                   alpha=alpha)
                    conf_interval = list(
                        map(float,
                            conf_table.as_text().split('\n')[4].split()[6:]))
                    # record result
                    result_dict['conf_interval'] = conf_interval
                    con_coef = 1 - alpha
                    results += f"""
           Confidence Interval      
===================================
{con_coef * 100:.1f}% Confidence Interval: [{conf_interval[0]:.{precision}f}, {conf_interval[1]:.{precision}f}]
"""

            if show_table == True and 3 in stages:
                results += f"""{conf_table.as_text()}"""

    if show == True:
        print(results)
    return result_dict
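The third stage above recovers the interval for the difference in means by parsing the text of CompareMeans.summary(); a more direct route, shown here as a standalone sketch on synthetic samples, is tconfint_diff with the same usevar choice:

import numpy as np
import statsmodels.stats.api as sms

rng = np.random.default_rng(0)
a = rng.normal(10.0, 2.0, size=40)
b = rng.normal(9.0, 2.5, size=35)

cm = sms.CompareMeans(sms.DescrStatsW(a), sms.DescrStatsW(b))
print(cm.tconfint_diff(alpha=0.05, usevar='unequal'))  # Welch-style CI for mean(a) - mean(b)
print(cm.tconfint_diff(alpha=0.05, usevar='pooled'))   # equal-variance (pooled) CI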
Example no. 20
def believeCase():
    df = checkEmptyValues()
    sms.DescrStatsW(df["Üretim"]).tconfint_mean()
    df["Üretim"].mean()
Example no. 21

lower_bound, upper_bound = CI_printout(df['temperature'], 0.95, 'z')

print('Frequentist approach:')
print('95% confidence interval range: [{:.3f}, {:.3f}]'.format(
    lower_bound, upper_bound))

#%% [markdown]
# ==> So, we consider the normal temperature to be in the range from 98.123 to 98.375 with a confidence level of 95%. Any value of temperature beyond this range can be considered abnormal.
#%% [markdown]
# Please note that we also can use the provided library as follows.

#%%
import statsmodels.stats.api as sms
sms.DescrStatsW(df['temperature']).tconfint_mean()

#%% [markdown]
# ### 6. Is there a significant difference between males and females in normal temperature?
#%% [markdown]
# First, we find the mean normal temperature for both males and females.

#%%
means = df.groupby("gender")["temperature"].mean()
means

#%% [markdown]
# It seems that the mean female temperature is slightly higher than that for males.
# We visualise the distribution of temperatures for both males and females.

#%%
Example no. 22
motivations = np.load('data/cleaned/motivation.npy')

#Plot
x = [1, 2, 3]
count = 0
for segment in segments:
    count += 1
    y = []
    ci1 = []
    ci2 = []
    yerr = []
    for measurement_index in range(0, 3):
        segment_score = motivations[motivation_index, segment, measurement_index]
        mean_segment_score = np.mean(segment_score)      
        y.append(mean_segment_score)
        ci = sms.DescrStatsW(segment_score).tconfint_mean()
        ci1.append(mean_segment_score - ci[0])
        ci2.append(ci[1] - mean_segment_score)
    
    yerr = [ci1, ci2]
    plt.errorbar(x, y, yerr = yerr)
    plt.title('Segment ' + str(count) + ': mean motivation with 95% confidence intervals')
    
    plt.xlabel('Time')
    plt.xlim([0, 4])
    plt.xticks(x, ['pre', 'half way', 'post'], size=8)
    
    plt.ylabel('Motivation score')
    plt.ylim([1, 7])
    
    plt.grid()
Example no. 23
treatment_df["active_mins"].describe()
control_df["active_mins"].describe()
#note that mean active_mins is higher in the experimental group than in the control group

#conduct t-test
stats.ttest_ind(treatment_df["active_mins"],
                control_df["active_mins"],
                equal_var=False)
#output: t-statistic = 30.686846737487123 and p-value < .05

#now we're going to find the 95% confidence interval
x1 = treatment_df["active_mins"]
x2 = control_df["active_mins"]
#going to use statsmodels
cm = sms.CompareMeans(sms.DescrStatsW(x1), sms.DescrStatsW(x2))
print(cm.tconfint_diff(usevar='unequal'))

####################################################################################

#PAGE 4
#read in the dataframes wrangled in R
ctrl_df_pg4 = pd.read_csv("/Users/ankushbharadwaj/Desktop/ctrl_df_pg4.csv")
exp_df_pg4 = pd.read_csv("/Users/ankushbharadwaj/Desktop/exp_df_pg4.csv")

#STEP 1: REMOVE OUTLIERS
#going to remove outliers more than 3 standard deviations from mean

#get standard deviation of active minutes per user per day for each group
std_exp = np.std(exp_df_pg4["active_mins"])
std_ctrl = np.std(ctrl_df_pg4["active_mins"])
Example no. 24
    def getPrecisionRecallFalseRate(self, resultSimulation, kMAX, plot=False, output='dict'):

        """
        The function receives the detections/false alarms of all the nodes
        and computes the Precision/Recall for all the K.
        """

        precisionConfInterval = {}
        recallConfInterval = {}
        falseConfInterval = {}

        for k in range(1, kMAX + 1):
            precisionConfInterval[k] = []
            recallConfInterval[k] = []
            falseConfInterval[k] = []

        for key, value in resultSimulation.items():

            detections = value['detections']
            falsePositives = value['falsePositives']

            for k in range(1, kMAX + 1):
                
                if detections[k]['events'] != 0:
                    recallConfInterval[k].append(detections[k]['detection']/float(detections[k]['events']))
                            
                if (detections[k]['detection'])!= 0 and falsePositives != 0:
                    precisionConfInterval[k].append((detections[k]['detection'])/(float(detections[k]['detection']) + falsePositives[k] ))    
                
                falseConfInterval[k].append(falsePositives[k]/float(len(self.truth.clears)))

        errorRecall = np.ndarray(kMAX)
        errorPrecision = np.ndarray(kMAX)
        errorFalse = np.ndarray(kMAX)

        meanRecall = np.ndarray(kMAX)
        meanPrecision = np.ndarray(kMAX)
        meanFalseRate = np.ndarray(kMAX)

        for k in range(1, kMAX + 1):
            a = recallConfInterval[k]
            meanRecall[k-1] = np.mean(a)
            interval = sms.DescrStatsW(a).tconfint_mean()    
            errorRecall[k-1] = interval[1] - np.mean(a)
            
            a = precisionConfInterval[k]
            meanPrecision[k-1] = np.mean(a)
            interval = sms.DescrStatsW(a).tconfint_mean()  
            errorPrecision[k-1] = interval[1] - np.mean(a)
            
            a = falseConfInterval[k]
            meanFalseRate[k-1] = np.mean(a)
            interval = sms.DescrStatsW(a).tconfint_mean()    
            errorFalse[k-1] = interval[1] - np.mean(a)

        if plot:
            visual = Visualization()
            visual.barRecallPrecisionvsK2(meanRecall, meanFalseRate, meanPrecision, errorRecall, errorPrecision, errorFalse)

        if output == 'dict':

            if meanFalseRate[0] > 1:
                meanFalseRate[0] = 1

            result = {

                'Precision': np.nan_to_num(meanPrecision).tolist(),
                'errPrecision': errorPrecision.tolist(),
                'Recall': np.nan_to_num(meanRecall).tolist(),
                'errRecall': errorRecall.tolist(),
                'FalseRate': np.nan_to_num(meanFalseRate).tolist(),
                'errFalseRate': errorFalse.tolist()
            }

            return result

        elif output == 'tuple':

            return meanPrecision, errorPrecision, meanRecall, errorRecall, meanFalseRate, errorFalse

        else:
            return 
Example no. 25
from scipy.stats import chi2_contingency
chi_test = chi2_contingency(cross_tab)
print(chi_test)

#Boxplot
import seaborn as sns
import matplotlib.pyplot as plt

plt.boxplot(newprice, labels=['New price'], patch_artist=True)
plt.boxplot(nuclee, labels=['Nuclee '], patch_artist=True)
plt.boxplot(rating, labels=['Rating'], patch_artist=True)

#Confidence interval estimation
import statsmodels.stats.api as sms
print('Confidence Interval', sms.DescrStatsW(newprice).tconfint_mean())
print('Confidence Interval', sms.DescrStatsW(nuclee).tconfint_mean())
print('Confidence Interval', sms.DescrStatsW(rating).tconfint_mean())

#Testing the means
#Simple Student test
from scipy import stats
print(stats.ttest_1samp(newprice, 3000))
print(stats.ttest_1samp(nuclee, 2))
print(stats.ttest_1samp(rating, 5))

#Test 2 means
newprice_i7 = baza.loc[baza['procesor'] == 'i7']
newprice_i5 = baza.loc[baza['procesor'] == 'i5']
print(stats.ttest_ind(newprice_i7.newprice, newprice_i5.newprice))
Example no. 26
    def getDelay(self, resultSimulation, kMAX, plot=False, samplingRate = 5):

        """
        The function receives the results previously obtained and computes
        the average detection delay for all the K and with respect to the 
        distance from the root node (how fare is the event).
        """

        depth = 3

        delay0 = {}
        delay1 = {}
        delay2 = {}

        for k in range(1, kMAX + 1):
            delay0[k] = []
            delay1[k] = []
            delay2[k] = []

        for key, value in resultSimulation.items():

            detections = value['detections']
                        
            for k in range(1, kMAX + 1):

                delays = detections[k]['delays']
                        
                for delay in delays:
                                        
                    if delay['position'] == 0:
                        delay0[k].append(delay['delay'])
                
                    if delay['position'] == 1:
                        delay1[k].append(delay['delay'])
                
                    if delay['position'] == 2:
                        delay2[k].append(delay['delay'])
                                 
        delayConfInterval =  {'hop0': np.ndarray(kMAX),
                              'hop1': np.ndarray(kMAX),
                              'hop2': np.ndarray(kMAX)}

        delaymeansConfInterval = np.ndarray((kMAX,depth))  

        for k in range(1, kMAX + 1):
            
            a = delay0[k]
            b = delay1[k]
            c = delay2[k]
          
            delaymeansConfInterval[k-1][0] = np.mean(a)
            delaymeansConfInterval[k-1][1] = np.mean(b)
            delaymeansConfInterval[k-1][2] = np.mean(c)
            
            interval = sms.DescrStatsW(a).tconfint_mean()    
            delayConfInterval['hop0'][k-1] = interval[1] - np.mean(a)
            
            interval = sms.DescrStatsW(b).tconfint_mean()    
            delayConfInterval['hop1'][k-1] = interval[1] - np.mean(b)
            
            interval = sms.DescrStatsW(c).tconfint_mean()    
            delayConfInterval['hop2'][k-1] = interval[1] - np.mean(c)

        if plot:
            visual = Visualization()
            visual.plotBarDelay(delaymeansConfInterval, delayConfInterval, trunc='yes')

        return delaymeansConfInterval, delayConfInterval
Example no. 27
df = sns.load_dataset("tips")
df.describe().T

df.head()
df["sex"].value_counts()
df[["tip", "total_bill"]].corr()

############################
# Confidence Interval
############################

import statsmodels.stats.api as sms
df = sns.load_dataset("tips")
df.describe().T

sms.DescrStatsW(df["total_bill"]).tconfint_mean()
sms.DescrStatsW(df["tip"]).tconfint_mean()

df = pd.read_csv("datasets/titanic.csv")
df.describe().T
sms.DescrStatsW(df["Age"].dropna()).tconfint_mean()
sms.DescrStatsW(df["Fare"].dropna()).tconfint_mean()

df_ = pd.read_excel("datasets/online_retail_II.xlsx",
                    sheet_name="Year 2010-2011")

df = df_.copy()

sms.DescrStatsW(df["Quantity"].dropna()).tconfint_mean()
sms.DescrStatsW(df["Price"].dropna()).tconfint_mean()
Example no. 28
    sorted_info = sorted(info, key=lambda tup: tup[2])

    visualize_data = None
    with open(os.path.join(output_dir, "feature_qvalues_and_qt_means.txt"),
              'w+') as file_handle:
        column_names = [
            "feature", "pvalue", "qvalue", "signif_mean", "nonsignif_mean",
            "signif_lo_interval", "signif_up_interval",
            "nonsignif_lo_interval", "nonsignif_up_interval"
        ]
        file_handle.write("{0}\n".format("\t".join(column_names)))
        for index, (feat, pval, qval) in enumerate(sorted_info):
            sig_scores = qt_sig_scores_matrix[:, index]
            nonsig_scores = qt_nonsig_scores_matrix[:, index]

            sig_group_descr = sms.DescrStatsW(sig_scores)
            nonsig_group_descr = sms.DescrStatsW(nonsig_scores)

            sig_lower, sig_upper = sig_group_descr.tconfint_mean()
            nonsig_lower, nonsig_upper = nonsig_group_descr.tconfint_mean()

            sig_mean = sig_group_descr.mean
            nonsig_mean = nonsig_group_descr.mean

            values = [
                feat, pval, qval, sig_mean, nonsig_mean, sig_lower, sig_upper,
                nonsig_lower, nonsig_upper
            ]
            if visualize_data is None:
                # visualize the feature with the smallest q-value
                visualize_data = values[:1] + values[2:]
Example no. 29
def CI_ttest(X1, X2):
	cm = sms.CompareMeans(sms.DescrStatsW(X1), sms.DescrStatsW(X2))
	out = cm.tconfint_diff(usevar='unequal')
	return '[%.2f, %.2f]'%(out[0], out[1])
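A short usage sketch, assuming CI_ttest above and its statsmodels.stats.api import (sms) are in scope:

import numpy as np

rng = np.random.default_rng(0)
X1 = rng.normal(1.0, 0.5, size=50)
X2 = rng.normal(0.8, 0.5, size=50)
print(CI_ttest(X1, X2))  # prints a '[lower, upper]' string for the difference in means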
Example no. 30
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats
from scipy.stats import ttest_ind
import statsmodels.stats.api as sms
GE = pd.read_csv('C:/Users/anivia/Desktop/geDJ.txt',
                 sep="\s+",
                 header=None,
                 names=['date', 'open', 'high', 'low', 'close', 'vol'])
SP = pd.read_csv(
    'https://www.math.ust.hk/~macwyu/MAFS5110_2018-2019/MAFS5110_2018-2019/Chapter_1/sp500.txt',
    sep="\s+")
logreturn_GE = np.diff(np.log(np.array(GE["close"])))
logreturn_sp500 = np.diff(np.log(np.array(SP["close"])))
da2 = pd.concat([pd.DataFrame(logreturn_GE),
                 pd.DataFrame(logreturn_sp500)],
                axis=1)
#da2.columns=['date','open','high','low','close','vol','logreturn_sp500']
#da2.index=da.index[1:]
da2.columns = ["logreturn_GE", "logreturn_sp500"]
da2.boxplot(column=['logreturn_GE', 'logreturn_sp500'])
plt.show()
print(stats.mood(logreturn_sp500, logreturn_GE))
print('H0 can be rejected, the variances are significantly different')
print(ttest_ind(logreturn_sp500, logreturn_GE, equal_var=True))
print('')
cm = sms.CompareMeans(sms.DescrStatsW(logreturn_sp500),
                      sms.DescrStatsW(logreturn_GE))
print(cm.tconfint_diff())