Example #1
def get_pos_stats(df, nIdx, cutoff=1, expt=1, letterorder=['C', 'A', 'T', 'G']):
    # Get row of interest
    data = df[[c for c in df.columns if not c == 'sequence']].iloc[nIdx]
    nt = df['sequence'].iloc[nIdx]
    total_n = float(data.sum())
    
    # Set up dataframe
    ntCols = ['N->'+c for c in letterorder] + ['N->!N']
    outsCols = ['ct', '%', '%lb', '%ub']
    cols = [x+'_'+out for x in ntCols for out in outsCols] + ['total_n', 'sequence']
    out_df = pd.DataFrame(index=[expt], columns=cols)
    
    out_df['sequence'] = nt
    out_df['total_n'] = total_n   
    
    # Do individual nucleotide stats
    for n in letterorder:
        ct = data[nt+'->'+n]
        rate = ct / total_n
        lb, ub = proportion.proportion_confint(ct, total_n, method='jeffrey')
        
        out_df['N->'+n+'_ct'] = ct
        out_df['N->'+n+'_%'] = rate
        out_df['N->'+n+'_%lb'] = lb
        out_df['N->'+n+'_%ub'] = ub
    
    # Do aggregate misincorporation stats
    misinc_n = total_n - out_df['N->%c_ct' % nt]
    lb, ub = proportion.proportion_confint(misinc_n, total_n, method='jeffrey')
    out_df['N->!N_ct'] = misinc_n
    out_df['N->!N_%'] = misinc_n / total_n
    out_df['N->!N_%lb'] = lb
    out_df['N->!N_%ub'] = ub
    
    return out_df
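A minimal usage sketch with hypothetical counts (assuming pandas is imported as pd and statsmodels.stats.proportion as proportion, as the snippet implies): one position whose reference base is 'A'.

df = pd.DataFrame({'A->A': [950], 'A->C': [20], 'A->T': [20], 'A->G': [10],
                   'sequence': ['A']})
stats = get_pos_stats(df, nIdx=0)
print(stats[['N->!N_ct', 'N->!N_%', 'N->!N_%lb', 'N->!N_%ub']])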
Example #2
def test_binom_test():
    #> bt = binom.test(51,235,(1/6),alternative="less")
    #> cat_items(bt, "binom_test_less.")
    binom_test_less = Holder()
    binom_test_less.statistic = 51
    binom_test_less.parameter = 235
    binom_test_less.p_value = 0.982022657605858
    binom_test_less.conf_int = [0, 0.2659460862574313]
    binom_test_less.estimate = 0.2170212765957447
    binom_test_less.null_value = 1. / 6
    binom_test_less.alternative = 'less'
    binom_test_less.method = 'Exact binomial test'
    binom_test_less.data_name = '51 and 235'

    #> bt = binom.test(51,235,(1/6),alternative="greater")
    #> cat_items(bt, "binom_test_greater.")
    binom_test_greater = Holder()
    binom_test_greater.statistic = 51
    binom_test_greater.parameter = 235
    binom_test_greater.p_value = 0.02654424571169085
    binom_test_greater.conf_int = [0.1735252778065201, 1]
    binom_test_greater.estimate = 0.2170212765957447
    binom_test_greater.null_value = 1. / 6
    binom_test_greater.alternative = 'greater'
    binom_test_greater.method = 'Exact binomial test'
    binom_test_greater.data_name = '51 and 235'

    #> bt = binom.test(51,235,(1/6),alternative="t")
    #> cat_items(bt, "binom_test_2sided.")
    binom_test_2sided = Holder()
    binom_test_2sided.statistic = 51
    binom_test_2sided.parameter = 235
    binom_test_2sided.p_value = 0.0437479701823997
    binom_test_2sided.conf_int = [0.1660633298083073, 0.2752683640289254]
    binom_test_2sided.estimate = 0.2170212765957447
    binom_test_2sided.null_value = 1. / 6
    binom_test_2sided.alternative = 'two.sided'
    binom_test_2sided.method = 'Exact binomial test'
    binom_test_2sided.data_name = '51 and 235'

    alltests = [('larger', binom_test_greater),
                ('smaller', binom_test_less),
                ('two-sided', binom_test_2sided)]

    for alt, res0 in alltests:
        # only p-value is returned
        res = smprop.binom_test(51, 235, prop=1. / 6, alternative=alt)
        #assert_almost_equal(res[0], res0.statistic)
        assert_almost_equal(res, res0.p_value, decimal=13)

    # R binom_test returns Clopper-Pearson confint
    ci_2s = smprop.proportion_confint(51, 235, alpha=0.05, method='beta')
    ci_low, ci_upp = smprop.proportion_confint(51, 235, alpha=0.1,
                                               method='beta')
    assert_almost_equal(ci_2s, binom_test_2sided.conf_int, decimal=13)
    assert_almost_equal(ci_upp, binom_test_less.conf_int[1], decimal=13)
    assert_almost_equal(ci_low, binom_test_greater.conf_int[0], decimal=13)
Example #3
def evaluate(frame, subset='val'):
    """
    Evaluate a DataFrame containing term vectors on the Story Cloze Test:
    given the text of a story, choose which of two candidate endings is
    more plausible by cosine similarity. Use a
    VectorSpaceWrapper to fill missing vocabulary from ConceptNet.

    Return a Series containing these labeled results.
    """
    # Make subset names consistent with other datasets
    if subset == 'dev':
        subset = 'val'
    elif subset == 'all':
        # for the final evaluation, use just the test data
        subset = 'test'
    filename = get_support_data_filename('story-cloze/cloze_test_spring2016_%s.tsv' % subset)
    vectors = VectorSpaceWrapper(frame=frame)
    total = 0
    correct = 0
    for sentences, answers in read_cloze(filename):
        text = ' '.join(sentences)
        right_answer, wrong_answer = answers
        probe_vec = vectors.text_to_vector('en', text)
        right_vec = vectors.text_to_vector('en', right_answer)
        wrong_vec = vectors.text_to_vector('en', wrong_answer)

        right_sim = cosine_similarity(probe_vec, right_vec)
        wrong_sim = cosine_similarity(probe_vec, wrong_vec)
        if right_sim > wrong_sim:
            correct += 1
        total += 1
        # print("%+4.2f %s / %s / %s" % (right_sim - wrong_sim, text, right_answer, wrong_answer))
    low, high = proportion_confint(correct, total)
    return pd.Series([correct / total, low, high], index=['acc', 'low', 'high'])
Example #4
def eval_analogies(frame):
    filename = get_support_data_filename('google-analogies/questions-words.txt')
    quads = read_google_analogies(filename)
    vocab = [
        standardized_uri('en', word)
        for word in wordfreq.top_n_list('en', 200000)
    ]
    wrap = VectorSpaceWrapper(frame=frame)
    vecs = np.vstack([wrap.get_vector(word) for word in vocab])
    tframe = pd.DataFrame(vecs, index=vocab)
    total = 0
    correct = 0
    seen_mistakes = set()
    for quad in quads:
        prompt = quad[:3]
        answer = quad[3]
        vector = analogy_func(frame, *prompt)
        similar = similar_to_vec(tframe, vector)
        result = None
        for match in similar.index:
            if match not in prompt:
                result = match
                break
        if result == answer:
            correct += 1
        else:
            if result not in seen_mistakes:
                print(
                    "%s : %s :: %s : [%s] (should be %s)"
                    % (quad[0], quad[1], quad[2], result, answer)
                    )
                seen_mistakes.add(result)
        total += 1
    low, high = proportion_confint(correct, total)
    return pd.Series([correct / total, low, high], index=['acc', 'low', 'high'])
Example #5
def print_survival_rate(df):
    for domain_path, domain_group in df.groupby(["domainPath"]):
        survival_results = DataFrame(columns="actionDuration algorithmName survival lbound rbound".split())
        domain_name = re.search("[^/]+$", domain_path).group(0).rstrip(".track")

        for fields, action_group in domain_group.groupby(['algorithmName', 'actionDuration']):
            total_trials = len(action_group)
            error_experiments = action_group[action_group["errorMessage"].notnull()]

            deaths = len(error_experiments[error_experiments["errorMessage"] != "Timeout"])
            timeouts = len(error_experiments) - deaths
            successes = len(action_group[~action_group["errorMessage"].notnull()])

            survival_confint = proportion_confint(successes, total_trials, 0.05)
            survival_rate = (successes / (successes + deaths))
            survival_results = add_row(survival_results,
                                      [fields[1], fields[0], survival_rate, survival_confint[0], survival_confint[1]])

        fig, ax = plt.subplots()
        errors = []
        for alg, alg_group in survival_results.groupby('algorithmName'):
            errors.append([(alg_group['lbound'] - alg_group['survival']).values,
                           (alg_group['rbound'].values - alg_group['survival']).values])
        errors = np.abs(errors)
        print(errors)
        survival = survival_results.pivot(index='actionDuration', columns='algorithmName', values='survival')

        survival.plot(ax=ax, yerr=errors,
                      xlim=[0, 7000], ylim=[0, 1.0],
                      capsize=4, capthick=1, ecolor='black', cmap=plt.get_cmap("rainbow"), elinewidth=1)

        plt.savefig('test.png', format='png')
Example #6
def test_binom_tost():
    # consistency check between two different implementations;
    # proportion_confint is tested against R,
    # no reference case from another package is available
    ci = smprop.proportion_confint(10, 20, method='beta', alpha=0.1)
    bt = smprop.binom_tost(10, 20, *ci)
    assert_almost_equal(bt, [0.05] * 3, decimal=12)

    ci = smprop.proportion_confint(5, 20, method='beta', alpha=0.1)
    bt = smprop.binom_tost(5, 20, *ci)
    assert_almost_equal(bt, [0.05] * 3, decimal=12)

    # vectorized, TODO: observed proportion = 0 returns nan
    ci = smprop.proportion_confint(np.arange(1, 20), 20, method='beta',
                                   alpha=0.05)
    bt = smprop.binom_tost(np.arange(1, 20), 20, *ci)
    bt = np.asarray(bt)
    assert_almost_equal(bt, 0.025 * np.ones(bt.shape), decimal=12)
Example #7
    def test_confidence_interval_estimation(self):
        if "ci" not in self.config["modes"]:
            print("Skipping CI")
            return
        runner = SingleProcessExperimentRunner()
        sample_length = self.config["sample_length"]
        samples = self.config["samples"]
        alpha = self.config["alpha"]
        method = "agresti_coull"
        estimation_tolerance = 0.1

        confidence_intervals = []
        all_successes = 0
        report_lines = []
        """:type : list[dict]"""
        fname = "smctest02_ci_{}.csv".format(datetime.now().strftime("%Y%m%d-%H_%M_%S_%f"))
        with open(fname, "w") as f:
            f.write("I;SUCCESSES;TRIALS\n")
            f.flush()
            for i in range(0, samples):
                _, res, trial_infos = runner.run_trials(self.experiment,
                                                        number_of_trials=sample_length,
                                                        max_retrials=0)
                print(trial_infos)
                self.assertEqual(sample_length, len(res))
                self.assertEqual(sample_length, len(trial_infos))
                successes = sum(res)
                all_successes += successes
                ci_low, ci_up = proportion.proportion_confint(successes, len(res), alpha=alpha,
                                                              method=method)
                confidence_intervals.append((ci_low, ci_up))
                line = dict(i=i+1, successes=successes, trials=len(res))
                f.write("{i};{successes};{trials}\n".format(**line))
                f.flush()
                print("Run #{}: {} successes, CI: [{}..{}]".format(i + 1, successes, ci_low, ci_up))
                # self.experiment.world.printState()

        estimated_prob = all_successes / (samples * sample_length)

        real_prob = self.calc_real_prob()

        print("estimated probability: {}".format(estimated_prob))
        print("real probability: {}".format(real_prob))
        interval_hit = 0
        for cl, cu in confidence_intervals:
            if cl <= real_prob <= cu:
                interval_hit += 1
        interval_hit_ratio = interval_hit / len(confidence_intervals)
        print("interval hits: {} of {} = {} %".format(interval_hit, len(confidence_intervals),
                                                      interval_hit_ratio * 100.0))

        self.assertAlmostEqual(real_prob, estimated_prob, delta=estimation_tolerance)
        self.assertTrue(interval_hit_ratio >= (1.0 - alpha))
Example #8
def _woe_confint(n, cnt, q):
    """
    Computes a 95% confidence interval for WoE

    Parameters
    ----------
    n: number of delinquencies in the bucket
    cnt: number of observations in the bucket
    q: delinquency probability over the whole corpus
    """
    p_low, p_high = proportion_confint(n, cnt, method='normal')
    return _woe(p_low, q), _woe(p_high, q)
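The _woe helper is not shown in this snippet. A minimal sketch under the standard weight-of-evidence definition (log-odds of the bucket rate against the corpus rate) might look like the following; this is an assumption, not necessarily the original implementation:

import numpy as np

def _woe(p, q):
    # Assumed definition: log-odds difference between the bucket rate p
    # and the corpus-wide rate q.
    return np.log(p / (1 - p)) - np.log(q / (1 - q))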
Example #9
def print_survival_rate(df):
    for domain_path, domain_group in df.groupby(["domainPath"]):
        survival_results = DataFrame(
            columns="actionDuration algorithmName survival lbound rbound".
            split())
        domain_name = re.search("[^/]+$",
                                domain_path).group(0).rstrip(".track")

        for fields, action_group in domain_group.groupby(
            ['algorithmName', 'actionDuration']):
            total_trials = len(action_group)
            error_experiments = action_group[
                action_group["errorMessage"].notnull()]

            deaths = len(error_experiments[
                error_experiments["errorMessage"] != "Timeout"])
            timeouts = len(error_experiments) - deaths
            successes = len(
                action_group[~action_group["errorMessage"].notnull()])

            survival_confint = proportion_confint(successes, total_trials,
                                                  0.05)
            survival_rate = (successes / (successes + deaths))
            survival_results = add_row(survival_results, [
                fields[1], fields[0], survival_rate, survival_confint[0],
                survival_confint[1]
            ])

        fig, ax = plt.subplots()
        errors = []
        for alg, alg_group in survival_results.groupby('algorithmName'):
            errors.append([
                (alg_group['lbound'] - alg_group['survival']).values,
                (alg_group['rbound'].values - alg_group['survival']).values
            ])
        errors = np.abs(errors)
        print(errors)
        survival = survival_results.pivot(index='actionDuration',
                                          columns='algorithmName',
                                          values='survival')

        survival.plot(
            ax=ax,
            yerr=errors,
            # xlim=[0, 7000],
            ylim=[0, 1.0],
            capsize=4,
            capthick=1,
            ecolor='black',
            cmap=plt.get_cmap("rainbow"),
            elinewidth=1)

        plt.savefig('test.png', format='png')
Example #10
def main():
  parser = argparse.ArgumentParser(description='extract and combine the kmer stats of multiple files')
  parser.add_argument('alpha', type=float, nargs='?', default=0.05, help='alpha of confidence interval')
  args = parser.parse_args()

  for line in sys.stdin:
    fields = line.split()
    values = [int(v) for v in fields[-4:]]
    total = sum(values) * 1.0

    ci = proportion_confint(values[-1], total, args.alpha, method="wilson")
    print(line.rstrip('\n'), values[-1] / total, ci[0], ci[1])
Example #11
    def _lower_confidence_bound(self, NA: int, N: int, alpha: float) -> float:
        """ Returns a (1 - alpha) lower confidence bound on a bernoulli proportion.

        This function uses the Clopper-Pearson method.

        :param NA: the number of "successes"
        :param N: the number of total draws
        :param alpha: the significance level
        :return: a lower bound on the binomial proportion which holds true w.p at least (1 - alpha) over the samples
        """
        print(NA, N, alpha, "beta")
        return proportion_confint(NA, N, alpha=2 * alpha, method="beta")[0]
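The doubled alpha works because the lower endpoint of a two-sided Clopper-Pearson interval at level 2*alpha is exactly a one-sided (1 - alpha) lower bound. A quick check against the closed-form beta quantile, with hypothetical counts:

from scipy.stats import beta
from statsmodels.stats.proportion import proportion_confint

NA, N, alpha = 992, 1000, 0.001
lower = proportion_confint(NA, N, alpha=2 * alpha, method="beta")[0]
# closed-form Clopper-Pearson one-sided lower bound
assert abs(lower - beta.ppf(alpha, NA, N - NA + 1)) < 1e-12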
Example #12
def test_confint_proportion_ndim(method):
    # check that it works with 1-D, 2-D and pandas

    count = np.arange(6).reshape(2, 3)
    nobs = 10 * np.ones((2, 3))

    count_pd = pd.DataFrame(count)
    nobs_pd = pd.DataFrame(nobs)

    ci_arr = proportion_confint(count, nobs, alpha=0.05, method=method)
    ci_pd = proportion_confint(count_pd, nobs_pd, alpha=0.05,
                               method=method)
    assert_allclose(ci_arr, (ci_pd[0].values, ci_pd[1].values), rtol=1e-13)
    # spot checking one value
    ci12 = proportion_confint(count[1, 2], nobs[1, 2], alpha=0.05,
                              method=method)
    assert_allclose((ci_pd[0].values[1, 2], ci_pd[1].values[1, 2]), ci12,
                    rtol=1e-13)
    assert_allclose((ci_arr[0][1, 2], ci_arr[1][1, 2]), ci12, rtol=1e-13)

    # check that lists work as input
    ci_li = proportion_confint(count.tolist(), nobs.tolist(), alpha=0.05,
                               method=method)
    assert_allclose(ci_arr, (ci_li[0], ci_li[1]), rtol=1e-13)

    # check pandas Series, 1-D
    ci_pds = proportion_confint(count_pd.iloc[0], nobs_pd.iloc[0],
                                alpha=0.05, method=method)
    assert_allclose((ci_pds[0].values, ci_pds[1].values),
                    (ci_pd[0].values[0], ci_pd[1].values[0]), rtol=1e-13)

    # check scalar nobs, verifying one value
    ci_arr2 = proportion_confint(count, nobs[1, 2], alpha=0.05,
                                 method=method)
    assert_allclose((ci_arr2[0][1, 2], ci_arr[1][1, 2]), ci12, rtol=1e-13)
Example #13
def get_num_player_win_rate(mongo_database, n):
    query = [
        {
            '$group':
                {
                    '_id': {'agent': '$agent'},
                    'wins': {'$sum': f'${n}p.wins'},
                    'losses': {'$sum': f'${n}p.losses'}
                }
        }
    ]
    data = [
        {
            'Agent': doc['_id']['agent'], 'Number of Players': n, 'wins': doc['wins'], 'losses': doc['losses'],
            'Win Rate': round(float(doc['wins']) / (doc['wins'] + doc['losses']), 3),
            'error': round(sm.proportion_confint(count=doc['wins'], nobs=doc['wins'] + doc['losses'], alpha=0.05)[1]
                           - float(doc['wins']) / (doc['wins'] + doc['losses']), 3),
            'conf_int': sm.proportion_confint(count=doc['wins'], nobs=doc['wins'] + doc['losses'], alpha=0.05)
        }
        for doc in mongo_database.agent_summaries.aggregate(query)
    ]
    return pd.DataFrame(columns=['Agent', 'Number of Players', 'wins', 'losses', 'Win Rate', 'error', 'conf_int'], data=data)
Example #14
def ci_for_df(odf, ci_method, ci_min=0.25):
    df = odf.copy()
    df['tot_reads'] = df.ip + df.input
    df['avg'] = df.ip / df.tot_reads.astype(float)
    assert ci_method in ('normal', 'agresti_coull',
                         'beta', 'wilson', 'jeffrey')
    df['ci_low'], df['ci_high'] = proportion_confint(df.ip, df.tot_reads,
                                                     method=ci_method)
    df['ci_diff'] = df.ci_high - df.ci_low
    has_ip_and_ctr_reads = np.logical_and(df.ip > 0, df.input > 0)
    small_CI = np.logical_and(df.ci_diff < ci_min, has_ip_and_ctr_reads)
    df['score'] = logit(df.loc[small_CI, 'avg'])
    return df
Example #15
    def compLowUpCI(cls, number_samples, number_correct, significant_level):
        """
        Description:
            lower and upper bounds on the model classification's accuracy
        :param number_samples: size of the data set
        :param number_correct: the number of correct predictions over all samples
        :param significant_level: confidence level, e.g. 0.90, 0.95, 0.98, 0.99
        :return lower, upper
        """
        lower, upper = proportion_confint(number_correct, number_samples,
                                          1 - significant_level)
        print('lower=%.3f, upper=%.3f' % (lower, upper))
        return lower, upper
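For example, with hypothetical numbers (88 correct predictions out of 100 samples at a 95% confidence level), the underlying call reduces to:

from statsmodels.stats.proportion import proportion_confint

lower, upper = proportion_confint(88, 100, 1 - 0.95)
print('lower=%.3f, upper=%.3f' % (lower, upper))  # lower=0.816, upper=0.944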
Example #16
def get_overall_win_rates(mongo_database):
    query = [{'$group':
             {
                '_id': {'agent': '$agent'},
                'wins': {'$sum': '$total_wins'},
                'losses': {'$sum': '$total_losses'}
             }}]

    data = [
        {
            'agent': doc['_id']['agent'], 'wins': doc['wins'], 'losses': doc['losses'],
            'win_rate': round(float(doc['wins']) / (doc['wins'] + doc['losses']), 3),
            'error': round(sm.proportion_confint(count=doc['wins'], nobs=doc['wins']+doc['losses'], alpha=0.05)[1]
                           - float(doc['wins']) / (doc['wins'] + doc['losses']), 3),
            'conf_int': sm.proportion_confint(count=doc['wins'], nobs=doc['wins']+doc['losses'], alpha=0.05)
        }
        for doc in mongo_database.agent_summaries.aggregate(query)
    ]
    df = pd.DataFrame(columns=['agent', 'wins', 'losses', 'win_rate', 'error', 'conf_int'], data=data)
    df = df.replace(to_replace=r'SO-ISMCTS-10000 Agent', value='SO-ISMCTS Agent')
    df.to_csv('data/latest_data_overall_win_rate.csv', index=False)
    print(df, '\n')
Example #17
def simulation():
    number_of_red_balls_observed = 0
    NUMBER_OF_TRIALS = 15
    CONFIDENCE_LEVEL = 0.95
    for i in range(NUMBER_OF_TRIALS):
        random.shuffle(bowl)
        ball = random.choice(bowl)
        if ball == 'red':
            number_of_red_balls_observed += 1
    ci_low, ci_up = proportion_confint(number_of_red_balls_observed,
                                       NUMBER_OF_TRIALS,
                                       alpha=1.0 - CONFIDENCE_LEVEL)
    return (ci_low, ci_up, number_of_red_balls_observed / NUMBER_OF_TRIALS)
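The snippet relies on a global bowl that is not shown; a minimal setup might be the following (the composition is hypothetical):

import random
from statsmodels.stats.proportion import proportion_confint

bowl = ['red'] * 20 + ['blue'] * 30  # true proportion of red = 0.4
ci_low, ci_up, observed = simulation()
print(ci_low, ci_up, observed)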
Example #18
def create_confint_df(count_df, ignored_cols=['Time']):
    """
    Jeffreys confidence interval for each word's share of the per-row
    word totals; returns '<word>_lb' and '<word>_ub' columns.
    """
    words = [c for c in count_df.columns if c not in ignored_cols]
    ci_df = pd.DataFrame()

    for w in words:
        # totals per row, excluding the ignored columns
        lb, ub = proportion.proportion_confint(count_df[w], count_df[words].sum(axis=1), method='jeffrey')
        ci_df['%s_lb' % w] = lb
        ci_df['%s_ub' % w] = ub

    return ci_df
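A small usage sketch with hypothetical counts (again assuming pandas as pd and statsmodels.stats.proportion as proportion):

count_df = pd.DataFrame({'Time': [0, 1], 'cat': [3, 5], 'dog': [7, 5]})
print(create_confint_df(count_df))  # columns cat_lb, cat_ub, dog_lb, dog_ub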
Example #19
def get_all_recommendations(connection_string, my_id, my_trophies='high'):
    #Set tables to collect from    
    ind_table = 'individual_aggs_' + my_trophies
    pop_table = 'population_aggs_' + my_trophies
    
    #Get player results
    query = "SELECT mode, map, brawler,"
    query += " SUM(wins) as wins,"
    query += " SUM(matches_played) AS matches_played"
    query += " FROM " + ind_table
    query += " WHERE player_id = '" + my_id + "'"
    query += " GROUP BY mode, map, brawler;"
    
    #Get individual data
    df = sql_get_results(connection_string, ind_table, '', '', '', my_id, custom_query = query)
    
    #Calculate win rate confidence intervals
    df['win_rate'] = df['wins'] / df['matches_played']
    df['ci.lower'], df['ci.upper'] = zip(*df.apply(
        lambda row: proportion_confint(count=row['wins'],
                                       nobs=row['matches_played'],
                                       alpha=.1, method='agresti_coull'),
        axis=1))
    
    #Get population data
    pop_query = "SELECT mode, map, brawler,"
    pop_query += " SUM(wins) as wins,"
    pop_query += " SUM(matches_played) AS matches_played"
    pop_query += " FROM " + pop_table
    pop_query += " GROUP BY mode, map, brawler;"
    pop = sql_get_results(connection_string, pop_table, '', '', '', my_id, custom_query = pop_query)
    pop['win_rate'] = pop['wins'] / pop['matches_played']
    df = pop.merge(df, how = 'left', left_on = ['mode', 'map', 'brawler'], right_on = ['mode', 'map', 'brawler'])
    
    #Compare population to individual history and inform recommendations
    better = (df['win_rate_x'] < df['ci.lower']) & (df['matches_played_y'] >= 5)
    worse = (df['win_rate_x'] > df['ci.upper']) & (df['matches_played_y'] >= 5)
    df['reason'] = 'Population win rate'
    df.loc[better,'reason'] = 'Outperforming population win rate'
    df.loc[worse,'reason'] = 'Underperforming population win rate'
    df['estimated_win_rate'] = df['win_rate_x']
    df.loc[better,'estimated_win_rate'] = df.loc[better,'win_rate_y']
    df.loc[worse,'estimated_win_rate'] = df.loc[worse,'win_rate_y']
    df = df[['map', 'mode', 'brawler', 'estimated_win_rate', 'win_rate_x',
             'win_rate_y', 'wins_y', 'matches_played_y', 'ci.lower',
             'ci.upper', 'reason']].sort_values(by='win_rate_y', ascending=False)
    df.columns = ['Map', 'Mode', 'Brawler', 'Estimated Win Rate',
                  'Population Win Rate', 'Your Win Rate', 'Your Wins',
                  'Your Matches Played', 'Estimated Lower Bound',
                  'Estimated Upper Bound', 'Reason']
    df = df.loc[df['Reason']!= 'Population win rate', :]
    to_percent = ['Estimated Win Rate', 'Population Win Rate', 'Your Win Rate', 'Estimated Lower Bound', 'Estimated Upper Bound']
    for col in to_percent:
        df[col] = pd.Series(["{0:.2f}%".format(num * 100) for num in df[col]], index=df.index)
    no_decimals = ['Your Wins', 'Your Matches Played']
    for col in no_decimals:
        df[col] = pd.Series(["{0:.0f}".format(num) for num in df[col]], index=df.index)
    df = df.replace(to_replace='nan%', value='-').replace(to_replace='nan', value='-')
    return(df)
Example #20
def calculate_underestimate(country, dataframe, delay_func):
    df = dataframe[dataframe.country == country].iloc[::-1].reset_index(
        drop=True)
    cumulative_known_t = 0
    for ii in range(0, len(df)):
        known_i = 0
        for jj in range(0, ii + 1):
            known_jj = df['new_cases'].loc[ii - jj] * delay_func(jj)
            known_i = known_i + known_jj
        cumulative_known_t = cumulative_known_t + known_i
    cum_known_t = round(cumulative_known_t)
    nCFR = df['new_deaths'].sum() / df['new_cases'].sum()
    cCFR = df['new_deaths'].sum() / cum_known_t
    total_deaths = df['new_deaths'].sum()
    total_cases = df['new_cases'].sum()
    # proportion_confint returns (lower, upper)
    nCFR_LQ, nCFR_UQ = proportion_confint(total_deaths, total_cases)
    cCFR_LQ, cCFR_UQ = proportion_confint(total_deaths, cum_known_t)
    quantile25, quantile75 = proportion_confint(total_deaths,
                                                cum_known_t,
                                                alpha=0.5)
    row = {
        'country': country,
        'nCFR': nCFR,
        'cCFR': cCFR,
        'total_deaths': total_deaths,
        'cum_known_t': cum_known_t,
        'total_cases': total_cases,
        'nCFR_UQ': round(nCFR_UQ, 8),
        'nCFR_LQ': round(nCFR_LQ, 8),
        'cCFR_UQ': round(cCFR_UQ, 8),
        'cCFR_LQ': round(cCFR_LQ, 8),
        'underreporting_estimate': cCFRBaseline / (100 * cCFR),
        'lower': cCFREstimateRange[0] / (100 * cCFR_UQ),
        'upper': cCFREstimateRange[1] / (100 * cCFR_LQ),
        'quantile25': quantile25,
        'quantile75': quantile75
    }
    return row
Example #21
def massart_bound_check(model, inp, eps, cls, **kwargs):
    delta = kwargs.get('delta', 0.3)
    alpha = kwargs.get('alpha', 0.05)
    confidence = kwargs.get('confidence', 0.95)
    verbose = kwargs.get('verbose', False)

    atk_locs = []
    epsilon = 1 - confidence
    chernoff_bound = math.ceil((1 / (2 * epsilon**2)) * math.log(2 / delta))
    print("BayesKeras. Maximum sample bound = %s" % (chernoff_bound))
    successes, iterations, misses = 0.0, 0.0, 0.0
    halting_bound = chernoff_bound
    I = [0, 1]
    while (iterations <= halting_bound):
        if (iterations > 0 and verbose):
            print("Working on iteration: %s \t Bound: %s \t Param: %s" %
                  (iterations, halting_bound, successes / iterations))
        model.set_weights(model.sample())
        logit_l, logit_u = IBP(model,
                               inp,
                               model.model.get_weights(),
                               eps,
                               predict=False)
        v1 = tf.one_hot(cls, depth=10)
        v2 = 1 - tf.one_hot(cls, depth=10)
        worst_case = tf.math.add(tf.math.multiply(v2, logit_u),
                                 tf.math.multiply(v1, logit_l))
        if (np.argmax(np.squeeze(worst_case)) != cls):
            misses += 1
            result = 0
        else:
            result = 1
        successes += result
        iterations += 1
        # Final bounds computation below
        lb, ub = proportion_confint(successes, iterations, method='beta')
        if (math.isnan(lb)):
            lb = 0.0  # set lb to zero if it is NaN
        if (math.isnan(ub)):
            ub = 1.0  # set ub to one if it is NaN
        I = [lb, ub]
        hb = absolute_massart_halting(successes, iterations, I, epsilon, delta,
                                      alpha)
        if (hb == -1):
            halting_bound = chernoff_bound
        else:
            halting_bound = min(hb, chernoff_bound)
    if (verbose):
        print("Exited becuase %s >= %s" % (iterations, halting_bound))
    return successes / iterations
Example #22
    def test_incidence_rate(self):
        data = pd.read_csv("meerkat_analysis/test/test_data/univariate.csv")
        variables = util.Variables.from_json_file(
            "meerkat_analysis/test/test_data/variables.json")

        incidence, ci = univariate.incidence_rate(data, var_id="gen_1")
        self.assertEqual(incidence, 0.4)
        self.assertEqual(ci,
                         proportion.proportion_confint(4, 10, method="wilson"))

        incidence, ci = univariate.incidence_rate(data,
                                                  population=20,
                                                  var_id="gen_1")
        self.assertEqual(incidence, 0.2)
        self.assertEqual(ci,
                         proportion.proportion_confint(4, 20, method="wilson"))

        incidence, ci = univariate.incidence_rate(data,
                                                  name="Female",
                                                  variables=variables)
        self.assertEqual(incidence, 0.6)
        self.assertEqual(ci,
                         proportion.proportion_confint(6, 10, method="wilson"))
Example #23
def multi_ci(counts, alpha):
    multi_list = []
    n = np.sum(counts)
    l = len(counts)
    for i in range(l):
        multi_list.append(
            proportion_confint(
                # clamp the count away from 0 and n so the 'beta' method
                # never returns NaN endpoints
                min(max(counts[i], 1e-10), n - 1e-10),
                n,
                alpha=alpha / 2,
                method="beta",
            ))
    return np.array(multi_list)
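The min/max clamp keeps every count strictly inside (0, n), so the 'beta' method cannot return NaN endpoints for empty or saturated classes. A usage sketch with hypothetical class counts:

import numpy as np

counts = np.array([0, 3, 7])  # e.g. votes from a randomized-smoothing run
print(multi_ci(counts, alpha=0.05))  # one (lower, upper) row per class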
Example #24
def bn_mean_ci(df, groupby_col, feature, **kwargs):
    """
    group df and extract mean and ci for a binomial feature
    """
    counts = df.groupby(by=groupby_col)[feature].value_counts().unstack(
        level=1).fillna(0)
    tot = counts[1] + counts[0]

    lb, ub = proportion_confint(counts[1], tot, **kwargs)
    ci = pd.concat((lb, ub), axis=1)
    ci.columns = ['lb', 'ub']
    ci['mean'] = counts[1] / (counts[1] + counts[0])

    return (ci)
Example #25
    def __init__(self, mask, dataset):
        self.dataset = dataset
        self.good_cnt = sum(mask & dataset.target.good_mask)
        self.bad_cnt = sum(mask & dataset.target.bad_mask)
        self.proc_bad = self.bad_cnt / dataset.target.bad_cnt
        self.proc_good = self.good_cnt / dataset.target.good_cnt

        if self.good_cnt + self.bad_cnt > 0:
            self.left, self.right = proportion_confint(self.good_cnt,
                                                       self.good_cnt + self.bad_cnt,
                                                       method='wilson')
            self.event_rate = self.good_cnt / (self.good_cnt + self.bad_cnt)
        else:
            self.left, self.right = 0., 0.
            self.event_rate = 0.
Example #26
    def _lower_confidence_bound(self, n_class_samples: int,
                                n_total_samples: int) -> float:
        """
        Uses Clopper-Pearson method to return a (1-alpha) lower confidence bound on bernoulli proportion

        :param n_class_samples: Number of samples of a specific class.
        :param n_total_samples: Number of samples for certification.
        :return: Lower bound on the binomial proportion w.p. (1-alpha) over samples.
        """
        from statsmodels.stats.proportion import proportion_confint

        return proportion_confint(n_class_samples,
                                  n_total_samples,
                                  alpha=2 * self.alpha,
                                  method="beta")[0]
Example #27
def create_confint_df(count_df, ignored_cols=['Time']):
    """
    Jeffreys confidence interval for each word's share of the per-row
    word totals; returns '<word>_lb' and '<word>_ub' columns.
    """
    words = [c for c in count_df.columns if c not in ignored_cols]
    ci_df = pd.DataFrame()

    for w in words:
        # totals per row, excluding the ignored columns
        lb, ub = proportion.proportion_confint(count_df[w],
                                               count_df[words].sum(axis=1),
                                               method='jeffrey')
        ci_df['%s_lb' % w] = lb
        ci_df['%s_ub' % w] = ub

    return ci_df
Example #28
        def binomial_error_bars(N_pos, N):
            # Return 95% confidence for binomial
            # median = N_pos/N
            err = np.array(proportion_confint(N_pos, N, method='beta'))
            if np.sum(np.isnan(err)):
                err[np.isnan(err)] = 0

            # print("Binomial error bars for prevalence plot: ")
            # print(err)

            # errorbar wants offsets rather than absolute values; convert to that
            err_as_offset = err  # note: this aliases err, it does not copy
            err_as_offset[0] = np.abs(err[0] - (N_pos / N))
            err_as_offset[1] = np.abs(err[1] - (N_pos / N))
            return err_as_offset
Example #29
    def acceptable(self, classificationRecord_ind, class_inst_counter,
                   counter):
        if classificationRecord_ind:
            cr_correct = classificationRecord_ind['correct']
        else:
            return False

        #class_accuracy = cr_correct / (counter+1)
        correct_class_accuracy_interval = proportion_confint(
            cr_correct, counter + 1)
        class_freq_interval = proportion_confint(class_inst_counter,
                                                 counter + 1)
        #rel_freq = classCounter_ind / (counter+1)
        #if class_accuracy > rel_freq:

        # If the accuracy interval's lower
        # endpoint is greater than the class frequency interval's higher endpoint, then the instance
        # is accepted.
        if correct_class_accuracy_interval[0] > class_freq_interval[1]:
            print('ATrue', correct_class_accuracy_interval,
                  class_freq_interval)
            return True
        else:
            return False
Example #30
    def __init__(self, oracle):
        # Create root node with constraints set equal to [], initial data equal to the
        # entire training dataset and labels predicted by the oracle model
        self.oracle = oracle
        self.initial_data = oracle.X
        self.initial_labels = oracle.y
        self.num_examples = len(oracle.X)
        # Improvement is the percentage by which gain should improve on addition of a new test. (Can be from 1.0+)
        self.tree_params = {"tree_size": 15, "split_min": 1000, "num_feature_splits": 50, 'delta': 0.05, 'eps': 0.05}
        self.conf_interval_low, _ = proportion_confint(self.tree_params['split_min'] * (1 - self.tree_params['eps']),
                                                       self.tree_params['split_min'],
                                                       alpha=self.tree_params['delta'])
        self.num_nodes = 0
        self.max_levels = 0
        self.root = self.construct_node(self.initial_data, self.initial_labels, Constraint(), '', 'root')
Example #31
def one_prop_conf_interval(values: np.ndarray,
                           conf_level: float = 0.95) -> tuple:
    """calculates confidence interval for a proportion

    Args:
        values (np.ndarray): sample values (an array of 0s and 1s)
        conf_level (float): confidence level

    Returns:
        tuple: lower and upper bounds of confidence interval
    """
    ci = prop_stats.proportion_confint(values.sum(),
                                       len(values),
                                       alpha=1 - conf_level)
    return ci
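Usage sketch with a hypothetical sample (assuming statsmodels.stats.proportion is imported as prop_stats, as the snippet implies):

import numpy as np

values = np.array([1, 0, 1, 1, 0, 1, 1, 1])  # hypothetical 0/1 outcomes
print(one_prop_conf_interval(values, conf_level=0.9))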
Example #32
def cancel(data):

    p = data['is_canceled'].mean()
    n = len(data['is_canceled'])
    print(p)
    print(math.sqrt(p * (1 - p) / n))
    print(sm.proportion_confint(data['is_canceled'].sum(), n))
    dcancel = data[['number_of_paid_orders_before',
                    'is_canceled']].query('number_of_paid_orders_before <=10')
    dcancel['is_canceled'].groupby(
        dcancel['number_of_paid_orders_before']).mean().plot(
            kind='bar', title='Cancel rate')
    plt.show()
    graph(data['is_canceled'].resample('M').mean(), 10,
          'Cancel rate per month', 'Cancel rate')
Example #33
def plot():

    dic = pkl.load(
        open(
            "/users/global/cornkle/figs/LSTA-bullshit/scales/new_LSTA/dominant_scales_save/scales.p",
            "rb"))

    bin = np.array(dic['bin'])
    center = bin[0:-1] + (bin[1::] - bin[0:-1])
    # pdb.set_trace()  # leftover debugging breakpoint, disabled
    data = dic['blob'] - (np.sum(dic['blobc'], axis=0) / np.sum(dic['blobc']))
    db = dic['blobc']
    filler = np.zeros_like(db)
    for i in range(db.shape[0]):
        for j in range(db.shape[1]):
            low, up = prop.proportion_confint(db[i, j], np.nansum(db[i, :]))

            unrange = (db[i, j] / np.nansum(db[i, :])) - low
            filler[i, j] = unrange

    mask = np.zeros_like(db)
    mask[filler > np.abs(data)] = 1
    data[np.where(mask)] = 0

    f = plt.figure()
    ax = plt.subplot(111)
    pmap = ax.pcolormesh(data * 100, vmin=-2, vmax=2, cmap='RdBu_r')
    ax.set_xticks(np.arange(dic['blob'].shape[1]) + 1, minor=False)
    ax.set_xticklabels(center)
    cbar = plt.colorbar(pmap)
    cbar.set_label('Difference in scale frequency | Blobs')

    ax.set_yticks(np.arange(dic['blob'].shape[0]) + 1, minor=False)
    ax.set_yticklabels(np.arange(0, 24))
    ax.set_xlabel('Surface Scales of pos/neg deviation to surroundings')
    ax.set_ylabel('Hours')

    ax1 = ax.twinx()
    ax1.set_yticks(np.arange(dic['blob'].shape[0]) + 1, minor=False)
    ax1.set_yticklabels(dic['nblobs'])

    print(np.sum(dic['blobc'] > 0) / np.sum(dic['nblobs']))

    print(np.sum(np.isfinite(dic['blobc'])))

    print(np.sum(data, axis=0))
Example #34
def influence_ci(y_true, y_pred, ci='both'):
    """
    :param ci:
        if 'both': returns a tuple (ci_low, ci_high)
        if 'low': returns only low confidence interval
        if 'high': returns only high confidence interval
    """
    [not_true_not_pred, not_true_pred], [true_not_pred, true_pred] = confusion_matrix(y_true, y_pred)
    base_rate = (true_not_pred + true_pred) / (not_true_not_pred + not_true_pred + true_not_pred + true_pred)
    ci_low, ci_high = proportion_confint(true_pred, (true_pred + not_true_pred))
    if ci == 'low':
        return ci_low / base_rate
    elif ci == 'high':
        return ci_high / base_rate
    else:
        return ci_low / base_rate, ci_high / base_rate
Example #35
def transpositions(data):
    data = convert(data, 'melted').data.dropna()
    y = data['recalled_pos'] - data.index.get_level_values('pos')
    h, edges = np.histogram(
        y.values, np.arange(min(y.values) - 0.5,
                            max(y.values) + 0.5))
    x = np.asarray(edges[:-1] + 0.5 * np.diff(edges), dtype=int)
    p = h / float(len(y))
    ci_low, ci_upp = proportion_confint(h, len(y), method='beta')
    return pd.DataFrame(
        {
            'p_transpose': p,
            'ci_low': p - ci_low,
            'ci_upp': ci_upp - p,
        },
        index=x)
Example #36
def plot_fraction_asymptomatic(adf):

    cols = ['num_positive', 'num_asymptomatic']
    df = adf.groupby('delivery_date')[cols].agg(np.sum).reset_index()
    df = df.sort_values(by='delivery_date')
    df = df[df['num_positive'] > 0]
    df['fraction_asymptomatic'] = df['num_asymptomatic'] / df['num_positive']

    sns.set_style('whitegrid', {'axes.linewidth': 0.5})
    fig = plt.figure(figsize=(8, 4))
    ax = fig.gca()
    palette = sns.color_palette('Set1')
    formatter = mdates.DateFormatter("%m-%d")

    # ax.scatter(df['delivery_date'].values, df['fraction_asymptomatic'].values, df['num_positive'], color=palette[1],
    #            label='daily')

    df['moving_ave_frac'] = df['fraction_asymptomatic'].rolling(
        window=7, center=True).mean()
    ax.plot(df['delivery_date'], df['moving_ave_frac'], '-', color=palette[1])

    df['moving_ave_asymp'] = df['num_asymptomatic'].rolling(window=7,
                                                            center=True).sum()
    df['moving_ave_pos'] = df['num_positive'].rolling(window=7,
                                                      center=True).sum()
    lows, highs = [], []
    for r, row in df.iterrows():
        low, high = proportion_confint(row['moving_ave_asymp'],
                                       row['moving_ave_pos'])
        lows.append(low)
        highs.append(high)

    ax.fill_between(df['delivery_date'].values,
                    lows,
                    highs,
                    color=palette[1],
                    linewidth=0,
                    alpha=0.3)

    ax.set_ylabel('percent of positives who were asymptomatic at time of test')
    ax.set_xlabel('date of delivery')
    ax.xaxis.set_major_formatter(formatter)
    ax.xaxis.set_major_locator(mdates.MonthLocator())
    fig.savefig(os.path.join(plotdir, 'percent_asymptomatic.png'))
    fig.savefig(os.path.join(plotdir, 'percent_asymptomatic.pdf'),
                format='PDF')
Example #37
def melted_to_serial_pos(data):
    y = defaultdict(lambda: 0)
    for _, x in data.iterrows():
        if np.isfinite(x['recalled_pos']):
            y[int(x['recalled_pos']) + 1] += 1
    n = len(data.index.get_level_values('trial').unique())
    y = np.array([y[k] for k in range(1, max(y.keys()) + 1)])
    ci_low, ci_upp = proportion_confint(y, n, method='beta')
    m = y / float(n)
    return pd.DataFrame(
        {
            'correct': m,
            'ci_low': m - ci_low,
            'ci_upp': ci_upp - m,
        },
        index=np.arange(1,
                        len(y) + 1)).sort_index()
Example #38
    def CreateRocCurve(self, df, ref_roc=None):
        n_wp = len(self.working_points)
        wp_roc = None
        if self.raw:
            fpr, tpr, thresholds = metrics.roc_curve(
                df['gen_tau'].values,
                df[self.column].values,
                sample_weight=df.weight.values)
            roc = RocCurve(len(fpr), self.color, False, dashed=self.dashed)
            roc.pr[0, :] = fpr
            roc.pr[1, :] = tpr
            roc.thresholds = thresholds
            roc.auc_score = metrics.roc_auc_score(
                df['gen_tau'].values,
                df[self.column].values,
                sample_weight=df.weight.values)
        if n_wp > 0:
            wp_roc = RocCurve(n_wp, self.color, not self.raw, self.raw)
            for n in range(n_wp):
                for kind in [0, 1]:
                    df_x = df[df['gen_tau'] == kind]
                    n_passed = self.CountPassed(df_x, self.working_points[n])
                    n_total = np.sum(df_x.weight.values)
                    eff = float(n_passed) / n_total
                    wp_roc.pr[kind, n_wp - n - 1] = eff
                    if not self.raw:
                        if sys.version_info.major > 2:
                            ci_low, ci_upp = proportion_confint(n_passed,
                                                                n_total,
                                                                alpha=1 - 0.68,
                                                                method='beta')
                        else:
                            err = math.sqrt(eff * (1 - eff) / n_total)
                            ci_low, ci_upp = eff - err, eff + err
                        wp_roc.pr_err[kind, 1, n_wp - n - 1] = ci_upp - eff
                        wp_roc.pr_err[kind, 0, n_wp - n - 1] = eff - ci_low
        if not self.raw:
            roc = wp_roc
            wp_roc = None
        if ref_roc is not None:
            roc.ratio = create_roc_ratio(roc.pr[1], roc.pr[0], ref_roc.pr[1],
                                         ref_roc.pr[0])
        elif roc.pr[1].shape[0] > 0:
            roc.ratio = np.array([[1, 1], [roc.pr[1][0], roc.pr[1][-1]]])

        return roc, wp_roc
Example #39
    def test_confidence_interval_estimation(self):
        runner = SingleProcessExperimentRunner()
        sample_length = self.config["sample_length"]
        samples = self.config["samples"]
        alpha = self.config["alpha"]
        method = "agresti_coull"
        estimation_tolerance = 0.1

        confidence_intervals = []
        all_successes = 0
        report_lines = []
        """:type : list[dict]"""
        for i in range(0, samples):
            _, res, trial_infos = runner.run_trials(self.experiment,
                                                    number_of_trials=sample_length,
                                                    step_listeners=[report_step])
            print(trial_infos)
            self.assertEqual(sample_length, len(res))
            self.assertEqual(sample_length, len(trial_infos))
            successes = sum(res)
            all_successes += successes
            ci_low, ci_up = proportion.proportion_confint(successes, len(res), alpha=alpha,
                                                          method=method)
            confidence_intervals.append((ci_low, ci_up))
            report_lines.append(dict(i=i+1, successes=successes, trials=len(res)))

            print("Run #{}: {} successes, CI: [{}..{}]".format(i + 1, successes, ci_low, ci_up))

        estimated_prob = all_successes / (samples * sample_length)

        real_prob = self.calc_real_prob()

        print("estimated probability: {}".format(estimated_prob))
        print("real probability: {}".format(real_prob))
        interval_hit = 0
        for cl, cu in confidence_intervals:
            if cl <= real_prob <= cu:
                interval_hit += 1
        interval_hit_ratio = interval_hit / len(confidence_intervals)
        print("interval hits: {} of {} = {} %".format(interval_hit, len(confidence_intervals),
                                                      interval_hit_ratio * 100.0))

        with open("smctest01_ci.csv", "w") as f:
            f.write("I;SUCCESSES;TRIALS\n")
            for line in report_lines:
                f.write("{i};{successes};{trials}\n".format(**line))
Example #40
def _prop_confidence_interval(pair: PlayerPair[int]):
    nobs = pair.one + pair.two
    count = pair.one
    ci_low, ci_upp = proportion_confint(count,
                                        nobs,
                                        alpha=0.05,
                                        method='wilson')
    begin = ""
    if ci_low > 0.5:
        begin = "\033[92m"  # green
    elif ci_upp < 0.5:
        begin = "\033[91m"  # red
    end = "\033[0m" if begin != "" else ""
    low = "{:.2%}".format(ci_low)
    upp = "{:.2%}".format(ci_upp)
    mean = "{:.2%}".format(count / nobs)
    return f"{begin}{mean} [{low}, {upp}]{end}"
Example #41
def BinomialErrors(nobs, Nsamp, alpha=0.05, method='jeffrey'):
    """
    This is basically just statsmodels.stats.proportion.proportion_confint
    with a different default method. It also returns the proportion nobs/Nsamp

    Parameters:
    ===========
    - nobs:     integer
                The number of "successes"

    - Nsamp:    integer
                The total number of trials. Should be >= nobs.

    - alpha:    float in (0, 1)
                Significance level: the probability that the true
                proportion lies outside the reported bounds.
                alpha=0.05 is about 2-sigma.

    - method:   string
                The calculation method. This is just passed to
                `statsmodels.stats.proportion.proportion_confint`

    Returns:
    ========
    - prob:     float
                The estimate for the probability. prob = nobs / Nsamp

    - low:      float
                The lower bound on the probability

    - high:     float
                The upper bound on the probability
    """

    low, high = proportion_confint(nobs, Nsamp, method=method, alpha=alpha)

    if nobs == 0:
        low = 0.0
        p = 0.0
    elif nobs == Nsamp:
        high = 1.0
        p = 1.0
    else:
        p = float(nobs) / float(Nsamp)

    return p, low, high
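For example, with hypothetical counts:

p, low, high = BinomialErrors(8, 20)
print(p, low, high)  # 0.4 with its Jeffreys 95% lower/upper bounds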
Example #42
def determine_p_est_by_interval(df, key, max_delay, search_confidence, search_max_p_rel_interval_len):
    num_sample_cases = df.shape[0]

    #Estimate probability of getting a max delay path from the entire MC sim
    num_max_delay_cases = df.value_counts()[max_delay]

    p_est = num_max_delay_cases / float(num_sample_cases)

    print "P_est: {}".format(p_est)

    #Calculate the interval to see if it has converged
    #
    #The 'beta' (Clopper-Pearson) method is a pessimistic interval which guarantees
    #coverage of the alpha-significant interval, but it may be conservative (i.e. it may
    #actually cover a smaller alpha than requested)
    alpha = 1. - search_confidence
    ci = sms_sp.proportion_confint(num_max_delay_cases, num_sample_cases, alpha=alpha, method="beta")

    #Convert tuple to array
    ci = [ci[0], ci[1]]

    if max_delay == 0 and math.isnan(ci[1]):
        print "Warning: end of confidence interval was nan for max_delay 0; forcing to 1."
        ci[1] = 1.

    assert not math.isnan(ci[0])
    assert not math.isnan(ci[1])

    ci_len = ci[1] - ci[0]
    ci_len_ratio = ci_len / p_est

    print "P_est CI: [{:g}, {:g}] @ alpha={} ci_len/P_est={}".format(ci[0], ci[1], alpha, ci_len_ratio)

    if p_est < ci[0] or p_est > ci[1]:
        msg = "Estimate {:g} falls outside confidence interval [{:g}, {:g}]: NOT CONVERGED".format(p_est, ci[0], ci[1])

        raise NotConvergedException(msg, num_sample_cases)

    if ci_len_ratio > search_max_p_rel_interval_len:
        msg = "Normalized CI delta (ci[1] - ci[0])/p_ext={:g} exceeds target={:g}: NOT CONVERGED".format(ci_len_ratio, search_max_p_rel_interval_len)

        raise NotConvergedException(msg, num_sample_cases)


    return p_est, ci
Example #43
def plot():

    dic = pkl.load( open(cnst.network_data + "figs/LSTA-bullshit/scales/new/dominant_scales_save/scales.p", "rb"))

    bin = np.array(dic['bin'])
    center = bin[0:-1] + (bin[1::]-bin[0:-1])

    data = dic['blob'] - (np.sum(dic['blobc'], axis=0) / np.sum(dic['blobc']))
    db = dic['blobc']
    filler = np.zeros_like(db)
    for i in range(db.shape[0]):
        for j in range(db.shape[1]):
            low , up = prop.proportion_confint(db[i,j], np.nansum(db[i,:]))

            unrange = (db[i,j] / np.nansum(db[i,:])) - low
            filler[i,j] = unrange

    mask = np.zeros_like(db)
    mask[filler>np.abs(data)] = 1
    data[np.where(mask)] = 0

    f = plt.figure()
    ax = plt.subplot(111)
    pmap = ax.pcolormesh(data*100, vmin=-2, vmax=2, cmap='RdBu_r')
    ax.set_xticks(np.arange(dic['blob'].shape[1])+1, minor=False)
    ax.set_xticklabels(center)
    cbar = plt.colorbar(pmap)
    cbar.set_label('Difference in scale frequency | Blobs')

    ax.set_yticks(np.arange(dic['blob'].shape[0]) + 1, minor=False)
    ax.set_yticklabels(np.arange(0,24))
    ax.set_xlabel('Surface Scales of pos/neg deviation to surroundings')
    ax.set_ylabel('Hours')


    ax1 = ax.twinx()
    ax1.set_yticks(np.arange(dic['blob'].shape[0])+1, minor=False)
    ax1.set_yticklabels(dic['nblobs'])
    plt.show()
    print(np.sum(dic['blobc']>0)/np.sum(dic['nblobs']))

    print(np.sum(np.isfinite(dic['blobc'])))

    print(np.sum(data, axis=0))
Example #44
def BinomialErrors(nobs, Nsamp, alpha=0.05, method='jeffrey'):
    """
    This is basically just statsmodels.stats.proportion.proportion_confint
    with a different default method. It also returns the proportion nobs/Nsamp
    """

    low, high = proportion_confint(nobs, Nsamp, method=method, alpha=alpha)
    
    if nobs == 0:
        low = 0.0
        p = 0.0
    elif nobs == Nsamp:
        high = 1.0
        p = 1.0
    else:
        p = float(nobs) / float(Nsamp)

    return p, low, high
Example #45
def get_stats(df, cutoff=1):
    data = df[[c for c in df.columns if not c == 'sequence']]
    total_n = data.sum(axis=1) # All non-corresponding data should be zero
    correct_n = data[[n+'->'+n for n in ['A', 'C', 'T', 'G']]] # Get the columns that correspond to correct incorporations
    misinc_n = total_n - correct_n.sum(axis=1) # Everything else is a misincorporation

    # Compute the misincorporation rate with a Jeffreys confidence interval
    rate = misinc_n / total_n
    lb, ub = proportion.proportion_confint(misinc_n, total_n, method='jeffrey')
    
    # Assemble output dataframe
    simp_df = pd.DataFrame()
    simp_df['rate'] = rate
    simp_df['lb'] = lb
    simp_df['ub'] = ub
    simp_df['n'] = total_n
    simp_df['sequence'] = df.sequence
    
    return simp_df
Example #46
def percent_correct_sound_type(bdata, soundType):
    '''
    DEPRECATED
    Return the average percent correct for a behavior session, limited to a single sound type
    '''

    from statsmodels.stats.proportion import proportion_confint

    correctChoiceVec = bdata['outcome']==bdata.labels['outcome']['correct']
    trialsSoundType = bdata['soundType']==bdata.labels['soundType'][soundType]
    correctSoundType = correctChoiceVec[trialsSoundType]
    nRewardedSoundType = sum(correctSoundType)
    valid = bdata['valid']
    validSoundType = valid[trialsSoundType]
    nValidSoundType = sum(validSoundType)
    fractionCorrectSoundType = nRewardedSoundType/float(nValidSoundType)

    ci = np.array(proportion_confint(nRewardedSoundType, nValidSoundType, method = 'wilson'))

    return (fractionCorrectSoundType, nRewardedSoundType, nValidSoundType)
Example #47
def plot_band_psychometric(validPerSNR, rightPerSNR, possibleSNRs, colour = 'k', linestyle='-', xlabel=True, ylabel=True):
    from statsmodels.stats.proportion import proportion_confint
    performance = []
    upper = []
    lower = []
    for inds in range(len(possibleSNRs)):
        CIthisSNR = np.array(proportion_confint(rightPerSNR[inds], validPerSNR[inds], method = 'wilson'))
        performance.append(100.0*rightPerSNR[inds]/validPerSNR[inds])
        upper.append(100.0*CIthisSNR[1]-performance[-1])
        lower.append(performance[-1]-100.0*CIthisSNR[0])
    plt.plot(np.arange(len(possibleSNRs)), performance, linestyle, marker='o', color=colour, mec=colour, lw=3, ms=10)
    plt.errorbar(np.arange(len(possibleSNRs)), performance, yerr = [lower, upper], color=colour, lw=2, ls=linestyle)
    if ylabel:
        plt.ylabel("% rightward", fontsize=16)
    if xlabel:
        plt.xlabel('SNR (dB)', fontsize=16)
    plt.xticks(np.arange(len(possibleSNRs)), possibleSNRs)
    plt.ylim((0,100))
    ax = plt.gca()
    extraplots.boxoff(ax)
Example #48
def compute_psycurve(bdata):
    targetFrequency=bdata['targetFrequency']
    valid=bdata['valid']
    choice=bdata['choice']
    intensities=bdata['targetIntensity']
    choiceRight = choice==bdata.labels['choice']['right']

    #Find trials at each frequency
    possibleFreq = np.unique(targetFrequency)
    nFreq = len(possibleFreq) 
    trialsEachFreq = behavioranalysis.find_trials_each_type(targetFrequency,possibleFreq)

    #Preallocate arrays
    fractionRightEachFreq=np.empty(nFreq)
    confLimitsEachFreq=np.empty([2, nFreq]) #Upper and lower confidence limits
    nTrialsEachFreq = np.empty(nFreq)
    nRightwardEachFreq = np.empty(nFreq)

    #Compute the number right and c.i for each freq.
    for indf,thisFreq in enumerate(possibleFreq): 

        nTrialsThisFreq = sum(valid & trialsEachFreq[:,indf])
        nRightwardThisFreq =  sum(valid & choiceRight & trialsEachFreq[:,indf])
        conf = np.array(proportion_confint(nRightwardThisFreq, nTrialsThisFreq, method = 'wilson'))

        nTrialsEachFreq[indf] = nTrialsThisFreq
        nRightwardEachFreq[indf] = nRightwardThisFreq
        confLimitsEachFreq[0, indf] = conf[0] # Lower ci
        confLimitsEachFreq[1, indf] = conf[1] # Upper ci

    #Compute %right (c.i. are already in percent)
    fractionRightEachFreq = nRightwardEachFreq/nTrialsEachFreq.astype(float)

    #plot(possibleFreq,fractionRightEachFreq,'-o')
    #gca().set_xscale('log')

    #Find lengths of whiskers for plotting function
    upperWhisker = confLimitsEachFreq[1, :] - fractionRightEachFreq
    lowerWhisker = fractionRightEachFreq - confLimitsEachFreq[0,:]

    return nRightwardEachFreq, nTrialsEachFreq, fractionRightEachFreq, upperWhisker, lowerWhisker
Example #49
def test_confint_proportion():
    from .results.results_proportion import res_binom, res_binom_methods
    methods = {'agresti_coull' : 'agresti-coull',
               'normal' : 'asymptotic',
               'beta' : 'exact',
               'wilson' : 'wilson',
               'jeffrey' : 'bayes'
               }

    for case in res_binom:
        count, nobs = case
        for method in methods:
            idx = res_binom_methods.index(methods[method])
            res_low = res_binom[case].ci_low[idx]
            res_upp = res_binom[case].ci_upp[idx]
            if np.isnan(res_low) or np.isnan(res_upp):
                continue
            ci = proportion_confint(count, nobs, alpha=0.05, method=method)

            assert_almost_equal(ci, [res_low, res_upp], decimal=6,
                                err_msg=repr(case) + method)
Example #50
def eval_pairwise_analogies(frame, eval_filename, subset='all',
                            weight_direct=0.35, weight_transpose=0.65):
    # numpy (np), pandas (pd), proportion_confint, and the project-specific helpers
    # VectorSpaceWrapper, read_turney_analogies, and pairwise_analogy_func are
    # assumed to be imported at module level.
    total = 0
    correct = 0
    wrap = VectorSpaceWrapper(frame=frame)
    for idx, (prompt, choices, answer) in enumerate(read_turney_analogies(eval_filename)):
        # Enable an artificial training/test split
        if subset == 'all' or (subset == 'dev') == (idx % 2 == 0):
            a1, b1 = prompt
            choice_values = []
            for choice in choices:
                a2, b2 = choice
                choice_values.append(
                    pairwise_analogy_func(wrap, a1, b1, a2, b2, weight_direct, weight_transpose)
                )
            our_answer = np.argmax(choice_values)
            if our_answer == answer:
                correct += 1
            total += 1
    low, high = proportion_confint(correct, total)
    return pd.Series([correct / total, low, high], index=['acc', 'low', 'high'])
Example #51
def confidence_interval(delta, metric):
    """Calculate a two-sided 95% confidence interval for differences."""
    # Wilson score interval for a sign test; `proportion` is
    # statsmodels.stats.proportion, assumed imported at module level (as is numpy).
    num_successes = np.count_nonzero(delta > 0)
    num_trials = np.count_nonzero(delta != 0)  # Exclude zero differences.
    lower, upper = proportion.proportion_confint(
        num_successes, num_trials, alpha=0.05, method='wilson')
    median_delta = delta.median()
    if metric == 'auc':
        median = r'%.3f' % median_delta
    else:
        median = r'%.0f' % median_delta
    ci = r'(%.2f, %.2f)' % (lower, upper)  # CI format is the same for both metrics
    if lower < 0.5 and upper < 0.5:
        median = r'\bfseries \color{red} ' + median
        ci = r'\bfseries \color{red} ' + ci
    elif lower > 0.5 and upper > 0.5:
        median = r'\bfseries ' + median
        ci = r'\bfseries ' + ci
    return median, ci
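A brief usage sketch for the sign-test interval above; the Series of per-task AUC differences is invented for illustration.

import numpy as np
import pandas as pd
from statsmodels.stats import proportion

# Toy per-task AUC differences (hypothetical values)
deltas = pd.Series([0.02, -0.01, 0.03, 0.00, 0.05, 0.01, -0.02])
median, ci = confidence_interval(deltas, metric='auc')
print(median, ci)  # LaTeX-formatted median and Wilson (lower, upper) bounds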
def calculate_psychometric(hitTrials,paramValueEachTrial,valid=None):
    '''
    Calculate fraction of hits for each parameter value (in vector param).
    hitTrials: (boolean array of size nTrials) hit or miss
    paramValueEachTrial: (array of size nTrials) parameter value for each trial
    valid: (boolean array of size nTrials) trials to use in calculation

    RETURNS:
    possibleValues: array with possible values of the parameter
    fractionHitsEachValue: array of same length as possibleValues
    ciHitsEachValue: array of size [2,len(possibleValues)]
    nTrialsEachValue: array of same length as possibleValues
    nHitsEachValue: array of same length as possibleValues
    '''
    try:
        from statsmodels.stats.proportion import proportion_confint  # used to compute confidence intervals for the error bars
        useCI = True
    except ImportError:
        print('To calculate confidence intervals, please install the "statsmodels" module.')
        useCI = False
    nTrials = len(hitTrials)
    if valid is None:
        valid = np.ones(nTrials, dtype=bool)
    possibleValues = np.unique(paramValueEachTrial)
    nValues = len(possibleValues)
    trialsEachValue = find_trials_each_type(paramValueEachTrial,possibleValues)

    nTrialsEachValue = np.empty(nValues,dtype=int)
    nHitsEachValue = np.empty(nValues,dtype=int)
    for indv,thisValue in enumerate(possibleValues):
        nTrialsEachValue[indv] = sum(valid & trialsEachValue[:,indv])
        nHitsEachValue[indv] = sum(valid & hitTrials & trialsEachValue[:,indv])

    fractionHitsEachValue = nHitsEachValue/nTrialsEachValue.astype(float)
    if useCI:
        ciHitsEachValue = np.array(proportion_confint(nHitsEachValue, nTrialsEachValue, method = 'wilson'))
    else:
        ciHitsEachValue = None
    return (possibleValues,fractionHitsEachValue,ciHitsEachValue,nTrialsEachValue,nHitsEachValue)
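A quick illustration of calling calculate_psychometric; the trial arrays are invented toy data, and find_trials_each_type is assumed to be available in the same module, as in the function body.

import numpy as np

rng = np.random.default_rng(0)
paramValueEachTrial = rng.choice([20, 40, 60], size=100)  # hypothetical stimulus intensities
hitTrials = rng.random(100) < 0.7                         # hypothetical hit/miss outcomes

(possibleValues, fractionHits, ciHits,
 nTrialsEachValue, nHitsEachValue) = calculate_psychometric(hitTrials, paramValueEachTrial)
print(possibleValues, fractionHits)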
Example #53
def move_article(mongo_client, doc, from_table, to_table, dry_run=False):
  # Python 3: dict.has_key() no longer exists; use `in` / dict.get() instead
  if "docid" not in doc or doc["docid"] == "":
    return

  editor_score = doc.get("editor_score", "0")
  expiration_date = doc.get("expiration_date", "NA")

  docid = doc["docid"]
  if dry_run:
    clicks = doc.get("all_clicks", 0)
    imps = doc.get("all_imps", 0)
    if imps >= 5:
      # CONFIDENCE is a module-level constant (the desired confidence level) defined elsewhere
      (lb, ub) = proportion_confint(clicks, imps, (1 - CONFIDENCE), 'wilson')
      print(docid, clicks, imps, (clicks * 1.0 / imps), lb, ub, editor_score, expiration_date, from_table, to_table)
    else:
      print(docid, clicks, imps, 0.0, 0.0, 0.0, editor_score, expiration_date, from_table, to_table)
    return

  from_db = get_table(mongo_client, from_table)
  to_db = get_table(mongo_client, to_table)

  # Count existing copies without rebinding `doc` (the original loop shadowed the parameter,
  # so the wrong document could be inserted below)
  existing_size = sum(1 for _ in to_db.find({"docid": docid}))
  if existing_size > 0:
    logging.warning("doc " + docid + " already exists in destination table: " + to_table)
  else:
    to_db.insert_one(doc)
  from_db.delete_one({"docid": docid})
def test_confint_proportion():
    import numpy as np
    from numpy.testing import assert_almost_equal
    from statsmodels.stats.proportion import proportion_confint
    from .results.results_proportion import res_binom, res_binom_methods

    # probci_methods is defined at module level in the original test file;
    # reproduced here so the snippet stays self-contained
    probci_methods = {'agresti_coull': 'agresti-coull',
                      'normal': 'asymptotic',
                      'beta': 'exact',
                      'wilson': 'wilson',
                      'jeffreys': 'bayes'}

    for case in res_binom:
        count, nobs = case
        for method in probci_methods:
            idx = res_binom_methods.index(probci_methods[method])
            res_low = res_binom[case].ci_low[idx]
            res_upp = res_binom[case].ci_upp[idx]
            if np.isnan(res_low) or np.isnan(res_upp):
                continue
            if (count == 0 or count == nobs) and method == 'jeffreys':
                # maybe a bug or different corner case definition
                continue
            if method == 'jeffreys' and nobs == 30:
                # something is strange in extreme case e.g 0/30 or 1/30
                continue
            ci = proportion_confint(count, nobs, alpha=0.05, method=method)
            # we impose that confint is in [0, 1]
            res_low = max(res_low, 0)
            res_upp = min(res_upp, 1)
            assert_almost_equal(ci, [res_low, res_upp], decimal=6,
                                err_msg=repr(case) + method)
resultsVoiceAll = resultsAgg.loc[indVoiceAll]

# Color code for call type
callColor = {'Be': (0/255.0, 230/255.0, 255/255.0), 'LT': (0/255.0, 95/255.0, 255/255.0), 'Tu': (255/255.0, 200/255.0, 65/255.0), 'Th': (255/255.0, 150/255.0, 40/255.0), 
             'Di': (255/255.0, 105/255.0, 15/255.0), 'Ag': (255/255.0, 0/255.0, 0/255.0), 'Wh': (255/255.0, 180/255.0, 255/255.0), 'Ne': (255/255.0, 100/255.0, 255/255.0),
             'Te': (140/255.0, 100/255.0, 185/255.0), 'DC': (100/255.0, 50/255.0, 200/255.0), 'So': (0/255.0, 0/255.0, 0/255.0)}


plt.figure()
xvals = np.arange(len(indCallerAll))
width = 0.75          # the width of the bars
numbars = 2           # Number of bars per group
 
plt.subplot(131)
yerr = np.asarray([proportion_confint(count, nobs) for count, nobs in zip(resultsCallerAll.LDA_YES, resultsCallerAll.TestCount)])
b1 = plt.bar(xvals*numbars, resultsCallerAll.LDA, width, color='k', yerr=yerr.T)

yerr = np.asarray([proportion_confint(count, nobs) for count, nobs in zip(resultsVoiceAll.LDA_YES, resultsVoiceAll.TestCount)])
# xvalsV is defined in an earlier, elided part of this script
b2 = plt.bar(np.array(xvalsV)*numbars + width, resultsVoiceAll.LDA, width, color='b', yerr=yerr.T)
plt.xticks(numbars*xvals + width*numbars/2., sortedCallTypes)
plt.legend((b1[0], b2[0]), ('Within CV', 'Across CV'))
plt.title('Within Call Type vs Across Call Type')
plt.ylabel('LDA Performance % ')
plt.xlabel('Call Type')
myAxis = plt.axis()
myAxis = (myAxis[0], myAxis[1], 40.0, myAxis[3])
plt.axis(myAxis)

plt.subplot(132)
Example #56
def test_samplesize_confidenceinterval_prop():
    # Consistency test: samplesize_confint_proportion should recover nobs from the
    # half-width of the interval proportion_confint produced. smprop
    # (statsmodels.stats.proportion) and assert_almost_equal are assumed imported
    # at module level.
    nobs = 20
    ci = smprop.proportion_confint(12, nobs, alpha=0.05, method='normal')
    res = smprop.samplesize_confint_proportion(12./nobs, (ci[1] - ci[0]) / 2)
    assert_almost_equal(res, nobs, decimal=13)
Example #57
BusinessLongAccepted = len(data[((data.Segment == "BusinessLong") & (data.Accept == 1))])
BusinessShortAccepted = len(data[((data.Segment == "BusinessShort") & (data.Accept == 1))])
ConferenceAccepted = len(data[((data.Segment == "Conference") & (data.Accept == 1))])
OtherAccepted = len(data[((data.Segment == "Other") & (data.Accept == 1))])
VacationAccepted = len(data[((data.Segment == "Vacation") & (data.Accept == 1))])
TotalAccepted = len(data[(data.Accept == 1)])

import statsmodels.stats.proportion as sm

# Total and the per-segment denominators (BusinessLong, BusinessShort, Conference, ...)
# are computed in earlier, elided parts of this script
TotalAcceptedPercent = TotalAccepted / Total


print("BusinessLong")
print("Average: " + str(BusinessLongAccepted / BusinessLong))
BusinessLongCI = sm.proportion_confint(BusinessLongAccepted, BusinessLong)
print("Lower: " + str(BusinessLongCI[0]))
print("Upper: " + str(BusinessLongCI[1]))
print(sm.proportions_ztest(BusinessLongAccepted, BusinessLong, TotalAcceptedPercent)[1])

print("BusinessShort")
print("Average: " + str(BusinessShortAccepted / BusinessShort))
BusinessShortCI = sm.proportion_confint(BusinessShortAccepted, BusinessShort)
print("Lower: " + str(BusinessShortCI[0]))
print("Upper: " + str(BusinessShortCI[1]))
print(sm.proportions_ztest(BusinessShortAccepted, BusinessShort, TotalAcceptedPercent)[1])

print("Conference")
print("Average: " + str(ConferenceAccepted / Conference))
ConferenceCI = sm.proportion_confint(ConferenceAccepted, Conference)
print("Lower: " + str(ConferenceCI[0]))
Example #58
def get_proportion_confint_report(num_successes, num_trials, alpha=0.05, do_normal=True, do_agresti_coull=True,
                                  do_beta=False, do_wilson=True, do_jeffrey=False, do_binom_test=False):
    """ Get confidence intervals of proportion num_successes / num_trials using different methods in JSON-friendly form.

    :param num_successes: number of successes
    :type num_successes: int
    :param num_trials: number of trials or attempts
    :type num_trials: int
    :param alpha: significance used in statistical inferences
    :type alpha: float
    :param do_normal: True to get a confidence interval using the normal approximation method
    :type do_normal: bool
    :param do_agresti_coull: True to get a confidence interval using the Agresti-Coull method
    :type do_agresti_coull: bool
    :param do_beta: True to get a confidence interval using the beta method
    :type do_beta: bool
    :param do_wilson: True to get a confidence interval using the Wilson method
    :type do_wilson: bool
    :param do_jeffrey: True to get a confidence interval using the Jeffrey method
    :type do_jeffrey: bool
    :param do_binom_test: True to get a confidence interval using the binomial test method
    :type do_binom_test: bool
    :return: JSON-able dict report
    :rtype: dict

    Why do I use normal, agresti_coull and wilson by default?

    1) Normal, because it is the widely-used but flawed option.
    2) AgrestiCoull & Wilson, because Lawrence D. Brown, T. Tony Cai and Anirban DasGupta in their paper
        `Interval Estimation for a Binomial Proportion` say 'we recommend the Wilson interval for small n and the
        interval suggested in Agresti and Coull for larger n'

    The call to proportion_confint accepts the following methods:

    - `normal` : asymptotic normal approximation
    - `agresti_coull` : Agresti-Coull interval
    - `beta` : Clopper-Pearson interval based on Beta distribution
    - `wilson` : Wilson Score interval
    - `jeffrey` : Jeffrey's Bayesian Interval
    - `binom_test`

    """

    from collections import OrderedDict
    from statsmodels.stats.proportion import proportion_confint

    conf_int_reports = OrderedDict()

    # The six near-identical blocks collapse into one loop over
    # (flag, statsmodels method name, report label)
    method_specs = [(do_normal, 'normal', 'Normal'),
                    (do_agresti_coull, 'agresti_coull', 'AgrestiCoull'),
                    (do_beta, 'beta', 'Beta'),
                    (do_wilson, 'wilson', 'Wilson'),
                    (do_jeffrey, 'jeffrey', 'Jeffrey'),
                    (do_binom_test, 'binom_test', 'BinomTest')]

    for enabled, method, label in method_specs:
        if not enabled:
            continue
        conf_int = proportion_confint(num_successes, num_trials, alpha=alpha, method=method)
        conf_int_report = OrderedDict()
        conf_int_report['Lower'] = conf_int[0]
        conf_int_report['Upper'] = conf_int[1]
        conf_int_report['Alpha'] = alpha
        conf_int_report['Method'] = label
        conf_int_reports[label] = conf_int_report

    return conf_int_reports
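A short usage sketch for the report helper above; the counts are invented for illustration.

# Hypothetical call: 51 successes out of 235 trials, default method selection
report = get_proportion_confint_report(51, 235, alpha=0.05)
for method, entry in report.items():
    print(method, entry['Lower'], entry['Upper'])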
Example #59
        print("Conducted tests: {}".format(len(results)))
        print("Successes: {} of {}".format(sum(results), len(results)))
        print("Hypothesis accepted: {}".format(accepted_hypothesis))

        print_timing_info(info)
    elif _MODE == ESTIMATION:
        print("len: ", len(sc1.world.getDomain("plcs")))
        _, results, info = runner.run_trials(sc1, number_of_trials=NUM_REPETITIONS)

        successes = sum(results)
        nobs = len(results)
        ratio = successes / nobs
        print("ESTIMATION")
        print("Successes: {} of {}".format(successes, nobs))
        print("Ratio: {}".format(ratio))
        alpha = 0.05

        print("Confidence intervals, alpha={}: ".format(alpha))
        for method in ["normal", "agresti_coull", "beta", "wilson", "jeffrey"]:
            ci_low, ci_upp = proportion.proportion_confint(successes, nobs, alpha=alpha, method=method)
            print("   {} => [{}..{}]".format(method, ci_low, ci_upp))

        print_timing_info(info)

    elif _MODE == VISUALIZE:
        print("Visualize")
        verdict, info = sc1.run()

        print("Verdict: {}".format(verdict))
        print("Info: {}".format(info))