Example 1
def _threshold_counts(counts, thresholdLow, thresholdHigh, n_vec):
    y11 = []
    y11_upper_err = []
    y11_lower_err = []
    y10_01 = []
    y10_01_upper_err = []
    y10_01_lower_err = []
    y00 = []
    y00_upper_err = []
    y00_lower_err = []
    for countVec, N in zip(counts, n_vec):
        countVec = countVec[0:int(N)]
        p11 = sum([x >= thresholdHigh for x in countVec]) / N
        p10_01 = sum([((x < thresholdHigh) & (x > thresholdLow)) for x in countVec]) / N
        p00 = sum([x <= thresholdLow for x in countVec]) / N
        y11.append(p11)
        y10_01.append(p10_01)
        y00.append(p00)
        kci11 = binom.interval(0.68, N, p11) 
        kci10_01 = binom.interval(0.68, N, p10_01) 
        kci00 = binom.interval(0.68, N, p00) 
        y11_lower_err.append( p11 - kci11[0]/N )
        y11_upper_err.append( kci11[1]/N - p11 )
        y10_01_lower_err.append( p10_01 - kci10_01[0]/N )
        y10_01_upper_err.append( kci10_01[1]/N - p10_01 )
        y00_lower_err.append( p00 - kci00[0]/N )
        y00_upper_err.append( kci00[1]/N - p00 )
    return y11, y11_upper_err, y11_lower_err, y00, y00_upper_err, y00_lower_err, y10_01, y10_01_upper_err, y10_01_lower_err
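A minimal usage sketch for _threshold_counts above; the Poisson toy data, the thresholds, and the binom import from scipy.stats are illustrative assumptions:

import numpy as np
from scipy.stats import binom  # required inside _threshold_counts

# Two settings, 500 shots each; low/high Poisson rates mimic dark vs. bright readout.
rng = np.random.default_rng(0)
counts = [rng.poisson(2, size=500), rng.poisson(30, size=500)]
n_vec = [500, 500]

res = _threshold_counts(counts, thresholdLow=5, thresholdHigh=15, n_vec=n_vec)
y11, y11_upper_err, y11_lower_err = res[:3]
print(y11, y11_upper_err, y11_lower_err)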
Example 2
def error_bars(df):
    """
    Return the 68% (1-sigma) binomial confidence interval.
    pyplot.errorbar expects a 2xN array of unsigned offsets relative to the points.
    """
    errs = [binom.interval(0.68, n, p=k/n, loc=-k) / n for n, k in zip(df.Trials, df.Observations)]
    return np.abs(np.array(errs).T)
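A hedged usage sketch for error_bars; the Trials and Observations column names come from the function body, while the toy data and the plot are assumptions:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import binom  # required inside error_bars

df = pd.DataFrame({"Trials": [50, 100, 200], "Observations": [10, 55, 150]})
yerr = error_bars(df)  # 2xN array of unsigned offsets, one column per row of df
plt.errorbar(range(len(df)), df.Observations / df.Trials, yerr=yerr, fmt="o", capsize=4)
plt.show()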
Example 3
def plot_reliability_diagram(y,x,bins=np.linspace(0,1,21),size_points=False, show_baseline=True,
                                error_bars=True, error_bar_alpha = .05, marker='+',c='red', **kwargs):
    digitized_x = np.digitize(x, bins)
    mean_count_array = np.array([[np.mean(y[digitized_x == i]),len(y[digitized_x == i]),np.mean(x[digitized_x==i])] for i in np.unique(digitized_x)])
    x_pts_to_graph = mean_count_array[:,2]
    y_pts_to_graph = mean_count_array[:,0]
    pt_sizes = mean_count_array[:,1]
    plt.subplot(1,2,1)

    if show_baseline:
        plt.plot(np.linspace(0,1,100),(np.linspace(0,1,100)),'k--')
    # plt.scatter is vectorized, so a single call draws every bin at once
    if size_points:
        plt.scatter(x_pts_to_graph,y_pts_to_graph,s=pt_sizes,marker=marker,c=c, **kwargs)
    else:
        plt.scatter(x_pts_to_graph,y_pts_to_graph, c=c, **kwargs)
    plt.axis([-0.1,1.1,-0.1,1.1])
    
    if error_bars:
        yerr_mat = binom.interval(1-error_bar_alpha,pt_sizes,x_pts_to_graph)/pt_sizes - x_pts_to_graph
        yerr_mat[0,:] = -yerr_mat[0,:]
        plt.errorbar(x_pts_to_graph, x_pts_to_graph, yerr=yerr_mat, capsize=5)
    plt.subplot(1,2,2)
    plt.hist(x,bins=bins)

    return(x_pts_to_graph,y_pts_to_graph,pt_sizes)
Example 4
def check(N, p):
    global numfails, numchecks, mu, sigma2
    H = NeuronGroup(1, 'v:1', threshold='False', name='H')
    G = NeuronGroup(N, 'v:1', threshold='False', name='G')
    S = Synapses(H, G, on_pre='v+=w', name='S')
    S.connect(p=p)
    m = len(S)
    low, high = binom.interval(alpha, N, p)
    if p==0:
        low = high = 0
    elif p==1:
        low = high = N
    else:
        i = diff(S.j[:])
        i = i[i<isi_max[p]]
        b = bincount(i, minlength=isi_max[p])[:isi_max[p]]
        if b[0]:
            print('Major error: repeated indices for N=%d, p=%.3f' % (N, p))
            raise ValueError("Repeated indices")
        isi[p] += b
        num_isi[p] += sum(b)
    q = binom.cdf(low-0.1, N, p)+binom.sf(high+0.1, N, p)
    mu += q
    sigma2 += q*(1-q)
    numchecks += 1
    if m<low or m>high:
        numfails += 1
        return True
    else:
        return False
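The q accumulated in check() is the exact probability mass outside the reported interval; because the binomial is discrete, it is usually somewhat below 1 - alpha. A standalone illustration with arbitrary values:

from scipy.stats import binom

alpha, N, p = 0.99, 1000, 0.3
low, high = binom.interval(alpha, N, p)
# The 0.1 offsets keep the integer endpoints themselves counted as inside
# the interval despite floating-point rounding.
q = binom.cdf(low - 0.1, N, p) + binom.sf(high + 0.1, N, p)
print(low, high, q)  # q <= 1 - alpha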
Example 6
    def plot_ranks(self,
                   x=None,
                   p=None,
                   nbins=None,
                   figsize=6,
                   testing_fn=None):
        if testing_fn is None:
            testing_fn = testing_fn_generator(x, p)
        ranks = zeros(p.shape)
        N, n = p.shape
        for i, m in enumerate(self.predict(testing_fn)):
            ranks[i] = sum(m["samples"] < p[i].reshape((1, -1)), axis=0)
        if nbins is None:
            nbins = int(N**.5)
        while self.n_samples % nbins:
            nbins += 1
        f, ax = subplots(n,
                         1,
                         figsize=(figsize, 2 / 3 * figsize * n),
                         gridspec_kw={"hspace": 0.3})
        if n == 1:
            ax = [ax]
        interval = binom.interval(0.9, N, 1 / nbins)
        edges = linspace(0, self.n_samples, nbins + 1)
        for i in range(n):
            ax[i].hist(ranks[:, i], edges)
            ax[i].axhspan(*interval, color="r", alpha=0.3)
            ax[i].set_title(self.labels[i])
            ax[i].set_xlabel("rank of truth among posterior samples")
Example 7
def qqplot(data, labels, n_quantiles=100, alpha=0.95, error_type='theoretical', distribution = 'binomial', log10conv=True, color=['k', 'r', 'b'], fill_dens=[0.1, 0.1, 0.1], type = 'uniform', title='title'):
    '''
    Function for plotting Quantile Quantile (QQ) plots with confidence interval (CI)
    :param data: NumPy 1D array with data
    :param labels:
    :param type: type of the plot
    :param n_quantiles: number of quantiles to plot
    :param alpha: confidence level for the interval
    :param log10conv: conversion to -log10(p) for the figure
    :return: nothing
    '''
    xmax = 0
    ymax = 0
    if type == 'uniform':
        # we expect distribution from 0 to 1
        for j in range(len(data)):
            # define quantiles positions:
            q_pos = np.concatenate([np.arange(99.)/len(data[j]), np.logspace(-np.log10(len(data[j]))+2, 0, n_quantiles)])
            # define quantiles in data
            q_data = mquantiles(data[j], prob=q_pos, alphap=0, betap=1, limit=(0, 1)) # linear interpolation
            # define theoretical predictions
            q_th = q_pos.copy()
            # evaluate errors
            q_err = np.zeros([len(q_pos),2])
            if np.sum(alpha) > 0:
                for i in range(0, len(q_pos)):
                    if distribution == 'binomial':
                        q_err[i, :] = binom.interval(alpha=alpha, n=len(data[j]), p=q_pos[i])
                    elif distribution == 'normal':
                        q_err[i, :] = norm.interval(alpha, len(data[j])*q_pos[i], np.sqrt(len(data[j])*q_pos[i]*(1.-q_pos[i])))
                        q_err[i, q_err[i, :] < 0] = 1e-12
                    else:
                        print('Distribution is not defined!')
                q_err /= 1.0*len(data[j])
                for i in range(0, 100):
                    q_err[i,:] += 1e-12
            # print(q_err[100:, :])
            slope, intercept, r_value, p_value, std_err = linregress(q_th, q_data)
            # print(labels[j], ' -- Slope: ', slope, " R-squared:", r_value**2)
            plt.plot(-np.log10(q_th[n_quantiles-1:]), -np.log10(q_data[n_quantiles-1:]), '-', color=color[j])
            plt.plot(-np.log10(q_th[:n_quantiles]), -np.log10(q_data[:n_quantiles]), '.', color=color[j], label=labels[j])
            xmax = np.max([xmax, - np.log10(q_th[1])])
            ymax = np.max([ymax, - np.log10(q_data[0])])
            # print(- np.log10(q_th[:]))
            if np.sum(alpha)>0:
                if error_type=='experimental':
                    plt.fill_between(-np.log10(q_th), -np.log10(q_data/q_th*q_err[:,0]), -np.log10(q_data/q_th*q_err[:,1]), color=color[j], alpha=fill_dens[j], label='%1.3f CI'%alpha)
        if np.sum(alpha)>0:
            if error_type=='theoretical':
                plt.fill_between(-np.log10(q_th), -np.log10(q_err[:,0]), -np.log10(q_err[:,1]), color=color[j], alpha=fill_dens[j], label='%1.3f CI'%alpha)
    plt.legend(loc=4)
    plt.xlabel('Theoretical -log10')
    plt.ylabel('Experimental -log10')
    plt.plot([0, 100], [0, 100],'--k')
    # print(xmax,ymax)
    plt.xlim([0, np.ceil(xmax)])
    plt.ylim([0, np.ceil(ymax*1.05)])
    plt.title(title)
    plt.tight_layout()
Example 8
def isRatioSignificantlyCorrect(variant_users, total_users, division,
                                threshold):
    alpha = threshold  # significance level

    lower_bound, upper_bound = binom.interval(1 - alpha, total_users, division)

    isSig = (variant_users > lower_bound and variant_users < upper_bound)

    return isSig, lower_bound, upper_bound
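A quick usage sketch with made-up numbers: 520 variant users out of 1000 under an intended 50/50 split, tested at a 5% significance level:

from scipy.stats import binom

is_sig, lo, hi = isRatioSignificantlyCorrect(
    variant_users=520, total_users=1000, division=0.5, threshold=0.05)
print(is_sig, lo, hi)  # True with an interval of roughly (469, 531)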
Example 9
def proportion_ci(p, n, alpha=0.05):
    """
    Create credible intervals for percentage data

    :param p:
    :param n:
    :param alpha:
    :return:
    """
    from scipy.stats import binom
    lower, upper = binom.interval(1 - alpha, n, p)
    return lower / n, upper / n
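For example, a 95% interval for an observed proportion of 0.3 over 200 trials (output is approximate):

lo, hi = proportion_ci(p=0.3, n=200, alpha=0.05)
print(lo, hi)  # roughly (0.235, 0.365)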
Example 10
    def get_proper_hets(self):

        # just the binomial filter for now; add more filters later
        # binom.interval returns bounds on the success *count*, so divide by the
        # number of samples to get allele-frequency bounds comparable to the AF field
        tmp_interval = binom.interval(0.95, self.__num_samples, 0.5)
        interval = [tmp_interval[0] / self.__num_samples, tmp_interval[1] / self.__num_samples]


        proper_variants = []
        for variant in self.__vcf_file:
            if(len(variant.ALT) == 1 and
                    variant.INFO.get('AF') >= interval[0] and
                    variant.INFO.get('AF') <= interval[1]):
                proper_variants.append(variant)

        return proper_variants
Example 11
def test2():
    count = 1000
    visit_params = [0.3, 0.6]
    ds = generate_visits(count, visit_params)

    assert (len(ds) == count)
    assert (len(ds[0]) == len(visit_params))
    sums = np.sum(ds, axis=0)
    print(sums.shape)
    for i, col in enumerate(ds[0]):
        assert (col == 1 or col == 0)
        pct = visit_params[i]
        (lo, hi) = binom.interval(.954, count, pct)
        print("sums", i, lo, sums[i], hi)
        assert (lo <= sums[i] <= hi)
Example 13
    def test_sample_probs(self):
        probs = [0.3, 0.6, 0.1]

        token_sampler = TokenSampler(
            batch_size=1,
            banned_tokens_ids=[],
            non_penalizable_tokens_ids=range(len(probs)),
            repetition_penalization_coefficient=REPETITION_PENALIZE_COEFFICIENT)
        adjusted_confidence_level = _CONFIDENCE_LEVEL / len(probs)  # bonferroni correction
        confidence_intervals = [binom.interval(1 - adjusted_confidence_level, _SAMPLES_NUM, p) for p in probs]
        est_probs_from, est_probs_to = zip(*confidence_intervals)
        samples = np.array([token_sampler.sample(probs, 0) for _ in range(_SAMPLES_NUM)])
        counts = {val: np.sum(samples == val) for val in np.unique(samples)}

        for i, _ in enumerate(probs):
            self.assertLessEqual(counts[i], est_probs_to[i])
            self.assertGreaterEqual(counts[i], est_probs_from[i])
Example 14
def stat_bursty_tweets(time_window_tweets, expectation_features,
                       features_to_find):
    TP = 0
    FP = 0
    TN = 0
    FN = 0

    N = len(time_window_tweets)
    tokenized_tweets = [(tweet['id'], tokenize_tweet(tweet['full_text']))
                        for tweet in time_window_tweets]

    bag_of_feature = {}
    for id_flist in tokenized_tweets:
        for feature in id_flist[1]:
            bag_of_feature[feature] = bag_of_feature.get(feature, set())
            bag_of_feature[feature].add(id_flist[0])

    for feature in bag_of_feature:

        # Determine whether the feature matches the ground truth
        is_bursty = bl_rt(feature, features_to_find)

        n_feature_appear_in = len(bag_of_feature[feature])

        feature_info = expectation_features.get(feature, [0, 0])
        expected = feature_info[1]

        ra = math.floor(expected * N)  # approximate mode (peak) of the expected count
        rb = binom.interval(0.999, N, expected)[1]  # count beyond which the pmf is (almost) 0
        q = (rb + ra) / 2

        if (n_feature_appear_in >= q):
            if is_bursty:
                TP += 1
            else:
                FP += 1
        else:
            if is_bursty:
                FN += 1
            else:
                TN += 1

    return TP, FP, TN, FN
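The burst threshold q is the midpoint between the approximate mode of the expected count (ra) and the 99.9% upper bound (rb). A standalone sketch of that computation with made-up numbers:

import math
from scipy.stats import binom

N = 5000         # tweets in the window
expected = 0.01  # expected per-tweet probability of the feature

ra = math.floor(expected * N)               # approximate mode of Binomial(N, expected)
rb = binom.interval(0.999, N, expected)[1]  # count beyond which the pmf is nearly 0
q = (ra + rb) / 2
print(ra, rb, q)  # a feature appearing in >= q tweets is flagged as bursty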
Example 16
    def test_repetition_penalization(self):
        probs = [0.5, 0.5]

        actual_num_nonequal_pairs = 0
        for _ in range(_SAMPLES_NUM):
            token_sampler = TokenSampler(
                batch_size=1,
                banned_tokens_ids=[],
                non_penalizable_tokens_ids=[],
                repetition_penalization_coefficient=REPETITION_PENALIZE_COEFFICIENT)
            first_token = token_sampler.sample(probs, sample_idx=0)
            second_token = token_sampler.sample(probs, sample_idx=0)
            actual_num_nonequal_pairs += int(first_token != second_token)

        # P(first != second) = P(first=0, second=1) + P(first=1, second=0) =
        # = 0.5 * 0.5 * r / (0.5 + 0.5 * r) + 0.5 * 0.5 * r / (0.5 + 0.5 * r) = r / (1 + r)
        expected_nonequal_pair_rate = REPETITION_PENALIZE_COEFFICIENT / (1 + REPETITION_PENALIZE_COEFFICIENT)
        expected_nonequal_pair_rate_from, expected_nonequal_pair_rate_to = \
            binom.interval(1 - _CONFIDENCE_LEVEL, _SAMPLES_NUM, expected_nonequal_pair_rate)
        self.assertLessEqual(actual_num_nonequal_pairs, expected_nonequal_pair_rate_to)
        self.assertGreaterEqual(actual_num_nonequal_pairs, expected_nonequal_pair_rate_from)
Example 17
    def data_for_visualization(self):
        df = self.data
        df = df[df.dimension.notnull() & df.value.notnull()]
        output = dict()

        # Set title
        output['title'] = self.title

        # Set levels
        levels = sorted(df.value.unique().tolist())
        level_s = [str(v) for v in levels]
        output['levels'] = level_s

        # Compute overall freq and cumulative frequency
        overall_f = self.freq_from_df(df, levels)
        overall_cum = self.cumulative_frequency(overall_f)
        output['overall_f'] = overall_f
        output['overall_cum'] = overall_cum

        # Compute dimensions
        dimensions = list()
        for dim in df.dimension.unique().tolist():
            df_d = df[df.dimension == dim]
            pop_size = len(df_d)
            freq = self.freq_from_df(df_d, levels)
            cum = self.cumulative_frequency(freq)
            ranges = binom.interval(0.95, pop_size, overall_cum)
            range_low = [x / pop_size for x in ranges[0]]
            range_high = [x / pop_size for x in ranges[1]]
            dimensions.append(
                {'name': dim,
                 'freq': freq,
                 'cum': cum,
                 'overall_range_low': range_low,
                 'overall_range_high': range_high,
                 })
        output['dimensions'] = dimensions

        return output
Example 18
    def test_nonpenalizable_tokens_2(self):
        probs = [0.5, 0.5]

        actual_num_nonequal_pairs = 0
        samples_generated = 0
        while samples_generated < _SAMPLES_NUM:
            token_sampler = TokenSampler(
                batch_size=1,
                banned_tokens_ids=[],
                non_penalizable_tokens_ids=[1],
                repetition_penalization_coefficient=REPETITION_PENALIZE_COEFFICIENT)
            first_token = token_sampler.sample(probs, sample_idx=0)
            if first_token == 0:
                samples_generated += 1
                second_token = token_sampler.sample(probs, sample_idx=0)
                actual_num_nonequal_pairs += (first_token != second_token)

        # When we penalize for token#0, P(first != second | first=0) = P(second=1 | first=0) = 0.5 * r / (0.5 + 0.5 * r) = r / (1 + r)
        expected_nonequal_pair_rate = REPETITION_PENALIZE_COEFFICIENT / (1 + REPETITION_PENALIZE_COEFFICIENT)
        expected_nonequal_pair_rate_from, expected_nonequal_pair_rate_to = binom.interval(
            1 - _CONFIDENCE_LEVEL, _SAMPLES_NUM, expected_nonequal_pair_rate)
        self.assertLessEqual(actual_num_nonequal_pairs, expected_nonequal_pair_rate_to)
        self.assertGreaterEqual(actual_num_nonequal_pairs, expected_nonequal_pair_rate_from)
Example 19
def Plot_the_distribution_MDs(GSCs_data, Non_GCSs_data, bins, MDs_lengths,
                              genome_len, path_out):
    #Parameters
    ticks1 = [
        0, 500000, 1000000, 1500000, 2000000, 2500000, 3000000, 3500000,
        4000000, 4500000
    ]
    xticknames1 = [
        '', '500', '1000', '1500', '2000', '2500', '3000', '3500', '4000',
        '4500'
    ]
    colors = [
        '#7FCE79', '#BAE85C', '#ff878b', '#8991ff', '#ac5eff', '#50b3ff',
        '#ffd75e'
    ]
    plot_names = [
        'plot1', 'plot2', 'plot3', 'plot4', 'plot5', 'plot6', 'plot7'
    ]
    Y_labels = [
        'Cfx GCSs', 'RifCfx GCSs', 'Micro GCSs', 'Oxo GCSs', 'Score', 'GC%',
        'Transcription\nlevel'
    ]
    yter = 1592477
    yori = 3711828

    #Prepare GCSs data in a bar-compatible format.
    GCSs_data_bared = {}

    for ab, ab_ar in GSCs_data.items():
        bar_ar = []
        for i in range(len(bins) - 1):
            ab_num = 0
            if bins[i] >= 0:
                for gcs in ab_ar:
                    if bins[i + 1] > gcs >= bins[i]:
                        ab_num += 1
                bar_ar.append(ab_num)
            else:
                for gcs in ab_ar:
                    if bins[i + 1] > gcs >= 0 or genome_len > gcs >= bins[
                            i] + genome_len:
                        ab_num += 1
                bar_ar.append(ab_num)
        bar_ar.append(bar_ar[0])
        print(ab, bar_ar)
        GCSs_data_bared[ab] = bar_ar

    #Compute confidence intervals for the number of GCSs falling into MDs.
    MDs_confident_intervals = {}
    for ab, ab_ar in GSCs_data.items():
        upper_edge = []
        lower_edge = []
        print(MDs_lengths)
        for MD_len in MDs_lengths:
            ci_low, ci_high = binom.interval(0.999, len(ab_ar),
                                             MD_len / genome_len)
            lower_edge.append(ci_low)
            upper_edge.append(ci_high)
        upper_edge = [upper_edge[0]] + upper_edge + [upper_edge[0]] + [upper_edge[0]]
        lower_edge = [lower_edge[0]] + lower_edge + [lower_edge[0]] + [lower_edge[0]]
        MDs_confident_intervals[ab] = [lower_edge, upper_edge]
        print(ab, lower_edge)
        print(ab, upper_edge)

    #GCSs data plotting.
    fig, plot_names = plt.subplots(7, 1, figsize=(11, 15), dpi=100)
    print(bins)
    position = [0] + bins[1:]
    print(position)
    bin_width = []
    position_bw = position + [genome_len]
    for j in range(len(position_bw) - 1):
        bin_width.append(position_bw[j + 1] - position_bw[j])
    print(bin_width)
    position_centre = []
    for j in range(len(bin_width)):
        position_centre.append(int(position[j] + (bin_width[j] / 2)))
    position_centre = [0] + position_centre + [genome_len]

    i = 0
    Histo_comp_dict = {}  # will hold computed histogram data (bins and values)
    for key, value in GCSs_data_bared.items():
        plot_names[i].set_xlim(0, genome_len)
        plot_names[i].set_xticks(ticks1, minor=False)
        plot_names[i].set_xticks([yter, yori], minor=True)
        plot_names[i].set_xticklabels(xticknames1)
        plt.setp(plot_names[i].set_xticklabels(xticknames1),
                 rotation=0,
                 fontsize=14)
        plot_names[i].locator_params(axis='y', nbins=6)
        plot_names[i].tick_params(axis='x', which='major', labelsize=19)
        plot_names[i].bar(position,
                          value,
                          bin_width,
                          color=colors[i],
                          linewidth=1,
                          edgecolor='black',
                          align='edge',
                          zorder=1)  #Barplot for GCSs number
        plot_names[i].plot(position_centre,
                           MDs_confident_intervals[key][0],
                           linestyle=":",
                           color="black",
                           linewidth=1,
                           zorder=8)  # lower confidence bound
        plot_names[i].plot(position_centre,
                           MDs_confident_intervals[key][1],
                           linestyle=":",
                           color="black",
                           linewidth=1,
                           zorder=9)  # upper confidence bound
        plot_names[i].plot(position_centre,
                           MDs_confident_intervals[key][0],
                           marker="_",
                           color="black",
                           linewidth=0,
                           markersize=15,
                           zorder=6)  # lower confidence bound (tick markers)
        plot_names[i].plot(position_centre,
                           MDs_confident_intervals[key][1],
                           marker="_",
                           color="black",
                           linewidth=0,
                           markersize=15,
                           zorder=7)  # upper confidence bound (tick markers)
        plot_names[i].fill_between(position_centre,
                                   MDs_confident_intervals[key][0],
                                   MDs_confident_intervals[key][1],
                                   facecolor='grey',
                                   alpha=0.3,
                                   zorder=10)  # shade the confidence interval
        plot_names[i].tick_params(axis='y', which='major', pad=7, labelsize=15)
        plot_names[i].set_ylabel(Y_labels[i], size=22, labelpad=8, rotation=90)
        i += 1

    #Score, GC, Transcription plotting.
    for key, value in Non_GCSs_data.items():
        value.append(value[0])
        plot_names[i].set_xlim(0, genome_len)
        if key == "GC":
            plot_names[i].set_ylim(45, max(value) + 2)
        elif key == "Score":
            plot_names[i].set_ylim(min(value) - 0.2, -1.5)
        plot_names[i].set_xticks(ticks1, minor=False)
        plot_names[i].set_xticks([yter, yori], minor=True)
        plot_names[i].set_xticklabels(xticknames1)
        plt.setp(plot_names[i].set_xticklabels(xticknames1),
                 rotation=0,
                 fontsize=14)
        plot_names[i].tick_params(axis='x', which='major', labelsize=19)
        plot_names[i].locator_params(axis='y', nbins=6)
        plot_names[i].bar(position,
                          value,
                          bin_width,
                          color=colors[i],
                          linewidth=1,
                          edgecolor='black',
                          align='edge')
        plot_names[i].tick_params(axis='y', which='major', pad=7, labelsize=15)
        plot_names[i].set_ylabel(Y_labels[i], size=22, labelpad=8, rotation=90)
        i += 1
    plt.tight_layout()
    fig.savefig(path_out +
                "GCSs_num_score_GC133_transcription_distrib_thr_genome.png",
                figsize=(11, 15),
                dpi=400)

    #GCSs data plotting for Cfx, Micro, and Oxo only.
    GSCs_data_main = {
        'Cfx': GCSs_data_bared['Cfx'],
        'Micro': GCSs_data_bared['Micro'],
        'Oxo': GCSs_data_bared['Oxo']
    }
    Y_labels_main = ['Cfx GCSs', 'Micro GCSs', 'Oxo GCSs']
    colors_main = ['#7FCE79', '#ff878b', '#8991ff']
    fig, plot_names = plt.subplots(3, 1, figsize=(11, 7), dpi=100)
    i = 0
    for key, value in GSCs_data_main.items():
        plot_names[i].set_xlim(0, genome_len)
        plot_names[i].set_xticks(ticks1, minor=False)
        plot_names[i].set_xticks([yter, yori], minor=True)
        plot_names[i].set_xticklabels(xticknames1)
        plt.setp(plot_names[i].set_xticklabels(xticknames1),
                 rotation=0,
                 fontsize=14)
        plot_names[i].locator_params(axis='y', nbins=6)
        plot_names[i].tick_params(axis='x', which='major', labelsize=19)
        plot_names[i].bar(position,
                          value,
                          bin_width,
                          color=colors_main[i],
                          linewidth=1,
                          edgecolor='black',
                          align='edge')
        plot_names[i].tick_params(axis='y', which='major', pad=7, labelsize=15)
        plot_names[i].set_ylabel(Y_labels_main[i],
                                 size=22,
                                 labelpad=8,
                                 rotation=90)
        i += 1
    plt.tight_layout()
    fig.savefig(path_out + "GCSs_number_Cfx_Micro_Oxo_distrib_thr_genome.png",
                figsize=(11, 7),
                dpi=400)
    return GCSs_data_bared, Non_GCSs_data
Example 20

  def generate_visits(self, count):
    dataset = []
    for _ in range(count):
      obs = []
      for param in self.visitor_params:
        val = 1 if random.random() < param else 0
        obs.append(val)
      dataset.append(obs)
    return dataset

if __name__ == "__main__":
  count = 1000
  visitor_params = [0.3, 0.6]
  vm = VisitorModel(["a","b"], visitor_params)

  ds = vm.generate_visits(count)

  assert(len(ds) == count)
  assert(len(ds[0]) == len(visitor_params))
  sums = np.sum(ds, axis=0)
  print(sums.shape)
  for i, col in enumerate(ds[0]):
    assert (col == 1 or col == 0)
    pct = visitor_params[i]
    (lo, hi) = binom.interval(.954, count, pct)
    print("sums", i, lo, sums[i], hi)
    assert (lo <= sums[i] <= hi)

Example 21

for N in N_range:
    print('Starting N =', N)
    for p in p_range:
        num_Np_fails = 0
        num_Np_checks = 0
        for _ in range(repeats):
            if check(N, p):
                num_Np_fails += 1
            num_Np_checks += 1
        # work out what the failure probability is (approximately but not exactly 1-alpha
        # because it's a discrete distribution)
        low, high = binom.interval(alpha, N, p)
        if p==0:
            low = high = 0
        elif p==1:
            low = high = N
        q = binom.cdf(low-0.1, N, p)+binom.sf(high+0.1, N, p)
        low, high = binom.interval(alpha, num_Np_checks, q)
        if q==0:
            low = high = 0
        if num_Np_fails<low or num_Np_fails>high:
            print('N=%d, p=%.3f failed %d of %d checks, outside range (%d, %d)' %
                  (N, p, num_Np_fails, num_Np_checks, low, high))
print()
failrate = float(numfails)/numchecks
low, high = norm.interval(alpha, loc=mu, scale=sqrt(sigma2))
print('%d/%d=%.2f%% failed at %d%%' % (numfails, numchecks, numfails*100.0/numchecks, 100*alpha))
Example 22
def plot_sbc(theta_samples,
             theta_test,
             param_names,
             bins=20,
             figsize=(15, 5),
             interval=0.99,
             show=True,
             filename=None,
             font_size=12):
    """
    Plots the simulation-based posterior checking histograms as advocated by Talts et al. (2018).
    """

    # Plot settings
    plt.rcParams['font.size'] = font_size
    N = int(theta_test.shape[0])

    # Prepare figure
    if len(param_names) >= 6:
        n_col = int(np.ceil(len(param_names) / 2))
        n_row = 2
    else:
        n_col = int(len(param_names))
        n_row = 1
    # Initialize figure
    f, axarr = plt.subplots(n_row, n_col, figsize=figsize)
    if n_row > 1:
        axarr = axarr.flat

    # Compute ranks (using broadcasting)
    ranks = np.sum(theta_samples < theta_test[:, np.newaxis, :], axis=1)

    # Compute interval
    endpoints = binom.interval(interval, N, 1 / (bins + 1))

    # Plot histograms
    for j in range(len(param_names)):

        # Add interval
        axarr[j].axhspan(endpoints[0],
                         endpoints[1],
                         facecolor='gray',
                         alpha=0.3)
        axarr[j].axhline(np.mean(endpoints), color='gray', zorder=0, alpha=0.5)

        # note: sns.distplot is deprecated as of seaborn 0.11; sns.histplot replaces it
        sns.distplot(ranks[:, j],
                     kde=False,
                     ax=axarr[j],
                     color='#a34f4f',
                     hist_kws=dict(edgecolor="k", linewidth=1, alpha=1.),
                     bins=bins)

        axarr[j].set_title(param_names[j])
        axarr[j].spines['right'].set_visible(False)
        axarr[j].spines['top'].set_visible(False)
        if j == 0:
            axarr[j].set_xlabel('Rank statistic')
        axarr[j].get_yaxis().set_ticks([])

    f.tight_layout()

    # Show, if specified
    if show:
        plt.show()
    # Save if specified
    if filename is not None:
        f.savefig("figures/{}_sbc.png".format(filename), dpi=600)
Example 23
def test_issue_11134():
    alpha, n, p = 0.95, 10, 0
    assert_equal(binom.interval(alpha=alpha, n=n, p=p), (0, 0))
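The test above pins the degenerate case fixed in that issue: with p=0 every draw is zero, so the interval collapses to a point. The same reasoning applies at p=1 (a hedged check for SciPy versions that include the fix):

from scipy.stats import binom

print(binom.interval(0.95, 10, 0))  # (0.0, 0.0): all mass at zero successes
print(binom.interval(0.95, 10, 1))  # (10.0, 10.0): all mass at n successes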
Example 24
def plot_reliability_diagram(y,x,bins=np.linspace(0,1,21),size_points=False,
                             show_baseline=True, error_bars=True,
                             error_bar_alpha=.05, show_histogram=False,
                             c='red', **kwargs):
    """Plots a reliability diagram of predicted vs empirical probabilities.

    
    Parameters
    ----------
    y: array-like, length (n_samples). The true outcome values as integers (0 or 1)

    x: The predicted probabilities, between 0 and 1 inclusive.

    bins: array-like, the endpoints of the bins used to aggregate and estimate the
        empirical probabilities.  Default is 20 equally sized bins
        from 0 to 1, i.e. [0, 0.05, 0.1, ..., 0.95, 1].

    size_points: scale the size of the plotted points to reflect the number of
        data points in the bin.  This may not work well if some bins are much
        larger than others.  Default is False.

    show_baseline: whether or not to print a dotted black line representing
        y=x (perfect calibration).  Default is True

    error_bars: whether to show error bars reflecting the confidence
        interval under the assumption that the input probabilities are
        perfectly calibrated. Default is True.

    error_bar_alpha: The alpha value to use for the error_bars.  Default
        is .05 (a 95% CI).  Confidence intervals are based on the exact
        binomial distribution, not the normal approximation.

    show_histogram: Whether or not to show a separate histogram of the
        number of values in each bin.  Default is False

    c: color of the plotted points.  Default is 'red'.

    **kwargs: additional args to be passed to the plt.scatter matplotlib call.

    Returns
    -------
    A tuple of the x_values, y_values, and associated bin_counts for each of
    the points in the plot.
    """
    digitized_x = np.digitize(x, bins)
    mean_count_array = np.array([[np.mean(y[digitized_x == i]),
                                  len(y[digitized_x == i]),
                                  np.mean(x[digitized_x==i])] 
                                  for i in np.unique(digitized_x)])
    x_pts_to_graph = mean_count_array[:,2]
    y_pts_to_graph = mean_count_array[:,0]
    pt_sizes = mean_count_array[:,1]
    if show_histogram:
        plt.subplot(1,2,1)
    if show_baseline:
        plt.plot(np.linspace(0,1,100),(np.linspace(0,1,100)),'k--')
    # plt.scatter is vectorized, so a single call draws every bin at once
    if size_points:
        plt.scatter(x_pts_to_graph, y_pts_to_graph,
                    s=pt_sizes, c=c, **kwargs)
    else:
        plt.scatter(x_pts_to_graph,y_pts_to_graph, c=c, **kwargs)
    plt.axis([-0.1,1.1,-0.1,1.1])
    plt.xlabel('Predicted')
    plt.ylabel('Empirical')
    if error_bars:
        yerr_mat = binom.interval(1-error_bar_alpha,pt_sizes,x_pts_to_graph)/pt_sizes - x_pts_to_graph
        yerr_mat[0,:] = -yerr_mat[0,:]
        plt.errorbar(x_pts_to_graph, x_pts_to_graph, yerr=yerr_mat, capsize=5)
    if show_histogram:
        plt.subplot(1,2,2)
        plt.hist(x,bins=bins)
    return(x_pts_to_graph,y_pts_to_graph,pt_sizes)
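A usage sketch with synthetic, perfectly calibrated predictions (sizes and seed are arbitrary); the points should then hug the diagonal and mostly fall within the error bars:

import numpy as np
import matplotlib.pyplot as plt

rng = np.random.default_rng(0)
x = rng.uniform(0, 1, size=5000)                    # predicted probabilities
y = (rng.uniform(0, 1, size=5000) < x).astype(int)  # outcomes drawn at those probabilities

plot_reliability_diagram(y, x, show_histogram=True)
plt.show()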
Example 25
def plot_reliability_diagram(y,
                             x,
                             bins=np.linspace(0, 1, 21),
                             show_baseline=True,
                             error_bars=True,
                             error_bar_alpha=.05,
                             show_histogram=False,
                             scaling='none',
                             scaling_eps=.0001,
                             scaling_base=10,
                             c='red',
                             **kwargs):
    """Plots a reliability diagram of predicted vs empirical probabilities.

    
    Parameters
    ----------
    y: array-like, length (n_samples). The true outcome values as integers (0 or 1)

    x: The predicted probabilities, between 0 and 1 inclusive.

    bins: array-like, the endpoints of the bins used to aggregate and estimate the
        empirical probabilities.  Default is 20 equally sized bins
        from 0 to 1, i.e. [0, 0.05, 0.1, ..., 0.95, 1].

    show_baseline: whether or not to print a dotted black line representing
        y=x (perfect calibration).  Default is True

    error_bars: whether to show error bars reflecting the confidence
        interval under the assumption that the input probabilities are
        perfectly calibrated. Default is True.

    error_bar_alpha: The alpha value to use for the error_bars.  Default
        is .05 (a 95% CI).  Confidence intervals are based on the exact
        binomial distribution, not the normal approximation.

    show_histogram: Whether or not to show a separate histogram of the
        number of values in each bin.  Default is False

    scaling: Default is 'none'. Alternative is 'logit' which is useful for
        better examination of calibration near 0 and 1.  Values shown are
        on the scale provided and then tick marks are relabeled.

    scaling_eps: default is .0001.  Ignored unless scaling='logit'. This 
        indicates the smallest meaningful positive probability you
        want to consider.

    scaling_base: default is 10. Ignored unless scaling='logit'. This
        indicates the base used when scaling back and forth.  Matters
        only in how it affects the automatic tick marks.

    c: color of the plotted points.  Default is 'red'.

    **kwargs: additional args to be passed to the plt.scatter matplotlib call.

    Returns
    -------
    A dictionary containing the x and y points plotted (unscaled) and the 
        count in each bin.
    """
    digitized_x = np.digitize(x, bins)
    mean_count_array = np.array([[
        np.mean(y[digitized_x == i]),
        len(y[digitized_x == i]),
        np.mean(x[digitized_x == i])
    ] for i in np.unique(digitized_x)])
    x_pts_to_graph = mean_count_array[:, 2]
    y_pts_to_graph = mean_count_array[:, 0]
    bin_counts = mean_count_array[:, 1]
    if show_histogram:
        plt.subplot(1, 2, 1)
    if scaling == 'logit':
        x_pts_to_graph_scaled = my_logit(x_pts_to_graph,
                                         eps=scaling_eps,
                                         base=scaling_base)
        y_pts_to_graph_scaled = my_logit(y_pts_to_graph,
                                         eps=scaling_eps,
                                         base=scaling_base)
        prec_int = np.max([
            -np.floor(np.min(x_pts_to_graph_scaled)),
            np.ceil(np.max(x_pts_to_graph_scaled))
        ])
        prec_int = np.max([prec_int, -np.floor(np.log10(scaling_eps))])
        low_mark = -prec_int
        high_mark = prec_int
        if show_baseline:
            plt.plot([low_mark, high_mark], [low_mark, high_mark], 'k--')
        plt.scatter(x_pts_to_graph_scaled,
                    y_pts_to_graph_scaled,
                    c=c,
                    **kwargs)
        locs, labels = plt.xticks()
        labels = np.round(my_logistic(locs, base=scaling_base), decimals=4)
        plt.xticks(locs, labels)
        locs, labels = plt.yticks()
        labels = np.round(my_logistic(locs, base=scaling_base), decimals=4)
        plt.yticks(locs, labels)
        if error_bars:
            prob_range_mat = binom.interval(1 - error_bar_alpha, bin_counts,
                                            x_pts_to_graph) / bin_counts
            yerr_mat = (
                my_logit(prob_range_mat, eps=scaling_eps, base=scaling_base) -
                my_logit(x_pts_to_graph, eps=scaling_eps, base=scaling_base))
            yerr_mat[0, :] = -yerr_mat[0, :]
            plt.errorbar(x_pts_to_graph_scaled,
                         x_pts_to_graph_scaled,
                         yerr=yerr_mat,
                         capsize=5)
        plt.axis(
            [low_mark - .1, high_mark + .1, low_mark - .1, high_mark + .1])
    if scaling != 'logit':
        if show_baseline:
            plt.plot(np.linspace(0, 1, 100), (np.linspace(0, 1, 100)), 'k--')
        plt.scatter(x_pts_to_graph, y_pts_to_graph, c=c, **kwargs)
        plt.axis([-0.1, 1.1, -0.1, 1.1])
        if error_bars:
            yerr_mat = binom.interval(
                1 - error_bar_alpha, bin_counts,
                x_pts_to_graph) / bin_counts - x_pts_to_graph
            yerr_mat[0, :] = -yerr_mat[0, :]
            plt.errorbar(x_pts_to_graph,
                         x_pts_to_graph,
                         yerr=yerr_mat,
                         capsize=5)
    plt.xlabel('Predicted')
    plt.ylabel('Empirical')
    if show_histogram:
        plt.subplot(1, 2, 2)
        plt.hist(x, bins=bins)
    out_dict = {}
    out_dict['pred_probs'] = x_pts_to_graph
    out_dict['emp_probs'] = y_pts_to_graph
    out_dict['bin_counts'] = bin_counts
    return out_dict
Example 26
def run_benchmark(args):
    def verbose_print(*a, **kw):
        if args.verbose:
            print(*a, **kw)

    diff_programs = get_diff_programs_for_args(args)

    all_diff_program_extra_fields = sorted(
        {k
         for p in diff_programs for k in p.get("extra_fields", {}).keys()})
    print(f"{len(diff_programs)} diff programs")

    try:
        with (args.input_dir / "index.json").open("r", encoding="utf-8") as f:
            benchmark_input_index = json.load(f)
    except Exception:
        print('Failed to load benchmark inputs. Did you run "prepare"?')
        raise
    shuffled_generation_configs = benchmark_input_index[
        "shuffled_generation_configs"]
    num_regens = benchmark_input_index["num_regens"]
    assert num_regens >= 1

    test_combination_factors = [
        len(shuffled_generation_configs),
        num_regens,
        len(diff_programs),
        # don't count number of repetitions, because it is possibly dynamic
    ]
    total_test_combinations = np.prod(test_combination_factors)
    print(
        f'{" * ".join(str(v) for v in test_combination_factors)} = {total_test_combinations} total test combinations'
    )

    csv_output_file = open(args.output_csv, "w", newline="")
    csv_output_writer = CSVOutputWriter(csv_output_file)

    def get_extra_file_path(suffix):
        name = ".".join(args.output_csv.name.split(".")[:-1]) + suffix
        return args.output_csv.parent / name

    failed_file_path = get_extra_file_path("-FAILED.txt")
    failed_file = open(failed_file_path, "w")

    if args.auto_repetitions:
        num_repetitions = args.max_repetitions
    else:
        num_repetitions = args.min_repetitions

    if args.no_progress_bar:
        progress_bar = NoopProgressBar()
    else:
        progress_bar = tqdm(total=total_test_combinations, smoothing=0)

    if args.skip_estimated_timeouts:
        # record smallest input length with timeout per diff_program to skip larger test cases
        smallest_timeout = {}

    last_flush_time = time.monotonic()
    break_flag = False
    some_benchmarks_failed = False
    for _entry in shuffled_generation_configs:
        generation_config_i = _entry["i"]
        generation_config = _entry["config"]

        for regen_i in range(num_regens):
            verbose_print("generation_config", generation_config)
            test_case_dir = (args.input_dir /
                             f"config-{generation_config_i}-regen-{regen_i}")

            for diff_program in diff_programs:

                diff_prog_full_name = (diff_program["name"] + "_" + str(
                    diff_program.get("extra_fields", {}).get("mpi_procs", 1)))

                if (args.skip_estimated_timeouts
                        and diff_prog_full_name in smallest_timeout
                        and generation_config["length_1"] >=
                        smallest_timeout[diff_prog_full_name]):
                    some_benchmarks_failed = True
                    print(diff_prog_full_name + "\t", file=failed_file, end="")
                    print(generation_config, file=failed_file, end="")
                    print(
                        f"\t skipped due to estimated timeout, since length_1 {generation_config['length_1']} >= {smallest_timeout[diff_prog_full_name]}",
                        file=failed_file,
                    )
                    progress_bar.update()
                    continue

                # sorted list of measurements
                micros_until_len_res = []
                check_interval = min_repetitions_for_confidence - 1

                for repetition_i in range(num_repetitions):
                    if time.monotonic(
                    ) - last_flush_time > flush_every_seconds:
                        csv_output_file.flush()
                        failed_file.flush()
                        last_flush_time = time.monotonic()

                    verbose_print("  diff_program", diff_program["name"])

                    extra_fields_for_output = {
                        k: diff_program.get("extra_fields", {}).get(k, "")
                        for k in all_diff_program_extra_fields
                    }

                    extra_fields_for_run = deepcopy(
                        diff_program.get("extra_fields", {}))
                    if (args.no_direct_mpi_procs_limit
                            and "mpi_procs" in extra_fields_for_run):
                        extra_fields_for_run["mpi_procs"] = None

                    try:
                        program_result = diff_program["run"](
                            test_case_dir / "in_1.txt",
                            test_case_dir / "in_2.txt",
                            extra_fields_for_run,
                        )
                        verbose_print("    micros_until_len",
                                      program_result.micros_until_len)
                    except KeyboardInterrupt:  # exit the benchmark
                        break_flag = True
                        break
                    except TimeoutExpired as te:
                        some_benchmarks_failed = True
                        print(diff_prog_full_name + "\t",
                              file=failed_file,
                              end="")
                        print(generation_config, file=failed_file, end="")
                        print("\t" + repr(te), file=failed_file)
                        if args.auto_repetitions:
                            timeout_micros = te.timeout * 1e6  # seconds to microseconds
                            if (repetition_i >= 5 and micros_until_len_res[0]
                                    == timeout_micros):
                                if args.skip_estimated_timeouts:
                                    smallest_timeout[
                                        diff_prog_full_name] = generation_config[
                                            "length_1"]
                                break  # if five iterations timed out -> assume all will timeout, don't try again
                            micros_until_len_res.append(timeout_micros)
                        else:
                            if args.skip_estimated_timeouts:
                                smallest_timeout[
                                    diff_prog_full_name] = generation_config[
                                        "length_1"]
                        continue
                    except Exception as e:  # catch all
                        some_benchmarks_failed = True
                        print(diff_prog_full_name + "\t",
                              file=failed_file,
                              end="")
                        print(generation_config, file=failed_file, end="")
                        print("\t" + repr(e), file=failed_file)
                        break  # assumption: will always fail with these exceptions -> no need to run all repetitions

                    output_data = {
                        "generation_config_i":
                        generation_config_i,
                        **{
                            f"input_{k}": v
                            for k, v in generation_config.items()
                        },
                        "regen_i":
                        regen_i,
                        "repetition_i":
                        repetition_i,
                        "diff_program":
                        diff_program["name"],
                        **extra_fields_for_output,
                        "mpi_comm_world":
                        getattr(program_result, "mpi_comm_world", 1),
                        "micros_input":
                        program_result.micros_input,
                        "micros_precompute":
                        program_result.micros_precompute,
                        "micros_until_len":
                        program_result.micros_until_len,
                        "micros_edit_script":
                        program_result.micros_edit_script,
                        "min_edit_length":
                        program_result.min_edit_length,
                    }

                    csv_output_writer.write_row(output_data)

                    if args.auto_repetitions:
                        bisect.insort(micros_until_len_res,
                                      program_result.micros_until_len)

                        if (
                                repetition_i >= args.min_repetitions
                                and repetition_i % check_interval == 0
                        ) or repetition_i == num_repetitions - 1:  # reached the last iteration
                            # check if required confidence interval is reached
                            if repetition_i % 2 == 0:  # odd number of results
                                current_median = micros_until_len_res[
                                    repetition_i // 2]
                            else:
                                current_median = (micros_until_len_res[
                                    (repetition_i - 1) //
                                    2] + micros_until_len_res[
                                        (repetition_i + 1) // 2]) / 2

                            # check about every 20 ms = 20'000 microseconds  (overhead is about 1 ms)  => max 5% overhead
                            check_interval = math.ceil(20000 / current_median)

                            lower_idx, upper_idx = binom.interval(
                                confidence_level, repetition_i + 1, 0.5)
                            # to get correct indices in python (Boudec paper Appendix A - 1)
                            lower_idx -= 1
                            # sometimes the interval is a little bit wider than in the Boudec paper, but this just means more confidence

                            if (micros_until_len_res[int(lower_idx)] >=
                                (1 - args.max_median_error) * current_median
                                    and micros_until_len_res[int(upper_idx)] <=
                                (1 + args.max_median_error) * current_median):
                                break

                            if repetition_i == num_repetitions - 1:
                                # failed to reach required confidence
                                some_benchmarks_failed = True
                                print(
                                    diff_prog_full_name + "\t",
                                    file=failed_file,
                                    end="",
                                )
                                print(generation_config,
                                      file=failed_file,
                                      end="")
                                print(
                                    "\t" +
                                    f"Failed to reach required confidence after {num_repetitions} repetitions; "
                                    +
                                    f"current median: {current_median}, left end of CI: {micros_until_len_res[int(lower_idx)]}, right end of CI: {micros_until_len_res[int(upper_idx)]}",
                                    file=failed_file,
                                )

                progress_bar.update()

                if break_flag:
                    break
            if break_flag:
                break
        if break_flag:
            break

    progress_bar.close()
    csv_output_file.close()
    failed_file.close()

    if not some_benchmarks_failed:
        failed_file_path.unlink()
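The confidence check in run_benchmark is the standard nonparametric confidence interval for a median (the Le Boudec reference in the comments): the order statistics at the endpoints of binom.interval(confidence, n, 0.5) bracket the true median. A self-contained sketch:

import numpy as np
from scipy.stats import binom

confidence_level = 0.95
samples = np.sort(np.random.lognormal(mean=0.0, sigma=0.5, size=101))

n = len(samples)
lower_idx, upper_idx = binom.interval(confidence_level, n, 0.5)
lower_idx -= 1  # 1-based order-statistic rank -> 0-based index, as in the code above

ci = (samples[int(lower_idx)], samples[int(upper_idx)])
print(np.median(samples), ci)  # sample median and its ~95% nonparametric CI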
Example 27
def plot_sbc(theta_samples,
             theta_test,
             param_names,
             bins=25,
             dpi=300,
             figsize=(24, 12),
             interval=0.99,
             show=True,
             font_size=12):
    """ Plots the simulation-based posterior checking histograms as advocated by Talts et al. (2018).

    Parameters
    ----------
    theta_samples: np.array
        Array of sampled parameters
    theta_test: np.array
        Array of test parameters
    param_names: list(str)
        List of parameter names for plotting.
    bins: int, default: 25
        Bins for histogram plot
    dpi: int, default: 300
        Dots per inch (dpi) for plot
    figsize: tuple(int, int), default: (24, 12)
        Figure size
    interval: float, default: 0.99
        Interval to plot
    show: bool, default: True
        Controls whether the plot shall be printed
    font_size: int, default:12
        Font size

    """

    # Plot settings
    plt.rcParams['font.size'] = font_size
    N = int(theta_test.shape[0])

    # Determine n_subplots dynamically
    n_row = int(np.ceil(len(param_names) / 6))
    n_col = int(np.ceil(len(param_names) / n_row))

    # Initialize figure
    f, axarr = plt.subplots(n_row, n_col, figsize=figsize)
    if n_row > 1:
        axarr = axarr.flat

    # Compute ranks (using broadcasting)
    ranks = np.sum(theta_samples < theta_test, axis=0)

    # Compute interval
    endpoints = binom.interval(interval, N, 1 / (bins + 1))

    # Plot histograms
    for j in range(len(param_names)):

        # Add interval
        axarr[j].axhspan(endpoints[0],
                         endpoints[1],
                         facecolor='gray',
                         alpha=0.3)
        axarr[j].axhline(np.mean(endpoints), color='gray', zorder=0, alpha=0.5)

        sns.histplot(ranks[:, j],
                     kde=False,
                     ax=axarr[j],
                     color='#a34f4f',
                     bins=bins,
                     alpha=0.95)

        axarr[j].set_title(param_names[j])
        axarr[j].spines['right'].set_visible(False)
        axarr[j].spines['top'].set_visible(False)
        if j == 0:
            axarr[j].set_xlabel('Rank statistic')
        axarr[j].get_yaxis().set_ticks([])
        axarr[j].set_ylabel('')

    f.tight_layout()

    # Show, if specified
    if show:
        plt.show()
    return f
Example 28
def get_bursty_tweet(time_window_tweets,
                     expectation_features,
                     train=True,
                     print_stats=False):

    N = len(time_window_tweets)

    id_ret = set()

    # Tokenize all the tweets - reduce them to unigram features
    tokenized_tweets = [(tweet['id'], tokenize_tweet(tweet['full_text']))
                        for tweet in time_window_tweets]

    # Get a list feature - list of tweet_ids where the feature appear in
    bag_of_feature = {}
    for id_flist in tokenized_tweets:
        for feature in id_flist[1]:
            bag_of_feature[feature] = bag_of_feature.get(feature, set())
            bag_of_feature[feature].add(id_flist[0])

    #print(bag_of_feature.keys())
    for feature in bag_of_feature:
        # Count the number of tweets the feature appears in and turn it into a proportion
        n_feature_appear_in = len(bag_of_feature[feature])

        # Probability that f_j appears in the time window, P_o(n_{i,j})
        Prob_f_window = n_feature_appear_in / N

        feature_info = expectation_features.get(feature, [0, 0])
        expected = feature_info[1]
        windows_feature_appeared = feature_info[0]

        ra = math.floor(expected * N)  # approximate mode (peak) of the expected count
        rb = binom.interval(0.999, N,
                            expected)[1]  # count beyond which the pmf approaches 0
        q = (rb + ra) / 2

        if n_feature_appear_in >= q:
            id_ret = id_ret.union(bag_of_feature[feature])

        if train:
            # Update the running average of the feature's appearance probability
            expected = ((expected * windows_feature_appeared) +
                        Prob_f_window) / (windows_feature_appeared + 1)
            expectation_features[feature] = [
                windows_feature_appeared + 1, expected
            ]

        if print_stats:
            print(
                "------------------------------------------------------------------------"
            )
            print("Feature: ", feature)
            print("Window size: ", N)
            print("n_{i,j}: ", n_feature_appear_in)
            print("Probability in the time window of the feature: " +
                  str("{0:.2f}".format(Prob_f_window)))
            print("Expectation of the feature: " +
                  str("{0:.2f}".format(expected)))
            #plot_graph(N, expected,int(N*n_feature_appear_in/N_tweets_window), Prob_f_window)
            print(
                "------------------------------------------------------------------------"
            )

    return id_ret
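The burst test above boils down to a single threshold: the midpoint between the expected count ra = floor(E * N) and the upper 99.9% binomial quantile rb. A standalone sketch of that computation (the feature probability and window size are made-up values):

import math
from scipy.stats import binom

def burst_threshold(expected_prob, window_size):
    """Midpoint between the expected count and the 99.9% upper binomial bound."""
    ra = math.floor(expected_prob * window_size)               # expected count
    rb = binom.interval(0.999, window_size, expected_prob)[1]  # upper tail point
    return (ra + rb) / 2

# A feature historically seen in 2% of tweets, over a 500-tweet window:
print(burst_threshold(0.02, 500))  # counts at or above this are flagged as bursty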
Example no. 29
from scipy.stats import binom
import matplotlib.pyplot as plt

def calcPercentile():
    # 20,000 draws from Binomial(n=120, p=1/120), which has mean 1
    data_binom = binom.rvs(n=120, p=1 / 120, size=20000)
    # Central 95% interval of the same distribution
    CI = binom.interval(0.95, 120, 1 / 120)
    print(CI)
    plt.hist(data_binom)
    plt.show()
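With n = 120 and p = 1/120 the mean is 1, so the distribution is close to Poisson(1); a quick sanity check of the printed interval (values approximate):

from scipy.stats import binom, poisson

print(binom.interval(0.95, 120, 1 / 120))  # roughly (0.0, 3.0)
print(poisson.interval(0.95, 1))           # Poisson(1) approximation, also roughly (0.0, 3.0)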
Example no. 30
def qqplot(data,
           labels,
           n_quantiles=100,
           alpha=0.95,
           error_type='theoretical',
           distribution='binomial',
           log10conv=True,
           color=['k', 'r', 'b'],
           fill_dens=[0.1, 0.1, 0.1],
           type='uniform',
           title='title'):
    '''
    Function for plotting Quantile-Quantile (QQ) plots with a confidence interval (CI).
    :param data: list of NumPy 1D arrays with data (e.g. p-values)
    :param labels: list of labels, one per array in data
    :param type: type of the plot ('uniform' expects values in [0, 1])
    :param n_quantiles: number of quantiles to plot
    :param alpha: confidence level for the interval
    :param distribution: beta/normal/binomial -- type of the error estimation. Most common in the literature is 'beta'.
    :param log10conv: conversion to -log10(p) for the figure
    :return: nothing
    '''
    xmax = 0
    ymax = 0
    np.seterr(divide='ignore')
    if type == 'uniform':
        # we expect distribution from 0 to 1
        for j in range(len(data)):
            # define quantiles positions:
            q_pos = np.concatenate([
                np.arange(99.) / len(data[j]),
                np.logspace(-np.log10(len(data[j])) + 2, 0, n_quantiles)
            ])
            # define quantiles in data
            q_data = mquantiles(data[j],
                                prob=q_pos,
                                alphap=0,
                                betap=1,
                                limit=(0, 1))  # linear interpolation
            # define theoretical predictions
            q_th = q_pos.copy()
            # evaluate errors
            q_err = np.zeros([len(q_pos), 2])
            if np.sum(alpha) > 0:
                for i in range(0, len(q_pos)):
                    if distribution == 'beta':
                        q_err[i, :] = beta.interval(
                            alpha,
                            len(data[j]) * q_pos[i],
                            len(data[j]) - len(data[j]) * q_pos[i])
                    elif distribution == 'binomial':
                        # pass the confidence level positionally ('alpha=' was
                        # renamed to 'confidence' in newer SciPy)
                        q_err[i, :] = binom.interval(alpha,
                                                     n=len(data[j]),
                                                     p=q_pos[i])
                    elif distribution == 'normal':
                        q_err[i, :] = norm.interval(
                            alpha,
                            len(data[j]) * q_pos[i],
                            np.sqrt(len(data[j]) * q_pos[i] * (1. - q_pos[i])))
                    else:
                        print('Distribution is not defined!')
                    # Clamp inside the loop so every row is cleaned, not just the last
                    q_err[i, q_err[i, :] < 0] = 1e-15
                if distribution in ('binomial', 'normal'):
                    q_err /= 1.0 * len(data[j])  # convert counts to quantile scale
                    q_err += 1e-15  # avoid log10(0) over all rows, not just the first 100
            # print(q_err[100:, :])
            slope, intercept, r_value, p_value, std_err = linregress(
                q_th, q_data)
            # print(labels[j], ' -- Slope: ', slope, " R-squared:", r_value**2)
            plt.plot(-np.log10(q_th[n_quantiles - 1:]),
                     -np.log10(q_data[n_quantiles - 1:]),
                     '-',
                     color=color[j])
            plt.plot(-np.log10(q_th[:n_quantiles]),
                     -np.log10(q_data[:n_quantiles]),
                     '.',
                     color=color[j],
                     label=labels[j])
            xmax = np.max([xmax, -np.log10(q_th[1])])
            ymax = np.max([ymax, -np.log10(q_data[0])])
            # print(- np.log10(q_th[:]))
            if np.sum(alpha) > 0:
                if error_type == 'experimental':
                    plt.fill_between(-np.log10(q_th),
                                     -np.log10(q_data / q_th * q_err[:, 0]),
                                     -np.log10(q_data / q_th * q_err[:, 1]),
                                     color=color[j],
                                     alpha=fill_dens[j],
                                     label='%1.3f CI' % alpha)
        if np.sum(alpha) > 0:
            if error_type == 'theoretical':
                plt.fill_between(-np.log10(q_th),
                                 -np.log10(q_err[:, 0]),
                                 -np.log10(q_err[:, 1]),
                                 color=color[j],
                                 alpha=fill_dens[j],
                                 label='%1.3f CI' % alpha)
    plt.legend(loc=4)
    plt.xlabel('Theoretical -log10(p)')
    plt.ylabel('Experimental -log10(p)')
    plt.plot([0, 100], [0, 100], '--k')
    # print(xmax,ymax)
    plt.xlim([0, np.ceil(xmax)])
    plt.ylim([0, np.ceil(ymax * 1.05)])
    plt.title(title)
    plt.tight_layout()
    np.seterr(divide='warn')
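A minimal call sketch, assuming p-values simulated under the null (uniform on [0, 1]), where the points should hug the diagonal:

import numpy as np
import matplotlib.pyplot as plt

pvals = np.random.uniform(size=10000)
qqplot([pvals], labels=['null simulation'], n_quantiles=100,
       alpha=0.95, distribution='binomial', title='QQ plot under the null')
plt.show()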
Example no. 31
    def model_dict(self, round_state,
                   hole_card):  # CHANGE!!! Added 'hole_card'
        self.this_round_length = 0
        action_histories = round_state['action_histories']
        for x in self.opponent_model.keys():
            self.opponent_model[x]['fold_this_round'] = 0
            self.opponent_model[x]['raise_this_round'] = 0
            self.opponent_model[x]['call_this_round'] = 0
        for game_round in action_histories.keys():
            for x in range(len(action_histories[str(game_round)])):
                if len(action_histories[str(game_round)]) != 0:
                    self.this_round_length += 1

                    # CHANGE !!! Simple method for adding priors. Probably not the best option.
                    self.opponent_model[action_histories[str(
                        game_round)][x]['uuid']][
                            'fold'] = 2.16  # or 4.32 (or any multiple of 2.16)
                    self.opponent_model[action_histories[str(
                        game_round)][x]['uuid']][
                            'call'] = .42  # or .84 (or any multiple of .42)
                    self.opponent_model[action_histories[str(
                        game_round)][x]['uuid']][
                            'raise'] = .42  # or .84 (or any multiple of .42)
                if action_histories[str(game_round)][x]['action'] == 'FOLD':
                    self.opponent_model[action_histories[str(game_round)][x]
                                        ['uuid']]['fold'] += 1
                    self.opponent_model[action_histories[str(game_round)][x]
                                        ['uuid']]['fold_this_round'] += 1
                elif action_histories[str(game_round)][x]['action'] == 'CALL':
                    self.opponent_model[action_histories[str(game_round)][x]
                                        ['uuid']]['call'] += 1
                    self.opponent_model[action_histories[str(game_round)][x]
                                        ['uuid']]['call_this_round'] += 1
                elif action_histories[str(game_round)][x]['action'] == 'RAISE':
                    self.opponent_model[action_histories[str(game_round)][x]
                                        ['uuid']]['raise'] += 1
                    self.opponent_model[action_histories[str(game_round)][x]
                                        ['uuid']]['raise_this_round'] += 1
                    amt = action_histories[str(game_round)][x]['amount']
                    self.opponent_model[action_histories[str(
                        game_round)][x]['uuid']]['raise_shares'] += (
                            amt / self.opponent_model[action_histories[str(
                                game_round)][x]['uuid']]['stack'])
        output = {}
        probability_list = []
        output['probability_list'] = probability_list
        for x in self.opponent_model.keys():
            try:
                n = (self.opponent_model[x]['fold'] +
                     self.opponent_model[x]['raise'] +
                     self.opponent_model[x]['call'])
                fold_freq = self.opponent_model[x]['fold'] / n  # p_hat
                # 95% binomial CI on the fold count; the original passed .05,
                # which requests a 5% interval and was likely a typo
                conf_int = binom.interval(.95, n, fold_freq)
                print(conf_int)

                self.opponent_model[x]['fold_freq'] = fold_freq  # CHANGE!!!
            except ZeroDivisionError:
                self.opponent_model[x]['fold_freq'] = .72
            try:
                self.opponent_model[x]['raises:calls'] = self.opponent_model[
                    x]['raise'] / self.opponent_model[x]['call']
            except ZeroDivisionError:
                self.opponent_model[x]['raises:calls'] = 1
            community_card = round_state['community_card']
            win_rate = estimate_hole_card_win_rate(
                nb_simulation=NB_SIMULATION,
                nb_player=self.nb_player,
                hole_card=gen_cards(hole_card),
                community_card=gen_cards(community_card))
            self.opponent_model[x]['probability'] = (1 -
                                                     win_rate) / self.nb_player

            if self.opponent_model[x]['raise_this_round'] > 0:
                self.opponent_model[x]['probability'] += (
                    1 - self.opponent_model[x]['aggressiveness'] *
                    self.aggressiveness_raise_prob_factor
                ) * self.opponent_model[x]['probability']
            if self.opponent_model[x]['call_this_round'] > 0:
                self.opponent_model[x]['probability'] += (
                    (1 - self.opponent_model[x]['frequency']) *
                    self.opponent_model[x]['probability'] *
                    self.frequency_call_factor)
            if self.opponent_model[x]['fold_this_round'] > 0:
                self.opponent_model[x]['probability'] = 0

            # I'm skeptical that the opponent model dictionary is tracking properly; I think it only updates while we're playing a hand.
            # print(self.opponent_model[x])
            if self.opponent_model[x]['name'] != self.name:
                output['probability_list'].append(
                    self.opponent_model[x]['probability'])
            else:
                output['stack'] = self.opponent_model[x]['stack']
        return (output)
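Note on the interval above: binom.interval's first argument is the coverage probability, so a 95% CI on an observed fold frequency looks like this (the counts are made up; dividing by n converts the count interval to a frequency interval):

from scipy.stats import binom

n, k = 40, 12                       # hypothetical: 12 folds out of 40 actions
lo, hi = binom.interval(0.95, n, k / n)
print(k / n, (lo / n, hi / n))      # observed frequency and its 95% CI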
Example no. 32
def Plot_the_distribution(GSCs_data, Non_GCSs_data, bins, genome_len,
                          path_out):
    #Parameters
    ticks1 = [
        0, 500000, 1000000, 1500000, 2000000, 2500000, 3000000, 3500000,
        4000000, 4500000
    ]
    xticknames1 = [
        '', '500', '1000', '1500', '2000', '2500', '3000', '3500', '4000',
        '4500'
    ]
    colors = [
        '#7FCE79', '#BAE85C', '#ff878b', '#8991ff', '#ac5eff', '#50b3ff',
        '#ffd75e'
    ]
    plot_names = [
        'plot1', 'plot2', 'plot3', 'plot4', 'plot5', 'plot6', 'plot7'
    ]
    Y_labels = [
        'Cfx GCSs', 'RifCfx GCSs', 'Micro GCSs', 'Oxo GCSs', 'Score', 'GC%',
        'Transcription\nlevel'
    ]
    yter = 1592477
    yori = 3711828
    #GCSs data plotting.
    fig, plot_names = plt.subplots(7, 1, figsize=(11, 15), dpi=100)
    i = 0
    Histo_comp_dict = {}  # Will contain computed histogram data (bins and values)
    for key, value in GSCs_data.items():
        plot_names[i].set_xlim(0, genome_len)
        plot_names[i].set_xticks(ticks1, minor=False)
        plot_names[i].set_xticks([yter, yori], minor=True)
        plot_names[i].set_xticklabels(xticknames1)
        plt.setp(plot_names[i].set_xticklabels(xticknames1),
                 rotation=0,
                 fontsize=14)
        # 99.9% band for the count expected in one of ten equal genome bins
        conf_interval = binom.interval(0.999, len(value), 1 / 10)
        plot_names[i].set_yticks(conf_interval, minor=True)
        plot_names[i].yaxis.grid(True,
                                 which='minor',
                                 linewidth=0.4,
                                 linestyle='--',
                                 color='black')
        plot_names[i].fill_between(bins,
                                   conf_interval[0],
                                   conf_interval[1],
                                   facecolor='grey',
                                   alpha=0.3)
        plot_names[i].locator_params(axis='y', nbins=6)
        plot_names[i].tick_params(axis='x', which='major', labelsize=19)
        Histo_comp_dict[key] = plot_names[i].hist(
            value,
            bins,
            facecolor=colors[i],
            alpha=0.7,
            linewidth=1,
            edgecolor='black'
        )  # Plot histogram and keep the computed data (bins and values)
        plot_names[i].tick_params(axis='y', which='major', pad=7, labelsize=15)
        plot_names[i].set_ylabel(Y_labels[i], size=22, labelpad=8, rotation=90)
        i += 1
    #Score, GC, Transcription plotting.
    bin_width = int(bins[1])
    position = bins[:-1]
    print(len(position))
    for key, value in Non_GCSs_data.items():
        plot_names[i].set_xlim(0, genome_len)
        if key == "GC":
            plot_names[i].set_ylim(45, max(value) + 2)
        elif key == "Score":
            plot_names[i].set_ylim(min(value) - 0.2, -1.5)
        plot_names[i].set_xticks(ticks1, minor=False)
        plot_names[i].set_xticks([yter, yori], minor=True)
        plot_names[i].set_xticklabels(xticknames1)
        plt.setp(plot_names[i].set_xticklabels(xticknames1),
                 rotation=0,
                 fontsize=14)
        plot_names[i].tick_params(axis='x', which='major', labelsize=19)
        plot_names[i].locator_params(axis='y', nbins=6)
        plot_names[i].bar(position,
                          value,
                          bin_width,
                          color=colors[i],
                          linewidth=1,
                          edgecolor='black',
                          align='edge')
        plot_names[i].tick_params(axis='y', which='major', pad=7, labelsize=15)
        plot_names[i].set_ylabel(Y_labels[i], size=22, labelpad=8, rotation=90)
        i += 1
    plt.tight_layout()
    fig.savefig(path_out +
                "GCSs_num_score_GC133_transcription_distrib_thr_genome.png",
                dpi=400)  # figsize is a figure-creation argument, not a savefig one

    #GCSs data plotting for Cfx, Micro, and Oxo only.
    GSCs_data_main = {
        'Cfx': GSCs_data['Cfx'],
        'Micro': GSCs_data['Micro'],
        'Oxo': GSCs_data['Oxo']
    }
    Y_labels_main = ['Cfx GCSs', 'Micro GCSs', 'Oxo GCSs']
    colors_main = ['#7FCE79', '#ff878b', '#8991ff']
    fig, plot_names = plt.subplots(3, 1, figsize=(11, 7), dpi=100)
    i = 0
    for key, value in GSCs_data_main.items():
        plot_names[i].set_xlim(0, genome_len)
        plot_names[i].set_xticks(ticks1, minor=False)
        plot_names[i].set_xticks([yter, yori], minor=True)
        plot_names[i].set_xticklabels(xticknames1)
        plt.setp(plot_names[i].set_xticklabels(xticknames1),
                 rotation=0,
                 fontsize=14)
        # 99.9% band for the count expected in one of ten equal genome bins
        conf_interval = binom.interval(0.999, len(value), 1 / 10)
        plot_names[i].set_yticks(conf_interval, minor=True)
        plot_names[i].yaxis.grid(True,
                                 which='minor',
                                 linewidth=0.4,
                                 linestyle='--',
                                 color='black')
        plot_names[i].fill_between(bins,
                                   conf_interval[0],
                                   conf_interval[1],
                                   facecolor='grey',
                                   alpha=0.3)
        plot_names[i].locator_params(axis='y', nbins=6)
        plot_names[i].tick_params(axis='x', which='major', labelsize=19)
        plot_names[i].hist(
            value,
            bins,
            facecolor=colors_main[i],
            alpha=0.7,
            linewidth=1,
            edgecolor='black'
        )  # Plot histogram
        plot_names[i].tick_params(axis='y', which='major', pad=7, labelsize=15)
        plot_names[i].set_ylabel(Y_labels_main[i],
                                 size=22,
                                 labelpad=8,
                                 rotation=90)
        i += 1
    plt.tight_layout()
    fig.savefig(path_out + "GCSs_number_Cfx_Micro_Oxo_distrib_thr_genome.png",
                dpi=400)  # figsize is a figure-creation argument, not a savefig one
    return Histo_comp_dict
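The grey band drawn in both figures marks where bin counts of uniformly scattered sites would fall; the hardcoded p = 1/10 implies ten genome bins. A standalone sketch of that band (the site count is a made-up value):

from scipy.stats import binom

n_sites, n_bins = 250, 10                        # hypothetical GCS count and bin number
lo, hi = binom.interval(0.999, n_sites, 1 / n_bins)
print(lo, hi)  # bin counts outside [lo, hi] deviate from a uniform scatter at the 0.1% level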