def test_issue_7406():
    """Check ``binom.ppf`` for a binomial with zero trials (n=0, p=0.5)."""
    np.random.seed(0)
    interior_quantiles = np.random.rand(10)
    assert_equal(binom.ppf(interior_quantiles, 0, 0.5), 0)

    # The end-points q=0 and q=1 must be handled correctly as well.
    for q, expected in ((0, -1), (1, 0)):
        assert_equal(binom.ppf(q, 0, 0.5), expected)
def plot_acuity(logmar, accuracy, yerror, n_validation, name, conditions,
                condition_name, plot_directory, unit_label="logMAR"):
    """Plot classification accuracy per condition and save the figure as PNG.

    One error-bar series is drawn for every entry of ``conditions``; a dashed
    black line marks the 99th percentile of a Binomial(n_validation, 0.5)
    distribution expressed as a fraction of ``n_validation``.

    Parameters
    ----------
    logmar : sequence
        x positions shared by every series.
    accuracy, yerror : indexable
        ``accuracy[i]`` / ``yerror[i]`` hold the y values and error bars of
        the i-th condition.
    n_validation : int
        Number of trials used for the binomial reference line.
    name : str
        Used in the log message, the title and the output file name.
    conditions : sequence
        One label per series; floats (including float subclasses such as
        ``numpy.float64``) are rendered with two decimals.
    condition_name : str or None
        When ``None`` the title/file name fall back to the
        "binning technique" variant.
    plot_directory : str
        Directory the PNG is written to.
    unit_label : str
        x-axis label; ``"logMAR"`` additionally fixes ticks and x limits.
    """
    print(f"plotting {name} {condition_name} classification accuracy.")
    sig1 = np.repeat(
        binom.ppf(0.99, n_validation, 0.5) / n_validation, len(logmar))

    fig, ax = plt.subplots()
    for index, condition in enumerate(conditions):
        # isinstance (rather than an exact type check) so numpy floats are
        # formatted like built-in floats.
        if isinstance(condition, float):
            label = f'{condition:.2f}'
        else:
            label = f'{condition}'
        ax.errorbar(logmar, accuracy[index], marker='o', markersize=4,
                    capsize=4, yerr=yerror[index], label=label)
    ax.plot(logmar, sig1, 'k--')

    ax.set_ylabel("Accuracy")
    ax.set_xlabel(unit_label)
    if unit_label == "logMAR":
        ax.set_xticks(np.arange(1.6, 3.2, 0.2))
        ax.set_xlim(1.5, 3.25)
    # Other units (e.g. "cpd") keep matplotlib's automatic ticks and limits.

    ax.grid(which='both')
    ax.set_ylim(0.35, 1.05)
    ax.legend(loc=(1, 0.1))

    if condition_name is None:
        title = f'{name} classification by binning technique'
        filename = f"{name}_acuity.png"
    else:
        title = f'{name} classification by {condition_name}'
        filename = f"{name}-{condition_name}_acuity.png"
    ax.set_title(title)
    fig.tight_layout()
    fig.savefig(os.path.join(plot_directory, filename))
def get_m_n_from_bernoulli(N):
    """Return ``find_m(...) / n`` for every trial count ``n`` in ``1 .. N-1``.

    For each ``n`` the survival function of Binomial(n, 0.05) is evaluated on
    ``arange(ppf(0), ppf(1))`` and handed to ``find_m`` together with the
    threshold ``P_B = 0.05``; the result is divided by ``n``.
    """
    p = 0.05
    P_B = 0.05
    trial_counts = np.arange(1, N)
    # Start from an all-NaN array with one slot per trial count.
    m_n_bernoulli = np.full(trial_counts.shape, np.nan)
    for n in trial_counts:
        support = np.arange(binom.ppf(0.00, n, p), binom.ppf(1.00, n, p))
        tail_prob = binom.sf(support, n, p)
        m_n_bernoulli[n - 1] = find_m(tail_prob, P_B) * 1. / n
    return m_n_bernoulli
def occurrence_error(n_planets, rate):
    """Return the (lower, upper) error on an occurrence rate.

    The number of trials is taken as ``n_planets / rate``; the 15.9% and
    84.1% quantiles of Binomial(n, rate) are turned into a high and a low
    rate, and the distances of ``rate`` to them are returned as
    ``(rate - low, high - rate)``.

    Returns ``(nan, nan)`` when the computation raises (for example a
    ``ZeroDivisionError`` for ``rate == 0``).
    """
    try:
        n = n_planets / rate
        p = rate
        high = rate * n_planets / binom.ppf(0.159, n, p)
        low = rate * n_planets / binom.ppf(0.841, n, p)
        return rate - low, high - rate
    except Exception:
        # Best effort: report "no estimate" instead of failing, but let
        # KeyboardInterrupt / SystemExit propagate.
        return np.nan, np.nan
def test_round(self):
    """The mean of many ``RandomState.round(3.4)`` draws stays within binomial bounds."""
    random_state = RandomState()
    avg = 3.4
    samples = 1000
    draws = [random_state.round(avg) for _ in range(samples)]
    obs_avg = np.mean(draws)

    # Bounds: floor(avg) plus the 0.1% / 99.9% quantiles of
    # Binomial(samples, fractional part of avg), per sample.
    lower = np.floor(avg) + binom.ppf(0.001, n=samples, p=avg % 1) / samples
    upper = np.floor(avg) + binom.ppf(0.999, n=samples, p=avg % 1) / samples
    self.assertGreater(obs_avg, lower)
    self.assertLess(obs_avg, upper)
def lc_plot(n, k, p, titlename, outname=None):
    """Plot the Binomial(n, p) pmf between its 1% and 99% quantiles.

    Parameters
    ----------
    n, p :
        Parameters of the binomial distribution.
    k :
        Not used by the current implementation; kept so existing callers
        keep working.
    titlename : str
        Figure title.
    outname : str, optional
        When truthy, the figure is saved to this path before being shown.
    """
    fig = plt.figure(figsize=(8, 5))
    ax = fig.add_subplot(1, 1, 1)

    x = np.arange(binom.ppf(0.01, n, p), binom.ppf(0.99, n, p))
    pmf = binom.pmf(x, n, p)
    # Marker-only format string: the colour comes from ``color=`` alone, so
    # the format string and the keyword no longer both define a colour.
    ax.plot(x, pmf, 'o', color='gray', ms=5,
            label='binomial probability mass function')
    ax.vlines(x, 0, pmf, colors='gray', lw=5, alpha=0.5)

    rv = binom(n, p)
    ax.vlines(x, 0, rv.pmf(x), colors='k', linestyles='-', lw=1,
              label='frozen probability mass function')

    ax.legend(bbox_to_anchor=(1.1, 1), loc=1, borderaxespad=0.5)
    plt.title(titlename)

    for side in ('top', 'right'):
        ax.spines[side].set_visible(False)
    for side in ('bottom', 'left'):
        ax.spines[side].set_linewidth(2)

    if outname:
        plt.savefig(outname)
    plt.show()
def test_merge_by_weight(self):
    """Merging two segments keeps the weight-1/3 segment about 1/3 of the time.

    Two segments with weights 1/3 and 2/3 share a single bin whose target
    count is 1. Over ``nrounds`` rounds the number of times segment 0 is the
    parent of the surviving segment must fall inside the central
    ``1 - alpha`` binomial interval.
    """
    selected_counts = {0: 0, 1: 0}
    alpha = 0.01
    nrounds = 1000
    from scipy.stats import binom

    # lower and upper bounds of the (1 - alpha) = 99% CI for selecting the
    # segment with weight 1/3
    lb = binom.ppf(alpha / 2.0, nrounds, 1.0 / 3.0)
    ub = binom.ppf(1.0 - alpha / 2.0, nrounds, 1.0 / 3.0)

    system = WESTSystem()
    system.bin_mapper = RectilinearBinMapper([[0.0, 1.0]])
    system.bin_target_counts = np.array([1])
    system.pcoord_len = 2
    self.we_driver = WEDriver(system=system)
    self.system = system
    self._seg_id = 0

    segments = [
        Segment(n_iter=1, seg_id=0,
                pcoord=np.array([[0], [0.25]], dtype=np.float32),
                weight=1.0 / 3.0),
        Segment(n_iter=1, seg_id=1,
                pcoord=np.array([[0], [0.75]], dtype=np.float32),
                weight=2.0 / 3.0),
    ]

    for _iround in range(nrounds):
        for segment in segments:
            segment.endpoint_type = Segment.SEG_ENDPOINT_UNSET

        self.we_driver.new_iteration()
        self.we_driver.assign(segments)
        self.we_driver.construct_next()

        assert len(self.we_driver.next_iter_binning[0]) == 1
        newseg = self.we_driver.next_iter_binning[0].pop()

        assert segments[newseg.parent_id].endpoint_type == Segment.SEG_ENDPOINT_CONTINUES
        # ~0 == -1 and ~1 == -2, so in a two-element list ``~parent_id``
        # indexes the *other* segment.
        assert segments[~newseg.parent_id].endpoint_type == Segment.SEG_ENDPOINT_MERGED

        selected_counts[newseg.parent_id] += 1

    print(selected_counts)
    assert lb <= selected_counts[0] <= ub, (
        'Incorrect proportion of histories selected. '
        'this is expected about {:%} of the time; retry test.'.format(alpha))
def calculate_CI(len_samples, confidence_level=0.95, n_points=1001):
    """Binomial band for ``len_samples`` draws over a grid of probabilities.

    Adapted from
    https://git.ligo.org/lscsoft/bilby/blob/master/bilby/core/result.py#L1578

    Returns
    -------
    x_values : ndarray
        ``n_points`` evenly spaced probabilities in [0, 1].
    upper : ndarray
        The ``(1 - confidence_level) / 2`` quantile of
        Binomial(len_samples, x) divided by ``len_samples``.
    lower : ndarray
        The ``1 - (1 - confidence_level) / 2`` quantile, scaled the same way.

    Note that, as in the code this was adapted from, the array named
    ``upper`` holds the smaller quantile and ``lower`` the larger one.
    The first element of both arrays is forced to 0.
    """
    tail = (1. - confidence_level) / 2.
    x_values = np.linspace(0, 1, n_points)

    lower = binom.ppf(1 - tail, len_samples, x_values) / len_samples
    upper = binom.ppf(tail, len_samples, x_values) / len_samples

    # Pin both curves to 0 at x = 0.
    for curve in (lower, upper):
        curve[0] = 0
    return x_values, upper, lower
def test_round_midpoint(self):
    """``round_midpoint`` is deterministic off the midpoint and balanced at 3.5."""
    random_state = RandomState()
    for value, expected in ((3.4, 3), (3.6, 4)):
        self.assertEqual(random_state.round_midpoint(value), expected)

    avg = 3.5
    samples = 2000
    draws = [random_state.round_midpoint(avg) for _ in range(samples)]
    obs_avg = np.mean(draws)

    # Bounds: floor(avg) plus the 0.01% / 99.99% quantiles of
    # Binomial(samples, fractional part of avg), per sample.
    lower = np.floor(avg) + binom.ppf(0.0001, n=samples, p=avg % 1) / samples
    upper = np.floor(avg) + binom.ppf(0.9999, n=samples, p=avg % 1) / samples
    self.assertGreaterEqual(obs_avg, lower)
    self.assertLessEqual(obs_avg, upper)
def fun_CI_builder(l_subr, pd_order_z, f_delta_k, f_alpha_k, f_epsilon):
    """Look up the two confidence-interval values in ``pd_order_z``.

    ``l_subr[0].f_volume`` is used as the total volume; the volumes of the
    entries labelled 'C', 'P' and 'M' whose ``b_activate`` is exactly ``True``
    are summed. Two shifted probabilities are derived from ``f_delta_k`` and
    the binomial quantiles at ``f_alpha_k / 2`` and ``1 - f_alpha_k / 2``
    (with ``len(pd_order_z)`` trials) are used as row labels into the
    ``'mean'`` column.

    Returns
    -------
    list
        ``[CI_u, CI_l]``.
    """
    def _active_volume(label):
        # Only entries explicitly activated with the literal ``True`` count.
        return sum(c.f_volume for c in l_subr
                   if c.s_label == label and c.b_activate is True)

    f_vol_S = l_subr[0].f_volume
    f_vol_C = _active_volume('C')
    f_vol_P = _active_volume('P')
    f_vol_M = _active_volume('M')

    f_delta_kl = f_delta_k - float(f_vol_P * f_epsilon) / (f_vol_S * f_vol_C)
    f_delta_ku = f_delta_k + float(f_vol_M * f_epsilon) / (f_vol_S * f_vol_C)

    n_rows = len(pd_order_z)
    f_max_r = binom.ppf(f_alpha_k / 2, n_rows, f_delta_kl)
    f_min_s = binom.ppf(1 - f_alpha_k / 2, n_rows, f_delta_ku)
    # A NaN lower quantile falls back to row 0.
    if math.isnan(f_max_r):
        f_max_r = 0

    CI_l = pd_order_z.loc[f_max_r, 'mean']
    CI_u = pd_order_z.loc[f_min_s, 'mean']
    return [CI_u, CI_l]
def test_issue_5122():
    """Check ``binom.ppf`` for success probability p=0 and random trial counts."""
    p = 0
    n = np.random.randint(100, size=10)

    # q = 0
    assert_equal(binom.ppf(0, n, p), -1)

    # interior quantiles
    interior = np.linspace(0.01, 0.99, 10)
    assert_equal(binom.ppf(interior, n, p), 0)

    # q = 1
    assert_equal(binom.ppf(1, n, p), n)
def binom_confidence_interval(alpha, N_discr, p_discr):
    '''
    Two-sided interval of size 1 - p_discr for the observed success fraction
    of a binomial with N_discr trials and success probability alpha.

    Equivalently: in a two-sided test with significance level p_discr of
    "alpha != beta", the null hypothesis is not rejected when beta lies in
    (lower, upper), where N_discr is the number of trials and beta * N_discr
    the number of successful trials.

    Returns the pair (lower, upper): the p_discr / 2 and 1 - p_discr / 2
    binomial quantiles, each divided by N_discr.
    '''
    half_tail = p_discr / 2
    lower, upper = (binom.ppf(q, N_discr, alpha) * 1. / N_discr
                    for q in (half_tail, 1 - half_tail))
    return lower, upper
def binom_confidence_interval(alpha, N_discr, p_discr):
    '''
    Return (lower, upper), a two-sided interval of size 1 - p_discr for the
    success fraction of Binomial(N_discr, alpha).

    Read as a test: with significance level p_discr for "alpha != beta", the
    null hypothesis is kept when beta is inside (lower, upper); N_discr is
    the number of trials and beta * N_discr the number of successful trials.
    '''
    def _scaled_quantile(q):
        # Binomial quantile expressed as a fraction of the trial count.
        return binom.ppf(q, N_discr, alpha) * 1. / N_discr

    return _scaled_quantile(p_discr / 2), _scaled_quantile(1 - p_discr / 2)
def err(ci, k, j):
    """Relative deviation of the ``ci`` quantile of Binomial(k, j) from ``j * k``.

    Returns ``1e9`` when the quantile is 0.
    """
    quantile = binom.ppf(ci, k, j)
    # A zero quantile is an edge case, so a big error is reported instead.
    return 1e9 if quantile == 0 else abs(quantile / (j * k) - 1)
def mcnemar_test(cont_table):
    """McNemar's test for paired nominal data in a 2x2 contingency table.

    Tests whether the marginal row and column probabilities are equal, using
    the off-diagonal counts ``b = table[0, 1]`` and ``c = table[1, 0]``.
    (statsmodels offers a comparable test as ``mcnemar``.)

    Parameters
    ----------
    cont_table: list or numpy array, 2 x 2
        A 2x2 contingency table

    Return
    ------
    chi_squared: float
        For ``b + c > 25`` the continuity-corrected statistic
        ``(|b - c| - 1)^2 / (b + c)``; otherwise ``min(b, c)``.
    p: float
        The p-value belonging to that statistic.

    Raises
    ------
    AttributeError
        If the checked table is not 2x2.
    """
    cont_table = _check_table(cont_table, True)
    if cont_table.shape != (2, 2):
        raise AttributeError(
            "McNemar's Test is meant for a 2x2 contingency table")

    b = cont_table[0, 1]
    c = cont_table[1, 0]
    n_discordant = b + c

    if n_discordant > 25:
        # Large sample: chi-square approximation with one degree of freedom.
        chi_squared = (abs(b - c) - 1) ** 2 / n_discordant
        return chi_squared, 1 - chi2.cdf(chi_squared, 1)

    # Small sample: binomial computation.
    # NOTE(review): the subtracted term is the pmf at the 99% quantile, not
    # at min(b, c) -- confirm this is the intended correction.
    chi_squared = min(b, c)
    upper_quantile = binom.ppf(0.99, n_discordant, 0.5)
    p = (2 * binom.cdf(chi_squared, n_discordant, 0.5)
         - binom.pmf(upper_quantile, n_discordant, 0.5))
    return chi_squared, p
def binomInvCDF(prob, n, p, loc):
    """Inverse CDF (quantile function) of a shifted binomial distribution.

    Parameters
    ----------
    prob :
        Quantile level(s) passed to ``binom.ppf``.
    n : int
        Number of trials; must be a positive ``int``.
    p : float
        Success probability; must satisfy ``0 < p < 1``.
    loc :
        Location shift forwarded to ``binom.ppf``.

    Returns
    -------
    The quantile, or ``None`` when ``p`` is outside the open interval (0, 1).

    Raises
    ------
    ValueError
        If ``n`` is not a positive integer.
    """
    # Invalid probability: bail out without a value.
    if not (0 < p < 1):
        return None
    if not (isinstance(n, int) and n > 0):
        raise ValueError("Error: n must be a positive integer")
    # ``loc`` is the keyword scipy uses for the location shift.
    return binom.ppf(prob, n, p, loc=loc)
def qbinom(q, size=1, prob=0.5, lowertail=True):
    """
    ============================================================================
                                                                        qbinom()
    ============================================================================
    The quantile function for the binomial distribution.

    You provide a quantile level (eg q=0.75) or an array of levels, and it
    returns the value(s) of the binomial distribution at those quantiles.

    USAGE:
        qbinom(q, size, prob=0.5, lowertail=True)

    :param q:         float, or array of floats. The quantile level(s)
    :param size:      int. Number of trials
    :param prob:      float. Probability of a success
    :param lowertail: bool. If True ``q`` is a lower-tail probability
                      (``binom.ppf``); otherwise an upper-tail probability
                      (``binom.isf``)
    :return:          the value(s) corresponding to the quantiles q. Levels
                      for which scipy returns NaN stay NaN.
    ============================================================================
    """
    import numpy as np  # local import so the block does not rely on a module-level alias

    if lowertail:
        result = binom.ppf(q=q, n=size, p=prob)
    else:
        result = binom.isf(q=q, n=size, p=prob)
    # scipy reports -1 (one below the support) at the degenerate end-point,
    # e.g. ppf(0); the smallest attainable value of a binomial is 0.
    return np.maximum(result, 0)
def err(ci, k, j):
    """Return ``|q / (j * k) - 1|`` where ``q`` is the ``ci`` quantile of Binomial(k, j).

    A quantile of exactly 0 yields the sentinel error ``1e9``.
    """
    n = binom.ppf(ci, k, j)
    if n != 0:
        return abs(n / (j * k) - 1)
    # Edge case: nothing to compare against, so report a big error.
    return 1e9
def get_FDR_cutoff_binom(readlengths, genelength, alpha=0.05, mincut=2):
    '''
    Model peak height with a binomial distribution and return the FDR cutoff
    (number of reads needed to reach the FDR).

    :param readlengths: list, read lengths in a genomic region
    :param genelength: int, length of the genomic region
    :param alpha: float, default = 0.05, FDR alpha value
    :param mincut: int, default 2, minimal peak height (reads per position);
        returned whenever the computed cutoff is smaller, or when there are
        no reads
    :return: int, minimal peak height required to reach the FDR
    :raises ValueError: if mean read length / genelength exceeds 1
    '''
    number_reads = len(readlengths)
    if number_reads == 0:
        return mincut

    read_length = np.array(readlengths)
    mean_read_length = np.mean(read_length)
    prob = float(mean_read_length) / float(genelength)
    if prob > 1:
        raise ValueError("probability of >= 1 read per-base > 1")

    try:
        # Percent point function (inverse cdf): the read count at the
        # 1 - alpha quantile.
        k = int(binom.ppf(1 - (alpha), number_reads, prob))
        return mincut if k < mincut else k
    except:
        # Dump the inputs for debugging, then let the error propagate.
        print(read_length, mean_read_length, genelength, prob, alpha,
              number_reads)
        raise
def _compute_bad_bins_all_channels(self):
    """Identify bins whose transit count is too high across all channels.

    Transits of TCEs already flagged through ``mask_transits_in_bad_bins``
    are left out. The per-bin count of the remaining transits is compared
    with the ``1 - probability_threshold_combined`` quantile of a binomial
    with ``n_tces`` trials and probability ``median count / n_tces``.

    Sets ``self.mask_transits_in_bad_bin_ids`` and returns the index of the
    bins to remove.
    """
    flagged_tces = self.transits[self.mask_transits_in_bad_bins].tce.unique()
    mask_transits_to_remove = self.transits.tce.isin(flagged_tces)

    # Make a cut across all channels
    kept_transits = self.transits[~mask_transits_to_remove]
    total_transit_count = kept_transits.groupby('bin_id').size()
    n_tces = kept_transits.tce.unique().size
    p_transit = total_transit_count.median() / n_tces
    total_count_threshold = binom.ppf(
        1 - self.probability_threshold_combined, int(n_tces), p_transit)

    too_many = total_transit_count > total_count_threshold
    bins_to_remove = total_transit_count[too_many].index
    print('Identified {} bad bins for all channels.'.format(
        len(bins_to_remove)))

    self.mask_transits_in_bad_bin_ids = self.transits['bin_id'].isin(
        bins_to_remove)
    print('Flagged {} out of {} transits as suspicious.'.format(
        self.mask_transits_in_bad_bin_ids.sum(),
        len(self.mask_transits_in_bad_bin_ids)))
    return bins_to_remove
def _compute_binomial_thresholds(self):
    """Add expected-rate and threshold columns to ``self.rates``.

    For every (skygroup, season) index entry, the reference rows are those of
    the same skygroup in the *other* seasons whose channel is not listed in
    ``OUTLIER_CHANNELS``. From them the median TCE count, the median transits
    per bin and a binomial threshold at ``1 - probability_threshold`` are
    derived and stored in three new columns. Returns ``rates``.
    """
    rates = self.rates
    tce_expect_col = []
    rate_expected_col = []
    rate_threshold_col = []

    for skygroup, season in rates.index:
        same_skygroup = rates.index.get_level_values('skygroup') == skygroup
        other_season = rates.index.get_level_values('season') != season
        mask_reference = (same_skygroup
                          & other_season
                          & ~rates.channel.isin(OUTLIER_CHANNELS))
        reference = rates[mask_reference]

        # Compute the probability for a TCE to produce a transit in a given bin
        n_transits_per_bin = self.binsize * reference.transits_per_day
        n_tces = reference.n_tces
        mean_transit_probability = (n_transits_per_bin / n_tces).mean()

        rate_expected_col.append(n_transits_per_bin.median())
        tce_expect_col.append(n_tces.median())
        threshold = binom.ppf(1 - self.probability_threshold,
                              int(n_tces.median()),
                              mean_transit_probability)
        rate_threshold_col.append(int(threshold))

    rates['n_tces_expected'] = tce_expect_col
    rates['transit_rate_expected'] = rate_expected_col
    rates['transit_rate_threshold'] = rate_threshold_col
    return rates
def binomial_dist(self, value, bound_min, bound_max, population):
    """Draw how many of ``population`` items fall below ``value``.

    For a positive population the count is sampled from
    Binomial(population, p) with
    ``p = (value - bound_min) / (bound_max - bound_min)`` via the inverse-CDF
    method. Otherwise items are drawn one at a time from
    ``random.uniform(bound_min, bound_max)`` and those below ``value`` are
    counted.
    """
    # FIXME Population should never be inferior to zero !
    if not population > 0:
        moving_population = 0
        for _ in range(int(population)):
            if random.uniform(bound_min, bound_max) < value:
                moving_population += 1
        return moving_population

    # Probability of picking a number
    # - between bound_min and bound_max
    # - AND smaller than value
    p = (value - bound_min) / (bound_max - bound_min)
    assert 0 <= p <= 1, "Not a probability !?"

    # Inverse-CDF method: map a uniform draw to a number in [0, population]
    # that follows a binomial distribution.
    uniform_draw = np.random.uniform()
    return binom.ppf(uniform_draw, population, p)
def get_binomial_table(p = 0.5, alpha = 0.05, trial_range = 8):
    '''Compute the numbers of points from the :math:`\\delta`-neighborhood,
    which need to fall outside the :math:`\\varepsilon`-neighborhood, in order
    to reject the Null Hypothesis at a significance level :math:`\\alpha`.

    Parameters
    ----------
    p : `float`, optional
        Binomial p (Default is `p = 0.5`).
    alpha : `float`, optional
        Significance level for rejecting the Null on the basis of the
        binomial distribution (Default is `alpha = 0.05`).
    trial_range : `int`, optional
        Number of considered delta-neighborhood sizes
        (Default is `trial_range = 8`). Must be at least 1.

    Returns
    -------
    delta_to_epsilon_amount : `dict`
        Maps each number of `delta_points` to the number of `epsilon_points`
        needed to reject the Null, i.e. the integer part of the
        ``1 - alpha`` binomial quantile.

    Notes
    -----
    The number of trials of the binomial distribution is the size of the
    :math:`\\delta`-neighborhood. Sizes always start at 8, so
    `trial_range = 8` covers neighborhood sizes 8 up to 15.
    '''
    assert trial_range >= 1

    first_size = 8
    return {
        n_points: int(binom.ppf(1-alpha, n_points, p))
        for n_points in range(first_size, first_size + trial_range)
    }
def get_covarying_errors(self):
    """Find nucleotide counts at covarying sites that look like errors.

    An error rate is estimated from the non-covarying rows as
    ``|coverage - nucleotide_max| / nucleotide_max`` (column sums). For every
    site the ``1 - error_threshold`` binomial quantile of its coverage gives
    ``n_error``. Among the sites listed in ``self.covarying_sites``, the
    A/C/G/T counts that are positive but not above ``n_error`` are returned
    (sorted by site) and stored in ``self.covarying_errors``.
    """
    counts = self.get_nucleotide_counts()

    totals = counts.loc[
        ~counts.covarying,
        ['nucleotide_max', 'coverage']
    ].sum()
    total_coverage = totals['coverage']
    total_consensus = totals['nucleotide_max']
    error_rate = np.abs(total_coverage - total_consensus) / total_consensus

    def _max_errors(coverage):
        # Largest count still compatible with the estimated error rate.
        return binom.ppf(1-self.error_threshold, coverage, error_rate)

    counts.loc[:, 'n_error'] = counts.loc[:, 'coverage'].apply(_max_errors)
    counts.loc[:, 'site'] = counts.index
    # Re-derive the covarying flag from the configured site list.
    counts.loc[:, 'covarying'] = False
    counts.loc[self.covarying_sites, 'covarying'] = True

    # Long format: one row per (site, nucleotide).
    site_counts = counts.loc[counts['covarying'], :].melt(
        id_vars=['n_error', 'site'],
        value_vars=['A', 'C', 'G', 'T']
    )
    observed = site_counts['value']
    allowed = site_counts['n_error']
    significant = (observed <= allowed) & (observed > 0)

    covarying_errors = site_counts.loc[significant, :] \
        .sort_values(by='site') \
        .reset_index(drop=True)
    self.covarying_errors = covarying_errors
    return covarying_errors
def qbinom(p, size, prob=0.5):
    """
    Quantile function of the binomial distribution.

    ``p`` is the quantile level, ``size`` the number of trials and ``prob``
    the success probability.
    """
    from scipy.stats import binom
    return binom.ppf(q=p, n=size, p=prob, loc=0)
def find_sample_size_for_stopping_prob_r2bravo(stopping_probability, N_w, N_l, alpha,
                                               underlying=None, right=None):
    """
    Finds the first round size that achieves the passed stopping_probability
    for an R2 Bravo audit (with no stratification).

    A binary search over round sizes between 1 and ``N = N_w + N_l`` is run.
    For each candidate ``n`` the ``1 - stopping_probability`` quantile of the
    winner count under the alternative is computed and its p-value compared
    with ``alpha``.

    ``underlying`` is not used and the passed ``right`` is overwritten with
    ``N``; both are kept so existing callers keep working.

    Returns the upper search bound once the bounds have converged. A message
    is printed when that bound is ``N`` itself.
    """
    N = N_w + N_l
    left = 1
    right = N

    while True:
        n = math.ceil((left + right) / 2)

        # compute the 1 - stopping_probability quantile of the alt dist
        # kmax where pr[k >= kmax | alt] = stopping_probability
        # floor because we need to ensure at least a stopping_probability prob of stopping
        kmax = math.floor(binom.ppf(1 - stopping_probability, n, N_w / N))

        # compute pvalue for this kmax
        pvalue = r2bravo_pvalue_direct_count(winner_votes=kmax, n=n, popsize=N, alpha=alpha,
                                             Vw=N_w, Vl=N_l, null_margin=0)

        # update binary search bounds
        if pvalue > alpha:
            left = n
        elif pvalue <= alpha:
            right = n

        # when left and right converge, right is the minimum round size that
        # achieves stopping_probability. ``>=`` (not ``==``) so the search
        # also stops when the bounds meet (left == right), which happens for
        # N <= 2 and previously looped forever.
        if left >= right - 1:
            if right == N:
                print("required round size is greater than stratum size")
            return right
def monitor(self, data, model_id=0):
    """Monitor ``data`` against the operation mode ``self.models[model_id]``.

    Returns a tuple of:
      - the monitored statistic (filtered negative log-likelihood),
      - the threshold of the selected operation mode,
      - a flag telling whether the behaviour is out of the operation mode,
      - the number of samples beyond the threshold,
      - the data points that were beyond the threshold,
      - the index of the operation mode.
    """
    gamma = 0.4  # Filter rate.
    n_samples = data.shape[0]

    # Get Operation Mode
    op_mode = self.models[model_id]

    # Limit on the number of out-of-bounds samples before the data is
    # declared out of the model.
    limit = np.round(
        binom.ppf(op_mode.confidence, n_samples, 1 - op_mode.confidence))

    # Compute the log likelihood and filter it.
    logprob, responsability = op_mode.model.score_samples(data)
    filtered_stats = exponential_filter(logprob, gamma)

    # Samples whose filtered negative log-likelihood exceeds the threshold.
    idx_out = -filtered_stats > op_mode.threshold
    num_out = np.sum(idx_out)
    out = num_out > limit
    data_out = data[idx_out, ]

    return (-filtered_stats, op_mode.threshold, out, num_out, data_out,
            model_id)
def compute_unconditional_power(margin, N_wl, pi, alpha):
    '''
    Compute unconditional power of the test.

    margin = vote margin (votes for w / votes for w or l) in the population
    N_wl = the total number of ballots for either the winner or loser in the
        population
    pi = the sampling probability
    alpha = the type I error rate

    ``compute`` is evaluated in parallel for every draw size between the 0.5%
    (inclusive) and 99.5% (exclusive) quantiles of Binomial(N_wl, pi), and
    the results are summed.
    '''
    first_draw = int(binom.ppf(0.005, N_wl, pi))
    stop_draw = int(binom.ppf(0.995, N_wl, pi))

    tasks = (delayed(compute)(margin, N_wl, pi, alpha, n)
             for n in range(first_draw, stop_draw))
    powers = Parallel(n_jobs=num_cores)(tasks)
    return sum(powers)
def get_probable_maximum_selected(
        n_total_trials, n_trials, selection_prob, chance=(1.0 / 100.0)):
    """ Get the likely maximum number of items that will be selected from a
        set of n_trials, out of a total set of n_total_trials, when each item
        is selected with probability selection_prob.

        The result is the binomial quantile at level
        ``1 - chance / n_total_trials``.
    """
    quantile_level = 1.0 - (chance / float(n_total_trials))
    probable_maximum = binom.ppf(quantile_level, n_trials, selection_prob)
    return probable_maximum
def parallel_forward_binom_step(self, dB: int = 0, num_sims=10000):
    """Advance the S/I/R/D state by one step for ``num_sims`` simulations.

    New cases are drawn from a binomial, deaths and recoveries from Poisson
    distributions; the new state is appended to the history lists on
    ``self``. ``dB`` is not used in this body.

    NOTE(review): the state entries are presumably numpy arrays with one
    value per simulation -- confirm against the code that initialises them.
    """
    # get previous state
    # (copies, so the in-place updates below leave the stored history intact)
    S, I, R, D, N = (vector[-1].copy() for vector in (self.S, self.I, self.R, self.D, self.N))

    # update state
    Rt = self.Rt0 * S / N
    p = self.gamma * Rt * I / N
    num_cases = binom.rvs(n=S.astype(int), p=p, size=num_sims)
    # record the binomial quantiles at CI and 1 - CI for the new cases
    self.upper_CI.append(binom.ppf(self.CI, n=S.astype(int), p=p))
    self.lower_CI.append(binom.ppf(1 - self.CI, n=S.astype(int), p=p))

    I += num_cases
    S -= num_cases

    # deaths: Poisson with rate m * gamma * I
    rate_D = self.m * self.gamma * I
    num_dead = poisson.rvs(rate_D, size=num_sims)
    D += num_dead

    # recoveries: Poisson with rate (1 - m) * gamma * I
    rate_R = (1 - self.m) * self.gamma * I
    num_recov = poisson.rvs(rate_R, size=num_sims)
    R += num_recov

    I -= (num_dead + num_recov)

    # compartments cannot go negative
    S = S.clip(0)
    I = I.clip(0)
    D = D.clip(0)

    N = S + I + R
    # beta = (num_cases * N)/(b * S * I)

    # update state vectors
    self.Rt.append(Rt)
    # self.b.append(b)
    self.S.append(S)
    self.I.append(I)
    self.R.append(R)
    self.D.append(D)
    self.N.append(N)
    # self.beta.append(beta)
    self.dT.append(num_cases)
    self.total_cases.append(I + R + D)
def testBinom():  # {{{
    """
    Binomial Distribution (discrete)

    Example of a binomial distribution: when tossing a coin 10 times, what is
    the probability of getting heads exactly twice?

    An event either happens or it does not.
    """
    # Data: n (number of Bernoulli trials) and p (probability of the event)
    # are known.
    # X axis: the event occurs k times in n trials
    # Y axis: probability
    n = 100  # for large n (np > 5 and nq > 5), approximately X ~ N(np, npq)
    p = 0.5
    xs = np.arange(binom.ppf(0.01, n, p), binom.ppf(0.99, n, p))

    # E(X) = np, D(X) = np(1-p)
    mean, var, skew, kurt = binom.stats(n, p, loc=0, moments='mvsk')
    print("mean: %.2f, var: %.2f, skew: %.2f, kurt: %.2f" % (mean, var, skew, kurt))

    fig, axs = plt.subplots(1, 3)

    # Show the pmf, then the cdf
    for panel, curve, label in ((axs[0], binom.pmf, 'binom pmf'),
                                (axs[1], binom.cdf, 'binom cdf')):
        ys = curve(xs, n, p)
        panel.plot(xs, ys, 'bo', markersize=5, label=label)
        panel.legend()

    # Random variates (RVS)
    data = binom.rvs(n, p, size=1000)
    import sys
    sys.path.append("../../thinkstats")
    import Pmf
    pmf = Pmf.MakePmfFromList(data)
    xs, ys = pmf.Render()
    axs[2].plot(xs, ys, 'bo', markersize=5, label='rvs pmf')
    axs[2].legend()

    plt.show()
def plot_with_uniform_band(values, ci_level, x_label, n_bins=30, figsize=(10, 4), ylim=[0, 50]):
    '''
    Plots the PIT/HPD histogram together with the confidence band the bin
    counts would have if the PIT/HPD values followed a uniform distribution.

    @param values: a numpy array with PIT/HPD values
    @param ci_level: a float between 0 and 1 indicating the size of the confidence level
    @param x_label: a string, populates the x_label of the plot
    @param n_bins: an integer, the number of bins in the histogram
    @param figsize: a tuple, the plot size (width, height)
    @param ylim: a list of two elements, the lower and upper limit for the y axis

    @returns The matplotlib figure object with the histogram of the PIT/HPD
    values and the CI for the uniform distribution (the figure is closed
    before it is returned)
    '''
    # Number of values
    n = values.shape[0]

    # Lower and upper limit of the band: binomial quantiles for n draws
    # landing in one of n_bins equally likely bins.
    ci_quantity = (1 - ci_level) / 2
    bin_prob = 1 / n_bins
    low_lim = binom.ppf(q=ci_quantity, n=n, p=bin_prob)
    upp_lim = binom.ppf(q=ci_level + ci_quantity, n=n, p=bin_prob)

    # Creating figure
    fig = plt.figure(figsize=figsize)
    plt.hist(values, bins=n_bins)
    for limit in (low_lim, upp_lim):
        plt.axhline(y=limit, color='grey')
    plt.axhline(y=n / n_bins, label='Uniform Average', color='red')
    plt.fill_between(x=np.linspace(0, 1, 100),
                     y1=np.repeat(low_lim, 100),
                     y2=np.repeat(upp_lim, 100),
                     color='grey', alpha=0.2)

    plt.legend(loc='best', prop={'size': 18})
    plt.xlabel(x_label, size=20)
    plt.ylim(ylim)
    plt.xticks(size=16)
    plt.yticks(size=16)
    plt.close()
    return fig
def npfs(X, y, n_select, base="mim", alpha=.01, n_bootstraps=100):
    """
    Bootstrap a FEAST feature selector and keep the features that are
    selected more often than a binomial threshold.

    Parameters
    ----------
    X : array-like, shape = (n_samples, n_features_in)
        Sample vectors.
    y : array-like, shape = (n_samples,)
        Target vector (class labels).
    n_select : int
        Number of features the base method selects per bootstrap.
    base : string
        PyFeast feature selection method. ['mim', 'mrmr', 'jmi']
    alpha : double
        Size of the hypothesis test for NPFS
    n_bootstraps : int
        Number of bootstraps

    Returns
    -------
    selections : list
        Indices of the selected features. Length is variable.
    bern_matrix : array, shape = (n_features_in, n_bootstraps)
        1 where a feature was selected in a bootstrap, 0 otherwise.
    delta : float
        The ``1 - alpha`` quantile of
        Binomial(n_bootstraps, n_select / n_features_in).

    Raises
    ------
    AttributeError
        If ``base`` is not a method of ``feast``.
    ValueError
        If ``len(y)`` differs from ``X.shape[0]``.
    """
    try:
        fs_method = getattr(feast, base)
    except AttributeError:
        # getattr signals a missing method with AttributeError.
        raise AttributeError("Method does not exist in FEAST")

    n_samp, n_feat = X.shape
    # Validate before any work is done on the data.
    if n_samp != len(y):
        raise ValueError('len(y) and X.shape[0] must be equal.')

    X = bin_data(X, n_bins=np.sqrt(n_samp))

    bern_matrix = np.zeros((n_feat, n_bootstraps))
    for n in range(n_bootstraps):
        # generate a random sample (with replacement)
        idx = np.random.randint(0, n_samp, n_samp)
        sels = fs_method(1.0*X[idx], y[idx], n_select)
        b_sels = np.zeros((n_feat,))
        b_sels[sels] = 1.
        bern_matrix[:, n] = b_sels

    delta = binom.ppf(1-alpha, n_bootstraps, 1.*n_select/n_feat)
    z = np.sum(bern_matrix, axis=1)

    selections = [k for k in range(n_feat) if z[k] > delta]
    return selections, bern_matrix, delta
def generate_multivariate_binomial(cpu, mem, num_tasks):
    """Generate ``num_tasks`` negatively correlated (cpu, mem, value) triples.

    Three standard normals with pairwise covariance -0.5 are drawn per task.
    Through the normal CDF the first is mapped to a Binomial(cpu, 8/cpu)
    quantile, the second to a Binomial(mem, 8/mem) quantile and the third to
    a value in [1, 100].

    Returns three lists: cpus, mems, values.
    """
    mean = [0, 0, 0]
    cov = [[1, -0.5, -0.5],
           [-0.5, 1, -0.5],
           [-0.5, -0.5, 1]]
    x, y, z = np.random.multivariate_normal(mean, cov, num_tasks).T

    cpus = [binom.ppf(norm.cdf(ix), cpu, 8/cpu) for ix in x]
    mems = [binom.ppf(norm.cdf(iy), mem, 8/mem) for iy in y]
    values = [norm.cdf(iz)*(100-1)+1 for iz in z]
    return cpus, mems, values
def decision_threshold(overall=0.99, accuracy=0.95, samples=1000):
    """
    Calculate the decision threshold, based on the binomial distribution.

    :param overall: Certainty with which we want to call the image LevelA
    :param accuracy: Accuracy of predicting LevelB as LevelB
    :param samples: How many sub images there are in total
    :return: integer value; if more than this number of sub images are
        LevelA, the image is LevelA
    """
    error_rate = 1 - accuracy
    return int(binom.ppf(overall, samples, error_rate))
def fit(self, data, labels):
    """
    Run the bootstrapped feature selection and return the relevant features.

    @data - data in a numpy array;
      len(data) = n_observations
      len(data.transpose()) = n_features
    @labels - numerical class labels in a numpy array;
      len(labels) = n_observations

    Sets ``n_observations``, ``n_features``, ``method``, ``data``,
    ``labels``, ``Bernoulli_matrix`` and ``selected_features`` on ``self``
    and returns ``selected_features`` (a numpy array of feature indices).

    Raises AttributeError if ``self.fs_method`` is not a method of ``feast``
    and ValueError if ``n_select / n_features + beta`` exceeds 1.
    """
    data, labels = self.__check_data(data, labels)

    try:
        fs_method = getattr(feast, self.fs_method)
    except AttributeError:
        # getattr signals a missing method with AttributeError.
        raise AttributeError("Method does not exist in FEAST")

    self.n_observations = len(data)
    self.n_features = len(data.transpose())
    self.method = fs_method

    # @Z - contains the observations of the Bernoulli random variables
    #      that are whether the feature were or were not selected
    Z = np.zeros((self.n_features, self.n_bootstraps))
    self.data = data
    self.labels = labels

    if self.parallel is None:
        for b in range(self.n_bootstraps):
            sf = self.boot_iteration()
            Z[sf, b] = 1  # mark the features selected with a '1'.
    else:
        pool = Pool(processes=self.parallel)
        try:
            sfs = pool.map(__call__, (self for x in range(self.n_bootstraps)))
        finally:
            # Always release the worker processes.
            pool.close()
            pool.join()
        for b, sf in enumerate(sfs):
            Z[sf, b] = 1

    z = np.sum(Z, axis=1)  # z is a binomial random variable

    # compute the neyman-pearson threshold (include the bias term)
    p = (1.0*self.n_select)/self.n_features + self.beta
    if p > 1.0:
        # user chose \beta poorly
        raise ValueError("p+beta > 1 -> Invalid probability")
    delta = binom.ppf(1 - self.alpha, self.n_bootstraps, p)

    # based on the threshold, determine which features are relevant and
    # return them in a numpy array
    selected_features = [k for k in range(self.n_features) if z[k] > delta]

    self.Bernoulli_matrix = Z
    self.selected_features = np.array(selected_features)
    return self.selected_features
def qbinom(p, n):
    """
    Quantile function of the binomial with success probability 0.5.

    Returns the smallest k such that Prob(X <= k) >= p (compare R's qbinom).

    :param n: number of trials
    :param p: quantile level
    :return: k
    """
    success_probability = 0.5
    k = binom.ppf(p, n, success_probability)
    return k
def test_merge_by_weight(self):
    """Merging two segments keeps the weight-1/3 segment about 1/3 of the time.

    Two segments with weights 1/3 and 2/3 share a single bin whose target
    count is 1. Over ``nrounds`` rounds the number of times segment 0 is the
    parent of the surviving segment must fall inside the central
    ``1 - alpha`` binomial interval.
    """
    selected_counts = {0: 0, 1: 0}
    alpha = 0.01
    nrounds = 1000
    from scipy.stats import binom

    # lower and upper bounds of the (1 - alpha) = 99% CI for selecting the
    # segment with weight 1/3
    lb = binom.ppf(alpha/2.0, nrounds, 1.0/3.0)
    ub = binom.ppf(1.0-alpha/2.0, nrounds, 1.0/3.0)

    system = WESTSystem()
    system.bin_mapper = RectilinearBinMapper([[0.0, 1.0]])
    system.bin_target_counts = numpy.array([1])
    system.pcoord_len = 2
    self.we_driver = WEDriver(system=system)
    self.system = system
    self._seg_id = 0

    segments = [Segment(n_iter=1, seg_id=0,
                        pcoord=numpy.array([[0], [0.25]], dtype=numpy.float32),
                        weight=1.0/3.0),
                Segment(n_iter=1, seg_id=1,
                        pcoord=numpy.array([[0], [0.75]], dtype=numpy.float32),
                        weight=2.0/3.0)]

    # ``range`` works on Python 2 and 3 alike; ``xrange`` exists on Python 2 only.
    for _iround in range(nrounds):
        for segment in segments:
            segment.endpoint_type = Segment.SEG_ENDPOINT_UNSET

        self.we_driver.new_iteration()
        self.we_driver.assign(segments)
        self.we_driver.construct_next()

        assert len(self.we_driver.next_iter_binning[0]) == 1
        newseg = self.we_driver.next_iter_binning[0].pop()

        assert segments[newseg.parent_id].endpoint_type == Segment.SEG_ENDPOINT_CONTINUES
        # ~0 == -1 and ~1 == -2, so in a two-element list ``~parent_id``
        # indexes the *other* segment.
        assert segments[~newseg.parent_id].endpoint_type == Segment.SEG_ENDPOINT_MERGED

        selected_counts[newseg.parent_id] += 1

    print(selected_counts)
    assert lb <= selected_counts[0] <= ub, (
        'Incorrect proportion of histories selected. '
        'this is expected about {:%} of the time; retry test.'.format(alpha))
def npfs_chi2(X, y, fpr=0.05, alpha=.01, n_bootstraps=100):
    """
    Bootstrap a chi2-test feature selector and keep the features that are
    selected more often than a binomial threshold.

    Parameters
    ----------
    X : array-like, shape = (n_samples, n_features_in)
        Sample vectors.
    y : array-like, shape = (n_samples,)
        Target vector (class labels).
    fpr : double
        False positive rate for the Chi2-test feature selection approach
    alpha : double
        Size of the hypothesis test for NPFS
    n_bootstraps : int
        Number of bootstraps

    Returns
    -------
    selections : list
        Indices of the selected features. Length is variable.
    bern_matrix : array, shape = (n_features_in, n_bootstraps)
        1 where a feature's p-value was <= ``fpr`` in a bootstrap.
    delta : float
        The ``1 - alpha`` quantile of Binomial(n_bootstraps, fpr).

    Raises
    ------
    ValueError
        If ``len(y)`` differs from ``X.shape[0]``.
    """
    n_samp, n_feat = X.shape
    # Validate before any work is done on the data.
    if n_samp != len(y):
        raise ValueError('len(y) and X.shape[0] must be equal.')

    X = bin_data(X, n_bins=np.sqrt(n_samp))

    bern_matrix = np.zeros((n_feat, n_bootstraps))
    for n in range(n_bootstraps):
        # generate a random sample (with replacement)
        idx = np.random.randint(0, n_samp, n_samp)
        chi, pval = chi2(1.0*X[idx], y[idx])
        sels = np.where(pval <= fpr)
        b_sels = np.zeros((n_feat,))
        b_sels[sels] = 1.
        bern_matrix[:, n] = b_sels

    delta = binom.ppf(1-alpha, n_bootstraps, fpr)
    z = np.sum(bern_matrix, axis=1)

    selections = [k for k in range(n_feat) if z[k] > delta]
    return selections, bern_matrix, delta
def get_Binom_cutoff(readlengths, genelength, alpha, mincut=2):
    """Return the binomial read-count cutoff for a region, at least ``mincut``.

    The cutoff is the integer part of the ``1 - alpha`` quantile of
    Binomial(number of reads, mean read length / genelength). With no reads
    ``mincut`` is returned directly.
    """
    n_reads = len(readlengths)
    if n_reads == 0:
        return mincut

    mean_length = numpy.mean(numpy.array(readlengths))
    per_base_prob = float(mean_length)/float(genelength)
    k = int(binom.ppf(1-(alpha), n_reads, per_base_prob))
    return mincut if k < mincut else k
def get_FDR_cutoff_binom(readlengths, genelength, alpha, mincut = 2):
    """Return the number of reads per position needed to reach the FDR.

    Peak height is modelled as Binomial(number of reads,
    mean read length / genelength); the cutoff is the integer part of its
    ``1 - alpha`` quantile, but never less than ``mincut``. With no reads
    ``mincut`` is returned directly.

    Raises ValueError when the per-base probability exceeds 1.
    """
    number_reads = len(readlengths)
    if number_reads == 0:
        return mincut

    read_length = numpy.array(readlengths)
    mean_read_length = numpy.mean(read_length)
    prob = float(mean_read_length) / float(genelength)
    # A probability above 1 has no valid quantile (it would only fail later
    # when converting NaN to int), so reject it up front.
    if prob > 1:
        raise ValueError("probability of >= 1 read per-base > 1")

    try:
        k = int(binom.ppf(1 - (alpha), number_reads, prob))
    except Exception:
        # Dump the inputs for debugging, then let the error propagate.
        print(read_length, mean_read_length, prob, alpha, number_reads)
        raise
    return max(k, mincut)
def get_FDR_cutoff_binom(readlengths, genelength, alpha, mincut = 2):
    """Return the number of reads per position needed to reach the FDR.

    Peak height is modelled as Binomial(number of reads,
    mean read length / genelength); the cutoff is the integer part of its
    ``1 - alpha`` quantile, but never less than ``mincut``. With no reads
    ``mincut`` is returned directly.

    Raises ValueError when the per-base probability exceeds 1.
    """
    number_reads = len(readlengths)
    if number_reads == 0:
        return mincut

    read_length = numpy.array(readlengths)
    mean_read_length = numpy.mean(read_length)
    prob = float(mean_read_length) / float(genelength)
    if prob > 1:
        raise ValueError("probability of >= 1 read per-base > 1")

    try:
        k = int(binom.ppf(1 - (alpha), number_reads, prob))
    except Exception:
        # Dump the inputs for debugging, then let the error propagate.
        print(read_length, mean_read_length, genelength, prob, alpha,
              number_reads)
        raise
    return max(k, mincut)
def bino_p2da(y, p):
    """Decoding accuracy that corresponds to given p-values under the
    binomial law, for a vector of class labels.

    Args:
        y: array
            The label vector; its length is the number of epochs and its
            number of unique values the number of classes.

        p: int / float / list / array [0 <= p < 1]
            p-value(s). Ex : p = [0.05, 0.01, 0.001, 0.00001]

    Return:
        da: ndarray
            The decoding accuracy (in percent) associated with each p-value

    Raises:
        ValueError: if any p-value is >= 1.
    """
    labels = np.ravel(y)
    nbepoch = len(labels)
    nbclass = len(np.unique(labels))

    p = p if isinstance(p, np.ndarray) else np.array(p)
    if p.max() >= 1:
        raise ValueError('Consider 0<=p<1')

    chance_level = 1 / nbclass
    return binom.ppf(1 - p, nbepoch, chance_level) * 100 / nbepoch
def monitor(self, data, model_id=0):
    """Check ``data`` against the operation mode stored at ``self.models[model_id]``.

    Returns, in this order:
      1. the monitored statistic (filtered negative log-likelihood),
      2. the threshold of the selected operation mode,
      3. a flag: is the behaviour out of the operation mode?
      4. the number of samples beyond the threshold,
      5. the data points that were beyond the threshold,
      6. the index of the operation mode.
    """
    filter_rate = 0.4
    op_mode = self.models[model_id]

    # How many out-of-bounds samples are tolerated before the data counts as
    # out of the model.
    sample_count = data.shape[0]
    tolerated = binom.ppf(op_mode.confidence, sample_count, 1-op_mode.confidence)
    limit = np.round(tolerated)

    # Log likelihood, then exponential filtering.
    logprob, responsability = op_mode.model.score_samples(data)
    filtered_stats = exponential_filter(logprob, filter_rate)

    # Samples whose filtered negative log-likelihood exceeds the threshold.
    idx_out = -filtered_stats > op_mode.threshold
    num_out = np.sum(idx_out)
    data_out = data[idx_out,]
    out = num_out > limit

    return -filtered_stats, op_mode.threshold, out, num_out, data_out, model_id
import numpy as np
from scipy.stats import binom
import matplotlib.pyplot as plt

# Plot the pmf of Binomial(n=5, p=0.4) between its 1% and 99% quantiles.
fig, ax = plt.subplots(1, 1)

n = 5
p = 0.4
# First four moments: mean, variance, skewness, kurtosis.
mean, var, skew, kurt = binom.stats(n, p, moments='mvsk')

lowest = binom.ppf(0.01, n, p)
highest = binom.ppf(0.99, n, p)
x = np.arange(lowest, highest)

# Markers at the pmf values, with translucent stems down to the axis.
ax.plot(x, binom.pmf(x, n, p), 'bo', ms=8, label='binom pmf')
ax.vlines(x, 0, binom.pmf(x, n, p), colors='b', lw=5, alpha=0.5)

plt.show()
# Binomial distribution using scipy.stats
from scipy.stats import binom
import numpy as np
import matplotlib.pyplot as plt

fig, ax = plt.subplots(1, 1)

# Compute the first moments:
n = 5
p = 0.4
mean, var, skew, kurt = binom.stats(n, p, moments='mvsk')

# Display the pmf of the random variable (``pmf``):
lowest = binom.ppf(0.01, n, p)
highest = binom.ppf(0.99, n, p)
x = np.arange(lowest, highest)
ax.plot(x, binom.pmf(x, n, p), 'bo', ms=8, label='pmf binomial')
ax.vlines(x, 0, binom.pmf(x, n, p), colors='b', lw=5, alpha=0.5)
ax.legend(loc='best', frameon=False)

# Check the accuracy of ``cdf`` and ``ppf`` (the result is not used):
prob = binom.cdf(x, n, p)
np.allclose(x, binom.ppf(prob, n, p))

# Generate random numbers
r = binom.rvs(n, p, size=1000)

plt.show()
def sgn_test_threshold(count, p_value=0.05):
    """Return ``(count - q + 1) / count``, where ``q`` is the ``p_value``
    quantile of Binomial(count, 0.5)."""
    lower_quantile = binom.ppf(p_value, count, 0.5)
    return (count - lower_quantile + 1) / count
def binostat(y, p):
    """Decoding accuracy (in percent) at p-value ``p`` under the binomial law.

    The number of epochs is the length of the flattened label vector ``y``
    and the chance level is one over its number of unique labels. (``n`` is
    this module's numpy alias.)
    """
    labels = n.ravel(y)
    nbepoch = len(labels)
    nbclass = len(n.unique(labels))
    chance_level = 1 / nbclass
    return binom.ppf(1 - p, nbepoch, chance_level) * 100 / nbepoch
# Binomial(n=400, p=1/(1+b)): print a few moments, integrate the pmf with
# the trapezoid rule and plot it.
b = 33
fig, ax = plt.subplots(1, 1)
n = 400
step = 1
p = float(1) / float(1 + b)
mean, var, skew, kurt = binom.stats(n, p, moments='mvsk')

# print() calls instead of Python 2 print statements, which are a syntax
# error on Python 3.
print(binom.var(n, p))
print(binom.expect(lambda x: x, args=(n, p)))
print(binom.expect(lambda x: x ** 2, args=(n, p)))

# Support grid between the 0.1% and 99.9% quantiles and the pmf on it.
x = np.arange(binom.ppf(0.001, n, p), binom.ppf(0.999, n, p), step)
y = np.array(binom.pmf(x, n, p), dtype=float)


def squarer(pos1=1, pos2=len(x)):
    """Trapezoid-rule area under ``y`` over ``x`` for indices pos1 .. pos2-1.

    Each index ``i`` contributes the trapezoid between points ``i - 1`` and
    ``i``. A ``pos2`` beyond ``len(x)`` is reduced by ``len(x)``.
    """
    square = 0
    if pos2 > len(x):
        pos2 -= len(x)
    for i in range(pos1, pos2):
        square += (float(y[i - 1] + y[i]) / float(2)) * (x[i] - x[i - 1])
    return square


print("Square: ", squarer(2, 3))
print("Full square: ", squarer())

# ``y`` already holds binom.pmf(x, n, p); reuse it for the plot.
ax.plot(x, y, 'bo', ms=7, label='binom pmf')