def get_rule_rhs(Xtrain, Ytrain, d_t, alpha, intervals):
    N_t = compute_rule_usage(d_t, d_t.index(0), Xtrain, Ytrain)
    theta = []
    ci_theta = []
    for i, j in enumerate(d_t):
        # i is the index in the list, j is the global index of the rule
        if Ytrain.shape[-1] == 2:
            # theta ~ Dirichlet(N[j,:] + alpha)
            # E[theta] = (N[j,:] + alpha) / float(sum(N[j,:] + alpha))
            # NOTE this result is only for binary classification
            # theta = p(y=1)
            theta.append((N_t[i, 1] + alpha[1]) / float(sum(N_t[i, :] + alpha)))
            # And now the 95% interval, for Beta(N[j,1] + alpha[1], N[j,0] + alpha[0])
            if intervals:
                ci_theta.append(beta.interval(0.95, N_t[i, 1] + alpha[1], N_t[i, 0] + alpha[0]))
        else:
            # theta ~ Dirichlet(N[j,:] + alpha)
            # E[theta] = (N[j,:] + alpha) / float(sum(N[j,:] + alpha))
            theta.append((N_t[i, :] + alpha) / (N_t[i, :] + alpha).sum())
            # marginal of a Dirichlet is Beta:
            # X_i ~ Beta(alpha[i], alpha.sum() - alpha[i])
            if intervals:
                alpha_i = N_t[i, :] + alpha
                ci_theta.append(beta.interval(0.95, alpha_i, (N_t[i, :] + alpha).sum() - alpha_i))
    return theta, ci_theta
def get_distribution_limits(self) -> Tuple[float, float]:
    lower_limit_0, upper_limit_0 = beta.interval(SequenceAssociationLikelihood.DISTRIBUTION_PERCENTAGE_TO_SHOW,
                                                 self.method.alpha_0, self.method.beta_0)
    lower_limit_1, upper_limit_1 = beta.interval(SequenceAssociationLikelihood.DISTRIBUTION_PERCENTAGE_TO_SHOW,
                                                 self.method.alpha_1, self.method.beta_1)
    lower_limit = min(lower_limit_0, lower_limit_1)
    upper_limit = max(upper_limit_0, upper_limit_1)
    return lower_limit, upper_limit
def switching_BER(data, **kwargs):
    """
    Process data for BER experiment.
    """
    count_mat, start_stt = count_matrices_ber(data, **kwargs)
    switched_stt = int(1 - start_stt)
    mean = beta.mean(1 + count_mat[start_stt, switched_stt],
                     1 + count_mat[start_stt, start_stt])
    limit = beta.mean(1 + count_mat[start_stt, switched_stt] + count_mat[start_stt, start_stt], 1)
    ci68 = beta.interval(0.68, 1 + count_mat[start_stt, switched_stt],
                         1 + count_mat[start_stt, start_stt])
    ci95 = beta.interval(0.95, 1 + count_mat[start_stt, switched_stt],
                         1 + count_mat[start_stt, start_stt])
    return mean, limit, ci68, ci95
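# Hedged usage sketch of the switching_BER-style statistics above: given a toy 2x2
# count matrix (assumed layout: rows = start state, columns = end state), compute the
# posterior Beta mean and 68%/95% credible intervals for the switching probability
# under a Beta(1, 1) prior. The counts below are invented for illustration.
import numpy as np
from scipy.stats import beta

count_mat = np.array([[12, 88], [90, 10]])  # hypothetical counts
start_stt = 0
switched_stt = 1 - start_stt
n_switch = count_mat[start_stt, switched_stt]
n_stay = count_mat[start_stt, start_stt]
mean = beta.mean(1 + n_switch, 1 + n_stay)           # posterior mean
ci68 = beta.interval(0.68, 1 + n_switch, 1 + n_stay)
ci95 = beta.interval(0.95, 1 + n_switch, 1 + n_stay)
print(mean, ci68, ci95)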
def avg_y_by_x(x, y):
    x = np.array(x)
    y = np.array(y)
    xs = sorted(list(set(x)))
    xv = []
    yv = []
    lcb = []
    ucb = []
    n_obs = []
    for v in xs:
        ys = [y[i] for i, e in enumerate(x) if e == v]
        if len(ys) > 0:
            xv.append(v)
            yv.append(sum(ys) / len(ys))
            n_obs.append(len(ys))
            unique, counts = np.unique(ys, return_counts=True)
            counts = dict(zip(unique, counts))
            if 0 not in counts:
                counts[0] = 0
            if 1 not in counts:
                counts[1] = 0
            ci = beta.interval(0.95, 0.5 + counts[0], 0.5 + counts[1])
            lcb.append(ci[0])
            ucb.append(ci[1])
    return xv, yv, lcb, ucb, n_obs
def _interval(self, row):
    interval = beta.interval(
        self._interval_size,
        row[self._numerator_column] + self._alpha_prior,
        row[self._denominator_column] - row[self._numerator_column] + self._beta_prior)
    return interval
def part_a3(fname=fname):
    fname = fname + '_a3'
    # m = 5
    # theta_set = np.linspace(1/float(m), 1-1/float(m), m)
    # print(theta_set)
    a = 2
    b = 8
    n = 34
    y = 15
    a2 = y + a
    b2 = n + b - y
    x = np.linspace(0, 1, 100)
    fig = plt.figure()
    plt.plot(x, beta.pdf(x, a2, b2), lw=1.5)
    # hh = []
    # for i, theta in enumerate(theta_set):
    #     h, = plt.plot(x, binom.pmf(x, n, theta), lw=1.5, label=r'$\theta='+'{:.2f}'.format(theta)+'$')
    #     hh.append(h)
    plt.xlabel(r'$\theta$', fontsize=labelFontSize)
    plt.ylabel(r'$p(\theta \mid y)$', fontsize=labelFontSize)
    plt.title('3.4a' + r' $p(\theta \mid y)$', fontsize=titleFontSize)
    plt.xticks(fontsize=tickFontSize)
    plt.yticks(fontsize=tickFontSize)
    # plt.legend()
    plt.savefig(fname + '.' + imgFmt, format=imgFmt)
    # beta.interval takes the coverage probability, so a 95% CI needs 0.95, not 0.05
    print('95% CI = ', beta.interval(0.95, a2, b2))
def calculate_proportion(_x, _n):
    x = _x.round()
    n = _n.round()
    ci_low, ci_upp = beta.interval(1 - 0.05, x + 0.5, n - x + 0.5)  # Jeffreys interval
    est_proportion = _x / _n
    return est_proportion, ci_low, ci_upp
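# Hedged sketch of the Jeffreys interval used above: for x successes out of n trials,
# the 95% interval is the central interval of Beta(x + 0.5, n - x + 0.5). The counts
# below are illustrative only.
from scipy.stats import beta

x, n = 7, 40
ci_low, ci_upp = beta.interval(0.95, x + 0.5, n - x + 0.5)
print(x / n, ci_low, ci_upp)  # point estimate with its Jeffreys 95% interval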
def _calc_prob_interval(volume, probs, prob_vars):
    """Compute the confidence interval of probability."""
    if not isinstance(probs, np.ndarray):
        probs = np.asarray(probs)
    if not isinstance(prob_vars, np.ndarray):
        prob_vars = np.asarray(prob_vars)

    one_minus_probs = 1 - probs
    alpha_coef = (np.square(probs) * one_minus_probs / prob_vars) - probs
    beta_coef = alpha_coef * one_minus_probs / probs
    intervals = beta.interval(volume, alpha_coef, beta_coef)

    # avoid invalid result due to extreme small value of prob_vars
    lows = []
    highs = []
    for i, low in enumerate(intervals[0]):
        high = intervals[1][i]
        if prob_vars[i] <= 0 or \
                not np.isfinite(low) or low > probs[i] or \
                not np.isfinite(high) or high < probs[i]:
            low = probs[i]
            high = probs[i]
        lows.append(low)
        highs.append(high)
    return lows, highs
def binomial_binning(dat, grouping_variables, ci=.95,
                     rule_of_succession=True, bernoulli_column='correct'):
    """Bin trials based on grouping variables, returning a new data frame
    with binomial outcome columns (successes, N_trials, plus proportion correct)
    rather than each row being a single trial.
    This data format will significantly speed up model fitting.

    :param dat:
        a pandas dataframe containing the data. Must have grouping_variables
        columns and also a column corresponding to bernoulli outcomes (0, 1).
    :param grouping_variables:
        a string or list of strings containing the column names to group over.
    :param ci:
        the percentage of the confidence intervals.
    :param rule_of_succession:
        if true, apply a rule-of-succession correction to the data by adding
        1 success and one failure to the total number of trials. This is
        essentially a prior acknowledging the possibility of both successes
        and failures, and is used to correct for values with proportions of
        0 or 1 (i.e. allow estimation of beta errors).
    :param bernoulli_column:
        A string naming the column of the dataframe corresponding to bernoulli
        trial outcome. Defaults to "correct".

    :returns:
        a new pandas dataframe where each row is a binomial trial.

    Example
    ----------
    res = binomial_binning(dat, ['subj', 'surround', 'scale'])
    """
    grouped = dat.groupby(grouping_variables, as_index=False)
    res = grouped[bernoulli_column].agg({'n_successes': _np.sum,
                                         'n_trials': _np.size})

    if rule_of_succession:
        res.loc[:, 'n_successes'] += 1
        res.loc[:, 'n_trials'] += 2

    # compute some additional values:
    res.loc[:, 'prop_corr'] = res.n_successes / res.n_trials

    # confidence intervals from a beta distribution:
    cis = _beta.interval(ci, res.n_successes, (res.n_trials - res.n_successes))
    res.loc[:, 'ci_min'] = cis[0]
    res.loc[:, 'ci_max'] = cis[1]
    res.loc[:, 'error_min'] = _np.abs(res['ci_min'].values - res['prop_corr'].values)
    res.loc[:, 'error_max'] = _np.abs(res['ci_max'].values - res['prop_corr'].values)
    return res
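# Hedged sketch of the interval logic inside binomial_binning above: with the
# rule-of-succession correction (add one success and one failure per group), the
# 95% interval comes from Beta(n_successes, n_trials - n_successes). The group
# counts below are invented.
import numpy as np
from scipy.stats import beta

n_successes = np.array([10, 20]) + 1
n_trials = np.array([12, 20]) + 2
prop_corr = n_successes / n_trials
ci_min, ci_max = beta.interval(0.95, n_successes, n_trials - n_successes)
print(prop_corr, ci_min, ci_max)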
def acc_ci(confusion_matrix, alpha=0.05):
    """Takes an NxN confusion matrix and returns a (1 - alpha) confidence
    interval for the fraction of correct predictions."""
    x = confusion_matrix.diagonal().sum()
    N = confusion_matrix.sum()
    ci = beta.interval(1 - alpha, x, N - x)
    return ci
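# Hedged usage sketch for acc_ci above, with an invented 3x3 confusion matrix.
# Assumes acc_ci (defined above) and `from scipy.stats import beta` are in scope,
# and note that beta.interval(1 - alpha, x, N - x) is only well defined when both
# x and N - x are positive.
import numpy as np

confusion = np.array([[50, 3, 2],
                      [4, 45, 6],
                      [1, 5, 44]])
print(acc_ci(confusion))  # interval around an accuracy of 139/160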
def binomial_binning(dat, y='correct', grouping_variables='x',
                     ci=.95, rule_of_succession=False):
    """Bin trials based on grouping variables, returning a new data frame
    with binomial outcome columns (successes, N_trials, plus proportion correct)
    rather than each row being a single trial.
    This data format can significantly speed up model fitting.

    :param dat:
        a pandas dataframe containing the data. Must have grouping_variables
        columns and also a column corresponding to bernoulli outcomes (0, 1).
    :param y:
        A string naming the column of the dataframe corresponding to bernoulli
        trial outcome. Defaults to "correct".
    :param grouping_variables:
        a string or list of strings containing the column names to group over.
        Defaults to 'x'.
    :param ci:
        the percentage of the confidence intervals.
    :param rule_of_succession:
        if true, apply a rule-of-succession correction to the data by adding
        1 success and one failure to the total number of trials. This is
        essentially a prior acknowledging the possibility of both successes
        and failures, and is used to correct for values with proportions of
        0 or 1 (to e.g. allow estimation of beta errors).

    :returns:
        a new pandas dataframe where each row is a binomial trial.

    Example
    ----------
    res = binomial_binning(dat, ['subj', 'surround', 'scale'])
    """
    grouped = dat.groupby(grouping_variables, as_index=False)
    res = grouped[y].agg({'n_successes': np.sum,
                          'n_trials': np.size})

    if rule_of_succession:
        res.loc[:, 'n_successes'] += 1
        res.loc[:, 'n_trials'] += 2

    # compute some additional values:
    res.loc[:, 'prop_corr'] = res.n_successes / res.n_trials

    # confidence intervals from a beta distribution:
    cis = beta.interval(ci, res.n_successes, (res.n_trials - res.n_successes))
    res.loc[:, 'ci_min'] = cis[0]
    res.loc[:, 'ci_max'] = cis[1]
    res.loc[:, 'error_min'] = np.abs(res['ci_min'].values - res['prop_corr'].values)
    res.loc[:, 'error_max'] = np.abs(res['ci_max'].values - res['prop_corr'].values)
    return res
def qq_plot(output_dir, file_path=None, dataset=None):
    if dataset is None:
        dataset = pd.read_table(file_path)
    dataset = dataset[dataset.isna()['pvalue'] == False]
    dataset['-log10_pvalue'] = -np.log10(dataset['pvalue'])
    gene_size = dataset.shape[0]

    exp = np.concatenate([
        np.arange(100) / gene_size,
        np.logspace(-np.log10(gene_size) + 2, 0, 200)
    ])
    obs = mquantiles(dataset['pvalue'], prob=exp, alphap=0, betap=1)

    lower = list()
    upper = list()
    for i in range(0, len(exp)):
        CI_values = beta.interval(0.95, gene_size * exp[i], gene_size - gene_size * exp[i])
        lower.append(CI_values[0])
        upper.append(CI_values[1])

    exp = -np.log10(exp)
    obs = -np.log10(obs)
    up = -np.log10(lower)
    low = -np.log10(upper)

    plt.close()
    fig = plt.figure(figsize=(10, 10))
    ax = fig.add_subplot(111)
    ax.fill_between(exp, up, low, color='grey', alpha=0.5)
    ax.set_xlim(np.nanmin(exp[exp != -np.inf]), np.nanmax(exp[exp != np.inf]) + 0.1)
    ax.set_ylim(
        np.nanmin(obs[obs != -np.inf]),
        max(np.nanmax(obs[obs != np.inf]), np.nanmax(up[up != np.inf])) + 0.5)
    ax.plot(ax.get_xlim(), ax.get_xlim(), linestyle='--', color='black')
    ax.scatter(exp, obs, s=3, c=(31 / 255., 119 / 255., 180 / 255.))

    if file_path is not None:
        # check file names
        if re.search(r".+\/(.+).tsv", file_path) == None:
            file_path = './' + file_path
        filename = re.search(r".+\/(.+).tsv", file_path).group(1)
        title = filename.replace('_', ' ')
        title = title.replace('.', ' ')
    else:
        filename = 'null'
        title = 'Null'

    ax.set_title(title + ' QQ-Plot', fontweight='bold', fontsize=24, y=1.02)
    ax.set_xlabel('expected -log\u2081\u2080 pvalue', fontsize=22)
    ax.set_ylabel('observed -log\u2081\u2080 pvalue', fontsize=22)
    ax.tick_params(labelsize=12)
    fig.savefig(output_dir + '/' + filename + '_qq_plot.png')
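# Hedged sketch of a QQ-plot confidence envelope: under the null, the i-th smallest
# of n uniform p-values is distributed as Beta(i, n - i + 1), so per-quantile bands
# come from beta.interval. This is the standard order-statistic form, closely
# related to the Beta(n*q, n - n*q) approximation used in the plotting code above;
# n and the indices below are illustrative.
import numpy as np
from scipy.stats import beta

n = 1_000
i = np.arange(1, n + 1)
lower, upper = beta.interval(0.95, i, n - i + 1)
expected = i / (n + 1)
print(expected[:3], lower[:3], upper[:3])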
def historical_prob_winning(seed1, seed2):
    wins, losses = Seed2DDistribution.lookup(seed1, seed2)
    if not ((wins == 0) and (losses == 0)):
        mean = (wins + 1) / (wins + losses + 2.0)
        lower_bound, upper_bound = beta.interval(0.95, wins + 1, losses + 1)
        result = [seed1, seed2, mean, -lower_bound + mean, upper_bound - mean]
    else:
        result = None
    return result
def test_multivariate_equal_intensities():
    N = 100
    false_positives = 0
    alpha = 0.95
    for i in range(100):
        T = np.random.exponential(10, size=300)
        g = np.random.binomial(2, 0.5, size=300)
        s, _, result = multivariate_logrank_test(T, g, alpha=alpha, suppress_print=True)
        false_positives += result is not None
    bounds = beta.interval(0.95, 1 + false_positives, N - false_positives + 1)
    assert bounds[0] < 1 - alpha < bounds[1]
def is_confident(self, pi, percentage):
    """
    Checks whether a given sample is consistent with the distribution at a
    given significance level.

    :param pi: observed sample
    :param percentage: significance level
    :return: bool
    """
    # Compute the upper bound that contains 'percentage' percent of the distribution
    upper_confidence_bound = beta.interval(percentage, self._alpha, self._beta)[1]
    return pi < upper_confidence_bound
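# Hedged sketch of the check in is_confident above: an observation is accepted when
# it falls below the upper endpoint of the central `percentage` interval of
# Beta(alpha, beta). The shape parameters and observations below are illustrative.
from scipy.stats import beta

alpha_param, beta_param = 3.0, 9.0
upper = beta.interval(0.90, alpha_param, beta_param)[1]
print(0.3 < upper, 0.8 < upper)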
def get_rule_rhs(Xtrain, Ytrain, d_t, alpha, intervals, Volume, Y2length):
    # modify this to handle volume.
    N_t, Volume_t = compute_rule_usage(d_t, d_t.index(0), Xtrain, Ytrain, Volume, Y2length)
    theta = []
    ci_theta = []
    print("this is d_t", d_t)
    for i, j in enumerate(d_t):
        if Volume_t[i] > 0:
            theta.append((N_t[i] + alpha) /
                         (float(sum([N_t[k] + alpha for k in range(N_t.size)])) * (Volume_t[i])))
        else:
            theta.append(0.0)
        # And now the 95% interval, for Beta(N[j,1] + alpha[1], N[j,0] + alpha[0])
        if intervals:
            ci_theta.append(beta.interval(0.95, N_t[i, 1] + alpha[1], N_t[i, 0] + alpha[0]))
    return theta, ci_theta
def plot2(_list, chr_brps, centromere_brps, line_names=None):
    if not line_names:
        line_names = range(1, _list.shape[0] + 1)
    inflated_table = np.vstack([inflate_tags(x[0, :], 25) for x in np.split(_list, _list.shape[0])])

    gs = gridspec.GridSpec(4, 4)
    ax1 = plt.subplot(gs[:-1, :])
    plt.imshow(inflated_table, interpolation='nearest', cmap='coolwarm')
    show_breakpoints([0] + chr_brps + [_list.shape[1]], 'k')
    show_breakpoints(list(set(centromere_brps) - set(chr_brps)), 'g')

    ax2 = plt.subplot(gs[-1, :], sharex=ax1)
    red_run = np.nanmean((_list > 0).astype(float), 0)
    blue_run = np.nanmean((_list < 0).astype(float), 0)
    stack = np.hstack((blue_run, red_run))

    mean = np.mean(stack)
    std = np.std(stack)
    # match a Beta distribution to the observed mean and variance (method of moments)
    _alpha = ((1 - mean) / std ** 2 - 1 / mean) * mean ** 2
    _beta = _alpha * (1 / mean - 1)
    r = beta.rvs(_alpha, _beta, size=1000)
    _min, _max = beta.interval(0.95, _alpha, _beta)

    plt.plot(blue_run, 'b')
    plt.plot(red_run, 'r')
    plt.axhline(y=_min, color='g')
    plt.axhline(y=_max, color='g')
    show_breakpoints([0] + chr_brps + [_list.shape[1]], 'k')
    show_breakpoints(list(set(centromere_brps) - set(chr_brps)), 'g')

    chr_arm_locations, chr_arm_names = sf.align_chromosome_edges(chr_brps, centromere_brps)

    ax1.set_xticks(chr_arm_locations)
    ax1.set_xticklabels(chr_arm_names, rotation='vertical')
    ax1.set_yticks(range(0, _list.shape[0] * 25 + 1, 25))
    ax1.set_yticklabels(line_names)
    ax2.set_xticks(chr_arm_locations)
    ax2.set_xticklabels(chr_arm_names, rotation='vertical')
    plt.show()

    smooth_histogram(r, 'b')
    smooth_histogram(stack)
    plt.axvline(x=_max, color='g')
    plt.axvline(x=_min, color='g')
    plt.show()
def get_rule_rhs(Xtrain, Ytrain, d_t, alpha, intervals):
    N_t = compute_rule_usage(d_t, d_t.index(0), Xtrain, Ytrain)
    theta = []
    ci_theta = []
    for i, j in enumerate(d_t):
        # theta ~ Dirichlet(N[j,:] + alpha)
        # E[theta] = (N[j,:] + alpha) / float(sum(N[j,:] + alpha))
        # NOTE this result is only for binary classification
        # theta = p(y=1)
        theta.append((N_t[i, 1] + alpha[1]) / float(sum(N_t[i, :] + alpha)))
        # And now the 95% interval, for Beta(N[j,1] + alpha[1], N[j,0] + alpha[0])
        if intervals:
            ci_theta.append(beta.interval(0.95, N_t[i, 1] + alpha[1], N_t[i, 0] + alpha[0]))
    return theta, ci_theta
def test_equal_intensity():
    """
    This is the (I think) fact that 1 - alpha == false positive rate.
    I use a Bayesian test to check that we achieve this rate.
    """
    N = 100
    false_positives = 0
    alpha = 0.95
    for i in range(100):
        data1 = np.random.exponential(5, size=(200, 1))
        data2 = np.random.exponential(5, size=(200, 1))
        summary, p_value, result = logrank_test(data1, data2, alpha=0.95, suppress_print=True)
        false_positives += result is not None
    bounds = beta.interval(0.95, 1 + false_positives, N - false_positives + 1)
    assert bounds[0] < 1 - alpha < bounds[1]
def _calc_beta_intervals(means, variances, prob=0.95):
    """Calculate confidence interval of beta distributions."""
    if not isinstance(means, np.ndarray):
        means = np.array(means)
    if not isinstance(variances, np.ndarray):
        variances = np.array(variances)

    with np.errstate(divide='ignore'):
        coef_a = ((means ** 2) * (1 - means) / variances) - means
        coef_b = (coef_a * (1 - means)) / means
        itl_lows, itl_his = beta.interval(prob, coef_a, coef_b)
        sds = np.sqrt(variances)

    for i in range(itl_lows.shape[0]):
        if not np.isfinite(sds[i]) or not np.isfinite(itl_lows[i]) or not np.isfinite(itl_his[i]):
            itl_lows[i] = means[i]
            itl_his[i] = means[i]
            sds[i] = 0
    return itl_lows, itl_his, sds
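# Hedged check of the moment matching used in _calc_beta_intervals above: for a
# Beta(a, b) with known shape parameters, recovering (a, b) from its mean and
# variance should reproduce the same parameters and interval. Values are illustrative.
import numpy as np
from scipy.stats import beta

a, b = 4.0, 16.0
mean = a / (a + b)
var = a * b / ((a + b) ** 2 * (a + b + 1))
coef_a = (mean ** 2) * (1 - mean) / var - mean
coef_b = coef_a * (1 - mean) / mean
print(np.allclose([coef_a, coef_b], [a, b]))  # expected: True
print(beta.interval(0.95, coef_a, coef_b))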
def get_rule_rhs(Xtrain, Ytrain, d_t, alpha, intervals):
    '''Compute the posterior consequent distributions
    (Basically compute points in each part of rule)
    '''
    N_t = compute_rule_usage(d_t, d_t.index(0), Xtrain, Ytrain)
    theta = []     # P(Y=1)
    ci_theta = []  # confidence interval for Y=1
    for i, j in enumerate(d_t):
        # theta ~ Dirichlet(N[j,:] + alpha)
        # E[theta] = (N[j,:] + alpha) / float(sum(N[j,:] + alpha))
        # NOTE this result is only for binary classification
        # theta = p(y=1)
        theta.append((N_t[i, 1] + alpha[1]) / float(sum(N_t[i, :] + alpha)))
        # And now the 95% interval, for Beta(N[j,1] + alpha[1], N[j,0] + alpha[0])
        if intervals:
            ci_theta.append(
                beta.interval(0.95, N_t[i, 1] + alpha[1], N_t[i, 0] + alpha[0]))
    return theta, ci_theta
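# Hedged sketch of the per-rule posterior used above: with counts N = [n0, n1] of
# class-0/class-1 points captured by a rule and Dirichlet/Beta pseudo-counts alpha,
# theta = p(y=1) has posterior Beta(n1 + alpha[1], n0 + alpha[0]). The counts and
# prior below are illustrative.
import numpy as np
from scipy.stats import beta

N_rule = np.array([3.0, 17.0])   # [n0, n1]
alpha = np.array([1.0, 1.0])     # prior pseudo-counts
theta_hat = (N_rule[1] + alpha[1]) / (N_rule + alpha).sum()
ci = beta.interval(0.95, N_rule[1] + alpha[1], N_rule[0] + alpha[0])
print(theta_hat, ci)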
def interval(self, alpha: float) -> Array:
    """
    Calculates the endpoints of a confidence interval range using alpha

    Parameters
    ----------
    alpha: float
        Percent of distribution to be contained within the returned interval.
        Must be between 0.0 and 1.0

    Returns
    -------
    Array:
        Array containing the interval range; the first element is the low end
        of the range, the second element is the high end of the range.
    """
    interval = beta_dist.interval(alpha, self.alpha, self.beta)
    interval = np.array([(val * self.range) + self.a for val in interval])
    return interval
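# Hedged sketch of the scaled interval above: a Beta(alpha, beta) interval on [0, 1]
# is mapped onto the support [a, a + range]. The shape parameters and support below
# are invented for illustration.
import numpy as np
from scipy.stats import beta as beta_dist

alpha_shape, beta_shape = 2.0, 5.0
a, rng = 10.0, 40.0  # support is [10, 50]
unit_interval = beta_dist.interval(0.9, alpha_shape, beta_shape)
scaled = np.array([(val * rng) + a for val in unit_interval])
print(scaled)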
def get_sex(sample, Nx, Na, Lx, La):
    Rx = float(Nx) / (Nx + Na)
    # Beta CI with non-informative prior, aka the Jeffreys interval.
    # See Brown, Cai, and DasGupta (2001). doi:10.1214/ss/1009213286
    Rx_CI = beta.interval(0.99, Nx + 0.5, Na + 0.5)

    # expected ratios from the chromosome lengths
    Elx_X0 = float(Lx) / (Lx + 2 * La)
    Elx_XX = float(Lx) / (Lx + La)

    #ll_x0 = beta.logpdf(Elx_X0, Nx+0.5, Na+0.5)
    #ll_xx = beta.logpdf(Elx_XX, Nx+0.5, Na+0.5)
    ll_x0 = binom.logpmf(Nx, Nx + Na, Elx_X0)
    ll_xx = binom.logpmf(Nx, Nx + Na, Elx_XX)

    # likelihood ratio test
    alpha = 0.001
    if chi2.sf(2 * (ll_x0 - ll_xx), 1) < alpha:
        sex = 'M'
    elif chi2.sf(2 * (ll_xx - ll_x0), 1) < alpha:
        sex = 'F'
    else:
        # indeterminate
        sex = 'U'

    if ll_x0 > ll_xx:
        Elx = 2 * Elx_X0
    else:
        Elx = Elx_XX

    Mx = Rx / Elx
    Mx_CI = [Rx_CI[0] / Elx, Rx_CI[1] / Elx]

    if Mx < 0.4 or Mx > 1.2:
        #print("Warning: {} has unexpected Mx={:g}".format(sample, Mx), file=sys.stderr)
        pass

    if Mx > 0.6 and Mx < 0.8:
        # suspicious sample, may be contaminated
        sex = 'U'

    return Elx, Mx, Mx_CI, sex
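# Hedged sketch of the Jeffreys interval used in get_sex above: the X-chromosome read
# ratio Rx = Nx / (Nx + Na) gets a Beta(Nx + 0.5, Na + 0.5) interval. The read counts
# below are invented.
from scipy.stats import beta

Nx, Na = 4_000, 60_000
Rx = Nx / (Nx + Na)
Rx_CI = beta.interval(0.99, Nx + 0.5, Na + 0.5)
print(Rx, Rx_CI)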
def qq(data, ax, color):
    xmax = 0
    ymax = 0
    alpha = 0.9
    color = '#000000'
    n_quantiles = 100

    q_pos = np.concatenate([
        np.arange(99.) / len(data),
        np.logspace(-np.log10(len(data)) + 2, 0, n_quantiles)
    ])
    q_data = mquantiles(data, prob=q_pos, alphap=0, betap=1, limit=(0, 1))
    q_th = q_pos.copy()
    q_err = np.zeros([len(q_pos), 2])
    for i in range(0, len(q_pos)):
        q_err[i, :] = beta.interval(
            alpha, len(data) * q_pos[i], len(data) - len(data) * q_pos[i])
        q_err[i, q_err[i, :] < 0] = 1e-15

    slope, intercept, r_value, p_value, std_err = linregress(q_th, q_data)

    xmax = np.max([xmax, -np.log10(q_th[1])])
    ymax = np.max([ymax, -np.log10(q_data[0])])

    ax.plot(-np.log10(q_th[n_quantiles - 1:]),
            -np.log10(q_data[n_quantiles - 1:]), '-', color=color)
    ax.plot(-np.log10(q_th[:n_quantiles]),
            -np.log10(q_data[:n_quantiles]), '.', color=color, label='gf')
    ax.plot([0, xmax], [0, xmax], '--k', color='#f42e30')
    ax.fill_between(
        -np.log10(q_th),
        -np.log10(q_err[:, 0]),
        -np.log10(q_err[:, 1]),
        color=color,
        alpha=0.1,
    )
def BetaModel(data):
    def fitted(x, a, b):
        # pdf of the beta distribution
        fx = gammaf(a + b) / gammaf(a) / gammaf(b) * x ** (a - 1) * (1 - x) ** (b - 1)
        return fx

    data1 = MaxMinNormalization(data)
    a, b, loc, scale = beta.fit(data1)  # beta.fit returns (a, b, loc, scale)
    plt.hist(data1, bins=30, density=True)
    xx = np.linspace(0, max(data1), len(data1))
    plt.plot(xx, fitted(xx, a, b), 'g')
    plt.show()

    alpha = 0.95
    q1, q2 = beta.interval(alpha, a, b, loc=0, scale=1)
    # map the interval on [0, 1] back onto the original data range
    d1 = q1 * (max(data) - min(data)) + min(data)
    d2 = q2 * (max(data) - min(data)) + min(data)
    return a, b, d1, d2
def parse_sextable(filename):
    sextable = []
    with open(filename) as f:
        next(f)  # skip header
        for line in f:
            line = line.rstrip()
            fields = line.split("\t")
            if len(fields) < 5:
                continue
            sample = fields[0]
            Mx = int(fields[1])
            Lx = int(fields[2])
            Ma = int(fields[3])
            La = int(fields[4])

            if Ma < 1000:
                # don't bother
                continue

            Rl = float(Lx) / (Lx + La)
            Rx = float(Mx) / (Mx + Ma)
            # Beta 95% CI with non-informative prior, aka the Jeffreys interval.
            # See Brown, Cai, and DasGupta (2001). doi:10.1214/ss/1009213286
            Rx_95CI = beta.interval(0.95, Mx + 0.5, Ma + 0.5)

            sfields = sample.split("_")
            if sfields[-1].upper() == 'INF':
                age = float('inf')
            else:
                try:
                    age = float(int(sfields[-1]))
                except ValueError:
                    age = None

            sextable.append((sample, Mx, Lx, Ma, La, Rl, Rx, Rx_95CI, age))
    return sextable
# Compute 95% CI for a beta distribution

import superimport

from scipy.stats import beta
import numpy as np

np.random.seed(42)

N1 = 2
N0 = 8
N = N0 + N1  # Sufficient statistics
aprior = 1
bprior = 1  # prior
apost = aprior + N1
bpost = bprior + N0  # posterior

alpha = 0.05
CI1 = beta.interval(1 - alpha, apost, bpost)
print('{:0.2f}--{:0.2f}'.format(CI1[0], CI1[1]))  # (0.06:0.52)

l = beta.ppf(alpha / 2, apost, bpost)
u = beta.ppf(1 - alpha / 2, apost, bpost)
CI2 = (l, u)
print('{:0.2f}--{:0.2f}'.format(CI2[0], CI2[1]))  # (0.06:0.52)

samples = beta.rvs(apost, bpost, size=1000)
samples = np.sort(samples)
CI3 = np.percentile(samples, 100 * np.array([alpha / 2, 1 - alpha / 2]))
print('{:0.2f}--{:0.2f}'.format(CI3[0], CI3[1]))  # (0.06:0.51)
def get_beta_confidence_interval(self, conf=0.68):
    betaparams = beta.fit(self.data)
    # beta.fit returns (a, b, loc, scale); pass them all through and return the interval
    return beta.interval(conf, *betaparams)
def QQPlot(data_table, p_value_column=None, maf_column=None, freq_bins=None,
           n_quantiles=1000, error_ci=0.95, min_p=1e-30, hide_hla=False,
           error_type='experimental', lambda_gc_scale=None):
    f, axis = plt.subplots(1, 1, figsize=(8, 8))
    axis.spines['right'].set_visible(False)
    axis.spines['top'].set_visible(False)

    if p_value_column is None:
        p_value_column = 'P'
    if maf_column is None:
        if 'MAF' in data_table.columns:
            maf_column = 'MAF'
        else:
            data_table['MAF'] = np.zeros(len(data_table)) * np.nan

    if hide_hla:
        chr6 = data_table.loc[(data_table.CHROM == 6)]
        excluded = chr6.index[np.logical_and(chr6.POS >= 28477797, chr6.POS <= 33448354)]
        p_maf_table = data_table.drop(excluded)[[maf_column, p_value_column]]
    elif maf_column is not None:
        p_maf_table = data_table[[maf_column, p_value_column]]
    else:
        p_maf_table = data_table[[p_value_column]]

    assert error_type in ['experimental', 'theoretical'], "Error type must be in ['experimental','theoretical']"

    min_vals_obs = []
    min_vals_exp = []
    if freq_bins is None:
        p_input = p_maf_table[p_value_column].values
        p_input[p_input < min_p] = min_p
        quantile_thresholds = np.concatenate([
            np.arange(1, np.floor(0.5 * n_quantiles)) / p_input.shape[0],
            np.logspace(np.log10(np.floor(0.5 * n_quantiles) / p_input.shape[0]), 0,
                        int(np.ceil(0.5 * n_quantiles)) + 1)[:-1]
        ])
        obs_quantiles = mquantiles(p_input, prob=quantile_thresholds, alphap=0.0, betap=1.0, limit=(0.0, 1.0))
        axis.plot(-np.log10(quantile_thresholds), -np.log10(obs_quantiles), '.', color=color_list[0], ms=15)
        if lambda_gc_scale is not None:
            axis.text(1, 5,
                      r'$\lambda_{IF}$' + '={0:1.2f}'.format(LambdaGC(p_input)[0]) + ' (' +
                      r'$\lambda^{' + '{0:d}'.format(lambda_gc_scale) + '}_{IF}$' +
                      '={0:1.3f}'.format(LambdaGC(p_input, scaling_factor=lambda_gc_scale)[0]) + ')',
                      fontsize=24, fontweight='bold', color=color_list[0])
        else:
            axis.text(1, 5, r'$\lambda_{IF}$' + '={0:1.2f}'.format(LambdaGC(p_input)[0]),
                      fontsize=24, fontweight='bold', color=color_list[0])
        min_vals_obs += [obs_quantiles.min()]
        min_vals_exp += [quantile_thresholds.min()]

        if error_type == 'experimental':
            ci_vecs = beta.interval(error_ci, len(p_maf_table) * quantile_thresholds,
                                    len(p_maf_table) - len(p_maf_table) * quantile_thresholds)
            axis.fill_between(-np.log10(quantile_thresholds),
                              -np.log10(obs_quantiles / quantile_thresholds * ci_vecs[0]),
                              -np.log10(obs_quantiles / quantile_thresholds * ci_vecs[1]),
                              color=color_list[0], alpha=0.25,
                              label='{0:2d}% CI'.format(int(100 * error_ci)))
    else:
        for i in range(len(freq_bins) - 2):
            p_input = p_maf_table[np.logical_and(p_maf_table[maf_column] >= freq_bins[i],
                                                 p_maf_table[maf_column] < freq_bins[i + 1])][p_value_column].values
            p_input[p_input < min_p] = min_p
            quantile_thresholds = np.concatenate([
                np.arange(1, np.floor(0.5 * n_quantiles)) / p_input.shape[0],
                np.logspace(np.log10(np.floor(0.5 * n_quantiles) / p_input.shape[0]), 0,
                            int(np.ceil(0.5 * n_quantiles)) + 1)[:-1]
            ])
            obs_quantiles = mquantiles(p_input, prob=quantile_thresholds, alphap=0.0, betap=1.0, limit=(0.0, 1.0))
            axis.plot(-np.log10(quantile_thresholds), -np.log10(obs_quantiles), '.', ms=15,
                      color=color_list[(i * 2) % len(color_list)],
                      label=r'{0:.1e}$\leq$ MAF$<${1:.1e}'.format(freq_bins[i], freq_bins[i + 1]))
            if error_type == 'experimental':
                ci_vecs = beta.interval(error_ci, len(p_maf_table) * quantile_thresholds,
                                        len(p_maf_table) - len(p_maf_table) * quantile_thresholds)
                axis.fill_between(-np.log10(quantile_thresholds),
                                  -np.log10(obs_quantiles / quantile_thresholds * ci_vecs[0]),
                                  -np.log10(obs_quantiles / quantile_thresholds * ci_vecs[1]),
                                  color=color_list[(i * 2) % len(color_list)], alpha=0.25,
                                  label='{0:2d}% CI'.format(int(100 * error_ci)))
            if lambda_gc_scale is not None:
                axis.text(1, 5 - i,
                          r'$\lambda_{IF}$' + '={0:1.2f}'.format(LambdaGC(p_input)[0]) + ' (' +
                          r'$\lambda^{' + '{0:d}'.format(lambda_gc_scale) + '}_{IF}$' +
                          '={0:1.3f}'.format(LambdaGC(p_input, scaling_factor=lambda_gc_scale)[0]) + ')',
                          fontsize=24, fontweight='bold', color=color_list[i * 2])
            else:
                axis.text(1, 5 - i, r'$\lambda_{IF}$' + '={0:1.2f}'.format(LambdaGC(p_input)[0]),
                          fontsize=24, fontweight='bold', color=color_list[i * 2])
            min_vals_obs += [obs_quantiles.min()]
            min_vals_exp += [quantile_thresholds.min()]

        i += 1
        p_input = p_maf_table[np.logical_and(p_maf_table[maf_column] >= freq_bins[i],
                                             p_maf_table[maf_column] <= freq_bins[i + 1])][p_value_column].values
        p_input[p_input < min_p] = min_p
        quantile_thresholds = np.concatenate([
            np.arange(1, np.floor(0.5 * n_quantiles)) / p_input.shape[0],
            np.logspace(np.log10(np.floor(0.5 * n_quantiles) / p_input.shape[0]), 0,
                        int(np.ceil(0.5 * n_quantiles)) + 1)[:-1]
        ])
        obs_quantiles = mquantiles(p_input, prob=quantile_thresholds, alphap=0.0, betap=1.0, limit=(0.0, 1.0))
        axis.plot(-np.log10(quantile_thresholds), -np.log10(obs_quantiles), 'o',
                  color=color_list[(i * 2) % len(color_list)], mew=0.0,
                  label=r'{0:.1e}$\leq$ MAF$\leq${1:.1e}'.format(freq_bins[i], 0.5))
        if error_type == 'experimental':
            ci_vecs = beta.interval(error_ci, len(p_maf_table) * quantile_thresholds,
                                    len(p_maf_table) - len(p_maf_table) * quantile_thresholds)
            axis.fill_between(-np.log10(quantile_thresholds),
                              -np.log10(obs_quantiles / quantile_thresholds * ci_vecs[0]),
                              -np.log10(obs_quantiles / quantile_thresholds * ci_vecs[1]),
                              color=color_list[(i * 2) % len(color_list)], alpha=0.25,
                              label='{0:2d}% CI'.format(int(100 * error_ci)))
        if lambda_gc_scale is not None:
            axis.text(1, 5 - i,
                      r'$\lambda_{IF}$' + '={0:1.2f}'.format(LambdaGC(p_input)[0]) + ' (' +
                      r'$\lambda^{' + '{0:d}'.format(lambda_gc_scale) + '}_{IF}$' +
                      '={0:1.3f}'.format(LambdaGC(p_input, scaling_factor=lambda_gc_scale)[0]) + ')',
                      fontsize=24, fontweight='bold', color=color_list[i * 2])
        else:
            axis.text(1, 5 - i, r'$\lambda_{IF}$' + '={0:1.2f}'.format(LambdaGC(p_input)[0]),
                      fontsize=24, fontweight='bold', color=color_list[i * 2])
        min_vals_obs += [obs_quantiles.min()]
        min_vals_exp += [quantile_thresholds.min()]

    axis.set_xlim(0.0, np.ceil(-np.log10(min(min_vals_exp))))
    exp_p_vals = np.linspace(0, axis.get_xlim()[1], 100)
    if error_type == 'theoretical':
        ci_vecs = beta.interval(error_ci, len(p_maf_table) * (10 ** (-1.0 * exp_p_vals)),
                                len(p_maf_table) - len(p_maf_table) * (10 ** (-1.0 * exp_p_vals)))
        axis.fill_between(exp_p_vals, -np.log10(ci_vecs[0]), -np.log10(ci_vecs[1]),
                          color=grey_color, alpha=0.25,
                          label='{0:2d}% CI'.format(int(100 * error_ci)))

    axis.plot(exp_p_vals, exp_p_vals, '--', color=red_color, lw=3.0)
    axis.set_ylim(0.0, np.ceil(-np.log10(min(min(min_vals_obs), ci_vecs[0].min(), min(min_vals_obs)))) + 1)

    axis.legend(loc='upper left', frameon=False, fontsize=14)
    axis.set_xlabel(r'$\log_{10}$(P-Value)' + '\nExpected', fontsize=24)
    axis.set_ylabel(r'$\log_{10}$(P-Value)' + '\nObserved', fontsize=24)
    return f, axis
final_state = state[1::2]
switched = np.logical_xor(init_state, final_state)

count_mat = np.zeros((2, 2), dtype=int)
count_mat[0, 0] = np.sum(np.logical_and(init_state == 0, np.logical_not(switched)))
count_mat[0, 1] = np.sum(np.logical_and(init_state == 0, switched))
count_mat[1, 0] = np.sum(np.logical_and(init_state == 1, switched))
count_mat[1, 1] = np.sum(np.logical_and(init_state == 1, np.logical_not(switched)))
counts.append(count_mat)

plt.figure()
mean_PtoAP = [beta.mean(1 + c[0, 1], 1 + c[0, 0]) for c in counts]
mean_APtoP = [beta.mean(1 + c[1, 0], 1 + c[1, 1]) for c in counts]
ci68_PtoAP = [beta.interval(0.68, 1 + c[0, 1], 1 + c[0, 0]) for c in counts]
ci68_APtoP = [beta.interval(0.68, 1 + c[1, 0], 1 + c[1, 1]) for c in counts]
ci95_PtoAP = [beta.interval(0.95, 1 + c[0, 1], 1 + c[0, 0]) for c in counts]
ci95_APtoP = [beta.interval(0.95, 1 + c[1, 0], 1 + c[1, 1]) for c in counts]

current_palette = sns.color_palette()
plt.plot(fall_times, mean_PtoAP)
plt.fill_between(fall_times, [ci[0] for ci in ci68_PtoAP], [ci[1] for ci in ci68_PtoAP],
                 color=current_palette[0], alpha=0.2, edgecolor="none")
plt.fill_between(fall_times, [ci[0] for ci in ci95_PtoAP], [ci[1] for ci in ci95_PtoAP],
                 color=current_palette[0], alpha=0.2, edgecolor="none")
plt.plot(fall_times, mean_APtoP)
plt.fill_between(fall_times, [ci[0] for ci in ci68_APtoP], [ci[1] for ci in ci68_APtoP],
                 color=current_palette[1], alpha=0.2, edgecolor="none")
plt.fill_between(fall_times, [ci[0] for ci in ci95_APtoP], [ci[1] for ci in ci95_APtoP],
                 color=current_palette[1], alpha=0.2, edgecolor="none")
plt.xlabel("nTron Pulse Fall Time (s)")
plt.ylabel("Switching Probability")
plt.legend(("P->AP", "AP->P"))
plt.show()
def calculate_dirichlet_abundances(ts_tv_file, p_values_file, total_fastq_reads, sample_abundance):
    """
    Function that calculates the mean posterior abundances of species in metagenomic samples/libraries.
    """
    assert os.stat(ts_tv_file).st_size, f"The ts_tv count file is empty {ts_tv_file}"
    assert os.stat(p_values_file).st_size, f"The chi-square p values file is empty {p_values_file}"
    assert os.stat(total_fastq_reads).st_size, f"The total fastq reads file is empty {total_fastq_reads}"

    # I calculate the coverage, fraction of the covered genome and evenness of coverage of each taxon
    # from reads in its bam/pileup file. Let's go there
    cov_val = pd.read_csv(
        p_values_file,
        sep="\t",
        names=["species", "ref_bases_cov", "total_bases_cov", "coverage", "fraction_ref_cov", "cov_evenness"],
        usecols=["species", "coverage", "fraction_ref_cov", "cov_evenness"],
    )

    evenness_vector = (cov_val[["species", "cov_evenness"]]
                       .fillna(value=0)
                       .groupby("species")
                       .apply(hmean)
                       .astype("float64")
                       .rename("Taxon"))
    evenness_vector["Dark_Matter"] = np.nan
    evenness_vector["Grey_Matter"] = np.nan

    fraction_vector = (cov_val[["species", "fraction_ref_cov"]]
                       .fillna(value=0)
                       .groupby("species")
                       .apply(hmean)
                       .astype("float64")
                       .rename("Taxon"))
    fraction_vector["Dark_Matter"] = np.nan
    fraction_vector["Grey_Matter"] = np.nan

    coverage_vector = (cov_val[["species", "coverage"]]
                       .fillna(value=0)
                       .groupby("species")
                       .apply(hmean)
                       .astype("float64")
                       .rename("Taxon"))
    coverage_vector["Dark_Matter"] = np.nan
    coverage_vector["Grey_Matter"] = np.nan

    ts_tv_matrix = pd.read_csv(ts_tv_file, sep=",", usecols=["Taxon", "Read_ID", "Dirichlet_Assignment"])

    aln_reads_vector = (ts_tv_matrix[["Taxon", "Read_ID"]]
                        .groupby("Taxon")
                        .count()
                        .squeeze(axis=1)
                        .rename("Taxon"))
    aln_reads_vector["Dark_Matter"] = 0
    aln_reads_vector["Grey_Matter"] = 0

    # Sum the Dirichlet Assignments per taxon and calculate the Dark Matter reads
    # from the Dirichlet Assignment column
    ts_tv_group = ts_tv_matrix.groupby("Read_ID").sum().squeeze(axis=1)
    grey_matter = ts_tv_group.where(ts_tv_group == 0).replace(0, 1).fillna(0)

    if len(ts_tv_matrix.Taxon.unique()) > 1:
        a = ts_tv_matrix.groupby("Taxon").sum().squeeze().astype(float)
    else:
        a = ts_tv_matrix.groupby("Taxon").sum().iloc[:, 0].astype(float)
    a.loc["Grey_Matter"] = grey_matter.sum()

    # Add the non aligned filtered reads count in the Dark Matter category
    total_fastq_reads = float(open(total_fastq_reads, "r").read())
    reads_in_bams = len(ts_tv_matrix["Read_ID"].unique())
    remaining_dark_matter = total_fastq_reads - reads_in_bams
    a.loc["Dark_Matter"] = remaining_dark_matter
    print(a, file=sys.stderr)

    # Perform Alberto's formulas
    b = a.sum()
    posterior_abundance_mean = a.add(1).divide(b + len(a)).sort_values(ascending=False)

    # Prepare the dataframe that is going to be outputted and calculate the rest of the output columns.
    posterior_abundance = posterior_abundance_mean.to_frame().reset_index()
    posterior_abundance.rename(columns={"Dirichlet_Assignment": "Mean_Posterior_Abundance"}, inplace=True)
    posterior_abundance["95_CI_lower"] = np.nan
    posterior_abundance["95_CI_upper"] = np.nan
    posterior_abundance["Minimum_Read_Num"] = np.nan
    posterior_abundance["Maximum_Read_Num"] = np.nan
    posterior_abundance["Dirichlet_Read_Num"] = np.nan
    posterior_abundance["Evenness_of_Coverage_Ratio"] = np.nan
    posterior_abundance["Fraction_of_Genome_Covered"] = np.nan
    posterior_abundance["Coverage"] = np.nan
    posterior_abundance["Aligned_Read_Num"] = np.nan

    for idx, row in posterior_abundance.iterrows():
        ai = a.loc[posterior_abundance.iloc[idx, 0]]
        print(ai, file=sys.stderr)
        ci = beta.interval(0.95, ai + 1, b + len(a) - ai - 1)
        print(len(a), file=sys.stderr)

        posterior_abundance.iloc[idx, 2] = ci[0]
        posterior_abundance.iloc[idx, 3] = ci[1]
        posterior_abundance.iloc[idx, 4] = round(ci[0] * b)
        posterior_abundance.iloc[idx, 5] = round(ci[1] * b)
        posterior_abundance.iloc[idx, 6] = a.loc[posterior_abundance.iloc[idx, 0]]
        posterior_abundance.iloc[idx, 7] = evenness_vector.loc[posterior_abundance.iloc[idx, 0]]
        posterior_abundance.iloc[idx, 8] = fraction_vector.loc[posterior_abundance.iloc[idx, 0]]
        posterior_abundance.iloc[idx, 9] = coverage_vector.loc[posterior_abundance.iloc[idx, 0]]
        posterior_abundance.iloc[idx, 10] = aln_reads_vector.loc[posterior_abundance.iloc[idx, 0]]

    with open(sample_abundance, "w") as output_handle:
        posterior_abundance.to_csv(path_or_buf=output_handle, sep="\t", index=False, header=True)
from scipy.stats import beta
import numpy as np

S = 47
N = 100
a = S + 1
b = (N - S) + 1
alpha = 0.05

CI1 = beta.interval(1 - alpha, a, b)

l = beta.ppf(alpha / 2, a, b)
u = beta.ppf(1 - alpha / 2, a, b)
CI2 = (l, u)

samples = beta.rvs(a, b, size=1000)
samples = np.sort(samples)
CI3 = np.percentile(samples, 100 * np.array([alpha / 2, 1 - alpha / 2]))

print(CI1)
print(CI2)
print(CI3)
from numpy import *
from scipy.stats import beta
from scipy.stats import gamma
from numpy.random import normal
from scipy.optimize import fsolve

rs = random.RandomState(12345)
n_sample = 10000

######################
# parameters of beta distributions representing the proportion of the population sexually
# active, by sex and age group
######################

# men, 16-24
[alpha_m_16_24, beta_m_16_24] = fsolve(
    lambda x: array(beta.interval(0.95, x[0], x[1], loc=0, scale=1)) - (0.8023836019, 0.843403825),
    [1, 1]
)

# women, 16-24
[alpha_f_16_24, beta_f_16_24] = fsolve(
    lambda x: array(beta.interval(0.95, x[0], x[1], loc=0, scale=1)) - (0.7998634469, 0.837979601),
    [1, 1]
)

######################
# sexually-active population:
######################

# Population, testing and diagnosis data is from http://www.chlamydiascreening.nhs.uk/ps/data.asp
# (downloaded 17 April 2015).
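# Hedged check, continuing from the script above: plugging the solved shape
# parameters back into beta.interval should approximately reproduce the target 95%
# bounds (here for the men, 16-24 group), provided fsolve converged.
recovered = beta.interval(0.95, alpha_m_16_24, beta_m_16_24)
print(recovered)  # expected to be close to (0.8023836019, 0.843403825) if fsolve converged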
def histogram_pair(value_vec, binary_vec, bins, smoothing_const=.01, prior_prob=.5,
                   rel_risk=False, error_bar_alpha=.05, figsize=(12, 6), **kwargs):
    """This is a tool to explore the relationship between a numerical feature and a 1/0 binary outcome.

    Author: Brian Lucena

    It plots two histograms: one is of the values of the feature when the binary outcome is positive (1)
    and the other when it is negative (0).

    It then gives the marginal empirical probability of being a 1 given that the numerical feature
    is in a particular value range.

    In practice, it often takes some experimentation to find the appropriate bin endpoints for a
    particular feature.

    If the data contains 'NaN' values, it will also draw two small horizontal (dotted and dashed)
    lines, indicating the probabilities given NaN and not NaN respectively.
    """
    nan_mask = np.isnan(value_vec)
    num_nans = np.sum(nan_mask)
    if num_nans > 0:
        nan_binary_vec = binary_vec[nan_mask]
        binary_vec = binary_vec[~nan_mask]
        value_vec = value_vec[~nan_mask]
        nan_avg_value = np.mean(nan_binary_vec)
        reg_avg_value = np.mean(binary_vec)
    # digitized_value_vec = np.digitize(value_vec, bins)
    # x_pts_to_graph = np.array([np.mean(value_vec[digitized_value_vec==i]) for i in np.unique(digitized_value_vec)])
    # print(x_pts_to_graph)
    out0 = plt.hist(value_vec[binary_vec == 0], bins=bins, **kwargs)
    out1 = plt.hist(value_vec[binary_vec == 1], bins=bins, **kwargs)
    plt.close()

    plt.figure(figsize=figsize)
    plt.subplot(2, 1, 1)
    plt.hist((value_vec[binary_vec == 0], value_vec[binary_vec == 1]),
             stacked=True, bins=bins, **kwargs)
    bin_leftpts = (out1[1])[:-1]
    bin_rightpts = (out1[1])[1:]
    default_bin_centers = (bin_leftpts + bin_rightpts) / 2
    digitized_value_vec = np.digitize(value_vec, bins)
    bin_centers = np.array([
        np.mean(value_vec[digitized_value_vec == i]) if i in np.unique(digitized_value_vec)
        else default_bin_centers[i - 1]
        for i in np.arange(len(bins) - 1) + 1
    ])
    prob_numer = out1[0]
    prob_denom = out1[0] + out0[0]
    smoothing_const = .001
    probs = (prob_numer + prior_prob * smoothing_const) / (prob_denom + smoothing_const)
    # print(bin_centers)
    # print(probs)

    plt.subplot(2, 1, 2)
    if rel_risk:
        plt.plot(bin_centers, np.log10(probs / prior_prob))
        # plt.errorbar(bin_centers, probs, yerr=1.96 * probs * (1 - probs) / np.sqrt(prob_denom), capsize=3)
        plt.xlim(bin_leftpts[0], bin_rightpts[-1])
    else:
        plt.plot(bin_centers[:len(probs)], probs)
        plt.xlim(bin_leftpts[0], bin_rightpts[-1])
        yerr_mat_temp = beta.interval(1 - error_bar_alpha, out1[0] + 1, out0[0] + 1)
        yerr_mat = np.vstack((yerr_mat_temp[0], yerr_mat_temp[1])) - probs
        yerr_mat[0, :] = -yerr_mat[0, :]
        plt.errorbar(bin_centers[:len(probs)], probs, yerr=yerr_mat, capsize=5)
        plt.xlim(bin_leftpts[0], bin_rightpts[-1])
    if num_nans > 0:
        plt.hlines(y=nan_avg_value, xmin=bin_leftpts[0], xmax=bin_leftpts[1], linestyle='dotted')
        plt.hlines(y=reg_avg_value, xmin=bin_leftpts[0], xmax=bin_leftpts[1], linestyle='dashed')
    return {
        'bin_centers': bin_centers,
        'probs': probs,
        'prob_numer': prob_numer,
        'prob_denom': prob_denom
    }
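# Hedged sketch of the error bars in histogram_pair above: per-bin
# Beta(ones + 1, zeros + 1) intervals turned into asymmetric yerr values around the
# empirical bin probability (the function also applies a small smoothing term,
# omitted here). The bin counts below are invented.
import numpy as np
from scipy.stats import beta

ones = np.array([5.0, 12.0, 30.0])    # positive outcomes per bin
zeros = np.array([45.0, 28.0, 10.0])  # negative outcomes per bin
probs = ones / (ones + zeros)
lo, hi = beta.interval(0.95, ones + 1, zeros + 1)
yerr = np.vstack((probs - lo, hi - probs))  # asymmetric error bars for plt.errorbar
print(yerr)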