def sample_power_probtest(p1, p2, power=0.8, sig=0.05):
    """Return the per-group sample size needed to tell two proportions apart.

    Uses the normal approximation with the pooled proportion
    ``(p1 + p2) / 2`` as the variance estimate.

    Parameters
    ----------
    p1, p2 : float
        The two proportions being compared; they must differ.
    power : float
        Desired statistical power (1 - beta).
    sig : float
        Two-sided significance level.

    Returns
    -------
    int
        Sample size, rounded to the nearest integer.
    """
    # Critical value for a two-sided test: alpha is split over both tails.
    z_alpha = norm.isf([sig / 2])
    z_power = -1 * norm.isf([power])
    pooled = (p1 + p2) / 2
    variance_term = 2 * pooled * (1 - pooled)
    effect = p1 - p2
    size = variance_term * ((z_power + z_alpha) ** 2) / (effect ** 2)
    return int(round(size[0]))
def sample_power_probtest(p1, p2, power=0.8, sig=0.05):
    """Sample size per group for a two-sided test of two proportions.

    The variance is approximated from the average of ``p1`` and ``p2``;
    the result is rounded to the nearest integer. ``p1`` and ``p2`` must
    differ, otherwise the effect size in the denominator is zero.
    """
    mean_p = (p1 + p2) / 2
    s = 2 * mean_p * (1 - mean_p)
    # sig / 2 because the test is two-sided.
    quantile_sum = -1 * norm.isf(power) + norm.isf(sig / 2)
    diff = p1 - p2
    n = s * (quantile_sum ** 2) / (diff ** 2)
    return int(round(n))
def test_z_score():
    """Check that ``z_score`` inverts the normal survival function."""
    random_pvals = np.random.rand(10)
    assert_array_almost_equal(norm.sf(z_score(random_pvals)), random_pvals)

    # Extreme p-values exercise the numerical precision of the conversion.
    for extreme in (1.e-250, 1 - 1.e-16):
        assert_array_almost_equal(z_score(extreme), norm.isf(extreme))

    # A float32 input this small is compared against the z-score of 1e-300.
    assert_array_almost_equal(z_score(np.float32(1.e-100)),
                              norm.isf(1.e-300))
def test_fdr():
    """Check ``fdr_threshold`` on a sample with ten strongly significant values."""
    n = 100
    x = np.linspace(.5 / n, 1. - .5 / n, n)
    # Plant ten very small p-values, then convert everything to z-scores.
    x[:10] = .0005
    x = norm.isf(x)
    np.random.shuffle(x)
    assert_almost_equal(fdr_threshold(x, .1), norm.isf(.0005))
    # np.inf rather than np.infty: the latter alias was removed in NumPy 2.0.
    assert_true(fdr_threshold(x, .001) == np.inf)
def map_threshold(stat_img, mask_img, threshold, height_control='fpr',
                  cluster_threshold=0):
    """ Threshold the provided map

    Parameters
    ----------
    stat_img : Niimg-like object,
       statistical image (presumably in z scale)

    mask_img : Niimg-like object,
        mask image

    threshold: float,
        cluster forming threshold (either a p-value or z-scale value)

    height_control: string
        false positive control meaning of cluster forming
        threshold: 'fpr'|'fdr'|'bonferroni'|'none'

    cluster_threshold : float, optional
        cluster size threshold

    Returns
    -------
    thresholded_map : Nifti1Image,
        the stat_map thresholded at the prescribed voxel- and cluster-level
    """
    # Flatten the masked statistics into a 1-D vector of voxel values.
    masker = NiftiMasker(mask_img=mask_img)
    stats = np.ravel(masker.fit_transform(stat_img))

    # Translate the requested control into a z-scale threshold.
    if height_control == 'fdr':
        z_th = fdr_threshold(stats, threshold)
    elif height_control == 'bonferroni':
        z_th = norm.isf(threshold / np.size(stats))
    elif height_control == 'fpr':
        z_th = norm.isf(threshold)
    else:
        # Any other value: `threshold` is already on the z scale.
        z_th = threshold

    above = stats > z_th
    stats *= above
    stat_map = masker.inverse_transform(stats).get_data()

    # Label connected supra-threshold components and drop the small ones.
    label_map, n_labels = label(stat_map > z_th)
    labels = label_map[(masker.mask_img_.get_data() > 0)]
    for cluster_id in range(1, n_labels + 1):
        members = labels == cluster_id
        if np.sum(members) < cluster_threshold:
            stats[members] = 0

    return masker.inverse_transform(stats)
def Eu_Option_BS_MC(S0, r, sigma, K, T, N, payoff, alpha=0.05):
    """Price a European option by plain Monte Carlo under Black-Scholes.

    Parameters
    ----------
    S0 : float
        Initial asset price.
    r : float
        Risk-free rate.
    sigma : float
        Volatility.
    K : float
        Strike; not used here (the strike is expected to be baked into
        ``payoff``) but kept for interface compatibility.
    T : float
        Time to maturity.
    N : int
        Number of simulated terminal prices.
    payoff : callable
        Maps an array of terminal prices to an array of payoffs.
    alpha : float
        ``1 - alpha`` is the confidence level of the returned interval.

    Returns
    -------
    list
        ``[V0, [ci_low, ci_high], epsilon]`` where ``epsilon`` is the
        half-width of the confidence interval around ``V0``.
    """
    z = np.random.normal(0, 1, N)
    terminal = S0 * np.exp((r - 0.5 * sigma**2) * T + sigma * np.sqrt(T) * z)
    payoffs = payoff(terminal)
    discount = np.exp(-r * T)
    V0 = discount * np.mean(payoffs)
    # The estimator is the *discounted* mean payoff, so its standard error
    # must carry the same discount factor; otherwise the interval is too
    # wide by a factor exp(r * T).
    std_error = discount * np.sqrt(np.var(payoffs, ddof=1) / N)
    epsilon = norm.isf(alpha / 2) * std_error
    ci = [V0 - epsilon, V0 + epsilon]
    return [V0, ci, epsilon]
def get_cutoff(rvs: list):
    """Return percentile cut-offs of a normal distribution fitted to ``rvs``.

    ``rvs`` is a set of random variables assumed to be normally
    distributed. A normal distribution is fitted with the sample mean and
    (population) standard deviation, and its percent point function
    (inverse of the cdf) is evaluated.

    Returns
    -------
    tuple
        ``(lower_bound, upper_bound, mid)``: the 0.5th, 97.5th and 50th
        percentiles of the fitted distribution.
    """
    mean = np.mean(rvs)
    std = np.std(rvs)
    # ppf (inverse cdf), not isf (inverse survival function): with isf the
    # "lower" bound came out above the "upper" bound.
    lower_bound = norm.ppf(0.005, loc=mean, scale=std)
    upper_bound = norm.ppf(0.975, loc=mean, scale=std)
    mid = norm.ppf(0.5, loc=mean, scale=std)
    return lower_bound, upper_bound, mid
def get_sample_size(alpha, beta, sigma, mu0, mu1, how):
    """Sample size for a z-test on a mean with known standard deviation.

    Parameters
    ----------
    alpha : float
        Type I error rate.
    beta : float
        Type II error rate.
    sigma : float
        Standard deviation.
    mu0, mu1 : float
        Means under the null and the alternative hypotheses.
    how : str
        ``'double'`` for a two-sided test, ``'up'`` or ``'down'`` for a
        one-sided test.

    Returns
    -------
    float
        Required sample size (not rounded), or ``-1`` after printing a
        message when ``how`` is not recognised.
    """
    if how == 'double':
        tail = alpha / 2
    elif how in ('up', 'down'):
        tail = alpha
    else:
        print("how参数错误.")
        return -1
    z0 = norm.isf(tail)
    z1 = norm.isf(beta)
    return pow((z0 + z1) * sigma / (mu0 - mu1), 2)
def test_fdr():
    """Check ``fdr_threshold`` values and its rejection of invalid levels."""
    n = 100
    x = np.linspace(.5 / n, 1. - .5 / n, n)
    # Plant ten very small p-values, then convert everything to z-scores.
    x[:10] = .0005
    x = norm.isf(x)
    np.random.shuffle(x)
    assert_almost_equal(fdr_threshold(x, .1), norm.isf(.0005))
    # np.inf rather than np.infty: the latter alias was removed in NumPy 2.0.
    assert fdr_threshold(x, .001) == np.inf
    # Levels outside [0, 1] must be rejected.
    with pytest.raises(ValueError):
        fdr_threshold(x, -.1)
    with pytest.raises(ValueError):
        fdr_threshold(x, 1.5)
def round_size_approx(self, margin, alpha, quant):
    """
    Returns approximate round size for small margins

    :param margin: margin of victory (float in [0, 1])
    :param alpha: risk limit
    :param quant: desired probability of stopping in the next round
    :return: the next round size computed under a normal approximation
             to the binomial
    """
    winner_share = (1 + margin) / 2
    spread = sqrt(winner_share * (1 - winner_share))
    z_stop = norm.isf(quant)
    z_risk = norm.isf(alpha * quant)
    ratio = (z_stop * spread - .5 * z_risk) / (winner_share - .5)
    return ceil(ratio**2)
def test_map_threshold():
    """Exercise ``map_threshold`` across height controls and cluster sizes."""
    shape = (9, 10, 11)
    n_vox = np.prod(shape)
    data = norm.isf(
        np.linspace(1. / n_vox, 1. - 1. / n_vox, n_vox)).reshape(shape)
    alpha = .001
    # Plant a 2x2x2 block of strongly active voxels.
    data[2:4, 5:7, 6:8] = 5.
    stat_img = nib.Nifti1Image(data, np.eye(4))
    mask_img = nib.Nifti1Image(np.ones(shape), np.eye(4))

    def n_positive(img):
        """Count the voxels of `img` that are strictly positive."""
        return np.sum(img.get_data() > 0)

    # test 1
    th_map, _ = map_threshold(stat_img, mask_img, alpha,
                              height_control='fpr', cluster_threshold=0)
    assert_equal(n_positive(th_map), 8)

    # test 2: excessive cluster forming threshold
    th_map, _ = map_threshold(stat_img, mask_img, 100,
                              height_control=None, cluster_threshold=0)
    assert_true(n_positive(th_map) == 0)

    # test 3: excessive size threshold
    th_map, z_th = map_threshold(stat_img, mask_img, alpha,
                                 height_control='fpr', cluster_threshold=10)
    assert_true(n_positive(th_map) == 0)
    assert_equal(z_th, norm.isf(.001))

    # test 4: fdr threshold + bonferroni
    for control in ['fdr', 'bonferroni']:
        th_map, _ = map_threshold(stat_img, mask_img, .05,
                                  height_control=control, cluster_threshold=5)
        assert_equal(n_positive(th_map), 8)

    # test 5: direct threshold
    th_map, _ = map_threshold(stat_img, mask_img, 4.0,
                              height_control=None, cluster_threshold=0)
    assert_equal(n_positive(th_map), 8)

    # test 6: without mask
    th_map, _ = map_threshold(stat_img, None, 4.0,
                              height_control=None, cluster_threshold=0)
    assert_equal(n_positive(th_map), 8)
def stouffer_liptak(pvals, sigma):
    """Combine correlated p-values with the Stouffer-Liptak method.

    Parameters
    ----------
    pvals : numpy.ndarray
        One-dimensional array of p-values.
    sigma : array_like
        Correlation matrix of the p-values.

    Returns
    -------
    float
        The combined p-value. When ``sigma`` cannot be Cholesky-factored,
        a z-score correction based on the strictly lower triangle of
        ``sigma`` is used instead.
    """
    qvals = norm.isf(pvals).reshape(len(pvals), 1)
    try:
        C = np.asmatrix(chol(sigma)).I
    # Public exception path; np.linalg.linalg is a private module that is
    # deprecated as of NumPy 2.0.
    except np.linalg.LinAlgError:
        # for non positive definite matrix default to z-score correction.
        z, L = np.mean(norm.isf(pvals)), len(pvals)
        sz = 1.0 / L * np.sqrt(L + 2 * np.tril(sigma, k=-1).sum())
        return norm.sf(z / sz)

    # Decorrelate the z-scores before summing them.
    qvals = C * qvals
    Cp = qvals.sum() / np.sqrt(len(qvals))
    return norm.sf(Cp)
def next_sample_size_gaussian(self, sprob=.9):
    """Rough but quick round size estimate for very narrow margins.

    A size is computed for every sub-audit from its winner proportion under
    a normal approximation, and the largest one is returned.

    :param sprob: desired probability of stopping in the next round
    :return: the largest estimated round size across all sub-audits
    """
    z_stop = norm.isf(sprob)
    z_risk = norm.isf(self.alpha * sprob)

    def size_for(p):
        """Normal-approximation round size for winner proportion `p`."""
        return math.ceil(
            ((z_stop * math.sqrt(p * (1 - p)) - .5 * z_risk) / (p - .5))**2)

    return max([
        size_for(sub_audit.sub_contest.winner_prop)
        for sub_audit in self.sub_audits.values()
    ])
def binormal_roc(Y, p):
    """Fit a binormal ROC curve to predicted probabilities.

    The probabilities are mapped through the negated inverse survival
    function of the standard normal; the mean and standard deviation of
    each class on that scale define the curve.

    Parameters
    ----------
    Y : numpy.ndarray
        Class labels, compared against 0 and 1.
    p : array_like
        Predicted probabilities, one per label.

    Returns
    -------
    threshold : numpy.ndarray
        1000 evenly spaced values in [0, 1].
    roc : numpy.ndarray
        Binormal ROC value at each threshold.
    """
    scores = -norm.isf(np.array(p))
    negatives = scores[Y == 0]
    positives = scores[Y == 1]
    sigma_pos = positives.std()
    # Separation between the classes and symmetry of their spreads.
    separation = (positives.mean() - negatives.mean()) / sigma_pos
    symmetry = negatives.std() / sigma_pos
    threshold = np.linspace(0, 1, 1000)
    roc = norm.cdf(separation - symmetry * norm.isf(threshold))
    return threshold, roc
def Eu_Option_BS_MC_AT(S0, r, sigma, K, T, N, payoff, alpha=0.05):
    """Price a European option by Monte Carlo with antithetic variates.

    Every normal draw ``z`` is paired with ``-z``; the estimator is the
    discounted average of the paired payoffs.

    Parameters
    ----------
    S0 : float
        Initial asset price.
    r : float
        Risk-free rate.
    sigma : float
        Volatility.
    K : float
        Strike; not used here (the strike is expected to be baked into
        ``payoff``) but kept for interface compatibility.
    T : float
        Time to maturity.
    N : int
        Number of antithetic pairs.
    payoff : callable
        Maps an array of terminal prices to an array of payoffs.
    alpha : float
        ``1 - alpha`` is the confidence level of the returned interval.

    Returns
    -------
    list
        ``[V0, [ci_low, ci_high], epsilon]`` where ``epsilon`` is the
        half-width of the confidence interval around ``V0``.
    """
    z = np.random.normal(0, 1, N)
    drift = (r - 0.5 * sigma**2) * T
    diffusion = sigma * np.sqrt(T)
    paths = payoff(S0 * np.exp(drift + diffusion * z))
    paths2 = payoff(S0 * np.exp(drift + diffusion * (-z)))
    discount = np.exp(-r * T)
    V0 = 0.5 * np.mean(discount * (paths + paths2))
    # Var(0.5 * (a + b)) = Var(a + b) / 4, and the discount factor must
    # scale the standard error just as it scales the estimator itself.
    std_error = discount * np.sqrt(np.var(paths + paths2, ddof=1) / (4 * N))
    epsilon = norm.isf(alpha / 2) * std_error
    ci = [V0 - epsilon, V0 + epsilon]
    return [V0, ci, epsilon]
def __init__(self, EquiProbVals, mu, sigma, nresult=None):
    """Build equiprobable values for the product with a normal variable.

    :param EquiProbVals: sequence of equiprobable values; ``None`` entries
        are skipped
    :param mu: mean of the normal factor
    :param sigma: standard deviation of the normal factor; ``0`` makes the
        factor the constant ``mu``
    :param nresult: number of equiprobable values to produce; defaults to
        ``len(EquiProbVals)``

    Sets ``self.nresult``, ``self.pval`` (mid-point probabilities, shape
    ``(1, nresult)``) and ``self.EquiProbVal`` (the sorted pairwise
    products interpolated at ``self.pval`` via ``WCHinterp``).
    """
    self.nresult = len(EquiProbVals) if nresult is None else nresult

    EquiProbVal1 = array([epv for epv in EquiProbVals if epv is not None])
    EquiProbVal1 = EquiProbVal1.reshape(len(EquiProbVal1), 1)
    self.pval = array(
        [(i + .5) / float(self.nresult) for i in range(self.nresult)]
    ).reshape(1, self.nresult)

    if sigma == 0.:
        EquiProbVal2 = array(self.nresult * [mu]).reshape(1, self.nresult)
    else:
        try:
            EquiProbVal2 = norm.isf(1 - self.pval, mu, sigma)
        except Exception:
            # Print the diagnostics, then re-raise: carrying on would only
            # fail later with an unrelated NameError.
            print('\nProdOfProb 21')
            print('self.pval', self.pval)
            print('mu,sigma', mu, sigma)
            raise

    oldy = None
    try:
        # Column vector times row vector: all pairwise products.
        oldy = hstack(dot(EquiProbVal1, EquiProbVal2))
        oldy.sort()
    except Exception:
        print('\nProdOfProb 30')
        print('EquiProbVal1', EquiProbVal1)
        print('EquiProbVal2', EquiProbVal2)
        print('oldy', oldy)
        raise

    nold = len(oldy)
    oldx = [(i + .5) / float(nold) for i in range(nold)]
    self.EquiProbVal = WCHinterp(oldx, oldy, self.pval)
def draw_two_gauss(ix=0, extrema=500, std=50, h1=0, h2=0, gap=50,
                   alpha=0.15865525393145707):
    """Render two Gaussian curves with shaded tail regions and save a PNG.

    One curve is centred at x = 250 and the other is shifted by ``gap``.
    A vertical line is drawn at the ``alpha`` upper-tail cut-off of the
    first curve; the first curve is shaded to the right of it and the
    second curve to the left. The image is written to
    ``basedir + 'im' + str(ix) + '.png'``.
    """
    im = Image.new("RGB", (512, 512), "black")
    draw = ImageDraw.Draw(im, 'RGBA')

    def fn(x):
        """Pixel height of the first curve at pixel column `x`."""
        return 300 - norm.pdf(x - 250, 0, std) * 7000

    def fn2(x):
        """Pixel height of the curve shifted by `gap` at pixel column `x`."""
        return 300 - norm.pdf(x - 250, gap, std) * 7000

    draw_curve(fn, draw)
    draw_curve(fn2, draw, rgba=(138, 43, 226))

    # Vertical line at the cut-off.
    x1 = 250 + norm.isf(alpha, 0, std)
    draw.line((x1, 0, x1, 512), fill=(255, 20, 147, 150), width=1)

    # Shade the first curve to the right of the cut-off.
    right_tail = [(x1, fn(x1)), (x1, 300), (extrema, fn(extrema))]
    right_tail.extend((xx, fn(xx)) for xx in np.arange(extrema - 1, x1, -1))
    draw.polygon(right_tail, (255, 255, 0, 100))

    # Shade the shifted curve to the left of the cut-off.
    left_tail = [(x1, fn2(x1)), (x1, 300), (180, fn2(180))]
    left_tail.extend((xx, fn2(xx)) for xx in np.arange(179 + 1, x1, 1))
    draw.polygon(left_tail, (138, 43, 226, 100))

    draw_trtmt_hist(draw, h1=h1, h2=h2)
    draw_alpha_beta_curve(draw, alpha, std=std, effect=gap)
    im.save(basedir + 'im' + str(ix) + '.png')
def rdc_sigthres_compute(N, Alpha):
    """
    Computes the significance threshold for the RDC.

    The RDC of 10000 pairs of independent standard-normal samples is
    collected, a normal distribution is fitted to those values, and the
    threshold is read from the fitted distribution.

    Keyword arguments:
    N     -- Number of measurement samples
    Alpha -- The required confidence level (0 < Alpha < 1)

    Returns:
    L -- Significance level (capped at 1.0)
    """
    n_trials = 10000
    # Builtin float: the numpy.float alias was removed in NumPy 1.24.
    v = numpy.zeros(n_trials, dtype=float)
    for i in range(n_trials):
        a = numpy.random.normal(size=N)
        b = numpy.random.normal(size=N)
        R = None
        while R is None:
            debug(2, "rdc_limit computation for N=%d, alpha=%f, iteration %d/%d",
                  (N, Alpha, i, n_trials))
            (R, _, _) = RDC.rdc(a, b, Alpha, SkipThres=True, max_iter=-1)
            # With max_iter=-1, R is always != None
        v[i] = R

    (mu, std) = norm.fit(v)
    L = norm.isf(1.0 - Alpha, loc=mu, scale=std)
    L = numpy.min([L, 1.0])
    debug(1, "New rdc_limit: Alpha=%.6f, N=%d, L=%.6f", (Alpha, N, L))
    return L
def main(rho=0.245, n=100, p=30):
    """Run one randomized neighbourhood-selection experiment.

    :param rho: passed to ``instance`` as ``rho``
    :param n: number of samples
    :param p: number of variables
    :return: the ``(null, alt)`` pair produced by ``pval``
    """
    # The second value returned by `instance` is not needed here.
    X, _, nonzero = instance(n=n, p=p, alpha=0.99, rho=rho)
    alpha = 0.8
    randomization = laplace(loc=0, scale=1.)
    loss = randomized.neighbourhood_selection(X)
    epsilon = 1.
    lam = 2./np.sqrt(n) * np.linalg.norm(X) * norm.isf(alpha / (2 * p**2))
    # One randomization draw per off-diagonal entry.
    random_Z = randomization.rvs(p**2 - p)
    penalty = randomized.selective_l1norm(p**2-p, lagrange=lam)
    sampler1 = randomized.selective_sampler_MH(loss, random_Z, epsilon,
                                               randomization, penalty)
    loss_args = {"active": sampler1.penalty.active_set,
                 "quadratic_coef": epsilon}
    null, alt = pval(sampler1, loss_args, None, X, nonzero)
    return null, alt
def check_significant_residuals(self, sig_level=0.05, n_decimals=3):
    """
    Report significant Pearson's residuals for the given significance level.

    Prints how many cells have an absolute residual at or above the
    two-sided critical value, together with the largest and smallest
    residuals and the categories they belong to.

    Parameters
    ----------
    sig_level : float
        Significance level (alpha) to identify significant residuals
    n_decimals : int
        Number of digits to round results when showing them
    """
    resids = self.residuals_pearson
    critical_value = norm.isf(sig_level / 2)

    n_sig_resids = (abs(resids) >= critical_value).sum().sum()
    perc_sig_resids = n_sig_resids / self.n_cells * 100

    # Extreme residuals and the row/column labels where they occur.
    max_resid = resids.max().max()
    max_resid_row = resids.max(axis=1).idxmax()
    max_resid_column = resids.max(axis=0).idxmax()
    min_resid = resids.min().min()
    min_resid_row = resids.min(axis=1).idxmin()
    min_resid_column = resids.min(axis=0).idxmin()

    print(f'''{n_sig_resids} ({round(perc_sig_resids, n_decimals)}%) cells have Pearson's \
residual bigger than {round(critical_value, 2)}.
The biggest residual is {round(max_resid, n_decimals)} (categories {max_resid_row} and {max_resid_column}).
The smallest residual is {round(min_resid, n_decimals)} (categories {min_resid_row} and {min_resid_column}).''')
def main(rho=0.245, n=100, p=30):
    """Run one randomized neighbourhood-selection experiment.

    :param rho: passed to ``instance`` as ``rho``
    :param n: number of samples
    :param p: number of variables
    :return: the ``(null, alt)`` pair produced by ``pval``
    """
    # The second value returned by `instance` is not needed here.
    X, _, nonzero = instance(n=n, p=p, alpha=0.99, rho=rho)

    alpha = 0.8
    randomization = laplace(loc=0, scale=1.)
    loss = randomized.neighbourhood_selection(X)
    epsilon = 1.

    lam = 2. / np.sqrt(n) * np.linalg.norm(X) * norm.isf(alpha / (2 * p**2))

    # One randomization draw per off-diagonal entry.
    random_Z = randomization.rvs(p**2 - p)
    penalty = randomized.selective_l1norm(p**2 - p, lagrange=lam)

    sampler1 = randomized.selective_sampler_MH(loss, random_Z, epsilon,
                                               randomization, penalty)

    loss_args = {
        "active": sampler1.penalty.active_set,
        "quadratic_coef": epsilon
    }
    null, alt = pval(sampler1, loss_args, None, X, nonzero)
    return null, alt
def __init__(self, sigLocal, sig0, N0):
    """Convert a local significance into a global one.

    :param sigLocal: local significance, in standard deviations
    :param sig0: reference significance at which ``N0`` was measured
    :param N0: reference count at ``sig0``

    All inputs and intermediate quantities are stored as attributes:
    ``pLocal``, ``p0``, ``u``, ``u0``, ``N``, ``pGlobal``, ``sigGlobal``
    and ``trialFactor``.
    """
    # Significances -> one-sided p-values.
    local_p = norm.sf(sigLocal)
    ref_p = norm.sf(sig0)

    # Chi-square (1 dof) statistic matching each doubled p-value.
    stat = chi2.isf(local_p * 2, 1)
    ref_stat = chi2.isf(ref_p * 2, 1)

    # The main equations
    scaled_count = N0 * exp(-(stat - ref_stat) / 2.)
    global_p = scaled_count + chi2.sf(stat, 1) / 2.

    self.sigGlobal = norm.isf(global_p)
    self.sigLocal = sigLocal
    self.sig0 = sig0
    self.pGlobal = global_p
    self.pLocal = local_p
    self.p0 = ref_p
    self.N0 = N0
    self.N = scaled_count
    self.u0 = ref_stat
    self.u = stat
    self.trialFactor = global_p / local_p
def qnorm(q, mean=0, sd=1, lowertail=True):
    """
    ============================================================================
                                                                         qnorm()
    ============================================================================
    The quantile function for the normal distribution.

    Given a probability (eg q=0.75) or an array of probabilities, returns
    the value(s) of the normal distribution at which that much probability
    lies in the lower tail (or in the upper tail when lowertail is False).

    USAGE:
    cnorm(mean=0, sd=1, type="equal", conf=0.95)
    dnorm(x, mean=0, sd=1, log=False)
    pnorm(q, mean=0, sd=1, lowertail=True, log=False)
    qnorm(p, mean=0, sd=1, lowertail=True, log=False)
    rnorm(n=1, mean=0, sd=1)

    :param q (float, array of floats): The quantile(s)
    :param mean (float): mean of the distribution
    :param sd (float): standard deviation
    :param lowertail (bool): lowertail (true), or survival (false)
    :return: the value(s) corresponding to the quantiles q
    ============================================================================
    """
    # TODO: check that q is between 0.0 and 1.0
    inverse = norm.ppf if lowertail else norm.isf
    return inverse(q=q, loc=mean, scale=sd)
def cpt_ppm_a_norm(mean, variance, alpha=0.05):
    """
    Compute a Posterior Probability Map (fixed alpha) by assuming a
    Gaussian distribution.

    Returns, per entry, the value exceeded with probability ``alpha``
    under a normal distribution with the given mean and variance.

    Expected shape of 'mean', 'variance': (voxel)
    """
    std_dev = variance**.5
    return norm.isf(alpha, mean, std_dev)
def linearity_test(self):
    """Runs test on the residuals, ordered by the observed y values.

    Counts positive residuals, negative residuals and sign runs, computes
    the standardized number of runs Z, and compares |Z| with the normal
    critical value at ``self.alpha``. Intermediate values are written
    through ``self.log`` and ``self.ltw``.

    Returns True when the model form is judged linear, False otherwise.
    """
    self.ltw('\\subsubsection{Liniowość postaci modelu}\n')
    ordered = sorted(zip(self.residuals, self.y_data), key=itemgetter(1))

    norm_alpha = norm.isf(q=self.alpha)
    positives = negatives = runs = 0
    previous = 0
    for resid, _ in ordered:
        if resid > 0:
            positives += 1
            # A new run starts when the previous residual was not positive.
            if not previous > 0:
                runs += 1
        elif resid < 0:
            negatives += 1
            if not previous < 0:
                runs += 1
        previous = resid

    n = self.n
    z = (runs - (((2 * positives * negatives) / n) + 1)) / (math.sqrt(
        (2 * positives * negatives * (2 * positives * negatives - n))
        / ((n-1) * (n ** 2))))

    self.log(z, norm_alpha)
    self.ltw(f'\\[Z = {z}\\]\n')
    self.ltw(f'\\[k_{{{self.alpha}, 0, 1}} = {norm_alpha}\\]\n')

    if abs(z) > norm_alpha:
        self.log("Model nieliniowy.")
        self.ltw("Postać modelu nie jest liniowa.\n")
        return False
    self.ltw('Postać modelu jest liniowa.\n')
    return True
def quantile_gaussianize(x):
    """Normalize a sequence of values via rank and Normal c.d.f.

    Finite values are replaced by the standard-normal quantile of their
    rank; non-finite values are passed through unchanged. The input is
    copied, never modified.

    Args:
        x (array_like): sequence of values.

    Returns:
        Gaussian-normalized values.

    Example:

    .. doctest::

        >>> from scipy_sugar.stats import quantile_gaussianize
        >>> print(quantile_gaussianize([-1, 0, 2]))
        [-0.67448975  0.          0.67448975]
    """
    from scipy.stats import norm, rankdata

    values = asarray(x, float).copy()
    finite = isfinite(values)
    # Negate so that, combined with the inverse *survival* function below,
    # larger inputs map to larger outputs.
    values[finite] *= -1

    out = empty_like(values)
    ranks = rankdata(values[finite])
    out[finite] = norm.isf(ranks / (sum(finite) + 1))
    out[~finite] = values[~finite]
    return out
def __init__(self, sigLocal, sig0, N0):
    """Compute the global significance matching a local significance.

    :param sigLocal: local significance, in standard deviations
    :param sig0: reference significance at which ``N0`` was measured
    :param N0: reference count at ``sig0``

    Inputs, p-values, test statistics, the global p-value and significance
    and the trial factor are all stored on the instance.
    """
    self.sigLocal, self.sig0, self.N0 = sigLocal, sig0, N0

    # Convert significance to p-value
    self.pLocal = norm.sf(sigLocal)
    self.p0 = norm.sf(sig0)

    # Get the test statistic value corresponding to the p-value
    self.u = chi2.isf(self.pLocal*2, 1)
    self.u0 = chi2.isf(self.p0*2, 1)

    # The main equations
    self.N = N0 * exp(-(self.u - self.u0)/2.)
    self.pGlobal = self.N + chi2.sf(self.u, 1)/2.

    # Further info
    self.sigGlobal = norm.isf(self.pGlobal)
    self.trialFactor = self.pGlobal/self.pLocal
def _significance_direct(n_on, mu_bkg):
    """Compute significance directly via Poisson probability.

    Use this method for small ``n_on < 10``.
    In this case the Li & Ma formula isn't correct any more.

    The tail probability is that of observing ``n_on`` *or more* counts.
    For ``n_on = 0`` that probability is 1, so the significance is ``-inf``.

    TODO: add large unit test coverage (where is it numerically precise enough)?
    TODO: check coverage with MC simulation
    """
    from scipy.stats import norm, poisson

    # poisson.sf(k) is P(X > k), which excludes k itself. Evaluating at
    # n_on - 1 gives P(X >= n_on); evaluating at n_on dropped the observed
    # count and produced positive significances for zero observed counts.
    probability = poisson.sf(n_on - 1, mu_bkg)

    # Convert probability to a significance
    significance = norm.isf(probability)

    return significance
def gaussianize(vec):
    """Force the values in [vec] to be gaussian via their ranks.

    Each value is replaced by the standard-normal quantile of its rank, and
    the result is rescaled to unit standard deviation.
    """
    order = np.argsort(vec)
    ranks = np.argsort(order)
    # Map ranks into the open interval (0, 1) so the quantiles stay finite.
    quantile_levels = (ranks + 1).astype(float) / (ranks.max() + 2)
    vals = norm.isf(1 - quantile_levels)
    return vals / vals.std()
def probability_to_significance_normal(probability):
    """Convert one-sided tail probability to significance.

    The significance is the inverse survival function of the standard
    normal distribution evaluated at ``probability``.

    Parameters
    ----------
    probability : array_like
        One-sided tail probability

    Returns
    -------
    significance : ndarray
        Significance

    See Also
    --------
    significance_to_probability_normal,
    probability_to_significance_normal_limit

    Examples
    --------
    >>> probability_to_significance_normal(1e-10)
    6.3613409024040557
    """
    from scipy.stats import norm

    significance = norm.isf(probability)
    return significance
def h2_obs_to_liab(h2_obs, P, K):
    '''
    Converts heritability on the observed scale in an ascertained sample to
    heritability on the liability scale in the population.

    Parameters
    ----------
    h2_obs : float
        Heritability on the observed scale in an ascertained sample.
    P : float in (0,1)
        Prevalence of the phenotype in the sample.
    K : float in (0,1)
        Prevalence of the phenotype in the population.

    Returns
    -------
    h2_liab : float
        Heritability of liability in the population. When both P and K are
        NaN, h2_obs is returned unchanged.

    Raises
    ------
    ValueError
        If K or P lies outside the open interval (0,1).

    '''
    if np.isnan(P) and np.isnan(K):
        return h2_obs
    if not 0 < K < 1:
        raise ValueError('K must be in the range (0,1)')
    if not 0 < P < 1:
        raise ValueError('P must be in the range (0,1)')

    # Liability threshold: the standard-normal quantile with upper tail K.
    thresh = norm.isf(K)
    numerator = K ** 2 * (1 - K) ** 2
    denominator = P * (1 - P) * norm.pdf(thresh) ** 2
    return h2_obs * (numerator / denominator)
def calculate_mean_confidence_interval_large(series, confidence_interval=0.90):
    """Normal-approximation confidence interval for the mean of ``series``.

    :param series: object providing ``mean()``, ``var()`` and ``count()``
    :param confidence_interval: two-sided confidence level
    :return: closed ``FloatInterval`` centred on the mean; the half-width
             is rounded to one decimal place
    """
    centre = series.mean()
    std_dev = math.sqrt(series.var())
    n_obs = series.count()
    # Two-sided interval: half of the remaining probability in each tail.
    tail = (1 - confidence_interval) / 2
    half_width = round(norm.isf(tail) * (std_dev / math.sqrt(n_obs)), 1)
    return FloatInterval.closed(centre - half_width, centre + half_width)
def z_score(pvalue, one_minus_pvalue=None):
    """ Return the z-score(s) corresponding to certain p-value(s) and,
    optionally, one_minus_pvalue(s) provided as inputs.

    Parameters
    ----------
    pvalue: float or 1-d array shape=(n_pvalues,) computed using
            the survival function

    one_minus_pvalue: float or
                      1-d array shape=(n_one_minus_pvalues,), optional;
                      it shall take the value returned by
                      /nilearn/glm/contrasts.py::one_minus_pvalue
                      which computes the p_value using the cumulative
                      distribution function,
                      with n_one_minus_pvalues = n_pvalues

    Returns
    -------
    z_scores: 1-d array shape=(n_z_scores,), with n_z_scores = n_pvalues
    """
    # Keep the inputs away from 0 and 1, where the inverse functions
    # return infinities.
    lowest, highest = 1.e-300, 1. - 1.e-16

    pvalue = np.clip(pvalue, lowest, highest)
    z_scores_sf = norm.isf(pvalue)
    if one_minus_pvalue is None:
        return z_scores_sf

    one_minus_pvalue = np.clip(one_minus_pvalue, lowest, highest)
    z_scores_cdf = norm.ppf(one_minus_pvalue)

    # Negative z-scores are taken from the cdf-based values, the others
    # from the survival-function-based values.
    negative = z_scores_sf < 0
    non_negative = np.logical_not(negative)
    z_scores = np.empty(pvalue.size)
    z_scores[np.atleast_1d(negative)] = z_scores_cdf[negative]
    z_scores[np.atleast_1d(non_negative)] = z_scores_sf[non_negative]
    return z_scores
def qnorm(p, mean=0, sd=1, lowertail=True):
    """
    ============================================================================
                                                                         qnorm()
    ============================================================================
    The quantile function for the normal distribution.

    Given a probability (eg p=0.75) or an array of probabilities, returns
    the value(s) of the normal distribution at which that much probability
    lies in the lower tail (or in the upper tail when lowertail is False).

    USAGE:
    cnorm(mean=0, sd=1, type="equal", conf=0.95)
    dnorm(x, mean=0, sd=1, log=False)
    pnorm(q, mean=0, sd=1, lowertail=True, log=False)
    qnorm(p, mean=0, sd=1, lowertail=True, log=False)
    rnorm(n=1, mean=0, sd=1)

    :param p (float, array of floats): The quantile(s)
    :param mean (float): mean of the distribution
    :param sd (float): standard deviation
    :param lowertail (bool): lowertail (true), or survival (false)
    :return: the value(s) corresponding to the quantiles p
    ============================================================================
    """
    # TODO: check that p is between 0.0 and 1.0
    if not lowertail:
        return norm.isf(q=p, loc=mean, scale=sd)
    return norm.ppf(q=p, loc=mean, scale=sd)
def _significance_direct_on_off(n_on, n_off, alpha):
    """Compute significance directly via Poisson probability.

    Use this method for small n_on < 10.
    In this case the Li & Ma formula isn't correct any more.

    The tail probability starts at 1 and has one term subtracted for each
    count value from 0 to ``n_on - 1``.

    * TODO: add reference
    * TODO: add large unit test coverage (where is it numerically precise enough)?
    * TODO: check coverage with MC simulation
    * TODO: implement in Cython and vectorize n_on (accept numpy array n_on as input)
    """
    from math import factorial as fac
    from scipy.stats import norm

    # Compute tail probability to see n_on or more counts
    tail = 1
    for count in range(n_on):
        weight = alpha ** count / (1 + alpha) ** (n_off + count + 1)
        multiplicity = fac(n_off + count) / (fac(count) * fac(n_off))
        tail -= weight * multiplicity

    # Convert probability to a significance
    return norm.isf(tail)
def wald_weighted_unc(k, N, cl=one_sigma):
    '''
    Calculate the symmetric Wald uncertainty for a weighted sample, where "k"
    is the array of weights in the survival sample and "N" in the main sample.

    :param k: passed weights.
    :type k: numpy.ndarray(float)
    :param N: total weights.
    :type N: numpy.ndarray(float)
    :param cl: confidence level.
    :type cl: float or numpy.ndarray(float)
    :returns: symmetric uncertainty.
    :rtype: float or numpy.ndarray(float)
    '''
    # Two-sided interval: half of the remaining probability in each tail.
    z = norm.isf((1. - cl) / 2.)

    total = np.sum(N, axis=None)
    passed = np.sum(k, axis=None)
    failed = total - passed

    # Sums of squared weights for the passed and the failed parts.
    passed_sq = np.sum(k * k, axis=None)
    failed_sq = np.sum(N * N, axis=None) - passed_sq

    variance = (passed**2 * failed_sq + failed**2 * passed_sq) / total**4
    return z * np.sqrt(variance)
def extreme_values(weighted_residuals, confidence_interval):
    '''
    This function uses extreme value theory to calculate the number of
    standard deviations away from the mean at which we should expect to bracket
    *all* of our n data points at a certain confidence level.

    It then uses that value to identify which (if any) of the data points
    lie outside that region, and calculates the corresponding probabilities
    of finding a data point at least that many standard deviations away.

    Parameters
    ----------
    weighted_residuals : array of floats
        Array of residuals weighted by the square root of their variances
        wr_i = r_i/sqrt(var_i)

    confidence_interval : float
        Probability at which all the weighted residuals lie
        within the confidence bounds

    Returns
    -------
    confidence_bound : float
        Number of standard deviations at which we should expect to encompass
        all data at the user-defined confidence interval.

    indices : list of ints
        Indices of weighted residuals exceeding the confidence_interval
        defined by the user

    probabilities : array of floats
        The probabilities that the extreme data point of the distribution lies
        further from the mean than the observed position wr_i for each i in
        the "indices" output array.
    '''
    n = len(weighted_residuals)
    log_n = np.log(n)

    # Location, scale and shape of the extreme value distribution; all
    # three are good approximations for > 10 data points.
    mean = norm.isf(1./n)
    scale = 0.8/np.power(log_n, 1./2.)
    c = 0.33/np.power(log_n, 3./4.)

    # We now need a 1-tailed probability from the given confidence_interval
    # p_total = 1. - confidence_interval = p_upper + p_lower - p_upper*p_lower
    # p_total = 1. - confidence_interval = 2p - p^2, therefore:
    p = 1. - np.sqrt(confidence_interval)
    confidence_bound = genextreme.isf(p, c, loc=mean, scale=scale)

    indices = []
    for i, r in enumerate(weighted_residuals):
        if np.abs(r) > confidence_bound:
            indices.append(i)

    # Convert back to 2-tailed probabilities
    one_tailed = genextreme.sf(np.abs(weighted_residuals[indices]), c,
                               loc=mean, scale=scale)
    probabilities = (1. - np.power(one_tailed - 1., 2.))

    return confidence_bound, indices, probabilities
def z_score_combine(pvals, sigma):
    """Combine correlated p-values with a z-score correction.

    The mean z-score is divided by a scale derived from the number of
    p-values and the strictly lower triangle of ``sigma``.

    Returns a dict ``{'p': combined_p_value, 'OK': True}``. ``pvals`` is
    copied, so the caller's sequence is left untouched.
    """
    n_pvals = len(pvals)
    pvals = np.array(pvals, dtype=np.float64)
    # A p-value of exactly 1 would map to -inf; nudge it just below 1.
    pvals[pvals == 1] = 1.0 - 9e-16

    z = np.mean(norm.isf(pvals, loc=0, scale=1))
    off_diagonal = np.tril(sigma, k=-1).sum()
    sz = 1.0 / n_pvals * np.sqrt(n_pvals + 2 * off_diagonal)
    return {'p': norm.sf(z / sz), 'OK': True}
def quantile_gaussianize(x):
    """Normalize a sequence of values via rank and Normal c.d.f.

    Finite values are replaced by the standard-normal quantile of their
    rank; non-finite values are passed through unchanged.

    Args:
        x (array_like): sequence of values. It is not modified.

    Returns:
        Gaussian-normalized values, as a float array.
    """
    from numpy import asarray

    # Work on a float copy. Negating in place used to flip the sign of the
    # caller's data, failed on plain lists, and truncated the quantiles for
    # integer arrays.
    x = asarray(x, float).copy()
    ok = isfinite(x)
    # Negate so that, combined with the inverse *survival* function below,
    # larger inputs map to larger outputs.
    x[ok] *= -1

    y = empty_like(x)
    y[ok] = rankdata(x[ok])
    y[ok] = norm.isf(y[ok] / (sum(ok) + 1))
    y[~ok] = x[~ok]
    return y
def __init__(self, power=0.8, sig=0.05):
    """Store the test settings and derive the two-tailed critical z value.

    :param power: desired statistical power
    :param sig: significance level of the two-tailed test
    """
    from scipy.stats import norm

    self.power, self.sig = power, sig
    # 2-tail test: the significance level is split over both tails.
    self.z_need = norm.isf(sig / 2)
    self.best = None
    self.to_pick = None
    self.eliminated = []
def zscore_cluster(formula, methylations, covs, coef, robust=False):
    """Combine a cluster's p-values with a correlation-adjusted z-score.

    The per-item results from ``_combine_cluster`` are collapsed: the
    p-values become one combined p-value, ``t`` and ``coef`` become their
    means, and the ``corr`` entry is dropped. ``robust`` is accepted but
    not used here.
    """
    result = _combine_cluster(formula, methylations, covs, coef)

    pvals = result["p"]
    mean_z = np.mean(norm.isf(pvals))
    n_pvals = len(pvals)
    # Scale from the strictly lower triangle of the correlation matrix.
    off_diagonal = np.tril(result["corr"], k=-1).sum()
    sz = 1.0 / n_pvals * np.sqrt(n_pvals + 2 * off_diagonal)

    result["p"] = norm.sf(mean_z / sz)
    result["t"], result["coef"] = result["t"].mean(), result["coef"].mean()
    result.pop("corr")
    return result
def test_fdr_p_values():
    """Check ``fdr_p_values`` on a sample with ten strongly significant values."""
    n_samples = 100
    pvals = np.linspace(.5 / n_samples, 1. - .5 / n_samples, n_samples)
    # Plant ten very small p-values, then convert everything to z-scores.
    pvals[:10] = .0005
    z_vals = norm.isf(pvals)

    fdr = fdr_p_values(z_vals)
    assert_array_almost_equal(fdr[:10], .005)
    assert_true((fdr[10:] > .95).all())
    assert_true(fdr.max() <= 1)
def make_roc(y, age, patho, tiv, sex, label='dunno', detrend=DETREND):
    """Plot the normative model and compute an ROC curve for AD vs controls.

    A linear model is fitted on the controls (``patho == ' Normal'``) and
    the AD values are normalized with that same model, either using tiv and
    sex as confounds (``detrend`` true) or after dividing by tiv and
    regressing on age alone.

    Returns ``(alphas, betas)``: the tested levels and, for each, the
    fraction of AD values falling below the model prediction minus the
    matching margin.
    """
    controls = np.where(patho == ' Normal')
    patients = np.where(patho == ' AD')

    # use either a linear model with tiv and sex as confounds or a
    # linear model wrt age (without confounds)
    if detrend:
        M = LinearModel(y[controls], age[controls], tiv[controls],
                        sex[controls])
        y0 = M.normalize()
        y = M.renormalize(y[patients], age[patients], tiv[patients],
                          sex[patients])
    else:
        y = y / tiv
        M = LinearModel(y[controls], age[controls])
        y0 = y[controls]
        y = y[patients]

    # plot curves: model prediction with a one-sided 5% band, plus the data
    figure()
    a, nm = M.predict(55, 95)
    band = np.sqrt(M.s2) * ssnorm.isf(.05)
    plot(a, nm, 'k')
    plot(a, nm + band, 'k:')
    plot(a, nm - band, 'k:')
    plot(age[controls], y0, 'ok')
    plot(age[patients], y, 'or')
    xlabel('age', fontsize=16)
    ylabel(label, fontsize=16)

    # roc curves
    z = y
    zm = M.beta[0] + age[patients] * M.beta[1]
    alphas = np.linspace(0, 1 - SPECIFICITY_MIN, num=9999)
    betas = 0 * alphas
    for i, alpha in enumerate(alphas):
        margin = np.sqrt(M.s2) * ssnorm.isf(alpha)
        n_below = len(np.where(z < (zm - margin))[0])
        betas[i] = float(n_below) / float(z.size)
    return alphas, betas
def calc_llr_distributions(llr_nmh, llr_imh, nbins):
    """Histogram two LLR distributions and print p-values and significances.

    For each hypothesis, the p-value is the fraction of one distribution
    lying beyond the mean of the other; it is also evaluated at the mean
    plus and minus one standard error. Each p-value is converted to a
    significance in two ways: ``sqrt(2) * erfinv(1 - p)`` (two-sided) and
    ``norm.isf(p)`` (one-sided).

    :param llr_nmh: LLR values for one hypothesis; must provide ``hist``,
        ``mean`` and ``std`` and support comparison with a scalar
    :param llr_imh: LLR values for the other hypothesis, same requirements
    :param nbins: number of histogram bins
    :return: None; results are plotted and printed
    """
    plt.figure(figsize=(9, 8))
    llr_imh.hist(bins=nbins, histtype='step', lw=2, color='b')
    llr_nmh.hist(bins=nbins, histtype='step', lw=2, color='r')

    IMHTrue_mean_val = llr_nmh.mean()
    IMHTrue_std_dev = llr_nmh.std()
    IMHTrue_std_error = IMHTrue_std_dev/np.sqrt(len(llr_nmh))
    IMHTrue_pvalue = 1.0 - float(np.sum(llr_imh > IMHTrue_mean_val))/len(llr_imh)
    IMHTrue_pvalueP1S = 1.0 - float(np.sum(llr_imh > (IMHTrue_mean_val+IMHTrue_std_error)))/len(llr_imh)
    IMHTrue_pvalueM1S = 1.0 - float(np.sum(llr_imh > (IMHTrue_mean_val-IMHTrue_std_error)))/len(llr_imh)

    NMHTrue_mean_val = llr_imh.mean()
    NMHTrue_std_dev = llr_imh.std()
    NMHTrue_std_error = NMHTrue_std_dev/np.sqrt(len(llr_imh))
    NMHTrue_pvalue = float(np.sum(llr_nmh > NMHTrue_mean_val))/len(llr_nmh)
    NMHTrue_pvalueP1S = float(np.sum(llr_nmh > (NMHTrue_mean_val+NMHTrue_std_error)))/len(llr_nmh)
    NMHTrue_pvalueM1S = float(np.sum(llr_nmh > (NMHTrue_mean_val-NMHTrue_std_error)))/len(llr_nmh)

    # sqrt(2) * erfinv(1 - p) solves P(|Z| > z) = p, i.e. the two-sided
    # convention, while norm.isf(p) solves P(Z > z) = p, the one-sided one.
    # The printed labels used to be the other way round.
    IMHTrue_sigma_2side = np.sqrt(2.0)*erfinv(1.0 - IMHTrue_pvalue)
    IMHTrue_sigma_1side = norm.isf(IMHTrue_pvalue)
    print(" Using non-gauss fit: ")
    print(" IMHTrue_pvalue: %.5f" % IMHTrue_pvalue)
    print(" IMHTrue_pvalueP1S: %.5f" % IMHTrue_pvalueP1S)
    print(" IMHTrue_pvalueM1S: %.5f" % IMHTrue_pvalueM1S)
    print(" IMHTrue_sigma 2 sided (erfinv): %.4f" % IMHTrue_sigma_2side)
    print(" IMHTrue_sigma 1 sided (isf) : %.4f" % IMHTrue_sigma_1side)

    NMHTrue_sigma_2side = np.sqrt(2.0)*erfinv(1.0 - NMHTrue_pvalue)
    NMHTrue_sigma_1side = norm.isf(NMHTrue_pvalue)
    print(" Using non-gauss fit: ")
    print(" NMHTrue_pvalue: %.5f" % NMHTrue_pvalue)
    print(" NMHTrue_pvalueP1S: %.5f" % NMHTrue_pvalueP1S)
    print(" NMHTrue_pvalueM1S: %.5f" % NMHTrue_pvalueM1S)
    print(" NMHTrue_sigma 2 sided (erfinv): %.4f" % NMHTrue_sigma_2side)
    print(" NMHTrue_sigma 1 sided (isf) : %.4f" % NMHTrue_sigma_1side)

    return
def extreme_values(weighted_residuals, confidence_interval):
    '''
    This function uses extreme value theory to calculate the number of
    standard deviations away from the mean at which we should expect to bracket
    *all* of our n data points at a certain confidence level.

    It then uses that value to identify which (if any) of the data points
    lie outside that region, and calculates the corresponding probabilities
    of finding a data point at least that many standard deviations away.

    Parameters
    ----------
    weighted_residuals : array of floats
        Array of residuals weighted by the square root of their variances
        wr_i = r_i/sqrt(var_i)

    confidence_interval : float
        Probability at which all the weighted residuals lie
        within the confidence bounds

    Returns
    -------
    confidence_bound : float
        Number of standard deviations at which we should expect to encompass
        all data at the user-defined confidence interval.

    indices : list of ints
        Indices of weighted residuals exceeding the confidence_interval
        defined by the user

    probabilities : array of floats
        The probabilities that the extreme data point of the distribution lies
        further from the mean than the observed position wr_i for each i in
        the "indices" output array.
    '''
    n = len(weighted_residuals)

    # Parameters of the extreme value distribution (location, scale and
    # shape); good approximations for > 10 data points.
    evd = dict(loc=norm.isf(1./n),
               scale=0.8/np.power(np.log(n), 1./2.))
    c = 0.33/np.power(np.log(n), 3./4.)

    # We now need a 1-tailed probability from the given confidence_interval
    # p_total = 1. - confidence_interval = p_upper + p_lower - p_upper*p_lower
    # p_total = 1. - confidence_interval = 2p - p^2, therefore:
    p = 1. - np.sqrt(confidence_interval)
    confidence_bound = genextreme.isf(p, c, **evd)

    indices = [i for i, r in enumerate(weighted_residuals)
               if np.abs(r) > confidence_bound]

    # Convert back to 2-tailed probabilities
    outliers = np.abs(weighted_residuals[indices])
    probabilities = 1. - np.power(genextreme.sf(outliers, c, **evd) - 1., 2.)

    return confidence_bound, indices, probabilities
def _significance_direct(n_on, mu_bkg):
    """Compute significance directly via Poisson probability.

    The tail probability of observing ``n_on`` or more counts for a
    Poisson mean ``mu_bkg`` is converted to a significance with the
    inverse survival function of the standard normal.

    Reference: TODO (is this ever used?)
    """
    # `poisson.sf(k)` is the probability of *more than* k counts, k
    # excluded, so it is evaluated at ``k = n_on - 1`` to include n_on.
    # For `n_on = 0` the tail probability is 1 and the significance -inf.
    tail_probability = poisson.sf(n_on - 1, mu_bkg)

    # Convert probability to a significance
    significance = norm.isf(tail_probability)
    return significance
def stouffer_liptak(pvals, sigma=None):
    """
    The stouffer_liptak correction.

    Returns a dict with the combined p-value ``p``, the combined statistic
    ``C`` and an ``OK`` flag. When ``sigma`` is given, the z-scores are
    decorrelated with the inverse of its Cholesky factor first; if that
    factorization fails, the result of ``z_score_combine`` is returned
    instead.

    >>> stouffer_liptak([0.1, 0.2, 0.8, 0.12, 0.011])
    {'p': 0.0168..., 'C': 2.1228..., 'OK': True}

    >>> stouffer_liptak([0.5, 0.5, 0.5, 0.5, 0.5])
    {'p': 0.5, 'C': 0.0, 'OK': True}

    >>> stouffer_liptak([0.5, 0.1, 0.5, 0.5, 0.5])
    {'p': 0.28..., 'C': 0.57..., 'OK': True}

    >>> stouffer_liptak([0.5, 0.1, 0.1, 0.1, 0.5])
    {'p': 0.042..., 'C': 1.719..., 'OK': True}

    >>> stouffer_liptak([0.5], np.matrix([[1]]))
    {'p': 0.5...}
    """
    n_pvals = len(pvals)
    pvals = np.array(pvals, dtype=np.float64)
    # A p-value of exactly 1 would map to -inf; nudge it just below 1.
    pvals[pvals == 1] = 1.0 - 9e-16
    qvals = norm.isf(pvals, loc=0, scale=1).reshape(n_pvals, 1)
    if any(np.isinf(qvals)):
        raise Exception("bad values: %s" % pvals[list(np.isinf(qvals))])

    # dont do the correction unless sigma is specified.
    if sigma is not None:
        try:
            inverse_factor = np.asmatrix(chol(sigma)).I  # C^-1
            # qstar
            qvals = inverse_factor * qvals
        except LinAlgError:
            # The fallback's own dict, including its OK flag, is returned
            # as-is.
            return z_score_combine(pvals, sigma)

    Cp = qvals.sum() / np.sqrt(len(qvals))

    # get the right tail.
    pstar = norm.sf(Cp)
    if np.isnan(pstar):
        print("BAD:", pvals, sigma, file=sys.stderr)
        pstar = np.median(pvals)
    return {"OK": True, "C": Cp, "p": pstar}
def get_roc_curve(Y, p, smooth=False):
    """Compute an ROC curve and its AUC, optionally smoothed with KDEs.

    Parameters
    ----------
    Y : numpy.ndarray
        Class labels; the smoothed path compares them against 0 and 1.
    p : array_like
        Predicted probabilities, one per label.
    smooth : bool
        When false, the curve comes from ``roc_curve``. When true, a
        Gaussian KDE is fitted to each class on the normal-quantile scale
        and its upper-tail mass is evaluated on a fixed grid of 201
        thresholds.

    Returns
    -------
    fpr, tpr : sequences of float
        False and true positive rates.
    roc_auc : float
        Area under the curve. When it is below 0.5 the curve is flipped
        (``1 - fpr``, ``1 - tpr``) and ``1 - auc`` is reported.
    """
    if not smooth:
        fpr, tpr, _ = roc_curve(Y, p)
    else:
        from scipy.stats import gaussian_kde
        x = -norm.isf(np.array(p))
        threshold = np.linspace(-10, 10, 201)
        # Fit each density once; they used to be rebuilt for every
        # threshold, 201 times per class, with identical results.
        kde_negative = gaussian_kde(x[Y == 0], 0.2)
        kde_positive = gaussian_kde(x[Y == 1], 0.2)
        fpr = [kde_negative.integrate_box(t, np.inf) for t in threshold]
        tpr = [kde_positive.integrate_box(t, np.inf) for t in threshold]

    roc_auc = auc(fpr, tpr)
    if roc_auc < 0.5:
        fpr = 1-np.array(fpr)
        tpr = 1-np.array(tpr)
        roc_auc = 1-roc_auc
    return fpr, tpr, roc_auc
def _significance_direct(n_observed, mu_background):
    """Compute significance directly via Poisson probability.

    Use this method for small n_observed < 10.
    In this case the Li & Ma formula isn't correct any more.

    The tail probability is that of observing ``n_observed`` *or more*
    counts. For ``n_observed = 0`` that probability is 1, so the
    significance is ``-inf``.

    TODO: add large unit test coverage (where is it numerically precise enough)?
    TODO: check coverage with MC simulation
    """
    from scipy.stats import norm, poisson

    # poisson.sf(k) is P(X > k), which excludes k itself. Evaluating at
    # n_observed - 1 gives P(X >= n_observed), the tail this function is
    # meant to use; evaluating at n_observed dropped the observed count.
    probability = poisson.sf(n_observed - 1, mu_background)

    # Convert probability to a significance
    significance = norm.isf(probability)

    return significance
def calc_CI(A, Z, r, alpha):
    """Convert the adjusted normal quantiles into a pair of integer ranks.

    A : from calc_A
    Z : from calc_Z
    r : scale applied to the adjusted probabilities before truncation
    alpha : confidence bound

    Returns an array ``(lo, up)`` of int32 values.
    """
    z = norm.isf(alpha / 2.0)

    def adjusted_probability(shifted):
        """Normal cdf of the adjusted quantile for `shifted` = Z -/+ z."""
        return norm.cdf(Z + shifted / (1.0 - A * shifted))

    lo = np.int32(adjusted_probability(Z - z) * r)
    up = np.int32(adjusted_probability(Z + z) * r)
    return np.array((lo, up))
def calc_CI(self, A, Z):
    '''
    Convert the adjusted normal quantiles into a pair of integer ranks.

    A : from calc_A
    Z : from calc_Z

    The confidence bound is read from self.alpha and the scale from self.r.
    Returns an array (lo, up) of int32 values.
    '''
    z = norm.isf(self.alpha/2.)
    bounds = []
    for shifted in (Z - z, Z + z):
        adjusted = Z + shifted / (1. - A*shifted)
        # Scale the adjusted probability by r and truncate to an integer.
        bounds.append(np.int32(norm.cdf(adjusted)*self.r))
    return np.array((bounds[0], bounds[1]))