def sample_size_chi2(y, x, pars):
    alpha = pars['alpha']
    m, n = x.shape
    x = np.squeeze(np.asarray(x))
    y = np.squeeze(np.asarray(y))
    nCl = len(np.unique(np.asarray(y)))
    s_size = m
    N = round(pow(m, 1/3))
    nBinsArray = [nCl]
    nBinsArray.extend([N]*n)
    p, edges = np.histogramdd((y, x), bins=nBinsArray)
    idx = list(range(m))  # mutable list so it can be shuffled in place
    chi_min = chi2.ppf(alpha/2, 1.5*N)
    chi_max = chi2.ppf(1 - alpha/2, 1.5*N)
    while s_size > N:
        chi2d = []
        for nIter in range(50):
            random.shuffle(idx)
            idx1 = idx[:s_size]
            random.shuffle(idx)
            idx2 = idx[:s_size]
            chi2d.append(chi2div(y, x, idx1, idx2, edges))
        chi2d = sum(chi2d) / len(chi2d)  # average divergence over the replicates
        if chi2d < chi_min or chi2d > chi_max:
            s_size += 1
            break
        s_size -= 1
    return s_size
def calc_lambduh(num_flds, side_len):
    # ALPHA is assumed to be defined at module level
    lower_bound = .5 * chi2.ppf(ALPHA/2.0, 2*sum(num_flds)) / len(num_flds)
    upper_bound = .5 * chi2.ppf(1 - ALPHA/2.0, 2*sum(num_flds) + 2) / len(num_flds)
    lambduh = 1.0 * np.average(num_flds) / (side_len**2)
    lambduh_bound = [lower_bound/side_len**2, upper_bound/side_len**2]
    area = side_len**2
    return lambduh, lambduh_bound, area
def ci_var(self, lower_bound=None, upper_bound=None, sig=.05):
    """
    Returns the confidence interval for the variance.

    Parameters
    ----------
    lower_bound : float
        The minimum value the lower confidence interval can take.
        The p-value from test_var(lower_bound) must be lower
        than 1 - significance level. Default is .99 confidence
        limit assuming normality

    upper_bound : float
        The maximum value the upper confidence interval can take.
        The p-value from test_var(upper_bound) must be lower
        than 1 - significance level. Default is .99 confidence
        limit assuming normality

    sig : float
        The significance level. Default is .05

    Returns
    -------
    Interval : tuple
        Confidence interval for the variance

    Examples
    --------
    >>> import numpy as np
    >>> import statsmodels.api as sm
    >>> random_numbers = np.random.standard_normal(100)
    >>> el_analysis = sm.emplike.DescStat(random_numbers)
    >>> el_analysis.ci_var()
    (0.7539322567470305, 1.229998852496268)
    >>> el_analysis.ci_var(.5, 2)
    (0.7539322567469926, 1.2299988524962664)

    Notes
    -----
    If the function returns the error f(a) and f(b) must have
    different signs, consider lowering lower_bound and raising
    upper_bound.
    """
    endog = self.endog
    if upper_bound is None:
        upper_bound = ((self.nobs - 1) * endog.var()) / \
            (chi2.ppf(.0001, self.nobs - 1))
    if lower_bound is None:
        lower_bound = ((self.nobs - 1) * endog.var()) / \
            (chi2.ppf(.9999, self.nobs - 1))
    self.r0 = chi2.ppf(1 - sig, 1)
    llim = optimize.brentq(self._ci_limits_var, lower_bound, endog.var())
    ulim = optimize.brentq(self._ci_limits_var, endog.var(), upper_bound)
    return llim, ulim
def poisson_interval(k, alpha=0.05):
    """
    Uses chi-squared quantiles to get the Poisson interval.
    Uses scipy.stats (imports in function).
    """
    from scipy.stats import chi2
    a = alpha
    low, high = (chi2.ppf(a / 2, 2 * k) / 2,
                 chi2.ppf(1 - a / 2, 2 * k + 2) / 2)
    if k == 0:
        low = 0.0
    return low, high
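# A minimal usage sketch for poisson_interval as defined above: for an observed
# count k it returns a central (1 - alpha) confidence interval for the Poisson
# mean via the chi-squared/gamma duality; the k == 0 case is special-cased so
# the lower bound is exactly 0.
for k in (0, 3, 10):
    low, high = poisson_interval(k, alpha=0.05)
    print("k = {:2d} -> 95% interval ({:.3f}, {:.3f})".format(k, low, high))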
def chi2_distribution():
    fig, ax = plt.subplots(1, 1)
    # display the probability density function
    df = 10
    x = np.linspace(chi2.ppf(0.01, df), chi2.ppf(0.99, df), 100)
    ax.plot(x, chi2.pdf(x, df))

    # simulate the chi2 distribution
    y = []
    n = 10
    for i in range(1000):
        chi2r = 0.0
        r = norm.rvs(size=n)
        for j in range(n):
            chi2r = chi2r + r[j]**2
        y.append(chi2r)

    ax.hist(y, density=True, alpha=0.2)  # 'normed' was removed in matplotlib 3.x
    plt.show()

    fig, ax = plt.subplots(1, 1)
    # display the probability density function
    df = 10
    x = np.linspace(-4, 4, 100)
    ax.plot(x, t.pdf(x, df))

    # simulate the t-distribution
    y = []
    for i in range(1000):
        rx = norm.rvs()
        ry = chi2.rvs(df)
        rt = rx / np.sqrt(ry / df)
        y.append(rt)

    ax.hist(y, density=True, alpha=0.2)
    plt.show()

    fig, ax = plt.subplots(1, 1)
    # display the probability density function
    dfn, dfm = 10, 5
    x = np.linspace(f.ppf(0.01, dfn, dfm), f.ppf(0.99, dfn, dfm), 100)
    ax.plot(x, f.pdf(x, dfn, dfm))

    # simulate the F-distribution
    y = []
    for i in range(1000):
        rx = chi2.rvs(dfn)
        ry = chi2.rvs(dfm)
        rf = (rx / dfn) / (ry / dfm)  # F is a ratio of scaled chi2 variates (no square root)
        y.append(rf)

    ax.hist(y, density=True, alpha=0.2)
    plt.show()
def get_lambda(p, definition='median'):
    '''
    Evaluates the lambda value
    :param p: distribution of p-values
    :param definition: definition of lambda
    :return:
    '''
    if definition == 'median':
        pm = np.median(p)
        Chi = chi2.ppf(1. - pm, 1)
        return Chi / chi2.ppf(0.5, 1)
    else:
        raise Exception("Only 'median' definition of lambda is implemented at this moment.")
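# A hedged usage sketch for get_lambda as defined above (assumes numpy and
# scipy.stats.chi2 are imported at module level): for p-values drawn uniformly
# on (0, 1) the median is ~0.5, so lambda should come out close to 1; squaring
# the p-values pushes them toward 0 and inflates lambda.
p_null = np.random.uniform(size=100000)
print(get_lambda(p_null))      # ~1.0
print(get_lambda(p_null**2))   # > 1.0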
def plot_profile_1d(data, likes, minval, maxval, xlabel='Parameter value'):
    bins, PL = profile_1d(data, likes, minval, maxval)
    pl.figure()
    pl.plot(bins[1:] - 0.5*(bins[1] - bins[0]), PL, 'k-', linewidth=2.0)
    pl.ylim([0, 10])
    pl.axhline(0.5*chi2.ppf(0.683, 1), color='b', linestyle='--')
    pl.axhline(0.5*chi2.ppf(0.995, 1), color='b', linestyle='--')
    pl.text(2.7, 0.5*chi2.ppf(0.683, 1), r'$1\sigma$')
    pl.text(2.7, 0.5*chi2.ppf(0.995, 1), r'$2\sigma$')
    pl.xlabel(xlabel)
    pl.ylabel(r'$\Delta \log \mathcal{L}$')
def predicate(cls, tasks, user_id, cost):
    if len(tasks) < 3:
        return None, None, None, None, None

    # use only same user tasks?
    same_user_tasks = filter_user_id(tasks, user_id)
    if len(same_user_tasks) > 3:
        tasks = same_user_tasks

    # use only same cost tasks?
    same_cost_tasks = filter_cost(tasks, cost)
    if len(same_cost_tasks) > 3:
        tasks = same_cost_tasks

    # use only last N tasks
    tasks = tasks[-8:]

    sample = np.array([x['actualWorkTime'] / x['cost'] for x in tasks])
    n = sample.size
    mu = np.mean(sample)
    s2 = np.var(sample, ddof=1)
    t45 = sci_t.ppf(0.95, n - 1)
    mlow, mhigh = mu + np.array([-t45, t45]) * (np.sqrt(s2) / np.sqrt(n))
    chi45a = sci_chi2.ppf(0.95, n - 1)
    shigh = np.sqrt((n - 1) * s2 / chi45a)
    low, high = mlow - shigh, mhigh + shigh
    return (mlow + mhigh) / 2 * cost, mlow * cost, mhigh * cost, low * cost, high * cost
def chi2_2sample_crit(alpha, df):
    """
    @param alpha significance level (critical value is the 1 - alpha quantile)
    @param df degrees of freedom
    """
    crit = chi2.ppf(1.0 - alpha, df)
    return crit
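# A minimal usage sketch for chi2_2sample_crit as defined above (assumes
# scipy.stats.chi2 is imported at module level): reject the null hypothesis
# when the observed chi-squared statistic exceeds the critical value.
observed_stat = 12.3                 # hypothetical test statistic
crit = chi2_2sample_crit(0.05, 5)    # ~11.07 for alpha = 0.05, df = 5
print(crit, observed_stat > crit)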
def _quantile_notTruncated(self, q, tol=1.e-6):
    """
    Compute the quantile for the non truncated distribution

    Parameters
    ----------
    q : float
        quantile you want to compute. Between 0 and 1

    tol : float
        precision for the output

    Returns
    -------
    x : float
        x such that P(X < x) = q
    """
    scale = self._scale
    k = self._k
    dps = self._dps

    z_approx = scale * chi2.ppf(q, k)

    epsilon = scale * 0.001
    lb = z_approx - epsilon
    ub = z_approx + epsilon

    f = lambda z: self._cdf_notTruncated(-np.inf, z, dps)
    z = find_root(f, q, lb, ub, tol)

    return z
def plot_cov_ellipse(cov, pos, volume=.5, ax=None, fc='none', ec=[0, 0, 0], a=1, lw=2):
    """ Helper Method: draw ellipse of gaussian mixture"""
    import numpy as np
    from scipy.stats import chi2
    import matplotlib.pyplot as plt
    from matplotlib.patches import Ellipse

    def eigsorted(cov):
        vals, vecs = np.linalg.eigh(cov)
        order = vals.argsort()[::-1]
        return vals[order], vecs[:, order]

    if ax is None:
        ax = plt.gca()

    vals, vecs = eigsorted(cov)
    theta = np.degrees(np.arctan2(*vecs[:, 0][::-1]))

    kwrg = {'facecolor': fc, 'edgecolor': ec, 'alpha': a, 'linewidth': lw}

    # Width and height are "full" widths, not radius
    width, height = 2 * np.sqrt(chi2.ppf(volume, 2)) * np.sqrt(vals)
    ellip = Ellipse(xy=pos, width=width, height=height, angle=theta, **kwrg)
    ax.add_artist(ellip)
def GlobalTest(self, T, n):
    Tcrit = chi2.ppf(0.05, n)
    if T > Tcrit:
        GTResult = True
    else:
        GTResult = False
    return GTResult
def generate_emissions(theta, data):
    def _eigsorted(cov):
        vals, vecs = np.linalg.eigh(cov)
        order = vals.argsort()[::-1]
        return vals[order], vecs[:, order]

    xmin, xmax = np.inf, -np.inf
    ymin, ymax = np.inf, -np.inf
    fig = plt.figure()
    ax = fig.add_subplot(111)
    for i in range(len(data)):
        ax.plot(data[i][:, 0], data[i][:, 1], 'k.')
        xmin = np.min([xmin, np.min(data[i][:, 0])])
        xmax = np.max([xmax, np.max(data[i][:, 0])])
        ymin = np.min([ymin, np.min(data[i][:, 1])])
        ymax = np.max([ymax, np.max(data[i][:, 1])])

    vol = [0.25, 0.5, 0.75, 0.95, 0.99]
    ell = []
    for i in range(len(theta)):
        pos = theta[i].mean
        vals, vecs = _eigsorted(theta[i].var)
        th = np.degrees(np.arctan2(*vecs[:, 0][::-1]))
        for v in vol:
            width, height = 2.0 * np.sqrt(chi2.ppf(v, 2)) * np.sqrt(vals)
            ell.append(Ellipse(xy=pos, width=width, height=height, angle=th))

    for i, e in enumerate(ell):
        ax.add_artist(e)
        e.set_facecolor(my_color_map(i))
        e.set_alpha(0.5)

    ax.set_xlim(xmin, xmax)
    ax.set_ylim(ymin, ymax)
    plt.show()
def ci_cmle_is(X, v, theta_grid, alpha_level, T=100, verbose=False):
    cmle_is = np.empty_like(theta_grid)
    r = X.sum(1)
    c = X.sum(0)
    for l, theta_l in enumerate(theta_grid):
        logit_P_l = theta_l * v
        w_l = np.exp(logit_P_l)

        z = cond_a_sample(r, c, w_l, T)
        logf = np.empty(T)
        for t in range(T):
            logQ, logP = z[t][1], z[t][2]
            logf[t] = logP - logQ
        logkappa = -np.log(T) + logsumexp(logf)

        if verbose:
            logcvsq = -np.log(T - 1) - 2 * logkappa + \
                logsumexp(2 * logabsdiffexp(logf, logkappa))
            print('est. cv^2 = %.2f (T = %d)' % (np.exp(logcvsq), T))

        cmle_is[l] = np.sum(np.log(w_l[X])) - logkappa

    crit = -0.5 * chi2.ppf(1 - alpha_level, 1)
    ci = invert_test(theta_grid, cmle_is - cmle_is.max(), crit)
    if params['plot']:
        plot_statistics(ax_cmle_is, theta_grid, cmle_is - cmle_is.max(), crit)
    cmle_is_coverage_data['cis'].append(ci)
    cmle_is_coverage_data['theta_grid'] = theta_grid
    cmle_is_coverage_data['crit'] = crit
    return ci
def __init__(self, image):
    self.Mlist = ('threshold', 'MRF', 'Mathematical Morphology', 'gray MM')
    self.method = self.Mlist[0]
    self.img = image
    self.nvar = 3
    print('edit')
    self.param1 = chi2.ppf(0.995, self.nvar)
    img_init = self.img > self.param1
    self.param2 = 2

    self.fig, ax = plt.subplots(num=20, figsize=(30, 20))
    self.im1 = ax.imshow(img_init, cmap=plt.cm.Greys_r)
    plt.subplots_adjust(left=0.3)

    axcolor = 'lightgoldenrodyellow'
    # 'axisbg' was renamed to 'facecolor' in matplotlib 2.0
    rax = plt.axes([0.05, 0.7, 0.15, 0.15], facecolor=axcolor)
    self.radio = RadioButtons(rax, self.Mlist)

    axS1 = self.fig.add_axes([0.25, 0.1, 0.65, 0.03], facecolor=axcolor)
    self.slider1 = Slider(axS1, 'Threshold', 1, 5000, valinit=self.param1)
    axS2 = self.fig.add_axes([0.25, 0.05, 0.65, 0.03], facecolor=axcolor)
    self.slider2 = Slider(axS2, 'Smoothness', 1, 11, valinit=self.param2)
def plot_cov_ellipse(cov, pos, volume=.5, ax=None, fc='none', ec=[1, 0, 0], a=1, lw=2):
    """
    Plots an ellipse enclosing *volume* based on the specified covariance
    matrix (*cov*) and location (*pos*). Additional keyword arguments are
    passed on to the ellipse patch artist.

    Parameters
    ----------
    cov : The 2x2 covariance matrix to base the ellipse on
    pos : The location of the center of the ellipse. Expects a 2-element
        sequence of [x0, y0].
    volume : The volume inside the ellipse; defaults to 0.5
    ax : The axis that the ellipse will be plotted on. Defaults to the
        current axis.
    """
    def eigsorted(cov):
        vals, vecs = np.linalg.eigh(cov)
        order = vals.argsort()[::-1]
        return vals[order], vecs[:, order]

    if ax is None:
        ax = plt.gca()

    vals, vecs = eigsorted(cov)
    theta = np.degrees(np.arctan2(*vecs[:, 0][::-1]))

    kwrg = {'facecolor': fc, 'edgecolor': ec, 'alpha': a, 'linewidth': lw}

    # Width and height are "full" widths, not radius
    width, height = 2 * np.sqrt(chi2.ppf(volume, 2)) * np.sqrt(vals)
    ellip = Ellipse(xy=pos, width=width, height=height, angle=theta, **kwrg)
    ax.add_artist(ellip)
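# A hedged usage sketch for plot_cov_ellipse as defined above (assumes numpy,
# matplotlib.pyplot as plt, scipy.stats.chi2 and matplotlib.patches.Ellipse are
# imported at module level): draw the 95% error ellipse of a correlated 2-D
# Gaussian sample on top of its scatter plot.
pts = np.random.multivariate_normal([0, 0], [[2.0, 1.2], [1.2, 1.0]], size=500)
fig, ax = plt.subplots()
ax.plot(pts[:, 0], pts[:, 1], '.', alpha=0.3)
plot_cov_ellipse(np.cov(pts.T), pts.mean(axis=0), volume=0.95, ax=ax)
plt.show()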
def minChi(parts, all):
    pi = math.pi

    def meanBlom(tests):
        return sum([test.blom for test in tests]) / len(tests)

    def SSE(tests):
        return sum([test.blom**2 for test in tests])

    for part in parts:
        computeBlom(part, all)
    k = parts[0].n
    cut, left, right = None, None, None
    bestError = 0
    if len(parts) == 1:
        return cut, left, right
    totalError = meanBlom(parts)
    for i in range(1, len(parts)):
        lParts, rParts = parts[:i], parts[i:]
        error = k * (len(lParts) * ((meanBlom(lParts) - totalError)**2) +
                     len(rParts) * ((meanBlom(rParts) - totalError)**2))
        if error > bestError:
            bestError, cut, left, right = error, i, lParts, rParts
    v = k / (pi - 2)
    lamda = (pi / (2 * (pi - 2))) * bestError / (SSE(parts) / v)
    chi = chi2.ppf(0.99, v)
    if lamda > chi:
        return cut, left, right
    return None, None, None
def cov_ellipses(x, y, cov_mat=None, cov_tri=None, q=None, nsig=None, **kwargs):
    """Draw covariance error ellipses.

    Parameters
    ----------
    x, y : array (n,)
        Center of covariance ellipses.
    cov_mat : array (n, 2, 2), optional
        Covariance matrix.
    cov_tri : list of array (n,), optional
        Covariance matrix in flat form of (xvar, yvar, xycov).
    q : scalar or array
        Wanted (quantile) probability enclosed in error ellipse.
    nsig : scalar or array
        Probability in unit of standard error. Eg. `nsig = 1` means `q = 0.683`.
    kwargs :
        `ellipses` properties.
        Eg. c, vmin, vmax, alpha, edgecolor(ec), facecolor(fc),
        linewidth(lw), linestyle(ls), norm, cmap, transform, etc.

    References
    ----------
    [1]: http://www.visiondummy.com/2014/04/draw-error-ellipse-representing-covariance-matrix
    [2]: http://stackoverflow.com/questions/12301071/multidimensional-confidence-intervals
    """
    from scipy.stats import norm, chi2

    if cov_mat is not None:
        cov_mat = np.asarray(cov_mat)
    elif cov_tri is not None:
        assert len(cov_tri) == 3
        cov_mat = np.array([[cov_tri[0], cov_tri[2]],
                            [cov_tri[2], cov_tri[1]]])
        # Roll the first two dimensions (2, 2) to the end
        # (list concatenation; range objects cannot be added in Python 3).
        cov_mat = cov_mat.transpose(list(range(2, cov_mat.ndim)) + [0, 1])
    else:
        raise ValueError('One of `cov_mat` and `cov_tri` should be specified.')

    x, y = np.asarray(x), np.asarray(y)
    if not (cov_mat.shape[:-2] == x.shape == y.shape):
        raise ValueError('The shape of x, y and covariance are incompatible.')
    if not (cov_mat.shape[-2:] == (2, 2)):
        raise ValueError('Invalid covariance matrix shape.')

    if q is not None:
        q = np.asarray(q)
    elif nsig is not None:
        q = 2 * norm.cdf(nsig) - 1
    else:
        raise ValueError('One of `q` and `nsig` should be specified.')
    rho = chi2.ppf(q, 2)
    rho = rho.reshape(rho.shape + (1,) * x.ndim)  # raise dimensions

    val, vec = np.linalg.eigh(cov_mat)
    w = 2 * np.sqrt(val[..., 0] * rho)
    h = 2 * np.sqrt(val[..., 1] * rho)
    rot = np.degrees(np.arctan2(vec[..., 1, 0], vec[..., 0, 0]))

    return ellipses(x, y, w, h, rot=rot, **kwargs)
def chi2_distribution():
    fig, ax = plt.subplots(1, 1)
    df = 10
    x = np.linspace(chi2.ppf(0.01, df), chi2.ppf(0.99, df), 100)
    ax.plot(x, chi2.pdf(x, df))

    y = []
    n = 10
    for i in range(1000):
        chi2r = 0.0
        r = norm.rvs(size=n)
        for j in range(n):
            chi2r = chi2r + r[j]**2
        y.append(chi2r)

    ax.hist(y, density=True, alpha=0.2)  # 'normed' was removed in matplotlib 3.x
    plt.show()
def __init__(self, dataRange=None, delta=0.1, tau=0.1, vmin=None, spmin=None,
             uniform=False, fullcovs=True, regularize=0):
    # configuration params
    self.dimension = dataRange.size
    self.vmin = vmin if vmin is not None else 2 * self.dimension
    self.spmin = spmin if spmin is not None else self.dimension + 1
    self.delta = delta
    self.tau = tau
    self.SIGMA = (self.delta * dataRange)**2
    self.maxDist = chi2.ppf(1 - self.tau, self.dimension)
    self.uniform = uniform
    self.fullcovs = fullcovs
    self.regVal = regularize

    # components params
    self.priors = []
    self.means = []
    self.covs = []
    self.sps = []
    self.vs = []
    self.nc = 0

    # components outputs
    self.loglikes = []
    self.posts = []

    # Mahalanobis distance
    self.mahalaD = []

    # model likelihood
    self.dataLikelihood = 0
def getErrorEllipse(self, par1, par2, confLevel=0.6827):
    """
    Returns a, b, tan(2 theta) of the confLevel error ellipse in the
    par1-par2 plane with:

    a: large half axis
    b: small half axis
    tan(2 theta): tilt angle, has to be divided by the aspect ratio
                  of the actual plot before taking arctan

    Formulae taken from arXiv:0906.4123
    """
    sigma1, sigma2 = self.getSigma(par1), self.getSigma(par2)
    cov = self.getCovariance(par1, par2)

    # for this we need sigma1 > sigma2, otherwise just swap parameters
    if sigma1 > sigma2:
        a_sq = (sigma1**2 + sigma2**2)/2. + np.sqrt((sigma1**2 - sigma2**2)**2/4. + cov**2)
        b_sq = (sigma1**2 + sigma2**2)/2. - np.sqrt((sigma1**2 - sigma2**2)**2/4. + cov**2)
    else:
        a_sq = (sigma2**2 + sigma1**2)/2. - np.sqrt((sigma2**2 - sigma1**2)**2/4. + cov**2)
        b_sq = (sigma2**2 + sigma1**2)/2. + np.sqrt((sigma2**2 - sigma1**2)**2/4. + cov**2)

    # Note: this has weird dimensions (actual size of the plot)!
    tan_2_th = 2.*cov / (sigma1**2 - sigma2**2)

    # we are dealing with a 2D error ellipse here
    scaling = np.sqrt(chi2.ppf(confLevel, 2))
    return scaling*np.sqrt(a_sq), scaling*np.sqrt(b_sq), tan_2_th
def sampling_distribution():
    fig, ax = plt.subplots(1, 1)
    # display the probability density function
    df = 10
    x = np.linspace(chi2.ppf(0.01, df), chi2.ppf(0.99, df), 100)
    ax.plot(x, chi2.pdf(x, df))

    # simulate the sampling distribution
    y = []
    for i in range(1000):
        r = norm.rvs(loc=5, scale=2, size=df + 1)
        # sample variance (ddof=1) so that (n-1)*s^2/sigma^2 ~ chi2(df); sigma^2 = 4
        rchi2 = df * np.var(r, ddof=1) / 4
        y.append(rchi2)

    ax.hist(y, density=True, alpha=0.2)  # 'normed' was removed in matplotlib 3.x
    plt.savefig('sampling_distribution.png')
def stats(self, SigmaR, B_Col):
    # Chi-squared test
    print('Our Null Hypothesis states that the variance of our population = sample variance')
    # Variance of our radius from measured points
    observed = SigmaR
    # Expected variance (3mm per x,y,z observation)
    expected = .003**2 + .003**2 + .003**2
    # Calculation of degrees of freedom
    dof = B_Col - 1
    # Calculation of test statistics
    teststatx = B_Col * ((observed - expected)**2 / expected)
    teststatx1 = dof * (observed / expected)
    # User is prompted to input desired significance level
    significance = float(input('Please specify the significance level: '))
    print(teststatx)
    print(teststatx1)
    # Using the built-in scipy.stats.chi2 function instead of looking up values in a table
    mean, var, skew, kurt = chi2.stats(dof, moments='mvsk')
    Chi = chi2.ppf((1 - significance), dof)
    # If our sampled variance is greater than the population variance at the chosen significance
    # level then we reject the null hypothesis at that significance level
    if teststatx > Chi:
        print('We reject the null hypothesis at the', significance, 'significance level')
    else:
        print('We fail to reject the null hypothesis at the', significance, 'significance level')
    print(teststatx, dof)
def getMahalanobisRobust(dat, critical_alpha=0.01, good_rows=np.zeros(0)):
    '''Calculate the Mahalanobis distance from the sample vector.'''
    if good_rows.size == 0:
        good_rows = np.any(~np.isnan(dat), axis=1)

    try:
        robust_cov = MinCovDet().fit(dat[good_rows])
        mahalanobis_dist = np.sqrt(robust_cov.mahalanobis(dat))
    except ValueError:
        # This step will fail if the covariance matrix is singular. This happens if the data is
        # not a unimodal symmetric distribution, for example when there are too many small noisy
        # particles. Therefore take the safe option and return zeros for the Mahalanobis distance.
        mahalanobis_dist = np.zeros(dat.shape[0])

    # critical value of the Mahalanobis distance using the chi-square distribution
    # (note: the chi-square cutoff applies to the squared distance; comparing it against the
    # unsquared distance, as here, makes the cut more conservative)
    # https://en.wikiversity.org/wiki/Mahalanobis%27_distance
    # http://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.chi2.html
    maha_lim = chi2.ppf(1 - critical_alpha, dat.shape[1])
    outliers = mahalanobis_dist > maha_lim

    return mahalanobis_dist, outliers, maha_lim
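# A hedged usage sketch for getMahalanobisRobust as defined above (assumes numpy,
# scipy.stats.chi2 and sklearn.covariance.MinCovDet are imported at module level):
# points far from the bulk of a 2-D Gaussian cloud should be flagged as outliers
# at the 1% chi-squared cutoff.
clean = np.random.multivariate_normal([0, 0], np.eye(2), size=200)
dat = np.vstack([clean, [[8.0, 8.0], [-9.0, 7.0]]])   # two gross outliers
dist, outliers, lim = getMahalanobisRobust(dat, critical_alpha=0.01)
print(lim, int(outliers.sum()))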
def param_intervals_1d(data, likes, paramstr='Parameter value'):
    # Note that 'likes' is actually -2*lnL
    cuts = data[np.where(0.5*(likes - np.min(likes)) < 0.5*chi2.ppf(0.683, 1))]
    plus = np.max(cuts) - data[np.argmin(likes)]
    minus = data[np.argmin(likes)] - np.min(cuts)

    # Print best fit...
    print(" " + paramstr + " = (", '{:.3f}'.format(data[np.argmin(likes)]),
          " + ", '{:.3f}'.format(plus),
          " - ", '{:.3f}'.format(minus), ")")
def cutting_point(k, alpha=0.5):
    """
    calculate cutting points for k categories of Gamma(alpha, beta=alpha)
    output: an array of k-1 cutting values, excluding 0 and inf
    """
    a = np.array(range(1, k), dtype=float) / k
    res = chi2.ppf(a, 2*alpha) / (2*alpha)
    return res
def ci_cmle_a(X, v, theta_grid, alpha_level):
    cmle_a = np.empty_like(theta_grid)
    for l, theta_l in enumerate(theta_grid):
        logit_P_l = theta_l * v
        cmle_a[l] = -cond_a_nll(X, np.exp(logit_P_l))

    return invert_test(theta_grid, cmle_a - cmle_a.max(),
                       -0.5 * chi2.ppf(1 - alpha_level, 1))
def calc_ncp(alpha, beta, df):
    x = chi2.ppf(1 - alpha, df)

    def to_minimize(ncp):
        return math.fabs(beta - ncx2.cdf(x, df, ncp))

    res = minimize_scalar(to_minimize, method="golden")
    return res.x
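# A hedged usage sketch for calc_ncp as defined above (assumes math, scipy.stats
# chi2/ncx2 and scipy.optimize.minimize_scalar are imported at module level):
# find the noncentrality parameter at which the chi-squared test with df degrees
# of freedom has type-II error beta (power 1 - beta) at significance level alpha.
ncp = calc_ncp(alpha=0.05, beta=0.20, df=1)
print(ncp)   # expected to be ~7.85 for 80% power at alpha = 0.05, df = 1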
def scale_errors(cov_axes, confidence_level=0.95):
    """
    Returns major axes of error ellipse or
    hyperbola, rescaled using chi2 test statistic
    """
    dof = len(cov_axes)
    x2t = chi2.ppf(confidence_level, dof)
    return N.sqrt(x2t * cov_axes)
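# A minimal usage sketch for scale_errors as defined above (assumes "import numpy
# as N" and scipy.stats.chi2 at module level, matching the function body):
# rescale per-axis variances into the half-axis lengths of the 95% ellipse.
cov_axes = N.array([0.04, 0.01])       # hypothetical axis variances
print(scale_errors(cov_axes, 0.95))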
for rep in range(replications):
    start = time.time()
    marginals_rand = np.random.multinomial(N, true_marginals.flatten(), size=1) / N
    marginals_rand = np.reshape(marginals_rand, [2, 2])
    while not (marginals_rand[1, 1] * marginals_rand[0, 1]):
        marginals_rand = np.random.multinomial(N, true_marginals.flatten(), size=1) / N
        marginals_rand = np.reshape(marginals_rand, [2, 2])

    data, data_label, data_sensitive = upload_data(ds=ds, n_samples=N, marginals=marginals_rand)
    data_tuple = [data, data_sensitive, data_label]
    dist = calculate_distance_eqopp(data_tuple, np.array([0, 1]), 0.5)
    s_hat.append(dist)
    asympt = limiting_dist_EQOPP(data_tuple, np.array([0, 1]), 0.5)
    threshold = asympt * chi2.ppf(.9, 1)
    cnt = cnt + (dist > threshold)
    average_0 = average_0 + asympt / replications
    average_1 = average_1 + dist / replications

print(cnt)
print(average_0, average_1)

# plt.figure()
# counts__, bins__, _ = plt.hist(s_hat, density=True, bins=np.linspace(0, 3, 30), range=[0, 3],
#                                alpha=0.2, edgecolor='black', linewidth=1.3,
y_strap = Bootstrap[i]['y']
# Set up X = (1, X), add a dimension to y, and prepare theta
X_strap = np.c_[np.ones((X_strap.shape[0], 1)), X_strap]
y_strap = y_strap[:, np.newaxis]
theta_strap = np.zeros((X_strap.shape[1], 1))
resultat_general.append(algorithme(X_strap, y_strap, theta_strap))

std_theta = pd.DataFrame(resultat_general).std()
print('the variance is:', std_theta * std_theta)

#%%
# Wald test
indice_bon = []
for i in range(1, len(theta)):
    T = resultats[i] * resultats[i] / (std_theta[i] * std_theta[i])
    if T >= chi2.ppf(0.95, 1):
        indice_bon.append(X_col_names[i - 1])

#%%
X = data[indice_bon].astype(int)
y = data['y'].astype(int)

#%%
# Set up X = (1, X), add a dimension to y, and prepare theta
X = np.c_[np.ones((X.shape[0], 1)), X]
y = y[:, np.newaxis]
theta = np.zeros((X.shape[1], 1))

# Split the data into two samples
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

#%%
def ci_mean(self, sig=.05, method='gamma', epsilon=10**-8,
            gamma_low=-10**10, gamma_high=10**10):
    """
    Returns the confidence interval for the mean.

    Parameters
    ----------
    sig : float
        significance level. Default is .05

    method : str
        Root finding method. Can be 'nested-brent' or 'gamma'.
        Default is 'gamma'

        'gamma' Tries to solve for the gamma parameter in the
        Lagrange (see Owen pg 22) and then determine the weights.

        'nested brent' uses brents method to find the confidence
        intervals but must maximize the likelihood ratio on every
        iteration.

        gamma is generally much faster. If the optimization does not
        converge, try expanding the gamma_high and gamma_low
        variable.

    gamma_low : float
        Lower bound for gamma when finding lower limit.
        If function returns f(a) and f(b) must have different signs,
        consider lowering gamma_low.

    gamma_high : float
        Upper bound for gamma when finding upper limit.
        If function returns f(a) and f(b) must have different signs,
        consider raising gamma_high.

    epsilon : float
        When using 'nested-brent', amount to decrease (increase)
        from the maximum (minimum) of the data when
        starting the search. This is to protect against the
        likelihood ratio being zero at the maximum (minimum)
        value of the data. If data is very small in absolute value
        (<10 ``**`` -6) consider shrinking epsilon

        When using 'gamma', amount to decrease (increase) the
        minimum (maximum) by to start the search for gamma.
        If function returns f(a) and f(b) must have different signs,
        consider lowering epsilon.

    Returns
    -------
    Interval : tuple
        Confidence interval for the mean
    """
    endog = self.endog
    sig = 1 - sig
    if method == 'nested-brent':
        self.r0 = chi2.ppf(sig, 1)
        middle = np.mean(endog)
        epsilon_u = (max(endog) - np.mean(endog)) * epsilon
        epsilon_l = (np.mean(endog) - min(endog)) * epsilon
        ulim = optimize.brentq(self._ci_limits_mu, middle,
                               max(endog) - epsilon_u)
        llim = optimize.brentq(self._ci_limits_mu, middle,
                               min(endog) + epsilon_l)
        return llim, ulim

    if method == 'gamma':
        self.r0 = chi2.ppf(sig, 1)
        gamma_star_l = optimize.brentq(self._find_gamma, gamma_low,
                                       min(endog) - epsilon)
        gamma_star_u = optimize.brentq(self._find_gamma,
                                       max(endog) + epsilon, gamma_high)
        weights_low = ((endog - gamma_star_l) ** -1) / \
            np.sum((endog - gamma_star_l) ** -1)
        weights_high = ((endog - gamma_star_u) ** -1) / \
            np.sum((endog - gamma_star_u) ** -1)
        mu_low = np.sum(weights_low * endog)
        mu_high = np.sum(weights_high * endog)
        return mu_low, mu_high
def cluster_datasets(truncated_datasets, residues, out_dir, f, phi):
    print("\tClustering {} datasets".format(len(truncated_datasets)))

    reference_dataset = get_reference_dataset(truncated_datasets)
    alignments = get_alignments(residues)
    aligned_maps, distances = align_maps(reference_dataset, truncated_datasets, alignments, f, phi)
    print("\tRange of distances is {} {}".format(min(distances), max(distances)))

    data = np.vstack([aligned_map.flatten() for aligned_map in aligned_maps])

    models = {}
    try:
        for i in range(1, 10):
            print(i)
            model = GaussianMixture(n_components=i, covariance_type="diag", verbose=2)
            model.fit(data)
            models[i] = model
    except Exception as e:
        print(e)
        print("Model became undefined!")

    bics = {model_num: model.bic(data) for model_num, model in models.items()}
    print(bics)
    model = min(list(bics.items()), key=lambda x: x[1])
    print("Best model is {}".format(model))
    model = models[model[0]]

    classes = model.predict(data)

    # outlier_distance = sample_outlier_distance(model)

    outliers = {}
    clusters = []
    print(model.means_.shape)
    print(classes)
    for i in range(model.means_.shape[0]):
        print("\tProcessing component: {}".format(i))
        means = model.means_[i, :].flatten()
        precs = np.diag(model.precisions_[i, :].flatten())

        outlier_distance = np.sqrt(chi2.ppf(0.95, means.size))
        print("Outlier distance: {}".format(outlier_distance))

        cluster_maps = {dtag: aligned_maps[j]
                        for j, dtag in enumerate(list(residues.keys()))
                        if classes[j] == i}

        cluster_outliers = map_list(lambda x: classify_distance(x, outlier_distance, means, precs),
                                    cluster_maps.values())

        for j, dtag in enumerate(list(cluster_maps.keys())):
            if cluster_outliers[j] == 1:
                outliers[dtag] = 1
            else:
                outliers[dtag] = 0

        inliers = {dtag: aligned_maps[i]
                   for i, dtag in enumerate(list(cluster_maps.keys()))
                   if outliers[dtag] == 0}
        clusters.append(inliers)

    # outliers = []
    # for xmap in aligned_maps:
    #     distance = gaussian_distance(xmap, model)
    #     # distance = probability_distance(xmap.reshape(1,-1), model)
    #     print(distance)
    #     if distance < outlier_distance:
    #         outliers.append(1)
    #     else:
    #         outliers.append(0)

    individual_outliers = [{dtag: aligned_maps[i]}
                           for i, dtag in enumerate(list(residues.keys()))
                           if outliers[dtag] == 1]

    return clusters + individual_outliers
def _likelihood_ratio_confint(self, alpha: float) -> List[float]:
    """Compute the likelihood ratio confidence interval for the MLE of the previous run.

    Args:
        alpha: Specifies the (1 - alpha) confidence level (0 < alpha < 1).

    Returns:
        The likelihood ratio confidence interval.
    """
    # Compute the two intervals in which we look for values above
    # the likelihood ratio: the two bubbles next to the QAE estimate
    M = self._M  # pylint: disable=invalid-name
    qae = self._ret['value']

    y = int(np.round(M * np.arcsin(np.sqrt(qae)) / np.pi))
    if y == 0:
        right_of_qae = np.sin(np.pi * (y + 1) / M)**2
        bubbles = [qae, right_of_qae]
    elif y == int(M / 2):  # remember, M = 2^m is a power of 2
        left_of_qae = np.sin(np.pi * (y - 1) / M)**2
        bubbles = [left_of_qae, qae]
    else:
        left_of_qae = np.sin(np.pi * (y - 1) / M)**2
        right_of_qae = np.sin(np.pi * (y + 1) / M)**2
        bubbles = [left_of_qae, qae, right_of_qae]

    # likelihood function
    a_i = np.asarray(self._ret['values'])
    p_i = np.asarray(self._ret['probabilities'])
    m = self._m
    shots = self._ret['shots']

    def loglikelihood(a):
        return np.sum(shots * p_i * np.log(pdf_a(a_i, a, m)))

    # The threshold above which the likelihoods are in the
    # confidence interval
    loglik_mle = loglikelihood(self._ret['ml_value'])
    thres = loglik_mle - chi2.ppf(1 - alpha, df=1) / 2

    def cut(x):
        return loglikelihood(x) - thres

    # Store the boundaries of the confidence interval
    # It's valid to start off with the zero-width confidence interval, since the maximum
    # of the likelihood function is guaranteed to be over the threshold, and if alpha = 0
    # that's the valid interval
    lower = upper = self._ret['ml_value']

    # Check the two intervals/bubbles: check if they surpass the
    # threshold and if yes add the part that does to the CI
    for a, b in zip(bubbles[:-1], bubbles[1:]):
        # Compute local maximum and perform a bisect search between
        # the local maximum and the bubble boundaries
        locmax, val = bisect_max(loglikelihood, a, b, retval=True)
        if val >= thres:
            # Bisect pre-condition is that the function has different
            # signs at the boundaries of the interval we search in
            if cut(a) * cut(locmax) < 0:
                left = bisect(cut, a, locmax)
                lower = np.minimum(lower, left)
            if cut(locmax) * cut(b) < 0:
                right = bisect(cut, locmax, b)
                upper = np.maximum(upper, right)

    # Put together CI
    confint = [lower, upper]
    return [self.post_processing(bound) for bound in confint]
def perform_chi2_test(self, v_in_out=None, delta_in_out=None,
                      calculate_voltage_angles=True, chi2_prob_false=0.05):
    """
    The function perform_chi2_test performs a Chi^2 test for bad data and topology error
    detection. The function can be called with the optional input arguments v_in_out and
    delta_in_out. Then, the Chi^2 test is performed after calling the function estimate using
    them as input arguments. It can also be called without these arguments if it is called
    from the same object with which estimate had been called beforehand. Then, the Chi^2 test
    is performed for the states estimated by the function estimate and the result, the
    existence of bad data, is given back as a boolean. As an optional argument the probability
    of a false measurement can be provided additionally. For bad data detection, the function
    perform_rn_max_test is more powerful and should be the function of choice. For topology
    error detection, however, perform_chi2_test should be used.

    INPUT:
        **v_in_out** (np.array, shape=(1,), optional) - Vector with initial values for all
        voltage magnitudes in p.u. (sorted by bus index)

        **delta_in_out** (np.array, shape=(1,), optional) - Vector with initial values for all
        voltage angles in degrees (sorted by bus index)

    OPTIONAL:
        **calculate_voltage_angles** - (boolean) - Take into account absolute voltage angles
        and phase shifts in transformers, if init is 'slack'. Default is True

        **chi2_prob_false** (float) - probability of error / false alarms (standard value: 0.05)

    OUTPUT:
        **successful** (boolean) - True if bad data has been detected

    EXAMPLE:
        perform_chi2_test(np.array([1.0, 1.0, 1.0]), np.array([0.0, 0.0, 0.0]), 0.97)
    """
    # perform SE
    self.estimate(v_in_out, delta_in_out, calculate_voltage_angles)

    # Performance index J(hx)
    J = np.dot(self.solver.r.T, np.dot(self.solver.R_inv, self.solver.r))

    # Number of measurements
    m = len(self.net.measurement)

    # Number of state variables (the -1 is due to the reference bus)
    n = len(self.solver.eppci.v) + len(self.solver.eppci.delta) - 1

    # Chi^2 test threshold
    test_thresh = chi2.ppf(1 - chi2_prob_false, m - n)

    # Print results
    self.logger.debug("Result of Chi^2 test:")
    self.logger.debug("Number of measurements: %d" % m)
    self.logger.debug("Number of state variables: %d" % n)
    self.logger.debug("Performance index: %.2f" % J)
    self.logger.debug("Chi^2 test threshold: %.2f" % test_thresh)

    if J <= test_thresh:
        self.bad_data_present = False
        self.logger.debug("Chi^2 test passed. No bad data or topology error detected.")
    else:
        self.bad_data_present = True
        self.logger.debug("Chi^2 test failed. Bad data or topology error detected.")

    if self.solver.successful:
        return self.bad_data_present
def poisson_interval(n, alpha=0.05):
    a = alpha
    low, high = (chi2.ppf(a/2, 2*n)/2, chi2.ppf(1 - a/2, 2*n + 2)/2)
    if n == 0:
        low = 0.0
    return low, high