def __init__(self, data, **kwargs): r"""Constructor. This will fit both chi2 function in the different regimes. *data* - Data sample to use for fitting Keyword Argument: *chi1/2* - Keyword arguments like floc, fshape, etc. that are passed to the constructor of the corresponding chi2 scipy object. """ data = np.asarray(data) c1 = kwargs.pop("chi1", dict()) c2 = kwargs.pop("chi2", dict()) self.par1 = chi2.fit(data[data > 0.], **c1) self.par2 = chi2.fit(-data[data < 0.], **c2) self.f1 = chi2(*self.par1) self.f2 = chi2(*self.par2) self.eta = float(np.count_nonzero(data > 0.)) / len(data) self.eta_err = np.sqrt(self.eta * (1. - self.eta) / len(data)) # get fit-quality self.ks1 = kstest(data[data > 0.], "chi2", args=self.par1)[1] self.ks2 = kstest(-data[data < 0.], "chi2", args=self.par2)[1] return
def fit(self, X, y=None):
    # subset size used by the estimator: (n + p + 1) / 2
    self.h_value = int((X.shape[0] + self.p_free + 1) / 2)
    mean_value = np.array([X.mean()])
    # DataFrame.as_matrix() and np.mat(...).I are deprecated/removed;
    # invert the covariance matrix directly
    cov_value = np.linalg.inv(X.cov().to_numpy())
    self.md_dis = distance.cdist(X, mean_value, metric='mahalanobis',
                                 VI=cov_value).ravel()
    # NOTE: the fitted chi2 parameters are discarded here; only the
    # fixed-dof quantiles below are used downstream
    chi2.fit(self.md_dis, self.p_free)
    self.p_value_1 = np.sqrt(chi2.ppf(0.99999999999999994375, self.p_free))
    self.p_value_2 = np.sqrt(chi2.ppf(0.5, self.p_free))
    return self
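# Background for the quantile cutoffs above: for p-variate Gaussian data the
# squared Mahalanobis distance follows chi2 with p degrees of freedom, so
# sqrt(chi2.ppf(q, p)) is a distance threshold at quantile q. A minimal
# stand-alone check (p_free=3 is an arbitrary example value):
import numpy as np
from scipy.stats import chi2

p_free = 3
print(np.sqrt(chi2.ppf(0.5, p_free)))    # median distance for clean data
print(np.sqrt(chi2.ppf(0.975, p_free)))  # a typical outlier cutoff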
def __init__(self, data, **kwargs):
    r"""Constructor, evaluates the percentage of events equal to zero
    and fits a chi2 to the rest of the data.

    Parameters
    -----------
    data : array
        Data values to be fit
    """
    data = np.asarray(data)

    # shortcut: a length-2 input is interpreted as (eta, ndf) directly
    if len(data) == 2:
        self.eta = data[0]
        self.par = [data[1], 0., 1.]
        self.eta_err = np.nan
        self.ks = np.nan
        self.f = chi2(*self.par)
        return

    self.par = chi2.fit(data[data > 0], **kwargs)
    self.f = chi2(*self.par)
    self.eta = float(np.count_nonzero(data > 0)) / len(data)
    self.eta_err = np.sqrt(self.eta * (1. - self.eta) / len(data))
    # KS statistic (not the p-value) of the fit
    self.ks = kstest(data[data > 0], "chi2", args=self.par)[0]
    return
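# The constructor above has two entry points; a hedged sketch (the class
# name "DeltaChi2" is assumed, not given in the snippet):
# DeltaChi2(ts_values, floc=0.)     # fit eta and the chi2 shape from data
# DeltaChi2(np.array([0.5, 1.0]))   # shortcut: eta = 0.5, chi2 with 1 dof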
def art_qi2(img, airmask, min_voxels=int(1e3), max_voxels=int(3e5),
            save_plot=True):
    r"""
    Calculates :math:`\text{QI}_2`, based on the goodness-of-fit of a
    centered :math:`\chi^2` distribution onto the intensity distribution
    of non-artifactual background (within the "hat" mask):

    .. math ::

        \chi^2_n = \frac{2}{(\sigma \sqrt{2})^{2n} \, (n - 1)!}
            x^{2n - 1}\, e^{-\frac{x^2}{2 \sigma^2}}

    where :math:`n` is the number of coil elements.

    :param numpy.ndarray img: input data
    :param numpy.ndarray airmask: input air mask without artifacts
    """
    from sklearn.neighbors import KernelDensity
    from scipy.stats import chi2
    from mriqc.viz.misc import plot_qi2

    # S. Ogawa was born
    np.random.seed(1191935)

    data = img[airmask > 0]
    data = data[data > 0]

    # Write out figure of the fitting
    out_file = op.abspath('error.svg')
    with open(out_file, 'w') as ofh:
        ofh.write('<p>Background noise fitting could not be plotted.</p>')

    if len(data) < min_voxels:
        return 0.0, out_file

    modelx = data if len(data) < max_voxels else np.random.choice(
        data, size=max_voxels)

    x_grid = np.linspace(0.0, np.percentile(data, 99), 1000)

    # Estimate data pdf with KDE on a random subsample
    kde_skl = KernelDensity(bandwidth=0.05 * np.percentile(data, 98),
                            kernel='gaussian').fit(modelx[:, np.newaxis])
    kde = np.exp(kde_skl.score_samples(x_grid[:, np.newaxis]))

    # Find cutoff
    kdethi = np.argmax(kde[::-1] > kde.max() * 0.5)

    # Fit X^2
    param = chi2.fit(modelx[modelx < np.percentile(data, 95)], 32)
    chi_pdf = chi2.pdf(x_grid, *param[:-2], loc=param[-2], scale=param[-1])

    # Compute goodness-of-fit (gof)
    gof = float(np.abs(kde[-kdethi:] - chi_pdf[-kdethi:]).mean())
    if save_plot:
        out_file = plot_qi2(x_grid, kde, chi_pdf, modelx, kdethi)

    return gof, out_file
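# Note on the `chi2.fit(modelx[...], 32)` call above: in scipy, a bare
# positional value after the data is only an initial guess for the shape
# parameter (the dof), not a constraint. A minimal demonstration of the
# difference, on synthetic draws:
import numpy as np
from scipy.stats import chi2

samples = chi2.rvs(32, size=2000, random_state=42)
print(chi2.fit(samples, 32)[0])     # dof free; 32 is just the starting point
print(chi2.fit(samples, f0=32)[0])  # dof pinned at exactly 32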
def fit_chi2_maximum_likelihood(sims):
    """
    Fits a chi2 distribution using Maximum Likelihood (wraps
    ``scipy.stats.chi2.fit``).

    This has not been tested and is not recommended.

    :param sims: array of LRT test statistics (continuous part of the
        distribution)
    :return: Dictionary of distribution parameters estimated with Maximum
        Likelihood (ML)
    """
    # location fixed at zero; fit returns (dof, loc, scale)
    dof, _, scale = chi2.fit(sims, floc=0.)
    return {'scale': scale, 'dof': dof}
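# Quick self-check of the wrapper above on synthetic chi2 draws: with
# floc=0, scipy's chi2.fit returns (dof, loc, scale), and dof should land
# near the true value of 3.
import numpy as np
from scipy.stats import chi2

sims = chi2.rvs(3, size=5000, random_state=7)
dof, _, scale = chi2.fit(sims, floc=0.)
print(dof, scale)  # roughly 3 and 1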
def plot_logp(state, portion=None):
    from pylab import axes, title
    from scipy.stats import chi2, kstest
    from matplotlib.ticker import NullFormatter

    # Plot log likelihoods
    draw, logp = state.logp()
    start = int((1 - portion) * len(draw)) if portion else 0
    genid = arange(state.generation - len(draw) + start,
                   state.generation) + 1
    width, height, margin, delta = 0.7, 0.75, 0.1, 0.01
    trace = axes([margin, 0.1, width, height])
    trace.plot(genid, logp[start:], ',', markersize=1)
    trace.set_xlabel('Generation number')
    trace.set_ylabel('Log likelihood at x[k]')
    title('Log Likelihood History')

    # Plot log likelihood trend line
    from bumps.wsolve import wpolyfit
    from .formatnum import format_uncertainty
    x = np.arange(start, logp.shape[0]) + state.generation - state.Ngen + 1
    y = np.mean(logp[start:], axis=1)
    dy = np.std(logp[start:], axis=1, ddof=1)
    p = wpolyfit(x, y, dy=dy, degree=1)
    px, dpx = p.ci(x, 1.)
    trace.plot(x, px, 'k-', x, px + dpx, 'k-.', x, px - dpx, 'k-.')
    trace.text(x[0], y[0],
               "slope=" + format_uncertainty(p.coeff[0], p.std[0]),
               va='top', ha='left')

    # Plot log likelihood histogram
    data = logp[start:].flatten()
    hist = axes([margin + width + delta, 0.1,
                 1 - 2 * margin - width - delta, height])
    hist.hist(data, bins=40, orientation='horizontal', density=True)
    hist.set_ylim(trace.get_ylim())
    null_formatter = NullFormatter()
    hist.xaxis.set_major_formatter(null_formatter)
    hist.yaxis.set_major_formatter(null_formatter)

    # Plot chisq fit to log likelihood histogram
    float_df, loc, scale = chi2.fit(-data, f0=state.Nvar)
    df = int(float_df + 0.5)
    pval = kstest(-data, lambda x: chi2.cdf(x, df, loc, scale))
    #with open("/tmp/chi", "a") as fd:
    #    print("chi2 pars for llf", float_df, loc, scale, pval, file=fd)
    xmin, xmax = trace.get_ylim()
    x = np.linspace(xmin, xmax, 200)
    hist.plot(chi2.pdf(-x, df, loc, scale), x, 'r')
def _check_nllf_distribution(data, df, n_draw, trials, alpha):
    # fit the best chisq to the data given df
    float_df, loc, scale = chi2.fit(data, f0=df)
    df = int(float_df + 0.5)
    cdf = lambda x: chi2.cdf(x, df, loc, scale)

    # check the quality of the fit (i.e., does the set of nllfs look vaguely
    # like the fitted chisq distribution). Repeat the test a few times on
    # small data sets for consistency.
    p_vals = []
    for _ in range(trials):
        f_samp = choice(data, n_draw, replace=True)
        # test the resampled subset, not the full data set (the original
        # passed `data` here, leaving f_samp unused)
        p_vals.append(kstest(f_samp, cdf)[1])
    print("llf dist", p_vals, df, loc, scale)
    return alpha > np.mean(p_vals)
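# Stand-alone illustration of the fit-then-KS pattern above, with synthetic
# nllf-like values (df=5 is an arbitrary example): f0 pins the chi2 dof
# during the fit, and kstest accepts the frozen cdf as a callable.
import numpy as np
from numpy.random import choice
from scipy.stats import chi2, kstest

data = chi2.rvs(5, size=2000, random_state=3)
float_df, loc, scale = chi2.fit(data, f0=5)
f_samp = choice(data, 100, replace=True)
print(kstest(f_samp, lambda x: chi2.cdf(x, int(float_df + 0.5), loc, scale))[1])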
def simulate_tourney(year: int, tourney: list) -> list:
    rankings = pickle.load(open("./predictions/" + str(year) + "_rankings.p", "rb"))
    vec = pickle.load(open("./predictions/" + str(year) + "_vector.p", "rb"))
    df = chi2.fit(vec)[0]
    min_vec, max_vec = min(vec)[0], max(vec)[0]
    rounds = [tourney]
    while len(tourney) > 1:
        print(tourney)
        print("--------------------------------------------------")
        new_tourney = []
        for i in range(0, len(tourney), 2):
            # team name, seed in bracket, model's probability that they
            # advance to current position
            teamA, seedA, prA = tourney[i]
            rankA = rankings[teamA]
            teamB, seedB, prB = tourney[i + 1]
            rankB = rankings[teamB]
            A_beats_B = compare_teams(
                teamA, teamB, rankA, rankB, seedA, seedB,
                df, min_vec, max_vec, print_out=True,
            )
            if A_beats_B >= 0.5:
                new_tourney.append((teamA, seedA, A_beats_B * prA))
                if seedA > seedB:
                    print(f"\t{seedA} {seedB} UPSET")
            else:
                new_tourney.append((teamB, seedB, (1 - A_beats_B) * prB))
                if seedB > seedA:
                    print(f"\t{seedB} {seedA} UPSET")
        rounds.append(new_tourney)
        tourney = new_tourney
    print(tourney)
    return rounds
def plot_pairwise_jsd(img_dir, mask_dir, outfn='pairwisejsd.png', nbins=200,
                      fit_chi2=True):
    """
    create a figure of pairwise jensen-shannon divergence for all images
    in a directory

    Args:
        img_dir (str): path to directory of nifti images
        mask_dir (str): path to directory of corresponding masks
        outfn (str): output filename for the figure (None to skip saving)
        nbins (int): number of histogram bins for the JSD calculation
        fit_chi2 (bool): overlay a chi-squared fit on the histogram

    Returns:
        ax (matplotlib ax): ax the plot was created on
    """
    pairwise_jsd = quality.pairwise_jsd(img_dir, mask_dir, nbins=nbins)
    _, ax = plt.subplots(1, 1)
    ax.hist(pairwise_jsd, label='Hist.', density=True)
    if fit_chi2:
        from scipy.stats import chi2
        df, _, scale = chi2.fit(pairwise_jsd, floc=0)
        logger.info(f'df = {df:0.3e}, scale = {scale:0.3e}')
        x = np.linspace(0, np.max(pairwise_jsd), 200)
        ax.plot(x, chi2.pdf(x, df, scale=scale), lw=3, label=r'$\chi^2$ Fit')
        ax.legend()
        textstr = r'$df = $' + f'{df:0.2f}'
        props = dict(boxstyle='round', facecolor='wheat', alpha=0.5)
        ax.text(0.72, 0.80, textstr, transform=ax.transAxes,
                verticalalignment='top', bbox=props)
    ax.set_xlabel(r'Jensen-Shannon Divergence')
    ax.set_ylabel('Density')
    ax.set_title(r'Density of Pairwise JSD — $\mu$ = ' +
                 f'{np.mean(pairwise_jsd):.2e}' +
                 r' $\sigma$ = ' + f'{np.std(pairwise_jsd):.2e}', pad=20)
    ax.ticklabel_format(style='sci', axis='both', scilimits=(0, 0))
    if outfn is not None:
        plt.savefig(outfn, transparent=True, dpi=200)
    return ax
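# Stand-alone sketch of the fit-and-overlay step above, substituting
# synthetic values for the quality.pairwise_jsd output (all parameter
# values here are illustrative):
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import chi2

pairwise_jsd = chi2.rvs(4, scale=1e-3, size=500, random_state=5)
df, _, scale = chi2.fit(pairwise_jsd, floc=0)
x = np.linspace(0, np.max(pairwise_jsd), 200)
fig, ax = plt.subplots()
ax.hist(pairwise_jsd, density=True, label='Hist.')
ax.plot(x, chi2.pdf(x, df, scale=scale), lw=3, label=r'$\chi^2$ Fit')
ax.legend()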
def test_draw_samples_1d(self, plot=False):
    # Also make sure the non-mock sampler works by drawing 1D samples
    # (should collapse to chi^2)
    dtype = np.float32
    dtype_dof = np.int32

    num_samples = 20000
    dof = 3
    scale = np.array([[1]])
    rv_shape = scale.shape

    dof_mx = mx.nd.array([dof], dtype=dtype_dof)
    scale_mx = add_sample_dimension(mx.nd, mx.nd.array(scale, dtype=dtype))

    rand_gen = None
    var = Wishart.define_variable(shape=rv_shape, rand_gen=rand_gen,
                                  dtype=dtype).factor
    variables = {
        var.degrees_of_freedom.uuid: dof_mx,
        var.scale.uuid: scale_mx
    }
    rv_samples_rt = var.draw_samples(F=mx.nd, variables=variables,
                                     num_samples=num_samples)

    assert array_has_samples(mx.nd, rv_samples_rt)
    assert get_num_samples(mx.nd, rv_samples_rt) == num_samples
    assert rv_samples_rt.dtype == dtype

    if plot:
        plot_univariate(samples=rv_samples_rt, dist=chi2, df=dof)

    # Note that the chi-squared fitting doesn't do a great job, so we have
    # a slack tolerance
    dof_est, _, _ = chi2.fit(rv_samples_rt.asnumpy().ravel())
    dof_tol = 1.5
    assert np.abs(dof - dof_est) < dof_tol
def test_profile_likelihood(self, range_for_param, param, confidence=0.99,
                            fit_chi2=False):
    mle, ll_xi0 = self.mle
    profile_ll = []
    params = []
    for x in range_for_param:
        try:
            pl = mle.profile_likelihood(
                self.data, param, x,
                conditioning_method=self.conditioning_method)
            pl_value = pl.log_likelihood(
                self.data, conditioning_method=self.conditioning_method)
            if np.isfinite(pl_value):
                profile_ll.append(pl_value)
                params.append(list(pl._params))
        except Exception:
            # skip parameter values where the profile fit fails
            pass
    delta = [2 * (ll_xi0 - ll) for ll in profile_ll if np.isfinite(ll)]
    if fit_chi2:
        df, loc, scale = chi2.fit(delta)
        chi2_par = {"df": df, "loc": loc, "scale": scale}
    else:
        # Wilks' theorem: the deviance is asymptotically chi2 with 1 dof
        chi2_par = {"df": 1}
    lower_bound = ll_xi0 - chi2.ppf(confidence, **chi2_par) / 2
    filtered_params = pd.DataFrame([
        x + [ll] for x, ll in zip(params, profile_ll) if ll >= lower_bound
    ])
    cols = list(mle.params_names) + ["likelihood"]
    filtered_params = filtered_params.rename(columns=dict(zip(count(), cols)))
    return filtered_params
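# The cutoff above in isolation: by Wilks' theorem the deviance
# 2*(ll_max - ll_profile) is asymptotically chi2(1), so profile points are
# kept when their log likelihood exceeds ll_max - chi2.ppf(conf, 1)/2.
from scipy.stats import chi2

ll_xi0 = -120.0  # assumed maximum log likelihood, for illustration only
lower_bound = ll_xi0 - chi2.ppf(0.99, df=1) / 2
print(lower_bound)  # about ll_xi0 - 3.32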
import numpy as np
from RDC_IndependenceTest import *
from scipy.stats import chi2
import matplotlib.pyplot as plt

histogram = np.loadtxt("./datos/RDChistograma.txt", delimiter="\t")
histogram[0] = np.sort(histogram[0])
histogram[1] = np.sort(histogram[1])

figure, [ax1, ax2, ax3] = plt.subplots(1, 3, sharey=True)

# chi2.fit returns (df, loc, scale)
x = chi2.fit(histogram[0])
ax1.hist(histogram[0], density=True, histtype='step', label="HSIC")
ax1.plot(histogram[0], chi2.pdf(histogram[0], x[0], x[1], x[2]))
ax1.set_title("RDC statistic under H0")

x = chi2.fit(histogram[1])
ax2.hist(histogram[1], density=True, histtype='step', label="HSIC")
ax2.plot(histogram[1], chi2.pdf(histogram[1], x[0], x[1], x[2]))
ax2.set_title("RDC statistic under H0")

ax3.hist(histogram[0], density=True, histtype='step', label="HSIC")

plt.legend(loc='best')
plt.show()
#-----------------------------------------------------
# Define the average, mostly for comparison
lndelta = np.arange(-23, 10, 1e-2)
delta = np.exp(lndelta)
deltabar = np.average(widths)
stdev = np.std(widths[:] / deltabar)
print(deltabar)
print(stdev)

#-----------------------------------------------------
# Work out the maximum likelihood for the widths distribution (chi-squared
# function); also calculate the uncertainty.
P = delta * np.exp(-delta / (2.0 * deltabar)) / np.sqrt(2.0 * np.pi * delta * deltabar)
pl.plot(lndelta, P, 'g--')

optnu, loc, scaling = chi2.fit(widths, 1.0, floc=0.0)  # alternatively: fscale=deltabar
nuhess = np.zeros((2, 2))
nuhess[0, 0] = -polygamma(1, 0.5 * optnu)
nuhess[0, 1] = -0.5 / scaling
nuhess[1, 0] = nuhess[0, 1]
nuhess[1, 1] = np.sum((0.5 * optnu - widths / scaling) / scaling**2)
cov = np.linalg.inv(nuhess)
print("optimized degrees of freedom (mine): ", optnu, "+/-",
      float(np.sqrt(abs(cov[0, 0]))))
print("with scaling ", scaling, "+/-", float(np.sqrt(abs(cov[1, 1]))))

#-----------------------------------------------------
# plot everything and save the plot
P2 = chi2.pdf(delta, optnu, loc, scaling)
pl.plot(lndelta, delta * P2, 'k-')
pl.ylabel(r"$P(\mathrm{ln}|\alpha_\mathrm{bg} \Delta|)$", fontsize=8)
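# Cross-check of the hand-written Porter-Thomas curve above: P equals the
# Jacobian-weighted chi2(1) pdf, delta * chi2.pdf(delta, 1, scale=deltabar).
# deltabar=1.0 below is a placeholder for the measured mean width.
import numpy as np
from scipy.stats import chi2

delta = np.exp(np.arange(-23, 10, 1e-2))
deltabar = 1.0
P_manual = delta * np.exp(-delta / (2.0 * deltabar)) / np.sqrt(2.0 * np.pi * delta * deltabar)
assert np.allclose(P_manual, delta * chi2.pdf(delta, 1, scale=deltabar))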
def Threshold_finder(data, max_population=3, min_population_size=0.2,
                     confidence_interval=0.90, verbose=False):
    '''
    data: 1D data array with count numbers
    max_population: maximal number of populations that may exist in the
        sample dataset
    min_population_size: the smallest population must contain at least this
        fraction (default 20%) of the data
    confidence_interval: if a unimodal model is used, the confidence
        interval for the lower bound; 0.90 = 5% one-tailed test
    '''
    import warnings
    warnings.filterwarnings("ignore")

    best_population = np.inf
    best_loglike = -np.inf
    best_model = None
    model_kind = 'Gaussian'  # Gaussian is the default model type

    for n_components in [n + 1 for n in list(reversed(np.arange(max_population)))]:
        BGM = BayesianGaussianMixture(n_components=n_components, verbose=0).fit(data)
        # Proceed only if the model converged
        if BGM.converged_:
            if verbose:
                print('%s populations converged' % str(n_components))
            dict_wp = dict()  # weighted probability for each population
            for p in np.arange(n_components):
                # fit a Gaussian to population p
                para = norm.fit(mask_list(data, BGM.predict(data), p))
                dict_wp[p] = norm(para[0], para[1]).pdf(data) * BGM.weights_[p]
            # Compute the log likelihood of the prediction:
            # wp[0] = norm.pdf(data[i])*weight[0], wp[1] = ... ;
            # log(wp[0]+wp[1]+...) summed over i gives the total log likelihood
            loglike = sum([
                np.log(sum([dict_wp[p][i] for p in np.arange(n_components)]))
                for i in np.arange(len(data))
            ])[0]
            if loglike > best_loglike and min(BGM.weights_) > min_population_size:
                best_loglike = loglike
                best_population = n_components
                best_model = BGM
            if verbose:
                print('%s model with %s population has log likelihood of %s '
                      % (model_kind, n_components, loglike))
        else:
            if verbose:
                print('%s populations not converged' % str(n_components))

        if n_components == 1:
            # A Gaussian may not be the best fit for a single distribution;
            # other models should also be tested in case a better unimodal
            # fit exists
            para = rayleigh.fit(data)
            loglike = sum(np.log(rayleigh(para[0], para[1]).pdf(data)))[0]
            if loglike > best_loglike:
                best_loglike = loglike
                best_population = 1
                best_model = rayleigh(para[0], para[1])
                model_kind = 'Rayleigh'
            if verbose:
                print('%s model with %s population has log likelihood of %s '
                      % (model_kind, n_components, loglike))

            if best_model is None:
                # neither Gaussian nor Rayleigh could fit the data
                para = chi2.fit(data)
                loglike = sum(np.log(chi2(para[0], para[1], para[2]).pdf(data)))[0]
                if loglike > best_loglike:
                    best_loglike = loglike
                    best_population = 1
                    best_model = chi2(para[0], para[1], para[2])
                    model_kind = 'Chi-square'
                if verbose:
                    print('%s model with %s population has log likelihood of %s '
                          % (model_kind, n_components, loglike))

    if best_population > 1:
        # Get the population id that represents the negatives
        p = list(best_model.means_).index(min(best_model.means_))
        threshold = max(mask_list(data, best_model.predict(data), p))[0]
    else:
        if model_kind in ('Rayleigh', 'Chi-square'):
            threshold = min(1, abs(best_model.interval(confidence_interval)[0]))
        else:
            para = norm.fit(data)
            # the original passed `data` as an extra positional argument to
            # norm(), which would raise; the frozen normal needs only loc, scale
            threshold = min(1, abs(norm(para[0], para[1]).interval(confidence_interval)[0]))

    print('Best model with %s distribution has %s populations with threshold at %s'
          % (model_kind, best_population, threshold))
    return threshold, model_kind, best_model, best_population
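# Hypothetical invocation of Threshold_finder (the [0] indexing above implies
# data arrives as a 2-D column of counts; `raw_counts` is a placeholder name):
# counts = np.asarray(raw_counts).reshape(-1, 1)
# threshold, kind, model, k = Threshold_finder(counts, max_population=3,
#                                              confidence_interval=0.90)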
bckg_single = []
for file in files:
    bckg_single.append(list(file['TS']))

## Now we make hists of the test statistics ##
bins = 80
ts_range = (0.0, 20.0)  # renamed so the built-in `range` is not shadowed

single_hist = histlite.Hist.normalize(
    histlite.hist(bckg_single[0], bins=bins, range=ts_range))

## Now to plot. ##
fig_bckg = plt.figure(figsize=(w, .75 * w))
ax = plt.gca()

## I'll include a chi-squared distribution w/ DOF=1 (and 2, just because).
## I'll also show the best-fitting chi2 dist for this sample. ##
chifit_single = chi2.fit(bckg_single[0])[0]
chi_degs = [1, 2, chifit_single]
colors = ['black', 'gray', 'blue']
for df, color in zip(chi_degs, colors):
    x = np.linspace(chi2.ppf(0.01, df), chi2.ppf(0.99999, df), 100)
    rv = chi2(df)
    chi_dist = rv.pdf(x)
    ax.plot(x, chi_dist / sum(chi_dist), linestyle=':', color=color,
            label=r'$\tilde{\chi}^2$: df=' + str(round(df, 2)))
histlite.plot1d(ax, single_hist,
## Now we make hists of the test statistics ##
bins = 80
ts_range = (0.0, 20.0)  # renamed so the built-in `range` is not shadowed

uniform_hist = histlite.Hist.normalize(
    histlite.hist(bckg_uniform['TS'], bins=bins, range=ts_range))
redshift_hist = histlite.Hist.normalize(
    histlite.hist(bckg_redshift['TS'], bins=bins, range=ts_range))
flux_hist = histlite.Hist.normalize(
    histlite.hist(bckg_flux['TS'], bins=bins, range=ts_range))

## Now to plot. ##
fig_bckg = plt.figure(figsize=(w, .75 * w))
ax = plt.gca()

## I'll include a chi-squared distribution w/ DOF=1 (and 2, just because).
## I'll also show the best-fitting chi2 dist for each weighting scheme. ##
chifit_uniform = chi2.fit(bckg_uniform['TS'])[0]
chifit_redshift = chi2.fit(bckg_redshift['TS'])[0]
chifit_flux = chi2.fit(bckg_flux['TS'])[0]

chi_degs = [1, 2, chifit_uniform, chifit_redshift, chifit_flux]
colors = ['black', 'gray', 'blue', 'red', 'green']
for df, color in zip(chi_degs, colors):
    x = np.linspace(chi2.ppf(0.01, df), chi2.ppf(0.99999, df), 100)
    rv = chi2(df)
    chi_dist = rv.pdf(x)
    ax.plot(x, chi_dist / sum(chi_dist), linestyle=':', color=color,
            label=r'$\tilde{\chi}^2$: df=' + str(round(df, 2)))