import math
import numpy as np
from scipy.stats import rankdata, f

def IPN_kendallW(X):
    """Kendall's W (coefficient of concordance).

    X is a 2D numpy array (n x k ratings matrix), where n is the number
    of objects and k is the number of judges.
    """
    n, k = np.shape(X)
    # Rank each judge's column (ties get average ranks, as in MATLAB's tiedrank)
    R = np.zeros_like(X, dtype=float)
    for i in range(np.shape(X)[1]):
        R[:, i] = rankdata(X[:, i])
    R_new = np.sort(np.round(R), axis=0)
    A = np.tile(np.arange(1, n + 1), (k, 1)).T  # np.matlib.repmat is deprecated
    T = np.sum(np.array(A - R_new, dtype=bool), axis=0) + 1  # tie-correction term
    RS = np.sum(R, axis=1)
    S = np.sum(np.square(RS)) - n * math.pow(np.mean(RS), 2)
    F = k * k * (n ** 3 - n) - k * np.sum(np.power(T, 3) - T)
    W = 12.0 * S / F
    Fdist = W * (k - 1) / (1 - W)
    nu1 = n - 1 - (2.0 / k)
    nu2 = nu1 * (k - 1)
    p = f.sf(Fdist, nu1, nu2)  # upper-tail probability; f.pdf gives a density, not a p-value
    return W, p, Fdist
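A quick usage sketch; the ratings matrix below is random made-up data, just to exercise the function:

import numpy as np

ratings = np.random.rand(12, 4)   # 12 objects rated by 4 judges (made-up data)
W, p, Fdist = IPN_kendallW(ratings)
print(W, p, Fdist)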
def f_contrast(self):
    '''
    Returns the p-value for the F contrast
    H0: b0 = b1 = b2 = ... = bn = 0
    '''
    # Standard F statistic comparing the restricted (H0) and unrestricted fits;
    # the original divided ssr/n by ssrh0/n, which reduces to ssr/ssrh0 and is
    # not F-distributed.
    fvalue = ((self.ssrh0 - self.ssr) / (self.k - 1)) / (self.ssr / (self.n - self.k))
    # Upper-tail probability; f.pdf gives a density, not a p-value
    pvalue = f.sf(fvalue, self.k - 1, self.n - self.k)
    return pvalue
def plot_f_distribution(fvalue, dfn, dfd):
    # Set figure
    plt.figure(figsize=(8, 6))
    # Critical value at the 95th percentile
    rejection_reg = f.ppf(q=.95, dfn=dfn, dfd=dfd)
    mean, var, skew, kurt = f.stats(dfn, dfd, moments='mvsk')
    x = np.linspace(f.ppf(0.01, dfn, dfd), f.ppf(0.99, dfn, dfd), 100)
    # Plot values
    plt.plot(x, f.pdf(x, dfn, dfd), alpha=0.6, label='X ~ F({}, {})'.format(dfn, dfd))
    plt.axvline(x=fvalue)
    plt.vlines(rejection_reg, 0.0, 1.0, linestyles="dashdot",
               label="Crit. Value: {:.2f}".format(rejection_reg))
    plt.legend()
    plt.ylim(0.0, 1.0)
    plt.xlim(0.0, 20.0)  # xlim takes only (left, right)
    plt.title('F-Distribution dfn:{}, dfd:{}'.format(dfn, dfd))
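A minimal usage sketch; the observed F value and degrees of freedom below are made-up numbers:

import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import f

plot_f_distribution(fvalue=4.2, dfn=3, dfd=40)  # hypothetical observed F and dfs
plt.show()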
def app_time(x, dfn, dfd, a, b):
    mean = 0.0
    # F distribution for duration, truncated to [a, b]
    dist = np.divide(f.pdf(x, dfn, dfd),
                     (f.cdf(b, dfn, dfd) - f.cdf(a, dfn, dfd)))
    dist = np.divide(dist, np.sum(dist))  # normalization
    for item in zip(x, dist):
        mean = mean + (item[0] * item[1])  # expectation of duration
    return dist, mean
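A minimal usage sketch; the duration grid and degrees of freedom below are assumptions chosen for illustration:

import numpy as np
from scipy.stats import f

x = np.linspace(0.5, 4.0, 200)                 # hypothetical duration grid on [a, b]
dist, mean = app_time(x, dfn=5, dfd=10, a=0.5, b=4.0)
print(mean)                                    # expected duration under the truncated F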
def main():
    # Input parameters
    Nminusk = 10000
    kminus1 = 2
    step = 0.001

    # Integrate the F-distribution to get the critical value
    # (f.ppf(0.95, kminus1, Nminusk) would give this directly)
    F = 0.
    integrate = 0
    while integrate < 0.95:
        F += step
        integrate += f.pdf(F, kminus1, Nminusk) * step
        if integrate > 0.95:
            print("F value at 95%% confidence level is %0.1f" % F)
            break

    # Plot the F-distribution
    x = np.linspace(0, 100, 1000)
    plt.plot(x, f.pdf(x, kminus1, Nminusk), color="blue", linewidth=3,
             label=r'$F(x, %d, %d)$' % (kminus1, Nminusk))
    plt.axvline(F, color="black", linestyle="--", linewidth=2)
    plt.xlim(0, 5)
    plt.xlabel('$x$')
    plt.ylabel(r'$F(x, %d, %d)$' % (kminus1, Nminusk))
    plt.title("$F(x, %d, %d)$ Distribution" % (kminus1, Nminusk))
    plt.legend()
    plt.show()

    # Calculate the required number of users
    download_rate_estimate = 0.02
    sigma2_s = download_rate_estimate * (1. - download_rate_estimate)
    N = 5.3792 * sigma2_s / (0.1 * download_rate_estimate) ** 2
    print("estimate of N = %d" % round(N))

    # Run the obtained results through the F test
    # (Ftest and ttest are defined elsewhere in the original script)
    input_downloads = [500, 620, 490]
    download_fractions = [entry / N for entry in input_downloads]
    print("F test result = %0.4f" % Ftest(download_fractions, sigma2_s, N))

    # Perform individual t-tests
    print("The 96.6%% confidence interval is = (%0.2f %0.2f)"
          % (norm.interval(0.966, loc=0, scale=1)))
    for fraction in download_fractions[1:]:
        print("t value = %0.2f (for a measured download rate of %0.4f)"
              % (ttest(N, fraction, N, download_fractions[0]), fraction))
    return
def df(x, df1, df2, ncp=0):
    """Calculates the density/point estimate of the F-distribution."""
    from scipy.stats import f, ncf
    if ncp == 0:
        result = f.pdf(x=x, dfn=df1, dfd=df2, loc=0, scale=1)
    else:
        result = ncf.pdf(x=x, dfn=df1, dfd=df2, nc=ncp, loc=0, scale=1)
    return result
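A quick usage check with arbitrarily chosen values: with ncp=0 this evaluates the central F density, otherwise the noncentral one.

print(df(1.5, df1=3, df2=10))         # central F density at x = 1.5
print(df(1.5, df1=3, df2=10, ncp=2))  # noncentral F density with nc = 2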
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import norm, chi2, t, f

def chi2_distribution():
    fig, ax = plt.subplots(1, 1)
    # display the probability density function
    df = 10
    x = np.linspace(chi2.ppf(0.01, df), chi2.ppf(0.99, df), 100)
    ax.plot(x, chi2.pdf(x, df))

    # simulate the chi2 distribution as a sum of squared standard normals
    y = []
    n = 10
    for i in range(1000):
        chi2r = 0.0
        r = norm.rvs(size=n)
        for j in range(n):
            chi2r = chi2r + r[j] ** 2
        y.append(chi2r)

    ax.hist(y, density=True, alpha=0.2)  # `normed` was removed from matplotlib
    plt.show()

    fig, ax = plt.subplots(1, 1)
    # display the probability density function
    df = 10
    x = np.linspace(-4, 4, 100)
    ax.plot(x, t.pdf(x, df))

    # simulate the t-distribution
    y = []
    for i in range(1000):
        rx = norm.rvs()
        ry = chi2.rvs(df)
        rt = rx / np.sqrt(ry / df)
        y.append(rt)

    ax.hist(y, density=True, alpha=0.2)
    plt.show()

    fig, ax = plt.subplots(1, 1)
    # display the probability density function
    dfn, dfm = 10, 5
    x = np.linspace(f.ppf(0.01, dfn, dfm), f.ppf(0.99, dfn, dfm), 100)
    ax.plot(x, f.pdf(x, dfn, dfm))

    # simulate the F-distribution
    y = []
    for i in range(1000):
        rx = chi2.rvs(dfn)
        ry = chi2.rvs(dfm)
        rf = (rx / dfn) / (ry / dfm)  # F is a ratio of scaled chi2's; no square root
        y.append(rf)

    ax.hist(y, density=True, alpha=0.2)
    plt.show()
def plot_f_distrib_for_many_coefficients(self, features):
    from scipy.stats import f

    # Remove a particular subset of features
    X = np.delete(self.X, [self.features.index(_) for _ in features], 1)

    # Prediction from reduced model
    XT = X.T
    std_error_matrix = inv(XT.dot(X))
    beta = std_error_matrix.dot(XT).dot(self.y)
    y_hat = X.dot(beta)
    rss_reduced_model = np.sum((self.y - y_hat) ** 2)

    dfn = len(features)
    dfd = self.df

    # This should be distributed as chi squared with degrees of freedom
    # equal to the number of dropped features
    rss_diff = (rss_reduced_model - self.rss)
    chi_1 = rss_diff / dfn
    chi_2 = self.pop_var
    f_score = chi_1 / chi_2

    # 5% and 95% percentiles
    f_05, f_95 = f.ppf([0.05, 0.95], dfn, dfd)
    x = np.linspace(0.001, 5.0)
    plt.axvline(x=f_05)
    plt.axvline(x=f_95)
    plt.scatter(f_score, f.pdf(f_score, dfn, dfd), marker='o', color='red')
    plt.plot(x, f.pdf(x, dfn, dfd), color='gray', lw=5, alpha=0.6)
    plt.title('f-distribution for dropping features: {0}'.format(features))
    plt.show()
def sampling_distribution():
    fig, ax = plt.subplots(1, 1)
    # display the probability density function
    dfn, dfm = 10, 5
    x = np.linspace(f.ppf(0.01, dfn, dfm), f.ppf(0.99, dfn, dfm), 100)
    ax.plot(x, f.pdf(x, dfn, dfm))

    # simulate the sampling distribution
    y = []
    for i in range(1000):
        r1 = norm.rvs(loc=5, scale=2, size=dfn + 1)
        r2 = norm.rvs(loc=3, scale=2, size=dfm + 1)
        # ddof=1 gives the unbiased sample variances, whose ratio is F(dfn, dfm)
        rf = np.var(r1, ddof=1) / np.var(r2, ddof=1)
        y.append(rf)

    ax.hist(y, density=True, alpha=0.2)  # `normed` was removed from matplotlib
    plt.savefig('sampling_distribution.png')
def F_distribution():
    fig, ax = plt.subplots(1, 1)
    # display the probability density function
    dfn, dfm = 10, 5
    x = np.linspace(f.ppf(0.01, dfn, dfm), f.ppf(0.99, dfn, dfm), 100)
    ax.plot(x, f.pdf(x, dfn, dfm))

    # simulate the F-distribution
    y = []
    for i in range(1000):
        rx = chi2.rvs(dfn)
        ry = chi2.rvs(dfm)
        rf = (rx / dfn) / (ry / dfm)  # ratio of scaled chi2's; no square root
        y.append(rf)

    ax.hist(y, density=True, alpha=0.2)  # `normed` was removed from matplotlib
    plt.savefig('F_distribution.png')
def getDataSizeDistribution(self):
    '''
    TODO: Read the data size distribution from the input file.
    It seems that we do not have this column in the input file,
    so we just use a synthetic distribution, similar to the function above.
    :return: an ndarray of floats with shape (self.deviceNum,)
    '''
    # Alternatives considered:
    # return np.random.rand(self.deviceNum)
    # return f.pdf(np.random.uniform(0, 4, self.deviceNum), 1, 1)
    # return np.random.zipf(1.5, self.deviceNum)
    if self.distribution == 'normal':
        # note: despite the 'normal' label, this draws from a scaled uniform
        return (np.random.rand(self.deviceNum)
                * (self.parameter[1] - self.parameter[0]) + self.parameter[0])
    elif self.distribution == 'f':
        return f.pdf(np.random.uniform(0, 4, self.deviceNum),
                     self.parameter[0], self.parameter[1])
    elif self.distribution == 'zipf':
        return np.random.zipf(self.parameter, self.deviceNum)
def f_distribution_critical_value(f_value, df_numerator, df_denominator, loc=0, scale=1):
    """
    Evaluates the F-distribution density at the observed f-value.
    (Note: despite the name, this returns f.pdf, not a critical value;
    a critical value would come from f.ppf.)
    :param f_value: observed f-value
    :param df_numerator: degrees of freedom of numerator
    :param df_denominator: degrees of freedom of denominator
    :param loc: location parameter of the distribution
    :param scale: scale parameter of the distribution
    :return: density of the F-distribution at f_value
    """
    return float(
        FDistribution.pdf(f_value, df_numerator, df_denominator, loc, scale))
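For an actual critical value, the percent-point function is the right tool. A minimal sketch, assuming `FDistribution` is `scipy.stats.f` imported under that name (the helper below is hypothetical, not from the original):

from scipy.stats import f as FDistribution

def f_critical_value(alpha, df_numerator, df_denominator):
    # Upper-tail critical value: P(F > value) = alpha
    return float(FDistribution.ppf(1 - alpha, df_numerator, df_denominator))

print(f_critical_value(0.05, 2, 43))  # e.g. the 5% critical value for F(2, 43)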
def getErrorRateDistribution(self):
    if self.distributionError == 'normal':
        res = np.random.normal(self.parameterError[0], self.parameterError[1],
                               self.deviceNum)
        # clamp rates into [0, 1)
        for i in range(res.shape[0]):
            if res[i] < 0:
                res[i] = 0
            if res[i] > 1:
                res[i] = 0.9
        return res
    elif self.distributionError == 'f':
        return f.pdf(np.random.uniform(0, 4, self.deviceNum),
                     self.parameterError[0], self.parameterError[1]) / 5
    elif self.distributionError == 'zipf':
        res = np.random.zipf(self.parameterError, self.deviceNum) / 100
        for i in range(res.shape[0]):
            if res[i] > 1:
                res[i] = 0.9
        return res
def MalinowskyParameters(data, l):
    if np.shape(data)[0] < np.shape(data)[1]:
        data = np.transpose(data)
    n_row = np.shape(data)[0]  # number of rows
    n_col = np.shape(data)[1]  # number of columns
    ind = np.zeros(n_col - 1)
    ie = np.zeros(n_col - 1)
    index = range(1, n_col)
    for i in range(0, n_col - 1):
        ind[i] = (np.sqrt(
            (np.sum(l[i + 1:n_col])) /
            (n_row * (n_col - index[i])))) / (n_col - index[i]) ** 2
        ie[i] = np.sqrt(index[i] * (np.sum(l[i + 1:n_col])) /
                        (n_row * n_col * (n_col - index[i])))
    pc = np.arange(1., n_col + 1, 1)  # maximum number of components (i.e. number of spectra)
    p = np.zeros(np.size(pc))
    for i in range(0, np.size(pc)):
        p[i] = (n_row - pc[i] + 1) * (n_col - pc[i] + 1)
    s1 = np.zeros(np.size(pc))
    s2 = np.zeros(np.size(pc))
    fi = np.zeros(np.size(l) - 1)
    result = np.zeros(np.size(l) - 1)
    a = pc + 1
    for i in range(0, n_col - 1):
        s1[i] = np.sum(
            (n_row - a[i:np.size(pc)] + 1) * (n_col - a[i:np.size(pc)] + 1))
    for j in range(0, n_col - 1):
        s2[j] = np.sum(l[j + 1:np.size(pc) + 1])
    for i in range(0, n_col - 1):
        fi[i] = (s1[i] / p[i]) * (l[i] / s2[i])
    for i in range(0, n_col - 1):
        # upper-tail F probability, expressed as a percentage
        result[i] = (integrate.quad(lambda x: f.pdf(x, 1, (n_col - 1) - i),
                                    fi[i], np.inf))[0] * 100
    statistic = pd.DataFrame({'IND': ind, 'IE': ie, 'F': result})
    statistic.index = statistic.index + 1
    return statistic, pc
def getDataSizeDistribution(self):
    res = None
    if self.distributionData == 'normal':
        res = np.random.normal(self.parameterError[0], self.parameterError[1],
                               self.deviceNum)
    elif self.distributionData == 'f':
        res = f.pdf(np.random.uniform(0, 4, self.deviceNum),
                    self.parameterData[0], self.parameterData[1])
    elif self.distributionData == 'zipf':
        res = np.random.zipf(self.parameterData, self.deviceNum)

    if not self.isRelated:
        return res

    # Reorder the sorted data sizes so they follow the ordering of the error rates
    index = self.allErrorRate.argsort()
    res.sort()
    finalRes = copy.deepcopy(res)
    j = 0
    for i in index:
        finalRes[i] = res[j]
        j += 1
    return finalRes
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import chi2

cvalues = np.linspace(0.1, 50, 100)  # assumed grid; defined earlier in the original source
plt.plot(cvalues, chi2.pdf(cvalues, 1), 'b-', label="Chi2(1)")
plt.plot(cvalues, chi2.pdf(cvalues, 5), 'r-', label="Chi2(5)")
plt.plot(cvalues, chi2.pdf(cvalues, 30), 'g-', label="Chi2(30)")
plt.legend()
#sample_chi2 = np.random.chisquare(1, 10)
#sns.distplot(sample_chi2)
plt.show()

# F-distribution
from scipy.stats import f

fvalues = np.linspace(.1, 5, 100)

# pdf(x, df1, df2): Probability density function at x of F.
plt.plot(fvalues, f.pdf(fvalues, 1, 30), 'b-', label="F(1, 30)")
plt.plot(fvalues, f.pdf(fvalues, 5, 30), 'r-', label="F(5, 30)")
plt.legend()

# cdf(x, df1, df2): Cumulative distribution function of F.
proba_at_f_inf_3 = f.cdf(3, 1, 30)  # P(F(1,30) < 3)

# ppf(q, df1, df2): Percent point function (inverse of cdf) at q of F.
f_at_proba_inf_95 = f.ppf(.95, 1, 30)  # q such that P(F(1,30) < q) = .95
assert np.isclose(f.cdf(f_at_proba_inf_95, 1, 30), .95)  # avoid exact float comparison

# sf(x, df1, df2): Survival function (1 - cdf) at x of F.
proba_at_f_sup_3 = f.sf(3, 1, 30)  # P(F(1,30) > 3)
assert np.isclose(proba_at_f_inf_3 + proba_at_f_sup_3, 1)

# p-value: the region where P(F(1, 30) > x) < 0.05
low_proba_fvalues = fvalues[fvalues > f_at_proba_inf_95]
plt.fill_between(low_proba_fvalues, 0, f.pdf(low_proba_fvalues, 1, 30),
                 alpha=.8, label="P < 0.05")
# perform detection test
T_balanced[indice] = compute_GLRT_statistic(Y_balanced, w)
T_unbalanced[indice] = compute_GLRT_statistic(Y_unbalanced, w)

threshold = threshold_from_pfa(pfa, N)
print("threshold: %f" % threshold)

# MARK: Display signal
xbins = np.linspace(0.001, 25, 100)
xbins_middle = (xbins[1:] + xbins[:-1]) / 2
n_balanced, bins1, p1 = plt.hist(T_balanced, bins=xbins, density=True,
                                 histtype='stepfilled')  # `normed` was removed from matplotlib
n_unbalanced, bins2, p2 = plt.hist(T_unbalanced, bins=xbins, density=True,
                                   histtype='stepfilled')

# theoretical pdf
plt.plot(xbins_middle, f.pdf(xbins_middle, 4, (3 * N - 6)), 'k--', linewidth=1.5)
ncf_lambda = compute_lambda(w, sym_unbalanced, N, sigmaB_unbalanced)
plt.plot(xbins_middle, ncf.pdf(xbins_middle, 4, (3 * N - 6), ncf_lambda),
         'k--', linewidth=1.5)
plt.axvline(threshold, color='r')
plt.show()

# MARK: export csv file
output = np.zeros((5, np.size(xbins_middle)))
if csv_on == 1:
    output[0, :] = xbins_middle
    output[1, :] = n_balanced
    output[2, :] = f.pdf(xbins_middle, 4, (3 * N - 6))
    output[3, :] = n_unbalanced
    output[4, :] = ncf.pdf(xbins_middle, 4, (3 * N - 6), ncf_lambda)
    np.savetxt(filename, output.T,
               header="bins,hist_balanced,pdf_balanced,hist_unbalanced,pdf_unbalanced",
               delimiter=",")
def pdf(self, F):
    return f.pdf(F, self.dfn, self.dfd, loc=self.loc, scale=self.scale)
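For context, a minimal sketch of the kind of F-distribution wrapper this method could belong to; the class name and constructor are assumptions, not the original source:

from scipy.stats import f

class FDist:
    def __init__(self, dfn, dfd, loc=0, scale=1):
        # hypothetical container for the F-distribution parameters
        self.dfn, self.dfd, self.loc, self.scale = dfn, dfd, loc, scale

    def pdf(self, F):
        return f.pdf(F, self.dfn, self.dfd, loc=self.loc, scale=self.scale)

print(FDist(5, 2).pdf(3.0))  # density of F(5, 2) at 3.0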
import numpy as np
from scipy.stats import f
import matplotlib.pyplot as plt

fig, ax = plt.subplots(1, 1)

# Calculate a few first moments:
dfn, dfd = 29, 18
mean, var, skew, kurt = f.stats(dfn, dfd, moments='mvsk')

# Display the probability density function (``pdf``):
x = np.linspace(f.ppf(0.01, dfn, dfd), f.ppf(0.99, dfn, dfd), 100)
ax.plot(x, f.pdf(x, dfn, dfd), 'r-', lw=5, alpha=0.6, label='f pdf')

# Alternatively, the distribution object can be called (as a function)
# to fix the shape, location and scale parameters. This returns a "frozen"
# RV object holding the given parameters fixed.
# Freeze the distribution and display the frozen ``pdf``:
rv = f(dfn, dfd)
ax.plot(x, rv.pdf(x), 'k-', lw=2, label='frozen pdf')

# Check accuracy of ``cdf`` and ``ppf``:
vals = f.ppf([0.001, 0.5, 0.999], dfn, dfd)
np.allclose([0.001, 0.5, 0.999], f.cdf(vals, dfn, dfd))
# True
'''
@Author: Runsen
@WeChat Official Account: 润森笔记
@Blog: https://blog.csdn.net/weixin_44510615
@Date: 2020/7/5
'''
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import f

x = np.linspace(0, 3, 100)
plt.plot(x, f.pdf(x, 20, 20), 'k-', label='y=f(x,20,20)')
plt.plot(x, f.pdf(x, 10, 10), 'r-', label='y=f(x,10,10)')
plt.plot(x, f.pdf(x, 10, 5), 'g-', label='y=f(x,10,5)')
plt.plot(x, f.pdf(x, 10, 20), 'b-', label='y=f(x,10,20)')
plt.plot(x, f.pdf(x, 5, 5), 'r--', label='y=f(x,5,5)')
plt.plot(x, f.pdf(x, 5, 10), 'g--', label='y=f(x,5,10)')
plt.plot(x, f.pdf(x, 5, 1), 'y-', label='y=f(x,5,1)')
plt.legend()
plt.show()
from scipy.stats import f

print(f.pdf(3, 5, 2))
def fplot(dname, ref, alt, year=2017, savename='fplotX', nbins=130):
    ref_pt, ref_rho = ref
    alt_pt, alt_rho = alt
    p1 = (ref_pt + 1) * (ref_rho + 1)
    p2 = (alt_pt + 1) * (alt_rho + 1)

    path = '{dname}/bkgtest_{ref_pt}-{ref_rho}_{alt_pt}-{alt_rho}'.format(
        dname=dname, ref_pt=ref_pt, ref_rho=ref_rho, alt_pt=alt_pt,
        alt_rho=alt_rho)
    print("X", path)
    base_dict = skim_gofs(get_names(path))
    alt_dict = skim_gofs(get_names(path, alt=True))
    base, alt = [], []
    for i in range(len(base_dict)):
        try:
            ia = alt_dict[i]
            ib = base_dict[i]
            alt.append(ia)
            base.append(ib)
        except (KeyError, IndexError):  # skip toys missing from either dict
            pass
    if len(alt) != len(base):
        raise ValueError("Number of toys for base and ref does not match.")

    fvals = fval(base, alt, p1, p2, nbins)
    f_data = fval(
        get_vals(
            '{dname}/bkgtest_{ref_pt}-{ref_rho}_{alt_pt}-{alt_rho}/refbase.root'
            .format(dname=dname, ref_pt=ref_pt, ref_rho=ref_rho,
                    alt_pt=alt_pt, alt_rho=alt_rho)),
        get_vals(
            '{dname}/bkgtest_{ref_pt}-{ref_rho}_{alt_pt}-{alt_rho}/refalt.root'
            .format(dname=dname, ref_pt=ref_pt, ref_rho=ref_rho,
                    alt_pt=alt_pt, alt_rho=alt_rho)),
        p1, p2, nbins)[0]
    print(f_data)

    from scipy.stats import f
    x_lim = max(np.percentile(fvals, 90), f_data * 1.2)
    x = np.linspace(0, x_lim, 200)
    bins = np.linspace(0, x_lim, 30)
    width = bins[1] - bins[0]

    fig, ax = plt.subplots()
    trans = transforms.blended_transform_factory(ax.transData, ax.transAxes)
    ax.plot(x, len(base) * width * f.pdf(x, p2 - p1, nbins - p2),
            color='red',
            label='F-dist, ndf({},{})'.format(p2 - p1, nbins - p2))
    ax.hist(fvals, bins, facecolor='none', edgecolor='black',
            histtype='stepfilled', lw=2,
            label="Toys, N = {}".format(len(fvals)))
    ax.hist(fvals[fvals > f_data], bins, facecolor='steelblue',
            edgecolor='gray', histtype='stepfilled', alpha=0.3,
            label='p-value = {}'.format(
                round(float(len(fvals[fvals > f_data])) / len(fvals), 3)))
    ax.annotate("", xy=(f_data, 0), xycoords=trans,
                xytext=(f_data, 0.25), textcoords=trans,
                arrowprops=dict(lw='4', color='b',
                                arrowstyle="->,head_length=1.5,head_width=0.5"))
    ax.plot([], [], color='blue', lw=2,
            label="Observed = {:.3f}".format(f_data))

    title = "TF({},{}) x TF({},{})".format(ref_pt, ref_rho, alt_pt, alt_rho)
    ax.legend(title=title)
    hep.cms.label(data=True, year=year, ax=ax)
    ax.set_xlim(0, x_lim)
    xlab = r"$\frac{-2log(\lambda_1/\lambda_2)/(p_2-p_1)}{-2log\lambda_2/(n-p_2)}$"
    ax.set_xlabel(xlab, x=1, ha='right')
    ax.set_ylabel("Pseudoexperiments", y=1, ha='right')
    fig.savefig('{}.pdf'.format(savename), dpi=300, transparent=True,
                bbox_inches='tight')
    fig.savefig('{}.png'.format(savename), dpi=300, transparent=True,
                bbox_inches='tight')
def f23test(data, B, fix):
    F = np.zeros(2, dtype=float)  # pdf
    C = np.zeros(2, dtype=float)  # p-values
    B2 = np.zeros(2, dtype=float)

    # for the 2nd order EOS, assign K0 and V0 from user guesses
    B2[0] = B[0]
    B2[1] = B[-1]

    # Figure out which EOS's to use
    if data.EOS_type == GEOST_thermo.types().names[0]:
        # Birch-Murnaghan
        odr_model = odrpack.Model(fcn=GEOST_thermo.BM2_V,
                                  fjacb=GEOST_thermo.BM2_V_JACB,
                                  fjacd=GEOST_thermo.BM2_V_JACD)
        odr_data = odrpack.RealData(x=data.V, y=data.P, sx=data.Verr, sy=data.Perr)
        odr = odrpack.ODR(odr_data, odr_model, beta0=B2, ifixb=[fix[0], fix[-1]])
        odr.set_job(deriv=3)          # Use user-supplied derivatives
        output = odr.run()            # Output of ODR run
        ref_B2 = output.beta          # LSQ best-fit parameters
        err_B2 = output.sd_beta       # Parameter errors (1-sigma)
        f2 = GEOST_thermo.BM2_V(ref_B2, data.V)
        df2 = data.V.shape[0] - 2

        odr_model = odrpack.Model(fcn=GEOST_thermo.BM3_V,
                                  fjacb=GEOST_thermo.BM3_V_JACB,
                                  fjacd=GEOST_thermo.BM3_V_JACD)
        odr_data = odrpack.RealData(x=data.V, y=data.P, sx=data.Verr, sy=data.Perr)
        odr = odrpack.ODR(odr_data, odr_model, beta0=B, ifixb=fix)
        odr.set_job(deriv=3)          # Use user-supplied derivatives, but CHECK THEM!!!
        output = odr.run()
        ref_B3 = output.beta
        err_B3 = output.sd_beta
        f3 = GEOST_thermo.BM3_V(ref_B3, data.V)
        df3 = data.V.shape[0] - 3

    elif data.EOS_type == GEOST_thermo.types().names[1]:
        # Natural strain
        odr_model = odrpack.Model(fcn=GEOST_thermo.NS2_V,
                                  fjacb=GEOST_thermo.NS2_V_JACB,
                                  fjacd=GEOST_thermo.NS2_V_JACD)
        odr_data = odrpack.RealData(x=data.V, y=data.P, sx=data.Verr, sy=data.Perr)
        odr = odrpack.ODR(odr_data, odr_model, beta0=B2, ifixb=[fix[0], fix[-1]])
        odr.set_job(deriv=3)
        output = odr.run()
        ref_B2 = output.beta
        err_B2 = output.sd_beta
        f2 = GEOST_thermo.NS2_V(ref_B2, data.V)
        df2 = data.V.shape[0] - 2

        odr_model = odrpack.Model(fcn=GEOST_thermo.NS3_V,
                                  fjacb=GEOST_thermo.NS3_V_JACB,
                                  fjacd=GEOST_thermo.NS3_V_JACD)
        odr_data = odrpack.RealData(x=data.V, y=data.P, sx=data.Verr, sy=data.Perr)
        odr = odrpack.ODR(odr_data, odr_model, beta0=B, ifixb=fix)
        odr.set_job(deriv=3)
        output = odr.run()
        ref_B3 = output.beta
        err_B3 = output.sd_beta
        f3 = GEOST_thermo.NS3_V(ref_B3, data.V)
        df3 = data.V.shape[0] - 3

    # Compute the chi-squared for each EOS
    chisq_2 = 0.0
    chisq_3 = 0.0
    for i in range(data.P.shape[0]):
        chisq_2 += (data.P[i] - f2[i]) ** 2
        chisq_3 += (data.P[i] - f3[i]) ** 2

    # Compute the F-statistic
    Fx23 = (chisq_2 - chisq_3) / (chisq_3 / df3)

    # Use Scipy's built-in F-distribution methods
    F = f_dist.pdf(Fx23, 1, df3)
    # Compute the p-value
    C = 1 - f_dist.cdf(Fx23, 1, df3)

    x1 = np.linspace(0.01, 2 * Fx23, 128)
    pdf23 = f_dist.pdf(x1, 1, df3)
    cdf23 = f_dist.cdf(x1, 1, df3)

    # Results
    plt.figure()
    fig = plt.gcf()
    fig.canvas.set_window_title("F-Test Results")
    plt.plot(x1, pdf23, 'r', linewidth=3, alpha=0.8)
    plt.plot(x1, cdf23, 'b', linewidth=3, alpha=0.8)
    plt.plot(Fx23, f_dist.pdf(Fx23, 1, df3), 'ko')
    plt.fill_between(x1, 0, pdf23, where=f_dist.cdf(Fx23, 1, df3) < cdf23,
                     facecolor='red', alpha=0.2)
    plt.xticks(fontsize=14)
    plt.ylim([-0.01, 1.01])
    plt.title("F test for 2nd vs. 3rd order EOS", fontsize=14)
    plt.xlabel(r"$\left( \chi^{2}_{2} - \chi^{2}_{3} \right) / \left(\chi^{2}_{3} / \nu_{3} \right)$",
               fontsize=12)
    plt.ylabel(r"PDF/CDF")
    plt.legend(['PDF', 'CDF', r"$F_{X}$"], loc='upper right', numpoints=1)
    plt.text(Fx23, f_dist.pdf(Fx23, 1, df3) + 0.05,
             "p-value= {:8.4f}".format(1 - f_dist.cdf(Fx23, 1, df3)),
             fontsize=14)
    plt.tight_layout()

    return [chisq_2 / df2, chisq_3 / df3, Fx23, f_dist.cdf(Fx23, 1, df3)]
def fqtest(VT, P, VTerr, Perr, debye, Bhit, fix):
    F = np.zeros(2, dtype=float)  # pdf
    C = np.zeros(2, dtype=float)  # p-values
    Bhit2 = np.zeros(3, dtype=float)

    # for q fixed at 1
    Bhit2[0] = Bhit[0]
    Bhit2[1] = Bhit[1]
    Bhit2[2] = 1.
    fix2 = list(fix)
    fix2[-1] = 0

    odr_model2 = odrpack.Model(fcn=debye.P_thermal, fjacb=debye.JACB,
                               fjacd=debye.JACD)
    odr_data2 = odrpack.RealData(x=VT, y=P, sx=VTerr, sy=Perr)
    odr2 = odrpack.ODR(odr_data2, odr_model2, beta0=Bhit2, ifixb=fix2)
    odr2.set_job(deriv=1)        # Use user-supplied derivatives
    output2 = odr2.run()         # Output of ODR run
    ref_B2 = output2.beta        # LSQ best-fit parameters
    err_B2 = output2.sd_beta     # Parameter errors (1-sigma)
    f2 = debye.P_thermal(ref_B2, VT)
    df2 = VT.shape[1] - 2

    odr_model3 = odrpack.Model(fcn=debye.P_thermal, fjacb=debye.JACB,
                               fjacd=debye.JACD)
    odr_data3 = odrpack.RealData(x=VT, y=P, sx=VTerr, sy=Perr)
    odr3 = odrpack.ODR(odr_data3, odr_model3, beta0=Bhit, ifixb=fix)
    odr3.set_job(deriv=1)        # NB: using numerical derivatives here!
    output3 = odr3.run()
    ref_B3 = output3.beta
    err_B3 = output3.sd_beta
    f3 = debye.P_thermal(ref_B3, VT)
    df3 = VT.shape[1] - 3

    # Compute the chi-squared for each fit
    chisq_2 = 0.0
    chisq_3 = 0.0
    for i in range(P.shape[0]):
        chisq_2 += (P[i] - f2[i]) ** 2
        chisq_3 += (P[i] - f3[i]) ** 2

    # Compute the F-statistic
    Fx23 = (chisq_2 - chisq_3) / (chisq_3 / df3)
    x1 = np.linspace(0.01, 2 * Fx23, 128)
    pdf23 = f_dist.pdf(x1, 1, df3)
    cdf23 = f_dist.cdf(x1, 1, df3)

    # Use Scipy's built-in F-distribution methods
    F = f_dist.pdf(Fx23, 1, df3)
    # Compute the p-value
    C = 1 - f_dist.cdf(Fx23, 1, df3)

    # Finally, make the plot. Should be a 1 row, 2 column plot showing the
    # f-test for the 2nd vs. 3rd order EOS and the 3rd vs. 4th order EOS.
    plt.figure()
    fig = plt.gcf()
    fig.canvas.set_window_title("F-Test Results")
    plt.plot(x1, pdf23, 'r', linewidth=3, alpha=0.8)
    plt.plot(x1, cdf23, 'b', linewidth=3, alpha=0.8)
    plt.plot(Fx23, f_dist.pdf(Fx23, 1, df3), 'ko')
    plt.fill_between(x1, 0, pdf23, where=f_dist.cdf(Fx23, 1, df3) < cdf23,
                     facecolor='red', alpha=0.2)
    plt.xticks(fontsize=14)
    plt.ylim([-0.01, 1.01])
    plt.title("Comparing q != 1", fontsize=14)
    plt.xlabel(r"$\left( \chi^{2}_{2} - \chi^{2}_{3} \right) / \left(\chi^{2}_{3} / \nu_{3} \right)$",
               fontsize=12)
    plt.legend(['PDF', 'CDF', r"$F_{X}$"], loc='upper right', numpoints=1)
    plt.text(Fx23, f_dist.pdf(Fx23, 1, df3) + 0.05,
             "p-value= {:8.4f}".format(1 - f_dist.cdf(Fx23, 1, df3)))
    plt.tight_layout()

    return [chisq_2, output2.stopreason[0], chisq_3, output3.stopreason[0],
            Fx23, f_dist.cdf(Fx23, 1, df3)]
## Compute F score
n = y.size
fval = ss_reg / (ss_res / (n - 2))

'''
- Compute the p-value:
  * Plot the F(1, n) distribution for 100 f values within [10, 25].
    Draw P(F(1, n) > F), i.e. color the surface defined by the x values
    larger than F below the F(1, n) density.
  * P(F(1, n) > F) is the p-value; compute it.
'''

## Plot the F(1, n) distribution for 100 f values within [10, 25]
## Depict P(F(1, n) > F), i.e. color the surface defined by x values
## larger than F below the F(1, n) density
from scipy.stats import f

fvalues = np.linspace(10, 25, 100)
plt.plot(fvalues, f.pdf(fvalues, 1, 30), 'b-', label="F(1, 30)")
upper_fval_fvalues = fvalues[fvalues > fval]
plt.fill_between(upper_fval_fvalues, 0, f.pdf(upper_fval_fvalues, 1, 30), alpha=.8)
# pdf(x, df1, df2): probability density function at x of the given RV.
plt.legend()

## P(F(1, n) > F) is the p-value, compute it
# Survival function (1 - `cdf`)
pval = f.sf(fval, 1, n - 2)

## With statsmodels
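The original continues under this heading; a minimal sketch of how the same F test could be run with statsmodels (the variable and column names here are assumptions, not from the original):

import pandas as pd
import statsmodels.formula.api as smf

data = pd.DataFrame({'x': x, 'y': y})   # x, y from the regression above (assumed names)
model = smf.ols('y ~ x', data).fit()
print(model.fvalue, model.f_pvalue)     # should match fval and pval up to rounding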
xsensors_m = xsensors_m - xsensors_m[0, :]
M = size(xsensors_m, 0)
T_sec = 30
N = int(T_sec * Fs_Hz)
Lruns = 10000
Fstat = zeros(Lruns)
for ir in range(0, Lruns):
    x = random.randn(N, M)
    F = maxfstat(x, Fs_Hz, xsensors_m, gridaz_deg, gridel_deg, gridc_mps)
    Fstat[ir] = F[0]

#%%
xtheo = linspace(0.5, 1.5, 100)
ytheo = f.pdf(xtheo, N, N * (M - 1))

HorizontalSize = 5
VerticalSize = 3
figsimul = plt.figure(num=2, figsize=(HorizontalSize, VerticalSize),
                      edgecolor='k', facecolor=[1, 1, 0.92])
figsimul.clf()
h1 = plt.hist(Fstat, density=True, bins=30, label='histogram')  # `normed` was removed from matplotlib
h2 = plt.plot(xtheo, ytheo, 'r', linewidth=2, label='Fisher')
plt.legend(loc='best')

dirfigsave = '/Users/maurice/etudes/stephenA/propal2/figures/'
tt = '%sthetafixFisher.pdf' % dirfigsave
plt.show()
figsimul.savefig(tt, format='pdf')
print('Ftest')
########################## F-TEST ###############################
bigN = len(AllWave)  # Number of data points
Nparam1 = 3.
Nparam2 = 5.
dof1 = bigN - Nparam1
dof2 = bigN - Nparam2
#chi1 = 300.  #428.65 #383.6 #really bad
#chi2 = 300.  #results in chisqr close to 1
ftest = (chisq1 / dof1) / (chisq2 / dof2)  # RJ+BB vs RJ+BB+BB
#ftest = ((chi1 - chi2) / (dof1 - dof2)) / (chi2 / dof2)

proba_at_f_pdf = f.pdf(ftest, dof1, dof2)
proba_at_f_cdf = f.cdf(ftest, dof1, dof2)     # P(F(dof1, dof2) < ftest)
f_at_proba_98 = f.ppf(.98, dof1, dof2)        # q such that P(F(dof1, dof2) < q) = .98
proba_at_norm_idf = Norm.isf(proba_at_f_cdf)  # inverse survival function
proba_at_norm_ppf = Norm.ppf(proba_at_f_cdf)  # percent point function

print('')
print('-----------')
print('Source: ', Source)
print('ftest: ', ftest)
print('proba_at_f_pdf: ', proba_at_f_pdf)
print('proba_at_f_cdf: ', proba_at_f_cdf)
print('f_at_proba_98: ', f_at_proba_98)
print('proba_at_norm_isf: ', proba_at_norm_idf)            # inverse survival function
print('proba_at_norm_ppf: ', proba_at_norm_ppf, ' sigma')  # Number of sigma away
print('-----------')
def f34test(data, B, fix):
    F = np.zeros(2, dtype=float)  # pdf
    C = np.zeros(2, dtype=float)  # p-values
    B3 = np.zeros(3, dtype=float)

    # for the 3rd order EOS, assign K0 and V0 from user guesses
    B3[0] = B[0]
    B3[1] = B[1]
    B3[2] = B[3]

    # Figure out which EOS's to use
    if data.EOS_type == GEOST_thermo.types().names[0]:
        # Birch-Murnaghan
        odr_model = odrpack.Model(fcn=GEOST_thermo.BM3_V,
                                  fjacb=GEOST_thermo.BM3_V_JACB,
                                  fjacd=GEOST_thermo.BM3_V_JACD)
        odr_data = odrpack.RealData(x=data.V, y=data.P, sx=data.Verr, sy=data.Perr)
        odr = odrpack.ODR(odr_data, odr_model, beta0=B3,
                          ifixb=[fix[0], fix[1], fix[3]])
        odr.set_job(deriv=3)          # Use user-supplied derivatives
        output = odr.run()            # Output of ODR run
        ref_B3 = output.beta          # LSQ best-fit parameters
        err_B3 = output.sd_beta       # Parameter errors (1-sigma)
        f3 = GEOST_thermo.BM3_V(ref_B3, data.V)
        df3 = data.V.shape[0] - 3

        odr_model = odrpack.Model(fcn=GEOST_thermo.BM4_V,
                                  fjacb=GEOST_thermo.BM4_V_JACB,
                                  fjacd=GEOST_thermo.BM4_V_JACD)
        odr_data = odrpack.RealData(x=data.V, y=data.P, sx=data.Verr, sy=data.Perr)
        odr = odrpack.ODR(odr_data, odr_model, beta0=B, ifixb=fix)
        odr.set_job(deriv=3)          # Use user-supplied derivatives, but CHECK THEM!!!
        output = odr.run()
        ref_B4 = output.beta
        err_B4 = output.sd_beta
        # Evaluate the 4th-order fit (the original evaluated BM3_V(ref_B3) here,
        # which looks like a copy-paste slip)
        f4 = GEOST_thermo.BM4_V(ref_B4, data.V)
        df4 = data.V.shape[0] - 4

    elif data.EOS_type == GEOST_thermo.types().names[1]:
        # Natural strain
        odr_model = odrpack.Model(fcn=GEOST_thermo.NS3_V,
                                  fjacb=GEOST_thermo.NS3_V_JACB,
                                  fjacd=GEOST_thermo.NS3_V_JACD)
        odr_data = odrpack.RealData(x=data.V, y=data.P, sx=data.Verr, sy=data.Perr)
        odr = odrpack.ODR(odr_data, odr_model, beta0=B3,
                          ifixb=[fix[0], fix[1], fix[3]])
        odr.set_job(deriv=3)
        output = odr.run()
        ref_B3 = output.beta
        err_B3 = output.sd_beta
        f3 = GEOST_thermo.NS3_V(ref_B3, data.V)
        df3 = data.V.shape[0] - 3

        odr_model = odrpack.Model(fcn=GEOST_thermo.NS4_V,
                                  fjacb=GEOST_thermo.NS4_V_JACB,
                                  fjacd=GEOST_thermo.NS4_V_JACD)
        odr_data = odrpack.RealData(x=data.V, y=data.P, sx=data.Verr, sy=data.Perr)
        odr = odrpack.ODR(odr_data, odr_model, beta0=B, ifixb=fix)
        odr.set_job(deriv=3)
        output = odr.run()
        ref_B4 = output.beta
        err_B4 = output.sd_beta
        f4 = GEOST_thermo.NS4_V(ref_B4, data.V)
        df4 = data.V.shape[0] - 4

    elif data.EOS_type == GEOST_thermo.types().names[2]:
        print("ERROR in PLOTS: Cannot do F-test using Vinet EOS.")  # was self.LOG_PRINT, but this is not a method
    else:
        print("ERROR in PLOTS: Unrecognized value of EOS_SELECT")

    # Compute the chi-squared for each EOS
    chisq_3 = 0.0
    chisq_4 = 0.0
    for i in range(data.P.shape[0]):
        chisq_3 += (data.P[i] - f3[i]) ** 2
        chisq_4 += (data.P[i] - f4[i]) ** 2

    # Compute the F-statistic
    Fx34 = (chisq_3 - chisq_4) / (chisq_4 / df4)
    x1 = np.linspace(0.01, 2 * Fx34, 128)
    pdf34 = f_dist.pdf(x1, 1, df4)
    cdf34 = f_dist.cdf(x1, 1, df4)

    # Use Scipy's built-in F-distribution methods
    F = f_dist.pdf(Fx34, 1, df4)
    # Compute the p-value
    C = 1 - f_dist.cdf(Fx34, 1, df4)

    # Finally, make the plot showing the f-test for the 3rd vs. 4th order EOS.
    plt.figure()
    fig = plt.gcf()
    fig.canvas.set_window_title("F-Test Results")
    plt.plot(x1, pdf34, 'r', linewidth=3, alpha=0.8)
    plt.plot(x1, cdf34, 'b', linewidth=3, alpha=0.8)
    plt.plot(Fx34, f_dist.pdf(Fx34, 1, df4), 'ko')
    plt.fill_between(x1, 0, pdf34, where=f_dist.cdf(Fx34, 1, df4) < cdf34,
                     facecolor='red', alpha=0.2)
    plt.xticks(fontsize=14)
    plt.ylim([-0.01, 1.01])
    plt.title("Comparing 3rd vs. 4th order EOS", fontsize=14)
    plt.xlabel(r"$\left( \chi^{2}_{3} - \chi^{2}_{4} \right) / \left(\chi^{2}_{4} / \nu_{4} \right)$",
               fontsize=12)
    plt.legend(['PDF', 'CDF', r"$F_{X}$"], loc='upper right', numpoints=1)
    plt.text(Fx34, f_dist.pdf(Fx34, 1, df4) + 0.05,
             "p-value= {:8.4f}".format(1 - f_dist.cdf(Fx34, 1, df4)),
             fontsize=14)
    plt.tight_layout()

    return [chisq_3 / df3, chisq_4 / df4, Fx34, f_dist.cdf(Fx34, 1, df4)]
The module name for the $F$ distribution is `f`.

```
f.pdf(x, dfn, dfd)
f.cdf(x, dfn, dfd)
f.ppf(a, dfn, dfd)
f.rvs(dfn, dfd, size=1)
```

* `dfn`: numerator degrees of freedom
* `dfd`: denominator degrees of freedom

Import `f` from `scipy.stats` and plot the probability density function.

from scipy.stats import f
x = np.linspace(0.001, 5, 1000)
y = f.pdf(x, dfn=5, dfd=1)
plt.plot(x, y)
pass

With `dfn=5` and `dfd=1`, what is the probability that the value of `x` is `0.1` or less?

f.cdf(0.1, dfn=5, dfd=1)

With `dfn=5` and `dfd=1`, what is the probability that the value of `x` is `5` or greater?

1 - f.cdf(5, dfn=5, dfd=1)

### Uniform Distribution

The module name for the uniform distribution is `uniform`.

```
)
else:
    print(
        "[2] The null hypothesis for the blocks in the two-way analysis is not rejected, so there are no differences between the blocks"
    )

print("One-way analysis")
print("The value from the one-way ANOVA table for the rows is: ", FTU1)
if FU1 > FTU1:
    print(
        "[3] The null hypothesis for the rows (treatments) is rejected, so there is a difference between them"
    )
else:
    print(
        "[3] The null hypothesis for the rows (treatments) is not rejected, so there is no difference between them"
    )

print("The value from the one-way ANOVA table for the columns is: ", FTU2)
if FU2 > FTU2:  # the original compared FU1 > FTU1 here, which looks like a copy-paste slip
    print(
        "[4] The null hypothesis for the columns (blocks) is rejected, so there is a difference between them"
    )
else:
    print(
        "[4] The null hypothesis for the columns (blocks) is not rejected, so there is no difference between them"
    )

from scipy.stats import f

val = f.pdf(0.05, 3, 6)
print(val)
# Reject H0 at level a if T2 > (p(n-1))/(n-p) * F_{p,n-p}(a), where
# F is the F distribution.
num_dof = 2
den_dof = 45 - 2
a = .43

#rv = f.pdf(a, dfn=num_dof, dfd=den_dof, loc=0, scale=1)  # central F suppositions
rv = (2 * (45 + 1 - 2) / (45 + 1 - 2 - 1)) * f.ppf(a, num_dof, den_dof)
#rm = (2*(45+1-2)/(45+1-2-1))*f.cdf(a, num_dof, den_dof)

fig = plt.figure(figsize=(13, 8))
x = np.linspace(f.ppf(0.01, num_dof, den_dof),
                f.ppf(0.99, num_dof, den_dof), 100)
x_alpha = np.linspace(f.ppf(0.95, num_dof, den_dof),
                      f.ppf(0.99, num_dof, den_dof), 100)
plt.plot(x, f.pdf(x, num_dof, den_dof), 'b-', lw=3, label='f pdf')
plt.title('Fisher PDF for dfn = {}, dfd = {}'.format(num_dof, den_dof))
plt.fill_between(x_alpha, f.pdf(x_alpha, num_dof, den_dof), color='r',
                 label=r'$\alpha$ = 0.05')
plt.legend(loc='upper right')
plt.savefig('./fischer.pdf')
print(t, rv)  # t computed earlier in the original source

# NOW WE DO THE SAME WITH THE VALUES FROM THE PAIRED LINES METHOD:
x_paired = [
    0.6647315155340111, 0.6528186562246899, 0.4926493627791132,
    1.2617404216106476, 0.7570001093882107
]
#== compute the p-value with the asymptotic distribution (not independent)
ppv = pvalunderH0(FF, N, xsensors_m, Fs_Hz, range_azimuth_deg,
                  range_elevation_deg, range_velocity_mps)

# p-values with the limG independent and F independent
ppvG = 1 - norm.cdf(FF, 1.0, sqrt(2.0 * M / (M - 1.0) / N)) ** Q
ppvF = 1 - f.cdf(FF, N, N * (M - 1)) ** Q

# pdf of the max of the limG independent and F independent
linx = linspace(0.69, 1.3, 200)
sigmaGlim = sqrt(2.0 * M / (M - 1.0) / N)
nu1 = N
nu2 = N * (M - 1)
pdffromF = f.pdf(linx, nu1, nu2)
pdffromFind = Q * pdffromF * (f.cdf(linx, nu1, nu2) ** (Q - 1))
pdffromGind = Q * norm.pdf(linx, 1.0, sigmaGlim) * (norm.cdf(linx, 1.0, sigmaGlim) ** (Q - 1))

dirfigsave = '/Users/maurice/etudes/stephenA/propal2/figures/'

#%%
#HorizontalSize = 6
#VerticalSize = 6
#figpvalFoT = plt.figure(num=1, figsize=(HorizontalSize, VerticalSize),
#                        edgecolor='k', facecolor=[1, 1, 0.92])
#plt.subplot(2,1,1)
#plt.ylabel("Frequency")
#plt.title("based on the asymptotic distribution")
# Remember from the last chapter that we can partition the total variance in the data ($SS_{total}$) into the variance that is explained by the model ($SS_{model}$) and the variance that is not ($SS_{error}$). We can then compute a *mean square* for each of these by dividing them by their degrees of freedom; for the error this is $N - p$ (where $p$ is the number of means that we have computed), and for the model this is $p - 1$:
#
# $$
# MS_{model} = \frac{SS_{model}}{df_{model}} = \frac{SS_{model}}{p-1}
# $$
#
# $$
# MS_{error} = \frac{SS_{error}}{df_{error}} = \frac{SS_{error}}{N - p}
# $$
#
# With ANOVA, we want to test whether the variance accounted for by the model is greater than what we would expect by chance, under the null hypothesis of no differences between means. Whereas for the t distribution the expected value is zero under the null hypothesis, that's not the case here, since sums of squares are always positive numbers. Fortunately, there is another standard distribution that describes how ratios of sums of squares are distributed under the null hypothesis: the *F* distribution (see figure \@ref(fig:FDist)). This distribution has two degrees of freedom parameters, which correspond to the degrees of freedom for the numerator (which in this case is the model) and the denominator (which in this case is the error).

# %%
from scipy.stats import f

x = np.arange(0.1, 10, 0.1)
ax = sns.lineplot(x=x, y=f.pdf(x, 1, 1), color='r', label='df(1,1)')
ax = sns.lineplot(x=x, y=f.pdf(x, 50, 1), color='g', label='df(50,1)')
ax = sns.lineplot(x=x, y=f.pdf(x, 50, 10), color='b', label='df(50,10)')
ax.set(xlabel='F Values', ylabel='Density')
plt.legend()
plt.show()

# %% [markdown]
# To create an ANOVA model, we extend the idea of *dummy coding* that you encountered in the last chapter. Remember that for the t-test comparing two means, we created a single dummy variable that took the value of 1 for one of the conditions and zero for the others. Here we extend that idea by creating two dummy variables, one that codes for the Drug 1 condition and the other that codes for the Drug 2 condition. Just as in the t-test, we will have one condition (in this case, placebo) that doesn't have a dummy variable, and thus represents the baseline against which the others are compared; its mean defines the intercept of the model. Let's create the dummy coding for drugs 1 and 2.

# %%
df['drug1'] = df['group'] == 'drug1'
df['drug2'] = df['group'] == 'drug2'

# %% [markdown]
# Now we can fit a model using the same approach that we used in the previous chapter:
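# %% [markdown]
# A minimal sketch of that model fit, assuming `df` is a pandas DataFrame with an `outcome` column (the column name is an assumption for illustration):

# %%
import statsmodels.formula.api as smf

# hypothetical outcome column; the dummy variables were created above
lm = smf.ols('outcome ~ drug1 + drug2', data=df).fit()
print(lm.summary())  # the reported F-statistic tests drug1 = drug2 = 0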
def fplot(fvals, f_data, ref, alt, year=2017, nbins=130, savename=None, mc=False):
    ref_pt, ref_rho = ref
    alt_pt, alt_rho = alt
    p1 = (ref_pt + 1) * (ref_rho + 1)
    p2 = (alt_pt + 1) * (alt_rho + 1)

    from scipy.stats import f

    x_lim = max(np.percentile(fvals, 95), f_data * 1.05, np.median(fvals) * 3)
    x = np.linspace(0, x_lim, 200)
    bins = np.linspace(0, x_lim, 30)
    width = bins[1] - bins[0]
    goodvals = fvals[fvals > 0]

    fig, ax = plt.subplots()
    trans = transforms.blended_transform_factory(ax.transData, ax.transAxes)
    with np.errstate(divide='ignore'):
        ax.plot(x, len(goodvals) * width * f.pdf(x, p2 - p1, nbins - p2),
                color='red',
                label='F-dist, ndf({},{})'.format(p2 - p1, nbins - p2))
    ax.hist(fvals, bins, facecolor='none', edgecolor='black',
            histtype='stepfilled', lw=2,
            label="Toys > 0, N = {}".format(len(goodvals)))
    ax.hist(goodvals[goodvals > f_data], bins, facecolor='steelblue',
            edgecolor='gray', histtype='stepfilled', alpha=0.3,
            label='p-value = {}'.format(
                round(float(len(goodvals[goodvals > f_data])) / len(goodvals), 3)))
    ax.annotate("", xy=(f_data, 0), xycoords=trans,
                xytext=(f_data, 0.25), textcoords=trans,
                arrowprops=dict(lw='4', color='b',
                                arrowstyle="->,head_length=1.5,head_width=0.5"))
    ax.plot([], [], color='blue', lw=2,
            label="Observed = {:.3f}".format(f_data))

    title = "TF({},{}) x TF({},{})".format(ref_pt, ref_rho, alt_pt, alt_rho)
    ax.legend(title=title)
    hep.cms.label(data=not mc, year=year, ax=ax)
    ax.set_xlim(0, x_lim)
    xlab = r"$\frac{-2log(\lambda_1/\lambda_2)/(p_2-p_1)}{-2log\lambda_2/(n-p_2)}$"
    ax.set_xlabel(xlab, x=1, ha='right')
    ax.set_ylabel("Pseudoexperiments", y=1, ha='right')

    if savename is not None:
        fig.savefig('{}.pdf'.format(savename), dpi=300, transparent=True,
                    bbox_inches='tight')
        fig.savefig('{}.png'.format(savename), dpi=300, transparent=True,
                    bbox_inches='tight')
import numpy as np
from scipy.stats import f
import matplotlib.pyplot as plt

x = np.arange(0, 10, .001)
plt.plot(x, f.pdf(x, 1, 1),
         x, f.pdf(x, 3, 8),
         x, f.pdf(x, 8, 3),
         x, f.pdf(x, 40, 40))
plt.xlim(0, 10)
plt.ylim(0, 1.5)
plt.xlabel('$x$', fontsize=20)
plt.ylabel('$P(X=x | K_1,K_2)$', fontsize=18)
plt.title('Fisher-Distribution', fontsize=20)
plt.legend(['$K_1=1, K_2=1$', '$K_1=3, K_2=8$',
            '$K_1=8, K_2=3$', '$K_1=40, K_2=40$'])
plt.show()