def dt(x, df=1, loc=0, scale=1, ncp=None, log=False):
    """
    Density function for the t distribution.
    Returns the probability density value at the value x.

    ARGS:
    ---------------
    :param x (float, array of floats): the value(s) of x
    :param df (float): degrees of freedom
    :param loc: array_like, optional location parameter (default=0)
    :param scale: float, optional scale (default=1)
    :param ncp (float): non-centrality parameter delta. Currently not implemented.
    :param log (bool): take the log?

    RETURN:
    ---------------
    :return: returns an array of density values
    """
    # ==========================================================================
    # forward loc and scale instead of hard-coding loc=0, scale=1
    if log:
        return t.logpdf(x, df=df, loc=loc, scale=scale)
    else:
        return t.pdf(x, df=df, loc=loc, scale=scale)
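# Hedged usage sketch for the dt() wrapper above (the input values are
# illustrative, not from the source): with loc and scale forwarded, the wrapper
# should agree with calling scipy.stats.t directly.
import numpy as np
from scipy.stats import t

x = np.array([-1.0, 0.0, 2.5])
print(dt(x, df=5))                      # same as t.pdf(x, df=5)
print(dt(x, df=5, loc=1.0, scale=2.0))  # shifted and scaled density
print(dt(x, df=5, log=True))            # log-density, as t.logpdf(x, df=5)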
def GeneratePDF(Data, method='Robust_Student_t', lower_threshold=0.15, upper_threshold=0.85):
    '''Generate the pdf estimate of the data
    Input:
        /Data/            data to estimate the pdf on
        /method/          method of estimation. Available methods:
                          'Robust_Student_t'; 'KDE'; 'Normal'
        /lower_threshold/ in percentage
        /upper_threshold/ in percentage
    Output:
        /pdf/ fitted pdf
        /cdf/ fitted cdf
    '''
    x = np.linspace(min(Data), max(Data), 100)
    if method == 'Robust_Student_t':
        nu, mu, sigma = uvtfit(Data)
        pdf = t.pdf(x, nu, mu, sigma)
        cdf = t.cdf(x, nu, mu, sigma)
        lower = t.ppf(lower_threshold, nu, mu, sigma)
        upper = t.ppf(upper_threshold, nu, mu, sigma)
    elif method == 'Normal':
        mu, sigma = norm.fit(Data)
        pdf = norm.pdf(x, mu, sigma)
        cdf = norm.cdf(x, mu, sigma)
        lower = norm.ppf(lower_threshold, mu, sigma)
        upper = norm.ppf(upper_threshold, mu, sigma)
    elif method == 'KDE':
        kernel = gaussian_kde(Data)
        pdf = kernel.evaluate(x)
        # integrate the KDE up to each grid point so cdf has the same length as x
        cdf = np.array([kernel.integrate_box(x[0], xi) for xi in x])
        lower = np.percentile(cdf, lower_threshold * 100)
        upper = np.percentile(cdf, upper_threshold * 100)
    return x, pdf, cdf, lower, upper
def studentT_curve(ax=None, linewidth=4, color='k', mean=0, SD=1, df=20, facecolor='gray', xlabel='standardized units', ylabel='% per standardized unit', alpha=0.5, **plot_opts): if ax is None: fig = plt.gcf() ax = fig.add_subplot(111) plot_opts['linewidth'] = linewidth plot_opts['color'] = color Z = np.linspace(-4,4,101) X = mean+SD*Z Y = tdist.pdf(Z, df) / SD ax.plot(X, Y, **plot_opts) ax.fill_between(X, 0*X, Y, alpha=alpha, facecolor=facecolor) if xlabel: ax.set_xlabel(xlabel, fontsize=20) if ylabel: ax.set_ylabel(ylabel, fontsize=20) ax.set_ylim([0,0.45/SD]) ax.set_xlim([X.min(),X.max()]) return ax
def predict_proba(self, X): N, D = X.shape # P = np.zeros(N) # for n in xrange(N): # x = X[n] # pyx = [] # for c in (0, 1): # pycx = self.pyy[c] # for d in xrange(D): # tinfo_cd = self.tinfo[c][d] # pdf_d = t.pdf(x[d], df=tinfo_cd['df'], loc=tinfo_cd['center'], scale=tinfo_cd['scale']) # pycx *= pdf_d # pyx.append(pycx) # py1x = pyx[1] / (pyx[0] + pyx[1]) # # print "p(y=1|x):", py1x # P[n] = py1x posteriors = np.zeros((N, 2)) for c in (0, 1): probability_matrix = np.zeros((N, D)) for d in xrange(D): tinfo_cd = self.tinfo[c][d] pdf_d = t.pdf(X[:,d], df=tinfo_cd['df'], loc=tinfo_cd['center'], scale=tinfo_cd['scale']) probability_matrix[:,d] = pdf_d posteriors_c = np.prod(probability_matrix, axis=1)*self.pyy[c] posteriors[:,c] = posteriors_c P = posteriors[:,1] / np.sum(posteriors, axis=1) return P
def get_predprobs(self, datum):
    """
    Predictive distribution of NIG is a T distribution.
    """
    muT = self.params.post_params['mu']
    nuT = self.params.post_params['nu']
    alphaT = self.params.post_params['alpha']
    betaT = self.params.post_params['beta']
    # scipy's scale argument is a standard-deviation-like scale, so take the
    # square root of betaT*(1 + nuT) / (alphaT*nuT)
    return t.pdf(datum, 2 * alphaT, muT, np.sqrt(betaT * (1 + nuT) / (alphaT * nuT)))
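# Minimal standalone sketch of the same predictive density, assuming a
# Normal-Inverse-Gamma posterior with parameters (mu, nu, alpha, beta), where nu
# acts as a pseudo-count. Names and numbers are illustrative, not the class API:
# the posterior predictive is Student-t with 2*alpha degrees of freedom,
# centre mu and scale sqrt(beta*(nu + 1)/(alpha*nu)).
import numpy as np
from scipy.stats import t

def nig_predictive_pdf(x, mu, nu, alpha, beta):
    scale = np.sqrt(beta * (1.0 + nu) / (alpha * nu))
    return t.pdf(x, df=2.0 * alpha, loc=mu, scale=scale)

print(nig_predictive_pdf(0.3, mu=0.0, nu=2.0, alpha=3.0, beta=1.5))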
def plot_dist(n, q1_arr):
    # n = sample size, q1_arr = array of q1 values
    xx = np.linspace(-3, 3, num=100)    # horizontal-axis values for the figure
    kde_model = gaussian_kde(q1_arr)    # estimate the distribution of the t values by kernel density estimation
    t_dist = t.pdf(xx, df=n-2)          # t distribution with the same degrees of freedom
    plt.plot(xx, kde_model(xx), 'g-')   # plot of the distribution of the t values
    plt.plot(xx, t_dist, 'b:')          # t distribution
    plt.ylabel('Kernel Density')        # vertical-axis label
    plt.title('n = {0}'.format(n))      # title
def parametricES_student(self, dof: int, confidenceLevel: float = 0.95) -> {}: es_t = {} for name in self.returns.columns: mu = np.mean(self.returns[name]) std = np.std(self.returns[name]) xanu = t.ppf(1 - confidenceLevel, dof) es_t[name] = round( (-1 / (1 - confidenceLevel)) * (1 - dof)**(-1) * (dof - 2 + xanu**2) * t.pdf(xanu, dof) * std - mu, 3) return es_t
def confidence(self, i):
    """ Return the 95 per cent confidence interval for a fitted parameter

    https://stats.stackexchange.com/questions/72047/when-fitting-a-curve-how-do-i-calculate-the-95-confidence-interval-for-my-fitt

    [BestFit(Pi) +/- t(95%,DF)*SE(Pi)

    NOTES:
        TODO(arl): make this a user defined interval
    """
    ci = constants.CONFIDENCE_INTERVAL / 100.0
    # two-sided t critical value (quantile) at the requested confidence level
    t_crit = t_distrb.ppf(0.5 + ci / 2.0, self.DoF)
    conf = t_crit * self.SE(i)
    return (self.fit_params[i].value - conf, self.fit_params[i].value + conf)
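# Hedged standalone sketch of the textbook interval cited in the docstring,
# with illustrative numbers (estimate, standard error and degrees of freedom
# are stand-ins, not values from the source).
from scipy.stats import t

def confidence_interval(estimate, standard_error, dof, level=0.95):
    t_crit = t.ppf(0.5 + level / 2.0, dof)   # e.g. ~2.09 for dof=19, level=0.95
    half_width = t_crit * standard_error
    return estimate - half_width, estimate + half_width

print(confidence_interval(1.8, 0.25, dof=19))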
def chi2_distribution():
    fig, ax = plt.subplots(1, 1)
    # display the probability density function
    df = 10
    x = np.linspace(chi2.ppf(0.01, df), chi2.ppf(0.99, df), 100)
    ax.plot(x, chi2.pdf(x, df))
    # simulate the chi2 distribution
    y = []
    n = 10
    for i in range(1000):
        chi2r = 0.0
        r = norm.rvs(size=n)
        for j in range(n):
            chi2r = chi2r + r[j]**2
        y.append(chi2r)
    ax.hist(y, normed=True, alpha=0.2)
    plt.show()

    fig, ax = plt.subplots(1, 1)
    # display the probability density function
    df = 10
    x = np.linspace(-4, 4, 100)
    ax.plot(x, t.pdf(x, df))
    # simulate the t-distribution
    y = []
    for i in range(1000):
        rx = norm.rvs()
        ry = chi2.rvs(df)
        rt = rx / np.sqrt(ry / df)
        y.append(rt)
    ax.hist(y, normed=True, alpha=0.2)
    plt.show()

    fig, ax = plt.subplots(1, 1)
    # display the probability density function
    dfn, dfm = 10, 5
    x = np.linspace(f.ppf(0.01, dfn, dfm), f.ppf(0.99, dfn, dfm), 100)
    ax.plot(x, f.pdf(x, dfn, dfm))
    # simulate the F-distribution: F = (chi2_dfn / dfn) / (chi2_dfm / dfm)
    y = []
    for i in range(1000):
        rx = chi2.rvs(dfn)
        ry = chi2.rvs(dfm)
        rf = (rx / dfn) / (ry / dfm)
        y.append(rf)
    ax.hist(y, normed=True, alpha=0.2)
    plt.show()
def compute_T_Pvalue(betas, stds_beta, mask_file, null_hyp=True): ''' Compute Tvalues statistic and Pvalue based upon estimates and their standard deviation beta and std_beta for all voxels beta: shape (nb_vox, 1) std: shape (1) Assume null hypothesis if null_hyp is True ''' from pyhrf.ndarray import xndarray import sys sys.path.append("/home/i2bm/BrainVisa/source/pyhrf/pyhrf-free/trunk/script/WIP/Scripts_IRMf_Adultes_Solv/Scripts_divers_utiles/Scripts_utiles/") from Functions_fit import Permutation_test, stat_mean, stat_Tvalue, stat_Wilcoxon mask = xndarray.load(mask_file).data #to save P and Tval on a map BvalC = xndarray(betas, axes_names=['sagittal', 'coronal', 'axial']) Betasval = BvalC.flatten(mask, axes=['sagittal', 'coronal', 'axial'], new_axis='position').data Stdsval = stds_beta Tval = xndarray(Betasval/Stdsval, axes_names=['position']).data nb_vox = Betasval.shape[0] nb_reg = betas.shape[1] dof = nb_vox - nb_reg #degrees of freedom for STudent distribution assert dof>0 Probas=np.zeros(Betasval.shape) for i in xrange(nb_vox): if null_hyp: #STudent distribution from scipy.stats import t fmix = lambda x: t.pdf(x, dof) else: fmix = lambda t: 1/np.sqrt(2*np.pi*Stdsval[i]**2)*np.exp(- (t - Betasval[i])**2 / (2*Stdsval[i]**2) ) Probas[i] = quad(fmix, Tval[i], float('inf'))[0] Tvalues_ = xndarray(Tval, axes_names=['position']) Pvalues_ = xndarray(Probas, axes_names=['position']) Tvalues = Tvalues_.expand(mask, 'position', ['sagittal','coronal','axial']) Pvalues = Pvalues_.expand(mask, 'position', ['sagittal','coronal','axial']) #Computation of Pvalue using permutations #not possible to do this actually...it was used for group level stats #Pvalue_t = np.zeros(Betasval.shape) #for i in xrange(nb_vox): #Pvalue_t[i] = Permutation_test(Betasval[i], n_permutations=10000, \ #stat = stat_Tvalue, two_tailed=False, plot_histo=False) return Tvalues.data, Pvalues.data
def demo_Student_t_pdf(): x = np.linspace(-6, 6, 1000) plt.figure() colors = list('brkmcyg') * 3 for i, dof in enumerate([1, 2, 5, 10, 20, 50, 100]): plt.plot(x, t.pdf(x, dof), colors[i] + '-', lw=1, alpha=0.7, label='dof = ' + str(dof)) plt.legend() plt.title("Student's t distribution", fontsize=15)
def _p_value(self, t_value):
    """
    TODO: check whether the calculation is correct with t_value = parameter / std,
    and whether the p-value should use df equal to the total number of parameters
    rather than total observations minus total parameters
    :param t_value:
    :return: p-value of parameter
    """
    from scipy.stats import t as t_dist
    df = (self._ts_matrix.size) - (self._get_total_parameter() * len(self._wa_matrices))
    # TODO check calculation of p-values with degrees of freedom
    # two-sided p-value from the upper-tail probability
    return 2 * t_dist.sf(abs(t_value), df)
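# Hedged standalone sketch of the two-sided p-value computation referred to in
# the docstring above; the t statistic and degrees of freedom are illustrative.
from scipy.stats import t as t_dist

t_value, df = 2.3, 57
p_value = 2 * t_dist.sf(abs(t_value), df)   # equivalently 2 * (1 - t_dist.cdf(abs(t_value), df))
print(p_value)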
def test_multidim_student_t(self): from scipy.stats import t from cde.utils.distribution import multidim_t_pdf mu = 5 * np.ones(3) sigma = 3 * np.ones(3) dof = 6 x = np.random.uniform(-10, 10, size=(100, 3)) p1 = np.prod(t.pdf(x, loc=5, scale=3, df=dof), axis=-1) p2 = multidim_t_pdf(x, mu, sigma, dof) self.assertLessEqual(np.sum((p1 - p2)**2), 0.0001)
def plot_student(nu_values, x_range=[-8, 8], title=""):
    x = np.linspace(x_range[0], x_range[1], 101)   # honour the x_range argument
    plt.figure(figsize=(9, 7))
    for i, nu in enumerate(nu_values):
        # plt.plot(x, t.pdf(x, df=1, loc=1, scale=1), 'g-', label='nu=1')
        # plt.plot(x, t.pdf(x, df=5, loc=1, scale=1), 'r-', label='nu=5')
        plt.plot(x, t.pdf(x, df=nu, loc=1, scale=1),
                 colors[i] + '-', label='nu=%i' % (int(nu)))  # 'df' is the parameter nu
    plt.plot(x, norm.pdf(x, loc=1, scale=1), 'k-', label='normal')
    plt.legend(fontsize=15)
    plt.title(title, fontsize=16)
def parametric_Student_Portfolio(self, dof: int, confidenceLevel: float = 0.95) -> {}:
    riskMeasures = {}
    # portfolio-level moments (portfolioReturns assumed to be a single series)
    mu = np.mean(self.portfolioReturns)
    std = np.std(self.portfolioReturns)
    # left-tail critical value, as in the per-asset ES calculation
    xanu = t.ppf(1 - confidenceLevel, dof)
    riskMeasures['Portfolio VaR'] = round(
        mu + std * t.ppf(confidenceLevel, dof) * np.sqrt((dof - 2) / dof), 3)
    riskMeasures['Portfolio ES'] = round(
        (-1 / (1 - confidenceLevel)) * (1 - dof)**(-1) * (dof - 2 + xanu**2)
        * t.pdf(xanu, dof) * std - mu, 3)
    return riskMeasures
def predictive_postetior_tointegrate(mu, tau, x): M1 = r + self.discount_factor * x M2 = r**2 + 2 * self.discount_factor * r * x + self.discount_factor**2 * x**2 new_mean = (lamb * mean + M1) / (lamb + 1) new_lambda = lamb + 1 new_alpha = alpha + 0.5 new_beta = beta + 0.5 * (M2 - M1**2) + (lamb * (M1 - mean)**2) / (2 * (lamb + 1)) return normal_gamma_pdf(mu, tau, new_mean, new_lambda, new_alpha, new_beta) * \ t.pdf(x, 2 * alpha2, loc=mean2, scale=np.sqrt(beta2/alpha2 * (1 + 1./lamb2)))
def tskew_pdf(x, df, loc, scale, skew): """ Density function of the tskew distribution Based on the formula in Giot and Laurent (JAE 2003 pp. 650) - x = the value to evaluate - df: degrees of freedom (>1) - location: mean of the distribution - scale: standard deviation of the distribution - skew: skewness parameter (>0, if ==1: no skew, <1: left skew, >1 right) NB: I had to parametrize the formula differently to get consistent results """ cons = (2/(skew + (1/skew)))/scale norm_x = x-(loc/scale) if x < loc/scale : pdf = cons*t.pdf(skew*norm_x, df, loc=0, scale=1) # Symmetric t pdf elif x >= loc/scale: pdf = cons*t.pdf(norm_x/skew, df, loc=0, scale=1) # Symmetric t pdf else: raise ValueError('Incorrect parameters') return(pdf)
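# Quick hedged check of tskew_pdf above: with skew = 1 (and loc = 0, scale = 1)
# the skewed density should collapse to the ordinary Student-t density.
from scipy.stats import t

for x in (-1.5, 0.2, 2.0):
    print(tskew_pdf(x, df=5, loc=0, scale=1, skew=1), t.pdf(x, df=5))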
def update_from_experts(self, state, data): # compute return G_Q, G_U = data[0], data[1] epsilon = self.get_epsilon(state) G = (1.0 - epsilon) * G_Q + epsilon * G_U # update mu-hat and sigma^2-hat self.stat.update(G) mu, sigma2, t = self.stat.mean, self.stat.var, self.stat.count # update a_t and b_t a = self.a0 + t / 2 b = self.b0 + t / 2 * sigma2 + t / 2 * ( self.tau0 / (self.tau0 + t)) * (mu - self.mu0) * (mu - self.mu0) # compute e_t scale = (b / a)**0.5 e_u = tdist.pdf(G, df=2.0 * a, loc=G_U, scale=scale) e_q = tdist.pdf(G, df=2.0 * a, loc=G_Q, scale=scale) # update posterior self.post.update(e_u, e_q)
def gradient_scale(y: np.ndarray, location: np.ndarray, scale: np.ndarray, nu: np.ndarray, tau: np.ndarray, weights: np.ndarray): """Calculates Gradient of scale parameter. """ z = np.where(nu != 0, (((y / location)**nu - 1) / (nu * scale)), np.log(y / location) / scale) w = (tau + 1) / (tau + z**2) h = t.pdf(1 / (scale * np.abs(nu)), df=tau) / t.cdf( 1 / (scale * np.abs(nu)), df=tau) grad = (w * (z**2) - 1) / scale + h / (scale**2 * np.abs(nu)) grad = stabilize_derivative(grad, BCT.stabilize) grad = grad * (-1) * weights return grad
def predict(self, x1, x2): p1 = t.cdf((x1 - self.mu) / sqrt((self.beta * (self.n_null + 1)) / (self.alpha * self.n_null)), 2 * self.alpha) p2 = t.cdf((x2 - self.mu) / sqrt((self.beta * (self.n_null + 1)) / (self.alpha * self.n_null)), 2 * self.alpha) pges = round(100 * abs(p1 - p2), 3) #x = np.linspace(t.ppf(0.001, 2*self.alpha,loc=self.mu),t.ppf(0.999, 2*self.alpha,loc=self.mu), 100) x = np.linspace(self.mu-(t.ppf(0.99, 2*self.alpha,0))*sqrt(self.beta*(self.n_null+1)/(self.n_null*self.alpha)),self.mu+(t.ppf(0.99, 2*self.alpha,0))*sqrt(self.beta*(self.n_null+1)/(self.n_null*self.alpha)), 100) fig, ax = plt.subplots(1, 1) fillrange=np.linspace(x1,x2,100) ax.fill_between(fillrange, t.pdf(fillrange, df=2*self.alpha,loc=self.mu, scale=sqrt(self.beta*(self.n_null+1)/(self.n_null*self.alpha))),y2=0) ax.plot(x, t.pdf(x, df=2*self.alpha,loc=self.mu, scale=sqrt(self.beta*(self.n_null+1)/(self.n_null*self.alpha)))) prob_statement=('The probability to obtain an outcome between '+ str(x1) + ' and '+ str(x2) + ' with the current knowledge is: '+ str(pges) + '%') if not os.path.isdir('static'): os.mkdir('static') else: # Remove old plot files for filename in glob.glob(os.path.join('static', '*.png')): os.remove(filename) # Use time since Jan 1, 1970 in filename in order make # a unique filename that the browser has not chached plotfile = os.path.join('static', str(time.time()) + '.png') plt.savefig(plotfile) return plotfile,prob_statement
def marginal(sample_mu, sample_prec, n, cdf=False): sample_var = p2s(sample_prec) n = float(n) mu = (prior.precision * prior.mu + n * sample_mu) / (prior.precision + n) precision = prior.precision + n alpha = prior.alpha + n / 2. beta = (prior.beta + 0.5 * (n * sample_var) + ((prior.precision * n * ((sample_mu - prior.mu)**2)) / (2 * (prior.precision + n)))) tmu = mu tsigma = beta / (alpha * precision) if cdf: return lambda x: t.cdf(x, 2 * alpha, tmu, tsigma) return lambda x: t.pdf(x, 2 * alpha, tmu, tsigma)
def plot_pdf(self, arg=np.linspace(-2, 2, 100)): """Plot probability density function. Parameters ---------- arg : array Grid of point to evaluate PDF at """ scale = (self.eta/(self.eta-2))**.5 plt.plot(arg, t.pdf(arg, self.eta, scale=1/scale), label='t distribution') plt.plot(arg, self.pdf(arg), label='skew-t distribution') plt.legend() plt.show()
def calc_CVar(Df, alpha=0.05, dist='n'): rtn = Df / Df.shift(1) - 1 if dist == 'n': mu, sig = norm.fit(rtn.dropna().values) mu = mu * 252 sig = sig * 252**(0.5) CVar = alpha**(-1.) * norm.pdf(norm.ppf(alpha)) * sig - mu if dist == 't': nu, mu, sig = t.fit(rtn.dropna().values) mu = mu * 252 sig = sig * 252**(0.5) xanu = t.ppf(alpha, nu) CVar = -1. / alpha * (1 - nu)**(-1.) * (nu - 2 + xanu**2.) * t.pdf( xanu, nu) * sig - mu return CVar
def pdf(x, dof):
    '''
    Parameters
    ----------
    x: float, point at which to evaluate the density.
    dof: int, degrees of freedom, constant within each integration test case.

    Returns
    -------
    pdf: float, probability density at x.
    '''
    # uses the Student-t distribution from scipy.stats; evaluate the density at x
    return t.pdf(x, dof)
def tlocscale(x, mu, scale2, nu): """ shifted and scaled student-t pdf Arguments: x {list} -- nodes to evaluate density at mu {float} -- shift scale2 {float} -- scale nu {int} -- degrees of freedom Returns: list -- evaluations of pdf """ scale = np.sqrt(scale2) return student_t.pdf((x - mu) / scale, nu) / scale
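# Hedged check of tlocscale above: shifting and scaling the argument and dividing
# by the scale is the same location-scale family that scipy exposes through the
# loc/scale keywords, so the two evaluations should match. Values are illustrative.
import numpy as np
from scipy.stats import t as student_t

x = np.array([-2.0, 0.5, 3.0])
print(tlocscale(x, mu=1.0, scale2=4.0, nu=5))
print(student_t.pdf(x, 5, loc=1.0, scale=np.sqrt(4.0)))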
def ei_given_sample(self, sample, parameters_kernel, candidate_point): """ Correct See p. 1140. We compute (8) :param sample: :param parameters_kernel: :param candidate_point: :return: """ M = np.min(sample) n = len(sample) mc, c, chol, Z, bc = self.compute_mc_given_sample( sample, candidate_point, parameters_kernel) historical_points = self.gp.data['points'] candidate_vector = np.zeros( (len(self.domain_xe), historical_points.shape[1])) for j in range(len(self.domain_xe)): point = np.concatenate( (candidate_point, np.array(self.domain_xe[j]))) candidate_vector[j, :] = point cov_new = self.gp.evaluate_cov(candidate_vector, parameters_kernel) weights_matrix = self.weights.reshape((len(self.weights), 1)) Rc = np.dot(weights_matrix.transpose(), np.dot(cov_new, weights_matrix)) Rc -= np.dot(c, cho_solve(chol, c.transpose())) one = np.ones((2 * n, 1)) Rc += (1 - np.dot(c, cho_solve(chol, one))) ** 2 / \ (np.dot(one.transpose(), cho_solve(chol, one))) Zc = Z.reshape((len(Z), 1)) sigma_c = np.dot(Zc.transpose(), cho_solve(chol, Zc)) sigma_c -= (bc**2) * np.dot(one.transpose(), cho_solve(chol, one)) sigma_c /= (2.0 * n - 1) difference = M - mc sd = 1.0 / np.sqrt(Rc * sigma_c) component_1 = (M - mc) * t.cdf(difference * sd, 2 * n - 1) component_2 = t.pdf(difference * sd, 2 * n - 1) component_2 *= 1.0 / (2.0 * (n - 1)) component_2 *= (2.0 * n - 1) * np.sqrt(Rc * sigma_c) + (difference** 2) * sd result = component_1 + component_2 return result[0, 0]
def plot_pdf(self, arg=np.linspace(-2, 2, 100)): """Plot probability density function. Parameters ---------- arg : array Grid of point to evaluate PDF at """ scale = (self.eta / (self.eta - 2))**.5 plt.plot(arg, t.pdf(arg, self.eta, scale=1 / scale), label='t distribution') plt.plot(arg, self.pdf(arg), label='skew-t distribution') plt.legend() plt.show()
def plot_dsn(self, input_a=None, input_b=None): """Plots our p-value in a t-distribution""" if input_a: N = len(input_a) else: N = len(self.a) test_stat, df = self.testStat(input_a, input_b) fig, ax = plt.subplots(1, 1) x = np.linspace(t.ppf(0.01, df), t.ppf(0.99, df), 100) ax.plot(x, t.pdf(x, df), 'r-', lw=5, alpha=0.6, label='t pdf') ax.set_xlabel('Test Statistic Values') ax.set_ylabel('Probability') ax.set_title('T-Distribution with {}. D.o.F.'.format(df)) plt.axvline(test_stat, lw=7) plt.show()
def sampling_distribution(): fig, ax = plt.subplots(1, 1) #display the probability density function df = 10 x=np.linspace(t.ppf(0.01, df), t.ppf(0.99, df), 100) ax.plot(x, t.pdf(x, df)) #simulate the sampling distribution y = [] for i in range(1000): r = norm.rvs(loc=5, scale=2, size=df+1) rt =(np.mean(r)-5)/np.sqrt(np.var(r)/df) y.append(rt) ax.hist(y, normed=True, alpha=0.2) plt.savefig('sampling_distribution.png')
def marginal(samples, cdf=False): sample_mu, s = mean(samples), std(samples)**2. n = float(len(samples)) mu = (prior_precision * prior_mean + n * sample_mu) / (prior_precision + n) precision = prior_precision + n alpha = palpha + n / 2. beta = (pbeta + 0.5 * (sum( (samples - sample_mu)**2) + ((prior_precision * n * ((sample_mu - prior_mean)**2)) / (2 * (prior_precision + n))))) tmu = mu tsigma = beta / (alpha * precision) if cdf: return lambda x: t.cdf(x, 2 * alpha, tmu, tsigma**.5) return lambda x: t.pdf(x, 2 * alpha, tmu, tsigma**.5)
def pred_prob(self, data_list, x): """Compute UPM predictive probabilities. """ #x = data_list[-1]; print("data_list :",x, data_list) data_list = data_list[:-1] # The last data-point included manually on the following step. data_length = len(data_list); beta_list = [self.beta0] #if data_length > 1: #print("**", self.gamma_params) for ind in range(data_length): r_ind = data_length - ind - 1 beta = self.beta0 + 0.5*sum((np.array(data_list[r_ind:]) - np.average(np.array(data_list[r_ind:])))**2) \ + (ind + 1)*self.gamma_params[0]/(self.gamma_params[0] + (ind + 1)) * 0.5 * \ (np.average(np.array(data_list[r_ind:])) - self.mu_params[0])**2 beta_list.append(beta) #print("1 :",0.5*sum((np.array(data_list[r_ind:]) - np.average(np.array(data_list[r_ind:])))**2)) #print("2 :",(ind + 1)*self.gamma_params[ind]/(self.gamma_params[ind] + (ind + 1)) * 0.5) #print("3 :",(np.average(np.array(data_list[r_ind:])) - self.mu_params[0])**2) #print(data_list[r_ind:]) #print("* ", (np.array(data_list[r_ind:]) - np.average(np.array(data_list[r_ind:])))) #print("** ", sum((np.array(data_list[r_ind:]) - np.average(np.array(data_list[r_ind:])))**2)) #print(self.num) #print(np.array(self.alpha_params)) #print("beta_list : ", np.array(beta_list)) #print() d = lambda x, df, mu, std : t.pdf(x, df, mu, std) # T distribution for conservative modeling d_normal = lambda x, mu, std: norm.pdf(x, mu, std) # Normal distribution for general purpose modeling ArrayForReturn_t = np.array([d(x, 2*self.alpha_params[i], self.mu_params[i], np.sqrt(beta_list[i]*(self.gamma_params[i]+1)/(self.alpha_params[i]*self.gamma_params[i]))) \ for i in range(data_length+1)]) ArrayForReturn_norm = np.array([d_normal(x, self.mu_params[i], beta_list[i]*(self.gamma_params[i]+1)/(self.alpha_params[i]*self.gamma_params[i]))\ for i in range(data_length+1)]) #print("{}, Expectd Mu : {}, Expected STD : {}".format(self.num ,self.mu_params[-1] # ,math.sqrt(1/(self.alpha_params[-1]/beta_list[-1])) )) #print(" STE Mu : {}, STE Std {}".format(math.sqrt(beta_list[-1]/(self.gamma_params[-1]*(self.alpha_params[-1]+1))), # math.sqrt(beta_list[-1]*math.sqrt(math.sqrt(1.0/self.alpha_params[-1]))))) #print("E(std) : ", np.sqrt(np.array(beta_list)/self.alpha_params)) #print("E(mu) : ", self.mu_params) #print(np.array(beta_list)) #print(self.alpha_params) self.num += 1 #print() return ArrayForReturn_t
def plot_apprxn(S, batch, S_1, batch_1, vol_sup_node, no_tours, F_org_except_sup): bmap = brewer2mpl.get_map('Set2', 'qualitative', 7) colors = bmap.mpl_colors params = { 'axes.labelsize': 10, 'text.fontsize': 8, 'legend.fontsize': 10, 'xtick.labelsize': 10, 'ytick.labelsize': 10, 'text.usetex': False, 'figure.figsize': [6, 5] } plt.rcParams.update(params) ax=plt.axes(frameon=0) plt.grid() ax.set_xlabel('$\hat{\mu}(\mathcal{D}_m(S_n))$') S=np.array(S)*(vol_sup_node/(2*batch)) plt.hist(S, normed=True, alpha=0.5, color=colors[1],label='Empirical distribution') S_1=np.array(S_1)*(vol_sup_node/(2*batch_1)) m_0=0;mu_0=0;nu_0=0;sigma_0=1 n_stt= len(S_1)#31 #no_tours # "n" for student-t nu_tild=nu_0+n_stt S1_avg=np.mean(S_1) S1_sum=sum(S_1) mu_tild=(m_0*mu_0+n_stt*S1_avg)/(m_0+n_stt) temp_second_term=sum((S_1-S1_avg)**2) temp_third_term=(m_0*n_stt*(S1_avg-mu_0)**2)/(m_0+n_stt) sigma_tild=np.sqrt((nu_0*(sigma_0)**2+temp_second_term+temp_third_term)/((nu_0+n_stt)*(m_0+n_stt))) samples_temp = np.linspace(t.ppf(10**(-15), nu_tild, loc=mu_tild, scale=sigma_tild),t.ppf((1-10**(-15)), nu_tild, loc=mu_tild, scale=sigma_tild), 50**4) # samples_temp = np.linspace(4*10**9,8*10**9, 50**4) plt.plot(samples_temp, t.pdf(samples_temp, nu_tild, loc=mu_tild, scale=sigma_tild), color=colors[0], linewidth=2,linestyle='-',label='Approximate posterior') plt.axvline(x=F_org_except_sup, ymin=0, color="blue",alpha=0.75, label='True value', linewidth=2) legend=plt.legend(loc='best') frame = legend.get_frame() frame.set_facecolor('0.9') frame.set_edgecolor('0.75') plt.title('Friendster Network') plt.savefig('plot_HypRW.pdf') print "Saved the figure for Friendster graph in 'plot_HypRW.pdf'" print "Normalized percentage error in true value: ", 100*(F_org_except_sup-S1_avg)/F_org_except_sup
def gradient_nu(y: np.ndarray, location: np.ndarray, scale: np.ndarray, nu: np.ndarray, tau: np.ndarray, weights: np.ndarray): """Calculates Gradient of nu parameter. """ z = np.where(nu != 0, (((y / location)**nu - 1) / (nu * scale)), np.log(y / location) / scale) w = (tau + 1) / (tau + z**2) h = t.pdf(1 / (scale * np.abs(nu)), df=tau) / t.cdf( 1 / (scale * np.abs(nu)), df=tau) grad = ((w * z**2) / nu) - np.log(y / location) * (w * z**2 + ((w * z) / (scale * nu)) - 1) grad = grad + np.sign(nu) * h / (scale * nu**2) grad = stabilize_derivative(grad, BCT.stabilize) grad = grad * (-1) * weights return grad
def naiveBayes(xtrain, ytrain, xtest): ntrain = xtrain.shape[0] ntest = xtest.shape[0] log_prob = pd.DataFrame(0, index = np.arange(ntest), columns = [0, 1]) for y in [0, 1]: xtrain_y = xtrain[ytrain[0] == y] n_y = xtrain_y.shape[0] prob_y = (1 + n_y) / (ntrain + 2) log_condi_prob = pd.DataFrame(0, index = np.arange(ntest), columns = range(15)) for d in range(15): [df, loc, scale] = parameters(xtrain_y[d]) log_condi_prob[d] = np.log(t.pdf(xtest[d], df, loc, scale)) log_prob[y] = log_condi_prob.sum(axis = 1) + np.log(prob_y) prob = log_prob.apply(lambda x: np.exp(x), axis = 1) prob = prob.apply(lambda x: pd.Series([x[0]/x.sum(), x[1]/x.sum()]), axis = 1) pred = prob.apply(lambda x: 0 if x[0] > x[1] else 1, axis = 1) return(prob, pred)
def resid_density_plot(self): n = self.n p = self.p n0 = self.n0 studentized_residuals = self.residual_analysis['student_resid'] ll = np.linspace(studentized_residuals.min(), studentized_residuals.max(), n0) t_density = t.pdf(ll, df=n - p) plt.figure(figsize=(10, 5)) sns.histplot(studentized_residuals, stat='density', kde=True, label='kernel estimator') plt.plot(ll, t_density, 'r--', linewidth=2, label='t_density') plt.title("Histogram and t-Density for Studentized Residuals") plt.legend(loc='upper left')
def t_distribution(): fig, ax = plt.subplots(1, 1) # display the probability density function df = 10 x = np.linspace(-4, 4, 100) ax.plot(x, t.pdf(x, df)) # simulate the t-distribution y = [] for i in range(1000): rx = norm.rvs() ry = chi2.rvs(df) rt = rx / np.sqrt(ry / df) y.append(rt) ax.hist(y, normed=True, alpha=0.2) plt.savefig('t_distribution.png')
def t_distribution(): fig, ax = plt.subplots(1, 1) #display the probability density function df = 10 x=np.linspace(-4, 4, 100) ax.plot(x, t.pdf(x,df)) #simulate the t-distribution y = [] for i in range(1000): rx = norm.rvs() ry = chi2.rvs(df) rt = rx/np.sqrt(ry/df) y.append(rt) ax.hist(y, normed=True, alpha=0.2) plt.savefig('t_distribution.png')
def _post_fit(self, optimization_res, coeff_names, N, verbose=1):
    self.convergence = optimization_res['success']
    self.coeff_ = optimization_res['x']
    self.stderr = np.sqrt(np.diag(optimization_res['hess_inv']))
    self.zvalues = self.coeff_ / self.stderr
    # two-tailed test: p-value is twice the upper-tail probability
    self.pvalues = 2 * t.sf(np.abs(self.zvalues), df=N)
    self.loglikelihood = -optimization_res['fun']
    self.coeff_names = coeff_names
    self.total_iter = optimization_res['nit']
    if self.convergence and verbose > 0:
        print("Estimation successfully completed after {} iterations. "
              "Use .summary() to see the estimated values".format(
                  self.total_iter))
    if not self.convergence and verbose > 0:
        print("**** The optimization did not converge after {} "
              "iterations. ****".format(self.total_iter))
        print("Message: " + optimization_res['message'])
def get_dists(lr, lb): kde_dict = dict() t_dict = dict() norm_dict = dict() for i in range(3): temp = lr[lb == i] xr = np.linspace(lr.min(), lr.max(), 1000) kde = gaussian_kde(temp) y = kde(xr) kde_dict[str(i)] = y t_pdf = t.pdf(xr, *t.fit(temp)) n_pdf = norm.pdf(xr, *norm.fit(temp)) t_dict[str(i)] = t_pdf norm_dict[str(i)] = n_pdf return xr, kde_dict, t_dict, norm_dict
def estimate_responsiveness(plan_df): elec_results = plan_df[['2008', '2012', '2016']].values state_year_results = elec_results.mean(axis=0) state_vote_t = t(df=2, loc=state_year_results.mean(), scale=state_year_results.std(ddof=1)) truncation_factor = 1 / (state_vote_t.cdf(1) - state_vote_t.cdf(0)) district_share = district_votes_given_state_vote(plan_df) state_share = district_share.mean(axis=0) district_std = elec_results.std(axis=1, ddof=1) district_std = np.tile(district_std, (len(state_share), 1)).T vote_seat_slope = np.nan_to_num( t.pdf(.5, df=2, loc=district_share, scale=district_std)).mean(axis=0) return simps( vote_seat_slope * state_vote_t.pdf(state_share) * truncation_factor, state_share)
def _post_fit(self, optimization_res, coeff_names, sample_size, verbose=1):
    self.convergence = optimization_res['success']
    self.coeff_ = optimization_res['x']
    self.stderr = np.sqrt(np.diag(optimization_res['hess_inv']))
    self.zvalues = self.coeff_ / self.stderr
    # two-tailed p-value from the upper-tail probability
    self.pvalues = 2 * t.sf(np.abs(self.zvalues), df=sample_size)
    self.loglikelihood = -optimization_res['fun']
    self.coeff_names = coeff_names
    self.total_iter = optimization_res['nit']
    self.estim_time_sec = time() - self._fit_start_time
    self.sample_size = sample_size
    self.aic = 2 * len(self.coeff_) - 2 * self.loglikelihood
    self.bic = np.log(sample_size) * len(self.coeff_) - 2 * self.loglikelihood
    if not self.convergence and verbose > 0:
        print("**** The optimization did not converge after {} "
              "iterations. ****".format(self.total_iter))
        print("Message: " + optimization_res['message'])
def student(x, normf, mu, sig, skew, nu): return map ( lambda y: 2.*t.pdf((y-mu)/sig, nu)*t.cdf(skew*((y-mu)/sig)*np.sqrt((nu+1.0)/(nu+(y-mu)*(y-mu)/sig/sig)),(nu+1.0))/sig*normf if ( (y <3.05) & (y>2.0) ) else 0., x)
from scipy.stats import t print(t.pdf(2,3))
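# Hedged cross-check of the call above against the closed-form Student-t density
# f(x; v) = Gamma((v+1)/2) / (sqrt(v*pi) * Gamma(v/2)) * (1 + x**2/v)**(-(v+1)/2),
# evaluated at x = 2 with v = 3 degrees of freedom.
import numpy as np
from scipy.special import gamma

x, v = 2.0, 3.0
closed_form = gamma((v + 1) / 2) / (np.sqrt(v * np.pi) * gamma(v / 2)) * (1 + x**2 / v)**(-(v + 1) / 2)
print(closed_form)  # should match t.pdf(2, 3)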
# Enter your code here. X = pisa["year"] Xbar = X.mean() ssX = ((X - Xbar)**2).sum() n = pisa.shape[0] s2b1 = SSE/(ssX * (n-2)) ## 11. T-Distribution ## from scipy.stats import t # 100 values between -3 and 3 x = np.linspace(-3,3,100) # Compute the pdf with 3 degrees of freedom print(t.pdf(x=x, df=3)) ## 12. Statistical Significance of Coefficients ## # The variable s2b1 is in memory. The variance of beta_1 tstat = linearfit.params["year"]/(s2b1**0.5) ## 13. The P-Value ## # At the 95% confidence interval for a two-sided t-test we must use a p-value of 0.975 pval = 0.975 # The degrees of freedom df = pisa.shape[0] - 2 # The probability to test against
def confidence_interval():
    # --------------------------------------
    # 95% confidence interval - Student's t.
    # sigma unknown - Anderson 2008 p. 310.
    # http://adventuresinpython.blogspot.com.br/2012/12/confidence-intervals-in-python.html
    # Levine p. 292, exercise 8.14.

    s = np.array([1, 2, 3, 4, 5, 6, 20])

    # extract parameters from the vector.
    n, min_max, mean, var, skew, kurt = stats.describe(s)
    print '\ndifferent: var extracted by stats.describe(s) vs np.var(s)'
    print 'denominator (n-1) = ', str(var), ', denominator (n) = ', str(np.var(s)), '\n'

    # set parameters.
    s_amostra = math.sqrt(var)
    n_amostra = n
    mu_amostra = mean
    alpha = 0.05

    # compute the t scores for the critical points:
    # the percent point function (ppf) inverts the cdf,
    # i.e. it gives the x value of the point on the Student curve;
    # (n_amostra - 1) is the number of degrees of freedom.
    # norm.ppf corresponds to norminv and norm.pdf to normpdf.
    tinf = t.ppf(alpha/2, n_amostra-1)
    tsup = t.ppf(1-(alpha/2), n_amostra-1)

    xis_inf = mu_amostra + (tinf * (s_amostra / math.sqrt(n_amostra)))
    xis_sup = mu_amostra + (tsup * (s_amostra / math.sqrt(n_amostra)))
    print xis_inf, xis_sup

    # ---------------------------------------------
    # draw the curve based on the normal distribution.
    # ---------------------------------------------
    mu = mu_amostra
    sigma = s_amostra
    limite_inferior = xis_inf
    limite_superior = xis_sup

    xa = np.linspace(mu-(4*sigma), limite_inferior)
    xb = np.linspace(limite_inferior, limite_superior)
    xc = np.linspace(limite_superior, mu+(4*sigma))
    x = np.concatenate((xa, xb, xc), axis=0)

    ya = norm.pdf(xa, mu, sigma)
    yb = norm.pdf(xb, mu, sigma)
    yc = norm.pdf(xc, mu, sigma)
    y = np.concatenate((ya, yb, yc), axis=0)

    pf_plota.plotar('Confidence Interval', 't student scores', 'x bar', \
                    'probability', 'pdf', x, y, limite_inferior, \
                    limite_superior, sigma, mu, xa, ya, xc, yc, alpha, np.array([]), n_amostra)

    # ----------------------------------------------
    # draw the curve based on the Student distribution.
    # ----------------------------------------------
    mu = mu_amostra
    sigma = s_amostra
    limite_inferior = tinf
    limite_superior = tsup

    xa = np.linspace(-4, limite_inferior)
    xb = np.linspace(limite_inferior, limite_superior)
    xc = np.linspace(limite_superior, 4)
    x = np.concatenate((xa, xb, xc), axis=0)

    ya = t.pdf(xa, n_amostra-1)
    yb = t.pdf(xb, n_amostra-1)
    yc = t.pdf(xc, n_amostra-1)
    y = np.concatenate((ya, yb, yc), axis=0)

    za = norm.pdf(xa)
    zb = norm.pdf(xb)
    zc = norm.pdf(xc)
    z = np.concatenate((za, zb, zc), axis=0)

    pf_plota.plotar('Confidence Interval', 't student scores', 'x bar', \
                    'probability', 'pdf', x, y, limite_inferior, \
                    limite_superior, sigma, mu, xa, ya, xc, yc, alpha, z, n_amostra)
a = np.random.randn(30) outliers = np.array([8, 8.75, 9.5]) pl.hist(a, 7, weights=[1 / 30] * 30, rwidth=0.8) #fit without outliers x = np.linspace(-5, 10, 500) loc, scale = norm.fit(a) n = norm.pdf(x, loc=loc, scale=scale) loc, scale = laplace.fit(a) l = laplace.pdf(x, loc=loc, scale=scale) fd, loc, scale = t.fit(a) s = t.pdf(x, fd, loc=loc, scale=scale) pl.plot(x, n, 'k>', x, s, 'r-', x, l, 'b--') pl.legend(('Gauss', 'Student', 'Laplace')) pl.savefig('robustDemo_without_outliers.png') #add the outliers pl.figure() pl.hist(a, 7, weights=[1 / 33] * 30, rwidth=0.8) pl.hist(outliers, 3, weights=[1 / 33] * 3, rwidth=0.8) aa = np.hstack((a, outliers)) loc, scale = norm.fit(aa) n = norm.pdf(x, loc=loc, scale=scale)
counts[k] -= 1 # If this was the last data point in this cluster, delete it if counts[k] == 0: del counts[k] del clusters[k] # Update the index of all the other assignments assignments[assignments > k] -= 1 # Calculate the weight for a new cluster # See Escobar and West (1995) for details on why this is the weight. # See the Wikipedia page on conjugate priors for the form of the Student's t # distribution. new_cluster_posterior = cluster_prior.posterior(np.array([y])) t_scale = new_cluster_posterior.b * (new_cluster_posterior.nu + 1) / (new_cluster_posterior.a * new_cluster_posterior.nu) new_cluster_weight = ALPHA * t.pdf(y, 2. * new_cluster_posterior.a, loc=new_cluster_posterior.mu, scale=t_scale) # Calculate the weight for all the other clusters z = [counts[k] * norm.pdf(y, kmean, kstdev) for k,(kmean, kstdev) in enumerate(clusters)] z.append(new_cluster_weight) weights = np.array(z) # Draw a new assignment proportional to the cluster weights k = weighted_sample(weights) assignments[i] = k # If we sampled a new cluster if k == len(clusters): # We need to sample the parameters from the prior # TODO: should we instead sample from the posterior with the one sample? kmean, kstdev = cluster_prior.sample()
mu = 0.                       # the mean, mu
nus = [1., 2., 5, 10, 100]    # degrees of freedom, nu
markers = ['b-', 'r-', 'm-', 'c-', 'g-']

x = np.linspace(-6, 6, 1000)  # x

# set plot to render labels using latex
pl.rc('text', usetex=True)
pl.rc('font', family='serif')
pl.rc('font', size=14)

fig = pl.figure(figsize=(6, 5), dpi=100)

# plot pdfs
for i, nu in enumerate(nus):
    pl.plot(x, t.pdf(x, nu), markers[i], label='$\\nu=%d$' % nu)

# plot a Gaussian for comparison
pl.plot(x, norm.pdf(x, mu, 1.), 'k--', label='$N(0,1)$')

ax = pl.gca()
ax.set_xlabel('$t$', fontsize=14)
ax.set_ylabel('$p(t)$', fontsize=14)
ax.legend(loc='best', frameon=False)
fig.subplots_adjust(bottom=0.15)

pl.savefig('../studentst.pdf')
pl.show()
def test_sprot(): algn = read_free(sprot_file) # truncate alignments to sequence positions with # gap frequency no greater than 20% - to avoid over-representation of gaps # alignments = truncate(algn, FRAC_ALPHA_CUTOFF) # print alignments.shape pdb_res_list = read_pdb(SPROT_PDB_FILE, 'E') msa_algn = msa_search(pdb_res_list, algn) print msa_algn sca_algn = sca(algn) algn_shape = get_algn_shape(algn) no_pos = algn_shape.no_pos no_seq = algn_shape.no_seq no_aa = algn_shape.no_aa print 'Testing SCA module :' print 'algn_3d_bin hash :' + str(np.sum(np.square(sca_algn.algn_3d_bin))) print 'weighted_3d_algn hash :' +\ str(np.sum(np.square(sca_algn.weighted_3d_algn))) print 'weight hash : ' + str(np.sum(np.square(sca_algn.weight))) print 'pwX hash : ' + str(np.sum(np.square(sca_algn.pwX))) print 'pm hash : ' + str(np.sum(np.square(sca_algn.pm))) print 'Cp has : ' + str(np.sum(np.square(sca_algn.Cp))) print 'Cs hash : ' + str(np.sum(np.square(sca_algn.Cs))) spect = spectral_decomp(sca_algn, 100) print 'spect lb hash : ' + str(np.sum(np.square(spect.pos_lbd))) print 'spect ev hash : ' + str(np.sum(np.square(spect.pos_ev))) print 'spect ldb_rnd hash : ' + str(np.sum(np.square(spect.pos_lbd_rnd))) print 'spect ev hash : ' + str(np.sum(np.square(spect.pos_ev_rnd))) svd_output = LA.svd(sca_algn.pwX) U = svd_output[0] sv = svd_output[1] V = svd_output[2] # perform independent components calculations kmax = 8 learnrate = 0.0001 iterations = 20000 w = ica(transpose(spect.pos_ev[:, 0:kmax]), learnrate, iterations) ic_P = transpose(dot(w, transpose(spect.pos_ev[:, 0:kmax]))) print "ic_P hash :" + str(mat_sum(square(ic_P))) # calculate the matrix Pi = U*V' # this provides a mathematical mapping between # positional and sequence correlation n_min = min(no_seq, no_pos) Pi = dot(U[:, 0:n_min-1], transpose(V[:, 0:n_min-1])) U_p = dot(Pi, spect.pos_ev) p_cutoff = 0.9 nfit = 3 cutoffs = zeros((nfit, 1)) sector_def = [] for i in range(0, nfit): nu, mu, sigma = t.fit(ic_P[:, i]) q75, q25 = percentile(ic_P[:, i], [75, 25]) iqr = q75 - q25 binwidth = 2*iqr*pow(size(ic_P[:, i]), -1/3.0) # Freedman-Diaconisrule nbins = round(ptp(ic_P[:, i])/binwidth) yhist, xhist = histogram(ic_P[:, i], nbins) x_dist = arange(min(xhist), max(xhist), (max(xhist) - min(xhist))/100) cdf_jnk = t.cdf(x_dist, nu, mu, sigma) pdf_jnk = t.pdf(x_dist, nu, mu, sigma) maxpos = argmax(pdf_jnk) tail = zeros((1, size(pdf_jnk))) if abs(max(ic_P[:, i])) > abs(min(ic_P[:, i])): tail[:, maxpos:] = cdf_jnk[maxpos:] else: tail[0:maxpos] = cdf_jnk[0:maxpos] x_dist_pos = argmin(abs(tail - p_cutoff)) cutoffs[i] = x_dist[x_dist_pos] sector_def.append(array(where(ic_P[:, i] > cutoffs[i])[0])[0]) print sector_def
def sampling_distribution(): fig, ax = plt.subplots(1, 1) #display the probability density function df = 50 x=np.linspace(t.ppf(0.01, df), t.ppf(0.99, df), 100) ax.plot(x, t.pdf(x, df))
#!/usr/bin/env python

import numpy as np
import matplotlib.pyplot as pl
from scipy.stats import t, laplace, norm

x = np.linspace(-4, 4, 100)
n = norm.pdf(x, loc=0, scale=1)
l = laplace.pdf(x, loc=0, scale=1 / (2 ** 0.5))
s = t.pdf(x, df=1, loc=0, scale=1)  # keep scipy's name 't' unshadowed

# plot the densities against x
pl.plot(x, n, 'k:', x, s, 'b--', x, l, 'r-')
pl.legend(('Gauss', 'Student', 'Laplace'))
pl.savefig('studentLaplacePdfPlot_1.png')

pl.figure()
pl.plot(x, np.log(n), 'k:', x, np.log(s), 'b--', x, np.log(l), 'r-')
pl.legend(('Gauss', 'Student', 'Laplace'))
pl.savefig('studentLaplacePdfPlot_2.png')
pl.show()
Y = Data[1]  # pull out the dependent variables
Coefficients = np.linalg.inv(X.T.dot(X)).dot(X.T.dot(Y))  # use the classic OLS formula
print(Coefficients)  # compare the estimated coefficients with the real coefficients

# another way to do this using mle:
X = Data[0]  # pull out the explanatory variables and transpose it
Y = Data[1]  # pull out the dependent variables
b = np.random.uniform(size=len(X[0]) + 1) * 0.1  # generate random starting values
Coefficients = minimize(OLS_mle, x0=b, args=(X, Y), method='BFGS').x  # optimize the adaptive lasso

# compute standard errors
s2 = sum((Y - X.dot(Coefficients[0:n_vars+1]))**2) / (N - (n_vars + 1))  # estimate s2
var_hat = s2 * np.linalg.inv(X.T.dot(X))  # compute var hat
Standard_Error = (np.diag(var_hat))**0.5  # compute the standard errors
t_stat = Coefficients[0:n_vars+1] / Standard_Error  # compute the t statistics for each variable
p_values = 2 * t.sf(abs(t_stat), df=N - 1)  # two-sided p-values from the upper-tail probability
tm(Coefficients, Standard_Error, t_stat, p_values)  # call the table making function to compute the table

# compare r2 as the number of features goes up
r2 = np.zeros(6)  # an array to hold the r2s
r2_adj = np.zeros(6)  # an array to hold the r2_adjs
n_f = np.array([2, 20, 40, 60, 80, 100])  # an array of # of features
for i in range(len(n_f)):  # loop over features
    r2r2_adj = n_features(n_f[i])  # run model and extract r2
    r2[i] = r2r2_adj[0]  # pull out r2
    r2_adj[i] = r2r2_adj[1]  # pull out r2_adj
# use np.abs to get upper tail p = st.distributions.t.sf(np.abs(t), df) * 2 print("Probability of sample outcome by chance: ", p) alpha = 0.05 if p < alpha: print("Significant") else: print("Not signficant") # <codecell> from scipy.stats import t x = np.linspace(t.ppf(0.0001, df), t.ppf(0.9999, df), 100) plt.plot(x, t.pdf(x, df), color=isseorange, alpha=0.9, label='t pdf') plt.fill_between(x, t.pdf(x, df), facecolor=isseorange, alpha = 0.4) plt.xlabel('Probabilty distribution over t values') plt.legend(loc='best', frameon=False)# plt.title('Degrees of freedom 298') plt.savefig('student-t.pdf') # <codecell> x = np.linspace(t.ppf(9.92242823716e-161, df), t.ppf(0.999999999, df), 100) plt.plot(x, t.pdf(x, df), color=isseorange, alpha=0.9, label='t pdf') plt.fill_between(x, t.pdf(x, df), facecolor=isseorange, alpha = 0.4) plt.xlabel('Probabilty distribution over t values') plt.legend(loc='best', frameon=False) plt.savefig('student-t2.pdf')
def studentv(x, normf, mu, sig, skew, nu): if ( (x <3.05) & (x>2.0) ): return 2.*t.pdf((x-mu)/sig, nu)*t.cdf(skew*((x-mu)/sig)*np.sqrt((nu+1.0)/(nu+(x-mu)*(x-mu)/sig/sig)),(nu+1.0))/sig*normf else: return 0.
# 
# The density functions of the t-distribution are used in significance testing. The probability density function (pdf) models the relative likelihood of a continuous random variable. The cumulative density function (cdf) models the probability of a random variable being less than or equal to a point. The degrees of freedom (df) accounts for the number of observations in the sample. In general the degrees of freedom will be equal to the number of observations minus 2. Say we had a sample with just 2 observations: we could fit a line through them perfectly, leaving no error in the model. To account for this we subtract 2 from the number of observations when computing the degrees of freedom.
# 
# SciPy provides functions in scipy.stats.t which can be used to compute the pdf and cdf of the t-distribution for any number of degrees of freedom. scipy.stats.t.pdf(x, df) evaluates the pdf at x with df degrees of freedom.

# In[11]:

from scipy.stats import t
import matplotlib.pyplot as plt
get_ipython().magic('matplotlib inline')

# 100 values between -3 and 3.
x = np.linspace(-3, 3, 100)

# Compute the pdf with 3 degrees of freedom.
print("t.pdf:\n", t.pdf(x=x, df=3))

# Pdf with 3 degrees of freedom.
tdist3 = t.pdf(x=x, df=3)

# Pdf with 30 degrees of freedom.
tdist30 = t.pdf(x=x, df=30)

# Plot pdfs
plt.plot(x, tdist3)
plt.plot(x, tdist30)

# ###12: Statistical Significance of Coefficients

# Now that we know what the t-distribution is, we can use it for significance testing. To do significance testing we must first state our hypotheses. We want to test whether the lean of the tower depends on the year, i.e. whether the tower leans by a certain amount every year. This is done by setting null and alternative hypotheses. In our case the null hypothesis is that the lean of the Tower of Pisa does not depend on the year, meaning the coefficient equals zero. The alternative hypothesis is that the lean of the tower does depend on the year, i.e. the coefficient is not equal to zero. These are written mathematically as,
# 
# $H_0: \beta_1=0$
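# Illustrative sketch of the test described above (the numbers are made up, not
# the Pisa results): under H_0 the standardized coefficient beta_1_hat / SE follows
# a t-distribution with df degrees of freedom, and the two-sided p-value is twice
# the upper-tail area beyond the observed statistic.
from scipy.stats import t

beta1_hat, se_beta1, df = 0.0009, 0.0002, 11   # hypothetical estimate, standard error, dof
t_stat = beta1_hat / se_beta1
p_value = 2 * t.sf(abs(t_stat), df=df)
print(t_stat, p_value)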
## 10. Variance of Coefficients ##

# Enter your code here.
# var(b1) = SSE / ((n - 2) * sum((x - xbar)**2))
s2b1 = SSE / ((pisa.shape[0] - 2) * ((pisa['year'] - pisa['year'].mean())**2).sum())
print(s2b1)

## 11. T-Distribution ##

from scipy.stats import t
# 100 values between -3 and 3
x = np.linspace(-3,3,100)

# Compute the pdf with 3 degrees of freedom
print(t.pdf(x=x, df=3))
plt.plot(x, t.pdf(x=x, df=3))
plt.show()

## 12. Statistical Significance of Coefficients ##

# The variable s2b1 is in memory.  The variance of beta_1
tstat = abs(linearfit.params.year) / (s2b1) ** (1/2)
print(tstat)

## 13. The P-Value ##

# At the 95% confidence interval for a two-sided t-test we must use a p-value of 0.975
pval = 0.975
gl.plot(x_grid, y_values, color = color, fill = 1, alpha = 0.1) gl.set_zoom(ax = ax1, X = ret1,xlimPad = [0.1,0.1]) gl.savefig(folder_images +'InitPointsInferenceDaysEstimation.png', dpi = 100, sizeInches = [10, 4]) if (t_distribution_graph): gl.init_figure() x_grid = np.linspace(-4,4,100) dfs = [1,3,5,26] for df in dfs: t_pdf = t.pdf(x_grid, df) color = gl.get_color() ax1 = gl.plot(x_grid, t_pdf, alpha = 1, lw = 3, AxesStyle = "Normal", legend = ["df %i"%df],color = color, labels = ["t-distribution","t","pdf(t)"]) color = "k"; x_grid, y_values = bMA.gaussian1D_points(mean = 0, std = 1, num = 100, x_grid = x_grid) gl.plot(x_grid, y_values, alpha = 0.1, lw = 3, AxesStyle = "Normal", legend = ["Guassian"],color = color, fill = 1) gl.set_zoom(ax = ax1, X = x_grid,xlimPad = [0.1,0.1]) gl.savefig(folder_images +'t-distribution.png', dpi = 100, sizeInches = [14,6])
p_r = np.zeros(shape=(n + 1, n)) # allocate memory to store the entries of p(r_t|x_{1:t}) map_p_r = np.empty(shape=(n,)) # allocate memory to store the MAP estimate of p(r_t|x_{1:t}) # BOCPD initialization k_n = np.array([k_0]) alpha_n = np.array([alpha_0]) mu_n = np.array([mu_0]) beta_n = np.array([beta_0]) x_sum = np.array([0.0]) x2_sum = np.array([0.0]) p_r_x = np.array([1.0]) # p(r_t, x_{1:t}), t = 0 := p(r_0 = 0) = 1 # start BOCPD loop for i, x_i in enumerate(x): # observe new datum # compute the predictive probabilities p(x_t|r_{t-1}, x_{t-r:t-1}) p_x = student_t.pdf(x_i, 2.0 * alpha_n, mu_n, np.sqrt(beta_n * (k_n + 1.0) / (alpha_n * k_n))) # compute the growth probabilities p(r_t != 0, x_{1:t}) p_rx_x = (1.0 - 1.0 / l) * p_x * p_r_x # compute the changepoint probability, p(r_t = 0, x_{1:t}) p_r0_x = (1.0 / l) * np.dot(p_x, p_r_x) # update the probability distribution p(r_t, x_{1:t}) and normalize it to obtain # p(r_t|x_{1:t}) p_r_x = np.append(p_r0_x, p_rx_x) p_r_x = p_r_x / np.sum(p_r_x) # keep the result in memory p_r[0 : i + 2, i] = p_r_x # p(r_t|x_{1:t}) map_p_r[i] = p_r_x.argmax() # argmax r_t p(r_t|x_{1:t})
# VARIANCE OF COEFFICIENTS # Compute SSE SSE = np.sum((y.values - yhat)**2) # Compute variance in X xvar = np.sum((pisa.year - pisa.year.mean())**2) # Compute variance in b1 s2b1 = (SSE / (y.shape[0] - 2)) / xvar # T-DISTRIBUTION from scipy.stats import t # 100 values between -3 and 3 x = np.linspace(-3,3,100) # Compute the pdf with 3 degrees of freedom print(t.pdf(x=x, df=3)) # Pdf with 3 degrees of freedom tdist3 = t.pdf(x=x, df=3) # Pdf with 30 degrees of freedom tdist30 = t.pdf(x=x, df=30) # Plot pdfs plt.plot(x, tdist3) plt.plot(x, tdist30) # STATISTICAL SIGNIFICANCE OF COEFFICIENTS tstat = linearfit.params["year"] / np.sqrt(s2b1) # P-VALUE # At the 95% confidence interval for a two-sided t-test we must use a p-value of 0.975 pval = 0.975