def test_null_constrained():
    """Check that NullDistribution leaves non-estimated parameters fixed.

    Every combination of the three ``estimate_*`` switches is fitted to the
    same Z-score sample.  A parameter whose switch is off must stay at its
    default (mean 0, sd 1, null proportion 1), and the fitted ``pdf`` must
    agree with a normal density using the fitted mean and sd.
    """
    # Mixed population of Z-scores: 1000 standard normal quantiles plus
    # 20 values spread evenly between 3 and 4.
    null_part = norm.ppf(np.linspace(0.001, 0.999, 1000))
    shifted_part = np.linspace(3, 4, 20)
    zs = np.concatenate((null_part, shifted_part))

    switch = (False, True)
    settings = [(use_mean, use_scale, use_prob)
                for use_mean in switch
                for use_scale in switch
                for use_prob in switch]

    for estimate_mean, estimate_scale, estimate_prob in settings:
        emp_null = NullDistribution(zs,
                                    estimate_mean=estimate_mean,
                                    estimate_scale=estimate_scale,
                                    estimate_null_proportion=estimate_prob)

        if not estimate_mean:
            assert_allclose(emp_null.mean, 0, atol=1e-5, rtol=1e-5)
        if not estimate_scale:
            assert_allclose(emp_null.sd, 1, atol=1e-5, rtol=1e-2)
        if not estimate_prob:
            assert_allclose(emp_null.null_proportion, 1, atol=1e-5,
                            rtol=1e-2)

        # consistency check
        fitted_density = emp_null.pdf(np.r_[-1, 0, 1])
        reference_density = norm.pdf(np.r_[-1, 0, 1],
                                     loc=emp_null.mean,
                                     scale=emp_null.sd)
        assert_allclose(fitted_density, reference_density, rtol=1e-13)
def smooth_PSTHs(PSTHs, smooth_sd=0.5):
    """Smooth every row of ``PSTHs`` with a unit-sum Gaussian kernel.

    Parameters
    ----------
    PSTHs : 2-D ndarray
        One histogram per row; the second axis is the bin axis.
    smooth_sd : float
        Standard deviation of the Gaussian kernel, in bins.

    Returns
    -------
    ndarray
        Array of the same shape as ``PSTHs`` holding the row-wise
        ``np.convolve(..., 'same')`` results.
    """
    _, n_bins = PSTHs.shape

    # Kernel support has one sample per bin; the "+ 1" shift was found by the
    # original author to remove an offset introduced by the convolution.
    support = np.linspace(0, n_bins, n_bins, endpoint=False)
    support = support - n_bins/2. + 1

    weights = norm.pdf(support, scale=smooth_sd)
    weights /= weights.sum()

    return np.apply_along_axis(np.convolve, 1, PSTHs, weights, 'same')
def arma_likelihood(time_series, phis=array([]), thetas=array([]), mu=0.,
                    sigma=1.):
    """
    Return the log-likelihood of the ARMA model parameters, given the time
    series.

    The model is put in state-space form, filtered with ``kalman`` and the
    Gaussian log-density of every observation under its filtered conditional
    mean/variance is accumulated.

    Parameters
    ----------
    time_series : ndarray of shape (n,1)
        The time series in question
    phis : ndarray of shape (p,)
        The phi parameters
    thetas : ndarray of shape (q,)
        The theta parameters
    mu : float
        The parameter mu
    sigma : float
        The parameter sigma

    Returns
    -------
    log_likelihood : float
        The log-likelihood of the model
    """
    F, Q, H, dim_states, dim_time_series = state_space_rep(phis, thetas, mu,
                                                           sigma)
    mus, covs = kalman(F, Q, H, time_series - mu)

    likelihood = 0.
    # range() instead of xrange() so the function also runs on Python 3.
    for i in range(len(mus)):
        cond_mu = H.dot(mus[i])
        cond_sigma = H.dot(covs[i].dot(H.T))
        # logpdf evaluates the log-density directly; log(pdf(...)) underflows
        # to -inf as soon as an observation is far in the tail.
        likelihood += norm.logpdf(time_series[i] - mu, loc=cond_mu,
                                  scale=sqrt(cond_sigma))
    return float(likelihood)
def post_l(l, x, d, alpha, sigma):
    """Return the product of normal densities of ``l``.

    The normal distribution has as mean the average of
    ``cos(alpha)*d[1] + sin(alpha)*d[0]`` and as standard deviation
    ``sqrt(cos(alpha)*x[1] + sin(alpha)*x[0])``.

    NOTE: ``sigma`` is accepted but not used by the current implementation
    (the original author left a "sigma?" question mark here).
    """
    cos_a = np.cos(alpha)
    sin_a = np.sin(alpha)

    centre = (cos_a*(d[1]) + sin_a*(d[0])).mean()
    spread = np.sqrt(cos_a*(x[1]) + sin_a*(x[0]))

    densities = norm.pdf(l, centre, spread)
    return np.prod(densities)
def eval_fun(abc, pop, *args):
    """Score a population of parameter vectors against observed data.

    Each row of ``pop`` is ``(a, b, c, scale)``.  The model is an inverse
    exponential with offset, ``y = a * exp(b / x) + c``, evaluated for all
    individuals at once by broadcasting.

    Parameters
    ----------
    abc : object
        Only ``abc._save_posts`` is read; when truthy the predictions are
        returned alongside the weights.
    pop : ndarray of shape (n_individuals, >=4)
        Columns 0-2 are the model parameters, column 3 the kernel scale.
    *args
        ``args[0]`` is the 1-D array of x values, ``args[1]`` the 1-D array
        of observed y values.

    Returns
    -------
    weights : ndarray of shape (n_individuals,)
        Log-density of each individual's sum of squared errors under a
        zero-mean normal kernel with that individual's scale.
    pred : ndarray of shape (n_individuals, n_points)
        Only returned when ``abc._save_posts`` is truthy.
    """
    x_obs = args[0][np.newaxis, :]
    y_obs = args[1][np.newaxis, :]

    # evaluate the population with some broadcasting
    pred = (pop[:, 0][:, np.newaxis] *
            np.exp(pop[:, 1][:, np.newaxis] / x_obs) +
            pop[:, 2][:, np.newaxis])
    errors = pred - y_obs

    # sum of squared error
    sse = np.sum(errors * errors, 1)

    # Weight with a normal kernel.  logpdf is used instead of log(pdf(...)):
    # a poor individual has a huge SSE, for which pdf underflows to 0 and the
    # log becomes -inf, losing the ranking between bad individuals.
    weights = norm.logpdf(sse, scale=pop[:, 3])

    # see if return both weights and predicted vals
    if abc._save_posts:
        return weights, pred
    return weights
def _updateInternalSamplingScore(self, amount, location, width): ''' Updating process is performed as follows: Normal distribution probability density is calculated such as its peak is located at `location`, its variance is defined by `width` such that the area under the PDF curve equals the absolute value of `amount`. The resulting curve is then added to the score ''' values = norm.pdf(self.x, location, width) * amount self._scores = [v1 + v2 for (v1, v2) in zip(self._scores, values)]
def _kern(self, y, x = 0.0, h = 1.0): '''Gaussian kernel, gives the weight ascribed to points in y relative to x with scale parameter h Parameters ---------- y : 1-dim length m, or 2 dimensional (n*m) numpy.array of floats containing coordinates of n, m-dimensional feature points x : float or 1-dim length m numpy.array of floats containing the coordinates of point relative to which kernel weights are calculated h : float or 1-dim length m numpy.array of floats containing the scale parameter for each dimension NOTE - the final division through by h is present in the original R package, but I'm fairly sure it's an error ''' return gauss.pdf((y-x)/h) / h
def sample_pa(x,d,sigma): first=(x[1]-d[1]).mean() second=(x[0]-d[0]).mean() print "x, d, sigma ", x, d, sigma cnt = 0 L = [] z = first / np.sqrt(first**2+second**2) L.append(-np.arccos(-z)) L.append(-np.arccos( z)) L.append( np.arccos(-z)) L.append( np.arccos( z)) Lmax = max([norm.pdf(np.cos(a)*first + np.sin(a)*second,0,sigma) for a in L]) while True: u=rnd.uniform(0,Lmax) a=rnd.uniform(-np.pi/2.,np.pi/2.) normal = norm.pdf(np.cos(a)*first + np.sin(a)*second,0,sigma) # right? if u < normal: break cnt += 1 print cnt return a
def estimate_params_for_normal(x, low_bound, mu_initial, sigma_initial):
    """
    Takes a vector x of truncated data with a known lower truncation bound
    and estimates the parameters of the fit of an untruncated normal
    distribution.

    The data are standardised with the initial guesses, the truncated-normal
    negative log-likelihood is minimised with the Nelder-Mead simplex
    (``fmin``), and the solution is mapped back to the original scale.

    code from Chris Fonnesbeck's Python data analysis tutorial on Sense
    https://sense.io/prometheus2305/data-analysis-in-python/files/Statistical%20Data%20Modeling.py

    Parameters
    ----------
    x : sequence of float
        Observations, all lying above ``low_bound``.
    low_bound : float
        Known lower truncation point.
    mu_initial, sigma_initial : float
        Rough guesses of mean and standard deviation, used only to
        standardise the problem before optimisation.

    Returns
    -------
    (mean_est, stddev_est) : tuple of float
        Estimated mean and standard deviation of the untruncated normal.
    """
    mu_initial = float(mu_initial)
    sigma_initial = float(sigma_initial)

    # Standardise data and bound.  An array (rather than map()) is used so
    # the data can be evaluated repeatedly on Python 3 as well.
    z = (np.asarray(x, dtype=float) - mu_initial) / sigma_initial
    a = (low_bound - mu_initial) / sigma_initial

    def trunc_norm(theta, a, x):
        """Negative log-likelihood of a normal truncated below at ``a``."""
        loc, scale = theta[0], theta[1]
        if scale <= 0:
            # Nelder-Mead is unconstrained; keep it out of the invalid region.
            return np.inf
        # logpdf/logsf avoid the underflow of log(pdf) and log(1 - cdf).
        return -(norm.logpdf(x, loc, scale) - norm.logsf(a, loc, scale)).sum()

    # With standardised data the standard normal parameters are a good
    # first guess, i.e. 0, 1.
    initial_guess = np.array([0.0, 1.0])
    sol = fmin(trunc_norm, initial_guess, args=(a, z))
    print(sol)

    mean_normalized, stddev_normalized = sol[0], sol[1]
    # Invert z = (x - mu_initial) / sigma_initial.  The previous
    # (1 + mean_normalized) * mu_initial was only right when
    # sigma_initial == mu_initial.
    mean_est = mean_normalized * sigma_initial + mu_initial
    stddev_est = stddev_normalized * sigma_initial
    print("%s %s" % (mean_est, stddev_est))
    return mean_est, stddev_est
def test_null_distribution():
    """Check the empirical null fitted to a contaminated Z-score sample.

    The fitted null should be close to N(0, 1) with a null proportion of
    about 0.98, and its ``pdf`` must match a normal density evaluated with
    the fitted mean and sd.
    """
    # Mixed population of Z-scores: 1000 standard normal quantiles plus
    # 20 values spread evenly between 3 and 4.
    null_part = norm.ppf(np.linspace(0.001, 0.999, 1000))
    shifted_part = np.linspace(3, 4, 20)
    zs = np.concatenate((null_part, shifted_part))

    emp_null = NullDistribution(zs, estimate_null_proportion=True)

    expectations = [
        (emp_null.mean, 0, 1e-5),
        (emp_null.sd, 1, 1e-2),
        (emp_null.null_proportion, 0.98, 1e-2),
    ]
    for fitted, target, rel_tol in expectations:
        assert_allclose(fitted, target, atol=1e-5, rtol=rel_tol)

    # consistency check
    fitted_density = emp_null.pdf(np.r_[-1, 0, 1])
    reference_density = norm.pdf(np.r_[-1, 0, 1],
                                 loc=emp_null.mean,
                                 scale=emp_null.sd)
    assert_allclose(fitted_density, reference_density, rtol=1e-13)
def __init__(self,
             num_target_qubits: int,
             mu: float = 0,
             sigma: float = 1,
             low: float = -1,
             high: float = 1) -> None:
    r"""
    Discretise a normal density on an equidistant grid over [low, high].

    Args:
        num_target_qubits: Number of qubits it acts on,
            has a minimum value of 1.
        mu: Expected value of considered normal distribution
        sigma: standard deviation of considered normal distribution
        low: Lower bound, i.e., the value corresponding to \|0...0>
            (assuming an equidistant grid)
        high: Upper bound, i.e., the value corresponding to \|1...1>
            (assuming an equidistant grid)
    """
    validate_min('num_target_qubits', num_target_qubits, 1)

    def density(x):
        # normal pdf with the requested mean and standard deviation
        return norm.pdf(x, mu, sigma)

    num_values = 2 ** num_target_qubits
    probabilities, _ = UnivariateDistribution.pdf_to_probabilities(
        density, low, high, num_values)

    super().__init__(num_target_qubits, probabilities, low, high)
def arma_likelihood(time_series, phis=array([]), thetas=array([]), mu=0.,
                    sigma=1.):
    """
    Return the log-likelihood of the ARMA model parameters, given the time
    series.

    The model is put in state-space form, filtered with ``kalman`` and the
    Gaussian log-density of every observation under its filtered conditional
    mean/variance is accumulated.

    Parameters
    ----------
    time_series : ndarray of shape (n,1)
        The time series in question
    phis : ndarray of shape (p,)
        The phi parameters
    thetas : ndarray of shape (q,)
        The theta parameters
    mu : float
        The parameter mu
    sigma : float
        The parameter sigma

    Returns
    -------
    log_likelihood : float
        The log-likelihood of the model
    """
    F, Q, H, dim_states, dim_time_series = state_space_rep(
        phis, thetas, mu, sigma)
    mus, covs = kalman(F, Q, H, time_series - mu)

    likelihood = 0.
    # range() instead of xrange() so the function also runs on Python 3.
    for i in range(len(mus)):
        cond_mu = H.dot(mus[i])
        cond_sigma = H.dot(covs[i].dot(H.T))
        # logpdf evaluates the log-density directly; log(pdf(...)) underflows
        # to -inf as soon as an observation is far in the tail.
        likelihood += norm.logpdf(
            time_series[i] - mu, loc=cond_mu, scale=sqrt(cond_sigma))
    return float(likelihood)
def run():
    """
    Compare histograms of N(0,1) samples of increasing size with the true
    density.

    Seven samples (10 to 1000 points) are drawn with a fixed seed, each is
    plotted as a density-normalised histogram in its own subplot together
    with the standard normal PDF, and the figure is written to
    "CompareDist.png".

    Returns:
        None.
    """
    nSamples = [10, 50, 150, 300, 400, 500, 1000]
    np.random.seed(42)
    distArr = [np.random.normal(loc=0.0, scale=1.0, size=n)
               for n in nSamples]

    # make a figure showing the different distributions for
    # different number of samples
    nBins = 30
    bins = np.linspace(-3, 3, nBins)
    nGraphs = len(nSamples)
    fig = plt.figure(figsize=(8, 24))
    plt.subplot(nGraphs, 1, 1)
    plt.title("Recommend at least 300 points from N(0,1)", fontsize=25)
    for i, (n, dist) in enumerate(zip(nSamples, distArr)):
        plt.subplot(nGraphs, 1, i + 1)
        # `normed` was removed from Matplotlib's hist(); `density` is the
        # supported spelling of the same normalisation.
        plt.hist(dist, bins=bins, label="N={:d}".format(n), density=True)
        # only the last subplot carries a legend entry for the PDF curve
        labelPDF = "N(0,1) Dist." if (i == nGraphs - 1) else ""
        plt.plot(bins, norm.pdf(bins), label=labelPDF,
                 linewidth=3, linestyle='--', color='r')
        plt.legend()
        plt.ylabel("Weighted Proportion", fontsize=20)
    plt.xlabel("Value of Standard Normal", fontsize=20)
    plt.tight_layout()
    fig.savefig("CompareDist.png")
def f(x):
    """Return the standard normal probability density evaluated at ``x``."""
    density = norm.pdf(x)
    return density


# set low/high values
low = [-normal_max_value] + [0]*self.K
def f(x):
    """Return the standard normal probability density evaluated at ``x``."""
    density = norm.pdf(x)
    return density
def pl_no_gamma(l, p, sigma):
    """Return the density of point ``p = (u, v)`` for length ``l``.

    When ``u`` lies beyond ``l`` the Euclidean distance from ``(l, 0)`` to
    ``p`` is scored, otherwise only the offset ``v``.  The score is a
    zero-mean normal density with standard deviation ``sigma``, weighted by
    ``1 / (l + 1)``.
    """
    u, v = p
    if u > l:
        offset = np.linalg.norm([u - l, v])
    else:
        offset = v
    return 1 / (l + 1.0) * norm.pdf(offset, 0, sigma)
def pl_no_gamma(l,p,sigma):
    """Return the density of point ``p = (u, v)`` for length ``l``.

    When ``u`` lies beyond ``l`` the Euclidean distance from ``(l, 0)`` to
    ``p`` is scored, otherwise only the offset ``v``.  The score is a
    zero-mean normal density with standard deviation ``sigma``, weighted by
    ``1/(l+1)``.
    """
    u,v = p
    offset = np.linalg.norm([u-l,v]) if u > l else v
    return 1/(l+1.0)*norm.pdf(offset,0,sigma)
def _get_class_posterior(self,x,c) : # calculate log probs for feature log_probs = zeros(x.size) for i,f in enumerate(x) : log_probs[i] = log10(norm.pdf(f,self.means[c,i],self.stdvs[c,i])) return log_probs
# Sample standard deviation of the first group (X1 is defined earlier).
S_X1 = np.std(X1, ddof=1)

# Same descriptive statistics for the second group.
X2 = OI_Data['Mean tBMD']
SortedValues2 = np.sort(X2.values)
N2 = len(X2)
X2_Bar = np.mean(X2)
S_X2 = np.std(X2, ddof=1)

## Kernel density estimation (Gaussian kernel)
# Inter-quartile range of the standard normal (|q25| + |q75|).
NormalIQR = np.sum(np.abs(norm.ppf(np.array([0.25,0.75]), 0, 1)))

# Group 1: bandwidth of the form 0.9 * N**(-1/5) * min(sd, IQR / normal IQR),
# then one Gaussian bump (scale = 2 * half width) is summed per data point,
# evaluated at the sorted data values.
KernelEstimator1 = np.zeros(N1)
DataIQR1 = np.abs(X1.quantile(0.75)) - np.abs(X1.quantile(0.25))
KernelHalfWidth1 = 0.9*N1**(-1/5) * min(S_X1,DataIQR1/NormalIQR)
for Value in SortedValues1:
    KernelEstimator1 += norm.pdf(SortedValues1-Value,0,KernelHalfWidth1*2)
# Rescale so the estimator values sum to one.
KernelEstimator1 = KernelEstimator1/KernelEstimator1.sum()

# Group 2: identical procedure.
KernelEstimator2 = np.zeros(N2)
DataIQR2 = np.abs(X2.quantile(0.75)) - np.abs(X2.quantile(0.25))
KernelHalfWidth2 = 0.9*N2**(-1/5) * min(S_X2,DataIQR2/NormalIQR)
for Value in SortedValues2:
    KernelEstimator2 += norm.pdf(SortedValues2-Value,0,KernelHalfWidth2*2)
KernelEstimator2 = KernelEstimator2/KernelEstimator2.sum()

## Prepare histogram and store data
# 20 equal-width bins between 450 and 700; counts are converted to
# relative frequencies per group.
BinsValues = np.linspace(450,700,21)
Counts1, Bins = np.histogram(Healthy_Data['Mean tBMD'],BinsValues)
RelativeWeights1 = Counts1/Counts1.sum()
Counts2, Bins = np.histogram(OI_Data['Mean tBMD'],BinsValues)
RelativeWeights2 = Counts2/Counts2.sum()
def _segment_pixel_log_likelihood(pixel, params):
    """Sum the Gaussian log-densities of the three channels of one pixel."""
    total = 0.
    for c in range(3):
        # params holds (mean, precision) pairs: columns 2c and 2c+1
        total += norm.logpdf(pixel[c], params[2*c], sqrt(1./params[2*c + 1]))
    return total


def segment(image, n_segments=2, burn_in=1000, samples=1000, lag=5):
    """
    Return image segment samples drawn with a Gibbs sampler.

    Labels are resampled pixel by pixel from the label potential
    (``phi_blanket``) plus the Gaussian emission log-likelihood; afterwards
    the per-segment, per-channel emission mean and precision are resampled
    from their conditional posteriors.  After ``burn_in`` sweeps every
    ``lag``-th state is stored.

    Parameters
    ----------
    image : (N,M,3) ndarray
        Pixel array; channels 0, 1 and 2 are read (r, g, b)
    n_segments : int
        Number of segment labels K
    burn_in : int
        Number of initial sweeps that are discarded
    samples : int
        Number of states to store
    lag : int
        Number of sweeps between two stored states

    Returns
    -------
    labels : (samples,N,M) ndarray
        The image segment label array
    emission_params : (samples,K,6) ndarray
        The Gaussian emission distribution parameters, stored as
        (mean, precision) for channel 0, then channel 1, then channel 2
    log_probs : (samples,) ndarray
        Log-probability of each stored state

    Notes
    -----
    On ``KeyboardInterrupt`` the partially filled result arrays are
    returned.
    """
    n_rows, n_cols = image.shape[0], image.shape[1]
    n_channels = 3

    # allocate arrays
    res_labels = zeros((samples, n_rows, n_cols), dtype=int)
    res_emission_params = zeros((samples, n_segments, 6))
    res_log_prob = zeros((samples,))

    # labels is a view into padded_labels; the one-pixel border stays at -1
    padded_labels = ones((n_rows + 2, n_cols + 2), dtype=int)*-1
    labels = padded_labels[1:-1, 1:-1]
    emission_params = zeros((n_segments, 6))
    log_prob = None
    conditional = zeros((n_segments,))

    # init emission_params: random mean around 0.5, fixed precision
    for k in range(n_segments):
        for c in range(n_channels):
            emission_params[k, 2*c] = norm.rvs(0.5, 0.1)
            emission_params[k, 2*c + 1] = 1/(0.25**2)

    # init labels
    for n in range(n_rows):
        for m in range(n_cols):
            labels[n, m] = randint(0, n_segments)

    try:
        # gibbs
        for i in range(burn_in + samples*lag - (lag - 1)):
            for n in range(n_rows):
                for m in range(n_cols):
                    # resample label
                    for k in range(n_segments):
                        labels[n, m] = k
                        conditional[k] = 0.
                        conditional[k] += phi_blanket(
                            memoryview(padded_labels), n, m,
                            memoryview(FS))
                        conditional[k] += _segment_pixel_log_likelihood(
                            image[n, m], emission_params[k])
                    labels[n, m] = sample_categorical(conditional)

            for k in range(n_segments):
                mask = (labels == k)
                for c in range(n_channels):
                    values = image[mask][:, c]

                    # resample label mean
                    prec = emission_params[k, 2*c + 1]
                    numer = TAU_0*MU_0 + prec*sum(values)
                    denom = TAU_0 + prec*sum(mask)
                    post_mean = numer/denom
                    post_var = 1./(denom)
                    emission_params[k, 2*c] = norm.rvs(post_mean,
                                                       sqrt(post_var))

                    # resample label precision
                    post_alpha = ALPHA_0 + sum(mask)/2.
                    post_beta = BETA_0 + sum(
                        (values - emission_params[k, 2*c])**2)/2.
                    post = gamma(post_alpha, scale=1./post_beta)
                    emission_params[k, 2*c + 1] = post.rvs()

            log_prob = 0.
            for n in range(n_rows):
                for m in range(n_cols):
                    label = labels[n, m]
                    log_prob += _segment_pixel_log_likelihood(
                        image[n, m], emission_params[label])
            # prior on theta?
            log_prob += phi_all(memoryview(padded_labels), memoryview(FS))

            sys.stdout.write('\riter {} log_prob {}'.format(i, log_prob))
            sys.stdout.flush()

            if i >= burn_in and not (i - burn_in) % lag:
                # Index stored states from 0.  The previous i/lag (and
                # res_log_prob[i]) ignored burn_in and ran past the end of
                # the result arrays.
                res_i = (i - burn_in)//lag
                res_emission_params[res_i] = emission_params[:]
                res_labels[res_i] = labels
                res_log_prob[res_i] = log_prob

        sys.stdout.write('\n')
        return res_labels, res_emission_params, res_log_prob
    except KeyboardInterrupt:
        return res_labels, res_emission_params, res_log_prob
def __init__(self, n_normal, normal_max_value, p_zeros, rhos,
             i_normal=None, i_ps=None):
    """
    Constructor.

    The Gaussian Conditional Independence Model for Credit Risk
    Reference: https://arxiv.org/abs/1412.1183

    Args:
        n_normal (int): number of qubits to represent the latent normal
            random variable Z
        normal_max_value (float): min/max value to truncate the latent
            normal random variable Z
        p_zeros (list or array): standard default probabilities for each
            asset
        rhos (list or array): sensitivities of default probability of
            assets with respect to latent variable Z
        i_normal (list or array): indices of qubits to represent normal
            variable
        i_ps (list or array): indices of qubits to represent asset defaults
    """
    self.n_normal = n_normal
    self.normal_max_value = normal_max_value
    self.p_zeros = p_zeros
    self.rhos = rhos
    self.K = len(p_zeros)
    num_qubits = [n_normal] + [1] * self.K

    # set and store indices (defaults: normal qubits first, then assets)
    self.i_normal = range(n_normal) if i_normal is None else i_normal
    self.i_ps = range(n_normal, n_normal + self.K) if i_ps is None else i_ps

    # set low/high values
    low = [-normal_max_value] + [0] * self.K
    high = [normal_max_value] + [1] * self.K

    # call super constructor
    super().__init__(num_qubits, low=low, high=high)

    # create normal distribution
    self._normal = NormalDistribution(n_normal, 0, 1,
                                      -normal_max_value, normal_max_value)

    # create linear rotations for conditional defaults
    self._slopes = np.zeros(self.K)
    self._offsets = np.zeros(self.K)
    self._rotations = []
    for k in range(self.K):
        residual_scale = np.sqrt(1 - rhos[k])
        psi = norm.ppf(p_zeros[k]) / residual_scale
        cdf_psi = norm.cdf(psi)

        # compute slope / offset
        slope = -np.sqrt(rhos[k]) / residual_scale
        slope *= norm.pdf(psi) / np.sqrt(1 - cdf_psi) / np.sqrt(cdf_psi)
        offset = 2 * np.arcsin(np.sqrt(cdf_psi))

        # adjust for integer to normal range mapping
        offset += slope * (-normal_max_value)
        slope *= 2 * normal_max_value / (2**n_normal - 1)

        self._offsets[k] = offset
        self._slopes[k] = slope

        rotation = LinearYRotation(slope, offset, n_normal,
                                   i_state=self.i_normal,
                                   i_target=self.i_ps[k])
        self._rotations.append(rotation)
def __init__(self, num_target_qubits, mu=0, sigma=1, low=-1, high=1):
    """Discretise a normal density with mean ``mu`` and sd ``sigma``.

    The density is converted to ``2 ** num_target_qubits`` probabilities
    between ``low`` and ``high`` and handed to the parent constructor.
    """
    # Keep this as the first statement: validate() receives the constructor
    # arguments through locals().
    self.validate(locals())

    def gaussian_pdf(x):
        return norm.pdf(x, mu, sigma)

    grid_size = 2 ** num_target_qubits
    probabilities, _ = UnivariateDistribution.pdf_to_probabilities(
        gaussian_pdf, low, high, grid_size)

    super().__init__(num_target_qubits, probabilities, low, high)
def PermutationTest(x,y,NRepetition=45**2,SignificanceLevel=0.05):
    """Resampling test for the difference between the means of x and y.

    The two samples are pooled, the pool is repeatedly split at random into
    two groups and the difference of the group means is recorded.  The
    observed difference is then compared with that resampled distribution.
    Two figures are shown: the distribution of resampled differences (with
    a kernel density and normal curve), and the same histogram with the
    rejection range and the observed difference.

    Parameters
    ----------
    x, y : array-like
        Values of the control and test groups.
    NRepetition : int
        Number of random splits of the pooled data.
    SignificanceLevel : float
        Two-sided significance level defining the rejection range.

    Returns
    -------
    d : float
        Observed difference ``mean(x) - mean(y)``.
    RejectionRange : ndarray of shape (2, 2)
        ``[[-inf, lower quantile], [upper quantile, inf]]`` of the
        resampled differences.
    p : float
        Fraction of resampled differences at least as large in magnitude
        as ``d``.
    """
    # Analyze data
    x_bar = np.mean(x)
    y_bar = np.mean(y)
    d = x_bar - y_bar

    XData = pd.DataFrame({'Values':x,'Group':'Control'},index=range(len(x)))
    YData = pd.DataFrame({'Values':y,'Group':'Test'},index=range(len(y)))
    # DataFrame.append was removed in pandas 2.0; concat is the equivalent
    # that works on old and new versions alike.
    Pool = pd.concat([XData, YData], ignore_index=True)
    N = len(Pool)

    D = np.zeros(NRepetition)
    for i in range(NRepetition):
        # NOTE(review): the split size is drawn at random instead of being
        # fixed at len(x); a classical permutation test keeps the group
        # sizes - confirm this is intended.
        n = np.random.randint(1,N-1)
        SampleA = Pool.sample(n)
        SampleB = Pool.drop(SampleA.index)
        D[i] = SampleA['Values'].mean() - SampleB['Values'].mean()

    # Analyze distribution of D
    from scipy.stats.distributions import norm
    D.sort()
    D_bar = np.mean(D)
    S_D = np.std(D,ddof=1)
    N_D = len(D)

    # Kernel density estimation (Gaussian kernel)
    KernelEstimator = np.zeros(N_D)
    # -1.0 / 5 keeps the exponent at -0.2 even under Python 2 division
    KernelHalfWidth = 0.9 * N_D ** (-1.0 / 5) * S_D
    for Value in D:
        KernelEstimator += norm.pdf(D - Value, 0, KernelHalfWidth * 2)
    KernelEstimator = KernelEstimator / N_D

    ## Histogram and density distribution
    TheoreticalDistribution = norm.pdf(D, D_bar, S_D)

    Figure, Axes = plt.subplots(1, 1, figsize=(5.5, 4.5), dpi=100)
    Axes.hist(D, density=True, bins=20, edgecolor=(0, 0, 1),
              color=(1, 1, 1), label='Histogram')
    Axes.plot(D, KernelEstimator, color=(1, 0, 0), label='Kernel Density')
    Axes.plot(D, TheoreticalDistribution, linestyle='--', color=(0, 0, 0),
              label='Normal Distribution')
    plt.xlabel('D values')
    plt.ylabel('Density (-)')
    plt.legend(loc='upper center', ncol=3, bbox_to_anchor=(0.5, 1.15),
               prop={'size':10})
    plt.show()
    plt.close(Figure)

    # Two-sided rejection range from the empirical quantiles of D
    MinValue = np.quantile(D,SignificanceLevel / 2)
    MaxValue = np.quantile(D,1 - SignificanceLevel / 2)
    RejectionRange = np.array([[-np.inf,MinValue],[MaxValue,np.inf]])

    Figure, Axes = plt.subplots(1, 1, figsize=(5.5, 4.5), dpi=100)
    Histogram = Axes.hist(D, density=True, bins=20, edgecolor=(0, 0, 1),
                          color=(1, 1, 1), label='Histogram')
    PeakHeight = max(Histogram[0])
    Axes.fill_between([min(D),MinValue], [PeakHeight,PeakHeight],
                      color=(0, 0, 0), alpha=0.1)
    Axes.fill_between([max(D),MaxValue], [PeakHeight,PeakHeight],
                      color=(0, 0, 0), alpha=0.1, label='Rejection range')
    Axes.plot([d,d], [0,PeakHeight], color=(1, 0, 0),
              label='Actual difference')
    plt.xlabel('D values')
    plt.ylabel('Density (-)')
    plt.legend(loc='upper center', ncol=3, bbox_to_anchor=(0.5, 1.15),
               prop={'size':10})
    plt.show()
    plt.close(Figure)

    # Two-sided empirical p-value
    p = len(D[abs(D)>=abs(d)]) / len(D)

    return d, RejectionRange, p
# Below we visualize the distributions of SBP for the whole population
# (i.e. the marginal distribution), and for the subpopulations of people
# who are 40 and 60 years old, respectively.  This visualization is based
# on a model that has been fit to the data.  It may be misleading if the
# model does not fit the data well.  This is an important topic, but we
# will set it aside for now.

# +
sbp = np.linspace(50, 200, 100)  # Grid of possible blood pressure values

mn0 = da.BPXSY1.mean()  # marginal mean blood pressure
sd0 = da.BPXSY1.std()  # marginal SD of blood pressure
from scipy.stats.distributions import norm
y0 = norm.pdf(sbp, mn0, sd0)

mn1 = np.dot(result.params, [1, 40])  # Conditional mean for a 40 year old person
sd1 = np.sqrt(result.scale)
y1 = norm.pdf(sbp, mn1, sd1)

mn2 = np.dot(result.params, [1, 60])  # Conditional mean for a 60 year old person
sd2 = np.sqrt(result.scale)
y2 = norm.pdf(sbp, mn2, sd2)

sns.set_style("whitegrid")
# x and y are passed by keyword: newer seaborn releases no longer accept them
# positionally, and the keyword form works on older releases too.
ax = sns.lineplot(x=sbp, y=y0, label="Overall")
sns.lineplot(x=sbp, y=y1, label="40 year old")
sns.lineplot(x=sbp, y=y2, label="60 year old")
def v_truncate(x: float) -> float:
    """Computes the additive correction term to the moment matching
    approximation of the truncated Gaussian as detailed in original paper.

    The term is ``pdf(x) / cdf(x)`` of the standard normal.  It is evaluated
    in log space because both pdf and cdf underflow to 0 for very negative
    ``x``, which turned the plain ratio into 0/0 = nan.

    Args:
        x: point at which the correction term is evaluated.

    Returns:
        The ratio ``norm.pdf(x) / norm.cdf(x)``.
    """
    import numpy as np

    return np.exp(norm.logpdf(x) - norm.logcdf(x))
def pdf(self, data):
    """Return the conditional density of column 1 given column 0.

    For each row ``(x, y)`` of ``data`` the density of ``y`` is normal with
    mean ``sin(4*x) + 0.5*x`` and standard deviation ``self.scale * |x|``.
    """
    x_values = data[:, 0]
    y_values = data[:, 1]

    cond_mean = np.sin(4 * x_values) + 0.5 * x_values
    cond_std = self.scale * np.abs(x_values)

    return norm.pdf(y_values, loc=cond_mean, scale=cond_std)
def plot_EgSAX(data, segMeans, alphaSize, compRatio):
    """Plot data, its piecewise-constant segment means and the breakpoints.

    Each segment mean is repeated ``int(compRatio)`` times to build a step
    series per stream.  The right-hand axes show the step series, the raw
    ``data`` and one dashed horizontal line per breakpoint returned by
    ``bp_lookup(alphaSize)``; the left-hand axes show a Gaussian density
    drawn sideways with the same breakpoint lines.

    Returns the array of step series, shape
    ``(wordSize*compRatio, numStreams)``.
    """
    from matplotlib.ticker import NullFormatter
    from plot_utils import adjust_spines

    # Stored originally as wordSize x numStreams, because the plot function
    # plots columns by default
    if segMeans.ndim == 1:
        segMeans = np.atleast_2d(segMeans).T
    wordSize, numStreams = segMeans.shape[0], segMeans.shape[1]

    bpList = bp_lookup(alphaSize)

    # Repeat every segment mean int(compRatio) times along the time axis
    PAAStreams = np.zeros((wordSize*compRatio, numStreams))
    PAAStreams[:, :] = np.repeat(segMeans, int(compRatio), axis=0)

    # Multiple plotting
    no_labels = NullFormatter()

    # start with a rectangular Figure
    plt.figure()

    # Gaussian axes (l, b, r, t)
    axGauss = plt.axes([0.05,0.1, 0.2, 0.85])
    adjust_spines(axGauss, [])

    # SAX axes
    axSAX = plt.axes([0.3,0.1, 0.65, 0.85], )

    # no labels on the Gaussian y axis
    axGauss.yaxis.set_major_formatter(no_labels)
    axGauss.yaxis.set_minor_formatter(no_labels)

    # the SAX plot:
    axSAX.plot(PAAStreams, drawstyle = 'steps', color = 'r', lw = 1.5)
    n_steps = PAAStreams.shape[0]
    for breakpoint in bpList:
        axSAX.axhline(y=breakpoint, xmin=0, xmax=n_steps,
                      ls = '--', color = 'k')
    axSAX.plot(data, color = 'b', lw = 1)

    # the Gaussian plot, drawn sideways and mirrored
    grid = np.linspace(-3, 3, 1000)
    axGauss.plot(-gauss_dist.pdf(grid), grid)
    for breakpoint in bpList:
        axGauss.axhline(y=breakpoint, xmin=0, xmax=n_steps,
                        ls = '--', color = 'k')

    # Tweak axes: share the y range and hide unused spines
    axSAX.set_ylim( axGauss.get_ylim() )
    adjust_spines(axGauss, [])
    adjust_spines(axSAX, ['left', 'bottom'])

    plt.xlabel('Time Steps')
    plt.show()

    return PAAStreams
def zLogLik(x, y, z):
    """Return the log-density of ``x*y`` under a normal with mean ``z``.

    The standard deviation is fixed at 0.2.  ``logpdf`` is used instead of
    ``log(pdf(...))`` so that values far from ``z`` give a finite
    log-likelihood instead of underflowing to log(0).
    """
    return n.logpdf(x*y, loc = z, scale = 0.2)
def pdf(self, data):
    """Return the normal density of ``data`` shifted by ``self.limit``.

    The mean is ``self.mu``; ``self.sigma`` is treated as a variance, i.e.
    its square root is used as the standard deviation.
    """
    std_dev = math.sqrt(self.sigma)
    shifted = data - self.limit
    return norm.pdf(shifted, loc=self.mu, scale=std_dev)
def smooth_overlap(e_k_3d, e=0., scale=0.02, axis=2):
    """Sum Gaussian weights of the values in ``e_k_3d`` around ``e``.

    Every value is weighted with a normal density centred on ``e`` with
    standard deviation ``scale``; NaN entries contribute zero.  The weights
    are summed over ``axis`` and over axis 3, so the input needs at least
    four dimensions.

    The input array is no longer modified: previously its NaN entries were
    overwritten with -inf in the caller's array.

    Parameters
    ----------
    e_k_3d : ndarray
        Values to be weighted.
    e : float
        Centre of the Gaussian window.
    scale : float
        Standard deviation of the Gaussian window.
    axis : int
        First of the two axes that are summed (the other is axis 3).

    Returns
    -------
    ndarray
        Summed weights with the two summed axes removed.
    """
    # -inf has zero density, which removes NaN entries from the sum
    values = np.where(np.isnan(e_k_3d), -np.inf, e_k_3d)
    weights = norm.pdf(values, loc=e, scale=scale)
    # todo interpolate axis 2
    return np.sum(weights, axis=(axis, 3))
def pdf(self, data):
    """Return the normal density of ``data``.

    The mean is ``self.mu``; ``self.sigma`` is treated as a variance, i.e.
    its square root is used as the standard deviation.
    """
    std_dev = math.sqrt(self.sigma)
    return norm.pdf(data, loc=self.mu, scale=std_dev)
# NOTE(review): the scipy.stats.kde namespace is deprecated in recent SciPy
# releases; gaussian_kde is also exported by scipy.stats directly.
from scipy.stats import kde

# Two clusters of normal draws, concatenated into one sample.
x1 = np.random.normal(-1, 0.5, 15)  # parameters: (loc=0.0, scale=1.0, size=None)
x2 = np.random.normal(6, 1, 10)
y = np.r_[x1, x2]  # r_ translates slice objects to concatenation along the first axis.
x = np.linspace(min(y), max(y), 100)  # evaluation grid spanning the sample

s = 0.4  # Smoothing parameter
# One Gaussian bump per observation, evaluated on the grid; after the
# transpose each column holds the kernel of one observation.
kernels = np.transpose([norm.pdf(x, yi, s) for yi in y])  # Calculate the kernels
density = kde.gaussian_kde(y)

# plt.plot(x, kernels, 'k:')
# plt.plot(x, kernels.sum(1), 'r')
# plt.plot(y, np.zeros(len(y)), 'bo', ms=10)

xgrid = np.linspace(x.min(), x.max(), 200)
# plt.hist(y, bins=28, normed=True)
# plt.plot(xgrid, density(xgrid), 'r-')

# Create a bi-modal distribution with a mixture of Normals.
x1 = np.random.normal(-1, 2, 15)  # parameters: (loc=0.0, scale=1.0, size=None)
def count_likelihood_standard(this_counts,tot_counts,this_port,this_cv,this_fract_recov=1):
    """Return the base-2 log-likelihood of observed counts.

    Each observed count is scored with a normal density whose mean is
    ``tot_c * this_port * this_fract_recov`` and whose standard deviation
    is that mean times ``this_cv``.

    Parameters
    ----------
    this_counts, tot_counts : sequences of numbers
        Observed counts and the matching total counts; as with ``zip`` only
        the common leading part is used when the lengths differ.
    this_port : float
        Expected portion of the total.
    this_cv : float
        Coefficient of variation (sd / mean) of the counts.
    this_fract_recov : float
        Fraction recovered; scales the expected count.

    Returns
    -------
    float
        Sum of the per-observation log2 densities.
    """
    observed = numpy.asarray(list(this_counts), dtype=float)
    totals = numpy.asarray(list(tot_counts), dtype=float)
    n_pairs = min(observed.shape[0], totals.shape[0])

    expected = totals[:n_pairs]*this_port*this_fract_recov
    # logpdf avoids the underflow of pdf for counts far from expectation,
    # which made log2(0) = -inf swamp the whole likelihood.
    ln_lik = norm.logpdf(observed[:n_pairs], loc=expected,
                         scale=expected*this_cv)
    # convert natural log to base 2
    return (ln_lik / numpy.log(2)).sum()
## Analyze distribution of Ln(CV) for i in range(2): D = SystemFitted[i]['LogCV'].values D.sort() D_bar = np.mean(D) S_D = np.std(D, ddof=1) N_D = len(D) ## Kernel density estimation (Gaussian kernel) KernelEstimator = np.zeros(N_D) NormalIQR = np.abs(norm.interval(0.25, 0, 1)).sum() DataIQR = np.abs(np.quantile(D, 0.75)) - np.abs(np.quantile(D, 0.25)) KernelHalfWidth = 0.9 * N_D**(-1 / 5) * S_D for Value in D: KernelEstimator += norm.pdf(D - Value, 0, KernelHalfWidth * 2) KernelEstimator = KernelEstimator / N_D ## Histogram and density distribution TheoreticalDistribution = norm.pdf(D, D_bar, S_D) Figure, Axes = plt.subplots(1, 1, figsize=(5.5, 4.5), dpi=100) Histogram = Axes.hist(D, density=True, bins=20, edgecolor=(0, 0, 1), color=(1, 1, 1), label='Histogram') Axes.plot(D, KernelEstimator, color=(1, 0, 0), label='Kernel Density') Axes.plot(D, TheoreticalDistribution, linestyle='--',
def QQPlot(DataValues, Alpha_CI=0.95, DataLabel='Data'):
    """Draw a normal quantile-quantile plot with a confidence band.

    Based on: https://www.tjmahr.com/quantile-quantile-plots-from-scratch/
    Itself based on Fox book: Fox, J. (2015)
    Applied Regression Analysis and Generalized Linear Models.
    Sage Publications, Thousand Oaks, California.

    Parameters
    ----------
    DataValues : array-like
        Sample to compare with a normal distribution.
    Alpha_CI : float
        Coverage of the confidence band drawn around the reference line.
    DataLabel : str
        Legend label of the data points.

    Returns
    -------
    Variance : float
        Robust spread estimate: the data inter-quartile range divided by
        the inter-quartile range of the standard normal (despite its name
        this is on the scale of a standard deviation).
    """
    # Data analysis
    N = len(DataValues)
    X_Bar = np.mean(DataValues)
    S_X = np.std(DataValues)

    # Sort data to get the rank
    Data_Sorted = np.zeros(N)
    Data_Sorted += DataValues
    Data_Sorted.sort()

    # Compute quantiles
    EmpiricalQuantiles = np.arange(0.5, N + 0.5) / N
    TheoreticalQuantiles = norm.ppf(EmpiricalQuantiles, X_Bar, S_X)
    ZQuantiles = norm.ppf(EmpiricalQuantiles, 0, 1)

    # Compute data variance
    DataIQR = np.quantile(DataValues, 0.75) - np.quantile(DataValues, 0.25)
    # The normal IQR needs the quantile function (ppf): |q25| + |q75|.  The
    # previous norm.cdf call evaluated the CDF at 0.25 and 0.75 instead,
    # which is not an inter-quartile range.
    NormalIQR = np.sum(np.abs(norm.ppf(np.array([0.25, 0.75]), 0, 1)))
    Variance = DataIQR / NormalIQR
    Z_Space = np.linspace(min(ZQuantiles), max(ZQuantiles), 100)
    Variance_Line = Z_Space * Variance + np.median(DataValues)

    # Compute alpha confidence interval (CI)
    Z_SE = np.sqrt(norm.cdf(Z_Space) * (1 - norm.cdf(Z_Space)) / N) / norm.pdf(Z_Space)
    Data_SE = Z_SE * Variance
    Z_CI_Quantile = norm.ppf(np.array([(1 - Alpha_CI) / 2]), 0, 1)

    # Create point in the data space
    Data_Space = np.linspace(min(TheoreticalQuantiles), max(TheoreticalQuantiles), 100)

    # QQPlot
    BorderSpace = max(0.05 * abs(Data_Sorted.min()), 0.05 * abs(Data_Sorted.max()))
    Y_Min = Data_Sorted.min() - BorderSpace
    Y_Max = Data_Sorted.max() + BorderSpace

    Figure, Axes = plt.subplots(1, 1, figsize=(5.5, 4.5), dpi=100)
    Axes.plot(TheoreticalQuantiles, Data_Sorted, linestyle='none',
              marker='o', mew=0.5, fillstyle='none', color=(0, 0, 0),
              label=DataLabel)
    Axes.plot(Data_Space, Variance_Line, linestyle='--', color=(1, 0, 0),
              label='Variance :' + str(format(np.round(Variance, 2), '.2f')))
    Axes.plot(Data_Space, Variance_Line + Z_CI_Quantile * Data_SE,
              linestyle='--', color=(0, 0, 1),
              label=str(int(100 * Alpha_CI)) + '% CI')
    Axes.plot(Data_Space, Variance_Line - Z_CI_Quantile * Data_SE,
              linestyle='--', color=(0, 0, 1))
    plt.xlabel('Theoretical quantiles (-)')
    plt.ylabel('Empirical quantiles (-)')
    plt.ylim([Y_Min, Y_Max])
    plt.legend(loc='upper center', ncol=3, bbox_to_anchor=(0.5, 1.15),
               prop={'size': 10})
    plt.show()
    plt.close(Figure)

    return Variance
# The simulation above shows that when the subsample size increases from 100 to 400 (a factor of 4), the standard deviation of the difference between two correlation coefficients decreases by roughly a factor of 2. The mathematical expression sqrt(2 / m) is an approximation to this standard deviation that can be computed without access to any data. # ### The shape of sampling distributions # Above we focused on the magnitude of the difference between a statistic calculated on two independent samples from a population. Here we focus instead on the shape of the distribution of statistics calculated on subsamples. As discussed in the lectures, the central limit theorem implies that many (but not all) statistics have approximately normal sampling distributions, even if the underlying data are not close to being normally distributed. # # We will illustrate this phenomenon using the systolic blood pressure data from the NHANES study. First we use a histogram to look at the distribution of individual systolic blood pressure values. Note that it is somewhat right-skewed. # In[8]: sns.distplot(da.BPXSY1.dropna()) # Next we calculate 1000 sample means from 1000 subsamples of size 50 and inspect their distribution. # In[9]: m = 50 sbp_mean = [] for i in range(1000): dx = da.sample(m) sbp_mean.append(dx.BPXSY1.dropna().mean()) sns.distplot(sbp_mean) # The lines below plot the density of a normal approximation to the data generated above x = np.linspace(np.min(sbp_mean), np.max(sbp_mean), 100) from scipy.stats.distributions import norm y = norm.pdf(x, np.mean(sbp_mean), np.std(sbp_mean)) plt.plot(x, y, color='orange') # The plots above show that while the distribution of individual systolic blood pressure measures is somewhat skewed to the right, the distribution of means of size 50 is approximately symmetric. The distribution of means is also approximately normal, as shown by the orange curve, which is the best-fitting normal approximation to the data.
# Possible moves in the deme model: one deme to the left, stay, one to the right.
steps = [-deme_size, 0, deme_size]
dis_prop = (sigma**2) / (2.0 * deme_size**2 )  # Calculate dispersal probability for the deme model
p = np.array([dis_prop, 1 - 2 * dis_prop, dis_prop])

# Alternative ways of drawing the offsets, currently disabled:
# draw_list = np.random.choice(steps, p=p, size=50000000) # First do the deme offset
# draw_list = np.around(np.random.normal(scale=sigma, size=10000000))
# draw_list = np.around(np.random.uniform(low=-half_length, high=half_length, size=50000000))

# Active choice: Laplace-distributed offsets rounded to the nearest integer.
draw_list = np.around(np.random.laplace(scale=scale, size=5000000))
print("Mean: %.2f" % np.mean(draw_list))
print("Std: %.4f" % np.std(draw_list))

# Now plot different dispersal kernels:
x_plot = np.linspace(-10, 10, 100000)
y_norm = norm.pdf(x_plot, scale=2)
y_laplace = laplace.pdf(x_plot, scale=scale)
# uniform density on [-half_length, half_length]
y_uniform = uniform.pdf(x_plot, scale=2 * half_length, loc=-half_length)

plt.figure()
# NOTE(review): the numbers in the labels look like excess kurtosis values of
# the three distributions - confirm.
plt.plot(x_plot, y_laplace, label="Laplace: 3", linewidth=3)
plt.plot(x_plot, y_norm, label="Normal: 0", linewidth=3)
plt.plot(x_plot, y_uniform, label="Uniform: -1.2", linewidth=3, color='y')
plt.ylabel("Probability Density", fontsize=25)
plt.legend(prop={'size': 25})
plt.tick_params(axis='x', labelsize=15)
plt.tick_params(axis='y', labelsize=15)
plt.show()

# Start the next figure on a wider grid.
plt.figure()
x_plot = np.linspace(-14, 14, 100000)
def f(x):  # pylint: disable=invalid-name
    """Return the standard normal probability density evaluated at ``x``."""
    density = norm.pdf(x)
    return density
# Fit the mixture model and collect the parameters of the retained
# components: row 0 = weights, row 1 = means, row 2 = standard deviations.
vbmm = VBMM.VBMM(x, max_components=10)
vbmm.Fit()
mixtures = np.concatenate(
    (vbmm.pis[vbmm.components],
     vbmm.means[vbmm.components],
     vbmm.sigmas[vbmm.components]))
mixtures = np.reshape(mixtures, (3, -1))

# `normed` was removed from Matplotlib's hist(); `density` is the supported
# spelling of the same normalisation.
plt.hist(x, histtype='stepfilled', bins=50, alpha=0.85, color="#7A68A6",
         density=True, label='Real data')
plt.xlabel('data')
plt.xlim(0, 60)
plt.ylim(0, 0.25)

# Mixture density on a grid: weighted sum of the component normal densities.
x_range = np.linspace(x.min() - 1, x.max() + 1, 500)
y_range = np.asarray([
    mixtures[0, i] * norm.pdf(x_range, mixtures[1, i], mixtures[2, i])
    for i in range(mixtures.shape[1])
])
y_range = np.sum(y_range, axis=0)

plt.plot(x_range, y_range, color="#A60628", linewidth=2,
         label='Esitmated pdf')
plt.legend()
plt.title('VBMM')
plt.show()