def plot_histograms(x_data):
    """Plot densities of the signal ``x_data["x"]`` and its gradient ``x_data["x'"]``.

    Left panel: empirical density of x. Right panel: density of the non-zero
    gradient entries on a log-scaled y axis, overlaid with a zero-centered
    Laplace fit. Prints the estimated sparsity/variance/scale hyperparameters.
    """
    fig, axs = plt.subplots(1, 2, figsize=(6, 3), sharey=False)
    axs[0].hist(
        x_data["x"].ravel(), bins=251, density=True,
        histtype='stepfilled', alpha=0.2
    )
    axs[0].set(title=r"$x$")

    # Hyperparameters for the gradient of x: treat near-zero entries as exact
    # zeros (sparsity) and fit a Laplace distribution to the remainder.
    x_grad = x_data["x'"].ravel()
    nonzero = np.abs(x_grad) > 1e-3
    grad_rho = nonzero.mean()  # fraction of non-(near-)zero gradient entries
    grad_var = x_grad[nonzero].var()
    grad_loc, grad_scale = laplace.fit(x_grad[nonzero])
    print(
        f"grad_rho={grad_rho:.3f} grad_var={grad_var:.3f} grad_scale={grad_scale:.3f}")

    # Compare the Laplace fit against the empirical distribution.
    # NOTE: only the fitted scale is used; the location is pinned to 0 so the
    # model is symmetric around zero (grad_loc is printed/discarded).
    fitted = laplace(loc=0, scale=grad_scale)
    axs[1].hist(
        # Fix: matplotlib's `log` parameter is a boolean; the original passed
        # the string "y", which only worked by truthiness.
        x_grad[nonzero], bins=100, log=True, density=True,
        histtype='stepfilled', alpha=0.2
    )
    t = np.linspace(-2, 2, 100)
    axs[1].plot(t, fitted.pdf(t))
    axs[1].set(title=r"$\nabla x$")
    fig.tight_layout()
def draw_dpf(self, data):
    """Plot the empirical pdf of each column (state dimension) of ``data``.

    ``data`` is a 2-D array (samples, dims); one subplot per dimension, laid
    out on a near-square grid. The figure is saved to
    ``<self.data_root>/statepdf.jpg``.
    """
    num_bins = 100
    # Smallest (n+1) x (n+1) grid guaranteed to hold one axis per dimension.
    ncols = int(np.sqrt(data.shape[1])) + 1
    nrows = int(np.sqrt(data.shape[1])) + 1
    assert ncols * nrows >= data.shape[1]
    _, axes = plt.subplots(ncols, nrows, figsize=(nrows * 3, ncols * 3))
    axes = axes.flatten()
    for ax_i, ax in enumerate(axes):
        # Surplus axes on the grid are left empty.
        if ax_i >= data.shape[1]:
            continue
        e = data[:, ax_i]
        # Symmetric histogram range around zero.
        left = min(e.min(), -e.max())
        right = max(e.max(), -e.min())
        param = laplace.fit(e)
        print(ax_i, param)
        try:
            tmp = pdf(e, num_bins, left, right)
            # NOTE(review): `.index.T._data` is private pandas API — confirm
            # it still yields the Interval objects on the installed version.
            index = tmp.index.T._data
            values = tmp.values
        except Exception:
            # Fix: previously a bare `except:` fell through to ax.plot, which
            # then used stale values from the prior iteration (or raised
            # NameError on the first failure). Skip the failed dimension.
            print(ax_i, 'error')
            continue
        ax.plot([x.left for x in index], values)
    plt.savefig(os.path.join(self.data_root, 'statepdf.jpg'))
def __init__(self, mode=0, elem=None, sample=None):
    """Set up Laplace parameters from explicit values or by fitting a sample.

    mode == 0: take (mu, sigma) directly from ``elem[0]``/``elem[1]``;
    any other mode: estimate them from ``sample`` by maximum likelihood.
    Also precomputes the distribution's theoretical mean and variance.
    """
    if mode == 0:
        self.mu, self.sigma = elem[0], elem[1]
    else:
        fitted = laplace.fit(sample)
        self.mu = fitted[0]
        self.sigma = fitted[1]
    # Theoretical moments of Laplace(loc=mu, scale=sigma).
    self.math_average = laplace.mean(loc=self.mu, scale=self.sigma)
    self.dispersion = laplace.var(loc=self.mu, scale=self.sigma)
def learn_arrslow(data, earthmodel, topstations):
    # Learn a per-site Laplace residual model for arrival slowness of the
    # first seismic phase (phaseid 0), then fit an inverse-gamma over the
    # per-site Laplace scales. (Python 2 code: print statements, xrange.)
    print "Arrival Slowness:"
    # Unpack the dataset tuple; only detections / leb_events / leb_evlist
    # are used below.
    (start_time, end_time, detections, leb_events, leb_evlist, site_up,
     sites, phasenames, phasetimedef, phaseprop, sitenames, ttime_prefix,
     ddrange_file, qfvc_file, hydro_dir, infra_dir) = data
    numsites = earthmodel.NumSites()
    phaseid = 0
    # One list of slowness residuals per site.
    site_raw = [[] for site in xrange(numsites)]
    for evnum, event in enumerate(leb_events):
        # Only seismic events with depth <= 10 contribute.
        if event[EV_MEDIUM_COL] != MEDIUM_SEISMIC or event[EV_DEPTH_COL] > 10:
            continue
        for ph, detnum in leb_evlist[evnum]:
            if ph == phaseid:
                siteid = int(detections[detnum, DET_SITE_COL])
                # Skip detections at non-seismic sites.
                if earthmodel.SiteMedium(siteid) != MEDIUM_SEISMIC:
                    continue
                (arrtime, arraz, dist, oop_angle, arr_henergy) \
                    = earthmodel.ArrivalParams(event[EV_LON_COL],
                                               event[EV_LAT_COL],
                                               event[EV_DEPTH_COL],
                                               event[EV_TIME_COL],
                                               event[EV_HYDRO_ENERGY_COL],
                                               phaseid, siteid)
                # NOTE(review): arrtime <= 0 appears to flag an invalid
                # arrival prediction — confirm against ArrivalParams docs.
                if arrtime > 0:
                    arrslow = earthmodel.ArrivalSlowness(event[EV_LON_COL],
                                                         event[EV_LAT_COL],
                                                         event[EV_DEPTH_COL],
                                                         phaseid, siteid)
                    # Residual: model-predicted slowness minus detected slowness.
                    res = arrslow - detections[detnum, DET_SLO_COL]
                    site_raw[siteid].append(res)
    locs, scales = [], []
    for siteid in topstations:
        raw = site_raw[siteid]
        # Laplace MLE: loc = median residual, scale = mean |residual - loc|.
        loc, scale = laplace.fit(raw)
        locs.append(loc)
        scales.append(scale)
        print siteid, loc, scale
    # Hyper-prior: inverse-gamma fit over the per-site Laplace scales.
    print "Inverse-Gamma:", invgamma_fit(scales)
def fit(data: FloatIterable, mu: Optional[float] = None, b: Optional[float] = None) -> 'Laplace':
    """
    Fit a Laplace distribution to the data.

    :param data: Iterable of data to fit to.
    :param mu: Optional fixed value for mu (the location parameter).
    :param b: Optional fixed value for b (the scale parameter).
    :return: A ``Laplace`` instance wrapping the fitted parameters.
    """
    # Fixed parameters are forwarded to scipy as floc / fscale so that
    # laplace.fit only optimizes the remaining free parameter(s).
    kwargs = {}
    for arg, kw in zip((mu, b), ('floc', 'fscale')):
        if arg is not None:
            kwargs[kw] = arg
    loc, scale = laplace.fit(data=data, **kwargs)
    return Laplace(mu=loc, b=scale)
def getHist(self, data, bars, fit):
    """Compute histogram characteristics of ``data`` and optionally fit a law.

    Builds a 10-bin histogram, lays out ``bars`` abscissa points over the bin
    range, and fits the requested distribution to the raw data.

    :param data: sample to histogram and fit.
    :param bars: number of abscissa points for the fitted curve.
    :param fit: 'NORM' for a Gaussian fit, 'LAP' for a Laplace fit;
                anything else disables fitting.

    Sets ``self.histogram``, ``self.abscissa``, ``self.fit_parameter``
    (= (location, scale) of the fit) and ``self.data_fit`` (fitted pdf on
    the abscissa, or ``None`` when no fit is requested).
    """
    self.histogram, bin_edges = np.histogram(data, 10)
    self.abscissa = np.linspace(min(bin_edges), max(bin_edges), bars)
    if fit == 'NORM':
        self.fit_parameter = norm.fit(data)
        self.data_fit = norm.pdf(self.abscissa, self.fit_parameter[0],
                                 self.fit_parameter[1])
    elif fit == 'LAP':
        self.fit_parameter = lap.fit(data)
        # Bug fix: the Laplace branch previously never computed the fitted
        # curve, leaving self.data_fit stale or undefined.
        self.data_fit = lap.pdf(self.abscissa, self.fit_parameter[0],
                                self.fit_parameter[1])
    else:
        self.data_fit = None
def test_draw_samples_non_mock(self, plot=False):
    """Check the non-mock Laplace sampler: draws many samples and verifies
    that the maximum-likelihood estimates recovered from them match the true
    location/scale within tolerance."""
    dtype = np.float32
    num_samples = 100000
    location = np.array([0.5])
    scale = np.array([2])
    rv_shape = (1, )

    # Wrap the true parameters as MXNet arrays with a sample dimension.
    location_mx = add_sample_dimension(mx.nd, mx.nd.array(location, dtype=dtype))
    scale_mx = add_sample_dimension(mx.nd, mx.nd.array(scale, dtype=dtype))

    rand_gen = None  # use the real (non-mock) sampler
    var = Laplace.define_variable(shape=rv_shape, rand_gen=rand_gen,
                                  dtype=dtype).factor
    variables = {var.location.uuid: location_mx, var.scale.uuid: scale_mx}
    rv_samples_rt = var.draw_samples(F=mx.nd, variables=variables,
                                     num_samples=num_samples)

    # Structural checks on the drawn samples.
    assert array_has_samples(mx.nd, rv_samples_rt)
    assert get_num_samples(mx.nd, rv_samples_rt) == num_samples
    assert rv_samples_rt.dtype == dtype

    if plot:
        plot_univariate(samples=rv_samples_rt, dist=laplace,
                        loc=location[0], scale=scale[0])

    # Recover the parameters from the samples and compare to the truth.
    location_est, scale_est = laplace.fit(rv_samples_rt.asnumpy().ravel())
    location_tol = 1e-2
    scale_tol = 1e-2
    assert np.abs(location[0] - location_est) < location_tol
    assert np.abs(scale[0] - scale_est) < scale_tol
# NOTE(review): this chunk starts mid-construct — the matching `try:` (and its
# enclosing loop over tickers, presumably) lies above this view; indentation
# below is reconstructed and should be verified against the full file.
except:
    print(ticker)

# Monte-Carlo comparison of Laplace vs Gaussian return models over random
# long-only portfolios.
for portfolio in range(random_porfolios):
    print(portfolio)
    # Random weights normalized to sum to 1 (long-only allocation).
    weights = np.random.random(len(tickers))
    weights /= np.sum(weights)
    weights2 = weights.copy()
    portfolio_returns = df.dot(weights)
    # Laplace model
    center, scale = laplace.fit(portfolio_returns)
    # Gaussian model
    mu, sigma = norm.fit(portfolio_returns)
    end_returns_laplace = []
    end_returns_gaussian = []
    for simulation in range(simulations):
        # Simulate t periods of returns and keep the cumulative end value.
        laplace_model = np.random.laplace(center, scale, t).cumsum()
        end_returns_laplace.append(laplace_model[-1])
        gaussian_model = np.random.normal(mu, sigma, t).cumsum()
        end_returns_gaussian.append(gaussian_model[-1])
def getParams(ticker):
    """Fetch the full Yahoo price history for ``ticker`` and fit a Laplace
    distribution to its daily log-returns.

    Returns a tuple ``(laplace params, annualized std of the last 365
    daily log-returns)``.
    """
    data = DataReader(ticker, "yahoo", datetime(1890, 1, 1), datetime.now())
    # Daily log-returns: log(p_t) - log(p_{t-1}).
    changes = log(data['Adj Close'][1:]) - log(data.shift()['Adj Close'][1:])
    params = laplace.fit(changes)
    # Annualize the recent volatility by sqrt(365).
    annualized_vol = std(changes[-365:]) * (365 ** (.5))
    return params, annualized_vol
import matplotlib.pyplot as pl
import numpy as np
from scipy.stats import t, laplace, norm

# Robust-fit demo: compare Gaussian, Student-t and Laplace fits on a small
# standard-normal sample, with and without gross outliers.
a = np.random.randn(30)
outliers = np.array([8, 8.75, 9.5])
pl.hist(a, 7, weights=[1 / 30] * 30, rwidth=0.8)

#fit without outliers
x = np.linspace(-5, 10, 500)
loc, scale = norm.fit(a)
n = norm.pdf(x, loc=loc, scale=scale)
loc, scale = laplace.fit(a)
l = laplace.pdf(x, loc=loc, scale=scale)
fd, loc, scale = t.fit(a)  # fd: fitted degrees of freedom
s = t.pdf(x, fd, loc=loc, scale=scale)
pl.plot(x, n, 'k>', x, s, 'r-', x, l, 'b--')
pl.legend(('Gauss', 'Student', 'Laplace'))
pl.savefig('robustDemo_without_outliers.png')

#add the outliers
pl.figure()
# Weights renormalize by 33 (30 inliers + 3 outliers) so the two hist calls
# together form a single normalized histogram.
pl.hist(a, 7, weights=[1 / 33] * 30, rwidth=0.8)
pl.hist(outliers, 3, weights=[1 / 33] * 3, rwidth=0.8)
aa = np.hstack((a, outliers))
# NOTE(review): the script appears truncated here — refits on ``aa``
# presumably follow outside this view.
def bootstrap(a,
              f=None,
              b=100,
              method="balanced",
              family=None,
              strata=None,
              smooth=False,
              random_state=None):
    """
    Calculate function values from bootstrap samples or
    optionally return bootstrap samples themselves

    Parameters
    ----------
    a : array-like
        Original sample
    f : callable or None
        Function to be bootstrapped
    b : int
        Number of bootstrap samples
    method : string
        * 'ordinary'
        * 'balanced'
        * 'parametric'
    family : string or None
        * 'gaussian'
        * 't'
        * 'laplace'
        * 'logistic'
        * 'F'
        * 'gamma'
        * 'log-normal'
        * 'inverse-gaussian'
        * 'pareto'
        * 'beta'
        * 'poisson'
    strata : array-like or None
        Stratification labels, ignored when method
        is parametric
    smooth : boolean
        Whether or not to add noise to bootstrap
        samples, ignored when method is parametric
    random_state : int or None
        Random number seed

    Returns
    -------
    y | X : np.array
        Function applied to each bootstrap sample
        or bootstrap samples if f is None
    """
    # NOTE(review): this seeds the *global* NumPy RNG; parametric branches
    # additionally pass random_state to each rvs call.
    np.random.seed(random_state)
    a = np.asarray(a)
    n = len(a)

    # stratification not meaningful for parametric sampling
    if strata is not None and (method != "parametric"):
        strata = np.asarray(strata)
        if len(strata) != len(a):
            raise ValueError("a and strata must have"
                             " the same length")
        # recursively call bootstrap without stratification
        # on the different strata
        masks = [strata == x for x in np.unique(strata)]
        boot_strata = [
            bootstrap(a=a[m],
                      f=None,
                      b=b,
                      method=method,
                      strata=None,
                      random_state=random_state) for m in masks
        ]
        # concatenate resampled strata along first column axis
        X = np.concatenate(boot_strata, axis=1)
    else:
        if method == "ordinary":
            # i.i.d. sampling from ecdf of a
            X = np.reshape(a[np.random.choice(range(a.shape[0]),
                                              a.shape[0] * b)],
                           newshape=(b, ) + a.shape)
        elif method == "balanced":
            # permute b concatenated copies of a, so every original
            # observation appears exactly b times across all samples
            r = np.reshape([a] * b,
                           newshape=(b * a.shape[0], ) + a.shape[1:])
            X = np.reshape(r[np.random.permutation(range(r.shape[0]))],
                           newshape=(b, ) + a.shape)
        elif method == "parametric":
            if len(a.shape) > 1:
                raise ValueError("a must be one-dimensional")

            # fit parameters by maximum likelihood and sample;
            # fixed parameters (floc/fscale) constrain the fit where the
            # family conventionally pins them
            if family == "gaussian":
                theta = norm.fit(a)
                arr = norm.rvs(size=n * b,
                               loc=theta[0],
                               scale=theta[1],
                               random_state=random_state)
            elif family == "t":
                theta = t.fit(a, fscale=1)
                arr = t.rvs(size=n * b,
                            df=theta[0],
                            loc=theta[1],
                            scale=theta[2],
                            random_state=random_state)
            elif family == "laplace":
                theta = laplace.fit(a)
                arr = laplace.rvs(size=n * b,
                                  loc=theta[0],
                                  scale=theta[1],
                                  random_state=random_state)
            elif family == "logistic":
                theta = logistic.fit(a)
                arr = logistic.rvs(size=n * b,
                                   loc=theta[0],
                                   scale=theta[1],
                                   random_state=random_state)
            elif family == "F":
                theta = F.fit(a, floc=0, fscale=1)
                arr = F.rvs(size=n * b,
                            dfn=theta[0],
                            dfd=theta[1],
                            loc=theta[2],
                            scale=theta[3],
                            random_state=random_state)
            elif family == "gamma":
                theta = gamma.fit(a, floc=0)
                arr = gamma.rvs(size=n * b,
                                a=theta[0],
                                loc=theta[1],
                                scale=theta[2],
                                random_state=random_state)
            elif family == "log-normal":
                theta = lognorm.fit(a, floc=0)
                arr = lognorm.rvs(size=n * b,
                                  s=theta[0],
                                  loc=theta[1],
                                  scale=theta[2],
                                  random_state=random_state)
            elif family == "inverse-gaussian":
                theta = invgauss.fit(a, floc=0)
                arr = invgauss.rvs(size=n * b,
                                   mu=theta[0],
                                   loc=theta[1],
                                   scale=theta[2],
                                   random_state=random_state)
            elif family == "pareto":
                theta = pareto.fit(a, floc=0)
                arr = pareto.rvs(size=n * b,
                                 b=theta[0],
                                 loc=theta[1],
                                 scale=theta[2],
                                 random_state=random_state)
            elif family == "beta":
                theta = beta.fit(a)
                arr = beta.rvs(size=n * b,
                               a=theta[0],
                               b=theta[1],
                               loc=theta[2],
                               scale=theta[3],
                               random_state=random_state)
            elif family == "poisson":
                # Poisson MLE for the rate is simply the sample mean.
                theta = np.mean(a)
                arr = poisson.rvs(size=n * b,
                                  mu=theta,
                                  random_state=random_state)
            else:
                raise ValueError("Invalid family")
            X = np.reshape(arr, newshape=(b, n))
        else:
            raise ValueError("method must be either 'ordinary'"
                             " , 'balanced', or 'parametric',"
                             " '{method}' was supplied".format(method=method))

    # samples are already smooth in the parametric case
    if smooth and (method != "parametric"):
        X += np.random.normal(size=X.shape, scale=1 / np.sqrt(n))

    if f is None:
        return X
    else:
        return np.asarray([f(x) for x in X])
def fit(Y):
    """Fit the module-level ``dist`` to ``Y`` and return the parameters as
    ``[location, log(scale)]`` (log-scale keeps the parameter unconstrained)."""
    loc, scale = dist.fit(Y)
    return np.array([loc, np.log(scale)])
import numpy as np
import matplotlib.pylab as pl
from scipy.stats import t, laplace, norm

# Robust-fit demo (pylab variant): compare Gaussian, Student-t and Laplace
# fits on a small standard-normal sample, with and without gross outliers.
a = np.random.randn(30)
outliers = np.array([8, 8.75, 9.5])
pl.hist(a, 7, weights=[1 / 30] * 30, rwidth=0.8)

#fit without outliers
x = np.linspace(-5, 10, 500)
loc, scale = norm.fit(a)
n = norm.pdf(x, loc=loc, scale=scale)
loc, scale = laplace.fit(a)
l = laplace.pdf(x, loc=loc, scale=scale)
fd, loc, scale = t.fit(a)  # fd: fitted degrees of freedom
s = t.pdf(x, fd, loc=loc, scale=scale)
pl.plot(x, n, 'k>', x, s, 'r-', x, l, 'b--')
pl.legend(('Gauss', 'Student', 'Laplace'))
pl.savefig('robustDemo_without_outliers.png')

#add the outliers
pl.figure()
# Weights renormalize by 33 (30 inliers + 3 outliers) so the two hist calls
# together form a single normalized histogram.
pl.hist(a, 7, weights=[1 / 33] * 30, rwidth=0.8)
pl.hist(outliers, 3, weights=[1 / 33] * 3, rwidth=0.8)
aa = np.hstack((a, outliers))
loc, scale = norm.fit(aa)
# NOTE(review): the script appears truncated here — the Laplace/Student fits
# on ``aa`` presumably follow outside this view.
def crispr_surf_statistical_significance(sgRNA_summary_table, sgRNA_indices, perturbation_profile, gammas2betas, null_distribution, simulation_n, test_type, guideindices2bin, averaging_method, padj_cutoffs, effect_size, limit, scale, estimate_statistical_power):
    """
    Function to assess the statistical significance of deconvolved genomic signal.
    Calculates empirical p-values for each beta, then performs FDR correction through the Benjamini-Hochberg procedure for p.adj.-values.

    NOTE(review): the ``limit`` parameter is accepted but never used in this
    function body — confirm whether it is vestigial.
    """
    # Load sgRNA summary table
    df_summary_table = pd.read_csv(sgRNA_summary_table)
    # Replicate count is inferred from the Log2FC_Replicate* columns.
    replicates = len([x for x in df_summary_table.columns.tolist() if 'Log2FC_Replicate' in x])

    # Gamma chosen for downstream analysis
    gamma_chosen = gammas2betas['gamma_chosen']

    # Load estimated betas into dictionary
    beta_distributions = {}
    for i in range(len(gammas2betas['combined'])):
        beta_distributions[i] = gammas2betas['combined'][i]

    # Decide how to draw from null distribution and perform deconvolution on simulated null arrays
    logger.info('Performing %s simulations to construct beta null distributions ...' % (simulation_n))

    # Fall back to a gaussian null when no negative-control guides exist.
    if null_distribution == 'negative_control':
        if 'negative_control' not in df_summary_table['sgRNA_Type'].unique().tolist():
            null_distribution = 'gaussian'

    replicate_parameters = []
    if null_distribution == 'negative_control':
        # No fitted parameters for the empirical negative-control null.
        replicate_parameters.append('NA')

        # Grab all negative control sgRNA lfc scores
        negative_control_guide_scores = []
        for i in range(1, int(replicates) + 1):
            negative_control_guide_scores.append(np.array(df_summary_table.loc[(df_summary_table['sgRNA_Type'] == 'negative_control'), ['Log2FC_Replicate' + str(i)]]).flatten().tolist())

        # Construct many simulated null arrays to perform deconvolution
        beta_distributions_null = crispr_surf_deconvolution_simulations(negative_control_scores = ['negative_control_guides', negative_control_guide_scores], sgRNA_indices = sgRNA_indices, perturbation_profile = perturbation_profile, gamma_list = [gamma_chosen], simulations_n = simulation_n, replicates = replicates, guideindices2bin = guideindices2bin, averaging_method = averaging_method, scale = scale)

    elif null_distribution == 'laplace':
        # Parameterize observed signal with laplace distribution (assume majority of observation sgRNAs are null)
        for i in range(1, int(replicates) + 1):

            # # Remove distribution skew
            # sorted_nc = sorted(np.array(df_summary_table.loc[(df_summary_table['sgRNA_Type'] != 'positive_control'), ['Log2FC_Replicate' + str(i)]]).flatten().tolist())
            # median_val = np.median(sorted_nc)
            # left_tail_median = np.median(sorted_nc[:int(0.1*len(sorted_nc))])
            # right_tail_median = np.median(sorted_nc[-int(0.1*len(sorted_nc)):])

            # # Left skewed
            # if (median_val - left_tail_median) > (right_tail_median - median_val):
            #     half_dist = [x for x in sorted_nc if x >= median_val]
            # # Right skewed
            # else:
            #     half_dist = [x for x in sorted_nc if x <= median_val]

            # half_dist_mirrored = [(2*median_val - x) for x in half_dist]
            # total_dist = half_dist + half_dist_mirrored
            # replicate_parameters.append(laplace.fit(total_dist))

            # Parameterize distribution directly: keep the observed median as
            # the location and the Laplace-MLE scale from all non-positive-
            # control guides.
            observation_median = np.median(np.array(df_summary_table.loc[(df_summary_table['sgRNA_Type'] != 'positive_control'), ['Log2FC_Replicate' + str(i)]]).flatten().tolist())
            laplace_loc, laplace_scale = laplace.fit(np.array(df_summary_table.loc[(df_summary_table['sgRNA_Type'] != 'positive_control'), ['Log2FC_Replicate' + str(i)]]).flatten().tolist())
            replicate_parameters.append([observation_median, laplace_scale])

        # Construct many simulated null arrays to perform deconvolution
        beta_distributions_null = crispr_surf_deconvolution_simulations(negative_control_scores = ['laplace', replicate_parameters], sgRNA_indices = sgRNA_indices, perturbation_profile = perturbation_profile, gamma_list = [gamma_chosen], simulations_n = simulation_n, replicates = replicates, guideindices2bin = guideindices2bin, averaging_method = averaging_method, scale = scale)

    elif null_distribution == 'gaussian':
        # Parameterize observed signal with gaussian distribution (assume majority of observation sgRNAs are null)
        for i in range(1, int(replicates) + 1):

            # # Remove distribution skew
            # sorted_nc = sorted(np.array(df_summary_table.loc[(df_summary_table['sgRNA_Type'] != 'positive_control'), ['Log2FC_Replicate' + str(i)]]).flatten().tolist())
            # median_val = np.median(sorted_nc)
            # left_tail_median = np.median(sorted_nc[:int(0.1*len(sorted_nc))])
            # right_tail_median = np.median(sorted_nc[-int(0.1*len(sorted_nc)):])

            # # Left skewed
            # if (median_val - left_tail_median) > (right_tail_median - median_val):
            #     half_dist = [x for x in sorted_nc if x >= median_val]
            # # Right skewed
            # else:
            #     half_dist = [x for x in sorted_nc if x <= median_val]

            # half_dist_mirrored = [(2*median_val - x) for x in half_dist]
            # total_dist = half_dist + half_dist_mirrored
            # replicate_parameters.append(norm.fit(total_dist))

            # Parameterize distribution directly: observed median as location,
            # Gaussian-MLE scale from all non-positive-control guides.
            observation_median = np.median(np.array(df_summary_table.loc[(df_summary_table['sgRNA_Type'] != 'positive_control'), ['Log2FC_Replicate' + str(i)]]).flatten().tolist())
            gaussian_loc, gaussian_scale = norm.fit(np.array(df_summary_table.loc[(df_summary_table['sgRNA_Type'] != 'positive_control'), ['Log2FC_Replicate' + str(i)]]).flatten().tolist())
            replicate_parameters.append([observation_median, gaussian_scale])

        # Construct many simulated null arrays to perform deconvolution
        beta_distributions_null = crispr_surf_deconvolution_simulations(negative_control_scores = ['gaussian', replicate_parameters], sgRNA_indices = sgRNA_indices, perturbation_profile = perturbation_profile, gamma_list = [gamma_chosen], simulations_n = simulation_n, replicates = replicates, guideindices2bin = guideindices2bin, averaging_method = averaging_method, scale = scale)

    # Calculate p-values
    logger.info('Calculating p. values for %s betas ...' % (len(beta_distributions)))
    beta_pvals = []
    if test_type == 'nonparametric':
        for i in range(len(beta_distributions)):
            if (i + 1)%500 == 0:
                logger.info('Calculated p. values for %s out of %s betas ...' % ((i + 1), len(beta_distributions)))
            estimated_beta = beta_distributions[i]
            # null_betas = beta_distributions_null[i]
            # beta_pvals.append(2.0*float(max(0.0, min(sum(x >= estimated_beta for x in null_betas), sum(x <= estimated_beta for x in null_betas))))/float(len(null_betas)))
            # Two-sided empirical p-value: twice the smaller tail proportion
            # of the simulated null betas.
            null_betas = np.array(beta_distributions_null[i])
            beta_pvals.append(2.0 * min((null_betas >= estimated_beta).sum(), (null_betas <= estimated_beta).sum()) / float(len(null_betas)))

    elif test_type == 'parametric':
        for i in range(len(beta_distributions)):
            if (i + 1)%500 == 0:
                logger.info('Calculated p. values for %s out of %s betas ...' % ((i + 1), len(beta_distributions)))
            estimated_beta = beta_distributions[i]
            # Two-sided p-value from a Gaussian fit to the null betas: twice
            # the smaller of the upper/lower tail probabilities.
            null_betas_loc, null_betas_scale = norm.fit(beta_distributions_null[i])
            beta_pvals.append(2.0*float(max(0.0, min([norm(loc = null_betas_loc, scale = null_betas_scale).sf(estimated_beta), 1.0 - norm(loc = null_betas_loc, scale = null_betas_scale).sf(estimated_beta)]))))

    logger.info('Calculated p. values for %s out of %s betas ...' % (len(beta_distributions), len(beta_distributions)))

    # Benjamini-Hochberg FDR correction.
    beta_pvals_adj = multipletests(pvals = beta_pvals, alpha = 0.05, method = 'fdr_bh')[1]

    gammas2betas['p'] = beta_pvals
    gammas2betas['padj'] = beta_pvals_adj

    # Raw p-value whose adjusted p is closest to the requested padj cutoff
    # (pymin/pyabs are presumably the builtin min/abs aliased elsewhere).
    new_p_cutoff = beta_pvals[pymin(range(len(beta_pvals_adj)), key=lambda i: pyabs(beta_pvals_adj[i] - float(padj_cutoffs[0])))]

    # Estimate statistical power
    if estimate_statistical_power == 'yes':
        beta_statistical_power = []
        if scale > 1:
            beta_corrected_effect_size = crispr_surf_statistical_power(sgRNA_indices = guideindices2bin.keys(), gammas2betas = gammas2betas, effect_size = effect_size, gamma_chosen = gamma_chosen, perturbation_profile = perturbation_profile, scale = scale)
        else:
            beta_corrected_effect_size = crispr_surf_statistical_power(sgRNA_indices = sgRNA_indices, gammas2betas = gammas2betas, effect_size = effect_size, gamma_chosen = gamma_chosen, perturbation_profile = perturbation_profile, scale = scale)

        for i in range(len(beta_corrected_effect_size)):
            # shifted_distribution = [x + beta_corrected_effect_size[i] for x in beta_distributions_null[i]]
            # percentile_cutoff = np.percentile(beta_distributions_null[i], (100.0 - float(new_p_cutoff)*100.0/2.0))
            # Power = fraction of the effect-shifted null that clears the
            # two-sided significance cutoff of the unshifted null.
            beta_dist_null = np.array(beta_distributions_null[i])
            shifted_distribution = beta_dist_null + beta_corrected_effect_size[i]
            percentile_cutoff = np.percentile(beta_dist_null, (100.0 - float(new_p_cutoff)*100.0/2.0))
            if (i + 1)%500 == 0:
                logger.info('Calculated statistical power for %s out of %s betas ...' % ((i + 1), len(beta_distributions)))
            # beta_statistical_power.append(float(sum(x >= percentile_cutoff for x in shifted_distribution))/float(len(shifted_distribution)))
            beta_statistical_power.append((shifted_distribution > percentile_cutoff).sum() / float(len(shifted_distribution)))

        gammas2betas['power'] = beta_statistical_power

    return gammas2betas, replicate_parameters