def kde_param_reflection(distro):
    """Locate the mode of a sample using a boundary-reflected KDE.

    The sample is mirrored about its own minimum and maximum so that the
    density estimate does not fall off artificially at the edges.

    NOTE: this version is very susceptible to local maxima.

    Parameters
    ----------
    distro : np.ndarray
        1-D sample; non-finite entries are dropped before fitting.

    Returns
    -------
    dict
        ``'result'``      : float, location of the maximum of the reflected KDE.
        ``'kde'``         : ``KDEUnivariate`` fitted on the (finite) sample.
        ``'kde_reflect'`` : ``interp1d`` of the reflected KDE over the sample
                            range, rescaled to match the plain KDE.
    """
    distro = distro[np.isfinite(distro)]
    MIN, MAX = min(distro), max(distro)
    span = np.linspace(MIN, MAX, 200)

    # Reflect the sample about both ends and staple the pieces together.
    lower = MIN - abs(distro - MIN)
    upper = MAX + abs(distro - MAX)
    merge = np.concatenate([lower, distro, upper])

    # Both estimators deliberately share the bandwidth derived from the
    # un-reflected sample.
    bandwidth = np.std(distro) / 4.
    KDE_MAIN = KDEUnivariate(distro)
    KDE_FULL = KDEUnivariate(merge)
    KDE_MAIN.fit(bw=bandwidth)
    KDE_FULL.fit(bw=bandwidth)

    # Evaluate each density once on the span and reuse the arrays.
    main_density = KDE_MAIN.evaluate(span)
    full_density = KDE_FULL.evaluate(span)

    # The reflected KDE carries roughly three times the mass; the median
    # ratio brings it back onto the scale of the plain KDE.
    scale = np.median(np.divide(main_density, full_density))

    # argmax always yields exactly one starting point; the former boolean
    # mask returned several points when the maximum was tied.
    x0 = span[np.argmax(main_density)]
    result = minimize(lambda x: -1 * KDE_FULL.evaluate(x),
                      x0=x0,
                      method='Powell')  # Powell has been working pretty well.

    # result['x'] may be 0-d or shape (1,); ravel avoids the deprecated
    # implicit conversion of a size-1 array to a Python float.
    return {'result': float(np.ravel(result['x'])[0]),
            'kde': KDE_MAIN,
            'kde_reflect': interp1d(span, full_density * scale)}
def calc_bayes_factor(prior_samples, posterior_samples, x=0):
    '''Return the ratio of prior to posterior KDE density at `x`.

    Each sample set gets its own default-fitted ``KDEUnivariate``; the
    returned scalar is ``prior_density(x) / posterior_density(x)``, so a
    value > 1 means the estimated density at `x` is higher under the prior
    than under the posterior.

    NOTE(review): this was previously described as "BF01" with values > 1
    favouring `x` under the posterior, which is the opposite direction of
    the ratio computed here. Confirm which convention callers expect.
    '''
    def _density_at_x(samples):
        estimator = KDEUnivariate(samples)
        estimator.fit()
        return estimator.evaluate([x])

    prior_density = _density_at_x(prior_samples)
    posterior_density = _density_at_x(posterior_samples)
    ratio = prior_density / posterior_density
    return ratio[0]
def compute_kde(data, test_x):
    """Evaluate a Gaussian KDE (Silverman bandwidth) of `data` at `test_x`.

    Both inputs are flattened first. Returns ``(density, None)``; the second
    element is always ``None``.
    """
    flat_data = data.flatten()
    flat_points = test_x.flatten()

    estimator = KDEUnivariate(flat_data)
    estimator.fit(kernel="gau", bw="silverman")
    return estimator.evaluate(flat_points), None
def kde_1d(signal, x_grid=None):
    """
    Return a 1d kde of a vector signal, evaluated on a grid (Created 01/24/2015)

    The estimate is divided by its own sum, so the returned values add up
    to 1 over the grid (they are NOT multiplied by the bin spacing).

    https://jakevdp.github.io/blog/2013/12/01/kernel-density-estimation/
    http://glowingpython.blogspot.com/2012/08/kernel-density-estimation-with-scipy.html

    Parameters
    ----------
    signal : array-like
        Samples to estimate the density of.
    x_grid : array-like, optional
        Evaluation points; defaults to ``np.linspace(0, 1, 401)``.

    Returns
    -------
    (kde_est, x_grid) : tuple
        Normalized estimate and the grid it was evaluated on.

    Usage
    -----
    >>> x = np.linspace(0,1,401)
    >>> kde, x = tw.kde_1d(signal, x)
    >>> plt.plot(x, kde)
    >>> plt.grid('on')
    """
    from statsmodels.nonparametric.kde import KDEUnivariate

    estimator = KDEUnivariate(signal)
    estimator.fit()

    if x_grid is None:
        x_grid = np.linspace(0, 1, 401)

    density = estimator.evaluate(x_grid)
    # normalize so the values sum to one over the grid
    density = density / density.sum()
    return density, x_grid
def gen_kde_pdf(distribution, bounds=None, kde_width=None):
    """Build a boundary-corrected KDE of `distribution`, normalized on its range.

    The sample is reflected about the lower and upper limits before fitting,
    and the resulting density is rescaled so that it integrates to one
    between those limits.

    Parameters
    ----------
    distribution : np.ndarray
        Sample values.
    bounds : sequence of two numbers, optional
        ``(lower, upper)``. When given, only values strictly inside the
        bounds are kept. When omitted, the sample min/max are used.
    kde_width : float, optional
        KDE bandwidth. Defaults to ``S_MAD(distribution) / 2``.

    Returns
    -------
    callable
        ``f(X)`` evaluating the rescaled KDE at `X`.
    """
    # `is None` rather than `== None`: an array-like `bounds` would turn the
    # equality test into an element-wise comparison.
    if bounds is None:
        print("\t setting bounds to max value")
        var_min, var_max = min(distribution), max(distribution)
    else:
        var_min, var_max = bounds[0], bounds[1]
        inside = (distribution > var_min) & (distribution < var_max)
        distribution = distribution[np.where(inside)]

    # boundary correction: mirror the sample about both limits
    lower = var_min - abs(distribution - var_min)
    upper = var_max + abs(distribution - var_max)
    merge = np.concatenate([lower, upper, distribution])

    if kde_width is None:
        print("... setting kde_width")
        kde_width = S_MAD(distribution) / 2.

    KDE_MERGE = KDEUnivariate(merge)
    KDE_MERGE.fit(bw=kde_width)

    # Renormalize so the density integrates to one on [var_min, var_max].
    SCALE = np.divide(1., integrate.quad(KDE_MERGE.evaluate, var_min, var_max)[0])
    return lambda X: SCALE * KDE_MERGE.evaluate(X)
def fit_kde(x, grid):
    """Fit a default ``KDEUnivariate`` to `x` and evaluate it on `grid`.

    Parameters
    ----------
    x : array-like
        Sample values.
    grid : array-like
        Points at which the density is evaluated.

    Returns
    -------
    np.ndarray
        Density estimate at each grid point.
    """
    # The former zero-array pre-allocation was dead code: it was overwritten
    # by the evaluate() result without ever being read.
    kde = KDEUnivariate(x)
    kde.fit()
    return kde.evaluate(grid)
def gaussian_density_estimation(samples, weights, grid, bw=0.1):
    """
    Weighted kernel density estimate evaluated on a grid.

    Parameters
    ----------
    samples : np.ndarray
        Array of sample values.
    weights : np.ndarray
        Array of sample weights. If None, an unweighted KDE is performed.
    grid : np.ndarray
        Grid points at which the KDE function should be evaluated.
    bw : float
        Bandwidth passed to the KDE fit (sigma in the case of a Gaussian
        kernel).

    Returns
    -------
    np.ndarray
        The probability density values at the supplied grid points.
    """
    # fft=False is passed explicitly because weights are supplied to the fit
    estimator = KDEUnivariate(samples)
    estimator.fit(weights=weights, bw=bw, fft=False)

    # pdf at the grid points, for use in SGOOP
    # TODO: area under curve between points instead of pdf at point
    density_on_grid = estimator.evaluate(grid)
    return density_on_grid
def reweight(rc, metad_traj, cv_columns, v_minus_c_col, rc_bins=20, kt=2.5):
    """
    Reweight a biased MD trajectory to an unbiased probability along a given
    reaction coordinate, using the rbias column from COLVAR per Tiwary and
    Parrinello.

    Parameters
    ----------
    rc : array-like
        Coefficients combining the collective variables into the reaction
        coordinate (multiplied element-wise with each frame and summed).
    metad_traj : DataFrame-like
        Trajectory table indexed by column name.
    cv_columns : list
        Columns of `metad_traj` holding the collective variables.
    v_minus_c_col : str
        Column holding the bias term used in the exponent of the weights.
    rc_bins : int
        Number of grid points.
    kt : float
        Divisor of the bias term in the weight exponent.

    Returns
    -------
    (pdf, grid) : tuple of np.ndarray
        KDE values on the grid, normalized to sum to one, and the grid.
    """
    colvar = metad_traj[cv_columns].values
    v_minus_c = metad_traj[v_minus_c_col].values

    # reaction-coordinate observable for each frame
    colvar_rc = np.sum(colvar * rc, axis=1)

    # frame weights, per Tiwary and Parrinello, JPCB 2015 (c(t) method)
    weights = np.exp(v_minus_c / kt)
    norm_weights = weights / weights.sum()

    # weighted KDE with statsmodels
    kde = KDEUnivariate(colvar_rc)
    kde.fit(weights=norm_weights, bw=0.05, fft=False)

    # The grid must span the projected coordinate. The upper limit used to be
    # taken from the raw CV matrix (`colvar.max()`), which is on a different
    # scale than `colvar_rc`.
    grid = np.linspace(colvar_rc.min(), colvar_rc.max(), num=rc_bins)
    pdf = kde.evaluate(grid)
    pdf = pdf / pdf.sum()

    return pdf, grid
def get_kde(self, forecast_data, bandwidth=None):
    """Return a ``KDEUnivariate`` fitted to `forecast_data`.

    The bandwidth used is `bandwidth`, unless it is ``None`` or smaller than
    the value returned by ``bw_silverman(forecast_data)``, in which case the
    latter is used.
    """
    kde = KDEUnivariate(forecast_data)
    silverman_bw = bw_silverman(forecast_data)
    # never fit with a bandwidth below the Silverman value
    if bandwidth is None or bandwidth < silverman_bw:
        kde.fit(bw=silverman_bw)
    else:
        kde.fit(bw=bandwidth)
    return kde
    # NOTE(review): the statements below follow the `return` and reference
    # names (noise_sigma, y_steps, kde_ax, c_kde) that are not defined in this
    # method. They look like a fragment of a plotting routine; as written
    # they are unreachable. Confirm where they belong.
    if noise_sigma is not None and noise_sigma>silverman_bw:
        kde_obs=KDEUnivariate(forecast_data)
        kde_obs.fit(bw=noise_sigma)
        kde_obs = kde_obs.evaluate(y_steps)
        kde_ax.plot(kde_obs, y_steps, c=c_kde, ls='-')
def kde_statsmodels_u(self, x_grid, bandwidth=0.2, **kwargs):
    """Univariate Kernel Density Estimation with Statsmodels.

    Fits a ``KDEUnivariate`` on ``self.data`` with bandwidth `bandwidth`
    (extra keyword arguments are forwarded to ``fit``) and returns the
    density evaluated at `x_grid`.
    """
    from statsmodels.nonparametric.kde import KDEUnivariate

    estimator = KDEUnivariate(self.data)
    estimator.fit(bw=bandwidth, **kwargs)
    density = estimator.evaluate(x_grid)
    return density
def draw_hist_and_kde(sample, grid, true_pdf):
    """Plot a normalized histogram, a KDE and the true density of a sample.

    Parameters
    ----------
    sample : array-like
        Observed values.
    grid : np.ndarray
        Points at which the KDE and `true_pdf` are drawn; its min/max also
        set the histogram range.
    true_pdf : callable
        Function returning the true density at the grid points.
    """
    # histogram -- `density=True` replaces the `normed=True` keyword, which
    # has been removed from matplotlib's hist().
    plt.hist(sample, 20, range=(grid.min(), grid.max()),
             density=True, label='histogram')

    # kernel density estimate
    kernel_density = KDEUnivariate(sample)
    kernel_density.fit()
    plt.plot(grid, kernel_density.evaluate(grid),
             color='green', linewidth=2, label='kde')

    # true density
    plt.plot(grid, true_pdf(grid),
             color='red', linewidth=2, alpha=0.3, label='true pdf')

    plt.legend()
    plt.show()
def calcKDE(kd_bw=0.1):
    """Evaluate a fixed-bandwidth KDE of ``nao_rn`` at the points ``x_kde``.

    Both ``nao_rn`` and ``x_kde`` are taken from the enclosing scope; only
    the bandwidth `kd_bw` is a parameter.
    """
    # KDE using StatsModels
    estimator = KDEUnivariate(nao_rn)
    estimator.fit(bw=kd_bw)
    density = estimator.evaluate(x_kde)
    return density
def fit(self, data):
    """Store summary statistics of `data` and a fitted KDE; return ``self``.

    Sets ``self.min``, ``self.max``, ``self.mean``, ``self.std`` and
    ``self.dist`` (a default-fitted ``KDEUnivariate``).
    """
    self.min, self.max = np.min(data), np.max(data)
    self.mean, self.std = np.mean(data), np.std(data)

    estimator = KDEUnivariate(data)
    self.dist = estimator
    estimator.fit()
    return self
def weighted_kernel_density_1d(values, weights, bw='silverman', plot=False):
    """Fit a weighted ``KDEUnivariate`` to `values` and return it.

    Parameters
    ----------
    values : array-like
        Sample values.
    weights : array-like
        Per-sample weights passed to the fit (``fft=False``).
    bw : str or float
        Bandwidth specification forwarded to ``fit``.
    plot : bool
        When true, the density is drawn point by point over the fitted
        support with matplotlib.

    Returns
    -------
    KDEUnivariate
        The fitted estimator.
    """
    from statsmodels.nonparametric.kde import KDEUnivariate

    estimator = KDEUnivariate(values)
    estimator.fit(weights=weights, bw=bw, fft=False)

    if not plot:
        return estimator

    import matplotlib.pyplot as plt
    support = estimator.support
    heights = [estimator.evaluate(point) for point in support]
    plt.plot(support, heights, 'o-')
    return estimator
def find_outiers_kde(x):
    """Flag values of `x` lying in low-density regions of a KDE.

    The values are converted to float and passed through ``scale``; a KDE
    (Scott bandwidth) is fitted on the scaled values and evaluated at them.
    Points whose density is below 0.5 are reported, lowest density first.

    Returns
    -------
    (outlierindices, outliervalue) : tuple of np.ndarray
        Indices into `x` and the corresponding original values.
    """
    x_scaled = scale([float(value) for value in x])

    estimator = KDEUnivariate(x_scaled)
    estimator.fit(bw="scott", fft=True)
    density = np.asarray(estimator.evaluate(x_scaled))

    # the n lowest-density points are exactly those below the threshold
    n_low = np.count_nonzero(density < 0.5)
    low_density_idx = density.argsort()[:n_low]
    return low_density_idx, np.asarray(x)[low_density_idx]
def normalize_data(img, contrast='T1'):
    '''
    Normalizes 3D images via KDE and clamping.

    A KDE of the non-zero voxel intensities (up to their 99th percentile) is
    computed and its local maxima are located. The image is divided by the
    position of a reference peak, clamped, and rescaled by its maximum.

    Params:
        - img: 3D image (np.ndarray)
        - contrast: 'T1' uses the last (highest-intensity) peak and clamps at
          1.25; anything else uses the tallest peak and clamps at 3.5
    Returns:
        - normalized image; an image without any non-zero voxel is returned
          unchanged
    '''
    T1_CLAMP_VALUE = 1.25
    T2_CLAMP_VALUE = 3.5
    GRID_SIZE = 80

    if np.count_nonzero(img) == 0:
        return img

    # Use the non-zero INTENSITIES. The previous code took
    # `np.nonzero(img.flatten())`, which yields the positions of the non-zero
    # voxels, so the KDE was fitted on array indices.
    flat = np.asarray(img, dtype=float).ravel()
    voxels = flat[flat != 0]
    q = np.percentile(voxels, 99.)
    voxels = voxels[voxels <= q]

    bw = float(q) / GRID_SIZE
    kde = KDEUnivariate(voxels)
    kde.fit(kernel='gau', bw=bw, gridsize=GRID_SIZE, fft=True)

    density = 100. * kde.density
    support = kde.support

    # local maxima of the density curve
    peak_idx = argrelextrema(density, np.greater)[0]
    peak_heights = density[peak_idx]
    peak_positions = support[peak_idx]

    if contrast == 'T1':
        reference = peak_positions[-1]
        clamp = T1_CLAMP_VALUE
    else:
        # first of the tallest peaks
        reference = peak_positions[np.argmax(peak_heights)]
        clamp = T2_CLAMP_VALUE

    normalized_img = img / reference
    normalized_img[normalized_img > clamp] = clamp
    normalized_img /= normalized_img.max()
    return normalized_img
def bootstrap_stats(args: Dict[str, Any],
                    out_q: Optional[mp.Queue] = None) -> Union[None, Dict[str, Any]]:
    r'''
    Computes statistics and KDEs of data via sampling with replacement

    Arguments:
        args: dictionary of arguments. Possible keys are:
            data - data to resample
            name - name prepended to returned keys in result dict
            weights - array of weights matching length of data to use for weighted resampling
            n - number of times to resample data (default 100)
            x - points at which to compute the kde values of resample data
            kde - whether to compute the kde values at x-points for resampled data
            mean - whether to compute the means of the resampled data
            std - whether to compute standard deviation of resampled data
            c68 - whether to compute the width of the absolute central 68.2 percentile of the resampled data
            The dictionary is only read; it is not modified.
        out_q: if using multiprocessing can place result dictionary in provided queue

    Returns:
        Result dictionary if `out_q` is `None` else `None`.
    '''
    # Read options with .get() so the caller's dictionary is left untouched
    # (defaults used to be written back into `args`).
    name = args.get('name', '')
    weights = args.get('weights')
    n_resamples = args.get('n', 100)
    want_kde = args.get('kde', False)
    want_mean = args.get('mean', False)
    want_std = args.get('std', False)
    want_c68 = args.get('c68', False)

    data = args['data']
    # the KDE is fitted on float64 data
    if want_kde and data.dtype != 'float64':
        data = np.array(data, dtype='float64')
    len_d = len(data)

    # Re-seed from fresh entropy on every call.
    # NOTE(review): presumably so that worker processes do not share the
    # parent's RNG state -- confirm.
    np.random.seed()

    mean, std, c68, boot = [], [], [], []
    for _ in range(n_resamples):
        points = np.random.choice(data, len_d, replace=True, p=weights)
        if want_kde:
            kde = KDEUnivariate(points)
            kde.fit()
            boot.append([kde.evaluate(x) for x in args['x']])
        if want_mean:
            mean.append(np.mean(points))
        if want_std:
            std.append(np.std(points, ddof=1))
        if want_c68:
            c68.append(np.percentile(np.abs(points), 68.2))

    out_dict = {}
    if want_kde:
        out_dict[f'{name}_kde'] = boot
    if want_mean:
        out_dict[f'{name}_mean'] = mean
    if want_std:
        out_dict[f'{name}_std'] = std
    if want_c68:
        out_dict[f'{name}_c68'] = c68

    if out_q is not None:
        out_q.put(out_dict)
        return None
    return out_dict
def get_kde(cnv_name, test_name, data, n_points=1000):
    """Compute KDEs of `test_name` for the rows where `cnv_name` is 0 and 1.

    Parameters
    ----------
    cnv_name : str
        Column used to split `data` into two groups (values 0 and 1).
    test_name : str
        Column whose values are modelled; missing values are dropped.
    data : DataFrame
        Table holding both columns.
    n_points : int
        Not used by this function.

    Returns
    -------
    tuple
        ``(y01, y02, x1, y1, x2, y2, d_min, d_max)``: zero baselines matching
        each density, support and non-negative density for each group, and
        the overall min / max of the pooled values.
    """
    def _group_values(flag):
        return data.loc[data[cnv_name] == flag, test_name].dropna().values

    def _support_and_density(sample):
        estimator = KDEUnivariate(sample)
        estimator.fit(kernel='gau', bw='scott', fft=True, gridsize=100, cut=3)
        return estimator.support, estimator.density

    dist1 = _group_values(0)
    dist2 = _group_values(1)

    pooled = np.hstack((dist1, dist2))
    d_min = np.min(pooled)
    d_max = np.max(pooled)

    x1, y1 = _support_and_density(dist1)
    x2, y2 = _support_and_density(dist2)

    y01 = np.zeros(y1.shape[0])
    y02 = np.zeros(y2.shape[0])

    # Make sure the densities are nonnegative
    y1 = np.maximum(np.zeros_like(y1), y1)
    y2 = np.maximum(np.zeros_like(y2), y2)

    return y01, y02, x1, y1, x2, y2, d_min, d_max
def kde_param(distribution, x0):
    """Find a maximum of the KDE of `distribution`, starting from `x0`.

    kde_param tries to ensure correct handling of multimodal distributions:
    the caller chooses which mode is located through the starting guess.

    Parameters
    ----------
    distribution : array-like
        Sample values; the KDE bandwidth is ``np.std(distribution) / 3``.
    x0 : float
        Starting point for the optimizer.

    Returns
    -------
    dict
        ``'result'``: float location of the maximum found;
        ``'kde'``: the fitted ``KDEUnivariate``.
    """
    KDE = KDEUnivariate(distribution)
    KDE.fit(bw=np.std(distribution) / 3.0)

    # maximize the density by minimizing its negative
    result = scipy.optimize.minimize(lambda x: -1 * KDE.evaluate(x),
                                     x0=x0,
                                     method='Powell')  # Powell has been working pretty well.

    # result['x'] may be 0-d or shape (1,); ravel avoids the deprecated
    # implicit conversion of a size-1 array to a Python float.
    return {'result': float(np.ravel(result['x'])[0]),
            'kde': KDE}
def kernel_weighted_samples(x, color, x_grid, **kwargs):
    """Univariate Kernel Density Estimation with Statsmodels, plotted.

    Fits a ``KDEUnivariate`` on `x` (keyword arguments such as kernel, bw,
    fft, weights, gridsize are forwarded to ``fit``), draws the density over
    `x_grid` as a line with a shaded area underneath, and returns
    ``(pdf, x_grid)``.
    """
    from statsmodels.nonparametric.kde import KDEUnivariate
    import matplotlib.pyplot as plt

    estimator = KDEUnivariate(x)
    estimator.fit(**kwargs)
    density = estimator.evaluate(x_grid)

    plt.plot(x_grid, density, color=color, alpha=0.5, lw=3)
    plt.fill_between(x_grid, density, where=None, color=color, alpha=0.2)
    return density, x_grid
def kde_hist_weight(data, xra, nbin=50, bandwidth=None, density=False,
                    weights=None, err=None, mirror=False, cdf=False):
    """Histogram-like (optionally weighted) KDE evaluated on a regular grid.

    Parameters
    ----------
    data : np.ndarray
        Sample values; non-finite entries are dropped.
    xra : (float, float)
        ``(xmin, xmax)`` range of the evaluation grid.
    nbin : int
        Number of grid points.
    bandwidth : float, optional
        KDE bandwidth; ``'normal_reference'`` is used when omitted.
    density : bool
        If false, the density is multiplied by the total weight (the number
        of samples when unweighted) so it reads like counts.
    weights : array-like, optional
        Per-sample weights. When they line up with `data` they are filtered
        and mirrored together with it.
    err :
        Not used by this function.
    mirror : bool
        Reflect the samples within 0.3 of `xmin` about `xmin` before fitting.
    cdf : bool
        Also return the integral of the KDE from `xmin` to each grid point.

    Returns
    -------
    (result, result_x) or (result, result_x, cdf_values)
    """
    finite = np.isfinite(data)
    # Keep weights aligned with the samples that survive the finite filter.
    # Weights that already have a different length are left as supplied.
    if weights is not None and len(weights) == len(data):
        weights = np.asarray(weights)[finite]
    data = data[finite]

    xmin, xmax = xra
    if mirror:
        near_edge = data < xmin + 0.3
        # mirrored samples carry the weight of the sample they reflect
        if weights is not None and len(weights) == len(data):
            weights = np.append(weights, np.asarray(weights)[near_edge])
        data = np.append(data, 2.0 * xmin - data[near_edge])

    x_plot = np.linspace(xmin, xmax, nbin)
    kde_est = KDEUnivariate(data)

    # the FFT path is only used for the unweighted estimate
    fft_opt = weights is None
    weights_sum = len(data) * 1.0 if weights is None else np.sum(weights)

    bw_in = bandwidth if bandwidth is not None else 'normal_reference'
    kde_est.fit(bw=bw_in, weights=weights, fft=fft_opt)

    result = kde_est.evaluate(x_plot)
    if not density:
        result = result * weights_sum

    if not cdf:
        return result, x_plot

    def _pdf(x):
        # scalar in, scalar out for the integrator
        return kde_est.evaluate([x])[0]

    # The `cdf` flag is no longer overwritten by its own result.
    cdf_values = np.array([quad(_pdf, xmin, xx)[0] for xx in x_plot])
    return result, x_plot, cdf_values
def kde_param(distribution, x0):
    """Find a maximum of the KDE of `distribution`, starting from `x0`.

    Parameters
    ----------
    distribution : array-like
        Sample values; the KDE bandwidth is ``np.std(distribution) / 3``.
    x0 : float
        Starting point for the optimizer.

    Returns
    -------
    dict
        ``'result'``: float location of the maximum found;
        ``'kde'``: the fitted ``KDEUnivariate``.
    """
    # compute kernel density estimation
    KDE = KDEUnivariate(distribution)
    KDE.fit(bw=np.std(distribution) / 3.0)

    # maximize the density by minimizing its negative
    result = scipy.optimize.minimize(lambda x: -1 * KDE.evaluate(x),
                                     x0=x0,
                                     method='Powell')

    # result['x'] may be 0-d or shape (1,); ravel avoids the deprecated
    # implicit conversion of a size-1 array to a Python float.
    return {'result': float(np.ravel(result['x'])[0]),
            'kde': KDE}
def sample_pdf(catalog, parameter, pdf_fun, params, bounds):
    """Accept-reject sample `catalog` so `parameter` follows `pdf_fun`.

    Parameters
    ----------
    catalog : pd.DataFrame
        Input catalog with an arbitrary distribution of `parameter`.
    parameter : str
        Column to resample on.
    pdf_fun : callable
        Desired distribution, called as ``pdf_fun(values, *params)``.
    params : sequence
        Extra positional arguments for `pdf_fun`.
    bounds : (float, float)
        Range of `parameter` to keep (both ends inclusive).

    Returns
    -------
    pd.DataFrame
        Copy of the accepted rows that lie within `bounds`.
    """
    values = catalog[parameter]
    param_span = np.linspace(min(values), max(values), 100)

    print("... determine master KDE")
    KDE = KDEUnivariate(values)
    KDE.fit(bw=np.std(values) / 3)
    KDE_FUN = interp1d(param_span, KDE.evaluate(param_span))

    # need to rescale within the bounds: normalize the KDE on [bounds]
    inside = (param_span > bounds[0]) & (param_span < bounds[1])
    NORM = np.divide(
        1.,
        integrate.quad(KDE.evaluate, bounds[0], bounds[1],
                       points=param_span[np.where(inside)],
                       limit=200)[0])

    # we need the scale from the other function
    result, _ = determine_scale(catalog, parameter, pdf_fun, params, bounds=bounds)

    sample = (np.random.uniform(0.0, 1.0, len(catalog))
              * len(catalog) * NORM * KDE_FUN(values))
    boo_array = sample < result['x'] * pdf_fun(values, *params)

    # `between` includes both end points by default; the boolean
    # `inclusive=True` spelling is rejected by pandas >= 2.0.
    within_bounds = values.between(bounds[0], bounds[1])
    return catalog[boo_array & within_bounds].copy()
def uniform_kde_sample(frame, variable, bounds, p_scale=0.7, cut=True):
    """Accept-reject sample `frame` to homogenize the distribution of `variable`.

    A boundary-reflected KDE of `variable` is scaled to counts; rows are
    kept when a uniform draw times the local density falls below the
    `p_scale` percentile of the density inside `bounds`. The accepted rows
    are returned in random order.

    Parameters
    ----------
    frame : pd.DataFrame
        Input table.
    variable : str
        Column to homogenize. ``'TEFF'`` uses a KDE width of 100, anything
        else 0.15.
    bounds : (float, float)
        Range over which the acceptance threshold is computed.
    p_scale : float
        Fraction (0-1) converted to the percentile used as threshold.
    cut : bool
        Not used by this function.
    """
    print("... uniform_kde_sample")
    kde_width = 100 if variable == 'TEFF' else 0.15

    ### Basics
    column = frame[variable]
    var_min, var_max = min(column), max(column)
    distro = np.array(column)

    ### Handle boundary solution: mirror about both ends
    reflected = np.concatenate([var_min - abs(distro - var_min),
                                var_max + abs(distro - var_max),
                                distro])

    ### KDE
    KDE_MERGE = KDEUnivariate(reflected)
    KDE_MERGE.fit(bw=kde_width)

    #### interpolate KDE_MERGE for computation speed
    span = np.linspace(var_min, var_max, 100)
    KDE_FUN = interp1d(span, KDE_MERGE.evaluate(span))

    ### Rescale the KDE so it integrates to the number of rows
    area = integrate.quad(KDE_MERGE.evaluate, var_min, var_max)[0]
    full_c = len(distro) / area

    ### threshold from within the bounds only, to avoid being penalized for
    ### low counts outside the variable range
    respan = np.linspace(bounds[0], bounds[1], 100)
    scale = np.percentile(KDE_MERGE.evaluate(respan) * full_c, p_scale * 100.)

    ### Accept-Reject sampling
    draws = np.random.uniform(0, 1, len(distro)) * KDE_FUN(distro) * full_c
    selection = frame.iloc[draws < scale].copy()

    order = np.random.permutation(len(selection))
    return selection.iloc[order].copy()
def reweight(rc, metad_traj, cv_columns, v_minus_c_col, rc_bins=20, kt=2.5, kde=False):
    """
    Reweight a biased MD trajectory to an unbiased probability along a given
    reaction coordinate, using the rbias column from COLVAR per Tiwary and
    Parrinello.

    Parameters
    ----------
    rc : array-like
        Coefficients combining the collective variables into the reaction
        coordinate (multiplied element-wise with each frame and summed).
    metad_traj : DataFrame-like
        Trajectory table indexed by column name.
    cv_columns : list
        Columns of `metad_traj` holding the collective variables.
    v_minus_c_col : str
        Column holding the bias term used in the exponent of the weights.
    rc_bins : int
        Number of grid points (KDE) or histogram bins.
    kt : float
        Divisor of the bias term in the weight exponent.
    kde : bool
        If true, use a weighted KDE (fine-grained optimization); otherwise a
        weighted histogram density (coarse optimization).

    Returns
    -------
    (pdf, grid) : tuple of np.ndarray
        Density values and the points they refer to. For the histogram the
        points are the bin centers.
    """
    colvar = metad_traj[cv_columns].values
    v_minus_c = metad_traj[v_minus_c_col].values

    # reaction-coordinate observable for each frame
    colvar_rc = np.sum(colvar * rc, axis=1)

    # frame weights, per Tiwary and Parrinello, JPCB 2015 (c(t) method)
    weights = np.exp(v_minus_c / kt)
    norm_weights = weights / weights.sum()

    rc_min, rc_max = colvar_rc.min(), colvar_rc.max()

    if kde:
        # KDE for fine-grained optimization (separate name so the boolean
        # `kde` argument is not rebound to the estimator)
        estimator = KDEUnivariate(colvar_rc)
        estimator.fit(weights=norm_weights, bw=0.1, fft=False)

        # evaluate pdf on a grid for use in SGOOP
        # TODO: area under curve between points instead of pdf at point
        grid = np.linspace(rc_min, rc_max, num=rc_bins)
        return estimator.evaluate(grid), grid

    # histogram density for coarse optimization
    hist, bin_edges = np.histogram(colvar_rc, weights=norm_weights,
                                   bins=rc_bins, density=True,
                                   range=(rc_min, rc_max))

    # Grid points at the CENTER of the bins: left edge plus half a bin width.
    # Adding a full width, as before, produced the right-hand edges.
    bin_width = bin_edges[1] - bin_edges[0]
    grid = bin_edges[:-1] + bin_width / 2.
    return hist, grid
def mimic_arviz_posterior(context: ParameterContext, state: SequentialAlgorithmState,
                          num_cols: int = 3, ax: Axes = None, **kwargs) -> Axes:
    """
    Helper function for mimicking arviz plotting functionality: draws one
    weighted KDE per parameter, one parameter per subplot.

    Args:
        context: parameter context to plot.
        state: associated state; supplies the normalized weights.
        num_cols: the number of columns.
        ax: pre-defined axes to use. When omitted a grid with enough rows to
            hold every parameter is created.
        **kwargs: forwarded to each ``plot`` call.

    Returns:
        The axes that were drawn on (as passed in, or as created).
    """
    parameters = context.parameters

    if ax is None:
        # Ceiling division: floor division left parameters without a subplot
        # whenever their count was not a multiple of `num_cols`, and asked
        # for zero rows when there were fewer parameters than columns.
        num_rows = max(1, -(-len(parameters) // num_cols))
        _, ax = plt.subplots(num_rows, num_cols)

    w = state.normalized_weights().cpu().numpy()

    # atleast_1d so a single Axes (1x1 grid) can be flattened as well
    flat_axes = np.atleast_1d(ax).ravel()

    used = 0
    for ax_, (p, v) in zip(flat_axes, parameters.items()):
        v_numpy = v.cpu().numpy()

        kde = KDEUnivariate(v_numpy)
        kde.fit(weights=w, fft=False)

        x_linspace = np.linspace(v_numpy.min(), v_numpy.max(), 250)
        ax_.plot(x_linspace, kde.evaluate(x_linspace), **kwargs)

        for side in ("top", "right", "left"):
            ax_.spines[side].set_visible(False)
        ax_.axes.get_yaxis().set_visible(False)
        ax_.set_title(p)
        used += 1

    # switch off the subplots that did not receive a parameter
    for ax_ in flat_axes[used:]:
        ax_.axis("off")

    return ax
def md_prob(rc, max_cal_traj, rc_bins, bandwidth=0.02, **storage_dict):
    """Calculate the probability along a given reaction coordinate.

    Parameters
    ----------
    rc : array-like
        Coefficients multiplied element-wise with each row of the trajectory
        and summed to give the projection.
    max_cal_traj : DataFrame-like
        Trajectory; its ``.values`` are projected.
    rc_bins : int
        Number of grid points between the min and max of the projection.
    bandwidth : float
        KDE bandwidth.
    **storage_dict
        Optional ``prob_list``: a list to which the computed probabilities
        are appended.

    Returns
    -------
    (prob, grid) : tuple of np.ndarray
        KDE values (not renormalized) and the grid they were evaluated on.
    """
    data_array = max_cal_traj.values
    proj = np.sum(data_array * rc, axis=1)

    # probability from a statsmodels KDE
    kde = KDEUnivariate(proj)
    kde.fit(bw=bandwidth)

    grid = np.linspace(proj.min(), proj.max(), num=rc_bins)
    prob = kde.evaluate(grid)

    # .get(): `prob_list` is optional; indexing raised KeyError when the
    # keyword was not supplied.
    prob_list = storage_dict.get('prob_list')
    if prob_list is not None:
        prob_list.append(prob)

    return prob, grid
def empiricalPDF(data):
    """
    Evaluate a probability density function using kernel density
    estimation for input data.

    The data are sorted, a default KDE is fitted on them, and the density
    is evaluated at the sorted values. If the evaluation runs out of
    memory, zeros are returned instead.

    :param data: :class:`numpy.ndarray` of data values.

    :returns: PDF values at the (sorted) data points.
    """
    LOG.debug("Calculating empirical PDF")

    ordered = np.sort(data)
    estimator = KDEUnivariate(ordered)
    estimator.fit()

    try:
        return estimator.evaluate(ordered)
    except MemoryError:
        return np.zeros(len(ordered))
def kde(data, kernel='gau', bw='scott', gridsize=None, cut=3,
        clip=(-np.inf, np.inf), cumulative=False):
    """Compute a univariate kernel density estimate using statsmodels.

    The FFT path is requested only for the Gaussian kernel. Returns
    ``(grid, y)`` where `grid` is the fitted support and `y` is the density
    (or the cdf when `cumulative` is true), floored at zero.
    """
    use_fft = kernel == "gau"

    estimator = KDEUnivariate(data)
    estimator.fit(kernel, bw, use_fft, gridsize=gridsize, cut=cut, clip=clip)

    grid = estimator.support
    values = estimator.cdf if cumulative else estimator.density

    # Make sure the density is nonnegative
    values = np.maximum(np.zeros_like(values), values)
    return grid, values
def PSTH(spike_times, bw_psth=BW_PSTH, mirror=False, trial_time=None,
         norm=True, trial_duration=2.5, **kwargs):
    """Build a KDE-based function of time from per-trial spike times.

    Parameters
    ----------
    spike_times : sequence
        One entry per trial; flattened with ``flatten`` before fitting.
    bw_psth : float or None
        KDE bandwidth; ``None`` uses the estimator's default.
    mirror : bool
        Reflect the spikes about both ends of `trial_time` before fitting.
    trial_time : (float, float), optional
        Start and end of the trial; defaults to the min / max spike time.
    norm : bool
        If true, scale the density by
        ``total_spikes / (n_trials * integral of the KDE over trial_time)``.
    trial_duration, **kwargs
        Not used by this function.

    Returns
    -------
    callable
        ``f(x)`` giving the (scaled) density at the scalar time `x`.
    """
    n_trials = len(spike_times)
    flat_spikes = flatten(spike_times)
    n_spikes = len(flat_spikes)

    if trial_time is None:
        trial_time = (numpy.min(flat_spikes), numpy.max(flat_spikes))
    t_start, t_end = trial_time[0], trial_time[1]

    if mirror:
        below = -1 * flat_spikes + 2 * t_start
        above = -1 * flat_spikes + 2 * t_end
        flat_spikes = numpy.hstack((below, flat_spikes, above))

    kde = KDEUnivariate(flat_spikes)
    if bw_psth is None:
        kde.fit()
    else:
        kde.fit(bw=bw_psth)

    def _density_at(x):
        return kde.evaluate([x])[0]

    if norm:
        area = quad(_density_at, t_start, t_end)[0]
        pre_factor = n_spikes / (n_trials * area)
    else:
        pre_factor = 1.

    return lambda x: pre_factor * _density_at(x)