def _compute_joint_kde(self, *nodes, normref=True):
    """Fit a joint continuous KDE over ``nodes`` and cache it.

    The per-node samples are read from ``self.node_data.info[node]['data']``
    and the fitted estimator is stored in ``self.kdes_joint`` keyed by the
    ``nodes`` tuple. The elapsed fitting time is printed.

    :param nodes: node identifiers whose data are modelled jointly.
    :param normref: when true use the ``'normal_reference'`` bandwidth rule;
        otherwise use ``'cv_ml'`` with ``EstimatorSettings(efficient=True)``.
    """
    samples = [self.node_data.info[name]['data'] for name in nodes]
    started = time.time()

    # Collect the constructor arguments first so there is a single call site.
    kde_options = {'data': samples, 'var_type': 'c' * len(nodes)}
    if normref:
        kde_options['bw'] = 'normal_reference'
    else:
        kde_options['bw'] = 'cv_ml'
        kde_options['defaults'] = EstimatorSettings(efficient=True)
    fitted = KDEMultivariate(**kde_options)

    print("Fit joint KDE for %s in %s seconds"
          % (nodes, time.time() - started))
    self.kdes_joint[nodes] = fitted
def plot_kde(dist_code, num_obvs, sample_no, **kwargs):
    """Fit a cross-validated KDE to one sample and plot it to a PNG file.

    :param dist_code: distribution code; the special value ``'geyser'`` loads
        the geyser data set via ``read_geyser``/``transform`` and overrides
        ``num_obvs`` and ``sample_no``.
    :param num_obvs: expected number of observations in the sample.
    :param sample_no: index of the sample to load.
    :param kwargs: optional ``contour`` flag; when truthy a contour plot is
        drawn, otherwise (including when the flag is omitted) the default
        KDE plot is drawn.
    :return: None
    """
    if dist_code == 'geyser':
        data = read_geyser()
        data, _inverse = transform(data)  # inverse transform is not needed here
        num_obvs = data.shape[0]
        sample_no = 1
        dist = 'geyser'
    else:
        dist = dist_from_code(dist_code)
        source = sample_name(dist_code, num_obvs, sample_no)
        data = read_data(source)
        assert data.shape[0] == num_obvs

    kde = KDEMultivariate(data, 'c' * data.shape[1], bw='cv_ml')
    png_file = png_name(dist_code, num_obvs, sample_no, 'kde')

    # .get() instead of indexing: a missing 'contour' flag used to raise
    # KeyError; it now simply selects the non-contour plot.
    plotter = do_kde_contour if kwargs.get('contour') else do_plot_kde
    plotter(kde, png_file, dist, None)
def kde_statsmodels_m(x, x_grid, bandwidth=0.2, **kwargs):
    """Multivariate Kernel Density Estimation with Statsmodels.

    Fits a single-variable continuous (``var_type='c'``) ``KDEMultivariate``
    to ``x`` with a fixed bandwidth and evaluates its density on ``x_grid``.

    :param x: training observations.
    :param x_grid: points at which the density is evaluated.
    :param bandwidth: fixed kernel bandwidth.
    :param kwargs: forwarded to ``KDEMultivariate``.
    :return: density values returned by ``KDEMultivariate.pdf``.
    """
    # A user-specified bandwidth holds one entry per *variable*, not one per
    # observation; the previous ``bandwidth * np.ones_like(x)`` produced an
    # array as long as the data.
    bw = np.atleast_1d(np.asarray(bandwidth, dtype=float))
    kde = KDEMultivariate(x, bw=bw, var_type='c', **kwargs)
    return kde.pdf(x_grid)
def plot_density_panel(chains, names=None, hist_on=False, figsizeinches=None):
    '''
    Plot marginal posterior densities, one subplot per chain column.

    Args:
        * **chains** (:class:`~numpy.ndarray`): Sampling chain for each parameter
        * **names** (:py:class:`list`): List of strings - name of each parameter
        * **hist_on** (:py:class:`bool`): Flag to include histogram on density plot
        * **figsizeinches** (:py:class:`list`): Specify figure size in inches [Width, Height]

    Returns:
        * Figure handle created by ``plt.figure``.
    '''
    nsimu, nparam = chains.shape  # rows = samples, columns = parameters
    ns1, ns2, names, figsizeinches = setup_plot_features(
        nparam=nparam, names=names, figsizeinches=figsizeinches)
    f = plt.figure(dpi=100, figsize=(figsizeinches))
    for column in range(nparam):
        # Column vector holding the samples of a single parameter.
        samples = chains[:, column].reshape(nsimu, 1)
        grid = make_x_grid(samples)
        density = KDEMultivariate(samples, bw='normal_reference', var_type='c')

        plt.subplot(ns1, ns2, column + 1)
        if hist_on is True:
            hist(samples, density=True)
        plt.plot(grid, density.pdf(grid), 'k')

        label = '$\\pi$({}$|M^{}$)'.format(names[column], '{data}')
        plt.xlabel(names[column])
        plt.ylabel(str(label))
        plt.tight_layout(rect=[0, 0.03, 1, 0.95], h_pad=1.0)
    return f
def velocity_graphs(N0=0, N=4500, vmax=1, resolution=0.05):
    """Save a contour plot of the 2-D velocity density for each frame.

    For every frame ``n`` in ``[N0, N)`` the file ``pdf/VX-<n>.csv`` is read,
    a two-variable KDE is fitted to columns 3 and 4, and its density on a
    regular grid is written to ``v-pdf<n>.png``. Frame times are taken from
    column 1 of ``bulk.csv``.

    :param N0: first frame index to plot.
    :param N: one past the last frame index to plot.
    :param vmax: half-width of the square evaluation grid.
    :param resolution: grid spacing.
    """
    Tdata = np.genfromtxt("bulk.csv", delimiter=' ')
    t = Tdata[:, 1]
    x, y = np.mgrid[-vmax:vmax:resolution, -vmax:vmax:resolution]
    # Every grid node as one (vx, vy) row so the KDE is queried in one call.
    grid_points = np.column_stack((x.ravel(), y.ravel()))

    for n in range(N0, N):
        # Load each frame on demand: frames before N0 are never plotted, so
        # reading and retaining all files up front was wasted work and memory.
        frame = np.genfromtxt("pdf/VX-{0:04d}.csv".format(n), delimiter=' ')
        kde = KDEMultivariate(data=frame[:, 3:5], bw='normal_reference',
                              var_type='cc')
        fig = plt.figure()
        ax = fig.gca()
        fig.subplots_adjust(wspace=0)
        fig.suptitle("Time = {0:.2f} s".format(t[n]), fontsize=7)
        plt.xlabel("$x$-velocity")
        plt.ylabel("$y$-velocity")

        print("Evaluating the function")
        # One batched evaluation replaces the per-cell loop, which assigned a
        # size-1 array into a scalar slot on every iteration.
        pdf = np.asarray(kde.pdf(grid_points)).reshape(x.shape)

        cs = ax.contour(x, y, pdf, label="Simulation", cmap=plt.cm.Paired)
        cs.set_clim(0, 1.6)
        plt.clabel(cs, inline=1, fontsize=5, fmt="%1.1f")
        fig.savefig("v-pdf{0:04d}.png".format(n), bbox_inches='tight', dpi=300)
        plt.close()
def kde_statsmodels_m(x: np.array, x_grid: np.array) -> np.array:
    """Multivariate Kernel Density Estimation with Statsmodels.

    Fits a ``KDEMultivariate`` with ``var_type='u'`` and a ``'cv_ml'``
    bandwidth to ``x`` and returns its density evaluated at ``x_grid``.
    """
    estimator = KDEMultivariate(x, bw='cv_ml', var_type='u')
    density = estimator.pdf(x_grid)
    return density
def kde_entropy_statsmodels(points, n_est=None):
    """
    Estimate entropy from a statsmodels KDEMultivariate density.

    The density is evaluated at the (possibly subsampled) sample points and
    the negative mean log-density is divided by ``LN2``.

    :param points: 2-D array of shape ``(n, d)``.
    :param n_est: number of points to use; ``None`` or any value ``>= n``
        uses the full set, otherwise a random subset of ``n_est`` points.
    :return: entropy estimate.
    """
    from statsmodels.nonparametric.kernel_density import KDEMultivariate

    n, d = points.shape
    use_everything = n_est is None or n_est >= n
    if use_everything:
        sample = points
    else:
        # Random subset of n_est rows.
        sample = points[permutation(n)[:n_est]]

    estimator = KDEMultivariate(data=sample, var_type='c' * d)
    densities = estimator.pdf()
    entropy = -np.mean(log(densities))
    return entropy / LN2
def speed_graphs(N0=0, N=4500, vmax=3, resolution=300):
    """Plot the speed density of each frame against a Maxwell-Boltzmann curve.

    For every frame ``n`` in ``[N0, N)`` the file ``pdf/v-<n>.csv`` is read,
    a one-variable KDE is fitted and plotted together with
    ``maxwell_boltzman_speed`` evaluated with ``kT`` taken from column 2 of
    ``bulk.csv``; the figure is written to ``v-pdf<n>.png``. Frame times come
    from column 1 of ``bulk.csv``.

    :param N0: first frame index to plot.
    :param N: one past the last frame index to plot.
    :param vmax: upper end of the speed axis.
    :param resolution: number of points on the speed axis.
    """
    Tdata = np.genfromtxt("bulk.csv", delimiter=' ')
    T = Tdata[:, 2]
    t = Tdata[:, 1]
    x = np.linspace(0, vmax, resolution)

    for n in range(N0, N):
        # Load each frame on demand; frames before N0 are never plotted, so
        # reading and retaining all N files up front was unnecessary.
        samples = np.genfromtxt("pdf/v-{0:04d}.csv".format(n), delimiter=' ')
        kde = KDEMultivariate(samples, bw='normal_reference', var_type='c')

        fig = plt.figure()
        ax = fig.gca()
        fig.subplots_adjust(wspace=0)
        fig.suptitle("Time = {0:.2f} s".format(t[n]), fontsize=7)
        ax.set_ylim(-0.01, 2.5)
        plt.xlabel("Velocity norm")
        plt.ylabel("PDF")
        ax.plot(x, kde.pdf(x), label="Simulation")
        ax.plot(x, maxwell_boltzman_speed(v=x, m=1, kT=T[n]),
                label="Maxwell-Boltzmann")
        ax.legend(loc='upper right', shadow=True)
        fig.savefig("v-pdf{0:04d}.png".format(n), bbox_inches='tight', dpi=300)
        plt.close()
def calculatePDF(self, tracks):
    """
    Calculate a 2-d probability density surface using kernel density
    estimation.

    All longitudes and latitudes of the given tracks are pooled, a
    two-variable KDE with a ``'cv_ml'`` bandwidth is fitted, and the density
    is evaluated on the ``self.X`` / ``self.Y`` grid.

    :param tracks: Collection of :class:`Track` objects.
    :returns: array with the shape of ``self.X``; all zeros when ``tracks``
        is empty.
    """
    grid_shape = self.X.shape
    if len(tracks) == 0:
        # Nothing to fit.
        return np.zeros(grid_shape)

    lon = np.array([])
    lat = np.array([])
    for track in tracks:
        lon = np.append(lon, track.Longitude)
        lat = np.append(lat, track.Latitude)

    grid_points = np.vstack([self.X.ravel(), self.Y.ravel()])
    observations = np.array([[lon], [lat]])
    estimator = KDEMultivariate(observations, bw='cv_ml', var_type='cc')
    density = estimator.pdf(data_predict=grid_points)
    return density.reshape(grid_shape)
def estimate_kernel_density(
    coordinates,
    variable_types=None,
    bandwidths="cv_ml",
    mins=None,
    maxs=None,
    grid_sizes=None,
):
    """Fit a KDE to ``coordinates`` and evaluate it on a regular mesh grid.

    :param coordinates: sequence with one entry per dimension.
    :param variable_types: ``KDEMultivariate`` ``var_type`` string; defaults
        to all-continuous (``'c'`` per dimension).
    :param bandwidths: ``KDEMultivariate`` ``bw`` argument.
    :param mins: per-dimension lower grid bounds; defaults to the data minima.
    :param maxs: per-dimension upper grid bounds; defaults to the data maxima.
    :param grid_sizes: per-dimension grid sizes; defaults to 64 each.
    :return: density reshaped to ``grid_sizes``.
    """
    n_dimension = len(coordinates)

    if variable_types is None:
        variable_types = n_dimension * "c"

    estimator = KDEMultivariate(
        coordinates, var_type=variable_types, bw=bandwidths
    )

    if mins is None:
        mins = tuple([axis_values.min() for axis_values in coordinates])
    if maxs is None:
        maxs = tuple([axis_values.max() for axis_values in coordinates])
    if grid_sizes is None:
        grid_sizes = (64,) * n_dimension

    mesh = make_mesh_grid_coordinates_per_axis(mins, maxs, grid_sizes)
    density = estimator.pdf(mesh)
    return density.reshape(grid_sizes)
def _kde(sample_no, data):
    """Fit a ``'cv_ml'`` KDE to ``data`` and return its result row.

    ``dist``, ``dist_code`` and ``num_obvs`` are read from the enclosing
    scope. The row carries the Hellinger distance returned by
    ``hellinger_distance`` and the fitting time in seconds.
    """
    started = datetime.now()
    estimator = KDEMultivariate(data, 'c' * data.shape[1], bw='cv_ml')
    fit_seconds = (datetime.now() - started).total_seconds()
    # The second value returned by hellinger_distance is not used here.
    hd, _corr_factor = hellinger_distance(dist, estimator)
    row = (dist_code, num_obvs, sample_no, 'KDE', '', '', '',
           0, 0, 0, num_obvs, 0.0, hd, fit_seconds)
    return row
def kde_statsmodels_m(x, x_grid, bandwidth=0.2, **kwargs):
    """Multivariate Kernel Density Estimation with Statsmodels.

    Fits a single-variable continuous (``var_type='c'``) ``KDEMultivariate``
    to ``x`` with a fixed bandwidth and returns the pdf evaluated at the
    entries of ``x_grid``.

    :param x: training observations.
    :param x_grid: points at which the density is evaluated.
    :param bandwidth: fixed kernel bandwidth.
    :param kwargs: forwarded to ``KDEMultivariate``.
    :return: density values returned by ``KDEMultivariate.pdf``.
    """
    # Imported locally, as in the original; the docstring now comes first so
    # Python actually registers it as the function's docstring.
    from statsmodels.nonparametric.kernel_density import KDEMultivariate

    # A user-specified bandwidth holds one entry per *variable*, not one per
    # observation as ``bandwidth * np.ones_like(x)`` produced.
    bw = np.atleast_1d(np.asarray(bandwidth, dtype=float))
    kde = KDEMultivariate(x, bw=bw, var_type='c', **kwargs)
    return kde.pdf(x_grid)
def data_to_pdf(data, coords):
    """Fit an all-continuous KDE to ``data`` and evaluate it at ``coords``.

    The number of variables is ``data.shape[1]`` when ``data`` has more than
    one axis and 1 otherwise. The bandwidth rule is ``'normal_reference'``.
    """
    is_multicolumn = len(data.shape) > 1
    n_vars = data.shape[1] if is_multicolumn else 1
    estimator = KDEMultivariate(
        data=data, bw='normal_reference', var_type=n_vars * 'c')
    return estimator.pdf(coords)
def kde_statsmodels_m(self, x_grid, bandwidth=0.2, **kwargs):
    """Multivariate Kernel Density Estimation with Statsmodels.

    Fits a single-variable continuous (``var_type='c'``) ``KDEMultivariate``
    to ``self.data`` with a fixed bandwidth and evaluates it on ``x_grid``.

    :param x_grid: points at which the density is evaluated.
    :param bandwidth: fixed kernel bandwidth.
    :param kwargs: forwarded to ``KDEMultivariate``.
    :return: density values returned by ``KDEMultivariate.pdf``.
    """
    from statsmodels.nonparametric.kernel_density import KDEMultivariate

    # The previous code built the bandwidth from ``np.ones_like(x)``, but no
    # ``x`` exists in this method. A user-specified bandwidth needs one entry
    # per variable, so it is built from ``bandwidth`` alone.
    bw = np.atleast_1d(np.asarray(bandwidth, dtype=float))
    kde = KDEMultivariate(self.data, bw=bw, var_type='c', **kwargs)
    return kde.pdf(x_grid)
def bandwidthEstimate(x, y):
    """Return the cross-validated maximum-likelihood bandwidth for (x, y).

    ``x`` and ``y`` are stacked and transposed so each row is one
    observation, then a two-variable continuous KDE is fitted with
    ``bw='cv_ml'`` and its ``bw`` attribute is returned.
    """
    observations = np.array([x, y]).T
    estimator = KDEMultivariate(observations, var_type='cc', bw='cv_ml')
    return estimator.bw
def kde(self):
    """Return the fitted KDE for this object, fitting it on first use.

    The estimator is built from ``self.input_data``, ``self.var_type`` and
    ``self.bw_method``. It is stored on the instance as ``kde`` and its
    ``pdf`` method is stored as ``evaluate_kde``.

    :return: the ``KDEMultivariate`` instance.
    """
    # Look only at instance state. ``hasattr(self, "kde")`` is always true
    # because this method is itself named ``kde``, so the old check returned
    # the bound method and never fitted anything.
    cached = getattr(self, "__dict__", {}).get("kde")
    if cached is not None:
        return cached

    kde = KDEMultivariate(self.input_data, var_type=self.var_type,
                          bw=self.bw_method)
    self.kde = kde
    self.evaluate_kde = kde.pdf
    return kde
def gen():
    """Yield one KDE result per sample index in ``sample_range``.

    ``sample_range``, ``dist_code``, ``num_obvs`` and ``dist`` are read from
    the enclosing scope. Each sample is loaded, fitted with a ``'cv_ml'``
    KDE, timed, and scored with ``hellinger_distance``.
    """
    for ix in sample_range:
        data = read_data(sample_name(dist_code, num_obvs, ix))
        assert data.shape[0] == num_obvs

        started = datetime.now()
        estimator = KDEMultivariate(data, 'c' * data.shape[1], bw='cv_ml')
        fit_seconds = (datetime.now() - started).total_seconds()

        # The second value returned by hellinger_distance is not used here.
        hd, _corr_factor = hellinger_distance(dist, estimator)
        yield result_kde(dist_code, num_obvs, ix, hd, fit_seconds)
def getOriginBandwidth(data):
    """
    Calculate the bandwidth for kernel density estimation from data using
    cross-validated maximum likelihood (``bw='cv_ml'``) on two continuous
    variables.

    :param data: :class:`numpy.ndarray` of data points for training data

    :returns: Bandwidth parameter (the ``bw`` attribute of the fitted KDE).
    """
    estimator = KDEMultivariate(data=data, var_type='cc', bw='cv_ml')
    bandwidth = estimator.bw
    return bandwidth
def kde_xval(bw, args):
    """Return the mean K-fold negative log-likelihood of a fixed-bandwidth KDE.

    :param bw: bandwidth value; it is wrapped in a one-element list for
        ``KDEMultivariate``.
    :param args: mapping with keys ``'x'`` (sample array), ``'n_folds'`` and
        ``'var_type'``.
    :return: mean over folds of the summed negative log-density on the
        held-out rows.
    """
    sample = args['x']
    n_folds = args['n_folds']
    var_type = args['var_type']

    def held_out_loss(train, test):
        # Negative summed log-density of the test rows under the train-fit KDE.
        estimator = KDEMultivariate(sample[train], var_type=var_type, bw=[bw])
        return -1 * np.log(estimator.pdf(sample[test])).sum()

    splitter = KFold(n_splits=n_folds)
    losses = [held_out_loss(train, test)
              for train, test in splitter.split(sample)]
    return np.mean(losses)
def plot_density_panel(chains, names = None, settings = None):
    '''
    Plot marginal posterior densities, one subplot per chain column.

    Args:
        * **chains** (:class:`~numpy.ndarray`): Sampling chain for each parameter
        * **names** (:py:class:`list`): List of strings - name of each parameter
        * **settings** (:py:class:`dict`): Settings for features of this method.

    Returns:
        * (:py:class:`tuple`): (figure handle, settings actually used in program)
    '''
    default_settings = {
        'maxpoints': 500,
        'fig': dict(figsize = (5,4), dpi = 100),
        'kde': dict(bw = 'normal_reference', var_type = 'c'),
        'plot': dict(color = 'k', marker = None, linestyle = '-', linewidth = 3),
        'xlabel': {},
        'ylabel': {},
        'hist_on': False,
        'hist': dict(density = True),
        }
    settings = check_settings(default_settings = default_settings, user_settings = settings)
    nsimu, nparam = chains.shape # number of rows, number of columns
    ns1, ns2 = generate_subplot_grid(nparam)
    names = generate_names(nparam, names)
    f = plt.figure(**settings['fig']) # initialize figure
    for ii in range(nparam):
        # column vector with the samples of parameter ii
        chain = chains[:,ii].reshape(nsimu,1)
        # define x grid
        chain_grid = make_x_grid(chain)
        # Compute kernel density estimate
        kde = KDEMultivariate(chain, **settings['kde'])
        # plot density on subplot
        plt.subplot(ns1,ns2,ii+1)
        if settings['hist_on'] is True: # include histograms
            hist(chain, **settings['hist'])
        plt.plot(chain_grid, kde.pdf(chain_grid), **settings['plot'])
        # format figure
        plt.xlabel(names[ii], **settings['xlabel'])
        # '\\pi' yields the same label text as before; the bare '\p' was an
        # invalid escape sequence that newer Python versions warn about.
        plt.ylabel(str('$\\pi$({}$|M^{}$)'.format(names[ii], '{data}')), **settings['ylabel'])
        plt.tight_layout(rect=[0, 0.03, 1, 0.95],h_pad=1.0) # adjust spacing
    return f, settings
def normal_pdf_box_vs_point():
    """Compare "box" and "point" density estimates with the exact Gaussian.

    Synthetic particles get uniformly distributed positions and normally
    distributed velocities. The x-velocity density is estimated (a) from the
    particles selected by ``box_to_particles`` and (b) by evaluating a 4-D
    KDE over (x, y, z, vx) at the origin and normalising along vx. Both
    curves and the exact normal pdf are saved to ``compare.png``.
    """
    N = 3000
    D = 3
    L = 10
    resolution = 0.01
    vmax = 0.5
    # Must be an int: it is used as a sample count and as an array dimension.
    num_of_interals = int(np.floor(2 * vmax / resolution))
    np.random.seed(1)
    sigma = 0.1
    # Positions are uniformly distributed
    pos = 2 * L * (np.random.rand(N, D) - 0.5)
    # Velocities are normally distributed
    vel = sigma * np.random.randn(N, D)
    data = np.concatenate((pos, vel), axis=1)

    data_box = box_to_particles(data, x=np.array([0, 0, 0]), a=2)
    data_box = data_box[:, 3]
    print("Number of particles in a box {0}".format(data_box.shape[0]))

    vx = np.linspace(-vmax, vmax, num_of_interals)
    pdf_box = data_to_pdf(data_box, vx)

    kde = KDEMultivariate(data=data[:, np.array([0, 1, 2, 3])],
                          bw='normal_reference', var_type='cccc')
    print(kde.bw)

    dl = resolution
    pdf_point = np.zeros((num_of_interals, 1))
    origin = np.array([0, 0, 0])
    # Evaluate p(x=0, y=0, z=0, vx) along vx and normalise it numerically.
    area = 0.0
    for n, v in enumerate(vx):
        # 1-D arrays are joined along axis 0; axis=1 is invalid for them.
        query = np.concatenate((origin, np.array([v])))
        pdf_point[n] = kde.pdf(query)
        area += pdf_point[n, 0]
    area *= dl
    pdf_point /= area

    pdf_true = (norm(0, sigma).pdf(vx))
    fig = plt.figure()
    ax = fig.gca()
    l1, = ax.plot(vx, pdf_point)
    l2, = ax.plot(vx, pdf_box)
    l3, = ax.fill(vx, pdf_true, ec='gray', fc='gray', alpha=0.4)
    plt.legend([l1, l2, l3], ["Point approach", "Box approach", "Gaussian"])
    fig.savefig("compare.png", bbox_inches='tight', dpi=300)
    plt.close()
def fit(self, X: np.ndarray, y: np.ndarray):
    """Fit the "good" and "bad" KDEs from observations sorted by ``y``.

    The rows of ``X`` are ordered by ascending ``y``; the first ``n_good``
    rows form the good set and the rest the bad set. Every column of both
    sets is passed through ``self.process_constants_vector`` before a
    ``KDEMultivariate`` is fitted to each set.

    :param X: 2-D array of observations.
    :param y: values used to rank the rows of ``X``.
    :return: ``self``, or ``None`` when either set has no more rows than
        there are entries in ``n_choices_list`` (no KDE is fitted then).
    """
    super(KDE4BO, self).fit(X, y)
    n_choices_list = self.config_transformer.n_choices_list
    # 'u' where n_choices is positive, 'c' otherwise.
    self.kde_vartypes = "".join(
        "u" if n_choices > 0 else "c" for n_choices in n_choices_list)

    n_samples = X.shape[0]
    n_dims = len(n_choices_list)
    n_good = max(2, (self.top_n_percent * n_samples) // 100)
    if n_good <= n_dims or n_samples - n_good <= n_dims:
        return None

    order = np.argsort(y)
    if self.good_kde is None:
        # No previous fit: start from a constant bandwidth vector.
        good_kde_bw = np.zeros([n_dims]) + 0.1
        bad_kde_bw = deepcopy(good_kde_bw)
    else:
        good_kde_bw = self.good_kde.bw
        bad_kde_bw = self.bad_kde.bw

    X_good = X[order[:n_good]]
    X_bad = X[order[n_good:]]
    for subset, bw_vector in ((X_good, good_kde_bw), (X_bad, bad_kde_bw)):
        for col in range(subset.shape[1]):
            subset[:, col] = self.process_constants_vector(
                subset[:, col], n_choices_list[col], bw_vector[col],
                mode="replace")

    self.good_kde = KDEMultivariate(
        data=X_good, var_type=self.kde_vartypes, bw=self.bw_estimation)
    self.bad_kde = KDEMultivariate(
        data=X_bad, var_type=self.kde_vartypes, bw=self.bw_estimation)
    return self
def kde_statsmodels_m_cdf_output(x, x_grid, bandwidth=0.2, **kwargs):
    """Multivariate Kernel Cumulative Density Estimation with Statsmodels.

    Fits a single-variable continuous ``KDEMultivariate`` to ``x`` with a
    ``'cv_ml'`` bandwidth and evaluates its CDF on the sorted ``x_grid``.
    Warnings raised while fitting and evaluating are suppressed.

    :param x: training observations.
    :param x_grid: points at which the CDF is evaluated; sorted first.
    :param bandwidth: unused; kept for interface compatibility.
    :param kwargs: unused; kept for interface compatibility.
    :return: tuple ``(cdf, bw)`` of the CDF values and the fitted bandwidth.
    """
    # Warnings are silenced only inside this context manager. The previous
    # code also called ``warnings.filterwarnings('default')`` afterwards,
    # which permanently changed the process-wide filters. It also retried in
    # a ``while kde == None`` loop that could spin forever and could reach
    # ``return`` with ``cdf`` unbound.
    with warnings.catch_warnings():
        warnings.filterwarnings('ignore')
        kde = KDEMultivariate(data=x, var_type='c', bw="cv_ml")
        cdf = kde.cdf(sorted(x_grid))
    return cdf, kde.bw
def post_point_stationary_pdf(
        N0=0, N=3, vmax=1, resolution=0.05,
        x=np.array([0, 0, 0])  # position
):
    """
    Save a contour plot of the velocity pdf p(v, x) at a fixed position.

    Frames ``pdf/VX-<n>.csv`` for ``n`` in ``[N0, N0 + N)`` are pooled (frame
    ``N0`` is always read), a 5-variable KDE is fitted to their first five
    columns, and the density at position ``x`` is evaluated on a (vx, vy)
    grid, normalised numerically and written to ``pdfpoint-vxvy.png``.

    :param N0: first frame index.
    :param N: number of frames to pool.
    :param vmax: half-width of the square velocity grid.
    :param resolution: grid spacing.
    :param x: position (three coordinates) at which the density is evaluated.
    """
    frame_ids = [N0] + list(range(N0 + 1, N0 + N))
    # Read everything first and concatenate once instead of re-copying the
    # growing array on every iteration.
    data = np.concatenate(
        [np.atleast_2d(np.genfromtxt("pdf/VX-{0:04d}.csv".format(n),
                                     delimiter=' '))
         for n in frame_ids],
        axis=0)
    print("Number of particles {0}".format(data.shape[0]))

    kde = KDEMultivariate(data=data[:, np.arange(5)],
                          bw='normal_reference', var_type='ccccc')

    vx, vy = np.mgrid[-vmax:vmax:resolution, -vmax:vmax:resolution]
    dA = resolution**2

    # One query row (x, y, z, vx, vy) per grid node. This replaces the
    # per-cell loop, whose axis=1 concatenation is invalid for 1-D arrays.
    velocities = np.column_stack((vx.ravel(), vy.ravel()))
    positions = np.tile(np.asarray(x, dtype=float), (velocities.shape[0], 1))
    pdf = np.asarray(kde.pdf(np.hstack((positions, velocities))))
    pdf = pdf.reshape(vx.shape)

    # Numerical integral of p over the velocity grid, used to normalise it.
    area = pdf.sum() * dA

    save_contour_plot(vx, vy, pdf / area,
                      filename="pdfpoint-vxvy.png",
                      title="Point $f^{(1)}(v^{(1)})$",
                      xlabel="Streamwise velocity",
                      ylabel="Spanwise velocity")
def estimate_cond_pdf(self, x, z, X):
    """Fit the density of ``x``, conditional on ``z`` when ``z`` is non-empty.

    Variable types are looked up in ``self.variable_types``. The bandwidth
    method is ``'cv_ls'`` when any involved variable is continuous (``'c'``)
    and ``'cv_ml'`` otherwise.

    :param x: names of the dependent variables.
    :param z: names of the conditioning variables; may be empty.
    :param X: data container indexed by those names.
    :return: ``KDEMultivariate`` when ``z`` is empty, otherwise
        ``KDEMultivariateConditional``.
    """
    involved_types = [self.variable_types[name] for name in x + z]
    bw = 'cv_ls' if 'c' in involved_types else 'cv_ml'
    dep_type = ''.join([self.variable_types[xi] for xi in x])

    # Conditioning on the empty set: return a plain pdf, not a conditional one.
    if len(z) == 0:
        return KDEMultivariate(X[x], var_type=dep_type, bw=bw,
                               defaults=self.defaults)

    indep_type = ''.join([self.variable_types[zi] for zi in z])
    return KDEMultivariateConditional(endog=X[x], exog=X[z],
                                      dep_type=dep_type,
                                      indep_type=indep_type,
                                      bw=bw, defaults=self.defaults)
def pdf(self, pdf_points, bw=None):
    """
    Compute probability density function at points pdf_points.

    Parameters
    ----------
    pdf_points : 2D array-like
        Points at which to compute the probability density function.
    bw : 1D array-like
        Bandwidths.
        NOTE: if bw=None then bw=self.h.

    Returns
    -------
    pdf : 1D array-like
        Probability density function at points pdf_points.
    """
    # Identity test: ``bw == None`` compares element-wise when bw is a NumPy
    # array, and the truth value of the resulting array is ambiguous.
    if bw is None:
        bw = self.h
    return KDEMultivariate(self.data, 'c' * self.d, bw=bw).pdf(pdf_points)
def kde_statsmodels_m(data, grid, **kwargs):
    """
    Multivariate Kernel Density Estimation with Statsmodels

    Parameters
    ----------
    data : numpy.array
        Data points used to compute a density estimator. It
        has `n x p` dimensions, representing n points and p
        variables.
    grid : numpy.array
        Data points at which the density will be estimated. It
        has `m x p` dimensions, representing m points and p
        variables.
    **kwargs
        Forwarded unchanged to ``KDEMultivariate``.

    Returns
    -------
    out : numpy.array
        Density estimate. Has `m x 1` dimensions
    """
    estimator = KDEMultivariate(data, **kwargs)
    density = estimator.pdf(grid)
    return density
def box_stationary_jointpdf(
        N0=0, N=3, vmax=1, resolution=0.05, a=1.0,
        x1=np.array([0, 0, 0]),  # position
        x2=np.array([0.1, 0.1, 0.1])  # position
):
    """
    Save a contour plot of the joint pdf of velocity pairs from two boxes.

    For each frame ``pdf/VX-<n>.csv`` with ``n`` in ``[N0, N0 + N)``, column
    3 of the particles selected by ``box_to_particles`` around ``x1`` and
    around ``x2`` is taken, and all pairs of the two sets are formed with
    ``cartesian``. The pooled pairs are passed to ``data_to_pdf`` on a
    regular grid and the result is written to ``v-pdf2.png``.

    :param N0: first frame index.
    :param N: number of frames to pool.
    :param vmax: half-width of the square velocity grid.
    :param resolution: grid spacing.
    :param a: passed to ``box_to_particles`` as ``a``.
    :param x1: position of the first box.
    :param x2: position of the second box.
    """
    # Start from an empty float (0, 2) block so the result stays float and a
    # run with no frames still yields a (0, 2) array; this replaces the old
    # "two dummy rows, sliced off later" trick.
    chunks = [np.zeros((0, 2))]
    for n in np.arange(N0, N0 + N):
        data = np.genfromtxt("pdf/VX-{0:04d}.csv".format(n), delimiter=' ')
        data1 = box_to_particles(data, x=x1, a=a)[:, 3]
        data2 = box_to_particles(data, x=x2, a=a)[:, 3]
        chunks.append(cartesian((data1, data2)))
    pairs = np.concatenate(chunks, axis=0)
    print("Number of pairs {0}".format(pairs.shape[0]))

    vx1, vx2 = np.mgrid[-vmax:vmax:resolution, -vmax:vmax:resolution]
    coords = np.vstack([item.ravel() for item in [vx1, vx2]])
    # data_to_pdf fits the KDE itself; the extra KDE previously fitted here
    # was never used and has been removed.
    pdf = data_to_pdf(pairs, coords)

    save_contour_plot(vx1, vx2, pdf.reshape(vx1.shape),
                      filename="v-pdf2.png",
                      title="$f^{(2)}(v^{(1)}_x,v^{(1)}_x)$")
def _calculate(self, tracks):
    """
    Calculate a probability density surface from the first point of each
    track.

    The first longitude/latitude of every non-empty track is collected,
    points outside ``self.gridLimit`` are discarded, and a two-variable KDE
    with a ``'cv_ml'`` bandwidth is evaluated on the ``self.X`` / ``self.Y``
    grid.

    :param tracks: Collection of :class:`Track` objects.
    :returns: transposed density array reshaped to ``self.X.shape``.
    """
    log.debug("Calculating PDF for set of {0:d} tracks".format(
        len(tracks)))

    xy = np.vstack([self.X.ravel(), self.Y.ravel()])

    x = []
    y = []
    for track in tracks:
        if len(track.Longitude) == 0:
            continue
        # Always take the scalar first element. Single-point tracks used to
        # append the whole length-1 sequence, mixing sequences with scalars
        # in the lists converted to arrays below.
        x.append(track.Longitude[0])
        y.append(track.Latitude[0])

    xx = np.array(x)
    yy = np.array(y)
    # Keep only the points that fall inside the grid limits.
    ii = np.where((xx >= self.gridLimit['xMin']) &
                  (xx <= self.gridLimit['xMax']) &
                  (yy >= self.gridLimit['yMin']) &
                  (yy <= self.gridLimit['yMax']))

    values = np.vstack([xx[ii], yy[ii]])
    kernel = KDEMultivariate(values, bw='cv_ml', var_type='cc')
    pdf = kernel.pdf(data_predict=xy)
    Z = np.reshape(pdf, self.X.shape)
    return Z.T
def sm_bw(self, n_max=None, method='cv_ml'):
    """
    Compute optimal bandwidths with the statsmodels package.

    The data are first restricted with ``self._res_data(n_max)``; the fitted
    estimator is kept in ``self.sm_minimisation_res`` and its bandwidths in
    ``self.h``.

    Parameters
    ----------
    n_max : int
        Maximum number of points considered in the computation of AMISE.
        NOTE: Computation time of AMISE and its derivatives is quadratic in
              this number of points.
        NOTE: if n_max=None then n_max=self.points.
        (default: None)
    method : str
        Type of solver.
        (see https://www.statsmodels.org/stable/generated/statsmodels.nonparametric.kernel_density.KDEMultivariate.html)
        (default: 'cv_ml')

    Returns
    -------
    self : active_particles.mkde.MKDE
        MKDE object.
    """
    # Restriction of data
    self._res_data(n_max)

    # Minimisation algorithm
    self.min_method = ('sm', method)
    solver = self.min_method[1]
    estimator = KDEMultivariate(self.res_data, 'c' * self.d, bw=solver)
    self.sm_minimisation_res = estimator
    self.h = estimator.bw  # optimised bandwidths

    return self