def plot_density_panel(chains, names=None, hist_on=False, figsizeinches=None):
    '''
    Draw a panel of marginal posterior densities, one subplot per parameter.

    Every column of `chains` is smoothed with a kernel density estimate
    (``bw='normal_reference'``) and drawn as a black curve on its own axes.

    Args:
        * **chains** (:class:`~numpy.ndarray`): Sampling chain, one column per parameter
        * **names** (:py:class:`list`): Parameter names used for the axis labels
        * **hist_on** (:py:class:`bool`): When exactly `True`, also draw a normalised histogram
        * **figsizeinches** (:py:class:`list`): Figure size in inches [Width, Height]

    Returns:
        * The figure handle that holds the panel
    '''
    n_samples, n_params = chains.shape  # rows are draws, columns are parameters
    n_rows, n_cols, names, figsizeinches = setup_plot_features(
        nparam=n_params, names=names, figsizeinches=figsizeinches)
    fig = plt.figure(dpi=100, figsize=(figsizeinches))
    for col in range(n_params):
        # column vector holding the draws of this parameter
        samples = chains[:, col].reshape(n_samples, 1)
        # grid on which the density curve is evaluated
        grid = make_x_grid(samples)
        estimator = KDEMultivariate(samples, bw='normal_reference', var_type='c')
        plt.subplot(n_rows, n_cols, col + 1)
        if hist_on is True:
            hist(samples, density=True)
        density = estimator.pdf(grid)
        plt.plot(grid, density, 'k')
        # axis labels
        label = names[col]
        plt.xlabel(label)
        plt.ylabel('$\\pi$({}$|M^{}$)'.format(label, '{data}'))
        plt.tight_layout(rect=[0, 0.03, 1, 0.95], h_pad=1.0)  # adjust spacing
    return fig
def kde_statsmodels_m(x: np.array, x_grid: np.array) -> np.array:
    """Evaluate a statsmodels multivariate KDE of `x` on `x_grid`.

    The estimator is built with maximum-likelihood cross-validated bandwidth
    (``bw='cv_ml'``) and ``var_type='u'``; the density at the points of
    `x_grid` is returned.
    """
    estimator = KDEMultivariate(x, bw='cv_ml', var_type='u')
    density = estimator.pdf(x_grid)
    return density
def kde_statsmodels_m(x, x_grid, bandwidth=0.2, **kwargs):
    """Evaluate a fixed-bandwidth statsmodels KDE of `x` on `x_grid`.

    `bandwidth` is broadcast to an array shaped like `x` and handed to
    `KDEMultivariate` together with ``var_type='c'``; any extra keyword
    arguments are forwarded to the estimator unchanged.
    """
    bw_values = np.ones_like(x) * bandwidth
    estimator = KDEMultivariate(x, bw=bw_values, var_type='c', **kwargs)
    density = estimator.pdf(x_grid)
    return density
def calculatePDF(self, tracks):
    """
    Calculate a 2-d probability density surface using kernel density
    estimation.

    :param tracks: Collection of :class:`Track` objects. The `Longitude`
                   and `Latitude` values of every track are pooled into one
                   sample.

    :returns: Array shaped like `self.X` holding the density evaluated at
              the (`self.X`, `self.Y`) grid points; all zeros when `tracks`
              is empty.
    """
    if len(tracks) == 0:
        # No tracks: nothing to estimate, return an empty surface
        return np.zeros(self.X.shape)

    # Pool all coordinates with a single concatenate per axis instead of
    # growing the arrays with np.append inside the loop (which re-copies the
    # accumulated data on every iteration). The empty float seed keeps the
    # result dtype identical to the np.append version.
    seed = [np.array([])]
    lon = np.concatenate(seed + [np.ravel(t.Longitude) for t in tracks])
    lat = np.concatenate(seed + [np.ravel(t.Latitude) for t in tracks])

    xy = np.vstack([self.X.ravel(), self.Y.ravel()])
    data = np.array([[lon], [lat]])

    kde = KDEMultivariate(data, bw='cv_ml', var_type='cc')
    pdf = kde.pdf(data_predict=xy)

    return pdf.reshape(self.X.shape)
def data_to_pdf(data, coords):
    """Fit a continuous KDE to `data` and evaluate it at `coords`.

    The number of variables is the second dimension of `data` when it has
    one, otherwise 1. Bandwidths come from the 'normal_reference' rule.
    """
    n_vars = data.shape[1] if len(data.shape) > 1 else 1
    estimator = KDEMultivariate(
        data=data,
        bw='normal_reference',
        var_type='c' * n_vars,
    )
    return estimator.pdf(coords)
def kde_entropy_statsmodels(points, n_est=None):
    """
    Estimate entropy from a statsmodels KDEMultivariate density.

    The density is evaluated at the sample points themselves. At most
    *n_est* randomly permuted rows of *points* are used; by default all
    rows are used. The mean negative log density is divided by LN2.

    Slow and fails for bimodal, dirichlet; poor for high dimensional MVN.
    """
    from statsmodels.nonparametric.kernel_density import KDEMultivariate

    n_points, n_dims = points.shape

    # Use the full set unless a strictly smaller draw was requested
    if n_est is None or n_est >= n_points:
        sample = points
    else:
        sample = points[permutation(n_points)[:n_est]]

    estimator = KDEMultivariate(data=sample, var_type='c' * n_dims)
    density = estimator.pdf()
    entropy = -np.mean(log(density))
    return entropy / LN2
def kde_entropy_statsmodels(points, n_est=None):
    """
    Entropy estimate based on the statsmodels KDEMultivariate pdf.

    The pdf is evaluated at the (possibly sub-sampled) sample points and the
    negated mean log density is returned after division by LN2. When *n_est*
    is smaller than the number of rows, a random permutation selects the
    rows that are kept.

    Slow and fails for bimodal, dirichlet; poor for high dimensional MVN.
    """
    from statsmodels.nonparametric.kernel_density import KDEMultivariate

    total, dims = points.shape
    wanted = total if n_est is None else n_est

    if wanted < total:
        # reduce the draw to the requested number of rows
        keep = permutation(total)[:wanted]
        x = points[keep]
    else:
        x = points

    predictor = KDEMultivariate(data=x, var_type='c' * dims)
    log_density = log(predictor.pdf())
    return -np.mean(log_density) / LN2
def speed_graphs(N0=0, N=4500, vmax=3, resolution=300):
    """Plot the speed PDF of frames N0..N-1 against the Maxwell-Boltzmann curve.

    For every frame `n` the samples in ``pdf/v-<n>.csv`` are smoothed with a
    kernel density estimate and plotted together with
    `maxwell_boltzman_speed` evaluated at the temperature read from column 2
    of ``bulk.csv``. Each figure is saved as ``v-pdf<n>.png``.

    :param N0: First frame index to plot.
    :param N: One past the last frame index to plot.
    :param vmax: Upper end of the speed axis.
    :param resolution: Number of points on the speed axis.
    """
    Tdata = np.genfromtxt("bulk.csv", delimiter=' ')
    T = Tdata[:, 2]
    t = Tdata[:, 1]
    x = np.linspace(0, vmax, resolution)

    for n in np.arange(N0, N):
        # Load only the frame being plotted; reading every file up front
        # wastes memory and touches frames below N0 that are never used.
        samples = np.genfromtxt("pdf/v-{0:04d}.csv".format(n), delimiter=' ')
        kde = KDEMultivariate(samples, bw='normal_reference', var_type='c')

        fig = plt.figure()
        ax = fig.gca()
        fig.subplots_adjust(wspace=0)
        fig.suptitle("Time = {0:.2f} s".format(t[n]), fontsize=7)
        ax.set_ylim(-0.01, 2.5)
        plt.xlabel("Velocity norm")
        plt.ylabel("PDF")

        ax.plot(x, kde.pdf(x), label="Simulation")
        ax.plot(x, maxwell_boltzman_speed(v=x, m=1, kT=T[n]),
                label="Maxwell-Boltzmann")
        ax.legend(loc='upper right', shadow=True)

        fig.savefig("v-pdf{0:04d}.png".format(n), bbox_inches='tight', dpi=300)
        # close this figure explicitly so figures do not pile up across frames
        plt.close(fig)
def velocity_graphs(N0=0, N=4500, vmax=1, resolution=0.05):
    """Plot contour maps of the joint (x, y) velocity PDF for frames N0..N-1.

    For every frame `n`, columns 3 and 4 of ``pdf/VX-<n>.csv`` are smoothed
    with a two-variable kernel density estimate, evaluated on a regular grid
    spanning ``[-vmax, vmax)`` in both directions, and saved as
    ``v-pdf<n>.png``. Frame times are read from column 1 of ``bulk.csv``.

    :param N0: First frame index to plot.
    :param N: One past the last frame index to plot.
    :param vmax: Half-width of the velocity grid.
    :param resolution: Grid spacing.
    """
    Tdata = np.genfromtxt("bulk.csv", delimiter=' ')
    t = Tdata[:, 1]
    x, y = np.mgrid[-vmax:vmax:resolution, -vmax:vmax:resolution]
    # One row per grid point, in the same (C) order used by reshape below
    grid_points = np.column_stack([x.ravel(), y.ravel()])

    for n in np.arange(N0, N):
        # Load only the frame being plotted; reading every file up front
        # wastes memory and touches frames below N0 that are never used.
        frame = np.genfromtxt("pdf/VX-{0:04d}.csv".format(n), delimiter=' ')
        kde = KDEMultivariate(data=frame[:, 3:5], bw='normal_reference',
                              var_type='cc')

        fig = plt.figure()
        ax = fig.gca()
        fig.subplots_adjust(wspace=0)
        fig.suptitle("Time = {0:.2f} s".format(t[n]), fontsize=7)
        plt.xlabel("$x$-velocity")
        plt.ylabel("$y$-velocity")

        print("Evaluating the function")
        # Evaluate all grid points in one call instead of one call per cell
        pdf = np.asarray(kde.pdf(grid_points)).reshape(x.shape)

        cs = ax.contour(x, y, pdf, label="Simulation", cmap=plt.cm.Paired)
        cs.set_clim(0, 1.6)
        plt.clabel(cs, inline=1, fontsize=5, fmt="%1.1f")

        fig.savefig("v-pdf{0:04d}.png".format(n), bbox_inches='tight', dpi=300)
        plt.close(fig)
def estimate_kernel_density(
    coordinates,
    variable_types=None,
    bandwidths="cv_ml",
    mins=None,
    maxs=None,
    grid_sizes=None,
):
    """Evaluate a multivariate KDE of `coordinates` on a regular mesh grid.

    `coordinates` is a sequence with one entry per dimension. Missing
    arguments fall back to: all-continuous variable types, per-dimension
    data minima/maxima as grid limits, and 64 grid points per dimension.
    The density is returned reshaped to `grid_sizes`.
    """
    n_dimension = len(coordinates)

    var_type = "c" * n_dimension if variable_types is None else variable_types
    estimator = KDEMultivariate(coordinates, var_type=var_type, bw=bandwidths)

    lower = mins
    if lower is None:
        lower = tuple(axis_values.min() for axis_values in coordinates)

    upper = maxs
    if upper is None:
        upper = tuple(axis_values.max() for axis_values in coordinates)

    shape = (64,) * n_dimension if grid_sizes is None else grid_sizes

    mesh = make_mesh_grid_coordinates_per_axis(lower, upper, shape)
    return estimator.pdf(mesh).reshape(shape)
def wind_dir_pressure(year=2013):
    """Plot a 2-D density map of wind direction versus pressure for `year`.

    Wind directions are jittered with Gaussian noise (sigma 12) and NaN
    entries are replaced by uniform draws on [0, 360). A two-variable KDE
    with fixed bandwidths (robust standard deviation divided by 5) is
    evaluated on a 100x100 grid and saved as
    ``Wind_Direction_Pressure_<year>.png``. Matplotlib rc settings are reset
    to defaults afterwards.
    """
    from statsmodels.nonparametric.kernel_density import KDEMultivariate as KDE
    import robust as rb

    # bandwidth is the robust sigma divided by this factor
    sigsamp = 5

    d = get_data(year=year)
    wdir = d["winddir_deg"]
    wdir_rand = wdir + np.random.normal(0, 12, len(wdir))
    bad = np.isnan(wdir_rand)
    wdir_rand[bad] = np.random.uniform(0, 360, np.sum(bad))

    press = d["pressure"]

    dist1 = wdir_rand
    dist2 = press

    sig1 = rb.std(dist1)
    # the direction axis always spans the full circle
    min1 = 0.0
    max1 = 360.0

    sig2 = rb.std(dist2)
    min2 = np.min(dist2)
    max2 = np.max(dist2)

    X, Y = np.mgrid[min1:max1:100j, min2:max2:100j]
    positions = np.vstack([X.ravel(), Y.ravel()])
    values = np.vstack([dist1, dist2])

    kernel = KDE(values, var_type='cc', bw=[sig1/sigsamp, sig2/sigsamp])
    Z = np.reshape(kernel.pdf(positions).T, X.shape)

    aspect = (max1-min1)/(max2-min2) * 8.5/11.0

    plot_params()
    plt.ion()
    plt.figure(5, figsize=(11, 8.5))
    plt.clf()
    ax = plt.subplot(111)
    ax.imshow(np.rot90(Z), cmap=plt.cm.CMRmap_r, aspect=aspect,
              extent=[min1, max1, min2, max2], origin='upper')
    ax.yaxis.labelpad = 12
    ax.set_ylabel('Atmospheric Pressure (in-Hg)', fontsize=fs)
    ax.set_xlabel('Wind Direction (degrees)', fontsize=fs)
    plt.title('Wind Direction and Pressure at Thacher Observatory in '+str(year), fontsize=fs)

    plt.savefig('Wind_Direction_Pressure_'+str(year)+'.png', dpi=300)
    mpl.rcdefaults()

    return
def data_to_pdf(data, coords):
    """Return the KDE density of `data` evaluated at `coords`.

    All variables are treated as continuous; there is one variable per
    column when `data` has more than one dimension, otherwise a single one.
    Bandwidths follow the 'normal_reference' rule.
    """
    if len(data.shape) > 1:
        var_type = 'c' * data.shape[1]
    else:
        var_type = 'c'
    estimator = KDEMultivariate(data=data, bw='normal_reference',
                                var_type=var_type)
    density = estimator.pdf(coords)
    return density
def kde_statsmodels_m(x, x_grid, bandwidth=0.2, **kwargs):
    """Multivariate Kernel Density Estimation with Statsmodels.

    Fits a continuous-variable `KDEMultivariate` to `x` with a fixed
    bandwidth and returns the pdf evaluated at the entries of `x_grid`.

    :param x: Sample data.
    :param x_grid: Points at which the density is evaluated.
    :param bandwidth: Scalar bandwidth, broadcast to an array shaped like `x`.
    :param kwargs: Extra keyword arguments forwarded to `KDEMultivariate`.
    :returns: Density values at `x_grid`.
    """
    # The docstring must come first to be recognised as one; the import for
    # the multivariate KDE therefore follows it.
    from statsmodels.nonparametric.kernel_density import KDEMultivariate

    kde = KDEMultivariate(x, bw=bandwidth * np.ones_like(x),
                          var_type='c', **kwargs)
    return kde.pdf(x_grid)
def kde_statsmodels_m(self, x_grid, bandwidth=0.2, **kwargs):
    """Multivariate Kernel Density Estimation with Statsmodels.

    Fits a continuous-variable `KDEMultivariate` to `self.data` with a fixed
    bandwidth and returns the pdf evaluated at `x_grid`.

    :param x_grid: Points at which the density is evaluated.
    :param bandwidth: Scalar bandwidth, broadcast to an array shaped like
                      `self.data`.
    :param kwargs: Extra keyword arguments forwarded to `KDEMultivariate`.
    :returns: Density values at `x_grid`.
    """
    from statsmodels.nonparametric.kernel_density import KDEMultivariate

    # The bandwidth array is shaped from the fitted data; the previous code
    # referenced an undefined name `x` here.
    bw = bandwidth * np.ones_like(self.data)
    kde = KDEMultivariate(self.data, bw=bw, var_type='c', **kwargs)
    return kde.pdf(x_grid)
class KDE4BO(BaseDensityEstimator):
    """Density estimator built from a "good" and a "bad" KDEMultivariate.

    Observations are split by sorted `y`; one KDE is fitted to the leading
    `n_good` rows and one to the rest. `predict` returns the ratio of the
    two densities.
    """

    def __init__(self, top_n_percent=15, bandwidth_factor=3, min_bandwidth=1e3,
                 bw_estimation="normal_reference", min_points_in_kde=2):
        super().__init__(top_n_percent, bandwidth_factor, min_bandwidth,
                         bw_estimation, min_points_in_kde)
        # both estimators stay None until a fit has enough points
        self.good_kde = None
        self.bad_kde = None

    def fit(self, X: np.ndarray, y: np.ndarray):
        """Fit the good/bad estimators; return None when too few points."""
        super().fit(X, y)
        n_choices_list = self.config_transformer.n_choices_list
        # 'u' (unordered) for dimensions with choices, 'c' (continuous) otherwise
        self.kde_vartypes = "".join(
            "u" if n_choices > 0 else "c" for n_choices in n_choices_list
        )
        n_total = X.shape[0]
        n_dims = len(n_choices_list)
        n_good = max(2, (self.top_n_percent * n_total) // 100)
        # each group must hold more observations than there are dimensions
        if n_good <= n_dims or n_total - n_good <= n_dims:
            return None
        order = np.argsort(y)
        if self.good_kde is None:
            good_bw = np.zeros([n_dims]) + 0.1
            bad_bw = deepcopy(good_bw)
        else:
            good_bw = self.good_kde.bw
            bad_bw = self.bad_kde.bw
        X_good = X[order[:n_good]]
        X_bad = X[order[n_good:]]
        for subset, bw_vector in ((X_good, good_bw), (X_bad, bad_bw)):
            for col in range(subset.shape[1]):
                subset[:, col] = self.process_constants_vector(
                    subset[:, col], n_choices_list[col], bw_vector[col],
                    mode="replace")
        self.good_kde = KDEMultivariate(data=X_good,
                                        var_type=self.kde_vartypes,
                                        bw=self.bw_estimation)
        self.bad_kde = KDEMultivariate(data=X_bad,
                                       var_type=self.kde_vartypes,
                                       bw=self.bw_estimation)
        return self

    def predict(self, X: np.ndarray):
        """Return good-density / bad-density evaluated at `X`."""
        super().predict(X)
        numerator = self.good_kde.pdf(X)
        denominator = self.bad_kde.pdf(X)
        return numerator / denominator
def histogram(self):
    """Write the KDE of the particles' linear speed to ``v-pdf<count>.csv``.

    The norm of every particle's linear velocity is collected from
    `self.sim`, smoothed with a 'normal_reference' kernel density estimate,
    and evaluated on `self.resolution` points between 0 and `self.maxV`.
    Each output row holds a speed and its density, separated by a space.
    """
    x = linspace(0, self.maxV, self.resolution)
    v = [
        pp.getLinearVelocity().norm() for pp in self.sim.getParticleList()
    ]
    kde = KDEMultivariate(v, bw='normal_reference', var_type='c')
    # Evaluate the density once; recomputing it for every output row made
    # the export quadratic in the resolution.
    density = kde.pdf(x)
    with open("v-pdf{0:04d}.csv".format(self.count), 'w') as csvfile:
        writer = csv.writer(csvfile, delimiter=' ')
        for n in range(self.resolution):
            writer.writerow([x[n], density[n]])
def kde_xval(bw, args):
    """Cross-validated negative log-likelihood of a KDE with bandwidth `bw`.

    `args` supplies the sample (``'x'``), the number of folds
    (``'n_folds'``) and the statsmodels variable type (``'var_type'``).
    For every fold a KDE is fitted on the training part and the summed log
    density of the held-out part is negated; the mean over folds is returned.
    """
    sample, n_folds, var_type = args['x'], args['n_folds'], args['var_type']

    def fold_loss(train_idx, test_idx):
        estimator = KDEMultivariate(sample[train_idx], var_type=var_type,
                                    bw=[bw])
        held_out_density = estimator.pdf(sample[test_idx])
        return -1 * np.log(held_out_density).sum()

    splitter = KFold(n_splits=n_folds)
    losses = [fold_loss(train, test) for train, test in splitter.split(sample)]
    return np.mean(losses)
def plot_density_panel(chains, names = None, settings = None):
    '''
    Plot marginal posterior densities

    Args:
        * **chains** (:class:`~numpy.ndarray`): Sampling chain for each parameter
        * **names** (:py:class:`list`): List of strings - name of each parameter
        * **settings** (:py:class:`dict`): Settings for features of this method.
          User values are merged with the defaults below via `check_settings`.

    Returns:
        * (:py:class:`tuple`): (figure handle, settings actually used in program)
    '''
    default_settings = {
        'maxpoints': 500,
        'fig': dict(figsize = (5,4), dpi = 100),
        'kde': dict(bw = 'normal_reference', var_type = 'c'),
        'plot': dict(color = 'k', marker = None, linestyle = '-', linewidth = 3),
        'xlabel': {},
        'ylabel': {},
        'hist_on': False,
        'hist': dict(density = True),
        }
    settings = check_settings(default_settings = default_settings, user_settings = settings)
    nsimu, nparam = chains.shape # number of rows, number of columns
    ns1, ns2 = generate_subplot_grid(nparam)
    names = generate_names(nparam, names)
    f = plt.figure(**settings['fig']) # initialize figure
    for ii in range(nparam):
        # define chain as a column vector
        chain = chains[:,ii].reshape(nsimu,1)
        # define x grid
        chain_grid = make_x_grid(chain)
        # Compute kernel density estimate
        kde = KDEMultivariate(chain, **settings['kde'])
        # plot density on subplot
        plt.subplot(ns1,ns2,ii+1)
        if settings['hist_on'] is True: # include histograms
            hist(chain, **settings['hist'])
        plt.plot(chain_grid, kde.pdf(chain_grid), **settings['plot'])
        # format figure
        plt.xlabel(names[ii], **settings['xlabel'])
        # raw string: '\p' is not a valid escape sequence in a normal string
        # literal; the label text itself is unchanged
        plt.ylabel(str(r'$\pi$({}$|M^{}$)'.format(names[ii], '{data}')), **settings['ylabel'])
        plt.tight_layout(rect=[0, 0.03, 1, 0.95],h_pad=1.0) # adjust spacing
    return f, settings
def normal_pdf_box_vs_point():
    """Compare "box" and "point" estimates of a velocity PDF with the truth.

    Synthetic particles get uniformly distributed positions and normally
    distributed velocities. The x-velocity PDF is estimated (a) from the
    particles selected by `box_to_particles` around the origin and (b) by
    evaluating a 4-variable KDE of (x, y, z, vx) at the origin and
    normalising it numerically. Both are plotted against the exact Gaussian
    and saved to ``compare.png``.
    """
    N = 3000
    D = 3
    L = 10
    resolution = 0.01
    vmax = 0.5
    # must be an integer: it is used as a point count and an array shape
    num_of_intervals = int(np.floor(2 * vmax / resolution))

    np.random.seed(1)
    sigma = 0.1
    # Positions will be uniformly distributed
    pos = 2 * L * (np.random.rand(N, D) - 0.5)
    # Velocities will be normally distributed
    vel = sigma * np.random.randn(N, D)

    data = np.concatenate((pos, vel), axis=1)
    data_box = box_to_particles(data, x=np.array([0, 0, 0]), a=2)
    data_box = data_box[:, 3]
    print("Number of particles in a box {0}".format(data_box.shape[0]))

    vx = np.linspace(-vmax, vmax, num_of_intervals)
    pdf_box = data_to_pdf(data_box, vx)

    kde = KDEMultivariate(data=data[:, np.array([0, 1, 2, 3])],
                          bw='normal_reference', var_type='cccc')
    print(kde.bw)

    dl = resolution
    pdf_point = np.zeros((num_of_intervals, 1))
    origin = np.array([0, 0, 0])
    # Normalise the slice p(x=0, y=0, z=0, vx) numerically over vx
    area = 0
    for n, v in enumerate(vx):
        # both arrays are 1-D, so they are joined along axis 0
        point = np.concatenate((origin, np.array([v])), axis=0)
        pdf_point[n] = kde.pdf(point)
        area += pdf_point[n]
    area *= dl
    pdf_point /= area

    pdf_true = (norm(0, sigma).pdf(vx))

    fig = plt.figure()
    ax = fig.gca()
    l1, = ax.plot(vx, pdf_point)
    l2, = ax.plot(vx, pdf_box)
    l3, = ax.fill(vx, pdf_true, ec='gray', fc='gray', alpha=0.4)
    plt.legend([l1, l2, l3], ["Point approach", "Box approach", "Gaussian"])
    fig.savefig("compare.png", bbox_inches='tight', dpi=300)
    plt.close()
def _compute_joint_kde(self, *nodes, normref=True):
    """Fit a joint continuous KDE over `nodes` and cache it in `self.kdes_joint`.

    With `normref` true the 'normal_reference' bandwidth rule is used;
    otherwise bandwidths are chosen by 'cv_ml' with the efficient estimator
    settings. The elapsed fitting time is printed.
    """
    endog = [self.node_data.info[node]['data'] for node in nodes]
    var_type = 'c' * len(nodes)
    started = time.time()
    if not normref:
        estimator = KDEMultivariate(data=endog, var_type=var_type, bw='cv_ml',
                                    defaults=EstimatorSettings(efficient=True))
    else:
        estimator = KDEMultivariate(data=endog, var_type=var_type,
                                    bw='normal_reference')
    print("Fit joint KDE for %s in %s seconds" % (nodes, time.time() - started))
    self.kdes_joint[nodes] = estimator
def __init__(self, X, causes, effects, admissable_set=None, variable_types=None, expectation=False, density=True):
    """
    We want to calculate the causal effect of X and Y through
    back-door adjustment, P(Y|do(X)) = Sum( P(Y|X,Z)P(Z), Z)
    for some admissable set of control variables, Z.  First we
    calculate the conditional density P(Y|X,Z), then the density
    P(Z).  We find the support of Z so we can properly sum over
    it later.  variable_types are a dictionary with the column name
    pointing to an element of set(['o', 'u', 'c']), for 'ordered',
    'unordered discrete', or 'continuous'.

    When `variable_types` is not given, the types are inferred from `X`.
    `admissable_set` defaults to an empty list. `density` is accepted but
    not used by this constructor.
    """
    # avoid a shared mutable default argument
    if admissable_set is None:
        admissable_set = []
    conditional_density_vars = causes + admissable_set
    self.causes = causes
    self.effects = effects
    self.admissable_set = admissable_set
    self.conditional_density_vars = conditional_density_vars

    if len(X) > 300 or max(len(causes+admissable_set),len(effects+admissable_set)) >= 3:
        self.defaults=EstimatorSettings(n_jobs=4, efficient=True)
    else:
        self.defaults=EstimatorSettings(n_jobs=-1, efficient=False)

    if variable_types:
        self.variable_types = variable_types
    else:
        self.variable_types = self.__infer_variable_types(X)

    # Derive the per-role type strings from the resolved mapping so that the
    # inferred-types path works too (it previously used undefined names and
    # dereferenced the missing `variable_types` argument).
    dep_type = [self.variable_types[var] for var in effects]
    indep_type = [self.variable_types[var] for var in conditional_density_vars]
    density_types = [self.variable_types[var] for var in admissable_set]

    if 'c' not in self.variable_types.values():
        bw = 'cv_ml'
    else:
        bw = 'normal_reference'

    if admissable_set:
        self.density = KDEMultivariate(X[admissable_set],
                              var_type=''.join(density_types),
                              bw=bw,
                              defaults=self.defaults)

    self.conditional_density = KDEMultivariateConditional(endog=X[effects],
                                                     exog=X[conditional_density_vars],
                                                     dep_type=''.join(dep_type),
                                                     indep_type=''.join(indep_type),
                                                     bw=bw,
                                                     defaults=self.defaults)
    if expectation:
        self.conditional_expectation = KernelReg(X[effects].values,
                                             X[conditional_density_vars].values,
                                             ''.join(indep_type),
                                             bw='cv_ls')

    self.support = self.__get_support(X)

    self.discrete_variables = [ variable for variable, var_type in self.variable_types.items() if var_type in ['o', 'u']]
    self.discrete_Z = list(set(self.discrete_variables).intersection(set(admissable_set)))
    self.continuous_variables = [ variable for variable, var_type in self.variable_types.items() if var_type == 'c' ]
    self.continuous_Z = list(set(self.continuous_variables).intersection(set(admissable_set)))
def plot_kde(dist_code, num_obvs, sample_no, **kwargs):
    """
    Fit a 'cv_ml' KDE to one sample and render it to a PNG file.

    :param dist_code: Distribution code; ``'geyser'`` selects the geyser
                      data set, for which `num_obvs` and `sample_no` are
                      overridden from the data
    :param num_obvs: Expected number of observations in the sample
    :param sample_no: Index of the sample to read
    :param kwargs: Must contain ``'contour'``; a truthy value selects the
                   contour renderer
    :return: None
    """
    if dist_code == 'geyser':
        data, inverse = transform(read_geyser())
        num_obvs, sample_no, dist = data.shape[0], 1, 'geyser'
    else:
        dist = dist_from_code(dist_code)
        data = read_data(sample_name(dist_code, num_obvs, sample_no))
        assert data.shape[0] == num_obvs
    estimator = KDEMultivariate(data, 'c' * data.shape[1], bw='cv_ml')
    png_file = png_name(dist_code, num_obvs, sample_no, 'kde')
    if not kwargs['contour']:
        do_plot_kde(estimator, png_file, dist, None)
    else:
        do_kde_contour(estimator, png_file, dist, None)
def _kde(sample_no, data):
    """Fit a 'cv_ml' KDE to `data` and return one result row for it.

    The row holds the enclosing scope's `dist_code` and `num_obvs`, the
    sample number, the label 'KDE', placeholder fields, the Hellinger
    distance to `dist`, and the fitting time in seconds.
    """
    started = datetime.now()
    estimator = KDEMultivariate(data, 'c' * data.shape[1], bw='cv_ml')
    seconds = (datetime.now() - started).total_seconds()
    distance, _corr_factor = hellinger_distance(dist, estimator)
    row = (dist_code, num_obvs, sample_no, 'KDE', '', '', '',
           0, 0, 0, num_obvs, 0.0, distance, seconds)
    return row
def kde_statsmodels_m_cdf_output(x, x_grid, bandwidth=0.2, **kwargs):
    """Multivariate Kernel Cumulative Density Estimation with Statsmodels.

    Fits a continuous `KDEMultivariate` to `x` with 'cv_ml' bandwidth
    selection and evaluates its CDF on the sorted `x_grid`.

    :param x: Sample data.
    :param x_grid: Points at which the CDF is evaluated (sorted internally).
    :param bandwidth: Not used; the bandwidth is chosen by cross validation.
    :param kwargs: Not used.
    :returns: Tuple ``(cdf, bw)`` with the CDF values and the selected
              bandwidth.
    """
    # bw alternatives: "cv_ml", "cv_ls", "normal_reference", np.array([0.23])
    kde = None
    # Retry until an estimator has been constructed. Use an identity test for
    # None rather than `==`.
    while kde is None:
        with warnings.catch_warnings():
            warnings.filterwarnings('ignore')
            try:
                kde = KDEMultivariate(data=x, var_type='c', bw="cv_ml")
                x_grid_sorted = sorted(x_grid)
                cdf = kde.cdf(x_grid_sorted)
            except Warning as e:
                # only reached when a Warning is raised as an exception
                print('error found:', e)
            warnings.filterwarnings('default')
    return cdf, kde.bw
def bandwidthEstimate(x, y):
    """Return the 'cv_ml' bandwidths of a two-variable continuous KDE.

    `x` and `y` are stacked and transposed so that each row is one (x, y)
    observation; the bandwidths are selected by cross-validation maximum
    likelihood.
    """
    observations = np.array([x, y]).T
    estimator = KDEMultivariate(observations, var_type='cc', bw='cv_ml')
    return estimator.bw
def kde(self):
    """Return the KDE of `self.input_data`, fitting it on first use.

    The fitted estimator is stored on the instance as `kde` (shadowing this
    method) and its `pdf` method as `evaluate_kde`; later calls return the
    stored estimator.
    """
    # Look only at the instance's own attributes: hasattr(self, "kde") is
    # always true because this method itself is named `kde`, which made the
    # original return the bound method without ever fitting anything.
    cached = vars(self).get("kde")
    if cached is not None:
        return cached
    estimator = KDEMultivariate(self.input_data,
                                var_type=self.var_type,
                                bw=self.bw_method)
    self.kde = estimator
    self.evaluate_kde = estimator.pdf
    return estimator
def post_point_stationary_pdf(
        N0=0, N=3, vmax=1, resolution=0.05,
        x=np.array([0, 0, 0])  # position
):
    """
    Plot the pdf p(v, x) of the in-plane velocity at the fixed position `x`.

    Frames ``pdf/VX-<n>.csv`` for n = N0 .. N0+N-1 are pooled, a 5-variable
    continuous KDE is fitted to their first five columns, and the density is
    evaluated at `x` for every (vx, vy) grid point. The result is normalised
    numerically and passed to `save_contour_plot`.

    :param N0: Index of the first frame.
    :param N: Number of frames to pool (the first frame is always read).
    :param vmax: Half-width of the velocity grid.
    :param resolution: Velocity grid spacing.
    :param x: Position at which the density is evaluated.
    """
    # The first frame is always loaded; the rest follow. Concatenating once
    # avoids re-copying the accumulated data for every frame.
    frame_ids = [N0] + list(np.arange(N0 + 1, N0 + N))
    data = np.concatenate(
        [np.genfromtxt("pdf/VX-{0:04d}.csv".format(n), delimiter=' ')
         for n in frame_ids],
        axis=0)

    print("Number of particles {0}".format(data.shape[0]))
    kde = KDEMultivariate(data=data[:, np.array([0, 1, 2, 3, 4])],
                          bw='normal_reference', var_type='ccccc')

    vx, vy = np.mgrid[-vmax:vmax:resolution, -vmax:vmax:resolution]
    dA = resolution**2
    nx = vx.shape[0]
    ny = vx.shape[1]
    pdf = np.zeros((nx, ny))
    # Normalise the slice of the joint pdf at position x over (vx, vy)
    area = 0
    for i in range(nx):
        for j in range(ny):
            v = np.array([vx[i, j], vy[i, j]])
            # both arrays are 1-D, so they are joined along axis 0
            pdf[i, j] = kde.pdf(np.concatenate((x, v), axis=0))
            area += pdf[i, j] * dA

    save_contour_plot(vx, vy, pdf / area,
                      filename="pdfpoint-vxvy.png",
                      title="Point $f^{(1)}(v^{(1)})$",
                      xlabel="Streamwise velocity",
                      ylabel="Spanwise velocity")
def gen():
    """Yield one KDE result per sample index in `sample_range`.

    Each sample is read from disk, fitted with a 'cv_ml' KDE (the fit is
    timed), scored with the Hellinger distance to `dist`, and reported via
    `result_kde`.
    """
    for index in sample_range:
        data = read_data(sample_name(dist_code, num_obvs, index))
        assert data.shape[0] == num_obvs
        started = datetime.now()
        estimator = KDEMultivariate(data, 'c' * data.shape[1], bw='cv_ml')
        seconds = (datetime.now() - started).total_seconds()
        distance, _corr_factor = hellinger_distance(dist, estimator)
        yield result_kde(dist_code, num_obvs, index, distance, seconds)
def getOriginBandwidth(data):
    """
    Return the cross-validated bandwidth of a two-variable KDE of `data`.

    :param data: :class:`numpy.ndarray` of data points for training data

    :returns: Bandwidth parameter selected by 'cv_ml' for two continuous
              variables.
    """
    estimator = KDEMultivariate(data=data, var_type='cc', bw='cv_ml')
    bandwidth = estimator.bw
    return bandwidth
def kde_statsmodels_m(data, grid, **kwargs):
    """
    Evaluate a statsmodels multivariate KDE of `data` on `grid`.

    Parameters
    ----------
    data : numpy.array
        Points used to fit the density estimator, with `n x p` dimensions
        (n points, p variables).
    grid : numpy.array
        Points at which the density is evaluated, with `m x p` dimensions
        (m points, p variables).
    **kwargs
        Forwarded unchanged to `KDEMultivariate`.

    Returns
    -------
    out : numpy.array
        Density estimate at the `m` grid points.
    """
    estimator = KDEMultivariate(data, **kwargs)
    density = estimator.pdf(grid)
    return density
def _calculate(self, tracks):
    """
    Calculate a density surface of TC genesis points given a set of tracks.

    The first (longitude, latitude) point of every non-empty track is taken
    as its genesis point. Points inside `self.gridLimit` are smoothed with a
    two-variable 'cv_ml' kernel density estimate, which is evaluated on the
    (`self.X`, `self.Y`) grid.

    :param tracks: Collection of :class:`Track` objects.

    :returns: Transposed density array (the estimate reshaped to
              `self.X.shape`, then transposed).
    """
    log.debug("Calculating PDF for set of {0:d} tracks".format(
        len(tracks)))

    xy = np.vstack([self.X.ravel(), self.Y.ravel()])

    x = []
    y = []

    for track in tracks:
        if len(track.Longitude) == 0:
            continue
        # Always store the scalar first point. Appending the whole array for
        # single-point tracks mixed arrays with scalars in the same list.
        x.append(track.Longitude[0])
        y.append(track.Latitude[0])

    xx = np.array(x)
    yy = np.array(y)
    ii = np.where((xx >= self.gridLimit['xMin']) &
                  (xx <= self.gridLimit['xMax']) &
                  (yy >= self.gridLimit['yMin']) &
                  (yy <= self.gridLimit['yMax']))

    values = np.vstack([xx[ii], yy[ii]])
    kernel = KDEMultivariate(values, bw='cv_ml', var_type='cc')
    pdf = kernel.pdf(data_predict=xy)
    Z = np.reshape(pdf, self.X.shape)
    return Z.T
def fit(self, X: np.ndarray, y: np.ndarray):
    """Fit the good/bad KDEs from observations `X` ranked by `y`.

    Rows are ordered by `y`; the leading `n_good` rows feed `self.good_kde`
    and the remainder `self.bad_kde`. Returns `None` without fitting when
    either group would not hold more rows than there are dimensions,
    otherwise returns `self`.
    """
    super(KDE4BO, self).fit(X, y)
    n_choices_list = self.config_transformer.n_choices_list
    # 'u' (unordered) for dimensions with choices, 'c' (continuous) otherwise
    self.kde_vartypes = "".join(
        "u" if n_choices > 0 else "c" for n_choices in n_choices_list
    )
    n_total = X.shape[0]
    n_dims = len(n_choices_list)
    n_good = max(2, (self.top_n_percent * n_total) // 100)
    if n_good <= n_dims or n_total - n_good <= n_dims:
        return None
    ranking = np.argsort(y)
    if self.good_kde is not None:
        # reuse the bandwidths of the previous fit
        good_bw = self.good_kde.bw
        bad_bw = self.bad_kde.bw
    else:
        good_bw = np.zeros([n_dims]) + 0.1
        bad_bw = deepcopy(good_bw)
    X_good = X[ranking[:n_good]]
    X_bad = X[ranking[n_good:]]
    for subset, bw_vector in ((X_good, good_bw), (X_bad, bad_bw)):
        for col in range(subset.shape[1]):
            subset[:, col] = self.process_constants_vector(
                subset[:, col], n_choices_list[col], bw_vector[col],
                mode="replace")
    self.good_kde = KDEMultivariate(data=X_good, var_type=self.kde_vartypes,
                                    bw=self.bw_estimation)
    self.bad_kde = KDEMultivariate(data=X_bad, var_type=self.kde_vartypes,
                                   bw=self.bw_estimation)
    return self
def pdf(self, pdf_points, bw=None):
    """
    Compute probability density function at points pdf_points.

    Parameters
    ----------
    pdf_points : 2D array-like
        Points at which to compute the probability density function.
    bw : 1D array-like
        Bandwidths. NOTE: if bw=None then bw=self.h.

    Returns
    -------
    pdf : 1D array-like
        Probability density function at points pdf_points.
    """
    # Identity test: `bw == None` compares element-wise when bw is an array
    # and then fails in the `if` with an ambiguous truth value.
    if bw is None:
        bw = self.h
    return KDEMultivariate(self.data, 'c' * self.d, bw=bw).pdf(pdf_points)
def estimate_cond_pdf(self, x, z, X):
    """Estimate the density of `x`, conditioned on `z` when `z` is non-empty.

    Bandwidths are selected with 'cv_ml' when none of the involved variables
    is continuous and with 'cv_ls' otherwise. With an empty `z` an
    unconditional `KDEMultivariate` is returned instead of a conditional one.
    """
    types_of = self.variable_types
    involved_types = [types_of[name] for name in x + z]
    bw = 'cv_ls' if 'c' in involved_types else 'cv_ml'
    x_types = ''.join([types_of[xi] for xi in x])

    # conditioning on the empty set: plain pdf instead of a conditional pdf
    if len(z) == 0:
        return KDEMultivariate(X[x], var_type=x_types, bw=bw,
                               defaults=self.defaults)

    z_types = ''.join([types_of[zi] for zi in z])
    return KDEMultivariateConditional(endog=X[x], exog=X[z],
                                      dep_type=x_types, indep_type=z_types,
                                      bw=bw, defaults=self.defaults)
def box_stationary_jointpdf(
        N0=0, N=3, vmax=1, resolution=0.05,
        a=1.0,
        x1=np.array([0, 0, 0]),  # position
        x2=np.array([0.1, 0.1, 0.1])  # position
):
    """
    Plot the two-point joint pdf p2(v, x) of the x-velocity in two boxes.

    For each frame ``pdf/VX-<n>.csv`` (n = N0 .. N0+N-1) the column-3 values
    of the particles selected by `box_to_particles` around `x1` and `x2` are
    combined into all pairs. The pooled pairs are turned into a density on a
    regular velocity grid by `data_to_pdf` and saved via `save_contour_plot`.

    :param N0: Index of the first frame.
    :param N: Number of frames to pool.
    :param vmax: Half-width of the velocity grid.
    :param resolution: Velocity grid spacing.
    :param a: Box size passed to `box_to_particles`.
    :param x1: Centre of the first box.
    :param x2: Centre of the second box.
    """
    # Start from an empty (0, 2) float block so the result is well defined
    # even when no frame contributes, then concatenate once at the end.
    chunks = [np.zeros((0, 2))]
    for n in np.arange(N0, N0 + N):
        data = np.genfromtxt("pdf/VX-{0:04d}.csv".format(n), delimiter=' ')
        data1 = box_to_particles(data, x=x1, a=a)[:, 3]
        data2 = box_to_particles(data, x=x2, a=a)[:, 3]
        chunks.append(cartesian((data1, data2)))
    pairs = np.concatenate(chunks, axis=0)

    print("Number of pairs {0}".format(pairs.shape[0]))

    vx1, vx2 = np.mgrid[-vmax:vmax:resolution, -vmax:vmax:resolution]
    coords = np.vstack([item.ravel() for item in [vx1, vx2]])
    # data_to_pdf fits the estimator itself, so no separate KDE is built here
    pdf = data_to_pdf(pairs, coords)

    save_contour_plot(vx1, vx2, pdf.reshape(vx1.shape),
                      filename="v-pdf2.png",
                      title="$f^{(2)}(v^{(1)}_x,v^{(1)}_x)$")
def wind_speed_pressure(year=2013,peak=False):
    """Plot a 2-D density map of pressure versus wind speed for `year`.

    Uses the peak wind series (``'windhi'``) when `peak` is true, otherwise
    the average wind series (``'wind'``). Wind speeds are jittered with
    Gaussian noise (sigma 0.5). A two-variable KDE with fixed bandwidths
    (robust standard deviation divided by 5) is evaluated on a 100x100 grid
    whose wind axis runs from 0 to the smaller of median + 3 sigma and the
    data maximum. The figure is saved as ``Wind<tag>_Pressure_<year>.png``
    and matplotlib rc settings are reset to defaults afterwards.
    """
    from statsmodels.nonparametric.kernel_density import KDEMultivariate as KDE
    import robust as rb

    # the wind axis starts at zero and is clipped at median + sigfac * sigma
    min2 = 0
    sigfac = 3
    # bandwidth is the robust sigma divided by this factor
    sigsamp = 5

    d = get_data(year=year)
    if peak:
        wind = d['windhi']
        tag = 'peak'
        word = 'Peak '
    else:
        wind = d["wind"]
        tag = 'ave'
        word = 'Average '

    wind_rand = wind + np.random.normal(0, 0.5, len(wind))
    press = d["pressure"]

    dist1 = press
    dist2 = wind_rand

    sig1 = rb.std(dist1)
    min1 = np.min(dist1)
    max1 = np.max(dist1)

    med2 = np.median(dist2)
    sig2 = rb.std(dist2)
    datamax2 = np.max(dist2)
    max2 = min(med2 + sigfac*sig2, datamax2)

    X, Y = np.mgrid[min1:max1:100j, min2:max2:100j]
    positions = np.vstack([X.ravel(), Y.ravel()])
    values = np.vstack([dist1, dist2])

    kernel = KDE(values, var_type='cc', bw=[sig1/sigsamp, sig2/sigsamp])
    Z = np.reshape(kernel.pdf(positions).T, X.shape)

    aspect = (max1-min1)/(max2-min2) * 8.5/11.0

    plot_params()
    plt.ion()
    plt.figure(5, figsize=(11, 8.5))
    plt.clf()
    ax = plt.subplot(111)
    ax.imshow(np.rot90(Z), cmap=plt.cm.CMRmap_r, aspect=aspect,
              extent=[min1, max1, min2, max2], origin='upper')
    ax.yaxis.labelpad = 12
    ax.set_xlabel('Atmospheric Pressure (in-Hg)', fontsize=fs)
    ax.set_ylabel(word+'Wind Speed (mph)', fontsize=fs)
    plt.title('Wind Speed and Pressure at Thacher Observatory in '+str(year), fontsize=fs)

    plt.savefig('Wind'+tag+'_Pressure_'+str(year)+'.png', dpi=300)
    mpl.rcdefaults()

    return
def kde_statsmodels_m(x, x_grid, bandwidth=0.2, **kwargs):
    """Fixed-bandwidth multivariate KDE using Statsmodels.

    A continuous-variable `KDEMultivariate` is fitted to `x` with
    `bandwidth` broadcast to the shape of `x`; extra keyword arguments go
    to the estimator. Returns the pdf evaluated at `x_grid`.
    """
    options = dict(bw=bandwidth * np.ones_like(x), var_type='c')
    estimator = KDEMultivariate(x, **options, **kwargs)
    return estimator.pdf(x_grid)
def GetKDE_Scatter(self, yax="Defo", xax="Area", positions=None):
    """ The evaluated Gaussian Kernel Density Estimate -> for scatter plots

    Parameters
    ----------
    xax : str
        Identifier for X axis (e.g. "Area", "Area Ratio","Circ",...)
    yax : str
        Identifier for Y axis
    positions : list of points
        The positions where the KDE will be computed. Note that
        the KDE estimate is computed from the the points that
        are set in `self._filter`.

    Returns
    -------
    density : 1d ndarray
        The kernel density evaluated for the filtered data points.
        Results are cached in `self._KDE_Scatter` under a key built from
        the axes, the estimator type, the bandwidths, the filter
        configuration and a hash of `positions`.

    See Also
    --------
    `RTDC_DataSet.ApplyFilter`
    `scipy.stats.gaussian_kde`
    `statsmodels.nonparametric.kernel_density.KDEMultivariate`

    TODO
    ----
    Do not use positions for the hasher. If the plot is filtered
    with marker size, we might end up computing the same KDE for
    the same points over and over again.
    """
    kde_type = self.Configuration["Plotting"]["KDE"].lower()
    # make sure the density is used for only this set of variables
    key = yax+"+"+xax+"_"+kde_type
    if kde_type == "multivariate":
        bwx = self.Configuration["Plotting"]["KDE Multivariate "+xax]
        bwy = self.Configuration["Plotting"]["KDE Multivariate "+yax]
        key += "_bw{}+{}_".format(bwx,bwy)
    elif kde_type == "gaussmix":
        # The mixture branch below derives its component count from the
        # x-axis setting, so it must be read for this estimator as well.
        # NOTE(review): the cache key does not include this value - confirm
        # whether it should.
        bwx = self.Configuration["Plotting"]["KDE Multivariate "+xax]
    # make sure the density is only used for the same set of
    # filters.
    if self.Configuration["Filtering"]["Enable Filters"]:
        key += str(self.Configuration["Filtering"]).strip("{}")

    if positions is not None:
        # compute hash of positions
        hasher = hashlib.sha256()
        hasher.update(positions)
        key += hasher.hexdigest()

    # `in` works on Python 2 and 3; dict.has_key exists on Python 2 only
    if key not in self._KDE_Scatter:
        if self.Configuration["Filtering"]["Enable Filters"]:
            x = getattr(self, dfn.cfgmaprev[xax])[self._filter]
            y = getattr(self, dfn.cfgmaprev[yax])[self._filter]
        else:
            x = getattr(self, dfn.cfgmaprev[xax])
            y = getattr(self, dfn.cfgmaprev[yax])
        input_positions = np.vstack([x.ravel(), y.ravel()])
        # Kernel Density estimation
        if kde_type == "gauss":
            a = time.time()
            estimator = gaussian_kde(input_positions)
            if positions is None:
                positions = input_positions
            density = estimator(positions)
            print("gaussian estimation scatter time: ", time.time()-a)
        elif kde_type == "multivariate":
            a = time.time()
            estimator_ly = KDEMultivariate(data=[x,y],var_type='cc',
                                           bw=[bwx, bwy])
            if positions is None:
                positions = input_positions
            density = estimator_ly.pdf(positions)
            print("multivariate estimation scatter time: ", time.time()-a)
        elif kde_type=="gaussmix":
            # the mixture is fitted and scored on the data points themselves;
            # `positions` is not used in this branch
            if yax=="Defo":
                xy = np.array([x,np.log(y)]).T
            else:
                xy = np.array([x,y]).T
            a = time.time()
            clf = mixture.GMM(n_components=int(np.ceil(bwx)),
                              covariance_type='full',
                              random_state=None, thresh=None,
                              min_covar=0.001, n_iter=100, n_init=2,
                              params='wmc', init_params='wmc')
            clf.fit(xy)
            density = np.exp(clf.score_samples(xy)[0])
            print("gaussian mixture scatter time: ", time.time()-a)
        else:
            raise ValueError("Unknown KDE estimator {}".format(
                                                          kde_type))
        self._KDE_Scatter[key] = density
    # return the (possibly cached) density, as promised by the docstring
    return self._KDE_Scatter[key]
def GetKDE_Contour(self, yax="Defo", xax="Area"):
    """ The evaluated Gaussian Kernel Density Estimate -> for contours

    Parameters
    ----------
    xax : str
        Identifier for X axis (e.g. "Area", "Area Ratio","Circ",...)
        If either axis is None, both are taken from `self.GetPlotAxes()`.
    yax : str
        Identifier for Y axis

    Returns
    -------
    X, Y, Z : coordinates
        The kernel density Z evaluated on a rectangular grid (X,Y).
        Results are cached in `self._KDE_Contour` under a key built from
        the axes, the estimator type, the grid spacing, the bandwidths and
        the filter configuration.

    See Also
    --------
    `scipy.stats.gaussian_kde`
    `statsmodels.nonparametric.kernel_density.KDEMultivariate`
    """
    if xax is None or yax is None:
        xax, yax = self.GetPlotAxes()

    kde_type = self.Configuration["Plotting"]["KDE"].lower()
    # grid spacing along the x and y axes
    deltaarea = self.Configuration["Plotting"]["Contour Accuracy "+xax]
    deltacirc = self.Configuration["Plotting"]["Contour Accuracy "+yax]

    key = yax+"+"+xax+"_"+kde_type+str(deltaarea)+str(deltacirc)

    if kde_type == "multivariate":
        bwx = self.Configuration["Plotting"]["KDE Multivariate "+xax]
        bwy = self.Configuration["Plotting"]["KDE Multivariate "+yax]
        key += "_bw{}+{}_".format(bwx,bwy)
    elif kde_type == "gaussmix":
        # The mixture branch below derives its component count from the
        # x-axis setting, so it must be read for this estimator as well.
        # NOTE(review): the cache key does not include this value - confirm
        # whether it should.
        bwx = self.Configuration["Plotting"]["KDE Multivariate "+xax]

    # make sure the density is only used for the same set of
    # filters.
    if self.Configuration["Filtering"]["Enable Filters"]:
        key += str(self.Configuration["Filtering"]).strip("{}")

    # `in` works on Python 2 and 3; dict.has_key exists on Python 2 only
    if key not in self._KDE_Contour:
        # setup
        if self.Configuration["Filtering"]["Enable Filters"]:
            x = getattr(self, dfn.cfgmaprev[xax])[self._filter]
            y = getattr(self, dfn.cfgmaprev[yax])[self._filter]
        else:
            x = getattr(self, dfn.cfgmaprev[xax])
            y = getattr(self, dfn.cfgmaprev[yax])
        # evaluation grid spanning the data range
        xlin = np.arange(x.min(), x.max(), deltaarea)
        ylin = np.arange(y.min(), y.max(), deltacirc)
        Xmesh,Ymesh = np.meshgrid(xlin,ylin)
        X = Xmesh.ravel()
        Y = Ymesh.ravel()
        if kde_type == "gauss":
            estimator = gaussian_kde([x,y])
            Z = estimator.evaluate([X,Y]).reshape(len(ylin),len(xlin))
        elif kde_type == "multivariate":
            estimator_ly = KDEMultivariate(data=[x,y],var_type='cc',
                                           bw=[bwx, bwy])
            Z = estimator_ly.pdf([X,Y]).reshape(len(ylin),len(xlin))
        elif kde_type=="gaussmix":
            # for "Defo" the y values enter the mixture on a log scale
            if yax=="Defo":
                xy = np.array([x,np.log(y)]).T
                XY = np.array([X,np.log(Y)]).T
            else:
                XY = np.array([X,Y]).T
                xy = np.array([x,y]).T
            clf = mixture.GMM(n_components=int(np.ceil(bwx)),
                              covariance_type='full',
                              random_state=None, thresh=None,
                              min_covar=0.001, n_iter=100, n_init=2,
                              params='wmc', init_params='wmc')
            clf.fit(xy)
            Z = np.exp(clf.score_samples(XY)[0]).reshape(len(ylin),len(xlin))
        else:
            raise ValueError("Unknown KDE estimator {}".format(
                                                          kde_type))
        self._KDE_Contour[key] = (Xmesh,Ymesh,Z)
    return self._KDE_Contour[key]
def hdrboxplot(data, ncomp=2, alpha=None, threshold=0.95, bw=None,
               xdata=None, labels=None, ax=None):
    """
    High Density Region boxplot

    Parameters
    ----------
    data : sequence of ndarrays or 2-D ndarray
        The vectors of functions to create a functional boxplot from.  If a
        sequence of 1-D arrays, these should all be the same size.
        The first axis is the function index, the second axis the one along
        which the function is defined.  So ``data[0, :]`` is the first
        functional curve.
    ncomp : int, optional
        Number of components to use.  If None, returns the as many as the
        smaller of the number of rows or columns in data.
    alpha : list of floats between 0 and 1, optional
        Extra quantile values to compute. Default is None. The sequence
        passed in is not modified.
    threshold : float between 0 and 1, optional
        Percentile threshold value for outliers detection. High value means
        a lower sensitivity to outliers. Default is `0.95`.
    bw: array_like or str, optional
        If an array, it is a fixed user-specified bandwidth. If `None`, set to
        `normal_reference`. If a string, should be one of:

            - normal_reference: normal reference rule of thumb (default)
            - cv_ml: cross validation maximum likelihood
            - cv_ls: cross validation least squares

    xdata : ndarray, optional
        The independent variable for the data. If not given, it is assumed to
        be an array of integers 0..N-1 with N the length of the vectors in
        `data`.
    labels : sequence of scalar or str, optional
        The labels or identifiers of the curves in `data`. If not given,
        outliers are labeled in the plot with array indices.
    ax : Matplotlib AxesSubplot instance, optional
        If given, this subplot is used to plot in instead of a new figure being
        created.

    Returns
    -------
    fig : Matplotlib figure instance
        If `ax` is None, the created figure.  Otherwise the figure to which
        `ax` is connected.
    hdr_res : HdrResults instance
        An `HdrResults` instance with the following attributes:

         - 'median', array. Median curve.
         - 'hdr_50', array. 50% quantile band. [sup, inf] curves
         - 'hdr_90', list of array. 90% quantile band. [sup, inf] curves.
         - 'extra_quantiles', list of array. Extra quantile band.
           [sup, inf] curves.
         - 'outliers', ndarray. Outlier curves.
         - 'outliers_idx', ndarray. Row indices of the outlier curves in
           `data`.

    Notes
    -----
    The median curve is the curve with the highest probability on the reduced
    space of a Principal Component Analysis (PCA).

    Outliers are defined as curves that fall outside the band corresponding
    to the quantile given by `threshold`.

    The non-outlying region is defined as the band made up of all the
    non-outlying curves.

    Behind the scene, the dataset is represented as a matrix. Each line
    corresponding to a 1D curve. This matrix is then decomposed using Principal
    Components Analysis (PCA). This allows to represent the data using a finite
    number of modes, or components. This compression process allows to turn the
    functional representation into a scalar representation of the matrix. In
    other words, you can visualize each curve from its components. Each curve
    is thus a point in this reduced space. With 2 components, this is called a
    bivariate plot (2D plot).

    In this plot, if some points are adjacent (similar components), it means
    that back in the original space, the curves are similar. Then, finding the
    median curve means finding the higher density region (HDR) in the reduced
    space. Moreover, the more you get away from this HDR, the more the curve is
    unlikely to be similar to the other curves.

    Using a kernel smoothing technique, the probability density function (PDF)
    of the multivariate space can be recovered. From this PDF, it is possible
    to compute the density probability linked to the cluster of points and plot
    its contours.

    Finally, using these contours, the different quantiles can be extracted
    along with the median curve and the outliers.

    Steps to produce the HDR boxplot include:

    1. Compute a multivariate kernel density estimation
    2. Compute contour lines for quantiles 90%, 50% and `alpha` %
    3. Plot the bivariate plot
    4. Compute median curve along with quantiles and outliers curves.

    References
    ----------
    [1] R.J. Hyndman and H.L. Shang, "Rainbow Plots, Bagplots, and Boxplots for
        Functional Data", vol. 19, pp. 29-45, 2010.

    Examples
    --------
    Load the El Nino dataset.  Consists of 60 years worth of Pacific Ocean sea
    surface temperature data.

    >>> import numpy as np
    >>> import matplotlib.pyplot as plt
    >>> import statsmodels.api as sm
    >>> data = sm.datasets.elnino.load(as_pandas=False)

    Create a functional boxplot.  We see that the years 1982-83 and 1997-98 are
    outliers; these are the years where El Nino (a climate pattern
    characterized by warming up of the sea surface and higher air pressures)
    occurred with unusual intensity.

    >>> fig = plt.figure()
    >>> ax = fig.add_subplot(111)
    >>> res = sm.graphics.hdrboxplot(data.raw_data[:, 1:],
    ...                              labels=data.raw_data[:, 0].astype(int),
    ...                              ax=ax)

    >>> ax.set_xlabel("Month of the year")
    >>> ax.set_ylabel("Sea surface temperature (C)")
    >>> ax.set_xticks(np.arange(13, step=3) - 1)
    >>> ax.set_xticklabels(["", "Mar", "Jun", "Sep", "Dec"])
    >>> ax.set_xlim([-0.2, 11.2])

    >>> plt.show()

    .. plot:: plots/graphics_functional_hdrboxplot.py

    See Also
    --------
    banddepth, rainbowplot, fboxplot
    """
    fig, ax = utils.create_mpl_ax(ax)

    if labels is None:
        # For use with pandas, get the labels
        if hasattr(data, 'index'):
            labels = data.index
        else:
            labels = np.arange(len(data))

    data = np.asarray(data)
    if xdata is None:
        xdata = np.arange(data.shape[1])

    n_samples, dim = data.shape

    # PCA and bivariate plot
    pca = PCA(data, ncomp=ncomp)
    data_r = pca.factors

    # Create gaussian kernel
    ks_gaussian = KDEMultivariate(data_r, bw=bw,
                                  var_type='c' * data_r.shape[1])

    # Boundaries of the n-variate space
    bounds = np.array([data_r.min(axis=0), data_r.max(axis=0)]).T

    # Compute contour line of pvalue linked to a given probability level
    if alpha is None:
        alpha = [threshold, 0.9, 0.5]
    else:
        # Build a fresh, de-duplicated list instead of extending the
        # caller's sequence in place.
        alpha = list(set(alpha) | {threshold, 0.9, 0.5})
    alpha.sort(reverse=True)

    pdf_r = ks_gaussian.pdf(data_r).flatten()
    # Linear interpolation is np.percentile's default, so the deprecated
    # ``interpolation`` keyword is not passed.
    pvalues = [np.percentile(pdf_r, (1 - level) * 100) for level in alpha]

    # Find median, outliers curves
    if have_de_optim:
        median = differential_evolution(lambda x: - ks_gaussian.pdf(x),
                                        bounds=bounds, maxiter=5).x
    else:
        median = brute(lambda x: - ks_gaussian.pdf(x),
                       ranges=bounds, finish=fmin)

    outliers_idx = np.where(pdf_r < pvalues[alpha.index(threshold)])[0]
    labels_outlier = [labels[i] for i in outliers_idx]
    outliers = data[outliers_idx]

    # Find HDR given some quantiles

    def _band_quantiles(band):
        """Find extreme curves for a quantile band.

        From the `band` of quantiles, the associated PDF extrema values
        are computed. If `min_alpha` is not provided (single quantile value),
        `max_pdf` is set to `1E6` in order not to constrain the problem on high
        values.

        An optimization is performed per component in order to find the min and
        max curves. This is done by comparing the PDF value of a given curve
        with the band PDF.

        Parameters
        ----------
        band : array_like
            alpha values ``(max_alpha, min_alpha)`` ex: ``[0.9, 0.5]``

        Returns
        -------
        band_quantiles : list of 1-D array
            ``(max_quantile, min_quantile)`` (2, n_features)
        """
        min_pdf = pvalues[alpha.index(band[0])]
        try:
            max_pdf = pvalues[alpha.index(band[1])]
        except IndexError:
            # Single-value band: ``band[1]`` does not exist.
            max_pdf = 1E6
        pdf_band = [min_pdf, max_pdf]

        tasks = zip(range(dim),
                    itertools.repeat((pdf_band, pca, bounds, ks_gaussian)))
        pool = Pool()
        try:
            band_quantiles = pool.map(_min_max_band, tasks)
        finally:
            # Release the worker processes even if the mapping fails.
            pool.terminate()
            pool.close()

        return list(zip(*band_quantiles))

    extra_alpha = [level for level in alpha
                   if level != 0.5 and level != 0.9 and level != threshold]
    if extra_alpha:
        extra_quantiles = [curve
                           for level in extra_alpha
                           for curve in _band_quantiles([level])]
    else:
        extra_quantiles = []

    # Inverse transform from n-variate plot to dataset dataset's shape
    median = _inverse_transform(pca, median)[0]
    hdr_90 = _band_quantiles([0.9, 0.5])
    hdr_50 = _band_quantiles([0.5])

    hdr_res = HdrResults({
        "median": median,
        "hdr_50": hdr_50,
        "hdr_90": hdr_90,
        "extra_quantiles": extra_quantiles,
        "outliers": outliers,
        "outliers_idx": outliers_idx
    })

    # Plots
    ax.plot(np.array([xdata] * n_samples).T, data.T,
            c='c', alpha=.1, label=None)
    ax.plot(xdata, median, c='k', label='Median')
    fill_betweens = []
    fill_betweens.append(ax.fill_between(xdata, *hdr_50, color='gray',
                                         alpha=.4, label='50% HDR'))
    fill_betweens.append(ax.fill_between(xdata, *hdr_90, color='gray',
                                         alpha=.3, label='90% HDR'))

    if extra_quantiles:
        ax.plot(np.array([xdata] * len(extra_quantiles)).T,
                np.array(extra_quantiles).T,
                c='y', ls='-.', alpha=.4, label='Extra quantiles')

    if len(outliers) != 0:
        for ii, outlier in enumerate(outliers):
            if labels_outlier is not None:
                label = str(labels_outlier[ii])
            else:
                label = 'Outliers'
            ax.plot(xdata, outlier, ls='--', alpha=0.7, label=label)

    handles, labels = ax.get_legend_handles_labels()

    # Proxy artist for fill_between legend entry
    # See http://matplotlib.org/1.3.1/users/legend_guide.html
    plt = _import_mpl()
    for label, fill_between in zip(['50% HDR', '90% HDR'], fill_betweens):
        p = plt.Rectangle((0, 0), 1, 1,
                          fc=fill_between.get_facecolor()[0])
        handles.append(p)
        labels.append(label)

    by_label = OrderedDict(zip(labels, handles))
    if len(outliers) != 0:
        # With outliers present only the outlier entries stay in the legend.
        by_label.pop('Median')
        by_label.pop('50% HDR')
        by_label.pop('90% HDR')

    ax.legend(by_label.values(), by_label.keys(), loc='best')

    return fig, hdr_res
def kde_m(x, x_grid, bandwidth):
    """Evaluate a continuous statsmodels ``KDEMultivariate`` fit on a grid.

    Parameters
    ----------
    x : array_like
        Training data handed to ``KDEMultivariate``.
    x_grid : array_like
        Points at which the fitted density is evaluated.
    bandwidth : array_like or str
        Passed through unchanged as the estimator's ``bw`` argument.

    Returns
    -------
    The value of ``KDEMultivariate.pdf`` evaluated at ``x_grid``.
    """
    estimator = KDEMultivariate(data=x, var_type='c', bw=bandwidth)
    density = estimator.pdf(x_grid)
    return density
class CausalEffect(object):
    """Back-door adjusted causal effect estimated with kernel densities.

    See ``__init__`` for the meaning of the constructor arguments.
    """

    def __init__(self, X, causes, effects, admissable_set=None,
                 variable_types=None, expectation=False, density=True):
        """
        We want to calculate the causal effect of X and Y through
        back-door adjustment, P(Y|do(X)) = Sum( P(Y|X,Z)P(Z), Z)
        for some admissable set of control variables, Z.  First we
        calculate the conditional density P(Y|X,Z), then the density
        P(Z).  We find the support of Z so we can properly sum over
        it later.  variable_types are a dictionary with the column name
        pointing to an element of set(['o', 'u', 'c']), for 'ordered',
        'unordered discrete', or 'continuous'.

        ``admissable_set`` defaults to no control variables.  A
        ``ValueError`` is raised when ``variable_types`` is not given,
        because automatic type inference is not implemented.
        """
        # Fresh list per instance: avoids sharing one mutable default.
        if admissable_set is None:
            admissable_set = []
        conditional_density_vars = causes + admissable_set
        self.causes = causes
        self.effects = effects
        self.admissable_set = admissable_set
        self.conditional_density_vars = conditional_density_vars

        if variable_types:
            self.variable_types = variable_types
        else:
            self.variable_types = self.__infer_variable_types(X)
        if not self.variable_types:
            # Previously this path failed later with an AttributeError /
            # NameError; fail early with an explicit message instead.
            raise ValueError(
                "variable_types must be provided: automatic inference of "
                "variable types is not implemented.")

        dep_type = [self.variable_types[var] for var in effects]
        indep_type = [self.variable_types[var]
                      for var in conditional_density_vars]
        density_types = [self.variable_types[var] for var in admissable_set]

        if 'c' not in self.variable_types.values():
            bw = 'cv_ml'
        else:
            bw = 'normal_reference'

        if admissable_set:
            self.density = KDEMultivariate(X[admissable_set],
                                           var_type=''.join(density_types),
                                           bw=bw)

        self.conditional_density = KDEMultivariateConditional(
            endog=X[effects],
            exog=X[conditional_density_vars],
            dep_type=''.join(dep_type),
            indep_type=''.join(indep_type),
            bw=bw)
        if expectation:
            self.conditional_expectation = KernelReg(
                X[effects].values,
                X[conditional_density_vars].values,
                ''.join(indep_type),
                bw='cv_ls')

        self.support = self.__get_support(X)

        self.discrete_variables = [
            variable
            for variable, var_type in self.variable_types.items()
            if var_type in ['o', 'u']]
        self.discrete_Z = list(
            set(self.discrete_variables).intersection(set(admissable_set)))
        self.continuous_variables = [
            variable
            for variable, var_type in self.variable_types.items()
            if var_type == 'c']
        self.continuous_Z = list(
            set(self.continuous_variables).intersection(set(admissable_set)))

    def __infer_variable_types(self, X):
        """
        fill this in later.
        """
        pass

    def __get_support(self, X):
        """
        find the smallest cube around which the densities are supported,
        allowing a little flexibility for variables with larger bandwidths.
        """
        modeled_variables = self.effects + self.conditional_density_vars
        data_support = {variable: (X[variable].min(), X[variable].max())
                        for variable in X.columns}
        variable_bandwidths = {
            variable: bw
            for variable, bw in zip(modeled_variables,
                                    self.conditional_density.bw)}
        support = {}
        for variable in modeled_variables:
            if self.variable_types[variable] == 'c':
                # Pad continuous variables by ten bandwidths on each side.
                margin = 10. * variable_bandwidths[variable]
                lower_support = data_support[variable][0] - margin
                upper_support = data_support[variable][1] + margin
                support[variable] = (lower_support, upper_support)
            else:
                support[variable] = data_support[variable]
        return support

    def _discrete_z_ranges(self):
        """Integer ranges (inclusive of the upper bound) for each discrete Z."""
        return [range(int(self.support[variable][0]),
                      int(self.support[variable][1]) + 1)
                for variable in self.discrete_Z]

    def integration_function(self, *args):
        """Integrand P(Y|X,Z) * P(Z) for ``scipy.integrate.nquad``.

        Takes continuous z, discrete z, then x (causes followed by effects).
        """
        names = (self.continuous_Z + self.discrete_Z
                 + self.causes + self.effects)
        data = pd.DataFrame({k: [v] for k, v in zip(names, args)})
        conditional = self.conditional_density.pdf(
            exog_predict=data[self.conditional_density_vars].values[0],
            endog_predict=data[self.effects].values[0])
        density = self.density.pdf(data_predict=data[self.admissable_set])
        return conditional * density

    def expectation_integration_function(self, *args):
        """Integrand E[Y|X,Z] * P(Z) for ``scipy.integrate.nquad``.

        Takes continuous z, discrete z, then the causes.
        """
        names = self.continuous_Z + self.discrete_Z + self.causes
        data = pd.DataFrame({k: [v] for k, v in zip(names, args)})
        conditional = self.conditional_expectation.fit(
            data_predict=data[self.conditional_density_vars].values)[0]
        density = self.density.pdf(data_predict=data[self.admissable_set])
        return conditional * density

    def pdf(self, x):
        """
        Currently, this does the whole sum/integral over the cube support of Z.
        We may be able to improve this by taking into account how the joint
        and conditionals factorize, and/or finding a more efficient support.

        This should be reasonably fast for |Z| <= 2 or 3, and small enough
        discrete variable cardinalities.  It runs in O(n_1 n_2 ... n_k) in the
        cardinality of the discrete variables, |Z_1| = n_1, etc.  It likewise
        runs in O(V^n) for n continuous Z variables.  Factorizing the
        joint/conditional distributions in the sum could linearize the runtime.
        """
        causal_effect = 0.
        x = x[self.causes + self.effects]
        if self.discrete_Z:
            # ``range`` instead of the Python-2-only ``xrange``.
            for z_vals in itertools.product(*self._discrete_z_ranges()):
                z_discrete = pd.DataFrame(
                    {k: [v] for k, v in zip(self.discrete_Z, z_vals)})
                if self.continuous_Z:
                    continuous_Z_ranges = [self.support[variable]
                                           for variable in self.continuous_Z]
                    args = z_discrete.join(x).values[0]
                    causal_effect += nquad(self.integration_function,
                                           continuous_Z_ranges,
                                           args=args)[0]
                else:
                    z_discrete = z_discrete[self.admissable_set]
                    exog_predictors = x.join(z_discrete)[
                        self.conditional_density_vars]
                    conditional = self.conditional_density.pdf(
                        exog_predict=exog_predictors,
                        endog_predict=x[self.effects])
                    density = self.density.pdf(data_predict=z_discrete)
                    causal_effect += conditional * density
            return causal_effect
        elif self.continuous_Z:
            continuous_Z_ranges = [self.support[var]
                                   for var in self.continuous_Z]
            causal_effect, error = nquad(self.integration_function,
                                         continuous_Z_ranges,
                                         args=tuple(x.values[0]))
            return causal_effect
        else:
            return self.conditional_density.pdf(
                exog_predict=x[self.causes],
                endog_predict=x[self.effects])

    def expected_value(self, x):
        """
        Currently, this does the whole sum/integral over the cube support of Z.
        We may be able to improve this by taking into account how the joint
        and conditionals factorize, and/or finding a more efficient support.

        This should be reasonably fast for |Z| <= 2 or 3, and small enough
        discrete variable cardinalities.  It runs in O(n_1 n_2 ... n_k) in the
        cardinality of the discrete variables, |Z_1| = n_1, etc.  It likewise
        runs in O(V^n) for n continuous Z variables.  Factorizing the
        joint/conditional distributions in the sum could linearize the runtime.
        """
        causal_effect = 0.
        x = x[self.causes]
        if self.discrete_Z:
            # ``range`` instead of the Python-2-only ``xrange``.
            for z_vals in itertools.product(*self._discrete_z_ranges()):
                z_discrete = pd.DataFrame(
                    {k: [v] for k, v in zip(self.discrete_Z, z_vals)})
                if self.continuous_Z:
                    continuous_Z_ranges = [self.support[variable]
                                           for variable in self.continuous_Z]
                    args = z_discrete.join(x).values[0]
                    causal_effect += nquad(
                        self.expectation_integration_function,
                        continuous_Z_ranges,
                        args=args)[0]
                else:
                    z_discrete = z_discrete[self.admissable_set]
                    exog_predictors = x.join(z_discrete)[
                        self.conditional_density_vars]
                    expectation = self.conditional_expectation.fit(
                        data_predict=exog_predictors.values)[0]
                    density = self.density.pdf(
                        data_predict=z_discrete.values)
                    causal_effect += expectation * density
            return causal_effect
        elif self.continuous_Z:
            continuous_Z_ranges = [self.support[var]
                                   for var in self.continuous_Z]
            causal_effect, error = nquad(
                self.expectation_integration_function,
                continuous_Z_ranges,
                args=tuple(x.values[0]))
            return causal_effect
        else:
            return self.conditional_expectation.fit(
                data_predict=x[self.causes])[0]
def kde_statsmodels_m(x, x_grid, bandwidth=0.2, **kwargs):
    """Multivariate Kernel Density Estimation with Statsmodels, use with heterogeneous data

    Parameters
    ----------
    x : array_like
        Training data for a single continuous variable (``var_type='c'``).
    x_grid : array_like
        Points at which the fitted density is evaluated.
    bandwidth : float, optional
        Fixed kernel bandwidth. Default is 0.2.
    **kwargs
        Forwarded unchanged to ``KDEMultivariate``.

    Returns
    -------
    The value of ``KDEMultivariate.pdf`` evaluated at ``x_grid``.
    """
    # A user-specified ``bw`` needs one entry per variable, not one per
    # observation. With ``var_type='c'`` there is exactly one variable, so
    # the bandwidth vector has length one. Taking the first element also
    # keeps array-valued ``bandwidth`` inputs working: only the leading
    # entry was ever used for a single variable.
    bw = np.ravel(np.asarray(bandwidth, dtype=float))[:1]
    kde = KDEMultivariate(x, bw=bw, var_type='c', **kwargs)
    return kde.pdf(x_grid)