def fit(self, X):
    if self._scaling:
        self._scaler = StandardScaler()
        X = self._scaler.fit_transform(X)
    # Fit the KDE on at most the first 512 samples
    X = X[:512]
    self._kde = KernelDensity().fit(X)
    return self
def evaluate_vec(real_points, fake_points, validation_fake_points=None):
    """Compute the average log-likelihood and the Coverage metric.

    The Coverage metric is defined in the arXiv paper. It counts the mass of
    true data covered by the 95% quantile of the model density.
    """
    # Estimating density with KDE; the median distance between consecutive
    # fake points serves as the initial bandwidth
    dist = fake_points[:-1] - fake_points[1:]
    dist = dist * dist
    dist = np.sqrt(np.sum(dist, axis=1))
    bandwidth = np.median(dist)
    num_real = len(real_points)
    num_fake = len(fake_points)
    if validation_fake_points is not None:
        max_score = -1000000.
        num_val = len(validation_fake_points)
        b_grid = bandwidth * (2. ** (np.arange(14) - 7.))
        for _bandwidth in b_grid:
            kde = KernelDensity(kernel='gaussian', bandwidth=_bandwidth)
            kde.fit(np.reshape(fake_points, [num_fake, -1]))
            score = np.mean(kde.score_samples(
                np.reshape(validation_fake_points, [num_val, -1])))
            if score > max_score:
                # logging.debug("Updating bandwidth to %.4f"
                #               " with likelihood %.2f" % (_bandwidth, score))
                bandwidth = _bandwidth
                max_score = score
    kde = KernelDensity(kernel='gaussian', bandwidth=bandwidth)
    kde.fit(np.reshape(fake_points, [num_fake, -1]))
    # Computing Coverage, refer to Section 4.3 of the arXiv paper
    model_log_density = kde.score_samples(
        np.reshape(fake_points, [num_fake, -1]))
    # np.percentile(a, 10) returns t s.t. np.mean(a <= t) = 0.1
    threshold = np.percentile(model_log_density, 5)
    real_points_log_density = kde.score_samples(
        np.reshape(real_points, [num_real, -1]))
    ratio_not_covered = np.mean(real_points_log_density <= threshold)
    C = 1. - ratio_not_covered
    return C
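# Usage sketch for evaluate_vec (illustrative only; the array shapes and
# sizes below are assumptions, not taken from the original code):
def _demo_evaluate_vec():
    import numpy as np
    rng = np.random.RandomState(0)
    real = rng.randn(500, 2)           # "true" data
    fake = rng.randn(500, 2) * 1.1     # model samples
    fake_val = rng.randn(250, 2)       # held-out samples for bandwidth search
    coverage = evaluate_vec(real, fake, validation_fake_points=fake_val)
    print("Coverage:", coverage)       # near 0.95 when the densities match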
def est_KL(N=5000):
    tar_samps = banana(N)
    param_init_S1_quad = np.array([1.69, -0.827, -0.37, -0.2])
    # Only take the first column since this is for S1
    res_S1_quad = minimize(objective_SAA_quad, param_init_S1_quad,
                           args=(tar_samps[:, 0], 1))
    param_final_S1_quad = res_S1_quad.x
    print('First component of inverse computed')
    param_init_S2_quad = np.array([0.25, 1.5, 1.55, 0.15, -0.7, -0.1, -2.5])
    res_S2_quad = minimize(objective_SAA_quad, param_init_S2_quad,
                           args=(tar_samps, 2))
    param_final_S2_quad = res_S2_quad.x
    print('Second component of inverse computed')
    new_ref_samps = st.norm.rvs(size=(N, 2))
    T1_approx = T1_quad(new_ref_samps[:, 0], param_final_S1_quad)
    print('First component of forward computed')
    # Feed T1_approx into where z1 needs to be in S2
    T2_approx = T2_quad(new_ref_samps[:, 1], T1_approx, param_final_S2_quad)
    print('Second component of forward computed')
    # Now estimate the KL divergence between [T1_approx, T2_approx]
    # and fresh target samples
    T12_kde_approx = np.column_stack((T1_approx, T2_approx))
    # Fit a KDE to samples from the approximate forward map
    kde_approx = KernelDensity(kernel='gaussian').fit(T12_kde_approx)
    log_dens_approx = kde_approx.score_samples(T12_kde_approx)
    # Generate some new target samples
    new_tar_samps = banana(N)
    T12_kde_true = new_tar_samps
    kde_true = KernelDensity(kernel='gaussian').fit(T12_kde_true)
    log_dens_true = kde_true.score_samples(T12_kde_true)
    PI_true = np.exp(log_dens_true)
    PI_approx = np.exp(log_dens_approx)
    S = entropy(PI_approx, PI_true)
    return S
def fit(self, X, y):
    """
    Train the model.

    :param X: An Nx3 array, where the features are
              distance(Angstrom)/10, angle1(rad), angle2(rad).

              The **distance** is the closest distance between the two
              line segments (i.e. coarse-grained elements).

              **angle1** is the angle between the line along the stem
              vector and the line along the shortest connection between
              the two elements. As an angle between two straight lines,
              it is defined between 0 and 90 degrees.

              **angle2** is the angle between the connecting vector
              (pointing from the stem to the loop), projected onto the
              plane normal to the stem direction, and the twist vector
              (location of minor groove) at the point closest to the
              interaction. As an angle between two vectors, it is
              defined between 0 and 180 degrees.
    :param y: An array of length N. 0 means no interaction,
              1 means interaction.
    """
    # Check that X and y have correct shape
    X, y = check_X_y(X, y)
    log.info("Training data has shape %s", X.shape)
    log.info("We have %s known interactions", sum(y))
    if X.shape[1] != 3:
        raise TypeError(
            "Expected exactly 3 features, found {}".format(X.shape[1]))
    if not all(yi in [0, 1] for yi in y):
        raise ValueError("y should only contain the values 0 and 1")
    ame = X[np.where(y)]
    non_ame = X[np.where(y == 0)]
    if self.symmetric:
        ame = self._make_symmetric(ame)
        non_ame = self._make_symmetric(non_ame)
    log.info("Fitting. First positive sample: %s", X[np.where(y)][0])
    self.ame_kde_ = KernelDensity(
        kernel=self.kernel, bandwidth=self.bandwidth).fit(ame).score_samples
    self.non_ame_kde_ = KernelDensity(
        kernel=self.kernel, bandwidth=self.bandwidth).fit(non_ame).score_samples
    self.X_ = X
    self.y_ = y
def computePdfKdeSklearn(self, dataset):
    '''Compute the pdf and its values for elements in the dataset.'''
    bwSklearn = estimate_bandwidth(dataset)
    print("bwSklearn is " + str(bwSklearn))
    kde = KernelDensity(kernel='gaussian', bandwidth=bwSklearn).fit(dataset)
    logPdf = kde.score_samples(dataset)
    pdf = np.exp(logPdf)
    return pdf
def get_kde_pdf(X, bandwidth=2, step=.1, num_samples=200, optimize=False):
    """Return the KDE, its pdf evaluated on a grid, and samples drawn from it."""
    if len(X) == 0:
        return [], np.array([]), []
    if optimize:
        # Select the bandwidth by leave-one-out cross-validation
        bandwidths = 10 ** np.linspace(-1, 1, 10)
        grid = GridSearchCV(KernelDensity(kernel='gaussian'),
                            {'bandwidth': bandwidths},
                            cv=LeaveOneOut())
        grid.fit(X[:, None])
        kde = KernelDensity(
            kernel='gaussian',
            bandwidth=grid.best_params_['bandwidth']).fit(X[:, None])
    else:
        kde = KernelDensity(kernel='gaussian',
                            bandwidth=bandwidth).fit(X[:, None])
    pdf = np.exp(kde.score_samples(np.arange(0, 100, step)[:, None]))
    samples = kde.sample(num_samples)
    return kde, np.array(pdf), samples
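# Usage sketch for get_kde_pdf (illustrative; note that the pdf is evaluated
# on the fixed grid [0, 100), so the sample values below are chosen to fall
# in that range):
def _demo_get_kde_pdf():
    import numpy as np
    rng = np.random.RandomState(0)
    X = np.concatenate([rng.normal(30, 5, 200), rng.normal(70, 5, 200)])
    kde, pdf, samples = get_kde_pdf(X, bandwidth=2)
    print(pdf.shape, samples.shape)    # (1000,) and (200, 1)
    # With optimize=True the bandwidth is chosen by leave-one-out CV instead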
def train(self):
    [parttraindata, validationdata] = datapreparation.splitTraindata(self.trainData)
    # Individual modeling
    kdeModel = KernelDensity(kernel=self.kernel,
                             bandwidth=self.bandwidth).fit(parttraindata)
    # Modeling based on others' data: for each dim, train a model
    otherKdeModel = []
    for i in range(0, len(self.trainDataOfNeighborsDim)):
        aModel = KernelDensity(
            kernel=self.kernel,
            bandwidth=self.bandwidthNeighbor[i]).fit(
                self.trainDataOfNeighborsDim[i])
        otherKdeModel.append(aModel)
    # Mixture modeling
    self.trainedModel = _mixturemodels.FixBwMixtureModels(
        parameters=None, models=[kdeModel] + otherKdeModel)
    em.runEM(validationdata, mixmodels=self.trainedModel)
    print(self.trainedModel.params)
def calculate_kde(points, df_osm_built, df_osm_pois=None, bandwidth=400,
                  X_weights=None, pois_weight=9, log_weight=True):
    """
    Evaluate the probability density function of input geo-localized data
    using Kernel Density Estimation.
    The KDE's bandwidth relates to walkable distances.

    Parameters
    ----------
    points : pandas.Series
        points where the probability density function will be evaluated
    df_osm_built : pandas.DataFrame
        input buildings with column [geometry] containing shapely geometries
    df_osm_pois : pandas.DataFrame
        optional points of interest with column [geometry]
    bandwidth : float
        bandwidth of the kernel, in the units of the input coordinates
    X_weights : pandas.Series
        optional per-building weights; if given, a weighted KDE is used
    pois_weight : float
        weight assigned to each point of interest
    log_weight : bool
        whether to apply a logarithm to the weights

    Returns
    ----------
    pandas.Series
    """
    # X_b: buildings array
    X_b = [[p.x, p.y] for p in df_osm_built.geometry.centroid.values]
    # X_p: points-of-interest array
    if df_osm_pois is None:
        X_p = []
    else:
        X_p = [[p.x, p.y] for p in df_osm_pois.geometry.centroid.values]
    # X: full array
    X = np.array(X_b + X_p)
    # Points where the probability density function will be evaluated
    Y = np.array([[p.x, p.y] for p in points.values])
    if X_weights is not None:
        # Weighted Kernel Density Estimation:
        # buildings' weights + POIs' weight
        X_W = np.concatenate(
            [X_weights.values, np.repeat([pois_weight], len(X_p))])
        if log_weight:
            # Apply logarithm
            X_W = np.log(X_W)
        PDF = WeightedKernelDensityEstimation(X, X_W, bandwidth, Y)
        return pd.Series(PDF / PDF.max())
    else:
        # Kernel Density Estimation with scikit-learn
        kde = KernelDensity(kernel='gaussian', bandwidth=bandwidth).fit(X)
        # Sklearn returns the results in the form log(density)
        PDF = np.exp(kde.score_samples(Y))
        return pd.Series(PDF / PDF.max())
def kernel_density_estimation(predicted_positions):
    l = predicted_positions.shape[0] // 2
    kde = KernelDensity(kernel='gaussian',
                        bandwidth=0.2).fit(predicted_positions.T)
    sc = kde.score_samples(predicted_positions.T)
    # Pick the sample with the highest estimated density
    max_idx = np.argmax(np.exp(sc))
    landmarks = predicted_positions[:, max_idx]
    landmarks = np.reshape(landmarks, (l, 2))
    return landmarks
def __init__(self, space_dim, done_fktn, predict_change=False,
             sample_rejection=False):
    self.input_dim = space_dim + 1
    self.output_dim = self.input_dim - 1
    self.X = None
    self.Y = None
    self.done = done_fktn
    self.type = 'GP'
    self.predict_change = predict_change
    self.sample_rejection = sample_rejection
    self.nb_samples = 500
    # Bandwidth heuristic shrinks with the dimension of the space
    self.kde = KernelDensity(
        bandwidth=10 / (space_dim * np.power(1000, 1 / space_dim)))
def marcenko_pastur_loss(sigma, n_features, n_obs, e_val, bwidth,
                         kernel='gaussian', n_pts=1000):
    """
    Return the loss (sum of squared errors) from the Marcenko-Pastur
    distribution.

    Arguments
    ---------
    sigma : float
        Standard deviation of observations.
    n_features : int
        Number of features.
    n_obs : int
        Number of observations (in time).
    e_val : numpy.ndarray
        Eigenvalues.
    bwidth : float
        Bandwidth value.
    kernel : str
        Kernel used to fit observations.
    n_pts : int
        Number of points to sample the PDF.

    Notes
    -----
    Function adapted from "Machine Learning for Asset Managers",
    Marcos López de Prado (2020).
    """
    # Compute the theoretical PDF
    pdf0 = marcenko_pastur_pdf(n_features, n_obs, sigma, n_pts)
    # Compute the empirical PDF: fit a kernel to the series of eigenvalues
    if len(e_val.shape) == 1:
        e_val = e_val.reshape(-1, 1)
    kde = KernelDensity(kernel=kernel, bandwidth=bwidth).fit(e_val)
    # Evaluate on the same grid as the theoretical PDF
    x = pdf0.index.values
    if len(x.shape) == 1:
        x = x.reshape(-1, 1)
    # Derive the probability of observations
    log_density = kde.score_samples(x)
    pdf1 = pd.Series(np.exp(log_density), index=x.flatten())
    # Return the loss
    loss = np.sum((pdf1 - pdf0) ** 2)
    return loss
def contour_plot(points, x_label, y_label):
    kde = KernelDensity(kernel='epanechnikov', bandwidth=0.01).fit(points)
    x_limits = np.min(points[:, 0]), np.max(points[:, 0])
    y_limits = np.min(points[:, 1]), np.max(points[:, 1])
    x, y = np.meshgrid(np.linspace(*x_limits, 300),
                       np.linspace(*y_limits, 300))
    xy = np.stack([x.ravel(), y.ravel()]).T
    z = kde.score_samples(xy).reshape(x.shape)
    # Contour levels span the top 10 units of log-density
    levels = np.linspace(z.max() - 10., z.max(), 100)
    plt.contourf(x, y, np.exp(z),
                 levels=np.exp(levels), cmap=plt.cm.gist_rainbow)
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.show()
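# Usage sketch for contour_plot (illustrative; with the fixed bandwidth of
# 0.01 the data should live on a comparably small scale):
def _demo_contour_plot():
    import numpy as np
    rng = np.random.RandomState(0)
    points = rng.randn(500, 2) * 0.05  # small spread to suit bandwidth=0.01
    contour_plot(points, 'x', 'y')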
def kde_fit_cv(x, bs=None, cv=10):
    """
    x is [n x p]
    bs is [k x 1] list of bandwidth values to compare
    cv is int, the number of folds to cross-validate
    """
    if bs is None:
        bs = np.linspace(0.1, 1.0, 30)
    grid = GridSearchCV(KernelDensity(), {'bandwidth': bs}, cv=cv)
    grid.fit(x)
    # print(grid.best_params_)
    return grid.best_estimator_, grid.best_params_['bandwidth']
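# Usage sketch for kde_fit_cv (illustrative):
def _demo_kde_fit_cv():
    import numpy as np
    rng = np.random.RandomState(0)
    x = rng.randn(200, 1)
    best_kde, best_bw = kde_fit_cv(x, cv=5)
    print("selected bandwidth:", best_bw)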
def approximateLogLiklihood(x_generated, x_test,
                            searchSpace=np.logspace(-4, 0, 5)):
    x_generated = np.array(x_generated).reshape((len(x_generated), -1))
    x_test = np.array(x_test).reshape((len(x_test), -1))
    # Use grid-search cross-validation to optimize the bandwidth
    params = {'bandwidth': searchSpace}
    grid = GridSearchCV(KernelDensity(), params, n_jobs=4)
    grid.fit(x_generated)
    print(grid.best_params_)
    kde = grid.best_estimator_
    # Mean log-likelihood of the test points under the fitted density
    scores = kde.score_samples(x_test)
    return np.sum(scores) / len(scores)
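# Usage sketch for approximateLogLiklihood (illustrative; any array-likes
# that reshape to [n, d] will do):
def _demo_approximateLogLiklihood():
    import numpy as np
    rng = np.random.RandomState(0)
    x_generated = rng.randn(300, 4)
    x_test = rng.randn(100, 4)
    avg_ll = approximateLogLiklihood(x_generated, x_test)
    print("average log-likelihood:", avg_ll)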
def __init__(self, data, mirror=False, **kwds):
    self.mirror = mirror
    # With **kwds, an empty dict (never None) signals "no options given"
    if not kwds:
        if self.mirror:
            self.kde_object = KernelDensity(kernel='gaussian').fit(
                np.vstack([-data, data]).reshape(-1, 1))
        else:
            self.kde_object = KernelDensity(kernel='gaussian').fit(data)
    else:
        if self.mirror:
            self.kde_object = KernelDensity(**kwds).fit(
                np.vstack([-data.reshape(-1, 1), data.reshape(-1, 1)]))
        else:
            self.kde_object = KernelDensity(**kwds).fit(data.reshape(-1, 1))
    try:
        self.d = data.shape[1]
    except IndexError:
        self.d = 1
    self.n = data.shape[0]
def crossValidation(self, data):
    '''
    Compute the bandwidth by using the cross-validation method.

    Input: data -- a numpy array
    Output: [float]
    '''
    grid = GridSearchCV(KernelDensity(),
                        {'bandwidth': np.linspace(.1, 1.0, 30)},
                        cv=20)
    grid.fit(data)
    return grid.best_params_['bandwidth']
def plot_scatter(X, scale, out_prefix, title, kde=True):
    """Draws a 2D scatter plot (png) of the core and accessory distances

    Also draws contours of the kernel density estimate

    Args:
        X (numpy.array)
            n x 2 array of core and accessory distances for n samples.
        scale (numpy.array)
            Scaling factor from :class:`~PopPUNK.models.BGMMFit`
        out_prefix (str)
            Prefix for output plot file (.png will be appended)
        title (str)
            The title to display above the plot
        kde (bool)
            Whether to draw kernel density estimate contours
            (default = True)
    """
    plt.figure(figsize=(11, 8), dpi=160, facecolor='w', edgecolor='k')
    if kde:
        xx, yy, xy = get_grid(0, 1, 100)
        # KDE estimate
        kde = KernelDensity(bandwidth=0.03, metric='euclidean',
                            kernel='epanechnikov', algorithm='ball_tree')
        kde.fit(X)
        z = np.exp(kde.score_samples(xy))
        z = z.reshape(xx.shape).T
        levels = np.linspace(z.min(), z.max(), 10)
        plt.contour(xx * scale[0], yy * scale[1], z,
                    levels=levels[1:], cmap='plasma')
        scatter_alpha = 1
    else:
        scatter_alpha = 0.1
    plt.scatter(X[:, 0] * scale[0], X[:, 1] * scale[1],
                s=1, alpha=scatter_alpha)
    plt.title(title)
    plt.xlabel('Core distance (' + r'$\pi$' + ')')
    plt.ylabel('Accessory distance (' + r'$a$' + ')')
    plt.savefig(out_prefix + ".png")
    plt.close()
def check_if_events_in_cluster(points, events, event_time,
                               n_selection=n_selection_po,
                               multiprocess=True,
                               event_type='po'):  # pylint: disable=redefined-outer-name
    '''Check if a list of events is in the 4D cluster.'''
    output = {'event_number': [],
              'run_number': [],
              'in_veto_volume': [],
              }
    data_arr_nowall = remove_wall_points_np(data_arr_from_points(points))
    if not data_arr_nowall.shape[0]:
        warn.warn('No points left in cluster after removing wall points',
                  RuntimeWarning)
        for row in events.iterrows():
            output['event_number'].append(row[1].event_number)
            output['run_number'].append(row[1].run_number)
            output['in_veto_volume'].append(False)
        return output
    if events.empty:
        return output
    data_arr_scores = kde_likelihood(data_arr_nowall,
                                     multiprocess=multiprocess,
                                     event_type=event_type)
    # Keep the highest-scoring 1/n_selection fraction of the points
    data_arr_selected = data_arr_scores[-len(data_arr_scores) // n_selection:]
    db = DBSCAN(eps=DBSCAN_radius, min_samples=DBSCAN_samples)\
        .fit(pd.DataFrame(data_arr_selected).values[:, :4])
    data_arr_cluster = np.zeros(data_arr_selected.shape,
                                dtype=[('x', np.double), ('y', np.double),
                                       ('z', np.double), ('t', np.double),
                                       ('score', np.double), ('label', int)])
    data_arr_cluster['x'] = data_arr_selected['x']
    data_arr_cluster['y'] = data_arr_selected['y']
    data_arr_cluster['z'] = data_arr_selected['z']
    data_arr_cluster['t'] = data_arr_selected['t']
    data_arr_cluster['score'] = data_arr_selected['score']
    data_arr_cluster['label'] = db.labels_
    data_arr_df = pd.DataFrame(data_arr_cluster)
    # Discard DBSCAN outliers (label == -1) before the final fit
    data_wo_outliers = data_arr_df.query('label != -1').values[:, :4]
    selected_fit = KernelDensity(kernel='tophat', rtol=kde_rtol,
                                 bandwidth=kernel_radius).fit(data_wo_outliers)
    for row in events.iterrows():
        t = abs(row[1].event_time - event_time) / (2 * timestep)
        # A tophat-kernel score of -inf means the event lies outside the cluster
        score = selected_fit.score([[row[1].x_3d_nn, row[1].y_3d_nn,
                                     row[1].z_3d_nn, t]])
        output['event_number'].append(row[1].event_number)
        output['run_number'].append(row[1].run_number)
        output['in_veto_volume'].append(not score == -np.inf)
    return output
def get_minima(a, bandwidth, isPlot):
    a_low = a.min() - 2
    a_hi = a.max() + 2
    a = a.reshape(-1, 1)
    num_x = int(300 + a_hi - a_low)  # np.linspace requires an integer count
    kde = KernelDensity(kernel='gaussian', bandwidth=bandwidth).fit(a)
    s = np.linspace(a_low, a_hi, num=num_x)
    s = s.reshape(-1, 1)
    e = kde.score_samples(s)
    e = np.exp(e)
    # Fill in low values on -inf for a differentiable function
    # e_min = np.min(e[np.isfinite(e)])
    # e[np.isneginf(e)] = (e_min - 1)
    if isPlot:
        plt.plot(s, e)
        plt.show()
    # Local minima of the density split the array into clusters
    mi = argrelextrema(e, np.less)[0]
    s = s.squeeze()
    minima = s[mi]
    nMinima = len(minima)
    l = []
    if nMinima == 0:
        # No minima found, no split
        l.append(a)
    else:
        l.append(a[a < minima[0]])
        for i in range(0, nMinima - 1):
            l.append(a[(a > minima[i]) & (a < minima[i + 1])])
        l.append(a[a > minima[nMinima - 1]])
    return l
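# Usage sketch for get_minima (illustrative): split a bimodal 1D sample at
# the minima of its estimated density.
def _demo_get_minima():
    import numpy as np
    rng = np.random.RandomState(0)
    a = np.concatenate([rng.normal(0, 1, 200), rng.normal(10, 1, 200)])
    clusters = get_minima(a, bandwidth=1.0, isPlot=False)
    print([len(c) for c in clusters])  # roughly [200, 200]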
def set_args_params(self, args):
    self.args = args
    self.use_index = self.args.use_index
    self.reward_type = self.args.reward_type
    self.ep_length = self.args.ep_length
    self.always_render = self.args.render
    self.use_global_density = self.args.use_global_density
    self.use_extrinsic_reward = self.args.use_extrinsic_reward
    # Separate KDEs for goals and trajectories
    self.kde_goal = KernelDensity(kernel='gaussian',
                                  bandwidth=self.args.goal_bandwidth)
    self.kde_tra = KernelDensity(kernel='gaussian',
                                 bandwidth=self.args.trajectory_bandwidth)
    self.set_observation_space()
    print('use index', self.use_index)
    print('reward_type', self.reward_type)
    print('ep_length', self.ep_length)
    if self.always_render:
        self.viewer = self._get_viewer('human')
        self.viewer._run_speed = 100
def train_KDE_model(train_df, bandwith=KDE_BANDWITH):
    """Train a KDE model based on the coordinates of incidents."""
    kde = KernelDensity(bandwidth=bandwith, metric='haversine',
                        kernel='gaussian', algorithm='ball_tree')
    # The haversine metric expects coordinates in radians
    kde.fit(train_df[['latitude', 'longitude']] * np.pi / 180)
    return kde
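# Usage sketch for train_KDE_model (illustrative; the coordinates and the
# bandwidth below are made up, and scoring also expects lat/lon in radians):
def _demo_train_KDE_model():
    import numpy as np
    import pandas as pd
    rng = np.random.RandomState(0)
    train_df = pd.DataFrame({'latitude': 40.7 + rng.randn(100) * 0.01,
                             'longitude': -74.0 + rng.randn(100) * 0.01})
    kde = train_KDE_model(train_df, bandwith=0.001)
    query = np.array([[40.7, -74.0]]) * np.pi / 180
    print(kde.score_samples(query))    # log-density at the query point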
def weighted_density(coords, gridcoords, weights, bandwidth=0.004, atol=0.01):
    """
    Compute a weighted density estimate

    :param coords: NP-array (N x 3) of coordinates
    :param gridcoords: matrix of coordinates on which the kde will be evaluated
    :param weights: NP-array (N x 1) of values for each point indicated by the coordinates
    :param bandwidth: bandwidth of the Gaussian kernel to be used
    :param atol: absolute tolerance passed to the KDE for faster evaluation
    :return: density estimate (log scores)
    """
    kde = KernelDensity(kernel='gaussian', bandwidth=bandwidth,
                        atol=atol).fit(coords, sample_weight=weights)
    density = kde.score_samples(gridcoords)
    # Return log scores
    return density
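# Usage sketch for weighted_density (illustrative; gridcoords is passed as an
# (M x 3) array of evaluation points):
def _demo_weighted_density():
    import numpy as np
    rng = np.random.RandomState(0)
    coords = rng.rand(100, 3) * 0.1
    weights = rng.rand(100)
    gridcoords = rng.rand(20, 3) * 0.1
    log_density = weighted_density(coords, gridcoords, weights)
    print(np.exp(log_density))         # convert log scores back to densities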
def get_numerical_signature(values, S):
    '''
    Learns a distribution of the values,
    then generates a sample of size S.
    '''
    # Transform data to a numpy column vector
    Xnumpy = np.asarray(values)
    X = Xnumpy.reshape(-1, 1)
    # Learn kernel
    kde = KernelDensity(kernel=C.kd["kernel"],
                        bandwidth=C.kd["bandwidth"]).fit(X)
    sig_v = [kde.sample()[0][0] for x in range(S)]
    return sig_v
def cross_valid_bw(X):
    std_data = np.linalg.norm(np.std(X, axis=0))
    print('bw_CV')
    grid = GridSearchCV(
        KernelDensity(kernel='tophat'),
        {'bandwidth': np.linspace(0.2 * std_data, 1.5 * std_data, 15)},
        cv=20)  # 20-fold cross-validation
    grid.fit(X)
    h_cv = grid.best_params_['bandwidth']
    print('done', std_data, h_cv)
    return h_cv
def calc_kdes(X_train, Y_train, X_valid, bw):
    prob_matrix = np.zeros((2, X_valid.shape[0]))
    for i in range(0, 2):
        X_train_class_i = X_train[Y_train == i, :]
        # Naive-Bayes style: sum the per-feature log-densities for class i
        for j in range(0, FEATS):
            kde = KernelDensity(kernel='gaussian', bandwidth=bw)
            kde.fit(X_train_class_i[:, [j]])
            log_prob = kde.score_samples(X_valid[:, [j]])
            prob_matrix[i] = np.add(prob_matrix[i], log_prob)
    return prob_matrix
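# Usage sketch for calc_kdes (illustrative; FEATS is a module-level constant
# in the original code, set here as an assumption):
def _demo_calc_kdes():
    import numpy as np
    global FEATS
    FEATS = 2  # assumed: number of features per sample
    rng = np.random.RandomState(0)
    X_train = np.vstack([rng.randn(50, 2), rng.randn(50, 2) + 3])
    Y_train = np.array([0] * 50 + [1] * 50)
    X_valid = rng.randn(10, 2) + 3
    prob_matrix = calc_kdes(X_train, Y_train, X_valid, bw=0.5)
    preds = np.argmax(prob_matrix, axis=0)   # class with higher log-density
    print(preds)                             # mostly 1 for this sample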
def fit(self, X, y):
    unique_vals = np.unique(y)
    unique_vals = np.sort(unique_vals)
    if len(unique_vals) == 1:
        # Only one class present: fit a KDE for it, store None for the other
        kde = KernelDensity(kernel=self.kernel, bandwidth=self.bandwidth)
        kde.fit(X[y == unique_vals[0]])
        if unique_vals[0] == 0:
            self.kernels.append(kde)
            self.kernels.append(None)
        else:
            self.kernels.append(None)
            self.kernels.append(kde)
    else:
        assert len(unique_vals) == 2
        kde = KernelDensity(kernel=self.kernel, bandwidth=self.bandwidth)
        kde.fit(X[y == 0])
        self.kernels.append(kde)
        kde = KernelDensity(kernel=self.kernel, bandwidth=self.bandwidth)
        kde.fit(X[y == 1])
        self.kernels.append(kde)
def density_est_kde(ds1, ds2, metric='euclidean'):
    if metric == 'cosine_similarity':
        sep_intra = cosine_similarity(ds1, ds1)
        sep_inter = cosine_similarity(ds1, ds2)
    else:
        sep_intra = pairwise_distances(ds1, ds1, metric=metric)
        sep_inter = pairwise_distances(ds1, ds2, metric=metric)
    sep_intra = sep_intra.flatten()
    sep_inter = sep_inter.flatten()
    # --- intra
    xfit = np.linspace(0, 2, len(sep_intra))
    X = sep_intra[:, np.newaxis]
    Xfit = xfit[:, np.newaxis]
    kde = KernelDensity(bandwidth=0.05)
    kde.fit(X)
    log_dens = kde.score_samples(Xfit)
    density = np.exp(log_dens)
    # density *= 1/np.sum(density)
    plt.plot(xfit, density, '#069af3', lw=2)
    print("--- fitted intra cluster separation")
    # --- inter
    xfit = np.linspace(0, 2, len(sep_inter))
    X = sep_inter[:, np.newaxis]
    Xfit = xfit[:, np.newaxis]
    kde = KernelDensity(bandwidth=0.05)
    kde.fit(X)
    log_dens = kde.score_samples(Xfit)
    density = np.exp(log_dens)
    # density *= 1/np.sum(density)
    plt.plot(xfit, density, '#00B050', lw=2)
    plt.show()
def fitKDE(obs, bWidth=.25, kernel='gaussian', x=None):
    # Fit a kernel to a series of obs, and derive the probability of obs.
    # x is the array of values on which the fitted KDE will be evaluated.
    if len(obs.shape) == 1:
        obs = obs.reshape(-1, 1)
    kde = KernelDensity(kernel=kernel, bandwidth=bWidth).fit(obs)
    if x is None:
        x = np.unique(obs).reshape(-1, 1)
    if len(x.shape) == 1:
        x = x.reshape(-1, 1)
    logProb = kde.score_samples(x)  # log(density)
    pdf = pd.Series(np.exp(logProb), index=x.flatten())
    return pdf
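# Usage sketch for fitKDE (illustrative):
def _demo_fitKDE():
    import numpy as np
    rng = np.random.RandomState(0)
    obs = rng.randn(1000)
    x = np.linspace(-4, 4, 200)
    pdf = fitKDE(obs, bWidth=.25, x=x)
    print(pdf.head())                  # pandas.Series indexed by x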
def build_kde_model(prices):
    min_dcgr = -0.3
    max_dcgr = 0.3
    num_bins = int(1e4)  # np.linspace requires an integer sample count
    bin_width = (max_dcgr - min_dcgr) / num_bins
    possible_cgr = np.linspace(min_dcgr, max_dcgr, num_bins)
    dcgr = contGrowthRate(prices)
    # Silverman's rule-of-thumb bandwidth
    silverman_bw = 1.06 * np.std(dcgr[:, 0]) * len(dcgr[:, 0]) ** (-1 / 5)
    kde = KernelDensity(kernel='gaussian', bandwidth=silverman_bw).fit(dcgr)
    kernel_estimate = np.exp(kde.score_samples(possible_cgr[:, np.newaxis]))
    return kernel_estimate
def parzen(dataset: int, feature: str) -> None:
    b = 0.1
    X = load_false(dataset)
    scaler = StandardScaler(copy=False)
    X_transformed = scaler.fit_transform(X)
    kdex = KernelDensity(kernel='gaussian', bandwidth=b)
    fx = X[feature].values.reshape((-1, 1))
    kdex.fit(fx)
    x_d = np.linspace(min(fx.flatten()) - .5, max(fx.flatten()) + .5, 1000)
    logprob = kdex.score_samples(x_d[:, None])
    Y = load_true(dataset)
    Y_transformed = scaler.transform(Y)
    kdey = KernelDensity(kernel='gaussian', bandwidth=b)
    fy = Y[feature].values.reshape((-1, 1))
    kdey.fit(fy)
    y_d = np.linspace(min(fy.flatten()) - .5, max(fy.flatten()) + .5, 1000)
    logproby = kdey.score_samples(y_d[:, None])
    plt.clf()
    plt.fill_between(x_d, np.exp(logprob), alpha=0.5)
    plt.plot(fx, np.full_like(fx, -0.01), '|k',
             label='Inliers', markeredgewidth=1)
    plt.fill_between(y_d, np.exp(logproby), alpha=0.5)
    plt.plot(fy, np.full_like(fy, -0.01), '.k',
             label='Outliers', markeredgewidth=1)
    d = DATASETS_FALSE[dataset].split("/")[2]
    # plt.title(feature)
    plt.legend(loc='best')
    plt.ylim(-0.02, 1.1)
    # plt.show()
    plt.savefig("results/plots_features/" + d + "/" + feature + '.svg')