# Shared imports for the KDE snippets collected below. Each snippet originally
# came from a separate project; project-specific helpers (BaseClassifier,
# ItEstimator, get_grid, general_roc, getgridcoords, vals, node2vec, FEATS,
# allowed_kernels, KDE_BANDWITH, num_particles, ...) are not reproduced here.
import bisect
import logging
import os
import pickle
from time import time

import numpy as np
import pandas as pd
import torch
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.base import BaseEstimator
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.neighbors import KernelDensity
from sklearn.tree import DecisionTreeClassifier, export_graphviz


def kde_labeler(picks):
    if isinstance(picks, torch.Tensor):
        picks = picks.clone().cpu().data.numpy().astype(int)
    nums = np.arange(0, 101).reshape(-1, 1)
    picks = picks.reshape(-1, 1)
    lower = np.percentile(picks, 25)
    upper = np.percentile(picks, 75)
    IQR = upper - lower
    std = picks.std()
    # Guard against degenerate spreads before applying Silverman's rule.
    if std < 0.5:
        std = 1.0
        IQR = 1.0
    if IQR < 0.1:
        IQR = 0.1
    # Silverman's rule of thumb: 0.9 * min(std, IQR / 1.349) * n^(-1/5)
    m = min(std, IQR / 1.349)
    bandwidth = 0.9 * float(m) / float(len(picks)) ** 0.2
    if bandwidth > 5:
        # TODO: handle this without a print statement; maybe set a warning flag.
        print(f"Bandwidth too high! m: {m} std: {std} IQR: {IQR} bandwidth: {bandwidth}")
    kde = KernelDensity(kernel='gaussian', bandwidth=bandwidth)
    kde.fit(picks)
    log_dens = kde.score_samples(nums)
    label = np.exp(log_dens)
    label = label / label.sum()
    return label
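# Usage sketch for kde_labeler (hypothetical data, not from the original
# project): a handful of integer picks in [0, 100] is smoothed into a soft
# label over the integers 0..100 that sums to 1.
example_picks = np.array([40, 41, 42, 43, 44, 45])
soft_label = kde_labeler(example_picks)
assert soft_label.shape == (101,)
assert abs(soft_label.sum() - 1.0) < 1e-6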
def kernel_fit_single(data, bw=None, min_size=20, kern='gaussian'):
    """Gaussian kernel fit to 1D data."""
    res = np.histogram(data.ravel(), bins='sqrt', density=True)
    std_data = data.std()
    if bw is None:
        bw = (data.ravel().shape[0] * (std_data + 2) / 4.)**(-1. / (std_data + 4))
    N_bins = res[1].shape[0]
    if N_bins < min_size:
        extra = 0.2  # extend the grid by plus or minus 20%
    else:
        extra = 0.0
    x_grid = np.linspace(res[1][0] - extra * abs(res[1][0]),
                         res[1][-1] + extra * abs(res[1][-1]),
                         N_bins)
    kde = KernelDensity(bandwidth=bw, kernel=kern)
    kde.fit(data.ravel()[:, None])
    pdf = np.exp(kde.score_samples(x_grid[:, None]))
    return pdf, x_grid
def calculateKernelDensity(args):
    try:
        # Unpack the input parameters.
        frame, XY, kernel, bandwidth, positions, Xgrid, Ygrid, extend = args

        # Compute the kernel density.
        kdf = KernelDensity(kernel=kernel, bandwidth=float(bandwidth),
                            algorithm='kd_tree')
        kdf.fit(XY)

        # Evaluate the kernel on the grid.
        Z = kdf.score_samples(positions)
        Z = Z.reshape(Xgrid.shape)  # put the result back into the grid shape
        # Z = remap0to1(Z)  # map array to [0, 1]
    except Exception:
        # For debugging purposes it helps to first trigger a NoneType error
        # outside the multiprocessing part: if an error occurs inside the
        # multiprocessing, the worker never finishes and no traceback is
        # printed (it appears as if the process were still running).
        # raise
        frame, kdf, Z, Xgrid, Ygrid, extend = None, None, None, None, None, None
    return [frame, (kdf, Z, Xgrid, Ygrid, extend)]
def kde_sklearn(x, x_grid, bandwidth):
    """Kernel density estimation with scikit-learn."""
    kde_skl = KernelDensity(kernel='gaussian', bandwidth=bandwidth)
    kde_skl.fit(x)
    # score_samples() returns the log-likelihood of the samples,
    # so exponentiate to get the density.
    pdf = np.exp(kde_skl.score_samples(x_grid))
    return pdf
def plot_kde(obj, lo, hi, true, test):
    obj_plot = np.linspace(lo, hi, 10000)[:, np.newaxis]
    # Bandwidth estimated by Silverman's rule of thumb: 1.06 * std * n^(-1/5).
    avg_std = np.mean(np.std(obj, axis=0))
    bandwidth = 1.06 * avg_std * len(obj)**-0.2
    plt.figure()
    for i in range(obj.shape[1]):
        a = obj[:, i][:, np.newaxis]
        kde = KernelDensity(bandwidth=bandwidth, kernel='gaussian',
                            algorithm='ball_tree')
        kde.fit(a)
        log_dens = kde.score_samples(obj_plot)
        plt.plot(obj_plot, np.exp(log_dens))
    plt.axvline(np.mean(obj), color='red', label='Mean of all predictions')
    plt.axvline(true, label='True value', linestyle='dashdot', color='black',
                linewidth=2)
    plt.ylabel('PDF')
    plt.xlabel('Cycle')
    plt.tight_layout()
    plt.legend()
class LeaveOneOutEntropyEstimator(ItEstimator):
    """
    Leave-one-out cross-validation entropy estimation from data points,
    using kernel density estimation of the probability density.

    See: Ivanov A. V. and Rozhkova, "Properties of the statistical estimate
    of the entropy of a random vector with a probability density".
    """

    def __init__(self, kernel, min_log_proba, bandwidth=1.0):
        self.kde = KernelDensity(kernel=kernel, bandwidth=bandwidth)
        self.min_log_proba = min_log_proba

    def estimateFromData(self, datapoints):
        entropy = 0.0
        if len(datapoints.shape) == 1:
            datapoints = np.expand_dims(datapoints, 1)

        for i in range(datapoints.shape[0]):
            # Fit on all points except the i-th, then score the held-out point.
            curr = np.delete(datapoints, i, axis=0)
            self.kde.fit(curr)
            score = self.kde.score(datapoints[None, i, :])
            if score < self.min_log_proba:
                # Skip points whose log-density is numerically degenerate.
                continue
            entropy -= score

        return entropy / datapoints.shape[0]

    def entropy(self, X):
        return self.estimateFromData(X)

    def flags(self):
        return False, False, False
class OneClassKDE(BaseClassifier):
    _fit_params = ["bandwidth"]
    _predict_params = []

    def __init__(self, *args, **kwargs):
        self.bandwidth = kwargs["bandwidth"]
        self.perc_keep = kwargs["perc_keep"]

    def fit(self, data, **kwargs):
        self.kde = KernelDensity(kernel='gaussian', bandwidth=self.bandwidth)
        # Random 50/50 split: fit on one half, calibrate the threshold on the other.
        idx = np.random.randint(2, size=len(data)).astype(bool)
        self.kde.fit(data[idx, :])
        self.training_score = self.kde.score_samples(data[~idx, :])
        self.direct_thresh = np.percentile(self.training_score,
                                           100 - self.perc_keep)
        print('training', self.training_score.min(), self.training_score.mean(),
              self.training_score.max(), self.direct_thresh)

    def predict(self, data):
        score = self.kde.score_samples(data)
        self.score = score
        res = score < self.direct_thresh
        print('test', self.score.min(), self.score.mean(), self.score.max())
        print(res.sum(), "of", len(self.score), 'outliers')
        # Map booleans to +1 (inlier) / -1 (outlier). A signed dtype is used
        # here; the original uint8 overflows under the * -2 in modern numpy.
        return res.astype(np.int32) * -2 + 1

    def decision_function(self, data=None):
        return self.score
def projected_density_gauss(pos, centre, fov, ncells):
    """
    Input:
        pos: particle positions
        centre: centre of sub-&halo
        fov: field-of-view
        ncells: number of grid cells
    """
    pos = pos - centre
    _indx = np.logical_and(np.abs(pos[:, 0]) < 0.5 * fov,
                           np.abs(pos[:, 1]) < 0.5 * fov)
    pos = pos[_indx, :]

    # Silverman-style bandwidth with a fixed sample-size constant.
    n = 1024 * 1024
    h = (4 * np.std(pos[:, :2])**5 / (3 * n))**(1 / 5)
    # TODO: plot this faulty situation
    kde_skl = KernelDensity(bandwidth=h, kernel='gaussian',
                            algorithm='ball_tree')
    xx, yy = np.mgrid[min(pos[:, 0]):max(pos[:, 0]):complex(ncells),
                      min(pos[:, 1]):max(pos[:, 1]):complex(ncells)]
    xy_sample = np.vstack([xx.ravel(), yy.ravel()]).T
    kde_skl.fit(pos[:, :2])
    sigma = np.exp(kde_skl.score_samples(xy_sample))
    sigma = sigma.reshape(xx.shape)
    return sigma, h
def Kde_model(bw, data):
    """Return the per-class, per-feature KDE list for the given bandwidth.

    The data must be laid out as [[feats], y] (label in the last column).
    """
    kde_list = [[], []]
    data_0 = data[data[:, -1] == 0]
    data_1 = data[data[:, -1] == 1]

    # Class 0
    X_feats = data_0[:, :-1]
    Y = data_0[:, -1]
    for feat in range(X_feats.shape[1]):
        X_y = np.column_stack((X_feats[:, feat], Y))
        kde = KernelDensity(kernel='gaussian', bandwidth=bw)
        kde.fit(X_y)
        kde_list[0].append(kde)

    # Class 1
    X_feats = data_1[:, :-1]
    Y = data_1[:, -1]
    for feat in range(X_feats.shape[1]):
        X_y = np.column_stack((X_feats[:, feat], Y))
        kde = KernelDensity(kernel='gaussian', bandwidth=bw)
        kde.fit(X_y)
        kde_list[1].append(kde)

    return kde_list
class KDECluster:
    """points is a vector of vectors: [[], []]"""

    def __init__(self, points, bw):
        # Small clusters get an exact Gaussian KDE; larger ones use the
        # faster Epanechnikov kernel with a ball tree.
        if len(points) < 5:
            self.kde_ = KernelDensity(kernel='gaussian', bandwidth=bw)
        else:
            self.kde_ = KernelDensity(kernel='epanechnikov',
                                      algorithm='ball_tree',
                                      bandwidth=bw, leaf_size=50)
        self.points_ = points
        self.kde_.fit(points)

    def compare(self, cluster):
        scores_self = np.exp(self.kde_.score_samples(cluster.points_))
        scores_clus = np.exp(cluster.kde_.score_samples(self.points_))
        m_self = max(scores_self)
        m_clus = max(scores_clus)
        return max(m_clus, m_self)
def kde_naive_bayes(X_train, Y_train, bw):
    # Split the training set into two matrices, one per class.
    matrix_0 = []
    matrix_1 = []
    for i in range(len(Y_train)):
        if Y_train[i] == 0:
            matrix_0.append(X_train[i])
        else:
            matrix_1.append(X_train[i])

    # Convert the matrices into numpy arrays.
    matrix_0 = np.array(matrix_0)
    matrix_1 = np.array(matrix_1)

    # Prior probabilities for each class.
    prior_prob_0 = len(matrix_0) / len(X_train)
    prior_prob_1 = len(matrix_1) / len(X_train)

    # Lists to store the conditional distributions of each class.
    kde_0 = []
    kde_1 = []

    # One kernel density estimator per feature-class combination
    # (the first four columns are features; the last column is not a feature).
    for i in range(0, 4):
        # Create a KernelDensity object, fit it with the training data and
        # store the distribution.
        kde_0_k = KernelDensity(kernel='gaussian', bandwidth=bw)
        kde_0_k.fit(matrix_0[:, i].reshape(-1, 1))
        kde_0.append(kde_0_k)

        kde_1_k = KernelDensity(kernel='gaussian', bandwidth=bw)
        kde_1_k.fit(matrix_1[:, i].reshape(-1, 1))
        kde_1.append(kde_1_k)

    # Convert into numpy arrays.
    kde_0 = np.array(kde_0)
    kde_1 = np.array(kde_1)

    return (prior_prob_0, prior_prob_1, kde_0, kde_1)
def kde_sklearn(x, x_grid, bandwidth=0.2, **kwargs):
    """Kernel density estimation with scikit-learn; returns the fitted KDE."""
    kde_skl = KernelDensity(bandwidth=bandwidth, **kwargs)
    kde_skl.fit(x[:, np.newaxis])
    # score_samples() returns the log-likelihood of the samples.
    log_pdf = kde_skl.score_samples(x_grid[:, np.newaxis])
    return kde_skl, np.exp(log_pdf)
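# Usage sketch for the kde_sklearn variant above (hypothetical data): fit a
# Gaussian KDE to a 1D sample and evaluate it on a grid; the density should
# integrate to roughly 1 over a grid that covers the support.
sample_x = np.random.normal(size=500)
grid = np.linspace(-4.0, 4.0, 200)
_, pdf = kde_sklearn(sample_x, grid, bandwidth=0.3, kernel='gaussian')
print(np.trapz(pdf, grid))  # close to 1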
def _importance_preprocess_uni(states, rewards, gradients, p_tar, p_gen):
    res = _create_episode_info()

    flat_states = [s for traj in states for s in traj]
    # TODO: pass the kernel/bandwidth in as arguments?
    kde = KernelDensity(kernel='gaussian', bandwidth=0.25)
    kde.fit(flat_states)

    for ss, rs, gs, ps, qs in zip(states, rewards, gradients, p_tar, p_gen):
        state_probs = kde.score_samples(ss)
        traj_p = np.cumsum(ps)  # + np.mean(state_probs)
        traj_q = np.cumsum(qs) + state_probs
        traj_grads = np.cumsum(gs, axis=0)
        r_acc = np.cumsum(rs[::-1])[::-1]
        r_grad = (r_acc * traj_grads.T).T

        res.r_grads.extend(r_grad)
        res.traj_p_tar.extend(traj_p)
        res.traj_p_gen.extend(traj_q)
        res.traj_grads.extend(traj_grads)
        res.traj_r.extend(r_acc)

        # Used for estimating the Fisher information.
        res.act_grads.extend(gs)
        res.state_act_p_tar.extend(traj_p)
        res.state_act_p_gen.extend(traj_q)

    return res
def estimate_distribution(samples, h=0.1, n_points=100):
    kde = KernelDensity(bandwidth=h)
    samples = samples[:, np.newaxis]
    kde.fit(samples)
    xs = np.linspace(-1.0, 1.0, n_points)
    # score_samples expects a 2D array and returns log-densities; the original
    # called kde.score on a 1D one-element list, which scikit-learn rejects.
    ys = np.exp(kde.score_samples(xs[:, np.newaxis]))
    return xs, ys
class RegularizedKernelDensityEstimator(BaseEstimator):
    """KDE mixed with a uniform density for regularization."""

    def __init__(self, bandwidth=1.0, regularization=1.0e-5):
        self.bandwidth = bandwidth
        self.regularization = regularization

    def setup(self):
        self.kde = KernelDensity(kernel='gaussian', bandwidth=self.bandwidth)
        height, width = self.shape
        self.uniform_density = -np.log(width * height)
        self.kde_constant = np.log(1 - self.regularization)
        self.uniform_constant = np.log(self.regularization)

    def fit(self, X):
        # Columns 0:2 are coordinates; columns 2:4 carry the domain shape.
        self.shape = X[0, 2:4]
        self.setup()
        self.kde.fit(X[:, 0:2])
        return self

    def score_samples(self, X):
        kde_logliks = self.kde.score_samples(X[:, :2])
        # log((1 - eps) * p_kde + eps * p_uniform), computed stably.
        logliks = np.logaddexp(self.kde_constant + kde_logliks,
                               self.uniform_constant + self.uniform_density)
        return logliks

    def score(self, X):
        return np.sum(self.score_samples(X))
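# Usage sketch for RegularizedKernelDensityEstimator (hypothetical fixation
# data, not from the original project): each row is (x, y, height, width),
# so the estimator reads the domain size from columns 2:4 of the first row.
fixations = np.column_stack([np.random.rand(100) * 480,
                             np.random.rand(100) * 640,
                             np.full(100, 480.0),
                             np.full(100, 640.0)])
rkde = RegularizedKernelDensityEstimator(bandwidth=15.0).fit(fixations)
print(rkde.score(fixations))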
class AUCKernelDensityEstimator(BaseEstimator):
    def __init__(self, nonfixations, bandwidth=1.0):
        self.bandwidth = bandwidth
        self.nonfixations = nonfixations

    def setup(self):
        self.kde = KernelDensity(kernel='gaussian', bandwidth=self.bandwidth)

    def fit(self, X):
        self.setup()
        self.kde.fit(X)
        self.nonfixation_values = self.kde.score_samples(self.nonfixations)
        return self

    def score_samples(self, X):
        pos_logliks = self.kde.score_samples(X)
        neg_logliks = self.nonfixation_values
        aucs = [general_roc(np.array([p]), neg_logliks)[0]
                for p in pos_logliks]
        return aucs

    def score(self, X):
        return np.sum(self.score_samples(X))
def kde_sklearn(x, bandwidth=0.2, **kwargs):
    """Kernel density estimation with scikit-learn on an automatic grid."""
    x_grid = np.linspace(x.min() - 1, x.max() + 1, 500)
    kde_skl = KernelDensity(bandwidth=bandwidth, **kwargs)
    kde_skl.fit(x[:, np.newaxis])
    # score_samples() returns the log-likelihood of the samples.
    log_pdf = kde_skl.score_samples(x_grid[:, np.newaxis])
    return np.exp(log_pdf), x_grid
def kde_single_arr(x_series, bandwidth=1.0):
    """
    x_series: a pd.Series, i.e. a single column of data.
    Fit a kernel density estimate to the data in x_series.
    """
    kde = KernelDensity(bandwidth=bandwidth, kernel='gaussian')
    kde.fit(x_series.values.reshape(-1, 1))
    return kde
def _fit(self, df, kernel='gaussian'):
    """Estimate the density of errors as a function of perceptual distance."""
    df = df.copy()
    errors = df[~df['correct']]
    kde = KernelDensity(kernel=kernel)
    # This may take a while if the sample is large.
    kde.fit(errors['distance'].values.reshape(-1, 1))
    self.kde = kde
def kde_sklearn(ndim, kernel, bd, Nt, No, coordo, coordt,
                dtype='float64', rtime=False):
    """
    Calculate a PDF by KDE, based on scikit-learn's KernelDensity method.

    Inputs:
        ndim   -- the number of dimensions/variables [int]
        kernel -- the type of the kernel [str]
        bd     -- a list of bandwidths, one per dimension/variable [list]
        Nt     -- the number of locations whose PDF will be estimated [int]
        No     -- the number of sampled locations [int]
        coordo -- the sampled locations [ndarray with shape (No, ndim)]
        coordt -- the locations to be estimated [ndarray with shape (Nt, ndim)]
    Outputs:
        pdf    -- the estimated pdf [ndarray with shape (Nt,)]
    """
    # Check the kernel type.
    if kernel.lower() not in allowed_kernels:
        raise Exception('Unknown kernel type %s' % kernel)

    # Convert a float bandwidth into a one-element numpy array.
    if ndim == 1 and isinstance(bd, float):
        bd = np.array([bd], dtype='float64')

    # Check dimensions.
    if (No, ndim) != coordo.shape and (No,) != coordo.shape:
        raise Exception('Wrong dimension and size of coordo!')
    if (Nt, ndim) != coordt.shape and (Nt,) != coordt.shape:
        print(Nt, ndim, coordt.shape)
        raise Exception('Wrong dimension and size of coordt!')
    if len(bd) != ndim:
        raise Exception('The length of the bandwidth does not equal '
                        'the number of dimensions!')

    # Reshape coordt from (Nt,) to (Nt, 1) when ndim is 1.
    if ndim == 1 and coordt.shape == (Nt,):
        coordt = coordt[:, np.newaxis]

    # Calculate the pdf and time the computation.
    start = time()
    kde_skl = KernelDensity(bandwidth=bd[0], kernel=kernel.lower())
    kde_skl.fit(coordo)
    log_pdf = kde_skl.score_samples(coordt)
    pdf = np.exp(log_pdf, dtype=dtype)
    end = time()

    # Return results.
    if rtime:
        return pdf, end - start
    return pdf
def createfeatmat(N):
    # `vals` and `getgridcoords` are module-level globals in the original code.
    grid = getgridcoords(N).T
    featmat = np.zeros((len(vals), N ** 2))
    for i in range(len(vals)):
        m = np.array([vals[i][0], vals[i][1]]).T
        k = KernelDensity(bandwidth=0.5 / (N - 1), kernel="gaussian")
        k.fit(m)
        featmat[i, :] = k.score_samples(grid)
    return featmat
def kernel_fit_hist(data, hist, bw=None, min_size=20, kern='gaussian'):
    """Gaussian kernel fit to 1D data, evaluated at histogram bin centres."""
    # Sample at the bin centres (one fewer point than there are bin edges).
    x_grid = 0.5 * (hist[1][1:] + hist[1][:-1])
    if bw is None:
        # Passing bandwidth=None to KernelDensity raises; fall back to the
        # same rule-of-thumb default as kernel_fit_single above.
        std_data = data.std()
        bw = (data.ravel().shape[0] * (std_data + 2) / 4.)**(-1. / (std_data + 4))
    kde = KernelDensity(bandwidth=bw, kernel=kern)
    kde.fit(data.ravel()[:, None])
    pdf = np.exp(kde.score_samples(x_grid[:, None]))
    return pdf, hist[1]
def estimate_distribution(samples, h=0.1, n_points=100):
    kde = KernelDensity(bandwidth=h)
    min_xs = min(samples)
    max_xs = max(samples)
    samples = samples[:, np.newaxis]
    kde.fit(samples)
    xs = np.linspace(min_xs, max_xs, n_points)
    ys = np.exp(kde.score_samples(xs[:, np.newaxis]))
    print(xs.shape, ys.shape, sum(ys))
    return xs, ys
def kde_sklearn(x, x_grid, bandwidth=0.2, **kwargs):
    """Kernel density estimation with scikit-learn, renormalized on the grid."""
    kde_skl = KernelDensity(bandwidth=bandwidth, **kwargs)
    kde_skl.fit(x[:, np.newaxis])
    # score_samples() returns the log-likelihood of the samples.
    log_pdf = kde_skl.score_samples(x_grid[:, np.newaxis])
    # Renormalize so the density integrates to 1 over x_grid.
    N = np.trapz(np.exp(log_pdf), x_grid)
    return np.exp(log_pdf) / N
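# Usage sketch for the renormalizing kde_sklearn above (hypothetical data):
# the trapezoidal renormalization makes the density integrate to exactly 1
# over the supplied grid, which matters when the grid truncates the tails.
trunc_sample = np.random.standard_normal(300)
trunc_grid = np.linspace(-1.0, 1.0, 100)  # deliberately truncated support
trunc_pdf = kde_sklearn(trunc_sample, trunc_grid, bandwidth=0.3)
print(np.trapz(trunc_pdf, trunc_grid))  # 1.0 by construction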
def plot_scatter(X, scale, out_prefix, title, kde=True):
    """Draws a 2D scatter plot (png) of the core and accessory distances

    Also draws contours of the kernel density estimate

    Args:
        X (numpy.array)
            n x 2 array of core and accessory distances for n samples.
        scale (numpy.array)
            Scaling factor from :class:`~PopPUNK.models.BGMMFit`
        out_prefix (str)
            Prefix for output plot file (.png will be appended)
        title (str)
            The title to display above the plot
        kde (bool)
            Whether to draw kernel density estimate contours (default = True)
    """
    plt.figure(figsize=(11, 8), dpi=160, facecolor='w', edgecolor='k')
    if kde:
        xx, yy, xy = get_grid(0, 1, 100)

        # KDE estimate
        kde = KernelDensity(bandwidth=0.03, metric='euclidean',
                            kernel='epanechnikov', algorithm='ball_tree')
        kde.fit(X)
        z = np.exp(kde.score_samples(xy))
        z = z.reshape(xx.shape).T

        levels = np.linspace(z.min(), z.max(), 10)
        plt.contour(xx * scale[0], yy * scale[1], z, levels=levels[1:],
                    cmap='plasma')
        scatter_alpha = 1
    else:
        scatter_alpha = 0.1

    plt.scatter(X[:, 0] * scale[0].flat, X[:, 1] * scale[1].flat,
                s=1, alpha=scatter_alpha)

    plt.title(title)
    plt.xlabel('Core distance (' + r'$\pi$' + ')')
    plt.ylabel('Accessory distance (' + r'$a$' + ')')
    plt.savefig(out_prefix + ".png")
    plt.close()
def _evaluate_vec(self, opts, step, real_points, fake_points,
                  validation_fake_points, prefix=''):
    """Compute the average log-likelihood and the Coverage metric.

    The Coverage metric is defined in the arXiv paper: it counts the mass of
    true data covered by the 95% quantile of the model density.
    """
    # Estimate the density with a KDE whose bandwidth is seeded by the median
    # distance between consecutive fake points.
    dist = fake_points[:-1] - fake_points[1:]
    dist = dist * dist
    dist = np.sqrt(np.sum(dist, axis=(1, 2, 3)))
    bandwidth = np.median(dist)
    num_real = len(real_points)
    num_fake = len(fake_points)
    if validation_fake_points is not None:
        # Pick the bandwidth that maximizes the validation log-likelihood.
        max_score = -1000000.
        num_val = len(validation_fake_points)
        b_grid = bandwidth * (2.**(np.arange(14) - 7.))
        for _bandwidth in b_grid:
            kde = KernelDensity(kernel='gaussian', bandwidth=_bandwidth)
            kde.fit(np.reshape(fake_points, [num_fake, -1]))
            score = np.mean(kde.score_samples(
                np.reshape(validation_fake_points, [num_val, -1])))
            if score > max_score:
                # logging.debug("Updating bandwidth to %.4f"
                #               " with likelihood %.2f" % (_bandwidth, score))
                bandwidth = _bandwidth
                max_score = score
    kde = KernelDensity(kernel='gaussian', bandwidth=bandwidth)
    kde.fit(np.reshape(fake_points, [num_fake, -1]))

    # Compute Coverage; refer to Section 4.3 of the arXiv paper.
    model_log_density = kde.score_samples(
        np.reshape(fake_points, [num_fake, -1]))
    # np.percentile(a, 10) returns t s.t. np.mean(a <= t) = 0.1
    threshold = np.percentile(model_log_density, 5)
    real_points_log_density = kde.score_samples(
        np.reshape(real_points, [num_real, -1]))
    ratio_not_covered = np.mean(real_points_log_density <= threshold)

    log_p = np.mean(real_points_log_density)
    C = 1. - ratio_not_covered

    logging.info('Evaluating: log_p=%.3f, C=%.3f' % (log_p, C))
    return log_p, C
def fit(self, X, Y):
    x0 = X[Y == 0, :]
    x1 = X[Y == 1, :]
    # Log prior probabilities of the two classes.
    self.pc0 = np.log(float(x0.shape[0]) / float(X.shape[0]))
    self.pc1 = np.log(float(x1.shape[0]) / float(X.shape[0]))
    # One (class 0, class 1) KDE pair per feature.
    self.kdes = []
    for ix in range(X.shape[1]):
        kde0 = KernelDensity(kernel='gaussian', bandwidth=self.bw)
        kde0.fit(x0[:, [ix]])
        kde1 = KernelDensity(kernel='gaussian', bandwidth=self.bw)
        kde1.fit(x1[:, [ix]])
        self.kdes.append((kde0, kde1))
def train_KDE_model(train_df, bandwidth=KDE_BANDWITH):
    """Train a KDE model on the coordinates of incidents."""
    kde = KernelDensity(bandwidth=bandwidth, metric='haversine',
                        kernel='gaussian', algorithm='ball_tree')
    # The haversine metric expects (latitude, longitude) in radians.
    kde.fit(train_df[['latitude', 'longitude']] * np.pi / 180)
    return kde
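# Usage sketch for train_KDE_model (hypothetical DataFrame; KDE_BANDWITH is
# a constant from the original module, so an explicit bandwidth is passed
# here). The haversine metric wants radians, hence the pi/180 scaling.
incidents = pd.DataFrame({'latitude': [40.71, 40.73, 40.75],
                          'longitude': [-74.00, -73.98, -74.02]})
kde_geo = train_KDE_model(incidents, bandwidth=0.01)
print(kde_geo.score_samples(incidents[['latitude', 'longitude']] * np.pi / 180))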
def calc_kdes(X_train, Y_train, X_valid, bw):
    # Accumulate per-class log-likelihoods over all features, naive-Bayes
    # style. FEATS is a module-level constant in the original code.
    prob_matrix = np.zeros((2, X_valid.shape[0]))
    for i in range(0, 2):
        X_train_class_i = X_train[Y_train == i, :]
        for j in range(0, FEATS):
            kde = KernelDensity(kernel='gaussian', bandwidth=bw)
            kde.fit(X_train_class_i[:, [j]])
            log_prob = kde.score_samples(X_valid[:, [j]])
            prob_matrix[i] = np.add(prob_matrix[i], log_prob)
    return prob_matrix
class KDEntropyEstimator(ItEstimator):
    discrete = False

    def __init__(self, kernel="gaussian", min_log_proba=-500,
                 bandwidth=1.0, kfold=10):
        self.kde = KernelDensity(kernel=kernel, bandwidth=bandwidth)
        self.min_log_proba = min_log_proba
        self.kfold = kfold

    def estimateFromData(self, datapoints):
        if len(datapoints.shape) == 1:
            datapoints = np.expand_dims(datapoints, 1)

        entropy = 0.0
        n, d = datapoints.shape

        # k-fold cross-validated entropy estimate: fit on k-1 folds and
        # score the held-out fold.
        ma = np.ones(n, dtype=bool)
        unit = n // self.kfold
        rem = n % self.kfold

        start = 0
        end = unit + rem
        for i in range(self.kfold):
            sel = np.arange(start, end)
            ma[start:end] = False

            curr = datapoints[ma, :]
            self.kde.fit(curr)
            score = self.kde.score(datapoints[sel, :])

            ma[:] = True
            start = end
            end = min(unit + end, n)

            if score < self.min_log_proba:
                continue
            entropy -= score

        return entropy / n

    def entropy(self, X):
        np.random.seed(0)
        return self.estimateFromData(X)

    def flags(self):
        return False, False, False
def five_lambdas(lambdas):
    # combined, d0, d1 and d2 are module-level DataFrames in the original code.
    misclassifications = []
    for each_lambda in lambdas:
        # kde = KernelDensity(kernel='epanechnikov', bandwidth=each_lambda)
        kdg = KernelDensity(kernel='gaussian', bandwidth=each_lambda)
        kdg.fit(combined.loc[:, combined.columns != 'y'])
        smooth = kdg.score_samples(combined.loc[:, combined.columns != 'y'])
        data = pd.DataFrame({'Dat': smooth,
                             'y': pd.concat([d0['y'], d1['y'], d2['y']])})
        data_x_train, data_x_test, data_y_train, data_y_test = train_test_split(
            data, pd.DataFrame(data['y']), test_size=0.3, stratify=data['y'])
        model = LDA()
        model.fit(data_x_train.loc[:, data_x_train.columns != 'y'],
                  data_x_train['y'])
        # Note: model.score() returns accuracy, not the misclassification rate,
        # despite the variable names below.
        misclassification = model.score(
            data_x_test.loc[:, data_x_test.columns != 'y'], data_y_test)
        misclassifications.append(misclassification)
    return misclassifications
def kde2D(x, y, bandwidth, xbins=100j, ybins=100j, **kwargs):
    """Build a 2D kernel density estimate (KDE)."""
    # Create a grid of sample locations (default: 100x100).
    xx, yy = np.mgrid[x.min():x.max():xbins,
                      y.min():y.max():ybins]
    xy_sample = np.vstack([yy.ravel(), xx.ravel()]).T
    xy_train = np.vstack([y, x]).T

    kde_skl = KernelDensity(bandwidth=bandwidth, **kwargs)
    kde_skl.fit(xy_train)

    # score_samples() returns the log-likelihood of the samples.
    z = np.exp(kde_skl.score_samples(xy_sample))
    return xx, yy, np.reshape(z, xx.shape)
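# Usage sketch for kde2D (hypothetical data): estimate a correlated 2D density
# and render it with matplotlib's pcolormesh.
x2 = np.random.normal(size=1000)
y2 = 0.5 * x2 + np.random.normal(scale=0.5, size=1000)
xx, yy, zz = kde2D(x2, y2, bandwidth=0.3)
plt.pcolormesh(xx, yy, zz, shading='auto')
plt.colorbar(label='density')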
def get_kde_jsd(x, y, kw_kde={}):
    """Jensen-Shannon distance between two samples via KDE log-densities."""
    kde = KernelDensity(**kw_kde)

    kde.fit(x)
    log_p_x = kde.score_samples(x)
    log_p_y = kde.score_samples(y)

    kde.fit(y)
    log_q_x = kde.score_samples(x)
    log_q_y = kde.score_samples(y)

    # Log of the unnormalized mixture p + q at x and y; the log(2) for the
    # mixture m = (p + q) / 2 is subtracted below.
    log_mix_x = np.logaddexp(log_p_x, log_q_x)
    log_mix_y = np.logaddexp(log_p_y, log_q_y)

    kl_p_m = log_p_x.mean() - (log_mix_x.mean() - np.log(2))
    kl_q_m = log_q_y.mean() - (log_mix_y.mean() - np.log(2))

    js_divergence = (kl_p_m + kl_q_m) / 2.
    js_distance = np.sqrt(js_divergence)
    return js_distance
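# Usage sketch for get_kde_jsd (hypothetical data): JS distance between two
# 1D samples; the inputs must be 2D because they are passed to fit(). Note
# that the finite-sample KL estimates can come out slightly negative, in
# which case the square root yields nan.
a = np.random.normal(0.0, 1.0, size=(500, 1))
b = np.random.normal(1.0, 1.0, size=(500, 1))
print(get_kde_jsd(a, b, kw_kde={'bandwidth': 0.3}))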
def construct_kde(array, bandwidth=None):
    if bandwidth is None:
        # Rule-of-thumb bandwidth: 1.2 * std * n^(-1/5).
        bw = 1.2 * array.std() * np.power(array.size, -1 / 5)
    else:
        bw = bandwidth
    kde = KernelDensity(kernel='gaussian', bandwidth=bw)
    kde.fit(array.reshape(-1, 1))
    x = np.linspace(array.min(), array.max(), 200)
    log_dens = kde.score_samples(x.reshape(-1, 1))
    kdens = np.exp(log_dens)
    # Approximate the CDF with a left Riemann sum of the density.
    cdf_array = np.zeros(shape=len(x))
    delta = x[1] - x[0]
    for i in range(len(x)):
        cdf_array[i] = np.sum(kdens[:i]) * delta
    return x, kdens, cdf_array
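# Usage sketch for construct_kde (hypothetical data): the third return value
# approximates the CDF by a left Riemann sum, so its last entry should be
# close to 1 (up to the mass truncated outside the sample range).
skewed = np.random.gamma(2.0, size=1000)
grid_x, dens, cdf = construct_kde(skewed)
print(cdf[-1])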
class KDEModel(object):
    """
    Wrapper class for scikit-learn's kernel density estimation model.

    Attributes
    ----------
    model : KernelDensity
        Wrapped class model.
    """

    def __init__(self, kernel='gaussian', bandwidth=.001):
        # Pass the kernel argument through (the original hardcoded 'gaussian'
        # and silently ignored the parameter).
        self.model = KernelDensity(kernel=kernel, bandwidth=bandwidth)

    def fit(self, train_X):
        """
        Wrapper method for the fit() method of the kernel density model.

        Parameters
        ----------
        train_X : {array-like, sparse matrix}, shape = [n_samples, n_features]
        """
        self.model.fit(train_X)

    def generate_samples(self, n_samples):
        """
        Generate random samples from the fitted distribution.

        Returns
        -------
        list
            List of numpy arrays of randomly generated observations.
        """
        points = self.model.sample(n_samples)
        return points

    def score_samples(self, X):
        """
        Predict the log-likelihood score of the samples in X.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
        """
        return self.model.score_samples(X)
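# Usage sketch for KDEModel (hypothetical data): fit the wrapper, then draw
# synthetic observations from the estimated density.
km = KDEModel(kernel='gaussian', bandwidth=0.05)
km.fit(np.random.rand(200, 2))
synthetic = km.generate_samples(10)  # array of shape (10, 2)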
class OneClassKDE(BaseClassifier):
    _fit_params = ["bandwidth"]

    def __init__(self, *args, **kwargs):
        self.bandwidth = kwargs["bandwidth"]

    def fit(self, data, **kwargs):
        self.kde = KernelDensity(kernel='gaussian', bandwidth=self.bandwidth)
        self.kde.fit(data)
        self.training_score = self.kde.score_samples(data)
        # Flag the lowest-density 10% of the training data as outliers.
        self.direct_thresh = np.percentile(self.training_score, 10)

    def predict(self, data):
        score = self.kde.score_samples(data)
        self.score = score
        # Map booleans to +1 (inlier) / -1 (outlier).
        return (score < self.direct_thresh).astype(np.int32) * -2 + 1

    def decision_function(self, data):
        return self.score
def resample_state(D, w):
    w_norm = np.sum(w)  # normalization factor for the weights
    w_ecdf = np.cumsum(w) / w_norm  # empirical CDF of the new weights

    # Resample the points: draw new weighted particles (samples) from the
    # previous step given the new measurement.
    D_new, ind = np.empty_like(D), np.empty_like(D)
    for i, q in enumerate(D):
        ind[i] = bisect.bisect_left(w_ecdf, np.random.uniform(0, 1))
        D_new[i] = D[int(ind[i])]

    # Regularize: jitter the resampled particles by sampling from a KDE.
    # A fixed bandwidth is used; Silverman's rule of thumb
    # (1.06 * std * n^(-1/5)) is left commented out. `num_particles` is a
    # module-level constant in the original code.
    bandwidth = 0.05  # 1.06 * np.std(D_new) * len(D_new)**-0.2; used to be 0.08
    kde = KernelDensity(bandwidth=bandwidth, kernel='gaussian',
                        algorithm='ball_tree')
    kde.fit(D_new[:, np.newaxis])
    return kde.sample(num_particles).flatten(), ind
def nmultitype_conf_matrix(self, tipos, nfolds):
    cadena = ""
    for t in tipos:
        cadena += t
    model_path = ("models/nmultitype_conf_matrix" + self.bd + "ts" + cadena +
                  "Promedio" + str(nfolds) + ".p")
    # The trailing "or True" deliberately forces recomputation even when a
    # cached result exists.
    if not os.path.exists(model_path) or True:
        # Matrix of matrices holding the partial results.
        matrices = [None] * nfolds * nfolds

        # Create/retrieve the Node2Vec model.
        n2v = node2vec(self.bd, self.port, self.user, self.pss, self.label,
                       1000, 20, 6, self.mode, [], 1)
        n2v.learn("normal", 0, False, 0)

        # Build the X and Y arrays; `comunes` collects the nodes that belong
        # to both types at once.
        X = []
        Y = []
        comunes = list()
        for tipo in tipos:
            for n in n2v.n_types[tipo]:
                if n in n2v.w2v:
                    X.append(n2v.w2v[n])
                    if n in n2v.n_types[tipos[0]] and n in n2v.n_types[tipos[1]]:
                        comunes.append(n2v.w2v[n])
                    Y.append(tipo)

        # Create the stratified k-folds (modern scikit-learn API; the original
        # used the pre-0.18 StratifiedKFold(Y, n_folds=nfolds) form).
        X = np.array(X)
        Y = np.array(Y)
        skf = StratifiedKFold(n_splits=nfolds)
        it = 0
        kdes = []
        for train_index, test_index in skf.split(X, Y):
            print("k-fold for the KDE")
            X_train, X_test = X[train_index], X[test_index]
            Y_train, Y_test = Y[train_index], Y[test_index]
            Y_test = Y_test.astype('U64')  # unicode ('|S64' under Python 2)

            # Build the probability density function of each type.
            for t in tipos:
                print("Building the KDE for type " + t)
                tempX = []
                for idx, n in enumerate(Y_train):
                    if n == t:
                        tempX.append(X_train[idx])
                # Fit the KDE on the training set. A GridSearchCV over
                # bandwidths to optimize the estimator is left commented out:
                # params = {'bandwidth': np.logspace(-1, 1, 10)}
                # grid = GridSearchCV(KernelDensity(), params)
                # grid.fit(tempX)
                # kde = grid.best_estimator_
                kde = KernelDensity(kernel='gaussian', bandwidth=0.1)
                kde.fit(tempX)
                kdes.append(kde)
                print("Finished the KDE for type " + t)

            # Relabel the test elements that carry both types.
            cont = 0
            for idx, x in enumerate(X_test):
                x = np.array(x)
                if any((x == a).all() for a in comunes):
                    Y_test[idx] = str(tipos[0] + "+" + tipos[1])
                    cont += 1
            print("Number of elements with a double type: " + str(cont))

            # Stratified k-folds for the decision tree.
            skf_tree = StratifiedKFold(n_splits=nfolds)
            for train_index1, test_index1 in skf_tree.split(X_test, Y_test):
                print("k-fold for the decision tree")
                X_train1, X_test1 = X_test[train_index1], X_test[test_index1]
                Y_train1, Y_test1 = Y_test[train_index1], Y_test[test_index1]
                clf = DecisionTreeClassifier(random_state=0)
                print(X_train1[0])
                clf.fit(X_train1, Y_train1)
                export_graphviz(clf)
                Y_pred1 = clf.predict(X_test1)
                matriz = metrics.confusion_matrix(
                    Y_test1, Y_pred1,
                    labels=[tipos[0], tipos[1], tipos[0] + "+" + tipos[1]])
                matrices[it] = np.array(matriz)
                print(matrices[it])
                it += 1

        with open(model_path, "wb") as f:
            pickle.dump(matrices, f)
    else:
        with open(model_path, "rb") as f:
            matrices = pickle.load(f)

    # Sum, then average the confusion matrices.
    total = matrices[0]
    for m in matrices[1:]:
        total += m
    print(total)
    matriz_promedio = total.astype('float') / len(matrices)

    # Convert the averaged frequencies into row percentages.
    for i in range(0, len(matriz_promedio)):
        suma = 0
        for j in range(0, len(matriz_promedio)):
            suma += matriz_promedio[i][j]
        for j in range(0, len(matriz_promedio)):
            if suma > 0:
                matriz_promedio[i][j] = round(
                    float(matriz_promedio[i][j] * 100) / float(suma), 2)
            else:
                matriz_promedio[i][j] = 0

    matriz_promedio = matriz_promedio.astype(str)
    for i in range(0, len(matriz_promedio)):
        for j in range(0, len(matriz_promedio)):
            matriz_promedio[i][j] = str(matriz_promedio[i][j]) + "%"
    return matriz_promedio