def cv1(x, bws, model='gaussian', plot=False, n_folds=10):
    """
    This calculates the n-fold cross-validation score for each bandwidth.

    If you set plot to True, then it will show a big grid of the test and
    training samples with the KDE chosen at each step. You might need to
    modify the code if you want a nicer layout :)
    """
    # Get the number of bandwidths to check and the number of objects
    N_bw = len(bws)
    N = len(x)
    cv_1 = np.zeros(N_bw)

    # If plotting is requested, set up the plot region
    if plot:
        fig, axes = plt.subplots(N_bw, int(np.ceil(N/n_folds)), figsize=(15, 8))
        xplot = np.linspace(-3, 8, 1000)

    # Loop over each bandwidth and calculate the probability of the
    # test set for this bandwidth
    for i, bw in enumerate(bws):
        # I will do n-fold CV here. This divides x into n_folds folds.
        kf = KFold(n_splits=n_folds)

        # Initiate - lnP will contain the log likelihood of the test sets
        # and i_k is a counter for the folds that is used for plotting and
        # nothing else.
        lnP = 0.0
        i_k = 0

        # Loop over each fold
        for train, test in kf.split(x):
            x_train = x[train, :]
            x_test = x[test, :]

            # Create the kernel density model for this bandwidth and fit
            # to the training set.
            kde = KD(kernel=model, bandwidth=bw).fit(x_train)

            # score evaluates the log likelihood of a dataset given the fitted KDE.
            log_prob = kde.score(x_test)

            if plot:
                # Show the tries
                ax = axes[i][i_k]
                # Note that the test sample is hard to see here.
                hist(x_train, bins=10, ax=ax, color='red')
                hist(x_test, bins=10, ax=ax, color='blue')
                ax.plot(xplot, np.exp(kde.score_samples(xplot[:, np.newaxis])))
                i_k += 1

            lnP += log_prob

        # Calculate the average log likelihood per object
        cv_1[i] = lnP/N

    return cv_1
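# A minimal sketch (not from the snippet above) of the same bandwidth search done
# with scikit-learn's GridSearchCV: KernelDensity.score() already returns the total
# test log-likelihood, so the default scorer performs the cross-validation that
# cv1() implements by hand. The helper name cv_bandwidth_sketch is ours.
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KernelDensity

def cv_bandwidth_sketch(x, bws, n_folds=10, kernel='gaussian'):
    """Return the bandwidth with the best cross-validated log-likelihood."""
    grid = GridSearchCV(KernelDensity(kernel=kernel), {'bandwidth': bws}, cv=n_folds)
    grid.fit(x)
    return grid.best_params_['bandwidth']

# Example: cv_bandwidth_sketch(np.random.randn(200, 1), np.linspace(0.1, 1.0, 10))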
def question1b(t, key, bwrange):
    import matplotlib.colors as colors
    import matplotlib.cm as cmx
    import seaborn as sns
    sns.set()

    t = Table.read('joint-bh-mass-table.csv')
    X_plot = np.linspace(np.min(t['MBH']) - 4, np.max(t['MBH']) + 4,
                         num=1000)[:, np.newaxis]
    X = t['MBH'][:, np.newaxis]
    #plt.scatter(X[:,0], np.zeros(len(X[:,0])), marker = 'x', color = 'black')

    # different values for the bandwidth
    bwrange = np.arange(1, 10, 0.1)

    # plot many lines using colors from a color map, with MAX the amount of lines
    #jet = plt.get_cmap('jet')
    #cNorm = colors.Normalize(vmin=0, vmax=len(bwrange))
    #scalarMap = cmx.ScalarMappable(norm=cNorm, cmap=jet)

    # set the number of folds
    kf = KFold(n_splits=5)

    likelihood = np.zeros(len(bwrange))
    for bw, i in zip(bwrange, np.arange(len(bwrange))):
        lh = []
        for train_i, test_i in kf.split(X):
            Xtrain, Xtest = X[train_i], X[test_i]
            kde = KernelDensity(bandwidth=bw, kernel='gaussian').fit(Xtrain)
            log_dens = kde.score_samples(Xtrain)
            lhscore = kde.score(Xtest)
            #print('Bandwidth: {0}, Likelihood: {1}'.format(bw, lhscore))
            lh = np.append(lh, lhscore)
        likelihood[i] = np.mean(lh)

    print('Highest likelihood ({0}) at bandwidth = {1}'.format(
        round(np.max(likelihood), 2), bwrange[np.argmax(likelihood)]))

    plt.plot(bwrange, likelihood, color='black', alpha=0.8, label='Likelihood')
    plt.scatter(bwrange[np.argmax(likelihood)], np.max(likelihood),
                marker='x', s=100, color='orange', label='Maximum likelihood')
    plt.xlabel(r'Bandwidth [$10^6$ M$_{\odot}$]')
    plt.ylabel('Likelihood')
    plt.legend(loc='best')
    plt.title('Black hole mass density distribution')
    #plt.savefig('Blackhole-kde-bandwidth-likelyhood.svg')
    plt.show()
def plot_density(model, path): N_samples = 10000 samples = model.predict_y_samples(Xs, N_samples, session=sess)[:, :, 0] # objective = np.average([model.compute_log_likelihood() for _ in range(1000)]) fig, ax = plt.subplots(1, 1, figsize=(6, 6)) ax.scatter(X, Y, marker='.', color='C1') levels = np.linspace(-1, 2, 200) ax.set_ylim(min(levels), max(levels)) ax.set_xlim(min(Xs), max(Xs)) cs = np.zeros((len(Xs), len(levels))) for i, Ss in enumerate(samples.T): bandwidth = 1.06 * np.std(Ss) * len(Ss) ** (-1. / 5) # Silverman's (1986) rule of thumb. kde = KernelDensity(bandwidth=float(bandwidth)) kde.fit(Ss.reshape(-1, 1)) for j, level in enumerate(levels): cs[i, j] = kde.score(np.array(level).reshape(1, 1)) ax.pcolormesh(Xs.flatten(), levels, np.exp(cs.T), cmap='Blues_r') # , alpha=0.1) ax.scatter(X, Y, marker='.', color='C1') plt.savefig(os.path.join(path, 'density_{:03d}.png'.format(k))) plt.close()
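# Both density plots in this section set the bandwidth with Silverman's (1986)
# rule of thumb, h = 1.06 * sigma * n**(-1/5), for 1-D data and a Gaussian kernel.
# A small helper that makes the rule explicit (a sketch; the name
# silverman_bandwidth is ours, not from the source):
import numpy as np
from sklearn.neighbors import KernelDensity

def silverman_bandwidth(samples):
    """Rule-of-thumb bandwidth for 1-D samples (Silverman, 1986)."""
    samples = np.asarray(samples, dtype=float).ravel()
    return 1.06 * samples.std() * len(samples) ** (-1.0 / 5.0)

# kde = KernelDensity(bandwidth=silverman_bandwidth(Ss)).fit(Ss.reshape(-1, 1))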
def plot_posterior(gp, **options): bounds = gp.bounds posterior = gp.get_posterior S = gp.num_posterior_samples ax = get_axes(**options) Xs = np.linspace(*bounds[0], num=1000) samples = posterior(Xs, S)[:, :, 0] # print(samples) ydif = (max(gp.Y) - min(gp.Y)) * 0.15 levels = np.linspace(min(gp.Y) - ydif, max(gp.Y) + ydif, 1000) ax.set_ylim(min(levels), max(levels)) ax.set_xlim(min(Xs), max(Xs)) cs = np.zeros((len(Xs), len(levels))) for i, Ss in enumerate(samples.T): bandwidth = 1.06 * np.std(Ss) * len(Ss)**( -1. / 5) # Silverman's (1986) rule of thumb. kde = KernelDensity(bandwidth=float(bandwidth)) kde.fit(Ss.reshape(-1, 1)) for j, level in enumerate(levels): cs[i, j] = kde.score(np.array(level).reshape(1, 1)) ax.pcolormesh(Xs.flatten(), levels, np.exp(cs.T), cmap='Blues_r') # , alpha=0.1) ax.scatter(gp.X, gp.Y, s=15, color="red", zorder=10) '''for j in range(0, 5):
def pdf(self, token, years, bandwidth=5):
    """
    Estimate a density function from a token's rank series.

    Args:
        token (str)
        years (range)

    Returns:
        OrderedDict {year: density}
    """
    series = self.series(token)

    data = []
    for year, wpm in series.items():
        data += [year] * round(wpm)

    data = np.array(data)[:, np.newaxis]

    pdf = KernelDensity(bandwidth=bandwidth).fit(data)

    samples = OrderedDict()

    for year in years:
        # score() expects a 2D array of samples; pass the year as a 1x1 array.
        samples[year] = np.exp(pdf.score([[year]]))

    return samples
def check_results(kernel, bandwidth, atol, rtol, X, Y, dens_true): kde = KernelDensity(kernel=kernel, bandwidth=bandwidth, atol=atol, rtol=rtol) log_dens = kde.fit(X).score_samples(Y) assert_allclose(np.exp(log_dens), dens_true, atol=atol, rtol=max(1E-7, rtol)) assert_allclose(np.exp(kde.score(Y)), np.prod(dens_true), atol=atol, rtol=max(1E-7, rtol))
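# The assertion above relies on KernelDensity.score(Y) being the *summed*
# log-density of Y, so exponentiating it gives the product of the point-wise
# densities. A tiny self-contained check of that identity (a sketch, not from
# the original test file):
import numpy as np
from sklearn.neighbors import KernelDensity

rng = np.random.RandomState(0)
X_demo, Y_demo = rng.randn(100, 1), rng.randn(5, 1)
kde_demo = KernelDensity(bandwidth=0.5).fit(X_demo)
assert np.allclose(kde_demo.score(Y_demo), kde_demo.score_samples(Y_demo).sum())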
def part_d_test(x, p1, p2): kde = KernelDensity(kernel='gaussian', bandwidth=0.1).fit(x) sc1 = kde.score(p1) sc2 = kde.score(p2) sc_b = cross_val_score(kde, x) sc1_b = sc1 / (sum(sc_b) / len(sc_b)) sc2_b = sc2 / (sum(sc_b) / len(sc_b)) print(sc1_b) print(sc2_b) print('c')
def kde3d(x, y, z, data_point):
    values = np.vstack([x, y, z]).T

    # Use grid search cross-validation to optimize the bandwidth
    # params = {'bandwidth': np.logspace(-1, 1, 20)}
    kde = KernelDensity(bandwidth=0.3)

    kde.fit(values)
    kde_coords = kde.sample(10000)
    log_pdf = kde.score_samples(kde_coords)
    # score() expects a 2D array of shape (1, 3) for a single query point.
    percentile = np.sum(log_pdf < kde.score(np.atleast_2d(data_point))) / 10000.
    return percentile
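# kde3d() above is a Monte Carlo estimate of how much of the fitted density's own
# probability mass lies at densities below the query point (a highest-density-region
# style percentile). A hedged usage sketch on synthetic data (variable names are ours):
import numpy as np

rng = np.random.RandomState(1)
x_demo, y_demo, z_demo = rng.randn(3, 500)
print(kde3d(x_demo, y_demo, z_demo, np.zeros((1, 3))))      # central point -> close to 1
print(kde3d(x_demo, y_demo, z_demo, np.full((1, 3), 4.0)))  # outlying point -> close to 0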
def test(): size=100 a=0.5 p=2 rand=0 # generate sequence states, observations, filtered_state_estimates = KalmanSequence(size, a, rand) # # plot sequence # plt.plot(states, marker='.', label="true") # # plt.plot(observations, label="obs") # plt.plot(filtered_state_estimates, marker='.', label="est") # plt.legend() # plt.show() # plt.clf() # produce blocks (X:label, Y:features) X,Y = produce_blocks(states, p) XY = np.concatenate((X,Y), axis=1) # print("data") # print(states) # print(X) # print(Y) # estimate pdf # Compute the total log probability density under the model. # aka score = log-likelihood kde_x = KernelDensity(kernel='gaussian', bandwidth=2).fit(X) kde_y = KernelDensity(kernel='gaussian', bandwidth=2).fit(Y) kde_xy = KernelDensity(kernel='gaussian', bandwidth=2).fit(XY) print("estimation") print(e ** kde_x.score(X)) print(e ** kde_y.score(Y)) print(e ** kde_xy.score(XY)) entropy_est = -np.mean(kde_xy.score(XY) - kde_y.score(Y)) print("Estimated Lower Bound: ", func(entropy_est)) print("Kalman Filter MSE : ", mse(states, filtered_state_estimates))
def calculateDensityKernel(self, pt_3d, num_neigh=10, noprogress=True): dists, nn_idxs = self.kdt.query(pt_3d, num_neigh) densities = [] for i in tqdm(range(0, pt_3d.shape[0]), disable=noprogress): nn_coords = self.all_coords3d[nn_idxs[i], :3] density = KernelDensity().fit(nn_coords) log_density = density.score(pt_3d[i].reshape(1, -1)) density = np.exp(log_density) densities.append(density) densities = np.array(densities).reshape(-1) return densities
def calculate_KL_KDE(distances):
    from sklearn.neighbors import KernelDensity
    from scipy.integrate import quad

    # The original (Python 2) signature unpacked the tuple in the argument list;
    # unpack it explicitly so the function also runs under Python 3.
    posterior_distances, prior_distances = distances

    h_silverman = lambda d: d.std() * (4. / 3 / len(d))**(1. / 5)
    h = h_silverman
    prior = KernelDensity(kernel='gaussian', bandwidth=h(prior_distances)).fit(
        prior_distances.reshape(-1, 1))
    posterior = KernelDensity(kernel='gaussian', bandwidth=h(posterior_distances)).fit(
        posterior_distances.reshape(-1, 1))

    # score() expects a 2D array, so wrap the scalar that quad passes in.
    ce = lambda x: -prior.score([[x]]) * np.exp(posterior.score([[x]]))
    hh = lambda x: -posterior.score([[x]]) * np.exp(posterior.score([[x]]))

    x_max = np.max((posterior_distances.max(), prior_distances.max()))
    vals = (quad(ce, 0., x_max)[0], quad(hh, 0., x_max)[0])
    return vals[0] - vals[1]
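# An alternative to the quadrature above is a plain Monte Carlo estimate of
# KL(posterior || prior): average the log-density difference over samples drawn
# from the posterior KDE. A sketch under the same Silverman-style bandwidth rule
# (the helper name calculate_KL_KDE_mc is ours):
import numpy as np
from sklearn.neighbors import KernelDensity

def calculate_KL_KDE_mc(posterior_distances, prior_distances, n_samples=10000):
    h = lambda d: d.std() * (4. / 3 / len(d)) ** (1. / 5)
    posterior = KernelDensity(bandwidth=h(posterior_distances)).fit(
        posterior_distances.reshape(-1, 1))
    prior = KernelDensity(bandwidth=h(prior_distances)).fit(
        prior_distances.reshape(-1, 1))
    xs = posterior.sample(n_samples)
    return np.mean(posterior.score_samples(xs) - prior.score_samples(xs))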
class KernelDensityLmC(ContinuousLmC): def Init(self): ContinuousLmC.Init(self) self.kde = KernelDensity() self.lBandWidth = np.logspace(-2, 0, 10) self.BandWidth = 0.001 self.KernelType = 'additivekde' def SetPara(self, conf): ContinuousLmC.SetPara(self, conf) self.BandWidth = conf.GetConf('bandwidth', self.BandWidth) self.KernelType = conf.GetConf('kernel', self.KernelType) return True def Construct(self, lTerm, Word2VecModel): if [] == lTerm: return lX = np.array( [Word2VecModel[term] for term in lTerm if term in Word2VecModel]) # self.kde = self.CVForBestKde() self.FitKernel(lX) logging.debug('doc kde lm estimated') def FitKernel(self, lX): if self.KernelType == 'additivekde': self.kde = AdditiveKdeC() self.kde.Bandwidth = self.BandWidth self.kde.fit(lX) return if self.KernelType == 'kde': self.kde = KernelDensity(kernel='gaussian', bandwidth=self.BandWidth).fit(lX) return def CVForBestKde(self): ''' this is CV for each doc's best bandwidth It is better/more intuitive to CV for training query's ranking performance ''' params = {'bandwidth': self.lBandWidth} # logging.debug('cv bandwidth from [%s]',json.dumps(self.lBandWidth)) grid = GridSearchCV(KernelDensity(), params) logging.debug('fitting on [%d] vector', len(self.lX)) grid.fit(self.lX) logging.info('best bandwidth = [%f]', grid.best_estimator_.bandwidth) return grid.best_estimator_ def pdf(self, x): return np.exp(self.LogPdf(x)) def LogPdf(self, x): return self.kde.score(x)
def get_params_ll(X_train, X_validate, bandwidth_kernel): """ Fit data using this bandwidth and kernel and report back log-likelihood fit on validation data (30% of training). This works better than 3-fold cross-validation, which was found to overfit data. :param X: data for only positive class :param bandwidth_kernel: list of bandwidth and kernel to evaluate """ bandwidth = bandwidth_kernel[0] kernel = bandwidth_kernel[1] kde = KernelDensity(bandwidth=bandwidth, metric='euclidean', kernel=kernel ) kde.fit(X_train) ll = kde.score(X_validate) return ll
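# Because get_params_ll() takes a single (bandwidth, kernel) pair, a grid of
# candidates can be scanned with itertools.product. A hedged usage sketch
# (X_train / X_validate are assumed to be prepared as described in the docstring above):
import itertools
import numpy as np

candidates = list(itertools.product(np.logspace(-1, 0, 5),
                                    ['gaussian', 'epanechnikov']))
# lls = [get_params_ll(X_train, X_validate, bk) for bk in candidates]
# best_bandwidth, best_kernel = candidates[int(np.argmax(lls))]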
def bestbandwidth(a):
    kf = KFold(n_splits=10)
    kf.get_n_splits(a)
    Max = -1e99
    Bandwidth = None
    # Scan candidate bandwidths 0.001, 0.002, ..., 1.000 (the candidate grid is an
    # assumption; the original snippet only defined the step 0.001 + i / 1000. and
    # was missing this outer loop and the initialisation of `array`).
    for i in range(1000):
        array = np.array([])
        for train_index, test_index in kf.split(a):
            a_train, a_test = a[train_index], a[test_index]
            kde = KernelDensity(kernel='gaussian',
                                bandwidth=0.001 + i / 1000.).fit(a_train)
            log_dens = kde.score_samples(a_train)
            loglikelihood = kde.score(a_test)
            array = np.append(array, loglikelihood)
        Loglikelihood = np.nanmean(array)
        if Loglikelihood > Max:
            Max = Loglikelihood
            Bandwidth = 0.001 + i / 1000.
            print('new best value for the bandwidth: ', Bandwidth)
    return Bandwidth
def plot_posterior_samples(target_model, x_counts=1000, samples=100, points=True, kde=True): m = target_model bounds = m.bounds S = samples if type(m).__name__ == 'DGPRegression': posterior = m.get_posterior elif type(m).__name__ == 'GPyRegression': posterior = m.get_posterior else: raise ValueError("The target_model should be either 'DGPRegression'" "or 'GpyRegression'") Xs = np.linspace(*bounds[0], x_counts) samples = posterior(Xs, size=S) samples = samples[:, :, 0] ydif = (max(m.Y) - min(m.Y)) * 0.15 levels = np.linspace(min(m.Y) - ydif, max(m.Y) + ydif, 1000) ax = plt.gca() # ax.set_ylim(min(levels), max(levels)) # ax.set_ylim(min(levels), 1.0) # ax.set_xlim(min(Xs), max(Xs)) plt.xticks(np.arange(0, 100, step=10)) plt.xlabel(r"$\theta$") plt.ylabel(r"$d(x_\theta, x_{obs})$") if kde == True: cs = np.zeros((len(Xs), len(levels))) for i, Ss in enumerate(samples.T): bandwidth = 1.06 * np.std(Ss) * len(Ss)**( -1. / 5) # Silverman's (1986) rule of thumb. kde = KernelDensity(bandwidth=float(bandwidth)) kde.fit(Ss.reshape(-1, 1)) for j, level in enumerate(levels): cs[i, j] = kde.score(np.array(level).reshape(1, 1)) ax.pcolormesh(Xs.flatten(), levels, np.exp(cs.T), cmap='Blues_r') # , alpha=0.1) if points == True: ax.scatter(m.X, m.Y, s=15, color="red", zorder=10) return
def calculateFeatures(self, distancesArray, nearestNeighborsArray, iterVector) -> dict:
    resultsDict = {}
    if "avgDistance" in self.features:
        # Calculate the average distance to the k nearest neighbors
        resultsDict["avgDistance"] = np.mean(distancesArray)
    if "maxDistance" in self.features:
        # Calculate the max distance, i.e. the distance to the k-th neighbor
        resultsDict["maxDistance"] = np.max(distancesArray)
    if "localDensity" in self.features:
        kde = KernelDensity(kernel='gaussian', bandwidth=1.0).fit(nearestNeighborsArray)
        resultsDict["localDensity"] = -1 * kde.score(iterVector)
    return resultsDict
def jackknife_bandwidths(data, bandwidths, kernel="gaussian"): """Perform jack-knife sampling over different bandwidths for KDEs for each time-series in the dataset. Parameters ---------- data: list of arrays A list of (variable length) arrays of values. The values should represent "times" of "events". bandwidths: array The possible bandwidths to try kernel: string (optional, default="gaussian") The kernel to use for the KDE. Should be accepted by sklearn's KernelDensity class. Returns ------- result: array of shape (n_bandwidths,) The total likelihood of unobserved data over all jackknife samplings and all time series in the dataset for each bandwidth. """ result = np.zeros(bandwidths.shape[0]) for j in range(bandwidths.shape[0]): kde = KernelDensity(bandwidth=bandwidths[j], kernel=kernel) for i in range(len(data)): likelihood = 0.0 for k in range(len(data[i])): if k < len(data[i]) - 1: jackknife_sample = np.hstack([data[i][:k], data[i][k + 1 :]]) else: jackknife_sample = data[i][:k] kde.fit(jackknife_sample[:, None]) likelihood += np.exp(kde.score(np.array([[data[i][k]]]))) result[j] += likelihood return result
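# A hedged usage sketch for jackknife_bandwidths() on toy "event time" data; the
# bandwidth with the largest total held-out likelihood is the one the jackknife
# prefers (the toy data below is ours, purely illustrative):
import numpy as np

rng = np.random.RandomState(0)
toy_data = [np.sort(rng.uniform(0, 10, size=n)) for n in (15, 20, 25)]
toy_bandwidths = np.linspace(0.2, 2.0, 10)
toy_scores = jackknife_bandwidths(toy_data, toy_bandwidths)
print("preferred bandwidth:", toy_bandwidths[np.argmax(toy_scores)])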
class EmpiricalDistribution1DKDE(object): def __init__(self, param_name, samples, minval=None, maxval=None, bandwidth=0.1, nbins=40): """ Minvals and maxvals should specify priors for these. Should make these required. """ self.ndim = 1 self.param_name = param_name self.bandwidth = bandwidth # code below relies on samples axes being swapped. but we # want to keep inputs the same # create a 2D KDE from which to evaluate self.kde = KernelDensity(kernel='gaussian', bandwidth=bandwidth).fit( samples.reshape((samples.size, 1))) if minval is None: # msg = "minvals for KDE empirical distribution were not supplied. Resulting distribution may not have support over full prior" # logger.warning(msg) # widen these to add support minval = min(samples) maxval = max(samples) # significantly faster probability estimation using interpolation # instead of evaluating KDE every time self.minval = minval self.maxval = maxval xvals = np.linspace(minval, maxval, num=nbins) self._Nbins = nbins scores = np.array( [self.kde.score(np.atleast_2d(xval)) for xval in xvals]) # interpolate within prior self._logpdf = interp1d(xvals, scores, kind='linear', fill_value=-1000) def draw(self): params = self.kde.sample(1).T return params.squeeze()
from data import importdata
import numpy as np
from sklearn.neighbors import NearestNeighbors, KernelDensity

dataset = ['abalone16_29', 'balance_scale', 'breast_cancer', 'car', 'cmc',
           'ecoli', 'glass', 'haberman', 'heart_cleveland', 'hepatitis',
           'new_thyroid', 'postoperative', 'solar_flare', 'transfusion',
           'vehicle', 'yeastME3', 'bupa', 'german', 'horse_colic',
           'ionosphere', 'seeds', 'vertebal']

for data in dataset:
    db = getattr(importdata, 'load_' + data)()
    print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
    print('Dataset: %s' % data)
    metrics = ['minkowski']
    for metric in metrics:
        kde = KernelDensity(kernel='epanechnikov')
        # KernelDensity is unsupervised: fit on the features only.
        kde.fit(db.data)
        miniority_ind = np.where(db.target == 1)
        miniority_data = db.data[miniority_ind]
        miniority_target = db.target[miniority_ind]
        for d in miniority_data:
            # score() expects a 2D array, so reshape the single sample.
            print(kde.score(d.reshape(1, -1)))
print(result)

# Non-parametric regression
# Nearest neighbours with 5 neighbours
neigh = NearestNeighbors(n_neighbors=5)
neigh.fit(df3, y)

# K-nearest-neighbours regression with 5 neighbours
kneigh = KNeighborsRegressor(n_neighbors=5)
kneigh.fit(df3, y)
distances, indices = kneigh.kneighbors(df3)
print("Estimated value for selected features:",
      kneigh.predict(input_validation[Columnlist]))

# Kernel density
kde = KernelDensity(kernel='gaussian', bandwidth=0.75).fit(df3)
# score() only takes the feature matrix; it returns the total log-likelihood.
kdescore = kde.score(df3)
print("Kernel score:", kdescore)

## Write prediction to file
test = test.fillna(0)
test = pd.DataFrame(test)
test.columns = [
    'Tweet Id', 'User Name', 'Favs', 'RTs', 'Followers', 'Following',
    'Listed', 'likes', 'tweets', 'reply', 'URLs', 'Tweet content'
]

# add word counts of tweet
test['Tweet content'] = test['Tweet content'].astype(str)
test['Word Count'] = test['Tweet content'].str.split().str.len()
test = test[Columnlist]
# here we add the result column
n_jobs : int regarding the number of jobs to run in parallel. Default = maximum number of jobs. Returns ---------- F1_CAPe : dict containing for each key multiple of k (k, 2*k, 3*k,...,ntimes*k) the array [tn,fp,fn,tp] obtained with the estimate of the prior in such case. class_priors : array of shape (ntimes,) containing every k new labels the estimate of the class prior. """ n = np.shape(X_train)[0] tmp_cont = 0.1 query_list = [] labeled_ex = np.zeros(n, dtype=np.int) ker = KernelDensity().fit(X_train) dmu = [np.exp(ker.score(X_train[i:i + 1])) for i in range(n)] mean_prob_term = math.log(np.mean(dmu), 10) #Take the log density F1_CAPe = {} class_priors = np.zeros(ntimes, dtype=float) for j in range(ntimes): prior, labeled_ex, query_list = CAPe(X_train, labeled_ex, query_list, k, real_anomalies, tmp_cont, mean_prob_term, case, n_jobs) class_priors[j] = prior tmp_cont = 1 - min(prior, 0.9999) #update the contamination factor F1_CAPe[int(k * (j + 1))] = get_tnfpfntp(
driverID=2 tripInd=2 driverDir = '/home/user1/Desktop/SharedFolder/Kaggle/DriversCleaned/'+str(driverID) df = pd.read_csv(driverDir+'_' + str(tripInd)+'.csv') trip = Trip(driverID,tripInd,df) trip.getSpeed() trip.getAcc() #trip.getRadius() #trip.getCacc() trip.getFeatures() X=trip.features[['v','acc']] probas = np.zeros(X.shape[0]) for i in range(X.shape[0]): probas[i]=clf.score(X.loc[i]) # <codecell> probas.mean() # <codecell> sns.jointplot(X.v,X.acc,kind = "scatter",size=6,ratio=5,marginal_kws={'bins':30}) #sns.kdeplot(X[['cacc','acc']]) # <codecell> xN = np.asanyarray(X[['cacc','acc']]) # <codecell>
def evaluate_generator(self, test_data, iteration=None): is_load_weights = iteration is not None if is_load_weights: self.load_weights(iteration) test_data = test_data[:] if type(test_data['poses']) == torch.Tensor: test_data['poses'] = test_data['poses'].numpy() test_data['konf_obsts'] = test_data['konf_obsts'].numpy() test_data['actions'] = test_data['actions'].numpy() poses = torch.from_numpy(test_data['poses']).float().to(self.device) konf_obsts = torch.from_numpy(test_data['konf_obsts']).float().to( self.device) n_data = len(poses) n_smpls_per_state = 100 smpls = [] print "Making samples..." stime = time.time() for i in range(n_smpls_per_state): if self.architecture == 'gnn': noise = torch.randn(n_data, self.n_dim_actions).to(self.device) new_smpls1 = self.generator(konf_obsts[:500], poses[:500], noise[:500]) new_smpls2 = self.generator(konf_obsts[500:], poses[500:], noise[500:]) new_smpls = torch.cat([new_smpls1, new_smpls2], dim=0) else: noise = torch.randn(n_data, self.n_dim_actions).to(self.device) new_smpls = self.generator(konf_obsts, poses, noise) smpls.append(new_smpls.cpu().detach().numpy()) print "Sample making time", time.time() - stime smpls = np.stack(smpls) real_actions = test_data['actions'] real_actions, real_mean, real_std = self.normalize_data(real_actions) real_data_scores = [] entropies = [] min_mses = [] for idx in range(n_data): smpls_from_state = smpls[:, idx, :] smpls_from_state, _, _ = self.normalize_data( smpls_from_state, real_mean, real_std) real_action = real_actions[idx].reshape(-1, self.n_dim_actions) unnormalized_real_action = real_action * real_std + real_mean unnormalized_smpls_from_state = smpls_from_state * real_std + real_mean min_mse = self.measure_min_mse_between_samples_and_point( unnormalized_real_action, unnormalized_smpls_from_state) min_mses.append(min_mse) # fit the KDE - how likely is the real action come from the learend distribution of smpls_from_state generated_model = KernelDensity( kernel='gaussian', bandwidth=0.1).fit(smpls_from_state) real_data_scores.append(generated_model.score(real_action)) # measure the entropy if 'pick' in self.action_type: base_angles = unnormalized_smpls_from_state[:, 4:6] H, _, _ = np.histogram2d(base_angles[:, 0], base_angles[:, 1], bins=10, range=self.domain[:, 4:6].transpose()) else: place_x, place_y = unnormalized_smpls_from_state[:, 0], unnormalized_smpls_from_state[:, 1] encoded_theta = unnormalized_smpls_from_state[:, 1:] # H_theta, _, _ = np.histogram2d(encoded_theta[:, 0], encoded_theta[:, 1], bins=10, range=self.domain[:, 2:].transpose()) H, _, _ = np.histogram2d(place_x, place_y, bins=10, range=self.domain[:, 0:2].transpose()) # I think the angle entropy is more important # For a given x,y, what is the entropy on the angles? I think entropy of angles # has more to say, because this is what we should get accurately. all_smpls_out_of_range = np.sum(H) == 0 if all_smpls_out_of_range: entropy = np.inf else: prob = H / np.sum(H) entropy = sp.stats.entropy(prob.flatten()) entropies.append(entropy) return np.mean(min_mses), np.mean(real_data_scores), np.mean(entropies)
def make2D_KDE(X, n_samp = 1e5, bandwidth = None, n_folds = 3, bw_train_size = 1000, bw_range_size = 20, doplot = True): """ Make a 2D Kernel Density Estimation and draw a n_samp number of samples from it best bandwidth obtained from previous runs bandwidth = 0.0546938775510204 bandwidth = 0.05894736842105264 Input: X (2D numpy array): the training data, consisting of the Y - J and J - H colours.\n n_samp (int): the number of samples to draw from the KDE. Default = 100000.\n bandwidth (float): the bandwidth to use for the KDE from which the samples will be drawn. Set to None to let the script find the best bandwidth. Default = None.\n n_folds (int): the number of folds to use when determining the bandwidth.\n bw_train_size (int): size of the training set that will be used to determine the best bandwidth. Default = 1000.\n bw_range_size (int); the amount of bandwidths to try out in the interval 0.04 to 0.1. Default = 20.\n doplot (boolean): whether to make a hex-bin plot of the drawn samples or not. Default = True. Output: samples (2D numpy array): the samples drawn from the KDE. """ import matplotlib.pyplot as plt import seaborn as sns from sklearn.neighbors import KernelDensity from sklearn.model_selection import KFold from matplotlib import rcParams rcParams['font.family'] = 'Latin Modern Roman' from matplotlib.colors import LogNorm #shuffle the data np.random.shuffle(X) #determine the best bandwidth if it is not provided if bandwidth == None: #first we find the optimum bandwidth kf = KFold(n_splits = n_folds) #range of bandwidths to try bwrange = np.linspace(0.02, 0.08, bw_range_size) #the array which will store the likelyhood likelyhood = np.zeros(len(bwrange)) print('Finding the best bandwidth...') for bw, i in zip(bwrange, np.arange(len(bwrange))): print('Iteration {0}, bandwidth {1}'.format(i, bw)) lh = [] #split the data into a train and test set using only the first 1000 samples for train_i, test_i in kf.split(X[:,:bw_train_size]): Xtrain, Xtest = X[train_i], X[test_i] kde = KernelDensity(bandwidth = bw, kernel = 'gaussian').fit(Xtrain) lhscore = kde.score(Xtest) lh = np.append(lh, lhscore) print('Bandwidth: {0}, score: {1}'.format(bw, lhscore)) likelyhood[i] = np.mean(lh) plt.plot(bwrange, likelyhood) plt.xlabel('Bandwidth') plt.ylabel('Likelyhood') plt.title('KDE likelyhood for different bandwidths') plt.savefig('2D_KDE_likelyhood_run4.png', dpi = 300) plt.close() #find the bandwidth which gave the highest likelyhood bandwidth = bwrange[np.argmax(likelyhood)] print('Best bandwidth: {0}'.format(bandwidth)) kde = KernelDensity(bandwidth = bandwidth, kernel = 'gaussian').fit(X) #pull samples from the kde samples = kde.sample(int(n_samp)) #plot the samples in a hexbin plot if doplot: plt.hexbin(samples[:, 0], samples[:, 1], bins = 'log', cmap = 'Reds') plt.colorbar(label = 'Density of samples [logarithmic]') plt.xlabel('Y - J') plt.ylabel('J - H') plt.title('Distribution of samples in (Y-J, J-H) colour space') plt.savefig('Samples_distribution_hex.pdf', dpi = 300) plt.show() return samples
class KDE(): """Kernel density estimation (KDE) for accurate local density estimation. This is achieved by using maximum-likelihood estimation of the generative kernel density model which is regularized using cross-validation. Parameters ---------- bandwidth: float, optional bandwidth for the kernel density estimation. If not specified, will be determined automatically using maximum likelihood on a test-set. nh_size: int, optional number of points in a typical neighborhood... only relevant for evaluating a crude estimate of the bandwidth. If run in combination with t-SNE, should be on the order of the perplexity. xtol,atol,rtol: float, optional precision parameters for kernel density estimates and bandwidth optimization determination. test_ratio_size: float, optional ratio of the test size for determining the bandwidth. """ def __init__(self, bandwidth=None, test_ratio_size=0.1, xtol=0.01, atol=0.000005, rtol=0.00005, extreme_dist=False, nn_dist=None): self.bandwidth = bandwidth self.test_ratio_size = test_ratio_size self.xtol = xtol self.atol = atol self.rtol = rtol self.extreme_dist = extreme_dist self.nn_dist = nn_dist def fit(self, X): """Fit kernel model to X""" if self.bandwidth is None: self.bandwidth = self.find_optimal_bandwidth(X) else: self.kde = KernelDensity(bandwidth=self.bandwidth, algorithm='kd_tree', kernel='gaussian', metric='euclidean', atol=self.atol, rtol=self.rtol, breadth_first=True, leaf_size=40) self.kde.fit(X) return self def evaluate_density(self, X): """Given an array of data, computes the local density of every point using kernel density estimation Input ------ Data X : array, shape(n_sample,n_feature) Return ------ Log of densities for every point: array, shape(n_sample) Return: kde.score_samples(X) """ return self.kde.score_samples(X) def bandwidth_estimate(self, X): """Gives a rough estimate of the optimal bandwidth (based on the notion of some effective neigborhood) Return --------- bandwidth estimate, minimum possible value : tuple, shape(2) """ if self.nn_dist is None: nn = NearestNeighbors(n_neighbors=2, algorithm='kd_tree') nn.fit(X) nn_dist, _ = nn.kneighbors(X, n_neighbors=2, return_distance=True) else: nn_dist = self.nn_dist h_min = np.mean(nn_dist[:, 1]) h_max = 5 * h_min # heuristic bound !! careful !! return h_max, h_min def find_optimal_bandwidth(self, X): """Performs maximum likelihood estimation on a test set of the density model fitted on a training set """ from scipy.optimize import fminbound hest, hmin = self.bandwidth_estimate(X) print("[kde] Minimum bound = %.4f \t Rough estimate of h = %.4f" % (hmin, hest)) X_train, X_test = train_test_split(X, test_size=self.test_ratio_size) args = (X_train, X_test) # We are trying to find reasonable tight bounds (hmin,1.5*hest) to bracket the error function minima if self.xtol > hmin: tmp = round_float(hmin) print( '[kde] Bandwidth tolerance (xtol) greater than minimum bound, adjusting xtol: %.5f -> %.5f' % (self.xtol, tmp)) self.xtol = tmp h_optimal, score_opt, _, niter = fminbound( self.log_likelihood_test_set, hmin, 1.5 * hest, args, maxfun=100, xtol=self.xtol, full_output=True) print("[kde] Found log-likelihood minima in %i evaluations, h = %.5f" % (niter, h_optimal)) if self.extreme_dist is False: # in the case of distribution with extreme variances in density, these bounds will fail ... 
assert abs(h_optimal - 1.5 * hest) > 1e-4, "Upper boundary reached for bandwidth" assert abs(h_optimal - hmin) > 1e-4, "Lower boundary reached for bandwidth" return h_optimal ''' def find_nh_size(self, X, h_optimal = None, n_estimate = 100): """ Given the optimal bandwidth from the CV score, finds the nh_size (using a binary search) which yield h_opt according to the formula np.median(dist_to_nth_neighor) = h_opt """ if h_optimal is None: h_optimal = self.bandwidth # should trigger a bug if this is not defined ! nn = NearestNeighbors(n_neighbors = n_estimate, algorithm='kd_tree').fit(X) nn_dist, _ = self.nbrs.kneighbors(X, n_neighbors = 3*n_estimate) max_n = 3*n_estimate min_n = 0 n_var = n_estimate while True: # performs binary search until convergence ! h_est = np.median(nn_dist[:,n_var]) print(n_var,'\t', h_est) if h_est > h_optimal: max_n = n_var change = round(0.5*(max_n - min_n))+min_n if change != n_var: n_var = change else: break else: min_n = n_var change = round(0.5*(max_n - min_n))+min_n if change != n_var: n_var = change else: break return n_var ''' def log_likelihood_test_set(self, bandwidth, X_train, X_test): """Fit the kde model on the training set given some bandwidth and evaluates the log-likelihood of the test set """ self.kde = KernelDensity(bandwidth=bandwidth, algorithm='kd_tree', atol=self.atol, rtol=self.rtol, leaf_size=40) self.kde.fit(X_train) return -self.kde.score(X_test)
def apply(self): # [WUMBO] - Weighted UID-filtered Multi-Metric Based Outlier detection ############################################################################## # Wumbo is an anomaly detector designed for large datasets with pockets of # common groups. # # Part of the output of Wumbo is the Outlier Score [0,1], the other part # is the weight, or Risk Score, a number that represents "Outlierness" # of the node. # # The algorithm requires no hyperparameters to be chosen except # alpha. # # There are some optional arguments to select starting with filtering out similar # identities/uid's so that one identity/uid can't cluster with itself to poison # the density values. # # Additionally, the default measurements are kernel density, average distance, # and max distance from some k-Nearest Neighbors where k is already # calculated by 5 <= sqrt(N) <= 50. This helps scale the dataset to a large # number of data points with common behavioral characteristics relative to the # size of the total, larger dataset. # # These metrics are then combined and evaluated against the entire dataset # to find outliers and Score the weighted "Risk." ############################################################################## # Initialize a temporary and return dataframe temp_df = self.dataframe.copy(deep=True) results_df = pd.DataFrame() # Calculate number of k-Neighbors ############################################################################## # This is a number that is equal to the square root of distinct count of UID's # with a minimum of 5 and a maximum of 50. (Concept comes from t-SNE paper) # This way the number of k-Neighbors scales with the size of the data. ############################################################################## kNeighbors = min(max(int(len(self.dataframe[self.uidColumnName].unique()) ** 0.5),5),50) numberOfRows = len(self.dataframe.index) # Initialize feature columns if "avgDistance" in self.features: results_df["avgDistance"] = 0 if "maxDistance" in self.features: results_df["maxDistance"] = 0 if "localDensity" in self.features: results_df["localDensity"] = 0 # Iterate through dataframe for x in range(numberOfRows): # Identify current UID Value iterVector: np.array # Filter (or not) if self.filter==True: temp_df = self.dataframe.copy(deep=True) currentUID = temp_df.loc[x,self.uidColumnName] # Filter out UID's and convert to numpy iterVector = temp_df.loc[x].drop(self.uidColumnName).reset_index(drop=True).to_numpy().reshape(1,-1) neighbors = NearestNeighbors(n_neighbors=kNeighbors) neighbors.fit(temp_df.loc[x | temp_df[self.uidColumnName]!=currentUID].drop([self.uidColumnName], axis=1).to_numpy()) distancesArray, ind = neighbors.kneighbors(iterVector, return_distance=True) nearestNeighborsArray = temp_df[temp_df.index.isin(ind[0])].drop(labels=self.uidColumnName,axis=1).to_numpy() else: currentUID = temp_df.loc[x,self.uidColumnName] #print(len(temp_df)) # Find nearest neighbors iterVector = temp_df.loc[x].drop(self.uidColumnName).reset_index(drop=True).to_numpy().reshape(1,-1) neighbors = NearestNeighbors(n_neighbors=kNeighbors) neighbors.fit(temp_df.drop([self.uidColumnName], axis=1).to_numpy()) distancesArray, ind = neighbors.kneighbors(iterVector, return_distance=True) nearestNeighborsArray = temp_df[temp_df.index.isin(ind[0])].drop(labels=self.uidColumnName,axis=1).to_numpy() # Calculate Features (for layer 1) based on Feature Values for Model resultsDict = self.calculateFeatures(distancesArray=distancesArray, nearestNeighborsArray=nearestNeighborsArray, 
iterVector=iterVector) resultsDict[self.uidColumnName] = currentUID results_df = results_df.append(resultsDict, ignore_index=True) output_df = self.dataframe.copy(deep=True) kde = KernelDensity(kernel='gaussian', bandwidth=1.0).fit(results_df[self.features].to_numpy()) for x in range(numberOfRows): iterRow = results_df.loc[x].drop("uid").reset_index(drop=True).to_numpy().reshape(1,-1) output_df.loc[x, "Risk Score"] = 1/(numberOfRows * 10 ** (kde.score(iterRow))) if 1/((numberOfRows * 10 ** (kde.score(iterRow)))) > (1/self.alpha): output_df.loc[x, "Outlier"] = 1 else: output_df.loc[x, "Outlier"] = 0 return output_df
zero_test = test_imgs[test_labels == 1, :] one_test = test_imgs[test_labels == 0, :] #counting white pixels train_count_zero = np.sum(zero_train > 25, axis=1) train_count_one = np.sum(one_train > 25, axis=1) test_count_zero = np.sum(zero_test > 25, axis=1) test_count_one = np.sum(one_test > 25, axis=1) kde1 = KernelDensity(kernel='gaussian', bandwidth=0.2).fit(train_count_one.reshape(-1, 1)) kde0 = KernelDensity(kernel='gaussian', bandwidth=0.2).fit(train_count_zero.reshape(-1, 1)) onescores_one = np.array( [kde1.score(i.reshape(-1, 1)) for i in test_count_one]) onescores_zero = np.array( [kde0.score(i.reshape(-1, 1)) for i in test_count_one]) zeroscores_one = np.array( [kde1.score(i.reshape(-1, 1)) for i in test_count_zero]) zeroscores_zero = np.array( [kde0.score(i.reshape(-1, 1)) for i in test_count_zero]) plt.subplot(3, 1, 1) plt.hist(train_count_zero, 100) plt.title('Histogram of Pixel Count for Digit 0') plt.subplot(3, 1, 2) plt.hist(train_count_one, 100) plt.title('Histogram of Pixel Count for Digit 1') plt.subplot(3, 1, 3) sns.distplot(train_count_zero)
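# The list comprehensions above call kde.score() once per test value; since
# score_samples() returns the per-point log-densities in a single call, the same
# arrays can be computed without the Python loop. A sketch of the equivalence
# (the *_fast name is ours):
import numpy as np

onescores_one_fast = kde1.score_samples(test_count_one.reshape(-1, 1))
assert np.allclose(onescores_one_fast, onescores_one)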
def optimize_bd(dfGenome, dfPos, dfGene, outpath): "Bandwidth optimization by fitting the density to positive set" dfPos['mid'] = ((dfPos['end'] - dfPos['start']) / 2) + dfPos['start'] chrs = list(dfGenome.chrom.unique()) bdlist = list(np.linspace(1000, 1000000, 1000)) sc = np.array([0.0] * (len(bdlist) + 1)) for chrname in chrs: chrlen = int(dfGenome[dfGenome.chrom == chrname].length) N = dfPos[dfPos.chrom == chrname].shape[0] dfchr = dfGene[dfGene.chrom == chrname] dfPosChr = dfPos[dfPos.chrom == chrname] Xp = np.array(list(dfPosChr['mid']))[:, np.newaxis] X = np.array(list(dfchr['mid']))[:, np.newaxis] ## estimate the density at each 1000 bp X_plot = np.linspace(0, chrlen, int(chrlen / 1000))[:, np.newaxis] b = np.array([[0, 0]]) print("optimization for", chrname) for bd in bdlist: kde = KernelDensity(kernel='gaussian', bandwidth=bd).fit(X) a = np.c_[bd, kde.score(Xp)] b = np.r_[b, a] sc[:] = sc[:] + b[:, 1] end = np.c_[bdlist, list(sc[1:, ])] idxrow = np.argwhere(end == max(end[:, 1]))[0, 0] newbd = int(end[idxrow, 0]) print("the bandwith is", newbd) #plt.plot(bdlist, list(sc[1:,])) #plt.title("genome") #plt.xlabel("bandwidth (bp)") #plt.ylabel("log score of positive set") #plt.savefig(path + 'gene_density_optimization.png') #plt.close() dfout = pd.DataFrame({'A': bdlist, 'B': sc[1:, ]}) dfout.to_csv(path_or_buf=outpath + "bandwidth_trials.txt", sep='\t', header=False, index=False) return newbd
class KDE: """Kernel density estimation (KDE) for accurate local density estimation. This is achieved by using maximum-likelihood estimation of the generative kernel density model which is regularized using cross-validation. Parameters ---------- bandwidth: float, optional bandwidth for the kernel density estimation. If not specified, will be determined automatically using maximum likelihood on a test-set. nh_size : int, optional (default = 'auto') number of points in a typical neighborhood... only relevant for evaluating a crude estimate of the bandwidth.'auto' means that the nh_size is scaled with number of samples. We use nh_size = 100 for 10000 samples. The minimum neighborhood size is set to 4. test_ratio_size: float, optional (default = 0.8) Ratio size of the test set used when performing maximum likehood estimation. In order to have smooth density estimations (prevent overfitting), it is recommended to use a large test_ratio_size (closer to 1.0) rather than a small one. atol: float, optional (default = 0.000005) kernel density estimate precision parameter. determines the precision used for kde. smaller values leads to slower execution but better precision rtol: float, optional (default = 0.00005) kernel density estimate precision parameter. determines the precision used for kde. smaller values leads to slower execution but better precision xtol: float, optional (default = 0.01) precision parameter for optimizing the bandwidth using maximum likelihood on a test set test_ratio_size: float, optional ratio of the test size for determining the bandwidth. kernel: str, optional (default='gaussian') Type of Kernel to use for density estimates. Other options are {'epanechnikov'|'linear','tophat'}. """ def __init__(self, nh_size='auto', bandwidth=None, test_ratio_size=0.1, xtol=0.01, atol=0.000005, rtol=0.00005, extreme_dist=False, nn_dist=None, kernel='gaussian'): self.bandwidth = bandwidth self.nh_size = nh_size self.test_ratio_size = test_ratio_size self.xtol = xtol self.atol = atol self.rtol = rtol self.extreme_dist = extreme_dist self.nn_dist = nn_dist self.kernel = kernel # epanechnikov other option def fit(self, X): """Fit kernel model to X""" if self.nh_size is 'auto': self.nh_size = max([int(25 * np.log10(X.shape[0])), 4]) if X.shape[1] > 8: print( 'Careful, you are trying to do density estimation for data in a D > 8 dimensional space\n ... you are warned !' 
) if self.bandwidth is None: self.bandwidth = self.find_optimal_bandwidth(X) else: self.kde = KernelDensity(bandwidth=self.bandwidth, algorithm='kd_tree', kernel=self.kernel, metric='euclidean', atol=self.atol, rtol=self.rtol, breadth_first=True, leaf_size=40) self.kde.fit(X) return self def evaluate_density(self, X): """Given an array of data, computes the local density of every point using kernel density estimation Input ------ Data X : array, shape(n_sample,n_feature) Return ------ Log of densities for every point: array, shape(n_sample) Return: kde.score_samples(X) """ return self.kde.score_samples(X) def bandwidth_estimate(self, X_train, X_test): """Gives a rough estimate of the optimal bandwidth (based on the notion of some effective neigborhood) Return --------- bandwidth estimate, minimum possible value : tuple, shape(2) """ if self.nn_dist is None: nn = NearestNeighbors(n_neighbors=self.nh_size, algorithm='kd_tree') nn.fit(X_train) nn_dist, _ = nn.kneighbors(X_test, n_neighbors=self.nh_size, return_distance=True) else: nn_dist = self.nn_dist dim = X_train.shape[1] # Computation of minimum bound # This can be computed by taking the limit h -> 0 and making a saddle-point approx. mean_nn2_dist = np.mean(nn_dist[:, 1] * nn_dist[:, 1]) h_min = np.sqrt(mean_nn2_dist / dim) idx_1 = np.random.choice(np.arange(len(X_train)), size=min([1000, len(X_train)]), replace=False) idx_2 = np.random.choice(np.arange(len(X_test)), size=min([1000, len(X_test)]), replace=False) max_size = min([len(idx_1), len(idx_2)]) tmp = np.linalg.norm(X_train[idx_1[:max_size]] - X_test[idx_2[:max_size]], axis=1) h_max = np.sqrt(np.mean(tmp * tmp) / dim) h_est = 10 * h_min return h_est, h_min, h_max def find_optimal_bandwidth(self, X): """Performs maximum likelihood estimation on a test set of the density model fitted on a training set """ from scipy.optimize import fminbound X_train, X_test = train_test_split(X, test_size=self.test_ratio_size) args = (X_test, ) hest, hmin, hmax = self.bandwidth_estimate(X_train, X_test) print( "[kde] Minimum bound = %.4f \t Rough estimate of h = %.4f \t Maximum bound = %.4f" % (hmin, hest, hmax)) # We are trying to find reasonable tight bounds (hmin, 4.0*hest) to bracket the error function minima # Would be nice to have some hard accurate bounds self.xtol = round_float(hmin) print( '[kde] Bandwidth tolerance (xtol) set to precision of minimum bound : %.5f ' % self.xtol) self.kde = KernelDensity(algorithm='kd_tree', atol=self.atol, rtol=self.rtol, leaf_size=40, kernel=self.kernel) self.kde.fit(X_train) # hmax is the upper bound, however, heuristically it appears to always be way above the actual bandwidth. hmax*0.2 seems much better but still conservative h_optimal, score_opt, _, niter = fminbound( self.log_likelihood_test_set, hmin, hmax * 0.2, args, maxfun=100, xtol=self.xtol, full_output=True) print( "[kde] Found log-likelihood maximum in %i evaluations, h = %.5f" % (niter, h_optimal)) if self.extreme_dist is False: # These bounds should always be satisfied ... 
assert abs(h_optimal - hmax) > 1e-4, "Upper boundary reached for bandwidth" assert abs(h_optimal - hmin) > 1e-4, "Lower boundary reached for bandwidth" return h_optimal # @profile def log_likelihood_test_set(self, bandwidth, X_test): """Fit the kde model on the training set given some bandwidth and evaluates the negative log-likelihood of the test set """ self.kde.bandwidth = bandwidth # l_test = len(X_test) return -self.kde.score( X_test[:2000] ) # X_test[np.random.choice(np.arange(0, l_test), size=min([int(0.5*l_test), 1000]), replace=False)]) # this should be accurate enough !
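# The classes above wrap a lot of machinery around one idea: choose the bandwidth
# that maximizes the held-out log-likelihood with a bounded 1-D optimizer. A
# minimal self-contained sketch of that idea (the name ml_bandwidth and the
# default bounds are ours, not taken from the class above):
import numpy as np
from scipy.optimize import fminbound
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KernelDensity

def ml_bandwidth(X, h_lo=1e-3, h_hi=5.0, test_size=0.1, random_state=0):
    """Maximum-likelihood bandwidth on a held-out split, via bounded search."""
    X_train, X_test = train_test_split(X, test_size=test_size,
                                       random_state=random_state)
    neg_ll = lambda h: -KernelDensity(bandwidth=h).fit(X_train).score(X_test)
    return fminbound(neg_ll, h_lo, h_hi, xtol=1e-3)

# Example: ml_bandwidth(np.random.RandomState(0).randn(500, 1))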
dist = np.sqrt( np.sum(np.square(y_reconstructed - test_latents).reshape( len(test_latents), -1), axis=1)) sns.distplot(dist) pred_save(dist, PRED_FOLDER + 'prediction_unet_vae_pca_reconstruced.csv') # %% from sklearn.manifold import TSNE import matplotlib.pyplot as plt print('TSNE fitting...') tsne = TSNE(n_components=2, random_state=SEED, verbose=True) y_TSNE = tsne.fit_transform(test_latents) plt.scatter(y_TSNE[:, 0], y_TSNE[:, 1], s=1) rmse_tsne_test = np.sqrt( np.square(y_TSNE[:, 0] - np.mean(y_TSNE[:, 0])) + np.square(y_TSNE[:, 1] - np.mean(y_TSNE[:, 1]))) sns.distplot(rmse_tsne_test) pred_save(rmse_tsne_test, PRED_FOLDER + 'prediction_unet_vae_tsne_rmse.csv') # %% from sklearn.neighbors import KernelDensity kd = KernelDensity() kd.fit(test_latents) score = [kd.score(i.reshape(1, -1)) for i in test_latents] score = score - np.min(score) sns.distplot(score) pred_save(score, PRED_FOLDER + 'prediction_unet_vae_latentkd.csv') # %%
def getKDE(userJson):
    # arrayFilePath = "../data/test2.txt"
    # vector = np.loadtxt(arrayFilePath, dtype=np.float32)
    #
    # X_row = np.size(vector, 0)  # number of elements along axis 0 of X
    # X_col = np.size(vector, 1)  # number of elements along axis 1 of X
    #
    # # compute the distance between every pair of high-dimensional vectors
    # dis = []
    # for i in range(X_row):
    #     vec1 = vector[i]
    #     for j in range(i + 1, X_row):
    #         vec2 = vector[j]
    #         dis_c = np.sqrt(np.sum(np.square(vec1 - vec2)))
    #         dis.append([dis_c])

    words = userJson["words"][0:100]  # only use the user's first 100 queries
    similarValueList = []
    for index in range(len(words)):
        pair = words[index]
        for key, value in pair.items():
            sentence = key       # the whole sentence
            wordList1 = value    # list with one word per entry
        # compare all words pairwise
        for index in range(index + 1, len(words)):
            pair2 = words[index]
            for key, value in pair2.items():
                sentence2 = key
                wordList2 = value
            # compare the similarity of the two short sentences: split them up and
            # compare word by word
            # similarValue = model.similarity(sentence, sentence2)
            # compare every word of one sentence with every word of the other and
            # take a weighted average of the k most similar values
            similar_K = []
            minlen = min(len(wordList1), len(wordList2))  # use the smaller length
            # the complexity may be too high
            for word1 in wordList1:
                for word2 in wordList2:
                    curSimilar = model.wv.similarity(word1, word2)
                    similar_K.append(curSimilar)
            # take the minlen largest similarities, sum and average them
            similar_K.sort()
            similar_K.reverse()
            similarValue = sum(similar_K[0:minlen]) / minlen
            #
            similarValueList.append(similarValue)

    global maxValue
    global minValue
    maxValue = np.max(similarValueList)
    minValue = np.min(similarValueList)
    # normalise to 0-1
    # dis3 = MaxMinNormalization(similarValueList, maxValue, minValue)
    dis3 = similarValueList
    # print(dis)
    print(dis3)
    print(len(similarValueList))

    # standard deviation
    stdValue = np.std(dis3)

    # -----------------------------------------------------------
    X = []
    # turn the 1-D list into a 2-D array
    for item in dis3:
        X.append([item])

    N = len(dis3)
    maxValue3 = np.max(dis3)
    minValue3 = np.min(dis3)
    # create an evenly spaced sequence of N numbers to use as the x axis
    X_plot = np.linspace(minValue3 - 1, maxValue3 + 1, N)[:, np.newaxis]

    # true density
    fig, ax = plt.subplots()
    # a reasonable bandwidth needs to be chosen here;
    # bandwidth is roughly 1 / N**0.2 * stdValue
    bandwidth = 1 / pow(N, 0.2) * stdValue
    print("bandwidth,N", bandwidth, N)
    # for kernel in ['gaussian', 'tophat', 'epanechnikov']:
    for kernel in ['gaussian']:
        kde = KernelDensity(kernel=kernel, bandwidth=bandwidth).fit(X)  # bandwidth=0.008
        log_dens = kde.score_samples(X_plot)
        exp_dens = np.exp(log_dens)
        ax.plot(X_plot[:, 0], np.exp(log_dens), '-',
                label="kernel = '{0}'".format(kernel))

    ax.text(6, 0.38, "N={0} points".format(N))
    ax.legend(loc='upper left')
    # ax.plot(X[:, 0], -0.005 - 0.01 * np.random.random(X.shape[0]), '+k')
    ax.set_xlim(minValue3, maxValue3)
    ax.set_ylim(-0.02, 10)
    plt.show()

    density = np.exp(kde.score([[0.5]]))

    # personalised access probability
    # p = 1/N * sum(kde.score)
    # multiplying the density by the bandwidth only approximates the probability;
    # strictly this should be an integral
    probability = density * bandwidth
    print(probability)

    return kde
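# The snippet above approximates a probability as density * bandwidth and notes
# itself that an integral would be more correct. For a Gaussian kernel the
# integral over an interval has a closed form: the average of the kernel CDFs.
# A hedged helper (the name kde_interval_prob and the example interval are ours):
import numpy as np
from scipy.stats import norm

def kde_interval_prob(train_values, bandwidth, a, b):
    """P(a <= x <= b) under a 1-D Gaussian KDE fitted to train_values."""
    train_values = np.asarray(train_values, dtype=float).ravel()
    return np.mean(norm.cdf((b - train_values) / bandwidth)
                   - norm.cdf((a - train_values) / bandwidth))

# probability = kde_interval_prob(dis3, bandwidth, 0.5 - 0.05, 0.5 + 0.05)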
print("Kernel bandwidth:") bw = np.random.uniform(1, 5) print(bw) print("Our KDE:") my_kde = TruncatedNormalKernelDensity(bandwidth=bw) my_kde.fit(x) print(my_kde.score_samples(y)) print(my_kde.score(y)) print("SciKitLearn KDE:") skl_kde = KernelDensity(kernel='gaussian', bandwidth=bw) skl_kde.fit(x) print(skl_kde.score_samples(y)) print(skl_kde.score(y)) print("Test that truncation works:") y_vals = sorted(y) up = y_vals[5] low = y_vals[2] print(f"With upperbound {up}:") up_kde = TruncatedNormalKernelDensity(bandwidth=bw, upperbound=up) up_kde.fit(x) print(up_kde.score_samples(y)) print(f"With lowerbound {low}:") low_kde = TruncatedNormalKernelDensity(bandwidth=bw, lowerbound=low) low_kde.fit(x) print(low_kde.score_samples(y))