def kde_sklearn(data, grid, **kwargs):
    """
    Kernel Density Estimation with Scikit-learn

    Parameters
    ----------
    data : numpy.array
        Data points used to compute a density estimator. It has
        `n x p` dimensions, representing n points and p variables.
    grid : numpy.array
        Data points at which the density will be estimated. It has
        `m x p` dimensions, representing m points and p variables.

    Returns
    -------
    out : numpy.array
        Density estimate. Has `m x 1` dimensions
    """
    kde_skl = KernelDensity(**kwargs)
    kde_skl.fit(data)
    # score_samples() returns the log-likelihood of the samples
    log_pdf = kde_skl.score_samples(grid)
    return np.exp(log_pdf)
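# Usage sketch for kde_sklearn above (hedged: the data here is synthetic and
# purely illustrative; assumes numpy and scikit-learn are installed).
import numpy as np
from sklearn.neighbors import KernelDensity

rng = np.random.RandomState(0)
X = rng.randn(200, 2)       # n x p data
grid = rng.randn(50, 2)     # m x p evaluation points
density = kde_sklearn(X, grid, kernel='gaussian', bandwidth=0.5)
assert density.shape == (50,)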
def pdf(self, token, years, bandwidth=5):
    """
    Estimate a density function from a token's rank series.

    Args:
        token (str)
        years (range)

    Returns:
        OrderedDict {year: density}
    """
    series = self.series(token)

    data = []
    for year, wpm in series.items():
        data += [year] * round(wpm)

    data = np.array(data)[:, np.newaxis]

    pdf = KernelDensity(bandwidth=bandwidth).fit(data)

    samples = OrderedDict()

    for year in years:
        # score() expects a 2-D array; its sum over this single sample
        # is just the log-density at `year`
        samples[year] = np.exp(pdf.score([[year]]))

    return samples
def KDE_plt(categories, inter_arrivals):
    KDEs = []
    for i in range(0, len(categories)):
        # for single inter-arrivals in a category
        X = np.asarray(extract_cat_samples(inter_arrivals, categories, i))
        # X = np_matrix(categories[i][0])  # for avg(inter-arrival)/person in a category
        kde = KernelDensity(kernel='gaussian', bandwidth=4).fit(X)
        KDEs.append(kde)  # to use for prob_return()
        max_sample = max_interarrival_mean(categories, inter_arrivals, i)
        X_plot = np.linspace(0, 1.5 * max_sample, 2000)[:, np.newaxis]
        log_dens = kde.score_samples(X_plot)
        plt.figure(i)
        plt.plot(X_plot[:, 0], np.exp(log_dens), '-',
                 label="kernel = '{0}'".format('gaussian'))
        # plt.draw()
        # plt.pause(0.001)
        # plt.title("Non-Parametric Density Estimation for category=%s Visitors" % (i))
        # alpha, from 0 (transparent) to 1 (opaque); `normed` was removed
        # from matplotlib, so use density=True
        plt.hist(combine_inner_lists(extract_cat_samples(inter_arrivals, categories, i)),
                 bins=40, density=True, color="cyan", alpha=.3, label="histogram")
        # plt.hist(np.asarray(categories[i][0]), bins=40, density=True,
        #          color="cyan", alpha=.3, label="histogram")
        plt.xlabel("inter-arrival time (days)")
        plt.ylabel("PDF")
        plt.legend()
        # dump result into kde folder
        save_as = './app/static/img/cat_result/kde/kdeplt_cat' + str(i) + '.png'
        plt.savefig(save_as)
        plt.show(block=False)
        plt.close(plt.figure(i))
    return KDEs
def EstimateDensity(self, name, df, histogram, f, s, ax):
    # if the desired output is in Histogram format
    if histogram:
        finRes = []
        lab = []
        for i in range(5):
            res = np.array(df[df[f] == i][s])
            if res.shape[0] > 0:
                finRes.append(res)
                lab.append(name[0] + ' = ' + str(i))
        pl.hist(finRes, bins=2, density=True, histtype='bar', label=lab)
    # if the desired output is simple plot
    else:
        for i in range(5):
            res = np.array(df[df[f] == i][s])
            if res.shape[0] > 0:
                res = res.reshape(res.shape[0], 1)
                X_plot = np.array(np.linspace(-1, 5, 20)).reshape(20, 1)
                kde = KernelDensity(kernel='exponential', bandwidth=0.05)
                kde.fit(res)
                log_dens = kde.score_samples(X_plot)
                ax.plot(X_plot, np.exp(log_dens),
                        label=name[0] + ' = ' + str(i))
    ax.legend()
    ax.set_title(name[1] + " distribution for changing " + name[0])
def kde(self, term, bandwidth=2000, samples=1000, kernel='gaussian'):
    """
    Estimate the kernel density of the instances of term in the text.

    Args:
        term (str): A stemmed term.
        bandwidth (int): The kernel bandwidth.
        samples (int): The number of evenly-spaced sample points.
        kernel (str): The kernel function.

    Returns:
        np.array: The density estimate.
    """
    # Get the offsets of the term instances.
    terms = np.array(self.terms[term])[:, np.newaxis]

    # Fit the density estimator on the terms.
    kde = KernelDensity(kernel=kernel, bandwidth=bandwidth).fit(terms)

    # Score an evenly-spaced array of samples.
    x_axis = np.linspace(0, len(self.tokens), samples)[:, np.newaxis]
    scores = kde.score_samples(x_axis)

    # Scale the scores to integrate to 1.
    return np.exp(scores) * (len(self.tokens) / samples)
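# Usage sketch (hedged): kde() above is a method, so `text` below stands in
# for an instance of the host class with .terms and .tokens populated, and
# the term is illustrative.
# densities = text.kde('whale', bandwidth=2000, samples=1000)
# densities.sum()  # ~1, because of the (len(tokens) / samples) rescaling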
def plot_kde_histogram2(X1, X2, f_name, bins=25):
    """
    Plot KDE-smoothed histogram of the data in X1/X2. Assume data is 1D.
    """
    import matplotlib.pyplot as plt
    # make a figure and configure an axis
    # (ax.hold() was removed in matplotlib 3.x; repeated plot calls
    # overlay by default)
    fig = plt.figure()
    ax = fig.add_subplot(111)
    for (X, style) in [(X1, '-'), (X2, '--')]:
        X_samp = X.ravel()[:, np.newaxis]
        X_min = np.min(X_samp)
        X_max = np.max(X_samp)
        X_range = X_max - X_min
        sigma = X_range / float(bins)
        plot_min = X_min - (X_range / 3.0)
        plot_max = X_max + (X_range / 3.0)
        plot_X = np.linspace(plot_min, plot_max, 1000)[:, np.newaxis]
        # make a kernel density estimator for the data in X
        kde = KernelDensity(kernel='gaussian', bandwidth=sigma).fit(X_samp)
        ax.plot(plot_X, np.exp(kde.score_samples(plot_X)), linestyle=style)
    fig.savefig(f_name, dpi=None, facecolor='w', edgecolor='w',
                orientation='portrait', format=None,
                transparent=False, bbox_inches=None, pad_inches=0.1)
    plt.close(fig)
    return
def plot_sklearn_kde(df, support, column='AirTime', bins=50):
    """
    Plots a KDE and a histogram using sklearn.KernelDensity.
    Uses Gaussian kernels.
    The optimal bandwidth is calculated according to Silverman's rule of thumb.

    Parameters
    ----------
    df: A pandas.DataFrame
    support: A 1-d numpy array.
             Input data points for the probability density function.

    Returns
    -------
    A matplotlib.axes.Axes instance.
    """
    bw = get_silverman_bandwidth(df, column)

    kde = KernelDensity(kernel='gaussian', bandwidth=bw)
    x = df[column]
    kde.fit(x.values[:, np.newaxis])
    y = kde.score_samples(support[:, np.newaxis])

    fig, ax = plt.subplots(figsize=(8, 5))
    ax.hist(np.ravel(x), bins=bins, alpha=0.5,
            color=sns.xkcd_rgb["denim blue"], density=True)
    ax.plot(support, np.exp(y))
    ax.set_xlabel(column, fontsize=14)
    ax.set_ylabel('Density', fontsize=14)
    ax.set_title('Kernel Density Plot', fontsize=14)
    sns.despine(ax=ax, offset=5, trim=True)

    return ax
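# get_silverman_bandwidth() is called above but not shown here. A minimal
# sketch, assuming it implements the Gaussian reference form of Silverman's
# rule of thumb for 1-D data (the name and signature come from the call
# site; the body is an assumption):
def get_silverman_bandwidth(df, column):
    data = df[column].values
    # rule-of-thumb bandwidth: 1.06 * sigma * n^(-1/5)
    return 1.06 * data.std() * len(data) ** (-0.2)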
def kdescatter(xs, ys, log_color=False, atol=1e-4, rtol=1e-4,
               n_jobs=1, n_samp_scaling=100, n_samp_tuning=1000, ax=None,
               **kwargs):
    if ax is None:
        import matplotlib.pyplot as plt
        ax = plt
    kwargs.setdefault('linewidths', 0)
    kwargs.setdefault('s', 20)
    kwargs.setdefault('cmap', 'winter')

    X = np.asarray([xs, ys]).T
    n = X.shape[0]
    samp_X = X[np.random.choice(n, min(n_samp_scaling, n), replace=False)]
    median_sqdist = np.median(euclidean_distances(samp_X, squared=True))
    bws = np.logspace(-2, 2, num=10) * np.sqrt(median_sqdist)
    est = GridSearchCV(KernelDensity(), {'bandwidth': bws}, n_jobs=n_jobs)
    est.fit(X[np.random.choice(n, min(n_samp_tuning, n), replace=False)])
    bw = est.best_params_['bandwidth']

    kde = KernelDensity(bandwidth=bw)
    kde.fit(X)
    densities = kde.score_samples(X)
    if not log_color:
        np.exp(densities, out=densities)
    ax.scatter(xs, ys, c=densities, **kwargs)
def draw_posterior_kld_hist(X_kld, X_vae, f_name, bins=25):
    """
    Plot KDE-smoothed histograms.
    """
    import matplotlib.pyplot as plt
    # make a figure and configure an axis
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.set_xlabel('Posterior KLd Density')
    ax.set_title('Posterior KLds: Over-regularized vs. Standard')
    for (X, style, label) in [(X_kld, '-', 'ORK'), (X_vae, '--', 'VAR')]:
        X_samp = X.ravel()[:, np.newaxis]
        X_min = np.min(X_samp)
        X_max = np.max(X_samp)
        X_range = X_max - X_min
        sigma = X_range / float(bins)
        plot_min = X_min - (X_range / 4.0)
        plot_max = X_max + (X_range / 4.0)
        plot_X = np.linspace(plot_min, plot_max, 1000)[:, np.newaxis]
        # make a kernel density estimator for the data in X
        kde = KernelDensity(kernel='gaussian', bandwidth=sigma).fit(X_samp)
        ax.plot(plot_X, np.exp(kde.score_samples(plot_X)),
                linestyle=style, label=label)
    ax.legend()
    fig.savefig(f_name, dpi=None, facecolor='w', edgecolor='w',
                orientation='portrait', format='pdf',
                transparent=False, bbox_inches=None, pad_inches=0.1)
    plt.close(fig)
    return
def test2():
    arr = np.concatenate((np.linspace(0, 10, 10),
                          np.linspace(2, 4, 10),
                          np.linspace(7, 10, 10)))[:, np.newaxis]
    kde = KernelDensity(kernel='gaussian', bandwidth=0.75).fit(arr)
    X = np.linspace(0, 10, 1000)[:, np.newaxis]
    log_dens = kde.score_samples(X)
    # note: this plots the log-density; wrap in np.exp() for the PDF itself
    plt.plot(X, log_dens)
    plt.show()
def kde_opt4(df_cell_train_feats, y_train, df_cell_test_feats):
    def prepare_feats(df):
        df_new = pd.DataFrame()
        df_new["hour"] = df["hour"]
        df_new["weekday"] = df["weekday"] + df["hour"] / 24.
        df_new["accuracy"] = df["accuracy"].apply(lambda x: np.log10(x))
        df_new["x"] = df["x"]
        df_new["y"] = df["y"]
        return df_new

    logging.info("train kde_opt4 model")
    df_cell_train_feats_kde = prepare_feats(df_cell_train_feats)
    df_cell_test_feats_kde = prepare_feats(df_cell_test_feats)
    n_class = len(np.unique(y_train))
    y_test_pred = np.zeros((len(df_cell_test_feats_kde), n_class), "d")
    for i in range(n_class):
        X = df_cell_train_feats_kde[y_train == i]
        y_test_pred_i = np.ones(len(df_cell_test_feats_kde), "d")
        for feat in df_cell_train_feats_kde.columns.values:
            X_feat = X[feat].values
            BGK10_output = kdeBGK10(X_feat)
            if BGK10_output is None:
                kde = gaussian_kde(X_feat, "scott")
                kde = gaussian_kde(X_feat, kde.factor * 0.741379)
                y_test_pred_i *= kde.evaluate(df_cell_test_feats_kde[feat].values)
            else:
                bandwidth, mesh, density = BGK10_output
                kde = KernelDensity(kernel='gaussian', metric='manhattan',
                                    bandwidth=bandwidth)
                kde.fit(X_feat[:, np.newaxis])
                y_test_pred_i *= np.exp(kde.score_samples(
                    df_cell_test_feats_kde[feat].values[:, np.newaxis]))
        y_test_pred[:, i] += y_test_pred_i
    return y_test_pred
def surface_density(c, bandwidth=0.2, grid_step=0.02):
    """
    Given particle positions as a coordinate object, compute the surface
    density using a kernel density estimate.
    """
    if not HAS_SKLEARN:
        raise ImportError("scikit-learn is required to use this function.")

    xgrid = np.arange(2., 9. + 0.1, grid_step)      # deg
    ygrid = np.arange(26.5, 33.5 + 0.1, grid_step)  # deg
    shp = (xgrid.size, ygrid.size)
    meshies = np.meshgrid(xgrid, ygrid)
    # wrap map() in list() so np.vstack works under Python 3
    grid = np.vstack(list(map(np.ravel, meshies))).T

    x = c.l.degree
    y = c.b.degree
    skypos = np.vstack((x, y)).T

    kde = KernelDensity(bandwidth=bandwidth, kernel='epanechnikov')
    kde.fit(skypos)

    dens = np.exp(kde.score_samples(grid)).reshape(meshies[0].shape)
    log_dens = np.log10(dens)

    return grid, log_dens
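# Usage sketch (hedged): `c` is read as c.l.degree / c.b.degree above, so an
# astropy Galactic SkyCoord matches the interface; the coordinates below are
# purely illustrative.
# from astropy.coordinates import SkyCoord
# import astropy.units as u
# c = SkyCoord(l=[5.0, 5.5, 6.0] * u.deg, b=[30.0, 30.2, 29.8] * u.deg,
#              frame='galactic')
# grid, log_dens = surface_density(c, bandwidth=0.2)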
def plot_kde_histogram(X, f_name, bins=25):
    """
    Plot KDE-smoothed histogram of the data in X. Assume data is univariate.
    """
    import matplotlib.pyplot as plt
    X = X.ravel()
    np.random.shuffle(X)
    X = X[0:min(X.shape[0], 1000000)]
    X_samp = X[:, np.newaxis]
    X_min = np.min(X_samp)
    X_max = np.max(X_samp)
    X_range = X_max - X_min
    sigma = X_range / float(bins)
    plot_min = X_min - (X_range / 3.0)
    plot_max = X_max + (X_range / 3.0)
    plot_X = np.linspace(plot_min, plot_max, 1000)[:, np.newaxis]
    # make a kernel density estimator for the data in X
    kde = KernelDensity(kernel='gaussian', bandwidth=sigma).fit(X_samp)
    # make a figure
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.plot(plot_X, np.exp(kde.score_samples(plot_X)))
    fig.savefig(f_name, dpi=None, facecolor='w', edgecolor='w',
                orientation='portrait', format=None,
                transparent=False, bbox_inches=None, pad_inches=0.1)
    plt.close(fig)
    return
def max_prob(df):
    df_tmp = df.copy()
    arr = []
    for ind in df_tmp.index:
        row = df_tmp.loc[ind]
        d = row.dropna().values
        # d = d.dropna()
        if len(d) == 0:
            centre = np.NaN
            arr.append(centre)
            continue
        # arr = vals.sort(axis=0)
        # df_ordered = pd.DataFrame(vals, index=df.index, columns=df.columns)
        x_grid = np.linspace(d.min(), d.max(), 50)
        x_grid = x_grid.reshape(-1, 1)
        d = d.reshape(-1, 1)
        kde = KernelDensity().fit(d)
        log_dens = kde.score_samples(x_grid)
        vals = np.exp(log_dens).round(4)
        centre = x_grid[vals.argmax()][0]
        # TODO: first element adds unnecessary decimal places
        # (use a decimal-places helper class to fix)
        centre2 = round(centre, 4)
        arr.append(centre2)
    return arr
def kde_sklearn(x, x_grid, bandwidth=0.2, **kwargs):
    """Kernel Density Estimation with Scikit-learn"""
    kde_skl = KernelDensity(bandwidth=bandwidth, **kwargs)
    kde_skl.fit(x[:, np.newaxis])
    # score_samples() returns the log-likelihood of the samples
    log_pdf = kde_skl.score_samples(x_grid[:, np.newaxis])
    return np.exp(log_pdf)
def test_kernel_density_sampling(n_samples=100, n_features=3):
    rng = np.random.RandomState(0)
    X = rng.randn(n_samples, n_features)

    bandwidth = 0.2

    for kernel in ['gaussian', 'tophat']:
        # draw a tophat sample
        kde = KernelDensity(bandwidth=bandwidth, kernel=kernel).fit(X)
        samp = kde.sample(100)
        assert_equal(X.shape, samp.shape)

        # check that samples are in the right range
        nbrs = NearestNeighbors(n_neighbors=1).fit(X)
        dist, ind = nbrs.kneighbors(X, return_distance=True)

        if kernel == 'tophat':
            assert np.all(dist < bandwidth)
        elif kernel == 'gaussian':
            # 5 standard deviations is safe for 100 samples, but there's a
            # very small chance this test could fail.
            assert np.all(dist < 5 * bandwidth)

    # check unsupported kernels
    for kernel in ['epanechnikov', 'exponential', 'linear', 'cosine']:
        kde = KernelDensity(bandwidth=bandwidth, kernel=kernel).fit(X)
        assert_raises(NotImplementedError, kde.sample, 100)

    # non-regression test: used to return a scalar
    X = rng.randn(4, 1)
    kde = KernelDensity(kernel="gaussian").fit(X)
    assert_equal(kde.sample().shape, (1, 1))
def get_density_based_best_sample(X, known_votes, possibilities):
    total_votes = sum(map(lambda x: len(x), known_votes))
    print(total_votes)
    X = X.toarray()
    current_vectors = numpy.copy(X)
    # print('X', X)
    # print('known_votes', known_votes)
    original_docs = len(X)
    possibilities = set([x[0] for x in possibilities])
    # print(possibilities)
    for i, sample in enumerate(known_votes):
        for k in range(len(sample)):
            current_vectors = numpy.append(current_vectors, [X[i]], axis=0)
    # print('current_vectors', current_vectors, len(current_vectors))
    # assert current_vectors != X
    model = KernelDensity(kernel='gaussian', bandwidth=0.2).fit(current_vectors)
    scores = model.score_samples(X)
    if total_votes % 3:
        # Explore low density regions
        sorted_scores = sorted(enumerate(scores), key=lambda x: x[1],
                               reverse=True)
    else:
        # Exploit high density regions 1 times out of 3
        sorted_scores = sorted(enumerate(scores), key=lambda x: x[1])
    # print(sorted_scores)
    for i in range(original_docs):
        if sorted_scores[i][0] in possibilities:
            # print(sorted_scores[i][0])
            return sorted_scores[i][0]
    return None
def sklearn_kde(data, points):
    from sklearn.neighbors import KernelDensity

    # Silverman bandwidth estimator
    n, d = data.shape
    bandwidth = (n * (d + 2) / 4.)**(-1. / (d + 4))

    # standardize data so that we can use uniform bandwidth
    mu, sigma = mean(data, axis=0), std(data, axis=0)
    data, points = (data - mu) / sigma, (points - mu) / sigma

    # print("starting grid search for bandwidth over %d points" % n)
    # from sklearn.grid_search import GridSearchCV
    # from numpy import logspace
    # params = {'bandwidth': logspace(-1, 1, 20)}
    # fitter = GridSearchCV(KernelDensity(), params)
    # fitter.fit(data)
    # kde = fitter.best_estimator_
    # print("best bandwidth: {0}".format(kde.bandwidth))
    # import time; T0 = time.time()
    kde = KernelDensity(kernel='gaussian', bandwidth=bandwidth,
                        rtol=1e-6, atol=1e-6)
    # print("T:%6.3f fitting" % (time.time() - T0))
    kde.fit(data)
    # print("T:%6.3f estimating" % (time.time() - T0))
    log_pdf = kde.score_samples(points)
    # print("T:%6.3f done" % (time.time() - T0))
    return exp(log_pdf)
def find_kernel(data, numgrid=1000, bw=0.002):
    Xtrain = data[:, 0:2]
    ytrain = data[2]

    # Set up the data grid for the contour plot
    # (pass numgrid as the positional `num` argument; `numgrid=` is not a
    # valid np.linspace keyword)
    xgrid = np.linspace(-74.1, -73.65, numgrid)
    ygrid = np.linspace(40.5, 40.8, numgrid)
    X, Y = np.meshgrid(xgrid, ygrid)
    xy = np.vstack([Y.ravel(), X.ravel()]).T

    # Plot map with distributions of each species
    fig = plt.figure()

    # construct a kernel density estimate of the distribution
    kde = KernelDensity(bandwidth=bw, kernel='gaussian')
    kde.fit(Xtrain, y=ytrain)

    # evaluate only on the land: -9999 indicates ocean
    Z = np.exp(kde.score_samples(xy))
    Z = Z.reshape(X.shape)

    # plot contours of the density
    levels = np.linspace(0, Z.max(), 25)
    plt.contourf(X, Y, Z, levels=levels, cmap=plt.cm.Reds)
    plt.title('BK CRIME')
    plt.show()
    return Z
def sklearn_density(sample_points, evaluation_points):
    """
    Estimate the probability density function from which a set of sample
    points was drawn and return the estimated density at the evaluation
    points.
    """
    from sklearn.neighbors import KernelDensity

    # Silverman bandwidth estimator
    n, d = sample_points.shape
    bandwidth = (n * (d + 2) / 4.)**(-1. / (d + 4))

    # Standardize data so that we can use uniform bandwidth.
    # Note that we will need to scale the resulting density by sigma to
    # correct the area.
    mu, sigma = mean(sample_points, axis=0), std(sample_points, axis=0)
    data, points = (sample_points - mu) / sigma, (evaluation_points - mu) / sigma

    # print("starting grid search for bandwidth over %d points" % n)
    # from sklearn.grid_search import GridSearchCV
    # from numpy import logspace
    # params = {'bandwidth': logspace(-1, 1, 20)}
    # fitter = GridSearchCV(KernelDensity(), params)
    # fitter.fit(data)
    # kde = fitter.best_estimator_
    # print("best bandwidth: {0}".format(kde.bandwidth))
    # import time; T0 = time.time()
    kde = KernelDensity(kernel='gaussian', bandwidth=bandwidth,
                        rtol=1e-6, atol=1e-6)
    # print("T:%6.3f fitting" % (time.time() - T0))
    kde.fit(data)
    # print("T:%6.3f estimating" % (time.time() - T0))
    log_pdf = kde.score_samples(points)
    # print("T:%6.3f done" % (time.time() - T0))
    return exp(log_pdf) / np.prod(sigma)  # undo the x scaling on the data points
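# Self-check sketch for sklearn_density (hedged; synthetic data): dividing by
# np.prod(sigma) undoes the standardization's change of variables, so the
# estimate should integrate to roughly 1 on a fine grid.
# import numpy as np
# samples_1d = np.random.RandomState(0).randn(500, 1)
# grid_1d = np.linspace(-4, 4, 801)[:, None]
# pdf_1d = sklearn_density(samples_1d, grid_1d)
# np.trapz(pdf_1d, grid_1d[:, 0])  # ~1.0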
def kde_fit_quantiles(rtquants, nsamples=1000, bw=.1):
    """ takes quantile estimates and fits cumulative density function
    returns samples to pass to sns.kdeplot()
    """
    kdefit = KernelDensity(kernel='gaussian', bandwidth=bw).fit(rtquants)
    samples = kdefit.sample(n_samples=nsamples).flatten()
    return samples
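# Usage sketch (hedged): KernelDensity.fit expects a 2-D array, so the
# quantile estimates need a trailing axis; the RT quantiles below are
# illustrative values only.
# rtquants = np.array([0.41, 0.47, 0.52, 0.58, 0.67])[:, None]
# samples = kde_fit_quantiles(rtquants, nsamples=1000, bw=.1)
# sns.kdeplot(samples)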
def test_KernelDensity_sampling(n_samples=100, n_features=3):
    np.random.seed(0)
    X = np.random.random((n_samples, n_features))
    bandwidth = 0.2

    for kernel in ["gaussian", "tophat"]:
        # draw a tophat sample
        kde = KernelDensity(bandwidth=bandwidth, kernel=kernel).fit(X)
        samp = kde.sample(100)
        assert_equal(X.shape, samp.shape)

        # check that samples are in the right range
        nbrs = NearestNeighbors(n_neighbors=1).fit(X)
        dist, ind = nbrs.kneighbors(X, return_distance=True)

        if kernel == "tophat":
            assert np.all(dist < bandwidth)
        elif kernel == "gaussian":
            # 5 standard deviations is safe for 100 samples, but there's a
            # very small chance this test could fail.
            assert np.all(dist < 5 * bandwidth)

    # check unsupported kernels
    for kernel in ["epanechnikov", "exponential", "linear", "cosine"]:
        kde = KernelDensity(bandwidth=bandwidth, kernel=kernel).fit(X)
        assert_raises(NotImplementedError, kde.sample, 100)
def xy_kde(xy, bandwidth, N_grid=100, levels=[0.8, 0.6, 0.4, 0.2]):
    x_edges = np.linspace(np.min(xy[:, 0]), np.max(xy[:, 0]), N_grid + 1)
    y_edges = np.linspace(np.min(xy[:, 1]), np.max(xy[:, 1]), N_grid + 1)
    x_centres = np.array([x_edges[b] + (x_edges[b + 1] - x_edges[b]) / 2
                          for b in range(N_grid)])
    y_centres = np.array([y_edges[b] + (y_edges[b + 1] - y_edges[b]) / 2
                          for b in range(N_grid)])
    x_grid, y_grid = np.meshgrid(x_centres, y_centres)
    xy_grid = np.array([np.ravel(x_grid), np.ravel(y_grid)]).T
    kde = KernelDensity(kernel='gaussian', bandwidth=bandwidth).fit(xy)
    H = np.exp(kde.score_samples(xy_grid).reshape(N_grid, N_grid))

    # this bit is taken from the corner_plot.py method.
    Hflat = H.flatten()
    inds = np.argsort(Hflat)[::-1]
    Hflat = Hflat[inds]
    sm = np.cumsum(Hflat)
    sm /= sm[-1]
    V = np.empty(len(levels))
    for i, v0 in enumerate(levels):
        try:
            V[i] = Hflat[sm <= v0][-1]
        except IndexError:
            V[i] = Hflat[0]

    V = np.sort(V)

    return H, V, x_grid, y_grid, bandwidth
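# Usage sketch (hedged; synthetic data): V holds density thresholds that
# enclose the requested probability mass, ready for a contour plot.
# rng = np.random.RandomState(0)
# xy = rng.randn(1000, 2)
# H, V, x_grid, y_grid, bw = xy_kde(xy, bandwidth=0.3)
# plt.contour(x_grid, y_grid, H, levels=V)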
def art_qi2(img, airmask, min_voxels=int(1e3), max_voxels=int(3e5),
            save_plot=True):
    r"""
    Calculates :math:`\text{QI}_2`, based on the goodness-of-fit of a
    centered :math:`\chi^2` distribution onto the intensity distribution of
    non-artifactual background (within the "hat" mask):

    .. math ::

        \chi^2_n = \frac{2}{(\sigma \sqrt{2})^{2n} \, (n - 1)!}x^{2n - 1}\, e^{-\frac{x}{2}}

    where :math:`n` is the number of coil elements.

    :param numpy.ndarray img: input data
    :param numpy.ndarray airmask: input air mask without artifacts
    """
    from sklearn.neighbors import KernelDensity
    from scipy.stats import chi2
    from mriqc.viz.misc import plot_qi2

    # S. Ogawa was born
    np.random.seed(1191935)

    data = img[airmask > 0]
    data = data[data > 0]

    # Write out figure of the fitting
    out_file = op.abspath('error.svg')
    with open(out_file, 'w') as ofh:
        ofh.write('<p>Background noise fitting could not be plotted.</p>')

    if len(data) < min_voxels:
        return 0.0, out_file

    modelx = data if len(data) < max_voxels else np.random.choice(
        data, size=max_voxels)

    x_grid = np.linspace(0.0, np.percentile(data, 99), 1000)

    # Estimate data pdf with KDE on a random subsample
    kde_skl = KernelDensity(bandwidth=0.05 * np.percentile(data, 98),
                            kernel='gaussian').fit(modelx[:, np.newaxis])
    kde = np.exp(kde_skl.score_samples(x_grid[:, np.newaxis]))

    # Find cutoff
    kdethi = np.argmax(kde[::-1] > kde.max() * 0.5)

    # Fit X^2
    param = chi2.fit(modelx[modelx < np.percentile(data, 95)], 32)
    chi_pdf = chi2.pdf(x_grid, *param[:-2], loc=param[-2], scale=param[-1])

    # Compute goodness-of-fit (gof)
    gof = float(np.abs(kde[-kdethi:] - chi_pdf[-kdethi:]).mean())
    if save_plot:
        out_file = plot_qi2(x_grid, kde, chi_pdf, modelx, kdethi)

    return gof, out_file
def get_log_density(x, bins):
    x_kde = bins[:, np.newaxis]
    # rule-of-thumb bandwidth: 1.06 * sigma * n^(-1/5)
    bandwidth = 1.06 * np.std(x) * np.power(len(x), -0.2)
    kde = KernelDensity(kernel='gaussian',
                        bandwidth=bandwidth).fit(x[:, np.newaxis])
    log_density = kde.score_samples(x_kde)
    return log_density
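# Usage sketch (hedged; synthetic data): exponentiate the returned
# log-density to recover the PDF on the bin grid.
# x = np.random.RandomState(0).randn(1000)
# bins = np.linspace(-4, 4, 100)
# pdf = np.exp(get_log_density(x, bins))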
def estimate_density(city):
    """Return a Gaussian KDE of venues in `city`."""
    kde = KernelDensity(bandwidth=175, rtol=1e-4)
    surround = xp.build_surrounding(DB.venue, city, likes=-1, checkins=1)
    kde.fit(surround.venues[:, :2])
    max_density = approximate_maximum_density(kde, surround.venues[:, :2])
    # pylint: disable=E1101
    return lambda xy: np.exp(kde.score_samples(xy)) / max_density
def fit_kde(costs, frac_std):
    """
    Fit a KDE to the costs, use a gaussian kernel and a bandwidth that is
    the specified fraction of the std.
    """
    bw = frac_std * np.std(costs)
    kde = KernelDensity(bandwidth=bw)
    return kde.fit(costs)
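# Usage sketch (hedged): KernelDensity.fit expects a 2-D array, so 1-D cost
# vectors should carry a trailing axis; the numbers are illustrative.
# costs = np.array([1.2, 0.9, 1.5, 2.1, 1.7])[:, np.newaxis]
# kde = fit_kde(costs, frac_std=0.25)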
def check_results(kernel, bandwidth, atol, rtol, X, Y, dens_true):
    kde = KernelDensity(kernel=kernel, bandwidth=bandwidth,
                        atol=atol, rtol=rtol)
    log_dens = kde.fit(X).score_samples(Y)
    assert_allclose(np.exp(log_dens), dens_true,
                    atol=atol, rtol=max(1E-7, rtol))
    assert_allclose(np.exp(kde.score(Y)), np.prod(dens_true),
                    atol=atol, rtol=max(1E-7, rtol))
def CrossValidationScore(Xs, h, kernel='gaussian'):
    kde = KernelDensity(bandwidth=h, kernel=kernel)
    ret = 0.
    for i in range(len(Xs)):
        # leave out sample i; note Xs[i+1:] (not Xs[i+1:-1]), so the last
        # sample stays in the training fold
        x = np.concatenate([Xs[0:i], Xs[i + 1:]])
        kde.fit(x)
        ret += kde.score_samples(Xs[i].reshape(1, -1))
    ret /= (1. * len(Xs))
    return ret
def build_classifiers(training_data, bandwidth):
    classifiers = {}
    for category in training_data:
        print('Classifier category: ' + category)
        print('Number of samples: ' + str(len(training_data[category])))
        kde = KernelDensity(bandwidth=bandwidth)
        classifiers[category] = kde.fit(training_data[category])
    return classifiers
        tumor_x.append(c.loc[0])
        tumor_y.append(c.loc[1])
    else:
        stroma_cnt += 1
        stroma_features.append(c.features)
        stroma_x.append(c.loc[0])
        stroma_y.append(c.loc[1])

# if stroma_cnt > 0 and tumor_cnt > 0:
if stroma_cnt > 10 and tumor_cnt > 10:
    eligible_patch += 1
    # print("eligible patch")
    xy = np.vstack([tumor_x, tumor_y])
    s_xy = np.vstack([stroma_x, stroma_y]).T

    kde_skl_1 = KernelDensity(bandwidth=16)
    kde_skl_1.fit(xy.T)
    sc_1 = kde_skl_1.score_samples(s_xy)

    kde_skl_2 = KernelDensity(bandwidth=20)
    kde_skl_2.fit(xy.T)
    sc_2 = kde_skl_2.score_samples(s_xy)

    kde_skl_3 = KernelDensity(bandwidth=24)
    kde_skl_3.fit(xy.T)
    sc_3 = kde_skl_3.score_samples(s_xy)

    kde_skl_4 = KernelDensity(bandwidth=30)
    kde_skl_4.fit(xy.T)
    sc_4 = kde_skl_4.score_samples(s_xy)
import numpy as np
from sklearn.datasets import load_digits
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KernelDensity

# load the data
digits = load_digits()
data = digits.data

# project the 64-dimensional data to a lower dimension
pca = PCA(n_components=15, whiten=False)
data = pca.fit_transform(digits.data)

# use grid search cross-validation to optimize the bandwidth
params = {'bandwidth': np.logspace(-1, 1, 20)}
grid = GridSearchCV(KernelDensity(), params)
grid.fit(data)

print("best bandwidth: {0}".format(grid.best_estimator_.bandwidth))

# use the best estimator to compute the kernel density estimate
kde = grid.best_estimator_

# sample 44 new points from the data
new_data = kde.sample(44, random_state=0)
new_data = pca.inverse_transform(new_data)

# turn data into a 4x11 grid
new_data = new_data.reshape((4, 11, -1))
real_data = digits.data[:44].reshape((4, 11, -1))
def test_kde_sample_weights():
    n_samples = 400
    size_test = 20
    weights_neutral = np.full(n_samples, 3.)
    for d in [1, 2, 10]:
        rng = np.random.RandomState(0)
        X = rng.rand(n_samples, d)
        weights = 1 + (10 * X.sum(axis=1)).astype(np.int8)
        X_repetitions = np.repeat(X, weights, axis=0)
        n_samples_test = size_test // d
        test_points = rng.rand(n_samples_test, d)
        for algorithm in ['auto', 'ball_tree', 'kd_tree']:
            for metric in ['euclidean', 'minkowski', 'manhattan',
                           'chebyshev']:
                if algorithm != 'kd_tree' or metric in KDTree.valid_metrics:
                    kde = KernelDensity(algorithm=algorithm, metric=metric)

                    # Test that adding a constant sample weight has no effect
                    kde.fit(X, sample_weight=weights_neutral)
                    scores_const_weight = kde.score_samples(test_points)
                    sample_const_weight = kde.sample(random_state=1234)
                    kde.fit(X)
                    scores_no_weight = kde.score_samples(test_points)
                    sample_no_weight = kde.sample(random_state=1234)
                    assert_allclose(scores_const_weight, scores_no_weight)
                    assert_allclose(sample_const_weight, sample_no_weight)

                    # Test equivalence between sampling and (integer) weights
                    kde.fit(X, sample_weight=weights)
                    scores_weight = kde.score_samples(test_points)
                    sample_weight = kde.sample(random_state=1234)
                    kde.fit(X_repetitions)
                    scores_ref_sampling = kde.score_samples(test_points)
                    sample_ref_sampling = kde.sample(random_state=1234)
                    assert_allclose(scores_weight, scores_ref_sampling)
                    assert_allclose(sample_weight, sample_ref_sampling)

                    # Test that sample weights has a non-trivial effect
                    diff = np.max(np.abs(scores_no_weight - scores_weight))
                    assert diff > 0.001

                    # Test invariance with respect to arbitrary scaling
                    scale_factor = rng.rand()
                    kde.fit(X, sample_weight=(scale_factor * weights))
                    scores_scaled_weight = kde.score_samples(test_points)
                    assert_allclose(scores_scaled_weight, scores_weight)
def cumi(x_orig, y_orig, z_orig, normalization=False, k=5,
         density_estimation_method="kde", k_density=5, bw=.01):
    """Calculates the uniformed conditional mutual information where the
    distribution for :math:`x` and :math:`z` is replaced by a uniform
    distribution.

    `cumi` takes two random variables :math:`x` and :math:`y` and estimates
    their mutual information conditioned on the third random variable
    :math:`z` using the KSG estimator, while :math:`x`, :math:`y` is
    replaced by a uniform distribution.

    Arguments
    ---------
    x_orig: `List`
        One random variable from the time-series data.
    y_orig: `List`
        Another random variable from the time-series data.
    z_orig: `List`
        Another random variable from the time-series data.
    normalization: `bool` (Default: False)
        Whether to normalize the expression of :math:`x, y, z` by their
        standard deviation.
    k: `int` (Default: 5)
        Number for nearest neighbors used in entropy calculation.
    density_estimation_method: `str` (Default: `kde`)
        Which 2D density estimator you would like to use. `kde` is the kde
        estimator while `knn` is the knn based estimator.
    k_density: `int` (Default: 5)
        The number of k nearest neighbors you would like to use when
        calculating the density (only applicable when
        density_estimation_method is `knn` or using knn based density
        estimation).
    bw: `float` (default: 0.01)
        Bandwidth used for the kernel density estimator.

    Returns
    -------
    An estimated conditional mutual information value between two variables
    (x, y), conditioning on a third variable z, where the distribution for
    x, z is replaced by a uniform distribution.
    """
    x = deepcopy(x_orig)
    y = deepcopy(y_orig)
    z = deepcopy(z_orig)
    assert len(x) == len(y), "Lists should have same length"
    assert len(x) == len(z), "Lists should have same length"

    N = len(x)
    dx = len(x[0])
    dy = len(y[0])
    dz = len(z[0])

    if normalization:
        x /= np.std(x)
        y /= np.std(y)
        z /= np.std(z)

    data_xyz = np.concatenate((x, y, z), axis=1)
    data_xz = np.concatenate((x, z), axis=1)
    data_yz = np.concatenate((y, z), axis=1)

    tree_xyz = ss.cKDTree(data_xyz)
    tree_xz = ss.cKDTree(data_xz)
    tree_yz = ss.cKDTree(data_yz)
    tree_z = ss.cKDTree(z)

    if density_estimation_method.lower() == "kde":
        kernel = KernelDensity(bandwidth=bw)
        kernel.fit(data_xz)
        kde = np.exp(kernel.score_samples(data_xz))
        weight = (1 / kde) / np.mean(1 / kde)
    elif density_estimation_method.lower() == "knn":
        knn_dis = [
            tree_xz.query(point, k_density + 1, p=np.inf)[0][k_density]
            for point in data_xz
        ]
        density_estimate = np.array([
            float(k_density) / N / knn_dis[i]**(dx + dz)
            for i in range(len(knn_dis))
        ])
        weight = (1 / density_estimate) / np.mean(1 / density_estimate)
    else:
        raise ValueError("The density estimation method is not recognized")

    knn_dis = [
        tree_xyz.query(point, k + 1, p=np.inf)[0][k] for point in data_xyz
    ]
    information_samples = [0 for i in range(N)]
    for i in range(N):
        information_samples[i] += weight[i] * digamma(
            len(tree_xyz.query_ball_point(data_xyz[i], knn_dis[i], p=np.inf)) - 1)
        information_samples[i] += weight[i] * -digamma(
            len(tree_xz.query_ball_point(data_xz[i], knn_dis[i], p=np.inf)) - 1)
        information_samples[i] += weight[i] * -digamma(
            np.sum(weight[j] for j in tree_yz.query_ball_point(
                data_yz[i], knn_dis[i], p=np.inf)) - weight[i])
        information_samples[i] += weight[i] * digamma(
            np.sum(weight[j] for j in tree_z.query_ball_point(
                z[i], knn_dis[i], p=np.inf)) - weight[i])
    return np.mean(information_samples)
# print(dick)
estimations = []
for relation, points in dick.items():
    print(relation)
    X = np.array(points[1])[:, np.newaxis]
    Y = np.array(points[0])[:, np.newaxis]
    # plt.scatter(Y, [0] * len(Y))  # np.random.normal(0, 0.03, len(Y))
    # plt.scatter(X, [1] * len(X))  # np.random.normal(1, 0.03, len(X))
    # plt.title(relation)
    # plt.ylabel("Correctness")
    # plt.xlabel("Confidence")

    # kde
    X_plot = np.linspace(0, 1, 1000)[:, np.newaxis]
    kde_incorrect = KernelDensity(kernel='gaussian', bandwidth=0.15).fit(X)
    kde_allpt = KernelDensity(kernel='gaussian', bandwidth=0.15).fit(Y)
    log_dens_incorrect = kde_incorrect.score_samples(X_plot)
    log_dens_allpt = kde_allpt.score_samples(X_plot)
    # ax = plt.gca()
    estimation = np.exp(
        np.subtract(log_dens_incorrect + np.log(len(X)),
                    log_dens_allpt + np.log(len(Y))))
    # ax.plot(X_plot, estimation)
    # enforce a monotonically non-decreasing estimate
    for i in range(0, len(estimation) - 1):
        if estimation[i] > estimation[i + 1]:
            estimation[i + 1] = estimation[i]
def segment(self, Z, verbose=True):
    """Fit the model using Z as data to be segmented.

    Args:
        Z (np array, shape (n_samples, 2)): Data to be segmented.
        verbose (bool): Verbosity.

    Returns:
        labels (np array, (n_samples,)): Segment label for each sample.
    """
    if verbose:
        print('Segmenting regions using watershed...')
        print('- num samples: {}'.format(len(Z)))

    # outliers
    if self.prune_outliers:
        if verbose:
            print('- pruning outliers')
        self.lof_ = LocalOutlierFactor(n_neighbors=self.outlier_neighbors,
                                       contamination=0.1)
        lof_pred = self.lof_.fit_predict(Z)
        lof_scores = self.lof_.negative_outlier_factor_
        lof_scores = minmax_scale(lof_scores)
        self.Z_crop_ = Z[lof_scores > self.outlier_threshold]
        self.Z_left_ = np.where(lof_scores > self.outlier_threshold)
        num_outliers = Z.shape[0] - self.Z_crop_.shape[0]
        print('-> outliers pruned: {}'.format(num_outliers))
    else:
        self.Z_crop_ = Z

    # normalize Z and inset
    self.Z_norm_ = minmax_scale(
        self.Z_crop_,
        feature_range=(0 + self.ngrid_pad, 1 - self.ngrid_pad),
        axis=0,
    )

    # estimate probability density using Gaussian kernel
    if verbose:
        print('- performing KDE')
    self.kde_ = KernelDensity(kernel='gaussian',
                              bandwidth=self.bandwidth).fit(self.Z_norm_)

    # convert density estimate to an image of probs and normalize
    if verbose:
        print('- scoring KDE')
    x, y = np.meshgrid(np.linspace(0, 1, self.ngrid),
                       np.linspace(0, 1, self.ngrid))
    log_dens = self.kde_.score_samples(
        np.array((x.flatten(), y.flatten())).T)
    self.P_ = np.reshape(log_dens, (self.ngrid, self.ngrid))
    self.P_ = np.exp(self.P_) / np.max(np.exp(self.P_))

    # find peaks
    if verbose:
        print('- finding peaks')
    self.peaks_ = peak_local_max(
        self.P_,
        min_distance=self.peak_min_distance,
        threshold_rel=self.peak_threshold_rel,
        exclude_border=False,
    )

    # convert peaks to image and dilate
    self.P_peaks_ = np.ones_like(self.P_)
    for peak in self.peaks_:
        for i in range(-self.peak_dialation, self.peak_dialation + 1):
            for j in range(-self.peak_dialation, self.peak_dialation + 1):
                self.P_peaks_[(peak[0] + i, peak[1] + j)] = 0

    # euclidean distance transform
    if verbose:
        print('- computing edt')
    self.P_edt_ = ndi.distance_transform_edt(self.P_peaks_)

    # perform watershed on edt
    if verbose:
        print('- performing watershed on edt')
    markers = ndi.label(1 - self.P_peaks_)[0]  # use peaks as seed markers
    self.P_labels_ = watershed(self.P_edt_, markers,
                               compactness=self.compactness)

    # find boundaries
    if verbose:
        print('- finding boundaries')
    self.P_bounds_ = find_boundaries(self.P_labels_)

    # find labels for Zs (swap axes when indexing the label image)
    indices = np.round(self.Z_norm_ * self.ngrid).astype(int)
    self.Z_labels_ = self.P_labels_[indices[:, 1], indices[:, 0]]

    if verbose:
        print('-> num regions found: {}'.format(len(self.peaks_)))

    return self.Z_labels_
def doqueries(givenfield, command):
    con = lite.connect(
        '/home/hartsuiker/Documents/dbdm/DDM2017/FinalProject/DDM17final.db')
    with con:
        cur = con.cursor()
        commandR1 = """SELECT ImageID,COUNT(DISTINCT StarID)
            FROM imagetable_H
            WHERE Flux1/dFlux1 > 5 and ImageID in(
                SELECT ID FROM mastertable
                WHERE MJD between 56800 and 57300) and class = -1
            GROUP BY ImageID
            UNION
            SELECT ImageID,COUNT(DISTINCT StarID)
            FROM imagetable_Ks
            WHERE Flux1/dFlux1 > 5 and ImageID in(
                SELECT ID FROM mastertable
                WHERE MJD between 56800 and 57300) and class = -1
            GROUP BY ImageID
            UNION
            SELECT ImageID,COUNT(DISTINCT StarID)
            FROM imagetable_Z
            WHERE Flux1/dFlux1 > 5 and ImageID in(
                SELECT ID FROM mastertable
                WHERE MJD between 56800 and 57300) and class = -1
            GROUP BY ImageID
            UNION
            SELECT ImageID,COUNT(DISTINCT StarID)
            FROM imagetable_J
            WHERE Flux1/dFlux1 > 5 and ImageID in(
                SELECT ID FROM mastertable
                WHERE MJD between 56800 and 57300) and class = -1
            GROUP BY ImageID
            UNION
            SELECT ImageID,COUNT(DISTINCT StarID)
            FROM imagetable_Y
            WHERE Flux1/dFlux1 > 5 and ImageID in(
                SELECT ID FROM mastertable
                WHERE MJD between 56800 and 57300) and class = -1
            GROUP BY ImageID
            ORDER BY imageID asc
            """
        commandR2 = '''SELECT h.StarID,j.mag1-h.mag1
            FROM imagetable_H as h
            join imagetable_J as j on h.StarID = j.StarID
            WHERE J.mag1-h.mag1 > 1.5
            ORDER BY h.StarID asc
            '''
        commandR3 = '''SELECT ks.StarID,ks.imageID,ABS(ks.Flux1-(
                SELECT AVG(ks2.Flux1)
                FROM imagetable_Ks as ks2
                WHERE ks.imageID = ks2.imageID))/ks.dFlux1
            FROM imagetable_Ks as ks
            WHERE ABS(ks.Flux1 -(
                SELECT AVG(ks2.Flux1)
                FROM imagetable_Ks as ks2
                WHERE ks.imageID = ks2.imageID)) > 20 *ks.dFlux1
            ORDER BY ks.StarID asc,ks.imageID asc
            '''
        commandR4 = '''SELECT ID
            FROM mastertable
            WHERE FieldID = %s
            ORDER BY ID asc
            ''' % (givenfield)
        commandR5 = '''SELECT y.StarID,y.Mag1,z.Mag1,j.Mag1,h.Mag1,ks.Mag1
            FROM imagetable_Y as y
            join imagetable_Z as z on z.StarID = y.StarID
            join imagetable_J as j on j.StarID = y.StarID
            join imagetable_H as h on h.StarID = y.StarID
            join imagetable_Ks as ks on ks.StarID = y.StarID
            join mastertable as m on m.ID = y.ImageID
            WHERE y.Flux1/y.dFlux1 > 30 and z.Flux1/z.dFlux1 > 30
                and j.Flux1/j.dFlux1 > 30 and h.Flux1/h.dFlux1 > 30
                and ks.Flux1/ks.dFlux1 > 30 and y.class = -1 and z.class = -1
                and j.class = -1 and h.class = -1 and ks.class = -1
                and m.FieldID = %s and ks.ImageID=(
                    SELECT m2.ID
                    FROM mastertable as m2
                    WHERE m2.Filename = 'Field-%s-Ks-E001.fits')
            ORDER BY y.StarID asc
            ''' % (givenfield, givenfield)
        commandR6 = '''SELECT y.Mag1-j.Mag1,j.Mag1-h.Mag1
            FROM imagetable_Y as y
            join imagetable_J as j on j.StarID = y.StarID
            join imagetable_H as h on h.StarID = y.StarID
            WHERE y.Mag1-j.Mag1 not NULL and j.Mag1-h.Mag1 not NULL
                and y.class = -1 and j.class = -1 and h.class = -1
            limit 100
            '''
        if command == 1:
            command = commandR1
        elif command == 2:
            command = commandR2
        elif command == 3:
            command = commandR3
        elif command == 4:
            command = commandR4
        elif command == 5:
            command = commandR5
        elif command == 6:
            command = commandR6
        rows = cur.execute(command)
        # for row in rows:
        #     print(row)
        if command == commandR2:
            Q = 0
            for row in rows:
                if Q == 0:
                    a = np.array(row)
                    Q = 1
                else:
                    a = np.vstack((a, row))
            print(a)
            plt.hist(a[:, 1], bins=200)
            plt.ylabel('amount of objects', fontsize=50)
            plt.xlabel('J-H color', fontsize=50)
            plt.xticks(fontsize=40)
            plt.yticks(fontsize=40)
            plt.xlim(1.49, 1.75)
            plt.title('J-H color of all objects with J-H > 1.5', fontsize=60)
            plt.show()
            plt.close()
        if command == commandR3:
            Q = 0
            for row in rows:
                if Q == 0:
                    a = np.array(row)
                    Q = 1
                else:
                    a = np.vstack((a, row))
            print(a)
            plt.hist(a[:, 2], bins=280)
            plt.ylabel('amount of objects', fontsize=50)
            plt.xlabel('deviation from the mean flux [flux uncertainties]',
                       fontsize=50)
            plt.xticks(fontsize=40)
            plt.yticks(fontsize=40)
            plt.xlim(0, 145)
            plt.title('deviation from the mean flux for all deviations > 20 '
                      'times the flux uncertainty', fontsize=32)
            plt.show()
            plt.close()
        if command == commandR5:
            Q = 0
            for row in rows:
                if Q == 0:
                    a = np.array(row)
                    Q = 1
                else:
                    a = np.vstack((a, row))
            sns.kdeplot(a[:, 1], label='Y', shade=True, linewidth=3.5)
            sns.kdeplot(a[:, 2], label='Z', shade=True, linewidth=3.5)
            sns.kdeplot(a[:, 3], label='J', shade=True, linewidth=3.5)
            sns.kdeplot(a[:, 4], label='H', shade=True, linewidth=3.5)
            sns.kdeplot(a[:, 5], label='Ks', shade=True, linewidth=3.5)
            plt.title('Kernel density plot in all filters of all objects in '
                      'field ' + str(givenfield), fontsize=50)
            leg = plt.legend(fontsize=60, loc='upper left')
            for line in leg.get_lines():
                line.set_linewidth(6.0)
            plt.xlabel('Magnitude in given filter', fontsize=50)
            plt.ylabel('Normalized counts', fontsize=50)
            plt.xticks(fontsize=30)
            plt.yticks(fontsize=30)
            plt.show()
            plt.close()
        if command == commandR6:
            Q = 0
            for row in rows:
                if Q == 0:
                    a = np.array(row)
                    Q = 1
                else:
                    a = np.vstack((a, row))
            kf = KFold(n_splits=10)
            kf.get_n_splits(a)
            print('shape', np.shape(a))
            Max = -1e99
            # for i in range(1000):
            #     print(0.001 + i / 1000.)
            #     array = []
            #     for train_index, test_index in kf.split(a):
            #         a_train, a_test = a[train_index], a[test_index]
            #         kde = KernelDensity(kernel='gaussian',
            #                             bandwidth=0.001 + i / 1000.).fit(a_train)
            #         log_dens = kde.score_samples(a_train)
            #         loglikelihood = kde.score(a_test)
            #         array = np.append(array, loglikelihood)
            #     Loglikelihood = np.nanmean(array)
            #     if Loglikelihood > Max:
            #         Max = Loglikelihood
            #         Bandwidth = 0.001 + i / 1000.
            #         print('new best value for the bandwidth: ', Bandwidth)
            # calculated with the above for loop for the first 2000 entries
            # of the query
            Bandwidth = 0.061
            kde = KernelDensity(kernel='gaussian', bandwidth=Bandwidth).fit(a)
            samples = kde.sample(100000)
            # plt.scatter(samples[:, 0], samples[:, 1])
            # plt.xlabel('Y-J', fontsize=50)
            # plt.ylabel('J-H', fontsize=50)
            # plt.xticks(fontsize=40)
            # plt.yticks(fontsize=40)
            # plt.title('sample of J-H color vs the Y-J color for 100,000 stars',
            #           fontsize=50)
            # plt.show()
            # plt.close()
            data = samples
            df = pd.DataFrame(data, columns=["Y-J", "J-H"])
            sns.jointplot(x="Y-J", y="J-H", data=df, stat_func=None, kind="kde")
            # plt.xlabel('Y-J', fontsize=40)
            # plt.ylabel('J-H', fontsize=40)
            plt.xticks(fontsize=20)
            plt.yticks(fontsize=20)
            # plt.title('sample of J-H color vs the Y-J color for 100,000 stars
            #            as 2D distribution', fontsize=40)
            plt.show()
            plt.close()
        con.commit()
hit_prob = float(hits) / float(total)
out_prob = float(outs) / float(total)

out_rows = hit_vector.loc[hit_vector['events'].isin(out_list)]
single_rows = hit_vector.loc[hit_vector['events'] == 'Single']
double_rows = hit_vector.loc[hit_vector['events'] == 'Double']
triple_rows = hit_vector.loc[hit_vector['events'] == 'Triple']

hit_vector = hit_vector.drop(hit_vector.columns[[0, 1, 2]], axis=1)
out_rows = out_rows.drop(out_rows.columns[[0, 1, 2]], axis=1)
single_rows = single_rows.drop(single_rows.columns[[0, 1, 2]], axis=1)
double_rows = double_rows.drop(double_rows.columns[[0, 1, 2]], axis=1)
triple_rows = triple_rows.drop(triple_rows.columns[[0, 1, 2]], axis=1)

hit_rows = pd.concat([single_rows, double_rows, triple_rows])

kde = KernelDensity(bandwidth=4.53793103448)
kde2 = KernelDensity(bandwidth=5.5620689655172413)
heat_list = []

# grid = GridSearchCV(KernelDensity(),
#                     {'bandwidth': np.linspace(0.1, 10.0, 30)},
#                     cv=20)  # 20-fold cross-validation
# grid.fit(hit_rows)
# print(grid.best_params_)
# print("best bandwidth: {0}".format(grid.best_estimator_.bandwidth))

kde.fit(hit_vector)
kde2.fit(hit_rows)

for angle in range(-20, 5):
def KDE(x, y):
    kde = KernelDensity(kernel='gaussian', bandwidth=0.2).fit(x)
    score = sum(kde.score_samples(y))
    return score
# class_road_pos = {}

# Write a two-level (nested) list to a CSV file.
# Inputs: file name, data list.
def createListCSV(fileName="", dataList=[]):
    # the `with` block closes the file automatically
    with open(fileName, "wb") as csvFile:
        csvWriter = csv.writer(csvFile)
        for data in dataList:
            csvWriter.writerow(data)

# Read the ground-truth images and compute the centre point of each class.
with open(label_list) as file_object:
    lines = file_object.readlines()

# line example: /home/yangshuhui/code/data/GT5label/label3/06753.png
for line in lines:
    class_road_pos = kdeEstimates.split_img_center(line.rstrip())
    for cls, pos in class_road_pos.items():
        pos_list[cls].append(pos)

# Build the KDE for each class.
for cls, pos in pos_list.items():
    # createListCSV(cls, pos)
    if pos:
        X = np.array(pos)
        print(X)
        kde = KernelDensity(kernel='gaussian', bandwidth=0.2).fit(X)
print("pval_silverman(data) = {}".format(pval_silverman(data))) t2 = time.time() print("Critical bandwidth computation time: {}".format(t1-t0)) print("Silverman test computation time: {}".format(t2-t1)) fig, ax = plt.subplots() ax.hist(data, bins=50, normed=True) x_grid = np.linspace(np.min(data)-2, np.max(data)+2, 100) ax.plot(x_grid, KDE(data, h_crit).evaluate(x_grid), linewidth=2, color='black') plt.show() if 0: data = np.random.randn(1000) h = .5 print("np.std(data) = {}".format(np.std(data))) resamp = KernelDensity(kernel='gaussian', bandwidth=h).fit(data).sample(1000)/np.sqrt(1+h**2/np.var(data)) print("np.std(resamp) = {}".format(np.std(resamp))) if 0: N = 1000 data = np.hstack([np.random.randn(N/2), np.random.randn(N/4)+4]) h = 0.1 print("is_unimodal_kde(h, data) = {}".format(is_unimodal_kde(h, data))) #plt.show() h_crit = critical_bandwidth_m_modes(data, 2) x = np.linspace(-3, 8, 200) y = KernelDensity(kernel='gaussian', bandwidth=h_crit).fit(data.reshape(-1, 1)).score_samples(x.reshape(-1, 1)) plt.plot(x, np.exp(y)) plt.show() if 0:
fcols = [c for c in original_data.columns if c != 'label']
classes = np.sort(np.unique(sampled_data["label"].values))
kdes = {}
for c in classes:
    print("\n\n=========================", "class", c,
          "============================\n\n")
    cdata = original_data.loc[original_data.label == c, fcols].values
    sampled_cdata = sampled_data.loc[sampled_data.label == c, fcols].values
    NUM_CDATA = cdata.shape[0]
    NUM_SDATA = sampled_cdata.shape[0]
    print("Fitting & Sampling...")
    if bandwidth is not None:
        for bw in [float(b) for b in bandwidth.split(",")]:
            print(" -- bw --", bw)
            kde = KernelDensity(kernel='gaussian', bandwidth=bw).fit(cdata)
            if not onlyNew:
                samples = kde.sample(NUM_SDATA, random_state=0)
            # random sampling.
            max_val = np.max(cdata)
            min_val = np.min(cdata)
            rnd_data = (np.random.rand(NUM_SDATA, cdata.shape[1])
                        * (max_val - min_val) + min_val)
            print("Evaluating ...")
            kdes[c] = kde
            odata = cdata[0:NUM_SDATA, :]
            print("class", c, "original", kde.score(odata) / NUM_SDATA)
            print("class", c, "sklearn KDE sampled",
                  kde.score(samples) / NUM_SDATA)
import numpy as np
from scipy import stats
import matplotlib.pyplot as pltV
from sklearn.neighbors import KernelDensity

# note: `plt` here is an Axes object (the name shadows the usual
# matplotlib alias)
fig, plt = pltV.subplots(1, 1)

r = stats.norm.rvs(size=20)
z = stats.norm.rvs(size=80)
v = stats.norm.rvs(size=150)
print(r)
print(z)
print(v)
print(stats.norm.fit(r))
print(stats.norm.fit(z))
print(stats.norm.fit(v))

x241 = np.linspace(-5, 5).reshape(-1, 1)
norm241 = stats.norm.pdf(x241)
plt.plot(norm241, 'r-')
# note: this KDE is fit on the evenly-spaced grid points themselves,
# not on the random samples drawn above
kde = KernelDensity(kernel='gaussian').fit(x241)
norm2412 = np.exp(kde.score_samples(x241))
plt.plot(norm2412)
pltV.show()
def showSurveyStatistics(simulatedSurvey, pdfFile=None, pngFile=None,
                         usekde=False):
    """
    Produce a plot with the survey statistics.

    Parameters
    ----------
    simulatedSurvey : Object containing the simulated survey.

    Keywords
    --------
    pdfFile : string
        Name of optional PDF file in which to save the plot.
    pngFile : string
        Name of optional PNG file in which to save the plot.
    usekde : boolean
        If true use kernel density estimates to show the distribution of
        survey quantities instead of histograms.
    """
    try:
        _ = simulatedSurvey.observedParallaxes.shape
    except AttributeError:
        stderr.write("You have not generated the observations yet!\n")
        return

    parLimitPlot = 50.0
    plxSnrLim = 5.0

    positiveParallaxes = (simulatedSurvey.observedParallaxes > 0.0)
    goodParallaxes = (simulatedSurvey.observedParallaxes /
                      simulatedSurvey.parallaxErrors >= plxSnrLim)
    estimatedAbsMags = (
        simulatedSurvey.observedMagnitudes[positiveParallaxes] +
        5.0 * np.log10(simulatedSurvey.observedParallaxes[positiveParallaxes]) - 10.0)
    relParErr = (simulatedSurvey.parallaxErrors[positiveParallaxes] /
                 simulatedSurvey.observedParallaxes[positiveParallaxes])
    deltaAbsMag = estimatedAbsMags - simulatedSurvey.absoluteMagnitudes[positiveParallaxes]

    useagab(usetex=False, fontfam='sans')
    fig = plt.figure(figsize=(27, 12))

    axA = fig.add_subplot(2, 3, 1)
    apply_tufte(axA, withgrid=False)
    axA.set_prop_cycle(cycler('color', get_distinct(3)))

    minPMinThird = np.power(simulatedSurvey.minParallax, -3.0)
    maxPMinThird = np.power(parLimitPlot, -3.0)
    x = np.linspace(simulatedSurvey.minParallax,
                    np.min([parLimitPlot, simulatedSurvey.maxParallax]), 1001)
    axA.plot(x, 3.0 * np.power(x, -4.0) / (minPMinThird - maxPMinThird),
             '--', label='model', lw=3)

    if usekde:
        scatter = rse(simulatedSurvey.trueParallaxes)
        bw = 1.06 * scatter * simulatedSurvey.numberOfStarsInSurvey**(-0.2)
        kde = KernelDensity(bandwidth=bw)
        kde.fit(simulatedSurvey.trueParallaxes[:, None])
        samples = np.linspace(simulatedSurvey.trueParallaxes.min(),
                              simulatedSurvey.trueParallaxes.max(),
                              200)[:, None]
        logdens = kde.score_samples(samples)
        axA.plot(samples, np.exp(logdens), '-', lw=3, label='true')
    else:
        axA.hist(simulatedSurvey.trueParallaxes, bins='auto', density=True,
                 histtype='step', lw=3, label='true')

    if usekde:
        scatter = rse(simulatedSurvey.observedParallaxes)
        bw = 1.06 * scatter * simulatedSurvey.numberOfStarsInSurvey**(-0.2)
        kde = KernelDensity(bandwidth=bw)
        kde.fit(simulatedSurvey.observedParallaxes[:, None])
        samples = np.linspace(simulatedSurvey.observedParallaxes.min(),
                              simulatedSurvey.observedParallaxes.max(),
                              200)[:, None]
        logdens = kde.score_samples(samples)
        axA.plot(samples, np.exp(logdens), '-', lw=3, label='observed')
    else:
        axA.hist(simulatedSurvey.observedParallaxes, bins='auto',
                 density=True, histtype='step', lw=3, label='observed')

    axA.set_xlabel(r'$\varpi$, $\varpi_\mathrm{true}$ [mas]')
    axA.set_ylabel(r'$p(\varpi)$, $p(\varpi_\mathrm{true})$')
    leg = axA.legend(loc='best', handlelength=1.0)
    for t in leg.get_texts():
        t.set_fontsize(14)
    axA.text(0.025, 0.9, 'a', horizontalalignment='center',
             verticalalignment='center', transform=axA.transAxes,
             weight='bold', fontsize=30)

    axB = fig.add_subplot(2, 3, 2)
    apply_tufte(axB, withgrid=False)
    axB.set_prop_cycle(cycler('color', get_distinct(3)))

    m = np.linspace(simulatedSurvey.observedMagnitudes.min(),
                    simulatedSurvey.observedMagnitudes.max(), 1000)
    axB.plot(m, np.exp(simulatedSurvey.apparentMagnitude_lpdf(m)),
             '--', lw=3, label='model')

    if usekde:
        scatter = rse(simulatedSurvey.apparentMagnitudes)
        bw = 1.06 * scatter * simulatedSurvey.numberOfStarsInSurvey**(-0.2)
        kde = KernelDensity(bandwidth=bw)
        kde.fit(simulatedSurvey.apparentMagnitudes[:, None])
        samples = np.linspace(simulatedSurvey.apparentMagnitudes.min(),
                              simulatedSurvey.apparentMagnitudes.max(),
                              200)[:, None]
        logdens = kde.score_samples(samples)
        axB.plot(samples, np.exp(logdens), '-', label='true', lw=3)
    else:
        axB.hist(simulatedSurvey.apparentMagnitudes, bins='auto',
                 density=True, histtype='step', lw=3, label='true')

    if usekde:
        scatter = rse(simulatedSurvey.observedMagnitudes)
        bw = 1.06 * scatter * simulatedSurvey.numberOfStarsInSurvey**(-0.2)
        kde = KernelDensity(bandwidth=bw)
        kde.fit(simulatedSurvey.observedMagnitudes[:, None])
        samples = np.linspace(simulatedSurvey.observedMagnitudes.min(),
                              simulatedSurvey.observedMagnitudes.max(),
                              200)[:, None]
        logdens = kde.score_samples(samples)
        axB.plot(samples, np.exp(logdens), '-', label='observed', lw=3)
    else:
        axB.hist(simulatedSurvey.observedMagnitudes, bins='auto',
                 density=True, histtype='step', lw=3, label='observed')

    axB.set_xlabel(r"$m$, $m_\mathrm{true}$")
    axB.set_ylabel(r"$p(m)$, $p(m_\mathrm{true})$")
    leg = axB.legend(loc=(0.03, 0.55), handlelength=1.0)
    for t in leg.get_texts():
        t.set_fontsize(14)
    axB.text(0.025, 0.9, 'b', horizontalalignment='center',
             verticalalignment='center', transform=axB.transAxes,
             weight='bold', fontsize=30)

    axC = fig.add_subplot(2, 3, 3)
    apply_tufte(axC, withgrid=False)
    axC.set_prop_cycle(cycler('color', get_distinct(3)))

    x = np.linspace(simulatedSurvey.absoluteMagnitudes.min(),
                    simulatedSurvey.absoluteMagnitudes.max(), 300)
    axC.plot(x, norm.pdf(x, loc=simulatedSurvey.meanAbsoluteMagnitude,
                         scale=simulatedSurvey.stddevAbsoluteMagnitude),
             '--', lw=3, label='model')

    if usekde:
        scatter = rse(simulatedSurvey.absoluteMagnitudes)
        bw = 1.06 * scatter * simulatedSurvey.numberOfStarsInSurvey**(-0.2)
        kde = KernelDensity(bandwidth=bw)
        kde.fit(simulatedSurvey.absoluteMagnitudes[:, None])
        samples = np.linspace(simulatedSurvey.absoluteMagnitudes.min(),
                              simulatedSurvey.absoluteMagnitudes.max(),
                              200)[:, None]
        logdens = kde.score_samples(samples)
        axC.plot(samples, np.exp(logdens), '-', label='true', lw=3)
    else:
        axC.hist(simulatedSurvey.absoluteMagnitudes, bins='auto',
                 density=True, histtype='step', lw=3, label='true')

    if simulatedSurvey.absoluteMagnitudes[goodParallaxes].size >= 3:
        if usekde:
            scatter = rse(simulatedSurvey.absoluteMagnitudes[goodParallaxes])
            bw = 1.06 * scatter * simulatedSurvey.absoluteMagnitudes[
                goodParallaxes].size**(-0.2)
            kde = KernelDensity(bandwidth=bw)
            kde.fit(simulatedSurvey.absoluteMagnitudes[goodParallaxes][:, None])
            samples = np.linspace(
                simulatedSurvey.absoluteMagnitudes[goodParallaxes].min(),
                simulatedSurvey.absoluteMagnitudes[goodParallaxes].max(),
                200)[:, None]
            logdens = kde.score_samples(samples)
            axC.plot(
                samples, np.exp(logdens), '-',
                label=r'$\varpi/\sigma_\varpi\geq{0:.1f}$'.format(plxSnrLim),
                lw=3)
        else:
            axC.hist(
                simulatedSurvey.absoluteMagnitudes[goodParallaxes],
                bins='auto', density=True, histtype='step', lw=3,
                label=r'$\varpi/\sigma_\varpi\geq{0:.1f}$'.format(plxSnrLim))

    axC.set_xlabel("$M$")
    axC.set_ylabel("$p(M)$")
    leg = axC.legend(loc=(0.03, 0.55), handlelength=1.0)
    for t in leg.get_texts():
        t.set_fontsize(14)
    axC.text(0.025, 0.9, 'c', horizontalalignment='center',
             verticalalignment='center', transform=axC.transAxes,
             weight='bold', fontsize=30)

    axD = fig.add_subplot(2, 3, 4)
    apply_tufte(axD, withgrid=False)
    axD.set_prop_cycle(cycler('color', get_distinct(3)))

    axD.plot(simulatedSurvey.trueParallaxesNoLim,
             simulatedSurvey.observedParallaxesNoLim -
             simulatedSurvey.trueParallaxesNoLim,
             'k,', label=r'$m_\mathrm{lim}=\infty$')
    axD.plot(simulatedSurvey.trueParallaxes,
             simulatedSurvey.observedParallaxes -
             simulatedSurvey.trueParallaxes,
             '.', label=r'$m_\mathrm{{lim}}={0}$'.format(
                 simulatedSurvey.apparentMagnitudeLimit))
    axD.plot(simulatedSurvey.trueParallaxes[positiveParallaxes],
             simulatedSurvey.observedParallaxes[positiveParallaxes] -
             simulatedSurvey.trueParallaxes[positiveParallaxes],
             '.', label=r'$\varpi>0$')
    axD.plot(simulatedSurvey.trueParallaxes[goodParallaxes],
             simulatedSurvey.observedParallaxes[goodParallaxes] -
             simulatedSurvey.trueParallaxes[goodParallaxes],
             'o', label=r'$\varpi/\sigma_\varpi\geq{0:.1f}$'.format(plxSnrLim))
    axD.set_xlabel(r"$\varpi_\mathrm{true}$ [mas]")
    axD.set_ylabel(r"$\varpi-\varpi_\mathrm{true}$ [mas]")
    leg = axD.legend(loc='best', handlelength=0.5, ncol=2)
    for t in leg.get_texts():
        t.set_fontsize(14)
    axD.text(0.025, 0.9, 'd', horizontalalignment='center',
             verticalalignment='center', transform=axD.transAxes,
             weight='bold', fontsize=30)

    axE = fig.add_subplot(2, 3, 5)
    apply_tufte(axE, withgrid=False)
    axE.set_prop_cycle(cycler('color', get_distinct(3)))

    axE.plot(simulatedSurvey.trueParallaxesNoLim,
             simulatedSurvey.absoluteMagnitudesNoLim,
             'k,', label=r'$m_\mathrm{lim}=\infty$')
    axE.plot(simulatedSurvey.trueParallaxes,
             simulatedSurvey.absoluteMagnitudes,
             '.', label=r'$m_\mathrm{{lim}}={0}$'.format(
                 simulatedSurvey.apparentMagnitudeLimit))
    axE.plot(simulatedSurvey.trueParallaxes[positiveParallaxes],
             simulatedSurvey.absoluteMagnitudes[positiveParallaxes],
             '.', label=r'$\varpi>0$')
    axE.plot(simulatedSurvey.trueParallaxes[goodParallaxes],
             simulatedSurvey.absoluteMagnitudes[goodParallaxes],
             'o', label=r'$\varpi/\sigma_\varpi\geq{0:.1f}$'.format(plxSnrLim))
    axE.set_xlabel(r"$\varpi_\mathrm{true}$ [mas]")
    axE.set_ylabel(r"$M_\mathrm{true}$")
    axE.axhline(y=simulatedSurvey.meanAbsoluteMagnitude)
    leg = axE.legend(loc='best', handlelength=0.5, ncol=2)
    for t in leg.get_texts():
        t.set_fontsize(14)
    axE.text(0.025, 0.9, 'e', horizontalalignment='center',
             verticalalignment='center', transform=axE.transAxes,
             weight='bold', fontsize=30)

    plt.suptitle(
        "Simulated survey statistics: $N_\\mathrm{{stars}}={0}$, ".format(
            simulatedSurvey.numberOfStars) +
        "$m_\\mathrm{{lim}}={0}$, ".format(
            simulatedSurvey.apparentMagnitudeLimit) +
        "$N_\\mathrm{{survey}}={0}$, ".format(
            simulatedSurvey.numberOfStarsInSurvey) +
        "${0}\\leq\\varpi\\leq{1}$, ".format(simulatedSurvey.minParallax,
                                             simulatedSurvey.maxParallax) +
        "$\\mu_M={0}$, ".format(simulatedSurvey.meanAbsoluteMagnitude) +
        "$\\sigma_M={0:.2f}$".format(simulatedSurvey.stddevAbsoluteMagnitude))

    if pdfFile is not None:
        plt.savefig(pdfFile)
    if pngFile is not None:
        plt.savefig(pngFile)
    if pdfFile is None and pngFile is None:
        plt.show()
def KDE(X, X_plot):
    kde = KernelDensity(kernel='gaussian', bandwidth=0.75).fit(X.reshape(-1, 1))
    log_dens = kde.score_samples(X_plot)
    return log_dens
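# Usage sketch (hedged): note the asymmetry above -- X is reshaped to 2-D
# inside the function, but X_plot must already be 2-D when passed in.
# X = np.random.RandomState(0).randn(100)
# X_plot = np.linspace(-3, 3, 50)[:, np.newaxis]
# log_dens = KDE(X, X_plot)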
def estimate_jensen_shannon_divergence_from_numerical_distribution(
        particles, x_N, y_N, h=0.2, xlimit=[-4, 4], ylimit=[-4, 4],
        grid_N=100, plot=True):
    """
    :param particles:
    :param x_N:
    :param y_N:
    :param h:
    :param xlimit:
    :param ylimit:
    :param grid_N:
    :return:
    """
    # Fit the particles
    kde1 = KernelDensity(kernel='gaussian', bandwidth=h).fit(particles)

    # Create mesh grid
    x_grid_N = grid_N
    x_grid = np.linspace(xlimit[0], xlimit[1], x_grid_N)
    y_grid_N = grid_N
    y_grid = np.linspace(ylimit[0], ylimit[1], y_grid_N)
    x_GH, y_GH = np.meshgrid(x_grid, y_grid)
    xy_grid = np.vstack([x_GH.flatten(), y_GH.flatten()]).T

    log_pdf_kde = kde1.score_samples(xy_grid)
    pdf_kde = np.exp(log_pdf_kde)

    sigma_prior = 1
    sigma_y = 1

    # Compute log prior, this is straightforward
    log_pdf_prior_M = scipy.stats.multivariate_normal.logpdf(
        xy_grid, np.zeros(2), sigma_prior * np.eye(2))
    log_pdf_prior_GH = log_pdf_prior_M.reshape((grid_N, grid_N))

    # Compute log likelihood
    log_pdf_lik_M = np.zeros(xy_grid.shape[0])
    for mm in range(xy_grid.shape[0]):
        a = xy_grid[mm, 0]
        b = xy_grid[mm, 1]
        log_ll = scipy.stats.norm.logpdf(y_N, a * b * x_N, sigma_y)
        log_pdf_lik_M[mm] = np.sum(log_ll)
    log_pdf_lik_GH = log_pdf_lik_M.reshape((grid_N, grid_N))

    # Compute unnormalized log posterior
    log_pdf_post_GH = log_pdf_lik_GH + log_pdf_prior_GH
    log_pdf_post_vector = log_pdf_post_GH.flatten()

    # Compute posterior
    pdf_post_vector = np.exp(log_pdf_post_vector)
    pdf_post_vector = pdf_post_vector / np.sum(pdf_post_vector)

    # Compute Jensen-Shannon divergence
    # JSD(q|p) = 0.5 * KL(q|m) + 0.5 * KL(p|m) where m = 0.5*(p+q)
    q = 0.5 * (pdf_post_vector + pdf_kde)
    jsd = 0.5 * entropy(pdf_post_vector, q) + 0.5 * entropy(pdf_kde, q)

    if plot:
        # Plot data
        plt.plot(x_N, y_N, 'k.')
        plt.xlim([-4, 4])
        plt.ylim([-4, 4])
        plt.title("Data distribution")
        plt.show()

        # Plot contour plots for prior
        make_log_pdf_contour_plot(x_grid, y_grid, log_pdf_prior_GH,
                                  "Prior distribution", -100)

        # Plot contour for likelihood
        make_log_pdf_contour_plot(x_grid, y_grid, log_pdf_lik_GH,
                                  "likelihood", -100)

        # Plot contour plots for posterior
        make_log_pdf_contour_plot(x_grid, y_grid, log_pdf_post_GH,
                                  "posterior", -100)

        # Plot contour plots for KDE
        x_particle = particles[:, 0]
        y_particle = particles[:, 1]
        plt.scatter(x_particle, y_particle)
        plt.title("Particles")
        plt.show()

        make_log_pdf_contour_plot(x_grid, y_grid,
                                  np.reshape(log_pdf_kde, (grid_N, grid_N)),
                                  "kde(particles), jsd = {}".format(jsd),
                                  -100)

    return jsd
def __init__(self, data, bandwidth, kernel):
    self.__kde = KernelDensity(bandwidth=bandwidth, kernel=kernel).fit(data)
def find_optimum_bandwidth(spike_times,
                           bandwidths=10**np.linspace(-1, 1, 100)):
    grid = GridSearchCV(KernelDensity(kernel='gaussian'),
                        {'bandwidth': bandwidths},
                        cv=LeaveOneOut())
    grid.fit(spike_times[:, None])
    bandwidth = grid.best_params_
    return bandwidth['bandwidth']

# bandwidth = find_optimum_bandwidth(spike_times)
# print(bandwidth)
bandwidth = 0.126

spike_times = np.sort(spike_times)

# instantiate and fit the KDE model
kde = KernelDensity(bandwidth=bandwidth, kernel='gaussian')
kde.fit(spike_times[:, None])

# score_samples returns the log of the probability density
logprob = kde.score_samples(spike_times[:, None])

# ax.fill_between(spike_times, np.exp(logprob), alpha=0.5)
ax.plot(spike_times, np.exp(logprob), alpha=1, lw=2, color="k")
pl.savefig("fig.png")
pl.close()
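# Design note (hedged): LeaveOneOut makes the grid search fit one model per
# spike per candidate bandwidth, which gets expensive for long spike trains;
# k-fold cross-validation is a cheaper drop-in alternative, e.g.:
# grid = GridSearchCV(KernelDensity(kernel='gaussian'),
#                     {'bandwidth': bandwidths}, cv=5)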
def set_grid(self, reset=False):
    if not Parallelograms.bandwidth_grid or reset:
        Parallelograms.bandwidth_grid = GridSearchCV(
            KernelDensity(kernel='tophat'),
            {'bandwidth': np.linspace(0.1, 1.0, 100)},
            cv=8)  # 8-fold cross-validation
    return Parallelograms.bandwidth_grid
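# A runnable mini-version of the class-level caching pattern above (the class
# name and data are stand-ins, not the original Parallelograms):
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KernelDensity

class BandwidthCache:
    bandwidth_grid = None   # shared across instances, as in the original

    def set_grid(self, reset=False):
        if not BandwidthCache.bandwidth_grid or reset:
            BandwidthCache.bandwidth_grid = GridSearchCV(
                KernelDensity(kernel='tophat'),
                {'bandwidth': np.linspace(0.1, 1.0, 100)},
                cv=8)
        return BandwidthCache.bandwidth_grid

grid = BandwidthCache().set_grid()
grid.fit(np.random.normal(size=200).reshape(-1, 1))
print(grid.best_estimator_.bandwidth)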
class KDE(BaseOutlierDetector):
    """Outlier detector using Kernel Density Estimation (KDE).

    Parameters
    ----------
    algorithm : str, default 'auto'
        Tree algorithm to use. Valid algorithms are
        ['kd_tree'|'ball_tree'|'auto'].

    atol : float, default 0.0
        Desired absolute tolerance of the result.

    bandwidth : float, default 1.0
        Bandwidth of the kernel.

    breadth_first : bool, default True
        If true, use a breadth-first approach to the problem. Otherwise use a
        depth-first approach.

    contamination : float, default 0.1
        Proportion of outliers in the data set. Used to define the threshold.

    kernel : str, default 'gaussian'
        Kernel to use. Valid kernels are
        ['gaussian'|'tophat'|'epanechnikov'|'exponential'|'linear'|'cosine'].

    leaf_size : int, default 40
        Leaf size of the underlying tree.

    metric : str, default 'euclidean'
        Distance metric to use.

    rtol : float, default 0.0
        Desired relative tolerance of the result.

    metric_params : dict, default None
        Additional parameters to be passed to the requested metric.

    Attributes
    ----------
    anomaly_score_ : array-like of shape (n_samples,)
        Anomaly score for each training sample.

    contamination_ : float
        Actual proportion of outliers in the data set.

    threshold_ : float
        Threshold.

    References
    ----------
    .. [#parzen62] Parzen, E., "On estimation of a probability density
        function and mode," Ann. Math. Statist., 33(3), pp. 1065-1076, 1962.

    Examples
    --------
    >>> import numpy as np
    >>> from kenchi.outlier_detection import KDE
    >>> X = np.array([
    ...     [0., 0.], [1., 1.], [2., 0.], [3., -1.], [4., 0.],
    ...     [5., 1.], [6., 0.], [7., -1.], [8., 0.], [1000., 1.]
    ... ])
    >>> det = KDE()
    >>> det.fit_predict(X)
    array([ 1,  1,  1,  1,  1,  1,  1,  1,  1, -1])
    """

    @property
    def X_(self):
        """array-like of shape (n_samples, n_features): Training data."""
        return self.estimator_.tree_.data

    def __init__(
        self, algorithm='auto', atol=0., bandwidth=1., breadth_first=True,
        contamination=0.1, kernel='gaussian', leaf_size=40,
        metric='euclidean', rtol=0., metric_params=None
    ):
        self.algorithm = algorithm
        self.atol = atol
        self.bandwidth = bandwidth
        self.breadth_first = breadth_first
        self.contamination = contamination
        self.kernel = kernel
        self.leaf_size = leaf_size
        self.metric = metric
        self.rtol = rtol
        self.metric_params = metric_params

    def _check_is_fitted(self):
        super()._check_is_fitted()
        check_is_fitted(self, 'X_')

    def _fit(self, X):
        self.estimator_ = KernelDensity(
            algorithm=self.algorithm,
            atol=self.atol,
            bandwidth=self.bandwidth,
            breadth_first=self.breadth_first,
            kernel=self.kernel,
            leaf_size=self.leaf_size,
            metric=self.metric,
            rtol=self.rtol,
            metric_params=self.metric_params
        ).fit(X)

        return self

    def _anomaly_score(self, X):
        return -self.estimator_.score_samples(X)
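# The detector's core idea, sketched directly with scikit-learn: anomaly
# score = negative KDE log-density, thresholded at a contamination quantile.
# This is an illustrative reduction, not kenchi's actual implementation.
import numpy as np
from sklearn.neighbors import KernelDensity

X = np.array([[0., 0.], [1., 1.], [2., 0.], [3., -1.], [4., 0.],
              [5., 1.], [6., 0.], [7., -1.], [8., 0.], [1000., 1.]])
contamination = 0.1

kde = KernelDensity(kernel='gaussian', bandwidth=1.0).fit(X)
scores = -kde.score_samples(X)
threshold = np.quantile(scores, 1.0 - contamination)
pred = np.where(scores > threshold, -1, 1)   # -1 flags outliers
print(pred)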
kernel = 'gaussian'
bins = np.linspace(-1, 1, 200)
kde_path = './kde_%i_%i_%i_%i.jbl' % (window, overlap, decimation_rate,
                                      spectrum_bins_left)
if os.path.exists(kde_path):
    kdes = joblib.load(kde_path)
else:
    kdes = dict()
    for cls in raw_data_seg:
        kdes[cls] = dict()
        for run_name, run_seg in raw_data_seg[cls].items():
            kdes[cls][run_name] = np.zeros((run_seg.shape[0], bins.shape[0]))
            for i in range(0, run_seg.shape[0]):
                segment = run_seg[i]
                kde = KernelDensity(kernel=kernel, bandwidth=0.5).fit(
                    segment[:, np.newaxis])
                kdes[cls][run_name][i] = np.exp(
                    kde.score_samples(bins[:, np.newaxis]))
    joblib.dump(kdes, kde_path)

from Functions.StatFunctions import KLDiv


def kl_dv_fn(kdes, k):
    kl_foward = {}
    kl_reverse = {}
    for cls in kdes:
        kl_foward[cls] = dict()
        kl_reverse[cls] = dict()
        for run in kdes[cls]:
            run_pdf = kdes[cls][run]
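# KLDiv is a project helper imported from Functions.StatFunctions; a plausible
# discrete KL divergence over the binned pdfs built above might look like this
# sketch (function name and epsilon guard are assumptions):
import numpy as np

def kl_div(p, q, eps=1e-12):
    """KL(p || q) for two pdfs sampled on a common grid of bins."""
    p = np.asarray(p) + eps   # guard against log(0) and division by zero
    q = np.asarray(q) + eps
    p = p / p.sum()           # renormalize the binned densities
    q = q / q.sum()
    return np.sum(p * np.log(p / q))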
def __init__(self, **kwargs):
    self.kde = KernelDensity(**kwargs)
    self.pre_whiten = PCA(whiten=True)
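# Sketch of how this whitened-KDE wrapper is presumably used (the class name
# and method bodies are assumptions): whiten first, then fit and score the KDE
# in the whitened space.
import numpy as np
from sklearn.decomposition import PCA
from sklearn.neighbors import KernelDensity

class WhitenedKDE:
    def __init__(self, **kwargs):
        self.kde = KernelDensity(**kwargs)
        self.pre_whiten = PCA(whiten=True)

    def fit(self, X):
        self.kde.fit(self.pre_whiten.fit_transform(X))
        return self

    def score_samples(self, X):
        # log-density in whitened space; it differs from the original space
        # by the constant log-Jacobian of the PCA whitening transform
        return self.kde.score_samples(self.pre_whiten.transform(X))

model = WhitenedKDE(bandwidth=0.5).fit(np.random.normal(size=(200, 3)))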
while n < N_tr:
    print('Stock: ', count, '/24', ' - Training: ', n + 1, '/', N_tr)

    # %%%%%%%%%%%%%%%%%%%%% TRAINING %%%%%%%%%%%%%%%%%%%%%
    # Input vector
    u_train = X_train[n, :][np.newaxis, :]

    size = np.zeros(len(Set_Dict_tr))
    kullback = np.ones(len(Set_Dict_tr))
    for i in Set_Dict_tr:
        prev_dict_ytr = Set_Dict_tr[str(i)].reshape(-1, 1)
        curr_dict_ytr = np.append(prev_dict_ytr, u_train[0, -1])[:, np.newaxis]
        kde_prev = KernelDensity(kernel='gaussian',
                                 bandwidth=0.05).fit(prev_dict_ytr)
        kde_curr = KernelDensity(kernel='gaussian',
                                 bandwidth=0.05).fit(curr_dict_ytr)
        den_prev = np.exp(kde_prev.score_samples(y_grid[:, None]))
        den_curr = np.exp(kde_curr.score_samples(y_grid[:, None]))
        kullback[int(i) - 1] = entropy(pk=den_prev, qk=den_curr)
        size[int(i) - 1] = np.shape(Set_Dict_tr[str(i)])[0]
        del prev_dict_ytr, curr_dict_ytr, kde_prev, kde_curr, den_prev, den_curr

    entr_near_cluster = np.min(kullback)
    near_cluster = np.argmin(kullback) + 1
def main(classifier, model, X_train, y_train, Y_train, X_test, y_test, Y_test,
         X_test_adv, Bandwidth):
    batch_size = 256
    X_test, X_test_adv, Y_test = get_testing_data(X_test, X_test_adv, y_test,
                                                  classifier)
    uncerts_normal = np.zeros((X_test.shape[0],), dtype=float)
    uncerts_adv = np.zeros((X_test.shape[0],), dtype=float)

    print('Getting deep feature representations...')
    X_train_features = get_deep_representations(model, X_train,
                                                batch_size=batch_size)
    X_test_normal_features = get_deep_representations(model, X_test,
                                                      batch_size=batch_size)
    X_test_adv_features = get_deep_representations(model, X_test_adv,
                                                   batch_size=batch_size)

    class_inds = {}
    for i in range(Y_train.shape[1]):
        class_inds[i] = np.where(Y_train.argmax(axis=1) == i)[0]
    # print('class_inds:', class_inds)

    kdes = {}
    warnings.warn("Using pre-set kernel bandwidths that were determined "
                  "optimal for the specific CNN models of the paper. If you've "
                  "changed your model, you'll need to re-optimize the "
                  "bandwidth.")
    for i in range(Y_train.shape[1]):
        kdes[i] = KernelDensity(kernel='gaussian', bandwidth=Bandwidth) \
            .fit(X_train_features[class_inds[i]])

    preds_test_normal = classifier.predict(X_test)
    preds_test_adv = classifier.predict(X_test_adv)
    preds_test_normal = preds_test_normal.argmax(axis=1)
    preds_test_adv = preds_test_adv.argmax(axis=1)

    densities_normal = score_samples(kdes, X_test_normal_features,
                                     preds_test_normal)
    densities_adv = score_samples(kdes, X_test_adv_features, preds_test_adv)
    # print(densities_adv)

    ## Z-score the uncertainty and density values
    uncerts_normal_z, uncerts_adv_z = normalize(uncerts_normal, uncerts_adv)
    densities_normal_z, densities_adv_z = normalize(densities_normal,
                                                    densities_adv)

    values, labels, lr = train_lr(
        densities_pos=densities_adv_z,
        densities_neg=densities_normal_z,
        uncerts_pos=uncerts_adv_z,
        uncerts_neg=uncerts_normal_z
    )

    ## Evaluate detector
    # Compute logistic regression model predictions
    probs = lr.predict_proba(values)[:, 1]
    # Compute AUC
    n_samples = len(X_test)
    FPR, TPR, auc_score = compute_roc(
        probs_neg=probs[:n_samples],
        probs_pos=probs[n_samples:]
    )
    print('FPR:', FPR)
    print('TPR:', TPR)
    print('auc:', auc_score)
    print('Detector ROC-AUC score: %0.4f' % auc_score)
    print('Total:', n_samples)
    print('Clean:', np.sum(probs[n_samples:] > 0.5))
    print('Adv:', np.sum(probs[:n_samples] < 0.5))
    print('P:', np.sum(probs[n_samples:] > 0.5) / np.sum(probs > 0.5))
    print('R:', np.sum(probs[n_samples:] > 0.5) / probs[:n_samples].shape[0])
    print('Detector ROC-AUC score: %0.4f' % auc_score)
    concat = np.vstack((FPR, TPR))
    return concat
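# score_samples() above is a project helper, not shown here; a minimal serial
# sketch of what it presumably does: score each feature vector under the KDE
# of its predicted class (the helper name and signature are assumptions).
import numpy as np

def score_samples_simple(kdes, features, preds):
    return np.asarray([
        kdes[label].score_samples(feat.reshape(1, -1))[0]
        for feat, label in zip(features, preds)
    ])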
def umi(x, y, k=5, density_estimation_method="kde", k_density=5, bw=.01):
    """Calculates the uniformed mutual information where the distribution for
    :math:`x` is replaced by a uniform distribution.

    `umi` takes two random variables x and y and estimates their mutual
    information with the KSG estimator while x is replaced by a uniform
    distribution.

    Arguments
    ---------
        x: `List`
            One random variable from the time-series data.
        y: `List`
            Another random variable from the time-series data.
        k: `int` (default: 5)
            Number of nearest neighbors used in the entropy calculation.
        density_estimation_method: `str` (default: `kde`)
            Which 2D density estimator to use: `kde` is the kernel density
            estimator, `knn` the knn-based estimator.
        k_density: `int` (default: 5)
            Number of nearest neighbors used when estimating the density
            (only applicable when density_estimation_method is `knn`).
        bw: `float` (default: 0.01)
            Bandwidth used for the kernel density estimator.

    Returns
    -------
        An estimated uniform mutual information value between two variables
        (x, y) where the distribution for x is replaced by a uniform
        distribution.
    """
    assert len(x) == len(y), "Lists should have same length"
    assert k <= len(x) - 1, "Set k smaller than num. samples - 1"
    N = len(x)
    dx = len(x[0])
    dy = len(y[0])
    data = np.concatenate((x, y), axis=1)

    tree_xy = ss.cKDTree(data)
    tree_x = ss.cKDTree(x)
    tree_y = ss.cKDTree(y)

    if density_estimation_method.lower() == "kde":
        kernel = KernelDensity(bandwidth=bw)
        kernel.fit(x)
        kde = np.exp(kernel.score_samples(x))
        weight = (1 / kde) / np.mean(1 / kde)
    elif density_estimation_method.lower() == "knn":
        knn_dis = [
            tree_x.query(point, k_density + 1, p=np.inf)[0][k_density]
            for point in x
        ]
        density_estimate = np.array([
            float(k_density) / N / knn_dis[i]**dx
            for i in range(len(knn_dis))
        ])
        weight = (1 / density_estimate) / np.mean(1 / density_estimate)
    else:
        raise ValueError("The density estimation method is not recognized")

    knn_dis = [tree_xy.query(point, k + 1, p=2)[0][k] for point in data]

    ans = digamma(k) + 2 * log(N - 1) - digamma(N) + vd(dx) + vd(dy) \
        - vd(dx + dy)

    weight_y = np.zeros(N)
    for i in range(N):
        weight_y[i] = sum(
            weight[j]
            for j in tree_y.query_ball_point(y[i], knn_dis[i], p=2)
        ) - weight[i]
    weight_y *= N / np.sum(weight_y)

    for i in range(N):
        nx = len(tree_x.query_ball_point(x[i], knn_dis[i], p=2)) - 1
        ny = sum(
            weight[j]
            for j in tree_y.query_ball_point(y[i], knn_dis[i], p=2)
        ) - weight[i]
        ans += -weight[i] * log(nx) / N
        # ans += -ny * log(ny) / N / (len(tree_y.query_ball_point(y[i], knn_dis[i], p=2)) - 1)
        ans += -weight[i] * log(ny) / N
    return ans
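# The kde branch above builds importance weights proportional to 1/density,
# normalized to mean 1; a self-contained sketch of that step on toy data
# (the sample and bandwidth are assumptions):
import numpy as np
from sklearn.neighbors import KernelDensity

x = np.random.normal(size=(100, 1))
kernel = KernelDensity(bandwidth=0.1).fit(x)
density = np.exp(kernel.score_samples(x))
weight = (1 / density) / np.mean(1 / density)   # mean(weight) == 1
print(weight.mean())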
x1_kgroups = X[np.where(zh_kgroups == 0)][:, np.newaxis]
x2_kgroups = X[np.where(zh_kgroups == 1)][:, np.newaxis]
acc_kgroups = metric.accuracy(z, zh_kgroups)
t.add_row(['kernel k-groups', acc_kgroups])
print(t)

### kernel density estimation for truth
X_plot = np.linspace(low, high, num_points)[:, np.newaxis]
x1_true = X[np.where(z == 0)][:, np.newaxis]
x2_true = X[np.where(z == 1)][:, np.newaxis]

fig = plt.figure()
ax = fig.add_subplot(111)

kde1 = KernelDensity(kernel='gaussian', bandwidth=bw).fit(x1_true)
log_dens1 = kde1.score_samples(X_plot)
kde2 = KernelDensity(kernel='gaussian', bandwidth=bw).fit(x2_true)
log_dens2 = kde2.score_samples(X_plot)
ax.fill_between(X_plot[:, 0], np.exp(log_dens1), alpha=.3, color='k')
ax.plot(X_plot[:, 0], np.exp(log_dens1), color='k', label='truth')
ax.fill_between(X_plot[:, 0], np.exp(log_dens2), alpha=.3, color='k')
ax.plot(X_plot[:, 0], np.exp(log_dens2), color='k')

xs = np.linspace(low, high, num_points)
ax.plot(xs, scipy.stats.norm.pdf(xs, x1_mu_kmeans, np.sqrt(x1_var_kmeans)),
        label="%s" % (methods[0]), color=colors[0])
ax.plot(xs, scipy.stats.norm.pdf(xs, x2_mu_kmeans, np.sqrt(x2_var_kmeans)),
X_plot = np.linspace(-5, 10, 1000)[:, np.newaxis]
bins = np.linspace(-5, 10, 10)

fig, ax = plt.subplots(2, 2, sharex=True, sharey=True)
fig.subplots_adjust(hspace=0.05, wspace=0.05)

# histogram 1
ax[0, 0].hist(X[:, 0], bins=bins, fc='#AAAAFF', **density_param)
ax[0, 0].text(-3.5, 0.31, "Histogram")

# histogram 2
ax[0, 1].hist(X[:, 0], bins=bins + 0.75, fc='#AAAAFF', **density_param)
ax[0, 1].text(-3.5, 0.31, "Histogram, bins shifted")

# tophat KDE
kde = KernelDensity(kernel='tophat', bandwidth=0.75).fit(X)
log_dens = kde.score_samples(X_plot)
ax[1, 0].fill(X_plot[:, 0], np.exp(log_dens), fc='#AAAAFF')
ax[1, 0].text(-3.5, 0.31, "Tophat Kernel Density")

# Gaussian KDE
kde = KernelDensity(kernel='gaussian', bandwidth=0.75).fit(X)
log_dens = kde.score_samples(X_plot)
ax[1, 1].fill(X_plot[:, 0], np.exp(log_dens), fc='#AAAAFF')
ax[1, 1].text(-3.5, 0.31, "Gaussian Kernel Density")

for axi in ax.ravel():
    axi.plot(X[:, 0], np.full(X.shape[0], -0.01), '+k')
    axi.set_xlim(-4, 9)
    axi.set_ylim(-0.02, 0.34)
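# X and density_param are defined earlier in the script this excerpt comes
# from; a plausible setup, matching the scikit-learn 1-D KDE example this
# snippet mirrors (the bimodal mixture is an assumption):
import numpy as np

np.random.seed(1)
N = 100
X = np.concatenate((np.random.normal(0, 1, int(0.3 * N)),
                    np.random.normal(5, 1, int(0.7 * N))))[:, np.newaxis]
density_param = {'density': True}   # normalize the histograms to unit area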
fnames = ['GlobalTemp_1.txt', 'GlobalTemp_2.txt']
data = load_data(fnames)

# Sanity check
print(data['GlobalTemp_1.txt'][6, 0])  # Should be 0.74
print(data['GlobalTemp_2.txt'][6, 0])  # Should be 1.07

# Remove -99.99 from row 8
data_row8_1997 = remove_99(data['GlobalTemp_1.txt'][6, None])
data_row8_2017 = remove_99(data['GlobalTemp_2.txt'][6, None])

x1 = np.linspace(-2, 4, 1000)
x2 = np.linspace(-2, 4, 1000)
kde1 = KernelDensity(kernel='epanechnikov',
                     bandwidth=0.4).fit(data_row8_1997[:, None])
kde2 = KernelDensity(kernel='epanechnikov',
                     bandwidth=0.4).fit(data_row8_2017[:, None])


def f_kde1(x):
    return np.exp(kde1.score_samples([[x]]))


def f_kde2(x):
    return np.exp(kde2.score_samples([[x]]))


# Note: score_samples() returns log(probability density)
p1 = np.exp(kde1.score_samples(x1[:, None]))
p2 = np.exp(kde2.score_samples(x2[:, None]))
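# A quick sanity check one could add after the snippet above: a proper density
# estimate should integrate to roughly 1 over a grid that covers the data
# (this reuses p1, p2, x1, x2 from above; the check itself is an addition):
import numpy as np

area1 = np.trapz(p1, x1)
area2 = np.trapz(p2, x2)
print(area1, area2)   # both should be close to 1.0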
def window_analysis(Windows, ref_labels, labels1, Chr=1, ncomp=4, amova=True,
                    supervised=True, include_who=[],
                    range_sample=[130, 600], rand_sample=0, clsize=15,
                    cl_freqs=5, Bandwidth_split=20, quantile=0.1,
                    centre_d=True, PC_sel=0):

    kde_class_labels = labels1
    kde_label_dict = {
        z: [x for x in range(len(kde_class_labels)) if kde_class_labels[x] == z]
        for z in list(set(kde_class_labels))
    }

    if include_who:
        include = [
            x for x in range(len(kde_class_labels))
            if kde_class_labels[x] in include_who
        ]
        ref_labels = include_who
        kde_class_labels = [kde_class_labels[x] for x in include]
        kde_label_dict = {
            z: [
                x for x in range(len(kde_class_labels))
                if kde_class_labels[x] == z
            ]
            for z in include_who
        }

    if rand_sample:
        sample = rand_sample
        sample_range = [0, sample]
        Freq_extract = {
            Chr: {
                bl: Windows[Chr][bl]
                for bl in np.random.choice(
                    list(Windows[Chr].keys()), sample, replace=True)
            }
        }

    if range_sample:
        sample_range = range_sample
        Freq_extract = {
            Chr: {
                bl: Windows[Chr][bl]
                for bl in list(sorted(Windows[Chr].keys()))
                [sample_range[0]:sample_range[1]]
            }
        }

    Results = {'header': ['Chr', 'window'], 'info': [], 'coords': []}
    Frequencies = {'header': ['Chr', 'window', 'cl'], 'coords': [], 'info': []}
    Construct = {'header': ['Chr', 'window', 'cl'], 'coords': [], 'info': []}
    PC_var = {'header': ['Chr', 'window'], 'coords': [], 'info': []}

    pc_density = []
    pc_coords = []
    sim_fst = []

    for c in Freq_extract[Chr].keys():
        Sequences = Windows[Chr][c]

        if Sequences.shape[1] <= 3:
            Results[Chr][c] = [0, 0]
            print('hi')
            continue

        Sequences = np.nan_to_num(Sequences)

        pca = PCA(n_components=ncomp, whiten=False,
                  svd_solver='randomized').fit(Sequences)
        data = pca.transform(Sequences)

        from sklearn.preprocessing import scale

        if include_who:
            data = data[include, :]

        ##### PC density
        PC = PC_sel
        pc_places = data[:, PC]

        if centre_d:
            pc_places = scale(pc_places, with_std=False)

        X_plot = np.linspace(-8, 8, 100)

        Focus_labels = list(range(data.shape[0]))

        bandwidth_pc = estimate_bandwidth(pc_places.reshape(-1, 1),
                                          quantile=quantile,
                                          n_samples=len(pc_places))
        if bandwidth_pc <= 1e-3:
            bandwidth_pc = 0.01

        bandwidth = estimate_bandwidth(data, quantile=quantile,
                                       n_samples=len(Focus_labels))
        if bandwidth <= 1e-3:
            bandwidth = 0.01

        kde = KernelDensity(kernel='gaussian', bandwidth=bandwidth_pc).fit(
            np.array(pc_places).reshape(-1, 1))
        log_dens = kde.score_samples(X_plot.reshape(-1, 1))

        pc_density.append(np.exp(log_dens))
        pc_coords.append(pc_places)

        PC_var['coords'].append([Chr, c])
        PC_var['info'].append([x for x in pca.explained_variance_])
        ###

        params = {
            'bandwidth': np.linspace(np.min(data), np.max(data),
                                     Bandwidth_split)
        }
        grid = GridSearchCV(KernelDensity(algorithm="ball_tree",
                                          breadth_first=False),
                            params, verbose=0)

        ######################################
        ####### TEST global Likelihood #######
        ######################################

        #### Mean Shift approach
        ## from sklearn.cluster import MeanShift, estimate_bandwidth

        ms = MeanShift(bandwidth=bandwidth, cluster_all=False,
                       min_bin_freq=clsize)
        ms.fit(data[Focus_labels, :])
        labels = ms.labels_

        Tree = {
            x: [Focus_labels[y] for y in range(len(labels)) if labels[y] == x]
            for x in [g for g in list(set(labels)) if g != -1]
        }
        Keep = [x for x in Tree.keys() if len(Tree[x]) > clsize]

        Tree = {x: Tree[x] for x in Keep}
        Ngps = len(Tree)
        SpaceX = {x: data[Tree[x], :] for x in Tree.keys()}

        these_freqs = []

        ### Extract MScluster likelihood by sample
        for hill in SpaceX.keys():
            if len(Tree[hill]) >= cl_freqs:
                if supervised == False:
                    print('hi')
                    cl_seqs = Sequences[Tree[hill], :]

                    freq_vector = [
                        float(x) / (cl_seqs.shape[0] * 2)
                        for x in np.sum(cl_seqs, axis=0)
                    ]

                    Frequencies['coords'].append([Chr, c, hill])
                    Frequencies['info'].append(freq_vector)
                    these_freqs.append(freq_vector)

            grid.fit(data[Tree[hill], :])

            # use the best estimator to compute the kernel density estimate
            kde = grid.best_estimator_

            P_dist = kde.score_samples(data[Tree[hill], :])
            Dist = kde.score_samples(data)
            P_dist = np.nan_to_num(P_dist)
            Dist = np.nan_to_num(Dist)
            if np.std(P_dist) == 0:
                Dist = np.array(
                    [int(Dist[x] in P_dist) for x in range(len(Dist))])
            else:
                Dist = scipy.stats.norm(np.mean(P_dist),
                                        np.std(P_dist)).cdf(Dist)
            Dist = np.nan_to_num(Dist)

            Construct['coords'].append([Chr, c, hill])
            Construct['info'].append(Dist)

        #########################################
        ############# AMOVA ################
        #########################################

        if supervised:
            labels = [x for x in kde_class_labels if x in ref_labels]
            Who = [
                z for z in it.chain(*[kde_label_dict[x] for x in ref_labels])
            ]
            Ngps = len(ref_labels)
            # print(ref_labels)

            for hill in ref_labels:
                if len(kde_label_dict[hill]) >= cl_freqs:
                    if include_who:
                        Seq_specific = Sequences[include, :]

                    cl_seqs = Seq_specific[kde_label_dict[hill], :]

                    freq_vector = [
                        float(x) / (cl_seqs.shape[0] * 2)
                        for x in np.sum(cl_seqs, axis=0)
                    ]

                    Frequencies['coords'].append([Chr, c, hill])
                    Frequencies['info'].append(freq_vector)
                    these_freqs.append(freq_vector)
        else:
            Who = [
                x for x in range(len(labels))
                if labels[x] != -1 and labels[x] in Keep
            ]
            labels = [labels[x] for x in Who]
            Who = [Focus_labels[x] for x in Who]

        #
        if len(these_freqs) > 1:
            Pairwise = return_fsts2(np.array(these_freqs))
            sim_fst.extend(Pairwise.fst)

        if len(list(set(labels))) == 1:
            Results['info'].append([Chr, c, 0, 1])
            # Results['info'].append([AMOVA, Ngps])
            continue

        if amova:
            clear_output()
            AMOVA, Cig = AMOVA_FM42(data[Who, :], labels, n_boot=0,
                                    metric='euclidean')
            print('counting: {}, Ngps: {}'.format(AMOVA, Ngps))
            Results['info'].append([Chr, c, AMOVA, Ngps])

    Results['info'] = pd.DataFrame(
        np.array(Results['info']),
        columns=['chrom', 'window', 'AMOVA', 'Ngps'])

    if len(sim_fst) > 3:
        X_plot = np.linspace(0, .3, 100)

        freq_kde = KernelDensity(kernel='gaussian', bandwidth=0.02).fit(
            np.array(sim_fst).reshape(-1, 1))
        log_dens = freq_kde.score_samples(X_plot.reshape(-1, 1))

        fig_roost_dens = [
            go.Scatter(x=X_plot, y=np.exp(log_dens), mode='lines',
                       fill='tozeroy', name='',
                       line=dict(color='blue', width=2))
        ]
        ##

        layout = go.Layout(
            title='allele frequency distribution across clusters',
            yaxis=dict(title='density'),
            xaxis=dict(title='fst'))

        fig = go.Figure(data=fig_roost_dens, layout=layout)
    else:
        fig = []

    return Frequencies, sim_fst, Results, Construct, pc_density, pc_coords, fig
class Watershed:
    """Watershed segmentation for two-dimensional data.

    Performs probability density estimation of the data, then applies
    watershed to segment it into discrete regions. Basically a wrapper on
    sklearn to manage preprocessing and store data.

    Args:
        prune_outliers (bool): To prune or not to prune.
        outlier_neighbors (int): Number of neighbors to use.
        outlier_threshold (float): Outlier threshold.
        bandwidth (float): KDE bandwidth.
        ngrid (int): KDE grid resolution.
        ngrid_pad (float): Grid inset padding.
        peak_min_distance (int): Minimum number of pixels separating peaks.
        peak_threshold_rel (float): Minimum relative intensity of peaks.
        peak_dialation (float): Peak dilation factor.
        compactness (float): Compactness factor for compact watershed.

    Attributes:
        lof_: LocalOutlierFactor.
        kde_: KernelDensity estimator.
        Z_crop_: Z cropped for outliers.
        Z_norm_: Z normalized and inset.
        Z_labels_ (np array, shape (n_samples,)): Segment labels.
        P_ (np array, shape (ngrid, ngrid)): Probability density map.
        P_peaks_ (np array, shape (ngrid, ngrid)): KDE peaks.
        P_edt_ (np array, shape (ngrid, ngrid)): Euclidean distance transform.
        P_labels_ (np array, shape (ngrid, ngrid)): KDE labels.
        P_bounds_ (np array, shape (ngrid, ngrid)): KDE bounds.
        peaks_ (np array, shape (num_peaks, 2)): List of KDE peaks.
    """

    def __init__(
        self,
        prune_outliers=False,
        outlier_neighbors=1,
        outlier_threshold=0.7,
        bandwidth=1.0 / 40,
        ngrid=600,
        ngrid_pad=0.07,
        peak_min_distance=10,
        peak_threshold_rel=0.1,
        peak_dialation=2,
        compactness=0.01,
    ):
        self.prune_outliers = prune_outliers
        self.outlier_neighbors = outlier_neighbors
        self.outlier_threshold = outlier_threshold
        self.bandwidth = bandwidth
        self.ngrid = ngrid
        self.ngrid_pad = ngrid_pad
        self.peak_min_distance = peak_min_distance
        self.peak_threshold_rel = peak_threshold_rel
        self.peak_dialation = peak_dialation
        self.compactness = compactness

    def segment(self, Z, verbose=True):
        """Fit the model using Z as the data to be segmented.

        Args:
            Z (np array, shape (n_samples, 2)): Data to be segmented.
            verbose (bool): Verbosity.

        Returns:
            labels (np array, shape (n_samples,)): Segment label for each
                sample.
        """
        if verbose:
            print('Segmenting regions using watershed...')
            print('- num samples: {}'.format(len(Z)))

        # outliers
        if self.prune_outliers:
            if verbose:
                print('- pruning outliers')
            self.lof_ = LocalOutlierFactor(n_neighbors=self.outlier_neighbors,
                                           contamination=0.1)
            lof_pred = self.lof_.fit_predict(Z)
            lof_scores = self.lof_.negative_outlier_factor_
            lof_scores = minmax_scale(lof_scores)
            self.Z_crop_ = Z[lof_scores > self.outlier_threshold]
            self.Z_left_ = np.where(lof_scores > self.outlier_threshold)
            num_outliers = Z.shape[0] - self.Z_crop_.shape[0]
            print('-> outliers pruned: {}'.format(num_outliers))
        else:
            self.Z_crop_ = Z

        # normalize Z and inset
        self.Z_norm_ = minmax_scale(
            self.Z_crop_,
            feature_range=(0 + self.ngrid_pad, 1 - self.ngrid_pad),
            axis=0,
        )

        # estimate probability density using a Gaussian kernel
        if verbose:
            print('- performing KDE')
        self.kde_ = KernelDensity(kernel='gaussian',
                                  bandwidth=self.bandwidth).fit(self.Z_norm_)

        # convert density estimate to an image of probs and normalize
        if verbose:
            print('- scoring KDE')
        x, y = np.meshgrid(np.linspace(0, 1, self.ngrid),
                           np.linspace(0, 1, self.ngrid))
        log_dens = self.kde_.score_samples(
            np.array((x.flatten(), y.flatten())).T)
        self.P_ = np.reshape(log_dens, (self.ngrid, self.ngrid))
        self.P_ = np.exp(self.P_) / np.max(np.exp(self.P_))

        # find peaks
        if verbose:
            print('- finding peaks')
        self.peaks_ = peak_local_max(
            self.P_,
            min_distance=self.peak_min_distance,
            threshold_rel=self.peak_threshold_rel,
            exclude_border=False,
        )

        # convert peaks to image and dilate
        self.P_peaks_ = np.ones_like(self.P_)
        for peak in self.peaks_:
            for i in range(-self.peak_dialation, self.peak_dialation + 1):
                for j in range(-self.peak_dialation, self.peak_dialation + 1):
                    self.P_peaks_[(peak[0] + i, peak[1] + j)] = 0

        # euclidean distance transform
        if verbose:
            print('- computing edt')
        self.P_edt_ = ndi.distance_transform_edt(self.P_peaks_)

        # perform watershed on edt
        if verbose:
            print('- performing watershed on edt')
        markers = ndi.label(1 - self.P_peaks_)[0]  # use peaks as seed markers
        self.P_labels_ = watershed(self.P_edt_, markers,
                                   compactness=self.compactness)

        # find boundaries
        if verbose:
            print('- finding boundaries')
        self.P_bounds_ = find_boundaries(self.P_labels_)

        # find labels for Zs (swap axes: grid rows index y, columns index x)
        indices = np.round(self.Z_norm_ * self.ngrid).astype(int)
        self.Z_labels_ = self.P_labels_[indices[:, 1], indices[:, 0]]

        if verbose:
            print('-> num regions found: {}'.format(len(self.peaks_)))

        return self.Z_labels_