def __init__(self, data, bin_num=None, shift_num=50, normed=True,
             force_scott=False, rule='scott'):
    """Store the sample, choose a bandwidth/binning and compute a KDE curve.

    Parameters
    ----------
    data : sequence of numbers
        The sample. The fallback branch calls ``data.std()``, so an object
        providing that method (e.g. a NumPy array) is required there.
    bin_num : int or None, optional
        Explicit number of bins. When None the bandwidth is chosen
        automatically (see below).
    shift_num : int, optional
        Stored on the instance as ``shift_num``; not used in this method.
    normed : bool, optional
        Stored on the instance as ``normed``; not used in this method.
    force_scott : bool, optional
        When True, skip the ``kde`` based bandwidth selection and use
        ``rule`` with ``scipy.stats.gaussian_kde`` instead.
    rule : str, optional
        Bandwidth rule handed to ``gaussian_kde.set_bandwidth`` in the
        fallback branch.

    Notes
    -----
    ``self.ash_mesh`` and (in the ``bin_num`` branch) ``self.bw`` are read
    here but assigned elsewhere -- presumably by ``bins_from_bw`` and
    ``set_bins``; confirm against those methods.
    """
    self.data_min = min(data)
    self.data_max = max(data)
    self.shift_num = shift_num
    self.data = data
    self.data_len = len(self.data)
    self.normed = normed

    if bin_num is None:
        # Automatic binning. Only run the ``kde`` bandwidth search when its
        # result can actually be used; previously it was computed
        # unconditionally and thrown away for small or force_scott inputs.
        use_kde = len(self.data) >= 40 and not force_scott
        kde_result = kde(self.data) if use_kde else None

        if kde_result:
            self.bw, self.kde_mesh, self.kde_den = kde_result
            self.bins_from_bw()
            # Re-evaluate the density on the range spanned by the ASH mesh.
            self.bw2, self.kde_mesh, self.kde_den = kde(
                self.data, None, self.ash_mesh.min(), self.ash_mesh.max())
        else:
            print("Using Scott's rule")
            kernel = stats.gaussian_kde(self.data)
            kernel.set_bandwidth(rule)
            # kde factor is bandwidth scaled by sigma
            self.bw = kernel.factor * self.data.std()
            self.bins_from_bw()
            self.kde_mesh = self.ash_mesh
            self.kde_den = kernel(self.kde_mesh)
    else:
        print("Using bin number: ", bin_num)
        self.set_bins(bin_num)
        kernel = stats.gaussian_kde(self.data)
        # NOTE(review): gaussian_kde treats a scalar passed here as its
        # ``factor`` (a multiplier of the data's standard deviation), while
        # the branch above stores ``factor * std`` in ``self.bw``. Confirm
        # which unit ``set_bins`` leaves in ``self.bw``.
        kernel.set_bandwidth(self.bw)
        self.kde_mesh = self.ash_mesh
        self.kde_den = kernel(self.kde_mesh)
def requirement1() :
    """Plot histogram, KDE and KNN estimates for several sample sizes.

    Three figures are produced, one per estimator. Each one overlays the
    true distribution with the estimate obtained from 100, 500, 1000 and
    10000 samples, is saved as ``req1-1`` / ``req1-2`` / ``req1-3`` and
    then shown. Reads the module-level ``min_range`` and ``max_range``.
    """
    global min_range, max_range

    sample_sizes = [100, 500, 1000, 10000]
    bin_count = 100
    bandwidth = 0.1
    neighbours = 10
    grid = np.linspace(min_range, max_range, 200)

    def begin_panel():
        # Every panel starts by drawing a 200-point sample and plotting the
        # reference curve; the sample is returned for the panel that uses it.
        first_sample = get_data(200)
        plot_true_distribution(1000)
        return first_sample, ['True distribution']

    def end_panel(labels, title, filename, y_limits=None):
        plt.legend(labels)
        if y_limits is not None:
            plt.ylim(y_limits)
        plt.title(title)
        plt.savefig(filename, dpi=300)
        plt.show()

    # Histogram as example (drawn on the current figure).
    _, labels = begin_panel()
    for size in sample_sizes:
        plt.hist(get_data(size), density=True, bins=bin_count, alpha=0.4)
        labels.append('#bin = {}, #data = {}'.format(bin_count, size))
    end_panel(labels, 'Requirement 1-1', 'req1-1')

    # KDE as example.
    plt.figure()
    first_sample, labels = begin_panel()
    kde(first_sample)  # result unused, call kept from the original flow
    for size in sample_sizes:
        estimator = kde(get_data(size))
        estimator.set_bandwidth(bandwidth)
        plt.plot(grid, estimator(grid))
        labels.append('h = {}, #data = {}'.format(bandwidth, size))
    end_panel(labels, 'Requirement 1-2', 'req1-2')

    # KNN as example.
    plt.figure()
    _, labels = begin_panel()
    for size in sample_sizes:
        estimator = knn(get_data(size), neighbours)
        plt.plot(grid, estimator(grid))
        labels.append('k = {}, #data = {}'.format(neighbours, size))
    end_panel(labels, 'Requirement 1-3', 'req1-3', y_limits=[0, 0.4])
def __init__(self, data, bin_num=None, shift_num=50, density=True,
             force_scott=False, rule='scott', weights=None):
    """Store the sample, choose a bandwidth/binning and compute a KDE curve.

    Parameters
    ----------
    data : sequence of numbers
        The sample. The Scott branch calls ``data.std()``, so an object
        providing that method (e.g. a NumPy array) is required there.
    bin_num : int or None, optional
        Explicit number of bins. When None the bandwidth is chosen
        automatically: first via ``kde`` (50 or more points, not
        ``force_scott``, and ``kde`` returned something truthy), otherwise
        via the Freedman-Diaconis bin width when ``rule == 'fd'``,
        otherwise via ``gaussian_kde.set_bandwidth(rule)``.
    shift_num : int, optional
        Stored on the instance as ``shift_num``; not used in this method.
    density : bool, optional
        Stored on the instance as ``density``; not used in this method.
    force_scott : bool, optional
        When True, skip the ``kde`` based bandwidth selection.
    rule : str, optional
        ``'fd'`` selects the Freedman-Diaconis branch; any other value is
        handed to ``gaussian_kde.set_bandwidth``.
    weights : optional
        Stored on the instance as ``weights``. NOTE(review): it is not
        passed to any of the estimators built here -- confirm that is
        intended.

    Notes
    -----
    ``self.ash_mesh`` and, outside the Scott and ``kde`` branches,
    ``self.bw`` are read here but assigned elsewhere -- presumably by
    ``_bins_from_bw``, ``bw_from_bin_width`` and ``set_bins``; confirm
    against those methods.
    """
    self.data_min = min(data)
    self.data_max = max(data)
    self.shift_num = shift_num
    self.data = data
    self.data_len = len(self.data)
    self.density = density
    self.weights = weights

    if bin_num is None:
        # Automatic binning. Only run the ``kde`` bandwidth search when its
        # result can actually be used; previously it was computed
        # unconditionally and thrown away for small or force_scott inputs.
        use_kde = len(self.data) >= 50 and not force_scott
        kde_result = kde(self.data) if use_kde else None

        if kde_result:
            self.bw, self.kde_mesh, self.kde_den = kde_result
            self._bins_from_bw()
            # Re-evaluate the density on the range spanned by the ASH mesh.
            self.bw2, self.kde_mesh, self.kde_den = kde(
                self.data, None, self.ash_mesh.min(), self.ash_mesh.max())
        elif rule == 'fd':
            print("Using FD rule")
            kernel = stats.gaussian_kde(self.data)
            # Freedman-Diaconis bin width: 2 * IQR / n^(1/3).
            self.bin_width = 2 * (stats.iqr(self.data)
                                  / (len(self.data) ** (1 / 3)))
            self.bw_from_bin_width()
            # NOTE(review): a scalar passed to set_bandwidth becomes the
            # kernel ``factor`` (a multiplier of the data's standard
            # deviation); confirm ``self.bw`` is expressed in that unit.
            kernel.set_bandwidth(self.bw)
            self._bins_from_bw()
            self.kde_mesh = self.ash_mesh
            self.kde_den = kernel(self.kde_mesh)
        else:
            print("Using Scott's rule")
            kernel = stats.gaussian_kde(self.data)
            kernel.set_bandwidth(rule)
            # kde factor is bandwidth scaled by sigma
            self.bw = kernel.factor * self.data.std()
            self._bins_from_bw()
            self.kde_mesh = self.ash_mesh
            self.kde_den = kernel(self.kde_mesh)
    else:
        print("Using bin number: ", bin_num)
        self.set_bins(bin_num)
        kernel = stats.gaussian_kde(self.data)
        # NOTE(review): same unit question as in the FD branch above.
        kernel.set_bandwidth(self.bw)
        self.kde_mesh = self.ash_mesh
        self.kde_den = kernel(self.kde_mesh)
def requirement3() :
    """Overlay KDE curves for bandwidths 0.1, 1 and 2 on the true density.

    A single 200-point sample is drawn once and the same estimator object
    is re-used with each bandwidth. Reads the module-level ``min_range``
    and ``max_range``.
    """
    global min_range, max_range

    sample = get_data(200)
    grid = np.linspace(min_range, max_range, 200)
    estimator = kde(sample)

    plot_true_distribution(1000)
    labels = ['True distribution']

    # KDE with different h
    for bandwidth in (0.1, 1, 2):
        estimator.set_bandwidth(bandwidth)
        plt.plot(grid, estimator(grid))
        labels.append('h = {}'.format(bandwidth))

    plt.legend(labels)
    plt.title('Requirement 3')
    plt.show()
# Comment out to get Cross-validation KDE and Variable KDE '''
def get_bandwidth_fxn(data,period=None,nneighb=None,epses=2.**np.arange(-40,41),beta='-1/d',d=None):
    r"""
    Constructs a bandwidth function for a given dataset.

    Performs a kernel density estimate q_\epsilon, and sets the bandwidth
    to q_\epsilon^beta.

    Parameters
    ----------
    data : 2D array-like
        Two-dimensional dataset; only ``len(data)`` is used directly here,
        the rest is forwarded to ``kde.kde``.
    period : 1D array-like or float, optional
        Period of the coordinate, e.g. 360 for an angle in degrees. If None,
        all coordinates are taken to be aperiodic. If scalar, assumed to be
        period of each coordinate. If 1D array-like with each value a scalar
        or None, each coordinate has periodicity of that size.
    nneighb : int or None, optional
        Number of neighbors to include in the density estimate. Default is
        None, which corresponds to using all neighbors.
    epses : 1D array-like, optional
        Forwarded unchanged to ``kde.kde`` as ``epses`` (presumably the
        candidate bandwidths scanned by the estimator -- confirm against
        ``kde.kde``). Default is ``2.**np.arange(-40, 41)``.
    beta : float or string, optional
        Exponent applied to the density estimate. ``0`` or ``'0'`` selects a
        uniform bandwidth and skips the density estimate entirely. Other
        values are passed through ``_eval_param`` together with ``d``, which
        evaluates string expressions and converts them to float. Default
        is ``'-1/d'``.
    d : int or None, optional
        Dimension of the system. If None, the dimension estimate returned
        by ``kde.kde`` is used.

    Returns
    -------
    rho : 1d array
        The estimated bandwidth function: all ones in the uniform case,
        ``q**beta`` otherwise.
    d : int or None
        The dimension that was used; None in the uniform case, where no
        estimate is made.
    """
    N = len(data)
    if ((beta == 0) or (beta == '0')):
        # Handle uniform bandwidth case.
        return np.ones(N),None

    # Use q^beta as bandwidth, where q is an estimate of the density.
    # The third value returned by kde.kde is not needed here.
    q,d_est,_ = kde.kde(data,epses=epses,period=period,nneighb=nneighb,d=d)
    if d is None:
        d = d_est

    # If beta parameter is an expression, evaluate it and convert to float
    beta = _eval_param(beta,d)
    return q**beta,d
def plot_kde_uncertainty(data, n_resamples=1000, x_resolution=1000, significance=0.05, palette=sns.color_palette()):
    """
    Bootstrap a confidence interval for the KDE of the provided dataset,
    and plot along with the KDE.

    Parameters
    ----------
    data : sequence of numbers
        The sample; its min and max define the plotted range.
    n_resamples : int
        Number of bootstrap resamples; must be at least 100.
    x_resolution : int
        Number of grid points between ``min(data)`` and ``max(data)``.
    significance : float
        Significance level alpha; the dashed curves are the pointwise
        ``alpha / 2`` and ``1 - alpha / 2`` bootstrap percentiles.
    palette : sequence of colors
        Only ``palette[0]`` is used. The default is evaluated once, when
        the function is defined.

    Raises
    ------
    AssertionError
        If ``n_resamples < 100`` or a requested percentile lies outside
        ``[0, 1]``.
    """
    assert n_resamples >= 100

    x_grid = np.linspace(min(data), max(data), x_resolution)
    orig_kde = kde(data, x_grid)

    resampled_kdes = np.zeros((n_resamples, x_resolution))
    for i in range(n_resamples):
        resample = sample_with_replacement(data)
        resampled_kdes[i] = kde(resample, x_grid)

    # Sort each grid column independently so that row k holds the k-th
    # smallest bootstrap density at every x (pointwise percentiles).
    resampled_kdes.sort(axis=0)

    def percentile_index(percentile, N):
        """
        Find the index of the x'th percentile in a sorted collection of
        size N.
        """
        assert 0 <= percentile <= 1
        # Rounding can yield N itself (e.g. a percentile close to 1), which
        # is one past the last valid row, so clamp to the final index.
        return min(int(np.round(percentile * N)), N - 1)

    def ci_index(alpha, N):
        """
        Find the indices in a sorted collection of size N of the two bounds
        of a confidence interval of significance level alpha.
        """
        return (percentile_index(alpha / 2., N),
                percentile_index((1. - alpha / 2.), N))

    i_lower, i_higher = ci_index(significance, n_resamples)

    plt.plot(x_grid, resampled_kdes[i_lower], '--', color=palette[0])
    plt.plot(x_grid, resampled_kdes[i_higher], '--', color=palette[0])
    plt.plot(x_grid, orig_kde, '-', color=palette[0])
    sns.despine()
def find_Lines(self):
    """Detect coins in every image and draw a line through each cluster.

    Images are expected in ``self.myPath`` and named
    ``Coins_img (1).jpg`` ... ``Coins_img (n).jpg``, where ``n`` is the
    number of ``*.jpg`` files found in that directory. For each image the
    Hough circle transform locates the coins; their centers are grouped by
    ``kde`` on the x coordinate and ``self.make_clustered_array``, and for
    each group a line is drawn from the center with the smallest y to the
    one with the largest y. Every result is shown in a window titled
    "Vertical Lines" until a key is pressed.

    Returns
    -------
    None
    """
    # Local import so this method does not rely on a module-level ``os``.
    import os

    # glob.escape keeps special characters in the directory name literal.
    jpg_pattern = os.path.join(glob.escape(self.myPath), "*.jpg")
    No_of_images = len(glob.glob(jpg_pattern))

    for number in range(1, No_of_images + 1):
        # os.path.join replaces the hard-coded "\" separator (which was also
        # an invalid string escape) so the path works on every platform.
        image_path = os.path.join(self.myPath,
                                  "Coins_img ({}).jpg".format(number))
        oriimg = cv2.imread(image_path)
        if oriimg is None:
            # cv2.imread signals an unreadable or missing file with None.
            print("Could not read image: {}".format(image_path))
            continue

        newimg = self.resize_img(oriimg)
        gray = cv2.cvtColor(newimg, cv2.COLOR_BGR2GRAY)
        gray = cv2.bilateralFilter(gray, 13, 60, 60)
        # Colour copy of the filtered image to draw the annotations on.
        output = cv2.cvtColor(gray, cv2.COLOR_GRAY2BGR)

        Circles = cv2.HoughCircles(gray, cv2.HOUGH_GRADIENT, 1, 40,
                                   param1=90, param2=30,
                                   minRadius=0, maxRadius=0)

        if Circles is None:
            # Nothing detected: show the unannotated image.
            cv2.imshow("Vertical Lines", output)
            cv2.waitKey(0)
            cv2.destroyAllWindows()
        else:
            Detected = np.uint16(np.around(Circles))
            centers = []
            for x, y, r in Detected[0, :]:
                cv2.circle(output, (x, y), r, (0, 255, 255), 3)
                cv2.circle(output, (x, y), 2, (255, 0, 0), 2)
                centers.append((x, y))

            X_axis_points = [center[0] for center in centers]
            # If the model isn't being accurate try tweaking the bdw
            clusters = kde(X_axis_points, bdw=3.5)
            line_clusters = self.make_clustered_array(centers, clusters)

            for cluster in line_clusters:
                cluster = np.array(cluster)
                X_1 = np.argmin(cluster[:, 1])
                X_2 = np.argmax(cluster[:, 1])
                cv2.line(output, tuple(cluster[X_1]), tuple(cluster[X_2]),
                         (0, 0, 0), 3)

            cv2.imshow("Vertical Lines", output)
            plt.show()
            cv2.waitKey(0)
            cv2.destroyAllWindows()

    return None
def test_kde(self):
    """Check the student's ``kde.kde`` for correctness and for the
    one-line, loop-free form the exercise asks for.

    The numeric result is compared against ``my_kde`` on random inputs,
    then the stripped source lines of ``kde.kde`` are checked against a
    list of forbidden patterns.
    """
    import kde

    source_lines = [raw.strip()
                    for raw in inspect.getsourcelines(kde.kde)[0]]

    data = np.random.normal(size=100)
    bw = np.random.normal(1) + 10.
    query = np.random.normal(1)

    reference = my_kde(x=query, bw=bw, data=data)
    candidate = kde.kde(x=query, bw=bw, data=data)
    self.assertTrue(np.abs(candidate - reference) < 1e-6,
                    'kde function does not return correct value')

    self.assertTrue(len(source_lines) == 1,
                    "please write your kde on one line, yes a def():" +
                    "counts so you cannot do that")

    # (regular expression, failure message), checked in this order.
    forbidden = (
        (".* +for +.*", "please do not use the keyword for"),
        (".* +if +.*", "please do not use the keyword if"),
        (".* +in +.*", "please do not use the keyword in"),
        (".*vectorize.*", "please do not use vectorize"),
        (".*map.*", "please do not use map"),
    )
    for pat, message in forbidden:
        self.assertFalse(self.search_lines(pat, source_lines), message)
def main(Nsamp=None, Nmesh=None):
    """
    Generates plots for the test cases for both the analytical pdf and
    the kernel density estimate.

    One PDF file named ``<class name>.pdf`` is written for every class
    found in the ``dgp`` module, except ``dgp.dgp`` and ``dgp.LogNormal``.

    Parameters
    ----------
    Nsamp: int
        Number of samples used for the kde. Defaults to 10000 when None.
    Nmesh: int
        Number of points used for the mesh. Passed to ``kde.kde`` as ``N``
        unchanged, so None leaves the choice to ``kde.kde``.

    Returns
    -------
    None
    """
    if Nsamp is None:
        Nsamp = 10000

    classes = [(cls, name) for name, cls in inspect.getmembers(dgp)
               if inspect.isclass(cls)
               and cls not in (dgp.dgp, dgp.LogNormal)]

    for cls, name in classes:
        # A single-argument print call produces the same output under
        # Python 2 and Python 3.
        print('Generating graph for ' + name)
        model = cls()
        x = model.sample(size=Nsamp)
        # The first value returned by kde.kde is not needed for plotting.
        _, mesh, kdense = kde.kde(x, N=Nmesh)
        f = model.pdf(mesh)

        fig = plt.figure()
        ax = fig.add_subplot(111)
        ax.set_title(name, size=36)
        ax.plot(mesh, kdense)
        ax.plot(mesh, f)
        for label in ax.get_xticklabels() + ax.get_yticklabels():
            label.set_fontsize(24)
        fig.set_figheight(10)
        fig.set_figwidth(12)
        fig.savefig(name+'.pdf')
        # Close this figure explicitly so figures do not accumulate.
        plt.close(fig)
    return None
def main(Nsamp=None, Nmesh=None):
    """
    Generates plots for the test cases for both the analytical pdf and
    the kernel density estimate.

    One PDF file named ``<class name>.pdf`` is written for every class
    found in the ``dgp`` module, except ``dgp.dgp`` and ``dgp.LogNormal``.

    Parameters
    ----------
    Nsamp: int
        Number of samples used for the kde. Defaults to 10000 when None.
    Nmesh: int
        Number of points used for the mesh. Passed to ``kde.kde`` as ``N``
        unchanged, so None leaves the choice to ``kde.kde``.

    Returns
    -------
    None
    """
    if Nsamp is None:
        Nsamp = 10000

    classes = [(cls, name) for name, cls in inspect.getmembers(dgp)
               if inspect.isclass(cls)
               and cls not in (dgp.dgp, dgp.LogNormal)]

    for cls, name in classes:
        # A single-argument print call produces the same output under
        # Python 2 and Python 3.
        print('Generating graph for ' + name)
        model = cls()
        x = model.sample(size=Nsamp)
        # The first value returned by kde.kde is not needed for plotting.
        _, mesh, kdense = kde.kde(x, N=Nmesh)
        f = model.pdf(mesh)

        fig = plt.figure()
        ax = fig.add_subplot(111)
        ax.set_title(name, size=36)
        ax.plot(mesh, kdense)
        ax.plot(mesh, f)
        for label in ax.get_xticklabels() + ax.get_yticklabels():
            label.set_fontsize(24)
        fig.set_figheight(10)
        fig.set_figwidth(12)
        fig.savefig(name + '.pdf')
        # Close this figure explicitly so figures do not accumulate.
        plt.close(fig)
    return None
# Demo script: compare a kernel density estimate of standard-normal samples
# with the reference density returned by gauss1D.
from kde import kde
from gauss1D import gauss1D
import matplotlib.pyplot as plt

# parameters() is defined outside this excerpt; h is passed to kde() below.
# Presumably h is the kernel bandwidth and k the neighbour count for the
# K-nearest-neighbour part of the exercise -- confirm against parameters().
h, k = parameters()

print('Question: Kernel/K-Nearest Neighborhood Density Estimators')

# Produce the random samples: 100 draws from a normal with mean 0, std 1
samples = np.random.normal(0, 1, 100)

# Compute the original normal distribution
# NOTE(review): the meaning of the four gauss1D arguments is not visible
# here; (0, 1) presumably mirror the sampling parameters above -- confirm.
realDensity = gauss1D(0, 1, 100, 5)

# Estimate the probability density using the KDE
estDensity = kde(samples, h)

# Plot results in the first cell of a 2-row, 1-column subplot grid.
# Column 0 of each result is used as the x axis and column 1 as the y axis.
plt.subplot(2, 1, 1)
plt.plot(estDensity[:, 0], estDensity[:, 1], 'r', linewidth=1.5, label='KDE Estimated Distribution')
plt.plot(realDensity[:, 0], realDensity[:, 1], 'b', linewidth=1.5, label='Real Distribution')
plt.legend()
plt.show()
# Two-component Gaussian sample: KDE, histogram, empirical CDF and the
# integrated KDE are drawn on one figure.
# N1 (size of data set 1) is expected to be defined earlier in the script;
# it is not visible in this excerpt.
m1 = -1  # mean value
s1 = 0.1  # variance
N2 = 500  # number of data in data set 2
m2 = 2  # mean value
s2 = 0.5  # variance
h = 0.1  # bandwidth
npoints = 100  # number of abscissa points in kde

# s1 and s2 are variances, hence the square root to get standard deviations.
x1 = math.sqrt(s1)*np.random.randn(N1,1) + m1
x2 = math.sqrt(s2)*np.random.randn(N2,1) + m2
x = np.concatenate((x1,x2),axis=0)

# Kernel Density Estimate
(xx,kde_estimate) = kde.kde(x,'Gaussian',h, npoints)
plt.plot(xx,kde_estimate, 'r', label='Kernel Density Estimate')

# the histogram of the data
# ``density=True`` replaces the ``normed`` keyword, which Matplotlib removed.
n, bins, patches = plt.hist(x, 50, density=True, facecolor='green', alpha=0.75, label='Histogram')

# empirical CDF
(xx,pp) = ecdf(x, npoints)
plt.plot(xx,pp, 'k', label='Empirical CDF')

# Smooth Empirical CDF (KDE Integral)
# Stored under its own name so the kde_integral function is not overwritten
# by its result.
# NOTE(review): xx now holds the abscissa returned by ecdf, not the one from
# kde.kde -- confirm both use the same grid.
smooth_cdf = kde_integral(kde_estimate)
plt.plot(xx,smooth_cdf, 'm', label='Smooth Empirical CDF')

plt.legend(loc='upper left')
plt.show()
# Two-component Gaussian sample: KDE, histogram and empirical CDF are drawn
# on one figure.
# N1 (size of data set 1) is expected to be defined earlier in the script;
# it is not visible in this excerpt.
m1 = -1  # mean value
s1 = 0.1  # variance
N2 = 500  # number of data in data set 2
m2 = 2  # mean value
s2 = 0.5  # variance
h = 0.1  # bandwidth
npoints = 100  # number of abscissa points in kde

# s1 and s2 are variances, hence the square root to get standard deviations.
x1 = math.sqrt(s1) * np.random.randn(N1, 1) + m1
x2 = math.sqrt(s2) * np.random.randn(N2, 1) + m2
x = np.concatenate((x1, x2), axis=0)

# Kernel Density Estimate
(xx, kde_estimate) = kde.kde(x, 'Gaussian', h, npoints)
plt.plot(xx, kde_estimate, 'r', label='Kernel Density Estimate')

# the histogram of the data
# ``density=True`` replaces the ``normed`` keyword, which Matplotlib removed.
n, bins, patches = plt.hist(x, 50, density=True, facecolor='green', alpha=0.75, label='Histogram')

# empirical CDF
(xx, pp) = ecdf(x, npoints)
plt.plot(xx, pp, 'k', label='Empirical CDF')

# Smooth Empirical CDF (KDE Integral)