def test_simple_2D():
    """Run a two-variable ``fastKDE.pdf`` estimate on seeded Gaussian samples."""
    np.random.seed(42)
    n_samples = int(2e5)

    # The draw order is part of the seeded sequence: the wide variable is
    # sampled first, the narrow one second.
    wide = np.random.normal(size=n_samples) * 50 + 0.1
    narrow = np.random.normal(size=n_samples) * 0.01 - 300

    # Self-consistent density estimate; the returned axis list is unpacked
    # directly into one axis per input variable.
    density, (v1, v2) = fastKDE.pdf(wide, narrow)
def fastkde_2d(d_x, d_y, xmin=None, xmax=None, ymin=None, ymax=None):
    """Two-dimensional kernel density estimate via ``fastkde.fastKDE``.

    Prior bounds are honoured by mirroring the data (``mirror_2d``) and
    cropping the estimate back to the bounded region afterwards.

    Parameters
    ----------
    d_x, d_y: numpy.array
        x/y coordinates of the data to estimate the density of
    xmin, xmax, ymin, ymax: float
        lower/upper prior bounds in the x/y coordinates
        optional, default None

    Returns
    -------
    x, y: numpy.array
        x/y coordinates of the density estimate. One-dimensional arrays
    p: numpy.array
        density estimate. Two-dimensional array

    """
    xmin, xmax = check_bounds(d_x, xmin, xmax)
    ymin, ymax = check_bounds(d_y, ymin, ymax)

    # One flag per axis: True when at least one bound of that axis is None.
    open_x = xmax is None or xmin is None
    open_y = ymax is None or ymin is None
    f = [open_x, open_y]

    mirrored_x, mirrored_y = mirror_2d(d_x, d_y, xmin, xmax, ymin, ymax)

    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        p, (x, y) = fastKDE.pdf(
            mirrored_x, mirrored_y,
            axisExpansionFactor=f,
            numPointsPerSigma=10 * (2 - open_x) * (2 - open_y),
        )

    # Scale by 2 for every axis whose flag is False, by 1 otherwise.
    for axis_is_open in f:
        p *= (2 - axis_is_open)

    # Crop to the requested bounds; x indexes the columns of p, y the rows.
    if xmin is not None:
        keep = x >= xmin
        p, x = p[:, keep], x[keep]
    if xmax is not None:
        keep = x <= xmax
        p, x = p[:, keep], x[keep]
    if ymin is not None:
        keep = y >= ymin
        p, y = p[keep, :], y[keep]
    if ymax is not None:
        keep = y <= ymax
        p, y = p[keep, :], y[keep]

    return x, y, p
def rough_lipschitz_k(samples, EPS=1.0e-5):
    """Roughly estimate a Lipschitz constant of the density of ``samples``.

    A kernel density estimate is computed with ``fastKDE.pdf`` and the
    largest ratio ``|p1 - p2| / ||x1 - x2||_1`` over all pairs of grid points
    is returned.

    Parameters
    ----------
    samples : sequence
        ``N`` samples; reshaped to ``(N, D)`` so scalars are treated as D=1.
    EPS : float
        Amplitude of the uniform random jitter added to every coordinate
        before the density estimate.

    Returns
    -------
    The largest finite-difference slope found, or None when no pair of grid
    points is at non-zero distance.

    Notes
    -----
    The search visits every pair of grid points, so the cost is quadratic in
    the size of the density grid.
    """
    N = len(samples)
    # Work on a float copy: the in-place jitter below must neither fail on
    # integer input nor modify the caller's data.
    sa = np.array(samples, dtype=float).reshape((N, -1))
    sa += np.random.random(size=sa.shape) * EPS
    D = sa.shape[1]

    # One 1-D array per dimension, as fastKDE.pdf expects.
    sa_dims = [sa[:, i] for i in range(D)]
    kde_pdf, kde_axes = fastKDE.pdf(*sa_dims)
    if D == 1:
        kde_axes = [kde_axes]

    # fastKDE returns the axes in input-variable order but indexes the pdf in
    # the reverse order, so index k of a pdf multi-index belongs to the k-th
    # axis counted from the end.
    index_axes = list(kde_axes)[::-1]

    def grid_point(multi_index):
        # Built with a list so it works on both Python 2 and Python 3
        # (np.array(map(...)) yields a 0-d object array on Python 3).
        return np.array([axis[i] for i, axis in zip(multi_index, index_axes)])

    # Now compute the maximum finite-difference derivative.
    max_deriv = None
    it1 = np.nditer(kde_pdf, flags=['multi_index'])
    while not it1.finished:
        p1 = it1.value
        x1 = grid_point(it1.multi_index)  # invariant for the inner loop
        it2 = np.nditer(kde_pdf, flags=['multi_index'])
        while not it2.finished:
            x2 = grid_point(it2.multi_index)
            diff_x = np.linalg.norm(x1 - x2, ord=1)
            if diff_x != 0:
                deriv = abs(p1 - it2.value) / diff_x
                if max_deriv is None or deriv > max_deriv:
                    max_deriv = deriv
            it2.iternext()
        it1.iternext()
    return max_deriv
def fastkde_1d(d, xmin=None, xmax=None):
    """One-dimensional kernel density estimate via ``fastkde.fastKDE``.

    Prior bounds are honoured by mirroring the data (``mirror_1d``) and
    cropping the estimate back to the bounded region afterwards.

    Parameters
    ----------
    d: numpy.array
        Data to estimate the density of
    xmin, xmax: float
        lower/upper prior bounds
        optional, default None

    Returns
    -------
    x: numpy.array
        x-coordinates of the density estimate
    p: numpy.array
        density estimate

    """
    xmin, xmax = check_bounds(d, xmin, xmax)

    # True when at least one of the two bounds is None.
    f = xmax is None or xmin is None
    weight = 2 - f

    mirrored = mirror_1d(d, xmin, xmax)

    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        p, x = fastKDE.pdf(
            mirrored,
            axisExpansionFactor=f,
            numPointsPerSigma=10 * weight,
        )

    # Scale by 2 when both bounds are set, by 1 otherwise.
    p *= weight

    # Crop the estimate to the requested bounds.
    if xmin is not None:
        keep = x >= xmin
        p, x = p[keep], x[keep]
    if xmax is not None:
        keep = x <= xmax
        p, x = p[keep], x[keep]

    return x, p
def test_simple_3D():
    """Run a three-variable ``fastKDE.pdf`` estimate and locate its mode."""
    np.random.seed(42)
    n_points = int(1e3)

    # Three independent samples drawn, in this order, from a normal, a gamma
    # and a beta-prime distribution.
    normal_draws = stats.norm.rvs(size=n_points)
    gamma_draws = stats.gamma.rvs(2, size=n_points)
    betaprime_draws = stats.betaprime.rvs(5, 6, size=n_points)

    # 3D PDF. More variables can simply be appended to the argument list,
    # though memory quickly becomes an issue; numPoints gives a coarser PDF
    # that is calculated faster and with less memory.
    pdf, values = fastKDE.pdf(normal_draws, gamma_draws, betaprime_draws,
                              numPoints=[65, 65, 65])

    # Multi-dimensional index of the mode of the distribution
    # (2D slices through the mode are what gets plotted).
    flat_mode = np.argmax(pdf.ravel())
    nmode = np.unravel_index(flat_mode, np.shape(pdf))
def __init__(self, *vars, renormalise=True, **fastKDE_kwargs):
    """
    Compute probability density function. (see fastkde.fastKDE.pdf)

    NOTE: self.axes lists the coordinates in the order of the input
          variables, whereas self.pdf is indexed in the reverse order (see
          fastkde.fastKDE.pdf).
          See active_particles.scde.PDF.evaluate for probability density
          function evaluation.

    Positional arguments
    --------------------
    vars : array-like
        Input variables.

    Parameters
    ----------
    renormalise : bool
        Rescale probability density function values by the integral over
        the computed volume. DEFAULT: True

    Optional keyword arguments
    --------------------------
    (see fastkde.fastKDE.pdf)
    """
    self.vars, self.fastKDE_kwargs = vars, fastKDE_kwargs
    self.n = len(self.vars)

    density, axes = fastKDE.pdf(*self.vars, **self.fastKDE_kwargs)
    self.pdf = density
    # With a single variable the returned axes are wrapped in a one-element
    # list, so self.axes is always a list of axes.
    self.axes = [np.array(axes)] if self.n == 1 else axes

    self._extended_axes()

    if renormalise:
        self.renormalise()
# Parse pair k: the line is split into at most three comma-separated fields;
# fields 1 and 2 hold the whitespace-separated x and y samples.
r = pairs[k].split(",", 2)
# The builtin float replaces the np.float alias, which NumPy deprecated in
# 1.20 and removed in 1.24; the resulting dtype is the same.
x = scale(np.array(r[1].split(), dtype=float))
y = scale(np.array(r[2].split(), dtype=float))
print(len(x))

# Keep only the points lying strictly inside (-maxstd, maxstd) on both axes.
mask = (x > -maxstd) & (x < maxstd) & (y > -maxstd) & (y < maxstd)
x = x[mask]
y = y[mask]

numPoints = 32 + 1
pXY, axes = fastKDE.pdf(x, y, numPoints=numPoints, axisExpansionFactor=0.1)

fig, axs = PP.subplots(1, 2, figsize=(10, 5))

# Plot a scatter plot of the incoming data
axs[0].plot(x, y, 'k.', alpha=0.1)
axs[0].set_title('Original (x,y) data')

# Set axis labels
for i in (0, 1):
    axs[i].set_xlabel('x')
    axs[i].set_ylabel('y')
# Convex hull of the points in `ha`; the hull outline is closed by repeating
# its first vertex at the end.
hull = ConvexHull(ha)
x = ha[hull.vertices, 0]
y = ha[hull.vertices, 1]
x = np.append(x, x[0])
y = np.append(y, y[0])
# `m` is called to transform (x, y) and then used to draw the outline and the
# graticule -- presumably a Basemap-like map object; confirm against caller.
x1, y1 = m(x, y)
m.plot(x1, y1, 'r-', lw=2)
m.drawparallels(pars, labels=[1, 0, 0, 0], labelstyle='+/-')
m.drawmeridians(mers)
plt.title("Event %i" % evtnum)
# Evaluation window: the data extent padded by 1 on every side.
xmin, xmax, ymin, ymax = p_lon2.min() - 1, p_lon2.max() + 1, p_lat2.min() - 1, p_lat2.max() + 1
# Fast KDE based on O'Brien et al., Comput. Stat. Data Anal. 101, 148-160 (2016)
xax = np.linspace(xmin, xmax, 513)
yax = np.linspace(ymin, ymax, 513)
myPDF, axes = fastKDE.pdf(p_lon2, p_lat2, axes=[xax, yax], numPoints=513)
zz = myPDF
# NOTE(review): these two zero arrays are overwritten immediately below, so
# the allocations have no effect.
ax1 = np.zeros(len(axes[0]))
ax2 = np.zeros(len(axes[1]))
ax1 = axes[0]
ax2 = axes[1]
xx, yy = np.meshgrid(ax1, ax2)
# Closed hull outline as an (n, 2) vertex array, wrapped in a path for
# point-in-polygon tests.
xy = np.zeros((len(x), 2))
xy[:, 0] = x
xy[:, 1] = y
bbp = mplPath.Path(xy)
mask_array = np.zeros((len(ax1), len(ax2)), dtype=int)
# NOTE(review): mask_array starts as all zeros and the only assignment in
# this loop writes 0, so within this fragment the loop leaves the mask
# unchanged -- confirm whether a non-zero value or an else branch was
# intended.
for i in range(len(ax1)):
    for j in range(len(ax2)):
        if bbp.contains_point((ax1[i], ax2[j])):
            mask_array[i, j] = 0
def plotScreenImage(beam, keys=['x', 'y'], scale=[1, 1], iscale=1,
                    colormap=plt.cm.jet, size=None, grid=False,
                    marginals=False, limits=None, screen=False,
                    use_scipy=False, subtract_mean=[False, False], **kwargs):
    """Draw a 2-D density image of two beam coordinates.

    The density is estimated with fastKDE when it is installed (and
    ``use_scipy`` is False), otherwise with ``scipy.stats.gaussian_kde`` on a
    100 x 100 grid. The image is normalised so its maximum equals ``iscale``.

    Parameters
    ----------
    beam : mapping-like
        ``beam[key]`` must give the coordinate array (with a ``units``
        attribute) and ``beam.filename`` is used for the figure title.
    keys : pair of str
        Names of the horizontal and vertical coordinates.
    scale : number or pair of numbers
        Multiplier applied to each coordinate before ``nice_array``.
    iscale : number
        Peak value of the normalised density.
    colormap : matplotlib colormap
        Used for the image, the marginals and the background colour.
    size : number, pair of numbers or None
        When given, the tick range extends to +/- ``size`` divided by the
        factor returned by ``nice_array``; when None it is taken from the
        density axes (or fixed to 15 when ``screen`` is True). The caller's
        object is never modified.
    grid : bool
        Draw major and minor grid lines.
    marginals : bool
        Add bar-chart marginals above and to the right of the image.
    limits : number, pair, or pair of pairs
        Explicit axis limits; a single number ``v`` means ``(-v, v)``.
    screen : bool
        Draw a circular screen outline and clip the image to a circle.
    use_scipy : bool
        Force the SciPy estimator even if fastKDE is available.
    subtract_mean : bool or pair of bool
        Subtract the coordinate mean before plotting.
    **kwargs
        Forwarded to ``fastKDE.pdf``.

    Raises
    ------
    Exception
        If neither fastKDE nor SciPy is available.
    """
    key1, key2 = keys
    if not isinstance(subtract_mean, (list, tuple)):
        subtract_mean = [subtract_mean, subtract_mean]
    if not isinstance(scale, (list, tuple)):
        scale = [scale, scale]
    # Always work on a private list: size is rescaled in place further down,
    # which must not touch the caller's list and must also work for tuples.
    size = list(size) if isinstance(size, (list, tuple)) else [size, size]

    x, f1, p1 = nice_array(
        scale[0] * (beam[key1] - subtract_mean[0] * np.mean(beam[key1])))
    y, f2, p2 = nice_array(
        scale[1] * (beam[key2] - subtract_mean[1] * np.mean(beam[key2])))
    u1, u2 = [beam[k].units for k in keys]
    ux = p1 + u1
    uy = p2 + u2
    labelx = f'{key1} ({ux})'
    labely = f'{key2} ({uy})'

    # Density estimate. Both branches produce 1-D axes v1 (x) and v2 (y) and
    # a PDF indexed [y, x], which is what the tick, marginal and pcolormesh
    # code below expects.
    if fastKDE_installed and not use_scipy:
        myPDF, axes = fastKDE.pdf(x, y, **kwargs)
        v1, v2 = axes
    elif SciPy_installed:
        v1 = np.linspace(x.min(), x.max(), 100)
        v2 = np.linspace(y.min(), y.max(), 100)
        grid_x, grid_y = np.meshgrid(v1, v2)  # shape (len(v2), len(v1))
        positions = np.vstack([grid_x.ravel(), grid_y.ravel()])
        kernel = stats.gaussian_kde(np.vstack([x, y]))
        myPDF = np.reshape(kernel(positions), grid_x.shape)
    else:
        raise Exception("fastKDE or SciPy required")

    # Normalise the PDF so that its peak equals iscale.
    myPDF = myPDF / myPDF.max() * iscale

    # Figure layout: with marginals, a 2x2 gridspec whose marginal axes take
    # 2 parts against 8 for the main axes; otherwise a single axes.
    if marginals:
        fig = plt.figure(figsize=(12.41, 12.41))
        gs = fig.add_gridspec(2, 2, width_ratios=(8, 2), height_ratios=(2, 8),
                              left=0.1, right=0.9, bottom=0.1, top=0.95,
                              wspace=0.05, hspace=0.05)
        ax = fig.add_subplot(gs[1, 0])
        ax_histx = fig.add_subplot(gs[0, 0], sharex=ax)
        ax_histy = fig.add_subplot(gs[1, 1], sharey=ax)
    else:
        fig = plt.figure(figsize=(10, 10))
        fig.subplots_adjust(top=0.95)
        ax = fig.add_subplot()

    # Tick range and centre.
    if size[0] is None:
        use_size = False
        if not screen:
            xmin, xmax = [min(v1), max(v1)]
            ymin, ymax = [min(v2), max(v2)]
            size = [xmax - xmin, ymax - ymin]
        else:
            xmin, xmax = -15, 15
            ymin, ymax = -15, 15
            size = [15, 15]
        minvalx = xmin
        maxvalx = xmax
        meanvalx = (xmin + xmax) / 2.0 if not subtract_mean[0] else 0
        minvaly = ymin
        maxvaly = ymax
        meanvaly = (ymin + ymax) / 2.0 if not subtract_mean[1] else 0
    else:
        use_size = True
        maxvalx = size[0] / f1
        minvalx = -maxvalx
        meanvalx = (max(v1) + min(v1)) / 2.0 if not subtract_mean[0] else 0
        maxvaly = size[1] / f2
        minvaly = -maxvaly
        meanvaly = (max(v2) + min(v2)) / 2.0 if not subtract_mean[1] else 0
        size[0] = size[0] / f1
        size[1] = size[1] / f2

    def tick_positions(centre, low, high, divisions):
        # `divisions` steps across [low, high]; the 1% overshoot keeps the
        # upper end inside np.arange's half-open interval.
        span = high - low
        return centre + np.arange(low, high + span / 100, span / divisions)

    # 4 major and 40 minor divisions across the range on each axis.
    ax.set_xticks(tick_positions(meanvalx, minvalx, maxvalx, 4))
    ax.set_xticks(tick_positions(meanvalx, minvalx, maxvalx, 40), minor=True)
    ax.set_yticks(tick_positions(meanvaly, minvaly, maxvaly, 4))
    ax.set_yticks(tick_positions(meanvaly, minvaly, maxvaly, 40), minor=True)

    if marginals:
        # Marginal along x: sum the image over its first index, treating the
        # axis values as bin edges.
        hist, bin_edges = myPDF.sum(axis=0)[:-1], v1
        hist_x = bin_edges[:-1] + np.diff(bin_edges) / 2
        hist_width = np.diff(bin_edges)
        hist_y, _, _ = nice_array(hist / hist_width)
        ax_histx.bar(hist_x, hist_y, hist_width,
                     color=colormap(hist_y / max(hist_y)))
        # Marginal along y, drawn as horizontal bars.
        hist, bin_edges = myPDF.sum(axis=1)[:-1], v2
        hist_x = bin_edges[:-1] + np.diff(bin_edges) / 2
        hist_width = np.diff(bin_edges)
        hist_y, _, _ = nice_array(hist / hist_width)
        ax_histy.barh(hist_x, hist_y, hist_width,
                      color=colormap(hist_y / max(hist_y)))

    # Filled circle marking the edge of the screen. size is always a list
    # here, so the radius is derived from max(size), as for the clip circle.
    if screen:
        draw_circle = plt.Circle((meanvalx, meanvaly), max(size) + 0.05,
                                 fill=True, ec='w', fc=colormap(0), zorder=-1)
        ax.add_artist(draw_circle)

    if screen:
        ax.set_facecolor('k')
    else:
        ax.set_facecolor(colormap(0))

    # Circle used to clip the grid lines and, in screen mode, the image.
    if screen:
        circ = plt.Circle((meanvalx, meanvaly), max(size), facecolor='none')
    else:
        circ = plt.Circle((meanvalx, meanvaly), 3 * max(size),
                          facecolor='none')

    if grid:
        ax.grid(which='minor', color="w", alpha=0.3, clip_path=circ)
        ax.grid(which='major', color="w", alpha=0.55, clip_path=circ)

    # Axis limits: explicit limits first, then slightly larger than the
    # screen size, otherwise the extent of the density axes.
    if limits:
        if isinstance(limits, (int, float)):
            limits = (-limits, limits)
        if np.array(limits).shape == (2, 2):
            ax.set_xlim(limits[0])
            ax.set_ylim(limits[1])
        elif np.array(limits).shape == (2, ):
            ax.set_xlim(limits)
            ax.set_ylim(limits)
    elif screen or use_size:
        ax.set_xlim([meanvalx - (size[0] + 0.5), meanvalx + (size[0] + 0.5)])
        ax.set_ylim([meanvaly - (size[1] + 0.5), meanvaly + (size[1] + 0.5)])
    else:
        ax.set_xlim([min(v1), max(v1)])
        ax.set_ylim([min(v2), max(v2)])

    mesh = ax.pcolormesh(v1, v2, myPDF, cmap=colormap, zorder=1,
                         shading='auto')
    if screen:
        mesh.set_clip_path(circ)

    if marginals:
        plt.setp(ax_histx.get_xticklabels(), visible=False)
        plt.setp(ax_histy.get_yticklabels(), visible=False)

    ax.set_xlabel(labelx)
    ax.set_ylabel(labely)

    # The file name without directory or extension becomes the title.
    screen_name, _ = os.path.splitext(os.path.basename(beam.filename))
    plt.suptitle(screen_name)

    # Show the final image.
    plt.draw()
def Fast2DKDE(self, X, Y):
    """Return ``(axis_1, axis_2, pdf)`` from ``fastKDE.pdf(X, Y)``."""
    # Imported on first use rather than at module level.
    from fastkde import fastKDE

    density, (first_axis, second_axis) = fastKDE.pdf(X, Y)
    return first_axis, second_axis, density
def Fast1DKDE(self, X):
    """Return ``(axes, pdf)`` from ``fastKDE.pdf(X)``."""
    # Imported on first use rather than at module level.
    from fastkde import fastKDE

    density, grid = fastKDE.pdf(X)
    return grid, density
def test_fastkde_runs():
    """Check that ``fastKDE.pdf`` runs on 100 draws from a normal distribution."""
    # Frozen normal distribution with loc=-2 and scale=4.
    samples = stats.norm(-2, 4).rvs(size=100)
    # Only the absence of an exception matters; the result is discarded.
    fastKDE.pdf(samples)
def __init__(self, *vars, renormalise=True, wrap_period=None,
             wrap_method='linear', wrap_fill_value=0, wrap_processes=None,
             **fastKDE_kwargs):
    """
    Compute probability density function. (see fastkde.fastKDE.pdf)

    NOTE: Coordinates in self.axes are in the same order as the input
          variables while it is in reversed order in self.pdf (see
          fastkde.fastKDE.pdf).
          See active_particles.scde.PDF.evaluate for probability density
          function evaluation.

    Positional arguments
    --------------------
    vars : array-like
        Input variables.

    Parameters
    ----------
    renormalise : bool
        Rescale probability density function values by the integral over
        the computed volume. DEFAULT: True
    wrap_period : float
        Period over which to wrap the computed probability density function.
        NOTE: If wrap_period is None, the computed probability density
              function remains unwrapped.
        DEFAULT: None
    wrap_method : string
        Method of interpolation. (see scipy.interpolate.griddata)
        DEFAULT: linear
    wrap_fill_value : float
        Value used to fill in for requested points outside of the convex
        hull of the input points. (see scipy.interpolate.griddata)
        DEFAULT: 0
    wrap_processes : int
        Number of worker processes to use. (see multiprocessing.Pool)
        NOTE: If processes == None then processes = os.cpu_count().
        DEFAULT: None

    Optional keyword arguments
    --------------------------
    (see fastkde.fastKDE.pdf)
    """
    self.vars = vars
    self.n = len(self.vars)
    self.fastKDE_kwargs = fastKDE_kwargs

    self.pdf, self.axes = fastKDE.pdf(*self.vars, **self.fastKDE_kwargs)
    if self.n == 1:
        # Keep self.axes a list of axes even for a single variable.
        self.axes = [np.array(self.axes)]
    self._extended_axes()

    # Identity test: `!= None` goes through __ne__, which is element-wise
    # for array-like values.
    if wrap_period is not None:
        self.wrap(wrap_period, method=wrap_method,
                  fill_value=wrap_fill_value, processes=wrap_processes)

    # Renormalisation runs after the optional wrapping.
    if renormalise:
        self.renormalise()
def googleimage_seg(my_input_folder, my_valid_folder, seg_folder,
                    my_new_folder, amplitudes=[100, 128, 128],
                    dimensions=[5, 5, 5], kde=True, chop=True,
                    chop_size=3500, sample=False, sample_size=700,
                    title='My Title'):
    """Take folder MY_INPUT_FOLDER of images and SEG_FOLDER of segmented
    images, create folder MY_NEW_FOLDER containing 3D Histograms of foreground
    and background pixel distributions.

    Images can be scraped from google images using
    https://github.com/hardikvasa/google-images-download.
    Images can be segmented in MATLAB using
    http://calvin.inf.ed.ac.uk/software/figure-ground-segmentation-by-transferring-window-masks/

    MY_INPUT_FOLDER -- Absolute filepath to the folder of images you wish to plot
    MY_VALID_FOLDER -- Absolute filepath to the folder containing valid_lab.pkl and valid_rgb.pkl
    SEG_FOLDER -- Absolute filepath to folder of segmented images (.png files);
        the mask of image NAME is read from SEG_FOLDER/NAME.png, and an image
        without such a file is counted entirely as figure
    MY_NEW_FOLDER -- Absolute filepath to the folder to where plots and data will be exported
        (must not exist yet)
    AMPLITUDES -- Amplitudes of each axis [L a b]; axes extend from 0 to L, -a to a, and -b to b
    DIMENSIONS -- Dimensions of bins in CIELAB space [L a b]; all pixels within confines of a bin
        take on the same color value
    KDE -- If True, use a Kernel Density Estimate to smooth results in CIELAB space after data
        collection https://bitbucket.org/lbl-cascade/fastkde
    CHOP -- If True, plot only the first CHOP_SIZE most frequent values (only affects visualization)
    SAMPLE -- If True, use SAMPLE_SIZE to randomly thin out data if plots are too dense
        (only affects visualization)
    TITLE -- Plot title

    Written to MY_NEW_FOLDER: 'histogram 3D.svg', 'colors.pkl', 'densities.pkl'
    for figure pixels and 'background histogram 3D.svg',
    'background_colors.pkl', 'background_densities.pkl' for ground pixels.
    Each colors/densities pair is index-aligned and ordered by descending
    density. A histogram with no data is skipped with a message.
    """
    # ---------------- INITIALIZATION ----------------
    assert (not os.path.exists(my_new_folder))
    os.makedirs(my_new_folder)

    Lw, aw, bw = dimensions[0], dimensions[1], dimensions[2]  # bin widths
    L_amp, a_amp, b_amp = amplitudes[0], amplitudes[1], amplitudes[2]
    # Number of 1D bins per axis; truncated to int because np.linspace
    # requires an integer number of points.
    Lbins = int(L_amp / Lw)
    abins = int(a_amp * 2 / aw)
    bbins = int(b_amp * 2 / bw)
    Lvec = np.linspace(0, L_amp, Lbins + 1)
    avec = np.linspace(-a_amp, a_amp, abins + 1)
    bvec = np.linspace(-b_amp, b_amp, bbins + 1)

    # Binning uses the spacing of the axis vectors, while bin centres use
    # the nominal widths from DIMENSIONS.
    origin = np.array([Lvec[0], avec[0], bvec[0]])
    grid_widths = np.array([Lvec[1] - Lvec[0], avec[1] - avec[0],
                            bvec[1] - bvec[0]])
    nominal_widths = np.array([Lw, aw, bw], dtype=float)
    lower_edges = np.array([0.0, -a_amp, -b_amp])

    # NOTE: pickle.load executes arbitrary code from the file; only point
    # MY_VALID_FOLDER at trusted data.
    with open(my_valid_folder + '/valid_lab.pkl', 'rb') as pickle_load:
        valid_lab = pickle.load(pickle_load)
    with open(my_valid_folder + '/valid_rgb.pkl', 'rb') as pickle_load:
        valid_rgb = pickle.load(pickle_load)
    valid_lab_tree = spatial.cKDTree(valid_lab)

    # ---------------- HELPERS ----------------
    def bins2lab(bins):
        """Map (n, 3) bin indices [Lbin, abin, bbin] to bin-centre [L, a, b]."""
        return (lower_edges + nominal_widths * np.asarray(bins)
                - nominal_widths / 2)

    def collect(pixels, samples, bin_counts):
        """Bin (n, 3) LAB pixels; record their bin centres and bin counts."""
        if len(pixels) == 0:
            return
        bins = np.ceil((pixels - origin) / grid_widths).astype(int)
        # Edge case: a value exactly on the lower edge belongs to bin 1.
        bins[bins == 0] = 1
        samples.append(bins2lab(bins))
        unique, counts = np.unique(bins, axis=0, return_counts=True)
        for key, n in zip(map(tuple, unique.tolist()), counts.tolist()):
            bin_counts[key] = bin_counts.get(key, 0) + n

    def export_histogram(sample_blocks, bin_counts, plot_name, colors_name,
                         densities_name, plot_title):
        """Plot one 3D histogram and pickle its colours and densities."""
        if not sample_blocks:
            print('No pixels collected for %s; nothing was exported'
                  % plot_name)
            return

        if kde:
            samples = np.concatenate(sample_blocks)
            myPDF, axes = fastKDE.pdf(
                *[np.ascontiguousarray(samples[:, i]) for i in range(3)])
            varL, vara, varb = axes
            # Grid points in (L, a, b) order. The pdf is indexed [b][a][L],
            # so it is transposed before flattening to match.
            grids = np.meshgrid(varL, vara, varb, indexing='ij')
            lab_points = np.column_stack([g.ravel() for g in grids])
            weights = np.transpose(np.asarray(myPDF), (2, 1, 0)).ravel()
        else:
            lab_points = bins2lab(list(bin_counts))
            weights = np.array(list(bin_counts.values()))

        positive = weights > 0
        if not positive.all():
            lab_points, weights = lab_points[positive], weights[positive]
            print("Color-density pairs with density=0 have been removed")
        if len(weights) == 0:
            print('No non-zero densities for %s; nothing was exported'
                  % plot_name)
            return

        # Snap every colour to the nearest LAB value inside the RGB gamut
        # (a colour already in valid_lab is its own nearest neighbour) and
        # sum the densities of colours that land on the same valid value.
        _, nearest = valid_lab_tree.query(lab_points)
        totals = {}
        for index, weight in zip(nearest.tolist(), weights.tolist()):
            totals[index] = totals.get(index, 0) + weight

        # Sort as pairs so colours and densities stay aligned.
        ranked = sorted(totals.items(), key=operator.itemgetter(1),
                        reverse=True)
        s_varlist = [valid_lab[index] for index, _ in ranked]
        s_vardensity = [density for _, density in ranked]

        # Chopping and sampling only affect what is drawn.
        shown = ranked
        if chop:
            if chop_size < len(ranked):
                shown = ranked[:chop_size]
            else:
                print('Chop size of %d >= number of non-zero values %d. '
                      'No values were chopped.' % (chop_size, len(ranked)))
        if sample:
            shown = random.sample(shown, min(sample_size, len(shown)))

        shown_lab = np.array([valid_lab[index] for index, _ in shown],
                             dtype=float)
        # valid_rgb is indexed in parallel with valid_lab.
        colors = np.asarray([list(valid_rgb[index][0])
                             for index, _ in shown])
        if kde:
            point_sizes = [density * 80000 for _, density in shown]
        else:
            point_sizes = [density / 100 for _, density in shown]

        plt.close()
        fig, ax = plt.subplots(subplot_kw=dict(projection='3d'))
        # Plot axes are (b, a, L).
        ax.scatter(shown_lab[:, 2], shown_lab[:, 1], shown_lab[:, 0],
                   s=point_sizes, c=colors)
        ax.set_xlabel('b')
        ax.set_ylabel('a')
        ax.set_zlabel('L')
        ax.set_xlim([-b_amp, b_amp])
        ax.set_ylim([-a_amp, a_amp])
        ax.set_zlim([0, L_amp])
        plt.title(plot_title)
        plt.savefig(my_new_folder + '/' + plot_name + '.svg', format='svg',
                    bbox_inches='tight')

        with open(my_new_folder + '/' + colors_name, 'wb') as pickle_file:
            pickle.dump(s_varlist, pickle_file,
                        protocol=pickle.HIGHEST_PROTOCOL)
        with open(my_new_folder + '/' + densities_name, 'wb') as pickle_file:
            pickle.dump(s_vardensity, pickle_file,
                        protocol=pickle.HIGHEST_PROTOCOL)

    # ---------------- DATA COLLECTION ----------------
    figure_samples, ground_samples = [], []
    unique_bins, unique_background_bins = {}, {}
    # All counters live outside the loop so the summary covers every image.
    images_skipped, total_images, segs_skipped = 0, 0, 0

    for my_image in os.listdir(my_input_folder):
        total_images += 1
        my_image_path = my_input_folder + '/' + my_image
        try:
            rgb_img = mpimg.imread(my_image_path)  # array [height][width][RGB]
        except ValueError as error:
            print(error, ';', '%s was skipped' % my_image)
            images_skipped += 1
            continue
        except OSError as os_error:
            # e.g. .DS_store: not an image, so it is not counted at all.
            print(os_error, ';', 'No image was skipped')
            total_images -= 1
            continue

        # imread returns PNG data as floats in [0, 1] and other formats as
        # integers; only integer data needs rescaling.
        if np.issubdtype(rgb_img.dtype, np.integer):
            rgb_img = rgb_img / 255
        try:
            lab_img = color.rgb2lab(rgb_img)
        except ValueError as error:
            print(error, ';', '%s was skipped' % my_image)
            images_skipped += 1
            continue

        my_seg_path = seg_folder + '/' + my_image + '.png'
        if os.path.exists(my_seg_path):
            # Assumes a 2-D [height][width] mask of the image's size, where
            # 0 = ground and non-zero = figure.
            logicmask = mpimg.imread(my_seg_path)
            is_figure = np.asarray(logicmask).astype(bool)
            collect(lab_img[is_figure], figure_samples, unique_bins)
            collect(lab_img[~is_figure], ground_samples,
                    unique_background_bins)
        else:
            segs_skipped += 1
            collect(lab_img.reshape(-1, 3), figure_samples, unique_bins)

    print('Out of %s total images, %d were skipped'
          % (total_images, images_skipped))
    print('Out of %s images processed, %d were not segmented'
          % (total_images - images_skipped, segs_skipped))

    # ---------------- VISUALIZATION ----------------
    export_histogram(figure_samples, unique_bins, 'histogram 3D',
                     'colors.pkl', 'densities.pkl', title)
    export_histogram(ground_samples, unique_background_bins,
                     'background histogram 3D', 'background_colors.pkl',
                     'background_densities.pkl', title + ' background')
def plot_density(self, max_number=9000):
    """Overlay a kernel density estimate (KDE) of the points on the map image."""
    n_grid = 2049

    # Transparent colormaps derived from base colormaps. The plasma variant
    # is built first and discarded; only the gnuplot one is used.
    transparent_cmap(plt.cm.plasma)
    overlay_cmap = transparent_cmap(plt.cm.gnuplot)

    # Regular evaluation grid spanning the image height and width.
    rows, cols = self.get_points(max_number=max_number)
    row_grid = np.linspace(0, self.height, n_grid)
    col_grid = np.linspace(0, self.width, n_grid)
    pdf, axes = fastKDE.pdf(cols, rows, axes=np.array([col_grid, row_grid]))

    # Replace negative values by the smallest positive one, then shift and
    # scale in place so the values span [0, 1]; this makes the PDF
    # comparable across maps.
    smallest_positive = np.min(pdf[pdf > 0])
    pdf[pdf < 0] = smallest_positive
    pdf -= pdf.min()
    pdf /= pdf.max()

    # Draw the map with the filled density contours on top.
    plt.close("all")
    plt.ion()
    fig, ax = plt.subplots(1, 1)
    ax.imshow(self.map_image)
    ax.contourf(axes[0], axes[1], pdf, 15, cmap=overlay_cmap,
                antialiased=True)

    # Remove every margin, axis and tick so the plot fills the figure.
    for side, value in (("bottom", 0), ("top", 1), ("right", 1), ("left", 0)):
        fig.subplots_adjust(**{side: value})
    plt.gca().set_axis_off()
    plt.subplots_adjust(top=1, bottom=0, right=1, left=0, hspace=0, wspace=0)
    plt.margins(0, 0)
    current_axes = plt.gca()
    current_axes.xaxis.set_major_locator(plt.NullLocator())
    current_axes.yaxis.set_major_locator(plt.NullLocator())
    plt.show()
def featurizePairs(path, filepairs, filetargets, maxstd, size, featurizingmethod = 0, ratio=1, doubletrainset=True):
    """Turn (x, y) sample pairs into flattened 2-D density feature vectors.

    Parameters
    ----------
    path : str
        Prefix prepended to every file name.
    filepairs : str
        Pairs file: one header line, then one pair per line split into at
        most three comma-separated fields, where fields 1 and 2 hold the
        whitespace-separated x and y samples.
    filetargets : str
        Comma-delimited file with one target (-1, 0 or 1) per pair.
    maxstd : number
        After scaling, only points strictly inside (-maxstd, maxstd) on both
        axes are kept.
    size : int
        Resolution passed to the featurizer.
    featurizingmethod : int
        0 uses ``getHisto``; 1 uses ``fastKDE.pdf`` with ``size + 1`` points,
        dropping index 0 along both axes.
    ratio : float
        Fraction of the targets (from the start) to process.
    doubletrainset : bool
        Also emit the transposed (y, x) features with the mirrored target.

    Returns
    -------
    (vectorizedPairs, vectorizedTarget)
        Stacked feature rows and one-hot targets. When only a single row was
        produced each is 1-D. Both are also saved with ``np.savetxt``.

    Raises
    ------
    ValueError
        If ``featurizingmethod`` is not 0 or 1, or no pair was featurized.
    """
    if featurizingmethod not in (0, 1):
        raise ValueError("featurizingmethod must be 0 or 1, got %r"
                         % (featurizingmethod,))

    with open(path + filepairs) as f:
        pairs = f.readlines()[1:]  # drop the header line
    y_te = np.atleast_1d(np.genfromtxt(path + filetargets, delimiter=","))

    # target -> (one-hot for the (x, y) order, one-hot for the swapped order)
    target_codes = {
        -1: ([1, 0, 0], [0, 0, 1]),
        0: ([0, 1, 0], [0, 1, 0]),
        1: ([0, 0, 1], [1, 0, 0]),
    }

    # Rows are collected in lists and stacked once at the end, so a skipped
    # pair (including the very first one) cannot break the accumulation.
    pair_rows, target_rows = [], []
    for k in range(0, int(len(y_te) * ratio)):
        if k % 100 == 0:
            print(k)
        r = pairs[k].split(",", 2)
        # Builtin float: the np.float alias was removed from NumPy.
        x = scale(np.array(r[1].split(), dtype=float))
        y = scale(np.array(r[2].split(), dtype=float))
        mask = (x > -maxstd) & (x < maxstd) & (y > -maxstd) & (y < maxstd)
        x = x[mask]
        y = y[mask]

        codes = target_codes.get(float(y_te[k]))
        if codes is None:
            # Never reuse the previous pair's targets for an unknown label.
            print("unknown target " + str(y_te[k]) + " pairs " + str(k))
            continue

        try:
            if featurizingmethod == 0:
                pXY = getHisto(x, y, size, maxstd)
            else:
                pXY, axes = fastKDE.pdf(x, y, numPoints=size + 1,
                                        axisExpansionFactor=0.1)
                pXY = pXY[1:, 1:]
            arrayXY = np.ravel(pXY)
            if pair_rows and arrayXY.shape != pair_rows[0].shape:
                raise ValueError("feature length mismatch")
        except ValueError:
            print("pbkde nbpoints pairs " + str(k))
            continue

        pair_rows.append(arrayXY)
        target_rows.append(np.array(codes[0]))
        if doubletrainset:
            pair_rows.append(np.ravel(np.transpose(pXY)))
            target_rows.append(np.array(codes[1]))

    if not pair_rows:
        raise ValueError("no pair could be featurized")

    # A single row stays 1-D; several rows are stacked into a 2-D array.
    if len(pair_rows) > 1:
        vectorizedPairs = np.vstack(pair_rows)
        vectorizedTarget = np.vstack(target_rows)
    else:
        vectorizedPairs, vectorizedTarget = pair_rows[0], target_rows[0]

    # maxstd and size are numbers, so they are converted before being
    # concatenated into the output file names.
    prefix = path + "vectorized" + "_maxstd" + str(maxstd) + "_size" + str(size)
    np.savetxt(prefix + filepairs, vectorizedPairs)
    np.savetxt(prefix + filetargets, vectorizedTarget)
    return vectorizedPairs, vectorizedTarget
def googleimage_lin(my_input_folder,
                    my_new_folder,
                    my_valid_folder,
                    amplitudes=[100, 128, 128],
                    dimensions=[5, 5, 5],
                    seg=True,
                    tau=3,
                    kde=True,
                    chop=True,
                    chop_size=3500,
                    sample=False,
                    sample_size=700,
                    title='My Title'):
    """Take folder MY_INPUT_FOLDER of images, create folder MY_NEW_FOLDER
    containing segmented images and/or 3D Histogram of pixel distribution.

    Images can be scraped from google images using
    https://github.com/hardikvasa/google-images-download.

    MY_INPUT_FOLDER -- Absolute filepath to the folder of images you wish to
        segment and/or plot
    MY_NEW_FOLDER -- Absolute filepath to the folder to where segmented images
        and/or plots and data will be exported (must not exist yet)
    MY_VALID_FOLDER -- Absolute filepath to folder containing valid_lab.pkl
        and valid_rgb.pkl
    AMPLITUDES -- Amplitudes of each axis [L a b]; axes extend from 0 to L,
        -a to a, and -b to b
    DIMENSIONS -- Dimensions of bins in CIELAB space [L a b]; all pixels
        within confines of a bin take on the same color value
    SEG -- If True, segment each image using an approximation of the
        Lin et. al, 2013 method
        http://vis.stanford.edu/papers/semantically-resonant-colors
    TAU -- Segmentation parameter, in CIELAB units
    KDE -- If True, use a Kernel Density Estimate to smooth results in CIELAB
        space after data collection https://bitbucket.org/lbl-cascade/fastkde
    CHOP -- If True, plot only the CHOP_SIZE most frequent values (only
        affects visualization)
    SAMPLE -- If True, use SAMPLE_SIZE to randomly thin out data if plots are
        too dense (only affects visualization)
    TITLE -- Plot title

    Writes 'histogram 3D.svg', 'colors.pkl' and 'densities.pkl' (plus one
    segmentation PNG per segmented image) into MY_NEW_FOLDER. The two pickles
    hold matching lists sorted by descending density. Returns None.
    """
    # ------------------------------------------------------------------
    # INITIALIZATION
    # ------------------------------------------------------------------
    assert (not os.path.exists(my_new_folder))
    os.makedirs(my_new_folder)

    Lw, aw, bw = dimensions[0], dimensions[1], dimensions[2]  # bin widths
    L_amp, a_amp, b_amp = amplitudes[0], amplitudes[1], amplitudes[2]
    # Number of 1D bins per axis. np.linspace needs an integer sample count,
    # so truncate the (possibly fractional) quotient.
    Lbins = int(L_amp / Lw)
    abins = int(a_amp * 2 / aw)
    bbins = int(b_amp * 2 / bw)

    L_list, a_list, b_list = [], [], []  # binned value of every kept pixel
    unique_bins = {}  # (Lbin, abin, bbin) -> absolute frequency
    total_images, images_skipped = 0, 0

    # Edge vector for each axis
    L_vec = np.linspace(0, L_amp, Lbins + 1)
    a_vec = np.linspace(-a_amp, a_amp, abins + 1)
    b_vec = np.linspace(-b_amp, b_amp, bbins + 1)

    # NOTE: pickle.load executes arbitrary code from the file; these two
    # files must come from a trusted source.
    with open(my_valid_folder + '/valid_lab.pkl', 'rb') as pickle_load:
        valid_lab = pickle.load(pickle_load)
    with open(my_valid_folder + '/valid_rgb.pkl', 'rb') as pickle_load:
        valid_rgb = pickle.load(pickle_load)

    # ------------------------------------------------------------------
    # HELPERS (closures over the per-image state assigned further down)
    # ------------------------------------------------------------------
    def bounder_v2(x, v):
        """Take x and evenly-spaced ordered vector, return 1-based bin number."""
        x0 = v[0]  # minimum value of vector
        w = v[1] - x0  # width of a bin on given axis
        binnum = ceil((x - x0) / w)
        if binnum == 0:  # x sits exactly on the lower edge
            binnum = 1
        return binnum

    def binner_v2(L_input, a_input, b_input):
        """Take an LAB value, return its [L, a, b] bin numbers."""
        return [
            bounder_v2(L_input, L_vec),
            bounder_v2(a_input, a_vec),
            bounder_v2(b_input, b_vec),
        ]

    def sub2ind(ypos, xpos):
        """Take 2D matrix coordinates, return linear index."""
        return imagewidth * ypos + xpos  # imagewidth is set per image

    def neighbors(ypos, xpos):
        """Return the 8 neighboring coordinates, row by row, left to right."""
        return [[ypos + dy, xpos + dx]
                for dy in (-1, 0, 1)
                for dx in (-1, 0, 1)
                if (dy, dx) != (0, 0)]

    def bins2lab(bin_list):
        """Take [Lbin, abin, bbin] and return the bin center [L, a, b]."""
        L = Lw * bin_list[0] - Lw / 2
        a = -a_amp + aw * bin_list[1] - aw / 2
        b = -b_amp + bw * bin_list[2] - bw / 2
        return [L, a, b]

    def grouper(ypos, xpos):
        """Attach pixel to the background component if a neighbor allows it.

        Approximation of the Lin et al., 2013 method: a still-ungrouped pixel
        joins the component when some in-image neighbor already belongs to it
        and lies within ``tau`` in CIELAB distance.
        """
        my_ind = sub2ind(ypos, xpos)
        if my_ind not in pending:  # already grouped
            return
        for n_ypos, n_xpos in neighbors(ypos, xpos):
            if not (0 <= n_ypos < imageheight and 0 <= n_xpos < imagewidth):
                continue
            dist = np.linalg.norm(lab_img[ypos, xpos] - lab_img[n_ypos, n_xpos])
            if dist <= tau and my_array[n_ypos, n_xpos] == num_groups:
                pending.remove(my_ind)
                my_array[ypos, xpos] = my_array[n_ypos, n_xpos]
                break  # don't need to look at any other neighbors

    def pos2bin(ypos, xpos):
        """Bin the pixel of the current image at the given position."""
        pixel = lab_img[ypos, xpos]
        bins = tuple(binner_v2(pixel[0], pixel[1], pixel[2]))
        unique_bins[bins] = unique_bins.get(bins, 0) + 1
        my_vals = bins2lab(bins)
        L_list.append(my_vals[0])
        a_list.append(my_vals[1])
        b_list.append(my_vals[2])

    def bin_all_pixels():
        """Bin every pixel of the current image."""
        for xpos in range(imagewidth):
            for ypos in range(imageheight):
                pos2bin(ypos, xpos)

    # ------------------------------------------------------------------
    # DATA COLLECTION
    # ------------------------------------------------------------------
    for my_image in os.listdir(my_input_folder):
        total_images += 1
        my_image_path = my_input_folder + '/' + my_image
        try:
            rgb_img = mpimg.imread(my_image_path)  # array [height][width][RGB]
        except ValueError as error:
            print(error, ';', '%s was skipped' % my_image)
            images_skipped += 1
            continue
        except OSError as os_error:
            # .DS_store and the like: not an image, so do not count it
            print(os_error, ';', 'No image was skipped')
            total_images -= 1
            continue

        # imread returns 0-1 floats for PNG and integer arrays otherwise;
        # only the integer case needs rescaling to the 0-1 range.
        if np.issubdtype(rgb_img.dtype, np.integer):
            rgb_img = rgb_img / 255

        try:
            lab_img = color.rgb2lab(rgb_img)  # np.array(height, width, [LAB])
        except ValueError as error:
            print(error, ';', '%s was skipped' % my_image)
            images_skipped += 1
            continue

        imageheight, imagewidth = lab_img.shape[0], lab_img.shape[1]

        if not seg:
            bin_all_pixels()
            continue

        # Collect border pixels |=| (top/bottom rows, then left/right columns)
        border_pixels = []
        for xpos in range(1, imagewidth - 1):
            for ypos in (0, imageheight - 1):
                border_pixels.append(lab_img[ypos, xpos])
        for ypos in range(imageheight):
            for xpos in (0, imagewidth - 1):
                border_pixels.append(lab_img[ypos, xpos])

        # b/w background exists if >= 75% of border pixels are within tau of
        # white or of black (Lin et. al)
        white, black = np.array((100, 0, 0)), np.array((0, 0, 0))
        near_white = [np.linalg.norm(p - white) <= tau for p in border_pixels]
        near_black = [np.linalg.norm(p - black) <= tau for p in border_pixels]
        whiteborder = sum(near_white) / len(near_white) >= 0.75
        blackborder = sum(near_black) / len(near_black) >= 0.75
        border = whiteborder or blackborder

        if not border:
            bin_all_pixels()
            continue

        print('%s has a border' % my_image)
        # Linear indices of still-ungrouped pixels; a set keeps the membership
        # test and removal in grouper() O(1).
        pending = set(range(imageheight * imagewidth))
        my_array = np.zeros((imageheight, imagewidth), dtype="int")
        # Top-left and bottom-right pixels are assumed to be background.
        my_array[0, 0] = 1
        my_array[imageheight - 1, imagewidth - 1] = 1
        pending.discard(0)
        pending.discard(imageheight * imagewidth - 1)
        num_groups = 1  # number of different connected components

        # grow the background from the top left of the image ...
        for ypos in range(imageheight):
            for xpos in range(imagewidth):
                grouper(ypos, xpos)
        # ... and again from the bottom right
        for ypos in reversed(range(imageheight)):
            for xpos in reversed(range(imagewidth)):
                grouper(ypos, xpos)

        # ungrouped pixels left over => at least 2 connected components
        if pending:
            num_groups += 1

        if num_groups > 1:
            # bin only the foreground
            for xpos in range(imagewidth):
                for ypos in range(imageheight):
                    if my_array[ypos, xpos] != 1:
                        pos2bin(ypos, xpos)
            plt.close()
            plt.imshow(my_array)
            try:
                plt.savefig(my_new_folder + '/' + my_image + '.png',
                            format='png',
                            bbox_inches='tight')
            except ValueError:
                print(my_image + ' was processed but not saved')
        else:
            # whole image is background-like: treat it as unsegmented
            bin_all_pixels()

    print('Out of %s total images seen, %d were skipped' %
          (total_images, images_skipped))

    # ------------------------------------------------------------------
    # VISUALIZATION
    # ------------------------------------------------------------------
    plt.close()
    varL, vara, varb = np.asarray(L_list), np.asarray(a_list), np.asarray(b_list)
    varlist, vardensity = [], []
    if kde:
        myPDF, axes = fastKDE.pdf(varL, vara, varb)
        varL, vara, varb = axes
        for i_L, L_val in enumerate(varL):
            for i_a, a_val in enumerate(vara):
                for i_b, b_val in enumerate(varb):
                    varlist.append([L_val, a_val, b_val])
                    vardensity.append(myPDF[i_b][i_a][i_L])
    else:
        for bins, frequency in unique_bins.items():
            varlist.append(bins2lab(bins))
            vardensity.append(frequency)

    # Keep color/density pairs together and drop non-positive densities.
    pairs = [(d, c) for d, c in zip(vardensity, varlist) if d > 0]
    if len(pairs) < len(vardensity):
        print("Color-density pairs with density<=0 have been removed")

    # For LAB values outside of RGB gamut, use spatial.cKDTree to find the
    # nearest LAB value inside of RGB gamut. Every color therefore ends up
    # being an entry of valid_lab, so densities are summed per valid_lab index.
    # https://stackoverflow.com/questions/10818546/finding-index-of-nearest-point-in-numpy-arrays-of-x-and-y-coordinates
    valid_lab_tree = spatial.cKDTree(valid_lab)
    valid_lookup = {}  # color tuple -> first index in valid_lab
    for index, valid_color in enumerate(valid_lab):
        valid_lookup.setdefault(tuple(valid_color), index)

    merged = {}  # valid_lab index -> summed density
    snapped = 0
    for density, lab_color in pairs:
        index = valid_lookup.get(tuple(lab_color))
        if index is None:
            nearest = int(valid_lab_tree.query(lab_color)[1])
            index = valid_lookup[tuple(valid_lab[nearest])]
            snapped += 1
        merged[index] = merged.get(index, 0) + density
    print('%d colors were snapped to the nearest in-gamut color' % snapped)

    # Sort by descending density, keeping colors and densities aligned.
    ranked = sorted(merged.items(), key=operator.itemgetter(1), reverse=True)
    s_indices = [index for index, _ in ranked]
    s_varlist = [valid_lab[index] for index in s_indices]
    s_vardensity = [density for _, density in ranked]

    # sorted and chopped (visualization only)
    keep = len(s_indices)
    if chop:
        if chop_size < keep:
            keep = chop_size
        else:
            print(
                'Chop size of %d >= number of non-zero values %d. No values were chopped.'
                % (chop_size, keep))
    plot_indices = s_indices[:keep]
    plot_densities = s_vardensity[:keep]

    # to thin out plot, use a random sample (never larger than the data)
    if sample:
        population = list(zip(plot_indices, plot_densities))
        chosen = random.sample(population, min(sample_size, len(population)))
        plot_indices = [index for index, _ in chosen]
        plot_densities = [density for _, density in chosen]

    x = [float(valid_lab[index][0]) for index in plot_indices]  # L
    y = [float(valid_lab[index][1]) for index in plot_indices]  # a
    z = [float(valid_lab[index][2]) for index in plot_indices]  # b

    # color points by position in CIELAB space
    colors = np.asarray([list(valid_rgb[index][0]) for index in plot_indices])

    plt.close()
    fig, ax = plt.subplots(subplot_kw=dict(projection='3d'))
    # marker size scales with density; the factor depends on the density unit
    if kde:
        sizes = [density * 80000 for density in plot_densities]
    else:
        sizes = [density / 100 for density in plot_densities]
    ax.scatter(z, y, x, s=sizes, c=colors)
    ax.set_xlabel('b')
    ax.set_ylabel('a')
    ax.set_zlabel('L')
    ax.set_xlim([-b_amp, b_amp])
    ax.set_ylim([-a_amp, a_amp])
    ax.set_zlim([0, L_amp])
    plt.title(title)
    plt.savefig(my_new_folder + '/' + 'histogram 3D' + '.svg',
                format='svg',
                bbox_inches='tight')

    with open(my_new_folder + '/' + 'colors.pkl', 'wb') as pickle_file:
        pickle.dump(s_varlist, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)
    with open(my_new_folder + '/' + 'densities.pkl', 'wb') as pickle_file:
        pickle.dump(s_vardensity,
                    pickle_file,
                    protocol=pickle.HIGHEST_PROTOCOL)
# NOTE(review): this fragment began in the middle of the font dict literal
# (no `font = {` opener), which is a syntax error. The opening below is
# reconstructed from the identical font setup used by the 2D example script
# later in this file -- confirm the 'family' value.
font = {'family': 'serif',
        'size': '15',
        'weight': 'bold'}
mpl.rc('font', **font)
# needed for bold axis labels in more recent version of matplotlib
mpl.rc('axes', labelweight='bold')

N = int(1e3)  # number of points

# generate 3 independent samples from 3 different distributions
x_1 = stats.norm.rvs(size=N)
x_2 = stats.gamma.rvs(2, size=N)
x_3 = stats.betaprime.rvs(5, 6, size=N)

# calculate the 3D PDF
# simply add more variables to the argument list for higher dimensions;
# note though that memory quickly becomes an issue.
# the numPoints argument results in a coarser PDF--but one that is calculated
# faster (and with less memory)
pdf, values = fastKDE.pdf(x_1, x_2, x_3, numPoints=[65, 65, 65])

# calculate the index of the mode of the distribution
# (we'll plot 2D slices through the mode)
i_mode_ravel = argmax(pdf.ravel())
nmode = unravel_index(i_mode_ravel, shape(pdf))

# set the levels
clevels = linspace(0, pdf[nmode], 64)

# create the plot
fig, axs = PP.subplots(1, 3, figsize=(15, 5))
def featurizePairs(pathinput,pairsfilename,targetfilename, publicinfofilename, filepairsvectorized, filetargetsvectorized, maxstd, size, featurizingmethod = 0, ratio=1, doubletrainset=True, isTestSet = False, onlynumerical = False):
    """Featurize (A, B) sample pairs from one or more CSV datasets.

    For every index ``i`` the files ``pathinput[i] + <name>[i] + ".csv"`` are
    read (indexed by ``SampleID``) and concatenated. Columns ``A`` and ``B``
    hold whitespace-separated samples; each series is passed through ``scale``
    and samples outside ``(-maxstd, maxstd)`` on either axis are dropped
    before the density features are computed.

    Parameters
    ----------
    pathinput, pairsfilename, targetfilename, publicinfofilename : sequence of str
        Parallel sequences of directory prefixes and file stems.
        ``targetfilename`` is not read when ``isTestSet`` is true.
    filepairsvectorized, filetargetsvectorized : str
        Output paths passed to ``np.savetxt``.
    maxstd : number
        Clipping bound applied to the scaled values.
    size : int
        Grid resolution (``numPoints=size+1`` for the fastKDE methods).
    featurizingmethod : int
        0: ``getHisto``; 1: ``fastKDE.pdf``; 2: both ``fastKDE.conditional``
        densities side by side; 3: both conditionals stacked along a third
        axis. For methods 1-3 the first row and column are dropped.
    ratio : float
        Fraction of the pairs (taken from the start) to process.
    doubletrainset : bool
        If true, also emit the swapped pair (with the mirrored label).
    isTestSet : bool
        If true, no targets are read, saved or returned.
    onlynumerical : bool
        If true, skip pairs whose ``A type``/``B type`` are not both
        ``"Numerical"``.

    Returns
    -------
    numpy.ndarray or (numpy.ndarray, numpy.ndarray)
        ``vectorizedPairs`` alone for a test set, otherwise
        ``(vectorizedPairs, vectorizedTarget)``. Arrays are always 2-D with
        one row per emitted sample; targets are one-hot over ``(-1, 0, 1)``.

    Raises
    ------
    ValueError
        For an unknown ``featurizingmethod``, an unknown target value, or when
        no pair at all could be featurized.
    """
    if featurizingmethod not in (0, 1, 2, 3):
        raise ValueError("unknown featurizingmethod: %r" % (featurizingmethod,))

    # target value -> (one-hot for A->B ordering, one-hot for the swapped pair)
    target_codes = {
        -1: ([1, 0, 0], [0, 0, 1]),
        0: ([0, 1, 0], [0, 1, 0]),
        1: ([0, 0, 1], [1, 0, 0]),
    }

    print(pathinput)
    pairs_frames, info_frames, target_frames = [], [], []
    for i in range(len(pathinput)):
        pairs_frames.append(
            pd.read_csv(pathinput[i] + pairsfilename[i] + ".csv", index_col="SampleID"))
        info_frames.append(
            pd.read_csv(pathinput[i] + publicinfofilename[i] + ".csv", index_col="SampleID"))
        if not isTestSet:
            target_frames.append(
                pd.read_csv(pathinput[i] + targetfilename[i] + ".csv", index_col="SampleID"))
    # pd.concat replaces the DataFrame.append chain (removed in pandas 2.0).
    dfpairsGlobal = pd.concat(pairs_frames)
    dfpublicinfoGlobal = pd.concat(info_frames)
    if not isTestSet:
        dftargetGlobal = pd.concat(target_frames)

    print("Total number of pairs to featurize : " + str(dfpairsGlobal.shape[0]))

    kde_options = dict(numPoints=size + 1, axisExpansionFactor=0.1)
    # Rows are collected in lists and stacked once at the end.
    pair_rows, target_rows = [], []
    for k in range(int(dfpairsGlobal.shape[0] * ratio)):
        if k % 100 == 0:
            print(k)

        A = dfpairsGlobal['A'].iloc[k]
        B = dfpairsGlobal['B'].iloc[k]
        both_numerical = (dfpublicinfoGlobal['A type'].iloc[k] == "Numerical"
                          and dfpublicinfoGlobal['B type'].iloc[k] == "Numerical")
        if onlynumerical and not both_numerical:
            continue

        if not isTestSet:
            target = dftargetGlobal['Target'].iloc[k]
            if target not in target_codes:
                raise ValueError("unexpected target %r for pair %d" % (target, k))
            code_xy, code_yx = target_codes[target]

        x = scale(np.array(A.split(), dtype=float))
        y = scale(np.array(B.split(), dtype=float))
        mask = (x > -maxstd) & (x < maxstd) & (y > -maxstd) & (y < maxstd)
        x = x[mask]
        y = y[mask]

        try:
            pYX = None  # only method 3 builds the swapped features explicitly
            if featurizingmethod == 0:
                pXY = getHisto(x, y, size, maxstd)
            elif featurizingmethod == 1:
                pXY, _ = fastKDE.pdf(x, y, **kde_options)
                pXY = pXY[1:, 1:]
            else:
                pOfXGivenY, _ = fastKDE.conditional(x, y, **kde_options)
                pOfYGivenX, _ = fastKDE.conditional(y, x, **kde_options)
                if featurizingmethod == 2:
                    pXY = np.hstack((pOfYGivenX[1:, 1:], pOfXGivenY[1:, 1:]))
                else:
                    pXY = np.dstack((pOfYGivenX, pOfXGivenY))[1:, 1:]
                    pYX = np.dstack((pOfXGivenY, pOfYGivenX))[1:, 1:]
        except ValueError:
            # density estimation failed for this pair: skip it
            print("pbkde nbpoints pairs " + str(k))
            continue

        pair_rows.append(np.ravel(pXY))
        if doubletrainset:
            swapped = pYX if pYX is not None else np.transpose(pXY)
            pair_rows.append(np.ravel(swapped))
        if not isTestSet:
            target_rows.append(code_xy)
            if doubletrainset:
                target_rows.append(code_yx)

    if not pair_rows:
        raise ValueError("no pair could be featurized")

    vectorizedPairs = np.vstack(pair_rows)
    np.savetxt(filepairsvectorized, vectorizedPairs)
    if isTestSet:
        return vectorizedPairs

    vectorizedTarget = np.array(target_rows)
    np.savetxt(filetargetsvectorized, vectorizedTarget)
    return vectorizedPairs, vectorizedTarget
# -*- coding: utf-8 -*-
"""
Created on Tue May 24 18:05:04 2016

@author: bing
"""
#!python
import numpy as np
from fastkde import fastKDE
import pylab as PP

# Generate two random variables dataset (representing 200000 pairs of
# datapoints). The sample count must be an int: np.random.normal rejects a
# float `size`.
N = int(2e5)
var1 = 50*np.random.normal(size=N) + 0.1
var2 = 0.01*np.random.normal(size=N) - 300

# Do the self-consistent density estimate
myPDF, axes = fastKDE.pdf(var1, var2)

# Extract the axes from the axis list
v1, v2 = axes

# Plot contours of the PDF: should be a set of concentric ellipsoids centered
# on (0.1, -300). Comparatively, the y axis range should be tiny and the x
# axis range should be large.
PP.contour(v1, v2, myPDF)
PP.show()
def featurizePairs(pathinput,pairsfilename,targetfilename, publicinfofilename, pathoutput, maxstd, size, featurizingmethod = 0, ratio=1, doubletrainset=True):
    """Featurize the numerical (A, B) pairs of one or more CSV datasets.

    For every index ``i`` the files ``pathinput[i] + <name>[i] + ".csv"`` are
    read (indexed by ``SampleID``) and concatenated. Only pairs whose
    ``A type`` and ``B type`` are both ``"Numerical"`` are used. Columns ``A``
    and ``B`` hold whitespace-separated samples; each series is passed through
    ``scale`` and samples outside ``(-maxstd, maxstd)`` on either axis are
    dropped before the density features are computed.

    Parameters
    ----------
    pathinput, pairsfilename, targetfilename, publicinfofilename : sequence of str
        Parallel sequences of directory prefixes and file stems.
    pathoutput : str
        Prefix of the two output files written with ``np.savetxt``.
    maxstd : number
        Clipping bound applied to the scaled values.
    size : int
        Grid resolution (``numPoints=size+1`` for the fastKDE methods).
    featurizingmethod : int
        0: ``getHisto``; 1: ``fastKDE.pdf``; 2: both ``fastKDE.conditional``
        densities side by side. For methods 1-2 the first row and column are
        dropped.
    ratio : float
        Fraction of the pairs (taken from the start) to process.
    doubletrainset : bool
        If true, also emit the transposed features with the mirrored label.

    Returns
    -------
    vectorizedPairs, vectorizedTarget : numpy.ndarray
        Always 2-D, one row per emitted sample; targets are one-hot over
        ``(-1, 0, 1)``.

    Raises
    ------
    ValueError
        For an unknown ``featurizingmethod``, an unknown target value, or when
        no pair at all could be featurized.
    """
    if featurizingmethod not in (0, 1, 2):
        raise ValueError("unknown featurizingmethod: %r" % (featurizingmethod,))

    # target value -> (one-hot for A->B ordering, one-hot for the swapped pair)
    target_codes = {
        -1: ([1, 0, 0], [0, 0, 1]),
        0: ([0, 1, 0], [0, 1, 0]),
        1: ([0, 0, 1], [1, 0, 0]),
    }

    print(pathinput)
    pairs_frames, target_frames, info_frames = [], [], []
    for i in range(len(pathinput)):
        pairs_frames.append(
            pd.read_csv(pathinput[i] + pairsfilename[i] + ".csv", index_col="SampleID"))
        target_frames.append(
            pd.read_csv(pathinput[i] + targetfilename[i] + ".csv", index_col="SampleID"))
        info_frames.append(
            pd.read_csv(pathinput[i] + publicinfofilename[i] + ".csv", index_col="SampleID"))
    # pd.concat replaces the DataFrame.append chain (removed in pandas 2.0).
    dfpairsGlobal = pd.concat(pairs_frames)
    dftargetGlobal = pd.concat(target_frames)
    dfpublicinfoGlobal = pd.concat(info_frames)

    print(dfpairsGlobal.shape[0])

    kde_options = dict(numPoints=size + 1, axisExpansionFactor=0.1)
    # Rows are collected in lists and stacked once at the end.
    pair_rows, target_rows = [], []
    for k in range(int(dfpairsGlobal.shape[0] * ratio)):
        if k % 100 == 0:
            print(k)

        if not (dfpublicinfoGlobal['A type'].iloc[k] == "Numerical"
                and dfpublicinfoGlobal['B type'].iloc[k] == "Numerical"):
            continue

        A = dfpairsGlobal['A'].iloc[k]
        B = dfpairsGlobal['B'].iloc[k]
        target = dftargetGlobal['Target'].iloc[k]
        if target not in target_codes:
            raise ValueError("unexpected target %r for pair %d" % (target, k))
        code_xy, code_yx = target_codes[target]

        x = scale(np.array(A.split(), dtype=float))
        y = scale(np.array(B.split(), dtype=float))
        mask = (x > -maxstd) & (x < maxstd) & (y > -maxstd) & (y < maxstd)
        x = x[mask]
        y = y[mask]

        try:
            if featurizingmethod == 0:
                pXY = getHisto(x, y, size, maxstd)
            elif featurizingmethod == 1:
                pXY, _ = fastKDE.pdf(x, y, **kde_options)
                pXY = pXY[1:, 1:]
            else:
                pOfXGivenY, _ = fastKDE.conditional(x, y, **kde_options)
                pOfYGivenX, _ = fastKDE.conditional(y, x, **kde_options)
                pXY = np.hstack((pOfYGivenX[1:, 1:], pOfXGivenY[1:, 1:]))
        except ValueError:
            # density estimation failed for this pair: skip it
            print("pbkde nbpoints pairs " + str(k))
            continue

        pair_rows.append(np.ravel(pXY))
        target_rows.append(code_xy)
        if doubletrainset:
            pair_rows.append(np.ravel(np.transpose(pXY)))
            target_rows.append(code_yx)

    if not pair_rows:
        raise ValueError("no pair could be featurized")

    vectorizedPairs = np.vstack(pair_rows)
    vectorizedTarget = np.array(target_rows)

    prefix = (pathoutput + "vectorized" + "_maxstd" + str(maxstd) + "_size" +
              str(size) + "_method" + str(featurizingmethod))
    np.savetxt(prefix + pairsfilename[0], vectorizedPairs)
    np.savetxt(prefix + targetfilename[0], vectorizedTarget)
    return vectorizedPairs, vectorizedTarget
from fastkde import fastKDE
from scipy import stats
import pylab as PP
import matplotlib as mpl
import numpy as np

# Figure typography: bold 15pt serif text everywhere.
font = dict(family='serif', size='15', weight='bold')
mpl.rc('font', **font)
# Recent matplotlib versions need this as well to get bold axis labels.
mpl.rc('axes', labelweight='bold')

# Draw N samples for each of two independent normal variables: a wide one
# centered on 0.1 and a very narrow one centered on -300.
N = int(2e5)
var1 = 0.1 + 50 * np.random.normal(size=N)
var2 = -300 + 0.01 * np.random.normal(size=N)

# Self-consistent density estimate of the joint distribution.
myPDF, axes = fastKDE.pdf(var1, var2)

# One coordinate vector per variable.
v1, v2 = axes

# The contours should come out as concentric ellipses around (0.1, -300),
# spanning a large x range and a tiny y range.
PP.contour(v1, v2, myPDF)
PP.show()
# Timing and plotting fragment for a 2D beam density map.
# Python 2 syntax (print statements, time.clock).
# NOTE(review): `start` and `beam` are not defined anywhere in this fragment;
# presumably they are set by code that precedes it -- confirm.
print 'Read beam files = ', time.clock() - start

# Beam coordinates scaled by 100.
m1 = 100 * beam.x
m2 = 100 * beam.y
print max(beam.x)

# Plot window and a 100 x 100 evaluation grid over it.
xmin = -0.03
xmax = 0.03
ymin = -0.03
ymax = 0.03
X, Y = np.mgrid[xmin:xmax:100j, ymin:ymax:100j]
print 'Grid = ', time.clock() - start

# Grid flattened to a (2, 10000) array of evaluation positions.
positions = np.vstack([X.ravel(), Y.ravel()])
print 'Positions = ', time.clock() - start

# NOTE(review): `values` is not used again in this fragment.
values = [m1, m2]
print 'Values = ', time.clock() - start

kernel, axes = fastKDE.pdf(m1, m2)
print 'Kernel = ', time.clock() - start

# NOTE(review): the other fastKDE.pdf call sites in this file treat the first
# return value as a PDF array (e.g. it is passed straight to contour), while
# here it is called like a function. That looks like the scipy gaussian_kde
# calling convention -- confirm this line can actually run.
kpos = kernel(positions)
print 'Kpos = ', time.clock() - start

# Back to the grid shape for imshow.
Z = np.reshape(kpos.T, X.shape)
print 'Z= ', time.clock() - start

import matplotlib.pyplot as plt
fig, ax = plt.subplots()
ax.imshow(np.rot90(Z), cmap=plt.cm.gist_earth_r, extent=[xmin, xmax, ymin, ymax])
# ax.plot(m1[::10], m2[::10], 'k.', markersize=2)
ax.set_xlim([xmin, xmax])
ax.set_ylim([ymin, ymax])
plt.show()