def bic_kmeans(features, **kwargs):
    '''
    Run kmeans on features with **kwargs given to scipy.cluster.vq.kmeans
    for different numbers of clusters, k. Choose, finally, the clustering
    that minimizes the Bayesian Information Criterion (BIC).
    '''
    max_k = int(2*numpy.log(len(features)))
    base_distances = vq(features, numpy.array([numpy.average(features, axis=0)]))[1]
    base_std = numpy.std(base_distances)
    centers_list = []
    bic_list = []
    distances_list = []
    for k in range(1, max_k+1):
        centers = kmeans(features, k, **kwargs)[0]
        clusters, distances = vq(features, centers)
        bic = calculate_bic(clusters, distances, base_std)
        centers_list.append(centers)
        distances_list.append(distances)
        bic_list.append(bic)
    best_index = numpy.argmin(bic_list)
    return centers_list[best_index], distances_list[best_index]
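# The function above calls a calculate_bic() helper that is not shown here.
# A minimal sketch of what such a helper could look like, assuming spherical
# clusters of roughly equal variance and using the one-cluster spread
# (base_std) as the reference scale; this is an illustrative assumption,
# not the original author's code.
def calculate_bic(clusters, distances, base_std):
    n = len(distances)
    k = len(numpy.unique(clusters))
    rss = numpy.sum((distances / base_std) ** 2)        # normalized within-cluster error
    return n * numpy.log(rss / n) + k * numpy.log(n)    # fit term + complexity penalty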
def _get_centroid_mask(overtones):
    flat = overtones.reshape((len(overtones) * 48, overtones.shape[2]))
    f0flat = flat[np.argmax(flat, 1) == 0]
    f0flat = f0flat[np.max(f0flat, 1) > 0]
    f0flat = (f0flat.T / np.max(f0flat, 1)).T

    centroids, distortion = kmeans(f0flat, 24)
    codes, dists = vq(f0flat, centroids)
    #centroids = centroids[np.bincount(codes) > np.median(np.bincount(codes))]

    flat_norm = (flat.T / np.max(flat, 1)).T
    codes, dists = vq(flat_norm, centroids)

    flat_filtered = np.copy(flat)
    for i, (s, c) in enumerate(zip(flat, codes)):
        if c < 0 or c >= len(centroids):   # valid codes index the centroid rows 0..len-1
            continue
        centroid = centroids[c]
        centroid_denorm = centroid * np.max(s)
        flat_filtered[i, 1:] -= centroid_denorm[1:]
        flat_filtered[i, 1:] = np.maximum(flat_filtered[i, 1:], 0)

    overtones_filtered = flat_filtered.reshape(overtones.shape)
    return overtones_filtered
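# A minimal usage sketch with hypothetical shapes: the function expects a
# (frames, 48, harmonics) array of overtone amplitudes and subtracts the
# matched fundamental template from everything above the first harmonic.
# The shape and random data here are assumptions for illustration only.
overtones = np.random.rand(200, 48, 16)
overtones_filtered = _get_centroid_mask(overtones)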
def getImageDescriptor(model, im, conf): im = standardizeImage(im) height, width = im.shape[:2] numWords = model.vocab.shape[1] frames, descrs = getPhowFeatures(im, conf.phowOpts) # quantize appearance if model.quantizer == 'vq': binsa, _ = vq(descrs.T, model.vocab.T) elif model.quantizer == 'kdtree': raise ValueError('quantizer kdtree not implemented') else: raise ValueError('quantizer {0} not known or understood'.format(model.quantizer)) hist = [] for n_spatial_bins_x, n_spatial_bins_y in zip(model.numSpatialX, model.numSpatialX): binsx, distsx = vq(frames[0, :], linspace(0, width, n_spatial_bins_x)) binsy, distsy = vq(frames[1, :], linspace(0, height, n_spatial_bins_y)) # binsx and binsy list to what spatial bin each feature point belongs to if (numpy.any(distsx < 0)) | (numpy.any(distsx > (width/n_spatial_bins_x+0.5))): print ("something went wrong") import pdb; pdb.set_trace() if (numpy.any(distsy < 0)) | (numpy.any(distsy > (height/n_spatial_bins_y+0.5))): print ("something went wrong") import pdb; pdb.set_trace() # combined quantization number_of_bins = n_spatial_bins_x * n_spatial_bins_y * numWords temp = arange(number_of_bins) # update using this: http://stackoverflow.com/questions/15230179/how-to-get-the-linear-index-for-a-numpy-array-sub2ind temp = temp.reshape([n_spatial_bins_x, n_spatial_bins_y, numWords]) bin_comb = temp[binsx, binsy, binsa] hist_temp, _ = histogram(bin_comb, bins=range(number_of_bins+1), density=True) hist.append(hist_temp) hist = hstack(hist) hist = array(hist, 'float32') / sum(hist) return hist
def read_unclustered_data(filename, num_clusters, cl_type="kMeans"): """Return dictionary of cluster id to array of points. Given a filename in the format of lat, lng generate k clusters based on arguments. Outputs a dictionary with the cluster id as the key mapped to a list of lat, lng pts """ request_points = [] with open(filename, 'rb') as input_file: input_file.next() # Skip the header row for line in input_file: lat, lng = line.split(',') request_points.append((float(lat), float(lng))) request_points = array(request_points) if cl_type == "kMeans": # computing K-Means with K = num_clusters centroids, _ = kmeans(request_points, int(num_clusters)) # assign each sample to a cluster idx, _ = vq(request_points, centroids) else: # computeing kMedoids using distance matrix centroids = get_kmedoids(request_points, int(num_clusters)) # assign each sample to a cluster idx, _ = vq(request_points, centroids) # map cluster lat, lng to cluster index cluster_points = defaultdict(list) for i in xrange(len(request_points)): lat, lng = request_points[i] cluster_points[idx[i]].append((lat, lng)) return cluster_points
def _get_larger_chroms(ref_file): """Retrieve larger chromosomes, avoiding the smaller ones for plotting. """ from scipy.cluster.vq import kmeans, vq all_sizes = [] for c in ref.file_contigs(ref_file): all_sizes.append(float(c.size)) all_sizes.sort() # separate out smaller chromosomes and haplotypes with kmeans centroids, _ = kmeans(np.array(all_sizes), 2) idx, _ = vq(np.array(all_sizes), centroids) little_sizes = tz.first(tz.partitionby(lambda xs: xs[0], zip(idx, all_sizes))) little_sizes = [x[1] for x in little_sizes] # create one more cluster with the smaller, removing the haplotypes centroids2, _ = kmeans(np.array(little_sizes), 2) idx2, _ = vq(np.array(little_sizes), centroids2) little_sizes2 = tz.first(tz.partitionby(lambda xs: xs[0], zip(idx2, little_sizes))) little_sizes2 = [x[1] for x in little_sizes2] # get any chromosomes not in haplotype/random bin thresh = max(little_sizes2) larger_chroms = [] for c in ref.file_contigs(ref_file): if c.size > thresh: larger_chroms.append(c.name) return larger_chroms
def run(self, features, number_of_clusters='3', restarts=10):
    if number_of_clusters != 'Use BIC':
        k = int(number_of_clusters)
        if k == 1:
            result = numpy.zeros(len(features), dtype=numpy.int32)
            return [result]
        return [vq(features, kmeans(features, k, iter=restarts)[0])[0]]
    else:
        return [vq(features, bic_kmeans(features, iter=restarts)[0])[0]]
def sphere_tissue_image(size=100, n_points=12): center = np.array([size/2,size/2,size/2],float) radius = size/4. points = {} for p in range(n_points): theta = np.random.rand()*2.*np.pi phi = np.random.rand()*np.pi - np.pi/2. points[p+3] = center + radius*np.array([np.cos(theta)*np.cos(phi),np.sin(theta)*np.cos(phi),np.sin(phi)]) points = array_dict(points) point_target_area = 4.*np.pi*np.power(radius,2.)/float(n_points) point_target_distance = np.power(point_target_area/np.pi,0.5) sigma_deformation = (size/100.)*(20./n_points) omega_forces = dict(distance=0.1*size/100., repulsion=100.0*np.power(size/100.,2)) for iterations in xrange(100): point_vectors = np.array([points[p]- points.values() for p in points.keys()]) point_distances = np.array([vq(points.values(),np.array([points[p]]))[1] for p in points.keys()]) point_vectors = point_vectors/(point_distances[...,np.newaxis]+1e-7) point_distance_forces = omega_forces['distance']*((point_distances-point_target_distance)[...,np.newaxis]*point_vectors/point_target_distance).sum(axis=1) point_repulsion_forces = omega_forces['repulsion']*np.power(point_target_distance,2)*(point_vectors/(np.power(point_distances,2)+1e-7)[...,np.newaxis]).sum(axis=1) point_forces = np.zeros((len(points),3)) point_forces += point_distance_forces point_forces += point_repulsion_forces point_forces = np.minimum(1.0,sigma_deformation/np.linalg.norm(point_forces,axis=1))[:,np.newaxis] * point_forces new_points = points.values() + point_forces new_points = center+ radius*((new_points-center)/np.linalg.norm((new_points-center),axis=1)[:,np.newaxis]) points = array_dict(new_points,points.keys()) points[2] = center coords = np.transpose(np.mgrid[0:size,0:size,0:size],(1,2,3,0)).reshape((np.power(size,3),3)).astype(int) labels = points.keys()[vq(coords,points.values())[0]] ext_coords = coords[vq(coords,np.array([center]))[1]>size/3.] img = np.ones((size,size,size),np.uint8) img[tuple(np.transpose(coords))] = labels img[tuple(np.transpose(ext_coords))] = 1 img = SpatialImage(img,resolution=(60./size,60./size,60./size)) return img
def performance_measure(reference_set, experimental_set, measure='jaccard_index'):
    VP = (vq(experimental_set, reference_set)[1] == 0).sum()   # true positives (exact matches)
    FP = (vq(experimental_set, reference_set)[1] > 0).sum()    # false positives
    FN = (vq(reference_set, experimental_set)[1] > 0).sum()    # false negatives
    if measure == 'true_positive':
        return VP
    elif measure == 'precision':
        return VP/float(VP+FP)
    elif measure == 'recall':
        return VP/float(VP+FN)
    elif measure == 'dice_index':
        return 2*VP / float(2*VP+FP+FN)
    elif measure == 'jaccard_index':
        return VP/float(VP+FP+FN)
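# Example with hypothetical 2-D point sets: two exact matches, one false
# positive and one false negative give a Jaccard index of 2/(2+1+1) = 0.5.
import numpy as np
reference_set = np.array([[0., 0.], [1., 0.], [0., 1.]])
experimental_set = np.array([[0., 0.], [1., 0.], [5., 5.]])
print(performance_measure(reference_set, experimental_set, 'jaccard_index'))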
def kmeans(iData, clustNumber, oPrefix, norm=False): '''Perform k-means cluster analysis and return MAP of zones''' print 'Run K-Means' height, width = iData.shape[1:3] #reshape 3D cube of data into 2D matrix and get indeces of valid pixels iData, notNanDataI = cube2flat(iData) if norm: #center and norm iDataMean = iData[:, notNanDataI].mean(axis=1) iDataStd = iData[:, notNanDataI].std(axis=1) iData = np.subtract(iData.T, iDataMean).T iData = np.divide(iData.T, iDataStd).T #perform kmeans on valid data and return codebook codeBook = vq.kmeans(iData[:, notNanDataI].astype('f8').T, clustNumber)[0] #perform vector quantization of input data uzing the codebook #return vector of labels (for each valid pixel) labelVec = vq.vq(iData[:, notNanDataI].astype('f8').T, codeBook)[0]+1 #create and fill MAP of zones zoneMap = np.zeros(width*height) + np.nan zoneMap[notNanDataI] = labelVec zoneMap = zoneMap.reshape(height, width) #visualize map of zones plt.imsave(oPrefix + 'zones.png', zoneMap) return zoneMap
def clustering_scipy_kmeans(features, n_clust = 8): """ """ whitened = whiten(features) print whitened.shape initial = [kmeans(whitened,i) for i in np.arange(1,12)] plt.plot([var for (cent,var) in initial]) plt.show() #cent, var = initial[3] ##use vq() to get as assignment for each obs. #assignment,cdist = vq(whitened,cent) #plt.scatter(whitened[:,0], whitened[:,1], c=assignment) #plt.show() codebook, distortion = kmeans(whitened, n_clust) print codebook, distortion assigned_label, dist = vq(whitened, codebook) for ii in range(8): plt.subplot(4,2,ii+1) plt.plot(codebook[ii]) plt.show() centroid, label = kmeans2(whitened, n_clust, minit = 'points') print centroid, label for ii in range(8): plt.subplot(4,2,ii) plt.plot(centroid[ii]) plt.show()
def new_labelled_page(no_of_samples:int, window_size:int, page_scale:int or tuple, labelled_centroids:[tuple], page_paths:[str]): ### Duplication from above weighter = gaussian_weighter(window_size) windower = f.partial(win_centred_on, window=window_size) shifter = f.partial(point_shift, window=window_size) scaler = img_scaler(page_scale) make_observations = compose(prepare_features, real_fft) img, label = open_image_label(*page_paths) img, label = scaler(img, label) f_img = prepare_fft_image(img, window_size) access_img = img_accessor(img, identity) access_label = img_accessor(label, identity) access_f_img = img_accessor(f_img, compose(windower, shifter)) ### End of duplication labels = [a[0] for a in labelled_centroids] centroids = np.asarray([a[1] for a in labelled_centroids]) new_label = np.zeros_like(label) for s in img_slices(new_label.shape, 80): unlabelled_samples = sample_all_in_area(s, applier(identity, compose(make_observations, access_f_img))) coords = [a[0] for a in unlabelled_samples] observations = np.asarray([a[1] for a in unlabelled_samples]) codes, dist = vq.vq(observations, centroids) for i, code in zip(coords, codes): new_label[i] = labels[code] return new_label
def computeHistograms(codebook, descriptors):
    code, dist = vq.vq(descriptors, codebook)
    bins = range(codebook.shape[0] + 1)
    #print "bins:", bins
    # density=True (the replacement for the removed `normed` keyword) gives a
    # normalized histogram over the codebook entries
    histogram_of_words, bin_edges = np.histogram(code, bins, density=True)
    #print histogram_of_words
    return histogram_of_words
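# Minimal usage sketch with hypothetical descriptors: build a codebook with
# k-means, then turn one image's descriptors into a word histogram. The
# shapes and codebook size are assumptions for illustration.
import numpy as np
from scipy.cluster import vq
descriptors = np.random.rand(500, 128).astype('float32')
codebook, _ = vq.kmeans(descriptors, 50)
hist = computeHistograms(codebook, descriptors)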
def kmeans(features, projection, ite = 50, k = 4, threshold = 1e-5): """ perform k_keamns clustering and return a the result as a subsapce clustering object """ from scipy.cluster.vq import kmeans, vq import datetime from measures import spatial_coherence centroids, distance = kmeans(features, k, iter=ite, thresh=threshold) code, _ = vq(features, centroids) run_ = datetime.datetime.now().strftime("%y_%m_%d_%H_%M") params = "projection_size=%d, k=%d" %(len(projection), k) clusters = clusters_from_code(code, k, projection) clustering_id = "(%s)_(%s)_(%s)_(%s)" %("exhaustive_kmeans", params, run_, projection) #print clustering_id km_clt = KMClustering(algorithm ="exhaustive_kmeans", parameters = params, run = run_, clustering_id = clustering_id, clusters = clusters, ccontains_noise = False, cclustering_on_dimension = True) measures = {'spatial_coherence': spatial_coherence(km_clt, len(features))[0], 'distortion': distance} km_clt.update_measures(measures) return km_clt
def vectorQuantization (features, bits, debug=False): from scipy.cluster.vq import vq D = len(features[0]) np_features = np.array(features) nom_features = np.empty(np_features.shape, dtype=str) for i in range(D): column = np_features[:,i] max_val = np.max(column) min_val = np.min(column) bits = bits denom = bits step = (max_val - min_val)/denom partition = [0]*(denom+1) codebook = [0]*(denom+1) for j in range(denom+1): partition[j] = (min_val+(step*j)) codebook[j] = j column = np.array(column) partition = np.array(partition) if debug: print('****') print(column) print(partition) tmp = vq(column,partition) nom_col = [str(int(x)+1) for x in tmp[0]] if debug: print tmp[0] print nom_col print '****' nom_features[:,i] = nom_col return nom_features
def bow(images,codebook,clusters): out = images temp = [] print "-"*60 print "Creating the pseudo database." for im in images: c = Counter() bag,dist = vq(whiten(im[1]),codebook) for word in bag: c[word]+=1 #Creating histograms for i in range(clusters): if i in c.iterkeys(): c[i] = c[i]/sum(c.values()) if i not in c.iterkeys(): c[i] = 0 temp.append(c) for i in range(len(temp)): out[i].append(temp[i]) print "Done.\n" return out
def classify_kmeans(infile, clusternumber): ''' apply kmeans ''' #Load infile in data array driver = gdal.GetDriverByName('GTiff') driver.Register() ds = gdal.Open(infile, gdal.GA_Update) databand = ds.GetRasterBand(1) #Read input raster into array data = ds.ReadAsArray() #replace no data value with numpy.nan #data[data==-999.0]=numpy.nan pixel = numpy.reshape(data,(data.shape[0]*data.shape[1])) centroids, variance = kmeans(pixel, clusternumber) code, distance = vq(pixel,centroids) centers_idx = numpy.reshape(code,(data.shape[0],data.shape[1])) clustered = centroids[centers_idx] # Write outraster to file databand.WriteArray(clustered) databand.FlushCache() #Close file databand = None clustered = None ds = None
def select(file, output, clusters=None): """ Select clusters containing real motifs and discard the rest Parameters ---------- file : An hdf5 file containing clustered motif matches as generated by birdwerdz.hdf.classify output : Name of output file which will contain only motifs from selected clusters. If same as input file, will delete motifs from the file clusters : Clusters to select """ if file == output: mode = 'r+' else: mode = 'w-' with h5py.File(output, mode) as out: if file != output: with h5py.File(file, 'r+') as src: for entry in src.values(): out['/'].copy(entry,entry.name) for entry in out.values(): if not isinstance(entry,h5py.Group) or 'motifs' not in entry.keys(): continue amp_vecs= entry['motifs']['spectrogram'].sum(1) cluster_path = 'cluster_mean_spectrograms' id,_ = vq(amp_vecs, out[cluster_path][:].sum(1)) new_motifs=np.delete(entry['motifs'], np.where( [i not in clusters for i in id])[0]) del entry['motifs'] entry.create_dataset('motifs',data=new_motifs)
def performMCCAlgorithm(dataSet, specificDataPointIndex, numIterations = 200, numClusters = 4, subDataRatio = 0.5): periodsAhead = np.array([1, 2, 3, 4, 5, 6, 9, 12, 18, 24, 36, 60, 120]) strippedDataSet = dataSet dataLength = strippedDataSet.shape[0] dataWidth = strippedDataSet.shape[1] specificDataPoint = strippedDataSet[specificDataPointIndex,:] numPeriods = len(periodsAhead) statisticWeightsbyIteration = np.empty(shape=(numIterations, 4),dtype=float) # Perform Bootstrapped Clustering for i in range(0,numIterations): # Perform Bootstrapped Clustering / Chooose Data Subset subDataSetIndexes = np.random.choice(range(0,dataLength),size=dataLength*subDataRatio,replace=True) subDataSet = strippedDataSet[subDataSetIndexes,:] # Perform Bootstrapped Clustering / Find Data Clusters for Subset of Data kMeansClusters = spc.kmeans(subDataSet, numClusters) clusterCenters = kMeansClusters[0] # Perform Bootstrapped Clustering / Record Clustering Cost for Weighting Scheme clusteringCost = kMeansClusters[1] statisticWeightsbyIteration[i,0] = clusteringCost # Perform Bootstrapped Clustering / Apply Found Data Clusters to All Data allClusters = spc.vq(strippedDataSet, clusterCenters) clusterAssignments = allClusters[0] clusterDistortions = allClusters[1] display = 1 #TEST if display: #TEST plt.scatter(dataSet[0:60,0],dataSet[0:60,1],c=clusterAssignments[0:60]) #TEST plt.show() statisticWeightsbyIteration[i,1] = max(clusterDistortions) statisticWeightsbyIteration[i,2] = np.mean(clusterDistortions) statisticWeightsbyIteration[i,3] = np.std(clusterDistortions) return statisticWeightsbyIteration
def scipy_labels(data, clusters, nReps):
    # run scipy.cluster.vq.kmeans on data using an initial clusters
    # number of iterations is one less than used for mpi, since the
    # starting clusters are the result after one mpi iteration
    codebook, dist = kmeans2(data, clusters, nReps, 1e-6)
    labels, dist = vq(data, codebook)
    return labels, codebook
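# Hypothetical usage sketch: seed kmeans2 with an explicit set of starting
# centroids (here just three rows of the data) and label the observations
# against the refined codebook. Data and names are illustrative assumptions.
import numpy as np
data = np.random.rand(500, 4)
start = data[np.random.choice(len(data), 3, replace=False)]
labels, codebook = scipy_labels(data, start, nReps=10)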
def connected_regions(image): """ Converts image into grayscale, quantizes, counts connected regions """ # render_image(image) colors = 2 # Quantization into two colors image_rgb = np.dstack(image) pixels = np.reshape( image_rgb, (image_rgb.shape[0] * image_rgb.shape[1], image_rgb.shape[2]) ) centroids, _ = vq.kmeans(pixels, colors) quantized, _ = vq.vq(pixels, centroids) quantized_idx = quantized.reshape( (image_rgb.shape[0], image_rgb.shape[1]) ) if len(centroids) > 1: # for_render = (quantized_idx * 255).astype(np.uint8) # render_image(for_render) regions = len(region_sizes(quantized_idx)) regions_inverted = len(region_sizes(1 - quantized_idx)) # import pdb; pdb.set_trace() # if regions == 0: # regions = image[0].shape[0] * image[0].shape[1] # print regions return max([regions, regions_inverted]) else: return 0
def project(self, descriptors):
    imhist = np.zeros((self.nbr_word))
    words, distance = cluster.vq(descriptors, self.voc)
    for i in words:
        imhist[i] += 1
    return imhist
def reduce_colors(image, k): '''Apply kmeans algorithm. Input: image, number of clusters to use Returns: colors, counts per color, new image ''' if k > 32: print "Setting colors to maximum allowed of 32" k = 32 rows, cols, rgb = image.shape # reshape the image in a single row array of RGB pixels image_row = np.reshape(image,(rows * cols, 3)) #HERE ADD CODE TO GET A GOOD GUESS OF COLORS AND PASS THAT AS #SECOND ARGUMENT TO kmeans #image_array_sample = shuffle(image_row, random_state=0)[:1000] #kguess = kmeans(image_array_sample, k) #colors,_ = kmeans(image_row, kguess) # perform the clustering colors,_ = kmeans(image_row, k) # vector quantization, assign to each pixel the index of the nearest centroid (i=1..k) qnt,_ = vq(image_row,colors) # reshape the qnt vector to the original image shape image_centers_id = np.reshape(qnt,(rows, cols)) # assign the color value to each pixel newimage = colors[image_centers_id] #count number of pixels of each cluster color counts,bins = sp.histogram(qnt, len(colors)) return colors, counts, newimage
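# A sketch of the "good guess" seeding step that the commented-out lines in
# reduce_colors() above ask for, assuming it is placed where image_row and k
# are in scope (names and sample size are assumptions): cluster a random
# subsample of pixels first, then pass those centroids to kmeans as the
# initial guess for the full image.
sample_idx = np.random.choice(len(image_row), size=min(1000, len(image_row)), replace=False)
kguess, _ = kmeans(image_row[sample_idx].astype(float), k)
colors, _ = kmeans(image_row.astype(float), kguess)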
def main(): args = get_args() # This catches files sent in with stdin if isinstance(args.infile, TextIOWrapper): data = JSONFile(args.infile, True) else: data = args.infile points = np.array([ [point.get('lon'), point.get('lat')] for point in data ]) # In testing, found that a higher number of iterations led to less # errors due to missing centroids (Note: whitening led to worse results) centroids, distortion = kmeans(points, args.number_of_vans, 2000) index, distortion = vq(points, centroids) vans = [[] for _ in range(args.number_of_vans)] for i, point in enumerate(data): vans[index[i]].append(point) vans = distribute(vans, len(data), centroids) create_output(args.outfile, vans)
def clustering2(img,clusters): "another clustering method - no major differences" #Reshaping image in list of pixels to allow kmean Algorithm #From 1792x1792x3 to 1792^2x3 pixels = np.reshape(img,(img.shape[0]*img.shape[1],3)) centroids,_ = kmeans2(pixels,3,iter=3,minit= 'random') #print ("Centroids : ",centroids.dtype,centroids.shape,type(centroids)) #print centroids # quantization #Assigns a code from a code book to each observation #code : A length N array holding the code book index for each observation. #dist : The distortion (distance) between the observation and its nearest code. code,_ = vq(pixels,centroids) #print ("Code : ",code.dtype,code.shape,type(code)) #print code # reshaping the result of the quantization reshaped = np.reshape(code,(img.shape[0],img.shape[1])) #print ("reshaped : ",reshaped.dtype,reshaped.shape,type(reshaped)) clustered = centroids[reshaped] #print ("clustered : ",clustered.dtype,clustered.shape,type(clustered)) #scatter3D(centroids) return clustered
def main(): gdal.AllRegister() infile = auxil.select_infile() if infile: inDataset = gdal.Open(infile,GA_ReadOnly) cols = inDataset.RasterXSize rows = inDataset.RasterYSize bands = inDataset.RasterCount else: return pos = auxil.select_pos(bands) bands = len(pos) x0,y0,rows,cols=auxil.select_dims([0,0,rows,cols]) K = auxil.select_integer(6,msg='Number clusters') G = zeros((rows*cols,len(pos))) k = 0 for b in pos: band = inDataset.GetRasterBand(b) G[:,k] = band.ReadAsArray(x0,y0,cols,rows)\ .astype(float).ravel() k += 1 centers, _ = kmeans(G,K) labels, _ = vq(G,centers) outfile,fmt = auxil.select_outfilefmt() if outfile: driver = gdal.GetDriverByName(fmt) outDataset = driver.Create(outfile, cols,rows,1,GDT_Byte) outBand = outDataset.GetRasterBand(1) outBand.WriteArray(reshape(labels,(rows,cols))\ ,0,0) outBand.FlushCache() outDataset = None inDataset = None
def cluster_svm(x_data, y_data, kmean, xlab, ylab, show_graph): x = vstack(x_data) y = vstack(y_data) #print data #dat = np.insert(x, 1, y, axis=1) dat = hstack((x,y)) #print dat #data = vstack((t,c, d)) # computing K-Means with K = 2 (2 clusters) centroids,_ = kmeans2(dat,kmean, iter=20) # assign each sample to a cluster idx,_ = vq(dat,centroids) # some plotting using numpy's logical indexing if show_graph: plt.figure() plt.plot(dat[idx==0,0],dat[idx==0,1],'ob', dat[idx==1,0],dat[idx==1,1],'or', dat[idx==2,0],dat[idx==2,1],'ok',) plt.plot(centroids[:,0],centroids[:,1],'sg',markersize=8) plt.xlabel(xlab) plt.ylabel(ylab) plt.show() return centroids
def clustering(img,clusters): "use the kmean algo to cluster img colors" #Reshaping image in list of pixels to allow kmean Algorithm #From 1792x1792x3 to (1792^2)x3 pixels = np.reshape(img,(img.shape[0]*img.shape[1],3)) #clustering is done on hue value of a pixel color #performing the clustering centroids,_ = kmeans(pixels,clusters,iter=3) # quantization #Assigns a code from a code book to each observation #code : A length N array holding the code book index for each observation. code,_ = vq(pixels,centroids) #print ("Code : ",code.dtype,code.shape,type(code)) # reshaping the result of the quantization reshaped = np.reshape(code,(img.shape[0],img.shape[1])) #print ("reshaped : ",reshaped.dtype,reshaped.shape,type(reshaped)) #print reshaped #print nbrDiff(reshaped) clustered = centroids[reshaped] #print ("Centroids : ",centroids.dtype,centroids.shape,type(centroids)) #print ("Clustered : ",clustered.dtype,clustered.shape,type(clustered)) #print ("nbrDiff de Clustered 0 = " , nbrDiff(clustered[:,:,0])) #print ("nbrDiff de Clustered 1 = " ,nbrDiff(clustered[:,:,1])) #print ("nbrDiff de Clustered 2 = " ,nbrDiff(clustered[:,:,2])) #print nbrDiff(reshaped) return clustered,reshaped,standardCode
def project(self, descriptors):
    """Project descriptors on the vocabulary to create a histogram of words."""
    imhist = numpy.zeros((self.word_count))
    words, distance = vq.vq(descriptors, self.voc)
    for w in words:
        imhist[w] += 1
    return imhist
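# Minimal stand-alone sketch of the same bag-of-words projection, using
# hypothetical data: build a 10-word vocabulary from training descriptors,
# then histogram a new image's descriptors against it.
import numpy as np
from scipy.cluster import vq
train_descr = np.random.rand(1000, 64)
voc, _ = vq.kmeans(train_descr, 10)
words, _ = vq.vq(np.random.rand(50, 64), voc)
imhist = np.bincount(words, minlength=len(voc))  # same histogram the method builds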
def Kmeans_map(obs, code_book): No = obs.shape[0] nc = code_book.shape[0] # nc is current number of clusters (may decrease if zero clusters last iteration) # # compute membership and distances between obs and code_book obs_code, distort = vq(obs, code_book) distortsum = np.sum(distort) distortmax = np.amax(distort) # # vq returns an indexing array obs_code mapping rows of obs (the points) to code_book (the centroids) # distort is an array of length No that has difference between observation and chosen centroid # vq stands for vector quantization and is provided in SciPy # VectorDimension = obs.shape[1] NewCode_Book = np.zeros([nc, VectorDimension]) NumPointsinClusters = np.zeros([nc]) for i in np.arange(nc): # Loop over clusters labelled with i cell_members = np.compress(np.equal(obs_code, i), obs, 0) NumPointsinClusters[i] = cell_members.shape[0] # Extract Points in this Cluster; extract points whose quantization label is i # NewCode_Book[i] = np.sum(cell_members, 0) # Calculate centroid of i'th cluster return NewCode_Book, NumPointsinClusters, distortsum, distortmax, No
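# A hypothetical reduce step to pair with Kmeans_map above (not part of the
# original code): sum the per-partition centroid accumulators and point
# counts, then divide to obtain the new centroids; empty clusters are dropped.
def Kmeans_reduce(partial_codebooks, partial_counts):
    code_sum = np.sum(partial_codebooks, axis=0)   # [nc, dim] summed centroid accumulators
    counts = np.sum(partial_counts, axis=0)        # [nc] points assigned per cluster
    nonempty = counts > 0
    return code_sum[nonempty] / counts[nonempty, np.newaxis]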
def test_kmeans(): obs = sp.random.uniform(0, 10, (1000, 2)) # knum = 7 obs = scvq.whiten(obs) # run kmeans with diffirent number of clusters for knum in range(2, 8): codebook, dist = scvq.kmeans(obs, knum) ind, dist = scvq.vq(obs, codebook) # visualize # plt.ion() plt.ioff() plt.figure(knum) colors = ["b*", "g+", "ro", "yp", "ms", "ch", "wx"] for icluster in range(knum): x = (ind == icluster).nonzero()[0] plt.plot(obs[x, 0], obs[x, 1], colors[icluster]) for iline in range(sp.size(x)): plt.plot([obs[x[iline], 0], codebook[icluster, 0]], [obs[x[iline], 1], codebook[icluster, 1]], "k--") # the cluster centroid plt.plot(codebook[:, 0], codebook[:, 1], "ko") # the plot size plt.xlim((-0.3, 3.8)) plt.ylim((-0.3, 3.8)) plt.show()
for v_num, v_id in enumerate(v_ids): print "Creating Self-Organized Map for " + vehicle_type + " with " + fuel_type + " consuption (ID " + str(v_num) + " of " + str(len(v_ids)) + ")\r", # Opens ID frame sampling analysis (NOT NORMALIZED) input_file_name = input_file_path_base + vehicle_type + '/' + fuel_type + '/' + v_id df = pd.read_csv(input_file_name) data = df.fillna(0).as_matrix().astype(float) # Starts K-Means analysis best_distortion = None best_code_book = None best_distance = None best_k = None for k_mean in k_means: centroids, distortion = kmeans(data, k_mean) # Uses kmeans to clusterize data from actual map code_book, distance = vq(data, centroids) # Gets codebook of ID's # Saves results if distortion is more than "elbow_rate" percent smaller than the best distortion so far if best_distortion == None or abs(distortion - best_distortion)/best_distortion > elbow_rate: best_distortion = distortion best_code_book = code_book best_distance = distance best_k = k_mean # If distortion is not "elbow_percent" percent smaller than the best distortion so far, quites the analysis else: break df['CODE_BOOK'] = best_code_book # Saves codebook on result on dataframe df['DISTANCE'] = best_distance # Saves distances from centroids on dataframe f.write(vehicle_type + ' ' + fuel_type + ' ' + v_id[:-4] + ' k=' + str(best_k) + '\n')
'''We are going to continue the investigation into the sightings of legendary
Pokémon from the previous exercise. Just like the previous exercise, we will
use the same example of Pokémon sightings. In this exercise, you will form
clusters of the sightings using k-means clustering.

x and y are columns of X and Y coordinates of the locations of sightings,
stored in a Pandas data frame, df. The following are available for use:
matplotlib.pyplot as plt, seaborn as sns, and pandas as pd.'''

import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns

x = [9, 6, 2, 3, 1, 7, 1, 6, 1, 7, 23, 26, 25, 23, 21, 23, 23, 20, 30, 23]
y = [8, 4, 10, 6, 0, 4, 10, 10, 6, 1, 29, 25, 30, 29, 29, 30, 25, 27, 26, 30]
df = pd.DataFrame({'x': x, 'y': y})

# Import kmeans and vq functions
from scipy.cluster.vq import kmeans, vq

# Compute cluster centers
centroids, _ = kmeans(df, 2)

# Assign cluster labels
df['cluster_labels'], _ = vq(df, centroids)

# Plot the points with seaborn
sns.scatterplot(x='x', y='y', hue='cluster_labels', data=df)
plt.show()
# K-means clustering: first exercise

# This exercise will familiarize you with the usage of k-means clustering on a
# dataset. Let us use the Comic Con dataset and check how k-means clustering
# works on it.
# Recall the two steps of k-means clustering:
#   Define cluster centers through the kmeans() function. It has two required
#   arguments: observations and number of clusters.
#   Assign cluster labels through the vq() function. It has two required
#   arguments: observations and cluster centers.
# The data is stored in a Pandas data frame, comic_con. x_scaled and y_scaled
# are the column names of the standardized X and Y coordinates of people at a
# given point in time.

# Instructions
# Import kmeans and vq functions in SciPy.
# Generate cluster centers using the kmeans() function with two clusters.
# Create cluster labels using these cluster centers.

# Import the kmeans and vq functions
from scipy.cluster.vq import kmeans, vq

# Generate cluster centers
cluster_centers, distortion = kmeans(comic_con[['x_scaled', 'y_scaled']], 2)

# Assign cluster labels
comic_con['cluster_labels'], distortion_list = vq(comic_con[['x_scaled', 'y_scaled']], cluster_centers)

# Plot clusters
sns.scatterplot(x='x_scaled', y='y_scaled', hue='cluster_labels', data = comic_con)
plt.show()
# NumPy arrays are indexed from 0: column 0 (the record ID) and column 10 (the label)
# are excluded, so we keep columns 1 to 9
points = dataset[:,1:9]
cancer_label = dataset[:,10]
print "points:\n",points
print "cancer_label:\n",cancer_label

# k-means clustering
# whiten (normalize) the raw data
data=whiten(points)
# Cluster with the kmeans function: the first argument is the data, the second the number of clusters k.
# Sometimes we do not know in advance how many clusters there should be; one option is to initialize
# from a hierarchical-clustering result, or simply pass a fixed number.
# kmeans returns two things: the cluster centroids and the distortion (loss). We only keep the
# centroids here, hence the trailing [0].
#centroid = kmeans(data,max(cluster))[0]
centroid = kmeans(data,2)[0]
print centroid
# Use vq to assign every sample to a centroid; vq also returns a pair, and [0] is the label of each sample.
label=vq(data,centroid)[0]
num = [0,0]
for i in label:
    if(i == 0):
        num[0] = num[0] + 1
    else:
        num[1] = num[1] + 1
print 'num =',num
#np.savetxt('file.csv',label)
print "Final clustering by k-means:\n",label
result = np.subtract(label,cancer_label)
print "result:\n",result
count = [0,0]
for i in result:
    if(i == 0):
from collections import defaultdict from similar_words import load_vectors import argparse if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument('--basename', help='base name of word vector files', type=str) parser.add_argument('--maxwords', help='maximum number of words to cluster', type=int) parser.add_argument('--k', help='number of clusters', type=int) args = parser.parse_args() vectors, words = load_vectors(args.basename, args.maxwords) centroids, _ = kmeans(vectors, args.k) idx, _ = vq(vectors, centroids) clusters = defaultdict(set) for i, c in enumerate(idx): clusters[c].add(words[i]) for c in range(args.k): print 'CLUSTER', c + 1, for word in clusters[c]: print word, print print
#print 'x ' + str(x) + ' y ' + str(y) + ' w ' + str(w) + ' h ' + str(h) #if horiz_aspect_ratio > 2 or vert_aspect_ratio > 2: #cv2.rectangle(img,(x,y),(x+w,y+h),(0,0,255),2) #draw in all contours to see how they fall #contour_sizes.append([float(x)*4,float(y)*4,max(float(w),float(h))])#,horiz_aspect_ratio,vert_aspect_ratio]) contour_sizes.append([cx*8.0,cy*8.0,max(float(w),float(h))/8.0])#,horiz_aspect_ratio,vert_aspect_ratio]) contour_lookup.append(c) #contour_sizes.append([float(x),float(w),float(h)])#,horiz_aspect_ratio]) #cv2.drawContours(img,[c],0,(0,255,0),1) whitened_contour_sizes = clustering.whiten(contour_sizes) #print str(contour_sizes) # let scipy do its magic (k==3 groups) centers,dist = clustering.kmeans(whitened_contour_sizes,75,iter=100) code, distance = clustering.vq(whitened_contour_sizes,centers) #print str(centroid) #print str(code) #print 'contours is ' + str(len(contour_sizes)) + ' and code is ' + str(len(code)) colors = [( int(random.uniform(0, 255)),int(random.uniform(0, 255)),int(random.uniform(0, 255))) for i in code ] #print str(colors) for i, label in enumerate(code): color = colors[label] x,y,w,h = cv2.boundingRect(contour_lookup[i]) #box = contour_sizes[i] #x=int(box[0]) #y=int(box[1]) #w=int(box[2]) #h=int(box[3])
def compress(self, node, idx, bits=None, min_qsnr=None, sparse=False): val = self.get(node, idx) flattened_val = val.flatten() if bits is not None: bins = int(math.pow(2, bits)) if bins > val.size: raise Exception( 'More bins than values with {} bits'.format(bits)) kmeans = KMeans(n_clusters=bins) kmeans.fit(flattened_val.reshape((-1, 1))) codebook = kmeans.cluster_centers_ codebook = codebook.astype(val.dtype) codes = vq(flattened_val.reshape((-1, 1)), codebook) compressed_val = np.array([codebook[code] for code in codes[0] ]).reshape(val.shape) elif min_qsnr: cur_qsnr = -math.inf bits = 1 while cur_qsnr < min_qsnr: bits += 1 if bits > 7: raise Exception( 'Cannot find a solution with less than 8 bits \ for {} with min_qsnr = {}'.format( node.name, min_qsnr)) bins = int(math.pow(2, bits)) if bins > val.size: break kmeans = KMeans(n_clusters=bins) kmeans.fit(flattened_val.reshape((-1, 1))) codebook = kmeans.cluster_centers_ codebook = codebook.astype(val.dtype) codes = vq(flattened_val.reshape((-1, 1)), codebook) compressed_val = np.array( [codebook[code] for code in codes[0]]).reshape(val.shape) cur_qsnr = qsnr(compressed_val.astype(np.float32), val.astype(np.float32)) else: # automatic search of optimal k with inertia method silhouette = [] inertia = [] for bits in range(1, 8): bins = int(math.pow(2, bits)) if bins > val.size: break kmeans = KMeans(n_clusters=bins) kmeans.fit(flattened_val.reshape((-1, 1))) codebook = kmeans.cluster_centers_ codebook = codebook.astype(val.dtype) codes = vq(flattened_val.reshape((-1, 1)), codebook) compressed_val = np.array( [codebook[code] for code in codes[0]]).reshape(val.shape) inertia.append(kmeans.inertia_) silhouette.append( silhouette_score(flattened_val.reshape(-1, 1), compressed_val.flatten().reshape(-1, 1))) elb_idx = np.argmax(np.diff(np.diff( np.array(inertia)))) # 2nd grade derivative to find the elbow elb_idx = 1 if elb_idx == 0 else elb_idx bits = np.argmax( np.array(silhouette[elb_idx - 1:elb_idx + 1]) ) + 1 # take the three around the elbow and look at the silhouette bins = int(math.pow(2, bits)) kmeans = KMeans(n_clusters=bins) kmeans.fit(flattened_val.reshape((-1, 1))) codebook = kmeans.cluster_centers_ codebook = codebook.astype(val.dtype) codes = vq(flattened_val.reshape((-1, 1)), codebook) compressed_val = np.array([codebook[code] for code in codes[0] ]).reshape(val.shape) if sparse: freqs = np.unique(codes, return_counts=True) max_index = np.where(freqs[1] == freqs[1].max())[0][0] sparse_val = freqs[0][max_index] else: sparse_val = None self.set(node, idx, val, compressed_val, sparse_val) x = 1
# -*- coding: utf-8 -*- """ Created on Thu Mar 14 02:02:38 2019 @author: js """ import numpy as np from scipy.cluster.vq import vq, kmeans, whiten list1 = [88.0, 74.0, 96.0, 85.0] list2 = [92.0, 99.0, 95.0, 94.0] list3 = [91.0, 87.0, 99.0, 95.0] list4 = [78.0, 99.0, 97.0, 81.0] list5 = [88.0, 78.0, 98.0, 84.0] list6 = [100.0, 95.0, 100.0, 92.0] data = np.array([list1,list2,list3,list4,list5,list6]) whiten = whiten(data) centroids,_ = kmeans(whiten, 2) result,_= vq(whiten, centroids) print(result)
# results of this approach)
# feat=whiten(feat)

# kmeans2 was giving poor results here since it only runs once.
# centroids, labels=kmeans2(feat,K, minit='points', iter=10)

# With scipy's kmeans we get the centroids and the variance; note that the
# iter parameter is not the number of iterations but the number of times to
# run kmeans.
centroids, variance = kmeans(feat, K, iter=100)
log("Centroids and Variance Calculated... proceeding to calculate Labels and Distance Matrix")

# For this it is necessary to compute the labels from the distances between
# the features and the centroids.
labels, distance = vq(feat, centroids)
log("Labels length: " + str(len(labels)))

outputCluster = open("cluster-" + Xfile, "w")
for k in range(len(labels)):
    outputCluster.write(str(k + 1) + " " + str(labels[k] + 1) + "\n")
outputCluster.close()
log("Clusters Exported to file clusters-...")

log("Starting Display")
# From the centroids we can compute the distance of each document to each
# centroid, with a corresponding color.
# What if we plotted the points horizontally, with X being the publication
# date and Y the cluster?
def countpairs(src, lns, rnd, radii, rndtype='lens', srcweights=None, rndweights=None, numthreads=1): """ Create annuli dictionary and run chunkcount for every inner/outer radius pair """ srcpos = zip(src.data['RA'], src.data['DEC']) rndpos = zip(rnd.data['RA'], rnd.data['DEC']) if jackknife == True: jkresults = {} #separate d (src) and r (rnd) into SAME kmeans region, by position n_jk = njackknife centers, _ = kmeans(srcpos, n_jk) src_k_indices, _ = vq(srcpos, centers) rnd_k_indices, _ = vq(rndpos, centers) #count DD and DR for each radius for each sample for k in range(n_jk): #radii in increasing order annuli = {} #sources first src_k_mask = (src_k_indices != k) this_dpos = np.array(srcpos)[src_k_mask] this_srcweights = np.array(srcweights)[src_k_mask] annuli = loopradius(annuli, 'Psrcsum', 'srcpairs', radii, lns, this_dpos, this_srcweights, numthreads) del this_dpos #again with randoms rnd_k_mask = (rnd_k_indices != k) this_rpos = np.array(rndpos)[rnd_k_mask] this_rndweights = np.array(rndweights)[rnd_k_mask] annuli = loopradius(annuli, 'Prndsum', 'rndpairs', radii, lns, this_rpos, this_rndweights, numthreads) jkresults[k] = {} jkresults[k]['Psrcsum'] = [annuli[rad]['Psrcsum'] for rad in radii] jkresults[k]['Prndsum'] = [annuli[rad]['Prndsum'] for rad in radii] #get w for this sample tot_src = np.sum([annuli[m]['Psrcsum'] for m in annuli.keys()]) tot_rnd = np.sum([annuli[m]['Prndsum'] for m in annuli.keys()]) jkresults[k]['w'] = w([annuli[rad]['Psrcsum'] for rad in radii], [annuli[rad]['Psrcsum'] for rad in radii], tot_src, tot_rnd) #get jacknife estimate from average of jackknife regions jkresults['w'] = jk([jkresults[k]['Psrcsum'] for k in range(n_jk)], [jkresults[k]['Prndsum'] for k in range(n_jk)], w) #get jackknife variance from the results of jackknife regions jkresults['var'] = varjk( [jkresults[k]['Psrcsum'] for k in range(n_jk)], [jkresults[k]['Prndsum'] for k in range(n_jk)], w) #answer with full sample annuli = {} #sources first srcpos = zip(src.data['RA'], src.data['DEC']) annuli = loopradius(annuli, 'Psrcsum', 'srcpairs', radii, lns, srcpos, srcweights, numthreads) #save space del src, srcpos, srcweights #again with randoms rndpos = zip(rnd.data['RA'], rnd.data['DEC']) annuli = loopradius(annuli, 'Prndsum', 'rndpairs', radii, lns, rndpos, rndweights, numthreads) del rnd, rndpos, rndweights if jackknife == True: return annuli, jkresults else: return annuli
trainImages[i].append(filename) im = cv2.imread(os.path.join(trainImageDir, filename)) imgray = cv2.cvtColor(im, cv2.COLOR_BGR2GRAY) kp, des = featureExtractor.get_keypoints_and_descriptors(imgray) desList.append(des) descriptors = desList[0] for des in desList: descriptors = np.vstack((descriptors, des)) if descriptors.dtype != "float32": descriptors = np.float32(descriptors) voc, variance = kmeans(descriptors, numWords, 30) sumNum = [] for i in range(len(gestureIDlist)): if i == 0: sumNum.append(0) else: val = sumNum[i - 1] + len(trainImages[i - 1]) sumNum.append(val) trainData = np.zeros((sumNum[-1] + len(trainImages[-1]), numWords), "float32") trainLabels = np.zeros((sumNum[-1] + len(trainImages[-1])), "uint32") for gestureID in range(len(gestureIDlist)): for numFrame in range(len(trainImages[gestureID])): words, distance = vq(desList[sumNum[gestureID] + numFrame], voc) for w in words: trainData[sumNum[gestureID] + numFrame][w] += 1 trainLabels[sumNum[gestureID] + numFrame] = gestureID
def kmeansFits(parser): (options,args)= parser.parse_args() if len(args) == 0: parser.print_help() return if options.outfilename is None: print "-o filename options needs to be set ..." print "Returning ..." return None numpy.random.seed(seed=options.seed) #Restore fits savefilename= args[0] if os.path.exists(savefilename): savefile= open(savefilename,'rb') params= pickle.load(savefile) type= pickle.load(savefile) band= pickle.load(savefile) savefile.close() else: print "Input file does not exist ..." print "Returning ..." return #Prepare params for K-means print "Preparing data ..." if type == 'powerlawSF': if len(band) > 1: nparams= 4 else: nparams= 2 elif type == 'DRW': if len(band) == 1: nparams= 2 else: print "DRW for multi-band fits not implemented yet ..." print "Returning ..." return elif type == 'KS11': nparams= 3 elif type == 'scatter': nparams= 1 ndata= len(params) kIn= numpy.zeros((ndata,nparams)) if type == 'powerlawSF': #Stack as A,g,Ac,gc kIn[:,0]= numpy.array([p['logA'] for p in params.values()]).reshape(ndata) kIn[:,1]= numpy.array([p['gamma'] for p in params.values()]).reshape(ndata) if len(band) > 1: kIn[:,2]= numpy.array([p['logAgr'] for p in params.values()]).reshape(ndata) kIn[:,3]= numpy.array([p['gammagr'] for p in params.values()]).reshape(ndata) elif type == 'DRW': print "type == 'DRW' not implemented yet ..." print "Returning ..." return elif type == 'KS11': #Stack as A,g,s kIn[:,0]= numpy.array([p['logA'] for p in params.values()]).reshape(ndata) kIn[:,1]= numpy.array([p['gamma'] for p in params.values()]).reshape(ndata) kIn[:,2]= numpy.array([p['s'] for p in params.values()]).reshape(ndata) #Whiten, i.e., give unit variance print "Whitening data ..." whitenFactors= numpy.zeros(nparams) for ii in range(nparams): whitenFactors[ii]= numpy.std(kIn[:,ii]) kIn[:,ii]/= whitenFactors[ii] #Ready to run K-means print "Running K-means ..." book, dist= kmeans(kIn,options.k) assign, dist= vq(kIn,book) #De-whiten the codebook for ii in range(nparams): book[:,ii]*= whitenFactors[ii] #Prepare for saving print "Preparing output for saving ..." outparams= [] weights= [] for kk in range(options.k): if type == 'powerlawSF': if len(band) > 1: outparams.append({'logA':book[kk,0], 'gamma':book[kk,1], 'logAgr':book[kk,2], 'gammagr':book[kk,3]}) else: outparams.append({'logA':book[kk,0], 'gamma':book[kk,1]}) elif type == 'DRW': print "DRW not implemented yet ..." print "Returning ..." return if type == 'KS11': outparams.append({'logA':book[kk,0], 'gamma':book[kk,1], 's':book[kk,2]}) thisassign= assign[(assign == kk)] weights.append(len(thisassign)) #Save print "Saving ..." if os.path.exists(options.outfilename): print options.outfilename+" exists ..." print "*Not* overwriting ..." print "Remove file before running ..." 
return if options.savefits: import pyfits cols= [] if type == 'powerlawSF': colA= [] colg= [] for kk in range(options.k): colA.append(outparams[kk]['logA']) colg.append(outparams[kk]['gamma']) colA= numpy.array(colA) colg= numpy.array(colg) colw= numpy.log(numpy.array(weights)) cols.append(pyfits.Column(name='logA',format='E', array=colA)) cols.append(pyfits.Column(name='gamma',format='E', array=colg)) elif type == 'KS11': colA= [] colg= [] cols= [] for kk in range(options.k): colA.append(outparams[kk]['logA']) colg.append(outparams[kk]['gamma']) colg.append(outparams[kk]['s']) colA= numpy.array(colA) colg= numpy.array(colg) cols= numpy.array(colg) cols.append(pyfits.Column(name='logA',format='E', array=colA)) cols.append(pyfits.Column(name='gamma',format='E', array=colg)) cols.append(pyfits.Column(name='s',format='E', array=cols)) colw= numpy.log(numpy.array(weights)) cols.append(pyfits.Column(name='logweight',format='E', array=colw)) columns= pyfits.ColDefs(cols) tbhdu= pyfits.new_table(columns) tbhdu.writeto(options.outfilename) else: outfile= open(options.outfilename,'wb') pickle.dump(outparams,outfile) pickle.dump(weights,outfile) outfile.close() return
def recolour(image, pal):
    palette_array = np.array(pal, dtype=np.uint8)
    im_array = np.reshape(np.array(image), (image.size[0] * image.size[1], 3))
    quant, _ = vq(im_array, palette_array)
    idx = np.reshape(quant, (image.size[1], image.size[0]))
    return Image.fromarray(palette_array[idx])
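# Example usage (hypothetical file name and palette): snap every pixel of an
# RGB image to its nearest palette colour and save the result.
from PIL import Image
pal = [(0, 0, 0), (255, 255, 255), (255, 0, 0)]
out = recolour(Image.open('photo.png').convert('RGB'), pal)
out.save('photo_recoloured.png')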
def fit(self, X, sample_weight=[]): """Fit a mixture of Gaussian model on X Parameters: ----------- X : ndarray [n_samples, n_features] Each row is a sample. Returns: -------- self : object. return the object itself. """ # # Use uniform weights over samples if sample weights are not specified. # self.n_samples = X.shape[0] self.n_features = X.shape[1] if sample_weight == []: self.sample_weight = np.ones( (self.n_samples, 1), dtype=np.float64) / self.n_samples elif len(sample_weight.shape) == 1: self.sample_weight = np.reshape(sample_weight, (self.n_samples, 1)) self.sample_weight.astype(np.float64) # # Run KMeans to find initial centroids and radiuses # #whitened_samples = whiten(X) whitened_samples = X centroids, distortion = kmeans(whitened_samples, self.n_components) indexes, distortion = vq(whitened_samples, centroids) if self.n_components != centroids.shape[0]: print 'KMean wasn\'t able to find as many clusters as you wanted' self.n_components = centroids.shape[0] coef = np.zeros((self.n_components, ), np.float64) # # Update mixing coefficients, mean and covariance # cim = np.zeros((self.n_samples, ), np.float64) for i in range(self.n_samples): cim[i] = np.argmin(np.sum(np.power(X[i, :] - centroids, 2), axis=1)) mus = [] sigmas = [] for k in range(self.n_components): Xk = X[np.where(cim == k)[0], :] coef[k] = Xk.shape[0] / float(self.n_samples) Xkb = Xk - centroids[k, :] sigmak = np.dot(np.transpose(Xkb), Xkb) / float(self.n_samples - 1) if self.cov_type == 'diag': sigmak = np.diag(np.diag(sigmak)) # # Check the component k is degenerate # # Quick check if np.mean(np.diag(sigmak)) >= self.dege_tole: mus.append(centroids[k, :]) var = np.diag(sigmak) mvar = np.sum(var) / len(np.where(var > 0)[0]) sigmak = sigmak * (1-self.sigma_corr_factor) + \ self.sigma_corr_factor*mvar*np.eye(sigmak.shape[0], dtype=np.float64) #sigmak += self.sigma_epsilon*np.eye(sigmak.shape[0], dtype=np.float64) sigmas.append(sigmak) self.components = self._pack_gaussian(mus, sigmas) self.n_components = len(mus) self.coef = coef self.staged_loss.append(self._loss(X)) # # Main loop: Expectation Maximization # resp_mat = np.zeros((self.n_samples, self.n_components), np.float64) for iter in range(self.max_iters): # # E step. Compute responsibility. # for k in range(self.n_components): resp_mat[:, k] = self.components[k].pdf(X) * self.coef[k] resp_mat /= np.reshape(np.sum(resp_mat, axis=1), (self.n_samples, 1)) # # M step. Re-estimate parameters # mus = [] sigmas = [] NK = np.sum(resp_mat, axis=0) self.coef = NK / float(self.n_samples) for k in range(self.n_components): r = np.reshape( resp_mat[:, k], (self.n_samples, 1)) / NK[k] * self.sample_weight r /= np.sum(r) muk = np.sum(r * X, axis=0) sigmak = 0.0 for n in range(self.n_samples): xkb = np.asmatrix(X[n, :] - muk) sigmak += r[n, 0] * np.transpose(xkb) * xkb # # Check if the component k is degenerate # # Quick check if np.mean(np.diag(sigmak)) < self.dege_tole: self._del_component(k) k -= 1 continue # # Correct sigmak # var = np.diag(sigmak) mvar = np.sum(var) / len(np.where(var > 0)[0]) sigmak = sigmak * (1-self.sigma_corr_factor) + \ self.sigma_corr_factor*mvar*np.eye(sigmak.shape[0], dtype=np.float64) if self.cov_type == 'diag': sigmak = np.diag(np.diag(sigmak)) mus.append(muk) sigmas.append(sigmak) self.components = self._pack_gaussian(mus, sigmas) self.staged_loss.append(self._loss(X)) if self.verbose: print "Iteration: ", iter, " Negative-Log-likelihood: ", self.staged_loss[ -1] # # Early stopping if the improvement is tiny. 
# if np.abs((self.staged_loss[-1] - self.staged_loss[-2])) < self.tolerance: break self.components_ = self.components self.coef_ = self.coef self.n_components_ = self.n_components # # Remove very small components # #self._prune() return self
def main(): try: dataset = sys.argv[1] except: print dataSets print "enter one of these datasets: " return 0 if ((dataset in dataSets) == False): print 'dataset name is invalid:' return 0 try: nCodewords = int(sys.argv[2]) except: nCodewords = 1000 print 'codewords: %d ' % (nCodewords) try: nSamples = int(sys.argv[3]) except: nSamples = 500000 print 'nSamples: %d ' % (nSamples) # dataPath = rootPath + dataset + dataDir catlist = os.listdir(dataPath) nCategories = len(catlist) nSamplesPerCat = int(np.round(nSamples / nCategories)) count = 0 for cat in catlist: catfilePath = dataPath + cat catname = cat.split('.')[0] catData = np.genfromtxt(catfilePath, dtype='float', usecols=np.arange(2, 15)) if (catData.shape[0] <= nSamplesPerCat): catSample = catData else: rndsample = np.random.randint(0, catData.shape[0], nSamplesPerCat) catSample = catData[rndsample, :] if (count == 0): cumData = catSample else: cumData = np.concatenate((cumData, catSample), axis=0) count += 1 # compute the codebook for the dataset [CodeBook, label] = kmeans2(cumData, nCodewords, iter=nIterKmeans, minit='points', missing='warn') #@UnusedVariable # write codebook to file cbfilepath = rootPath + dataset + cbDir + dataset + str( nCodewords) + codebookext cbfile = open(cbfilepath, 'w') np.savetxt( cbfile, CodeBook, fmt='%f', delimiter=' ', ) cbfile.close() # compute the bag-of-features histogram for each image for cat in catlist: catfilePath = dataPath + cat catname = cat.split('.')[0] catData = np.genfromtxt(catfilePath, dtype='float', usecols=np.arange(2, 15)) [catLabel, catDist] = vq(catData, CodeBook) #@UnusedVariable catImgId = np.genfromtxt(catfilePath, dtype=np.int, usecols=np.arange(15, 16)) catId = np.genfromtxt(catfilePath, dtype=np.int, usecols=np.arange(16, 17))[0] ImgId = np.unique(catImgId) catboffilepath = rootPath + dataset + bofDir + catname + str( nCodewords) + bofext catboffile = open(catboffilepath, 'w') imgcount = 0 for imgid in ImgId: imgLabel = catLabel[catImgId == imgid] [hist, edges] = np.histogram(imgLabel, nCodewords) #@UnusedVariable if imgcount == 0: dataout = np.hstack((hist.T, imgid, catId)) else: dataout = np.vstack((dataout, np.hstack( (hist.T, imgid, catId)))) imgcount += 1 print('%s : %s' % (catname, imgid)) np.savetxt( catboffile, dataout, fmt='%d', delimiter=' ', ) catboffile.close() return 0
def testCluster(self): print "< testCluster >" numVertices = 8 graph = SparseGraph(GeneralVertexList(numVertices)) graph.addEdge(0, 1) graph.addEdge(0, 2) graph.addEdge(1, 2) graph.addEdge(3, 4) graph.addEdge(3, 5) graph.addEdge(4, 5) graph.addEdge(0, 3) W = graph.getWeightMatrix() graphIterator = [] graphIterator.append(W[0:6, 0:6].copy()) W[1, 6] += 1 W[6, 1] += 1 graphIterator.append(W[0:7, 0:7].copy()) W[4, 7] += 1 W[7, 4] += 1 graphIterator.append(W.copy()) graphIterator = iter(graphIterator) k = 2 clusterer = NingSpectralClustering(k) clustersList = clusterer.cluster( toSparseGraphListIterator(graphIterator)) #Why are the bottom rows of Q still zero? #Try example in which only edges change numVertices = 7 graph = SparseGraph(GeneralVertexList(numVertices)) graph.addEdge(0, 1) graph.addEdge(0, 2) graph.addEdge(1, 2) graph.addEdge(3, 4) WList = [] W = graph.getWeightMatrix() WList.append(W[0:5, 0:5].copy()) graph.addEdge(3, 5) graph.addEdge(4, 5) W = graph.getWeightMatrix() WList.append(W[0:6, 0:6].copy()) graph.addEdge(0, 6) graph.addEdge(1, 6) graph.addEdge(2, 6) W = graph.getWeightMatrix() WList.append(W[0:7, 0:7].copy()) iterator = iter(WList) clustersList = clusterer.cluster(toSparseGraphListIterator(iterator)) #Seems to work, amazingly #print(clustersList) #Try removing rows/cols W2 = W[0:5, 0:5] W3 = W[0:4, 0:4] WList = [W, W2, W3] iterator = iter(WList) clustersList = clusterer.cluster(toSparseGraphListIterator(iterator)) #nptst.assert_array_equal(clustersList[0][0:5], clustersList[1]) nptst.assert_array_equal(clustersList[1][0:4], clustersList[2]) #Make sure 1st clustering (without updates) is correct L = GraphUtils.normalisedLaplacianRw(scipy.sparse.csr_matrix(W)) numpy.random.seed(21) lmbda, Q = scipy.sparse.linalg.eigs(L, min(k, L.shape[0] - 1), which="SM", ncv=min(20 * k, L.shape[0]), v0=numpy.random.rand(L.shape[0])) V = VqUtils.whiten(Q) centroids, distortion = vq.kmeans(V, k, iter=20) clusters, distortion = vq.vq(V, centroids) #This should be equal but the eigenvector computation is unstable #even with repeated runs (and no way to set the seed) nptst.assert_array_equal(clusters, clustersList[0])
# -*- coding: utf-8 -*-
"""
K-means clustering
@author: David André Rodríguez Méndez (AndreRdz7)
"""
# Import libraries
import numpy as np
from scipy.cluster.vq import vq, kmeans
# Create datasets
data = np.random.random(90).reshape(30, 3)
c1 = np.random.choice(range(len(data)))
c2 = np.random.choice(range(len(data)))
# Getting k
clust_centers = np.vstack([data[c1], data[c2]])
print(clust_centers)
print(vq(data, clust_centers))
# K-means (the stacked rows serve as the initial centroid guess)
kmeans(data, clust_centers)
from scipy.cluster.vq import kmeans, vq, whiten
from numpy import vstack, array
from numpy.random import rand

# data generation with three features
data = vstack((rand(100, 3) + array([.5, .5, .5]), rand(100, 3)))
print(data)

# whitening of data
data = whiten(data)
print(data)

# computing K-Means with K = 3 (3 clusters)
print("-------------computing K-Means with K = 3 (3 clusters)--------------")
centroids, _ = kmeans(data, 3)
print(centroids)

# assign each sample to a cluster
clx, _ = vq(data, centroids)

# check clusters of observation
print(clx)
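# One detail the demo above glosses over: after whiten(), the centroids are in
# whitened units. A minimal sketch (assuming the same kind of random data) of
# recovering centroids in the original units by keeping the per-feature
# standard deviation that whiten() divides by:
raw = vstack((rand(100, 3) + array([.5, .5, .5]), rand(100, 3)))
scale = raw.std(axis=0)                   # the factors applied by whiten()
centroids_w, _ = kmeans(whiten(raw), 3)
centroids_original = centroids_w * scale  # centroids expressed in the original units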
# -*- coding: utf-8 -*-
"""
Created on Mon Jun 12 16:48:24 2017
@author: Xinyu Li
"""
from numpy import vstack
from scipy.cluster.vq import kmeans,vq
from matplotlib.finance import quotes_historical_yahoo_ochl
from datetime import datetime

start = datetime(2014,7,1)
end = datetime(2014,9,30)
listDji = ['AXP','BA','CAT','CSCO','CVX','DD','DIS','GE','GS','HD','IBM',
'INTC','JNJ','JPM','KO','MCD','MMM','MRK','MSFT','NKE','PFE','PG','T','TRV',
'UNH','UTX','V','VZ','WMT','XOM']
quotes = [ [0 for col in range(90)] for row in range(30)]
listTemp = [ [0 for col in range(90)] for row in range(30)]
for i in range(30):
    quotes[i] = quotes_historical_yahoo_ochl(listDji[i], start, end)
days = len(quotes[0])
for i in range(30):
    for j in range(days-1):
        if (quotes[i][j][2] and quotes[i][j+1][2] and (quotes[i][j+1][2] >= quotes[i][j][2])):
            listTemp[i][j] = 1.0
        else:
            listTemp[i][j] = -1.0

data = vstack(listTemp)
centroids,_ = kmeans(data,4)   #float or double is supported
result,_= vq(data,centroids)
print result
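# A hypothetical follow-up: group the tickers by the cluster label that vq
# assigned to each stock's daily up/down pattern.
from collections import defaultdict
groups = defaultdict(list)
for ticker, label in zip(listDji, result):
    groups[label].append(ticker)
print(dict(groups))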
def sphere_tissue_image(size=100, n_points=12, n_layers=1): center = np.array([size / 2, size / 2, size / 2], float) coords = np.transpose(np.mgrid[0:size, 0:size, 0:size], (1, 2, 3, 0)).reshape((np.power(size, 3), 3)).astype(int) coords_distances = np.linalg.norm(coords - center, axis=1) points = {} layer_img = {} for layer in xrange(n_layers): radius = (layer + 1) * size / float(2 * n_layers + 1) layer_n_points = n_points * np.power(layer + 1, 2) layer_points = {} for p in range(layer_n_points): theta = np.random.rand() * 2. * np.pi phi = np.random.rand() * np.pi - np.pi / 2. layer_points[p + np.power(layer, 2) * n_points + 3] = center + radius * np.array([ np.cos(theta) * np.cos(phi), np.sin(theta) * np.cos(phi), np.sin(phi) ]) layer_points = array_dict(layer_points) point_target_area = 4. * np.pi * np.power( radius, 2.) / float(layer_n_points) point_target_distance = np.power(point_target_area / np.pi, 0.5) sigma_deformation = (size / 100.) * (20. / layer_n_points) omega_forces = dict(distance=0.1 * size / 100., repulsion=100.0 * np.power(size / 100., 2)) for iterations in xrange(100): point_vectors = np.array([ layer_points[p] - layer_points.values() for p in layer_points.keys() ]) point_distances = np.array([ vq(layer_points.values(), np.array([layer_points[p]]))[1] for p in layer_points.keys() ]) point_vectors = point_vectors / ( point_distances[..., np.newaxis] + 1e-7) point_distance_forces = omega_forces['distance'] * ( (point_distances - point_target_distance)[..., np.newaxis] * point_vectors / point_target_distance).sum(axis=1) point_repulsion_forces = omega_forces['repulsion'] * np.power( point_target_distance, 2) * (point_vectors / (np.power(point_distances, 2) + 1e-7)[..., np.newaxis]).sum(axis=1) point_forces = np.zeros((len(layer_points), 3)) point_forces += point_distance_forces point_forces += point_repulsion_forces point_forces = np.minimum( 1.0, sigma_deformation / np.linalg.norm( point_forces, axis=1))[:, np.newaxis] * point_forces new_points = layer_points.values() + point_forces new_points = center + radius * ( (new_points - center) / np.linalg.norm( (new_points - center), axis=1)[:, np.newaxis]) layer_points = array_dict(new_points, layer_points.keys()) for p in layer_points.keys(): points[p] = layer_points[p] labels = layer_points.keys()[vq(coords, layer_points.values())[0]] layer_img[layer + 1] = np.ones((size, size, size), np.uint8) layer_img[layer + 1][tuple(np.transpose(coords))] = labels points[2] = center points = array_dict(points) # coords = np.transpose(np.mgrid[0:size,0:size,0:size],(1,2,3,0)).reshape((np.power(size,3),3)).astype(int) # labels = points.keys()[vq(coords,points.values())[0]] img = np.ones((size, size, size), np.uint8) for layer in xrange(n_layers): layer_coords = coords[ (coords_distances > (2 * layer + 1) * size / float(4 * (n_layers + 1))) & (coords_distances <= (2 * layer + 3) * size / float(4 * (n_layers + 1)))] img[tuple(np.transpose(layer_coords))] = layer_img[layer + 1][tuple( np.transpose(layer_coords))] center_coords = coords[coords_distances <= size / float(4 * (n_layers + 1))] img[tuple(np.transpose(center_coords))] = 2 ext_coords = coords[coords_distances > (n_layers + 1) * size / float(2 * (n_layers + 2))] img[tuple(np.transpose(ext_coords))] = 1 img = SpatialImage(img, voxelsize=(60. / size, 60. / size, 60. / size)) return img
def pr07((xs, ys, ), var_number):
    template_vars = {}
    template_vars['var_number'] = var_number

    data = np.column_stack((xs, ys, )).astype('float')

    figure_filenames = []

    for n_clusters in [2, 3, 4]:
        # Cluster the 2-D points and look up the cluster index of each point.
        centroids, __ = vq.kmeans(data, n_clusters)
        idx, __ = vq.vq(data, centroids)

        plt.figure()
        for i in xrange(n_clusters):
            plt.plot(
                data[idx == i, 0],
                data[idx == i, 1],
                'o',
                markersize=2,
                color=CLUSTER_COLORS[i],
            )
        # Mark the centroids; the original excerpt is truncated here, so the
        # remaining marker options and the figure handling below are a guess.
        plt.plot(centroids[:, 0], centroids[:, 1], '*',
                 markersize=10, color='k')
        filename = 'pr07_var{0}_{1}_clusters.png'.format(var_number, n_clusters)
        plt.savefig(filename)
        figure_filenames.append(filename)

    template_vars['figure_filenames'] = figure_filenames
    return template_vars
def KMEANS(self):
    # number of clusters
    K = 3
    data_arr = []
    meal_name_arr = []
    # first column of the CSV is the book id, the remaining columns are the
    # numeric features used for clustering
    with open('./NewDataSet/Cluster_dataset/clusterisbnrate.csv', 'rb') as f:
        reader = csv.reader(f)
        for row in reader:
            if reader.line_num != 1:  # skip the header row
                data_arr.append([float(x) for x in row[1:]])
                meal_name_arr.append([row[0]])

    data = vstack(data_arr)
    print "data :"
    print data
    meal_name = vstack(meal_name_arr)

    # normalization: rescale each feature to unit variance before clustering
    data = whiten(data)

    # computing K-Means with K clusters
    centroids, distortion = kmeans(data, K)
    print "distortion = " + str(distortion)
    print "Centroids:"
    print centroids
    print "---------------------------------------------------------"

    # assign each sample to a cluster
    idx, _ = vq(data, centroids)
    print "idx:"
    print idx
    print "-----------------------------------------------------------"

    '''# some plotting using numpy's logical indexing
    plot(data[idx==0,0], data[idx==0,1],'ob',
         data[idx==1,0], data[idx==1,1],'or',
         data[idx==2,0], data[idx==2,1],'og')'''

    print meal_name
    print data
    print "max value:"
    max1 = max(centroids)
    print max1
    print "min value:"
    min1 = min(centroids)
    print min1

    toprated = []
    lowrated = []
    medrated = []
    # f1, f2 and f3 are assumed to be writable file handles opened elsewhere in
    # this class (top-, medium- and low-rated output files respectively).
    for i in range(K):
        result_names = meal_name[idx == i, 0]
        print "================================="
        print "Cluster " + str(i + 1) + " (centroid " + str(centroids[i]) + ")"
        for name1 in result_names:
            print name1
        # the cluster whose centroid is largest holds the top-rated items, the
        # smallest the low-rated ones; everything in between is medium-rated
        if centroids[i] == max1:
            for name1 in result_names:
                toprated.append(name1)
                f1.write(name1 + "\n")
        elif centroids[i] == min1:
            for name1 in result_names:
                lowrated.append(name1)
                f3.write(name1 + "\n")
        else:
            for name1 in result_names:
                medrated.append(name1)
                f2.write(name1 + "\n")

    print "--------------------------------------------------------------------------"
    print "toprated:"
    print toprated
    print "--------------------------------------------------------------------------"
    print "medrated:"
    print medrated
    print "--------------------------------------------------------------------------"
    print "lowrated:"
    print lowrated
    print "--------------------------------------------------------------------------"

    '''plot(centroids[:,0], centroids[:,1], 'sg', markersize=8)'''
    show()
def quantize(pixels, palette): """quantize an image with a given color palette""" # pixels = np.reshape(img, (img.shape[0] * img.shape[1], 3)) qnt, _ = vq(pixels, palette) centers_idx = np.reshape(qnt, (pixels.shape[0])) return centers_idx
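# A minimal usage sketch for quantize() above (hypothetical helper, not part of
# the original code): map the pixels of a small random RGB image onto a
# four-colour palette. Assumes numpy is imported as np and vq is available as
# in the function above.
def _demo_quantize():
    rng = np.random.RandomState(0)
    pixels = rng.randint(0, 256, size=(100, 3)).astype(float)  # 100 RGB pixels
    palette = np.array([[0, 0, 0], [255, 0, 0],
                        [0, 255, 0], [0, 0, 255]], dtype=float)
    indices = quantize(pixels, palette)   # palette index of every pixel
    return palette[indices]               # pixels re-expressed in the palette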
def test_vq(self):
    initc = np.concatenate(([[X[0]], [X[1]], [X[2]]]))
    for tp in np.array, np.matrix:
        # low-level C implementation
        label1, dist = _vq.vq(tp(X), tp(initc))
        assert_array_equal(label1, LABEL1)
        # the high-level wrapper should assign the same labels
        tlabel1, tdist = vq(tp(X), tp(initc))
        assert_array_equal(tlabel1, LABEL1)
def make_codeword(features, codebook): codeword, _ = vq(features, codebook) return codeword
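# A minimal usage sketch for make_codeword() above (hypothetical data, not part
# of the original code): quantize random 128-dimensional descriptors against a
# random codebook and turn the codewords into a bag-of-words histogram.
# Assumes numpy is imported as np and vq is imported as above.
def _demo_make_codeword(n_descriptors=500, n_words=64, dim=128):
    rng = np.random.RandomState(0)
    features = rng.rand(n_descriptors, dim)
    codebook = rng.rand(n_words, dim)
    codeword = make_codeword(features, codebook)      # index of nearest word
    hist = np.bincount(codeword, minlength=n_words)   # bag-of-words counts
    return hist.astype(float) / hist.sum()            # normalized histogram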
''' Draw Keypoints '''
# img=cv2.drawKeypoints(gray,kp,img)
# cv2.imwrite(target + '_keypoints1.jpg',img)

if (count % batch_size == 0 or len(image_paths) - count < 3):
    print "saving:", count, "/", len(image_paths)

    ''' K-Means '''
    # kmeans.partial_fit(descriptors)
    # Build a bag-of-words histogram per image: quantize each image's local
    # descriptors against the current cluster centres and count the hits.
    for i, des in enumerate(des_list):
        words, distance = vq(des, kmeans.cluster_centers_)
        for w in words:
            im_features[i][w] += 1
    des_list = []
    # descriptors = np.array([], dtype=np.int32).reshape(0,128)
    print "K-MEANS Partial Completed", str(time.time() - t0)

    ''' SVM '''
    lin_clf.fit(im_features, im_classes)
    with open("model_svm.pickle", "wb") as f1, open("clusters.pickle", "wb") as f2:
        pickle.dump(lin_clf, f1)
        pickle.dump(kmeans, f2)

    if count % test_split == 0:
        # The original excerpt is truncated here; a periodic evaluation of the
        # classifier on a held-out split would presumably follow.
        pass
def computeHistograms(self, codebook, descriptors):
    # Quantize every descriptor to its nearest codebook entry, then build a
    # normalized histogram of codeword occurrences (`normed` is deprecated in
    # numpy.histogram, so `density` is used instead; with unit-width integer
    # bins the result is the same).
    code, dist = vq.vq(descriptors, codebook)
    histogram_of_words, bin_edges = histogram(
        code, bins=range(codebook.shape[0] + 1), density=True)
    return histogram_of_words
def train_categorical_feature(feature_input, outcome, limit, number_of_clusters): input = feature_input.values if len(pd.unique(input)) == 2: vocabulary = np.unique(input) p = np.array([0, 1]) d = np.zeros(len(input), dtype=np.int) d[input == vocabulary[1]] = 1 output = dict(zip(["d", "vocabulary", "p"], [d, vocabulary, p])) print output return output vocabulary_t = pd.unique(input) count_1 = np.zeros(len(vocabulary_t), dtype=int) count_0 = np.copy(count_1) outcome_1 = outcome.values == 1 outcome_0 = outcome.values == 0 for index, item in enumerate(vocabulary_t): if pd.notnull(item): count_1[index] = sum((input == item) * (outcome_1)) count_0[index] = sum((input == item) * (outcome_0)) else: count_1[index] = sum(pd.isnull(input) * (outcome_1)) count_0[index] = sum(pd.isnull(input) * (outcome_0)) condition = (count_0 + count_1) >= limit condition[pd.isnull(vocabulary_t)] = True # n = sum(condition) # vocabulary = np.zeros(n, dtype = str) # p = np.zeros(n) def log_ratio(count_1, count_0): if count_1 == 0: return log(1 / (2 * float(count_0))) elif count_0 == 0: return log(2 * count_1) else: return log(count_1 / float(count_0)) v_log_ratio = np.vectorize(log_ratio) vocabulary = vocabulary_t[condition] p = v_log_ratio(count_1[condition], count_0[condition]) # index = 0 # for i in range(len(vocabulary_t)): # if (condition[i]): # vocabulary[index] = str(vocabulary_t[index]) # p[index] = log_ratio(count_1[index], count_0[index]) # index = index + 1 # if (count_1[index] == 0): # p[index] = log(1./(2*count_0[index])) # elif (count_0[index] == 0): # p[index] = log(2*count_1[index]) # else: # p[index] = log(count_1[index]./count_0[index]) # print "sum(condition == 0) is {0}".format(sum(condition == 0)) if sum(condition == 0) <= 1: if sum(condition == 0) == 1: p = np.append( p, log_ratio(count_1[condition == 0][0], count_0[condition == 0][0])) # if (count_1[condition == 0][0] == 0): # p[condition == 0] = log(1./(2*count_0[condition == 0][0])) # elif (count_0[condition == 0] == 0): # p[condition == 0] = log(2*count_1[condition == 0][0]) # else: # p[condition == 0] = log(count_1[condition == 0][0]./count_0[condtion == 0][0]) vocabulary = np.append(vocabulary, vocabulary_t[condition == 0]) else: # print "number of clusters {0}".format(number_of_clusters) cl = min(number_of_clusters, sum(condition == 0) - 1) # why is it -1 here? 
# cl_vocabulary = pd.DataFrame() # print "cl {0}".format(cl) residual_1 = count_1[condition == 0] residual_0 = count_0[condition == 0] # print "length of the residual_1 {0}".format(len(residual_1)) # s = np.zeros(len(residual_1)) s = v_log_ratio(residual_1, residual_0).reshape([len(residual_1), 1]) whitened = whiten(s) codebook = kmeans(whitened, cl)[0] code = vq(whitened, codebook)[0] # print "length of code {0}".format(len(code)) s1 = pd.Series(data=vocabulary_t[condition == 0]) # .astype(str) s2 = pd.Series(data=code) cl_vocabulary = pd.DataFrame.from_dict({ "cat_feature_input": s1, "cluster_id": s2 }) #print cl_vocabulary.axes cl_p = np.zeros(cl, dtype=float) # print cl_p, len(cl_p) for i in range(cl): # print i c1 = residual_1[code == i] c0 = residual_0[code == i] cl_p[i] = log_ratio(sum(c1), sum(c0)) # print "Hey" d = np.zeros(len(input)) d[pd.isnull(input)] = p[pd.isnull(vocabulary)] for i in range(len(vocabulary)): d[input == vocabulary[i]] = p[i] vocabulary = vocabulary.astype(str) if 'cl_vocabulary' in locals(): print "cl_vocabulary in locals()" for i in range(len(cl_vocabulary)): d[input == cl_vocabulary.loc[i, "cat_feature_input"]] = cl_p[ cl_vocabulary.loc[i, "cluster_id"]] #print cl_vocabulary.axes cl_vocabulary.loc[:, "cat_feature_input"] = cl_vocabulary[ "cat_feature_input"].astype(str) # print cl_vocabulary["cat_feature_input"].apply(type) output = dict( zip(["d", "vocabulary", "cl_vocabulary", "p", "cl_p"], [d, vocabulary, cl_vocabulary, p, cl_p])) else: output = dict(zip(["d", "vocabulary", "p"], [d, vocabulary, p])) #print output return output
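# A minimal usage sketch for train_categorical_feature() above (toy data, not
# part of the original code): encode a categorical column against a binary
# outcome. Assumes pandas as pd and numpy as np are imported, along with log,
# whiten, kmeans and vq as used by the function.
def _demo_train_categorical_feature():
    feature = pd.Series(['a', 'b', 'a', 'c', 'b', 'a', 'd', 'c', 'b', 'a'])
    outcome = pd.Series([1, 0, 1, 0, 1, 1, 0, 0, 1, 1])
    # Categories seen fewer than `limit` times are pooled and re-grouped into
    # at most `number_of_clusters` k-means clusters of their log-ratios.
    return train_categorical_feature(feature, outcome, limit=3,
                                     number_of_clusters=2)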
os.mkdir(base_path + os.sep + str(id))
#f, axes = plt.subplots(nrows=2, ncols=1, figsize=(15, 10))
sarraster = plt.imread(base_path + os.sep + 'subj' + os.sep + str(id) + '.gif')

# Removing speckles
sarraster = ndi.median_filter(sarraster, size=2)

# Flatten image to get a line of values
flatsarraster = sarraster.flatten().astype(float)
print(flatsarraster.shape)

# In the remaining subplots add k-means classified images
for i in range(2, 6):
    # This scipy code classifies with k-means; `code` has the same length as
    # the flattened SAR raster and gives the class each SAR value belongs to
    centroids, variance = kmeans(flatsarraster, i)
    code, distance = vq(flatsarraster, centroids)

    fig = plt.figure()
    fig.suptitle('K-Means Classification')

    # In the first subplot add the original SAR image
    ax = plt.subplot(241)
    plt.axis('off')
    ax.set_title('Original Image')
    plt.imshow(sarraster, cmap='gray')
    print(sarraster.shape)

    # Since code contains the classified values, reshape into SAR dimensions
    codeim = code.reshape(sarraster.shape[0], -1)
    print(codeim.shape)
    #codeim = ndi.median_filter(codeim , size=4)

    for j in range(i):
        # Plot the (j+2)th subplot with the pixels assigned to class j; the
        # original excerpt is truncated here, so this loop body is a minimal
        # reconstruction.
        ax = plt.subplot(2, 4, j + 2)
        plt.axis('off')
        ax.set_title('Class {0}'.format(j + 1))
        plt.imshow(codeim == j, cmap='gray')
def displayResult():
    noOfCluster = 0
    # Get the radio button input to check the user's choice
    chart = request.form['radio']
    # If the user's choice is cluster
    if chart == 'cluster':
        noOfCluster = long(request.form['cluster'])
        data_arr = []
        meal_name_arr = []
        # URL of the data csv
        url = 'https://storage.googleapis.com/cloudbucket786/imptry4.csv'
        response = urllib2.urlopen(url)
        reader = csv.reader(response)
        for row in reader:
            # clean up column 5 (missing values)
            if row[5] is None:
                row[5] = 0
            if row[5] == '':
                row[5] = 0
            # clean up column 6 (thousands separators, missing values)
            if "," in row[6]:
                rowVal = row[6].split(",")
                row[6] = rowVal[0] + '' + rowVal[1]
                row[6] = float(row[6])
            if row[6] == '':
                row[6] = 0
            if row[6] == 'N':
                row[6] = 0
            # clean up column 7 (thousands separators, missing values)
            if "," in row[7]:
                rowVal = row[7].split(",")
                row[7] = rowVal[0] + '' + rowVal[1]
                row[7] = float(row[7])
            if row[7] == '':
                row[7] = 0
            if row[7] == 'N':
                row[7] = 0
            data_arr.append([float(x) for x in row[5:]])  # adding data to data_arr
            meal_name_arr.append([row[0]])  # adding ids to the second array
        #print data_arr
        fig = plt.figure()
        ax = fig.add_subplot(111, projection='3d')  # 3D projection because we are plotting 3D data
        data = vstack(data_arr)
        meal_name = vstack(meal_name_arr)
        # normalization: before running k-means it is beneficial to rescale each
        # feature dimension of the observation set by whitening; each feature is
        # divided by its standard deviation across all observations to give it
        # unit variance
        data = whiten(data)
        # computing K-Means with K (clusters)
        centroids, distortion = kmeans(data, noOfCluster)
        # assign each sample to a cluster
        idx, _ = vq(data, centroids)
        # some plotting using numpy's logical indexing, drawn on the 3D axes
        # (the colour list supports up to 7 clusters)
        listOfColor = ['ob', 'or', 'og', 'oc', 'om', 'ok', 'oy']
        for index in range(noOfCluster):
            ax.plot(data[idx == index, 0], data[idx == index, 1],
                    data[idx == index, 2], listOfColor[index])
        for index in range(noOfCluster):
            result_names = meal_name[idx == index, 0]
            print "================================="
            print "Cluster " + str(index + 1)
            for name in result_names:
                print name
        ax.plot(centroids[:, 0], centroids[:, 1], centroids[:, 2], 'oy', markersize=8)
        # assign labels to the axes and save the figure to a temp image
        ax.set_xlabel('X Label')
        ax.set_ylabel('Y Label')
        ax.set_zlabel('Z Label')
        pylab.savefig('temp.jpg')
        pylab.clf()
        # overwrites the image hosted on pythonanywhere.com
        image = "https://www.pythonanywhere.com/user/abhitej/files/home/abhitej/temp.jpg"
        return render_template('home.html', image=image, display='display:block;')
    else:
        # If the user chose the word cloud, hide the cluster display
        words = request.form['words']
        list1 = []
        for s in words.split(","):
            list1.append(s.encode('ascii', 'ignore'))
        return render_template('home.html', list1=list1, display='display:none;')