def kmeans2():
    features = locations()
    whitened = whiten(features)
    book = array((whitened[0], whitened[2]))
    kmeans(whitened, book)
    # Example output:
    # (array([[ 2.3110306 ,  2.86287398],
    #         [ 0.93218041,  1.24398691]]), 0.85684700941625547)
def kmeans1():
    features = array([[1.9, 2.3],
                      [1.5, 2.5],
                      [0.8, 0.6],
                      [0.4, 1.8],
                      [0.1, 0.1],
                      [0.2, 1.8],
                      [2.0, 0.5],
                      [0.3, 1.5],
                      [1.0, 1.0]])
    whitened = whiten(features)
    book = array((whitened[0], whitened[2]))
    kmeans(whitened, book)
    # Example output:
    # (array([[ 2.3110306 ,  2.86287398],
    #         [ 0.93218041,  1.24398691]]), 0.85684700941625547)
def clustering_scipy_kmeans(features, n_clust=8):
    """Cluster whitened features with k-means: plot the distortion for k = 1..11
    (elbow plot), then run kmeans() and kmeans2() with n_clust clusters and plot
    the resulting codebook vectors.
    """
    whitened = whiten(features)
    print whitened.shape
    initial = [kmeans(whitened, i) for i in np.arange(1, 12)]
    plt.plot([var for (cent, var) in initial])
    plt.show()

    #cent, var = initial[3]
    ##use vq() to get as assignment for each obs.
    #assignment,cdist = vq(whitened,cent)
    #plt.scatter(whitened[:,0], whitened[:,1], c=assignment)
    #plt.show()

    codebook, distortion = kmeans(whitened, n_clust)
    print codebook, distortion
    assigned_label, dist = vq(whitened, codebook)
    for ii in range(8):
        plt.subplot(4, 2, ii + 1)
        plt.plot(codebook[ii])
    plt.show()

    centroid, label = kmeans2(whitened, n_clust, minit='points')
    print centroid, label
    for ii in range(8):
        plt.subplot(4, 2, ii + 1)  # subplot indices start at 1
        plt.plot(centroid[ii])
    plt.show()
def kmeans_net(net, layers, num_c=16, initials=None):
    # net: the network
    # layers: the layers to quantize
    # num_c: quantization levels per layer
    # initials: initial cluster centers
    codebook = {}  # quantization codebook
    if type(num_c) == type(1):
        num_c = [num_c] * len(layers)
    else:
        assert len(num_c) == len(layers)

    # Run a clustering analysis on each layer
    print "==============Perform K-means============="
    for idx, layer in enumerate(layers):
        print "Eval layer:", layer
        W = net.params[layer][0].data.flatten()
        W = W[np.where(W != 0)]  # keep only the non-zero weights

        # By default, spread the initial cluster centers linearly over the weight range
        if initials is None:  # Default: uniform sample
            min_W = np.min(W)
            max_W = np.max(W)
            initial_uni = np.linspace(min_W, max_W, num_c[idx] - 1)
            codebook[layer], _ = scv.kmeans(W, initial_uni)
        elif type(initials) == type(np.array([])):
            codebook[layer], _ = scv.kmeans(W, initials)
        elif initials == 'random':
            codebook[layer], _ = scv.kmeans(W, num_c[idx] - 1)
        else:
            raise Exception

        # Re-attach the zero weight as its own code
        codebook[layer] = np.append(0.0, codebook[layer])
        print "codebook size:", len(codebook[layer])

    return codebook
def custom():
    _items = {}
    users = []
    for line in open('my_items_likehood.txt'):
        user, item = keys(line)
        users.append(user)
        if item in _items:
            _items[item].append(user)
        else:
            _items[item] = [user]

    sorted_users = sorted(users)
    l = len(sorted_users)

    items = {}
    count = 0
    features = []
    for item in _items:
        features.append(user_matrix(l, _items[item], sorted_users))
        if count == 100:
            break
        count += 1

    print 'whiten'
    whitened = whiten(array(features))
    print 'kmeans'
    print kmeans(whitened)
    print "%d items voted by %d users" % (len(items), len(users))
def _get_larger_chroms(ref_file): """Retrieve larger chromosomes, avoiding the smaller ones for plotting. """ from scipy.cluster.vq import kmeans, vq all_sizes = [] for c in ref.file_contigs(ref_file): all_sizes.append(float(c.size)) all_sizes.sort() # separate out smaller chromosomes and haplotypes with kmeans centroids, _ = kmeans(np.array(all_sizes), 2) idx, _ = vq(np.array(all_sizes), centroids) little_sizes = tz.first(tz.partitionby(lambda xs: xs[0], zip(idx, all_sizes))) little_sizes = [x[1] for x in little_sizes] # create one more cluster with the smaller, removing the haplotypes centroids2, _ = kmeans(np.array(little_sizes), 2) idx2, _ = vq(np.array(little_sizes), centroids2) little_sizes2 = tz.first(tz.partitionby(lambda xs: xs[0], zip(idx2, little_sizes))) little_sizes2 = [x[1] for x in little_sizes2] # get any chromosomes not in haplotype/random bin thresh = max(little_sizes2) larger_chroms = [] for c in ref.file_contigs(ref_file): if c.size > thresh: larger_chroms.append(c.name) return larger_chroms
def cluster(df, means, csv_min, csv_max):
    data = []
    for i in range(csv_min, csv_max):
        a = array(df.iloc[:, i].values)  # positional column selection (df.ix is removed in modern pandas)
        b = a[a != "--"]
        # run k-means once per column; reusing the same result avoids printing and
        # appending two different random-start solutions
        centers = np.sort(kmeans(b.astype(float), means)[0])
        print(centers)
        data.append(centers)
    return data
def test_kmeans_lost_cluster(self): # This will cause kmean to have a cluster with no points. data = np.fromfile(DATAFILE1, sep=", ") data = data.reshape((200, 2)) initk = np.array([[-1.8127404, -0.67128041], [2.04621601, 0.07401111], [-2.31149087, -0.05160469]]) kmeans(data, initk) with warnings.catch_warnings(): warnings.simplefilter("ignore", UserWarning) kmeans2(data, initk, missing="warn") assert_raises(ClusterError, kmeans2, data, initk, missing="raise")
def test_kmeans_lost_cluster(self): # This will cause kmean to have a cluster with no points. data = TESTDATA_2D initk = np.array([[-1.8127404, -0.67128041], [2.04621601, 0.07401111], [-2.31149087,-0.05160469]]) kmeans(data, initk) with warnings.catch_warnings(): warnings.simplefilter('ignore', UserWarning) kmeans2(data, initk, missing='warn') assert_raises(ClusterError, kmeans2, data, initk, missing='raise')
def clusterkmeans(self):
    wh = whiten(self.counts)  # normalize the counts for easier clustering
    scale = self.counts[0] / wh[0]
    # compute kmeans for k = 1, 2; compare the distortions and choose the better one
    one = kmeans(wh, 1)
    two = kmeans(wh, 2)
    if one[1] < two[1]:
        print 'found only one cluster'
        threshold = None
    else:
        km = two
        threshold = scale * km[0].mean()  # set the threshold to be the average of the two centers
    return threshold
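# A minimal standalone sketch of the distortion comparison used in clusterkmeans()
# above: whiten 1-D counts, fit k-means with k=1 and k=2, and only report a threshold
# when two clusters fit better than one. The synthetic `counts` array below is an
# illustrative assumption, not data from the original code.
import numpy as np
from scipy.cluster.vq import whiten, kmeans

counts = np.concatenate([np.random.poisson(5, 200), np.random.poisson(60, 50)]).astype(float)
wh = whiten(counts)
scale = counts.std()          # whiten() divides by the standard deviation, so this undoes it
one = kmeans(wh, 1)
two = kmeans(wh, 2)
threshold = None if one[1] < two[1] else scale * two[0].mean()
print(threshold)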
def test_kmeans_lost_cluster(self): # This will cause kmeans to have a cluster with no points. data = TESTDATA_2D initk = np.array([[-1.8127404, -0.67128041], [2.04621601, 0.07401111], [-2.31149087,-0.05160469]]) with suppress_warnings() as sup: sup.filter(UserWarning, "One of the clusters is empty. Re-run kmean with a different initialization") kmeans(data, initk) kmeans2(data, initk, missing='warn') assert_raises(ClusterError, kmeans2, data, initk, missing='raise')
def cluster_points(coord_points, N): """ Function that returns k which is an nx2 matrix of lon-lat vector columns containing the optimal cluster centroid spacings within a large set of random numbers e.g. those produced by the many_points() function above! """ return kmeans(coord_points, N)[0]
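# Hypothetical usage of cluster_points() above: 500 random lon-lat pairs reduced to
# N=5 centroid locations. The coordinate ranges are placeholders for illustration.
import numpy as np

coord_points = np.column_stack([np.random.uniform(-10, 10, 500),   # lon
                                np.random.uniform(40, 60, 500)])   # lat
centers = cluster_points(coord_points, 5)
print(centers)   # typically a (5, 2) array, one lon-lat row per cluster centroid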
def createdatabase():
    X_train = detectcompute(train1)

    print "Clustering the data with K-means"
    codebook, distortion = kmeans(whiten(X_train), k)
    print "Done.\n"

    imtrain = singledetect(test1)
    Pdatabase = bow(imtrain, codebook, k)  # Pseudo database with list structure

    # Writing the database to an HTML table
    print "Converting the database into a HTML file"
    htmltable = open("table.htm", "w")  # "w" so the file is created/overwritten from scratch
    begin = "<html><body><table cellpadding=5><tr><th>Filename</th><th>Histogram</th></tr>"
    htmltable.write(begin)

    for i in range(len(Pdatabase)):
        middle = "<tr><td>%(filename)s</td><td>%(histogram)s</td></tr>" % {"filename": Pdatabase[i][0], "histogram": Pdatabase[i][-1]}
        htmltable.write(middle)

    end = "</table></body></html>"
    htmltable.write(end)
    htmltable.close()
    print "Done.\n"

    codebook_to_file(codebook)
def kmeans(features, projection, ite=50, k=4, threshold=1e-5):
    """
    Perform k-means clustering and return the result as a subspace clustering object.
    """
    from scipy.cluster.vq import kmeans, vq
    import datetime
    from measures import spatial_coherence

    centroids, distance = kmeans(features, k, iter=ite, thresh=threshold)
    code, _ = vq(features, centroids)

    run_ = datetime.datetime.now().strftime("%y_%m_%d_%H_%M")
    params = "projection_size=%d, k=%d" % (len(projection), k)
    clusters = clusters_from_code(code, k, projection)

    clustering_id = "(%s)_(%s)_(%s)_(%s)" % ("exhaustive_kmeans", params, run_, projection)
    #print clustering_id
    km_clt = KMClustering(algorithm="exhaustive_kmeans", parameters=params, run=run_,
                          clustering_id=clustering_id, clusters=clusters,
                          ccontains_noise=False, cclustering_on_dimension=True)

    measures = {'spatial_coherence': spatial_coherence(km_clt, len(features))[0], 'distortion': distance}
    km_clt.update_measures(measures)
    return km_clt
def run_kmeans(whitened, k=3): book = list() for i in range(k): book.append(whitened[i]) codebook, distortion = kmeans(whitened, array(book)) # codebook, distortion = kmeans(whitened, k) return codebook
def worldplot(self,kmeans=None,proj='merc'): """ plots customer GPS location on a map with state and national boundaries. IN kmeans (int) number of means for k-means clustering, default=None proj (string) the map projection to use, use 'robin' to plot the whole earth, default='merc' """ # create a matplotlib Basemap object if proj == 'robin': my_map = Basemap(projection=proj,lat_0=0,lon_0=0,resolution='l',area_thresh=1000) else: my_map = Basemap(projection=proj,lat_0=33.,lon_0=-125.,resolution='l',area_thresh=1000., llcrnrlon=-130.,llcrnrlat=25,urcrnrlon=-65., urcrnrlat=50) my_map.drawcoastlines(color='grey') my_map.drawcountries(color='grey') my_map.drawstates(color='grey') my_map.drawlsmask(land_color='white',ocean_color='white') my_map.drawmapboundary() #my_map.fillcontinents(color='black') x,y = my_map(np.array(self.data['lon']),np.array(self.data['lat'])) my_map.plot(x,y,'ro',markersize=3,alpha=.4,linewidth=0) if kmeans: # k-means clustering algorithm---see wikipedia for details data_in = self.data.drop(['id','clv','level'],axis=1) # vq is scipy's vector quantization module output,distortion = vq.kmeans(data_in,kmeans) x1,y1 = my_map(output[:,1],output[:,0]) my_map.plot(x1,y1,'ko',markersize=20,alpha=.4,linewidth=0) plt.show() return output
def classify_kmeans(infile, clusternumber): ''' apply kmeans ''' #Load infile in data array driver = gdal.GetDriverByName('GTiff') driver.Register() ds = gdal.Open(infile, gdal.GA_Update) databand = ds.GetRasterBand(1) #Read input raster into array data = ds.ReadAsArray() #replace no data value with numpy.nan #data[data==-999.0]=numpy.nan pixel = numpy.reshape(data,(data.shape[0]*data.shape[1])) centroids, variance = kmeans(pixel, clusternumber) code, distance = vq(pixel,centroids) centers_idx = numpy.reshape(code,(data.shape[0],data.shape[1])) clustered = centroids[centers_idx] # Write outraster to file databand.WriteArray(clustered) databand.FlushCache() #Close file databand = None clustered = None ds = None
def kmeans(iData, clustNumber, oPrefix, norm=False): '''Perform k-means cluster analysis and return MAP of zones''' print 'Run K-Means' height, width = iData.shape[1:3] #reshape 3D cube of data into 2D matrix and get indeces of valid pixels iData, notNanDataI = cube2flat(iData) if norm: #center and norm iDataMean = iData[:, notNanDataI].mean(axis=1) iDataStd = iData[:, notNanDataI].std(axis=1) iData = np.subtract(iData.T, iDataMean).T iData = np.divide(iData.T, iDataStd).T #perform kmeans on valid data and return codebook codeBook = vq.kmeans(iData[:, notNanDataI].astype('f8').T, clustNumber)[0] #perform vector quantization of input data uzing the codebook #return vector of labels (for each valid pixel) labelVec = vq.vq(iData[:, notNanDataI].astype('f8').T, codeBook)[0]+1 #create and fill MAP of zones zoneMap = np.zeros(width*height) + np.nan zoneMap[notNanDataI] = labelVec zoneMap = zoneMap.reshape(height, width) #visualize map of zones plt.imsave(oPrefix + 'zones.png', zoneMap) return zoneMap
def test_large_features(self):
    # Generate a data set with large values, and run kmeans on it
    # (regression test for ticket 1077).
    d = 300
    n = 100

    m1 = np.random.randn(d)
    m2 = np.random.randn(d)
    x = 10000 * np.random.randn(n, d) - 20000 * m1
    y = 10000 * np.random.randn(n, d) + 20000 * m2

    data = np.empty((x.shape[0] + y.shape[0], d), np.double)
    data[:x.shape[0]] = x
    data[x.shape[0]:] = y

    kmeans(data, 2)
def read_unclustered_data(filename, num_clusters, cl_type="kMeans"):
    """Return dictionary of cluster id to array of points.

    Given a filename in the format of lat, lng generate k clusters
    based on arguments. Outputs a dictionary with the cluster id as
    the key mapped to a list of lat, lng pts
    """
    request_points = []
    with open(filename, 'rb') as input_file:
        input_file.next()  # Skip the header row
        for line in input_file:
            lat, lng = line.split(',')
            request_points.append((float(lat), float(lng)))
    request_points = array(request_points)

    if cl_type == "kMeans":
        # computing K-Means with K = num_clusters
        centroids, _ = kmeans(request_points, int(num_clusters))
        # assign each sample to a cluster
        idx, _ = vq(request_points, centroids)
    else:
        # computing kMedoids using the distance matrix
        centroids = get_kmedoids(request_points, int(num_clusters))
        # assign each sample to a cluster
        idx, _ = vq(request_points, centroids)

    # map cluster lat, lng to cluster index
    cluster_points = defaultdict(list)
    for i in xrange(len(request_points)):
        lat, lng = request_points[i]
        cluster_points[idx[i]].append((lat, lng))
    return cluster_points
def getPupilThresholdWithClustering(gray, K=2, distanceWeight=2, resizeTo=(40, 40)):
    ''' Detects the pupil in the image, gray, using k-means
        gray           : gray scale image
        K              : number of clusters
        distanceWeight : defines the weight of the position parameters
        resizeTo       : the size of the image to do k-means on
    '''
    smallI = cv2.resize(gray, resizeTo)
    M, N = smallI.shape
    # Generate coordinates in a matrix
    X, Y = np.meshgrid(range(M), range(N))

    # Flatten coordinates and intensities into 1-D vectors
    z = smallI.flatten()
    x = X.flatten()
    y = Y.flatten()

    # Make a feature vector per pixel containing (intensity, y, x)
    features = np.zeros((len(x), 3))
    features[:, 0] = z
    features[:, 1] = y / distanceWeight  # divide so that position weighs less than intensity
    features[:, 2] = x / distanceWeight
    features = np.array(features, 'f')

    # cluster data
    centroids, variance = vq.kmeans(features, K)
    plotClusters(centroids, features, M, N)
    centroidsByPupilCandidacy = sorted(centroids, key=lambda c: evaluateCentroid(c, resizeTo))
    return centroidsByPupilCandidacy[-1][0] + 10
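# Hypothetical usage of getPupilThresholdWithClustering() above. 'eye.png' is a
# placeholder path, and the call assumes the plotClusters() and evaluateCentroid()
# helpers referenced by the function exist in the same module. Keeping the pixels
# darker than the returned intensity is one plausible way to use the result.
import cv2

gray = cv2.imread('eye.png', cv2.IMREAD_GRAYSCALE)
pupil_thr = getPupilThresholdWithClustering(gray, K=2, distanceWeight=2)
pupil_mask = (gray < pupil_thr).astype('uint8') * 255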
def performMCCAlgorithm(dataSet, specificDataPointIndex, numIterations=200, numClusters=4, subDataRatio=0.5):
    periodsAhead = np.array([1, 2, 3, 4, 5, 6, 9, 12, 18, 24, 36, 60, 120])
    strippedDataSet = dataSet
    dataLength = strippedDataSet.shape[0]
    dataWidth = strippedDataSet.shape[1]
    specificDataPoint = strippedDataSet[specificDataPointIndex, :]
    numPeriods = len(periodsAhead)
    statisticWeightsbyIteration = np.empty(shape=(numIterations, 4), dtype=float)
    # Perform Bootstrapped Clustering
    for i in range(0, numIterations):
        # Perform Bootstrapped Clustering / Choose Data Subset
        # (np.random.choice requires an integer sample size)
        subDataSetIndexes = np.random.choice(range(0, dataLength), size=int(dataLength * subDataRatio), replace=True)
        subDataSet = strippedDataSet[subDataSetIndexes, :]
        # Perform Bootstrapped Clustering / Find Data Clusters for Subset of Data
        kMeansClusters = spc.kmeans(subDataSet, numClusters)
        clusterCenters = kMeansClusters[0]
        # Perform Bootstrapped Clustering / Record Clustering Cost for Weighting Scheme
        clusteringCost = kMeansClusters[1]
        statisticWeightsbyIteration[i, 0] = clusteringCost
        # Perform Bootstrapped Clustering / Apply Found Data Clusters to All Data
        allClusters = spc.vq(strippedDataSet, clusterCenters)
        clusterAssignments = allClusters[0]
        clusterDistortions = allClusters[1]
        display = 1  # TEST
        if display:  # TEST
            plt.scatter(dataSet[0:60, 0], dataSet[0:60, 1], c=clusterAssignments[0:60])  # TEST
            plt.show()
        statisticWeightsbyIteration[i, 1] = max(clusterDistortions)
        statisticWeightsbyIteration[i, 2] = np.mean(clusterDistortions)
        statisticWeightsbyIteration[i, 3] = np.std(clusterDistortions)
    return statisticWeightsbyIteration
def create_code_book(input_filename, num_clusters, num_observations, feature_list=utils.QUIVER_FEATURES): """Create a code book from a cmp.h5 file. Args: input_filename: path to the SAM or cmp.h5 file num_clusters: the number of codes to create in the code book num_observations: the number of bases to use to create the code book clusters feature_list: the list of features to read from the cmp.h5 to cluster Returns: code_book: a numpy array of cluster centers. rows are codes, columns are features feature_list: labels for the columns of the code book """ log.debug("Checking for missing features...") if input_filename.endswith(".cmp.h5"): training_array = read_cmph5(input_filename, feature_list, num_observations) elif input_filename.endswith(".sam") or input_filename.endswith(".bam"): training_array = read_sam(input_filename, feature_list, num_observations) else: raise RuntimeError, "Input file must be SAM, BAM, or cmp.h5" clusterable_array, std_dev = make_data_clusterable(training_array, feature_list) code_book, distortion = vq.kmeans(clusterable_array, num_clusters) raw_code_book = convert_to_raw(code_book, feature_list, std_dev) return raw_code_book, feature_list
def train(self, featurefiles, k=100, subsampling=10): """Train a vocabulary from features in files listed in |featurefiles| using k-means with k words. Subsampling of training data can be used for speedup. """ image_count = len(featurefiles) descr = [] descr.append(sift.read_features_from_file(featurefiles[0])[1]) descriptors = descr[0] # Stack features for k-means. for i in numpy.arange(1, image_count): descr.append(sift.read_features_from_file(featurefiles[i])[1]) descriptors = numpy.vstack((descriptors, descr[i])) # Run k-means. self.voc, distortion = vq.kmeans(descriptors[::subsampling, :], k, 1) self.word_count = self.voc.shape[0] # Project training data on vocabulary. imwords = numpy.zeros((image_count, self.word_count)) for i in range(image_count): imwords[i] = self.project(descr[i]) occurence_count = numpy.sum((imwords > 0)*1, axis=0) self.idf = numpy.log(image_count / (occurence_count + 1.0)) self.trainingdata = featurefiles
def connected_regions(image): """ Converts image into grayscale, quantizes, counts connected regions """ # render_image(image) colors = 2 # Quantization into two colors image_rgb = np.dstack(image) pixels = np.reshape( image_rgb, (image_rgb.shape[0] * image_rgb.shape[1], image_rgb.shape[2]) ) centroids, _ = vq.kmeans(pixels, colors) quantized, _ = vq.vq(pixels, centroids) quantized_idx = quantized.reshape( (image_rgb.shape[0], image_rgb.shape[1]) ) if len(centroids) > 1: # for_render = (quantized_idx * 255).astype(np.uint8) # render_image(for_render) regions = len(region_sizes(quantized_idx)) regions_inverted = len(region_sizes(1 - quantized_idx)) # import pdb; pdb.set_trace() # if regions == 0: # regions = image[0].shape[0] * image[0].shape[1] # print regions return max([regions, regions_inverted]) else: return 0
def main(): args = get_args() # This catches files sent in with stdin if isinstance(args.infile, TextIOWrapper): data = JSONFile(args.infile, True) else: data = args.infile points = np.array([ [point.get('lon'), point.get('lat')] for point in data ]) # In testing, found that a higher number of iterations led to less # errors due to missing centroids (Note: whitening led to worse results) centroids, distortion = kmeans(points, args.number_of_vans, 2000) index, distortion = vq(points, centroids) vans = [[] for _ in range(args.number_of_vans)] for i, point in enumerate(data): vans[index[i]].append(point) vans = distribute(vans, len(data), centroids) create_output(args.outfile, vans)
def build_cluster(image, featureValue, K): img = cv2.imread(image) fast = cv2.FastFeatureDetector(featureValue) orb = cv2.ORB(180) kp = fast.detect(img,None) kp, des = orb.compute(img, kp) # build keypoints location array for cluster locations = np.empty((len(kp),2)) for i in range(len(kp)): loc = array((int(kp[i].pt[0]), int(kp[i].pt[1]))) locations[i]=loc kcenters, distortion = kmeans(locations, K) kcenters = kcenters[kcenters[:,0].argsort()] # cluster index: 0: left eye, 1 mouth and nose, 2: right eye kpCluster = {i: [] for i in range(K)} clusterLoc = {i: [] for i in range(K)} for i in range(len(kp)): set = 0 minDis = sys.maxint for j in range(K): dis = euclidean(locations[i], kcenters[j]) if dis<minDis: set = j minDis = dis kpCluster[set].append(kp[i]) clusterLoc[set].append(locations[i]) imageFeature = [len(kp)] for i in range(K): clusterFeature = cluster_feature(clusterLoc[i], kcenters[i]) imageFeature = imageFeature + clusterFeature return imageFeature
def spectral_clustering(W, k): # ====================== ADD YOUR CODE HERE ====================== # Instructions: Perform spectral clustering to partition the # data into k clusters. Implement the steps that # are described in Algorithm 2 on the assignment. L = diag(sum(W, axis=0)) - W w, v = linalg.eig(L) y = real(v[:, w.argsort()[:k]]) clusters, _ = kmeans(y, k) labels = zeros(y.shape[0]) for i in range(y.shape[0]): dist = inf for j in range(k): distance = euclideanDistance(y[i], clusters[j]) if distance < dist: dist = distance labels[i] = j # ============================================================= return labels
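# A minimal, self-contained sketch of the same spectral-clustering pipeline as above
# (unnormalised graph Laplacian, bottom-k eigenvectors, then k-means), written with
# explicit NumPy/SciPy imports rather than the star imports the assignment code relies
# on. The two-blob data and the RBF affinity are illustrative choices, not part of the
# original assignment.
import numpy as np
from scipy.cluster.vq import kmeans, vq

pts = np.vstack([np.random.randn(20, 2), np.random.randn(20, 2) + 6.0])
sq_d = ((pts[:, None, :] - pts[None, :, :]) ** 2).sum(-1)
W = np.exp(-sq_d / 2.0)                      # RBF affinity matrix
L = np.diag(W.sum(axis=0)) - W               # unnormalised Laplacian
w, v = np.linalg.eigh(L)                     # eigh: L is symmetric
Y = v[:, np.argsort(w)[:2]]                  # embedding from the 2 smallest eigenvectors
centers, _ = kmeans(Y, 2)
labels, _ = vq(Y, centers)
print(labels)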
def main(): gdal.AllRegister() infile = auxil.select_infile() if infile: inDataset = gdal.Open(infile,GA_ReadOnly) cols = inDataset.RasterXSize rows = inDataset.RasterYSize bands = inDataset.RasterCount else: return pos = auxil.select_pos(bands) bands = len(pos) x0,y0,rows,cols=auxil.select_dims([0,0,rows,cols]) K = auxil.select_integer(6,msg='Number clusters') G = zeros((rows*cols,len(pos))) k = 0 for b in pos: band = inDataset.GetRasterBand(b) G[:,k] = band.ReadAsArray(x0,y0,cols,rows)\ .astype(float).ravel() k += 1 centers, _ = kmeans(G,K) labels, _ = vq(G,centers) outfile,fmt = auxil.select_outfilefmt() if outfile: driver = gdal.GetDriverByName(fmt) outDataset = driver.Create(outfile, cols,rows,1,GDT_Byte) outBand = outDataset.GetRasterBand(1) outBand.WriteArray(reshape(labels,(rows,cols))\ ,0,0) outBand.FlushCache() outDataset = None inDataset = None
def kmeans(X, K):
    """
    kmeans to find clusters:
        X: dataset
        K: num of clusters
    #todo: this is implemented just for 2 clusters; the initialization needs to be re-implemented
    """
    ret = {"mean": [], "cov": [], "coff": []}
    kmean_ret = vq.kmeans(X, K)

    ## assign each point to its nearest cluster centre to calculate the covariance
    data = []
    for i in range(0, K, 1):
        data.append([])
    for i in range(0, X.shape[0], 1):
        min_dis = None
        min_idx = -1
        for j in range(0, K, 1):
            _dis = ((X[i] - kmean_ret[0][j]) ** 2).sum()
            if min_dis is None or _dis < min_dis:
                min_dis = _dis
                min_idx = j
        data[min_idx].append(X[i])

    for i in range(0, K, 1):
        data[i] = np.asarray(data[i])
        ret["cov"].append(np.cov(data[i].transpose()))
        ret["mean"].append(kmean_ret[0][i])
        ret["coff"].append(float(data[i].size / 2) / X.shape[0])
    return ret
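# Hypothetical usage of the kmeans()-based GMM initializer above: two well-separated
# 2-D blobs with K=2. Assumes scipy.cluster.vq is imported as `vq` and numpy as `np`,
# as the function body implies.
import numpy as np

X = np.vstack([np.random.randn(100, 2), np.random.randn(100, 2) + 8.0])
init = kmeans(X, 2)
print(init["mean"])   # the two cluster means found by scipy's kmeans
print(init["coff"])   # mixing coefficients, roughly 0.5 each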
from scipy.cluster.vq import kmeans, whiten from numpy import genfromtxt, zeros from matplotlib import pyplot as plt consolidated_data = genfromtxt('../new_consolidated_data.csv', delimiter=',', skip_header=1) features = consolidated_data[:, 2:-13] whitened = whiten(features) k_distortion = zeros(50) for k in range(1, 51): centroids, distortion = kmeans(whitened, k) k_distortion[k - 1] = distortion fig, ax = plt.subplots() plt.plot(range(1, 51), k_distortion) plt.xlabel("Number of clusters") plt.ylabel("Distortion") plt.savefig("output/states_all_year_elbow.svg") plt.show()
def testCluster(self): print "< testCluster >" numVertices = 8 graph = SparseGraph(GeneralVertexList(numVertices)) graph.addEdge(0, 1) graph.addEdge(0, 2) graph.addEdge(1, 2) graph.addEdge(3, 4) graph.addEdge(3, 5) graph.addEdge(4, 5) graph.addEdge(0, 3) W = graph.getWeightMatrix() graphIterator = [] graphIterator.append(W[0:6, 0:6].copy()) W[1, 6] += 1 W[6, 1] += 1 graphIterator.append(W[0:7, 0:7].copy()) W[4, 7] += 1 W[7, 4] += 1 graphIterator.append(W.copy()) graphIterator = iter(graphIterator) k = 2 clusterer = NingSpectralClustering(k) clustersList = clusterer.cluster( toSparseGraphListIterator(graphIterator)) #Why are the bottom rows of Q still zero? #Try example in which only edges change numVertices = 7 graph = SparseGraph(GeneralVertexList(numVertices)) graph.addEdge(0, 1) graph.addEdge(0, 2) graph.addEdge(1, 2) graph.addEdge(3, 4) WList = [] W = graph.getWeightMatrix() WList.append(W[0:5, 0:5].copy()) graph.addEdge(3, 5) graph.addEdge(4, 5) W = graph.getWeightMatrix() WList.append(W[0:6, 0:6].copy()) graph.addEdge(0, 6) graph.addEdge(1, 6) graph.addEdge(2, 6) W = graph.getWeightMatrix() WList.append(W[0:7, 0:7].copy()) iterator = iter(WList) clustersList = clusterer.cluster(toSparseGraphListIterator(iterator)) #Seems to work, amazingly #print(clustersList) #Try removing rows/cols W2 = W[0:5, 0:5] W3 = W[0:4, 0:4] WList = [W, W2, W3] iterator = iter(WList) clustersList = clusterer.cluster(toSparseGraphListIterator(iterator)) #nptst.assert_array_equal(clustersList[0][0:5], clustersList[1]) nptst.assert_array_equal(clustersList[1][0:4], clustersList[2]) #Make sure 1st clustering (without updates) is correct L = GraphUtils.normalisedLaplacianRw(scipy.sparse.csr_matrix(W)) numpy.random.seed(21) lmbda, Q = scipy.sparse.linalg.eigs(L, min(k, L.shape[0] - 1), which="SM", ncv=min(20 * k, L.shape[0]), v0=numpy.random.rand(L.shape[0])) V = VqUtils.whiten(Q) centroids, distortion = vq.kmeans(V, k, iter=20) clusters, distortion = vq.vq(V, centroids) #This should be equal but the eigenvector computation is unstable #even with repeated runs (and no way to set the seed) nptst.assert_array_equal(clusters, clustersList[0])
def kmean_anchors(path='../data/coco128.yaml', n=9, img_size=640, thr=4.0, gen=1000, verbose=True): """ Creates kmeans-evolved anchors from training dataset Arguments: path: path to dataset *.yaml, or a loaded dataset n: number of anchors img_size: image size used for training thr: anchor-label wh ratio threshold hyperparameter hyp['anchor_t'] used for training, default=4.0 gen: generations to evolve anchors using genetic algorithm verbose: print all results Return: k: kmeans evolved anchors Usage: from utils.autoanchor import *; _ = kmean_anchors() """ thr = 1. / thr def metric(k, wh): # compute metrics r = wh[:, None] / k[None] x = torch.min(r, 1. / r).min(2)[0] # ratio metric # x = wh_iou(wh, torch.tensor(k)) # iou metric return x, x.max(1)[0] # x, best_x def anchor_fitness(k): # mutation fitness _, best = metric(torch.tensor(k, dtype=torch.float32), wh) return (best * (best > thr).float()).mean() # fitness def print_results(k): k = k[np.argsort(k.prod(1))] # sort small to large x, best = metric(k, wh0) bpr, aat = (best > thr).float().mean(), (x > thr).float().mean() * n # best possible recall, anch > thr print('thr=%.2f: %.4f best possible recall, %.2f anchors past thr' % (thr, bpr, aat)) print('n=%g, img_size=%s, metric_all=%.3f/%.3f-mean/best, past_thr=%.3f-mean: ' % (n, img_size, x.mean(), best.mean(), x[x > thr].mean()), end='') for i, x in enumerate(k): print('%i,%i' % (round(x[0]), round(x[1])), end=', ' if i < len(k) - 1 else '\n') # use in *.cfg return k if isinstance(path, str): # *.yaml file with open(path) as f: data_dict = yaml.load(f, Loader=yaml.FullLoader) # model dict from utils.datasets import LoadImagesAndLabels dataset = LoadImagesAndLabels(data_dict['train'], augment=True, rect=True) else: dataset = path # dataset # Get label wh shapes = img_size * dataset.shapes / dataset.shapes.max(1, keepdims=True) wh0 = np.concatenate([l[:, 3:5] * s for s, l in zip(shapes, dataset.labels)]) # wh # Filter i = (wh0 < 3.0).any(1).sum() if i: print('WARNING: Extremely small objects found. ' '%g of %g labels are < 3 pixels in width or height.' % (i, len(wh0))) wh = wh0[(wh0 >= 2.0).any(1)] # filter > 2 pixels # wh = wh * (np.random.rand(wh.shape[0], 1) * 0.9 + 0.1) # multiply by random scale 0-1 # Kmeans calculation print('Running kmeans for %g anchors on %g points...' % (n, len(wh))) s = wh.std(0) # sigmas for whitening k, dist = kmeans(wh / s, n, iter=30) # points, mean distance k *= s wh = torch.tensor(wh, dtype=torch.float32) # filtered wh0 = torch.tensor(wh0, dtype=torch.float32) # unfiltered k = print_results(k) # Evolve npr = np.random f, sh, mp, s = anchor_fitness(k), k.shape, 0.9, 0.1 # fitness, generations, mutation prob, sigma pbar = tqdm(range(gen), desc='Evolving anchors with Genetic Algorithm') # progress bar for _ in pbar: v = np.ones(sh) while (v == 1).all(): # mutate until a change occurs (prevent duplicates) v = ((npr.random(sh) < mp) * npr.random() * npr.randn(*sh) * s + 1).clip(0.3, 3.0) kg = (k.copy() * v).clip(min=2.0) fg = anchor_fitness(kg) if fg > f: f, k = fg, kg.copy() pbar.desc = 'Evolving anchors with Genetic Algorithm: fitness = %.4f' % f if verbose: print_results(k) return print_results(k)
def apply_kmeans(box_dict, k): # for every object class in the box_dict # reduce the list of boxes to the clustered boxes with kmeans # return the new dictionary kmeans_dict = dict() for obj_class in box_dict: print obj_class boxes = box_dict[obj_class] if len(boxes) > k: # write a representation for each proposal box as a vector def box_to_vec(pbox): # list of metrics which we want to reduce the Euclidean distance of: # includes centroid, and each of the individual coordinates of the box, # which are used to recover box coordinates after the k means in vector reprepresentation # are found. To weight the impact of the centroid measure, # we multiply by 1/area: the centroid matters less as box area increases. # we also include the coordinates, since distances between them are relevant as well. # Note that including the original coordinates in the vector allows us to recover the # original representation of the box. # we also include the score (scaled down) for the same reason. We scale it down since score-space # should not really affect the distance between boxes (having similar scores is not necessarily a good reason # to combine or not) metrics = [ pbox.centroid()[0], pbox.centroid()[1], pbox.centroid()[0] / pbox.area(), pbox.centroid()[1] / pbox.area(), pbox.x1, pbox.y1, pbox.x2, pbox.y2, 0.00001 * pbox.score ] return metrics # we will append the columns together and then take transpose # so that each row is a box with n features (here n = 9) first_col = box_to_vec(boxes[0]) # for rescaling oldx1, oldy1, oldx2, oldy2, oldscore = first_col[4], first_col[ 5], first_col[6], first_col[7], first_col[8] first_col = np.array(first_col) first_col = first_col.T box_mat = first_col for i in range(1, len(boxes)): new_col = np.array(box_to_vec(boxes[i])) new_col = new_col.T box_mat = np.c_[box_mat, new_col] box_mat = box_mat.T box_mat = box_mat.astype('float') # whiten box_mat = whiten(box_mat) # need to rescale the coords when we recover the boxes from the representation vectors newx1, newy1, newx2, newy2, newscore = 0, 0, 0, 0, 0 if len(np.shape(box_mat)) > 1: newx1, newy1, newx2, newy2, newscore = box_mat[0][4], box_mat[ 0][5], box_mat[0][6], box_mat[0][7], box_mat[0][8] else: newx1, newy1, newx2, newy2, newscore = box_mat[4], box_mat[ 5], box_mat[6], box_mat[7], box_mat[8] scalex1, scaley1, scalex2, scaley2, scalescore = oldx1 / ( 0. + newx1), oldy1 / (0. + newy1), oldx2 / ( 0. + newx2), oldy2 / (0. + newy2), oldscore / (0. + newscore) # use k-means codebook, distortion = kmeans(box_mat, k) centroid_boxes = [] for i in range(np.shape(codebook)[0]): # we chop off from 4 onwards because these are (pbox.x1, pbox.y1, pbox.x2, pbox.y2, pbox.score) # this is a direct inverse from box_to_vec # need to multiply these coords by standard deviations across all instances of feature. thebox = box(scalex1 * codebook[i][4], scaley1 * codebook[i][5], scalex2 * codebook[i][6], scaley2 * codebook[i][7], scalescore * codebook[i][8]) centroid_boxes.append(thebox) print "# of centroids: " + str(len(centroid_boxes)) print centroid_boxes[0] print centroid_boxes[1] print centroid_boxes[2] if obj_class not in kmeans_dict: kmeans_dict[obj_class] = [] kmeans_dict[obj_class] = centroid_boxes else: kmeans_dict[obj_class] = box_dict[obj_class] print "===================================" return kmeans_dict
def kmean_anchors(path='./data/coco128.yaml', n=9, img_size=640, thr=4.0, gen=1000, verbose=True): """ Creates kmeans-evolved anchors from training dataset Arguments: path: path to dataset *.yaml, or a loaded dataset n: number of anchors img_size: image size used for training thr: anchor-label wh ratio threshold hyperparameter hyp['anchor_t'] used for training, default=4.0 gen: generations to evolve anchors using genetic algorithm Return: k: kmeans evolved anchors Usage: from utils.utils import *; _ = kmean_anchors() """ thr = 1. / thr def metric(k): # compute metrics r = wh[:, None] / k[None] x = torch.min(r, 1. / r).min(2)[0] # ratio metric # x = wh_iou(wh, torch.tensor(k)) # iou metric return x, x.max(1)[0] # x, best_x def fitness(k): # mutation fitness _, best = metric(k) return (best * (best > thr).float()).mean() # fitness def print_results(k): k = k[np.argsort(k.prod(1))] # sort small to large x, best = metric(k) bpr, aat = (best > thr).float().mean(), (x > thr).float().mean() * n # best possible recall, anch > thr print('thr=%.2f: %.3f best possible recall, %.2f anchors past thr' % (thr, bpr, aat)) print('n=%g, img_size=%s, metric_all=%.3f/%.3f-mean/best, past_thr=%.3f-mean: ' % (n, img_size, x.mean(), best.mean(), x[x > thr].mean()), end='') for i, x in enumerate(k): print('%i,%i' % (round(x[0]), round(x[1])), end=', ' if i < len(k) - 1 else '\n') # use in *.cfg return k if isinstance(path, str): # *.yaml file with open(path) as f: data_dict = yaml.load(f, Loader=yaml.FullLoader) # model dict from utils.datasets import LoadImagesAndLabels dataset = LoadImagesAndLabels(data_dict['train'], augment=True, rect=True) else: dataset = path # dataset # Get label wh shapes = img_size * dataset.shapes / dataset.shapes.max(1, keepdims=True) wh = torch.tensor(np.concatenate([l[:, 3:5] * s for s, l in zip(shapes, dataset.labels)])).float() # wh wh = wh[(wh > 2.0).all(1)].numpy() # filter > 2 pixels # Kmeans calculation from scipy.cluster.vq import kmeans print('Running kmeans for %g anchors on %g points...' % (n, len(wh))) s = wh.std(0) # sigmas for whitening k, dist = kmeans(wh / s, n, iter=30) # points, mean distance k *= s wh = torch.tensor(wh) k = print_results(k) # Plot # k, d = [None] * 20, [None] * 20 # for i in tqdm(range(1, 21)): # k[i-1], d[i-1] = kmeans(wh / s, i) # points, mean distance # fig, ax = plt.subplots(1, 2, figsize=(14, 7)) # ax = ax.ravel() # ax[0].plot(np.arange(1, 21), np.array(d) ** 2, marker='.') # fig, ax = plt.subplots(1, 2, figsize=(14, 7)) # plot wh # ax[0].hist(wh[wh[:, 0]<100, 0],400) # ax[1].hist(wh[wh[:, 1]<100, 1],400) # fig.tight_layout() # fig.savefig('wh.png', dpi=200) # Evolve npr = np.random f, sh, mp, s = fitness(k), k.shape, 0.9, 0.1 # fitness, generations, mutation prob, sigma for _ in tqdm(range(gen), desc='Evolving anchors with Genetic Algorithm:'): v = np.ones(sh) while (v == 1).all(): # mutate until a change occurs (prevent duplicates) v = ((npr.random(sh) < mp) * npr.random() * npr.randn(*sh) * s + 1).clip(0.3, 3.0) kg = (k.copy() * v).clip(min=2.0) fg = fitness(kg) if fg > f: f, k = fg, kg.copy() if verbose: print_results(k) k = print_results(k) return k
def initialize(self, poses, rest_pose, num_bones, iterations, mayaMesh=None, jointList=None): bones = [] num_verts = rest_pose.shape[0] # shape mean array scale num_poses = poses.shape[0] bone_transforms = np.empty( (num_bones, num_poses, 4, 3)) # [(R, T) for for each pose] for each bone # 3rd dim has 3 rows for R and 1 row for T # Use k-means to assign bones to vertices whitened = whiten(rest_pose) codebook, _ = kmeans(whitened, num_bones) rest_pose_corrected = np.empty( (num_bones, num_verts, 3)) # Rest pose - mean of vertices attached to each bone # confirm mode if mayaMesh: #rigid Skin vert_assignments, bones = self.manual_codebook(mayaMesh, jointList) boneArray = [] for i in bones: boneArray.append(cmds.xform(i, q=1, t=1, ws=1)) self.rest_bones_t = np.array(boneArray) #rest_bones_t = np.empty((num_bones , 3)) for bone in range(num_bones): #rest_bones_t[bone] = np.mean(rest_pose[vert_assignments == bone] , axis = 0) self.rest_bones_t[bone] = np.array(boneArray[bone]) rest_pose_corrected[bone] = rest_pose - self.rest_bones_t[bone] for pose in range(num_poses): bone_transforms[bone, pose] = self.kabsch( rest_pose_corrected[bone, vert_assignments == bone], poses[pose, vert_assignments == bone]) else: # Compute initial random bone transformations vert_assignments, _ = vq( whitened, codebook) # Bone assignment for each vertex (|num_verts| x 1) self.rest_bones_t = np.empty( (num_bones, 3)) # Translations for bones at rest pose for bone in range(num_bones): self.rest_bones_t[bone] = np.mean( rest_pose[vert_assignments == bone], axis=0) rest_pose_corrected[bone] = rest_pose - self.rest_bones_t[bone] for pose in range(num_poses): bone_transforms[bone, pose] = self.kabsch( rest_pose_corrected[bone, vert_assignments == bone], poses[pose, vert_assignments == bone]) for it in range(iterations): # Re-assign bones to vertices using smallest reconstruction error from all poses constructed = np.empty( (num_bones, num_poses, num_verts, 3)) # |num_bones| x |num_poses| x |num_verts| x 3 for bone in range(num_bones): Rp = bone_transforms[bone, :, :3, :].dot( (rest_pose - self.rest_bones_t[bone]).T).transpose( (0, 2, 1)) # |num_poses| x |num_verts| x 3 # R * p + T constructed[bone] = Rp + bone_transforms[bone, :, np.newaxis, 3, :] errs = np.linalg.norm(constructed - poses, axis=(1, 3)) # position value average vert_assignments = np.argmin(errs, axis=0) # For each bone, for each pose, compute new transform using kabsch for bone in range(num_bones): self.rest_bones_t[bone] = np.mean( rest_pose[vert_assignments == bone], axis=0) rest_pose_corrected[bone] = rest_pose - self.rest_bones_t[bone] for pose in range(num_poses): P = rest_pose_corrected[bone, vert_assignments == bone] Q = poses[pose, vert_assignments == bone] if (P.size == 0 or Q.size == 0): print 'Skip Iteration' else: bone_transforms[bone, pose] = self.kabsch(P, Q) # jointList is correct Index Joint return bone_transforms, self.rest_bones_t, bones
'sqlite:///c:/RBSA/year1/RBSA_METER_DATA_1/RBSA_METER_DATA_1.sqlite') dict = {'meter_min_cluster': [], 'meter_max_cluster': []} for siteid in pd.read_sql_query("SELECT DISTINCT siteid FROM RBSA_METER_DATA", engine).values: siteid = int(siteid[0]) df = pd.read_sql_query( "SELECT siteid, time, IDT from RBSA_METER_DATA WHERE siteid='{}'". format(siteid), engine) df['siteid'] = df['siteid'].astype('int') df = df.set_index('siteid') df = df.dropna() df['month'] = df['time'].apply(lambda x: x[2:5]) df = df.loc[df['month'].isin(['DEC', 'JAN', 'FEB'])] if pd.isnull(df['IDT']).all(): continue codebook, _ = kmeans(np.array(df['IDT']), 2) # two clusters dict['meter_min_cluster'].append(min(codebook)) dict['meter_max_cluster'].append(max(codebook)) df_meter = pd.DataFrame.from_dict(dict) # audit engine = sqlalchemy.create_engine( 'sqlite:///c:/OpenStudio-ResStock/OpenStudio-ResStock/data/rbsa/rbsa.sqlite' ) df = pd.read_sql_query( "SELECT siteid, ResInt_HeatTemp, ResInt_HeatTempNight from SF_ri_heu", engine) resint_heattemp = df['ResInt_HeatTemp'] resint_heattemp = resint_heattemp[resint_heattemp > 0].dropna() resint_heattempnight = df['ResInt_HeatTempNight']
def kmean_anchors(path='data/DsiacPlusF2.txt', n=9, img_size=(416, 416)): # from utils.utils import *; _ = kmean_anchors() # Produces a list of target kmeans suitable for use in *.cfg files from utils.datasets import LoadImagesAndLabels thr = 0.20 # IoU threshold def print_results(thr, wh, k): k = k[np.argsort(k.prod(1))] # sort small to large iou = wh_iou(torch.Tensor(wh), torch.Tensor(k)) max_iou, min_iou = iou.max(1)[0], iou.min(1)[0] bpr, aat = (max_iou > thr).float().mean(), (iou > thr).float().mean() * n # best possible recall, anch > thr print('%.2f iou_thr: %.3f best possible recall, %.2f anchors > thr' % (thr, bpr, aat)) print('kmeans anchors (n=%g, img_size=%s, IoU=%.3f/%.3f/%.3f-min/mean/best): ' % (n, img_size, min_iou.mean(), iou.mean(), max_iou.mean()), end='') for i, x in enumerate(k): print('%i,%i' % (round(x[0]), round(x[1])), end=', ' if i < len(k) - 1 else '\n') # use in *.cfg return k def fitness(thr, wh, k): # mutation fitness iou = wh_iou(wh, torch.Tensor(k)).max(1)[0] # max iou bpr = (iou > thr).float().mean() # best possible recall return iou.mean() * bpr # product # Get label wh wh = [] dataset = LoadImagesAndLabels(path, augment=True, rect=True, cache_labels=True) nr = 1 if img_size[0] == img_size[1] else 10 # number augmentation repetitions for s, l in zip(dataset.shapes, dataset.labels): wh.append(l[:, 3:5] * (s / s.max())) # image normalized to letterbox normalized wh wh = np.concatenate(wh, 0).repeat(nr, axis=0) # augment 10x wh *= np.random.uniform(img_size[0], img_size[1], size=(wh.shape[0], 1)) # normalized to pixels (multi-scale) # Darknet yolov3.cfg anchors use_darknet = False if use_darknet: k = np.array([[10, 13], [16, 30], [33, 23], [30, 61], [62, 45], [59, 119], [116, 90], [156, 198], [373, 326]]) else: # Kmeans calculation from scipy.cluster.vq import kmeans print('Running kmeans for %g anchors on %g points...' % (n, len(wh))) s = wh.std(0) # sigmas for whitening k, dist = kmeans(wh / s, n, iter=30) # points, mean distance k *= s k = print_results(thr, wh, k) # # Plot # k, d = [None] * 20, [None] * 20 # for i in tqdm(range(1, 21)): # k[i-1], d[i-1] = kmeans(wh / s, i) # points, mean distance # fig, ax = plt.subplots(1, 2, figsize=(14, 7)) # ax = ax.ravel() # ax[0].plot(np.arange(1, 21), np.array(d) ** 2, marker='.') # Evolve npr = np.random wh = torch.Tensor(wh) f, sh, ng, mp, s = fitness(thr, wh, k), k.shape, 1000, 0.9, 0.1 # fitness, generations, mutation probability, sigma for _ in tqdm(range(ng), desc='Evolving anchors'): v = np.ones(sh) while (v == 1).all(): # mutate until a change occurs (prevent duplicates) v = ((npr.random(sh) < mp) * npr.random() * npr.randn(*sh) * s + 1).clip(0.3, 3.0) # 98.6, 61.6 kg = (k.copy() * v).clip(min=2.0) fg = fitness(thr, wh, kg) if fg > f: f, k = fg, kg.copy() print_results(thr, wh, k) k = print_results(thr, wh, k) return k
v_ids = [i for i in os.listdir(input_file_path_base + vehicle_type + '/' + fuel_type) if i[-4:] == '.csv']
for v_num, v_id in enumerate(v_ids):
    print "Creating Self-Organized Map for " + vehicle_type + " with " + fuel_type + " consumption (ID " + str(v_num) + " of " + str(len(v_ids)) + ")\r",
    # Opens ID frame sampling analysis (NOT NORMALIZED)
    input_file_name = input_file_path_base + vehicle_type + '/' + fuel_type + '/' + v_id
    df = pd.read_csv(input_file_name)
    data = df.fillna(0).as_matrix().astype(float)
    # Starts K-Means analysis
    best_distortion = None
    best_code_book = None
    best_distance = None
    best_k = None
    for k_mean in k_means:
        centroids, distortion = kmeans(data, k_mean)  # Uses kmeans to clusterize data from the actual map
        code_book, distance = vq(data, centroids)  # Gets the codebook of IDs
        # Saves results if distortion is more than "elbow_rate" percent smaller than the best distortion so far
        if best_distortion == None or abs(distortion - best_distortion) / best_distortion > elbow_rate:
            best_distortion = distortion
            best_code_book = code_book
            best_distance = distance
            best_k = k_mean
        # If distortion is not "elbow_rate" percent smaller than the best distortion so far, quits the analysis
        else:
            break
    df['CODE_BOOK'] = best_code_book  # Saves the codebook result on the dataframe
    df['DISTANCE'] = best_distance  # Saves the distances from the centroids on the dataframe
    f.write(vehicle_type + ' ' + fuel_type + ' ' + v_id[:-4] + ' k=' + str(best_k) + '\n')
# -*- coding: utf-8 -*-
"""
K-means clustering
@author: David André Rodríguez Méndez (AndreRdz7)
"""
# Import libraries
import numpy as np
from scipy.cluster.vq import vq, kmeans
# Create dataset
data = np.random.random(90).reshape(30, 3)
c1 = np.random.choice(range(len(data)))
c2 = np.random.choice(range(len(data)))
# Stack two randomly chosen points as the initial cluster centers
clust_centers = np.vstack([data[c1], data[c2]])
print(clust_centers)
print(vq(data, clust_centers))
# K-means
print(kmeans(data, clust_centers))
def test_kmeans_large_thres(self): # Regression test for gh-1774 x = np.array([1, 2, 3, 4, 10], dtype=float) res = kmeans(x, 1, thresh=1e16) assert_allclose(res[0], np.array([4.])) assert_allclose(res[1], 2.3999999999999999)
cat_path = join(datasetpath, cat) cat_files = get_imgfiles(cat_path) cat_features = extractSift(cat_files) all_files = all_files + cat_files all_features.update(cat_features) cat_label[cat] = label for i in cat_files: all_files_labels[i] = label print "---------------------" print "## computing the visual words via k-means" all_features_array = dict2numpy(all_features) nfeatures = all_features_array.shape[0] nclusters = int(sqrt(nfeatures)) codebook, distortion = vq.kmeans(all_features_array, nclusters, thresh=K_THRESH) with open(datasetpath + CODEBOOK_FILE, 'wb') as f: dump(codebook, f, protocol=HIGHEST_PROTOCOL) print "---------------------" print "## compute the visual words histograms for each image" all_word_histgrams = {} for imagefname in all_features: word_histgram = computeHistograms(codebook, all_features[imagefname]) all_word_histgrams[imagefname] = word_histgram print "---------------------" print "## write the histograms to file to pass it to the svm"
from numpy import vstack, array
from numpy.random import rand
#from scipy.cluster.vq import whiten
import scipy.cluster.vq as vec
# data generation with three features
data = vstack((rand(100, 3) + array([.5, .5, .5]), rand(100, 3)))
# whitening of data
data = vec.whiten(data)
# computing K-Means with K = 3 (3 clusters)
centroids, _ = vec.kmeans(data, 3)
# assign each sample to a cluster
clx, _ = vec.vq(data, centroids)
print(data)
print(centroids)
print(clx)
def find_starting_set(run=True, display=False): """ For of the 100 clustered demands points, Determine whether a base in the set of n bases can reach that demand point within r1. As soon as a demand point cannot be reached by a base within r1, throw that set of bases out There is no way to brute force every combination of 8 ambulances But instead use the demand points, find the surrounding bases, and check whether that fulfills the set coverage. Reorder the bases list, and then randomize it. """ if not run: return global calls, bases, demands, times, converted_calls, calls_kmeans delta = 0 # See below. clust_call_to_base = [] call_array = array(converted_calls) if not calls_kmeans: calls_kmeans = kmeans(call_array, top_n) # Get the first coordinate, then find the closest actual base. calls_clustered_list = calls_kmeans[0] if display: print("For each representative call point, find the bases. \n") for each_call in calls_clustered_list: if display: print("\n") # Search for points. If it empty, then redo it. delta = 0.01 actual = [] reorder_bases = copy.deepcopy(bases) reorder_bases.sort(key=itemgetter(0)) while not actual: actual = search.surrounding_points( each_call, delta, [], # Doesn't actually do anything reorder_bases) delta += 0.01 clust_call_to_base.append(tuple([list(each_call), actual])) if display: plot([each_call], "red") print("-----------------------------------------------------") print(each_call, " with distance of %.2f km " % (delta)) print("-----------------------------------------------------") for each in actual: print(each) plot([each[0]], "green") plt.show() if display: print("<< EOF >>") return clust_call_to_base
bpms = [] for line in fd: if comment.match(line): continue md = pat.search(line) if md: bpm = md.group(2) bpm = float(bpm) title = re.sub(pat, "", line) track = Track(title, bpm) tracks.append(track) bpms.append(bpm) obs = np.array(bpms).T cb, error = kmeans(obs, args.nClusters) codes, dist = vq(obs, cb) total_error = 0 for i, item in enumerate(tracks): bpm = cb[codes[i]] tracks[i].new_bpm = bpm error = dist[i] tracks[i].error = error total_error += error tracks.sort(key=lambda x: x.orig_bpm) curbpm = 0 for _, track in enumerate(tracks):
from collections import defaultdict
from scipy.cluster.vq import kmeans, vq
from similar_words import load_vectors
import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--basename', help='base name of word vector files', type=str)
    parser.add_argument('--maxwords', help='maximum number of words to cluster', type=int)
    parser.add_argument('--k', help='number of clusters', type=int)
    args = parser.parse_args()
    vectors, words = load_vectors(args.basename, args.maxwords)
    centroids, _ = kmeans(vectors, args.k)
    idx, _ = vq(vectors, centroids)
    clusters = defaultdict(set)
    for i, c in enumerate(idx):
        clusters[c].add(words[i])
    for c in range(args.k):
        print 'CLUSTER', c + 1,
        for word in clusters[c]:
            print word,
        print
        print
def colorCluster(img):
    img = imread(img)
    pixel = reshape(img, (img.shape[0] * img.shape[1], 3))
    centroids, _ = kmeans(pixel, 3)  # three dominant colors will be found
    return centroids
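# Hypothetical usage of colorCluster() above; 'photo.jpg' is a placeholder path, and
# the call assumes imread(), reshape() and kmeans() are imported in the module, as the
# function body implies. The result has one RGB row per dominant color.
dominant_colors = colorCluster('photo.jpg')
print(dominant_colors)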
def kmean_anchors(path='./data/coco64.txt', n=9, img_size=(640, 640), thr=0.20, gen=1000): # Creates kmeans anchors for use in *.cfg files: from utils.utils import *; _ = kmean_anchors() # n: number of anchors # img_size: (min, max) image size used for multi-scale training (can be same values) # thr: IoU threshold hyperparameter used for training (0.0 - 1.0) # gen: generations to evolve anchors using genetic algorithm from utils.datasets import LoadImagesAndLabels def print_results(k): k = k[np.argsort(k.prod(1))] # sort small to large iou = wh_iou(wh, torch.Tensor(k)) max_iou = iou.max(1)[0] bpr, aat = (max_iou > thr).float().mean(), ( iou > thr).float().mean() * n # best possible recall, anch > thr print('%.2f iou_thr: %.3f best possible recall, %.2f anchors > thr' % (thr, bpr, aat)) print( 'n=%g, img_size=%s, IoU_all=%.3f/%.3f-mean/best, IoU>thr=%.3f-mean: ' % (n, img_size, iou.mean(), max_iou.mean(), iou[iou > thr].mean()), end='') for i, x in enumerate(k): print('%i,%i' % (round(x[0]), round(x[1])), end=', ' if i < len(k) - 1 else '\n') # use in *.cfg return k def fitness(k): # mutation fitness iou = wh_iou(wh, torch.Tensor(k)) # iou max_iou = iou.max(1)[0] return (max_iou * (max_iou > thr).float()).mean() # product # Get label wh wh = [] dataset = LoadImagesAndLabels(path, augment=True, rect=True) nr = 1 if img_size[0] == img_size[ 1] else 10 # number augmentation repetitions for s, l in zip(dataset.shapes, dataset.labels): wh.append(l[:, 3:5] * (s / s.max())) # image normalized to letterbox normalized wh wh = np.concatenate(wh, 0).repeat(nr, axis=0) # augment 10x wh *= np.random.uniform(img_size[0], img_size[1], size=(wh.shape[0], 1)) # normalized to pixels (multi-scale) wh = wh[(wh > 2.0).all(1)] # remove below threshold boxes (< 2 pixels wh) # Kmeans calculation from scipy.cluster.vq import kmeans print('Running kmeans for %g anchors on %g points...' % (n, len(wh))) s = wh.std(0) # sigmas for whitening k, dist = kmeans(wh / s, n, iter=30) # points, mean distance k *= s wh = torch.Tensor(wh) k = print_results(k) # # Plot # k, d = [None] * 20, [None] * 20 # for i in tqdm(range(1, 21)): # k[i-1], d[i-1] = kmeans(wh / s, i) # points, mean distance # fig, ax = plt.subplots(1, 2, figsize=(14, 7)) # ax = ax.ravel() # ax[0].plot(np.arange(1, 21), np.array(d) ** 2, marker='.') # fig, ax = plt.subplots(1, 2, figsize=(14, 7)) # plot wh # ax[0].hist(wh[wh[:, 0]<100, 0],400) # ax[1].hist(wh[wh[:, 1]<100, 1],400) # fig.tight_layout() # fig.savefig('wh.png', dpi=200) # Evolve npr = np.random f, sh, mp, s = fitness( k), k.shape, 0.9, 0.1 # fitness, generations, mutation prob, sigma for _ in tqdm(range(gen), desc='Evolving anchors'): v = np.ones(sh) while (v == 1 ).all(): # mutate until a change occurs (prevent duplicates) v = ((npr.random(sh) < mp) * npr.random() * npr.randn(*sh) * s + 1).clip(0.3, 3.0) kg = (k.copy() * v).clip(min=2.0) fg = fitness(kg) if fg > f: f, k = fg, kg.copy() print_results(k) k = print_results(k) return k
def analyze_color(input_image, transparency_threshold=50, plot_3d=False, plot_bar=True, n_cluster=None, max_cluster=10, ignore_pure_black=True, use_sample=True, return_colors=True): # Copy to prevent modification (useful but mechanism needs clarification) input_image = input_image.copy() # Check input shape assert (len(input_image.shape) == 3) assert (input_image.shape[-1] in {3, 4}) # Turn color info of pixels into dataframe, filter by transparency if RGBA image is passed if input_image.shape[-1] == 4: color_df = pd.DataFrame(input_image.reshape(-1, 4), columns=list('rgba')) # Get the rgb info of pixels in the non-transparent part of the image color_df = color_df[color_df['a'] >= transparency_threshold] if input_image.shape[-1] == 3: color_df = pd.DataFrame(input_image.reshape(-1, 3), columns=list('rgb')) if ignore_pure_black: color_df = color_df[~((color_df['r'] == 0) & (color_df['g'] == 0) & (color_df['b'] == 0))] # Handle large pixel color_df if not use_sample and len(color_df) > 1e5: sample_or_not = (input( 'Large image detected, would you like to sample the pixels in this image? (Y/N) ' )).lower()[0] == 'y' if sample_or_not: print( 'Sampled 100,000 pixels from the image, note that you can also resize the image before passing it to this function.' ) color_df = color_df.sample(n=int(1e5), random_state=0) else: print( 'Not sampling performed, but note that rendering 3D plot for the pixels may crash your session and K-means clustering will be slow.' ) # Get std for reverse-transform the kmeans results to a meaningful rgb palette r_std, g_std, b_std = color_df[list('rgb')].std() reverse_whiten_array = np.array((r_std, g_std, b_std)) # Normalize observations on a per feature basis, forcing features to have unit variance # Doc: https://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.vq.whiten.html for color in list('rgb'): color_df['scaled_' + color] = whiten(color_df[color]) ## 3D scatter plot showing color groups if plot_3d: trace = go.Scatter3d( x=color_df['r'], y=color_df['g'], z=color_df['b'], mode='markers', marker=dict(color=[ 'rgb({},{},{})'.format(r, g, b) for r, g, b in zip(color_df['r'].values, color_df['g'].values, color_df['b'].values) ], size=1, opacity=1)) layout = go.Layout(margin=dict(l=0, r=0, b=0, t=0)) fig = go.Figure(data=[trace], layout=layout) fig.show() ## Use K-means to identify main colors cluster_centers_list = [] avg_distortion_list = [] if n_cluster != None: n_cluster_range = [n_cluster - 1] # note minus 1 to get exactly n else: n_cluster_range = range(max_cluster + 1) if plot_bar: # Initialize plt graph f, ax = plt.subplots(len(n_cluster_range), 1, figsize=(10, 10)) for n in n_cluster_range: ###### Train clusters ###### cluster_centers, avg_distortion = kmeans( color_df[['scaled_r', 'scaled_g', 'scaled_b']], n + 1) ###### Assign labels ###### labels, distortions = vq( color_df[['scaled_r', 'scaled_g', 'scaled_b']], cluster_centers) color_df['label'] = labels color_df['distortion'] = distortions ###### Build palette ###### # These parameter affects visual style only and can be exposed to user later height = 200 width = 1000 gap_size = 5 palette = np.zeros((height, width, 3), np.uint8) # Count how many pixels falls under which category, let this decides the color's relative width in the palette cluster_proportion = color_df['label'].value_counts().sort_index( ) / len(color_df) cluster_width_list = (cluster_proportion * width).to_list() cluster_width_list = [ int(x) for x in saferound(cluster_width_list, places=0) ] # Reorder clusters and widths 
according to the proportion, largest to smallest reordered_cluster_df = pd.DataFrame( zip(cluster_centers, cluster_width_list), columns=['cluster', 'width']).sort_values('width', ascending=False) cluster_centers = reordered_cluster_df['cluster'].tolist() cluster_width_list = reordered_cluster_df['width'].tolist() # Storing information cluster_centers_list.append(cluster_centers) avg_distortion_list.append(avg_distortion) if plot_bar: # Coloring the palette canvas based on color and width endpoints = list(np.cumsum(cluster_width_list)) startpoints = [0] + endpoints[:-1] for cluster_index in range(len(cluster_centers)): # Notice here we apply the reverse_whiten_array to get meaningful RGB colors palette[:, startpoints[cluster_index] + gap_size: endpoints[cluster_index], :] = cluster_centers[ cluster_index] * reverse_whiten_array palette[:, startpoints[cluster_index]:startpoints[cluster_index] + gap_size, :] = (255, 255, 255) # Displaying the palette when performing K-means with parameter n if n_cluster != None: ax.imshow(palette) ax.axis('off') else: ax[n].imshow(palette) ax[n].axis('off') if plot_bar: ### Show the entire palette f.tight_layout() plt.show() ### Show the elbow plot for choosing best n_cluster parameter for K-means fig = plt.figure() plt.scatter(x=n_cluster_range, y=avg_distortion_list) fig.suptitle('Elbow Plot for K-means') plt.xlabel('Number of Clusters') plt.ylabel('Average Distortion') print() if return_colors: if n_cluster != None: return (cluster_centers_list[0] * reverse_whiten_array).astype( np.uint8) else: return [(cluster_centers * reverse_whiten_array).astype(np.uint8) for cluster_centers in cluster_centers_list]
from termcolor import colored, cprint
import matplotlib.pyplot as plt
import numpy as np
from scipy.cluster import vq
# Creating data
c1 = np.random.randn(10, 2) + 5
c2 = np.random.randn(3, 2) - 5
c3 = np.random.randn(5, 2)
print(c1, colored('c2=', 'red'), c2, colored('c3=', 'blue'), c3)
# Pooling all the data into one 18 x 2 array
data = np.vstack([c1, c2, c3])
print(colored('data =', 'red'), '\n', data)
# Calculating the cluster centroids and variance from kmeans
centroids, variance = vq.kmeans(data, 3)
# The identified variable contains the information we need to separate
# the points into clusters based on the vq function.
identified, distance = vq.vq(data, centroids)
# Retrieving the coordinates of the points in each cluster identified by vq
vqc1 = data[identified == 0]
vqc2 = data[identified == 1]
vqc3 = data[identified == 2]
print('$', vqc1, '#', vqc2, '@', vqc3)
def cluster_followings_sentiments(username, stop_value=None, access_token=None):
    """Returns groups and their respective centroids for the followings of a user,
    clustered according to the sentiment reflected in their bios"""
    try:
        from textblob import TextBlob
        from scipy.cluster import vq
        import numpy

        non_empty = lambda x: x if x != ' ' else ''
        filter_crap = lambda x: {
            'username': x['username'],
            'bio': map(
                non_empty,
                re.sub(r'[^\x00-\x7f]', r' ',
                       re.sub('[^A-Za-z0-9]+', ' ', x['bio'])).encode('utf-8').strip(' \t\n\r').split())
        }
        non_zero = lambda x: len(x['bio']) > 3
        joiner = lambda x: {'username': x['username'], 'bio': ' '.join(x['bio'])}
        whiten = lambda obs: obs / numpy.std(obs)

        follows_data = numpy.array(
            map(joiner,
                filter(non_zero,
                       map(filter_crap, get_follows(username, 'user_and_bio')))))
        grouped = []
        t = [{
            'username': a['username'],
            'bio': a['bio'],
            'sentiment': [float(a) for a in TextBlob(a['bio']).sentiment]
        } for a in follows_data]

        centers, dist = vq.kmeans(
            numpy.array([[a['sentiment'][0], a['sentiment'][1]] for a in t]),
            whiten(numpy.array([[a['sentiment'][0], a['sentiment'][1]] for a in t])),
            100)
        code, distance = vq.vq(
            numpy.array([[a['sentiment'][0], a['sentiment'][1]] for a in t]),
            centers)

        for i in range(0, len(centers)):
            grouped.append({
                'centroid': {
                    'polarity': list(map(float, centers[i]))[0],
                    'subjectivity': list(map(float, centers[i]))[1]
                },
                'cluster': list(
                    numpy.array([{
                        'polarity': a['sentiment'][0],
                        'subjectivity': a['sentiment'][1],
                        'username': a['username']
                    } for a in t])[code == i])
            })
        centers = sorted([list([float(b) for b in a]) for a in centers])
        return grouped, centers
    except Exception, e:
        print str(e)
import random

import numpy as np
from scipy.cluster.vq import kmeans


def open_file(path):
    file = open(path)
    lines = []
    for line in file:
        line = line.strip()
        line = float(line)
        lines.append(line)
    file.close()
    return lines


k_lines = open_file('D:\\code\\UBC\\k_VOT.txt')
g_lines = open_file('D:\\code\\UBC\\g_VOT.txt')

data = []
data.extend(k_lines)
data.extend(g_lines)

clusters, _ = kmeans(np.array(data), 2)  # _ discards the distortion value
k_centroid = max(clusters)
g_centroid = min(clusters)

for j in range(11):
    value = random.uniform(10, 16)
    k_distance = abs(value - k_centroid)  # distance from each centroid
    g_distance = abs(value - g_centroid)
    if k_distance < g_distance:
        print('This sound is probably a k')
    else:
        print('This sound is probably a g')
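The manual distance comparison above can also be written with vq(), which assigns each observation to its nearest codebook entry; this sketch assumes the clusters array computed above and invents fresh random test values.

# Sketch: the same k/g decision via vq() instead of manual distances.
from scipy.cluster.vq import vq

samples = np.array([random.uniform(10, 16) for _ in range(11)])
labels, _ = vq(samples, clusters)   # index of the nearest of the two centroids
for lab in labels:
    if clusters[lab] == max(clusters):
        print('This sound is probably a k')
    else:
        print('This sound is probably a g')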
def test_kmeans_simple(self):
    np.random.seed(54321)
    initc = np.concatenate(([[X[0]], [X[1]], [X[2]]]))
    for tp in np.array, np.matrix:
        code1 = kmeans(tp(X), tp(initc), iter=1)[0]
        assert_array_almost_equal(code1, CODET2)
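The test above exercises the fact that kmeans() treats an array passed as its second argument as the initial codebook rather than as a cluster count; a small illustrative sketch with made-up data:

# Sketch: supplying an explicit initial codebook to kmeans (toy data).
import numpy as np
from scipy.cluster.vq import kmeans

obs = np.array([[1.0, 1.0], [1.2, 0.9], [5.0, 5.1], [5.2, 4.9]])
guess = np.array([[1.0, 1.0], [5.0, 5.0]])   # k x N array used as starting centroids
codebook, distortion = kmeans(obs, guess, iter=1)
print(codebook, distortion)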
# build ann index
#t = AnnoyIndex(dims)
for file_index, i in enumerate(infiles):
    file_vector = np.loadtxt(i)
    file_name = os.path.basename(i).split('.')[0]
    file_index_to_file_name[file_index] = file_name
    file_index_to_file_vector[file_index] = file_vector
    #whitened = whiten(file_vector)
    #t.add_item(file_index, file_vector)
#t.build(trees)

# `features` is assumed to be assembled elsewhere from the loaded file vectors
whitened = whiten(features)
codes = 3
result = kmeans(whitened, codes)

'''
# create a nearest neighbors json file for each input
if not os.path.exists('nearest_neighbors'):
    os.makedirs('nearest_neighbors')

for i in file_index_to_file_name.keys():
    master_file_name = file_index_to_file_name[i]
    master_vector = file_index_to_file_vector[i]

    named_nearest_neighbors = []
    nearest_neighbors = t.get_nns_by_item(i, n_nearest_neighbors)
    for j in nearest_neighbors:
        neighbor_file_name = file_index_to_file_name[j]
        neighbor_file_vector = file_index_to_file_vector[j]
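A possible continuation of the clustering step above: vq() can label each whitened vector with its nearest centroid so that every input file ends up with a cluster id (this reuses whitened, result, and file_index_to_file_name from the code above).

# Sketch: attach a cluster label to every file vector.
from scipy.cluster.vq import vq

codebook, distortion = result
labels, dists = vq(whitened, codebook)
for file_index, label in enumerate(labels):
    print(file_index_to_file_name[file_index], '-> cluster', int(label))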
def cluster_segments(self):
    if self.n_segs >= 3:
        K = range(1, 4)
        N = len(self.hets['seg_id'])
        self.segs.reset_index(inplace=True, drop=False)
        tin_data = np.nanargmax(self.TiN_likelihood_matrix, axis=1).astype(float)
        km = [kmeans(tin_data, k, iter=1000) for k in K]
        centroids = [cent for (cent, var) in km]
        squared_distance_to_centroids = [
            np.power(np.subtract(tin_data[:, np.newaxis], cent), 2) for cent in centroids
        ]
        self.sum_squared_distance = [
            sum(np.min(d, axis=1)) / N for d in squared_distance_to_centroids
        ]
        cluster_assignment = [
            np.argmin(d, axis=1) for d in squared_distance_to_centroids
        ]
        het_tin_map = np.argmax(self.p_TiN, axis=1)
        self.cl_distance_points = np.zeros([3, 3])
        for k, clust in enumerate(cluster_assignment):
            for idx, row in self.segs.iterrows():
                self.cl_distance_points[k, clust[idx]] += np.sum(
                    np.power(
                        het_tin_map[self.hets['seg_id'] == row['index']] -
                        centroids[k][clust[idx]], 2))
        self.cl_var = np.sqrt(
            np.true_divide(self.cl_distance_points, len(self.hets['seg_id'])))
        p = [1, 2, 3]
        delta_bic = [0, 10, 20]
        self.bic = (np.multiply(
            N,
            np.log(np.true_divide(np.sum(self.cl_distance_points, axis=1), N))) +
            np.multiply(p, np.log(N))) + delta_bic
        if len(centroids[2]) > 2:
            dist_btwn_c3 = np.mean(
                [abs(i - j) for i, j in combinations(centroids[2], 2)])
        else:
            dist_btwn_c3 = 0
        if len(centroids[1]) > 1:
            dist_btwn_c2 = np.abs(np.diff(centroids[1]))
        else:
            dist_btwn_c2 = 0
        if dist_btwn_c3 < np.nanmax(self.cl_var[2, :]) and dist_btwn_c2 > np.nanmax(self.cl_var[1, :]):
            solution_idx = np.nanargmin(self.bic[0:1])
            self.cluster_assignment = cluster_assignment[solution_idx]
            self.centroids = centroids[solution_idx]
        if dist_btwn_c3 < np.nanmax(self.cl_var[2, :]) and dist_btwn_c2 < np.nanmax(self.cl_var[1, :]):
            self.cluster_assignment = cluster_assignment[0]
            self.centroids = centroids[0]
        else:
            solution_idx = np.nanargmin(self.bic)
            self.cluster_assignment = cluster_assignment[solution_idx]
            self.centroids = centroids[solution_idx]
    else:
        self.cluster_assignment = 0
        self.centroids = [np.mean(self.segs['TiN_MAP'])]
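For readers outside this class, the model-selection idea above can be sketched on plain 1-D data: fit k = 1..3 with kmeans, compute each fit's within-cluster sum of squares, and score it with the same N*log(SS/N) + p*log(N) style criterion (the data below is invented).

# Hedged sketch of the BIC-style choice of k used above, on made-up 1-D data.
import numpy as np
from scipy.cluster.vq import kmeans

tin_data = np.concatenate([np.random.normal(0, 1, 50), np.random.normal(10, 1, 50)])
N = len(tin_data)
K = range(1, 4)
centroids = [kmeans(tin_data, k, iter=100)[0] for k in K]
ss = [np.sum(np.min(np.power(tin_data[:, np.newaxis] - cent, 2), axis=1)) for cent in centroids]
bic = N * np.log(np.true_divide(ss, N)) + np.multiply(list(K), np.log(N))
print('within-cluster SS per k:', ss)
print('best k by BIC:', list(K)[int(np.argmin(bic))])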
def find_markers(image, template=None):
    """Finds four corner markers.

    Use a combination of circle finding, corner detection and convolution to
    find the four markers in the image.

    Args:
        image (numpy.array): image array of uint8 values.
        template (numpy.array): template image of the markers.

    Returns:
        list: List of four (x, y) tuples in the order
            [top-left, bottom-left, top-right, bottom-right].
    """
    img_gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    img_gray = cv2.medianBlur(img_gray, 5)

    B = (1/8)*np.ones((8,8), dtype=np.float32)
    B[:4,4:] = -1*B[:4,4:]
    B[4:,:4] = -1*B[4:,:4]

    angles = np.arange(-90, 91, 7.5, dtype=int)
    for i in angles:
        B_ = ndimage.rotate(B, i, reshape=True)
        C = cv2.filter2D(img_gray[5:,5:][:-5,:-5], ddepth = -1, kernel = B_)
        n = 22
        j = 6
        u = 0.8
        v = 4
        lo = int(0.5*(n-j))
        hi = int(0.5*(n+j))
        spot = (-1)*np.ones((n,n), dtype=np.float32)
        spot[lo:hi,lo:hi] = -1*spot[lo:hi,lo:hi]
        spot[-v:,:] = u*spot[-v:,:]
        spot[:,-v:] = u*spot[:,-v:]
        spot[:v,:] = u*spot[:v,:]
        spot[:,:v] = u*spot[:,:v]
        blobs = cv2.filter2D(C, ddepth = -1, kernel = spot)
        centers = np.array(np.argwhere(blobs==255), dtype = "float32") + 5
        if centers.shape[0] > 15:
            break

    if centers.shape[0] > 3:
        markers = np.array(kmeans(centers, 4)[0], dtype = int)
        rank_y = markers[:,1].argsort()
        rank_x1 = markers[rank_y[:2]][:,0].argsort()
        rank_x2 = markers[rank_y[2:]][:,0].argsort()
        p1 = markers[rank_y[:2]][rank_x1[0]]
        p2 = markers[rank_y[:2]][rank_x1[1]]
        p3 = markers[rank_y[2:]][rank_x2[0]]
        p4 = markers[rank_y[2:]][rank_x2[1]]
        final_markers = [(p1[1], p1[0]), (p2[1], p2[0]), (p3[1], p3[0]), (p4[1], p4[0])]
    else:
        final_markers = [(0,0), (2,0), (0,2), (2,2)]
    return final_markers
def search(query, n=40, start=0):
    # retrieve top n results of query
    # default is 40 results per page
    dict_res = BossImageIndex().CallBoss(query, n, start)
    im_res = dict_res['ysearchresponse']['resultset_images']
    res = []
    for i in xrange(n):
        res.append((im_res[i]['thumbnail_url'], i))

    #path_name = "/Library/WebServer/results/"+query
    path_name = "/Users/novi/my_image_search/results/" + query
    # create the folder (if it does not exist) to save query results
    if os.path.isdir(path_name):
        shutil.rmtree(path_name)
        os.mkdir(path_name)
    else:
        os.mkdir(path_name)

    # download the image results
    image = urllib.URLopener()
    silentcounter = 1
    imagefile = []
    for counter in xrange(n):
        urltoberetrieved = res[counter][0]
        #print urltoberetrieved
        filename = '%s/%s.%s' % (path_name, silentcounter, 'jpg')
        #try:
        image.retrieve(urltoberetrieved, filename)
        imagefile.append(filename)
        silentcounter = silentcounter + 1
        #except IOError:
        #    print 'error at %s \n' % (urltoberetrieved)
        #    pass

    # prepare the color image feature
    pref = numpy.array([[0, 0]])  # [image #, position #]
    ldesc = []
    codes = 30  # number of k-means clusters
    ino = 5
    jno = 8  # default grid: 5 by 8 2D grid
    show = ino * jno
    lim = show
    silentcounter = 1
    for i_img in xrange(lim):
        fname = imagefile[i_img]
        try:
            im = cv.LoadImage(fname, 0)  # loading with OpenCV (gray channel only)
            silentcounter = silentcounter + 1
        except:
            print 'image thumbnail can not be retrieved'
            sys.exit(0)
        #resizing the image
        #om = cv.CreateImage((psize,psize),im.depth,im.nChannels)
        #cv.Resize(im,om,cv.CV_INTER_CUBIC)
        storage = cv.CreateMemStorage(0)
        #generating the mask
        #mat = cv.CreateMat(psize,psize,cv.CV_8UC1)
        #extracting SURF feature
        #[keypoints,descriptors] = cv.ExtractSURF(om,mat,storage,(1,500,3,4))
        [keypoints, descriptors] = cv.ExtractSURF(im, im, storage, (1, 500, 3, 4))
        ldesc.append(descriptors)

    #perform vector quantization
    tarrdesc = [numpy.array(ldesc[i]) for i in range(show)]
    lendesc = [ldesc[i].__len__() for i in range(show)]
    arrdesc = numpy.concatenate([tarrdesc[i] for i in range(show)])
    arrdesc = whiten(arrdesc)
    [codebook, distortion] = kmeans(arrdesc, codes)
    [code, dist] = vq(arrdesc, codebook)

    #generate the semantic feature
    imgdata = numpy.zeros((show, codebook.shape[0]), dtype=float)
    code_offset = 0
    for i_img in xrange(show):
        code_index = range(code_offset, code_offset + lendesc[i_img])
        for i_code in code_index:
            imgdata[i_img, code[i_code]] = imgdata[i_img, code[i_code]] + 1
        code_offset = code_offset + lendesc[i_img]

    #normalize the semantic feature
    sumimgdata = numpy.sum(imgdata, axis=1)
    sumimgdata.shape = show, 1
    imgdata = imgdata / sumimgdata

    griddata = numpy.zeros((2, ino * jno))
    griddata[0, ] = numpy.kron(range(1, ino + 1), numpy.ones((1, jno)))
    griddata[1, ] = numpy.tile(range(1, jno + 1), (1, ino))

    # do kernelized sorting procedure
    PI = KS(imgdata, griddata.T, pref)
    i_sorting = PI.argmax(axis=1)

    #creating the passed dictionary
    sorted_dict_res = {}
    sorted_dict_res['count'] = dict_res['ysearchresponse']['count']
    sorted_dict_res['totalhits'] = dict_res['ysearchresponse']['totalhits']
    sorted_dict_res['start'] = dict_res['ysearchresponse']['start']
    sorted_dict_res['resultset_images'] = [
        dict_res['ysearchresponse']['resultset_images'][i] for i in i_sorting
    ]
    return sorted_dict_res
def limb_track():
    global frame_n

    cv.namedWindow("Dots")
    fps = 30
    frame_dt = 0  # 1.0 / fps
    mv_i = 0
    pause = False
    while True:
        print("Frame:", mv_i)
        if frame_n >= contour_data.shape[0]:
            #mv_i = 0
            print("Frames completed:", frame_n)
            f_write.save(write_dict)
            break

        t = time.clock()
        ret, im = cap.read()
        for x, y in fs:
            cv.circle(im, (x, y), 2, (255, 0, 0), -1)
        n = n_contours[mv_i]
        if (n > 0):
            c_points = contour_data[mv_i, :n]
            limb_distances = np.empty((num_limbs, n))
            for i in range(num_limbs):
                limb_x, limb_y = fs[i]
                for j in range(n):
                    x, y = c_points[j]
                    dx = limb_x - x
                    dy = limb_y - y
                    distance = dx * dx + dy * dy
                    limb_distances[i, j] = distance
                limb_distances[i] = np.sort(limb_distances[i])
            threshold = 1500
            needed_limbs = np.where(limb_distances[:, 0] < threshold)[0]
            whitened = whiten(c_points)
            x_scale = c_points[0, 0] / whitened[0, 0]
            y_scale = c_points[0, 1] / whitened[0, 1]
            if (needed_limbs.shape[0] > 0):
                max_k = 6
                costs = np.empty(max_k - needed_limbs.shape[0])
                all_kmean_points = []
                for k in range(needed_limbs.shape[0], max_k):
                    points, distortion = kmeans(whitened, k)
                    points[:, 0] *= x_scale
                    points[:, 1] *= y_scale
                    points = points.astype('int32')
                    all_kmean_points.append(points)
                    costs[k - needed_limbs.shape[0]] = cost(points, needed_limbs)
                best_ind = np.argmin(costs)
                best_points = all_kmean_points[best_ind]
                for i, (x, y) in enumerate(best_points):
                    cv.circle(im, (x, y), 2, (0, 0, 255), -1)
                distances = np.empty((needed_limbs.shape[0], best_points.shape[0]))
                indices = np.empty((needed_limbs.shape[0], best_points.shape[0], 2), dtype='uint8')
                for i in range(needed_limbs.shape[0]):
                    limb_x, limb_y = fs[needed_limbs[i]]
                    for j in range(best_points.shape[0]):
                        x, y = best_points[j]
                        dx = x - limb_x
                        dy = y - limb_y
                        distance = dx * dx + dy * dy
                        distances[i, j] = distance
                        indices[i, j, 0] = needed_limbs[i]
                        indices[i, j, 1] = j
                for i in range(needed_limbs.shape[0]):
                    i, j = np.unravel_index(np.nanargmin(distances), distances.shape)
                    limb_ind = indices[i, j, 0]
                    point_ind = indices[i, j, 1]
                    new_limb_pos = (best_points[point_ind, 0], best_points[point_ind, 1])
                    cv.line(im, fs[limb_ind], new_limb_pos, (255, 255, 255), 1)
                    fs[limb_ind] = new_limb_pos
                    distances[i] = np.NaN
                    distances[:, j] = np.NaN

        for i in range(num_limbs):
            name = names[i]
            x, y = fs[i]
            write_dict[name][mv_i, 0] = x
            write_dict[name][mv_i, 1] = y

        cv.putText(im, str(frame_n), (5, 25), cv.FONT_HERSHEY_SIMPLEX, 1.0, (255, 255, 255))
        cv.imshow("Dots", im)

        if pause:
            k = cv.waitKey(0)
        else:
            dt = frame_dt - (time.clock() - t)
            dt_mili = int(dt * 1000)
            if (dt_mili < 1):
                dt_mili = 1
            k = cv.waitKey(dt_mili)
            mv_i += 1
            frame_n += 1

        if k == 27:  # esc key
            print("Frames completed:", frame_n)
            f_write.save(write_dict)
            break
        elif k == 32:  # space key
            pause = not (pause)
        elif k == 63235 and pause:  # right arrow
            mv_i += 1
            frame_n += 1
            print(stds[frame_n])
        elif k == 63234 and pause:  # left arrow
            mv_i -= 1
            frame_n -= 1
            print(stds[frame_n])
[... truncated example output: per-operator weight vectors annotated with squad
compositions such as "Twitch, ash, -therm, -thatch, bandit, blackbeard" and
"-twitch, ash, thermite, valk, capitao, -vigil, -nomad" ...]
"""

np.random.seed((1000, 2000))  # fix the random seed for reproducibility

K = range(1, 20)  # k's for k-means
KM = [kmeans(dataset, k) for k in K]
centroids = [cent for (cent, var) in KM]
D_k = [cdist(dataset, cent, 'euclidean') for cent in centroids]
cIdx = [np.argmin(D, axis=1) for D in D_k]
dist = [np.min(D, axis=1) for D in D_k]

tot_withinss = [sum(d**2) for d in dist]           # Total within-cluster sum of squares
totss = sum(pdist(dataset)**2) / dataset.shape[0]  # The total sum of squares
betweenss = totss - tot_withinss                   # The between-cluster sum of squares

##### plots #####
kIdx = 7  # K=8
mrk = 'os^p<dvh8>+x.'

# elbow curve
plt.plot(K, betweenss / totss * 100, 'b*-')
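Since kmeans() already returns the average distortion as its second value, the elbow curve above can also be produced without the explicit cdist/pdist bookkeeping; a compact sketch that reuses K, dataset, and plt from the code above:

# Sketch: elbow curve straight from the distortion that kmeans returns.
distortions = [kmeans(dataset, k)[1] for k in K]   # mean distance to the nearest centroid
plt.figure()
plt.plot(list(K), distortions, 'b*-')
plt.xlabel('Number of clusters k')
plt.ylabel('Average distortion')
plt.show()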
import matplotlib.pylab as plt
import scipy
import scipy.cluster.hierarchy as sch
from scipy.cluster.vq import vq, kmeans, whiten

# Generate the points to cluster: 20 points, 4 dimensions each:
points = scipy.randn(20, 4)

# 1. Hierarchical clustering
# Build the pairwise distance matrix between points, here using Euclidean distance:
disMat = sch.distance.pdist(points, 'euclidean')
# Run hierarchical clustering:
Z = sch.linkage(disMat, method='average')
# Draw the result as a dendrogram and save it as plot_dendrogram.png
P = sch.dendrogram(Z)
plt.savefig('plot_dendrogram.png')
# Extract flat clusters from the linkage matrix Z:
cluster = sch.fcluster(Z, t=1, criterion='inconsistent')
print "Original cluster by hierarchy clustering:\n", cluster

# 2. k-means clustering
# Whiten (normalize) the original data
data = whiten(points)
# Cluster with kmeans: the first argument is the data, the second the number of clusters k.
# When the final number of clusters is unknown, one option is to take it from the
# hierarchical clustering result above; a fixed value also works.
# kmeans returns a 2-tuple: the cluster centroids and the distortion (loss);
# we only need the centroids, hence the trailing [0].
centroid = kmeans(data, max(cluster))[0]
# vq assigns every observation to its nearest centroid; vq also returns a 2-tuple,
# and [0] holds the label of each observation.
label = vq(data, centroid)[0]
print "Final clustering by k-means:\n", label
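A short follow-up to the example above: once vq() has produced the label array, np.bincount gives the size of each k-means cluster.

# Sketch: count how many points fell into each cluster.
import numpy as np

counts = np.bincount(label, minlength=len(centroid))
for i in range(len(counts)):
    print('cluster %d: %d points' % (i, counts[i]))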