def meanShift(flat_image):
    # Estimate the bandwidth from the data, then run MeanShift with it
    bandwidth = estimate_bandwidth(flat_image, quantile=0.2, n_samples=500)
    ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
    ms.fit(flat_image)
    return ms.labels_, ms.cluster_centers_
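# Usage sketch for meanShift() above, not from the original source: segment an
# image by clustering its pixels. The `image` array here is synthetic stand-in
# data; a real caller would load an H x W x 3 image instead.
import numpy as np
from sklearn.cluster import MeanShift, estimate_bandwidth

image = np.random.RandomState(0).randint(0, 255, (40, 40, 3)).astype(float)
flat_image = image.reshape(-1, 3)                 # one row per pixel
labels, centers = meanShift(flat_image)
segmented = centers[labels].reshape(image.shape)  # repaint each pixel with its cluster colour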
def meanshift_for_hough_line(self):
    # Run mean shift separately on the pixel set of each Hough line
    pixels_of_label = {}
    points_of_label = {}
    for hough_line in self.points_of_hough_line:
        pixels = np.array(self.pixels_of_hough_line[hough_line])
        bandwidth = estimate_bandwidth(pixels, quantile=QUANTILE, n_samples=500)
        if bandwidth == 0:
            bandwidth = 2
        ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
        ms.fit(pixels)
        labels = ms.labels_
        n_clusters_ = len(np.unique(labels))
        for k in range(n_clusters_):
            label = list(hough_line)
            label.append(k)
            # materialise as a list so the pixels can be iterated more than once
            pixels_of_label[tuple(label)] = list(map(tuple, pixels[labels == k]))
    for label in pixels_of_label:
        pixels = pixels_of_label[label]
        points_of_label[label] = list(map(self.img.get_bgr_value, pixels))
    self.pixels_of_hough_line = pixels_of_label
    self.points_of_hough_line = points_of_label
def _fit_mean_shift(self, x):
    # NOTE: this fragment targets the legacy Python 2 / pre-0.18 scikit-learn
    # API (GMM, xrange); GaussianMixture replaces GMM in newer releases.
    for c in xrange(len(self.crange)):
        quant = 0.015 * (c + 1)
        for r in xrange(self.repeats):
            bandwidth = estimate_bandwidth(x, quantile=quant, random_state=r)
            idx = c * self.repeats + r
            model = MeanShift(bandwidth=bandwidth, bin_seeding=True)
            model.fit(x)
            self._labels[idx] = model.labels_
            self._parameters[idx] = model.cluster_centers_

            # build an equivalent GMM from the mean-shift solution
            k = model.cluster_centers_.shape[0]
            model_gmm = GMM(n_components=k, covariance_type=self.cvtype,
                            init_params='c', n_iter=0)
            model_gmm.means_ = model.cluster_centers_
            model_gmm.weights_ = sp.array(
                [(model.labels_ == i).sum() for i in xrange(k)])
            model_gmm.fit(x)

            # evaluate goodness of fit
            self._ll[idx] = model_gmm.score(x).sum()
            if self.gof_type == 'aic':
                self._gof[idx] = model_gmm.aic(x)
            if self.gof_type == 'bic':
                self._gof[idx] = model_gmm.bic(x)
            print quant, k, self._gof[idx]
def cluster_pixels_ms(self):
    """
    Cluster point descriptors by mean shift.
    :type self: ColorRemover
    """
    fg_pixels = list(self.img.fg_pixels.keys())  # list() so it can be indexed
    descriptors = []
    for r, c in fg_pixels:
        descriptors.append(self.descriptor_map[r][c])
    descriptors = np.array(descriptors)
    # reduce the descriptors to half their original dimensionality
    descriptors = PCA(n_components=int(VECTOR_DIMENSION) // 2).fit_transform(descriptors)
    bandwidth = estimate_bandwidth(descriptors, quantile=0.05)
    ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
    ms.fit(descriptors)
    labels = ms.labels_
    for i in range(len(labels)):
        xy = fg_pixels[i]
        label = labels[i]
        self.labels_map.itemset(xy, label)
    # save the indices and BGR values of each cluster as dictionaries keyed by label
    for label in range(K):
        self.pixels_of_hough_line_in_sphere[label] = list(
            map(tuple, np.argwhere(self.labels_map == label)))
        self.cluster_bgr[label] = list(
            map(tuple, self.img.bgr[self.labels_map == label]))
def cluster_data(data, clustering_method, num_clusters):
    cluster_centers = labels_unique = labels = extra = None
    if clustering_method == 'KMeans':
        # http://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html#sklearn.cluster.KMeans
        k_means = KMeans(n_clusters=num_clusters, init='k-means++', n_init=10,
                         max_iter=100, tol=0.0001, precompute_distances=True,
                         verbose=0, random_state=None, copy_x=True, n_jobs=1)
        k_means.fit(data)
        labels = k_means.labels_
        cluster_centers = k_means.cluster_centers_
    elif clustering_method == 'MeanShift':
        ms = MeanShift(bin_seeding=True, cluster_all=False)
        ms.fit(data)
        labels = ms.labels_
        cluster_centers = ms.cluster_centers_
    elif clustering_method == 'AffinityPropagation':
        af = AffinityPropagation().fit(data)
        cluster_centers = [data[i] for i in af.cluster_centers_indices_]
        labels = af.labels_
    elif clustering_method == "AgglomerativeClustering":
        n_neighbors = min(10, len(data) // 2)  # integer division for Python 3
        connectivity = kneighbors_graph(data, n_neighbors=n_neighbors)
        ward = AgglomerativeClustering(n_clusters=num_clusters,
                                       connectivity=connectivity,
                                       linkage='ward').fit(data)
        labels = ward.labels_
    elif clustering_method == "DBSCAN":
        db = DBSCAN().fit(data)
        core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
        core_samples_mask[db.core_sample_indices_] = True
        extra = core_samples_mask
        labels = db.labels_
    if labels is not None:
        labels_unique = np.unique(labels)
    return labels, cluster_centers, labels_unique, extra
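# A hedged usage sketch of cluster_data() above on synthetic blobs; the method
# strings match the dispatch branches defined in the function. Note that the
# KMeans branch uses older scikit-learn keyword arguments.
import numpy as np
from sklearn.datasets import make_blobs

demo_data, _ = make_blobs(n_samples=300, centers=4, cluster_std=0.6, random_state=0)
for method in ('KMeans', 'MeanShift', 'AffinityPropagation', 'DBSCAN'):
    labels, centers, labels_unique, extra = cluster_data(demo_data, method, num_clusters=4)
    print(method, '->', len(labels_unique), 'unique labels')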
def mean_shift_cluster_analysis(x, y, quantile=0.2, n_samples=1000):
    # ADAPTED FROM:
    # http://scikit-learn.org/stable/auto_examples/cluster/plot_mean_shift.html#example-cluster-plot-mean-shift-py
    # The bandwidth can be estimated automatically from the data
    X = np.hstack((x.reshape((x.shape[0], 1)), y.reshape((y.shape[0], 1))))
    bandwidth = estimate_bandwidth(X, quantile=quantile, n_samples=n_samples)
    ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
    ms.fit(X)
    labels = ms.labels_
    cluster_centers = ms.cluster_centers_
    labels_unique = np.unique(labels)
    n_clusters_ = len(labels_unique)
    #print("number of estimated clusters : %d" % n_clusters_)
    colors = 'bgrcmyk' * 7  # repeated colour cycle
    for i in range(n_clusters_):
        my_members = labels == i
        cluster_center = cluster_centers[i]
        plt.scatter(X[my_members, 0], X[my_members, 1], s=90, c=colors[i], alpha=0.7)
        plt.scatter(cluster_center[0], cluster_center[1], marker='+', s=280, c=colors[i])
    tolx = (X[:, 0].max() - X[:, 0].min()) * 0.03
    toly = (X[:, 1].max() - X[:, 1].min()) * 0.03
    plt.xlim(X[:, 0].min() - tolx, X[:, 0].max() + tolx)
    plt.ylim(X[:, 1].min() - toly, X[:, 1].max() + toly)
    plt.show()
    return labels
def Mean_Shift(path):
    # import the data
    data = pandas.read_csv(filepath_or_buffer=path, delimiter=',', encoding='utf-8')
    data = data.drop_duplicates()  # drop_duplicates() returns a copy; keep it
    print(data)
    # read the coordinate columns
    values = data[['latitude', 'longitude']].values
    print("printing values")
    print(values)
    # mean shift
    print("Clustering data with the MeanShift algorithm")
    bandwidth = estimate_bandwidth(values, quantile=0.003, n_samples=None)
    ms = MeanShift(bandwidth=bandwidth, bin_seeding=True, min_bin_freq=25,
                   cluster_all=False)
    ms.fit(values)
    data['cluster'] = ms.labels_
    data = data.sort_values(by='cluster')  # DataFrame.sort() is long deprecated
    data = data[data['cluster'] != -1]  # drop orphan points (cluster_all=False)
    print(data['cluster'])
    data['cluster'] = data['cluster'].apply(lambda x: "cluster" + str(x))
    labels_unique = np.unique(ms.labels_).tolist()
    del labels_unique[0]  # remove the -1 "noise" label
    # Filtering cluster centers according to the data filter
    cluster_centers = DataFrame(ms.cluster_centers_, columns=['latitude', 'longitude'])
    cluster_centers['cluster'] = labels_unique
    print(cluster_centers)
    n_centers_ = len(cluster_centers)
    print("number of clusters is :%d" % n_centers_)
    data.to_csv(path_or_buf="output/points.csv",
                columns=['user', 'latitude', 'longitude', 'cluster', 'picture', 'datetaken'],
                encoding='utf-8')
    cluster_centers['cluster'] = cluster_centers['cluster'].apply(lambda x: "cluster" + str(x))
    cluster_centers.to_csv(path_or_buf="output/centers.csv",
                           columns=['latitude', 'longitude', 'cluster'],
                           encoding='utf-8')
    plot_meanshift(data, cluster_centers, n_centers_)
    return 0
def meanShift(points):
    # perform mean-shift clustering of the data (one sample per column)
    meanshift = MeanShift()
    meanshift.fit(points.T)
    labels = meanshift.labels_
    centers = meanshift.cluster_centers_
    return np.array(labels)
def simplify_data1(x):
    # pair each value with a zero so the 1-D data fits the 2-D API
    X = np.array(list(zip(x, np.zeros(len(x)))), dtype=float)
    bandwidth = estimate_bandwidth(X, quantile=0.2)
    ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
    ms.fit(X)
    labels = ms.labels_
    cluster_centers = ms.cluster_centers_
    labels_unique = np.unique(labels)
    n_clusters_ = len(labels_unique)
    start = 0
    value = 0
    print(x)
    for k in range(n_clusters_):
        my_members = labels == k
        print("cluster {0}: {1}".format(k, X[my_members, 0]), np.average(X[my_members, 0]))
        value = np.average(X[my_members, 0])
        val2 = 0
        # overwrite each member with its cluster average
        for i in range(start, start + len(X[my_members, 0])):
            val2 += X[i][0]
            print(val2, X[i][0], i)
            X[i][0] = value
        print("FINAL", val2 / len(X[my_members, 0]))
        start += len(X[my_members, 0])
    return X[:, 0]
def centers_y_clusters(self, graph_db, nodes, consulta, cyprop):
    group = []
    todo = []
    rr = []
    for n in nodes:
        tiene = neo4j.CypherQuery(
            graph_db,
            consulta + " where id(n) =" + str(n.id) +
            " return count(distinct(e))" + cyprop + " as cuenta").execute()
        for r in tiene:
            todo.append([r.cuenta])
            rr.append(r.cuenta)
    ms = MeanShift(bin_seeding=True)
    ms.fit(np.asarray(todo))
    labels = ms.labels_
    cluster_centers = sorted(ms.cluster_centers_, key=lambda x: x[0])
    for idx, cl in enumerate(cluster_centers):
        cluster_centers[idx] = float(cl[0])
    for u in cluster_centers:
        group.append([])
    for n in nodes:
        tiene = neo4j.CypherQuery(
            graph_db,
            consulta + " where id(n) =" + str(n.id) +
            " return count(distinct(e))" + cyprop + " as cuenta").execute()
        for r in tiene:
            valor = r.cuenta
            # assign the node to the cluster whose midpoint interval contains its value
            for idx, v in enumerate(cluster_centers):
                if idx == 0:
                    temp1 = -9999
                else:
                    temp1 = (cluster_centers[idx - 1] + cluster_centers[idx]) / 2
                if idx == len(cluster_centers) - 1:
                    temp2 = 99999
                else:
                    temp2 = (cluster_centers[idx + 1] + cluster_centers[idx]) / 2
                if temp1 <= valor < temp2:
                    group[idx].append(n)
    return cluster_centers, group
def make(filename, precision):
    with open('test.geojson') as f:
        data = json.load(f)
    features = data['features']
    points = [geo['geometry']["coordinates"] for geo in features if pred(geo)]
    print(points)
    # each feature contributes two coordinate pairs
    ar_points = array(points).reshape(len(points) * 2, 2)
    print(ar_points)
    bandwidth = estimate_bandwidth(ar_points) / precision
    cluster = MeanShift(bandwidth=bandwidth)
    cluster.fit(ar_points)
    labels = cluster.labels_
    cluster_centers = cluster.cluster_centers_
    print('clusters:', len(unique(labels)))
    for i, geo in enumerate(filter(pred, features)):
        geo['geometry']["coordinates"] = [
            list(cluster_centers[labels[i * 2 + j]]) for j in range(2)
        ]
    with open(filename, 'w') as f:
        json.dump(data, f)
def meanShift(mtx, **kw):
    """
    meanShift(mtx, **kw)

    uses scikit-learn's meanshift clustering implementation to cluster
    infoDistance matrices. Call with the distance matrix as the first
    parameter.

    Available keyword arguments:

    startingbandwidth: the lowest bandwidth to begin the estimation with
        (defaults to 0.1)
    bandwidthincrement: the amount by which to increment bandwidth in
        between rounds of meanshift (defaults to 0.01)
    """
    H = kw.get('startingbandwidth', 0.1)
    dH = kw.get('bandwidthincrement', 0.01)
    nnonunary = []
    nclusters = np.inf  # ensures at least one pass through the loop
    while nclusters > 1:
        ms = MeanShift(bandwidth=H)
        ms.fit(mtx)
        clusters = ms.labels_
        nclusters = len(np.unique(clusters))
        # number of clusters with more than one member
        nonunary = np.shape(np.where(np.bincount(clusters) > 1))[1]
        nnonunary.append(nonunary)
        # grow the bandwidth every round so the loop terminates even when
        # every cluster is a singleton
        H = H + dH
    # assumption: hand back the labels and centers from the final fit
    return ms.labels_, ms.cluster_centers_
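# Hedged example of calling the bandwidth-sweep meanShift() above on toy
# two-cluster data; any (n_samples, n_features) array works, and the sklearn
# import used by the function is assumed to be in scope.
import numpy as np

rng = np.random.RandomState(0)
pts = np.vstack([rng.randn(20, 2), rng.randn(20, 2) + 4.0])
labels, centers = meanShift(pts, startingbandwidth=0.5, bandwidthincrement=0.1)
print(len(np.unique(labels)), "cluster(s) at the final bandwidth")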
def do_meanshift(s_path, band1, band2, band3, band4, colour1, colour2, make_plot):
    '''Meanshift clustering to determine the number of clusters in the
    data, which is passed to the KMEANS function.'''
    # Truncate data
    X = np.vstack([colour1, colour2]).T
    # Compute clustering with MeanShift.
    # Scale data because meanshift generates circular clusters.
    X_scaled = preprocessing.scale(X)
    # The bandwidth can be detected automatically with
    # estimate_bandwidth(X); it can also be set manually.
    bandwidth = estimate_bandwidth(X)
    #bandwidth = 0.65
    # Meanshift clustering
    ms = MeanShift(bandwidth=bandwidth, bin_seeding=True, cluster_all=False)
    ms.fit(X_scaled)
    labels_unique = np.unique(ms.labels_)
    objects = ms.labels_[ms.labels_ >= 0]
    n_clusters = len(labels_unique[labels_unique >= 0])
    # Make plot
    if "meanshift" in make_plot:
        make_ms_plots(s_path, colour1, colour2, n_clusters, X, ms,
                      band1, band2, band3, band4, objects)
    return (n_clusters, bandwidth)
def meanshift(raw_data, t):
    # Compute clustering with MeanShift; build the 2-D feature matrix first
    X = raw_data[:, 1] + raw_data[:, 5]
    Y = raw_data[:, 2] + raw_data[:, 6]
    data = np.column_stack((X, Y))  # plain ndarray instead of np.mat
    bandwidth = estimate_bandwidth(data, quantile=0.2, n_samples=500)
    ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
    ms.fit(data)
    labels = ms.labels_
    cluster_centers = ms.cluster_centers_
    labels_unique = np.unique(labels)
    n_clusters_ = len(labels_unique)
    print("number of estimated clusters : %d" % n_clusters_)
    # Plot result
    plt.figure(t)
    plt.clf()
    colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
    for k, col in zip(range(n_clusters_), colors):
        my_members = labels == k
        cluster_center = cluster_centers[k]
        plt.plot(data[my_members, 0], data[my_members, 1], col + '.')
        plt.plot(cluster_center[0], cluster_center[1], 'o', markerfacecolor=col,
                 markeredgecolor='k', markersize=14)
    plt.title('Estimated number of clusters: %d' % n_clusters_)
    plt.axis('equal')
    plt.show()
def ms_algo(X, bandwidth=None):
    if bandwidth is None:
        n_samples = X.shape[0]
        bandwidth = estimate_bandwidth(X, quantile=0.2, n_samples=n_samples)
    # Apply the meanshift algorithm from the sklearn library
    ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
    ms.fit(X)
    # Collect the labels and cluster centers from the fitted model
    labels = ms.labels_
    cluster_centers = ms.cluster_centers_
    labels_unique = np.unique(labels)
    n_clusters_ = len(labels_unique)  # number of clusters
    # Print section
    print("The number of clusters is: %d" % n_clusters_)
    print("The centers are:")
    for i in range(n_clusters_):
        print(i, cluster_centers[i])
    return cluster_centers
def train(trainingData, pklFile, clusteringAll, numberOfClusters=None):
    # ========================================================================= #
    # =============== STEP 1. DEFINE OUTPUT LEARNT MODEL FILE ================= #
    # ========================================================================= #
    if pklFile == '':
        os.system('rm -rf learntModel && mkdir learntModel')  # '&&' so mkdir waits for rm
        pklFile = 'learntModel/learntModel.pkl'
    # ========================================================================= #
    # =============== STEP 2. PERFORM CLUSTERING TO THE DATA ================== #
    # ========================================================================= #
    if numberOfClusters is None:
        print("Running MeanShift Model...")
        bandwidth = estimate_bandwidth(trainingData)
        ms = MeanShift(bandwidth=bandwidth, bin_seeding=False, cluster_all=clusteringAll)
        ms.fit(trainingData)
        joblib.dump(ms, pklFile)
        return {"numberOfClusters": len(ms.cluster_centers_),
                "labels": ms.labels_,
                "clusterCenters": ms.cluster_centers_}
    else:
        print("Running K-Means Model...")
        kMeans = KMeans(init='k-means++', n_clusters=numberOfClusters)
        kMeans.fit(trainingData)
        joblib.dump(kMeans, pklFile)
        return {"numberOfClusters": len(kMeans.cluster_centers_),
                "labels": kMeans.labels_,
                "clusterCenters": kMeans.cluster_centers_}
def mean_shift(X):
    bandwidth = estimate_bandwidth(X, quantile=0.2, n_samples=1000)
    ms = MeanShift(bandwidth=bandwidth, bin_seeding=True, cluster_all=False)
    ms.fit(X)
    labels = ms.labels_
    cluster_centers = ms.cluster_centers_
    return labels, cluster_centers
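# Minimal usage sketch for mean_shift() above (synthetic data, made-up names):
import numpy as np
from sklearn.datasets import make_blobs

X_demo, _ = make_blobs(n_samples=500, centers=3, cluster_std=0.7, random_state=42)
labels, centers = mean_shift(X_demo)
# cluster_all=False marks low-density points with the label -1
print(len(np.unique(labels[labels >= 0])), 'clusters found')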
def BA_meanshift_cluster(mark, chrom):
    '''
    Perform mean-shift clustering on the 2-D data
    ((chromStart + chromEnd) * 0.5, chromEnd - chromStart).
    '''
    path = os.path.join(get_data_dir(), "tmp", mark, "{0}-{1}.csv".format(chrom, mark))
    DF = pd.read_csv(path, sep='\t')
    S_x = 0.5 * (DF.loc[:, 'chromEnd'].values + DF.loc[:, 'chromStart'].values)
    S_y = DF.loc[:, 'chromEnd'].values - DF.loc[:, 'chromStart'].values
    X = np.hstack((np.atleast_2d(S_x[7000:8000]).T, np.atleast_2d(S_y[7000:8000]).T))
    print(X)
    bandwidth = estimate_bandwidth(X, quantile=0.1, n_samples=1000)
    ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
    ms.fit(X)
    labels = ms.labels_
    print(list(set(labels)))
    import matplotlib.pyplot as plt
    from itertools import cycle
    colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
    for k, col in zip(range(len(set(labels))), colors):
        my_members = labels == k
        plt.plot(X[my_members, 0], X[my_members, 1], col + '.')
    plt.title('Estimated number of clusters: %d' % len(set(labels)))
    plt.show()
def get_clusters(self, in_file, cc_file, clf_file, arrivals_file, chunk_size=1710671):
    df = pd.read_csv(open(in_file), chunksize=chunk_size)
    dests = []
    part = 1
    lines = 1710671 // chunk_size  # integer division
    try:
        dest = pickle.load(open(arrivals_file, "rb"))
    except IOError:
        for d in df:
            print("%d / %d" % (part, lines))
            part += 1
            for row in d.values:
                tmp = eval(row[-1])
                if len(tmp) > 0:
                    dests.append(tmp[-1])
        dest = np.array(dests)
        pickle.dump(dest, open(arrivals_file, "wb"), protocol=pickle.HIGHEST_PROTOCOL)
    print("Destination points loaded")
    try:
        ms = pickle.load(open(clf_file, "rb"))
    except IOError:
        bw = 0.001
        ms = MeanShift(bandwidth=bw, bin_seeding=True, min_bin_freq=5, n_jobs=-2)
        ms.fit(dest)
        pickle.dump(ms, open(clf_file, "wb"), protocol=pickle.HIGHEST_PROTOCOL)
    print("Mean shift loaded")
    cluster_centers = ms.cluster_centers_
    pickle.dump(cluster_centers, open(cc_file, "wb"), protocol=pickle.HIGHEST_PROTOCOL)
    print("Clusters dumped")
def hart85_means_shift_cluster(pair_buffer_df, features):
    from sklearn.cluster import MeanShift, estimate_bandwidth
    # Creating feature vector
    cluster_df = pd.DataFrame()
    if 'active' in features:
        cluster_df['active'] = pd.Series(
            pair_buffer_df.apply(
                lambda row: (np.fabs(row['T1 Active']) + np.fabs(row['T2 Active'])) / 2,
                axis=1),
            index=pair_buffer_df.index)
    if 'reactive' in features:
        cluster_df['reactive'] = pd.Series(
            pair_buffer_df.apply(
                lambda row: (np.fabs(row['T1 Reactive']) + np.fabs(row['T2 Reactive'])) / 2,
                axis=1),
            index=pair_buffer_df.index)
    if 'delta' in features:
        cluster_df['delta'] = pd.Series(
            pair_buffer_df.apply(lambda row: row['T2 Time'] - row['T1 Time'], axis=1),
            index=pair_buffer_df.index)
        # nanoseconds -> minutes
        cluster_df['delta'] = cluster_df['delta'].apply(lambda x: int(x) / 6e10)
    if 'hour_of_use' in features:
        cluster_df['hour_of_use'] = pd.DatetimeIndex(pair_buffer_df['T1 Time']).hour
    if 'sd_event' in features:
        # NOTE: `df` must be in scope in the original module
        cluster_df['sd_event'] = pd.Series(
            pair_buffer_df.apply(
                lambda row: (df.power[row['T1 Time']:row['T2 Time']]).std(), axis=1),
            index=pair_buffer_df.index)
    X = cluster_df.values.reshape((len(cluster_df.index), len(features)))
    ms = MeanShift(bin_seeding=True)
    ms.fit(X)
    labels = ms.labels_
    cluster_centers = ms.cluster_centers_
    labels_unique = np.unique(labels)
    n_clusters_ = len(labels_unique)
    return pd.DataFrame(cluster_centers, columns=features)
def find_clusters(feature, items, bandwidth=None, min_bin_freq=None,
                  cluster_all=True, n_jobs=1):
    """
    Cluster a list of items based on a feature using the meanshift
    algorithm (binning).

    :param feature: key used to retrieve the item value to cluster on
    :param items:
    :param bandwidth:
    :param min_bin_freq:
    :param cluster_all:
    :return:
    """
    x = [item[feature] for item in items]
    X = np.array(list(zip(x, np.zeros(len(x)))), dtype=float)
    ms = MeanShift(bandwidth=bandwidth, min_bin_freq=min_bin_freq,
                   cluster_all=cluster_all, n_jobs=n_jobs)
    ms.fit(X)
    labels = ms.labels_
    labels_unique = np.unique(labels)
    n_clusters_ = len(labels_unique)
    clusters = []
    for k in range(n_clusters_):
        if k != -1:
            my_members = labels == k
            cluster_center = np.median(X[my_members, 0])
            cluster_sd = np.std(X[my_members, 0])
            clusters.append({
                'center': cluster_center,
                'sd': cluster_sd,
                'items': X[my_members, 0],
            })
    return clusters
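# Hedged example of find_clusters() above, binning a list of dicts on a single
# numeric key (values invented for illustration; assumes the numpy/sklearn
# imports used by the function are in scope):
items = [{'size': v} for v in (10, 11, 10.5, 50, 51, 49.5, 100)]
for c in find_clusters('size', items, bandwidth=5):
    print('center %.1f, sd %.2f, %d member(s)' % (c['center'], c['sd'], len(c['items'])))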
def mean(X, save_fig=False, params_labels=None, prefix='clusters'):
    '''Compute clustering with MeanShift.'''
    logger.debug('Calculating MeanShift clusters using %d parameters' % len(X[0]))
    X = np.array(X)
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        bandwidth = estimate_bandwidth(X, quantile=0.2)
        ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
        ms.fit(X)
    labels = ms.labels_
    if save_fig:
        plotClusters(X, ms, method='mean', prefix=prefix, params=params_labels)
    labels_unique = np.unique(labels)
    n_clusters_ = len(labels_unique)
    logger.debug('Found %d clusters with MeanShift algorithm' % n_clusters_)
    return labels
def weekhour(lst, day, hour, num):
    # collect coordinates and keep only points inside the bounding box
    l = np.array([d["latlong"] for d in lst])
    l = np.array([x for x in l if 39 < x[0] < 40 and -105.5 < x[1] < -102.0])
    bandwidth = .001
    ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
    ms.fit(l)
    labels = ms.labels_
    cluster_centers = ms.cluster_centers_
    labels_unique = np.unique(labels)
    n_clusters_ = len(labels_unique)
    colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
    for k, col in zip(range(n_clusters_), colors):
        my_members = labels == k
        cluster_center = cluster_centers[k]
        plt.plot(l[my_members, 1], l[my_members, 0], col + '.')
        plt.plot(cluster_center[1], cluster_center[0], 'x', markerfacecolor=col,
                 markeredgecolor='k', markersize=14)
    num_samples = len(labels)
    list_clust_cents = cluster_centers.tolist()
    num_labels = Counter(labels).most_common()
    top = tuple(num_labels)
    if num > n_clusters_:
        num = n_clusters_
    for i in range(num):
        densest = top[i][1]
        percent = round((float(densest) / float(num_samples)) * 100, 3)
        if densest >= 60:
            import geocoder
            # look up the centre of the i-th densest cluster (top[i][0] is its label)
            g = geocoder.google(list_clust_cents[top[i][0]], method='reverse')
            address = g.address
        else:
            address = 0
        with open('weekdayclusterstest.csv', 'a') as csvfile:
            fieldnames = ['day', 'hour', 'densest cluster', 'address', 'percent',
                          'number of samples', 'number of estimated clusters']
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerow({'densest cluster': densest,
                             'day': day,
                             'hour': hour,
                             'address': address,
                             'percent': percent,
                             'number of samples': num_samples,
                             'number of estimated clusters': n_clusters_})
def meanshiftUsingPCA(path):
    # NOTE: uses the legacy OpenCV `cv` API and Python 2 constructs.
    # Load original image given the image path
    im = cv.LoadImageM(path)
    # convert image to YUV color space
    cv.CvtColor(im, im, cv.CV_BGR2YCrCb)
    # Load bank of filters
    filterBank = lmfilters.loadLMFilters()
    # Resize image to decrease dimensions during clustering
    resize_factor = 1
    thumbnail = cv.CreateMat(im.height / resize_factor, im.width / resize_factor, cv.CV_8UC3)
    cv.Resize(im, thumbnail)
    # now work with the resized thumbnail image
    response = np.zeros(shape=(thumbnail.height * thumbnail.width, 51), dtype=float)
    for f in xrange(0, 48):
        filter = filterBank[f]
        # Resize the filter with the same factor as the resized image
        dst = cv.CreateImage(cv.GetSize(thumbnail), cv.IPL_DEPTH_32F, 3)
        resizedFilter = cv.CreateMat(filter.height / resize_factor,
                                     filter.width / resize_factor, filter.type)
        cv.Resize(filter, resizedFilter)
        # Apply the current filter
        cv.Filter2D(thumbnail, dst, resizedFilter)
        for j in xrange(0, thumbnail.height):
            for i in xrange(0, thumbnail.width):
                # Select the max along the three channels
                maxRes = max(dst[j, i])
                if math.isnan(maxRes):
                    maxRes = 0.0
                if maxRes > response[thumbnail.width * j + i, f]:
                    # Store the max response for the given feature index
                    response[thumbnail.width * j + i, f] = maxRes
    # YUV features
    count = 0
    for j in xrange(0, thumbnail.height):
        for i in xrange(0, thumbnail.width):
            response[count, 48] = thumbnail[j, i][0]
            response[count, 49] = thumbnail[j, i][1]
            response[count, 50] = thumbnail[j, i][2]
            count += 1
    # get the first 4 principal components using PCA
    pca = PCA(response)
    pcaResponse = zeros([thumbnail.height * thumbnail.width, 4])
    for i in xrange(0, thumbnail.height * thumbnail.width):
        pcaResponse[i] = pca.getPCA(response[i], 4)
    # Create a new mean shift instance and apply the clustering algorithm
    ms = MeanShift(bandwidth=10, bin_seeding=True)
    ms.fit(pcaResponse)
    labels = ms.labels_
    n_clusters_ = np.unique(labels)
    print "Number of clusters: ", len(n_clusters_)
    repaintImage(thumbnail, labels)
    cv.Resize(thumbnail, im)
    return im
def do_meanshift(band1, band2, band3, band4, colour1, colour2, make_plots):
    '''Does meanshift clustering to determine the number of clusters in the
    data, which is passed to the KMEANS function.'''
    data = np.loadtxt(inputdata)

    # Input checking
    #if band1 == band2 or band3 == band4:
    #    print "Not a good idea to use the same band in one colour, try again"
    #    return
    #for band in [band1, band2, band3, band4]:
    #    if band not in band_names.keys():
    #        print "Can't find %s in band_name list" % band
    #        return

    # Import 4 different wavelengths
    # Colour 1: 05_mag
    wave1 = data[:, band_names[band1]]
    wave2 = data[:, band_names[band2]]
    # Colour 2: 05_mag
    wave3 = data[:, band_names[band3]]
    wave4 = data[:, band_names[band4]]

    # Remove data pieces with no value
    gooddata1 = np.logical_and(np.logical_and(wave1 != badval, wave2 != badval),
                               np.logical_and(wave3 != badval, wave4 != badval))
    gooddata2 = np.logical_and(np.logical_and(wave1 < maglim, wave2 < maglim),
                               np.logical_and(wave3 < maglim, wave4 < maglim))
    greatdata = np.logical_and(gooddata1, gooddata2)
    colour1 = wave1[greatdata] - wave2[greatdata]
    colour2 = wave3[greatdata] - wave4[greatdata]

    # Truncate data
    X = np.vstack([colour1, colour2]).T
    # Scale data because meanshift generates circular clusters
    X_scaled = preprocessing.scale(X)
    # The bandwidth can be detected automatically using
    # estimate_bandwidth(); it can also be set to a fixed value.
    bandwidth = estimate_bandwidth(X)
    # Meanshift clustering
    ms = MeanShift(bandwidth=bandwidth, bin_seeding=True, cluster_all=False)
    ms.fit(X_scaled)
    labels_unique = np.unique(ms.labels_)
    n_clusters = len(labels_unique[labels_unique >= 0])

    # Make plot of clusters if needed (the parameter is `make_plots`)
    if "MSplot" in make_plots:
        make_ms_plots(colour1, colour2, n_clusters, X, ms, band1, band2, band3, band4)
    return n_clusters
def meanShiftClustering(centers_df, subject):
    # Estimate the bandwidth to use with the mean shift algorithm. The quantile
    # represents the distance used between the box centers to define a cluster:
    # a smaller quantile means a smaller distance between points that end up in
    # the same cluster.
    centers_df = centers_df.reset_index()
    bandwidth = estimate_bandwidth(centers_df[['center_x', 'center_y']].values,
                                   quantile=0.0055)  # as_matrix() is deprecated
    # instantiate the mean shift algorithm
    ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
    # fit the algorithm on the box center coordinates
    ms.fit(centers_df[['center_x', 'center_y']])
    # the resulting cluster labels and the center of each *cluster*
    labels = ms.labels_
    cluster_centers = ms.cluster_centers_
    labels_unique = np.unique(labels)
    # the number of clusters is the number of unique labels
    n_clusters_ = len(labels_unique)
    # concatenate the centers data frame (box coordinates, dimensions, centers)
    # with the cluster labels generated by the clustering
    boxes_df = pd.concat([centers_df, pd.DataFrame(labels, columns=['cluster_label'])], axis=1)
    # the aggregate function in the groupby applies two functions: count and median
    f = {'Number of boxes in a cluster': ['count'], 'Median': ['median']}
    # group by cluster label and aggregate the boxes' top-left coordinates and
    # dimensions with the median
    aggregated_df = boxes_df.groupby('cluster_label')[
        'cluster_label', 'tl_x', 'tl_y', 'width', 'height'].agg(f).reset_index()
    # more descriptive column names
    aggregated_df.columns = ['cluster_label', 'median_cluster_label', 'agg_tl_x',
                             'agg_tl_y', 'agg_width', 'agg_height', 'boxes_in_cluster',
                             'count_tl_x', 'count_tl_y', 'count_width', 'count_height']
    # leave out the unnecessary columns
    aggregated_df = aggregated_df[['cluster_label', 'agg_tl_x', 'agg_tl_y',
                                   'agg_width', 'agg_height', 'boxes_in_cluster']]
    # Look at the output of the plotBoxes function (svg file) and determine at
    # which THRESHOLD value there is a desired number of clusters (shown at the
    # top of the plot) that visually matches the actual grid
    THRESHOLD = 5
    # filter out all clusters with fewer than THRESHOLD boxes; use the
    # old-weather-aggregator-with-plot.py script to check what the best threshold is
    aggregated_df = aggregated_df.loc[aggregated_df.boxes_in_cluster > THRESHOLD, :]
    good_clusters = np.unique(aggregated_df.cluster_label.values)
    print("for subject_id:" + str(subject))
    print("number of estimated clusters overall: %d" % n_clusters_)
    print("number of estimated clusters, after small clusters were filtered out: %d" % len(good_clusters))
    print("clusters with more than %d boxes per cluster:" % THRESHOLD)
    print(aggregated_df.columns)
    print(aggregated_df.head())
    # save the aggregated boxes and their clusters into a csv file, one per subject
    print("Saving the output/aggregated_df_%s.csv file..." % str(subject))
    aggregated_df.to_csv("output/aggregated_df_" + str(subject) + ".csv", index=False)
    # keep only the boxes that belong to the good_clusters (more boxes than the
    # threshold) in boxes_df, then save the dataframe
    boxes_df = boxes_df.loc[boxes_df['cluster_label'].isin(good_clusters), :]
    print("Saving the output/clustered_df_%s.csv file..." % str(subject))
    boxes_df.to_csv("output/clustered_df_" + str(subject) + ".csv", index=False)
    plotBoxes(aggregated_df, boxes_df, cluster_centers)
def test_parallel():
    ms1 = MeanShift(n_jobs=2)
    ms1.fit(X)
    ms2 = MeanShift()
    ms2.fit(X)
    assert_array_equal(ms1.cluster_centers_, ms2.cluster_centers_)
    assert_array_equal(ms1.labels_, ms2.labels_)
def checkForClustering(catalog):
    debug("Checking for data clustering")
    Xfull = catalog.view(np.float64).reshape(catalog.shape + (-1,))[:, 1:]
    X = Xfull[:, 2:]

    debug("Using DBSCAN")
    db = DBSCAN(eps=0.3, min_samples=10).fit(X)
    core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
    core_samples_mask[db.core_sample_indices_] = True
    labels = db.labels_
    n_clusters_DBSCAN = len(set(labels)) - (1 if -1 in labels else 0)
    debug('Estimated number of clusters with DBSCAN: %d' % n_clusters_DBSCAN)
    unique_labelsDBSCAN = set(labels)
    colorsDBSCAN = plt.cm.rainbow(np.linspace(0, 1, len(unique_labelsDBSCAN)))

    debug("Estimating clusters using MeanShift")
    bandwidth = estimate_bandwidth(X, quantile=0.2, n_samples=500)
    ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
    ms.fit(X)
    labelsMS = ms.labels_
    cluster_centers = ms.cluster_centers_
    labels_uniqueMS = np.unique(labelsMS)
    n_clusters_MS = len(labels_uniqueMS)
    debug("Estimated number of clusters with MeanShift: %d" % n_clusters_MS)

    # Plot result
    fig = plt.figure(figsize=(12, 12))
    ax0 = fig.add_subplot(2, 2, 1)
    ax1 = fig.add_subplot(2, 2, 2)
    ax2 = fig.add_subplot(2, 2, 3)
    ax3 = fig.add_subplot(2, 2, 4)
    for k, col in zip(unique_labelsDBSCAN, colorsDBSCAN):
        if k == -1:
            col = 'k'
        class_member_mask = (labels == k)
        mask = class_member_mask & core_samples_mask
        xy = Xfull[mask]
        ax0.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=col,
                 markeredgecolor='k', markersize=5)
        ax2.plot(catalog['MAG_APER(1)'][mask], catalog['CLASS_STAR'][mask], 'o',
                 markerfacecolor=col, markeredgecolor='k', markersize=5)
        xy = Xfull[class_member_mask & ~core_samples_mask]
        ax0.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=col,
                 markeredgecolor='k', markersize=5)
        ax2.plot(catalog['MAG_APER(1)'][class_member_mask & ~core_samples_mask],
                 catalog['CLASS_STAR'][class_member_mask & ~core_samples_mask], 'o',
                 markerfacecolor=col, markeredgecolor='k', markersize=5)
    ax0.set_title('DBSCAN: # clusters: %d' % n_clusters_DBSCAN)
    colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
    for k, col in zip(range(n_clusters_MS), colors):
        my_members = labelsMS == k
        cluster_center = cluster_centers[k]
        ax1.plot(Xfull[my_members, 0], Xfull[my_members, 1], col + '.')
        ax3.plot(catalog['MAG_APER(1)'][my_members], catalog['CLASS_STAR'][my_members], col + '.')
        #ax1.plot(cluster_center[0], cluster_center[1], 'o', markerfacecolor=col,
        #         markeredgecolor='k', markersize=14)
    ax1.set_title('MeanShift: # clusters: %d' % n_clusters_MS)
    plt.show()
def evaluate_candidate(options, work, top_frag, candidate):
    combined = []
    top_score, top_ind, top_support = top_frag
    cand_score, cand_ind, cand_support = candidate
    min_support = options.support_rmsd
    comb_ind = sorted(list(set(top_ind) | set(cand_ind)))
    comb_support = sorted(list(set(top_support) & set(cand_support)))
    n_comb_support = len(comb_support)
    if n_comb_support < min_support:
        return []
    if work.use_scores:
        comb_scores = [work.scores[i] for i in comb_support]
    aln_models = work.CA.take(comb_support, 0).take(comb_ind, 1)
    calculator = RMSDCalculator.RMSDCalculator("QCP_OMP_CALCULATOR", aln_models)
    dist = squareform(calculator.pairwiseRMSDMatrix())
    mds = manifold.MDS(n_components=2, dissimilarity="precomputed", n_jobs=1, n_init=5)
    pos = mds.fit(dist).embedding_
    try:
        ms = MeanShift(bandwidth=options.bnd_rmsd, cluster_all=False,
                       bin_seeding=True, min_bin_freq=min_support)
        ms.fit(pos)
    except Exception:
        # fall back to unseeded mean shift if bin seeding fails
        try:
            ms = MeanShift(bandwidth=options.bnd_rmsd, cluster_all=False,
                           bin_seeding=False)
            ms.fit(pos)
        except Exception:
            return []
    labels = ms.labels_
    labels_unique = np.unique(labels)
    for label in labels_unique:
        if label == -1:
            continue
        class_members = [index[0] for index in np.argwhere(labels == label)]
        class_support = [comb_support[i] for i in class_members]
        n_class_support = len(class_support)
        if n_class_support < min_support:
            continue
        class_dist = dist.take(class_members, 0).take(class_members, 1)
        mean_dist = np.mean(squareform(class_dist))
        if work.use_scores:
            class_scores = [comb_scores[i] for i in class_members]
            class_score = sum(class_scores) / (1 + mean_dist)
        else:
            class_score = (-n_class_support) / (1 + mean_dist)
        heapq.heappush(combined, (class_score, comb_ind, class_support))
    if combined:
        return heapq.heappop(combined)
    return []
def MSclusterer(X):
    X = X.toarray()  # densify the sparse matrix
    bandwidth = estimate_bandwidth(X, quantile=0.04, n_samples=500)
    ms = MeanShift(bandwidth=bandwidth, bin_seeding=False, cluster_all=False)
    ms.fit(X)
    labels = ms.labels_
    labels_unique = np.unique(labels)
    n_clusters_ = len(labels_unique)
    print(n_clusters_)
    return ms.labels_
from sklearn.cluster import MeanShift
from utils import load_bilateral_image, whiten
import matplotlib.pyplot as plt

# Get vectorized image
feat, im = load_bilateral_image()
H, W = im.shape[:2]
feat = whiten(feat)
ms = MeanShift(bandwidth=1, bin_seeding=True)
ms.fit(feat.reshape(-1, feat.shape[2]))
labels = ms.labels_

plt.subplot(1, 2, 1)
plt.imshow(im)
plt.axis('off')
plt.subplot(1, 2, 2)
plt.imshow(labels.reshape(H, W))
plt.axis('off')
plt.show()
from sklearn.cluster import MeanShift, estimate_bandwidth
from itertools import cycle
import numpy as np
import sys

filename = sys.argv[1]
print(filename)
points = []
with open(filename) as filebuf:
    for line in filebuf.readlines():
        line = line.rstrip().split()
        points.append([float(line[0]), float(line[1]), float(line[2])])
points = np.array(points)

bandwidth = estimate_bandwidth(points, quantile=0.1)
ms = MeanShift(bandwidth, bin_seeding=True)
ms.fit(points)
labels = ms.labels_
centers = ms.cluster_centers_
labels_unique = np.unique(labels)
n_clusters_ = len(labels_unique)

# one output file per cluster ('file0.sp', 'file1.sp', ...)
wfiles = ['file' + str(i) + '.sp' for i in range(n_clusters_)]
filebuf = []
for f in wfiles:
    # assumption: open each per-cluster file for writing
    filebuf.append(open(f, 'w'))
# Estimate bandwidth.
# quantile: smoothing parameter; quantiles are cut points dividing the range of
#   a probability distribution into continuous intervals with equal
#   probabilities, or dividing the observations in a sample in the same way.
#   It should be in [0, 1]; 0.5 means the median of all pairwise distances is used.
# n_samples: the number of samples to use. If not given, all samples are used.
#   The bandwidth grows only slightly as the number of samples increases, with
#   little visible difference in the result.
bandwidth1 = estimate_bandwidth(flat_image, quantile=.1, n_samples=500)
#print(bandwidth1)
ms1 = MeanShift(bandwidth1, bin_seeding=True)

# Perform mean shift on flat_image
ms1.fit(flat_image)

# (r,g,b) vectors corresponding to the different clusters after mean shift
labels1 = ms1.labels_

# Remaining colors after mean shift
cluster_centers1 = ms1.cluster_centers_

# Finding and displaying the number of clusters
labels_unique1 = np.unique(labels1)
n_clusters_1 = len(labels_unique1)
#print("number of estimated clusters : %d" % n_clusters_1)

# Displaying segmented image
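# Self-contained demo (not from the original source) of how the quantile
# parameter moves the estimated bandwidth; exact values depend on the data:
import numpy as np
from sklearn.cluster import estimate_bandwidth

rng = np.random.RandomState(0)
demo_pixels = rng.rand(500, 3) * 255  # stand-in for a flattened RGB image
for q in (0.05, 0.1, 0.3, 0.5):
    print(q, estimate_bandwidth(demo_pixels, quantile=q, n_samples=500))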
virginica = iris_data.loc[iris_data['Species_I. virginica'] == 1]
versicolor = iris_data.loc[iris_data['Species_I. versicolor'] == 1]
setosa = iris_data.loc[iris_data['Species_I. setosa'] == 1]
plt.scatter(x=virginica['Sepal length'], y=virginica['Sepal width'], color='r')
plt.scatter(x=versicolor['Sepal length'], y=versicolor['Sepal width'], color='g')
plt.scatter(x=setosa['Sepal length'], y=setosa['Sepal width'], color='b')
#plt.show()

print("Self band: ", estimate_bandwidth(iris_data, quantile=0.2))
analyzer = MeanShift(bandwidth=1)
print("Self MeanShift: ", analyzer.fit(iris_data))
print("Function mean_shift: ", mean_shift(iris_data))

labels, cluster_centers, n_clusters = mean_shift(iris_data)
fig = plt.figure()
ax = fig.add_subplot(111)
colors = cycle('bgrcmy')
print(labels)
for k, col in zip(range(n_clusters), colors):
    cluster_center = cluster_centers[k]
    # use a boolean mask; `if (labels == k)` is ambiguous for arrays
    members = labels == k
    # assumption: plot the two sepal columns used in the scatter plots above
    x = iris_data.loc[members, 'Sepal length']
    y = iris_data.loc[members, 'Sepal width']
    ax.scatter(x, y, c=col, linewidth=0.2)
    # assumption: the fragment was truncated mid-call; mark the cluster center
    ax.scatter(x=cluster_center[0], y=cluster_center[1], c=col, marker='x', s=100)
from sklearn import datasets

iris = datasets.load_iris()
data = iris.data

from sklearn.cluster import MeanShift

clsfr = MeanShift(bandwidth=0.85)
clsfr.fit(data)
labels = clsfr.labels_
centroids = clsfr.cluster_centers_
print(len(centroids))
print(centroids)
#----------------------------------------------------------------------
# Compute clustering with MeanShift
#
# We'll work with the scaled data, because MeanShift finds circular clusters
X_scaled = preprocessing.scale(X)

# The bandwidth can be detected automatically using estimate_bandwidth().
# Because bandwidth estimation is very expensive in memory and computation,
# we skip it here.
#bandwidth = estimate_bandwidth(X)
bandwidth = 0.4

ms = MeanShift(bandwidth=bandwidth, bin_seeding=True, cluster_all=False)
ms.fit(X_scaled)
labels_unique = np.unique(ms.labels_)
n_clusters = len(labels_unique[labels_unique >= 0])

print(labels_unique)
print(bandwidth)
print("number of estimated clusters : %d" % n_clusters)

#------------------------------------------------------------
# Plot the results
fig = plt.figure(figsize=(5, 5))
ax = fig.add_subplot(111)

# plot density
H, C1_bins, C2_bins = np.histogram2d(colour1, colour2, 51)
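# Small sketch (invented data) of why the scaled array is clustered above:
# mean shift uses one isotropic bandwidth, so an unscaled feature with a much
# larger spread dominates the distance and distorts the clusters.
import numpy as np
from sklearn import preprocessing
from sklearn.cluster import MeanShift

rng = np.random.RandomState(1)
raw = np.column_stack([rng.randn(200), rng.randn(200) * 100.0])
for name, arr in (('raw', raw), ('scaled', preprocessing.scale(raw))):
    ms_demo = MeanShift(bandwidth=0.4, bin_seeding=True, cluster_all=False).fit(arr)
    print(name, len(np.unique(ms_demo.labels_[ms_demo.labels_ >= 0])), 'cluster(s)')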
# It can be hard to tell from the data what the optimal window size is,
# so we have this method to give us (based on our data) the estimated
# best-suited window size.
print()
print("Estimate Bandwidth")
bandwidth = estimate_bandwidth(titanic_data)
print(bandwidth)

# Task: Fit data to a mean-shift model
from sklearn.cluster import MeanShift
import numpy as np

# The "mean shift" is the centre of the circle with radius "bandwidth"
# that covers the most points in a plot.
analyzer = MeanShift(bandwidth=30)
fit = analyzer.fit(titanic_data)
print()
print("fit\n", fit)

labels = analyzer.labels_
print()
print("labels\n", labels)
uniqueLabels = np.unique(labels)
print("\n\nnp.unique(labels)\n", uniqueLabels)

# Task: How many clusters do we get
print()
print("Number of clusters:")
numberOfClusters = len(uniqueLabels)
print(numberOfClusters)

# Task: Add a column to the titanic dataframe with the cluster label for each person
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from itertools import cycle
from sklearn.cluster import MeanShift

iris_data = pd.read_excel('iris_data.xlsx')
print(iris_data.head())
iris_data = pd.get_dummies(iris_data, columns=['Species'])
print(iris_data.head())

virginica = iris_data.loc[iris_data['Species_I. virginica'] == 1]
versicolor = iris_data.loc[iris_data['Species_I. versicolor'] == 1]
setosa = iris_data.loc[iris_data['Species_I. setosa'] == 1]
plt.scatter(x=virginica['Sepal length'], y=virginica['Sepal width'], color='r')
plt.scatter(x=versicolor['Sepal length'], y=versicolor['Sepal width'], color='g')
plt.scatter(x=setosa['Sepal length'], y=setosa['Sepal width'], color='b')
#plt.show()

from sklearn.cluster import estimate_bandwidth

print(estimate_bandwidth(virginica, quantile=0.2))
print(estimate_bandwidth(versicolor, quantile=0.2))
print(estimate_bandwidth(setosa, quantile=0.2))
print(estimate_bandwidth(iris_data, quantile=1))

analyzer = MeanShift(bandwidth=1)
print(analyzer.fit(iris_data))
    # replace the values using the dictionary
    df[col] = list(map(convert_to_int, df[col]))
    return df

df = handle_non_numerical_data(df)

# remove the survived column because that's what we are testing
X = np.array(df.drop(['survived'], axis=1).astype(float))
X = preprocessing.scale(X)
y = df['survived']

model = MeanShift()
model.fit(X)

# get the groups the model created
labels = model.labels_

# add a new column to the original df (with textual values) for readability
original_df['cluster_group'] = np.nan
# add the group value to the new column for all rows in df
for i in range(len(X)):
    # iloc references the row at index i
    original_df['cluster_group'].iloc[i] = labels[i]

# the number of groups we got from the model
n_clusters_ = len(np.unique(labels))
# Z = linkage(face_encodings, 'ward')
# fig = plt.figure(figsize=(25, 10))
# dn = dendrogram(Z)

# mean shift
if True:
    nuke_people()
    faces = list(Face.objects.all())
    face_encodings = np.array(
        [np.frombuffer(bytes.fromhex(f.encoding)) for f in faces])
    X = StandardScaler().fit_transform(face_encodings)
    bandwidth = estimate_bandwidth(X, quantile=0.1, n_samples=500)
    ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
    ms.fit(X)

# DBSCAN
if False:
    nuke_people()
    faces = list(Face.objects.all())
    face_encodings = np.array(
        [np.frombuffer(bytes.fromhex(f.encoding)) for f in faces])
    X = StandardScaler().fit_transform(face_encodings)
    # Compute DBSCAN
    db = DBSCAN(eps=5, min_samples=2).fit(X)
    core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
    core_samples_mask[db.core_sample_indices_] = True
    labels = db.labels_
import numpy as np
from sklearn.cluster import MeanShift
from sklearn.datasets import make_blobs  # samples_generator is deprecated
import matplotlib.pyplot as pt
from mpl_toolkits.mplot3d import Axes3D
#from matplotlib import style
#style.use("ggplot")

points = [[3, 3, 3], [9, 9, 9], [2, 9, 9]]
mo, _ = make_blobs(n_samples=1500, centers=points, cluster_std=0.5)

shift_calculating = MeanShift()
shift_calculating.fit(mo)
data_labeling = shift_calculating.labels_
centers_of_points = shift_calculating.cluster_centers_
print(centers_of_points)
clusters_to_spot = len(np.unique(data_labeling))
print("estimated clustering groups", clusters_to_spot)

# valid single-letter matplotlib colours, repeated so indexing never runs out
penning = 20 * ['y', 'c', 'm', 'b', 'g', 'r', 'k']
print(penning)
print(data_labeling)

fig = pt.figure()
axised = fig.add_subplot(111, projection='3d')
    1: np.float64})
print(X.head())

## This is the bit where it fits the data
ms = MeanShift(cluster_all=False)
# Convert the columns of interest to a NumPy array.
# Multi-dimensional, so it could be anything really.
msX = np.array(X.iloc[:, col_int])
# print(msX)
ms.fit(msX)
labels = ms.labels_
cluster_centers = ms.cluster_centers_
n_clusters_ = len(np.unique(labels))
# print("Number of estimated clusters:", n_clusters_)
# print(labels)

## Add the labels to the original dataframe and output to csv for analysis
labels_df = pd.DataFrame(labels, columns=['LABELS'])
X = pd.concat([X, labels_df], axis=1)
X.to_csv(myPath + testout)
def do_work(self, train, uid, url):
    self.cap = cv2.VideoCapture(url)
    print(uid)
    self.kernel = np.ones((3, 3), np.uint8)
    self.frameWidth = int(self.cap.get(3))
    self.frameHeight = int(self.cap.get(4))
    self.outOriginal = cv2.VideoWriter(
        'cache/original.avi', cv2.VideoWriter_fourcc('X', 'V', 'I', 'D'), 24,
        (self.frameWidth, self.frameHeight))
    self.outDetect = cv2.VideoWriter(
        'cache/detect.avi', cv2.VideoWriter_fourcc('X', 'V', 'I', 'D'), 24,
        (self.frameWidth, self.frameHeight))
    self.outSkel = cv2.VideoWriter(
        'cache/skel.avi', cv2.VideoWriter_fourcc('X', 'V', 'I', 'D'), 24,
        (self.frameWidth, self.frameHeight))
    self.fgbg = cv2.bgsegm.createBackgroundSubtractorMOG()
    self.frameCount = 0
    cacheDir = os.path.join(os.getcwd(), 'cache')
    sourceDir = os.path.join(os.getcwd(), 'sources')
    try:
        os.remove(os.path.abspath(os.path.join(cacheDir, 'test.csv')))
    except OSError:
        pass
    try:
        if train:
            os.remove(os.path.abspath(os.path.join(sourceDir, str(uid) + '.csv')))
    except OSError:
        pass
    while self.frameCount < 240:
        status, frame = self.cap.read()
        if not status:
            break
        blur = cv2.GaussianBlur(frame, (9, 9), 0)
        fgmask = self.fgbg.apply(blur)
        img = cv2.dilate(fgmask, self.kernel, iterations=1)
        x, y, height, length = self.contourDetect(img)
        boxImg = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)
        boxImg = cv2.rectangle(boxImg, (x, y), (x + length, y + height), (0, 0, 255), 2)
        cv2.line(boxImg, (0, int(y + 0.75 * height)), (640, int(y + 0.75 * height)),
                 (0, 255, 0), 2)
        cv2.line(boxImg, (0, int(y + 0.15 * height)), (640, int(y + 0.15 * height)),
                 (255, 0, 0), 2)
        skel, hip, shoulder = self.skelRegion(img, x, y, height, length)
        if 50 < self.frameCount < 151:
            fieldnames = ['height', 'stride', 'lowerbody', 'upperbody',
                          'hipangle', 'shoulderx', 'shouldery']
            row = {'height': height,
                   'stride': length,
                   'lowerbody': round(0.53 * height, 2),
                   'upperbody': round(0.4 * height, 2),
                   'hipangle': round(hip, 2),
                   'shoulderx': shoulder[0],
                   'shouldery': shoulder[1]}
            if train:
                with open('sources/' + str(uid) + '.csv', 'a', newline='') as csvfile:
                    with open('cache/target.csv', 'a', newline='') as targetfile:
                        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
                        targetWriter = csv.DictWriter(targetfile, fieldnames=['class'])
                        writer.writerow(row)
                        targetWriter.writerow({'class': uid})
                        targetWriter.writerow({'class': 0})
            else:
                with open('cache/test.csv', 'a', newline='') as csvfile:
                    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
                    writer.writerow(row)
        self.outOriginal.write(frame)
        self.outDetect.write(boxImg)
        self.outSkel.write(skel)
        self.frameCount += 1
        if self.frameCount % 10 == 0:
            self.trackProgress(self.frameCount / 240, train)
    print("processing done!")
    self.cap.release()
    self.outDetect.release()
    self.outOriginal.release()
    self.outSkel.release()
    verify = False
    if not train:
        csv_files = glob.glob('sources/*.csv')
        for cfile in csv_files:
            cf = pd.read_csv(cfile)
            master_array = cf.values  # as_matrix() is deprecated
            df = pd.read_csv('cache/test.csv')
            numpy_array = df.values
            print(numpy_array)
            bandwidth = estimate_bandwidth(master_array, quantile=0.1)
            ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
            ms.fit(master_array)
            master_labels = ms.labels_
            master_centers = ms.cluster_centers_
            print("Master centroids:\n", master_centers)
            print("Number of Master clusters: ", len(np.unique(master_labels)))
            bandwidth = estimate_bandwidth(numpy_array, quantile=0.1)
            ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
            ms.fit(numpy_array)
            labels = ms.labels_
            cluster_centers = ms.cluster_centers_
            print("Test centroids:\n", cluster_centers)
            print("Number of Test clusters: ", len(np.unique(labels)))
            # collapse each set of centroids to a single representative centre
            bandwidth = estimate_bandwidth(master_centers, quantile=0.9)
            ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
            ms.fit(master_centers)
            master_centers = ms.cluster_centers_
            bandwidth = estimate_bandwidth(cluster_centers, quantile=0.9)
            ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
            ms.fit(cluster_centers)
            cluster_centers = ms.cluster_centers_
            # new_centers = np.concatenate((master_centers, cluster_centers))
            LIMIT = np.matrix([[5, 5, 5, 5, 5, 5, 5]])
            if abs(master_centers - cluster_centers).all() < LIMIT.all():
                verify = True
                uid = cfile.split('.')[0].split('/')[1]
                data = self.fetchDatabase(uid)
                with open('cache/image.png', 'wb') as img:
                    img.write(data[4])
                self.verifyDone.emit(str(data[0]), data[1], data[2], str(data[3]))
                print(master_centers)
                print(cluster_centers)
                break
            print(master_centers)
            print(cluster_centers)
    if not verify and not train:
        self.unauthVerify.emit()
    self.threadCompleted.emit(bool(train))
def __call__(self, data):
    data = self.normer(data[self.keys].values)  # as_matrix() is deprecated
    bandwidth = estimate_bandwidth(data, self.quantile, self.n_samples)
    ms = MeanShift(bandwidth)
    ms.fit(data)
    return [int(ms.predict([x])[0]) for x in data]
kp2, des2 = sift.detectAndCompute(img2, None)

import numpy as np
from sklearn.cluster import MeanShift, estimate_bandwidth

# collect the keypoint coordinates directly (the original appended to a
# seeded array and then sliced the seed off again)
x = np.array([kp.pt for kp in kp2])

bandwidth = estimate_bandwidth(x, quantile=0.1, n_samples=500)
ms = MeanShift(bandwidth=bandwidth, bin_seeding=True, cluster_all=True)
ms.fit(x)
labels = ms.labels_
cluster_centers = ms.cluster_centers_
labels_unique = np.unique(labels)
n_clusters_ = len(labels_unique)
print("number of estimated clusters : %d" % n_clusters_)

# group the keypoints by cluster
s = [None] * n_clusters_
for i in range(n_clusters_):
    d, = np.where(labels == i)
    print(len(d))
    s[i] = [kp2[xx] for xx in d]
des2_ = des2
def main(file_name):
    t0 = time.time()
    mat = cv2.imread(file_name, 0)
    ret, binary = cv2.threshold(mat, 100, 255, cv2.THRESH_BINARY)
    _, cnts, _ = cv2.findContours(binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    print(" ### Len contour : %d ###" % len(cnts))
    mat = cv2.cvtColor(mat, cv2.COLOR_GRAY2BGR)
    for i in range(len(cnts)):
        centroid = getCentroid(cnts[i])
        # approximate the contour
        length = cv2.arcLength(cnts[i], True)
        epsilon = 0.01 * length
        if epsilon < 3:
            epsilon = 3
        approx = cv2.approxPolyDP(cnts[i], epsilon, True)
        approx = approx.reshape(len(approx), 2)
        # The bandwidth can be detected automatically from the points
        bandwidth = estimate_bandwidth(approx, quantile=0.2, n_samples=len(approx))
        ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
        ms.fit(approx)
        labels = ms.labels_
        cluster_centers = ms.cluster_centers_
        for p in cluster_centers:
            p = (int(p[0]), int(p[1]))
            cv2.circle(mat, p, 5, (0, 255, 0), -1)
        # labels_unique = np.unique(labels)
        # n_clusters_ = len(labels_unique)
        # print("number of estimated clusters : %d" % n_clusters_)
        # # Plot result
        # import matplotlib.pyplot as plt
        # from itertools import cycle
        # plt.figure(1)
        # plt.clf()
        # colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
        # for k, col in zip(range(n_clusters_), colors):
        #     my_members = labels == k
        #     cluster_center = cluster_centers[k]
        #     plt.plot(approx[my_members, 0], approx[my_members, 1], col + '.')
        #     plt.plot(cluster_center[0], cluster_center[1], 'o', markerfacecolor=col,
        #              markeredgecolor='k', markersize=14)
        # plt.title('Estimated number of clusters: %d' % n_clusters_)
        # plt.show()
        print("-Contours[%d]" % i)
        print("\t*centroid : ", centroid, "in contour : ",
              list(centroid) in arrContour2ListPoints(approx), "\n")
        print("\t*length : %.2f , epsilon : %.2f\n" % (length, epsilon))
        print("\t*approx : ", approx.shape, "\n")
        print("\t*cluster : ", cluster_centers.shape, "\n")
        cv2.putText(mat, "%d" % i, centroid, cv2.FONT_HERSHEY_COMPLEX, 1, (0, 255, 255), 2)
        # cv2.circle(mat, centroid, 5, (0, 255, 0), -1)
        # for p in approx:
        #     cv2.circle(mat, tuple(p), 3, (0, 0, 255), -1)
    dt = time.time() - t0
    print("* total time : %.2f\n" % dt)
    cv2.imshow("", mat)
    k = cv2.waitKey(0)
    cv2.destroyAllWindows()
data, y = ds.make_blobs(N, n_features=2, centers=centers,
                        cluster_std=[0.5, 0.25, 0.7, 0.5], random_state=0)
matplotlib.rcParams['font.sans-serif'] = [u'SimHei']
matplotlib.rcParams['axes.unicode_minus'] = False
plt.figure(figsize=(10, 9), facecolor='w')

# use the median of the squared pairwise distances as a base bandwidth
m = euclidean_distances(data, squared=True)
bw = np.median(m)
print(bw)
for i, mul in enumerate(np.linspace(0.1, 0.4, 4)):
    band_width = mul * bw
    model = MeanShift(bin_seeding=True, bandwidth=band_width)
    ms = model.fit(data)
    centers = ms.cluster_centers_
    y_hat = ms.labels_
    n_clusters = np.unique(y_hat).size
    print('bandwidth:', mul, band_width, 'number of clusters:', n_clusters)
    plt.subplot(2, 2, i + 1)
    plt.title('bandwidth: %.2f, number of clusters: %d' % (band_width, n_clusters))
    clrs = []
    for c in np.linspace(16711680, 255, n_clusters):
        clrs.append('#%06x' % int(c))  # %x needs an integer
    # clrs = plt.cm.Spectral(np.linspace(0, 1, n_clusters))
    for k, clr in enumerate(clrs):
        cur = (y_hat == k)
        plt.scatter(data[cur, 0], data[cur, 1], c=clr, edgecolors='none')
    # assumed completion of the truncated call: mark the cluster centers
    plt.scatter(centers[:, 0], centers[:, 1], s=60, c='k', marker='x')
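# Isolated sketch (synthetic data, not from the original source) of the
# bandwidth heuristic used above: take the median pairwise squared distance as
# a base value and sweep a few multiples of it as the mean-shift bandwidth.
import numpy as np
from sklearn.cluster import MeanShift
from sklearn.datasets import make_blobs
from sklearn.metrics.pairwise import euclidean_distances

pts, _ = make_blobs(n_samples=200, centers=3, cluster_std=0.5, random_state=0)
base = np.median(euclidean_distances(pts, squared=True))
for mul in (0.1, 0.2, 0.3, 0.4):
    n = np.unique(MeanShift(bin_seeding=True, bandwidth=mul * base).fit(pts).labels_).size
    print(mul, mul * base, n)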
    df[column] = list(map(convert_to_int, df[column]))
    return df

df = handle_non_numerical_data(df)
df.drop(['boat'], axis=1, inplace=True)
#print(df.head())

X = np.array(df.drop(['survived'], axis=1).astype(float))
X = preprocessing.scale(X)
y = np.array(df['survived'])

clf = MeanShift()
clf.fit(X)

labels = clf.labels_
clusters_centers = clf.cluster_centers_
n_clusters = len(np.unique(labels))

original_df['cluster_group'] = np.nan
for i in range(len(X)):
    original_df['cluster_group'].iloc[i] = labels[i]

survival_rates = {}
for i in range(n_clusters):
    temp_df = original_df[original_df['cluster_group'] == float(i)]
    survival_cluster = temp_df[temp_df['survived'] == 1]
    survival_rate = float(len(survival_cluster)) / len(temp_df)
class aplicateClustering(object):
    def __init__(self, dataSet):
        self.dataSet = dataSet

    # applies k-means, generating different data partitions depending on the
    # requested number of clusters
    def aplicateKMeans(self, numberK):
        try:
            self.model = KMeans(n_clusters=numberK, random_state=1).fit(self.dataSet)
            self.labels = self.model.labels_
            return 0
        except Exception:
            return 1

    # applies Birch clustering
    def aplicateBirch(self, numberK):
        try:
            self.model = Birch(threshold=0.2, branching_factor=50, n_clusters=numberK,
                               compute_labels=True, copy=True).fit(self.dataSet)
            self.labels = self.model.labels_
            return 0
        except Exception:
            return 1

    # applies hierarchical (agglomerative) clustering
    def aplicateAlgomerativeClustering(self, linkage, affinity, numberK):
        try:
            self.model = AgglomerativeClustering(n_clusters=numberK, affinity=affinity,
                                                 memory=None, connectivity=None,
                                                 compute_full_tree='auto',
                                                 linkage=linkage).fit(self.dataSet)
            self.labels = self.model.labels_
            return 0
        except Exception:
            return 1

    # applies AffinityPropagation with its default parameters
    def aplicateAffinityPropagation(self):
        try:
            self.model = AffinityPropagation().fit(self.dataSet)
            self.labels = self.model.labels_
            return 0
        except Exception:
            return 1

    # applies DBSCAN
    def aplicateDBSCAN(self):
        try:
            self.model = DBSCAN(eps=0.3, min_samples=10).fit(self.dataSet)
            self.labels = self.model.labels_
            return 0
        except Exception:
            return 1

    # applies MeanShift clustering
    def aplicateMeanShift(self):
        try:
            bandwidth = estimate_bandwidth(self.dataSet, quantile=0.2)
            self.model = MeanShift(bandwidth=bandwidth, bin_seeding=True)
            self.model = self.model.fit(self.dataSet)
            self.labels = self.model.labels_
            return 0
        except Exception:
            return 1
def main():
    (options, args) = parseArguments()
    chunk_name = options.filename
    sample_transfer_params = np.array([
        options.sample_concurrency, options.sample_parallelism,
        options.sample_pipelining
    ])
    sample_transfer_throughput = options.sample_throughput
    if options.maxcc is not None:
        global maxcc
        maxcc = options.maxcc
    file_name = os.path.join(os.getcwd(), '../../target', chunk_name)
    resource_package = __name__  # Could be any module/package name
    resource_path = '/' + chunk_name  # Do not use os.path.join(), see below
    print resource_package, sys.path
    fin = pkg_resources.resource_stream(resource_package, resource_path)

    # Read experiments, discarding those that cannot be modelled.
    discarded_data_counter = 0
    all_experiments = []
    data, name, size, similarity = read_data_from_file(fin)
    while data is not None:
        data_copy = np.array(data)
        regression, degree, optimal_point = run_modelling(data_copy, name, data[0, :])
        if regression is None:
            discarded_data_counter += 1
        elif name.startswith("SB") or name.startswith("sg"):
            discarded_data_counter += 1
        else:
            all_experiments.append(
                TransferExperiment(name, size, similarity, regression, degree,
                                   optimal_point, data[0, :]))
        data, name, size, similarity = read_data_from_file(fin)
    fin.close()

    # Estimate each experiment's throughput at the sampled transfer parameters
    # and measure how close that estimate is to the observed throughput.
    for experiment in all_experiments:
        poly = PolynomialFeatures(degree=experiment.poly_degree)
        experiment.estimated_troughput = experiment.regression.predict(
            poly.fit_transform(sample_transfer_params.reshape(1, -1)))
        experiment.set_closeness(
            abs(experiment.estimated_troughput - sample_transfer_throughput))
    all_experiments.sort(key=lambda x: x.closeness, reverse=True)
    for experiment in all_experiments:
        experiment.run_parameter_relaxation(options.cc_rate, options.p_rate,
                                            options.ppq_rate)
    all_experiments.sort(key=lambda x: x.closeness, reverse=True)

    # Cluster the 1-D closeness values; estimate_bandwidth and MeanShift expect
    # an (n_samples, n_features) array, so reshape into a single column.
    attrs = [experiment.closeness for experiment in all_experiments]
    X = np.array(attrs, dtype=np.float).reshape(-1, 1)
    bandwidth = estimate_bandwidth(X, quantile=0.2)
    ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
    ms.fit(X)
    labels = ms.labels_
    cluster_centers = ms.cluster_centers_
    labels_unique = np.unique(labels)
    n_clusters_ = len(labels_unique)
    # Debug residue from the original (per-cluster membership inspection):
    # for k in range(n_clusters_):
    #     my_members = labels == k
    #     print "cluster {0}: {1}".format(k, X[my_members, 0])

    # Rank clusters by centre value (largest first); an experiment's weight
    # doubles with each step down the ranking.
    sorted_centers = sorted(cluster_centers[:, 0], reverse=True)
    for experiment, label in zip(all_experiments, labels):
        rank = sorted_centers.index(cluster_centers[label, 0])
        experiment.closeness_weight = 2 ** rank

    # Weight experiments by similarity as well; DBSCAN also needs a 2-D input.
    all_experiments.sort(key=lambda x: x.similarity)
    attrs = [experiment.similarity for experiment in all_experiments]
    db1 = DBSCAN(eps=2, min_samples=1).fit(np.array(attrs).reshape(-1, 1))
    similarity_labels = db1.labels_
    for experiment, similarity_label in zip(all_experiments, similarity_labels):
        experiment.similarity_weight = 2 ** similarity_label

    all_experiments.sort(key=lambda x: x.closeness)
    for experiment in all_experiments:
        experiment.run_parameter_relaxation(options.cc_rate, options.p_rate,
                                            options.ppq_rate)

    # Combine the relaxed parameters of the retained experiments as a weighted average.
    total_weight = 0
    total_thr = 0
    total_params = [0, 0, 0]
    for experiment in all_experiments:
        if experiment.similarity_weight < 1:
            continue
        weight = experiment.similarity_weight * experiment.closeness_weight
        total_weight += weight
        weighted_params = [param * weight for param in experiment.relaxed_params]
        print "HEYYY", experiment.name, experiment.closeness, experiment.closeness_weight, experiment.similarity_weight, weight, experiment.relaxed_params, experiment.first_row
        total_params = map(add, total_params, weighted_params)
        total_thr += weight * experiment.relaxed_throughput
    final_params = [x / (total_weight * 1.0) for x in total_params]
    final_throughput = total_thr / total_weight
    return final_params, final_throughput
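A self-contained sketch of the closeness-ranking step above, with hypothetical toy values; only the column reshape and the 2**rank weighting are taken from the function:

import numpy as np
from sklearn.cluster import MeanShift, estimate_bandwidth

closeness = np.array([0.5, 0.6, 5.1, 5.3, 12.0]).reshape(-1, 1)  # toy closeness values
ms = MeanShift(bandwidth=estimate_bandwidth(closeness, quantile=0.5)).fit(closeness)
sorted_centers = sorted(ms.cluster_centers_[:, 0], reverse=True)
for value, label in zip(closeness[:, 0], ms.labels_):
    rank = sorted_centers.index(ms.cluster_centers_[label, 0])
    print(value, "-> weight", 2 ** rank)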
def meanShiftClustering(array1, array2, array3, numberOfMinSamples):
    convertedArray = convertArrayFormat(array1, array2, array3)
    arrayToSave = convertedArray
    convertedArray = StandardScaler().fit_transform(convertedArray)
    if len(convertedArray) > 3:
        # Compute MeanShift
        # bandwidth = estimate_bandwidth(convertedArray, quantile=0.2, n_samples=500)
        # ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
        ms = MeanShift()
        ms.fit(convertedArray)
        labels = ms.labels_
        cluster_centers = ms.cluster_centers_

        # count the members of each cluster so that clusters with fewer than
        # 'numberOfMinSamples' members can be reassigned to noise
        counterVec = []
        maxLabel = max(labels)
        labels = list(labels)
        for i in range(0, maxLabel + 1):
            counterVec = np.append(counterVec, labels.count(i))

        # collect the labels whose member count is below 'numberOfMinSamples'
        noiseLabelVec = []
        for i in range(0, len(counterVec)):
            if counterVec[i] < numberOfMinSamples:
                noiseLabelVec = np.append(noiseLabelVec, i)

        # create new labels, marking undersized clusters as noise
        newLabels = []
        for i in range(0, len(labels)):
            if np.any(noiseLabelVec == labels[i]):
                newLabels.append(-1)
            else:
                newLabels.append(labels[i])

        # change the labelling to match that of the other clustering algorithms:
        # first decrease every label uniformly by 1 ...
        for i in range(0, len(newLabels)):
            newLabels[i] -= 1
        # ... then move the -1 labels to max, and the -2 labels back to the original -1
        maxLabel = max(newLabels)
        for i in range(0, len(newLabels)):
            if newLabels[i] == -1:
                newLabels[i] = maxLabel + 1
            elif newLabels[i] == -2:
                newLabels[i] = -1

        # Number of clusters in labels, ignoring noise if present
        labels_unique = np.unique(newLabels)
        n_clusters_ = len(set(labels_unique)) - (1 if -1 in newLabels else 0)
    else:
        # too few points to cluster; keep all outputs defined
        n_clusters_ = 0
        newLabels = []
        cluster_centers = []

    print('number of estimated clusters : %d' % n_clusters_)
    fig = plt.figure(figsize=(16, 12))  # in inches
    colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
    newLabels = np.asarray(newLabels)  # for accurate plotting
    for k, col in zip(range(n_clusters_), colors):
        my_members = newLabels == k
        # the relabelling above is a cyclic shift, so new label k maps back to
        # original cluster (k + 1) mod n, provided no cluster was dropped as noise
        cluster_center = cluster_centers[(k + 1) % len(cluster_centers)]
        plt.plot(convertedArray[my_members, 0], convertedArray[my_members, 1], col + '.')
        plt.plot(cluster_center[0], cluster_center[1], 'o', markerfacecolor=col,
                 markeredgecolor='k', markersize=14)
    plt.title('Estimated number of clusters: %d' % n_clusters_)
    plt.show()
    return arrayToSave, newLabels
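The member-counting loops above can be condensed with np.bincount; a hypothetical standalone sketch of the small-cluster-to-noise step (toy labels, not from the source):

import numpy as np

labels = np.array([0, 0, 0, 1, 2, 2])   # toy MeanShift labels
minSamples = 2
counts = np.bincount(labels)             # members per cluster
undersized = np.where(counts < minSamples)[0]
newLabels = np.where(np.isin(labels, undersized), -1, labels)
print(newLabels)                         # [ 0  0  0 -1  2  2]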
def cropEntries(image, file, padding):
    croppedImages = []
    crop_points = []
    img = image.copy()
    height, width = img.shape[:2]
    sf = float(width) / float(2611)
    pad = int(padding / float(height) * float(11675))
    # per-row ink histogram: count of non-white pixels in each image row
    histogram = pd.Series(
        [width - cv2.countNonZero(img[i, :]) for i in list(range(height))])
    # do plots.
    #fig = plt.figure()
    #ax = histogram.plot()
    #ax.set_ylim([0,150])
    #ax.set_xlim([10500,11500])
    #plt.savefig('histogram' + file + '.pdf', bbox_inches='tight')
    #plt.close(fig)
    dip_df = histogram[histogram < sf * 25].to_frame().rename(columns={0: 'count'})
    indices = np.array(dip_df.index.tolist()).reshape(-1, 1)
    #pkl.dump(indices, open('indices.pkl', 'wb'))

    # find indices to cut the entries: cluster the low-ink row indices
    ms = MeanShift(bandwidth=sf * 50, bin_seeding=True)
    ms.fit(indices)
    dip_group = ms.predict(indices)

    # add new column
    dip_df = dip_df.assign(group=dip_group)
    #cut_points = [0] + sorted(dip_df.groupby('group').apply(lambda x: int(np.mean(x.index))).tolist())[1:-1] + [height]
    # calculate where to cut: the emptiest row of each dip group
    cut_points = [0] + sorted(
        dip_df.groupby('group').idxmin()['count'].tolist())[1:-1] + [height]
    median_height = np.median([
        cut_points[i + 1] - cut_points[i]
        for i in list(range(len(cut_points) - 1))
    ])

    # for each pair of cut points found
    for i in list(range(len(cut_points) - 1)):
        start, end = cut_points[i], cut_points[i + 1]
        # if we suspect an entry is too large
        if end - start > 1.5 * median_height:
            # do the algorithm over again on just this entry
            entry_hist = pd.DataFrame(data={
                'count': [
                    float(width - cv2.countNonZero(img[j, :]))
                    for j in list(range(start, end))
                ]
            }, index=list(range(start, end)))
            entry_dip_df = entry_hist[entry_hist['count'] < sf * 100]
            entry_indices = np.array(entry_dip_df.index.tolist()).reshape(-1, 1)
            entry_ms = MeanShift(bandwidth=sf * 50, bin_seeding=True)
            entry_ms.fit(entry_indices)
            entry_dip_group = entry_ms.predict(entry_indices)
            entry_dip_df = entry_dip_df.assign(entry_group=entry_dip_group)
            entry_cut_points = [start] + sorted(
                entry_dip_df.groupby('entry_group').idxmin()['count'].tolist())[1:-1] + [end]
            # if you have too many cut points for one entry
            if len(entry_cut_points) > 2:
                #fig2 = plt.figure()
                #ax = entry_hist['count'].plot()
                #for xval in entry_cut_points:
                #    ax2 = plt.axvline(x = xval, linestyle = ':', color = 'r')
                #ax.set_ylim([0,300])
                #plt.savefig('entry_hist' + file + str(i+1) + '.pdf', bbox_inches='tight')
                #plt.close(fig2)
                for entry_i in list(range(len(entry_cut_points) - 1)):
                    # adjust the cut points
                    if histogram.iloc[entry_cut_points[entry_i]:
                                      entry_cut_points[entry_i + 1]].sum() > sf * 20:
                        adjusted_start = entry_cut_points[entry_i]
                        adjusted_end = entry_cut_points[entry_i + 1]
                        while (histogram.iloc[adjusted_start] == 0) and (adjusted_start < (adjusted_end - 1)):
                            adjusted_start += 1
                        while (histogram.iloc[adjusted_end - 1] == 0) and ((adjusted_end - 1) > adjusted_start):
                            adjusted_end -= 1
                        adjusted_start = max(adjusted_start - pad, 0)
                        adjusted_end = min(adjusted_end + pad, height)
                        croppedImages.append(img[adjusted_start:adjusted_end, 0:width])
                        crop_points.append([adjusted_start, adjusted_end])
            else:
                if entry_hist['count'].sum() > sf * 20:
                    # adjust cut points
                    adjusted_start = start + 0
                    adjusted_end = end - 0
                    while (histogram.iloc[adjusted_start] == 0) and (adjusted_start < (adjusted_end - 1)):
                        adjusted_start += 1
                    while (histogram.iloc[adjusted_end - 1] == 0) and ((adjusted_end - 1) > adjusted_start):
                        adjusted_end -= 1
                    adjusted_start = max(adjusted_start - pad, 0)
                    adjusted_end = min(adjusted_end + pad, height)
                    croppedImages.append(img[adjusted_start:adjusted_end, 0:width])
                    crop_points.append([adjusted_start, adjusted_end])
        else:
            # if the cut points end up possibly cutting words
            if histogram.iloc[start:end].sum() > sf * 20:
                # adjust cut points
                adjusted_start = start + 0
                adjusted_end = end - 0
                while (histogram.iloc[adjusted_start] == 0) and (adjusted_start < (adjusted_end - 1)):
                    adjusted_start += 1
                while (histogram.iloc[adjusted_end - 1] == 0) and ((adjusted_end - 1) > adjusted_start):
                    adjusted_end -= 1
                adjusted_start = max(adjusted_start - pad, 0)
                adjusted_end = min(adjusted_end + pad, height)
                croppedImages.append(img[adjusted_start:adjusted_end, 0:width])
                crop_points.append([adjusted_start, adjusted_end])
    #pkl.dump(crop_points, open('crop_points.' + file + '.pkl', 'wb'))
    return croppedImages, crop_points
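A hypothetical standalone sketch of the dip-clustering idea in cropEntries: cluster the indices of low-ink rows with MeanShift, then cut at the emptiest row of each group (toy histogram; the bandwidth is arbitrary):

import numpy as np
import pandas as pd
from sklearn.cluster import MeanShift

histogram = pd.Series([0, 1, 0, 9, 8, 9, 0, 0, 1, 9, 9, 8, 0, 1, 0])  # toy ink counts
dip_df = histogram[histogram < 3].to_frame().rename(columns={0: 'count'})
indices = np.array(dip_df.index.tolist()).reshape(-1, 1)
ms = MeanShift(bandwidth=2, bin_seeding=True)
ms.fit(indices)
dip_df = dip_df.assign(group=ms.predict(indices))
cut_points = sorted(dip_df.groupby('group').idxmin()['count'].tolist())
print(cut_points)  # one cut per dip region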
def postionInSmallMapExtract(self, mapPic):
    # the input is the binary (0/1) matrix of the white frame on the minimap
    if np.max(mapPic) > 1:
        print('image values are greater than 1')
    if (mapPic == 0).all():
        print('image is all zeros')
        return
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (2, 2))  # structuring element
    mapPic = cv2.morphologyEx(mapPic, cv2.MORPH_OPEN, kernel)   # morphological opening
    # self.picDisplay(mapPic,'map2')
    lieHe = np.sum(mapPic, axis=0)   # sum of each column, yielding one row
    hangHe = np.sum(mapPic, axis=1)  # sum of each row, yielding one column
    lieKuangXianPos = []
    hangKuangXianPos = []
    lieAreaList = []
    hangAreaList = []
    for i in range(len(lieHe)):
        if lieHe[i] > 6:
            lieKuangXianPos.append(i)
        elif lieHe[i] > 0:
            lieAreaList.append(i)
    for i in range(len(hangHe)):
        if hangHe[i] > 6:
            hangKuangXianPos.append(i)
        elif hangHe[i] > 0:
            hangAreaList.append(i)
    # example: row coordinates [143, 144, 170, 171] (vertical positions),
    # column coordinates [89, 90, 137, 138] (horizontal positions),
    # centre point [157.0, 113.5]; half-rectangle distances to the centre
    # are 13.5 vertically and 24 horizontally
    # to handle frames that partly extend beyond the minimap, first estimate
    # the centre from the in-frame area
    centerPos = []
    centerPos.append(np.mean(lieAreaList))   # x-coordinate of the centre
    centerPos.append(np.mean(hangAreaList))  # y-coordinate of the centre
    hangChangdu = len(hangKuangXianPos)
    lieChangdu = len(lieKuangXianPos)
    hangKuangXianPos = np.array(hangKuangXianPos).reshape(1, hangChangdu)
    lieKuangXianPos = np.array(lieKuangXianPos).reshape(1, lieChangdu)
    # use clustering to find the midlines of the frame edges; vertical coordinate first
    zeros = np.zeros([1, hangChangdu])
    points = np.array([hangKuangXianPos, zeros]).T.reshape(hangChangdu, 2)
    ms = MeanShift(bandwidth=2)
    ms.fit(points)
    cluster_centers = ms.cluster_centers_
    print(cluster_centers)
    ys = []
    for i in cluster_centers:
        for j in i:
            if j != 0:
                ys.append(j)
    # now the horizontal coordinate
    zeros = np.zeros([1, lieChangdu])
    points = np.array([lieKuangXianPos, zeros]).T.reshape(lieChangdu, 2)
    ms = MeanShift(bandwidth=4)
    ms.fit(points)
    cluster_centers = ms.cluster_centers_
    xs = []
    for i in cluster_centers:
        for j in i:
            if j != 0:
                xs.append(j)
    centerPoint = [0, 0]
    if len(xs) == 2:
        centerPoint[0] = np.mean(xs)
    elif len(xs) == 1:
        # only one vertical edge visible: offset by half the frame width
        if xs[0] > centerPos[0]:
            centerPoint[0] = xs[0] - 24
        else:
            centerPoint[0] = xs[0] + 24
    else:
        print('err: clustering returned neither 1 nor 2 points, xs: %s' % (xs))
    if len(ys) == 2:
        centerPoint[1] = np.mean(ys)
    elif len(ys) == 1:
        # only one horizontal edge visible: offset by half the frame height
        if ys[0] > centerPos[1]:
            centerPoint[1] = ys[0] - 13.5
        else:
            centerPoint[1] = ys[0] + 13.5
    else:
        print('err: clustering returned neither 1 nor 2 points, ys: %s' % (ys))
    # print('estimated centre: %s, precise centre: %s' % (centerPos, centerPoint))
    return centerPoint
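The zero column above exists only to give MeanShift a 2-D input; a hypothetical sketch showing that a plain reshape of the frame-line positions does the same (toy columns):

import numpy as np
from sklearn.cluster import MeanShift

border_cols = np.array([89, 90, 137, 138]).reshape(-1, 1)  # toy frame-line columns
ms = MeanShift(bandwidth=4).fit(border_cols)
print(sorted(c[0] for c in ms.cluster_centers_))  # midlines near 89.5 and 137.5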
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D  # enables the 3-D projection
from sklearn.cluster import MeanShift

data = pd.read_csv("data.csv")
s = data[["G3", "Dalc", "failures"]]

fig = plt.figure()
ax = fig.add_subplot(111, projection="3d")
ax.scatter(s["Dalc"], s["failures"], s["G3"])
ax.set_xlabel("Daily alcohol consumption")
ax.set_ylabel("Failures")     # Check the pics
ax.set_zlabel("Grade point")  # Check the pic
plt.show()

# note: indexing three colours by labels_ assumes MeanShift finds at most three clusters
colors = np.array(["Red", "Blue", "Green"])
meanshift = MeanShift()
meanshift.fit(s)
fig = plt.figure()
ax = fig.add_subplot(111, projection="3d")
ax.scatter(s["Dalc"], s["failures"], s["G3"], color=colors[meanshift.labels_])
ax.set_xlabel("Daily alcohol consumption")
ax.set_ylabel("Free time")    # Check the pics
ax.set_zlabel("Grade point")  # Check the pic
plt.show()

s = s.groupby(["Dalc", "failures"])["G3"].mean()
s = s.reset_index()
print(s)
fig = plt.figure()
ax = fig.add_subplot(111, projection="3d")
ax.scatter(s["Dalc"], s["failures"], s["G3"])
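If MeanShift returns more than three clusters, indexing the three-colour array raises an IndexError; a hypothetical colormap-based alternative that works for any label count:

import numpy as np
import matplotlib.pyplot as plt

labels = np.array([0, 1, 2, 3, 3])  # toy labels with four clusters
plt.scatter(range(len(labels)), np.zeros(len(labels)), c=labels, cmap="rainbow")
plt.show()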
def combined(self, cluster_num=None, isPlot=False):
    '''Parameters
    - cluster_num: "int", manually determines the number of clusters
    - isPlot: plot or not
    Explanation
    - utilize the Mean-Shift method to make initial centroids for K-Means
    '''
    print(" [*] starting meanshift-kmeans combined method ")
    self.data_preprocessing()
    print(" [*] data pre-processing done ")
    ind = 0
    while self.data.shape[1] > self.data.shape[0]:
        self.dimension_reduction(cont_rate=self.cont_rate_list[ind])
        if self.data.shape[1] > self.data.shape[0]:
            print(" - reduced data dimension larger than data amount, trying another contribution rate ")
            ind += 1
        else:
            print(" - reduced data dimension smaller than data amount ")
            break
    print(" [*] dimensionality reduction done ")

    X = np.array(self.data)
    bandwidth = estimate_bandwidth(X, quantile=0.2, n_samples=10000,
                                   random_state=42, n_jobs=2)
    ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
    ms.fit(X)
    labels = ms.labels_
    cluster_centers = ms.cluster_centers_
    labels_unique = np.unique(labels)
    n_clusters_ = len(labels_unique)
    dict_1 = []
    for i in labels:
        dict_1.append((i, str(i)))

    np.random.seed(42)
    data = np.array(cluster_centers)
    if n_clusters_ > 2:
        if not cluster_num:
            # pick the number of clusters with the best average silhouette score
            range_n_clusters = list(range(2, n_clusters_))
            silhouette_avg = []
            for n in range_n_clusters:
                estimator = KMeans(init='random', n_clusters=n, max_iter=1000, n_init=10)
                cluster_labels = estimator.fit_predict(data)
                silhouette_avg.append(silhouette_score(data, cluster_labels))
                print(" - For n_clusters = " + str(n) +
                      ", the average silhouette_score is : " + str(silhouette_avg[-1]) + ".")
            # use K-Means to cluster the Mean-Shift centroids
            n_digits = range_n_clusters[silhouette_avg.index(max(silhouette_avg))]
            kmeans = KMeans(init='random', n_clusters=n_digits, max_iter=1000, n_init=10)
            output_label = kmeans.fit_predict(data)
            cluster_centers = kmeans.cluster_centers_
            # use the clustered Mean-Shift centroids as initial centroids of K-Means
            kmeans = KMeans(init=cluster_centers, n_clusters=len(cluster_centers), max_iter=1000)
            output_label = kmeans.fit_predict(self.data)
        else:
            # use K-Means to cluster the Mean-Shift centroids
            kmeans = KMeans(init='random', n_clusters=min(cluster_num, n_clusters_),
                            max_iter=1000, n_init=10)
            output_label = kmeans.fit_predict(data)
            cluster_centers = kmeans.cluster_centers_
            # use the clustered Mean-Shift centroids as initial centroids of K-Means
            kmeans = KMeans(init=cluster_centers, n_clusters=len(cluster_centers), max_iter=1000)
            output_label = kmeans.fit_predict(self.data)
    else:
        output_label = labels
    print(" [*] end ")

    if isPlot and X.shape[1] >= 2:
        plt.figure(2)
        plt.clf()
        colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
        plot_shape = list('.^*o+dp.^*o+dp.^*o+dp^*o+dp.^*o+dp.^*o+dp.^*o+dp.^*o+dp.^*o+dp.')
        for k, col in zip(list(set(output_label)), colors):
            my_members = output_label == k
            cluster_center = cluster_centers[k]
            plt.plot(X[my_members, 0], X[my_members, 1], col + plot_shape[k])
            plt.plot(cluster_center[0], cluster_center[1], 'o', markerfacecolor=col,
                     markeredgecolor='k', markersize=14)
        plt.title('MeanShift-KMeans')
        plt.show()

    # remap: each point's stringified Mean-Shift label is looked up in dict_2
    # to obtain its final label
    dict_2 = {}
    for i in range(len(output_label)):
        dict_2.update({str(i): output_label[i]})
    new_labels = []
    for i in range(len(dict_1)):
        new_labels.append(dict_2[dict_1[i][1]])
    new_labels = np.array(new_labels).astype(int)
    labels_unique = np.unique(new_labels)
    n_clusters_ = len(labels_unique)
    new_labels_pd = pd.DataFrame(new_labels, index=self.data.index.tolist(), columns=['labels'])

    # per-cluster feature means and the index membership of each cluster
    data_cluster_dict = {}
    output = pd.DataFrame(index=list(set(new_labels_pd['labels'])),
                          columns=self.data.columns)
    for label in set(new_labels_pd['labels']):
        a = list(new_labels_pd[(new_labels_pd['labels'] == label)].index)
        tmp = self.data.loc[a]
        for col in list(self.data.columns):
            output.loc[label][col] = tmp[col].mean()
        data_cluster_dict.update({label: a})

    if isPlot and X.shape[1] == 1:
        x = []
        for i in range(len(data_cluster_dict)):
            x.extend(data_cluster_dict[i])
        self.data = self.data.reindex(x)
        self.data = self.data.reset_index(drop=True)
        j = 0
        plot_shape = list('.^*o+dp.^*o+dp.^*o+dp^*o+dp.^*o+dp.^*o+dp.^*o+dp.^*o+dp.^*o+dp.')
        plot_color = list('bgrcmykbgrcmykbgrcmykbgrcmyk')
        for i in range(len(data_cluster_dict)):
            plt.plot(self.data.loc[j:j + len(data_cluster_dict[i]) - 1],
                     color=plot_color[i], marker=plot_shape[i],
                     linestyle='', linewidth=2.0)
            j += len(data_cluster_dict[i])
        plt.show()
    return output, data_cluster_dict
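A hypothetical standalone sketch of the silhouette-based model selection used in combined(): run K-Means on the Mean-Shift centroids for several k and keep the k with the best average silhouette score (toy data; all names are illustrative):

import numpy as np
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn.metrics import silhouette_score

centroids, _ = make_blobs(n_samples=40, centers=4, random_state=42)  # stand-in centroids
scores = {}
for n in range(2, 8):
    labels = KMeans(n_clusters=n, n_init=10, random_state=42).fit_predict(centroids)
    scores[n] = silhouette_score(centroids, labels)
best_k = max(scores, key=scores.get)
print("chosen k:", best_k)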
def cluster(x, y, n_class):
    plt.figure()
    plt.subplot(241)
    plt.title('RAW')
    plt.axis('off')
    plt.scatter(x[:, 0], x[:, 1], c=y)

    kmeans_pre = KMeans(n_clusters=n_class, random_state=9).fit_predict(x)
    plt.subplot(242)
    plt.title('K-Means')
    plt.axis('off')
    plt.scatter(x[:, 0], x[:, 1], c=kmeans_pre)

    aff_pre = AffinityPropagation(preference=-50).fit(x)
    cluster_centers_indices = aff_pre.cluster_centers_indices_
    labels = aff_pre.labels_
    n_clusters_ = len(cluster_centers_indices)
    plt.subplot(243)
    plt.title('AffinityPropagation@{}'.format(len(np.unique(labels))))
    plt.axis('off')
    plt.scatter(x[:, 0], x[:, 1], c=labels)

    bandwidth = estimate_bandwidth(x, quantile=0.2, n_samples=500)
    ms = MeanShift(bandwidth=bandwidth / 2)
    ms.fit(x)
    labels1 = ms.labels_
    cluster_centers = ms.cluster_centers_
    plt.subplot(244)
    plt.title('MeanShift@{}'.format(len(np.unique(labels1))))
    plt.axis('off')
    plt.scatter(x[:, 0], x[:, 1], c=labels1)

    sc_pre = SpectralClustering(n_clusters=n_class).fit_predict(x)
    plt.subplot(245)
    plt.title('SpectralClustering')
    plt.axis('off')
    plt.scatter(x[:, 0], x[:, 1], c=sc_pre)

    clustering = AgglomerativeClustering(n_clusters=n_class).fit(x)
    labels2 = clustering.labels_
    plt.subplot(246)
    plt.title('AgglomerativeClustering')
    plt.axis('off')
    plt.scatter(x[:, 0], x[:, 1], c=labels2)

    labels3 = DBSCAN(eps=abs(np.max(x) - np.min(x)) / n_class / 2,
                     min_samples=1).fit_predict(x)
    plt.subplot(247)
    plt.title('DBSCAN@{}'.format(len(np.unique(labels3))))
    plt.axis('off')
    plt.scatter(x[:, 0], x[:, 1], c=labels3)

    gmm = GaussianMixture(n_components=n_class)
    gmm.fit(x)
    labels4 = gmm.predict(x)
    plt.subplot(248)
    plt.title('GaussianMixture')
    plt.axis('off')
    plt.scatter(x[:, 0], x[:, 1], c=labels4)
    plt.show()

    preds = [kmeans_pre, labels, labels1, sc_pre, labels2, labels3, labels4]
    names = ['K-Means',
             'AffinityPropagation@{}'.format(len(np.unique(labels))),
             'MeanShift@{}'.format(len(np.unique(labels1))),
             'SpectralClustering',
             'AgglomerativeClustering',
             'DBSCAN@{}'.format(len(np.unique(labels3))),
             'GaussianMixture']
    print('Method:', 'NMI', 'Homogeneity', 'Completeness')
    for name, pred in zip(names, preds):
        m1, m2, m3 = metric(y, pred)
        print(name + ':', m1, m2, m3)
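The metric() helper is not shown in the snippet; a plausible sketch, assuming it wraps sklearn's NMI, homogeneity, and completeness scores in that order:

from sklearn.metrics import (normalized_mutual_info_score,
                             homogeneity_score, completeness_score)

def metric(y_true, y_pred):
    # returns (NMI, homogeneity, completeness), matching the print order above
    return (normalized_mutual_info_score(y_true, y_pred),
            homogeneity_score(y_true, y_pred),
            completeness_score(y_true, y_pred))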
#!/usr/bin/python
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import MeanShift, estimate_bandwidth
from itertools import cycle

# Load data from input file
X = np.loadtxt('data_clustering.txt', delimiter=',')

# Estimate the bandwidth of X
bandwidth_X = estimate_bandwidth(X, quantile=0.1, n_samples=len(X))

# Cluster data with MeanShift
meanshift_model = MeanShift(bandwidth=bandwidth_X, bin_seeding=True)
meanshift_model.fit(X)

# Extract the centers of clusters
cluster_centers = meanshift_model.cluster_centers_
print('\nCenters of clusters:\n', cluster_centers)

# Estimate the number of clusters
labels = meanshift_model.labels_
num_clusters = len(np.unique(labels))
print("\nNumber of clusters in input data =", num_clusters)

# Plot the points and cluster centers
plt.figure()
markers = 'o*xvs'
for i, marker in zip(range(num_clusters), markers):
    # Plot points that belong to the current cluster
    plt.scatter(X[labels == i, 0], X[labels == i, 1], marker=marker, color='black')
    # Plot the cluster center
    cluster_center = cluster_centers[i]
    plt.plot(cluster_center[0], cluster_center[1], marker='o',
             markerfacecolor='black', markeredgecolor='black', markersize=15)
plt.title('Clusters')
plt.show()
from scipy.fftpack import dct
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
from sklearn.cluster import MeanShift, estimate_bandwidth

# use a raw string so the backslashes in the Windows path are not treated as escapes
image = Image.open(r'C:\Users\Cris\Desktop\mountain_color.jpg')
image = np.array(image)

# Need to convert the image into a feature array based on rgb intensities
flat_image = np.reshape(image, [-1, 3])

# Estimate bandwidth
bandwidth2 = estimate_bandwidth(flat_image, quantile=.2, n_samples=5000)
ms = MeanShift(bandwidth=bandwidth2, bin_seeding=True)
ms.fit(flat_image)
labels = ms.labels_

# Example of how to use the discrete cosine transform.
# We will apply it to luminance, rather than labels.
discrete_cosine_transform = dct(np.array(labels, dtype='float'))
np.savetxt(r'C:\Users\Cris\Desktop\labels.csv', labels, delimiter=',')

# Plot image vs segmented image
plt.figure(2)
plt.subplot(2, 1, 1)
plt.imshow(image)
plt.axis('off')
plt.subplot(2, 1, 2)
plt.imshow(np.reshape(labels, image.shape[:2]))
plt.axis('off')
plt.show()
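A hypothetical follow-up sketch: map each label back to its cluster-center colour to get the usual mean-shift segmented view (toy 2x2 image, not from the source; the bandwidth is arbitrary):

import numpy as np
from sklearn.cluster import MeanShift

image = np.array([[[250, 0, 0], [255, 5, 5]],
                  [[0, 0, 250], [5, 5, 255]]], dtype=np.uint8)  # toy image
flat = np.reshape(image, [-1, 3]).astype(float)
ms = MeanShift(bandwidth=30).fit(flat)
# each pixel takes the colour of its cluster center
segmented = ms.cluster_centers_[ms.labels_].reshape(image.shape).astype(np.uint8)
print(segmented[:, :, 0])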
# bandwidth1 and bandwidth2 are presumably defined analogously above,
# with quantile=.1 and quantile=.2
bandwidth3 = estimate_bandwidth(flat_image, quantile=.3, n_samples=500)
bandwidth4 = estimate_bandwidth(flat_image, quantile=.4, n_samples=500)
print(bandwidth1)
print(bandwidth2)
print(bandwidth3)
print(bandwidth4)

ms1 = MeanShift(bandwidth1, bin_seeding=True)
ms2 = MeanShift(bandwidth2, bin_seeding=True)
ms3 = MeanShift(bandwidth3, bin_seeding=True)
ms4 = MeanShift(bandwidth4, bin_seeding=True)
#print(ms1)

# Performing mean shift on flat_image
ms1.fit(flat_image)
ms2.fit(flat_image)
ms3.fit(flat_image)
ms4.fit(flat_image)

# (r,g,b) vectors corresponding to the different clusters after mean shift
labels1 = ms1.labels_
labels2 = ms2.labels_
labels3 = ms3.labels_
labels4 = ms4.labels_
#print(labels)

# Remaining colors after mean shift
cluster_centers1 = ms1.cluster_centers_
cluster_centers2 = ms2.cluster_centers_
cluster_centers3 = ms3.cluster_centers_
cluster_centers4 = ms4.cluster_centers_
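The fourfold repetition above collapses into a loop; a hypothetical rewrite with stand-in pixel data (the quantile values are taken from the snippet):

import numpy as np
from sklearn.cluster import MeanShift, estimate_bandwidth
from sklearn.datasets import make_blobs

flat_image, _ = make_blobs(n_samples=500, centers=4, random_state=0)  # stand-in pixels
for q in (0.1, 0.2, 0.3, 0.4):
    bw = estimate_bandwidth(flat_image, quantile=q, n_samples=500)
    ms = MeanShift(bandwidth=bw, bin_seeding=True).fit(flat_image)
    print(q, bw, len(np.unique(ms.labels_)))  # larger quantile -> fewer clusters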
plt.scatter(X["Longitude"], X["Latitude"], c=kmeans.labels_, cmap='rainbow', zorder=0) plt.ylim(20, 55) plt.xlim(-130, -60) plt.title("KMeans clustering k=50") plt.xlabel("Longitude") plt.ylabel("Latitude") plt.show() """""" """""" """"" Mean_shift Clustering """ """""" """""" "" #bandwidth = estimate_bandwidth([X["Longitude"], X["Latitude"]]) meanshift = MeanShift() meanshift.fit(distance_matrix) print("number of estimated clusters in Mean Shift Clustering", len(np.unique(meanshift.labels_))) plt.scatter(X["Longitude"], X["Latitude"], c=meanshift.labels_, cmap='rainbow') #plt.ylim(20,55) #plt.xlim(-130,-60) plt.title("MeanShift Clustering") plt.xlabel("Longitude") plt.ylabel("Latitude") plt.show()