def meanShift(flat_image):
    # Estimate bandwidth from a sample of the flattened image
    bandwidth = estimate_bandwidth(flat_image, quantile=0.2, n_samples=500)
    ms = MeanShift(bandwidth, bin_seeding=True)
    ms.fit(flat_image)
    return ms.labels_, ms.cluster_centers_
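# Usage sketch (added for illustration, not from the original source): the helper
# above expects a (n_pixels, 3) array, so an image must be flattened first. The
# file name and the cv2/numpy imports here are assumptions.
import cv2
import numpy as np
from sklearn.cluster import MeanShift, estimate_bandwidth

img = cv2.imread('image.png')                        # BGR image, shape (H, W, 3)
flat_image = np.reshape(img, [-1, 3]).astype(float)  # one row per pixel
labels, centers = meanShift(flat_image)
segmented = centers[labels].reshape(img.shape)       # recolor pixels by cluster center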
def meanshift_for_hough_line(self):
    # init mean shift
    pixels_of_label = {}
    points_of_label = {}
    for hough_line in self.points_of_hough_line:
        pixels = self.pixels_of_hough_line[hough_line]
        pixels = np.array(pixels)
        bandwidth = estimate_bandwidth(pixels, quantile=QUANTILE, n_samples=500)
        if bandwidth == 0:
            bandwidth = 2
        ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
        ms.fit(pixels)
        labels = ms.labels_
        labels_unique = np.unique(labels)
        n_clusters_ = len(labels_unique)
        for k in range(n_clusters_):
            label = list(hough_line)
            label.append(k)
            pixels_of_label[tuple(label)] = map(tuple, pixels[labels == k])
    for label in pixels_of_label:
        pixels = pixels_of_label[label]
        points = map(self.img.get_bgr_value, pixels)
        points_of_label[label] = points
    self.pixels_of_hough_line = pixels_of_label
    self.points_of_hough_line = points_of_label
def _fit_mean_shift(self, x):
    for c in xrange(len(self.crange)):
        quant = 0.015 * (c + 1)
        for r in xrange(self.repeats):
            bandwidth = estimate_bandwidth(x, quantile=quant, random_state=r)
            idx = c * self.repeats + r
            model = MeanShift(bandwidth=bandwidth, bin_seeding=True)
            model.fit(x)
            self._labels[idx] = model.labels_
            self._parameters[idx] = model.cluster_centers_

            # build equivalent gmm
            k = model.cluster_centers_.shape[0]
            model_gmm = GMM(n_components=k, covariance_type=self.cvtype,
                            init_params='c', n_iter=0)
            model_gmm.means_ = model.cluster_centers_
            model_gmm.weights_ = sp.array(
                [(model.labels_ == i).sum() for i in xrange(k)])
            model_gmm.fit(x)

            # evaluate goodness of fit
            self._ll[idx] = model_gmm.score(x).sum()
            if self.gof_type == 'aic':
                self._gof[idx] = model_gmm.aic(x)
            if self.gof_type == 'bic':
                self._gof[idx] = model_gmm.bic(x)

            print quant, k, self._gof[idx]
def cluster_pixels_ms(self):
    """
    Cluster point descriptors by mean shift.
    :type self: ColorRemover
    """
    fg_pixels = self.img.fg_pixels.keys()
    descriptors = []
    for r, c in fg_pixels:
        descriptors.append(self.descriptor_map[r][c])
    descriptors = np.array(descriptors)
    # reduce the descriptors to half their original dimension before clustering
    descriptors = PCA(n_components=int(VECTOR_DIMENSION) / 2).fit_transform(descriptors)
    # descriptors = self.descriptor_map.reshape(descriptors_rows, 1, VECTOR_DIMENSION)
    bandwidth = estimate_bandwidth(descriptors, quantile=0.05)
    ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
    ms.fit(descriptors)
    labels = ms.labels_
    for i in range(len(labels)):
        xy = fg_pixels[i]
        label = labels[i]
        self.labels_map.itemset(xy, label)
    # save the indices and BGR values of each cluster as a dictionary keyed by label
    for label in range(K):
        self.pixels_of_hough_line_in_sphere[label] = map(
            tuple, np.argwhere(self.labels_map == label))
        self.cluster_bgr[label] = map(tuple, self.img.bgr[self.labels_map == label])
def get_clusters(self, in_file, cc_file, clf_file, arrivals_file,
                 chunk_size=1710671):
    df = pd.read_csv(open(in_file), chunksize=chunk_size)
    dests = []
    part = 1
    lines = 1710671 / chunk_size
    try:
        dest = cPickle.load(open(arrivals_file))
    except IOError:
        for d in df:
            print "%d / %d" % (part, lines)
            part += 1
            for row in d.values:
                # print eval(row[-1])
                tmp = eval(row[-1])
                if len(tmp) > 0:
                    dests.append(tmp[-1])
        dest = np.array(dests)
        cPickle.dump(dest, open(arrivals_file, "w"),
                     protocol=cPickle.HIGHEST_PROTOCOL)
    print "Destination points loaded"
    try:
        ms = cPickle.load(open(clf_file))
    except IOError:
        bw = 0.001
        ms = MeanShift(bandwidth=bw, bin_seeding=True, min_bin_freq=5, n_jobs=-2)
        ms.fit(dest)
        cPickle.dump(ms, open(clf_file, "w"), protocol=cPickle.HIGHEST_PROTOCOL)
    print "Mean shift loaded"
    cluster_centers = ms.cluster_centers_
    cPickle.dump(cluster_centers, open(cc_file, "w"),
                 protocol=cPickle.HIGHEST_PROTOCOL)
    print "Clusters dumped"
def applyMeanShift(data, quantileValue=0.2, clusterall=False):
    result = []
    n_samples = len(data)
    print "Number of points in the dataset: %d" % n_samples
    bandwidth = estimate_bandwidth(data, quantile=quantileValue)
    # Apply MeanShift
    ms = MeanShift(bandwidth=bandwidth, cluster_all=clusterall)
    clustereddata = ms.fit(data)
    clusteredlabels = clustereddata.labels_
    barycenters = ms.cluster_centers_
    labels_unique = np.unique(clusteredlabels)
    nbOfClusters = len(labels_unique)
    print "number of estimated clusters : %d" % nbOfClusters
    for i in labels_unique:
        print "### Indices of the points in cluster %d ###" % i
        # print [indice[0] for indice in np.argwhere(clusteredlabels == i)]
        result.append([indice[0] for indice in np.argwhere(clusteredlabels == i)])
    # Prepend a zero-coordinate vector to account for the fact that the
    # -1 "cluster" (orphans) does not have a barycenter
    if -1 in labels_unique:
        barycenters = np.append([[0 for k in range(len(barycenters[0]))]],
                                barycenters, axis=0)
    return [result, barycenters]
def meanShift(mtx, **kw):
    """
    meanShift(mtx, **kw) uses scikit-learn's meanshift clustering
    implementation to cluster infoDistance matrices. Call with the
    distance matrix as the first parameter.

    Available keyword arguments:
        startingbandwidth: the lowest bandwidth to begin the estimation
            with (defaults to 0.1)
        bandwidthincrement: the amount by which to increment the bandwidth
            between rounds of meanshift (defaults to 0.01)
    """
    H = kw.get('startingbandwidth', 0.1)
    dH = kw.get('bandwidthincrement', 0.01)
    # widen the bandwidth until the points collapse into a single cluster
    nclusters = np.inf
    while nclusters > 1:
        ms = MeanShift(bandwidth=H)
        ms.fit(mtx)
        centers = ms.cluster_centers_
        clusters = ms.labels_
        nclusters = len(centers)
        # count the clusters with more than one member
        nonunary = np.shape(np.where(np.bincount(clusters) > 1))[1]
        H = H + dH
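# Illustrative call (an assumption, not from the source): random data stands in
# for an infoDistance matrix purely to show the keyword interface.
import numpy as np
mtx = np.abs(np.random.rand(40, 40))
meanShift(mtx, startingbandwidth=0.2, bandwidthincrement=0.05)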
def run_mean_shift(df):
    '''
    INPUT: pandas DataFrame
    OUTPUT: a fitted MeanShift object
    '''
    model = MeanShift(min_bin_freq=10, cluster_all=False, n_jobs=-1)
    return model.fit(df)
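# Minimal usage sketch (assumed): any numeric DataFrame works; with
# cluster_all=False, points far from every kernel get the label -1.
import numpy as np
import pandas as pd

df = pd.DataFrame(np.random.randn(300, 2), columns=['x', 'y'])
fitted = run_mean_shift(df)
print(len(fitted.cluster_centers_), 'clusters;', (fitted.labels_ == -1).sum(), 'orphans')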
def hart85_means_shift_cluster(pair_buffer_df, features):
    from sklearn.cluster import MeanShift, estimate_bandwidth

    # Creating feature vector
    cluster_df = pd.DataFrame()
    if 'active' in features:
        cluster_df['active'] = pd.Series(
            pair_buffer_df.apply(
                lambda row: (np.fabs(row['T1 Active']) + np.fabs(row['T2 Active'])) / 2,
                axis=1),
            index=pair_buffer_df.index)
    if 'reactive' in features:
        cluster_df['reactive'] = pd.Series(
            pair_buffer_df.apply(
                lambda row: (np.fabs(row['T1 Reactive']) + np.fabs(row['T2 Reactive'])) / 2,
                axis=1),
            index=pair_buffer_df.index)
    if 'delta' in features:
        cluster_df['delta'] = pd.Series(
            pair_buffer_df.apply(lambda row: row['T2 Time'] - row['T1 Time'], axis=1),
            index=pair_buffer_df.index)
        # convert nanosecond timedeltas to minutes
        cluster_df['delta'] = cluster_df['delta'].apply(lambda x: int(x) / 6e10)
    if 'hour_of_use' in features:
        cluster_df['hour_of_use'] = pd.DatetimeIndex(pair_buffer_df['T1 Time']).hour
    if 'sd_event' in features:
        # NOTE: `df` is not defined in this function; the power series it refers
        # to must be available in the enclosing scope for this branch to work
        cluster_df['sd_event'] = pd.Series(
            pair_buffer_df.apply(
                lambda row: (df.power[row['T1 Time']:row['T2 Time']]).std(),
                axis=1),
            index=pair_buffer_df.index)

    X = cluster_df.values.reshape((len(cluster_df.index), len(features)))
    ms = MeanShift(bin_seeding=True)
    ms.fit(X)
    labels = ms.labels_
    cluster_centers = ms.cluster_centers_
    labels_unique = np.unique(labels)
    n_clusters_ = len(labels_unique)
    return pd.DataFrame(cluster_centers, columns=features)
def mean(X, save_fig=False, params_labels=None, prefix='clusters'):
    ''' Compute clustering with MeanShift '''
    logger.debug('Calculating MeanShift clusters using %d parameters' % len(X[0]))
    X = np.array(X)
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        bandwidth = estimate_bandwidth(X, quantile=0.2)
        ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
        ms.fit(X)
    labels = ms.labels_
    if save_fig:
        plotClusters(X, ms, method='mean', prefix=prefix, params=params_labels)
    labels_unique = np.unique(labels)
    n_clusters_ = len(labels_unique)
    logger.debug('Found %d clusters with MeanShift algorithm' % n_clusters_)
    return labels
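# Hypothetical call (assumption: a module-level `logger` and the plotClusters
# helper exist, as the function above requires).
import numpy as np
X = np.random.rand(100, 3).tolist()
labels = mean(X, save_fig=False)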
def find_clusters(feature, items, bandwidth=None, min_bin_freq=None,
                  cluster_all=True, n_jobs=1):
    """
    Cluster a list of items based on one feature using the meanshift
    algorithm (binning).

    :param feature: key used to retrieve the value to cluster on
    :param items: list of dict-like items carrying that key
    :param bandwidth: kernel bandwidth (estimated automatically if None)
    :param min_bin_freq: minimum number of points per seed bin
    :param cluster_all: if False, orphan points get label -1
    :return: list of cluster dicts with center, sd and member values
    """
    x = [item[feature] for item in items]
    X = np.array(list(zip(x, np.zeros(len(x)))), dtype=np.float)
    ms = MeanShift(bandwidth=bandwidth, min_bin_freq=min_bin_freq,
                   cluster_all=cluster_all, n_jobs=n_jobs)
    ms.fit(X)
    labels = ms.labels_
    labels_unique = np.unique(labels)
    n_clusters_ = len(labels_unique)
    clusters = []
    for k in range(n_clusters_):
        if k != -1:
            my_members = labels == k
            cluster_center = np.median(X[my_members, 0])
            cluster_sd = np.std(X[my_members, 0])
            clusters.append({
                'center': cluster_center,
                'sd': cluster_sd,
                'items': X[my_members, 0]
            })
    return clusters
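# Illustrative input (assumed): items is any list of dicts with a numeric key;
# the zeros column only lifts the 1-D values into the 2-D shape MeanShift expects.
items = [{'size': v} for v in (1.0, 1.1, 0.9, 5.0, 5.2, 5.1)]
for c in find_clusters('size', items, bandwidth=1.0):
    print(c['center'], c['sd'], len(c['items']))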
def Mean_Shift(path):
    # import the data
    data = pandas.read_csv(filepath_or_buffer=path, delimiter=',', encoding='utf-8')
    data = data.drop_duplicates()
    print(data)
    # read the coordinate values
    values = data[['latitude', 'longitude']].values
    print("printing values")
    print(values)
    # Mean shift
    print("Clustering data with the Meanshift algorithm")
    bandwidth = estimate_bandwidth(values, quantile=0.003, n_samples=None)
    #ms = MeanShift(bandwidth=bandwidth, bin_seeding=True, min_bin_freq=20, cluster_all=False)
    ms = MeanShift(bandwidth=bandwidth, bin_seeding=True, min_bin_freq=25, cluster_all=False)
    ms.fit(values)
    data['cluster'] = ms.labels_
    data = data.sort_values(by='cluster')
    data = data[(data['cluster'] != -1)]
    print(data['cluster'])
    data['cluster'] = data['cluster'].apply(lambda x: "cluster" + str(x))
    labels_unique = np.unique(ms.labels_).tolist()
    del labels_unique[0]  # drop the -1 orphan label
    # Filtering cluster centers according to the data filter
    cluster_centers = DataFrame(ms.cluster_centers_, columns=['latitude', 'longitude'])
    cluster_centers['cluster'] = labels_unique
    print(cluster_centers)
    n_centers_ = len(cluster_centers)
    print("number of clusters is :%d" % n_centers_)
    # print("Exporting clusters to {}...".format(clusters_file))
    data.to_csv(path_or_buf="output/points.csv",
                columns=['user', 'latitude', 'longitude', 'cluster', 'picture', 'datetaken'],
                encoding='utf-8')
    # print("Exporting cluster centers to {}...".format(centers_file))
    cluster_centers['cluster'] = cluster_centers['cluster'].apply(lambda x: "cluster" + str(x))
    cluster_centers.to_csv(path_or_buf="output/centers.csv",
                           columns=['latitude', 'longitude', 'cluster'],
                           encoding='utf-8')
    return 0
def CombinedMeanShift(self, h, alpha, PrincComp=None, njobs=-2, mbf=1):
    """Performs the scikit-learn Mean Shift clustering.

    Arguments:
    h -- the bandwidth
    alpha -- the weight of the principal components as compared
             to the spatial data
    PrincComp -- used to pass already-computed principal components
    njobs -- the number of processes to be used (default: n. of CPU - 1)
    mbf -- the minimum number of items in a seed"""

    MS = MeanShift(bin_seeding=True, bandwidth=h, cluster_all=True,
                   min_bin_freq=mbf, n_jobs=njobs)
    if PrincComp is None:
        PrincComp = self.ShapePCA(2)
    print("Starting sklearn Mean Shift... ")
    stdout.flush()
    fourvector = np.vstack((self.__data, alpha * PrincComp))
    MS.fit_predict(fourvector.T)
    self.__ClusterID = MS.labels_
    self.__c = MS.cluster_centers_.T
    print("done.")
    stdout.flush()
def meanShift(points):
    # perform meanshift clustering of the data; samples are the columns of `points`
    meanshift = MeanShift()
    meanshift.fit(points.T)
    labels = meanshift.labels_
    centers = meanshift.cluster_centers_
    return np.array(labels)
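# Usage sketch (assumed): the function fits on points.T, so it expects features
# along rows and samples along columns.
import numpy as np
pts = np.random.rand(2, 100)  # shape (n_features, n_samples)
labels = meanShift(pts)
print(np.unique(labels))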
def cluster_data(data, clustering_method, num_clusters):
    cluster_centers = labels_unique = labels = extra = None
    if clustering_method == 'KMeans':
        # http://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html#sklearn.cluster.KMeans
        k_means = KMeans(n_clusters=num_clusters, init='k-means++', n_init=10,
                         max_iter=100, tol=0.0001, precompute_distances=True,
                         verbose=0, random_state=None, copy_x=True, n_jobs=1)
        k_means.fit(data)
        labels = k_means.labels_
        cluster_centers = k_means.cluster_centers_
    elif clustering_method == 'MeanShift':
        ms = MeanShift(bin_seeding=True, cluster_all=False)
        ms.fit(data)
        labels = ms.labels_
        cluster_centers = ms.cluster_centers_
    elif clustering_method == 'AffinityPropagation':
        af = AffinityPropagation().fit(data)
        cluster_centers = [data[i] for i in af.cluster_centers_indices_]
        labels = af.labels_
    elif clustering_method == "AgglomerativeClustering":
        n_neighbors = min(10, len(data) // 2)  # kneighbors_graph needs an int
        connectivity = kneighbors_graph(data, n_neighbors=n_neighbors)
        ward = AgglomerativeClustering(n_clusters=num_clusters,
                                       connectivity=connectivity,
                                       linkage='ward').fit(data)
        labels = ward.labels_
    elif clustering_method == "DBSCAN":
        db = DBSCAN().fit(data)
        core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
        core_samples_mask[db.core_sample_indices_] = True
        extra = core_samples_mask
        labels = db.labels_
    if labels is not None:
        labels_unique = np.unique(labels)
    return labels, cluster_centers, labels_unique, extra
def simplify_data1(x):
    X = np.array(zip(x, np.zeros(len(x))), dtype=np.float)
    bandwidth = estimate_bandwidth(X, quantile=0.2)
    ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
    ms.fit(X)
    labels = ms.labels_
    cluster_centers = ms.cluster_centers_
    labels_unique = np.unique(labels)
    n_clusters_ = len(labels_unique)
    #print n_clusters_
    #exit()
    start = 0
    value = 0
    print x
    for k in range(n_clusters_):
        my_members = labels == k
        print "cluster {0}: {1}".format(k, X[my_members, 0]), np.average(X[my_members, 0])
        value = np.average(X[my_members, 0])
        val2 = 0
        for i in xrange(start, start + len(X[my_members, 0])):
            val2 += X[i][0]
            print val2, X[i][0], i
            X[i][0] = value
        print "FINAL", val2 / len(X[my_members, 0])
        start += len(X[my_members, 0])
    return X[:, 0]
def mean_shift_cluster_analysis(x, y, quantile=0.2, n_samples=1000):
    # ADAPTED FROM:
    # http://scikit-learn.org/stable/auto_examples/cluster/plot_mean_shift.html#example-cluster-plot-mean-shift-py
    # The bandwidth can be estimated automatically
    X = np.hstack((x.reshape((x.shape[0], 1)), y.reshape((y.shape[0], 1))))
    bandwidth = estimate_bandwidth(X, quantile=quantile, n_samples=n_samples)
    ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
    ms.fit(X)
    labels = ms.labels_
    cluster_centers = ms.cluster_centers_
    labels_unique = np.unique(labels)
    n_clusters_ = len(labels_unique)
    #print("number of estimated clusters : %d" % n_clusters_)
    colors = 'bgrcmykbgrcmykbgrcmykbgrcmykbgrcmykbgrcmykbgrcmyk'  # cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
    for i in xrange(len(np.unique(labels))):
        my_members = labels == i
        cluster_center = cluster_centers[i]
        plt.scatter(X[my_members, 0], X[my_members, 1], s=90, c=colors[i], alpha=0.7)
        plt.scatter(cluster_center[0], cluster_center[1], marker='+', s=280, c=colors[i])
    tolx = (X[:, 0].max() - X[:, 0].min()) * 0.03
    toly = (X[:, 1].max() - X[:, 1].min()) * 0.03
    plt.xlim(X[:, 0].min() - tolx, X[:, 0].max() + tolx)
    plt.ylim(X[:, 1].min() - toly, X[:, 1].max() + toly)
    plt.show()
    return labels
def centers_y_clusters(self, graph_db, nodes, consulta, cyprop):
    group = []
    todo = []
    rr = []
    for n in nodes:
        tiene = neo4j.CypherQuery(
            graph_db,
            consulta + " where id(n) =" + str(n.id) +
            " return count(distinct(e))" + cyprop + " as cuenta").execute()
        for r in tiene:
            todo.append([r.cuenta])
            rr.append(r.cuenta)
    ms = MeanShift(bin_seeding=True)
    ms.fit(np.asarray(todo))
    labels = ms.labels_
    cluster_centers = sorted(ms.cluster_centers_, key=lambda x: x[0])
    for idx, cl in enumerate(cluster_centers):
        cluster_centers[idx] = float(cl[0])
    for u in cluster_centers:
        group.append([])
    for n in nodes:
        tiene = neo4j.CypherQuery(
            graph_db,
            consulta + " where id(n) =" + str(n.id) +
            " return count(distinct(e))" + cyprop + " as cuenta").execute()
        for r in tiene:
            valor = r.cuenta
        # assign the node to the cluster whose center is closest, using the
        # midpoints between neighbouring centers as boundaries
        for idx, v in enumerate(cluster_centers):
            if idx == 0:
                temp1 = -9999
            else:
                temp1 = (cluster_centers[idx - 1] + cluster_centers[idx]) / 2
            if idx == len(cluster_centers) - 1:
                temp2 = 99999
            else:
                temp2 = (cluster_centers[idx + 1] + cluster_centers[idx]) / 2
            if temp1 <= valor < temp2:
                group[idx].append(n)
    return cluster_centers, group
def BA_meanshift_cluster(mark, chrom):
    '''
    Perform mean shift clustering on 2-D data:
    ((chromStart + chromEnd) * 0.5, chromEnd - chromStart)
    '''
    path = os.path.join(get_data_dir(), "tmp", mark, "{0}-{1}.csv".format(chrom, mark))
    DF = pd.read_csv(path, sep='\t')
    S_x = 0.5 * (DF.loc[:, 'chromEnd'].values + DF.loc[:, 'chromStart'].values)
    S_y = DF.loc[:, 'chromEnd'].values - DF.loc[:, 'chromStart'].values
    X = np.hstack((np.atleast_2d(S_x[7000:8000]).T, np.atleast_2d(S_y[7000:8000]).T))
    print X
    bandwidth = estimate_bandwidth(X, quantile=0.1, n_samples=1000)
    ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
    ms.fit(X)
    labels = ms.labels_
    print list(set(labels))

    import matplotlib.pyplot as plt
    from itertools import cycle
    colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
    for k, col in zip(range(len(list(set(labels)))), colors):
        my_members = labels == k
        plt.plot(X[my_members, 0], X[my_members, 1], col + '.')
    plt.title('Estimated number of clusters: %d' % len(list(set(labels))))
    plt.show()
def make(filename, precision):
    # NOTE: the input path is hard-coded; `filename` is only used for the output
    with open('test.geojson') as f:
        data = json.load(f)
    features = data['features']
    points = [geo['geometry']["coordinates"] for geo in features if pred(geo)]
    print points
    ar_points = array(points).reshape(len(points) * 2, 2)
    print ar_points
    bandwidth = estimate_bandwidth(ar_points) / precision
    cluster = MeanShift(bandwidth=bandwidth)
    cluster.fit(ar_points)
    labels = cluster.labels_
    cluster_centers = cluster.cluster_centers_
    print 'clusters:', len(unique(labels))
    for i, geo in enumerate(filter(pred, features)):
        geo['geometry']["coordinates"] = [
            list(cluster_centers[labels[i * 2 + j]]) for j in range(2)
        ]
    with open(filename, 'w') as f:
        json.dump(data, f)
def do_meanshift(s_path, band1, band2, band3, band4, colour1, colour2,
                 make_plot):
    '''Meanshift clustering to determine the number of clusters in the
    data, which is passed to the KMEANS function'''
    # Truncate data
    X = np.vstack([colour1, colour2]).T
    # Compute clustering with MeanShift
    # Scale data because meanshift generates circular clusters
    X_scaled = preprocessing.scale(X)
    # The following bandwidth can be automatically detected using
    # the routine estimate_bandwidth(X). Bandwidth can also be set manually.
    bandwidth = estimate_bandwidth(X)
    #bandwidth = 0.65
    # Meanshift clustering
    ms = MeanShift(bandwidth=bandwidth, bin_seeding=True, cluster_all=False)
    ms.fit(X_scaled)
    labels_unique = np.unique(ms.labels_)
    objects = ms.labels_[ms.labels_ >= 0]
    n_clusters = len(labels_unique[labels_unique >= 0])
    # Make plot
    if "meanshift" in make_plot:
        make_ms_plots(s_path, colour1, colour2, n_clusters, X, ms,
                      band1, band2, band3, band4, objects)
    return (n_clusters, bandwidth)
def ms_algo(X, bandwidth=None):
    if bandwidth is None:
        n_samples = X.shape[0]
        bandwidth = estimate_bandwidth(X, quantile=0.2, n_samples=n_samples)
    # Apply the meanshift algorithm from the sklearn library
    ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
    ms.fit(X)
    # Collect the labels and the cluster centers from the fitted model
    labels = ms.labels_
    cluster_centers = ms.cluster_centers_
    labels_unique = np.unique(labels)
    n_clusters_ = len(labels_unique)  # number of clusters
    # Print section
    print("The number of clusters is: %d" % n_clusters_)
    print("The centers are:")
    for i in range(n_clusters_):
        print(i, cluster_centers[i])
    return cluster_centers
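# Toy example (not in the source): with bandwidth=None the function estimates
# the bandwidth itself from the data.
import numpy as np
from sklearn.datasets import make_blobs

X, _ = make_blobs(n_samples=300, centers=3, random_state=0)
centers = ms_algo(X)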
def meanshift(raw_data, t):
    # Compute clustering with MeanShift
    # The bandwidth can be estimated automatically
    #data = [[(raw_data[i, 1] + raw_data[i, 5]), (raw_data[i, 2] + raw_data[i, 6])] for i in range(raw_data.shape[0])]
    data = np.zeros((raw_data.shape[0], 2))
    X = raw_data[:, 1] + raw_data[:, 5]
    Y = raw_data[:, 2] + raw_data[:, 6]
    #X = raw_data[:, 1]; Y = raw_data[:, 2]
    data = np.transpose(np.concatenate((np.mat(X), np.mat(Y)), axis=0))
    bandwidth = estimate_bandwidth(data, quantile=0.2, n_samples=500)
    ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
    ms.fit(data)
    labels = ms.labels_
    cluster_centers = ms.cluster_centers_
    labels_unique = np.unique(labels)
    n_clusters_ = len(labels_unique)
    print("number of estimated clusters : %d" % n_clusters_)

    # Plot result
    plt.figure(t)
    plt.clf()
    colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
    for k, col in zip(range(n_clusters_), colors):
        my_members = labels == k
        cluster_center = cluster_centers[k]
        plt.plot(data[my_members, 0], data[my_members, 1], col + '.')
        plt.plot(cluster_center[0], cluster_center[1], 'o',
                 markerfacecolor=col, markeredgecolor='k', markersize=14)
    plt.title('Estimated number of clusters: %d' % n_clusters_)
    plt.axis('equal')
    plt.show()
def mean_shift(X):
    bandwidth = estimate_bandwidth(X, quantile=0.2, n_samples=1000)
    ms = MeanShift(bandwidth=bandwidth, bin_seeding=True, cluster_all=False)
    ms.fit(X)
    labels = ms.labels_
    cluster_centers = ms.cluster_centers_
    return labels, cluster_centers
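# Minimal usage sketch (assumed): because cluster_all=False, label -1 marks
# orphan points that fall outside every kernel.
import numpy as np
from sklearn.datasets import make_blobs

X, _ = make_blobs(n_samples=500, centers=4, cluster_std=0.6, random_state=0)
labels, centers = mean_shift(X)
print(len(np.unique(labels[labels >= 0])), 'clusters')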
def train(trainingData, pklFile, clusteringAll, numberOfClusters=None):
    # ========================================================================= #
    # =============== STEP 1. DEFINE OUTPUT LEARNT MODEL FILE ================= #
    # ========================================================================= #
    if pklFile == '':
        os.system('rm -rf learntModel & mkdir learntModel')
        pklFile = 'learntModel/learntModel.pkl'

    # ========================================================================= #
    # =============== STEP 2. PERFORM CLUSTERING TO THE DATA ================== #
    # ========================================================================= #
    if numberOfClusters is None:
        print "Running MeanShift Model..."
        bandwidth = estimate_bandwidth(trainingData)
        ms = MeanShift(bandwidth=bandwidth, bin_seeding=False,
                       cluster_all=clusteringAll)
        ms.fit(trainingData)
        joblib.dump(ms, pklFile)
        return {"numberOfClusters": len(ms.cluster_centers_),
                "labels": ms.labels_,
                "clusterCenters": ms.cluster_centers_}
    else:
        print "Running K-Means Model..."
        kMeans = KMeans(init='k-means++', n_clusters=numberOfClusters)
        kMeans.fit(trainingData)
        joblib.dump(kMeans, pklFile)
        return {"numberOfClusters": len(kMeans.cluster_centers_),
                "labels": kMeans.labels_,
                "clusterCenters": kMeans.cluster_centers_}
def weekhour(lst, day, hour, num):
    l = []
    for dicts in lst:
        latlong = dicts["latlong"]
        l.append(latlong)
    l = np.array(l)
    # keep only points inside the lat/long bounding box
    l = np.array([x for x in l if x[0] < 40])
    l = np.array([x for x in l if x[1] < -102.0])
    l = np.array([x for x in l if x[0] > 39])
    l = np.array([x for x in l if x[1] > -105.5])
    bandwidth = .001
    ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
    ms.fit(l)
    labels = ms.labels_
    cluster_centers = ms.cluster_centers_
    labels_unique = np.unique(labels)
    n_clusters_ = len(labels_unique)
    colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
    for k, col in zip(range(n_clusters_), colors):
        my_members = labels == k
        cluster_center = cluster_centers[k]
        plt.plot(l[my_members, 1], l[my_members, 0], col + '.')
        plt.plot(cluster_center[1], cluster_center[0], 'x', markerfacecolor=col,
                 markeredgecolor='k', markersize=14)
    num_samples = len(labels)
    list_clust_cents = cluster_centers.tolist()
    num_labels = Counter(labels).most_common()
    top = tuple(num_labels)
    if num > n_clusters_:
        num = n_clusters_
    for i in range(num):
        densest = top[i][1]
        percent = round((float(densest) / float(num_samples)) * 100, 3)
        if densest >= 60:
            import geocoder
            g = geocoder.google(list_clust_cents[i], method='reverse')
            address = g.address
        else:
            address = 0
        with open('weekdayclusterstest.csv', 'a') as csvfile:
            fieldnames = ['day', 'hour', 'densest cluster', 'address', 'percent',
                          'number of samples', 'number of estimated clusters']
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            # NOTE: the header row is re-written on every append
            writer.writeheader()
            writer.writerow({'densest cluster': densest,
                             'day': day,
                             'hour': hour,
                             'address': address,
                             'percent': percent,
                             'number of samples': num_samples,
                             'number of estimated clusters': n_clusters_})
def meanshiftUsingPCA(path):
    # Load original image given the image path
    im = cv.LoadImageM(path)
    # convert image to YUV color space
    cv.CvtColor(im, im, cv.CV_BGR2YCrCb)
    # Load bank of filters
    filterBank = lmfilters.loadLMFilters()
    # Resize image to decrease dimensions during clustering
    resize_factor = 1
    thumbnail = cv.CreateMat(im.height / resize_factor,
                             im.width / resize_factor, cv.CV_8UC3)
    cv.Resize(im, thumbnail)
    # now work with the resized thumbnail image
    response = np.zeros(shape=((thumbnail.height) * (thumbnail.width), 51),
                        dtype=float)
    for f in xrange(0, 48):
        filter = filterBank[f]
        # Resize the filter with the same factor as the resized image
        dst = cv.CreateImage(cv.GetSize(thumbnail), cv.IPL_DEPTH_32F, 3)
        resizedFilter = cv.CreateMat(filter.height / resize_factor,
                                     filter.width / resize_factor, filter.type)
        cv.Resize(filter, resizedFilter)
        # Apply the current filter
        cv.Filter2D(thumbnail, dst, resizedFilter)
        for j in xrange(0, thumbnail.height):
            for i in xrange(0, thumbnail.width):
                # Select the max. along the three channels
                maxRes = max(dst[j, i])
                if math.isnan(maxRes):
                    maxRes = 0.0
                if maxRes > response[thumbnail.width * j + i, f]:
                    # Store the max. response for the given feature index
                    response[thumbnail.width * j + i, f] = maxRes

    # YUV features
    count = 0
    for j in xrange(0, thumbnail.height):
        for i in xrange(0, thumbnail.width):
            response[count, 48] = thumbnail[j, i][0]
            response[count, 49] = thumbnail[j, i][1]
            response[count, 50] = thumbnail[j, i][2]
            count += 1

    # get the first 4 principal components using pca
    pca = PCA(response)
    pcaResponse = zeros([thumbnail.height * thumbnail.width, 4])
    for i in xrange(0, thumbnail.height * thumbnail.width):
        pcaResponse[i] = pca.getPCA(response[i], 4)

    # Create new mean shift instance
    ms = MeanShift(bandwidth=10, bin_seeding=True)
    # Apply the mean shift clustering algorithm
    ms.fit(pcaResponse)
    labels = ms.labels_
    n_clusters_ = np.unique(labels)
    print "Number of clusters: ", len(n_clusters_)
    repaintImage(thumbnail, labels)
    cv.Resize(thumbnail, im)
    return im
def do_meanshift(band1, band2, band3, band4, colour1, colour2, make_plots):
    '''Does meanshift clustering to determine a number of clusters in the
    data, which is passed to the KMEANS function'''
    data = np.loadtxt(inputdata)

    # Input checking
    #if band1 == band2 or band3 == band4:
    #    print "Not a good idea to use the same band in one colour, try again"
    #    return
    #for band in [band1, band2, band3, band4]:
    #    if band not in band_names.keys():
    #        print "Can't find %s in band_name list" % band
    #        return

    # Import 4 different wavelengths
    # Colour 1: 05_mag
    wave1 = data[:, band_names[band1]]
    wave2 = data[:, band_names[band2]]
    # Colour 2: 05_mag
    wave3 = data[:, band_names[band3]]
    wave4 = data[:, band_names[band4]]

    # Remove data pieces with no value
    gooddata1 = np.logical_and(np.logical_and(wave1 != badval, wave2 != badval),
                               np.logical_and(wave3 != badval, wave4 != badval))
    gooddata2 = np.logical_and(np.logical_and(wave1 < maglim, wave2 < maglim),
                               np.logical_and(wave3 < maglim, wave4 < maglim))
    greatdata = np.logical_and(gooddata1, gooddata2)
    colour1 = wave1[greatdata] - wave2[greatdata]
    colour2 = wave3[greatdata] - wave4[greatdata]

    # Truncate data
    X = np.vstack([colour1, colour2]).T
    # Scale data because meanshift generates circular clusters
    X_scaled = preprocessing.scale(X)
    # The following bandwidth can be automatically detected using
    # the routine estimate_bandwidth(). Bandwidth can also be set as a value.
    bandwidth = estimate_bandwidth(X)
    # Meanshift clustering
    ms = MeanShift(bandwidth=bandwidth, bin_seeding=True, cluster_all=False)
    ms.fit(X_scaled)
    labels_unique = np.unique(ms.labels_)
    n_clusters = len(labels_unique[labels_unique >= 0])
    # Make plot of clusters if needed
    if "MSplot" in make_plots:
        make_ms_plots(colour1, colour2, n_clusters, X, ms,
                      band1, band2, band3, band4)
    return (n_clusters)
def meanShiftClustering(centers_df, subject):
    # Estimate the bandwidth to use with the mean shift algorithm. The quantile
    # represents the distance used between the box centers to define a cluster:
    # a smaller quantile means a smaller distance between points that end up in
    # the same cluster.
    centers_df = centers_df.reset_index()
    bandwidth = estimate_bandwidth(
        centers_df[['center_x', 'center_y']].as_matrix(), quantile=0.0055)
    # instantiate the mean shift algorithm
    ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
    # fit the algorithm on the box center coordinates
    ms.fit(centers_df[['center_x', 'center_y']])
    # get the resulting cluster labels
    labels = ms.labels_
    # get the resulting centers of each *cluster*
    cluster_centers = ms.cluster_centers_
    labels_unique = np.unique(labels)
    # the number of clusters is the length of the list of unique labels
    n_clusters_ = len(labels_unique)
    # concatenate the centers data frame (which contains all the box coordinates,
    # their dimensions, and their centers) with the labels generated by the clustering
    boxes_df = pd.concat(
        [centers_df, pd.DataFrame(labels, columns=['cluster_label'])], axis=1)
    # the aggregate function in the groupby includes two functions: count and median
    f = {'Number of boxes in a cluster': ['count'], 'Median': ['median']}
    # group by the label of each cluster and aggregate the boxes' top-left
    # coordinates and dimensions by applying the median
    aggregated_df = boxes_df.groupby('cluster_label')[
        'cluster_label', 'tl_x', 'tl_y', 'width', 'height'].agg(f).reset_index()
    # change column names to more descriptive ones
    aggregated_df.columns = ['cluster_label', 'median_cluster_label', 'agg_tl_x',
                             'agg_tl_y', 'agg_width', 'agg_height',
                             'boxes_in_cluster', 'count_tl_x', 'count_tl_y',
                             'count_width', 'count_height']
    # leave out the unnecessary columns
    aggregated_df = aggregated_df[['cluster_label', 'agg_tl_x', 'agg_tl_y',
                                   'agg_width', 'agg_height', 'boxes_in_cluster']]
    # Look at the output of the plotBoxes function (svg file) and determine at
    # which THRESHOLD value there is a desired number of clusters (shown at the
    # top of the plot) that visually matches the actual grid
    THRESHOLD = 5
    # filter out all the clusters that have fewer than a certain number of boxes;
    # use the old-weather-aggregator-with-plot.py script to check the best threshold
    aggregated_df = aggregated_df.loc[aggregated_df.boxes_in_cluster > THRESHOLD, :]
    good_clusters = np.unique(aggregated_df.cluster_label.values)
    print "for subject_id:" + str(subject)
    print "number of estimated clusters overall: %d" % n_clusters_
    print "number of estimated clusters, after small clusters were filtered out: %d" % len(good_clusters)
    print "clusters with more than %d boxes per cluster:" % THRESHOLD
    print aggregated_df.columns
    print aggregated_df.head()
    # save the aggregated boxes and their clusters into a csv file, one per subject
    print "Saving the output/aggregated_df_%s.csv file..." % str(subject)
    aggregated_df.to_csv("output/aggregated_df_" + str(subject) + ".csv", index=False)
    # keep only the boxes that belong to good_clusters (those with more boxes
    # than the threshold) in boxes_df, then save the dataframe
    boxes_df = boxes_df.loc[boxes_df['cluster_label'].isin(good_clusters), :]
    print "Saving the output/clustered_df_%s.csv file..." % str(subject)
    boxes_df.to_csv("output/clustered_df_" + str(subject) + ".csv", index=False)
    plotBoxes(aggregated_df, boxes_df, cluster_centers)
def checkForClustering(catalog):
    debug("Checking for data clustering")
    Xfull = catalog.view(np.float64).reshape(catalog.shape + (-1,))[:, 1:]
    X = Xfull[:, 2:]

    debug("Using DBSCAN")
    db = DBSCAN(eps=0.3, min_samples=10).fit(X)
    core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
    core_samples_mask[db.core_sample_indices_] = True
    labels = db.labels_
    n_clusters_DBSCAN = len(set(labels)) - (1 if -1 in labels else 0)
    debug('Estimated number of clusters with DBSCAN: %d' % n_clusters_DBSCAN)
    unique_labelsDBSCAN = set(labels)
    colorsDBSCAN = plt.cm.rainbow(np.linspace(0, 1, len(unique_labelsDBSCAN)))

    debug("Estimating clusters using MeanShift")
    bandwidth = estimate_bandwidth(X, quantile=0.2, n_samples=500)
    ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
    ms.fit(X)
    labelsMS = ms.labels_
    cluster_centers = ms.cluster_centers_
    labels_uniqueMS = np.unique(labelsMS)
    n_clusters_MS = len(labels_uniqueMS)
    debug("Estimated number of clusters with MeanShift: %d" % n_clusters_MS)

    # Plot result
    fig = plt.figure(figsize=(12, 12))
    ax0 = fig.add_subplot(2, 2, 1)
    ax1 = fig.add_subplot(2, 2, 2)
    ax2 = fig.add_subplot(2, 2, 3)
    ax3 = fig.add_subplot(2, 2, 4)
    for k, col in zip(unique_labelsDBSCAN, colorsDBSCAN):
        if k == -1:
            col = 'k'
        class_member_mask = (labels == k)
        mask = class_member_mask & core_samples_mask
        xy = Xfull[mask]
        ax0.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=col,
                 markeredgecolor='k', markersize=5)
        ax2.plot(catalog['MAG_APER(1)'][mask], catalog['CLASS_STAR'][mask], 'o',
                 markerfacecolor=col, markeredgecolor='k', markersize=5)
        xy = Xfull[class_member_mask & ~core_samples_mask]
        ax0.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=col,
                 markeredgecolor='k', markersize=5)
        ax2.plot(catalog['MAG_APER(1)'][class_member_mask & ~core_samples_mask],
                 catalog['CLASS_STAR'][class_member_mask & ~core_samples_mask],
                 'o', markerfacecolor=col, markeredgecolor='k', markersize=5)
    ax0.set_title('DBSCAN: # clusters: %d' % n_clusters_DBSCAN)

    colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
    for k, col in zip(range(n_clusters_MS), colors):
        my_members = labelsMS == k
        cluster_center = cluster_centers[k]
        ax1.plot(Xfull[my_members, 0], Xfull[my_members, 1], col + '.')
        ax3.plot(catalog['MAG_APER(1)'][my_members],
                 catalog['CLASS_STAR'][my_members], col + '.')
        #ax1.plot(cluster_center[0], cluster_center[1], 'o', markerfacecolor=col,
        #         markeredgecolor='k', markersize=14)
    ax1.set_title('MeanShift: # clusters: %d' % n_clusters_MS)
    plt.show()
pred = {'semantics': [], 'instances': []}
with torch.no_grad():
    for i, batch in enumerate(tqdm(loader, ascii=True)):
        points = batch['points'].to(device)
        labels = batch['labels']
        size = batch['size']
        logits, embedded = model(points)
        logits = logits.cpu().numpy()
        semantics = np.argmax(logits, axis=-1)
        instances = []
        embedded = embedded.cpu().numpy()
        batch_size = embedded.shape[0]
        for b in range(batch_size):
            k = size[b].item()
            # cluster the embedding of each sample to obtain instance labels
            y = MeanShift(args['bandwidth'], n_jobs=8).fit_predict(embedded[b])
            instances.append(y)
        instances = np.stack(instances)
        pred['semantics'].append(semantics)
        pred['instances'].append(instances)
pred['semantics'] = np.concatenate(pred['semantics'], axis=0)
pred['instances'] = np.concatenate(pred['instances'], axis=0)
fname = os.path.join(logdir, 'pred.npz')
print('> Saving predictions to {}...'.format(fname))
np.savez(fname, **pred)
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.cluster import AffinityPropagation
from sklearn.cluster import MeanShift
from sklearn.cluster import SpectralClustering
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import DBSCAN
from sklearn.cluster import Birch
from sklearn.mixture import GaussianMixture

names = ["K-Means", "Affinity Propagation", "Spectral Clustering", "Mean Shift",
         "Agglomerative Clustering", "DBSCAN", "Birch"]
clusters = [
    KMeans(n_clusters=7, random_state=1),
    AffinityPropagation(damping=0.5, max_iter=200, convergence_iter=15,
                        copy=True, preference=None, affinity='euclidean',
                        verbose=False),
    SpectralClustering(n_clusters=7, assign_labels="discretize", random_state=1),
    MeanShift(bandwidth=2),
    AgglomerativeClustering(n_clusters=7, affinity='euclidean', memory=None,
                            connectivity=None, compute_full_tree='auto',
                            linkage='ward', pooling_func='deprecated'),
    DBSCAN(eps=0.5, min_samples=5, metric='euclidean', metric_params=None,
           algorithm='auto', leaf_size=30, p=None, n_jobs=None),
    Birch(threshold=0.5, branching_factor=50, n_clusters=7,
          compute_labels=True, copy=True)
]

# read the data and create the feature & label variables
data = pd.read_csv('glass_data_labeled.csv')
X = data[['RI', 'Na', 'Mg', 'Al', 'Si', 'K', 'Ca', 'Ba', 'Fe']]
y = data['Type']
# print(X)
# print(y)

for name, cl in zip(names, clusters):
    labels = cl.fit(X).labels_
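# Possible continuation (an assumption, not part of the source): since the true
# glass types `y` are available, each clustering could be scored against them.
from sklearn.metrics import adjusted_rand_score

for name, cl in zip(names, clusters):
    print(name, adjusted_rand_score(y, cl.fit(X).labels_))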
def ClusterScikit(model_file, train_file, valid_file, ftype, nsamples,
                  vectorizer, reducer, param):
    train_programs, train_features, train_classes = read_traces(train_file, nsamples)
    train_size = len(train_programs)
    print("using", train_size, "examples to train.")

    if vectorizer == "bow":
        train_dict = dict()
        train_dict[ftype] = train_features
        #batch_size = 16
        #window_size = 20
        print("Transforming data and fitting model..")
        model = make_cluster_pipeline_bow(ftype, reducer)
        X_red = model.fit_transform(train_dict)
    elif vectorizer == "doc2vec":
        from gensim.models.doc2vec import TaggedDocument
        from gensim.models import Doc2Vec
        print("Vectorizing traces..")
        sentences = []
        for (prog, trace) in zip(train_programs, train_features):
            sentences.append(TaggedDocument(trace.split(" "), [prog]))
        model = Doc2Vec(dm=2, min_count=1, window=5, size=100, sample=1e-4,
                        negative=5, workers=8, iter=1)
        model.build_vocab(sentences)
        for epoch in range(20):
            model.train(sentences)
            shuffle(sentences)
        train_dict = dict()
        vec_train_features = []
        for prog in train_programs:
            # print(prog, model.docvecs[prog])
            vec_train_features.append(model.docvecs[prog])
        train_dict[ftype] = vec_train_features
        print("Transforming data and fitting model..")
        model = make_cluster_pipeline_doc2vec(ftype, reducer)
        X_red = model.fit_transform(train_dict)

    #pl.rcParams.update({'font.size': 10})
    if isinstance(X_red, list):
        X_red = np.vstack(X_red)
        print(X_red.shape)

    if X_red.shape[1] == 2:
        plt.figure()
        colors = 'brgcmykbgrcmykbgrcmykbgrcmyk'
        ncolors = len(colors)
        for prog, [x, y], cl in zip(train_programs, X_red, train_classes):
            x = gauss(0, 0.1) + x
            y = gauss(0, 0.1) + y
            try:
                plt.scatter(x, y, c=colors[int(cl)])
                plt.text(x, y + 0.02, prog.split("/")[-1])
            except ValueError:
                plt.text(x, y + 0.02, cl)
        if valid_file is not None:
            valid_programs, valid_features, valid_classes = read_traces(valid_file, None)
            valid_dict = dict()
            valid_dict[ftype] = valid_features
            X_red = model.transform(valid_dict)
            for prog, [x, y], cl in zip(valid_programs, X_red, valid_classes):
                x = gauss(0, 0.1) + x
                y = gauss(0, 0.1) + y
                plt.scatter(x, y, c=colors[cl + 1])
                plt.text(x, y + 0.02, prog.split("/")[-1])
        # plt.show()
        plt.savefig(train_file.replace(".gz", "") + ".png")

    from sklearn.cluster import MeanShift, estimate_bandwidth
    bandwidth = estimate_bandwidth(X_red, quantile=0.2)
    print("Clustering with bandwidth:", bandwidth)
    af = MeanShift(bandwidth=bandwidth * param).fit(X_red)
    cluster_centers = af.cluster_centers_
    labels = af.labels_
    n_clusters_ = len(cluster_centers)

    if X_red.shape[1] == 2:
        plt.close('all')
        plt.figure(1)
        plt.clf()
        for ([x, y], label, cluster_label) in zip(X_red, train_programs, labels):
            x = gauss(0, 0.1) + x
            y = gauss(0, 0.1) + y
            plt.scatter(x, y, c=colors[cluster_label % ncolors])
        for i, [x, y] in enumerate(cluster_centers):
            plt.plot(x, y, 'o', markerfacecolor=colors[i % ncolors],
                     markeredgecolor='k', markersize=7)
        plt.title('Estimated number of clusters: %d' % n_clusters_)
        plt.savefig(train_file.replace(".gz", "") + ".clusters.png")
        # plt.show()

    clustered_traces = zip(train_programs, labels)
    writer = write_csv(train_file.replace(".gz", "") + ".clusters")
    for label, cluster in clustered_traces:
        writer.writerow([label.split("/")[-1], cluster])
# Estimate bandwidth
# The bandwidth grows only slightly as the number of samples increases,
# so there is little visible difference between sample sizes
bandwidth1 = estimate_bandwidth(flat_image, quantile=.1, n_samples=500)
bandwidth2 = estimate_bandwidth(flat_image, quantile=.2, n_samples=500)
#bandwidth3 = estimate_bandwidth(flat_image, quantile=.1, n_samples=1000)
#bandwidth4 = estimate_bandwidth(flat_image, quantile=.3, n_samples=1000)
bandwidth3 = estimate_bandwidth(flat_image, quantile=.3, n_samples=500)
bandwidth4 = estimate_bandwidth(flat_image, quantile=.4, n_samples=500)
print(bandwidth1)
print(bandwidth2)
print(bandwidth3)
print(bandwidth4)

ms1 = MeanShift(bandwidth1, bin_seeding=True)
ms2 = MeanShift(bandwidth2, bin_seeding=True)
ms3 = MeanShift(bandwidth3, bin_seeding=True)
ms4 = MeanShift(bandwidth4, bin_seeding=True)
#print(ms1)

# Performing meanshift on flatImg
ms1.fit(flat_image)
ms2.fit(flat_image)
ms3.fit(flat_image)
ms4.fit(flat_image)

# (r,g,b) vectors corresponding to the different clusters after meanshift
labels1 = ms1.labels_
labels2 = ms2.labels_
labels3 = ms3.labels_
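# Hedged follow-up (assumption: `orig_shape` is the (H, W, 3) shape of the image
# that was flattened into flat_image): one way to compare the segmentations is
# to recolor each pixel with its cluster center and reshape back to the image.
import numpy as np
segmented1 = ms1.cluster_centers_[labels1].reshape(orig_shape).astype(np.uint8)
segmented2 = ms2.cluster_centers_[labels2].reshape(orig_shape).astype(np.uint8)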
import numpy as np
from sklearn.cluster import MeanShift, estimate_bandwidth
from scipy.fftpack import dct
import matplotlib.pyplot as plt
from PIL import Image

image = Image.open(r'C:\Users\Cris\Desktop\mountain_color.jpg')  # raw string: '\U' would be an escape
image = np.array(image)

# Need to convert the image into a feature array based on rgb intensities
flat_image = np.reshape(image, [-1, 3])

# Estimate bandwidth
bandwidth2 = estimate_bandwidth(flat_image, quantile=.2, n_samples=5000)
ms = MeanShift(bandwidth2, bin_seeding=True)
ms.fit(flat_image)
labels = ms.labels_

# Example of how to use the discrete cosine transform.
# We will apply it to luminance, rather than labels.
discrete_cosine_transform = dct(np.array(labels, dtype='float'))

np.savetxt(r'C:\Users\Cris\Desktop\labels.csv', labels, delimiter=',')

# Plot image vs segmented image
plt.figure(2)
plt.subplot(2, 1, 1)
plt.imshow(image)
plt.axis('off')
# "DBScan": DBSCAN(), # "OPTICS": OPTICS() # } # # fit_predict method for each algorithm - because DBScan and OPTICS doesn't have predict() method # fit_predict = { # "k-Means": lambda clr, X_train, X_test: clr.fit(X_train).predict(X_test), # "MeanShift": lambda clr, X_train, X_test: clr.fit(X_train).predict(X_test), # "DBScan": lambda clr, _, X_test: clr.fit_predict(X_test), # "OPTICS": lambda clr, _, X_test: clr.fit_predict(X_test) # } # Algorithms clrs = { "MyMeanShift": MyMeanShift(), "MeanShift": MeanShift( ), # if not given, the bandwidth is estimated using sklearn.cluster.estimate_bandwidth } # fit_predict method for each algorithm - because DBScan and OPTICS doesn't have predict() method fit_predict = { "MyMeanShift": lambda clr, X_train, X_test: clr.fit(X_train).predict(X_test), "MeanShift": lambda clr, X_train, X_test: clr.fit(X_train).predict(X_test), } # Measures measures = { "adjusted_rand_score": adjusted_rand_score, "completeness_score": completeness_score, "homogeneity_score": homogeneity_score, "v_measure_score": v_measure_score,
def test_meanshift_predict():
    # Test MeanShift.predict
    ms = MeanShift(bandwidth=1.2)
    labels = ms.fit_predict(X)
    labels2 = ms.predict(X)
    assert_array_equal(labels, labels2)
def test_unfitted():
    # Non-regression: before fit, there should be no fitted attributes.
    ms = MeanShift()
    assert not hasattr(ms, "cluster_centers_")
    assert not hasattr(ms, "labels_")
def main():
    # generate theme
    sg.theme('DarkAmber')
    # All the stuff inside your window.
    layout = [[sg.Image(filename='', key='-frame-'),
               sg.Image(filename='', key='-model-')],
              [sg.Button('Learn Model'), sg.Button('Close')]]
    # Create the Window
    window = sg.Window('SIFT model Learning GUI', layout,
                       location=(800, 400), finalize=True)

    # init frame acquisition
    cam = cv2.VideoCapture(0)
    print("Camera init -> DONE")

    # init feature detector SIFT
    SIFT = cv2.SIFT_create()
    # init feature matcher KNN
    matcher = cv2.DescriptorMatcher_create("BruteForce")
    ratio_thresh = 0.80

    # init state frame and model
    ret, scene_img_RGB = cam.read()
    model_img = np.zeros((480, 640, 3))
    # init model keypoints and descriptors
    kp_obj = None
    des_obj = None

    # Event Loop to process "events" and get the "values" of the inputs
    while True:
        # read state windows
        event, values = window.read(timeout=0, timeout_key='timeout')
        # get state
        ret, frame = cam.read()
        scene_img_RGB = frame

        # start / stop the application
        if event == 'Close' or event is None:
            # kill thread and close window
            cam.release()
            cv2.destroyAllWindows()
            # stop program
            break

        # Learn model
        if event == 'Learn Model' or event is None:
            # get object ROI
            roi = cv2.selectROI(scene_img_RGB)
            model_img = scene_img_RGB[int(roi[1]):int(roi[1] + roi[3]),
                                      int(roi[0]):int(roi[0] + roi[2])]
            cv2.destroyAllWindows()
            # find features in the object ROI
            kp_obj, des_obj = SIFT.detectAndCompute(
                cv2.cvtColor(model_img, cv2.COLOR_BGR2GRAY), None)
            des_obj /= (des_obj.sum(axis=1, keepdims=True) + 1e-7)
            des_obj = np.sqrt(des_obj)
            # draw the detected features in the ROI
            model_img = cv2.drawKeypoints(
                model_img, kp_obj, None,
                flags=cv2.DRAW_MATCHES_FLAGS_DRAW_RICH_KEYPOINTS)

        # perform object detection in the current state
        if model_img is not None and kp_obj is not None and des_obj is not None:
            # convert frame to gray
            scene_img = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
            # get features in the scene
            kp_scene, des_scene = SIFT.detectAndCompute(scene_img, None)
            des_scene /= (des_scene.sum(axis=1, keepdims=True) + 1e-7)
            des_scene = np.sqrt(des_scene)
            # match features with the template using KNN matching (norm L2)
            knn_matches = matcher.knnMatch(des_obj, des_scene, 2)
            # filter matches (Lowe ratio)
            good_matches = []
            for m, n in knn_matches:
                if m.distance < ratio_thresh * n.distance:
                    good_matches.append(m)
            # create empty keypoint position vectors for all good matches
            obj = np.empty((len(good_matches), 2), dtype=np.float32)
            scene = np.empty((len(good_matches), 2), dtype=np.float32)
            # update keypoint positions
            for i in range(len(good_matches)):
                obj[i, 0] = kp_obj[good_matches[i].queryIdx].pt[0]
                obj[i, 1] = kp_obj[good_matches[i].queryIdx].pt[1]
                scene[i, 0] = kp_scene[good_matches[i].trainIdx].pt[0]
                scene[i, 1] = kp_scene[good_matches[i].trainIdx].pt[1]
            if scene.shape[0] > 10:
                # compute bandwidth for the clustering
                bandwidth = estimate_bandwidth(scene, quantile=0.2)
                # compute clusters of keypoints
                meanShift = MeanShift(bandwidth=bandwidth, bin_seeding=True)
                meanShift.fit(scene)
                labels = meanShift.labels_
                clusterCenters = meanShift.cluster_centers_
                # compute pose using cluster labels
                for c in range(len(clusterCenters)):
                    currentCluster = labels == c
                    objPoint = obj[currentCluster, :]
                    scenePoint = scene[currentCluster, :]
                    # if the cluster point count exceeds a threshold e=10
                    if scenePoint.shape[0] > 10:
                        # estimate the homographic transformation
                        TF, mask = cv2.findHomography(objPoint, scenePoint,
                                                      cv2.RANSAC, 0.99)
                        if TF is not None and mask[mask == 1].size > 15:
                            # project the object corners into the scene according
                            # to the homographic transformation
                            h, w, c = model_img.shape
                            pts = np.float32([[0, 0], [0, h - 1], [w - 1, h - 1],
                                              [w - 1, 0]]).reshape(-1, 1, 2)
                            dst = cv2.perspectiveTransform(pts, TF)
                            for i in range(scenePoint.shape[0]):
                                scene_img_RGB = cv2.circle(
                                    scene_img_RGB,
                                    (scenePoint[i, 0], scenePoint[i, 1]),
                                    5, [0, 255, 255], -1)
                            scene_img_RGB = cv2.polylines(
                                scene_img_RGB, [np.int32(dst)], True,
                                (0, 255, 0), 20, cv2.LINE_AA)

        # update images on the GUI
        imgbytes_frame = cv2.imencode(
            '.png', cv2.resize(scene_img_RGB, (640, 480),
                               cv2.INTER_LINEAR))[1].tobytes()
        imgbytes_model = cv2.imencode('.png', model_img)[1].tobytes()
        window['-frame-'].update(data=imgbytes_frame)
        window['-model-'].update(data=imgbytes_model)

    window.close()
input_file = 'sales.csv'
file_reader = csv.reader(open(input_file, 'r'), delimiter=',')

X = []
for count, row in enumerate(file_reader):
    if not count:
        names = row[1:]
        continue
    X.append([float(x) for x in row[1:]])
X = np.array(X)

bandwidth = estimate_bandwidth(X, quantile=0.8, n_samples=len(X))
meanshift_model = MeanShift(bandwidth=bandwidth, bin_seeding=True)
meanshift_model.fit(X)

labels = meanshift_model.labels_
cluster_centers = meanshift_model.cluster_centers_
num_clusters = len(np.unique(labels))

print("\nNumber of clusters in input data =", num_clusters)
print("\nCenters of clusters:")
print('\t'.join([name[:3] for name in names]))
for cluster_center in cluster_centers:
    print('\t'.join([str(int(x)) for x in cluster_center]))

cluster_centers_2d = cluster_centers[:, 1:3]
plt.figure()
def get_cluster(bandwidth, X):
    """
    https://scikit-learn.org/stable/modules/generated/sklearn.cluster.MeanShift.html

    MeanShift doc:
    Mean shift clustering using a flat kernel.
    Mean shift clustering aims to discover "blobs" in a smooth density of
    samples. It is a centroid-based algorithm, which works by updating
    candidates for centroids to be the mean of the points within a given
    region. These candidates are then filtered in a post-processing stage
    to eliminate near-duplicates to form the final set of centroids.
    Seeding is performed using a binning technique for scalability.

    Parameters
    ----------
    bandwidth : float, default=None
        Bandwidth used in the RBF kernel. If not given, the bandwidth is
        estimated using sklearn.cluster.estimate_bandwidth; see the
        documentation for that function for hints on scalability (see also
        the Notes, below).
    seeds : array-like of shape (n_samples, n_features), default=None
        Seeds used to initialize kernels. If not set, the seeds are
        calculated by clustering.get_bin_seeds with bandwidth as the grid
        size and default values for other parameters.
    bin_seeding : bool, default=False
        If true, initial kernel locations are not locations of all points,
        but rather the location of the discretized version of points, where
        points are binned onto a grid whose coarseness corresponds to the
        bandwidth. Setting this option to True will speed up the algorithm
        because fewer seeds will be initialized. The default value is
        False. Ignored if the seeds argument is not None.
    min_bin_freq : int, default=1
        To speed up the algorithm, accept only those bins with at least
        min_bin_freq points as seeds.
    cluster_all : bool, default=True
        If true, then all points are clustered, even those orphans that are
        not within any kernel. Orphans are assigned to the nearest kernel.
        If false, then orphans are given cluster label -1.
    n_jobs : int, default=None
        The number of jobs to use for the computation. This works by
        computing each of the n_init runs in parallel. None means 1 unless
        in a joblib.parallel_backend context. -1 means using all
        processors. See Glossary for more details.
    max_iter : int, default=300
        Maximum number of iterations per seed point before the clustering
        operation terminates (for that seed point), if it has not yet
        converged.
    """
    ms = MeanShift(bandwidth=bandwidth, bin_seeding=True, n_jobs=-1, max_iter=500)
    ms.fit(X)
    labels = ms.labels_
    cluster_centers = ms.cluster_centers_
    labels_unique = np.unique(labels)
    n_clusters_ = len(labels_unique)
    print("number of estimated clusters : %d" % n_clusters_)
    return labels, cluster_centers, labels_unique, n_clusters_
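# Example call (assumed): get_cluster passes `bandwidth` straight through; it can
# be left as None for sklearn's internal estimate, or computed explicitly first.
import numpy as np
from sklearn.cluster import estimate_bandwidth
from sklearn.datasets import make_blobs

X, _ = make_blobs(n_samples=400, centers=3, random_state=42)
bw = estimate_bandwidth(X, quantile=0.2)
labels, centers, uniq, n = get_cluster(bw, X)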
print(label_list)
print(word_list)

# K-means algorithm
km = KMeans(n_clusters=89, max_iter=300, n_init=40, init='k-means++', n_jobs=1)
result_kmeans = km.fit_predict(tfidf.toarray())

# AffinityPropagation algorithm
ap = AffinityPropagation(damping=0.55, max_iter=575, convergence_iter=575,
                         copy=True, preference=None, affinity='euclidean',
                         verbose=False)
result_ap = ap.fit_predict(tfidf.toarray())

# MeanShift algorithm
ms = MeanShift(bandwidth=0.65, bin_seeding=True)
result_ms = ms.fit_predict(tfidf.toarray())

# SpectralClustering algorithm
sc = SpectralClustering(n_clusters=89, affinity='nearest_neighbors',
                        n_neighbors=4, eigen_solver='arpack', n_jobs=1)
result_sc = sc.fit_predict(tfidf.toarray())

# DBSCAN algorithm
db = DBSCAN(eps=0.7, min_samples=1)
result_db = db.fit_predict(tfidf.toarray())

# AgglomerativeClustering algorithm
ac = AgglomerativeClustering(n_clusters=89, affinity='euclidean', linkage='ward')
def Windows_KDE_amova(SequenceStore, admx_lib, refs_lib):
    Geneo = admx_lib
    Geneo_order = list(Geneo.keys())
    ref_order = list(refs_lib.keys())
    Whose = [z for z in it.chain(*[Geneo[x] for x in Geneo_order])]
    Sup_labels = list(np.repeat(Geneo_order, [len(Geneo[x]) for x in Geneo_order]))

    ### Define parameters and libraries of analyses.
    Results = {x: recursively_default_dict() for x in SequenceStore.keys()}
    Construct = recursively_default_dict()
    PC_var = recursively_default_dict()

    for CHR in SequenceStore.keys():
        print('going on CHR: ' + str(CHR))
        for c in SequenceStore[CHR].keys():
            ### PCA and MeanShift of information from each window, copied from *FM36_Galaxy.py.
            Sequences = [SequenceStore[CHR][c][x] for x in Whose]
            Sequences = np.array(Sequences)
            Sequences = np.nan_to_num(Sequences)

            pca = PCA(n_components=KDE_comps, whiten=False,
                      svd_solver='randomized').fit(Sequences)
            data = pca.transform(Sequences)
            PC_var[CHR][c] = [x for x in pca.explained_variance_]

            params = {'bandwidth': np.linspace(np.min(data), np.max(data),
                                               Bandwidth_split)}
            grid = GridSearchCV(KernelDensity(algorithm="ball_tree",
                                              breadth_first=False),
                                params, verbose=0)

            ######################################
            ####### TEST global Likelihood #######
            ######################################
            Focus_labels = [z for z in it.chain(*refs_lib.values())]

            #### Mean Shift approach ##
            from sklearn.cluster import MeanShift, estimate_bandwidth
            bandwidth = estimate_bandwidth(data, quantile=0.2,
                                           n_samples=len(Focus_labels))
            if bandwidth <= 1e-3:
                bandwidth = 0.1
            ms = MeanShift(bandwidth=bandwidth, cluster_all=False,
                           min_bin_freq=clsize)
            ms.fit(data[Focus_labels, :])
            labels = ms.labels_
            Tree = {x: [Focus_labels[y] for y in range(len(labels)) if labels[y] == x]
                    for x in [g for g in list(set(labels)) if g != -1]}
            Keep = [x for x in Tree.keys() if len(Tree[x]) > clsize]
            Tree = {x: Tree[x] for x in Keep}
            Ngps = len(Tree)
            SpaceX = {x: data[Tree[x], :] for x in Tree.keys()}

            ### Extract MScluster likelihood by sample
            for hill in SpaceX.keys():
                grid.fit(data[Tree[hill], :])
                # use the best estimator to compute the kernel density estimate
                kde = grid.best_estimator_
                # normalize kde-derived log-likelihoods, derive sample p-values
                P_dist = kde.score_samples(data[Tree[hill], :])
                Dist = kde.score_samples(data)
                P_dist = np.nan_to_num(P_dist)
                Dist = np.nan_to_num(Dist)
                if np.std(P_dist) == 0:
                    Dist = np.array([int(Dist[x] in P_dist) for x in range(len(Dist))])
                else:
                    Dist = scipy.stats.norm(np.mean(P_dist), np.std(P_dist)).cdf(Dist)
                Dist = np.nan_to_num(Dist)
                Construct[CHR][c][hill] = Dist

            #########################################
            ############# AMOVA ################
            #########################################
            if supervised:
                labels = Sup_labels
                Who = list(range(Sequences.shape[0]))
                Ngps = len(refs_lib)
            else:
                Who = [x for x in range(len(labels))
                       if labels[x] != -1 and labels[x] in Keep]
                labels = [labels[x] for x in Who]
                Who = [Focus_labels[x] for x in Who]

            if amova:
                clear_output()
                Bool_set = Sequences[Who, :].astype(bool)
                print('chr {}, where: {}, supervised: {}, n clusters: {}'.format(
                    CHR, c, str(supervised), Ngps))
                Amova1, Ciggy = AMOVA_FM42(Bool_set, labels, n_boot=0, metric='jaccard')
                Amova2, Ciggy = AMOVA_FM42(data[Who, :], labels, n_boot=0, metric='euclidean')
                Amova3, Ciggy = AMOVA_FM42(Bool_set, labels, n_boot=0, metric='hamming')
                print('old: ; jaccard: {}; PCA euc: {}; nHam: {}'.format(
                    Amova1, Amova2, Amova3))
                Results[CHR][c] = [Ngps, Amova1, Amova2, Amova3]

    return Results, Construct, PC_var
def break_down_spec(actual_tracks, N_neigh=0, ms_layer2=True, scale_spec=False,
                    qtl_I=0.05, qtl_II=0.1, clst_all_I=True, clst_all_II=True):
    ###
    if scale_spec:
        samps_tracks = scale(actual_tracks, axis=0)
    else:
        samps_tracks = actual_tracks

    # #########################################################################
    bandwidth = estimate_bandwidth(samps_tracks, quantile=qtl_I,
                                   n_samples=samps_tracks.shape[0])
    ms = MeanShift(bandwidth=bandwidth, bin_seeding=True,
                   cluster_all=clst_all_I).fit(samps_tracks)
    labels = ms.labels_
    coords = {z: [x for x in range(len(labels)) if labels[x] == z]
              for z in list(set(labels)) if z != -1}
    names_plots = ['MS1']

    ####
    fig = [
        go.Scatter(x=[actual_tracks[x, 0] for x in coords[i]],
                   y=[actual_tracks[x, 1] for x in coords[i]],
                   mode='markers',
                   name=str(i),
                   marker=dict(color=i)) for i in coords.keys()
    ]
    layout = go.Layout(title='MS clust. I',
                       xaxis=dict(title='time (s)'),
                       yaxis=dict(title='frequency'))
    figures = [go.Figure(data=fig, layout=layout)]
    ####

    ## an extra step to clean this up.
    if ms_layer2:
        extra_cls = {}
        for clust in coords.keys():
            subset = samps_tracks[coords[clust], :]
            subset = scale(subset, axis=0)
            if subset.shape[0] > 10:
                bandwidth = estimate_bandwidth(subset, quantile=qtl_II,
                                               n_samples=subset.shape[0])
                if bandwidth > 0:
                    ms = MeanShift(bin_seeding=True, cluster_all=clst_all_II,
                                   bandwidth=bandwidth).fit(subset)
                    labels_local = ms.labels_
                    coords_local = {
                        z: [coords[clust][x] for x in range(len(labels_local))
                            if labels_local[x] == z]
                        for z in list(set(labels_local)) if z != -1
                    }
                    coords_local = {z: coords_local[z] for z in coords_local.keys()
                                    if len(coords_local[z]) > 3}
                    coords_keys = sorted(coords_local.keys())
                    if len(coords_keys) > 1:
                        coords[clust] = coords_local[coords_keys[0]]
                        for cl in coords_keys[1:]:
                            extra_cls[len(extra_cls) + len(coords)] = coords_local[cl]
        coords.update(extra_cls)
        names_plots.append('MSII')

    ########################
    ## get just neighbours
    ##
    if N_neigh:
        extra_cls = {}
        for clust in coords.keys():
            subset = samps_tracks[coords[clust], :]
            if subset.shape[0] >= 2 * N_neigh:
                t = list(np.arange(0, len(coords[clust]), N_neigh))
                coords_local = list(sorted(coords[clust]))
                if len(t) > 1:
                    if len(coords_local) - t[-1] < N_neigh:
                        t[-1] = len(coords_local)
                    else:
                        t.append(len(coords_local))
                    coords[clust] = coords_local[t[0]:t[1]]
                    for cl in range(1, len(t) - 1):
                        extra_cls[len(extra_cls) + len(coords)] = coords_local[t[cl]:t[cl + 1]]
        coords.update(extra_cls)
        names_plots[-1] = names_plots[-1] + '_neighs{}'.format(N_neigh)

    fig = [
        go.Scatter(x=[actual_tracks[x, 0] for x in coords[i]],
                   y=[actual_tracks[x, 1] for x in coords[i]],
                   mode='markers',
                   name=str(i),
                   marker=dict(color=i)) for i in coords.keys()
    ]
    layout = go.Layout(title='MS clust. II',
                       xaxis=dict(title='time (s)'),
                       yaxis=dict(title='frequency'))
    figures.append(go.Figure(data=fig, layout=layout))

    return coords, figures, names_plots
# Z = linkage(face_encodings, 'ward') # fig = plt.figure(figsize=(25, 10)) # dn = dendrogram(Z) #mean-shift if True: nuke_people() faces = list(Face.objects.all()) face_encodings = np.array( [np.frombuffer(bytes.fromhex(f.encoding)) for f in faces]) X = StandardScaler().fit_transform(face_encodings) bandwidth = estimate_bandwidth(X, quantile=0.1, n_samples=500) ms = MeanShift(bandwidth=bandwidth, bin_seeding=True) ms.fit(X) #DBSCAN if False: nuke_people() faces = list(Face.objects.all()) face_encodings = np.array( [np.frombuffer(bytes.fromhex(f.encoding)) for f in faces]) X = StandardScaler().fit_transform(face_encodings) # ############################################################################# # Compute DBSCAN db = DBSCAN(eps=5, min_samples=2).fit(X) core_samples_mask = np.zeros_like(db.labels_, dtype=bool) core_samples_mask[db.core_sample_indices_] = True
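A compact sketch of the standardize-then-cluster step used in the mean-shift branch above; the random 128-dimensional vectors merely stand in for stored face encodings (the dimensionality is an assumption).

import numpy as np
from sklearn.cluster import MeanShift, estimate_bandwidth
from sklearn.preprocessing import StandardScaler

encodings = np.random.rand(60, 128)  # placeholder for decoded Face encodings
X = StandardScaler().fit_transform(encodings)
bandwidth = estimate_bandwidth(X, quantile=0.1, n_samples=min(500, len(X)))
ms = MeanShift(bandwidth=bandwidth, bin_seeding=True).fit(X)
print("clusters:", len(np.unique(ms.labels_)))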
def frame_peaks(array_spec,
                spec_fs,
                spec_ts,
                frame=0,
                Sample_N=500,
                p_threshold=0.0004,
                amp_cap=4,
                peak_cap=.7,
                peak_iso=200,
                band_qtl=0.02,
                frame_plot=False):
    ## turn this frame's amplitudes into sampling probabilities
    probs = np.array(list(array_spec[:, frame]))
    probs[probs > amp_cap] = amp_cap
    probs = probs / np.sum(probs)
    # #############################################################################
    # Compute clustering with MeanShift; the bandwidth is estimated from the
    # frequencies sampled in proportion to their amplitude.
    new_freqs = np.random.choice(list(spec_fs), Sample_N, replace=True, p=probs)
    new_freqs = new_freqs.reshape(-1, 1)
    bandwidth = estimate_bandwidth(new_freqs,
                                   quantile=band_qtl,
                                   n_samples=Sample_N)
    if bandwidth == 0:
        bandwidth = peak_iso
    ms = MeanShift(bandwidth=bandwidth, bin_seeding=True,
                   cluster_all=False).fit(new_freqs)
    labels = ms.labels_
    cluster_centers = ms.cluster_centers_
    cluster_centers = list(it.chain(*cluster_centers))
    ## trim_clusters:
    ### interpolation makes it easier to choose between neighbouring centroids
    ### that are unlikely to exist as observed frequency values.
    from scipy.interpolate import interp1d
    f2 = interp1d(spec_fs, array_spec[:, frame], kind='cubic')
    cluster_centers = cluster_threshold(cluster_centers, peak_iso, f2)
    cluster_centers = cluster_threshold(cluster_centers, peak_iso, f2)
    ####
    #### get amplitudes of peaks and store them
    peak_cent = []
    amps_centres = []
    for cent in cluster_centers:
        closest_idx = np.argmin(abs(spec_fs - cent))
        amp_sel = array_spec[closest_idx, frame]
        if amp_sel >= peak_cap:
            peak_cent.append(cent)
            amps_centres.append(amp_sel)
    ## get time stamps for each of the peaks.
    time_spec = [spec_ts[frame]] * len(amps_centres)
    if frame_plot:
        kde = KernelDensity(kernel='gaussian', bandwidth=bandwidth).fit(new_freqs)
        X_plot = np.linspace(0, max(spec_fs) + 100, 1000)[:, np.newaxis]
        log_dens = kde.score_samples(X_plot)
        fig = [go.Scatter(x=spec_fs, y=array_spec[:, frame], mode='lines')]
        #fig= [go.Scatter(x=X_plot[:, 0], y=np.exp(log_dens), mode='lines', fill='tozeroy', line=dict(color='#AAAAFF', width=2))]
        shapes = []
        for center in peak_cent:
            shapes.append({
                'type': 'line',
                'x0': center,
                'y0': 0,
                'x1': center,
                'y1': max(array_spec[:, frame]),
                'line': {
                    'color': 'red',
                    'width': 4,
                    'dash': 'solid'
                },
            })
        layout = go.Layout(title='frame inx: {}'.format(frame),
                           shapes=shapes,
                           xaxis=dict(title='frequency'),
                           yaxis=dict(title='amplitude'))
        figure_frame = go.Figure(data=fig, layout=layout)
        return peak_cent, time_spec, amps_centres, figure_frame
    else:
        return peak_cent, time_spec, amps_centres
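The stand-alone sketch below illustrates the sampling trick frame_peaks relies on: frequencies drawn in proportion to amplitude pile up around spectral peaks, which MeanShift then recovers as cluster centers. The toy spectrum is invented for the example.

import numpy as np
from sklearn.cluster import MeanShift, estimate_bandwidth

freqs = np.linspace(0, 4000, 400)
amps = np.exp(-((freqs - 800) ** 2) / 5e4) + 0.6 * np.exp(-((freqs - 2500) ** 2) / 8e4)

probs = amps / amps.sum()
samples = np.random.choice(freqs, 500, replace=True, p=probs).reshape(-1, 1)

bw = estimate_bandwidth(samples, quantile=0.05, n_samples=500)
ms = MeanShift(bandwidth=bw if bw > 0 else 200.0, bin_seeding=True,
               cluster_all=False).fit(samples)
print("peak candidates (Hz):", sorted(c[0] for c in ms.cluster_centers_))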
## This is getting real data
# Data needs to have column names in the first row and numeric data in the rest of the fields.
# Pulls in all the data in the file; col_int (defined above) then decides which columns to actually look at.
X = pd.read_csv(myPath + Data_in,
                header=0,
                dtype={
                    0: np.float64,
                    1: np.float64
                })
print(X.head())

## This is the bit where it fits the data
ms = MeanShift(cluster_all=False)

# Convert the columns of interest to a NumPy array
# Multi-dimensional, so it could be anything really
msX = np.array(X.iloc[:, col_int])
# print(msX)

ms.fit(msX)
labels = ms.labels_
cluster_centers = ms.cluster_centers_
# note: with cluster_all=False, orphan points get label -1, which np.unique counts too
n_clusters_ = len(np.unique(labels))
# print("Number of estimated clusters:", n_clusters_)
# print(labels)
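A small follow-up, assuming the labels array produced just above: with cluster_all=False, orphan points are labelled -1, so they should be masked before counting cluster sizes.

import numpy as np

sizes = np.bincount(labels[labels >= 0])  # -1 marks orphans under cluster_all=False
for k, n in enumerate(sizes):
    print("cluster %d: %d points" % (k, n))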
for filename in glob.glob('Q2-images/*'):  # assuming image files
    X1 = []
    I = cv2.imread(filename)
    I = cv2.resize(I, (0, 0), fx=0.5, fy=0.5)
    for i in range(I.shape[0]):
        for j in range(I.shape[1]):
            X1.append(I[i][j][:].tolist())
    X1 = np.array(X1)

    # use the estimated bandwidth for the fit, falling back to a small
    # fixed value if the estimate degenerates to zero
    bandwidth = estimate_bandwidth(X1, quantile=0.25, n_samples=10)
    if bandwidth <= 0:
        bandwidth = 2
    clustering = MeanShift(bandwidth=bandwidth, bin_seeding=True).fit(X1)
    labels = np.array(clustering.labels_)
    C = clustering.cluster_centers_

    count = 0
    I_out = np.zeros(I.shape)
    for i in range(I.shape[0]):
        for j in range(I.shape[1]):
            I_out[i][j][:] = C[labels[count]]
            count = count + 1
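As an aside, the recoloring double loop above can be collapsed into one vectorized indexing step; this assumes labels and C as produced by the fit above, with X1 built row-major over the image.

# vectorized equivalent of the recoloring loop: index centers by label, reshape to image
I_out = C[labels].reshape(I.shape)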
def do_work(self, train, uid, url):
    self.cap = cv2.VideoCapture(url)
    print(uid)
    self.kernel = np.ones((3, 3), np.uint8)
    self.frameWidth = int(self.cap.get(3))
    self.frameHeight = int(self.cap.get(4))
    self.outOriginal = cv2.VideoWriter(
        'cache/original.avi', cv2.VideoWriter_fourcc('X', 'V', 'I', 'D'), 24,
        (self.frameWidth, self.frameHeight))
    self.outDetect = cv2.VideoWriter(
        'cache/detect.avi', cv2.VideoWriter_fourcc('X', 'V', 'I', 'D'), 24,
        (self.frameWidth, self.frameHeight))
    self.outSkel = cv2.VideoWriter(
        'cache/skel.avi', cv2.VideoWriter_fourcc('X', 'V', 'I', 'D'), 24,
        (self.frameWidth, self.frameHeight))
    self.fgbg = cv2.bgsegm.createBackgroundSubtractorMOG()
    self.frameCount = 0
    cacheDir = os.path.join(os.getcwd(), 'cache')
    sourceDir = os.path.join(os.getcwd(), 'sources')
    # remove stale CSVs from previous runs, ignoring missing files
    try:
        os.remove(os.path.abspath(os.path.join(cacheDir, 'test.csv')))
    except OSError:
        pass
    try:
        if train:
            os.remove(
                os.path.abspath(os.path.join(sourceDir, str(uid) + '.csv')))
    except OSError:
        pass
    while self.frameCount < 240:
        status, frame = self.cap.read()
        if not status:
            break
        blur = cv2.GaussianBlur(frame, (9, 9), 0)
        fgmask = self.fgbg.apply(blur)
        img = cv2.dilate(fgmask, self.kernel, iterations=1)
        x, y, height, length = self.contourDetect(img)
        boxImg = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)
        boxImg = cv2.rectangle(boxImg, (x, y), (x + length, y + height),
                               (0, 0, 255), 2)
        cv2.line(boxImg, (0, int(y + 0.75 * height)),
                 (640, int(y + 0.75 * height)), (0, 255, 0), 2)
        cv2.line(boxImg, (0, int(y + 0.15 * height)),
                 (640, int(y + 0.15 * height)), (255, 0, 0), 2)
        skel, hip, shoulder = self.skelRegion(img, x, y, height, length)
        if self.frameCount > 50 and self.frameCount < 151:
            if train:
                with open('sources/' + str(uid) + '.csv', 'a',
                          newline='') as csvfile:
                    with open('cache/target.csv', 'a',
                              newline='') as targetfile:
                        fieldnames = [
                            'height', 'stride', 'lowerbody', 'upperbody',
                            'hipangle', 'shoulderx', 'shouldery'
                        ]
                        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
                        targetnames = ['class']
                        targetWriter = csv.DictWriter(
                            targetfile, fieldnames=targetnames)
                        writer.writerow({
                            'height': height,
                            'stride': length,
                            'lowerbody': round(0.53 * height, 2),
                            'upperbody': round(0.4 * height, 2),
                            'hipangle': round(hip, 2),
                            'shoulderx': shoulder[0],
                            'shouldery': shoulder[1]
                        })
                        targetWriter.writerow({'class': uid})
                        targetWriter.writerow({'class': 0})
            else:
                with open('cache/test.csv', 'a', newline='') as csvfile:
                    fieldnames = [
                        'height', 'stride', 'lowerbody', 'upperbody',
                        'hipangle', 'shoulderx', 'shouldery'
                    ]
                    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
                    writer.writerow({
                        'height': height,
                        'stride': length,
                        'lowerbody': round(0.53 * height, 2),
                        'upperbody': round(0.4 * height, 2),
                        'hipangle': round(hip, 2),
                        'shoulderx': shoulder[0],
                        'shouldery': shoulder[1]
                    })
        self.outOriginal.write(frame)
        self.outDetect.write(boxImg)
        self.outSkel.write(skel)
        self.frameCount += 1
        if self.frameCount % 10 == 0:
            self.trackProgress(self.frameCount / 240, train)
    print("processing done!")
    self.cap.release()
    self.outDetect.release()
    self.outOriginal.release()
    self.outSkel.release()
    verify = False
    if not train:
        csv_files = glob.glob('sources/*.csv')
        for cfile in csv_files:
            cf = pd.read_csv(cfile)
            master_array = cf.to_numpy()  # .as_matrix() was removed from pandas
            df = pd.read_csv('cache/test.csv')
            numpy_array = df.to_numpy()
            print(numpy_array)
            bandwidth = estimate_bandwidth(master_array, quantile=0.1)
            ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
            ms.fit(master_array)
            master_labels = ms.labels_
            master_centers = ms.cluster_centers_
            print("Master centroids:\n", master_centers)
            print("Number of Master clusters: ", len(np.unique(master_labels)))
            bandwidth = estimate_bandwidth(numpy_array, quantile=0.1)
            ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
            ms.fit(numpy_array)
            labels = ms.labels_
            cluster_centers = ms.cluster_centers_
            print("Test centroids:\n", cluster_centers)
            print("Number of Test clusters: ", len(np.unique(labels)))
            # collapse each centroid set to a single representative centroid
            bandwidth = estimate_bandwidth(master_centers, quantile=0.9)
            ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
            ms.fit(master_centers)
            master_centers = ms.cluster_centers_
            bandwidth = estimate_bandwidth(cluster_centers, quantile=0.9)
            ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
            ms.fit(cluster_centers)
            cluster_centers = ms.cluster_centers_
            # new_centers = np.concatenate((master_centers, cluster_centers))
            LIMIT = np.array([5, 5, 5, 5, 5, 5, 5])
            # element-wise tolerance check; the centroid arrays must have
            # matching shapes for the comparison to be meaningful
            if (master_centers.shape == cluster_centers.shape
                    and np.all(np.abs(master_centers - cluster_centers) < LIMIT)):
                verify = True
                uid = cfile.split('.')[0].split('/')[1]
                data = self.fetchDatabase(uid)
                with open('cache/image.png', 'wb') as img:
                    img.write(data[4])
                self.verifyDone.emit(str(data[0]), data[1], data[2], str(data[3]))
                print(master_centers)
                print(cluster_centers)
                break
            print(master_centers)
            print(cluster_centers)
    if not verify and not train:
        self.unauthVerify.emit()
    self.threadCompleted.emit(train)
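A hedged alternative to the element-wise LIMIT check above: comparing centroid sets by nearest-centroid Euclidean distance tolerates differing centroid counts. The tolerance value is illustrative only.

import numpy as np

def centroids_match(master, test, tol=5.0):
    master, test = np.asarray(master), np.asarray(test)
    # pairwise distances between every master and test centroid
    d = np.linalg.norm(master[:, None, :] - test[None, :, :], axis=2)
    # every test centroid must lie within tol of some master centroid
    return bool(np.all(d.min(axis=0) < tol))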
setosa = iris_data.loc[iris_data['Species_I. setosa'] == 1] # virginica.plot.scatter(x=0, y=1, c='r') # versicolor.plot.scatter(x=0, y=1, c='b') # setosa.plot.scatter(x=0, y=1, c='g') plt.scatter(x=virginica['Sepal length'], y=virginica['Sepal width'], color='r') plt.scatter(x=versicolor['Sepal length'], y=versicolor['Sepal width'], color='g') plt.scatter(x=setosa['Sepal length'], y=setosa['Sepal width'], color='b') # plt.show() # 4. print(estimate_bandwidth(iris_data, quantile=0.2)) analyzer = MeanShift(bandwidth=1) print('Self MeanShift: ', analyzer.fit(iris_data)) print('Function mean_shift: ', mean_shift(iris_data)) # print(estimate_bandwidth(virginica, quantile=0.2)) # print(estimate_bandwidth(versicolor, quantile=0.2)) # print(estimate_bandwidth(setosa, quantile=0.2)) # 5. # labels, cluster_centers, n_clusters = mean_shift(data_2d) # colors = cycle('bgrcmy') # for k, col in zip(range(n_clusters), colors): # my_members = (labels == k) # cluster_center = cluster_centers[k]
import numpy as np
from sklearn.cluster import MeanShift
from sklearn.datasets import make_blobs  # samples_generator path is deprecated
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D  # noqa: F401 -- registers the 3d projection
from matplotlib import style

style.use("ggplot")

centers = [[1, 1, 1], [5, 5, 5], [3, 10, 10]]
X, _ = make_blobs(n_samples=100, centers=centers, cluster_std=1.5)

ms = MeanShift()
ms.fit(X)
labels = ms.labels_
cluster_centers = ms.cluster_centers_
print(cluster_centers)
n_clusters_ = len(np.unique(labels))
print("Number of estimated clusters:", n_clusters_)

colors = 10 * ['r', 'g', 'b', 'c', 'k', 'y', 'm']
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1, projection='3d')
for i in range(len(X)):
    ax.scatter(X[i][0], X[i][1], X[i][2], c=colors[labels[i]], marker='o')
ax.scatter(cluster_centers[:, 0], cluster_centers[:, 1], cluster_centers[:, 2],
           marker='x', color='k', s=150, linewidths=5, zorder=10)
plt.show()
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
onehotencoder = OneHotEncoder(categorical_features = [9])
x = onehotencoder.fit_transform(x).toarray()
onehotencoder = OneHotEncoder(categorical_features = [17])
x = onehotencoder.fit_transform(x).toarray()
"""
############################################ClusterModeling###########################################################
# Mean shift needs no preset number of clusters; the bandwidth controls granularity.
from sklearn.cluster import MeanShift
ms = MeanShift(bandwidth=2, bin_seeding=False, cluster_all=True, min_bin_freq=2, seeds=None)
ms_predict = ms.fit_predict(x)
print(ms.labels_)
from sklearn import metrics
print(ms_predict)
#print(metrics.silhouette_score(x, brc_predict, metric='euclidean'))
print("Silhouette Score: %0.3f" % metrics.silhouette_score(x, ms_predict, metric='euclidean'))
# sklearn renamed calinski_harabaz_score to calinski_harabasz_score
print("Calinski-Harabasz Index: %0.3f" % metrics.calinski_harabasz_score(x, ms_predict))
############################################SavingInXlsx###########################################################
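Since mean shift takes no cluster count, the bandwidth is the knob worth tuning; the sketch below scans a small, assumed grid of bandwidths and keeps the one with the best silhouette score.

import numpy as np
from sklearn.cluster import MeanShift
from sklearn.metrics import silhouette_score

def pick_bandwidth(x, grid=(1.0, 2.0, 4.0, 8.0)):
    # illustrative grid; a real grid should bracket estimate_bandwidth(x)
    best, best_bw = -1.0, None
    for bw in grid:
        labels = MeanShift(bandwidth=bw).fit_predict(x)
        if len(np.unique(labels)) > 1:  # silhouette needs >= 2 clusters
            s = silhouette_score(x, labels, metric='euclidean')
            if s > best:
                best, best_bw = s, bw
    return best_bw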
def ClusterCnn(model_file, train_file, valid_file, ftype, nsamples, outdir): f = open(model_file + ".pre") preprocessor = pickle.load(f) import h5py f = h5py.File(model_file + ".wei") layers = [] for k in range(f.attrs['nb_layers']): g = f['layer_{}'.format(k)] layers.append( [g['param_{}'.format(p)] for p in range(g.attrs['nb_params'])]) max_features = len(preprocessor.tokenizer.word_counts) print("Reading and sampling data to train..") train_programs, train_features, train_classes = read_traces(train_file, nsamples, cut=None) train_size = len(train_features) #y = train_programs X_train, y_train, labels = preprocessor.preprocess_traces( train_features, y_data=train_classes, labels=train_programs) new_model = make_cluster_cnn("test", max_features, maxlen, embedding_dims, nb_filters, filter_length, hidden_dims, None, weights=layers) train_dict = dict() train_dict[ftype] = new_model.predict(X_train) model = make_cluster_pipeline_subtraces(ftype) X_red_comp = model.fit_transform(train_dict) explained_var = np.var(X_red_comp, axis=0) print(explained_var) X_red = X_red_comp[:, 0:2] X_red_next = X_red_comp[:, 2:4] colors = mpl.colors.cnames.keys() progs = list(set(labels)) ncolors = len(colors) size = len(labels) print("Plotting..") for prog, [x, y] in zip(labels, X_red): # for prog,[x,y] in sample(zip(labels, X_red), min(size, 1000)): x = gauss(0, 0.05) + x y = gauss(0, 0.05) + y color = 'r' plt.scatter(x, y, c=color) """ if valid_file is not None: valid_programs, valid_features, valid_classes = read_traces(valid_file, None, cut=None, maxsize=window_size) #None) valid_dict = dict() X_valid, _, valid_labels = preprocessor.preprocess_traces(valid_features, y_data=None, labels=valid_programs) valid_dict[ftype] = new_model.predict(X_valid) X_red_valid_comp = model.transform(valid_dict) X_red_valid = X_red_valid_comp[:,0:2] X_red_valid_next = X_red_valid_comp[:,2:4] for prog,[x,y] in zip(valid_labels, X_red_valid): x = gauss(0,0.05) + x y = gauss(0,0.05) + y plt.scatter(x, y, c='b') plt.text(x, y+0.02, prog.split("/")[-1]) plt.show() """ plt.savefig(train_file.replace(".gz", "") + ".png") print("Bandwidth estimation..") from sklearn.cluster import MeanShift, estimate_bandwidth X_red_sample = X_red[:min(size, 1000)] bandwidth = estimate_bandwidth(X_red_sample, quantile=0.2) print("Clustering with bandwidth:", bandwidth) #X_red = np.vstack((X_red,X_red_valid)) #X_red_next = np.vstack((X_red_next,X_red_valid_next)) #labels = labels + valid_labels print(X_red.shape, len(X_red), len(labels)) # print(valid_labels) af = MeanShift(bandwidth=bandwidth / 1).fit(X_red) cluster_centers = af.cluster_centers_ cluster_labels = af.labels_ n_clusters = len(cluster_centers) plt.figure() for ([x, y], label, cluster_label) in zip(X_red, labels, cluster_labels): # for ([x,y],label, cluster_label) in sample(zip(X_red,labels, # cluster_labels), min(size, 1000)): x = gauss(0, 0.1) + x y = gauss(0, 0.1) + y plt.scatter(x, y, c=colors[cluster_label % ncolors]) # if label in valid_labels: # plt.text(x-0.05, y+0.01, label.split("/")[-1]) for i, [x, y] in enumerate(cluster_centers): plt.plot(x, y, 'o', markerfacecolor=colors[i % ncolors], markeredgecolor='k', markersize=7) """ #for prog,[x,y] in zip(valid_labels, X_red_valid): #x = gauss(0,0.1) + x #y = gauss(0,0.1) + y #plt.scatter(x, y, c='black') #plt.text(x, y+0.02, prog.split("/")[-1]) plt.title('Estimated number of clusters: %d' % n_clusters) #plt.savefig("clusters.png") plt.show() """ plt.savefig(train_file.replace(".gz", "") + ".clusters.png") clustered_traces = 
zip(labels, cluster_labels)

    writer = open_csv(train_file.replace(".gz", "") + ".clusters")
    for label, cluster in clustered_traces:
        writer.writerow([label, cluster])
def generate(self, themes=None): self._pack() if themes: return KMeans(n_clusters=themes).fit( self._histograms[0]).cluster_centers_ return MeanShift().fit(self._histograms[0]).cluster_centers_
import pandas as pd import numpy as np import matplotlib.pyplot as plt from itertools import cycle from sklearn.cluster import MeanShift iris_data = pd.read_excel('iris_data.xlsx') print(iris_data.head()) iris_data = pd.get_dummies(iris_data, columns=['Species']) print(iris_data.head()) virginica = iris_data.loc[iris_data['Species_I. virginica'] == 1] versicolor = iris_data.loc[iris_data['Species_I. versicolor'] == 1] setosa = iris_data.loc[iris_data['Species_I. setosa'] == 1] plt.scatter(x=virginica['Sepal length'], y=virginica['Sepal width'], color='r') plt.scatter(x=versicolor['Sepal length'], y=versicolor['Sepal width'], color='g') plt.scatter(x=setosa['Sepal length'], y=setosa['Sepal width'], color='b') #plt.show() from sklearn.cluster import estimate_bandwidth print(estimate_bandwidth(virginica, quantile=0.2)) print(estimate_bandwidth(versicolor, quantile=0.2)) print(estimate_bandwidth(setosa, quantile=0.2)) print(estimate_bandwidth(iris_data, quantile=1)) analyzer = MeanShift(bandwidth=1) print(analyzer.fit(iris_data))
vectorizer = TfidfVectorizer(max_df=0.8, max_features=200000, min_df=0.2, stop_words=stopwords, use_idf=True, tokenizer=tokenize_and_stem, ngram_range=(1, 3)) tfidf_matrix = vectorizer.fit_transform(clean_data) print(tfidf_matrix.shape) dense_text = tfidf_matrix.todense() # The following bandwidth can be automatically detected using bandwidth = estimate_bandwidth(dense_text, quantile=0.2, n_samples=1000) ms = MeanShift(bandwidth=bandwidth, bin_seeding=True) ms.fit(dense_text) labels = ms.labels_ cluster_centers = ms.cluster_centers_ labels_unique = np.unique(labels) n_clusters_ = len(labels_unique) print("number of estimated clusters : %d" % n_clusters_) plt.figure(1) plt.clf() colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk') for k, col in zip(range(n_clusters_), colors): my_members = labels == k cluster_center = cluster_centers[k]
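One caveat on the step above: MeanShift needs dense input, and .todense() on a large TF-IDF matrix can be memory-hungry. A common workaround, sketched here with an assumed component count, is to reduce with TruncatedSVD first and cluster the reduced matrix.

from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import MeanShift, estimate_bandwidth

svd = TruncatedSVD(n_components=50, random_state=0)
reduced = svd.fit_transform(tfidf_matrix)  # dense ndarray, docs x 50
bw = estimate_bandwidth(reduced, quantile=0.2,
                        n_samples=min(1000, reduced.shape[0]))
ms_reduced = MeanShift(bandwidth=bw, bin_seeding=True).fit(reduced)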
X = [] for count, row in enumerate(file_reader): if not count: names = row[2:] continue X.append([float(x) for x in row[2:]]) # Input data as numpy array X = np.array(X) # Estimating the bandwidth bandwidth = estimate_bandwidth(X, quantile=0.8, n_samples=len(X)) # Compute clustering with MeanShift meanshift_estimator = MeanShift(bandwidth=bandwidth, bin_seeding=True) meanshift_estimator.fit(X) labels = meanshift_estimator.labels_ centroids = meanshift_estimator.cluster_centers_ num_clusters = len(np.unique(labels)) print "\nNumber of clusters in input data =", num_clusters print "\nCentroids of clusters:" print '\t'.join([name[:3] for name in names]) for centroid in centroids: print '\t'.join([str(int(x)) for x in centroid]) ################ # Visualizing data
def find_clusters(tracks): """Find clusters in tracked points.""" tracks = list(map(lambda x: [x[-1][0], x[-1][1]], tracks)) ms = MeanShift(bandwidth=30, bin_seeding=True) ms.fit(tracks) return ms.cluster_centers_
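A toy usage example for find_clusters, assuming tracks are sequences of (x, y) points, of which only each track's last position is clustered:

tracks = [[(0, 0), (10, 12)], [(5, 5), (11, 13)], [(90, 90), (200, 210)]]
centers = find_clusters(tracks)
print(centers)  # one center near (10.5, 12.5), one near (200, 210)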
def compute_clustering(x_data: pd.Series, y_data: pd.Series, method: str, nb_clusters: int) -> np.array: """ Compute clustering using Scikit-learn: https://scikit-learn.org/stable/modules/clustering.html Several algorithms can be chosen: - K-means: https://scikit-learn.org/stable/modules/clustering.html#k-means - Affinity propagation: https://scikit-learn.org/stable/modules/generated/sklearn\ .cluster.AffinityPropagation.html#sklearn.cluster.AffinityPropagation - Mean shift: https://scikit-learn.org/stable/modules/generated/sklearn.cluster.\ MeanShift.html#sklearn.cluster.MeanShift - Spectral clustering: https://scikit-learn.org/stable/modules/generated/sklearn.\ cluster.SpectralClustering.html#sklearn.cluster.SpectralClustering - Hierarchical/Agglomerative clustering: https://scikit-learn.org/stable/modules\ /generated/sklearn.cluster.AgglomerativeClustering.html#sklearn.cluster.Agglomera\ tiveClustering - DBSCAN: https://scikit-learn.org/stable/modules/generated/sklearn.cluster.\ DBSCAN.html#sklearn.cluster.DBSCAN - OPTICS: https://scikit-learn.org/stable/modules/generated/sklearn.cluster.\ OPTICS.html#sklearn.cluster.OPTICS - Bayesian gaussian mixtures: https://scikit-learn.org/stable/modules/generated/\ sklearn.mixture.BayesianGaussianMixture.html - Birch: https://scikit-learn.org/stable/modules/generated/sklearn.cluster.Birch.\ html#sklearn.cluster.Birch :param x_data: the x data set :param y_data: the y data set :param method: name of the algorithm used to perform clustering :param nb_clusters: number of clusters used to split data :return: the data set labels used to color scatter points """ mapped_data = [row for row in zip(x_data, y_data)] if method == "K-means": kmeans = KMeans(n_clusters=nb_clusters, random_state=0).fit(mapped_data) return kmeans.labels_ if method == "Affinity propagation": clustering = AffinityPropagation().fit(mapped_data) return clustering.labels_ if method == "Mean shift": clustering = MeanShift(bandwidth=2).fit(mapped_data) return clustering.labels_ if method == "Spectral clustering": clustering = SpectralClustering(n_clusters=nb_clusters, assign_labels="discretize", random_state=0).fit(mapped_data) return clustering.labels_ if method == "Ward hierarchical clustering": clustering = AgglomerativeClustering( n_clusters=nb_clusters).fit(mapped_data) return clustering.labels_ if method == "DBSCAN": clustering = DBSCAN(eps=3, min_samples=2).fit(mapped_data) return clustering.labels_ if method == "OPTICS": clustering = OPTICS(min_samples=2).fit(mapped_data) return clustering.labels_ if method == "Bayesian gaussian mixtures": bgm = BayesianGaussianMixture(n_components=nb_clusters, max_iter=100, tol=1e-3, reg_covar=0) bgm.fit(mapped_data) return bgm.predict(mapped_data) if method == "Birch": brc = Birch(n_clusters=nb_clusters) brc.fit(mapped_data) return brc.predict(mapped_data)
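A minimal usage example for compute_clustering with invented series; the method string must match one of the branches above.

import pandas as pd

x = pd.Series([1.0, 1.1, 5.0, 5.2, 9.0])
y = pd.Series([1.0, 0.9, 5.1, 5.0, 9.2])
print(compute_clustering(x, y, method="K-means", nb_clusters=3))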
                if unique not in text_digit_vals:
                    text_digit_vals[unique] = x
                    x += 1
            df[column] = list(map(convert_to_int, df[column]))
    return df


df = handle_non_numerical_data(df)
df.drop(['ticket', 'home.dest'], axis=1, inplace=True)

X = np.array(df.drop(['survived'], axis=1).astype(float))
X = preprocessing.scale(X)
y = np.array(df['survived'])

clf = MeanShift()
clf.fit(X)

labels = clf.labels_
cluster_centers = clf.cluster_centers_

original_df['cluster_group'] = np.nan
for i in range(len(X)):
    # .loc avoids the chained-assignment pitfall of assigning through .iloc
    original_df.loc[original_df.index[i], 'cluster_group'] = labels[i]

n_clusters_ = len(np.unique(labels))
survival_rates = {}
for i in range(n_clusters_):
    temp_df = original_df[(original_df['cluster_group'] == float(i))]
    survival_cluster = temp_df[(temp_df['survived'] == 1)]
    survival_rate = len(survival_cluster) / len(temp_df)
    survival_rates[i] = survival_rate
print(survival_rates)
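As a hedged follow-up, per-cluster summaries help interpret the groups; this assumes original_df now carries the cluster_group column assigned above.

for i in range(n_clusters_):
    cluster_df = original_df[original_df['cluster_group'] == float(i)]
    print(cluster_df.describe())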