def quick_shift(data, tau, window_type, bandwidth, metric):
    """Perform quick shift clustering of data with corresponding parameters.

    Parameters
    ----------
    data : array-like, shape=[n_samples, n_features]
        Input points.
    tau : float
        Threshold parameter. The distance between two points must not
        exceed tau for them to be connected to each other.
    window_type : string
        Type of window used to compute the weights matrix. Can be
        "flat" or "normal".
    bandwidth : float
        Value of the bandwidth for the window.
    metric : string
        Metric used to compute the distance. See the pairwise_distances
        documentation for all possible values.

    Returns
    -------
    cluster_centers : array, shape=[n_clusters, n_features]
        Coordinates of cluster centers.
    labels : array, shape=[n_samples]
        Cluster label of each point.
    cluster_centers_idx : array, shape=[n_clusters]
        Index in data of each cluster center.
    """
    if tau is None:
        tau = estimate_bandwidth(data)
    if bandwidth is None:
        bandwidth = estimate_bandwidth(data)
    medoids, cluster_centers_idx = compute_stationary_medoids(
        data, tau, window_type, bandwidth, metric)
    cluster_centers = data[cluster_centers_idx]
    labels = []
    labels_val = {}
    lab = 0
    for i in cluster_centers_idx:
        labels_val[i] = lab
        lab += 1
    for i in range(len(data)):
        next_med = medoids[i]
        while next_med not in cluster_centers_idx:
            next_med = medoids[next_med]
        labels.append(labels_val[next_med])
    return cluster_centers, np.asarray(labels), cluster_centers_idx
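# A minimal usage sketch for quick_shift above (not from the original source):
# synthetic blobs stand in for real data, and compute_stationary_medoids is
# assumed to be importable from the same module as quick_shift.
import numpy as np
from sklearn.datasets import make_blobs

X, _ = make_blobs(n_samples=200, centers=2, random_state=0)
# tau=None and bandwidth=None both fall back to estimate_bandwidth(X)
centers, labels, centers_idx = quick_shift(
    X, tau=None, window_type="flat", bandwidth=None, metric="euclidean")
print("%d clusters found" % len(centers_idx))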
def cluster_pixels_ms(self):
    """Cluster pixel descriptors by mean shift.

    :type self: ColorRemover
    """
    fg_pixels = list(self.img.fg_pixels.keys())
    descriptors = []
    for r, c in fg_pixels:
        descriptors.append(self.descriptor_map[r][c])
    descriptors = np.array(descriptors)
    descriptors = PCA(n_components=int(VECTOR_DIMENSION) // 2).fit_transform(descriptors)
    # descriptors = self.descriptor_map.reshape(descriptors_rows, 1, VECTOR_DIMENSION)

    bandwidth = estimate_bandwidth(descriptors, quantile=0.05)
    ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
    ms.fit(descriptors)
    labels = ms.labels_
    for i in range(len(labels)):
        xy = fg_pixels[i]
        label = labels[i]
        self.labels_map.itemset(xy, label)

    # save the indices and BGR values of each cluster as a dictionary keyed by label
    for label in range(K):
        self.pixels_of_hough_line_in_sphere[label] = map(
            tuple, np.argwhere(self.labels_map == label))
        self.cluster_bgr[label] = map(tuple, self.img.bgr[self.labels_map == label])
def cluster_data(df_story, algo='kmeans', params='{}'):
    print "[EDEN I/O -- cluster_data] algo: ", algo
    start = time.time()
    params = ast.literal_eval(params)

    if algo in ['gac', 'gactemporal']:
        model = algo_select(algo, params)
        model.fit(df_story)
    elif algo == 'meanshift':
        vsm = recon_vsm(df_story['vsm'])
        params['bandwidth'] = estimate_bandwidth(vsm, n_samples=200)
        model = algo_select(algo, params)
        model.fit(vsm)
    else:
        vsm = recon_vsm(df_story['vsm'])
        model = algo_select(algo, params)
        model.fit(vsm)

    # print "[EDEN I/O -- cluster_data.py] plot: cluster counts"
    # plot_cluster_counts(model, "Cluster counts using algorithm: " + str(algo))
    # print "[EDEN I/O -- cluster_data] model: ", model
    end = time.time()
    print "[EDEN I/O -- cluster_data.py] Total elapsed time: ", end - start
    return model
def _fit_mean_shift(self, x):
    for c in xrange(len(self.crange)):
        quant = 0.015 * (c + 1)
        for r in xrange(self.repeats):
            bandwidth = estimate_bandwidth(x, quantile=quant, random_state=r)
            idx = c * self.repeats + r
            model = MeanShift(bandwidth=bandwidth, bin_seeding=True)
            model.fit(x)
            self._labels[idx] = model.labels_
            self._parameters[idx] = model.cluster_centers_

            # build an equivalent GMM
            k = model.cluster_centers_.shape[0]
            model_gmm = GMM(n_components=k, covariance_type=self.cvtype,
                            init_params='c', n_iter=0)
            model_gmm.means_ = model.cluster_centers_
            model_gmm.weights_ = sp.array(
                [(model.labels_ == i).sum() for i in xrange(k)])
            model_gmm.fit(x)

            # evaluate goodness of fit
            self._ll[idx] = model_gmm.score(x).sum()
            if self.gof_type == 'aic':
                self._gof[idx] = model_gmm.aic(x)
            if self.gof_type == 'bic':
                self._gof[idx] = model_gmm.bic(x)
            print quant, k, self._gof[idx]
def ms_algo(X, bandwidth=None):
    if bandwidth is None:
        n_samples = X.shape[0]
        bandwidth = estimate_bandwidth(X, quantile=0.2, n_samples=n_samples)

    # Apply the meanshift algorithm from the sklearn library
    ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
    ms.fit(X)

    # Collect the labels and the cluster centers from the meanshift model
    labels = ms.labels_
    cluster_centers = ms.cluster_centers_
    labels_unique = np.unique(labels)
    n_clusters_ = len(labels_unique)  # number of clusters

    # Print section
    print("The number of clusters is: %d" % n_clusters_)
    print("The centers are:")
    for i in range(n_clusters_):
        print(i, cluster_centers[i])
    return cluster_centers
def test_estimate_bandwidth(self):
    iris = datasets.load_iris()
    df = pdml.ModelFrame(iris)
    result = df.cluster.estimate_bandwidth(random_state=self.random_state)
    expected = cluster.estimate_bandwidth(iris.data,
                                          random_state=self.random_state)
    self.assertEqual(result, expected)
def applyMeanShift(data, quantileValue=0.2, clusterall=False):
    result = []
    n_samples = len(data)
    print "Number of points in the dataset: %d" % n_samples
    bandwidth = estimate_bandwidth(data, quantile=quantileValue)
    ms = MeanShift(bandwidth=bandwidth, cluster_all=clusterall)
    # Apply MeanShift
    clustereddata = ms.fit(data)
    clusteredlabels = clustereddata.labels_
    barycenters = ms.cluster_centers_
    labels_unique = np.unique(clusteredlabels)
    nbOfClusters = len(labels_unique)
    print "number of estimated clusters : %d" % nbOfClusters
    for i in labels_unique:
        print "### Indices of the points in cluster %d: ###" % i
        # print [indice[0] for indice in np.argwhere(clusteredlabels == i)]
        result.append([indice[0] for indice in np.argwhere(clusteredlabels == i)])
    # Add a zero-coordinates vector to take into account the fact that the
    # -1 "cluster" does not have a barycenter
    if -1 in labels_unique:
        barycenters = np.append([[0 for k in range(len(barycenters[0]))]],
                                barycenters, axis=0)
    return [result, barycenters]
def meanShift(flat_image):
    # Estimate bandwidth
    bandwidth = estimate_bandwidth(flat_image, quantile=0.2, n_samples=500)
    ms = MeanShift(bandwidth, bin_seeding=True)
    ms.fit(flat_image)
    labels = ms.labels_
    return ms.labels_, ms.cluster_centers_
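# Hypothetical usage of meanShift above (not in the original source): segment
# an image by flattening it to an (n_pixels, 3) array first. The file name and
# the cv2 preprocessing are illustrative assumptions.
import cv2
import numpy as np

img = cv2.imread("example.png")                 # assumed H x W x 3 test image
flat = img.reshape(-1, 3).astype(np.float64)    # one row per pixel
labels, centers = meanShift(flat)
segmented = centers[labels].reshape(img.shape)  # recolour each pixel with its cluster centre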
def clusterise_data(data_obj):
    """ Assigns a cluster label to each day present in the data received,
    using one of three algorithms: MeanShift, Affinity Propagation, or KMeans.
    @param data_obj: List of dictionaries
    """
    L = len(data_obj)
    # Simply converts data_obj to a 2D list for computation
    List2D = [[None for _ in range(4)] for _ in range(L - 1)]
    for i in range(L - 1):  # don't include the current day
        # wake_up and sleep_duration are the most important factors
        List2D[i][0] = 5 * data_obj[i]["wake_up"]
        List2D[i][1] = 1 * data_obj[i]["sleep"]
        List2D[i][2] = 5 * data_obj[i]["sleep_duration"]
        List2D[i][3] = 0.5 * data_obj[i]["activity"]
    points = NumpyArray(List2D)  # converts the 2D list to a numpy array

    if ALGO == "Affinity Propagation":
        labels = AffinityPropagation().fit_predict(points)
    elif ALGO == "KMeans":
        labels = KMeans(init='k-means++', n_clusters=5, n_init=10).fit_predict(points)
    elif ALGO == "MeanShift":
        bandwidth = estimate_bandwidth(points, quantile=0.2, n_samples=20)
        labels = MeanShift(bandwidth=bandwidth, bin_seeding=True).fit_predict(points)
    else:
        raise Exception("Algorithm not defined: " + str(ALGO))

    for i in range(L - 1):
        data_obj[i]["cluster"] = labels[i]
    for unique_label in remove_duplicates(labels):
        debug_print(ALGO + ": Cluster " + str(unique_label) + " contains "
                    + str(labels.tolist().count(unique_label)) + " data points")
    debug_print(ALGO + ": Silhouette coefficient: "
                + str(metrics.silhouette_score(points, labels, metric='euclidean') * 100) + "%")
def call_kmean(num_cluster, data, update_flag):
    X = StandardScaler().fit_transform(data)
    bandwidth = cluster.estimate_bandwidth(X, quantile=0.3)
    two_means = MiniBatchKMeans(n_clusters=num_cluster)
    labels = two_means.fit(X).labels_.astype(np.int)

    # if the user uploaded files
    if update_flag:
        return labels

    label_dict = {}
    label_dict_count = 0
    for label in labels:
        label_dict[str(label_dict_count)] = float(label)
        label_dict_count = label_dict_count + 1
    print label_dict

    unique_dict = {}
    unique_dict_count = 0
    for uniq in np.unique(labels):
        print uniq
        unique_dict[str(unique_dict_count)] = float(uniq)
        unique_dict_count = unique_dict_count + 1
    print unique_dict

    return label_dict, unique_dict
def do_meanshift(s_path, band1, band2, band3, band4, colour1, colour2,
                 make_plot):
    '''Meanshift clustering to determine the number of clusters in the data,
    which is passed to the KMEANS function'''
    # Truncate data
    X = np.vstack([colour1, colour2]).T

    # Compute clustering with MeanShift.
    # Scale data because meanshift generates circular clusters
    X_scaled = preprocessing.scale(X)
    # The following bandwidth can be automatically detected using
    # estimate_bandwidth(); it can also be set manually. It is estimated on
    # the scaled data, since that is what MeanShift is fitted on.
    bandwidth = estimate_bandwidth(X_scaled)
    # bandwidth = 0.65

    # Meanshift clustering
    ms = MeanShift(bandwidth=bandwidth, bin_seeding=True, cluster_all=False)
    ms.fit(X_scaled)
    labels_unique = np.unique(ms.labels_)
    objects = ms.labels_[ms.labels_ >= 0]
    n_clusters = len(labels_unique[labels_unique >= 0])

    # Make plot
    if "meanshift" in make_plot:
        make_ms_plots(s_path, colour1, colour2, n_clusters, X, ms,
                      band1, band2, band3, band4, objects)
    return n_clusters, bandwidth
def BA_meanshift_cluster(mark, chrom):
    '''
    @param:
    @return:
    Perform mean shift clustering on 2D data:
    ((chromStart + chromEnd) * 0.5, chromEnd - chromStart)
    '''
    path = os.path.join(get_data_dir(), "tmp", mark,
                        "{0}-{1}.csv".format(chrom, mark))
    DF = pd.read_csv(path, sep='\t')
    S_x = 0.5 * (DF.loc[:, 'chromEnd'].values + DF.loc[:, 'chromStart'].values)
    S_y = DF.loc[:, 'chromEnd'].values - DF.loc[:, 'chromStart'].values
    X = np.hstack((np.atleast_2d(S_x[7000:8000]).T,
                   np.atleast_2d(S_y[7000:8000]).T))
    print X
    bandwidth = estimate_bandwidth(X, quantile=0.1, n_samples=1000)
    ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
    ms.fit(X)
    labels = ms.labels_
    print list(set(labels))

    import matplotlib.pyplot as plt
    from itertools import cycle
    colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
    for k, col in zip(range(len(list(set(labels)))), colors):
        my_members = labels == k
        plt.plot(X[my_members, 0], X[my_members, 1], col + '.')
    plt.title('Estimated number of clusters: %d' % len(list(set(labels))))
    plt.show()
def mean_shift_cluster_analysis(x, y, quantile=0.2, n_samples=1000):
    # ADAPTED FROM:
    # http://scikit-learn.org/stable/auto_examples/cluster/plot_mean_shift.html#example-cluster-plot-mean-shift-py
    # The following bandwidth can be automatically detected using estimate_bandwidth
    X = np.hstack((x.reshape((x.shape[0], 1)), y.reshape((y.shape[0], 1))))
    bandwidth = estimate_bandwidth(X, quantile=quantile, n_samples=n_samples)
    ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
    ms.fit(X)
    labels = ms.labels_
    cluster_centers = ms.cluster_centers_
    labels_unique = np.unique(labels)
    n_clusters_ = len(labels_unique)
    # print("number of estimated clusters : %d" % n_clusters_)
    colors = 'bgrcmykbgrcmykbgrcmykbgrcmykbgrcmykbgrcmykbgrcmyk'  # cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
    for i in xrange(len(np.unique(labels))):
        my_members = labels == i
        cluster_center = cluster_centers[i]
        plt.scatter(X[my_members, 0], X[my_members, 1], s=90,
                    c=colors[i], alpha=0.7)
        plt.scatter(cluster_center[0], cluster_center[1], marker='+',
                    s=280, c=colors[i])
    tolx = (X[:, 0].max() - X[:, 0].min()) * 0.03
    toly = (X[:, 1].max() - X[:, 1].min()) * 0.03
    plt.xlim(X[:, 0].min() - tolx, X[:, 0].max() + tolx)
    plt.ylim(X[:, 1].min() - toly, X[:, 1].max() + toly)
    plt.show()
    return labels
def mean(X, save_fig=False, params_labels=None, prefix='clusters'):
    ''' Compute clustering with MeanShift '''
    logger.debug('Calculating MeanShift clusters using %d parameters' % len(X[0]))
    X = np.array(X)
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        bandwidth = estimate_bandwidth(X, quantile=0.2)
        ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
        ms.fit(X)
    labels = ms.labels_
    if save_fig:
        plotClusters(X, ms, method='mean', prefix=prefix, params=params_labels)
    labels_unique = np.unique(labels)
    n_clusters_ = len(labels_unique)
    logger.debug('Found %d clusters with MeanShift algorithm' % n_clusters_)
    return labels
def clustering(matrix, lst, blst):
    dblabel = cluster.DBSCAN(eps=8E-4).fit_predict(matrix)
    dblabel = select(dblabel)
    print("DBScan finished.")
    kmlabel = cluster.KMeans(n_clusters=300).fit_predict(matrix)
    kmlabel = select(kmlabel)
    print("KMeans finished.")
    bw = cluster.estimate_bandwidth(matrix, quantile=0.01, n_samples=1000)
    ms = cluster.MeanShift(bandwidth=bw)
    mslabel = ms.fit_predict(matrix)
    mslabel = select(mslabel)
    print("MeanShift finished.")
    bc = cluster.Birch(threshold=0.01)
    bmat = matrix.tolist()
    bclabel = bc.fit_predict(bmat)
    bclabel = select(bclabel)
    print("Birch finished.")

    intesec = []
    suspct = []
    c = 0
    for i in range(len(matrix)):
        # if bclabel[i]:
        #     c += 1
        # if mslabel[i]:
        if dblabel[i] and kmlabel[i] and mslabel[i] and bclabel[i]:
            intesec.append(lst[i])
        if dblabel[i] or kmlabel[i] or mslabel[i] or bclabel[i]:
            suspct.append(lst[i])
    print(str(c))
    return intesec, suspct
def meanshift(raw_data, t):
    # Compute clustering with MeanShift.
    # The following bandwidth can be automatically detected using estimate_bandwidth.
    # data = [[(raw_data[i, 1] + raw_data[i, 5]), (raw_data[i, 2] + raw_data[i, 6])]
    #         for i in range(raw_data.shape[0])]
    data = np.zeros((raw_data.shape[0], 2))
    X = raw_data[:, 1] + raw_data[:, 5]
    Y = raw_data[:, 2] + raw_data[:, 6]
    # X = raw_data[:, 1]; Y = raw_data[:, 2]
    data = np.transpose(np.concatenate((np.mat(X), np.mat(Y)), axis=0))
    bandwidth = estimate_bandwidth(data, quantile=0.2, n_samples=500)
    ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
    ms.fit(data)
    labels = ms.labels_
    cluster_centers = ms.cluster_centers_
    labels_unique = np.unique(labels)
    n_clusters_ = len(labels_unique)
    print("number of estimated clusters : %d" % n_clusters_)

    # Plot result
    plt.figure(t)
    plt.clf()
    colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
    for k, col in zip(range(n_clusters_), colors):
        my_members = labels == k
        cluster_center = cluster_centers[k]
        plt.plot(data[my_members, 0], data[my_members, 1], col + '.')
        plt.plot(cluster_center[0], cluster_center[1], 'o',
                 markerfacecolor=col, markeredgecolor='k', markersize=14)
    plt.title('Estimated number of clusters: %d' % n_clusters_)
    plt.axis('equal')
    plt.show()
def mean_shift(X):
    bandwidth = estimate_bandwidth(X, quantile=0.2, n_samples=1000)
    ms = MeanShift(bandwidth=bandwidth, bin_seeding=True, cluster_all=False)
    ms.fit(X)
    labels = ms.labels_
    cluster_centers = ms.cluster_centers_
    return labels, cluster_centers
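# Usage note for mean_shift above (illustrative, not from the original code):
# because cluster_all=False, points that fall outside every kernel are
# labelled -1 and may need to be filtered out by the caller.
import numpy as np
from sklearn.datasets import make_blobs

X, _ = make_blobs(n_samples=300, centers=3, random_state=42)
labels, centers = mean_shift(X)
clustered = X[labels != -1]  # drop orphan points
print("%d clusters, %d orphans" % (len(centers), (labels == -1).sum()))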
def train(trainingData, pklFile, clusteringAll, numberOfClusters=None):
    # ========================================================================= #
    # =============== STEP 1. DEFINE OUTPUT LEARNT MODEL FILE ================= #
    # ========================================================================= #
    if pklFile == '':
        os.system('rm -rf learntModel && mkdir learntModel')
        pklFile = 'learntModel/learntModel.pkl'

    # ========================================================================= #
    # =============== STEP 2. PERFORM CLUSTERING TO THE DATA ================== #
    # ========================================================================= #
    if numberOfClusters is None:
        print "Running MeanShift Model..."
        bandwidth = estimate_bandwidth(trainingData)
        ms = MeanShift(bandwidth=bandwidth, bin_seeding=False,
                       cluster_all=clusteringAll)
        ms.fit(trainingData)
        joblib.dump(ms, pklFile)
        return {"numberOfClusters": len(ms.cluster_centers_),
                "labels": ms.labels_,
                "clusterCenters": ms.cluster_centers_}
    else:
        print "Running K-Means Model..."
        kMeans = KMeans(init='k-means++', n_clusters=numberOfClusters)
        kMeans.fit(trainingData)
        joblib.dump(kMeans, pklFile)
        return {"numberOfClusters": len(kMeans.cluster_centers_),
                "labels": kMeans.labels_,
                "clusterCenters": kMeans.cluster_centers_}
def simplify_data1(x):
    X = np.array(zip(x, np.zeros(len(x))), dtype=np.float)
    bandwidth = estimate_bandwidth(X, quantile=0.2)
    ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
    ms.fit(X)
    labels = ms.labels_
    cluster_centers = ms.cluster_centers_
    labels_unique = np.unique(labels)
    n_clusters_ = len(labels_unique)
    # print n_clusters_
    # exit()
    start = 0
    value = 0
    print x
    for k in range(n_clusters_):
        my_members = labels == k
        print "cluster {0}: {1}".format(k, X[my_members, 0]), np.average(X[my_members, 0])
        value = np.average(X[my_members, 0])
        val2 = 0
        for i in xrange(start, start + len(X[my_members, 0])):
            val2 += X[i][0]
            print val2, X[i][0], i
            X[i][0] = value
        print "FINAL", val2 / len(X[my_members, 0])
        start += len(X[my_members, 0])
    return X[:, 0]
def meanshift_for_hough_line(self):
    # init mean shift
    pixels_of_label = {}
    points_of_label = {}
    for hough_line in self.points_of_hough_line:
        pixels = self.pixels_of_hough_line[hough_line]
        pixels = np.array(pixels)
        bandwidth = estimate_bandwidth(pixels, quantile=QUANTILE, n_samples=500)
        if bandwidth == 0:
            bandwidth = 2
        ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
        ms.fit(pixels)
        labels = ms.labels_
        labels_unique = np.unique(labels)
        n_clusters_ = len(labels_unique)
        for k in range(n_clusters_):
            label = list(hough_line)
            label.append(k)
            pixels_of_label[tuple(label)] = map(tuple, pixels[labels == k])
    for label in pixels_of_label:
        pixels = pixels_of_label[label]
        points = map(self.img.get_bgr_value, pixels)
        points_of_label[label] = points
    self.pixels_of_hough_line = pixels_of_label
    self.points_of_hough_line = points_of_label
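# General note on the bandwidth == 0 guard above (illustrative sketch, not from
# the original code): estimate_bandwidth returns 0.0 when every sampled point's
# k-th nearest neighbour is a duplicate, and MeanShift raises a ValueError for
# a non-positive bandwidth, so a fallback value is required.
import numpy as np
from sklearn.cluster import MeanShift, estimate_bandwidth

pts = np.vstack([np.tile([1.0, 1.0], (30, 1)), np.tile([5.0, 5.0], (30, 1))])
bw = estimate_bandwidth(pts, quantile=0.2)  # 0.0 here: all k-th neighbours are duplicates
if bw <= 0:
    bw = 2  # same fallback as the method above
labels = MeanShift(bandwidth=bw, bin_seeding=True).fit(pts).labels_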
def make(filename, precision):
    with open('test.geojson') as f:
        data = json.load(f)
    features = data['features']
    points = [geo['geometry']["coordinates"] for geo in features if pred(geo)]
    print points
    ar_points = array(points).reshape(len(points) * 2, 2)
    print ar_points
    bandwidth = estimate_bandwidth(ar_points) / precision
    cluster = MeanShift(bandwidth=bandwidth)
    cluster.fit(ar_points)
    labels = cluster.labels_
    cluster_centers = cluster.cluster_centers_
    print 'clusters:', len(unique(labels))
    for i, geo in enumerate(filter(pred, features)):
        geo['geometry']["coordinates"] = [
            list(cluster_centers[labels[i * 2 + j]]) for j in range(2)
        ]
    with open(filename, 'w') as f:
        json.dump(data, f)
def Mean_Shift(path):
    # Import the data
    data = pandas.read_csv(filepath_or_buffer=path, delimiter=',', encoding='utf-8')
    data = data.drop_duplicates()  # drop_duplicates returns a copy, so assign it back
    print(data)

    # Read the latitude/longitude values
    values = data[['latitude', 'longitude']].values
    print("printing values")
    print(values)

    # Mean shift
    print("Clustering data with the Meanshift algorithm")
    bandwidth = estimate_bandwidth(values, quantile=0.003, n_samples=None)
    # ms = MeanShift(bandwidth=bandwidth, bin_seeding=True, min_bin_freq=20, cluster_all=False)
    ms = MeanShift(bandwidth=bandwidth, bin_seeding=True, min_bin_freq=25, cluster_all=False)
    ms.fit(values)
    data['cluster'] = ms.labels_
    data = data.sort(columns='cluster')
    data = data[(data['cluster'] != -1)]
    print(data['cluster'])
    data['cluster'] = data['cluster'].apply(lambda x: "cluster" + str(x))
    labels_unique = np.unique(ms.labels_).tolist()
    del labels_unique[0]

    # Filter cluster centers according to the data filter
    cluster_centers = DataFrame(ms.cluster_centers_, columns=['latitude', 'longitude'])
    cluster_centers['cluster'] = labels_unique
    print(cluster_centers)
    n_centers_ = len(cluster_centers)
    print("number of clusters is: %d" % n_centers_)

    # print("Exporting clusters to {}...".format(clusters_file))
    data.to_csv(path_or_buf="output/points.csv",
                cols=['user', 'latitude', 'longitude', 'cluster', 'picture', 'datetaken'],
                encoding='utf-8')
    # print("Exporting cluster centers to {}...".format(centers_file))
    cluster_centers['cluster'] = cluster_centers['cluster'].apply(lambda x: "cluster" + str(x))
    cluster_centers.to_csv(path_or_buf="output/centers.csv",
                           cols=['latitude', 'longitude', 'cluster'],
                           encoding='utf-8')
    plot_meanshift(data, cluster_centers, n_centers_)
    return 0
def perform_mean_shift(data):
    X = np.c_[data]
    (n_samples, n_features) = X.shape
    bandwidth = cluster.estimate_bandwidth(X, n_samples=n_samples)
    ms = cluster.MeanShift(bandwidth=bandwidth, bin_seeding=True, cluster_all=False)
    ms.fit(X)
    import pdb
    pdb.set_trace()
def meanShiftClustering(centers_df, subject):
    # Estimate the bandwidth to use with the mean shift algorithm. The quantile
    # represents the distance used between the box centers to define a cluster:
    # a smaller quantile means a smaller distance between points that end up in
    # the same cluster.
    centers_df = centers_df.reset_index()
    bandwidth = estimate_bandwidth(
        centers_df[['center_x', 'center_y']].as_matrix(), quantile=0.0055)
    # instantiate the mean shift algorithm
    ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
    # fit the algorithm on the box center coordinates
    ms.fit(centers_df[['center_x', 'center_y']])
    # get the resulting cluster labels
    labels = ms.labels_
    # get the resulting center of each *cluster*
    cluster_centers = ms.cluster_centers_
    labels_unique = np.unique(labels)
    # the number of clusters is the length of the list of unique labels
    n_clusters_ = len(labels_unique)
    # concatenate the centers data frame (which contains all the box coordinates,
    # their dimensions, and their centers) with the labels generated by the clustering
    boxes_df = pd.concat([centers_df,
                          pd.DataFrame(labels, columns=['cluster_label'])], axis=1)
    # the aggregate function in the groupby includes two functions: count and median
    f = {'Number of boxes in a cluster': ['count'], 'Median': ['median']}
    # group by the label of each cluster and aggregate the boxes' top-left
    # coordinates and dimensions by applying the median
    aggregated_df = boxes_df.groupby('cluster_label')[
        'cluster_label', 'tl_x', 'tl_y', 'width', 'height'].agg(f).reset_index()
    # rename columns to more descriptive names
    aggregated_df.columns = ['cluster_label', 'median_cluster_label', 'agg_tl_x',
                             'agg_tl_y', 'agg_width', 'agg_height',
                             'boxes_in_cluster', 'count_tl_x', 'count_tl_y',
                             'count_width', 'count_height']
    # leave out the unnecessary columns
    aggregated_df = aggregated_df[['cluster_label', 'agg_tl_x', 'agg_tl_y',
                                   'agg_width', 'agg_height', 'boxes_in_cluster']]
    # Look at the output of the plotBoxes function (svg file) and determine at
    # which THRESHOLD value there is a desired number of clusters (it appears
    # at the top of the plot) and that it visually matches the actual grid.
    THRESHOLD = 5
    # filter out all the clusters that have fewer than a certain number of boxes;
    # use the old-weather-aggregator-with-plot.py script to check what the best
    # threshold is
    aggregated_df = aggregated_df.loc[aggregated_df.boxes_in_cluster > THRESHOLD, :]
    good_clusters = np.unique(aggregated_df.cluster_label.values)
    print "for subject_id:" + str(subject)
    print "number of estimated clusters overall: %d" % n_clusters_
    print "number of estimated clusters, after small clusters were filtered out: %d" % len(good_clusters)
    print "clusters with more than %d boxes per cluster:" % THRESHOLD
    print aggregated_df.columns
    print aggregated_df.head()
    # save the aggregated boxes and their clusters into a csv file,
    # one file per subject
    print "Saving the output/aggregated_df_%s.csv file..." % str(subject)
    aggregated_df.to_csv("output/aggregated_df_" + str(subject) + ".csv", index=False)
    # keep only the boxes that belong to the good_clusters (those with more
    # boxes than the threshold) in boxes_df, then save the dataframe
    boxes_df = boxes_df.loc[boxes_df['cluster_label'].isin(good_clusters), :]
    print "Saving the output/clustered_df_%s.csv file..." % str(subject)
    boxes_df.to_csv("output/clustered_df_" + str(subject) + ".csv", index=False)
    plotBoxes(aggregated_df, boxes_df, cluster_centers)
def mean_shift(data, bandwidth=None, n_samples=500, quantile=0.3):
    # estimate a bandwidth only when the caller did not supply one
    if bandwidth is None:
        bandwidth = skcluster.estimate_bandwidth(data, quantile=quantile,
                                                 n_samples=n_samples)
    ms = skcluster.MeanShift(bandwidth=bandwidth).fit(data)
    labels = ms.labels_
    return labels
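# Hypothetical usage of mean_shift above, exercising both call paths; the
# synthetic blobs are illustrative only.
import numpy as np
from sklearn.datasets import make_blobs

X, _ = make_blobs(n_samples=400, centers=3, random_state=1)
labels_auto = mean_shift(X)                  # bandwidth estimated internally
labels_fixed = mean_shift(X, bandwidth=1.5)  # caller-supplied bandwidth
print(len(np.unique(labels_auto)), len(np.unique(labels_fixed)))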
def do_meanshift(band1, band2, band3, band4, colour1, colour2, make_plots):
    '''Does meanshift clustering to determine the number of clusters in the
    data, which is passed to the KMEANS function'''
    data = np.loadtxt(inputdata)

    # Input checking
    # if band1 == band2 or band3 == band4:
    #     print "Not a good idea to use the same band in one colour, try again"
    #     return
    # for band in [band1, band2, band3, band4]:
    #     if band not in band_names.keys():
    #         print "Can't find %s in band_name list" % band
    #         return

    # Import 4 different wavelengths
    # Colour 1: 05_mag
    wave1 = data[:, band_names[band1]]
    wave2 = data[:, band_names[band2]]
    # Colour 2: 05_mag
    wave3 = data[:, band_names[band3]]
    wave4 = data[:, band_names[band4]]

    # Remove data points with no value
    gooddata1 = np.logical_and(np.logical_and(wave1 != badval, wave2 != badval),
                               np.logical_and(wave3 != badval, wave4 != badval))
    gooddata2 = np.logical_and(np.logical_and(wave1 < maglim, wave2 < maglim),
                               np.logical_and(wave3 < maglim, wave4 < maglim))
    greatdata = np.logical_and(gooddata1, gooddata2)
    colour1 = wave1[greatdata] - wave2[greatdata]
    colour2 = wave3[greatdata] - wave4[greatdata]

    # Truncate data
    X = np.vstack([colour1, colour2]).T
    # Scale data because meanshift generates circular clusters
    X_scaled = preprocessing.scale(X)
    # The following bandwidth can be automatically detected using
    # estimate_bandwidth(); it can also be set manually. It is estimated on
    # the scaled data, since that is what MeanShift is fitted on.
    bandwidth = estimate_bandwidth(X_scaled)
    # Meanshift clustering
    ms = MeanShift(bandwidth=bandwidth, bin_seeding=True, cluster_all=False)
    ms.fit(X_scaled)
    labels_unique = np.unique(ms.labels_)
    n_clusters = len(labels_unique[labels_unique >= 0])

    # Make plot of clusters if needed
    if "MSplot" in make_plots:
        make_ms_plots(colour1, colour2, n_clusters, X, ms,
                      band1, band2, band3, band4)
    return n_clusters
def checkForClustering(catalog):
    debug("Checking for data clustering")
    Xfull = catalog.view(np.float64).reshape(catalog.shape + (-1,))[:, 1:]
    X = Xfull[:, 2:]

    debug("Using DBSCAN")
    db = DBSCAN(eps=0.3, min_samples=10).fit(X)
    core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
    core_samples_mask[db.core_sample_indices_] = True
    labels = db.labels_
    n_clusters_DBSCAN = len(set(labels)) - (1 if -1 in labels else 0)
    debug('Estimated number of clusters with DBSCAN: %d' % n_clusters_DBSCAN)
    unique_labelsDBSCAN = set(labels)
    colorsDBSCAN = plt.cm.rainbow(np.linspace(0, 1, len(unique_labelsDBSCAN)))

    debug("Estimating clusters using MeanShift")
    bandwidth = estimate_bandwidth(X, quantile=0.2, n_samples=500)
    ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
    ms.fit(X)
    labelsMS = ms.labels_
    cluster_centers = ms.cluster_centers_
    labels_uniqueMS = np.unique(labelsMS)
    n_clusters_MS = len(labels_uniqueMS)
    debug("Estimated number of clusters with MeanShift: %d" % n_clusters_MS)

    # Plot result
    fig = plt.figure(figsize=(12, 12))
    ax0 = fig.add_subplot(2, 2, 1)
    ax1 = fig.add_subplot(2, 2, 2)
    ax2 = fig.add_subplot(2, 2, 3)
    ax3 = fig.add_subplot(2, 2, 4)
    for k, col in zip(unique_labelsDBSCAN, colorsDBSCAN):
        if k == -1:
            col = 'k'
        class_member_mask = (labels == k)
        mask = class_member_mask & core_samples_mask
        xy = Xfull[mask]
        ax0.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=col,
                 markeredgecolor='k', markersize=5)
        ax2.plot(catalog['MAG_APER(1)'][mask], catalog['CLASS_STAR'][mask],
                 'o', markerfacecolor=col, markeredgecolor='k', markersize=5)
        xy = Xfull[class_member_mask & ~core_samples_mask]
        ax0.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=col,
                 markeredgecolor='k', markersize=5)
        ax2.plot(catalog['MAG_APER(1)'][class_member_mask & ~core_samples_mask],
                 catalog['CLASS_STAR'][class_member_mask & ~core_samples_mask],
                 'o', markerfacecolor=col, markeredgecolor='k', markersize=5)
    ax0.set_title('DBSCAN: # clusters: %d' % n_clusters_DBSCAN)

    colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
    for k, col in zip(range(n_clusters_MS), colors):
        my_members = labelsMS == k
        cluster_center = cluster_centers[k]
        ax1.plot(Xfull[my_members, 0], Xfull[my_members, 1], col + '.')
        ax3.plot(catalog['MAG_APER(1)'][my_members],
                 catalog['CLASS_STAR'][my_members], col + '.')
        # ax1.plot(cluster_center[0], cluster_center[1], 'o',
        #          markerfacecolor=col, markeredgecolor='k', markersize=14)
    ax1.set_title('MeanShift: # clusters: %d' % n_clusters_MS)
    plt.show()
def MSclusterer(X):
    X = X.toarray()
    bandwidth = estimate_bandwidth(X, quantile=0.04, n_samples=500)
    ms = MeanShift(bandwidth=bandwidth, bin_seeding=False, cluster_all=False)
    ms.fit(X)
    labels = ms.labels_
    labels_unique = np.unique(labels)
    n_clusters_ = len(labels_unique)
    print(n_clusters_)
    return ms.labels_
def mean_shift_clustering(features):
    bandwidth = estimate_bandwidth(features, quantile=0.2, n_samples=500)
    ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
    ms.fit(features)
    labels = ms.labels_
    cluster_centers = ms.cluster_centers_
    labels_unique = np.unique(labels)
    n_clusters = len(labels_unique)
    print("-- # of clusters: %d" % n_clusters)
    return labels
def fit_mean_shift_object(x, y, quantile=.005):
    '''
    Given x, y lists of the x and y coordinates of points, fit a MeanShift
    cluster object to these points.
    '''
    points = make_coordinates(x=x, y=y)
    bandwidth = estimate_bandwidth(points, quantile=quantile)
    ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
    ms.fit(points)
    return ms
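# make_coordinates is not defined in this snippet; a minimal sketch under the
# assumption that it simply stacks the two coordinate lists into an (n, 2)
# array, which is the shape estimate_bandwidth and MeanShift expect.
import numpy as np

def make_coordinates(x, y):
    # column-stack the x and y lists so each row is one (x, y) point
    return np.column_stack((np.asarray(x, dtype=float),
                            np.asarray(y, dtype=float)))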
def zoom_neutrophils(self):
    img_size = min(self.detector.img.shape[0:2])
    neutrophils_objects = []
    # get neutrophil objects
    for i, cell_coordinates in enumerate(self.detector.overlays):
        prediction = self.convnet.predict(self.detector.cells_images[i])
        if prediction == 2:
            neutrophils_objects.append(cell_coordinates)
    if len(neutrophils_objects) == 0:
        return None, None

    quantile = 0.4 / (2 ** min(floor(img_size / 1000.0), 3))
    bandwidth = estimate_bandwidth(neutrophils_objects, quantile=quantile)
    if bandwidth <= 0:
        return None, None
    else:
        ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
        ms.fit(neutrophils_objects)
        labels = ms.predict(neutrophils_objects)
        clusters = []
        for label in np.unique(labels):
            clusters.append([
                neutrophils_objects[i]
                for i, lbl in enumerate(labels) if lbl == label
            ])

        def density_function(x, objects, ms_density, convnet_density):
            current_index = objects.index(x)
            bias_density = int(convnet_density.img_size / 2)
            center_density = (
                int(ms_density.cluster_centers_[current_index][0]) + bias_density,
                int(ms_density.cluster_centers_[current_index][1]) + bias_density)
            radius_density = int(
                max([np.linalg.norm(np.array(coord) - np.array(center_density))
                     for coord in x]))
            area = pi * radius_density ** 2
            density = len(x) / area if len(x) > 1 else 0
            return density

        object_in_max_clusters = max(
            clusters,
            key=lambda x: density_function(x, clusters, ms, self.convnet))
        index_of_maximum = clusters.index(object_in_max_clusters)
        bias = int(self.convnet.img_size / 2)
        # get the optimal center and radius of the cluster
        center = (int(ms.cluster_centers_[index_of_maximum][0]) + bias,
                  int(ms.cluster_centers_[index_of_maximum][1]) + bias)
        radius = int(
            max([np.linalg.norm(np.array(coord) - np.array(center))
                 for coord in object_in_max_clusters]))
        return center, radius + bias
    # print(predict)
    print('%-9s\t%.2fs\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f' % (
        name, (time() - t0),
        metrics.homogeneity_score(labels, predict),
        metrics.completeness_score(labels, predict),
        metrics.v_measure_score(labels, predict),
        metrics.adjusted_rand_score(labels, predict),
        metrics.adjusted_mutual_info_score(labels, predict,
                                           average_method='arithmetic'),
        # metrics.silhouette_score(data, predict,
        #                          metric='euclidean',
        #                          sample_size=n_samples)
    ))
    del estimator


AllAlgorithm(AgglomerativeClustering(n_clusters=n_digits, linkage='ward'),
             name="AC", data=data)
bandwidth = estimate_bandwidth(data, quantile=0.3, n_samples=sample_size)

print(39 * '_' + 'PCA' + 40 * '_')
reduced_data = PCA(n_components=10).fit_transform(data)
bandwidth = estimate_bandwidth(data, quantile=0.3, n_samples=sample_size)
AllAlgorithm(AgglomerativeClustering(n_clusters=n_digits, linkage='ward'),
             name="AC", data=reduced_data)
print(82 * '_')
from sklearn.cluster import MeanShift
import sklearn.cluster as cluster
from sklearn.decomposition import PCA

n_cluster_num = 3
clusterer = KMeans(n_clusters=n_cluster_num, random_state=10)
cluster_labels_Kmeans = clusterer.fit_predict(df_feature)

clusterer = DBSCAN(eps=0.45)
cluster_label_DBScan = clusterer.fit_predict(df_feature)

clusterer = Birch(n_clusters=n_cluster_num)
cluster_label_Birch = clusterer.fit_predict(df_feature)

bandwidth = cluster.estimate_bandwidth(df_feature, quantile=0.15)
clusterer = MeanShift(bin_seeding=True, bandwidth=bandwidth)
cluster_label_MeanShift = clusterer.fit_predict(df_feature)

a_pca = PCA(n_components=3)
data_pca = a_pca.fit_transform(df_feature)
Y = wine.target

# Let's start a new figure (plt.figure)!
plt.figure(figsize=(20, 5))
# Let's draw with plt.scatter!
# Hint: X_pca[:, 0], X_pca[:, 1], c=Y
plt.subplot(151)
plt.xlabel("Kmeans")
plt.scatter(data_pca[:, 0], data_pca[:, 1], c=cluster_labels_Kmeans)
# 1.
# iris_data = pd.read_csv('iris_data.csv')
iris_data = pd.read_excel('iris_data.xlsx')
# print(iris_data)

# 2.
iris_data = pd.get_dummies(iris_data, columns=['Species'])
# print(iris_data)

# 3.
virginica = iris_data.loc[iris_data['Species_I. virginica'] == 1]
versicolor = iris_data.loc[iris_data['Species_I. versicolor'] == 1]
setosa = iris_data.loc[iris_data['Species_I. setosa'] == 1]

# virginica.plot.scatter(x=0, y=1, c='r')
# versicolor.plot.scatter(x=0, y=1, c='b')
# setosa.plot.scatter(x=0, y=1, c='g')
plt.scatter(x=virginica['Sepal length'], y=virginica['Sepal width'], color='r')
plt.scatter(x=versicolor['Sepal length'], y=versicolor['Sepal width'], color='g')
plt.scatter(x=setosa['Sepal length'], y=setosa['Sepal width'], color='b')
# plt.show()

# 4.
print(estimate_bandwidth(virginica, quantile=0.2))
print(estimate_bandwidth(versicolor, quantile=0.2))
print(estimate_bandwidth(setosa, quantile=0.2))
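# Illustrative aside (not from the original code): in estimate_bandwidth the
# bandwidth is the average distance to each point's (quantile * n_samples)-th
# nearest neighbour, so larger quantiles yield larger bandwidths and hence
# fewer, coarser mean-shift clusters.
import numpy as np
from sklearn.cluster import estimate_bandwidth
from sklearn.datasets import make_blobs

X_demo, _ = make_blobs(n_samples=300, centers=3, random_state=0)
for q in (0.1, 0.3, 0.5):
    print(q, estimate_bandwidth(X_demo, quantile=q))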
def color_quantization_click_and_select(path):
    # Recreate image
    def recreate_image(codebook, labels, w, h):
        """Recreate the (compressed) image from the code book & labels"""
        d = codebook.shape[1]
        image = np.zeros((w, h, d))
        label_idx = 0
        for i in range(w):
            for j in range(h):
                image[i][j] = codebook[labels[label_idx]]
                label_idx += 1
        return image

    # Read image with cv2
    path = '/home/yizi/Documents/phd/historical_map_project/image_generator/BHdV_PL_ATL20Ardt_1929_0003/image_batches/_05_06.tiff'
    image = cv2.imread(path)
    # Gaussian blur to smooth noisy pixels
    image = cv2.GaussianBlur(image, (3, 3), cv2.BORDER_DEFAULT)
    image_file_name = os.path.basename(path).split('.')[0]

    # Change color space from BGR to HLS
    image = cv2.cvtColor(image, cv2.COLOR_BGR2HLS)
    image = np.array(image)  # change the image object into an array
    plt.imshow(image)
    plt.show()

    # The following bandwidth can be automatically detected using estimate_bandwidth
    image_reshape = image.reshape((image.shape[0] * image.shape[1], image.shape[2]))
    bandwidth = estimate_bandwidth(image_reshape, quantile=0.1, n_samples=500)
    ms = MeanShift(bandwidth=bandwidth, bin_seeding=True, n_jobs=10)
    ms.fit(image_reshape)
    labels = ms.predict(image_reshape)
    cluster_centers = ms.cluster_centers_.astype(np.uint8)
    labels_unique = np.unique(labels)
    n_clusters_ = len(labels_unique)
    image = recreate_image(cluster_centers, labels,
                           image.shape[0], image.shape[1]).astype(np.uint8)
    print("Reduced colors through mean-shift: %d" % n_clusters_)

    select_color = color_picker(image)
    print(select_color)

    @nb.njit
    def euc(a, b):
        return ((b - a) ** 2).sum(axis=0) ** 0.5

    segmentation_image = {key: None for key in list(select_color.keys())}
    for key, value in select_color.items():
        image_copy = image.copy().astype(np.uint8)
        if list(value) == list(np.array([0, 0, 0])):
            segmentation_image[key] = np.zeros((image.shape[0], image.shape[1],
                                                image.shape[2]))
        # Define the color you're looking for
        pattern = np.array(value).astype(np.uint8)
        # Make a mask to use with np.where
        mask = (image_copy == pattern).all(axis=2)
        newshape = mask.shape + (1,)
        mask = mask.reshape(newshape)
        image_copy = np.where(mask, [255, 255, 255], [0, 0, 0])
        image_copy = image_copy.astype(np.uint8)
        if key == 'red_legend' or key == 'black_text':
            image_copy = remove_noise_pixels(image_copy, threshold=15)
        else:
            image_copy = cv2.bitwise_not(image_copy)
            image_copy = remove_noise_pixels(image_copy, threshold=15)
        segmentation_image[key] = image_copy

    fig = plt.figure(figsize=(25, 25))
    for index, (layer, seg_image) in enumerate(segmentation_image.items()):
        ax = fig.add_subplot(2, 2, index + 1)
        ax.imshow(seg_image)
        ax.axis('off')
        ax.set_title(' %s _layer' % layer)
    plt.show()

    for nl in range(len(segmentation_image)):
        current_dir = os.getcwd()
        file_name = path.split('/')[-3]
        image_quantization_result_dir = (str(Path(current_dir).parent)
                                         + '/image_generator/' + file_name
                                         + '/color_quantization_result_batches/'
                                         + str(nl) + '_layer/')
        if not os.path.exists(image_quantization_result_dir):
            os.makedirs(image_quantization_result_dir)
            print("Directory ", image_quantization_result_dir, " created")
        else:
            print("Directory ", image_quantization_result_dir, " already exists")
        save_path = image_quantization_result_dir + image_file_name + '.p'
        # Save the image into a pickle file to save memory
        with open(save_path, 'wb') as handle:
            pickle.dump(segmentation_image[list(segmentation_image.keys())[nl]],
                        handle, protocol=pickle.HIGHEST_PROTOCOL)
def plot_results(sc=0, sr=0, sv=1, sv2=0, srs=0, mode=[], w_positions=False,
                 scale=False, algo_params=[], rng=10,
                 cinds=[0, 2, 3, 4, 7, 8, 9, 10], return_vs=0, return_bars=0,
                 dinds=[0, 1, 2, 3, 4], save=False, title='misc', red=[],
                 out=True, return_AIC=False, fake=True, **kwargs):
    '''
    sc: bool
        show clusters plot
    sr: bool
        show reconstruction plot
    sv: bool
        show v-scores
    mode: string
        [] = pca, 'all' = all, 'de' = dual energy, 'se' = integrating detector
    w_positions: bool
        add positions to the vectors
    scale: bool
        rescale the input; this is important if you add the position data
    default_base: dict
        parameters for the clustering algorithms
    rng: int
        seed
    cinds: list
        which clustering methods to try
    red: string
        which dimensional REDuction: 'ica', 'nmf' or 'tsne'
    '''
    np.random.seed(rng)

    # ============
    # Set up cluster parameters
    # ============
    if sc:
        plt.figure(1, figsize=(9 * 2 + 3, 12.5))
        plt.subplots_adjust(left=.02, right=.98, bottom=.001, top=.96,
                            wspace=.05, hspace=.01)
    if sr:
        plt.figure(2, figsize=(9 * 2 + 3, 12.5))
        plt.subplots_adjust(left=.02, right=.98, bottom=.001, top=.96,
                            wspace=.05, hspace=.01)
    if srs:
        plt.figure(3, figsize=(9 * 2 + 3, 12.5))
        plt.subplots_adjust(left=.02, right=.98, bottom=.001, top=.96,
                            wspace=.05, hspace=.01)
    plot_num = 1

    # This dictionary defines the colormap
    cdict = {
        'red': (
            (0.0, 0.0, 0.0),   # no red at 0
            (0.5, 1.0, 1.0),   # all channels set to 1.0 at 0.5 to create white
            (1.0, 0.8, 0.8)),  # set to 0.8 so it's not too bright at 1
        'green': (
            (0.0, 0.8, 0.8),   # set to 0.8 so it's not too bright at 0
            (0.5, 1.0, 1.0),   # all channels set to 1.0 at 0.5 to create white
            (1.0, 0.0, 0.0)),  # no green at 1
        'blue': (
            (0.0, 0.0, 0.0),   # no blue at 0
            (0.5, 1.0, 1.0),   # all channels set to 1.0 at 0.5 to create white
            (1.0, 0.0, 0.0))   # no blue at 1
    }
    # Create the colormap using the dictionary
    P = color.LinearSegmentedColormap('GnRd', cdict)

    homo, comp, vs, idata, ialgo = [], [], [], [], []
    if fake:
        data = (('glass', 'Glass'), ('pp', 'Poly'), ('bb', 'Bluebelt'),
                ('ptfe', 'PTFE'), ('steel', 'Steel'))
        datasets2 = [data[j] for j in dinds]
    else:
        datasets2 = (('chick_glass', 'Glass'), ('chick_bluebelt', 'Bluebelt'))

    default_base = {
        'quantile': .3,
        'eps': .3,
        'damping': .9,
        'preference': -60,
        'n_neighbors': 2,
        'n_clusters': 2,
        'linkage': 'ward',
        'affinity': "nearest_neighbors",
        'assign_labels': 'kmeans',
        'min_samples': 10,
        'ct': 'spherical',
        'branching': 19,
        'threshold': 0.0001,
        'metric': 'minkowski',
        'asc': False,
        'p': 2,
        'mcs': 10,
        'nc': 2
    }

    aic = []
    for i_dataset, (dataset, dat_name) in enumerate(datasets2):
        # update parameters with dataset-specific values
        params = default_base.copy()
        params.update(algo_params)

        # if outlier_detect:
        #     X = loadmat('2' + dataset)['Z']
        if mode == 'de':
            X = loadmat('all' + dataset)['Z'][:, 1:]
            X = np.column_stack((np.sum(X[:, 1:3], 1), np.sum(X[:, 4:], 1)))
        elif mode == 'se':
            X = loadmat('all' + dataset)['Z'][:, 0]
            X = np.reshape(X, (20 * 68, 1), order="F")
        elif mode == 'all':
            X = loadmat('all' + dataset)['Z'][:, 1:]
        elif mode == 'small':
            X = loadmat('small_' + dataset)['Z']
        elif mode == 'gauss':
            X = loadmat('all' + dataset)['Z'][:, 1:]
            for jj in range(0, 5):
                r2 = np.reshape(X[:, jj], (20, 68), order="F")
                X[:, jj] = np.reshape(gaussian_filter1d(r2, sigma=1),
                                      20 * 68, order="F")
            X = PCA(n_components=2).fit_transform(X)
        else:
            X = loadmat('2' + dataset)['Z']

        label_true = loadmat('2' + dataset + '_mask')['BW']
        if not fake:
            X = X[400:, :].copy()
            label_true = label_true[400:].copy()
        if w_positions:
            xx, yy = np.meshgrid(range(68), range(20))
            x = np.reshape(xx, (20 * 68, 1), order="F")
            y = np.reshape(yy, (20 * 68, 1), order="F")
            X = np.concatenate((X, x, y), axis=1)
        if scale:
            X = StandardScaler().fit_transform(X)
        if red == 'ica':
            X = FastICA(n_components=params['nc'], whiten=True).fit_transform(X)
        if red == 'icapca':
            X = FastICA(n_components=5).fit_transform(X)
            X = PCA(n_components=params['nc']).fit_transform(X)
        if red == 'tsne':
            X = TSNE(n_components=params['nc']).fit_transform(X)
        if red == 'nmf':
            X = NMF(n_components=params['nc']).fit_transform(X)
        if red == 'pca':
            X = PCA(n_components=params['nc']).fit_transform(X)
        if red == 'spec':
            X = SpectralEmbedding(n_components=3).fit_transform(X)

        # estimate bandwidth for mean shift
        # if mode == 'se':
        #     bandwidth = None
        # else:
        bandwidth = cluster.estimate_bandwidth(X, quantile=params['quantile'])

        # if mode != 'se':
        # connectivity matrix for structured Ward
        connectivity = kneighbors_graph(X, n_neighbors=params['n_neighbors'],
                                        include_self=False)
        connectivity = 0.5 * (connectivity + connectivity.T)
        # else:
        #     connectivity = None

        # ============
        # Create cluster objects
        # ============
        ms = cluster.MeanShift(bandwidth=bandwidth, bin_seeding=True)
        two_means = cluster.MiniBatchKMeans(n_clusters=params['n_clusters'])
        ward = cluster.AgglomerativeClustering(n_clusters=params['n_clusters'],
                                               linkage=params['linkage'],
                                               connectivity=connectivity)
        spectral = cluster.SpectralClustering(
            n_clusters=params['n_clusters'],
            eigen_solver='arpack',
            affinity=params['affinity'],
            assign_labels=params['assign_labels'])
        dbscan = cluster.DBSCAN(eps=params['eps'],
                                min_samples=params['min_samples'],
                                metric=params['metric'], p=params['p'])
        affinity_propagation = cluster.AffinityPropagation(
            damping=params['damping'], preference=params['preference'])
        average_linkage = cluster.AgglomerativeClustering(
            linkage="complete", affinity="cityblock",
            n_clusters=params['n_clusters'], connectivity=connectivity)
        birch = cluster.Birch(n_clusters=params['n_clusters'],
                              branching_factor=params['branching'],
                              threshold=params['threshold'])
        gmm = mixture.GaussianMixture(n_components=params['n_clusters'],
                                      covariance_type=params['ct'])
        bgmm = mixture.BayesianGaussianMixture(
            n_components=params['n_clusters'], covariance_type=params['ct'])
        hdb = hdbscan.HDBSCAN(min_samples=params['min_samples'],
                              min_cluster_size=params['mcs'],
                              metric=params['metric'],
                              allow_single_cluster=params['asc'],
                              p=params['p'], **kwargs)

        cinds_all = (('KMeans', two_means),
                     ('AffinityPropagation', affinity_propagation),
                     ('MeanShift', ms),
                     ('SpectralClustering', spectral),
                     ('Ward', ward),
                     ('AgglomerativeClustering', average_linkage),
                     ('DBSCAN', dbscan),
                     ('Birch', birch),
                     ('HDBSCAN', hdb),
                     ('GaussianMixture', gmm),
                     ('BGaussianMixture', bgmm))
        clustering_algorithms = [cinds_all[j] for j in cinds]

        for i_algorithm, (name, algorithm) in enumerate(clustering_algorithms):
            t0 = time.time()
            # catch warnings related to kneighbors_graph
            with warnings.catch_warnings():
                warnings.filterwarnings(
                    "ignore",
                    message="the number of connected components of the "
                    + "connectivity matrix is [0-9]{1,2}"
                    + " > 1. Completing it to avoid stopping the tree early.",
                    category=UserWarning)
                warnings.filterwarnings(
                    "ignore",
                    message="Graph is not fully connected, spectral embedding"
                    + " may not work as expected.",
                    category=UserWarning)
                if hasattr(algorithm, 'condensed_tree_'):
                    pass
                else:
                    algorithm.fit(X)
            t1 = time.time()
            if hasattr(algorithm, 'labels_'):
                y_pred = algorithm.labels_.astype(np.int)
            elif hasattr(algorithm, 'condensed_tree_'):
                y_pred = algorithm.fit_predict(X)
            else:
                y_pred = algorithm.predict(X)
            homo1, comp1, vs1 = homogeneity_completeness_v_measure(
                label_true.squeeze(), y_pred)
            if return_AIC:
                if hasattr(algorithm, 'aic'):
                    aic.append(algorithm.aic(X))

            if sc:
                plt.figure(1)
                plt.rcParams['axes.facecolor'] = P(1 - vs1, alpha=0.5)
                plt.subplot(len(datasets2), len(clustering_algorithms), plot_num)
                if i_dataset == 0:
                    plt.title(name, size=18)
                if i_algorithm == 0:
                    plt.ylabel(dat_name, size=18)
                colors2 = np.array(
                    list(islice(cycle(['r', 'b', '#4daf4a', '#f781bf',
                                       '#a65628', '#984ea3', '#999999',
                                       '#e41a1c', '#dede00']),
                                int(max(y_pred) + 1))))
                # add black color for outliers (if any)
                colors2 = np.append(colors2, ["#000000"])
                # import ipdb; ipdb.set_trace()
                X2 = X[label_true.squeeze() == 0, :]
                plt.scatter(X2[:, 0], X2[:, 1], s=40,
                            color=colors2[y_pred[label_true.squeeze() == 0]])
                X2 = X[label_true.squeeze() == 1, :]
                plt.scatter(X2[:, 0], X2[:, 1], s=40,
                            color=colors2[y_pred[label_true.squeeze() == 1]],
                            marker='x')
                plt.xticks(())
                plt.yticks(())
                plt.text(.99, .01, ('%.2fs' % (t1 - t0)).lstrip('0'),
                         transform=plt.gca().transAxes, size=15,
                         horizontalalignment='right')
                plt.text(.99, .89, ('%.2f' % vs1).lstrip('0'),
                         transform=plt.gca().transAxes, size=15,
                         horizontalalignment='right')

            # if sr | srs:
            #     aa = y_pred[1320]
            #     if aa != 0:
            #         y_pred[y_pred == y_pred.min()] = y_pred.max() + 1
            #         y_pred[y_pred == aa] = y_pred.min()
            if sr:
                plt.figure(2)
                if fake:
                    r = np.reshape(y_pred, (20, 48), order="F")
                else:
                    r = np.reshape(y_pred, (20, 48), order="F")
                plt.subplot(len(datasets2), len(clustering_algorithms), plot_num)
                if i_dataset == 0:
                    plt.title(name, size=18)
                plt.imshow(r)
                plt.set_cmap('bwr')
                plt.xticks([])
                plt.yticks([])
                if i_algorithm == 0:
                    plt.ylabel(dat_name, size=18)
            if srs:
                plt.figure(3)
                r = np.reshape(y_pred, (20, 68), order="F")
                p = np.reshape(label_true, (20, 68), order="F")
                plt.subplot(len(datasets2), len(clustering_algorithms), plot_num)
                # Bin all of the bins to one and zero
                r[r == r.min()] = 0
                r[r > r.min()] = 1
                if i_dataset == 0:
                    plt.title(name, size=18)
                plt.imshow(abs(p - r))
                plt.set_cmap('gray')
                plt.xticks([])
                plt.yticks([])
                if i_algorithm == 0:
                    plt.ylabel(dat_name, size=18)

            homo.append(homo1)
            comp.append(comp1)
            vs.append(vs1)
            idata.append(i_dataset)
            ialgo.append(i_algorithm)
            plot_num += 1

    if save:
        plt.figure(1)
        plt.savefig('scatter_' + title + '.png')
        plt.figure(2)
        plt.savefig('recon_' + title + '.png')
    plt.show()

    if sv:
        plt.rcParams['axes.facecolor'] = (0, 0, 0)
        vs = np.asarray(vs)
        bars = []
        n_components_range = range(len(clustering_algorithms))
        cv_types = [item[1] for item in datasets2]
        color_iter = cycle(['navy', 'turquoise', 'cornflowerblue',
                            'darkorange', 'k'])
        # Plot the V scores
        plt.figure(figsize=(8, 6))
        spl = plt.subplot(1, 1, 1)
        for i, (cv_type, pcolor) in enumerate(zip(cv_types, color_iter)):
            xpos = np.array(n_components_range) + .166 * (i - 2)
            bars.append(
                plt.bar(xpos,
                        vs[i * len(n_components_range):(i + 1) * len(n_components_range)],
                        width=.166, color=pcolor))
        plt.xticks(n_components_range,
                   [item[0] for item in clustering_algorithms])
        plt.xticks()
        plt.ylim([0, 1])
        plt.title('V score per model')
        xpos = np.mod(vs.argmax(), len(n_components_range)) + .65 + \
            .16 * np.floor(vs.argmax() / len(n_components_range))
        plt.text(xpos, vs.min() * 0.97 + .03 * vs.max(), '*', fontsize=14)
        spl.set_xlabel('Algorithm')
        spl.legend([b[0] for b in bars], cv_types)
        plt.tight_layout()

    if sv2:
        vs = np.asarray(vs)
        bars = []
        colors = []
        n_components_range = range(len(clustering_algorithms))
        cv_types = [item[1] for item in datasets2]
        color_iter = cycle(['navy', 'turquoise', 'cornflowerblue',
                            'darkorange', 'k'])
        # Plot the V scores averaged over all materials
        if out:
            plt.figure(figsize=(8, 6))
            plt.rcParams['axes.facecolor'] = (1, 1, 1)
            spl = plt.subplot(1, 1, 1)
        for i, (cv_type, pcolor) in enumerate(zip(cv_types, color_iter)):
            xpos = np.array(n_components_range) + 0.166 * (i - 2)
            bars.append(vs[i * len(n_components_range):(i + 1) * len(n_components_range)])
        # import ipdb; ipdb.set_trace()
        [colors.append(colourblind(col)) for col in n_components_range]
        bars2 = np.mean(np.asarray(bars), axis=0)
        indices = np.argsort(bars2)
        # import ipdb; ipdb.set_trace()
        bars2.sort()
        clustering_algorithms = [clustering_algorithms[i] for i in indices]
        if out:
            plt.bar(n_components_range, bars2, color=colors)
            plt.xticks(n_components_range,
                       [item[0] for item in clustering_algorithms],
                       rotation=45, ha="right")
            plt.xticks()
            plt.ylim([0, 1])
            plt.title('V Averaged over all Materials')
            xpos = np.mod(vs.argmax(), len(n_components_range)) + .65 + \
                .16 * np.floor(vs.argmax() / len(n_components_range))
            plt.text(xpos, vs.min() * 0.97 + .03 * vs.max(), '*', fontsize=14)
            spl.set_xlabel('Algorithm')
            spl.set_ylabel('V-score')
            # spl.legend([b[0] for b in bars], cv_types)
            plt.tight_layout()

    if return_bars:
        return bars
    if return_vs:
        return vs
    if return_AIC:
        return aic
def main():
    print("# Start Spectral Clustering")
    print("# Load Fashion MNIST dataset")
    np.random.seed(0)

    # ============
    # Generate datasets. We choose the size big enough to see the scalability
    # of the algorithms, but not too big to avoid too long running times
    # ============
    n_samples = 1500
    noisy_circles = datasets.make_circles(n_samples=n_samples, factor=.5,
                                          noise=.05)
    noisy_moons = datasets.make_moons(n_samples=n_samples, noise=.05)
    blobs = datasets.make_blobs(n_samples=n_samples, random_state=8)
    no_structure = np.random.rand(n_samples, 2), None

    # Anisotropically distributed data
    random_state = 170
    X, y = datasets.make_blobs(n_samples=n_samples, random_state=random_state)
    transformation = [[0.6, -0.6], [-0.4, 0.8]]
    X_aniso = np.dot(X, transformation)
    aniso = (X_aniso, y)

    # blobs with varied variances
    varied = datasets.make_blobs(n_samples=n_samples,
                                 cluster_std=[1.0, 2.5, 0.5],
                                 random_state=random_state)

    # ============
    # Set up cluster parameters
    # ============
    plt.figure(figsize=(9 * 2 + 3, 12.5))
    plt.subplots_adjust(left=.02, right=.98, bottom=.001, top=.96,
                        wspace=.05, hspace=.01)
    plot_num = 1

    default_base = {
        'quantile': .3,
        'eps': .3,
        'damping': .9,
        'preference': -200,
        'n_neighbors': 10,
        'n_clusters': 3
    }

    _datasets = [
        (noisy_circles, {'damping': .77, 'preference': -240,
                         'quantile': .2, 'n_clusters': 2}),
        (noisy_moons, {'damping': .75, 'preference': -220, 'n_clusters': 2}),
        (varied, {'eps': .18, 'n_neighbors': 2}),
        (aniso, {'eps': .15, 'n_neighbors': 2}),
        (blobs, {}),
        (no_structure, {})
    ]

    for i_dataset, (dataset, algo_params) in enumerate(_datasets):
        # update parameters with dataset-specific values
        params = default_base.copy()
        params.update(algo_params)

        X, y = dataset
        # normalize dataset for easier parameter selection
        X = StandardScaler().fit_transform(X)

        # estimate bandwidth for mean shift
        bandwidth = cluster.estimate_bandwidth(X, quantile=params['quantile'])

        # connectivity matrix for structured Ward
        connectivity = kneighbors_graph(X, n_neighbors=params['n_neighbors'],
                                        include_self=False)
        # make connectivity symmetric
        connectivity = 0.5 * (connectivity + connectivity.T)

        # ============
        # Create cluster objects
        # ============
        ms = cluster.MeanShift(bandwidth=bandwidth, bin_seeding=True)
        two_means = cluster.MiniBatchKMeans(n_clusters=params['n_clusters'])
        ward = cluster.AgglomerativeClustering(n_clusters=params['n_clusters'],
                                               linkage='ward',
                                               connectivity=connectivity)
        spectral = cluster.SpectralClustering(n_clusters=params['n_clusters'],
                                              eigen_solver='arpack',
                                              affinity="nearest_neighbors")
        dbscan = cluster.DBSCAN(eps=params['eps'])
        affinity_propagation = cluster.AffinityPropagation(
            damping=params['damping'], preference=params['preference'])
        average_linkage = cluster.AgglomerativeClustering(
            linkage="average", affinity="cityblock",
            n_clusters=params['n_clusters'], connectivity=connectivity)
        birch = cluster.Birch(n_clusters=params['n_clusters'])
        gmm = mixture.GaussianMixture(n_components=params['n_clusters'],
                                      covariance_type='full')

        clustering_algorithms = (
            ('MiniBatchKMeans', two_means),
            ('AffinityPropagation', affinity_propagation),
            ('MeanShift', ms),
            ('SpectralClustering', spectral),
            ('Ward', ward),
            ('AgglomerativeClustering', average_linkage),
            ('DBSCAN', dbscan),
            ('Birch', birch),
            ('GaussianMixture', gmm)
        )

        for name, algorithm in clustering_algorithms:
            t0 = time.time()
            # catch warnings related to kneighbors_graph
            with warnings.catch_warnings():
                warnings.filterwarnings(
                    "ignore",
                    message="the number of connected components of the "
                    + "connectivity matrix is [0-9]{1,2}"
                    + " > 1. Completing it to avoid stopping the tree early.",
                    category=UserWarning)
                warnings.filterwarnings(
                    "ignore",
                    message="Graph is not fully connected, spectral embedding"
                    + " may not work as expected.",
                    category=UserWarning)
                algorithm.fit(X)
            t1 = time.time()
            if hasattr(algorithm, 'labels_'):
                y_pred = algorithm.labels_.astype(np.int)
            else:
                y_pred = algorithm.predict(X)

            plt.subplot(len(_datasets), len(clustering_algorithms), plot_num)
            if i_dataset == 0:
                plt.title(name, size=18)
            colors = np.array(
                list(islice(cycle(['#377eb8', '#ff7f00', '#4daf4a', '#f781bf',
                                   '#a65628', '#984ea3', '#999999', '#e41a1c',
                                   '#dede00']),
                            int(max(y_pred) + 1))))
            # add black color for outliers (if any)
            colors = np.append(colors, ["#000000"])
            plt.scatter(X[:, 0], X[:, 1], s=10, color=colors[y_pred])
            plt.xlim(-2.5, 2.5)
            plt.ylim(-2.5, 2.5)
            plt.xticks(())
            plt.yticks(())
            plt.text(.99, .01, ('%.2fs' % (t1 - t0)).lstrip('0'),
                     transform=plt.gca().transAxes, size=15,
                     horizontalalignment='right')
            plot_num += 1

    # plt.show()
    if not os.path.exists(save_dir):
        os.system("mkdir -p {}".format(save_dir))
    plt.savefig(os.path.join(save_dir, "clustering_methods_result.png"))
                    top=.96, wspace=.05, hspace=.05)
plot_num = 1
count = 0

datasets = [actors, producers, writers, directors]
for i_dataset, dataset in enumerate(datasets):
    X, y = dataset
    # normalize dataset for easier parameter selection
    X = StandardScaler().fit_transform(X)

    # estimate bandwidth for mean shift
    bandwidth = cluster.estimate_bandwidth(X, quantile=0.3)

    # connectivity matrix for structured Ward
    connectivity = kneighbors_graph(X, n_neighbors=10, include_self=False)
    # make connectivity symmetric
    connectivity = 0.5 * (connectivity + connectivity.T)

    # create clustering estimators
    ms = cluster.MeanShift(bandwidth=bandwidth, bin_seeding=True)
    two_means = cluster.MiniBatchKMeans(n_clusters=2)
    ward = cluster.AgglomerativeClustering(n_clusters=2, linkage='ward',
                                           connectivity=connectivity)
    spectral = cluster.SpectralClustering(n_clusters=2, eigen_solver='arpack',
                                          affinity="nearest_neighbors")
def do_clustering(csps_data):
    global db_time, feature_set, global_args, themes

    # Incoming columns: id, measure, response, year, organisation, group, score.
    # First get the data frame into the right shape.
    if feature_set == 'demographics':
        csps_data = pd.pivot_table(
            csps_data,
            values='score',
            index=['organisation', 'year', 'headcount', 'org', 'gender_offset',
                   'ethnic_percentage', 'disabled_percentage', 'headcount_delta'],
            columns=['measure'],
            aggfunc=np.sum)
    else:
        csps_data = pd.pivot_table(
            csps_data,
            values='score',
            index=['organisation', 'year', 'headcount', 'org', 'par'],
            columns=['measure'],
            aggfunc=np.sum)
        csps_data = csps_data.dropna(axis=1)

    # The EEI is required later (and therefore retrieved), but it must be
    # excluded from the questions and demographics, so split the EEI column
    # out before deleting it.
    eei = csps_data['EEI']

    column_names = csps_data.columns.values.tolist()
    inx = intersect(column_names, themes)

    # Store the theme columns that will be dropped before clustering so they
    # can be re-attached to the output data frame afterwards.
    df_parts = []
    for m in inx:
        df_parts.append(csps_data[m])
    csps_data = csps_data.drop('EEI', axis=1)

    # Don't drop theme columns from the 'themes' feature set, and the
    # demographics feature set doesn't contain themes anyway, so only drop
    # them for 'questions' and 'ew_questions'.
    if feature_set in ('questions', 'ew_questions'):
        csps_data = csps_data.drop(inx, axis=1)

    # The data should always be a 2D array, shape (n_samples, n_features).
    # Normalise the dataset for easier parameter selection.
    try:
        X = StandardScaler().fit_transform(csps_data)
    except Exception:
        print("ERROR: could not scale the pivoted data")
        print(csps_data)
        raise

    start_cluster_time = timer()

    # connectivity matrix for structured Ward; make it symmetric
    connectivity = kneighbors_graph(X, n_neighbors=10, include_self=False)
    connectivity = 0.5 * (connectivity + connectivity.T)

    if algorithm == 'KMeans':
        clustered = cluster.KMeans(n_clusters=cluster_options['n_clusters'])
    elif algorithm == 'MiniBatchKMeans':
        clustered = cluster.MiniBatchKMeans(
            n_clusters=cluster_options['n_clusters'])
    elif algorithm == 'AffinityPropagation':
        clustered = cluster.AffinityPropagation()
    elif algorithm == 'MeanShift':
        bandwidth = cluster.estimate_bandwidth(X, quantile=0.3)
        clustered = cluster.MeanShift(bandwidth=bandwidth, bin_seeding=True)
    elif algorithm == 'SpectralClustering':
        clustered = cluster.SpectralClustering(
            n_clusters=cluster_options['n_clusters'],
            eigen_solver='arpack',
            affinity="nearest_neighbors")
    elif algorithm == 'AgglomerativeClustering':
        clustered = cluster.AgglomerativeClustering(
            linkage='ward',
            n_clusters=cluster_options['n_clusters'],
            connectivity=connectivity)
    elif algorithm == 'AC_average_linkage':
        clustered = cluster.AgglomerativeClustering(
            linkage="average",
            affinity="cityblock",
            n_clusters=cluster_options['n_clusters'],
            connectivity=connectivity)
    elif algorithm == 'DBSCAN':
        clustered = cluster.DBSCAN(eps=.5, algorithm='auto', leaf_size=40)
    elif algorithm == 'Birch':
        clustered = cluster.Birch(n_clusters=cluster_options['n_clusters'])
    else:
        clustered = cluster.KMeans(n_clusters=cluster_options['n_clusters'])

    clustered.fit(X)

    # MeanShift and DBSCAN pick their own number of clusters and can return a
    # single cluster, for which the silhouette score is undefined.
    if algorithm in ('MeanShift', 'DBSCAN'):
        sil_score = -1
    else:
        sil_score = metrics.silhouette_score(X, clustered.labels_,
                                             metric='euclidean')

    end_cluster_time = timer()

    if feature_set != 'demographics':
        # index levels: ['organisation', 'year', 'headcount', 'org', 'par']
        org_year = list(zip(*csps_data.index.values))
        orgs = pd.Series(org_year[0])
        years = pd.Series(org_year[1])
        headcount = pd.Series(org_year[2])
        org_acronym = pd.Series(org_year[3])
        par_acronym = pd.Series(org_year[4])
        clusters = pd.Series(clustered.labels_.tolist())
    else:
        # index levels: ['organisation', 'org', 'year', ...]
        org_year = list(zip(*csps_data.index.values))
        orgs = pd.Series(org_year[0])
        years = pd.Series(org_year[2])
        org_acronym = pd.Series(org_year[1])
        clusters = pd.Series(clustered.labels_.tolist())

    # 1 - organisation
    df = pd.DataFrame(orgs)
    df.columns = ['organisation']
    # 2 - year
    df['year'] = years
    # 3 - headcount
    if feature_set != 'demographics':
        df['headcount'] = headcount
    else:
        df['headcount'] = np.array([0] * len(df))
    # 4 - cluster id
    df['cluster'] = clusters
    # 5 - acronym
    df['org'] = org_acronym
    # 6 - parent
    if feature_set != 'demographics':
        df['parent'] = par_acronym
    else:
        df['parent'] = np.array(['x'] * len(df))
    # 7 - EEI
    df['EEI'] = eei.tolist()
    # re-attach the theme columns split out above
    for i, m in enumerate(inx):
        df[m] = df_parts[i].tolist()

    category_labels = df.columns.values.tolist()

    # descriptive statistics for each cluster
    grouped = df.groupby('cluster')
    cluster_info = grouped.describe().fillna('missing')

    end_time = timer()
    cluster_time = end_cluster_time - start_cluster_time
    total_time = end_time - start_time

    summary = {
        'silhouette_score': sil_score,
        'db_time': db_time,
        'cluster_time': cluster_time,
        'total_time': total_time,
        'feature_set': feature_set,
        'cluster_info': cluster_info.values.tolist(),
        'category_labels': category_labels
    }
    if algorithm == 'AffinityPropagation':
        # built for inspection; the generic output below is what gets returned
        other_output = json.dumps(
            [summary, clustered.cluster_centers_indices_.tolist()])
    output = json.dumps([summary, df.values.tolist()])
    return output
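# A minimal sketch (not part of the original pipeline) of hardening the
# MeanShift branch above: estimate_bandwidth can return 0.0 on small or
# degenerate inputs, and MeanShift rejects a non-positive bandwidth, so a
# small fallback value is safer. `X_demo` is a stand-in array, not data from
# this script.
import numpy as np
from sklearn.cluster import MeanShift, estimate_bandwidth

X_demo = np.random.RandomState(0).rand(50, 4)
bandwidth = estimate_bandwidth(X_demo, quantile=0.3)
if bandwidth <= 0:
    bandwidth = 0.1  # fallback, mirroring the guards used in other snippets here
ms_demo = MeanShift(bandwidth=bandwidth, bin_seeding=True).fit(X_demo)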
def do():
    ai = AI()
    ai.load()
    # ai.learn()
    params = {
        'quantile': .3,
        'eps': .3,
        'damping': .9,
        'preference': -200,
        'n_neighbors': 10,
        'n_clusters': 3
    }
    # estimate bandwidth for mean shift
    bandwidth = cluster.estimate_bandwidth(ai.x, quantile=params['quantile'])
    # connectivity matrix for structured Ward; make it symmetric
    connectivity = kneighbors_graph(ai.x,
                                    n_neighbors=params['n_neighbors'],
                                    include_self=False)
    connectivity = 0.5 * (connectivity + connectivity.T)

    ms = cluster.MeanShift(bandwidth=bandwidth, bin_seeding=True)
    two_means = cluster.MiniBatchKMeans(n_clusters=params['n_clusters'])
    ward = cluster.AgglomerativeClustering(n_clusters=params['n_clusters'],
                                           linkage='ward',
                                           connectivity=connectivity)
    spectral = cluster.SpectralClustering(n_clusters=params['n_clusters'],
                                          eigen_solver='arpack',
                                          affinity="nearest_neighbors")
    dbscan = cluster.DBSCAN(eps=params['eps'])
    affinity_propagation = cluster.AffinityPropagation(
        damping=params['damping'], preference=params['preference'])
    average_linkage = cluster.AgglomerativeClustering(
        linkage="average",
        affinity="cityblock",
        n_clusters=params['n_clusters'],
        connectivity=connectivity)
    birch = cluster.Birch(n_clusters=params['n_clusters'])
    gmm = mixture.GaussianMixture(n_components=params['n_clusters'],
                                  covariance_type='full')

    clustering_algorithms = (('MiniBatchKMeans', two_means),
                             ('AffinityPropagation', affinity_propagation),
                             ('MeanShift', ms),
                             ('SpectralClustering', spectral),
                             ('Ward', ward),
                             ('AgglomerativeClustering', average_linkage),
                             ('DBSCAN', dbscan),
                             ('Birch', birch),
                             ('GaussianMixture', gmm))

    for name, algorithm in clustering_algorithms:
        with warnings.catch_warnings():
            warnings.filterwarnings(
                "ignore",
                message="the number of connected components of the " +
                "connectivity matrix is [0-9]{1,2}" +
                " > 1. Completing it to avoid stopping the tree early.",
                category=UserWarning)
            warnings.filterwarnings(
                "ignore",
                message="Graph is not fully connected, spectral embedding" +
                " may not work as expected.",
                category=UserWarning)
            try:
                algorithm.fit(ai.x)
            except Exception:
                continue

        if hasattr(algorithm, 'labels_'):
            y_pred = algorithm.labels_.astype(int)
        else:
            y_pred = algorithm.predict(ai.x)

        # skip runs that produced more clusters than there are known groups
        if max(y_pred) > 3:
            continue

        known_groups = {}
        for i, group in enumerate(ai.y):
            group = int(group)
            known_groups.setdefault(group, []).append(i)

        guessed_groups = {}
        for i, group in enumerate(y_pred):
            guessed_groups.setdefault(group, []).append(i)

        # print the overlap between each known group and each guessed cluster
        for k in known_groups:
            for g in guessed_groups:
                print(k, g,
                      len(set(known_groups[k]).intersection(guessed_groups[g])))
def clusters(self, G):
    '''Finds the clusters.'''
    # imports from the machine learning package scikit-learn
    from sklearn.cluster import MeanShift, estimate_bandwidth

    # build an array with the 2D coordinates of each node
    X = [[node[0], node[1]] for node in G.nodes()]
    # estimate the dimension of a single cluster
    bandwidth = estimate_bandwidth(X, quantile=0.1, random_state=0, n_jobs=1)
    # find the clusters
    ms = MeanShift(bandwidth=bandwidth)
    ms.fit(X)
    # labels maps each node to its cluster number
    labels = {node: ms.labels_[i] for i, node in enumerate(G.nodes())}

    # --- ADDUCTION ---
    adduction = nx.Graph()
    cluster_centers = [(node[0], node[1]) for node in ms.cluster_centers_]
    for node in cluster_centers:
        adduction.add_node(node)
    self.complete_graph(adduction)
    adduction = self.mesh_graph(adduction, weight='dist')
    print(len(adduction.edges()))
    nx.draw_networkx(adduction)
    self.write2shp(adduction, "adduction_network")
    self.acqueduct.add_edges_from(adduction.edges())

    # --- DISTRIBUTION ---
    # add the cluster label to each node of the graph
    nx.set_node_attributes(G, labels, 'label')
    # initialise one distribution graph per cluster
    distribution = [nx.Graph() for cluster in cluster_centers]
    for node in labels:
        cluster = labels[node]
        distribution[cluster].add_node(node)
    # connect the nodes of each cluster with a minimum spanning tree
    for dist_graph in distribution:
        self.complete_graph(dist_graph)
        dist_graph = nx.minimum_spanning_tree(dist_graph, weight='dist')
        self.acqueduct.add_edges_from(dist_graph.edges())
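# Hedged aside on the function above: ms.cluster_centers_ are synthetic
# coordinates, not existing graph nodes. If the "water tower" nodes must sit
# on actual network nodes, one option is to snap each center to the nearest
# node first. `snap_centers_to_nodes` is a hypothetical helper, not part of
# the original class.
import numpy as np

def snap_centers_to_nodes(X, centers):
    """Return, for each center, the closest point of X."""
    X = np.asarray(X)
    snapped = []
    for c in centers:
        d = np.linalg.norm(X - np.asarray(c), axis=1)
        snapped.append(tuple(X[np.argmin(d)]))
    return snapped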
# coding=utf-8
import numpy as np
import sklearn.cluster as sc
import matplotlib.pyplot as mp

x = np.loadtxt("../ml_data/multiple3.txt", delimiter=",")
bw = sc.estimate_bandwidth(x, n_samples=len(x), quantile=0.2)
model = sc.MeanShift(bandwidth=bw, bin_seeding=True)
model.fit(x)
pred_y = model.predict(x)
# get the cluster centers
centers = model.cluster_centers_
print(centers)

# build a grid covering the data and predict a label for every grid point
n = 500
l, r = x[:, 0].min() - 1, x[:, 0].max() + 1
t, d = x[:, 1].min() - 1, x[:, 1].max() + 1
grid_x, grid_y = np.meshgrid(np.linspace(l, r, n), np.linspace(t, d, n))
grid_xy = np.column_stack((grid_x.ravel(), grid_y.ravel()))
grid_z = model.predict(grid_xy)
grid_z = grid_z.reshape(grid_x.shape)
mp.figure("MeanShift", facecolor="lightgray")
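# The script above stops right after opening the figure; a plausible
# completion following the usual matplotlib pattern for these demos (the
# exact styling is an assumption, not recovered from the source):
mp.title("MeanShift", fontsize=16)
mp.xlabel("x", fontsize=14)
mp.ylabel("y", fontsize=14)
mp.tick_params(labelsize=10)
mp.pcolormesh(grid_x, grid_y, grid_z, cmap="gray")
mp.scatter(x[:, 0], x[:, 1], c=pred_y, cmap="brg", s=80)
mp.scatter(centers[:, 0], centers[:, 1], marker="+", c="gold", s=1000)
mp.show()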
    print(data_frame.columns.values.tolist())  # ['V1', 'V2', 'labels']
    data = data_frame.values
    X = data[:, :2]
    y = data[:, 2]
    # shuffle the samples
    shuffle_indexes = np.random.permutation(len(X))
    X, y = X[shuffle_indexes], y[shuffle_indexes]
    return X, y


if __name__ == '__main__':
    X, y = load_data()
    print(X.shape, y.shape)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=random_state)
    mean_shift = MeanShift(bandwidth=estimate_bandwidth(X))
    mean_shift.fit(X_train)
    y_predict = mean_shift.predict(X_test)
    # MeanShift's cluster ids don't match the ground-truth coding:
    # keep label 1, swap labels 0 and 2 (via the temporary value 3)
    y_predict[y_predict == 1] = 1
    y_predict[y_predict == 0] = 3
    y_predict[y_predict == 2] = 0
    y_predict[y_predict == 3] = 2
    score = accuracy_score(y_true=y_test, y_pred=y_predict)
    print(score)  # 0.9966666666666667
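# The hand-written swap above only works for this particular run. A more
# general way to align predicted cluster ids with ground-truth labels is the
# Hungarian algorithm; a sketch (not in the original script) assuming the
# clustering found the same number of groups as there are classes, coded
# 0..k-1:
import numpy as np
from scipy.optimize import linear_sum_assignment
from sklearn.metrics import confusion_matrix

def align_labels(y_true, y_pred):
    """Relabel y_pred so that it agrees with y_true as much as possible."""
    cm = confusion_matrix(y_true, y_pred)
    rows, cols = linear_sum_assignment(-cm)  # maximise total agreement
    mapping = {c: r for r, c in zip(rows, cols)}
    return np.array([mapping[p] for p in y_pred])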
def color_quantization(path, exe_median_cut=True, plot=True): # Read image by cv2 image = cv2.imread(path) image_file_name = os.path.basename(path).split('.')[0] # Change color space from BGR to RGB to HLS image = cv2.cvtColor(image, cv2.COLOR_BGR2HLS) image = np.array(image) # Change image objects into array # The following bandwidth can be automatically detected using image_reshape = image.reshape((image.shape[0] * image.shape[1], image.shape[2])) bandwidth = estimate_bandwidth(image_reshape, quantile=0.2, n_samples=500) ms = MeanShift(bandwidth=bandwidth, bin_seeding=True, n_jobs=10) ms.fit(image_reshape) labels = ms.predict(image_reshape) # Normalized color label_norm = deepcopy(labels) norm = colors.Normalize(vmin=-1., vmax=1.) norm.autoscale(label_norm) label_norm = norm(label_norm).tolist() cluster_centers = ms.cluster_centers_.astype(np.uint8) labels_unique = np.unique(labels) n_clusters_ = len(labels_unique) image_mean_shift = recreate_image(cluster_centers, labels, image.shape[0], image.shape[1]).astype(np.uint8) print("Reduce color through mean-shift: %d" % n_clusters_) h_o, s_o, v_o = cv2.split(image) # Normalize color space pixel_colors = image.reshape((np.shape(image)[0] * np.shape(image)[1], 3)) norm = colors.Normalize(vmin=-1., vmax=1.) norm.autoscale(pixel_colors) pixel_colors = norm(pixel_colors).tolist() if plot: fig = plt.figure(figsize=(25, 25)) ax = fig.add_subplot(2, 3, 1) ax.imshow(image) ax.axis('off') ax.set_title('Original image') ax = fig.add_subplot(2, 3, 2) ax.imshow(image_mean_shift) ax.axis('off') ax.set_title('Image after mean shift') ax = fig.add_subplot(2, 3, 4, projection="3d") ax.scatter(h_o.flatten(), s_o.flatten(), v_o.flatten(), facecolors=pixel_colors, marker=".") ax.set_xlabel("Hue") ax.set_ylabel("Saturation") ax.set_zlabel("Value") ax.set_title('Color space of original images') ax = fig.add_subplot(2, 3, 5, projection='3d') ax.scatter(h_o.flatten(), s_o.flatten(), v_o.flatten(), c=label_norm) ax.set_xlabel("Hue") ax.set_ylabel("Saturation") ax.set_zlabel("Value") ax.set_title('Color space after mean-shift') # Median cut # There is no need to do median cut if there are not too many colors if exe_median_cut: image_array = Image.fromarray(image_mean_shift.astype(np.uint8)) image_median_cut_label = np.array(image_array.quantize(colors=10, method=0, kmeans=0, palette=None)) # Calculate the clustered centered pixel_class = {index: [] for index in list(set(image_median_cut_label.flatten()))} image_median_cut = image_median_cut_label.flatten() image_mean_shift_shape = image_mean_shift.reshape(image_mean_shift.shape[0]*image_mean_shift.shape[1], image_mean_shift.shape[2]) for i, j in zip(image_median_cut, image_mean_shift_shape): pixel_class[i].append(j) clustered_center_median_cut = np.array(list({key: tuple(np.average(np.array(value), axis=0).astype(np.uint8)) for key, value in pixel_class.items()}.values())) image_median_cut = recreate_image(clustered_center_median_cut, image_median_cut_label.flatten(), image_mean_shift.shape[0], image_mean_shift.shape[1]).astype(np.uint8) image_median_cut_h, image_median_cut_l, image_median_cut_s = cv2.split(image_median_cut) print("Reduce color through median cut: %d" % len(list(set(image_median_cut_label.flatten())))) image_mean_shift = image_median_cut # K-Means image_kmeans = image_mean_shift.reshape((image_mean_shift.shape[0] * image_mean_shift.shape[1], image_mean_shift.shape[2])) kmeans = KMeans(n_clusters=3, random_state=0, n_jobs=10).fit(image_kmeans) labels = kmeans.predict(image_kmeans) print("Reduce color 
through K-means: %d" % len(list(set(labels.flatten()))))
    cluster_centers = kmeans.cluster_centers_.astype(np.uint8)
    image_mean_shift = image_mean_shift.astype(np.uint8)
    k_means_image = recreate_image(cluster_centers, labels,
                                   image_mean_shift.shape[0],
                                   image_mean_shift.shape[1]).astype(np.uint8)
    h_km, s_km, v_km = cv2.split(k_means_image)

    if plot:
        ax = fig.add_subplot(2, 3, 3)
        ax.imshow(k_means_image)
        ax.axis('off')
        ax.set_title('Image after K-means')
        ax = fig.add_subplot(2, 3, 6, projection='3d')
        ax.scatter(h_km.flatten(), s_km.flatten(), v_km.flatten(), c=labels)
        ax.set_xlabel("Hue")
        ax.set_ylabel("Saturation")
        ax.set_zlabel("Value")
        fig.tight_layout()
        plt.show()

    segmentation_image = seperate_layers(k_means_image)
    for nl in range(1, len(segmentation_image)):
        current_dir = os.getcwd()
        file_name = path.split('/')[-3]
        image_quantization_result_dir = (str(Path(current_dir).parent) +
                                         '/image_generator/' + file_name +
                                         '/color_quantization_result_batches/' +
                                         str(nl) + '_layer/')
        if not os.path.exists(image_quantization_result_dir):
            os.makedirs(image_quantization_result_dir)
            print("Directory ", image_quantization_result_dir, " Created ")
        else:
            print("Directory ", image_quantization_result_dir, " already exists")
        save_path = image_quantization_result_dir + image_file_name + '.p'
        # Save each layer as a pickle file to save memory
        with open(save_path, 'wb') as handle:
            pickle.dump(segmentation_image[list(segmentation_image.keys())[nl]],
                        handle, protocol=pickle.HIGHEST_PROTOCOL)
    return segmentation_image
# stopping criteria
criteria = (cv.TERM_CRITERIA_EPS + cv.TERM_CRITERIA_MAX_ITER, 100, 0.2)

fig, axs = plt.subplots(1, 5, sharey=True, figsize=(10, 8))
ctr, k = 0, 2
for ax in axs:
    _, labels, (centers) = cv.kmeans(pixel_values, k, None, criteria, 10,
                                     cv.KMEANS_RANDOM_CENTERS)
    centers = np.uint8(centers)
    labels = labels.flatten()
    segmented_img = centers[labels.flatten()]
    segmented_img = segmented_img.reshape(img.shape)
    ax.imshow(segmented_img, interpolation='none')
    ax.set_title("Clusters = %d" % k, fontsize=8)
    ctr += 1
    k += 2
plt.show()

#%% Mean Shift Clustering Based Segmentation
from sklearn.cluster import MeanShift, estimate_bandwidth

originImg = cv.imread('SunnyLake.bmp')
originShape = originImg.shape
flatImg = np.reshape(originImg, [-1, 3])
bandwidth = estimate_bandwidth(flatImg, quantile=0.1, n_samples=100)
ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
ms.fit(flatImg)
labels = ms.labels_
cluster_centers = ms.cluster_centers_
labels_unique = np.unique(labels)
n_clusters_ = len(labels_unique)
print("number of estimated clusters : %d" % n_clusters_)
segmentedImg = cluster_centers[np.reshape(labels, originShape[:2])]
cv.imshow("Segmented Image", segmentedImg.astype(np.uint8))
cv.waitKey(0)
cv.destroyAllWindows()
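# MeanShift over every pixel scales poorly with image size. A common
# workaround (an assumption here, not part of the original script) is to fit
# on a random subsample of pixels and then predict labels for the full image:
idx = np.random.choice(flatImg.shape[0], size=5000, replace=False)
ms_sub = MeanShift(bandwidth=bandwidth, bin_seeding=True).fit(flatImg[idx])
labels_all = ms_sub.predict(flatImg)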
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import MeanShift, estimate_bandwidth
from sklearn.datasets import make_blobs

centers = [[1, 1], [-1, -1], [1, -1]]
X, _ = make_blobs(n_samples=10000, centers=centers, cluster_std=0.6)

bandwidth = estimate_bandwidth(X, n_samples=500)
ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
ms.fit(X)
labels = ms.labels_
cluster_centers = ms.cluster_centers_
labels_unique = np.unique(labels)
n_clusters_ = len(labels_unique)
print("number of estimated clusters : %d" % n_clusters_)

# Plot result
for k in range(n_clusters_):
    my_members = labels == k
    cluster_center = cluster_centers[k]
    plt.scatter(X[my_members, 0], X[my_members, 1])
    plt.plot(cluster_center[0], cluster_center[1], 'o',
             markeredgecolor='b', markersize=14)
plt.title('Estimated number of clusters: %d' % n_clusters_)
plt.show()
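# Side note (not part of the original snippet): the number of clusters
# MeanShift reports is quite sensitive to the bandwidth, which is in turn
# controlled by estimate_bandwidth's quantile. A quick sweep on the same X:
for q in (0.1, 0.2, 0.3):
    bw = estimate_bandwidth(X, quantile=q, n_samples=500)
    n = len(np.unique(MeanShift(bandwidth=bw, bin_seeding=True).fit(X).labels_))
    print("quantile=%.1f -> bandwidth=%.3f, clusters=%d" % (q, bw, n))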
# iris_data = pd.read_csv('iris_data.csv')
iris_data = pd.read_excel('iris_data.xlsx')

# 2. One-hot encode the species column
iris_data = pd.get_dummies(iris_data, columns=['Species'])

# 3. Split out the three species and plot sepal length against sepal width
virginica = iris_data.loc[iris_data['Species_I. virginica'] == 1]
versicolor = iris_data.loc[iris_data['Species_I. versicolor'] == 1]
setosa = iris_data.loc[iris_data['Species_I. setosa'] == 1]
plt.scatter(x=virginica['Sepal length'], y=virginica['Sepal width'], color='r')
plt.scatter(x=versicolor['Sepal length'], y=versicolor['Sepal width'], color='g')
plt.scatter(x=setosa['Sepal length'], y=setosa['Sepal width'], color='b')
# plt.show()

# 4. Estimate a bandwidth for reference, then run MeanShift with bandwidth=1
print(estimate_bandwidth(iris_data, quantile=0.2))
analyzer = MeanShift(bandwidth=1)
# print(estimate_bandwidth(virginica, quantile=0.2))
# print(estimate_bandwidth(versicolor, quantile=0.2))
# print(estimate_bandwidth(setosa, quantile=0.2))
def extract_profiles(global_data, genotype, seq_idx, ref_labels, RG_info, ID_col, subset_col, Names, ref_lib={}, n_comps=4, repn=100, code={}, others='admx', Sn=500, same=True, clean=False): ''' Extract KDE profiles for specific accessions (global_idx) from reference groups in PCA space. Reduction and KDE calculated at seq_idx positions in genotype array. Reference accessions from ref_labels groups are permuted. samp_sample() function is used to sample accessions using RG_info, takes Sm ''' ## estimate the bandwith pca2 = PCA(n_components=n_comps, whiten=False, svd_solver='randomized') cluster_profiles = {x: [] for x in ref_labels} ## perform KDE. combine = {} tkeys = ref_labels var_comp_store = [] for rp in range(repn): print(rp) if same: Names_idx, kde_class_labels, kde_label_dict, Nsample = samp_same_v2c2( ref_lib, Names) else: Names_idx, kde_class_labels, kde_label_dict, Nsample = samp_sample( genotype, RG_info, ID_col, subset_col, Names, code=code, others=others, Sn=Sn) dat_foc = genotype[:, seq_idx] dat_foc = dat_foc[global_data] Sequences = genotype[:, seq_idx] Sequences = Sequences[Names_idx] if Sequences.shape[1] <= 3: Results[Chr][c] = [0, 0] print('hi') continue pca2.fit(Sequences) data = pca2.transform(Sequences) data_ref = pca2.transform(dat_foc) local_pcvar = list(pca2.explained_variance_ratio_) #local_pcvar= [local_pcvar] var_comp_store.append(local_pcvar) params = {'bandwidth': np.linspace(np.min(data), np.max(data), 15)} grid = GridSearchCV(KernelDensity(algorithm="ball_tree", breadth_first=False), params, cv=3, iid=False, verbose=0) ref_q = [] for bull in tkeys: Quanted_set = data[kde_label_dict[bull], :] grid.fit(Quanted_set) kde = grid.best_estimator_ P_dist = kde.score_samples(Quanted_set) Fist = kde.score_samples(data_ref) if clean: pdat = kde.score_samples(data) pdat = scipy.stats.norm(np.mean(P_dist), np.std(P_dist)).cdf(pdat) ref_q.append(pdat) ## Normalizing log-likelihood estimates by those of the reference set and extracting their cdf. Fist = scipy.stats.norm(np.mean(P_dist), np.std(P_dist)).cdf(Fist) if not clean: cluster_profiles[bull].append(Fist) if clean: ref_q = np.array(ref_q) sidx = ref_q.argsort(axis=0) sidx = ref_q[sidx, np.arange(sidx.shape[1])] diffs = sidx[-1] - sidx[-2] diffs = diffs for idx in range(len(tkeys)): qt_idx = np.array(kde_label_dict[bull]) qdiff = diffs[qt_idx] qmax = sidx[-1][qt_idx] bandwidth = estimate_bandwidth(qdiff.reshape(-1, 1), quantile=0.2) if not bandwidth: continue ms = MeanShift(bandwidth=bandwidth, cluster_all=True, min_bin_freq=20, bin_seeding=False) ms.fit(qdiff.reshape(-1, 1)) labels = ms.labels_ cluster_centers = ms.cluster_centers_ clust_keep = np.argmax(cluster_centers) clust_keep = [ qt_idx[x] for x in range(len(qt_idx)) if qmax[x] > .01 and labels[x] == clust_keep ] if len(clust_keep) >= 5: kde_label_dict[bull] = clust_keep print(len(qt_idx) - len(clust_keep)) for bull in tkeys: Quanted_set = data[kde_label_dict[bull], :] grid.fit(Quanted_set) kde = grid.best_estimator_ P_dist = kde.score_samples(Quanted_set) Fist = kde.score_samples(data_ref) ## Normalizing log-likelihood estimates by those of the reference set and extracting their cdf. 
Fist = scipy.stats.norm(np.mean(P_dist), np.std(P_dist)).cdf(Fist) cluster_profiles[bull].append(Fist) cluster_profiles = {x: np.array(g) for x, g in cluster_profiles.items()} cluster_profiles = { x: np.median(g, axis=0) for x, g in cluster_profiles.items() } var_comp_store = np.array(var_comp_store) var_comp_store = np.median(var_comp_store, axis=0) return cluster_profiles, var_comp_store
def Distance_analysis(SequenceStore,target,refs_lib,DIMr = 'PCA', Bandwidth_split= 30, ncomp_local= 4, clsize= 15): Clover= [] Coordinates= [] Clusters_coords= [] PC_var= recursively_default_dict() Dist_vectors= [] Distances= [] center_distances= [] Ref_stats= [] Ref_stats_lib= recursively_default_dict() for CHR in SequenceStore.keys(): print('going on CHR: '+ str(CHR)) for bl in SequenceStore[CHR].keys(): print('data set: {}'.format(bl)) ### PCA and MeanShift of information from each window copied from *FM36_Galaxy.py. Sequences = SequenceStore[CHR][bl] Sequences= np.nan_to_num(Sequences) print(Sequences.shape) #### Dimensionality reduction if DIMr == 'PCA': pca = PCA(n_components=ncomp_local, whiten=False,svd_solver='randomized').fit(Sequences) data = pca.transform(Sequences) PC_var[CHR][bl]= [x for x in pca.explained_variance_] if DIMr == 'NMF': from sklearn.decomposition import NMF data = NMF(n_components=ncomp_local, init='random', random_state=0).fit_transform(Sequences) Accurate = [] params = {'bandwidth': np.linspace(np.min(data), np.max(data),Bandwidth_split)} grid = GridSearchCV(KernelDensity(algorithm = "ball_tree",breadth_first = False), params,verbose=0) ###################################### ####### TEST global Likelihood ####### ###################################### Focus_labels = [z for z in it.chain(*refs_lib.values())] Who= refs_lib[target] Whose_parents= range(sum([len(x) for x in refs_lib.values()])) #Refs_local= [x for x in Whose_parents if x not in Who] Who_feat= data[Who,:] Ref_feat= data[Whose_parents,:] #### Normalize by distance between local centroids (to compensate for bias in sampling number). #### identify these clusters using MS. #### use reference accessions NOT in the cluster identified. Dpool= data[[x for x in Whose_parents if x not in Who],:] Pdistances= [] bandwidth = estimate_bandwidth(Dpool, quantile=0.15) if bandwidth <= 0: bandwidth= .1 params = {'bandwidth': np.linspace(np.min(Dpool), np.max(Dpool),30)} grid = GridSearchCV(KernelDensity(algorithm = "ball_tree",breadth_first = False), params,verbose=0) ## perform MeanShift clustering. ms = MeanShift(bandwidth=bandwidth, bin_seeding=False, cluster_all=False, min_bin_freq=25) ms.fit(Dpool) labels1 = ms.labels_ label_select = {y:[x for x in range(len(labels1)) if labels1[x] == y] for y in sorted(list(set(labels1))) if y != -1} centers= [np.mean(Dpool[label_select[z],:],axis= 0) for z in label_select.keys()] #### Data set of evenly sampled data. ## ## We'll generate 50 new observations from each cluster identified locally. ## N= 50 Proxy_data= [] label_select_labels= [z for z in it.chain(*[[x] * len(label_select[x]) for x in label_select.keys()])] Center_store= {} Proxy_indexes= {} distance_vecs= [] for lab in label_select.keys(): if len(label_select[lab]) < 3: continue Quanted_set= Dpool[label_select[lab],:] if np.max(pairwise_distances(Quanted_set,metric= 'euclidean')) <= 1e-3: Extract= Quanted_set[np.random.choice(Quanted_set.shape[0],N),:] else: grid.fit(Quanted_set) kde = grid.best_estimator_ Extract= kde.sample(N) center= np.mean(Extract,axis= 0) Center_store[lab]= center Proxy_indexes[lab]= [x for x in range((len(Center_store) - 1) * N, len(Center_store) * N)] Proxy_data.extend(Extract) Proxy_data= np.array(Proxy_data) ##### Get pairwise distances between centroids. 
for pair in it.combinations(label_select.keys(),2): coordinates= [np.mean(Dpool[label_select[z],:],axis= 0) for z in pair] coordinates= np.array(coordinates) iu_control= np.triu_indices(2,1) MS_pair_dist= pairwise_distances(coordinates,metric= 'euclidean') MS_pair_dist= MS_pair_dist[iu_control][0] Pdistances.append(MS_pair_dist) ## reference_centroid= np.mean(centers,axis= 0) proxy_distances= pairwise_distances(reference_centroid.reshape(1,-1), Proxy_data,metric= 'euclidean') distances_to_center= pairwise_distances(reference_centroid.reshape(1,-1), Ref_feat,metric= 'euclidean')[0] self_distances= pairwise_distances(reference_centroid.reshape(1,-1), Who_feat, metric= 'euclidean') centroid= np.mean(Who_feat,axis= 0) distances_pairwise= pairwise_distances(centroid.reshape(1,-1), Ref_feat,metric= 'euclidean')[0] Distances.append(distances_pairwise) distances_pairwise= [(x - np.mean(proxy_distances)) / np.std(proxy_distances) for x in distances_pairwise] Clover.append(distances_pairwise) print(np.array(Clover).shape) FC_stats= [np.mean(proxy_distances),np.std(proxy_distances), np.mean(self_distances), np.std(self_distances)] Coord= [[CHR,bl,x] for x in Who] Ref_stats.append(FC_stats) Ref_stats_lib[CHR][bl]= FC_stats center_distances.append(distances_to_center) Coordinates.extend(Coord) Clusters_coords.append([CHR,bl]) clear_output() return Distances, Clover, Ref_stats_lib, Ref_stats, center_distances, Coordinates, Clusters_coords
def conspicuity_int_glcm(im, mask=None, use_sigmoid=False, morph_proc=True,
                         type='hypo', a=3):
    im = im.copy()
    if mask is None:
        mask = np.ones_like(im)
    if im.max() <= 1:
        im = skiexp.rescale_intensity(im, (0, 1), (0, 255)).astype(int)

    # build the gray-level co-occurrence matrix and suppress weak entries
    glcm = tools.graycomatrix_3D(im, mask=mask)
    min_num = 2 * glcm.mean()
    glcm = np.where(glcm < min_num, 0, glcm)

    # keep only a band of width k around the GLCM diagonal
    diag = np.ones(glcm.shape)
    k = 20
    tu = np.triu(diag, -k)
    tl = np.tril(diag, k)
    diag = tu * tl
    glcm *= diag.astype(glcm.dtype)

    data = tools.data_from_glcm(glcm)

    # try several quantiles until MeanShift finds more than one mode
    quantiles = [0.2, 0.1, 0.4]
    n_clusters_ = 0
    for q in quantiles:
        bandwidth = estimate_bandwidth(data, quantile=q, n_samples=2000)
        if bandwidth == 0:
            continue
        ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
        ms.fit(data)
        labels = ms.labels_
        cluster_centers = ms.cluster_centers_
        n_clusters_ = len(np.unique(labels))
        if n_clusters_ > 1:
            break

    if n_clusters_ > 1:
        lab_im = (1 + ms.predict(
            np.array(np.vstack(
                (im.flatten(), im.flatten()))).T).reshape(im.shape)) * mask
    else:
        # if mean shift finds only one mode, fall back to fitting
        # distributions to the GLCM instead
        rvs = tools.analyze_glcm(glcm)
        rvs = sorted(rvs, key=lambda rv: rv.mean())
        lab_im = rvs[0].pdf(im)
        n_clusters_ = len(rvs)

    # keep the labels whose mean intensity lies below the masked mean
    mean_v = im[np.nonzero(mask)].mean()
    labs = np.unique(lab_im)[1:]
    res = np.zeros_like(lab_im)
    for l in labs:
        tmp = lab_im == l
        mv = im[np.nonzero(tmp)].mean()
        if mv < mean_v:
            res = np.where(tmp, 1, res)

    im_int = res
    a = 20  # note: overrides the `a` argument of this function
    c = mean_v / 255
    im_res = conspicuity_processing(im_int, mask, use_sigmoid=use_sigmoid,
                                    a=a, c=c, sigm_t=0.2,
                                    use_morph=morph_proc, radius=3)
    return im_res
from sklearn.datasets import make_blobs from sklearn.cluster import MeanShift, estimate_bandwidth import numpy as np print(__doc__) # ############################################################################# # Generate sample data centers = [[1, 1], [-1, -1], [1, -1]] X, _ = make_blobs(n_samples=10000, centers=centers, cluster_std=0.6) # ############################################################################# # Compute clustering with MeanShift # The following bandwidth can be automatically detected using bandwidth = estimate_bandwidth(X, quantile=0.2, n_samples=500) ms = MeanShift(bandwidth=bandwidth, bin_seeding=True) ms.fit(X) labels = ms.labels_ cluster_centers = ms.cluster_centers_ labels_unique = np.unique(labels) n_clusters_ = len(labels_unique) print("number of estimated clusters : %d" % n_clusters_) # ############################################################################# # Plot result plt.figure(1)
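# The demo above is cut off right after plt.figure(1); the remainder of the
# stock scikit-learn example continues along these lines (reconstructed from
# memory, so treat the styling details as an approximation):
from itertools import cycle

plt.clf()
colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
for k, col in zip(range(n_clusters_), colors):
    my_members = labels == k
    cluster_center = cluster_centers[k]
    plt.plot(X[my_members, 0], X[my_members, 1], col + '.')
    plt.plot(cluster_center[0], cluster_center[1], 'o', markerfacecolor=col,
             markeredgecolor='k', markersize=14)
plt.title('Estimated number of clusters: %d' % n_clusters_)
plt.show()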
def window_analysis(Windows, ref_labels, labels1, Chr=1, ncomp=4, amova=True, supervised=True, include_who=[], range_sample=[130, 600], rand_sample=0, clsize=15, cl_freqs=5, Bandwidth_split=20): kde_class_labels = labels1 kde_label_dict = { z: [x for x in range(len(kde_class_labels)) if kde_class_labels[x] == z] for z in list(set(kde_class_labels)) } if include_who: include = [ x for x in range(len(kde_class_labels)) if kde_class_labels[x] in include_who ] ref_labels = include_who kde_class_labels = [kde_class_labels[x] for x in include] kde_label_dict = { z: [ x for x in range(len(kde_class_labels)) if kde_class_labels[x] == z ] for z in include_who } if rand_sample: sample = rand_sample sample_range = [0, sample] Freq_extract = { Chr: { bl: Windows[Chr][bl] for bl in np.random.choice( list(Windows[Chr].keys()), sample, replace=True) } } if range_sample: sample_range = range_sample Freq_extract = { Chr: { bl: Windows[Chr][bl] for bl in list(sorted(Windows[Chr].keys())) [sample_range[0]:sample_range[1]] } } Results = {'header': ['Chr', 'window'], 'info': []} Frequencies = {'header': ['Chr', 'window', 'cl'], 'coords': [], 'info': []} Construct = {'header': ['Chr', 'window', 'cl'], 'coords': [], 'info': []} PC_var = {'header': ['Chr', 'window'], 'coords': [], 'info': []} pc_density = [] pc_coords = [] sim_fst = [] for c in Freq_extract[Chr].keys(): Sequences = Windows[Chr][c] if Sequences.shape[1] <= 3: Results[Chr][c] = [0, 0] print('hi') continue Sequences = np.nan_to_num(Sequences) pca = PCA(n_components=ncomp, whiten=False, svd_solver='randomized').fit(Sequences) data = pca.transform(Sequences) if include_who: data = data[include, :] ##### PC density PC = 0 pc_places = data[:, PC] X_plot = np.linspace(-8, 8, 100) kde = KernelDensity(kernel='gaussian', bandwidth=0.01).fit( np.array(pc_places).reshape(-1, 1)) log_dens = kde.score_samples(X_plot.reshape(-1, 1)) pc_density.append(np.exp(log_dens)) pc_coords.append(pc_places) PC_var['coords'].append([Chr, c]) PC_var['info'].append([x for x in pca.explained_variance_]) ### params = { 'bandwidth': np.linspace(np.min(data), np.max(data), Bandwidth_split) } grid = GridSearchCV(KernelDensity(algorithm="ball_tree", breadth_first=False), params, verbose=0) ###################################### ####### TEST global Likelihood ####### ###################################### Focus_labels = list(range(data.shape[0])) #### Mean Shift approach ## from sklearn.cluster import MeanShift, estimate_bandwidth bandwidth = estimate_bandwidth(data, quantile=0.2, n_samples=len(Focus_labels)) if bandwidth <= 1e-3: bandwidth = 0.1 ms = MeanShift(bandwidth=bandwidth, cluster_all=False, min_bin_freq=clsize) ms.fit(data[Focus_labels, :]) labels = ms.labels_ Tree = { x: [Focus_labels[y] for y in range(len(labels)) if labels[y] == x] for x in [g for g in list(set(labels)) if g != -1] } Keep = [x for x in Tree.keys() if len(Tree[x]) > clsize] Tree = {x: Tree[x] for x in Keep} Ngps = len(Tree) SpaceX = {x: data[Tree[x], :] for x in Tree.keys()} these_freqs = [] ### Extract MScluster likelihood by sample for hill in SpaceX.keys(): if len(Tree[hill]) >= cl_freqs: if supervised == False: print('hi') cl_seqs = Sequences[Tree[hill], :] freq_vector = [ float(x) / (cl_seqs.shape[0] * 2) for x in np.sum(cl_seqs, axis=0) ] Frequencies['coords'].append([Chr, c, hill]) Frequencies['info'].append(freq_vector) these_freqs.append(freq_vector) grid.fit(data[Tree[hill], :]) # use the best estimator to compute the kernel density estimate kde = grid.best_estimator_ P_dist = 
kde.score_samples(data[Tree[hill], :]) Dist = kde.score_samples(data) P_dist = np.nan_to_num(P_dist) Dist = np.nan_to_num(Dist) if np.std(P_dist) == 0: Dist = np.array( [int(Dist[x] in P_dist) for x in range(len(Dist))]) else: Dist = scipy.stats.norm(np.mean(P_dist), np.std(P_dist)).cdf(Dist) Dist = np.nan_to_num(Dist) Construct['coords'].append([Chr, c, hill]) Construct['info'].append(Dist) ######################################### ############# AMOVA ################ ######################################### if supervised: labels = [x for x in kde_class_labels if x in ref_labels] Who = [ z for z in it.chain(*[kde_label_dict[x] for x in ref_labels]) ] Ngps = len(ref_labels) print(ref_labels) for hill in ref_labels: if len(kde_label_dict[hill]) >= cl_freqs: if include_who: Seq_specific = Sequences[include, :] cl_seqs = Seq_specific[kde_label_dict[hill], :] freq_vector = [ float(x) / (cl_seqs.shape[0] * 2) for x in np.sum(cl_seqs, axis=0) ] Frequencies['coords'].append([Chr, c, hill]) Frequencies['info'].append(freq_vector) these_freqs.append(freq_vector) else: Who = [ x for x in range(len(labels)) if labels[x] != -1 and labels[x] in Keep ] labels = [labels[x] for x in Who] Who = [Focus_labels[x] for x in Who] # Pairwise = return_fsts2(np.array(these_freqs)) sim_fst.extend(Pairwise.fst) if len(list(set(labels))) == 1: Results['coords'].append([Chr, c]) Results['info'].append([AMOVA, Ngps]) continue if amova: clear_output() AMOVA, Cig = AMOVA_FM42(data[Who, :], labels, n_boot=0, metric='euclidean') print('counting: {}, Ngps: {}'.format(AMOVA, Ngps)) Results['info'].append([Chr, c, AMOVA, Ngps]) Results['info'] = pd.DataFrame( np.array(Results['info']), columns=['chrom', 'window', 'AMOVA', 'Ngps']) X_plot = np.linspace(0, .3, 100) freq_kde = KernelDensity(kernel='gaussian', bandwidth=0.01).fit( np.array(sim_fst).reshape(-1, 1)) log_dens = freq_kde.score_samples(X_plot.reshape(-1, 1)) fig_roost_dens = [ go.Scatter(x=X_plot, y=np.exp(log_dens), mode='lines', fill='tozeroy', name='', line=dict(color='blue', width=2)) ] ## layout = go.Layout(title='allele frequency distribution across clusters', yaxis=dict(title='density'), xaxis=dict(title='fst')) fig = go.Figure(data=fig_roost_dens, layout=layout) return Frequencies, sim_fst, Results, Construct, pc_density, pc_coords, fig
iris_data = pd.get_dummies(iris_data, columns=['Species'])
print(iris_data.head())

virginica = iris_data.loc[iris_data['Species_I. virginica'] == 1]
versicolor = iris_data.loc[iris_data['Species_I. versicolor'] == 1]
setosa = iris_data.loc[iris_data['Species_I. setosa'] == 1]
plt.scatter(x=virginica['Sepal length'], y=virginica['Sepal width'], color='r')
plt.scatter(x=versicolor['Sepal length'], y=versicolor['Sepal width'], color='g')
plt.scatter(x=setosa['Sepal length'], y=setosa['Sepal width'], color='b')
# plt.show()

print("Self band: ", estimate_bandwidth(iris_data, quantile=0.2))
analyzer = MeanShift(bandwidth=1)
print("Self MeanShift: ", analyzer.fit(iris_data))

# run the custom mean_shift implementation once and reuse its result
labels, cluster_centers, n_clusters = mean_shift(iris_data)
print("Function mean_shift: ", (labels, cluster_centers, n_clusters))

fig = plt.figure()
ax = fig.add_subplot(111)
colors = cycle('bgrcmy')
for k, col in zip(range(n_clusters), colors):
    my_members = (labels == k)  # boolean mask over the samples in cluster k
    cluster_center = cluster_centers[k]
    x = iris_data.loc[my_members, 'Sepal length']
    y = iris_data.loc[my_members, 'Sepal width']
'eps': .15, 'n_neighbors': 2 }), (blobs, {}), (no_structure, {})] for i_dataset, (dataset, algo_params) in enumerate(datasets): # update parameters with dataset-specific values params = default_base.copy() params.update(algo_params) X, y = dataset # normalize dataset for easier parameter selection X = StandardScaler().fit_transform(X) # estimate bandwidth for mean shift bandwidth = cluster.estimate_bandwidth(X, quantile=params['quantile']) # connectivity matrix for structured Ward connectivity = kneighbors_graph(X, n_neighbors=params['n_neighbors'], include_self=False) # make connectivity symmetric connectivity = 0.5 * (connectivity + connectivity.T) # ============ # Create cluster objects # ============ ms = cluster.MeanShift(bandwidth=bandwidth, bin_seeding=True) two_means = cluster.MiniBatchKMeans(n_clusters=params['n_clusters']) ward = cluster.AgglomerativeClustering(n_clusters=params['n_clusters'], linkage='ward',
def various_algorithm_launch(samples, pits, nb_clusters, target):
    np.random.seed(0)
    colors = np.array([x for x in 'bgrcmykbgrcmykbgrcmykbgrcmyk'])
    colors = np.hstack([colors] * 20)
    clustering_names = [
        'Kmeans', 'MiniBatchKMeans', 'AffinityPropagation', 'MeanShift',
        'SpectralClustering', 'Ward', 'AgglomerativeClustering', 'DBSCAN',
        'Birch']
    plt.figure(figsize=(len(clustering_names) * 2 + 3, 9.5))
    plt.subplots_adjust(left=.02, right=.98, bottom=.001, top=.96,
                        wspace=.05, hspace=.01)
    plot_num = 1
    full_result = pd.DataFrame()
    datasets = [samples]
    for i_dataset, dataset in enumerate(datasets):
        if i_dataset == len(datasets) - 1:
            print(i_dataset)
            X = samples
            y = pits
        else:
            X, y = dataset
        # normalize dataset for easier parameter selection
        X = StandardScaler().fit_transform(X)
        # estimate bandwidth for mean shift
        bandwidth = cluster.estimate_bandwidth(X, quantile=0.3)
        # connectivity matrix for structured Ward; make it symmetric
        connectivity = kneighbors_graph(X, n_neighbors=5, include_self=False)
        connectivity = 0.5 * (connectivity + connectivity.T)
        # create clustering estimators; all are fitted below on the scaled X
        ms = cluster.MeanShift(bandwidth=bandwidth, bin_seeding=True)
        two_means = cluster.MiniBatchKMeans(n_clusters=nb_clusters)
        k_means = cluster.KMeans(init='k-means++', n_clusters=nb_clusters,
                                 n_init=20, algorithm='full')
        ward = cluster.AgglomerativeClustering(n_clusters=nb_clusters,
                                               linkage='ward',
                                               connectivity=connectivity)
        spectral = cluster.SpectralClustering(n_clusters=nb_clusters,
                                              eigen_solver='arpack',
                                              affinity="nearest_neighbors")
        dbscan = cluster.DBSCAN(eps=0.3, min_samples=10)
        affinity_propagation = cluster.AffinityPropagation(
            damping=0.5, max_iter=200, convergence_iter=15, copy=True,
            preference=None, affinity='euclidean', verbose=False)
        average_linkage = cluster.AgglomerativeClustering(
            linkage="average", affinity="cityblock",
            n_clusters=nb_clusters, connectivity=connectivity)
        birch = cluster.Birch(n_clusters=nb_clusters)
        clustering_algorithms = [
            k_means, two_means, affinity_propagation, ms, spectral, ward,
            average_linkage, dbscan, birch]

        for name, algorithm in zip(clustering_names, clustering_algorithms):
            # predict cluster memberships
            t0 = time.time()
            algorithm.fit(X)
            t1 = time.time()
            if hasattr(algorithm, 'labels_'):
                y_pred = algorithm.labels_.astype(int)
            else:
                y_pred = algorithm.predict(X)

            classif_df = pd.DataFrame(y_pred,
                                      index=np.arange(1, len(y_pred) + 1))
            classif_df.columns = [name]
            pit_df = pd.DataFrame(pits, index=np.arange(1, len(pits) + 1))
            pit_df.columns = [target]
            result_per_pit = pd.concat([classif_df, pit_df], axis=1,
                                       verify_integrity=False)
            full_result = pd.concat([full_result, result_per_pit[name]],
                                    axis=1)

            # plot
            plt.subplot(1, len(clustering_algorithms), plot_num)
            if i_dataset == 0:
                plt.title(name, size=18)
            plt.scatter(X[:, 0], X[:, 1], color=colors[y_pred].tolist(), s=10)
            if hasattr(algorithm, 'cluster_centers_'):
                centers = algorithm.cluster_centers_
                center_colors = colors[:len(centers)]
                plt.scatter(centers[:, 0], centers[:, 1], s=100,
                            c=center_colors)
            plt.xlim(-2, 2)
            plt.ylim(-2, 2)
            plt.xticks(())
            plt.yticks(())
            plt.text(.99, .01, ('%.2fs' % (t1 - t0)).lstrip('0'),
                     transform=plt.gca().transAxes, size=15,
                     horizontalalignment='right')
            plot_num += 1
    # plt.show()
    return full_result
r = requests.get('https://github.com/probml/probml-data/blob/main/data/bread.jpg?raw=true', stream=True) image = Image.open(io.BytesIO(r.content)) # Image is (687 x 1025, RGB channels) image = np.array(image) original_shape = image.shape # Flatten image. X = np.reshape(image, [-1, 3]) plt.figure() plt.imshow(image) plt.axis('off') pml.savefig('meanshift_segmentation_input.pdf') bandwidth = estimate_bandwidth(X, quantile=0.1, n_samples=100) print("bandwidth {}".format(bandwidth)) ms = MeanShift(bandwidth=bandwidth, bin_seeding=True) ms.fit(X) labels = ms.labels_ labels_unique = np.unique(labels) n_clusters_ = len(labels_unique) print("number of estimated clusters : %d" % n_clusters_) segmented_image = np.reshape(labels, original_shape[:2]) # Just take (height, width), ignore color dim. plt.figure() plt.imshow(segmented_image)
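# The snippet stops after drawing the segmentation; by analogy with the input
# figure above, it presumably ends something like this (the output filename
# is an assumption):
plt.axis('off')
pml.savefig('meanshift_segmentation_output.pdf')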
import matplotlib.pyplot as plt input_file = 'wholesale.csv' file_reader = csv.reader(open(input_file, 'rt'), delimiter=',') X = [] for count, row in enumerate(file_reader): if not count: names = row[2:] continue X.append([float(x) for x in row[2:]]) X = np.array(X) # Estimating the bandwidth bandwidth = estimate_bandwidth(X, quantile=0.8, n_samples=len(X)) # Compute clustering with MeanShift meanshift_estimator = MeanShift(bandwidth=bandwidth, bin_seeding=True) meanshift_estimator.fit(X) labels = meanshift_estimator.labels_ centroids = meanshift_estimator.cluster_centers_ num_clusters = len(np.unique(labels)) print("Number of clusters in input data =", num_clusters) print("Centroids of clusters:") print('\t'.join([name[:3] for name in names])) for centroid in centroids: print('\t'.join([str(int(x)) for x in centroid]))
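# Optional visualization sketch (not in the original script): project the
# wholesale data onto its first two features and colour each point by its
# MeanShift label, with centroids marked.
plt.figure()
plt.scatter(X[:, 0], X[:, 1], c=labels, s=30)
plt.scatter(centroids[:, 0], centroids[:, 1], marker='x', s=200, c='k')
plt.xlabel(names[0])
plt.ylabel(names[1])
plt.title('MeanShift clusters (%d found)' % num_clusters)
plt.show()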
def tracker(path): #initialization for default value if path=='0': path=0; cap = cv2.VideoCapture(path) ip_method = ip.get_instace(ip.IPMethod.TOMASI); #FLANN Properties MIN_FRAMES_COUNT = 120 SKIP_FRAMES = 60 MIN_MERGE_FRAMES = 5; FLANN_INDEX_KDTREE = 0 index_params = dict(algorithm = FLANN_INDEX_KDTREE, trees = 10) search_params = dict(checks = 50) flann = cv2.FlannBasedMatcher(index_params, search_params) DO_RESIZE=False new_sz = (180,120) #Initialization of inputs frames =[]; #Frames kp = []; #Key points all_matches = []; #All good matches match_count = []; #match_count labels = []; frame_cnt=0; print "Extracting frames...................." ret, prev_frame = cap.read() kp1,desc1 = ip_method.detectAndCompute(prev_frame); num_matches = np.zeros(kp1.__len__()) #storing frames frames.append(prev_frame); kp.append(kp1) match_count.append(num_matches); while(cap.isOpened()): SKIP_FRAMES=SKIP_FRAMES-1; ret, prev_frame = cap.read() if not ret or SKIP_FRAMES<0: break; while(cap.isOpened()): ret, cur_frame = cap.read() if not ret: break; kp2,desc2 = ip_method.detectAndCompute(cur_frame); matches = flann.knnMatch(desc1,desc2,k= 2) # Ratio test as per Lowe's paper good_matches = []; distances = [] for (m,n) in matches: if m.distance < 0.7*n.distance and m.distance > 4: good_matches.append(m); distances.append(m.distance); # Bashart's Displacement filtering mean = np.mean(distances); std = np.std(distances) good_matches[:] = [match for match in good_matches if abs(match.distance - mean) < 5 * std] kp1 = kp2; desc1 = desc2; num_matches = np.zeros(kp1.__len__()) for match in good_matches: num_matches[match.trainIdx]=match_count[-1][match.queryIdx]+1 all_matches.append(good_matches); #storing frames frames.append(cur_frame); kp.append(kp1) match_count.append(num_matches); if frame_cnt > MIN_FRAMES_COUNT: break; frame_cnt = frame_cnt +1; cap.release() print "Labeling the keypoints................." max_label=0; MIN_POINTS_TO_CLUSTER = 20 MAX_CLUSTERS = 100 #Forward Labeling Pass for rng in xrange(0,MIN_MERGE_FRAMES+1): labels.append([-1]*kp[rng].__len__()); for rng in xrange(MIN_MERGE_FRAMES+1,frame_cnt): motion_feats = []; feat_indices = []; labels.append([-1]*kp[rng].__len__()); for match in all_matches[rng-1]: if match_count[rng-1][match.queryIdx]>=MIN_MERGE_FRAMES: if labels[rng-1][match.queryIdx]==-1: src_pt = np.int32(kp[rng-1][match.queryIdx].pt) dst_pt = np.int32(kp[rng][match.trainIdx].pt) motion_feats.append(motion.get_features(src_pt,dst_pt)); feat_indices.append(match.trainIdx) else : labels[rng][match.trainIdx]=labels[rng-1][match.queryIdx] if(motion_feats.__len__()>=MIN_POINTS_TO_CLUSTER): #Clustering mean-shift motion_feats = np.asarray(motion_feats) bandwidth = estimate_bandwidth(motion_feats, quantile=0.1,random_state=200) ms = MeanShift(bandwidth=bandwidth, bin_seeding=True) ms.fit(motion_feats); for idx,lbl in zip(feat_indices,ms.labels_): labels[rng][idx]=lbl+max_label; max_label = max(labels[rng])+1; random_colors = np.random.randint(256, size=(MAX_CLUSTERS, 3)) print "Writing the video................." 
fourcc = cv2.cv.CV_FOURCC(*'XVID') w = prev_frame.shape[0]; h = prev_frame.shape[1] if DO_RESIZE: vidout = cv2.VideoWriter('out.avi',fourcc,20,new_sz) else: vidout = cv2.VideoWriter('out.avi',fourcc,20,(h,w)) for frame_idx in xrange(MIN_MERGE_FRAMES*2,frame_cnt): cur_frame = frames[frame_idx]; for rng in xrange(frame_idx-MIN_MERGE_FRAMES,frame_idx): for match in all_matches[rng-1]: if match_count[rng-1][match.queryIdx]>=MIN_MERGE_FRAMES \ and not (labels[rng-1][match.queryIdx]==-1 or labels[rng-1][match.queryIdx]>=MAX_CLUSTERS): #print "i m not here" src_pt = np.int32(kp[rng-1][match.queryIdx].pt) dst_pt = np.int32(kp[rng][match.trainIdx].pt) color = tuple(random_colors[labels[rng-1][match.queryIdx]]) cv2.line(cur_frame,tuple(src_pt),tuple(dst_pt),color,2); if DO_RESIZE: cur_frame=cv2.resize(cur_frame,new_sz); vidout.write(cur_frame); vidout.release() cv2.destroyAllWindows()
np.mean([Merged_Pvalues[Chr][z][aim][ind] for z in bl]) for ind in norm_labels[gp] ] SummedP = np.mean(SummedP) stat.append(SummedP) ## average estimates across groups stat = np.mean(stat) Genes.append(stat) Genes_to_state[gen][aim] = stat X_plot = np.linspace(0, max(Genes), 200)[:, np.newaxis] bandwidth = estimate_bandwidth(np.array(Genes).reshape(-1, 1), quantile=0.15, n_samples=len(Genes)) fig = plt.figure(figsize=figsize) ax = fig.add_subplot(111) kde = KernelDensity(kernel='gaussian', bandwidth=bandwidth).fit( np.array(Genes).reshape(-1, 1)) log_dens = kde.score_samples(X_plot) ax.plot(X_plot[:, 0], np.exp(log_dens), '-', label='label= {0}, band= {1}'.format(aim, bandwidth)) ax.legend(loc='upper right') filename = Home + 'GeneValues_labs' + ''.join(
# Fit and predict the data birch.fit(scaledData) predictions = birch.predict(scaledData) # Scatterplot between two features to check the clustering plt.scatter(scaledData[:, 2], scaledData[:, 6], c=predictions) plt.xlabel("Height") plt.ylabel("Shell weight") plt.title("Clustering using Birch clustering algorithm") plt.show() ##################################### Mean Shift Clustering ################################# # Determine optimal bandwidth value bandwidth = estimate_bandwidth(scaledData, quantile=0.2, n_samples=500) # Instantiate the clustering model mnShift = MeanShift(bandwidth=bandwidth) # Fit and predict the data mnShift.fit(scaledData) predictions = mnShift.predict(scaledData) # Scatterplot between two features to check the clustering plt.scatter(scaledData[:, 2], scaledData[:, 6], c=predictions) plt.xlabel("Height") plt.ylabel("Shell weight") plt.title("Clustering using Mean shift clustering algorithm") plt.show()
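# A quick, hedged way to compare the two clusterings above with a number
# instead of eyeballing scatterplots: silhouette scores on the same scaled
# data (assumes `birch` and `mnShift` are still in scope).
import numpy as np
from sklearn.metrics import silhouette_score

for name, est in (("Birch", birch), ("MeanShift", mnShift)):
    preds = est.predict(scaledData)
    if len(np.unique(preds)) > 1:  # silhouette needs at least 2 clusters
        print(name, silhouette_score(scaledData, preds))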