def participants_clustering(group, block=None, *args, **kwargs):
    '''
    Cluster the participants of a group, frame by frame, with
    sklearn.cluster.mean_shift.

    Extra positional and keyword arguments are forwarded untouched to the
    mean_shift function.

    Returns
    =======
    (clusters_centroids, clusters_partition):
        clusters_centroids has one entry per frame: the cluster centers
        returned by mean_shift for that frame.
        clusters_partition has one entry per frame: a mapping whose keys
        are cluster indices and whose values are sets of participant
        numbers.
    '''
    frames = data.participants_data(group, block)
    numbers = data.get_participants(group)

    clusters_centroids = []
    clusters_partition = []
    for _, row in frames.iterrows():
        # Regroup the flat row into chunks of two values
        # (presumably one 2d position per participant -- confirm
        # against utils.list_to_chunks).
        positions = np.array(utils.list_to_chunks(row.values, 2))
        centers, labels = cluster.mean_shift(positions, *args, **kwargs)

        # Group participant numbers by the cluster index they were given.
        members = collections.defaultdict(set)
        for number, label in zip(numbers, labels):
            members[label].add(number)

        clusters_centroids.append(centers)
        clusters_partition.append(members)

    return clusters_centroids, clusters_partition
def mean_shift_labels(pointcloud, bandwidth=None, max_iter=300, n_jobs=1):
    '''
    Find an array of point-labels of clusters found by the mean shift
    algorithm.

    This is a thin wrapper around ``mean_shift`` that always calls it with
    ``seeds=None``, ``bin_seeding=False``, ``min_bin_freq=1`` and
    ``cluster_all=True``, and discards the cluster centers it returns.

    Parameters
    ----------
    pointcloud : array-like
        Input data. It is converted with ``np.asarray`` before being
        handed to ``mean_shift`` (presumably of shape
        [n_samples, n_features] -- confirm against callers).

    bandwidth : float, optional
        Kernel bandwidth. ``None`` is forwarded as is, and a value of
        ``0`` is treated as "not given" and replaced by ``None`` before
        the call.

    max_iter : int, default 300
        Forwarded to ``mean_shift`` as ``max_iter``.

    n_jobs : int, default 1
        Forwarded to ``mean_shift`` as ``n_jobs``.

    Returns
    -------
    labels : array
        The second element of the ``mean_shift`` result, i.e. the cluster
        label assigned to each input point.
    '''
    # Set bandwidth to None if it is 0
    if bandwidth == 0:
        bandwidth = None

    _, labels = mean_shift(np.asarray(pointcloud),
                           bandwidth=bandwidth,
                           seeds=None,
                           bin_seeding=False,
                           min_bin_freq=1,
                           cluster_all=True,
                           max_iter=max_iter,
                           n_jobs=n_jobs)
    return labels
def meanshift(bw, pixels):
    """Replace every pixel by the (integer) center of its mean shift cluster.

    Parameters
    ----------
    bw
        Bandwidth forwarded to ``mean_shift``.
    pixels
        Pixel data; it is passed through ``revert`` and converted to a
        NumPy array before clustering.

    Returns
    -------
    list
        One entry per clustered point: the center of the cluster that
        point was assigned to, cast to integers.
    """
    pixels = np.array(revert(pixels))
    centers, labels = mean_shift(pixels, bandwidth=bw, n_jobs=-1)
    # ``np.int`` was only an alias of the builtin ``int`` and has been
    # removed from NumPy (1.24+); the builtin yields the same dtype.
    return [centers[label].astype(int) for label in labels]
def test_max_iter(max_iter):
    """Check that mean_shift and MeanShift agree for a given max_iter.

    The estimator must not report more iterations than allowed, and both
    entry points must produce the same number of (numerically close)
    cluster centers.
    """
    function_centers, _ = mean_shift(X, max_iter=max_iter)

    estimator = MeanShift(max_iter=max_iter).fit(X)
    estimator_centers = estimator.cluster_centers_

    assert estimator.n_iter_ <= estimator.max_iter
    assert len(function_centers) == len(estimator_centers)

    for from_function, from_estimator in zip(function_centers,
                                             estimator_centers):
        assert np.allclose(from_function, from_estimator)
def test_mean_shift(self):
    """ModelFrame.cluster.mean_shift must match sklearn's mean_shift on iris.

    The accessor is expected to return a pair: the cluster centers and a
    ModelSeries of labels indexed like the frame.
    """
    iris = datasets.load_iris()
    df = pdml.ModelFrame(iris)

    result = df.cluster.mean_shift()
    expected = cluster.mean_shift(iris.data)

    self.assertEqual(len(result), 2)
    self.assert_numpy_array_almost_equal(result[0], expected[0])

    # assertIsInstance reports the actual type on failure, unlike
    # assertTrue(isinstance(...)) which only says "False is not true".
    self.assertIsInstance(result[1], pdml.ModelSeries)
    self.assert_index_equal(result[1].index, df.index)
    self.assert_numpy_array_equal(result[1].values, expected[1])
def test_mean_shift(self):
    """ModelFrame.cluster.mean_shift must match sklearn's mean_shift on iris."""
    iris = datasets.load_iris()
    frame = pdml.ModelFrame(iris)

    actual = frame.cluster.mean_shift()
    reference = cluster.mean_shift(iris.data)

    # The accessor returns a pair: centers first, labels second.
    self.assertEqual(len(actual), 2)
    actual_centers = actual[0]
    actual_labels = actual[1]

    self.assert_numpy_array_almost_equal(actual_centers, reference[0])

    # Labels come back as a ModelSeries aligned on the frame's index.
    self.assertIsInstance(actual_labels, pdml.ModelSeries)
    tm.assert_index_equal(actual_labels.index, frame.index)
    tm.assert_numpy_array_equal(actual_labels.values, reference[1])
def test_mean_shift():
    # Test MeanShift algorithm: both the estimator and the function must
    # find the expected number of clusters for a fixed bandwidth.
    bandwidth = 1.2

    estimator = MeanShift(bandwidth=bandwidth)
    estimator_labels = estimator.fit(X).labels_
    found_by_estimator = len(np.unique(estimator_labels))
    assert_equal(found_by_estimator, n_clusters)

    _, function_labels = mean_shift(X, bandwidth=bandwidth)
    found_by_function = len(np.unique(function_labels))
    assert_equal(found_by_function, n_clusters)
def test_mean_shift(bandwidth, cluster_all, expected, first_cluster_label):
    # Test MeanShift algorithm: the estimator and the function must both
    # yield `expected` distinct labels, the smallest of which is
    # `first_cluster_label`.
    estimator = MeanShift(bandwidth=bandwidth, cluster_all=cluster_all)
    estimator_unique = np.unique(estimator.fit(X).labels_)

    assert len(estimator_unique) == expected
    assert estimator_unique[0] == first_cluster_label

    _, function_labels = mean_shift(X, cluster_all=cluster_all)
    function_unique = np.unique(function_labels)

    assert len(function_unique) == expected
    assert function_unique[0] == first_cluster_label
def test_mean_shift():
    """
    Test MeanShift algorithm

    Checks the estimated bandwidth range, then that the estimator and the
    mean_shift function both find the expected number of clusters.
    """
    bandwidth = 1.2

    estimated = estimate_bandwidth(X, n_samples=300)
    assert_true(0.9 <= estimated <= 1.5)

    fitted = MeanShift(bandwidth=bandwidth)
    estimator_labels = fitted.fit(X).labels_
    # The centers are read from the fitted estimator but their value is
    # not used further in this test.
    cluster_centers = fitted.cluster_centers_
    assert_equal(len(np.unique(estimator_labels)), n_clusters)

    cluster_centers, function_labels = mean_shift(X, bandwidth=bandwidth)
    assert_equal(len(np.unique(function_labels)), n_clusters)
def test_mean_shift(global_dtype, bandwidth, cluster_all, expected, first_cluster_label):
    # Test MeanShift algorithm on data cast to `global_dtype`: label
    # counts, the first label and the dtype of the centers must match for
    # both the estimator and the function.
    data_in_dtype = X.astype(global_dtype, copy=False)

    estimator = MeanShift(bandwidth=bandwidth, cluster_all=cluster_all)
    estimator_unique = np.unique(estimator.fit(data_in_dtype).labels_)

    assert len(estimator_unique) == expected
    assert estimator_unique[0] == first_cluster_label
    assert estimator.cluster_centers_.dtype == global_dtype

    function_centers, function_labels = mean_shift(
        data_in_dtype, cluster_all=cluster_all
    )
    function_unique = np.unique(function_labels)

    assert len(function_unique) == expected
    assert function_unique[0] == first_cluster_label
    assert function_centers.dtype == global_dtype
cut = img[y:y + h, x:x + w] # extract feature vector density = misc.calc_density(cut, blob) moments = misc.calc_hu_moments(cut, blob) circularity = misc.calc_circularity(cut, blob) features.append([density, circularity, moments[0]]) # convert list to numpy array features = np.asarray(features) # scale the features to comparable ranges features = MinMaxScaler().fit_transform(features) # now use the feature vector for clustering: # predict = AggloClust(n_clusters=3).fit_predict(features.reshape(-1, 1)) # predict = KMeans(n_clusters=3).fit_predict(features.reshape(-1, 1)) predict = mean_shift(features, estimate_bandwidth(features))[1] # now draw the classified objects: # iterate through blobs for i, blob in enumerate(blobs): [x, y, w, h] = cv2.boundingRect(blob) cv2.rectangle(img, (x, y), (x + w, y + h), colors[predict[i]], 2) txt = '{} d={:.2f},m={:.2f},c={:.2f}'.format(i, *features[i]) cv2.putText(img, txt, (x + w - 75, y + h + 15), *text_opts) def pca(): names = ['circle', 'rectangle', 'triangle'] f_r = PCA(n_components=2).fit_transform(features) y = np.array(targets) for i, color in enumerate(['red', 'green', 'blue']):
clust = MeanShift(bandwidth = 10) res = clust.fit_predict(data[['Start']].values) data['Cluster'] = res cluster_data = concat([cluster_data, data], axis = 0, ignore_index = True) # <codecell> res = crosstab(rows = [cluster_data['Patient ID'], cluster_data['Visit Number']], cols = [cluster_data['TFName'], cluster_data['Cluster']]) # <codecell> from sklearn.cluster import k_means, mean_shift centroids, labels = mean_shift(res.values) labels = Series(labels, index = res.index) labels.sort() plt.figure(figsize = (20,20)) plt.imshow(res.ix[labels.index].values) # <codecell> labels # <codecell>
def action_execute_button_clicked(self):
    """Cluster the input raster with mean shift and write a colored GeoTIFF.

    Reads the image named by the input path widget, clusters its pixels
    (one feature per band), writes a 3-band byte GeoTIFF in which every
    pixel carries the color of its cluster, and finally asks the user
    whether the result should be added as a layer.
    """
    # Open the image and read its size, band count and georeferencing.
    source = gdal.Open(self.input_file_path.text())
    img_rows = source.RasterYSize
    img_cols = source.RasterXSize
    img_bands = source.RasterCount
    img_geotrans = source.GetGeoTransform()
    img_proj = source.GetProjection()

    # Convert the image into the layout mean_shift accepts: every band is
    # flattened, then the stack is transposed so that each row holds the
    # band values of one pixel.
    flat_bands = [
        source.GetRasterBand(band_no)
              .ReadAsArray(0, 0, img_cols, img_rows)
              .reshape(-1)
        for band_no in range(1, img_bands + 1)
    ]
    input_features = np.array(flat_bands).T

    # Run the mean shift algorithm; the bandwidth is estimated from a
    # sample whose size is 5% of the pixel count.
    bandwidth = estimate_bandwidth(
        input_features,
        quantile=float(self.bandwidth_estimate.currentText()),
        n_samples=int(img_rows * img_cols * 0.05))
    cluster_centers, clustered_points = mean_shift(
        input_features,
        bandwidth=bandwidth,
        bin_seeding=True,
        max_iter=int(self.iter_num.currentText()))

    # Replace each sample by the color of its cluster, appending random
    # colors to the palette until the cluster label has an entry.
    pixel_colors = []
    for label in clustered_points:
        while label >= len(self.color_list):
            self.color_list.append(list(np.random.randint(256, size=3)))
        pixel_colors.append(self.color_list[label])

    # One row per color component, each reshaped back to the image grid.
    color_planes = np.array(pixel_colors).T
    output_feature = np.array(
        [plane.reshape((img_rows, img_cols)) for plane in color_planes])

    # Write the clustered image.
    driver = gdal.GetDriverByName("GTiff")
    output_img = driver.Create(self.output_file_path.text(), img_cols,
                               img_rows, 3, gdal.GDT_Byte)
    output_img.SetGeoTransform(img_geotrans)
    output_img.SetProjection(img_proj)
    for plane_index in range(3):
        band = output_img.GetRasterBand(plane_index + 1)
        band.WriteArray(output_feature[plane_index])
    # Drop the dataset reference (presumably so GDAL flushes and closes
    # the file -- confirm).
    del output_img

    # Legend entries: one name/color pair per cluster center.
    layer_legends = [
        {
            'name': 'Cluster' + str(idx + 1),
            'color': QColor(self.color_list[idx][0],
                            self.color_list[idx][1],
                            self.color_list[idx][2])
        }
        for idx in range(len(cluster_centers))
    ]

    # Ask whether the result should be added as a layer (default: Yes).
    answer = QMessageBox.question(self, "消息框", "聚类完成,是否将结果添加到图层?",
                                  QMessageBox.Yes | QMessageBox.No,
                                  QMessageBox.Yes)
    if answer == QMessageBox.Yes:
        self.add_layer_signal.emit(self.output_file_path.text(),
                                   layer_legends)
def cluster(points, labels=None):
    """Cluster ``points`` with mean shift and return the raw result.

    ``labels`` is accepted but not used by this function. The value
    returned is whatever ``mean_shift(points)`` returns.
    """
    clustering = mean_shift(points)
    return clustering
def _clusterized_column(df, column, **kwargs):
    """Run mean shift on a single column and return the second result item.

    The column is wrapped in a one-column DataFrame so that its values
    form a 2d array; ``kwargs`` are forwarded to ``cluster.mean_shift``.
    """
    column_values = pd.DataFrame(df[column]).values
    outcome = cluster.mean_shift(column_values, **kwargs)
    # Index 1 of the result (the labels in sklearn's mean_shift).
    return outcome[1]
def mean_shift_cluster(data):
    """Min-max scale ``data``, cluster it with mean shift, return a list.

    The returned list is built from the last element of the
    ``mean_shift`` result.
    """
    # mean_shift
    scaled = np.array(MinMaxScaler().fit_transform(data))
    outcome = mean_shift(scaled)
    last_item = outcome[-1]
    return list(last_item)