def main():
    """Cartoonize ``IMG_2805.jpg``, display the result and save it.

    The image is downsampled by two, over-segmented with SLIC, and the mean
    colour of every superpixel is clustered with Mean Shift. Every superpixel
    is then painted with the colour of its cluster centre. The result is shown
    in a window and written to ``CartoonedImage.jpg``.

    Raises:
        FileNotFoundError: if the input image cannot be read.
    """
    image_path = "IMG_2805.jpg"
    img = cv2.imread(image_path)
    if img is None:
        # cv2.imread reports failure by returning None rather than raising.
        raise FileNotFoundError("Could not read image: " + image_path)
    img = img[::2, ::2]

    segments = slic(img, n_segments=500, sigma=5)
    # Depending on the scikit-image version, SLIC labels may start at 1 and
    # need not be contiguous; remap them to dense ids 0..n-1 so they can be
    # used directly as row indices below.
    segment_ids, dense_segments = np.unique(segments, return_inverse=True)
    dense_segments = dense_segments.reshape(segments.shape)
    number_of_segments = len(segment_ids)
    print("SLIC is done", number_of_segments)

    # Mean colour per superpixel, truncated to an integer as before
    # (values are non-negative, so floor equals int()).
    flat_segments = dense_segments.ravel()
    segment_sizes = np.bincount(flat_segments, minlength=number_of_segments)
    points = np.zeros((number_of_segments, 3))
    for col in range(3):
        channel_sums = np.bincount(flat_segments,
                                   weights=img[:, :, col].ravel(),
                                   minlength=number_of_segments)
        points[:, col] = np.floor(channel_sums / segment_sizes)
    print("Points are made")

    ms = MeanShift(bandwidth=1)
    ms.fit(points)
    print("Mean Shift done")

    # Look up, for every pixel, the centre of the cluster its superpixel
    # was assigned to.
    output = ms.cluster_centers_[ms.labels_[dense_segments]]
    print(output)
    cartoon = output.astype(np.uint8)
    cv2.imshow("output", cartoon)
    cv2.imwrite("CartoonedImage.jpg", cartoon)
    cv2.waitKey()
    cv2.destroyAllWindows()
def get_ft_field(zeta_res, model, zeta_scope, mode, ft_fields, meth):
    """Cluster the word vectors of one side of a zeta ranking.

    Arguments:
    zeta_res -- DataFrame indexed by word; column ``zeta_scope`` holds scores
    model -- object exposing ``get_word_vector(word)``
    zeta_scope -- name of the score column to use
    mode -- 0 selects words with a positive score, anything else negative
    ft_fields -- sink exposing ``put`` and ``close`` (presumably a
                 multiprocessing queue -- TODO confirm); receives the result
    meth -- clustering method: "MS", "AP" or "Birch"

    The cluster centres are put on ``ft_fields`` as a DataFrame with an extra
    "Category" column set to ``mode``; ``ft_fields`` is then closed.

    Raises:
        ValueError: if ``meth`` is not one of the supported methods.
    """
    scores = zeta_res[zeta_scope]
    selected = scores > 0 if mode == 0 else scores < 0
    words = zeta_res.index[selected]
    # A plain ndarray instead of np.matrix: recent scikit-learn releases
    # refuse np.matrix input.
    word_matrix = np.asarray([model.get_word_vector(str(x)) for x in words])

    if meth == "MS":
        clu = MeanShift(n_jobs=-1)
    elif meth == "AP":
        clu = AffinityPropagation(preference=scores[selected])
    elif meth == "Birch":
        clu = Birch(n_clusters=None)
    else:
        raise ValueError("Unknown clustering method: %r" % (meth,))
    clu.fit(word_matrix)

    # Birch(n_clusters=None) exposes subcluster_centers_ instead of
    # cluster_centers_.
    try:
        centers = clu.cluster_centers_
    except AttributeError:
        centers = clu.subcluster_centers_
    cluster_frame1 = pd.DataFrame(centers)
    cluster_frame1["Category"] = mode
    ft_fields.put(cluster_frame1)
    ft_fields.close()
def evaluate_learners(X):
    '''
    Cluster X with several learners to compare their relative behaviour.

    This is a generator: for each configuration, in order, it builds the
    learner, runs fit_predict on X and yields a tuple
    (title, predicted classes).
    '''
    from sklearn.cluster import (MeanShift, MiniBatchKMeans,
                                 SpectralClustering, AgglomerativeClustering)

    # (title, estimator class, constructor keyword arguments), in run order.
    # bandwidth=None lets Mean Shift use its own heuristic to decide how
    # many clusters to create.
    configurations = (
        ('Mean Shift clusters', MeanShift, {'bandwidth': None}),
        ('K Means clusters', MiniBatchKMeans, {'n_clusters': 2}),
        ('Spectral clusters', SpectralClustering, {'n_clusters': 2}),
        ('Agglomerative clusters (N=2)', AgglomerativeClustering,
         {'n_clusters': 2}),
        ('Agglomerative clusters (N=5)', AgglomerativeClustering,
         {'n_clusters': 5}),
    )
    for title, estimator_cls, params in configurations:
        # Each learner is only constructed when the consumer asks for it.
        yield title, estimator_cls(**params).fit_predict(X)
def CombinedMeanShift(self, h, alpha, PrincComp=None, njobs=-2, mbf=1):
    """Performs the scikit-learn Mean Shift clustering.

    The stored data is stacked with the (weighted) principal components and
    the combined vectors are clustered. Labels, cluster centres and cluster
    sizes are stored on the instance; nothing is returned.

    Arguments:
    h -- the bandwidth
    alpha -- the weight of the principal components as compared to the
             spatial data.
    PrincComp -- used to pass already-computed principal components;
                 when None they are computed with self.ShapePCA(2)
    njobs -- the number of processes to be used (default: n. of CPU - 1)
    mbf -- the minimum number of items in a seed"""
    MS = MeanShift(bin_seeding=True, bandwidth=h, cluster_all=True,
                   min_bin_freq=mbf, n_jobs=njobs)
    if PrincComp is None:
        PrincComp = self.ShapePCA(2)
    print("Starting sklearn Mean Shift... ")
    stdout.flush()
    fourvector = np.vstack((self.__data, alpha * PrincComp))
    # MeanShift expects one sample per row, hence the transpose.
    MS.fit(fourvector.T)
    self.__ClusterID = MS.labels_
    self.__c = MS.cluster_centers_.T
    # Number of members per cluster, in ascending label order.
    # scipy.stats.itemfreq was deprecated and later removed from SciPy;
    # np.unique with return_counts=True yields the same counts.
    self.__clsizes = np.unique(self.__ClusterID, return_counts=True)[1]
    print("done.")
    stdout.flush()
def CombinedMeanShift(self, h, alpha, PrincComp=None, njobs=-2, mbf=1):
    """Run scikit-learn Mean Shift on the data joined with its principal
    components.

    Labels and cluster centres are stored on the instance; nothing is
    returned.

    Arguments:
    h -- the bandwidth
    alpha -- weight applied to the principal components relative to the
             spatial data
    PrincComp -- pre-computed principal components; computed with
                 self.ShapePCA(2) when None
    njobs -- number of processes to use (default: n. of CPU - 1)
    mbf -- the minimum number of items in a seed"""
    clusterer = MeanShift(
        bandwidth=h,
        bin_seeding=True,
        min_bin_freq=mbf,
        cluster_all=True,
        n_jobs=njobs,
    )
    components = self.ShapePCA(2) if PrincComp is None else PrincComp

    print("Starting sklearn Mean Shift... ")
    stdout.flush()

    # Stack the weighted components under the data; samples end up as
    # columns, so transpose before handing them to the estimator.
    weighted_components = alpha * components
    samples = np.vstack((self.__data, weighted_components)).T
    clusterer.fit_predict(samples)

    self.__ClusterID = clusterer.labels_
    self.__c = clusterer.cluster_centers_.T

    print("done.")
    stdout.flush()
def meanshift(data):
    """Tune and fit a Mean Shift model on ``data`` with hyperopt.

    The bandwidth is searched uniformly in +/-50% around the value returned
    by ``estimate_bandwidth`` (or in [0.1, 1.5] when that estimate is not
    positive) and ``min_bin_freq`` is searched in 1..29. The objective is
    ``hyper_meanshift``. Fewer evaluations are used for data sets with 1000
    rows or more.

    Returns:
        A tuple ``(best, labels, silhouette, model)``: the best parameters
        as reported by ``fmin``, the cluster labels of ``data``, their
        silhouette score and the fitted ``MeanShift`` model.
    """
    bandwidth = estimate_bandwidth(data)
    if bandwidth <= 0:
        # No usable estimate: fall back to a fixed search range.
        low, high = 0.1, 1.5
    else:
        low, high = bandwidth - bandwidth / 2, bandwidth + bandwidth / 2
    space = {
        'bandwidth': hp.uniform('bandwidth', low, high),
        'min_bin_freq': hp.choice('min_bin_freq', range(1, 30)),
    }

    algo = partial(tpe.suggest, n_startup_jobs=10)
    max_evals = 100 if data.shape[0] < 1000 else 30
    best = fmin(hyper_meanshift, space, algo=algo, max_evals=max_evals)

    # For hp.choice, fmin reports the index of the chosen option; the
    # options start at 1, hence the +1.
    model = MeanShift(bandwidth=best['bandwidth'],
                      min_bin_freq=int(best['min_bin_freq'] + 1))
    # Fit once and reuse the labels instead of refitting for every element
    # of the returned tuple.
    labels = model.fit_predict(data)
    return best, labels, sil_score(data, labels), model
def _w2v_cluster_centers(zeta_res, model, zeta_scope, meth, selected):
    """Cluster the embeddings of the selected words; return the centres.

    ``selected`` is a boolean mask over the rows of ``zeta_res``. Words the
    model has no vector for are skipped.
    """
    scores = zeta_res[zeta_scope][selected]
    vecs = []
    kept = []  # positions (within ``scores``) of words that have a vector
    for position, word in enumerate(scores.index):
        try:
            vecs.append(model[word])
        except KeyError:
            # Out-of-vocabulary word: leave it out of the clustering.
            continue
        kept.append(position)
    # A plain ndarray instead of np.matrix: recent scikit-learn releases
    # refuse np.matrix input.
    word_matrix = np.asarray(vecs)

    if meth == "MS":
        clu = MeanShift(bandwidth=1, n_jobs=-1)
    elif meth == "AP":
        # One preference per clustered sample: restrict the scores to the
        # words that actually made it into the matrix.
        clu = AffinityPropagation(preference=scores.iloc[kept].values)
    elif meth == "Birch":
        clu = Birch(n_clusters=None)
    else:
        raise ValueError("Unknown clustering method: %r" % (meth,))
    clu.fit(word_matrix)

    # Birch(n_clusters=None) exposes subcluster_centers_ instead of
    # cluster_centers_.
    try:
        centers = clu.cluster_centers_
    except AttributeError:
        centers = clu.subcluster_centers_
    return pd.DataFrame(centers)


def get_w2v_fields(zeta_res, model, zeta_scope, meth):
    """Cluster the word vectors of both sides of a zeta ranking.

    Arguments:
    zeta_res -- DataFrame indexed by word; column ``zeta_scope`` holds scores
    model -- mapping from word to vector (``model[word]``)
    zeta_scope -- name of the score column to use
    meth -- clustering method: "MS", "AP" or "Birch"

    Prints the ratio of positively to negatively scored words.

    Returns:
        A DataFrame of cluster centres with a "category" column (0 for the
        positive side, 1 for the negative side), after ``reset_index()``.

    Raises:
        ValueError: if ``meth`` is not one of the supported methods.
    """
    positive = zeta_res[zeta_scope] > 0
    negative = zeta_res[zeta_scope] < 0
    ratio = len(zeta_res.index[positive]) / len(zeta_res.index[negative])
    print(ratio)

    cluster_frame1 = _w2v_cluster_centers(zeta_res, model, zeta_scope, meth,
                                          positive)
    cluster_frame1["category"] = 0
    cluster_frame2 = _w2v_cluster_centers(zeta_res, model, zeta_scope, meth,
                                          negative)
    cluster_frame2["category"] = 1

    return pd.concat([cluster_frame1, cluster_frame2]).reset_index()
def _run_mean_shift(self, data):
    """Fit Mean Shift on ``data`` and return the fitted estimator.

    The bandwidth is estimated from ``data`` (quantile 0.2, 200 samples).
    Points are not forced into a cluster (``cluster_all=False``).
    """
    estimated_bw = estimate_bandwidth(data, quantile=0.2, n_samples=200)
    clusterer = MeanShift(
        cluster_all=False,
        bin_seeding=True,
        bandwidth=estimated_bw,
    )
    clusterer.fit_predict(data)
    return clusterer
def runMeanShift(argsdict, data, inlbl, fPath, fName, fileN, i, sampleType):
    """Cluster ``data`` with Mean Shift and pass the labels to runRawAnalysis.

    The bandwidth is estimated from ``data`` with quantile 0.2. The elapsed
    time covers both the bandwidth estimate and the fit. ``sampleType`` is
    not used here.

    Returns:
        Whatever ``runRawAnalysis`` returns.
    """
    started = time.time()
    estimated_bw = estimate_bandwidth(data, quantile=0.2)
    clusterer = MeanShift(bandwidth=estimated_bw)
    clusterer.fit_predict(data)
    elapsed = time.time() - started

    results_name = fileN + '.Results'
    signature_path = fPath + fName + str(i) + '_SIG.csv'
    return runRawAnalysis(argsdict, inlbl, clusterer.labels_, results_name,
                          signature_path, elapsed)
def MShift(X):
    """Fit Mean Shift on ``X`` and derive the labels and cluster count.

    The bandwidth is left to the estimator (``bandwidth=None``) and points
    are not forced into a cluster.

    NOTE(review): as visible here the computed values are not returned,
    so the function returns None -- confirm whether the definition is
    truncated.
    """
    # An explicit bandwidth could be supplied instead, e.g. from
    # estimate_bandwidth(X, quantile=0.2, ...).
    clusterer = MeanShift(bin_seeding=True, cluster_all=False, bandwidth=None)
    clusterer.fit_predict(X)

    labels = clusterer.labels_
    cluster_centers = clusterer.cluster_centers_
    n_clusters_ = len(np.unique(labels))
def _mean_shift(corpus, labels):
    """Cluster TF-IDF vectors of ``corpus`` with Mean Shift and print the
    normalized mutual information between the clustering and ``labels``."""
    tfidf = TfidfVectorizer().fit_transform(corpus)
    clusterer = MeanShift(bin_seeding=True, bandwidth=0.65)
    # MeanShift is given a dense array, so densify the sparse TF-IDF matrix.
    predicted = clusterer.fit_predict(tfidf.toarray())
    score = normalized_mutual_info_score(predicted, labels)
    print('MeanShift:', score)
def cluster(csv,
            output_path=r"C:\Users\Ronald Scheffler\.spyder-py3\meanshiftresult.csv"):
    """Cluster the rows of a CSV file with Mean Shift and report the result.

    Arguments:
    csv -- path of the input CSV; must contain a 'botname' column, every
           other column is used as a feature
    output_path -- where the clustered table is written (defaults to the
                   previously hard-coded location)

    The features are standardised, clustered, and the labels are stored in
    a new 'Cluster' column. The table, sorted by cluster, is written to
    ``output_path``; the silhouette score, the table, a test split of the
    features and the labels are printed. Nothing is returned.
    """
    data = pd.read_csv(csv)

    # Features: everything except the bot name. ``axis`` must be passed by
    # keyword; the positional form was removed in pandas 2.0.
    X_scaled = scale(np.array(data.drop(['botname'], axis=1)))

    # Mean Shift determines the number of clusters itself; fit once.
    clustering = MeanShift()
    result = clustering.fit_predict(X_scaled)

    data['Cluster'] = result
    data = data.sort_values(['Cluster'])
    data.to_csv(output_path)

    # Evaluation: silhouette score of the clustering.
    print(silhouette_score(X_scaled, result))
    print(data)

    # Class prediction for a training set.
    from sklearn.model_selection import train_test_split
    # NOTE(review): these features still contain the 'Cluster' column that
    # is also used as the target below -- confirm this is intended.
    X = np.array(data.drop(['botname'], axis=1))
    y = data['Cluster']
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    print(X_test)
    print(y)
def mean_shift_clustering(principal_components, principal_df):
    """Cluster the principal components with Mean Shift and plot them.

    A copy of ``principal_df`` gets a 'Segment' column with the cluster
    labels and its columns 0/1/2/'y' renamed to PC1/PC2/PC3/Race. The first
    two components are scattered per cluster, then ``add_race_labels`` and
    ``calc_silhouette`` are called.

    Returns:
        The augmented copy of ``principal_df``.
    """
    result_df = pd.concat([principal_df], axis=1)

    clusterer = MeanShift()
    assignments = clusterer.fit_predict(principal_components)
    cluster_ids = unique(assignments)
    result_df['Segment'] = clusterer.labels_

    # One scatter call per cluster so each gets its own colour.
    for cluster_id in cluster_ids:
        members = where(assignments == cluster_id)
        first_pc = principal_components[members, 0]
        second_pc = principal_components[members, 1]
        plt.scatter(first_pc, second_pc, s=75)

    column_names = {0: 'PC1', 1: 'PC2', 2: 'PC3', 'y': 'Race'}
    result_df.rename(column_names, axis=1, inplace=True)
    print(result_df)

    plt.title("Mean Shift Clustering")
    add_race_labels(result_df)
    calc_silhouette(data=principal_components,
                    prediction=assignments,
                    n_clusters=len(cluster_ids))
    return result_df
def meanshift_cluster(bandwidth, vectors):
    """
    Cluster ``vectors`` with scikit-learn's Mean Shift.

    When ``bandwidth`` is falsy the estimator is built without an explicit
    bandwidth. Timing and the number of centres are printed.

    Returns a tuple (fitted estimator, assigned cluster labels, centres).
    """
    started = time.time()
    options = dict(seeds=None, bin_seeding=False, min_bin_freq=1,
                   cluster_all=True, n_jobs=-1)
    if bandwidth:
        options['bandwidth'] = bandwidth
    else:
        print('no bandwidth given, will estimate best.')
    mscluster = MeanShift(**options)

    assigned_clusters = mscluster.fit_predict(vectors)
    center = mscluster.cluster_centers_

    elapsed = time.time() - started
    print('MeanShift Clustering took {:.2f} seconds'.format(elapsed))
    # Reports the bandwidth that was passed in, not an estimated one.
    print('Found {} clusters with bandwith = {}'.format(
        len(center), bandwidth))
    return mscluster, assigned_clusters, center
def run(self, ncpus, steps=None):
    """
    Cluster the coordinates of every feature of the full simulation.

    Parameters
    ----------
    ncpus : int
        Number of processors.
    steps : optional
        Forwarded unchanged to ``self.get_coords``.

    Returns
    -------
    The value stored in ``self.cluster_dict``, built with ``hl.list_dict``
    from one ``Cluster`` per cluster found for each feature.
    """
    coords_by_feature = self.get_coords(ncpus, steps)
    estimator = MeanShift(bandwidth=1, n_jobs=ncpus, cluster_all=True)
    self.cluster_dict = {}

    for feature, coords in coords_by_feature.items():
        assignments = estimator.fit_predict(coords)

        # Tally how many points were assigned to each cluster.
        frequencies = {}
        for label in assignments:
            frequencies = hl.frequency_dict(frequencies, label, 1)

        for label, frequency in frequencies.items():
            found = Cluster(label, frequency,
                            estimator.cluster_centers_[label])
            self.cluster_dict = hl.list_dict(self.cluster_dict, feature,
                                             found)

    return self.cluster_dict
def MeanShiftPercentTotal(self):
    '''
    Type: MeanShift
    Y-axis: % Reactions
    X-axis: # Observations

    Clusters self.percentTotal with Mean Shift (bandwidth 2) and shows a
    scatter plot: cluster 0 in green, cluster 1 in red, cluster centres as
    black stars, and every point annotated with its entry of self.labels.
    Does nothing unless self.authenticated is truthy. Returns None.
    '''
    if not self.authenticated:
        return

    from sklearn.cluster import MeanShift as MS
    points = self.percentTotal
    algorithm = MS(bandwidth=2)
    categories = algorithm.fit_predict(points)
    centers = algorithm.cluster_centers_

    plt.scatter(points[categories == 0, 0], points[categories == 0, 1],
                c="green")
    plt.scatter(points[categories == 1, 0], points[categories == 1, 1],
                c="red")
    plt.scatter(centers[:, 0], centers[:, 1], c="black", marker="*")
    for i, txt in enumerate(self.labels):
        plt.annotate(txt, (points[i][0], points[i][1]))
    plt.ylabel("PERCENT")
    plt.xlabel("TOTAL")
    # zip stops at the shorter sequence, so a run that finds a single
    # cluster no longer fails with an IndexError on the second centre.
    captions = ("NO INFLAMMATION", "CAUSES INFLAMMATION")
    for caption, center in zip(captions, centers):
        plt.annotate(caption, center)
    plt.title("MeanShift: # Observations, % Reactions")
    plt.show()
def __get_stations_clusters_meanshift(self, coords, var):
    """Return the Mean Shift cluster label of every row of ``coords``.

    ``var`` is used as the bandwidth. Points are not forced into a cluster
    (``cluster_all=False``).
    """
    # Author's note on n_jobs=1: faster than multi-threaded, but it still
    # prevents parallel execution, unfortunately.
    return MeanShift(
        bandwidth=var,
        cluster_all=False,
        n_jobs=1,
    ).fit_predict(coords)
def clusterMeanShift(ndf):
    """Cluster a CSV file with Mean Shift in a 2-D PCA space and plot it.

    Arguments:
    ndf -- path of the CSV file (read with ISO-8859-1 encoding)

    The data is reduced to two principal components, each row is
    L2-normalised, and the result is clustered. The members of cluster 0
    and all cluster centres are plotted. Nothing is returned.
    """
    df = pd.read_csv(ndf, encoding="ISO-8859-1")
    reduced_data = PCA(n_components=2).fit_transform(df)
    reduced_data = normalize(reduced_data, norm='l2', axis=1, copy=True,
                             return_norm=False)

    # The bandwidth must be estimated on the data that is actually
    # clustered; an estimate taken on the raw frame is on a different scale
    # than the reduced, normalised points.
    bandwidth = estimate_bandwidth(reduced_data, quantile=0.3)
    clusters = MeanShift(bandwidth=bandwidth, bin_seeding=True)
    ms = clusters.fit_predict(reduced_data)

    plt.scatter(reduced_data[ms == 0, 0], reduced_data[ms == 0, 1],
                s=50, c='lightgreen', edgecolor='black', marker='o',
                label='cluster 1')
    plt.scatter(clusters.cluster_centers_[:, 0],
                clusters.cluster_centers_[:, 1],
                s=80, c='red', marker='*', label='centroides')
    plt.legend()
    plt.grid()
    plt.show()
def customer_clustering():
    """Cluster the customer data with Mean Shift and print an accuracy.

    Rows are treated as four consecutive groups (indices 0-199, 200-399,
    400-599 and 600 onwards). For each group the most frequent cluster
    label counts as the correct guess; the accuracy is the number of
    correct guesses divided by 800.

    Returns:
        A tuple ``(data, predict_labels)``.
    """
    data = data_helpers.read_feature_data(file_path='./data/customer_data')
    ms_model = MeanShift(bandwidth=0.18)
    predict_labels = ms_model.fit_predict(data)

    # NOTE(review): the total is fixed at 800 rows -- confirm the data set
    # always has that size.
    total_guess_num = 800
    group_size = 200
    group_count = 4

    # One column per cluster actually found (at least four), so that more
    # than four clusters no longer raise an IndexError.
    label_count = max(group_count, len(ms_model.cluster_centers_))
    predict_label_count_matrix = [[0] * label_count
                                  for _ in range(group_count)]
    for id_, label_ in enumerate(predict_labels):
        group = min(id_ // group_size, group_count - 1)
        predict_label_count_matrix[group][label_] += 1

    correct_guess_num = sum(max(row) for row in predict_label_count_matrix)
    accuracy = float(correct_guess_num) / float(total_guess_num)
    print('accuracy:' + str(accuracy))
    return data, predict_labels
def test():
    """Segment the house image twice and show both results.

    First with scikit-learn's MeanShift, then with this project's
    ``mean_shift``; each result is displayed next to the original image.
    """
    from skimage import io
    from sklearn.cluster import MeanShift

    def show_side_by_side(original, segmentation, heading):
        # Original on the left, segmentation on the right, axes hidden.
        figure, (left, right) = plt.subplots(1, 2, figsize=(10, 5))
        for axis in (left, right):
            axis.set_axis_off()
        left.imshow(original, cmap='gray')
        left.set_title('original')
        right.imshow(segmentation, cmap='gray')
        right.set_title('segmentation')
        figure.suptitle(heading, fontsize=16)
        plt.show()

    house = io.imread('lab4/images/house.jpg', as_gray=True)
    n_row, n_col = house.shape
    # Feature space is one-dimensional for a grayscale image.
    data = house.reshape(-1, 1)

    # Segmentation using the library estimator (the heading says
    # scikit-image, although MeanShift is imported from scikit-learn).
    ms = MeanShift(bandwidth=20, bin_seeding=True)
    clusters = ms.fit_predict(data).reshape(-1, 1)
    show_side_by_side(house, clusters.reshape((n_row, n_col)),
                      "scikit-image library function")

    # Segmentation using our own algorithm.
    bandwidth = 20
    clusters = mean_shift(house, bandwidth=bandwidth)
    show_side_by_side(house, clusters.reshape((n_row, n_col)),
                      f"our algorithm (bandwidth = {bandwidth})")
def test_meanshift_predict(global_dtype):
    # MeanShift.predict on the training data must reproduce the labels
    # returned by fit_predict.
    estimator = MeanShift(bandwidth=1.2)
    samples = X.astype(global_dtype, copy=False)
    fitted_labels = estimator.fit_predict(samples)
    predicted_labels = estimator.predict(samples)
    assert_array_equal(fitted_labels, predicted_labels)
class MeanShiftClustering(ModelBase):
    """Mean Shift clustering that remembers the sample closest to each
    cluster centre.

    The bandwidth is estimated from the data passed to the constructor.
    """

    def __init__(self, X):
        """Estimate the bandwidth from ``X`` and build the estimator."""
        self.bw = self.find_bandwidth(X)
        self.cluster_labels = None
        self.centroid = None
        # Index of the sample nearest to each centre; filled by fit().
        self.near_metric_idx = []
        self.model = MeanShift(bandwidth=self.bw)

    def _reset(self):
        """Clear the bandwidth and everything learned by fit()."""
        self.bw = None
        self.cluster_labels = None
        self.centroid = None
        self.near_metric_idx = []

    def find_bandwidth(self, X):
        """Return the bandwidth estimated from ``X`` with quantile 0.25."""
        return estimate_bandwidth(X, quantile=0.25)

    def fit(self, X):
        """Cluster ``X`` and record the sample nearest to each centre.

        Returns:
            self
        """
        self.cluster_labels = self.model.fit_predict(X)
        self.centroid = self.model.cluster_centers_

        # Per cluster: (distance to the centre, sample index) of each member.
        members = defaultdict(list)
        for i, v in enumerate(self.cluster_labels):
            distance = cdist([self.centroid[v]], [X[i]], 'euclidean')[0][0]
            members[v].append((distance, i))

        # The smallest tuple is the nearest sample (ties go to the lower
        # index); clusters appear in order of first occurrence.
        self.near_metric_idx = [min(pairs)[1] for pairs in members.values()]
        return self

    def get_closest_samples(self, labels):
        """Return the entries of ``labels`` at the nearest-sample indices."""
        return [labels[idx] for idx in self.near_metric_idx]
def user_rating_clustering():
    """Cluster the user/movie rating features with a default Mean Shift
    model and print the predicted labels."""
    ratings = data_helpers.read_feature_data(
        file_path='../data/user_movie_rating')
    clusterer = MeanShift()
    assignments = clusterer.fit_predict(ratings)
    # The centres are read but not used any further here.
    centers = clusterer.cluster_centers_
    print(assignments)
def cluster_list(self, prob_list, bandwidth=-1):
    """Return the mean of the highest-valued Mean Shift cluster.

    The values of ``prob_list`` are clustered in one dimension. With a
    single cluster the mean of all values is returned; otherwise the mean
    of the cluster whose average is largest. A non-positive ``bandwidth``
    lets the estimator choose its own.
    """
    if bandwidth <= 0:
        cluster_model = MeanShift()
    else:
        cluster_model = MeanShift(bandwidth=bandwidth)

    samples = np.array(prob_list).reshape(-1, 1)
    label_list = cluster_model.fit_predict(samples)

    if np.max(label_list) + 1 == 1:
        return np.mean(prob_list)

    # Group the original values by their cluster label.
    groups = {}
    for position, label in enumerate(label_list):
        groups.setdefault(label, []).append(prob_list[position])

    best_index, best_mean = -1, -100
    for index in range(len(groups)):
        group_mean = np.mean(groups[index])
        if group_mean > best_mean:
            best_index, best_mean = index, group_mean
    return np.mean(groups[best_index])
def visual(c, X, y):
    """Cluster ``X`` with Mean Shift, evaluate it and plot both labelings.

    The predicted cluster labels are printed, ``evaluation_labels`` and
    ``evaluation`` are called, and two scatter plots of the first two
    columns of ``X`` are shown: one grouped by ``y``, one grouped by the
    predicted clusters. ``c`` is not used.
    """
    from sklearn.cluster import MeanShift

    def scatter_groups(assignments, groups, heading):
        # One scatter call per group so each gets its own colour.
        for group in groups:
            members = np.where(assignments == group)
            plt.scatter(X[members, 0], X[members, 1])
        plt.title(heading)
        plt.xlabel('X1')
        plt.ylabel('X2')
        plt.legend()
        plt.show()

    y_pred = MeanShift().fit_predict(X)
    clusters = np.unique(y_pred)

    print("Cluster Labels")
    print(clusters)
    print("Evaluation")
    evaluation_labels(y, y_pred)
    evaluation(X, y_pred)

    scatter_groups(y, np.unique(y), 'Dataset')
    scatter_groups(y_pred, clusters, 'Clusters')
def MeanShiftRatio(self):
    '''
    Type: MeanShift
    Y-axis: No Reaction
    X-axis: Reaction

    Clusters self.allCoord with Mean Shift (bandwidth 2) and shows a
    scatter plot: cluster 0 in green, cluster 1 in red, cluster centres as
    black stars, and every point annotated with its entry of self.labels.
    Does nothing unless self.authenticated is truthy. Returns None.
    '''
    if not self.authenticated:
        return

    from sklearn.cluster import MeanShift as MS
    points = self.allCoord
    algorithm = MS(bandwidth=2)
    categories = algorithm.fit_predict(points)
    centers = algorithm.cluster_centers_

    plt.scatter(points[categories == 0, 0], points[categories == 0, 1],
                c="green")
    plt.scatter(points[categories == 1, 0], points[categories == 1, 1],
                c="red")
    plt.scatter(centers[:, 0], centers[:, 1], c="black", marker="*")
    for i, txt in enumerate(self.labels):
        plt.annotate(txt, (points[i][0], points[i][1]))
    plt.ylabel("NO REACTION")
    plt.xlabel("REACTION")
    # zip stops at the shorter sequence, so a run that finds a single
    # cluster no longer fails with an IndexError on the second centre.
    captions = ("NO INFLAMMATION", "CAUSES INFLAMMATION")
    for caption, center in zip(captions, centers):
        plt.annotate(caption, center)
    plt.title("MeanShift: Reaction, No Reaction")
    plt.show()
def hyper_meanshift(args):
    """Objective for the hyper-parameter search: negated silhouette score.

    ``args`` supplies 'bandwidth' and 'min_bin_freq'. The model is fitted
    on the module-level ``data_file.data``; the score is negated so that a
    minimiser maximises the silhouette.
    """
    clusterer = MeanShift(
        n_jobs=-1,
        bandwidth=args['bandwidth'],
        min_bin_freq=int(args['min_bin_freq']),
    )
    samples = data_file.data
    assignments = clusterer.fit_predict(samples)
    return -sil_score(samples, assignments)
def hyper_meanshift(args):
    """Objective for the hyper-parameter search: negated silhouette score.

    ``args`` supplies 'bandwidth' and 'min_bin_freq'. The model is fitted
    on the module-level ``basic_data`` while the silhouette is computed on
    the module-level ``all_data``; the score is negated so that a minimiser
    maximises it.
    """
    clusterer = MeanShift(
        min_bin_freq=int(args['min_bin_freq']),
        bandwidth=args['bandwidth'],
    )
    assignments = clusterer.fit_predict(basic_data)
    silhouette = sil_score(all_data, assignments)
    return -silhouette
def rgbMeanShiftImageSeg(img, num_clusters, defaultColors, difThres):
    """Segment a three-channel image by Mean Shift on its pixel values.

    ``num_clusters``, ``defaultColors`` and ``difThres`` are accepted but
    not used.

    Returns:
        A tuple ``(img_labels, centers)``: the per-pixel cluster labels
        shaped like the first two image dimensions, and the cluster
        centres.
    """
    rows, cols = img.shape[0], img.shape[1]
    # One row per pixel, one column per channel.
    pixels = img.reshape(rows * cols, 3)

    estimated_bw = estimate_bandwidth(pixels, quantile=0.2, n_samples=500)
    clusterer = MeanShift(bin_seeding=True, bandwidth=estimated_bw)

    label_image = clusterer.fit_predict(pixels).reshape(img.shape[:2])
    return label_image, clusterer.cluster_centers_
def meanshift(data, k, right_labels):
    """Cluster ``data`` with Mean Shift and score it against known labels.

    The bandwidth is estimated from ``data``; ``k`` is used as
    ``min_bin_freq``.

    Returns:
        A tuple of the predicted labels and the value returned by
        ``accuracy(labels, right_labels)``.
    """
    clusterer = MeanShift(
        bandwidth=estimate_bandwidth(data),
        bin_seeding=True,
        min_bin_freq=k,
        n_jobs=-1,
    )
    predicted = clusterer.fit_predict(data)
    score = accuracy(predicted, right_labels)
    return predicted, score
def mean_shift_other(df, target_columns):
    """Cluster the ``target_columns`` of ``df`` with a default Mean Shift
    model, store the labels in a new 'cluster' column (``df`` is modified
    in place) and return ``df``."""
    clusterer = MeanShift()
    df['cluster'] = clusterer.fit_predict(df[target_columns])
    return df
def main():
    """Load image, collect pixels, cluster, create segment images, plot.

    The coffee sample image is resized to 256x256 and converted to HSV.
    Every pixel becomes a 5-D sample (three HSV channels plus its scaled
    row and column position) and the samples are clustered with Mean
    Shift. The original image and up to eight segment images, largest
    cluster first, are passed to ``plot_images``.
    """
    # scipy.misc.imresize no longer exists in SciPy; skimage's resize
    # returns floats in [0, 1] for this uint8 image, which matches the
    # former "imresize(...) / 255.0".
    from skimage.transform import resize

    # load image
    img_rgb = resize(data.coffee(), (256, 256))
    img = color.rgb2hsv(img_rgb)
    height, width, channels = img.shape
    print("Image shape is: ", img.shape)

    # collect pixels as rows of (h, s, v, y, x), in row-major pixel order
    print("Collecting pixels...")
    ys, xs = np.mgrid[0:height, 0:width]
    pixels = np.column_stack((
        img[..., :3].reshape(-1, 3),
        (ys.ravel() / float(height)) * 2.0,
        (xs.ravel() / float(width)) * 2.0,
    ))
    print("Found %d pixels to cluster" % (len(pixels)))

    # cluster the pixels using mean shift
    print("Clustering...")
    bandwidth = estimate_bandwidth(pixels, quantile=0.05, n_samples=500)
    clusterer = MeanShift(bandwidth=bandwidth, bin_seeding=True)
    labels = clusterer.fit_predict(pixels)

    # order the labels by how many pixels they contain, largest first
    counts = np.bincount(labels)
    labels_unique = sorted(set(labels.tolist()),
                           key=lambda label: counts[label],
                           reverse=True)
    nb_clusters = len(labels_unique)
    print("Found %d clusters" % (nb_clusters))
    print(labels.shape)

    # one darkened copy of the image per displayed segment, with the
    # first channel of the segment's pixels set to full intensity
    print("Creating images of segments...")
    label_map = labels.reshape(height, width)
    img_segments = []
    for label in labels_unique[:8]:
        segment = np.copy(img_rgb) * 0.25
        segment[label_map == label, 0] = 1.0
        img_segments.append(segment)

    print("Plotting...")
    images = [img_rgb]
    titles = ["Image"]
    for i, segment in enumerate(img_segments):
        images.append(segment)
        titles.append("Segment %d" % (i))
    plot_images(images, titles)
def evaluate_learners(X):
    '''
    Run several clustering learners on X to compare how each
    configuration behaves.

    This is a generator: for each learner, in order, it yields a tuple
    (title, predicted classes).
    '''
    from sklearn.cluster import (MeanShift, MiniBatchKMeans,
                                 SpectralClustering, AgglomerativeClustering)

    # Title paired with a factory, so each learner is only built when the
    # consumer reaches it. bandwidth=None lets Mean Shift use its own
    # heuristic for the number of clusters.
    learners = [
        ('Mean Shift clusters',
         lambda: MeanShift(bandwidth=None)),
        ('K Means clusters',
         lambda: MiniBatchKMeans(n_clusters=2)),
        ('Spectral clusters',
         lambda: SpectralClustering(n_clusters=2)),
        ('Agglomerative clusters (N=2)',
         lambda: AgglomerativeClustering(n_clusters=2)),
        ('Agglomerative clusters (N=5)',
         lambda: AgglomerativeClustering(n_clusters=5)),
    ]
    for title, make_learner in learners:
        predicted = make_learner().fit_predict(X)
        yield title, predicted
def obtainClusters(self, hist):
    """Cluster the rows of ``hist`` with Mean Shift.

    ``hist`` is converted to a float array and standardised; the bandwidth
    is estimated on the standardised data with quantile 0.3.

    Returns:
        The cluster label of every row.
    """
    # print(...) with a single argument behaves the same on Python 2 and 3,
    # unlike the print statement, which is a SyntaxError on Python 3.
    print('Obatining clusters using MeanShift from skilean...')
    hist = np.array(hist).astype(float)
    scaled_vec = StandardScaler().fit_transform(hist)
    bandwidth = estimate_bandwidth(scaled_vec, quantile=0.3)
    ms = MEANSHIFT(bandwidth=bandwidth, bin_seeding=True)
    clusters = ms.fit_predict(scaled_vec)
    print('Clusters obtained using MeanShift')
    return clusters
def mean_shift(data,metric):
    """Cluster ``data`` with Mean Shift and score the result.

    The bandwidth is estimated from all of ``data`` with quantile 0.2.
    When every point lands in cluster 0 no score is computed and the
    string 'None' is reported instead.

    Returns:
        A tuple ``('Mean Shift', n_clusters, score, elapsed_seconds)``.
    """
    t0 = time()
    bandwidth = estimate_bandwidth(data, quantile=0.2, n_samples=len(data))
    # Actually use the estimated bandwidth; it used to be computed and then
    # ignored, leaving the estimator on its own default estimate.
    model = MeanShift(bandwidth=bandwidth, cluster_all=True)
    labels = model.fit_predict(data)
    if np.count_nonzero(labels) != 0:
        score = accuracy.getAccuracy(data,labels,len(data),metric)
    else:
        score = 'None'
    t1 = time()
    n_clusters_ = len(np.unique(labels))
    return ('Mean Shift',n_clusters_,score,t1-t0)
def compute_clusters():
    '''
    Calculates the cluster centres from the reports in the database.

    Reports are clustered on (latitude, longitude) with Mean Shift, using
    settings.THRESHOLD as bandwidth. The result is handed to
    _update_clusters as two lists: (cluster label, centre) pairs and
    (cluster label, category) pairs, one per report.
    '''
    data = Report.objects.all().values('latitude', 'longitude', 'category')
    X = np.array([[d['latitude'], d['longitude']] for d in data])

    model = MeanShift(bandwidth=settings.THRESHOLD)
    labels = model.fit_predict(X)

    # Getting metrics for each cluster.
    # Materialise the pairs: a bare zip() is a single-use iterator on
    # Python 3 and would be exhausted after one pass in the helper.
    categories = [d['category'] for d in data]
    label_metrics = list(zip(labels, categories))

    # Cluster label i refers to cluster_centers_[i]; enumerate pairs them
    # directly instead of relying on the iteration order of a set.
    clusters = list(enumerate(model.cluster_centers_))
    _update_clusters(clusters, label_metrics)
def mean_shift():
    """
    MeanShift discovers blobs in a smooth density of samples.

    It is a centroid algorithm which works by updating candidates for
    centroids to be the mean of the positions within a given region. These
    candidates are then filtered in a post-processing stage to eliminate
    near-duplicates and form the final list of centroids.

    This demo clusters 3000 points drawn from three Gaussian blobs, prints
    the number of centroids found and the clustering accuracy, and shows a
    scatter plot coloured by cluster.
    """
    # Set a generic data sample.
    centers = [[-1., 0.], [0., 1.], [1., 0.]]
    n_samples = 3000
    std = 0.5
    seed = 0
    data, target = make_blobs(n_samples=n_samples, centers=centers,
                              random_state=seed, cluster_std=std)

    # Set bandwidth for the mean shift classifier.
    width = estimate_bandwidth(data, quantile=0.2,
                               n_samples=int(n_samples / 5))

    # Setup the classifier.
    clf = MeanShift(bandwidth=width, bin_seeding=True)
    ms_y = clf.fit_predict(data)

    # Evaluate accuracy against the true blob labels. Cluster ids are
    # arbitrary, so each cluster is credited with its most frequent true
    # label. (Comparing the predictions with clf.labels_ compares them
    # with themselves and always reports 0.)
    correct = 0
    for cluster_id in np.unique(ms_y):
        true_labels = target[ms_y == cluster_id]
        correct += np.bincount(true_labels).max()
    acc = float(correct) / float(n_samples)

    # Print results.
    print('Approximated number of centroids ', len(clf.cluster_centers_))
    print('Accuracy ', acc)

    # Plot clusters.
    plt.figure(figsize=(8, 8))
    plt.scatter(data[:, 0], data[:, 1], c=ms_y, s=30)
    plt.title('Clusters found with the Mean-shift method')
    plt.show()
def predictMeanShift(X, labels):
    """Cluster ``X`` with Mean Shift and plot it in a 2-D PCA projection.

    The bandwidth is estimated from ``X`` (quantile 0.2, 500 samples). The
    predicted labels and the number of clusters are printed, then the
    samples are scattered on their first two principal components,
    coloured by cluster. The ``labels`` argument is not used. Nothing is
    returned.
    """
    bandwidth = estimate_bandwidth(X, quantile=0.2, n_samples=500)
    ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
    results = ms.fit_predict(X)
    # print(...) with a single argument behaves the same on Python 2 and 3,
    # unlike the print statement, which is a SyntaxError on Python 3.
    print(list(results))

    n_clusters_ = len(np.unique(ms.labels_))
    print("number of estimated clusters : %d" % n_clusters_)

    # Project the samples onto two principal components for plotting.
    pca_2 = PCA(2)
    plot_columns = pca_2.fit_transform(X)

    # Scatter plot of every sample, shaded according to cluster assignment.
    plt.scatter(x=plot_columns[:,0], y=plot_columns[:,1], c=results)
    # Report the number of clusters actually found instead of a fixed "4".
    plt.title("Mean Shift- %d clusters" % n_clusters_)
    plt.show()
# Standardise the feature matrix and derive a 10-component truncated-SVD
# representation from the standardised copy.
featMatrixSTD=StandardScaler().fit_transform(featMatrix)
featMatrixSTD=featMatrixSTD#+np.abs(featMatrixSTD.min())+1.e-15
print(featMatrixSTD.min())
#featMatrix=RobustScaler(with_centering=False).fit_transform(featMatrix)
nmfTrf=TruncatedSVD(n_components=10)
nmfFeats=nmfTrf.fit_transform(featMatrixSTD)
dfTest=paDataFrame(featMatrixSTD[:,:10])
# Sample-by-sample matrix of dot products of the raw (unscaled) features.
corr=np.dot(featMatrix,featMatrix.T)
print(corr.shape)
# Mean Shift on the raw feature matrix, with 70% of the estimated bandwidth.
# NOTE(review): clustering uses featMatrix, not the standardised copy --
# confirm this is intended.
bandwidth = estimate_bandwidth(featMatrix, quantile=0.2, n_samples=500)
ms = MeanShift(bandwidth=bandwidth*0.7, bin_seeding=True)
print('bandwidth',bandwidth)
labels=ms.fit_predict(featMatrix)
# Disabled alternative: DBSCAN on a distance matrix derived from corr.
# db = DBSCAN(eps=0.2, min_samples=10,metric='precomputed')
# dMat=1.-corr
# labels=db.fit_predict(dMat)
print(np.unique(labels))
# Reorder rows and columns of corr so samples of the same cluster are
# adjacent.
sorted_labels=np.argsort(labels)
print(sorted_labels)
corrSorted=corr[sorted_labels,:]
corrSorted=corrSorted[:,sorted_labels]
print(corr.shape,corrSorted.shape)
# Indices of the samples assigned to clusters 1 and 2.
lab1=np.where(labels==1)[0]
lab2=np.where(labels==2)[0]
scenes = get_joly_scenes_sementation(frames, nb_std_above_mean_th=2.)#get_scenes_segmentation(diffs, nb_std_above_mean_th=2.5) del frames #not take to much memory too long for nothing... scenes_hashes = [get_hash_of_hashes(L[s:e]) for s, e in scenes] #tqdm.write(pformat(Counter(scenes_hashes))) distance_matrix = np.zeros([len(scenes_hashes)] * 2) #compute distance between scenes' hashes for i in trange(len(scenes_hashes)): for j in range(len(scenes_hashes)): distance_matrix[i, j] = hamming(scenes_hashes[i], scenes_hashes[j]) #find similar scenes which have hases distance too close compared to others similar_scenes_matrix = distance_matrix < 64 - (distance_matrix.mean() + distance_matrix.std() * 3) #try to automatically found clusters with affinity propagation cluster_builder = MeanShift(bandwidth=1) scenes_clusters = cluster_builder.fit_predict(similar_scenes_matrix) #find the clusters with 'too much' points inside compared to others clusters_counter = Counter(scenes_clusters) clusters_freq = np.array(list(clusters_counter.values())) clusters_freq_th = clusters_freq.mean() + clusters_freq.std() * 2.5 frequent_clusters_id = list(filter(lambda k: clusters_counter[k] > clusters_freq_th, clusters_counter)) #find hashes corresponding to these clusters scenes_hashes_idx = np.array(list(map(lambda v: v in frequent_clusters_id, scenes_clusters))) generics_scenes_hashes = np.array(scenes_hashes, dtype=np.uint64)[scenes_hashes_idx] #get the generics indexes from the scene hashes generics_scenes_idx = [] for i, h in enumerate(scenes_hashes): if h in generics_scenes_hashes: generics_scenes_idx.append(i) #get the boundaries of gnerics scenes generics_scenes = list(map(lambda i: scenes[i], generics_scenes_idx))
def mean_shift_clustering(features, labels):
    """Cluster ``features`` with a default Mean Shift model, print the
    value returned by ``get_impurity`` for the predictions against
    ``labels`` and plot the clustering."""
    model = MeanShift()
    predictions = model.fit_predict(features)
    # print(...) with a single argument behaves the same on Python 2 and 3,
    # unlike the print statement, which is a SyntaxError on Python 3.
    print(get_impurity(predictions, labels))
    plot_clustering(features, labels, predictions)
# print(...) with a single argument behaves the same on Python 2 and 3,
# unlike the print statement, which is a SyntaxError on Python 3.
print(result)

# Run Mean Shift once per bandwidth quantile and report the outcome.
index = [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]
for i in range(0,len(index)) :
    print("quantile : %f"%index[i])
    # Use the quantile of this iteration; the estimate used to be taken
    # with index[1] on every pass, so all runs were identical.
    bandwidth = estimate_bandwidth(data, quantile=index[i], n_samples=len(data))
    print ("bandwidth : %f"% bandwidth)
    ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
    # Fit once: fit() returns the estimator (printed as before) and the
    # labels are read from it instead of refitting twice more.
    print(ms.fit(data))
    labels = ms.labels_
    print ("labels : ",labels)
    cluster_centers = ms.cluster_centers_
    labels_unique = np.unique(labels)
map_sizes

# <codecell>

from sklearn.cluster import MeanShift

# Cluster the 'Start' positions of every transcription factor separately
# (bandwidth 10) and collect the labelled rows in one table.
cluster_data = DataFrame(columns = ['Patient ID',
                                    'Visit Number',
                                    'TFName',
                                    'Start',
                                    'Cluster'])
# Gather the per-factor frames and concatenate once; concatenating inside
# the loop copies the growing table on every pass.
_cluster_frames = [cluster_data]
for tf, num in zip(tf_counts.index, tf_counts.values):
    # .loc replaces the .ix indexer, which was removed from pandas.
    data = tf_grouped.loc[tf].reset_index()
    data['TFName'] = tf
    clust = MeanShift(bandwidth = 10)
    res = clust.fit_predict(data[['Start']].values)
    data['Cluster'] = res
    _cluster_frames.append(data)
cluster_data = concat(_cluster_frames, axis = 0, ignore_index = True)

# <codecell>

# Patient/visit rows against (factor, cluster) columns. crosstab takes
# index=/columns=; the old rows=/cols= keywords were removed from pandas.
res = crosstab(index = [cluster_data['Patient ID'], cluster_data['Visit Number']],
               columns = [cluster_data['TFName'], cluster_data['Cluster']])

# <codecell>

from sklearn.cluster import k_means, mean_shift

centroids, labels = mean_shift(res.values)
def test_meanshift_predict():
    """MeanShift.predict on the training data reproduces fit_predict."""
    estimator = MeanShift(bandwidth=1.2)
    fitted_labels = estimator.fit_predict(X)
    predicted_labels = estimator.predict(X)
    assert_array_equal(fitted_labels, predicted_labels)
# NOTE(review): the file is not closed in the visible part of the script --
# confirm it is closed further down.
Html_file = open("clustering_files/meanshift.html", "w")

# Consider at most 10000 randomly chosen rows (Mean Shift complexity).
# The mask always has one entry per row of X, also when X has fewer than
# 10000 rows.
n_keep = min(10000, X.shape[0])
ind = np.array(n_keep * [1] + (X.shape[0] - n_keep) * [0]).astype(bool)
ind = shuffle(ind)

data_thr10 = pd.DataFrame(X[ind])
data_thr10.columns = data.columns

# Standardise on the full data, then keep only the sampled rows.
scaler = StandardScaler()
X = scaler.fit_transform(X)
X = X[ind]

km = MeanShift(cluster_all=False)
preds = km.fit_predict(X)
# Unassigned points (label -1) are gathered in an extra cluster of their own.
preds[preds == -1] = max(preds) + 1
# Formatted so that the output is the same on Python 2 and Python 3; the
# print statement is a SyntaxError on Python 3.
print("components %s" % (set(preds),))
print(np.bincount(preds))

data_thr10['preds'] = pd.Series(preds).astype("category")

color_key = ["red", "blue", "yellow", "grey", "black", "purple", "pink",
             "brown", "green", "orange"] * 2

title = str(np.bincount(preds))
TOOLS = "wheel_zoom,box_zoom,reset,box_select,pan"
plot_width = 900
plot_height = 300

x_name = 'rateCA'
y_name = 'rate'