def dbscan(fig):
    global X_iris, geo
    ax = fig.add_subplot(geo + 5, projection='3d', title='dbscan')
    dbscan = cluster.DBSCAN()
    dbscan.fit(X_iris)
    res = dbscan.labels_
    core = dbscan.core_sample_indices_
    print(repr(core))
    size = [5 if i not in core else 40 for i in range(len(X_iris))]
    print(repr(size))
    for n, i in enumerate(X_iris):
        ax.scatter(*i[:3], s=size[n], c='bgrcmyk'[res[n] % 7], alpha=0.8, marker='o')
    ax.set_xlabel('X Label')
    ax.set_ylabel('Y Label')
    ax.set_zlabel('Z Label')
    return res
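# A minimal driver sketch for dbscan() above. X_iris and geo are module-level
# globals in the original snippet; the values below are assumptions for
# illustration (Iris feature matrix and a 2x3 subplot grid, so geo + 5 = 235).
import matplotlib.pyplot as plt
from sklearn import cluster, datasets

X_iris = datasets.load_iris().data
geo = 230  # add_subplot(geo + 5) -> position 5 in a 2x3 grid

fig = plt.figure()
labels = dbscan(fig)
plt.show()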
def findClusters(self, dfFrame):
    aVX, aVY = np.asarray(dfFrame["SpeedX"]) * 50, np.asarray(dfFrame["SpeedY"]) * 50
    aX, aY = np.asarray(dfFrame["AvgPosX"]), np.asarray(dfFrame["AvgPosY"])
    aId = np.asarray(dfFrame["TrackId"])
    nFrame = dfFrame["Frame"].iloc[0]  # @UnusedVariable
    # Feature vector per track: average position plus scaled velocity.
    aData = np.asarray([aX, aY, aVX, aVY]).transpose()
    # aData = StandardScaler().fit_transform(aData)
    algorithm = cluster.DBSCAN(eps=200, min_samples=2)
    # Alternatives kept from earlier experiments:
    # algorithm = cluster.SpectralClustering(n_clusters=5, eigen_solver='arpack', affinity="rbf")
    # algorithm = cluster.MeanShift(bandwidth=40, bin_seeding=True)
    # algorithm = cluster.MiniBatchKMeans(n_clusters=2)
    # algorithm = mixture.GMM(n_components=3, covariance_type='full', n_iter=100)
    # algorithm = mixture.DPGMM(n_components=len(aData), covariance_type='diag', alpha=10, n_iter=1)
    algorithm.fit(aData)
    if hasattr(algorithm, 'labels_'):
        y_pred = algorithm.labels_.astype(int)
    else:
        y_pred = algorithm.predict(aData)
    # y_pred = hcluster.fclusterdata(aData, t=50, criterion="distance")
    y_pred = self.grpActorsMngr.fixGrpsId(y_pred, aId)
    # self.plotClusters(algorithm, aData, y_pred, aId, aX, aY, nFrame)
    return y_pred
def _call_kmapper(data, col_names, interval, overlap, clustering_alg, clustering_alg_params,
                  filter_function, filter_parameters=None):
    print(filter_parameters)
    mapper = KeplerMapper()
    if len(col_names) == 1:
        data_new = np.array(data[col_names[0]]).reshape(-1, 1)
    else:
        data_new = np.array(data[col_names])

    lens_dict = {}
    if len(filter_function) == 1:
        f = filter_function[0]
        if f in data.columns:
            lens = data[f]
        else:
            lens = compute_lens(f, data_new, mapper, filter_parameters)
        lens_dict[f] = lens
    elif len(filter_function) == 2:
        lens = []
        for f in filter_function:
            if f in data.columns:
                lens_f = np.array(data[f]).reshape(-1, 1)
            else:
                lens_f = compute_lens(f, data_new, mapper, filter_parameters)
            lens.append(lens_f)
            lens_dict[f] = lens_f
        lens = np.concatenate((lens[0], lens[1]), axis=1)

    print(data_new.shape)
    print(np.max(np.max(data_new)))
    print(np.mean(np.mean(data_new)))

    if clustering_alg == "DBSCAN":
        graph = mapper.map_parallel(lens, data_new,
                                    clusterer=cluster.DBSCAN(eps=float(clustering_alg_params["eps"]),
                                                             min_samples=int(clustering_alg_params["min_samples"])),
                                    cover=Cover(n_cubes=interval, perc_overlap=overlap))
    elif clustering_alg == "Agglomerative Clustering":
        graph = mapper.map_parallel(lens, data_new,
                                    clusterer=cluster.AgglomerativeClustering(n_clusters=None,
                                                                              linkage=clustering_alg_params["linkage"],
                                                                              distance_threshold=float(clustering_alg_params["dist"])),
                                    cover=Cover(n_cubes=interval, perc_overlap=overlap))
    elif clustering_alg == "Mean Shift":
        graph = mapper.map_parallel(lens, data_new,
                                    clusterer=cluster.MeanShift(bandwidth=float(clustering_alg_params["bandwidth"])),
                                    cover=Cover(n_cubes=interval, perc_overlap=overlap))

    print(len(graph['nodes'].keys()))
    return graph, lens_dict
def get_algorithm(self):
    if self.algorithmName == "kmeans":
        cluster_alg = cluster.MiniBatchKMeans(n_clusters=int(self.parms['k']))
    elif self.algorithmName == "mean_shift":
        bandwidth = cluster.estimate_bandwidth(self.X, quantile=float(self.parms['quantile']))
        cluster_alg = cluster.MeanShift(bandwidth=bandwidth, bin_seeding=True)
    elif self.algorithmName == "affinity_propagation":
        cluster_alg = cluster.AffinityPropagation(damping=float(self.parms['damping']))
    elif self.algorithmName == "birch":
        cluster_alg = cluster.Birch(n_clusters=int(self.parms['k']))
    elif self.algorithmName == "ward":
        connectivity = kneighbors_graph(self.X, n_neighbors=int(self.parms['n_neighbors']), include_self=False)
        connectivity = 0.5 * (connectivity + connectivity.T)
        cluster_alg = cluster.AgglomerativeClustering(n_clusters=int(self.parms['k']), linkage='ward',
                                                      connectivity=connectivity)
    elif self.algorithmName == "spectral":
        cluster_alg = cluster.SpectralClustering(n_clusters=int(self.parms['k']), eigen_solver='arpack',
                                                 affinity="nearest_neighbors")
    elif self.algorithmName == "dbscan":
        cluster_alg = cluster.DBSCAN(eps=float(self.parms['eps']))
    elif self.algorithmName == "agglomerative":
        connectivity = kneighbors_graph(self.X, n_neighbors=int(self.parms['n_neighbors']), include_self=False)
        connectivity = 0.5 * (connectivity + connectivity.T)
        cluster_alg = cluster.AgglomerativeClustering(linkage="average", affinity="cityblock",
                                                      n_clusters=int(self.parms['k']), connectivity=connectivity)
    else:
        return None
    return cluster_alg
def DBSCAN(self):
    """
    Uses `sklearn's DBSCAN <http://scikit-learn.org/stable/modules/generated/sklearn.cluster.DBSCAN.html>`_

    **Defaults and var_params:** sklearn.cluster.DBSCAN(eps=0.5, min_samples=5, metric='euclidean', algorithm='auto', leaf_size=30, p=None, n_jobs=1)

    Other Parameters
    ----------------
    var_params: dict
        Pass variable params through constructor as dictionary pairs. Current default parameters are listed above.

    Returns
    -------
    labels: list of ints
        Solution of clustering labels for each object (updated in object.out)
    """
    params = {}
    params['distance'] = 'euclidean'
    params['eps'] = 0.5
    params['min_samples'] = 5
    params['metric'] = 'precomputed'
    params['algorithm'] = 'auto'
    params['leaf_size'] = 30
    params['p'] = None
    params['n_jobs'] = 1
    params = returnParams(self.var_params, params, 'DBSCAN')

    if 'distance' in self.var_params:
        if self.var_params['distance'] == 'precomputed':
            d = self.var_params['M']
        else:
            d = returnDistanceMatrix(self.data, params['distance'])
    else:
        d = returnDistanceMatrix(self.data, params['distance'])

    solution = skc.DBSCAN(eps=params['eps'], min_samples=params['min_samples'], metric=params['metric'],
                          algorithm=params['algorithm'], leaf_size=params['leaf_size'], p=params['p'],
                          n_jobs=params['n_jobs'])
    solution.fit(d)
    self.out = solution.labels_
    self.var_params = params  # update dictionary of parameters to match that used.
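# For reference, a self-contained sketch of what the wrapper above does with its
# defaults: build a pairwise distance matrix, then run DBSCAN with
# metric='precomputed'. returnParams/returnDistanceMatrix are the wrapper's own
# helpers; sklearn.metrics.pairwise_distances stands in for them here.
import numpy as np
from sklearn import cluster as skc
from sklearn.metrics import pairwise_distances

data = np.random.rand(100, 4)
d = pairwise_distances(data, metric='euclidean')
labels = skc.DBSCAN(eps=0.5, min_samples=5, metric='precomputed').fit(d).labels_
print(np.unique(labels))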
def find_objpcd_list_by_pos(self, pcd, x_range=(200, 800), y_range=(0, 600), z_range=(790, 1000),
                            eps=5, toggledebug=False, scan_num=1):
    real_pcd = pcdu.trans_pcd(pcd, self.amat)
    # pcdu.show_pcd([p for p in real_pcd if p[2] < 900], rgba=(.5, .5, .5, .1))
    # base.run()
    pcd_result = []
    for p in real_pcd:
        if x_range[0] < p[0] < x_range[1] and y_range[0] < p[1] < y_range[1] and z_range[0] < p[2] < z_range[1]:
            pcd_result.append(p)
    pcd_result = np.array(pcd_result)

    db = skc.DBSCAN(eps=eps, min_samples=50 * scan_num).fit(pcd_result)
    core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
    core_samples_mask[db.core_sample_indices_] = True
    labels = db.labels_
    n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
    print("n_clusters:", n_clusters)

    unique_labels = set(labels)
    objpcd_list = []
    for k in unique_labels:
        if k == -1:
            continue
        class_member_mask = (labels == k)
        temppartialpcd = pcd_result[class_member_mask & core_samples_mask]
        if len(temppartialpcd) > 500:
            objpcd_list.append(temppartialpcd)

    if toggledebug:
        # pcdu.show_pcd(real_pcd, rgba=(1, 1, 1, .1))
        pcdu.show_pcd(pcd_result, rgba=(1, 1, 0, 1))
        for objpcd in objpcd_list:
            pcdu.show_pcd_withrbt(objpcd, rgba=(choice([0, 1]), choice([0, 1]), 1, 1))
        base.run()
    return objpcd_list
def _set_parameters(self, **kwargs):
    '''
    Sets parameters used in fitting tracks::

        vd: drift velocity [mm/us]
        clock_period: clock period for timestamp [us]
        dbscan_eps: epsilon used for clustering [mm]
        dbscan_min_samples: min samples used for clustering

    '''
    self._vd = kwargs.get('vd', self._vd)
    self._clock_period = kwargs.get('clock_period', self._clock_period)
    self._z_scale = self._vd * self._clock_period
    self._dbscan_eps = kwargs.get('dbscan_eps', self._dbscan_eps)
    self._dbscan_min_samples = kwargs.get('dbscan_min_samples', self._dbscan_min_samples)
    self.dbscan = cluster.DBSCAN(eps=self._dbscan_eps, min_samples=self._dbscan_min_samples)
def bdscan(multishapes):
    db = cluster.DBSCAN(eps=0.3, min_samples=60)
    db.fit(multishapes)
    core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
    core_samples_mask[db.core_sample_indices_] = True
    unique_labels = set(db.labels_)
    print(unique_labels)

    # Number of clusters in labels, ignoring noise if present.
    n_clusters_ = len(unique_labels) - (1 if -1 in db.labels_ else 0)

    fig = plt.figure(figsize=(8, 6))
    colors = ['#ff0000', '#00ff00', '#0000ff', '#ff00ff', '#00ffff', '#ffff00',
              '#f6ff00', '#2f800f', '#a221b5', '#21b5ac', '#b1216c']
    for k, col in zip(unique_labels, colors):
        if k == -1:
            # Black used for noise.
            col = 'k'
        my_members = db.labels_ == k
        xy = multishapes[my_members & core_samples_mask]
        plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=col, markeredgecolor='k', markersize=11)
        xy = multishapes[my_members & ~core_samples_mask]
        plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=col, markeredgecolor='k', markersize=6)

    plt.title('Estimated number of clusters: %d' % n_clusters_)
    plt.show()
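# A minimal usage sketch for bdscan() above; 'multishapes' is assumed to be an
# (n, 2) NumPy array of points (faked here with make_moons), since the function
# indexes it with boolean masks and column slices.
import matplotlib.pyplot as plt
from sklearn import cluster, datasets

multishapes, _ = datasets.make_moons(n_samples=600, noise=0.05, random_state=0)
bdscan(multishapes)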
def update_data(attrname, old, new):
    # Get the drop down values
    algorithm = dropdown.value
    global X

    # Generate the new colors:
    if algorithm == 'MiniBatchKMeans':
        model = cluster.MiniBatchKMeans(n_clusters=2)
    elif algorithm == 'AffinityPropagation':
        model = cluster.AffinityPropagation(damping=.9, preference=-200)
    elif algorithm == 'MeanShift':
        model = cluster.MeanShift(bandwidth=bandwidth, bin_seeding=True)
    elif algorithm == 'SpectralClustering':
        model = cluster.SpectralClustering(n_clusters=2, eigen_solver='arpack', affinity="nearest_neighbors")
    elif algorithm == 'Ward':
        model = cluster.AgglomerativeClustering(n_clusters=2, linkage='ward', connectivity=connectivity)
    elif algorithm == 'AgglomerativeClustering':
        model = cluster.AgglomerativeClustering(linkage="average", affinity="cityblock", n_clusters=2,
                                                connectivity=connectivity)
    elif algorithm == 'Birch':
        model = cluster.Birch(n_clusters=2)
    elif algorithm == 'DBSCAN':
        model = cluster.DBSCAN(eps=.2)
    else:
        print('No Algorithm selected')

    model.fit(X)

    if hasattr(model, 'labels_'):
        y_pred = model.labels_.astype(int)
    else:
        y_pred = model.predict(X)

    colors = [Spectral6[i] for i in y_pred]
    source.data['colors'] = colors
    plot.title = algorithm
def train_model(x):
    epsilons = np.linspace(0.3, 1.2, 10)
    scores = []
    models = []
    for epsilon in epsilons:
        model = sc.DBSCAN(eps=epsilon, min_samples=5).fit(x)
        scores.append(ms.silhouette_score(x, model.labels_, sample_size=len(x), metric='euclidean'))
        models.append(model)
    scores = np.array(scores)
    best_index = scores.argmax()
    best_epsilon = epsilons[best_index]
    best_score = scores[best_index]
    best_model = models[best_index]
    print(best_epsilon, best_score)
    return best_model
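# A minimal usage sketch for train_model() above, assuming the module aliases
# sc = sklearn.cluster and ms = sklearn.metrics, with synthetic blob data.
import numpy as np
import sklearn.cluster as sc
import sklearn.metrics as ms
from sklearn.datasets import make_blobs

x, _ = make_blobs(n_samples=300, centers=3, cluster_std=0.3, random_state=7)
best_model = train_model(x)
print(np.unique(best_model.labels_))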
def cluster_pipelines2(clustercount):
    return {
        'Ward': Pipeline([
            ('sca', preprocessing.MaxAbsScaler()),
            ('clu', cluster.AgglomerativeClustering(n_clusters=clustercount, linkage='ward')),
        ]),
        'K-Means': Pipeline([
            ('sca', preprocessing.MaxAbsScaler()),
            ('clu', cluster.KMeans(n_clusters=clustercount, init='k-means++', max_iter=100, n_init=1)),
        ]),
        'GMM': Pipeline([
            ('sca', preprocessing.MaxAbsScaler()),
            ('clu', mixture.GaussianMixture(n_components=clustercount)),
        ]),
        'DBScan': Pipeline([
            ('sca', preprocessing.MaxAbsScaler()),
            ('clu', cluster.DBSCAN(eps=0.1, min_samples=20)),
        ]),
    }
def cluster_face_features(feature_list, method=None, precomputed=True, eps=0.5):
    face_feature_list = feature_list
    if face_feature_list is None:
        return None

    if precomputed:
        metric_type = 'precomputed'
        dist_matrix = __compute_pairwise_distance(face_feature_list)
    else:
        metric_type = 'euclidean'
        dist_matrix = np.vstack(face_feature_list)

    if method == 'AP':
        cluster_estimator = cluster.AffinityPropagation(affinity=metric_type, damping=.55, preference=-1)
        if precomputed:
            # AffinityPropagation expects similarities, so negate the distances.
            dist_matrix = -dist_matrix
    elif method == 'DBSCAN':
        cluster_estimator = cluster.DBSCAN(metric=metric_type, eps=eps, min_samples=1)

    t0 = time.time()
    cluster_estimator.fit(dist_matrix)
    t1 = time.time()
    print('Clustering takes: %f seconds' % (t1 - t0))

    if hasattr(cluster_estimator, 'labels_'):
        y_pred = cluster_estimator.labels_.astype(int)
    else:
        y_pred = cluster_estimator.predict(dist_matrix)
    return y_pred
def clustering(X, algorithm, n_clusters):
    # normalize dataset for easier parameter selection
    X = StandardScaler().fit_transform(X)

    # estimate bandwidth for mean shift
    bandwidth = cluster.estimate_bandwidth(X, quantile=0.3)

    # connectivity matrix for structured Ward
    connectivity = kneighbors_graph(X, n_neighbors=10, include_self=False)
    # make connectivity symmetric
    connectivity = 0.5 * (connectivity + connectivity.T)

    # Generate the new colors:
    if algorithm == 'MiniBatchKMeans':
        model = cluster.MiniBatchKMeans(n_clusters=n_clusters)
    elif algorithm == 'AffinityPropagation':
        model = cluster.AffinityPropagation(damping=.9, preference=-200)
    elif algorithm == 'MeanShift':
        model = cluster.MeanShift(bandwidth=bandwidth, bin_seeding=True)
    elif algorithm == 'SpectralClustering':
        model = cluster.SpectralClustering(n_clusters=n_clusters, eigen_solver='arpack',
                                           affinity="nearest_neighbors")
    elif algorithm == 'Ward':
        model = cluster.AgglomerativeClustering(n_clusters=n_clusters, linkage='ward',
                                                connectivity=connectivity)
    elif algorithm == 'AgglomerativeClustering':
        model = cluster.AgglomerativeClustering(linkage="average", affinity="cityblock",
                                                n_clusters=n_clusters, connectivity=connectivity)
    elif algorithm == 'Birch':
        model = cluster.Birch(n_clusters=n_clusters)
    elif algorithm == 'DBSCAN':
        model = cluster.DBSCAN(eps=.2)
    else:
        print('No Algorithm selected. Default is MiniBatchKMeans')
        model = cluster.MiniBatchKMeans(n_clusters=n_clusters)

    model.fit(X)

    if hasattr(model, 'labels_'):
        y_pred = model.labels_.astype(int)
    else:
        y_pred = model.predict(X)

    return X, y_pred
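# A minimal usage sketch for clustering() above, assuming the imports the
# function relies on (sklearn.cluster as cluster, StandardScaler,
# kneighbors_graph) are available at module level.
import matplotlib.pyplot as plt
from sklearn import datasets

X_raw, _ = datasets.make_moons(n_samples=500, noise=0.05, random_state=0)
X_scaled, y_pred = clustering(X_raw, 'DBSCAN', n_clusters=2)
plt.scatter(X_scaled[:, 0], X_scaled[:, 1], c=y_pred, s=10)
plt.show()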
def main():
    data_origin = read_data('balance-scale.data')
    data_converted = convert_data(data_origin, 0)
    true_labels = data_converted.iloc[:, 0]
    data_clean = clean_data(data_converted.iloc[:, 1:])
    plot_distribution(data_clean)

    dimension = 2
    data_pca = pca(dimension, data_clean)
    plot_distribution(data_pca)

    n_clusters = 3
    dimension_show = [1, 2]

    kmeans = cluster.KMeans(n_clusters=n_clusters).fit(data_clean)
    show_result(data_clean, data_pca, true_labels, kmeans, n_clusters, dimension_show)

    dbscan = cluster.DBSCAN(eps=0.38, min_samples=10).fit(data_clean)
    show_result(data_clean, data_pca, true_labels, dbscan, n_clusters, dimension_show)

    return 0
def delete_redudants(predictions, embed_model):
    X = []
    logging.info('\n# Redundancy reduction ...\n')
    for i, row in predictions.iterrows():
        X.append(get_sentence_vector(row['processed_text'], embed_model))

    dbscan = cluster.DBSCAN(eps=0.09, metric='cosine', min_samples=2).fit(X)
    labels = dbscan.labels_
    logging.info('\n# Labels\n')
    print(labels)

    predictions['label'] = labels
    isolated_tweets = predictions[predictions.label == -1]
    predictions = predictions[predictions.label != -1].drop_duplicates('label', keep='first')
    predictions = pd.concat([predictions, isolated_tweets], sort=True)
    predictions = predictions.sort_values('score', ascending=False)
    predictions.reset_index(inplace=True)
    predictions = predictions.drop(columns='processed_text')
    logging.info(f'\n# Redundancy reduction [OK]\n >> Length= {len(predictions)}\n')
    return predictions
def dbscan_seeds(goods, bads):
    """Find regions with concentration of good points."""
    from scipy.spatial import ConvexHull
    import sklearn.cluster as cl
    good_ids, good_loc = goods
    bad_ids, bad_loc = bads
    labels = cl.DBSCAN(eps=150, min_samples=8).fit_predict(good_loc)
    gcluster = []
    bcluster = []
    hulls = []
    for cluster in range(len(np.unique(labels)) - 1):
        points = good_loc[labels == cluster, :]
        hull = sgeo.Polygon(points[ConvexHull(points).vertices])
        gcluster.append(list(i.compress(good_ids, labels == cluster)))
        bcluster.append([
            id_ for id_, loc in zip(bad_ids, bad_loc)
            if hull.contains(sgeo.Point(loc))
        ])
        hulls.append(hull)
    return hulls, gcluster, bcluster
def do_dbscan(data):
    print(" Do dbscan...")
    # Retrieve the parameters
    eps = args.associer_param('eps', 1)
    min_pts = args.associer_param('min_pts', 1)
    model = cluster.DBSCAN(eps=eps, min_samples=min_pts, metric=args.args.distance)
    labels = model.fit_predict(data)
    data['cluster'] = labels
    print(" ok !")
    return data
def clustering(self, image_urls, min_samples=2, eps=0.4, pick_up_num=3):
    train = self.get_train(image_urls)
    print(train)
    if len(train) < min_samples:
        return None
    distances = self.calculate_distance(train)
    if distances is None:
        return None

    cls = cluster.DBSCAN(metric='precomputed', min_samples=min_samples, eps=eps)
    y = cls.fit_predict(distances)
    val = pd.Series(y).value_counts()
    target_clusters_index = [x for x in list(val.index) if x != -1][:pick_up_num]
    order = {key: i for i, key in enumerate(target_clusters_index)}
    picked_up = dict([(index, val) for (index, val) in enumerate(y.tolist())
                      if val in target_clusters_index])
    picked_up_ = [(order[x2], image_urls[x1])
                  for (x1, x2) in sorted(picked_up.items(), key=lambda x: order[x[1]])]
    ret = []
    for key, subiter in itertools.groupby(picked_up_, operator.itemgetter(0)):
        vals = [item[1] for item in subiter]
        ret.append({"row_id": int(key), "sumples_num": len(vals), "vals": vals})
    return ret
def cluster(self, params=None):
    # Avoid a mutable default argument; the original default is kept here.
    if params is None:
        params = {"alg": "KMeans", "num": 10}
    start = time.time()
    encodedLog = self.encodedLog.values.tolist()
    if params["alg"].lower() == "kmeans":
        if "runs" not in params:
            params["runs"] = 0
        cluster = TTestKMeans2(params["num"], encodedLog)
        print("SSE : ", cluster.inertia_)
        print("Clustering Time:", time.time() - start)
        return cluster.predict(encodedLog), cluster.cluster_centers_
    elif params["alg"].lower() == "dbscan":
        cluster = skcl.DBSCAN(min_samples=params["minsamples"], eps=params["eps"]).fit(encodedLog)
        y_pred = cluster.labels_
        centers = calcCenters(y_pred, encodedLog)
        print("Clustering Time:", time.time() - start)
        if "assignNoisy" in params and params["assignNoisy"] == True:
            y_pred, centers = assignNoisyPoints(y_pred, encodedLog, centers)
        return y_pred, centers
def cluster_pipelines(clustercount, featuredim, decompstr):
    decomp = clustervis_pipelines(featuredim)[decompstr]
    return {
        'Ward': Pipeline([
            ('decomp', decomp),
            ('clu', cluster.AgglomerativeClustering(n_clusters=clustercount, linkage='ward')),
        ]),
        'K-Means': Pipeline([
            ('decomp', decomp),
            ('clu', cluster.KMeans(n_clusters=clustercount, init='k-means++', max_iter=100, n_init=1)),
        ]),
        'GMM': Pipeline([
            ('decomp', decomp),
            ('clu', mixture.GaussianMixture(n_components=clustercount)),
        ]),
        'DBScan': Pipeline([
            ('decomp', decomp),
            ('clu', cluster.DBSCAN(eps=0.1, min_samples=20)),
        ]),
    }
def _optimize_eps(X, eps, Param, verbose=3):
    if verbose >= 3:
        print('[clusteval] >Evaluate using silhouette..')

    # Setup resolution
    eps = np.arange(0.1, 5, 1 / Param['epsres'])
    silscores = np.zeros(len(eps)) * np.nan
    sillclust = np.zeros(len(eps)) * np.nan
    silllabx = []

    # Run over all epsilons
    for i in tqdm(range(len(eps))):
        # DBSCAN
        db = cluster.DBSCAN(eps=eps[i], metric=Param['metric'], min_samples=Param['min_samples'],
                            n_jobs=Param['n_jobs']).fit(X)
        # Get labels
        labx = db.labels_
        # Fill array with the number of clusters found
        sillclust[i] = len(np.unique(labx))
        # Store all labels
        silllabx.append(labx)
        # Compute silhouette only if more than 1 cluster
        if sillclust[i] > 1:
            silscores[i] = silhouette_score(X, db.labels_)

    # Convert to array
    silllabx = np.array(silllabx)
    # Store only if it agrees with the restriction on the number of clusters
    I1 = ~np.isnan(silscores)
    I2 = sillclust >= Param['min_clust']
    I3 = sillclust <= Param['max_clust']
    Iloc = I1 & I2 & I3
    # Get only those of interest
    silscores = silscores[Iloc]
    sillclust = sillclust[Iloc]
    eps = eps[Iloc]
    silllabx = silllabx[Iloc, :]
    # Return
    return (eps, sillclust, silscores, silllabx)
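# A small sketch of how the arrays returned by _optimize_eps() could be used to
# pick a final epsilon. The Param dictionary below only mirrors the keys the
# function reads; its values are assumptions chosen for illustration.
import numpy as np
from sklearn.datasets import make_blobs

X_demo, _ = make_blobs(n_samples=400, centers=4, cluster_std=0.6, random_state=1)
Param = {'epsres': 50, 'metric': 'euclidean', 'min_samples': 5,
         'n_jobs': -1, 'min_clust': 2, 'max_clust': 10}
eps_grid, n_clust, scores, labx = _optimize_eps(X_demo, None, Param)
if len(scores):
    best = np.argmax(scores)
    print('best eps: %.3f (%d clusters, silhouette %.3f)'
          % (eps_grid[best], n_clust[best], scores[best]))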
def main():
    # Create random data.
    n = 1500
    for i, x_y in enumerate([
            datasets.make_circles(n, factor=.5, noise=.05),
            datasets.make_moons(n_samples=n, noise=.05)
    ]):
        x, y = x_y

        # Scale data to reduce weights.
        # https://openclassrooms.com/fr/courses/4444646-entrainez-un-modele-predictif-lineaire/4507801-reduisez-l-amplitude-des-poids-affectes-a-vos-variables
        std_scale = preprocessing.StandardScaler().fit(x)
        x_scaled = std_scale.transform(x)

        # Perform DBSCAN on scaled data.
        range_eps = [0.05, 0.1, 0.2, 0.3]
        range_n_min = [5, 10, 20, 30]
        nb_plots = len(range_eps) + 1  # +1: add true clusters.
        for j, eps_n_min in enumerate(zip(range_eps, range_n_min)):
            e, n_min = eps_n_min
            cls = cluster.DBSCAN(eps=e, min_samples=n_min)
            cls.fit(x_scaled)
            # Plot DBSCAN.
            axis = plt.subplot(2, nb_plots, 1 + j + nb_plots * i)
            axis.scatter(x_scaled[:, 0], x_scaled[:, 1], c=cls.labels_, s=50)
            axis.set_title('eps %04.2f, n_min %02d' % (e, n_min))

        # Plot true clusters.
        axis = plt.subplot(2, nb_plots, nb_plots + nb_plots * i)
        axis.scatter(x_scaled[:, 0], x_scaled[:, 1], c=y, s=50)
        axis.set_title('true clusters')

    plt.subplots_adjust(left=0.1, bottom=0.1, right=0.9, top=0.9, wspace=0.3, hspace=0.3)
    plt.suptitle('DBSCAN')
    plt.show()
def cluster_topics():
    # model = cluster.Birch(
    #     branching_factor=2,
    #     threshold=0.002  # Lower = more clusters, higher = fewer clusters
    # )
    # model = cluster.KMeans(
    #     branching_factor=10,
    #     threshold=0.1  # Lower = more clusters, higher = fewer clusters
    # )
    model = cluster.DBSCAN(min_samples=2, eps=0.2)
    # model = cluster.AffinityPropagation()

    vectorizer = text.HashingVectorizer(
        analyzer='char_wb',  # Character n-grams within word boundaries
        norm='l2',  # Normalize the vectors
        lowercase=True,  # Convert everything to lowercase
        stop_words=stopwords)

    num_samples = 10000
    offset = 0
    while True:
        log.debug(u"Loading topics...")
        topic_rows = db.session.query(
            models.TopicModel.id,
            models.TopicModel.topic).filter_by(clustered=False).order_by(
                models.TopicModel.id.asc()).limit(num_samples).offset(offset).all()
        if not topic_rows:
            break
        log.debug(u"Loaded {} topics".format(len(topic_rows)))
        offset += len(topic_rows)
        go_cluster(vectorizer, model, topic_rows)
def _get_cluster_dict(peak_array, eps=30, min_samples=2):
    """Sort peaks into clusters using sklearn's DBSCAN.

    Each cluster is given its own label, with the unclustered peaks
    having the label -1.

    Parameters
    ----------
    peak_array : 2D numpy array
        In the form [[x0, y0], [x1, y1], ...], i.e. shape (n, 2)
    eps : scalar
        For the DBSCAN clustering algorithm
    min_samples : int
        Minimum number of peaks in each cluster

    Returns
    -------
    cluster_dict : dict
        The peaks are sorted into a dict with the cluster label as the key.

    Example
    -------
    >>> import numpy as np
    >>> peak_array = np.random.randint(1000, size=(100, 2))
    >>> import pyxem.utils.cluster_tools as ct
    >>> cluster_dict = ct._get_cluster_dict(peak_array)
    >>> cluster0 = cluster_dict[0]

    """
    dbscan = cluster.DBSCAN(eps=eps, min_samples=min_samples)
    dbscan.fit(peak_array)
    label_list = dbscan.labels_
    label_unique_list = sorted(list(set(label_list)))
    cluster_dict = {}
    for label_unique in label_unique_list:
        cluster_dict[label_unique] = []
    for peak, label in zip(peak_array, label_list):
        cluster_dict[label].append(peak.tolist())
    return cluster_dict
def dbscan(data, eps=0.3, min_samples=10):
    """DBSCAN clustering

    Parameters
    ----------
    data : float array
        features array

    Returns
    -------
    cl : int array
        cluster indices

    Notes
    -----
    This function requires scikit-learn
    """
    db = skcluster.DBSCAN(eps=eps, min_samples=min_samples).fit(data)
    labels = db.labels_
    return labels
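# A minimal usage sketch for the dbscan() helper above, assuming the module
# imports scikit-learn's cluster module as 'skcluster'.
import numpy as np
import sklearn.cluster as skcluster
from sklearn.datasets import make_blobs

data, _ = make_blobs(n_samples=300, centers=3, cluster_std=0.2, random_state=0)
cl = dbscan(data, eps=0.3, min_samples=10)
print(np.unique(cl))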
def DBSCAN(P, eps=15, minpts=10):
    pointlist = []
    for y in range(P.shape[1]):
        for x in range(P.shape[0]):
            if P[x, y] > 0:
                pointlist.append([x, y])
    pointlist = np.array(pointlist)

    db = skc.DBSCAN(eps=eps, min_samples=minpts).fit(pointlist)
    labels = db.labels_
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)

    cluster_list = []
    # Find the largest cluster and mark it on the image
    for i in range(n_clusters_):
        one_cluster = pointlist[labels == i]
        cluster_list.append([len(one_cluster), one_cluster])
    cluster_list.sort(key=lambda x: x[0], reverse=True)

    P = np.zeros((P.shape[0], P.shape[1]))
    for pixel in cluster_list[0][1]:
        P[pixel[0], pixel[1]] = 1
    return P
def __apply_cluster_algorithms__(self, x):
    if self.algorithms == 'k-mean':
        kmeans = cluster.KMeans(n_clusters=3)
        kmeans.fit(x)
        self.labels = kmeans.labels_
        for i, label in enumerate(kmeans.labels_):
            self.clusterid_docids_map[label] = self.clusterid_docids_map.get(label, []) + [i]
    elif self.algorithms == 'dbscan':
        dbscan = cluster.DBSCAN(eps=2, min_samples=3)
        dbscan.fit(x)
        self.labels = dbscan.labels_
        for i, label in enumerate(dbscan.labels_):
            self.clusterid_docids_map[label] = self.clusterid_docids_map.get(label, []) + [i]
    else:
        sm_cluster = SparseMatrixClustering(cluster_sim_threshold=0.8, graph_manager=self.graph_manager)
        sm_cluster.fit(x)
        self.score_mat = sm_cluster.score_mat
        self.clusterid_docids_map = sm_cluster.clusterid_docids_mapping
def getDataPandas():
    reader = pd.read_table(r'.\dbscanData.txt', header=None, sep=' ')  # iterator=True, chunksize=1000
    print(reader)
    # hello = reader.iloc[0]
    reader = reader.T  # transpose
    print(reader)
    f3 = lambda x: x / x.sum()
    reader = reader.apply(f3)
    reader = reader.T  # transpose back
    print(reader)
    dbscan = cluster.DBSCAN(eps=0.3, min_samples=3, algorithm='brute', metric='euclidean')
    dbscan.fit(reader)
    res = dbscan.labels_
    print(res)
def filter_isolated_idxs(idxs=[], maxdist=3.0):
    newidxs = idxs.copy()
    seq = np.where(idxs == True)[0]
    if len(seq):
        # Cluster the indices of the True entries along one dimension.
        X = np.array(list(zip(seq, np.zeros(len(seq)))))
        cfn = cluster.DBSCAN(eps=3, min_samples=1)
        cfn.fit(X)
        clusters = {}
        for v in np.unique(cfn.labels_):
            clusters[v] = len(cfn.labels_[cfn.labels_ == v])
        # Keep only the largest cluster; switch off everything else.
        maxv = sorted(clusters, key=clusters.get)[-1]
        for i in range(len(cfn.labels_)):
            if cfn.labels_[i] != maxv:
                newidxs[seq[i]] = False
    return newidxs
def process(img: Image):
    img.thumbnail((200, 200))
    data = np.array(img.getdata())
    least_clusters = None
    clusters = None
    for i in np.linspace(5, 7, 5):
        db = cluster.DBSCAN(eps=i, min_samples=10).fit(data)
        ln = len(set(db.labels_))
        if least_clusters is None or least_clusters > ln:
            least_clusters = ln
            clusters = db.labels_
        if least_clusters <= 7:
            break
    result = []
    for i in set(clusters):
        # Average colour of each cluster, rounded to ints (quick and dirty).
        result.append(list(map(int, list(np.round(np.average(data[clusters == i], axis=0))))))
    return result