def dbscan(data, use_csi=True, epsq=None, max_cl_number=None, **kwargs):
    """
    Finds clusters of users in data using DBSCAN

    :param data: pd.DataFrame with features for clustering indexed by users (sessions)
    :param use_csi: if True, then cluster stability index will be calculated
        (may take a lot of time)
    :param epsq: quantile of nearest neighbor positive distance between dots
        (value of it will be an eps), if None, then eps from key-words will be used.
    :param max_cl_number: maximal number of clusters for aggregation of small clusters
    :param kwargs: keyword arguments; the ones whose names are DBSCAN constructor
        parameters are passed to DBSCAN, and the complete set is forwarded to
        ``cluster_stability_index`` when ``use_csi`` is true
    :return: tuple ``(cl, metrics)`` -- the cluster labels from ``fit_predict``
        (passed through ``aggregate_cl`` when ``max_cl_number`` is given) and the
        result of ``calc_all_metrics`` (with a ``'csi'`` entry when ``use_csi`` is true)
    """
    # get_params is an instance method, so ask a default-constructed estimator
    # for the parameter names instead of passing the class object as ``self``.
    dbscan_param_names = set(DBSCAN().get_params())
    kmargs = {
        name: value
        for name, value in kwargs.items()
        if name in dbscan_param_names
    }
    if epsq is not None:
        # An explicit quantile overrides any ``eps`` given through kwargs.
        kmargs['eps'] = find_best_eps(data, epsq)

    km = DBSCAN(**kmargs)
    cl = km.fit_predict(data.values)
    metrics = calc_all_metrics(data, km)

    if use_csi:
        # The one-hot membership frame is only consumed by the stability index.
        bs = pd.get_dummies(cl)
        bs.index = data.index
        metrics['csi'] = cluster_stability_index(data, km, bs, **kwargs)

    if max_cl_number is not None:
        cl = aggregate_cl(cl, max_cl_number)
    return cl, metrics
def dbscan(training_vectors, clean_vectors, anomalous_vectors, eps=0.3, min_samples=3):
    """Fit DBSCAN on the training vectors, then run the prediction helper on all sets.

    eps is the max. distance between two neighbouring datapoints.
    min_samples gives the minimum number of samples that must be found to
    form a cluster. Both parameters must be chosen carefully, depending on
    the dataset.

    Returns the tuple ``(result_clean, result_anomalous, result_training)``,
    each element being whatever ``__dbscan_predict`` returns for that set.
    """
    print("Starting DB-Scan Fitting...")

    # Build the clustering model.
    clusterer = DBSCAN(eps=eps, min_samples=min_samples)
    print("Fitting with Parameters: ", clusterer.get_params())
    fitted = clusterer.fit(training_vectors)
    print("Training done! Switch to testing.")

    print("Start prediction...")
    # Same call order as before: training first, then clean, then anomalous.
    result_training, result_clean, result_anomalous = [
        __dbscan_predict(fitted, vectors)
        for vectors in (training_vectors, clean_vectors, anomalous_vectors)
    ]
    print("Predicting successful!")
    print("**************************")

    return result_clean, result_anomalous, result_training
def affichage_PCA_leur_DBSCAN():
    """Cluster the matrix in ``matrice.txt`` with DBSCAN and plot it on two PCA axes.

    The space-delimited file is loaded, projected onto two principal
    components for display, and clustered with DBSCAN (eps=40, min_samples=23,
    metric='precomputed') fitted on the loaded matrix itself. Only points
    labelled 0 to 3 are drawn, one colour per label; points with any other
    label are not plotted. Intermediate results are printed along the way.
    """
    matrix = np.genfromtxt("matrice.txt", delimiter=" ")
    print(matrix)

    projected = PCA(n_components=2).fit_transform(matrix)
    print(projected)

    clusterer = DBSCAN(eps=40, min_samples=23, metric='precomputed', algorithm='auto')
    print(clusterer)
    clusterer.fit(matrix)
    print(clusterer.labels_)
    print(clusterer.get_params(deep=True))

    plt.figure(figsize=(4, 4))
    # One colour per cluster label, in label order 0..3.
    for label, color in enumerate(['g', 'b', 'r', 'y']):
        members = np.where(clusterer.labels_ == label)
        plt.plot(
            projected[members, 0],
            projected[members, 1],
            c=color,
            marker='o',
            markerfacecolor=color,
            markeredgecolor='k',
        )

    plt.xlabel("PC1")
    plt.ylabel("PC2")
    plt.title(
        "Représentation d'un clustering \n obtenu par l'algorithme DBSCAN")
    plt.text(-350, 350, r'$\epsilon=40,\ \ minPts=23$')
    plt.show()
class DBSCANCluster(Intent):
    """Intent that clusters rows with DBSCAN and one-hot encodes the labels."""

    def __init__(self, eps: float) -> None:
        """Create the underlying DBSCAN estimator with the given ``eps``."""
        super().__init__()
        self.dbscan = DBSCAN(eps=eps)

    def to_string(self) -> str:
        """Return the identifier used as prefix for the output column names."""
        return 'Cluster:DBSCAN'

    def compute(self, df: pd.DataFrame) -> pd.DataFrame:
        """Cluster the complete rows of ``df`` and return indicator columns.

        Rows containing missing values are left out of the fit and are
        reported under the label ``'NaN'``. The remaining rows are min-max
        scaled and clustered with DBSCAN.

        :param df: input frame; every column is used as a clustering feature
        :return: frame indexed like ``df`` with one integer 0/1 column per
            distinct label, named ``'Cluster:DBSCAN:<label>'``
        """
        nan_dropped = df.dropna()
        scaled = preprocessing.MinMaxScaler().fit_transform(nan_dropped.values)
        self.dbscan.fit(scaled)

        # astype(str) yields the same string labels as the former
        # applymap(str), which is deprecated in current pandas releases.
        labels = pd.DataFrame(data=self.dbscan.labels_,
                              index=nan_dropped.index).astype(str)
        # Re-insert the rows that were dropped, marked with the 'NaN' label.
        inc_nan = labels.reindex(index=df.index, fill_value='NaN')
        label_column = inc_nan.iloc[:, 0]

        prefix = self.to_string() + ":"
        indicator_frames = [
            pd.DataFrame(
                data=(label_column == value).astype('int').values,
                columns=[prefix + value],
                index=df.index,
                dtype=int)
            for value in label_column.unique()
        ]
        return pd.concat(indicator_frames, axis='columns')

    def info(self) -> Optional[Dict[str, Any]]:
        """Return the DBSCAN estimator parameters under the ``"params"`` key."""
        return {"params": self.dbscan.get_params()}
def perform_dbscan(self):
    """Run DBSCAN on ``self.distance_matrix`` and store a summary of the result.

    The estimator is built from ``self.dbscan_params`` with
    ``metric="precomputed"``. The summary is saved in ``self.dbscan_results``
    and handed to ``print_dict``. It holds the estimator parameters, the
    labels, the cluster count (largest label value plus one) and the output
    of ``label_cnt_dict`` for the labels.
    """
    clusterer = DBSCAN(**self.dbscan_params, metric="precomputed")
    clusterer.fit(self.distance_matrix)

    summary = {}
    summary["parameters"] = clusterer.get_params()
    labels = clusterer.labels_
    summary["labels"] = labels
    highest_label = np.unique(labels).max()
    summary["n_clusters"] = highest_label + 1
    summary['clusters'] = label_cnt_dict(labels)

    self.dbscan_results = summary
    print_dict(self.dbscan_results)
def perform_dbscan(self):
    '''
    Run DBSCAN on ``self.distance_matrix`` and store a summary of the result
    in ``self.dbscan_results`` (parameters, labels, cluster count, and the
    output of ``label_cnt_dict``); the summary is also passed to ``print_dict``.

    ``self.hdf5_file`` is handed to ``fit`` through the ``hdf5_file`` keyword.
    NOTE(review): presumably ``DBSCAN`` is a project variant that accepts this
    keyword -- confirm against the import.

    TODO : use ELKI's DBSCAN algorithm instead of scikit learns algorithm
    Reference : https://stackoverflow.com/questions/16381577/scikit-learn-dbscan-memory-usage
    '''
    clusterer = DBSCAN(**self.dbscan_params, metric="precomputed")
    clusterer.fit(self.distance_matrix, hdf5_file=self.hdf5_file)

    summary = {}
    summary["parameters"] = clusterer.get_params()
    labels = clusterer.labels_
    summary["labels"] = labels
    # Cluster count is taken as the largest label value plus one.
    highest_label = np.unique(labels).max()
    summary["n_clusters"] = highest_label + 1
    summary['clusters'] = label_cnt_dict(labels)

    self.dbscan_results = summary
    print_dict(self.dbscan_results)
def dbscan(data, use_csi=True, epsq=None, max_cl_number=None, **kwargs):
    """
    Finds clusters of users in data using DBSCAN

    Parameters
    ----------
    data: pd.DataFrame
        Dataframe with features for clustering indexed by users (sessions)
    use_csi: bool, optional
        If ``True``, then cluster stability index will be calculated.
        IMPORTANT: it may take a lot of time. Default: ``True``
    epsq: float, optional
        Quantile of nearest neighbor positive distance between dots, its
        value will be an eps. If ``None``, then eps from keywords will be
        used. Default: ``None``
    max_cl_number: int, optional
        Maximal number of clusters for aggregation of small clusters.
        Default: ``None``
    kwargs: optional
        Keyword arguments. The ones whose names are DBSCAN constructor
        parameters are passed to DBSCAN; the complete set is forwarded to
        ``cluster_stability_index`` when ``use_csi`` is ``True``.

    Returns
    -------
    cl
        Cluster labels from ``fit_predict``, passed through ``aggregate_cl``
        when ``max_cl_number`` is given.
    metrics
        Result of ``calc_all_metrics``, with a ``'csi'`` entry added when
        ``use_csi`` is ``True``.
    """
    # get_params is an instance method, so ask a default-constructed estimator
    # for the parameter names instead of passing the class object as ``self``.
    dbscan_param_names = set(DBSCAN().get_params())
    kmargs = {
        name: value
        for name, value in kwargs.items()
        if name in dbscan_param_names
    }
    if epsq is not None:
        # An explicit quantile overrides any ``eps`` given through kwargs.
        kmargs['eps'] = find_best_eps(data, epsq)

    km = DBSCAN(**kmargs)
    cl = km.fit_predict(data.values)
    metrics = calc_all_metrics(data, km)

    if use_csi:
        # The one-hot membership frame is only consumed by the stability index.
        bs = pd.get_dummies(cl)
        bs.index = data.index
        metrics['csi'] = cluster_stability_index(data, km, bs, **kwargs)

    if max_cl_number is not None:
        cl = aggregate_cl(cl, max_cl_number)
    return cl, metrics
class sklearn_DB(FitClusterBase):
    """
    Thin wrapper around scikit-learn's DBSCAN.

    https://scikit-learn.org/stable/modules/generated/sklearn.cluster.DBSCAN.html

    eps : float, optional
        Largest distance at which one sample still counts as lying in the
        neighborhood of another. It does not bound the distances between
        points of the same cluster. This is the most important DBSCAN
        parameter to tune for the dataset and distance function in use.

    min_samples : int, optional
        Number of samples (or total weight) a neighborhood must contain for
        a point to be treated as a core point; the point itself is counted.

    metric : string, or callable
        Metric used to compute distances between instances of a feature
        array. A string or callable must be one of the options accepted by
        sklearn.metrics.pairwise_distances for its metric parameter. With
        "precomputed", X is taken to be a square distance matrix; X may be
        sparse, in which case only "nonzero" elements may be considered
        neighbors for DBSCAN.
    """
    _pairwise = True

    def __init__(self, eps=None, min_samples=None, metrictype='precomputed'):
        """Store the DBSCAN settings; no estimator is created until ``fit``."""
        super().__init__()
        self.name = 'DBSCAN'
        # Distance metric, e.g. 'euclidean'.
        self.metric = metrictype
        self.eps = eps
        # Samples required in a neighborhood for a point to be a core point.
        self.min_samples = min_samples
        # Holds the fitted estimator once ``fit`` has run.
        self.db = None

    def fit(self, dmatrix, rho=None):
        """Fit DBSCAN on ``dmatrix`` and return the labels; ``rho`` is not used."""
        estimator = DBSCAN(
            eps=self.eps,
            min_samples=self.min_samples,
            metric=self.metric,
        )
        self.db = estimator.fit(dmatrix)
        return self.db.labels_

    def pack(self):
        """Return the parameters of the estimator stored by ``fit``."""
        fitted = self.db
        return fitted.get_params()
plt.cm.Spectral(each) for each in np.linspace(0, 1, len(unique_labels)) ] plt.subplot(211) for k, col in zip(unique_labels, colors): if k == -1: # Black used for noise. col = [0, 0, 0, 1] class_member_mask = (labels == k) xy = X[class_member_mask & core_samples_mask] plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col), markeredgecolor='k', markersize=14) xy = X[class_member_mask & ~core_samples_mask] plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col), markeredgecolor='k', markersize=6) plt.title('Estimated number of clusters: %d' % n_clusters_) plt.subplot(212) plt.scatter(X[:, 0], X[:, 1], s=1) print(db.get_params()) plt.show()
label="outlier", marker='1') plt.legend(prop={'size': 15}) plt.title("Hasil Pengelompokkan") plt.show() kluster["Cluster"] = kluster["Cluster"].astype("str") px.scatter(data_frame=kluster, x="Income", y="SpendScore", color="Cluster", template='plotly_dark') label_1 = klustering.labels_ params = klustering.get_params() print("Dengan parameter") print("eps\t\t\t\t:", params["eps"]) print("min_samples\t\t\t:", params["min_samples"]) print("Nilai silhoutte yang didapatkan\t:", silhouette_score(x, label_1)) """# Mencari eps dan minPts terbaik""" ls_eps = np.arange(8, 15.25, 0.25) ls_min_samples = np.arange(3, 11) dbscan_params = [(x, y) for x in ls_eps for y in ls_min_samples] dbscan_params jumlah_cluster = [] sil_score = []
def do_clustering(target_csv, cluster_method):
    """Cluster the rows of a CSV file, print quality scores and save the labels.

    The file ``<target_csv>.csv`` is read from ``CONFIG.CSV_PATH``, clustered
    with the selected algorithm, and the labels are written back to the same
    directory as ``clustered_<method>_<target_csv>.csv``.

    :param target_csv: CSV base name, without the ``.csv`` extension
    :param cluster_method: algorithm selector --
        0 DBSCAN, 1 OPTICS, 2 agglomerative (default linkage),
        3 agglomerative (cosine, complete linkage),
        4 agglomerative (cosine, single linkage),
        5 Birch, 6 KMeans, 7 spectral clustering
    :raises ValueError: if ``cluster_method`` is not one of the values above
    """
    num_cluster = 24

    # Selector -> (tag used in the output file name, estimator factory).
    # Factories postpone construction so only the chosen estimator is built.
    # NOTE(review): newer scikit-learn releases rename AgglomerativeClustering's
    # ``affinity`` argument to ``metric`` -- confirm against the pinned version.
    methods = {
        0: ('dbscan', lambda: DBSCAN(eps=0.3, min_samples=1000)),
        1: ('optics', lambda: OPTICS(min_samples=1000, metric='cosine')),
        2: ('ward', lambda: AgglomerativeClustering(n_clusters=num_cluster)),
        3: ('agglo_complete', lambda: AgglomerativeClustering(
            affinity='cosine', linkage='complete', n_clusters=num_cluster)),
        4: ('agglo_single', lambda: AgglomerativeClustering(
            affinity='cosine', linkage='single', n_clusters=num_cluster)),
        5: ('birch', lambda: Birch(n_clusters=num_cluster)),
        6: ('kmeans', lambda: KMeans(n_clusters=num_cluster)),
        7: ('spectral', lambda: SpectralClustering(
            n_clusters=num_cluster, random_state=42,
            assign_labels='discretize')),
    }

    # Reject an unknown selector before any I/O; previously this surfaced
    # later as an unbound-variable error after the CSV had been loaded.
    if cluster_method not in methods:
        raise ValueError(
            "unknown cluster_method %r; expected one of %s"
            % (cluster_method, sorted(methods)))
    method_tag, build_estimator = methods[cluster_method]

    df_data = pd.read_csv(os.path.join(CONFIG.CSV_PATH, target_csv + '.csv'),
                          index_col=0, header=0, encoding='utf-8-sig')
    df_data.index.name = 'short_code'
    print(df_data.iloc[:100])
    print(df_data.shape)

    start_time = time.time()
    clustering = build_estimator()
    clustering.fit(df_data)
    csv_name = 'clustered_' + method_tag + '_' + target_csv + '.csv'
    print("time elapsed for clustering: " + str(time.time() - start_time))

    print(clustering.get_params())
    print(clustering.labels_)
    count_percentage(clustering.labels_)
    result_df = pd.DataFrame(data=clustering.labels_, index=df_data.index,
                             columns=['cluster'])

    start_time = time.time()
    labels = result_df['cluster'].squeeze()
    print("calinski_harabasz_score: ",
          calinski_harabasz_score(df_data, labels))
    print("silhouette_score: ", silhouette_score(df_data, labels))
    print("davies_bouldin_score: ", davies_bouldin_score(df_data, labels))
    print("time elapsed for scoring: " + str(time.time() - start_time))

    result_df.to_csv(os.path.join(CONFIG.CSV_PATH, csv_name),
                     encoding='utf-8-sig')
score = silhouette_score(data, model.fit_predict(data)) #on conserve celui qui a le meilleur score de Silhouette if score > best_score: best_model = model best_score = score return best_model #va contenir les scores pour chaque valeur de K et pour chaque modèle : KMeans (col 0), GaussianMixture(col 1), #CAH (col 2) et DBSCAN (col 3) K_silhouette_scores = np.zeros((19, 4)) #Modèle optimal pour DBSCAN eps = np.arange(0.1, 2, 0.1) min_samples = range(3, 20) Dbscan = best_model_Dbscan(Z, eps, min_samples) print(Dbscan.get_params()) clusters = Dbscan.fit_predict(Z) print("Nombre de clusters pour DBSCAN : ", clusters) #Calcul du score de Silhouette pour DBSCAN K_silhouette_scores[:, 3] = silhouette_score(Z, clusters) M = hierarchy.linkage(Z, method='average', metric='euclidean') #Clustering pour différents nombres de classe #Matrice pour contenir les inerties intra-classe pour chaque valeur de K et chaque modèle inertia = [] for K in range(2, 21): K_model_scores = [] #Différents modèles models = { 'KMeans': KMeans(n_clusters=K, random_state=0), 'Gaussian':
# Label every sample with the dpgmm model and time the fit.
dpgmm_labels = dpgmm.fit_predict(Ysc)
toc = time()
dpgmm_posterior = dpgmm.predict_proba(Ysc)
dpgmm.get_params()  # return value is not used
# Parenthesised single-argument print works on both Python 2 and Python 3.
print("elapsed time: %.4f sec" % (toc - tic))

# Turn the label vector into a grey-level image and display it.
I_label = np.reshape(dpgmm_labels, (num_rows, num_cols, 1))
# Scale to 0..255; the floor of 1 avoids 0/0 when every label is 0.
I_seg = I_label * 255.0 / max(np.max(I_label), 1)
I_seg = I_seg.astype(np.uint8)  # cast
I_seg_bgr = cv2.cvtColor(I_seg, cv2.COLOR_GRAY2BGR)
cv2.imshow('DPGMM seg', I_seg_bgr)
#cv2.imwrite('./figures/dpgmm_seg.png', I_seg_bgr)

# DB-SCAN
print("running DBSCAN...")
dbscan = DBSCAN(eps=0.5, min_samples=50, metric='euclidean', algorithm='auto')
tic = time()
dbscan_labels = dbscan.fit_predict(Ysc)
toc = time()
print("elapsed time: %.4f sec" % (toc - tic))
dbscan.get_params()  # return value is not used

# Noise samples are labeled -1; shift so that every label is non-negative.
dbscan_labels = dbscan_labels + 1

I2_label = np.reshape(dbscan_labels, (num_rows, num_cols, 1))
# Scale to 0..255; the floor of 1 avoids 0/0 when every sample is noise.
I2_seg = I2_label * 255.0 / max(np.max(I2_label), 1)
I2_seg = I2_seg.astype(np.uint8)  # cast
I2_seg_bgr = cv2.cvtColor(I2_seg, cv2.COLOR_GRAY2BGR)
# Separate window title, so this image does not replace the one shown in
# the 'DPGMM seg' window above.
cv2.imshow('DBSCAN seg', I2_seg_bgr)
#cv2.imwrite('./figures/dbscan_seg.png', I2_seg_bgr)
dpgmm.get_params()  # return value is not used
# Parenthesised single-argument print works on both Python 2 and Python 3.
print("elapsed time: %.4f sec" % (toc - tic))

# Turn the label vector into a grey-level image and display it.
I_label = np.reshape(dpgmm_labels, (num_rows, num_cols, 1))
# Scale to 0..255; the floor of 1 avoids 0/0 when every label is 0.
I_seg = I_label * 255.0 / max(np.max(I_label), 1)
I_seg = I_seg.astype(np.uint8)  # cast
I_seg_bgr = cv2.cvtColor(I_seg, cv2.COLOR_GRAY2BGR)
cv2.imshow('DPGMM seg', I_seg_bgr)
#cv2.imwrite('./figures/dpgmm_seg.png', I_seg_bgr)

# DB-SCAN
print("running DBSCAN...")
dbscan = DBSCAN(eps=0.5, min_samples=50, metric='euclidean', algorithm='auto')
tic = time()
dbscan_labels = dbscan.fit_predict(Ysc)
toc = time()
print("elapsed time: %.4f sec" % (toc - tic))
dbscan.get_params()  # return value is not used

# Noise samples are labeled -1; shift so that every label is non-negative.
dbscan_labels = dbscan_labels + 1

I2_label = np.reshape(dbscan_labels, (num_rows, num_cols, 1))
# Scale to 0..255; the floor of 1 avoids 0/0 when every sample is noise.
I2_seg = I2_label * 255.0 / max(np.max(I2_label), 1)
I2_seg = I2_seg.astype(np.uint8)  # cast
I2_seg_bgr = cv2.cvtColor(I2_seg, cv2.COLOR_GRAY2BGR)
# Separate window title, so this image does not replace the one shown in
# the 'DPGMM seg' window above.
cv2.imshow('DBSCAN seg', I2_seg_bgr)
#cv2.imwrite('./figures/dbscan_seg.png', I2_seg_bgr)