def clustering(the_image_autoencoded, the_image_shape, number_of_clusters, extra_parameters=""):
    """Cluster autoencoded pixels with OPTICS and return a 2-D label image.

    OPTICS is fitted once on ``the_image_autoencoded``. Flat labels are then
    extracted with a DBSCAN-style cut of the reachability graph at
    ``eps=0.5`` and laid out row by row as an image.

    Parameters
    ----------
    the_image_autoencoded : array-like
        Samples handed to ``OPTICS.fit``. The label of pixel ``(y, x)`` is
        read from sample ``y * the_image_shape[1] + x``.
    the_image_shape : sequence
        Image shape; only the first two entries (rows, columns) are used.
    number_of_clusters : object
        Unused; kept so existing callers keep working.
    extra_parameters : str, optional
        Unused; kept so existing callers keep working.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(rows, columns)`` with the cluster label of
        every pixel (``-1`` is the noise label produced by
        ``cluster_optics_dbscan``).

    Raises
    ------
    IndexError
        If fewer than ``rows * columns`` labels are available.
    """
    print()
    print("*** OPTICS clustering ***")
    print("---------------------------------")
    # https://scikit-learn.org/stable/modules/clustering.html
    # https://scikit-learn.org/stable/auto_examples/cluster/plot_optics.html#sphx-glr-auto-examples-cluster-plot-optics-py
    # https://scikit-learn.org/stable/modules/clustering.html#optics

    print("Image shape: ", the_image_shape)
    print("OPTICS clustering")
    clust = OPTICS(min_samples=10, xi=.0005, min_cluster_size=.005)
    print("Running fit function for OPTICS clustering")
    clust.fit(the_image_autoencoded)

    # Only the eps=0.5 cut is used for the output image, so it is the only
    # one extracted.
    labels_050 = cluster_optics_dbscan(reachability=clust.reachability_,
                                       core_distances=clust.core_distances_,
                                       ordering=clust.ordering_, eps=0.5)

    print("---------------------------")
    reachability = clust.reachability_[clust.ordering_]
    print("Reachability: ", reachability)
    print("---------------------------")

    print("Creating list for clustered data")
    rows, cols = the_image_shape[0], the_image_shape[1]
    n_pixels = rows * cols
    if len(labels_050) < n_pixels:
        raise IndexError(
            "expected at least %d labels for a %dx%d image, got %d"
            % (n_pixels, rows, cols, len(labels_050)))
    # Row-major reshape: pixel (y, x) takes label y * cols + x. Surplus
    # labels beyond rows * cols are ignored.
    clustered_data = np.asarray(labels_050[:n_pixels], dtype=float).reshape(rows, cols)
    print("Clustered data shape: ", np.shape(clustered_data))
    return clustered_data
def optics_fit_predict(X, min_samples=50, cluster_method='dbscan', eps=2):
    """Fit OPTICS on ``X`` and return one cluster label per sample.

    Parameters
    ----------
    X : array, shape (n_samples, n_features), or (n_samples, n_samples)
        Data passed to ``OPTICS.fit``.
    min_samples : int or float
        The number of samples in a neighborhood for a point to be
        considered as a core point.
    cluster_method : {'dbscan', 'xi'}
        Extraction method. With ``'dbscan'`` (default) the reachability
        graph is cut at ``eps``. With ``'xi'`` the labels computed by OPTICS
        itself are returned and ``eps`` is not used.
    eps : float
        The maximum distance between two samples for one to be considered
        as in the neighborhood of the other. Only used when
        ``cluster_method`` is ``'dbscan'``.

    Returns
    -------
    labels : ndarray of shape (n_samples,)
        Cluster label of each sample.
    """
    cluster_method = str(cluster_method)
    opt = OPTICS(min_samples=min_samples, cluster_method=cluster_method)
    opt.fit(X)

    if cluster_method == 'xi':
        # The xi labels are already computed during fit; cutting at eps here
        # would silently replace them with a DBSCAN-style clustering.
        return opt.labels_

    return cluster_optics_dbscan(reachability=opt.reachability_,
                                 core_distances=opt.core_distances_,
                                 ordering=opt.ordering_,
                                 eps=eps)
def _extract_best_optics(self, clusterer):
    """Return the DBSCAN-style cut of a fitted OPTICS model with the best silhouette.

    Epsilon is swept over ``arange(0.01, 0.5, 0.01)``. Cuts that produce a
    single label or one label per data point are skipped, because they
    cannot be scored. If no cut can be scored, the labels from the last
    epsilon tried are used. The chosen labels are passed through
    ``self._process_noise_as_singletons`` before being returned.
    """
    top_score = -inf
    top_labels = None

    # Sweep epsilon to find the best cut
    for cut in arange(0.01, 0.5, 0.01):
        candidate = cluster_optics_dbscan(
            reachability=clusterer.reachability_,
            core_distances=clusterer.core_distances_,
            ordering=clusterer.ordering_,
            eps=cut)

        n_labels = len(unique(candidate))
        if n_labels == 1 or n_labels == len(self.data):
            continue

        candidate_score = silhouette_score(X=self.data,
                                           labels=candidate,
                                           metric=self.distance_metric,
                                           random_state=13712)
        if candidate_score > top_score:
            top_score, top_labels = candidate_score, candidate

    if top_labels is None:
        # Every cut gave either one cluster or n clusters
        return self._process_noise_as_singletons(candidate)
    return self._process_noise_as_singletons(top_labels)
def UseDBScan():
    """Cluster one subscriber's location records with DBSCAN and OPTICS.

    Reads ``imsi`` from the submitted form, loads that subscriber's
    ``(starttime, longitude, latitude)`` rows ordered by time, and clusters
    the coordinates twice using a haversine distance in metres: with DBSCAN
    (``eps=500``) and with an OPTICS reachability graph cut at ``eps=300``.

    Returns
    -------
    A JSON response mapping the row index to a record with the keys
    ``time``, ``longitude``, ``latitude``, ``dbscan`` and ``optics``. An
    empty JSON object is returned when the query yields no rows.
    """
    db = get_db()
    cursor = db.cursor()
    sql = "Select starttime,longitude,latitude from userdata Where imsi = %s order by starttime"
    cursor.execute(sql, (request.form['imsi'], ))
    results = cursor.fetchall()

    if not results:
        # An empty result set becomes a 1-D array, so the column slice below
        # would raise; there is nothing to cluster anyway.
        return jsonify({})

    data = np.array(results)

    def distance(p1, p2):
        """Haversine distance in metres between two (longitude, latitude) points given in degrees."""
        # Convert longitude/latitude from degrees to radians.
        lng1, lat1, lng2, lat2 = map(
            radians,
            [float(p1[0]), float(p1[1]), float(p2[0]), float(p2[1])])
        dlon = lng2 - lng1
        dlat = lat2 - lat1
        a = sin(dlat / 2)**2 + cos(lat1) * cos(lat2) * sin(dlon / 2)**2
        # NOTE: the radius used is 6378.137 km; the original comment
        # mentioned the 6371 km mean radius, which is not what the code uses.
        return 2 * asin(sqrt(a)) * 6378.137 * 1000

    coords = data[:, 1:3]  # longitude and latitude columns
    dbscan_cluster = DBSCAN(eps=500, min_samples=5, metric=distance).fit(coords)
    optics_cluster = OPTICS(min_samples=5,
                            cluster_method='dbscan',
                            metric=distance).fit(coords)
    print(optics_cluster.reachability_)
    optics_label = cluster_optics_dbscan(
        reachability=optics_cluster.reachability_,
        core_distances=optics_cluster.core_distances_,
        ordering=optics_cluster.ordering_,
        eps=300)
    print(optics_label)

    # Append both label columns to the raw rows, then build index -> record.
    rows = np.c_[data, dbscan_cluster.labels_, optics_label].tolist()
    array = {
        index: {
            'time': item[0],
            'longitude': item[1],
            'latitude': item[2],
            'dbscan': item[3],
            'optics': item[4],
        }
        for index, item in enumerate(rows)
    }
    return jsonify(array)
def get_dbscan_and_reachability_figs(hdf_filename, mds_hdf_key, optics_hdf_key, first_dim, second_dim, cutoff):
    """Build the DBSCAN scatter figure and the OPTICS reachability figure.

    Parameters
    ----------
    hdf_filename : str
        HDF5 store opened read-only.
    mds_hdf_key : str
        Store key of the embedding table.
    optics_hdf_key : str
        Store key of the OPTICS table. The columns ``labels``,
        ``reachability``, ``core_distances`` and ``ordering`` are read.
    first_dim, second_dim :
        The two embedding columns to plot. They are tried as column labels
        first and, on ``KeyError``, as integer positions.
    cutoff : float or None
        Epsilon of the DBSCAN-style cut. With ``None`` no cut is drawn and
        the DBSCAN figure is left empty.

    Returns
    -------
    (dbscan_fig, reach_fig)
    """
    def _axis_title(dim):
        # Keep the integer rendering for position-like dims, but do not
        # fail for dims that are column labels.
        try:
            return f"Component {int(dim)}"
        except (TypeError, ValueError):
            return f"Component {dim}"

    with pd.HDFStore(hdf_filename, 'r') as store:
        mds = store[mds_hdf_key]
        try:
            mds = mds[[first_dim, second_dim]]
        except KeyError:
            mds = mds.iloc[:, [int(first_dim), int(second_dim)]]
        optics = store[optics_hdf_key]

    df = pd.merge(mds, optics, left_index=True, right_index=True)

    labels = df.labels[optics.ordering]
    names = df.index[optics.ordering]
    space = np.arange(len(df.index))
    reachability = df.reachability[optics.ordering]

    reach_fig = px.scatter(x=space, y=reachability, color=labels,
                           hover_name=names,
                           range_x=[min(space), max(space)+1])
    dbscan_fig = go.Figure()

    if cutoff is not None:
        reach_fig.add_annotation(
            x=1,
            y=cutoff+.05,
            text="DBSCAN cutoff",
            xref="paper",
            showarrow=False,
            font_size=12
        )
        reach_fig.add_shape(
            type="line",
            xref='paper',
            x0=0,
            y0=cutoff,
            x1=1,
            y1=cutoff,
            line=dict(color="RoyalBlue", width=3)
        )

        # `mds` was reduced to exactly the two requested columns before the
        # merge, so they are the first two columns of `df`. Indexing `df`
        # with the original dim numbers would pick the wrong column (or an
        # OPTICS column) for anything other than dims 0 and 1.
        x = df.iloc[:, 0]
        y = df.iloc[:, 1]

        labels_db = cluster_optics_dbscan(reachability=optics.reachability,
                                          core_distances=optics.core_distances,
                                          ordering=optics.ordering,
                                          eps=cutoff)
        labels_db_text = [f"cluster {label}" for label in labels_db]

        dbscan_fig = px.scatter(
            x=x, y=y,
            color=labels_db_text,
            hover_name=df.index,
            labels={
                "x": _axis_title(first_dim),
                "y": _axis_title(second_dim),
                "color": "Clusters"
            },
            title=f"DBSCAN clustering for epsilon {cutoff}"
        )

    return dbscan_fig, reach_fig
def OPTICS_Clustering(X):
    """Preprocess ``X``, fit OPTICS and return its labels.

    The returned labels are the ones OPTICS computes during ``fit`` with
    the parameters given here (``xi=.05``, ``min_cluster_size=.05``,
    ``min_samples=100``).

    The previous version also extracted a DBSCAN-style cut at ``eps=2`` and
    immediately overwrote it with ``cluster.labels_``. That dead computation
    is removed, and the returned value is unchanged.
    """
    X = preprocess(X)
    cluster = OPTICS(min_samples=100, xi=.05, min_cluster_size=.05)
    cluster.fit(X)
    return cluster.labels_
def mskNoise(eps_v):
    """Return the rows of ``data`` that are not noise at the cut ``eps_v``.

    Uses the module-level ``model_OPTIC`` and ``data``. Labels are extracted
    from the OPTICS model for this epsilon, and rows labelled ``-1`` (noise)
    are dropped.
    """
    # Labels assigned to each point for this eps value
    eps_labels = skclust.cluster_optics_dbscan(
        eps=eps_v,
        ordering=model_OPTIC.ordering_,
        core_distances=model_OPTIC.core_distances_,
        reachability=model_OPTIC.reachability_)

    # Keep only the points that were assigned to a cluster
    keep = eps_labels != -1
    return data[keep]
def cluster_optics(self, similarity_matrix):
    """Cluster the rows of ``similarity_matrix`` with OPTICS.

    Returns
    -------
    numpy.ndarray
        One label per input row, in input-row order, shifted by +1 (so the
        ``-1`` noise label becomes ``0`` and clusters start at ``1``).

    Notes
    -----
    The previous version returned ``clust.labels_[clust.ordering_]``, i.e.
    the labels permuted into OPTICS' visiting order. The ordering is not
    returned, so the caller could not map those labels back to rows. The
    labels are now returned aligned with the input. An unused
    ``cluster_optics_dbscan`` extraction was removed as well.

    NOTE(review): OPTICS is created without ``metric='precomputed'``, so
    each row of the matrix is treated as a feature vector. Confirm that this
    is the intended use of a similarity matrix.
    """
    print('Clustering with optics.')
    clust = OPTICS(min_samples=2, xi=0.005)
    clust.fit(similarity_matrix)
    return np.add(clust.labels_, 1)
def get_dbscan_and_reachability_figs(hdf_filename, pcoa_hdf_key, optics_hdf_key, first_pc, second_pc, cutoff):
    """Build the DBSCAN scatter figure and the OPTICS reachability figure.

    Two columns (``first_pc``, ``second_pc``) are read from the table stored
    under ``pcoa_hdf_key`` and joined on the index with the OPTICS table
    stored under ``optics_hdf_key``. The reachability figure is always
    built. When ``cutoff`` is not ``None`` a cut line is drawn on it and a
    scatter of the two components, coloured by the DBSCAN-style labels at
    that epsilon, is returned; otherwise the DBSCAN figure is empty.

    Returns ``(dbscan_fig, reach_fig)``.
    """
    with pd.HDFStore(hdf_filename, 'r') as store:
        pcoa = store[pcoa_hdf_key][[first_pc, second_pc]]
        optics = store[optics_hdf_key]

    df = pd.merge(pcoa, optics, left_index=True, right_index=True)

    # Everything on the reachability plot is shown in OPTICS visiting order.
    labels = df.labels[optics.ordering]
    names = df.index[optics.ordering]
    space = np.arange(len(df.index))
    reachability = df.reachability[optics.ordering]

    reach_fig = px.scatter(
        x=space,
        y=reachability,
        color=labels,
        hover_name=names,
        range_x=[min(space), max(space) + 1],
    )

    if cutoff is None:
        # No cut requested: hand back an empty DBSCAN figure.
        return go.Figure(), reach_fig

    reach_fig.add_annotation(
        x=1, y=cutoff + .05, text="DBSCAN cutoff",
        xref="paper", showarrow=False, font_size=12,
    )
    reach_fig.add_shape(
        type="line", xref='paper',
        x0=0, y0=cutoff, x1=1, y1=cutoff,
        line=dict(color="RoyalBlue", width=3),
    )

    cut_labels = cluster_optics_dbscan(
        reachability=optics.reachability,
        core_distances=optics.core_distances,
        ordering=optics.ordering,
        eps=cutoff,
    )
    cut_label_text = [f"cluster {label}" for label in cut_labels]

    dbscan_fig = px.scatter(
        x=df[first_pc],
        y=df[second_pc],
        color=cut_label_text,
        hover_name=df.index,
    )
    return dbscan_fig, reach_fig
def cluster_mask_OPTICS(skel, show_image=False):
    """Fit OPTICS on the point set derived from ``skel`` and print its state.

    Work in progress: the reachability, core distances and ordering are
    printed, a DBSCAN-style cut at ``eps=2`` is computed but not used, and
    the function always returns ``None``. ``show_image`` is currently unused.
    """
    xs, ys = convert_mask_to_regression(skel)
    points = np.array([pair for pair in zip(xs, ys)])

    model = OPTICS(min_samples=50, xi=.05, min_cluster_size=.05)
    model.fit(points)

    for fitted_attr in (model.reachability_, model.core_distances_, model.ordering_):
        print(fitted_attr)

    # Result intentionally unused for now.
    cluster_optics_dbscan(
        eps=2,
        ordering=model.ordering_,
        core_distances=model.core_distances_,
        reachability=model.reachability_,
    )

    # Original author's note (translated from Russian): "haven't figured
    # this out yet".
    return None
def cluster(data, **kwargs):
    """
    Cluster ``data`` with OPTICS and score DBSCAN-style cuts over a range
    of epsilon values.

    OPTICS is fitted once; the reachability graph is then cut at every
    epsilon in ``np.arange(0, 2, 0.01)``.

    Parameters
    -----------
    data: array
        Samples passed to ``OPTICS.fit`` and to the score function.
        (The original docstring described it as an STFT array or a
        low-dimensional embedding from `embed()`; TODO confirm the expected
        shape against the callers.)
    **kwargs:
        Accepted for compatibility; currently unused.

    Returns
    -------
    res: array
        One value per epsilon: the Calinski-Harabasz score of the labels at
        that cut, or 0 when the cut yields at most one cluster (noise
        excluded from the count).
    nclust: array
        The sorted distinct numbers of clusters found over all cuts
        (float dtype).
    """
    clust = OPTICS(min_samples=20, xi=0.05, min_cluster_size=0.1, n_jobs=-1)
    clust.fit(data)

    # Plain lists avoid re-allocating an array on every iteration.
    cluster_counts = []
    scores = []
    for eps in np.arange(0, 2, step=0.01):
        labels = cluster_optics_dbscan(
            reachability=clust.reachability_,
            core_distances=clust.core_distances_,
            ordering=clust.ordering_,
            eps=eps,
        )
        # Noise (-1) does not count as a cluster.
        n_found = len(np.unique(labels[labels > -1]))
        cluster_counts.append(n_found)
        if n_found <= 1:
            scores.append(0)
        else:
            scores.append(calinski_harabasz_score(data, labels))

    res = np.array(scores, dtype=float)
    nclust = np.unique(np.array(cluster_counts, dtype=float))
    return res, nclust
def cluster(cluster_min_samples=5, cluster_eps=2.0 / RADIUS_EARTH_KM):
    """Cluster photo coordinates with OPTICS and store the clusters.

    Coordinates are loaded from the database, clustered with OPTICS using
    the haversine metric, and flat labels are extracted with a DBSCAN-style
    cut at ``cluster_eps``. Hull curves are computed per cluster and the
    result is written back to the database.

    Parameters
    ----------
    cluster_min_samples : int
        ``min_samples`` passed to OPTICS.
    cluster_eps : float
        Epsilon of the DBSCAN-style cut. The default is 2.0 divided by
        ``RADIUS_EARTH_KM`` (presumably 2 km expressed in radians; confirm
        the constant's unit).

    The database connection is closed even if a step fails.
    """
    print("Start clustering")
    conn = db_connect()
    try:
        # Keys are the coordinates that get clustered; values are looked up
        # again below to build the label map.
        coords_map = db_select_photos_coords(conn)
        coords_rad = list(coords_map.keys())

        clustering = OPTICS(min_samples=cluster_min_samples,
                            metric='haversine').fit(coords_rad)
        labels_dbscan = cluster_optics_dbscan(
            reachability=clustering.reachability_,
            core_distances=clustering.core_distances_,
            ordering=clustering.ordering_,
            eps=cluster_eps)

        map_coords_deg_cluster = {
            coords_map[coord_rad]: label
            for coord_rad, label in zip(coords_rad, labels_dbscan)
        }
        map_hull_points_idx = compute_hull_curves(map_coords_deg_cluster)
        db_create_clusters(conn, map_coords_deg_cluster, map_hull_points_idx)
    finally:
        conn.close()
    print("Finished clustering")
def _remodel_optics(model, target="xi", **kwargs):
    """Re-extract cluster labels from an already fitted OPTICS model.

    Parameters
    ----------
    model :
        Fitted OPTICS estimator; its ``reachability_``, ``ordering_`` and
        either ``predecessor_`` (xi) or ``core_distances_`` (dbscan) are read.
    target : str
        ``"xi"`` uses ``cluster_optics_xi``; any other value uses
        ``cluster_optics_dbscan``.
    **kwargs :
        For ``"xi"``: ``xi`` (default 0.03), ``min_cluster_size`` (default
        0.01) and ``min_samples`` (default 0.03). Otherwise: ``eps``
        (default 0.5).

    Returns
    -------
    The result of ``sort_labels`` applied to the extracted label array.
    """
    if target == "xi":
        xi = kwargs.get("xi", 0.03)
        min_cluster_size = kwargs.get("min_cluster_size", 0.01)
        min_samples = kwargs.get("min_samples", 0.03)
        # cluster_optics_xi returns (labels, clusters). Only the label array
        # is wanted; passing the whole tuple on to sort_labels was a bug.
        labels, _ = sk_cluster.cluster_optics_xi(
            min_samples=min_samples,
            min_cluster_size=min_cluster_size,
            xi=xi,
            reachability=model.reachability_,
            predecessor=model.predecessor_,
            ordering=model.ordering_)
    else:
        eps = kwargs.get("eps", 0.5)
        labels = sk_cluster.cluster_optics_dbscan(
            eps=eps,
            reachability=model.reachability_,
            core_distances=model.core_distances_,
            ordering=model.ordering_,
        )
    return sort_labels(labels)
def _report_metrics(logger, fa, title, results):
    """Log one metrics row and append it, under ``title``, to the open file ``fa``."""
    keys = ('pre', 'rec', 'f1', 'ap', 'pr_auc', 'roc_auc', 'h_pre', 'h_rec', 'h_f1')
    header = ' pre \t rec \t f1 \t ap \tpr_auc\troc_auc\t h_pre\t h_rec\t h_f1'
    row = '\t'.join('{:.4f}'.format(results[key]) for key in keys)

    logger.info(header)
    logger.info(row)
    fa.write(title + '\n')
    fa.write(header + ' \n')
    fa.write(row + '\n')


def test_dbscan(Dl, args, logger, deepFD, epoch):
    """Evaluate the embeddings as an outlier detector with OPTICS/DBSCAN.

    The embeddings of ``deepFD`` are saved, clustered once with OPTICS and
    turned into binary predictions (1 = noise point, 0 = clustered point).
    The predictions are scored with ``_eval`` for the OPTICS labels and for
    DBSCAN-style cuts at several epsilons. Each result is logged and
    appended to ``{args.out_path}/results.txt``.
    """
    logger.info('Testing with DBSCAN...')
    labels = getattr(Dl, Dl.ds+'_labels')
    features = get_embeddings(deepFD, Dl).cpu().numpy()
    save_embeddings(features, args.out_path, epoch)

    resultfile = f'{args.out_path}/results.txt'
    # The context manager closes the file even if an evaluation step raises.
    with open(resultfile, 'a') as fa:
        fa.write(f'====== Epoch {epoch} ======\n')

        # optics
        optics = OPTICS()
        optics.fit(features)
        # Build the 0/1 prediction as a new array instead of overwriting
        # optics.labels_ in place.
        raw = optics.labels_
        logists = (raw < 0).astype(raw.dtype)
        logger.info('evaluating with optics')
        results = _eval(labels, logists, logists)
        _report_metrics(logger, fa, 'OPTICS', results)

        # dbscan with different epsilon
        epsilons = [0.5, 2, 5, 10]
        for ep in epsilons:
            raw = cluster_optics_dbscan(reachability=optics.reachability_,
                                        core_distances=optics.core_distances_,
                                        ordering=optics.ordering_,
                                        eps=ep)
            logists = (raw < 0).astype(raw.dtype)
            logger.info(f'evaluating with dbscan at {ep}')
            results = _eval(labels, logists, logists)
            _report_metrics(logger, fa, f'DBSCAN at {ep}', results)
# Six Gaussian blobs of differing spread, drawn in the order C1..C6 so the
# random stream is consumed exactly as before.
C1, C2, C3, C4, C5, C6 = (
    centre + spread * np.random.randn(n_points_per_cluster, 2)
    for centre, spread in (
        ([-5, -2], .8),
        ([4, -1], .1),
        ([1, -2], .2),
        ([-2, 3], .3),
        ([3, -2], 1.6),
        ([5, 6], 2),
    )
)
X = np.vstack([C1, C2, C3, C4, C5, C6])

# Fit OPTICS once; every labelling below reuses this single fit.
clust = OPTICS(min_samples=50, xi=.05, min_cluster_size=.05).fit(X)

# DBSCAN-style cuts of the reachability graph at eps = 0.5 and eps = 2
labels_050, labels_200 = (
    cluster_optics_dbscan(
        reachability=clust.reachability_,
        core_distances=clust.core_distances_,
        ordering=clust.ordering_,
        eps=cut,
    )
    for cut in (0.5, 2)
)

# Reachability and labels in OPTICS visiting order, for the reachability plot
space = np.arange(len(X))
reachability = clust.reachability_[clust.ordering_]
labels = clust.labels_[clust.ordering_]

# One wide axis on top, smaller axes underneath
plt.figure(figsize=(10, 7))
G = gridspec.GridSpec(2, 3)
ax1 = plt.subplot(G[0, :])
ax2, ax3 = (plt.subplot(G[1, col]) for col in (0, 1))
def main():
    """Estimate membership probabilities for every input file.

    For each file returned by ``readFiles``, and for each ``min_samples``
    value in ``np.arange(min_samps, max_samps, step)``, the data is
    processed ``Nruns`` times (the first run on the original values, later
    runs on re-sampled values). Each run reduces the data with PCA, fits
    OPTICS, selects an epsilon and marks the points that are not noise at
    that epsilon. The per-run 0/1 flags are averaged per ``min_samples``
    value, then averaged over all values, and written to
    ``output/<name>_probs.<ext>``.

    Processing of a file stops at the first ``min_samples`` value for which
    OPTICS labels no point as an outlier.
    """
    min_samples_rng = np.arange(min_samps, max_samps, step)

    # Process all files in 'input_folder'
    files = readFiles()
    for file_path in files:

        # Extract required data from file
        data_all, data_id, data_c, data_err, msk_accpt = dataExtract(file_path)

        # This dictionary will hold all the runs
        probs_dict = {"ID": data_id}

        # For all the 'min_samples' values in 'min_samples_rng'
        no_outliers = False
        for min_samples in min_samples_rng:
            print("min_sample={}".format(min_samples))

            # For all the re-sample runs
            probs_all = []
            for run in range(Nruns):
                print(" Re-sample N={}".format(run))

                # Use non-resampled values in the first run
                if run == 0:
                    data_arr = np.array(
                        [data_c[col] for col in data_c.columns]).T
                else:
                    # Re-sample data
                    data_arr = reSampleData(data_c, data_err)

                # Apply PCA reduction
                print(" PCA dimension reduction...")
                data_pca = dimReduc(data_arr, PCAdims)

                # Obtain OPTICS model
                print(" OPTICS model...")
                model_OPTIC = runOPTICS(data_pca, min_samples)

                labels = model_OPTIC.labels_[model_OPTIC.ordering_]
                if (labels == -1).sum() == 0:
                    no_outliers = True
                    break

                # Auto eps selection
                print(" eps selection...")
                eps_final = findEps(data_pca, model_OPTIC, perc_cut)

                # DBSCAN labels
                print(" DBSCAN labels...")
                labels_dbs = skclust.cluster_optics_dbscan(
                    reachability=model_OPTIC.reachability_,
                    core_distances=model_OPTIC.core_distances_,
                    ordering=model_OPTIC.ordering_, eps=eps_final)
                msk_memb = labels_dbs != -1

                # `msk_memb` has one entry per accepted element, so it is
                # walked with its own counter `j` while `i` walks all
                # elements; rejected elements keep probability 0.
                probs = np.zeros(len(msk_accpt))
                j = 0
                for i, st_f in enumerate(msk_accpt):
                    if st_f:
                        if msk_memb[j]:
                            probs[i] = 1
                        j += 1
                probs_all.append(probs)

            if no_outliers is True:
                print("No more outliers. Breaking")
                break

            probs_dict[str(min_samples)] = np.round(np.mean(probs_all, 0), 3)

        # Estimate mean probabilities
        all_vals = [vals for k, vals in probs_dict.items() if k != 'ID']
        probs_mean = np.round(np.array(all_vals).mean(0), 3)

        # Write to file
        probs_dict['probs_mean'] = probs_mean
        # stem/suffix handle names with more than one dot, where unpacking
        # `split('.')` into two names raised ValueError.
        fname, fext = file_path.stem, file_path.suffix.lstrip('.')
        fout = 'output/' + fname + "_probs." + fext
        ascii.write(probs_dict, fout, overwrite=True)
# Gaussian blobs X2..X6, drawn in this order so the random stream is consumed
# exactly as before (X1 is defined above this point).
X2, X3, X4, X5, X6 = (
    centre + spread * np.random.randn(n_diem, 2)
    for centre, spread in (
        ([4, -1], .1),
        ([1, -2], .2),
        ([-2, 3], .3),
        ([3, -2], 1.6),
        ([5, 6], 2),
    )
)
# Stack all blobs vertically into one sample matrix
X = np.vstack([X1, X2, X3, X4, X5, X6])

# OPTICS with min_samples=50, xi=0.05 and min_cluster_size=0.05; run the fit
clust_optics = OPTICS(min_samples=50, xi=0.05, min_cluster_size=0.05).fit(X)

# DBSCAN-style cuts of the OPTICS result at eps = 0.5 and eps = 2
labels_050, labels_200 = (
    cluster_optics_dbscan(
        reachability=clust_optics.reachability_,
        core_distances=clust_optics.core_distances_,
        ordering=clust_optics.ordering_,
        eps=cut,
    )
    for cut in (0.5, 2)
)

# Positions 0..n-1 along the reachability plot (length of the data array)
space = np.arange(len(X))
reachability = clust_optics.reachability_[clust_optics.ordering_]
labels = clust_optics.labels_[clust_optics.ordering_]

# Figure size, then a 2-row by 3-column grid of axis slots
plt.figure(figsize=(10, 7))
G = gridspec.GridSpec(2, 3)
# ax1 spans all columns of row 0; ax2 and ax3 sit in row 1, columns 0 and 1
ax1 = plt.subplot(G[0, :])
ax2, ax3 = (plt.subplot(G[1, col]) for col in (0, 1))
""" import numpy as np from sklearn.cluster import OPTICS, cluster_optics_dbscan import matplotlib.pyplot as plt # Config data_file = 'results/00_TSNE/HANDS17_DPREN_ShapeSplit_val_normalized_DEFAULT_combined_50.npy' ######################################################################## data = np.load(data_file) epsilons = [2.0, 4.0] clust = OPTICS(min_samples=50, xi=.05, min_cluster_size=1000) clust.fit(data) for epsilon in epsilons: print('Starting clustering for epsilon {}'.format(epsilon)) labels = cluster_optics_dbscan(reachability=clust.reachability_, core_distances=clust.core_distances_, ordering=clust.ordering_, eps=epsilon) cluster_labels = set(labels) for i in cluster_labels: mask = labels == i cluster = data[mask] if i == -1: plt.scatter(cluster[:, 0], cluster[:, 1], s=1, alpha=1.0, label=i, color='black') else: plt.scatter(cluster[:, 0], cluster[:, 1], s=1, alpha=0.2, label=i) plt.show()
# Load the saved OPTICS result for this (sp, mins) configuration.
ff = np.load(dirr+'OPTICS_sp%d_smin%d.npz'%(sp, mins))
lon0 = ff['lon']
lat0 = ff['lat']
reachability = ff['reachability'] / 1000  # rescaled by 1/1000 (presumably m -> km; confirm)
ordering = ff['ordering']
predecessor = ff['predecessor']
core_distances = ff['core_distances']
#%% Create the clusters from the reachabilities, one labelling per option
# Each entry of `opts` is (method, value): ("xi", xi) uses the xi method,
# anything else is treated as a DBSCAN-style cut at eps = value.
labels = []
for op in opts:
    m, c = op[0], op[1]
    if m == "xi":
        # Keyword arguments: these parameters are keyword-only in current
        # scikit-learn releases, so the positional form raises TypeError.
        l, _ = cluster_optics_xi(reachability=reachability,
                                 predecessor=predecessor,
                                 ordering=ordering,
                                 min_samples=mins,
                                 xi=c)
    else:
        l = cluster_optics_dbscan(reachability=reachability,
                                  core_distances=core_distances,
                                  ordering=ordering, eps=c)
    labels.append(l)

# One discrete colour norm per labelling, with a bin for every integer label
# from -1 (noise) up to the largest label.
norms = []
for l in labels:
    bounds = np.arange(-.5,np.max(l)+1.5,1)
    norms.append(matplotlib.colors.BoundaryNorm(bounds, len(bounds)))
#%%
exte=[18, 360-70, -75, 0]; latlines=[-75,-50, -25, 0, 25, 50, 75, 100];

# Read Foram data
readData = '/Volumes/HD/network_clustering/'
data = nwf.readForamset(readData + 'ForamData.csv')
Foramspecies = nwf.readForamset(readData + 'ForamDataHeader.txt')[0][21:]
def graph_optics_neighborhoods(X):
    """Plot an OPTICS reachability plot and three clusterings of ``X``.

    OPTICS is fitted once. The figure shows the reachability plot (top),
    the OPTICS labels, and DBSCAN-style cuts at eps 0.5 and 2.0. Only the
    first few cluster ids of each panel have a colour assigned; higher ids
    are not drawn.
    """
    clust = OPTICS(min_samples=50, xi=.05, min_cluster_size=.05)
    clust.fit(X)

    def _cut(eps):
        # DBSCAN-style labels from the single OPTICS fit.
        return cluster_optics_dbscan(reachability=clust.reachability_,
                                     core_distances=clust.core_distances_,
                                     ordering=clust.ordering_, eps=eps)

    labels_050 = _cut(0.5)
    labels_200 = _cut(2)

    space = np.arange(len(X))
    order = clust.ordering_
    reachability = clust.reachability_[order]
    ordered_labels = clust.labels_[order]

    plt.figure(figsize=(10, 7))
    grid = gridspec.GridSpec(2, 3)
    ax_reach = plt.subplot(grid[0, :])
    ax_optics = plt.subplot(grid[1, 0])
    ax_cut_050 = plt.subplot(grid[1, 1])
    ax_cut_200 = plt.subplot(grid[1, 2])

    def _scatter_clusters(ax, point_labels, styles, **plot_kwargs):
        # One plot call per cluster id (0, 1, ...), then the noise points.
        for klass, style in enumerate(styles):
            members = X[point_labels == klass]
            ax.plot(members[:, 0], members[:, 1], style, alpha=0.3, **plot_kwargs)
        ax.plot(X[point_labels == -1, 0], X[point_labels == -1, 1], 'k+', alpha=0.1)

    # Reachability plot
    for klass, style in enumerate(['g.', 'r.', 'b.', 'y.', 'c.']):
        in_cluster = ordered_labels == klass
        ax_reach.plot(space[in_cluster], reachability[in_cluster], style, alpha=0.3)
    is_noise = ordered_labels == -1
    ax_reach.plot(space[is_noise], reachability[is_noise], 'k.', alpha=0.3)
    ax_reach.plot(space, np.full_like(space, 2., dtype=float), 'k-', alpha=0.5)
    ax_reach.plot(space, np.full_like(space, 0.5, dtype=float), 'k-.', alpha=0.5)
    ax_reach.set_ylabel('Reachability (epsilon distance)')
    ax_reach.set_title('Reachability Plot')

    # OPTICS
    _scatter_clusters(ax_optics, clust.labels_, ['g.', 'r.', 'b.', 'y.', 'c.'])
    ax_optics.set_title('Automatic Clustering\nOPTICS')

    # DBSCAN at 0.5
    _scatter_clusters(ax_cut_050, labels_050,
                      ['g', 'greenyellow', 'olive', 'r', 'b', 'c'], marker='.')
    ax_cut_050.set_title('Clustering at 0.5 epsilon cut\nDBSCAN')

    # DBSCAN at 2.
    _scatter_clusters(ax_cut_200, labels_200, ['g.', 'm.', 'y.', 'c.'])
    ax_cut_200.set_title('Clustering at 2.0 epsilon cut\nDBSCAN')

    plt.tight_layout()
    plt.show()
X_normalized = pd.DataFrame(X_normalized) # Renaming the columns X_normalized.columns = X.columns X_normalized.head() # OPTICS Clustering model optics_model = OPTICS(min_samples=10, min_cluster_size=0.05) # Training the model optics_model.fit(X_normalized) # DBSCAN technique with eps = 0.5 labels1 = cluster_optics_dbscan(reachability=optics_model.reachability_, core_distances=optics_model.core_distances_, ordering=optics_model.ordering_, eps=0.3) # DBSCAN technique with eps = 2.0 labels2 = cluster_optics_dbscan(reachability=optics_model.reachability_, core_distances=optics_model.core_distances_, ordering=optics_model.ordering_, eps=1.0) # Creating a numpy array with numbers at equal spaces till # the specified range space = np.arange(len(X_normalized)) # Storing the reachability distance of each point reachability = optics_model.reachability_[optics_model.ordering_]
# Synthetic data: six 2-D Gaussian blobs. The draws happen in the order
# C1..C6, so the sequence of random numbers is the same as before.
C1, C2, C3, C4, C5, C6 = (
    offset + scale * np.random.randn(n_points_per_cluster, 2)
    for offset, scale in zip(
        ([-5, -2], [4, -1], [1, -2], [-2, 3], [3, -2], [5, 6]),
        (.8, .1, .2, .3, 1.6, 2),
    )
)
X = np.vstack([C1, C2, C3, C4, C5, C6])

clust = OPTICS(min_samples=50, xi=.05, min_cluster_size=.05)

# Run the fit
clust.fit(X)

# Flat clusterings obtained by cutting the reachability graph at two epsilons
labels_050, labels_200 = (
    cluster_optics_dbscan(
        reachability=clust.reachability_,
        core_distances=clust.core_distances_,
        ordering=clust.ordering_,
        eps=cut,
    )
    for cut in (0.5, 2)
)

# Data for the reachability plot, in OPTICS visiting order
space = np.arange(len(X))
reachability = clust.reachability_[clust.ordering_]
labels = clust.labels_[clust.ordering_]

# Top row: one axis across all three columns; bottom row: one axis per column
plt.figure(figsize=(10, 7))
G = gridspec.GridSpec(2, 3)
ax1 = plt.subplot(G[0, :])
ax2, ax3, ax4 = (plt.subplot(G[1, col]) for col in range(3))
def machine_learning(self, df, plugin_options):
    """Apply the scikit-learn OPTICS machine learning algorithm to the
    supplied data set, returning the results and centroids.

    The data is standardized with ``StandardScaler`` unless
    ``OPTICS_skip_normalization`` is present in ``plugin_options``. OPTICS
    is fitted with ``max_eps=self.eps``, and cluster labels are extracted
    with a DBSCAN-style cut at ``self.eps``. ``self.eps`` and
    ``self.min_samples`` are overwritten when ``OPTICS_eps`` /
    ``OPTICS_min_samples`` are supplied, and ``self.fitting_time`` is set.

    Args:
        df (Pandas DataFrame): DataFrame containing the machine learning
            ready version of the dataset to be processed. Note that the
            scaler is created with ``copy=False``, so it is allowed to
            modify the data it is given in place.
        plugin_options (dictionary): Dictionary containing any optional
            parameters for plugins being used.

    Returns:
        Dictionary: ``"OPTICS_Results"`` holds the cluster label of every
            row (-1 for noise) and ``"OPTICS_Centroids"`` the mean of the
            rows of each cluster, indexed by cluster label.
    """
    print("\n")
    print("--Beginning: Machine Learning")
    print("\tMachine learning algorithm: scikit-learn OPTICS")

    if "OPTICS_eps" in plugin_options:
        self.eps = float(plugin_options["OPTICS_eps"])
        print("\tOverriding default eps, it is set to: %g" % self.eps)
    else:
        print("\tUsing default setting for eps: %g" % self.eps)

    if "OPTICS_min_samples" in plugin_options:
        self.min_samples = int(plugin_options["OPTICS_min_samples"])
        print("\tOverriding default min_samples, it is set to: %i" % self.min_samples)
    else:
        print("\tUsing default setting for min_samples: %g" % self.min_samples)

    # Capture start time.
    start_time = time.time()

    # Create an instance of OPTICS and a normalizer. They are applied one
    # after the other below (no Pipeline object is built).
    # cluster_method = "xi" or "dbscan"
    optics = OPTICS(max_eps=self.eps, min_samples=self.min_samples, cluster_method="xi")
    normalizer = StandardScaler(copy=False, with_mean=self.with_mean, with_std=True)

    print("\tBeginning: fitting")
    start_time_fitting = time.time()

    # Check to see if the user wants to skip normalization of the data before
    # applying OPTICS to the data.
    if "OPTICS_skip_normalization" not in plugin_options:
        normalized_data = normalizer.fit_transform(df)
    else:
        print("\t\tNOT normalizing data as requested by user...")
        normalized_data = df

    optics.fit(normalized_data)
    results = cluster_optics_dbscan(reachability=optics.reachability_,
                                    core_distances=optics.core_distances_,
                                    ordering=optics.ordering_,
                                    eps=self.eps)

    # Number of clusters in labels, ignoring noise if present.
    n_clusters = len(set(results)) - (1 if -1 in results else 0)
    n_noise = int(np.count_nonzero(results == -1))
    print("\tn_clusters = %i, n_noise = %i" % (n_clusters, n_noise))

    # Compute centroids of the clusters. When normalization is skipped the
    # data is still a DataFrame, where `data[list_of_ints]` selects columns
    # rather than rows, so rows are always taken from a plain array.
    points = np.asarray(normalized_data)
    centroids = np.zeros((n_clusters, points.shape[1]))
    for i in range(n_clusters):
        members = points[results == i]
        centroids[i] = np.sum(members, axis=0) / len(members)

    self.fitting_time = time.time() - start_time_fitting
    print("\tFinished: fitting")

    print("\n\tFitting Time: %.4f seconds" % self.fitting_time)
    print("\tMachine Learning Total Time: %.4f seconds" % (time.time() - start_time))
    print("--Finished: Machine Learning")

    # Return a dictionary containing specific components created or calculated
    # as part of the machine learning process. These may be used to perform
    # additional tasks (saving data to files, graphing, etc.).
    return {"OPTICS_Results": results, "OPTICS_Centroids": centroids}