def plot_bacteria_as_clusters(data: pd.DataFrame, save_path: Path, save_fig: bool = False, time_point=None):
    if time_point is None:
        # set to last time step
        time_point = -1

    # collect bacteria positions at the chosen time step into an (N, 3) array
    position_matrix = []
    for bac in data['position'].index:
        x, y, z = data['position'][bac][time_point][0], \
                  data['position'][bac][time_point][1], \
                  data['position'][bac][time_point][2]
        position_matrix.append([x, y, z])
    positions = np.asarray(position_matrix)

    # raw positions
    fig = plt.figure()
    ax = Axes3D(fig)
    ax.scatter(positions[:, 0], positions[:, 1], positions[:, 2], s=30)
    ax.view_init(azim=200)
    plt.show()

    # cluster the positions (DBSCAN kept as an alternative)
    # model = DBSCAN(eps=2.5, min_samples=2)
    model = OPTICS(min_samples=2, metric='euclidean')
    model.fit_predict(positions)

    # positions coloured by cluster label
    fig = plt.figure()
    ax = Axes3D(fig)
    ax.scatter(positions[:, 0], positions[:, 1], positions[:, 2], c=model.labels_, s=30)
    ax.view_init(azim=200)

    if save_fig:
        path = Path(save_path).parent / 'cluster_plot.png'
        plt.savefig(path)
        plt.close(fig)
    else:
        plt.show()
def sort_bacteria_in_cluster(self):
    """
    Sorts the bacteria in the biofilm into bac_clusters.
    Clusters are calculated with the OPTICS algorithm.
    Return value is a list of the bac_clusters containing the respective bacteria.
    """
    # arrange the data as an N x 3 matrix, where N is the number of bacteria
    data = self.position_matrix.transpose()
    model = OPTICS(min_samples=2, metric='euclidean')
    model.fit_predict(data)

    clusters = [[] for _ in range(len(np.unique(model.labels_)))]
    for bacteria, index in zip(self.bacteria, model.labels_):
        # sort bacteria into bac_clusters according to the assigned labels
        # (noise points, label -1, end up in the last list)
        clusters[index].append(bacteria)

    # check that all bacteria were assigned
    total = 0
    for cluster in clusters:
        total += len(cluster)
    if total != len(self.bacteria):
        raise ValueError(f"{abs(total - len(self.bacteria))} bacteria were not sorted into a cluster.")
    return clusters
def optics_clustering(principal_components, principal_df):
    final_df = pd.concat([principal_df], axis=1)
    # note: eps is only used when cluster_method='dbscan'; with the default 'xi' method it has no effect
    model = OPTICS(eps=5, min_samples=2)
    # fit model and predict clusters
    yhat = model.fit_predict(principal_components)
    # retrieve unique clusters
    clusters = unique(yhat)
    final_df['Segment'] = model.labels_
    # create scatter plot for samples from each cluster
    for cluster in clusters:
        # get row indexes for samples with this cluster
        row_ix = where(yhat == cluster)
        # create scatter of these samples
        plt.scatter(principal_components[row_ix, 0], principal_components[row_ix, 1], s=75)
    final_df.rename({
        0: 'PC1',
        1: 'PC2',
        2: 'PC3',
        'y': 'Race'
    }, axis=1, inplace=True)
    print(final_df)
    plt.title("OPTICS Clustering")
    add_race_labels(final_df)
    calc_silhouette(data=principal_components, prediction=yhat, n_clusters=len(clusters))
    return final_df
def find_cluster_indices(output_seqs, batch_size, datatype="train_y"):
    ## Cluster the output set of sequences and choose sequences randomly from each cluster ##
    print("Clustering {}".format(datatype))
    features = convert_to_array(output_seqs)
    from sklearn.cluster import OPTICS
    clustering_type = OPTICS(min_samples=2, min_cluster_size=2)
    # alternative: DBSCAN(eps=0.5, min_samples=2).fit(features)
    cluster_labels = clustering_type.fit_predict(features)
    print("Number of clusters: {}".format(str(len(list(set(cluster_labels))))))
    x = list()
    y = list()
    cluster_indices_dict = dict()
    for i, l in enumerate(cluster_labels):
        x.append(output_seqs[i])
        y.append(l)
        if l not in cluster_indices_dict:
            cluster_indices_dict[l] = list()
        cluster_indices_dict[l].append(i)
    scatter_df = pd.DataFrame(list(zip(x, y)), columns=["output_seqs", "clusters"])
    scatter_df.to_csv("data/generated_files/clustered_output_seqs_data_{}.csv".format(datatype))
    return cluster_labels, cluster_indices_dict, scatter_df
def get_optics(data):
    """ Do OPTICS clustering and return the clustered data """
    optics = OPTICS(min_samples=50)
    vals = data.iloc[:, 0:].values
    y_pred = optics.fit_predict(StandardScaler().fit_transform(vals))
    data["cluster"] = y_pred
    return data
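# A minimal usage sketch for get_optics above. The demo DataFrame is made up;
# it assumes pandas/numpy plus the OPTICS and StandardScaler imports already used
# by the function. With min_samples=50, at least 50 points are needed per cluster.
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
demo = pd.DataFrame(
    np.vstack([rng.normal(0, 1, (100, 2)), rng.normal(8, 1, (100, 2))]),
    columns=["f1", "f2"],
)
clustered = get_optics(demo)
print(clustered["cluster"].value_counts())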
def visual(c, X, y):
    from sklearn.cluster import OPTICS
    cluster_object = OPTICS(min_cluster_size=100)
    y_pred = cluster_object.fit_predict(X)
    colors = [
        'red', 'green', 'blue', 'cyan', 'black', 'yellow', 'magenta',
        'brown', 'orange', 'silver', 'goldenrod', 'olive', 'dodgerblue'
    ]
    clusters = np.unique(y_pred)
    print("Cluster Labels")
    print(clusters)
    print("Evaluation")
    evaluation_labels(y, y_pred)
    evaluation(X, y_pred)

    # scatter of the dataset, coloured by the true labels
    for cluster in np.unique(y):
        row_idx = np.where(y == cluster)
        plt.scatter(X[row_idx, 0], X[row_idx, 1], label=str(cluster))
    plt.title('Dataset')
    plt.xlabel('X1')
    plt.ylabel('X2')
    plt.legend()
    plt.show()

    # scatter of the dataset, coloured by the predicted clusters
    plt.figure()
    for cluster in clusters:
        row_idx = np.where(y_pred == cluster)
        plt.scatter(X[row_idx, 0], X[row_idx, 1], label=str(cluster))
    plt.title('Cluster')
    plt.xlabel('X1')
    plt.ylabel('X2')
    plt.legend()
    plt.show()
def cluster_proteins_by_sim(prot_graph_fname):
    print('here')
    with open(prot_graph_fname, 'rb') as fd:
        nodes, adj_mat = pkl.load(fd)

    model = OPTICS(min_cluster_size=5, n_jobs=-1)
    clusters = model.fit_predict(adj_mat)
    print(Counter(clusters))

    # eGTM embedding coloured by cluster label (noise shown in gray)
    transformer = eGTM()
    x, y = transformer.fit_transform(adj_mat).T
    cmap = plt.get_cmap('jet', np.max(clusters) + 2)
    cmap.set_under('gray')
    fig, ax = plt.subplots()
    ax.scatter(x, y, c=clusters, s=10, cmap=cmap)
    outfile = os.path.join(os.path.dirname(prot_graph_fname), 'protein_egtm_clusters.png')
    plt.savefig(outfile)
    plt.close()

    # t-SNE embedding coloured by the same cluster labels
    transformer = TSNE(n_components=2, n_iter_without_progress=10)
    x, y = transformer.fit_transform(adj_mat).T
    cmap = plt.get_cmap('jet', np.max(clusters) + 2)
    cmap.set_under('gray')
    fig, ax = plt.subplots()
    ax.scatter(x, y, c=clusters, s=10, cmap=cmap)
    outfile = os.path.join(os.path.dirname(prot_graph_fname), 'protein_tsne_clusters.png')
    plt.savefig(outfile)
    plt.close()
def exploratory_analysis(dataset: str, samples=0.1, eps=np.inf) -> None:
    X = np.genfromtxt(dataset, delimiter=',', encoding='utf8')
    scaler = StandardScaler(copy=False)
    X_transformed = scaler.fit_transform(X)
    clust = OPTICS(min_samples=samples, max_eps=eps, n_jobs=2)
    # fit on the scaled data; fitting the raw X would make the scaling step pointless
    labels = clust.fit_predict(X_transformed)
    # the label set includes -1 for noise points
    n_clusters = len(set(labels))
    print("# clusters: {0}".format(n_clusters))
class OPTICS_algo_wrapper:
    def __init__(self):
        # min_samples must be an int > 1 (or a float in (0, 1)) for scikit-learn's OPTICS,
        # so 2 is used here instead of 1
        self.wrapped = OPTICS(min_samples=2, max_eps=2, metric='cosine', cluster_method='dbscan')

    def fit(self, data):
        return self.wrapped.fit(data)

    def predict(self, data):
        return self.wrapped.fit_predict(data)
class OPTICS_algo_wrapper:
    def __init__(self):
        self.wrapped = OPTICS(min_samples=5, xi=.05, min_cluster_size=.05)
        self.data = []
        self.indexes = []

    def fit(self, data):
        self.wrapped.fit(data)
        self.data = data
        self.indexes = self.wrapped.labels_

    def predict(self, data):
        return self.wrapped.fit_predict(data)
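# Hypothetical usage of the OPTICS_algo_wrapper above with made-up 2D data.
# fit() stores the data and labels; predict() re-runs fit_predict, since OPTICS
# has no separate prediction step for unseen samples.
import numpy as np

rng = np.random.default_rng(1)
points = np.vstack([rng.normal(0, 0.5, (50, 2)), rng.normal(5, 0.5, (50, 2))])
wrapper = OPTICS_algo_wrapper()
wrapper.fit(points)
print(np.unique(wrapper.indexes))           # labels stored by fit()
print(np.unique(wrapper.predict(points)))   # refits and returns labels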
def cluster_manifold_in_embedding(hl, y, manifold_learner, umap_min_dist, umap_metric, umap_dim,
                                  umap_neighbors, n_clusters, cluster):
    # find manifold on autoencoded embedding
    if manifold_learner == 'UMAP':
        md = float(umap_min_dist)
        hle = umap.UMAP(random_state=0, metric=umap_metric, n_components=umap_dim,
                        n_neighbors=umap_neighbors, min_dist=md).fit_transform(hl)
    elif manifold_learner == 'LLE':
        hle = LocallyLinearEmbedding(n_components=umap_dim, n_neighbors=umap_neighbors).fit_transform(hl)
    elif manifold_learner == 'tSNE':
        hle = TSNE(n_components=umap_dim, n_jobs=16, random_state=0, verbose=0).fit_transform(hl)
    elif manifold_learner == 'isomap':
        hle = Isomap(n_components=umap_dim, n_neighbors=5).fit_transform(hl)

    # clustering on new manifold of autoencoded embedding
    if cluster == 'GMM':
        gmm = mixture.GaussianMixture(covariance_type='full', n_components=n_clusters, random_state=0)
        gmm.fit(hle)
        y_pred_prob = gmm.predict_proba(hle)
        y_pred = y_pred_prob.argmax(1)
    elif cluster == 'KM':
        km = KMeans(init='k-means++', n_clusters=n_clusters, random_state=0, n_init=20)
        y_pred = km.fit_predict(hle)
    elif cluster == 'SC':
        sc = SpectralClustering(n_clusters=n_clusters, random_state=0, affinity='nearest_neighbors')
        y_pred = sc.fit_predict(hle)
    elif cluster == 'DBSCAN':
        db = DBSCAN()
        y_pred = db.fit_predict(hle)
    elif cluster == 'OPTICS':
        op = OPTICS()
        y_pred = op.fit_predict(hle)

    y_pred = np.asarray(y_pred)
    # y = np.asarray(y)
    # y = y.reshape(len(y), )
    # nmi = np.round(metrics.normalized_mutual_info_score(y, y_pred), 5)
    # ari = np.round(metrics.adjusted_rand_score(y, y_pred), 5)
    print('==' * 80)
    # print("METRICS for the ", cluster, manifold_learner)
    # print(nmi)
    # print(ari)
    print('=' * 80)
    return y_pred
class OPTICSModel(ClusteringModel):
    def __init__(self, **params):
        super().__init__('optics')
        self.model = OPTICS(**params)

    def perform_clustering(self, features, **params):
        # a single fit_predict is enough; calling fit first would cluster the data twice
        labels = self.model.fit_predict(features, **params)
        return pd.concat([
            features,
            pd.DataFrame(labels, columns=('cluster',))
        ], axis=1)
def _get_class(self, im):
    minmax = (im.min(), im.max())
    if minmax[1] - minmax[0] == 0:
        return np.array([])
    im = (im - minmax[0]) / (minmax[1] - minmax[0])
    clf = OPTICS(metric='euclidean', min_cluster_size=75)
    # collect the coordinates of all pixels above the sentence threshold
    a = []
    for x in range(im.shape[0]):
        for y in range(im.shape[1]):
            if im[x][y] > self.sentence_threshold:
                a.append([x, y])
    b = clf.fit_predict(a)
    # write the (1-based) cluster label back into an image-shaped array
    c = np.zeros(im.shape)
    for i in range(len(b)):
        c[a[i][0], a[i][1]] = b[i] + 1
    return c
def bin(self, min_samples=5, max_eps=np.inf, metric='euclidean', p=2, metric_params=None,
        cluster_method='xi', eps=None, xi=0.05, predecessor_correction=True, min_cluster_size=None,
        algorithm='auto', leaf_size=30, n_jobs=10, **kwargs):
    """
    OPTICS clustering of the embedding
    """
    # optics clustering
    clusterer = OPTICS(min_samples=min_samples, max_eps=max_eps, metric=metric, p=p,
                       metric_params=metric_params, cluster_method=cluster_method, eps=eps, xi=xi,
                       predecessor_correction=predecessor_correction, min_cluster_size=min_cluster_size,
                       algorithm=algorithm, leaf_size=leaf_size, n_jobs=n_jobs)
    cluster_labels = clusterer.fit_predict(self.embedding_df)
    cluster_df = pd.DataFrame(data=cluster_labels.transpose(), columns=['cluster'],
                              index=self.embedding_df.index)

    # write output
    output_cluster_file = os.path.join(self.output_dir, self.prefix + "_optics.tsv")
    cluster_df.to_csv(output_cluster_file, sep="\t", header=True, index=True)
    return cluster_df
async def run(hub, config, pipe, data, train):
    '''
    Run the OPTICS algorithm on the given dataset
    '''
    if pipe not in hub.models.optics.COMPS:
        kmconfig = config.get('optics', {})
        mlo = OPTICS(n_jobs=kmconfig.get('n_jobs', -1))
        print('Created OPTICS machine learning object:\n', mlo)
        hub.models.optics.COMPS[pipe] = {'mlo': mlo}
    mlo = hub.models.optics.COMPS[pipe]['mlo']
    if train:
        print(f'Training {len(train)} samples')
        mlo.fit(train)
    if data:
        print(f'Predicting {len(data)} samples')
        # OPTICS has no separate predict step, so fit_predict re-clusters the given data
        return list(mlo.fit_predict(data))
    return []
def clusters_partition(df, color, size):
    '''Establishes hubs wherever two or more clusters connect.'''
    params = {
        'cluster_method': 'xi',
        'metric': 'cityblock',
        'xi': 0.05,
        'min_cluster_size': None,
        'max_eps': np.inf,
        'n_jobs': None
    }
    model = OPTICS(**params)
    df['hub'] = model.fit_predict(X=df[['x', 'y']].values)
    # noise points (label -1) keep -1 in both derived columns
    mask = df['hub'] < 0
    df['hub_group'] = df['hub'] // size
    df.loc[mask, 'hub_group'] = -1
    df['hub_color'] = df['hub'] % size
    df.loc[mask, 'hub_color'] = -1
    df['hub_color'] = df['hub_color'].map(color)
    return df
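# Hypothetical call to clusters_partition above; the points, colour lookup and
# group size are invented inputs just to show the expected shapes.
import numpy as np
import pandas as pd

rng = np.random.default_rng(2)
pts = pd.DataFrame(rng.random((200, 2)) * 10, columns=['x', 'y'])
palette = {-1: 'gray', 0: 'red', 1: 'green', 2: 'blue', 3: 'orange'}
hubs = clusters_partition(pts, color=palette, size=4)
print(hubs[['hub', 'hub_group', 'hub_color']].head())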
def optics(data, name_list, data_name, result_path, vis_path):
    print("Start OPTICS clustering..")
    model = OPTICS(min_samples=10)
    # a single fit_predict is enough; fitting first and predicting again would cluster twice
    predict = model.fit_predict(data)
    k = max(model.labels_)

    result_path = result_path + "optics/"
    if not os.path.exists(result_path):
        os.mkdir(result_path)
    vis_path = vis_path + "optics/"
    if not os.path.exists(vis_path):
        os.mkdir(vis_path)

    # image_classification(name_list, predict, result_path, k)
    save_result(name_list, predict, result_path, k)
    visualization(data, predict, data_name, vis_path, k)
    print("Done.\n")
def _get_class(self, im, size=128):
    minmax = (im.min(), im.max())
    if minmax[1] - minmax[0] == 0:
        return np.array([])
    im = (im - minmax[0]) / (minmax[1] - minmax[0])
    sc = cv2.resize(im, (size, size), interpolation=cv2.INTER_NEAREST)
    clf = OPTICS(max_eps=5, metric='euclidean', min_cluster_size=75)
    # collect the coordinates of all pixels above the threshold
    a = []
    for x in range(sc.shape[0]):
        for y in range(sc.shape[1]):
            if sc[x][y] > 0.01:
                a.append([x, y])
    b = clf.fit_predict(a)
    # remap the labels (including -1 for noise) to consecutive integers
    p = {v: k for k, v in enumerate(set(b))}
    b = [p[j] for j in b]
    # write the (1-based) labels back into an image and resize to the original shape
    c = np.zeros(sc.shape, dtype=np.int32)
    for i in range(len(b)):
        c[a[i][0], a[i][1]] = b[i] + 1
    c = cv2.resize(c, (im.shape[1], im.shape[0]), interpolation=cv2.INTER_NEAREST)
    return c
def optics_mins(ecfp_data):
    min_s_lst = []
    nn_lst = []
    svm_lst = []
    lda_lst = []
    rf_lst = []
    h_x_lst = []
    for min_s in range(2, 10):
        clustering = OPTICS(min_samples=min_s, metric=tanimoto_dist)
        labels = clustering.fit_predict(ecfp_data)
        X_train, X_test, y_train, y_test = train_test_split(ecfp_data, labels, test_size=0.2, random_state=0)
        min_s_lst.append(min_s)
        nn_lst.append(nn_classification(X_train, X_test, y_train, y_test))
        svm_lst.append(svm_classification(X_train, X_test, y_train, y_test))
        lda_lst.append(lda_classification(X_train, X_test, y_train, y_test))
        rf_lst.append(rf_classification(X_train, X_test, y_train, y_test))
        h_x_lst.append(shannon_entropy(labels))

    fig, ax = plt.subplots()
    ax.plot(min_s_lst, nn_lst, label='NN')
    ax.plot(min_s_lst, svm_lst, label='SVM')
    ax.plot(min_s_lst, lda_lst, label='LDA')
    ax.plot(min_s_lst, rf_lst, label='RF')
    ax.set_xlabel('Minimal Samples')
    ax.set_ylabel('Accuracy Rate')
    ax1 = ax.twinx()
    ax1.plot(min_s_lst, h_x_lst, '--', color='black')
    ax1.set_ylabel('Shannon Entropy')
    ax.set_title("Hyperparameter Tuning for OPTICS Clustering")
    ax.legend(loc='lower right')
    # ax.grid()
    plt.margins(0.02)
    ax.set_ylim([0, 1])
    ax1.set_ylim([0, 1])
    plt.show()
def compute_optics(scooter_data, facil_bndry):
    with open(
            '/Users/BrandonHall/Documents/GitHub/SUMDScrapeAndAnalysis/DC_Outlines/DCboundCoords.pkl',
            'rb') as f:
        facilities = pickle.load(f)
    xext = (-76.00, -77.20)
    yext = (38.7, 39.00)
    X = np.array([[trip['xy'].x, trip['xy'].y] for trip in scooter_data])
    print("Shape of X", X.shape)
    clust = OPTICS(min_samples=50, xi=.005, max_eps=.1, min_cluster_size=.005)
    # Run the fit
    labels = clust.fit_predict(X)
    unique_labels = set(labels)
    print('there are ' + str(len(unique_labels) - 1) + ' clusters')
    # graph clusters
    plt.figure(figsize=(7, 7))
    colors = [
        plt.cm.Spectral(each)
        for each in np.linspace(0, 1, len(unique_labels))
    ]
    plt.fill(*facil_bndry.exterior.xy, c='gold', alpha=0.3)
    plt.plot(*facil_bndry.exterior.xy)
    plt.plot(*facilities.exterior.xy)
    # G = gridspec.GridSpec(1, 1)
    # ax = plt.subplot(G[0, 0])
    # ax.set_title('Automatic Clustering\nOPTICS')
    for klass, color in zip(range(0, len(unique_labels)), colors):
        Xk = X[clust.labels_ == klass]
        plt.plot(Xk[:, 0], Xk[:, 1], alpha=0.9)
    plt.plot(X[clust.labels_ == -1, 0], X[clust.labels_ == -1, 1], 'k+', alpha=0.5)
    plt.axis('equal')
    plt.show()
# optics clustering
from numpy import unique
from numpy import where
from sklearn.datasets import make_classification
from sklearn.cluster import OPTICS
from matplotlib import pyplot

# define dataset
X, _ = make_classification(n_samples=1000, n_features=2, n_informative=2, n_redundant=0,
                           n_clusters_per_class=1, random_state=4)
# define the model
# (note: eps only takes effect when cluster_method='dbscan'; see the variant below)
model = OPTICS(eps=0.8, min_samples=10)
# fit model and predict clusters
yhat = model.fit_predict(X)
# retrieve unique clusters
clusters = unique(yhat)
# create scatter plot for samples from each cluster
for cluster in clusters:
    # get row indexes for samples with this cluster
    row_ix = where(yhat == cluster)
    # create scatter of these samples
    pyplot.scatter(X[row_ix, 0], X[row_ix, 1])
# show the plot
pyplot.show()
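# In the example above, eps=0.8 is silently ignored because the default
# cluster_method='xi' does not use it. A minimal sketch of the DBSCAN-style
# extraction, where eps actually controls the clustering (same X as above):
model_db = OPTICS(min_samples=10, cluster_method='dbscan', eps=0.8)
yhat_db = model_db.fit_predict(X)
for cluster in unique(yhat_db):
    row_ix = where(yhat_db == cluster)
    pyplot.scatter(X[row_ix, 0], X[row_ix, 1])
pyplot.show()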
vectors = []
for word in sentence:
    if word in model:
        vectors.append(model[word])
df_vectors = pd.DataFrame(vectors)
# average word-wise so that the whole sentence gets a single "average word vector"
mean_vector = df_vectors.mean(axis=0).values.tolist()
entry_vectors.append(mean_vector)

df['vector'] = entry_vectors

# Clustering
xi = .07
clust = OPTICS(min_samples=2, xi=xi)
labels = clust.fit_predict(entry_vectors)
df['label'] = labels

pd.set_option('display.max_colwidth', None)  # show long strings in full (-1 is deprecated)
# select columns
df = df.filter(items=['label', 'feed', 'entry'])
# drop uncategorised rows
df = df[df['label'] >= 0]
# sort
df = df.sort_values(by='label')
print(df.to_string())
# drop NaN values and entries equal to 1
# (filter the keys against the original values before the values themselves are overwritten)
ListKey = [
    y for x, y in zip(ListVal, ListKey)
    if not math.isnan(x[0]) and (x[0] != 1 and x[1] != 1)
]
ListVal = [
    x for x in ListVal
    if not math.isnan(x[0]) and (x[0] != 1 and x[1] != 1)
]
DictF = {x: y for x, y in zip(ListKey, ListVal)}
################################################################
opt = OPTICS(min_samples=14)
y_opt = opt.fit_predict(ListVal)
ListKeyN = np.array(ListKey)
print(Counter(y_opt))
# print("Noise: {0}".format(ListKeyN[y_opt==-1]))
print("Cluster 20: {0}".format(ListKeyN[y_opt == 20]))
'''
import matplotlib.pyplot as plt
countClust = []
Noise = []
for i in range(8, 109):
    opt = OPTICS(min_samples=i)
    y_opt = opt.fit_predict(ListVal)
    countClust.append(max(Counter(y_opt).keys()))
    Noise.append(Counter(y_opt)[-1])
plt.plot(countClust, range(8, 109), marker='o')
def cluster(img, eps, min_samples, backend="dbscan", nthreads=2, fit_kind="circle"):
    """
    Cluster groups of pixels.

    Parameters:
    ----------
    img : np.ndarray
        Input image. Must be binary.
    eps : float
        Maximum distance allowed to form a cluster.
    min_samples : int
        Minimum number of samples to form a cluster.
    backend : str
        Which backend to use for clustering. Default is DBSCAN.
    fit_kind : str
        What type of geometry to fit to the clusters. Default is circle.

    Returns:
    -------
    df : pd.DataFrame
        A dataframe with the clustering results.
    """
    ipx, jpx = np.where(img)  # gets where img == 1
    X = np.vstack([ipx, jpx]).T

    if len(X) > min_samples:
        if backend.lower() == "optics":
            db = OPTICS(cluster_method="dbscan",
                        metric="euclidean",
                        eps=eps,
                        max_eps=eps,
                        min_samples=min_samples,
                        min_cluster_size=min_samples,
                        n_jobs=nthreads,
                        algorithm="ball_tree").fit(X)
            labels = db.labels_
        elif backend.lower() == "hdbscan":
            db = hdbscan.HDBSCAN(min_cluster_size=int(min_samples),
                                 metric="euclidean",
                                 allow_single_cluster=True,
                                 core_dist_n_jobs=nthreads)
            labels = db.fit_predict(X)
        elif backend.lower() == "dbscan":
            db = DBSCAN(eps=eps,
                        metric="euclidean",
                        min_samples=min_samples,
                        n_jobs=nthreads,
                        algorithm="ball_tree").fit(X)
            labels = db.labels_
        else:
            raise ValueError("Use either DBSCAN, OPTICS or HDBSCAN.")

        # to dataframe
        df = pd.DataFrame(X, columns=["j", "i"])
        df["cluster"] = labels
        df = df[df["cluster"] >= 0]

        # get centers and radii
        cluster = []
        i_center = []
        j_center = []
        n_pixels = []
        R1 = []
        R2 = []
        theta = []
        for cl, gdf in df.groupby("cluster"):
            # fit a circle
            if fit_kind == "circle":
                c, r2 = miniball.get_bounding_ball(gdf[["i", "j"]].values.astype(float))
                xc, yc = c
                r1 = np.sqrt(r2)
                r2 = r1  # these are for ellipses only
                t = 0  # these are for ellipses only
            elif fit_kind == "ellipse":
                try:
                    # compute the minimum bounding ellipse
                    A, c = mvee(gdf[["i", "j"]].values.astype(float))
                    # centroid
                    xc, yc = c
                    # radius, angle and eccentricity
                    r1, r2, t, _ = get_ellipse_parameters(A)
                except Exception:
                    # fall back to circle
                    c, r2 = miniball.get_bounding_ball(gdf[["i", "j"]].values.astype(float))
                    xc, yc = c
                    r1 = np.sqrt(r2)
                    r2 = r1  # these are for ellipses only
                    t = 0  # these are for ellipses only
            else:
                raise ValueError("Can only fit data to circles or ellipses.")

            # append to output
            i_center.append(xc)
            j_center.append(yc)
            cluster.append(cl)
            n_pixels.append(len(gdf))
            R1.append(r1)
            R2.append(r2)
            theta.append(t)

        # to dataframe
        x = np.vstack([i_center, j_center, n_pixels, R1, R2, theta, cluster]).T
        columns = ["ic", "jc", "pixels", "ir", "jr", "theta_ij", "cluster"]
        df = pd.DataFrame(x, columns=columns)
        return df
    else:
        return pd.DataFrame()
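# A hedged usage sketch for cluster() above on a small synthetic binary image.
# Assumes the function and its module-level imports (numpy, pandas, DBSCAN, and
# miniball for the circle fit) are available; the image and parameters are made up.
import numpy as np

img = np.zeros((64, 64), dtype=int)
img[10:18, 10:18] = 1   # first blob
img[40:46, 30:36] = 1   # second blob
df_clusters = cluster(img, eps=2.0, min_samples=10, backend="dbscan", fit_kind="circle")
print(df_clusters[["ic", "jc", "pixels", "ir"]])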
data['resultados'] = previsoes
data.groupby("resultados").aggregate("mean").plot.bar()
plt.title('Algoritmo: DBSCAN')
plt.legend(['Matemática', 'Leitura', 'Escrita'])
plt.xlabel('Classes')
plt.ylabel('Nota média')
plt.show()

plt.hist(data['resultados'])
plt.xlabel('Classes')
plt.ylabel('Quantidade')
plt.show()

clust = OPTICS(min_samples=20, min_cluster_size=15)
previsoes = clust.fit_predict(scores)
unicos, quantidade = np.unique(previsoes, return_counts=True)

print("Optics com Min Samples {0}".format(20))
print("Coeficiente de Silhueta média: %0.3f" % sklearn.metrics.silhouette_score(scores, clust.labels_))
print("Coeficiente de Davies Bouldin: %0.3f" % sklearn.metrics.davies_bouldin_score(scores, clust.labels_))
print("Coeficiente de Calinski Harabasz: %0.3f\n" % sklearn.metrics.calinski_harabasz_score(scores, clust.labels_))

for u, q in zip(unicos, quantidade):
    print("Classe {0}:\t{1} elementos na classe".format(u, q))

data['resultados'] = previsoes
resultadoOPTICS = data['resultados']
data.groupby("resultados").aggregate("mean").plot.bar()
plt.title('Algoritmo: OPTICS')
plt.legend(['Matemática', 'Leitura', 'Escrita'])
plt.xlabel('Classes')
plt.ylabel('Nota média')
from sklearn.cluster import DBSCAN

times = []
for i in range(1, 5):
    start = time.time()
    dbscanClustering = DBSCAN(eps=5, min_samples=6).fit_predict(clData)
    end = time.time()
    times.append(end - start)
dbscanTime = average(times)

############## OPTICS ##############
from sklearn.cluster import OPTICS

times = []
for i in range(1, 5):
    start = time.time()
    opticsClustering = OPTICS(min_samples=50, xi=0.05, max_eps=10)
    opticsLabels = opticsClustering.fit_predict(clData)
    end = time.time()
    times.append(end - start)
opticsTime = average(times)

########### Hierarchical Clustering ##############
from sklearn.cluster import AgglomerativeClustering

times = []
for i in range(1, 5):
    start = time.time()
    HierClustering = AgglomerativeClustering(n_clusters=8).fit_predict(clData)
    end = time.time()
    times.append(end - start)
hierarchicalTime = average(times)

########### Spectral Clustering ##############
# Using the OPTICS Algorithm
ms = OPTICS(n_jobs=3)
ms.fit(customers_normalized)
customers["Cluster"] = ms.labels_
customers.groupby('Cluster').agg({
    'Recency': 'mean',
    'Frequency': 'mean',
    'MonetaryValue': ['mean', 'count']
}).round(2)

# Visualise clusters for the OPTICS Algorithm
# Scatter plot (the variable name is a holdover from a K-Means version; fit_predict
# simply re-clusters the same data that was already fitted above)
y_kmeans = ms.fit_predict(customers_normalized)
plt.figure(figsize=(8, 8))
plt.scatter(
    customers_normalized[y_kmeans == 0, 0],
    customers_normalized[y_kmeans == 0, 1],
    # customers_normalized[y_kmeans == 0, 2],
    s=10,
    c='red',
    label='Cluster 0')
plt.scatter(
    customers_normalized[y_kmeans == 1, 0],
    customers_normalized[y_kmeans == 1, 1],
    # customers_normalized[y_kmeans == 1, 2],
    s=10,
    c='blue',
    label='Cluster 1')
palette="Accent").set_title("PCA of kMeans analysis") print(adjusted_rand_score(kmeansClustering["kMeans"], data["Gate"])) del (dataCopy, principalComponents, principalDf) # OPTICS Clustering #explaination of methods in sklearn documentation from sklearn.cluster import OPTICS optics = OPTICS(min_samples=10, xi=.05, min_cluster_size=5) dataCopy = data.copy() del (dataCopy["Gate"]) resOptics = optics.fit_predict(dataCopy) dataCopy["optics"] = resOptics #sns.pairplot(dataCopy, diag_kind="kde", markers="1", hue = "optics") dataCopy["optics"].value_counts() #DBSCAN Clustering from sklearn.cluster import DBSCAN dbscan = DBSCAN(eps=121, min_samples=10) dataCopy = data.copy() del (dataCopy["Gate"])
X = array([
    [1038, 660],
    [1045, 680],
    [1038, 750],
    [897, 750],
    [807, 780],
    [805, 850],
])

# Fitting the OPTICS model to the dataset
# (eps is only used when cluster_method='dbscan'; with the default 'xi' method it is ignored)
model = OPTICS(
    eps=0.3,
    min_samples=3,
)
model.fit(X)

# See the division of the data into clusters 0 and -1
y = model.fit_predict(X)
y  # display the labels (notebook-style)

clusters = unique(y)
for cluster in clusters:
    row_ix = where(y == cluster)
    plt.scatter(
        X[row_ix, 0],
        X[row_ix, 1],
    )
plt.show()
def smart_group_clashes(self, clash_sets, max_clustering_distance):
    from sklearn.cluster import OPTICS
    from collections import defaultdict

    count_of_input_clashes = 0
    count_of_clash_sets = 0
    count_of_smart_groups = 0
    count_of_final_clash_sets = 0

    count_of_clash_sets = len(clash_sets)
    for clash_set in clash_sets:
        if not "clashes" in clash_set.keys():
            self.settings.logger.info(
                f"Skipping clash set [{clash_set['name']}] since it contains no clash results."
            )
            continue
        clashes = clash_set["clashes"]
        if len(clashes) == 0:
            self.settings.logger.info(
                f"Skipping clash set [{clash_set['name']}] since it contains no clash results."
            )
            continue

        count_of_input_clashes += len(clashes)

        positions = []
        for clash in clashes.values():
            positions.append(clash["position"])

        data = np.array(positions)

        # INPUTS
        # set the desired maximum distance between the grouped points
        if max_clustering_distance > 0:
            max_distance_between_grouped_points = max_clustering_distance
        else:
            max_distance_between_grouped_points = 3

        model = OPTICS(min_samples=2, max_eps=max_distance_between_grouped_points)
        # a single fit_predict is enough; calling it twice would cluster the data twice
        pred = model.fit_predict(data)

        # Insert the smart groups into the clashes
        if len(pred) == len(clashes.values()):
            i = 0
            for clash in clashes.values():
                int_prediction = int(pred[i])
                if int_prediction == -1:
                    # ungroup this clash since it's a single clash that we were not able to group.
                    new_clash_group_number = np.amax(pred).item() + 1 + i
                    clash["smart_group"] = new_clash_group_number
                else:
                    clash["smart_group"] = int_prediction
                i += 1

    # Create JSON with smart_groups that contain GlobalIDs
    output_clash_sets = defaultdict(list)
    for clash_set in clash_sets:
        if not "clashes" in clash_set.keys():
            continue
        smart_groups = defaultdict(list)
        for clash_id, content in clash_set["clashes"].items():
            if "smart_group" in content:
                object_id_list = list()
                # Clash has been grouped, let's extract it.
                object_id_list.append(content["a_global_id"])
                object_id_list.append(content["b_global_id"])
                smart_groups[content["smart_group"]].append(object_id_list)
        count_of_smart_groups += len(smart_groups)
        output_clash_sets[clash_set["name"]].append(smart_groups)

    # Rename the clash groups to something more sensible
    for clash_set, smart_groups in output_clash_sets.items():
        clash_set_name = clash_set
        # Only select the clashes that correspond to the actively selected IFC Clash Set
        i = 1
        new_smart_group_name = ""
        for smart_group, global_id_pairs in list(smart_groups[0].items()):
            new_smart_group_name = f"{clash_set_name} - {i}"
            smart_groups[0][new_smart_group_name] = smart_groups[0].pop(smart_group)
            i += 1

    count_of_final_clash_sets = len(output_clash_sets)
    self.settings.logger.info(
        f"Took {count_of_input_clashes} clashes in {count_of_clash_sets} clash sets and turned them into {count_of_smart_groups} smart groups in {count_of_final_clash_sets} clash sets"
    )

    return output_clash_sets