def myHDBSCAN(nodes, numPartitions):
    nodesToClusters = {}
    for node in nodes.values():
        if node.user.protected:
            nodesToClusters[node.id] = -99
    for Id in nodesToClusters.keys():
        nodes.pop(Id)
    hdb = HDBSCAN(
        min_cluster_size=5,
        cluster_selection_epsilon=2,
        cluster_selection_method="leaf",
        metric="manhattan",
    )
    for i in range(numPartitions - 1):
        if i == 0:
            clusters = hdb.fit_predict(createAdjacency(nodes))
            continue
        # Sort then group clusters by number of members (noise label -1 excluded)
        frequency = {key: len(tuple(group))
                     for key, group in groupby(sorted(clusters))
                     if "-1" not in str(key)}
        maximum = max(frequency.values())
        for key, freq in frequency.items():
            if freq == maximum:
                cluster = key  # use 'cluster' to select the biggest cluster
        # Split nodes that belong to 'cluster' into two clusters
        vec = hdb.fit_predict(createAdjacency(nodes, tuple(cluster == x for x in clusters)))
        # Update all newly clustered elements from vec into clusters
        cnt = 0
        for index, el in enumerate(clusters):
            if el == cluster:
                clusters[index] = clusters[index] + vec[cnt]
                cnt += 1
    # clusters = np.unique(n_clusters).tolist()
    # clusters.remove(-1)
    # distances = [
    #     [abs(hdb.weighted_cluster_centroid(i) - hdb.weighted_cluster_centroid(j)).mean() for j in clusters]
    #     for i in clusters]
    # edges = pd.DataFrame(distances).applymap(lambda x: x > 0.05).values.tolist()
    for Id, cluster in zip(nodes, clusters):
        nodesToClusters.update({Id: cluster})
    return nodesToClusters, None
def _cluster_train(df):
    start_time = time.time()
    # Note: issue #88 is still open; prediction_data cannot be used with the haversine metric
    # https://github.com/scikit-learn-contrib/hdbscan/issues/88
    db = HDBSCAN(min_samples=1, metric='haversine', core_dist_n_jobs=-1,
                 memory='./__pycache__/', prediction_data=True)
    coords = df[['latitude', 'longitude']] * np.pi / 180  # convert degrees to radians
    df = df.assign(cluster=db.fit_predict(coords))

    # Get the number of clusters (labels run from 0 to max; -1 marks noise)
    num_clusters = db.labels_.max() + 1
    message = ('Clustered {:,} points down to {:,} clusters, '
               'for {:.1f}% compression in {:,.2f} seconds')
    print(message.format(len(df), num_clusters,
                         100 * (1 - float(num_clusters) / len(df)),
                         time.time() - start_time))

    # Get the point closest to the center of each cluster
    cluster_centers = df[['cluster', 'latitude', 'longitude']] \
        .groupby('cluster')[['latitude', 'longitude']] \
        .agg(lambda x: _get_centermost_point(x.values))
    df = df.merge(cluster_centers, left_on='cluster', right_index=True,
                  how='left', suffixes=('', '_cluster'))
    return db, cluster_centers, df
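# _get_centermost_point is not defined in this snippet. A minimal sketch of what such a
# helper commonly looks like (an assumption, not necessarily the author's implementation):
# return the cluster member nearest the cluster centroid, by great-circle distance.
from shapely.geometry import MultiPoint
from geopy.distance import great_circle

def _get_centermost_point(cluster):
    # cluster: array-like of (latitude, longitude) pairs
    centroid = (MultiPoint(cluster).centroid.x, MultiPoint(cluster).centroid.y)
    return min(cluster, key=lambda point: great_circle(point, centroid).m)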
def apply(self, fX):
    from hdbscan import HDBSCAN
    clusterer = HDBSCAN(min_cluster_size=self.min_cluster_size,
                        min_samples=self.min_samples,
                        metric='precomputed')
    distance_matrix = squareform(pdist(fX, metric=self.metric))

    # apply clustering
    cluster_labels = clusterer.fit_predict(distance_matrix)

    # cluster embedding
    n_clusters = np.max(cluster_labels) + 1
    if n_clusters < 2:
        return np.zeros(fX.shape[0], dtype=int)

    fC = l2_normalize(
        np.vstack([np.sum(fX[cluster_labels == k, :], axis=0)
                   for k in range(n_clusters)]))

    # assign each undefined embedding to the closest cluster
    undefined = cluster_labels == -1
    closest_cluster = np.argmin(
        cdist(fC, fX[undefined, :], metric=self.metric), axis=0)
    cluster_labels[undefined] = closest_cluster

    return cluster_labels
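# l2_normalize is assumed above but not shown. An illustrative sketch (it may differ from
# the original helper): row-wise L2 normalisation of the summed cluster embeddings.
import numpy as np

def l2_normalize(X, eps=1e-12):
    norms = np.linalg.norm(X, axis=1, keepdims=True)
    return X / np.maximum(norms, eps)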
def clustering(umap_embedding_fit, umap_embedding_predict, min_cluster_size, prediction_data):
    print("clustering...")
    hdbscan = HDBSCAN(min_cluster_size=min_cluster_size,
                      prediction_data=prediction_data).fit(umap_embedding_fit)
    # Note: fit_predict re-fits the model on umap_embedding_predict, discarding the fit
    # above, so labels_ refers to the second fit.
    clustering = hdbscan.fit_predict(umap_embedding_predict)
    labels = hdbscan.labels_
    return clustering, labels
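# An alternative sketch (not the author's code): when prediction_data=True, hdbscan's
# approximate_predict can label new points against the already-fitted model instead of
# re-fitting. The function and variable names below are illustrative.
from hdbscan import HDBSCAN, approximate_predict

def clustering_without_refit(umap_embedding_fit, umap_embedding_predict, min_cluster_size):
    clusterer = HDBSCAN(min_cluster_size=min_cluster_size,
                        prediction_data=True).fit(umap_embedding_fit)
    predicted_labels, strengths = approximate_predict(clusterer, umap_embedding_predict)
    return predicted_labels, clusterer.labels_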
def get_clusters(self, coordinates, original_df, coordinates_df, csv_path):
    """
    It employs the HDBSCAN method to gather the supplied coordinates
    into clusters.

    Parameters
    ----------
    coordinates : numpy.array
        The array of coordinates that will be clustered. Its shape must
        fulfill the following dimensions: [M, N, 3], where M is the
        total number of models that have been sampled with PELE and N
        is the total number of atoms belonging to the residue that is
        being analyzed
    original_df : pandas.DataFrame
        Original dataframe from Analysis to be overwritten
    coordinates_df : pandas.DataFrame
        The filtered dataframe which was used to extract coordinates
        for clustering
    csv_path : str
        Directory where the CSV will be saved

    Returns
    -------
    clusters : numpy.array
        The array of cluster labels assigned to each conformer from
        the supplied array
    """
    from hdbscan import HDBSCAN

    coordinates = Clustering.fix_coordinates_shape(coordinates)
    clustering_method = HDBSCAN(cluster_selection_epsilon=self._bandwidth)
    clusters = clustering_method.fit_predict(coordinates)

    self._save_cluster_info(original_df, coordinates_df, clusters, csv_path)

    return clusters
def hdbscan_clustering(S, X, config):
    '''
    Computes HDBSCAN clustering from the input feature matrix X
    (the similarity matrix S is unused here).
    Returns the labels associated with the clustering.
    '''
    from hdbscan import HDBSCAN

    min_size = config.as_int("min_cluster_size")
    clf = HDBSCAN(min_cluster_size=min_size)
    return clf.fit_predict(X)
def clustering(df_low_dim, algorithm='KMeans', K_means_n_clusters=15, hdbscan_min_cluster_size=20):
    """
    Perform clustering on dimension-reduced data.
    Clustering algorithms can be KMeans or HDBSCAN.
    """
    if algorithm == 'HDBSCAN':
        clustering = HDBSCAN(min_cluster_size=hdbscan_min_cluster_size)
    else:
        clustering = KMeans(K_means_n_clusters)
    labels = clustering.fit_predict(df_low_dim)
    df_labels = pd.Series(['Cluster ' + str(x) for x in labels])
    return df_labels
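# Illustrative call on synthetic data (an assumption, for demonstration only; df_low_dim
# would normally come from a PCA/UMAP reduction step).
import numpy as np
import pandas as pd

df_low_dim = pd.DataFrame(np.random.rand(200, 2))
cluster_labels = clustering(df_low_dim, algorithm='HDBSCAN', hdbscan_min_cluster_size=10)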
def cluster_space_hdb(classes, vocab_vecs, vocab, min_cluster_size=5, metric="sqeuclidean", min_samples=None):
    cl = HDBSCAN(metric=metric, min_cluster_size=min_cluster_size,
                 min_samples=min_samples).fit(vocab_vecs)
    c_labels = {}
    vocab_labels = {}
    for i in range(len(vocab)):
        vocab_labels[vocab[i].text] = cl.labels_[i]
    for l in classes.keys():
        # Note: fit_predict re-fits the clusterer on each class's vectors
        c_labels[l] = cl.fit_predict(classes[l]["vecs"])
    return c_labels, vocab_labels
def cluster_data_using_hdbscan(points):
    """Cluster the given points with HDBSCAN.

    Args:
        points: a list of numpy arrays
    Returns an array of clusters."""
    dbscan = HDBSCAN(algorithm='best', alpha=1.0, approx_min_span_tree=True,
                     gen_min_span_tree=False, leaf_size=40, metric='euclidean',
                     min_cluster_size=5, min_samples=None, p=None)
    indexes = dbscan.fit_predict(points)
    number_of_clusters = len(set(dbscan.labels_)) - \
        (1 if -1 in dbscan.labels_ else 0)
    return create_clusters(number_of_clusters=number_of_clusters,
                           indexes=indexes)
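# create_clusters is not shown above. A hypothetical sketch of such a helper (an
# assumption, not the original): group point indices by their HDBSCAN label, dropping
# noise points (label -1).
def create_clusters(number_of_clusters, indexes):
    clusters = [[] for _ in range(number_of_clusters)]
    for point_index, label in enumerate(indexes):
        if label >= 0:
            clusters[label].append(point_index)
    return clusters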
def cluster_hdbscan(orig_ys, parameters, MemoryDir=None, classes=None):
    n_components, min_samples, min_cluster_size = parameters

    # Convert to np if not already
    orig_ys = np.array(orig_ys)
    orig_ys = orig_ys.astype('float64')

    # Load memory if needed
    if MemoryDir:
        savedMemory = Memory(MemoryDir + str(n_components) + "_" + str(min_samples) + "/")
    else:
        savedMemory = Memory(location=None, verbose=0)

    # PCA to desired dimensionality
    pca = PCA(n_components=n_components)
    ys = pca.fit_transform(orig_ys)

    # Cluster using hdbscan
    clusterer = HDBSCAN(min_cluster_size=min_cluster_size, min_samples=min_samples,
                        core_dist_n_jobs=-2, algorithm='boruvka_kdtree',
                        cluster_selection_method='eom', prediction_data=True,
                        memory=savedMemory)
    cluster_labels = clusterer.fit_predict(ys)
    outlier_scores = clusterer.outlier_scores_

    # Increase outlier score of outlier points by 1
    ys_idx = np.arange(len(cluster_labels))
    outlier_idx = ys_idx[cluster_labels == -1]
    outlier_scores[outlier_idx] += 1

    # Assign cluster labels to outlier points via soft clustering
    soft_cluster_labels = membership_vector(clusterer, ys[outlier_idx])
    weak_cluster_labels = np.argmax(soft_cluster_labels, -1)
    cluster_labels[outlier_idx] = weak_cluster_labels

    return cluster_labels, outlier_scores
def compute_clusters(pois, alg='dbscan', min_pts=None, eps=None, n_jobs=1):
    """Computes clusters using the DBSCAN or the HDBSCAN algorithm.

    Args:
        pois (GeoDataFrame): A POI GeoDataFrame.
        alg (string): The clustering algorithm to use (dbscan or hdbscan; default: dbscan).
        min_pts (integer): The minimum number of neighbors for a dense point.
        eps (float): The neighborhood radius.
        n_jobs (integer): Number of parallel jobs to run in the algorithm (default: 1).

    Returns:
        A GeoDataFrame containing the clustered POIs and their labels. The value of
        parameter `eps` for each cluster is also returned (it varies in the case of HDBSCAN).
    """
    # Prepare list of coordinates
    poi_list = [[p.x, p.y] for p in pois['geometry']]
    data_arr = np.array(poi_list)
    del poi_list[:]

    # Compute the clusters
    t0 = time()
    if alg == 'hdbscan':
        clusterer = HDBSCAN(min_cluster_size=min_pts, min_samples=min_pts,
                            core_dist_n_jobs=n_jobs)
        labels = clusterer.fit_predict(data_arr)
        num_of_clusters = len(set(labels))

        # Derive a per-cluster eps from the condensed tree (eps = 1 / lambda)
        tree = clusterer.condensed_tree_.to_pandas()
        cluster_tree = tree[tree.child_size > 1]
        chosen_clusters = clusterer.condensed_tree_._select_clusters()
        eps_per_cluster = cluster_tree[cluster_tree.child.isin(chosen_clusters)].\
            drop("parent", axis=1).drop("child", axis=1).reset_index().drop("index", axis=1)
        eps_per_cluster['lambda_val'] = eps_per_cluster['lambda_val'].apply(lambda x: 1 / x)
        eps_per_cluster.rename(columns={'lambda_val': 'eps', 'child_size': 'cluster_size'},
                               inplace=True)
    else:
        clusterer = DBSCAN(eps=eps, min_samples=min_pts, n_jobs=n_jobs).fit(data_arr)
        labels = clusterer.labels_
        num_of_clusters = len(set(labels))
        num_of_clusters_no_noise = set(labels)
        num_of_clusters_no_noise.discard(-1)
        num_of_clusters_no_noise = len(num_of_clusters_no_noise)
        eps_per_cluster = pd.DataFrame({'eps': [eps] * num_of_clusters_no_noise})
        eps_per_cluster['cluster_size'] = 0

    print("Done in %0.3fs." % (time() - t0))

    # Assign cluster labels to initial POIs
    pois['cluster_id'] = labels

    # Separate POIs that are inside clusters from those that are noise
    pois_in_clusters = pois.loc[pois['cluster_id'] > -1]
    pois_noise = pois.loc[pois['cluster_id'] == -1]

    print('Number of clusters: %d' % num_of_clusters)
    print('Number of clustered POIs: %d' % (len(pois_in_clusters)))
    print('Number of outlier POIs: %d' % (len(pois_noise)))

    return pois_in_clusters, eps_per_cluster
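# Illustrative usage (a sketch; the point coordinates and parameter values are assumptions,
# not taken from the original project).
import geopandas as gpd
import numpy as np
from shapely.geometry import Point

rng = np.random.default_rng(0)
pts = gpd.GeoDataFrame({'geometry': [Point(x, y) for x, y in rng.random((100, 2))]})
pois_in_clusters, eps_per_cluster = compute_clusters(pts, alg='hdbscan', min_pts=5)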
label=f"Cluster {i+1}") ax.scatter(*data_tsne_2d[clst_dbscan == -1].T, s=20, color=colorcycle[-1], label=f"No Cluster") ax.legend(loc='upper right') ax.set_xlabel("First t-SNE dimension") ax.set_ylabel("Second t-SNE dimension") fig.savefig( f"figs_script/tsne_dbscan_perp_30_colored_{n_to_plot}_clique_{plot_type_name}.pdf" ) # #### HDBScan hdbscan = HDBSCAN(min_cluster_size=5, core_dist_n_jobs=4) clst_hdbscan = hdbscan.fit_predict(clstData) print("Hdbscan clustering", pd.value_counts(clst_hdbscan), sep='\n') fig, ax = plt.subplots() for i in [0, 1]: ax.scatter(*data_tsne_2d[clst_hdbscan == i].T, s=20, color=colorcycle[i], label=f"Cluster {i+1}") ax.scatter(*data_tsne_2d[clst_hdbscan == -1].T, s=20, color=colorcycle[-1], label=f"No Cluster") ax.legend(loc='upper right') ax.set_xlabel("First t-SNE dimension") ax.set_ylabel("Second t-SNE dimension")
def fit(self, X, y=None, sample_weight=None):
    """X is a dataframe."""
    if self.method not in ("dbscan", "hdbscan", "spark"):
        raise ValueError("Unsupported method '%s'" % self.method)

    if not self.dbscan_params:
        self.dbscan_params = dict(
            min_samples=20, n_jobs=-1, algorithm='brute',
            metric=partial(distance_dataframe, X,
                           **dict(junction_dist=StringDistance(),
                                  correct=False, tol=0)))
    if not self.hdbscan_params and self.method == 'hdbscan':
        self.hdbscan_params = dict(
            min_samples=20, n_jobs=-1,
            metric=partial(distance_dataframe, X,
                           **dict(junction_dist=StringDistance(),
                                  correct=False, tol=0)))

    self.dbscan_params['eps'] = self.eps

    # new part: group by junction and v genes
    if self.method == 'hdbscan' and False:
        # no grouping; unsupported sample_weight
        groups_values = [[x] for x in np.arange(X.shape[0])]
    else:
        # list of lists
        groups_values = X.groupby(
            ["v_gene_set_str", self.model + "junc"]).groups.values()

    idxs = np.array([elem[0] for elem in groups_values])  # take one of them
    sample_weight = np.array([len(elem) for elem in groups_values])
    X_all = idxs.reshape(-1, 1)

    if self.kmeans_params.get('n_clusters', True):
        # ensure the number of clusters is higher than points
        self.kmeans_params['n_clusters'] = min(
            self.kmeans_params['n_clusters'], X_all.shape[0])
    kmeans = MiniBatchKMeans(**self.kmeans_params)

    lengths = X[self.model + 'junction_length'].values
    kmeans.fit(lengths[idxs].reshape(-1, 1))
    dbscan_labels = np.zeros_like(kmeans.labels_).ravel()

    if self.method == 'hdbscan':
        from hdbscan import HDBSCAN
        from hdbscan.prediction import all_points_membership_vectors
        dbscan_sk = HDBSCAN(**self.hdbscan_params)
    else:
        dbscan_sk = DBSCAN(**self.dbscan_params)

    if self.method == 'spark':
        from pyspark import SparkContext
        from icing.externals.pypardis import dbscan as dbpard
        sc = SparkContext.getOrCreate()
        sample_weight_map = dict(zip(idxs, sample_weight))
        # self.dbscan_params.pop('n_jobs', None)
        dbscan = dbpard.DBSCAN(dbscan_params=self.dbscan_params,
                               **self.dbspark_params)
    # else:

    for i, label in enumerate(np.unique(kmeans.labels_)):
        idx_row = np.where(kmeans.labels_ == label)[0]
        if self.verbose:
            print("Iteration %d/%d" % (i, np.unique(kmeans.labels_).size),
                  "(%d seqs)" % idx_row.size, end='\r')

        X_idx = idxs[idx_row].reshape(-1, 1).astype('float64')
        weights = sample_weight[idx_row]

        if idx_row.size == 1:
            db_labels = np.array([0])
        elif self.method == 'spark' and idx_row.size > 5000:
            test_data = sc.parallelize(enumerate(X_idx))
            dbscan.train(test_data, sample_weight=sample_weight_map)
            db_labels = np.array(dbscan.assignments())[:, 1]
        elif self.method == 'hdbscan':
            db_labels = dbscan_sk.fit_predict(X_idx)  # unsupported weights
            # avoid noise samples
            soft_clusters = all_points_membership_vectors(dbscan_sk)
            db_labels = np.array([np.argmax(x) for x in soft_clusters])
        else:
            db_labels = dbscan_sk.fit_predict(X_idx, sample_weight=weights)
            if len(dbscan_sk.core_sample_indices_) < 1:
                db_labels[:] = 0
            if -1 in db_labels:
                balltree = BallTree(
                    X_idx[dbscan_sk.core_sample_indices_],
                    metric=dbscan_sk.metric)
                noise_labels = balltree.query(
                    X_idx[db_labels == -1], k=1,
                    return_distance=False).ravel()
                # get labels for core points, then assign to noise points
                # based on balltree
                dbscan_noise_labels = db_labels[
                    dbscan_sk.core_sample_indices_][noise_labels]
                db_labels[db_labels == -1] = dbscan_noise_labels

        # hopefully, there are no noisy samples at this time
        db_labels[db_labels > -1] = db_labels[db_labels > -1] + np.max(dbscan_labels) + 1
        dbscan_labels[idx_row] = db_labels  # + np.max(dbscan_labels) + 1

    if self.method == 'spark':
        sc.stop()

    labels = dbscan_labels

    # new part: put together the labels
    labels_ext = np.zeros(X.shape[0], dtype=int)
    labels_ext[idxs] = labels
    for i, list_ in enumerate(groups_values):
        labels_ext[list_] = labels[i]
    self.labels_ = labels_ext
def Clustering(IL, mask, clustering_training_data):
    # ri_test = np.random.choice(range(len(IL)), size=np.int(IL.shape[0]/10))
    clusterer = HDBSCAN(min_cluster_size=1250, gen_min_span_tree=True)
    # hdb = clusterer.fit(clustering_training_data)
    IL.loc[mask, 'hdbscan_cluster'] = clusterer.fit_predict(IL)
mua = MUA(filename='S:/pcie.bin')
spk = mua.tospk()
fet = spk.tofet('pca')

# spike sort a channel centered spiking events
ch = 26
min_cluster_size = 5
leaf_size = 10
hdbcluster = HDBSCAN(min_cluster_size=min_cluster_size,
                     leaf_size=leaf_size,
                     gen_min_span_tree=True,
                     algorithm='boruvka_kdtree')
clu = hdbcluster.fit_predict(fet[ch])
print('get clusters', np.unique(clu))

#
from phy.gui import GUI, create_app, run_app
create_app()
gui = GUI(position=(400, 200), size=(600, 400))
scatter_view = view_scatter_3d()
scatter_view.attach(gui)
scatter_view.set_data(fet[ch], clu)

nclu = len(np.unique(clu))
view = View(layout='grid', shape=(3, nclu))
def fit_tsne(self, dataset, min_cluster_size=55, perplexity=40, n_iter=2500, learning_rate=700.0):
    self.corpus, self.orig = text_preprocessing(dataset, self.msg_column,
                                                self.min_msg_length, self.stop_words_set)
    logger.info("corpus length: " + str(len(self.corpus)))

    fname = \
        RESOURCE_DIR + '/tsne_' + \
        str(self.max_features) + '_' + \
        str(n_iter) + '_' + \
        str(perplexity).replace(".", "_") + '_' + \
        str(learning_rate).replace(".", "_")
    if not self.overwrite:
        fname = free_file_name(fname, 'pkl')
    else:
        fname = fname + '.pkl'
    logger.info("Using t-SNE model file: " + str(fname))

    if not os.path.isfile(fname):
        cv = CountVectorizer(max_features=self.max_features)
        self.X = cv.fit_transform(self.corpus).toarray()

        # Keep enough principal components to explain 85% of the variance
        pca_all = PCA()
        pca_all.fit_transform(self.X)
        ratio = 0.0
        n_components = 0
        while ratio < 0.85:
            ratio += pca_all.explained_variance_ratio_[n_components]
            n_components += 1
        pca = PCA(n_components)
        x_pca = pca.fit_transform(self.X)
        logger.info('Cumulative explained variation for principal components: {}'.format(
            np.sum(pca.explained_variance_ratio_)))

        tsne = TSNE(n_components=2, verbose=1, perplexity=perplexity,
                    n_iter=n_iter, learning_rate=learning_rate)
        self.tsne_results = tsne.fit_transform(x_pca)
        with open(fname, 'wb') as file:
            pickle.dump((cv, pca, self.tsne_results), file)
    else:
        with open(fname, 'rb') as file:
            cv, pca, self.tsne_results = pickle.load(file)
        self.X = cv.fit_transform(self.corpus).toarray()
        self.x_pca = pca.fit_transform(self.X)

    logger.info('X rows: ' + str(len(self.X)))
    logger.info('X cols: ' + str(len(self.X[0])))

    df = pd.DataFrame(columns=['X', 'Y'])
    df['X'] = self.tsne_results[:, 0]
    df['Y'] = self.tsne_results[:, 1]
    tsne_values = df.values

    # Cluster the 2-D t-SNE embedding with HDBSCAN
    clustering = HDBSCAN(min_cluster_size=min_cluster_size)
    self.y_pred = clustering.fit_predict(tsne_values)

    next_cluster_num = get_next_cluster_num()
    self.y = []
    filtered_values = []
    for i in range(len(self.y_pred)):
        if self.y_pred[i] < 0:
            continue
        filtered_values.append(tsne_values[i])
        self.y.append(int(self.y_pred[i]) + int(next_cluster_num))
    self.y = np.array(self.y, dtype='int')
    logger.info('Next cluster num: ' + str(next_cluster_num))

    clusters_codes = pd.DataFrame(self.y, columns=['cl'])['cl'].unique()
    self.n_clusters = len(clusters_codes)

    filtered_values = np.array(filtered_values)
    self.index_array = np.concatenate(
        (filtered_values, np.zeros((len(filtered_values), 1), dtype='float')), axis=1)
    logger.info('Cluster codes (' + str(self.n_clusters) + '):')
    logger.info(clusters_codes)
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    umap = UMAP(random_state=42)
    embedding = umap.fit_transform(dist)

print(embedding[:5])

# # plt.scatter(embedding[:,0], embedding[:,1])

# # HDBSCAN
hdbscan = HDBSCAN(min_cluster_size=8)
clustering = hdbscan.fit_predict(embedding)

# # Three clusters!
# # plt.scatter(embedding[:,0], embedding[:,1], c=clustering);

# # Titles from the first cluster (Fun)
titles_cluster = get_titles_from_cluster(0)

# # Titles from the second cluster (Fan)
titles_cluster = get_titles_from_cluster(1)

# # Titles from the third cluster (cinemagoer)
def getClusters(umap):
    hdbscan = HDBSCAN(min_cluster_size=5)
    clusters = hdbscan.fit_predict(umap)
    return clusters
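# Illustrative usage (a sketch on assumed synthetic data): reduce with UMAP first, then
# pass the 2-D embedding to getClusters.
import numpy as np
from umap import UMAP

X = np.random.rand(500, 20)
embedding = UMAP(n_components=2, random_state=42).fit_transform(X)
labels = getClusters(embedding)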
def return_hdbscansvm(df, txt_col, rf=False, clust_size=15, samp_size=5,
                      svmx=False, svmc=1000, clust_metric='braycurtis'):
    super_flat = pd.DataFrame(df)
    r = super_flat
    hdb = HDBSCAN(min_cluster_size=clust_size, min_samples=samp_size,
                  metric=clust_metric, cluster_selection_method='leaf')
    n = hdb.fit_predict(r)
    f = pd.get_dummies(n)
    with_cat = pd.Series([str(i) for i in list(zip(txt_col, n))], name='text')
    answers = pd.concat([with_cat, f], axis=1)
    answers = [answers, n, hdb]

    if svmx == True:
        ans_ = pd.concat([pd.DataFrame(txt_col), f], axis=1)
        ans_ = ans_[ans_[-1] == 0]
        ans_ = ans_.drop(-1, axis=1)
        ans_ = ans_.melt(id_vars='text', var_name='cluster3', value_name='value')
        ans_ = ans_[ans_['value'] == 1]
        ans_ = ans_.drop('value', axis=1)
        ans_svm = ans_[ans_['cluster3'] > -1]
        for_x = pd.concat([pd.DataFrame(txt_col), pd.DataFrame(r)], axis=1)
        authorssvm = for_x.merge(ans_svm, left_on='text', right_on='text', how='right')
        y = list(authorssvm['cluster3'])
        X = authorssvm.drop(['cluster3', 'text'], axis=1)
        X = X.fillna(0)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
        clf = svm.SVC(C=svmc, kernel='rbf', gamma=0.7, random_state=12)
        print('done')
        clf.fit(X_train, y_train)
        print('done')
        y_pred = clf.predict(X_test)
        print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
        nsvm = clf.predict(r)
        f = pd.get_dummies(nsvm)
        with_catsvm = pd.Series([str(i) for i in list(zip(txt_col, nsvm))], name='text')
        answerssvm = pd.concat([with_catsvm, f], axis=1)
        answers = [answerssvm, nsvm, hdb]

    if rf == True:
        ans_ = pd.concat([pd.DataFrame(txt_col), f], axis=1)
        ans_ = ans_[ans_[-1] == 0]
        ans_ = ans_.drop(-1, axis=1)
        ans_ = ans_.melt(id_vars='text', var_name='cluster3', value_name='value')
        ans_ = ans_[ans_['value'] == 1]
        ans_ = ans_.drop('value', axis=1)
        ans_svm = ans_[ans_['cluster3'] > -1]
        for_x = pd.concat([pd.DataFrame(txt_col), pd.DataFrame(r)], axis=1)
        authorssvm = for_x.merge(ans_svm, left_on='text', right_on='text', how='right')
        y = list(authorssvm['cluster3'])
        X = authorssvm.drop(['cluster3', 'text'], axis=1)
        X = X.fillna(0)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
        clf = RandomForestClassifier(min_samples_split=50, max_depth=10, random_state=12)
        print('done')
        clf.fit(X_train, y_train)
        print('done')
        y_pred = clf.predict(X_test)
        print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
        nsvm = clf.predict(r)
        f = pd.get_dummies(nsvm)
        with_catsvm = pd.Series([str(i) for i in list(zip(txt_col, nsvm))], name='text')
        answerssvm = pd.concat([with_catsvm, f], axis=1)
        answers = [answerssvm, nsvm, hdb]

    return answers
def __run_hdbscan(dataset, eps, min_cluster_size, min_samples, algorithm):
    clusterer = HDBSCAN(cluster_selection_epsilon=eps,
                        min_cluster_size=min_cluster_size,
                        min_samples=min_samples,
                        algorithm=algorithm)
    return clusterer, clusterer.fit_predict(dataset)
""" hdbscan """ hdbscan = HDBSCAN(min_cluster_size=1375) best_model = hdbscan predictions = hdbscan.fit_predict(tracks_df) print(len(set(predictions))) visualize_clusters(data=tracks_df, predictions=predictions, n_cluster=len(set(hdbscan.labels_)), stochastic=True) best_k = len(set(hdbscan.labels_)) print("len of tracks_df: %d" % len(tracks_df)) """ explain track clusters with original afs """ statement = "SELECT track_id, danceability, energy, speechiness, acousticness, instrumentalness, tempo, valence, liveness FROM acoustic_features" af_df = pd.read_sql(sql=statement, con=engine).set_index("track_id") scaler = MinMaxScaler() af_df = pd.DataFrame(scaler.fit_transform(af_df), columns=af_df.columns, index=af_df.index) df = tracks_df.merge(af_df, left_index=True, right_index=True)
def __init__(self, pois, alg="hdbscan", min_samples=None, eps=None, n_jobs=-1, **kwargs):
    """Computes clusters using the sklearn algorithms or HDBSCAN.

    Parameters:
        pois (GeoDataFrame): A POI GeoDataFrame.
        alg (string): The clustering algorithm to use (hdbscan, dbscan or optics;
            default: hdbscan).
        min_samples (float|integer): The number of samples in a neighborhood for a point
            to be considered as a core point. Expressed as an absolute number (int > 1)
            or a fraction of the number of samples (float between 0 and 1).
        eps (float): The neighborhood radius (used only in dbscan).
        n_jobs (integer): Number of parallel jobs to run in the algorithm (default: -1).
        **kwargs: Optional arguments depending on the algorithm.
    """
    if min_samples is None:
        min_samples = int(round(np.log(len(pois))))
    if alg == 'dbscan':
        assert eps is not None

    self.pois = pois
    self.alg = alg
    self.min_samples = min_samples
    self.eps = eps
    self.n_jobs = n_jobs

    # Prepare the array of coordinates
    data_arr = pois.geometry.get_coordinates().values

    # Compute the clusters
    if alg == 'hdbscan':
        min_cluster_size = kwargs.pop('min_cluster_size', 50)
        core_dist_n_jobs = kwargs.pop('core_dist_n_jobs', n_jobs)
        clusterer = HDBSCAN(min_cluster_size=min_cluster_size,
                            min_samples=min_samples,
                            core_dist_n_jobs=core_dist_n_jobs,
                            **kwargs)
        labels = clusterer.fit_predict(data_arr)

        # Derive a per-cluster eps from the condensed tree (eps = 1 / lambda)
        tree = clusterer.condensed_tree_.to_pandas()
        cluster_tree = tree[tree.child_size > 1]
        chosen_clusters = clusterer.condensed_tree_._select_clusters()
        eps_per_cluster = cluster_tree[cluster_tree.child.isin(chosen_clusters)].\
            drop("parent", axis=1).drop("child", axis=1).reset_index().drop("index", axis=1)
        eps_per_cluster['lambda_val'] = eps_per_cluster['lambda_val'].apply(lambda x: 1 / x)
        eps_per_cluster.rename(columns={'lambda_val': 'eps', 'child_size': 'cluster_size'},
                               inplace=True)
    else:
        if alg == 'dbscan':
            clusterer = DBSCAN(eps=eps, min_samples=min_samples,
                               n_jobs=n_jobs, **kwargs).fit(data_arr)
        elif alg == 'optics':
            clusterer = OPTICS(min_samples=min_samples, eps=eps,
                               n_jobs=n_jobs, **kwargs).fit(data_arr)
        else:
            raise Exception('Implemented algorithms are hdbscan, dbscan and optics.')
        labels = clusterer.labels_
        num_of_clusters_no_noise = set(labels)
        num_of_clusters_no_noise.discard(-1)
        num_of_clusters_no_noise = len(num_of_clusters_no_noise)
        eps_per_cluster = pd.DataFrame({'eps': [eps] * num_of_clusters_no_noise})
        eps_per_cluster['cluster_size'] = 0

    # Assign cluster labels to initial POIs
    pois['cluster_id'] = labels

    # Separate POIs that are inside clusters from those that are noise
    pois_in_clusters = pois[pois.cluster_id > -1]
    pois_noise = pois[pois.cluster_id == -1]

    self._num_of_clusters = len(set(labels))
    self._pois_in_clusters = pois_in_clusters
    self._eps_per_cluster = eps_per_cluster
    self._pois_noise = pois_noise
    self._shape_type = None
X = x.loc[~missing, variables]
pd.plotting.scatter_matrix(X)

# Visualize using MDS, as we use a distance-based clustering method.
# https://datascience.stackexchange.com/questions/22/k-means-clustering-for-mixed-numeric-and-categorical-data
Z = scale(X.astype(float))
scale_dist = dist(Z, "braycurtis")
mds_scale = MDS(n_components=2, dissimilarity="precomputed", max_iter=1000)
coords_scale = mds_scale.fit_transform(scale_dist)
plt.scatter(*coords_scale.T)

# Run HDBSCAN* over precomputed distances.
clusterer = HDBSCAN(min_cluster_size=50, metric="precomputed")
labels = clusterer.fit_predict(scale_dist)

# Cursory diagnostics.
Counter(labels)
clusterer.cluster_persistence_
np.mean(clusterer.probabilities_[labels != -1])

fig, axes = plt.subplots(1, 2)
ax1, ax2 = axes
scatter(*coords_scale.T, labels=labels + 1, ax=ax1)
ax1.legend()
ax1.set_title("Clusters")
ax2.hist(clusterer.probabilities_[labels != -1], bins=60, density=True)
ax2.set_title("Probability of belonging to assigned cluster")
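# The dist helper above is not defined in this excerpt. A plausible sketch (an assumption,
# not the original code): a square Bray-Curtis distance matrix built with SciPy, suitable
# for metric="precomputed".
from scipy.spatial.distance import pdist, squareform

def dist(Z, metric):
    return squareform(pdist(Z, metric=metric))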
def fit_som(self, dataset, som_threshold=0.5, som_size=100, som_sigma=1.0, som_learning_rate=0.5):
    self.corpus, self.orig = text_preprocessing(dataset, self.msg_column,
                                                self.min_msg_length, self.stop_words_set)
    logger.info("corpus length: " + str(len(self.corpus)))

    fname = \
        RESOURCE_DIR + '/som_' + \
        str(self.max_features) + '_' + \
        str(som_size) + '_' + \
        str(som_sigma).replace(".", "_") + '_' + \
        str(som_learning_rate).replace(".", "_")
    if not self.overwrite:
        fname = free_file_name(fname, 'pkl')
    else:
        fname = fname + '.pkl'
    logger.info("Using SOM model file: " + str(fname))

    if not os.path.isfile(fname):
        cv = CountVectorizer(max_features=self.max_features)
        self.X = cv.fit_transform(self.corpus).toarray()
        self.sc = MinMaxScaler(feature_range=(0, 1))
        self.X_scale = self.sc.fit_transform(self.X)
        self.som = MiniSom(x=som_size, y=som_size, input_len=self.max_features,
                           sigma=som_sigma, learning_rate=som_learning_rate)
        self.som.train_batch(data=self.X_scale, num_iteration=len(self.X_scale))
        with open(fname, 'wb') as file:
            pickle.dump((cv, self.sc, self.som), file)
    else:
        with open(fname, 'rb') as file:
            cv, self.sc, self.som = pickle.load(file)
        self.X = cv.fit_transform(self.corpus).toarray()
        self.X_scale = self.sc.fit_transform(self.X)

    logger.info('X rows: ' + str(len(self.X)))
    logger.info('X cols: ' + str(len(self.X[0])))

    # Collect the coordinates of SOM cells whose distance is below the threshold
    distance_map = self.som.distance_map()
    indexes_coords = []
    indexes_dist = []
    for i in range(len(distance_map)):
        for j in range(len(distance_map[i])):
            if distance_map[i, j] < som_threshold:
                indexes_coords.append(i)
                indexes_coords.append(j)
                indexes_dist.append(distance_map[i, j])
    coord_array = np.array(indexes_coords).reshape(int(len(indexes_coords) / 2), 2)
    dist_array = np.array(indexes_dist).reshape(int(len(indexes_dist)), 1)

    # Cluster the selected SOM cells with HDBSCAN
    clustering = HDBSCAN(min_cluster_size=5)
    y_pred = clustering.fit_predict(coord_array)

    next_cluster_num = get_next_cluster_num()
    self.y = []
    filtered_coords = []
    filtered_dists = []
    for i in range(len(y_pred)):
        if y_pred[i] < 0:
            continue
        filtered_coords.append(coord_array[i])
        filtered_dists.append(dist_array[i])
        self.y.append(int(y_pred[i]) + int(next_cluster_num))
    self.y = np.array(self.y, dtype='int')
    logger.info('Next cluster num: ' + str(next_cluster_num))

    clusters_codes = pd.DataFrame(self.y, columns=['cl'])['cl'].unique()
    self.n_clusters = len(clusters_codes)

    coord_array = np.array(filtered_coords)
    dist_array = np.array(filtered_dists)
    self.index_array = np.concatenate((coord_array, dist_array), axis=1)
    logger.info('Cluster codes (' + str(self.n_clusters) + '):')
    logger.info(clusters_codes)