def cluster_embeddings_hdbscan(sentences, sentence_embeddings):
    clusterer = hdbscan.HDBSCAN(min_cluster_size=2, min_samples=None)
    clusterer.fit(sentence_embeddings)
    summary_df = pd.DataFrame(
        data={
            "position": range(len(sentences)),
            "sentence": sentences,
            "embedding": sentence_embeddings.tolist(),
            "cluster": clusterer.labels_
        })
    return summary_df, clusterer.exemplars_
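# A minimal usage sketch for cluster_embeddings_hdbscan above. The two tight blobs of
# random vectors stand in for real sentence embeddings (hypothetical data, not from the
# original source); hdbscan and pandas are assumed to be imported at module level.
import numpy as np

example_sentences = [f"sentence {i}" for i in range(6)]
rng = np.random.RandomState(0)
example_embeddings = np.vstack([rng.normal(0, 0.05, (3, 8)),
                                rng.normal(5, 0.05, (3, 8))])
example_summary, example_exemplars = cluster_embeddings_hdbscan(example_sentences,
                                                                example_embeddings)
print(example_summary[["sentence", "cluster"]])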
def hdbscan_cluster(shifted_pcd, valid, min_cluster_size=50):
    clustered_ins_ids = np.zeros(shifted_pcd.shape[0], dtype=np.int32)
    valid_shifts = shifted_pcd[valid, :].reshape(-1, 3)
    if valid_shifts.shape[0] == 0:
        return clustered_ins_ids
    cluster = hdbscan.HDBSCAN(
        min_cluster_size=min_cluster_size,
        allow_single_cluster=True
    ).fit(valid_shifts)
    instance_labels = cluster.labels_
    instance_labels += (-instance_labels.min() + 1)
    clustered_ins_ids[valid] = instance_labels
    return clustered_ins_ids
def estimate(self, mutations_data, **kwargs):
    try:
        precomputed = squareform(
            pdist(mutations_data, 'cityblock') / mutations_data.shape[1])
    except BaseException:
        logger.exception(
            'Problem with mutations_data: {}'.format(mutations_data))
        raise RuntimeError('Problem in computing distances for hdbscan.')
    kwargs.pop('metric', None)
    model = hdbscan.HDBSCAN(metric='precomputed',
                            **merge_dicts(default_parameters, kwargs))
    model.fit(precomputed)
    return pd.Series(model.labels_ + 1, index=mutations_data.index)
def apply_hdbscan(data, min_cluster_size, min_samples):
    clusterer = hdbscan.HDBSCAN(
        min_cluster_size=min_cluster_size,
        min_samples=min_samples,
        memory='./cache')
    start = time.time()
    log.info("Fitting...")
    clusterer.fit(data)
    end = time.time()
    log.info('HDBSCAN finished in %f seconds', (end - start))
    n_classes = len(np.unique(clusterer.labels_))
    n_outliers = list(clusterer.labels_).count(-1)
    log.info("Found %d clusters (excluding noise) and %d outliers",
             n_classes - 1, n_outliers)
    return clusterer.labels_, n_classes
def hdbscan_clustering(self, min_samples_scaling=0.5):
    # Feature scaling.
    X = self.feature_scaling()
    min_cluster_size = max(
        [self.global_min_samples, int(0.005 * X.shape[0])])
    hdbcl = hdbscan.HDBSCAN(min_cluster_size=min_cluster_size,
                            min_samples=int(min_samples_scaling * min_cluster_size))
    hdbresult = hdbcl.fit(X)
    self.table['cluster'] = hdbresult.labels_
def apply_hdbscan(features):
    import hdbscan
    from sklearn.metrics import pairwise_distances

    distance = pairwise_distances(features, metric='cosine')
    hdb = hdbscan.HDBSCAN(min_cluster_size=2, metric='precomputed')
    hdb.fit(distance.astype('float64'))

    # Clustering results: number of clusters in pred_labels, ignoring noise (-1) if present.
    pred_labels = hdb.labels_
    n_clusters_ = len(set(pred_labels)) - (1 if -1 in pred_labels else 0)
    n_noise_ = list(pred_labels).count(-1)
    return pred_labels, n_clusters_, n_noise_
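# A minimal usage sketch for apply_hdbscan above, with toy feature vectors standing in
# for real data (hypothetical). The cosine distance matrix is computed inside the
# function, so only a 2-D float array is needed here.
import numpy as np

rng = np.random.RandomState(1)
toy_features = np.vstack([rng.normal(0, 0.1, (5, 16)) + 1.0,
                          rng.normal(0, 0.1, (5, 16)) - 1.0])
pred_labels, n_clusters, n_noise = apply_hdbscan(toy_features)
print(n_clusters, "clusters,", n_noise, "noise points")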
def __init__(self, Basis, window_sz, overlap, min_cluster_size):
    self.basis = Basis
    self.window_sz = window_sz
    self.overlap = overlap
    self.inds = []
    self.koop_list = []
    self.koopman_feature_array = []
    self.labels = []
    self.koop_cluster_list = []
    self.koop_cluster_memb_prob_list = []
    self.koopman_hybrid_modes = []
    self.min_cluster_size = min_cluster_size
    self.clusterer = hdbscan.HDBSCAN(min_cluster_size=self.min_cluster_size,
                                     metric='euclidean')
def cluster(distances):
    # Perform initial clustering
    distances = np.array(distances).reshape(-1, 1)
    clusterer = hdbscan.HDBSCAN(
        allow_single_cluster=True,
        prediction_data=True,
        min_cluster_size=5,
    )
    # labels = clusterer.fit(xmap_embedding.astype(np.float64)).labels_
    clusterer.fit(distances.astype(np.float64))
    return clusterer.labels_
def dbscan(self):
    """
    Train HDBSCAN on the input dataframe and save the fitted model.
    """
    df = self.data.copy()
    hdb = hdbscan.HDBSCAN(min_cluster_size=16000,
                          min_samples=5,
                          prediction_data=True).fit(df)
    joblib.dump(hdb, 'ad/hdbscan')
    self.data['Category'] = hdb.labels_  # Store the labels in the Category field
def dbscan(self):
    self.cluster_obj = hdbscan.HDBSCAN(
        cluster_selection_epsilon=self.dbs_eps,
        min_cluster_size=self.dbs_min_samples,
        prediction_data=True)
    self.cluster_obj.fit(self.X)

    def dbscan_predict(X_test):
        test_labels, strengths = hdbscan.approximate_predict(
            self.cluster_obj, X_test)
        return test_labels

    self.cluster_obj.predict = dbscan_predict
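# A standalone sketch of the same pattern used in the method above: fit HDBSCAN with
# prediction_data=True, then label unseen points with hdbscan.approximate_predict
# (toy data, illustrative only).
import hdbscan
import numpy as np

rng = np.random.RandomState(42)
X_train = np.vstack([rng.normal(0, 0.3, (50, 2)), rng.normal(5, 0.3, (50, 2))])
model = hdbscan.HDBSCAN(min_cluster_size=10, prediction_data=True).fit(X_train)

X_new = np.array([[0.1, -0.2], [5.2, 4.9]])
new_labels, strengths = hdbscan.approximate_predict(model, X_new)
print(new_labels, strengths)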
def cluster(self, divisor=30, cluster_selection_epsilon=0, metric='euclidean',
            algorithm='best', n_components=10):
    # HDBSCAN
    hdb = hdbscan.HDBSCAN(min_cluster_size=int(np.floor(len(self.clusterdf) / divisor)),
                          min_samples=1,
                          cluster_selection_method='eom',
                          cluster_selection_epsilon=cluster_selection_epsilon,
                          metric=metric,
                          algorithm=algorithm)

    # Normalize data
    cols = (self.unitscol + self.traitscol + self.traitsnumunitcol
            + self.chosenunitpivotcol + self.chosentraitpivotcol)
    data = self.clusterdf[cols].fillna(0)
    norm_data = normalize(data, norm='l2')

    # Cannot make dimension reduction work
    # reducer = umap.UMAP(metric='manhattan', random_state=42, n_components=n_components)
    # embed = reducer.fit_transform(norm_data)
    embed = norm_data
    # print(cols)

    # Cluster with HDBSCAN
    print('HDB Scan')
    clusterer = hdb.fit(embed)
    try:
        self.plot = clusterer.condensed_tree_.plot(select_clusters=True, label_clusters=True)
    except Exception:
        pass

    self.clusterdf['hdbnumber'] = pd.Series(hdb.labels_ + 1, index=self.clusterdf.index)
    self.clusterdf['comp_id'] = (self.clusterdf['participants.placement'].apply(str)
                                 + self.clusterdf['match_id'])
    # print(self.clusterdf['hdbnumber'].value_counts())

    # Get top 2 traits per cluster
    traitsaveragedf = self.clusterdf.fillna(0).groupby('hdbnumber')[list(self.traitscol)].mean()
    commontraitslist = traitsaveragedf.apply(lambda s: s.abs().nlargest(2).index.to_list(), axis=1)
    self.commontraits = commontraitslist.agg(lambda x: ' '.join(map(str, x)))
    self.commontraits[0] = 'No Comp'

    self.clusterdf = self.clusterdf.merge(pd.DataFrame({'hdb': self.commontraits}),
                                          left_on='hdbnumber', right_on='hdbnumber')
    self.traitshdb = self.traitsdf.merge(self.clusterdf)[list(self.traitsdf.columns) + ['hdb', 'comp_id']]
    self.unitshdb = self.unitsdf.merge(
        self.clusterdf[['match_id', 'participants.placement', 'comp_id', 'hdb']]
    )[list(self.unitsdf.columns) + ['hdb', 'comp_id']]
    self.itemshdb = self.itemsdf.merge(self.clusterdf)[list(self.itemsdf.columns) + ['hdb', 'comp_id']]

    comppop = pd.DataFrame(self.clusterdf.groupby('match_id')['hdb'].value_counts().rename('compsinmatch'))
    self.clusterdf = pd.merge(self.clusterdf, comppop, left_on=['match_id', 'hdb'], right_index=True)
def get_umap_subsets(self, nn=100, md=0.1, **kwargs):
    """ Get the names and indices of the UMAP-defined subsets """
    # First get the UMAP results:
    results = Table.read(
        "../data/dimred_results/apogee_rc_dimred_hyperparametertest.fits")
    self.Xu = results["X_umap_euclidean_nn" + str(nn) + "_md" + str(md)]
    self.Yu = results["Y_umap_euclidean_nn" + str(nn) + "_md" + str(md)]

    # Now run HDBSCAN to define the subsets
    import hdbscan
    clusterer = hdbscan.HDBSCAN(**kwargs)
    clusterer.fit(np.vstack((self.Xu, self.Yu)).T)
    self.classcol = clusterer.labels_
    self.classprob = clusterer.probabilities_
    self.subsets = np.unique(clusterer.labels_)
    # self.classcol = np.char.rstrip(self.data["tsne_class_teffcut40"], b' ')  # .decode('utf8').strip()
    # self.subsets = ["thin", "thick1", "thick2", "thick3", "thick4",
    #                 "mpthin", "mpthintrans", "smr", "t4trans", "youngthin",
    #                 "debris1", "debris2", "debris3", "debris4", "debris5",
    #                 "smr2", "t2trans1", "highTi", "lowMg", "highAlMg?"]
    self.names = [
        "", "", "", "", "", "", "Transition group", "", "",
        "Young local disc", "", "", "[s/Fe]-enhanced", "", "", r"",
        "Debris candidate", r"Extreme-Ti star", r"Low-[Mg/Fe] star",
        "High-[Al/Mg] star"
    ]
    self.Xcoords = [10, 11, 4.5, -12, 18, -31, 22, 26, -22.5, -14, -2, -25]
    self.Ycoords = [5.5, .5, -2, -4, 6, 0, 1.5, -.5, -7, -2, -6, 14]
    self.fsize = [20, 16, 12, 12, 15, 13, 11, 11, 11, 11, 11, 11]
    self.sym = [
        "o", "v", "^", ">", "<", "s", "o", "*", "<", "o", "h", "d",
        "D", "v", "p", "*", "D", "p", "s", "8"
    ]
    self.al = [
        .6, .8, .8, .8, .8, .8, .8, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1
    ]
    self.lw = [
        0, .5, .5, .5, .5, .5, .5, .5, .5, .5, .5, .5, .5, .5, .5, .5,
        .5, .5, .5, .5, .5
    ]
    self.size = [
        7, 12, 12, 12, 12, 15, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18,
        18, 18, 18, 18, 18, 18
    ]
    self.col = [
        "grey", "m", "hotpink", "crimson", "r", "g", "brown", "orange",
        "gold", "k", "yellow", "gold", "lime", "k", "royalblue"
    ]
def obtain_hard_clusters(points, thresholds):
    # Expects points as B * ? * E and thresholds as a list.
    # Returns single-linkage labels of shape (B, len(thresholds), ?).
    clusterer = hdbscan.HDBSCAN(min_samples=1, approx_min_span_tree=False)
    labels_list = []
    for batch_id in tqdm(range(points.shape[0]), desc="Processing batches", leave=False):
        clusterer.fit(points[batch_id].reshape(-1, points.shape[-1]))
        labels_batch = []
        for threshold in tqdm(thresholds, desc="Cutting at thresholds", leave=False):
            labels_batch_threshold = clusterer.single_linkage_tree_.get_clusters(
                cut_distance=threshold, min_cluster_size=1)
            labels_batch.append(labels_batch_threshold.reshape(*points.shape[1:-1]))
        labels_list.append(labels_batch)
        tqdm.write("Done with hard clustering batch {}".format(batch_id))
    return np.array(labels_list)
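# A standalone sketch of the pattern used above: fit HDBSCAN once, then cut its
# single-linkage tree at several distances to obtain hard labels (toy data,
# illustrative only).
import hdbscan
import numpy as np

rng = np.random.RandomState(0)
pts = np.vstack([rng.normal(0, 0.2, (20, 3)), rng.normal(4, 0.2, (20, 3))])
slt_clusterer = hdbscan.HDBSCAN(min_samples=1, approx_min_span_tree=False).fit(pts)
for cut in (0.5, 2.0):
    cut_labels = slt_clusterer.single_linkage_tree_.get_clusters(cut_distance=cut,
                                                                 min_cluster_size=1)
    print(cut, np.unique(cut_labels))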
def cluster_locations(df, **kwargs):
    """ Cluster locations with HDBSCAN

    Args:
        df (pandas.DataFrame): RADAR android_phone_location dataframe
        min_samples (int): Minimum number of samples to form a cluster.
            Default = N/20
        kwargs: Keyword arguments to pass to HDBSCAN

    Returns:
        np.ndarray[int]: Cluster labels
    """
    min_samples = kwargs.pop('min_samples', 1 + len(df) // 20)
    clusterer = hdbscan.HDBSCAN(metric='haversine', min_samples=min_samples, **kwargs)
    clusterer.fit(df[['latitude', 'longitude']])
    return clusterer.labels_
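# Assumption worth flagging: hdbscan's 'haversine' metric, like scikit-learn's, expects
# latitude/longitude in radians rather than degrees. A minimal usage sketch with a
# hypothetical dataframe, converting from degrees before clustering:
import numpy as np
import pandas as pd

group_a = np.radians([[52.37 + 0.001 * i, 4.89 + 0.001 * i] for i in range(10)])
group_b = np.radians([[48.85 + 0.001 * i, 2.35 + 0.001 * i] for i in range(10)])
locations = pd.DataFrame(np.vstack([group_a, group_b]),
                         columns=["latitude", "longitude"])
print(cluster_locations(locations))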
def centroids(paths, min_cluster_size=5):
    # distances = euclidean_distances(paths)
    # distances = cdist(paths, paths, 'euclidean')
    clusterer = hdbscan.HDBSCAN(min_cluster_size=min_cluster_size)
    cluster_labels = clusterer.fit_predict(paths)
    num_clusters = len(set(cluster_labels)) - (1 if -1 in cluster_labels else 0)
    unique_labels = set(cluster_labels)
    clusters = [[] for n in range(num_clusters)]
    logging.info('Number of clusters: %s', num_clusters)
    for i, v in enumerate(paths):
        if cluster_labels[i] != -1:
            clusters[cluster_labels[i]].append(v)
    return clusters
def run_hdbscan(self, min_samples=10, min_cluster_size=500):
    """Apply HDBSCAN clustering on the data after UMAP has been applied."""
    print('Applying HDBSCAN clustering...')
    try:
        self.labels = hdbscan.HDBSCAN(
            min_samples=min_samples,
            min_cluster_size=min_cluster_size).fit_predict(
                self.clusterable_embedding)
    except AttributeError:
        raise Exception(
            "Please execute 'retrieve_prediction_explanations', "
            "'get_strength_per_feature_cols' and 'run_umap' methods first.")
def HDBScan_create_clusters(self, dist_data, transaction_data, min_members):
    # File pointer for the cluster details file
    cluster_op_file = open("./cluster_details_70000_" + "_" + str(min_members) + "_HDBSCAN.txt",
                           mode='w', encoding='utf-8')

    # Run the HDBSCAN algorithm on the precomputed distance matrix
    db = hdbscan.HDBSCAN(min_cluster_size=min_members, metric='precomputed').fit(dist_data)
    # core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
    # core_samples_mask[db.core_sample_indices_] = True
    labels = db.labels_

    # Number of clusters in labels, ignoring noise if present.
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
    print('Estimated number of clusters: %d' % n_clusters_)
    # clusters = [data[labels == i] for i in range(n_clusters_)]

    # Note: HDBSCAN does not expose cluster_centers_ (unlike KMeans), so cluster
    # centers are not stored here.
    # with open('ClusterCenters.p', 'wb') as fp:
    #     pickle.dump(db.cluster_centers_, fp)

    # Count the number of elements in each cluster
    print(Counter(labels))
    cluster_op_file.write(str(Counter(labels)))
    cluster_op_file.write("\n")

    # Dictionary of clusters containing transaction numbers
    cluster_dict = {}
    # col_list = list(dist_data.columns.values)
    with open('./keys_70000.p', 'rb') as fp:
        col_list = pickle.load(fp)
    for j in range(len(labels)):
        if labels[j] != -1:
            if labels[j] in cluster_dict.keys():
                cluster_dict[labels[j]].append(col_list[j])
            else:
                cluster_dict[labels[j]] = [col_list[j]]
    print(cluster_dict)
    self.cluster_output(transaction_data, cluster_dict, cluster_op_file)
def run_hdbscan(dataframe, min_size, month, state):
    # Compute HDBSCAN
    hdb_t1 = time.time()
    hdb = hdbscan.HDBSCAN(min_cluster_size=min_size).fit(dataframe.to_numpy())
    hdb_labels = hdb.labels_
    hdb_elapsed_time = time.time() - hdb_t1

    # Number of clusters in labels, ignoring noise if present.
    n_clusters_hdb_ = len(set(hdb_labels)) - (1 if -1 in hdb_labels else 0)

    print('\n\n++ HDBSCAN Results')
    print('Estimated number of clusters: %d' % n_clusters_hdb_)
    print('Elapsed time to cluster: %.4f s' % hdb_elapsed_time)

    return hdb_labels, n_clusters_hdb_
def _cluster_encoded_trees(self, encoded_trees, scores=None, cluster_method='hdbscan',
                           min_samples=5, min_cluster_size=5):
    if not encoded_trees:
        return []
    res = []
    if 'hdbscan' == cluster_method:
        clusterer = hdbscan.HDBSCAN(min_cluster_size=min_cluster_size,
                                    min_samples=min_samples,
                                    gen_min_span_tree=False)
        clusterer.fit(encoded_trees)
        res = clusterer.labels_
        # Non-clustered inputs have label -1; make them appear as individual clusters.
        max_cluster = np.amax(res)
        for i in range(len(res)):
            if res[i] == -1:
                max_cluster += 1
                res[i] = max_cluster
    elif 'kmeans' == cluster_method:
        # Seems to be very slow.
        for cluster_size in range(len(encoded_trees)):
            kmeans = cluster.KMeans(n_clusters=cluster_size + 1).fit(encoded_trees)
            if kmeans.inertia_ < 1:
                break
        res = kmeans.labels_
    else:
        raise Exception(
            'Fatal error: cluster_method={} is not recognized.'.format(cluster_method))
    res = [int(i) for i in res]

    if scores is not None:
        if len(scores) != len(res):
            raise Exception(
                'Fatal error: length of score ({}) and smiles ({}) are different.'
                .format(len(scores), len(res)))
        best_cluster_score = {}
        for cluster_id, score in zip(res, scores):
            best_cluster_score[cluster_id] = max(
                best_cluster_score.get(cluster_id, -float('inf')), score)
        new_order = list(sorted(best_cluster_score.items(), key=lambda x: -x[1]))
        order_mapping = {new_order[n][0]: n for n in range(len(new_order))}
        res = [order_mapping[n] for n in res]
    return res
def hdbscan_opt_param(X, Y, tres=0.9):
    params, prec = [], []
    for p in range(20, 60, 5):
        for e in np.arange(0.1, 1.0, 0.2):
            for c in range(10, 60, 10):
                # HDBSCAN model and parameters
                e = round(e, 1)
                hdbscan_clust = hdbscan.HDBSCAN(min_samples=p,
                                                cluster_selection_epsilon=float(e),
                                                min_cluster_size=c)
                param = [e, p, c]
                params.append(param)

                # Compute HDBSCAN and save the outlier scores
                HDBS_clustering = hdbscan_clust.fit(X)
                labels = np.array(HDBS_clustering.outlier_scores_)
                labels = pd.DataFrame(labels, columns=['outliers_scores_HDBSCAN'])
                threshold = pd.Series(HDBS_clustering.outlier_scores_).quantile(tres)

                # Compute precision against the ground-truth labels
                original = [Y[Y == 1.0].index, Y[Y == 0.0].index]
                result = [labels[labels['outliers_scores_HDBSCAN'] > threshold].index,
                          labels[labels['outliers_scores_HDBSCAN'] <= threshold].index]
                tp = original[0].intersection(result[0]).size
                tn = original[1].intersection(result[1]).size
                fp = original[1].intersection(result[0]).size
                fn = original[0].intersection(result[1]).size
                if (tp + fp) == 0:
                    precision = 0
                else:
                    precision = tp / (tp + fp)
                prec.append(precision)

    max_p = max(prec)
    eps = params[prec.index(max_p)][0]
    minpt = params[prec.index(max_p)][1]
    mincl = params[prec.index(max_p)][2]
    print(" - [HDBSCAN optimal parameters] : max precision at - ",
          params[prec.index(max_p)], "[eps, min_pts, min_clusters]")
    return eps, minpt, mincl
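# A standalone sketch of the outlier-scoring idea used in the grid search above: fit
# HDBSCAN, then flag points whose GLOSH outlier score exceeds a chosen quantile
# (toy data, illustrative only).
import hdbscan
import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
X_toy = np.vstack([rng.normal(0, 0.3, (200, 2)), rng.uniform(-4, 4, (10, 2))])
scores = hdbscan.HDBSCAN(min_samples=20, min_cluster_size=20).fit(X_toy).outlier_scores_
threshold = pd.Series(scores).quantile(0.9)
print(np.count_nonzero(scores > threshold), "points flagged as outliers")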
def run_algo(self, params: str, name_algo: str):
    self.raz()
    if "HDBSCAN" in name_algo.upper():
        parameters = tools.buildDict(params, {"min_samples": [3], "min_cluster_size": [2], "alpha": [0.5]})
        self.execute(algo_name="HDBSCAN",
                     url="https://hdbscan.readthedocs.io/en/latest/how_hdbscan_works.html",
                     func=lambda x: hdbscan.HDBSCAN(min_cluster_size=x["min_cluster_size"],
                                                    min_samples=x["min_samples"],
                                                    alpha=x["alpha"]),
                     ps=parameters, colors=draw.colors, useCache=True)

    if "MEANSHIFT" in name_algo.upper():
        parameters = tools.buildDict(params, {"bandwidth": [2]})
        self.execute("MEANSHIFT",
                     "http://scikit-learn.org/stable/modules/generated/sklearn.cluster.MeanShift.html#sklearn.cluster.MeanShift",
                     lambda x: cl.MeanShift(bandwidth=x["bandwidth"], bin_seeding=False, cluster_all=True),
                     parameters, draw.colors, useCache=True)

    if "HAC" in name_algo.upper():
        parameters = tools.buildDict(params, {"n_clusters": [12]})
        self.execute("HAC",
                     "http://scikit-learn.org/stable/modules/generated/sklearn.cluster.AgglomerativeClustering.html#sklearn.cluster.AgglomerativeClustering",
                     lambda x: cl.AgglomerativeClustering(n_clusters=x["n_clusters"]),
                     parameters, draw.colors, useCache=True)

    if "NEURALGAS" in name_algo.upper():
        parameters = tools.buildDict(params, {"passes": [10], "distance_toremove_edge": [60]})
        for passes in parameters.get("passes"):
            for distance_toremove_edge in parameters.get("distance_toremove_edge"):
                m: algo.model = algo.create_cluster_from_neuralgasnetwork(
                    copy.deepcopy(self.ref_model).clear_clusters(), draw.colors,
                    a=0.5, passes=passes, distance_toremove_edge=distance_toremove_edge)
                m.params = [passes, distance_toremove_edge, ""]
                m.help = "https://github.com/AdrienGuille/GrowingNeuralGas"
                self.append_modeles(m)

    if len(self.models) == 0 or "NOTREATMENT" in name_algo.upper() or "NO" in name_algo.upper():
        m = copy.deepcopy(self.ref_model)
        m.params = params
        m.setname("NOTREATMENT")
        self.append_modeles(m)
def return_best_cluster(self, df_cluster, cluster_param):
    if self.cluster_method == 'kprototypes':
        # Apply weights to the numerical columns for k-prototypes
        for i in self.numerical_index:
            df_cluster.iloc[:, i] = self.individual[i] * df_cluster.iloc[:, i]

        if os.path.exists(self.folder + 'cluster_init.json'):
            with open(self.folder + 'cluster_init.json') as f:
                cluster_init = json.load(f)
            ftnss = 100000
            winner_model = None
            for init in cluster_init:
                init = [np.array(init[0]), np.array(init[1])]
                kproto = KPrototypes(n_clusters=cluster_param,
                                     cat_dissim=self.matching_dissim_weighted,
                                     # num_dissim=euclidean_dissim_weighted,
                                     max_iter=5, verbose=1, gamma=1, n_init=1,
                                     init=init)
                kproto.fit(df_cluster.values, categorical=self.categorical_index)
                x = pd.DataFrame([])
                x['cluster'] = kproto.labels_
                x['target'] = self.target
                df_grouped = (x.groupby(['cluster'])['target'].max()
                              - x.groupby(['cluster'])['target'].min())
                curr_ftnss = (df_grouped.values).sum()
                print(ftnss)
                print(curr_ftnss < ftnss)
                # Keep the model with the lowest fitness seen so far.
                if curr_ftnss < ftnss:
                    ftnss = curr_ftnss
                    winner_model = kproto
        else:
            kproto = KPrototypes(n_clusters=self.cluster_param,
                                 cat_dissim=self.matching_dissim_weighted,
                                 # num_dissim=euclidean_dissim_weighted,
                                 max_iter=5, verbose=1, gamma=1, n_init=1,
                                 init='Cao')
            kproto.fit(df_cluster.values, categorical=self.categorical_index)
            curr_ftnss = self.calculate_fitness(kproto.labels_)
            winner_model = kproto

        dump(winner_model, self.folder + 'best_model.joblib')
        self.df['cluster'] = winner_model.labels_
        return winner_model

    elif self.cluster_method == 'hdbscan':
        clusterer = hdb.HDBSCAN(min_cluster_size=cluster_param, prediction_data=True)
        clusterer.fit(df_cluster)
        dump(clusterer, self.folder + 'best_model.joblib')
        self.df['cluster'] = clusterer.labels_
        return clusterer
def ensemble_detection(X, run_sam=False, **kwargs):
    import umap
    import hdbscan
    from sklearn.cluster import DBSCAN

    metric = kwargs.get('metric', 'euclidean')
    k = kwargs.get('k', 5)
    min_dist = kwargs.get('min_dist', 0.05)
    umap_data, cluster_labels, sam = None, None, None

    if run_sam:
        from SAM import SAM
        N, T = X.shape
        counts = (X, np.arange(T), np.arange(N))
        npcs = kwargs.get('npcs', 5)
        resolution = kwargs.get('resolution', 2.0)
        stopping_condition = kwargs.get('stopping_condition', 5e-4)
        max_iter = kwargs.get('max_iter', 25)
        sam = SAM(counts)
        sam.run(verbose=False,
                projection='umap',
                k=k,
                npcs=npcs,
                preprocessing='Normalizer',
                distance=metric,
                stopping_condition=stopping_condition,
                max_iter=max_iter,
                proj_kwargs={
                    'metric': metric,
                    'n_neighbors': k,
                    'min_dist': min_dist
                })
        param = kwargs.get('resolution', 1.0)
        umap_data = sam.adata.obsm['X_umap']
        sam.clustering(X=None, param=param, method='leiden')
        cluster_labels = sam.adata.obs['leiden_clusters']
        cluster_labels = [cluster_labels.iloc[i] for i in range(N)]
    else:
        umapy = umap.UMAP(n_components=2, min_dist=min_dist, n_neighbors=k)
        umap_data = umapy.fit_transform(X)
        clustering = hdbscan.HDBSCAN(min_cluster_size=5)
        cluster_labels = clustering.fit_predict(umap_data)

    return sam, umap_data, cluster_labels
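# A standalone sketch of the else-branch above: embed with UMAP, then density-cluster
# the embedding with HDBSCAN (toy data; assumes the umap-learn package is installed).
import hdbscan
import numpy as np
import umap

rng = np.random.RandomState(0)
X_blobs = np.vstack([rng.normal(0, 0.2, (100, 10)), rng.normal(3, 0.2, (100, 10))])
embedding = umap.UMAP(n_components=2, min_dist=0.05, n_neighbors=5).fit_transform(X_blobs)
blob_labels = hdbscan.HDBSCAN(min_cluster_size=5).fit_predict(embedding)
print(np.unique(blob_labels))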
def run_hdbscan(hs, mpts, labels, metric):
    clusterer = hdbscan.HDBSCAN(min_cluster_size=mpts, min_samples=mpts, metric=metric,
                                gen_min_span_tree=True,
                                match_reference_implementation=True)
    cluster_labels = clusterer.fit_predict(hs)
    # clusterer.single_linkage_tree_.plot(cmap='viridis', colorbar=True, axis='matplotlib')
    Z = clusterer.single_linkage_tree_.to_numpy()

    # fig, ax1 = plt.subplots()
    # plt.title('HDBSCAN* - mpts: ' + str(mpts) + " - " + metric)
    # plt.xlabel('mpts')
    # plt.ylabel('distance')

    dbcv = np.zeros((kmax - kmin) // skip)
    # for i in range(kmin, kmax, skip):
    #     dbcv[i-kmin] = compute_DBCV(basedir + '/' + filename, resudir + '/' + filename + '_partition_' + str(i) + '.csv')

    r = dendrogram(
        Z,
        leaf_rotation=90.,   # rotates the x axis labels
        leaf_font_size=8.,   # font size for the x axis labels
        labels=labels,
        count_sort=True)

    cut = 2
    partitioning = fcluster(Z, cut, criterion='distance')
    # plt.axhline(y=cut, c='k')

    clusters = np.unique(partitioning)
    h = np.array(hs)
    medoids = []
    for c in clusters:
        a = np.where(partitioning == c)[0]
        objects = h[a]
        dist = distance_matrix(objects, objects)
        medoids.append(h[np.argmin(dist.sum(axis=0))])

    plt.savefig(plotdir + filename + "_" + method + "_" + metric + '.png')
    plt.show()
    plt.gcf().clear()
    return medoids
def generateClusters(normalizeDistanceMeasurement, extraName):
    RS = 3072018
    projection = TSNE(random_state=RS).fit_transform(normalizeDistanceMeasurement)
    plt.figure(figsize=(10, 10))
    plt.scatter(*projection.T)
    plt.savefig(f'{config.outputDir}tsne-result-{extraName}{config.addition}')
    plt.clf()

    clusterSize = len(normalizeDistanceMeasurement) // 150
    model = hdbscan.HDBSCAN(min_cluster_size=clusterSize,
                            min_samples=clusterSize,
                            cluster_selection_method='leaf',
                            metric='precomputed')
    clu = model.fit(normalizeDistanceMeasurement)

    amountOfClusters = len(set(clu.labels_)) - 1
    silhouetteScores = silhouette_samples(normalizeDistanceMeasurement,
                                          clu.labels_,
                                          metric='precomputed')
    avgClusterSize = 0
    avgSilhouetteScore = 0
    for clusterNumber in list(set(clu.labels_)):
        if clusterNumber == -1:
            continue
        scoresForSpecificCluster = silhouetteScores[clu.labels_ == clusterNumber]
        silhouetteAverageForCluster = np.average(scoresForSpecificCluster)
        avgSilhouetteScore += silhouetteAverageForCluster
        avgClusterSize += np.count_nonzero(clu.labels_ == clusterNumber)

    appendStatsToOutputFile(extraName, "Clustering size", clusterSize)
    appendStatsToOutputFile(extraName, "Number of clusters", amountOfClusters)
    appendStatsToOutputFile(
        extraName, "Average cluster size",
        round((avgClusterSize / amountOfClusters) / len(normalizeDistanceMeasurement) * 100, 2))
    appendStatsToOutputFile(extraName, "Average Silhouette score",
                            round(avgSilhouetteScore / amountOfClusters, 3))
    appendStatsToOutputFile(
        extraName, "Samples not in noise",
        round((len(normalizeDistanceMeasurement) - np.count_nonzero(clu.labels_ == -1))
              / len(normalizeDistanceMeasurement), 3))
    return clu, projection
def cluster(timestamps, min_cluster=7, plot=False):
    """
    Uses the HDBSCAN clustering algorithm to cluster timestamps into conversation epochs.

    Args:
        timestamps: List of timestamps (UTC seconds)
        min_cluster: Minimum cluster size to consider (5-10 work well)
        plot: Decide whether or not to plot results

    Returns:
        labels: A list giving a label to each element of timestamps. If label[i] = -1
            it is classified as noise, and if label[i] = label[j] then timestamp[i]
            and timestamp[j] are in the same cluster.
    """
    X = np.array(timestamps)
    X = X.reshape(-1, 1)
    clusterer = hdbscan.HDBSCAN(min_cluster_size=min_cluster, min_samples=2)
    labels = clusterer.fit_predict(X)

    # Number of clusters in labels, ignoring noise if present
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)

    # Black is removed and used for noise
    unique_labels = set(labels)
    cmap = plt.cm.get_cmap("Spectral")
    colors = [cmap(np.random.rand(1)[0]) for each in np.linspace(0, 1, len(unique_labels))]

    if plot:
        for k, col in zip(unique_labels, colors):
            if k == -1:
                # Black used for noise
                col = [0, 0, 0, 1]
                continue
            class_member_mask = (labels == k)
            xy = X[class_member_mask]
            plt.plot(xy[:], [0] * len(xy), 'o', c=tuple(col), linewidth=7)
        plt.title("Estimated Number of Clusters: %d" % n_clusters_)
        plt.show()
    return labels
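# A minimal usage sketch for the timestamp clustering function above, with synthetic
# UTC-second timestamps forming two bursts (hypothetical data).
import numpy as np

burst_a = 1600000000 + np.arange(0, 300, 30)   # ten events over five minutes
burst_b = 1600010000 + np.arange(0, 300, 30)   # a second burst a few hours later
print(cluster(list(burst_a) + list(burst_b), min_cluster=7, plot=False))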
def generate_cluster(
        distances,
        embeddings_for_precomputed=None,
        selection_method="eom",
        metric="euclidean",
        min_size=2,
        min_sample=2,
        threads=1
):
    clusterer = hdbscan.HDBSCAN(
        algorithm='best',
        alpha=1.0,
        cluster_selection_method=selection_method,
        metric=metric,
        min_cluster_size=min_size,
        min_samples=min_sample,
        core_dist_n_jobs=threads,
        approx_min_span_tree=False
    )
    try:
        clusterer.fit(distances)
        if metric != "precomputed":
            cluster_validity = Clusterer.validity(
                clusterer.labels_, distances, quick=True
            )
        else:
            cluster_validity = Clusterer.validity(
                clusterer.labels_, embeddings_for_precomputed, quick=True
            )
    except (ValueError, FloatingPointError, SystemError, AttributeError):
        # SystemError occurs in some mysterious part of numpy. It seems C++ returns
        # a NULL at some point rather than a nice Python error object and it can't recover.
        try:
            if metric != "precomputed":
                cluster_validity = Clusterer.validity(
                    clusterer.labels_, distances, quick=False
                )
            else:
                cluster_validity = Clusterer.validity(
                    clusterer.labels_, embeddings_for_precomputed, quick=False
                )
        except (ValueError, FloatingPointError, SystemError, AttributeError):
            cluster_validity = -1
            clusterer.labels_ = np.array([-1 for _ in range(distances.shape[0])])

    return (cluster_validity, min_size, min_sample, clusterer.labels_)
def clusteringPipeline(data, name, file_col):
    cluster = hdbscan.HDBSCAN()  # maybe change the hyperparameters?
    cluster.fit(data.drop(file_col, axis=1))

    # Dimensionality reduction to create the final dataset
    pca = PCA(n_components=2)
    coords = pca.fit_transform(data.drop(file_col, axis=1))

    # Dataset creation
    df = pd.DataFrame(data[file_col], columns=[file_col])
    df['x'] = coords[:, 0]
    df['y'] = coords[:, 1]
    df['HDBSCAN'] = cluster.labels_
    df.to_csv(f'D3_inputs/{name}', index=False)
def __init__(self, *, hyperparams: Hyperparams, random_seed: int = 0) -> None:
    super().__init__(hyperparams=hyperparams, random_seed=random_seed)
    if self.hyperparams['algorithm'] == 'HDBSCAN':
        self.clf = hdbscan.HDBSCAN(
            min_cluster_size=self.hyperparams['min_cluster_size'],
            min_samples=self.hyperparams['min_samples'],
            cluster_selection_method=self.hyperparams['cluster_selection_method'])
    else:
        self.clf = DBSCAN(eps=self.hyperparams['eps'],
                          min_samples=self.hyperparams['min_samples'])
def test_hdbscan_core_dists_bug_4054():
    """
    This test explicitly verifies that the MRE from
    https://github.com/rapidsai/cuml/issues/4054 matches the reference impl.
    """
    X, y = datasets.make_moons(n_samples=10000, noise=0.12, random_state=0)

    cu_labels_ = HDBSCAN(min_samples=25, min_cluster_size=25).fit_predict(X)
    sk_labels_ = hdbscan.HDBSCAN(min_samples=25,
                                 min_cluster_size=25,
                                 approx_min_span_tree=False).fit_predict(X)

    assert adjusted_rand_score(cu_labels_, sk_labels_) > 0.99