def cluster_embeddings_hdbscan(sentences, sentence_embeddings):
    clusterer = hdbscan.HDBSCAN(min_cluster_size=2, min_samples=None)
    clusterer.fit(sentence_embeddings)

    summary_df = pd.DataFrame(
        data={
            "position": range(len(sentences)),
            "sentence": sentences,
            "embedding": sentence_embeddings.tolist(),
            "cluster": clusterer.labels_
        })

    return summary_df, clusterer.exemplars_
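A minimal usage sketch (not from the original source), assuming the function above plus numpy are available; the sentences and the embedding matrix are made-up stand-ins for real sentence-encoder output:

import numpy as np

sentences = ["sentence %d" % i for i in range(20)]
sentence_embeddings = np.random.rand(len(sentences), 32)  # placeholder embeddings

summary_df, exemplars = cluster_embeddings_hdbscan(sentences, sentence_embeddings)
print(summary_df[["position", "sentence", "cluster"]])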
Example #2
def hdbscan_cluster(shifted_pcd, valid, min_cluster_size=50):
    clustered_ins_ids = np.zeros(shifted_pcd.shape[0], dtype=np.int32)
    valid_shifts = shifted_pcd[valid, :].reshape(-1, 3)
    if valid_shifts.shape[0] == 0:
        return clustered_ins_ids
    cluster = hdbscan.HDBSCAN(
        min_cluster_size = min_cluster_size,
        allow_single_cluster = True
    ).fit(valid_shifts)
    instance_labels = cluster.labels_
    instance_labels += (-instance_labels.min() + 1)
    clustered_ins_ids[valid] = instance_labels
    return clustered_ins_ids
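A rough usage sketch under assumed inputs: shifted_pcd is an (N, 3) point array and valid is a boolean mask of the same length; the synthetic arrays below are only for illustration:

import numpy as np

shifted_pcd = np.random.rand(1000, 3).astype(np.float32)
valid = np.ones(1000, dtype=bool)  # pretend every point passed filtering
instance_ids = hdbscan_cluster(shifted_pcd, valid, min_cluster_size=50)
print(np.unique(instance_ids))  # 0 is reserved for points outside the valid mask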
Example #3
 def estimate(self, mutations_data, **kwargs):
     try:
         precomputed = squareform(
             pdist(mutations_data, 'cityblock') / mutations_data.shape[1])
     except BaseException:
         logger.exception(
             'Problem with mutations_data: {}'.format(mutations_data))
         raise RuntimeError('Problem in computing distances for hdbscan.')
     kwargs.pop('metric', None)
     model = hdbscan.HDBSCAN(metric='precomputed',
                             **merge_dicts(default_parameters, kwargs))
     model.fit(precomputed)
     return pd.Series(model.labels_ + 1, index=mutations_data.index)
def apply_hdbscan(data, min_cluster_size, min_samples):
    clusterer = hdbscan.HDBSCAN(
        min_cluster_size=min_cluster_size, min_samples=min_samples, memory='./cache')
    start = time.time()
    log.info("Fitting...")
    clusterer.fit(data)
    end = time.time()
    log.info('HDBSCAN finished in %f seconds', (end - start))
    n_classes = len(np.unique(clusterer.labels_))
    n_outliers = list(clusterer.labels_).count(-1)
    log.info("Found %d clusters (+1 outliers) and %d outliers",
             n_classes - 1, n_outliers)
    return clusterer.labels_, n_classes
Example #5
    def hdbscan_clustering(self, min_samples_scaling=0.5):
        # Feature scaling.
        X = self.feature_scaling()

        min_cluster_size = max(
            [self.global_min_samples,
             int(0.005 * X.shape[0])])

        hdbcl = hdbscan.HDBSCAN(min_cluster_size=min_cluster_size,
                                min_samples=int(min_samples_scaling *
                                                min_cluster_size))
        hdbresult = hdbcl.fit(X)
        self.table['cluster'] = hdbresult.labels_
def apply_hdbscan(features):
    import hdbscan
    from sklearn.metrics import pairwise_distances
    distance = pairwise_distances(features, metric='cosine')
    hdb = hdbscan.HDBSCAN(min_cluster_size=2, metric='precomputed')
    hdb.fit(distance.astype('float64'))

    # Clustering Results
    # Number of clusters in pred_labels, ignoring noise (-1) if present.
    pred_labels = hdb.labels_
    n_clusters_ = len(set(pred_labels)) - (1 if -1 in pred_labels else 0)
    n_noise_ = list(pred_labels).count(-1)
    return pred_labels, n_clusters_, n_noise_
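A small usage sketch with fabricated features; any (n_samples, n_features) array works because the function computes the cosine distance matrix itself:

import numpy as np

features = np.random.rand(50, 128)
labels, n_clusters, n_noise = apply_hdbscan(features)
print(n_clusters, "clusters,", n_noise, "points labelled as noise")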
	def __init__(self, Basis, window_sz, overlap, min_cluster_size):
		self.basis = Basis
		self.window_sz = window_sz
		self.overlap = overlap
		self.inds = []
		self.koop_list = []
		self.koopman_feature_array = []
		self.labels = []
		self.koop_cluster_list = []
		self.koop_cluster_memb_prob_list = []
		self.koopman_hybrid_modes = []
		self.min_cluster_size = min_cluster_size
		self.clusterer = hdbscan.HDBSCAN(min_cluster_size=self.min_cluster_size,metric='euclidean')
Example #8
def cluster(distances):
    # Perform initial clustering
    distances = np.array(distances).reshape(-1, 1)
    clusterer = hdbscan.HDBSCAN(
        allow_single_cluster=True,
        prediction_data=True,
        min_cluster_size=5,
    )
    # labels = clusterer.fit(xmap_embedding.astype(np.float64)).labels_

    clusterer.fit(np.array(distances).astype(np.float64))

    return clusterer.labels_
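A usage sketch with synthetic one-dimensional values; the function reshapes them into a column vector before fitting:

import numpy as np

distances = np.random.exponential(scale=2.0, size=200)  # made-up 1-D values
labels = cluster(distances)
print(np.unique(labels, return_counts=True))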
Example #9
    def dbscan(self):
        """
         Train hdbscan for the input dataframe
         save the hdbscan model
        """

        df = self.data.copy()
        hdb = hdbscan.HDBSCAN(min_cluster_size=16000,
                              min_samples=5,
                              prediction_data=True).fit(df)
        joblib.dump(hdb, 'ad/hdbscan')
        self.data[
            'Category'] = hdb.labels_  # Stores the labels into category field
    def dbscan(self):
        self.cluster_obj = hdbscan.HDBSCAN(
            cluster_selection_epsilon=self.dbs_eps,
            min_cluster_size=self.dbs_min_samples,
            prediction_data=True)
        self.cluster_obj.fit(self.X)

        def dbscan_predict(X_test):
            test_labels, strengths = hdbscan.approximate_predict(
                self.cluster_obj, X_test)
            return test_labels

        self.cluster_obj.predict = dbscan_predict
Example #11
    def cluster(self, divisor = 30, cluster_selection_epsilon = 0, metric = 'euclidean', algorithm = 'best', n_components = 10):
        #HDB Scan
        hdb = hdbscan.HDBSCAN(
            min_cluster_size=int(np.floor(len(self.clusterdf) / divisor)),
            min_samples=1,
            cluster_selection_method='eom',
            cluster_selection_epsilon=cluster_selection_epsilon,
            metric=metric,
            algorithm=algorithm
        )

        #Normalize data
        cols = self.unitscol + self.traitscol + self.traitsnumunitcol + self.chosenunitpivotcol + self.chosentraitpivotcol
        data = self.clusterdf[cols].fillna(0)
        norm_data = normalize(data, norm='l2')

        ##Cannot make dimension reduction work
        #reducer = umap.UMAP(metric = 'manhattan', random_state = 42, n_components = n_components)
        #embed = reducer.fit_transform(norm_data)

        embed = norm_data

        #print(cols)
        #Cluster HDB
        print('HDB Scan')
        clusterer=hdb.fit(embed)

        try:
            self.plot = clusterer.condensed_tree_.plot(select_clusters=True, label_clusters=True)
        except:
            pass
        
        self.clusterdf['hdbnumber'] = pd.Series(hdb.labels_+1, index=self.clusterdf.index)
        self.clusterdf['comp_id'] = self.clusterdf['participants.placement'].apply(str)+self.clusterdf['match_id']
        
        #print(self.clusterdf['hdbnumber'].value_counts())

        #Get top 2 traits
        traitsaveragedf = self.clusterdf.fillna(0).groupby('hdbnumber')[list(self.traitscol)].mean()
        commontraitslist=traitsaveragedf.apply(lambda s: s.abs().nlargest(2).index.to_list(), axis=1)
        self.commontraits=commontraitslist.agg(lambda x: ' '.join(map(str, x)))
        self.commontraits[0]='No Comp'

        self.clusterdf=self.clusterdf.merge(pd.DataFrame({'hdb':self.commontraits}),left_on='hdbnumber',right_on='hdbnumber')        

        self.traitshdb=self.traitsdf.merge(self.clusterdf)[list(self.traitsdf.columns)+list(['hdb'])+list(['comp_id'])]
        self.unitshdb=self.unitsdf.merge(self.clusterdf[['match_id','participants.placement','comp_id','hdb']])[list(self.unitsdf.columns)+list(['hdb'])+list(['comp_id'])]
        self.itemshdb=self.itemsdf.merge(self.clusterdf)[list(self.itemsdf.columns)+list(['hdb'])+list(['comp_id'])]

        comppop=pd.DataFrame(self.clusterdf.groupby('match_id')['hdb'].value_counts().rename('compsinmatch'))
        self.clusterdf=pd.merge(self.clusterdf,comppop,left_on=['match_id','hdb'],right_index=True)
    def get_umap_subsets(self, nn=100, md=0.1, **kwargs):
        """
        Get the names and indices of the UMAP-defined subsets
        """
        # First get umap results:
        results = Table.read(
            "../data/dimred_results/apogee_rc_dimred_hyperparametertest.fits")
        self.Xu = results["X_umap_euclidean_nn" + str(nn) + "_md" + str(md)]
        self.Yu = results["Y_umap_euclidean_nn" + str(nn) + "_md" + str(md)]

        # Now run HDBSCAN to define the subsets
        import hdbscan
        clusterer = hdbscan.HDBSCAN(**kwargs)
        clusterer.fit(np.vstack((self.Xu, self.Yu)).T)
        self.classcol = clusterer.labels_
        self.classprob = clusterer.probabilities_
        self.subsets = np.unique(clusterer.labels_)
        #self.classcol= np.char.rstrip(self.data["tsne_class_teffcut40"],b' ')#.decode('utf8').strip()
        #self.subsets = ["thin", "thick1", "thick2", "thick3", "thick4",
        #           "mpthin", "mpthintrans", "smr", "t4trans", "youngthin",
        #           "debris1", "debris2", "debris3", "debris4", "debris5",
        #           "smr2", "t2trans1", "highTi","lowMg","highAlMg?"]
        self.names = [
            "", "", "", "", "", "", "Transition group", "", "",
            "Young local disc", "", "", "[s/Fe]-enhanced", "", "", r"",
            "Debris candidate", r"Extreme-Ti star", r"Low-[Mg/Fe] star",
            "High-[Al/Mg] star"
        ]
        self.Xcoords = [10, 11, 4.5, -12, 18, -31, 22, 26, -22.5, -14, -2, -25]
        self.Ycoords = [5.5, .5, -2, -4, 6, 0, 1.5, -.5, -7, -2, -6, 14]
        self.fsize = [20, 16, 12, 12, 15, 13, 11, 11, 11, 11, 11, 11]
        self.sym = [
            "o", "v", "^", ">", "<", "s", "o", "*", "<", "o", "h", "d", "D",
            "v", "p", "*", "D", "p", "s", "8"
        ]
        self.al = [
            .6, .8, .8, .8, .8, .8, .8, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
            1, 1, 1
        ]
        self.lw = [
            0, .5, .5, .5, .5, .5, .5, .5, .5, .5, .5, .5, .5, .5, .5, .5, .5,
            .5, .5, .5, .5
        ]
        self.size = [
            7, 12, 12, 12, 12, 15, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18,
            18, 18, 18, 18, 18
        ]
        self.col = [
            "grey", "m", "hotpink", "crimson", "r", "g", "brown", "orange",
            "gold", "k", "yellow", "gold", "lime", "k", "royalblue"
        ]
def obtain_hard_clusters(points, thresholds):
    # expects points of shape (B, ?, E) and thresholds as a list
    # returns single-linkage labels of shape (B, len(thresholds), ?)
    clusterer = hdbscan.HDBSCAN(min_samples=1, approx_min_span_tree=False)
    labels_list = []
    for batch_id in tqdm(range(points.shape[0]), desc="Processing batches", leave=False):
        clusterer.fit(points[batch_id].reshape(-1, points.shape[-1]))
        labels_batch = []
        for threshold in tqdm(thresholds, desc="Cutting at thresholds", leave=False):
            labels_batch_threshold = clusterer.single_linkage_tree_.get_clusters(cut_distance=threshold, min_cluster_size=1)
            labels_batch.append(labels_batch_threshold.reshape(*points.shape[1:-1]))
        labels_list.append(labels_batch)
        tqdm.write("Done with hard clustering batch {}".format(batch_id))
    return np.array(labels_list)
Example #14
def cluster_locations(df, **kwargs):
    """ Cluster locations with HDBSCAN
    Args:
        df (pandas.DataFrame): RADAR android_phone_location dataframe
        min_samples (int): Minimum number of samples to form a cluster.
            Default = 1 + N // 20
        kwargs: Key-word arguments to provide HDBSCAN
    Returns:
        np.ndarray[int]: Cluster labels
    """
    min_samples = kwargs.pop('min_samples', 1 + len(df) // 20)
    clusterer = hdbscan.HDBSCAN(metric='haversine', min_samples=min_samples)
    clusterer.fit(df[['latitude', 'longitude']])
    return clusterer.labels_
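A hedged usage sketch with fabricated coordinates. Note that hdbscan's haversine metric expects radians, so real latitude/longitude in degrees would normally be converted first (an assumption about the caller, not part of the original function):

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
df = pd.DataFrame({
    "latitude": np.radians(52.0 + 0.01 * rng.standard_normal(100)),
    "longitude": np.radians(4.3 + 0.01 * rng.standard_normal(100)),
})
labels = cluster_locations(df)
print(pd.Series(labels).value_counts())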
Example #15
 def centroids(paths):
     # distances = euclidean_distances(paths)
     # distances = cdist(paths, paths, 'euclidean')
     clusterer = hdbscan.HDBSCAN(min_cluster_size=min_cluster_size)
     cluster_labels = clusterer.fit_predict(paths)
     num_clusters = len(
         set(cluster_labels)) - (1 if -1 in cluster_labels else 0)
     unique_labels = set(cluster_labels)
     clusters = [[] for n in range(num_clusters)]
     logging.info('Number of clusters: %s', num_clusters)
     for i, v in enumerate(paths):
         if cluster_labels[i] != -1:
             clusters[cluster_labels[i]].append(v)
     return clusters
    def run_hdbscan(self, min_samples=10, min_cluster_size=500):
        """This function applies hdbscan clustering on the data after umap has been applied."
        """
        print('Applying HDBSCAN clustering...')

        try:
            self.labels = hdbscan.HDBSCAN(
                min_samples=min_samples,
                min_cluster_size=min_cluster_size).fit_predict(
                    self.clusterable_embedding)
        except:
            raise Exception(
                "Please execute 'retrieve_prediction_explanations', 'get_strength_per_feature_cols'  and 'run_umap' methods first."
            )
    def HDBScan_create_clusters(self, dist_data, transaction_data,
                                min_members):

        # File pointer for of cluster details file
        cluster_op_file = open("./cluster_details_70000_" + "_" +
                               str(min_members) + "_HDBSCAN.txt",
                               mode='w',
                               encoding='utf-8')

        # Run the HDBSCAN algorithm
        db = hdbscan.HDBSCAN(min_cluster_size=min_members,
                             metric='precomputed').fit(dist_data)

        #core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
        #core_samples_mask[db.core_sample_indices_] = True
        labels = db.labels_

        # Number of clusters in labels, ignoring noise if present.
        n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
        print('Estimated number of clusters: %d' % n_clusters_)

        # clusters = [data[labels == i] for i in range(n_clusters_)]

        # Note: hdbscan.HDBSCAN exposes no cluster_centers_ attribute (unlike
        # KMeans), so representative points would have to be derived separately
        # (e.g. per-cluster medoids computed from dist_data).
        # with open('ClusterCenters.p', 'wb') as fp:
        #     pickle.dump(db.cluster_centers_, fp)

        # Count Number of elements in each cluster
        print(Counter(labels))

        cluster_op_file.write(str(Counter(labels)))
        cluster_op_file.write("\n")

        # Dictionary of clusters containing transaction numbers
        cluster_dict = {}
        #col_list = list(dist_data.columns.values)
        with open('./keys_70000.p', 'rb') as fp:
            col_list = pickle.load(fp)

        for j in range(len(labels)):
            if labels[j] != -1:
                # print(j)
                if labels[j] in cluster_dict.keys():
                    cluster_dict[labels[j]].append(col_list[j])
                else:
                    cluster_dict[labels[j]] = [col_list[j]]

        print(cluster_dict)

        self.cluster_output(transaction_data, cluster_dict, cluster_op_file)
def run_hdbscan(dataframe, min_size, month, state):
    # Compute HDBSCAN
    hdb_t1 = time.time()

    hdb = hdbscan.HDBSCAN(min_cluster_size=min_size).fit(dataframe.to_numpy())
    hdb_labels = hdb.labels_
    hdb_elapsed_time = time.time() - hdb_t1
    # Number of clusters in labels, ignoring noise if present.
    n_clusters_hdb_ = len(set(hdb_labels)) - (1 if -1 in hdb_labels else 0)

    print('\n\n++ HDBSCAN Results')
    print('Estimated number of clusters: %d' % n_clusters_hdb_)
    print('Elapsed time to cluster: %.4f s' % hdb_elapsed_time)
    return hdb_labels, n_clusters_hdb_
    def _cluster_encoded_trees(self,
                               encoded_trees,
                               scores=None,
                               cluster_method='hdbscan',
                               min_samples=5,
                               min_cluster_size=5):
        if not encoded_trees:
            return []
        res = []
        if 'hdbscan' == cluster_method:
            clusterer = hdbscan.HDBSCAN(min_cluster_size=min_cluster_size,
                                        min_samples=min_samples,
                                        gen_min_span_tree=False)
            clusterer.fit(encoded_trees)
            res = clusterer.labels_
            # non-clustered inputs have id -1, make them appear as individual clusters
            max_cluster = np.amax(res)
            for i in range(len(res)):
                if res[i] == -1:
                    # print(res[i])
                    max_cluster += 1
                    res[i] = max_cluster
        elif 'kmeans' == cluster_method:  # seems to be very slow
            for cluster_size in range(len(encoded_trees)):
                kmeans = cluster.KMeans(n_clusters=cluster_size +
                                        1).fit(encoded_trees)
                if kmeans.inertia_ < 1:
                    break
            res = kmeans.labels_
        else:
            raise Exception(
                'Fatal error: cluster_method={} is not recognized.'.format(
                    cluster_method))

        res = [int(i) for i in res]

        if scores is not None:
            if len(scores) != len(res):
                raise Exception(
                    'Fatal error: length of score ({}) and smiles ({}) are different.'
                    .format(len(scores), len(res)))
            best_cluster_score = {}
            for cluster_id, score in zip(res, scores):
                best_cluster_score[cluster_id] = max(
                    best_cluster_score.get(cluster_id, -float('inf')), score)
            new_order = list(
                sorted(best_cluster_score.items(), key=lambda x: -x[1]))
            order_mapping = {new_order[n][0]: n for n in range(len(new_order))}
            res = [order_mapping[n] for n in res]
        return res
Example #20
def hdbscan_opt_param(X, Y, tres=0.9):
    params, prec = [], []
     
    for p in range(20, 60, 5):    
        for e in np.arange(0.1,1.0,0.2):
            for c in range(10, 60, 10):
                
                ## HDBSCAN model and parameters
                e = round(e,1)            
                hdbscan_clust = hdbscan.HDBSCAN(min_samples = p, cluster_selection_epsilon = float(e), min_cluster_size = c)            
                param = [e, p, c]
                params.append(param)

                # Fit HDBSCAN and save the outlier scores
                HDBS_clustering = hdbscan_clust.fit(X)
                outlier_scores = pd.DataFrame(
                    np.asarray(HDBS_clustering.outlier_scores_),
                    columns=['outliers_scores_HDBSCAN'])
                threshold = pd.Series(HDBS_clustering.outlier_scores_).quantile(tres)

                ## Compute precision against the ground-truth labels
                original = [Y[Y==1.0].index, Y[Y==0.0].index]
                result = [outlier_scores[outlier_scores['outliers_scores_HDBSCAN'] > threshold].index,
                          outlier_scores[outlier_scores['outliers_scores_HDBSCAN'] <= threshold].index]

                tp = original[0].intersection(result[0]).size
                tn = original[1].intersection(result[1]).size
                fp = original[1].intersection(result[0]).size
                fn = original[0].intersection(result[1]).size

                if (tp+fp) == 0:
                    precision = 0
                else:
                    precision = tp/(tp+fp)
               
                prec.append(precision)
                
                

    values = range(0, len(params))
    max_p = max(prec)
    eps = params[prec.index(max_p)][0]
    minpt = params[prec.index(max_p)][1]
    mincl = params[prec.index(max_p)][2]

    print(" - [HDBSCAN optimal parameters] : max precision at - ", params[prec.index(max_p)], "[epx, min_pts, min_clusters]")   
    
    return eps, minpt, mincl
Example #21
    def run_algo(self,params: str, name_algo: str):

        self.raz()

        if name_algo.upper().__contains__("HDBSCAN"):
            parameters = tools.buildDict(params, {"min_samples": [3], "min_cluster_size": [2], "alpha": [0.5]})
            self.execute(algo_name="HDBSCAN",
                        url="https://hdbscan.readthedocs.io/en/latest/how_hdbscan_works.html",
                        func=lambda x:
                            hdbscan.HDBSCAN(min_cluster_size=x["min_cluster_size"],
                                            min_samples=x["min_samples"],
                                            alpha=x["alpha"]),
                        ps=parameters,colors=draw.colors,useCache=True)

        if name_algo.upper().__contains__("MEANSHIFT"):
            parameters = tools.buildDict(params, {"bandwidth": [2]})
            self.execute("MEANSHIFT",
                        "http://scikit-learn.org/stable/modules/generated/sklearn.cluster.MeanShift.html#sklearn.cluster.MeanShift",
                        lambda x:
                        cl.MeanShift(bandwidth=x["bandwidth"], bin_seeding=False, cluster_all=True),
                        parameters,draw.colors,useCache=True)

        if name_algo.upper().__contains__("HAC"):
            parameters = tools.buildDict(params, {"n_clusters": [12]})
            self.execute("HAC",
                        "http://scikit-learn.org/stable/modules/generated/sklearn.cluster.AgglomerativeClustering.html#sklearn.cluster.AgglomerativeClustering",
                        lambda x:
                        cl.AgglomerativeClustering(n_clusters=x["n_clusters"]),
                        parameters,draw.colors,useCache=True
                        )

        if name_algo.upper().__contains__("NEURALGAS"):
            parameters = tools.buildDict(params, {"passes": [10], "distance_toremove": [60]})
            for passes in parameters.get("passes"):
                for distance_toremove_edge in parameters.get("distance_toremove_edge"):
                    m: algo.model = algo.create_cluster_from_neuralgasnetwork(
                        copy.deepcopy(self.ref_model).clear_clusters(),draw.colors,
                        a=0.5,
                        passes=passes,
                        distance_toremove_edge=distance_toremove_edge)
                    m.params = [passes, distance_toremove_edge, ""]
                    m.help = "https://github.com/AdrienGuille/GrowingNeuralGas"
                    self.append_modeles(m)

        if len(self.models)==0 or "NOTREATMENT" in name_algo.upper() or "NO" in name_algo.upper():
            m=copy.deepcopy(self.ref_model)
            m.params=params
            m.setname("NOTREATMENT")
            self.append_modeles(m)
Example #22
    def return_best_cluster(self,df_cluster,cluster_param):
        if self.cluster_method == 'kprototypes':
            
            #weights on kmeans
            
          
            for i in self.numerical_index:
                df_cluster.iloc[:,i] = self.individual[i] * df_cluster.iloc[:,i] 

            if os.path.exists(self.folder + 'cluster_init.json'):
                with open(self.folder + 'cluster_init.json') as f:
                    cluster_init = json.load(f)
                ftnss = float('inf')
                for init in cluster_init:    
                    init = [ np.array(init[0]),np.array(init[1])] 
                    kproto = KPrototypes(n_clusters=cluster_param, cat_dissim=self.matching_dissim_weighted, #num_dissim=euclidean_dissim_weighted, 
                                    max_iter=5, verbose=1, gamma=1,n_init=1, init = init)
                    kproto.fit(df_cluster.values,categorical = self.categorical_index)    

                    x = pd.DataFrame([])
                    x['cluster'] = kproto.labels_
                    x['target'] = self.target
                    df_grouped = x.groupby(['cluster'])['target'].max() - x.groupby(['cluster'])['target'].min()
                    curr_ftnss = (df_grouped.values).sum()

                    # Keep the initialisation with the lowest fitness seen so far
                    if curr_ftnss < ftnss:
                        ftnss = curr_ftnss
                        winner_model = kproto
            else:
                kproto = KPrototypes(n_clusters=self.cluster_param, cat_dissim=self.matching_dissim_weighted, #num_dissim=euclidean_dissim_weighted, 
                                max_iter=5, verbose=1, gamma=1,n_init=1, init = 'Cao')
                kproto.fit(df_cluster.values,categorical = self.categorical_index)            
                curr_ftnss = self.calculate_fitness(kproto.labels_)                
                winner_model = kproto                     
            
            dump(winner_model,self.folder+'best_model.joblib')
            self.df['cluster'] = winner_model.labels_
            return winner_model
        
        elif self.cluster_method == 'hdbscan':
            clusterer = hdb.HDBSCAN(min_cluster_size=cluster_param, prediction_data=True)
            clusterer.fit(df_cluster)    
            dump(clusterer,self.folder+'best_model.joblib')
            self.df['cluster'] = clusterer.labels_
            return clusterer   
Example #23
def ensemble_detection(X, run_sam=False, **kwargs):
    import umap
    import hdbscan
    from sklearn.cluster import DBSCAN
    metric = kwargs.get('metric', 'euclidean')
    k = kwargs.get('k', 5)
    min_dist = kwargs.get('min_dist', 0.05)

    umap_data, cluster_labels, sam = None, None, None
    if run_sam:
        from SAM import SAM

        N, T = X.shape
        counts = (X, np.arange(T), np.arange(N))

        npcs = kwargs.get('npcs', 5)
        resolution = kwargs.get('resolution', 2.0)
        stopping_condition = kwargs.get('stopping_condition', 5e-4)
        max_iter = kwargs.get('max_iter', 25)

        sam = SAM(counts)
        sam.run(verbose=False,
                projection='umap',
                k=k,
                npcs=npcs,
                preprocessing='Normalizer',
                distance=metric,
                stopping_condition=stopping_condition,
                max_iter=max_iter,
                proj_kwargs={
                    'metric': metric,
                    'n_neighbors': k,
                    'min_dist': min_dist
                })
        param = kwargs.get('resolution', 1.0)
        umap_data = sam.adata.obsm['X_umap']
        sam.clustering(X=None, param=param, method='leiden')
        cluster_labels = sam.adata.obs['leiden_clusters']
        cluster_labels = [cluster_labels.iloc[i] for i in range(N)]

    else:

        umapy = umap.UMAP(n_components=2, min_dist=min_dist, n_neighbors=k)
        umap_data = umapy.fit_transform(X)

        clustering = hdbscan.HDBSCAN(min_cluster_size=5)
        cluster_labels = clustering.fit_predict(umap_data)

    return sam, umap_data, cluster_labels
Example #24
def run_hdbscan(hs, mpts, labels, metric):
    clusterer = hdbscan.HDBSCAN(min_cluster_size=mpts,
                                min_samples=mpts,
                                metric=metric,
                                gen_min_span_tree=True,
                                match_reference_implementation=True)
    cluster_labels = clusterer.fit_predict(hs)

    # clusterer.single_linkage_tree_.plot(cmap='viridis', colorbar=True, axis='matplotlib')
    Z = clusterer.single_linkage_tree_.to_numpy()

    # fig, ax1 = plt.subplots()
    #
    # plt.title('HDBSCAN* - mpts: ' + str(mpts) + " - " + metric)
    # plt.xlabel('mpts')
    # plt.ylabel('distance')

    dbcv = np.zeros((kmax - kmin) // skip)

    # for i in range(kmin, kmax, skip):
    #     dbcv[i-kmin] = compute_DBCV(basedir + '/' + filename, resudir + '/' + filename + '_partition_' + str(i) + '.csv')

    r = dendrogram(
        Z,
        leaf_rotation=90.,  # rotates the x axis labels
        leaf_font_size=8.,  # font size for the x axis labels
        labels=labels,
        count_sort=True)

    cut = 2
    partitioning = fcluster(Z, cut, criterion='distance')
    # plt.axhline(y=cut, c='k')
    clusters = np.unique(partitioning)

    h = np.array(hs)

    medoids = []

    for c in clusters:
        a = np.where(partitioning == c)[0]
        objects = h[a]
        dist = distance_matrix(objects, objects)
        medoids.append(h[np.argmin(dist.sum(axis=0))])

    plt.savefig(plotdir + filename + "_" + method + "_" + metric + '.png')
    plt.show()
    plt.gcf().clear()

    return medoids
Example #25
def generateClusters(normalizeDistanceMeasurement, extraName):
    RS = 3072018
    projection = TSNE(
        random_state=RS).fit_transform(normalizeDistanceMeasurement)
    plt.figure(figsize=(10, 10))
    plt.scatter(*projection.T)
    plt.savefig(f'{config.outputDir}tsne-result-{extraName}{config.addition}')
    plt.clf()

    clusterSize = len(normalizeDistanceMeasurement) // 150
    model = hdbscan.HDBSCAN(min_cluster_size=clusterSize,
                            min_samples=clusterSize,
                            cluster_selection_method='leaf',
                            metric='precomputed')
    clu = model.fit(normalizeDistanceMeasurement)

    amountOfClusters = len(set(clu.labels_)) - 1

    silhouetteScores = silhouette_samples(normalizeDistanceMeasurement,
                                          clu.labels_,
                                          metric='precomputed')

    avgClusterSize = 0
    avgSilhouetteScore = 0

    for clusterNumber in list(set(clu.labels_)):
        if clusterNumber == -1:
            continue
        scoresForSpecificCluster = silhouetteScores[clu.labels_ ==
                                                    clusterNumber]
        silhouetteAverageForCluster = np.average(scoresForSpecificCluster)
        avgSilhouetteScore += silhouetteAverageForCluster
        avgClusterSize += np.count_nonzero(clu.labels_ == clusterNumber)

    appendStatsToOutputFile(extraName, "Clustering size", clusterSize)
    appendStatsToOutputFile(extraName, "Number of clusters", amountOfClusters)
    appendStatsToOutputFile(
        extraName, "Average cluster size",
        round((avgClusterSize / amountOfClusters) /
              len(normalizeDistanceMeasurement) * 100, 2))
    appendStatsToOutputFile(extraName, "Average Silhouette score",
                            round(avgSilhouetteScore / amountOfClusters, 3))
    appendStatsToOutputFile(
        extraName, "Samples not in noise",
        round((len(normalizeDistanceMeasurement) -
               np.count_nonzero(clu.labels_ == -1)) /
              len(normalizeDistanceMeasurement), 3))

    return clu, projection
def cluster(timestamps,min_cluster = 7,plot = False):
    """
    Uses a clustering algorithm from HDBSCAN to cluster timestamps into conversation epochs.

    Args:
        timestamps: List of timestamps (utc seconds)

        min_cluster: Minimum cluster size to consider (5-10 work well)

        plot: Decide whether or not to plot results

    Returns:
        labels: A list giving a label to each element of timestamps. If label[i] = -1 it's classified as noise,
        and if label[i] = label[j] then timestamps[i] and timestamps[j] are in the same cluster.

    """

    X = np.array(timestamps)
    X = X.reshape(-1,1)

    clusterer = hdbscan.HDBSCAN(min_cluster_size= min_cluster, min_samples = 2)
    labels = clusterer.fit_predict(X)

    # Number of clusters in labels, ignoring noise if present
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)

    unique_labels = set(labels)

    if plot:
        cmap = plt.cm.get_cmap("Spectral")
        colors = [cmap(np.random.rand(1)[0]) for each in np.linspace(0, 1, len(unique_labels))]

        for k, col in zip(unique_labels, colors):
            if k == -1:
                # Skip noise points rather than plotting them
                continue

            class_member_mask = (labels == k)
            xy = X[class_member_mask]
            plt.plot(xy[:], [0] * len(xy), 'o', c=tuple(col), linewidth=7)

        plt.title("Estimated Number of Clusters: %d" % n_clusters_)
        plt.show()

    return labels
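A small usage sketch with synthetic UTC timestamps forming two bursts, so the clusterer should recover two conversation epochs (plotting left off):

import numpy as np

burst_one = 1_600_000_000 + np.sort(np.random.rand(30)) * 600  # a 10-minute burst
burst_two = 1_600_100_000 + np.sort(np.random.rand(30)) * 600
timestamps = np.concatenate([burst_one, burst_two]).tolist()

labels = cluster(timestamps, min_cluster=7, plot=False)
print(set(labels))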
Example #27
    def generate_cluster(
            distances,
            embeddings_for_precomputed=None,
            selection_method="eom",
            metric="euclidean",
            min_size=2,
            min_sample=2,
            threads=1
    ):
        clusterer = hdbscan.HDBSCAN(
            algorithm='best',
            alpha=1.0,
            cluster_selection_method=selection_method,
            metric=metric,
            min_cluster_size=min_size,
            min_samples=min_sample,
            core_dist_n_jobs=threads,
            approx_min_span_tree=False
        )

        try:
            clusterer.fit(distances)
            if metric != "precomputed":
                cluster_validity = Clusterer.validity(
                    clusterer.labels_, distances, quick=True
                )
            else:
                cluster_validity = Clusterer.validity(
                    clusterer.labels_, embeddings_for_precomputed, quick=True
                )
        except (ValueError, FloatingPointError, SystemError, AttributeError):
            """
            SystemError occurs in some mysterious part of numpy. Seems C++ returns a NULL at some point
            rather than a nice Python Error object and it can't recover. 
            """
            try:
                if metric != "precomputed":
                    cluster_validity = Clusterer.validity(
                        clusterer.labels_, distances, quick=False
                    )
                else:
                    cluster_validity = Clusterer.validity(
                        clusterer.labels_, embeddings_for_precomputed, quick=False
                    )
            except (ValueError, FloatingPointError, SystemError, AttributeError):
                cluster_validity = -1
                clusterer.labels_ = np.array([-1 for _ in range(distances.shape[0])])

        return (cluster_validity, min_size, min_sample, clusterer.labels_)
Example #28
def clusteringPipeline(data, name, file_col):
    cluster = hdbscan.HDBSCAN()  # maybe change the hyperparameters?
    cluster.fit(data.drop(file_col, axis=1))

    # dim reduction to create final dataset
    pca = PCA(n_components=2)
    coords = pca.fit_transform(data.drop(file_col, axis=1))

    # dataset creation
    df = pd.DataFrame(data[file_col], columns=[file_col])
    df['x'] = coords[:, 0]
    df['y'] = coords[:, 1]
    df['HDBSCAN'] = cluster.labels_

    df.to_csv(f'D3_inputs/{name}', index=False)
Example #29
    def __init__(self,
                 *,
                 hyperparams: Hyperparams,
                 random_seed: int = 0) -> None:
        super().__init__(hyperparams=hyperparams, random_seed=random_seed)

        if self.hyperparams['algorithm'] == 'HDBSCAN':
            self.clf = hdbscan.HDBSCAN(
                min_cluster_size=self.hyperparams['min_cluster_size'],
                min_samples=self.hyperparams['min_samples'],
                cluster_selection_method=self.hyperparams['cluster_selection_method'])
        else:
            self.clf = DBSCAN(eps=self.hyperparams['eps'],
                              min_samples=self.hyperparams['min_samples'])
Example #30
def test_hdbscan_core_dists_bug_4054():
    """
    This test explicitly verifies that the MRE from
    https://github.com/rapidsai/cuml/issues/4054
    matches the reference impl
    """

    X, y = datasets.make_moons(n_samples=10000, noise=0.12, random_state=0)

    cu_labels_ = HDBSCAN(min_samples=25, min_cluster_size=25).fit_predict(X)
    sk_labels_ = hdbscan.HDBSCAN(min_samples=25,
                                 min_cluster_size=25,
                                 approx_min_span_tree=False).fit_predict(X)

    assert adjusted_rand_score(cu_labels_, sk_labels_) > 0.99