def find_clusters(
    X: xpd.DataFrame,
    eps: float = 3000,
    min_samples: int = 250,
    output_colname: str = "cluster_id",
) -> xpd.Series:
    """
    Classify a point cloud into several groups, with each group being assigned
    a positive integer label like 1, 2, 3, etc. Unclassified noise points are
    labelled as NaN.

    Uses Density-based spatial clustering of applications with noise (DBSCAN).
    See also https://www.naftaliharris.com/blog/visualizing-dbscan-clustering

    ***       **         111       NN
    **    **   *         11    22   N
    *     ****     -->   1     2222
      **     **            33     22
    ******               333333

    Parameters
    ----------
    X : cudf.DataFrame or pandas.DataFrame
        A table of X, Y, Z points to run the clustering algorithm on.
    eps : float
        The maximum distance between two points for them to be considered as
        in the same neighborhood. Default is 3000 (metres).
    min_samples : int
        The number of samples in a neighborhood for a point to be considered
        a core point (including the point itself). Default is 250 (sample
        points).
    output_colname : str
        The name of the column for the output Series. Default is 'cluster_id'.

    Returns
    -------
    cluster_labels : cudf.Series or pandas.Series
        Which cluster each datapoint belongs to. Noisy samples are labeled as
        NaN.
    """
    try:
        from cuml.cluster import DBSCAN
    except ImportError:
        from sklearn.cluster import DBSCAN

    # Run DBSCAN using {eps} m distance, and minimum of {min_samples} points
    dbscan = DBSCAN(eps=eps, min_samples=min_samples)
    dbscan.fit(X=X)

    cluster_labels = dbscan.labels_ + 1  # noise points -1 becomes 0
    if isinstance(cluster_labels, np.ndarray):
        cluster_labels = xpd.Series(data=cluster_labels,
                                    dtype=xpd.Int32Dtype())
    cluster_labels = cluster_labels.mask(
        cond=cluster_labels == 0)  # turn 0 to NaN
    cluster_labels.index = X.index  # let labels have same index as input data
    cluster_labels.name = output_colname

    return cluster_labels
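
A minimal usage sketch for find_clusters, assuming xpd resolves to plain pandas (cudf works the same way); the coordinates and parameter values below are made up for illustration:

# Hypothetical example: two dense blobs of X/Y/Z points plus scattered noise.
import numpy as np
import pandas as pd

rng = np.random.default_rng(42)
blob_a = rng.normal(loc=(0, 0, 0), scale=500, size=(500, 3))
blob_b = rng.normal(loc=(20000, 20000, 0), scale=500, size=(500, 3))
noise = rng.uniform(low=-50000, high=50000, size=(50, 3))
points = pd.DataFrame(np.vstack([blob_a, blob_b, noise]),
                      columns=["x", "y", "z"])

labels = find_clusters(X=points, eps=3000, min_samples=250)
print(labels.value_counts(dropna=False))  # two clusters plus NaN noise points
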
def dbscan(model,
           counts_per_word,
           embeddings=None,
           sim_thresh=0.8,
           min_samples=5,
           min_occs=1000,
           verbose=False):

    if embeddings is None:

        #print('COUNTS PER WORD:', counts_per_word[:, 1])

        # Keep only hashtags with more than min_occs occurrences
        nb_to_keep = np.argmax(counts_per_word[:, 1].astype(int) < min_occs)
        if nb_to_keep == 0:
            raise Exception(
                f'dbscan : No word with more than {min_occs} occurrences')

        # Create fit data
        #model_words = set(model.wv.vocab.keys())
        model_words = set(model.wv.index_to_key)
        words_kept = np.array([
            word for word, count in counts_per_word[:nb_to_keep]
            if word in model_words
        ])
        X = np.array([model.wv[w] for w in words_kept])

    else:
        X = embeddings
        #print('X2 :', X)
        words_kept = np.arange(len(X)).astype(str)

    # cosine DBScan
    clustering = DBSCAN(eps=1 - sim_thresh,
                        min_samples=min_samples,
                        metric='cosine').fit(X)
    clust_labels = clustering.labels_


    if verbose:

        print(np.bincount(clust_labels + 1)[1:])

        for e in range(clust_labels.max() + 1):
            print(f"Topic {e} :")
            tags = np.array(counts_per_word)[:len(clust_labels)][clust_labels
                                                                 == e]
            for tag in tags:
                print(f"\t{tag}")

    return clust_labels, words_kept
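
A rough usage sketch for the helper above. The word2vec model file, the hashtag counts and min_samples=2 are invented for this toy example; counts_per_word is assumed to be an array of (word, count) pairs sorted by decreasing count, and the module-level numpy/DBSCAN imports are assumed to be in place.

# Hypothetical inputs: a pre-trained gensim Word2Vec model and hashtag counts
# sorted from most to least frequent (the format the function expects).
import numpy as np
from gensim.models import Word2Vec

model = Word2Vec.load("hashtag_word2vec.model")  # hypothetical model file
counts_per_word = np.array([
    ["#sports", "54000"],
    ["#football", "32000"],
    ["#music", "21000"],
    ["#rock", "800"],                            # below min_occs, discarded
])

# The kept hashtags are assumed to be in the model vocabulary; -1 marks noise.
labels, words = dbscan(model, counts_per_word,
                       sim_thresh=0.8, min_samples=2, min_occs=1000)
for word, label in zip(words, labels):
    print(word, label)
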
Example #3
def test_cuml_fit_clusters():
    import cudf
    from cuml.cluster import DBSCAN

    # Create and populate a GPU DataFrame
    gdf_float = cudf.DataFrame()
    gdf_float['0'] = [1.0, 2.0, 5.0]
    gdf_float['1'] = [4.0, 2.0, 1.0]
    gdf_float['2'] = [4.0, 2.0, 1.0]

    # Setup and fit clusters
    dbscan_float = DBSCAN(eps=1.0, min_samples=1)
    dbscan_float.fit(gdf_float)

    print(dbscan_float.labels_)
Example #4
    def test_dbscan(self):
        import cudf
        from cuml.cluster import DBSCAN

        # Create and populate a GPU DataFrame
        gdf_float = cudf.DataFrame()
        gdf_float['0'] = [1.0, 2.0, 5.0]
        gdf_float['1'] = [4.0, 2.0, 1.0]
        gdf_float['2'] = [4.0, 2.0, 1.0]

        # Setup and fit clusters
        dbscan_float = DBSCAN(eps=1.0, min_samples=1)
        dbscan_float.fit(gdf_float)

        self.assertEqual(3, dbscan_float.labels_.size)
Example #5
def dbscan_gpu(feature, max_neighbor_dist):
    """
    :param feature: array of shape num_samples x num_features
    :param max_neighbor_dist: the maximum distance between two samples for one
        to be considered as in the neighborhood of the other (DBSCAN's eps)
    :return: numpy array of cluster labels, one per sample
    """
    from cupy import asnumpy
    from cuml.cluster import DBSCAN
    db = DBSCAN(eps=max_neighbor_dist, min_samples=1).fit(feature)
    return asnumpy(db.labels_.values)
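
A small usage sketch, with random features standing in for real embeddings and cudf.from_pandas used so that labels_ comes back as a cudf Series, as the return statement expects:

# Hypothetical call on 1000 random 128-dimensional feature vectors.
import numpy as np
import pandas as pd
import cudf

feats = cudf.from_pandas(
    pd.DataFrame(np.random.rand(1000, 128).astype(np.float32)))
labels = dbscan_gpu(feats, max_neighbor_dist=0.5)
print(len(np.unique(labels)), "clusters")  # min_samples=1, so no noise label
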
Example #6
class T5_DBSCAN:
    def __init__(self,
                 ndims=5,
                 nn=25,
                 eps=0.1,
                 minSamples=25,
                 coreSamples=True,
                 verbose=False):
        self.logger = logging.getLogger("T5Clustering")
        logging.basicConfig(format='%(asctime)s %(levelname)s: %(message)s',
                            level=logging.DEBUG)
        self.reducer = DimReducer(n_components=ndims, n_neighbors=nn)
        gpu_mem(0)
        self.train_data = None
        self.test_data = None
        self.data = None
        self.clusterer = DBSCAN(eps=eps,
                                min_samples=minSamples,
                                verbose=verbose,
                                calc_core_sample_indices=coreSamples,
                                output_type='cudf')

    def split(self, fname, sampling=True, train_size=1000000, test_size=None):
        self.train_data, self.test_data = read(fname,
                                               sampling=sampling,
                                               train_size=train_size,
                                               test_size=test_size)
        return self

    def reduce(self, train_data):
        self.logger.info("Dimensionality reduction (UMAP): %s",
                         self.reducer.reducer.get_params)
        self.data = self.reducer.fit_transform(train_data)
        return self

    def cluster(self):
        self.logger.info("Clustering ... (DBSCAN)")
        gpu_mem(0)
        self.clusterer.fit(self.data)
        self.logger.info("Clusters: %d, Core samples: %d",
                         self.clusterer.labels_.max(),
                         len(self.clusterer.core_sample_indices_))
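
A sketch of how the class above is meant to be chained; the embeddings path is made up, and DimReducer, gpu_mem and read are the helpers assumed elsewhere in this module:

# Hypothetical pipeline run: load embeddings, reduce with UMAP, cluster on GPU.
pipeline = T5_DBSCAN(ndims=5, nn=25, eps=0.1, minSamples=25)
pipeline.split("embeddings.npz", sampling=True, train_size=1000000)
pipeline.reduce(pipeline.train_data)
pipeline.cluster()
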
Example #8
def _find_clusters_dbscan(points, eps, min_samples):
    # ball_tree algorithm appears to run about 30-40% faster based on a single
    # test orbit and (vx, vy), run on a laptop, improving from 300ms to 180ms.
    #
    # Runtime is not very sensitive to leaf_size, but 30 appears to be roughly
    # optimal, and is the default value anyway.
    db = DBSCAN(
        eps=eps,
        min_samples=min_samples,
        algorithm="ball_tree",
        leaf_size=30,
    )
    db.fit(points)

    cluster_labels = np.unique(db.labels_[np.where(db.labels_ != -1)])
    clusters = []
    for label in cluster_labels:
        cluster_indices = np.where(db.labels_ == label)[0]
        clusters.append(cluster_indices)
    del db
    return clusters
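
For reference, a minimal call with synthetic 2D points; shapes and values are illustrative only, and DBSCAN is assumed to be scikit-learn's (ball_tree is a scikit-learn algorithm option):

# Hypothetical input: two tight groups of points plus one outlier.
import numpy as np

points = np.array([
    [0.00, 0.00], [0.01, 0.01], [0.02, 0.00],   # first group
    [1.00, 1.00], [1.01, 1.02], [1.02, 1.01],   # second group
    [5.00, 5.00],                               # noise
])
clusters = _find_clusters_dbscan(points, eps=0.1, min_samples=2)
print([c.tolist() for c in clusters])           # e.g. [[0, 1, 2], [3, 4, 5]]
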
Example #9
def clusterVelocity(
    obs_ids,
    x,
    y,
    dt,
    vx,
    vy,
    eps=0.005,
    min_samples=5,
    min_arc_length=1.0,
):
    """
    Clusters THOR projections with different velocities
    in the projection plane using `~sklearn.cluster.DBSCAN`.

    Parameters
    ----------
    obs_ids : `~numpy.ndarray` (N)
        Observation IDs.
    x : `~numpy.ndarray` (N)
        Projection space x coordinate in degrees or radians.
    y : `~numpy.ndarray` (N)
        Projection space y coordinate in degrees or radians.
    dt : `~numpy.ndarray` (N)
        Change in time from the 0th exposure in units of MJD.
    vx : `~numpy.ndarray` (N)
        Projection space x velocity in units of degrees or radians per day (MJD).
    vy : `~numpy.ndarray` (N)
        Projection space y velocity in units of degrees or radians per day (MJD).
    eps : float, optional
        The maximum distance between two samples for them to be considered 
        as in the same neighborhood. 
        See: http://scikit-learn.org/stable/modules/generated/sklearn.cluster.dbscan.html
        [Default = 0.005]
    min_samples : int, optional
        The number of samples (or total weight) in a neighborhood for a 
        point to be considered as a core point. This includes the point itself.
        See: http://scikit-learn.org/stable/modules/generated/sklearn.cluster.dbscan.html
        [Default = 5]
    min_arc_length : float, optional
        Minimum arc length in units of days for a cluster to be accepted. 
        
    Returns
    -------
    list
        If clusters are found, will return a list of numpy arrays containing the 
        observation IDs for each cluster. If no clusters are found, will return np.NaN.
    """
    xx = x - vx * dt
    yy = y - vy * dt
    if USE_GPU:
        kwargs = {}
    else:
        kwargs = {"n_jobs": 1}

    X = np.vstack([xx, yy]).T

    db = DBSCAN(eps=eps, min_samples=min_samples, **kwargs)
    db.fit(X)

    clusters = db.labels_[np.where(db.labels_ != -1)[0]]
    cluster_ids = []
    if len(clusters) != 0:
        for cluster in np.unique(clusters):
            cluster_mask = np.where(db.labels_ == cluster)[0]

            dt_in_cluster = dt[cluster_mask]
            num_obs = len(dt_in_cluster)
            arc_length = dt_in_cluster.max() - dt_in_cluster.min()
            if (num_obs == len(np.unique(dt_in_cluster))) and (
                    num_obs >= min_samples) and (arc_length >= min_arc_length):
                cluster_ids.append(obs_ids[cluster_mask])

    logger.debug(
        f"cluster: vx={vx} vy={vy} n_obs={len(obs_ids)} n_cluster={len(cluster_ids)}",
    )
    if len(cluster_ids) == 0:
        cluster_ids = np.NaN

    del db
    return cluster_ids
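
A toy call with made-up observation IDs and coordinates, just to show the expected argument shapes; the module-level DBSCAN, USE_GPU and logger are assumed to be defined:

# Hypothetical inputs: five observations of one object moving at (vx, vy).
import numpy as np

obs_ids = np.array(["obs_0", "obs_1", "obs_2", "obs_3", "obs_4"])
dt = np.array([0.0, 1.0, 2.0, 3.0, 4.0])   # days since the 0th exposure
vx, vy = 0.01, -0.02                        # degrees per day
x = 10.0 + vx * dt
y = -5.0 + vy * dt

clusters = clusterVelocity(obs_ids, x, y, dt, vx, vy,
                           eps=0.005, min_samples=5, min_arc_length=1.0)
print(clusters)  # a list with one array of all five obs_ids, or np.NaN
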
Example #10
train_data, test_data = read(
    "/mnt/m2-1/cw09b.dh.1k.spam.70.dochits.tier.5.t5-base.embeddings.0.npz",
    sampling=True,
    train_size=1000000,
    test_size=None)

#  reduce dimensions
ndims = 5
nn = 25
logger.info("Dimensionality reduction: 768 to %d... (UMAP)", nn)
reducer = DimReducer(n_components=ndims, n_neighbors=nn)
data = reducer.fit_transform(train_data)

# cluster
eps = 0.05
minSamples = 25
logger.info("Clustering ... (DBSCAN)")
dbscan = DBSCAN(
    eps=eps,
    min_samples=minSamples,
    verbose=False,
    # max_mbytes_per_batch=4096,
    calc_core_sample_indices=True,
    output_type='cudf')
dbscan.fit(data)
gpu_mem(0)

print(dbscan.labels_.max())
# print(len(dbscan.core_sample_indices_))
# print(dbscan.core_sample_indices_)
def dbscan_gpu(model,
               counts_per_word,
               embeddings=None,
               sim_thresh=0.8,
               min_samples=5,
               min_occs=1000,
               verbose=False,
               s2v=False):

    if embeddings is None:

        #print('COUNTS PER WORD:', counts_per_word[:, 1])

        # Keep only hashtags with more than min_occs occurrences
        nb_to_keep = np.argmax(counts_per_word[:, 1].astype(int) < min_occs)
        if nb_to_keep == 0:
            raise Exception(
                f'dbscan : No word with more than {min_occs} occurrences')

        # Create fit data
        #model_words = set(model.wv.vocab.keys())
        if not s2v:
            model_words = set(model.wv.index_to_key)
        else:
            model_words = set(model.keys())

        words_kept = np.array([
            word for word, count in counts_per_word[:nb_to_keep]
            if word in model_words
        ])
        #print('1- len(words_kept) :', len(words_kept))
        X = cudf.DataFrame()

        if s2v:
            transposed = np.array([model[w] for w in words_kept]).transpose()
        else:
            transposed = np.array([model.wv[w]
                                   for w in words_kept]).transpose()

        for e, v in enumerate(transposed):
            X[e] = v
        X = pairwise_distances(X, metric='cosine')

    else:
        X = cudf.DataFrame()
        for e, v in enumerate(embeddings.transpose()):
            X[e] = v
        X = pairwise_distances(X, metric='cosine')
        words_kept = np.arange(len(embeddings)).astype(str)
        #print('2- len(words_kept) :', len(words_kept))

    # cosine DBScan
    #clustering = DBSCAN(eps=1-sim_thresh, min_samples=min_samples, metric='cosine').fit(X)
    #clust_labels = clustering.labels_

    # Fit GPU DBSCAN on the precomputed cosine distance matrix
    clustering = DBSCAN(eps=1 - sim_thresh,
                        min_samples=min_samples,
                        metric="precomputed").fit(X)
    clust_labels = clustering.labels_.to_array()
    #print('labels :', clust_labels)
    #.to_pandas().values
    #print('len(clust_labels) :', len(clust_labels))

    if verbose:

        print(np.bincount(clust_labels + 1)[1:])

        for e in range(clust_labels.max() + 1):
            print(f"Topic {e} :")
            tags = np.array(counts_per_word)[:len(clust_labels)][clust_labels
                                                                 == e]
            for tag in tags:
                print(f"\t{tag}")

    return clust_labels, words_kept
def make_clusters_DBSCAN(dat_to_cluster, eps):
    estimator = DBSCAN(eps=eps, min_samples=3)
    res = estimator.fit_predict(dat_to_cluster)
    return res, estimator
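
And a quick illustrative call for the last helper, assuming scikit-learn's DBSCAN is imported at module level:

# Hypothetical 2D data: two obvious groups and one stray point.
import numpy as np

data = np.array([[0, 0], [0, 1], [1, 0],
                 [10, 10], [10, 11], [11, 10],
                 [30, 30]])
labels, est = make_clusters_DBSCAN(data, eps=2.0)
print(labels)  # e.g. [0 0 0 1 1 1 -1]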