Exemple #1
0
def fast_dbscan(X,
                eps=0.5,
                min_samples=5,
                n_clusters=None,
                metric='euclidean',
                algorithm='brute',
                random_state: int = 1,
                framework: Literal['auto', 'cuml', 'sklearn'] = 'auto'):
    r""" Fit a DBSCAN (Density-Based Spatial Clustering of Applications
    with Noise) estimator on `X` and return it.

    DBSCAN finds core samples of high density and expands clusters from
    them. It works well when the data contains clusters of similar density,
    i.e. when the datapoints tend to congregate in larger groups.

    Arguments:
      X : the data handed to `DBSCAN.fit`.
      eps : float, default=0.5
        The maximum distance between two samples for one to be considered
        as in the neighborhood of the other. This is not a maximum bound
        on the distances of points within a cluster. This is the most
        important DBSCAN parameter to choose appropriately for your data set
        and distance function.
      min_samples : int, default=5
        The number of samples (or total weight) in a neighborhood for a point
        to be considered as a core point. This includes the point itself.
      n_clusters : default=None
        Accepted but not forwarded to the estimator.
      metric : default='euclidean'
        Forwarded to the estimator only on the scikit-learn path; dropped
        on the cuML path.
      algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='brute'
        The algorithm to be used by the NearestNeighbors module
        to compute pointwise distances and find nearest neighbors.
        See NearestNeighbors module documentation for details.
        Forwarded only on the scikit-learn path; dropped on the cuML path.
      random_state : int, default=1
        Accepted but not forwarded to the estimator.
      framework : {'auto', 'cuml', 'sklearn'}, default='auto'
        Passed to `_check_cuml`; when that returns a true value the cuML
        implementation is used, otherwise scikit-learn's.

    Returns:
      The fitted `DBSCAN` instance, with two attributes attached after
      fitting: `_fitid` (the `id` of `X`) and `predict` (the module's
      `dbscan_predict` bound as a method).
    """
    # Only these two arguments are shared by both backends. `n_clusters` and
    # `random_state` are part of the signature but never reach the estimator.
    estimator_args = dict(eps=eps, min_samples=min_samples)
    ## selecting the backend
    if _check_cuml(framework):
        from cuml.cluster import DBSCAN
    else:
        from sklearn.cluster import DBSCAN
        # `metric` and `algorithm` are only passed to the scikit-learn class.
        estimator_args.update(metric=metric, algorithm=algorithm)
    ## fitting
    model = DBSCAN(**estimator_args)
    model.fit(X)
    # Remember which object the model was fitted on.
    # NOTE(review): presumably `dbscan_predict` compares against `_fitid` to
    # recognise the training data - confirm against its definition.
    model._fitid = id(X)
    # Attach the module-level `dbscan_predict` as a bound `predict` method.
    model.predict = types.MethodType(dbscan_predict, model)
    return model