Example no. 1
    def fit(self, X, y=None, sample_weight=None):
        X = [x for x in X if x[0] in self.w2v_model.wv]
        self.tags, self.reviews_num = zip(*X)
        self.X = np.array([self.w2v_model.wv[t] for t in self.tags])
        X = self.X

        # X = self._validate_data(X, accept_sparse='csr')

        if not self.eps > 0.0:
            raise ValueError("eps must be positive.")

        if sample_weight is not None:
            sample_weight = _check_sample_weight(sample_weight, X)

        # Calculate the neighborhood of every sample. This leaves the original
        # point in, which needs to be considered later (i.e. point i is in the
        # neighborhood of point i; while true, this is useless information).
        if self.metric == 'precomputed' and sparse.issparse(X):
            # set the diagonal to explicit values, as a point is its own
            # neighbor
            with warnings.catch_warnings():
                warnings.simplefilter('ignore', sparse.SparseEfficiencyWarning)
                X.setdiag(X.diagonal())  # XXX: modifies X's internals in-place

        neighbors_model = NearestNeighbors(
            radius=self.eps, algorithm=self.algorithm,
            leaf_size=self.leaf_size, metric=self.metric,
            metric_params=self.metric_params, p=self.p, n_jobs=self.n_jobs)
        neighbors_model.fit(X)
        # This has worst case O(n^2) memory complexity
        neighborhoods = neighbors_model.radius_neighbors(X,
                                                         return_distance=False)

        if sample_weight is None:
            n_neighbors = np.array([len(neighbors)
                                    for neighbors in neighborhoods])
        else:
            n_neighbors = np.array([np.sum(sample_weight[neighbors])
                                    for neighbors in neighborhoods])

        # Initially, all samples are noise.
        labels = np.full(X.shape[0], -1, dtype=np.intp)

        # A list of all core samples found.
        core_samples = np.asarray(n_neighbors >= self.min_samples,
                                  dtype=np.uint8)
        dbscan_inner(core_samples, neighborhoods, labels)

        self.core_sample_indices_ = np.where(core_samples)[0]
        self.labels_ = labels

        if len(self.core_sample_indices_):
            # fix for scipy sparse indexing issue
            self.components_ = X[self.core_sample_indices_].copy()
        else:
            # no core samples
            self.components_ = np.empty((0, X.shape[1]))
        return self
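A small, self-contained sketch of the preprocessing step in the fit method above: keep only the (tag, review-count) pairs whose tag exists in a trained word2vec vocabulary and stack their vectors. The toy corpus, variable names, and vector size are illustrative assumptions (gensim 4.x API), not part of the original code.

import numpy as np
from gensim.models import Word2Vec

# Tiny toy corpus; vector_size / min_count / seed are arbitrary choices.
sentences = [["python", "clustering"], ["dbscan", "python"], ["word2vec", "dbscan"]]
w2v = Word2Vec(sentences, vector_size=16, min_count=1, seed=0)

tag_counts = [("python", 12), ("dbscan", 7), ("not-in-vocab", 3)]
kept = [tc for tc in tag_counts if tc[0] in w2v.wv]   # drop out-of-vocabulary tags
tags, review_counts = zip(*kept)
X_emb = np.array([w2v.wv[t] for t in tags])           # shape (n_kept_tags, 16)
print(tags, X_emb.shape)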
Example no. 2
    def fit(self, dataFrame: pd.DataFrame, spatialFeatures: list,
            temporalFeatures: list):
        '''Fit a spatio-temporal DBSCAN-style clustering: each point's
        neighborhood is the intersection of its eps1 spatial neighborhood and
        its eps2 temporal neighborhood.'''
        if (len(spatialFeatures) <= 0 or len(temporalFeatures) <= 0):
            raise ValueError(
                "spatialFeatures and temporalFeatures must be lists with length greater than 0"
            )

        nnEps1 = NearestNeighbors(radius=self.eps1,
                                  algorithm="auto",
                                  leaf_size=30,
                                  metric=self.metric1,
                                  metric_params=self.metric1Params,
                                  p=None,
                                  n_jobs=None)

        nnEps2 = NearestNeighbors(radius=self.eps2,
                                  algorithm="auto",
                                  leaf_size=30,
                                  metric=self.metric2,
                                  metric_params=self.metric2Params,
                                  p=None,
                                  n_jobs=None)

        spatialDf = dataFrame[spatialFeatures]
        temporalDf = dataFrame[temporalFeatures]
        nnEps1.fit(spatialDf)
        nnEps2.fit(temporalDf)

        eps1Neighborhoods = nnEps1.radius_neighbors(spatialDf,
                                                    return_distance=False)
        eps2Neighborhoods = nnEps2.radius_neighbors(temporalDf,
                                                    return_distance=False)

        # Intersection of 2 neighborhoods
        neighborhoods = []
        for eps1Neighbors, eps2Neighbors in zip(eps1Neighborhoods,
                                                eps2Neighborhoods):
            intersection = np.intersect1d(eps1Neighbors, eps2Neighbors)
            neighborhoods.append(intersection)
        neighborhoods = np.array(neighborhoods, dtype=object)

        # All samples are noise in the beginning
        labels = np.full(dataFrame.shape[0], -1, dtype=np.intp)

        # Core points have at least minSamples neighbors in the combined neighborhood.
        neighborsCounts = np.array(
            [len(neighborhood) for neighborhood in neighborhoods])
        corePoints = np.asarray(neighborsCounts >= self.minSamples,
                                dtype=np.uint8)

        dbscan_inner(corePoints, neighborhoods, labels)
        self.labels = labels
        self.neighborhoods = neighborhoods
        self.corePoints = corePoints
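A runnable sketch of the key step above: a point's spatio-temporal neighborhood is the intersection of its eps1 neighborhood over the spatial features and its eps2 neighborhood over the temporal features. The data, radii, and variable names below are illustrative only.

import numpy as np
from sklearn.neighbors import NearestNeighbors

rng = np.random.default_rng(0)
spatial = rng.random((50, 2))     # e.g. x, y coordinates
temporal = rng.random((50, 1))    # e.g. timestamps

hood_s = NearestNeighbors(radius=0.2).fit(spatial).radius_neighbors(
    spatial, return_distance=False)
hood_t = NearestNeighbors(radius=0.1).fit(temporal).radius_neighbors(
    temporal, return_distance=False)

# Point i's combined neighborhood: indices close in *both* space and time.
hoods = [np.intersect1d(a, b) for a, b in zip(hood_s, hood_t)]
print(len(hoods[0]), "combined neighbors of point 0")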
Example no. 3
def dbscan(X,
           eps=0.5,
           minpts=5,
           metric='minkowski',
           algorithm='auto',
           leaf_size=30,
           p=2,
           sample_weight=None,
           n_jobs=1):

    X = check_array(X, accept_sparse='csr')
    if sample_weight is not None:
        sample_weight = np.asarray(sample_weight)
        check_consistent_length(X, sample_weight)

    if metric == 'precomputed' and sparse.issparse(X):
        neighborhoods = np.empty(X.shape[0], dtype=object)
        X.sum_duplicates()  # XXX: modifies X's internals in-place
        X_mask = X.data <= eps
        masked_indices = X.indices.astype(np.intp, copy=False)[X_mask]
        masked_indptr = np.cumsum(X_mask)[X.indptr[1:] - 1]
        # insert the diagonal: a point is its own neighbor, but 0 distance
        # means absence from sparse matrix data
        masked_indices = np.insert(masked_indices, masked_indptr,
                                   np.arange(X.shape[0]))
        masked_indptr = masked_indptr[:-1] + np.arange(1, X.shape[0])
        # split into rows
        neighborhoods[:] = np.split(masked_indices, masked_indptr)
    else:
        neighbors_model = NearestNeighbors(radius=eps,
                                           algorithm=algorithm,
                                           leaf_size=leaf_size,
                                           metric=metric,
                                           p=p,
                                           n_jobs=n_jobs)
        neighbors_model.fit(X)
        # This has worst case O(n^2) memory complexity
        neighborhoods = neighbors_model.radius_neighbors(X,
                                                         eps,
                                                         return_distance=False)
    if sample_weight is None:
        n_neighbors = np.array([len(neighbors) for neighbors in neighborhoods])
    else:
        n_neighbors = np.array(
            [np.sum(sample_weight[neighbors]) for neighbors in neighborhoods])

    # Initially, all samples are noise.
    labels = -np.ones(X.shape[0], dtype=np.intp)

    # A list of all core samples found.
    core_samples = np.asarray(n_neighbors >= minpts, dtype=np.uint8)
    dbscan_inner(core_samples, neighborhoods, labels)
    return np.where(core_samples)[0], labels
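A minimal usage sketch for the standalone dbscan() above, assuming the names it relies on (check_array, sparse, NearestNeighbors, dbscan_inner) are imported at module level as in scikit-learn's own DBSCAN module. Data and parameters are illustrative.

import numpy as np
from sklearn.datasets import make_blobs

X, _ = make_blobs(n_samples=200, centers=3, cluster_std=0.4, random_state=0)
core_idx, labels = dbscan(X, eps=0.3, minpts=5)
print("clusters:", labels.max() + 1, "noise points:", int((labels == -1).sum()))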
Example no. 4
def variable_eps_DBSCAN(X, eps_array, min_samples=5):
    """ Density-Based Spatial Clustering of Applications with Noise

	Parameters
	----------
	X : array[float, float], shape=(n_samples,n_features)
		Similarity matrix

	eps_array : array[float], shape=(n_samples)
		The maximum distance between two points for them to be considered 
		to be in the same neighborhood, applied locally.

	Returns
	--------
	cluster_centers : array, shape=[n_clusters, n_features]
		Coordinates of cluster centers.

	labels : array, shape=[n_samples]
		Cluster labels for each point.

	Notes
	-----
	Code adapted from scikit-learn library 
	"""
    # Calculate the neighborhood of every sample. This leaves the original point
    # in, which needs to be considered later (i.e. point i is in the
    # neighborhood of point i; while true, this is useless information).
    neighborhoods = np.empty(X.shape[0], dtype=object)
    for i, x in enumerate(X):
        neighborhoods[i] = np.where(x <= eps_array[i])[0]

    n_neighbors = np.array([len(neighbors) for neighbors in neighborhoods])

    # Initially, all samples are noise.
    labels = -np.ones(X.shape[0], dtype=np.intp)

    # A list of all core samples found.
    core_samples = np.asarray(n_neighbors >= min_samples, dtype=np.uint8)
    dbscan_inner(core_samples, neighborhoods, labels)

    return np.where(core_samples)[0], labels
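An illustrative call to variable_eps_DBSCAN on a small dense distance matrix, assuming dbscan_inner is importable (e.g. from sklearn.cluster._dbscan_inner). Here eps is constant for clarity, but eps_array may vary point by point, e.g. with local density.

import numpy as np
from scipy.spatial.distance import pdist, squareform

pts = np.random.RandomState(0).rand(100, 2)
D = squareform(pdist(pts))                 # dense (n_samples, n_samples) distances
eps_per_point = np.full(len(pts), 0.1)     # constant here; may differ per point
core_idx, labels = variable_eps_DBSCAN(D, eps_per_point, min_samples=5)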
Example no. 5
def cluster(distances_filename: str, metadata_filename: str):
    """
    DBSCAN clustering of the embeddings based on a pairwise distance matrix.

    Parameters
    ----------
    distances_filename : str
        Precomputed pairwise distance matrix file to use for the DBSCAN
        clustering.
    metadata_filename : str
        Metadata file with precursor m/z information for all embeddings.
    """
    clusters_filename = (distances_filename.replace('dist_',
                                                    'clusters_').replace(
                                                        '.npz', '.npy'))
    if os.path.isfile(clusters_filename):
        return

    # DBSCAN clustering of the embeddings.
    logger.info(
        'DBSCAN clustering (eps=%.4f, min_samples=%d) of precomputed '
        'pairwise distance matrix %s', config.eps, config.min_samples,
        distances_filename)
    # Reimplement DBSCAN preprocessing to avoid unnecessary memory consumption.
    dist = ss.load_npz(distances_filename)
    dist_data, dist_indices, dist_indptr = dist.data, dist.indices, dist.indptr
    num_embeddings = dist.shape[0]
    # Find the eps-neighborhoods for all points.
    logger.debug('Find the eps-neighborhoods for all points (eps=%.4f)',
                 config.eps)
    mask = dist_data <= config.eps
    # noinspection PyTypeChecker
    indptr = _cumsum(mask)[dist_indptr]
    indices = dist_indices[mask].astype(np.intp, copy=False)
    neighborhoods = np.split(indices, indptr[1:-1])
    # Initially, all samples are noise.
    # (Memmap for shared memory multiprocessing.)
    cluster_labels = np.lib.format.open_memmap(clusters_filename,
                                               mode='w+',
                                               dtype=np.intp,
                                               shape=(num_embeddings, ))
    cluster_labels.fill(-1)
    # A list of all core samples found.
    n_neighbors = np.fromiter(map(len, neighborhoods), np.uint32)
    core_samples = n_neighbors >= config.min_samples
    # Run Scikit-Learn DBSCAN.
    logger.debug('Run Scikit-Learn DBSCAN inner.')
    neighborhoods_arr = np.empty(len(neighborhoods), dtype=object)
    neighborhoods_arr[:] = neighborhoods
    dbscan_inner(core_samples, neighborhoods_arr, cluster_labels)

    # Free up memory by deleting DBSCAN-related data structures.
    del dist, dist_data, dist_indices, dist_indptr, mask, indptr, indices
    del neighborhoods, n_neighbors, core_samples, neighborhoods_arr
    gc.collect()

    # Refine initial clusters to make sure spectra within a cluster don't have
    # an excessive precursor m/z difference.
    precursor_mzs = (pd.read_parquet(
        metadata_filename, columns=['mz']).squeeze().values.astype(np.float32))
    logger.debug('Sort cluster labels in ascending order.')
    order = np.argsort(cluster_labels)
    reverse_order = np.argsort(order)
    cluster_labels[:] = cluster_labels[order]
    precursor_mzs = precursor_mzs[order]
    logger.debug(
        'Finetune %d initial cluster assignments to not exceed %.2f %s '
        'precursor m/z tolerance', cluster_labels[-1] + 1,
        config.precursor_tol_mass, config.precursor_tol_mode)
    if cluster_labels[-1] == -1:  # Only noise samples.
        cluster_labels.fill(-1)
    else:
        group_idx = nb.typed.List(_get_cluster_group_idx(cluster_labels))
        n_clusters = nb.typed.List(
            joblib.Parallel(n_jobs=-1)(joblib.delayed(_postprocess_cluster)(
                cluster_labels[start_i:stop_i], precursor_mzs[start_i:stop_i],
                config.precursor_tol_mass, config.precursor_tol_mode,
                config.min_samples) for start_i, stop_i in group_idx))
        _assign_unique_cluster_labels(cluster_labels, group_idx, n_clusters,
                                      config.min_samples)
        cluster_labels[:] = cluster_labels[reverse_order]
    cluster_labels.flush()
    logger.debug('%d unique clusters after precursor m/z finetuning',
                 np.amax(cluster_labels) + 1)
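A toy illustration of the sparse preprocessing above: keep entries whose distance is at most eps and split the surviving column indices into one neighborhood per row. This assumes _cumsum behaves like np.cumsum with a prepended zero; the matrix values are made up, and the stored-zero caveat (a point's zero self-distance is absent from the CSR data) applies here as well.

import numpy as np
import scipy.sparse as ss

eps = 0.3
dist = ss.csr_matrix(np.array([[0.0, 0.2, 0.9],
                               [0.2, 0.0, 0.25],
                               [0.9, 0.25, 0.0]]))
mask = dist.data <= eps
indptr = np.concatenate(([0], np.cumsum(mask)))[dist.indptr]
indices = dist.indices[mask].astype(np.intp, copy=False)
neighborhoods = np.split(indices, indptr[1:-1])
print(neighborhoods)   # per-row indices of points within eps (self excluded)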
Example no. 6
def dbscan(
    X,
    Core=[],
    eps=0.5,
    min_samples=5,
    metric='minkowski',
    algorithm='kd_tree',
    leaf_size=30,
    p=2,
    sample_weight=None,
    n_jobs=1,
):
    """Perform DBSCAN clustering from vector array or distance matrix.

    Read more in the :ref:`User Guide <dbscan>`.

    Parameters
    ----------
    X : array or sparse (CSR) matrix of shape (n_samples, n_features), or \
            array of shape (n_samples, n_samples)
        A feature array, or array of distances between samples if
        ``metric='precomputed'``.

    Core : list of int, optional
        Indices of user-supplied seed core samples. If non-empty, core samples
        are derived from these seeds via ``expand_core`` instead of the usual
        density (``min_samples``) criterion.

    eps : float, optional
        The maximum distance between two samples for them to be considered
        as in the same neighborhood.

    min_samples : int, optional
        The number of samples (or total weight) in a neighborhood for a point
        to be considered as a core point. This includes the point itself.

    metric : string, or callable
        The metric to use when calculating distance between instances in a
        feature array. If metric is a string or callable, it must be one of
        the options allowed by metrics.pairwise.pairwise_distances for its
        metric parameter.
        If metric is "precomputed", X is assumed to be a distance matrix and
        must be square. X may be a sparse matrix, in which case only "nonzero"
        elements may be considered neighbors for DBSCAN.

    algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, optional
        The algorithm to be used by the NearestNeighbors module
        to compute pointwise distances and find nearest neighbors.
        See NearestNeighbors module documentation for details.

    leaf_size : int, optional (default = 30)
        Leaf size passed to BallTree or cKDTree. This can affect the speed
        of the construction and query, as well as the memory required
        to store the tree. The optimal value depends
        on the nature of the problem.

    p : float, optional
        The power of the Minkowski metric to be used to calculate distance
        between points.

    sample_weight : array, shape (n_samples,), optional
        Weight of each sample, such that a sample with a weight of at least
        ``min_samples`` is by itself a core sample; a sample with negative
        weight may inhibit its eps-neighbor from being core.
        Note that weights are absolute, and default to 1.

    n_jobs : int, optional (default = 1)
        The number of parallel jobs to run for neighbors search.
        If ``-1``, then the number of jobs is set to the number of CPU cores.

    Returns
    -------
    core_samples : array [n_core_samples]
        Indices of core samples.

    labels : array [n_samples]
        Cluster labels for each point.  Noisy samples are given the label -1.

    Notes
    -----
    See examples/cluster/plot_dbscan.py for an example.

    This implementation bulk-computes all neighborhood queries, which increases
    the memory complexity to O(n.d) where d is the average number of neighbors,
    while original DBSCAN had memory complexity O(n).

    Sparse neighborhoods can be precomputed using
    :func:`NearestNeighbors.radius_neighbors_graph
    <sklearn.neighbors.NearestNeighbors.radius_neighbors_graph>`
    with ``mode='distance'``.

    References
    ----------
    Ester, M., H. P. Kriegel, J. Sander, and X. Xu, "A Density-Based
    Algorithm for Discovering Clusters in Large Spatial Databases with Noise".
    In: Proceedings of the 2nd International Conference on Knowledge Discovery
    and Data Mining, Portland, OR, AAAI Press, pp. 226-231. 1996
    """
    if not eps > 0.0:
        raise ValueError("eps must be positive.")

    X = check_array(X, accept_sparse='csr')
    if sample_weight is not None:
        sample_weight = np.asarray(sample_weight)
        check_consistent_length(X, sample_weight)

    # print ("sample_weight")
    # print(sample_weight)

    neighbors_model = NearestNeighbors(radius=eps,
                                       algorithm=algorithm,
                                       leaf_size=leaf_size,
                                       metric=metric,
                                       p=p,
                                       n_jobs=n_jobs)
    neighbors_model.fit(X)
    # This has worst case O(n^2) memory complexity
    neighborhoods = neighbors_model.radius_neighbors(X,
                                                     eps,
                                                     return_distance=False)
    # print ("neighborhoods")
    # print(neighborhoods)

    if sample_weight is None:
        n_neighbors = np.array([len(neighbors) for neighbors in neighborhoods])
        #print('n_neighbors')
        #print (n_neighbors)
    else:
        # print("sample_weight[neighborhoods[1]]")
        # print (sample_weight[neighborhoods[1]])
        n_neighbors = np.array(
            [np.sum(sample_weight[neighbors]) for neighbors in neighborhoods])

    # Initially, all samples are noise.
    labels = -np.ones(X.shape[0], dtype=np.intp)

    # A list of all core samples found.
    core_samples = np.zeros(X.shape[0], dtype=np.uint8)
    if len(Core) != 0:
        # Seed core samples from the user-supplied Core indices.
        NCore = expand_core(Core, neighborhoods, min_samples)
        for i in NCore:
            core_samples[i] = 1
    else:
        core_samples = np.asarray(n_neighbors >= min_samples, dtype=np.uint8)

    dbscan_inner(core_samples, neighborhoods, labels)
    return np.where(core_samples)[0], labels
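A usage sketch: with Core left empty the function reduces to standard DBSCAN, while passing explicit seed indices routes through expand_core() (not shown in this snippet). Data and parameters are illustrative.

import numpy as np
from sklearn.datasets import make_blobs

X, _ = make_blobs(n_samples=300, centers=2, cluster_std=0.5, random_state=1)
core_idx, labels = dbscan(X, eps=0.4, min_samples=5)            # no seed cores
# dbscan(X, Core=[0, 10, 20], eps=0.4, min_samples=5)           # needs expand_core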
Example no. 7
def dbscan(coords,
           timestamps,
           eps_d=500,
           eps_t=10 / 1.66667e-11,
           min_samples=5,
           metric_d='l1',
           metric_t='l1',
           algorithm='auto',
           leaf_size=30,
           n_jobs=1):

    neighbors_model_O = NearestNeighbors(radius=eps_d,
                                         leaf_size=leaf_size,
                                         metric=metric_d,
                                         n_jobs=n_jobs,
                                         algorithm=algorithm)
    neighbors_model_O.fit(coords[:, :2])
    neighborhoods_O = neighbors_model_O.radius_neighbors(coords[:, :2],
                                                         eps_d,
                                                         return_distance=False)

    neighbors_model_D = NearestNeighbors(radius=eps_d,
                                         leaf_size=leaf_size,
                                         metric=metric_d,
                                         n_jobs=n_jobs,
                                         algorithm=algorithm)
    neighbors_model_D.fit(coords[:, 2:])
    neighborhoods_D = neighbors_model_D.radius_neighbors(coords[:, 2:],
                                                         eps_d,
                                                         return_distance=False)

    neighbors_model_t_O = NearestNeighbors(radius=eps_t,
                                           leaf_size=leaf_size,
                                           metric=metric_t,
                                           n_jobs=n_jobs,
                                           algorithm=algorithm)
    neighbors_model_t_O.fit(timestamps[:, [0]])
    neighborhoods_t_O = neighbors_model_t_O.radius_neighbors(
        timestamps[:, [0]], eps_t, return_distance=False)

    neighbors_model_t_D = NearestNeighbors(radius=eps_t,
                                           leaf_size=leaf_size,
                                           metric=metric_t,
                                           n_jobs=n_jobs,
                                           algorithm=algorithm)
    neighbors_model_t_D.fit(timestamps[:, [1]])
    neighborhoods_t_D = neighbors_model_t_D.radius_neighbors(
        timestamps[:, [1]], eps_t, return_distance=False)

    n_neighbors = np.zeros(coords.shape[0], dtype=np.intp)
    neighborhoods = np.empty(coords.shape[0], dtype=object)
    for i in range(coords.shape[0]):
        neighbor_i = np.array(
            list(
                set(neighborhoods_O[i]).intersection(
                    set(neighborhoods_D[i]), set(neighborhoods_t_O[i]),
                    set(neighborhoods_t_D[i]))))
        neighborhoods[i] = neighbor_i

        n_neighbors[i] = neighbor_i.shape[0]

    # Initially, all samples are noise.
    labels = -np.ones(coords.shape[0], dtype=np.intp)

    # A list of all core samples found.
    core_samples = np.asarray(n_neighbors >= min_samples, dtype=np.uint8)
    dbscan_inner(core_samples, neighborhoods, labels)
    return np.where(core_samples)[0], labels
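An illustrative call to the origin-destination variant above: coords holds origin (x, y) and destination (x, y) per trip, and timestamps holds departure and arrival times. Units, magnitudes, and radii below are made-up assumptions.

import numpy as np

rng = np.random.default_rng(0)
coords = rng.random((200, 4)) * 1000        # origin x, y and destination x, y
timestamps = rng.random((200, 2)) * 3600    # departure and arrival times
core_idx, labels = dbscan(coords, timestamps, eps_d=100, eps_t=600, min_samples=5)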
Example no. 8
def dbscan(X, eps=0.5, min_samples=5, metric='minkowski', metric_params=None,
           algorithm='auto', leaf_size=30, p=2, sample_weight=None, n_jobs=1):
    """Perform DBSCAN clustering from vector array or distance matrix.
    Read more in the :ref:`User Guide <dbscan>`.
    Parameters
    ----------
    X : array or sparse (CSR) matrix of shape (n_samples, n_features), or \
            array of shape (n_samples, n_samples)
        A feature array, or array of distances between samples if
        ``metric='precomputed'``.
    eps : float, optional
        The maximum distance between two samples for them to be considered
        as in the same neighborhood.
    min_samples : int, optional
        The number of samples (or total weight) in a neighborhood for a point
        to be considered as a core point. This includes the point itself.
    metric : string, or callable
        The metric to use when calculating distance between instances in a
        feature array. If metric is a string or callable, it must be one of
        the options allowed by :func:`sklearn.metrics.pairwise_distances` for
        its metric parameter.
        If metric is "precomputed", X is assumed to be a distance matrix and
        must be square. X may be a sparse matrix, in which case only "nonzero"
        elements may be considered neighbors for DBSCAN.
    metric_params : dict, optional
        Additional keyword arguments for the metric function.
        .. versionadded:: 0.19
    algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, optional
        The algorithm to be used by the NearestNeighbors module
        to compute pointwise distances and find nearest neighbors.
        See NearestNeighbors module documentation for details.
    leaf_size : int, optional (default = 30)
        Leaf size passed to BallTree or cKDTree. This can affect the speed
        of the construction and query, as well as the memory required
        to store the tree. The optimal value depends
        on the nature of the problem.
    p : float, optional
        The power of the Minkowski metric to be used to calculate distance
        between points.
    sample_weight : array, shape (n_samples,), optional
        Weight of each sample, such that a sample with a weight of at least
        ``min_samples`` is by itself a core sample; a sample with negative
        weight may inhibit its eps-neighbor from being core.
        Note that weights are absolute, and default to 1.
    n_jobs : int, optional (default = 1)
        The number of parallel jobs to run for neighbors search.
        If ``-1``, then the number of jobs is set to the number of CPU cores.
    Returns
    -------
    core_samples : array [n_core_samples]
        Indices of core samples.
    labels : array [n_samples]
        Cluster labels for each point.  Noisy samples are given the label -1.
    Notes
    -----
    For an example, see :ref:`examples/cluster/plot_dbscan.py
    <sphx_glr_auto_examples_cluster_plot_dbscan.py>`.
    This implementation bulk-computes all neighborhood queries, which increases
    the memory complexity to O(n.d) where d is the average number of neighbors,
    while original DBSCAN had memory complexity O(n).
    Sparse neighborhoods can be precomputed using
    :func:`NearestNeighbors.radius_neighbors_graph
    <sklearn.neighbors.NearestNeighbors.radius_neighbors_graph>`
    with ``mode='distance'``.
    References
    ----------
    Ester, M., H. P. Kriegel, J. Sander, and X. Xu, "A Density-Based
    Algorithm for Discovering Clusters in Large Spatial Databases with Noise".
    In: Proceedings of the 2nd International Conference on Knowledge Discovery
    and Data Mining, Portland, OR, AAAI Press, pp. 226-231. 1996
    """
    if not eps > 0.0:
        raise ValueError("eps must be positive.")

    X = check_array(X, accept_sparse='csr')
    if sample_weight is not None:
        sample_weight = np.asarray(sample_weight)
        check_consistent_length(X, sample_weight)

    # Calculate the neighborhood of every sample. This leaves the original point
    # in, which needs to be considered later (i.e. point i is in the
    # neighborhood of point i; while true, this is useless information).
    if metric == 'precomputed' and sparse.issparse(X):
        neighborhoods = np.empty(X.shape[0], dtype=object)
        X.sum_duplicates()  # XXX: modifies X's internals in-place
        X_mask = X.data <= eps
        masked_indices = X.indices.astype(np.intp, copy=False)[X_mask]
        masked_indptr = np.concatenate(([0], np.cumsum(X_mask)))[X.indptr[1:]]

        # insert the diagonal: a point is its own neighbor, but 0 distance
        # means absence from sparse matrix data
        masked_indices = np.insert(masked_indices, masked_indptr,
                                   np.arange(X.shape[0]))
        masked_indptr = masked_indptr[:-1] + np.arange(1, X.shape[0])
        # split into rows
        neighborhoods[:] = np.split(masked_indices, masked_indptr)
    else:
        neighbors_model = NearestNeighbors(radius=eps, algorithm=algorithm,
                                           leaf_size=leaf_size,
                                           metric=metric,
                                           metric_params=metric_params, p=p,
                                           n_jobs=n_jobs)
        neighbors_model.fit(X)
        # This has worst case O(n^2) memory complexity
        neighborhoods = neighbors_model.radius_neighbors(X, eps,
                                                         return_distance=False)

    if sample_weight is None:
        n_neighbors = np.array([len(neighbors)
                                for neighbors in neighborhoods])
    else:
        n_neighbors = np.array([np.sum(sample_weight[neighbors])
                                for neighbors in neighborhoods])

    # Initially, all samples are noise.
    labels = -np.ones(X.shape[0], dtype=np.intp)

    # A list of all core samples found.
    core_samples = np.asarray(n_neighbors >= min_samples, dtype=np.uint8)
    dbscan_inner(core_samples, neighborhoods, labels)
    return np.where(core_samples)[0], labels
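A usage sketch for the precomputed-sparse path mentioned in the Notes: build a sparse radius-neighbors graph first, then cluster it with metric='precomputed'. This assumes the module-level imports used by the function (check_array, sparse, NearestNeighbors, dbscan_inner) are in scope; data and radii are illustrative.

import numpy as np
from sklearn.datasets import make_blobs
from sklearn.neighbors import NearestNeighbors

X, _ = make_blobs(n_samples=500, centers=4, cluster_std=0.4, random_state=0)
graph = NearestNeighbors(radius=0.5).fit(X).radius_neighbors_graph(X, mode='distance')
core_idx, labels = dbscan(graph, eps=0.5, min_samples=5, metric='precomputed')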
Example no. 9
    def dbscan(self,
               X,
               eps=0.5,
               min_samples=5,
               metric='minkowski',
               metric_params=None,
               algorithm='auto',
               leaf_size=30,
               p=2,
               sample_weight=None,
               n_jobs=None):
        """Perform DBSCAN clustering from vector array or distance matrix.
        Read more in the :ref:`User Guide <dbscan>`.
        Parameters
        ----------
        X : array or sparse (CSR) matrix of shape (n_samples, n_features), or \
                array of shape (n_samples, n_samples)
            A feature array, or array of distances between samples if
            ``metric='precomputed'``.
        eps : float, optional
            The maximum distance between two samples for one to be considered
            as in the neighborhood of the other. This is not a maximum bound
            on the distances of points within a cluster. This is the most
            important DBSCAN parameter to choose appropriately for your data set
            and distance function.
        min_samples : int, optional
            The number of samples (or total weight) in a neighborhood for a point
            to be considered as a core point. This includes the point itself.
        metric : string, or callable
            The metric to use when calculating distance between instances in a
            feature array. If metric is a string or callable, it must be one of
            the options allowed by :func:`sklearn.metrics.pairwise_distances` for
            its metric parameter.
            If metric is "precomputed", X is assumed to be a distance matrix and
            must be square. X may be a sparse matrix, in which case only "nonzero"
            elements may be considered neighbors for DBSCAN.
        metric_params : dict, optional
            Additional keyword arguments for the metric function.
            .. versionadded:: 0.19
        algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, optional
            The algorithm to be used by the NearestNeighbors module
            to compute pointwise distances and find nearest neighbors.
            See NearestNeighbors module documentation for details.
        leaf_size : int, optional (default = 30)
            Leaf size passed to BallTree or cKDTree. This can affect the speed
            of the construction and query, as well as the memory required
            to store the tree. The optimal value depends
            on the nature of the problem.
        p : float, optional
            The power of the Minkowski metric to be used to calculate distance
            between points.
        sample_weight : array, shape (n_samples,), optional
            Weight of each sample, such that a sample with a weight of at least
            ``min_samples`` is by itself a core sample; a sample with negative
            weight may inhibit its eps-neighbor from being core.
            Note that weights are absolute, and default to 1.
        n_jobs : int or None, optional (default=None)
            The number of parallel jobs to run for neighbors search.
            ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
            ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
            for more details.
        Returns
        -------
        core_samples : array [n_core_samples]
            Indices of core samples.
        labels : array [n_samples]
            Cluster labels for each point. Noisy samples are given the label -1.
        """
        if not eps > 0.0:
            raise ValueError("eps must be positive.")

        X = check_array(X, accept_sparse='csr')
        if sample_weight is not None:
            sample_weight = np.asarray(sample_weight)
            check_consistent_length(X, sample_weight)

        # Calculate the neighborhood of every sample. This leaves the original
        # point in, which needs to be considered later (i.e. point i is in the
        # neighborhood of point i; while true, this is useless information).
        if metric == 'precomputed' and sparse.issparse(X):
            neighborhoods = np.empty(X.shape[0], dtype=object)
            X.sum_duplicates()  # XXX: modifies X's internals in-place

            # set the diagonal to explicit values, as a point is its own neighbor
            with warnings.catch_warnings():
                warnings.simplefilter('ignore', sparse.SparseEfficiencyWarning)
                X.setdiag(X.diagonal())  # XXX: modifies X's internals in-place

            X_mask = X.data <= eps
            masked_indices = X.indices.astype(np.intp, copy=False)[X_mask]
            masked_indptr = np.concatenate(([0], np.cumsum(X_mask)))
            masked_indptr = masked_indptr[X.indptr[1:-1]]

            # split into rows
            neighborhoods[:] = np.split(masked_indices, masked_indptr)
        else:
            neighbors_model = NearestNeighbors(radius=eps,
                                               algorithm=algorithm,
                                               leaf_size=leaf_size,
                                               metric=metric,
                                               metric_params=metric_params,
                                               p=p,
                                               n_jobs=n_jobs)
            neighbors_model.fit(X)
            # This has worst case O(n^2) memory complexity
            neighborhoods = neighbors_model.radius_neighbors(
                X, eps, return_distance=False)

        if sample_weight is None:
            n_neighbors = np.array(
                [len(neighbors) for neighbors in neighborhoods])
        else:
            n_neighbors = np.array([
                np.sum(sample_weight[neighbors]) for neighbors in neighborhoods
            ])

        # Initially, all samples are noise.
        labels = np.full(X.shape[0], -1, dtype=np.intp)

        # A list of all core samples found.
        core_samples = np.asarray(n_neighbors >= min_samples, dtype=np.uint8)
        dbscan_inner(core_samples, neighborhoods, labels)
        return np.where(core_samples)[0], labels
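A small demonstration of why the setdiag() call in the precomputed branch matters: a CSR distance matrix normally stores no explicit zeros, so a point's zero self-distance is missing from X.data and the point would not be counted in its own neighborhood. Making the diagonal explicit fixes that (scipy emits a SparseEfficiencyWarning, which is why the code above suppresses it). Values are illustrative.

import numpy as np
import scipy.sparse as ss

D = ss.csr_matrix(np.array([[0.0, 0.2], [0.2, 0.0]]))
print(D.nnz)              # 2 -- the zero diagonal is not stored
D.setdiag(D.diagonal())   # warns about the sparsity change, stores explicit zeros
print(D.nnz)              # 4 -- each zero self-distance now appears in D.data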
Example no. 10
def generate_clusters(pairwise_dist_matrix: ss.csr_matrix, eps: float,
                      min_samples: int, precursor_mzs: np.ndarray,
                      precursor_tol_mass: float, precursor_tol_mode: str) \
        -> np.ndarray:
    """
    DBSCAN clustering of the given pairwise distance matrix.

    Parameters
    ----------
    pairwise_dist_matrix : ss.csr_matrix
        A sparse pairwise distance matrix used for clustering.
    eps : float
        The maximum distance between two samples for one to be considered as in
        the neighborhood of the other.
    min_samples : int
        The number of samples in a neighborhood for a point to be considered as
        a core point. This includes the point itself.
    precursor_mzs : np.ndarray
        Precursor m/z's matching the pairwise distance matrix.
    precursor_tol_mass : float
        Maximum precursor mass tolerance for points to be clustered together.
    precursor_tol_mode : str
        The unit of the precursor m/z tolerance ('Da' or 'ppm').

    Returns
    -------
    np.ndarray
        Cluster labels. Noisy samples are given the label -1.
    """
    # DBSCAN clustering using the precomputed pairwise distance matrix.
    logger.info('DBSCAN clustering (eps=%.4f, min_samples=%d) of precomputed '
                'pairwise distance matrix', eps, min_samples)
    # Reimplement DBSCAN preprocessing to avoid unnecessary memory consumption.
    # Find the eps-neighborhoods for all points.
    mask = pairwise_dist_matrix.data <= eps
    indices = pairwise_dist_matrix.indices[mask].astype(np.intp)
    indptr = np.zeros(len(mask) + 1, dtype=np.int64)
    np.cumsum(mask, out=indptr[1:])
    indptr = indptr[pairwise_dist_matrix.indptr]
    neighborhoods = np.split(indices, indptr[1:-1])
    # Initially, all samples are noise.
    clusters = np.full(pairwise_dist_matrix.shape[0], -1, dtype=np.intp)
    # A list of all core samples found.
    n_neighbors = np.fromiter(map(len, neighborhoods), np.uint32)
    core_samples = n_neighbors >= min_samples
    # Run Scikit-Learn DBSCAN.
    neighborhoods_arr = np.empty(len(neighborhoods), dtype=object)
    neighborhoods_arr[:] = neighborhoods
    dbscan_inner(core_samples, neighborhoods_arr, clusters)
    # Refine initial clusters to make sure spectra within a cluster don't have
    # an excessive precursor m/z difference.
    order = np.argsort(clusters)
    reverse_order = np.argsort(order)
    clusters, precursor_mzs = clusters[order], precursor_mzs[order]
    logger.debug('Finetune %d initial unique clusters to not exceed %.2f %s '
                 'precursor m/z tolerance', clusters[-1] + 1,
                 precursor_tol_mass, precursor_tol_mode)
    group_idx = _get_cluster_group_idx(clusters)
    if len(group_idx) == 0:     # Only noise samples.
        return -np.ones_like(precursor_mzs, dtype=np.int64)
    cluster_reassignments = nb.typed.List(joblib.Parallel(n_jobs=-1)(
        joblib.delayed(_postprocess_cluster)
        (precursor_mzs[start_i:stop_i], precursor_tol_mass,
         precursor_tol_mode) for start_i, stop_i in group_idx))
    clusters = _assign_unique_cluster_labels(
        group_idx, cluster_reassignments, min_samples)[reverse_order]
    logger.debug('%d unique clusters after precursor m/z finetuning',
                 np.amax(clusters) + 1)
    return clusters
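A tiny illustration of the order / reverse_order bookkeeping used above: sorting by cluster label makes each cluster a contiguous block (so it can be post-processed group by group), and indexing with reverse_order restores the original sample order afterwards.

import numpy as np

labels = np.array([2, -1, 0, 2, 0])
order = np.argsort(labels)
reverse_order = np.argsort(order)
sorted_labels = labels[order]                    # [-1, 0, 0, 2, 2]: contiguous groups
assert np.array_equal(sorted_labels[reverse_order], labels)   # original order restored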
Example no. 11
def faiss_dbscan(X, eps=0.5, min_samples=5, nlist=100, nprobe=5, metric='l2', metric_params=None,
           algorithm='auto', leaf_size=30, p=2, sample_weight=None, n_jobs=1, GPU=False, IVFFlat=True):
    """Perform DBSCAN clustering from vector array or distance matrix.

    Read more in the :ref:`User Guide <dbscan>`.

    Parameters
    ----------
    X : array or sparse (CSR) matrix of shape (n_samples, n_features), or \
            array of shape (n_samples, n_samples)
        A feature array, or array of distances between samples if
        ``metric='precomputed'``.

    eps : float, optional
        The maximum distance between two samples for them to be considered
        as in the same neighborhood.

    min_samples : int, optional
        The number of samples (or total weight) in a neighborhood for a point
        to be considered as a core point. This includes the point itself.

    metric : string, or callable
        The metric to use when calculating distance between instances in a
        feature array. If metric is a string or callable, it must be one of
        the options allowed by metrics.pairwise.pairwise_distances for its
        metric parameter.
        If metric is "precomputed", X is assumed to be a distance matrix and
        must be square. X may be a sparse matrix, in which case only "nonzero"
        elements may be considered neighbors for DBSCAN.

    metric_params : dict, optional
        Additional keyword arguments for the metric function.

        .. versionadded:: 0.19

    algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, optional
        The algorithm to be used by the NearestNeighbors module
        to compute pointwise distances and find nearest neighbors.
        See NearestNeighbors module documentation for details.

    leaf_size : int, optional (default = 30)
        Leaf size passed to BallTree or cKDTree. This can affect the speed
        of the construction and query, as well as the memory required
        to store the tree. The optimal value depends
        on the nature of the problem.

    p : float, optional
        The power of the Minkowski metric to be used to calculate distance
        between points.

    sample_weight : array, shape (n_samples,), optional
        Weight of each sample, such that a sample with a weight of at least
        ``min_samples`` is by itself a core sample; a sample with negative
        weight may inhibit its eps-neighbor from being core.
        Note that weights are absolute, and default to 1.

    n_jobs : int, optional (default = 1)
        The number of parallel jobs to run for neighbors search.
        If ``-1``, then the number of jobs is set to the number of CPU cores.

    Returns
    -------
    core_samples : array [n_core_samples]
        Indices of core samples.

    labels : array [n_samples]
        Cluster labels for each point.  Noisy samples are given the label -1.

    Notes
    -----
    See examples/cluster/plot_dbscan.py for an example.

    This implementation bulk-computes all neighborhood queries, which increases
    the memory complexity to O(n.d) where d is the average number of neighbors,
    while original DBSCAN had memory complexity O(n).

    Sparse neighborhoods can be precomputed using
    :func:`NearestNeighbors.radius_neighbors_graph
    <sklearn.neighbors.NearestNeighbors.radius_neighbors_graph>`
    with ``mode='distance'``.

    References
    ----------
    Ester, M., H. P. Kriegel, J. Sander, and X. Xu, "A Density-Based
    Algorithm for Discovering Clusters in Large Spatial Databases with Noise".
    In: Proceedings of the 2nd International Conference on Knowledge Discovery
    and Data Mining, Portland, OR, AAAI Press, pp. 226-231. 1996
    """
    if not eps > 0.0:
        raise ValueError("eps must be positive.")

    if sample_weight is not None:
        sample_weight = np.asarray(sample_weight)
        check_consistent_length(X, sample_weight)

    # Calculate the neighborhood of every sample. This leaves the original point
    # in, which needs to be considered later (i.e. point i is in the
    # neighborhood of point i; while true, this is useless information).
    if GPU is True:
        neighborhoods = gpu_radius_neighbors(X, eps, min_samples, nlist, nprobe, return_distance=False, IVFFlat=IVFFlat)
    else:
        neighborhoods = cpu_radius_neighbors(X, eps, min_samples, nlist, nprobe, return_distance=False, IVFFlat=IVFFlat)
    if sample_weight is None:
        n_neighbors = np.array([len(neighbors)
                                for neighbors in neighborhoods])
    else:
        n_neighbors = np.array([np.sum(sample_weight[neighbors])
                                for neighbors in neighborhoods])

    # Initially, all samples are noise.
    labels = -np.ones(X.shape[0], dtype=np.intp)

    # A list of all core samples found.
    core_samples = np.asarray(n_neighbors >= min_samples, dtype=np.uint8)
    dbscan_inner(core_samples, neighborhoods, labels)
    return np.where(core_samples)[0], labels
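A usage sketch, assuming the cpu_radius_neighbors / gpu_radius_neighbors helpers (FAISS-backed, not shown in this snippet) are available alongside the function. nlist and nprobe trade recall for speed in an IVFFlat index; all values below are illustrative.

import numpy as np

X = np.random.rand(10000, 64).astype(np.float32)   # FAISS works with float32
core_idx, labels = faiss_dbscan(X, eps=0.5, min_samples=10,
                                nlist=100, nprobe=10, GPU=False, IVFFlat=True)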
Example no. 12
def dbscan(X,
           eps=0.5,
           min_samples=5,
           metric='minkowski',
           metric_params=None,
           algorithm='auto',
           leaf_size=30,
           p=2,
           sample_weight=None,
           n_jobs=None):
    """Perform DBSCAN clustering from vector array or distance matrix.

    Read more in the :ref:`User Guide <dbscan>`.

    Parameters
    ----------
    X : array or sparse (CSR) matrix of shape (n_samples, n_features), or \
            array of shape (n_samples, n_samples)
        A feature array, or array of distances between samples if
        ``metric='precomputed'``.

    eps : float, optional
        The maximum distance between two samples for one to be considered
        as in the neighborhood of the other. This is not a maximum bound
        on the distances of points within a cluster. This is the most
        important DBSCAN parameter to choose appropriately for your data set
        and distance function.

    min_samples : int, optional
        The number of samples (or total weight) in a neighborhood for a point
        to be considered as a core point. This includes the point itself.

    metric : string, or callable
        The metric to use when calculating distance between instances in a
        feature array. If metric is a string or callable, it must be one of
        the options allowed by :func:`sklearn.metrics.pairwise_distances` for
        its metric parameter.
        If metric is "precomputed", X is assumed to be a distance matrix and
        must be square. X may be a sparse matrix, in which case only "nonzero"
        elements may be considered neighbors for DBSCAN.

    metric_params : dict, optional
        Additional keyword arguments for the metric function.

        .. versionadded:: 0.19

    algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, optional
        The algorithm to be used by the NearestNeighbors module
        to compute pointwise distances and find nearest neighbors.
        See NearestNeighbors module documentation for details.

    leaf_size : int, optional (default = 30)
        Leaf size passed to BallTree or cKDTree. This can affect the speed
        of the construction and query, as well as the memory required
        to store the tree. The optimal value depends
        on the nature of the problem.

    p : float, optional
        The power of the Minkowski metric to be used to calculate distance
        between points.

    sample_weight : array, shape (n_samples,), optional
        Weight of each sample, such that a sample with a weight of at least
        ``min_samples`` is by itself a core sample; a sample with negative
        weight may inhibit its eps-neighbor from being core.
        Note that weights are absolute, and default to 1.

    n_jobs : int or None, optional (default=None)
        The number of parallel jobs to run for neighbors search.
        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.

    Returns
    -------
    core_samples : array [n_core_samples]
        Indices of core samples.

    labels : array [n_samples]
        Cluster labels for each point.  Noisy samples are given the label -1.

    See also
    --------
    DBSCAN
        An estimator interface for this clustering algorithm.
    OPTICS
        A similar estimator interface clustering at multiple values of eps. Our
        implementation is optimized for memory usage.

    Notes
    -----
    For an example, see :ref:`examples/cluster/plot_dbscan.py
    <sphx_glr_auto_examples_cluster_plot_dbscan.py>`.

    This implementation bulk-computes all neighborhood queries, which increases
    the memory complexity to O(n.d) where d is the average number of neighbors,
    while original DBSCAN had memory complexity O(n). It may attract a higher
    memory complexity when querying these nearest neighborhoods, depending
    on the ``algorithm``.

    One way to avoid the query complexity is to pre-compute sparse
    neighborhoods in chunks using
    :func:`NearestNeighbors.radius_neighbors_graph
    <sklearn.neighbors.NearestNeighbors.radius_neighbors_graph>` with
    ``mode='distance'``, then using ``metric='precomputed'`` here.

    Another way to reduce memory and computation time is to remove
    (near-)duplicate points and use ``sample_weight`` instead.

    :func:`cluster.optics <sklearn.cluster.optics>` provides a similar
    clustering with lower memory usage.

    References
    ----------
    Ester, M., H. P. Kriegel, J. Sander, and X. Xu, "A Density-Based
    Algorithm for Discovering Clusters in Large Spatial Databases with Noise".
    In: Proceedings of the 2nd International Conference on Knowledge Discovery
    and Data Mining, Portland, OR, AAAI Press, pp. 226-231. 1996

    Schubert, E., Sander, J., Ester, M., Kriegel, H. P., & Xu, X. (2017).
    DBSCAN revisited, revisited: why and how you should (still) use DBSCAN.
    ACM Transactions on Database Systems (TODS), 42(3), 19.
    """
    if not eps > 0.0:
        raise ValueError("eps must be positive.")

    X = check_array(X, accept_sparse='csr')
    if sample_weight is not None:
        sample_weight = np.asarray(sample_weight)
        check_consistent_length(X, sample_weight)

    # Calculate the neighborhood of every sample. This leaves the original point
    # in, which needs to be considered later (i.e. point i is in the
    # neighborhood of point i; while true, this is useless information).
    if metric == 'precomputed' and sparse.issparse(X):
        neighborhoods = np.empty(X.shape[0], dtype=object)
        X.sum_duplicates()  # XXX: modifies X's internals in-place

        # set the diagonal to explicit values, as a point is its own neighbor
        with warnings.catch_warnings():
            warnings.simplefilter('ignore', sparse.SparseEfficiencyWarning)
            X.setdiag(X.diagonal())  # XXX: modifies X's internals in-place

        X_mask = X.data <= eps
        masked_indices = X.indices.astype(np.intp, copy=False)[X_mask]
        masked_indptr = np.concatenate(([0], np.cumsum(X_mask)))
        masked_indptr = masked_indptr[X.indptr[1:-1]]

        # split into rows
        neighborhoods[:] = np.split(masked_indices, masked_indptr)
    else:
        neighbors_model = NearestNeighbors(radius=eps,
                                           algorithm=algorithm,
                                           leaf_size=leaf_size,
                                           metric=metric,
                                           metric_params=metric_params,
                                           p=p,
                                           n_jobs=n_jobs)
        neighbors_model.fit(X)
        # This has worst case O(n^2) memory complexity
        neighborhoods = neighbors_model.radius_neighbors(X,
                                                         eps,
                                                         return_distance=False)
    if sample_weight is None:
        n_neighbors = np.array([len(neighbors) for neighbors in neighborhoods])
    else:
        n_neighbors = np.array(
            [np.sum(sample_weight[neighbors]) for neighbors in neighborhoods])

    # Initially, all samples are noise.
    labels = np.full(X.shape[0], -1, dtype=np.intp)

    # A list of all core samples found.
    core_samples = np.asarray(n_neighbors >= min_samples, dtype=np.uint8)
    dbscan_inner(core_samples, neighborhoods, labels)
    return np.where(core_samples)[0], labels
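A sketch of the sample_weight idea from the Notes above: collapse exact duplicate points into a single point whose weight equals its multiplicity. Values are illustrative; this assumes the module-level imports used by the function are in scope.

import numpy as np

X = np.array([[0.0, 0.0], [0.0, 0.0], [0.0, 0.0], [0.1, 0.0], [5.0, 5.0]])
uniq, counts = np.unique(X, axis=0, return_counts=True)
core_idx, labels = dbscan(uniq, eps=0.3, min_samples=3, sample_weight=counts)
# The triplicated origin point carries weight 3, so it is a core sample even
# though only two unique points lie inside its eps-neighborhood.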