Example #1
def test_check_normalize_sample_weight():
    from numpy.testing import assert_almost_equal

    from sklearn.cluster._kmeans import _check_normalize_sample_weight
    from sklearn.utils.validation import _num_samples

    # X is assumed to be a module-level fixture holding the sample data.
    sample_weight = None
    checked_sample_weight = _check_normalize_sample_weight(sample_weight, X)
    # None is replaced by uniform weights whose sum equals n_samples.
    assert _num_samples(X) == _num_samples(checked_sample_weight)
    assert_almost_equal(checked_sample_weight.sum(), _num_samples(X))
    assert X.dtype == checked_sample_weight.dtype
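The assertions above pin down the normalization contract: None is replaced by uniform weights, and the returned weights always sum to n_samples. A minimal NumPy sketch of that contract (a hypothetical re-implementation, not sklearn's actual helper):

import numpy as np

def normalize_sample_weight(sample_weight, X):
    # None -> uniform weights of X's dtype; otherwise rescale the given
    # weights so that their sum equals the number of samples.
    n_samples = X.shape[0]
    if sample_weight is None:
        return np.ones(n_samples, dtype=X.dtype)
    sample_weight = np.asarray(sample_weight, dtype=X.dtype)
    return sample_weight * (n_samples / sample_weight.sum())

X = np.arange(12, dtype=np.float64).reshape(6, 2)
w = normalize_sample_weight([1, 2, 3, 1, 2, 3], X)
assert np.isclose(w.sum(), X.shape[0])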
Example #2
import numpy as np
import scipy.sparse as sp
from sklearn.utils import check_random_state
from sklearn.utils.extmath import row_norms

# _init_centroids, m_step, e_step, k_means_elkan and
# _check_normalize_sample_weight are assumed to be helpers defined in
# the surrounding module.
def _fuzzykmeans_single_elkan(X,
                              m,
                              sample_weight,
                              n_clusters,
                              max_iter=300,
                              init='k-means++',
                              verbose=False,
                              x_squared_norms=None,
                              random_state=None,
                              tol=1e-4,
                              precompute_distances=True):
    if sp.issparse(X):
        raise TypeError("algorithm='elkan' not supported for sparse input X")

    n_samples, n_features = X.shape
    random_state = check_random_state(random_state)

    fuzzy_labels = random_state.rand(n_samples, n_clusters)
    fuzzy_labels /= fuzzy_labels.sum(axis=1)[:, np.newaxis]

    if x_squared_norms is None:
        x_squared_norms = row_norms(X, squared=True)
    # init
    centers = _init_centroids(X,
                              n_clusters,
                              init,
                              random_state=random_state,
                              x_squared_norms=x_squared_norms)
    centers = m_step(X, fuzzy_labels, m)  # fuzzy M-step: centers from data and memberships
    centers = np.ascontiguousarray(centers)
    if verbose:
        print('Initialization complete')

    checked_sample_weight = _check_normalize_sample_weight(sample_weight, X)
    centers, labels, n_iter = k_means_elkan(X,
                                            checked_sample_weight,
                                            n_clusters,
                                            centers,
                                            tol=tol,
                                            max_iter=max_iter,
                                            verbose=verbose)
    # final fuzzy E/M refinement after the hard Elkan iterations
    fuzzy_labels, labels = e_step(X, centers, m)
    centers = m_step(X, fuzzy_labels, m)

    if sample_weight is None:
        # unweighted inertia: sum of squared distances to assigned centers
        inertia = np.sum((X - centers[labels])**2, dtype=np.float64)
    else:
        sq_distances = np.sum(
            (X - centers[labels])**2, axis=1,
            dtype=np.float64) * checked_sample_weight
        inertia = np.sum(sq_distances, dtype=np.float64)
    return fuzzy_labels, labels, inertia, centers, n_iter
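m_step, e_step and k_means_elkan above are module-local helpers. As a rough sketch, the E- and M-steps they stand for are the standard fuzzy c-means updates; the functions below are hypothetical stand-ins under that assumption, not the module's actual code:

import numpy as np
from sklearn.metrics.pairwise import euclidean_distances

def m_step(X, fuzzy_labels, m):
    # Center update: membership-weighted mean of the data, with the
    # memberships raised to the fuzzifier m.
    w = fuzzy_labels ** m                            # (n_samples, n_clusters)
    return (w.T @ X) / w.sum(axis=0)[:, np.newaxis]

def e_step(X, centers, m):
    # Membership update: inverse-distance weighting with exponent
    # 2 / (m - 1); hard labels follow from the largest membership.
    d = euclidean_distances(X, centers) + 1e-12      # guard against /0
    inv = d ** (-2.0 / (m - 1))
    fuzzy_labels = inv / inv.sum(axis=1)[:, np.newaxis]
    return fuzzy_labels, fuzzy_labels.argmax(axis=1)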
Example #3
import numpy
from scipy.sparse import issparse

# _labels_inertia_skl, _labels_inertia_precompute_dense and
# _check_normalize_sample_weight are assumed to be defined in the
# surrounding module.
def _labels_inertia(norm,
                    X,
                    sample_weight,
                    centers,
                    precompute_distances=True,
                    distances=None):
    """
    E step of the K-means EM algorithm.

    Computes the labels and the inertia of the given samples and centers.
    This will compute the distances in-place.

    Parameters
    ----------
    norm : 'l1' or 'l2'

    X : float64 array-like or CSR sparse matrix, shape (n_samples, n_features)
        The input samples to assign to the labels.

    sample_weight : array-like, shape (n_samples,)
        The weights for each observation in X.

    centers : float array, shape (k, n_features)
        The cluster centers.

    precompute_distances : boolean, default: True
        Precompute distances (faster but takes more memory).

    distances : float array, shape (n_samples,), optional
        Pre-allocated array, filled in-place with each sample's distance
        to its closest center.

    Returns
    -------
    labels : int array, shape (n_samples,)
        The resulting assignment.

    inertia : float
        Sum of squared distances of samples to their closest cluster center.
    """
    if norm == 'l2':
        return _labels_inertia_skl(X,
                                   sample_weight=sample_weight,
                                   centers=centers,
                                   precompute_distances=precompute_distances,
                                   x_squared_norms=None)

    sample_weight = _check_normalize_sample_weight(sample_weight, X)
    if distances is None:
        # allocate an empty placeholder when no buffer is supplied;
        # otherwise distances is filled in-place below
        distances = numpy.zeros(shape=(0, ), dtype=X.dtype)
    if issparse(X):
        raise NotImplementedError(  # pragma no cover
            "Sparse matrix is not implemented for norm 'l1'.")
    if precompute_distances:
        return _labels_inertia_precompute_dense(norm=norm,
                                                X=X,
                                                sample_weight=sample_weight,
                                                centers=centers,
                                                distances=distances)
    raise NotImplementedError(  # pragma no cover
        "precompute_distances is False, not implemented for norm 'l1'.")
Example #4
import numpy
from sklearn.utils import check_random_state

# _init_centroids, _labels_inertia, _centers_dense and
# _check_normalize_sample_weight are assumed to be defined in the
# surrounding module.
def _kmeans_single_lloyd(norm,
                         X,
                         sample_weight,
                         n_clusters,
                         max_iter=300,
                         init='k-means++',
                         verbose=False,
                         random_state=None,
                         tol=1e-4,
                         precompute_distances=True):
    """
    A single run of k-means, assumes preparation completed prior.

    Parameters
    ----------
    norm : 'l1' or 'l2'

    X : array-like of floats, shape (n_samples, n_features)
        The observations to cluster.

    n_clusters : int
        The number of clusters to form as well as the number of
        centroids to generate.

    sample_weight : array-like, shape (n_samples,)
        The weights for each observation in X.

    max_iter : int, optional, default 300
        Maximum number of iterations of the k-means algorithm to run.

    init : {'k-means++', 'random', or ndarray, or a callable}, optional
        Method for initialization, default to 'k-means++':

        'k-means++' : selects initial cluster centers for k-means
        clustering in a smart way to speed up convergence. See section
        Notes in k_init for more details.

        'random': choose k observations (rows) at random from data for
        the initial centroids.

        If an ndarray is passed, it should be of shape (k, p) and gives
        the initial centers.

        If a callable is passed, it should take arguments X, k and
        a random state and return an initialization.

    tol : float, optional
        Tolerance on the total shift of the centers between two
        iterations, used to declare convergence.

    verbose : boolean, optional
        Verbosity mode

    precompute_distances : boolean, default: True
        Precompute distances (faster but takes more memory).

    random_state : int, RandomState instance or None (default)
        Determines random number generation for centroid initialization. Use
        an int to make the randomness deterministic.
        See :term:`Glossary <random_state>`.

    Returns
    -------
    centroid : float ndarray with shape (k, n_features)
        Centroids found at the last iteration of k-means.

    label : integer ndarray with shape (n_samples,)
        label[i] is the code or index of the centroid the
        i'th observation is closest to.

    inertia : float
        The final value of the inertia criterion (sum of squared distances to
        the closest centroid for all observations in the training set).

    n_iter : int
        Number of iterations run.
    """
    random_state = check_random_state(random_state)

    sample_weight = _check_normalize_sample_weight(sample_weight, X)

    best_labels, best_inertia, best_centers = None, None, None
    # init
    centers = _init_centroids(norm,
                              X,
                              n_clusters,
                              init,
                              random_state=random_state)
    if verbose:  # pragma no cover
        print("Initialization complete")

    # Allocate memory to store the distance of each sample to its
    # closest center, for reallocation in case of ties
    distances = numpy.zeros(shape=(X.shape[0], ), dtype=X.dtype)
    # per-feature argsort of X, reused by the M-step (_centers_dense)
    X_sort_index = numpy.argsort(X, axis=0)

    # iterations
    for i in range(max_iter):
        centers_old = centers.copy()
        # labels assignment is also called the E-step of EM
        labels, inertia = \
            _labels_inertia(norm, X, sample_weight, centers,
                            precompute_distances=precompute_distances,
                            distances=distances)

        # computation of the means is also called the M-step of EM
        centers = _centers_dense(X, sample_weight, labels, n_clusters,
                                 distances, X_sort_index)

        if verbose:  # pragma no cover
            print("Iteration %2d, inertia %.3f" % (i, inertia))

        if best_inertia is None or inertia < best_inertia:
            best_labels = labels.copy()
            best_centers = centers.copy()
            best_inertia = inertia

        center_shift_total = numpy.sum(
            numpy.abs(centers_old - centers).ravel())
        if center_shift_total <= tol:
            if verbose:  # pragma no cover
                print("Converged at iteration %d: "
                      "center shift %r within tolerance %r" %
                      (i, center_shift_total, tol))
            break

    if center_shift_total > 0:
        # rerun E-step in case of non-convergence so that predicted labels
        # match cluster centers
        best_labels, best_inertia = \
            _labels_inertia(norm, X, sample_weight, best_centers,
                            precompute_distances=precompute_distances,
                            distances=distances)

    return best_labels, best_inertia, best_centers, i + 1
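Under the L1 norm the M-step minimizer is the coordinate-wise median rather than the mean, which is plausibly what the precomputed X_sort_index speeds up inside _centers_dense. A simplified, unweighted sketch of such an M-step (hypothetical; empty clusters are not handled):

import numpy as np

def centers_dense_l1(X, labels, n_clusters):
    # Per cluster, the point minimizing the sum of L1 distances is the
    # coordinate-wise median of its members.
    centers = np.empty((n_clusters, X.shape[1]), dtype=X.dtype)
    for k in range(n_clusters):
        centers[k] = np.median(X[labels == k], axis=0)
    return centers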
Example #5
import numpy
from scipy.sparse import issparse

# _assign_labels_csr, _assign_labels_array,
# _labels_inertia_precompute_dense and _check_normalize_sample_weight are
# assumed to be available in the surrounding module.
def _labels_inertia_skl(X,
                        sample_weight,
                        x_squared_norms,
                        centers,
                        precompute_distances=True,
                        distances=None):
    """E step of the K-means EM algorithm.
    Compute the labels and the inertia of the given samples and centers.
    This will compute the distances in-place.
    Parameters
    ----------
    X : float64 array-like or CSR sparse matrix, shape (n_samples, n_features)
        The input samples to assign to the labels.
    sample_weight : array-like, shape (n_samples,)
        The weights for each observation in X.
    x_squared_norms : array, shape (n_samples,)
        Precomputed squared euclidean norm of each data point, to speed up
        computations.
    centers : float array, shape (k, n_features)
        The cluster centers.
    precompute_distances : boolean, default: True
        Precompute distances (faster but takes more memory).
    distances : float array, shape (n_samples,)
        Pre-allocated array to be filled in with each sample's distance
        to the closest center.
    Returns
    -------
    labels : int array of shape(n)
        The resulting assignment
    inertia : float
        Sum of squared distances of samples to their closest cluster center.
    """
    n_samples = X.shape[0]
    sample_weight = _check_normalize_sample_weight(sample_weight, X)
    # set the default value of labels to -1 to be able to detect any
    # anomaly easily
    labels = numpy.full(n_samples, -1, numpy.int32)
    if distances is None:
        distances = numpy.zeros(shape=(0, ), dtype=X.dtype)
    # distances will be changed in-place
    if issparse(X):
        inertia = _assign_labels_csr(X,
                                     sample_weight,
                                     x_squared_norms,
                                     centers,
                                     labels,
                                     distances=distances)
    else:
        if precompute_distances:
            return _labels_inertia_precompute_dense(
                norm='l2',
                X=X,
                sample_weight=sample_weight,
                centers=centers,
                distances=distances)
        inertia = _assign_labels_array(X,
                                       sample_weight,
                                       x_squared_norms,
                                       centers,
                                       labels,
                                       distances=distances)
    return labels, inertia
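A brute-force NumPy reference for the dense L2 path is handy as a sanity check against this helper; the function below is a naive re-derivation of labels and weighted inertia, not the library's code:

import numpy as np

def labels_inertia_reference(X, sample_weight, centers):
    # Squared Euclidean distance from every sample to every center,
    # nearest-center labels, and the weighted inertia.
    sq = ((X[:, np.newaxis, :] - centers[np.newaxis, :, :]) ** 2).sum(axis=2)
    labels = sq.argmin(axis=1).astype(np.int32)
    inertia = float((sq[np.arange(len(X)), labels] * sample_weight).sum())
    return labels, inertia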