def test_check_sample_weight():
    # X, assert_equal, assert_almost_equal and _num_samples come from the
    # surrounding sklearn test module; _check_sample_weight here is the old
    # private helper from sklearn.cluster.k_means_ (removed in later releases).
    from sklearn.cluster.k_means_ import _check_sample_weight
    sample_weight = None
    checked_sample_weight = _check_sample_weight(X, sample_weight)
    assert_equal(_num_samples(X), _num_samples(checked_sample_weight))
    assert_almost_equal(checked_sample_weight.sum(), _num_samples(X))
    assert_equal(X.dtype, checked_sample_weight.dtype)
Example #3
def _check_normalize_sample_weight(sample_weight, X):
    """Set sample_weight if None, and check for correct dtype"""

    sample_weight_was_none = sample_weight is None

    sample_weight = _check_sample_weight(X, sample_weight)

    if not sample_weight_was_none:
        # normalize the weights to sum up to n_samples
        # (an array of ones, i.e. sample_weight was None, is already normalized)
        n_samples = len(sample_weight)
        scale = n_samples / sample_weight.sum()
        sample_weight *= scale
    return sample_weight
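The normalization above can be sketched without the private sklearn helpers; the following NumPy-only version (normalize_sample_weight is an illustrative name, not part of scikit-learn) shows the effect of rescaling the weights to sum to n_samples:

import numpy as np

def normalize_sample_weight(sample_weight, n_samples):
    # No weights given: every sample counts the same.
    if sample_weight is None:
        return np.ones(n_samples, dtype=np.float64)
    sample_weight = np.asarray(sample_weight, dtype=np.float64)
    # Rescale so the weights sum to n_samples, matching the unweighted case.
    return sample_weight * (n_samples / sample_weight.sum())

print(normalize_sample_weight([1.0, 2.0, 3.0], n_samples=3))  # [0.5 1.  1.5]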
Example #4
def _labels_inertia(X, sample_weight, x_squared_norms, centers, distances, same_cluster_size=False):
    """E step of the K-means EM algorithm.
    Compute the labels and the inertia of the given samples and centers.
    This will compute the distances in-place.
    Parameters
    ----------
    X : float64 array-like or CSR sparse matrix, shape (n_samples, n_features)
        The observations to cluster.
    sample_weight : array-like, shape (n_samples,)
        The weights for each observation in X.
    x_squared_norms : array, shape (n_samples,)
        Precomputed squared euclidean norm of each data point, to speed up
        computations.
    centers : float array, shape (k, n_features)
        The cluster centers.
    distances : float array, shape (n_samples,)
        Pre-allocated array to be filled in with each sample's distance
        to the closest center.
    same_cluster_size : bool, optional, default False
        If True, constrain the assignment so that every cluster receives
        exactly n_samples // n_clusters samples (same-size k-means variant).
    Returns
    -------
    labels : int array, shape (n_samples,)
        The resulting assignment.
    inertia : float
        Sum of squared distances of samples to their closest cluster center.
    """
    sample_weight = _check_sample_weight(X, sample_weight)
    n_samples = X.shape[0]
    n_clusters = centers.shape[0]

    # See http://jmonlong.github.io/Hippocamplus/2018/06/09/cluster-same-size/#same-size-k-means-variation
    if same_cluster_size:
        cluster_size = n_samples // n_clusters
        labels = np.zeros(n_samples, dtype=np.int32)
        mindist = np.zeros(n_samples, dtype=np.float32)
        # count how many samples have been labeled in a cluster
        counters = np.zeros(n_clusters, dtype=np.int32)
        # dist: (n_samples, n_clusters)
        dist = euclidean_distances(X, centers, squared=False)
        closeness = dist.min(axis=-1) - dist.max(axis=-1)
        ranking = np.argsort(closeness)
        for r in ranking:
            while True:
                label = dist[r].argmin()
                if counters[label] < cluster_size:
                    labels[r] = label
                    counters[label] += 1
                    # squared distances are used for inertia in this function
                    mindist[r] = dist[r, label] ** 2
                    break
                else:
                    dist[r, label] = np.inf
    else:
        # Break up nearest neighbor distance computation into batches to
        # prevent memory blowup in the case of a large number of samples
        # and clusters.
        # TODO: Once PR #7383 is merged use check_inputs=False in metric_kwargs.
        labels, mindist = pairwise_distances_argmin_min(
            X=X, Y=centers, metric='euclidean', metric_kwargs={'squared': True})

    # cython k-means code assumes int32 inputs
    labels = labels.astype(np.int32, copy=False)
    if n_samples == distances.shape[0]:
        # distances will be changed in-place
        distances[:] = mindist
    inertia = (mindist * sample_weight).sum()
    return labels, inertia
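The same-size variation referenced above ranks samples by how strongly they prefer their closest center over their farthest one and assigns them greedily under a per-cluster capacity. A minimal standalone sketch of that assignment step, assuming dense NumPy inputs (same_size_assign is an illustrative name, not sklearn API):

import numpy as np
from sklearn.metrics.pairwise import euclidean_distances

def same_size_assign(X, centers):
    # Greedy same-size assignment: each cluster receives exactly
    # n_samples // n_clusters points.
    n_samples, n_clusters = X.shape[0], centers.shape[0]
    capacity = n_samples // n_clusters
    dist = euclidean_distances(X, centers)
    # Samples with a strong preference (best center much closer than the
    # worst one) are assigned first.
    order = np.argsort(dist.min(axis=1) - dist.max(axis=1))
    labels = np.empty(n_samples, dtype=np.int32)
    counts = np.zeros(n_clusters, dtype=np.int32)
    for r in order:
        while True:
            c = dist[r].argmin()
            if counts[c] < capacity:
                labels[r] = c
                counts[c] += 1
                break
            dist[r, c] = np.inf  # cluster is full, try the next closest one
    return labels

X = np.array([[0.0, 0.0], [0.1, 0.0], [5.0, 5.0], [5.1, 5.0]])
centers = np.array([[0.0, 0.0], [5.0, 5.0]])
print(same_size_assign(X, centers))  # [0 0 1 1]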
Example #5
def kmeans_lloyd(X, sample_weight, n_clusters, max_iter=300,
                 init='k-means++', verbose=False, x_squared_norms=None,
                 random_state=None, tol=1e-4, same_cluster_size=False):
    """A single run of k-means, assumes preparation completed prior.
    Parameters
    ----------
    X : array-like of floats, shape (n_samples, n_features)
        The observations to cluster.
    n_clusters : int
        The number of clusters to form as well as the number of
        centroids to generate.
    sample_weight : array-like, shape (n_samples,)
        The weights for each observation in X.
    max_iter : int, optional, default 300
        Maximum number of iterations of the k-means algorithm to run.
    init : {'k-means++', 'random', or ndarray, or a callable}, optional
        Method for initialization, default to 'k-means++':
        'k-means++' : selects initial cluster centers for k-means
        clustering in a smart way to speed up convergence. See section
        Notes in k_init for more details.
        'random': choose k observations (rows) at random from data for
        the initial centroids.
        If an ndarray is passed, it should be of shape (k, p) and gives
        the initial centers.
        If a callable is passed, it should take arguments X, k and
        a random state and return an initialization.
    tol : float, optional
        The relative increment in the results before declaring convergence.
    verbose : boolean, optional
        Verbosity mode
    x_squared_norms : array
        Precomputed x_squared_norms.
    same_cluster_size : boolean, default: False
        If True, force every cluster to contain exactly
        n_samples // n_clusters samples (n_samples must be divisible
        by n_clusters).
    random_state : int, RandomState instance or None (default)
        Determines random number generation for centroid initialization. Use
        an int to make the randomness deterministic.
        See :term:`Glossary <random_state>`.
    Returns
    -------
    label : integer ndarray with shape (n_samples,)
        label[i] is the code or index of the centroid the
        i'th observation is closest to.
    inertia : float
        The final value of the inertia criterion (sum of squared distances to
        the closest centroid for all observations in the training set).
    centroid : float ndarray with shape (k, n_features)
        Centroids found at the last iteration of k-means.
    n_iter : int
        Number of iterations run.
    """
    random_state = check_random_state(random_state)
    if same_cluster_size:
        assert len(X) % n_clusters == 0, "#samples is not divisible by #clusters"

    if verbose:
        print("\n==> Starting k-means clustering...\n")

    sample_weight = _check_sample_weight(X, sample_weight)
    if x_squared_norms is None:
        x_squared_norms = row_norms(X, squared=True)

    best_labels, best_inertia, best_centers = None, None, None
    # init
    centers = _init_centroids(X, n_clusters, init, random_state=random_state,
                              x_squared_norms=x_squared_norms)
    if verbose:
        print("Initialization complete")

    # Allocate memory to store the distances for each sample to its
    # closer center for reallocation in case of ties
    distances = np.zeros(shape=(X.shape[0],), dtype=X.dtype)

    # iterations
    for i in range(max_iter):
        centers_old = centers.copy()
        # labels assignment is also called the E-step of EM
        labels, inertia = \
            _labels_inertia(X, sample_weight, x_squared_norms,
                            centers, distances=distances, same_cluster_size=same_cluster_size)

        # computation of the means is also called the M-step of EM
        centers = _centers_dense(
            X, sample_weight, labels, n_clusters, distances)

        if verbose:
            print("Iteration %2d, inertia %.3f" % (i, inertia))

        if best_inertia is None or inertia < best_inertia:
            best_labels = labels.copy()
            best_centers = centers.copy()
            best_inertia = inertia

        center_shift_total = squared_norm(centers_old - centers)
        if center_shift_total <= tol:
            if verbose:
                print("Converged at iteration %d: "
                      "center shift %e within tolerance %e"
                      % (i, center_shift_total, tol))
            break

    if center_shift_total > 0:
        # rerun E-step in case of non-convergence so that predicted labels
        # match cluster centers
        best_labels, best_inertia = \
            _labels_inertia(X, sample_weight, x_squared_norms,
                            best_centers, distances=distances, same_cluster_size=same_cluster_size)

    return best_labels, best_inertia, best_centers, i + 1
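A hedged usage sketch for kmeans_lloyd above; it assumes the private helpers the function relies on (_check_sample_weight, _init_centroids, _centers_dense, _labels_inertia from the old sklearn.cluster.k_means_ internals) are importable in the same namespace, as in the module this example was taken from:

import numpy as np

rng = np.random.RandomState(0)
# two well-separated blobs of 50 samples each, so n_samples % n_clusters == 0
X = np.vstack([rng.normal(0.0, 1.0, size=(50, 2)),
               rng.normal(5.0, 1.0, size=(50, 2))])

labels, inertia, centers, n_iter = kmeans_lloyd(
    X, sample_weight=None, n_clusters=2,
    random_state=0, same_cluster_size=True)
print(np.bincount(labels))  # array([50, 50]) -- equal-sized clusters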
def subspace_kmeans_single(X,
                           sample_weight,
                           n_clusters,
                           init='k-means++',
                           max_iter=300,
                           tol=1e-4,
                           tol_eig=-1e-10,
                           verbose=False,
                           x_squared_norms=None,
                           random_state=None):
    random_state = check_random_state(random_state)
    sample_weight = _check_sample_weight(X, sample_weight)

    best_labels, best_inertia, best_centers = None, None, None
    # init
    centers = _init_centroids(X,
                              n_clusters,
                              init,
                              random_state=random_state,
                              x_squared_norms=x_squared_norms)
    if verbose:
        print("Initialization complete")

    # Allocate memory to store the distances for each sample to its
    # closer center for reallocation in case of ties
    distances = np.zeros(shape=(X.shape[0], ), dtype=X.dtype)

    # === Beginning of original implementation of initialization ===

    # Dimensionality of original space
    d = X.shape[1]

    # Set initial V as QR-decomposed Q of random matrix
    rand_vals = random_state.random_sample(d**2).reshape(d, d)
    V, _ = np.linalg.qr(rand_vals, mode='complete')

    # Set initial m as d/2
    m = d // 2

    # Scatter matrix of the dataset in the original space
    S_D = np.dot(X.T, X)

    # Projection onto the first m attributes
    P_C = np.eye(m, M=d).T

    # === End of original implementation of initialization ===

    # iterations
    for i in range(max_iter):
        centers_old = centers.copy()

        # === Beginning of original implementation of E-step of EM ===

        X_C = np.dot(np.dot(X, V), P_C)
        mu_C = np.dot(np.dot(centers, V), P_C)
        labels, _ = pairwise_distances_argmin_min(
            X=X_C, Y=mu_C, metric='euclidean', metric_kwargs={'squared': True})
        labels = labels.astype(np.int32)

        # === End of original implementation of E-step of EM ===

        # computation of the means is also called the M-step of EM
        centers = _k_means._centers_dense(X, sample_weight, labels, n_clusters,
                                          distances)

        # === Beginning of original implementation of M-step of EM ===

        S = np.zeros((d, d))
        # use j here so the outer iteration counter i is not clobbered
        for j in range(n_clusters):
            X_j = X[labels == j] - centers[j]
            S += np.dot(X_j.T, X_j)
        Sigma = S - S_D
        evals, evecs = np.linalg.eigh(Sigma)
        idx = np.argsort(evals)[::1]
        V = evecs[:, idx]
        m = len(np.where(evals < tol_eig)[0])
        if m == 0:
            raise ValueError(
                'Dimensionality of clustered space is 0. '
                'The dataset is better explained by a single cluster.')
        P_C = np.eye(m, M=d).T
        inertia = 0.0
        for j in range(n_clusters):
            inertia += row_norms(X[labels == j] - centers[j],
                                 squared=True).sum()

        # === End of original implementation of M-step of EM ===

        if verbose:
            print("Iteration %2d, inertia %.3f" % (i, inertia))

        if best_inertia is None or inertia < best_inertia:
            best_labels = labels.copy()
            best_centers = centers.copy()
            best_inertia = inertia

        center_shift_total = squared_norm(centers_old - centers)
        if center_shift_total <= tol:
            if verbose:
                print("Converged at iteration %d: "
                      "center shift %e within tolerance %e" %
                      (i, center_shift_total, tol))
            break

    if center_shift_total > 0:
        # rerun E-step in case of non-convergence so that predicted labels
        # match cluster centers
        best_labels, best_inertia = \
            _labels_inertia(X, sample_weight, x_squared_norms, best_centers,
                            precompute_distances=False,
                            distances=distances)

    return best_labels, best_inertia, best_centers, i + 1
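A quick NumPy check of the projection trick used in the E-step above: P_C = np.eye(m, M=d).T is a d x m matrix that keeps only the first m coordinates, so np.dot(np.dot(X, V), P_C) is the data expressed in the rotated basis V restricted to the clustered subspace (standalone sketch, sizes chosen for illustration):

import numpy as np

rng = np.random.RandomState(0)
d, m = 6, 3
X = rng.normal(size=(10, d))
V, _ = np.linalg.qr(rng.normal(size=(d, d)), mode='complete')
P_C = np.eye(m, M=d).T                           # shape (d, m)
print(np.allclose(X @ V @ P_C, (X @ V)[:, :m]))  # True: first m rotated axes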
def _spherical_kmeans_single_lloyd(
    X,
    n_clusters,
    sample_weight=None,
    max_iter=300,
    init="k-means++",
    verbose=False,
    x_squared_norms=None,
    random_state=None,
    tol=1e-4,
    precompute_distances=True,
):
    """
    Modified from sklearn.cluster.k_means_.k_means_single_lloyd.
    """
    random_state = check_random_state(random_state)

    sample_weight = _check_sample_weight(sample_weight, X)

    best_labels, best_inertia, best_centers = None, None, None

    # init
    centers = _init_centroids(
        X, n_clusters, init, random_state=random_state, x_squared_norms=x_squared_norms
    )
    if verbose:
        print("Initialization complete")

    # Allocate memory to store the distances for each sample to its
    # closer center for reallocation in case of ties
    distances = np.zeros(shape=(X.shape[0],), dtype=X.dtype)

    # iterations
    for i in range(max_iter):
        centers_old = centers.copy()

        # labels assignment
        # TODO: _labels_inertia should be done with cosine distance
        #       since ||a - b||^2 = 2(1 - cos(a, b)) when a, b are unit-normalized,
        #       this doesn't really matter.
        labels, inertia = _labels_inertia(
            X,
            sample_weight,
            x_squared_norms,
            centers,
            precompute_distances=precompute_distances,
            distances=distances,
        )

        # computation of the means
        if sp.issparse(X):
            centers = _k_means._centers_sparse(
                X, sample_weight, labels, n_clusters, distances
            )
        else:
            # np.float was removed from NumPy; use the explicit float64 dtype
            centers = _k_means._centers_dense(
                X.astype(np.float64),
                sample_weight.astype(np.float64),
                labels,
                n_clusters,
                distances.astype(np.float64),
            )

        # l2-normalize centers (this is the main contribution here)
        centers = normalize(centers)

        if verbose:
            print("Iteration %2d, inertia %.3f" % (i, inertia))

        if best_inertia is None or inertia < best_inertia:
            best_labels = labels.copy()
            best_centers = centers.copy()
            best_inertia = inertia

        center_shift_total = squared_norm(centers_old - centers)
        if center_shift_total <= tol:
            if verbose:
                print(
                    "Converged at iteration %d: "
                    "center shift %e within tolerance %e" % (i, center_shift_total, tol)
                )
            break

    if center_shift_total > 0:
        # rerun E-step in case of non-convergence so that predicted labels
        # match cluster centers
        best_labels, best_inertia = _labels_inertia(
            X,
            sample_weight,
            x_squared_norms,
            best_centers,
            precompute_distances=precompute_distances,
            distances=distances,
        )

    return best_labels, best_inertia, best_centers, i + 1
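The TODO inside the loop relies on an identity worth spelling out: spherical k-means assumes the rows of X are L2-normalized, and for unit vectors ||a - b||^2 = 2(1 - cos(a, b)), so minimizing squared euclidean distance and maximizing cosine similarity select the same center. A small standalone check:

import numpy as np
from sklearn.preprocessing import normalize

rng = np.random.RandomState(0)
X_unit = normalize(rng.normal(size=(10, 5)))  # rows now have unit L2 norm
a, b = X_unit[0], X_unit[1]
print(np.allclose(np.sum((a - b) ** 2), 2.0 * (1.0 - np.dot(a, b))))  # True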
Example #8
    def sub_kmeans_single_(self, X, sample_weight, x_squared_norms, tol,
                           random_state):
        random_state = check_random_state(random_state)
        sample_weight = _check_sample_weight(X, sample_weight)
        best_labels, best_inertia, best_centers = None, None, None

        distances = np.zeros(shape=(X.shape[0], ), dtype=X.dtype)
        centers = _init_centroids(X,
                                  self.n_clusters,
                                  init='k-means++',
                                  random_state=random_state,
                                  x_squared_norms=x_squared_norms)

        d = X.shape[1]  # dimensionality of the original space
        m = d // 2      # dimensionality of the clustered space
        # scatter matrix of the dataset in the original space
        SD = np.dot(X.T, X)

        # orthonormal matrix of a rigid transformation
        V, _ = np.linalg.qr(random_state.random_sample(d**2).reshape(d, d),
                            mode='complete')
        for i in range(self.max_iter):
            centers_old = centers.copy()

            # get the clusters' labels
            labels = self.assignment_step_(X=X, V=V, centers=centers, m=m)

            # compute new centers and sum the clusters' scatter matrices
            centers = _k_means._centers_dense(X, sample_weight, labels,
                                              self.n_clusters, distances)
            S = self.update_step_(X, centers, labels)

            # sorted eigenvalues and eigenvectors of SIGMA=S-SD
            V, m = self.eigen_decomposition_(S - SD)
            if m == 0:
                raise ValueError('Might be a single cluster (m = 0).')

            # inertia - sum of squared distances of samples to their closest cluster center
            inertia = sum([
                row_norms(X[labels == j] - centers[j], squared=True).sum()
                for j in range(self.n_clusters)
            ])

            # print("Iteration %2d, inertia %.3f" % (i, inertia))
            if best_inertia is None or inertia < best_inertia:
                best_labels = labels.copy()
                best_centers = centers.copy()
                best_inertia = inertia

            center_shift_total = squared_norm(centers_old - centers)
            if center_shift_total <= tol:
                # print("Converged at iteration %d: center shift %e within tolerance %e" % (i, center_shift_total, tol))
                break

        if center_shift_total > 0:
            # rerun E-step in case of non-convergence so that predicted labels match cluster centers
            best_labels, best_inertia = _labels_inertia(
                X,
                sample_weight,
                x_squared_norms,
                best_centers,
                precompute_distances=False,
                distances=distances)

        return best_centers, best_labels, best_inertia
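The update step above hinges on the sign structure of Sigma = S - SD: eigen-directions with negative eigenvalues are those along which the summed within-cluster scatter S is smaller than the total scatter SD, and their count m is the dimensionality of the clustered subspace. The eigen_decomposition_ method itself is not included in this excerpt; a hypothetical standalone equivalent, modeled on the inline version in the earlier subspace example, could look like this:

import numpy as np

def eigen_decomposition(sigma, tol_eig=-1e-10):
    # Hypothetical helper, not part of the class above: np.linalg.eigh returns
    # the eigenvalues of the symmetric matrix sigma in ascending order, so the
    # eigenvectors are already sorted; m counts the strictly negative ones.
    evals, evecs = np.linalg.eigh(sigma)
    m = int(np.sum(evals < tol_eig))
    return evecs, m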