def Subspace_iter(X, n_clusters, init='k-means++', max_iter=300, tol=1e-4, tol_eig=-1e-10, x_squared_norms=None, random_state=None):
    random_state = check_random_state(random_state)
    centers = _init_centroids(X, n_clusters, init, random_state=random_state, x_squared_norms=x_squared_norms)

    new_labels, new_inertia, new_centers = None, None, None

    distances = np.zeros(shape=(X.shape[0],), dtype=X.dtype)
    d_shape = X.shape[1]
    randomval = random_state.random_sample(d_shape ** 2).reshape(d_shape, d_shape)
    V_val, _ = np.linalg.qr(randomval, mode='complete')
    m_val = d_shape // 2
    S_D = np.dot(X.T, X)
    P_Cluster = np.eye(m_val, M=d_shape).T
    for i in range(max_iter):
        centers_old = centers.copy()
        X_values = np.dot(np.dot(X, V_val), P_Cluster)
        centers_c = np.dot(np.dot(centers, V_val), P_Cluster)
        labels, _ = pairwise_distances_argmin_min(
            X=X_values, Y=centers_c, metric='euclidean',
            metric_kwargs={'squared': True})
        labels = labels.astype(np.int32)
        centers = _k_means._centers_dense(X, labels, n_clusters, distances)
        S = np.zeros((d_shape, d_shape))
        for it in range(n_clusters):
            X_it = X[labels == it] - centers[it]
            S += np.dot(X_it.T, X_it)
        Sigma = S - S_D
        EV, EV_vecs = np.linalg.eigh(Sigma)  # eigenvalues in ascending order
        V_val = EV_vecs  # rotate with the updated eigenbasis
        m = len(np.where(EV < tol_eig)[0])
        if m == 0:
            raise ValueError(
                'Dimensionality of clustered space is 0. '
                'The dataset is better explained by a single cluster.')
        P_Cluster = np.eye(m, M=d_shape).T
        inertia = 0.0
        for j in range(n_clusters):
            inertia += row_norms(X[labels == j] - centers[j],
                                 squared=True).sum()

        if new_inertia is None or inertia < new_inertia:
            new_labels = labels.copy()
            new_centers = centers.copy()
            new_inertia = inertia

        center_shift_total = squared_norm(centers_old - centers)
        if center_shift_total <= tol:
            break

    if center_shift_total > 0:
        new_labels, new_inertia = _labels_inertia(X, x_squared_norms, new_centers,
                            precompute_distances=False,
                            distances=distances)
    return new_labels, new_inertia, new_centers, i + 1
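# A minimal, self-contained demo of the projected assignment step used in
# Subspace_iter above, built from public NumPy/sklearn APIs only; the data,
# shapes and cluster count are illustrative, not part of the original code.
import numpy as np
from sklearn.metrics import pairwise_distances_argmin_min

rng = np.random.RandomState(0)
X_demo = rng.randn(100, 6)                        # 100 samples, 6 features
centers_demo = X_demo[rng.choice(100, 3, replace=False)]
V_demo, _ = np.linalg.qr(rng.random_sample((6, 6)), mode='complete')
P_demo = np.eye(3, M=6).T                         # keep first 3 rotated axes
labels_demo, _ = pairwise_distances_argmin_min(
    X_demo @ V_demo @ P_demo, centers_demo @ V_demo @ P_demo,
    metric='euclidean')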
def _kmeans_single(X, n_clusters, x_squared_norms, max_iter=300,
                   init='k-means++', verbose=False, random_state=None,
                   tol=1e-4, precompute_distances=True, sample_weight=None):
    """A single run of k-means, assumes preparation completed prior.
    Parameters
    ----------
    X: array-like of floats, shape (n_samples, n_features)
        The observations to cluster.
    n_clusters: int
        The number of clusters to form as well as the number of
        centroids to generate.
    max_iter: int, optional, default 300
        Maximum number of iterations of the k-means algorithm to run.
    init: {'k-means++', 'random', or ndarray, or a callable}, optional
        Method for initialization, defaults to 'k-means++':
        'k-means++' : selects initial cluster centers for k-means
        clustering in a smart way to speed up convergence. See section
        Notes in k_init for more details.
        'random': choose k observations (rows) at random from data for
        the initial centroids.
        If an ndarray is passed, it should be of shape (k, p) and gives
        the initial centers.
        If a callable is passed, it should take arguments X, k and
        a random state and return an initialization.
    tol: float, optional
        The relative increment in the results before declaring convergence.
    verbose: boolean, optional
        Verbosity mode
    x_squared_norms: array
        Precomputed x_squared_norms.
    precompute_distances : boolean, default: True
        Precompute distances (faster but takes more memory).
    random_state: integer or numpy.RandomState, optional
        The generator used to initialize the centers. If an integer is
        given, it fixes the seed. Defaults to the global numpy random
        number generator.
    Returns
    -------
    centroid: float ndarray with shape (k, n_features)
        Centroids found at the last iteration of k-means.
    label: integer ndarray with shape (n_samples,)
        label[i] is the code or index of the centroid the
        i'th observation is closest to.
    inertia: float
        The final value of the inertia criterion (sum of squared distances to
        the closest centroid for all observations in the training set).
    n_iter : int
        Number of iterations run.
    """

    if sample_weight is None:
        sample_weight = np.ones(X.shape[0])
    sample_weight = np.asarray(sample_weight)

    random_state = check_random_state(random_state)

    best_labels, best_inertia, best_centers = None, None, None
    # init
    centers = k_means_._init_centroids(X, n_clusters, init,
                                       random_state=random_state,
                                       x_squared_norms=x_squared_norms)
    if verbose:
        print("Initialization complete")

    # Allocate memory to store the distances for each sample to its
    # closer center for reallocation in case of ties
    distances = np.zeros(shape=(X.shape[0],), dtype=np.float64)

    # iterations
    for i in range(max_iter):
        centers_old = centers.copy()
        # labels assignment is also called the E-step of EM
        labels, inertia = \
            _labels_inertia(X, x_squared_norms, centers,
                            precompute_distances=precompute_distances,
                            distances=distances)

        # computation of the means is also called the M-step of EM
        if sp.issparse(X):
            centers = _k_means._centers_sparse(X, sample_weight, labels, n_clusters,
                                               distances)
        else:
            centers = _k_means._centers_dense(X, sample_weight, labels, n_clusters, distances)

        if verbose:
            print("Iteration %2d, inertia %.3f" % (i, inertia))

        if best_inertia is None or inertia < best_inertia:
            best_labels = labels.copy()
            best_centers = centers.copy()
            best_inertia = inertia

        shift = squared_norm(centers_old - centers)
        if shift <= tol:
            if verbose:
                print("Converged at iteration %d" % i)

            break

    if shift > 0:
        # rerun E-step in case of non-convergence so that predicted labels
        # match cluster centers
        best_labels, best_inertia = \
            _labels_inertia(X, x_squared_norms, best_centers,
                            precompute_distances=precompute_distances,
                            distances=distances)

    return best_labels, best_inertia, best_centers, i + 1
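# _k_means._centers_dense is a private Cython helper whose signature varies
# across sklearn versions. A NumPy-only sketch of the same weighted-mean
# update for dense data; unlike sklearn it does not relocate empty clusters
# to far-away points (which is what the `distances` argument drives):
import numpy as np

def centers_dense_numpy(X, sample_weight, labels, n_clusters):
    centers = np.zeros((n_clusters, X.shape[1]), dtype=X.dtype)
    np.add.at(centers, labels, X * sample_weight[:, np.newaxis])
    weight_in_cluster = np.bincount(labels, weights=sample_weight,
                                    minlength=n_clusters)
    weight_in_cluster[weight_in_cluster == 0] = 1  # guard empty clusters
    return centers / weight_in_cluster[:, np.newaxis]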
def subspace_kmeans_single(X,
                           sample_weight,
                           n_clusters,
                           init='k-means++',
                           max_iter=300,
                           tol=1e-4,
                           tol_eig=-1e-10,
                           verbose=False,
                           x_squared_norms=None,
                           random_state=None):
    random_state = check_random_state(random_state)
    sample_weight = _check_sample_weight(X, sample_weight)

    best_labels, best_inertia, best_centers = None, None, None
    # init
    centers = _init_centroids(X,
                              n_clusters,
                              init,
                              random_state=random_state,
                              x_squared_norms=x_squared_norms)
    if verbose:
        print("Initialization complete")

    # Allocate memory to store the distances for each sample to its
    # closer center for reallocation in case of ties
    distances = np.zeros(shape=(X.shape[0], ), dtype=X.dtype)

    # === Beginning of original implementation of initialization ===

    # Dimensionality of original space
    d = X.shape[1]

    # Set initial V as QR-decomposed Q of random matrix
    rand_vals = random_state.random_sample(d**2).reshape(d, d)
    V, _ = np.linalg.qr(rand_vals, mode='complete')

    # Set initial m as d/2
    m = d // 2

    # Scatter matrix of the dataset in the original space
    S_D = np.dot(X.T, X)

    # Projection onto the first m attributes
    P_C = np.eye(m, M=d).T

    # === End of original implementation of initialization ===

    # iterations
    for i in range(max_iter):
        centers_old = centers.copy()

        # === Beginning of original implementation of E-step of EM ===

        X_C = np.dot(np.dot(X, V), P_C)
        mu_C = np.dot(np.dot(centers, V), P_C)
        labels, _ = pairwise_distances_argmin_min(
            X=X_C, Y=mu_C, metric='euclidean', metric_kwargs={'squared': True})
        labels = labels.astype(np.int32)

        # === End of original implementation of E-step of EM ===

        # computation of the means is also called the M-step of EM
        centers = _k_means._centers_dense(X, sample_weight, labels, n_clusters,
                                          distances)

        # === Beginning of original implementation of M-step of EM ===

        S = np.zeros((d, d))
        for c in range(n_clusters):
            X_c = X[labels == c] - centers[c]
            S += np.dot(X_c.T, X_c)
        Sigma = S - S_D
        evals, evecs = np.linalg.eigh(Sigma)
        # eigh already returns eigenvalues (and the matching eigenvector
        # columns) in ascending order
        V = evecs
        m = len(np.where(evals < tol_eig)[0])
        if m == 0:
            raise ValueError(
                'Dimensionality of clustered space is 0. '
                'The dataset is better explained by a single cluster.')
        P_C = np.eye(m, M=d).T
        inertia = 0.0
        for c in range(n_clusters):
            inertia += row_norms(X[labels == c] - centers[c],
                                 squared=True).sum()

        # === End of original implementation of M-step of EM ===

        if verbose:
            print("Iteration %2d, inertia %.3f" % (i, inertia))

        if best_inertia is None or inertia < best_inertia:
            best_labels = labels.copy()
            best_centers = centers.copy()
            best_inertia = inertia

        center_shift_total = squared_norm(centers_old - centers)
        if center_shift_total <= tol:
            if verbose:
                print("Converged at iteration %d: "
                      "center shift %e within tolerance %e" %
                      (i, center_shift_total, tol))
            break

    if center_shift_total > 0:
        # rerun E-step in case of non-convergence so that predicted labels
        # match cluster centers
        best_labels, best_inertia = \
            _labels_inertia(X, sample_weight, x_squared_norms, best_centers,
                            precompute_distances=False,
                            distances=distances)

    return best_labels, best_inertia, best_centers, i + 1
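# The clustered-space dimensionality m above is the number of eigenvalues of
# Sigma = S - S_D below tol_eig. A standalone check of that eigen step on an
# illustrative symmetric matrix (the matrix and threshold are made up):
import numpy as np

rng = np.random.RandomState(1)
A = rng.randn(5, 5)
Sigma_demo = (A + A.T) / 2.0             # symmetric, mixed-sign spectrum
evals, evecs = np.linalg.eigh(Sigma_demo)
assert np.all(np.diff(evals) >= 0)       # ascending order, no sorting needed
m_demo = int(np.sum(evals < -1e-10))     # count the negative eigenvalues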
def kmeans_constrained_single(X,
                              n_clusters,
                              size_min=None,
                              size_max=None,
                              max_iter=300,
                              init='k-means++',
                              verbose=False,
                              x_squared_norms=None,
                              random_state=None,
                              tol=1e-4):
    """A single run of k-means constrained, assumes preparation completed prior.

    Parameters
    ----------
    X : array-like of floats, shape (n_samples, n_features)
        The observations to cluster.

    n_clusters : int
        The number of clusters to form as well as the number of
        centroids to generate.

    size_min : int, optional, default: None
        Constrain the label assignment so that each cluster has a minimum
        size of size_min. If None, no constraint will be applied.

    size_max : int, optional, default: None
        Constrain the label assignment so that each cluster has a maximum
        size of size_max. If None, no constraint will be applied.

    max_iter : int, optional, default 300
        Maximum number of iterations of the k-means algorithm to run.

    init : {'k-means++', 'random', or ndarray, or a callable}, optional
        Method for initialization, defaults to 'k-means++':

        'k-means++' : selects initial cluster centers for k-means
        clustering in a smart way to speed up convergence. See section
        Notes in k_init for more details.

        'random': choose k observations (rows) at random from data for
        the initial centroids.

        If an ndarray is passed, it should be of shape (k, p) and gives
        the initial centers.

        If a callable is passed, it should take arguments X, k and
        a random state and return an initialization.

    tol : float, optional
        The relative increment in the results before declaring convergence.

    verbose : boolean, optional
        Verbosity mode

    x_squared_norms : array
        Precomputed x_squared_norms.

    random_state : int, RandomState instance or None, optional, default: None
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`.

    Returns
    -------
    centroid : float ndarray with shape (k, n_features)
        Centroids found at the last iteration of k-means.

    label : integer ndarray with shape (n_samples,)
        label[i] is the code or index of the centroid the
        i'th observation is closest to.

    inertia : float
        The final value of the inertia criterion (sum of squared distances to
        the closest centroid for all observations in the training set).

    n_iter : int
        Number of iterations run.
    """

    sample_weight = np.ones(X.shape[0])
    random_state = check_random_state(random_state)
    n_samples = X.shape[0]

    best_labels, best_inertia, best_centers = None, None, None
    # init
    centers = _init_centroids(X,
                              n_clusters,
                              init,
                              random_state=random_state,
                              x_squared_norms=x_squared_norms)
    if verbose:
        print("Initialization complete")

    # Allocate memory to store the distances for each sample to its
    # closer center for reallocation in case of ties
    distances = np.zeros(shape=(n_samples, ), dtype=X.dtype)

    # Determine min and max sizes if none given
    if size_min is None:
        size_min = 0
    if size_max is None:
        size_max = n_samples  # Number of data points

    # Check size min and max
    if not ((size_min >= 0) and (size_min <= n_samples) and (size_max >= 0) and
            (size_max <= n_samples)):
        raise ValueError(
            "size_min and size_max must be a positive number smaller "
            "than the number of data points or `None`")
    if size_max < size_min:
        raise ValueError("size_max must be greater than or equal to size_min")
    if size_min * n_clusters > n_samples:
        raise ValueError(
            "The product of size_min and n_clusters cannot exceed the number of samples (X)"
        )

    # iterations
    for i in range(max_iter):
        centers_old = centers.copy()
        # labels assignment is also called the E-step of EM
        labels, inertia = \
            _labels_constrained(X, centers, size_min, size_max, distances=distances)

        # computation of the means is also called the M-step of EM
        if sp.issparse(X):
            centers = _centers_sparse(X, sample_weight, labels, n_clusters,
                                      distances)
        else:
            centers = _centers_dense(X, sample_weight, labels, n_clusters,
                                     distances)

        if verbose:
            print("Iteration %2d, inertia %.3f" % (i, inertia))

        if best_inertia is None or inertia < best_inertia:
            best_labels = labels.copy()
            best_centers = centers.copy()
            best_inertia = inertia

        center_shift_total = squared_norm(centers_old - centers)
        if center_shift_total <= tol:
            if verbose:
                print("Converged at iteration %d: "
                      "center shift %e within tolerance %e" %
                      (i, center_shift_total, tol))
            break

    if center_shift_total > 0:
        # rerun E-step in case of non-convergence so that predicted labels
        # match cluster centers
        best_labels, best_inertia = \
            _labels_constrained(X, best_centers, size_min, size_max, distances=distances)

    return best_labels, best_inertia, best_centers, i + 1
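# A hedged usage sketch for the constrained single run; X_demo is synthetic,
# and the private helpers used above (_init_centroids, _labels_constrained,
# _centers_dense/_centers_sparse) must be importable for this to run:
import numpy as np
from sklearn.utils.extmath import row_norms

X_demo = np.random.RandomState(0).randn(60, 2)
labels, inertia, centers, n_iter = kmeans_constrained_single(
    X_demo, n_clusters=3, size_min=10, size_max=30,
    x_squared_norms=row_norms(X_demo, squared=True), random_state=0)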
def _spherical_kmeans_single_lloyd(X,
                                   n_clusters,
                                   max_iter=300,
                                   init='k-means++',
                                   verbose=False,
                                   x_squared_norms=None,
                                   random_state=None,
                                   tol=1e-4,
                                   precompute_distances=True):
    '''
    Modified from sklearn.cluster.k_means_.k_means_single_lloyd.
    '''
    random_state = check_random_state(random_state)

    best_labels, best_inertia, best_centers = None, None, None

    # init
    centers = _init_centroids(X,
                              n_clusters,
                              init,
                              random_state=random_state,
                              x_squared_norms=x_squared_norms)
    if verbose:
        print("Initialization complete")

    # Allocate memory to store the distances for each sample to its
    # closer center for reallocation in case of ties
    distances = np.zeros(shape=(X.shape[0], ), dtype=X.dtype)

    # iterations
    for i in range(max_iter):
        centers_old = centers.copy()

        # labels assignment
        # TODO: _labels_inertia could use cosine distance here, but since
        #       ||a - b||^2 = 2(1 - cos(a, b)) when a, b are unit normalized,
        #       this doesn't really matter.
        labels, inertia = \
            _labels_inertia(X, x_squared_norms, centers,
                            precompute_distances=precompute_distances,
                            distances=distances)

        # computation of the means
        if sp.issparse(X):
            centers = _k_means._centers_sparse(X, labels, n_clusters,
                                               distances)
        else:
            centers = _k_means._centers_dense(X, labels, n_clusters, distances)

        # l2-normalize centers (this is the main contribution here)
        centers = normalize(centers)

        if verbose:
            print("Iteration %2d, inertia %.3f" % (i, inertia))

        if best_inertia is None or inertia < best_inertia:
            best_labels = labels.copy()
            best_centers = centers.copy()
            best_inertia = inertia

        center_shift_total = squared_norm(centers_old - centers)
        if center_shift_total <= tol:
            if verbose:
                print("Converged at iteration %d: "
                      "center shift %e within tolerance %e" %
                      (i, center_shift_total, tol))
            break

    if center_shift_total > 0:
        # rerun E-step in case of non-convergence so that predicted labels
        # match cluster centers
        best_labels, best_inertia = \
            _labels_inertia(X, x_squared_norms, best_centers,
                            precompute_distances=precompute_distances,
                            distances=distances)

    return best_labels, best_inertia, best_centers, i + 1
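# For unit-normalized rows the squared euclidean distance is an affine
# function of cosine similarity, ||a - b||^2 = 2(1 - cos(a, b)), which is why
# the plain _labels_inertia assignment above still picks the same centers. A
# quick numeric check with made-up vectors:
import numpy as np
from sklearn.preprocessing import normalize

rng = np.random.RandomState(0)
a, b = normalize(rng.randn(2, 5))
assert np.isclose(np.sum((a - b) ** 2), 2 * (1 - np.dot(a, b)))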
Example #6
def _kmeans_single_lloyd(X,
                         sample_weight,
                         n_clusters,
                         max_iter=300,
                         init='k-means++',
                         verbose=False,
                         x_squared_norms=None,
                         random_state=None,
                         tol=1e-4,
                         precompute_distances=True,
                         group=None):
    """A single run of k-means, assumes preparation completed prior.

    Parameters
    ----------
    X : array-like of floats, shape (n_samples, n_features)
        The observations to cluster.

    n_clusters : int
        The number of clusters to form as well as the number of
        centroids to generate.

    sample_weight : array-like, shape (n_samples,)
        The weights for each observation in X.

    max_iter : int, optional, default 300
        Maximum number of iterations of the k-means algorithm to run.

    init : {'k-means++', 'random', or ndarray, or a callable}, optional
        Method for initialization, defaults to 'k-means++':

        'k-means++' : selects initial cluster centers for k-means
        clustering in a smart way to speed up convergence. See section
        Notes in k_init for more details.

        'random': choose k observations (rows) at random from data for
        the initial centroids.

        If an ndarray is passed, it should be of shape (k, p) and gives
        the initial centers.

        If a callable is passed, it should take arguments X, k and
        a random state and return an initialization.

    tol : float, optional
        The relative increment in the results before declaring convergence.

    verbose : boolean, optional
        Verbosity mode

    x_squared_norms : array
        Precomputed x_squared_norms.

    precompute_distances : boolean, default: True
        Precompute distances (faster but takes more memory).

    random_state : int, RandomState instance or None (default)
        Determines random number generation for centroid initialization. Use
        an int to make the randomness deterministic.
        See :term:`Glossary <random_state>`.

    group : optional, default: None
        Passed through to _labels_inertia.

    Returns
    -------
    centroid : float ndarray with shape (k, n_features)
        Centroids found at the last iteration of k-means.

    label : integer ndarray with shape (n_samples,)
        label[i] is the code or index of the centroid the
        i'th observation is closest to.

    inertia : float
        The final value of the inertia criterion (sum of squared distances to
        the closest centroid for all observations in the training set).

    n_iter : int
        Number of iterations run.
    """
    random_state = check_random_state(random_state)

    sample_weight = _check_sample_weight(X, sample_weight)

    best_labels, best_inertia, best_centers = None, None, None
    # init
    centers = _init_centroids(X,
                              n_clusters,
                              init,
                              random_state=random_state,
                              x_squared_norms=x_squared_norms)
    if verbose:
        print("Initialization complete")

    # Allocate memory to store the distances for each sample to its
    # closer center for reallocation in case of ties
    distances = np.zeros(shape=(X.shape[0], ), dtype=X.dtype)

    # iterations
    for i in range(max_iter):
        centers_old = centers.copy()
        # labels assignment is also called the E-step of EM
        labels, inertia = \
            _labels_inertia(X, sample_weight, x_squared_norms, centers,
                            precompute_distances=precompute_distances,
                            distances=distances, group=group)

        # computation of the means is also called the M-step of EM
        if sp.issparse(X):
            centers = _k_means._centers_sparse(X, sample_weight, labels,
                                               n_clusters, distances)
        else:
            centers = _k_means._centers_dense(X, sample_weight, labels,
                                              n_clusters, distances)

        if verbose:
            print("Iteration %2d, inertia %.3f" % (i, inertia))

        if best_inertia is None or inertia < best_inertia:
            best_labels = labels.copy()
            best_centers = centers.copy()
            best_inertia = inertia

        center_shift_total = squared_norm(centers_old - centers)
        if center_shift_total <= tol:
            if verbose:
                print("Converged at iteration %d: "
                      "center shift %e within tolerance %e" %
                      (i, center_shift_total, tol))
            break

    if center_shift_total > 0:
        # rerun E-step in case of non-convergence so that predicted labels
        # match cluster centers
        best_labels, best_inertia = \
            _labels_inertia(X, sample_weight, x_squared_norms, best_centers,
                            precompute_distances=precompute_distances,
                            distances=distances, group=group)

    return best_labels, best_inertia, best_centers, i + 1
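# The tol compared against the squared center shift here is normally
# pre-scaled by the caller: sklearn's k_means multiplies the user-facing tol
# by the mean per-feature variance of X before calling the single-run
# function. A sketch of that scaling for dense data (the helper name is ours):
import numpy as np

def scaled_tolerance(X, tol):
    # mirrors the dense branch of sklearn.cluster's private _tolerance
    return np.mean(np.var(X, axis=0)) * tol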
Example #7
    def sub_kmeans_single_(self, X, sample_weight, x_squared_norms, tol,
                           random_state):
        random_state = check_random_state(random_state)
        sample_weight = _check_sample_weight(X, sample_weight)
        best_labels, best_inertia, best_centers = None, None, None

        distances = np.zeros(shape=(X.shape[0], ), dtype=X.dtype)
        centers = _init_centroids(X,
                                  self.n_clusters,
                                  init='k-means++',
                                  random_state=random_state,
                                  x_squared_norms=x_squared_norms)

        d = X.shape[1]  # dimensionality of original space
        m = d // 2  # dimensionality of clustered space
        # scatter matrix of the dataset in the original space
        SD = np.dot(X.T, X)

        # orthonormal matrix of a rigid transformation
        V, _ = np.linalg.qr(random_state.random_sample(d**2).reshape(d, d),
                            mode='complete')
        for i in range(self.max_iter):
            centers_old = centers.copy()

            # get the clusters' labels
            labels = self.assignment_step_(X=X, V=V, centers=centers, m=m)

            # compute new centers and sum the clusters' scatter matrices
            centers = _k_means._centers_dense(X, sample_weight, labels,
                                              self.n_clusters, distances)
            S = self.update_step_(X, centers, labels)

            # sorted eigenvalues and eigenvectors of SIGMA=S-SD
            V, m = self.eigen_decomposition_(S - SD)
            if m == 0:
                raise ValueError('Might be a single cluster (m = 0).')

            # inertia - sum of squared distances of samples to their closest cluster center
            inertia = sum([
                row_norms(X[labels == j] - centers[j], squared=True).sum()
                for j in range(self.n_clusters)
            ])

            # print("Iteration %2d, inertia %.3f" % (i, inertia))
            if best_inertia is None or inertia < best_inertia:
                best_labels = labels.copy()
                best_centers = centers.copy()
                best_inertia = inertia

            center_shift_total = squared_norm(centers_old - centers)
            if center_shift_total <= tol:
                # print("Converged at iteration %d: center shift %e within tolerance %e" % (i, center_shift_total, tol))
                break

        if center_shift_total > 0:
            # rerun E-step in case of non-convergence so that predicted labels match cluster centers
            best_labels, best_inertia = _labels_inertia(
                X,
                sample_weight,
                x_squared_norms,
                best_centers,
                precompute_distances=False,
                distances=distances)

        return best_centers, best_labels, best_inertia
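    # The three helpers called above are not shown in this example. Hedged
    # sketches consistent with the call sites (signatures inferred from
    # usage, not taken from the original class); they assume numpy and
    # pairwise_distances_argmin_min are imported as in the other examples.
    def assignment_step_(self, X, V, centers, m):
        # nearest centers after rotating by V and keeping the first m axes
        P = np.eye(m, M=X.shape[1]).T
        labels, _ = pairwise_distances_argmin_min(X @ V @ P, centers @ V @ P)
        return labels.astype(np.int32)

    def update_step_(self, X, centers, labels):
        # sum of the per-cluster scatter matrices
        S = np.zeros((X.shape[1], X.shape[1]))
        for c in range(self.n_clusters):
            X_c = X[labels == c] - centers[c]
            S += np.dot(X_c.T, X_c)
        return S

    def eigen_decomposition_(self, Sigma):
        # eigh returns ascending eigenvalues; m counts the negative ones
        evals, evecs = np.linalg.eigh(Sigma)
        return evecs, int(np.sum(evals < -1e-10))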
Example #8
def _kmeans_single_with_weights(X,
                                weights,
                                n_clusters,
                                x_squared_norms,
                                max_iter=300,
                                init='kmeans++_with_weights',
                                verbose=False,
                                random_state=None,
                                tol=1e-4,
                                precompute_distances=True):
    """A single run of k-means with weights, assumes preparation completed prior.
    Parameters
    ----------
    X: array-like of floats, shape (n_samples, n_features)
        The observations to cluster.
    n_clusters: int
        The number of clusters to form as well as the number of
        centroids to generate.
    weights: array, shape (n_samples)
        The weights for each observation in X.
    max_iter: int, optional, default 300
        Maximum number of iterations of the k-means algorithm to run.
    init: {'kmeans++_with_weights', 'random', or ndarray, or a callable}, optional
        Method for initialization, defaults to 'kmeans++_with_weights':
        'kmeans++_with_weights' : selects initial cluster centers for k-means
        clustering in a smart way to speed up convergence. See section
        Notes in k_init_with_weights for more details.
        'random': generate k centroids from a Gaussian with mean and
        variance estimated from the data.
        If an ndarray is passed, it should be of shape (k, p) and gives
        the initial centers.
        If a callable is passed, it should take arguments X, k and
        a random state and return an initialization.
    tol: float, optional
        The relative increment in the results before declaring convergence.
    verbose: boolean, optional
        Verbosity mode
    x_squared_norms: array
        Precomputed x_squared_norms.
    precompute_distances : boolean, default: True
        Precompute distances (faster but takes more memory).
    random_state: integer or numpy.RandomState, optional
        The generator used to initialize the centers. If an integer is
        given, it fixes the seed. Defaults to the global numpy random
        number generator.
    Returns
    -------
    centroid: float ndarray with shape (k, n_features)
        Centroids found at the last iteration of k-means.
    label: integer ndarray with shape (n_samples,)
        label[i] is the code or index of the centroid the
        i'th observation is closest to.
    weighted inertia: float
        The final value of the inertia criterion (weighted sum of squared distances to
        the closest centroid for all observations in the training set).
    n_iter : int
        Number of iterations run.
    """

    best_labels, best_inertia, best_centers = None, None, None
    # init
    centers = _init_centroids_with_weights(X,
                                           weights,
                                           n_clusters,
                                           init,
                                           random_state=random_state,
                                           x_squared_norms=x_squared_norms)
    if verbose:
        print("Initialization complete")

    # Allocate memory to store the distances for each sample to its
    # closer center for reallocation in case of ties
    distances = np.zeros(shape=(X.shape[0], ), dtype=np.float64)

    # iterations
    for i in range(max_iter):
        centers_old = centers.copy()
        # labels assignment is also called the E-step of EM
        labels, _ = \
            _labels_inertia(X, x_squared_norms, centers,
                            precompute_distances=precompute_distances,
                            distances=distances)

        # computation of the weighted means is also called the M-step of EM:
        # run the unweighted center helper on weight-scaled points, then
        # rescale by the per-cluster weight sums computed below

        if sp.issparse(X):
            centers = _k_means._centers_sparse(
                np.multiply(X, weights.reshape(len(weights), 1)), labels,
                n_clusters, distances)
        else:
            centers = _k_means._centers_dense(
                np.multiply(X, weights.reshape(len(weights), 1)), labels,
                n_clusters, distances)

        weights_clusters = np.bincount(labels, weights=weights,
                                       minlength=n_clusters)
        weights_clusters[weights_clusters == 0] = 1
        n_samples_in_cluster = np.bincount(labels, minlength=n_clusters)
        centers /= weights_clusters[:, np.newaxis]
        centers *= n_samples_in_cluster[:, np.newaxis]

        # weighted inertia is computed here
        centers_labelled = centers[labels]
        row_norms_diff = row_norms(X - centers_labelled,
                                   squared=True)[np.newaxis, :]
        inertia = np.sum(weights * row_norms_diff)

        if verbose:
            print("Iteration %2d, inertia %.3f" % (i, inertia))

        if best_inertia is None or inertia < best_inertia:
            best_labels = labels.copy()
            best_centers = centers.copy()
            best_inertia = inertia

        shift = squared_norm(centers_old - centers)
        if shift <= tol:
            if verbose:
                print("Converged at iteration %d" % i)

            break

    if shift > 0:
        # rerun E-step in case of non-convergence so that predicted labels
        # match cluster centers
        best_labels, _ = \
            _labels_inertia(X, x_squared_norms, best_centers,
                            precompute_distances=precompute_distances,
                            distances=distances)
        # weighted best_inertia is computed here
        best_centers_labelled = best_centers[best_labels]
        best_row_norms_diff = row_norms(X - best_centers_labelled,
                                        squared=True)[np.newaxis, :]
        best_inertia = np.sum(weights * best_row_norms_diff)
    return best_labels, best_inertia, best_centers, i + 1
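# The weighted M-step above averages the weight-scaled points with the
# unweighted helper and then rescales by cluster weight sums and counts. A
# direct NumPy formulation of the same weighted means, as a sketch (again
# without sklearn's empty-cluster relocation):
import numpy as np

def weighted_centers(X, weights, labels, n_clusters):
    sums = np.zeros((n_clusters, X.shape[1]))
    np.add.at(sums, labels, X * weights[:, np.newaxis])
    wsum = np.bincount(labels, weights=weights, minlength=n_clusters)
    wsum[wsum == 0] = 1  # guard empty clusters
    return sums / wsum[:, np.newaxis]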
Example #9
def _kmeans_step(X, x_squared_norms, centers, distances,
                 precompute_distances, n_clusters, random_state=None):
    """One full Lloyd iteration (E-step and M-step) of the K-Means algorithm.

    Parameters
    ----------
    X : array, shape (n_samples, n_features)
        The original data array.
    x_squared_norms : array, shape (n_samples,)
        Squared euclidean norm of each data point.
    centers : array, shape (k, n_features)
        The cluster centers. This array is MODIFIED IN PLACE.
    distances : array, dtype float64, shape (n_samples,), optional
        If not None, should be a pre-allocated array that will be used to
        store the distances of each sample to its closest center.
    precompute_distances : boolean
        Precompute distances (faster but takes more memory).
    n_clusters : int
        The number of clusters to form.
    random_state : integer or numpy.RandomState, optional
        The generator used to initialize the centers. If an integer is
        given, it fixes the seed. Defaults to the global numpy random
        number generator.

    Returns
    -------
    centers : array, shape (k, n_features)
        The updated cluster centers.
    inertia : float
        Sum of squared distances of samples to their closest cluster center.
    shift : float
        Squared norm of the difference between the previous and the updated
        cluster centers.
    """
    centers_old = centers.copy()
    # labels assignment is also called the E-step of EM
    labels, inertia = k_means_._labels_inertia(
        X, x_squared_norms, centers,
        precompute_distances=precompute_distances,
        distances=distances)

    # computation of the means is also called the M-step of EM
    if sp.issparse(X):
        centers = _k_means._centers_sparse(X, labels, n_clusters,
                                           distances)
    else:
        centers = _k_means._centers_dense(X, labels, n_clusters, distances)

    shift = squared_norm(centers_old - centers)
    return centers, inertia, shift
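# _kmeans_step packages one Lloyd iteration, so a driver loop around it
# recovers the usual single run. A hedged sketch (it assumes the private
# sklearn helpers used above are importable; names and defaults are ours):
import numpy as np

def run_kmeans_steps(X, centers, n_clusters, max_iter=300, tol=1e-4):
    x_squared_norms = (X ** 2).sum(axis=1)
    distances = np.zeros(X.shape[0], dtype=np.float64)
    inertia = None
    for _ in range(max_iter):
        centers, inertia, shift = _kmeans_step(
            X, x_squared_norms, centers, distances,
            precompute_distances=True, n_clusters=n_clusters)
        if shift <= tol:
            break
    return centers, inertia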
Example #11
def _kmeans_single(X,
                   n_clusters,
                   x_squared_norms,
                   max_iter=300,
                   init='k-means++',
                   verbose=False,
                   random_state=None,
                   tol=1e-4,
                   precompute_distances=True,
                   distance_function=euclidean_distances):
    random_state = check_random_state(random_state)

    best_labels, best_inertia, best_centers = None, None, None
    # init
    centers = _init_centroids(X,
                              n_clusters,
                              init,
                              random_state=random_state,
                              x_squared_norms=x_squared_norms,
                              distance_function=distance_function)
    if verbose:
        print('Initialization complete')

    # Allocate memory to store the distances for each sample to its
    # closer center for reallocation in case of ties
    distances = np.zeros(shape=(X.shape[0], ), dtype=np.float64)

    # iterations
    for i in range(max_iter):
        centers_old = centers.copy()
        # labels assignment is also called the E-step of EM
        labels, inertia = \
            _labels_inertia(X, x_squared_norms, centers,
                            precompute_distances=precompute_distances,
                            distances=distances)

        # computation of the means is also called the M-step of EM; this
        # code base's _centers_dense expects per-sample weights, so pass
        # uniform ones
        n_samples = X.shape[0]
        sample_weight = np.ones(n_samples, dtype=X.dtype)
        centers = _k_means._centers_dense(X, sample_weight, labels, n_clusters,
                                          distances)

        if verbose:
            print('Iteration %2d, inertia %.3f' % (i, inertia))

        if best_inertia is None or inertia < best_inertia:
            best_labels = labels.copy()
            best_centers = centers.copy()
            best_inertia = inertia

        if squared_norm(centers_old - centers) <= tol:
            if verbose:
                print("Converged at iteration %d" % i)
            break
    return best_labels, best_inertia, best_centers
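# A hedged usage sketch for the distance_function variant; X_demo is
# synthetic, and it assumes the patched _init_centroids in this code base
# accepts a distance_function argument, as the call above implies:
import numpy as np
from sklearn.metrics.pairwise import manhattan_distances
from sklearn.utils.extmath import row_norms

X_demo = np.random.RandomState(0).randn(50, 4)
labels, inertia, centers = _kmeans_single(
    X_demo, n_clusters=3,
    x_squared_norms=row_norms(X_demo, squared=True),
    distance_function=manhattan_distances, random_state=0)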