コード例 #1
0
ファイル: test_k_means.py プロジェクト: yananhou/scikit-learn
def test_x_squared_norms_init_centroids():
    """Check that ``_init_centroids`` works when ``x_squared_norms`` is omitted.

    The centers obtained with explicitly precomputed squared row norms must
    match (up to ``assert_array_almost_equal`` precision) the centers obtained
    when the argument is left at its default, using the same integer seed.
    """
    from sklearn.cluster._kmeans import _init_centroids

    # Squared Euclidean norm of every row of the module-level fixture X.
    squared_row_norms = np.sum(X ** 2, axis=1)

    centers_with_norms = _init_centroids(
        X, 3, "k-means++", random_state=0,
        x_squared_norms=squared_row_norms,
    )
    centers_without_norms = _init_centroids(
        X, 3, "k-means++", random_state=0,
    )

    assert_array_almost_equal(centers_with_norms, centers_without_norms)
コード例 #2
0
def _fit_single(X, y=None, n_clusters=2, init='random', random_state=None,
                metric='riemann', max_iter=100, tol=1e-4, n_jobs=1):
    """Fit a single run of MDM-based centroid clustering.

    Centroids are seeded with ``_init_centroids`` and then refined by
    alternating ``mdm.fit`` on the current labels with a reassignment of every
    sample to the class whose predicted distance is smallest.

    Parameters
    ----------
    X : iterable of 2-D array-like
        Samples to cluster. Each item is passed to
        ``np.linalg.norm(..., ord='fro')``, so it must be a matrix.
    y : array-like or None
        When given, ``np.unique(y)`` is used as the set of class labels;
        otherwise the labels are ``0 .. n_clusters - 1``.
        NOTE(review): presumably ``np.unique(y)`` should contain exactly
        ``n_clusters`` values -- confirm against callers.
    n_clusters : int
        Number of centroids requested from ``_init_centroids``.
    init : str, ndarray or callable
        Initialisation method, forwarded to ``_init_centroids``.
    random_state : int, RandomState instance or None
        Forwarded to ``_init_centroids``.
    metric : str
        Forwarded to the ``MDM`` constructor.
    max_iter : int
        Upper bound on the number of refinement iterations. At least one
        iteration is always performed.
    tol : float
        Iteration stops once the fraction of samples whose label did not
        change exceeds ``1 - tol``.
    n_jobs : int
        Forwarded to the ``MDM`` constructor.

    Returns
    -------
    labels : ndarray
        Class label assigned to each sample after the last iteration.
    inertia : float
        Sum, over all samples, of the distance to the assigned class as
        returned by ``mdm._predict_distances``.
    mdm : MDM
        The fitted estimator.
    """
    mdm = MDM(metric=metric, n_jobs=n_jobs)

    # Squared Frobenius norm of every sample, used by the centroid seeding.
    squared_norms = [np.linalg.norm(x, ord='fro')**2 for x in X]
    mdm.covmeans_ = _init_centroids(X, n_clusters, init,
                                    random_state=random_state,
                                    x_squared_norms=squared_norms)
    if y is not None:
        mdm.classes_ = np.unique(y)
    else:
        mdm.classes_ = np.arange(n_clusters)

    labels = mdm.predict(X)
    n_iter = 0
    while True:
        old_labels = labels.copy()
        mdm.fit(X, old_labels)
        dist = mdm._predict_distances(X)
        labels = mdm.classes_[dist.argmin(axis=1)]
        n_iter += 1
        # n_iter counts completed iterations, so ">=" caps the loop at exactly
        # max_iter passes (">" would allow max_iter + 1).
        if n_iter >= max_iter or np.mean(labels == old_labels) > (1 - tol):
            break

    # Column i of dist holds the distances to class i; keep only the entries
    # of the samples that were assigned to that class.
    inertia = sum(dist[labels == cls, col].sum()
                  for col, cls in enumerate(mdm.classes_))
    return labels, inertia, mdm
コード例 #3
0
def _fuzzykmeans_single_elkan(
        X,
        m,
        sample_weight,
        n_clusters,
        max_iter=300,
        init='k-means++',
        verbose=False,
        x_squared_norms=None,
        random_state=None,
        tol=1e-4,
        precompute_distances=True):
    """Run one fuzzy k-means pass that delegates clustering to ``k_means_elkan``.

    Parameters
    ----------
    X : ndarray, shape (n_samples, n_features)
        Dense observations. Sparse input is rejected.
    m : object
        Forwarded unchanged to ``m_step`` and ``e_step``.
        NOTE(review): presumably the fuzziness exponent -- confirm.
    sample_weight : array-like or None
        Per-sample weights. Normalised through
        ``_check_normalize_sample_weight`` before being handed to
        ``k_means_elkan``; the inertia is only weighted when this argument is
        not None.
    n_clusters : int
        Number of clusters.
    max_iter : int
        Forwarded to ``k_means_elkan``.
    init : str, ndarray or callable
        Forwarded to ``_init_centroids``.
    verbose : bool
        Print a message after initialisation and forward to ``k_means_elkan``.
    x_squared_norms : array or None
        Squared row norms of X; computed with ``row_norms`` when None.
    random_state : int, RandomState instance or None
        Seed for the random memberships and the centroid initialisation.
    tol : float
        Forwarded to ``k_means_elkan``.
    precompute_distances : bool
        Accepted for signature parity; not used in this function.

    Returns
    -------
    fuzzy_labels, labels
        The two values returned by ``e_step`` for the centers produced by
        ``k_means_elkan``.
    inertia : float
        Sum of squared differences between each sample and the (re-estimated)
        center selected by ``labels``, weighted when ``sample_weight`` is given.
    centers
        Result of the final ``m_step``.
    n_iter
        Iteration count reported by ``k_means_elkan``.

    Raises
    ------
    TypeError
        If X is sparse.
    """
    if sp.issparse(X):
        raise TypeError("algorithm='elkan' not supported for sparse input X")

    # Unpacking also enforces that X is two-dimensional.
    n_samples, _ = X.shape
    rng = check_random_state(random_state)

    # Random memberships, each row rescaled to sum to one. Drawn before the
    # centroid initialisation so both consume the generator in a fixed order.
    memberships = rng.rand(n_samples, n_clusters)
    memberships /= memberships.sum(axis=1, keepdims=True)

    if x_squared_norms is None:
        sq_norms = row_norms(X, squared=True)
    else:
        sq_norms = x_squared_norms

    seed_centers = _init_centroids(
        X, n_clusters, init,
        random_state=rng,
        x_squared_norms=sq_norms,
    )
    # NOTE(review): the initial centers, not X, are passed as the first
    # argument of m_step here, unlike the call further below -- confirm this
    # is intended.
    seed_centers = np.ascontiguousarray(m_step(seed_centers, memberships, m))
    if verbose:
        print('Initialization complete')

    weights = _check_normalize_sample_weight(sample_weight, X)
    centers, labels, n_iter = k_means_elkan(
        X, weights, n_clusters, seed_centers,
        tol=tol, max_iter=max_iter, verbose=verbose,
    )

    # Derive memberships from the converged centers, then re-estimate the
    # centers from those memberships.
    memberships, labels = e_step(X, centers, m)
    centers = m_step(X, memberships, m)

    squared_residuals = (X - centers[labels]) ** 2
    if sample_weight is None:
        inertia = np.sum(squared_residuals, dtype=np.float64)
    else:
        per_sample = np.sum(squared_residuals, axis=1, dtype=np.float64)
        inertia = np.sum(per_sample * weights, dtype=np.float64)

    return memberships, labels, inertia, centers, n_iter
コード例 #4
0
def _init_and_run(
    X: np.ndarray,
    y: np.ndarray,
    init,
    init_size,
    r_mat: np.ndarray,
    max_iter,
    verbose,
    tol,
    init_advanced,
    working_memory,
):
    """Initialise cluster centers and run one constrained k-means fit.

    Parameters
    ----------
    X : ndarray
        Samples, one per row.
    y : ndarray
        Per-sample labels; entries equal to ``-1`` mark free (unlabeled)
        samples, which are the only ones used to seed free clusters.
    init, init_size
        Forwarded to ``_init_centroids``.
    r_mat : ndarray
        Rejection matrix; its second dimension gives the number of clusters.
    max_iter, verbose, tol, working_memory
        Forwarded to ``_constrained_kmeans_single``.
    init_advanced : bool
        When true, clusters that have labeled samples (according to the
        decoded ``r_mat``) are initialised to the mean of those samples and
        only the remaining clusters are seeded with ``_init_centroids``.
        Clusters are scanned in index order and the scan stops at the first
        cluster without labeled samples; that cluster and all later ones are
        treated as free.

    Returns
    -------
    tuple
        ``(labels, sample_distances, inertia, centers, n_iter_)`` exactly as
        returned by ``_constrained_kmeans_single``.
    """
    n_clusters = r_mat.shape[1]

    if init_advanced:
        print("Advanced initialization.")

        centers = np.empty((n_clusters, X.shape[1]), dtype=X.dtype)

        # Decode rejection matrix
        y_enc = _r_mat_to_y(r_mat)

        # Initialize centers for determined clusters. The count is tracked
        # separately from the loop index: after a loop that finishes without
        # `break`, the index is n_clusters - 1, which would wrongly leave the
        # last labeled cluster to be overwritten below.
        n_labeled = 0
        for cluster in range(n_clusters):
            mask = y_enc == cluster
            if mask.sum() == 0:
                break
            centers[cluster] = X[mask].mean(axis=0)
            n_labeled += 1

        n_free = n_clusters - n_labeled

        if verbose:
            print(
                f"Initializing {n_labeled} labeled and {n_free} unlabeled cluster centers..."
            )
            print("init_size:", init_size)

        if n_free > 0:
            # Initialize centers for remaining free clusters from free
            # objects (y == -1)
            centers[n_labeled:] = _init_centroids(X[y == -1],
                                                  n_free,
                                                  init=init,
                                                  init_size=init_size)

    else:
        centers = _init_centroids(X,
                                  n_clusters,
                                  init=init,
                                  init_size=init_size)

    # Internal sanity check: one center per column of the rejection matrix.
    assert centers.shape[0] == r_mat.shape[1]

    (
        labels,
        sample_distances,
        inertia,
        centers,
        n_iter_,
    ) = _constrained_kmeans_single(
        X,
        r_mat,
        centers_init=centers,
        max_iter=max_iter,
        verbose=verbose,
        tol=tol,
        working_memory=working_memory,
    )

    return (
        labels,
        sample_distances,
        inertia,
        centers,
        n_iter_,
    )
コード例 #5
0
 def calc_sampling_distribution(self):
     """Store the normalised sensitivity scores of the samples in ``self.p``.

     Initial centers are obtained from ``_init_centroids`` on ``self.X``; the
     scores returned by ``sensitivity.kmeans_sensitivity`` for those centers
     are divided by their sum so that ``self.p`` sums to one.
     """
     data = self.X
     seed_centers = _init_centroids(
         data,
         self.n_clusters,
         self.init,
         random_state=self.random_state,
         x_squared_norms=row_norms(data, squared=True),
     )

     # Fourth argument of kmeans_sensitivity: log(n_clusters), floored at 1.
     log_k_floor = max(np.log(self.n_clusters), 1)
     scores = sensitivity.kmeans_sensitivity(
         data, self.w, seed_centers, log_k_floor)

     self.p = scores / np.sum(scores)
コード例 #6
0
def _fuzzykmeans_single_lloyd(X,
                              m,
                              sample_weight,
                              n_clusters,
                              max_iter=300,
                              init='k-means++',
                              verbose=False,
                              x_squared_norms=None,
                              random_state=None,
                              tol=1e-4,
                              precompute_distances=True):
    """A single run of fuzzy k-means, assumes preparation completed prior.

    Parameters
    ----------
    X : array-like of floats, shape (n_samples, n_features)
        The observations to cluster.

    m : object
        Forwarded unchanged to ``e_step`` and ``m_step``.
        NOTE(review): presumably the fuzziness exponent -- confirm.

    sample_weight : array-like, shape (n_samples,)
        The weights for each observation in X. Normalised with
        ``_check_normalize_sample_weight`` before use.

    n_clusters : int
        The number of clusters to form as well as the number of
        centroids to generate.

    max_iter : int, optional, default 300
        Maximum number of iterations to run. Must be at least 1: the values
        returned by this function are only bound inside the iteration loop.

    init : {'k-means++', 'random', or ndarray, or a callable}, optional
        Method for initialization, forwarded to ``_init_centroids``.

    verbose : boolean, optional
        Verbosity mode.

    x_squared_norms : array
        Precomputed squared row norms of X, forwarded as-is to
        ``_init_centroids`` and ``_labels_inertia``.

    random_state : int, RandomState instance or None (default)
        Determines random number generation for the initial memberships and
        the centroid initialization. Use an int to make the randomness
        deterministic.

    tol : float, optional
        Convergence threshold: iteration stops once the squared norm of the
        difference between consecutive centers is at most ``tol``.

    precompute_distances : boolean, default: True
        Forwarded to ``_labels_inertia``.

    Returns
    -------
    fuzzy_labels
        First value returned by ``e_step`` for the returned centers.

    labels
        Second value returned by ``e_step`` for the returned centers.

    inertia : float
        Inertia reported by ``_labels_inertia``.

    centers
        Centers associated with the lowest inertia seen during the run.

    n_iter : int
        Number of iterations run.
    """
    # Unpacking also enforces that X is two-dimensional.
    n_samples, _ = X.shape
    random_state = check_random_state(random_state)

    # Random memberships, each row rescaled to sum to one.
    fuzzy_labels = random_state.rand(n_samples, n_clusters)
    fuzzy_labels /= fuzzy_labels.sum(axis=1)[:, np.newaxis]

    sample_weight = _check_normalize_sample_weight(sample_weight, X)

    best_fuzzy_labels = best_labels = best_inertia = best_centers = None

    # init
    centers = _init_centroids(X,
                              n_clusters,
                              init,
                              random_state=random_state,
                              x_squared_norms=x_squared_norms)

    # NOTE(review): the initial centers, not X, are passed as the first
    # argument of m_step here, unlike the call inside the loop -- confirm
    # this is intended.
    centers = m_step(centers, fuzzy_labels, m)

    if verbose:
        print("Initialization complete")

    # Scratch buffer handed to _labels_inertia on every call.
    distances = np.zeros(shape=(n_samples, ), dtype=X.dtype)

    # iterations
    for i in range(max_iter):
        centers_old = centers.copy()

        # E-step: only the inertia of the hard assignment is kept; the
        # labels themselves come from e_step.
        _, inertia = \
            _labels_inertia(X, sample_weight, x_squared_norms, centers,
                            precompute_distances=precompute_distances,
                            distances=distances)
        fuzzy_labels, labels = e_step(X, centers, m)

        # M-step: re-estimate the centers from the memberships.
        centers = m_step(X, fuzzy_labels, m)

        if verbose:
            print("Iteration %2d, inertia %.3f" % (i, inertia))

        if best_inertia is None or inertia < best_inertia:
            best_fuzzy_labels = fuzzy_labels.copy()
            best_labels = labels.copy()
            best_centers = centers.copy()
            best_inertia = inertia

        center_shift_total = squared_norm(centers_old - centers)
        if center_shift_total <= tol:
            if verbose:
                print("Converged at iteration %d: "
                      "center shift %e within tolerance %e" %
                      (i, center_shift_total, tol))
            break

    if center_shift_total > 0:
        # rerun E-step in case of non-convergence so that predicted labels
        # match cluster centers
        _, best_inertia = \
            _labels_inertia(X, sample_weight, x_squared_norms, best_centers,
                            precompute_distances=precompute_distances,
                            distances=distances)
        # Use best_centers (the centers being returned), not the centers of
        # the last iteration, so labels and centers stay consistent.
        best_fuzzy_labels, best_labels = e_step(X, best_centers, m)

    return best_fuzzy_labels, best_labels, best_inertia, best_centers, i + 1