Example #1
def test_labels_assignment_and_inertia():
    # pure NumPy implementation, used as an easily auditable reference
    # ("gold") implementation
    rng = np.random.RandomState(42)
    noisy_centers = centers + rng.normal(size=centers.shape)
    labels_gold = np.full(n_samples, -1, dtype=int)
    mindist = np.empty(n_samples)
    mindist.fill(np.inf)
    for center_id in range(n_clusters):
        dist = np.sum((X - noisy_centers[center_id])**2, axis=1)
        labels_gold[dist < mindist] = center_id
        mindist = np.minimum(dist, mindist)
    inertia_gold = mindist.sum()
    assert (mindist >= 0.0).all()
    assert (labels_gold != -1).all()

    sample_weight = None

    # perform label assignment using the dense array input
    x_squared_norms = (X**2).sum(axis=1)
    labels_array, inertia_array = _labels_inertia(X, sample_weight,
                                                  x_squared_norms,
                                                  noisy_centers)
    assert_array_almost_equal(inertia_array, inertia_gold)
    assert_array_equal(labels_array, labels_gold)

    # perform label assignment using the sparse CSR input
    x_squared_norms_from_csr = row_norms(X_csr, squared=True)
    labels_csr, inertia_csr = _labels_inertia(X_csr, sample_weight,
                                              x_squared_norms_from_csr,
                                              noisy_centers)
    assert_array_almost_equal(inertia_csr, inertia_gold)
    assert_array_equal(labels_csr, labels_gold)
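The snippets on this page rely on module-level fixtures (X, X_csr, centers, true_labels, n_samples, n_clusters, n_features) that are not shown. Below is a minimal sketch of how such fixtures are typically built with make_blobs; the values are assumptions for illustration, not the original test module's definitions.

import numpy as np
import scipy.sparse as sp
from sklearn.datasets import make_blobs

# hypothetical fixture values; the real test module defines its own
n_clusters, n_features = 3, 5
centers = np.random.RandomState(0).uniform(-10, 10, size=(n_clusters, n_features))
X, true_labels = make_blobs(n_samples=100, centers=centers,
                            cluster_std=1.0, random_state=42)
X_csr = sp.csr_matrix(X)
n_samples = X.shape[0]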
Example #2
def test_minibatch_reassign(data):
    # Check the reassignment part of the minibatch step with very high or very
    # low reassignment ratio.
    perfect_centers = np.empty((n_clusters, n_features))
    for i in range(n_clusters):
        perfect_centers[i] = X[true_labels == i].mean(axis=0)

    x_squared_norms = row_norms(data, squared=True)
    sample_weight = np.ones(n_samples)
    centers_new = np.empty_like(perfect_centers)

    # Give a perfect initialization, but a large reassignment_ratio; as a
    # result, many centers should be reassigned and the model should no
    # longer be good.
    score_before = - _labels_inertia(data, sample_weight, x_squared_norms,
                                     perfect_centers, 1)[1]

    _mini_batch_step(data, x_squared_norms, sample_weight, perfect_centers,
                     centers_new, np.zeros(n_clusters),
                     np.random.RandomState(0), random_reassign=True,
                     reassignment_ratio=1)

    score_after = - _labels_inertia(data, sample_weight, x_squared_norms,
                                    centers_new, 1)[1]

    assert score_before > score_after

    # Give a perfect initialization with a small reassignment_ratio;
    # no center should be reassigned.
    _mini_batch_step(data, x_squared_norms, sample_weight, perfect_centers,
                     centers_new, np.zeros(n_clusters),
                     np.random.RandomState(0), random_reassign=True,
                     reassignment_ratio=1e-15)

    assert_allclose(centers_new, perfect_centers)
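For reference, reassignment_ratio is also exposed on the public estimator. A minimal usage sketch (not part of the test above), assuming the fixtures sketched after Example #1:

from sklearn.cluster import MiniBatchKMeans

# a small ratio means centers are almost never randomly reassigned
mbk = MiniBatchKMeans(n_clusters=n_clusters, reassignment_ratio=0.01,
                      random_state=0, n_init=3).fit(X)
print(mbk.cluster_centers_.shape)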
Example #3
def test_minibatch_update_consistency():
    # Check that dense and sparse minibatch update give the same results
    rng = np.random.RandomState(42)

    centers_old = centers + rng.normal(size=centers.shape)
    centers_old_csr = centers_old.copy()

    centers_new = np.zeros_like(centers_old)
    centers_new_csr = np.zeros_like(centers_old_csr)

    weight_sums = np.zeros(centers_old.shape[0], dtype=X.dtype)
    weight_sums_csr = np.zeros(centers_old.shape[0], dtype=X.dtype)

    x_squared_norms = (X ** 2).sum(axis=1)
    x_squared_norms_csr = row_norms(X_csr, squared=True)

    sample_weight = np.ones(X.shape[0], dtype=X.dtype)

    # extract a small minibatch
    X_mb = X[:10]
    X_mb_csr = X_csr[:10]
    x_mb_squared_norms = x_squared_norms[:10]
    x_mb_squared_norms_csr = x_squared_norms_csr[:10]
    sample_weight_mb = sample_weight[:10]

    # step 1: compute the dense minibatch update
    old_inertia = _mini_batch_step(
        X_mb, x_mb_squared_norms, sample_weight_mb, centers_old, centers_new,
        weight_sums, np.random.RandomState(0), random_reassign=False)
    assert old_inertia > 0.0

    # compute the new inertia on the same batch to check that it decreased
    labels, new_inertia = _labels_inertia(
        X_mb, sample_weight_mb, x_mb_squared_norms, centers_new)
    assert new_inertia > 0.0
    assert new_inertia < old_inertia

    # step 2: compute the sparse minibatch update
    old_inertia_csr = _mini_batch_step(
        X_mb_csr, x_mb_squared_norms_csr, sample_weight_mb, centers_old_csr,
        centers_new_csr, weight_sums_csr, np.random.RandomState(0),
        random_reassign=False)
    assert old_inertia_csr > 0.0

    # compute the new inertia on the same batch to check that it decreased
    labels_csr, new_inertia_csr = _labels_inertia(
        X_mb_csr, sample_weight_mb, x_mb_squared_norms_csr, centers_new_csr)
    assert new_inertia_csr > 0.0
    assert new_inertia_csr < old_inertia_csr

    # step 3: check that sparse and dense updates lead to the same results
    assert_array_equal(labels, labels_csr)
    assert_allclose(centers_new, centers_new_csr)
    assert_allclose(old_inertia, old_inertia_csr)
    assert_allclose(new_inertia, new_inertia_csr)
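The same dense/sparse agreement can be sanity-checked through the public partial_fit API. A hedged sketch assuming the X / X_csr / centers fixtures sketched earlier; small numerical differences are possible, hence allclose:

import numpy as np
from sklearn.cluster import MiniBatchKMeans

mbk_dense = MiniBatchKMeans(n_clusters=n_clusters, init=centers,
                            n_init=1, random_state=0).partial_fit(X[:10])
mbk_csr = MiniBatchKMeans(n_clusters=n_clusters, init=centers,
                          n_init=1, random_state=0).partial_fit(X_csr[:10])
np.testing.assert_allclose(mbk_dense.cluster_centers_,
                           mbk_csr.cluster_centers_)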
Example #4
def predict(self, X, sample_weight=None):
    """Predict the closest cluster each sample in X belongs to.

    In the vector quantization literature, `cluster_centers_` is called
    the code book and each value returned by `predict` is the index of
    the closest code in the code book.

    Parameters
    ----------
    X : {array-like, sparse matrix}, shape = [n_samples, n_features]
       New data to predict.

    sample_weight : array-like, shape (n_samples,), optional
        The weights for each observation in X. If None, all observations
        are assigned equal weight (default: None)

    Returns
    -------
    labels : array, shape [n_samples,]
        Index of the cluster each sample belongs to.
    """
    check_is_fitted(self)

    X = self._check_test_data(X)

    daal_ready = sample_weight is None and hasattr(X, '__array__') # or sp.isspmatrix_csr(X)

    if daal_ready:
        return _daal4py_k_means_dense(X, self.n_clusters, 0, 0.0, self.cluster_centers_, 1, None)[1]
    else:
        x_squared_norms = row_norms(X, squared=True)
        return _labels_inertia(X, sample_weight, x_squared_norms,
                               self.cluster_centers_)[0]
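From the user's side the accelerated path above is invisible; here is a minimal sketch of the public predict API this method implements (assuming the X fixture sketched earlier):

from sklearn.cluster import KMeans

km = KMeans(n_clusters=n_clusters, random_state=0, n_init=10).fit(X)
labels = km.predict(X)   # index of the closest "code" in the code book
assert labels.shape == (X.shape[0],)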
Example #5
def _predict(self, X, sample_weight=None):
    """Predict the closest cluster each sample in X belongs to.

    In the vector quantization literature, `cluster_centers_` is called
    the code book and each value returned by `predict` is the index of
    the closest code in the code book.

    Parameters
    ----------
    X : {array-like, sparse matrix}, shape = [n_samples, n_features]
       New data to predict.

    sample_weight : array-like, shape (n_samples,), optional
        The weights for each observation in X. If None, all observations
        are assigned equal weight (default: None)

    Returns
    -------
    labels : array, shape [n_samples,]
        Index of the cluster each sample belongs to.
    """
    check_is_fitted(self)

    X = _daal4py_check_test_data(self, X)

    _patching_status = PatchingConditionsChain(
        "sklearn.cluster.KMeans.predict")
    _patching_status.and_conditions([
        (sample_weight is None, "Sample weights are not supported."),
        (hasattr(X, '__array__'), "X does not have '__array__' attribute.")
    ])
    _dal_ready = _patching_status.or_conditions([(sp.isspmatrix_csr(X),
                                                  "X is not sparse.")])

    _patching_status.write_log()
    if _dal_ready:
        return _daal4py_k_means_predict(X, self.n_clusters,
                                        self.cluster_centers_)[0]
    x_squared_norms = row_norms(X, squared=True)
    return _labels_inertia(X, sample_weight, x_squared_norms,
                           self.cluster_centers_)[0]
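The PatchingConditionsChain log messages above show up when the Intel extension is active. A hedged sketch of how patching is typically enabled, assuming the scikit-learn-intelex package is installed:

from sklearnex import patch_sklearn
patch_sklearn()                      # replaces sklearn estimators with patched ones

from sklearn.cluster import KMeans   # import after patching to get the daal4py path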
Example #6
def _predict(self, X, sample_weight=None):
    """Predict the closest cluster each sample in X belongs to.

    In the vector quantization literature, `cluster_centers_` is called
    the code book and each value returned by `predict` is the index of
    the closest code in the code book.

    Parameters
    ----------
    X : {array-like, sparse matrix}, shape = [n_samples, n_features]
       New data to predict.

    sample_weight : array-like, shape (n_samples,), optional
        The weights for each observation in X. If None, all observations
        are assigned equal weight (default: None)

    Returns
    -------
    labels : array, shape [n_samples,]
        Index of the cluster each sample belongs to.
    """
    check_is_fitted(self)

    X = _daal4py_check_test_data(self, X)

    if sample_weight is None and hasattr(X, '__array__') or \
            sp.isspmatrix_csr(X):
        logging.info("sklearn.cluster.KMeans."
                     "predict: " + get_patch_message("daal"))
        return _daal4py_k_means_predict(X, self.n_clusters,
                                        self.cluster_centers_)[0]
    logging.info("sklearn.cluster.KMeans."
                 "predict: " + get_patch_message("sklearn"))
    x_squared_norms = row_norms(X, squared=True)
    return _labels_inertia(X, sample_weight, x_squared_norms,
                           self.cluster_centers_)[0]
Example #7
def test_minibatch_update_consistency():
    # Check that dense and sparse minibatch update give the same results
    rng = np.random.RandomState(42)
    old_centers = centers + rng.normal(size=centers.shape)

    new_centers = old_centers.copy()
    new_centers_csr = old_centers.copy()

    weight_sums = np.zeros(new_centers.shape[0], dtype=np.double)
    weight_sums_csr = np.zeros(new_centers.shape[0], dtype=np.double)

    x_squared_norms = (X ** 2).sum(axis=1)
    x_squared_norms_csr = row_norms(X_csr, squared=True)

    buffer = np.zeros(centers.shape[1], dtype=np.double)
    buffer_csr = np.zeros(centers.shape[1], dtype=np.double)

    # extract a small minibatch
    X_mb = X[:10]
    X_mb_csr = X_csr[:10]
    x_mb_squared_norms = x_squared_norms[:10]
    x_mb_squared_norms_csr = x_squared_norms_csr[:10]

    sample_weight_mb = np.ones(X_mb.shape[0], dtype=np.double)

    # step 1: compute the dense minibatch update
    old_inertia, incremental_diff = _mini_batch_step(
        X_mb, sample_weight_mb, x_mb_squared_norms, new_centers, weight_sums,
        buffer, 1, None, random_reassign=False)
    assert old_inertia > 0.0

    # compute the new inertia on the same batch to check that it decreased
    labels, new_inertia = _labels_inertia(
        X_mb, sample_weight_mb, x_mb_squared_norms, new_centers)
    assert new_inertia > 0.0
    assert new_inertia < old_inertia

    # check that the incremental difference computation is matching the
    # final observed value
    effective_diff = np.sum((new_centers - old_centers) ** 2)
    assert_almost_equal(incremental_diff, effective_diff)

    # step 2: compute the sparse minibatch update
    old_inertia_csr, incremental_diff_csr = _mini_batch_step(
        X_mb_csr, sample_weight_mb, x_mb_squared_norms_csr, new_centers_csr,
        weight_sums_csr, buffer_csr, 1, None, random_reassign=False)
    assert old_inertia_csr > 0.0

    # compute the new inertia on the same batch to check that it decreased
    labels_csr, new_inertia_csr = _labels_inertia(
        X_mb_csr, sample_weight_mb, x_mb_squared_norms_csr, new_centers_csr)
    assert new_inertia_csr > 0.0
    assert new_inertia_csr < old_inertia_csr

    # check that the incremental difference computation is matching the
    # final observed value
    effective_diff = np.sum((new_centers_csr - old_centers) ** 2)
    assert_almost_equal(incremental_diff_csr, effective_diff)

    # step 3: check that sparse and dense updates lead to the same results
    assert_array_equal(labels, labels_csr)
    assert_array_almost_equal(new_centers, new_centers_csr)
    assert_almost_equal(incremental_diff, incremental_diff_csr)
    assert_almost_equal(old_inertia, old_inertia_csr)
    assert_almost_equal(new_inertia, new_inertia_csr)
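The effective_diff check above is simply the squared Frobenius norm of the center displacement; a tiny sketch of the identity it relies on:

import numpy as np
from sklearn.utils.extmath import squared_norm

delta = np.array([[1.0, 2.0], [0.0, -3.0]])
assert np.isclose(np.sum(delta ** 2), squared_norm(delta))  # both equal 14.0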
Example #8
def _fuzzykmeans_single_lloyd(X,
                              m,
                              sample_weight,
                              n_clusters,
                              max_iter=300,
                              init='k-means++',
                              verbose=False,
                              x_squared_norms=None,
                              random_state=None,
                              tol=1e-4,
                              precompute_distances=True):
    """A single run of k-means, assumes preparation completed prior.

    Parameters
    ----------
    X : array-like of floats, shape (n_samples, n_features)
        The observations to cluster.

    n_clusters : int
        The number of clusters to form as well as the number of
        centroids to generate.

    sample_weight : array-like, shape (n_samples,)
        The weights for each observation in X.

    max_iter : int, optional, default 300
        Maximum number of iterations of the k-means algorithm to run.

    init : {'k-means++', 'random', or ndarray, or a callable}, optional
        Method for initialization, default to 'k-means++':

        'k-means++' : selects initial cluster centers for k-means
        clustering in a smart way to speed up convergence. See section
        Notes in k_init for more details.

        'random': choose k observations (rows) at random from data for
        the initial centroids.

        If an ndarray is passed, it should be of shape (k, p) and gives
        the initial centers.

        If a callable is passed, it should take arguments X, k and
        a random state, and return an initialization.

    tol : float, optional
        The relative increment in the results before declaring convergence.

    verbose : boolean, optional
        Verbosity mode

    x_squared_norms : array
        Precomputed x_squared_norms.

    precompute_distances : boolean, default: True
        Precompute distances (faster but takes more memory).

    random_state : int, RandomState instance or None (default)
        Determines random number generation for centroid initialization. Use
        an int to make the randomness deterministic.
        See :term:`Glossary <random_state>`.

    Returns
    -------
    centroid : float ndarray with shape (k, n_features)
        Centroids found at the last iteration of k-means.

    label : integer ndarray with shape (n_samples,)
        label[i] is the code or index of the centroid the
        i'th observation is closest to.

    inertia : float
        The final value of the inertia criterion (sum of squared distances to
        the closest centroid for all observations in the training set).

    n_iter : int
        Number of iterations run.
    """
    n_samples, n_features = X.shape
    random_state = check_random_state(random_state)

    fuzzy_labels = random_state.rand(n_samples, n_clusters)
    fuzzy_labels /= fuzzy_labels.sum(axis=1)[:, np.newaxis]

    sample_weight = _check_normalize_sample_weight(sample_weight, X)

    best_fuzzy_labels, best_labels, best_inertia, best_centers = None, None, None, None
    # init
    centers = _init_centroids(X,
                              n_clusters,
                              init,
                              random_state=random_state,
                              x_squared_norms=x_squared_norms)

    centers = m_step(centers, fuzzy_labels, m)

    if verbose:
        print("Initialization complete")

    # Allocate memory to store the distance from each sample to its
    # closest center, used for reallocation in case of ties
    distances = np.zeros(shape=(X.shape[0], ), dtype=X.dtype)

    # iterations
    for i in range(max_iter):
        centers_old = centers.copy()
        # labels assignment is also called the E-step of EM
        labels, inertia = \
            _labels_inertia(X, sample_weight, x_squared_norms, centers,
                            precompute_distances=precompute_distances,
                            distances=distances)
        fuzzy_labels, labels = e_step(X, centers, m)

        # computation of the means is also called the M-step of EM
        # if sp.issparse(X):
        #     centers = _k_means._centers_sparse(X, sample_weight, labels,
        #                                        n_clusters, distances)
        # else:
        #     centers = _k_means._centers_dense(X, sample_weight, labels,
        #                                       n_clusters, distances)
        centers = m_step(X, fuzzy_labels, m)

        if verbose:
            print("Iteration %2d, inertia %.3f" % (i, inertia))

        if best_inertia is None or inertia < best_inertia:
            best_fuzzy_labels = fuzzy_labels.copy()
            best_labels = labels.copy()
            best_centers = centers.copy()
            best_inertia = inertia

        center_shift_total = squared_norm(centers_old - centers)
        if center_shift_total <= tol:
            if verbose:
                print("Converged at iteration %d: "
                      "center shift %e within tolerance %e" %
                      (i, center_shift_total, tol))
            break

    if center_shift_total > 0:
        # rerun E-step in case of non-convergence so that predicted labels
        # match cluster centers
        best_labels, best_inertia = \
            _labels_inertia(X, sample_weight, x_squared_norms, best_centers,
                            precompute_distances=precompute_distances,
                            distances=distances)
        best_fuzzy_labels, best_labels = e_step(X, best_centers, m)

    return best_fuzzy_labels, best_labels, best_inertia, best_centers, i + 1
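e_step and m_step are not shown in this snippet. Below is a hedged sketch of the standard fuzzy c-means updates they presumably implement, with signatures inferred from the call sites in the main loop and a fuzzifier m > 1 assumed; these helpers are assumptions, not the original code.

import numpy as np
from sklearn.metrics.pairwise import euclidean_distances

def e_step(X, centers, m):
    # fuzzy membership: u_ik proportional to d(x_i, c_k) ** (-2 / (m - 1))
    dist = euclidean_distances(X, centers) + 1e-12   # guard against zero distance
    inv = dist ** (-2.0 / (m - 1))
    fuzzy_labels = inv / inv.sum(axis=1, keepdims=True)
    return fuzzy_labels, fuzzy_labels.argmax(axis=1)

def m_step(X, fuzzy_labels, m):
    # centers are membership-weighted means, with memberships raised to the
    # fuzzifier power m
    w = fuzzy_labels ** m
    return (w.T @ X) / w.sum(axis=0)[:, np.newaxis]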