Example 1
import numpy as np
import scipy.sparse as sp
from numpy.testing import assert_array_equal

from sklearn.utils.sparsefuncs_fast import assign_rows_csr


def test_densify_rows():
    X = sp.csr_matrix([[0, 3, 0], [2, 4, 0], [0, 0, 0], [9, 8, 7], [4, 0, 5]],
                      dtype=np.float64)
    rows = np.array([0, 2, 3], dtype=np.intp)
    out = np.ones((rows.shape[0], X.shape[1]), dtype=np.float64)

    assign_rows_csr(X, rows, np.arange(out.shape[0], dtype=np.intp)[::-1], out)
    assert_array_equal(out, X[rows].toarray()[::-1])
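For reference, assign_rows_csr(X, X_rows, out_rows, out) copies the selected
rows of the CSR matrix X into the given rows of the dense array out,
overwriting those rows entirely (explicit zeros included). Below is a minimal
pure-NumPy sketch of the same semantics; the helper name is illustrative and
not part of scikit-learn:

import numpy as np
import scipy.sparse as sp


def assign_rows_csr_dense(X, X_rows, out_rows, out):
    # dense equivalent of assign_rows_csr: out[out_rows] = X[X_rows]
    out[out_rows] = X[X_rows].toarray()
    return out


X = sp.csr_matrix([[0, 3, 0], [2, 4, 0], [0, 0, 0]], dtype=np.float64)
out = np.ones((2, 3), dtype=np.float64)
assign_rows_csr_dense(X, np.array([0, 2]), np.array([1, 0]), out)
# out is now [[0, 0, 0], [0, 3, 0]]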
Example 2
import numpy

from sklearn.utils.sparsefuncs_fast import assign_rows_csr


def maximization(X, labels, distances, centers, samples):
    X_indptr = X.indptr
    X_data = X.data
    X_indices = X.indices
    k = centers.shape[0]
    sample_weight = numpy.ones(samples, dtype=X.dtype)
    weight_cluster = numpy.zeros(k, dtype=float)
    centers[:] = 0  # centers are recomputed from scratch below
    for i in range(samples):
        c = labels[i]
        weight_cluster[c] += sample_weight[i]
    # find empty clusters once, after all weights have been accumulated
    empty_clusters = numpy.where(weight_cluster == 0)[0]
    n_empty_clusters = empty_clusters.shape[0]
    if n_empty_clusters > 0:
        far_points = distances.argsort()[::-1][:n_empty_clusters]

        assign_rows_csr(X, far_points.astype(numpy.intp),
                        empty_clusters.astype(numpy.intp), centers)
        for i in range(n_empty_clusters):
            weight_cluster[empty_clusters[i]] = 1

    for i in range(len(labels)):
        curr_label = labels[i]
        for index in range(X_indptr[i], X_indptr[i + 1]):
            j = X_indices[index]
            centers[curr_label, j] += X_data[index] * sample_weight[i]
    numpy.true_divide(centers,
                      weight_cluster[:, numpy.newaxis],
                      out=centers,
                      casting="unsafe")
    return centers
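A toy run of the maximization step above, assuming the same imports; with no
empty clusters, each center simply becomes the mean of its assigned rows:

import scipy.sparse as sp

X = sp.csr_matrix(numpy.array([[1., 0.], [3., 0.], [0., 2.]]))
labels = numpy.array([0, 0, 1], dtype=numpy.intp)
distances = numpy.array([1., 1., .5])   # only used to relocate empty clusters
centers = numpy.zeros((2, 2))           # overwritten in place
maximization(X, labels, distances, centers, samples=3)
# centers is now [[2., 0.], [0., 2.]]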
Example 3
import numpy

from sklearn.utils.sparsefuncs_fast import assign_rows_csr


def _centers_sparse(X, sample_weight, labels, n_clusters, distances):
    """
    M step of the K-means EM algorithm
    Computation of cluster centers / means.
    Parameters
    ----------
    X : scipy.sparse.csr_matrix, shape (n_samples, n_features)
    sample_weight : array-like, shape (n_samples,)
        The weights for each observation in X.
    labels : array of integers, shape (n_samples)
        Current label assignment
    n_clusters : int
        Number of desired clusters
    distances : array-like, shape (n_samples)
        Distance to closest cluster for each sample.
    Returns
    -------
    centers : array, shape (n_clusters, n_features)
        The resulting centers
    """
    n_samples = X.shape[0]
    n_features = X.shape[1]

    data = X.data
    indices = X.indices
    indptr = X.indptr

    dtype = X.dtype
    centers = numpy.zeros((n_clusters, n_features), dtype=dtype)
    weight_in_cluster = numpy.zeros((n_clusters, ), dtype=dtype)
    for i in range(n_samples):
        c = labels[i]
        weight_in_cluster[c] += sample_weight[i]
    empty_clusters = numpy.where(weight_in_cluster == 0)[0]
    n_empty_clusters = empty_clusters.shape[0]

    # maybe also relocate small clusters?

    if n_empty_clusters > 0:
        # find points to reassign empty clusters to
        far_from_centers = distances.argsort()[::-1][:n_empty_clusters]
        assign_rows_csr(X, far_from_centers, empty_clusters, centers)

        for i in range(n_empty_clusters):
            weight_in_cluster[empty_clusters[i]] = 1

    for i in range(labels.shape[0]):
        curr_label = labels[i]
        for ind in range(indptr[i], indptr[i + 1]):
            j = indices[ind]
            centers[curr_label, j] += data[ind] * sample_weight[i]

    centers /= weight_in_cluster[:, numpy.newaxis]

    return centers
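A small sanity check of the empty-cluster branch, assuming the imports above:
cluster 1 has no members, so it is relocated to the sample farthest from its
center before the means are computed:

import scipy.sparse as sp

X = sp.csr_matrix(numpy.array([[0., 0.], [1., 0.], [10., 10.]]))
sample_weight = numpy.ones(3)
labels = numpy.array([0, 0, 0], dtype=numpy.intp)   # nothing assigned to 1
distances = numpy.array([.5, .5, 14.])              # sample 2 is farthest
centers = _centers_sparse(X, sample_weight, labels, 2, distances)
# centers[0] is the mean of all three rows, centers[1] == [10., 10.]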
Example 4
def test_densify_rows():
    X = sp.csr_matrix([[0, 3, 0], [2, 4, 0], [0, 0, 0], [9, 8, 7], [4, 0, 5]],
                      dtype=np.float64)
    X_rows = np.array([0, 2, 3], dtype=np.intp)
    out = np.ones((6, X.shape[1]), dtype=np.float64)
    out_rows = np.array([1, 3, 4], dtype=np.intp)

    expect = np.ones_like(out)
    expect[out_rows] = X[X_rows, :].toarray()

    assign_rows_csr(X, X_rows, out_rows, out)
    assert_array_equal(out, expect)
Example 5
def MB_step(X,
            x_squared_norms,
            centers,
            counts,
            curr_iter,
            old_center_buffer,
            compute_squared_diff,
            distances,
            random_reassign=False,
            random_state=None,
            reassignment_ratio=.01,
            verbose=False,
            learn_rate=0.0):
    """Incremental update of the centers for the Minibatch K-Means algorithm.
    Parameters
    ----------
    X : array, shape (n_samples, n_features)
        The original data array.
    x_squared_norms : array, shape (n_samples,)
        Squared euclidean norm of each data point.
    centers : array, shape (k, n_features)
        The cluster centers. This array is MODIFIED IN PLACE
    counts : array, shape (k,)
         The vector in which we keep track of the numbers of elements in a
         cluster. This array is MODIFIED IN PLACE
    distances : array, dtype float64, shape (n_samples), optional
        If not None, should be a pre-allocated array that will be used to store
        the distances of each sample to its closest center.
        May not be None when random_reassign is True.
    random_state : integer or numpy.RandomState, optional
        The generator used to initialize the centers. If an integer is
        given, it fixes the seed. Defaults to the global numpy random
        number generator.
    random_reassign : boolean, optional
        If True, centers with very low counts are randomly reassigned
        to observations.
    reassignment_ratio : float, optional
        Control the fraction of the maximum number of counts for a
        center to be reassigned. A higher value means that low count
        centers are more likely to be reassigned, which means that the
        model will take longer to converge, but should converge in a
        better clustering.
    verbose : bool, optional, default False
        Controls the verbosity.
    compute_squared_diff : bool
        If set to False, the squared diff computation is skipped.
    old_center_buffer : array, shape (n_features,)
        Copy of the old centers, used for monitoring convergence.
    curr_iter : int
        Index of the current minibatch iteration.
    learn_rate : float, optional, default 0.0
        Learning rate for the center updates. If 0.0, the rate
        this_count / new_count is used for each center, which makes the
        update an exact running mean.
    Returns
    -------
    centers : array, shape (k, n_features)
        Updated centers.
    squared_diff : float
        Sum of squared distances between previous and updated cluster centers.
    inertia : float
        Sum of distances of samples to their closest cluster center.
    """
    # Perform label assignment to nearest centers
    nearest_center, inertia = k_means_._labels_inertia(X,
                                                       x_squared_norms,
                                                       centers,
                                                       distances=distances)

    if random_reassign and reassignment_ratio > 0:
        random_state = check_random_state(random_state)
        # Reassign clusters that have very low counts
        to_reassign = counts < reassignment_ratio * counts.max()
        # pick at most .5 * batch_size samples as new centers
        if to_reassign.sum() > .5 * X.shape[0]:
            indices_dont_reassign = np.argsort(counts)[int(.5 * X.shape[0]):]
            to_reassign[indices_dont_reassign] = False
        n_reassigns = to_reassign.sum()
        if n_reassigns:
            # Pick new clusters amongst observations with uniform probability
            new_centers = random_state.choice(X.shape[0],
                                              replace=False,
                                              size=n_reassigns)
            if verbose:
                print("[MiniBatchKMeans] Reassigning %i cluster centers." %
                      n_reassigns)

            if sp.issparse(X) and not sp.issparse(centers):
                assign_rows_csr(X, new_centers.astype(np.intp),
                                np.where(to_reassign)[0].astype(np.intp),
                                centers)
            else:
                centers[to_reassign] = X[new_centers]
        # reset counts of reassigned centers, but don't reset them too small
        # to avoid instant reassignment. This is a pretty dirty hack as it
        # also modifies the learning rates.
        counts[to_reassign] = np.min(counts[~to_reassign])

    squared_diff = 0.0
    # implementation for the sparse CSR representation completely written in
    # cython
    if sp.issparse(X):
        if compute_squared_diff:
            # keep a copy (not an alias) so the diff below is meaningful
            old_center_buffer = centers.copy()
        centers = _MB_step._mini_batch_update_csr(X, x_squared_norms, centers,
                                                  counts, nearest_center,
                                                  old_center_buffer,
                                                  compute_squared_diff,
                                                  curr_iter, learn_rate)

        if compute_squared_diff:
            diff = centers - old_center_buffer
            squared_diff = row_norms(diff, squared=True).sum()

        return centers, squared_diff, inertia

    # dense variant in mostly numpy (not as memory efficient though)
    k = centers.shape[0]
    for center_idx in range(k):
        # find points from minibatch that are assigned to this center
        center_mask = nearest_center == center_idx
        old_count = counts[center_idx]
        this_count = center_mask.sum()
        counts[center_idx] += this_count  # update counts

        if this_count > 0:
            new_count = counts[center_idx]
            if compute_squared_diff:
                old_center_buffer[:] = centers[center_idx]

            # batch sum of the minibatch points assigned to this center
            new_center = np.sum(X[center_mask], axis=0)

            # with the default rate this_count / new_count, the update below
            # is an exact running mean; use a local variable so one cluster's
            # rate does not leak into the next iteration
            rate = learn_rate
            if rate == 0.0:
                rate = this_count / float(new_count)

            centers[center_idx] = centers[center_idx] + rate * (
                new_center / this_count - centers[center_idx])

            # update the squared diff if necessary
            if compute_squared_diff:
                diff = centers[center_idx].ravel() - old_center_buffer.ravel()
                squared_diff += np.dot(diff, diff)

    return centers, squared_diff, inertia
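With the default rate this_count / new_count, the dense update above is
exactly the running mean over all points seen so far. A standalone numeric
check of that equivalence (illustrative values):

import numpy as np

old_count, this_count = 3, 2
new_count = old_count + this_count
center = np.array([1., 1.])         # running mean of the first 3 points
batch_sum = np.array([8., 2.])      # sum of the 2 new points

rate = this_count / float(new_count)
updated = center + rate * (batch_sum / this_count - center)

# identical to recomputing the mean over all 5 points
exact = (center * old_count + batch_sum) / new_count
assert np.allclose(updated, exact)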
def _mini_batch_spherical_step(X,
                               sample_weight,
                               x_squared_norms,
                               centers,
                               weight_sums,
                               old_center_buffer,
                               compute_squared_diff,
                               distances,
                               random_reassign=False,
                               random_state=None,
                               reassignment_ratio=.01,
                               verbose=False):
    """Incremental update of the centers for the Minibatch K-Means algorithm.
    Parameters
    ----------
    X : array, shape (n_samples, n_features)
        The original data array.
    sample_weight : array-like, shape (n_samples,)
        The weights for each observation in X.
    x_squared_norms : array, shape (n_samples,)
        Squared euclidean norm of each data point.
    centers : array, shape (k, n_features)
        The cluster centers. This array is MODIFIED IN PLACE
    weight_sums : array, shape (k,)
         The vector in which we keep track of the sums of sample weights in
         each cluster. This array is MODIFIED IN PLACE
    distances : array, dtype float, shape (n_samples), optional
        If not None, should be a pre-allocated array that will be used to store
        the distances of each sample to its closest center.
        May not be None when random_reassign is True.
    random_state : int, RandomState instance or None (default)
        Determines random number generation for centroid initialization and to
        pick new clusters amongst observations with uniform probability. Use
        an int to make the randomness deterministic.
        See :term:`Glossary <random_state>`.
    random_reassign : boolean, optional
        If True, centers with very low counts are randomly reassigned
        to observations.
    reassignment_ratio : float, optional
        Control the fraction of the maximum number of counts for a
        center to be reassigned. A higher value means that low count
        centers are more likely to be reassigned, which means that the
        model will take longer to converge, but should converge in a
        better clustering.
    verbose : bool, optional, default False
        Controls the verbosity.
    compute_squared_diff : bool
        If set to False, the squared diff computation is skipped.
    old_center_buffer : array, shape (n_features,)
        Copy of the old centers, used for monitoring convergence.
    Returns
    -------
    inertia : float
        Sum of squared distances of samples to their closest cluster center.
    squared_diff : float
        Sum of squared distances between previous and updated cluster centers.
    """
    # Perform label assignment to nearest centers
    nearest_center, inertia = _labels_inertia(X,
                                              sample_weight,
                                              x_squared_norms,
                                              centers,
                                              distances=distances)

    if random_reassign and reassignment_ratio > 0:
        random_state = check_random_state(random_state)
        # Reassign clusters that have very low weight
        to_reassign = weight_sums < reassignment_ratio * weight_sums.max()
        # pick at most .5 * batch_size samples as new centers
        if to_reassign.sum() > .5 * X.shape[0]:
            indices_dont_reassign = \
                np.argsort(weight_sums)[int(.5 * X.shape[0]):]
            to_reassign[indices_dont_reassign] = False
        n_reassigns = to_reassign.sum()
        if n_reassigns:
            # Pick new clusters amongst observations with uniform probability
            new_centers = random_state.choice(X.shape[0],
                                              replace=False,
                                              size=n_reassigns)
            if verbose:
                print("[MiniBatchKMeans] Reassigning %i cluster centers." %
                      n_reassigns)

            if sp.issparse(X) and not sp.issparse(centers):
                assign_rows_csr(
                    X, new_centers.astype(np.intp, copy=False),
                    np.where(to_reassign)[0].astype(np.intp, copy=False),
                    centers)
            else:
                centers[to_reassign] = X[new_centers]
        # reset counts of reassigned centers, but don't reset them too small
        # to avoid instant reassignment. This is a pretty dirty hack as it
        # also modifies the learning rates.
        weight_sums[to_reassign] = np.min(weight_sums[~to_reassign])

    # implementation for the sparse CSR representation completely written in
    # cython
    if sp.issparse(X):
        return inertia, _mini_batch_update_csr(X, sample_weight,
                                               x_squared_norms, centers,
                                               weight_sums, nearest_center,
                                               old_center_buffer,
                                               compute_squared_diff)

    # dense variant in mostly numpy (not as memory efficient though)
    k = centers.shape[0]
    squared_diff = 0.0
    for center_idx in range(k):
        # find points from minibatch that are assigned to this center
        center_mask = nearest_center == center_idx
        wsum = sample_weight[center_mask].sum()

        if wsum > 0:
            if compute_squared_diff:
                old_center_buffer[:] = centers[center_idx]

            # inplace remove previous count scaling
            centers[center_idx] *= weight_sums[center_idx]

            # inplace sum with new points members of this cluster
            centers[center_idx] += \
                np.sum(X[center_mask] *
                       sample_weight[center_mask, np.newaxis], axis=0)

            # update the running weight for this center (used by the scaling
            # above and by the reassignment logic)
            weight_sums[center_idx] += wsum

            # unit-normalize for spherical k-means
            centers[center_idx] = normalize(centers[center_idx, None])[:, 0]

            # update the squared diff if necessary
            if compute_squared_diff:
                diff = centers[center_idx].ravel() - old_center_buffer.ravel()
                squared_diff += np.dot(diff, diff)

    return inertia, squared_diff
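The only difference from the standard dense minibatch update is the final
normalize call: instead of dividing by the accumulated weight, the center is
rescaled to unit L2 norm, the usual M-step for spherical k-means (cosine
similarity). A standalone sketch of that step, with illustrative values:

import numpy as np
from sklearn.preprocessing import normalize

center = np.array([[.6, .8]])              # current unit-norm center
weight_sum = 4.                            # weight accumulated so far
batch = np.array([[3., 0.], [1., 0.]])     # new points in this cluster
weights = np.array([1., 1.])

accum = center * weight_sum + (batch * weights[:, None]).sum(axis=0)
new_center = normalize(accum)              # rescale to unit L2 norm
assert np.isclose(np.linalg.norm(new_center), 1.)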