Esempio n. 1
0
def find_centroids(centroids, data, labels, pairwise_distances, zero_point, C):
    """Recompute centroid positions in place and return the summed distance.

    Each column of ``centroids`` is overwritten with the per-label sum of
    the matching column of ``data`` and then divided by the number of
    points carrying that label. Labels with no points are divided by one,
    so their rows stay at the raw (zero) sum.

    Args:
        centroids: 2-D array updated in place, one row per centroid.
        data: 2-D array of points, one row per point.
        labels: Non-negative integer centroid index for each point.
        pairwise_distances: 2-D array; the minimum of each row is summed.
        zero_point: Not used by this variant.
        C: Number of centroids (minimum length of the per-label counts).

    Returns:
        Sum over rows of the smallest entry in ``pairwise_distances``.
    """
    # How many points carry each label.
    members_per_center = np.bincount(labels, minlength=C)

    # bincount only accepts 1-D weights, so accumulate the unnormalized
    # coordinate sums one dimension at a time.
    num_dims = data.shape[1]
    for dim in range(num_dims):
        coordinate_sums = np.bincount(labels,
                                      weights=data[:, dim],
                                      minlength=C)
        centroids[:, dim] = coordinate_sums

    # Smallest entry of every row, then the total of those minima.
    nearest = np.amin(pairwise_distances, axis=1)
    total_distance = np.sum(nearest)

    # Clamp empty clusters to a divisor of one so the division below never
    # hits zero and their coordinates stay defined.
    floor = np.ones((1, ), dtype=np.uint64)
    divisors = np.maximum(members_per_center, floor)
    centroids /= divisors[:, np.newaxis]
    return total_distance
Esempio n. 2
0
def find_centroids(data, labels, C, D):
    """Return the mean position of the points assigned to each centroid.

    The points are reordered by label, cut into one contiguous group per
    label, and each group is summed and divided by its size.

    Args:
        data: 2-D array of points, one row per point.
        labels: Non-negative integer centroid index for each row of ``data``.
        C: Number of centroids.
        D: Number of coordinates per point (second dimension of the result).

    Returns:
        A new ``(C, D)`` array of centroid positions. A centroid with no
        assigned points comes out as all zeros.
    """
    # Sort the points by their labels so each cluster is contiguous
    indices = np.argsort(labels)
    sorted_points = data[indices]
    # Compute counts and the end offset of each cluster's run of points
    counts = np.bincount(labels, minlength=C)
    indexes = np.cumsum(counts)
    # Split at the end offsets and sum each piece to get the unnormalized
    # centroids. The final offset equals the number of points, so the split
    # also yields one trailing empty piece that is never read.
    centroids = np.empty((C, D), dtype=data.dtype)
    ragged_arrays = np.split(sorted_points, indexes)
    # `range` instead of the Python 2-only `xrange`, which does not exist on
    # Python 3.
    for idx in range(C):
        centroids[idx, :] = np.sum(ragged_arrays[idx], axis=0)
    # To avoid introducing divide by zero errors:
    # if a centroid has no weight, divide by one instead so its coordinates
    # stay defined.
    counts = np.maximum(counts, 1)
    return centroids / counts[:, np.newaxis]
Esempio n. 3
0
def find_centroids(centroids, data, labels, pairwise_distances, zero_point, C,
                   D):
    """Recompute centroid positions in place using per-centroid masks.

    For every centroid index, the rows of ``data`` whose label matches are
    kept and all other rows are replaced by ``zero_point`` before summing.
    The sum is stored in ``centroids`` and later divided by the number of
    matching points (or by one when there are none).

    Args:
        centroids: 2-D array updated in place, one row per centroid.
        data: 2-D array of points, one row per point.
        labels: Non-negative integer centroid index for each point.
        pairwise_distances: 2-D array indexed as ``[point, centroid]``.
        zero_point: Fill value substituted for points outside a cluster.
        C: Number of centroids.
        D: Not used by this variant.

    Returns:
        Sum of ``pairwise_distances[i, labels[i]]`` over the points whose
        label is below ``C``.
    """
    # How many points carry each label.
    members_per_center = np.bincount(labels, minlength=C)

    total_distance = 0.0
    for center in range(C):
        # True where a point belongs to this centroid.
        belongs = labels == center

        # Broadcast the mask over the coordinates and sum the kept points.
        masked_points = np.where(belongs[..., np.newaxis], data, zero_point)
        centroids[center, :] = np.sum(masked_points, axis=0)

        # Only distances from this centroid to its own points contribute.
        own_distances = np.where(belongs, pairwise_distances[:, center], 0.0)
        total_distance += np.sum(own_distances)

    # Clamp empty clusters to a divisor of one so the division below never
    # hits zero and their coordinates stay defined.
    floor = np.ones((1, ), dtype=np.uint64)
    divisors = np.maximum(members_per_center, floor)
    centroids /= divisors[:, np.newaxis]
    return total_distance