Example #1
def _k_means_mod(seeds, subset, num_clusters):
    """The KMeansMod() step of the algorithm"""

    clustering = _k_means(seeds, subset, num_clusters)
    centroids = clustering.cluster_centers_

    # Because labels_ returned by kmeans are arbitrarily numbered,
    # we work with the returned centroids
    distances = distance_table(subset, centroids)
    labels = distances.argmin(axis=1)

    sought = set(range(num_clusters))
    present = set(labels)
    missing = sought - present

    missingcount = len(missing)

    if missingcount > 0:
        # print("Missing:", missing)

        furthest = _find_furthest(distances, missingcount)
        # print("Furthest-nearest:", furthest)

        for i, clusterid in enumerate(missing):
            # print("Replacing", seeds[clusterid], "with", subset[furthest[i]])
            seeds[clusterid] = subset[furthest[i]]

        clustering = _k_means(seeds, subset, num_clusters)
        centroids = clustering.cluster_centers_

    return centroids
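The _find_furthest() helper called above isn't shown in this example. A minimal sketch, assuming it returns the indices of the count points whose nearest-centroid distance is largest, i.e. the "furthest-nearest" points used to re-seed the empty clusters:

import numpy as np

def _find_furthest(distances, count):
    # Distance from each point to its nearest centroid, then the indices
    # of the `count` points furthest from any centroid
    nearests = distances.min(axis=1)
    return np.argsort(nearests)[::-1][:count]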
Example #2
    def find_centers(self):
        """Main Initialisation interface method"""

        # i-iv) The point furthest from the centre, plus the two main axes
        first, axes = self._initialise()

        # v) Incrementally find points most remote from latest seed
        candidates = self._generate_candidates(first, axes)

        # print("Candidates:\n", candidates)
        # print("Axes:", axes)

        # Check for the eternal problem of duplicates
        deduped = np.unique(candidates, axis=0)
        # print("Deduped:\n", deduped)
        if len(deduped) < self._num_clusters:
            raise InitialisationException("Duplicate candidates found")

        # vi) Turn the candidates into means of initial clusters
        distances = kmeans.distance_table(self._data, candidates, axes)
        mins = distances.argmin(axis=1)

        means = [None] * self._num_clusters

        for k in range(self._num_clusters):
            cluster = self._data[mins == k, :]
            # print("Cluster contains:", len(cluster))
            means[k] = np.mean(cluster, axis=0)

        return np.array(means)
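A hedged usage sketch (the variable names are illustrative, not from the source): seeds produced by an initialiser such as this can be handed straight to scikit-learn's KMeans through its init parameter.

from sklearn.cluster import KMeans

# `seeds` is the array returned by find_centers(); n_init=1 because the
# starting centroids are fixed rather than randomly restarted
km = KMeans(n_clusters=len(seeds), init=seeds, n_init=1).fit(data)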
Example #3
    def find_centers(self):

        centroids = []

        to_find = self._num_clusters
        data = self._data

        while to_find > 1:

            first = self._find_first_centroid(data)
            centroids.append(first)

            temp_centroids = np.array([first])

            while len(temp_centroids) < to_find:
                furthest = self._find_furthest(temp_centroids, data)
                temp_centroids = np.vstack((temp_centroids, furthest))

            # Remove the points assigned to the centroid just accepted
            # (index 0 of temp_centroids)
            clustering = np.argmin(distance_table(temp_centroids, data),
                                   axis=0)
            mask = np.where(clustering == 0)[0]
            data = np.delete(data, mask, axis=0)

            to_find -= 1

        # Finally just get the mean of the remaining points
        final = np.mean(data, axis=0)

        centroids.append(final)

        return np.array(centroids)
Example #4
    def test_with_1_empty(self):
        """Seeds and data known to leave one empty cluster after k_means(),
        and thus trigger k_means_mod() to reassign a centroid"""

        seeds = np.array([
            [5.4, 3.0, 4.5, 1.5],
            [6.7, 3.0, 5.0, 1.7],
            [5.1, 3.8, 1.5, 0.3],  # Doesn't get any data points assigned
        ])

        data = np.array([
            # Assigned to 0 but is furthest, so becomes the new 2
            [6.4, 2.9, 4.3, 1.3],
            [6.3, 3.4, 5.6, 2.4],
            [6.8, 3.0, 5.5, 2.1],
            [5.0, 2.0, 3.5, 1.0],
            [5.8, 2.7, 5.1, 1.9],
        ])

        expected_labels = [2, 1, 1, 0, 0]

        expected_centroids = [
            [5.4, 2.35, 4.3, 1.45],
            [6.55, 3.2, 5.55, 2.25],
            [6.4, 2.9, 4.3, 1.3],     # The new 2
        ]

        centroids = bfinit._k_means_mod(seeds, data, len(seeds))
        labels = kmeans.distance_table(data, centroids).argmin(1)

        np.testing.assert_array_equal(labels, expected_labels)
        np.testing.assert_array_equal(centroids, expected_centroids)
Example #5
    def _calc_density(self, point, latestdata):
        """Sum of distances to its nearest neighbours"""

        neighbours = len(latestdata) // self._num_clusters + 1
        dists = distance_table(np.array([point]), latestdata)[0]
        idx = np.argpartition(dists, neighbours)
        subdists = dists[idx[:neighbours]]
        return np.sum(subdists)
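For reference, np.argpartition(dists, k) guarantees that the k smallest values sit before position k (in no particular order), so the idx[:neighbours] slice above really does pick the nearest neighbours:

import numpy as np

dists = np.array([4.0, 1.0, 3.0, 0.5, 2.0])
idx = np.argpartition(dists, 3)
print(dists[idx[:3]])  # the three smallest distances, in arbitrary order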
Example #6
    def test_distance_table(self):
        """Calculate matrix of distances between two sets of data points"""

        data = np.array([[1, 1], [2, 3], [4, 4]])
        centroids = np.array([[2, 2], [3, 3]])
        dtable = mykm.distance_table(data, centroids)

        self.assertEqual(dtable.shape, (3, 2))

        expected = np.array([[2, 8], [1, 1], [8, 2]])
        self.assertTrue(np.array_equal(dtable, expected))
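The expected values imply that distance_table returns squared Euclidean distances: (1, 1) to (2, 2) gives 2 rather than the unsquared 1.41... A minimal sketch consistent with this test, assuming that is the intended behaviour:

import numpy as np

def distance_table(data, centroids):
    # Squared Euclidean distance from every data point (rows)
    # to every centroid (columns)
    diffs = data[:, np.newaxis, :] - centroids[np.newaxis, :, :]
    return np.sum(diffs ** 2, axis=2)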
Example #7
    def find_centers(self):
        """Main method"""

        # 1-3) The most densely surrounded point is the first initial centroid
        centre_h = self._find_hdp()

        # 4) Add X_h to C as the first centroid
        centroids = np.array([centre_h])

        # Find the remaining required centroids
        while len(centroids) < self._num_clusters:

            # 5) For each point xi, set D(xi)...
            distances = distance_table(self._data, centroids)
            mins_d = np.min(distances, axis=1)

            # 6) Find y as ...
            # Though why it's supposedly recalculated on each loop is puzzling
            dist_h = distance_table(np.array([centre_h]), self._data)[0]
            dist_h = dist_h[dist_h != 0]  # Anderson skips the 0 one
            partition = np.partition(dist_h, self._how_many)[:self._how_many]
            my_y = sum(partition)

            # 7-8) Find the unique integer i so that...
            i = 0
            accum_dist = 0

            while accum_dist < my_y:
                accum_dist += mins_d[i]
                i += 1

            # 9) Add X_i to C
            # But surely the i found here isn't a meaningful index to X?
            # It just looks like we're cycling through the data in a way
            # that's highly dependent on its arbitrary order
            centroids = np.vstack((centroids, self._data[i]))

        return centroids
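As the comment at step 6 already observes, nothing in that calculation changes between iterations (centre_h, self._data and self._how_many are all fixed), so my_y could be computed once before the while loop. A sketch of that hoisting:

        # Step 6 hoisted out of the loop: my_y depends only on fixed values
        dist_h = distance_table(np.array([centre_h]), self._data)[0]
        dist_h = dist_h[dist_h != 0]  # skip the zero self-distance
        my_y = np.sum(np.partition(dist_h, self._how_many)[:self._how_many])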
Example #8
    def find_centers(self):
        """Main method"""

        # L2/Euclidean norm, as suggested by the R kkz() documentation
        norms = np.linalg.norm(self._data, axis=1)

        first = self._data[np.argmax(norms)]
        codebook = np.array([first])

        while codebook.shape[0] < self._num_clusters:
            distances = distance_table(self._data, codebook)
            mins = np.min(distances, axis=1)
            amax = np.argmax(mins, axis=0)
            nxt = self._data[amax]
            codebook = np.append(codebook, [nxt], axis=0)

        return codebook
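The selection inside the loop is the maximin ("furthest-first") rule: take the point whose distance to its nearest existing code vector is largest. For reference, the same choice in a single step:

            # Same maximin selection, collapsed into one step
            nxt = self._data[
                distance_table(self._data, codebook).min(axis=1).argmax()]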
Example #9
    def find_centers(self):

        # Initial centroid
        randindex = np.random.choice(self._num_samples, replace=False)
        centroids = np.array([self._data[randindex]])

        # Remaining required centroids
        while len(centroids) < self._num_clusters:

            distances = kmeans.distance_table(self._data, centroids)
            d_nearest = distances.min(axis=1)
            probabilities = d_nearest**2 / np.sum(d_nearest**2)

            randindex = np.random.choice(self._num_samples,
                                         replace=False,
                                         p=probabilities)
            centroids = np.append(centroids, [self._data[randindex]], axis=0)

        return centroids
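A toy illustration of the weighting used above, assuming distance_table returns plain (unsquared) Euclidean distances: points far from every already-chosen centroid get a proportionally higher chance of being picked as the next one.

import numpy as np

nearest = np.array([0.5, 2.0, 4.0])   # distance to the nearest chosen centroid
probs = nearest**2 / np.sum(nearest**2)
print(probs)                          # [0.01234568 0.19753086 0.79012346]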
Example #10
    def _find_hdp(self):
        """The highest density point"""

        distances = distance_table(self._data, self._data)
        sum_v = np.sum(distances, axis=1)  # symmetric table, either axis works
        return self._data[np.argmin(sum_v)]
Example #11
    def _find_furthest(self, temp_centroids, latestdata):
        """The furthest-nearest point (exact opposite of Yuan)"""

        distances = distance_table(latestdata, temp_centroids)
        nearests = np.min(distances, axis=1)
        return latestdata[np.argmax(nearests)]
Example #12
    def _objective_function(data, centroids):
        """Sum of intra-cluster distances"""

        distances = distance_table(data, centroids)

        return np.sum(distances.min(1))
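A hedged usage sketch (data, seeds_a and seeds_b are illustrative names, not from the source): the objective lets two candidate seedings be compared, lower being better.

# Keep whichever seeding gives the smaller sum of nearest-centroid distances
score_a = _objective_function(data, seeds_a)
score_b = _objective_function(data, seeds_b)
best_seeds = seeds_a if score_a <= score_b else seeds_b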