def __kmeans(self, km_points_orig, nClusters):
    """Cluster 1-D values (flower-pixel percentage per image) into
    nClusters groups and return {image_key: cluster_index} with cluster
    indices ordered by ascending centroid value."""
    assert isinstance(km_points_orig, dict)
    assert isinstance(nClusters, int) and nClusters > 1

    # Values ordered ascending, reshaped to the (n, 1) column KMeans expects.
    ordered_items = sorted(km_points_orig.items(), key=operator.itemgetter(1))
    km_points = np.array([value for _, value in ordered_items]).reshape((-1, 1))

    # Fit KMeans on the 1-D data.
    km = skKMeans(n_clusters=nClusters)
    km.fit(km_points)

    # Sort centroids so the cluster numbering is stable (0 = smallest).
    sorted_centroids = sorted(km.cluster_centers_)

    # Assign every image to the index of its nearest sorted centroid.
    final_img_clusters = {}
    for image_key, percentage in km_points_orig.items():
        distances = np.array([abs(percentage - c) for c in sorted_centroids])
        final_img_clusters[image_key] = int(distances.argmin())

    return final_img_clusters
Example #2
0
    def test_speed_vs_sk(self):
        """GPU KMeans must fit at least as fast as sklearn's on 100k blobs."""
        from sklearn.cluster import KMeans as skKMeans

        # Synthetic data: 100k samples drawn from 10 gaussian clusters.
        n_samples = 100000
        centers = 10
        X, true_labels = make_blobs(n_samples=n_samples,
                                    centers=centers,
                                    cluster_std=1.,
                                    random_state=42)

        kmeans_h2o = KMeans(n_gpus=1, n_clusters=centers, random_state=42)
        # Warmup - during first call CUDA kernels take ~2sec to load
        kmeans_h2o.fit(X)
        gpu_start = time.time()
        kmeans_h2o.fit(X)
        gpu_elapsed = time.time() - gpu_start

        # sklearn configured for a single random init for a fair comparison.
        kmeans_sk = skKMeans(n_init=1,
                             n_clusters=centers,
                             init='random',
                             algorithm='full',
                             n_jobs=-1)
        cpu_start = time.time()
        kmeans_sk.fit(X)
        cpu_elapsed = time.time() - cpu_start

        assert gpu_elapsed <= cpu_elapsed
Example #3
0
 def __init__(self, dataset, n_classes):
     """Derive decision-tree classes by k-means clustering the normalized
     dataset and labelling each cluster with the column that dominates
     its centroid."""
     fitted = skKMeans(n_clusters=n_classes,
                       random_state=0).fit(dataset.normalized)

     # For each centroid, the column with the largest weight names the
     # 'representative' kernel of that cluster.
     columns = dataset.normalized.columns
     kernel_map = [columns[np.argmax(centre)]
                   for centre in fitted.cluster_centers_]

     self.classes = kernel_map
     self.name = "{}{}".format(self.cls_name, n_classes)
Example #4
0
    def __init__(self, dataset, n_classes):
        """Cluster in a 25-component PCA space, then invert the projection
        so class labels can be read off in the original feature space."""
        data = dataset.normalized.reset_index(drop=True)

        # Project the data onto its top 25 principal components.
        pca = PCA(n_components=25)
        pca.fit(data)
        mu = data.mean(axis=0).to_numpy()

        reduced = pca.transform(data)
        clustering = skKMeans(n_clusters=n_classes,
                              random_state=0).fit(reduced)

        # Map centroids back into the original feature space (needs the
        # per-column mean removed by PCA).
        centroids = self._invert_pca(pca, mu, clustering.cluster_centers_)

        # Dominant column of each inverted centroid names the class.
        kernel_map = [data.columns[np.argmax(vec)] for vec in centroids]
        self.classes = kernel_map
        self.name = "{}{}".format(self.cls_name, n_classes)
Example #5
0
    def test_accuracy(self):
        """GPU KMeans clustering quality must be within 0.1 V-measure of
        sklearn's on 500k blob samples."""
        from sklearn.cluster import KMeans as skKMeans

        n_samples = 500000
        centers = 10
        X, true_labels = make_blobs(n_samples=n_samples, centers=centers,
                                    cluster_std=1., random_state=42)

        # Fit both implementations on identical data and seeds.
        kmeans_h2o = KMeans(n_gpus=1, n_clusters=centers, random_state=42)
        kmeans_h2o.fit(X)
        kmeans_sk = skKMeans(n_init=1, n_clusters=centers, random_state=42)
        kmeans_sk.fit(X)

        # Score each clustering against the ground-truth labels.
        score_gpu = v_measure_score(kmeans_h2o.labels_, true_labels)
        score_cpu = v_measure_score(kmeans_sk.labels_, true_labels)

        # We also want to be either better or at most 10% worse than SKLearn
        # Everything else is horrible and we probably should fix something
        assert score_gpu - score_cpu >= -0.1
Example #6
0
    def test_accuracy(self):
        """GPU KMeans quality must be within 0.1 V-measure of sklearn
        (random init) on 100k blob samples."""
        from sklearn.cluster import KMeans as skKMeans

        n_samples = 100000
        centers = 10
        X, true_labels = make_blobs(n_samples=n_samples, centers=centers,
                                    cluster_std=1., random_state=42)

        # Fit both implementations on the same data and seed; sklearn uses
        # a single random init to mirror the GPU configuration.
        kmeans_h2o = KMeans(n_gpus=1, n_clusters=centers, random_state=42)
        kmeans_h2o.fit(X)
        kmeans_sk = skKMeans(n_init=1, n_clusters=centers, init='random',
                             random_state=42)
        kmeans_sk.fit(X)

        # Score each clustering against the ground-truth labels.
        score_gpu = v_measure_score(kmeans_h2o.labels_, true_labels)
        score_cpu = v_measure_score(kmeans_sk.labels_, true_labels)

        # We also want to be either better or at most 10% worse than SKLearn
        # Everything else is horrible and we probably should fix something
        assert score_gpu - score_cpu >= -0.1
Example #7
0
    def test_speed_vs_sk(self):
        """Print (not assert) GPU vs sklearn KMeans fit times on 100k blobs."""
        from sklearn.cluster import KMeans as skKMeans

        n_samples = 100000
        centers = 10
        X, true_labels = make_blobs(n_samples=n_samples, centers=centers,
                                    cluster_std=1., random_state=42)

        # Time the GPU fit; NOTE(review): no warmup here, unlike the other
        # speed test, so CUDA kernel load time is included in the measurement.
        gpu_model = KMeans(n_gpus=1, n_clusters=centers)
        gpu_start = time.time()
        gpu_model.fit(X)
        gpu_elapsed = time.time() - gpu_start

        # Time sklearn with a single random init for comparison.
        cpu_model = skKMeans(n_init=1, n_clusters=centers, init='random')
        cpu_start = time.time()
        cpu_model.fit(X)
        cpu_elapsed = time.time() - cpu_start

        # Informational output only - this test makes no speed assertion.
        print(gpu_elapsed)
        print(cpu_elapsed)
    centers, labels_my_cluster = mdl.fit(data, k=3, max_iter=1000)

    plt.tight_layout()

    ## comparison of k-means cluster and its performance
    ## my k-mean cluster
    ax = fig.add_subplot(2, 3, 4)
    plot_data_kmeans(data,
                     ax=ax,
                     labels=labels_my_cluster,
                     centers=centers,
                     fnames=fnames)
    ax.set_title('My K-mean clustering')

    ## sklearn k-mean cluster
    labels_sk = skKMeans(n_clusters=3).fit(data).labels_
    centers_sk = get_centers(data, labels_sk)
    ax = fig.add_subplot(2, 3, 5)
    plot_data_kmeans(data,
                     ax=ax,
                     labels=labels_sk,
                     centers=centers_sk,
                     fnames=fnames)
    ax.set_title('sklearn\'s K-mean clustering')

    ## truth
    ax = fig.add_subplot(2, 3, 6)
    labels_true = data_all.target
    centers_true = get_centers(data, labels_true)
    plot_data_kmeans(data,
                     ax=ax,
Example #9
0
def k_means():
    """Build and return an unfitted sklearn KMeans model using the
    module-level ``clusters`` count."""
    print("[INFO] - KMeans - KMeans Classifier")
    return skKMeans(n_clusters=clusters)