Example #1
0
    def test_speed_vs_sk(self, order):
        """Time a warmed-up GPU KMeans fit and optionally compare it to the CPU.

        The GPU fit is always executed and timed. The comparison against the
        CPU implementation (and the assertion) only runs when the
        ``CHECKPERFORMANCE`` environment variable is set.

        :param order: memory layout passed to ``np.asanyarray`` for the
            input matrix.
        """
        import numpy as np
        from h2o4gpu.cluster import KMeansSklearn as skKMeans
        n_samples = 100000
        centers = 10
        # The generating labels are not needed for a pure timing test.
        X, _ = make_blobs(n_samples=n_samples,
                          centers=centers,
                          cluster_std=1.,
                          random_state=42)

        # Apply the requested layout; without this the `order` parameter
        # was silently ignored and every variant timed the same array.
        X = np.asanyarray(X, order=order)

        kmeans_h2o = KMeans(n_gpus=1, n_clusters=centers, random_state=42)
        # Warmup - during first call CUDA kernels take ~2sec to load
        kmeans_h2o.fit(X)
        # perf_counter is monotonic, so the interval cannot be distorted by
        # system clock adjustments the way time.time() can.
        start_h2o = time.perf_counter()
        kmeans_h2o.fit(X)
        elapsed_h2o = time.perf_counter() - start_h2o

        if os.getenv("CHECKPERFORMANCE") is not None:
            kmeans_sk = skKMeans(n_init=1,
                                 n_clusters=centers,
                                 algorithm='full',
                                 n_jobs=-1)
            start_sk = time.perf_counter()
            kmeans_sk.fit(X)
            elapsed_sk = time.perf_counter() - start_sk
            assert elapsed_h2o <= elapsed_sk, \
                'GPU fit took {0}s, CPU fit took {1}s, order={2}'.format(
                    elapsed_h2o, elapsed_sk, order)
Example #2
0
    def test_speed_vs_sk(self):
        """Assert that a warmed-up GPU KMeans fit is not slower than sklearn.

        Both estimators are fitted on the same 100000-sample, 10-blob
        synthetic dataset; only the second GPU fit is timed.
        """
        from sklearn.cluster import KMeans as skKMeans
        n_samples = 100000
        centers = 10
        # The generating labels are not needed for a pure timing test.
        X, _ = make_blobs(n_samples=n_samples,
                          centers=centers,
                          cluster_std=1.,
                          random_state=42)

        kmeans_h2o = KMeans(n_gpus=1, n_clusters=centers, random_state=42)
        # Warmup - during first call CUDA kernels take ~2sec to load
        kmeans_h2o.fit(X)
        # perf_counter is monotonic, so the interval cannot be distorted by
        # system clock adjustments the way time.time() can.
        start_h2o = time.perf_counter()
        kmeans_h2o.fit(X)
        elapsed_h2o = time.perf_counter() - start_h2o

        kmeans_sk = skKMeans(n_init=1,
                             n_clusters=centers,
                             init='random',
                             algorithm='full',
                             n_jobs=-1)
        start_sk = time.perf_counter()
        kmeans_sk.fit(X)
        elapsed_sk = time.perf_counter() - start_sk

        assert elapsed_h2o <= elapsed_sk, \
            'GPU fit took {0}s, CPU fit took {1}s'.format(
                elapsed_h2o, elapsed_sk)
Example #3
0
    def _test_accuracy(self,
                       order,
                       n_samples=500000,
                       centers=10,
                       n_features=2):
        """Check that GPU and CPU KMeans reach similar v-measure scores.

        Both estimators are fitted on the same synthetic blobs and scored
        against the generating labels with ``v_measure_score``.

        :param order: memory layout passed to ``np.asanyarray`` for the data.
        :param n_samples: number of points to generate.
        :param centers: number of blobs, also used as ``n_clusters``.
        :param n_features: dimensionality of the generated points.
        """
        from h2o4gpu.cluster import KMeansSklearn as skKMeans

        blobs, expected_labels = make_blobs(n_samples=n_samples,
                                            centers=centers,
                                            n_features=n_features,
                                            cluster_std=2.,
                                            random_state=42)
        data = np.asanyarray(blobs, order=order)

        gpu_model = KMeans(n_gpus=1,
                           n_clusters=centers,
                           random_state=42,
                           verbose=1000)
        gpu_model.fit(data)

        cpu_model = skKMeans(n_init=1, n_clusters=centers, random_state=42)
        cpu_model.fit(data)

        score_gpu = v_measure_score(gpu_model.labels_, expected_labels)
        score_cpu = v_measure_score(cpu_model.labels_, expected_labels)

        # The two scores must agree within 10% relative plus 0.05 absolute
        # tolerance.
        # TODO: it's failing with lower rtol, find out why it's so inaccurate
        scores_agree = np.allclose(score_gpu, score_cpu, rtol=0.1, atol=0.05)
        assert scores_agree, \
            'Accuracy error {0} {1} n_samples={2}, centers={3}, n_features={4}, order={5}'.format(
                score_gpu, score_cpu, n_samples, centers, n_features, order)
Example #4
0
    def test_fit_i32_vs_f32(self):
        """Check that float64, float32 and int32 inputs yield the same labels.

        The same six integer-valued 2-D points are fitted with one estimator
        in each dtype, and the predicted labels are compared.
        """
        X_f64 = np.array([[1., 2.], [1., 4.], [1., 0.], [4., 2.], [4., 4.], [4., 0.]])
        X_f32 = X_f64.astype(np.float32)
        X_i32 = X_f64.astype(np.int32)
        kmeans = KMeans(n_gpus=1, n_clusters=2, random_state=123)

        model_f64_labels = kmeans.fit(X_f64).predict(X_f64)
        model_f32_labels = kmeans.fit(X_f32).predict(X_f32)
        model_i32_labels = kmeans.fit(X_i32).predict(X_i32)

        # np.array_equal compares shape and content in one call and returns
        # a plain bool, so a shape mismatch fails the assertion cleanly
        # instead of breaking the builtin all() iteration.
        assert np.array_equal(model_f64_labels, model_f32_labels), \
            'float64 labels {0} differ from float32 labels {1}'.format(
                model_f64_labels, model_f32_labels)
        assert np.array_equal(model_f32_labels, model_i32_labels), \
            'float32 labels {0} differ from int32 labels {1}'.format(
                model_f32_labels, model_i32_labels)
Example #5
0
    def test_fit_iris(self):
        """Check KMeans on iris: center shape, determinism, refit, all GPUs."""
        X = load_iris().data
        clusters = 4
        model = KMeans(n_gpus=1, n_clusters=clusters, random_state=123).fit(X)

        # Expect one row per cluster and one column per feature (the
        # sklearn-style layout). The tuple was previously written transposed,
        # which went unnoticed only because iris has 4 features and
        # clusters == 4.
        assert model.cluster_centers_.shape == (clusters, X.shape[1])

        model_rerun = KMeans(n_gpus=1, n_clusters=clusters,
                             random_state=123).fit(X)

        # Same random_state should yield same results
        assert np.allclose(model.cluster_centers_,
                           model_rerun.cluster_centers_)

        # Snapshot the centers before refitting: if fit() returns the same
        # estimator object, comparing model_rerun with model_rerun2 directly
        # would compare an array with itself and could never fail.
        centers_before_refit = np.array(model_rerun.cluster_centers_,
                                        copy=True)
        model_rerun2 = model_rerun.fit(X)

        # Multiple invocations of fit with the same random_state
        # also should produce the same result
        assert np.allclose(centers_before_refit,
                           model_rerun2.cluster_centers_)

        model_all = KMeans(n_clusters=clusters, random_state=123).fit(X)

        # Multi GPU should yield same result as single GPU
        assert np.allclose(model.cluster_centers_, model_all.cluster_centers_)
Example #6
0
    def test_fit_iris(self):
        """Check KMeans on iris: center shape, determinism, refit, all GPUs."""
        X = load_iris().data
        clusters = 4
        model = KMeans(n_gpus=1, n_clusters=clusters, random_state=123).fit(X)

        # Expect one row per cluster and one column per feature (the
        # sklearn-style layout). The tuple was previously written transposed,
        # which went unnoticed only because iris has 4 features and
        # clusters == 4.
        assert model.cluster_centers_.shape == (clusters, X.shape[1])

        model_rerun = KMeans(n_gpus=1, n_clusters=clusters, random_state=123).fit(X)

        # Same random_state should yield same results
        assert np.allclose(
            model.cluster_centers_, model_rerun.cluster_centers_
        )

        # Snapshot the centers before refitting: if fit() returns the same
        # estimator object, comparing model_rerun with model_rerun2 directly
        # would compare an array with itself and could never fail.
        centers_before_refit = np.array(model_rerun.cluster_centers_, copy=True)
        model_rerun2 = model_rerun.fit(X)

        # Multiple invocations of fit with the same random_state
        # also should produce the same result
        assert np.allclose(
            centers_before_refit, model_rerun2.cluster_centers_
        )

        model_all = KMeans(n_clusters=clusters, random_state=123).fit(X)

        # Multi GPU should yield same result as single GPU
        assert np.allclose(
            model.cluster_centers_, model_all.cluster_centers_
        )
Example #7
0
    def test_fit_iris_precision(self):
        """Check that float64 and float32 iris inputs yield the same labels."""
        X_f64 = load_iris().data
        X_f32 = X_f64.astype(np.float32)
        kmeans = KMeans(n_gpus=1, n_clusters=4, random_state=12345)
        model_f64_labels = kmeans.fit(X_f64).predict(X_f64)
        model_f32_labels = kmeans.fit(X_f32).predict(X_f32)

        # np.array_equal compares shape and content in one call and returns
        # a plain bool, so a shape mismatch fails the assertion cleanly
        # instead of breaking the builtin all() iteration.
        assert np.array_equal(model_f64_labels, model_f32_labels), \
            'float64 labels {0} differ from float32 labels {1}'.format(
                model_f64_labels, model_f32_labels)
Example #8
0
    def test_fit_iris_precision(self):
        """Check that float64 and float32 iris inputs yield the same labels."""
        X_f64 = load_iris().data
        X_f32 = X_f64.astype(np.float32)
        kmeans = KMeans(n_gpus=1, n_clusters=4, random_state=12345)
        model_f64_labels = kmeans.fit(X_f64).predict(X_f64)
        model_f32_labels = kmeans.fit(X_f32).predict(X_f32)

        # np.array_equal compares shape and content in one call and returns
        # a plain bool, so a shape mismatch fails the assertion cleanly
        # instead of breaking the builtin all() iteration.
        assert np.array_equal(model_f64_labels, model_f32_labels), \
            'float64 labels {0} differ from float32 labels {1}'.format(
                model_f64_labels, model_f32_labels)
Example #9
0
    def test_accuracy(self):
        """Compare GPU KMeans clustering quality against sklearn's KMeans.

        Both estimators are fitted on the same 500000-sample, 10-blob
        synthetic dataset and scored with ``v_measure_score`` against the
        generating labels.
        """
        from sklearn.cluster import KMeans as skKMeans

        n_blobs = 10
        points, expected_labels = make_blobs(n_samples=500000,
                                             centers=n_blobs,
                                             cluster_std=1.,
                                             random_state=42)

        gpu_model = KMeans(n_gpus=1, n_clusters=n_blobs, random_state=42)
        gpu_model.fit(points)

        cpu_model = skKMeans(n_init=1, n_clusters=n_blobs, random_state=42)
        cpu_model.fit(points)

        score_gpu = v_measure_score(gpu_model.labels_, expected_labels)
        score_cpu = v_measure_score(cpu_model.labels_, expected_labels)

        # The GPU score may beat the sklearn score, or trail it by at most
        # 0.1; a larger shortfall means something needs fixing.
        score_gap = score_gpu - score_cpu
        assert score_gap >= -0.1
Example #10
0
    def test_accuracy(self):
        """Compare GPU KMeans clustering quality against sklearn's KMeans.

        Both estimators are fitted on the same 100000-sample, 10-blob
        synthetic dataset and scored with ``v_measure_score`` against the
        generating labels. The sklearn estimator uses ``init='random'``.
        """
        from sklearn.cluster import KMeans as skKMeans

        n_blobs = 10
        points, expected_labels = make_blobs(n_samples=100000,
                                             centers=n_blobs,
                                             cluster_std=1.,
                                             random_state=42)

        gpu_model = KMeans(n_gpus=1, n_clusters=n_blobs, random_state=42)
        gpu_model.fit(points)

        cpu_model = skKMeans(n_init=1,
                             n_clusters=n_blobs,
                             init='random',
                             random_state=42)
        cpu_model.fit(points)

        score_gpu = v_measure_score(gpu_model.labels_, expected_labels)
        score_cpu = v_measure_score(cpu_model.labels_, expected_labels)

        # The GPU score may beat the sklearn score, or trail it by at most
        # 0.1; a larger shortfall means something needs fixing.
        score_gap = score_gpu - score_cpu
        assert score_gap >= -0.1
Example #11
0
    def test_speed_vs_sk(self):
        """Print the fit time of GPU KMeans and of sklearn's KMeans.

        Both estimators are fitted on the same 100000-sample, 10-blob
        synthetic dataset. Nothing is asserted; the two elapsed times (in
        seconds, GPU first) are printed for manual inspection.
        """
        from sklearn.cluster import KMeans as skKMeans
        n_samples = 100000
        centers = 10
        # The generating labels are not needed for a pure timing test.
        X, _ = make_blobs(n_samples=n_samples, centers=centers,
                          cluster_std=1., random_state=42)

        kmeans_h2o = KMeans(n_gpus=1, n_clusters=centers)
        # Warmup - the first call also pays for loading the CUDA kernels,
        # which would otherwise be counted as fit time.
        kmeans_h2o.fit(X)
        # perf_counter is monotonic, so the interval cannot be distorted by
        # system clock adjustments the way time.time() can.
        start_h2o = time.perf_counter()
        kmeans_h2o.fit(X)
        elapsed_h2o = time.perf_counter() - start_h2o

        kmeans_sk = skKMeans(n_init=1, n_clusters=centers, init='random')
        start_sk = time.perf_counter()
        kmeans_sk.fit(X)
        elapsed_sk = time.perf_counter() - start_sk

        print(elapsed_h2o)
        print(elapsed_sk)