def test_speed_vs_sk(self):
    """Check that a warmed-up KMeans fit is no slower than KMeansSklearn.

    The KMeans model is always fitted twice (warmup + timed run), so the
    test exercises repeated fitting even when the benchmark is disabled.
    The timing comparison itself only runs when the CHECKPERFORMANCE
    environment variable is set.
    """
    from h2o4gpu.cluster import KMeansSklearn as skKMeans

    n_samples = 100000
    centers = 10
    # Ground-truth labels are not needed for a pure timing comparison.
    X, _ = make_blobs(n_samples=n_samples, centers=centers,
                      cluster_std=1., random_state=42)

    kmeans_h2o = KMeans(n_gpus=1, n_clusters=centers, random_state=42)
    # Warmup - during first call CUDA kernels take ~2sec to load
    kmeans_h2o.fit(X)

    # perf_counter() is monotonic and high-resolution; time.time() can jump
    # when the system clock is adjusted and would skew the comparison.
    start_h2o = time.perf_counter()
    kmeans_h2o.fit(X)
    elapsed_h2o = time.perf_counter() - start_h2o

    if os.getenv("CHECKPERFORMANCE") is not None:
        kmeans_sk = skKMeans(n_init=1, n_clusters=centers,
                             algorithm='full', n_jobs=-1)
        start_sk = time.perf_counter()
        kmeans_sk.fit(X)
        elapsed_sk = time.perf_counter() - start_sk

        assert elapsed_h2o <= elapsed_sk, \
            'KMeans fit took {0:.3f}s, KMeansSklearn fit took {1:.3f}s'.format(
                elapsed_h2o, elapsed_sk)
def _test_accuracy(self, order, n_samples=500000, centers=10, n_features=2):
    """Compare clustering quality of KMeans against KMeansSklearn.

    Both models are fitted on the same synthetic blobs and scored with
    V-measure against the generated labels. The check passes when the
    KMeans score is better than, or within tolerance below, the
    KMeansSklearn score.

    :param order: memory layout passed to ``np.asanyarray`` for the
        input matrix.
    :param n_samples: number of points to generate.
    :param centers: number of blobs generated and clusters fitted.
    :param n_features: dimensionality of the generated points.
    """
    from h2o4gpu.cluster import KMeansSklearn as skKMeans

    X, true_labels = make_blobs(n_samples=n_samples, centers=centers,
                                n_features=n_features, cluster_std=2.,
                                random_state=42)
    X = np.asanyarray(X, order=order)

    kmeans_h2o = KMeans(n_gpus=1, n_clusters=centers, random_state=42,
                        verbose=1000)
    kmeans_h2o.fit(X)
    kmeans_sk = skKMeans(n_init=1, n_clusters=centers, random_state=42)
    kmeans_sk.fit(X)

    accuracy_h2o = v_measure_score(kmeans_h2o.labels_, true_labels)
    accuracy_sk = v_measure_score(kmeans_sk.labels_, true_labels)

    # We also want to be either better or at most 10% worse than SKLearn
    # Everything else is horrible and we probably should fix something
    # TODO: it's failing with lower rtol, find out why it's so inaccurate
    #
    # The bound is one-sided on purpose: a two-sided closeness check would
    # also fail when the score is *better* than SKLearn's. The tolerance
    # uses the same form as np.allclose: atol + rtol * |reference|.
    rtol = 0.1
    atol = 0.05
    lowest_acceptable = accuracy_sk - (atol + rtol * abs(accuracy_sk))
    assert accuracy_h2o >= lowest_acceptable, \
        'Accuracy error {0} {1} n_samples={2}, centers={3}, n_features={4}, order={5}'.format(
            accuracy_h2o, accuracy_sk, n_samples, centers, n_features, order)
def test_accuracy(self):
    """Check KMeans clustering quality against KMeansSklearn.

    Fits both models on the same 500000-point, 10-blob synthetic dataset
    and requires the KMeans V-measure to be no more than 0.1 (absolute)
    below the KMeansSklearn V-measure.
    """
    from h2o4gpu.cluster import KMeansSklearn as skKMeans

    n_clusters = 10
    data, expected = make_blobs(
        n_samples=500000,
        centers=n_clusters,
        cluster_std=1.,
        random_state=42,
    )

    # Fit order is kept: candidate model first, reference model second.
    h2o_model = KMeans(n_gpus=1, n_clusters=n_clusters, random_state=42)
    h2o_model.fit(data)

    sk_model = skKMeans(n_init=1, n_clusters=n_clusters, random_state=42)
    sk_model.fit(data)

    h2o_score = v_measure_score(h2o_model.labels_, expected)
    sk_score = v_measure_score(sk_model.labels_, expected)

    # Being better than the reference is fine; only a drop of more than
    # 0.1 in V-measure is treated as a failure worth investigating.
    score_gap = h2o_score - sk_score
    assert score_gap >= -0.1