def test_speed_vs_sk(self, order):
    """Time a warmed-up single-GPU KMeans fit and optionally compare it.

    The GPU fit always runs, so the test doubles as a smoke test. The
    comparison against ``KMeansSklearn`` is only performed when the
    ``CHECKPERFORMANCE`` environment variable is set.

    Parameters
    ----------
    order : str
        Memory layout handed to ``np.asanyarray`` for the input matrix
        (presumably 'C' or 'F' from a parametrize decorator -- TODO confirm).
    """
    import numpy as np
    from h2o4gpu.cluster import KMeansSklearn as skKMeans

    n_samples = 100000
    centers = 10
    X, _ = make_blobs(n_samples=n_samples, centers=centers,
                      cluster_std=1., random_state=42)
    # Apply the requested layout so every value of ``order`` actually
    # exercises a different input representation.
    X = np.asanyarray(X, order=order)

    kmeans_h2o = KMeans(n_gpus=1, n_clusters=centers, random_state=42)
    # Warmup - during first call CUDA kernels take ~2sec to load
    kmeans_h2o.fit(X)

    # perf_counter() is monotonic and high resolution; time.time() can jump
    # when the system clock is adjusted.
    start_h2o = time.perf_counter()
    kmeans_h2o.fit(X)
    elapsed_h2o = time.perf_counter() - start_h2o

    if os.getenv("CHECKPERFORMANCE") is not None:
        kmeans_sk = skKMeans(n_init=1, n_clusters=centers,
                             algorithm='full', n_jobs=-1)
        start_sk = time.perf_counter()
        kmeans_sk.fit(X)
        elapsed_sk = time.perf_counter() - start_sk

        assert elapsed_h2o <= elapsed_sk, \
            'GPU fit took {0:.3f}s, reference fit took {1:.3f}s ' \
            '(order={2})'.format(elapsed_h2o, elapsed_sk, order)
def test_speed_vs_sk(self):
    """Assert a warmed-up single-GPU KMeans fit is not slower than sklearn.

    Both estimators are fitted on the same 100000-sample, 10-center blob
    data set; only the second (post-warmup) GPU fit is timed.
    """
    from sklearn.cluster import KMeans as skKMeans

    n_samples = 100000
    centers = 10
    X, _ = make_blobs(n_samples=n_samples, centers=centers,
                      cluster_std=1., random_state=42)

    kmeans_h2o = KMeans(n_gpus=1, n_clusters=centers, random_state=42)
    # Warmup - during first call CUDA kernels take ~2sec to load
    kmeans_h2o.fit(X)

    # perf_counter() is monotonic and high resolution; time.time() can jump
    # when the system clock is adjusted.
    start_h2o = time.perf_counter()
    kmeans_h2o.fit(X)
    elapsed_h2o = time.perf_counter() - start_h2o

    kmeans_sk = skKMeans(n_init=1, n_clusters=centers, init='random',
                         algorithm='full', n_jobs=-1)
    start_sk = time.perf_counter()
    kmeans_sk.fit(X)
    elapsed_sk = time.perf_counter() - start_sk

    assert elapsed_h2o <= elapsed_sk, \
        'GPU fit took {0:.3f}s, sklearn fit took {1:.3f}s'.format(
            elapsed_h2o, elapsed_sk)
def _test_accuracy(self, order, n_samples=500000, centers=10, n_features=2):
    """Compare clustering quality of the GPU KMeans against ``KMeansSklearn``.

    Both models are fitted on the same synthetic blobs and scored with
    ``v_measure_score`` against the generated labels. The GPU score may be
    higher than the reference score, or lower by at most
    ``0.05 + 0.1 * |reference score|``.

    Parameters
    ----------
    order : str
        Memory layout passed to ``np.asanyarray`` for the input matrix.
    n_samples : int
        Number of points generated by ``make_blobs``.
    centers : int
        Number of blob centers, also used as ``n_clusters``.
    n_features : int
        Dimensionality of the generated points.
    """
    from h2o4gpu.cluster import KMeansSklearn as skKMeans

    X, true_labels = make_blobs(n_samples=n_samples, centers=centers,
                                n_features=n_features, cluster_std=2.,
                                random_state=42)
    X = np.asanyarray(X, order=order)

    kmeans_h2o = KMeans(n_gpus=1, n_clusters=centers, random_state=42,
                        verbose=1000)
    kmeans_h2o.fit(X)

    kmeans_sk = skKMeans(n_init=1, n_clusters=centers, random_state=42)
    kmeans_sk.fit(X)

    accuracy_h2o = v_measure_score(kmeans_h2o.labels_, true_labels)
    accuracy_sk = v_measure_score(kmeans_sk.labels_, true_labels)

    # We also want to be either better or at most 10% worse than SKLearn
    # Everything else is horrible and we probably should fix something
    # TODO: it's failing with lower rtol, find out why it's so inaccurate
    #
    # The bound is one-sided on purpose: a symmetric closeness check would
    # also reject a run in which the GPU model is clearly *better*.
    rtol = 0.1
    atol = 0.05
    allowed_deficit = atol + rtol * abs(accuracy_sk)
    assert accuracy_h2o >= accuracy_sk - allowed_deficit, \
        'Accuracy error {0} {1} n_samples={2}, centers={3}, n_features={4}, order={5}'.format(
            accuracy_h2o, accuracy_sk, n_samples, centers, n_features, order)
def test_fit_i32_vs_f32(self):
    """Labels predicted for the same points must not depend on input dtype.

    One estimator is refitted on a float64 array and on its float32 and
    int32 casts; the predicted labels must agree pairwise.
    """
    points_f64 = np.array([[1., 2.], [1., 4.], [1., 0.],
                           [4., 2.], [4., 4.], [4., 0.]])
    points_f32 = points_f64.astype(np.float32)
    points_i32 = points_f64.astype(np.int32)

    estimator = KMeans(n_gpus=1, n_clusters=2, random_state=123)

    # Fit and predict in the fixed order float64 -> float32 -> int32.
    labels_f64, labels_f32, labels_i32 = [
        estimator.fit(points).predict(points)
        for points in (points_f64, points_f32, points_i32)
    ]

    assert all(labels_f64 == labels_f32)
    assert all(labels_f32 == labels_i32)
def test_fit_iris(self):
    """Check center shape and reproducibility of KMeans on the iris data.

    Verifies that a fixed ``random_state`` gives the same centers across
    separate estimators, across repeated ``fit`` calls on one estimator, and
    between ``n_gpus=1`` and the default GPU setting.
    """
    X = load_iris().data
    clusters = 4

    model = KMeans(n_gpus=1, n_clusters=clusters, random_state=123).fit(X)
    # One row per cluster, one column per feature (scikit-learn layout).
    # Iris has 4 features and we ask for 4 clusters, so the transposed
    # tuple would pass too; keep the order correct for other data sets.
    assert model.cluster_centers_.shape == (clusters, X.shape[1])

    model_rerun = KMeans(n_gpus=1, n_clusters=clusters,
                         random_state=123).fit(X)
    # Same random_state should yield same results
    assert np.allclose(model.cluster_centers_, model_rerun.cluster_centers_)

    # Snapshot the centers before refitting: fit() may hand back the very
    # same estimator, and comparing an object with itself proves nothing.
    centers_first_fit = np.array(model_rerun.cluster_centers_, copy=True)
    model_rerun2 = model_rerun.fit(X)
    # Multiple invocations of fit with the same random_state
    # also should produce the same result
    assert np.allclose(centers_first_fit, model_rerun2.cluster_centers_)

    model_all = KMeans(n_clusters=clusters, random_state=123).fit(X)
    # Multi GPU should yield same result as single GPU
    assert np.allclose(model.cluster_centers_, model_all.cluster_centers_)
def test_fit_iris(self):
    """Check center shape and reproducibility of KMeans on the iris data.

    Verifies that a fixed ``random_state`` gives the same centers across
    separate estimators, across repeated ``fit`` calls on one estimator, and
    between ``n_gpus=1`` and the default GPU setting.
    """
    X = load_iris().data
    clusters = 4

    model = KMeans(n_gpus=1, n_clusters=clusters, random_state=123).fit(X)
    # One row per cluster, one column per feature (scikit-learn layout).
    # Iris has 4 features and we ask for 4 clusters, so the transposed
    # tuple would pass too; keep the order correct for other data sets.
    assert model.cluster_centers_.shape == (clusters, X.shape[1])

    model_rerun = KMeans(n_gpus=1, n_clusters=clusters,
                         random_state=123).fit(X)
    # Same random_state should yield same results
    assert np.allclose(
        model.cluster_centers_, model_rerun.cluster_centers_
    )

    # Snapshot the centers before refitting: fit() may hand back the very
    # same estimator, and comparing an object with itself proves nothing.
    centers_first_fit = np.array(model_rerun.cluster_centers_, copy=True)
    model_rerun2 = model_rerun.fit(X)
    # Multiple invocations of fit with the same random_state
    # also should produce the same result
    assert np.allclose(
        centers_first_fit, model_rerun2.cluster_centers_
    )

    model_all = KMeans(n_clusters=clusters, random_state=123).fit(X)
    # Multi GPU should yield same result as single GPU
    assert np.allclose(
        model.cluster_centers_, model_all.cluster_centers_
    )
def test_fit_iris_precision(self):
    """Iris labels must match between the original data and its float32 cast.

    A single estimator is fitted first on the data as loaded and then on the
    float32 copy; predictions on the respective inputs must be identical.
    """
    iris_data = load_iris().data
    iris_data_f32 = iris_data.astype(np.float32)

    estimator = KMeans(n_gpus=1, n_clusters=4, random_state=12345)

    def fit_then_predict(samples):
        # Refit the shared estimator on ``samples`` and label those samples.
        return estimator.fit(samples).predict(samples)

    labels_original = fit_then_predict(iris_data)
    labels_f32 = fit_then_predict(iris_data_f32)

    assert all(labels_original == labels_f32)
def test_accuracy(self):
    """GPU KMeans must score no more than 0.1 below sklearn's KMeans.

    Both estimators cluster the same 500000-sample, 10-center blob data set
    and are scored with ``v_measure_score`` against the generated labels.
    """
    from sklearn.cluster import KMeans as skKMeans

    n_samples = 500000
    centers = 10
    X, true_labels = make_blobs(
        n_samples=n_samples,
        centers=centers,
        cluster_std=1.,
        random_state=42,
    )

    gpu_model = KMeans(n_gpus=1, n_clusters=centers, random_state=42)
    gpu_model.fit(X)

    reference_model = skKMeans(n_init=1, n_clusters=centers, random_state=42)
    reference_model.fit(X)

    gpu_score = v_measure_score(gpu_model.labels_, true_labels)
    reference_score = v_measure_score(reference_model.labels_, true_labels)

    # We also want to be either better or at most 10% worse than SKLearn
    # Everything else is horrible and we probably should fix something
    score_gap = gpu_score - reference_score
    assert score_gap >= -0.1
def test_accuracy(self):
    """GPU KMeans must score no more than 0.1 below sklearn's KMeans.

    Both estimators cluster the same 100000-sample, 10-center blob data set
    (sklearn with ``init='random'``) and are scored with ``v_measure_score``
    against the generated labels.
    """
    from sklearn.cluster import KMeans as skKMeans

    n_samples = 100000
    centers = 10
    X, true_labels = make_blobs(
        n_samples=n_samples,
        centers=centers,
        cluster_std=1.,
        random_state=42,
    )

    gpu_model = KMeans(n_gpus=1, n_clusters=centers, random_state=42)
    gpu_model.fit(X)

    reference_model = skKMeans(n_init=1, n_clusters=centers, init='random',
                               random_state=42)
    reference_model.fit(X)

    gpu_score = v_measure_score(gpu_model.labels_, true_labels)
    reference_score = v_measure_score(reference_model.labels_, true_labels)

    # We also want to be either better or at most 10% worse than SKLearn
    # Everything else is horrible and we probably should fix something
    score_gap = gpu_score - reference_score
    assert score_gap >= -0.1
def test_speed_vs_sk(self):
    """Print the fit durations of the GPU KMeans and sklearn's KMeans.

    This is an informational benchmark: it asserts nothing and prints the
    GPU duration followed by the sklearn duration, both in seconds. The GPU
    fit is timed without a warmup run.
    """
    from sklearn.cluster import KMeans as skKMeans

    n_samples = 100000
    centers = 10
    X, _ = make_blobs(n_samples=n_samples, centers=centers,
                      cluster_std=1., random_state=42)

    kmeans_h2o = KMeans(n_gpus=1, n_clusters=centers)
    # perf_counter() is monotonic and high resolution; time.time() can jump
    # when the system clock is adjusted.
    start_h2o = time.perf_counter()
    kmeans_h2o.fit(X)
    end_h2o = time.perf_counter()

    kmeans_sk = skKMeans(n_init=1, n_clusters=centers, init='random')
    start_sk = time.perf_counter()
    kmeans_sk.fit(X)
    end_sk = time.perf_counter()

    print(end_h2o - start_h2o)
    print(end_sk - start_sk)