def test_sparse(self): """ Tests fit_transforms with sparse data""" n_samples = 1500 x, y = make_blobs(n_samples=n_samples, random_state=170) transformation = [[0.6, -0.6], [-0.4, 0.8]] x = np.dot(x, transformation) dense_arr = ds.array(x, block_size=(300, 2)) sparse_arr = ds.array(csr_matrix(x), block_size=(300, 2)) sc = StandardScaler() dense_scaled = sc.fit_transform(dense_arr) dense_mean = sc.mean_.collect() dense_var = sc.var_.collect() sparse_scaled = sc.fit_transform(sparse_arr) sparse_mean = sc.mean_.collect() sparse_var = sc.var_.collect() csr_scaled = sparse_scaled.collect() arr_scaled = dense_scaled.collect() self.assertTrue(issparse(csr_scaled)) self.assertTrue(sparse_scaled._sparse) self.assertTrue(sc.var_._sparse) self.assertTrue(sc.mean_._sparse) self.assertTrue(issparse(sparse_mean)) self.assertTrue(issparse(sparse_var)) self.assertTrue(np.allclose(csr_scaled.toarray(), arr_scaled)) self.assertTrue(np.allclose(sparse_mean.toarray(), dense_mean)) self.assertTrue(np.allclose(sparse_var.toarray(), dense_var))
def test_fit_transform(self): """ Tests fit_transform against scikit-learn. """ n_samples = 1500 x, y = make_blobs(n_samples=n_samples, random_state=170) transformation = [[0.6, -0.6], [-0.4, 0.8]] x = np.dot(x, transformation) ds_arr = ds.array(x, block_size=(300, 2)) sc1 = SkStandardScaler() scaled_x = sc1.fit_transform(x) sc2 = StandardScaler() ds_scaled = sc2.fit_transform(ds_arr) self.assertTrue(np.allclose(scaled_x, ds_scaled.collect())) self.assertTrue(np.allclose(sc1.mean_, sc2.mean_.collect())) self.assertTrue(np.allclose(sc1.var_, sc2.var_.collect())) self.assertEqual(ds_scaled._top_left_shape, ds_scaled._blocks[0][0].shape) self.assertEqual(ds_arr._reg_shape, ds_scaled._reg_shape) self.assertEqual(ds_arr._top_left_shape, ds_scaled._top_left_shape) self.assertEqual(ds_arr.shape, ds_scaled.shape) self.assertEqual(ds_arr._n_blocks, ds_scaled._n_blocks)
def test_irregular(self): """ Test with an irregular array """ n_samples = 1500 x, y = make_blobs(n_samples=n_samples, random_state=170) transformation = [[0.6, -0.6], [-0.4, 0.8]] x = np.dot(x, transformation) ds_arr = ds.array(x, block_size=(300, 2)) ds_arr = ds_arr[297:602] x = x[297:602] sc1 = SkStandardScaler() scaled_x = sc1.fit_transform(x) sc2 = StandardScaler() ds_scaled = sc2.fit_transform(ds_arr) self.assertTrue(np.allclose(scaled_x, ds_scaled.collect())) self.assertTrue(np.allclose(sc1.mean_, sc2.mean_.collect())) self.assertTrue(np.allclose(sc1.var_, sc2.var_.collect())) self.assertEqual(ds_scaled._top_left_shape, compss_wait_on(ds_scaled._blocks[0][0]).shape) self.assertEqual(ds_arr._reg_shape, ds_scaled._reg_shape) self.assertEqual(ds_arr._top_left_shape, ds_scaled._top_left_shape) self.assertEqual(ds_arr.shape, ds_scaled.shape) self.assertEqual(ds_arr._n_blocks, ds_scaled._n_blocks)
def test_fit_2(self): """Tests GridSearchCV fit() with different data.""" x_np, y_np = datasets.load_breast_cancer(return_X_y=True) x = ds.array(x_np, block_size=(100, 10)) x = StandardScaler().fit_transform(x) y = ds.array(y_np.reshape(-1, 1), block_size=(100, 1)) parameters = {'c': [0.1], 'gamma': [0.1]} csvm = CascadeSVM() searcher = GridSearchCV(csvm, parameters, cv=5) searcher.fit(x, y) self.assertTrue(hasattr(searcher, 'best_estimator_')) self.assertTrue(hasattr(searcher, 'best_score_')) self.assertTrue(hasattr(searcher, 'best_params_')) self.assertTrue(hasattr(searcher, 'best_index_')) self.assertTrue(hasattr(searcher, 'scorer_')) self.assertEqual(searcher.n_splits_, 5)