def test_index_rows_dense(self):
    """ Tests getting a slice of rows from the ds.array using lists as
    index """
    config.session.execute("TRUNCATE TABLE hecuba.istorage")
    config.session.execute("DROP KEYSPACE IF EXISTS hecuba_dislib")

    bn, bm = 5, 5
    x = np.random.randint(100, size=(10, 10))
    ds_data = ds.array(x=x, block_size=(bn, bm))
    data = ds.array(x=x, block_size=(bn, bm))
    data.make_persistent(name="hecuba_dislib.test_array")

    indices_lists = [([0, 5], [0, 5])]

    for rows, cols in indices_lists:
        got = data[rows].collect()
        expected = ds_data[rows].collect()
        self.assertTrue(equal(got, expected))

    # Try slicing with irregular array
    x = ds_data[1:, 1:]
    data_sliced = data[1:, 1:]

    for rows, cols in indices_lists:
        got = data_sliced[rows].collect()
        expected = x[rows].collect()
        self.assertTrue(equal(got, expected))
def test_univariate(self):
    """Tests fit() and predict(), univariate."""
    x_data = np.array([1, 2, 3, 4, 5])
    y_data = np.array([2, 1, 1, 2, 4.5])

    bn, bm = 2, 1

    x = ds.array(x=x_data, block_size=(bn, bm))
    y = ds.array(x=y_data, block_size=(bn, bm))

    reg = LinearRegression()
    reg.fit(x, y)
    self.assertTrue(np.allclose(reg.coef_.collect(), 0.6))
    self.assertTrue(np.allclose(reg.intercept_.collect(), 0.3))

    # Predict one sample
    x_test = np.array([3])
    test_data = ds.array(x=x_test, block_size=(1, 1))
    pred = reg.predict(test_data).collect()
    self.assertTrue(np.allclose(pred, 2.1))

    # Predict multiple samples
    x_test = np.array([3, 5, 6])
    test_data = ds.array(x=x_test, block_size=(bn, bm))
    pred = reg.predict(test_data).collect()
    self.assertTrue(np.allclose(pred, [2.1, 3.3, 3.9]))
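# Cross-check (a sketch, not part of the original suite): for the five
# training points used in test_univariate, an ordinary least-squares fit
# with NumPy reproduces the expected slope of 0.6 and intercept of 0.3.
import numpy as np

slope, intercept = np.polyfit(np.array([1, 2, 3, 4, 5]),
                              np.array([2, 1, 1, 2, 4.5]), deg=1)
assert np.allclose([slope, intercept], [0.6, 0.3])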
def test_multivariate(self):
    """Tests fit() and predict(), multivariate."""
    x_data = np.array([[1, 2], [2, 0], [3, 1], [4, 4], [5, 3]])
    y_data = np.array([2, 1, 1, 2, 4.5])

    bn, bm = 2, 2

    x = ds.array(x=x_data, block_size=(bn, bm))
    y = ds.array(x=y_data, block_size=(bn, 1))

    reg = LinearRegression()
    reg.fit(x, y)
    self.assertTrue(np.allclose(reg.coef_.collect(), [0.421875, 0.296875]))
    self.assertTrue(np.allclose(reg.intercept_.collect(), 0.240625))

    # Predict one sample
    x_test = np.array([3, 2])
    test_data = ds.array(x=x_test, block_size=(1, bm))
    pred = reg.predict(test_data).collect()
    self.assertTrue(np.allclose(pred, 2.1))

    # Predict multiple samples
    x_test = np.array([[3, 2], [4, 4], [1, 3]])
    test_data = ds.array(x=x_test, block_size=(bn, bm))
    pred = reg.predict(test_data).collect()
    self.assertTrue(np.allclose(pred, [2.1, 3.115625, 1.553125]))
def test_fit_and_predict(self):
    """Tests LinearRegression's fit() and predict()."""
    x_data = np.array([1, 2, 3, 4, 5]).reshape(-1, 1)
    y_data = np.array([2, 1, 1, 2, 4.5]).reshape(-1, 1)

    bn, bm = 2, 2

    x = ds.array(x=x_data, block_size=(bn, bm))
    y = ds.array(x=y_data, block_size=(bn, bm))

    reg = LinearRegression()
    reg.fit(x, y)
    # y = 0.6 * x + 0.3
    reg.coef_ = compss_wait_on(reg.coef_)
    reg.intercept_ = compss_wait_on(reg.intercept_)
    self.assertTrue(np.allclose(reg.coef_, 0.6))
    self.assertTrue(np.allclose(reg.intercept_, 0.3))

    x_test = np.array([3, 5]).reshape(-1, 1)
    test_data = ds.array(x=x_test, block_size=(bn, bm))
    pred = reg.predict(test_data).collect()
    self.assertTrue(np.allclose(pred, [2.1, 3.3]))
def test_sparse(self):
    """ Tests fit_transform with sparse data """
    n_samples = 1500
    x, y = make_blobs(n_samples=n_samples, random_state=170)
    transformation = [[0.6, -0.6], [-0.4, 0.8]]
    x = np.dot(x, transformation)

    dense_arr = ds.array(x, block_size=(300, 2))
    sparse_arr = ds.array(csr_matrix(x), block_size=(300, 2))

    sc = StandardScaler()
    dense_scaled = sc.fit_transform(dense_arr)
    dense_mean = sc.mean_.collect()
    dense_var = sc.var_.collect()

    sparse_scaled = sc.fit_transform(sparse_arr)
    sparse_mean = sc.mean_.collect()
    sparse_var = sc.var_.collect()

    csr_scaled = sparse_scaled.collect()
    arr_scaled = dense_scaled.collect()

    self.assertTrue(issparse(csr_scaled))
    self.assertTrue(sparse_scaled._sparse)
    self.assertTrue(sc.var_._sparse)
    self.assertTrue(sc.mean_._sparse)
    self.assertTrue(issparse(sparse_mean))
    self.assertTrue(issparse(sparse_var))
    self.assertTrue(np.allclose(csr_scaled.toarray(), arr_scaled))
    self.assertTrue(np.allclose(sparse_mean.toarray(), dense_mean))
    self.assertTrue(np.allclose(sparse_var.toarray(), dense_var))
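# Reference computation (a hedged sketch, not from the test module): the
# dense path of the StandardScaler above should match textbook per-feature
# standardization, z = (x - mean) / sqrt(var).
import numpy as np

def reference_standardize(x):
    """Standardize each column of a dense 2-d array to zero mean and unit
    variance, mirroring what the scaler is expected to compute."""
    return (x - x.mean(axis=0)) / np.sqrt(x.var(axis=0))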
def test_kron(self, shape_a, shape_b, sparse):
    """ Tests the Kronecker product """
    np.random.seed()

    a_np = np.random.random(shape_a)
    b_np = np.random.random(shape_b)
    expected = np.kron(a_np, b_np)

    if sparse:
        a_np = sp.csr_matrix(a_np)
        b_np = sp.csr_matrix(b_np)

    b0 = np.random.randint(1, a_np.shape[0] + 1)
    b1 = np.random.randint(1, a_np.shape[1] + 1)
    b2 = np.random.randint(1, b_np.shape[0] + 1)
    b3 = np.random.randint(1, b_np.shape[1] + 1)

    a = ds.array(a_np, (b0, b1))
    b = ds.array(b_np, (b2, b3))

    b4 = np.random.randint(1, (b0 * b2) + 1)
    b5 = np.random.randint(1, (b1 * b3) + 1)

    computed = ds.kron(a, b, (b4, b5))

    self.assertTrue(_validate_array(computed))

    computed = computed.collect(False)

    # convert to ndarray because there is no kron for sparse matrices in
    # scipy
    if a._sparse:
        computed = computed.toarray()

    self.assertTrue(_equal_arrays(expected, computed))
def test_score(self, collect):
    seed = 666

    # negative points belong to class 1, positives to 0
    p1, p2, p3, p4 = [1, 2], [2, 1], [-1, -2], [-2, -1]

    x = ds.array(np.array([p1, p4, p3, p2]), (2, 2))
    y = ds.array(np.array([0, 1, 1, 0]).reshape(-1, 1), (2, 1))

    csvm = CascadeSVM(cascade_arity=3, max_iter=10, tol=1e-4,
                      kernel='rbf', c=2, gamma=0.1,
                      check_convergence=True,
                      random_state=seed, verbose=False)

    csvm.fit(x, y)

    # points are separable, scoring the training dataset should have 100%
    # accuracy
    x_test = ds.array(np.array([p1, p2, p3, p4]), (2, 2))
    y_test = ds.array(np.array([0, 0, 1, 1]).reshape(-1, 1), (2, 1))

    accuracy = csvm.score(x_test, y_test, collect)
    if not collect:
        accuracy = compss_wait_on(accuracy)

    self.assertEqual(accuracy, 1.0)
def test_make_regression_sklearn_max_predict(self):
    """Tests RandomForestRegressor predict with sklearn_max."""
    x, y = make_regression(
        n_samples=3000,
        n_features=10,
        n_informative=4,
        shuffle=True,
        random_state=0,
    )
    x_train = ds.array(x[::2], (300, 10))
    y_train = ds.array(y[::2][:, np.newaxis], (300, 1))
    x_test = ds.array(x[1::2], (300, 10))
    y_test = ds.array(y[1::2][:, np.newaxis], (300, 1))

    rf = RandomForestRegressor(random_state=0, sklearn_max=10)

    rf.fit(x_train, y_train)
    accuracy1 = compss_wait_on(rf.score(x_test, y_test))

    y_pred = rf.predict(x_test).collect()
    y_true = y[1::2]
    accuracy2 = _determination_coefficient(y_true, y_pred)

    self.assertGreater(accuracy1, 0.85)
    self.assertGreater(accuracy2, 0.85)
    self.assertAlmostEqual(accuracy1, accuracy2)
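# A minimal sketch of the _determination_coefficient helper used above,
# assuming it computes the standard coefficient of determination,
# R^2 = 1 - SS_res / SS_tot; the actual helper is defined in the test
# module.
import numpy as np

def _determination_coefficient(y_true, y_pred):
    u = np.sum(np.square(y_true - y_pred))           # residual sum of squares
    v = np.sum(np.square(y_true - np.mean(y_true)))  # total sum of squares
    return 1 - u / v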
def test_make_classification_sklearn_max_predict_proba(self):
    """Tests RandomForestClassifier predict_proba with sklearn_max."""
    x, y = make_classification(
        n_samples=3000,
        n_features=10,
        n_classes=3,
        n_informative=4,
        n_redundant=2,
        n_repeated=1,
        n_clusters_per_class=2,
        shuffle=True,
        random_state=0,
    )
    x_train = ds.array(x[::2], (300, 10))
    y_train = ds.array(y[::2][:, np.newaxis], (300, 1))
    x_test = ds.array(x[1::2], (300, 10))
    y_test = y[1::2]

    rf = RandomForestClassifier(random_state=0, sklearn_max=10)

    rf.fit(x_train, y_train)
    probabilities = rf.predict_proba(x_test).collect()
    rf.classes = compss_wait_on(rf.classes)
    y_pred = rf.classes[np.argmax(probabilities, axis=1)]
    accuracy = np.count_nonzero(y_pred == y_test) / len(y_test)
    self.assertGreater(accuracy, 0.7)
def load_movielens(data_path, train_ratio=0.9):
    cols = ['user_id', 'movie_id', 'rating', 'timestamp']
    file = 'sample_movielens_ratings.csv'

    # 30 users, 100 movies
    df = pd.read_csv(os.path.join(data_path, file), delimiter=',',
                     names=cols, usecols=cols[0:3]).sample(frac=1,
                                                           random_state=666)

    # just in case there are movies/users without ratings
    n_m = max(df.movie_id.nunique(), max(df.movie_id) + 1)
    n_u = max(df.user_id.nunique(), max(df.user_id) + 1)

    idx = int(df.shape[0] * train_ratio)
    tr_df = df.iloc[:idx]
    te_df = df.iloc[idx:]

    train = csr_matrix((tr_df.rating, (tr_df.user_id, tr_df.movie_id)),
                       shape=(n_u, n_m))
    test = csr_matrix((te_df.rating, (te_df.user_id, te_df.movie_id)))

    x_size, y_size = ceil(train.shape[0] / 2), ceil(train.shape[1] / 3)
    train_arr = ds.array(train, block_size=(x_size, y_size))

    x_size, y_size = ceil(test.shape[0] / 2), ceil(test.shape[1] / 3)
    test_arr = ds.array(test, block_size=(x_size, y_size))

    return train_arr, test_arr
def test_median(self):
    """ Tests the median """
    x_np = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
    x = ds.array(x_np, block_size=(2, 2))

    xm = x.median()
    self.assertTrue(_validate_array(xm))

    expected = np.median(x_np, axis=0)
    self.assertTrue(_equal_arrays(expected, xm.collect()))

    xm = x.median(axis=1)
    self.assertTrue(_validate_array(xm))

    expected = np.median(x_np, axis=1)
    self.assertTrue(_equal_arrays(expected, xm.collect()))

    with self.assertRaises(NotImplementedError):
        x_csr = ds.array(sp.csr_matrix([[1, 2, 3], [4, 5, 6], [7, 8, 9]]),
                         (2, 2))
        x_csr.median()
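# A hedged sketch of the _equal_arrays helper these tests rely on, assuming
# it densifies sparse operands and compares elementwise with a tolerance;
# the real helper lives in the test module.
import numpy as np
import scipy.sparse as sp

def _equal_arrays(x1, x2):
    # Densify sparse operands so a single allclose comparison works
    if sp.issparse(x1):
        x1 = x1.toarray()
    if sp.issparse(x2):
        x2 = x2.toarray()
    return np.allclose(x1, x2)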
def main():
    """ Linear regression example with plot """

    # Example data
    x = np.array([1000, 4000, 5000, 4500, 3000, 4000, 9000, 11000, 15000,
                  12000, 7000, 3000])
    y = np.array([9914, 40487, 54324, 50044, 34719, 42551, 94871, 118914,
                  158484, 131348, 78504, 36284])

    x_ds = ds.array(x[:, np.newaxis], (4, 1))
    y_ds = ds.array(y[:, np.newaxis], (4, 1))

    reg = LinearRegression()
    reg.fit(x_ds, y_ds)

    reg.coef_ = compss_wait_on(reg.coef_)
    reg.intercept_ = compss_wait_on(reg.intercept_)
    print(reg.coef_, reg.intercept_)

    # Plot the result
    scatter(x, y, marker='x')
    x_mesh = np.linspace(min(x), max(x), 1000)
    plot(x_mesh, [reg.coef_ * x_i + reg.intercept_ for x_i in x_mesh])
    show()
def test_knn_fit(self):
    """ Tests knn fit_predict and compares the result with
        regular ds-arrays """
    config.session.execute("TRUNCATE TABLE hecuba.istorage")
    config.session.execute("DROP KEYSPACE IF EXISTS hecuba_dislib")

    x = np.random.random((1500, 5))
    block_size = (500, 5)
    block_size2 = (250, 5)

    data = ds.array(x, block_size=block_size)
    q_data = ds.array(x, block_size=block_size2)

    data_h = ds.array(x, block_size=block_size)
    data_h.make_persistent(name="hecuba_dislib.test_array")

    q_data_h = ds.array(x, block_size=block_size2)
    q_data_h.make_persistent(name="hecuba_dislib.test_array_q")

    knn = NearestNeighbors(n_neighbors=10)
    knn.fit(data)
    dist, ind = knn.kneighbors(q_data)

    knn_h = NearestNeighbors(n_neighbors=10)
    knn_h.fit(data_h)
    dist_h, ind_h = knn_h.kneighbors(q_data_h)

    self.assertTrue(
        np.allclose(dist.collect(), dist_h.collect(), atol=1e-7))
    self.assertTrue(np.array_equal(ind.collect(), ind_h.collect()))
def test_linear_regression(self):
    """ Tests linear regression fit_predict and compares the result with
        regular ds-arrays """
    config.session.execute("TRUNCATE TABLE hecuba.istorage")
    config.session.execute("DROP KEYSPACE IF EXISTS hecuba_dislib")

    x_data = np.array([1, 2, 3, 4, 5]).reshape(-1, 1)
    y_data = np.array([2, 1, 1, 2, 4.5]).reshape(-1, 1)

    block_size = (x_data.shape[0] // 3, x_data.shape[1])

    x = ds.array(x=x_data, block_size=block_size)
    x.make_persistent(name="hecuba_dislib.test_array_x")
    y = ds.array(x=y_data, block_size=block_size)
    y.make_persistent(name="hecuba_dislib.test_array_y")

    reg = LinearRegression()
    reg.fit(x, y)
    # y = 0.6 * x + 0.3
    reg.coef_ = compss_wait_on(reg.coef_)
    reg.intercept_ = compss_wait_on(reg.intercept_)
    self.assertTrue(np.allclose(reg.coef_, 0.6))
    self.assertTrue(np.allclose(reg.intercept_, 0.3))

    x_test = np.array([3, 5]).reshape(-1, 1)
    test_data = ds.array(x=x_test, block_size=block_size)
    test_data.make_persistent(name="hecuba_dislib.test_array_test")
    pred = reg.predict(test_data).collect()
    self.assertTrue(np.allclose(pred, [2.1, 3.3]))
def load_movielens(train_ratio=0.9):
    file = 'tests/files/sample_movielens_ratings.csv'

    # 'user_id', 'movie_id', 'rating', 'timestamp'
    data = np.genfromtxt(file, dtype='int', delimiter=',',
                         usecols=range(3))

    # just in case there are movies/users without ratings
    # movie_id
    n_m = max(len(np.unique(data[:, 1])), max(data[:, 1]) + 1)
    # user_id
    n_u = max(len(np.unique(data[:, 0])), max(data[:, 0]) + 1)

    idx = int(data.shape[0] * train_ratio)

    train_data = data[:idx]
    test_data = data[idx:]

    train = csr_matrix(
        (train_data[:, 2], (train_data[:, 0], train_data[:, 1])),
        shape=(n_u, n_m))

    test = csr_matrix(
        (test_data[:, 2], (test_data[:, 0], test_data[:, 1])))

    x_size, y_size = train.shape[0] // 4, train.shape[1] // 4
    train_arr = ds.array(train, block_size=(x_size, y_size))

    x_size, y_size = test.shape[0] // 4, test.shape[1] // 4
    test_arr = ds.array(test, block_size=(x_size, y_size))

    return train_arr, test_arr
def test_make_classification_hard_vote_predict(self):
    """Tests RandomForestClassifier predict with hard_vote."""
    x, y = make_classification(
        n_samples=3000,
        n_features=10,
        n_classes=3,
        n_informative=4,
        n_redundant=2,
        n_repeated=1,
        n_clusters_per_class=2,
        shuffle=True,
        random_state=0,
    )
    x_train = ds.array(x[::2], (300, 10))
    y_train = ds.array(y[::2][:, np.newaxis], (300, 1))
    x_test = ds.array(x[1::2], (300, 10))
    y_test = y[1::2]

    rf = RandomForestClassifier(random_state=0, sklearn_max=10,
                                hard_vote=True)

    rf.fit(x_train, y_train)
    y_pred = rf.predict(x_test).collect()
    accuracy = np.count_nonzero(y_pred == y_test) / len(y_test)
    self.assertGreater(accuracy, 0.7)
def test_power(self):
    """ Tests ds-array power and sqrt """
    orig = np.array([[1, 2, 3], [4, 5, 6]])
    x = ds.array(orig, block_size=(2, 1))

    xp = x ** 2
    xs = xp.sqrt()

    self.assertTrue(_validate_array(xp))
    self.assertTrue(_validate_array(xs))

    expected = np.array([[1, 4, 9], [16, 25, 36]])

    self.assertTrue(_equal_arrays(expected, xp.collect()))
    self.assertTrue(_equal_arrays(orig, xs.collect()))

    orig = sp.csr_matrix([[1, 2, 3], [4, 5, 6]])
    x = ds.array(orig, block_size=(2, 1))

    xp = x ** 2
    xs = xp.sqrt()

    self.assertTrue(_validate_array(xp))
    self.assertTrue(_validate_array(xs))

    expected = sp.csr_matrix([[1, 4, 9], [16, 25, 36]])

    self.assertTrue(_equal_arrays(expected, xp.collect()))
    self.assertTrue(_equal_arrays(orig, xs.collect()))

    with self.assertRaises(NotImplementedError):
        x ** x
def main():
    x, y = load_iris(return_X_y=True)

    indices = np.arange(len(x))
    shuffle(indices)

    # use 80% of samples for training
    train_idx = indices[:int(0.8 * len(x))]
    test_idx = indices[int(0.8 * len(x)):]

    # Train the RF classifier
    print("- Training Random Forest classifier with %s samples of Iris "
          "dataset." % len(train_idx))
    x_train = ds.array(x[train_idx], (10, 4))
    y_train = ds.array(y[train_idx][:, np.newaxis], (10, 1))
    forest = RandomForestClassifier(10)
    forest.fit(x_train, y_train)

    # Test the trained RF classifier
    print("- Testing the classifier.", end='')
    x_test = ds.array(x[test_idx], (10, 4))
    y_real = ds.array(y[test_idx][:, np.newaxis], (10, 1))
    y_pred = forest.predict(x_test)

    score = compss_wait_on(forest.score(x_test, y_real))

    # Put the results in a fancy DataFrame and print the accuracy
    df = pd.DataFrame(data=list(zip(y[test_idx], y_pred.collect())),
                      columns=['Label', 'Predicted'])
    print(" Predicted values: \n\n%s" % df)
    print("\n- Classifier accuracy: %s" % score)
def test_predict(self):
    seed = 666

    # negative points belong to class 1, positives to 0
    p1, p2, p3, p4 = [1, 2], [2, 1], [-1, -2], [-2, -1]

    x = ds.array(np.array([p1, p4, p3, p2]), (2, 2))
    y = ds.array(np.array([0, 1, 1, 0]).reshape(-1, 1), (2, 1))

    csvm = CascadeSVM(cascade_arity=3, max_iter=10, tol=1e-4,
                      kernel='linear', c=2, gamma=0.1,
                      check_convergence=False,
                      random_state=seed, verbose=False)

    csvm.fit(x, y)

    # p5 should belong to class 0, p6 to class 1
    p5, p6 = np.array([1, 1]), np.array([-1, -1])

    x_test = ds.array(np.array([p1, p2, p3, p4, p5, p6]), (2, 2))

    y_pred = csvm.predict(x_test)

    l1, l2, l3, l4, l5, l6 = y_pred.collect()

    self.assertTrue(l1 == l2 == l5 == 0)
    self.assertTrue(l3 == l4 == l6 == 1)
def load_movielens(train_ratio=0.9):
    file = 'tests/files/sample_movielens_ratings.csv'
    cols = ['user_id', 'movie_id', 'rating', 'timestamp']

    # 30 users, 100 movies
    df = pd.read_csv(file, names=cols, usecols=cols[0:3])

    # just in case there are movies/users without ratings
    n_m = max(df.movie_id.nunique(), max(df.movie_id) + 1)
    n_u = max(df.user_id.nunique(), max(df.user_id) + 1)

    idx = int(df.shape[0] * train_ratio)

    train_df = df.iloc[:idx]
    test_df = df.iloc[idx:]

    train = csr_matrix(
        (train_df.rating, (train_df.user_id, train_df.movie_id)),
        shape=(n_u, n_m))

    test = csr_matrix(
        (test_df.rating, (test_df.user_id, test_df.movie_id)))

    x_size, y_size = train.shape[0] // 4, train.shape[1] // 4
    train_arr = ds.array(train, block_size=(x_size, y_size))

    x_size, y_size = test.shape[0] // 4, test.shape[1] // 4
    test_arr = ds.array(test, block_size=(x_size, y_size))

    return train_arr, test_arr
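# Typical downstream use of this loader (a hedged sketch: it assumes
# dislib's ALS recommender from dislib.recommendation consumes these
# ratings matrices; verify the constructor arguments against the installed
# dislib version).
from dislib.recommendation import ALS

train, test = load_movielens()
als = ALS(random_state=666)  # assumed constructor parameter
als.fit(train)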
def test_make_classification_hard_vote_score_mix(self):
    """Tests RandomForestClassifier score with hard_vote, sklearn_max,
    distr_depth and max_depth."""
    x, y = make_classification(
        n_samples=3000,
        n_features=10,
        n_classes=3,
        n_informative=4,
        n_redundant=2,
        n_repeated=1,
        n_clusters_per_class=2,
        shuffle=True,
        random_state=0)
    x_train = ds.array(x[:len(x) // 2], (300, 10))
    y_train = ds.array(y[:len(y) // 2][:, np.newaxis], (300, 1))
    x_test = ds.array(x[len(x) // 2:], (300, 10))
    y_test = ds.array(y[len(y) // 2:][:, np.newaxis], (300, 1))

    rf = RandomForestClassifier(random_state=0, sklearn_max=100,
                                distr_depth=2, max_depth=12,
                                hard_vote=True)

    rf.fit(x_train, y_train)
    accuracy = compss_wait_on(rf.score(x_test, y_test))
    self.assertGreater(accuracy, 0.7)
def main():
    x_np, y_np = datasets.load_iris(return_X_y=True)
    x = ds.array(x_np, (30, 4))
    y = ds.array(y_np[:, np.newaxis], (30, 1))
    parameters = {
        'n_estimators': (1, 2, 4, 8, 16, 32),
        'max_depth': range(3, 5)
    }
    rf = RandomForestClassifier()
    searcher = GridSearchCV(rf, parameters, cv=5)
    np.random.seed(0)
    searcher.fit(x, y)

    print(searcher.cv_results_['params'])
    print(searcher.cv_results_['mean_test_score'])

    pd_df = pd.DataFrame.from_dict(searcher.cv_results_)
    print(pd_df[['params', 'mean_test_score']])
    with pd.option_context('display.max_rows', None,
                           'display.max_columns', None):
        print(pd_df)

    print(searcher.best_estimator_)
    print(searcher.best_score_)
    print(searcher.best_params_)
    print(searcher.best_index_)
    print(searcher.scorer_)
    print(searcher.n_splits_)
def test_fit_predict(self):
    """ Tests fit() and predict() methods """
    np.random.seed(42)

    n_samples, n_features = 50, 100
    X = np.random.randn(n_samples, n_features)

    # Decreasing coef w. alternated signs for visualization
    idx = np.arange(n_features)
    coef = (-1) ** idx * np.exp(-idx / 10)
    coef[10:] = 0  # sparsify coef
    y = np.dot(X, coef)

    # Add noise
    y += 0.01 * np.random.normal(size=n_samples)

    n_samples = X.shape[0]
    X_train, y_train = X[:n_samples // 2], y[:n_samples // 2]
    X_test, y_test = X[n_samples // 2:], y[n_samples // 2:]

    lasso = Lasso(lmbd=0.1, max_iter=50)

    lasso.fit(ds.array(X_train, (5, 100)), ds.array(y_train, (5, 1)))
    y_pred_lasso = lasso.predict(ds.array(X_test, (25, 100)))
    r2_score_lasso = r2_score(y_test, y_pred_lasso.collect())

    self.assertEqual(r2_score_lasso, 0.9481746925431124)
def test_scoring_callable(self):
    """Tests GridSearchCV with callable scoring parameter."""
    x_np, y_np = datasets.load_iris(return_X_y=True)
    x = ds.array(x_np, (30, 4))
    y = ds.array(y_np[:, np.newaxis], (30, 1))
    param_grid = {'n_estimators': (2, 4)}
    rf = RandomForestClassifier()

    def scoring(clf, x_score, y_real):
        return clf.score(x_score, y_real)

    searcher = GridSearchCV(rf, param_grid, cv=3, scoring=scoring)
    searcher.fit(x, y)

    self.assertTrue(hasattr(searcher, 'cv_results_'))
    self.assertTrue(hasattr(searcher, 'best_estimator_'))
    self.assertTrue(hasattr(searcher, 'best_score_'))
    self.assertTrue(hasattr(searcher, 'best_params_'))
    self.assertTrue(hasattr(searcher, 'best_index_'))
    self.assertTrue(hasattr(searcher, 'scorer_'))

    def invalid_scoring(clf, x_score, y_score):
        return '2'

    searcher = GridSearchCV(rf, param_grid, cv=3, scoring=invalid_scoring)
    with self.assertRaisesRegex(ValueError,
                                'scoring must return a number'):
        searcher.fit(x, y)
def test_sparse(self, feature_range):
    """ Tests fit_transform with sparse data """
    n_samples = 1500
    x, y = make_blobs(n_samples=n_samples, random_state=170)
    transformation = [[0.6, -0.6], [-0.4, 0.8]]
    x = np.dot(x, transformation)

    dense_arr = ds.array(x, block_size=(300, 2))
    sparse_arr = ds.array(csr_matrix(x), block_size=(300, 2))

    sc = MinMaxScaler(feature_range=feature_range)
    dense_scaled = sc.fit_transform(dense_arr)
    dense_min = sc.data_min_.collect()
    dense_max = sc.data_max_.collect()

    sparse_scaled = sc.fit_transform(sparse_arr)
    sparse_min = sc.data_min_.collect()
    sparse_max = sc.data_max_.collect()

    csr_scaled = sparse_scaled.collect()
    arr_scaled = dense_scaled.collect()

    self.assertTrue(issparse(csr_scaled))
    self.assertTrue(sparse_scaled._sparse)
    self.assertTrue(sc.data_min_._sparse)
    self.assertTrue(sc.data_max_._sparse)
    self.assertTrue(issparse(sparse_min))
    self.assertTrue(issparse(sparse_max))
    self.assertTrue(np.allclose(csr_scaled.toarray(), arr_scaled))
    self.assertTrue(np.allclose(sparse_min.toarray(), dense_min))
    self.assertTrue(np.allclose(sparse_max.toarray(), dense_max))
def test_refit_callable(self):
    """Tests GridSearchCV with callable refit parameter."""
    x_np, y_np = datasets.load_iris(return_X_y=True)
    x = ds.array(x_np, (30, 4))
    y = ds.array(y_np[:, np.newaxis], (30, 1))
    param_grid = {'n_estimators': (2, 4)}
    rf = RandomForestClassifier()

    best_index = 1

    def refit(results):
        return best_index

    searcher = GridSearchCV(rf, param_grid, cv=3, refit=refit)
    searcher.fit(x, y)

    self.assertTrue(hasattr(searcher, 'cv_results_'))
    self.assertTrue(hasattr(searcher, 'best_estimator_'))
    self.assertFalse(hasattr(searcher, 'best_score_'))
    self.assertTrue(hasattr(searcher, 'best_params_'))
    self.assertTrue(hasattr(searcher, 'best_index_'))
    self.assertTrue(hasattr(searcher, 'scorer_'))

    best_index = 'str'
    searcher = GridSearchCV(rf, param_grid, cv=3, refit=refit)
    with self.assertRaises(TypeError):
        searcher.fit(x, y)

    best_index = -1
    searcher = GridSearchCV(rf, param_grid, cv=3, refit=refit)
    with self.assertRaises(IndexError):
        searcher.fit(x, y)
def test_univariate_no_intercept(self):
    """Tests fit() and predict(), univariate, fit_intercept=False."""
    x_data = np.array([1, 2, 3, 4, 5])
    y_data = np.array([2, 1, 1, 2, 4.5])

    bn, bm = 2, 1

    x = ds.array(x=x_data, block_size=(bn, bm))
    y = ds.array(x=y_data, block_size=(bn, bm))

    reg = LinearRegression(fit_intercept=False)
    reg.fit(x, y)
    self.assertTrue(np.allclose(reg.coef_.collect(), 0.68181818))
    self.assertTrue(np.allclose(reg.intercept_.collect(), 0))

    # Predict one sample
    x_test = np.array([3])
    test_data = ds.array(x=x_test, block_size=(1, 1))
    pred = reg.predict(test_data).collect()
    self.assertTrue(np.allclose(pred, 2.04545455))

    # Predict multiple samples
    x_test = np.array([3, 5, 6])
    test_data = ds.array(x=x_test, block_size=(bn, bm))
    pred = reg.predict(test_data).collect()
    self.assertTrue(np.allclose(pred, [2.04545455, 3.4090909, 4.0909091]))
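# Cross-check (a sketch, not from the original suite): with
# fit_intercept=False the univariate least-squares slope reduces to
# sum(x * y) / sum(x ** 2), which for the data above is 37.5 / 55.
import numpy as np

x_data = np.array([1, 2, 3, 4, 5])
y_data = np.array([2, 1, 1, 2, 4.5])
slope = np.dot(x_data, y_data) / np.dot(x_data, x_data)
assert np.allclose(slope, 0.68181818)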
def test_fit(self):
    """Tests GridSearchCV fit()."""
    x_np, y_np = datasets.load_iris(return_X_y=True)
    x = ds.array(x_np, (30, 4))
    y = ds.array(y_np[:, np.newaxis], (30, 1))

    param_grid = {'n_estimators': (2, 4), 'max_depth': range(3, 5)}
    rf = RandomForestClassifier()
    searcher = GridSearchCV(rf, param_grid)
    searcher.fit(x, y)

    expected_keys = {'param_max_depth', 'param_n_estimators', 'params',
                     'mean_test_score', 'std_test_score',
                     'rank_test_score'}
    split_keys = {'split%d_test_score' % i for i in range(5)}
    expected_keys.update(split_keys)
    self.assertSetEqual(set(searcher.cv_results_.keys()), expected_keys)

    expected_params = [(3, 2), (3, 4), (4, 2), (4, 4)]
    for params in searcher.cv_results_['params']:
        m = params['max_depth']
        n = params['n_estimators']
        self.assertIn((m, n), expected_params)
        expected_params.remove((m, n))
    self.assertEqual(len(expected_params), 0)

    self.assertTrue(hasattr(searcher, 'best_estimator_'))
    self.assertTrue(hasattr(searcher, 'best_score_'))
    self.assertTrue(hasattr(searcher, 'best_params_'))
    self.assertTrue(hasattr(searcher, 'best_index_'))
    self.assertTrue(hasattr(searcher, 'scorer_'))
    self.assertEqual(searcher.n_splits_, 5)
def test_multivariate_no_intercept(self):
    """Tests fit() and predict(), multivariate, fit_intercept=False."""
    x_data = np.array([[1, 2], [2, 0], [3, 1], [4, 4], [5, 3]])
    y_data = np.array([2, 1, 1, 2, 4.5])

    bn, bm = 2, 2

    x = ds.array(x=x_data, block_size=(bn, bm))
    y = ds.array(x=y_data, block_size=(bn, 1))

    reg = LinearRegression(fit_intercept=False)
    reg.fit(x, y)
    self.assertTrue(
        np.allclose(reg.coef_.collect(), [0.48305085, 0.30367232]))
    self.assertTrue(np.allclose(reg.intercept_.collect(), 0))

    # Predict one sample
    x_test = np.array([3, 2])
    test_data = ds.array(x=x_test, block_size=(1, bm))
    pred = reg.predict(test_data).collect()
    self.assertTrue(np.allclose(pred, [2.05649718]))

    # Predict multiple samples
    x_test = np.array([[3, 2], [4, 4], [1, 3]])
    test_data = ds.array(x=x_test, block_size=(bn, bm))
    pred = reg.predict(test_data).collect()
    self.assertTrue(np.allclose(pred, [2.05649718, 3.14689266, 1.3940678]))
def test_shuffle_xy_sparse(self):
    """ Tests shuffle for given sparse x and sparse y, and random_state.
    Tests that the shuffled arrays contain the same rows as the original
    data, and that the position has changed for some row.
    """
    np.random.seed(0)
    x = sparse.random(8, 10, density=0.5).tocsr()
    x_ds = ds.array(x, (3, 5))
    y = sparse.random(8, 1, density=0.5).tocsr()
    y_ds = ds.array(y, (4, 1))

    shuffled_x, shuffled_y = shuffle(x_ds, y_ds, random_state=0)
    shuffled_x = shuffled_x.collect()
    shuffled_y = shuffled_y.collect()

    # Assert that at least one of the first 2 samples has changed
    self.assertFalse((x[0:2] != shuffled_x[0:2]).nnz == 0)

    # Assert that the shuffled data has the same shape.
    self.assertEqual(shuffled_x.shape, x.shape)
    self.assertEqual(shuffled_y.shape[0], y.shape[0])

    # Assert that all rows from x are found in the shuffled_x, and that
    # the same permutation has been used to shuffle x and y.
    for idx, x_row in enumerate(x):
        found = False
        for shuffled_idx, shuffle_x_row in enumerate(shuffled_x):
            if (shuffle_x_row != x_row).nnz == 0:  # If rows are equal
                found = True
                self.assertEqual(y[idx, 0], shuffled_y[shuffled_idx, 0])
                break
        self.assertTrue(found)
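# Background on the sparse equality idiom used above: comparing two CSR
# rows with `!=` yields a sparse boolean matrix whose nnz counts the
# positions that differ, so nnz == 0 means the rows are identical.
import scipy.sparse as sp

a = sp.csr_matrix([[0, 1.5, 0]])
b = sp.csr_matrix([[0, 1.5, 0]])
assert (a != b).nnz == 0  # no differing entries: the rows are equal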