def test_rr(app_inst: ArrayApplication):
    num_features = 13
    rs = np.random.RandomState(1337)
    real_theta = rs.random_sample(num_features)
    real_X, real_y = BimodalGaussian.get_dataset(100, num_features, p=0.5, theta=real_theta)
    extra_X, extra_y = BimodalGaussian.get_dataset(10, num_features, p=0.5, theta=real_theta)

    # Perturb some examples.
    extra_X = extra_X * rs.random_sample(np.prod(extra_X.shape)).reshape(extra_X.shape)
    extra_y = extra_y * rs.random_sample(extra_y.shape).reshape(extra_y.shape)
    real_X = np.concatenate([real_X, extra_X], axis=0)
    real_y = np.concatenate([real_y, extra_y], axis=0)

    X = app_inst.array(real_X, block_shape=(15, 5))
    y = app_inst.array(real_y, block_shape=(15,))
    theta = app_inst.ridge_regression(X, y, lamb=0.0)
    robust_theta = app_inst.ridge_regression(X, y, lamb=10000.0)

    # Generate a test set to evaluate robustness to outliers.
    test_X, test_y = BimodalGaussian.get_dataset(100, num_features, p=0.5, theta=real_theta)
    test_X = app_inst.array(test_X, block_shape=(15, 5))
    test_y = app_inst.array(test_y, block_shape=(15,))
    theta_error = np.sum((((test_X @ theta) - test_y) ** 2).get())
    robust_theta_error = np.sum((((test_X @ robust_theta) - test_y) ** 2).get())
    assert robust_theta_error < theta_error

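# For reference, a minimal NumPy sketch of the closed-form solution the test
# above exercises. This assumes app_inst.ridge_regression minimizes the usual
# penalized least-squares objective; the helper below is illustrative, not the
# library's implementation.
def _ridge_closed_form(X: np.ndarray, y: np.ndarray, lamb: float) -> np.ndarray:
    # Minimizes ||X @ theta - y||^2 + lamb * ||theta||^2 via the normal
    # equations (X^T X + lamb * I) theta = X^T y. With lamb = 0 this is
    # ordinary least squares; a large lamb shrinks theta toward zero, which is
    # why robust_theta above is less sensitive to the perturbed examples.
    return np.linalg.solve(X.T @ X + lamb * np.eye(X.shape[1]), X.T @ y)
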
def test_stats(app_inst: ArrayApplication):
    real_X, _ = BimodalGaussian.get_dataset(3, 2)
    X = app_inst.array(real_X, block_shape=(2, 1))
    assert np.allclose(app_inst.mean(X, axis=0).get(), np.mean(real_X, axis=0))
    assert np.allclose(app_inst.std(X, axis=1).get(), np.std(real_X, axis=1))

    real_X, _ = BimodalGaussian.get_dataset(100, 9)
    X = app_inst.array(real_X, block_shape=(10, 2))
    assert np.allclose(app_inst.mean(X, axis=0).get(), np.mean(real_X, axis=0))
    assert np.allclose(app_inst.std(X, axis=1).get(), np.std(real_X, axis=1))

def test_logistic(nps_app_inst: ArrayApplication):
    num_samples, num_features = 1000, 10
    real_X, real_y = BimodalGaussian.get_dataset(num_samples, num_features)
    X = nps_app_inst.array(real_X, block_shape=(100, 3))
    y = nps_app_inst.array(real_y, block_shape=(100,))
    param_set = [
        {"solver": "gd", "lr": 1e-6, "tol": 1e-8, "max_iter": 10},
        {"solver": "sgd", "lr": 1e-6, "tol": 1e-8, "max_iter": 10},
        {"solver": "block_sgd", "lr": 1e-6, "tol": 1e-8, "max_iter": 10},
        {"solver": "newton", "tol": 1e-8, "max_iter": 10},
        {"solver": "irls", "tol": 1e-8, "max_iter": 10},
    ]
    for kwargs in param_set:
        runtime = time.time()
        lr_model: LogisticRegression = LogisticRegression(**kwargs)
        lr_model.fit(X, y)
        runtime = time.time() - runtime
        y_pred = lr_model.predict(X).get()
        y_pred_proba = lr_model.predict_proba(X).get()
        # Class probabilities must sum to 1; assert so the check actually gates the test.
        assert np.allclose(
            np.ones(shape=(y.shape[0],)), y_pred_proba[:, 0] + y_pred_proba[:, 1]
        )
        print("opt", kwargs["solver"])
        print("runtime", runtime)
        print("norm", lr_model.grad_norm_sq(X, y).get())
        print("objective", lr_model.objective(X, y).get())
        print("accuracy", np.sum(y.get() == y_pred) / num_samples)

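# The proba check above relies on the two columns of predict_proba being
# complementary. For a binary model that is [1 - p, p] with
# p = sigmoid(X @ beta + beta0); a sketch of that convention, assuming it is
# what LogisticRegression.predict_proba returns (beta and beta0 correspond to
# the fitted model._beta and model._beta0 used elsewhere in these tests):
def _predict_proba(X: np.ndarray, beta: np.ndarray, beta0: float) -> np.ndarray:
    p = 1.0 / (1.0 + np.exp(-(X @ beta + beta0)))  # P(y = 1 | x)
    return np.stack([1.0 - p, p], axis=1)          # columns sum to 1 by construction
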
def test_transpose(app_inst: ArrayApplication):
    real_X, _ = BimodalGaussian.get_dataset(100, 9)
    X = app_inst.array(real_X, block_shape=(100, 1))
    assert np.allclose(X.T.get(), real_X.T)
    # Identity.
    assert np.allclose(X.T.T.get(), X.get())
    assert np.allclose(X.T.T.get(), real_X)

def test_sklearn_linear_regression(nps_app_inst: ArrayApplication):
    from sklearn.linear_model import LinearRegression as SKLinearRegression

    _, num_features = 1000, 10
    rs = np.random.RandomState(1337)
    real_theta = rs.random_sample(num_features)
    real_X, real_y = BimodalGaussian.get_dataset(233, num_features, theta=real_theta)
    X = nps_app_inst.array(real_X, block_shape=(100, 3))
    y = nps_app_inst.array(real_y, block_shape=(100,))
    param_set = [
        {"solver": "newton-cg", "tol": 1e-8, "max_iter": 10},
    ]
    for kwargs in param_set:
        lr_model: LinearRegression = LinearRegression(**kwargs)
        lr_model.fit(X, y)
        y_pred = lr_model.predict(X).get()
        sk_lr_model = SKLinearRegression()
        sk_lr_model.fit(real_X, real_y)
        sk_y_pred = sk_lr_model.predict(real_X)
        # Assert so the comparison against scikit-learn actually gates the test.
        assert np.allclose(sk_y_pred, y_pred)

def test_concatenate(app_inst: ArrayApplication):
    axis = 1
    real_X, _ = BimodalGaussian.get_dataset(1000, 9)
    real_ones = np.ones(shape=(1000, 1))
    X = app_inst.array(real_X, block_shape=(100, 9))
    ones = app_inst.ones((1000, 1), (100, 1), dtype=X.dtype)
    X_concated = app_inst.concatenate(
        [X, ones], axis=axis, axis_block_size=X.block_shape[axis]
    )
    common.check_block_integrity(X_concated)
    real_X_concated = np.concatenate([real_X, real_ones], axis=axis)
    assert np.allclose(X_concated.get(), real_X_concated)

    real_X2 = np.random.random_sample(1000 * 17).reshape(1000, 17)
    X2 = app_inst.array(real_X2, block_shape=(X.block_shape[0], 3))
    X_concated = app_inst.concatenate(
        [X, ones, X2], axis=axis, axis_block_size=X.block_shape[axis]
    )
    common.check_block_integrity(X_concated)
    real_X_concated = np.concatenate([real_X, real_ones, real_X2], axis=axis)
    assert np.allclose(X_concated.get(), real_X_concated)

    y1 = app_inst.zeros(shape=(50,), block_shape=(10,), dtype=int)
    y2 = app_inst.ones(shape=(50,), block_shape=(10,), dtype=int)
    y = app_inst.concatenate([y1, y2], axis=0)
    common.check_block_integrity(y)

def test_penalties(nps_app_inst: ArrayApplication):
    num_samples, num_features = 1000, 10
    real_X, real_y = BimodalGaussian.get_dataset(num_samples, num_features)
    X = nps_app_inst.array(real_X, block_shape=(100, 3))
    y = nps_app_inst.array(real_y, block_shape=(100,))
    param_set = [
        {"solver": "gd", "lr": 1e-6, "tol": 1e-8, "max_iter": 10},
        {"solver": "sgd", "lr": 1e-6, "tol": 1e-8, "max_iter": 10},
        {"solver": "block_sgd", "lr": 1e-6, "tol": 1e-8, "max_iter": 10},
        {"solver": "newton", "tol": 1e-8, "max_iter": 10},
        {"solver": "lbfgs", "tol": 1e-8, "max_iter": 20},
    ]
    for kwargs in param_set:
        model: Lasso = Lasso(alpha=0.5, **kwargs)
        model.fit(X, y)
        if kwargs["solver"] in ("newton", "lbfgs"):
            assert model.deviance_sqr(X, y) > 0.9

        model: Ridge = Ridge(alpha=0.5, **kwargs)
        model.fit(X, y)
        if kwargs["solver"] in ("newton", "lbfgs"):
            assert model.deviance_sqr(X, y) > 0.9

        model: ElasticNet = ElasticNet(alpha=0.5, l1_ratio=0.5, **kwargs)
        model.fit(X, y)
        if kwargs["solver"] in ("newton", "lbfgs"):
            assert model.deviance_sqr(X, y) > 0.9

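# deviance_sqr above plays the role of R^2 for GLMs: 1 - deviance(model) /
# deviance(null model). For plain least squares it reduces to the familiar
# coefficient of determination; a sketch of that reduced form (the general
# GLM deviance version is assumed, not shown here):
def _r_squared(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    ss_res = np.sum((y_true - y_pred) ** 2)           # residual sum of squares
    ss_tot = np.sum((y_true - np.mean(y_true)) ** 2)  # total sum of squares
    return 1.0 - ss_res / ss_tot
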
def test_sklearn_logistic_regression(nps_app_inst: ArrayApplication):
    from sklearn.linear_model import LogisticRegression as SKLogisticRegression

    num_samples, num_features = 1000, 10
    real_X, real_y = BimodalGaussian.get_dataset(num_samples, num_features)
    X = nps_app_inst.array(real_X, block_shape=(100, 3))
    y = nps_app_inst.array(real_y, block_shape=(100,))
    param_set = [
        {"solver": "newton-cg", "tol": 1e-8, "max_iter": 10},
    ]
    for kwargs in param_set:
        runtime = time.time()
        lr_model: LogisticRegression = LogisticRegression(**kwargs)
        lr_model.fit(X, y)
        runtime = time.time() - runtime
        y_pred = lr_model.predict(X).get()
        y_pred_proba = lr_model.predict_proba(X).get()
        # Assert the probability and prediction checks so they gate the test.
        assert np.allclose(
            np.ones(shape=(y.shape[0],)), y_pred_proba[:, 0] + y_pred_proba[:, 1]
        )
        sk_lr_model = SKLogisticRegression(**kwargs)
        sk_lr_model.fit(real_X, real_y)
        sk_y_pred = sk_lr_model.predict(real_X)
        sk_y_pred_proba = sk_lr_model.predict_proba(real_X)
        assert np.allclose(
            np.ones(shape=(y.shape[0],)), sk_y_pred_proba[:, 0] + sk_y_pred_proba[:, 1]
        )
        assert np.allclose(sk_y_pred, y_pred)

def test_lr(nps_app_inst: ArrayApplication):
    num_samples, num_features = 1000, 10
    rs = np.random.RandomState(1337)
    real_theta = rs.random_sample(num_features)
    real_X, real_y = BimodalGaussian.get_dataset(233, num_features, theta=real_theta)
    X = nps_app_inst.array(real_X, block_shape=(100, 3))
    y = nps_app_inst.array(real_y, block_shape=(100,))
    param_set = [
        {"solver": "gd", "lr": 1e-6, "tol": 1e-8, "max_iter": 100},
        {"solver": "newton", "tol": 1e-8, "max_iter": 10},
    ]
    for kwargs in param_set:
        runtime = time.time()
        model: LinearRegression = LinearRegression(**kwargs)
        model.fit(X, y)
        assert model._beta.shape == real_theta.shape and model._beta0.shape == ()
        runtime = time.time() - runtime
        y_pred = model.predict(X).get()
        print("opt", kwargs["solver"])
        print("runtime", runtime)
        print("norm", model.grad_norm_sq(X, y).get())
        print("objective", model.objective(X, y).get())
        print("error", np.sum((y.get() - y_pred) ** 2) / num_samples)
        print("D^2", model.deviance_sqr(X, y).get())

    # Test if integer array arguments will converge properly.
    X = nps_app_inst.array([[1, 2], [3, 5], [1, 5]], block_shape=(2, 2))
    y = nps_app_inst.array([1, 2, 3], block_shape=(2,))
    model: LinearRegression = LinearRegression()
    model.fit(X, y)
    try:
        pred = model.predict([1, 2]).get()
        assert 0.9 < pred < 1.1
    except OverflowError:
        assert False, "LinearRegression overflows with integer array arguments."

def test_lr(app_inst: ArrayApplication):
    num_features = 13
    rs = np.random.RandomState(1337)
    for dtype in (np.float32, np.float64):
        real_theta = rs.random_sample(num_features).astype(dtype)
        real_X, real_y = BimodalGaussian.get_dataset(233, num_features, theta=real_theta)
        real_X = real_X.astype(dtype)
        real_y = real_y.astype(dtype)
        X = app_inst.array(real_X, block_shape=(15, 5))
        y = app_inst.array(real_y, block_shape=(15,))

        # Direct TSQR LR.
        theta = app_inst.linear_regression(X, y)
        error = app_inst.sum((((X @ theta) - y) ** 2)).get()
        if dtype == np.float64:
            assert np.allclose(0, error), error
        else:
            # Need to account for lower precision.
            assert np.allclose(0, error, rtol=1.0e-4, atol=1.0e-4), error

        # Fast LR.
        theta = app_inst.fast_linear_regression(X, y)
        error = app_inst.sum((((X @ theta) - y) ** 2)).get()
        if dtype == np.float64:
            assert np.allclose(0, error), error
        else:
            # Need to account for lower precision.
            assert np.allclose(0, error, rtol=1.0e-4, atol=1.0e-4), error

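# What the "Direct TSQR LR" path above amounts to, sketched in plain NumPy.
# This assumes the standard least-squares-via-QR route (X = QR, then solve
# R theta = Q^T y); the library's solver is distributed, so this helper is
# illustrative only.
def _lstsq_via_qr(X: np.ndarray, y: np.ndarray) -> np.ndarray:
    q, r = np.linalg.qr(X)  # reduced QR: q is (m, d), r is (d, d) upper triangular
    return np.linalg.solve(r, q.T @ y)
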
def test_qr(app_inst: ArrayApplication):
    real_X, _ = BimodalGaussian.get_dataset(2345, 9)
    X = app_inst.array(real_X, block_shape=(123, 4))
    Q, R = app_inst.indirect_tsqr(X)
    assert np.allclose(Q.get() @ R.get(), real_X)
    Q, R = app_inst.direct_tsqr(X)
    assert np.allclose(Q.get() @ R.get(), real_X)

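# A minimal single-machine sketch of the tall-and-skinny QR (TSQR) idea the
# test above exercises: factor each row block independently, QR the stacked
# R factors, then push the second-stage Q back into each local Q. The real
# implementation distributes these steps; this helper is illustrative only.
def _tsqr(X: np.ndarray, block_rows: int):
    blocks = [X[i : i + block_rows] for i in range(0, X.shape[0], block_rows)]
    qs, rs = zip(*(np.linalg.qr(b) for b in blocks))
    # Stack the small R factors and factor them once more.
    q2, r = np.linalg.qr(np.concatenate(rs, axis=0))
    # Recover the full Q block by block: Q_i @ (rows of q2 matching R_i).
    out, offset = [], 0
    for qi, ri in zip(qs, rs):
        out.append(qi @ q2[offset : offset + ri.shape[0]])
        offset += ri.shape[0]
    return np.concatenate(out, axis=0), r
    # e.g. Q, R = _tsqr(real_X, 123) satisfies np.allclose(Q @ R, real_X).
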
def test_pca(app_inst: ArrayApplication):
    real_X, _ = BimodalGaussian.get_dataset(2345, 9)
    X = app_inst.array(real_X, block_shape=(123, 4))
    # Covariance matrix test.
    C = app_inst.cov(X, rowvar=False)
    V, _, VT = linalg.svd(app_inst, C)
    assert app_inst.allclose(V, VT.T)
    pc = X @ V
    assert app_inst.allclose(pc, linalg.pca(app_inst, X))

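# The same construction in plain NumPy, for reference. Because the covariance
# matrix C is symmetric positive semi-definite, its SVD coincides with its
# eigendecomposition, which is why the test can assert V == VT.T. Whether
# linalg.pca centers X internally is an assumption not verified here.
def _pca_via_cov(X: np.ndarray) -> np.ndarray:
    C = np.cov(X, rowvar=False)  # (d, d) covariance of the columns
    V, _, _ = np.linalg.svd(C)   # columns of V are the principal axes
    return X @ V                 # project samples onto the principal axes
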
def test_logistic(app_inst: ArrayApplication):
    num_samples, num_features = 1000, 10
    real_X, real_y = BimodalGaussian.get_dataset(num_samples, num_features)
    X = app_inst.array(real_X, block_shape=(100, 3))
    y = app_inst.array(real_y, block_shape=(100,))
    opt_param_set = [
        ("gd", {"lr": 1e-6, "tol": 1e-8, "max_iter": 10}),
        ("block_sync_sgd", {"lr": 1e-6, "tol": 1e-8, "max_iter": 10}),
        ("block_async_sgd", {"lr": 1e-6, "tol": 1e-8, "max_iter": 10}),
        ("newton", {"tol": 1e-8, "max_iter": 10}),
        ("irls", {"tol": 1e-8, "max_iter": 10}),
    ]
    for opt, opt_params in opt_param_set:
        runtime = time.time()
        lr_model: LogisticRegression = LogisticRegression(app_inst, opt, opt_params)
        lr_model.fit(X, y)
        runtime = time.time() - runtime
        y_pred = (lr_model.predict(X).get() > 0.5).astype(int)
        print("opt", opt)
        print("runtime", runtime)
        print("norm", lr_model.grad_norm_sq(X, y).get())
        print("objective", lr_model.objective(X, y).get())
        print("accuracy", np.sum(y.get() == y_pred) / num_samples)

def test_reshape(app_inst: ArrayApplication):
    real_X, _ = BimodalGaussian.get_dataset(1000, 9)
    X = app_inst.array(real_X, block_shape=(100, 9))
    X = X.reshape(shape=(1000, 9), block_shape=(1000, 1))
    assert np.allclose(X.get(), real_X)

def test_svd(app_inst: ArrayApplication):
    real_X, _ = BimodalGaussian.get_dataset(2345, 9)
    X = app_inst.array(real_X, block_shape=(123, 4))
    U, S, VT = app_inst.svd(X)
    assert np.allclose((U.get() * S.get()) @ VT.get(), real_X)

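# The reconstruction above uses broadcasting: (U * S) @ VT is equivalent to
# U @ np.diag(S) @ VT. One common way to compute a tall-and-skinny SVD, and
# plausibly what app_inst.svd builds on (an assumption, not confirmed by this
# file), is to SVD the small R factor from TSQR:
def _tall_skinny_svd(X: np.ndarray, block_rows: int = 123):
    q, r = _tsqr(X, block_rows)    # TSQR sketch defined after test_qr above
    u_r, s, vt = np.linalg.svd(r)  # SVD of the small (d, d) factor
    return q @ u_r, s, vt          # X = (Q U_r) diag(S) VT
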
def test_matmul(app_inst: ArrayApplication):
    real_X, _ = BimodalGaussian.get_dataset(100, 9)
    X = app_inst.array(real_X, block_shape=(100, 1))
    X_sqr = X.T @ X
    assert np.allclose(X_sqr.get(), real_X.T @ real_X)

def test_logistic_cv(nps_app_inst: ArrayApplication):
    num_samples, num_features = 1000, 10
    num_bad = 100
    block_shape = (200, 10)
    folds = num_samples // block_shape[0]
    rs = np.random.RandomState(1337)
    real_X, real_y = BimodalGaussian.get_dataset(
        num_samples - num_bad, num_features, p=0.5
    )
    extra_X, extra_y = BimodalGaussian.get_dataset(num_bad, num_features, p=0.5)

    # Perturb some examples and shuffle them into the dataset.
    extra_X = extra_X * rs.random_sample(np.prod(extra_X.shape)).reshape(extra_X.shape)
    extra_y = rs.randint(0, 2, extra_y.shape).reshape(extra_y.shape)
    perm = rs.permutation(np.arange(num_samples))
    real_X = np.concatenate([real_X, extra_X], axis=0)[perm]
    real_y = np.concatenate([real_y, extra_y], axis=0)[perm]

    X = nps_app_inst.array(real_X, block_shape=block_shape)
    y = nps_app_inst.array(real_y, block_shape=(block_shape[0],))
    param_set = [
        {"solver": "newton", "tol": 1e-8, "max_iter": 10},
        {"solver": "newton", "penalty": "l2", "C": 1.0 / 0.1, "tol": 1e-8, "max_iter": 10},
        {"solver": "newton", "penalty": "l2", "C": 1.0 / 0.2, "tol": 1e-8, "max_iter": 10},
        {"solver": "newton", "penalty": "l2", "C": 1.0 / 0.4, "tol": 1e-8, "max_iter": 10},
        {"solver": "newton", "penalty": "l2", "C": 1.0 / 0.8, "tol": 1e-8, "max_iter": 10},
    ]
    X_train = nps_app_inst.empty(
        (num_samples - X.block_shape[0], num_features), X.block_shape, X.dtype
    )
    y_train = nps_app_inst.empty(
        (num_samples - y.block_shape[0],), y.block_shape, y.dtype
    )
    num_hps = len(param_set)
    mean_accuracies = nps_app_inst.empty((num_hps,), (num_hps,))
    for i, kwargs in enumerate(param_set):
        accuracies = nps_app_inst.empty((folds,), (folds,))
        for fold in range(folds):
            print(i, fold)
            pos = X.block_shape[0] * fold
            block_size, _ = X.grid.get_block_shape((fold, 0))
            start = pos
            stop = pos + block_size
            # Train on everything outside [start, stop); test on the held-out block.
            X_train[:start] = X[:start]
            X_train[start:] = X[stop:]
            y_train[:start] = y[:start]
            y_train[start:] = y[stop:]
            X_test, y_test = X[start:stop], y[start:stop]
            lr_model: LogisticRegression = LogisticRegression(**kwargs)
            lr_model.fit(X_train, y_train)
            y_pred = lr_model.predict(X_test)
            accuracies[fold] = nps_app_inst.sum(y_test == y_pred) / (stop - start)
        mean_accuracies[i] = nps_app_inst.mean(accuracies)
    print(mean_accuracies.get())

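# The CV loop above carves contiguous row blocks out of X and trains on the
# complement of each block. The same leave-one-block-out index arithmetic on
# plain NumPy arrays, for reference (illustrative helper, not library API):
def _block_folds(n_rows: int, block: int):
    for fold, start in enumerate(range(0, n_rows, block)):
        stop = min(start + block, n_rows)
        train = np.r_[0:start, stop:n_rows]  # all rows outside the fold
        test = np.arange(start, stop)
        yield fold, train, test
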
def test_log(app_inst: ArrayApplication):
    real_X, _ = BimodalGaussian.get_dataset(100, 9)
    X = app_inst.array(real_X, block_shape=(10, 2))
    assert np.allclose(app_inst.log(X).get(), np.log(real_X))