def test_weighted_ridge(datatype, algorithm, fit_intercept, normalize,
                        distribution):
    """Compare cuML and scikit-learn Ridge predictions under sample weights.

    A synthetic regression problem is built with ``make_regression_dataset``,
    one random weight is drawn per training sample, and both estimators are
    fitted with those weights.  The test passes when ``array_equal`` accepts
    the two prediction vectors (third argument ``1e-1``, ``with_sign=True``).

    Parameters
    ----------
    datatype
        Forwarded unchanged to ``make_regression_dataset``.
    algorithm
        Passed to cuML Ridge as ``solver``.
    fit_intercept, normalize
        Passed to both the cuML and the scikit-learn estimator.
    distribution
        ``"uniform"`` and ``"exponential"`` select the matching NumPy
        sampler; any other value falls back to a log-normal draw.
    """
    n_samples, n_features, n_informative = 1000, 20, 10
    weight_cap = 10
    noise_level = 20

    X_train, X_test, y_train, y_test = make_regression_dataset(
        datatype, n_samples, n_features, n_informative, noise=noise_level
    )

    # One weight per training sample.  For the uniform case the weights are
    # integers starting at 1; ``high`` is exclusive in ``np.random.randint``,
    # so ``weight_cap`` itself is never drawn.
    n_train = len(X_train)
    if distribution == "uniform":
        sample_weights = np.random.randint(1, high=weight_cap, size=n_train)
    elif distribution == "exponential":
        sample_weights = np.random.exponential(size=n_train)
    else:
        sample_weights = np.random.lognormal(size=n_train)

    # Options shared by both implementations.
    shared_options = {"fit_intercept": fit_intercept, "normalize": normalize}

    # cuML ridge model: fit with weights, then predict on the held-out rows.
    gpu_model = cuRidge(solver=algorithm, **shared_options)
    gpu_model.fit(X_train, y_train, sample_weight=sample_weights)
    gpu_prediction = gpu_model.predict(X_test)

    # scikit-learn ridge model used as the reference.
    cpu_model = skRidge(**shared_options)
    cpu_model.fit(X_train, y_train, sample_weight=sample_weights)
    cpu_prediction = cpu_model.predict(X_test)

    assert array_equal(cpu_prediction, gpu_prediction, 1e-1, with_sign=True)
def test_ridge(datatype, X_type, y_type, algorithm):
    """Check cuML Ridge against scikit-learn Ridge on a tiny fixed dataset.

    The targets are an exact linear combination of the two feature columns
    (coefficients 5 and 10).  Both models are fitted without intercept and
    without normalization, then asked to predict two query rows.

    Parameters
    ----------
    datatype
        NumPy dtype used for every array built in the test.
    X_type
        ``'dataframe'`` feeds the features to cuML as a ``cudf.DataFrame``;
        ``'ndarray'`` feeds the NumPy array directly.  Any other value
        leaves the cuML model unfitted.
    y_type
        Not used by the body.
    algorithm
        Passed to cuML Ridge as ``solver``.
    """
    features = np.array(
        [[2.0, 5.0], [6.0, 9.0], [2.0, 2.0], [2.0, 3.0]], dtype=datatype
    )
    true_coefficients = np.array([5.0, 10.0]).astype(datatype)
    targets = np.dot(features, true_coefficients)

    query_rows = np.array([[3.0, 5.0], [2.0, 5.0]]).astype(datatype)

    # Reference model.
    cpu_model = skRidge(fit_intercept=False, normalize=False)
    cpu_model.fit(features, targets)

    # Model under test.
    gpu_model = cuRidge(fit_intercept=False, normalize=False,
                        solver=algorithm)

    if X_type == 'dataframe':
        # Same values as the columns of ``features``, spelled out per column.
        gpu_frame = cudf.DataFrame()
        gpu_frame['0'] = np.asarray([2, 6, 2, 2], dtype=datatype)
        gpu_frame['1'] = np.asarray([5, 9, 2, 3], dtype=datatype)
        gpu_model.fit(gpu_frame, targets)
    elif X_type == 'ndarray':
        gpu_model.fit(features, targets)

    expected = cpu_model.predict(query_rows)
    actual = gpu_model.predict(query_rows).to_array()

    assert array_equal(expected, actual, 1e-3, with_sign=True)
def test_ridge_regression_model(datatype, algorithm, nrows, column_info):
    """Compare cuML Ridge with scikit-learn Ridge on generated data.

    Parameters
    ----------
    datatype
        Forwarded unchanged to ``make_regression_dataset``.
    algorithm
        Passed to cuML Ridge as ``solver``.  The combination ``"svd"`` with
        more than 46340 rows is skipped.
    nrows
        Number of rows requested from ``make_regression_dataset``.  The
        scikit-learn comparison only runs when this is below 500000.
    column_info
        Two-item sequence ``(ncols, n_info)`` forwarded to
        ``make_regression_dataset``.
    """
    if algorithm == "svd" and nrows > 46340:
        # The adjacent literals are concatenated by the parser, so each
        # piece needs its own trailing space to keep the words apart.
        pytest.skip(
            "svd solver is not supported for the data that has more "
            "than 46340 rows or columns if you are using CUDA version "
            "10.x"
        )

    ncols, n_info = column_info
    X_train, X_test, y_train, y_test = make_regression_dataset(
        datatype, nrows, ncols, n_info
    )

    # cuML ridge regression model: initialize, fit and predict.
    curidge = cuRidge(fit_intercept=False, normalize=False, solver=algorithm)
    curidge.fit(X_train, y_train)
    curidge_predict = curidge.predict(X_test)

    if nrows < 500000:
        # scikit-learn ridge regression model: initialize, fit and predict.
        skridge = skRidge(fit_intercept=False, normalize=False)
        skridge.fit(X_train, y_train)
        skridge_predict = skridge.predict(X_test)

        assert array_equal(skridge_predict, curidge_predict,
                           1e-1, with_sign=True)
def test_ridge_regression_model(datatype, algorithm, nrows, column_info):
    """Fit cuML Ridge on generated data and compare it with scikit-learn.

    Parameters
    ----------
    datatype
        Forwarded unchanged to ``make_regression_dataset``.
    algorithm
        Passed to cuML Ridge as ``solver``.
    nrows
        Number of rows requested from ``make_regression_dataset``.  When it
        is not below 500000 the scikit-learn comparison is skipped and only
        the cuML fit/predict is exercised.
    column_info
        Two-item sequence ``(ncols, n_info)`` forwarded to
        ``make_regression_dataset``.
    """
    n_features, n_informative = column_info
    X_train, X_test, y_train, _ = make_regression_dataset(
        datatype, nrows, n_features, n_informative
    )

    # Model under test: no intercept, no normalization.
    gpu_model = cuRidge(fit_intercept=False, normalize=False,
                        solver=algorithm)
    gpu_model.fit(X_train, y_train)
    gpu_prediction = gpu_model.predict(X_test)

    # Above the row limit there is no reference comparison.
    if not nrows < 500000:
        return

    # Reference model with matching options.
    cpu_model = skRidge(fit_intercept=False, normalize=False)
    cpu_model.fit(X_train, y_train)
    cpu_prediction = cpu_model.predict(X_test)

    assert array_equal(cpu_prediction, gpu_prediction, 1e-1, with_sign=True)
def test_ridge_predict_convert_dtype(train_dtype, test_dtype):
    """Exercise cuML Ridge ``predict`` with a dtype that differs from training.

    The model is fitted on data cast to ``train_dtype`` and then asked to
    predict on held-out rows cast to ``test_dtype``.  Nothing is asserted
    about the values; the test only requires that ``predict`` completes.

    Parameters
    ----------
    train_dtype
        dtype applied to the features and targets before fitting.
    test_dtype
        dtype applied to the held-out features before predicting.
    """
    features, targets = make_regression(
        n_samples=50, n_features=10, n_informative=5, random_state=0
    )
    features = features.astype(train_dtype)
    targets = targets.astype(train_dtype)

    X_train, X_test, y_train, _ = train_test_split(
        features, targets, train_size=0.8, random_state=0
    )

    model = cuRidge()
    model.fit(X_train, y_train)

    # The held-out rows are deliberately re-cast before prediction.
    model.predict(X_test.astype(test_dtype))
def test_ridge_regression_model_default(datatype):
    """Compare default-constructed cuML and scikit-learn Ridge models.

    Both estimators are created without arguments, fitted on the data
    returned by ``small_regression_dataset`` and compared on its test split
    via ``array_equal`` (third argument ``1e-1``, ``with_sign=True``).

    Parameters
    ----------
    datatype
        Forwarded unchanged to ``small_regression_dataset``.
    """
    X_train, X_test, y_train, _ = small_regression_dataset(datatype)

    # Model under test, all defaults.
    gpu_model = cuRidge()
    gpu_model.fit(X_train, y_train)
    gpu_prediction = gpu_model.predict(X_test)

    # Reference model, all defaults.
    cpu_model = skRidge()
    cpu_model.fit(X_train, y_train)
    cpu_prediction = cpu_model.predict(X_test)

    assert array_equal(cpu_prediction, gpu_prediction, 1e-1, with_sign=True)
# NOTE(review): this fragment starts mid-script.  ``X_train``, ``args`` and
# ``regularizer`` come from code that is not visible here, and the first
# statement may belong to a conditional above it -- confirm before moving.
X_train = X_train.toarray()

# Keep a NumPy view of the training data next to the (possibly non-NumPy)
# original.  ``isinstance`` also accepts ndarray subclasses, which have no
# ``toarray`` method to call.
if not isinstance(X_train, np.ndarray):
    X_train_np = X_train.toarray()
else:
    X_train_np = X_train

if args.densify_all:
    X_train = X_train_np

# Build the reference estimator (``sk``) and the cuML estimator (``cu``)
# for the requested test.
if args.test == 'ridge':
    sk = Ridge(fit_intercept=False, alpha=regularizer,
               max_iter=1000000, tol=1e-06)
    cu = cuRidge(fit_intercept=False, alpha=regularizer, solver='eig')
elif args.test == 'lasso':
    # Both sides receive the regularizer divided by the number of rows.
    sk = Lasso(fit_intercept=False, alpha=regularizer / X_train.shape[0])
    cu = cuLasso(fit_intercept=False, alpha=regularizer / X_train.shape[0])
elif args.test == 'logistic':
    # The cuML side receives C multiplied by the number of rows while the
    # reference side uses it unscaled -- presumably to reconcile differing
    # objective scaling between the two libraries; TODO confirm.
    sk = Logistic(fit_intercept=False, C=regularizer,
                  dual=True, solver='liblinear')
    cu = cuLogistic(fit_intercept=False, C=regularizer * X_train.shape[0],
                    max_iter=100000, tol=1e-8)
else:
    # Raising a bare string is itself a TypeError; use a real exception so
    # the intended message reaches the user.
    raise ValueError("Invalid test")
def test_linear_models(datatype, X_type, y_type, algorithm,
                       nrows, ncols, n_info):
    """Compare cuML linear and ridge regression with scikit-learn.

    Data from ``make_regression`` is split 80/20 by row position.  The cuML
    models are fitted either on cuDF objects or on NumPy arrays, depending
    on ``X_type``, and their predictions are compared with the scikit-learn
    equivalents when ``nrows`` is below 500000.

    Parameters
    ----------
    datatype
        dtype applied to the train/test arrays.
    X_type
        ``'dataframe'`` routes the data through pandas and cuDF;
        ``'ndarray'`` uses the NumPy arrays directly.
    y_type
        Not used by the body.
    algorithm
        Passed to cuML LinearRegression as ``algorithm`` and to cuML Ridge
        as ``solver``.
    nrows, ncols, n_info
        Forwarded to ``make_regression`` as ``n_samples``, ``n_features``
        and ``n_informative``.
    """

    def fit_then_predict(model, train_X, train_y, new_X):
        # Fit a cuML model and return its predictions as a host array.
        model.fit(train_X, train_y)
        return model.predict(new_X).to_array()

    train_rows = np.int32(nrows * 0.8)
    X, y = make_regression(n_samples=(nrows), n_features=ncols,
                           n_informative=n_info, random_state=0)

    # Positional split: first ``train_rows`` rows train, the rest test.
    X_test = np.asarray(X[train_rows:, 0:]).astype(datatype)
    X_train = np.asarray(X[0:train_rows, :]).astype(datatype)
    y_train = np.asarray(y[0:train_rows, ]).astype(datatype)

    # cuML linear and ridge regression models.
    cuols = cuLinearRegression(fit_intercept=True, normalize=False,
                               algorithm=algorithm)
    curidge = cuRidge(fit_intercept=False, normalize=False,
                      solver=algorithm)

    if X_type == 'dataframe':
        # From here on the host-side data lives in pandas frames; these are
        # also what scikit-learn is fitted on further down.
        y_train = pd.DataFrame({'labels': y_train[0:, ]})
        X_train = pd.DataFrame(
            {'fea%d' % i: X_train[0:, i] for i in range(X_train.shape[1])}
        )
        X_test = pd.DataFrame(
            {'fea%d' % i: X_test[0:, i] for i in range(X_test.shape[1])}
        )

        X_cudf = cudf.DataFrame.from_pandas(X_train)
        X_cudf_test = cudf.DataFrame.from_pandas(X_test)
        # Single label column flattened into a cuDF series.
        y_cudf = cudf.Series(y_train.values[:, 0])

        cuols_predict = fit_then_predict(cuols, X_cudf, y_cudf, X_cudf_test)
        curidge_predict = fit_then_predict(curidge, X_cudf, y_cudf,
                                           X_cudf_test)

    elif X_type == 'ndarray':
        cuols_predict = fit_then_predict(cuols, X_train, y_train, X_test)
        curidge_predict = fit_then_predict(curidge, X_train, y_train, X_test)

    if nrows < 500000:
        # scikit-learn linear and ridge regression models with the same
        # options as their cuML counterparts.
        skols = skLinearRegression(fit_intercept=True, normalize=False)
        skols.fit(X_train, y_train)
        skridge = skRidge(fit_intercept=False, normalize=False)
        skridge.fit(X_train, y_train)

        skols_predict = skols.predict(X_test)
        skridge_predict = skridge.predict(X_test)

        assert array_equal(skols_predict, cuols_predict,
                           1e-1, with_sign=True)
        assert array_equal(skridge_predict, curidge_predict,
                           1e-1, with_sign=True)
import numpy as np
import cudf
from cuml import Ridge as cuRidge

# Demo: fit a cuML Ridge model on a four-row cuDF frame and print what it
# learned, then prepare a second frame of new rows.
# NOTE(review): the visible script stops after printing the new frame; any
# predict call is presumably in code beyond this point.

lr = cuRidge(alpha=1.0, fit_intercept=True, normalize=False, solver='eig')

# Training features, one float32 column at a time.
X = cudf.DataFrame()
for column_name, column_values in (('col1', [1, 1, 2, 2]),
                                   ('col2', [1, 2, 2, 3])):
    X[column_name] = np.array(column_values, dtype=np.float32)

print("\n\n***** Running fit *****\n")
print("Input Dataframe:")
print(X)

# Training labels.
label_values = np.array([6.0, 8.0, 9.0, 11.0], dtype=np.float32)
y = cudf.Series(label_values)

print("Input Labels:")
print(y)

reg = lr.fit(X, y)

print("Coefficients:")
print(reg.coef_)
print("intercept:")
print(reg.intercept_)

print("\n\n***** Running predict *****\n")

# New rows, built the same way as the training frame.
X_new = cudf.DataFrame()
for column_name, column_values in (('col1', [3, 2]),
                                   ('col2', [5, 5])):
    X_new[column_name] = np.array(column_values, dtype=np.float32)

print("Input Dataframe:")
print(X_new)