def test_sgd_default(dtype, datatype):
    """Check that a default-configured SGD separates two blobs.

    Parameters
    ----------
    dtype
        Dtype the generated features and labels are cast to.
    datatype : str
        ``"dataframe"`` feeds cuDF objects to the estimator; any other
        value feeds the NumPy arrays directly.
    """
    X, y = make_blobs(n_samples=100, n_features=3, centers=2, random_state=0)
    X = X.astype(dtype)
    y = y.astype(dtype)

    # Default loss is squared_loss, so encode the two classes as -1 / +1.
    y[y == 0] = -1

    # Seed the split so that a failing run can be reproduced on the exact
    # same train/test rows.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=0.8, random_state=0
    )

    if datatype == "dataframe":
        X_train = cudf.DataFrame(X_train)
        X_test = cudf.DataFrame(X_test)
        y_train = cudf.Series(y_train)

    cu_sgd = cumlSGD()
    cu_sgd.fit(X_train, y_train)
    cu_pred = cu_sgd.predict(X_test)

    # The prediction container is expected to mirror the input container.
    if datatype == "dataframe":
        assert isinstance(cu_pred, cudf.Series)
        cu_pred = cu_pred.to_numpy()
    else:
        assert isinstance(cu_pred, np.ndarray)

    # Adjust for squared loss (we don't need to test for high accuracy,
    # just that the loss function tended towards the expected classes).
    cu_pred[cu_pred < 0] = -1
    cu_pred[cu_pred >= 0] = 1

    assert np.array_equal(cu_pred, y_test)
def test_svd_default(datatype):
    """Smoke-test a default SGD estimator on a four-sample toy problem.

    The predictions are only printed; nothing is asserted about them.

    Parameters
    ----------
    datatype
        NumPy dtype used for the training and query arrays.
    """
    train_features = np.array(
        [[-1, -1], [-2, -1], [1, 1], [2, 1]], dtype=datatype
    )
    train_labels = np.array([1, 1, 2, 2], dtype=datatype)
    query_points = np.array([[3.0, 5.0], [2.0, 5.0]]).astype(datatype)

    model = cumlSGD()
    model.fit(train_features, train_labels)

    predicted = model.predict(query_points)
    cu_pred = predicted.to_array()

    print("cuML predictions : ", cu_pred)
def test_svd(datatype, lrate, input_type, penalty, loss, name):
    """Fit and predict with SGD on a named dataset; print the predictions.

    Nothing is asserted about the predictions.

    Parameters
    ----------
    datatype
        NumPy dtype the train/test arrays are created with.
    lrate
        Forwarded to the estimator as ``learning_rate``.
    input_type : str
        ``'dataframe'`` converts the inputs to cuDF objects before fitting;
        any other value passes the NumPy arrays through.
    penalty, loss
        Forwarded unchanged to the estimator.
    name : str
        ``'blobs'`` or ``'iris'`` selects that dataset (first 80% of rows
        for training, the rest for prediction); anything else uses a
        four-sample toy problem.
    """
    if name in ('blobs', 'iris'):
        if name == 'blobs':
            features, labels = make_blobs(n_samples=500000, n_features=1000,
                                          random_state=0)
        else:
            iris = datasets.load_iris()
            features, labels = iris.data, iris.target

        # Ordered (unshuffled) 80/20 split on the row axis.
        cut = int(np.shape(features)[0] * 0.8)
        X_test = np.array(features[cut:, :], dtype=datatype)
        X_train = np.array(features[:cut, :], dtype=datatype)
        y_train = np.array(labels[:cut], dtype=datatype)
    else:
        X_train = np.array([[-1, -1], [-2, -1], [1, 1], [2, 1]],
                           dtype=datatype)
        y_train = np.array([1, 1, 2, 2], dtype=datatype)
        X_test = np.array([[3.0, 5.0], [2.0, 5.0]]).astype(datatype)

    cu_sgd = cumlSGD(learning_rate=lrate, eta0=0.005, epochs=2000,
                     fit_intercept=True, batch_size=4096, tol=0.0,
                     penalty=penalty, loss=loss)

    if input_type == 'dataframe':

        def _as_cudf_frame(matrix):
            # One 'fea<i>' column per feature, routed through pandas.
            columns = {}
            for i in range(matrix.shape[1]):
                columns['fea%d' % i] = matrix[0:, i]
            return cudf.DataFrame.from_pandas(pd.DataFrame(columns))

        labels_pd = pd.DataFrame({'fea0': y_train[0:, ]})
        X_train = _as_cudf_frame(X_train)
        X_test = _as_cudf_frame(X_test)
        y_train = cudf.Series(labels_pd.values[:, 0])

    cu_sgd.fit(X_train, y_train)
    cu_pred = cu_sgd.predict(X_test).to_array()
    print("cuML predictions : ", cu_pred)
def test_sgd(dtype, lrate, penalty, loss, datatype):
    """Check that a configured SGD recovers the labels of two blobs.

    Parameters
    ----------
    dtype
        Dtype the generated features and labels are cast to.
    lrate
        Forwarded to the estimator as ``learning_rate``.
    penalty, loss
        Forwarded unchanged to the estimator. ``loss`` also selects how the
        labels are encoded and how raw predictions are thresholded.
    datatype : str
        ``"dataframe"`` feeds cuDF objects to the estimator; any other
        value feeds the NumPy arrays directly.
    """
    X, y = make_blobs(n_samples=100, n_features=3, centers=2, random_state=0)
    X = X.astype(dtype)
    y = y.astype(dtype)

    # hinge and squared_loss are tested against -1 / +1 labels; the other
    # losses keep the 0 / 1 labels produced by make_blobs.
    if loss in ("hinge", "squared_loss"):
        y[y == 0] = -1

    # Seed the split so that a failing run can be reproduced on the exact
    # same train/test rows.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=0.8, random_state=0
    )

    if datatype == "dataframe":
        X_train = cudf.DataFrame(X_train)
        X_test = cudf.DataFrame(X_test)
        y_train = cudf.Series(y_train)

    cu_sgd = cumlSGD(learning_rate=lrate, eta0=0.005, epochs=2000,
                     fit_intercept=True, batch_size=4096,
                     tol=0.0, penalty=penalty, loss=loss,
                     power_t=0.4)

    cu_sgd.fit(X_train, y_train)
    cu_pred = cu_sgd.predict(X_test)

    # The prediction container is expected to mirror the input container.
    if datatype == "dataframe":
        assert isinstance(cu_pred, cudf.Series)
        cu_pred = cu_pred.to_numpy()
    else:
        assert isinstance(cu_pred, np.ndarray)

    if loss == "log":
        cu_pred[cu_pred < 0.5] = 0
        cu_pred[cu_pred >= 0.5] = 1
    elif loss == "squared_loss":
        # Adjust for squared loss (we don't need to test for high accuracy,
        # just that the loss function tended towards the expected classes).
        cu_pred[cu_pred < 0] = -1
        cu_pred[cu_pred >= 0] = 1

    assert np.array_equal(cu_pred, y_test)
def test_svd(datatype, lrate, input_type, penalty, loss):
    """Fit and predict with SGD on a four-sample toy problem.

    The predictions are only printed; nothing is asserted about them.

    Parameters
    ----------
    datatype
        NumPy dtype used for features, labels and query points.
    lrate
        Forwarded to the estimator as ``learning_rate``.
    input_type : str
        ``'dataframe'`` wraps the inputs in cuDF objects before fitting;
        any other value passes the NumPy arrays through.
    penalty, loss
        Forwarded unchanged to the estimator.
    """
    X = np.array([[-1, -1], [-2, -1], [1, 1], [2, 1]], dtype=datatype)
    y = np.array([1, 1, 2, 2], dtype=datatype)
    pred_data = np.array([[3.0, 5.0], [2.0, 5.0]]).astype(datatype)

    if input_type == 'dataframe':
        # Build the cuDF inputs from the very same arrays so that both
        # input types exercise identical data and a single dtype. The
        # earlier hand-typed copy used different col2 values and always
        # cast the labels to float32.
        X = _columns_to_cudf(X)
        y = cudf.Series(y)
        pred_data = _columns_to_cudf(pred_data)

    cu_sgd = cumlSGD(learning_rate=lrate, eta0=0.005, epochs=2000,
                     fit_intercept=True, batch_size=2,
                     tol=0.0, penalty=penalty, loss=loss)

    cu_sgd.fit(X, y)
    cu_pred = cu_sgd.predict(pred_data).to_array()
    print("cuML predictions : ", cu_pred)


def _columns_to_cudf(matrix):
    """Return a cuDF frame with columns 'col1', 'col2', ... from *matrix*."""
    frame = cudf.DataFrame()
    for position in range(matrix.shape[1]):
        # Column slices of a C-ordered array are strided; hand cuDF a
        # contiguous copy.
        frame['col%d' % (position + 1)] = np.ascontiguousarray(
            matrix[:, position]
        )
    return frame
def test_svd(datatype, lrate, penalty, loss, name):
    """Fit and predict with SGD on a named dataset; print the predictions.

    Nothing is asserted about the predictions.

    Parameters
    ----------
    datatype
        NumPy dtype the data is cast to.
    lrate
        Forwarded to the estimator as ``learning_rate``.
    penalty, loss
        Forwarded unchanged to the estimator.
    name : str
        ``'blobs'`` or ``'iris'`` selects that dataset (split 80/20 with
        ``train_test_split``); anything else uses a four-sample toy
        problem.
    """
    if name in ('blobs', 'iris'):
        if name == 'blobs':
            features, labels = make_blobs(n_samples=500000, n_features=1000,
                                          random_state=0)
        else:
            iris = datasets.load_iris()
            features, labels = iris.data, iris.target

        features = features.astype(datatype)
        labels = labels.astype(datatype)
        X_train, X_test, y_train, y_test = train_test_split(
            features, labels, train_size=0.8)
    else:
        X_train = np.array([[-1, -1], [-2, -1], [1, 1], [2, 1]],
                           dtype=datatype)
        y_train = np.array([1, 1, 2, 2], dtype=datatype)
        X_test = np.array([[3.0, 5.0], [2.0, 5.0]]).astype(datatype)

    estimator_options = dict(
        learning_rate=lrate,
        eta0=0.005,
        epochs=2000,
        fit_intercept=True,
        batch_size=4096,
        tol=0.0,
        penalty=penalty,
        loss=loss,
    )
    cu_sgd = cumlSGD(**estimator_options)
    cu_sgd.fit(X_train, y_train)

    predicted = cu_sgd.predict(X_test)
    cu_pred = predicted.to_array()
    print("cuML predictions : ", cu_pred)