def test_mbsgd_classifier_default(datatype, nrows, column_info):
    """Compare a default ``cumlMBSGClassifier`` with a default ``SGDClassifier``.

    A synthetic classification problem is generated, cast to ``datatype``
    and split 80/20.  Both estimators are built with no arguments, fitted on
    the training part and scored on the held-out part.

    Parameters
    ----------
    datatype
        dtype passed to ``astype`` for the features and the labels.
    nrows
        Number of samples to generate.
    column_info
        Pair ``(ncols, n_info)``: total feature count and informative
        feature count.
    """
    ncols, n_info = column_info

    features, labels = make_classification(
        n_samples=nrows,
        n_informative=n_info,
        n_features=ncols,
        random_state=0,
    )
    features = features.astype(datatype)
    labels = labels.astype(datatype)

    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, train_size=0.8, random_state=0
    )
    y_train = y_train.astype(datatype)
    y_test = y_test.astype(datatype)

    # Model under test.
    cu_model = cumlMBSGClassifier()
    cu_model.fit(X_train, y_train)
    cu_pred = cu_model.predict(X_test).to_array()

    # Reference model.
    sk_model = SGDClassifier()
    sk_model.fit(X_train, y_train)
    sk_pred = sk_model.predict(X_test)

    cu_acc = accuracy_score(cu_pred, y_test)
    sk_acc = accuracy_score(sk_pred, y_test)

    # The model under test may trail the reference by at most 0.05.
    assert cu_acc >= sk_acc - 0.05
def test_mbsgd_classifier_vs_skl(lrate, penalty, loss, make_dataset):
    """Compare ``cumlMBSGClassifier`` accuracy with ``SGDClassifier``.

    Both estimators receive the same learning-rate schedule, ``eta0``,
    intercept setting, tolerance and penalty.  Nothing is fitted or asserted
    when the dataset has 500000 rows or more.

    Parameters
    ----------
    lrate
        Value forwarded as ``learning_rate`` to both estimators.
    penalty
        Value forwarded as ``penalty`` to both estimators.
    loss
        Not referenced in the body.
        NOTE(review): presumably supplied by test parametrization - confirm.
    make_dataset
        Tuple ``(nrows, X_train, X_test, y_train, y_test)``.
    """
    nrows, X_train, X_test, y_train, y_test = make_dataset

    if not nrows < 500000:
        return

    # Hyper-parameters that both implementations share.
    shared = {
        "learning_rate": lrate,
        "eta0": 0.005,
        "fit_intercept": True,
        "tol": 0.0,
        "penalty": penalty,
    }

    cu_model = cumlMBSGClassifier(epochs=100, batch_size=2, **shared)
    cu_model.fit(X_train, y_train)
    cu_pred = cu_model.predict(X_test)
    cu_acc = accuracy_score(cp.asnumpy(cu_pred), cp.asnumpy(y_test))

    # The reference estimator is fed host copies of the data.
    sk_model = SGDClassifier(max_iter=100, random_state=0, **shared)
    sk_model.fit(cp.asnumpy(X_train), cp.asnumpy(y_train))
    sk_pred = sk_model.predict(cp.asnumpy(X_test))
    sk_acc = accuracy_score(sk_pred, cp.asnumpy(y_test))

    # The model under test may trail the reference by at most 0.08.
    assert cu_acc >= sk_acc - 0.08
def test_mbsgd_classifier(datatype, lrate, input_type, penalty, loss,
                          nrows, column_info):
    """Fit ``cumlMBSGClassifier`` and compare it with ``SGDClassifier``.

    The cuML estimator is always fitted and scored.  The sklearn reference
    and the accuracy assertion only run when ``nrows`` is below 500000.

    Parameters
    ----------
    datatype
        dtype passed to ``astype`` for the generated features and labels.
    lrate
        Value forwarded as ``learning_rate`` to both estimators.
    input_type, loss
        Not referenced in the body.
        NOTE(review): presumably supplied by test parametrization - confirm.
    penalty
        Value forwarded as ``penalty`` to both estimators.
    nrows
        Number of samples to generate.
    column_info
        Pair ``(ncols, n_info)``: total feature count and informative
        feature count.
    """
    ncols, n_info = column_info

    features, labels = make_classification(
        n_samples=nrows,
        n_informative=n_info,
        n_features=ncols,
        random_state=0,
    )
    features = features.astype(datatype)
    labels = labels.astype(datatype)

    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, train_size=0.8, random_state=10
    )

    # Hyper-parameters that both implementations share.
    shared = {
        "learning_rate": lrate,
        "eta0": 0.005,
        "fit_intercept": True,
        "tol": 0.0,
        "penalty": penalty,
    }

    cu_model = cumlMBSGClassifier(epochs=100, batch_size=2, **shared)
    cu_model.fit(X_train, y_train)
    cu_pred = cu_model.predict(X_test).to_array()
    cu_acc = accuracy_score(cu_pred, y_test)

    # No reference comparison for datasets of 500000 rows or more.
    if not nrows < 500000:
        return

    sk_model = SGDClassifier(max_iter=100, random_state=0, **shared)
    sk_model.fit(X_train, y_train)
    sk_pred = sk_model.predict(X_test)
    sk_acc = accuracy_score(sk_pred, y_test)

    # The model under test may trail the reference by at most 0.06.
    assert cu_acc >= sk_acc - 0.06
def test_mbsgd_classifier_default(make_dataset):
    """Check that a near-default ``cumlMBSGClassifier`` reaches 0.69 accuracy.

    Only ``batch_size`` is overridden; it is set to one tenth of the row
    count reported by the fixture.

    Parameters
    ----------
    make_dataset
        Tuple ``(nrows, X_train, X_test, y_train, y_test)``.
    """
    nrows, X_train, X_test, y_train, y_test = make_dataset

    # A batch size is a count of samples, so it must be an integer.  Floor
    # division keeps it integral; true division (``/``) would always hand
    # the estimator a float.
    cu_mbsgd_classifier = cumlMBSGClassifier(batch_size=nrows // 10)
    cu_mbsgd_classifier.fit(X_train, y_train)

    cu_pred = cu_mbsgd_classifier.predict(X_test)
    # Score on host copies of the predictions and the labels.
    cu_acc = accuracy_score(cp.asnumpy(cu_pred), cp.asnumpy(y_test))

    assert cu_acc >= 0.69
def test_mbsgd_classifier_set_params():
    """Check that ``set_params`` matches passing the values to the constructor.

    Three models are fitted on the same 50-point, single-feature problem:
    one with defaults, one constructed with ``epochs=20, loss='hinge'`` and
    one constructed with defaults and then reconfigured through
    ``set_params``.  The default coefficients must differ from the
    overridden ones, and both overridden models must agree.
    """
    features = np.linspace(0, 1, 50)
    labels = (features > 0.5).astype(cp.int32)

    overrides = {"epochs": 20, "loss": "hinge"}

    default_model = cumlMBSGClassifier()
    default_model.fit(features, labels)
    coef_default = default_model.coef_

    ctor_model = cumlMBSGClassifier(**overrides)
    ctor_model.fit(features, labels)
    coef_ctor = ctor_model.coef_

    setter_model = cumlMBSGClassifier()
    setter_model.set_params(**overrides)
    setter_model.fit(features, labels)
    coef_setter = setter_model.coef_

    assert coef_default != coef_ctor
    assert coef_ctor == coef_setter
def test_mbsgd_classifier_default(make_dataset):
    """Check that a default ``cumlMBSGClassifier`` reaches 0.69 accuracy.

    Parameters
    ----------
    make_dataset
        Tuple ``(nrows, X_train, X_test, y_train, y_test)``; the row count
        is not used here.
    """
    _nrows, X_train, X_test, y_train, y_test = make_dataset

    model = cumlMBSGClassifier()
    model.fit(X_train, y_train)

    # Predictions and labels go to the metric without any conversion.
    accuracy = accuracy_score(model.predict(X_test), y_test)

    assert accuracy >= 0.69
def test_mbsgd_classifier_attributes():
    """Check that a fitted ``cumlMBSGClassifier`` exposes the listed attributes.

    The model is built with no arguments and fitted on the output of
    ``make_blobs()`` called with no arguments.
    """
    X, y = make_blobs()

    fitted = cumlMBSGClassifier()
    fitted.fit(X, y)

    expected_attrs = (
        "dtype",
        "solver_model",
        "coef_",
        "intercept_",
        "l1_ratio",
        "n_cols",
        "eta0",
        "batch_size",
        "fit_intercept",
        "penalty",
    )

    # One assertion per name so a failure points at the missing attribute.
    for name in expected_attrs:
        assert hasattr(fitted, name)
def test_mbsgd_classifier(lrate, penalty, loss, make_dataset):
    """Check that a tuned ``cumlMBSGClassifier`` exceeds 0.79 accuracy.

    Parameters
    ----------
    lrate
        Value forwarded as ``learning_rate``.
    penalty
        Value forwarded as ``penalty``.
    loss
        Not referenced in the body.
        NOTE(review): presumably supplied by test parametrization - confirm.
    make_dataset
        Tuple ``(nrows, X_train, X_test, y_train, y_test)``.
    """
    nrows, X_train, X_test, y_train, y_test = make_dataset

    cu_mbsgd_classifier = cumlMBSGClassifier(
        learning_rate=lrate,
        eta0=0.005,
        epochs=100,
        fit_intercept=True,
        # A batch size is a count of samples, so it must be an integer.
        # Floor division keeps it integral; true division (``/``) would
        # always hand the estimator a float.
        batch_size=nrows // 100,
        tol=0.0,
        penalty=penalty,
    )
    cu_mbsgd_classifier.fit(X_train, y_train)

    cu_pred = cu_mbsgd_classifier.predict(X_test)
    # Score on host copies of the predictions and the labels.
    cu_acc = accuracy_score(cp.asnumpy(cu_pred), cp.asnumpy(y_test))

    assert cu_acc > 0.79
def test_mbsgd_classifier_default(make_dataset):
    """Compare a default ``cumlMBSGClassifier`` with a default ``SGDClassifier``.

    The cuML estimator is always fitted and scored.  The sklearn reference
    and the accuracy assertion only run when the dataset has fewer than
    500000 rows.

    Parameters
    ----------
    make_dataset
        Tuple ``(nrows, X_train, X_test, y_train, y_test)``.
    """
    nrows, X_train, X_test, y_train, y_test = make_dataset

    cu_model = cumlMBSGClassifier()
    cu_model.fit(X_train, y_train)
    cu_pred = cu_model.predict(X_test).to_array()
    cu_acc = accuracy_score(cu_pred, y_test)

    # No reference comparison for datasets of 500000 rows or more.
    if not nrows < 500000:
        return

    sk_model = SGDClassifier()
    sk_model.fit(X_train, y_train)
    sk_pred = sk_model.predict(X_test)
    sk_acc = accuracy_score(sk_pred, y_test)

    # The model under test may trail the reference by at most 0.05.
    assert cu_acc >= sk_acc - 0.05
def test_mbsgd_classifier(datatype, lrate, input_type, penalty, loss,
                          nrows, ncols):
    """Compare ``cumlMBSGClassifier`` accuracy with ``SGDClassifier``.

    A synthetic problem is generated and split positionally: the first 80%
    of the rows train both models and the remainder is used for scoring.

    Parameters
    ----------
    datatype
        dtype used when materialising the train/test arrays.
    lrate
        Value forwarded as ``learning_rate`` to both estimators.
    input_type, loss
        Not referenced in the body.
        NOTE(review): presumably supplied by test parametrization - confirm.
    penalty
        Value forwarded as ``penalty`` to both estimators.
    nrows
        Number of samples to generate.
    ncols
        Number of features to generate.
    """
    train_rows = int(nrows * 0.8)

    X, y = make_classification(n_samples=nrows, n_features=ncols,
                               random_state=0)

    X_train = np.array(X[:train_rows, :], dtype=datatype)
    X_test = np.array(X[train_rows:, :], dtype=datatype)
    y_train = np.array(y[:train_rows], dtype=datatype)
    y_test = np.array(y[train_rows:], dtype=datatype)

    cu_mbsgd_classifier = cumlMBSGClassifier(
        learning_rate=lrate,
        eta0=0.005,
        epochs=100,
        fit_intercept=True,
        batch_size=2,
        tol=0.0,
        penalty=penalty,
    )
    cu_mbsgd_classifier.fit(X_train, y_train)
    cu_pred = cu_mbsgd_classifier.predict(X_test).to_array()

    skl_sgd_classifier = SGDClassifier(
        learning_rate=lrate,
        eta0=0.005,
        max_iter=100,
        fit_intercept=True,
        tol=0.0,
        penalty=penalty,
        random_state=0,
    )
    skl_sgd_classifier.fit(X_train, y_train)
    skl_pred = skl_sgd_classifier.predict(X_test)

    # accuracy_score is higher-is-better, so these are accuracies, not
    # errors.
    cu_acc = accuracy_score(cu_pred, y_test)
    skl_acc = accuracy_score(skl_pred, y_test)

    # The model under test may trail the reference by at most 0.02.
    # Checking ``cu - skl <= 0.02`` instead would accept an arbitrarily
    # worse model and reject a better one.
    assert cu_acc >= skl_acc - 0.02