def test_ovr():
    """Fit ``DistOneVsRestClassifier`` on a three-class toy set and check its predictions.

    The classifier is constructed without a Spark context argument.
    """
    # One prototype feature row per class, tiled 100 times.
    prototypes = [[0, 0, 1, 1], [1, 1, 0, 0], [-1, -1, -1, -1]]
    labels = [0, 1, 2]
    X = np.array(prototypes * 100)
    y = np.array(labels * 100)

    base_estimator = LogisticRegression(solver="liblinear")
    ovr = DistOneVsRestClassifier(base_estimator)
    ovr.fit(X, y)

    # The first three rows are one sample of each class, in label order.
    preds = ovr.predict(X[:3])
    assert np.allclose(preds, np.array(labels))
def test_multiclass(spark_session):
    """Fit ``DistOneVsRestClassifier`` with a Spark context and check the prediction shape.

    Args:
        spark_session: object exposing a ``sparkContext`` attribute, which is
            handed to the classifier.
    """
    spark_context = spark_session.sparkContext

    # Digits sample data (a multiclass target); 20% is held out for prediction.
    digits = load_digits()
    features, target = digits["data"], digits["target"]
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.2, random_state=10
    )

    # One-vs-rest wrapper; the Spark context goes in as the second positional argument.
    base_estimator = LogisticRegression(solver="liblinear")
    model = DistOneVsRestClassifier(base_estimator, spark_context)

    # Fit using the Spark context, then predict on the held-out rows.
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    # Exactly one prediction per held-out label.
    assert predictions.shape == y_test.shape
# configuration
solver = "liblinear"
scoring_average = "weighted"
test_size = 0.2

# load sample data (digits: a multiclass target) and hold out a test split
data = load_digits()
X, y = data["data"], data["target"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_size, random_state=10
)

### distributed one vs rest
# NOTE(review): `sc` is expected to be defined earlier in the file (presumably
# a Spark context) -- confirm.
base_estimator = LogisticRegression(solver=solver)
model = DistOneVsRestClassifier(base_estimator, sc)

# fit with spark
model.fit(X_train, y_train)

# class predictions and class probabilities for the held-out rows
preds = model.predict(X_test)
probs = model.predict_proba(X_test)

# report each metric right after computing it
print("-- One Vs Rest --")
weighted_f1 = f1_score(y_test, preds, average=scoring_average)
print("Weighted F1: {0}".format(weighted_f1))
weighted_precision = precision_score(y_test, preds, average=scoring_average)
print("Precision: {0}".format(weighted_precision))
weighted_recall = recall_score(y_test, preds, average=scoring_average)
print("Recall: {0}".format(weighted_recall))

# round-trip the fitted model through pickle and print the restored object
restored_model = pickle.loads(pickle.dumps(model))
print(restored_model)
# synthetic classification dataset: 100000 samples, 40 features, 40 classes
dataset_options = dict(
    n_samples=100000,
    n_features=40,
    n_informative=36,
    n_redundant=1,
    n_repeated=1,
    n_classes=40,
    n_clusters_per_class=1,
    random_state=5,
)
X, y = make_classification(**dataset_options)

# NOTE(review): `sc`, `params`, `cv` and `scoring` are expected to be defined
# earlier in the file -- confirm.

# nested example 1: grid search over C wrapped around a one-vs-rest estimator
ovr_estimator = DistOneVsRestClassifier(
    LogisticRegression(solver="liblinear"), sc=sc
)
model = DistGridSearchCV(
    ovr_estimator,
    {"estimator__C": params},
    cv=cv,
    scoring=scoring,
)
model.fit(X, y)
cv_results = pd.DataFrame(model.cv_results_)
print(cv_results[["mean_test_score", "param_estimator__C"]])

# nested example 2: the same search wrapped around a one-vs-one estimator
ovo_estimator = DistOneVsOneClassifier(
    LogisticRegression(solver="liblinear"), sc=sc
)
model = DistGridSearchCV(
    ovo_estimator,
    {"estimator__C": params},
    cv=cv,
    scoring=scoring,
)
model.fit(X, y)