Example #1
# imports needed to run this snippet (skdist module path assumed
# from the sk-dist package layout)
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from skdist.distribute.ensemble import DistRandomForestClassifier


# spark_session is expected to be provided as a pytest fixture
def test_ensemble(spark_session):
    sc = spark_session.sparkContext

    test_size = 0.2
    max_depth = None
    n_estimators = 100

    # load sample data (binary target)
    data = load_breast_cancer()
    X = data["data"]
    y = data["target"]
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=test_size,
                                                        random_state=10)

    ### distributed random forest
    model = DistRandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        sc=sc,
    )
    # distributed fitting with spark
    model.fit(X_train, y_train)
    # predictions on the driver
    preds = model.predict(X_test)

    assert preds.shape == y_test.shape
Example #2
"""
===============================================================
Train distributed decision forests on the breast cancer dataset
===============================================================

In this example we fit two types of decision tree ensembles on
the breast cancer dataset as binary classifiers: the popular random
forest classifier and the extremely randomized trees (extra-trees)
classifier. While the number of decision tree estimators in these
examples is small, skdist allows very large numbers of trees
to be trained in parallel with spark.

The core difference between skdist and sklearn here is that the sparkContext
variable is passed as an argument when instantiating the random forest and
extra trees classes. Under the hood, skdist broadcasts the training data out
to the executors, fits the individual decision trees on the executors,
collects the fitted trees back to the driver, and stores those fitted trees
within the fitted estimator object so that it conforms to the predict
methods of the sklearn ensemble meta-estimators.

The final estimators are then nearly identical to a fitted sklearn RandomForestClassifier
or ExtraTreesClassifier, as shown by inspecting some of their methods
and attributes.

Finally, all spark objects are removed from the fitted skdist estimators
so that they are pickle-able, as shown.

Here is a sample output run:

-- Random Forest --
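
A minimal sketch of the workflow the docstring describes: pass the sparkContext into the ensemble constructors, fit, then use the fitted estimators like their sklearn counterparts and pickle them. This is not the full example script; it assumes an existing SparkContext named sc, that the extra trees class is named DistExtraTreesClassifier with the same constructor arguments as DistRandomForestClassifier, and that sklearn-style attributes such as feature_importances_ are available on the fitted object, per the docstring's claim.

import pickle

from sklearn.datasets import load_breast_cancer
from skdist.distribute.ensemble import (
    DistRandomForestClassifier,
    DistExtraTreesClassifier,
)

X, y = load_breast_cancer(return_X_y=True)

# the sparkContext argument is the only skdist-specific piece
rf = DistRandomForestClassifier(n_estimators=10, sc=sc)
et = DistExtraTreesClassifier(n_estimators=10, sc=sc)

for model in (rf, et):
    model.fit(X, y)                        # trees are fit out on the executors
    print(model.predict(X)[:5])            # sklearn-style predict methods run on the driver
    print(model.feature_importances_[:5])  # sklearn-style fitted attributes
    pickle.loads(pickle.dumps(model))      # spark objects removed, so the estimator pickles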
Example #3
def test_clone():
    rf = DistRandomForestClassifier(n_estimators=10)
    rf_cloned = base._clone(rf)
    assert rf.n_estimators == rf_cloned.n_estimators
Example #4
start = time.time()
lr = LogisticRegression(solver="lbfgs", multi_class="auto")
model = DistGridSearchCV(lr, {"C": [10.0, 1.0, 0.1, 0.01]},
                         sc=sc,
                         cv=5,
                         scoring="f1_weighted")
model.fit(X_train_t, y_train)
print("-- sk-dist LR --")
print("Train Time: {0}".format(time.time() - start))
print("Best Model CV Score: {0}".format(model.best_score_))
print("Holdout F1: {0}".format(
    f1_score(y_test, model.predict(X_test_t), average="weighted")))

# sk-dist random forest
start = time.time()
rf = DistRandomForestClassifier(n_estimators=100, max_depth=None, sc=sc)
rf.fit(X_train_t, y_train)
print("-- sk-dist RF --")
print("Train Time: {0}".format(time.time() - start))
print("Holdout F1: {0}".format(
    f1_score(y_test, rf.predict(X_test_t), average="weighted")))

# spark-ify scaled training data
pandas_df = pd.DataFrame(X_train_t)
pandas_df["label"] = y_train
spark_df = spark.createDataFrame(pandas_df)
assembler = VectorAssembler(inputCols=[str(a) for a in pandas_df.columns[:-1]],
                            outputCol="features")

# spark ML logistic regression w/ grid search
start = time.time()
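
# A sketch of the spark ML grid search that the comment above sets up, using
# standard pyspark.ml tuning APIs. This continuation is assumed for
# illustration and is not the original script: spark_df, assembler, and start
# come from the lines above, and the regParam grid mirrors the C grid used
# with sk-dist.
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression as SparkLogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

# pipeline: assemble the feature vector, then fit spark ML logistic regression
spark_lr = SparkLogisticRegression(featuresCol="features", labelCol="label")
pipeline = Pipeline(stages=[assembler, spark_lr])

# grid over regularization strength, 5-fold cross validation on weighted F1
grid = ParamGridBuilder().addGrid(spark_lr.regParam, [10.0, 1.0, 0.1, 0.01]).build()
evaluator = MulticlassClassificationEvaluator(labelCol="label", metricName="f1")
cv = CrossValidator(
    estimator=pipeline,
    estimatorParamMaps=grid,
    evaluator=evaluator,
    numFolds=5,
)
cv_model = cv.fit(spark_df)

print("-- spark ML LR --")
print("Train Time: {0}".format(time.time() - start))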
Example #5
# one nested example
model = DistGridSearchCV(
    DistOneVsRestClassifier(LogisticRegression(solver="liblinear"), sc=sc),
    {"estimator__C": params},
    cv=cv,
    scoring=scoring,
)
model.fit(X, y)
print(pd.DataFrame(model.cv_results_)[["mean_test_score", "param_estimator__C"]])

# another nested example
model = DistGridSearchCV(
    DistOneVsOneClassifier(LogisticRegression(solver="liblinear"), sc=sc),
    {"estimator__C": params},
    cv=cv,
    scoring=scoring,
)
model.fit(X, y)
print(pd.DataFrame(model.cv_results_)[["mean_test_score", "param_estimator__C"]])

# a final nested example
model = DistGridSearchCV(
    DistRandomForestClassifier(sc=sc, n_estimators=100),
    {"max_depth": [10, 20, None], "n_estimators": [100]},
    cv=cv,
    scoring=scoring,
)
model.fit(X, y)
print(pd.DataFrame(model.cv_results_)[["mean_test_score", "param_max_depth"]])