Example #1
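Distributed grid search over logistic regression regularization strengths (C) on the breast cancer dataset, written as a test against a Spark session fixture; the model is fit on Spark workers and predictions are made on the driver.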
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from skdist.distribute.search import DistGridSearchCV


def test_search(spark_session):
    sc = spark_session.sparkContext

    # sklearn variables
    Cs = [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]
    cv = 5
    test_size = 0.2
    scoring = "roc_auc"
    solver = "liblinear"

    # load sample data (binary target)
    data = load_breast_cancer()
    X = data["data"]
    y = data["target"]
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=test_size,
                                                        random_state=10)

    ### distributed grid search
    model = DistGridSearchCV(LogisticRegression(solver=solver),
                             dict(C=Cs),
                             sc,
                             cv=cv,
                             scoring=scoring)
    # distributed fitting with spark
    model.fit(X_train, y_train)
    # predictions on the driver
    preds = model.predict(X_test)

    assert preds.shape == y_test.shape
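The test above takes a `spark_session` argument, so it assumes a pytest fixture of that name. A minimal sketch of such a fixture, assuming a local pyspark installation (the master setting and app name are illustrative; the pytest-spark plugin provides a ready-made equivalent):

import pytest
from pyspark.sql import SparkSession

@pytest.fixture(scope="session")
def spark_session():
    # run Spark locally with two worker threads (illustrative choice)
    spark = (SparkSession.builder
             .master("local[2]")
             .appName("skdist-tests")
             .getOrCreate())
    yield spark
    spark.stop()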
Example #2
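Distributed grid search over an XGBoost classifier's learning rate, tree depth, column subsampling, and number of estimators on the iris dataset, printing the best score and best parameters found.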
from sklearn.datasets import load_iris
from xgboost import XGBClassifier
from skdist.distribute.search import DistGridSearchCV


def main():
    cv = 5
    clf_scoring = "accuracy"
    reg_scoring = "neg_mean_squared_error"

    data = load_iris()
    X = data["data"]
    y = data["target"]

    grid = dict(learning_rate=[.05, .01],
                max_depth=[4, 6, 8],
                colsample_bytree=[.6, .8, 1.0],
                n_estimators=[100, 200, 300])

    model = DistGridSearchCV(XGBClassifier(),
                             grid,
                             spark.sparkContext,
                             cv=cv,
                             scoring=clf_scoring)

    model.fit(X, y)
    # predictions on the driver
    preds = model.predict(X)
    probs = model.predict_proba(X)

    # results
    print("-- Grid Search --")
    print("Best Score: {0}".format(model.best_score_))
    print("Best colsample_bytree: {0}".format(
        model.best_estimator_.colsample_bytree))
    print("Best learning_rate: {0}".format(
        model.best_estimator_.learning_rate))
    print("Best max_depth: {0}".format(model.best_estimator_.max_depth))
    print("Best n_estimators: {0}".format(model.best_estimator_.n_estimators))
Example #3
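A minimal unit test of DistGridSearchCV fitting a logistic regression on a small synthetic dataset and checking its predictions.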
import numpy as np
from sklearn.linear_model import LogisticRegression
from skdist.distribute.search import DistGridSearchCV


def test_gs():
    X = np.array([[1, 1, 1], [0, 0, 0], [-1, -1, -1]] * 100)
    y = np.array([0, 0, 1] * 100)
    gs = DistGridSearchCV(LogisticRegression(solver="liblinear"),
                          {"C": [0.1, 1.0]},
                          cv=3)
    gs.fit(X, y)
    preds = gs.predict(X[:3])
    assert np.allclose(preds, np.array([0, 0, 1]))
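Note that no SparkContext is passed here: the context is an optional argument to sk-dist's search estimators, and when it is omitted the search is expected to run locally in plain scikit-learn fashion, which is what makes a cluster-free unit test like this possible.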
Example #4
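Benchmark comparing sk-dist estimators on the same scaled data: a grid-searched logistic regression and a distributed random forest are timed and scored on a holdout set, then the training data is assembled into a Spark DataFrame for a Spark ML comparison.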
import time

import pandas as pd
from pyspark.ml.feature import VectorAssembler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from skdist.distribute.ensemble import DistRandomForestClassifier
from skdist.distribute.search import DistGridSearchCV

# `scaler`, `X_train`/`y_train`, `X_test`/`y_test`, `sc`, and `spark` are
# assumed to be defined earlier in the source script
X_train_t = scaler.fit_transform(X_train)
X_test_t = scaler.transform(X_test)

# sk-dist logistic regression w/ grid search
start = time.time()
lr = LogisticRegression(solver="lbfgs", multi_class="auto")
model = DistGridSearchCV(lr, {"C": [10.0, 1.0, 0.1, 0.01]},
                         sc=sc,
                         cv=5,
                         scoring="f1_weighted")
model.fit(X_train_t, y_train)
print("-- sk-dist LR --")
print("Train Time: {0}".format(time.time() - start))
print("Best Model CV Score: {0}".format(model.best_score_))
print("Holdout F1: {0}".format(
    f1_score(y_test, model.predict(X_test_t), average="weighted")))

# sk-dist random forest
start = time.time()
rf = DistRandomForestClassifier(n_estimators=100, max_depth=None, sc=sc)
rf.fit(X_train_t, y_train)
print("-- sk-dist RF --")
print("Train Time: {0}".format(time.time() - start))
print("Holdout F1: {0}".format(
    f1_score(y_test, rf.predict(X_test_t), average="weighted")))

# spark-ify scaled training data
pandas_df = pd.DataFrame(X_train_t)
pandas_df["label"] = y_train
spark_df = spark.createDataFrame(pandas_df)
assembler = VectorAssembler(inputCols=[str(a) for a in pandas_df.columns[:-1]],
                            outputCol="features")  # "features" is an assumed output column name
Example #5
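Distributed grid search on the breast cancer dataset, printing the best score, the best C, and the per-parameter CV results, and round-tripping the fitted model through pickle, followed by the start of a distributed randomized search.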
import pickle

import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression
from skdist.distribute.search import DistGridSearchCV, DistRandomizedSearchCV

# `sc` (a live SparkContext) is assumed to be defined earlier in the source
# script; the remaining variables are copied from Example #1
Cs = [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]
cv = 5
scoring = "roc_auc"
solver = "liblinear"

# load sample data (binary target)
data = load_breast_cancer()
X = data["data"]
y = data["target"]

### distributed grid search
model = DistGridSearchCV(LogisticRegression(solver=solver),
                         dict(C=Cs),
                         sc,
                         cv=cv,
                         scoring=scoring)
# distributed fitting with spark
model.fit(X, y)
# predictions on the driver
preds = model.predict(X)
probs = model.predict_proba(X)

# results
print("-- Grid Search --")
print("Best Score: {0}".format(model.best_score_))
print("Best C: {0}".format(model.best_estimator_.C))
result_data = pd.DataFrame(model.cv_results_)[["param_C", "mean_test_score"]]
print(result_data.sort_values("param_C"))
print(pickle.loads(pickle.dumps(model)))

### distributed randomized search
param_dist = dict(C=[])
model = DistRandomizedSearchCV(
    LogisticRegression(solver=solver),
    dict(C=Cs),