Example #1
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from skdist.distribute.search import DistGridSearchCV


def test_search(spark_session):
    sc = spark_session.sparkContext

    # sklearn variables
    Cs = [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]
    cv = 5
    test_size = 0.2
    scoring = "roc_auc"
    solver = "liblinear"

    # load sample data (binary target)
    data = load_breast_cancer()
    X = data["data"]
    y = data["target"]
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=test_size,
                                                        random_state=10)

    ### distributed grid search
    model = DistGridSearchCV(LogisticRegression(solver=solver),
                             dict(C=Cs),
                             sc,
                             cv=cv,
                             scoring=scoring)
    # distributed fitting with spark
    model.fit(X_train, y_train)
    # predictions on the driver
    preds = model.predict(X_test)

    assert preds.shape == y_test.shape
Example #2
import numpy as np
from sklearn.linear_model import LogisticRegression
from skdist.distribute.search import DistGridSearchCV


def test_gs():
    X = np.array([[1, 1, 1], [0, 0, 0], [-1, -1, -1]] * 100)
    y = np.array([0, 0, 1] * 100)
    gs = DistGridSearchCV(LogisticRegression(solver="liblinear"),
                          {"C": [0.1, 1.0]},
                          cv=3)
    gs.fit(X, y)
    preds = gs.predict(X[:3])
    assert np.allclose(preds, np.array([0, 0, 1]))
def main():
    cv = 5
    clf_scoring = "accuracy"
    reg_scoring = "neg_mean_squared_error"

    data = load_iris()
    X = data["data"]
    y = data["target"]

    grid = dict(learning_rate=[.05, .01],
                max_depth=[4, 6, 8],
                colsample_bytree=[.6, .8, 1.0],
                n_estimators=[100, 200, 300])

    model = DistGridSearchCV(XGBClassifier(),
                             grid,
                             spark.sparkContext,
                             cv=cv,
                             scoring=clf_scoring)

    model.fit(X, y)
    # predictions on the driver
    preds = model.predict(X)
    probs = model.predict_proba(X)

    # results
    print("-- Grid Search --")
    print("Best Score: {0}".format(model.best_score_))
    print("Best colsample_bytree: {0}".format(
        model.best_estimator_.colsample_bytree))
    print("Best learning_rate: {0}".format(
        model.best_estimator_.learning_rate))
    print("Best max_depth: {0}".format(model.best_estimator_.max_depth))
    print("Best n_estimators: {0}".format(model.best_estimator_.n_estimators))
# instantiate spark session
spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext

# the digits dataset
digits = datasets.load_digits()
X = digits["data"]
y = digits["target"]

# create a classifier: a support vector classifier
classifier = svm.SVC(gamma="scale")
param_grid = {
    "C": [0.01, 0.01, 0.1, 1.0, 10.0],
    "gamma": ["scale", "auto", 0.001, 0.01, 0.1],
    "kernel": ["rbf", "poly", "sigmoid"],
}
scoring = "f1_weighted"
cv = 10

# hyperparameter optimization
# total fits: 750
start = time.time()
model = DistGridSearchCV(classifier, param_grid, sc=sc, cv=cv, scoring=scoring)
model.fit(X, y)
print("Train time: {0}".format(time.time() - start))
print("Best score: {0}".format(model.best_score_))
results = pd.DataFrame(model.cv_results_).sort_values("mean_test_score",
                                                      ascending=False)
print("-- CV Results --")
print(results[["param_C", "param_kernel", "mean_test_score"]].head(10))
Example #5
limit = 1000

# convert training data to pandas
df = pd.DataFrame({"text": dataset["data"]})
df = df[:limit]
dataset["target"] = dataset["target"][:limit]

# fit a small encoder
encoder = Encoderizer(size="small")
X_t = encoder.fit_transform(df)

# train logistic regression
lr = DistGridSearchCV(
    LogisticRegression(solver="liblinear"),
    dict(C=[0.1, 1.0, 10.0]),
    sc,
    scoring=scoring,
    cv=cv,
)
lr.fit(X_t, dataset["target"])

# train random forest
rf = DistGridSearchCV(
    RandomForestClassifier(n_estimators=10),
    dict(max_depth=[5, 10]),
    sc,
    scoring=scoring,
    cv=cv,
)
rf.fit(X_t, dataset["target"])
Example #6
y = data["target"]
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=4)

# initial scaling
scaler = StandardScaler()
X_train_t = scaler.fit_transform(X_train)
X_test_t = scaler.transform(X_test)

# sk-dist logistic regression w/ grid search
start = time.time()
lr = LogisticRegression(solver="lbfgs", multi_class="auto")
model = DistGridSearchCV(lr, {"C": [10.0, 1.0, 0.1, 0.01]},
                         sc=sc,
                         cv=5,
                         scoring="f1_weighted")
model.fit(X_train_t, y_train)
print("-- sk-dist LR --")
print("Train Time: {0}".format(time.time() - start))
print("Best Model CV Score: {0}".format(model.best_score_))
print("Holdout F1: {0}".format(
    f1_score(y_test, model.predict(X_test_t), average="weighted")))

# sk-dist random forest
start = time.time()
rf = DistRandomForestClassifier(n_estimators=100, max_depth=None, sc=sc)
rf.fit(X_train_t, y_train)
print("-- sk-dist RF --")
print("Train Time: {0}".format(time.time() - start))
print("Holdout F1: {0}".format(
Example #7
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=2)

# define word vector -> regression model
word_pipe = Pipeline(steps=[
    ("vec", HashingVectorizer(analyzer="word", decode_error="ignore")),
    ("clf", LogisticRegression()),
])
word_params = {
    "vec__ngram_range": [(1, 1), (1, 2), (1, 3), (1, 4), (2, 4)],
    "clf__C": [0.1, 1.0, 10.0],
    "clf__solver": ["liblinear", "lbfgs"]
}
word_model = DistGridSearchCV(word_pipe,
                              word_params,
                              sc=sc,
                              cv=cv,
                              scoring=scoring)

# define character vector -> regression model
char_pipe = Pipeline(steps=[
    ("vec", HashingVectorizer(analyzer="char_wb", decode_error="ignore")),
    ("clf", LogisticRegression()),
])
char_params = {
    "vec__ngram_range": [(2, 2), (2, 3), (2, 4), (2, 5), (3, 3), (3, 5)],
    "clf__C": [0.1, 1.0, 10.0],
    "clf__solver": ["liblinear", "lbfgs"]
}
char_model = DistGridSearchCV(char_pipe,
                              char_params,
                              sc=sc,
                              cv=cv,
                              scoring=scoring)
Example #8
X = digits["data"]
y = digits["target"] 
# create a classifier: a support vector classifier
classifier = svm.SVC()
param_grid = {
    "C": [0.001, 0.01, 0.1, 1.0, 10.0, 20.0, 50.0],
    "gamma": ["scale", "auto", 0.001, 0.01, 0.1],
    "kernel": ["rbf", "poly", "sigmoid"],
}
scoring = "f1_weighted"
cv = 10
# hyperparameter optimization: compare sk-dist to scikit-learn
model_DIST = DistGridSearchCV(
    classifier, param_grid,
    sc=sc, cv=cv, scoring=scoring,
    verbose=True
)
model = GridSearchCV(
    classifier, param_grid,
    cv=cv, scoring=scoring,
    verbose=True
)

# fit and time the distributed sk-dist search
start = time.time()
model_DIST.fit(X, y)
print("Train time for sk-dist: {0}".format(time.time() - start))
print("Best score: {0}".format(model_DIST.best_score_))

# fit and time the single-node scikit-learn search
start = time.time()
model.fit(X, y)
print("Train time for scikit-learn: {0}".format(time.time() - start))
print("Best score: {0}".format(model.best_score_))
Example #9
# define encoder config
encoder_config = {
    "text_col": "string_vectorizer",
    "categorical_str_col": "onehotencoder",
    "categorical_int_col": "onehotencoder",
    "numeric_col": "numeric",
    "dict_col": "dict",
    "multilabel_col": "multihotencoder"
}

# variables
Cs = [0.1, 1.0, 10.0]
cv = 5
scoring = "f1_weighted"
solver = "liblinear"

# instantiate encoder with encoder_config, fit/transform on data
encoder = Encoderizer(size="small", config=encoder_config)
df_transformed = encoder.fit_transform(df)
print([i[0] for i in encoder.transformer_list])

# define and fit model
model = DistGridSearchCV(LogisticRegression(solver=solver, multi_class="auto"),
                         dict(C=Cs),
                         sc,
                         scoring=scoring,
                         cv=cv)

model.fit(df_transformed, df["target"])
print(model.best_score_)
Example #10
y = dataset["target"]

# instantiate a pipeline and grid
pipe = Pipeline(steps=[
    ("vec", TfidfVectorizer(decode_error="ignore", analyzer="word")),
    ("svd", TruncatedSVD()),
    ("clf", LogisticRegression(solver="liblinear", multi_class="auto")),
])
params = {
    "clf__C": [0.1, 1.0, 10.0],
    "vec__ngram_range": [(1, 1), (1, 2)],
    "svd__n_components": [50, 100],
}

# fit and select hyperparameters with skdist
model0 = DistGridSearchCV(pipe, params, sc, scoring=scoring, cv=cv)
model0.fit(X, y)
print("A Pipeline used as the base estimator for DistGridSearchCV: {0}".format(
    model0.best_score_))

# assemble a pipeline with skdist distributed
# grid search as the final estimator step
model1 = Pipeline(steps=[
    ("vec", TfidfVectorizer(decode_error="ignore", analyzer="word")),
    ("svd", TruncatedSVD(n_components=50)),
    (
        "clf",
        DistGridSearchCV(
            LogisticRegression(solver="liblinear", multi_class="auto"),
            {"C": [0.1, 1.0, 10.0]},
            sc,
            scoring=scoring,
            cv=cv,
        ),
    ),
])
model1.fit(X, y)
print("DistGridSearchCV used as the final step of a Pipeline: {0}".format(
    model1.best_score_))
Example #11
"""
===================================================================================
Train distributed CV search with a logistic regression on the breast cancer dataset
===================================================================================

In this example we optimize hyperparameters (C) for a logistic regression on the
breast cancer dataset. This is a binary target. We use both grid search and 
randomized search. 

The core difference between skdist and sklearn is that the sparkContext variable
is passed as an argument when instantiating the grid search and randomized search
classes. Under the hood, skdist broadcasts the training data to the executors,
fits the estimator for each param set, returns each fit's cross validation score
to the driver, and finally refits the model on the driver with the best param set.

The final estimators are then nearly identical to fitted sklearn GridSearchCV
or RandomizedSearchCV estimators, as shown by inspecting some of their methods
and attributes.

Finally, all spark objects are removed from the fitted skdist estimators so that
they can be pickled, as shown.

Here is a sample output run:

-- Grid Search --
Best Score: 0.9925297825837328
Best C: 1.0
  param_C  mean_test_score
0   0.001         0.973818
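The randomized search and pickling that the docstring mentions fall outside this
excerpt. Below is a minimal sketch of what they might look like, assuming an active
SparkContext sc, that DistRandomizedSearchCV lives next to DistGridSearchCV in
skdist.distribute.search, and that the n_iter keyword passes through as in sklearn's
RandomizedSearchCV:

import pickle

from scipy.stats import uniform
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression
from skdist.distribute.search import DistRandomizedSearchCV

# randomized search over C, distributed the same way as the grid search
data = load_breast_cancer()
rs_model = DistRandomizedSearchCV(
    LogisticRegression(solver="liblinear"),
    {"C": uniform(loc=0.001, scale=100.0)},
    sc,
    cv=5,
    scoring="roc_auc",
    n_iter=10,
)
rs_model.fit(data["data"], data["target"])

# fitted skdist estimators hold no Spark objects, so they pickle like
# any scikit-learn estimator
restored = pickle.loads(pickle.dumps(rs_model))
print(restored.best_score_)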
Example #12
# create dataset
X, y = make_classification(
    n_samples=100000,
    n_features=40,
    n_informative=36,
    n_redundant=1,
    n_repeated=1,
    n_classes=40,
    n_clusters_per_class=1,
    random_state=5,
)

# one nested example
model = DistGridSearchCV(
    DistOneVsRestClassifier(LogisticRegression(solver="liblinear"), sc=sc),
    {"estimator__C": params},
    cv=cv,
    scoring=scoring,
)
model.fit(X, y)
print(pd.DataFrame(model.cv_results_)[["mean_test_score", "param_estimator__C"]])

# another nested example
model = DistGridSearchCV(
    DistOneVsOneClassifier(LogisticRegression(solver="liblinear"), sc=sc),
    {"estimator__C": params},
    cv=cv,
    scoring=scoring,
)
model.fit(X, y)
print(pd.DataFrame(model.cv_results_)[["mean_test_score", "param_estimator__C"]])
Example #13
Cs = [0.1, 1.0, 10.0]
cv = 5
scoring = "f1_weighted"
solver = "liblinear"

# convert training data to pandas
df = pd.DataFrame({"text": dataset["data"]})
df = df[:1000]
dataset["target"] = dataset["target"][:1000]

# fit a small encoder and train classifier
encoder = Encoderizer(size="small")
X_t = encoder.fit_transform(df)
model = DistGridSearchCV(LogisticRegression(solver=solver, multi_class="auto"),
                         dict(C=Cs),
                         sc,
                         scoring=scoring,
                         cv=cv)
model.fit(X_t, dataset["target"])
print(model.best_score_)

# fit a medium encoder and train classifier
encoder = Encoderizer(size="medium")
X_t = encoder.fit_transform(df)
model = DistGridSearchCV(LogisticRegression(solver=solver, multi_class="auto"),
                         dict(C=Cs),
                         sc,
                         scoring=scoring,
                         cv=cv)
model.fit(X_t, dataset["target"])
print(model.best_score_)
Example #14
"""
====================================================================================
Distribute hyperparameter tuning with gradient boosting trees via DistGridSearchCV
====================================================================================

In this example we train a classifier and a regressor with XGBoost, distributing
the hyperparameter tuning through DistGridSearchCV. This works out of the box with
XGBoost's sklearn wrapper.

Because gradient boosting trees are trained sequentially, it makes more sense to
distribute over hyperparameter sets and cross validation folds than to try to train
a single estimator in parallel. DistGridSearchCV does exactly that: in this example,
we fit 54 unique hyperparameter sets in parallel and return the best model to the
driver.

NOTE: This example uses xgboost==0.90

Here is a sample output run:

-- Grid Search --
Best Score: 0.9936882800963308
Best colsample_bytree: 1.0
Best learning_rate: 0.05
Best max_depth: 4
Best n_estimators: 300
DistGridSearchCV(cv=5, error_score='raise-deprecating',
                 estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                         colsample_bylevel=1,
                                         colsample_bynode=1, colsample_bytree=1,
                                         gamma=0, learning_rate=0.1,
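The excerpt cuts off before the regression half that the docstring describes. Below is
a minimal sketch of that counterpart, assuming the same SparkSession (spark) as in the
classifier code above; the dataset and grid here are illustrative, not taken from the
original example:

from sklearn.datasets import load_diabetes
from xgboost import XGBRegressor
from skdist.distribute.search import DistGridSearchCV

# regression counterpart: distribute an XGBRegressor grid the same way
data = load_diabetes()
X, y = data["data"], data["target"]

reg = DistGridSearchCV(
    XGBRegressor(),
    dict(learning_rate=[.05, .01], max_depth=[4, 6, 8], n_estimators=[100, 200]),
    spark.sparkContext,
    cv=5,
    scoring="neg_mean_squared_error",
)
reg.fit(X, y)
print("Best CV Score: {0}".format(reg.best_score_))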