Example #1

from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from skdist.distribute.search import DistGridSearchCV


def test_search(spark_session):
    sc = spark_session.sparkContext

    # sklearn variables
    Cs = [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]
    cv = 5
    test_size = 0.2
    scoring = "roc_auc"
    solver = "liblinear"

    # load sample data (binary target)
    data = load_breast_cancer()
    X = data["data"]
    y = data["target"]
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=test_size,
                                                        random_state=10)

    # distributed grid search
    model = DistGridSearchCV(LogisticRegression(solver=solver),
                             dict(C=Cs),
                             sc,
                             cv=cv,
                             scoring=scoring)
    # distributed fitting with spark
    model.fit(X_train, y_train)
    # predictions on the driver
    preds = model.predict(X_test)

    assert preds.shape == y_test.shape
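
The test assumes a spark_session pytest fixture that supplies a live SparkSession. A minimal sketch of such a fixture, assuming local-mode Spark (the fixture below is illustrative and not part of the original example):

import pytest
from pyspark.sql import SparkSession

@pytest.fixture(scope="session")
def spark_session():
    # local[2] runs Spark on two local threads, so no cluster is needed
    spark = SparkSession.builder.master("local[2]").getOrCreate()
    yield spark
    spark.stop()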
Example #2

from pyspark.sql import SparkSession
from sklearn.datasets import load_iris
from skdist.distribute.search import DistGridSearchCV
from xgboost import XGBClassifier

spark = SparkSession.builder.getOrCreate()


def main():
    cv = 5
    clf_scoring = "accuracy"
    reg_scoring = "neg_mean_squared_error"

    data = load_iris()
    X = data["data"]
    y = data["target"]

    grid = dict(learning_rate=[.05, .01],
                max_depth=[4, 6, 8],
                colsample_bytree=[.6, .8, 1.0],
                n_estimators=[100, 200, 300])

    model = DistGridSearchCV(XGBClassifier(),
                             grid,
                             spark.sparkContext,
                             cv=cv,
                             scoring=clf_scoring)

    model.fit(X, y)
    # predictions on the driver
    preds = model.predict(X)
    probs = model.predict_proba(X)

    # results
    print("-- Grid Search --")
    print("Best Score: {0}".format(model.best_score_))
    print("Best colsample_bytree: {0}".format(
        model.best_estimator_.colsample_bytree))
    print("Best learning_rate: {0}".format(
        model.best_estimator_.learning_rate))
    print("Best max_depth: {0}".format(model.best_estimator_.max_depth))
    print("Best n_estimators: {0}".format(model.best_estimator_.n_estimators))
Example #3

import numpy as np
from sklearn.linear_model import LogisticRegression
from skdist.distribute.search import DistGridSearchCV


def test_gs():
    X = np.array([[1, 1, 1], [0, 0, 0], [-1, -1, -1]] * 100)
    y = np.array([0, 0, 1] * 100)
    gs = DistGridSearchCV(LogisticRegression(solver="liblinear"),
                          {"C": [0.1, 1.0]},
                          cv=3)
    gs.fit(X, y)
    preds = gs.predict(X[:3])
    assert np.allclose(preds, np.array([0, 0, 1]))
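
Note that no SparkContext is passed to DistGridSearchCV in this test, so the sc argument is evidently optional; the search then runs on a single node, which is what lets the test execute without a Spark cluster.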
Example #4

import time

import pandas as pd
from pyspark.sql import SparkSession
from sklearn import datasets, svm
from skdist.distribute.search import DistGridSearchCV

# instantiate spark session
spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext

# the digits dataset
digits = datasets.load_digits()
X = digits["data"]
y = digits["target"]

# create a classifier: a support vector classifier
classifier = svm.SVC(gamma="scale")
param_grid = {
    "C": [0.01, 0.01, 0.1, 1.0, 10.0],
    "gamma": ["scale", "auto", 0.001, 0.01, 0.1],
    "kernel": ["rbf", "poly", "sigmoid"],
}
scoring = "f1_weighted"
cv = 10

# hyperparameter optimization
# total fits: 750
start = time.time()
model = DistGridSearchCV(classifier, param_grid, sc=sc, cv=cv, scoring=scoring)
model.fit(X, y)
print("Train time: {0}".format(time.time() - start))
print("Best score: {0}".format(model.best_score_))
results = pd.DataFrame(model.cv_results_).sort_values("mean_test_score",
                                                      ascending=False)
print("-- CV Results --")
print(results[["param_C", "param_kernel", "mean_test_score"]].head(10))
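
The 750 fits quoted in the comment follow directly from the grid: 5 values of C × 5 of gamma × 3 kernels gives 75 candidate models, each cross-validated over cv=10 folds, for 75 × 10 = 750 fits, which Spark schedules across executors instead of running serially on the driver.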
Example #5
df = df[:limit]
dataset["target"] = dataset["target"][:limit]

# fit a small encoder
encoder = Encoderizer(size="small")
X_t = encoder.fit_transform(df)

# train logistic regression
lr = DistGridSearchCV(
    LogisticRegression(solver="liblinear"),
    dict(C=[0.1, 1.0, 10.0]),
    sc,
    scoring=scoring,
    cv=cv,
)
lr.fit(X_t, dataset["target"])

# train random forest
rf = DistGridSearchCV(
    RandomForestClassifier(n_estimators=10),
    dict(max_depth=[5, 10]),
    sc,
    scoring=scoring,
    cv=cv,
)
rf.fit(X_t, dataset["target"])

# assemble voter and pipeline
# take the class labels from one of the fitted models
voter = SimpleVoter([("lr", lr), ("rf", rf)],
                    classes=lr.best_estimator_.classes_,
                    voting="hard")
Example #6
                                                    test_size=0.2,
                                                    random_state=4)

# initial scaling
scaler = StandardScaler()
X_train_t = scaler.fit_transform(X_train)
X_test_t = scaler.transform(X_test)

# sk-dist logistic regression w/ grid search
start = time.time()
lr = LogisticRegression(solver="lbfgs", multi_class="auto")
model = DistGridSearchCV(lr, {"C": [10.0, 1.0, 0.1, 0.01]},
                         sc=sc,
                         cv=5,
                         scoring="f1_weighted")
model.fit(X_train_t, y_train)
print("-- sk-dist LR --")
print("Train Time: {0}".format(time.time() - start))
print("Best Model CV Score: {0}".format(model.best_score_))
print("Holdout F1: {0}".format(
    f1_score(y_test, model.predict(X_test_t), average="weighted")))

# sk-dist random forest
start = time.time()
rf = DistRandomForestClassifier(n_estimators=100, max_depth=None, sc=sc)
rf.fit(X_train_t, y_train)
print("-- sk-dist RF --")
print("Train Time: {0}".format(time.time() - start))
print("Holdout F1: {0}".format(
    f1_score(y_test, rf.predict(X_test_t), average="weighted")))
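
The two estimators distribute along different axes: DistGridSearchCV parallelizes over the hyperparameter grid, fitting one candidate model per task, while DistRandomForestClassifier distributes the training of the forest's individual trees across the cluster, so even a single hyperparameter setting benefits from Spark.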
Example #7
                              scoring=scoring)

# define word/character vector -> feature selection -> tree ensemble
both_model = Pipeline(steps=[
    ("vec", FeatureUnion([
        ("word", CountVectorizer(analyzer="word", decode_error="ignore")),
        ("char", CountVectorizer(analyzer="char_wb", decode_error="ignore")),
    ])),
    ("select", SelectKBest(f_classif, k=1000)),
    ("clf", DistExtraTreesClassifier(n_estimators=1000, max_depth=None,
                                     sc=sc)),
])

# fit all models
start = time.time()
word_model.fit(X_train, y_train)
print("Word Model Fit Time: {0}".format(time.time() - start))

start1 = time.time()
char_model.fit(X_train, y_train)
print("Char Model Fit Time: {0}".format(time.time() - start1))

start2 = time.time()
both_model.fit(X_train, y_train)
print("Tree Model Fit Time: {0}".format(time.time() - start2))
print("Total Fit Time: {0}".format(time.time() - start))

# construct voter
model = SimpleVoter([("word", word_model), ("char", char_model),
                     ("both", both_model)],
                    classes=word_model.classes_,
Example #8
# define encoder config
encoder_config = {
    "text_col": "string_vectorizer",
    "categorical_str_col": "onehotencoder",
    "categorical_int_col": "onehotencoder",
    "numeric_col": "numeric",
    "dict_col": "dict",
    "multilabel_col": "multihotencoder"
}

# variables
Cs = [0.1, 1.0, 10.0]
cv = 5
scoring = "f1_weighted"
solver = "liblinear"

# instantiate encoder with encoder_config, fit/transform on data
encoder = Encoderizer(size="small", config=encoder_config)
df_transformed = encoder.fit_transform(df)
print([i[0] for i in encoder.transformer_list])

# define and fit model
model = DistGridSearchCV(LogisticRegression(solver=solver, multi_class="auto"),
                         dict(C=Cs),
                         sc,
                         scoring=scoring,
                         cv=cv)

model.fit(df_transformed, df["target"])
print(model.best_score_)
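
The keys of encoder_config name columns of the input DataFrame df (which the excerpt does not show) and map each column to an encoder type. A toy frame with matching columns, purely hypothetical, to make that mapping concrete:

import pandas as pd

# illustrative only: column names mirror the encoder_config keys above
df = pd.DataFrame({
    "text_col": ["spark meets sklearn", "distributed model selection"],
    "categorical_str_col": ["red", "blue"],
    "categorical_int_col": [0, 1],
    "numeric_col": [1.5, 2.5],
    "dict_col": [{"a": 1}, {"b": 2}],
    "multilabel_col": [["x", "y"], ["y"]],
    "target": [0, 1],
})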
Example #9
# instantiate a pipeline and grid
pipe = Pipeline(steps=[
    ("vec", TfidfVectorizer(decode_error="ignore", analyzer="word")),
    ("svd", TruncatedSVD()),
    ("clf", LogisticRegression(solver="liblinear", multi_class="auto")),
])
params = {
    "clf__C": [0.1, 1.0, 10.0],
    "vec__ngram_range": [(1, 1), (1, 2)],
    "svd__n_components": [50, 100],
}

# fit and select hyperparameters with skdist
model0 = DistGridSearchCV(pipe, params, sc, scoring=scoring, cv=cv)
model0.fit(X, y)
print("A Pipeline used as the base estimator for DistGridSearchCV: {0}".format(
    model0.best_score_))
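
Note the inversion between the two patterns: model0 wraps the whole Pipeline in DistGridSearchCV, so every parameter combination (including vec__ngram_range and svd__n_components) refits the vectorizer and the SVD, while model1 below fixes those upstream steps and distributes only the classifier's C grid, which is cheaper per fit but cannot tune the upstream parameters.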

# assemble a pipeline with skdist distributed
# grid search as the final estimator step
model1 = Pipeline(steps=[
    ("vec", TfidfVectorizer(decode_error="ignore", analyzer="word")),
    ("svd", TruncatedSVD(n_components=50)),
    (
        "clf",
        DistGridSearchCV(
            LogisticRegression(solver="liblinear", multi_class="auto"),
            {"C": [0.1, 1.0, 10.0]},
            sc,
            scoring=scoring,
Example #10
solver = "liblinear"

# convert training data to pandas
df = pd.DataFrame({"text": dataset["data"]})
df = df[:1000]
dataset["target"] = dataset["target"][:1000]

# fit a small encoder and train classifier
encoder = Encoderizer(size="small")
X_t = encoder.fit_transform(df)
model = DistGridSearchCV(LogisticRegression(solver=solver, multi_class="auto"),
                         dict(C=Cs),
                         sc,
                         scoring=scoring,
                         cv=cv)
model.fit(X_t, dataset["target"])
print(model.best_score_)

# fit a medium encoder and train classifier
encoder = Encoderizer(size="medium")
X_t = encoder.fit_transform(df)
model = DistGridSearchCV(LogisticRegression(solver=solver, multi_class="auto"),
                         dict(C=Cs),
                         sc,
                         scoring=scoring,
                         cv=cv)
model.fit(X_t, dataset["target"])
print(model.best_score_)

# fit a large encoder and train classifier
encoder = Encoderizer(size="large")