Example 1
def test_raise_exception_when_invalid_value_in_subset(grid_search_3_params):
    with pytest.raises(ValueError):
        change = 'n_estimators'
        subset = {'criterion': 'not_a_value'}
        plot.grid_search(grid_search_3_params.cv_results_,
                         change=change,
                         subset=subset)
Example 2
def test_raise_exception_when_parameter_does_not_exist(grid_search_3_params):
    with pytest.raises(ValueError):
        change = ('this_is_not_a_parameter')
        subset = {'criterion': 'gini', 'max_features': 'sqrt'}
        plot.grid_search(grid_search_3_params.cv_results_,
                         change=change,
                         subset=subset)
Example 3
def test_single_numeric_partially_restricted(grid_search_3_params):
    change = 'n_estimators'
    subset = {'max_features': 'sqrt'}
    plot.grid_search(grid_search_3_params.cv_results_,
                     change,
                     subset,
                     kind='bar')
Example 4
def test_raise_exception_when_passing_repeated_parameters(
        grid_search_3_params):
    with pytest.raises(ValueError):
        change = ['n_estimators', 'n_estimators']
        plot.grid_search(grid_search_3_params.cv_results_,
                         change=change,
                         subset=None)
Example 5
def test_single_numeric_partially_restricted(grid_search_3_params):
    to_vary = 'n_estimators'
    to_keep = {'max_features': 'sqrt'}
    plot.grid_search(grid_search_3_params.cv_results_,
                     to_vary,
                     to_keep,
                     kind='bar')
Example 6
def test_single_numeric_restricted_multi(grid_search_3_params):
    change = 'n_estimators'
    subset = {'max_features': ['sqrt', 'log2'], 'criterion': 'gini'}
    plot.grid_search(grid_search_3_params.cv_results_,
                     change,
                     subset,
                     kind='bar')
Example 7
def test_double_ignores_kind_line(grid_search_3_params):
    change = ('n_estimators', 'criterion')
    subset = {'max_features': 'sqrt'}
    plot.grid_search(grid_search_3_params.cv_results_,
                     change,
                     subset,
                     kind='line')
Example 8
def test_raise_exception_when_parameter_set_is_not_fully_specified(
        grid_search_3_params):
    with pytest.raises(ValueError):
        change = ('n_estimators', 'criterion')
        plot.grid_search(grid_search_3_params.cv_results_,
                         change=change,
                         subset=None)
Example 9
def test_raise_exception_when_parameter_subset_matches_more_than_one_group(
        grid_search_4_params):
    with pytest.raises(ValueError):
        change = ('n_estimators', 'criterion')
        subset = {'min_samples_split': 2}
        plot.grid_search(grid_search_4_params.cv_results_,
                         change=change,
                         subset=subset)
Example 10
def test_raise_exception_when_parameter_subset_matches_more_than_one_group(
        self):
    with self.assertRaises(ValueError):
        change = ('n_estimators', 'criterion')
        subset = {'min_samples_split': 2}
        plot.grid_search(grid_scores_4_params,
                         change=change,
                         subset=subset)
Example 11
def plot_gridsearch(namesCV_, change_, grid_search_models_, output_, suffix_):

    for nameCV, change, gridCV in zip(namesCV_, change_, grid_search_models_):
        figure = plt.figure(figsize=(10, 10))
        plot.grid_search(gridCV.grid_scores_, change=change, kind='bar')
        plt.title('Grid search results ' + nameCV, fontsize=17)
        plt.ylim(-0.3, 0)
        plt.ylabel('negative mean squared error', fontsize=17)
        plt.xlabel(change, fontsize=17)
        plotname = output_ + "/GridSearchResults" + nameCV + suffix_ + ".png"
        plt.savefig(plotname)
Example 12
def grid_search_xgb(features,labels,test_feature):
    os.makedirs(params_path)

    tuning_params = {
                        "max_depth" : [3,4], #[3,4,5,6]
                        "min_child_weight" : [1,2],
                        "n_estimators" : [450,460,480], #[350,400,450,480,500]
                        "gamma" : [0,0.1,0.2], # [0,0.1,0.2]
                        "subsample" : [0.7,0.6], # [0.9,0.8,0.7]
                        "colsample_bytree" : [0.9,0.8,0.7], # [0.9,0.8,0.7]
                        "reg_alpha" : [0.01, 0.05, 0.06, 0.07]
                        "scale_pos_weight" : [1,10,20]
                        "learning_rate" : [i*0.01 for i in range(10)]
                        }

    score = 'roc_auc'

    parameter_space = tuning_params  # search over the grid defined above

    start = time.time()
    print("# Tuning hyper-parameters for %s" % score)
    print("\nProcessing XGB model")
    #clf = XGBClassifier(objective = "binary:logistic", n_estimators = 450, max_depth =4, gamma = 0,reg_alpha =0,
    #					subsample = 0.8 , min_child_weight =1, colsample_bytree = 0.8, learning_rate = 0.1,
    #					scale_pos_weight = 1, n_jobs = -1
    # 					)
    #clf = XGBClassifier(objective = "binary:logistic", n_jobs = -1, kwargs = parameter_space)

    clf = GridSearchCV(XGBClassifier(objective="binary:logistic", n_jobs=-1),
                       parameter_space, cv=5, scoring=score)
    clf.fit(features, labels)
    print("Best parameters set found on development set:")
    bst_params = clf.best_params_
    bst_score = clf.best_score_
    bst_estimator = clf.best_estimator_

    with open(params_path  + "XGB_params.txt", 'a') as f:
        f.write(
                "************************" + "\n"
                + str(bst_estimator) + "\n"
                + str(bst_params) + "\n"
                + str(bst_score))

    print("Find best params {}, with best roc {}".format(bst_params, bst_score))
    print("XGB model complete")
    plot.grid_search(clf.grid_scores_, change='max_depth', kind ='bar')
    plt.savefig(params_path + "XGB_params_{}.png".format(round(bst_score,2)))
    end = time.time()
    print(">>>>Duration<<<< : {}min ".format(round((end-start)/60,2)))

    return bst_estimator
Example 13
def draw_heatmap(self, google, model, params, kernel):
    if google:
        path = "results-quickdraw/"
    else:
        path = "results/"
    fig = plot.grid_search(model.grid_scores_, change=params)
    fig.get_figure().savefig(path + self.timestamp + "_" + kernel + ".pdf")
Example 14
    'max_features': ['sqrt', 'log2'],
}

est = RandomForestClassifier()
clf = GridSearchCV(est, parameters, cv=5)

#clf.fit(iris.data, iris.target)
data = datasets.make_classification(1000, 10, 5, class_sep=0.7)
clf.fit(data[0], data[1])

grid_scores = clf.grid_scores_

# changing numeric parameter without any restrictions
# in the rest of the parameter set
to_vary = 'n_estimators'
grid_search(clf.grid_scores_, to_vary)
plt.show()
# you can also use bars
grid_search(clf.grid_scores_, to_vary, kind='bar')
plt.show()

# changing a categorical variable
# without any constraints
to_vary = 'criterion'
grid_search(clf.grid_scores_, to_vary)
plt.show()
# bar
grid_search(clf.grid_scores_, to_vary, kind='bar')
plt.show()

# varying a numerical parameter but constraining
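# The snippet is truncated here. A minimal sketch of the constrained call,
# assuming the same subset style used in the test examples above (the exact
# values are illustrative, not from the original source):
to_vary = 'n_estimators'
to_keep = {'max_features': 'sqrt'}
grid_search(clf.grid_scores_, to_vary, to_keep, kind='bar')
plt.show()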
Example 15
def test_raise_exception_when_passing_repeated_parameters(self):
    with self.assertRaises(ValueError):
        change = ['n_estimators', 'n_estimators']
        plot.grid_search(grid_scores, change=change, subset=None)
Example 16
def test_single_numeric_line_with_tuple():
    change = ('n_estimators')
    plot.grid_search(grid_scores, change, kind='line')
Example 17
def test_single_numeric_line(grid_search_3_params):
    change = 'n_estimators'
    plot.grid_search(grid_search_3_params.cv_results_, change, kind='line')
Example 18
def test_single_numeric_bar():
    change = 'n_estimators'
    plot.grid_search(grid_scores, change, kind='bar')
Example 19
def test_can_send_tuple_len_one(grid_search_3_params):
    change = ('n_estimators')
    plot.grid_search(grid_search_3_params.cv_results_, change)
Example 20
        "classifier", "params", "mean_fit_time", "mean_score_time",
        "mean_test_score"
    ]]
    df = df.append(df_grid_scores)

df.to_csv(root_path / "reports/classifier_selection.csv", index=False)
#%% Evaluation
# The selected hyperparameters for the RandomForestClassifier did not prove
# useful: there is very little difference in performance between the different
# parameter sets, which the variance calculation below confirms.
print("variance of 'mean_test_score' arr results : {}".format(
    np.var(grid_scores.get("RandomForestClassifier", "")["mean_test_score"])))
# criterion: gini
ax = plot.grid_search(grid_scores.get("RandomForestClassifier", ""),
                      change=('classifier__max_depth',
                              'classifier__n_estimators'),
                      subset={'classifier__criterion': "gini"})
fig = ax.get_figure()
fig.savefig(root_path / 'reports/figures/rf__gini.png')
fig.clear()

# criterion: entropy
ax = plot.grid_search(grid_scores.get("RandomForestClassifier", ""),
                      change=('classifier__max_depth',
                              'classifier__n_estimators'),
                      subset={'classifier__criterion': "entropy"})
fig = ax.get_figure()
fig.savefig(root_path / 'reports/figures/rf__entropy.png')
fig.clear()

# differences between kernels
Example 21
def test_single_numeric_restricted_single():
    change = 'n_estimators'
    subset = {'max_features': 'sqrt', 'criterion': 'gini'}
    plot.grid_search(grid_scores, change, subset, kind='bar')
Example 23
def test_raise_exception_when_invalid_value_in_subset(self):
    with self.assertRaises(ValueError):
        change = 'n_estimators'
        subset = {'criterion': 'not_a_value'}
        plot.grid_search(grid_scores, change=change, subset=subset)
Example 24
def test_list_with_len_three_raises_exception(grid_search_3_params):
    l = ['a', 'b', 'c']
    with pytest.raises(ValueError):
        plot.grid_search(grid_search_3_params.cv_results_, l)
Example 25
def test_can_send_string(grid_search_3_params):
    change = 'n_estimators'
    plot.grid_search(grid_search_3_params.cv_results_, change)
Example 26
parameters = {
    'n_estimators': [1, 10, 50, 100],
    'criterion': ['gini', 'entropy'],
    'max_features': ['sqrt', 'log2'],
}

est = RandomForestClassifier()
clf = GridSearchCV(est, parameters, cv=5)

X, y = datasets.make_classification(1000, 10, n_informative=5, class_sep=0.7)
clf.fit(X, y)

# changing numeric parameter without any restrictions
# in the rest of the parameter set
grid_search(clf.cv_results_, change='n_estimators')
plt.show()

# you can also use bars
grid_search(clf.cv_results_, change='n_estimators', kind='bar')
plt.show()

# changing a categorical variable without any constraints
grid_search(clf.cv_results_, change='criterion')
plt.show()

# bar
grid_search(clf.cv_results_, change='criterion', kind='bar')
plt.show()

# varying a numerical parameter but constraining
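# The snippet is cut off here; a minimal sketch of the constrained variant,
# following the subset usage shown in the test examples above (the values are
# illustrative, not from the original source):
grid_search(clf.cv_results_, change='n_estimators',
            subset={'max_features': 'sqrt', 'criterion': 'gini'},
            kind='bar')
plt.show()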
Example 27
def test_single_categorial_bar(grid_search_3_params):
    change = 'n_estimators'
    plot.grid_search(grid_search_3_params.cv_results_, change, kind='bar')
Example 28
module_ = importlib.import_module(module_name)
class_ = getattr(module_, class_name)
clf = class_()

df = pd.read_parquet(str(upstream['join']))
X = df.drop('target', axis='columns')
y = df.target

# Perform grid search over the passed parameters
grid = GridSearchCV(clf, model_params, n_jobs=-1, cv=2)

# We want to estimate generalization performance *and* tune hyperparameters
# so we are using nested cross-validation
y_pred = cross_val_predict(grid, X, y)

print(classification_report(y, y_pred))

plot.confusion_matrix(y, y_pred)

# find best params
grid.fit(X, y)
grid.best_params_

plot.grid_search(grid.cv_results_, change=list(model_params))

best = grid.best_estimator_
best

with open(product['model'], 'wb') as f:
    pickle.dump(best, f)
Example 29
def test_single_categorial_bar():
    change = 'n_estimators'
    plot.grid_search(grid_scores, change, kind='bar')
Example 30
def test_subset_can_be_none_when_parameter_set_is_fully_specified(
        grid_search_2_params):
    change = ('n_estimators', 'criterion')
    plot.grid_search(grid_search_2_params.cv_results_,
                     change=change,
                     subset=None)
Example 31
def test_none_change_raises_exception(grid_search_3_params):
    with pytest.raises(ValueError):
        plot.grid_search(grid_search_3_params.cv_results_, None)