def test_raise_exception_when_invalid_value_in_subset(grid_search_3_params):
    with pytest.raises(ValueError):
        change = 'n_estimators'
        subset = {'criterion': 'not_a_value'}
        plot.grid_search(grid_search_3_params.cv_results_, change=change,
                         subset=subset)
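These tests receive grid_search_3_params (and, further down, grid_search_2_params / grid_search_4_params) as pytest fixtures that are not shown here. A minimal sketch of what such a fixture could look like, assuming a fitted GridSearchCV over n_estimators, criterion, and max_features with illustrative values:

import pytest
from sklearn import datasets
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV


@pytest.fixture
def grid_search_3_params():
    # illustrative parameter grid; the actual fixture in the source may differ
    parameters = {
        'n_estimators': [1, 10, 50],
        'criterion': ['gini', 'entropy'],
        'max_features': ['sqrt', 'log2'],
    }
    X, y = datasets.make_classification(200, 10, n_informative=5,
                                        class_sep=0.7, random_state=0)
    clf = GridSearchCV(RandomForestClassifier(random_state=0), parameters,
                       cv=3)
    clf.fit(X, y)
    return clf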
def test_raise_exception_when_parameter_does_not_exist(grid_search_3_params):
    with pytest.raises(ValueError):
        change = ('this_is_not_a_parameter')
        subset = {'criterion': 'gini', 'max_features': 'sqrt'}
        plot.grid_search(grid_search_3_params.cv_results_, change=change,
                         subset=subset)
def test_single_numeric_partially_restricted(grid_search_3_params):
    change = 'n_estimators'
    subset = {'max_features': 'sqrt'}
    plot.grid_search(grid_search_3_params.cv_results_, change, subset,
                     kind='bar')
def test_raise_exception_when_passing_repeated_parameters(
        grid_search_3_params):
    with pytest.raises(ValueError):
        change = ['n_estimators', 'n_estimators']
        plot.grid_search(grid_search_3_params.cv_results_, change=change,
                         subset=None)
def test_single_numeric_partially_restricted(grid_search_3_params):
    to_vary = 'n_estimators'
    to_keep = {'max_features': 'sqrt'}
    plot.grid_search(grid_search_3_params.cv_results_, to_vary, to_keep,
                     kind='bar')
def test_single_numeric_restricted_multi(grid_search_3_params):
    change = 'n_estimators'
    subset = {'max_features': ['sqrt', 'log2'], 'criterion': 'gini'}
    plot.grid_search(grid_search_3_params.cv_results_, change, subset,
                     kind='bar')
def test_double_ignores_kind_line(grid_search_3_params):
    change = ('n_estimators', 'criterion')
    subset = {'max_features': 'sqrt'}
    plot.grid_search(grid_search_3_params.cv_results_, change, subset,
                     kind='line')
def test_raise_exception_when_parameter_set_is_not_fully_specified(
        grid_search_3_params):
    with pytest.raises(ValueError):
        change = ('n_estimators', 'criterion')
        plot.grid_search(grid_search_3_params.cv_results_, change=change,
                         subset=None)
def test_raise_exception_when_parameter_subset_matches_more_than_one_group(
        grid_search_4_params):
    with pytest.raises(ValueError):
        change = ('n_estimators', 'criterion')
        subset = {'min_samples_split': 2}
        plot.grid_search(grid_search_4_params.cv_results_, change=change,
                         subset=subset)
def test_raise_exception_when_parameter_subset_matches_more_than_one_group(
        self):
    with self.assertRaises(ValueError):
        change = ('n_estimators', 'criterion')
        subset = {'min_samples_split': 2}
        plot.grid_search(grid_scores_4_params, change=change, subset=subset)
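For contrast, a sketch of the non-raising case: if the subset pins every parameter outside change down to a single value, only one group of results remains and the call should succeed. The extra key and its values below are assumptions about grid_search_4_params, not taken from the source.

def test_no_exception_when_subset_identifies_single_group(
        grid_search_4_params):
    change = ('n_estimators', 'criterion')
    # assumed: the fourth parameter is max_features and 'sqrt' is in its grid
    subset = {'min_samples_split': 2, 'max_features': 'sqrt'}
    plot.grid_search(grid_search_4_params.cv_results_, change=change,
                     subset=subset)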
def plot_gridsearch(namesCV_, change_, grid_search_models_, output_, suffix_):
    for nameCV, change, gridCV in zip(namesCV_, change_, grid_search_models_):
        figure = plt.figure(figsize=(10, 10))
        plot.grid_search(gridCV.grid_scores_, change=change, kind='bar')
        plt.title('Grid search results ' + nameCV, fontsize=17)
        plt.ylim(-0.3, 0)
        plt.ylabel('negative mean squared error', fontsize=17)
        plt.xlabel(change, fontsize=17)
        plotname = output_ + "/GridSearchResults" + nameCV + suffix_ + ".png"
        plt.savefig(plotname)
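A hedged usage sketch for the helper above, showing the three parallel lists it zips over. The model names and the fitted grid-search objects (rf_grid, gb_grid) are placeholders, and each object is expected to expose the old-style grid_scores_ attribute:

# illustrative call; rf_grid and gb_grid are previously fitted grid searches
plot_gridsearch(namesCV_=['RandomForest', 'GradientBoosting'],
                change_=['n_estimators', 'learning_rate'],
                grid_search_models_=[rf_grid, gb_grid],
                output_='plots',
                suffix_='_cv5')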
def grid_search_xgb(features, labels, test_feature):
    os.makedirs(params_path)
    tuning_params = {
        "max_depth": [3, 4],  # [3,4,5,6]
        "min_child_weight": [1, 2],
        "n_estimators": [450, 460, 480],  # [350,400,450,480,500]
        "gamma": [0, 0.1, 0.2],  # [0,0.1,0.2]
        "subsample": [0.7, 0.6],  # [0.9,0.8,0.7]
        "colsample_bytree": [0.9, 0.8, 0.7],  # [0.9,0.8,0.7]
        "reg_alpha": [0.01, 0.05, 0.06, 0.07],
        "scale_pos_weight": [1, 10, 20],
        "learning_rate": [i * 0.01 for i in range(10)],
    }
    score = ['roc_auc']
    start = time.time()
    print("# Tuning hyper-parameters for %s" % score)
    print("\nProcessing XGB model")
    # clf = XGBClassifier(objective="binary:logistic", n_estimators=450,
    #                     max_depth=4, gamma=0, reg_alpha=0, subsample=0.8,
    #                     min_child_weight=1, colsample_bytree=0.8,
    #                     learning_rate=0.1, scale_pos_weight=1, n_jobs=-1)
    # clf = XGBClassifier(objective="binary:logistic", n_jobs=-1, **tuning_params)
    # `pipe` is assumed to be defined elsewhere in the module
    clf = GridSearchCV(pipe, tuning_params, cv=5, scoring=score[0])
    clf.fit(features, labels)
    print("Best parameters set found on development set:")
    bst_params = clf.best_params_
    bst_score = clf.best_score_
    bst_estimator = clf.best_estimator_
    with open(params_path + "XGB_params.txt", 'a') as f:
        f.write("************************" + "\n" + str(bst_estimator) + "\n"
                + str(bst_params) + "\n" + str(bst_score))
    print("Found best params {}, with best roc {}".format(bst_params,
                                                          bst_score))
    print("XGB model complete")
    plot.grid_search(clf.grid_scores_, change='max_depth', kind='bar')
    plt.savefig(params_path + "XGB_params_{}.png".format(round(bst_score, 2)))
    end = time.time()
    print(">>>>Duration<<<< : {}min ".format(round((end - start) / 60, 2)))
    return bst_estimator
def draw_heatmap(self, google, model, params, kernel):
    if google:
        path = "results-quickdraw/"
    else:
        path = "results/"
    fig = plot.grid_search(model.grid_scores_, change=params)
    fig.get_figure().savefig(path + self.timestamp + "_" + kernel + ".pdf")
parameters = {
    'n_estimators': [1, 10, 50, 100],
    'criterion': ['gini', 'entropy'],
    'max_features': ['sqrt', 'log2'],
}

est = RandomForestClassifier()
clf = GridSearchCV(est, parameters, cv=5)

# clf.fit(iris.data, iris.target)
data = datasets.make_classification(1000, 10, 5, class_sep=0.7)
clf.fit(data[0], data[1])
grid_scores = clf.grid_scores_

# changing a numeric parameter without any restrictions
# in the rest of the parameter set
to_vary = 'n_estimators'
grid_search(clf.grid_scores_, to_vary)
plt.show()

# you can also use bars
grid_search(clf.grid_scores_, to_vary, kind='bar')
plt.show()

# changing a categorical variable
# without any constraints
to_vary = 'criterion'
grid_search(clf.grid_scores_, to_vary)
plt.show()

# bar
grid_search(clf.grid_scores_, to_vary, kind='bar')
plt.show()

# varying a numerical parameter but constraining
def test_raise_exception_when_passing_repeated_parameters(self):
    with self.assertRaises(ValueError):
        change = ['n_estimators', 'n_estimators']
        plot.grid_search(grid_scores, change=change, subset=None)
def test_single_numeric_line_with_tuple():
    change = ('n_estimators')
    plot.grid_search(grid_scores, change, kind='line')
def test_single_numeric_line(grid_search_3_params):
    change = 'n_estimators'
    plot.grid_search(grid_search_3_params.cv_results_, change, kind='line')
def test_single_numeric_bar():
    change = 'n_estimators'
    plot.grid_search(grid_scores, change, kind='bar')
def test_can_send_tuple_len_one(grid_search_3_params):
    change = ('n_estimators')
    plot.grid_search(grid_search_3_params.cv_results_, change)
"classifier", "params", "mean_fit_time", "mean_score_time", "mean_test_score" ]] df = df.append(df_grid_scores) df.to_csv(root_path / "reports/classifier_selection.csv", index=False) #%% Evaluation # The selected hyperparameters for the RandomForestClassifier didn't prove themselves # useful, for atleast showing differences between iterations. There's very little # difference in performance between different parameter sets. Which is proven # by the variance calculation below. print("variance of 'mean_test_score' arr results : {}".format( np.var(grid_scores.get("RandomForestClassifier", "")["mean_test_score"]))) # criterion: gini ax = plot.grid_search(grid_scores.get("RandomForestClassifier", ""), change=('classifier__max_depth', 'classifier__n_estimators'), subset={'classifier__criterion': "gini"}) fig = ax.get_figure() fig.savefig(root_path / 'reports/figures/rf__gini.png') fig.clear() # criterion: entropy ax = plot.grid_search(grid_scores.get("RandomForestClassifier", ""), change=('classifier__max_depth', 'classifier__n_estimators'), subset={'classifier__criterion': "entropy"}) fig = ax.get_figure() fig.savefig(root_path / 'reports/figures/rf__entropy.png') fig.clear() # differences between kernels
def test_single_numeric_restricted_single():
    change = 'n_estimators'
    subset = {'max_features': 'sqrt', 'criterion': 'gini'}
    plot.grid_search(grid_scores, change, subset, kind='bar')
def test_raise_exception_when_invalid_value_in_subset(self):
    with self.assertRaises(ValueError):
        change = 'n_estimators'
        subset = {'criterion': 'not_a_value'}
        plot.grid_search(grid_scores, change=change, subset=subset)
def test_list_with_len_three_raises_exception(grid_search_3_params):
    l = ['a', 'b', 'c']
    with pytest.raises(ValueError):
        plot.grid_search(grid_search_3_params.cv_results_, l)
def test_can_send_string(grid_search_3_params):
    change = 'n_estimators'
    plot.grid_search(grid_search_3_params.cv_results_, change)
parameters = {
    'n_estimators': [1, 10, 50, 100],
    'criterion': ['gini', 'entropy'],
    'max_features': ['sqrt', 'log2'],
}

est = RandomForestClassifier()
clf = GridSearchCV(est, parameters, cv=5)

X, y = datasets.make_classification(1000, 10, n_informative=5, class_sep=0.7)
clf.fit(X, y)

# changing a numeric parameter without any restrictions
# in the rest of the parameter set
grid_search(clf.cv_results_, change='n_estimators')
plt.show()

# you can also use bars
grid_search(clf.cv_results_, change='n_estimators', kind='bar')
plt.show()

# changing a categorical variable without any constraints
grid_search(clf.cv_results_, change='criterion')
plt.show()

# bar
grid_search(clf.cv_results_, change='criterion', kind='bar')
plt.show()

# varying a numerical parameter but constraining
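The example breaks off at its final comment. A plausible continuation, constraining the rest of the parameter set through subset (the values below are picked from the grid defined above for illustration, not taken from the original snippet):

# hold max_features and criterion fixed while varying n_estimators
grid_search(clf.cv_results_, change='n_estimators',
            subset={'max_features': 'sqrt', 'criterion': 'gini'},
            kind='bar')
plt.show()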
def test_single_categorial_bar(grid_search_3_params):
    change = 'n_estimators'
    plot.grid_search(grid_search_3_params.cv_results_, change, kind='bar')
module_ = importlib.import_module(module_name)
class_ = getattr(module_, class_name)
clf = class_()

df = pd.read_parquet(str(upstream['join']))
X = df.drop('target', axis='columns')
y = df.target

# Perform grid search over the passed parameters
grid = GridSearchCV(clf, model_params, n_jobs=-1, cv=2)

# We want to estimate generalization performance *and* tune hyperparameters,
# so we are using nested cross-validation
y_pred = cross_val_predict(grid, X, y)
print(classification_report(y, y_pred))
plot.confusion_matrix(y, y_pred)

# find best params
grid.fit(X, y)
grid.best_params_

plot.grid_search(grid.cv_results_, change=list(model_params))

best = grid.best_estimator_
best

with open(product['model'], 'wb') as f:
    pickle.dump(best, f)
def test_single_categorial_bar():
    change = 'n_estimators'
    plot.grid_search(grid_scores, change, kind='bar')
def test_subset_can_be_none_when_parameter_set_is_fully_specified(
        grid_search_2_params):
    change = ('n_estimators', 'criterion')
    plot.grid_search(grid_search_2_params.cv_results_, change=change,
                     subset=None)
def test_none_change_raises_exception(grid_search_3_params):
    with pytest.raises(ValueError):
        plot.grid_search(grid_search_3_params.cv_results_, None)