def fit_model(X_train, y_train, is_halving_grid_search_cv: bool):
    if is_halving_grid_search_cv:
        param_grid = {
            'max_depth': [10, 11, 12, 13, 14, 15],
            'min_samples_split': [3, 5, 8, 10, 15, 20, 30]
        }
        base_estimator = RandomForestClassifier(n_estimators=100,
                                                class_weight='balanced_subsample',
                                                verbose=0, n_jobs=-1,
                                                random_state=2021)
        # n_estimators serves as the halving resource, so the value set on
        # the base estimator is overridden during the search (capped at
        # max_resources=20); the refit best_estimator_ keeps the winning
        # value.
        grid_search = HalvingGridSearchCV(base_estimator, param_grid,
                                          cv=5, factor=2,
                                          resource='n_estimators',
                                          max_resources=20,
                                          random_state=2021, n_jobs=-1,
                                          verbose=1)
        grid_search.fit(X_train, y_train)
        model = grid_search.best_estimator_
    else:
        model = RandomForestClassifier(random_state=20)
        model.fit(X_train, y_train)
    # print("Train set score: {:.3f}".format(model.score(X_train, y_train)))
    # print("Test set score: {:.3f}".format(model.score(X_test, y_test)))
    return model
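# A minimal usage sketch for fit_model above (an assumption, not part of the
# original snippet): successive halving is still experimental, so the
# enabling import must come first, and the data here is synthetic for
# illustration.
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.experimental import enable_halving_search_cv  # noqa
from sklearn.model_selection import HalvingGridSearchCV, train_test_split

X, y = make_classification(n_samples=500, random_state=2021)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2021)
model = fit_model(X_train, y_train, is_halving_grid_search_cv=True)
print("Test set score: {:.3f}".format(model.score(X_test, y_test)))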
def test_resource_parameter(Est):
    # Test the resource parameter
    n_samples = 1000
    X, y = make_classification(n_samples=n_samples, random_state=0)
    param_grid = {'a': [1, 2], 'b': list(range(10))}
    base_estimator = FastClassifier()
    sh = Est(base_estimator, param_grid, cv=2, resource='c',
             max_resources=10, factor=3)
    sh.fit(X, y)
    assert set(sh.n_resources_) == set([1, 3, 9])
    for r_i, params, param_c in zip(sh.cv_results_['n_resources'],
                                    sh.cv_results_['params'],
                                    sh.cv_results_['param_c']):
        assert r_i == params['c'] == param_c

    with pytest.raises(
            ValueError,
            match='Cannot use resource=1234 which is not supported '):
        sh = HalvingGridSearchCV(base_estimator, param_grid, cv=2,
                                 resource='1234', max_resources=10)
        sh.fit(X, y)

    with pytest.raises(
            ValueError,
            match='Cannot use parameter c as the resource since it is part '
                  'of the searched parameters.'):
        param_grid = {'a': [1, 2], 'b': [1, 2], 'c': [1, 3]}
        sh = HalvingGridSearchCV(base_estimator, param_grid, cv=2,
                                 resource='c', max_resources=10)
        sh.fit(X, y)
def fit(self, X, y=None, *, target_col=None):
    """Fit estimator.

    Requires either specifying the target as a separate 1d array or Series y
    (in scikit-learn fashion) or as a column of the dataframe X specified by
    target_col. If y is specified, X is assumed not to contain the target.

    Parameters
    ----------
    X : DataFrame
        Input features. If target_col is specified, X also includes the
        target.
    y : Series or numpy array, optional
        Target. You need to specify either y or target_col.
    target_col : string or int, optional
        Column name of the target if it is included in X.
    """
    # copy and paste from above?!
    if (y is None and target_col is None) or (
            y is not None and target_col is not None):
        raise ValueError(
            "Need to specify exactly one of y and target_col.")
    X, y = _validate_Xyt(X, y, target_col, do_clean=False)
    if not isinstance(X, pd.DataFrame):
        X = pd.DataFrame(X)
    types = detect_types(X, type_hints=self.type_hints)
    self.feature_names_ = X.columns
    self.types_ = types
    cv = 5
    factor = 3
    y, self.scoring_ = self._preprocess_target(y)
    self.log_ = []
    # reimplement cross-validation so we only do preprocessing once
    pipe = Pipeline([
        ('preprocessing',
         EasyPreprocessor(verbose=self.verbose, types=types)),
        ('classifier', DummyClassifier())])
    estimators = self._get_estimators()
    param_grid = [{'classifier': [est]} for est in estimators]
    gs = HalvingGridSearchCV(
        estimator=pipe, param_grid=param_grid, factor=factor,
        min_resources=self.min_resources, verbose=self.verbose, cv=cv,
        error_score='raise', scoring=self.scoring_, refit='recall_macro',
        n_jobs=self.n_jobs)
    self.search_ = gs
    with sklearn.config_context(print_changed_only=True):
        gs.fit(X, y)
    self.est_ = gs.best_estimator_
    print("best classifier: ", gs.best_params_['classifier'])
    print("best score: {:.3f}".format(gs.best_score_))
    return self
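# A usage sketch for the fit method above ("SimpleClassifier" is a
# hypothetical name for the enclosing class, and the toy dataframe is made
# up for illustration). Either pass y separately or name the target column,
# never both:
import pandas as pd

df = pd.DataFrame({"f0": [0.1, 0.4, 0.2, 0.9, 0.5, 0.7],
                   "f1": ["a", "b", "a", "b", "a", "b"],
                   "label": [0, 1, 0, 1, 0, 1]})
clf = SimpleClassifier()
clf.fit(df, target_col="label")                   # target inside X
# clf.fit(df.drop(columns="label"), df["label"])  # or scikit-learn style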
def half_grid_search_pipeline():
    data = generate_dataset()
    svc = svm.SVC()
    params = {"kernel": ["linear", "rbf", "sigmoid"],
              "C": list(range(1, 20))}
    classifier = HalvingGridSearchCV(svc, params, scoring="accuracy",
                                     factor=3)
    start_time = time.time()
    classifier.fit(data.data, data.target)
    print(f"Time taken for fitting: {time.time() - start_time} seconds")
    print("Best Params:", classifier.best_params_)
    print("Best CV Score:", classifier.best_score_)
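# After fitting, the halving search exposes per-iteration bookkeeping. A
# small inspection helper (assumed, not in the original) makes the effect of
# factor=3 visible: each iteration keeps roughly the best third of the
# candidates while tripling the per-candidate resource.
def inspect_halving(classifier):
    for i, (n_cand, n_res) in enumerate(zip(classifier.n_candidates_,
                                            classifier.n_resources_)):
        print(f"iter {i}: {n_cand} candidates, {n_res} samples each")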
def _perform_gridsearch(self, models_param_grid, scoring, cv=5):
    """Perform grid search on provided models_param_grid and return results.

    Grid search is performed on the provided Models space ('Model class':
    param_grid dict pairs), with scoring used as the sole metric to decide
    which Models perform well and which perform poorly.
    HalvingGridSearchCV is used instead of a regular GridSearchCV to
    potentially save time.

    Note: grid search might fail with NotFittedError - "All estimators
    failed to fit". This can happen when parameters for the same type of
    Model are provided incorrectly (e.g. DecisionTreeClassifier instantiated
    with criterion="mae", which belongs to DecisionTreeRegressor). Changing
    param_grid solves this issue.

    Args:
        models_param_grid (dict): 'Model class': param_grid dict pairs
        scoring (function): sklearn scoring function
        cv (int, optional): number of folds, defaults to 5

    Returns:
        tuple: (
            list of (Model class, best params for the Model class) tuples,
            dict of 'Model class': cv_results_ from the search object
        )

    Raises:
        NotFittedError: when the search object raises NotFittedError
    """
    all_results = {}
    best_of_their_class = []

    for model, params in models_param_grid.items():
        # not every model requires random_state, e.g. KNeighborsClassifier
        if "random_state" in model().get_params().keys():
            params["random_state"] = [self.random_state]

        # https://scikit-learn.org/stable/modules/model_evaluation.html#defining-your-scoring-strategy-from-metric-functions
        sorting_order = reverse_sorting_order(obj_name(scoring))

        created_model = self._wrap_model(model())

        # The search will fail with NotFittedError("All estimators failed to
        # fit") when an argument provided in the param grid is invalid for a
        # given model (even one bad combination triggers it).
        clf = HalvingGridSearchCV(
            created_model,
            params,
            scoring=make_scorer(scoring, greater_is_better=sorting_order),
            cv=cv,
            error_score=0,  # to ignore errors that might happen
            random_state=self.random_state)

        try:
            clf.fit(self.X_train, self.y_train)
        except NotFittedError:
            # warn the user, as the NotFittedError might be misleading here
            params_str = ["{}: {}".format(key, item)
                          for key, item in params.items()]
            warn_msg = ("WARNING: {} might potentially have incorrect "
                        "params provided: {}".format(model, params_str))
            warnings.warn(warn_msg)
            raise

        all_results[model] = clf.cv_results_
        best_of_their_class.append((model, clf.best_params_))

    return best_of_their_class, all_results
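# The make_scorer/error_score pattern above in isolation - a runnable sketch
# with a concrete metric (f1_score stands in for the scoring argument; the
# data and grid are assumptions for illustration):
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.experimental import enable_halving_search_cv  # noqa
from sklearn.metrics import f1_score, make_scorer
from sklearn.model_selection import HalvingGridSearchCV

X, y = make_classification(n_samples=300, random_state=0)
clf = HalvingGridSearchCV(
    RandomForestClassifier(random_state=0),
    {"max_depth": [2, 4], "min_samples_split": [2, 10]},
    scoring=make_scorer(f1_score, greater_is_better=True),
    error_score=0,  # failed fits score 0 instead of raising
    random_state=0)
clf.fit(X, y)
print(clf.best_params_)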
# estimator, and compute the time required to train a
# :class:`~sklearn.model_selection.HalvingGridSearchCV` instance, as well as a
# :class:`~sklearn.model_selection.GridSearchCV` instance.

rng = np.random.RandomState(0)
X, y = datasets.make_classification(n_samples=1000, random_state=rng)

gammas = [1e-1, 1e-2, 1e-3, 1e-4, 1e-5, 1e-6, 1e-7]
Cs = [1, 10, 100, 1e3, 1e4, 1e5]
param_grid = {"gamma": gammas, "C": Cs}

clf = SVC(random_state=rng)

tic = time()
gsh = HalvingGridSearchCV(
    estimator=clf, param_grid=param_grid, factor=2, random_state=rng
)
gsh.fit(X, y)
gsh_time = time() - tic

tic = time()
gs = GridSearchCV(estimator=clf, param_grid=param_grid)
gs.fit(X, y)
gs_time = time() - tic

# %%
# We now plot heatmaps for both search estimators.


def make_heatmap(ax, gs, is_sh=False, make_cbar=False):
    """Helper to make a heatmap."""
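# %%
# The body of ``make_heatmap`` is truncated above. A minimal stand-in (an
# assumption, not the original helper) pivots ``cv_results_`` into a
# gamma x C matrix of mean test scores; for the halving search, only the
# last iteration is kept:
import pandas as pd


def plot_mean_test_scores(ax, search):
    results = pd.DataFrame(search.cv_results_)
    if "iter" in results:
        # halving searches report one row per (candidate, iteration)
        results = results[results["iter"] == results["iter"].max()]
    scores = results.pivot_table(index="param_gamma", columns="param_C",
                                 values="mean_test_score")
    im = ax.imshow(scores.values)
    ax.set_xticks(range(len(scores.columns)))
    ax.set_xticklabels(scores.columns)
    ax.set_yticks(range(len(scores.index)))
    ax.set_yticklabels(scores.index)
    ax.set_xlabel("C")
    ax.set_ylabel("gamma")
    return im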
(scores.mean(), scores.std()))

from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.experimental import enable_halving_search_cv  # noqa
from sklearn.model_selection import HalvingGridSearchCV
import pandas as pd

param_grid = {'max_depth': [3, 5, 10],
              'min_samples_split': [2, 5, 10]}
base_estimator = RandomForestClassifier(random_state=0)
X, y = make_classification(n_samples=1000, random_state=0)
# use halving to make the search faster
sh = HalvingGridSearchCV(base_estimator, param_grid, cv=5,
                         factor=2, resource='n_estimators',
                         max_resources=30).fit(X, y)
print(sh.best_estimator_)

import numpy as np
from sklearn.model_selection import validation_curve
from sklearn.datasets import load_iris
from sklearn.linear_model import Ridge

np.random.seed(0)
X, y = load_iris(return_X_y=True)
indices = np.arange(y.shape[0])
np.random.shuffle(indices)
X, y = X[indices], y[indices]
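# A plausible continuation of the validation-curve setup above (assumed from
# the scikit-learn docs, not part of the original): sweep Ridge's alpha and
# compare training scores against validation scores.
train_scores, valid_scores = validation_curve(
    Ridge(), X, y, param_name="alpha",
    param_range=np.logspace(-7, 3, 3), cv=5)
print("train:", train_scores.mean(axis=1))
print("valid:", valid_scores.mean(axis=1))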
if __name__ == '__main__':
    X_train, y_train, X_test, y_test = get_all_data()
    train_weights = [normalize_review_weight(w) for w in X_train['helpful']]

    tfidf_grid = {
        'vectorizer__lowercase': [True, False],
        'vectorizer__ngram_range': [(1, 3), (1, 4), (2, 4)],
        'vectorizer__max_df': [1.0, 0.95, 0.9, 0.85, 0.8],
        'vectorizer__min_df': [25, 50, 100, 200, 0.01, 0.05],
    }

    svm = Pipeline([('vectorizer', TfidfVectorizer()),
                    ('classifier', LinearSVC(class_weight='balanced'))])
    grid_search = HalvingGridSearchCV(svm, tfidf_grid, random_state=42,
                                      verbose=10, n_jobs=12)
    # fit params prefixed with the step name are routed to that pipeline step
    grid_search.fit(X_train['reviewText'], y_train,
                    classifier__sample_weight=train_weights)
    print(grid_search.best_params_)
    print(score_metric(y_test,
                       grid_search.best_estimator_.predict(
                           X_test['reviewText'])))

    with open('model/sklearn-svc.pkl', 'wb') as out:
        pickle.dump(grid_search.best_estimator_, out)

    bayes = Pipeline([('vectorizer', TfidfVectorizer()),
param_grid = {
    "max_depth": [3, 4, 5, 7],
    "gamma": [0, 0.25, 1],
    "reg_lambda": [0, 1, 10],
    "scale_pos_weight": [1, 3, 5],
    "subsample": [0.8],          # fix subsample
    "colsample_bytree": [0.5],   # fix colsample_bytree
}

from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV

halving_cv = HalvingGridSearchCV(
    xgb_cl, param_grid, scoring="roc_auc", n_jobs=-1,
    min_resources="exhaust", factor=3
)

_ = halving_cv.fit(X, y)

# deal with class imbalance
counts = pd.Series(y.flatten()).value_counts()
scale_pos_weight = counts["No"] / counts["Yes"]

param_grid_2 = {
    "max_depth": [3, 4, 5],
    "gamma": [5, 30, 50],
def run_ann(data_train, label_train, data_test, label_test, algo_name,
            data_name, fig_name=None, show_plots=False, plot_learning=False,
            plot_val=False, val_param="max_iter",
            val_range=range(10, 210, 10), test=False, val_lab="Iterations",
            grid_search=False, **kwargs):
    start = time.time()
    if grid_search:
        # based off sklearn example for hp tuning
        # https://scikit-learn.org/stable/modules/grid_search.html
        # define hyperparameter space to check over
        param_grid = {
            # alpha
            "alpha": [1e-3, 1e-4, 1e-5],
            # learning rate
            "learning_rate_init": [1e-2, 1e-3, 1e-4]
        }
        clf = MLPClassifier(hidden_layer_sizes=(85,), **kwargs,
                            max_iter=1000, early_stopping=True,
                            random_state=0)
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore", category=ConvergenceWarning,
                                    module="sklearn")
            sh = HalvingGridSearchCV(clf, param_grid, cv=5,
                                     factor=2).fit(data_train, label_train)
        print(sh.best_estimator_)

    clf = MLPClassifier(hidden_layer_sizes=(85,), max_iter=500,
                        random_state=0, early_stopping=True,
                        **kwargs).fit(data_train, label_train)

    # based on sklearn learning curve example
    # https://scikit-learn.org/stable/auto_examples/model_selection/plot_learning_curve.html
    if plot_learning:
        # plot learning curve for current model
        title = f"Learning Curves (ANN {algo_name}) ({data_name})"
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore", category=ConvergenceWarning,
                                    module="sklearn")
            plot_learning_curve(clf, title, data_train, label_train,
                                ylim=(0, 1.01), cv=5, n_jobs=4)
        if fig_name is not None:
            now = datetime.now()
            dt_string = now.strftime("%Y-%m-%d-%H-%M-%S")
            plt.savefig(f"{fig_name}_learn_{dt_string}")

    if plot_val:
        # based off sklearn validation curve example
        # https://scikit-learn.org/stable/auto_examples/model_selection/plot_validation_curve.html
        # plot validation curve
        title = f"Validation Curve with ANN ({algo_name}) ({data_name})"
        x_lab = val_lab
        y_lab = "Score"
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore", category=ConvergenceWarning,
                                    module="sklearn")
            plot_validation_curve(clf, title, data_train, label_train,
                                  x_lab, y_lab, cv=5, param_name=val_param,
                                  param_range=val_range, ylim=(0.0, 1.1))
        if fig_name is not None:
            now = datetime.now()
            dt_string = now.strftime("%Y-%m-%d-%H-%M-%S")
            plt.savefig(f"{fig_name}_val_{dt_string}")

    if show_plots:
        plt.show()

    if test:
        print(f"ANN ({algo_name}) score: {clf.score(data_test, label_test)}")

    print(f"ANN ({algo_name}) time: {time.time() - start:.2f}")
def main():
    # tracemalloc.start()
    log_file = "./log/data_set_stats.log"

    # initialize logging
    log_dir = os.path.dirname(log_file)
    if not os.path.isdir(log_dir):
        os.makedirs(log_dir)
    logging.basicConfig(
        filename=log_file,
        filemode='w',
        format='%(asctime)s,%(msecs)d %(name)s %(levelname)s %(message)s',
        datefmt='%H:%M:%S',
        level=logging.DEBUG)

    # instead of fetching all datasets, we get all datasets from OpenML-CC18,
    # but some have missing values; the pre-processing will be performed in
    # convex_hull_stats.dataset_stats.py
    #df_datasets = lg.datasets.fetch_datasets(task="classification", min_classes=2, max_features=4000, update_data=True)
    logging.info("Loading benchmark suite OpenML-CC18...")
    benchmark_suite = openml.study.get_suite('OpenML-CC18')

    random_state = 42
    classifiers = dict()

    # let's create pipelines for classifiers that also include
    # hyperparameter tuning
    from sklearn.experimental import enable_halving_search_cv
    from sklearn.model_selection import HalvingGridSearchCV

    rf_parameter_grid = {
        'n_estimators': [10, 20, 30, 50, 100, 200, 300],
        'max_features': ['auto', 'sqrt', 'log2'],
        'max_depth': [4, 5, 6, 7, 8, None],
        'criterion': ['gini', 'entropy']
    }
    # classifiers["RandomForestHT"] = HalvingGridSearchCV(RandomForestClassifier(random_state=random_state), rf_parameter_grid, random_state=random_state)
    # classifiers["RandomForest"] = RandomForestClassifier(random_state=random_state)

    svc_parameter_grid = {
        'C': [0.1, 1.0, 10, 100, 1000],
        'kernel': ['poly'],
        'degree': [2, 3, 4, 5],
        'gamma': [1e-3, 1e-4, 'scale'],
        'coef0': [0.0, 1.0, 10.0],
    }
    #classifiers["SVCHT"] = HalvingGridSearchCV(SVC(random_state=random_state), svc_parameter_grid, random_state=random_state)
    #classifiers["SVC"] = SVC(kernel='poly', random_state=random_state)

    lr_parameter_grid = {
        'C': np.logspace(-3, 3, 7),
        'penalty': ['none', 'l2'],
    }
    # classifiers["LogisticRegressionHT"] = HalvingGridSearchCV(LogisticRegression(random_state=random_state), lr_parameter_grid, random_state=random_state)
    # classifiers["LogisticRegression"] = LogisticRegression(random_state=random_state)

    mlp_parameter_grid = {
        # tuples, not bare ints: (50) is just the int 50, which
        # MLPClassifier rejects as a hidden_layer_sizes value
        'hidden_layer_sizes': [(50,), (100,), (50, 50), (100, 50)],
        'learning_rate_init': [0.001, 0.0001],
    }
    classifiers["MLPClassifierHT"] = HalvingGridSearchCV(
        MLPClassifier(max_iter=1000, early_stopping=True,
                      random_state=random_state),
        mlp_parameter_grid, random_state=random_state)
    # classifiers["MLPClassifier"] = MLPClassifier(random_state=random_state)

    convex_hull_stats.openml_stats_all(benchmark_suite, classifiers,
                                       n_splits=10)