Beispiel #1
0
def fit_model(X_train, y_train, is_halving_grid_search_cv: bool):
    """Train a random forest on the given data and return the fitted model.

    When ``is_halving_grid_search_cv`` is true, ``max_depth`` and
    ``min_samples_split`` are tuned with a successive-halving grid search
    that uses the number of trees (``n_estimators``) as its resource, and
    the search's ``best_estimator_`` is returned.  Otherwise a forest with
    default hyper-parameters (``random_state=20``) is fitted directly.
    """
    if not is_halving_grid_search_cv:
        plain_forest = RandomForestClassifier(random_state=20)
        plain_forest.fit(X_train, y_train)
        return plain_forest

    search_space = dict(
        max_depth=list(range(10, 16)),
        min_samples_split=[3, 5, 8, 10, 15, 20, 30],
    )
    forest = RandomForestClassifier(n_estimators=100,
                                    class_weight='balanced_subsample',
                                    verbose=0,
                                    n_jobs=-1,
                                    random_state=2021)

    # The number of trees is the budget that grows between halving rounds,
    # capped at 20.
    halving_search = HalvingGridSearchCV(
        estimator=forest,
        param_grid=search_space,
        cv=5,
        factor=2,
        resource='n_estimators',
        max_resources=20,
        random_state=2021,
        n_jobs=-1,
        verbose=1,
    )
    halving_search.fit(X_train, y_train)
    return halving_search.best_estimator_
def test_resource_parameter(Est):
    """Check the ``resource`` option of a successive-halving search.

    ``Est`` is the search class under test (presumably supplied through
    pytest parametrization).  The first part verifies that estimator
    parameter ``c`` can act as the resource; the second part verifies the
    error raised for an unknown resource and for a resource that is also
    part of the parameter grid.
    """
    X, y = make_classification(n_samples=1000, random_state=0)
    grid = {'a': [1, 2], 'b': list(range(10))}
    estimator = FastClassifier()

    search = Est(estimator, grid, cv=2, resource='c', max_resources=10,
                 factor=3)
    search.fit(X, y)

    # With factor=3 and max_resources=10 the budgets are 1, 3 and 9.
    assert set(search.n_resources_) == {1, 3, 9}

    results = search.cv_results_
    columns = zip(results['n_resources'], results['params'],
                  results['param_c'])
    for r_i, params, param_c in columns:
        # The resource value must be reflected in the candidate's params.
        assert r_i == params['c'] == param_c

    # A resource that is not a parameter of the estimator is rejected.
    with pytest.raises(
            ValueError,
            match='Cannot use resource=1234 which is not supported '):
        HalvingGridSearchCV(estimator, grid, cv=2, resource='1234',
                            max_resources=10).fit(X, y)

    # The resource must not also be one of the searched parameters.
    with pytest.raises(
            ValueError,
            match='Cannot use parameter c as the resource since it is part '
                  'of the searched parameters.'):
        clashing_grid = {'a': [1, 2], 'b': [1, 2], 'c': [1, 3]}
        HalvingGridSearchCV(estimator, clashing_grid, cv=2, resource='c',
                            max_resources=10).fit(X, y)
Beispiel #3
0
    def fit(self, X, y=None, *, target_col=None):
        """Fit estimator.

        The target is given either as a separate 1d array or Series ``y``
        (scikit-learn style) or as a column of the dataframe ``X`` named by
        ``target_col`` -- exactly one of the two must be provided.
        When ``y`` is passed, ``X`` is assumed not to contain the target.

        Parameters
        ----------
        X : DataFrame
            Input features. If target_col is specified, X also includes the
            target.
        y : Series or numpy array, optional.
            Target. You need to specify either y or target_col.
        target_col : string or int, optional
            Column name of target if included in X.

        Returns
        -------
        self
            The fitted instance; the search object is stored in ``search_``
            and its best pipeline in ``est_``.
        """
        # Exactly one of y / target_col has to be supplied.
        if (y is None) == (target_col is None):
            raise ValueError(
                "Need to specify either y or target_col.")
        X, y = _validate_Xyt(X, y, target_col, do_clean=False)
        X = X if isinstance(X, pd.DataFrame) else pd.DataFrame(X)

        types = detect_types(X, type_hints=self.type_hints)
        self.feature_names_ = X.columns
        self.types_ = types

        y, self.scoring_ = self._preprocess_target(y)
        self.log_ = []

        # The DummyClassifier is only a placeholder: every candidate model is
        # swapped into the 'classifier' step through the parameter grid.
        pipeline = Pipeline([
            ('preprocessing',
             EasyPreprocessor(verbose=self.verbose, types=types)),
            ('classifier', DummyClassifier()),
        ])
        candidate_grid = [{'classifier': [candidate]}
                          for candidate in self._get_estimators()]

        search = HalvingGridSearchCV(
            estimator=pipeline,
            param_grid=candidate_grid,
            factor=3,
            min_resources=self.min_resources,
            verbose=self.verbose,
            cv=5,
            error_score='raise',
            scoring=self.scoring_,
            refit='recall_macro',
            n_jobs=self.n_jobs,
        )
        self.search_ = search
        with sklearn.config_context(print_changed_only=True):
            search.fit(X, y)
        self.est_ = search.best_estimator_

        print("best classifier: ", search.best_params_['classifier'])
        print("best score: {:.3f}".format(search.best_score_))

        return self
def half_grid_search_pipeline():
    """Tune an SVC with a successive-halving grid search and report results.

    Searches over three kernels and ``C`` in 1..19, scoring by accuracy
    with ``factor=3``, then prints the wall-clock fit time, the best
    parameters and the best cross-validation score.
    """
    dataset = generate_dataset()
    search_space = {
        "kernel": ["linear", "rbf", "sigmoid"],
        "C": list(range(1, 20)),
    }
    search = HalvingGridSearchCV(svm.SVC(),
                                 search_space,
                                 scoring="accuracy",
                                 factor=3)

    started = time.time()
    search.fit(dataset.data, dataset.target)
    print(f"Time taken for fitting {time.time() - started} seconds")

    print("Best Params", search.best_params_)
    print("Best CV Score", search.best_score_)
def test_resource_parameter(Est):
    """Exercise the ``resource`` option of a successive-halving search.

    ``Est`` is the search class being tested (presumably injected by a
    pytest parametrization).  Covers the happy path where estimator
    parameter ``c`` is the resource, plus the two ``ValueError`` cases:
    an unsupported resource name and a resource that also appears in the
    parameter grid.
    """
    X, y = make_classification(n_samples=1000, random_state=0)
    candidates = {"a": [1, 2], "b": list(range(10))}
    clf = FastClassifier()

    searcher = Est(clf, candidates, cv=2, resource="c", max_resources=10,
                   factor=3)
    searcher.fit(X, y)

    # factor=3 capped at max_resources=10 gives the budgets 1, 3 and 9
    assert set(searcher.n_resources_) == {1, 3, 9}

    cv_results = searcher.cv_results_
    for n_res, combo, c_value in zip(cv_results["n_resources"],
                                     cv_results["params"],
                                     cv_results["param_c"]):
        # each candidate must have been evaluated with c == its budget
        assert n_res == combo["c"] == c_value

    unsupported_msg = "Cannot use resource=1234 which is not supported "
    with pytest.raises(ValueError, match=unsupported_msg):
        HalvingGridSearchCV(clf,
                            candidates,
                            cv=2,
                            resource="1234",
                            max_resources=10).fit(X, y)

    searched_msg = ("Cannot use parameter c as the resource since it is part "
                    "of the searched parameters.")
    with pytest.raises(ValueError, match=searched_msg):
        overlapping = {"a": [1, 2], "b": [1, 2], "c": [1, 3]}
        HalvingGridSearchCV(clf,
                            overlapping,
                            cv=2,
                            resource="c",
                            max_resources=10).fit(X, y)
Beispiel #6
0
    def _perform_gridsearch(self, models_param_grid, scoring, cv=5):
        """Perform gridsearch on provided models_param_grid and return results.

        Gridsearch is performed on the provided Models space ('Model class': param_grid dict pairs) with scoring
        used as the sole metric to decide which Models perform well and which perform poorly. HalvingGridSearchCV
        is used instead of a regular GridSearchCV to possibly save time.

        The param_grid dicts inside models_param_grid are not modified: ``random_state`` is injected into a
        per-model copy, so the caller can safely reuse its grids afterwards.

        Note:
            GridSearch might fail with NotFittedError - All Models failed to fit. This might happen sometimes when
            parameters for the same type of Model are provided wrongly (e.g. DecisionTreeClassifier instanced with
            criterion: "mae" which is used in DecisionTreeRegressor). Changing param_grid solves this issue.

        Args:
            models_param_grid (dict): 'Model class': param_grid dict pairs
            scoring (function): sklearn scoring function
            cv (int, optional): number of folds, defaults to 5

        Returns:
            tuple: (
                list of (Model class, best params for the Model class) tuples,
                dict of 'Model class': cv_results_ from GridSearch object
                )

        Raises:
            NotFittedError: when GridSearch object raises NotFittedError
        """
        all_results = {}
        best_of_their_class = []

        for model, params in models_param_grid.items():
            # Shallow copy: adding "random_state" below must not leak into the caller's param grid.
            params = dict(params)

            # not every model requires random_state, e.g. KNeighborsClassifier
            if "random_state" in model().get_params():
                params["random_state"] = [self.random_state]

            # https://scikit-learn.org/stable/modules/model_evaluation.html#defining-your-scoring-strategy-from-metric-functions
            sorting_order = reverse_sorting_order(obj_name(scoring))

            created_model = self._wrap_model(model())

            # GridSearch will fail with NotFittedError("All estimators failed to fit") when argument provided
            # in the param grid is incorrect for a given model (even one combination will trigger it).
            clf = HalvingGridSearchCV(
                created_model,
                params,
                scoring=make_scorer(scoring, greater_is_better=sorting_order),
                cv=cv,
                error_score=0,  # to ignore errors that might happen,
                random_state=self.random_state)
            try:
                clf.fit(self.X_train, self.y_train)
            except NotFittedError:
                # printing out warning for user as the NotFittedError might be misleading in this case
                params_str = [
                    "{}: {}".format(key, item) for key, item in params.items()
                ]
                warn_msg = "WARNING: {} might potentially have incorrect params provided: {}".format(
                    model, params_str)
                warnings.warn(warn_msg)
                raise

            all_results[model] = clf.cv_results_
            best_of_their_class.append((model, clf.best_params_))

        return best_of_their_class, all_results
Beispiel #7
0
# estimator, and compute the time required to train a
# :class:`~sklearn.model_selection.HalvingGridSearchCV` instance, as well as a
# :class:`~sklearn.model_selection.GridSearchCV` instance.

# One seeded RandomState instance is handed to the dataset generator, the SVC
# and the halving search below.
# NOTE(review): because the generator object is shared, reordering these calls
# would presumably change the random numbers each consumer sees -- confirm
# before restructuring.
rng = np.random.RandomState(0)
X, y = datasets.make_classification(n_samples=1000, random_state=rng)

# Search space: 7 gamma values x 6 C values = 42 candidate combinations.
gammas = [1e-1, 1e-2, 1e-3, 1e-4, 1e-5, 1e-6, 1e-7]
Cs = [1, 10, 100, 1e3, 1e4, 1e5]
param_grid = {"gamma": gammas, "C": Cs}

clf = SVC(random_state=rng)

# Time the successive-halving search (wall clock, includes the final refit
# if the search's default refit behaviour is on).
tic = time()
gsh = HalvingGridSearchCV(
    estimator=clf, param_grid=param_grid, factor=2, random_state=rng
)
gsh.fit(X, y)
gsh_time = time() - tic

# Time the exhaustive grid search on the same estimator and grid for comparison.
tic = time()
gs = GridSearchCV(estimator=clf, param_grid=param_grid)
gs.fit(X, y)
gs_time = time() - tic

# %%
# We now plot heatmaps for both search estimators.


def make_heatmap(ax, gs, is_sh=False, make_cbar=False):
    """Helper to make a heatmap."""
Beispiel #8
0
      (scores.mean(), scores.std()))

from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.experimental import enable_halving_search_cv  # noqa
from sklearn.model_selection import HalvingGridSearchCV
import pandas as pd

# Search space for the forest.  The number of trees is not listed here
# because it is used as the halving resource below.
param_grid = dict(max_depth=[3, 5, 10], min_samples_split=[2, 5, 10])
base_estimator = RandomForestClassifier(random_state=0)
X, y = make_classification(n_samples=1000, random_state=0)

# Successive halving: candidates are evaluated with a growing number of
# trees (up to 30) so that poor ones are dropped early.
sh = HalvingGridSearchCV(
    estimator=base_estimator,
    param_grid=param_grid,
    cv=5,
    factor=2,
    resource='n_estimators',
    max_resources=30,
)
sh.fit(X, y)
print(sh.best_estimator_)

import numpy as np
from sklearn.model_selection import validation_curve
from sklearn.datasets import load_iris
from sklearn.linear_model import Ridge

# Seed NumPy's global generator so the shuffle below is reproducible.
np.random.seed(0)
X, y = load_iris(return_X_y=True)

# Apply one random permutation to features and labels alike so the rows
# stay aligned.
indices = np.arange(len(y))
np.random.shuffle(indices)
X = X[indices]
y = y[indices]
# Script entry point: tune a TF-IDF + LinearSVC text pipeline with a
# successive-halving grid search, report its test score and pickle the best
# pipeline.
if __name__ == '__main__':
    X_train, y_train, X_test, y_test = get_all_data()
    # One weight per training row, derived from the 'helpful' column.
    train_weights = [normalize_review_weight(w) for w in X_train['helpful']]

    # Only the vectorizer is tuned: 2 * 3 * 5 * 6 = 180 combinations.
    # min_df mixes integer and float values.
    tfidf_grid = {
        'vectorizer__lowercase': [True, False],
        'vectorizer__ngram_range': [(1, 3), (1, 4), (2, 4)],
        'vectorizer__max_df': [1.0, 0.95, 0.9, 0.85, 0.8],
        'vectorizer__min_df': [25, 50, 100, 200, 0.01, 0.05],
    }

    # NOTE(review): the local name `svm` would shadow an imported
    # `sklearn.svm` module if this file has one -- confirm.
    svm = Pipeline([('vectorizer', TfidfVectorizer()),
                    ('classifier', LinearSVC(class_weight='balanced'))])
    # n_jobs is hard-coded to 12 worker processes.
    grid_search = HalvingGridSearchCV(svm,
                                      tfidf_grid,
                                      random_state=42,
                                      verbose=10,
                                      n_jobs=12)
    # The `classifier__` prefix addresses the pipeline step named
    # 'classifier', so the weights are meant for LinearSVC.fit.
    grid_search.fit(X_train['reviewText'],
                    y_train,
                    classifier__sample_weight=train_weights)

    print(grid_search.best_params_)
    print(
        score_metric(y_test,
                     grid_search.best_estimator_.predict(
                         X_test['reviewText'])))
    # Persist the refitted best pipeline; open() does not create the
    # 'model' directory, so it has to exist already.
    with open('model/sklearn-svc.pkl', 'wb') as out:
        pickle.dump(grid_search.best_estimator_, out)

    bayes = Pipeline([('vectorizer', TfidfVectorizer()),
Beispiel #10
0
# Hyper-parameter grid for the classifier `xgb_cl` (defined outside this
# excerpt): 4 * 3 * 3 * 3 = 108 combinations, since subsample and
# colsample_bytree are pinned to a single value each.
param_grid = {
    "max_depth": [3, 4, 5, 7],
    "gamma": [0, 0.25, 1],
    "reg_lambda": [0, 1, 10],
    "scale_pos_weight": [1, 3, 5],
    "subsample": [0.8],  # Fix subsample
    "colsample_bytree": [0.5],  # Fix colsample_bytree
}


# enable_halving_search_cv is imported only for its side effect: per the
# scikit-learn docs it must be imported before HalvingGridSearchCV.
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV

# Successive-halving search scored by ROC AUC on all available cores.
# NOTE(review): min_resources="exhaust" is understood to size the first round
# so the last round uses as many samples as possible -- confirm in the docs.
halving_cv = HalvingGridSearchCV(
    xgb_cl, param_grid, scoring="roc_auc", n_jobs=-1, min_resources="exhaust", factor=3
)

# `X` and `y` come from outside this excerpt; the return value of fit is
# discarded by binding it to `_`.
_ = halving_cv.fit(X, y)


# deal with class imbalance

# Count how often each label occurs; the indexing below requires the labels
# to be the strings "No" and "Yes".
counts = pd.Series(y.flatten()).value_counts()

# Ratio of "No" to "Yes" samples.
# NOTE(review): presumably "Yes" is the positive class and this value feeds
# the second grid that is truncated below -- confirm.
scale_pos_weight = counts["No"] / counts["Yes"]


param_grid_2 = {
    "max_depth": [3, 4, 5],
    "gamma": [5, 30, 50],
Beispiel #11
0
def run_ann(data_train,
            label_train,
            data_test,
            label_test,
            algo_name,
            data_name,
            fig_name=None,
            show_plots=False,
            plot_learning=False,
            plot_val=False,
            val_param="max_iter",
            val_range=range(10, 210, 10),
            test=False,
            val_lab="Iterations",
            grid_search=False,
            **kwargs):
    """Train an MLP classifier and optionally tune, plot and score it.

    An ``MLPClassifier`` with one hidden layer of 85 units is always fitted
    on the training data; ``kwargs`` are forwarded to its constructor.

    Optional steps, each controlled by a flag:

    * ``grid_search`` -- run a halving grid search over ``alpha`` and
      ``learning_rate_init`` first and print the best estimator.  The result
      is only printed; the model fitted afterwards still uses ``kwargs``.
    * ``plot_learning`` -- draw a learning curve for the fitted model.
    * ``plot_val`` -- draw a validation curve over ``val_param`` /
      ``val_range``, labelling the x axis with ``val_lab``.
    * ``fig_name`` -- when given, each drawn figure is saved under this
      prefix with a timestamp suffix.
    * ``show_plots`` -- call ``plt.show()`` at the end.
    * ``test`` -- print the score on the test data.

    The elapsed wall-clock time is always printed last.
    """

    def _ignore_convergence_warnings():
        # Must be called inside warnings.catch_warnings() so the filter is
        # undone when that context exits.
        warnings.filterwarnings("ignore",
                                category=ConvergenceWarning,
                                module="sklearn")

    def _timestamp():
        return datetime.now().strftime("%Y-%m-%d-%H-%M-%S")

    t0 = time.time()

    if grid_search:
        # based off sklearn example for hp tuning
        # https://scikit-learn.org/stable/modules/grid_search.html#

        # hyper parameter space: regularisation strength and learning rate
        search_space = {
            "alpha": [1e-3, 1e-4, 1e-5],
            "learning_rate_init": [1e-2, 1e-3, 1e-4],
        }
        tunable = MLPClassifier(hidden_layer_sizes=(85, ),
                                **kwargs,
                                max_iter=1000,
                                early_stopping=True,
                                random_state=0)

        with warnings.catch_warnings():
            _ignore_convergence_warnings()
            search = HalvingGridSearchCV(tunable, search_space, cv=5, factor=2)
            search.fit(data_train, label_train)
        print(search.best_estimator_)

    clf = MLPClassifier(hidden_layer_sizes=(85, ),
                        max_iter=500,
                        random_state=0,
                        early_stopping=True,
                        **kwargs)
    clf.fit(data_train, label_train)

    if plot_learning:
        # based on sklearn learning curve example
        # https://scikit-learn.org/stable/auto_examples/model_selection/plot_learning_curve.html
        learning_title = f"Learning Curves (ANN {algo_name}) ({data_name})"
        with warnings.catch_warnings():
            _ignore_convergence_warnings()
            plot_learning_curve(clf,
                                learning_title,
                                data_train,
                                label_train,
                                ylim=(0, 1.01),
                                cv=5,
                                n_jobs=4)

        if fig_name is not None:
            plt.savefig(f"{fig_name}_learn_{_timestamp()}")

    if plot_val:
        # based off sklearn validation curve example
        # https://scikit-learn.org/stable/auto_examples/model_selection/plot_validation_curve.html
        validation_title = (
            f"Validation Curve with ANN ({algo_name}) ({data_name})")
        with warnings.catch_warnings():
            _ignore_convergence_warnings()
            plot_validation_curve(clf,
                                  validation_title,
                                  data_train,
                                  label_train,
                                  val_lab,
                                  "Score",
                                  cv=5,
                                  param_name=val_param,
                                  param_range=val_range,
                                  ylim=(0.0, 1.1))

        if fig_name is not None:
            plt.savefig(f"{fig_name}_val_{_timestamp()}")

    if show_plots:
        plt.show()

    if test:
        print(f"ANN ({algo_name}) score: {clf.score(data_test, label_test)}")

    print(f"ANN ({algo_name}) time: {time.time()-t0:.2f}")
Beispiel #12
0
def main():
    """Run the data-set statistics benchmark on the OpenML-CC18 suite.

    Sets up DEBUG-level file logging to ``./log/data_set_stats.log`` (the
    file is overwritten on each run), loads the OpenML-CC18 benchmark suite,
    builds the dictionary of classifiers to evaluate (currently only an MLP
    wrapped in a halving grid search) and hands both to
    ``convex_hull_stats.openml_stats_all`` with ``n_splits=10``.

    The random forest, SVC and logistic regression grids are kept so those
    classifiers can be re-enabled by uncommenting the matching lines.
    """
    # tracemalloc.start()

    log_file = "./log/data_set_stats.log"

    # initialize logging
    log_dir = os.path.dirname(log_file)
    # exist_ok avoids the check-then-create race of isdir() + makedirs()
    os.makedirs(log_dir, exist_ok=True)
    logging.basicConfig(
        filename=log_file,
        filemode='w',
        format='%(asctime)s,%(msecs)d %(name)s %(levelname)s %(message)s',
        datefmt='%H:%M:%S',
        level=logging.DEBUG)

    # instead of fetching all datasets, we get all datasets from OpenML-CC18, but some have missing values;
    # the pre-processing will be performed in convex_hull_stats.dataset_stats.py
    #df_datasets = lg.datasets.fetch_datasets(task="classification", min_classes=2, max_features=4000, update_data=True)

    logging.info("Loading benchmark suite OpenML-CC18...")
    benchmark_suite = openml.study.get_suite('OpenML-CC18')
    random_state = 42
    classifiers = dict()

    # let's create pipelines for classifiers that also include hyperparameter tuning
    # (enable_halving_search_cv is imported for its side effect only)
    from sklearn.experimental import enable_halving_search_cv  # noqa: F401
    from sklearn.model_selection import HalvingGridSearchCV

    # NOTE(review): max_features='auto' is no longer accepted by recent
    # scikit-learn releases -- confirm before re-enabling this grid.
    rf_parameter_grid = {
        'n_estimators': [10, 20, 30, 50, 100, 200, 300],
        'max_features': ['auto', 'sqrt', 'log2'],
        'max_depth': [4, 5, 6, 7, 8, None],
        'criterion': ['gini', 'entropy']
    }
    # classifiers["RandomForestHT"] = HalvingGridSearchCV(RandomForestClassifier(random_state=random_state), rf_parameter_grid, random_state=random_state)
    # classifiers["RandomForest"] = RandomForestClassifier(random_state=random_state)

    svc_parameter_grid = {
        'C': [0.1, 1.0, 10, 100, 1000],
        'kernel': ['poly'],
        'degree': [2, 3, 4, 5],
        'gamma': [1e-3, 1e-4, 'scale'],
        'coef0': [0.0, 1.0, 10.0],
    }
    #classifiers["SVCHT"] = HalvingGridSearchCV(SVC(random_state=random_state), svc_parameter_grid, random_state=random_state)
    #classifiers["SVC"] = SVC(kernel='poly', random_state=random_state)

    # NOTE(review): penalty='none' (string) is deprecated in recent
    # scikit-learn releases in favour of None -- confirm before re-enabling.
    lr_parameter_grid = {
        'C': np.logspace(-3, 3, 7),
        'penalty': ['none', 'l2'],
    }
    # classifiers["LogisticRegressionHT"] = HalvingGridSearchCV(LogisticRegression(random_state=random_state), lr_parameter_grid, random_state=random_state)
    # classifiers["LogisticRegression"] = LogisticRegression(random_state=random_state)

    mlp_parameter_grid = {
        # Every entry is a tuple with one size per hidden layer; the trailing
        # comma is required for single-layer entries ("(50)" is just int 50).
        'hidden_layer_sizes': [(50,), (100,), (50, 50), (100, 50)],
        'learning_rate_init': [0.001, 0.0001],
    }
    classifiers["MLPClassifierHT"] = HalvingGridSearchCV(
        MLPClassifier(max_iter=1000,
                      early_stopping=True,
                      random_state=random_state),
        mlp_parameter_grid,
        random_state=random_state)
    # classifiers["MLPClassifier"] = MLPClassifier(random_state=random_state)

    convex_hull_stats.openml_stats_all(benchmark_suite,
                                       classifiers,
                                       n_splits=10)