Esempio n. 1
0
    def dont_test_smac_choice(self):
        """Run a one-evaluation SMAC search over a pipeline built from operator choices.

        The planned pipeline chains three choice points: an optional one-hot
        encoder, a transformer (PCA, Nystroem, or no-op), and a classifier
        (LogisticRegression or KNeighborsClassifier). The incumbent returned by
        SMAC is evaluated once more and its value is printed.

        NOTE(review): the name does not start with ``test``, so this is
        presumably excluded from test discovery on purpose -- confirm.
        """
        import numpy as np

        # SMAC's configuration-space type, optimizer facade, and scenario wrapper.
        from smac.configspace import ConfigurationSpace
        from smac.facade.smac_facade import SMAC as orig_SMAC
        from smac.scenario.scenario import Scenario

        # Assemble the three choice points, then chain them into one pipeline.
        tfm = PCA() | Nystroem() | NoOp()
        encoder_choice = OneHotEncoder(handle_unknown="ignore",
                                       sparse=False) | NoOp()
        planned_pipeline1 = encoder_choice >> tfm >> (
            LogisticRegression() | KNeighborsClassifier())

        config_space: ConfigurationSpace = get_smac_space(planned_pipeline1,
                                                          lale_num_grids=1)

        # Scenario settings handed to SMAC.
        scenario_options = {
            # we optimize quality (alternatively runtime)
            "run_obj": "quality",
            # maximum function evaluations
            "runcount-limit": 1,
            # configuration space
            "cs": config_space,
            "deterministic": "true",
        }
        scenario = Scenario(scenario_options)

        # Target-algorithm evaluator that SMAC calls for each configuration.
        tae = iris_fmin_tae(planned_pipeline1, num_folds=2)
        print(
            "Optimizing! Depending on your machine, this might take a few minutes."
        )
        optimizer = orig_SMAC(scenario=scenario,
                              rng=np.random.RandomState(42),
                              tae_runner=tae)
        incumbent = optimizer.optimize()

        # Call the evaluator directly on the incumbent to report its value.
        inc_value = tae(incumbent)
        print("Optimized Value: %.2f" % (inc_value))
Esempio n. 2
0
    def test_smac(self):
        """Run a one-evaluation SMAC search over a single LogisticRegression operator.

        Builds the configuration space for the operator, runs SMAC with a
        fixed random seed and a run-count limit of one, then evaluates the
        returned incumbent once more and prints its value.
        """
        import numpy as np

        # SMAC's configuration-space type, optimizer facade, and scenario wrapper.
        from smac.configspace import ConfigurationSpace
        from smac.facade.smac_facade import SMAC as orig_SMAC
        from smac.scenario.scenario import Scenario

        # Lale helper that derives a SMAC configuration space from an operator.
        from lale.search.lale_smac import get_smac_space

        lr = LogisticRegression()
        config_space: ConfigurationSpace = get_smac_space(lr)

        # Scenario settings handed to SMAC.
        scenario_options = {
            # we optimize quality (alternatively runtime)
            "run_obj": "quality",
            # maximum function evaluations
            "runcount-limit": 1,
            # configuration space
            "cs": config_space,
            "deterministic": "true",
            "abort_on_first_run_crash": False,
        }
        scenario = Scenario(scenario_options)

        # Target-algorithm evaluator that SMAC calls for each configuration.
        tae = iris_fmin_tae(lr, num_folds=2)
        print(
            "Optimizing! Depending on your machine, this might take a few minutes."
        )
        optimizer = orig_SMAC(scenario=scenario,
                              rng=np.random.RandomState(42),
                              tae_runner=tae)
        incumbent = optimizer.optimize()

        # Call the evaluator directly on the incumbent to report its value.
        inc_value = tae(incumbent)
        print("Optimized Value: %.2f" % (inc_value))
Esempio n. 3
0
    def __init__(self,
                 estimator=None,
                 max_evals=50,
                 cv=5,
                 handle_cv_failure=False,
                 scoring='accuracy',
                 best_score=0.0,
                 max_opt_time=None,
                 lale_num_grids=None):
        """Instantiate the SMAC optimizer that will use the given estimator and other
        parameters to select the best performing trainable instantiation of the estimator.

        Parameters
        ----------
        estimator : lale.operators.IndividualOp or lale.operators.Pipeline, optional
            A valid Lale individual operator or pipeline, by default LogisticRegression
        max_evals : int, optional
            Number of trials of SMAC search i.e. runcount_limit of SMAC, by default 50
        cv : an integer or an object that has a split function as a generator yielding (train, test) splits as arrays of indices.
            Integer value is used as number of folds in sklearn.model_selection.StratifiedKFold, default is 5.
            Note that any of the iterators from https://scikit-learn.org/stable/modules/cross_validation.html#cross-validation-iterators can be used here.
            The fit method performs cross validation on the input dataset for each trial,
            and uses the mean cross validation performance for optimization. This behavior is also impacted by the handle_cv_failure flag,
            by default 5
        handle_cv_failure : bool, optional
            A boolean flag indicating how to deal with cross validation failure for a trial.
            If True, the trial is continued by doing a 80-20 percent train-validation split of the dataset input to fit
            and reporting the score on the validation part.
            If False, the trial is terminated by assigning status to FAIL.
            By default False
        scoring: string or a scorer object created using
            https://scikit-learn.org/stable/modules/generated/sklearn.metrics.make_scorer.html#sklearn.metrics.make_scorer.
            A string from sklearn.metrics.SCORERS.keys() can be used or a scorer created from one of
            sklearn.metrics (https://scikit-learn.org/stable/modules/classes.html#module-sklearn.metrics).
            A completely custom scorer object can be created from a python function following the example at
            https://scikit-learn.org/stable/modules/model_evaluation.html
            The metric has to return a scalar value, and note that scikit-learn's scorer object always returns values such that
            higher score is better. Since SMAC solves a minimization problem, we pass (best_score - score) to SMAC.
            By default 'accuracy'.
        best_score : float, optional
            The best score for the specified scorer. This allows us to return a loss to SMAC that is
            greater than or equal to zero, where zero is the best loss. By default, zero.
        max_opt_time : float, optional
            Maximum amount of wall clock time in seconds for the optimization. By default, None, implying no runtime
            bound.
        lale_num_grids : optional
            Forwarded unchanged to get_smac_space when the search space is built, and kept on the
            instance as ``self.lale_num_grids``. Presumably limits how many search grids are
            considered -- confirm against get_smac_space. By default None.

        Examples
        --------
        >>> from sklearn.metrics import make_scorer, f1_score, accuracy_score
        >>> lr = LogisticRegression()
        >>> clf = SMAC(estimator=lr, scoring='accuracy', cv=5)
        >>> from sklearn import datasets
        >>> diabetes = datasets.load_diabetes()
        >>> X = diabetes.data[:150]
        >>> y = diabetes.target[:150]
        >>> trained = clf.fit(X, y)
        >>> predictions = trained.predict(X)

        Other scoring metrics:

        >>> clf = SMAC(estimator=lr, scoring=make_scorer(f1_score, average='macro'), cv=3, max_evals=2)

        """

        self.max_evals = max_evals
        self.estimator = LogisticRegression() if estimator is None else estimator

        # Store the grid limit on the instance: code that rebuilds the search
        # space later (e.g. a fit that reads self.lale_num_grids) needs it.
        self.lale_num_grids = lale_num_grids
        self.search_space: ConfigurationSpace = get_smac_space(
            self.estimator, lale_num_grids=self.lale_num_grids)
        self.scoring = scoring
        self.best_score = best_score
        self.handle_cv_failure = handle_cv_failure
        self.cv = cv
        self.max_opt_time = max_opt_time

        # Scenario object
        scenario_options = {
            "run_obj":
            "quality",  # we optimize quality (alternatively runtime)
            "runcount-limit": self.max_evals,  # maximum function evaluations
            "cs": self.search_space,  # configuration space
            "deterministic": "true",
            "abort_on_first_run_crash": False,
        }
        # Only bound the wall-clock time when the caller asked for a limit.
        if max_opt_time is not None:
            scenario_options["wallclock_limit"] = max_opt_time
        self.scenario = Scenario(scenario_options)

        # Set by fit to SMAC's run history.
        self.trials = None
Esempio n. 4
0
File: smac.py Progetto: shinnar/lale
    def fit(self, X_train, y_train):
        """Run the SMAC search on the training data and fit the best configuration found.

        The search space and the SMAC scenario are rebuilt on every call from a
        schema derived from the given data. Each trial is scored by cross
        validation; the loss reported to SMAC is ``self.best_score - score``.

        Parameters
        ----------
        X_train :
            Training features, used for every trial and for the final fit.
        y_train :
            Training targets.

        Returns
        -------
        self
            After a completed search ``self.trials`` holds SMAC's run history
            and ``self._best_estimator`` holds the best configuration trained
            on the whole training set. If the optimization fails or runs out of
            budget, ``self._best_estimator`` is None.
        """
        # The schema is computed from the user-supplied cv value, before
        # check_cv below replaces self.cv with a splitter object.
        data_schema = lale.helpers.fold_schema(X_train, y_train, self.cv,
                                               self.estimator.is_classifier())
        self.search_space: ConfigurationSpace = get_smac_space(
            self.estimator,
            lale_num_grids=self.lale_num_grids,
            data_schema=data_schema)
        # Scenario object
        scenario_options = {
            "run_obj": "quality",  # optimize quality (alternatively runtime)
            "runcount-limit": self.max_evals,  # maximum function evaluations
            "cs": self.search_space,  # configuration space
            "deterministic": "true",
            "abort_on_first_run_crash": False,
        }
        if self.max_opt_time is not None:
            scenario_options["wallclock_limit"] = self.max_opt_time
        self.scenario = Scenario(scenario_options)

        self.cv = check_cv(self.cv,
                           y=y_train,
                           classifier=self.estimator.is_classifier())

        def smac_train_test(trainable, X_train, y_train):
            """Score one trainable; return (score, log loss, execution time)."""
            try:
                cv_score, logloss, execution_time = cross_val_score_track_trials(
                    trainable,
                    X_train,
                    y_train,
                    cv=self.cv,
                    scoring=self.scoring)
                logger.debug("Successful trial of SMAC")
            except BaseException as e:
                if not self.handle_cv_failure:
                    logger.debug("Error {} with pipeline:{}".format(
                        e, trainable.to_json()))
                    raise
                # If there is any error in cross validation, use the score based on a random train-test split as the evaluation criterion
                (
                    X_train_part,
                    X_validation,
                    y_train_part,
                    y_validation,
                ) = train_test_split(X_train, y_train, test_size=0.20)
                start = time.time()
                trained = trainable.fit(X_train_part, y_train_part)
                scorer = check_scoring(trainable, scoring=self.scoring)
                cv_score = scorer(trained, X_validation, y_validation)
                execution_time = time.time() - start
                # predict_proba sits inside the guard as well: an estimator
                # that cannot produce probabilities must not abort the
                # fallback, since the log loss is only auxiliary information.
                try:
                    y_pred_proba = trained.predict_proba(X_validation)
                    logloss = log_loss(y_true=y_validation,
                                       y_pred=y_pred_proba)
                except BaseException:
                    logloss = 0
                    logger.debug("Warning, log loss cannot be computed")
            return cv_score, logloss, execution_time

        def f(trainable):
            """Objective minimized by SMAC: best_score minus the trial's score."""
            try:
                score, _logloss, _execution_time = smac_train_test(
                    trainable, X_train=X_train, y_train=y_train)
                return self.best_score - score
            except BaseException as e:
                logger.warning(
                    f"Exception caught in SMACCV:{type(e)}, {traceback.format_exc()}, SMAC will set a cost_for_crash to MAXINT."
                )
                raise

        # Reset up front so that no failure path (including an exhausted
        # budget) leaves the attribute undefined or pointing at a model
        # trained by an earlier fit call.
        self._best_estimator = None
        try:
            smac = orig_SMAC(
                scenario=self.scenario,
                rng=np.random.RandomState(42),
                tae_runner=lale_op_smac_tae(self.estimator, f),
            )
            incumbent = smac.optimize()
            self.trials = smac.get_runhistory()
            trainable = lale_trainable_op_from_config(self.estimator,
                                                      incumbent)
            # get the trainable corresponding to the best params and train it on the entire training dataset.
            trained = trainable.fit(X_train, y_train)
            self._best_estimator = trained
        except BudgetExhaustedException:
            logger.warning(
                "Maximum alloted optimization time exceeded. Optimization exited prematurely"
            )
        except BaseException as e:
            logger.warning("Error during optimization: {}".format(e))
            self._best_estimator = None

        return self
    def dont_test_car_smac(self):
        """Run a one-evaluation SMAC search over a classifier choice on the car dataset.

        Labels are integer-encoded with a LabelEncoder and wrapped back into
        named pandas Series. The incumbent found by SMAC is trained on the
        training split, scored for accuracy on the test split, and finally
        re-evaluated through the SMAC evaluator.

        NOTE(review): the name does not start with ``test``, so this is
        presumably excluded from test discovery on purpose -- confirm.
        """
        import numpy as np

        from lale.datasets.auto_weka import fetch_car
        from sklearn.metrics import accuracy_score, make_scorer
        from sklearn.preprocessing import LabelEncoder
        import pandas as pd
        from lale.lib.weka import J48
        from lalegpl.lib.r import ArulesCBAClassifier
        from lale.operators import make_pipeline
        from lale.lib.lale import HyperoptClassifier
        from lale.lib.sklearn import LogisticRegression, KNeighborsClassifier
        from smac.scenario.scenario import Scenario
        from smac.facade.smac_facade import SMAC
        from smac.configspace import ConfigurationSpace

        (X_train, y_train), (X_test, y_test) = fetch_car()

        # Encode the string labels as integers while keeping the target's name.
        target_name = y_train.name
        label_encoder = LabelEncoder()
        y_train = pd.Series(label_encoder.fit_transform(y_train),
                            name=target_name)
        y_test = pd.Series(label_encoder.transform(y_test), name=target_name)

        # J48 is imported above but is not part of the choice searched here.
        planned_pipeline = make_pipeline(
            ArulesCBAClassifier() | KNeighborsClassifier() | LogisticRegression())

        config_space: ConfigurationSpace = get_smac_space(planned_pipeline)
        print(config_space)

        # Scenario settings handed to SMAC.
        run_count_limit = 1
        scenario_options = {
            # we optimize quality (alternatively runtime)
            "run_obj": "quality",
            # maximum function evaluations
            "runcount-limit": run_count_limit,
            # configuration space
            "cs": config_space,
            "deterministic": "true",
            "abort_on_first_run_crash": False,
        }
        scenario = Scenario(scenario_options)

        def objective(op):
            # Score one candidate operator on the training split.
            return test_f_min(op, X_train, y_train, num_folds=2)

        tae = lale_op_smac_tae(planned_pipeline, objective)

        print("Optimizing! Depending on your machine, this might take a few minutes.")
        optimizer = SMAC(scenario=scenario,
                         rng=np.random.RandomState(42),
                         tae_runner=tae)
        incumbent = optimizer.optimize()

        # Train the incumbent configuration and measure held-out accuracy.
        best_trainable = lale_trainable_op_from_config(planned_pipeline,
                                                       incumbent)
        best_trained = best_trainable.fit(X_train, y_train)
        predictions = best_trained.predict(X_test)
        accuracy = accuracy_score(y_test, predictions)
        print("Accuracy: %.2f" % (accuracy))

        # Call the evaluator directly on the incumbent to report its value.
        inc_value = tae(incumbent)
        print("Optimized Value: %.2f" % (inc_value))
        print(f"Run count limit: {run_count_limit}")