Code example #1
File: evaluators.py Project: epeters3/skplumber
    def train_test_evaluate(pipeline: Pipeline, X: pd.DataFrame, y: pd.Series,
                            metric: Metric) -> float:
        # `test_size`, `shuffle`, and `random_state` are bound by the
        # enclosing factory's closure (cf. `make_train_test_evaluator`
        # in the test snippets below).
        X_train, X_test, y_train, y_test = train_test_split(
            X,
            y,
            test_size=test_size,
            shuffle=shuffle,
            random_state=random_state)
        pipeline.fit(X_train, y_train)
        test_predictions = pipeline.predict(X_test)
        score = metric(y_test, test_predictions)
        return score
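
Since the evaluator above closes over `test_size`, `shuffle`, and `random_state`, it is presumably built and returned by a factory; the tests below call `make_train_test_evaluator()` with no arguments. A minimal sketch of what such a factory could look like, where the default values and exact signature are assumptions rather than skplumber's actual code:

import typing as t

import pandas as pd
from sklearn.model_selection import train_test_split


def make_train_test_evaluator(test_size: float = 0.25,
                              shuffle: bool = True,
                              random_state: t.Optional[int] = None) -> t.Callable:
    # Bind the split configuration in a closure and hand back the evaluator
    # from the snippet above. The default values here are illustrative only.
    def train_test_evaluate(pipeline, X: pd.DataFrame, y: pd.Series,
                            metric) -> float:
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, shuffle=shuffle,
            random_state=random_state)
        pipeline.fit(X_train, y_train)
        return metric(y_test, pipeline.predict(X_test))

    return train_test_evaluate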
Code example #2
    def test_can_run_basic(self):
        """
        The flexga tuner should be able to complete without
        erroring.
        """
        pipeline = Pipeline()
        pipeline.add_step(classifiers["DecisionTreeClassifierPrimitive"])

        evaluate = make_train_test_evaluator()
        logger.info(
            f"baseline score: {evaluate(pipeline, self.X, self.y, f1macro)}")
        ga_tune(pipeline, self.X, self.y, evaluate, f1macro, iters=2)
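
`f1macro` is used throughout these examples as a callable scorer, and code example #10 below also reads `metric.worst_value`, `metric.opt_dir`, and `metric.is_better_than`. A minimal sketch of a `Metric` satisfying that interface, assuming it wraps scikit-learn's `f1_score` with `average="macro"` (the class layout and the `worst_value` of 0.0 are assumptions, not skplumber's implementation):

import enum
import typing as t

from sklearn.metrics import f1_score


class OptimizationDirection(enum.Enum):
    MAXIMIZE = "maximize"
    MINIMIZE = "minimize"


class Metric:
    def __init__(self, score_fn: t.Callable, opt_dir: OptimizationDirection,
                 worst_value: float) -> None:
        self._score_fn = score_fn
        self.opt_dir = opt_dir
        self.worst_value = worst_value  # score assigned to pipelines that error out

    def __call__(self, y_true, y_pred) -> float:
        return self._score_fn(y_true, y_pred)

    def is_better_than(self, a: float, b: float) -> bool:
        # Higher is better when maximizing, lower is better when minimizing.
        if self.opt_dir == OptimizationDirection.MAXIMIZE:
            return a > b
        return a < b


f1macro = Metric(
    lambda y_true, y_pred: f1_score(y_true, y_pred, average="macro"),
    OptimizationDirection.MAXIMIZE,
    worst_value=0.0,
)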
Code example #3
    def test_returns_correct_number_evals(self):
        pipeline = Pipeline()
        pipeline.add_step(classifiers["DecisionTreeClassifierPrimitive"])
        evaluate = make_train_test_evaluator()
        # population_size (6) * iters (2), plus 1 for the baseline
        # evaluation of the default params.
        n_expected_evals = 13

        result = ga_tune(pipeline,
                         self.X,
                         self.y,
                         evaluate,
                         f1macro,
                         iters=2,
                         population_size=6)
        self.assertEqual(result.n_evals, n_expected_evals)
Code example #4
File: evaluators.py Project: epeters3/skplumber
    def kfold_evaluate(pipeline: Pipeline, X: pd.DataFrame, y: pd.Series,
                       metric: Metric) -> float:
        # `k`, `shuffle`, and `random_state` are bound by the enclosing
        # factory's closure.
        cv = KFold(n_splits=k, shuffle=shuffle, random_state=random_state)
        # Perform cross validation, calculating the average performance
        # over the folds as this pipeline's performance.
        scores = []
        for train_index, test_index in cv.split(X):
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]

            pipeline.fit(X_train, y_train)
            test_predictions = pipeline.predict(X_test)
            fold_score = metric(y_test, test_predictions)
            scores.append(fold_score)
        test_score = mean(scores)
        return test_score
コード例 #5
0
def _get_flexga_metas(pipeline: Pipeline,
                      X: pd.DataFrame) -> t.Dict[str, ArgMeta]:
    """
    Converts meta information about the hyperparameters
    of a pipeline's primitive steps to the format the `flexga`
    package uses to know the bounds and characteristics of
    those hyperparameters (the things `flexga` is optimizing).
    """
    param_metas = pipeline.param_metas_with_data(X)
    kwargsmeta = {}

    for i, step_pmetas in param_metas.items():
        for key, pmeta in step_pmetas.items():
            flexga_key = f"{i},{key}"
            if isinstance(pmeta, IntParamMeta):
                flexga_arg_meta = IntArgMeta(
                    (pmeta.lbound, pmeta.ubound),
                    _range_rule(pmeta.lbound, pmeta.ubound),
                )
            elif isinstance(pmeta, FloatParamMeta):
                flexga_arg_meta = FloatArgMeta(
                    (pmeta.lbound, pmeta.ubound),
                    _range_rule(pmeta.lbound, pmeta.ubound),
                )
            elif isinstance(pmeta, BoolParamMeta):
                flexga_arg_meta = BoolArgMeta()
            elif isinstance(pmeta, CategoricalParamMeta):
                flexga_arg_meta = CategoricalArgMeta(pmeta.options)
            else:
                raise ValueError(
                    f"unsupported ParamMeta type {type(pmeta)} for {key} param"
                )
            kwargsmeta[flexga_key] = flexga_arg_meta

    return kwargsmeta
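
Code example #10 below calls `_get_params_from_flexga`, which is not shown on this page. Given the `f"{i},{key}"` encoding above, it presumably just inverts that mapping; a possible sketch, assuming `Pipeline.set_params` accepts a dict of per-step-index parameter dicts (that shape, and the integer cast, are assumptions):

import typing as t


def _get_params_from_flexga(
        flexga_params: t.Dict[str, t.Any]) -> t.Dict[int, t.Dict[str, t.Any]]:
    # Undo the "step_index,param_name" key encoding used in
    # `_get_flexga_metas`, grouping flexga's flat kwargs back into
    # per-step parameter dicts.
    params: t.Dict[int, t.Dict[str, t.Any]] = {}
    for flexga_key, value in flexga_params.items():
        i, key = flexga_key.split(",", 1)
        params.setdefault(int(i), {})[key] = value
    return params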
Code example #6
File: evaluators.py Project: epeters3/skplumber
    def down_sample_evaluate(pipeline: Pipeline, X: pd.DataFrame, y: pd.Series,
                             metric: Metric) -> float:
        # `sample_ratio`, `test_size`, `shuffle`, and `random_state` are
        # bound by the enclosing factory's closure.
        # First down-sample the data by using `train_test_split` in a
        # perhaps unintended way.
        X_smaller, _, y_smaller, _ = train_test_split(
            X,
            y,
            train_size=sample_ratio,
            shuffle=shuffle,
            random_state=random_state)
        # Now make the train/test split.
        X_train, X_test, y_train, y_test = train_test_split(
            X_smaller,
            y_smaller,
            test_size=test_size,
            shuffle=shuffle,
            random_state=random_state,
        )
        # Finally, fit and evaluate the pipeline's performance.
        pipeline.fit(X_train, y_train)
        test_predictions = pipeline.predict(X_test)
        score = metric(y_test, test_predictions)
        return score
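
A quick illustration of the down-sampling trick above, with arbitrary numbers chosen only for the example:

import pandas as pd
from sklearn.model_selection import train_test_split

X = pd.DataFrame({"a": range(100)})
y = pd.Series(range(100))

# Keep only half of the rows; the "test" halves of the split are discarded.
X_small, _, y_small, _ = train_test_split(
    X, y, train_size=0.5, shuffle=True, random_state=0)
assert len(X_small) == len(y_small) == 50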
Code example #7
File: straight.py Project: epeters3/skplumber
    def sample_pipeline(
        self,
        problem_type: ProblemType,
        models: t.List[t.Type[Primitive]],
        transformers: t.List[t.Type[Primitive]],
    ) -> Pipeline:
        # Build a straight (linear) pipeline: `self.preprocessors` randomly
        # chosen transformers followed by a single randomly chosen model.
        pipeline = Pipeline()
        for _ in range(self.preprocessors):
            pipeline.add_step(random.choice(transformers))
        pipeline.add_step(random.choice(models))
        return pipeline
Code example #8
    def test_can_tune_multiple_primitives(self):
        """
        The flexga tuner should be able to tune the hyperparameters
        of all primitives in a pipeline at once.
        """
        pipeline = Pipeline()
        pipeline.add_step(transformers["PCAPrimitive"])
        pipeline.add_step(classifiers["DecisionTreeClassifierPrimitive"])

        evaluate = make_train_test_evaluator()
        logger.info(
            f"baseline score: {evaluate(pipeline, self.X, self.y, f1macro)}")
        ga_tune(pipeline, self.X, self.y, evaluate, f1macro, iters=2)
Code example #9
    def sample_pipeline(
        self,
        problem_type: ProblemType,
        models: t.List[t.Type[Primitive]],
        transformers: t.List[t.Type[Primitive]],
    ) -> Pipeline:
        all_primitives = models + transformers
        pipeline = Pipeline()
        stack_input = pipeline.curr_step_i
        stack_outputs = []
        # Wire `self.width` randomly chosen primitives to the same input...
        for _ in range(self.width):
            primitive = random.choice(all_primitives)
            pipeline.add_step(primitive, [stack_input])
            stack_outputs.append(pipeline.curr_step_i)
        # ...then feed all of their outputs into a single final model.
        pipeline.add_step(random.choice(models), stack_outputs)
        return pipeline
Code example #10
def ga_tune(
    pipeline: Pipeline,
    X: pd.DataFrame,
    y: pd.Series,
    evaluator: t.Callable,
    metric: Metric,
    exit_on_pipeline_error: bool = True,
    **flexgakwargs,
) -> TuneResult:
    """
    Performs genetic-algorithm hyperparameter tuning on `pipeline`,
    returning the best score it could find and the number of evaluations
    it completed. Essentially performs a `.fit` operation on the pipeline,
    where the pipeline is fit with the best-performing hyperparameter
    configuration it could find.

    Returns
    -------
    result : TuneResult
        A named tuple containing data about how the tuning process went.
    """
    # See what score the model gets without any tuning
    starting_params = pipeline.get_params()
    starting_score = evaluator(pipeline, X, y, metric)

    # Keep track of how many evaluations were completed.
    n_evals = 1  # the baseline evaluation above already counts as one

    def objective(*args, **flexga_params) -> float:
        """
        The objective function the genetic algorithm will
        try to maximize.
        """
        params = _get_params_from_flexga(flexga_params)
        nonlocal n_evals

        try:
            pipeline.set_params(params)
            score = evaluator(pipeline, X, y, metric)
        except PipelineRunError as e:
            logger.exception(e)
            if exit_on_pipeline_error:
                raise e
            # Pipelines that error out get the metric's worst possible score.
            # TODO: make this `None` or `np.nan` instead.
            score = metric.worst_value

        n_evals += 1
        # The genetic algorithm maximizes the objective, so negate the score
        # for metrics that are meant to be minimized.
        return -score if metric.opt_dir == OptimizationDirection.MINIMIZE else score

    # Use flexga to find the best hyperparameter configuration it can.
    optimal_score, _, optimal_flexga_params = flexga(
        objective, kwargsmeta=_get_flexga_metas(pipeline, X), **flexgakwargs)
    if metric.is_better_than(optimal_score, starting_score):
        optimal_params = _get_params_from_flexga(optimal_flexga_params)
        did_improve = True
    else:
        # The tuner couldn't find anything better than the params the
        # pipeline started with under the conditions given.
        optimal_score = starting_score
        optimal_params = starting_params
        did_improve = False

    pipeline.set_params(optimal_params)
    pipeline.fit(X, y)

    logger.info("tuning complete.")
    logger.info(f"found best pipeline configuration: {pipeline}")
    logger.info(f"found best validation score of {optimal_score}")
    return TuneResult(optimal_score, n_evals, did_improve)
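
Putting the pieces together, a hedged end-to-end sketch based on the test snippets above. The dataset is illustrative, and the skplumber names are assumed to be imported already since their module paths are not shown on this page; of the `TuneResult` fields, only `n_evals` is confirmed by the examples here.

from sklearn.datasets import load_iris  # illustrative data source only

# Assumes the skplumber names used in the test snippets above (Pipeline,
# transformers, classifiers, make_train_test_evaluator, f1macro, ga_tune)
# are already imported; their module paths are not shown on this page.

data = load_iris(as_frame=True)
X, y = data.data, data.target

pipeline = Pipeline()
pipeline.add_step(transformers["PCAPrimitive"])
pipeline.add_step(classifiers["DecisionTreeClassifierPrimitive"])

evaluate = make_train_test_evaluator()
result = ga_tune(pipeline, X, y, evaluate, f1macro, iters=2, population_size=6)

# See code example #3: population_size (6) * iters (2) + 1 baseline = 13.
print(result.n_evals)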