Example #1
    def test_format_x_y(self):
        import itertools

        import pandas as pd
        from sklearn.datasets import load_digits

        # format_x_y is the function under test; its import path differs
        # between GAMA versions, so it is assumed imported at module level
        def well_formatted_x_y(x, y, y_type):
            assert isinstance(x, pd.DataFrame)
            assert isinstance(y, y_type)
            assert len(x) == len(y)

        X_np, y_np = load_digits(return_X_y=True)
        X_df, y_df = pd.DataFrame(X_np), pd.DataFrame(y_np)
        y_series = pd.Series(y_np)
        y_2d = y_np.reshape(-1, 1)

        for X, y in itertools.product([X_np, X_df],
                                      [y_np, y_series, y_df, y_2d]):
            well_formatted_x_y(*format_x_y(X, y), y_type=pd.Series)
            well_formatted_x_y(*format_x_y(X, y, y_type=pd.DataFrame),
                               y_type=pd.DataFrame)
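
The conversion contract this test pins down can be sketched as follows. This is a simplified, hypothetical stand-in that covers only what the assertions check, not GAMA's actual format_x_y implementation:

from typing import Tuple, Type, Union

import numpy as np
import pandas as pd


def format_x_y_sketch(
    x: Union[pd.DataFrame, np.ndarray],
    y: Union[pd.DataFrame, pd.Series, np.ndarray],
    y_type: Type = pd.Series,
) -> Tuple[pd.DataFrame, Union[pd.Series, pd.DataFrame]]:
    """ Illustrative stand-in: normalize x to a DataFrame and y to y_type. """
    if not isinstance(x, pd.DataFrame):
        x = pd.DataFrame(x)
    if isinstance(y, np.ndarray) and y.ndim == 2 and y.shape[1] == 1:
        y = y[:, 0]  # flatten an (n, 1) column vector to (n,)
    if isinstance(y, pd.DataFrame):
        y = y.iloc[:, 0]  # the first column is taken to hold the targets
    if not isinstance(y, pd.Series):
        y = pd.Series(y)
    if y_type is pd.DataFrame:
        y = y.to_frame()
    return x, y

Feeding the same itertools.product of input containers through this sketch satisfies all three checks in well_formatted_x_y.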
Example #2
    def test_format_x_y_missing_targets(self):
        """ Samples with missing labels should be removed from training data. """
        import numpy as np
        import pandas as pd
        from sklearn.datasets import load_digits

        # as above, format_x_y is assumed to be imported at module level
        def well_formatted_x_y(x, y, y_type):
            assert isinstance(x, pd.DataFrame)
            assert isinstance(y, y_type)
            assert len(x) == len(y)

        x, y = load_digits(return_X_y=True)
        y = y.astype(float)  # integer targets cannot hold NaN
        y[::2] = np.nan  # mark every even-indexed label as missing
        x_, y_ = format_x_y(x, y)

        assert (1797, ) == y.shape
        assert (898, ) == y_.shape
        assert np.array_equal(y[1::2], y_)
        assert np.array_equal(x[1::2, :], x_)
        well_formatted_x_y(x_, y_, y_type=pd.Series)
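
The behavior under test reduces to masking out the rows whose label is NaN. A minimal sketch with a hypothetical helper name (in GAMA this happens inside format_x_y itself):

import numpy as np
import pandas as pd


def drop_missing_targets(x: np.ndarray, y: np.ndarray):
    """ Illustrative stand-in: keep only the samples whose target is present. """
    keep = ~pd.isnull(y)  # boolean mask, True where the label is known
    return pd.DataFrame(x[keep]), pd.Series(y[keep])

The expected shapes follow from the arithmetic: of the 1797 digits samples, the 899 even indices (0, 2, ..., 1796) are set to NaN, leaving the 898 odd-indexed samples.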
Example #3
    def fit(
        self,
        x: Union[pd.DataFrame, np.ndarray],
        y: Union[pd.DataFrame, pd.Series, np.ndarray],
        warm_start: Optional[List[Individual]] = None,
    ) -> "Gama":
        """ Find and fit a model to predict target y from X.

        Various possible machine learning pipelines will be fit to the (X, y) data.
        Using Genetic Programming, the pipelines chosen should lead to gradually
        better models. Pipelines will internally be validated using cross-validation.

        After the search termination condition is met, the best found pipeline
        configuration is used to train a final model on all provided data.

        Parameters
        ----------
        x: pandas.DataFrame or numpy.ndarray, shape = [n_samples, n_features]
            Training data. All elements must be able to be converted to float.
        y: pandas.DataFrame, pandas.Series or numpy.ndarray, shape = [n_samples,]
            Target values.
            If a DataFrame is provided, the first column is assumed to contain
            the target values.
        warm_start: List[Individual], optional (default=None)
            A list of individuals to start the search procedure with.
            If None is given, random start candidates are generated.
        """
        self._time_manager = TimeKeeper(self._time_manager.total_time)

        with self._time_manager.start_activity("preprocessing",
                                               activity_meta=["default"]):
            x, self._y = format_x_y(x, y)
            self._inferred_dtypes = x.dtypes
            is_classification = hasattr(self, "_label_encoder")
            self._x, self._basic_encoding_pipeline = basic_encoding(
                x, is_classification)
            self._fixed_pipeline_extension = basic_pipeline_extension(
                self._x, is_classification)
            self._operator_set._safe_compile = partial(
                self._operator_set._compile,
                preprocessing_steps=self._fixed_pipeline_extension,
            )
            # _m is the evaluation library's storage limit; pipelines are
            # stored when the limit is unset (None) or positive.
            store_pipelines = (
                self._evaluation_library._m is None or self._evaluation_library._m > 0
            )

            if store_pipelines and self._x.shape[0] * self._x.shape[1] > 6_000_000:
                # if m > 0, we are storing models for each evaluation. For this size
                # KNN will create models of about 76Mb in size, which is too big, so
                # we exclude it from search:
                log.info("Excluding KNN from search because the dataset is too big.")
                from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor

                self._pset["prediction"] = [
                    p for p in self._pset["prediction"] if p.identifier not in
                    [KNeighborsClassifier, KNeighborsRegressor]
                ]

            if store_pipelines and self._x.shape[1] > 50:
                log.info("Data has too many features to include PolynomialFeatures")
                from sklearn.preprocessing import PolynomialFeatures

                self._pset["data"] = [
                    p for p in self._pset["data"]
                    if p.identifier not in [PolynomialFeatures]
                ]

        # Reserve the post-processing fraction of the remaining time budget;
        # the rest is spent on the search phase.
        fit_time = int(
            (1 - self._post_processing.time_fraction)
            * self._time_manager.total_time_remaining
        )

        with self._time_manager.start_activity(
                "search",
                time_limit=fit_time,
                activity_meta=[self._search_method.__class__.__name__],
        ):
            self._search_phase(warm_start, timeout=fit_time)

        with self._time_manager.start_activity(
                "postprocess",
                time_limit=int(self._time_manager.total_time_remaining),
                activity_meta=[self._post_processing.__class__.__name__],
        ):
            # Order the final population from best to worst fitness.
            best_individuals = list(
                reversed(
                    sorted(
                        self._final_pop,
                        key=lambda ind: cast(Fitness, ind.fitness).values,
                    )
                )
            )
            self._post_processing.dynamic_defaults(self)
            self.model = self._post_processing.post_process(
                self._x,
                self._y,
                self._time_manager.total_time_remaining,
                best_individuals,
            )
        if self._store != "all":
            to_clean = dict(nothing="all", logs="evaluations", models="logs")
            self.cleanup(to_clean[self._store])
        return self
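
A typical call into this method goes through GAMA's public estimator interface. A short usage sketch (constructor arguments are illustrative and may differ between GAMA versions):

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

from gama import GamaClassifier  # GamaRegressor is the regression counterpart

X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# max_total_time caps the whole run in seconds; fit() divides it between the
# preprocessing, search, and postprocess activities shown above.
automl = GamaClassifier(max_total_time=180)
automl.fit(X_train, y_train)

predictions = automl.predict(X_test)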