Code Example #1
def create_from_population(
    operator_shell: OperatorSet,
    pop: List[Individual],
    n: int,
    cxpb: float,
    mutpb: float,
) -> List[Individual]:
    """ Creates n new individuals based on the population. """
    offspring = []
    metrics = [lambda ind: ind.fitness.values[0], lambda ind: ind.fitness.values[1]]
    parent_pairs = nsga2_select(pop, n, metrics)
    for (ind1, ind2) in parent_pairs:
        if random.random() < cxpb and len(_valid_crossover_functions(ind1, ind2)) > 0:
            ind1 = operator_shell.mate(ind1, ind2)
        else:
            ind1 = operator_shell.mutate(ind1)
        offspring.append(ind1)
    return offspring
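
The loop above pairs parents via NSGA-II selection, then applies crossover with probability cxpb and otherwise falls back to mutation. Below is a minimal, self-contained sketch of the same crossover-or-mutate pattern using plain floats as stand-in individuals; toy_create_offspring and its blend/perturb operators are illustrative inventions, not GAMA's API.

import random
from typing import List, Tuple

def toy_create_offspring(pop: List[float], n: int, cxpb: float) -> List[float]:
    # Parent pairs are sampled uniformly here; GAMA uses NSGA-II selection.
    parent_pairs: List[Tuple[float, float]] = [
        tuple(random.sample(pop, 2)) for _ in range(n)
    ]
    offspring = []
    for p1, p2 in parent_pairs:
        if random.random() < cxpb:
            child = (p1 + p2) / 2  # "crossover": blend the two parents
        else:
            child = p1 + random.gauss(0, 0.1)  # "mutation": perturb one parent
        offspring.append(child)
    return offspring

print(toy_create_offspring([1.0, 2.0, 3.0, 4.0], n=5, cxpb=0.2))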
Code Example #2
File: random_search.py (Project: Warmongul/gama-1)
def random_search(
    operations: OperatorSet,
    output: List[Individual],
    start_candidates: List[Individual],
    max_evaluations: Optional[int] = None,
) -> List[Individual]:
    """ Perform random search over all possible pipelines.

    Parameters
    ----------
    operations: OperatorSet
        An operator set with `evaluate` and `individual` functions.
    output: List[Individual]
        A list to which evaluated individuals are appended during search.
    start_candidates: List[Individual]
        A list with candidate individuals to evaluate first.
    max_evaluations: int, optional (default=None)
        If specified, only a maximum of `max_evaluations` individuals are evaluated.
        If None, the algorithm will be run indefinitely.

    Returns
    -------
    List[Individual]
        All evaluated individuals.
    """
    _check_base_search_hyperparameters(operations, output, start_candidates)

    with AsyncEvaluator() as async_:
        for individual in start_candidates:
            async_.submit(operations.evaluate, individual)

        while (max_evaluations is None) or (len(output) < max_evaluations):
            future = operations.wait_next(async_)
            if future.result is not None:
                output.append(future.result.individual)
            async_.submit(operations.evaluate, operations.individual())

    return output
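
random_search keeps every worker busy: it seeds the evaluator with the start candidates, then submits one fresh random individual each time a result arrives. Here is the same submit/collect/resubmit pattern sketched with the standard library's ThreadPoolExecutor as a stand-in for AsyncEvaluator; evaluate and toy_random_search are illustrative, not the project's API.

import random
from concurrent.futures import FIRST_COMPLETED, ThreadPoolExecutor, wait

def evaluate(candidate: float) -> float:
    return -(candidate - 0.5) ** 2  # toy fitness, maximal near 0.5

def toy_random_search(max_evaluations: int = 10) -> list:
    output: list = []
    with ThreadPoolExecutor(max_workers=2) as pool:
        futures = {pool.submit(evaluate, random.random()) for _ in range(2)}
        while len(output) < max_evaluations:
            done, futures = wait(futures, return_when=FIRST_COMPLETED)
            output.extend(f.result() for f in done)
            # Resubmit so the workers never idle.
            futures.add(pool.submit(evaluate, random.random()))
    return output

print(max(toy_random_search()))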
Code Example #3
    def __init__(
        self,
        scoring: Union[str, Metric, Iterable[str],
                       Iterable[Metric]] = "filled_in_by_child_class",
        regularize_length: bool = True,
        max_pipeline_length: Optional[int] = None,
        config: Dict = None,
        random_state: Optional[int] = None,
        max_total_time: int = 3600,
        max_eval_time: Optional[int] = None,
        n_jobs: Optional[int] = None,
        max_memory_mb: Optional[int] = None,
        verbosity: int = logging.WARNING,
        search: BaseSearch = AsyncEA(),
        post_processing: BasePostProcessing = BestFitPostProcessing(),
        output_directory: Optional[str] = None,
        store: str = "logs",
    ):
        """

        Parameters
        ----------
        scoring: str, Metric or Tuple
            Specifies the metric or metrics to optimize towards.
            A string will be converted to Metric.
            A tuple must specify each metric with the same type (e.g. all str).
            See :ref:`Metrics` for built-in metrics.

        regularize_length: bool (default=True)
            If True, add pipeline length as an optimization metric.
            Short pipelines should then be preferred over long ones.

        max_pipeline_length: int, optional (default=None)
            If set, limit the maximum number of steps in any evaluated pipeline.
            Encoding and imputation are excluded.

        config: Dict
            Specifies available components and their valid hyperparameter settings.
            For more information, see :ref:`search_space_configuration`.

        random_state: int, optional (default=None)
            Seed for the random number generators used in the process.
            However, with `n_jobs > 1`,
            there will be randomization introduced by multi-processing.
            For reproducible results, set this and use `n_jobs=1`.

        max_total_time: positive int (default=3600)
            Time in seconds that can be used for the `fit` call.

        max_eval_time: positive int, optional (default=None)
            Time in seconds that can be used to evaluate any single individual.
            If None, set to 0.1 * max_total_time.

        n_jobs: int, optional (default=None)
            The number of parallel processes that may be created to speed up `fit`.
            Accepted values are positive integers, -1 or None.
            If -1 is specified, multiprocessing.cpu_count() processes are created.
            If None is specified, multiprocessing.cpu_count() // 2 processes are created.

        max_memory_mb: int, optional (default=None)
            Sets the total amount of memory GAMA is allowed to use (in megabytes).
            If not set, GAMA will use as much as it needs.
            GAMA is not guaranteed to respect this limit at all times,
            but it should never violate it for too long.

        verbosity: int (default=logging.WARNING)
            Sets the level of log messages to be automatically output to terminal.

        search: BaseSearch (default=AsyncEA())
            Search method to use to find good pipelines. Should be instantiated.

        post_processing: BasePostProcessing (default=BestFitPostProcessing())
            Post-processing method to create a model after the search phase.
            Should be an instantiated subclass of BasePostProcessing.

        output_directory: str, optional (default=None)
            Directory to use to save GAMA output. This includes both intermediate
            results during search and logs.
            If None, a unique name ("gama_HEXCODE") is generated.

        store: str (default='logs')
            Determines which data is stored after each run:
             - 'nothing': keep nothing from this run
             - 'models': keep only cache with models and predictions
             - 'logs': keep only the logs
             - 'all': keep logs and cache with models and predictions
        """
        if not output_directory:
            output_directory = f"gama_{str(uuid.uuid4())}"
        self.output_directory = os.path.abspath(
            os.path.expanduser(output_directory))
        if not os.path.exists(self.output_directory):
            os.mkdir(self.output_directory)

        register_stream_log(verbosity)
        if store in ["logs", "all"]:
            log_file = os.path.join(self.output_directory, "gama.log")
            log_handler = logging.FileHandler(log_file)
            log_handler.setLevel(logging.DEBUG)
            log_format = logging.Formatter(
                "[%(asctime)s - %(name)s] %(message)s")
            log_handler.setFormatter(log_format)
            logging.getLogger("gama").addHandler(log_handler)

        arguments = ",".join([
            f"{k}={v}" for (k, v) in locals().items() if k not in
            ["self", "config", "log_file", "log_handler", "log_format"]
        ])
        log.info(f"Using GAMA version {__version__}.")
        log.info(f"INIT:{self.__class__.__name__}({arguments})")

        if n_jobs is None:
            n_jobs = multiprocessing.cpu_count() // 2
            log.debug("n_jobs defaulted to %d.", n_jobs)
        elif n_jobs == -1:
            n_jobs = multiprocessing.cpu_count()
            log.debug("n_jobs set to use all %d cores.", n_jobs)

        err = ""
        if max_total_time is None or max_total_time <= 0:
            err = f"Expect positive int for max_total_time, got {max_total_time}."
        if max_eval_time is not None and max_eval_time <= 0:
            err = f"Expect None or positive int for max_eval_time, got {max_eval_time}."
        if n_jobs < -1 or n_jobs == 0:
            err = f"n_jobs should be -1 or positive int but is {n_jobs}."
        if err:
            self.cleanup("all")
            raise ValueError(err)

        setattr(
            AsyncEvaluator,
            "__init__",
            partialmethod(
                AsyncEvaluator.__init__,
                n_workers=n_jobs,
                memory_limit_mb=max_memory_mb,
                logfile=os.path.join(self.output_directory, "memory.log"),
            ),
        )

        if max_eval_time is None:
            max_eval_time = round(0.1 * max_total_time)
        if max_eval_time > max_total_time:
            log.warning(
                f"max_eval_time ({max_eval_time}) > max_total_time ({max_total_time}) "
                f"is not allowed. max_eval_time set to {max_total_time}.")
            max_eval_time = max_total_time

        self._max_eval_time = max_eval_time
        self._time_manager = TimeKeeper(max_total_time)
        self._metrics: Tuple[Metric, ...] = scoring_to_metric(scoring)
        self._regularize_length = regularize_length
        self._search_method: BaseSearch = search
        self._post_processing = post_processing
        self._store = store

        if random_state is not None:
            random.seed(random_state)
            np.random.seed(random_state)

        self._x: Optional[pd.DataFrame] = None
        self._y: Optional[pd.DataFrame] = None
        self._basic_encoding_pipeline: Optional[Pipeline] = None
        self._fixed_pipeline_extension: List[Tuple[str, TransformerMixin]] = []
        self._inferred_dtypes: List[Type] = []
        self.model: object = None
        self._final_pop: List[Individual] = []

        self._subscribers: Dict[str, List[Callable]] = defaultdict(list)
        cache_directory = os.path.join(self.output_directory, "cache")
        if isinstance(post_processing, EnsemblePostProcessing):
            self._evaluation_library = EvaluationLibrary(
                m=post_processing.hyperparameters["max_models"],
                n=post_processing.hyperparameters["hillclimb_size"],
                cache=cache_directory,
            )
        else:
            # Don't keep memory-heavy evaluation meta-data (predictions, estimators)
            self._evaluation_library = EvaluationLibrary(m=0,
                                                         cache=cache_directory)
        self.evaluation_completed(self._evaluation_library.save_evaluation)
        e = search.logger(
            os.path.join(self.output_directory, "evaluations.log"))
        self.evaluation_completed(e.log_evaluation)

        self._pset, parameter_checks = pset_from_config(config)

        if DATA_TERMINAL not in self._pset:
            if max_pipeline_length is None:
                log.info(
                    "Setting `max_pipeline_length` to 1 "
                    "because there are no preprocessing steps in the search space."
                )
                max_pipeline_length = 1
            elif max_pipeline_length > 1:
                raise ValueError(
                    f"`max_pipeline_length` can't be {max_pipeline_length} "
                    "because there are no preprocessing steps in the search space."
                )
        max_start_length = 3 if max_pipeline_length is None else max_pipeline_length
        self._operator_set = OperatorSet(
            mutate=partial(
                random_valid_mutation_in_place,
                primitive_set=self._pset,
                max_length=max_pipeline_length,
            ),
            mate=partial(random_crossover, max_length=max_pipeline_length),
            create_from_population=partial(create_from_population,
                                           cxpb=0.2,
                                           mutpb=0.8),
            create_new=partial(
                create_random_expression,
                primitive_set=self._pset,
                max_length=max_start_length,
            ),
            compile_=compile_individual,
            eliminate=eliminate_from_pareto,
            evaluate_callback=self._on_evaluation_completed,
            completed_evaluations=self._evaluation_library.lookup,
        )
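
One detail worth flagging: the setattr(AsyncEvaluator, "__init__", partialmethod(...)) call above pre-binds constructor defaults, so every AsyncEvaluator created later (e.g. inside search) silently picks up n_workers and the memory limit. A minimal demonstration of that mechanism with a toy class (not GAMA's evaluator):

from functools import partialmethod

class Worker:
    def __init__(self, n_workers: int = 1):
        self.n_workers = n_workers

# Rebind __init__ with a new default; later instantiations pick it up.
Worker.__init__ = partialmethod(Worker.__init__, n_workers=4)
print(Worker().n_workers)  # 4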
Code Example #4
class Gama(ABC):
    """ Wrapper for the toolbox logic surrounding executing the AutoML pipeline. """
    def __init__(
        self,
        scoring: Union[str, Metric, Iterable[str],
                       Iterable[Metric]] = "filled_in_by_child_class",
        regularize_length: bool = True,
        max_pipeline_length: Optional[int] = None,
        config: Dict = None,
        random_state: Optional[int] = None,
        max_total_time: int = 3600,
        max_eval_time: Optional[int] = None,
        n_jobs: Optional[int] = None,
        max_memory_mb: Optional[int] = None,
        verbosity: int = logging.WARNING,
        search: BaseSearch = AsyncEA(),
        post_processing: BasePostProcessing = BestFitPostProcessing(),
        output_directory: Optional[str] = None,
        store: str = "logs",
    ):
        """

        Parameters
        ----------
        scoring: str, Metric or Tuple
            Specifies the metric or metrics to optimize towards.
            A string will be converted to Metric.
            A tuple must specify each metric with the same type (e.g. all str).
            See :ref:`Metrics` for built-in metrics.

        regularize_length: bool (default=True)
            If True, add pipeline length as an optimization metric.
            Short pipelines should then be preferred over long ones.

        max_pipeline_length: int, optional (default=None)
            If set, limit the maximum number of steps in any evaluated pipeline.
            Encoding and imputation are excluded.

        config: Dict
            Specifies available components and their valid hyperparameter settings.
            For more information, see :ref:`search_space_configuration`.

        random_state: int, optional (default=None)
            Seed for the random number generators used in the process.
            However, with `n_jobs > 1`,
            there will be randomization introduced by multi-processing.
            For reproducible results, set this and use `n_jobs=1`.

        max_total_time: positive int (default=3600)
            Time in seconds that can be used for the `fit` call.

        max_eval_time: positive int, optional (default=None)
            Time in seconds that can be used to evaluate any single individual.
            If None, set to 0.1 * max_total_time.

        n_jobs: int, optional (default=None)
            The number of parallel processes that may be created to speed up `fit`.
            Accepted values are positive integers, -1 or None.
            If -1 is specified, multiprocessing.cpu_count() processes are created.
            If None is specified, multiprocessing.cpu_count() // 2 processes are created.

        max_memory_mb: int, optional (default=None)
            Sets the total amount of memory GAMA is allowed to use (in megabytes).
            If not set, GAMA will use as much as it needs.
            GAMA is not guaranteed to respect this limit at all times,
            but it should never violate it for too long.

        verbosity: int (default=logging.WARNING)
            Sets the level of log messages to be automatically output to terminal.

        search: BaseSearch (default=AsyncEA())
            Search method to use to find good pipelines. Should be instantiated.

        post_processing: BasePostProcessing (default=BestFitPostProcessing())
            Post-processing method to create a model after the search phase.
            Should be an instantiated subclass of BasePostProcessing.

        output_directory: str, optional (default=None)
            Directory to use to save GAMA output. This includes both intermediate
            results during search and logs.
            If None, a unique name ("gama_HEXCODE") is generated.

        store: str (default='logs')
            Determines which data is stored after each run:
             - 'nothing': keep nothing from this run
             - 'models': keep only cache with models and predictions
             - 'logs': keep only the logs
             - 'all': keep logs and cache with models and predictions
        """
        if not output_directory:
            output_directory = f"gama_{str(uuid.uuid4())}"
        self.output_directory = os.path.abspath(
            os.path.expanduser(output_directory))
        if not os.path.exists(self.output_directory):
            os.mkdir(self.output_directory)

        register_stream_log(verbosity)
        if store in ["logs", "all"]:
            log_file = os.path.join(self.output_directory, "gama.log")
            log_handler = logging.FileHandler(log_file)
            log_handler.setLevel(logging.DEBUG)
            log_format = logging.Formatter(
                "[%(asctime)s - %(name)s] %(message)s")
            log_handler.setFormatter(log_format)
            logging.getLogger("gama").addHandler(log_handler)

        arguments = ",".join([
            f"{k}={v}" for (k, v) in locals().items() if k not in
            ["self", "config", "log_file", "log_handler", "log_format"]
        ])
        log.info(f"Using GAMA version {__version__}.")
        log.info(f"INIT:{self.__class__.__name__}({arguments})")

        if n_jobs is None:
            n_jobs = multiprocessing.cpu_count() // 2
            log.debug("n_jobs defaulted to %d.", n_jobs)
        elif n_jobs == -1:
            n_jobs = multiprocessing.cpu_count()
            log.debug("n_jobs set to use all %d cores.", n_jobs)

        err = ""
        if max_total_time is None or max_total_time <= 0:
            err = f"Expect positive int for max_total_time, got {max_total_time}."
        if max_eval_time is not None and max_eval_time <= 0:
            err = f"Expect None or positive int for max_eval_time, got {max_eval_time}."
        if n_jobs < -1 or n_jobs == 0:
            err = f"n_jobs should be -1 or positive int but is {n_jobs}."
        if err:
            self.cleanup("all")
            raise ValueError(err)

        setattr(
            AsyncEvaluator,
            "__init__",
            partialmethod(
                AsyncEvaluator.__init__,
                n_workers=n_jobs,
                memory_limit_mb=max_memory_mb,
                logfile=os.path.join(self.output_directory, "memory.log"),
            ),
        )

        if max_eval_time is None:
            max_eval_time = round(0.1 * max_total_time)
        if max_eval_time > max_total_time:
            log.warning(
                f"max_eval_time ({max_eval_time}) > max_total_time ({max_total_time}) "
                f"is not allowed. max_eval_time set to {max_total_time}.")
            max_eval_time = max_total_time

        self._max_eval_time = max_eval_time
        self._time_manager = TimeKeeper(max_total_time)
        self._metrics: Tuple[Metric, ...] = scoring_to_metric(scoring)
        self._regularize_length = regularize_length
        self._search_method: BaseSearch = search
        self._post_processing = post_processing
        self._store = store

        if random_state is not None:
            random.seed(random_state)
            np.random.seed(random_state)

        self._x: Optional[pd.DataFrame] = None
        self._y: Optional[pd.DataFrame] = None
        self._basic_encoding_pipeline: Optional[Pipeline] = None
        self._fixed_pipeline_extension: List[Tuple[str, TransformerMixin]] = []
        self._inferred_dtypes: List[Type] = []
        self.model: object = None
        self._final_pop: List[Individual] = []

        self._subscribers: Dict[str, List[Callable]] = defaultdict(list)
        cache_directory = os.path.join(self.output_directory, "cache")
        if isinstance(post_processing, EnsemblePostProcessing):
            self._evaluation_library = EvaluationLibrary(
                m=post_processing.hyperparameters["max_models"],
                n=post_processing.hyperparameters["hillclimb_size"],
                cache=cache_directory,
            )
        else:
            # Don't keep memory-heavy evaluation meta-data (predictions, estimators)
            self._evaluation_library = EvaluationLibrary(m=0,
                                                         cache=cache_directory)
        self.evaluation_completed(self._evaluation_library.save_evaluation)
        e = search.logger(
            os.path.join(self.output_directory, "evaluations.log"))
        self.evaluation_completed(e.log_evaluation)

        self._pset, parameter_checks = pset_from_config(config)

        if DATA_TERMINAL not in self._pset:
            if max_pipeline_length is None:
                log.info(
                    "Setting `max_pipeline_length` to 1 "
                    "because there are no preprocessing steps in the search space."
                )
                max_pipeline_length = 1
            elif max_pipeline_length > 1:
                raise ValueError(
                    f"`max_pipeline_length` can't be {max_pipeline_length} "
                    "because there are no preprocessing steps in the search space."
                )
        max_start_length = 3 if max_pipeline_length is None else max_pipeline_length
        self._operator_set = OperatorSet(
            mutate=partial(
                random_valid_mutation_in_place,
                primitive_set=self._pset,
                max_length=max_pipeline_length,
            ),
            mate=partial(random_crossover, max_length=max_pipeline_length),
            create_from_population=partial(create_from_population,
                                           cxpb=0.2,
                                           mutpb=0.8),
            create_new=partial(
                create_random_expression,
                primitive_set=self._pset,
                max_length=max_start_length,
            ),
            compile_=compile_individual,
            eliminate=eliminate_from_pareto,
            evaluate_callback=self._on_evaluation_completed,
            completed_evaluations=self._evaluation_library.lookup,
        )

    def cleanup(self, which="evaluations"):
        cache_directory = os.path.join(self.output_directory, "cache")
        if not os.path.exists(self.output_directory):
            return  # Cleanup has been called previously

        if which in ["logs", "all"]:
            for file in os.listdir(self.output_directory):
                if file.endswith(".log"):
                    os.remove(os.path.join(self.output_directory, file))
        if which in ["evaluations", "all"] and os.path.exists(cache_directory):
            shutil.rmtree(cache_directory)
        if which == "all":
            os.rmdir(self.output_directory)

    def _np_to_matching_dataframe(self, x: np.ndarray) -> pd.DataFrame:
        """ Format np array to dataframe whose column types match the training data. """
        if not isinstance(x, np.ndarray):
            raise TypeError(
                f"Expected x to be of type 'numpy.ndarray' not {type(x)}.")

        x = pd.DataFrame(x)
        for i, dtype in enumerate(self._inferred_dtypes):
            x[i] = x[i].astype(dtype)
        return x

    def _prepare_for_prediction(self, x):
        if isinstance(x, np.ndarray):
            x = self._np_to_matching_dataframe(x)
        x = self._basic_encoding_pipeline.transform(x)
        return x

    def _predict(self, x: pd.DataFrame):
        raise NotImplementedError("_predict is implemented by base classes.")

    def predict(self, x: Union[pd.DataFrame, np.ndarray]):
        """ Predict the target for input X.

        Parameters
        ----------
        x: pandas.DataFrame or numpy.ndarray
            A dataframe or array with the same number of columns as the input to `fit`.

        Returns
        -------
        numpy.ndarray
            Array with predictions of shape (N,), where N is len(x).
        """
        x = self._prepare_for_prediction(x)
        return self._predict(x)

    def predict_from_file(
        self,
        file_path: str,
        target_column: Optional[str] = None,
        encoding: Optional[str] = None,
        **kwargs,
    ) -> np.ndarray:
        """ Predict the target for input found in the ARFF file.

        Parameters
        ----------
        file_path: str
            A csv or ARFF file with the same columns as the one used in `fit`.
            Target column must be present in file, but its values are ignored.
        target_column: str, optional (default=None)
            Specifies which column the model should predict.
            If left None, the last column is taken to be the target.
        encoding: str, optional
            Encoding of the ARFF file.
        **kwargs:
            Any additional arguments for calls to pandas.read_csv or arff.load.

        Returns
        -------
        numpy.ndarray
            Array with predictions for each row in the file.
        """
        x, _ = X_y_from_file(file_path,
                             split_column=target_column,
                             encoding=encoding,
                             **kwargs)
        x = self._prepare_for_prediction(x)
        return self._predict(x)

    def score(self, x: Union[pd.DataFrame, np.ndarray],
              y: Union[pd.Series, np.ndarray]) -> float:
        """ Calculate `self.scoring` metric of the model on (x, y).

        Parameters
        ----------
        x: pandas.DataFrame or numpy.ndarray
            Data to predict target values for.
        y: pandas.Series or numpy.ndarray
            True values for the target.

        Returns
        -------
        float
            The score obtained on the given test data according to the `scoring` metric.
        """
        predictions = (
            self.predict_proba(x)  # type: ignore
            if self._metrics[0].requires_probabilities else self.predict(x))
        return self._metrics[0].score(y, predictions)

    def score_from_file(
        self,
        file_path: str,
        target_column: Optional[str] = None,
        encoding: Optional[str] = None,
        **kwargs,
    ) -> float:
        """ Calculate `self.scoring` metric of the model on data in the file.

        Parameters
        ----------
        file_path: str
            A csv or ARFF file with which to calculate the score.
        target_column: str, optional (default=None)
            Specifies which column the model should predict.
            If left None, the last column is taken to be the target.
        encoding: str, optional
            Encoding of the ARFF file.
        **kwargs:
            Any additional arguments for calls to pandas.read_csv or arff.load.

        Returns
        -------
        float
            The score obtained on the given test data according to the `scoring` metric.
        """
        x, y = X_y_from_file(file_path,
                             split_column=target_column,
                             encoding=encoding,
                             **kwargs)
        return self.score(x, y)

    def fit_from_file(
        self,
        file_path: str,
        target_column: Optional[str] = None,
        encoding: Optional[str] = None,
        warm_start: Optional[List[Individual]] = None,
        **kwargs,
    ) -> None:
        """ Find and fit a model to predict the target column (last) from other columns.

        Parameters
        ----------
        file_path: str
            Path to a csv or ARFF file containing the training data.
        target_column: str, optional (default=None)
            Specifies which column the model should predict.
            If left None, the last column is taken to be the target.
        encoding: str, optional
            Encoding of the file.
        warm_start: List[Individual], optional (default=None)
            A list of individuals to start the search procedure with.
            If None is given, random start candidates are generated.
        **kwargs:
            Any additional arguments for calls to pandas.read_csv or arff.load.

        """
        x, y = X_y_from_file(file_path, target_column, encoding, **kwargs)
        self.fit(x, y, warm_start)

    def fit(
        self,
        x: Union[pd.DataFrame, np.ndarray],
        y: Union[pd.DataFrame, pd.Series, np.ndarray],
        warm_start: Optional[List[Individual]] = None,
    ) -> "Gama":
        """ Find and fit a model to predict target y from X.

        Various possible machine learning pipelines will be fit to the (X,y) data.
        Using Genetic Programming, the pipelines chosen should lead to gradually
        better models. Pipelines will internally be validated using cross validation.

        After the search termination condition is met, the best found pipeline
        configuration is then used to train a final model on all provided data.

        Parameters
        ----------
        x: pandas.DataFrame or numpy.ndarray, shape = [n_samples, n_features]
            Training data. All elements must be convertible to float.
        y: pandas.DataFrame, pandas.Series or numpy.ndarray, shape = [n_samples,]
            Target values.
            If a DataFrame is provided, assumes the first column contains target values.
        warm_start: List[Individual], optional (default=None)
            A list of individuals to start the search procedure with.
            If None is given, random start candidates are generated.
        """
        self._time_manager = TimeKeeper(self._time_manager.total_time)

        with self._time_manager.start_activity("preprocessing",
                                               activity_meta=["default"]):
            x, self._y = format_x_y(x, y)
            self._inferred_dtypes = x.dtypes
            is_classification = hasattr(self, "_label_encoder")
            self._x, self._basic_encoding_pipeline = basic_encoding(
                x, is_classification)
            self._fixed_pipeline_extension = basic_pipeline_extension(
                self._x, is_classification)
            self._operator_set._safe_compile = partial(
                self._operator_set._compile,
                preprocessing_steps=self._fixed_pipeline_extension,
            )
            store_pipelines = (self._evaluation_library._m is None
                               or self._evaluation_library._m > 0)

            if store_pipelines and self._x.shape[0] * self._x.shape[1] > 6_000_000:
                # If m > 0, models are stored for each evaluation. At this data
                # size KNN creates models of about 76 MB, which is too big, so
                # exclude it from search:
                log.info(
                    "Excluding KNN from search because the dataset is too big."
                )
                from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor

                self._pset["prediction"] = [
                    p for p in self._pset["prediction"] if p.identifier not in
                    [KNeighborsClassifier, KNeighborsRegressor]
                ]

            if store_pipelines and self._x.shape[1] > 50:
                log.info(
                    "Data has too many features to include PolynomialFeatures")
                from sklearn.preprocessing import PolynomialFeatures

                self._pset["data"] = [
                    p for p in self._pset["data"]
                    if p.identifier not in [PolynomialFeatures]
                ]

        fit_time = int((1 - self._post_processing.time_fraction) *
                       self._time_manager.total_time_remaining)

        with self._time_manager.start_activity(
                "search",
                time_limit=fit_time,
                activity_meta=[self._search_method.__class__.__name__],
        ):
            self._search_phase(warm_start, timeout=fit_time)

        with self._time_manager.start_activity(
                "postprocess",
                time_limit=int(self._time_manager.total_time_remaining),
                activity_meta=[self._post_processing.__class__.__name__],
        ):
            best_individuals = list(
                reversed(
                    sorted(
                        self._final_pop,
                        key=lambda ind: cast(Fitness, ind.fitness).values,
                    )))
            self._post_processing.dynamic_defaults(self)
            self.model = self._post_processing.post_process(
                self._x,
                self._y,
                self._time_manager.total_time_remaining,
                best_individuals,
            )
        if self._store != "all":
            to_clean = dict(nothing="all", logs="evaluations", models="logs")
            self.cleanup(to_clean[self._store])
        return self

    def _search_phase(self,
                      warm_start: Optional[List[Individual]] = None,
                      timeout: float = 1e6):
        """ Invoke the search algorithm, populate `final_pop`. """
        if warm_start:
            if not all([isinstance(i, Individual) for i in warm_start]):
                raise TypeError("`warm_start` must be a list of Individual.")
            pop = warm_start
        elif warm_start is None and len(self._final_pop) > 0:
            pop = self._final_pop
        else:
            pop = [self._operator_set.individual() for _ in range(50)]

        deadline = time.time() + timeout

        evaluate_pipeline = partial(
            gama.genetic_programming.compilers.scikitlearn.evaluate_pipeline,
            x=self._x,
            y_train=self._y,
            metrics=self._metrics,
        )
        AsyncEvaluator.defaults = dict(evaluate_pipeline=evaluate_pipeline)

        self._operator_set.evaluate = partial(
            gama.genetic_programming.compilers.scikitlearn.evaluate_individual,
            # evaluate_pipeline=evaluate_pipeline,
            timeout=self._max_eval_time,
            deadline=deadline,
            add_length_to_score=self._regularize_length,
        )

        try:
            with stopit.ThreadingTimeout(timeout):
                self._search_method.dynamic_defaults(self._x, self._y, timeout)
                self._search_method.search(self._operator_set,
                                           start_candidates=pop)
        except KeyboardInterrupt:
            log.info("Search phase terminated because of Keyboard Interrupt.")

        self._final_pop = self._search_method.output
        n_evaluations = len(self._evaluation_library.evaluations)
        log.info(f"Search phase evaluated {n_evaluations} individuals.")

    def export_script(self,
                      file: Optional[str] = "gama_pipeline.py",
                      raise_if_exists: bool = False):
        """ Export a Python script which sets up the best found pipeline.

        Can only be called after `fit`.

        Example
        -------
        After the AutoML search process has completed (i.e. `fit` has been called),
        the model which has been found by GAMA may be exported to a Python file.
        The Python file will define the found pipeline or ensemble.

        .. code-block:: python

            automl = GamaClassifier()
            automl.fit(X, y)
            automl.export_script('my_pipeline_script.py')

        The resulting script will define a variable `pipeline` or `ensemble`,
        depending on the post-processing method that was used after search.

        Parameters
        ----------
        file: str, optional (default='gama_pipeline.py')
            Desired filename of the exported Python script.
            If None, return the code as a str instead; it will not be formatted(!).
        raise_if_exists: bool (default=False)
            If True, raise an error if the file already exists.
            If False, overwrite `file` if it already exists.
        """
        if self.model is None:
            raise RuntimeError(STR_NO_OPTIMAL_PIPELINE)
        if raise_if_exists and file is not None and os.path.isfile(file):
            raise FileExistsError(f"File {file} already exists.")

        if self._basic_encoding_pipeline is not None:
            script_text = self._post_processing.to_code(
                self._basic_encoding_pipeline.steps +
                self._fixed_pipeline_extension)
        else:
            script_text = self._post_processing.to_code(
                self._fixed_pipeline_extension)

        if file:
            with open(file, "w") as fh:
                fh.write(script_text)
            subprocess.call(["black", file])
        else:
            return script_text

    def _safe_outside_call(self, fn):
        """ Calls fn logging and ignoring all exceptions except TimeoutException. """
        try:
            fn()
        except stopit.utils.TimeoutException:
            raise
        except Exception:
            # We actually want to catch any other exception here,
            # because the callback code can be arbitrary (it can be provided by users).
            # This excuses the catch-all Exception.
            # Note KeyboardInterrupts are not exceptions and get elevated to the caller.
            log.warning("Exception during callback.", exc_info=True)

        if self._time_manager.current_activity.exceeded_limit(margin=3.0):
            # If time exceeds during a safe callback, the timeout exception *might*
            # have been swallowed. This can result in GAMA running indefinitely.
            # However in rare conditions it can be that the TimeoutException is still
            # being processed, which means we should not raise a new one yet.
            # That's why we raise the exception only if sufficient time has passed
            # since it should have been handled (3 seconds).
            raise stopit.utils.TimeoutException

    def _on_evaluation_completed(self, evaluation: Evaluation):
        for callback in self._subscribers["evaluation_completed"]:
            self._safe_outside_call(partial(callback, evaluation))

    def evaluation_completed(self, callback: Callable[[Evaluation],
                                                      Any]) -> None:
        """ Register a callback function that is called when an evaluation is completed.

        Parameters
        ----------
        callback: Callable[[Evaluation], Any]
            Function to call when a pipeline is evaluated, return values are ignored.
        """
        self._subscribers["evaluation_completed"].append(callback)
Code Example #5
File: async_ea.py (Project: prabhant/gama)
def async_ea(
    ops: OperatorSet,
    output: List[Individual],
    start_candidates: List[Individual],
    restart_callback: Optional[Callable[[], bool]] = None,
    max_n_evaluations: Optional[int] = None,
    population_size: int = 50,
) -> List[Individual]:
    """ Perform asynchronous evolutionary optimization with given operators.

    Parameters
    ----------
    ops: OperatorSet
        Operator set with `evaluate`, `create`, `individual` and `eliminate` functions.
    output: List[Individual]
        A list which contains the set of best found individuals during search.
    start_candidates: List[Individual]
        A list with candidate individuals which should be used to start search from.
    restart_callback: Callable[[], bool], optional (default=None)
        Function which takes no arguments and returns True if the search should restart.
    max_n_evaluations: int, optional (default=None)
        If specified, only a maximum of `max_n_evaluations` individuals are evaluated.
        If None, the algorithm will be run indefinitely.
    population_size: int (default=50)
        Maximum number of individuals in the population at any time.

    Returns
    -------
    List[Individual]
        The individuals currently in the population.
    """
    if max_n_evaluations is not None and max_n_evaluations <= 0:
        raise ValueError(
            f"n_evaluations must be non-negative or None, is {max_n_evaluations}."
        )

    max_pop_size = population_size

    current_population = output
    n_evaluated_individuals = 0

    with AsyncEvaluator() as async_:
        should_restart = True
        while should_restart:
            should_restart = False
            current_population[:] = []
            log.info("Starting EA with new population.")
            for individual in start_candidates:
                async_.submit(ops.evaluate, individual)

            while max_n_evaluations is None or n_evaluated_individuals < max_n_evaluations:
                future = ops.wait_next(async_)
                if future.exception is None and future.result.error is None:
                    current_population.append(future.result.individual)
                    if len(current_population) > max_pop_size:
                        to_remove = ops.eliminate(current_population, 1)
                        current_population.remove(to_remove[0])

                if async_.job_queue_size <= 1:
                    # Technically 0 should work to keep near-100% worker load,
                    # especially if the dataset is sufficiently large to require
                    # significant time to evaluate a pipeline.
                    # Increasing the number decreases the risk of lost compute time,
                    # but also increases information lag. An offspring created too
                    # early might miss out on a better parent.
                    new_individual = ops.create(current_population, 1)[0]
                    async_.submit(ops.evaluate, new_individual)

                should_restart = restart_callback is not None and restart_callback()
                n_evaluated_individuals += 1
                if should_restart:
                    log.info(
                        "Restart criterion met. Creating new random population."
                    )
                    start_candidates = [
                        ops.individual() for _ in range(max_pop_size)
                    ]
                    break

    return current_population
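
The population cap works by appending each successful evaluation and then asking ops.eliminate for one victim whenever the population exceeds max_pop_size. In miniature, with plain numbers standing in for fitness values (toy_eliminate is an invented stand-in for eliminate_from_pareto):

def toy_eliminate(population: list, n: int) -> list:
    return sorted(population)[:n]  # "worst" = smallest fitness here

population: list = []
max_pop_size = 5
for fitness in [3, 1, 4, 1, 5, 9, 2, 6]:
    population.append(fitness)
    if len(population) > max_pop_size:
        worst = toy_eliminate(population, 1)
        population.remove(worst[0])
print(population)  # [3, 4, 5, 9, 6]: the five fittest survive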
Code Example #6
File: gama.py (Project: vumichien/gama)
    def __init__(
        self,
        scoring: Union[str, Metric, Iterable[str],
                       Iterable[Metric]] = "filled_in_by_child_class",
        regularize_length: bool = True,
        max_pipeline_length: Optional[int] = None,
        config: Dict = None,
        random_state: Optional[int] = None,
        max_total_time: int = 3600,
        max_eval_time: Optional[int] = None,
        n_jobs: Optional[int] = None,
        verbosity: int = logging.WARNING,
        keep_analysis_log: Optional[str] = "gama.log",
        search_method: BaseSearch = AsyncEA(),
        post_processing_method: BasePostProcessing = BestFitPostProcessing(),
        cache: Optional[str] = None,
    ):
        """

        Parameters
        ----------
        scoring: str, Metric or Tuple
            Specifies the metric or metrics to optimize towards.
            A string will be converted to Metric.
            A tuple must specify each metric with the same type (e.g. all str).
            See :ref:`Metrics` for built-in metrics.

        regularize_length: bool (default=True)
            If True, add pipeline length as an optimization metric.
            Short pipelines should then be preferred over long ones.

        max_pipeline_length: int, optional (default=None)
            If set, limit the maximum number of steps in any evaluated pipeline.
            Encoding and imputation are excluded.

        config: Dict
            Specifies available components and their valid hyperparameter settings.
            For more information, see :ref:`search_space_configuration`.

        random_state: int, optional (default=None)
            Seed for the random number generators used in the process.
            However, with `n_jobs > 1`,
            there will be randomization introduced by multi-processing.
            For reproducible results, set this and use `n_jobs=1`.

        max_total_time: positive int (default=3600)
            Time in seconds that can be used for the `fit` call.

        max_eval_time: positive int, optional (default=None)
            Time in seconds that can be used to evaluate any single individual.
            If None, set to 0.1 * max_total_time.

        n_jobs: int, optional (default=None)
            The number of parallel processes that may be created to speed up `fit`.
            Accepted values are positive integers, -1 or None.
            If -1 is specified, multiprocessing.cpu_count() processes are created.
            If None is specified, multiprocessing.cpu_count() // 2 processes are created.

        verbosity: int (default=logging.WARNING)
            Sets the level of log messages to be automatically output to terminal.

        keep_analysis_log: str, optional (default='gama.log')
            If a non-empty str, specifies the filepath where the log should be stored.
            If `None`, no log is stored.

        search_method: BaseSearch (default=AsyncEA())
            Search method to use to find good pipelines. Should be instantiated.

        post_processing_method: BasePostProcessing (default=BestFitPostProcessing())
            Post-processing method to create a model after the search phase.
            Should be an instantiated subclass of BasePostProcessing.

        cache: str, optional (default=None)
            Directory to use to save intermediate results during search.
            If None, a unique cache name is generated.
        """
        register_stream_log(verbosity)
        if keep_analysis_log is not None:
            register_file_log(keep_analysis_log)

        if keep_analysis_log is not None and not os.path.isabs(
                keep_analysis_log):
            keep_analysis_log = os.path.abspath(keep_analysis_log)

        arguments = ",".join([
            f"{k}={v}" for (k, v) in locals().items()
            if k not in ["self", "config"]
        ])
        log.info(f"Using GAMA version {__version__}.")
        log.info(f"{self.__class__.__name__}({arguments})")
        log_event(log, TOKENS.INIT, arguments)

        if n_jobs is None:
            n_jobs = multiprocessing.cpu_count() // 2
            log.debug("n_jobs defaulted to %d", n_jobs)

        if max_total_time is None or max_total_time <= 0:
            raise ValueError(
                f"Expect positive int for max_total_time, got {max_total_time}."
            )
        if max_eval_time is not None and max_eval_time <= 0:
            raise ValueError(
                f"Expect None or positive int for max_eval_time, got {max_eval_time}."
            )
        if n_jobs < -1 or n_jobs == 0:
            raise ValueError(
                f"n_jobs should be -1 or positive int but is {n_jobs}.")
        AsyncEvaluator.n_jobs = n_jobs

        if max_eval_time is None:
            max_eval_time = round(0.1 * max_total_time)
        if max_eval_time > max_total_time:
            log.warning(
                f"max_eval_time ({max_eval_time}) > max_total_time ({max_total_time}) "
                f"is not allowed. max_eval_time set to {max_total_time}.")
            max_eval_time = max_total_time

        self._max_eval_time = max_eval_time
        self._time_manager = TimeKeeper(max_total_time)
        self._metrics: Tuple[Metric, ...] = scoring_to_metric(scoring)
        self._regularize_length = regularize_length
        self._search_method: BaseSearch = search_method
        self._post_processing = post_processing_method

        if random_state is not None:
            random.seed(random_state)
            np.random.seed(random_state)

        self._x: Optional[pd.DataFrame] = None
        self._y: Optional[pd.DataFrame] = None
        self._basic_encoding_pipeline: Optional[Pipeline] = None
        self._fixed_pipeline_extension: List[Tuple[str, TransformerMixin]] = []
        self._inferred_dtypes: List[Type] = []
        self.model: object = None
        self._final_pop: List[Individual] = []

        self._subscribers: Dict[str, List[Callable]] = defaultdict(list)
        if not cache:
            cache = f"cache_{str(uuid.uuid4())}"
        if isinstance(post_processing_method, EnsemblePostProcessing):
            self._evaluation_library = EvaluationLibrary(
                m=post_processing_method.hyperparameters["max_models"],
                n=post_processing_method.hyperparameters["hillclimb_size"],
                cache_directory=cache,
            )
        else:
            # Don't keep memory-heavy evaluation meta-data (predictions, estimators)
            self._evaluation_library = EvaluationLibrary(m=0,
                                                         cache_directory=cache)
        self.evaluation_completed(self._evaluation_library.save_evaluation)

        self._pset, parameter_checks = pset_from_config(config)

        max_start_length = 3 if max_pipeline_length is None else max_pipeline_length
        self._operator_set = OperatorSet(
            mutate=partial(
                random_valid_mutation_in_place,
                primitive_set=self._pset,
                max_length=max_pipeline_length,
            ),
            mate=partial(random_crossover, max_length=max_pipeline_length),
            create_from_population=partial(create_from_population,
                                           cxpb=0.2,
                                           mutpb=0.8),
            create_new=partial(
                create_random_expression,
                primitive_set=self._pset,
                max_length=max_start_length,
            ),
            compile_=compile_individual,
            eliminate=eliminate_from_pareto,
            evaluate_callback=self._on_evaluation_completed,
            completed_evaluations=self._evaluation_library.lookup,
        )
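
As in the other constructors, scoring_to_metric normalizes whatever the caller passed (a single string or Metric, or an iterable of them) into a tuple of Metric objects. A toy mirror of that normalization with strings only (to_metric_tuple is hypothetical, not gama's implementation):

from typing import Iterable, Tuple, Union

def to_metric_tuple(scoring: Union[str, Iterable[str]]) -> Tuple[str, ...]:
    # Strings are themselves iterable, so check for str first.
    if isinstance(scoring, str):
        return (scoring,)
    return tuple(scoring)

print(to_metric_tuple("accuracy"))          # ('accuracy',)
print(to_metric_tuple(["accuracy", "f1"]))  # ('accuracy', 'f1')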
Code Example #7
File: async_ea.py (Project: vumichien/gama)
def async_ea(
    ops: OperatorSet,
    output: List[Individual],
    start_candidates: List[Individual],
    restart_callback: Optional[Callable[[], bool]] = None,
    max_n_evaluations: Optional[int] = None,
    population_size: int = 50,
) -> List[Individual]:
    """ Perform asynchronous evolutionary optimization with given operators.

    Parameters
    ----------
    ops: OperatorSet
        Operator set with `evaluate`, `create`, `individual` and `eliminate` functions.
    output: List[Individual]
        A list which contains the set of best found individuals during search.
    start_candidates: List[Individual]
        A list with candidate individuals which should be used to start search from.
    restart_callback: Callable[[], bool], optional (default=None)
        Function which takes no arguments and returns True if the search should restart.
    max_n_evaluations: int, optional (default=None)
        If specified, only a maximum of `max_n_evaluations` individuals are evaluated.
        If None, the algorithm will be run indefinitely.
    population_size: int (default=50)
        Maximum number of individuals in the population at any time.

    Returns
    -------
    List[Individual]
        The individuals currently in the population.
    """
    if max_n_evaluations is not None and max_n_evaluations <= 0:
        raise ValueError(
            f"n_evaluations must be non-negative or None, is {max_n_evaluations}."
        )

    max_pop_size = population_size
    logger = MultiprocessingLogger()

    evaluate_log = partial(ops.evaluate, logger=logger)

    current_population = output
    n_evaluated_individuals = 0

    with AsyncEvaluator() as async_:
        should_restart = True
        while should_restart:
            should_restart = False
            current_population[:] = []
            log.info("Starting EA with new population.")
            for individual in start_candidates:
                async_.submit(evaluate_log, individual)

            while max_n_evaluations is None or n_evaluated_individuals < max_n_evaluations:
                future = ops.wait_next(async_)
                logger.flush_to_log(log)
                if future.exception is None:
                    individual = future.result.individual
                    current_population.append(individual)
                    if len(current_population) > max_pop_size:
                        to_remove = ops.eliminate(current_population, 1)
                        log_event(log, TOKENS.EA_REMOVE_IND, to_remove[0])
                        current_population.remove(to_remove[0])

                if len(current_population) > 2:
                    new_individual = ops.create(current_population, 1)[0]
                    async_.submit(evaluate_log, new_individual)

                should_restart = restart_callback is not None and restart_callback()
                n_evaluated_individuals += 1
                if should_restart:
                    log.info(
                        "Restart criterion met. Creating new random population."
                    )
                    log_event(log, TOKENS.EA_RESTART, n_evaluated_individuals)
                    start_candidates = [
                        ops.individual() for _ in range(max_pop_size)
                    ]
                    break

    return current_population
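
This variant routes worker-side log messages through MultiprocessingLogger, because worker processes cannot write directly to the parent's log handlers; the main loop drains them with logger.flush_to_log(log). A toy stand-in with the same shape (QueueLogger is invented for illustration, not GAMA's class):

import logging
import multiprocessing

class QueueLogger:
    def __init__(self):
        self._queue = multiprocessing.Manager().Queue()

    def info(self, message: str) -> None:
        self._queue.put(message)  # safe to call from worker processes

    def flush_to_log(self, log: logging.Logger) -> None:
        while not self._queue.empty():  # drain in the main process
            log.info(self._queue.get())

logger = QueueLogger()
logger.info("evaluated pipeline in 0.2s")
logger.flush_to_log(logging.getLogger(__name__))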
Code Example #8
    def __init__(
        self,
        scoring: Union[str, Metric, Tuple[Union[str, Metric],
                                          ...]] = 'filled_in_by_child_class',
        regularize_length: bool = True,
        max_pipeline_length: Optional[int] = None,
        config: Dict = None,
        random_state: Optional[int] = None,
        max_total_time: int = 3600,
        max_eval_time: Optional[int] = None,
        n_jobs: Optional[int] = None,
        verbosity: int = logging.WARNING,
        keep_analysis_log: Optional[str] = 'gama.log',
        search_method: BaseSearch = AsyncEA(),
        post_processing_method: BasePostProcessing = BestFitPostProcessing()):
        """

        Parameters
        ----------
        scoring: str, Metric or Tuple
            Specifies the metric or metrics to optimize towards. A string will be converted to Metric.
            A tuple must specify each metric with the same type (i.e. all str or all Metric).
            See :ref:`Metrics` for built-in metrics.

        regularize_length: bool
            If True, add pipeline length as an optimization metric (preferring short over long).

        max_pipeline_length: int, optional (default=None)
            If set, limit the maximum number of steps in any evaluated pipeline. Encoding and imputation are excluded.

        config: Dict
            A dictionary which specifies available components and their valid hyperparameter settings.
            For more information, see :ref:`search_space_configuration`.

        random_state: int, optional (default=None)
            If an integer is passed, this will be the seed for the random number generators used in the process.
            However, with `n_jobs > 1`, there will be randomization introduced by multi-processing.
            For reproducible results, set this and use `n_jobs=1`.

        max_total_time: positive int (default=3600)
            Time in seconds that can be used for the `fit` call.

        max_eval_time: positive int, optional (default=None)
            Time in seconds that can be used to evaluate any single individual.
            If None, set to 0.1 * max_total_time.

        n_jobs: int, optional (default=None)
            The number of parallel processes that may be created to speed up `fit`.
            Accepted values are positive integers, -1 or None.
            If -1 is specified, multiprocessing.cpu_count() processes are created.
            If None is specified, multiprocessing.cpu_count() // 2 processes are created.

        verbosity: int (default=logging.WARNING)
            Sets the level of log messages to be automatically output to terminal.

        keep_analysis_log: str, optional (default='gama.log')
            If non-empty str, specifies the path (and name) where the log should be stored, e.g. /output/gama.log.
            If `None`, no log is stored.

        search_method: BaseSearch (default=AsyncEA())
            Search method to use to find good pipelines. Should be instantiated.

        post_processing_method: BasePostProcessing (default=BestFitPostProcessing())
            Post-processing method to create a model after the search phase. Should be instantiated.

        """
        register_stream_log(verbosity)
        if keep_analysis_log is not None:
            register_file_log(keep_analysis_log)

        if keep_analysis_log is not None and not os.path.isabs(
                keep_analysis_log):
            keep_analysis_log = os.path.abspath(keep_analysis_log)

        arguments = ','.join([
            '{}={}'.format(k, v) for (k, v) in locals().items() if k not in [
                'self', 'config', 'gamalog', 'file_handler',
                'stdout_streamhandler'
            ]
        ])
        log.info('Using GAMA version {}.'.format(__version__))
        log.info('{}({})'.format(self.__class__.__name__, arguments))
        log_event(log, TOKENS.INIT, arguments)

        if n_jobs is None:
            n_jobs = multiprocessing.cpu_count() // 2
            log.debug('n_jobs defaulted to %d', n_jobs)

        if max_total_time is None or max_total_time <= 0:
            raise ValueError(
                f"max_total_time should be a positive integer but is {max_total_time}."
            )
        if max_eval_time is not None and max_eval_time <= 0:
            raise ValueError(
                f"max_eval_time should be None or a positive integer but is {max_eval_time}."
            )
        if n_jobs < -1 or n_jobs == 0:
            raise ValueError(
                f"n_jobs should be -1 or a positive integer but is {n_jobs}.")
        elif n_jobs != -1:
            # AsyncEvaluator defaults to using multiprocessing.cpu_count(), i.e. n_jobs=-1
            AsyncEvaluator.n_jobs = n_jobs

        if max_eval_time is None:
            max_eval_time = 0.1 * max_total_time
        if max_eval_time > max_total_time:
            log.warning(
                f"max_eval_time ({max_eval_time}) > max_total_time ({max_total_time}) is not allowed. "
                f"max_eval_time set to {max_total_time}.")
            max_eval_time = max_total_time

        self._max_eval_time = max_eval_time
        self._time_manager = TimeKeeper(max_total_time)
        self._metrics: Tuple[Metric, ...] = scoring_to_metric(scoring)
        self._regularize_length = regularize_length
        self._search_method: BaseSearch = search_method
        self._post_processing = post_processing_method

        if random_state is not None:
            random.seed(random_state)
            np.random.seed(random_state)

        self._X: Optional[pd.DataFrame] = None
        self._y: Optional[pd.DataFrame] = None
        self._basic_encoding_pipeline: Optional[Pipeline] = None
        self._inferred_dtypes: List[Type] = []
        self.model: object = None
        self._final_pop = None

        self._subscribers = defaultdict(list)
        if isinstance(post_processing_method, EnsemblePostProcessing):
            self._evaluation_library = EvaluationLibrary(
                m=post_processing_method.hyperparameters['max_models'],
                n=post_processing_method.hyperparameters['hillclimb_size'],
            )
        else:
            # Don't keep memory-heavy evaluation meta-data (predictions, estimators)
            self._evaluation_library = EvaluationLibrary(m=0)
        self.evaluation_completed(self._evaluation_library.save_evaluation)

        self._pset, parameter_checks = pset_from_config(config)

        max_start_length = 3 if max_pipeline_length is None else max_pipeline_length
        self._operator_set = OperatorSet(
            mutate=partial(random_valid_mutation_in_place,
                           primitive_set=self._pset,
                           max_length=max_pipeline_length),
            mate=partial(random_crossover, max_length=max_pipeline_length),
            create_from_population=partial(create_from_population,
                                           cxpb=0.2,
                                           mutpb=0.8),
            create_new=partial(create_random_expression,
                               primitive_set=self._pset,
                               max_length=max_start_length),
            compile_=compile_individual,
            eliminate=eliminate_from_pareto,
            evaluate_callback=self._on_evaluation_completed)
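
A minimal usage sketch for the constructor above. This is an illustration, not part of the source: it assumes the GamaClassifier child class fills in a default `scoring` metric, and the keyword values are arbitrary.

from gama import GamaClassifier

automl = GamaClassifier(
    scoring='accuracy',      # a single metric as str; converted to Metric
    max_total_time=600,      # seconds available for the whole `fit` call
    max_eval_time=60,        # seconds allowed per candidate pipeline
    n_jobs=1,                # single process; pair with random_state for reproducibility
    random_state=0,          # seeds `random` and `numpy.random`
    keep_analysis_log=None,  # do not store the analysis log
)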
Code example #9
0
class Gama(ABC):
    """ Wrapper for the toolbox logic surrounding executing the AutoML pipeline. """
    def __init__(
        self,
        scoring: Union[str, Metric, Tuple[Union[str, Metric],
                                          ...]] = 'filled_in_by_child_class',
        regularize_length: bool = True,
        max_pipeline_length: Optional[int] = None,
        config: Optional[Dict] = None,
        random_state: Optional[int] = None,
        max_total_time: int = 3600,
        max_eval_time: Optional[int] = None,
        n_jobs: Optional[int] = None,
        verbosity: int = logging.WARNING,
        keep_analysis_log: Optional[str] = 'gama.log',
        search_method: BaseSearch = AsyncEA(),
        post_processing_method: BasePostProcessing = BestFitPostProcessing()):
        """

        Parameters
        ----------
        scoring: str, Metric or Tuple
            Specifies the/all metric(s) to optimize towards. A string will be converted to Metric. A tuple must
            specify each metric with the same type (i.e. all str or all Metric). See :ref:`Metrics` for built-in
            metrics.

        regularize_length: bool
            If True, add pipeline length as an optimization metric (preferring short over long).

        max_pipeline_length: int, optional (default=None)
            If set, limit the maximum number of steps in any evaluated pipeline. Encoding and imputation are excluded.

        config: Dict
            A dictionary which specifies available components and their valid hyperparameter settings.
            For more information, see :ref:`search_space_configuration`.

        random_state: int, optional (default=None)
            If an integer is passed, this will be the seed for the random number generators used in the process.
            However, with `n_jobs > 1`, there will be randomization introduced by multi-processing.
            For reproducible results, set this and use `n_jobs=1`.

        max_total_time: positive int (default=3600)
            Time in seconds that can be used for the `fit` call.

        max_eval_time: positive int, optional (default=None)
            Time in seconds that can be used to evaluate any one single individual.
            If None, set to 0.1 * max_total_time.

        n_jobs: int, optional (default=None)
            The number of parallel processes that may be created to speed up `fit`.
            Accepted values are positive integers, -1 or None.
            If -1 is specified, multiprocessing.cpu_count() processes are created.
            If None is specified, multiprocessing.cpu_count() // 2 processes are created.

        verbosity: int (default=logging.WARNING)
            Sets the level of log messages to be automatically output to terminal.

        keep_analysis_log: str, optional (default='gama.log')
            If non-empty str, specifies the path (and name) where the log should be stored, e.g. /output/gama.log.
            If `None`, no log is stored.

        search_method: BaseSearch (default=AsyncEA())
            Search method to use to find good pipelines. Should be instantiated.

        post_processing_method: BasePostProcessing (default=BestFitPostProcessing())
            Post-processing method to create a model after the search phase. Should be instantiated.

        """
        register_stream_log(verbosity)
        if keep_analysis_log is not None:
            register_file_log(keep_analysis_log)
            if not os.path.isabs(keep_analysis_log):
                keep_analysis_log = os.path.abspath(keep_analysis_log)

        arguments = ','.join([
            '{}={}'.format(k, v) for (k, v) in locals().items() if k not in [
                'self', 'config', 'gamalog', 'file_handler',
                'stdout_streamhandler'
            ]
        ])
        log.info('Using GAMA version {}.'.format(__version__))
        log.info('{}({})'.format(self.__class__.__name__, arguments))
        log_event(log, TOKENS.INIT, arguments)

        if n_jobs is None:
            n_jobs = multiprocessing.cpu_count() // 2
            log.debug('n_jobs defaulted to %d', n_jobs)

        if max_total_time is None or max_total_time <= 0:
            raise ValueError(
                f"max_total_time should be a positive integer but is {max_total_time}."
            )
        if max_eval_time is not None and max_eval_time <= 0:
            raise ValueError(
                f"max_eval_time should be None or a positive integer but is {max_eval_time}."
            )
        if n_jobs < -1 or n_jobs == 0:
            raise ValueError(
                f"n_jobs should be -1 or a positive integer but is {n_jobs}.")
        elif n_jobs != -1:
            # AsyncEvaluator defaults to using multiprocessing.cpu_count(), i.e. n_jobs=-1
            AsyncEvaluator.n_jobs = n_jobs

        if max_eval_time is None:
            max_eval_time = 0.1 * max_total_time
        if max_eval_time > max_total_time:
            log.warning(
                f"max_eval_time ({max_eval_time}) > max_total_time ({max_total_time}) is not allowed. "
                f"max_eval_time set to {max_total_time}.")
            max_eval_time = max_total_time

        self._max_eval_time = max_eval_time
        self._time_manager = TimeKeeper(max_total_time)
        self._metrics: Tuple[Metric, ...] = scoring_to_metric(scoring)
        self._regularize_length = regularize_length
        self._search_method: BaseSearch = search_method
        self._post_processing = post_processing_method

        if random_state is not None:
            random.seed(random_state)
            np.random.seed(random_state)

        self._X: Optional[pd.DataFrame] = None
        self._y: Optional[pd.DataFrame] = None
        self._basic_encoding_pipeline: Optional[Pipeline] = None
        self._inferred_dtypes: List[Type] = []
        self.model: object = None
        self._final_pop = None

        self._subscribers = defaultdict(list)
        if isinstance(post_processing_method, EnsemblePostProcessing):
            self._evaluation_library = EvaluationLibrary(
                m=post_processing_method.hyperparameters['max_models'],
                n=post_processing_method.hyperparameters['hillclimb_size'],
            )
        else:
            # Don't keep memory-heavy evaluation meta-data (predictions, estimators)
            self._evaluation_library = EvaluationLibrary(m=0)
        self.evaluation_completed(self._evaluation_library.save_evaluation)

        self._pset, parameter_checks = pset_from_config(config)

        max_start_length = 3 if max_pipeline_length is None else max_pipeline_length
        self._operator_set = OperatorSet(
            mutate=partial(random_valid_mutation_in_place,
                           primitive_set=self._pset,
                           max_length=max_pipeline_length),
            mate=partial(random_crossover, max_length=max_pipeline_length),
            create_from_population=partial(create_from_population,
                                           cxpb=0.2,
                                           mutpb=0.8),
            create_new=partial(create_random_expression,
                               primitive_set=self._pset,
                               max_length=max_start_length),
            compile_=compile_individual,
            eliminate=eliminate_from_pareto,
            evaluate_callback=self._on_evaluation_completed)

    def _np_to_matching_dataframe(self, x: np.ndarray) -> pd.DataFrame:
        """ Format the numpy array to a dataframe whose column types match the training data. """
        if not isinstance(x, np.ndarray):
            raise TypeError(
                f"Expected x to be of type 'numpy.ndarray' not {type(x)}.")

        x = pd.DataFrame(x)
        for i, dtype in enumerate(self._inferred_dtypes):
            x[i] = x[i].astype(dtype)
        return x

    def _prepare_for_prediction(self, x):
        if isinstance(x, np.ndarray):
            x = self._np_to_matching_dataframe(x)
        x = self._basic_encoding_pipeline.transform(x)
        return x

    def _predict(self, x: pd.DataFrame):
        raise NotImplementedError('_predict is implemented by child classes.')

    def predict(self, x: Union[pd.DataFrame, np.ndarray]):
        """ Predict the target for input X.

        Parameters
        ----------
        x: pandas.DataFrame or numpy.ndarray
            A dataframe or array with the same number of columns as the input to `fit`.

        Returns
        -------
        numpy.ndarray
            array with predictions of shape (N,), where N is the length of the first dimension of `x`.
        """
        x = self._prepare_for_prediction(x)
        return self._predict(x)

    def predict_arff(self,
                     arff_file_path: str,
                     target_column: Optional[str] = None,
                     encoding: Optional[str] = None) -> np.ndarray:
        """ Predict the target for input found in the ARFF file.

        Parameters
        ----------
        arff_file_path: str
            An ARFF file with the same columns as the one used in `fit`.
            Target column must be present in file, but its values are ignored (can be '?').
        target_column: str, optional (default=None)
            Specifies which column the model should predict.
            If left None, the last column is taken to be the target.
        encoding: str, optional
            Encoding of the ARFF file.

        Returns
        -------
        numpy.ndarray
            array with predictions for each row in the ARFF file.
        """
        x, _ = X_y_from_arff(arff_file_path,
                             split_column=target_column,
                             encoding=encoding)
        x = self._prepare_for_prediction(x)
        return self._predict(x)
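
    # Hedged usage sketch (hypothetical file name, not from the source):
    # after `fit`, predictions can be made straight from an ARFF file, e.g.
    #   predictions = automl.predict_arff('test.arff', target_column='class')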

    def score(self, x: Union[pd.DataFrame, np.ndarray],
              y: Union[pd.Series, np.ndarray]) -> float:
        """ Calculate the score of the model according to the `scoring` metric and input (x, y).

        Parameters
        ----------
        x: pandas.DataFrame or numpy.ndarray
            Data to predict target values for.
        y: pandas.Series or numpy.ndarray
            True values for the target.

        Returns
        -------
        float
            The score obtained on the given test data according to the `scoring` metric.
        """
        if self._metrics[0].requires_probabilities:
            predictions = self.predict_proba(x)
        else:
            predictions = self.predict(x)
        return self._metrics[0].score(y, predictions)
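
    # Hedged sketch: `score` routes through `predict_proba` when the first
    # metric requires probabilities (e.g. log loss), otherwise through
    # `predict`. E.g., with hypothetical holdout data:
    #   holdout_score = automl.score(X_test, y_test)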

    def score_arff(self,
                   arff_file_path: str,
                   target_column: Optional[str] = None,
                   encoding: Optional[str] = None) -> float:
        """ Calculate the score of the model according to the `scoring` metric and input in the ARFF file.

        Parameters
        ----------
        arff_file_path: str
            An ARFF file with which to calculate the score.
        target_column: str, optional (default=None)
            Specifies which column the model should predict.
            If left None, the last column is taken to be the target.
        encoding: str, optional
            Encoding of the ARFF file.

        Returns
        -------
        float
            The score obtained on the given test data according to the `scoring` metric.
        """
        X, y = X_y_from_arff(arff_file_path,
                             split_column=target_column,
                             encoding=encoding)
        return self.score(X, y)

    def fit_arff(self,
                 arff_file_path: str,
                 target_column: Optional[str] = None,
                 encoding: Optional[str] = None,
                 *args,
                 **kwargs) -> None:
        """ Find and fit a model to predict the target column (last) from other columns.

        Parameters
        ----------
        arff_file_path: str
            Path to an ARFF file containing the training data.
        target_column: str, optional (default=None)
            Specifies which column the model should predict.
            If left None, the last column is taken to be the target.
        encoding: str, optional
            Encoding of the ARFF file.

        """
        X, y = X_y_from_arff(arff_file_path,
                             split_column=target_column,
                             encoding=encoding)
        self.fit(X, y, *args, **kwargs)
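
    # Hedged usage sketch (hypothetical path): train directly from an ARFF
    # file, letting the target default to the last column, e.g.
    #   automl.fit_arff('train.arff')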

    def fit(self,
            x: Union[pd.DataFrame, np.ndarray],
            y: Union[pd.DataFrame, pd.Series, np.ndarray],
            warm_start: bool = False) -> None:
        """ Find and fit a model to predict target y from X.

        Various possible machine learning pipelines will be fit to the (X,y) data.
        Using Genetic Programming, the pipelines chosen should lead to gradually
        better models. Pipelines will internally be validated using cross validation.

        After the search termination condition is met, the best found pipeline
        configuration is then used to train a final model on all provided data.

        Parameters
        ----------
        x: pandas.DataFrame or numpy.ndarray, shape = [n_samples, n_features]
            Training data. All elements must be convertible to float.
        y: pandas.DataFrame, pandas.Series or numpy.ndarray, shape = [n_samples,]
            Target values. If a DataFrame is provided, it is assumed the first column contains target values.
        warm_start: bool (default=False)
            Indicates the optimization should continue using the last individuals of the
            previous `fit` call.
        """

        with self._time_manager.start_activity('preprocessing',
                                               activity_meta=['default']):
            x, self._y = format_x_y(x, y)
            self._inferred_dtypes = x.dtypes
            self._X, self._basic_encoding_pipeline = basic_encoding(x)
            steps = basic_pipeline_extension(self._X)
            self._operator_set._safe_compile = partial(
                compile_individual, preprocessing_steps=steps)

        fit_time = int((1 - self._post_processing.time_fraction) *
                       self._time_manager.total_time_remaining)

        with self._time_manager.start_activity(
                'search',
                time_limit=fit_time,
                activity_meta=[self._search_method.__class__.__name__]):
            self._search_phase(warm_start, timeout=fit_time)

        with self._time_manager.start_activity(
                'postprocess',
                time_limit=int(self._time_manager.total_time_remaining),
                activity_meta=[self._post_processing.__class__.__name__]):
            best_individuals = sorted(self._final_pop,
                                      key=lambda ind: ind.fitness.values,
                                      reverse=True)
            self._post_processing.dynamic_defaults(self)
            self.model = self._post_processing.post_process(
                self._X, self._y, self._time_manager.total_time_remaining,
                best_individuals)
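
    # Note on the time budget: `fit` splits the remaining time between search
    # and post-processing via `time_fraction`. Assuming the default
    # BestFitPostProcessing reserves time_fraction=0.1 (an assumption; check
    # the source), max_total_time=3600 leaves roughly 3240s for search and
    # 360s for post-processing, minus whatever preprocessing consumed.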

    def _search_phase(self, warm_start: bool = False, timeout: float = 1e6):
        """ Invoke the evolutionary algorithm; populate `self._final_pop` regardless of how search terminates. """
        if warm_start and self._final_pop is not None:
            pop = list(self._final_pop)
        else:
            if warm_start:
                log.warning(
                    'Warm-start enabled but no earlier fit. Using a newly generated population instead.'
                )
            pop = [self._operator_set.individual() for _ in range(50)]

        deadline = time.time() + timeout

        evaluate_pipeline = partial(
            gama.genetic_programming.compilers.scikitlearn.evaluate_pipeline,
            X=self._X,
            y_train=self._y,
            metrics=self._metrics)
        self._operator_set.evaluate = partial(
            gama.genetic_programming.compilers.scikitlearn.evaluate_individual,
            evaluate_pipeline=evaluate_pipeline,
            timeout=self._max_eval_time,
            deadline=deadline,
            add_length_to_score=self._regularize_length)

        try:
            with stopit.ThreadingTimeout(timeout):
                self._search_method.dynamic_defaults(self._X, self._y, timeout)
                self._search_method.search(self._operator_set,
                                           start_candidates=pop)
        except KeyboardInterrupt:
            log.info('Search phase terminated because of Keyboard Interrupt.')

        self._final_pop = self._search_method.output
        log.debug([str(i) for i in self._final_pop[:100]])
        log.info(
            f'Search phase evaluated {len(self._evaluation_library.evaluations)} individuals.'
        )

    def export_script(self,
                      file: str = 'gama_pipeline.py',
                      raise_if_exists: bool = False):
        """ Exports a Python script which sets up the best found pipeline. Can only be called after `fit`.

        Example
        -------
        After the AutoML search process has completed (i.e. `fit` has been called), the model which
        has been found by GAMA may be exported to a Python file. The Python file will define the found
        pipeline or ensemble.

        .. code-block:: python

            automl = GamaClassifier()
            automl.fit(X, y)
            automl.export_script('my_pipeline_script.py')

        The resulting script will define a variable `pipeline` or `ensemble`, depending on the post-processing
        method that was used after search.

        Parameters
        ----------
        file: str (default='gama_pipeline.py')
            Desired filename of the exported Python script.
        raise_if_exists: bool (default=False)
            If True, raise an error if the file already exists.
            If False, overwrite `file` if it already exists.
        """
        if self.model is None:
            raise RuntimeError(STR_NO_OPTIMAL_PIPELINE)
        if raise_if_exists and os.path.isfile(file):
            raise FileExistsError(f"File {file} already exists.")

        script_text = self._post_processing.to_code(
            self._basic_encoding_pipeline)
        with open(file, 'w') as fh:
            fh.write(script_text)

    def _safe_outside_call(self, fn):
        """ Calls fn and log any exception it raises without reraising, except for TimeoutException. """
        try:
            fn()
        except stopit.utils.TimeoutException:
            raise
        except Exception:
            # We actually want to catch any other exception here, because the callback code can be
            # arbitrary (it can be provided by users). This excuses the catch-all Exception.
            # Note that KeyboardInterrupt does not derive from Exception and is elevated to the caller.
            log.warning("Exception during callback.", exc_info=True)
        if self._time_manager.current_activity.exceeded_limit:
            log.info(
                "Time exceeded during callback, but exception was swallowed.")
            raise stopit.utils.TimeoutException

    def _on_evaluation_completed(self, evaluation: Evaluation):
        for callback in self._subscribers['evaluation_completed']:
            self._safe_outside_call(partial(callback, evaluation))

    def evaluation_completed(self, callback_function):
        """ Register a callback function that is called when new evaluation is completed.

        Parameters
        ----------
        callback_function:
            Function to call when a pipeline is evaluated, return values are ignored.
            Expected signature is: Evaluation -> Any
        """
        self._subscribers['evaluation_completed'].append(callback_function)
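
A hedged sketch of the callback hook documented above. The names `X` and `y` and the `Evaluation` attributes printed below are assumptions for illustration only.

from gama import GamaClassifier

automl = GamaClassifier(max_total_time=300)

def print_evaluation(evaluation):
    # The Evaluation is assumed to expose the evaluated individual and its
    # score tuple; return values of callbacks are ignored by GAMA.
    print(evaluation.individual, evaluation.score)

automl.evaluation_completed(print_evaluation)
automl.fit(X, y)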