Esempio n. 1
0
    def __init__(
        self,
        scoring: Union[str, Metric, Tuple[Union[str, Metric],
                                          ...]] = 'filled_in_by_child_class',
        regularize_length: bool = True,
        max_pipeline_length: Optional[int] = None,
        config: Optional[Dict] = None,
        random_state: Optional[int] = None,
        max_total_time: int = 3600,
        max_eval_time: Optional[int] = None,
        n_jobs: Optional[int] = None,
        verbosity: int = logging.WARNING,
        keep_analysis_log: Optional[str] = 'gama.log',
        search_method: BaseSearch = AsyncEA(),
        post_processing_method: BasePostProcessing = BestFitPostProcessing()):
        """ Set up logging, validate the resource limits and build the search components.

        Parameters
        ----------
        scoring: str, Metric or Tuple
            Specifies the/all metric(s) to optimize towards. A string will be converted to Metric. A tuple must
            specify each metric with the same type (i.e. all str or all Metric). See :ref:`Metrics` for built-in
            metrics.

        regularize_length: bool
            If True, add pipeline length as an optimization metric (preferring short over long).

        max_pipeline_length: int, optional (default=None)
            If set, limit the maximum number of steps in any evaluated pipeline. Encoding and imputation are excluded.
            If None, newly created random pipelines are still limited to a length of 3.

        config: a dictionary which specifies available components and their valid hyperparameter settings
            For more information, see :ref:`search_space_configuration`.

        random_state:  int, optional (default=None)
            If an integer is passed, this will be the seed for the random number generators used in the process.
            However, with `n_jobs > 1`, there will be randomization introduced by multi-processing.
            For reproducible results, set this and use `n_jobs=1`.

        max_total_time: positive int (default=3600)
            Time in seconds that can be used for the `fit` call.

        max_eval_time: positive int, optional (default=None)
            Time in seconds that can be used to evaluate any one single individual.
            If None, set to 0.1 * max_total_time.
            If larger than `max_total_time`, a warning is logged and it is reduced to `max_total_time`.

        n_jobs: int, optional (default=None)
            The amount of parallel processes that may be created to speed up `fit`.
            Accepted values are positive integers, -1 or None.
            If -1 is specified, multiprocessing.cpu_count() processes are created.
            If None is specified, multiprocessing.cpu_count() // 2 processes are created (but always at least 1).

        verbosity: int (default=logging.WARNING)
            Sets the level of log messages to be automatically output to terminal.

        keep_analysis_log: str, optional (default='gama.log')
            If non-empty str, specifies the path (and name) where the log should be stored, e.g. /output/gama.log.
            If `None`, no log is stored.

        search_method: BaseSearch (default=AsyncEA())
            Search method to use to find good pipelines. Should be instantiated.
            NOTE(review): the default instance is created once, when this method is defined, so it is shared by
            every object constructed without this argument.

        post_processing_method: BasePostProcessing (default=BestFitPostProcessing())
            Post-processing method to create a model after the search phase. Should be instantiated.
            NOTE(review): the default instance is shared in the same way as the default `search_method`.

        Raises
        ------
        ValueError
            If `max_total_time` is None or not greater than zero, if `max_eval_time` is given but not greater
            than zero, or if `n_jobs` is 0 or smaller than -1.
        """
        register_stream_log(verbosity)
        if keep_analysis_log is not None:
            # The file log is registered with the path exactly as it was passed in;
            # only afterwards is the path made absolute, which is the form that gets logged below.
            register_file_log(keep_analysis_log)
            if not os.path.isabs(keep_analysis_log):
                keep_analysis_log = os.path.abspath(keep_analysis_log)

        # `locals()` holds only `self` and the parameters at this point, so this records the call arguments.
        # No other local variable may be introduced above this statement, or it would end up in the log too.
        arguments = ','.join([
            '{}={}'.format(k, v) for (k, v) in locals().items() if k not in [
                'self', 'config', 'gamalog', 'file_handler',
                'stdout_streamhandler'
            ]
        ])
        log.info('Using GAMA version {}.'.format(__version__))
        log.info('{}({})'.format(self.__class__.__name__, arguments))
        log_event(log, TOKENS.INIT, arguments)

        if n_jobs is None:
            # Floor division gives 0 on a single-core machine, which the validation below
            # would reject even though the caller never asked for 0 jobs; use at least 1.
            n_jobs = max(1, multiprocessing.cpu_count() // 2)
            log.debug('n_jobs defaulted to %d', n_jobs)

        if max_total_time is None or max_total_time <= 0:
            raise ValueError(
                f"max_total_time should be integer greater than zero but is {max_total_time}."
            )
        if max_eval_time is not None and max_eval_time <= 0:
            raise ValueError(
                f"max_eval_time should be None or integer greater than zero but is {max_eval_time}."
            )
        if n_jobs < -1 or n_jobs == 0:
            raise ValueError(
                f"n_jobs should be -1 or positive integer but is {n_jobs}.")

        if n_jobs == -1:
            n_jobs = multiprocessing.cpu_count()
        # The attribute is assigned on AsyncEvaluator itself rather than on an instance, so it outlives this
        # object. Assign it on every construction so a value set by an earlier construction cannot carry over.
        AsyncEvaluator.n_jobs = n_jobs

        if max_eval_time is None:
            max_eval_time = 0.1 * max_total_time
        if max_eval_time > max_total_time:
            log.warning(
                f"max_eval_time ({max_eval_time}) > max_total_time ({max_total_time}) is not allowed. "
                f"max_eval_time set to {max_total_time}.")
            max_eval_time = max_total_time

        self._max_eval_time = max_eval_time
        self._time_manager = TimeKeeper(max_total_time)
        self._metrics: Tuple[Metric, ...] = scoring_to_metric(scoring)
        self._regularize_length = regularize_length
        self._search_method: BaseSearch = search_method
        self._post_processing = post_processing_method

        if random_state is not None:
            # Seeds the global generators of both `random` and numpy.
            random.seed(random_state)
            np.random.seed(random_state)

        self._X: Optional[pd.DataFrame] = None
        self._y: Optional[pd.DataFrame] = None
        self._basic_encoding_pipeline: Optional[Pipeline] = None
        self._inferred_dtypes: List[Type] = []
        self.model: object = None
        self._final_pop = None

        self._subscribers = defaultdict(list)
        if isinstance(post_processing_method, EnsemblePostProcessing):
            self._evaluation_library = EvaluationLibrary(
                m=post_processing_method.hyperparameters['max_models'],
                n=post_processing_method.hyperparameters['hillclimb_size'],
            )
        else:
            # Don't keep memory-heavy evaluation meta-data (predictions, estimators)
            self._evaluation_library = EvaluationLibrary(m=0)
        # Register the library's save method so that it is handed every completed evaluation.
        self.evaluation_completed(self._evaluation_library.save_evaluation)

        # The second return value is not used in this method.
        self._pset, parameter_checks = pset_from_config(config)

        # Without an explicit limit, random initial pipelines are created with at most 3 steps,
        # while mutation and crossover receive `max_length=None`.
        max_start_length = 3 if max_pipeline_length is None else max_pipeline_length
        self._operator_set = OperatorSet(
            mutate=partial(random_valid_mutation_in_place,
                           primitive_set=self._pset,
                           max_length=max_pipeline_length),
            mate=partial(random_crossover, max_length=max_pipeline_length),
            create_from_population=partial(create_from_population,
                                           cxpb=0.2,
                                           mutpb=0.8),
            create_new=partial(create_random_expression,
                               primitive_set=self._pset,
                               max_length=max_start_length),
            compile_=compile_individual,
            eliminate=eliminate_from_pareto,
            evaluate_callback=self._on_evaluation_completed)