def __init__(
        self,
        scoring: Union[str, Metric, Tuple[Union[str, Metric], ...]] = 'filled_in_by_child_class',
        regularize_length: bool = True,
        max_pipeline_length: Optional[int] = None,
        config: Optional[Dict] = None,
        random_state: Optional[int] = None,
        max_total_time: int = 3600,
        max_eval_time: Optional[int] = None,
        n_jobs: Optional[int] = None,
        verbosity: int = logging.WARNING,
        keep_analysis_log: Optional[str] = 'gama.log',
        search_method: BaseSearch = AsyncEA(),
        post_processing_method: BasePostProcessing = BestFitPostProcessing()):
    """ Configure logging, validate the settings and set up the search machinery.

    Parameters
    ----------
    scoring: str, Metric or Tuple
        Specifies the/all metric(s) to optimize towards.
        A string will be converted to Metric.
        A tuple must specify each metric with the same type (i.e. all str or all Metric).
        See :ref:`Metrics` for built-in metrics.
    regularize_length: bool
        If True, add pipeline length as an optimization metric (preferring short over long).
    max_pipeline_length: int, optional (default=None)
        If set, limit the maximum number of steps in any evaluated pipeline.
        Encoding and imputation are excluded.
    config: dict, optional (default=None)
        A dictionary which specifies available components and their valid
        hyperparameter settings.
        For more information, see :ref:`search_space_configuration`.
    random_state: int, optional (default=None)
        If an integer is passed, this will be the seed for the random number
        generators used in the process.
        However, with `n_jobs > 1`, there will be randomization introduced by
        multi-processing. For reproducible results, set this and use `n_jobs=1`.
    max_total_time: positive int (default=3600)
        Time in seconds that can be used for the `fit` call.
    max_eval_time: positive int, optional (default=None)
        Time in seconds that can be used to evaluate any one single individual.
        If None, set to 0.1 * max_total_time.
        If larger than `max_total_time`, a warning is logged and
        `max_total_time` is used instead.
    n_jobs: int, optional (default=None)
        The amount of parallel processes that may be created to speed up `fit`.
        Accepted values are positive integers, -1 or None.
        If -1 is specified, multiprocessing.cpu_count() processes are created.
        If None is specified, multiprocessing.cpu_count() // 2 processes are
        created, but always at least one.
    verbosity: int (default=logging.WARNING)
        Sets the level of log messages to be automatically output to terminal.
    keep_analysis_log: str, optional (default='gama.log')
        If non-empty str, specifies the path (and name) where the log should be
        stored, e.g. /output/gama.log.
        If `None`, no log is stored.
    search_method: BaseSearch (default=AsyncEA())
        Search method to use to find good pipelines. Should be instantiated.
    post_processing_method: BasePostProcessing (default=BestFitPostProcessing())
        Post-processing method to create a model after the search phase.
        Should be instantiated.

    Raises
    ------
    ValueError
        If `max_total_time` is None or not greater than zero,
        if `max_eval_time` is given but not greater than zero,
        or if `n_jobs` is zero or smaller than -1.
    """
    # NOTE(review): the default `search_method` and `post_processing_method`
    # instances are created once, when this function is defined, so every
    # object that relies on the defaults shares the same two instances.
    # Pass explicit instances if that sharing is a problem.
    register_stream_log(verbosity)
    if keep_analysis_log is not None:
        # The file log is registered with the path exactly as given; only
        # afterwards is the path made absolute, so the absolute form is what
        # ends up in the logged argument summary below.
        register_file_log(keep_analysis_log)
        if not os.path.isabs(keep_analysis_log):
            keep_analysis_log = os.path.abspath(keep_analysis_log)

    # `locals()` must be evaluated before any additional local variable is
    # introduced, otherwise that variable would leak into the summary.
    arguments = ','.join([
        '{}={}'.format(k, v) for (k, v) in locals().items()
        if k not in ['self', 'config', 'gamalog', 'file_handler', 'stdout_streamhandler']
    ])
    log.info('Using GAMA version {}.'.format(__version__))
    log.info('{}({})'.format(self.__class__.__name__, arguments))
    log_event(log, TOKENS.INIT, arguments)

    if n_jobs is None:
        # On a single-CPU machine `cpu_count() // 2` is 0, which the
        # validation below rejects; never default to fewer than one process.
        n_jobs = max(1, multiprocessing.cpu_count() // 2)
        log.debug('n_jobs defaulted to %d', n_jobs)

    if max_total_time is None or max_total_time <= 0:
        raise ValueError(
            f"max_total_time should be integer greater than zero but is {max_total_time}."
        )
    if max_eval_time is not None and max_eval_time <= 0:
        raise ValueError(
            f"max_eval_time should be None or integer greater than zero but is {max_eval_time}."
        )
    if n_jobs < -1 or n_jobs == 0:
        raise ValueError(
            f"n_jobs should be -1 or positive integer but is {n_jobs}.")
    elif n_jobs != -1:
        # AsyncExecutor defaults to using multiprocessing.cpu_count(), i.e. n_jobs=-1
        AsyncEvaluator.n_jobs = n_jobs

    if max_eval_time is None:
        max_eval_time = 0.1 * max_total_time
    if max_eval_time > max_total_time:
        log.warning(
            f"max_eval_time ({max_eval_time}) > max_total_time ({max_total_time}) is not allowed. "
            f"max_eval_time set to {max_total_time}.")
        max_eval_time = max_total_time

    self._max_eval_time = max_eval_time
    self._time_manager = TimeKeeper(max_total_time)
    self._metrics: Tuple[Metric, ...] = scoring_to_metric(scoring)
    self._regularize_length = regularize_length
    self._search_method: BaseSearch = search_method
    self._post_processing = post_processing_method

    if random_state is not None:
        # Seeds the process-wide `random` and NumPy generators.
        random.seed(random_state)
        np.random.seed(random_state)

    self._X: Optional[pd.DataFrame] = None
    self._y: Optional[pd.DataFrame] = None
    self._basic_encoding_pipeline: Optional[Pipeline] = None
    self._inferred_dtypes: List[Type] = []
    self.model: object = None
    self._final_pop = None

    self._subscribers = defaultdict(list)
    if isinstance(post_processing_method, EnsemblePostProcessing):
        self._evaluation_library = EvaluationLibrary(
            m=post_processing_method.hyperparameters['max_models'],
            n=post_processing_method.hyperparameters['hillclimb_size'],
        )
    else:
        # Don't keep memory-heavy evaluation meta-data (predictions, estimators)
        self._evaluation_library = EvaluationLibrary(m=0)
    self.evaluation_completed(self._evaluation_library.save_evaluation)

    self._pset, parameter_checks = pset_from_config(config)

    # Without an explicit limit, newly created pipelines start with length <= 3.
    max_start_length = 3 if max_pipeline_length is None else max_pipeline_length
    self._operator_set = OperatorSet(
        mutate=partial(random_valid_mutation_in_place,
                       primitive_set=self._pset,
                       max_length=max_pipeline_length),
        mate=partial(random_crossover, max_length=max_pipeline_length),
        create_from_population=partial(create_from_population, cxpb=0.2, mutpb=0.8),
        create_new=partial(create_random_expression,
                           primitive_set=self._pset,
                           max_length=max_start_length),
        compile_=compile_individual,
        eliminate=eliminate_from_pareto,
        evaluate_callback=self._on_evaluation_completed)