def test_evaluation_library_n_best(GNB):
    """ Test `n_best` normal usage. """
    lib = EvaluationLibrary(m=None, sample=None, cache=_short_name())
    try:
        best_evaluation = _mock_evaluation(GNB, score=(1.0, 1.0, 1.0))
        worst_evaluation = _mock_evaluation(GNB, score=(0.0, 0.0, 0.0))
        lib.save_evaluation(best_evaluation)
        lib.save_evaluation(worst_evaluation)
        for _ in range(10):
            lib.save_evaluation(_mock_evaluation(GNB))

        assert (
            len(lib.n_best(10)) == 10
        ), "n_best(10) should return 10 results as more than 10 evaluations are saved."
        assert (
            best_evaluation is lib.n_best(10)[0]
        ), "`best_evaluation` should be number one in `n_best`"
        assert worst_evaluation not in lib.n_best(
            10
        ), "`worst_evaluation` should not be in the top 10 of 12 evaluations."
        assert (
            len(lib.n_best(100)) == 12
        ), "`n > len(lib.top_evaluations)` should return all evaluations."
    finally:
        lib.clear_cache()
def test_evaluation_library_max_number_evaluations(GNB):
    """ `max_number_of_evaluations` restricts the size of `top_evaluations`. """
    lib200 = EvaluationLibrary(m=200, sample=None, cache=_short_name())
    lib_unlimited = EvaluationLibrary(m=None, sample=None, cache=_short_name())
    try:
        worst_evaluation = _mock_evaluation(GNB, score=(0.0, 0.0, 0.0))
        lib200.save_evaluation(worst_evaluation)
        lib_unlimited.save_evaluation(worst_evaluation)

        for _ in range(200):
            lib200.save_evaluation(_mock_evaluation(GNB))
            lib_unlimited.save_evaluation(_mock_evaluation(GNB))

        assert 200 == len(
            lib200.top_evaluations
        ), "After 201 evaluations, lib200 should be at its cap of 200."
        assert (
            worst_evaluation not in lib200.top_evaluations
        ), "The worst of 201 evaluations should not be present."
        assert 201 == len(
            lib_unlimited.top_evaluations
        ), "All evaluations should be present."
        assert (
            worst_evaluation in lib_unlimited.top_evaluations
        ), "All evaluations should be present."
    finally:
        lib200.clear_cache()
        lib_unlimited.clear_cache()
def _test_subsample(sample, predictions, subsample, individual):
    """ Test that `predictions` correctly get sampled to `subsample`. """
    lib = EvaluationLibrary(sample=sample, cache=_short_name())
    try:
        best_evaluation = _mock_evaluation(individual, predictions=predictions)
        lib.save_evaluation(best_evaluation)

        assert (
            subsample.shape == best_evaluation.predictions.shape
        ), "Subsample does not have expected shape."
        assert np.array_equal(
            subsample, best_evaluation.predictions
        ), "Content of subsample differs from expected."
    finally:
        lib.clear_cache()
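# Usage sketch (assumption, not taken from the test suite): `_test_subsample` is
# meant to be driven by concrete tests that pass matching `sample`/`subsample`
# pairs. The snippet below only illustrates the intended call pattern; it assumes
# that `sample` is an array of row indices kept by `EvaluationLibrary` and reuses
# the `GNB` fixture from the tests above. Names and values are illustrative.
#
#     def test_evaluation_library_subsample_sketch(GNB):
#         predictions = np.arange(100, dtype=float).reshape(50, 2)
#         keep = np.asarray([0, 1, 2])  # hypothetical indices retained by the library
#         _test_subsample(
#             sample=keep,
#             predictions=predictions,
#             subsample=predictions[keep, :],
#             individual=GNB,
#         )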
class Gama(ABC):
    """ Wrapper for the toolbox logic surrounding executing the AutoML pipeline. """

    def __init__(
        self,
        scoring: Union[str, Metric, Iterable[str], Iterable[Metric]] = "filled_in_by_child_class",
        regularize_length: bool = True,
        max_pipeline_length: Optional[int] = None,
        config: Dict = None,
        random_state: Optional[int] = None,
        max_total_time: int = 3600,
        max_eval_time: Optional[int] = None,
        n_jobs: Optional[int] = None,
        verbosity: int = logging.WARNING,
        keep_analysis_log: Optional[str] = "gama.log",
        search_method: BaseSearch = AsyncEA(),
        post_processing_method: BasePostProcessing = BestFitPostProcessing(),
        cache: Optional[str] = None,
    ):
        """
        Parameters
        ----------
        scoring: str, Metric or Tuple
            Specifies the/all metric(s) to optimize towards.
            A string will be converted to Metric.
            A tuple must specify each metric with the same type (e.g. all str).
            See :ref:`Metrics` for built-in metrics.

        regularize_length: bool (default=True)
            If True, add pipeline length as an optimization metric.
            Short pipelines should then be preferred over long ones.

        max_pipeline_length: int, optional (default=None)
            If set, limit the maximum number of steps in any evaluated pipeline.
            Encoding and imputation are excluded.

        config: Dict
            Specifies available components and their valid hyperparameter settings.
            For more information, see :ref:`search_space_configuration`.

        random_state: int, optional (default=None)
            Seed for the random number generators used in the process.
            However, with `n_jobs > 1`,
            there will be randomization introduced by multi-processing.
            For reproducible results, set this and use `n_jobs=1`.

        max_total_time: positive int (default=3600)
            Time in seconds that can be used for the `fit` call.

        max_eval_time: positive int, optional (default=None)
            Time in seconds that can be used to evaluate any one single individual.
            If None, set to 0.1 * max_total_time.

        n_jobs: int, optional (default=None)
            The amount of parallel processes that may be created to speed up `fit`.
            Accepted values are positive integers, -1 or None.
            If -1 is specified, multiprocessing.cpu_count() processes are created.
            If None is specified, multiprocessing.cpu_count() / 2 processes are created.

        verbosity: int (default=logging.WARNING)
            Sets the level of log messages to be automatically output to terminal.

        keep_analysis_log: str, optional (default='gama.log')
            If set to a non-empty str, specifies the filepath where the log is stored.
            If `None`, no log is stored.

        search_method: BaseSearch (default=AsyncEA())
            Search method to use to find good pipelines. Should be instantiated.

        post_processing_method: BasePostProcessing (default=BestFitPostProcessing())
            Post-processing method to create a model after the search phase.
            Should be an instantiated subclass of BasePostProcessing.

        cache: str, optional (default=None)
            Directory to use to save intermediate results during search.
            If set to None, generate a unique cache name.
""" register_stream_log(verbosity) if keep_analysis_log is not None: register_file_log(keep_analysis_log) if keep_analysis_log is not None and not os.path.isabs( keep_analysis_log): keep_analysis_log = os.path.abspath(keep_analysis_log) arguments = ",".join([ f"{k}={v}" for (k, v) in locals().items() if k not in ["self", "config"] ]) log.info(f"Using GAMA version {__version__}.") log.info(f"{self.__class__.__name__}({arguments})") log_event(log, TOKENS.INIT, arguments) if n_jobs is None: n_jobs = multiprocessing.cpu_count() // 2 log.debug("n_jobs defaulted to %d", n_jobs) if max_total_time is None or max_total_time <= 0: raise ValueError( f"Expect positive int for max_total_time, got {max_total_time}." ) if max_eval_time is not None and max_eval_time <= 0: raise ValueError( f"Expect None or positive int for max_eval_time, got {max_eval_time}." ) if n_jobs < -1 or n_jobs == 0: raise ValueError( f"n_jobs should be -1 or positive int but is {n_jobs}.") AsyncEvaluator.n_jobs = n_jobs if max_eval_time is None: max_eval_time = round(0.1 * max_total_time) if max_eval_time > max_total_time: log.warning( f"max_eval_time ({max_eval_time}) > max_total_time ({max_total_time}) " f"is not allowed. max_eval_time set to {max_total_time}.") max_eval_time = max_total_time self._max_eval_time = max_eval_time self._time_manager = TimeKeeper(max_total_time) self._metrics: Tuple[Metric, ...] = scoring_to_metric(scoring) self._regularize_length = regularize_length self._search_method: BaseSearch = search_method self._post_processing = post_processing_method if random_state is not None: random.seed(random_state) np.random.seed(random_state) self._x: Optional[pd.DataFrame] = None self._y: Optional[pd.DataFrame] = None self._basic_encoding_pipeline: Optional[Pipeline] = None self._fixed_pipeline_extension: List[Tuple[str, TransformerMixin]] = [] self._inferred_dtypes: List[Type] = [] self.model: object = None self._final_pop: List[Individual] = [] self._subscribers: Dict[str, List[Callable]] = defaultdict(list) if not cache: cache = f"cache_{str(uuid.uuid4())}" if isinstance(post_processing_method, EnsemblePostProcessing): self._evaluation_library = EvaluationLibrary( m=post_processing_method.hyperparameters["max_models"], n=post_processing_method.hyperparameters["hillclimb_size"], cache_directory=cache, ) else: # Don't keep memory-heavy evaluation meta-data (predictions, estimators) self._evaluation_library = EvaluationLibrary(m=0, cache_directory=cache) self.evaluation_completed(self._evaluation_library.save_evaluation) self._pset, parameter_checks = pset_from_config(config) max_start_length = 3 if max_pipeline_length is None else max_pipeline_length self._operator_set = OperatorSet( mutate=partial( random_valid_mutation_in_place, primitive_set=self._pset, max_length=max_pipeline_length, ), mate=partial(random_crossover, max_length=max_pipeline_length), create_from_population=partial(create_from_population, cxpb=0.2, mutpb=0.8), create_new=partial( create_random_expression, primitive_set=self._pset, max_length=max_start_length, ), compile_=compile_individual, eliminate=eliminate_from_pareto, evaluate_callback=self._on_evaluation_completed, completed_evaluations=self._evaluation_library.lookup, ) def _np_to_matching_dataframe(self, x: np.ndarray) -> pd.DataFrame: """ Format np array to dataframe whose column types match the training data. 
""" if not isinstance(x, np.ndarray): raise TypeError( f"Expected x to be of type 'numpy.ndarray' not {type(x)}.") x = pd.DataFrame(x) for i, dtype in enumerate(self._inferred_dtypes): x[i] = x[i].astype(dtype) return x def _prepare_for_prediction(self, x): if isinstance(x, np.ndarray): x = self._np_to_matching_dataframe(x) x = self._basic_encoding_pipeline.transform(x) return x def _predict(self, x: pd.DataFrame): raise NotImplementedError("_predict is implemented by base classes.") def predict(self, x: Union[pd.DataFrame, np.ndarray]): """ Predict the target for input X. Parameters ---------- x: pandas.DataFrame or numpy.ndarray A dataframe or array with the same number of columns as the input to `fit`. Returns ------- numpy.ndarray array with predictions of shape (N,) where N is len(x) """ x = self._prepare_for_prediction(x) return self._predict(x) def predict_arff( self, arff_file_path: str, target_column: Optional[str] = None, encoding: Optional[str] = None, ) -> np.ndarray: """ Predict the target for input found in the ARFF file. Parameters ---------- arff_file_path: str An ARFF file with the same columns as the one that used in fit. Target column must be present in file, but its values are ignored. target_column: str, optional (default=None) Specifies which column the model should predict. If left None, the last column is taken to be the target. encoding: str, optional Encoding of the ARFF file. Returns ------- numpy.ndarray array with predictions for each row in the ARFF file. """ x, _ = X_y_from_arff(arff_file_path, split_column=target_column, encoding=encoding) x = self._prepare_for_prediction(x) return self._predict(x) def score(self, x: Union[pd.DataFrame, np.ndarray], y: Union[pd.Series, np.ndarray]) -> float: """ Calculate `self.scoring` metric of the model on (x, y). Parameters ---------- x: pandas.DataFrame or numpy.ndarray Data to predict target values for. y: pandas.Series or numpy.ndarray True values for the target. Returns ------- float The score obtained on the given test data according to the `scoring` metric. """ predictions = ( self.predict_proba(x) # type: ignore if self._metrics[0].requires_probabilities else self.predict(x)) return self._metrics[0].score(y, predictions) def score_arff( self, arff_file_path: str, target_column: Optional[str] = None, encoding: Optional[str] = None, ) -> float: """ Calculate `self.scoring` metric of the model on data in the file. Parameters ---------- arff_file_path: str An ARFF file with which to calculate the score. target_column: str, optional (default=None) Specifies which column the model should predict. If left None, the last column is taken to be the target. encoding: str, optional Encoding of the ARFF file. Returns ------- float The score obtained on the given test data according to the `scoring` metric. """ x, y = X_y_from_arff(arff_file_path, split_column=target_column, encoding=encoding) return self.score(x, y) def fit_arff( self, arff_file_path: str, target_column: Optional[str] = None, encoding: Optional[str] = None, *args, **kwargs, ) -> None: """ Find and fit a model to predict the target column (last) from other columns. Parameters ---------- arff_file_path: str Path to an ARFF file containing the training data. target_column: str, optional (default=None) Specifies which column the model should predict. If left None, the last column is taken to be the target. encoding: str, optional Encoding of the ARFF file. 
""" x, y = X_y_from_arff(arff_file_path, split_column=target_column, encoding=encoding) self.fit(x, y, *args, **kwargs) def fit( self, x: Union[pd.DataFrame, np.ndarray], y: Union[pd.DataFrame, pd.Series, np.ndarray], warm_start: bool = False, ) -> None: """ Find and fit a model to predict target y from X. Various possible machine learning pipelines will be fit to the (X,y) data. Using Genetic Programming, the pipelines chosen should lead to gradually better models. Pipelines will internally be validated using cross validation. After the search termination condition is met, the best found pipeline configuration is then used to train a final model on all provided data. Parameters ---------- x: pandas.DataFrame or numpy.ndarray, shape = [n_samples, n_features] Training data. All elements must be able to be converted to float. y: pandas.DataFrame, pandas.Series or numpy.ndarray, shape = [n_samples,] Target values. If a DataFrame is provided, assumes the first column contains target values. warm_start: bool (default=False) Indicates the optimization should continue using the last individuals of the previous `fit` call. """ with self._time_manager.start_activity("preprocessing", activity_meta=["default"]): x, self._y = format_x_y(x, y) self._inferred_dtypes = x.dtypes self._x, self._basic_encoding_pipeline = basic_encoding(x) self._fixed_pipeline_extension = basic_pipeline_extension(self._x) self._operator_set._safe_compile = partial( compile_individual, preprocessing_steps=self._fixed_pipeline_extension) store_pipelines = (self._evaluation_library._m is None or self._evaluation_library._m > 0) if store_pipelines and self._x.shape[0] * self._x.shape[ 1] > 6_000_000: # if m > 0, we are storing models for each evaluation. For this size # KNN will create models of about 76Mb in size, which is too big, so # we exclude it from search: log.info( "Excluding KNN from search because the dataset is too big." ) from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor self._pset["prediction"] = [ p for p in self._pset["prediction"] if p.identifier not in [KNeighborsClassifier, KNeighborsRegressor] ] if store_pipelines and self._x.shape[1] > 50: log.info( "Data has too many features to include PolynomialFeatures") from sklearn.preprocessing import PolynomialFeatures self._pset["data"] = [ p for p in self._pset["data"] if p.identifier not in [PolynomialFeatures] ] fit_time = int((1 - self._post_processing.time_fraction) * self._time_manager.total_time_remaining) with self._time_manager.start_activity( "search", time_limit=fit_time, activity_meta=[self._search_method.__class__.__name__], ): self._search_phase(warm_start, timeout=fit_time) with self._time_manager.start_activity( "postprocess", time_limit=int(self._time_manager.total_time_remaining), activity_meta=[self._post_processing.__class__.__name__], ): best_individuals = list( reversed( sorted( self._final_pop, key=lambda ind: cast(Fitness, ind.fitness).values, ))) self._post_processing.dynamic_defaults(self) self.model = self._post_processing.post_process( self._x, self._y, self._time_manager.total_time_remaining, best_individuals, ) self._evaluation_library.clear_cache() def _search_phase(self, warm_start: bool = False, timeout: float = 1e6): """ Invoke the search algorithm, populate `final_pop`. """ if warm_start and not self._final_pop: pop = [ind for ind in self._final_pop] else: if warm_start: log.warning( "Warm-start True but no earlier fit. Using new population." 
                )
            pop = [self._operator_set.individual() for _ in range(50)]

        deadline = time.time() + timeout
        evaluate_pipeline = partial(
            gama.genetic_programming.compilers.scikitlearn.evaluate_pipeline,
            x=self._x,
            y_train=self._y,
            metrics=self._metrics,
        )
        AsyncEvaluator.defaults = dict(evaluate_pipeline=evaluate_pipeline)
        self._operator_set.evaluate = partial(
            gama.genetic_programming.compilers.scikitlearn.evaluate_individual,
            # evaluate_pipeline=evaluate_pipeline,
            timeout=self._max_eval_time,
            deadline=deadline,
            add_length_to_score=self._regularize_length,
        )

        try:
            with stopit.ThreadingTimeout(timeout):
                self._search_method.dynamic_defaults(self._x, self._y, timeout)
                self._search_method.search(self._operator_set, start_candidates=pop)
        except KeyboardInterrupt:
            log.info("Search phase terminated because of Keyboard Interrupt.")

        self._final_pop = self._search_method.output
        log.debug([str(i) for i in self._final_pop[:100]])
        n_evaluations = len(self._evaluation_library.evaluations)
        log.info(f"Search phase evaluated {n_evaluations} individuals.")

    def export_script(
        self, file: Optional[str] = "gama_pipeline.py", raise_if_exists: bool = False
    ):
        """ Export a Python script which sets up the best found pipeline.

        Can only be called after `fit`.

        Example
        -------
        After the AutoML search process has completed (i.e. `fit` has been called),
        the model which has been found by GAMA may be exported to a Python file.
        The Python file will define the found pipeline or ensemble.

        .. code-block:: python

            automl = GamaClassifier()
            automl.fit(X, y)
            automl.export_script('my_pipeline_script.py')

        The resulting script will define a variable `pipeline` or `ensemble`,
        depending on the post-processing method that was used after search.

        Parameters
        ----------
        file: str, optional (default='gama_pipeline.py')
            Desired filename of the exported Python script.
            If None, return the code as str instead; it will not be formatted(!).
        raise_if_exists: bool (default=False)
            If True, raise an error if the file already exists.
            If False, overwrite `file` if it already exists.
        """
        if self.model is None:
            raise RuntimeError(STR_NO_OPTIMAL_PIPELINE)
        if raise_if_exists and file is not None and os.path.isfile(file):
            raise FileExistsError(f"File {file} already exists.")

        if self._basic_encoding_pipeline is not None:
            script_text = self._post_processing.to_code(
                self._basic_encoding_pipeline.steps + self._fixed_pipeline_extension
            )
        else:
            script_text = self._post_processing.to_code(self._fixed_pipeline_extension)

        if file:
            with open(file, "w") as fh:
                fh.write(script_text)
            subprocess.call(["black", file])
        else:
            return script_text

    def _safe_outside_call(self, fn):
        """ Call `fn`, logging and ignoring all exceptions except TimeoutException. """
        try:
            fn()
        except stopit.utils.TimeoutException:
            raise
        except Exception:
            # We actually want to catch any other exception here,
            # because the callback code can be arbitrary (it can be provided by users).
            # This excuses the catch-all Exception.
            # Note KeyboardInterrupts are not exceptions and get elevated to the caller.
log.warning("Exception during callback.", exc_info=True) if self._time_manager.current_activity.exceeded_limit: log.info( "Time exceeded during callback, but exception was swallowed.") raise stopit.utils.TimeoutException def _on_evaluation_completed(self, evaluation: Evaluation): for callback in self._subscribers["evaluation_completed"]: self._safe_outside_call(partial(callback, evaluation)) def evaluation_completed(self, callback: Callable[[Evaluation], Any]) -> None: """ Register a callback function that is called when an evaluation is completed. Parameters ---------- callback: Callable[[Evaluation], Any] Function to call when a pipeline is evaluated, return values are ignored. """ self._subscribers["evaluation_completed"].append(callback)