def __init__(
    self,
    feat_type: typing.Optional[typing.List[str]] = None,
    is_classification: bool = False,
    logger_port: typing.Optional[int] = None,
) -> None:
    self.feat_type = feat_type
    self.is_classification = is_classification
    self.logger_port = logger_port
    if self.logger_port is not None:
        self.logger = get_named_client_logger(
            name='Validation',
            port=self.logger_port,
        )
    else:
        self.logger = logging.getLogger('Validation')
    self.feature_validator = FeatureValidator(feat_type=self.feat_type,
                                              logger=self.logger)
    self.target_validator = TargetValidator(
        is_classification=self.is_classification,
        logger=self.logger)
    self._is_fitted = False
def setup_logger(self, port: int) -> None:
    self._logger = get_named_client_logger(
        name=__name__,
        port=port,
    )
def setup_logger(self, port: int) -> None:
    self.logger = get_named_client_logger(
        name=__name__,
        port=port,
    )
    self.context.setup_logger(port)
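# The constructors below all repeat the same logger-selection pattern: if a
# logging-server port is known, records are sent through a named client
# logger; otherwise they fall back to the standard library logger. A minimal
# sketch of that pattern as a standalone helper is shown here. The helper name
# `make_logger` is hypothetical, and the import path for the logging utilities
# is an assumption based on auto-sklearn's layout; only the call signatures
# visible in the surrounding snippets are relied on.
import logging
from typing import Optional, Union

from autosklearn.util.logging_ import PickableLoggerAdapter, get_named_client_logger


def make_logger(name: str, port: Optional[int]) -> Union[logging.Logger, PickableLoggerAdapter]:
    if port is None:
        # No logging server available (e.g. purely sequential runs): plain logger.
        return logging.getLogger(name)
    # Forward records to the logging server listening on `port`.
    return get_named_client_logger(name=name, port=port)


# Usage mirroring the snippets above, e.g.:
#   self.logger = make_logger('Validation', self.logger_port)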
def __init__(
    self,
    config_space,
    dataset_name,
    backend,
    total_walltime_limit,
    func_eval_time_limit,
    memory_limit,
    metric,
    watcher,
    n_jobs,
    dask_client: dask.distributed.Client,
    port: int,
    start_num_run=1,
    data_memory_limit=None,
    num_metalearning_cfgs=25,
    config_file=None,
    seed=1,
    metadata_directory=None,
    resampling_strategy='holdout',
    resampling_strategy_args=None,
    include=None,
    exclude=None,
    disable_file_output=False,
    smac_scenario_args=None,
    get_smac_object_callback=None,
    scoring_functions=None,
    pynisher_context='spawn',
    ensemble_callback: typing.Optional[EnsembleBuilderManager] = None,
    trials_callback: typing.Optional[IncorporateRunResultCallback] = None,
):
    super(AutoMLSMBO, self).__init__()

    # data related
    self.dataset_name = dataset_name
    self.datamanager = None
    self.metric = metric
    self.task = None
    self.backend = backend
    self.port = port

    # the configuration space
    self.config_space = config_space

    # the number of parallel workers/jobs
    self.n_jobs = n_jobs
    self.dask_client = dask_client

    # Evaluation
    self.resampling_strategy = resampling_strategy
    if resampling_strategy_args is None:
        resampling_strategy_args = {}
    self.resampling_strategy_args = resampling_strategy_args

    # and a bunch of useful limits
    self.worst_possible_result = get_cost_of_crash(self.metric)
    self.total_walltime_limit = int(total_walltime_limit)
    self.func_eval_time_limit = int(func_eval_time_limit)
    self.memory_limit = memory_limit
    self.data_memory_limit = data_memory_limit
    self.watcher = watcher
    self.num_metalearning_cfgs = num_metalearning_cfgs
    self.config_file = config_file
    self.seed = seed
    self.metadata_directory = metadata_directory
    self.start_num_run = start_num_run
    self.include = include
    self.exclude = exclude
    self.disable_file_output = disable_file_output
    self.smac_scenario_args = smac_scenario_args
    self.get_smac_object_callback = get_smac_object_callback
    self.scoring_functions = scoring_functions
    self.pynisher_context = pynisher_context
    self.ensemble_callback = ensemble_callback
    self.trials_callback = trials_callback

    dataset_name_ = "" if dataset_name is None else dataset_name
    logger_name = '%s(%d):%s' % (self.__class__.__name__, self.seed, ":" + dataset_name_)
    if port is None:
        self.logger = logging.getLogger(__name__)
    else:
        self.logger = get_named_client_logger(
            name=logger_name,
            port=self.port,
        )
def run(
    self,
    config: Configuration,
    instance: Optional[str] = None,
    cutoff: Optional[float] = None,
    seed: int = 12345,
    budget: float = 0.0,
    instance_specific: Optional[str] = None,
) -> Tuple[StatusType, float, float, Dict[str, Union[int, float, str, Dict, List, Tuple]]]:

    # Additional information of each of the TAE executions; defined upfront for mypy
    additional_run_info: TYPE_ADDITIONAL_INFO = {}

    context = multiprocessing.get_context(self.pynisher_context)
    preload_modules(context)
    queue = context.Queue()

    if not (instance_specific is None or instance_specific == '0'):
        raise ValueError(instance_specific)
    init_params = {'instance': instance}
    if self.init_params is not None:
        init_params.update(self.init_params)

    if self.port is None:
        logger: Union[logging.Logger, PickableLoggerAdapter] = logging.getLogger("pynisher")
    else:
        logger = get_named_client_logger(
            name="pynisher",
            port=self.port,
        )

    arguments = dict(
        logger=logger,
        wall_time_in_s=cutoff,
        mem_in_mb=self.memory_limit,
        capture_output=True,
        context=context,
    )

    if isinstance(config, int):
        num_run = self.initial_num_run
    else:
        num_run = config.config_id + self.initial_num_run

    obj_kwargs = dict(
        queue=queue,
        config=config,
        backend=self.backend,
        port=self.port,
        metric=self.metric,
        seed=self.autosklearn_seed,
        num_run=num_run,
        scoring_functions=self.scoring_functions,
        output_y_hat_optimization=self.output_y_hat_optimization,
        include=self.include,
        exclude=self.exclude,
        disable_file_output=self.disable_file_output,
        instance=instance,
        init_params=init_params,
        budget=budget,
        budget_type=self.budget_type,
        additional_components=autosklearn.pipeline.components.base._addons,
    )

    if self.resampling_strategy != 'test':
        obj_kwargs['resampling_strategy'] = self.resampling_strategy
        obj_kwargs['resampling_strategy_args'] = self.resampling_strategy_args

    try:
        obj = pynisher.enforce_limits(**arguments)(self.ta)
        obj(**obj_kwargs)
    except Exception as e:
        exception_traceback = traceback.format_exc()
        error_message = repr(e)
        additional_run_info.update({
            'traceback': exception_traceback,
            'error': error_message
        })
        return StatusType.CRASHED, self.worst_possible_result, 0.0, additional_run_info

    if obj.exit_status in (pynisher.TimeoutException, pynisher.MemorylimitException):
        # Even if the pynisher thinks that a timeout or memout occurred,
        # it can be that the target algorithm wrote something into the queue
        # - then we treat it as a successful run
        try:
            info = autosklearn.evaluation.util.read_queue(queue)
            result = info[-1]['loss']
            status = info[-1]['status']
            additional_run_info = info[-1]['additional_run_info']

            if obj.stdout:
                additional_run_info['subprocess_stdout'] = obj.stdout
            if obj.stderr:
                additional_run_info['subprocess_stderr'] = obj.stderr

            if obj.exit_status is pynisher.TimeoutException:
                additional_run_info['info'] = 'Run stopped because of timeout.'
            elif obj.exit_status is pynisher.MemorylimitException:
                additional_run_info['info'] = 'Run stopped because of memout.'

            if status in [StatusType.SUCCESS, StatusType.DONOTADVANCE]:
                cost = result
            else:
                cost = self.worst_possible_result

        except Empty:
            info = None
            if obj.exit_status is pynisher.TimeoutException:
                status = StatusType.TIMEOUT
                additional_run_info = {'error': 'Timeout'}
            elif obj.exit_status is pynisher.MemorylimitException:
                status = StatusType.MEMOUT
                additional_run_info = {
                    "error": "Memout (used more than {} MB).".format(self.memory_limit)
                }
            else:
                raise ValueError(obj.exit_status)
            cost = self.worst_possible_result

    elif obj.exit_status is TAEAbortException:
        info = None
        status = StatusType.ABORT
        cost = self.worst_possible_result
        additional_run_info = {'error': 'Your configuration of '
                                        'auto-sklearn does not work!',
                               'exit_status': _encode_exit_status(obj.exit_status),
                               'subprocess_stdout': obj.stdout,
                               'subprocess_stderr': obj.stderr,
                               }

    else:
        try:
            info = autosklearn.evaluation.util.read_queue(queue)
            result = info[-1]['loss']
            status = info[-1]['status']
            additional_run_info = info[-1]['additional_run_info']

            if obj.exit_status == 0:
                cost = result
            else:
                status = StatusType.CRASHED
                cost = self.worst_possible_result
                additional_run_info['info'] = 'Run treated as crashed ' \
                                              'because the pynisher exit ' \
                                              'status %s is unknown.' % \
                                              str(obj.exit_status)
                additional_run_info['exit_status'] = _encode_exit_status(obj.exit_status)
                additional_run_info['subprocess_stdout'] = obj.stdout
                additional_run_info['subprocess_stderr'] = obj.stderr
        except Empty:
            info = None
            additional_run_info = {
                'error': 'Result queue is empty',
                'exit_status': _encode_exit_status(obj.exit_status),
                'subprocess_stdout': obj.stdout,
                'subprocess_stderr': obj.stderr,
                'exitcode': obj.exitcode
            }
            status = StatusType.CRASHED
            cost = self.worst_possible_result

    if (
        (self.budget_type is None or budget == 0)
        and status == StatusType.DONOTADVANCE
    ):
        status = StatusType.SUCCESS

    if not isinstance(additional_run_info, dict):
        additional_run_info = {'message': additional_run_info}

    if (
        info is not None
        and self.resampling_strategy in ('holdout-iterative-fit', 'cv-iterative-fit')
        and status != StatusType.CRASHED
    ):
        learning_curve = autosklearn.evaluation.util.extract_learning_curve(info)
        learning_curve_runtime = autosklearn.evaluation.util.extract_learning_curve(
            info, 'duration'
        )
        if len(learning_curve) > 1:
            additional_run_info['learning_curve'] = learning_curve
            additional_run_info['learning_curve_runtime'] = learning_curve_runtime

        train_learning_curve = autosklearn.evaluation.util.extract_learning_curve(
            info, 'train_loss'
        )
        if len(train_learning_curve) > 1:
            additional_run_info['train_learning_curve'] = train_learning_curve
            additional_run_info['learning_curve_runtime'] = learning_curve_runtime

        if self._get_validation_loss:
            validation_learning_curve = autosklearn.evaluation.util.extract_learning_curve(
                info, 'validation_loss',
            )
            if len(validation_learning_curve) > 1:
                additional_run_info['validation_learning_curve'] = \
                    validation_learning_curve
                additional_run_info[
                    'learning_curve_runtime'] = learning_curve_runtime

        if self._get_test_loss:
            test_learning_curve = autosklearn.evaluation.util.extract_learning_curve(
                info, 'test_loss',
            )
            if len(test_learning_curve) > 1:
                additional_run_info['test_learning_curve'] = test_learning_curve
                additional_run_info[
                    'learning_curve_runtime'] = learning_curve_runtime

    if isinstance(config, int):
        origin = 'DUMMY'
        config_id = config
    else:
        origin = getattr(config, 'origin', 'UNKNOWN')
        config_id = config.config_id
    additional_run_info['configuration_origin'] = origin

    runtime = float(obj.wall_clock_time)

    autosklearn.evaluation.util.empty_queue(queue)
    self.logger.info("Finished evaluating configuration %d" % config_id)
    return status, cost, runtime, additional_run_info
def __init__(
    self,
    backend: Backend,
    autosklearn_seed: int,
    resampling_strategy: Union[str, BaseCrossValidator, _RepeatedSplits, BaseShuffleSplit],
    metric: Scorer,
    cost_for_crash: float,
    abort_on_first_run_crash: bool,
    port: int,
    pynisher_context: str,
    initial_num_run: int = 1,
    stats: Optional[Stats] = None,
    run_obj: str = 'quality',
    par_factor: int = 1,
    scoring_functions: Optional[List[Scorer]] = None,
    output_y_hat_optimization: bool = True,
    include: Optional[List[str]] = None,
    exclude: Optional[List[str]] = None,
    memory_limit: Optional[int] = None,
    disable_file_output: bool = False,
    init_params: Optional[Dict[str, Any]] = None,
    budget_type: Optional[str] = None,
    ta: Optional[Callable] = None,
    **resampling_strategy_args: Any,
):
    if resampling_strategy == 'holdout':
        eval_function = autosklearn.evaluation.train_evaluator.eval_holdout
    elif resampling_strategy == 'holdout-iterative-fit':
        eval_function = autosklearn.evaluation.train_evaluator.eval_iterative_holdout
    elif resampling_strategy == 'cv-iterative-fit':
        eval_function = autosklearn.evaluation.train_evaluator.eval_iterative_cv
    elif resampling_strategy == 'cv' or isinstance(resampling_strategy, (
            BaseCrossValidator, _RepeatedSplits, BaseShuffleSplit)):
        eval_function = autosklearn.evaluation.train_evaluator.eval_cv
    elif resampling_strategy == 'partial-cv':
        eval_function = autosklearn.evaluation.train_evaluator.eval_partial_cv
    elif resampling_strategy == 'partial-cv-iterative-fit':
        eval_function = autosklearn.evaluation.train_evaluator.eval_partial_cv_iterative
    elif resampling_strategy == 'test':
        eval_function = autosklearn.evaluation.test_evaluator.eval_t
        output_y_hat_optimization = False
    else:
        raise ValueError('Unknown resampling strategy %s' % resampling_strategy)

    self.worst_possible_result = cost_for_crash

    eval_function = functools.partial(
        fit_predict_try_except_decorator,
        ta=eval_function,
        cost_for_crash=self.worst_possible_result,
    )

    super().__init__(
        ta=eval_function,
        stats=stats,
        run_obj=run_obj,
        par_factor=par_factor,
        cost_for_crash=self.worst_possible_result,
        abort_on_first_run_crash=abort_on_first_run_crash,
    )

    self.backend = backend
    self.autosklearn_seed = autosklearn_seed
    self.resampling_strategy = resampling_strategy
    self.initial_num_run = initial_num_run
    self.metric = metric
    self.resampling_strategy = resampling_strategy
    self.resampling_strategy_args = resampling_strategy_args
    self.scoring_functions = scoring_functions
    # TODO deactivate output_y_hat_optimization and let the respective evaluator decide
    self.output_y_hat_optimization = output_y_hat_optimization
    self.include = include
    self.exclude = exclude
    self.disable_file_output = disable_file_output
    self.init_params = init_params
    self.budget_type = budget_type

    if memory_limit is not None:
        memory_limit = int(math.ceil(memory_limit))
    self.memory_limit = memory_limit

    dm = self.backend.load_datamanager()
    if 'X_valid' in dm.data and 'Y_valid' in dm.data:
        self._get_validation_loss = True
    else:
        self._get_validation_loss = False
    if 'X_test' in dm.data and 'Y_test' in dm.data:
        self._get_test_loss = True
    else:
        self._get_test_loss = False

    self.port = port
    self.pynisher_context = pynisher_context
    if self.port is None:
        self.logger: Union[logging.Logger, PickableLoggerAdapter] = logging.getLogger("TAE")
    else:
        self.logger = get_named_client_logger(
            name="TAE",
            port=self.port,
        )
def __init__(
    self,
    backend: Backend,
    queue: multiprocessing.Queue,
    metric: Scorer,
    additional_components: Dict[str, ThirdPartyComponents],
    port: Optional[int],
    configuration: Optional[Union[int, Configuration]] = None,
    scoring_functions: Optional[List[Scorer]] = None,
    seed: int = 1,
    output_y_hat_optimization: bool = True,
    num_run: Optional[int] = None,
    include: Optional[List[str]] = None,
    exclude: Optional[List[str]] = None,
    disable_file_output: Union[bool, List[str]] = False,
    init_params: Optional[Dict[str, Any]] = None,
    budget: Optional[float] = None,
    budget_type: Optional[str] = None,
):
    # Limit the number of threads that numpy uses
    threadpool_limits(limits=1)

    self.starttime = time.time()

    self.configuration = configuration
    self.backend = backend
    self.port = port
    self.queue = queue

    self.datamanager = self.backend.load_datamanager()
    self.include = include
    self.exclude = exclude

    self.X_valid = self.datamanager.data.get('X_valid')
    self.y_valid = self.datamanager.data.get('Y_valid')
    self.X_test = self.datamanager.data.get('X_test')
    self.y_test = self.datamanager.data.get('Y_test')

    self.metric = metric
    self.task_type = self.datamanager.info['task']
    self.seed = seed

    self.output_y_hat_optimization = output_y_hat_optimization
    self.scoring_functions = scoring_functions

    if isinstance(disable_file_output, (bool, list)):
        self.disable_file_output: Union[bool, List[str]] = disable_file_output
    else:
        raise ValueError(
            'disable_file_output should be either a bool or a list')

    if self.task_type in REGRESSION_TASKS:
        if not isinstance(self.configuration, Configuration):
            self.model_class = MyDummyRegressor
        else:
            self.model_class = \
                autosklearn.pipeline.regression.SimpleRegressionPipeline
        self.predict_function = self._predict_regression
    else:
        if not isinstance(self.configuration, Configuration):
            self.model_class = MyDummyClassifier
        else:
            self.model_class = autosklearn.pipeline.classification.SimpleClassificationPipeline
        self.predict_function = self._predict_proba

    self._init_params = {
        'data_preprocessor:feat_type': self.datamanager.feat_type
    }
    if init_params is not None:
        self._init_params.update(init_params)

    if num_run is None:
        num_run = 0
    self.num_run = num_run

    logger_name = '%s(%d):%s' % (self.__class__.__name__.split('.')[-1],
                                 self.seed, self.datamanager.name)
    if self.port is None:
        self.logger = logging.getLogger(__name__)
    else:
        self.logger = get_named_client_logger(
            name=logger_name,
            port=self.port,
        )

    self.Y_optimization: Optional[Union[List, np.ndarray]] = None
    self.Y_actual_train = None

    self.budget = budget
    self.budget_type = budget_type

    # Add 3rd-party components to the list of 3rd-party components in case this wasn't done
    # before (this happens if we run in parallel and the components are only passed to the
    # AbstractEvaluator via the TAE and are not there yet because the worker is in its own
    # process).
    for key in additional_components:
        for component_name, component in additional_components[key].components.items():
            if component_name not in _addons[key].components:
                _addons[key].add_component(component)

    # Keep mypy happy: make sure self.model is always defined
    self.model = self._get_model()
def __init__(
    self,
    backend: Backend,
    queue: multiprocessing.Queue,
    metric: Scorer,
    port: Optional[int],
    configuration: Optional[Union[int, Configuration]] = None,
    scoring_functions: Optional[List[Scorer]] = None,
    seed: int = 1,
    output_y_hat_optimization: bool = True,
    num_run: Optional[int] = None,
    include: Optional[List[str]] = None,
    exclude: Optional[List[str]] = None,
    disable_file_output: Union[bool, List[str]] = False,
    init_params: Optional[Dict[str, Any]] = None,
    budget: Optional[float] = None,
    budget_type: Optional[str] = None,
):
    self.starttime = time.time()

    self.configuration = configuration
    self.backend = backend
    self.port = port
    self.queue = queue

    self.datamanager = self.backend.load_datamanager()
    self.include = include
    self.exclude = exclude

    self.X_valid = self.datamanager.data.get('X_valid')
    self.y_valid = self.datamanager.data.get('Y_valid')
    self.X_test = self.datamanager.data.get('X_test')
    self.y_test = self.datamanager.data.get('Y_test')

    self.metric = metric
    self.task_type = self.datamanager.info['task']
    self.seed = seed

    self.output_y_hat_optimization = output_y_hat_optimization
    self.scoring_functions = scoring_functions

    if isinstance(disable_file_output, (bool, list)):
        self.disable_file_output: Union[bool, List[str]] = disable_file_output
    else:
        raise ValueError(
            'disable_file_output should be either a bool or a list')

    if self.task_type in REGRESSION_TASKS:
        if not isinstance(self.configuration, Configuration):
            self.model_class = MyDummyRegressor
        else:
            self.model_class = \
                autosklearn.pipeline.regression.SimpleRegressionPipeline
        self.predict_function = self._predict_regression
    else:
        if not isinstance(self.configuration, Configuration):
            self.model_class = MyDummyClassifier
        else:
            self.model_class = autosklearn.pipeline.classification.SimpleClassificationPipeline
        self.predict_function = self._predict_proba

    # Build a boolean mask marking which features are categorical
    categorical_mask = []
    for feat in self.datamanager.feat_type:
        if feat.lower() == 'numerical':
            categorical_mask.append(False)
        elif feat.lower() == 'categorical':
            categorical_mask.append(True)
        else:
            raise ValueError(feat)
    if np.sum(categorical_mask) > 0:
        self._init_params = {
            'data_preprocessing:categorical_features': categorical_mask
        }
    else:
        self._init_params = {}
    if init_params is not None:
        self._init_params.update(init_params)

    if num_run is None:
        num_run = 0
    self.num_run = num_run

    logger_name = '%s(%d):%s' % (self.__class__.__name__.split('.')[-1],
                                 self.seed, self.datamanager.name)
    if self.port is None:
        self.logger = logging.getLogger(__name__)
    else:
        self.logger = get_named_client_logger(
            name=logger_name,
            port=self.port,
        )

    self.Y_optimization: Optional[Union[List, np.ndarray]] = None
    self.Y_actual_train = None

    self.budget = budget
    self.budget_type = budget_type

    # Keep mypy happy: make sure self.model is always defined
    self.model = self._get_model()
def __init__(self, backend, autosklearn_seed, resampling_strategy, metric,
             cost_for_crash, abort_on_first_run_crash, port,
             initial_num_run=1, stats=None,
             run_obj='quality', par_factor=1, scoring_functions=None,
             output_y_hat_optimization=True, include=None, exclude=None,
             memory_limit=None, disable_file_output=False, init_params=None,
             budget_type=None, ta=False, pynisher_context='spawn',
             **resampling_strategy_args):
    if resampling_strategy == 'holdout':
        eval_function = autosklearn.evaluation.train_evaluator.eval_holdout
    elif resampling_strategy == 'holdout-iterative-fit':
        eval_function = autosklearn.evaluation.train_evaluator.eval_iterative_holdout
    elif resampling_strategy == 'cv-iterative-fit':
        eval_function = autosklearn.evaluation.train_evaluator.eval_iterative_cv
    elif resampling_strategy == 'cv' or (
        isinstance(resampling_strategy, type) and (
            issubclass(resampling_strategy, BaseCrossValidator) or
            issubclass(resampling_strategy, _RepeatedSplits) or
            issubclass(resampling_strategy, BaseShuffleSplit)
        )
    ):
        eval_function = autosklearn.evaluation.train_evaluator.eval_cv
    elif resampling_strategy == 'partial-cv':
        eval_function = autosklearn.evaluation.train_evaluator.eval_partial_cv
    elif resampling_strategy == 'partial-cv-iterative-fit':
        eval_function = autosklearn.evaluation.train_evaluator.eval_partial_cv_iterative
    elif resampling_strategy == 'test':
        eval_function = autosklearn.evaluation.test_evaluator.eval_t
        output_y_hat_optimization = False
    else:
        raise ValueError('Unknown resampling strategy %s' % resampling_strategy)

    self.worst_possible_result = cost_for_crash

    eval_function = functools.partial(
        fit_predict_try_except_decorator,
        ta=eval_function,
        cost_for_crash=self.worst_possible_result,
    )

    super().__init__(
        ta=eval_function,
        stats=stats,
        run_obj=run_obj,
        par_factor=par_factor,
        cost_for_crash=self.worst_possible_result,
        abort_on_first_run_crash=abort_on_first_run_crash,
    )

    self.backend = backend
    self.autosklearn_seed = autosklearn_seed
    self.resampling_strategy = resampling_strategy
    self.initial_num_run = initial_num_run
    self.metric = metric
    self.resampling_strategy = resampling_strategy
    self.resampling_strategy_args = resampling_strategy_args
    self.scoring_functions = scoring_functions
    # TODO deactivate output_y_hat_optimization and let the respective evaluator decide
    self.output_y_hat_optimization = output_y_hat_optimization
    self.include = include
    self.exclude = exclude
    self.disable_file_output = disable_file_output
    self.init_params = init_params
    self.budget_type = budget_type

    if memory_limit is not None:
        memory_limit = int(math.ceil(memory_limit))
    self.memory_limit = memory_limit

    dm = self.backend.load_datamanager()
    if 'X_valid' in dm.data and 'Y_valid' in dm.data:
        self._get_validation_loss = True
    else:
        self._get_validation_loss = False
    if 'X_test' in dm.data and 'Y_test' in dm.data:
        self._get_test_loss = True
    else:
        self._get_test_loss = False

    self.port = port
    self.pynisher_context = pynisher_context
    if self.port is None:
        self.logger = logging.getLogger("TAE")
    else:
        self.logger = get_named_client_logger(
            name="TAE",
            port=self.port,
        )