Example #1
    def __init__(
        self,
        feat_type: typing.Optional[typing.List[str]] = None,
        is_classification: bool = False,
        logger_port: typing.Optional[int] = None,
    ) -> None:
        self.feat_type = feat_type
        self.is_classification = is_classification
        self.logger_port = logger_port
        if self.logger_port is not None:
            self.logger = get_named_client_logger(
                name='Validation',
                port=self.logger_port,
            )
        else:
            self.logger = logging.getLogger('Validation')

        self.feature_validator = FeatureValidator(feat_type=self.feat_type,
                                                  logger=self.logger)
        self.target_validator = TargetValidator(
            is_classification=self.is_classification, logger=self.logger)
        self._is_fitted = False
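The constructor above wires one shared logger into both a feature and a target validator. A minimal usage sketch, assuming this __init__ belongs to auto-sklearn's InputValidator (the class name is an assumption inferred from the 'Validation' logger name):

# Hypothetical usage of the assumed InputValidator class.
validator = InputValidator(
    feat_type=['numerical', 'categorical'],
    is_classification=True,
    logger_port=None,  # no logging server: falls back to logging.getLogger
)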
Example #2
    def setup_logger(self, port: int) -> None:
        self._logger = get_named_client_logger(
            name=__name__,
            port=port,
        )
Example #3
    def setup_logger(self, port: int) -> None:
        self.logger = get_named_client_logger(
            name=__name__,
            port=port,
        )
        self.context.setup_logger(port)
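Examples #1 to #3 all follow the same idiom: if a logging-server port is available, attach to it with get_named_client_logger; otherwise fall back to a plain in-process logger. A self-contained sketch of that idiom, assuming auto-sklearn's autosklearn.util.logging_ module (make_logger and the 'MyComponent' name are hypothetical):

import logging
import typing

from autosklearn.util.logging_ import PickableLoggerAdapter, get_named_client_logger


def make_logger(
    port: typing.Optional[int],
    name: str = 'MyComponent',  # placeholder logger name
) -> typing.Union[logging.Logger, PickableLoggerAdapter]:
    # With a port, records are forwarded to the central logging server;
    # without one, an ordinary local logger is returned instead.
    if port is not None:
        return get_named_client_logger(name=name, port=port)
    return logging.getLogger(name)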
Example #4
    def __init__(
            self,
            config_space,
            dataset_name,
            backend,
            total_walltime_limit,
            func_eval_time_limit,
            memory_limit,
            metric,
            watcher,
            n_jobs,
            dask_client: dask.distributed.Client,
            port: int,
            start_num_run=1,
            data_memory_limit=None,
            num_metalearning_cfgs=25,
            config_file=None,
            seed=1,
            metadata_directory=None,
            resampling_strategy='holdout',
            resampling_strategy_args=None,
            include=None,
            exclude=None,
            disable_file_output=False,
            smac_scenario_args=None,
            get_smac_object_callback=None,
            scoring_functions=None,
            pynisher_context='spawn',
            ensemble_callback: typing.Optional[EnsembleBuilderManager] = None,
            trials_callback: typing.Optional[IncorporateRunResultCallback] = None):
        super(AutoMLSMBO, self).__init__()
        # data related
        self.dataset_name = dataset_name
        self.datamanager = None
        self.metric = metric
        self.task = None
        self.backend = backend
        self.port = port

        # the configuration space
        self.config_space = config_space

        # the number of parallel workers/jobs
        self.n_jobs = n_jobs
        self.dask_client = dask_client

        # Evaluation
        self.resampling_strategy = resampling_strategy
        if resampling_strategy_args is None:
            resampling_strategy_args = {}
        self.resampling_strategy_args = resampling_strategy_args

        # and a bunch of useful limits
        self.worst_possible_result = get_cost_of_crash(self.metric)
        self.total_walltime_limit = int(total_walltime_limit)
        self.func_eval_time_limit = int(func_eval_time_limit)
        self.memory_limit = memory_limit
        self.data_memory_limit = data_memory_limit
        self.watcher = watcher
        self.num_metalearning_cfgs = num_metalearning_cfgs
        self.config_file = config_file
        self.seed = seed
        self.metadata_directory = metadata_directory
        self.start_num_run = start_num_run
        self.include = include
        self.exclude = exclude
        self.disable_file_output = disable_file_output
        self.smac_scenario_args = smac_scenario_args
        self.get_smac_object_callback = get_smac_object_callback
        self.scoring_functions = scoring_functions

        self.pynisher_context = pynisher_context

        self.ensemble_callback = ensemble_callback
        self.trials_callback = trials_callback

        dataset_name_ = "" if dataset_name is None else dataset_name
        logger_name = '%s(%d):%s' % (self.__class__.__name__, self.seed,
                                     ":" + dataset_name_)
        if port is None:
            self.logger = logging.getLogger(__name__)
        else:
            self.logger = get_named_client_logger(
                name=logger_name,
                port=self.port,
            )
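A small quirk in the logger name built above: the format string already places a ':' before the final %s, and the substituted value is ':' + dataset_name_, so the resulting name carries a double colon. For illustration ('iris' is a placeholder dataset name):

# the '%s(%d):%s' template receives a value that itself starts with ':'
'%s(%d):%s' % ('AutoMLSMBO', 1, ':' + 'iris')  # -> 'AutoMLSMBO(1)::iris'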
Example #5
    def run(
        self,
        config: Configuration,
        instance: Optional[str] = None,
        cutoff: Optional[float] = None,
        seed: int = 12345,
        budget: float = 0.0,
        instance_specific: Optional[str] = None,
    ) -> Tuple[StatusType, float, float, Dict[str, Union[int, float, str, Dict, List, Tuple]]]:

        # Additional run information for each of the TAE executions.
        # Defined upfront for mypy.
        additional_run_info: TYPE_ADDITIONAL_INFO = {}

        context = multiprocessing.get_context(self.pynisher_context)
        preload_modules(context)
        queue = context.Queue()

        if not (instance_specific is None or instance_specific == '0'):
            raise ValueError(instance_specific)
        init_params = {'instance': instance}
        if self.init_params is not None:
            init_params.update(self.init_params)

        if self.port is None:
            logger: Union[logging.Logger, PickableLoggerAdapter] = logging.getLogger("pynisher")
        else:
            logger = get_named_client_logger(
                name="pynisher",
                port=self.port,
            )
        arguments = dict(
            logger=logger,
            wall_time_in_s=cutoff,
            mem_in_mb=self.memory_limit,
            capture_output=True,
            context=context,
        )

        if isinstance(config, int):
            num_run = self.initial_num_run
        else:
            num_run = config.config_id + self.initial_num_run

        obj_kwargs = dict(
            queue=queue,
            config=config,
            backend=self.backend,
            port=self.port,
            metric=self.metric,
            seed=self.autosklearn_seed,
            num_run=num_run,
            scoring_functions=self.scoring_functions,
            output_y_hat_optimization=self.output_y_hat_optimization,
            include=self.include,
            exclude=self.exclude,
            disable_file_output=self.disable_file_output,
            instance=instance,
            init_params=init_params,
            budget=budget,
            budget_type=self.budget_type,
            additional_components=autosklearn.pipeline.components.base._addons,
        )

        if self.resampling_strategy != 'test':
            obj_kwargs['resampling_strategy'] = self.resampling_strategy
            obj_kwargs['resampling_strategy_args'] = self.resampling_strategy_args

        try:
            obj = pynisher.enforce_limits(**arguments)(self.ta)
            obj(**obj_kwargs)
        except Exception as e:
            exception_traceback = traceback.format_exc()
            error_message = repr(e)
            additional_run_info.update({
                'traceback': exception_traceback,
                'error': error_message
            })
            return StatusType.CRASHED, self.worst_possible_result, 0.0, additional_run_info

        if obj.exit_status in (pynisher.TimeoutException, pynisher.MemorylimitException):
            # Even if the pynisher thinks that a timeout or memout occurred,
            # it can be that the target algorithm wrote something into the queue
            # - then we treat it as a successful run
            try:
                info = autosklearn.evaluation.util.read_queue(queue)
                result = info[-1]['loss']
                status = info[-1]['status']
                additional_run_info = info[-1]['additional_run_info']

                if obj.stdout:
                    additional_run_info['subprocess_stdout'] = obj.stdout
                if obj.stderr:
                    additional_run_info['subprocess_stderr'] = obj.stderr

                if obj.exit_status is pynisher.TimeoutException:
                    additional_run_info['info'] = 'Run stopped because of timeout.'
                elif obj.exit_status is pynisher.MemorylimitException:
                    additional_run_info['info'] = 'Run stopped because of memout.'

                if status in [StatusType.SUCCESS, StatusType.DONOTADVANCE]:
                    cost = result
                else:
                    cost = self.worst_possible_result

            except Empty:
                info = None
                if obj.exit_status is pynisher.TimeoutException:
                    status = StatusType.TIMEOUT
                    additional_run_info = {'error': 'Timeout'}
                elif obj.exit_status is pynisher.MemorylimitException:
                    status = StatusType.MEMOUT
                    additional_run_info = {
                        "error": "Memout (used more than {} MB).".format(self.memory_limit)
                    }
                else:
                    raise ValueError(obj.exit_status)
                cost = self.worst_possible_result

        elif obj.exit_status is TAEAbortException:
            info = None
            status = StatusType.ABORT
            cost = self.worst_possible_result
            additional_run_info = {'error': 'Your configuration of '
                                            'auto-sklearn does not work!',
                                   'exit_status': _encode_exit_status(obj.exit_status),
                                   'subprocess_stdout': obj.stdout,
                                   'subprocess_stderr': obj.stderr,
                                   }

        else:
            try:
                info = autosklearn.evaluation.util.read_queue(queue)
                result = info[-1]['loss']
                status = info[-1]['status']
                additional_run_info = info[-1]['additional_run_info']

                if obj.exit_status == 0:
                    cost = result
                else:
                    status = StatusType.CRASHED
                    cost = self.worst_possible_result
                    additional_run_info['info'] = 'Run treated as crashed ' \
                                                  'because the pynisher exit ' \
                                                  'status %s is unknown.' % \
                                                  str(obj.exit_status)
                    additional_run_info['exit_status'] = _encode_exit_status(obj.exit_status)
                    additional_run_info['subprocess_stdout'] = obj.stdout
                    additional_run_info['subprocess_stderr'] = obj.stderr
            except Empty:
                info = None
                additional_run_info = {
                    'error': 'Result queue is empty',
                    'exit_status': _encode_exit_status(obj.exit_status),
                    'subprocess_stdout': obj.stdout,
                    'subprocess_stderr': obj.stderr,
                    'exitcode': obj.exitcode
                }
                status = StatusType.CRASHED
                cost = self.worst_possible_result

        if (
            (self.budget_type is None or budget == 0)
            and status == StatusType.DONOTADVANCE
        ):
            status = StatusType.SUCCESS

        if not isinstance(additional_run_info, dict):
            additional_run_info = {'message': additional_run_info}

        if (
            info is not None
            and self.resampling_strategy in ('holdout-iterative-fit', 'cv-iterative-fit')
            and status != StatusType.CRASHED
        ):
            learning_curve = autosklearn.evaluation.util.extract_learning_curve(info)
            learning_curve_runtime = autosklearn.evaluation.util.extract_learning_curve(
                info, 'duration'
            )
            if len(learning_curve) > 1:
                additional_run_info['learning_curve'] = learning_curve
                additional_run_info['learning_curve_runtime'] = learning_curve_runtime

            train_learning_curve = autosklearn.evaluation.util.extract_learning_curve(
                info, 'train_loss'
            )
            if len(train_learning_curve) > 1:
                additional_run_info['train_learning_curve'] = train_learning_curve
                additional_run_info['learning_curve_runtime'] = learning_curve_runtime

            if self._get_validation_loss:
                validation_learning_curve = autosklearn.evaluation.util.extract_learning_curve(
                    info, 'validation_loss',
                )
                if len(validation_learning_curve) > 1:
                    additional_run_info['validation_learning_curve'] = \
                        validation_learning_curve
                    additional_run_info[
                        'learning_curve_runtime'] = learning_curve_runtime

            if self._get_test_loss:
                test_learning_curve = autosklearn.evaluation.util.extract_learning_curve(
                    info, 'test_loss',
                )
                if len(test_learning_curve) > 1:
                    additional_run_info['test_learning_curve'] = test_learning_curve
                    additional_run_info[
                        'learning_curve_runtime'] = learning_curve_runtime

        if isinstance(config, int):
            origin = 'DUMMY'
            config_id = config
        else:
            origin = getattr(config, 'origin', 'UNKNOWN')
            config_id = config.config_id
        additional_run_info['configuration_origin'] = origin

        runtime = float(obj.wall_clock_time)

        autosklearn.evaluation.util.empty_queue(queue)
        self.logger.info("Finished evaluating configuration %d" % config_id)
        return status, cost, runtime, additional_run_info
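The run method above implements a SMAC-style target-algorithm executor: it evaluates one configuration under pynisher's time and memory limits and reports a (status, cost, runtime, additional_run_info) tuple. A hedged sketch of how a caller might consume it (tae stands for an already-constructed instance; the import path is the one auto-sklearn is assumed to use):

from smac.tae import StatusType  # assumed import path

status, cost, runtime, info = tae.run(
    config=config,  # a ConfigSpace Configuration, or an int for dummy runs
    cutoff=60.0,    # per-evaluation wall-clock limit in seconds
    seed=12345,
    budget=0.0,
)
if status == StatusType.SUCCESS:
    print('loss %f in %.1fs' % (cost, runtime))
else:
    print('run failed: %s' % info.get('error', status))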
Example #6
    def __init__(
        self,
        backend: Backend,
        autosklearn_seed: int,
        resampling_strategy: Union[str, BaseCrossValidator, _RepeatedSplits, BaseShuffleSplit],
        metric: Scorer,
        cost_for_crash: float,
        abort_on_first_run_crash: bool,
        port: int,
        pynisher_context: str,
        initial_num_run: int = 1,
        stats: Optional[Stats] = None,
        run_obj: str = 'quality',
        par_factor: int = 1,
        scoring_functions: Optional[List[Scorer]] = None,
        output_y_hat_optimization: bool = True,
        include: Optional[List[str]] = None,
        exclude: Optional[List[str]] = None,
        memory_limit: Optional[int] = None,
        disable_file_output: bool = False,
        init_params: Optional[Dict[str, Any]] = None,
        budget_type: Optional[str] = None,
        ta: Optional[Callable] = None,
        **resampling_strategy_args: Any,
    ):

        if resampling_strategy == 'holdout':
            eval_function = autosklearn.evaluation.train_evaluator.eval_holdout
        elif resampling_strategy == 'holdout-iterative-fit':
            eval_function = autosklearn.evaluation.train_evaluator.eval_iterative_holdout
        elif resampling_strategy == 'cv-iterative-fit':
            eval_function = autosklearn.evaluation.train_evaluator.eval_iterative_cv
        elif resampling_strategy == 'cv' or isinstance(resampling_strategy, (
            BaseCrossValidator, _RepeatedSplits, BaseShuffleSplit)
        ):
            eval_function = autosklearn.evaluation.train_evaluator.eval_cv
        elif resampling_strategy == 'partial-cv':
            eval_function = autosklearn.evaluation.train_evaluator.eval_partial_cv
        elif resampling_strategy == 'partial-cv-iterative-fit':
            eval_function = autosklearn.evaluation.train_evaluator.eval_partial_cv_iterative
        elif resampling_strategy == 'test':
            eval_function = autosklearn.evaluation.test_evaluator.eval_t
            output_y_hat_optimization = False
        else:
            raise ValueError('Unknown resampling strategy %s' %
                             resampling_strategy)

        self.worst_possible_result = cost_for_crash

        eval_function = functools.partial(
            fit_predict_try_except_decorator,
            ta=eval_function,
            cost_for_crash=self.worst_possible_result,
        )

        super().__init__(
            ta=eval_function,
            stats=stats,
            run_obj=run_obj,
            par_factor=par_factor,
            cost_for_crash=self.worst_possible_result,
            abort_on_first_run_crash=abort_on_first_run_crash,
        )

        self.backend = backend
        self.autosklearn_seed = autosklearn_seed
        self.resampling_strategy = resampling_strategy
        self.initial_num_run = initial_num_run
        self.metric = metric
        self.resampling_strategy_args = resampling_strategy_args
        self.scoring_functions = scoring_functions
        # TODO deactivate output_y_hat_optimization and let the respective evaluator decide
        self.output_y_hat_optimization = output_y_hat_optimization
        self.include = include
        self.exclude = exclude
        self.disable_file_output = disable_file_output
        self.init_params = init_params
        self.budget_type = budget_type

        if memory_limit is not None:
            memory_limit = int(math.ceil(memory_limit))
        self.memory_limit = memory_limit

        dm = self.backend.load_datamanager()
        if 'X_valid' in dm.data and 'Y_valid' in dm.data:
            self._get_validation_loss = True
        else:
            self._get_validation_loss = False
        if 'X_test' in dm.data and 'Y_test' in dm.data:
            self._get_test_loss = True
        else:
            self._get_test_loss = False

        self.port = port
        self.pynisher_context = pynisher_context
        if self.port is None:
            self.logger: Union[logging.Logger, PickableLoggerAdapter] = logging.getLogger("TAE")
        else:
            self.logger = get_named_client_logger(
                name="TAE",
                port=self.port,
            )
Example #7
    def __init__(
        self,
        backend: Backend,
        queue: multiprocessing.Queue,
        metric: Scorer,
        additional_components: Dict[str, ThirdPartyComponents],
        port: Optional[int],
        configuration: Optional[Union[int, Configuration]] = None,
        scoring_functions: Optional[List[Scorer]] = None,
        seed: int = 1,
        output_y_hat_optimization: bool = True,
        num_run: Optional[int] = None,
        include: Optional[List[str]] = None,
        exclude: Optional[List[str]] = None,
        disable_file_output: Union[bool, List[str]] = False,
        init_params: Optional[Dict[str, Any]] = None,
        budget: Optional[float] = None,
        budget_type: Optional[str] = None,
    ):

        # Limit the number of threads that numpy uses
        threadpool_limits(limits=1)

        self.starttime = time.time()

        self.configuration = configuration
        self.backend = backend
        self.port = port
        self.queue = queue

        self.datamanager = self.backend.load_datamanager()
        self.include = include
        self.exclude = exclude

        self.X_valid = self.datamanager.data.get('X_valid')
        self.y_valid = self.datamanager.data.get('Y_valid')
        self.X_test = self.datamanager.data.get('X_test')
        self.y_test = self.datamanager.data.get('Y_test')

        self.metric = metric
        self.task_type = self.datamanager.info['task']
        self.seed = seed

        self.output_y_hat_optimization = output_y_hat_optimization
        self.scoring_functions = scoring_functions

        if isinstance(disable_file_output, (bool, list)):
            self.disable_file_output: Union[bool,
                                            List[str]] = disable_file_output
        else:
            raise ValueError(
                'disable_file_output should be either a bool or a list')

        if self.task_type in REGRESSION_TASKS:
            if not isinstance(self.configuration, Configuration):
                self.model_class = MyDummyRegressor
            else:
                self.model_class = \
                    autosklearn.pipeline.regression.SimpleRegressionPipeline
            self.predict_function = self._predict_regression
        else:
            if not isinstance(self.configuration, Configuration):
                self.model_class = MyDummyClassifier
            else:
                self.model_class = autosklearn.pipeline.classification.SimpleClassificationPipeline
            self.predict_function = self._predict_proba

        self._init_params = {
            'data_preprocessor:feat_type': self.datamanager.feat_type
        }

        if init_params is not None:
            self._init_params.update(init_params)

        if num_run is None:
            num_run = 0
        self.num_run = num_run

        logger_name = '%s(%d):%s' % (self.__class__.__name__.split('.')[-1],
                                     self.seed, self.datamanager.name)

        if self.port is None:
            self.logger = logging.getLogger(__name__)
        else:
            self.logger = get_named_client_logger(
                name=logger_name,
                port=self.port,
            )

        self.Y_optimization: Optional[Union[List, np.ndarray]] = None
        self.Y_actual_train = None

        self.budget = budget
        self.budget_type = budget_type

        # Add 3rd-party components to the list of 3rd-party components in case this wasn't done
        # before (this happens if we run in parallel and the components are only passed to the
        # AbstractEvaluator via the TAE and are not there yet because the worker is in its own
        # process).
        for key in additional_components:
            for component_name, component in additional_components[key].components.items():
                if component_name not in _addons[key].components:
                    _addons[key].add_component(component)

        # Appease mypy: make sure the attribute is always defined
        self.model = self._get_model()
Example #8
    def __init__(
        self,
        backend: Backend,
        queue: multiprocessing.Queue,
        metric: Scorer,
        port: Optional[int],
        configuration: Optional[Union[int, Configuration]] = None,
        scoring_functions: Optional[List[Scorer]] = None,
        seed: int = 1,
        output_y_hat_optimization: bool = True,
        num_run: Optional[int] = None,
        include: Optional[List[str]] = None,
        exclude: Optional[List[str]] = None,
        disable_file_output: Union[bool, List[str]] = False,
        init_params: Optional[Dict[str, Any]] = None,
        budget: Optional[float] = None,
        budget_type: Optional[str] = None,
    ):

        self.starttime = time.time()

        self.configuration = configuration
        self.backend = backend
        self.port = port
        self.queue = queue

        self.datamanager = self.backend.load_datamanager()
        self.include = include
        self.exclude = exclude

        self.X_valid = self.datamanager.data.get('X_valid')
        self.y_valid = self.datamanager.data.get('Y_valid')
        self.X_test = self.datamanager.data.get('X_test')
        self.y_test = self.datamanager.data.get('Y_test')

        self.metric = metric
        self.task_type = self.datamanager.info['task']
        self.seed = seed

        self.output_y_hat_optimization = output_y_hat_optimization
        self.scoring_functions = scoring_functions

        if isinstance(disable_file_output, (bool, list)):
            self.disable_file_output: Union[bool,
                                            List[str]] = disable_file_output
        else:
            raise ValueError(
                'disable_file_output should be either a bool or a list')

        if self.task_type in REGRESSION_TASKS:
            if not isinstance(self.configuration, Configuration):
                self.model_class = MyDummyRegressor
            else:
                self.model_class = \
                    autosklearn.pipeline.regression.SimpleRegressionPipeline
            self.predict_function = self._predict_regression
        else:
            if not isinstance(self.configuration, Configuration):
                self.model_class = MyDummyClassifier
            else:
                self.model_class = autosklearn.pipeline.classification.SimpleClassificationPipeline
            self.predict_function = self._predict_proba

        categorical_mask = []
        for feat in self.datamanager.feat_type:
            if feat.lower() == 'numerical':
                categorical_mask.append(False)
            elif feat.lower() == 'categorical':
                categorical_mask.append(True)
            else:
                raise ValueError(feat)
        if np.sum(categorical_mask) > 0:
            self._init_params = {
                'data_preprocessing:categorical_features': categorical_mask
            }
        else:
            self._init_params = {}
        if init_params is not None:
            self._init_params.update(init_params)

        if num_run is None:
            num_run = 0
        self.num_run = num_run

        logger_name = '%s(%d):%s' % (self.__class__.__name__.split('.')[-1],
                                     self.seed, self.datamanager.name)

        if self.port is None:
            self.logger = logging.getLogger(__name__)
        else:
            self.logger = get_named_client_logger(
                name=logger_name,
                port=self.port,
            )

        self.Y_optimization: Optional[Union[List, np.ndarray]] = None
        self.Y_actual_train = None

        self.budget = budget
        self.budget_type = budget_type

        # Appease mypy: make sure the attribute is always defined
        self.model = self._get_model()
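The categorical_mask loop in this last constructor maps auto-sklearn's per-feature type strings onto a boolean mask, and the preprocessor init params are only set when at least one feature is categorical. The same mapping in isolation (values illustrative; unlike the loop above, this compact form does not raise ValueError on unknown strings):

feat_type = ['numerical', 'categorical', 'numerical']
categorical_mask = [feat.lower() == 'categorical' for feat in feat_type]
# -> [False, True, False]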
Example #9
    def __init__(self, backend, autosklearn_seed, resampling_strategy, metric,
                 cost_for_crash, abort_on_first_run_crash, port,
                 initial_num_run=1, stats=None,
                 run_obj='quality', par_factor=1, scoring_functions=None,
                 output_y_hat_optimization=True, include=None, exclude=None,
                 memory_limit=None, disable_file_output=False, init_params=None,
                 budget_type=None, ta=False, pynisher_context='spawn', **resampling_strategy_args):

        if resampling_strategy == 'holdout':
            eval_function = autosklearn.evaluation.train_evaluator.eval_holdout
        elif resampling_strategy == 'holdout-iterative-fit':
            eval_function = autosklearn.evaluation.train_evaluator.eval_iterative_holdout
        elif resampling_strategy == 'cv-iterative-fit':
            eval_function = autosklearn.evaluation.train_evaluator.eval_iterative_cv
        elif resampling_strategy == 'cv' or (
             isinstance(resampling_strategy, type) and (
                issubclass(resampling_strategy, BaseCrossValidator) or
                issubclass(resampling_strategy, _RepeatedSplits) or
                issubclass(resampling_strategy, BaseShuffleSplit)
                )
             ):
            eval_function = autosklearn.evaluation.train_evaluator.eval_cv
        elif resampling_strategy == 'partial-cv':
            eval_function = autosklearn.evaluation.train_evaluator.eval_partial_cv
        elif resampling_strategy == 'partial-cv-iterative-fit':
            eval_function = autosklearn.evaluation.train_evaluator.eval_partial_cv_iterative
        elif resampling_strategy == 'test':
            eval_function = autosklearn.evaluation.test_evaluator.eval_t
            output_y_hat_optimization = False
        else:
            raise ValueError('Unknown resampling strategy %s' %
                             resampling_strategy)

        self.worst_possible_result = cost_for_crash

        eval_function = functools.partial(
            fit_predict_try_except_decorator,
            ta=eval_function,
            cost_for_crash=self.worst_possible_result,
        )

        super().__init__(
            ta=eval_function,
            stats=stats,
            run_obj=run_obj,
            par_factor=par_factor,
            cost_for_crash=self.worst_possible_result,
            abort_on_first_run_crash=abort_on_first_run_crash,
        )

        self.backend = backend
        self.autosklearn_seed = autosklearn_seed
        self.resampling_strategy = resampling_strategy
        self.initial_num_run = initial_num_run
        self.metric = metric
        self.resampling_strategy_args = resampling_strategy_args
        self.scoring_functions = scoring_functions
        # TODO deactivate output_y_hat_optimization and let the respective evaluator decide
        self.output_y_hat_optimization = output_y_hat_optimization
        self.include = include
        self.exclude = exclude
        self.disable_file_output = disable_file_output
        self.init_params = init_params
        self.budget_type = budget_type

        if memory_limit is not None:
            memory_limit = int(math.ceil(memory_limit))
        self.memory_limit = memory_limit

        dm = self.backend.load_datamanager()
        if 'X_valid' in dm.data and 'Y_valid' in dm.data:
            self._get_validation_loss = True
        else:
            self._get_validation_loss = False
        if 'X_test' in dm.data and 'Y_test' in dm.data:
            self._get_test_loss = True
        else:
            self._get_test_loss = False

        self.port = port
        self.pynisher_context = pynisher_context
        if self.port is None:
            self.logger = logging.getLogger("TAE")
        else:
            self.logger = get_named_client_logger(
                name="TAE",
                port=self.port,
            )