Example #1
    def __init__(
        self,
        temporary_directory: str,
        output_directory: Optional[str],
        delete_tmp_folder_after_terminate: bool,
        delete_output_folder_after_terminate: bool,
    ):

        # Check that the names of tmp_dir and output_dir are not the same.
        if temporary_directory == output_directory and temporary_directory is not None:
            raise ValueError("The temporary and the output directory "
                             "must be different.")

        self.delete_tmp_folder_after_terminate = delete_tmp_folder_after_terminate
        self.delete_output_folder_after_terminate = delete_output_folder_after_terminate
        # attributes to check that directories were created by autosklearn.
        self._tmp_dir_created = False
        self._output_dir_created = False

        self._temporary_directory = (get_randomized_directory_name(
            temporary_directory=temporary_directory, ))
        self._output_directory = output_directory
        self.create_directories()
        # This is the first place the logger gets created.
        # We want to make sure any logging from here on uses the correct
        # directory where all files should be created.
        logging.setup_logger(output_dir=self._temporary_directory)
        self._logger = logging.get_logger(__name__)
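A minimal, self-contained sketch of the directory-validation pattern used in Example #1; the function and path names are illustrative and not part of auto-sklearn:

def validate_directories(temporary_directory, output_directory):
    # Mirror of the check above: identical, non-None paths are rejected.
    if temporary_directory == output_directory and temporary_directory is not None:
        raise ValueError("The temporary and the output directory must be different.")

validate_directories('/tmp/autosklearn_tmp', '/tmp/autosklearn_out')   # passes
# validate_directories('/tmp/same', '/tmp/same')                       # would raise ValueError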
Example #2
    def __init__(self,
                 autosklearn_tmp_dir,
                 dataset_name,
                 task_type,
                 metric,
                 limit,
                 output_dir,
                 ensemble_size=None,
                 ensemble_nbest=None,
                 seed=1,
                 shared_mode=False,
                 max_iterations=-1,
                 precision="32",
                 low_precision=True):
        super(EnsembleBuilder, self).__init__()

        self.autosklearn_tmp_dir = autosklearn_tmp_dir
        self.dataset_name = dataset_name
        self.task_type = task_type
        self.metric = metric
        self.limit = limit
        self.output_dir = output_dir
        self.ensemble_size = ensemble_size
        self.ensemble_nbest = ensemble_nbest
        self.seed = seed
        self.shared_mode = shared_mode
        self.max_iterations = max_iterations
        self.precision = precision
        self.low_precision = low_precision

        logger_name = 'EnsembleBuilder(%d):%s' % (self.seed, self.dataset_name)
        setup_logger(
            os.path.join(self.autosklearn_tmp_dir,
                         '%s.log' % str(logger_name)))
        self.logger = get_logger(logger_name)
Example #3
 def _get_logger(self, name):
     logger_name = 'AutoML(%d):%s' % (self._seed, name)
     setup_logger(os.path.join(self._backend.temporary_directory,
                               '%s.log' % str(logger_name)),
                  self.logging_config,
                  )
     return get_logger(logger_name)
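A short sketch of the logger name and log-file path that _get_logger in Example #3 produces, with an illustrative seed, name, and temporary directory:

import os

seed = 1
name = 'automl'
temporary_directory = '/tmp/autosklearn_tmp'   # stands in for self._backend.temporary_directory

logger_name = 'AutoML(%d):%s' % (seed, name)                          # -> 'AutoML(1):automl'
log_file = os.path.join(temporary_directory, '%s.log' % logger_name)  # -> '/tmp/autosklearn_tmp/AutoML(1):automl.log'
print(logger_name, log_file)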
Example #4
    def __init__(self,
                 backend,
                 dataset_name,
                 task_type,
                 metric,
                 limit,
                 ensemble_size=None,
                 ensemble_nbest=None,
                 seed=1,
                 shared_mode=False,
                 max_iterations=-1,
                 precision="32",
                 low_precision=True):
        super(EnsembleBuilder, self).__init__()

        self.backend = backend
        self.dataset_name = dataset_name
        self.task_type = task_type
        self.metric = metric
        self.limit = limit
        self.ensemble_size = ensemble_size
        self.ensemble_nbest = ensemble_nbest
        self.seed = seed
        self.shared_mode = shared_mode
        self.max_iterations = max_iterations
        self.precision = precision
        self.low_precision = low_precision

        logger_name = 'EnsembleBuilder(%d):%s' % (self.seed, self.dataset_name)
        self.logger = get_logger(logger_name)
Example #5
    def __init__(self,
                 temporary_directory,
                 output_directory,
                 delete_tmp_folder_after_terminate,
                 delete_output_folder_after_terminate,
                 shared_mode=False):

        # Check that the names of tmp_dir and output_dir are not the same.
        if temporary_directory == output_directory and temporary_directory is not None:
            raise ValueError("The temporary and the output directory "
                             "must be different.")

        self.delete_tmp_folder_after_terminate = delete_tmp_folder_after_terminate
        self.delete_output_folder_after_terminate = delete_output_folder_after_terminate
        self.shared_mode = shared_mode
        # attributes to check that directories were created by autosklearn.
        self._tmp_dir_created = False
        self._output_dir_created = False

        self.__temporary_directory, self.__output_directory = (
            get_randomized_directory_names(
                temporary_directory=temporary_directory,
                output_directory=output_directory,
            ))
        self._logger = logging.get_logger(__name__)
        self.create_directories()
Example #6
    def test_do_dummy_prediction(self):
        for name in ['401_bac', '31_bac', 'adult', 'cadata']:
            backend_api = self._create_backend('test_do_dummy_prediction')

            dataset = os.path.join(self.test_dir, '..', '.data', name)

            auto = autosklearn.automl.AutoML(
                backend_api, 20, 5, initial_configurations_via_metalearning=25)
            setup_logger()
            auto._logger = get_logger('test_do_dummy_predictions')
            auto._backend._make_internals_directory()
            D = load_data(dataset, backend_api)
            auto._backend.save_datamanager(D)
            auto._do_dummy_prediction(D, 1)

            # Ensure that the dummy predictions are not in the current working
            # directory, but in the temporary directory.
            self.assertFalse(
                os.path.exists(os.path.join(os.getcwd(), '.auto-sklearn')))
            self.assertTrue(
                os.path.exists(
                    os.path.join(backend_api.temporary_directory,
                                 '.auto-sklearn', 'predictions_ensemble',
                                 'predictions_ensemble_1_1_0.0.npy')))

            del auto
            self._tearDown(backend_api.temporary_directory)
            self._tearDown(backend_api.output_directory)
Example #7
 def __init__(self, temporary_directory, output_directory,
              delete_tmp_folder_after_terminate,
              delete_output_folder_after_terminate):
     self._prepare_directories(temporary_directory, output_directory)
     self.delete_tmp_folder_after_terminate = delete_tmp_folder_after_terminate
     self.delete_output_folder_after_terminate = delete_output_folder_after_terminate
     self._logger = logging.get_logger(__name__)
     self.create_directories()
Example #8
    def __init__(
        self,
        Datamanager,
        backend,
        configuration=None,
        with_predictions=False,
        all_scoring_functions=False,
        seed=1,
        output_y_test=False,
        num_run=None,
        subsample=None,
    ):

        self.starttime = time.time()

        self.configuration = configuration
        self.backend = backend

        self.D = Datamanager

        self.X_valid = Datamanager.data.get('X_valid')
        self.X_test = Datamanager.data.get('X_test')

        self.metric = Datamanager.info['metric']
        self.task_type = Datamanager.info['task']
        self.seed = seed

        self.output_y_test = output_y_test
        self.with_predictions = with_predictions
        self.all_scoring_functions = all_scoring_functions

        if self.task_type in REGRESSION_TASKS:
            if not isinstance(self.configuration, Configuration):
                self.model_class = MyDummyRegressor
            else:
                self.model_class = \
                    autosklearn.pipeline.regression.SimpleRegressionPipeline
            self.predict_function = self._predict_regression
        else:
            if not isinstance(self.configuration, Configuration):
                self.model_class = MyDummyClassifier
            else:
                self.model_class = \
                    autosklearn.pipeline.classification.SimpleClassificationPipeline
            self.predict_function = self._predict_proba

        if num_run is None:
            num_run = 0
        self.num_run = num_run

        self.subsample = subsample

        self.model = self.model_class(self.configuration, self.seed)

        logger_name = '%s(%d):%s' % (self.__class__.__name__.split('.')[-1],
                                     self.seed, self.D.name)
        self.logger = get_logger(logger_name)
Example #9
def run_smac(tmp_dir, basename, time_for_task, ml_memory_limit,
              data_manager_path, configspace_path, initial_configurations,
              per_run_time_limit, watcher, backend, seed,
              resampling_strategy, resampling_strategy_arguments, shared_mode):
    logger = logging.get_logger(__name__)

    task_name = 'runSmac'
    watcher.start_task(task_name)

    instance_file_path, test_instance_file_path = \
        _write_instance_file(resampling_strategy, resampling_strategy_arguments,
                             data_manager_path, backend, tmp_dir)

    scenario_file_path = _write_scenario_file(time_for_task, per_run_time_limit,
                                              ml_memory_limit, tmp_dir,
                                              configspace_path,
                                              instance_file_path,
                                              test_instance_file_path,
                                              basename)

    # Start SMAC
    time_smac = max(0, time_for_task - watcher.wall_elapsed(basename))
    if time_smac <= 0:
        logger.info('No time left for SMAC')
        return

    logger.info('Start SMAC with %5.2fsec time left' % time_smac)

    initial_challengers = initial_configurations
    if initial_challengers is None:
        initial_challengers = []

    smac_options = {
        'retryTargetAlgorithmRunCount': '0',
        'intensification-percentage': '0.5',
        'num-ei-random': '1000',
        'num-challengers': 100,
        'initial-incumbent': 'DEFAULT',
        'validation': 'false'
    }

    if shared_mode:
        smac_options['shared-model-mode'] = 'true'
        smac_options['shared-model-mode-frequency'] = '300'

    call = ' '.join(['smac', '--numRun', str(seed), '--scenario',
                     scenario_file_path] +
                    ['--%s %s' % (opt, smac_options[opt])
                     for opt in smac_options]
                    + initial_challengers,
    )

    proc = submit_call(call, seed, logger)

    watcher.stop_task(task_name)
    return proc
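For reference, a sketch of the command string the ' '.join(...) above assembles, using the options listed in the snippet with an illustrative seed and scenario path (the real values come from the surrounding run):

smac_options = {
    'retryTargetAlgorithmRunCount': '0',
    'intensification-percentage': '0.5',
    'num-ei-random': '1000',
    'num-challengers': 100,
    'initial-incumbent': 'DEFAULT',
    'validation': 'false',
}
call = ' '.join(
    ['smac', '--numRun', '1', '--scenario', '/tmp/scenario.txt']
    + ['--%s %s' % (opt, smac_options[opt]) for opt in smac_options]
)
print(call)
# smac --numRun 1 --scenario /tmp/scenario.txt --retryTargetAlgorithmRunCount 0 ...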
Example #10
 def __init__(self,
              temporary_directory,
              output_directory,
              delete_tmp_folder_after_terminate,
              delete_output_folder_after_terminate):
     self._prepare_directories(temporary_directory, output_directory)
     self.delete_tmp_folder_after_terminate = delete_tmp_folder_after_terminate
     self.delete_output_folder_after_terminate = delete_output_folder_after_terminate
     self._logger = logging.get_logger(__name__)
     self.create_directories()
Example #11
def run_smac(tmp_dir, basename, time_for_task, ml_memory_limit,
             data_manager_path, configspace_path, initial_configurations,
             per_run_time_limit, watcher, backend, seed, resampling_strategy,
             resampling_strategy_arguments, shared_mode):
    logger = logging.get_logger(__name__)

    task_name = 'runSmac'
    watcher.start_task(task_name)

    instance_file_path, test_instance_file_path = \
        _write_instance_file(resampling_strategy, resampling_strategy_arguments,
                             data_manager_path, backend, tmp_dir)

    scenario_file_path = _write_scenario_file(
        time_for_task, per_run_time_limit, ml_memory_limit, tmp_dir,
        configspace_path, instance_file_path, test_instance_file_path,
        basename)

    # Start SMAC
    time_smac = max(0, time_for_task - watcher.wall_elapsed(basename))
    if time_smac <= 0:
        logger.info('No time left for SMAC')
        return

    logger.info('Start SMAC with %5.2fsec time left' % time_smac)

    initial_challengers = initial_configurations
    if initial_challengers is None:
        initial_challengers = []

    smac_options = {
        'retryTargetAlgorithmRunCount': '0',
        'intensification-percentage': '0.5',
        'num-ei-random': '1000',
        'num-challengers': 100,
        'initial-incumbent': 'DEFAULT',
        'validation': 'false'
    }

    if shared_mode:
        smac_options['shared-model-mode'] = 'true'
        smac_options['shared-model-mode-frequency'] = '300'

    call = ' '.join(
        ['smac', '--numRun',
         str(seed), '--scenario', scenario_file_path] +
        ['--%s %s' % (opt, smac_options[opt])
         for opt in smac_options] + initial_challengers, )

    proc = submit_call(call, seed, logger)

    watcher.stop_task(task_name)
    return proc
Example #12
    def __init__(self, Datamanager, backend, configuration=None,
                 with_predictions=False,
                 all_scoring_functions=False,
                 seed=1,
                 output_y_test=False,
                 num_run=None,
                 subsample=None,):

        self.starttime = time.time()

        self.configuration = configuration
        self.backend = backend

        self.D = Datamanager

        self.X_valid = Datamanager.data.get('X_valid')
        self.X_test = Datamanager.data.get('X_test')

        self.metric = Datamanager.info['metric']
        self.task_type = Datamanager.info['task']
        self.seed = seed

        self.output_y_test = output_y_test
        self.with_predictions = with_predictions
        self.all_scoring_functions = all_scoring_functions

        if self.task_type in REGRESSION_TASKS:
            if not isinstance(self.configuration, Configuration):
                self.model_class = MyDummyRegressor
            else:
                self.model_class = \
                    autosklearn.pipeline.regression.SimpleRegressionPipeline
            self.predict_function = self._predict_regression
        else:
            if not isinstance(self.configuration, Configuration):
                self.model_class = MyDummyClassifier
            else:
                self.model_class = \
                    autosklearn.pipeline.classification.SimpleClassificationPipeline
            self.predict_function = self._predict_proba

        if num_run is None:
            num_run = 0
        self.num_run = num_run

        self.subsample = subsample

        self.model = self.model_class(self.configuration, self.seed)

        logger_name = '%s(%d):%s' % (self.__class__.__name__.split('.')[-1],
                                     self.seed, self.D.name)
        self.logger = get_logger(logger_name)
Example #13
    def test_do_dummy_prediction(self):
        datasets = {
            'breast_cancer': BINARY_CLASSIFICATION,
            'wine': MULTICLASS_CLASSIFICATION,
            'diabetes': REGRESSION,
        }

        for name, task in datasets.items():
            backend_api = self._create_backend('test_do_dummy_prediction')

            X_train, Y_train, X_test, Y_test = putil.get_dataset(name)
            datamanager = XYDataManager(
                X_train,
                Y_train,
                X_test,
                Y_test,
                task=task,
                dataset_name=name,
                feat_type=None,
            )

            auto = autosklearn.automl.AutoML(
                backend_api,
                20,
                5,
                initial_configurations_via_metalearning=25,
                metric=accuracy,
            )
            setup_logger()
            auto._logger = get_logger('test_do_dummy_predictions')
            auto._backend.save_datamanager(datamanager)
            D = backend_api.load_datamanager()

            # Check if the data manager is correctly loaded
            self.assertEqual(D.info['task'], datamanager.info['task'])

            auto._do_dummy_prediction(D, 1)

            # Ensure that the dummy predictions are not in the current working
            # directory, but in the temporary directory.
            self.assertFalse(
                os.path.exists(os.path.join(os.getcwd(), '.auto-sklearn')))
            self.assertTrue(
                os.path.exists(
                    os.path.join(backend_api.temporary_directory,
                                 '.auto-sklearn', 'predictions_ensemble',
                                 'predictions_ensemble_1_1_0.0.npy')))

            del auto
            self._tearDown(backend_api.temporary_directory)
            self._tearDown(backend_api.output_directory)
Example #14
def run_ensemble_builder(tmp_dir, dataset_name, task_type, metric, limit,
                         output_dir, ensemble_size, ensemble_nbest, seed,
                         shared_mode, max_iterations, precision):
    logger = logging.get_logger(__name__)

    if limit <= 0 and (max_iterations is None or max_iterations <= 0):
        logger.warning("Not starting ensemble builder because it's not worth "
                       "it.")
        # It makes no sense to start building ensembles_statistics
        return
    ensemble_script = 'python -m autosklearn.ensemble_selection_script'
    runsolver_exec = 'runsolver'
    delay = 5

    task_type = TASK_TYPES_TO_STRING[task_type]
    metric = METRIC_TO_STRING[metric]

    call = [ensemble_script,
         '--auto-sklearn-tmp-directory', tmp_dir,
         '--basename', dataset_name,
         '--task', task_type,
         '--metric', metric,
         '--limit', str(limit - 5),
         '--output-directory', output_dir,
         '--ensemble-size', str(ensemble_size),
         '--ensemble-nbest', str(ensemble_nbest),
         '--auto-sklearn-seed', str(seed),
         '--max-iterations', str(max_iterations),
         '--precision', str(precision)]
    if shared_mode:
        call.append('--shared-mode')

    call = ' '.join(call)

    # Runsolver does strange things if the time limit is negative. Set it to
    # be at least one (0 means infinity)
    if limit <= 0:
        limit = 0
    else:
        limit = max(1, limit)

    # Now add runsolver command
    # runsolver_cmd = "%s --watcher-data /dev/null -W %d" % \
    #                (runsolver_exec, limit)
    runsolver_cmd = '%s --watcher-data /dev/null -W %d -d %d' % \
                    (runsolver_exec, limit, delay)
    call = runsolver_cmd + ' ' + call

    proc = submit_call(call, seed, logger, log_dir=tmp_dir)
    return proc
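A sketch of the final command assembled above, with illustrative values for the time limit and a truncated ensemble-script argument string (the real call is built from the function parameters):

limit, delay = 60, 5
ensemble_call = ('python -m autosklearn.ensemble_selection_script '
                 '--auto-sklearn-tmp-directory /tmp/autosklearn_tmp ...')
runsolver_cmd = 'runsolver --watcher-data /dev/null -W %d -d %d' % (limit, delay)
print(runsolver_cmd + ' ' + ensemble_call)
# runsolver --watcher-data /dev/null -W 60 -d 5 python -m autosklearn.ensemble_selection_script ...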
Example #15
def run_ensemble_builder(tmp_dir, dataset_name, task_type, metric, limit,
                         output_dir, ensemble_size, ensemble_nbest, seed,
                         shared_mode, max_iterations, precision):
    logger = logging.get_logger(__name__)

    if limit <= 0 and (max_iterations is None or max_iterations <= 0):
        logger.warning("Not starting ensemble builder because it's not worth "
                       "it.")
        # It makes no sense to start building ensembles_statistics
        return
    ensemble_script = 'python -m autosklearn.ensemble_selection_script'
    runsolver_exec = 'runsolver'
    delay = 5

    task_type = TASK_TYPES_TO_STRING[task_type]
    metric = METRIC_TO_STRING[metric]

    call = [
        ensemble_script, '--auto-sklearn-tmp-directory', tmp_dir,
        '--dataset_name', dataset_name, '--task', task_type, '--metric',
        metric, '--limit',
        str(limit - 5), '--output-directory', output_dir, '--ensemble-size',
        str(ensemble_size), '--ensemble-nbest',
        str(ensemble_nbest), '--auto-sklearn-seed',
        str(seed), '--max-iterations',
        str(max_iterations), '--precision',
        str(precision)
    ]
    if shared_mode:
        call.append('--shared-mode')

    call = ' '.join(call)

    # Runsolver does strange things if the time limit is negative. Set it to
    # be at least one (0 means infinity)
    if limit <= 0:
        limit = 0
    else:
        limit = max(1, limit)

    # Now add runsolver command
    # runsolver_cmd = "%s --watcher-data /dev/null -W %d" % \
    #                (runsolver_exec, limit)
    runsolver_cmd = '%s --watcher-data /dev/null -W %d -d %d' % \
                    (runsolver_exec, limit, delay)
    call = runsolver_cmd + ' ' + call

    proc = submit_call(call, seed, logger, log_dir=tmp_dir)
    return proc
Example #16
    def __init__(self, context):
        self.logger = logging.get_logger(__name__)
        self.context = context

        # Create the temporary directory if it does not yet exist
        try:
            os.makedirs(self.temporary_directory)
        except Exception:
            pass
        # This does not have to exist or be specified
        if self.output_directory is not None:
            if not os.path.exists(self.output_directory):
                raise ValueError("Output directory %s does not exist." %
                                 self.output_directory)

        self.internals_directory = os.path.join(self.temporary_directory,
                                                ".auto-sklearn")
        self._make_internals_directory()
Example #17
    def __init__(self,
                 dataset_name,
                 configuration_space,
                 meta_base,
                 distance='l1',
                 seed=None,
                 use_features=None,
                 distance_kwargs=None):
        self.dataset_name = dataset_name
        self.configuration_space = configuration_space
        self.meta_base = meta_base
        self.distance = distance
        self.seed = seed
        self.use_features = use_features
        self.distance_kwargs = distance_kwargs
        self.kND = None  # For caching, makes things faster...

        self.logger = get_logger(__name__)
Example #18
    def __init__(self, context):
        self.logger = logging.get_logger(__name__)
        self.context = context

        # Create the temporary directory if it does not yet exist
        try:
            os.makedirs(self.temporary_directory)
        except Exception:
            pass
        # This does not have to exist or be specified
        if self.output_directory is not None:
            if not os.path.exists(self.output_directory):
                raise ValueError("Output directory %s does not exist." %
                                 self.output_directory)

        self.internals_directory = os.path.join(self.temporary_directory,
                                                ".auto-sklearn")
        self._make_internals_directory()
Example #19
    def test_exceptions_inside_log_in_smbo(self, smbo_run_mock):

        # Make sure that any exceptions raised during the AutoML fit due to
        # SMAC are properly captured in a log file.
        backend_api = self._create_backend('test_exceptions_inside_log')
        self._tearDown(backend_api.temporary_directory)
        self._tearDown(backend_api.output_directory)

        automl = autosklearn.automl.AutoML(
            backend_api,
            20,
            5,
            metric=accuracy,
        )

        output_file = 'test_exceptions_inside_log.log'
        setup_logger(output_file=output_file)
        logger = get_logger('test_exceptions_inside_log')

        # Create a custom exception to prevent other errors from slipping in
        class MyException(Exception):
            pass

        X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')
        # The first call is on dummy predictor failure
        message = str(np.random.randint(100)) + '_run_smbo'
        smbo_run_mock.side_effect = MyException(message)

        with unittest.mock.patch(
                'autosklearn.automl.AutoML._get_logger') as mock:
            mock.return_value = logger
            with self.assertRaises(MyException):
                automl.fit(
                    X_train,
                    Y_train,
                    task=MULTICLASS_CLASSIFICATION,
                )
            with open(output_file) as f:
                self.assertTrue(message in f.read())

        # Cleanup
        os.unlink(output_file)
        self._tearDown(backend_api.temporary_directory)
        self._tearDown(backend_api.output_directory)
Example #20
    def test_fit_roar(self):
        def get_roar_object_callback(
                scenario_dict,
                seed,
                ta,
                ta_kwargs,
                **kwargs
        ):
            """Random online adaptive racing.

            http://ml.informatik.uni-freiburg.de/papers/11-LION5-SMAC.pdf"""
            scenario = Scenario(scenario_dict)
            return ROAR(
                scenario=scenario,
                rng=seed,
                tae_runner=ta,
                tae_runner_kwargs=ta_kwargs,
            )

        backend_api = self._create_backend('test_fit_roar')

        X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')
        automl = autosklearn.automl.AutoML(
            backend=backend_api,
            time_left_for_this_task=20,
            per_run_time_limit=5,
            initial_configurations_via_metalearning=0,
            get_smac_object_callback=get_roar_object_callback,
            metric=accuracy,
        )
        setup_logger()
        automl._logger = get_logger('test_fit_roar')
        automl.fit(
            X_train, Y_train, task=MULTICLASS_CLASSIFICATION,
        )
        score = automl.score(X_test, Y_test)
        self.assertGreaterEqual(score, 0.8)
        self.assertGreater(self._count_succeses(automl.cv_results_), 0)
        self.assertEqual(automl._task, MULTICLASS_CLASSIFICATION)

        del automl
        self._tearDown(backend_api.temporary_directory)
        self._tearDown(backend_api.output_directory)
Example #21
def test_do_dummy_prediction(backend, dask_client, datasets):

    name, task = datasets

    X_train, Y_train, X_test, Y_test = putil.get_dataset(name)
    datamanager = XYDataManager(
        X_train,
        Y_train,
        X_test,
        Y_test,
        task=task,
        dataset_name=name,
        feat_type=None,
    )

    auto = autosklearn.automl.AutoML(
        backend,
        20,
        5,
        initial_configurations_via_metalearning=25,
        metric=accuracy,
        dask_client=dask_client,
    )
    setup_logger(backend.temporary_directory)
    auto._logger = get_logger('test_do_dummy_predictions')

    auto._backend.save_datamanager(datamanager)
    D = backend.load_datamanager()

    # Check if the data manager is correctly loaded
    assert D.info['task'] == datamanager.info['task']
    auto._do_dummy_prediction(D, 1)

    # Ensure that the dummy predictions are not in the current working
    # directory, but in the temporary directory.
    assert not os.path.exists(os.path.join(os.getcwd(), '.auto-sklearn'))
    assert os.path.exists(
        os.path.join(backend.temporary_directory, '.auto-sklearn', 'runs',
                     '1_1_0.0', 'predictions_ensemble_1_1_0.0.npy'))

    del auto
Example #22
    def __init__(self, backend, dataset_name, task_type, metric,
                 limit, ensemble_size=None, ensemble_nbest=None,
                 seed=1, shared_mode=False, max_iterations=-1, precision="32",
                 low_precision=True):
        super(EnsembleBuilder, self).__init__()

        self.backend = backend
        self.dataset_name = dataset_name
        self.task_type = task_type
        self.metric = metric
        self.limit = limit
        self.ensemble_size = ensemble_size
        self.ensemble_nbest = ensemble_nbest
        self.seed = seed
        self.shared_mode = shared_mode
        self.max_iterations = max_iterations
        self.precision = precision
        self.low_precision = low_precision

        logger_name = 'EnsembleBuilder(%d):%s' % (self.seed, self.dataset_name)
        self.logger = get_logger(logger_name)
Example #23
def suggest_via_metalearning(meta_base, dataset_name, metric, task, sparse,
                             num_initial_configurations):
    logger = get_logger('autosklearn.metalearning.mismbo')

    if task == MULTILABEL_CLASSIFICATION:
        task = MULTICLASS_CLASSIFICATION

    task = TASK_TYPES_TO_STRING[task]

    logger.info(task)

    start = time.time()
    ml = MetaLearningOptimizer(
        dataset_name=dataset_name,
        configuration_space=meta_base.configuration_space,
        meta_base=meta_base,
        distance='l1',
        seed=1,
    )
    logger.info('Reading meta-data took %5.2f seconds', time.time() - start)
    runs = ml.metalearning_suggest_all(exclude_double_configurations=True)
    return runs[:num_initial_configurations]
Example #24
def test_fit_roar(dask_client_single_worker, backend):
    def get_roar_object_callback(scenario_dict, seed, ta, ta_kwargs,
                                 dask_client, n_jobs, **kwargs):
        """Random online adaptive racing.

        http://ml.informatik.uni-freiburg.de/papers/11-LION5-SMAC.pdf"""
        scenario = Scenario(scenario_dict)
        return ROAR(
            scenario=scenario,
            rng=seed,
            tae_runner=ta,
            tae_runner_kwargs=ta_kwargs,
            dask_client=dask_client,
            n_jobs=n_jobs,
        )

    X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')
    automl = autosklearn.automl.AutoML(
        backend=backend,
        time_left_for_this_task=30,
        per_run_time_limit=5,
        initial_configurations_via_metalearning=0,
        get_smac_object_callback=get_roar_object_callback,
        metric=accuracy,
        dask_client=dask_client_single_worker,
    )
    setup_logger()
    automl._logger = get_logger('test_fit_roar')
    automl.fit(
        X_train,
        Y_train,
        task=MULTICLASS_CLASSIFICATION,
    )
    score = automl.score(X_test, Y_test)
    assert score > 0.8
    assert count_succeses(automl.cv_results_) > 0
    assert automl._task == MULTICLASS_CLASSIFICATION

    del automl
Example #25
    def __init__(self,
                 temporary_directory,
                 output_directory,
                 delete_tmp_folder_after_terminate,
                 delete_output_folder_after_terminate,
                 shared_mode=False):

        # Check that the names of tmp_dir and output_dir are not the same.
        if temporary_directory == output_directory \
            and temporary_directory is not None:
            raise ValueError("The temporary and the output directory "
                             "must be different.")

        self.delete_tmp_folder_after_terminate = delete_tmp_folder_after_terminate
        self.delete_output_folder_after_terminate = delete_output_folder_after_terminate
        self.shared_mode = shared_mode
        # attributes to check that directories were created by autosklearn.
        self._tmp_dir_created = False
        self._output_dir_created = False

        self._prepare_directories(temporary_directory, output_directory)
        self._logger = logging.get_logger(__name__)
        self.create_directories()
Example #26
    def __init__(self, autosklearn_tmp_dir, dataset_name, task_type, metric,
                 limit, output_dir, ensemble_size=None, ensemble_nbest=None,
                 seed=1, shared_mode=False, max_iterations=-1, precision="32",
                 low_precision=True):
        super(EnsembleBuilder, self).__init__()

        self.autosklearn_tmp_dir = autosklearn_tmp_dir
        self.dataset_name = dataset_name
        self.task_type = task_type
        self.metric = metric
        self.limit = limit
        self.output_dir = output_dir
        self.ensemble_size = ensemble_size
        self.ensemble_nbest = ensemble_nbest
        self.seed = seed
        self.shared_mode = shared_mode
        self.max_iterations = max_iterations
        self.precision = precision
        self.low_precision = low_precision

        logger_name = 'EnsembleBuilder(%d):%s' % (self.seed, self.dataset_name)
        setup_logger(os.path.join(self.autosklearn_tmp_dir,
                                  '%s.log' % str(logger_name)))
        self.logger = get_logger(logger_name)
Example #27
def test_exceptions_inside_log_in_smbo(smbo_run_mock, backend, dask_client):

    automl = autosklearn.automl.AutoML(
        backend,
        20,
        5,
        metric=accuracy,
        dask_client=dask_client,
    )

    output_file = 'test_exceptions_inside_log.log'
    setup_logger(output_file=output_file)
    logger = get_logger('test_exceptions_inside_log')

    # Create a custom exception to prevent other errors from slipping in
    class MyException(Exception):
        pass

    X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')
    # The first call is on dummy predictor failure
    message = str(np.random.randint(100)) + '_run_smbo'
    smbo_run_mock.side_effect = MyException(message)

    with unittest.mock.patch('autosklearn.automl.AutoML._get_logger') as mock:
        mock.return_value = logger
        with pytest.raises(MyException):
            automl.fit(
                X_train,
                Y_train,
                task=MULTICLASS_CLASSIFICATION,
            )
        with open(output_file) as f:
            assert message in f.read()

    # Cleanup
    os.unlink(output_file)
Example #28
    def __init__(
        self,
        config_space,
        dataset_name,
        backend,
        total_walltime_limit,
        func_eval_time_limit,
        memory_limit,
        metric,
        watcher,
        n_jobs,
        dask_client: dask.distributed.Client,
        start_num_run=1,
        data_memory_limit=None,
        num_metalearning_cfgs=25,
        config_file=None,
        seed=1,
        metadata_directory=None,
        resampling_strategy='holdout',
        resampling_strategy_args=None,
        include_estimators=None,
        exclude_estimators=None,
        include_preprocessors=None,
        exclude_preprocessors=None,
        disable_file_output=False,
        smac_scenario_args=None,
        get_smac_object_callback=None,
        scoring_functions=None,
        ensemble_callback: typing.Optional[EnsembleBuilderManager] = None,
    ):
        super(AutoMLSMBO, self).__init__()
        # data related
        self.dataset_name = dataset_name
        self.datamanager = None
        self.metric = metric
        self.task = None
        self.backend = backend

        # the configuration space
        self.config_space = config_space

        # the number of parallel workers/jobs
        self.n_jobs = n_jobs
        self.dask_client = dask_client

        # Evaluation
        self.resampling_strategy = resampling_strategy
        if resampling_strategy_args is None:
            resampling_strategy_args = {}
        self.resampling_strategy_args = resampling_strategy_args

        # and a bunch of useful limits
        self.worst_possible_result = get_cost_of_crash(self.metric)
        self.total_walltime_limit = int(total_walltime_limit)
        self.func_eval_time_limit = int(func_eval_time_limit)
        self.memory_limit = memory_limit
        self.data_memory_limit = data_memory_limit
        self.watcher = watcher
        self.num_metalearning_cfgs = num_metalearning_cfgs
        self.config_file = config_file
        self.seed = seed
        self.metadata_directory = metadata_directory
        self.start_num_run = start_num_run
        self.include_estimators = include_estimators
        self.exclude_estimators = exclude_estimators
        self.include_preprocessors = include_preprocessors
        self.exclude_preprocessors = exclude_preprocessors
        self.disable_file_output = disable_file_output
        self.smac_scenario_args = smac_scenario_args
        self.get_smac_object_callback = get_smac_object_callback
        self.scoring_functions = scoring_functions

        self.ensemble_callback = ensemble_callback

        dataset_name_ = "" if dataset_name is None else dataset_name
        logger_name = '%s(%d):%s' % (self.__class__.__name__, self.seed,
                                     ":" + dataset_name_)
        self.logger = get_logger(logger_name)
Example #29
    def __init__(
            self,
            backend: Backend,
            dataset_name: str,
            task_type: int,
            metric: str,
            limit: int,
            ensemble_size: int=10,
            ensemble_nbest: int=100,
            seed: int=1,
            shared_mode: bool=False,
            max_iterations: int=None,
            precision: str="32",
            sleep_duration: int=2,
            memory_limit: int=1000,
            read_at_most: int=5,
    ):
        """
            Constructor
            
            Parameters
            ----------
            backend: util.backend.Backend
                backend to write and read files
            dataset_name: str
                name of dataset
            task_type: int
                type of ML task
            metric: str
                name of metric to score predictions
            limit: int
                time limit in sec
            ensemble_size: int
                maximal size of ensemble (passed to autosklearn.ensemble.ensemble_selection)
            ensemble_nbest: int
                consider only the n best predictions (w.r.t. validation predictions)
            seed: int
                random seed
                if set to -1, read files with any seed (e.g., for shared model mode)
            shared_mode: bool
                whether auto-sklearn uses shared model mode (aka pSMAC)
            max_iterations: int
                maximal number of iterations to run this script
                (default None --> deactivated)
            precision: ["16","32","64","128"]
                precision of floats to read the predictions 
            sleep_duration: int
                duration of sleeping time between two iterations of this script (in sec)
            memory_limit: int
                memory limit in mb
            read_at_most: int 
                read at most n new prediction files in each iteration
        """

        super(EnsembleBuilder, self).__init__()

        self.backend = backend  # communication with filesystem
        self.dataset_name = dataset_name
        self.task_type = task_type
        self.metric = metric
        self.time_limit = limit  # time limit
        self.ensemble_size = ensemble_size
        self.ensemble_nbest = ensemble_nbest  # max number of members that will be used for building the ensemble
        self.seed = seed
        self.shared_mode = shared_mode  # pSMAC?
        self.max_iterations = max_iterations
        self.precision = precision
        self.sleep_duration = sleep_duration
        self.memory_limit = memory_limit
        self.read_at_most = read_at_most
        
        # part of the original training set
        # used to build the ensemble
        self.dir_ensemble = os.path.join(
            self.backend.temporary_directory,
            '.auto-sklearn',
            'predictions_ensemble',
        )

        # validation set (public test set) -- y_true not known
        self.dir_valid = os.path.join(
            self.backend.temporary_directory,
            '.auto-sklearn',
            'predictions_valid',
        )
        # test set (private test set) -- y_true not known
        self.dir_test = os.path.join(
            self.backend.temporary_directory,
            '.auto-sklearn',
            'predictions_test',
        )

        logger_name = 'EnsembleBuilder(%d):%s' % (self.seed, self.dataset_name)
        self.logger = get_logger(logger_name)

        self.start_time = 0
        self.model_fn_re = re.compile(r'_([0-9]*)_([0-9]*)\.npy')
        
        # already read prediction files
        # {"file name": {
        #    "ens_score": float
        #    "mtime_ens": str,
        #    "mtime_valid": str,
        #    "mtime_test": str,
        #    "seed": int,
        #    "num_run": int,
        #    Y_ENSEMBLE: np.ndarray
        #    Y_VALID: np.ndarray
        #    Y_TEST: np.ndarray
        # }
        self.read_preds = {}
        self.last_hash = None  # hash of ensemble training data
        self.y_true_ensemble = None
        self.SAVE2DISC = True

        self.validation_performance_ = np.inf
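The three prediction directories built above resolve to the following paths; a small sketch with an illustrative temporary directory (the paths follow directly from the os.path.join calls in the snippet):

import os

temporary_directory = '/tmp/autosklearn_tmp'
for sub in ('predictions_ensemble', 'predictions_valid', 'predictions_test'):
    print(os.path.join(temporary_directory, '.auto-sklearn', sub))
# /tmp/autosklearn_tmp/.auto-sklearn/predictions_ensemble
# /tmp/autosklearn_tmp/.auto-sklearn/predictions_valid
# /tmp/autosklearn_tmp/.auto-sklearn/predictions_test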
Example #30
    def __init__(
        self,
        backend: Backend,
        dataset_name: str,
        task_type: int,
        metric: str,
        limit: int,
        ensemble_size: int = 10,
        ensemble_nbest: int = 100,
        seed: int = 1,
        shared_mode: bool = False,
        max_iterations: int = None,
        precision: str = "32",
        sleep_duration: int = 2,
        memory_limit: int = 1000,
        read_at_most: int = 5,
    ):
        """
            Constructor
            
            Parameters
            ----------
            backend: util.backend.Backend
                backend to write and read files
            dataset_name: str
                name of dataset
            task_type: int
                type of ML task
            metric: str
                name of metric to score predictions
            limit: int
                time limit in sec
            ensemble_size: int
                maximal size of ensemble (passed to autosklearn.ensemble.ensemble_selection)
            ensemble_nbest: int
                consider only the n best predictions (w.r.t. validation predictions)
            seed: int
                random seed
                if set to -1, read files with any seed (e.g., for shared model mode)
            shared_mode: bool
                whether auto-sklearn uses shared model mode (aka pSMAC)
            max_iterations: int
                maximal number of iterations to run this script
                (default None --> deactivated)
            precision: ["16","32","64","128"]
                precision of floats to read the predictions 
            sleep_duration: int
                duration of sleeping time between two iterations of this script (in sec)
            memory_limit: int
                memory limit in mb
            read_at_most: int 
                read at most n new prediction files in each iteration
        """

        super(EnsembleBuilder, self).__init__()

        self.backend = backend  # communication with filesystem
        self.dataset_name = dataset_name
        self.task_type = task_type
        self.metric = metric
        self.time_limit = limit  # time limit
        self.ensemble_size = ensemble_size
        self.ensemble_nbest = ensemble_nbest  # max number of members that will be used for building the ensemble
        self.seed = seed
        self.shared_mode = shared_mode  # pSMAC?
        self.max_iterations = max_iterations
        self.precision = precision
        self.sleep_duration = sleep_duration
        self.memory_limit = memory_limit
        self.read_at_most = read_at_most

        # part of the original training set
        # used to build the ensemble
        self.dir_ensemble = os.path.join(
            self.backend.temporary_directory,
            '.auto-sklearn',
            'predictions_ensemble',
        )

        # validation set (public test set) -- y_true not known
        self.dir_valid = os.path.join(
            self.backend.temporary_directory,
            '.auto-sklearn',
            'predictions_valid',
        )
        # test set (private test set) -- y_true not known
        self.dir_test = os.path.join(
            self.backend.temporary_directory,
            '.auto-sklearn',
            'predictions_test',
        )

        logger_name = 'EnsembleBuilder(%d):%s' % (self.seed, self.dataset_name)
        self.logger = get_logger(logger_name)

        self.start_time = 0
        self.model_fn_re = re.compile(r'_([0-9]*)_([0-9]*)\.npy')

        # already read prediction files
        # {"file name": {
        #    "ens_score": float
        #    "mtime_ens": str,
        #    "mtime_valid": str,
        #    "mtime_test": str,
        #    "seed": int,
        #    "num_run": int,
        #    Y_ENSEMBLE: np.ndarray
        #    Y_VALID: np.ndarray
        #    Y_TEST: np.ndarray
        # }
        self.read_preds = {}
        self.last_hash = None  # hash of ensemble training data
        self.y_true_ensemble = None
        self.SAVE2DISC = True

        self.validation_performance_ = np.inf
Example #31
def calculate_all_metafeatures(X,
                               y,
                               categorical,
                               dataset_name,
                               calculate=None,
                               dont_calculate=None,
                               densify_threshold=1000):
    """Calculate all metafeatures."""
    logger = get_logger(__name__)
    helper_functions.clear()
    metafeatures.clear()
    mf_ = dict()

    visited = set()
    to_visit = deque()
    to_visit.extend(metafeatures)

    X_transformed = None
    y_transformed = None

    # TODO calculate the numpy metafeatures after all others to consume less
    # memory
    while len(to_visit) > 0:
        name = to_visit.pop()
        if calculate is not None and name not in calculate:
            continue
        if dont_calculate is not None and name in dont_calculate:
            continue

        if name in npy_metafeatures:
            if X_transformed is None:
                # TODO make sure this is done as efficiently as possible (no copy for
                # sparse matrices because of wrong sparse format)
                sparse = scipy.sparse.issparse(X)
                DPP = DataPreprocessor(categorical_features=categorical,
                                       force_sparse_output=True)
                X_transformed = DPP.fit_transform(X)
                categorical_transformed = [False] * X_transformed.shape[1]

                # Densify the transformed matrix
                if not sparse and scipy.sparse.issparse(X_transformed):
                    bytes_per_float = X_transformed.dtype.itemsize
                    num_elements = (X_transformed.shape[0] *
                                    X_transformed.shape[1])
                    megabytes_required = num_elements * bytes_per_float / 1000 / 1000
                    if megabytes_required < densify_threshold:
                        X_transformed = X_transformed.todense()

                # This is not only important for datasets which are somehow
                # sorted in a strange way, but also prevents lda from failing in
                # some cases.
                # Because this is advanced indexing, a copy of the data is returned!!!
                X_transformed = check_array(X_transformed,
                                            force_all_finite=True,
                                            accept_sparse='csr')
                rs = np.random.RandomState(42)
                indices = np.arange(X_transformed.shape[0])
                rs.shuffle(indices)
                # TODO Shuffle inplace
                X_transformed = X_transformed[indices]
                y_transformed = y[indices]

            X_ = X_transformed
            y_ = y_transformed
            categorical_ = categorical_transformed
        else:
            X_ = X
            y_ = y
            categorical_ = categorical

        dependency = metafeatures.get_dependency(name)
        if dependency is not None:
            is_metafeature = dependency in metafeatures
            is_helper_function = dependency in helper_functions

            if is_metafeature and is_helper_function:
                raise NotImplementedError()
            elif not is_metafeature and not is_helper_function:
                raise ValueError(dependency)
            elif is_metafeature and not metafeatures.is_calculated(dependency):
                to_visit.appendleft(name)
                continue
            elif is_helper_function and not helper_functions.is_calculated(
                    dependency):
                logger.info("%s: Going to calculate: %s", dataset_name,
                            dependency)
                value = helper_functions[dependency](X_, y_, categorical_)
                helper_functions.set_value(dependency, value)
                mf_[dependency] = value

        logger.info("%s: Going to calculate: %s", dataset_name, name)

        value = metafeatures[name](X_, y_, categorical_)
        metafeatures.set_value(name, value)
        mf_[name] = value
        visited.add(name)

    mf_ = DatasetMetafeatures(dataset_name, mf_)
    return mf_
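A self-contained sketch of the dependency-resolution pattern in the loop above: names are popped from the right of a deque, and a name whose dependency has not been computed yet is pushed back on the left so it is retried later (the feature names and compute function are illustrative):

from collections import deque

dependencies = {'class_occurrences': None, 'kurtosis': 'class_occurrences'}
computed = {}

def compute(name):
    return 'value-of-%s' % name

to_visit = deque(dependencies)
while to_visit:
    name = to_visit.pop()
    dep = dependencies[name]
    if dep is not None and dep not in computed:
        to_visit.appendleft(name)   # dependency missing: retry this name later
        continue
    computed[name] = compute(name)

print(computed)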
Example #32
    def __init__(self, backend, queue, metric,
                 configuration=None,
                 all_scoring_functions=False,
                 seed=1,
                 output_y_hat_optimization=True,
                 num_run=None,
                 subsample=None,
                 include=None,
                 exclude=None,
                 disable_file_output=False,
                 init_params=None):

        self.starttime = time.time()

        self.configuration = configuration
        self.backend = backend
        self.queue = queue

        self.datamanager = self.backend.load_datamanager()
        self.include = include
        self.exclude = exclude

        self.X_valid = self.datamanager.data.get('X_valid')
        self.y_valid = self.datamanager.data.get('Y_valid')
        self.X_test = self.datamanager.data.get('X_test')
        self.y_test = self.datamanager.data.get('Y_test')

        self.metric = metric
        self.task_type = self.datamanager.info['task']
        self.seed = seed

        self.output_y_hat_optimization = output_y_hat_optimization
        self.all_scoring_functions = all_scoring_functions
        self.disable_file_output = disable_file_output

        if self.task_type in REGRESSION_TASKS:
            if not isinstance(self.configuration, Configuration):
                self.model_class = MyDummyRegressor
            else:
                self.model_class = \
                    autosklearn.pipeline.regression.SimpleRegressionPipeline
            self.predict_function = self._predict_regression
        else:
            if not isinstance(self.configuration, Configuration):
                self.model_class = MyDummyClassifier
            else:
                self.model_class = (
                    autosklearn.pipeline.classification.
                        SimpleClassificationPipeline
                )
            self.predict_function = self._predict_proba

        categorical_mask = []
        for feat in self.datamanager.feat_type:
            if feat.lower() == 'numerical':
                categorical_mask.append(False)
            elif feat.lower() == 'categorical':
                categorical_mask.append(True)
            else:
                raise ValueError(feat)
        if np.sum(categorical_mask) > 0:
            self._init_params = {
                'categorical_encoding:one_hot_encoding:categorical_features':
                    categorical_mask
            }
        else:
            self._init_params = {}
        if init_params is not None:
            self._init_params.update(init_params)

        if num_run is None:
            num_run = 0
        self.num_run = num_run

        self.subsample = subsample

        logger_name = '%s(%d):%s' % (self.__class__.__name__.split('.')[-1],
                                     self.seed, self.datamanager.name)
        self.logger = get_logger(logger_name)

        self.Y_optimization = None
        self.Y_actual_train = None
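A small sketch of the categorical-mask handling above, with an illustrative feat_type list; it is simplified in that unknown feature types are not rejected here:

feat_type = ['numerical', 'categorical', 'numerical']
categorical_mask = [feat.lower() == 'categorical' for feat in feat_type]   # -> [False, True, False]
init_params = (
    {'categorical_encoding:one_hot_encoding:categorical_features': categorical_mask}
    if any(categorical_mask) else {}
)
print(init_params)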
Example #33
def calculate_all_metafeatures(X, y, categorical, dataset_name,
        calculate=None, dont_calculate=None, densify_threshold=1000):
    """Calculate all metafeatures."""
    logger = get_logger(__name__)
    helper_functions.clear()
    metafeatures.clear()
    mf_ = dict()

    visited = set()
    to_visit = deque()
    to_visit.extend(metafeatures)

    X_transformed = None
    y_transformed = None

    # TODO calculate the numpy metafeatures after all others to consume less
    # memory
    while len(to_visit) > 0:
        name = to_visit.pop()
        if calculate is not None and name not in calculate:
            continue
        if dont_calculate is not None and name in dont_calculate:
            continue

        if name in npy_metafeatures:
            if X_transformed is None:
                # TODO make sure this is done as efficiently as possible (no copy for
                # sparse matrices because of wrong sparse format)
                sparse = scipy.sparse.issparse(X)
                ohe = OneHotEncoder(categorical_features=categorical, sparse=True)
                X_transformed = ohe.fit_transform(X)
                imputer = Imputer(strategy='mean', copy=False, dtype=X.dtype)
                X_transformed = imputer.fit_transform(X_transformed)
                standard_scaler = StandardScaler(copy=False)
                X_transformed = standard_scaler.fit_transform(X_transformed)

                # Transform the array which indicates the categorical features
                number_numerical = np.sum(~np.array(categorical))
                categorical_transformed = [True] * (X_transformed.shape[1] -
                                                    number_numerical) + \
                                          [False] * number_numerical

                # Densify the transformed matrix
                if not sparse and scipy.sparse.issparse(X_transformed):
                    bytes_per_float = X_transformed.dtype.itemsize
                    num_elements = X_transformed.shape[0] * X_transformed.shape[1]
                    megabytes_required = num_elements * bytes_per_float / 1000 / 1000
                    if megabytes_required < densify_threshold:
                        X_transformed = X_transformed.todense()

                # This is not only important for datasets which are somehow
                # sorted in a strange way, but also prevents lda from failing in
                # some cases.
                # Because this is advanced indexing, a copy of the data is returned!!!
                X_transformed = check_array(X_transformed,
                                            force_all_finite=True,
                                            accept_sparse='csr')
                rs = np.random.RandomState(42)
                indices = np.arange(X_transformed.shape[0])
                rs.shuffle(indices)
                # TODO Shuffle inplace
                X_transformed = X_transformed[indices]
                y_transformed = y[indices]

            X_ = X_transformed
            y_ = y_transformed
            categorical_ = categorical_transformed
        else:
            X_ = X
            y_ = y
            categorical_ = categorical

        dependency = metafeatures.get_dependency(name)
        if dependency is not None:
            is_metafeature = dependency in metafeatures
            is_helper_function = dependency in helper_functions

            if is_metafeature and is_helper_function:
                raise NotImplementedError()
            elif not is_metafeature and not is_helper_function:
                raise ValueError(dependency)
            elif is_metafeature and not metafeatures.is_calculated(dependency):
                to_visit.appendleft(name)
                continue
            elif is_helper_function and not helper_functions.is_calculated(
                    dependency):
                logger.info("%s: Going to calculate: %s", dataset_name,
                            dependency)
                value = helper_functions[dependency](X_, y_, categorical_)
                helper_functions.set_value(dependency, value)
                mf_[dependency] = value

        logger.info("%s: Going to calculate: %s", dataset_name,
                    name)

        value = metafeatures[name](X_, y_, categorical_)
        metafeatures.set_value(name, value)
        mf_[name] = value
        visited.add(name)

    mf_ = DatasetMetafeatures(dataset_name, mf_)
    return mf_
Example #34
 def __init__(self):
     self.logger = get_logger(__name__)
Example #35
    def __init__(self,
                 backend,
                 queue,
                 metric,
                 configuration=None,
                 all_scoring_functions=False,
                 seed=1,
                 output_y_hat_optimization=True,
                 num_run=None,
                 include=None,
                 exclude=None,
                 disable_file_output=False,
                 init_params=None,
                 budget=None,
                 budget_type=None):

        self.starttime = time.time()

        self.configuration = configuration
        self.backend = backend
        self.queue = queue

        self.datamanager = self.backend.load_datamanager()
        self.include = include
        self.exclude = exclude

        self.X_valid = self.datamanager.data.get('X_valid')
        self.y_valid = self.datamanager.data.get('Y_valid')
        self.X_test = self.datamanager.data.get('X_test')
        self.y_test = self.datamanager.data.get('Y_test')

        self.metric = metric
        self.task_type = self.datamanager.info['task']
        self.seed = seed

        self.output_y_hat_optimization = output_y_hat_optimization
        self.all_scoring_functions = all_scoring_functions

        if isinstance(disable_file_output, (bool, list)):
            self.disable_file_output = disable_file_output
        else:
            raise ValueError(
                'disable_file_output should be either a bool or a list')

        if self.task_type in REGRESSION_TASKS:
            if not isinstance(self.configuration, Configuration):
                self.model_class = MyDummyRegressor
            else:
                self.model_class = \
                    autosklearn.pipeline.regression.SimpleRegressionPipeline
            self.predict_function = self._predict_regression
        else:
            if not isinstance(self.configuration, Configuration):
                self.model_class = MyDummyClassifier
            else:
                self.model_class = autosklearn.pipeline.classification.SimpleClassificationPipeline
            self.predict_function = self._predict_proba

        categorical_mask = []
        for feat in self.datamanager.feat_type:
            if feat.lower() == 'numerical':
                categorical_mask.append(False)
            elif feat.lower() == 'categorical':
                categorical_mask.append(True)
            else:
                raise ValueError(feat)
        if np.sum(categorical_mask) > 0:
            self._init_params = {
                'data_preprocessing:categorical_features': categorical_mask
            }
        else:
            self._init_params = {}
        if init_params is not None:
            self._init_params.update(init_params)

        if num_run is None:
            num_run = 0
        self.num_run = num_run

        logger_name = '%s(%d):%s' % (self.__class__.__name__.split('.')[-1],
                                     self.seed, self.datamanager.name)
        self.logger = get_logger(logger_name)

        self.Y_optimization = None
        self.Y_actual_train = None

        self.budget = budget
        self.budget_type = budget_type
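
The constructor above turns the data manager's feat_type strings into a boolean mask (True for categorical columns) and forwards it to the pipeline's data preprocessing step only when at least one categorical column exists. A standalone sketch of that logic, assuming a hypothetical feat_type list:

# Hypothetical feature types; anything other than 'numerical'/'categorical' is rejected.
feat_type = ['Numerical', 'Categorical', 'Numerical']

categorical_mask = []
for feat in feat_type:
    if feat.lower() == 'numerical':
        categorical_mask.append(False)
    elif feat.lower() == 'categorical':
        categorical_mask.append(True)
    else:
        raise ValueError(feat)

# Only forward the mask when there is at least one categorical column.
init_params = (
    {'data_preprocessing:categorical_features': categorical_mask}
    if any(categorical_mask) else {}
)
print(init_params)  # {'data_preprocessing:categorical_features': [False, True, False]}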
Example #36
import os
from io import StringIO
import time

import numpy as np

import pyMetaLearn.metafeatures.metafeatures as metafeatures
import pyMetaLearn.optimizers.metalearn_optimizer.metalearner as \
    metalearner

from autosklearn.util import logging_
from autosklearn.constants import *
logger = logging_.get_logger(__name__)


class MetaLearning(object):
    """Right now, pyMetaLearn performs a OneHotEncoding if necessary, but it
    is really not necessary. This object helps to circumvent this by:

    1. call metafeatures.calculate_all_metafeatures() only for the
        metafeatures which do not need OneHotEncoded data
    2. Allows the caller to then perform a OneHotEncoding
    3. call metafeatures.calculate_metafeatures_encoded_labels() for all
        other metafeatures need OneHotEncoded data.
    """
    def __init__(self):
        self._sentinel = "uiaeo"
        self._metafeatures_encoded_labels = None
        self._metafeatures_labels = None
        # Hard-coded list of too-expensive metafeatures!
        self._exclude_metafeatures = set([
Example #37
    def test_fail_if_dummy_prediction_fails(self, ta_run_mock):
        backend_api = self._create_backend('test_fail_if_dummy_prediction_fails')

        X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')
        datamanager = XYDataManager(
            X_train, Y_train,
            X_test, Y_test,
            task=2,
            feat_type=['Numerical' for i in range(X_train.shape[1])],
            dataset_name='iris',
        )

        time_for_this_task = 30
        per_run_time = 10
        auto = autosklearn.automl.AutoML(backend_api,
                                         time_for_this_task,
                                         per_run_time,
                                         initial_configurations_via_metalearning=25,
                                         metric=accuracy,
                                         )
        setup_logger()
        auto._logger = get_logger('test_fail_if_dummy_prediction_fails')
        auto._backend._make_internals_directory()
        auto._backend.save_datamanager(datamanager)

        # First of all, check that ta.run() is actually called.
        ta_run_mock.return_value = StatusType.SUCCESS, None, None, "test"
        auto._do_dummy_prediction(datamanager, 1)
        ta_run_mock.assert_called_once_with(1, cutoff=time_for_this_task)

        # Case 1. Check that the function raises no error when statustype == success.
        # ta.run() returns status, cost, runtime, and additional info.
        ta_run_mock.return_value = StatusType.SUCCESS, None, None, "test"
        raised = False
        try:
            auto._do_dummy_prediction(datamanager, 1)
        except ValueError:
            raised = True
        self.assertFalse(raised, 'Exception raised')

        # Case 2. Check that if statustype returned by ta.run() != success,
        # the function raises an error.
        ta_run_mock.return_value = StatusType.CRASHED, None, None, "test"
        self.assertRaisesRegex(ValueError,
                               'Dummy prediction failed with run state StatusType.CRASHED '
                               'and additional output: test.',
                               auto._do_dummy_prediction,
                               datamanager, 1,
                               )
        ta_run_mock.return_value = StatusType.ABORT, None, None, "test"
        self.assertRaisesRegex(ValueError,
                               'Dummy prediction failed with run state StatusType.ABORT '
                               'and additional output: test.',
                               auto._do_dummy_prediction,
                               datamanager, 1,
                               )
        ta_run_mock.return_value = StatusType.TIMEOUT, None, None, "test"
        self.assertRaisesRegex(ValueError,
                               'Dummy prediction failed with run state StatusType.TIMEOUT '
                               'and additional output: test.',
                               auto._do_dummy_prediction,
                               datamanager, 1,
                               )
        ta_run_mock.return_value = StatusType.MEMOUT, None, None, "test"
        self.assertRaisesRegex(ValueError,
                               'Dummy prediction failed with run state StatusType.MEMOUT '
                               'and additional output: test.',
                               auto._do_dummy_prediction,
                               datamanager, 1,
                               )
        ta_run_mock.return_value = StatusType.CAPPED, None, None, "test"
        self.assertRaisesRegex(ValueError,
                               'Dummy prediction failed with run state StatusType.CAPPED '
                               'and additional output: test.',
                               auto._do_dummy_prediction,
                               datamanager, 1,
                               )

        self._tearDown(backend_api.temporary_directory)
        self._tearDown(backend_api.output_directory)
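
The five assertRaisesRegex blocks above follow a single pattern: patch the target-algorithm runner, force a non-success status, and assert on the error message. Below is a self-contained toy sketch of that pattern; run_ta, do_dummy_prediction and the test class are hypothetical stand-ins, not the project's code.

from unittest import mock
import unittest


def run_ta():
    return 'SUCCESS'


def do_dummy_prediction():
    status = run_ta()
    if status != 'SUCCESS':
        raise ValueError('Dummy prediction failed with run state %s '
                         'and additional output: test.' % status)


class DummyPredictionTest(unittest.TestCase):
    @mock.patch(__name__ + '.run_ta')
    def test_failure_raises(self, ta_run_mock):
        # Drive the mocked collaborator through each failure status in turn.
        for status in ('CRASHED', 'ABORT', 'TIMEOUT', 'MEMOUT', 'CAPPED'):
            with self.subTest(status=status):
                ta_run_mock.return_value = status
                self.assertRaisesRegex(
                    ValueError,
                    'Dummy prediction failed with run state %s' % status,
                    do_dummy_prediction,
                )


if __name__ == '__main__':
    unittest.main()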
Example #38
def test_fail_if_dummy_prediction_fails(ta_run_mock, backend, dask_client):

    X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')
    datamanager = XYDataManager(
        X_train,
        Y_train,
        X_test,
        Y_test,
        task=2,
        feat_type=['Numerical' for i in range(X_train.shape[1])],
        dataset_name='iris',
    )

    time_for_this_task = 30
    per_run_time = 10
    auto = autosklearn.automl.AutoML(
        backend,
        time_for_this_task,
        per_run_time,
        initial_configurations_via_metalearning=25,
        metric=accuracy,
        dask_client=dask_client,
    )
    setup_logger()
    auto._logger = get_logger('test_fail_if_dummy_prediction_fails')
    auto._backend._make_internals_directory()
    auto._backend.save_datamanager(datamanager)

    # First of all, check that ta.run() is actually called.
    ta_run_mock.return_value = StatusType.SUCCESS, None, None, {}
    auto._do_dummy_prediction(datamanager, 1)
    ta_run_mock.assert_called_once_with(1, cutoff=time_for_this_task)

    # Case 1. Check that the function raises no error when statustype == success.
    # ta.run() returns status, cost, runtime, and additional info.
    ta_run_mock.return_value = StatusType.SUCCESS, None, None, {}
    raised = False
    try:
        auto._do_dummy_prediction(datamanager, 1)
    except ValueError:
        raised = True
    assert not raised, 'Exception raised'

    # Case 2. Check that if statustype returned by ta.run() != success,
    # the function raises an error.
    ta_run_mock.return_value = StatusType.CRASHED, None, None, {}
    with pytest.raises(
            ValueError,
            match=
            'Dummy prediction failed with run state StatusType.CRASHED and additional output: {}.'  # noqa
    ):
        auto._do_dummy_prediction(datamanager, 1)

    ta_run_mock.return_value = StatusType.ABORT, None, None, {}
    with pytest.raises(
            ValueError,
            match='Dummy prediction failed with run state StatusType.ABORT '
            'and additional output: {}.',
    ):
        auto._do_dummy_prediction(datamanager, 1)
    ta_run_mock.return_value = StatusType.TIMEOUT, None, None, {}
    with pytest.raises(
            ValueError,
            match='Dummy prediction failed with run state StatusType.TIMEOUT '
            'and additional output: {}.'):
        auto._do_dummy_prediction(datamanager, 1)
    ta_run_mock.return_value = StatusType.MEMOUT, None, None, {}
    with pytest.raises(
            ValueError,
            match='Dummy prediction failed with run state StatusType.MEMOUT '
            'and additional output: {}.',
    ):
        auto._do_dummy_prediction(datamanager, 1)
    ta_run_mock.return_value = StatusType.CAPPED, None, None, {}
    with pytest.raises(
            ValueError,
            match='Dummy prediction failed with run state StatusType.CAPPED '
            'and additional output: {}.'):
        auto._do_dummy_prediction(datamanager, 1)

    ta_run_mock.return_value = StatusType.CRASHED, None, None, {'exitcode': -6}
    with pytest.raises(
            ValueError,
            match=
            'The error suggests that the provided memory limits were too tight.',
    ):
        auto._do_dummy_prediction(datamanager, 1)
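
Note that the match argument of pytest.raises used above is treated as a regular expression and checked with re.search against the exception message, so literal metacharacters such as the '{}' in these messages are safest when escaped. A small self-contained sketch (the fail_with helper is hypothetical):

import re
import pytest


def fail_with(output):
    raise ValueError('Dummy prediction failed with run state CRASHED '
                     'and additional output: %s.' % output)


def test_match_is_a_regex():
    # A plain substring without metacharacters works as-is.
    with pytest.raises(ValueError, match='run state CRASHED'):
        fail_with('{}')

    # For an exact literal message, escape it first.
    expected = ('Dummy prediction failed with run state CRASHED '
                'and additional output: {}.')
    with pytest.raises(ValueError, match=re.escape(expected)):
        fail_with('{}')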
Example #39
    def __init__(self,
                 backend: Backend,
                 dataset_name: str,
                 task_type: int,
                 metric: Scorer,
                 limit: int,
                 ensemble_size: int = 10,
                 ensemble_nbest: int = 100,
                 max_models_on_disc: int = 100,
                 performance_range_threshold: float = 0,
                 seed: int = 1,
                 shared_mode: bool = False,
                 max_iterations: Optional[int] = None,
                 precision: int = 32,
                 sleep_duration: int = 2,
                 memory_limit: Optional[int] = 1024,
                 read_at_most: int = 5,
                 random_state: Optional[Union[int,
                                              np.random.RandomState]] = None,
                 queue: Optional[multiprocessing.Queue] = None):
        """
            Constructor

            Parameters
            ----------
            backend: util.backend.Backend
                backend to write and read files
            dataset_name: str
                name of dataset
            task_type: int
                type of ML task
            metric: Scorer
                metric used to score predictions
            limit: int
                time limit in sec
            ensemble_size: int
                maximal size of ensemble (passed to autosklearn.ensemble.ensemble_selection)
            ensemble_nbest: int/float
                if int: consider only the n best predictions
                if float: consider only this fraction of the best models
                both with respect to the validation predictions
                if performance_range_threshold > 0, fewer models may be kept
            max_models_on_disc: int
                Defines the maximum number of models that are kept on disk.
                If int, it must be greater than or equal to 1 and dictates the maximum
                number of models to keep.
                If float, it is interpreted as the maximum megabytes of disk space allowed.
                That is, if the ensemble candidates require more disk space than this value,
                the worst-performing models and their predictions are deleted to stay
                within this budget.
                If None, the feature is disabled.
                It defines an upper bound on the models that can be used in the ensemble.
            performance_range_threshold: float
                Keep only models that are better than:
                    dummy + (best - dummy) * performance_range_threshold
                E.g., dummy=2, best=4, threshold=0.5 --> only consider models with score > 3
                At most min(ensemble_nbest, max_models_on_disc) models are kept;
                fewer may be returned.
            seed: int
                random seed
                if set to -1, read files with any seed (e.g., for shared model mode)
            shared_mode: bool
                whether auto-sklearn uses shared model mode (aka pSMAC)
            max_iterations: int
                maximal number of iterations to run this script
                (default None --> deactivated)
            precision: [16,32,64,128]
                precision of floats to read the predictions
            sleep_duration: int
                duration of sleeping time between two iterations of this script (in sec)
            memory_limit: Optional[int]
                memory limit in mb. If ``None``, no memory limit is enforced.
            read_at_most: int
                read at most n new prediction files in each iteration
        """

        super(EnsembleBuilder, self).__init__()

        self.backend = backend  # communication with filesystem
        self.dataset_name = dataset_name
        self.task_type = task_type
        self.metric = metric
        self.time_limit = limit  # time limit
        self.ensemble_size = ensemble_size
        self.performance_range_threshold = performance_range_threshold

        if isinstance(ensemble_nbest, numbers.Integral) and ensemble_nbest < 1:
            raise ValueError("Integer ensemble_nbest has to be larger 1: %s" %
                             ensemble_nbest)
        elif not isinstance(ensemble_nbest, numbers.Integral):
            if ensemble_nbest < 0 or ensemble_nbest > 1:
                raise ValueError(
                    "Float ensemble_nbest best has to be >= 0 and <= 1: %s" %
                    ensemble_nbest)

        self.ensemble_nbest = ensemble_nbest

        # max_models_on_disc can be a float, in such case we need to
        # remember the user specified Megabytes and translate this to
        # max number of ensemble models. max_resident_models keeps the
        # maximum number of models in disc
        if max_models_on_disc is not None and max_models_on_disc < 0:
            raise ValueError(
                "max_models_on_disc has to be a positive number or None")
        self.max_models_on_disc = max_models_on_disc
        self.max_resident_models = None

        self.seed = seed
        self.shared_mode = shared_mode  # pSMAC?
        self.max_iterations = max_iterations
        self.precision = precision
        self.sleep_duration = sleep_duration
        self.memory_limit = memory_limit
        self.read_at_most = read_at_most
        self.random_state = check_random_state(random_state)

        # part of the original training set
        # used to build the ensemble
        self.dir_ensemble = os.path.join(
            self.backend.temporary_directory,
            '.auto-sklearn',
            'predictions_ensemble',
        )
        # validation set (public test set) -- y_true not known
        self.dir_valid = os.path.join(
            self.backend.temporary_directory,
            '.auto-sklearn',
            'predictions_valid',
        )
        # test set (private test set) -- y_true not known
        self.dir_test = os.path.join(
            self.backend.temporary_directory,
            '.auto-sklearn',
            'predictions_test',
        )
        self.dir_models = os.path.join(
            self.backend.temporary_directory,
            '.auto-sklearn',
            'models',
        )
        logger_name = 'EnsembleBuilder(%d):%s' % (self.seed, self.dataset_name)
        self.logger = get_logger(logger_name)
        if ensemble_nbest == 1:
            self.logger.debug(
                "Behaviour depends on int/float: %s, %s (ensemble_nbest, type)"
                % (ensemble_nbest, type(ensemble_nbest)))

        self.start_time = 0
        self.model_fn_re = re.compile(
            r'_([0-9]*)_([0-9]*)_([0-9]{1,3}\.[0-9]*)\.npy')

        # already read prediction files
        # {"file name": {
        #    "ens_score": float
        #    "mtime_ens": str,
        #    "mtime_valid": str,
        #    "mtime_test": str,
        #    "seed": int,
        #    "num_run": int,
        #    "deleted": bool,
        #    Y_ENSEMBLE: np.ndarray
        #    Y_VALID: np.ndarray
        #    Y_TEST: np.ndarray
        #    }
        # }
        self.read_preds = {}
        self.last_hash = None  # hash of ensemble training data
        self.y_true_ensemble = None
        self.SAVE2DISC = True

        # hidden feature which can be activated via an environment variable. This keeps all
        # models and predictions which have ever been a candidate. This is necessary to post-hoc
        # compute the whole ensemble building trajectory.
        self._has_been_candidate = set()

        self.validation_performance_ = np.inf

        # Track the ensemble performance
        self.datamanager = self.backend.load_datamanager()
        self.y_valid = self.datamanager.data.get('Y_valid')
        self.y_test = self.datamanager.data.get('Y_test')

        # Support for tracking the performance across time
        # A Queue is needed to handle multiprocessing, not only
        # internally for pynisher calls, but to return data
        # to the main process
        # Hence, because we are using three different processes,
        # the below strategy prevents MemoryErrors. That is,
        # without clearly isolating the queue with a manager,
        # we run into a threading MemoryError.
        if queue is None:
            mgr = multiprocessing.Manager()
            mgr.Namespace()
            self.queue = mgr.Queue()
        else:
            self.queue = queue
        self.queue.put([])
        self.queue.get()
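
The performance_range_threshold documented above defines a score cutoff between the dummy baseline and the best model. A minimal sketch of that cutoff computation (the helper name and example scores are illustrative, not part of the class):

def performance_cutoff(dummy_score, best_score, performance_range_threshold):
    """Score a model must beat to stay an ensemble candidate (higher is better)."""
    return dummy_score + (best_score - dummy_score) * performance_range_threshold


# E.g. dummy=2, best=4, threshold=0.5 -> only models scoring above 3 are kept.
print(performance_cutoff(2.0, 4.0, 0.5))  # 3.0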
Example #40
    def test_automl_outputs(self):
        backend_api = self._create_backend('test_automl_outputs')

        X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')
        name = 'iris'
        data_manager_file = os.path.join(
            backend_api.temporary_directory,
            '.auto-sklearn',
            'datamanager.pkl'
        )

        auto = autosklearn.automl.AutoML(
            backend_api, 20, 5,
            initial_configurations_via_metalearning=0,
            seed=100,
            metric=accuracy,
        )
        setup_logger()
        auto._logger = get_logger('test_automl_outputs')
        auto.fit(
            X=X_train,
            y=Y_train,
            X_test=X_test,
            y_test=Y_test,
            dataset_name=name,
            task=MULTICLASS_CLASSIFICATION,
        )

        # pickled data manager (without one hot encoding!)
        with open(data_manager_file, 'rb') as fh:
            D = pickle.load(fh)
            self.assertTrue(np.allclose(D.data['X_train'], X_train))

        # Check that all directories are there
        fixture = ['cv_models', 'true_targets_ensemble.npy',
                   'start_time_100', 'datamanager.pkl',
                   'predictions_ensemble',
                   'ensembles', 'predictions_test', 'models']
        self.assertEqual(sorted(os.listdir(os.path.join(backend_api.temporary_directory,
                                                        '.auto-sklearn'))),
                         sorted(fixture))

        # At least one ensemble prediction, one model, and one ensemble file
        # should exist
        fixture = os.listdir(os.path.join(backend_api.temporary_directory,
                                          '.auto-sklearn', 'predictions_ensemble'))
        self.assertGreater(len(fixture), 0)

        fixture = glob.glob(os.path.join(backend_api.temporary_directory, '.auto-sklearn',
                                         'models', '100.*.model'))
        self.assertGreater(len(fixture), 0)

        fixture = os.listdir(os.path.join(backend_api.temporary_directory,
                                          '.auto-sklearn', 'ensembles'))
        self.assertIn('100.0000000001.ensemble', fixture)

        # Start time
        start_time_file_path = os.path.join(backend_api.temporary_directory,
                                            '.auto-sklearn', "start_time_100")
        with open(start_time_file_path, 'r') as fh:
            start_time = float(fh.read())
        self.assertGreaterEqual(time.time() - start_time, 10)

        del auto
        self._tearDown(backend_api.temporary_directory)
        self._tearDown(backend_api.output_directory)
Example #41
def test_automl_outputs(backend, dask_client):

    X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')
    name = 'iris'
    data_manager_file = os.path.join(backend.temporary_directory,
                                     '.auto-sklearn', 'datamanager.pkl')

    auto = autosklearn.automl.AutoML(
        backend,
        30,
        5,
        initial_configurations_via_metalearning=0,
        seed=100,
        metric=accuracy,
        dask_client=dask_client,
    )
    setup_logger()
    auto._logger = get_logger('test_automl_outputs')
    auto.fit(
        X=X_train,
        y=Y_train,
        X_test=X_test,
        y_test=Y_test,
        dataset_name=name,
        task=MULTICLASS_CLASSIFICATION,
    )

    # Log file path
    log_file_path = glob.glob(
        os.path.join(backend.temporary_directory, 'AutoML*.log'))[0]

    # pickled data manager (without one hot encoding!)
    with open(data_manager_file, 'rb') as fh:
        D = pickle.load(fh)
        assert np.allclose(D.data['X_train'], X_train)

    # Check that all directories are there
    fixture = [
        'true_targets_ensemble.npy',
        'start_time_100',
        'datamanager.pkl',
        'ensemble_read_preds.pkl',
        'ensemble_read_scores.pkl',
        'runs',
        'ensembles',
    ]
    assert (sorted(
        os.listdir(os.path.join(backend.temporary_directory,
                                '.auto-sklearn'))) == sorted(fixture))

    # At least one ensemble prediction, one model, and one ensemble file
    # should exist
    fixture = glob.glob(
        os.path.join(
            backend.temporary_directory,
            '.auto-sklearn',
            'runs',
            '*',
            'predictions_ensemble*npy',
        ))
    assert len(fixture) > 0

    fixture = glob.glob(
        os.path.join(backend.temporary_directory, '.auto-sklearn', 'runs', '*',
                     '100.*.model'))
    assert len(fixture) > 0

    fixture = os.listdir(
        os.path.join(backend.temporary_directory, '.auto-sklearn',
                     'ensembles'))
    assert '100.0000000000.ensemble' in fixture

    # Start time
    start_time_file_path = os.path.join(backend.temporary_directory,
                                        '.auto-sklearn', "start_time_100")
    with open(start_time_file_path, 'r') as fh:
        start_time = float(fh.read())
    assert time.time() - start_time >= 10, extract_msg_from_log(log_file_path)

    del auto
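
Both output-checking tests above express their expectations as glob patterns under the temporary directory. A small hedged sketch of the same kind of check as a standalone helper (count_matches is a made-up name; the patterns are taken from the test):

import glob
import os


def count_matches(tmp_dir, *parts):
    """Count files matching a glob pattern rooted at tmp_dir/.auto-sklearn."""
    return len(glob.glob(os.path.join(tmp_dir, '.auto-sklearn', *parts)))


# e.g., at least one per-run ensemble prediction and one model for seed 100:
# assert count_matches(tmp_dir, 'runs', '*', 'predictions_ensemble*npy') > 0
# assert count_matches(tmp_dir, 'runs', '*', '100.*.model') > 0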
Example #42
    def __init__(self, config_space, dataset_name,
                 backend,
                 total_walltime_limit,
                 func_eval_time_limit,
                 memory_limit,
                 metric,
                 watcher, start_num_run=1,
                 data_memory_limit=None,
                 num_metalearning_cfgs=25,
                 config_file=None,
                 seed=1,
                 metadata_directory=None,
                 resampling_strategy='holdout',
                 resampling_strategy_args=None,
                 shared_mode=False,
                 include_estimators=None,
                 exclude_estimators=None,
                 include_preprocessors=None,
                 exclude_preprocessors=None,
                 disable_file_output=False,
                 smac_scenario_args=None,
                 get_smac_object_callback=None):
        super(AutoMLSMBO, self).__init__()
        # data related
        self.dataset_name = dataset_name
        self.datamanager = None
        self.metric = metric
        self.task = None
        self.backend = backend

        # the configuration space
        self.config_space = config_space

        # Evaluation
        self.resampling_strategy = resampling_strategy
        if resampling_strategy_args is None:
            resampling_strategy_args = {}
        self.resampling_strategy_args = resampling_strategy_args

        # and a bunch of useful limits
        self.total_walltime_limit = int(total_walltime_limit)
        self.func_eval_time_limit = int(func_eval_time_limit)
        self.memory_limit = memory_limit
        self.data_memory_limit = data_memory_limit
        self.watcher = watcher
        self.num_metalearning_cfgs = num_metalearning_cfgs
        self.config_file = config_file
        self.seed = seed
        self.metadata_directory = metadata_directory
        self.start_num_run = start_num_run
        self.shared_mode = shared_mode
        self.include_estimators = include_estimators
        self.exclude_estimators = exclude_estimators
        self.include_preprocessors = include_preprocessors
        self.exclude_preprocessors = exclude_preprocessors
        self.disable_file_output = disable_file_output
        self.smac_scenario_args = smac_scenario_args
        self.get_smac_object_callback = get_smac_object_callback

        dataset_name_ = "" if dataset_name is None else dataset_name
        logger_name = '%s(%d):%s' % (self.__class__.__name__, self.seed, dataset_name_)
        self.logger = get_logger(logger_name)
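
Several of the constructors in these snippets build their logger name from a component name, the random seed, and the dataset name, so log records from concurrent runs stay distinguishable. A minimal sketch of the convention (the values are placeholders):

seed = 1
dataset_name = 'iris'
logger_name = 'EnsembleBuilder(%d):%s' % (seed, dataset_name)
print(logger_name)  # EnsembleBuilder(1):iris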
Example #43
import os
from io import StringIO
import time

import numpy as np

import pyMetaLearn.metafeatures.metafeatures as metafeatures
import pyMetaLearn.optimizers.metalearn_optimizer.metalearner as \
    metalearner

from autosklearn.util import logging_
from autosklearn.constants import *
logger = logging_.get_logger(__name__)


class MetaLearning(object):
    """Right now, pyMetaLearn performs a OneHotEncoding if necessary, but it
    is really not necessary. This object helps to circumvent this by:

    1. call metafeatures.calculate_all_metafeatures() only for the
        metafeatures which do not need OneHotEncoded data
    2. Allows the caller to then perform a OneHotEncoding
    3. call metafeatures.calculate_metafeatures_encoded_labels() for all
        other metafeatures need OneHotEncoded data.
    """

    def __init__(self):
        self._sentinel = "uiaeo"
        self._metafeatures_encoded_labels = None
        self._metafeatures_labels = None
        # Hard-coded list of too-expensive metafeatures!