Beispiel #1
0
    def fit(self, X, y, task=MULTICLASS_CLASSIFICATION, metric="acc_metric", feat_type=None, dataset_name=None):
        """Wrap ``X``/``y`` in an ``XYDataManager`` and pass it to ``self._fit``.

        :param X: feature data; ``X.data`` is hashed when no dataset name is
            given and ``X.shape[1]`` is compared with ``len(feat_type)``.
        :param y: targets, forwarded unchanged to ``XYDataManager``.
        :param task: task constant forwarded to ``XYDataManager``.
        :param metric: metric object, or a string key into ``STRING_TO_METRIC``.
        :param feat_type: optional sequence of bools, one per column of ``X``.
        :param dataset_name: optional name; defaults to the MD5 hex digest of
            ``X.data``.
        :return: the result of ``self._fit``.
        :raises ValueError: if ``feat_type`` has the wrong length or holds
            anything other than bools.
        """
        if dataset_name is None:
            # No name supplied: derive one from the content of X.
            digest = hashlib.md5()
            digest.update(X.data)
            dataset_name = digest.hexdigest()

        self._backend.save_start_time(self._seed)
        self._stopwatch = StopWatch()
        self._dataset_name = dataset_name
        self._stopwatch.start_task(self._dataset_name)

        # The log file lives in the temporary directory and is named after
        # the logger itself.
        logger_name = "AutoML(%d):%s" % (self._seed, dataset_name)
        log_file = os.path.join(self._tmp_dir, "%s.log" % str(logger_name))
        setup_logger(log_file)
        self._logger = get_logger(logger_name)

        if isinstance(metric, str):
            metric = STRING_TO_METRIC[metric]

        if feat_type is not None:
            if len(feat_type) != X.shape[1]:
                raise ValueError(
                    "Array feat_type does not have same number of "
                    "variables as X has features. %d vs %d." % (len(feat_type), X.shape[1])
                )
            non_bool_entries = [f for f in feat_type if not isinstance(f, bool)]
            if non_bool_entries:
                raise ValueError("Array feat_type must only contain bools.")

        manager_options = dict(
            task=task,
            metric=metric,
            feat_type=feat_type,
            dataset_name=dataset_name,
            encode_labels=False,
        )
        return self._fit(XYDataManager(X, y, **manager_options))
Beispiel #2
0
    def fit(self, X, y,
            task=MULTICLASS_CLASSIFICATION,
            metric='acc_metric',
            feat_type=None,
            dataset_name=None):
        """Wrap ``X``/``y`` in an ``XYDataManager`` and pass it to ``self._fit``.

        :param X: feature data; ``X.data`` is hashed when no name is given.
        :param y: targets, forwarded unchanged to ``XYDataManager``.
        :param task: task constant forwarded to ``XYDataManager``.
        :param metric: metric object, or a string key into ``STRING_TO_METRIC``.
        :param feat_type: forwarded unchanged to ``XYDataManager``.
        :param dataset_name: optional name; defaults to the MD5 hex digest of
            ``X.data``.
        :return: the result of ``self._fit``.
        """
        if dataset_name is None:
            # No name supplied: derive one from the content of X.
            dataset_name = hashlib.md5(X.data).hexdigest()

        self._backend.save_start_time(self._seed)
        self._stopwatch = StopWatch()
        self._dataset_name = dataset_name
        self._stopwatch.start_task(self._dataset_name)

        # Logger and log file share the same seed/dataset specific name.
        logger_name = 'AutoML(%d):%s' % (self._seed, dataset_name)
        log_file = os.path.join(self._tmp_dir, '%s.log' % str(logger_name))
        setup_logger(log_file)
        self._logger = get_logger(logger_name)

        if isinstance(metric, str):
            metric = STRING_TO_METRIC[metric]

        manager_options = dict(
            task=task,
            metric=metric,
            feat_type=feat_type,
            dataset_name=dataset_name,
            encode_labels=False,
        )
        return self._fit(XYDataManager(X, y, **manager_options))
 def _get_logger(self, name):
     """Set up logging for ``name`` and return the matching logger.

     The log file is ``<logger name>.log`` inside the backend's temporary
     directory; ``self.logging_config`` is handed to ``setup_logger``.

     :param name: suffix of the logger name (after the seed).
     :return: the object returned by ``get_logger`` for that name.
     """
     logger_name = 'AutoML(%d):%s' % (self._seed, name)
     log_file = os.path.join(self._backend.temporary_directory,
                             '%s.log' % str(logger_name))
     setup_logger(log_file, self.logging_config)
     return get_logger(logger_name)
Beispiel #4
0
    def test_do_dummy_prediction(self):
        """Run the dummy prediction per dataset and check where files land."""
        dataset_names = ['401_bac', '31_bac', 'adult', 'cadata']
        for name in dataset_names:
            backend_api = self._create_backend('test_do_dummy_prediction')
            dataset = os.path.join(self.test_dir, '..', '.data', name)

            auto = autosklearn.automl.AutoML(
                backend_api, 20, 5,
                initial_configurations_via_metalearning=25)
            setup_logger()
            auto._logger = get_logger('test_do_dummy_predictions')
            auto._backend._make_internals_directory()
            datamanager = load_data(dataset, backend_api)
            auto._backend.save_datamanager(datamanager)
            auto._do_dummy_prediction(datamanager, 1)

            # Nothing may be written below the current working directory ...
            cwd_internals = os.path.join(os.getcwd(), '.auto-sklearn')
            self.assertFalse(os.path.exists(cwd_internals))
            # ... the prediction file has to be in the temporary directory.
            prediction_file = os.path.join(
                backend_api.temporary_directory, '.auto-sklearn',
                'predictions_ensemble', 'predictions_ensemble_1_1.npy')
            self.assertTrue(os.path.exists(prediction_file))

            del auto
            for directory in (backend_api.temporary_directory,
                              backend_api.output_directory):
                self._tearDown(directory)
    def test_do_dummy_prediction(self):
        """Run the dummy prediction per dataset and check where files land.

        The same directory is passed as AutoML's first two positional
        arguments.
        """
        dataset_names = ['401_bac', '31_bac', 'adult', 'cadata']
        for name in dataset_names:
            output = os.path.join(self.test_dir, '..',
                                  '.tmp_test_do_dummy_prediction')
            self._setUp(output)
            dataset = os.path.join(self.test_dir, '..', '.data', name)

            auto = autosklearn.automl.AutoML(
                output, output, 15, 15,
                initial_configurations_via_metalearning=25)
            setup_logger()
            auto._logger = get_logger('test_do_dummy_predictions')
            auto._backend._make_internals_directory()
            datamanager = store_and_or_load_data(dataset, output)
            auto._do_dummy_prediction(datamanager)

            # The internals directory must not appear in the current working
            # directory, only below ``output``.
            cwd_internals = os.path.join(os.getcwd(), '.auto-sklearn')
            self.assertFalse(os.path.exists(cwd_internals))
            output_internals = os.path.join(output, '.auto-sklearn')
            self.assertTrue(os.path.exists(output_internals))

            del auto
            self._tearDown(output)
Beispiel #6
0
    def test_do_dummy_prediction(self):
        """Run the dummy prediction per dataset and check where files land."""
        dataset_names = ['401_bac', '31_bac', 'adult', 'cadata']
        for name in dataset_names:
            output = os.path.join(self.test_dir, '..',
                                  '.tmp_test_do_dummy_prediction')
            self._setUp(output)
            dataset = os.path.join(self.test_dir, '..', '.data', name)

            # The same directory is handed to ``backend.create`` twice.
            backend_api = backend.create(output, output)
            auto = autosklearn.automl.AutoML(
                backend_api, 20, 5,
                initial_configurations_via_metalearning=25)
            setup_logger()
            auto._logger = get_logger('test_do_dummy_predictions')
            auto._backend._make_internals_directory()
            datamanager = load_data(dataset, backend_api)
            auto._backend.save_datamanager(datamanager)
            auto._do_dummy_prediction(datamanager, 1)

            # Nothing may be written below the current working directory ...
            cwd_internals = os.path.join(os.getcwd(), '.auto-sklearn')
            self.assertFalse(os.path.exists(cwd_internals))
            # ... the prediction file has to be below ``output``.
            prediction_file = os.path.join(
                output, '.auto-sklearn', 'predictions_ensemble',
                'predictions_ensemble_1_00001.npy')
            self.assertTrue(os.path.exists(prediction_file))

            del auto
            self._tearDown(output)
Beispiel #7
0
    def test_do_dummy_prediction(self):
        """Run the dummy prediction per dataset and check where files land.

        The same directory is passed as AutoML's first two positional
        arguments.
        """
        dataset_names = ['401_bac', '31_bac', 'adult', 'cadata']
        for name in dataset_names:
            output = os.path.join(self.test_dir, '..',
                                  '.tmp_test_do_dummy_prediction')
            self._setUp(output)
            dataset = os.path.join(self.test_dir, '..', '.data', name)

            auto = autosklearn.automl.AutoML(
                output, output, 15, 15,
                initial_configurations_via_metalearning=25)
            setup_logger()
            auto._logger = get_logger('test_do_dummy_predictions')
            auto._backend._make_internals_directory()
            datamanager = store_and_or_load_data(dataset, output)
            auto._do_dummy_prediction(datamanager)

            # The internals directory must not appear in the current working
            # directory, only below ``output``.
            cwd_internals = os.path.join(os.getcwd(), '.auto-sklearn')
            self.assertFalse(os.path.exists(cwd_internals))
            output_internals = os.path.join(output, '.auto-sklearn')
            self.assertTrue(os.path.exists(output_internals))

            del auto
            self._tearDown(output)
Beispiel #8
0
    def __init__(self,
                 config_space,
                 dataset_name,
                 backend,
                 total_walltime_limit,
                 func_eval_time_limit,
                 memory_limit,
                 watcher,
                 start_num_run=1,
                 data_memory_limit=None,
                 num_metalearning_cfgs=25,
                 config_file=None,
                 smac_iters=1000,
                 seed=1,
                 metadata_directory=None,
                 resampling_strategy='holdout',
                 resampling_strategy_args=None,
                 acquisition_function='EI',
                 shared_mode=False):
        """Store the optimisation settings on the instance.

        Every argument is kept on an attribute of the same name. The two time
        limits are converted with ``int()`` and a missing
        ``resampling_strategy_args`` becomes a new empty dict.
        ``datamanager``, ``metric``, ``task`` and ``runhistory`` start out as
        ``None``.
        """
        super(AutoMLSMBO, self).__init__()

        # Dataset and backend; data-dependent members are filled in later.
        self.dataset_name = dataset_name
        self.datamanager = None
        self.metric = None
        self.task = None
        self.backend = backend

        # Search space
        self.config_space = config_space

        # Resampling used for evaluation
        self.resampling_strategy = resampling_strategy
        self.resampling_strategy_args = (
            {} if resampling_strategy_args is None
            else resampling_strategy_args)

        # Time and memory budgets
        self.total_walltime_limit = int(total_walltime_limit)
        self.func_eval_time_limit = int(func_eval_time_limit)
        self.memory_limit = memory_limit
        self.data_memory_limit = data_memory_limit

        # Remaining settings
        self.watcher = watcher
        self.num_metalearning_cfgs = num_metalearning_cfgs
        self.config_file = config_file
        self.seed = seed
        self.metadata_directory = metadata_directory
        self.smac_iters = smac_iters
        self.start_num_run = start_num_run
        self.acquisition_function = acquisition_function
        self.shared_mode = shared_mode
        self.runhistory = None

        # NOTE(review): the format already ends in ':' so a non-None dataset
        # name yields a double colon in the logger name.
        if dataset_name is not None:
            name_suffix = ":" + dataset_name
        else:
            name_suffix = ""
        logger_name = '%s(%d):%s' % (self.__class__.__name__, self.seed,
                                     name_suffix)
        self.logger = get_logger(logger_name)
Beispiel #9
0
    def __init__(self, config_space, dataset_name,
                 output_dir, tmp_dir,
                 total_walltime_limit,
                 func_eval_time_limit,
                 memory_limit,
                 watcher, start_num_run=1,
                 data_memory_limit=None,
                 num_metalearning_cfgs=25,
                 config_file=None,
                 smac_iters=1000,
                 seed=1,
                 metadata_directory=None,
                 resampling_strategy='holdout',
                 resampling_strategy_args=None,
                 acquisition_function='EI',
                 shared_mode=False):
        """Store the optimisation settings on the instance.

        Every argument is kept on an attribute of the same name. The two time
        limits are converted with ``int()`` and a missing
        ``resampling_strategy_args`` becomes a new empty dict. The
        configuration space is seeded with ``seed`` and the root logger level
        is set to ``DEBUG``.
        """
        super(AutoMLSMBO, self).__init__()

        # Dataset and directories; data-dependent members are filled in later.
        self.dataset_name = dataset_name
        self.output_dir = output_dir
        self.tmp_dir = tmp_dir
        self.datamanager = None
        self.metric = None
        self.task = None

        # Search space
        self.config_space = config_space

        # Resampling used for evaluation
        self.resampling_strategy = resampling_strategy
        self.resampling_strategy_args = (
            {} if resampling_strategy_args is None
            else resampling_strategy_args)

        # Time and memory budgets
        self.total_walltime_limit = int(total_walltime_limit)
        self.func_eval_time_limit = int(func_eval_time_limit)
        self.memory_limit = memory_limit
        self.data_memory_limit = data_memory_limit

        # Remaining settings
        self.watcher = watcher
        self.num_metalearning_cfgs = num_metalearning_cfgs
        self.config_file = config_file
        self.seed = seed
        self.metadata_directory = metadata_directory
        self.smac_iters = smac_iters
        self.start_num_run = start_num_run
        self.acquisition_function = acquisition_function
        self.shared_mode = shared_mode

        self.config_space.seed(self.seed)

        # NOTE(review): the format already ends in ':' so a non-None dataset
        # name yields a double colon in the logger name.
        if dataset_name is not None:
            name_suffix = ":" + dataset_name
        else:
            name_suffix = ""
        logger_name = '%s(%d):%s' % (self.__class__.__name__, self.seed,
                                     name_suffix)
        self.logger = get_logger(logger_name)

        # NOTE(review): this changes the level of the process-wide root
        # logger from inside a constructor - confirm this is intended.
        import logging
        logging.getLogger().setLevel(logging.DEBUG)
Beispiel #10
0
    def __init__(self, config_space, limit, cutoff_time, metafeatures,
                 output_dir, shared_model):
        """Build the scenario dictionary and pass it to the parent class.

        :param config_space: stored under ``'cs'``.
        :param limit: stored under ``'wallclock-limit'``.
        :param cutoff_time: reduced by 35 (but never below 5) and used for
            ``'cutoff-time'`` and ``'tuner-timeout'``.
        :param metafeatures: stored under ``'features'``; iterating it yields
            the entries wrapped into ``'instances'``.
        :param output_dir: stored under ``'output_dir'``.
        :param shared_model: stored under ``'shared_model'``.
        """
        own_name = self.__class__.__name__
        self.logger = get_logger(own_name)

        # Give SMAC at least 5 seconds
        soft_limit = max(5, cutoff_time - 35)

        scenario_dict = {}
        scenario_dict['cs'] = config_space
        scenario_dict['run-obj'] = 'quality'
        scenario_dict['cutoff-time'] = soft_limit
        scenario_dict['tuner-timeout'] = soft_limit
        scenario_dict['wallclock-limit'] = limit
        scenario_dict['features'] = metafeatures
        scenario_dict['instances'] = [[name] for name in metafeatures]
        scenario_dict['output_dir'] = output_dir
        scenario_dict['shared_model'] = shared_model

        super(AutoMLScenario, self).__init__(scenario_dict)
        # Install our own logger again after the parent constructor ran;
        # otherwise the AutoMLScenario cannot be pickled.
        self.logger = get_logger(own_name)
Beispiel #11
0
    def start_automl(self, parser):
        """Load the data manager described by ``parser`` and call ``start``.

        :param parser: passed to ``get_data_manager`` as ``namespace``.
        """
        self._backend.save_start_time(self._seed)
        self._stopwatch = StopWatch()
        datamanager = get_data_manager(namespace=parser)
        self._stopwatch.start_task(datamanager.name)

        # Logger and log file share the same seed/dataset specific name.
        logger_name = 'AutoML(%d):%s' % (self._seed, datamanager.name)
        log_file = os.path.join(self._tmp_dir, '%s.log' % str(logger_name))
        setup_logger(log_file)
        self._logger = get_logger(logger_name)

        self._datamanager = datamanager
        self._dataset_name = datamanager.name
        self.start()
Beispiel #12
0
    def start_automl(self, parser):
        """Load the data manager described by ``parser`` and call ``start``.

        :param parser: passed to ``get_data_manager`` as ``namespace``.
        """
        self._backend.save_start_time(self._seed)
        self._stopwatch = StopWatch()
        datamanager = get_data_manager(namespace=parser)
        self._stopwatch.start_task(datamanager.name)

        # Logger and log file share the same seed/dataset specific name.
        logger_name = 'AutoML(%d):%s' % (self._seed, datamanager.name)
        log_file = os.path.join(self._tmp_dir, '%s.log' % str(logger_name))
        setup_logger(log_file)
        self._logger = get_logger(logger_name)

        self._datamanager = datamanager
        self._dataset_name = datamanager.name
        self.start()
Beispiel #13
0
    def __init__(self, config_space, limit, cutoff_time, metafeatures,
                 output_dir, shared_model):
        """Build the scenario dictionary and pass it to the parent class.

        :param config_space: stored under ``'cs'``.
        :param limit: stored under ``'wallclock-limit'``.
        :param cutoff_time: reduced by 35 (but never below 5) and used for
            ``'cutoff'`` and ``'algo_runs_timelimit'``.
        :param metafeatures: stored under ``'features'``; iterating it yields
            the entries wrapped into ``'instances'``.
        :param output_dir: stored under ``'output_dir'``.
        :param shared_model: stored under ``'shared_model'``.
        """
        self.logger = get_logger(self.__class__.__name__)

        # Give SMAC at least 5 seconds
        soft_limit = max(5, cutoff_time - 35)

        scenario_dict = {}
        scenario_dict['cs'] = config_space
        scenario_dict['run_obj'] = 'quality'
        scenario_dict['cutoff'] = soft_limit
        scenario_dict['algo_runs_timelimit'] = soft_limit
        scenario_dict['wallclock-limit'] = limit
        scenario_dict['features'] = metafeatures
        scenario_dict['instances'] = [[name] for name in metafeatures]
        scenario_dict['output_dir'] = output_dir
        scenario_dict['shared_model'] = shared_model

        super(AutoMLScenario, self).__init__(scenario_dict)
Beispiel #14
0
    def fit(self, X, y,
            task=MULTICLASS_CLASSIFICATION,
            metric='acc_metric',
            feat_type=None,
            dataset_name=None):
        """Wrap ``X``/``y`` in an ``XYDataManager`` and pass it to ``self._fit``.

        :param X: feature data; ``X.data`` is hashed when no dataset name is
            given and ``X.shape[1]`` is compared with ``len(feat_type)``.
        :param y: targets, forwarded unchanged to ``XYDataManager``.
        :param task: task constant forwarded to ``XYDataManager``.
        :param metric: metric object, or a string key into ``STRING_TO_METRIC``.
        :param feat_type: optional sequence of strings, one per column of
            ``X``; each must be ``categorical`` or ``numerical``
            (case-insensitive).
        :param dataset_name: optional name; defaults to the MD5 hex digest of
            ``X.data``.
        :return: the result of ``self._fit``.
        :raises ValueError: if ``feat_type`` has the wrong length, holds
            non-strings or names an unknown feature type.
        """
        if dataset_name is None:
            # No name supplied: derive one from the content of X.
            digest = hashlib.md5()
            digest.update(X.data)
            dataset_name = digest.hexdigest()

        self._backend.save_start_time(self._seed)
        self._stopwatch = StopWatch()
        self._dataset_name = dataset_name
        self._stopwatch.start_task(self._dataset_name)

        # Logger and log file share the same seed/dataset specific name.
        logger_name = 'AutoML(%d):%s' % (self._seed, dataset_name)
        log_file = os.path.join(self._tmp_dir, '%s.log' % str(logger_name))
        setup_logger(log_file)
        self._logger = get_logger(logger_name)

        if isinstance(metric, str):
            metric = STRING_TO_METRIC[metric]

        if feat_type is not None:
            if len(feat_type) != X.shape[1]:
                raise ValueError('Array feat_type does not have same number of '
                                 'variables as X has features. %d vs %d.' %
                                 (len(feat_type), X.shape[1]))
            non_string_entries = [f for f in feat_type
                                  if not isinstance(f, str)]
            if non_string_entries:
                raise ValueError('Array feat_type must only contain strings.')
            valid_types = ('categorical', 'numerical')
            for ft in feat_type:
                if ft.lower() in valid_types:
                    continue
                raise ValueError('Only `Categorical` and `Numerical` are '
                                 'valid feature types, you passed `%s`' % ft)

        manager_options = dict(
            task=task,
            metric=metric,
            feat_type=feat_type,
            dataset_name=dataset_name,
            encode_labels=False,
        )
        return self._fit(XYDataManager(X, y, **manager_options))
Beispiel #15
0
    def __init__(self, config_space, limit, cutoff_time, metafeatures,
                 output_dir, shared_model):
        """Build the scenario dictionary and pass it to the parent class.

        :param config_space: stored under ``'cs'``.
        :param limit: stored under ``'wallclock-limit'``.
        :param cutoff_time: reduced by 35 (but never below 5) and used for
            ``'cutoff'`` and ``'algo_runs_timelimit'``.
        :param metafeatures: stored under ``'features'``; iterating it yields
            the entries wrapped into ``'instances'``.
        :param output_dir: stored under ``'output_dir'``.
        :param shared_model: stored under ``'shared_model'``.
        """
        self.logger = get_logger(self.__class__.__name__)

        # Give SMAC at least 5 seconds
        soft_limit = max(5, cutoff_time - 35)

        scenario_dict = {}
        scenario_dict['cs'] = config_space
        scenario_dict['run_obj'] = 'quality'
        scenario_dict['cutoff'] = soft_limit
        scenario_dict['algo_runs_timelimit'] = soft_limit
        scenario_dict['wallclock-limit'] = limit
        scenario_dict['features'] = metafeatures
        scenario_dict['instances'] = [[name] for name in metafeatures]
        scenario_dict['output_dir'] = output_dir
        scenario_dict['shared_model'] = shared_model

        super(AutoMLScenario, self).__init__(scenario_dict)
    def __init__(self, data_manager, configuration,
                 with_predictions=False,
                 all_scoring_functions=False,
                 seed=1,
                 output_dir=None,
                 output_y_test=False,
                 num_run=None):
        """Record the evaluation settings and pick the model class.

        :param data_manager: provides ``data`` (``X_valid``/``X_test`` are
            looked up with ``get``) and ``info`` (``metric`` and ``task``).
        :param configuration: stored as ``self.configuration``.
        :param with_predictions: stored unchanged.
        :param all_scoring_functions: stored unchanged.
        :param seed: stored unchanged.
        :param output_dir: defaults to the current working directory.
        :param output_y_test: stored unchanged.
        :param num_run: defaults to the value of ``get_new_run_num()``.
        """
        # Taken first so that it reflects the start of construction.
        self.starttime = time.time()

        self.configuration = configuration
        self.D = data_manager

        self.X_valid = data_manager.data.get('X_valid')
        self.X_test = data_manager.data.get('X_test')

        self.metric = data_manager.info['metric']
        self.task_type = data_manager.info['task']
        self.seed = seed

        self.output_dir = os.getcwd() if output_dir is None else output_dir

        self.output_y_test = output_y_test
        self.with_predictions = with_predictions
        self.all_scoring_functions = all_scoring_functions

        # Regression tasks use the regressor, everything else the classifier
        # together with probability predictions.
        if self.task_type not in REGRESSION_TASKS:
            self.model_class = ParamSklearnClassifier
            self.predict_function = self.predict_proba
        else:
            self.model_class = ParamSklearnRegressor
            self.predict_function = self.predict_regression

        self.num_run = get_new_run_num() if num_run is None else num_run

        self._logger = get_logger(os.path.basename(__file__))
Beispiel #17
0
    def fit_automl_dataset(self, dataset):
        """Load ``dataset`` as a ``CompetitionDataManager`` and fit on it.

        :param dataset: path whose base name becomes the dataset name.
        :return: the result of ``self._fit``.
        """
        self._stopwatch = StopWatch()
        self._backend.save_start_time(self._seed)

        name = os.path.basename(dataset)
        self._stopwatch.start_task(name)
        self._start_task(self._stopwatch, name)
        self._dataset_name = name

        # Logger and log file share the same seed/dataset specific name.
        logger_name = 'AutoML(%d):%s' % (self._seed, name)
        log_file = os.path.join(self._tmp_dir, '%s.log' % str(logger_name))
        setup_logger(log_file)
        self._logger = get_logger(logger_name)

        self._logger.debug('======== Reading and converting data ==========')
        # Label encoding is postponed until the metafeatures are calculated.
        loaded_data_manager = CompetitionDataManager(dataset,
                                                     encode_labels=False)
        # Log the textual description one line at a time.
        for description_line in str(loaded_data_manager).split('\n'):
            self._logger.debug(description_line)

        return self._fit(loaded_data_manager)
Beispiel #18
0
def suggest_via_metalearning(meta_base, dataset_name, metric, task, sparse,
                             num_initial_configurations):
    """Return the first suggestions of the meta-learning optimizer.

    :param meta_base: passed to ``MetaLearningOptimizer``; its
        ``configuration_space`` is used as well.
    :param dataset_name: passed to ``MetaLearningOptimizer``.
    :param metric: not used in this function.
    :param task: task constant; multilabel is mapped to multiclass before the
        task string is looked up and logged.
    :param sparse: not used in this function.
    :param num_initial_configurations: upper bound on the number of returned
        suggestions.
    :return: ``runs[:num_initial_configurations]`` of the suggestions.
    """
    logger = get_logger('autosklearn.metalearning.mismbo')

    task_key = (MULTICLASS_CLASSIFICATION
                if task == MULTILABEL_CLASSIFICATION else task)
    task_name = TASK_TYPES_TO_STRING[task_key]
    logger.warning(task_name)

    start = time.time()
    optimizer = MetaLearningOptimizer(
        dataset_name=dataset_name,
        configuration_space=meta_base.configuration_space,
        meta_base=meta_base,
        distance='l1',
        seed=1,
    )
    elapsed = time.time() - start
    logger.info('Reading meta-data took %5.2f seconds', elapsed)

    suggestions = optimizer.metalearning_suggest_all(
        exclude_double_configurations=True)
    return suggestions[:num_initial_configurations]
Beispiel #19
0
    def fit_automl_dataset(self, dataset):
        """Load ``dataset`` as a ``CompetitionDataManager`` and fit on it.

        A third of ``self._ml_memory_limit`` is passed to the data manager as
        ``max_memory_in_mb``.

        :param dataset: path whose base name becomes the dataset name.
        :return: the result of ``self._fit``.
        """
        self._stopwatch = StopWatch()
        self._backend.save_start_time(self._seed)

        name = os.path.basename(dataset)
        self._stopwatch.start_task(name)
        self._start_task(self._stopwatch, name)
        self._dataset_name = name

        # Logger and log file share the same seed/dataset specific name.
        logger_name = 'AutoML(%d):%s' % (self._seed, name)
        log_file = os.path.join(self._tmp_dir, '%s.log' % str(logger_name))
        setup_logger(log_file)
        self._logger = get_logger(logger_name)

        self._logger.debug('======== Reading and converting data ==========')
        # Label encoding is postponed until the metafeatures are calculated.
        data_memory_budget = float(self._ml_memory_limit) / 3
        loaded_data_manager = CompetitionDataManager(
            dataset, encode_labels=False,
            max_memory_in_mb=data_memory_budget)
        # Log the textual description one line at a time.
        for description_line in str(loaded_data_manager).split('\n'):
            self._logger.debug(description_line)

        return self._fit(loaded_data_manager)
Beispiel #20
0
def suggest_via_metalearning(
        meta_base, dataset_name, metric, task, sparse,
        num_initial_configurations):
    """Return the first suggestions of the meta-learning optimizer.

    :param meta_base: passed to ``MetaLearningOptimizer``; its
        ``configuration_space`` is used as well.
    :param dataset_name: passed to ``MetaLearningOptimizer``.
    :param metric: not used in this function.
    :param task: task constant; multilabel is mapped to multiclass before the
        task string is looked up and logged.
    :param sparse: not used in this function.
    :param num_initial_configurations: upper bound on the number of returned
        suggestions.
    :return: ``runs[:num_initial_configurations]`` of the suggestions.
    """
    logger = get_logger('autosklearn.metalearning.mismbo')

    task_key = (MULTICLASS_CLASSIFICATION
                if task == MULTILABEL_CLASSIFICATION else task)
    task_name = TASK_TYPES_TO_STRING[task_key]
    logger.warning(task_name)

    start = time.time()
    optimizer = MetaLearningOptimizer(
        dataset_name=dataset_name,
        configuration_space=meta_base.configuration_space,
        meta_base=meta_base,
        distance='l1',
        seed=1,
    )
    elapsed = time.time() - start
    logger.info('Reading meta-data took %5.2f seconds', elapsed)

    suggestions = optimizer.metalearning_suggest_all(
        exclude_double_configurations=True)
    return suggestions[:num_initial_configurations]
Beispiel #21
0
    def __init__(self, config_space, dataset_name,
                 backend,
                 total_walltime_limit,
                 func_eval_time_limit,
                 memory_limit,
                 metric,
                 watcher, start_num_run=1,
                 data_memory_limit=None,
                 num_metalearning_cfgs=25,
                 config_file=None,
                 seed=1,
                 metadata_directory=None,
                 resampling_strategy='holdout',
                 resampling_strategy_args=None,
                 shared_mode=False,
                 include_estimators=None,
                 exclude_estimators=None,
                 include_preprocessors=None,
                 exclude_preprocessors=None,
                 disable_file_output=False,
                 smac_scenario_args=None,
                 get_smac_object_callback=None):
        """Store the optimisation settings on the instance.

        Every argument is kept on an attribute of the same name. The two time
        limits are converted with ``int()`` and a missing
        ``resampling_strategy_args`` becomes a new empty dict.
        ``datamanager`` and ``task`` start out as ``None``.
        """
        super(AutoMLSMBO, self).__init__()

        # Dataset and backend; data-dependent members are filled in later.
        self.dataset_name = dataset_name
        self.datamanager = None
        self.metric = metric
        self.task = None
        self.backend = backend

        # Search space
        self.config_space = config_space

        # Resampling used for evaluation
        self.resampling_strategy = resampling_strategy
        self.resampling_strategy_args = (
            {} if resampling_strategy_args is None
            else resampling_strategy_args)

        # Time and memory budgets
        self.total_walltime_limit = int(total_walltime_limit)
        self.func_eval_time_limit = int(func_eval_time_limit)
        self.memory_limit = memory_limit
        self.data_memory_limit = data_memory_limit

        # Remaining settings
        self.watcher = watcher
        self.num_metalearning_cfgs = num_metalearning_cfgs
        self.config_file = config_file
        self.seed = seed
        self.metadata_directory = metadata_directory
        self.start_num_run = start_num_run
        self.shared_mode = shared_mode

        # Restrictions on the pipeline components and SMAC customisation
        self.include_estimators = include_estimators
        self.exclude_estimators = exclude_estimators
        self.include_preprocessors = include_preprocessors
        self.exclude_preprocessors = exclude_preprocessors
        self.disable_file_output = disable_file_output
        self.smac_scenario_args = smac_scenario_args
        self.get_smac_object_callback = get_smac_object_callback

        # NOTE(review): the format already ends in ':' so a non-None dataset
        # name yields a double colon in the logger name.
        if dataset_name is not None:
            name_suffix = ":" + dataset_name
        else:
            name_suffix = ""
        logger_name = '%s(%d):%s' % (self.__class__.__name__, self.seed,
                                     name_suffix)
        self.logger = get_logger(logger_name)
Beispiel #22
0
 def _get_logger(self, name):
     """Set up logging for ``name`` and return the matching logger.

     The log file is ``<logger name>.log`` inside the backend's temporary
     directory.

     :param name: suffix of the logger name (after the seed).
     :return: the object returned by ``get_logger`` for that name.
     """
     logger_name = 'AutoML(%d):%s' % (self._seed, name)
     log_file_name = '%s.log' % str(logger_name)
     log_file = os.path.join(self._backend.temporary_directory, log_file_name)
     setup_logger(log_file)
     return get_logger(logger_name)
Beispiel #23
0
    def __init__(self,
                 config_space,
                 dataset_name,
                 backend,
                 total_walltime_limit,
                 func_eval_time_limit,
                 memory_limit,
                 metric,
                 write_history,
                 read_history,
                 watcher,
                 start_num_run=1,
                 data_memory_limit=None,
                 num_metalearning_cfgs=25,
                 config_file=None,
                 seed=1,
                 metadata_directory=None,
                 resampling_strategy='holdout',
                 resampling_strategy_args=None,
                 shared_mode=False,
                 include_estimators=None,
                 exclude_estimators=None,
                 include_preprocessors=None,
                 exclude_preprocessors=None,
                 disable_file_output=False,
                 smac_scenario_args=None,
                 get_smac_object_callback=None):
        """Store the optimisation settings on the instance.

        Every argument is kept on an attribute of the same name. The two time
        limits are converted with ``int()`` and a missing
        ``resampling_strategy_args`` becomes a new empty dict.
        ``datamanager`` and ``task`` start out as ``None``.
        """
        super(AutoMLSMBO, self).__init__()

        # History settings, stored unchanged.
        self.write_history = write_history
        self.read_history = read_history

        # Dataset and backend; data-dependent members are filled in later.
        self.dataset_name = dataset_name
        self.datamanager = None
        self.metric = metric
        self.task = None
        self.backend = backend

        # Search space
        self.config_space = config_space

        # Resampling used for evaluation
        self.resampling_strategy = resampling_strategy
        self.resampling_strategy_args = (
            {} if resampling_strategy_args is None
            else resampling_strategy_args)

        # Time and memory budgets
        self.total_walltime_limit = int(total_walltime_limit)
        self.func_eval_time_limit = int(func_eval_time_limit)
        self.memory_limit = memory_limit
        self.data_memory_limit = data_memory_limit

        # Remaining settings
        self.watcher = watcher
        self.num_metalearning_cfgs = num_metalearning_cfgs
        self.config_file = config_file
        self.seed = seed
        self.metadata_directory = metadata_directory
        self.start_num_run = start_num_run
        self.shared_mode = shared_mode

        # Restrictions on the pipeline components and SMAC customisation
        self.include_estimators = include_estimators
        self.exclude_estimators = exclude_estimators
        self.include_preprocessors = include_preprocessors
        self.exclude_preprocessors = exclude_preprocessors
        self.disable_file_output = disable_file_output
        self.smac_scenario_args = smac_scenario_args
        self.get_smac_object_callback = get_smac_object_callback

        # NOTE(review): the format already ends in ':' so a non-None dataset
        # name yields a double colon in the logger name.
        if dataset_name is not None:
            name_suffix = ":" + dataset_name
        else:
            name_suffix = ""
        logger_name = '%s(%d):%s' % (self.__class__.__name__, self.seed,
                                     name_suffix)
        self.logger = get_logger(logger_name)
Beispiel #24
0
# -*- encoding: utf-8 -*-
import numpy as np
import sklearn.cross_validation

from autosklearn.util import get_logger

# Module-level logger, named after this module.
logger = get_logger(__name__)


# Names exported by a star-import of this module.
__all__ = [
    'split_data',
    'get_CV_fold'
]


def split_data(X, Y, classification=None):
    """Begin splitting ``X``/``Y`` into training and validation parts.

    NOTE(review): this excerpt stops before any split is performed or a value
    is returned; the notes here cover the visible part only.

    :param X: object with a ``shape``; ``X.shape[0]`` is the number of data
        points.
    :param Y: object with a ``shape`` whose first dimension must equal that
        of ``X``.
    :param classification: when ``True`` and ``Y`` has a single label column,
        classes with fewer than two samples are looked for.
    :raises ValueError: if ``X.shape[0] != Y.shape[0]``.
    """
    num_data_points = X.shape[0]
    # A one-dimensional Y counts as a single label column.
    num_labels = Y.shape[1] if len(Y.shape) > 1 else 1
    X_train, X_valid, Y_train, Y_valid = None, None, None, None
    if X.shape[0] != Y.shape[0]:
        raise ValueError('The first dimension of the X and Y array must '
                         'be equal.')

    # If one class only has one sample, put it into the training set
    if classification is True and num_labels == 1:
        # y_indices maps every entry of Y to its position in ``classes``.
        classes, y_indices = np.unique(Y, return_inverse=True)
        if np.min(np.bincount(y_indices)) < 2:
            # Boolean mask over ``classes``: True where a class has < 2 samples.
            classes_with_one_sample = np.bincount(y_indices) < 2
            sample_idxs = []
            Y_old = Y
            # All-True boolean mask with the shape of Y.
            # NOTE(review): presumably narrowed further down - not visible here.
            indices = np.ones(Y.shape, dtype=bool)
Beispiel #25
0
 def _get_logger(self, name):
     logger_name = 'AutoML(%d):%s' % (self._seed, name)
     setup_logger(os.path.join(self._tmp_dir, '%s.log' % str(logger_name)))
     return get_logger(logger_name)
Beispiel #26
0
def create_metalearning_string_for_smac_call(metafeatures_labels,
                                             metafeatures_encoded_labels,
                                             configuration_space, dataset_name,
                                             metric, task, sparse,
                                             num_initial_configurations,
                                             metadata_directory):
    """Suggest initial SMAC configurations for a dataset via meta-learning.

    The two metafeature objects are merged, a ``MetaLearningOptimizer`` is
    created on top of the meta-data directory, the dataset is registered
    under ``dataset_name + SENTINEL`` and the leading suggestions are
    converted with ``convert_conf2smac_string``.

    :param metafeatures_labels: metafeatures object exposing a
        ``metafeature_values`` mapping. It is updated IN PLACE with the
        values of ``metafeatures_encoded_labels``.
    :param metafeatures_encoded_labels: metafeatures object whose
        ``metafeature_values`` are merged into ``metafeatures_labels``.
    :param configuration_space: forwarded unchanged to
        ``MetaLearningOptimizer``.
    :param dataset_name: name of the dataset; ``SENTINEL`` is appended to it
        before it is handed to the optimizer and the meta base.
    :param metric: key into ``METRIC_TO_STRING``; only used to build the
        default meta-data directory.
    :param task: key into ``TASK_TYPES_TO_STRING``;
        ``MULTILABEL_CLASSIFICATION`` is mapped to
        ``MULTICLASS_CLASSIFICATION`` first.
    :param sparse: ``True`` selects the ``sparse`` default directory, any
        other value the ``dense`` one.
    :param num_initial_configurations: upper bound on the number of
        suggestions that are converted and returned.
    :param metadata_directory: directory holding the meta-data, or ``None``
        to use ``files/<metric>_<task>_<sparse|dense>`` next to this module.
    :return: list with one ``convert_conf2smac_string`` result per suggested
        configuration.
    :raises ValueError: if either metafeatures argument is ``None``.
    """
    logger = get_logger('autosklearn.metalearning.mismbo')

    # Multilabel tasks are mapped onto the multiclass task name.
    task = task if task != MULTILABEL_CLASSIFICATION else MULTICLASS_CLASSIFICATION
    task = TASK_TYPES_TO_STRING[task]

    if metafeatures_encoded_labels is None or \
                    metafeatures_labels is None:
        raise ValueError('Please call '
                         'calculate_metafeatures_encoded_labels and '
                         'calculate_metafeatures_with_labels first!')

    logger.warning(task)
    current_directory = os.path.dirname(__file__)
    if metadata_directory is None:
        metadata_directory = os.path.join(
            current_directory, 'files',
            '%s_%s_%s' % (METRIC_TO_STRING[metric], task,
                          'sparse' if sparse is True else 'dense'))
    logger.warning(metadata_directory)

    # Concatenate the metafeatures (this mutates ``metafeatures_labels``).
    mf = metafeatures_labels
    mf.metafeature_values.update(
        metafeatures_encoded_labels.metafeature_values)

    # Build a NEW collection instead of calling ``difference_update`` on
    # ``subsets['all']``: that set is shared module-level state and must not
    # be shrunk as a side effect of this call.
    metafeatures_subset = list(
        subsets['all'].difference(EXCLUDE_META_FUTURES))

    start = time.time()
    ml = MetaLearningOptimizer(dataset_name=dataset_name + SENTINEL,
                               configuration_space=configuration_space,
                               aslib_directory=metadata_directory,
                               distance='l1',
                               seed=1,
                               use_features=metafeatures_subset,
                               subset='all')
    logger.info('Reading meta-data took %5.2f seconds', time.time() - start)
    # TODO This is hacky, I must find a different way of adding a new
    # dataset!
    ml.meta_base.add_dataset(dataset_name + SENTINEL, mf)
    runs = ml.metalearning_suggest_all(exclude_double_configurations=True)

    # Convert these configurations into the SMAC CLI configuration format.
    return [convert_conf2smac_string(configuration)
            for configuration in runs[:num_initial_configurations]]
Beispiel #27
0
    def test_fail_if_dummy_prediction_fails(self, ta_run_mock):
        # ``_do_dummy_prediction`` must pass silently when the mocked target
        # algorithm run succeeds and raise ``ValueError`` for every other
        # status. The mock's return value mirrors ta.run():
        # (status, cost, runtime, additional info).
        backend_api = self._create_backend(
            'test_fail_if_dummy_prediction_fails')

        dataset = os.path.join(self.test_dir, '..', '.data', '401_bac')

        total_time_limit = 30
        single_run_limit = 10
        automl = autosklearn.automl.AutoML(
            backend_api,
            total_time_limit,
            single_run_limit,
            initial_configurations_via_metalearning=25,
        )
        setup_logger()
        automl._logger = get_logger('test_fail_if_dummy_prediction_fails')
        automl._backend._make_internals_directory()
        datamanager = load_data(dataset, backend_api)
        automl._backend.save_datamanager(datamanager)

        # Sanity check: ta.run() is really invoked, with the overall time
        # budget as cutoff.
        ta_run_mock.return_value = (StatusType.SUCCESS, None, None, "test")
        automl._do_dummy_prediction(datamanager, 1)
        ta_run_mock.assert_called_once_with(1, cutoff=total_time_limit)

        # Case 1: a successful run must not raise.
        ta_run_mock.return_value = (StatusType.SUCCESS, None, None, "test")
        try:
            automl._do_dummy_prediction(datamanager, 1)
        except ValueError:
            raised = True
        else:
            raised = False
        self.assertFalse(raised, 'Exception raised')

        # Case 2: every non-success status must surface as a ValueError that
        # carries the additional info returned by ta.run().
        failing_statuses = (
            StatusType.CRASHED,
            StatusType.ABORT,
            StatusType.TIMEOUT,
            StatusType.MEMOUT,
            StatusType.CAPPED,
        )
        for failing_status in failing_statuses:
            ta_run_mock.return_value = (failing_status, None, None, "test")
            self.assertRaisesRegex(
                ValueError,
                'Dummy prediction failed: test',
                automl._do_dummy_prediction,
                datamanager,
                1,
            )

        for directory in (backend_api.temporary_directory,
                          backend_api.output_directory):
            self._tearDown(directory)
Beispiel #28
0
def get_automl_logger(log_dir, basename, seed):
    """Return this module's logger with an extra per-run file handler.

    :param log_dir: directory in which the log file is placed.
    :param basename: free-form part of the log file name.
    :param seed: integer seed; also becomes part of the log file name.
    :return: the logger named after this file, after a handler for
        ``<log_dir>/AutoML_<basename>_<seed>.log`` has been added.
    """
    automl_logger = get_logger(os.path.basename(__file__))
    run_label = 'AutoML_%s_%d' % (basename, seed)
    log_path = os.path.join(log_dir, '%s.log' % run_label)
    add_file_handler(automl_logger, log_path)
    return automl_logger
Beispiel #29
0
 def _get_logger(self, name):
     """Return the logger ``AutoML(<seed>):<name>`` with file logging set up.

     ``setup_logger`` is pointed at ``<logger name>.log`` inside
     ``self._tmp_dir`` before the logger is fetched.
     """
     full_name = 'AutoML(%d):%s' % (self._seed, name)
     target_path = os.path.join(self._tmp_dir, '%s.log' % full_name)
     setup_logger(target_path)
     return get_logger(full_name)
Beispiel #30
0
import autosklearn.metalearning
from autosklearn.constants import *
from autosklearn.metalearning.mismbo import suggest_via_metalearning
from autosklearn.data.abstract_data_manager import AbstractDataManager
from autosklearn.evaluation import ExecuteTaFuncWithQueue, WORST_POSSIBLE_RESULT
from autosklearn.util import get_logger
from autosklearn.metalearning.metalearning.meta_base import MetaBase
from autosklearn.metalearning.metafeatures.metafeatures import \
    calculate_all_metafeatures_with_labels, calculate_all_metafeatures_encoded_labels

from amb.data.profiler import DataProfiler
from amb.settings import settings, get_logger

logger = get_logger(__name__)


# Names of metafeatures grouped into a classification-specific exclusion set.
# NOTE(review): presumably these are left out when metafeatures are computed
# or compared for classification tasks -- the consumer is not visible in this
# part of the file; confirm against the code that reads this constant.
EXCLUDE_META_FEATURES_CLASSIFICATION = {
    'Landmark1NN',
    'LandmarkDecisionNodeLearner',
    'LandmarkDecisionTree',
    'LandmarkLDA',
    'LandmarkNaiveBayes',
    'PCAFractionOfComponentsFor95PercentVariance',
    'PCAKurtosisFirstPC',
    'PCASkewnessFirstPC',
    'PCA'
}

"""
Same as above except these are added:
    'NumberOfClasses',
Beispiel #31
0
import os
import time
from StringIO import StringIO

import numpy as np
from pyMetaLearn.metafeatures.metafeatures import \
    calculate_all_metafeatures_with_labels, \
    calculate_all_metafeatures_encoded_labels, subsets

from pyMetaLearn.optimizers.metalearn_optimizer.metalearner import \
    MetaLearningOptimizer


from autosklearn.util import get_logger

logger = get_logger(os.path.basename(__file__))

# Suffix that create_metalearning_string_for_smac_call appends to the dataset
# name before passing it to the meta-learning optimizer and its meta base.
# NOTE(review): presumably this keeps the new dataset's name from colliding
# with an entry already present in the meta-data -- confirm.
SENTINEL = 'uiaeo'

# Metafeature names that create_metalearning_string_for_smac_call removes
# from ``subsets['all']`` before choosing the features used for meta-learning.
EXCLUDE_META_FUTURES = {
    'Landmark1NN',
    'LandmarkDecisionNodeLearner',
    'LandmarkDecisionTree',
    'LandmarkLDA',
    'LandmarkNaiveBayes',
    'PCAFractionOfComponentsFor95PercentVariance',
    'PCAKurtosisFirstPC',
    'PCASkewnessFirstPC'
}

def calc_meta_features(X_train, Y_train, categorical, dataset_name):
Beispiel #32
0
def create_metalearning_string_for_smac_call(
        metafeatures_labels, metafeatures_encoded_labels,
        configuration_space, dataset_name, metric, task, sparse,
        num_initial_configurations, metadata_directory):
    """Suggest initial SMAC configurations for a dataset via meta-learning.

    Merges the two metafeature objects, creates a ``MetaLearningOptimizer``
    on the meta-data directory, registers the dataset under
    ``dataset_name + SENTINEL`` and converts the leading suggestions with
    ``convert_conf2smac_string``.

    :param metafeatures_labels: metafeatures object exposing a
        ``metafeature_values`` mapping. It is updated IN PLACE with the
        values of ``metafeatures_encoded_labels``.
    :param metafeatures_encoded_labels: metafeatures object whose
        ``metafeature_values`` are merged into ``metafeatures_labels``.
    :param configuration_space: forwarded unchanged to
        ``MetaLearningOptimizer``.
    :param dataset_name: name of the dataset; ``SENTINEL`` is appended to it
        before it is handed to the optimizer and the meta base.
    :param metric: key into ``METRIC_TO_STRING``; only used to build the
        default meta-data directory.
    :param task: key into ``TASK_TYPES_TO_STRING``.
    :param sparse: ``True`` selects the ``sparse`` default directory, any
        other value the ``dense`` one.
    :param num_initial_configurations: upper bound on the number of
        suggestions that are converted and returned.
    :param metadata_directory: directory holding the meta-data, or ``None``
        to use ``files/<metric>_<task>_<sparse|dense>`` next to this module.
    :return: list with one ``convert_conf2smac_string`` result per suggested
        configuration.
    :raises ValueError: if either metafeatures argument is ``None``.
    """
    logger = get_logger('autosklearn.metalearning.mismbo')

    task = TASK_TYPES_TO_STRING[task]

    if metafeatures_encoded_labels is None or \
                    metafeatures_labels is None:
        raise ValueError('Please call '
                         'calculate_metafeatures_encoded_labels and '
                         'calculate_metafeatures_with_labels first!')

    current_directory = os.path.dirname(__file__)
    if metadata_directory is None:
        metadata_directory = os.path.join(
            current_directory, 'files',
            '%s_%s_%s' % (METRIC_TO_STRING[metric], task,
                          'sparse' if sparse is True else 'dense'))

    # Concatenate the metafeatures (this mutates ``metafeatures_labels``).
    mf = metafeatures_labels
    mf.metafeature_values.update(
        metafeatures_encoded_labels.metafeature_values)

    # Build a NEW collection instead of calling ``difference_update`` on
    # ``subsets['all']``: that set is shared module-level state and must not
    # be shrunk as a side effect of this call.
    metafeatures_subset = list(
        subsets['all'].difference(EXCLUDE_META_FUTURES))

    start = time.time()
    ml = MetaLearningOptimizer(
        dataset_name=dataset_name + SENTINEL,
        configuration_space=configuration_space,
        aslib_directory=metadata_directory,
        distance='l1',
        seed=1,
        use_features=metafeatures_subset,
        subset='all')
    logger.info('Reading meta-data took %5.2f seconds',
                time.time() - start)
    # TODO This is hacky, I must find a different way of adding a new
    # dataset!
    ml.meta_base.add_dataset(dataset_name + SENTINEL, mf)
    runs = ml.metalearning_suggest_all(exclude_double_configurations=True)

    # Convert these configurations into the SMAC CLI configuration format.
    return [convert_conf2smac_string(configuration)
            for configuration in runs[:num_initial_configurations]]