def fit(self, X, y, task=MULTICLASS_CLASSIFICATION, metric="acc_metric",
        feat_type=None, dataset_name=None):
    """Validate the inputs, wrap them in an ``XYDataManager`` and fit.

    :param X: feature array; ``X.data`` is hashed when no name is given and
        ``X.shape[1]`` is compared against ``len(feat_type)``.
    :param y: targets, handed to ``XYDataManager`` unchanged.
    :param task: task constant forwarded to ``XYDataManager``.
    :param metric: metric object, or a string key of ``STRING_TO_METRIC``.
    :param feat_type: optional sequence of bools, one entry per column of X.
    :param dataset_name: optional name; defaults to the MD5 hex digest of
        ``X.data``.
    :raises ValueError: if ``feat_type`` has the wrong length or contains a
        non-bool entry.
    :return: the result of ``self._fit``.
    """
    if dataset_name is None:
        # No name supplied: derive one from the raw data buffer.
        dataset_name = hashlib.md5(X.data).hexdigest()

    self._backend.save_start_time(self._seed)
    self._stopwatch = StopWatch()
    self._dataset_name = dataset_name
    self._stopwatch.start_task(self._dataset_name)

    # One log file per (seed, dataset) pair inside the temporary directory.
    logger_name = "AutoML(%d):%s" % (self._seed, dataset_name)
    log_file = os.path.join(self._tmp_dir, "%s.log" % str(logger_name))
    setup_logger(log_file)
    self._logger = get_logger(logger_name)

    if isinstance(metric, str):
        metric = STRING_TO_METRIC[metric]

    # NOTE: validation runs after the start time was saved and the logger
    # was configured, so those side effects happen even for invalid input.
    if feat_type is not None:
        if len(feat_type) != X.shape[1]:
            raise ValueError(
                "Array feat_type does not have same number of "
                "variables as X has features. %d vs %d."
                % (len(feat_type), X.shape[1])
            )
        if not all(isinstance(flag, bool) for flag in feat_type):
            raise ValueError("Array feat_type must only contain bools.")

    data_manager = XYDataManager(
        X,
        y,
        task=task,
        metric=metric,
        feat_type=feat_type,
        dataset_name=dataset_name,
        encode_labels=False,
    )
    return self._fit(data_manager)
def fit(self, X, y, task=MULTICLASS_CLASSIFICATION, metric='acc_metric',
        feat_type=None, dataset_name=None):
    """Wrap ``X`` and ``y`` in an ``XYDataManager`` and hand it to ``_fit``.

    :param X: feature array; ``X.data`` is hashed when no name is given.
    :param y: targets, handed to ``XYDataManager`` unchanged.
    :param task: task constant forwarded to ``XYDataManager``.
    :param metric: metric object, or a string key of ``STRING_TO_METRIC``.
    :param feat_type: forwarded to ``XYDataManager`` without validation.
    :param dataset_name: optional name; defaults to the MD5 hex digest of
        ``X.data``.
    :return: the result of ``self._fit``.
    """
    if dataset_name is None:
        # Fall back to a content-derived name.
        dataset_name = hashlib.md5(X.data).hexdigest()

    self._backend.save_start_time(self._seed)
    self._stopwatch = StopWatch()
    self._dataset_name = dataset_name
    self._stopwatch.start_task(self._dataset_name)

    # Log file lives in the temporary directory and is named after the logger.
    logger_name = 'AutoML(%d):%s' % (self._seed, dataset_name)
    log_file = os.path.join(self._tmp_dir, '%s.log' % str(logger_name))
    setup_logger(log_file)
    self._logger = get_logger(logger_name)

    if isinstance(metric, str):
        metric = STRING_TO_METRIC[metric]

    data_manager = XYDataManager(
        X,
        y,
        task=task,
        metric=metric,
        feat_type=feat_type,
        dataset_name=dataset_name,
        encode_labels=False,
    )
    return self._fit(data_manager)
def _get_logger(self, name):
    """Configure logging for ``name`` and return the matching logger.

    The logger is called ``AutoML(<seed>):<name>``; ``setup_logger`` receives
    a ``<logger name>.log`` path inside the backend's temporary directory
    together with ``self.logging_config``.
    """
    logger_name = 'AutoML(%d):%s' % (self._seed, name)
    log_file = os.path.join(
        self._backend.temporary_directory,
        '%s.log' % str(logger_name),
    )
    setup_logger(log_file, self.logging_config)
    return get_logger(logger_name)
def test_do_dummy_prediction(self):
    """Run the dummy prediction on four datasets and check where it writes.

    For every dataset the prediction file must exist below the backend's
    temporary directory, and no ``.auto-sklearn`` directory may appear in the
    current working directory.
    """
    for name in ('401_bac', '31_bac', 'adult', 'cadata'):
        backend_api = self._create_backend('test_do_dummy_prediction')
        dataset = os.path.join(self.test_dir, '..', '.data', name)

        auto = autosklearn.automl.AutoML(
            backend_api, 20, 5,
            initial_configurations_via_metalearning=25,
        )
        setup_logger()
        auto._logger = get_logger('test_do_dummy_predictions')
        auto._backend._make_internals_directory()

        D = load_data(dataset, backend_api)
        auto._backend.save_datamanager(D)
        auto._do_dummy_prediction(D, 1)

        # Nothing may leak into the current working directory ...
        stray_dir = os.path.join(os.getcwd(), '.auto-sklearn')
        self.assertFalse(os.path.exists(stray_dir))

        # ... the predictions belong in the temporary directory instead.
        prediction_file = os.path.join(
            backend_api.temporary_directory,
            '.auto-sklearn',
            'predictions_ensemble',
            'predictions_ensemble_1_1.npy',
        )
        self.assertTrue(os.path.exists(prediction_file))

        del auto
        self._tearDown(backend_api.temporary_directory)
        self._tearDown(backend_api.output_directory)
def test_do_dummy_prediction(self):
    """Run the dummy prediction on four datasets and check where it writes.

    The same scratch directory serves as both positional directory arguments
    of ``AutoML``; it must contain ``.auto-sklearn`` afterwards while the
    current working directory must not.
    """
    for name in ('401_bac', '31_bac', 'adult', 'cadata'):
        output = os.path.join(
            self.test_dir, '..', '.tmp_test_do_dummy_prediction')
        self._setUp(output)
        dataset = os.path.join(self.test_dir, '..', '.data', name)

        auto = autosklearn.automl.AutoML(
            output, output, 15, 15,
            initial_configurations_via_metalearning=25,
        )
        setup_logger()
        auto._logger = get_logger('test_do_dummy_predictions')
        auto._backend._make_internals_directory()

        D = store_and_or_load_data(dataset, output)
        auto._do_dummy_prediction(D)

        # The internals directory must be created under ``output``, never in
        # the current working directory.
        stray_dir = os.path.join(os.getcwd(), '.auto-sklearn')
        self.assertFalse(os.path.exists(stray_dir))
        expected_dir = os.path.join(output, '.auto-sklearn')
        self.assertTrue(os.path.exists(expected_dir))

        del auto
        self._tearDown(output)
def test_do_dummy_prediction(self):
    """Run the dummy prediction on four datasets and check where it writes.

    A backend is created on a scratch directory; the ensemble prediction file
    must show up below that directory and nothing may be written to the
    current working directory.
    """
    for name in ('401_bac', '31_bac', 'adult', 'cadata'):
        output = os.path.join(
            self.test_dir, '..', '.tmp_test_do_dummy_prediction')
        self._setUp(output)
        dataset = os.path.join(self.test_dir, '..', '.data', name)

        backend_api = backend.create(output, output)
        auto = autosklearn.automl.AutoML(
            backend_api, 20, 5,
            initial_configurations_via_metalearning=25,
        )
        setup_logger()
        auto._logger = get_logger('test_do_dummy_predictions')
        auto._backend._make_internals_directory()

        D = load_data(dataset, backend_api)
        auto._backend.save_datamanager(D)
        auto._do_dummy_prediction(D, 1)

        # Nothing may leak into the current working directory ...
        stray_dir = os.path.join(os.getcwd(), '.auto-sklearn')
        self.assertFalse(os.path.exists(stray_dir))

        # ... the predictions belong under ``output`` instead.
        prediction_file = os.path.join(
            output,
            '.auto-sklearn',
            'predictions_ensemble',
            'predictions_ensemble_1_00001.npy',
        )
        self.assertTrue(os.path.exists(prediction_file))

        del auto
        self._tearDown(output)
def test_do_dummy_prediction(self):
    """Check that dummy predictions end up in the scratch directory.

    Repeated for four datasets: after ``_do_dummy_prediction`` the directory
    ``.auto-sklearn`` must exist under the scratch directory and must not
    exist in the current working directory.
    """
    dataset_names = ['401_bac', '31_bac', 'adult', 'cadata']
    for name in dataset_names:
        output = os.path.join(
            self.test_dir, '..', '.tmp_test_do_dummy_prediction')
        self._setUp(output)
        dataset = os.path.join(self.test_dir, '..', '.data', name)

        auto = autosklearn.automl.AutoML(
            output, output, 15, 15,
            initial_configurations_via_metalearning=25,
        )
        setup_logger()
        auto._logger = get_logger('test_do_dummy_predictions')
        auto._backend._make_internals_directory()

        D = store_and_or_load_data(dataset, output)
        auto._do_dummy_prediction(D)

        # Working directory stays clean; the scratch directory gets the
        # internals folder.
        in_cwd = os.path.exists(os.path.join(os.getcwd(), '.auto-sklearn'))
        self.assertFalse(in_cwd)
        in_output = os.path.exists(os.path.join(output, '.auto-sklearn'))
        self.assertTrue(in_output)

        del auto
        self._tearDown(output)
def __init__(self, config_space, dataset_name, backend, total_walltime_limit,
             func_eval_time_limit, memory_limit, watcher, start_num_run=1,
             data_memory_limit=None, num_metalearning_cfgs=25,
             config_file=None, smac_iters=1000, seed=1,
             metadata_directory=None, resampling_strategy='holdout',
             resampling_strategy_args=None, acquisition_function='EI',
             shared_mode=False):
    """Record the optimisation settings and create the instance logger.

    Every argument is stored on the instance under its own name. The two
    time limits are converted with ``int``; a missing
    ``resampling_strategy_args`` becomes an empty dict. ``datamanager``,
    ``metric``, ``task`` and ``runhistory`` start out as ``None``.
    """
    super(AutoMLSMBO, self).__init__()

    # Dataset related state.
    self.dataset_name = dataset_name
    self.datamanager = None
    self.metric = None
    self.task = None
    self.backend = backend

    # Search space.
    self.config_space = config_space

    # Evaluation strategy.
    self.resampling_strategy = resampling_strategy
    self.resampling_strategy_args = (
        {} if resampling_strategy_args is None else resampling_strategy_args
    )

    # Resource limits and bookkeeping.
    self.total_walltime_limit = int(total_walltime_limit)
    self.func_eval_time_limit = int(func_eval_time_limit)
    self.memory_limit = memory_limit
    self.data_memory_limit = data_memory_limit
    self.watcher = watcher
    self.num_metalearning_cfgs = num_metalearning_cfgs
    self.config_file = config_file
    self.seed = seed
    self.metadata_directory = metadata_directory
    self.smac_iters = smac_iters
    self.start_num_run = start_num_run
    self.acquisition_function = acquisition_function
    self.shared_mode = shared_mode
    self.runhistory = None

    # NOTE(review): the format string already ends in ':' and the suffix
    # adds another one, so named datasets yield 'Class(seed)::name'. Kept
    # as is because the logger name is observable behaviour.
    suffix = "" if dataset_name is None else ":" + dataset_name
    logger_name = '%s(%d):%s' % (self.__class__.__name__, self.seed, suffix)
    self.logger = get_logger(logger_name)
def __init__(self, config_space, dataset_name, output_dir, tmp_dir,
             total_walltime_limit, func_eval_time_limit, memory_limit,
             watcher, start_num_run=1, data_memory_limit=None,
             num_metalearning_cfgs=25, config_file=None, smac_iters=1000,
             seed=1, metadata_directory=None, resampling_strategy='holdout',
             resampling_strategy_args=None, acquisition_function='EI',
             shared_mode=False):
    """Record the optimisation settings, seed the space and set up logging.

    Every argument is stored on the instance under its own name. The two
    time limits are converted with ``int``; a missing
    ``resampling_strategy_args`` becomes an empty dict. The configuration
    space is seeded with ``seed`` and the root logger level is set to DEBUG.
    """
    super(AutoMLSMBO, self).__init__()

    # Dataset related state.
    self.dataset_name = dataset_name
    self.output_dir = output_dir
    self.tmp_dir = tmp_dir
    self.datamanager = None
    self.metric = None
    self.task = None

    # Search space.
    self.config_space = config_space

    # Evaluation strategy.
    self.resampling_strategy = resampling_strategy
    self.resampling_strategy_args = (
        {} if resampling_strategy_args is None else resampling_strategy_args
    )

    # Resource limits and bookkeeping.
    self.total_walltime_limit = int(total_walltime_limit)
    self.func_eval_time_limit = int(func_eval_time_limit)
    self.memory_limit = memory_limit
    self.data_memory_limit = data_memory_limit
    self.watcher = watcher
    self.num_metalearning_cfgs = num_metalearning_cfgs
    self.config_file = config_file
    self.seed = seed
    self.metadata_directory = metadata_directory
    self.smac_iters = smac_iters
    self.start_num_run = start_num_run
    self.acquisition_function = acquisition_function
    self.shared_mode = shared_mode

    self.config_space.seed(self.seed)

    # NOTE(review): the format string already ends in ':' and the suffix
    # adds another one, so named datasets yield 'Class(seed)::name'.
    suffix = "" if dataset_name is None else ":" + dataset_name
    logger_name = '%s(%d):%s' % (self.__class__.__name__, self.seed, suffix)
    self.logger = get_logger(logger_name)

    # NOTE(review): this changes the process-wide root logger level on every
    # construction -- looks like a debugging leftover; confirm it is wanted.
    import logging
    logging.getLogger().setLevel(logging.DEBUG)
def __init__(self, config_space, limit, cutoff_time, metafeatures,
             output_dir, shared_model):
    """Build the scenario dictionary and initialise the parent scenario.

    :param config_space: stored under ``'cs'``.
    :param limit: stored under ``'wallclock-limit'``.
    :param cutoff_time: reduced by 35 (but never below 5) and used for both
        ``'cutoff-time'`` and ``'tuner-timeout'``.
    :param metafeatures: stored under ``'features'``; iterating it yields the
        instance names.
    :param output_dir: stored under ``'output_dir'``.
    :param shared_model: stored under ``'shared_model'``.
    """
    self.logger = get_logger(self.__class__.__name__)

    # SMAC always gets at least five seconds.
    soft_limit = max(5, cutoff_time - 35)
    instances = [[instance_name] for instance_name in metafeatures]

    scenario_dict = {
        'cs': config_space,
        'run-obj': 'quality',
        'cutoff-time': soft_limit,
        'tuner-timeout': soft_limit,
        'wallclock-limit': limit,
        'features': metafeatures,
        'instances': instances,
        'output_dir': output_dir,
        'shared_model': shared_model,
    }
    super(AutoMLScenario, self).__init__(scenario_dict)

    # The logger is replaced once more after the parent initialiser ran;
    # according to the original author the scenario cannot be pickled
    # otherwise.
    self.logger = get_logger(self.__class__.__name__)
def start_automl(self, parser):
    """Load the data manager described by ``parser`` and start the run.

    Saves the start time through the backend, creates a fresh stop watch,
    obtains the data manager via ``get_data_manager(namespace=parser)``,
    configures a logger named after the seed and the dataset, and finally
    calls ``self.start()``.
    """
    self._backend.save_start_time(self._seed)
    self._stopwatch = StopWatch()

    datamanager = get_data_manager(namespace=parser)
    self._stopwatch.start_task(datamanager.name)

    # One log file per (seed, dataset) pair inside the temporary directory.
    logger_name = 'AutoML(%d):%s' % (self._seed, datamanager.name)
    log_file = os.path.join(self._tmp_dir, '%s.log' % str(logger_name))
    setup_logger(log_file)
    self._logger = get_logger(logger_name)

    self._datamanager = datamanager
    self._dataset_name = datamanager.name
    self.start()
def __init__(self, config_space, limit, cutoff_time, metafeatures,
             output_dir, shared_model):
    """Build the scenario dictionary and initialise the parent scenario.

    :param config_space: stored under ``'cs'``.
    :param limit: stored under ``'wallclock-limit'``.
    :param cutoff_time: reduced by 35 (but never below 5) and used for both
        ``'cutoff'`` and ``'algo_runs_timelimit'``.
    :param metafeatures: stored under ``'features'``; iterating it yields the
        instance names.
    :param output_dir: stored under ``'output_dir'``.
    :param shared_model: stored under ``'shared_model'``.
    """
    self.logger = get_logger(self.__class__.__name__)

    # SMAC always gets at least five seconds.
    soft_limit = max(5, cutoff_time - 35)
    instances = [[instance_name] for instance_name in metafeatures]

    scenario_dict = {
        'cs': config_space,
        'run_obj': 'quality',
        'cutoff': soft_limit,
        'algo_runs_timelimit': soft_limit,
        'wallclock-limit': limit,
        'features': metafeatures,
        'instances': instances,
        'output_dir': output_dir,
        'shared_model': shared_model,
    }
    super(AutoMLScenario, self).__init__(scenario_dict)
def fit(self, X, y, task=MULTICLASS_CLASSIFICATION, metric='acc_metric',
        feat_type=None, dataset_name=None):
    """Validate the inputs, wrap them in an ``XYDataManager`` and fit.

    :param X: feature array; ``X.data`` is hashed when no name is given and
        ``X.shape[1]`` is compared against ``len(feat_type)``.
    :param y: targets, handed to ``XYDataManager`` unchanged.
    :param task: task constant forwarded to ``XYDataManager``.
    :param metric: metric object, or a string key of ``STRING_TO_METRIC``.
    :param feat_type: optional sequence of strings, one per column of X;
        each must equal ``categorical`` or ``numerical`` ignoring case.
    :param dataset_name: optional name; defaults to the MD5 hex digest of
        ``X.data``.
    :raises ValueError: if ``feat_type`` has the wrong length, contains a
        non-string entry or names an unknown feature type.
    :return: the result of ``self._fit``.
    """
    if dataset_name is None:
        # No name supplied: derive one from the raw data buffer.
        dataset_name = hashlib.md5(X.data).hexdigest()

    self._backend.save_start_time(self._seed)
    self._stopwatch = StopWatch()
    self._dataset_name = dataset_name
    self._stopwatch.start_task(self._dataset_name)

    # One log file per (seed, dataset) pair inside the temporary directory.
    logger_name = 'AutoML(%d):%s' % (self._seed, dataset_name)
    log_file = os.path.join(self._tmp_dir, '%s.log' % str(logger_name))
    setup_logger(log_file)
    self._logger = get_logger(logger_name)

    if isinstance(metric, str):
        metric = STRING_TO_METRIC[metric]

    if feat_type is not None:
        if len(feat_type) != X.shape[1]:
            raise ValueError('Array feat_type does not have same number of '
                             'variables as X has features. %d vs %d.' %
                             (len(feat_type), X.shape[1]))
        # The type check covers the whole sequence before any value is
        # inspected, so a non-string entry is always reported first.
        if not all(isinstance(entry, str) for entry in feat_type):
            raise ValueError('Array feat_type must only contain strings.')
        for ft in feat_type:
            if ft.lower() not in ('categorical', 'numerical'):
                raise ValueError('Only `Categorical` and `Numerical` are '
                                 'valid feature types, you passed `%s`' % ft)

    data_manager = XYDataManager(
        X,
        y,
        task=task,
        metric=metric,
        feat_type=feat_type,
        dataset_name=dataset_name,
        encode_labels=False,
    )
    return self._fit(data_manager)
def __init__(self, config_space, limit, cutoff_time, metafeatures,
             output_dir, shared_model):
    """Translate the arguments into a scenario dict for the parent class.

    The per-run limit is ``cutoff_time - 35`` with a floor of 5 and is used
    for both ``'cutoff'`` and ``'algo_runs_timelimit'``. Each key obtained by
    iterating ``metafeatures`` becomes a single-element instance list.
    """
    self.logger = get_logger(self.__class__.__name__)

    # Never hand SMAC less than five seconds per run.
    soft_limit = max(5, cutoff_time - 35)

    scenario_dict = dict([
        ('cs', config_space),
        ('run_obj', 'quality'),
        ('cutoff', soft_limit),
        ('algo_runs_timelimit', soft_limit),
        ('wallclock-limit', limit),
        ('features', metafeatures),
        ('instances', [[key] for key in metafeatures]),
        ('output_dir', output_dir),
        ('shared_model', shared_model),
    ])
    super(AutoMLScenario, self).__init__(scenario_dict)
def __init__(self, data_manager, configuration, with_predictions=False,
             all_scoring_functions=False, seed=1, output_dir=None,
             output_y_test=False, num_run=None):
    """Capture the data, configuration and output options of one evaluation.

    :param data_manager: object exposing ``data`` (``X_valid`` and ``X_test``
        are looked up with ``.get``) and ``info`` (``metric`` and ``task``
        are read by key).
    :param configuration: stored unchanged.
    :param with_predictions: stored unchanged.
    :param all_scoring_functions: stored unchanged.
    :param seed: stored unchanged.
    :param output_dir: defaults to the current working directory.
    :param output_y_test: stored unchanged.
    :param num_run: defaults to ``get_new_run_num()``.
    """
    # Taken first so it reflects the moment construction began.
    self.starttime = time.time()

    self.configuration = configuration
    self.D = data_manager
    self.X_valid = data_manager.data.get('X_valid')
    self.X_test = data_manager.data.get('X_test')
    self.metric = data_manager.info['metric']
    self.task_type = data_manager.info['task']
    self.seed = seed

    self.output_dir = os.getcwd() if output_dir is None else output_dir
    self.output_y_test = output_y_test
    self.with_predictions = with_predictions
    self.all_scoring_functions = all_scoring_functions

    # Model class and prediction method depend on the task family.
    if self.task_type not in REGRESSION_TASKS:
        self.model_class = ParamSklearnClassifier
        self.predict_function = self.predict_proba
    else:
        self.model_class = ParamSklearnRegressor
        self.predict_function = self.predict_regression

    self.num_run = get_new_run_num() if num_run is None else num_run
    self._logger = get_logger(os.path.basename(__file__))
def fit_automl_dataset(self, dataset):
    """Load a dataset from ``dataset`` and fit on it.

    The base name of ``dataset`` serves as dataset name, task name and part
    of the logger name. The data is read with ``CompetitionDataManager``
    (labels are not encoded here) and its string representation is written
    to the debug log line by line.

    :return: the result of ``self._fit``.
    """
    self._stopwatch = StopWatch()
    self._backend.save_start_time(self._seed)

    name = os.path.basename(dataset)
    self._stopwatch.start_task(name)
    self._start_task(self._stopwatch, name)
    self._dataset_name = name

    logger_name = 'AutoML(%d):%s' % (self._seed, name)
    log_file = os.path.join(self._tmp_dir, '%s.log' % str(logger_name))
    setup_logger(log_file)
    self._logger = get_logger(logger_name)

    self._logger.debug('======== Reading and converting data ==========')
    # Label encoding is deferred until after the metafeature calculation.
    loaded_data_manager = CompetitionDataManager(dataset,
                                                 encode_labels=False)
    for line in str(loaded_data_manager).split('\n'):
        self._logger.debug(line)

    return self._fit(loaded_data_manager)
def suggest_via_metalearning(meta_base, dataset_name, metric, task, sparse,
                             num_initial_configurations):
    """Return the first configurations suggested by meta-learning.

    :param meta_base: meta base; its ``configuration_space`` is also used.
    :param dataset_name: dataset to look up.
    :param metric: not used by this function.
    :param task: task constant; multilabel is treated as multiclass.
    :param sparse: not used by this function.
    :param num_initial_configurations: upper bound on returned suggestions.
    :return: at most ``num_initial_configurations`` leading suggestions.
    """
    logger = get_logger('autosklearn.metalearning.mismbo')

    # Multilabel tasks are mapped onto multiclass before the name lookup.
    if task == MULTILABEL_CLASSIFICATION:
        task = MULTICLASS_CLASSIFICATION
    task = TASK_TYPES_TO_STRING[task]
    logger.warning(task)

    started_at = time.time()
    optimizer = MetaLearningOptimizer(
        dataset_name=dataset_name,
        configuration_space=meta_base.configuration_space,
        meta_base=meta_base,
        distance='l1',
        seed=1,
    )
    logger.info('Reading meta-data took %5.2f seconds',
                time.time() - started_at)

    suggestions = optimizer.metalearning_suggest_all(
        exclude_double_configurations=True)
    return suggestions[:num_initial_configurations]
def fit_automl_dataset(self, dataset):
    """Load a dataset from ``dataset`` and fit on it.

    The base name of ``dataset`` serves as dataset name, task name and part
    of the logger name. The data is read with ``CompetitionDataManager``,
    which receives one third of ``self._ml_memory_limit`` as
    ``max_memory_in_mb``; its string representation is written to the debug
    log line by line.

    :return: the result of ``self._fit``.
    """
    self._stopwatch = StopWatch()
    self._backend.save_start_time(self._seed)

    name = os.path.basename(dataset)
    self._stopwatch.start_task(name)
    self._start_task(self._stopwatch, name)
    self._dataset_name = name

    logger_name = 'AutoML(%d):%s' % (self._seed, name)
    log_file = os.path.join(self._tmp_dir, '%s.log' % str(logger_name))
    setup_logger(log_file)
    self._logger = get_logger(logger_name)

    self._logger.debug('======== Reading and converting data ==========')
    # Label encoding is deferred until after the metafeature calculation.
    loaded_data_manager = CompetitionDataManager(
        dataset,
        encode_labels=False,
        max_memory_in_mb=float(self._ml_memory_limit) / 3,
    )
    for line in str(loaded_data_manager).split('\n'):
        self._logger.debug(line)

    return self._fit(loaded_data_manager)
def suggest_via_metalearning(
        meta_base, dataset_name, metric, task, sparse,
        num_initial_configurations):
    """Ask the meta-learning optimizer for initial configurations.

    ``metric`` and ``sparse`` are accepted but not used here. A multilabel
    task is handled as multiclass. The result is the list returned by
    ``metalearning_suggest_all`` cut down to ``num_initial_configurations``
    entries.
    """
    logger = get_logger('autosklearn.metalearning.mismbo')

    effective_task = (MULTICLASS_CLASSIFICATION
                      if task == MULTILABEL_CLASSIFICATION else task)
    task = TASK_TYPES_TO_STRING[effective_task]
    logger.warning(task)

    tic = time.time()
    ml = MetaLearningOptimizer(
        dataset_name=dataset_name,
        configuration_space=meta_base.configuration_space,
        meta_base=meta_base,
        distance='l1',
        seed=1,
    )
    logger.info('Reading meta-data took %5.2f seconds', time.time() - tic)

    all_runs = ml.metalearning_suggest_all(
        exclude_double_configurations=True)
    return all_runs[:num_initial_configurations]
def __init__(self, config_space, dataset_name, backend, total_walltime_limit,
             func_eval_time_limit, memory_limit, metric, watcher,
             start_num_run=1, data_memory_limit=None,
             num_metalearning_cfgs=25, config_file=None, seed=1,
             metadata_directory=None, resampling_strategy='holdout',
             resampling_strategy_args=None, shared_mode=False,
             include_estimators=None, exclude_estimators=None,
             include_preprocessors=None, exclude_preprocessors=None,
             disable_file_output=False, smac_scenario_args=None,
             get_smac_object_callback=None):
    """Record the optimisation settings and create the instance logger.

    Every argument is stored on the instance under its own name. The two
    time limits are converted with ``int``; a missing
    ``resampling_strategy_args`` becomes an empty dict. ``datamanager`` and
    ``task`` start out as ``None``.
    """
    super(AutoMLSMBO, self).__init__()

    # Dataset related state.
    self.dataset_name = dataset_name
    self.datamanager = None
    self.metric = metric
    self.task = None
    self.backend = backend

    # Search space.
    self.config_space = config_space

    # Evaluation strategy.
    self.resampling_strategy = resampling_strategy
    self.resampling_strategy_args = (
        {} if resampling_strategy_args is None else resampling_strategy_args
    )

    # Resource limits and bookkeeping.
    self.total_walltime_limit = int(total_walltime_limit)
    self.func_eval_time_limit = int(func_eval_time_limit)
    self.memory_limit = memory_limit
    self.data_memory_limit = data_memory_limit
    self.watcher = watcher
    self.num_metalearning_cfgs = num_metalearning_cfgs
    self.config_file = config_file
    self.seed = seed
    self.metadata_directory = metadata_directory
    self.start_num_run = start_num_run
    self.shared_mode = shared_mode

    # Pipeline component filters.
    self.include_estimators = include_estimators
    self.exclude_estimators = exclude_estimators
    self.include_preprocessors = include_preprocessors
    self.exclude_preprocessors = exclude_preprocessors

    # Output and SMAC customisation hooks.
    self.disable_file_output = disable_file_output
    self.smac_scenario_args = smac_scenario_args
    self.get_smac_object_callback = get_smac_object_callback

    # NOTE(review): the format string already ends in ':' and the suffix
    # adds another one, so named datasets yield 'Class(seed)::name'.
    suffix = "" if dataset_name is None else ":" + dataset_name
    logger_name = '%s(%d):%s' % (self.__class__.__name__, self.seed, suffix)
    self.logger = get_logger(logger_name)
def _get_logger(self, name):
    """Configure logging for ``name`` and return the matching logger.

    The logger is called ``AutoML(<seed>):<name>``; ``setup_logger`` receives
    a ``<logger name>.log`` path inside the backend's temporary directory.
    """
    logger_name = 'AutoML(%d):%s' % (self._seed, name)
    log_file = os.path.join(
        self._backend.temporary_directory,
        '%s.log' % str(logger_name),
    )
    setup_logger(log_file)
    return get_logger(logger_name)
def __init__(self, config_space, dataset_name, backend, total_walltime_limit,
             func_eval_time_limit, memory_limit, metric, write_history,
             read_history, watcher, start_num_run=1, data_memory_limit=None,
             num_metalearning_cfgs=25, config_file=None, seed=1,
             metadata_directory=None, resampling_strategy='holdout',
             resampling_strategy_args=None, shared_mode=False,
             include_estimators=None, exclude_estimators=None,
             include_preprocessors=None, exclude_preprocessors=None,
             disable_file_output=False, smac_scenario_args=None,
             get_smac_object_callback=None):
    """Record the optimisation settings and create the instance logger.

    Every argument is stored on the instance under its own name, including
    the ``write_history`` and ``read_history`` options. The two time limits
    are converted with ``int``; a missing ``resampling_strategy_args``
    becomes an empty dict. ``datamanager`` and ``task`` start out as
    ``None``.
    """
    super(AutoMLSMBO, self).__init__()

    # Run-history options.
    self.write_history = write_history
    self.read_history = read_history

    # Dataset related state.
    self.dataset_name = dataset_name
    self.datamanager = None
    self.metric = metric
    self.task = None
    self.backend = backend

    # Search space.
    self.config_space = config_space

    # Evaluation strategy.
    self.resampling_strategy = resampling_strategy
    self.resampling_strategy_args = (
        {} if resampling_strategy_args is None else resampling_strategy_args
    )

    # Resource limits and bookkeeping.
    self.total_walltime_limit = int(total_walltime_limit)
    self.func_eval_time_limit = int(func_eval_time_limit)
    self.memory_limit = memory_limit
    self.data_memory_limit = data_memory_limit
    self.watcher = watcher
    self.num_metalearning_cfgs = num_metalearning_cfgs
    self.config_file = config_file
    self.seed = seed
    self.metadata_directory = metadata_directory
    self.start_num_run = start_num_run
    self.shared_mode = shared_mode

    # Pipeline component filters.
    self.include_estimators = include_estimators
    self.exclude_estimators = exclude_estimators
    self.include_preprocessors = include_preprocessors
    self.exclude_preprocessors = exclude_preprocessors

    # Output and SMAC customisation hooks.
    self.disable_file_output = disable_file_output
    self.smac_scenario_args = smac_scenario_args
    self.get_smac_object_callback = get_smac_object_callback

    # NOTE(review): the format string already ends in ':' and the suffix
    # adds another one, so named datasets yield 'Class(seed)::name'.
    suffix = "" if dataset_name is None else ":" + dataset_name
    logger_name = '%s(%d):%s' % (self.__class__.__name__, self.seed, suffix)
    self.logger = get_logger(logger_name)
# -*- encoding: utf-8 -*- import numpy as np import sklearn.cross_validation from autosklearn.util import get_logger logger = get_logger(__name__) __all__ = [ 'split_data', 'get_CV_fold' ] def split_data(X, Y, classification=None): num_data_points = X.shape[0] num_labels = Y.shape[1] if len(Y.shape) > 1 else 1 X_train, X_valid, Y_train, Y_valid = None, None, None, None if X.shape[0] != Y.shape[0]: raise ValueError('The first dimension of the X and Y array must ' 'be equal.') # If one class only has one sample, put it into the training set if classification is True and num_labels == 1: classes, y_indices = np.unique(Y, return_inverse=True) if np.min(np.bincount(y_indices)) < 2: classes_with_one_sample = np.bincount(y_indices) < 2 sample_idxs = [] Y_old = Y indices = np.ones(Y.shape, dtype=bool)
def _get_logger(self, name):
    """Configure logging for ``name`` and return the matching logger.

    The logger is called ``AutoML(<seed>):<name>``; ``setup_logger`` receives
    a ``<logger name>.log`` path inside ``self._tmp_dir``.
    """
    logger_name = 'AutoML(%d):%s' % (self._seed, name)
    log_file = os.path.join(self._tmp_dir, '%s.log' % str(logger_name))
    setup_logger(log_file)
    return get_logger(logger_name)
def create_metalearning_string_for_smac_call(metafeatures_labels,
                                             metafeatures_encoded_labels,
                                             configuration_space,
                                             dataset_name, metric, task,
                                             sparse,
                                             num_initial_configurations,
                                             metadata_directory):
    """Build SMAC initial-configuration strings from meta-learning results.

    :param metafeatures_labels: metafeatures object; its
        ``metafeature_values`` is updated in place with the encoded ones.
    :param metafeatures_encoded_labels: metafeatures object whose
        ``metafeature_values`` are merged into ``metafeatures_labels``.
    :param configuration_space: passed to ``MetaLearningOptimizer``.
    :param dataset_name: dataset name; ``SENTINEL`` is appended before use.
    :param metric: key of ``METRIC_TO_STRING``, used for the default
        metadata directory name.
    :param task: task constant; multilabel is treated as multiclass.
    :param sparse: selects the ``sparse`` directory only when it is ``True``.
    :param num_initial_configurations: upper bound on returned strings.
    :param metadata_directory: metadata location; when ``None`` a directory
        below ``files`` next to this module is derived from metric, task and
        sparsity.
    :raises ValueError: if either metafeatures argument is ``None``.
    :return: list of configuration strings in SMAC CLI format.
    """
    logger = get_logger('autosklearn.metalearning.mismbo')

    # Multilabel tasks are mapped onto multiclass before the name lookup.
    if task == MULTILABEL_CLASSIFICATION:
        task = MULTICLASS_CLASSIFICATION
    task = TASK_TYPES_TO_STRING[task]

    if metafeatures_encoded_labels is None or metafeatures_labels is None:
        raise ValueError('Please call '
                         'calculate_metafeatures_encoded_labels and '
                         'calculate_metafeatures_with_labels first!')
    logger.warning(task)

    current_directory = os.path.dirname(__file__)
    if metadata_directory is None:
        density = 'sparse' if sparse is True else 'dense'
        folder_name = '%s_%s_%s' % (METRIC_TO_STRING[metric], task, density)
        metadata_directory = os.path.join(
            current_directory, 'files', folder_name)
    logger.warning(metadata_directory)

    # Merge both metafeature sets; this mutates the caller's object.
    mf = metafeatures_labels
    mf.metafeature_values.update(
        metafeatures_encoded_labels.metafeature_values)

    # NOTE(review): ``subsets['all']`` is modified in place, so the excluded
    # features stay removed from the shared set for later users -- confirm
    # this is intended before changing it.
    metafeatures_subset = subsets['all']
    metafeatures_subset.difference_update(EXCLUDE_META_FUTURES)
    metafeatures_subset = list(metafeatures_subset)

    started_at = time.time()
    ml = MetaLearningOptimizer(
        dataset_name=dataset_name + SENTINEL,
        configuration_space=configuration_space,
        aslib_directory=metadata_directory,
        distance='l1',
        seed=1,
        use_features=metafeatures_subset,
        subset='all',
    )
    logger.info('Reading meta-data took %5.2f seconds',
                time.time() - started_at)

    # TODO: registering the new dataset directly on the meta base is a
    # workaround the original author wanted to replace.
    ml.meta_base.add_dataset(dataset_name + SENTINEL, mf)
    runs = ml.metalearning_suggest_all(exclude_double_configurations=True)

    # Convert the leading suggestions into the SMAC CLI format.
    return [convert_conf2smac_string(configuration)
            for configuration in runs[:num_initial_configurations]]
def test_fail_if_dummy_prediction_fails(self, ta_run_mock):
    """Check how ``_do_dummy_prediction`` reacts to the mocked run status.

    ``ta_run_mock`` stands in for the target algorithm's ``run``; its return
    value is a ``(status, cost, runtime, additional info)`` tuple. A SUCCESS
    status must not raise, every other status tested here must raise
    ``ValueError`` mentioning the additional info.
    """
    backend_api = self._create_backend(
        'test_fail_if_dummy_prediction_fails')
    dataset = os.path.join(self.test_dir, '..', '.data', '401_bac')

    time_for_this_task = 30
    per_run_time = 10
    auto = autosklearn.automl.AutoML(
        backend_api,
        time_for_this_task,
        per_run_time,
        initial_configurations_via_metalearning=25,
    )
    setup_logger()
    auto._logger = get_logger('test_fail_if_dummy_prediction_fails')
    auto._backend._make_internals_directory()
    D = load_data(dataset, backend_api)
    auto._backend.save_datamanager(D)

    # First make sure the mocked run is really invoked, with the full task
    # time as cutoff.
    ta_run_mock.return_value = StatusType.SUCCESS, None, None, "test"
    auto._do_dummy_prediction(D, 1)
    ta_run_mock.assert_called_once_with(1, cutoff=time_for_this_task)

    # Case 1: a successful status must not produce a ValueError.
    ta_run_mock.return_value = StatusType.SUCCESS, None, None, "test"
    try:
        auto._do_dummy_prediction(D, 1)
    except ValueError:
        raised = True
    else:
        raised = False
    self.assertFalse(raised, 'Exception raised')

    # Case 2: every non-success status must raise.
    failing_statuses = (
        StatusType.CRASHED,
        StatusType.ABORT,
        StatusType.TIMEOUT,
        StatusType.MEMOUT,
        StatusType.CAPPED,
    )
    for status in failing_statuses:
        ta_run_mock.return_value = status, None, None, "test"
        self.assertRaisesRegex(
            ValueError,
            'Dummy prediction failed: test',
            auto._do_dummy_prediction, D, 1,
        )

    self._tearDown(backend_api.temporary_directory)
    self._tearDown(backend_api.output_directory)
def get_automl_logger(log_dir, basename, seed):
    """Return this module's logger with an extra file handler attached.

    :param log_dir: directory that receives the log file.
    :param basename: inserted into the log file name.
    :param seed: integer inserted into the log file name.
    :return: the logger named after this file's base name; the handler
        writes to ``AutoML_<basename>_<seed>.log`` inside ``log_dir``.
    """
    logger = get_logger(os.path.basename(__file__))

    run_label = 'AutoML_%s_%d' % (basename, seed)
    logger_file = os.path.join(log_dir, '%s.log' % str(run_label))
    add_file_handler(logger, logger_file)
    return logger
import autosklearn.metalearning from autosklearn.constants import * from autosklearn.metalearning.mismbo import suggest_via_metalearning from autosklearn.data.abstract_data_manager import AbstractDataManager from autosklearn.evaluation import ExecuteTaFuncWithQueue, WORST_POSSIBLE_RESULT from autosklearn.util import get_logger from autosklearn.metalearning.metalearning.meta_base import MetaBase from autosklearn.metalearning.metafeatures.metafeatures import \ calculate_all_metafeatures_with_labels, calculate_all_metafeatures_encoded_labels from amb.data.profiler import DataProfiler from amb.settings import settings, get_logger logger = get_logger(__name__) EXCLUDE_META_FEATURES_CLASSIFICATION = { 'Landmark1NN', 'LandmarkDecisionNodeLearner', 'LandmarkDecisionTree', 'LandmarkLDA', 'LandmarkNaiveBayes', 'PCAFractionOfComponentsFor95PercentVariance', 'PCAKurtosisFirstPC', 'PCASkewnessFirstPC', 'PCA' } """ Same as above except these are added: 'NumberOfClasses',
import os import time from StringIO import StringIO import numpy as np from pyMetaLearn.metafeatures.metafeatures import \ calculate_all_metafeatures_with_labels, \ calculate_all_metafeatures_encoded_labels, subsets from pyMetaLearn.optimizers.metalearn_optimizer.metalearner import \ MetaLearningOptimizer from autosklearn.util import get_logger logger = get_logger(os.path.basename(__file__)) SENTINEL = 'uiaeo' EXCLUDE_META_FUTURES = { 'Landmark1NN', 'LandmarkDecisionNodeLearner', 'LandmarkDecisionTree', 'LandmarkLDA', 'LandmarkNaiveBayes', 'PCAFractionOfComponentsFor95PercentVariance', 'PCAKurtosisFirstPC', 'PCASkewnessFirstPC' } def calc_meta_features(X_train, Y_train, categorical, dataset_name):
def create_metalearning_string_for_smac_call(
        metafeatures_labels, metafeatures_encoded_labels,
        configuration_space, dataset_name, metric, task, sparse,
        num_initial_configurations, metadata_directory):
    """Build SMAC initial-configuration strings from meta-learning results.

    :param metafeatures_labels: metafeatures object; its
        ``metafeature_values`` is updated in place with the encoded ones.
    :param metafeatures_encoded_labels: metafeatures object whose
        ``metafeature_values`` are merged into ``metafeatures_labels``.
    :param configuration_space: passed to ``MetaLearningOptimizer``.
    :param dataset_name: dataset name; ``SENTINEL`` is appended before use.
    :param metric: key of ``METRIC_TO_STRING``, used for the default
        metadata directory name.
    :param task: key of ``TASK_TYPES_TO_STRING``.
    :param sparse: selects the ``sparse`` directory only when it is ``True``.
    :param num_initial_configurations: upper bound on returned strings.
    :param metadata_directory: metadata location; when ``None`` a directory
        below ``files`` next to this module is derived from metric, task and
        sparsity.
    :raises ValueError: if either metafeatures argument is ``None``.
    :return: list of configuration strings in SMAC CLI format.
    """
    logger = get_logger('autosklearn.metalearning.mismbo')
    task = TASK_TYPES_TO_STRING[task]

    if metafeatures_encoded_labels is None or metafeatures_labels is None:
        raise ValueError('Please call '
                         'calculate_metafeatures_encoded_labels and '
                         'calculate_metafeatures_with_labels first!')

    current_directory = os.path.dirname(__file__)
    if metadata_directory is None:
        density = 'sparse' if sparse is True else 'dense'
        folder_name = '%s_%s_%s' % (METRIC_TO_STRING[metric], task, density)
        metadata_directory = os.path.join(
            current_directory, 'files', folder_name)

    # Merge both metafeature sets; this mutates the caller's object.
    mf = metafeatures_labels
    mf.metafeature_values.update(
        metafeatures_encoded_labels.metafeature_values)

    # NOTE(review): ``subsets['all']`` is modified in place, so the excluded
    # features stay removed from the shared set for later users -- confirm
    # this is intended before changing it.
    metafeatures_subset = subsets['all']
    metafeatures_subset.difference_update(EXCLUDE_META_FUTURES)
    metafeatures_subset = list(metafeatures_subset)

    started_at = time.time()
    ml = MetaLearningOptimizer(
        dataset_name=dataset_name + SENTINEL,
        configuration_space=configuration_space,
        aslib_directory=metadata_directory,
        distance='l1',
        seed=1,
        use_features=metafeatures_subset,
        subset='all',
    )
    logger.info('Reading meta-data took %5.2f seconds',
                time.time() - started_at)

    # TODO: registering the new dataset directly on the meta base is a
    # workaround the original author wanted to replace.
    ml.meta_base.add_dataset(dataset_name + SENTINEL, mf)
    runs = ml.metalearning_suggest_all(exclude_double_configurations=True)

    # Convert the leading suggestions into the SMAC CLI format.
    return [convert_conf2smac_string(configuration)
            for configuration in runs[:num_initial_configurations]]