def test_choose_next(self):
    configspace = ConfigurationSpace()
    configspace.add_hyperparameter(UniformFloatHyperparameter('a', 0, 1))
    configspace.add_hyperparameter(UniformFloatHyperparameter('b', 0, 1))

    dataset_name = 'foo'
    func_eval_time_limit = 15
    total_walltime_limit = 15
    memory_limit = 3000

    auto = AutoMLSMBO(None, dataset_name, None, func_eval_time_limit,
                      total_walltime_limit, memory_limit, None)
    auto.config_space = configspace

    scenario = Scenario({'cs': configspace,
                         'cutoff-time': func_eval_time_limit,
                         'wallclock-limit': total_walltime_limit,
                         'memory-limit': memory_limit,
                         'run-obj': 'quality'})
    smac = SMAC(scenario)

    self.assertRaisesRegex(ValueError,
                           'Cannot use SMBO algorithm on empty runhistory',
                           auto.choose_next, smac)

    runhistory = smac.solver.runhistory
    runhistory.add(config=Configuration(configspace,
                                        values={'a': 0.1, 'b': 0.2}),
                   cost=0.5, time=0.5, status=StatusType.SUCCESS)

    auto.choose_next(smac)
def test_choose_next(self):
    configspace = ConfigurationSpace()
    configspace.add_hyperparameter(UniformFloatHyperparameter('a', 0, 1))
    configspace.add_hyperparameter(UniformFloatHyperparameter('b', 0, 1))

    dataset_name = 'foo'
    func_eval_time_limit = 15
    total_walltime_limit = 15
    memory_limit = 3072

    auto = AutoMLSMBO(config_space=None,
                      dataset_name=dataset_name,
                      backend=None,
                      func_eval_time_limit=func_eval_time_limit,
                      total_walltime_limit=total_walltime_limit,
                      memory_limit=memory_limit,
                      watcher=None,
                      metric=accuracy)
    auto.config_space = configspace

    scenario = Scenario({
        'cs': configspace,
        'cutoff_time': func_eval_time_limit,
        'wallclock_limit': total_walltime_limit,
        'memory_limit': memory_limit,
        'run_obj': 'quality',
    })
    smac = SMAC(scenario)

    self.assertRaisesRegex(
        ValueError,
        'Cannot use SMBO algorithm on empty runhistory',
        auto.choose_next, smac)

    config = Configuration(configspace, values={'a': 0.1, 'b': 0.2})
    # TODO make sure the incumbent is always set?
    smac.solver.incumbent = config
    runhistory = smac.solver.runhistory
    runhistory.add(config=config, cost=0.5, time=0.5,
                   status=StatusType.SUCCESS)

    auto.choose_next(smac)
def test_smbo_metalearning_configurations(backend, context, dask_client):
    # Get the inputs to the optimizer
    X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')
    config_space = AutoML(
        backend=backend,
        metric=autosklearn.metrics.accuracy,
        time_left_for_this_task=20,
        per_run_time_limit=5,
    ).fit(
        X_train, Y_train,
        task=BINARY_CLASSIFICATION,
        only_return_configuration_space=True,
    )
    watcher = StopWatch()

    # Create an optimizer
    smbo = AutoMLSMBO(
        config_space=config_space,
        dataset_name='iris',
        backend=backend,
        total_walltime_limit=10,
        func_eval_time_limit=5,
        memory_limit=4096,
        metric=autosklearn.metrics.accuracy,
        watcher=watcher,
        n_jobs=1,
        dask_client=dask_client,
        port=logging.handlers.DEFAULT_TCP_LOGGING_PORT,
        start_num_run=1,
        data_memory_limit=None,
        num_metalearning_cfgs=25,
        pynisher_context=context,
    )
    assert smbo.pynisher_context == context

    # Create the inputs to metalearning
    datamanager = XYDataManager(
        X_train, Y_train, X_test, Y_test,
        task=BINARY_CLASSIFICATION,
        dataset_name='iris',
        feat_type={i: 'numerical' for i in range(X_train.shape[1])},
    )
    backend.save_datamanager(datamanager)
    smbo.task = BINARY_CLASSIFICATION
    smbo.reset_data_manager()
    metalearning_configurations = smbo.get_metalearning_suggestions()

    # We should have 25 metalearning configurations
    assert len(metalearning_configurations) == 25
    assert all(isinstance(config, Configuration)
               for config in metalearning_configurations)
def get_meta_learning_configs(X, y, task_type, dataset_name='default',
                              metric='accuracy', num_cfgs=5):
    if X is None or y is None:
        X, y, _ = load_data(dataset_name)

    backend = create(temporary_directory=None,
                     output_directory=None,
                     delete_tmp_folder_after_terminate=False,
                     delete_output_folder_after_terminate=False,
                     shared_mode=True)
    dm = XYDataManager(X, y, None, None, task_type, None, dataset_name)

    configuration_space = pipeline.get_configuration_space(
        dm.info,
        include_estimators=None,
        exclude_estimators=None,
        include_preprocessors=None,
        exclude_preprocessors=None)

    watcher = StopWatch()
    name = os.path.basename(dm.name)
    watcher.start_task(name)

    def reset_data_manager(max_mem=None):
        pass

    automlsmbo = AutoMLSMBO(
        config_space=configuration_space,
        dataset_name=dataset_name,
        backend=backend,
        total_walltime_limit=1e5,
        func_eval_time_limit=1e5,
        memory_limit=1e5,
        metric=metric,
        watcher=watcher,
        metadata_directory='components/meta_learning/meta_resource',
        num_metalearning_cfgs=num_cfgs,
    )
    automlsmbo.reset_data_manager = reset_data_manager
    automlsmbo.task = task_type
    automlsmbo.datamanager = dm
    configs = automlsmbo.get_metalearning_suggestions()
    return configs
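# Illustrative only: a minimal sketch of calling get_meta_learning_configs on a
# small sklearn dataset. The sklearn import, the MULTICLASS_CLASSIFICATION task
# constant, and the chosen argument values are assumptions for illustration,
# not part of the helper above.
from sklearn.datasets import load_iris

iris = load_iris()
suggested = get_meta_learning_configs(
    X=iris.data,
    y=iris.target,
    task_type=MULTICLASS_CLASSIFICATION,  # assumed task constant from the project
    dataset_name='iris',
    metric='accuracy',
    num_cfgs=5,
)
# Each suggestion is expected to be a Configuration from the pipeline config space.
for config in suggested:
    print(config)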
def _fit(self, datamanager, metric):
    # Reset learnt stuff
    self.models_ = None
    self.ensemble_ = None

    # Check arguments prior to doing anything!
    if not isinstance(self._disable_evaluator_output, (bool, list)):
        raise ValueError('disable_evaluator_output must be of type bool '
                         'or list.')
    if isinstance(self._disable_evaluator_output, list):
        allowed_elements = ['model', 'y_optimization']
        for element in self._disable_evaluator_output:
            if element not in allowed_elements:
                raise ValueError("List member '%s' for argument "
                                 "'disable_evaluator_output' must be one "
                                 "of " + str(allowed_elements))
    if self._resampling_strategy not in [
            'holdout', 'holdout-iterative-fit', 'cv',
            'partial-cv', 'partial-cv-iterative-fit']:
        raise ValueError('Illegal resampling strategy: %s'
                         % self._resampling_strategy)
    if self._resampling_strategy in ['partial-cv', 'partial-cv-iterative-fit'] \
            and self._ensemble_size != 0:
        raise ValueError("Resampling strategy %s cannot be used "
                         "together with ensembles."
                         % self._resampling_strategy)
    if self._resampling_strategy in ['partial-cv', 'cv',
                                     'partial-cv-iterative-fit'] and \
            'folds' not in self._resampling_strategy_arguments:
        self._resampling_strategy_arguments['folds'] = 5

    self._backend._make_internals_directory()
    if self._keep_models:
        try:
            os.makedirs(self._backend.get_model_dir())
        except (OSError, FileExistsError) as e:
            if not self._shared_mode:
                raise

    self._metric = metric
    self._task = datamanager.info['task']
    self._label_num = datamanager.info['label_num']

    # == Pickle the data manager to speed up loading
    data_manager_path = self._backend.save_datamanager(datamanager)

    time_for_load_data = self._stopwatch.wall_elapsed(self._dataset_name)
    if self._debug_mode:
        self._print_load_time(
            self._dataset_name,
            self._time_for_task,
            time_for_load_data,
            self._logger)

    # == Perform dummy predictions
    num_run = 1
    # if self._resampling_strategy in ['holdout', 'holdout-iterative-fit']:
    num_run = self._do_dummy_prediction(datamanager, num_run)

    # = Create a searchspace
    # Do this before One Hot Encoding to make sure that it creates a
    # search space for a dense classifier even if one hot encoding would
    # make it sparse (tradeoff; if one hot encoding would make it sparse,
    # densifier and truncatedSVD would probably lead to a MemoryError,
    # like this we can't use some of the preprocessing methods in case
    # the data became sparse)
    self.configuration_space, configspace_path = self._create_search_space(
        self._backend.temporary_directory,
        self._backend,
        datamanager,
        include_estimators=self._include_estimators,
        exclude_estimators=self._exclude_estimators,
        include_preprocessors=self._include_preprocessors,
        exclude_preprocessors=self._exclude_preprocessors)

    # == RUN ensemble builder
    # Do this before calculating the meta-features to make sure that the
    # dummy predictions are actually included in the ensemble even if
    # calculating the meta-features takes very long
    ensemble_task_name = 'runEnsemble'
    self._stopwatch.start_task(ensemble_task_name)
    time_left_for_ensembles = max(
        0, self._time_for_task - self._stopwatch.wall_elapsed(self._dataset_name))
    if self._logger:
        self._logger.info('Start Ensemble with %5.2fsec time left'
                          % time_left_for_ensembles)
    if time_left_for_ensembles <= 0:
        self._logger.warning("Not starting ensemble builder because there "
                             "is no time left!")
        self._proc_ensemble = None
    else:
        self._proc_ensemble = self._get_ensemble_process(
            time_left_for_ensembles)
        if self._ensemble_size > 0:
            self._proc_ensemble.start()
        else:
            self._logger.info('Not starting ensemble builder because '
                              'ensemble size is <= 0.')
    self._stopwatch.stop_task(ensemble_task_name)

    # kill the datamanager as it will be re-loaded anyways from sub processes
    try:
        del self._datamanager
    except Exception:
        pass

    # => RUN SMAC
    smac_task_name = 'runSMAC'
    self._stopwatch.start_task(smac_task_name)
    time_left_for_smac = max(
        0, self._time_for_task - self._stopwatch.wall_elapsed(self._dataset_name))

    if self._logger:
        self._logger.info('Start SMAC with %5.2fsec time left'
                          % time_left_for_smac)
    if time_left_for_smac <= 0:
        self._logger.warning("Not starting SMAC because there is no time "
                             "left.")
        _proc_smac = None
    else:
        if self._per_run_time_limit is None or \
                self._per_run_time_limit > time_left_for_smac:
            print('Time limit for a single run is higher than total time '
                  'limit. Capping the limit for a single run to the total '
                  'time given to SMAC (%f)' % time_left_for_smac)
            per_run_time_limit = time_left_for_smac
        else:
            per_run_time_limit = self._per_run_time_limit

        _proc_smac = AutoMLSMBO(
            config_space=self.configuration_space,
            dataset_name=self._dataset_name,
            backend=self._backend,
            total_walltime_limit=time_left_for_smac,
            func_eval_time_limit=per_run_time_limit,
            memory_limit=self._ml_memory_limit,
            data_memory_limit=self._data_memory_limit,
            watcher=self._stopwatch,
            start_num_run=num_run,
            num_metalearning_cfgs=self._initial_configurations_via_metalearning,
            config_file=configspace_path,
            seed=self._seed,
            metadata_directory=self._metadata_directory,
            metric=self._metric,
            resampling_strategy=self._resampling_strategy,
            resampling_strategy_args=self._resampling_strategy_arguments,
            shared_mode=self._shared_mode,
            include_estimators=self._include_estimators,
            exclude_estimators=self._exclude_estimators,
            include_preprocessors=self._include_preprocessors,
            exclude_preprocessors=self._exclude_preprocessors,
            disable_file_output=self._disable_evaluator_output,
            get_smac_object_callback=self._get_smac_object_callback,
            smac_scenario_args=self._smac_scenario_args,
        )
        self.runhistory_, self.trajectory_ = _proc_smac.run_smbo()
        trajectory_filename = os.path.join(
            self._backend.get_smac_output_directory_for_run(self._seed),
            'trajectory.json')
        saveable_trajectory = \
            [list(entry[:2]) + [entry[2].get_dictionary()] + list(entry[3:])
             for entry in self.trajectory_]
        with open(trajectory_filename, 'w') as fh:
            json.dump(saveable_trajectory, fh)

    # Wait until the ensemble process is finished to avoid shutting down
    # while the ensemble builder tries to access the data
    if self._proc_ensemble is not None and self._ensemble_size > 0:
        self._proc_ensemble.join()

    self._proc_ensemble = None
    self._load_models()

    return self
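# Illustrative only: the trajectory entries written above mix plain values with a
# Configuration object at index 2, which is why they are flattened before
# json.dump. A round-trip sketch of that serialization step; the helper names
# save_trajectory/load_trajectory and the `configspace` argument are hypothetical,
# not part of the method above.
import json

from ConfigSpace.configuration_space import Configuration


def save_trajectory(trajectory, path):
    # Replace the Configuration in each entry by its plain value dictionary.
    serializable = [list(entry[:2]) + [entry[2].get_dictionary()] + list(entry[3:])
                    for entry in trajectory]
    with open(path, 'w') as fh:
        json.dump(serializable, fh)


def load_trajectory(path, configspace):
    # Rebuild Configuration objects from the stored value dictionaries.
    with open(path) as fh:
        entries = json.load(fh)
    return [entry[:2] + [Configuration(configspace, values=entry[2])] + entry[3:]
            for entry in entries]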
class AutoML(BaseEstimator, multiprocessing.Process): def __init__(self, tmp_dir, output_dir, time_left_for_this_task, per_run_time_limit, log_dir=None, initial_configurations_via_metalearning=25, ensemble_size=1, ensemble_nbest=1, seed=1, ml_memory_limit=3000, metadata_directory=None, queue=None, keep_models=True, debug_mode=False, include_estimators=None, include_preprocessors=None, resampling_strategy='holdout-iterative-fit', resampling_strategy_arguments=None, delete_tmp_folder_after_terminate=False, delete_output_folder_after_terminate=False, shared_mode=False, precision=32, max_iter_smac=None, acquisition_function='EI'): super(AutoML, self).__init__() self._tmp_dir = tmp_dir self._output_dir = output_dir self._time_for_task = time_left_for_this_task self._per_run_time_limit = per_run_time_limit self._log_dir = log_dir if log_dir is not None else self._tmp_dir self._initial_configurations_via_metalearning = \ initial_configurations_via_metalearning self._ensemble_size = ensemble_size self._ensemble_nbest = ensemble_nbest self._seed = seed self._ml_memory_limit = ml_memory_limit self._data_memory_limit = None self._metadata_directory = metadata_directory self._queue = queue self._keep_models = keep_models self._include_estimators = include_estimators self._include_preprocessors = include_preprocessors self._resampling_strategy = resampling_strategy self._resampling_strategy_arguments = resampling_strategy_arguments self._max_iter_smac = max_iter_smac self.delete_tmp_folder_after_terminate = \ delete_tmp_folder_after_terminate self.delete_output_folder_after_terminate = \ delete_output_folder_after_terminate self._shared_mode = shared_mode self.precision = precision self.acquisition_function = acquisition_function self._datamanager = None self._dataset_name = None self._stopwatch = StopWatch() self._logger = None self._task = None self._metric = None self._label_num = None self._parser = None self.models_ = None self.ensemble_ = None self._can_predict = False self._debug_mode = debug_mode if not isinstance(self._time_for_task, int): raise ValueError("time_left_for_this_task not of type integer, " "but %s" % str(type(self._time_for_task))) if not isinstance(self._per_run_time_limit, int): raise ValueError("per_run_time_limit not of type integer, but %s" % str(type(self._per_run_time_limit))) # After assignging and checking variables... 
self._backend = Backend(self._output_dir, self._tmp_dir) def start_automl(self, parser): self._parser = parser self.start() def start(self): if self._parser is None: raise ValueError('You must invoke start() only via start_automl()') super(AutoML, self).start() def run(self): if self._parser is None: raise ValueError('You must invoke run() only via start_automl()') self._backend.save_start_time(self._seed) self._stopwatch = StopWatch() datamanager = get_data_manager(namespace=self._parser) self._stopwatch.start_task(datamanager.name) self._logger = self._get_logger(datamanager.name) self._datamanager = datamanager self._dataset_name = datamanager.name self._fit(self._datamanager) def fit(self, X, y, task=MULTICLASS_CLASSIFICATION, metric='acc_metric', feat_type=None, dataset_name=None): if dataset_name is None: m = hashlib.md5() m.update(X.data) dataset_name = m.hexdigest() self._backend.save_start_time(self._seed) self._stopwatch = StopWatch() self._dataset_name = dataset_name self._stopwatch.start_task(self._dataset_name) self._logger = self._get_logger(dataset_name) if isinstance(metric, str): metric = STRING_TO_METRIC[metric] if feat_type is not None and len(feat_type) != X.shape[1]: raise ValueError('Array feat_type does not have same number of ' 'variables as X has features. %d vs %d.' % (len(feat_type), X.shape[1])) if feat_type is not None and not all( [isinstance(f, str) for f in feat_type]): raise ValueError('Array feat_type must only contain strings.') if feat_type is not None: for ft in feat_type: if ft.lower() not in ['categorical', 'numerical']: raise ValueError('Only `Categorical` and `Numerical` are ' 'valid feature types, you passed `%s`' % ft) self._data_memory_limit = None loaded_data_manager = XYDataManager(X, y, task=task, metric=metric, feat_type=feat_type, dataset_name=dataset_name, encode_labels=False) return self._fit(loaded_data_manager) def fit_automl_dataset(self, dataset): self._stopwatch = StopWatch() self._backend.save_start_time(self._seed) name = os.path.basename(dataset) self._stopwatch.start_task(name) self._start_task(self._stopwatch, name) self._dataset_name = name self._logger = self._get_logger(name) self._logger.debug('======== Reading and converting data ==========') # Encoding the labels will be done after the metafeature calculation! self._data_memory_limit = float(self._ml_memory_limit) / 3 loaded_data_manager = CompetitionDataManager( dataset, encode_labels=False, max_memory_in_mb=self._data_memory_limit) loaded_data_manager_str = str(loaded_data_manager).split('\n') for part in loaded_data_manager_str: self._logger.debug(part) return self._fit(loaded_data_manager) def _get_logger(self, name): logger_name = 'AutoML(%d):%s' % (self._seed, name) setup_logger(os.path.join(self._tmp_dir, '%s.log' % str(logger_name))) return get_logger(logger_name) @staticmethod def _start_task(watcher, task_name): watcher.start_task(task_name) @staticmethod def _stop_task(watcher, task_name): watcher.stop_task(task_name) @staticmethod def _print_load_time(basename, time_left_for_this_task, time_for_load_data, logger): time_left_after_reading = max( 0, time_left_for_this_task - time_for_load_data) logger.info('Remaining time after reading %s %5.2f sec' % (basename, time_left_after_reading)) return time_for_load_data def _do_dummy_prediction(self, datamanager, num_run): self._logger.info("Starting to create dummy predictions.") time_limit = int(self._time_for_task / 6.) 
memory_limit = int(self._ml_memory_limit) _info = eval_with_limits(datamanager, self._tmp_dir, 1, self._seed, num_run, self._resampling_strategy, self._resampling_strategy_arguments, memory_limit, time_limit) if _info[4] == StatusType.SUCCESS: self._logger.info("Finished creating dummy prediction 1/2.") else: self._logger.error('Error creating dummy prediction 1/2:%s ', _info[3]) num_run += 1 _info = eval_with_limits(datamanager, self._tmp_dir, 2, self._seed, num_run, self._resampling_strategy, self._resampling_strategy_arguments, memory_limit, time_limit) if _info[4] == StatusType.SUCCESS: self._logger.info("Finished creating dummy prediction 2/2.") else: self._logger.error('Error creating dummy prediction 2/2 %s', _info[3]) num_run += 1 return num_run def _fit(self, datamanager): # Reset learnt stuff self.models_ = None self.ensemble_ = None # Check arguments prior to doing anything! if self._resampling_strategy not in [ 'holdout', 'holdout-iterative-fit', 'cv', 'nested-cv', 'partial-cv' ]: raise ValueError('Illegal resampling strategy: %s' % self._resampling_strategy) if self._resampling_strategy == 'partial-cv' and \ self._ensemble_size != 0: raise ValueError("Resampling strategy partial-cv cannot be used " "together with ensembles.") acquisition_functions = ['EI', 'EIPS'] if self.acquisition_function not in acquisition_functions: raise ValueError( 'Illegal acquisition %s: Must be one of %s.' % (self.acquisition_function, acquisition_functions)) self._backend._make_internals_directory() if self._keep_models: try: os.mkdir(self._backend.get_model_dir()) except OSError: self._logger.warning("model directory already exists") if not self._shared_mode: raise self._metric = datamanager.info['metric'] self._task = datamanager.info['task'] self._label_num = datamanager.info['label_num'] # == Pickle the data manager to speed up loading data_manager_path = self._backend.save_datamanager(datamanager) self._save_ensemble_data(datamanager.data['X_train'], datamanager.data['Y_train']) time_for_load_data = self._stopwatch.wall_elapsed(self._dataset_name) if self._debug_mode: self._print_load_time(self._dataset_name, self._time_for_task, time_for_load_data, self._logger) # == Perform dummy predictions num_run = 1 #if self._resampling_strategy in ['holdout', 'holdout-iterative-fit']: num_run = self._do_dummy_prediction(datamanager, num_run) # = Create a searchspace # Do this before One Hot Encoding to make sure that it creates a # search space for a dense classifier even if one hot encoding would # make it sparse (tradeoff; if one hot encoding would make it sparse, # densifier and truncatedSVD would probably lead to a MemoryError, # like this we can't use some of the preprocessing methods in case # the data became sparse) self.configuration_space, configspace_path = self._create_search_space( self._tmp_dir, self._backend, datamanager, self._include_estimators, self._include_preprocessors) # == RUN ensemble builder # Do this before calculating the meta-features to make sure that the # dummy predictions are actually included in the ensemble even if # calculating the meta-features takes very long ensemble_task_name = 'runEnsemble' self._stopwatch.start_task(ensemble_task_name) time_left_for_ensembles = max(0,self._time_for_task \ - self._stopwatch.wall_elapsed(self._dataset_name)) if self._logger: self._logger.info('Start Ensemble with %5.2fsec time left' % time_left_for_ensembles) if time_left_for_ensembles <= 0: self._logger.warning("Not starting ensemble builder because there " "is no time left!") 
self._proc_ensemble = None else: self._proc_ensemble = self._get_ensemble_process( time_left_for_ensembles) self._proc_ensemble.start() self._stopwatch.stop_task(ensemble_task_name) # == RUN SMBO # kill the datamanager as it will be re-loaded anyways from sub processes try: del self._datamanager except Exception: pass # => RUN SMAC smac_task_name = 'runSMAC' self._stopwatch.start_task(smac_task_name) time_left_for_smac = max( 0, self._time_for_task - self._stopwatch.wall_elapsed(self._dataset_name)) if self._logger: self._logger.info('Start SMAC with %5.2fsec time left' % time_left_for_smac) if time_left_for_smac <= 0: self._logger.warning("Not starting SMAC because there is no time " "left.") self._procsmac = None else: self._proc_smac = AutoMLSMBO( config_space=self.configuration_space, dataset_name=self._dataset_name, tmp_dir=self._tmp_dir, output_dir=self._output_dir, total_walltime_limit=time_left_for_smac, func_eval_time_limit=self._per_run_time_limit, memory_limit=self._ml_memory_limit, data_memory_limit=self._data_memory_limit, watcher=self._stopwatch, start_num_run=num_run, num_metalearning_cfgs=self. _initial_configurations_via_metalearning, config_file=configspace_path, smac_iters=self._max_iter_smac, seed=self._seed, metadata_directory=self._metadata_directory, resampling_strategy=self._resampling_strategy, resampling_strategy_args=self._resampling_strategy_arguments, acquisition_function=self.acquisition_function, shared_mode=self._shared_mode) self._proc_smac.start() psutil_procs = [] procs = [] if self._proc_smac is not None: proc_smac = psutil.Process(self._proc_smac.pid) psutil_procs.append(proc_smac) procs.append(self._proc_smac) if self._proc_ensemble is not None: proc_ensemble = psutil.Process(self._proc_ensemble.pid) psutil_procs.append(proc_ensemble) procs.append(self._proc_ensemble) if self._queue is not None: self._queue.put( [time_for_load_data, data_manager_path, psutil_procs]) else: for proc in procs: proc.join() if self._queue is None: self._load_models() return self def refit(self, X, y): if self._keep_models is not True: raise ValueError( "Predict can only be called if 'keep_models==True'") if self.models_ is None or len(self.models_) == 0 or \ self.ensemble_ is None: self._load_models() for identifier in self.models_: if identifier in self.ensemble_.get_model_identifiers(): model = self.models_[identifier] # this updates the model inplace, it can then later be used in # predict method model.fit(X.copy(), y.copy()) self._can_predict = True def predict(self, X): return np.argmax(self.predict_proba(X), axis=1) def predict_proba(self, X): if self._keep_models is not True: raise ValueError( "Predict can only be called if 'keep_models==True'") if not self._can_predict and \ self._resampling_strategy not in \ ['holdout', 'holdout-iterative-fit']: raise NotImplementedError( 'Predict is currently only implemented for resampling ' 'strategy holdout.') if self.models_ is None or len(self.models_) == 0 or \ self.ensemble_ is None: self._load_models() all_predictions = [] for identifier in self.ensemble_.get_model_identifiers(): model = self.models_[identifier] X_ = X.copy() if self._task in REGRESSION_TASKS: prediction = model.predict(X_) else: prediction = model.predict_proba(X_) if len(prediction.shape) < 1 or len(X_.shape) < 1 or \ X_.shape[0] < 1 or prediction.shape[0] != X_.shape[0]: self._logger.warning( "Prediction shape for model %s is %s " "while X_.shape is %s" % (model, str(prediction.shape), str(X_.shape))) all_predictions.append(prediction) if 
len(all_predictions) == 0: raise ValueError( 'Something went wrong generating the predictions. ' 'The ensemble should consist of the following ' 'models: %s, the following models were loaded: ' '%s' % (str(list(self.ensemble_indices_.keys())), str(list(self.models_.keys())))) predictions = self.ensemble_.predict(all_predictions) return predictions def fit_ensemble(self, task=None, metric=None, precision='32', dataset_name=None, ensemble_nbest=None, ensemble_size=None): if self._logger is None: self._logger = self._get_logger(dataset_name) self._proc_ensemble = self._get_ensemble_process( 1, task, metric, precision, dataset_name, max_iterations=1, ensemble_nbest=ensemble_nbest, ensemble_size=ensemble_size) self._proc_ensemble.main() def _get_ensemble_process(self, time_left_for_ensembles, task=None, metric=None, precision=None, dataset_name=None, max_iterations=-1, ensemble_nbest=None, ensemble_size=None): if task is None: task = self._task if metric is None: metric = self._metric if precision is None: precision = self.precision if dataset_name is None: dataset_name = self._dataset_name if ensemble_nbest is None: ensemble_nbest = self._ensemble_nbest if ensemble_size is None: ensemble_size = self._ensemble_size return EnsembleBuilder(autosklearn_tmp_dir=self._tmp_dir, dataset_name=dataset_name, task_type=task, metric=metric, limit=time_left_for_ensembles, output_dir=self._output_dir, ensemble_size=ensemble_size, ensemble_nbest=ensemble_nbest, seed=self._seed, shared_mode=self._shared_mode, precision=precision, max_iterations=max_iterations) def _load_models(self): if self._shared_mode: seed = -1 else: seed = self._seed self.ensemble_ = self._backend.load_ensemble(seed) if self.ensemble_: identifiers = self.ensemble_.identifiers_ self.models_ = self._backend.load_models_by_identifiers( identifiers) else: self.models_ = self._backend.load_all_models(seed) if len(self.models_) == 0: raise ValueError('No models fitted!') def score(self, X, y): # fix: Consider only index 1 of second dimension # Don't know if the reshaping should be done there or in calculate_score prediction = self.predict_proba(X) return calculate_score(y, prediction, self._task, self._metric, self._label_num, logger=self._logger) def show_models(self): if self.models_ is None or len(self.models_) == 0 or \ self.ensemble_ is None: self._load_models() return self.ensemble_.pprint_ensemble_string(self.models_) def _save_ensemble_data(self, X, y): """Split dataset and store Data for the ensemble script. 
:param X: :param y: :return: """ task_name = 'LoadData' self._start_task(self._stopwatch, task_name) _, _, _, y_ensemble = resampling.split_data(X, y) self._backend.save_targets_ensemble(y_ensemble) self._stop_task(self._stopwatch, task_name) def _create_search_space(self, tmp_dir, backend, datamanager, include_estimators=None, include_preprocessors=None): task_name = 'CreateConfigSpace' self._stopwatch.start_task(task_name) configspace_path = os.path.join(tmp_dir, 'space.pcs') configuration_space = pipeline.get_configuration_space( datamanager.info, include_estimators=include_estimators, include_preprocessors=include_preprocessors) configuration_space = self.configuration_space_created_hook( datamanager, configuration_space) sp_string = pcs.write(configuration_space) backend.write_txt_file(configspace_path, sp_string, 'Configuration space') self._stopwatch.stop_task(task_name) return configuration_space, configspace_path def configuration_space_created_hook(self, datamanager, configuration_space): return configuration_space def get_params(self, deep=True): raise NotImplementedError('auto-sklearn does not implement ' 'get_params() because it is not intended to ' 'be optimized.') def set_params(self, deep=True): raise NotImplementedError('auto-sklearn does not implement ' 'set_params() because it is not intended to ' 'be optimized.') def __del__(self): self._delete_output_directories() def _delete_output_directories(self): if self.delete_output_folder_after_terminate: try: shutil.rmtree(self._output_dir) except Exception: if self._logger is not None: self._logger.warning("Could not delete output dir: %s" % self._output_dir) else: print("Could not delete output dir: %s" % self._output_dir) if self.delete_tmp_folder_after_terminate: try: shutil.rmtree(self._tmp_dir) except Exception: if self._logger is not None: self._logger.warning("Could not delete tmp dir: %s" % self._tmp_dir) pass else: print("Could not delete tmp dir: %s" % self._tmp_dir)
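# Illustrative only: a minimal sketch of driving the AutoML class above in-process
# via fit(X, y). The sklearn import, the directory paths, and the time/ensemble
# values are placeholders chosen for illustration; task and metric constants are
# the defaults shown in the class's fit() signature.
from sklearn.datasets import load_digits

X, y = load_digits(return_X_y=True)
automl = AutoML(
    tmp_dir='/tmp/autosklearn_tmp',    # placeholder path
    output_dir='/tmp/autosklearn_out',  # placeholder path
    time_left_for_this_task=3600,       # must be an int (seconds)
    per_run_time_limit=360,             # must be an int (seconds)
    ensemble_size=50,
    ensemble_nbest=50,
    seed=1,
)
automl.fit(X, y, task=MULTICLASS_CLASSIFICATION, metric='acc_metric',
           dataset_name='digits')
predictions = automl.predict(X)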
class AutoML(BaseEstimator):

    def __init__(self,
                 backend,
                 time_left_for_this_task,
                 per_run_time_limit,
                 log_dir=None,
                 initial_configurations_via_metalearning=25,
                 ensemble_size=1,
                 ensemble_nbest=1,
                 seed=1,
                 ml_memory_limit=3000,
                 metadata_directory=None,
                 keep_models=True,
                 debug_mode=False,
                 include_estimators=None,
                 include_preprocessors=None,
                 resampling_strategy='holdout-iterative-fit',
                 resampling_strategy_arguments=None,
                 delete_tmp_folder_after_terminate=False,
                 delete_output_folder_after_terminate=False,
                 shared_mode=False,
                 precision=32,
                 max_iter_smac=None,
                 acquisition_function='EI'):
        super(AutoML, self).__init__()
        self._backend = backend
        # self._tmp_dir = tmp_dir
        # self._output_dir = output_dir
        self._time_for_task = time_left_for_this_task
        self._per_run_time_limit = per_run_time_limit
        # self._log_dir = log_dir if log_dir is not None else self._tmp_dir
        self._initial_configurations_via_metalearning = \
            initial_configurations_via_metalearning
        self._ensemble_size = ensemble_size
        self._ensemble_nbest = ensemble_nbest
        self._seed = seed
        self._ml_memory_limit = ml_memory_limit
        self._data_memory_limit = None
        self._metadata_directory = metadata_directory
        self._keep_models = keep_models
        self._include_estimators = include_estimators
        self._include_preprocessors = include_preprocessors
        self._resampling_strategy = resampling_strategy
        self._resampling_strategy_arguments = resampling_strategy_arguments
        self._max_iter_smac = max_iter_smac
        # self.delete_tmp_folder_after_terminate = \
        #     delete_tmp_folder_after_terminate
        # self.delete_output_folder_after_terminate = \
        #     delete_output_folder_after_terminate
        self._shared_mode = shared_mode
        self.precision = precision
        self.acquisition_function = acquisition_function

        self._datamanager = None
        self._dataset_name = None
        self._stopwatch = StopWatch()
        self._logger = None
        self._task = None
        self._metric = None
        self._label_num = None
        self._parser = None
        self.models_ = None
        self.ensemble_ = None
        self._can_predict = False

        self._debug_mode = debug_mode

        if not isinstance(self._time_for_task, int):
            raise ValueError("time_left_for_this_task not of type integer, "
                             "but %s" % str(type(self._time_for_task)))
        if not isinstance(self._per_run_time_limit, int):
            raise ValueError("per_run_time_limit not of type integer, but %s"
                             % str(type(self._per_run_time_limit)))

        # After assigning and checking variables...
        # self._backend = Backend(self._output_dir, self._tmp_dir)

    def start_automl(self, parser):
        self._parser = parser
        self.start()

    def start(self):
        if self._parser is None:
            raise ValueError('You must invoke start() only via start_automl()')
        super(AutoML, self).start()

    def run(self):
        if self._parser is None:
            raise ValueError('You must invoke run() only via start_automl()')
        self._backend.save_start_time(self._seed)
        self._stopwatch = StopWatch()
        datamanager = get_data_manager(namespace=self._parser)
        self._stopwatch.start_task(datamanager.name)
        self._logger = self._get_logger(datamanager.name)
        self._datamanager = datamanager
        self._dataset_name = datamanager.name
        self._fit(self._datamanager)

    def fit(self, X, y,
            task=MULTICLASS_CLASSIFICATION,
            metric='acc_metric',
            feat_type=None,
            dataset_name=None):
        if not self._shared_mode:
            self._backend.context.delete_directories()
        else:
            # If this fails, it's likely that this is the first call to get
            # the data manager
            try:
                D = self._backend.load_datamanager()
                dataset_name = D.name
            except IOError:
                pass
        self._backend.context.create_directories()

        if dataset_name is None:
            m = hashlib.md5()
            m.update(X.data)
            dataset_name = m.hexdigest()

        self._backend.save_start_time(self._seed)
        self._stopwatch = StopWatch()
        self._dataset_name = dataset_name
        self._stopwatch.start_task(self._dataset_name)

        self._logger = self._get_logger(dataset_name)

        if isinstance(metric, str):
            metric = STRING_TO_METRIC[metric]

        if feat_type is not None and len(feat_type) != X.shape[1]:
            raise ValueError('Array feat_type does not have same number of '
                             'variables as X has features. %d vs %d.'
                             % (len(feat_type), X.shape[1]))
        if feat_type is not None and not all([isinstance(f, str)
                                              for f in feat_type]):
            raise ValueError('Array feat_type must only contain strings.')
        if feat_type is not None:
            for ft in feat_type:
                if ft.lower() not in ['categorical', 'numerical']:
                    raise ValueError('Only `Categorical` and `Numerical` are '
                                     'valid feature types, you passed `%s`' % ft)

        self._data_memory_limit = None
        loaded_data_manager = XYDataManager(X, y,
                                            task=task,
                                            metric=metric,
                                            feat_type=feat_type,
                                            dataset_name=dataset_name,
                                            encode_labels=False)

        return self._fit(loaded_data_manager)

    def fit_automl_dataset(self, dataset):
        self._stopwatch = StopWatch()
        self._backend.save_start_time(self._seed)

        name = os.path.basename(dataset)
        self._stopwatch.start_task(name)
        self._start_task(self._stopwatch, name)
        self._dataset_name = name

        self._logger = self._get_logger(name)
        self._logger.debug('======== Reading and converting data ==========')
        # Encoding the labels will be done after the metafeature calculation!
        self._data_memory_limit = float(self._ml_memory_limit) / 3
        loaded_data_manager = CompetitionDataManager(
            dataset, encode_labels=False,
            max_memory_in_mb=self._data_memory_limit)
        loaded_data_manager_str = str(loaded_data_manager).split('\n')
        for part in loaded_data_manager_str:
            self._logger.debug(part)

        return self._fit(loaded_data_manager)

    def _get_logger(self, name):
        logger_name = 'AutoML(%d):%s' % (self._seed, name)
        setup_logger(os.path.join(self._backend.temporary_directory,
                                  '%s.log' % str(logger_name)))
        return get_logger(logger_name)

    @staticmethod
    def _start_task(watcher, task_name):
        watcher.start_task(task_name)

    @staticmethod
    def _stop_task(watcher, task_name):
        watcher.stop_task(task_name)

    @staticmethod
    def _print_load_time(basename, time_left_for_this_task,
                         time_for_load_data, logger):
        time_left_after_reading = max(
            0, time_left_for_this_task - time_for_load_data)
        logger.info('Remaining time after reading %s %5.2f sec'
                    % (basename, time_left_after_reading))
        return time_for_load_data

    def _do_dummy_prediction(self, datamanager, num_run):
        self._logger.info("Starting to create dummy predictions.")
        time_limit = int(self._time_for_task / 6.)
        memory_limit = int(self._ml_memory_limit)

        _info = eval_with_limits(datamanager, self._backend, 1,
                                 self._seed, num_run,
                                 self._resampling_strategy,
                                 self._resampling_strategy_arguments,
                                 memory_limit, time_limit,
                                 logger=self._logger)
        if _info[4] == StatusType.SUCCESS:
            self._logger.info("Finished creating dummy prediction 1/2.")
        else:
            self._logger.error('Error creating dummy prediction 1/2:%s ',
                               _info[3])
        num_run += 1

        _info = eval_with_limits(datamanager, self._backend, 2,
                                 self._seed, num_run,
                                 self._resampling_strategy,
                                 self._resampling_strategy_arguments,
                                 memory_limit, time_limit,
                                 logger=self._logger)
        if _info[4] == StatusType.SUCCESS:
            self._logger.info("Finished creating dummy prediction 2/2.")
        else:
            self._logger.error('Error creating dummy prediction 2/2 %s',
                               _info[3])
        num_run += 1

        return num_run

    def _fit(self, datamanager):
        # Reset learnt stuff
        self.models_ = None
        self.ensemble_ = None

        # Check arguments prior to doing anything!
        if self._resampling_strategy not in ['holdout',
                                             'holdout-iterative-fit',
                                             'cv', 'nested-cv', 'partial-cv']:
            raise ValueError('Illegal resampling strategy: %s'
                             % self._resampling_strategy)
        if self._resampling_strategy == 'partial-cv' and \
                self._ensemble_size != 0:
            raise ValueError("Resampling strategy partial-cv cannot be used "
                             "together with ensembles.")

        acquisition_functions = ['EI', 'EIPS']
        if self.acquisition_function not in acquisition_functions:
            raise ValueError('Illegal acquisition %s: Must be one of %s.'
                             % (self.acquisition_function,
                                acquisition_functions))
% (self.acquisition_function, acquisition_functions)) self._backend._make_internals_directory() if self._keep_models: try: os.mkdir(self._backend.get_model_dir()) except OSError: if not self._shared_mode: raise self._metric = datamanager.info['metric'] self._task = datamanager.info['task'] self._label_num = datamanager.info['label_num'] # == Pickle the data manager to speed up loading data_manager_path = self._backend.save_datamanager(datamanager) self._save_ensemble_data( datamanager.data['X_train'], datamanager.data['Y_train']) time_for_load_data = self._stopwatch.wall_elapsed(self._dataset_name) if self._debug_mode: self._print_load_time( self._dataset_name, self._time_for_task, time_for_load_data, self._logger) # == Perform dummy predictions num_run = 1 #if self._resampling_strategy in ['holdout', 'holdout-iterative-fit']: num_run = self._do_dummy_prediction(datamanager, num_run) # = Create a searchspace # Do this before One Hot Encoding to make sure that it creates a # search space for a dense classifier even if one hot encoding would # make it sparse (tradeoff; if one hot encoding would make it sparse, # densifier and truncatedSVD would probably lead to a MemoryError, # like this we can't use some of the preprocessing methods in case # the data became sparse) self.configuration_space, configspace_path = self._create_search_space( self._backend.temporary_directory, self._backend, datamanager, self._include_estimators, self._include_preprocessors) # == RUN ensemble builder # Do this before calculating the meta-features to make sure that the # dummy predictions are actually included in the ensemble even if # calculating the meta-features takes very long ensemble_task_name = 'runEnsemble' self._stopwatch.start_task(ensemble_task_name) time_left_for_ensembles = max(0,self._time_for_task \ - self._stopwatch.wall_elapsed(self._dataset_name)) if self._logger: self._logger.info( 'Start Ensemble with %5.2fsec time left' % time_left_for_ensembles) if time_left_for_ensembles <= 0: self._logger.warning("Not starting ensemble builder because there " "is no time left!") self._proc_ensemble = None else: self._proc_ensemble = self._get_ensemble_process(time_left_for_ensembles) if self._ensemble_size > 0: self._proc_ensemble.start() else: self._logger.info('Not starting ensemble builder because ' 'ensemble size is <= 0.') self._stopwatch.stop_task(ensemble_task_name) # kill the datamanager as it will be re-loaded anyways from sub processes try: del self._datamanager except Exception: pass # => RUN SMAC smac_task_name = 'runSMAC' self._stopwatch.start_task(smac_task_name) time_left_for_smac = max(0, self._time_for_task - self._stopwatch.wall_elapsed( self._dataset_name)) if self._logger: self._logger.info( 'Start SMAC with %5.2fsec time left' % time_left_for_smac) if time_left_for_smac <= 0: self._logger.warning("Not starting SMAC because there is no time " "left.") self._proc_smac = None else: if self._per_run_time_limit is None or \ self._per_run_time_limit > time_left_for_smac: print('Time limit for a single run is higher than total time ' 'limit. 
Capping the limit for a single run to the total ' 'time given to SMAC (%f)' % time_left_for_smac) per_run_time_limit = time_left_for_smac else: per_run_time_limit = self._per_run_time_limit self._proc_smac = AutoMLSMBO(config_space=self.configuration_space, dataset_name=self._dataset_name, backend=self._backend, total_walltime_limit=time_left_for_smac, func_eval_time_limit=per_run_time_limit, memory_limit=self._ml_memory_limit, data_memory_limit=self._data_memory_limit, watcher=self._stopwatch, start_num_run=num_run, num_metalearning_cfgs=self._initial_configurations_via_metalearning, config_file=configspace_path, smac_iters=self._max_iter_smac, seed=self._seed, metadata_directory=self._metadata_directory, resampling_strategy=self._resampling_strategy, resampling_strategy_args=self._resampling_strategy_arguments, acquisition_function=self.acquisition_function, shared_mode=self._shared_mode) self._proc_smac.run_smbo() self._proc_ensemble = None self._load_models() return self def refit(self, X, y): if self._keep_models is not True: raise ValueError( "Predict can only be called if 'keep_models==True'") if self.models_ is None or len(self.models_) == 0 or \ self.ensemble_ is None: self._load_models() for identifier in self.models_: if identifier in self.ensemble_.get_model_identifiers(): model = self.models_[identifier] # this updates the model inplace, it can then later be used in # predict method model.fit(X.copy(), y.copy()) self._can_predict = True return self def predict(self, X): if self._keep_models is not True: raise ValueError( "Predict can only be called if 'keep_models==True'") if not self._can_predict and \ self._resampling_strategy not in \ ['holdout', 'holdout-iterative-fit']: raise NotImplementedError( 'Predict is currently only implemented for resampling ' 'strategy holdout.') if self.models_ is None or len(self.models_) == 0 or \ self.ensemble_ is None: self._load_models() all_predictions = [] for identifier in self.ensemble_.get_model_identifiers(): model = self.models_[identifier] X_ = X.copy() if self._task in REGRESSION_TASKS: prediction = model.predict(X_) else: prediction = model.predict_proba(X_) if len(prediction.shape) < 1 or len(X_.shape) < 1 or \ X_.shape[0] < 1 or prediction.shape[0] != X_.shape[0]: self._logger.warning("Prediction shape for model %s is %s " "while X_.shape is %s" % (model, str(prediction.shape), str(X_.shape))) all_predictions.append(prediction) if len(all_predictions) == 0: raise ValueError('Something went wrong generating the predictions. 
' 'The ensemble should consist of the following ' 'models: %s, the following models were loaded: ' '%s' % (str(list(self.ensemble_indices_.keys())), str(list(self.models_.keys())))) predictions = self.ensemble_.predict(all_predictions) return predictions def fit_ensemble(self, y, task=None, metric=None, precision='32', dataset_name=None, ensemble_nbest=None, ensemble_size=None): if self._logger is None: self._logger = self._get_logger(dataset_name) self._proc_ensemble = self._get_ensemble_process( 1, task, metric, precision, dataset_name, max_iterations=1, ensemble_nbest=ensemble_nbest, ensemble_size=ensemble_size) self._proc_ensemble.main() return self def _get_ensemble_process(self, time_left_for_ensembles, task=None, metric=None, precision=None, dataset_name=None, max_iterations=-1, ensemble_nbest=None, ensemble_size=None): if task is None: task = self._task else: self._task = task if metric is None: metric = self._metric else: self._metric = metric if precision is None: precision = self.precision else: self.precision = precision if dataset_name is None: dataset_name = self._dataset_name else: self._dataset_name = dataset_name if ensemble_nbest is None: ensemble_nbest = self._ensemble_nbest else: self._ensemble_nbest = ensemble_nbest if ensemble_size is None: ensemble_size = self._ensemble_size else: self._ensemble_size = ensemble_size return EnsembleBuilder(backend=self._backend, dataset_name=dataset_name, task_type=task, metric=metric, limit=time_left_for_ensembles, ensemble_size=ensemble_size, ensemble_nbest=ensemble_nbest, seed=self._seed, shared_mode=self._shared_mode, precision=precision, max_iterations=max_iterations) def _load_models(self): if self._shared_mode: seed = -1 else: seed = self._seed self.ensemble_ = self._backend.load_ensemble(seed) if self.ensemble_: identifiers = self.ensemble_.identifiers_ self.models_ = self._backend.load_models_by_identifiers(identifiers) else: self.models_ = self._backend.load_all_models(seed) if len(self.models_) == 0: raise ValueError('No models fitted!') def score(self, X, y): # fix: Consider only index 1 of second dimension # Don't know if the reshaping should be done there or in calculate_score prediction = self.predict(X) return calculate_score(y, prediction, self._task, self._metric, self._label_num, logger=self._logger) @property def grid_scores_(self): grid_scores = list() scores_per_config = defaultdict(list) config_list = list() for run_key in self._proc_smac.runhistory.data: run_value = self._proc_smac.runhistory.data[run_key] config_id = run_key.config_id cost = run_value.cost if config_id not in config_list: config_list.append(config_id) scores_per_config[config_id].append(cost) for config_id in config_list: scores = [1 - score for score in scores_per_config[config_id]] mean_score = np.mean(scores) config = self._proc_smac.runhistory.ids_config[config_id] grid_score = _CVScoreTuple(config.get_dictionary(), mean_score, scores) grid_scores.append(grid_score) return grid_scores @property def cv_results_(self): results = dict() # Missing in contrast to scikit-learn # splitX_test_score - auto-sklearn does not store the scores on a split # basis # std_test_score - auto-sklearn does not store the scores on a split # basis # splitX_train_score - auto-sklearn does not compute train scores, add # flag to compute the train scores # mean_train_score - auto-sklearn does not store the train scores # std_train_score - auto-sklearn does not store the train scores # std_fit_time - auto-sklearn does not store the fit times per split # 
mean_score_time - auto-sklearn does not store the score time # std_score_time - auto-sklearn does not store the score time # TODO: add those arguments parameter_dictionaries = dict() masks = dict() hp_names = [] # Set up dictionary for parameter values for hp in self.configuration_space.get_hyperparameters(): name = hp.name parameter_dictionaries[name] = [] masks[name] = [] hp_names.append(name) mean_test_score = [] mean_fit_time = [] params = [] status = [] for run_key in self._proc_smac.runhistory.data: run_value = self._proc_smac.runhistory.data[run_key] config_id = run_key.config_id config = self._proc_smac.runhistory.ids_config[config_id] param_dict = config.get_dictionary() params.append(param_dict) mean_test_score.append(1 - run_value.cost) mean_fit_time.append(run_value.time) s = run_value.status if s == 1: status.append('Success') elif s == 2: status.append('Timeout') elif s == 3: status.append('Crash') elif s == 4: status.append('Abort') elif s == 5: status.append('Memout') else: status.append('Unknown') for hp_name in hp_names: if hp_name in param_dict: hp_value = param_dict[hp_name] mask_value = False else: hp_value = np.NaN mask_value = True parameter_dictionaries[hp_name].append(hp_value) masks[hp_name].append(mask_value) results['mean_test_score'] = np.array(mean_test_score) results['mean_fit_time'] = np.array(mean_fit_time) results['params'] = params results['rank_test_scores'] = scipy.stats.rankdata(1 - results['mean_test_score'], method='min') results['status'] = status for hp_name in hp_names: masked_array = ma.MaskedArray(parameter_dictionaries[hp_name], masks[hp_name]) results['param_%s' % hp_name] = masked_array return results def sprint_statistics(self): cv_results = self.cv_results_ sio = io.StringIO() sio.write('auto-sklearn results:\n') sio.write(' Dataset name: %s\n' % self._dataset_name) sio.write(' Metric: %s\n' % METRIC_TO_STRING[self._metric]) idx_best_run = np.argmax(cv_results['mean_test_score']) best_score = cv_results['mean_test_score'][idx_best_run] sio.write(' Best validation score: %f\n' % best_score) num_runs = len(cv_results['status']) sio.write(' Number of target algorithm runs: %d\n' % num_runs) num_success = sum([s == 'Success' for s in cv_results['status']]) sio.write(' Number of successful target algorithm runs: %d\n' % num_success) num_crash = sum([s == 'Crash' for s in cv_results['status']]) sio.write(' Number of crashed target algorithm runs: %d\n' % num_crash) num_timeout = sum([s == 'Timeout' for s in cv_results['status']]) sio.write(' Number of target algorithms that exceeded the time ' 'limit: %d\n' % num_timeout) num_memout = sum([s == 'Memout' for s in cv_results['status']]) sio.write(' Number of target algorithms that exceeded the memory ' 'limit: %d\n' % num_memout) return sio.getvalue() def show_models(self): if self.models_ is None or len(self.models_) == 0 or \ self.ensemble_ is None: self._load_models() return self.ensemble_.pprint_ensemble_string(self.models_) def _save_ensemble_data(self, X, y): """Split dataset and store data for the ensemble script. 
:param X: :param y: :return: """ task_name = 'LoadData' self._start_task(self._stopwatch, task_name) _, _, _, y_ensemble = resampling.split_data(X, y) self._backend.save_targets_ensemble(y_ensemble) self._stop_task(self._stopwatch, task_name) def _create_search_space(self, tmp_dir, backend, datamanager, include_estimators=None, include_preprocessors=None): task_name = 'CreateConfigSpace' self._stopwatch.start_task(task_name) configspace_path = os.path.join(tmp_dir, 'space.pcs') configuration_space = pipeline.get_configuration_space( datamanager.info, include_estimators=include_estimators, include_preprocessors=include_preprocessors) configuration_space = self.configuration_space_created_hook( datamanager, configuration_space) sp_string = pcs.write(configuration_space) backend.write_txt_file(configspace_path, sp_string, 'Configuration space') self._stopwatch.stop_task(task_name) return configuration_space, configspace_path def configuration_space_created_hook(self, datamanager, configuration_space): return configuration_space
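# --- Hedged sketch of the serialization step used by _create_search_space above. ---
# It builds a small ConfigurationSpace, serializes it with ConfigSpace's pcs writer
# (the same writer called above), and stores it as space.pcs. The directory name,
# hyperparameter, and helper name (write_space) are illustrative, not auto-sklearn's.
import os

from ConfigSpace.configuration_space import ConfigurationSpace
from ConfigSpace.hyperparameters import UniformFloatHyperparameter
from ConfigSpace.read_and_write import pcs


def write_space(tmp_dir='/tmp/autosklearn_space'):
    cs = ConfigurationSpace()
    cs.add_hyperparameter(
        UniformFloatHyperparameter('learning_rate', 1e-4, 1e-1, log=True))
    os.makedirs(tmp_dir, exist_ok=True)
    path = os.path.join(tmp_dir, 'space.pcs')
    with open(path, 'w') as fh:
        fh.write(pcs.write(cs))  # serialize the space to the pcs text format
    return path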
def fit( self, X: np.ndarray, y: np.ndarray, task: int, X_test: Optional[np.ndarray] = None, y_test: Optional[np.ndarray] = None, feat_type: Optional[List[str]] = None, dataset_name: Optional[str] = None, only_return_configuration_space: Optional[bool] = False, load_models: bool = True, ): # Reset learnt stuff self.models_ = None self.cv_models_ = None self.ensemble_ = None # The metric must exist as of this point # It can be provided in the constructor, or automatically # defined in the estimator fit call if self._metric is None: raise ValueError('No metric given.') if not isinstance(self._metric, Scorer): raise ValueError('Metric must be instance of ' 'autosklearn.metrics.Scorer.') if self._shared_mode: # If this fails, it's likely that this is the first call to get # the data manager try: D = self._backend.load_datamanager() dataset_name = D.name except IOError: pass if dataset_name is None: dataset_name = hash_array_or_matrix(X) self._backend.save_start_time(self._seed) self._stopwatch = StopWatch() self._dataset_name = dataset_name self._stopwatch.start_task(self._dataset_name) self._logger = self._get_logger(dataset_name) if feat_type is not None and len(feat_type) != X.shape[1]: raise ValueError('Array feat_type does not have same number of ' 'variables as X has features. %d vs %d.' % (len(feat_type), X.shape[1])) if feat_type is not None and not all( [isinstance(f, str) for f in feat_type]): raise ValueError('Array feat_type must only contain strings.') if feat_type is not None: for ft in feat_type: if ft.lower() not in ['categorical', 'numerical']: raise ValueError('Only `Categorical` and `Numerical` are ' 'valid feature types, you passed `%s`' % ft) datamanager = XYDataManager( X, y, X_test=X_test, y_test=y_test, task=task, feat_type=feat_type, dataset_name=dataset_name, ) self._backend._make_internals_directory() try: os.makedirs(self._backend.get_model_dir()) except (OSError, FileExistsError): if not self._shared_mode: raise try: os.makedirs(self._backend.get_cv_model_dir()) except (OSError, FileExistsError): if not self._shared_mode: raise self._task = datamanager.info['task'] self._label_num = datamanager.info['label_num'] # == Pickle the data manager to speed up loading self._backend.save_datamanager(datamanager) time_for_load_data = self._stopwatch.wall_elapsed(self._dataset_name) if self._debug_mode: self._print_load_time(self._dataset_name, self._time_for_task, time_for_load_data, self._logger) # == Perform dummy predictions num_run = 1 # if self._resampling_strategy in ['holdout', 'holdout-iterative-fit']: num_run = self._do_dummy_prediction(datamanager, num_run) # = Create a searchspace # Do this before One Hot Encoding to make sure that it creates a # search space for a dense classifier even if one hot encoding would # make it sparse (tradeoff; if one hot encoding would make it sparse, # densifier and truncatedSVD would probably lead to a MemoryError, # like this we can't use some of the preprocessing methods in case # the data became sparse) self.configuration_space, configspace_path = self._create_search_space( self._backend.temporary_directory, self._backend, datamanager, include_estimators=self._include_estimators, exclude_estimators=self._exclude_estimators, include_preprocessors=self._include_preprocessors, exclude_preprocessors=self._exclude_preprocessors) if only_return_configuration_space: return self.configuration_space # == RUN ensemble builder # Do this before calculating the meta-features to make sure that the # dummy predictions are actually included in 
the ensemble even if # calculating the meta-features takes very long ensemble_task_name = 'runEnsemble' self._stopwatch.start_task(ensemble_task_name) elapsed_time = self._stopwatch.wall_elapsed(self._dataset_name) time_left_for_ensembles = max(0, self._time_for_task - elapsed_time) if time_left_for_ensembles <= 0: self._proc_ensemble = None # Fit only raises error when ensemble_size is not zero but # time_left_for_ensembles is zero. if self._ensemble_size > 0: raise ValueError("Not starting ensemble builder because there " "is no time left. Try increasing the value " "of time_left_for_this_task.") elif self._ensemble_size <= 0: self._proc_ensemble = None self._logger.info('Not starting ensemble builder because ' 'ensemble size is <= 0.') else: self._logger.info('Start Ensemble with %5.2fsec time left' % time_left_for_ensembles) self._proc_ensemble = self._get_ensemble_process( time_left_for_ensembles) self._proc_ensemble.start() self._stopwatch.stop_task(ensemble_task_name) # kill the datamanager as it will be re-loaded anyways from sub processes try: del self._datamanager except Exception: pass # => RUN SMAC smac_task_name = 'runSMAC' self._stopwatch.start_task(smac_task_name) elapsed_time = self._stopwatch.wall_elapsed(self._dataset_name) time_left_for_smac = max(0, self._time_for_task - elapsed_time) if self._logger: self._logger.info('Start SMAC with %5.2fsec time left' % time_left_for_smac) if time_left_for_smac <= 0: self._logger.warning("Not starting SMAC because there is no time " "left.") _proc_smac = None self._budget_type = None else: if self._per_run_time_limit is None or \ self._per_run_time_limit > time_left_for_smac: self._logger.warning( 'Time limit for a single run is higher than total time ' 'limit. Capping the limit for a single run to the total ' 'time given to SMAC (%f)' % time_left_for_smac) per_run_time_limit = time_left_for_smac else: per_run_time_limit = self._per_run_time_limit # Make sure that at least 2 models are created for the ensemble process num_models = time_left_for_smac // per_run_time_limit if num_models < 2: per_run_time_limit = time_left_for_smac // 2 self._logger.warning( "Capping the per_run_time_limit to {} to have " "time for a least 2 models in each process.".format( per_run_time_limit)) _proc_smac = AutoMLSMBO( config_space=self.configuration_space, dataset_name=self._dataset_name, backend=self._backend, total_walltime_limit=time_left_for_smac, func_eval_time_limit=per_run_time_limit, memory_limit=self._ml_memory_limit, data_memory_limit=self._data_memory_limit, watcher=self._stopwatch, start_num_run=num_run, num_metalearning_cfgs=self. 
_initial_configurations_via_metalearning, config_file=configspace_path, seed=self._seed, metadata_directory=self._metadata_directory, metric=self._metric, resampling_strategy=self._resampling_strategy, resampling_strategy_args=self._resampling_strategy_arguments, shared_mode=self._shared_mode, include_estimators=self._include_estimators, exclude_estimators=self._exclude_estimators, include_preprocessors=self._include_preprocessors, exclude_preprocessors=self._exclude_preprocessors, disable_file_output=self._disable_evaluator_output, get_smac_object_callback=self._get_smac_object_callback, smac_scenario_args=self._smac_scenario_args, ) try: self.runhistory_, self.trajectory_, self._budget_type = \ _proc_smac.run_smbo() trajectory_filename = os.path.join( self._backend.get_smac_output_directory_for_run( self._seed), 'trajectory.json') saveable_trajectory = \ [list(entry[:2]) + [entry[2].get_dictionary()] + list(entry[3:]) for entry in self.trajectory_] with open(trajectory_filename, 'w') as fh: json.dump(saveable_trajectory, fh) except Exception as e: self._logger.exception(e) raise # Wait until the ensemble process is finished to avoid shutting down # while the ensemble builder tries to access the data if self._proc_ensemble is not None and self._ensemble_size > 0: self._proc_ensemble.join() self._proc_ensemble = None if load_models: self._load_models() return self
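# --- Hypothetical helper, not part of auto-sklearn: reading the trajectory.json ---
# written at the end of fit() above. The layout assumed here mirrors
# saveable_trajectory: each entry is a list whose third element (index 2) is the
# configuration stored as a plain dictionary.
import json


def load_incumbent_config(trajectory_filename):
    with open(trajectory_filename) as fh:
        trajectory = json.load(fh)
    if not trajectory:
        return None
    last_entry = trajectory[-1]
    # Index 2 holds the incumbent's configuration dictionary, as serialized above.
    return last_entry[2]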
def _fit(self, datamanager, metric, only_return_configuration_space=False): # Reset learnt stuff self.models_ = None self.ensemble_ = None # Check arguments prior to doing anything! if not isinstance(self._disable_evaluator_output, (bool, list)): raise ValueError('disable_evaluator_output must be of type bool ' 'or list.') if isinstance(self._disable_evaluator_output, list): allowed_elements = ['model', 'y_optimization'] for element in self._disable_evaluator_output: if element not in allowed_elements: raise ValueError("List member '%s' for argument " "'disable_evaluator_output' must be one " "of " + str(allowed_elements)) if self._resampling_strategy not in [ 'holdout', 'holdout-iterative-fit', 'cv', 'partial-cv', 'partial-cv-iterative-fit'] \ and not issubclass(self._resampling_strategy, BaseCrossValidator)\ and not issubclass(self._resampling_strategy, _RepeatedSplits)\ and not issubclass(self._resampling_strategy, BaseShuffleSplit): raise ValueError('Illegal resampling strategy: %s' % self._resampling_strategy) if self._resampling_strategy in ['partial-cv', 'partial-cv-iterative-fit'] \ and self._ensemble_size != 0: raise ValueError("Resampling strategy %s cannot be used " "together with ensembles." % self._resampling_strategy) if self._resampling_strategy in ['partial-cv', 'cv', 'partial-cv-iterative-fit'] and \ not 'folds' in self._resampling_strategy_arguments: self._resampling_strategy_arguments['folds'] = 5 self._backend._make_internals_directory() if self._keep_models: try: os.makedirs(self._backend.get_model_dir()) except (OSError, FileExistsError) as e: if not self._shared_mode: raise self._metric = metric self._task = datamanager.info['task'] self._label_num = datamanager.info['label_num'] # == Pickle the data manager to speed up loading data_manager_path = self._backend.save_datamanager(datamanager) time_for_load_data = self._stopwatch.wall_elapsed(self._dataset_name) if self._debug_mode: self._print_load_time( self._dataset_name, self._time_for_task, time_for_load_data, self._logger) # == Perform dummy predictions num_run = 1 #if self._resampling_strategy in ['holdout', 'holdout-iterative-fit']: num_run = self._do_dummy_prediction(datamanager, num_run) # = Create a searchspace # Do this before One Hot Encoding to make sure that it creates a # search space for a dense classifier even if one hot encoding would # make it sparse (tradeoff; if one hot encoding would make it sparse, # densifier and truncatedSVD would probably lead to a MemoryError, # like this we can't use some of the preprocessing methods in case # the data became sparse) self.configuration_space, configspace_path = self._create_search_space( self._backend.temporary_directory, self._backend, datamanager, include_estimators=self._include_estimators, exclude_estimators=self._exclude_estimators, include_preprocessors=self._include_preprocessors, exclude_preprocessors=self._exclude_preprocessors) if only_return_configuration_space: return self.configuration_space # == RUN ensemble builder # Do this before calculating the meta-features to make sure that the # dummy predictions are actually included in the ensemble even if # calculating the meta-features takes very long ensemble_task_name = 'runEnsemble' self._stopwatch.start_task(ensemble_task_name) time_left_for_ensembles = max(0,self._time_for_task \ - self._stopwatch.wall_elapsed(self._dataset_name)) if self._logger: self._logger.info( 'Start Ensemble with %5.2fsec time left' % time_left_for_ensembles) if time_left_for_ensembles <= 0: self._proc_ensemble = None # 
Fit only raises error when ensemble_size is not zero but # time_left_for_ensembles is zero. if self._ensemble_size > 0: raise ValueError("Not starting ensemble builder because there " "is no time left. Try increasing the value " "of time_left_for_this_task.") else: self._proc_ensemble = self._get_ensemble_process(time_left_for_ensembles) if self._ensemble_size > 0: self._proc_ensemble.start() else: self._logger.info('Not starting ensemble builder because ' 'ensemble size is <= 0.') self._stopwatch.stop_task(ensemble_task_name) # kill the datamanager as it will be re-loaded anyways from sub processes try: del self._datamanager except Exception: pass # => RUN SMAC smac_task_name = 'runSMAC' self._stopwatch.start_task(smac_task_name) time_left_for_smac = max(0, self._time_for_task - self._stopwatch.wall_elapsed( self._dataset_name)) if self._logger: self._logger.info( 'Start SMAC with %5.2fsec time left' % time_left_for_smac) if time_left_for_smac <= 0: self._logger.warning("Not starting SMAC because there is no time " "left.") _proc_smac = None else: if self._per_run_time_limit is None or \ self._per_run_time_limit > time_left_for_smac: print('Time limit for a single run is higher than total time ' 'limit. Capping the limit for a single run to the total ' 'time given to SMAC (%f)' % time_left_for_smac) per_run_time_limit = time_left_for_smac else: per_run_time_limit = self._per_run_time_limit _proc_smac = AutoMLSMBO( config_space=self.configuration_space, dataset_name=self._dataset_name, backend=self._backend, total_walltime_limit=time_left_for_smac, func_eval_time_limit=per_run_time_limit, memory_limit=self._ml_memory_limit, data_memory_limit=self._data_memory_limit, watcher=self._stopwatch, start_num_run=num_run, num_metalearning_cfgs=self._initial_configurations_via_metalearning, config_file=configspace_path, seed=self._seed, metadata_directory=self._metadata_directory, metric=self._metric, resampling_strategy=self._resampling_strategy, resampling_strategy_args=self._resampling_strategy_arguments, shared_mode=self._shared_mode, include_estimators=self._include_estimators, exclude_estimators=self._exclude_estimators, include_preprocessors=self._include_preprocessors, exclude_preprocessors=self._exclude_preprocessors, disable_file_output=self._disable_evaluator_output, get_smac_object_callback=self._get_smac_object_callback, smac_scenario_args=self._smac_scenario_args, ) self.runhistory_, self.trajectory_ = \ _proc_smac.run_smbo() trajectory_filename = os.path.join( self._backend.get_smac_output_directory_for_run(self._seed), 'trajectory.json') saveable_trajectory = \ [list(entry[:2]) + [entry[2].get_dictionary()] + list(entry[3:]) for entry in self.trajectory_] with open(trajectory_filename, 'w') as fh: json.dump(saveable_trajectory, fh) # Wait until the ensemble process is finished to avoid shutting down # while the ensemble builder tries to access the data if self._proc_ensemble is not None and self._ensemble_size > 0: self._proc_ensemble.join() self._proc_ensemble = None self._load_models() return self
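# --- Hedged illustration of the resampling-strategy check in _fit above. ---
# Strings from the allowed list pass, and so do scikit-learn splitter *classes*,
# because the check falls back to issubclass against the CV base classes. The
# isinstance(strategy, type) guard below is our addition so that unknown strings
# return False instead of raising TypeError from issubclass; the helper name is
# ours, not auto-sklearn's.
from sklearn.model_selection import KFold
from sklearn.model_selection._split import (
    BaseCrossValidator, BaseShuffleSplit, _RepeatedSplits)


def is_valid_resampling_strategy(strategy):
    allowed_strings = ['holdout', 'holdout-iterative-fit', 'cv',
                       'partial-cv', 'partial-cv-iterative-fit']
    if strategy in allowed_strings:
        return True
    return isinstance(strategy, type) and issubclass(
        strategy, (BaseCrossValidator, _RepeatedSplits, BaseShuffleSplit))


# Pass the splitter class itself (not an instance), mirroring the issubclass check.
assert is_valid_resampling_strategy('cv')
assert is_valid_resampling_strategy(KFold)
assert not is_valid_resampling_strategy('bootstrap')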