def test_stopwatch_overhead(self):
    # Wall-clock and CPU overhead of the stopwatch itself
    start = time.time()
    cpu_start = time.clock()
    watch = StopWatch()
    for i in range(1, 1000):
        watch.start_task('task_%d' % i)
        watch.stop_task('task_%d' % i)
    cpu_stop = time.clock()
    stop = time.time()
    dur = stop - start
    cpu_dur = cpu_stop - cpu_start
    cpu_overhead = cpu_dur - watch.cpu_sum()
    wall_overhead = dur - watch.wall_sum()
    self.assertLess(cpu_overhead, 1)
    self.assertLess(wall_overhead, 1)
    self.assertLess(watch.cpu_sum(), 2 * watch.wall_sum())
def test_stopwatch_overhead(self):
    # CPU overhead
    start = time.clock()
    watch = StopWatch()
    for i in range(1, 100000):
        watch.start_task("task_%d" % i)
        watch.stop_task("task_%d" % i)
    stop = time.clock()
    dur = stop - start
    cpu_overhead = dur - watch.cpu_sum()
    self.assertLess(cpu_overhead, 1.5)

    # Wall overhead
    start = time.time()
    watch = StopWatch()
    for i in range(1, 100000):
        watch.start_task("task_%d" % i)
        watch.stop_task("task_%d" % i)
    stop = time.time()
    dur = stop - start
    wall_overhead = dur - watch.wall_sum()
    self.assertLess(wall_overhead, 2)
    self.assertLess(cpu_overhead, wall_overhead)
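# A minimal usage sketch of the StopWatch API the tests above exercise
# (assuming the StopWatch class from this code base): start_task/stop_task
# bracket a named task, and wall_sum/cpu_sum aggregate over all tasks. The
# task name and the sleep workload are illustrative only.
import time

watch = StopWatch()
watch.start_task('load_data')
time.sleep(0.1)  # stand-in for real work
watch.stop_task('load_data')
print('wall: %.3fs, cpu: %.3fs' % (watch.wall_sum(), watch.cpu_sum()))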
class AutoML(BaseEstimator, multiprocessing.Process):

    def __init__(self,
                 tmp_dir,
                 output_dir,
                 time_left_for_this_task,
                 per_run_time_limit,
                 log_dir=None,
                 initial_configurations_via_metalearning=25,
                 ensemble_size=1,
                 ensemble_nbest=1,
                 seed=1,
                 ml_memory_limit=3000,
                 metadata_directory=None,
                 queue=None,
                 keep_models=True,
                 debug_mode=False,
                 include_estimators=None,
                 include_preprocessors=None,
                 resampling_strategy='holdout-iterative-fit',
                 resampling_strategy_arguments=None,
                 delete_tmp_folder_after_terminate=False,
                 delete_output_folder_after_terminate=False,
                 shared_mode=False,
                 precision=32,
                 max_iter_smac=None,
                 acquisition_function='EI'):
        super(AutoML, self).__init__()

        self._tmp_dir = tmp_dir
        self._output_dir = output_dir
        self._time_for_task = time_left_for_this_task
        self._per_run_time_limit = per_run_time_limit
        self._log_dir = log_dir if log_dir is not None else self._tmp_dir
        self._initial_configurations_via_metalearning = \
            initial_configurations_via_metalearning
        self._ensemble_size = ensemble_size
        self._ensemble_nbest = ensemble_nbest
        self._seed = seed
        self._ml_memory_limit = ml_memory_limit
        self._data_memory_limit = None
        self._metadata_directory = metadata_directory
        self._queue = queue
        self._keep_models = keep_models
        self._include_estimators = include_estimators
        self._include_preprocessors = include_preprocessors
        self._resampling_strategy = resampling_strategy
        self._resampling_strategy_arguments = resampling_strategy_arguments
        self._max_iter_smac = max_iter_smac
        self.delete_tmp_folder_after_terminate = \
            delete_tmp_folder_after_terminate
        self.delete_output_folder_after_terminate = \
            delete_output_folder_after_terminate
        self._shared_mode = shared_mode
        self.precision = precision
        self.acquisition_function = acquisition_function

        self._datamanager = None
        self._dataset_name = None
        self._stopwatch = StopWatch()
        self._logger = None
        self._task = None
        self._metric = None
        self._label_num = None
        self._parser = None
        self.models_ = None
        self.ensemble_ = None
        self._can_predict = False
        self._debug_mode = debug_mode

        if not isinstance(self._time_for_task, int):
            raise ValueError("time_left_for_this_task not of type integer, "
                             "but %s" % str(type(self._time_for_task)))
        if not isinstance(self._per_run_time_limit, int):
            raise ValueError("per_run_time_limit not of type integer, but %s"
                             % str(type(self._per_run_time_limit)))

        # After assigning and checking variables...
        self._backend = Backend(self._output_dir, self._tmp_dir)

    def start_automl(self, parser):
        self._parser = parser
        self.start()

    def start(self):
        if self._parser is None:
            raise ValueError('You must invoke start() only via '
                             'start_automl()')
        super(AutoML, self).start()

    def run(self):
        if self._parser is None:
            raise ValueError('You must invoke run() only via start_automl()')
        self._backend.save_start_time(self._seed)
        self._stopwatch = StopWatch()
        datamanager = get_data_manager(namespace=self._parser)
        self._stopwatch.start_task(datamanager.name)

        self._logger = self._get_logger(datamanager.name)

        self._datamanager = datamanager
        self._dataset_name = datamanager.name
        self._fit(self._datamanager)

    def fit(self, X, y,
            task=MULTICLASS_CLASSIFICATION,
            metric='acc_metric',
            feat_type=None,
            dataset_name=None):
        if dataset_name is None:
            m = hashlib.md5()
            m.update(X.data)
            dataset_name = m.hexdigest()

        self._backend.save_start_time(self._seed)
        self._stopwatch = StopWatch()
        self._dataset_name = dataset_name
        self._stopwatch.start_task(self._dataset_name)

        self._logger = self._get_logger(dataset_name)

        if isinstance(metric, str):
            metric = STRING_TO_METRIC[metric]

        if feat_type is not None and len(feat_type) != X.shape[1]:
            raise ValueError('Array feat_type does not have same number of '
                             'variables as X has features. %d vs %d.' %
                             (len(feat_type), X.shape[1]))
        if feat_type is not None and not all(
                [isinstance(f, str) for f in feat_type]):
            raise ValueError('Array feat_type must only contain strings.')
        if feat_type is not None:
            for ft in feat_type:
                if ft.lower() not in ['categorical', 'numerical']:
                    raise ValueError('Only `Categorical` and `Numerical` are '
                                     'valid feature types, you passed `%s`' %
                                     ft)

        self._data_memory_limit = None
        loaded_data_manager = XYDataManager(X, y,
                                            task=task,
                                            metric=metric,
                                            feat_type=feat_type,
                                            dataset_name=dataset_name,
                                            encode_labels=False)

        return self._fit(loaded_data_manager)

    def fit_automl_dataset(self, dataset):
        self._stopwatch = StopWatch()
        self._backend.save_start_time(self._seed)

        name = os.path.basename(dataset)
        self._stopwatch.start_task(name)
        self._start_task(self._stopwatch, name)
        self._dataset_name = name

        self._logger = self._get_logger(name)
        self._logger.debug('======== Reading and converting data ==========')
        # Encoding the labels will be done after the metafeature calculation!
        self._data_memory_limit = float(self._ml_memory_limit) / 3
        loaded_data_manager = CompetitionDataManager(
            dataset, encode_labels=False,
            max_memory_in_mb=self._data_memory_limit)
        loaded_data_manager_str = str(loaded_data_manager).split('\n')
        for part in loaded_data_manager_str:
            self._logger.debug(part)

        return self._fit(loaded_data_manager)

    def _get_logger(self, name):
        logger_name = 'AutoML(%d):%s' % (self._seed, name)
        setup_logger(os.path.join(self._tmp_dir,
                                  '%s.log' % str(logger_name)))
        return get_logger(logger_name)

    @staticmethod
    def _start_task(watcher, task_name):
        watcher.start_task(task_name)

    @staticmethod
    def _stop_task(watcher, task_name):
        watcher.stop_task(task_name)

    @staticmethod
    def _print_load_time(basename, time_left_for_this_task,
                         time_for_load_data, logger):
        time_left_after_reading = max(
            0, time_left_for_this_task - time_for_load_data)
        logger.info('Remaining time after reading %s %5.2f sec' %
                    (basename, time_left_after_reading))
        return time_for_load_data

    def _do_dummy_prediction(self, datamanager, num_run):
        self._logger.info("Starting to create dummy predictions.")
        time_limit = int(self._time_for_task / 6.)
        memory_limit = int(self._ml_memory_limit)

        _info = eval_with_limits(datamanager, self._tmp_dir, 1,
                                 self._seed, num_run,
                                 self._resampling_strategy,
                                 self._resampling_strategy_arguments,
                                 memory_limit, time_limit)
        if _info[4] == StatusType.SUCCESS:
            self._logger.info("Finished creating dummy prediction 1/2.")
        else:
            self._logger.error('Error creating dummy prediction 1/2: %s',
                               _info[3])
        num_run += 1

        _info = eval_with_limits(datamanager, self._tmp_dir, 2,
                                 self._seed, num_run,
                                 self._resampling_strategy,
                                 self._resampling_strategy_arguments,
                                 memory_limit, time_limit)
        if _info[4] == StatusType.SUCCESS:
            self._logger.info("Finished creating dummy prediction 2/2.")
        else:
            self._logger.error('Error creating dummy prediction 2/2: %s',
                               _info[3])
        num_run += 1

        return num_run

    def _fit(self, datamanager):
        # Reset learnt stuff
        self.models_ = None
        self.ensemble_ = None

        # Check arguments prior to doing anything!
        if self._resampling_strategy not in ['holdout',
                                             'holdout-iterative-fit',
                                             'cv', 'nested-cv', 'partial-cv']:
            raise ValueError('Illegal resampling strategy: %s' %
                             self._resampling_strategy)
        if self._resampling_strategy == 'partial-cv' and \
                self._ensemble_size != 0:
            raise ValueError("Resampling strategy partial-cv cannot be used "
                             "together with ensembles.")

        acquisition_functions = ['EI', 'EIPS']
        if self.acquisition_function not in acquisition_functions:
            raise ValueError('Illegal acquisition function %s: Must be one '
                             'of %s.' % (self.acquisition_function,
                                         acquisition_functions))

        self._backend._make_internals_directory()
        if self._keep_models:
            try:
                os.mkdir(self._backend.get_model_dir())
            except OSError:
                self._logger.warning("model directory already exists")
                if not self._shared_mode:
                    raise

        self._metric = datamanager.info['metric']
        self._task = datamanager.info['task']
        self._label_num = datamanager.info['label_num']

        # == Pickle the data manager to speed up loading
        data_manager_path = self._backend.save_datamanager(datamanager)

        self._save_ensemble_data(datamanager.data['X_train'],
                                 datamanager.data['Y_train'])

        time_for_load_data = self._stopwatch.wall_elapsed(self._dataset_name)

        if self._debug_mode:
            self._print_load_time(self._dataset_name,
                                  self._time_for_task,
                                  time_for_load_data,
                                  self._logger)

        # == Perform dummy predictions
        num_run = 1
        # if self._resampling_strategy in ['holdout',
        #                                  'holdout-iterative-fit']:
        num_run = self._do_dummy_prediction(datamanager, num_run)

        # == Create a search space
        # Do this before one-hot encoding to make sure that it creates a
        # search space for a dense classifier even if one-hot encoding would
        # make it sparse (tradeoff: if one-hot encoding would make it sparse,
        # densifier and truncatedSVD would probably lead to a MemoryError;
        # like this we can't use some of the preprocessing methods in case
        # the data becomes sparse)
        self.configuration_space, configspace_path = \
            self._create_search_space(self._tmp_dir,
                                      self._backend,
                                      datamanager,
                                      self._include_estimators,
                                      self._include_preprocessors)

        # == RUN ensemble builder
        # Do this before calculating the meta-features to make sure that the
        # dummy predictions are actually included in the ensemble even if
        # calculating the meta-features takes very long
        ensemble_task_name = 'runEnsemble'
        self._stopwatch.start_task(ensemble_task_name)
        time_left_for_ensembles = max(
            0, self._time_for_task -
            self._stopwatch.wall_elapsed(self._dataset_name))
        if self._logger:
            self._logger.info('Start Ensemble with %5.2fsec time left' %
                              time_left_for_ensembles)
        if time_left_for_ensembles <= 0:
            self._logger.warning("Not starting ensemble builder because "
                                 "there is no time left!")
            self._proc_ensemble = None
        else:
            self._proc_ensemble = self._get_ensemble_process(
                time_left_for_ensembles)
            self._proc_ensemble.start()
        self._stopwatch.stop_task(ensemble_task_name)

        # == RUN SMBO
        # Kill the datamanager as it will be re-loaded anyway from the sub
        # processes
        try:
            del self._datamanager
        except Exception:
            pass

        # => RUN SMAC
        smac_task_name = 'runSMAC'
        self._stopwatch.start_task(smac_task_name)
        time_left_for_smac = max(
            0, self._time_for_task -
            self._stopwatch.wall_elapsed(self._dataset_name))

        if self._logger:
            self._logger.info('Start SMAC with %5.2fsec time left' %
                              time_left_for_smac)
        if time_left_for_smac <= 0:
            self._logger.warning("Not starting SMAC because there is no "
                                 "time left.")
            # Fixed a typo here: the original assigned self._procsmac, which
            # left self._proc_smac undefined further below
            self._proc_smac = None
        else:
            self._proc_smac = AutoMLSMBO(
                config_space=self.configuration_space,
                dataset_name=self._dataset_name,
                tmp_dir=self._tmp_dir,
                output_dir=self._output_dir,
                total_walltime_limit=time_left_for_smac,
                func_eval_time_limit=self._per_run_time_limit,
                memory_limit=self._ml_memory_limit,
                data_memory_limit=self._data_memory_limit,
                watcher=self._stopwatch,
                start_num_run=num_run,
                num_metalearning_cfgs=self.
                _initial_configurations_via_metalearning,
                config_file=configspace_path,
                smac_iters=self._max_iter_smac,
                seed=self._seed,
                metadata_directory=self._metadata_directory,
                resampling_strategy=self._resampling_strategy,
                resampling_strategy_args=self._resampling_strategy_arguments,
                acquisition_function=self.acquisition_function,
                shared_mode=self._shared_mode)
            self._proc_smac.start()

        psutil_procs = []
        procs = []
        if self._proc_smac is not None:
            proc_smac = psutil.Process(self._proc_smac.pid)
            psutil_procs.append(proc_smac)
            procs.append(self._proc_smac)
        if self._proc_ensemble is not None:
            proc_ensemble = psutil.Process(self._proc_ensemble.pid)
            psutil_procs.append(proc_ensemble)
            procs.append(self._proc_ensemble)

        if self._queue is not None:
            self._queue.put([time_for_load_data, data_manager_path,
                             psutil_procs])
        else:
            for proc in procs:
                proc.join()

        if self._queue is None:
            self._load_models()

        return self

    def refit(self, X, y):
        if self._keep_models is not True:
            raise ValueError(
                "Refit can only be called if 'keep_models==True'")
        if self.models_ is None or len(self.models_) == 0 or \
                self.ensemble_ is None:
            self._load_models()

        for identifier in self.models_:
            if identifier in self.ensemble_.get_model_identifiers():
                model = self.models_[identifier]
                # This updates the model in place; it can then later be used
                # in the predict method
                model.fit(X.copy(), y.copy())

        self._can_predict = True

    def predict(self, X):
        return np.argmax(self.predict_proba(X), axis=1)

    def predict_proba(self, X):
        if self._keep_models is not True:
            raise ValueError(
                "Predict can only be called if 'keep_models==True'")
        if not self._can_predict and \
                self._resampling_strategy not in \
                ['holdout', 'holdout-iterative-fit']:
            raise NotImplementedError(
                'Predict is currently only implemented for resampling '
                'strategy holdout.')

        if self.models_ is None or len(self.models_) == 0 or \
                self.ensemble_ is None:
            self._load_models()

        all_predictions = []
        for identifier in self.ensemble_.get_model_identifiers():
            model = self.models_[identifier]

            X_ = X.copy()
            if self._task in REGRESSION_TASKS:
                prediction = model.predict(X_)
            else:
                prediction = model.predict_proba(X_)

            if len(prediction.shape) < 1 or len(X_.shape) < 1 or \
                    X_.shape[0] < 1 or prediction.shape[0] != X_.shape[0]:
                self._logger.warning("Prediction shape for model %s is %s "
                                     "while X_.shape is %s" %
                                     (model, str(prediction.shape),
                                      str(X_.shape)))
            all_predictions.append(prediction)

        if len(all_predictions) == 0:
            # Note: the original referenced self.ensemble_indices_ here, an
            # attribute this version of the class no longer defines
            raise ValueError('Something went wrong generating the '
                             'predictions. The ensemble should consist of '
                             'the following models: %s, the following '
                             'models were loaded: %s' %
                             (str(self.ensemble_.get_model_identifiers()),
                              str(list(self.models_.keys()))))

        predictions = self.ensemble_.predict(all_predictions)
        return predictions

    def fit_ensemble(self, task=None, metric=None, precision='32',
                     dataset_name=None, ensemble_nbest=None,
                     ensemble_size=None):
        if self._logger is None:
            self._logger = self._get_logger(dataset_name)

        self._proc_ensemble = self._get_ensemble_process(
            1, task, metric, precision, dataset_name, max_iterations=1,
            ensemble_nbest=ensemble_nbest, ensemble_size=ensemble_size)
        self._proc_ensemble.main()

    def _get_ensemble_process(self, time_left_for_ensembles,
                              task=None, metric=None, precision=None,
                              dataset_name=None, max_iterations=-1,
                              ensemble_nbest=None, ensemble_size=None):
        if task is None:
            task = self._task
        if metric is None:
            metric = self._metric
        if precision is None:
            precision = self.precision
        if dataset_name is None:
            dataset_name = self._dataset_name
        if ensemble_nbest is None:
            ensemble_nbest = self._ensemble_nbest
        if ensemble_size is None:
            ensemble_size = self._ensemble_size

        return EnsembleBuilder(autosklearn_tmp_dir=self._tmp_dir,
                               dataset_name=dataset_name,
                               task_type=task,
                               metric=metric,
                               limit=time_left_for_ensembles,
                               output_dir=self._output_dir,
                               ensemble_size=ensemble_size,
                               ensemble_nbest=ensemble_nbest,
                               seed=self._seed,
                               shared_mode=self._shared_mode,
                               precision=precision,
                               max_iterations=max_iterations)

    def _load_models(self):
        if self._shared_mode:
            seed = -1
        else:
            seed = self._seed

        self.ensemble_ = self._backend.load_ensemble(seed)
        if self.ensemble_:
            identifiers = self.ensemble_.identifiers_
            self.models_ = self._backend.load_models_by_identifiers(
                identifiers)
        else:
            self.models_ = self._backend.load_all_models(seed)

        if len(self.models_) == 0:
            raise ValueError('No models fitted!')

    def score(self, X, y):
        # fix: consider only index 1 of the second dimension
        # Don't know if the reshaping should be done here or in
        # calculate_score
        prediction = self.predict_proba(X)
        return calculate_score(y, prediction, self._task,
                               self._metric, self._label_num,
                               logger=self._logger)

    def show_models(self):
        if self.models_ is None or len(self.models_) == 0 or \
                self.ensemble_ is None:
            self._load_models()

        return self.ensemble_.pprint_ensemble_string(self.models_)

    def _save_ensemble_data(self, X, y):
        """Split dataset and store data for the ensemble script.
        :param X:
        :param y:
        :return:
        """
        task_name = 'LoadData'
        self._start_task(self._stopwatch, task_name)
        _, _, _, y_ensemble = resampling.split_data(X, y)
        self._backend.save_targets_ensemble(y_ensemble)
        self._stop_task(self._stopwatch, task_name)

    def _create_search_space(self, tmp_dir, backend, datamanager,
                             include_estimators=None,
                             include_preprocessors=None):
        task_name = 'CreateConfigSpace'
        self._stopwatch.start_task(task_name)
        configspace_path = os.path.join(tmp_dir, 'space.pcs')
        configuration_space = pipeline.get_configuration_space(
            datamanager.info,
            include_estimators=include_estimators,
            include_preprocessors=include_preprocessors)
        configuration_space = self.configuration_space_created_hook(
            datamanager, configuration_space)
        sp_string = pcs.write(configuration_space)
        backend.write_txt_file(configspace_path, sp_string,
                               'Configuration space')
        self._stopwatch.stop_task(task_name)

        return configuration_space, configspace_path

    def configuration_space_created_hook(self, datamanager,
                                         configuration_space):
        return configuration_space

    def get_params(self, deep=True):
        raise NotImplementedError('auto-sklearn does not implement '
                                  'get_params() because it is not intended '
                                  'to be optimized.')

    def set_params(self, deep=True):
        raise NotImplementedError('auto-sklearn does not implement '
                                  'set_params() because it is not intended '
                                  'to be optimized.')

    def __del__(self):
        self._delete_output_directories()

    def _delete_output_directories(self):
        if self.delete_output_folder_after_terminate:
            try:
                shutil.rmtree(self._output_dir)
            except Exception:
                if self._logger is not None:
                    self._logger.warning("Could not delete output dir: %s" %
                                         self._output_dir)
                else:
                    print("Could not delete output dir: %s" %
                          self._output_dir)

        if self.delete_tmp_folder_after_terminate:
            try:
                shutil.rmtree(self._tmp_dir)
            except Exception:
                if self._logger is not None:
                    self._logger.warning("Could not delete tmp dir: %s" %
                                         self._tmp_dir)
                else:
                    print("Could not delete tmp dir: %s" % self._tmp_dir)
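# A minimal usage sketch of the AutoML class above, assuming numpy inputs
# and the BINARY_CLASSIFICATION constant used elsewhere in this code base.
# The directories and the 60/6-second budgets are illustrative values, not
# recommended settings; both time arguments must be ints, as checked by the
# constructor.
import numpy as np

X = np.random.random((100, 10)).astype(np.float32)
y = np.random.randint(0, 2, size=100)

automl = AutoML(tmp_dir='/tmp/autosklearn_tmp',
                output_dir='/tmp/autosklearn_out',
                time_left_for_this_task=60,
                per_run_time_limit=6)
automl.fit(X, y, task=BINARY_CLASSIFICATION, metric='acc_metric')
y_hat = automl.predict(X)  # argmax over predict_proba
print(automl.score(X, y))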
    def main(self):
        watch = StopWatch()
        watch.start_task('ensemble_builder')

        used_time = 0
        time_iter = 0
        index_run = 0
        num_iteration = 0
        current_num_models = 0
        last_hash = None
        current_hash = None

        backend = Backend(self.output_dir, self.autosklearn_tmp_dir)
        dir_ensemble = os.path.join(self.autosklearn_tmp_dir,
                                    '.auto-sklearn',
                                    'predictions_ensemble')
        dir_valid = os.path.join(self.autosklearn_tmp_dir,
                                 '.auto-sklearn',
                                 'predictions_valid')
        dir_test = os.path.join(self.autosklearn_tmp_dir,
                                '.auto-sklearn',
                                'predictions_test')
        paths_ = [dir_ensemble, dir_valid, dir_test]

        dir_ensemble_list_mtimes = []

        self.logger.debug('Starting main loop with %f seconds and %d '
                          'iterations left.' %
                          (self.limit - used_time, num_iteration))
        while used_time < self.limit or (self.max_iterations > 0 and
                                         self.max_iterations >=
                                         num_iteration):
            num_iteration += 1
            self.logger.debug('Time left: %f', self.limit - used_time)
            self.logger.debug('Time last ensemble building: %f', time_iter)

            # Reload the ensemble targets every iteration. This is important
            # because cv may update the ensemble targets in the course of
            # running auto-sklearn.
            # TODO update cv in order to not need this any more!
            targets_ensemble = backend.load_targets_ensemble()

            # Load the predictions from the models
            exists = [os.path.isdir(dir_) for dir_ in paths_]
            if not exists[0]:  # all(exists):
                self.logger.debug('Prediction directory %s does not exist!' %
                                  dir_ensemble)
                time.sleep(2)
                used_time = watch.wall_elapsed('ensemble_builder')
                continue

            if self.shared_mode is False:
                dir_ensemble_list = sorted(glob.glob(os.path.join(
                    dir_ensemble,
                    'predictions_ensemble_%s_*.npy' % self.seed)))
                if exists[1]:
                    dir_valid_list = sorted(glob.glob(os.path.join(
                        dir_valid,
                        'predictions_valid_%s_*.npy' % self.seed)))
                else:
                    dir_valid_list = []
                if exists[2]:
                    dir_test_list = sorted(glob.glob(os.path.join(
                        dir_test,
                        'predictions_test_%s_*.npy' % self.seed)))
                else:
                    dir_test_list = []
            else:
                dir_ensemble_list = sorted(os.listdir(dir_ensemble))
                dir_valid_list = sorted(os.listdir(dir_valid)) \
                    if exists[1] else []
                dir_test_list = sorted(os.listdir(dir_test)) \
                    if exists[2] else []

            # Check the modification times because predictions can be
            # updated over time!
            old_dir_ensemble_list_mtimes = dir_ensemble_list_mtimes
            dir_ensemble_list_mtimes = []

            for dir_ensemble_file in dir_ensemble_list:
                if dir_ensemble_file.endswith("/"):
                    dir_ensemble_file = dir_ensemble_file[:-1]
                basename = os.path.basename(dir_ensemble_file)
                dir_ensemble_file = os.path.join(dir_ensemble, basename)
                mtime = os.path.getmtime(dir_ensemble_file)
                dir_ensemble_list_mtimes.append(mtime)

            if len(dir_ensemble_list) == 0:
                self.logger.debug('Directories are empty')
                time.sleep(2)
                used_time = watch.wall_elapsed('ensemble_builder')
                continue

            if len(dir_ensemble_list) <= current_num_models and \
                    old_dir_ensemble_list_mtimes == dir_ensemble_list_mtimes:
                self.logger.debug('Nothing has changed since the last time')
                time.sleep(2)
                used_time = watch.wall_elapsed('ensemble_builder')
                continue

            watch.start_task('index_run' + str(index_run))
            watch.start_task('ensemble_iter_' + str(num_iteration))

            # List of num_runs (which are in the filename) which will be
            # included later
            include_num_runs = []
            backup_num_runs = []
            model_and_automl_re = re.compile(r'_([0-9]*)_([0-9]*)\.npy$')
            if self.ensemble_nbest is not None:
                # Keeps track of the single scores of each model in our
                # ensemble
                scores_nbest = []
                # The indices of the models that are currently in our
                # ensemble
                indices_nbest = []
                # The names of the models
                model_names = []

            model_names_to_scores = dict()

            model_idx = 0
            for model_name in dir_ensemble_list:
                if model_name.endswith("/"):
                    model_name = model_name[:-1]
                basename = os.path.basename(model_name)

                try:
                    # Use == for string comparison; the original used 'is',
                    # which compares object identity and only works by
                    # accident of string interning
                    if self.precision == "16":
                        predictions = np.load(os.path.join(
                            dir_ensemble, basename)).astype(dtype=np.float16)
                    elif self.precision == "32":
                        predictions = np.load(os.path.join(
                            dir_ensemble, basename)).astype(dtype=np.float32)
                    elif self.precision == "64":
                        predictions = np.load(os.path.join(
                            dir_ensemble, basename)).astype(dtype=np.float64)
                    else:
                        predictions = np.load(os.path.join(dir_ensemble,
                                                           basename))
                    score = calculate_score(targets_ensemble, predictions,
                                            self.task_type, self.metric,
                                            predictions.shape[1])
                except Exception as e:
                    self.logger.warning('Error loading %s: %s', basename, e)
                    score = -1

                model_names_to_scores[model_name] = score
                match = model_and_automl_re.search(model_name)
                automl_seed = int(match.group(1))
                num_run = int(match.group(2))

                if self.ensemble_nbest is not None:
                    if score <= 0.001:
                        self.logger.error('Model only predicts at random: ' +
                                          model_name + ' has score: ' +
                                          str(score))
                        backup_num_runs.append((automl_seed, num_run))
                    # If we have fewer models in our ensemble than
                    # ensemble_nbest, add the current model if it is better
                    # than random
                    elif len(scores_nbest) < self.ensemble_nbest:
                        scores_nbest.append(score)
                        indices_nbest.append(model_idx)
                        include_num_runs.append((automl_seed, num_run))
                        model_names.append(model_name)
                    else:
                        # Take the worst performing model in our ensemble
                        # so far
                        idx = np.argmin(np.array([scores_nbest]))

                        # If the current model is better than the worst
                        # model in our ensemble, replace it with the current
                        # model
                        if scores_nbest[idx] < score:
                            self.logger.debug(
                                'Worst model in our ensemble: %s with score '
                                '%f will be replaced by model %s with score '
                                '%f', model_names[idx], scores_nbest[idx],
                                model_name, score)
                            # Exclude the old model
                            del scores_nbest[idx]
                            scores_nbest.append(score)
                            del include_num_runs[idx]
                            del indices_nbest[idx]
                            indices_nbest.append(model_idx)
                            include_num_runs.append((automl_seed, num_run))
                            del model_names[idx]
                            model_names.append(model_name)
                        # Otherwise exclude the current model from the
                        # ensemble
                        else:
                            pass
                else:
                    # Load all predictions that are better than random
                    if score <= 0.001:
                        self.logger.error('Model only predicts at random: ' +
                                          model_name + ' has score: ' +
                                          str(score))
                        backup_num_runs.append((automl_seed, num_run))
                    else:
                        include_num_runs.append((automl_seed, num_run))

                model_idx += 1

            # If there is no model better than random guessing, we have to
            # use all models which do random guessing
            if len(include_num_runs) == 0:
                include_num_runs = backup_num_runs

            indices_to_model_names = dict()
            indices_to_run_num = dict()
            for i, model_name in enumerate(dir_ensemble_list):
                match = model_and_automl_re.search(model_name)
                automl_seed = int(match.group(1))
                num_run = int(match.group(2))
                if (automl_seed, num_run) in include_num_runs:
                    num_indices = len(indices_to_model_names)
                    indices_to_model_names[num_indices] = model_name
                    indices_to_run_num[num_indices] = (automl_seed, num_run)

            try:
                all_predictions_train, all_predictions_valid, \
                    all_predictions_test = self.get_all_predictions(
                        dir_ensemble, dir_ensemble_list,
                        dir_valid, dir_valid_list,
                        dir_test, dir_test_list,
                        include_num_runs,
                        model_and_automl_re,
                        self.precision)
            except IOError:
                self.logger.error('Could not load the predictions.')
                continue

            if len(include_num_runs) == 0:
                self.logger.error('All models do just random guessing')
                time.sleep(2)
                continue
            else:
                ensemble = EnsembleSelection(
                    ensemble_size=self.ensemble_size,
                    task_type=self.task_type,
                    metric=self.metric)

                try:
                    ensemble.fit(all_predictions_train, targets_ensemble,
                                 include_num_runs)
                    self.logger.info(ensemble)
                except ValueError as e:
                    self.logger.error('Caught ValueError: ' + str(e))
                    used_time = watch.wall_elapsed('ensemble_builder')
                    time.sleep(2)
                    continue
                except IndexError as e:
                    self.logger.error('Caught IndexError: ' + str(e))
                    used_time = watch.wall_elapsed('ensemble_builder')
                    time.sleep(2)
                    continue
                except Exception as e:
                    self.logger.error('Caught error! %s', str(e))
                    used_time = watch.wall_elapsed('ensemble_builder')
                    time.sleep(2)
                    continue

                # Output the score
                self.logger.info('Training performance: %f' %
                                 ensemble.train_score_)

                self.logger.info(
                    'Building the ensemble took %f seconds' %
                    watch.wall_elapsed('ensemble_iter_' +
                                       str(num_iteration)))

            # Set this variable here to avoid re-running the ensemble
            # builder every two seconds in case the ensemble did not change
            current_num_models = len(dir_ensemble_list)

            ensemble_predictions = ensemble.predict(all_predictions_train)
            if sys.version_info[0] == 2:
                ensemble_predictions.flags.writeable = False
                current_hash = hash(ensemble_predictions.data)
            else:
                current_hash = hash(ensemble_predictions.data.tobytes())

            # Only output a new ensemble and new predictions if the output
            # of the ensemble would actually change!
            # TODO this is not safe (hash collisions; it only tests the
            # ensemble predictions, not the ensemble itself). Implement a
            # hash function for each possible ensemble builder.
            if last_hash is not None:
                if current_hash == last_hash:
                    self.logger.info('Ensemble output did not change.')
                    time.sleep(2)
                    continue
                else:
                    last_hash = current_hash
            else:
                last_hash = current_hash

            # Save the ensemble for later use in the main auto-sklearn
            # module!
            backend.save_ensemble(ensemble, index_run, self.seed)

            # Save predictions for the valid and test data sets
            if len(dir_valid_list) == len(dir_ensemble_list):
                all_predictions_valid = np.array(all_predictions_valid)
                ensemble_predictions_valid = ensemble.predict(
                    all_predictions_valid)
                if self.task_type == BINARY_CLASSIFICATION:
                    ensemble_predictions_valid = \
                        ensemble_predictions_valid[:, 1]
                if self.low_precision:
                    if self.task_type in [BINARY_CLASSIFICATION,
                                          MULTICLASS_CLASSIFICATION,
                                          MULTILABEL_CLASSIFICATION]:
                        ensemble_predictions_valid[
                            ensemble_predictions_valid < 1e-4] = 0.
                    if self.metric in [BAC_METRIC, F1_METRIC]:
                        bin_array = np.zeros(
                            ensemble_predictions_valid.shape,
                            dtype=np.int32)
                        if (self.task_type != MULTICLASS_CLASSIFICATION) or \
                                (ensemble_predictions_valid.shape[1] == 1):
                            bin_array[ensemble_predictions_valid >= 0.5] = 1
                        else:
                            sample_num = ensemble_predictions_valid.shape[0]
                            for i in range(sample_num):
                                j = np.argmax(
                                    ensemble_predictions_valid[i, :])
                                bin_array[i, j] = 1
                        ensemble_predictions_valid = bin_array
                    if self.task_type in CLASSIFICATION_TASKS:
                        if ensemble_predictions_valid.size < (20000 * 20):
                            precision = 3
                        else:
                            precision = 2
                    else:
                        if ensemble_predictions_valid.size > 1000000:
                            precision = 4
                        else:
                            # File size of at most roughly 2.1 MB
                            precision = 6

                backend.save_predictions_as_txt(ensemble_predictions_valid,
                                                'valid', index_run,
                                                prefix=self.dataset_name,
                                                precision=precision)
            else:
                self.logger.info('Could not find as many validation set '
                                 'predictions (%d) as ensemble predictions '
                                 '(%d)!',
                                 len(dir_valid_list),
                                 len(dir_ensemble_list))

            del all_predictions_valid

            if len(dir_test_list) == len(dir_ensemble_list):
                all_predictions_test = np.array(all_predictions_test)
                ensemble_predictions_test = ensemble.predict(
                    all_predictions_test)
                if self.task_type == BINARY_CLASSIFICATION:
                    ensemble_predictions_test = \
                        ensemble_predictions_test[:, 1]
                if self.low_precision:
                    if self.task_type in [BINARY_CLASSIFICATION,
                                          MULTICLASS_CLASSIFICATION,
                                          MULTILABEL_CLASSIFICATION]:
                        ensemble_predictions_test[
                            ensemble_predictions_test < 1e-4] = 0.
                    if self.metric in [BAC_METRIC, F1_METRIC]:
                        bin_array = np.zeros(
                            ensemble_predictions_test.shape,
                            dtype=np.int32)
                        if (self.task_type != MULTICLASS_CLASSIFICATION) or \
                                (ensemble_predictions_test.shape[1] == 1):
                            bin_array[ensemble_predictions_test >= 0.5] = 1
                        else:
                            sample_num = ensemble_predictions_test.shape[0]
                            for i in range(sample_num):
                                j = np.argmax(
                                    ensemble_predictions_test[i, :])
                                bin_array[i, j] = 1
                        ensemble_predictions_test = bin_array
                    if self.task_type in CLASSIFICATION_TASKS:
                        if ensemble_predictions_test.size < (20000 * 20):
                            precision = 3
                        else:
                            precision = 2
                    else:
                        if ensemble_predictions_test.size > 1000000:
                            precision = 4
                        else:
                            precision = 6

                backend.save_predictions_as_txt(ensemble_predictions_test,
                                                'test', index_run,
                                                prefix=self.dataset_name,
                                                precision=precision)
            else:
                self.logger.info('Could not find as many test set '
                                 'predictions (%d) as ensemble predictions '
                                 '(%d)!',
                                 len(dir_test_list),
                                 len(dir_ensemble_list))

            del all_predictions_test

            current_num_models = len(dir_ensemble_list)
            watch.stop_task('index_run' + str(index_run))
            time_iter = watch.get_wall_dur('index_run' + str(index_run))
            used_time = watch.wall_elapsed('ensemble_builder')
            index_run += 1
        return
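# A small illustration of the filename convention the loop above relies on:
# prediction dumps are named 'predictions_ensemble_<seed>_<num_run>.npy',
# and the trailing '_<seed>_<num_run>.npy' is parsed back out with the same
# regex used above. The example filename is made up for demonstration.
import re

model_and_automl_re = re.compile(r'_([0-9]*)_([0-9]*)\.npy$')
match = model_and_automl_re.search('predictions_ensemble_1_0042.npy')
automl_seed, num_run = int(match.group(1)), int(match.group(2))
assert (automl_seed, num_run) == (1, 42)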
class AutoML(BaseEstimator, multiprocessing.Process):

    def __init__(self,
                 tmp_dir,
                 output_dir,
                 time_left_for_this_task,
                 per_run_time_limit,
                 log_dir=None,
                 initial_configurations_via_metalearning=25,
                 ensemble_size=1,
                 ensemble_nbest=1,
                 seed=1,
                 ml_memory_limit=3000,
                 metadata_directory=None,
                 queue=None,
                 keep_models=True,
                 debug_mode=False,
                 include_estimators=None,
                 include_preprocessors=None,
                 resampling_strategy='holdout-iterative-fit',
                 resampling_strategy_arguments=None,
                 delete_tmp_folder_after_terminate=False,
                 delete_output_folder_after_terminate=False,
                 shared_mode=False,
                 precision=32):
        super(AutoML, self).__init__()

        self._tmp_dir = tmp_dir
        self._output_dir = output_dir
        self._time_for_task = time_left_for_this_task
        self._per_run_time_limit = per_run_time_limit
        self._log_dir = log_dir if log_dir is not None else self._tmp_dir
        self._initial_configurations_via_metalearning = \
            initial_configurations_via_metalearning
        self._ensemble_size = ensemble_size
        self._ensemble_nbest = ensemble_nbest
        self._seed = seed
        self._ml_memory_limit = ml_memory_limit
        self._metadata_directory = metadata_directory
        self._queue = queue
        self._keep_models = keep_models
        self._include_estimators = include_estimators
        self._include_preprocessors = include_preprocessors
        self._resampling_strategy = resampling_strategy
        self._resampling_strategy_arguments = resampling_strategy_arguments
        self.delete_tmp_folder_after_terminate = \
            delete_tmp_folder_after_terminate
        self.delete_output_folder_after_terminate = \
            delete_output_folder_after_terminate
        self._shared_mode = shared_mode
        self.precision = precision

        self._datamanager = None
        self._dataset_name = None
        self._stopwatch = StopWatch()
        self._logger = None
        self._task = None
        self._metric = None
        self._label_num = None
        self.models_ = None
        self.ensemble_indices_ = None
        self._debug_mode = debug_mode
        self._backend = Backend(self._output_dir, self._tmp_dir)

    def start_automl(self, parser):
        self._backend.save_start_time(self._seed)
        self._stopwatch = StopWatch()
        datamanager = get_data_manager(namespace=parser)
        self._stopwatch.start_task(datamanager.name)

        logger_name = 'AutoML(%d):%s' % (self._seed, datamanager.name)
        setup_logger(os.path.join(self._tmp_dir,
                                  '%s.log' % str(logger_name)))
        self._logger = get_logger(logger_name)

        self._datamanager = datamanager
        self._dataset_name = datamanager.name
        self.start()

    def start(self):
        if self._datamanager is None:
            raise ValueError('You must invoke start() only via '
                             'start_automl()')
        super(AutoML, self).start()

    def run(self):
        if self._datamanager is None:
            raise ValueError('You must invoke run() only via start_automl()')
        self._fit(self._datamanager)

    def fit(self, X, y,
            task=MULTICLASS_CLASSIFICATION,
            metric='acc_metric',
            feat_type=None,
            dataset_name=None):
        if dataset_name is None:
            m = hashlib.md5()
            m.update(X.data)
            dataset_name = m.hexdigest()

        self._backend.save_start_time(self._seed)
        self._stopwatch = StopWatch()
        self._dataset_name = dataset_name
        self._stopwatch.start_task(self._dataset_name)

        logger_name = 'AutoML(%d):%s' % (self._seed, dataset_name)
        setup_logger(os.path.join(self._tmp_dir,
                                  '%s.log' % str(logger_name)))
        self._logger = get_logger(logger_name)

        if isinstance(metric, str):
            metric = STRING_TO_METRIC[metric]

        if feat_type is not None and len(feat_type) != X.shape[1]:
            raise ValueError('Array feat_type does not have same number of '
                             'variables as X has features. %d vs %d.'
                             % (len(feat_type), X.shape[1]))
        if feat_type is not None and not all([isinstance(f, bool)
                                              for f in feat_type]):
            raise ValueError('Array feat_type must only contain bools.')

        loaded_data_manager = XYDataManager(X, y,
                                            task=task,
                                            metric=metric,
                                            feat_type=feat_type,
                                            dataset_name=dataset_name,
                                            encode_labels=False)

        return self._fit(loaded_data_manager)

    def fit_automl_dataset(self, dataset):
        self._stopwatch = StopWatch()
        self._backend.save_start_time(self._seed)

        name = os.path.basename(dataset)
        self._stopwatch.start_task(name)
        self._start_task(self._stopwatch, name)
        self._dataset_name = name

        logger_name = 'AutoML(%d):%s' % (self._seed, name)
        setup_logger(os.path.join(self._tmp_dir,
                                  '%s.log' % str(logger_name)))
        self._logger = get_logger(logger_name)

        self._logger.debug('======== Reading and converting data ==========')
        # Encoding the labels will be done after the metafeature calculation!
        loaded_data_manager = CompetitionDataManager(
            dataset, encode_labels=False,
            max_memory_in_mb=float(self._ml_memory_limit) / 3)
        loaded_data_manager_str = str(loaded_data_manager).split('\n')
        for part in loaded_data_manager_str:
            self._logger.debug(part)

        return self._fit(loaded_data_manager)

    @staticmethod
    def _start_task(watcher, task_name):
        watcher.start_task(task_name)

    @staticmethod
    def _stop_task(watcher, task_name):
        watcher.stop_task(task_name)

    @staticmethod
    def _print_load_time(basename, time_left_for_this_task,
                         time_for_load_data, logger):
        time_left_after_reading = max(
            0, time_left_for_this_task - time_for_load_data)
        logger.info('Remaining time after reading %s %5.2f sec' %
                    (basename, time_left_after_reading))
        return time_for_load_data

    def _do_dummy_prediction(self, datamanager):
        autosklearn.cli.base_interface.main(
            datamanager,
            self._resampling_strategy,
            None,
            None,
            mode_args=self._resampling_strategy_arguments)

    def _fit(self, datamanager):
        # Reset learnt stuff
        self.models_ = None
        self.ensemble_indices_ = None

        # Check arguments prior to doing anything!
        if self._resampling_strategy not in ['holdout',
                                             'holdout-iterative-fit',
                                             'cv', 'nested-cv', 'partial-cv']:
            raise ValueError('Illegal resampling strategy: %s' %
                             self._resampling_strategy)
        if self._resampling_strategy == 'partial-cv' and \
                self._ensemble_size != 0:
            raise ValueError("Resampling strategy partial-cv cannot be used "
                             "together with ensembles.")

        self._backend._make_internals_directory()
        if self._keep_models:
            try:
                os.mkdir(self._backend.get_model_dir())
            except OSError:
                self._logger.warning("model directory already exists")
                if not self._shared_mode:
                    raise

        self._metric = datamanager.info['metric']
        self._task = datamanager.info['task']
        self._label_num = datamanager.info['label_num']

        set_auto_seed(self._seed)

        # == Pickle the data manager here, because there is no more global
        # OneHotEncoding
        data_manager_path = self._backend.save_datamanager(datamanager)

        self._save_ensemble_data(
            datamanager.data['X_train'],
            datamanager.data['Y_train'])

        time_for_load_data = self._stopwatch.wall_elapsed(self._dataset_name)

        if self._debug_mode:
            self._print_load_time(
                self._dataset_name,
                self._time_for_task,
                time_for_load_data,
                self._logger)

        # == Perform dummy predictions
        self._do_dummy_prediction(datamanager)

        # == Create a search space
        # Do this before one-hot encoding to make sure that it creates a
        # search space for a dense classifier even if one-hot encoding would
        # make it sparse (tradeoff: if one-hot encoding would make it sparse,
        # densifier and truncatedSVD would probably lead to a MemoryError;
        # like this we can't use some of the preprocessing methods in case
        # the data becomes sparse)
        self.configuration_space, configspace_path = _create_search_space(
            self._tmp_dir,
            datamanager.info,
            self._backend,
            self._stopwatch,
            self._logger,
            self._include_estimators,
            self._include_preprocessors)
        self.configuration_space_created_hook(datamanager)

        # == Calculate metafeatures
        meta_features = _calculate_metafeatures(
            data_feat_type=datamanager.feat_type,
            data_info_task=datamanager.info['task'],
            x_train=datamanager.data['X_train'],
            y_train=datamanager.data['Y_train'],
            basename=self._dataset_name,
            watcher=self._stopwatch,
            metalearning_cnt=self._initial_configurations_via_metalearning,
            logger=self._logger)

        self._stopwatch.start_task('OneHot')
        datamanager.perform1HotEncoding()
        self._stopwatch.stop_task('OneHot')

        if meta_features is None:
            initial_configurations = []
        elif datamanager.info['task'] in [MULTICLASS_CLASSIFICATION,
                                          BINARY_CLASSIFICATION,
                                          MULTILABEL_CLASSIFICATION]:
            meta_features_encoded = _calculate_metafeatures_encoded(
                self._dataset_name,
                datamanager.data['X_train'],
                datamanager.data['Y_train'],
                self._stopwatch,
                self._logger)

            self._logger.debug(meta_features.__repr__(verbosity=2))
            self._logger.debug(meta_features_encoded.__repr__(verbosity=2))

            initial_configurations = _get_initial_configuration(
                meta_features,
                meta_features_encoded,
                self._dataset_name,
                self._metric,
                self.configuration_space,
                self._task,
                self._metadata_directory,
                self._initial_configurations_via_metalearning,
                datamanager.info['is_sparse'],
                self._stopwatch,
                self._logger)

            _print_debug_info_of_init_configuration(
                initial_configurations,
                self._dataset_name,
                self._time_for_task,
                self._logger,
                self._stopwatch)
        else:
            initial_configurations = []
            self._logger.warning('Metafeatures encoded not calculated')

        # == RUN SMAC
        if (datamanager.info["task"] == BINARY_CLASSIFICATION) or \
                (datamanager.info["task"] == MULTICLASS_CLASSIFICATION):
            config = {'balancing:strategy': 'weighting',
                      'classifier:__choice__': 'sgd',
                      'classifier:sgd:loss': 'hinge',
                      'classifier:sgd:penalty': 'l2',
                      'classifier:sgd:alpha': 0.0001,
                      'classifier:sgd:fit_intercept': 'True',
                      'classifier:sgd:n_iter': 5,
                      'classifier:sgd:learning_rate': 'optimal',
                      'classifier:sgd:eta0': 0.01,
                      'classifier:sgd:average': 'True',
                      'imputation:strategy': 'mean',
                      'one_hot_encoding:use_minimum_fraction': 'True',
                      'one_hot_encoding:minimum_fraction': 0.1,
                      'preprocessor:__choice__': 'no_preprocessing',
                      'rescaling:__choice__': 'min/max'}
        elif datamanager.info["task"] == MULTILABEL_CLASSIFICATION:
            config = {'classifier:__choice__': 'adaboost',
                      'classifier:adaboost:algorithm': 'SAMME.R',
                      'classifier:adaboost:learning_rate': 1.0,
                      'classifier:adaboost:max_depth': 1,
                      'classifier:adaboost:n_estimators': 50,
                      'balancing:strategy': 'weighting',
                      'imputation:strategy': 'mean',
                      'one_hot_encoding:use_minimum_fraction': 'True',
                      'one_hot_encoding:minimum_fraction': 0.1,
                      'preprocessor:__choice__': 'no_preprocessing',
                      'rescaling:__choice__': 'none'}
        else:
            config = None
            self._logger.info("Tasktype unknown: %s" %
                              TASK_TYPES_TO_STRING[datamanager.info["task"]])

        if config is not None:
            configuration = Configuration(self.configuration_space, config)
            config_string = convert_conf2smac_string(configuration)
            initial_configurations = [config_string] + initial_configurations

        # == RUN SMAC
        proc_smac = run_smac(
            tmp_dir=self._tmp_dir,
            basename=self._dataset_name,
            time_for_task=self._time_for_task,
            ml_memory_limit=self._ml_memory_limit,
            data_manager_path=data_manager_path,
            configspace_path=configspace_path,
            initial_configurations=initial_configurations,
            per_run_time_limit=self._per_run_time_limit,
            watcher=self._stopwatch,
            backend=self._backend,
            seed=self._seed,
            resampling_strategy=self._resampling_strategy,
            resampling_strategy_arguments=self._resampling_strategy_arguments,
            shared_mode=self._shared_mode)

        # == RUN ensemble builder
        proc_ensembles = self.run_ensemble_builder()

        procs = []
        if proc_smac is not None:
            procs.append(proc_smac)
        if proc_ensembles is not None:
            procs.append(proc_ensembles)

        if self._queue is not None:
            self._queue.put([time_for_load_data, data_manager_path, procs])
        else:
            for proc in procs:
                proc.wait()

        # Delete the auto-sklearn environment variable
        del_auto_seed()

        # In case the data manager is still around, delete it; it is
        # re-loaded from disk where needed
        try:
            del self._datamanager
        except Exception:
            pass

        if self._queue is None:
            self._load_models()

        return self

    def run_ensemble_builder(self,
                             time_left_for_ensembles=None,
                             max_iterations=None,
                             ensemble_size=None):
        if self._ensemble_size > 0 or ensemble_size is not None:
            task_name = 'runEnsemble'
            self._stopwatch.start_task(task_name)

            if time_left_for_ensembles is None:
                time_left_for_ensembles = max(
                    0, self._time_for_task -
                    self._stopwatch.wall_elapsed(self._dataset_name))
            if max_iterations is None:
                max_iterations = -1
            if ensemble_size is None:
                ensemble_size = self._ensemble_size

            # It can happen that run_ensemble_builder is called without
            # calling fit.
            if self._logger:
                self._logger.info(
                    'Start Ensemble with %5.2fsec time left' %
                    time_left_for_ensembles)

            proc_ensembles = submit_process.run_ensemble_builder(
                tmp_dir=self._tmp_dir,
                dataset_name=self._dataset_name,
                task_type=self._task,
                metric=self._metric,
                limit=time_left_for_ensembles,
                output_dir=self._output_dir,
                ensemble_size=ensemble_size,
                ensemble_nbest=self._ensemble_nbest,
                seed=self._seed,
                shared_mode=self._shared_mode,
                max_iterations=max_iterations,
                precision=self.precision
            )
            self._stopwatch.stop_task(task_name)
            return proc_ensembles
        else:
            self._logger.info('Not starting ensemble script due to ensemble '
                              'size 0.')
            return None

    def predict(self, X):
        if self._keep_models is not True:
            raise ValueError(
                "Predict can only be called if 'keep_models==True'")
        if self._resampling_strategy not in ['holdout',
                                             'holdout-iterative-fit']:
            raise NotImplementedError(
                'Predict is currently only implemented for resampling '
                'strategy holdout.')

        if self.models_ is None or len(self.models_) == 0 or \
                len(self.ensemble_indices_) == 0:
            self._load_models()

        predictions = []
        for identifier in self.models_:
            if identifier not in self.ensemble_indices_:
                continue

            weight = self.ensemble_indices_[identifier]
            model = self.models_[identifier]

            X_ = X.copy()
            if self._task in REGRESSION_TASKS:
                prediction = model.predict(X_)
            else:
                prediction = model.predict_proba(X_)

            if len(prediction.shape) < 1 or len(X_.shape) < 1 or \
                    X_.shape[0] < 1 or prediction.shape[0] != X_.shape[0]:
                self._logger.warning("Prediction shape for model %s is %s "
                                     "while X_.shape is %s" %
                                     (model, str(prediction.shape),
                                      str(X_.shape)))

            predictions.append(prediction * weight)

        if len(predictions) == 0:
            raise ValueError('Something went wrong generating the '
                             'predictions. The ensemble should consist of '
                             'the following models: %s, the following '
                             'models were loaded: %s' %
                             (str(list(self.ensemble_indices_.keys())),
                              str(list(self.models_.keys()))))

        predictions = np.sum(np.array(predictions), axis=0)
        return predictions

    def _load_models(self):
        if self._shared_mode:
            seed = -1
        else:
            seed = self._seed

        self.models_ = self._backend.load_all_models(seed)
        if len(self.models_) == 0:
            raise ValueError('No models fitted!')

        self.ensemble_indices_ = self._backend.load_ensemble_indices_weights(
            seed)

    def score(self, X, y):
        # fix: consider only index 1 of the second dimension
        # Don't know if the reshaping should be done here or in
        # calculate_score
        prediction = self.predict(X)
        if self._task == BINARY_CLASSIFICATION:
            prediction = prediction[:, 1].reshape((-1, 1))
        return calculate_score(y, prediction, self._task,
                               self._metric, self._label_num,
                               logger=self._logger)

    def show_models(self):
        if self.models_ is None or len(self.models_) == 0 or \
                len(self.ensemble_indices_) == 0:
            self._load_models()

        output = []
        sio = six.StringIO()
        for identifier in self.models_:
            if identifier not in self.ensemble_indices_:
                continue

            weight = self.ensemble_indices_[identifier]
            model = self.models_[identifier]
            output.append((weight, model))

        output.sort(reverse=True)

        sio.write("[")
        for weight, model in output:
            sio.write("(%f, %s),\n" % (weight, model))
        sio.write("]")

        return sio.getvalue()

    def _save_ensemble_data(self, X, y):
        """Split dataset and store data for the ensemble script.
        :param X:
        :param y:
        :return:
        """
        task_name = 'LoadData'
        self._start_task(self._stopwatch, task_name)
        _, _, _, y_ensemble = resampling.split_data(X, y)
        self._backend.save_targets_ensemble(y_ensemble)
        self._stop_task(self._stopwatch, task_name)

    def configuration_space_created_hook(self, datamanager):
        pass

    def get_params(self, deep=True):
        raise NotImplementedError('auto-sklearn does not implement '
                                  'get_params() because it is not intended '
                                  'to be optimized.')

    def set_params(self, deep=True):
        raise NotImplementedError('auto-sklearn does not implement '
                                  'set_params() because it is not intended '
                                  'to be optimized.')

    def __del__(self):
        self._delete_output_directories()

    def _delete_output_directories(self):
        if self.delete_output_folder_after_terminate:
            try:
                shutil.rmtree(self._output_dir)
            except Exception:
                if self._logger is not None:
                    self._logger.warning("Could not delete output dir: %s" %
                                         self._output_dir)
                else:
                    print("Could not delete output dir: %s" %
                          self._output_dir)

        if self.delete_tmp_folder_after_terminate:
            try:
                shutil.rmtree(self._tmp_dir)
            except Exception:
                if self._logger is not None:
                    self._logger.warning("Could not delete tmp dir: %s" %
                                         self._tmp_dir)
                else:
                    print("Could not delete tmp dir: %s" % self._tmp_dir)
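# A small numpy sketch of the weighted ensemble combination used in
# predict() above: each member's probability predictions are scaled by its
# ensemble weight and then summed. The two member predictions and the
# weights are made-up values.
import numpy as np

member_predictions = [np.array([[0.9, 0.1], [0.2, 0.8]]),
                      np.array([[0.6, 0.4], [0.4, 0.6]])]
weights = [0.75, 0.25]

weighted = [p * w for p, w in zip(member_predictions, weights)]
ensemble_prediction = np.sum(np.array(weighted), axis=0)
print(ensemble_prediction)  # [[0.825 0.175], [0.25 0.75]]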
def main(autosklearn_tmp_dir,
         basename,
         task_type,
         metric,
         limit,
         output_dir,
         ensemble_size=None,
         ensemble_nbest=None,
         seed=1,
         shared_mode=False,
         max_iterations=-1,
         precision="32"):
    watch = StopWatch()
    watch.start_task('ensemble_builder')

    used_time = 0
    time_iter = 0
    index_run = 0
    num_iteration = 0
    current_num_models = 0

    backend = Backend(output_dir, autosklearn_tmp_dir)
    dir_ensemble = os.path.join(autosklearn_tmp_dir, '.auto-sklearn',
                                'predictions_ensemble')
    dir_valid = os.path.join(autosklearn_tmp_dir, '.auto-sklearn',
                             'predictions_valid')
    dir_test = os.path.join(autosklearn_tmp_dir, '.auto-sklearn',
                            'predictions_test')
    paths_ = [dir_ensemble, dir_valid, dir_test]

    targets_ensemble = backend.load_targets_ensemble()

    dir_ensemble_list_mtimes = []

    while used_time < limit or (max_iterations > 0 and
                                max_iterations >= num_iteration):
        num_iteration += 1
        logger.debug('Time left: %f', limit - used_time)
        logger.debug('Time last iteration: %f', time_iter)

        # Load the predictions from the models
        exists = [os.path.isdir(dir_) for dir_ in paths_]
        if not exists[0]:  # all(exists):
            logger.debug('Prediction directory %s does not exist!' %
                         dir_ensemble)
            time.sleep(2)
            used_time = watch.wall_elapsed('ensemble_builder')
            continue

        if shared_mode is False:
            dir_ensemble_list = sorted(glob.glob(os.path.join(
                dir_ensemble, 'predictions_ensemble_%s_*.npy' % seed)))
            if exists[1]:
                dir_valid_list = sorted(glob.glob(os.path.join(
                    dir_valid, 'predictions_valid_%s_*.npy' % seed)))
            else:
                dir_valid_list = []
            if exists[2]:
                dir_test_list = sorted(glob.glob(os.path.join(
                    dir_test, 'predictions_test_%s_*.npy' % seed)))
            else:
                dir_test_list = []
        else:
            dir_ensemble_list = sorted(os.listdir(dir_ensemble))
            dir_valid_list = sorted(os.listdir(dir_valid)) \
                if exists[1] else []
            dir_test_list = sorted(os.listdir(dir_test)) \
                if exists[2] else []

        # Check the modification times because predictions can be updated
        # over time!
        old_dir_ensemble_list_mtimes = dir_ensemble_list_mtimes
        dir_ensemble_list_mtimes = []

        for dir_ensemble_file in dir_ensemble_list:
            dir_ensemble_file = os.path.join(dir_ensemble, dir_ensemble_file)
            mtime = os.path.getmtime(dir_ensemble_file)
            dir_ensemble_list_mtimes.append(mtime)

        if len(dir_ensemble_list) == 0:
            logger.debug('Directories are empty')
            time.sleep(2)
            used_time = watch.wall_elapsed('ensemble_builder')
            continue

        if len(dir_ensemble_list) <= current_num_models and \
                old_dir_ensemble_list_mtimes == dir_ensemble_list_mtimes:
            logger.debug('Nothing has changed since the last time')
            time.sleep(2)
            used_time = watch.wall_elapsed('ensemble_builder')
            continue

        watch.start_task('ensemble_iter_' + str(index_run))

        # List of num_runs (which are in the filename) which will be
        # included later
        include_num_runs = []
        backup_num_runs = []
        model_and_automl_re = re.compile(r'_([0-9]*)_([0-9]*)\.npy$')
        if ensemble_nbest is not None:
            # Keeps track of the single scores of each model in our ensemble
            scores_nbest = []
            # The indices of the models that are currently in our ensemble
            indices_nbest = []
            # The names of the models
            model_names = []

        model_names_to_scores = dict()

        model_idx = 0
        for model_name in dir_ensemble_list:
            # Use == for string comparison; the original used 'is', which
            # compares object identity and only works by accident of string
            # interning
            if precision == "16":
                predictions = np.load(os.path.join(
                    dir_ensemble, model_name)).astype(dtype=np.float16)
            elif precision == "32":
                predictions = np.load(os.path.join(
                    dir_ensemble, model_name)).astype(dtype=np.float32)
            elif precision == "64":
                predictions = np.load(os.path.join(
                    dir_ensemble, model_name)).astype(dtype=np.float64)
            else:
                predictions = np.load(os.path.join(dir_ensemble, model_name))

            score = calculate_score(targets_ensemble, predictions,
                                    task_type, metric,
                                    predictions.shape[1])
            model_names_to_scores[model_name] = score
            match = model_and_automl_re.search(model_name)
            automl_seed = int(match.group(1))
            num_run = int(match.group(2))

            if ensemble_nbest is not None:
                if score <= 0.001:
                    logger.error('Model only predicts at random: ' +
                                 model_name + ' has score: ' + str(score))
                    # Append the (seed, run) tuple here; the original
                    # appended the bare num_run, which never matched the
                    # tuple membership test below
                    backup_num_runs.append((automl_seed, num_run))
                # If we have fewer models in our ensemble than
                # ensemble_nbest, add the current model if it is better
                # than random
                elif len(scores_nbest) < ensemble_nbest:
                    scores_nbest.append(score)
                    indices_nbest.append(model_idx)
                    include_num_runs.append((automl_seed, num_run))
                    model_names.append(model_name)
                else:
                    # Take the worst performing model in our ensemble so far
                    idx = np.argmin(np.array([scores_nbest]))

                    # If the current model is better than the worst model in
                    # our ensemble, replace it with the current model
                    if scores_nbest[idx] < score:
                        logger.debug('Worst model in our ensemble: %s with '
                                     'score %f will be replaced by model %s '
                                     'with score %f',
                                     model_names[idx], scores_nbest[idx],
                                     model_name, score)
                        # Exclude the old model
                        del scores_nbest[idx]
                        scores_nbest.append(score)
                        del include_num_runs[idx]
                        del indices_nbest[idx]
                        indices_nbest.append(model_idx)
                        include_num_runs.append((automl_seed, num_run))
                        del model_names[idx]
                        model_names.append(model_name)
                    # Otherwise exclude the current model from the ensemble
                    else:
                        pass
            else:
                # Load all predictions that are better than random
                if score <= 0.001:
                    logger.error('Model only predicts at random: ' +
                                 model_name + ' has score: ' + str(score))
                    backup_num_runs.append((automl_seed, num_run))
                else:
                    include_num_runs.append((automl_seed, num_run))

            model_idx += 1

        # If there is no model better than random guessing, we have to use
        # all models which do random guessing
        if len(include_num_runs) == 0:
            include_num_runs = backup_num_runs

        indices_to_model_names = dict()
        indices_to_run_num = dict()
        for i, model_name in enumerate(dir_ensemble_list):
            match = model_and_automl_re.search(model_name)
            automl_seed = int(match.group(1))
            num_run = int(match.group(2))
            if (automl_seed, num_run) in include_num_runs:
                num_indices = len(indices_to_model_names)
                indices_to_model_names[num_indices] = model_name
                indices_to_run_num[num_indices] = (automl_seed, num_run)

        include_num_runs = set(include_num_runs)

        all_predictions_train = get_predictions(dir_ensemble,
                                                dir_ensemble_list,
                                                include_num_runs,
                                                model_and_automl_re,
                                                precision)

        if len(include_num_runs) == 0:
            logger.error('All models do just random guessing')
            time.sleep(2)
            continue
        else:
            try:
                indices, trajectory = ensemble_selection(
                    np.array(all_predictions_train), targets_ensemble,
                    ensemble_size, task_type, metric)

                logger.info('Trajectory and indices!')
                logger.info(trajectory)
                logger.info(indices)
            except ValueError as e:
                logger.error('Caught ValueError: ' + str(e))
                used_time = watch.wall_elapsed('ensemble_builder')
                time.sleep(2)
                continue
            except Exception as e:
                # str(e) instead of e.message, which is Python 2 only
                logger.error('Caught error! %s', str(e))
                used_time = watch.wall_elapsed('ensemble_builder')
                time.sleep(2)
                continue

            # Output the score
            logger.info('Training performance: %f' % trajectory[-1])

            # Print the ensemble members:
            ensemble_members_run_numbers = dict()
            ensemble_members = Counter(indices).most_common()
            ensemble_members_string = 'Ensemble members:\n'
            logger.info(ensemble_members)
            for ensemble_member in ensemble_members:
                weight = float(ensemble_member[1]) / len(indices)
                ensemble_members_string += \
                    ('    %s; weight: %10f; performance: %10f\n' %
                     (indices_to_model_names[ensemble_member[0]],
                      weight,
                      model_names_to_scores[
                          indices_to_model_names[ensemble_member[0]]]))
                ensemble_members_run_numbers[
                    indices_to_run_num[ensemble_member[0]]] = weight
            logger.info(ensemble_members_string)

        # Save the ensemble indices for later use!
        backend.save_ensemble_indices_weights(ensemble_members_run_numbers,
                                              index_run, seed)

        all_predictions_valid = get_predictions(dir_valid,
                                                dir_valid_list,
                                                include_num_runs,
                                                model_and_automl_re,
                                                precision)

        # Save predictions for the valid and test data sets
        if len(dir_valid_list) == len(dir_ensemble_list):
            all_predictions_valid = np.array(all_predictions_valid)
            ensemble_predictions_valid = np.mean(
                all_predictions_valid[indices.astype(int)], axis=0)
            backend.save_predictions_as_txt(ensemble_predictions_valid,
                                            'valid', index_run,
                                            prefix=basename)
        else:
            logger.info('Could not find as many validation set predictions '
                        '(%d) as ensemble predictions (%d)!',
                        len(dir_valid_list), len(dir_ensemble_list))

        del all_predictions_valid

        all_predictions_test = get_predictions(dir_test,
                                               dir_test_list,
                                               include_num_runs,
                                               model_and_automl_re,
                                               precision)

        if len(dir_test_list) == len(dir_ensemble_list):
            all_predictions_test = np.array(all_predictions_test)
            ensemble_predictions_test = np.mean(
                all_predictions_test[indices.astype(int)], axis=0)
            backend.save_predictions_as_txt(ensemble_predictions_test,
                                            'test', index_run,
                                            prefix=basename)
        else:
            logger.info('Could not find as many test set predictions (%d) '
                        'as ensemble predictions (%d)!',
                        len(dir_test_list), len(dir_ensemble_list))

        del all_predictions_test

        current_num_models = len(dir_ensemble_list)
        watch.stop_task('ensemble_iter_' + str(index_run))
        time_iter = watch.get_wall_dur('ensemble_iter_' + str(index_run))
        used_time = watch.wall_elapsed('ensemble_builder')
        index_run += 1
    return
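# A small numpy sketch of the combination rule above: ensemble_selection
# returns model indices *with repetition*, so averaging the stacked
# predictions over those indices weights each model by how often it was
# selected. The three stacked prediction arrays and the indices below are
# made-up values.
import numpy as np

all_predictions = np.array([[[0.9, 0.1]],    # model 0
                            [[0.5, 0.5]],    # model 1
                            [[0.2, 0.8]]])   # model 2
indices = np.array([0, 0, 2])  # model 0 selected twice -> weight 2/3

ensemble_prediction = np.mean(all_predictions[indices.astype(int)], axis=0)
print(ensemble_prediction)  # [[0.66666667 0.33333333]]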
class AutoML(BaseEstimator, multiprocessing.Process):

    def __init__(self,
                 tmp_dir,
                 output_dir,
                 time_left_for_this_task,
                 per_run_time_limit,
                 log_dir=None,
                 initial_configurations_via_metalearning=25,
                 ensemble_size=1,
                 ensemble_nbest=1,
                 seed=1,
                 ml_memory_limit=3000,
                 metadata_directory=None,
                 queue=None,
                 keep_models=True,
                 debug_mode=False,
                 include_estimators=None,
                 include_preprocessors=None,
                 resampling_strategy='holdout-iterative-fit',
                 resampling_strategy_arguments=None,
                 delete_tmp_folder_after_terminate=False,
                 delete_output_folder_after_terminate=False,
                 shared_mode=False,
                 precision=32):
        super(AutoML, self).__init__()

        self._tmp_dir = tmp_dir
        self._output_dir = output_dir
        self._time_for_task = time_left_for_this_task
        self._per_run_time_limit = per_run_time_limit
        self._log_dir = log_dir if log_dir is not None else self._tmp_dir
        self._initial_configurations_via_metalearning = \
            initial_configurations_via_metalearning
        self._ensemble_size = ensemble_size
        self._ensemble_nbest = ensemble_nbest
        self._seed = seed
        self._ml_memory_limit = ml_memory_limit
        self._metadata_directory = metadata_directory
        self._queue = queue
        self._keep_models = keep_models
        self._include_estimators = include_estimators
        self._include_preprocessors = include_preprocessors
        self._resampling_strategy = resampling_strategy
        self._resampling_strategy_arguments = resampling_strategy_arguments
        self.delete_tmp_folder_after_terminate = \
            delete_tmp_folder_after_terminate
        self.delete_output_folder_after_terminate = \
            delete_output_folder_after_terminate
        self._shared_mode = shared_mode
        self.precision = precision

        self._datamanager = None
        self._dataset_name = None
        self._stopwatch = StopWatch()
        self._logger = None
        self._task = None
        self._metric = None
        self._label_num = None
        self.models_ = None
        self.ensemble_ = None
        self._can_predict = False
        self._debug_mode = debug_mode
        self._backend = Backend(self._output_dir, self._tmp_dir)

    def start_automl(self, parser):
        self._backend.save_start_time(self._seed)
        self._stopwatch = StopWatch()
        datamanager = get_data_manager(namespace=parser)
        self._stopwatch.start_task(datamanager.name)

        logger_name = 'AutoML(%d):%s' % (self._seed, datamanager.name)
        setup_logger(os.path.join(self._tmp_dir,
                                  '%s.log' % str(logger_name)))
        self._logger = get_logger(logger_name)

        self._datamanager = datamanager
        self._dataset_name = datamanager.name
        self.start()

    def start(self):
        if self._datamanager is None:
            raise ValueError('You must invoke start() only via '
                             'start_automl()')
        super(AutoML, self).start()

    def run(self):
        if self._datamanager is None:
            raise ValueError('You must invoke run() only via start_automl()')
        self._fit(self._datamanager)

    def fit(self, X, y,
            task=MULTICLASS_CLASSIFICATION,
            metric='acc_metric',
            feat_type=None,
            dataset_name=None):
        if dataset_name is None:
            m = hashlib.md5()
            m.update(X.data)
            dataset_name = m.hexdigest()

        self._backend.save_start_time(self._seed)
        self._stopwatch = StopWatch()
        self._dataset_name = dataset_name
        self._stopwatch.start_task(self._dataset_name)

        logger_name = 'AutoML(%d):%s' % (self._seed, dataset_name)
        setup_logger(os.path.join(self._tmp_dir,
                                  '%s.log' % str(logger_name)))
        self._logger = get_logger(logger_name)

        if isinstance(metric, str):
            metric = STRING_TO_METRIC[metric]

        if feat_type is not None and len(feat_type) != X.shape[1]:
            raise ValueError('Array feat_type does not have same number of '
                             'variables as X has features. %d vs %d.'
% (len(feat_type), X.shape[1])) if feat_type is not None and not all([isinstance(f, str) for f in feat_type]): raise ValueError('Array feat_type must only contain strings.') if feat_type is not None: for ft in feat_type: if ft.lower() not in ['categorical', 'numerical']: raise ValueError('Only `Categorical` and `Numerical` are ' 'valid feature types, you passed `%s`' % ft) loaded_data_manager = XYDataManager(X, y, task=task, metric=metric, feat_type=feat_type, dataset_name=dataset_name, encode_labels=False) return self._fit(loaded_data_manager) def fit_automl_dataset(self, dataset): self._stopwatch = StopWatch() self._backend.save_start_time(self._seed) name = os.path.basename(dataset) self._stopwatch.start_task(name) self._start_task(self._stopwatch, name) self._dataset_name = name logger_name = 'AutoML(%d):%s' % (self._seed, name) setup_logger(os.path.join(self._tmp_dir, '%s.log' % str(logger_name))) self._logger = get_logger(logger_name) self._logger.debug('======== Reading and converting data ==========') # Encoding the labels will be done after the metafeature calculation! loaded_data_manager = CompetitionDataManager( dataset, encode_labels=False, max_memory_in_mb=float(self._ml_memory_limit) / 3) loaded_data_manager_str = str(loaded_data_manager).split('\n') for part in loaded_data_manager_str: self._logger.debug(part) return self._fit(loaded_data_manager) @staticmethod def _start_task(watcher, task_name): watcher.start_task(task_name) @staticmethod def _stop_task(watcher, task_name): watcher.stop_task(task_name) @staticmethod def _print_load_time(basename, time_left_for_this_task, time_for_load_data, logger): time_left_after_reading = max( 0, time_left_for_this_task - time_for_load_data) logger.info('Remaining time after reading %s %5.2f sec' % (basename, time_left_after_reading)) return time_for_load_data def _do_dummy_prediction(self, datamanager): self._logger.info("Starting to create dummy predictions.") autosklearn.cli.base_interface.main(datamanager, self._resampling_strategy, None, None, mode_args=self._resampling_strategy_arguments, output_dir=self._tmp_dir) self._logger.info("Finished creating dummy predictions.") def _fit(self, datamanager): # Reset learnt stuff self.models_ = None self.ensemble_ = None # Check arguments prior to doing anything! 
if self._resampling_strategy not in ['holdout', 'holdout-iterative-fit', 'cv', 'nested-cv', 'partial-cv']: raise ValueError('Illegal resampling strategy: %s' % self._resampling_strategy) if self._resampling_strategy == 'partial-cv' and \ self._ensemble_size != 0: raise ValueError("Resampling strategy partial-cv cannot be used " "together with ensembles.") self._backend._make_internals_directory() if self._keep_models: try: os.mkdir(self._backend.get_model_dir()) except OSError: self._logger.warning("model directory already exists") if not self._shared_mode: raise self._metric = datamanager.info['metric'] self._task = datamanager.info['task'] self._label_num = datamanager.info['label_num'] set_auto_seed(self._seed) # == Pickle the data manager, here, because no more global # OneHotEncoding data_manager_path = self._backend.save_datamanager(datamanager) self._save_ensemble_data( datamanager.data['X_train'], datamanager.data['Y_train']) time_for_load_data = self._stopwatch.wall_elapsed(self._dataset_name) if self._debug_mode: self._print_load_time( self._dataset_name, self._time_for_task, time_for_load_data, self._logger) # == Perform dummy predictions if self._resampling_strategy in ['holdout', 'holdout-iterative-fit']: self._do_dummy_prediction(datamanager) # = Create a searchspace # Do this before One Hot Encoding to make sure that it creates a # search space for a dense classifier even if one hot encoding would # make it sparse (tradeoff; if one hot encoding would make it sparse, # densifier and truncatedSVD would probably lead to a MemoryError, # like this we can't use some of the preprocessing methods in case # the data became sparse) self.configuration_space, configspace_path = _create_search_space( self._tmp_dir, datamanager.info, self._backend, self._stopwatch, self._logger, self._include_estimators, self._include_preprocessors) self.configuration_space_created_hook(datamanager) # == RUN ensemble builder # Do this before calculating the meta-features to make sure that the # dummy predictions are actually included in the ensemble even if # calculating the meta-features takes very long proc_ensembles = self.run_ensemble_builder() # == Calculate metafeatures meta_features = _calculate_metafeatures( data_feat_type=datamanager.feat_type, data_info_task=datamanager.info['task'], x_train=datamanager.data['X_train'], y_train=datamanager.data['Y_train'], basename=self._dataset_name, watcher=self._stopwatch, metalearning_cnt=self._initial_configurations_via_metalearning, logger=self._logger) self._stopwatch.start_task('OneHot') datamanager.perform1HotEncoding() self._stopwatch.stop_task('OneHot') if meta_features is None: initial_configurations = [] elif datamanager.info['task'] in [MULTICLASS_CLASSIFICATION, BINARY_CLASSIFICATION, MULTILABEL_CLASSIFICATION]: meta_features_encoded = _calculate_metafeatures_encoded( self._dataset_name, datamanager.data['X_train'], datamanager.data['Y_train'], self._stopwatch, self._logger) self._logger.debug(meta_features.__repr__(verbosity=2)) self._logger.debug(meta_features_encoded.__repr__(verbosity=2)) initial_configurations = _get_initial_configuration( meta_features, meta_features_encoded, self._dataset_name, self._metric, self.configuration_space, self._task, self._metadata_directory, self._initial_configurations_via_metalearning, datamanager.info[ 'is_sparse'], self._stopwatch, self._logger) _print_debug_info_of_init_configuration( initial_configurations, self._dataset_name, self._time_for_task, self._logger, self._stopwatch) else: 
initial_configurations = [] self._logger.warning('Metafeatures encoded not calculated') # == RUN SMAC if (datamanager.info["task"] == BINARY_CLASSIFICATION) or \ (datamanager.info["task"] == MULTICLASS_CLASSIFICATION): config = {'balancing:strategy': 'weighting', 'classifier:__choice__': 'sgd', 'classifier:sgd:loss': 'hinge', 'classifier:sgd:penalty': 'l2', 'classifier:sgd:alpha': 0.0001, 'classifier:sgd:fit_intercept': 'True', 'classifier:sgd:n_iter': 5, 'classifier:sgd:learning_rate': 'optimal', 'classifier:sgd:eta0': 0.01, 'classifier:sgd:average': 'True', 'imputation:strategy': 'mean', 'one_hot_encoding:use_minimum_fraction': 'True', 'one_hot_encoding:minimum_fraction': 0.1, 'preprocessor:__choice__': 'no_preprocessing', 'rescaling:__choice__': 'min/max'} elif datamanager.info["task"] == MULTILABEL_CLASSIFICATION: config = {'classifier:__choice__': 'adaboost', 'classifier:adaboost:algorithm': 'SAMME.R', 'classifier:adaboost:learning_rate': 1.0, 'classifier:adaboost:max_depth': 1, 'classifier:adaboost:n_estimators': 50, 'balancing:strategy': 'weighting', 'imputation:strategy': 'mean', 'one_hot_encoding:use_minimum_fraction': 'True', 'one_hot_encoding:minimum_fraction': 0.1, 'preprocessor:__choice__': 'no_preprocessing', 'rescaling:__choice__': 'none'} else: config = None self._logger.info("Tasktype unknown: %s" % TASK_TYPES_TO_STRING[datamanager.info["task"]]) if config is not None: try: configuration = Configuration(self.configuration_space, config) config_string = convert_conf2smac_string(configuration) initial_configurations = [config_string] + initial_configurations except ValueError: pass # == RUN SMAC proc_smac = run_smac(tmp_dir=self._tmp_dir, basename=self._dataset_name, time_for_task=self._time_for_task, ml_memory_limit=self._ml_memory_limit, data_manager_path=data_manager_path, configspace_path=configspace_path, initial_configurations=initial_configurations, per_run_time_limit=self._per_run_time_limit, watcher=self._stopwatch, backend=self._backend, seed=self._seed, resampling_strategy=self._resampling_strategy, resampling_strategy_arguments=self._resampling_strategy_arguments, shared_mode=self._shared_mode) procs = [] if proc_smac is not None: procs.append(proc_smac) if proc_ensembles is not None: procs.append(proc_ensembles) if self._queue is not None: self._queue.put([time_for_load_data, data_manager_path, procs]) else: for proc in procs: proc.wait() # Delete AutoSklearn environment variable del_auto_seed() # In case try: del self._datamanager except Exception: pass if self._queue is None: self._load_models() return self def run_ensemble_builder(self, time_left_for_ensembles=None, max_iterations=None, ensemble_size=None): if self._ensemble_size > 0 or ensemble_size is not None: task_name = 'runEnsemble' self._stopwatch.start_task(task_name) if time_left_for_ensembles is None: time_left_for_ensembles = max(0, self._time_for_task - self._stopwatch.wall_elapsed( self._dataset_name)) if max_iterations is None: max_iterations = -1 if ensemble_size is None: ensemble_size = self._ensemble_size # It can happen that run_ensemble_builder is called without # calling fit. 
if self._logger: self._logger.info( 'Start Ensemble with %5.2fsec time left' % time_left_for_ensembles) proc_ensembles = submit_process.run_ensemble_builder( tmp_dir=self._tmp_dir, dataset_name=self._dataset_name, task_type=self._task, metric=self._metric, limit=time_left_for_ensembles, output_dir=self._output_dir, ensemble_size=ensemble_size, ensemble_nbest=self._ensemble_nbest, seed=self._seed, shared_mode=self._shared_mode, max_iterations=max_iterations, precision=self.precision ) self._stopwatch.stop_task(task_name) return proc_ensembles else: self._logger.info('Not starting ensemble script due to ensemble ' 'size 0.') return None def refit(self, X, y): if self._keep_models is not True: raise ValueError( "Predict can only be called if 'keep_models==True'") if self.models_ is None or len(self.models_) == 0 or \ self.ensemble_ is None: self._load_models() for identifier in self.models_: if identifier in self.ensemble_.get_model_identifiers(): model = self.models_[identifier] # this updates the model inplace, it can then later be used in # predict method model.fit(X.copy(), y.copy()) self._can_predict = True def predict(self, X): return np.argmax(self.predict_proba(X), axis=1) def predict_proba(self, X): if self._keep_models is not True: raise ValueError( "Predict can only be called if 'keep_models==True'") if not self._can_predict and \ self._resampling_strategy not in \ ['holdout', 'holdout-iterative-fit']: raise NotImplementedError( 'Predict is currently only implemented for resampling ' 'strategy holdout.') if self.models_ is None or len(self.models_) == 0 or \ self.ensemble_ is None: self._load_models() all_predictions = [] for identifier in self.ensemble_.get_model_identifiers(): model = self.models_[identifier] X_ = X.copy() if self._task in REGRESSION_TASKS: prediction = model.predict(X_) else: prediction = model.predict_proba(X_) if len(prediction.shape) < 1 or len(X_.shape) < 1 or \ X_.shape[0] < 1 or prediction.shape[0] != X_.shape[0]: self._logger.warning("Prediction shape for model %s is %s " "while X_.shape is %s" % (model, str(prediction.shape), str(X_.shape))) all_predictions.append(prediction) if len(all_predictions) == 0: raise ValueError('Something went wrong generating the predictions. ' 'The ensemble should consist of the following ' 'models: %s, the following models were loaded: ' '%s' % (str(list(self.ensemble_indices_.keys())), str(list(self.models_.keys())))) predictions = self.ensemble_.predict(all_predictions) return predictions def _load_models(self): if self._shared_mode: seed = -1 else: seed = self._seed self.ensemble_ = self._backend.load_ensemble(seed) if self.ensemble_: identifiers = self.ensemble_.identifiers_ self.models_ = self._backend.load_models_by_identifiers(identifiers) else: self.models_ = self._backend.load_all_models(seed) if len(self.models_) == 0: raise ValueError('No models fitted!') def score(self, X, y): # fix: Consider only index 1 of second dimension # Don't know if the reshaping should be done there or in calculate_score prediction = self.predict_proba(X) return calculate_score(y, prediction, self._task, self._metric, self._label_num, logger=self._logger) def show_models(self): if self.models_ is None or len(self.models_) == 0 or \ self.ensemble_ is None: self._load_models() return self.ensemble_.pprint_ensemble_string(self.models_) def _save_ensemble_data(self, X, y): """Split dataset and store Data for the ensemble script. 
        :param X: training data from which the ensemble targets are split
        :param y: training targets; the holdout part is stored for the
            ensemble script
        :return: None
        """
        task_name = 'LoadData'
        self._start_task(self._stopwatch, task_name)

        _, _, _, y_ensemble = resampling.split_data(X, y)
        self._backend.save_targets_ensemble(y_ensemble)

        self._stop_task(self._stopwatch, task_name)

    def configuration_space_created_hook(self, datamanager):
        pass

    def get_params(self, deep=True):
        raise NotImplementedError('auto-sklearn does not implement '
                                  'get_params() because it is not intended to '
                                  'be optimized.')

    def set_params(self, deep=True):
        raise NotImplementedError('auto-sklearn does not implement '
                                  'set_params() because it is not intended to '
                                  'be optimized.')

    def __del__(self):
        self._delete_output_directories()

    def _delete_output_directories(self):
        if self.delete_output_folder_after_terminate:
            try:
                shutil.rmtree(self._output_dir)
            except Exception:
                if self._logger is not None:
                    self._logger.warning("Could not delete output dir: %s"
                                         % self._output_dir)
                else:
                    print("Could not delete output dir: %s"
                          % self._output_dir)

        if self.delete_tmp_folder_after_terminate:
            try:
                shutil.rmtree(self._tmp_dir)
            except Exception:
                if self._logger is not None:
                    self._logger.warning("Could not delete tmp dir: %s"
                                         % self._tmp_dir)
                else:
                    print("Could not delete tmp dir: %s" % self._tmp_dir)
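# ---------------------------------------------------------------------------
# Illustrative sketch (assumed names, not the module's API) of the tear-down
# pattern used in _delete_output_directories() above: directory removal is
# best-effort and failures are only logged, because __del__ may run late in
# interpreter shutdown when raising would be unhelpful.
import shutil

def best_effort_rmtree(path, logger=None):
    """Remove `path` recursively; log instead of raising on failure."""
    try:
        shutil.rmtree(path)
    except Exception:
        message = "Could not delete dir: %s" % path
        if logger is not None:
            logger.warning(message)
        else:
            print(message)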
class AutoML(BaseEstimator): def __init__(self, backend, time_left_for_this_task, per_run_time_limit, log_dir=None, initial_configurations_via_metalearning=25, ensemble_size=1, ensemble_nbest=1, seed=1, ml_memory_limit=3000, metadata_directory=None, keep_models=True, debug_mode=False, include_estimators=None, include_preprocessors=None, resampling_strategy='holdout-iterative-fit', resampling_strategy_arguments=None, delete_tmp_folder_after_terminate=False, delete_output_folder_after_terminate=False, shared_mode=False, precision=32, max_iter_smac=None, acquisition_function='EI'): super(AutoML, self).__init__() self._backend = backend #self._tmp_dir = tmp_dir #self._output_dir = output_dir self._time_for_task = time_left_for_this_task self._per_run_time_limit = per_run_time_limit #self._log_dir = log_dir if log_dir is not None else self._tmp_dir self._initial_configurations_via_metalearning = \ initial_configurations_via_metalearning self._ensemble_size = ensemble_size self._ensemble_nbest = ensemble_nbest self._seed = seed self._ml_memory_limit = ml_memory_limit self._data_memory_limit = None self._metadata_directory = metadata_directory self._keep_models = keep_models self._include_estimators = include_estimators self._include_preprocessors = include_preprocessors self._resampling_strategy = resampling_strategy self._resampling_strategy_arguments = resampling_strategy_arguments \ if resampling_strategy_arguments is not None else {} self._max_iter_smac = max_iter_smac #self.delete_tmp_folder_after_terminate = \ # delete_tmp_folder_after_terminate #self.delete_output_folder_after_terminate = \ # delete_output_folder_after_terminate self._shared_mode = shared_mode self.precision = precision self.acquisition_function = acquisition_function self._datamanager = None self._dataset_name = None self._stopwatch = StopWatch() self._logger = None self._task = None self._metric = None self._label_num = None self._parser = None self.models_ = None self.ensemble_ = None self._can_predict = False self._debug_mode = debug_mode if not isinstance(self._time_for_task, int): raise ValueError("time_left_for_this_task not of type integer, " "but %s" % str(type(self._time_for_task))) if not isinstance(self._per_run_time_limit, int): raise ValueError("per_run_time_limit not of type integer, but %s" % str(type(self._per_run_time_limit))) # After assignging and checking variables... 
#self._backend = Backend(self._output_dir, self._tmp_dir) def start_automl(self, parser): self._parser = parser self.start() def start(self): if self._parser is None: raise ValueError('You must invoke start() only via start_automl()') super(AutoML, self).start() def run(self): if self._parser is None: raise ValueError('You must invoke run() only via start_automl()') self._backend.save_start_time(self._seed) self._stopwatch = StopWatch() datamanager = get_data_manager(namespace=self._parser) self._stopwatch.start_task(datamanager.name) self._logger = self._get_logger(datamanager.name) self._datamanager = datamanager self._dataset_name = datamanager.name self._fit(self._datamanager) def fit(self, X, y, task=MULTICLASS_CLASSIFICATION, metric='acc_metric', feat_type=None, dataset_name=None): if not self._shared_mode: self._backend.context.delete_directories() else: # If this fails, it's likely that this is the first call to get # the data manager try: D = self._backend.load_datamanager() dataset_name = D.name except IOError: pass self._backend.context.create_directories() if dataset_name is None: dataset_name = hash_numpy_array(X) self._backend.save_start_time(self._seed) self._stopwatch = StopWatch() self._dataset_name = dataset_name self._stopwatch.start_task(self._dataset_name) self._logger = self._get_logger(dataset_name) if isinstance(metric, str): metric = STRING_TO_METRIC[metric] if feat_type is not None and len(feat_type) != X.shape[1]: raise ValueError('Array feat_type does not have same number of ' 'variables as X has features. %d vs %d.' % (len(feat_type), X.shape[1])) if feat_type is not None and not all([isinstance(f, str) for f in feat_type]): raise ValueError('Array feat_type must only contain strings.') if feat_type is not None: for ft in feat_type: if ft.lower() not in ['categorical', 'numerical']: raise ValueError('Only `Categorical` and `Numerical` are ' 'valid feature types, you passed `%s`' % ft) self._data_memory_limit = None loaded_data_manager = XYDataManager(X, y, task=task, metric=metric, feat_type=feat_type, dataset_name=dataset_name, encode_labels=False) return self._fit(loaded_data_manager) def fit_automl_dataset(self, dataset): self._stopwatch = StopWatch() self._backend.save_start_time(self._seed) name = os.path.basename(dataset) self._stopwatch.start_task(name) self._start_task(self._stopwatch, name) self._dataset_name = name self._logger = self._get_logger(name) self._logger.debug('======== Reading and converting data ==========') # Encoding the labels will be done after the metafeature calculation! 
self._data_memory_limit = float(self._ml_memory_limit) / 3 loaded_data_manager = CompetitionDataManager( dataset, encode_labels=False, max_memory_in_mb=self._data_memory_limit) loaded_data_manager_str = str(loaded_data_manager).split('\n') for part in loaded_data_manager_str: self._logger.debug(part) return self._fit(loaded_data_manager) def _get_logger(self, name): logger_name = 'AutoML(%d):%s' % (self._seed, name) setup_logger(os.path.join(self._backend.temporary_directory, '%s.log' % str(logger_name))) return get_logger(logger_name) @staticmethod def _start_task(watcher, task_name): watcher.start_task(task_name) @staticmethod def _stop_task(watcher, task_name): watcher.stop_task(task_name) @staticmethod def _print_load_time(basename, time_left_for_this_task, time_for_load_data, logger): time_left_after_reading = max( 0, time_left_for_this_task - time_for_load_data) logger.info('Remaining time after reading %s %5.2f sec' % (basename, time_left_after_reading)) return time_for_load_data def _do_dummy_prediction(self, datamanager, num_run): self._logger.info("Starting to create dummy predictions.") # time_limit = int(self._time_for_task / 6.) memory_limit = int(self._ml_memory_limit) ta = ExecuteTaFuncWithQueue(backend=self._backend, autosklearn_seed=self._seed, resampling_strategy=self._resampling_strategy, initial_num_run=num_run, logger=self._logger, **self._resampling_strategy_arguments) status, cost, runtime, additional_info = \ ta.run(1, cutoff=self._time_for_task, memory_limit=memory_limit) if status == StatusType.SUCCESS: self._logger.info("Finished creating dummy predictions.") else: self._logger.error('Error creating dummy predictions:%s ', additional_info) #status, cost, runtime, additional_info = \ # ta.run(2, cutoff=time_limit, memory_limit=memory_limit) #if status == StatusType.SUCCESS: # self._logger.info("Finished creating dummy prediction 2/2.") #else: # self._logger.error('Error creating dummy prediction 2/2 %s', # additional_info) return ta.num_run def _fit(self, datamanager): # Reset learnt stuff self.models_ = None self.ensemble_ = None # Check arguments prior to doing anything! if self._resampling_strategy not in ['holdout', 'holdout-iterative-fit', 'cv', 'nested-cv', 'partial-cv']: raise ValueError('Illegal resampling strategy: %s' % self._resampling_strategy) if self._resampling_strategy == 'partial-cv' and \ self._ensemble_size != 0: raise ValueError("Resampling strategy partial-cv cannot be used " "together with ensembles.") acquisition_functions = ['EI', 'EIPS'] if self.acquisition_function not in acquisition_functions: raise ValueError('Illegal acquisition %s: Must be one of %s.' 
% (self.acquisition_function, acquisition_functions)) self._backend._make_internals_directory() if self._keep_models: try: os.mkdir(self._backend.get_model_dir()) except OSError: if not self._shared_mode: raise self._metric = datamanager.info['metric'] self._task = datamanager.info['task'] self._label_num = datamanager.info['label_num'] # == Pickle the data manager to speed up loading data_manager_path = self._backend.save_datamanager(datamanager) self._save_ensemble_data( datamanager.data['X_train'], datamanager.data['Y_train']) time_for_load_data = self._stopwatch.wall_elapsed(self._dataset_name) if self._debug_mode: self._print_load_time( self._dataset_name, self._time_for_task, time_for_load_data, self._logger) # == Perform dummy predictions num_run = 1 #if self._resampling_strategy in ['holdout', 'holdout-iterative-fit']: num_run = self._do_dummy_prediction(datamanager, num_run) # = Create a searchspace # Do this before One Hot Encoding to make sure that it creates a # search space for a dense classifier even if one hot encoding would # make it sparse (tradeoff; if one hot encoding would make it sparse, # densifier and truncatedSVD would probably lead to a MemoryError, # like this we can't use some of the preprocessing methods in case # the data became sparse) self.configuration_space, configspace_path = self._create_search_space( self._backend.temporary_directory, self._backend, datamanager, self._include_estimators, self._include_preprocessors) # == RUN ensemble builder # Do this before calculating the meta-features to make sure that the # dummy predictions are actually included in the ensemble even if # calculating the meta-features takes very long ensemble_task_name = 'runEnsemble' self._stopwatch.start_task(ensemble_task_name) time_left_for_ensembles = max(0,self._time_for_task \ - self._stopwatch.wall_elapsed(self._dataset_name)) if self._logger: self._logger.info( 'Start Ensemble with %5.2fsec time left' % time_left_for_ensembles) if time_left_for_ensembles <= 0: self._logger.warning("Not starting ensemble builder because there " "is no time left!") self._proc_ensemble = None else: self._proc_ensemble = self._get_ensemble_process(time_left_for_ensembles) if self._ensemble_size > 0: self._proc_ensemble.start() else: self._logger.info('Not starting ensemble builder because ' 'ensemble size is <= 0.') self._stopwatch.stop_task(ensemble_task_name) # kill the datamanager as it will be re-loaded anyways from sub processes try: del self._datamanager except Exception: pass # => RUN SMAC smac_task_name = 'runSMAC' self._stopwatch.start_task(smac_task_name) time_left_for_smac = max(0, self._time_for_task - self._stopwatch.wall_elapsed( self._dataset_name)) if self._logger: self._logger.info( 'Start SMAC with %5.2fsec time left' % time_left_for_smac) if time_left_for_smac <= 0: self._logger.warning("Not starting SMAC because there is no time " "left.") _proc_smac = None else: if self._per_run_time_limit is None or \ self._per_run_time_limit > time_left_for_smac: print('Time limit for a single run is higher than total time ' 'limit. 
Capping the limit for a single run to the total ' 'time given to SMAC (%f)' % time_left_for_smac) per_run_time_limit = time_left_for_smac else: per_run_time_limit = self._per_run_time_limit _proc_smac = AutoMLSMBO(config_space=self.configuration_space, dataset_name=self._dataset_name, backend=self._backend, total_walltime_limit=time_left_for_smac, func_eval_time_limit=per_run_time_limit, memory_limit=self._ml_memory_limit, data_memory_limit=self._data_memory_limit, watcher=self._stopwatch, start_num_run=num_run, num_metalearning_cfgs=self._initial_configurations_via_metalearning, config_file=configspace_path, smac_iters=self._max_iter_smac, seed=self._seed, metadata_directory=self._metadata_directory, resampling_strategy=self._resampling_strategy, resampling_strategy_args=self._resampling_strategy_arguments, acquisition_function=self.acquisition_function, shared_mode=self._shared_mode) self.runhistory_ = _proc_smac.run_smbo() self._proc_ensemble = None self._load_models() return self def refit(self, X, y): if self._keep_models is not True: raise ValueError( "Predict can only be called if 'keep_models==True'") if self.models_ is None or len(self.models_) == 0 or \ self.ensemble_ is None: self._load_models() random_state = np.random.RandomState(self._seed) for identifier in self.models_: if identifier in self.ensemble_.get_model_identifiers(): model = self.models_[identifier] # this updates the model inplace, it can then later be used in # predict method # try to fit the model. If it fails, shuffle the data. This # could alleviate the problem in algorithms that depend on # the ordering of the data. for i in range(10): try: model.fit(X.copy(), y.copy()) break except ValueError: indices = list(range(X.shape[0])) random_state.shuffle(indices) X = X[indices] y = y[indices] self._can_predict = True return self def predict(self, X): if self._keep_models is not True: raise ValueError( "Predict can only be called if 'keep_models==True'") if not self._can_predict and \ self._resampling_strategy not in \ ['holdout', 'holdout-iterative-fit']: raise NotImplementedError( 'Predict is currently only implemented for resampling ' 'strategy holdout.') if self.models_ is None or len(self.models_) == 0 or \ self.ensemble_ is None: self._load_models() all_predictions = [] for identifier in self.ensemble_.get_model_identifiers(): model = self.models_[identifier] X_ = X.copy() if self._task in REGRESSION_TASKS: prediction = model.predict(X_) else: prediction = model.predict_proba(X_) if len(prediction.shape) < 1 or len(X_.shape) < 1 or \ X_.shape[0] < 1 or prediction.shape[0] != X_.shape[0]: self._logger.warning("Prediction shape for model %s is %s " "while X_.shape is %s" % (model, str(prediction.shape), str(X_.shape))) all_predictions.append(prediction) if len(all_predictions) == 0: raise ValueError('Something went wrong generating the predictions. 
' 'The ensemble should consist of the following ' 'models: %s, the following models were loaded: ' '%s' % (str(list(self.ensemble_indices_.keys())), str(list(self.models_.keys())))) predictions = self.ensemble_.predict(all_predictions) return predictions def fit_ensemble(self, y, task=None, metric=None, precision='32', dataset_name=None, ensemble_nbest=None, ensemble_size=None): if self._logger is None: self._logger = self._get_logger(dataset_name) self._proc_ensemble = self._get_ensemble_process( 1, task, metric, precision, dataset_name, max_iterations=1, ensemble_nbest=ensemble_nbest, ensemble_size=ensemble_size) self._proc_ensemble.main() return self def _get_ensemble_process(self, time_left_for_ensembles, task=None, metric=None, precision=None, dataset_name=None, max_iterations=-1, ensemble_nbest=None, ensemble_size=None): if task is None: task = self._task else: self._task = task if metric is None: metric = self._metric else: self._metric = metric if precision is None: precision = self.precision else: self.precision = precision if dataset_name is None: dataset_name = self._dataset_name else: self._dataset_name = dataset_name if ensemble_nbest is None: ensemble_nbest = self._ensemble_nbest else: self._ensemble_nbest = ensemble_nbest if ensemble_size is None: ensemble_size = self._ensemble_size else: self._ensemble_size = ensemble_size return EnsembleBuilder(backend=self._backend, dataset_name=dataset_name, task_type=task, metric=metric, limit=time_left_for_ensembles, ensemble_size=ensemble_size, ensemble_nbest=ensemble_nbest, seed=self._seed, shared_mode=self._shared_mode, precision=precision, max_iterations=max_iterations) def _load_models(self): if self._shared_mode: seed = -1 else: seed = self._seed self.ensemble_ = self._backend.load_ensemble(seed) if self.ensemble_: identifiers = self.ensemble_.identifiers_ self.models_ = self._backend.load_models_by_identifiers(identifiers) else: self.models_ = self._backend.load_all_models(seed) if len(self.models_) == 0: raise ValueError('No models fitted!') def score(self, X, y): # fix: Consider only index 1 of second dimension # Don't know if the reshaping should be done there or in calculate_score prediction = self.predict(X) return calculate_score(y, prediction, self._task, self._metric, self._label_num, logger=self._logger) @property def grid_scores_(self): grid_scores = list() scores_per_config = defaultdict(list) config_list = list() for run_key in self.runhistory_.data: run_value = self.runhistory_.data[run_key] config_id = run_key.config_id cost = run_value.cost if config_id not in config_list: config_list.append(config_id) scores_per_config[config_id].append(cost) for config_id in config_list: scores = [1 - score for score in scores_per_config[config_id]] mean_score = np.mean(scores) config = self.runhistory_.ids_config[config_id] grid_score = _CVScoreTuple(config.get_dictionary(), mean_score, scores) grid_scores.append(grid_score) return grid_scores @property def cv_results_(self): results = dict() # Missing in contrast to scikit-learn # splitX_test_score - auto-sklearn does not store the scores on a split # basis # std_test_score - auto-sklearn does not store the scores on a split # basis # splitX_train_score - auto-sklearn does not compute train scores, add # flag to compute the train scores # mean_train_score - auto-sklearn does not store the train scores # std_train_score - auto-sklearn does not store the train scores # std_fit_time - auto-sklearn does not store the fit times per split # mean_score_time - auto-sklearn does not 
store the score time
        # std_score_time - auto-sklearn does not store the score time
        # TODO: add those arguments

        parameter_dictionaries = dict()
        masks = dict()
        hp_names = []

        # Set up dictionary for parameter values
        for hp in self.configuration_space.get_hyperparameters():
            name = hp.name
            parameter_dictionaries[name] = []
            masks[name] = []
            hp_names.append(name)

        mean_test_score = []
        mean_fit_time = []
        params = []
        status = []
        for run_key in self.runhistory_.data:
            run_value = self.runhistory_.data[run_key]
            config_id = run_key.config_id
            config = self.runhistory_.ids_config[config_id]

            param_dict = config.get_dictionary()
            params.append(param_dict)
            mean_test_score.append(1 - run_value.cost)
            mean_fit_time.append(run_value.time)

            s = run_value.status
            if s == 1:
                status.append('Success')
            elif s == 2:
                status.append('Timeout')
            elif s == 3:
                status.append('Crash')
            elif s == 4:
                status.append('Abort')
            elif s == 5:
                status.append('Memout')
            else:
                status.append('Unknown')

            for hp_name in hp_names:
                if hp_name in param_dict:
                    hp_value = param_dict[hp_name]
                    mask_value = False
                else:
                    hp_value = np.NaN
                    mask_value = True
                parameter_dictionaries[hp_name].append(hp_value)
                masks[hp_name].append(mask_value)

        results['mean_test_score'] = np.array(mean_test_score)
        results['mean_fit_time'] = np.array(mean_fit_time)
        results['params'] = params
        results['rank_test_scores'] = scipy.stats.rankdata(
            1 - results['mean_test_score'], method='min')
        results['status'] = status

        for hp_name in hp_names:
            masked_array = ma.MaskedArray(parameter_dictionaries[hp_name],
                                          masks[hp_name])
            results['param_%s' % hp_name] = masked_array

        return results

    def sprint_statistics(self):
        cv_results = self.cv_results_
        sio = io.StringIO()
        sio.write('auto-sklearn results:\n')
        sio.write('  Dataset name: %s\n' % self._dataset_name)
        sio.write('  Metric: %s\n' % METRIC_TO_STRING[self._metric])
        idx_best_run = np.argmax(cv_results['mean_test_score'])
        best_score = cv_results['mean_test_score'][idx_best_run]
        sio.write('  Best validation score: %f\n' % best_score)
        num_runs = len(cv_results['status'])
        sio.write('  Number of target algorithm runs: %d\n' % num_runs)
        num_success = sum([s == 'Success' for s in cv_results['status']])
        sio.write('  Number of successful target algorithm runs: %d\n'
                  % num_success)
        num_crash = sum([s == 'Crash' for s in cv_results['status']])
        sio.write('  Number of crashed target algorithm runs: %d\n'
                  % num_crash)
        num_timeout = sum([s == 'Timeout' for s in cv_results['status']])
        sio.write('  Number of target algorithms that exceeded the time '
                  'limit: %d\n' % num_timeout)
        num_memout = sum([s == 'Memout' for s in cv_results['status']])
        sio.write('  Number of target algorithms that exceeded the memory '
                  'limit: %d\n' % num_memout)
        return sio.getvalue()

    def show_models(self):
        if self.models_ is None or len(self.models_) == 0 or \
                self.ensemble_ is None:
            self._load_models()
        return self.ensemble_.pprint_ensemble_string(self.models_)

    def _save_ensemble_data(self, X, y):
        """Split dataset and store data for the ensemble script.
        :param X: training data from which the ensemble targets are split
        :param y: training targets; the holdout part is stored for the
            ensemble script
        :return: None
        """
        task_name = 'LoadData'
        self._start_task(self._stopwatch, task_name)

        _, _, _, y_ensemble = resampling.split_data(X, y)
        self._backend.save_targets_ensemble(y_ensemble)

        self._stop_task(self._stopwatch, task_name)

    def _create_search_space(self, tmp_dir, backend, datamanager,
                             include_estimators=None,
                             include_preprocessors=None):
        task_name = 'CreateConfigSpace'
        self._stopwatch.start_task(task_name)

        configspace_path = os.path.join(tmp_dir, 'space.pcs')
        configuration_space = pipeline.get_configuration_space(
            datamanager.info,
            include_estimators=include_estimators,
            include_preprocessors=include_preprocessors)
        configuration_space = self.configuration_space_created_hook(
            datamanager, configuration_space)
        sp_string = pcs.write(configuration_space)
        backend.write_txt_file(configspace_path, sp_string,
                               'Configuration space')

        self._stopwatch.stop_task(task_name)
        return configuration_space, configspace_path

    def configuration_space_created_hook(self, datamanager,
                                         configuration_space):
        return configuration_space
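# ---------------------------------------------------------------------------
# Standalone sketch of the serialization step in _create_search_space() above,
# assuming the ConfigSpace package is available: the space is rendered to the
# pcs text format and written into the temporary directory so that a separate
# SMAC process can re-read it. The hyperparameter here is purely illustrative.
import os
from ConfigSpace.configuration_space import ConfigurationSpace
from ConfigSpace.hyperparameters import UniformFloatHyperparameter
from ConfigSpace.read_and_write import pcs as pcs_io

def write_search_space(tmp_dir):
    """Write a toy configuration space to `tmp_dir`/space.pcs."""
    space = ConfigurationSpace()
    space.add_hyperparameter(
        UniformFloatHyperparameter('learning_rate', 1e-4, 1e-1))
    configspace_path = os.path.join(tmp_dir, 'space.pcs')
    with open(configspace_path, 'w') as fh:
        fh.write(pcs_io.write(space))
    return configspace_path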
class AutoML(multiprocessing.Process, BaseEstimator): def __init__(self, tmp_dir, output_dir, time_left_for_this_task, per_run_time_limit, log_dir=None, initial_configurations_via_metalearning=25, ensemble_size=1, ensemble_nbest=1, seed=1, ml_memory_limit=3000, metadata_directory=None, queue=None, keep_models=True, debug_mode=False, logger=None): super(AutoML, self).__init__() self._seed = seed self._tmp_dir = tmp_dir self._output_dir = output_dir self._model_dir = join(self._tmp_dir, 'models_%d' % self._seed) self._ensemble_indices_dir = join(self._tmp_dir, 'ensemble_indices_%d' % self._seed) self._create_folder(self._tmp_dir, to_log=False) self._time_for_task = time_left_for_this_task self._per_run_time_limit = per_run_time_limit self._log_dir = log_dir if log_dir is not None else self._tmp_dir self._initial_configurations_via_metalearning = initial_configurations_via_metalearning self._ensemble_size = ensemble_size self._ensemble_nbest = ensemble_nbest self._ml_memory_limit = ml_memory_limit self._metadata_directory = metadata_directory self._queue = queue self._keep_models = keep_models self._basename = None self._stopwatch = None self._logger = logger if logger is not None else get_automl_logger( self._log_dir, self._basename, self._seed) self._ohe = None self._task = None self._metric = None self._target_num = None self._debug_mode = debug_mode self._create_folders() def _create_folder(self, folder, to_log=True): if to_log: self._debug("CREATE folder: %s" % folder) else: print("CREATE folder: %s" % folder) if os.path.isdir(folder): if not self._debug_mode: raise OSError("Folder '%s' exists" % folder) else: os.mkdir(folder) def _create_folders(self): # == Set up a directory where all the trained models will be pickled to self._create_folder(self._output_dir) if self._log_dir != self._tmp_dir: self._create_folder(self._log_dir) self._create_folder(self._model_dir) self._create_folder(self._ensemble_indices_dir) def run(self): raise NotImplementedError() def fit(self, data_x, y, task=MULTICLASS_CLASSIFICATION, metric='acc_metric', feat_type=None, dataset_name=None): if dataset_name is None: m = hashlib.md5() m.update(data_x.data) dataset_name = m.hexdigest() self._basename = dataset_name self._stopwatch = StopWatch() self._stopwatch.start_task(self._basename) loaded_data_manager = XYDataManager(data_x, y, task=task, metric=metric, feat_type=feat_type, dataset_name=dataset_name, encode_labels=False) return self._fit(loaded_data_manager) def _debug(self, text): self._logger.debug(text) def _critical(self, text): self._logger.critical(text) def _error(self, text): self._logger.error(text) def _info(self, text): self._logger.info(text) def fit_automl_dataset(self, basename, input_dir): # == Creating a data object with data and information about it self._basename = basename self._stopwatch = StopWatch() self._stopwatch.start_task(self._basename) self._logger = get_automl_logger(self._log_dir, self._basename, self._seed) self._debug('======== Reading and converting data ==========') # Encoding the labels will be done after the metafeature calculation! 
loaded_data_manager = CompetitionDataManager(self._basename, input_dir, verbose=True, encode_labels=False) loaded_data_manager_str = str(loaded_data_manager).split('\n') for part in loaded_data_manager_str: self._debug(part) return self._fit(loaded_data_manager) def _save_data_manager(self, data_d, tmp_dir, basename, watcher): task_name = 'StoreDatamanager' watcher.start_task(task_name) filepath = os.path.join(tmp_dir, basename + '_Manager.pkl') if _check_path_for_save(filepath, 'Data manager ', self._debug): pickle.dump(data_d, open(filepath, 'w'), protocol=-1) watcher.stop_task(task_name) return filepath def _fit(self, manager): # TODO: check that data and task definition fit together! self._metric = manager.info['metric'] self._task = manager.info['task'] self._target_num = manager.info['target_num'] set_auto_seed(self._seed) # load data _save_ensemble_data( manager.data['X_train'], manager.data['Y_train'], self._tmp_dir, self._stopwatch) time_for_load_data = self._stopwatch.wall_elapsed(self._basename) if self._debug_mode: _print_load_time( self._basename, self._time_for_task, time_for_load_data, self._info) # == Calculate metafeatures meta_features = _calculate_meta_features( data_feat_type=manager.feat_type, data_info_task=manager.info['task'], basename=self._basename, metalearning_cnt=self._initial_configurations_via_metalearning, x_train=manager.data['X_train'], y_train=manager.data['Y_train'], watcher=self._stopwatch, log_function=self._debug) self._stopwatch.start_task('OneHot') manager.perform_hot_encoding() self._ohe = manager.encoder self._stopwatch.stop_task('OneHot') # == Pickle the data manager data_manager_path = self._save_data_manager( manager, self._tmp_dir, self._basename, watcher=self._stopwatch) # = Create a searchspace self._configuration_space, configspace_path = _create_search_space( self._tmp_dir, manager.info, self._stopwatch, self._debug) if meta_features is not None \ and manager.info['task'] in [MULTICLASS_CLASSIFICATION, BINARY_CLASSIFICATION]: meta_features_encoded = _calculate_meta_features_encoded( self._basename, manager.data['X_train'], manager.data['Y_train'], self._stopwatch, self._debug) self._debug(meta_features) self._debug(meta_features_encoded) initial_configurations = _get_initial_configuration( meta_features, meta_features_encoded, self._basename, self._metric, self._configuration_space, self._task, self._metadata_directory, self._initial_configurations_via_metalearning, manager.info['is_sparse'], self._stopwatch, self._error) _print_debug_info_of_init_configuration( initial_configurations, self._basename, self._time_for_task, self._debug, self._info, self._stopwatch) else: initial_configurations = [] self._critical('Metafeatures encoded not calculated') # == RUN SMAC proc_smac = _run_smac(self._tmp_dir, self._basename, self._time_for_task, self._ml_memory_limit, data_manager_path, configspace_path, initial_configurations, self._per_run_time_limit, self._stopwatch, self._debug) # == RUN ensemble builder proc_ensembles = _run_ensemble_builder( self._tmp_dir, self._output_dir, self._basename, self._time_for_task, self._task, self._metric, self._ensemble_size, self._ensemble_nbest, self._ensemble_indices_dir, self._stopwatch, self._debug ) if self._queue is not None: self._queue.put([time_for_load_data, data_manager_path, proc_smac, proc_ensembles]) else: self._debug("Real run SMAC") proc_smac.wait() self._debug("Real run RUNSOLVER") proc_ensembles.wait() # Delete AutoSklearn environment variable del_auto_seed() return self def predict(self, 
data_x):
        if self._keep_models is not True:
            raise ValueError(
                "Predict can only be called if 'keep_models==True'")
        assert os.path.isdir(self._model_dir), \
            "Model directory not found: %s" % self._model_dir

        model_files = os.listdir(self._model_dir)
        models = []
        for model_file in model_files:
            model_file = os.path.join(self._model_dir, model_file)
            with open(model_file) as fh:
                models.append(pickle.load(fh))

        if len(models) == 0:
            raise ValueError('No models fitted!')

        if self._ohe is not None:
            data_x = self._ohe._transform(data_x)

        indices_files = os.listdir(self._ensemble_indices_dir)
        assert len(indices_files), \
            "Ensemble indices not found in %s" % self._ensemble_indices_dir
        indices_files = sorted(indices_files)
        indices_file = os.path.join(self._ensemble_indices_dir,
                                    indices_files[-1])
        with open(indices_file) as fh:
            ensemble_members_run_numbers = pickle.load(fh)

        predictions = []
        for model, model_file in zip(models, model_files):
            num_run = int(model_file.split('.')[0])
            if num_run not in ensemble_members_run_numbers:
                continue
            weight = ensemble_members_run_numbers[num_run]

            X_ = data_x.copy()
            if self._task in REGRESSION_TASKS:
                prediction = model.predict(X_)
            else:
                prediction = model.predict_proba(X_)
            predictions.append(prediction * weight)

        predictions = np.sum(np.array(predictions), axis=0)
        return predictions

    def score(self, data_x, y):
        prediction = self.predict(data_x)
        return calculate_score(y, prediction, self._task,
                               self._metric, self._target_num)

    def configuration_space_created_hook(self):
        pass

    def get_params(self, deep=True):
        raise NotImplementedError('auto-sklearn does not implement '
                                  'get_params() because it is not intended to '
                                  'be optimized.')

    def set_params(self, deep=True):
        raise NotImplementedError('auto-sklearn does not implement '
                                  'set_params() because it is not intended to '
                                  'be optimized.')
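# ---------------------------------------------------------------------------
# Minimal sketch (illustrative only) of the weighted ensembling performed in
# predict() above: every member's prediction is scaled by the weight stored
# for its run number, and the scaled predictions are summed.
import numpy as np

def weighted_ensemble_prediction(predictions, weights):
    """predictions: list of (n_samples, n_classes) arrays; weights: floats."""
    return np.sum([w * p for w, p in zip(weights, predictions)], axis=0)

# Example: two members weighted 0.7 / 0.3.
# y_hat = weighted_ensemble_prediction([p1, p2], [0.7, 0.3])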
def main(logger, predictions_dir, basename, task_type, metric, limit, output_dir, ensemble_size=None): watch = StopWatch() watch.start_task('ensemble_builder') used_time = 0 time_iter = 0 index_run = 0 current_num_models = 0 while used_time < limit: logger.debug('Time left: %f' % (limit - used_time)) logger.debug('Time last iteration: %f' % time_iter) # Load the true labels of the validation data true_labels = np.load(os.path.join(predictions_dir, 'true_labels_ensemble.npy')) # Load the predictions from the models all_predictions_train = [] dir_ensemble = os.path.join(predictions_dir, 'predictions_ensemble/') dir_valid = os.path.join(predictions_dir, 'predictions_valid/') dir_test = os.path.join(predictions_dir, 'predictions_test/') if not os.path.isdir(dir_ensemble) or not os.path.isdir(dir_valid) or \ not os.path.isdir(dir_test): logger.debug('Prediction directory does not exist') time.sleep(2) used_time = watch.wall_elapsed('ensemble_builder') continue dir_ensemble_list = sorted(os.listdir(dir_ensemble)) dir_valid_list = sorted(os.listdir(dir_valid)) dir_test_list = sorted(os.listdir(dir_test)) if check_data(logger, len(dir_ensemble_list), len(dir_valid_list), len(dir_test_list), current_num_models): time.sleep(2) used_time = watch.wall_elapsed('ensemble_builder') continue watch.start_task('ensemble_iter_' + str(index_run)) # Binary mask where True indicates that the corresponding will be # excluded from the ensemble exclude_mask = [] if ensemble_size is not None: # Keeps track of the single scores of each model in our ensemble scores_nbest = [] # The indices of the model that are currently in our ensemble indices_nbest = [] model_idx = 0 for f in dir_ensemble_list: predictions = np.load(os.path.join(dir_ensemble, f)) score = calculate_score(true_labels, predictions, task_type, metric, predictions.shape[1]) if ensemble_size is not None: if score <= 0.001: exclude_mask.append(True) logger.error('Model only predicts at random: ' + f + ' has score: ' + str(score)) # If we have less model in our ensemble than ensemble_size add # the current model if it is better than random elif len(scores_nbest) < ensemble_size: scores_nbest.append(score) indices_nbest.append(model_idx) exclude_mask.append(False) else: # Take the worst performing model in our ensemble so far idx = np.argmin(np.array([scores_nbest])) # If the current model is better than the worst model in # our ensemble replace it by the current model if scores_nbest[idx] < score: logger.debug( 'Worst model in our ensemble: %d with score %f will be replaced by model %d with score %f' % (idx, scores_nbest[idx], model_idx, score)) scores_nbest[idx] = score # Exclude the old model exclude_mask[int(indices_nbest[idx])] = True indices_nbest[idx] = model_idx exclude_mask.append(False) # Otherwise exclude the current model from the ensemble else: exclude_mask.append(True) else: # Load all predictions that are better than random if score <= 0.001: exclude_mask.append(True) logger.error('Model only predicts at random: ' + f + ' has score: ' + str(score)) else: exclude_mask.append(False) all_predictions_train.append(predictions) model_idx += 1 print(exclude_mask) all_predictions_valid = get_predictions(dir_valid, dir_valid_list, exclude_mask) all_predictions_test = get_predictions(dir_test, dir_test_list, exclude_mask) if len(all_predictions_train) == len(all_predictions_test) == len( all_predictions_valid) == 0: logger.error('All models do just random guessing') time.sleep(2) continue if len(all_predictions_train) == 1: logger.debug('Only one model 
so far, we just copy its predictions')
            Y_valid = all_predictions_valid[0]
            Y_test = all_predictions_test[0]
        else:
            try:
                # Compute the weights for the ensemble,
                # starting from equally initialized weights
                n_models = len(all_predictions_train)
                init_weights = np.ones([n_models]) / n_models
                weights = weighted_ensemble(logger.debug,
                                            np.array(all_predictions_train),
                                            true_labels, task_type, metric,
                                            init_weights)
            except ValueError:
                logger.error('Caught ValueError!')
                used_time = watch.wall_elapsed('ensemble_builder')
                continue
            except Exception:
                logger.error('Caught error!')
                used_time = watch.wall_elapsed('ensemble_builder')
                continue

            # Compute the ensemble predictions for the valid data
            Y_valid = ensemble_prediction(np.array(all_predictions_valid),
                                          weights)

            # Compute the ensemble predictions for the test data
            Y_test = ensemble_prediction(np.array(all_predictions_test),
                                         weights)

        # Save predictions for the valid and test data sets
        filename_valid = os.path.join(
            output_dir,
            basename + '_valid_' + str(index_run).zfill(3) + '.predict')
        save_predictions(os.path.join(predictions_dir, filename_valid),
                         Y_valid)

        filename_test = os.path.join(
            output_dir,
            basename + '_test_' + str(index_run).zfill(3) + '.predict')
        save_predictions(os.path.join(predictions_dir, filename_test),
                         Y_test)

        current_num_models = len(dir_ensemble_list)
        watch.stop_task('ensemble_iter_' + str(index_run))
        time_iter = watch.get_wall_dur('ensemble_iter_' + str(index_run))
        used_time = watch.wall_elapsed('ensemble_builder')
        index_run += 1
    return
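# ---------------------------------------------------------------------------
# Hypothetical helper (not present in the module) mirroring the nbest
# bookkeeping inside main() above: a new model replaces the worst current
# ensemble member only if it scores better; the displaced (or rejected)
# model is marked True in `exclude_mask`.
import numpy as np

def update_nbest(scores_nbest, indices_nbest, exclude_mask, model_idx, score):
    """Apply the replace-worst-member rule for one candidate model."""
    idx = int(np.argmin(scores_nbest))
    if scores_nbest[idx] < score:
        # Exclude the old worst member and adopt the new model.
        exclude_mask[int(indices_nbest[idx])] = True
        scores_nbest[idx] = score
        indices_nbest[idx] = model_idx
        exclude_mask.append(False)
    else:
        # The new model is not good enough; exclude it instead.
        exclude_mask.append(True)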
class AutoML(BaseEstimator): def __init__(self, backend, time_left_for_this_task, per_run_time_limit, log_dir=None, initial_configurations_via_metalearning=25, ensemble_size=1, ensemble_nbest=1, seed=1, ml_memory_limit=3000, metadata_directory=None, keep_models=True, debug_mode=False, include_estimators=None, include_preprocessors=None, resampling_strategy='holdout-iterative-fit', resampling_strategy_arguments=None, delete_tmp_folder_after_terminate=False, delete_output_folder_after_terminate=False, shared_mode=False, precision=32, max_iter_smac=None, acquisition_function='EI'): super(AutoML, self).__init__() self._backend = backend #self._tmp_dir = tmp_dir #self._output_dir = output_dir self._time_for_task = time_left_for_this_task self._per_run_time_limit = per_run_time_limit #self._log_dir = log_dir if log_dir is not None else self._tmp_dir self._initial_configurations_via_metalearning = \ initial_configurations_via_metalearning self._ensemble_size = ensemble_size self._ensemble_nbest = ensemble_nbest self._seed = seed self._ml_memory_limit = ml_memory_limit self._data_memory_limit = None self._metadata_directory = metadata_directory self._keep_models = keep_models self._include_estimators = include_estimators self._include_preprocessors = include_preprocessors self._resampling_strategy = resampling_strategy self._resampling_strategy_arguments = resampling_strategy_arguments \ if resampling_strategy_arguments is not None else {} self._max_iter_smac = max_iter_smac #self.delete_tmp_folder_after_terminate = \ # delete_tmp_folder_after_terminate #self.delete_output_folder_after_terminate = \ # delete_output_folder_after_terminate self._shared_mode = shared_mode self.precision = precision self.acquisition_function = acquisition_function self._datamanager = None self._dataset_name = None self._stopwatch = StopWatch() self._logger = None self._task = None self._metric = None self._label_num = None self._parser = None self.models_ = None self.ensemble_ = None self._can_predict = False self._debug_mode = debug_mode if not isinstance(self._time_for_task, int): raise ValueError("time_left_for_this_task not of type integer, " "but %s" % str(type(self._time_for_task))) if not isinstance(self._per_run_time_limit, int): raise ValueError("per_run_time_limit not of type integer, but %s" % str(type(self._per_run_time_limit))) # After assignging and checking variables... 
#self._backend = Backend(self._output_dir, self._tmp_dir) def start_automl(self, parser): self._parser = parser self.start() def start(self): if self._parser is None: raise ValueError('You must invoke start() only via start_automl()') super(AutoML, self).start() def run(self): if self._parser is None: raise ValueError('You must invoke run() only via start_automl()') self._backend.save_start_time(self._seed) self._stopwatch = StopWatch() datamanager = get_data_manager(namespace=self._parser) self._stopwatch.start_task(datamanager.name) self._logger = self._get_logger(datamanager.name) self._datamanager = datamanager self._dataset_name = datamanager.name self._fit(self._datamanager) def fit(self, X, y, task=MULTICLASS_CLASSIFICATION, metric='acc_metric', feat_type=None, dataset_name=None): if not self._shared_mode: self._backend.context.delete_directories() else: # If this fails, it's likely that this is the first call to get # the data manager try: D = self._backend.load_datamanager() dataset_name = D.name except IOError: pass self._backend.context.create_directories() if dataset_name is None: dataset_name = hash_numpy_array(X) self._backend.save_start_time(self._seed) self._stopwatch = StopWatch() self._dataset_name = dataset_name self._stopwatch.start_task(self._dataset_name) self._logger = self._get_logger(dataset_name) if isinstance(metric, str): metric = STRING_TO_METRIC[metric] if feat_type is not None and len(feat_type) != X.shape[1]: raise ValueError('Array feat_type does not have same number of ' 'variables as X has features. %d vs %d.' % (len(feat_type), X.shape[1])) if feat_type is not None and not all( [isinstance(f, str) for f in feat_type]): raise ValueError('Array feat_type must only contain strings.') if feat_type is not None: for ft in feat_type: if ft.lower() not in ['categorical', 'numerical']: raise ValueError('Only `Categorical` and `Numerical` are ' 'valid feature types, you passed `%s`' % ft) self._data_memory_limit = None loaded_data_manager = XYDataManager(X, y, task=task, metric=metric, feat_type=feat_type, dataset_name=dataset_name, encode_labels=False) return self._fit(loaded_data_manager) def fit_automl_dataset(self, dataset): self._stopwatch = StopWatch() self._backend.save_start_time(self._seed) name = os.path.basename(dataset) self._stopwatch.start_task(name) self._start_task(self._stopwatch, name) self._dataset_name = name self._logger = self._get_logger(name) self._logger.debug('======== Reading and converting data ==========') # Encoding the labels will be done after the metafeature calculation! 
self._data_memory_limit = float(self._ml_memory_limit) / 3 loaded_data_manager = CompetitionDataManager( dataset, encode_labels=False, max_memory_in_mb=self._data_memory_limit) loaded_data_manager_str = str(loaded_data_manager).split('\n') for part in loaded_data_manager_str: self._logger.debug(part) return self._fit(loaded_data_manager) def _get_logger(self, name): logger_name = 'AutoML(%d):%s' % (self._seed, name) setup_logger( os.path.join(self._backend.temporary_directory, '%s.log' % str(logger_name))) return get_logger(logger_name) @staticmethod def _start_task(watcher, task_name): watcher.start_task(task_name) @staticmethod def _stop_task(watcher, task_name): watcher.stop_task(task_name) @staticmethod def _print_load_time(basename, time_left_for_this_task, time_for_load_data, logger): time_left_after_reading = max( 0, time_left_for_this_task - time_for_load_data) logger.info('Remaining time after reading %s %5.2f sec' % (basename, time_left_after_reading)) return time_for_load_data def _do_dummy_prediction(self, datamanager, num_run): self._logger.info("Starting to create dummy predictions.") # time_limit = int(self._time_for_task / 6.) memory_limit = int(self._ml_memory_limit) ta = ExecuteTaFuncWithQueue( backend=self._backend, autosklearn_seed=self._seed, resampling_strategy=self._resampling_strategy, initial_num_run=num_run, logger=self._logger, **self._resampling_strategy_arguments) status, cost, runtime, additional_info = \ ta.run(1, cutoff=self._time_for_task, memory_limit=memory_limit) if status == StatusType.SUCCESS: self._logger.info("Finished creating dummy predictions.") else: self._logger.error('Error creating dummy predictions:%s ', additional_info) #status, cost, runtime, additional_info = \ # ta.run(2, cutoff=time_limit, memory_limit=memory_limit) #if status == StatusType.SUCCESS: # self._logger.info("Finished creating dummy prediction 2/2.") #else: # self._logger.error('Error creating dummy prediction 2/2 %s', # additional_info) return ta.num_run def _fit(self, datamanager): # Reset learnt stuff self.models_ = None self.ensemble_ = None # Check arguments prior to doing anything! if self._resampling_strategy not in [ 'holdout', 'holdout-iterative-fit', 'cv', 'nested-cv', 'partial-cv' ]: raise ValueError('Illegal resampling strategy: %s' % self._resampling_strategy) if self._resampling_strategy == 'partial-cv' and \ self._ensemble_size != 0: raise ValueError("Resampling strategy partial-cv cannot be used " "together with ensembles.") acquisition_functions = ['EI', 'EIPS'] if self.acquisition_function not in acquisition_functions: raise ValueError( 'Illegal acquisition %s: Must be one of %s.' 
% (self.acquisition_function, acquisition_functions)) self._backend._make_internals_directory() if self._keep_models: try: os.mkdir(self._backend.get_model_dir()) except OSError: if not self._shared_mode: raise self._metric = datamanager.info['metric'] self._task = datamanager.info['task'] self._label_num = datamanager.info['label_num'] # == Pickle the data manager to speed up loading data_manager_path = self._backend.save_datamanager(datamanager) self._save_ensemble_data(datamanager.data['X_train'], datamanager.data['Y_train']) time_for_load_data = self._stopwatch.wall_elapsed(self._dataset_name) if self._debug_mode: self._print_load_time(self._dataset_name, self._time_for_task, time_for_load_data, self._logger) # == Perform dummy predictions num_run = 1 #if self._resampling_strategy in ['holdout', 'holdout-iterative-fit']: num_run = self._do_dummy_prediction(datamanager, num_run) # = Create a searchspace # Do this before One Hot Encoding to make sure that it creates a # search space for a dense classifier even if one hot encoding would # make it sparse (tradeoff; if one hot encoding would make it sparse, # densifier and truncatedSVD would probably lead to a MemoryError, # like this we can't use some of the preprocessing methods in case # the data became sparse) self.configuration_space, configspace_path = self._create_search_space( self._backend.temporary_directory, self._backend, datamanager, self._include_estimators, self._include_preprocessors) # == RUN ensemble builder # Do this before calculating the meta-features to make sure that the # dummy predictions are actually included in the ensemble even if # calculating the meta-features takes very long ensemble_task_name = 'runEnsemble' self._stopwatch.start_task(ensemble_task_name) time_left_for_ensembles = max(0,self._time_for_task \ - self._stopwatch.wall_elapsed(self._dataset_name)) if self._logger: self._logger.info('Start Ensemble with %5.2fsec time left' % time_left_for_ensembles) if time_left_for_ensembles <= 0: self._logger.warning("Not starting ensemble builder because there " "is no time left!") self._proc_ensemble = None else: self._proc_ensemble = self._get_ensemble_process( time_left_for_ensembles) if self._ensemble_size > 0: self._proc_ensemble.start() else: self._logger.info('Not starting ensemble builder because ' 'ensemble size is <= 0.') self._stopwatch.stop_task(ensemble_task_name) # kill the datamanager as it will be re-loaded anyways from sub processes try: del self._datamanager except Exception: pass # => RUN SMAC smac_task_name = 'runSMAC' self._stopwatch.start_task(smac_task_name) time_left_for_smac = max( 0, self._time_for_task - self._stopwatch.wall_elapsed(self._dataset_name)) if self._logger: self._logger.info('Start SMAC with %5.2fsec time left' % time_left_for_smac) if time_left_for_smac <= 0: self._logger.warning("Not starting SMAC because there is no time " "left.") _proc_smac = None else: if self._per_run_time_limit is None or \ self._per_run_time_limit > time_left_for_smac: print('Time limit for a single run is higher than total time ' 'limit. 
Capping the limit for a single run to the total ' 'time given to SMAC (%f)' % time_left_for_smac) per_run_time_limit = time_left_for_smac else: per_run_time_limit = self._per_run_time_limit _proc_smac = AutoMLSMBO( config_space=self.configuration_space, dataset_name=self._dataset_name, backend=self._backend, total_walltime_limit=time_left_for_smac, func_eval_time_limit=per_run_time_limit, memory_limit=self._ml_memory_limit, data_memory_limit=self._data_memory_limit, watcher=self._stopwatch, start_num_run=num_run, num_metalearning_cfgs=self. _initial_configurations_via_metalearning, config_file=configspace_path, smac_iters=self._max_iter_smac, seed=self._seed, metadata_directory=self._metadata_directory, resampling_strategy=self._resampling_strategy, resampling_strategy_args=self._resampling_strategy_arguments, acquisition_function=self.acquisition_function, shared_mode=self._shared_mode) self.runhistory_ = _proc_smac.run_smbo() self._proc_ensemble = None self._load_models() return self def refit(self, X, y): if self._keep_models is not True: raise ValueError( "Predict can only be called if 'keep_models==True'") if self.models_ is None or len(self.models_) == 0 or \ self.ensemble_ is None: self._load_models() random_state = np.random.RandomState(self._seed) for identifier in self.models_: if identifier in self.ensemble_.get_model_identifiers(): model = self.models_[identifier] # this updates the model inplace, it can then later be used in # predict method # try to fit the model. If it fails, shuffle the data. This # could alleviate the problem in algorithms that depend on # the ordering of the data. for i in range(10): try: model.fit(X.copy(), y.copy()) break except ValueError: indices = list(range(X.shape[0])) random_state.shuffle(indices) X = X[indices] y = y[indices] self._can_predict = True return self def predict(self, X): if self._keep_models is not True: raise ValueError( "Predict can only be called if 'keep_models==True'") if not self._can_predict and \ self._resampling_strategy not in \ ['holdout', 'holdout-iterative-fit']: raise NotImplementedError( 'Predict is currently only implemented for resampling ' 'strategy holdout.') if self.models_ is None or len(self.models_) == 0 or \ self.ensemble_ is None: self._load_models() all_predictions = [] for identifier in self.ensemble_.get_model_identifiers(): model = self.models_[identifier] X_ = X.copy() if self._task in REGRESSION_TASKS: prediction = model.predict(X_) else: prediction = model.predict_proba(X_) if len(prediction.shape) < 1 or len(X_.shape) < 1 or \ X_.shape[0] < 1 or prediction.shape[0] != X_.shape[0]: self._logger.warning( "Prediction shape for model %s is %s " "while X_.shape is %s" % (model, str(prediction.shape), str(X_.shape))) all_predictions.append(prediction) if len(all_predictions) == 0: raise ValueError( 'Something went wrong generating the predictions. 
' 'The ensemble should consist of the following ' 'models: %s, the following models were loaded: ' '%s' % (str(list(self.ensemble_indices_.keys())), str(list(self.models_.keys())))) predictions = self.ensemble_.predict(all_predictions) return predictions def fit_ensemble(self, y, task=None, metric=None, precision='32', dataset_name=None, ensemble_nbest=None, ensemble_size=None): if self._logger is None: self._logger = self._get_logger(dataset_name) self._proc_ensemble = self._get_ensemble_process( 1, task, metric, precision, dataset_name, max_iterations=1, ensemble_nbest=ensemble_nbest, ensemble_size=ensemble_size) self._proc_ensemble.main() return self def _get_ensemble_process(self, time_left_for_ensembles, task=None, metric=None, precision=None, dataset_name=None, max_iterations=-1, ensemble_nbest=None, ensemble_size=None): if task is None: task = self._task else: self._task = task if metric is None: metric = self._metric else: self._metric = metric if precision is None: precision = self.precision else: self.precision = precision if dataset_name is None: dataset_name = self._dataset_name else: self._dataset_name = dataset_name if ensemble_nbest is None: ensemble_nbest = self._ensemble_nbest else: self._ensemble_nbest = ensemble_nbest if ensemble_size is None: ensemble_size = self._ensemble_size else: self._ensemble_size = ensemble_size return EnsembleBuilder(backend=self._backend, dataset_name=dataset_name, task_type=task, metric=metric, limit=time_left_for_ensembles, ensemble_size=ensemble_size, ensemble_nbest=ensemble_nbest, seed=self._seed, shared_mode=self._shared_mode, precision=precision, max_iterations=max_iterations) def _load_models(self): if self._shared_mode: seed = -1 else: seed = self._seed self.ensemble_ = self._backend.load_ensemble(seed) if self.ensemble_: identifiers = self.ensemble_.identifiers_ self.models_ = self._backend.load_models_by_identifiers( identifiers) else: self.models_ = self._backend.load_all_models(seed) if len(self.models_) == 0: raise ValueError('No models fitted!') def score(self, X, y): # fix: Consider only index 1 of second dimension # Don't know if the reshaping should be done there or in calculate_score prediction = self.predict(X) return calculate_score(y, prediction, self._task, self._metric, self._label_num, logger=self._logger) @property def grid_scores_(self): grid_scores = list() scores_per_config = defaultdict(list) config_list = list() for run_key in self.runhistory_.data: run_value = self.runhistory_.data[run_key] config_id = run_key.config_id cost = run_value.cost if config_id not in config_list: config_list.append(config_id) scores_per_config[config_id].append(cost) for config_id in config_list: scores = [1 - score for score in scores_per_config[config_id]] mean_score = np.mean(scores) config = self.runhistory_.ids_config[config_id] grid_score = _CVScoreTuple(config.get_dictionary(), mean_score, scores) grid_scores.append(grid_score) return grid_scores @property def cv_results_(self): results = dict() # Missing in contrast to scikit-learn # splitX_test_score - auto-sklearn does not store the scores on a split # basis # std_test_score - auto-sklearn does not store the scores on a split # basis # splitX_train_score - auto-sklearn does not compute train scores, add # flag to compute the train scores # mean_train_score - auto-sklearn does not store the train scores # std_train_score - auto-sklearn does not store the train scores # std_fit_time - auto-sklearn does not store the fit times per split # mean_score_time - auto-sklearn does not 
store the score time
        # std_score_time - auto-sklearn does not store the score time
        # TODO: add those arguments

        parameter_dictionaries = dict()
        masks = dict()
        hp_names = []

        # Set up dictionary for parameter values
        for hp in self.configuration_space.get_hyperparameters():
            name = hp.name
            parameter_dictionaries[name] = []
            masks[name] = []
            hp_names.append(name)

        mean_test_score = []
        mean_fit_time = []
        params = []
        status = []
        for run_key in self.runhistory_.data:
            run_value = self.runhistory_.data[run_key]
            config_id = run_key.config_id
            config = self.runhistory_.ids_config[config_id]

            param_dict = config.get_dictionary()
            params.append(param_dict)
            mean_test_score.append(1 - run_value.cost)
            mean_fit_time.append(run_value.time)
            s = run_value.status
            if s == 1:
                status.append('Success')
            elif s == 2:
                status.append('Timeout')
            elif s == 3:
                status.append('Crash')
            elif s == 4:
                status.append('Abort')
            elif s == 5:
                status.append('Memout')
            else:
                status.append('Unknown')

            for hp_name in hp_names:
                if hp_name in param_dict:
                    hp_value = param_dict[hp_name]
                    mask_value = False
                else:
                    hp_value = np.NaN
                    mask_value = True

                parameter_dictionaries[hp_name].append(hp_value)
                masks[hp_name].append(mask_value)

        results['mean_test_score'] = np.array(mean_test_score)
        results['mean_fit_time'] = np.array(mean_fit_time)
        results['params'] = params
        results['rank_test_scores'] = scipy.stats.rankdata(
            1 - results['mean_test_score'], method='min')
        results['status'] = status

        for hp_name in hp_names:
            masked_array = ma.MaskedArray(parameter_dictionaries[hp_name],
                                          masks[hp_name])
            results['param_%s' % hp_name] = masked_array

        return results

    def sprint_statistics(self):
        cv_results = self.cv_results_
        sio = io.StringIO()
        sio.write('auto-sklearn results:\n')
        sio.write('  Dataset name: %s\n' % self._dataset_name)
        sio.write('  Metric: %s\n' % METRIC_TO_STRING[self._metric])
        idx_best_run = np.argmax(cv_results['mean_test_score'])
        best_score = cv_results['mean_test_score'][idx_best_run]
        sio.write('  Best validation score: %f\n' % best_score)
        num_runs = len(cv_results['status'])
        sio.write('  Number of target algorithm runs: %d\n' % num_runs)
        num_success = sum([s == 'Success' for s in cv_results['status']])
        sio.write('  Number of successful target algorithm runs: %d\n' %
                  num_success)
        num_crash = sum([s == 'Crash' for s in cv_results['status']])
        sio.write('  Number of crashed target algorithm runs: %d\n' %
                  num_crash)
        num_timeout = sum([s == 'Timeout' for s in cv_results['status']])
        sio.write('  Number of target algorithms that exceeded the time '
                  'limit: %d\n' % num_timeout)
        num_memout = sum([s == 'Memout' for s in cv_results['status']])
        sio.write('  Number of target algorithms that exceeded the memory '
                  'limit: %d\n' % num_memout)
        return sio.getvalue()

    def show_models(self):
        if self.models_ is None or len(self.models_) == 0 or \
                self.ensemble_ is None:
            self._load_models()
        return self.ensemble_.pprint_ensemble_string(self.models_)

    def get_models(self):
        if self.models_ is None or len(self.models_) == 0 or \
                self.ensemble_ is None:
            self._load_models()
        return self.ensemble_.show_ensemble(self.models_)

    def _save_ensemble_data(self, X, y):
        """Split dataset and store Data for the ensemble script.
:param X: :param y: :return: """ task_name = 'LoadData' self._start_task(self._stopwatch, task_name) _, _, _, y_ensemble = resampling.split_data(X, y) self._backend.save_targets_ensemble(y_ensemble) self._stop_task(self._stopwatch, task_name) def _create_search_space(self, tmp_dir, backend, datamanager, include_estimators=None, include_preprocessors=None): task_name = 'CreateConfigSpace' self._stopwatch.start_task(task_name) configspace_path = os.path.join(tmp_dir, 'space.pcs') configuration_space = pipeline.get_configuration_space( datamanager.info, include_estimators=include_estimators, include_preprocessors=include_preprocessors) configuration_space = self.configuration_space_created_hook( datamanager, configuration_space) sp_string = pcs.write(configuration_space) backend.write_txt_file(configspace_path, sp_string, 'Configuration space') self._stopwatch.stop_task(task_name) return configuration_space, configspace_path def configuration_space_created_hook(self, datamanager, configuration_space): return configuration_space
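# A minimal, self-contained sketch (not auto-sklearn code) of the budgeting
# pattern used by _fit() above: each phase's remaining budget is derived as
# max(0, total_time - elapsed), and a phase is skipped once the budget is
# exhausted. time.time() stands in for the StopWatch used above; the phase
# names are illustrative.
import time

def remaining_budget(total_budget, start_time):
    """Wall-clock seconds left out of total_budget."""
    return max(0.0, total_budget - (time.time() - start_time))

_start = time.time()
_total = 3600.0  # plays the role of time_left_for_this_task
for _phase in ('runEnsemble', 'runSMAC'):
    _left = remaining_budget(_total, _start)
    if _left <= 0:
        print('Not starting %s because there is no time left!' % _phase)
    else:
        print('Start %s with %5.2f sec time left' % (_phase, _left))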
class AutoML(BaseEstimator, multiprocessing.Process): def __init__(self, tmp_dir, output_dir, time_left_for_this_task, per_run_time_limit, log_dir=None, initial_configurations_via_metalearning=25, ensemble_size=1, ensemble_nbest=1, seed=1, ml_memory_limit=3000, metadata_directory=None, queue=None, keep_models=True, debug_mode=False, include_estimators=None, include_preprocessors=None, resampling_strategy='holdout-iterative-fit', resampling_strategy_arguments=None, delete_tmp_folder_after_terminate=False, delete_output_folder_after_terminate=False, shared_mode=False, precision=32, max_iter_smac=None, acquisition_function='EI'): super(AutoML, self).__init__() self._tmp_dir = tmp_dir self._output_dir = output_dir self._time_for_task = time_left_for_this_task self._per_run_time_limit = per_run_time_limit self._log_dir = log_dir if log_dir is not None else self._tmp_dir self._initial_configurations_via_metalearning = \ initial_configurations_via_metalearning self._ensemble_size = ensemble_size self._ensemble_nbest = ensemble_nbest self._seed = seed self._ml_memory_limit = ml_memory_limit self._data_memory_limit = None self._metadata_directory = metadata_directory self._queue = queue self._keep_models = keep_models self._include_estimators = include_estimators self._include_preprocessors = include_preprocessors self._resampling_strategy = resampling_strategy self._resampling_strategy_arguments = resampling_strategy_arguments self._max_iter_smac = max_iter_smac self.delete_tmp_folder_after_terminate = \ delete_tmp_folder_after_terminate self.delete_output_folder_after_terminate = \ delete_output_folder_after_terminate self._shared_mode = shared_mode self.precision = precision self.acquisition_function = acquisition_function self._datamanager = None self._dataset_name = None self._stopwatch = StopWatch() self._logger = None self._task = None self._metric = None self._label_num = None self._parser = None self.models_ = None self.ensemble_ = None self._can_predict = False self._debug_mode = debug_mode if not isinstance(self._time_for_task, int): raise ValueError("time_left_for_this_task not of type integer, " "but %s" % str(type(self._time_for_task))) if not isinstance(self._per_run_time_limit, int): raise ValueError("per_run_time_limit not of type integer, but %s" % str(type(self._per_run_time_limit))) # After assignging and checking variables... 
self._backend = Backend(self._output_dir, self._tmp_dir) def start_automl(self, parser): self._parser = parser self.start() def start(self): if self._parser is None: raise ValueError('You must invoke start() only via start_automl()') super(AutoML, self).start() def run(self): if self._parser is None: raise ValueError('You must invoke run() only via start_automl()') self._backend.save_start_time(self._seed) self._stopwatch = StopWatch() datamanager = get_data_manager(namespace=self._parser) self._stopwatch.start_task(datamanager.name) self._logger = self._get_logger(datamanager.name) self._datamanager = datamanager self._dataset_name = datamanager.name self._fit(self._datamanager) def fit(self, X, y, task=MULTICLASS_CLASSIFICATION, metric='acc_metric', feat_type=None, dataset_name=None): if dataset_name is None: m = hashlib.md5() m.update(X.data) dataset_name = m.hexdigest() self._backend.save_start_time(self._seed) self._stopwatch = StopWatch() self._dataset_name = dataset_name self._stopwatch.start_task(self._dataset_name) self._logger = self._get_logger(dataset_name) if isinstance(metric, str): metric = STRING_TO_METRIC[metric] if feat_type is not None and len(feat_type) != X.shape[1]: raise ValueError('Array feat_type does not have same number of ' 'variables as X has features. %d vs %d.' % (len(feat_type), X.shape[1])) if feat_type is not None and not all([isinstance(f, str) for f in feat_type]): raise ValueError('Array feat_type must only contain strings.') if feat_type is not None: for ft in feat_type: if ft.lower() not in ['categorical', 'numerical']: raise ValueError('Only `Categorical` and `Numerical` are ' 'valid feature types, you passed `%s`' % ft) self._data_memory_limit = None loaded_data_manager = XYDataManager(X, y, task=task, metric=metric, feat_type=feat_type, dataset_name=dataset_name, encode_labels=False) return self._fit(loaded_data_manager) def fit_automl_dataset(self, dataset): self._stopwatch = StopWatch() self._backend.save_start_time(self._seed) name = os.path.basename(dataset) self._stopwatch.start_task(name) self._start_task(self._stopwatch, name) self._dataset_name = name self._logger = self._get_logger(name) self._logger.debug('======== Reading and converting data ==========') # Encoding the labels will be done after the metafeature calculation! self._data_memory_limit = float(self._ml_memory_limit) / 3 loaded_data_manager = CompetitionDataManager( dataset, encode_labels=False, max_memory_in_mb=self._data_memory_limit) loaded_data_manager_str = str(loaded_data_manager).split('\n') for part in loaded_data_manager_str: self._logger.debug(part) return self._fit(loaded_data_manager) def _get_logger(self, name): logger_name = 'AutoML(%d):%s' % (self._seed, name) setup_logger(os.path.join(self._tmp_dir, '%s.log' % str(logger_name))) return get_logger(logger_name) @staticmethod def _start_task(watcher, task_name): watcher.start_task(task_name) @staticmethod def _stop_task(watcher, task_name): watcher.stop_task(task_name) @staticmethod def _print_load_time(basename, time_left_for_this_task, time_for_load_data, logger): time_left_after_reading = max( 0, time_left_for_this_task - time_for_load_data) logger.info('Remaining time after reading %s %5.2f sec' % (basename, time_left_after_reading)) return time_for_load_data def _do_dummy_prediction(self, datamanager, num_run): self._logger.info("Starting to create dummy predictions.") time_limit = int(self._time_for_task / 6.) 
memory_limit = int(self._ml_memory_limit) _info = eval_with_limits(datamanager, self._tmp_dir, 1, self._seed, num_run, self._resampling_strategy, self._resampling_strategy_arguments, memory_limit, time_limit) if _info[4] == StatusType.SUCCESS: self._logger.info("Finished creating dummy prediction 1/2.") else: self._logger.error('Error creating dummy prediction 1/2:%s ', _info[3]) num_run += 1 _info = eval_with_limits(datamanager, self._tmp_dir, 2, self._seed, num_run, self._resampling_strategy, self._resampling_strategy_arguments, memory_limit, time_limit) if _info[4] == StatusType.SUCCESS: self._logger.info("Finished creating dummy prediction 2/2.") else: self._logger.error('Error creating dummy prediction 2/2 %s', _info[3]) num_run += 1 return num_run def _fit(self, datamanager): # Reset learnt stuff self.models_ = None self.ensemble_ = None # Check arguments prior to doing anything! if self._resampling_strategy not in ['holdout', 'holdout-iterative-fit', 'cv', 'nested-cv', 'partial-cv']: raise ValueError('Illegal resampling strategy: %s' % self._resampling_strategy) if self._resampling_strategy == 'partial-cv' and \ self._ensemble_size != 0: raise ValueError("Resampling strategy partial-cv cannot be used " "together with ensembles.") acquisition_functions = ['EI', 'EIPS'] if self.acquisition_function not in acquisition_functions: raise ValueError('Illegal acquisition %s: Must be one of %s.' % (self.acquisition_function, acquisition_functions)) self._backend._make_internals_directory() if self._keep_models: try: os.mkdir(self._backend.get_model_dir()) except OSError: self._logger.warning("model directory already exists") if not self._shared_mode: raise self._metric = datamanager.info['metric'] self._task = datamanager.info['task'] self._label_num = datamanager.info['label_num'] # == Pickle the data manager to speed up loading data_manager_path = self._backend.save_datamanager(datamanager) self._save_ensemble_data( datamanager.data['X_train'], datamanager.data['Y_train']) time_for_load_data = self._stopwatch.wall_elapsed(self._dataset_name) if self._debug_mode: self._print_load_time( self._dataset_name, self._time_for_task, time_for_load_data, self._logger) # == Perform dummy predictions num_run = 1 #if self._resampling_strategy in ['holdout', 'holdout-iterative-fit']: num_run = self._do_dummy_prediction(datamanager, num_run) # = Create a searchspace # Do this before One Hot Encoding to make sure that it creates a # search space for a dense classifier even if one hot encoding would # make it sparse (tradeoff; if one hot encoding would make it sparse, # densifier and truncatedSVD would probably lead to a MemoryError, # like this we can't use some of the preprocessing methods in case # the data became sparse) self.configuration_space, configspace_path = self._create_search_space( self._tmp_dir, self._backend, datamanager, self._include_estimators, self._include_preprocessors) # == RUN ensemble builder # Do this before calculating the meta-features to make sure that the # dummy predictions are actually included in the ensemble even if # calculating the meta-features takes very long ensemble_task_name = 'runEnsemble' self._stopwatch.start_task(ensemble_task_name) time_left_for_ensembles = max(0,self._time_for_task \ - self._stopwatch.wall_elapsed(self._dataset_name)) if self._logger: self._logger.info( 'Start Ensemble with %5.2fsec time left' % time_left_for_ensembles) if time_left_for_ensembles <= 0: self._logger.warning("Not starting ensemble builder because there " "is no time left!") 
self._proc_ensemble = None
        else:
            self._proc_ensemble = self._get_ensemble_process(
                time_left_for_ensembles)
            self._proc_ensemble.start()
        self._stopwatch.stop_task(ensemble_task_name)

        # == RUN SMBO
        # kill the datamanager as it will be re-loaded anyways from sub
        # processes
        try:
            del self._datamanager
        except Exception:
            pass

        # => RUN SMAC
        smac_task_name = 'runSMAC'
        self._stopwatch.start_task(smac_task_name)
        time_left_for_smac = max(
            0, self._time_for_task - self._stopwatch.wall_elapsed(
                self._dataset_name))

        if self._logger:
            self._logger.info(
                'Start SMAC with %5.2fsec time left' % time_left_for_smac)
        if time_left_for_smac <= 0:
            self._logger.warning("Not starting SMAC because there is no time "
                                 "left.")
            self._proc_smac = None
        else:
            self._proc_smac = AutoMLSMBO(
                config_space=self.configuration_space,
                dataset_name=self._dataset_name,
                tmp_dir=self._tmp_dir,
                output_dir=self._output_dir,
                total_walltime_limit=time_left_for_smac,
                func_eval_time_limit=self._per_run_time_limit,
                memory_limit=self._ml_memory_limit,
                data_memory_limit=self._data_memory_limit,
                watcher=self._stopwatch,
                start_num_run=num_run,
                num_metalearning_cfgs=self._initial_configurations_via_metalearning,
                config_file=configspace_path,
                smac_iters=self._max_iter_smac,
                seed=self._seed,
                metadata_directory=self._metadata_directory,
                resampling_strategy=self._resampling_strategy,
                resampling_strategy_args=self._resampling_strategy_arguments,
                acquisition_function=self.acquisition_function,
                shared_mode=self._shared_mode)
            self._proc_smac.start()

        psutil_procs = []
        procs = []
        if self._proc_smac is not None:
            proc_smac = psutil.Process(self._proc_smac.pid)
            psutil_procs.append(proc_smac)
            procs.append(self._proc_smac)
        if self._proc_ensemble is not None:
            proc_ensemble = psutil.Process(self._proc_ensemble.pid)
            psutil_procs.append(proc_ensemble)
            procs.append(self._proc_ensemble)

        if self._queue is not None:
            self._queue.put([time_for_load_data, data_manager_path,
                             psutil_procs])
        else:
            for proc in procs:
                proc.join()

        if self._queue is None:
            self._load_models()

        return self

    def refit(self, X, y):
        if self._keep_models is not True:
            raise ValueError(
                "Predict can only be called if 'keep_models==True'")
        if self.models_ is None or len(self.models_) == 0 or \
                self.ensemble_ is None:
            self._load_models()

        for identifier in self.models_:
            if identifier in self.ensemble_.get_model_identifiers():
                model = self.models_[identifier]
                # this updates the model inplace, it can then later be used
                # in the predict method
                model.fit(X.copy(), y.copy())

        self._can_predict = True

    def predict(self, X):
        return np.argmax(self.predict_proba(X), axis=1)

    def predict_proba(self, X):
        if self._keep_models is not True:
            raise ValueError(
                "Predict can only be called if 'keep_models==True'")
        if not self._can_predict and \
                self._resampling_strategy not in \
                ['holdout', 'holdout-iterative-fit']:
            raise NotImplementedError(
                'Predict is currently only implemented for resampling '
                'strategy holdout.')

        if self.models_ is None or len(self.models_) == 0 or \
                self.ensemble_ is None:
            self._load_models()

        all_predictions = []
        for identifier in self.ensemble_.get_model_identifiers():
            model = self.models_[identifier]

            X_ = X.copy()
            if self._task in REGRESSION_TASKS:
                prediction = model.predict(X_)
            else:
                prediction = model.predict_proba(X_)

            if len(prediction.shape) < 1 or len(X_.shape) < 1 or \
                    X_.shape[0] < 1 or prediction.shape[0] != X_.shape[0]:
                self._logger.warning("Prediction shape for model %s is %s "
                                     "while X_.shape is %s" %
                                     (model, str(prediction.shape),
                                      str(X_.shape)))
            all_predictions.append(prediction)

        if len(all_predictions) == 0:
            raise ValueError('Something went wrong generating the '
                             'predictions. The ensemble should consist of '
                             'the following models: %s, the following '
                             'models were loaded: %s' %
                             (str(list(self.ensemble_indices_.keys())),
                              str(list(self.models_.keys()))))

        predictions = self.ensemble_.predict(all_predictions)
        return predictions

    def fit_ensemble(self, task=None, metric=None, precision='32',
                     dataset_name=None, ensemble_nbest=None,
                     ensemble_size=None):
        if self._logger is None:
            self._logger = self._get_logger(dataset_name)

        self._proc_ensemble = self._get_ensemble_process(
            1, task, metric, precision, dataset_name, max_iterations=1,
            ensemble_nbest=ensemble_nbest, ensemble_size=ensemble_size)
        self._proc_ensemble.main()

    def _get_ensemble_process(self, time_left_for_ensembles, task=None,
                              metric=None, precision=None, dataset_name=None,
                              max_iterations=-1, ensemble_nbest=None,
                              ensemble_size=None):
        if task is None:
            task = self._task
        if metric is None:
            metric = self._metric
        if precision is None:
            precision = self.precision
        if dataset_name is None:
            dataset_name = self._dataset_name
        if ensemble_nbest is None:
            ensemble_nbest = self._ensemble_nbest
        if ensemble_size is None:
            ensemble_size = self._ensemble_size

        return EnsembleBuilder(autosklearn_tmp_dir=self._tmp_dir,
                               dataset_name=dataset_name,
                               task_type=task,
                               metric=metric,
                               limit=time_left_for_ensembles,
                               output_dir=self._output_dir,
                               ensemble_size=ensemble_size,
                               ensemble_nbest=ensemble_nbest,
                               seed=self._seed,
                               shared_mode=self._shared_mode,
                               precision=precision,
                               max_iterations=max_iterations)

    def _load_models(self):
        if self._shared_mode:
            seed = -1
        else:
            seed = self._seed

        self.ensemble_ = self._backend.load_ensemble(seed)
        if self.ensemble_:
            identifiers = self.ensemble_.identifiers_
            self.models_ = self._backend.load_models_by_identifiers(
                identifiers)
        else:
            self.models_ = self._backend.load_all_models(seed)

        if len(self.models_) == 0:
            raise ValueError('No models fitted!')

    def score(self, X, y):
        # fix: Consider only index 1 of second dimension
        # Don't know if the reshaping should be done there or in
        # calculate_score
        prediction = self.predict_proba(X)
        return calculate_score(y, prediction, self._task, self._metric,
                               self._label_num, logger=self._logger)

    def show_models(self):
        if self.models_ is None or len(self.models_) == 0 or \
                self.ensemble_ is None:
            self._load_models()
        return self.ensemble_.pprint_ensemble_string(self.models_)

    def _save_ensemble_data(self, X, y):
        """Split dataset and store Data for the ensemble script.
:param X: :param y: :return: """ task_name = 'LoadData' self._start_task(self._stopwatch, task_name) _, _, _, y_ensemble = resampling.split_data(X, y) self._backend.save_targets_ensemble(y_ensemble) self._stop_task(self._stopwatch, task_name) def _create_search_space(self, tmp_dir, backend, datamanager, include_estimators=None, include_preprocessors=None): task_name = 'CreateConfigSpace' self._stopwatch.start_task(task_name) configspace_path = os.path.join(tmp_dir, 'space.pcs') configuration_space = pipeline.get_configuration_space( datamanager.info, include_estimators=include_estimators, include_preprocessors=include_preprocessors) configuration_space = self.configuration_space_created_hook( datamanager, configuration_space) sp_string = pcs.write(configuration_space) backend.write_txt_file(configspace_path, sp_string, 'Configuration space') self._stopwatch.stop_task(task_name) return configuration_space, configspace_path def configuration_space_created_hook(self, datamanager, configuration_space): return configuration_space def get_params(self, deep=True): raise NotImplementedError('auto-sklearn does not implement ' 'get_params() because it is not intended to ' 'be optimized.') def set_params(self, deep=True): raise NotImplementedError('auto-sklearn does not implement ' 'set_params() because it is not intended to ' 'be optimized.') def __del__(self): self._delete_output_directories() def _delete_output_directories(self): if self.delete_output_folder_after_terminate: try: shutil.rmtree(self._output_dir) except Exception: if self._logger is not None: self._logger.warning("Could not delete output dir: %s" % self._output_dir) else: print("Could not delete output dir: %s" % self._output_dir) if self.delete_tmp_folder_after_terminate: try: shutil.rmtree(self._tmp_dir) except Exception: if self._logger is not None: self._logger.warning("Could not delete tmp dir: %s" % self._tmp_dir) pass else: print("Could not delete tmp dir: %s" % self._tmp_dir)
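# A minimal, self-contained sketch (not auto-sklearn code) of the pattern
# used by the class above: it subclasses multiprocessing.Process and guards
# start()/run() so they are only reachable through start_automl(), which
# first stores the parsed arguments on the instance. Worker and start_work
# are illustrative names.
import multiprocessing

class Worker(multiprocessing.Process):
    def __init__(self):
        super(Worker, self).__init__()
        self._parser = None

    def start_work(self, parser):
        self._parser = parser
        self.start()  # spawns the child process, which calls run()

    def run(self):
        if self._parser is None:
            raise ValueError('You must invoke run() only via start_work()')
        print('child process working with %s' % self._parser)

if __name__ == '__main__':
    w = Worker()
    w.start_work('dummy-args')
    w.join()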
def main(logger, predictions_dir, basename, task_type, metric, limit, output_dir, ensemble_size=None, seed=1, indices_output_dir='.'): watch = StopWatch() watch.start_task('ensemble_builder') task_type = STRING_TO_TASK_TYPES[task_type] used_time = 0 time_iter = 0 index_run = 0 current_num_models = 0 dir_ensemble = join(predictions_dir, 'predictions_ensemble_%s/' % seed) dir_valid = join(predictions_dir, 'predictions_valid_%s/' % seed) dir_test = join(predictions_dir, 'predictions_test_%s/' % seed) paths_ = [dir_ensemble, dir_valid, dir_test] tru_labels_path = join(predictions_dir, 'true_labels_ensemble.npy') while used_time < limit: logger.debug('Time left: %f', limit - used_time) logger.debug('Time last iteration: %f', time_iter) # Load the true labels of the validation data true_labels = np.load(tru_labels_path) # Load the predictions from the models exists = [os.path.isdir(dir_) for dir_ in paths_] if not exists[0]: # all(exists): logger.debug('Prediction directory %s does not exist!' % dir_ensemble) time.sleep(2) used_time = watch.wall_elapsed('ensemble_builder') continue dir_ensemble_list = sorted(os.listdir(dir_ensemble)) dir_valid_list = sorted(os.listdir(dir_valid)) if exists[1] else [] dir_test_list = sorted(os.listdir(dir_test)) if exists[2] else [] if check_data(logger, len(dir_ensemble_list), current_num_models): time.sleep(2) used_time = watch.wall_elapsed('ensemble_builder') continue watch.start_task('ensemble_iter_' + str(index_run)) # List of num_runs (which are in the filename) which will be included # later include_num_runs = [] re_num_run = re.compile(r'_([0-9]*)\.npy$') if ensemble_size is not None: # Keeps track of the single scores of each model in our ensemble scores_nbest = [] # The indices of the model that are currently in our ensemble indices_nbest = [] # The names of the models model_names = [] # The num run of the models num_runs = [] model_names_to_scores = dict() model_idx = 0 for model_name in dir_ensemble_list: predictions = np.load(os.path.join(dir_ensemble, model_name)) score = calculate_score(true_labels, predictions, task_type, metric, predictions.shape[1]) model_names_to_scores[model_name] = score num_run = int(re_num_run.search(model_name).group(1)) if ensemble_size is not None: if score <= 0.001: # include_num_runs.append(True) logger.error('Model only predicts at random: ' + model_name + ' has score: ' + str(score)) # If we have less models in our ensemble than ensemble_size add # the current model if it is better than random elif len(scores_nbest) < ensemble_size: scores_nbest.append(score) indices_nbest.append(model_idx) include_num_runs.append(num_run) model_names.append(model_name) num_runs.append(num_run) else: # Take the worst performing model in our ensemble so far idx = np.argmin(np.array([scores_nbest])) # If the current model is better than the worst model in # our ensemble replace it by the current model if scores_nbest[idx] < score: logger.debug('Worst model in our ensemble: %s with ' 'score %f will be replaced by model %s ' 'with score %f', model_names[idx], scores_nbest[idx], model_name, score) # Exclude the old model del scores_nbest[idx] scores_nbest.append(score) del include_num_runs[idx] del indices_nbest[idx] indices_nbest.append(model_idx) include_num_runs.append(num_run) del model_names[idx] model_names.append(model_name) del num_runs[idx] num_runs.append(num_run) # Otherwise exclude the current model from the ensemble else: # include_num_runs.append(True) pass else: # Load all predictions that are better than random if score <= 
0.001: # include_num_runs.append(True) logger.error('Model only predicts at random: ' + model_name + ' has score: ' + str(score)) else: include_num_runs.append(num_run) model_idx += 1 indices_to_model_names = dict() indices_to_run_num = dict() for i, model_name in enumerate(dir_ensemble_list): num_run = int(re_num_run.search(model_name).group(1)) if num_run in include_num_runs: num_indices = len(indices_to_model_names) indices_to_model_names[num_indices] = model_name indices_to_run_num[num_indices] = num_run # logging.info("Indices to model names:") # logging.info(indices_to_model_names) # for i, item in enumerate(sorted(model_names_to_scores.items(), # key=lambda t: t[1])): # logging.info("%d: %s", i, item) include_num_runs = set(include_num_runs) all_predictions_train = get_predictions(dir_ensemble, dir_ensemble_list, include_num_runs, re_num_run) all_predictions_valid = get_predictions(dir_valid, dir_valid_list, include_num_runs, re_num_run) all_predictions_test = get_predictions(dir_test, dir_test_list, include_num_runs, re_num_run) if len(all_predictions_train) == len(all_predictions_test) == len( all_predictions_valid) == 0: logger.error('All models do just random guessing') time.sleep(2) continue elif len(all_predictions_train) == 1: logger.debug('Only one model so far we just copy its predictions') ensemble_members_run_numbers = {0: 1.0} # Output the score logger.info('Training performance: %f' % np.max(model_names_to_scores.values())) else: try: indices, trajectory = ensemble_selection( np.array(all_predictions_train), true_labels, ensemble_size, task_type, metric) logger.info('Trajectory and indices!') logger.info(trajectory) logger.info(indices) except ValueError as e: logger.error('Caught ValueError: ' + str(e)) used_time = watch.wall_elapsed('ensemble_builder') continue except Exception as e: logger.error('Caught error! %s', e.message) used_time = watch.wall_elapsed('ensemble_builder') continue # Output the score logger.info('Training performance: %f' % trajectory[-1]) # Print the ensemble members: ensemble_members_run_numbers = dict() ensemble_members = Counter(indices).most_common() ensemble_members_string = 'Ensemble members:\n' logger.info(ensemble_members) for ensemble_member in ensemble_members: weight = float(ensemble_member[1]) / len(indices) ensemble_members_string += \ (' %s; weight: %10f; performance: %10f\n' % (indices_to_model_names[ensemble_member[0]], weight, model_names_to_scores[ indices_to_model_names[ensemble_member[0]]])) ensemble_members_run_numbers[ indices_to_run_num[ ensemble_member[0]]] = weight logger.info(ensemble_members_string) # Save the ensemble indices for later use! 
filename_indices = os.path.join(indices_output_dir, str(index_run).zfill(5) + '.indices') logger.info(ensemble_members_run_numbers) with open(filename_indices, 'w') as fh: pickle.dump(ensemble_members_run_numbers, fh) # Save predictions for valid and test data set if len(dir_valid_list) == len(dir_ensemble_list): ensemble_predictions_valid = np.mean( all_predictions_valid[indices.astype(int)], axis=0) filename_test = os.path.join( output_dir, basename + '_valid_' + str(index_run).zfill(3) + '.predict') save_predictions( os.path.join(predictions_dir, filename_test), ensemble_predictions_valid) else: logger.info('Could not find as many validation set predictions ' 'as ensemble predictions!.') if len(dir_test_list) == len(dir_ensemble_list): ensemble_predictions_test = np.mean( all_predictions_test[indices.astype(int)], axis=0) filename_test = os.path.join( output_dir, basename + '_test_' + str(index_run).zfill(3) + '.predict') save_predictions( os.path.join(predictions_dir, filename_test), ensemble_predictions_test) else: logger.info('Could not find as many test set predictions as ' 'ensemble predictions!') current_num_models = len(dir_ensemble_list) watch.stop_task('ensemble_iter_' + str(index_run)) time_iter = watch.get_wall_dur('ensemble_iter_' + str(index_run)) used_time = watch.wall_elapsed('ensemble_builder') index_run += 1 return
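# The n-best bookkeeping in main() above keeps at most ensemble_size
# models: better-than-random models fill the pool first, and afterwards a
# candidate replaces the argmin-scored member only if it scores higher.
# A self-contained sketch of that selection rule; update_nbest is an
# illustrative helper, not part of auto-sklearn.
import numpy as np

def update_nbest(scores_nbest, names_nbest, name, score, nbest):
    if score <= 0.001:
        return  # model only predicts at random, never keep it
    if len(scores_nbest) < nbest:
        scores_nbest.append(score)
        names_nbest.append(name)
        return
    # Take the worst performing model kept so far and replace it if beaten
    idx = int(np.argmin(scores_nbest))
    if scores_nbest[idx] < score:
        del scores_nbest[idx]
        del names_nbest[idx]
        scores_nbest.append(score)
        names_nbest.append(name)

_scores, _names = [], []
for _i, _s in enumerate([0.7, 0.0, 0.9, 0.8, 0.95]):
    update_nbest(_scores, _names, 'model_%d' % _i, _s, nbest=3)
print(_names, _scores)  # -> the three best above-random models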
def main(self): watch = StopWatch() watch.start_task('ensemble_builder') used_time = 0 time_iter = 0 index_run = 0 num_iteration = 0 current_num_models = 0 last_hash = None current_hash = None backend = Backend(self.output_dir, self.autosklearn_tmp_dir) dir_ensemble = os.path.join(self.autosklearn_tmp_dir, '.auto-sklearn', 'predictions_ensemble') dir_valid = os.path.join(self.autosklearn_tmp_dir, '.auto-sklearn', 'predictions_valid') dir_test = os.path.join(self.autosklearn_tmp_dir, '.auto-sklearn', 'predictions_test') paths_ = [dir_ensemble, dir_valid, dir_test] dir_ensemble_list_mtimes = [] self.logger.debug( 'Starting main loop with %f seconds and %d iterations ' 'left.' % (self.limit - used_time, num_iteration)) while used_time < self.limit or (self.max_iterations > 0 and self.max_iterations >= num_iteration): num_iteration += 1 self.logger.debug('Time left: %f', self.limit - used_time) self.logger.debug('Time last ensemble building: %f', time_iter) # Reload the ensemble targets every iteration, important, because cv may # update the ensemble targets in the cause of running auto-sklearn # TODO update cv in order to not need this any more! targets_ensemble = backend.load_targets_ensemble() # Load the predictions from the models exists = [os.path.isdir(dir_) for dir_ in paths_] if not exists[0]: # all(exists): self.logger.debug('Prediction directory %s does not exist!' % dir_ensemble) time.sleep(2) used_time = watch.wall_elapsed('ensemble_builder') continue if self.shared_mode is False: dir_ensemble_list = sorted( glob.glob( os.path.join( dir_ensemble, 'predictions_ensemble_%s_*.npy' % self.seed))) if exists[1]: dir_valid_list = sorted( glob.glob( os.path.join( dir_valid, 'predictions_valid_%s_*.npy' % self.seed))) else: dir_valid_list = [] if exists[2]: dir_test_list = sorted( glob.glob( os.path.join( dir_test, 'predictions_test_%s_*.npy' % self.seed))) else: dir_test_list = [] else: dir_ensemble_list = sorted(os.listdir(dir_ensemble)) dir_valid_list = sorted( os.listdir(dir_valid)) if exists[1] else [] dir_test_list = sorted( os.listdir(dir_test)) if exists[2] else [] # Check the modification times because predictions can be updated # over time! 
old_dir_ensemble_list_mtimes = dir_ensemble_list_mtimes
            dir_ensemble_list_mtimes = []

            for dir_ensemble_file in dir_ensemble_list:
                if dir_ensemble_file.endswith("/"):
                    dir_ensemble_file = dir_ensemble_file[:-1]
                basename = os.path.basename(dir_ensemble_file)
                dir_ensemble_file = os.path.join(dir_ensemble, basename)
                mtime = os.path.getmtime(dir_ensemble_file)
                dir_ensemble_list_mtimes.append(mtime)

            if len(dir_ensemble_list) == 0:
                self.logger.debug('Directories are empty')
                time.sleep(2)
                used_time = watch.wall_elapsed('ensemble_builder')
                continue

            if len(dir_ensemble_list) <= current_num_models and \
                    old_dir_ensemble_list_mtimes == dir_ensemble_list_mtimes:
                self.logger.debug('Nothing has changed since the last time')
                time.sleep(2)
                used_time = watch.wall_elapsed('ensemble_builder')
                continue

            watch.start_task('index_run' + str(index_run))
            watch.start_task('ensemble_iter_' + str(num_iteration))

            # List of num_runs (which are in the filename) which will be
            # included later
            include_num_runs = []
            backup_num_runs = []
            model_and_automl_re = re.compile(r'_([0-9]*)_([0-9]*)\.npy$')
            if self.ensemble_nbest is not None:
                # Keeps track of the single scores of each model in our
                # ensemble
                scores_nbest = []
                # The indices of the model that are currently in our ensemble
                indices_nbest = []
                # The names of the models
                model_names = []

            model_names_to_scores = dict()

            model_idx = 0
            for model_name in dir_ensemble_list:
                if model_name.endswith("/"):
                    model_name = model_name[:-1]
                basename = os.path.basename(model_name)

                try:
                    if self.precision == "16":
                        predictions = np.load(
                            os.path.join(dir_ensemble, basename)).astype(
                            dtype=np.float16)
                    elif self.precision == "32":
                        predictions = np.load(
                            os.path.join(dir_ensemble, basename)).astype(
                            dtype=np.float32)
                    elif self.precision == "64":
                        predictions = np.load(
                            os.path.join(dir_ensemble, basename)).astype(
                            dtype=np.float64)
                    else:
                        predictions = np.load(
                            os.path.join(dir_ensemble, basename))

                    score = calculate_score(targets_ensemble, predictions,
                                            self.task_type, self.metric,
                                            predictions.shape[1])

                except Exception as e:
                    self.logger.warning('Error loading %s: %s', basename, e)
                    score = -1

                model_names_to_scores[model_name] = score
                match = model_and_automl_re.search(model_name)
                automl_seed = int(match.group(1))
                num_run = int(match.group(2))

                if self.ensemble_nbest is not None:
                    if score <= 0.001:
                        self.logger.error('Model only predicts at random: ' +
                                          model_name + ' has score: ' +
                                          str(score))
                        backup_num_runs.append((automl_seed, num_run))
                    # If we have less models in our ensemble than
                    # ensemble_nbest add the current model if it is better
                    # than random
                    elif len(scores_nbest) < self.ensemble_nbest:
                        scores_nbest.append(score)
                        indices_nbest.append(model_idx)
                        include_num_runs.append((automl_seed, num_run))
                        model_names.append(model_name)
                    else:
                        # Take the worst performing model in our ensemble
                        # so far
                        idx = np.argmin(np.array([scores_nbest]))

                        # If the current model is better than the worst
                        # model in our ensemble replace it by the current
                        # model
                        if scores_nbest[idx] < score:
                            self.logger.debug(
                                'Worst model in our ensemble: %s with '
                                'score %f will be replaced by model %s '
                                'with score %f', model_names[idx],
                                scores_nbest[idx], model_name, score)
                            # Exclude the old model
                            del scores_nbest[idx]
                            scores_nbest.append(score)
                            del include_num_runs[idx]
                            del indices_nbest[idx]
                            indices_nbest.append(model_idx)
                            include_num_runs.append((automl_seed, num_run))
                            del model_names[idx]
                            model_names.append(model_name)

                        # Otherwise exclude the current model from the
                        # ensemble
                        else:
                            # include_num_runs.append(True)
                            pass

                else:
                    # Load all predictions that are better than random
                    if score <= 0.001:
                        # include_num_runs.append(True)
                        self.logger.error('Model only predicts at random: ' +
                                          model_name + ' has score: ' +
                                          str(score))
                        backup_num_runs.append((automl_seed, num_run))
                    else:
                        include_num_runs.append((automl_seed, num_run))

                model_idx += 1

            # If there is no model better than random guessing, we have to
            # use all models which do random guessing
            if len(include_num_runs) == 0:
                include_num_runs = backup_num_runs

            indices_to_model_names = dict()
            indices_to_run_num = dict()
            for i, model_name in enumerate(dir_ensemble_list):
                match = model_and_automl_re.search(model_name)
                automl_seed = int(match.group(1))
                num_run = int(match.group(2))
                if (automl_seed, num_run) in include_num_runs:
                    num_indices = len(indices_to_model_names)
                    indices_to_model_names[num_indices] = model_name
                    indices_to_run_num[num_indices] = (automl_seed, num_run)

            try:
                all_predictions_train, all_predictions_valid, \
                    all_predictions_test = self.get_all_predictions(
                        dir_ensemble, dir_ensemble_list,
                        dir_valid, dir_valid_list,
                        dir_test, dir_test_list,
                        include_num_runs,
                        model_and_automl_re,
                        self.precision)
            except IOError:
                self.logger.error('Could not load the predictions.')
                continue

            if len(include_num_runs) == 0:
                self.logger.error('All models do just random guessing')
                time.sleep(2)
                continue

            else:
                ensemble = EnsembleSelection(ensemble_size=self.ensemble_size,
                                             task_type=self.task_type,
                                             metric=self.metric)

                try:
                    ensemble.fit(all_predictions_train, targets_ensemble,
                                 include_num_runs)
                    self.logger.info(ensemble)

                except ValueError as e:
                    self.logger.error('Caught ValueError: ' + str(e))
                    used_time = watch.wall_elapsed('ensemble_builder')
                    time.sleep(2)
                    continue
                except IndexError as e:
                    self.logger.error('Caught IndexError: ' + str(e))
                    used_time = watch.wall_elapsed('ensemble_builder')
                    time.sleep(2)
                    continue
                except Exception as e:
                    self.logger.error('Caught error! %s', str(e))
                    used_time = watch.wall_elapsed('ensemble_builder')
                    time.sleep(2)
                    continue

                # Output the score
                self.logger.info('Training performance: %f' %
                                 ensemble.train_score_)

                self.logger.info(
                    'Building the ensemble took %f seconds' %
                    watch.wall_elapsed('ensemble_iter_' + str(num_iteration)))

            # Set this variable here to avoid re-running the ensemble
            # builder every two seconds in case the ensemble did not change
            current_num_models = len(dir_ensemble_list)

            ensemble_predictions = ensemble.predict(all_predictions_train)
            if sys.version_info[0] == 2:
                ensemble_predictions.flags.writeable = False
                current_hash = hash(ensemble_predictions.data)
            else:
                current_hash = hash(ensemble_predictions.data.tobytes())

            # Only output a new ensemble and new predictions if the output
            # of the ensemble would actually change!
            # TODO this is neither safe (collisions, tests only with the
            # ensemble prediction, but not the ensemble), implement a hash
            # function for each possible ensemble builder.
            if last_hash is not None:
                if current_hash == last_hash:
                    self.logger.info('Ensemble output did not change.')
                    time.sleep(2)
                    continue
                else:
                    last_hash = current_hash
            else:
                last_hash = current_hash

            # Save the ensemble for later use in the main auto-sklearn
            # module!
backend.save_ensemble(ensemble, index_run, self.seed) # Save predictions for valid and test data set if len(dir_valid_list) == len(dir_ensemble_list): all_predictions_valid = np.array(all_predictions_valid) ensemble_predictions_valid = ensemble.predict( all_predictions_valid) if self.task_type == BINARY_CLASSIFICATION: ensemble_predictions_valid = ensemble_predictions_valid[:, 1] if self.low_precision: if self.task_type in [ BINARY_CLASSIFICATION, MULTICLASS_CLASSIFICATION, MULTILABEL_CLASSIFICATION ]: ensemble_predictions_valid[ ensemble_predictions_valid < 1e-4] = 0. if self.metric in [BAC_METRIC, F1_METRIC]: bin_array = np.zeros(ensemble_predictions_valid.shape, dtype=np.int32) if (self.task_type != MULTICLASS_CLASSIFICATION) or ( ensemble_predictions_valid.shape[1] == 1): bin_array[ensemble_predictions_valid >= 0.5] = 1 else: sample_num = ensemble_predictions_valid.shape[0] for i in range(sample_num): j = np.argmax(ensemble_predictions_valid[i, :]) bin_array[i, j] = 1 ensemble_predictions_valid = bin_array if self.task_type in CLASSIFICATION_TASKS: if ensemble_predictions_valid.size < (20000 * 20): precision = 3 else: precision = 2 else: if ensemble_predictions_valid.size > 1000000: precision = 4 else: # File size maximally 2.1MB precision = 6 backend.save_predictions_as_txt(ensemble_predictions_valid, 'valid', index_run, prefix=self.dataset_name, precision=precision) else: self.logger.info( 'Could not find as many validation set predictions (%d)' 'as ensemble predictions (%d)!.', len(dir_valid_list), len(dir_ensemble_list)) del all_predictions_valid if len(dir_test_list) == len(dir_ensemble_list): all_predictions_test = np.array(all_predictions_test) ensemble_predictions_test = ensemble.predict( all_predictions_test) if self.task_type == BINARY_CLASSIFICATION: ensemble_predictions_test = ensemble_predictions_test[:, 1] if self.low_precision: if self.task_type in [ BINARY_CLASSIFICATION, MULTICLASS_CLASSIFICATION, MULTILABEL_CLASSIFICATION ]: ensemble_predictions_test[ ensemble_predictions_test < 1e-4] = 0. if self.metric in [BAC_METRIC, F1_METRIC]: bin_array = np.zeros(ensemble_predictions_test.shape, dtype=np.int32) if (self.task_type != MULTICLASS_CLASSIFICATION) or ( ensemble_predictions_test.shape[1] == 1): bin_array[ensemble_predictions_test >= 0.5] = 1 else: sample_num = ensemble_predictions_test.shape[0] for i in range(sample_num): j = np.argmax(ensemble_predictions_test[i, :]) bin_array[i, j] = 1 ensemble_predictions_test = bin_array if self.task_type in CLASSIFICATION_TASKS: if ensemble_predictions_test.size < (20000 * 20): precision = 3 else: precision = 2 else: if ensemble_predictions_test.size > 1000000: precision = 4 else: precision = 6 backend.save_predictions_as_txt(ensemble_predictions_test, 'test', index_run, prefix=self.dataset_name, precision=precision) else: self.logger.info( 'Could not find as many test set predictions (%d) as ' 'ensemble predictions (%d)!', len(dir_test_list), len(dir_ensemble_list)) del all_predictions_test current_num_models = len(dir_ensemble_list) watch.stop_task('index_run' + str(index_run)) time_iter = watch.get_wall_dur('index_run' + str(index_run)) used_time = watch.wall_elapsed('ensemble_builder') index_run += 1 return
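# main() above only writes a new ensemble when the hash of the raw
# prediction bytes differs from the previous iteration; the TODO in the
# code notes this is collision-prone rather than a safe equality test.
# A minimal sketch (not auto-sklearn code) of that change check:
import numpy as np

last_hash = None
for preds in (np.ones(4), np.ones(4), np.zeros(4)):
    current_hash = hash(preds.data.tobytes())
    if last_hash is not None and current_hash == last_hash:
        print('Ensemble output did not change, skipping save.')
    else:
        print('Saving new ensemble output.')
    last_hash = current_hash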
def main(autosklearn_tmp_dir, dataset_name, task_type, metric, limit, output_dir, ensemble_size=None, ensemble_nbest=None, seed=1, shared_mode=False, max_iterations=-1, precision="32"): watch = StopWatch() watch.start_task('ensemble_builder') used_time = 0 time_iter = 0 index_run = 0 num_iteration = 0 current_num_models = 0 backend = Backend(output_dir, autosklearn_tmp_dir) dir_ensemble = os.path.join(autosklearn_tmp_dir, '.auto-sklearn', 'predictions_ensemble') dir_valid = os.path.join(autosklearn_tmp_dir, '.auto-sklearn', 'predictions_valid') dir_test = os.path.join(autosklearn_tmp_dir, '.auto-sklearn', 'predictions_test') paths_ = [dir_ensemble, dir_valid, dir_test] dir_ensemble_list_mtimes = [] while used_time < limit or (max_iterations > 0 and max_iterations >= num_iteration): num_iteration += 1 logger.debug('Time left: %f', limit - used_time) logger.debug('Time last iteration: %f', time_iter) # Reload the ensemble targets every iteration, important, because cv may # update the ensemble targets in the cause of running auto-sklearn # TODO update cv in order to not need this any more! targets_ensemble = backend.load_targets_ensemble() # Load the predictions from the models exists = [os.path.isdir(dir_) for dir_ in paths_] if not exists[0]: # all(exists): logger.debug('Prediction directory %s does not exist!' % dir_ensemble) time.sleep(2) used_time = watch.wall_elapsed('ensemble_builder') continue if shared_mode is False: dir_ensemble_list = sorted(glob.glob(os.path.join( dir_ensemble, 'predictions_ensemble_%s_*.npy' % seed))) if exists[1]: dir_valid_list = sorted(glob.glob(os.path.join( dir_valid, 'predictions_valid_%s_*.npy' % seed))) else: dir_valid_list = [] if exists[2]: dir_test_list = sorted(glob.glob(os.path.join( dir_test, 'predictions_test_%s_*.npy' % seed))) else: dir_test_list = [] else: dir_ensemble_list = sorted(os.listdir(dir_ensemble)) dir_valid_list = sorted(os.listdir(dir_valid)) if exists[1] else [] dir_test_list = sorted(os.listdir(dir_test)) if exists[2] else [] # Check the modification times because predictions can be updated # over time! 
old_dir_ensemble_list_mtimes = dir_ensemble_list_mtimes
        dir_ensemble_list_mtimes = []

        for dir_ensemble_file in dir_ensemble_list:
            if dir_ensemble_file.endswith("/"):
                dir_ensemble_file = dir_ensemble_file[:-1]
            basename = os.path.basename(dir_ensemble_file)
            dir_ensemble_file = os.path.join(dir_ensemble, basename)
            mtime = os.path.getmtime(dir_ensemble_file)
            dir_ensemble_list_mtimes.append(mtime)

        if len(dir_ensemble_list) == 0:
            logger.debug('Directories are empty')
            time.sleep(2)
            used_time = watch.wall_elapsed('ensemble_builder')
            continue

        if len(dir_ensemble_list) <= current_num_models and \
                old_dir_ensemble_list_mtimes == dir_ensemble_list_mtimes:
            logger.debug('Nothing has changed since the last time')
            time.sleep(2)
            used_time = watch.wall_elapsed('ensemble_builder')
            continue

        watch.start_task('ensemble_iter_' + str(index_run))

        # List of num_runs (which are in the filename) which will be
        # included later
        include_num_runs = []
        backup_num_runs = []
        model_and_automl_re = re.compile(r'_([0-9]*)_([0-9]*)\.npy$')
        if ensemble_nbest is not None:
            # Keeps track of the single scores of each model in our ensemble
            scores_nbest = []
            # The indices of the model that are currently in our ensemble
            indices_nbest = []
            # The names of the models
            model_names = []

        model_names_to_scores = dict()

        model_idx = 0
        for model_name in dir_ensemble_list:
            if model_name.endswith("/"):
                model_name = model_name[:-1]
            basename = os.path.basename(model_name)

            if precision == "16":
                predictions = np.load(
                    os.path.join(dir_ensemble, basename)).astype(
                    dtype=np.float16)
            elif precision == "32":
                predictions = np.load(
                    os.path.join(dir_ensemble, basename)).astype(
                    dtype=np.float32)
            elif precision == "64":
                predictions = np.load(
                    os.path.join(dir_ensemble, basename)).astype(
                    dtype=np.float64)
            else:
                predictions = np.load(os.path.join(dir_ensemble, basename))

            try:
                score = calculate_score(targets_ensemble, predictions,
                                        task_type, metric,
                                        predictions.shape[1])
            except Exception:
                score = -1

            model_names_to_scores[model_name] = score
            match = model_and_automl_re.search(model_name)
            automl_seed = int(match.group(1))
            num_run = int(match.group(2))

            if ensemble_nbest is not None:
                if score <= 0.001:
                    logger.error('Model only predicts at random: ' +
                                 model_name + ' has score: ' + str(score))
                    backup_num_runs.append((automl_seed, num_run))
                # If we have less models in our ensemble than ensemble_nbest
                # add the current model if it is better than random
                elif len(scores_nbest) < ensemble_nbest:
                    scores_nbest.append(score)
                    indices_nbest.append(model_idx)
                    include_num_runs.append((automl_seed, num_run))
                    model_names.append(model_name)
                else:
                    # Take the worst performing model in our ensemble so far
                    idx = np.argmin(np.array([scores_nbest]))

                    # If the current model is better than the worst model in
                    # our ensemble replace it by the current model
                    if scores_nbest[idx] < score:
                        logger.debug('Worst model in our ensemble: %s with '
                                     'score %f will be replaced by model %s '
                                     'with score %f', model_names[idx],
                                     scores_nbest[idx], model_name, score)
                        # Exclude the old model
                        del scores_nbest[idx]
                        scores_nbest.append(score)
                        del include_num_runs[idx]
                        del indices_nbest[idx]
                        indices_nbest.append(model_idx)
                        include_num_runs.append((automl_seed, num_run))
                        del model_names[idx]
                        model_names.append(model_name)

                    # Otherwise exclude the current model from the ensemble
                    else:
                        # include_num_runs.append(True)
                        pass
            else:
                # Load all predictions that are better than random
                if score <= 0.001:
                    # include_num_runs.append(True)
                    logger.error('Model only predicts at random: ' +
                                 model_name + ' has score: ' + str(score))
                    backup_num_runs.append((automl_seed, num_run))
                else:
                    include_num_runs.append((automl_seed, num_run))

            model_idx += 1

        # If there is no model better than random guessing, we have to use
        # all models which do random guessing
        if len(include_num_runs) == 0:
            include_num_runs = backup_num_runs

        indices_to_model_names = dict()
        indices_to_run_num = dict()
        for i, model_name in enumerate(dir_ensemble_list):
            match = model_and_automl_re.search(model_name)
            automl_seed = int(match.group(1))
            num_run = int(match.group(2))
            if (automl_seed, num_run) in include_num_runs:
                num_indices = len(indices_to_model_names)
                indices_to_model_names[num_indices] = model_name
                indices_to_run_num[num_indices] = (automl_seed, num_run)

        try:
            all_predictions_train, all_predictions_valid, \
                all_predictions_test = get_all_predictions(
                    dir_ensemble, dir_ensemble_list,
                    dir_valid, dir_valid_list,
                    dir_test, dir_test_list,
                    include_num_runs, model_and_automl_re, precision)
        except IOError:
            logger.error('Could not load the predictions.')
            continue

        if len(include_num_runs) == 0:
            logger.error('All models do just random guessing')
            time.sleep(2)
            continue

        else:
            ensemble = EnsembleSelection(ensemble_size=ensemble_size,
                                         task_type=task_type,
                                         metric=metric)

            try:
                ensemble.fit(all_predictions_train, targets_ensemble,
                             include_num_runs)
                logger.info(ensemble)

            except ValueError as e:
                logger.error('Caught ValueError: ' + str(e))
                used_time = watch.wall_elapsed('ensemble_builder')
                time.sleep(2)
                continue
            except IndexError as e:
                logger.error('Caught IndexError: ' + str(e))
                used_time = watch.wall_elapsed('ensemble_builder')
                time.sleep(2)
                continue
            except Exception as e:
                logger.error('Caught error! %s', str(e))
                used_time = watch.wall_elapsed('ensemble_builder')
                time.sleep(2)
                continue

            # Output the score
            logger.info('Training performance: %f' % ensemble.train_score_)

        # Save the ensemble for later use in the main auto-sklearn module!
        backend.save_ensemble(ensemble, index_run, seed)

        # Save predictions for valid and test data set
        if len(dir_valid_list) == len(dir_ensemble_list):
            all_predictions_valid = np.array(all_predictions_valid)
            ensemble_predictions_valid = ensemble.predict(
                all_predictions_valid)
            backend.save_predictions_as_txt(ensemble_predictions_valid,
                                            'valid', index_run,
                                            prefix=dataset_name)
        else:
            logger.info('Could not find as many validation set predictions '
                        '(%d) as ensemble predictions (%d)!',
                        len(dir_valid_list), len(dir_ensemble_list))

        del all_predictions_valid

        if len(dir_test_list) == len(dir_ensemble_list):
            all_predictions_test = np.array(all_predictions_test)
            ensemble_predictions_test = ensemble.predict(
                all_predictions_test)
            backend.save_predictions_as_txt(ensemble_predictions_test,
                                            'test', index_run,
                                            prefix=dataset_name)
        else:
            logger.info('Could not find as many test set predictions (%d) '
                        'as ensemble predictions (%d)!',
                        len(dir_test_list), len(dir_ensemble_list))

        del all_predictions_test

        current_num_models = len(dir_ensemble_list)
        watch.stop_task('ensemble_iter_' + str(index_run))
        time_iter = watch.get_wall_dur('ensemble_iter_' + str(index_run))
        used_time = watch.wall_elapsed('ensemble_builder')
        index_run += 1

    return
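# All three builder loops above recover (automl_seed, num_run) from the
# prediction filename via the same regular expression. A small sketch of
# that parsing step; the filename is illustrative only:
import re

model_and_automl_re = re.compile(r'_([0-9]*)_([0-9]*)\.npy$')

_model_name = 'predictions_ensemble_1_0003.npy'
_match = model_and_automl_re.search(_model_name)
print(int(_match.group(1)), int(_match.group(2)))  # -> 1 3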
class AutoML(BaseEstimator): def __init__( self, backend, time_left_for_this_task, per_run_time_limit, initial_configurations_via_metalearning=25, ensemble_size=1, ensemble_nbest=1, seed=1, ml_memory_limit=3072, metadata_directory=None, keep_models=True, debug_mode=False, include_estimators=None, exclude_estimators=None, include_preprocessors=None, exclude_preprocessors=None, resampling_strategy='holdout-iterative-fit', resampling_strategy_arguments=None, shared_mode=False, precision=32, disable_evaluator_output=False, get_smac_object_callback=None, smac_scenario_args=None, ): super(AutoML, self).__init__() self._backend = backend #self._tmp_dir = tmp_dir #self._output_dir = output_dir self._time_for_task = time_left_for_this_task self._per_run_time_limit = per_run_time_limit self._initial_configurations_via_metalearning = \ initial_configurations_via_metalearning self._ensemble_size = ensemble_size self._ensemble_nbest = ensemble_nbest self._seed = seed self._ml_memory_limit = ml_memory_limit self._data_memory_limit = None self._metadata_directory = metadata_directory self._keep_models = keep_models self._include_estimators = include_estimators self._exclude_estimators = exclude_estimators self._include_preprocessors = include_preprocessors self._exclude_preprocessors = exclude_preprocessors self._resampling_strategy = resampling_strategy self._resampling_strategy_arguments = resampling_strategy_arguments \ if resampling_strategy_arguments is not None else {} self._shared_mode = shared_mode self.precision = precision self._disable_evaluator_output = disable_evaluator_output self._get_smac_object_callback = get_smac_object_callback self._smac_scenario_args = smac_scenario_args self._datamanager = None self._dataset_name = None self._stopwatch = StopWatch() self._logger = None self._task = None self._metric = None self._label_num = None self._parser = None self.models_ = None self.ensemble_ = None self._can_predict = False self._debug_mode = debug_mode if not isinstance(self._time_for_task, int): raise ValueError("time_left_for_this_task not of type integer, " "but %s" % str(type(self._time_for_task))) if not isinstance(self._per_run_time_limit, int): raise ValueError("per_run_time_limit not of type integer, but %s" % str(type(self._per_run_time_limit))) # After assignging and checking variables... #self._backend = Backend(self._output_dir, self._tmp_dir) def fit(self, X, y, task=MULTICLASS_CLASSIFICATION, metric=None, feat_type=None, dataset_name=None): if not self._shared_mode: self._backend.context.delete_directories() else: # If this fails, it's likely that this is the first call to get # the data manager try: D = self._backend.load_datamanager() dataset_name = D.name except IOError: pass self._backend.context.create_directories() if dataset_name is None: dataset_name = hash_array_or_matrix(X) self._backend.save_start_time(self._seed) self._stopwatch = StopWatch() self._dataset_name = dataset_name self._stopwatch.start_task(self._dataset_name) self._logger = self._get_logger(dataset_name) if metric is None: raise ValueError('No metric given.') if not isinstance(metric, Scorer): raise ValueError('Metric must be instance of ' 'autosklearn.metrics.Scorer.') if feat_type is not None and len(feat_type) != X.shape[1]: raise ValueError('Array feat_type does not have same number of ' 'variables as X has features. %d vs %d.' 
% (len(feat_type), X.shape[1])) if feat_type is not None and not all( [isinstance(f, str) for f in feat_type]): raise ValueError('Array feat_type must only contain strings.') if feat_type is not None: for ft in feat_type: if ft.lower() not in ['categorical', 'numerical']: raise ValueError('Only `Categorical` and `Numerical` are ' 'valid feature types, you passed `%s`' % ft) self._data_memory_limit = None loaded_data_manager = XYDataManager(X, y, task=task, feat_type=feat_type, dataset_name=dataset_name) return self._fit(loaded_data_manager, metric) def fit_automl_dataset(self, dataset, metric): self._stopwatch = StopWatch() self._backend.save_start_time(self._seed) name = os.path.basename(dataset) self._stopwatch.start_task(name) self._start_task(self._stopwatch, name) self._dataset_name = name self._logger = self._get_logger(name) self._logger.debug('======== Reading and converting data ==========') # Encoding the labels will be done after the metafeature calculation! self._data_memory_limit = float(self._ml_memory_limit) / 3 loaded_data_manager = CompetitionDataManager( dataset, max_memory_in_mb=self._data_memory_limit) loaded_data_manager_str = str(loaded_data_manager).split('\n') for part in loaded_data_manager_str: self._logger.debug(part) return self._fit(loaded_data_manager, metric) def fit_on_datamanager(self, datamanager, metric): self._stopwatch = StopWatch() self._backend.save_start_time(self._seed) name = os.path.basename(datamanager.name) self._stopwatch.start_task(name) self._start_task(self._stopwatch, name) self._dataset_name = name self._logger = self._get_logger(name) self._fit(datamanager, metric) def _get_logger(self, name): logger_name = 'AutoML(%d):%s' % (self._seed, name) setup_logger( os.path.join(self._backend.temporary_directory, '%s.log' % str(logger_name))) return get_logger(logger_name) @staticmethod def _start_task(watcher, task_name): watcher.start_task(task_name) @staticmethod def _stop_task(watcher, task_name): watcher.stop_task(task_name) @staticmethod def _print_load_time(basename, time_left_for_this_task, time_for_load_data, logger): time_left_after_reading = max( 0, time_left_for_this_task - time_for_load_data) logger.info('Remaining time after reading %s %5.2f sec' % (basename, time_left_after_reading)) return time_for_load_data def _do_dummy_prediction(self, datamanager, num_run): # When using partial-cv it makes no sense to do dummy predictions if self._resampling_strategy in [ 'partial-cv', 'partial-cv-iterative-fit' ]: return num_run self._logger.info("Starting to create dummy predictions.") memory_limit = int(self._ml_memory_limit) scenario_mock = unittest.mock.Mock() scenario_mock.wallclock_limit = self._time_for_task # This stats object is a hack - maybe the SMAC stats object should # already be generated here! 
        stats = Stats(scenario_mock)
        stats.start_timing()
        ta = ExecuteTaFuncWithQueue(
            backend=self._backend,
            autosklearn_seed=self._seed,
            resampling_strategy=self._resampling_strategy,
            initial_num_run=num_run,
            logger=self._logger,
            stats=stats,
            metric=self._metric,
            memory_limit=memory_limit,
            disable_file_output=self._disable_evaluator_output,
            **self._resampling_strategy_arguments)

        status, cost, runtime, additional_info = \
            ta.run(1, cutoff=self._time_for_task)

        if status == StatusType.SUCCESS:
            self._logger.info("Finished creating dummy predictions.")
        else:
            self._logger.error('Error creating dummy predictions: %s ',
                               str(additional_info))

        return ta.num_run

    def _fit(self, datamanager, metric):
        # Reset learnt stuff
        self.models_ = None
        self.ensemble_ = None

        # Check arguments prior to doing anything!
        if not isinstance(self._disable_evaluator_output, (bool, list)):
            raise ValueError('disable_evaluator_output must be of type bool '
                             'or list.')
        if isinstance(self._disable_evaluator_output, list):
            allowed_elements = ['model', 'y_optimization']
            for element in self._disable_evaluator_output:
                if element not in allowed_elements:
                    raise ValueError("List member '%s' for argument "
                                     "'disable_evaluator_output' must be one "
                                     "of %s" % (element, allowed_elements))
        if self._resampling_strategy not in [
                'holdout', 'holdout-iterative-fit', 'cv',
                'partial-cv', 'partial-cv-iterative-fit']:
            raise ValueError('Illegal resampling strategy: %s' %
                             self._resampling_strategy)
        if self._resampling_strategy in ['partial-cv',
                                         'partial-cv-iterative-fit'] \
                and self._ensemble_size != 0:
            raise ValueError("Resampling strategy %s cannot be used "
                             "together with ensembles." %
                             self._resampling_strategy)
        if self._resampling_strategy in ['partial-cv', 'cv',
                                         'partial-cv-iterative-fit'] and \
                'folds' not in self._resampling_strategy_arguments:
            self._resampling_strategy_arguments['folds'] = 5

        self._backend._make_internals_directory()
        if self._keep_models:
            try:
                os.makedirs(self._backend.get_model_dir())
            except (OSError, FileExistsError):
                if not self._shared_mode:
                    raise

        self._metric = metric
        self._task = datamanager.info['task']
        self._label_num = datamanager.info['label_num']

        # == Pickle the data manager to speed up loading
        data_manager_path = self._backend.save_datamanager(datamanager)

        time_for_load_data = self._stopwatch.wall_elapsed(self._dataset_name)

        if self._debug_mode:
            self._print_load_time(self._dataset_name,
                                  self._time_for_task,
                                  time_for_load_data,
                                  self._logger)

        # == Perform dummy predictions
        num_run = 1
        #if self._resampling_strategy in ['holdout', 'holdout-iterative-fit']:
        num_run = self._do_dummy_prediction(datamanager, num_run)

        # = Create a searchspace
        # Do this before One Hot Encoding to make sure that it creates a
        # search space for a dense classifier even if one hot encoding would
        # make it sparse (tradeoff; if one hot encoding would make it sparse,
        # densifier and truncatedSVD would probably lead to a MemoryError,
        # like this we can't use some of the preprocessing methods in case
        # the data became sparse)
        self.configuration_space, configspace_path = self._create_search_space(
            self._backend.temporary_directory,
            self._backend,
            datamanager,
            include_estimators=self._include_estimators,
            exclude_estimators=self._exclude_estimators,
            include_preprocessors=self._include_preprocessors,
            exclude_preprocessors=self._exclude_preprocessors)

        # == RUN ensemble builder
        # Do this before calculating the meta-features to make sure that the
        # dummy predictions are actually included in the ensemble even if
        # calculating the meta-features takes very long
        ensemble_task_name = 'runEnsemble'
        self._stopwatch.start_task(ensemble_task_name)
        time_left_for_ensembles = max(
            0, self._time_for_task -
            self._stopwatch.wall_elapsed(self._dataset_name))
        if self._logger:
            self._logger.info('Start Ensemble with %5.2fsec time left' %
                              time_left_for_ensembles)
        if time_left_for_ensembles <= 0:
            self._logger.warning("Not starting ensemble builder because there "
                                 "is no time left!")
            self._proc_ensemble = None
        else:
            self._proc_ensemble = self._get_ensemble_process(
                time_left_for_ensembles)
            if self._ensemble_size > 0:
                self._proc_ensemble.start()
            else:
                self._logger.info('Not starting ensemble builder because '
                                  'ensemble size is <= 0.')
        self._stopwatch.stop_task(ensemble_task_name)

        # kill the datamanager as it will be re-loaded anyways from sub
        # processes
        try:
            del self._datamanager
        except Exception:
            pass

        # => RUN SMAC
        smac_task_name = 'runSMAC'
        self._stopwatch.start_task(smac_task_name)
        time_left_for_smac = max(
            0, self._time_for_task -
            self._stopwatch.wall_elapsed(self._dataset_name))

        if self._logger:
            self._logger.info('Start SMAC with %5.2fsec time left' %
                              time_left_for_smac)
        if time_left_for_smac <= 0:
            self._logger.warning("Not starting SMAC because there is no time "
                                 "left.")
            _proc_smac = None
        else:
            if self._per_run_time_limit is None or \
                    self._per_run_time_limit > time_left_for_smac:
                print('Time limit for a single run is higher than total time '
                      'limit. Capping the limit for a single run to the total '
                      'time given to SMAC (%f)' % time_left_for_smac)
                per_run_time_limit = time_left_for_smac
            else:
                per_run_time_limit = self._per_run_time_limit

            _proc_smac = AutoMLSMBO(
                config_space=self.configuration_space,
                dataset_name=self._dataset_name,
                backend=self._backend,
                total_walltime_limit=time_left_for_smac,
                func_eval_time_limit=per_run_time_limit,
                memory_limit=self._ml_memory_limit,
                data_memory_limit=self._data_memory_limit,
                watcher=self._stopwatch,
                start_num_run=num_run,
                num_metalearning_cfgs=self._initial_configurations_via_metalearning,
                config_file=configspace_path,
                seed=self._seed,
                metadata_directory=self._metadata_directory,
                metric=self._metric,
                resampling_strategy=self._resampling_strategy,
                resampling_strategy_args=self._resampling_strategy_arguments,
                shared_mode=self._shared_mode,
                include_estimators=self._include_estimators,
                exclude_estimators=self._exclude_estimators,
                include_preprocessors=self._include_preprocessors,
                exclude_preprocessors=self._exclude_preprocessors,
                disable_file_output=self._disable_evaluator_output,
                get_smac_object_callback=self._get_smac_object_callback,
                smac_scenario_args=self._smac_scenario_args,
            )
            self.runhistory_, self.trajectory_ = _proc_smac.run_smbo()
            trajectory_filename = os.path.join(
                self._backend.get_smac_output_directory_for_run(self._seed),
                'trajectory.json')
            saveable_trajectory = \
                [list(entry[:2]) + [entry[2].get_dictionary()] +
                 list(entry[3:])
                 for entry in self.trajectory_]
            with open(trajectory_filename, 'w') as fh:
                json.dump(saveable_trajectory, fh)

        # Wait until the ensemble process is finished to avoid shutting down
        # while the ensemble builder tries to access the data
        if self._proc_ensemble is not None and self._ensemble_size > 0:
            self._proc_ensemble.join()

        self._proc_ensemble = None
        self._load_models()

        return self

    def refit(self, X, y):
        def send_warnings_to_log(message, category, filename, lineno,
                                 file=None, line=None):
            self._logger.debug('%s:%s: %s:%s' %
                               (filename, lineno, category.__name__, message))
            return

        if self._keep_models is not True:
            raise ValueError(
                "Refit can only be called if 'keep_models==True'")
        if self.models_ is None or len(self.models_) == 0 or \
                self.ensemble_ is None:
            self._load_models()

        random_state = np.random.RandomState(self._seed)
        for identifier in self.models_:
            if identifier in self.ensemble_.get_model_identifiers():
                model = self.models_[identifier]
                # this updates the model inplace, it can then later be used in
                # predict method

                # try to fit the model. If it fails, shuffle the data. This
                # could alleviate the problem in algorithms that depend on
                # the ordering of the data.
                for i in range(10):
                    try:
                        with warnings.catch_warnings():
                            warnings.showwarning = send_warnings_to_log
                            model.fit(X.copy(), y.copy())
                        break
                    except ValueError as e:
                        indices = list(range(X.shape[0]))
                        random_state.shuffle(indices)
                        X = X[indices]
                        y = y[indices]

                        if i == 9:
                            raise e

        self._can_predict = True
        return self

    def predict(self, X, batch_size=None, n_jobs=1):
        """predict.

        Parameters
        ----------
        X: array-like, shape = (n_samples, n_features)

        batch_size: int or None, defaults to None
            batch_size controls whether the pipelines will be called on small
            chunks of the data. Useful when calling the predict method on the
            whole array X results in a MemoryError.

        n_jobs: int, defaults to 1
            Parallelize the predictions across the models with n_jobs
            processes.
        """
        if self._keep_models is not True:
            raise ValueError(
                "Predict can only be called if 'keep_models==True'")
        if not self._can_predict and \
                self._resampling_strategy not in \
                ['holdout', 'holdout-iterative-fit']:
            raise NotImplementedError(
                'Predict is currently not implemented for resampling '
                'strategy %s, please call refit().' %
                self._resampling_strategy)

        if self.models_ is None or len(self.models_) == 0 or \
                self.ensemble_ is None:
            self._load_models()

        # Parallelize predictions across models with n_jobs processes.
        # Each process computes predictions in chunks of batch_size rows.
        all_predictions = joblib.Parallel(n_jobs=n_jobs)(
            joblib.delayed(_model_predict)(self, X, batch_size, identifier)
            for identifier in self.ensemble_.get_model_identifiers())

        if len(all_predictions) == 0:
            raise ValueError('Something went wrong generating the predictions. '
                             'The ensemble should consist of the following '
                             'models: %s, the following models were loaded: '
                             '%s' % (str(list(self.ensemble_indices_.keys())),
                                     str(list(self.models_.keys()))))

        predictions = self.ensemble_.predict(all_predictions)
        return predictions

    def fit_ensemble(self, y, task=None, metric=None, precision='32',
                     dataset_name=None, ensemble_nbest=None,
                     ensemble_size=None):
        if self._resampling_strategy in ['partial-cv',
                                         'partial-cv-iterative-fit']:
            raise ValueError('Cannot call fit_ensemble with resampling '
                             'strategy %s.' % self._resampling_strategy)

        if self._logger is None:
            self._logger = self._get_logger(dataset_name)

        self._proc_ensemble = self._get_ensemble_process(
            1, task, metric, precision, dataset_name, max_iterations=1,
            ensemble_nbest=ensemble_nbest, ensemble_size=ensemble_size)
        self._proc_ensemble.main()
        self._proc_ensemble = None
        return self

    def _get_ensemble_process(self, time_left_for_ensembles,
                              task=None, metric=None, precision=None,
                              dataset_name=None, max_iterations=-1,
                              ensemble_nbest=None, ensemble_size=None):
        if task is None:
            task = self._task
        else:
            self._task = task

        if metric is None:
            metric = self._metric
        else:
            self._metric = metric

        if precision is None:
            precision = self.precision
        else:
            self.precision = precision

        if dataset_name is None:
            dataset_name = self._dataset_name
        else:
            self._dataset_name = dataset_name

        if ensemble_nbest is None:
            ensemble_nbest = self._ensemble_nbest
        else:
            self._ensemble_nbest = ensemble_nbest

        if ensemble_size is None:
            ensemble_size = self._ensemble_size
        else:
            self._ensemble_size = ensemble_size

        return EnsembleBuilder(backend=self._backend,
                               dataset_name=dataset_name,
                               task_type=task,
                               metric=metric,
                               limit=time_left_for_ensembles,
                               ensemble_size=ensemble_size,
                               ensemble_nbest=ensemble_nbest,
                               seed=self._seed,
                               shared_mode=self._shared_mode,
                               precision=precision,
                               max_iterations=max_iterations)

    def _load_models(self):
        if self._shared_mode:
            seed = -1
        else:
            seed = self._seed

        self.ensemble_ = self._backend.load_ensemble(seed)
        if self.ensemble_:
            identifiers = self.ensemble_.identifiers_
            self.models_ = self._backend.load_models_by_identifiers(
                identifiers)
            if len(self.models_) == 0 and self._resampling_strategy not in \
                    ['partial-cv', 'partial-cv-iterative-fit']:
                raise ValueError('No models fitted!')
        elif self._disable_evaluator_output is False or \
                (isinstance(self._disable_evaluator_output, list) and
                 'model' not in self._disable_evaluator_output):
            model_names = self._backend.list_all_models(seed)
            if len(model_names) == 0 and self._resampling_strategy not in \
                    ['partial-cv', 'partial-cv-iterative-fit']:
                raise ValueError('No models fitted!')
            self.models_ = []
        else:
            self.models_ = []

    def score(self, X, y):
        # fix: Consider only index 1 of second dimension
        # Don't know if the reshaping should be done there or in
        # calculate_score
        prediction = self.predict(X)
        return calculate_score(solution=y,
                               prediction=prediction,
                               task_type=self._task,
                               metric=self._metric,
                               all_scoring_functions=False)

    @property
    def cv_results_(self):
        results = dict()

        # Missing in contrast to scikit-learn
        # splitX_test_score - auto-sklearn does not store the scores on a split
        # basis
        # std_test_score - auto-sklearn does not store the scores on a split
        # basis
        # splitX_train_score - auto-sklearn does not compute train scores, add
        # flag to compute the train scores
        # mean_train_score - auto-sklearn does not store the train scores
        # std_train_score - auto-sklearn does not store the train scores
        # std_fit_time - auto-sklearn does not store the fit times per split
        # mean_score_time - auto-sklearn does not store the score time
        # std_score_time - auto-sklearn does not store the score time
        # TODO: add those arguments

        # TODO remove this restriction!
        if self._resampling_strategy in ['partial-cv',
                                         'partial-cv-iterative-fit']:
            raise ValueError('Cannot call cv_results when using partial-cv!')

        parameter_dictionaries = dict()
        masks = dict()
        hp_names = []

        # Set up dictionary for parameter values
        for hp in self.configuration_space.get_hyperparameters():
            name = hp.name
            parameter_dictionaries[name] = []
            masks[name] = []
            hp_names.append(name)

        mean_test_score = []
        mean_fit_time = []
        params = []
        status = []
        for run_key in self.runhistory_.data:
            run_value = self.runhistory_.data[run_key]
            config_id = run_key.config_id
            config = self.runhistory_.ids_config[config_id]

            param_dict = config.get_dictionary()
            params.append(param_dict)
            mean_test_score.append(1 - run_value.cost)
            mean_fit_time.append(run_value.time)
            s = run_value.status
            if s == StatusType.SUCCESS:
                status.append('Success')
            elif s == StatusType.TIMEOUT:
                status.append('Timeout')
            elif s == StatusType.CRASHED:
                status.append('Crash')
            elif s == StatusType.ABORT:
                status.append('Abort')
            elif s == StatusType.MEMOUT:
                status.append('Memout')
            else:
                raise NotImplementedError(s)

            for hp_name in hp_names:
                if hp_name in param_dict:
                    hp_value = param_dict[hp_name]
                    mask_value = False
                else:
                    hp_value = np.NaN
                    mask_value = True

                parameter_dictionaries[hp_name].append(hp_value)
                masks[hp_name].append(mask_value)

        results['mean_test_score'] = np.array(mean_test_score)
        results['mean_fit_time'] = np.array(mean_fit_time)
        results['params'] = params
        results['rank_test_scores'] = scipy.stats.rankdata(
            1 - results['mean_test_score'], method='min')
        results['status'] = status

        for hp_name in hp_names:
            masked_array = ma.MaskedArray(parameter_dictionaries[hp_name],
                                          masks[hp_name])
            results['param_%s' % hp_name] = masked_array

        return results

    def sprint_statistics(self):
        cv_results = self.cv_results_
        sio = io.StringIO()
        sio.write('auto-sklearn results:\n')
        sio.write('  Dataset name: %s\n' % self._dataset_name)
        sio.write('  Metric: %s\n' % self._metric)
        idx_best_run = np.argmax(cv_results['mean_test_score'])
        best_score = cv_results['mean_test_score'][idx_best_run]
        sio.write('  Best validation score: %f\n' % best_score)
        num_runs = len(cv_results['status'])
        sio.write('  Number of target algorithm runs: %d\n' % num_runs)
        num_success = sum([s == 'Success' for s in cv_results['status']])
        sio.write('  Number of successful target algorithm runs: %d\n' %
                  num_success)
        num_crash = sum([s == 'Crash' for s in cv_results['status']])
        sio.write('  Number of crashed target algorithm runs: %d\n' %
                  num_crash)
        num_timeout = sum([s == 'Timeout' for s in cv_results['status']])
        sio.write('  Number of target algorithms that exceeded the time '
                  'limit: %d\n' % num_timeout)
        num_memout = sum([s == 'Memout' for s in cv_results['status']])
        sio.write('  Number of target algorithms that exceeded the memory '
                  'limit: %d\n' % num_memout)
        return sio.getvalue()

    def get_models_with_weights(self):
        if self.models_ is None or len(self.models_) == 0 or \
                self.ensemble_ is None:
            self._load_models()

        return self.ensemble_.get_models_with_weights(self.models_)

    def show_models(self):
        models_with_weights = self.get_models_with_weights()

        with io.StringIO() as sio:
            sio.write("[")
for weight, model in models_with_weights: sio.write("(%f, %s),\n" % (weight, model)) sio.write("]") return sio.getvalue() def _create_search_space(self, tmp_dir, backend, datamanager, include_estimators=None, exclude_estimators=None, include_preprocessors=None, exclude_preprocessors=None): task_name = 'CreateConfigSpace' self._stopwatch.start_task(task_name) configspace_path = os.path.join(tmp_dir, 'space.pcs') configuration_space = pipeline.get_configuration_space( datamanager.info, include_estimators=include_estimators, exclude_estimators=exclude_estimators, include_preprocessors=include_preprocessors, exclude_preprocessors=exclude_preprocessors) configuration_space = self.configuration_space_created_hook( datamanager, configuration_space) sp_string = pcs.write(configuration_space) backend.write_txt_file(configspace_path, sp_string, 'Configuration space') self._stopwatch.stop_task(task_name) return configuration_space, configspace_path def configuration_space_created_hook(self, datamanager, configuration_space): return configuration_space
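To make the control flow of the class above concrete, here is a hedged usage sketch. The Backend instance and the metric/constant imports mirror auto-sklearn's internal layout at the time and should be treated as assumptions, not a stable public API:

import numpy as np
from sklearn.datasets import load_digits

# Assumed to exist in the surrounding code base:
# from autosklearn.metrics import accuracy            (a Scorer instance)
# from autosklearn.constants import MULTICLASS_CLASSIFICATION
# backend = ...                                       (a configured Backend)

X, y = load_digits(return_X_y=True)
automl = AutoML(
    backend=backend,               # hypothetical, pre-built Backend instance
    time_left_for_this_task=60,    # must be an int, per the check in __init__
    per_run_time_limit=15,         # must be an int as well
    ensemble_size=1,
)
automl.fit(X, y, metric=accuracy)  # metric must be a Scorer, or fit() raises
print(automl.sprint_statistics())
predictions = automl.predict(X)    # raw ensemble output in this base class

Note that fit() deletes and re-creates the backend directories unless shared_mode is set, so a fresh Backend per run is the safest assumption.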
class AutoML(BaseEstimator):

    def __init__(self,
                 backend,
                 time_left_for_this_task,
                 per_run_time_limit,
                 initial_configurations_via_metalearning=25,
                 ensemble_size=1,
                 ensemble_nbest=1,
                 seed=1,
                 ml_memory_limit=3072,
                 metadata_directory=None,
                 keep_models=True,
                 debug_mode=False,
                 include_estimators=None,
                 exclude_estimators=None,
                 include_preprocessors=None,
                 exclude_preprocessors=None,
                 resampling_strategy='holdout-iterative-fit',
                 resampling_strategy_arguments=None,
                 shared_mode=False,
                 precision=32,
                 disable_evaluator_output=False,
                 get_smac_object_callback=None,
                 smac_scenario_args=None,
                 ):
        super(AutoML, self).__init__()
        self._backend = backend
        #self._tmp_dir = tmp_dir
        #self._output_dir = output_dir
        self._time_for_task = time_left_for_this_task
        self._per_run_time_limit = per_run_time_limit
        self._initial_configurations_via_metalearning = \
            initial_configurations_via_metalearning
        self._ensemble_size = ensemble_size
        self._ensemble_nbest = ensemble_nbest
        self._seed = seed
        self._ml_memory_limit = ml_memory_limit
        self._data_memory_limit = None
        self._metadata_directory = metadata_directory
        self._keep_models = keep_models
        self._include_estimators = include_estimators
        self._exclude_estimators = exclude_estimators
        self._include_preprocessors = include_preprocessors
        self._exclude_preprocessors = exclude_preprocessors
        self._resampling_strategy = resampling_strategy
        self._resampling_strategy_arguments = resampling_strategy_arguments \
            if resampling_strategy_arguments is not None else {}
        self._shared_mode = shared_mode
        self.precision = precision
        self._disable_evaluator_output = disable_evaluator_output
        self._get_smac_object_callback = get_smac_object_callback
        self._smac_scenario_args = smac_scenario_args

        self._datamanager = None
        self._dataset_name = None
        self._stopwatch = StopWatch()
        self._logger = None
        self._task = None
        self._metric = None
        self._label_num = None
        self._parser = None
        self.models_ = None
        self.ensemble_ = None
        self._can_predict = False

        self._debug_mode = debug_mode

        if not isinstance(self._time_for_task, int):
            raise ValueError("time_left_for_this_task not of type integer, "
                             "but %s" % str(type(self._time_for_task)))
        if not isinstance(self._per_run_time_limit, int):
            raise ValueError("per_run_time_limit not of type integer, but %s"
                             % str(type(self._per_run_time_limit)))

        # After assigning and checking variables...
        #self._backend = Backend(self._output_dir, self._tmp_dir)

    def fit(
        self, X, y,
        task,
        metric,
        X_test=None,
        y_test=None,
        feat_type=None,
        dataset_name=None,
        only_return_configuration_space=False,
    ):
        if self._shared_mode:
            # If this fails, it's likely that this is the first call to get
            # the data manager
            try:
                D = self._backend.load_datamanager()
                dataset_name = D.name
            except IOError:
                pass

        if dataset_name is None:
            dataset_name = hash_array_or_matrix(X)

        self._backend.save_start_time(self._seed)
        self._stopwatch = StopWatch()
        self._dataset_name = dataset_name
        self._stopwatch.start_task(self._dataset_name)

        self._logger = self._get_logger(dataset_name)

        if metric is None:
            raise ValueError('No metric given.')
        if not isinstance(metric, Scorer):
            raise ValueError('Metric must be instance of '
                             'autosklearn.metrics.Scorer.')

        if feat_type is not None and len(feat_type) != X.shape[1]:
            raise ValueError('Array feat_type does not have same number of '
                             'variables as X has features. %d vs %d.'
% (len(feat_type), X.shape[1])) if feat_type is not None and not all([isinstance(f, str) for f in feat_type]): raise ValueError('Array feat_type must only contain strings.') if feat_type is not None: for ft in feat_type: if ft.lower() not in ['categorical', 'numerical']: raise ValueError('Only `Categorical` and `Numerical` are ' 'valid feature types, you passed `%s`' % ft) self._data_memory_limit = None loaded_data_manager = XYDataManager( X, y, X_test=X_test, y_test=y_test, task=task, feat_type=feat_type, dataset_name=dataset_name, ) return self._fit( loaded_data_manager, metric, only_return_configuration_space, ) # TODO this is very old code which can be dropped! def fit_automl_dataset(self, dataset, metric): self._stopwatch = StopWatch() self._backend.save_start_time(self._seed) name = os.path.basename(dataset) self._stopwatch.start_task(name) self._start_task(self._stopwatch, name) self._dataset_name = name self._logger = self._get_logger(name) self._logger.debug('======== Reading and converting data ==========') # Encoding the labels will be done after the metafeature calculation! self._data_memory_limit = float(self._ml_memory_limit) / 3 loaded_data_manager = CompetitionDataManager( dataset, max_memory_in_mb=self._data_memory_limit) loaded_data_manager_str = str(loaded_data_manager).split('\n') for part in loaded_data_manager_str: self._logger.debug(part) return self._fit(loaded_data_manager, metric) def fit_on_datamanager(self, datamanager, metric): self._stopwatch = StopWatch() self._backend.save_start_time(self._seed) name = os.path.basename(datamanager.name) self._stopwatch.start_task(name) self._start_task(self._stopwatch, name) self._dataset_name = name self._logger = self._get_logger(name) self._fit(datamanager, metric) def _get_logger(self, name): logger_name = 'AutoML(%d):%s' % (self._seed, name) setup_logger(os.path.join(self._backend.temporary_directory, '%s.log' % str(logger_name))) return get_logger(logger_name) @staticmethod def _start_task(watcher, task_name): watcher.start_task(task_name) @staticmethod def _stop_task(watcher, task_name): watcher.stop_task(task_name) @staticmethod def _print_load_time(basename, time_left_for_this_task, time_for_load_data, logger): time_left_after_reading = max( 0, time_left_for_this_task - time_for_load_data) logger.info('Remaining time after reading %s %5.2f sec' % (basename, time_left_after_reading)) return time_for_load_data def _do_dummy_prediction(self, datamanager, num_run): # When using partial-cv it makes no sense to do dummy predictions if self._resampling_strategy in ['partial-cv', 'partial-cv-iterative-fit']: return num_run self._logger.info("Starting to create dummy predictions.") memory_limit = int(self._ml_memory_limit) scenario_mock = unittest.mock.Mock() scenario_mock.wallclock_limit = self._time_for_task # This stats object is a hack - maybe the SMAC stats object should # already be generated here! 
        stats = Stats(scenario_mock)
        stats.start_timing()
        ta = ExecuteTaFuncWithQueue(
            backend=self._backend,
            autosklearn_seed=self._seed,
            resampling_strategy=self._resampling_strategy,
            initial_num_run=num_run,
            logger=self._logger,
            stats=stats,
            metric=self._metric,
            memory_limit=memory_limit,
            disable_file_output=self._disable_evaluator_output,
            **self._resampling_strategy_arguments)

        status, cost, runtime, additional_info = \
            ta.run(1, cutoff=self._time_for_task)

        if status == StatusType.SUCCESS:
            self._logger.info("Finished creating dummy predictions.")
        else:
            self._logger.error('Error creating dummy predictions: %s ',
                               str(additional_info))

        return ta.num_run

    def _fit(self, datamanager, metric,
             only_return_configuration_space=False):
        # Reset learnt stuff
        self.models_ = None
        self.ensemble_ = None

        # Check arguments prior to doing anything!
        if not isinstance(self._disable_evaluator_output, (bool, list)):
            raise ValueError('disable_evaluator_output must be of type bool '
                             'or list.')
        if isinstance(self._disable_evaluator_output, list):
            allowed_elements = ['model', 'y_optimization']
            for element in self._disable_evaluator_output:
                if element not in allowed_elements:
                    raise ValueError("List member '%s' for argument "
                                     "'disable_evaluator_output' must be one "
                                     "of %s" % (element, allowed_elements))
        if self._resampling_strategy not in [
                'holdout', 'holdout-iterative-fit', 'cv',
                'partial-cv', 'partial-cv-iterative-fit'] \
                and not issubclass(self._resampling_strategy,
                                   BaseCrossValidator) \
                and not issubclass(self._resampling_strategy,
                                   _RepeatedSplits) \
                and not issubclass(self._resampling_strategy,
                                   BaseShuffleSplit):
            raise ValueError('Illegal resampling strategy: %s' %
                             self._resampling_strategy)
        if self._resampling_strategy in ['partial-cv',
                                         'partial-cv-iterative-fit'] \
                and self._ensemble_size != 0:
            raise ValueError("Resampling strategy %s cannot be used "
                             "together with ensembles."
                             % self._resampling_strategy)
        if self._resampling_strategy in ['partial-cv', 'cv',
                                         'partial-cv-iterative-fit'] and \
                'folds' not in self._resampling_strategy_arguments:
            self._resampling_strategy_arguments['folds'] = 5

        self._backend._make_internals_directory()
        if self._keep_models:
            try:
                os.makedirs(self._backend.get_model_dir())
            except (OSError, FileExistsError):
                if not self._shared_mode:
                    raise

        self._metric = metric
        self._task = datamanager.info['task']
        self._label_num = datamanager.info['label_num']

        # == Pickle the data manager to speed up loading
        data_manager_path = self._backend.save_datamanager(datamanager)

        time_for_load_data = self._stopwatch.wall_elapsed(self._dataset_name)

        if self._debug_mode:
            self._print_load_time(self._dataset_name,
                                  self._time_for_task,
                                  time_for_load_data,
                                  self._logger)

        # == Perform dummy predictions
        num_run = 1
        #if self._resampling_strategy in ['holdout', 'holdout-iterative-fit']:
        num_run = self._do_dummy_prediction(datamanager, num_run)

        # = Create a searchspace
        # Do this before One Hot Encoding to make sure that it creates a
        # search space for a dense classifier even if one hot encoding would
        # make it sparse (tradeoff; if one hot encoding would make it sparse,
        # densifier and truncatedSVD would probably lead to a MemoryError,
        # like this we can't use some of the preprocessing methods in case
        # the data became sparse)
        self.configuration_space, configspace_path = self._create_search_space(
            self._backend.temporary_directory,
            self._backend,
            datamanager,
            include_estimators=self._include_estimators,
            exclude_estimators=self._exclude_estimators,
            include_preprocessors=self._include_preprocessors,
            exclude_preprocessors=self._exclude_preprocessors)
        if only_return_configuration_space:
            return self.configuration_space

        # == RUN ensemble builder
        # Do this before calculating the meta-features to make sure that the
        # dummy predictions are actually included in the ensemble even if
        # calculating the meta-features takes very long
        ensemble_task_name = 'runEnsemble'
        self._stopwatch.start_task(ensemble_task_name)
        time_left_for_ensembles = max(
            0, self._time_for_task -
            self._stopwatch.wall_elapsed(self._dataset_name))
        if self._logger:
            self._logger.info('Start Ensemble with %5.2fsec time left' %
                              time_left_for_ensembles)
        if time_left_for_ensembles <= 0:
            self._proc_ensemble = None
            # Fit only raises error when ensemble_size is not zero but
            # time_left_for_ensembles is zero.
            if self._ensemble_size > 0:
                raise ValueError("Not starting ensemble builder because there "
                                 "is no time left. Try increasing the value "
                                 "of time_left_for_this_task.")
        else:
            self._proc_ensemble = self._get_ensemble_process(
                time_left_for_ensembles)
            if self._ensemble_size > 0:
                self._proc_ensemble.start()
            else:
                self._logger.info('Not starting ensemble builder because '
                                  'ensemble size is <= 0.')
        self._stopwatch.stop_task(ensemble_task_name)

        # kill the datamanager as it will be re-loaded anyways from sub
        # processes
        try:
            del self._datamanager
        except Exception:
            pass

        # => RUN SMAC
        smac_task_name = 'runSMAC'
        self._stopwatch.start_task(smac_task_name)
        time_left_for_smac = max(
            0, self._time_for_task -
            self._stopwatch.wall_elapsed(self._dataset_name))

        if self._logger:
            self._logger.info('Start SMAC with %5.2fsec time left' %
                              time_left_for_smac)
        if time_left_for_smac <= 0:
            self._logger.warning("Not starting SMAC because there is no time "
                                 "left.")
            _proc_smac = None
        else:
            if self._per_run_time_limit is None or \
                    self._per_run_time_limit > time_left_for_smac:
                print('Time limit for a single run is higher than total time '
                      'limit. Capping the limit for a single run to the total '
                      'time given to SMAC (%f)' % time_left_for_smac)
                per_run_time_limit = time_left_for_smac
            else:
                per_run_time_limit = self._per_run_time_limit

            _proc_smac = AutoMLSMBO(
                config_space=self.configuration_space,
                dataset_name=self._dataset_name,
                backend=self._backend,
                total_walltime_limit=time_left_for_smac,
                func_eval_time_limit=per_run_time_limit,
                memory_limit=self._ml_memory_limit,
                data_memory_limit=self._data_memory_limit,
                watcher=self._stopwatch,
                start_num_run=num_run,
                num_metalearning_cfgs=self._initial_configurations_via_metalearning,
                config_file=configspace_path,
                seed=self._seed,
                metadata_directory=self._metadata_directory,
                metric=self._metric,
                resampling_strategy=self._resampling_strategy,
                resampling_strategy_args=self._resampling_strategy_arguments,
                shared_mode=self._shared_mode,
                include_estimators=self._include_estimators,
                exclude_estimators=self._exclude_estimators,
                include_preprocessors=self._include_preprocessors,
                exclude_preprocessors=self._exclude_preprocessors,
                disable_file_output=self._disable_evaluator_output,
                get_smac_object_callback=self._get_smac_object_callback,
                smac_scenario_args=self._smac_scenario_args,
            )
            self.runhistory_, self.trajectory_ = _proc_smac.run_smbo()
            trajectory_filename = os.path.join(
                self._backend.get_smac_output_directory_for_run(self._seed),
                'trajectory.json')
            saveable_trajectory = \
                [list(entry[:2]) + [entry[2].get_dictionary()] +
                 list(entry[3:])
                 for entry in self.trajectory_]
            with open(trajectory_filename, 'w') as fh:
                json.dump(saveable_trajectory, fh)

        # Wait until the ensemble process is finished to avoid shutting down
        # while the ensemble builder tries to access the data
        if self._proc_ensemble is not None and self._ensemble_size > 0:
            self._proc_ensemble.join()

        self._proc_ensemble = None
        self._load_models()

        return self

    def refit(self, X, y):
        def send_warnings_to_log(message, category, filename, lineno,
                                 file=None, line=None):
            self._logger.debug('%s:%s: %s:%s' %
                               (filename, lineno, category.__name__, message))
            return

        if self._keep_models is not True:
            raise ValueError(
                "Refit can only be called if 'keep_models==True'")
        if self.models_ is None or len(self.models_) == 0 or \
                self.ensemble_ is None:
            self._load_models()

        # Refit is not applicable when ensemble_size is set to zero.
        if self.ensemble_ is None:
            raise ValueError("Refit can only be called if 'ensemble_size != 0'")

        random_state = np.random.RandomState(self._seed)
        for identifier in self.models_:
            if identifier in self.ensemble_.get_selected_model_identifiers():
                model = self.models_[identifier]
                # this updates the model inplace, it can then later be used in
                # predict method

                # try to fit the model. If it fails, shuffle the data. This
                # could alleviate the problem in algorithms that depend on
                # the ordering of the data.
                for i in range(10):
                    try:
                        with warnings.catch_warnings():
                            warnings.showwarning = send_warnings_to_log
                            model.fit(X.copy(), y.copy())
                        break
                    except ValueError as e:
                        indices = list(range(X.shape[0]))
                        random_state.shuffle(indices)
                        X = X[indices]
                        y = y[indices]

                        if i == 9:
                            raise e

        self._can_predict = True
        return self

    def predict(self, X, batch_size=None, n_jobs=1):
        """predict.

        Parameters
        ----------
        X: array-like, shape = (n_samples, n_features)

        batch_size: int or None, defaults to None
            batch_size controls whether the pipelines will be called on small
            chunks of the data. Useful when calling the predict method on the
            whole array X results in a MemoryError.

        n_jobs: int, defaults to 1
            Parallelize the predictions across the models with n_jobs
            processes.
""" if self._keep_models is not True: raise ValueError( "Predict can only be called if 'keep_models==True'") if not self._can_predict and \ self._resampling_strategy not in \ ['holdout', 'holdout-iterative-fit']: raise NotImplementedError( 'Predict is currently not implemented for resampling ' 'strategy %s, please call refit().' % self._resampling_strategy) if self.models_ is None or len(self.models_) == 0 or \ self.ensemble_ is None: self._load_models() # If self.ensemble_ is None, it means that ensemble_size is set to zero. # In such cases, raise error because predict and predict_proba cannot # be called. if self.ensemble_ is None: raise ValueError("Predict and predict_proba can only be called " "if 'ensemble_size != 0'") # Parallelize predictions across models with n_jobs processes. # Each process computes predictions in chunks of batch_size rows. all_predictions = joblib.Parallel(n_jobs=n_jobs)( joblib.delayed(_model_predict)(self, X, batch_size, identifier) for identifier in self.ensemble_.get_selected_model_identifiers()) if len(all_predictions) == 0: raise ValueError('Something went wrong generating the predictions. ' 'The ensemble should consist of the following ' 'models: %s, the following models were loaded: ' '%s' % (str(list(self.ensemble_indices_.keys())), str(list(self.models_.keys())))) predictions = self.ensemble_.predict(all_predictions) return predictions def fit_ensemble(self, y, task=None, metric=None, precision='32', dataset_name=None, ensemble_nbest=None, ensemble_size=None): if self._resampling_strategy in ['partial-cv', 'partial-cv-iterative-fit']: raise ValueError('Cannot call fit_ensemble with resampling ' 'strategy %s.' % self._resampling_strategy) if self._logger is None: self._logger = self._get_logger(dataset_name) self._proc_ensemble = self._get_ensemble_process( 1, task, metric, precision, dataset_name, max_iterations=1, ensemble_nbest=ensemble_nbest, ensemble_size=ensemble_size) self._proc_ensemble.main() self._proc_ensemble = None self._load_models() return self def _get_ensemble_process(self, time_left_for_ensembles, task=None, metric=None, precision=None, dataset_name=None, max_iterations=-1, ensemble_nbest=None, ensemble_size=None): if task is None: task = self._task else: self._task = task if metric is None: metric = self._metric else: self._metric = metric if precision is None: precision = self.precision else: self.precision = precision if dataset_name is None: dataset_name = self._dataset_name else: self._dataset_name = dataset_name if ensemble_nbest is None: ensemble_nbest = self._ensemble_nbest else: self._ensemble_nbest = ensemble_nbest if ensemble_size is None: ensemble_size = self._ensemble_size else: self._ensemble_size = ensemble_size return EnsembleBuilder(backend=self._backend, dataset_name=dataset_name, task_type=task, metric=metric, limit=time_left_for_ensembles, ensemble_size=ensemble_size, ensemble_nbest=ensemble_nbest, seed=self._seed, shared_mode=self._shared_mode, precision=precision, max_iterations=max_iterations, read_at_most=np.inf) def _load_models(self): if self._shared_mode: seed = -1 else: seed = self._seed self.ensemble_ = self._backend.load_ensemble(seed) if self.ensemble_: identifiers = self.ensemble_.identifiers_ self.models_ = self._backend.load_models_by_identifiers(identifiers) if len(self.models_) == 0 and self._resampling_strategy not in \ ['partial-cv', 'partial-cv-iterative-fit']: raise ValueError('No models fitted!') elif self._disable_evaluator_output is False or \ (isinstance(self._disable_evaluator_output, list) 
and 'model' not in self._disable_evaluator_output): model_names = self._backend.list_all_models(seed) if len(model_names) == 0 and self._resampling_strategy not in \ ['partial-cv', 'partial-cv-iterative-fit']: raise ValueError('No models fitted!') self.models_ = [] else: self.models_ = [] def score(self, X, y): # fix: Consider only index 1 of second dimension # Don't know if the reshaping should be done there or in calculate_score prediction = self.predict(X) return calculate_score(solution=y, prediction=prediction, task_type=self._task, metric=self._metric, all_scoring_functions=False) @property def cv_results_(self): results = dict() # Missing in contrast to scikit-learn # splitX_test_score - auto-sklearn does not store the scores on a split # basis # std_test_score - auto-sklearn does not store the scores on a split # basis # splitX_train_score - auto-sklearn does not compute train scores, add # flag to compute the train scores # mean_train_score - auto-sklearn does not store the train scores # std_train_score - auto-sklearn does not store the train scores # std_fit_time - auto-sklearn does not store the fit times per split # mean_score_time - auto-sklearn does not store the score time # std_score_time - auto-sklearn does not store the score time # TODO: add those arguments # TODO remove this restriction! if self._resampling_strategy in ['partial-cv', 'partial-cv-iterative-fit']: raise ValueError('Cannot call cv_results when using partial-cv!') parameter_dictionaries = dict() masks = dict() hp_names = [] # Set up dictionary for parameter values for hp in self.configuration_space.get_hyperparameters(): name = hp.name parameter_dictionaries[name] = [] masks[name] = [] hp_names.append(name) mean_test_score = [] mean_fit_time = [] params = [] status = [] for run_key in self.runhistory_.data: run_value = self.runhistory_.data[run_key] config_id = run_key.config_id config = self.runhistory_.ids_config[config_id] param_dict = config.get_dictionary() params.append(param_dict) mean_test_score.append(1 - run_value.cost) mean_fit_time.append(run_value.time) s = run_value.status if s == StatusType.SUCCESS: status.append('Success') elif s == StatusType.TIMEOUT: status.append('Timeout') elif s == StatusType.CRASHED: status.append('Crash') elif s == StatusType.ABORT: status.append('Abort') elif s == StatusType.MEMOUT: status.append('Memout') else: raise NotImplementedError(s) for hp_name in hp_names: if hp_name in param_dict: hp_value = param_dict[hp_name] mask_value = False else: hp_value = np.NaN mask_value = True parameter_dictionaries[hp_name].append(hp_value) masks[hp_name].append(mask_value) results['mean_test_score'] = np.array(mean_test_score) results['mean_fit_time'] = np.array(mean_fit_time) results['params'] = params results['rank_test_scores'] = scipy.stats.rankdata(1 - results['mean_test_score'], method='min') results['status'] = status for hp_name in hp_names: masked_array = ma.MaskedArray(parameter_dictionaries[hp_name], masks[hp_name]) results['param_%s' % hp_name] = masked_array return results def sprint_statistics(self): cv_results = self.cv_results_ sio = io.StringIO() sio.write('auto-sklearn results:\n') sio.write(' Dataset name: %s\n' % self._dataset_name) sio.write(' Metric: %s\n' % self._metric) idx_best_run = np.argmax(cv_results['mean_test_score']) best_score = cv_results['mean_test_score'][idx_best_run] sio.write(' Best validation score: %f\n' % best_score) num_runs = len(cv_results['status']) sio.write(' Number of target algorithm runs: %d\n' % num_runs) num_success = sum([s 
== 'Success' for s in cv_results['status']]) sio.write(' Number of successful target algorithm runs: %d\n' % num_success) num_crash = sum([s == 'Crash' for s in cv_results['status']]) sio.write(' Number of crashed target algorithm runs: %d\n' % num_crash) num_timeout = sum([s == 'Timeout' for s in cv_results['status']]) sio.write(' Number of target algorithms that exceeded the time ' 'limit: %d\n' % num_timeout) num_memout = sum([s == 'Memout' for s in cv_results['status']]) sio.write(' Number of target algorithms that exceeded the memory ' 'limit: %d\n' % num_memout) return sio.getvalue() def get_models_with_weights(self): if self.models_ is None or len(self.models_) == 0 or \ self.ensemble_ is None: self._load_models() return self.ensemble_.get_models_with_weights(self.models_) def show_models(self): models_with_weights = self.get_models_with_weights() with io.StringIO() as sio: sio.write("[") for weight, model in models_with_weights: sio.write("(%f, %s),\n" % (weight, model)) sio.write("]") return sio.getvalue() def _create_search_space(self, tmp_dir, backend, datamanager, include_estimators=None, exclude_estimators=None, include_preprocessors=None, exclude_preprocessors=None): task_name = 'CreateConfigSpace' self._stopwatch.start_task(task_name) configspace_path = os.path.join(tmp_dir, 'space.pcs') configuration_space = pipeline.get_configuration_space( datamanager.info, include_estimators=include_estimators, exclude_estimators=exclude_estimators, include_preprocessors=include_preprocessors, exclude_preprocessors=exclude_preprocessors) configuration_space = self.configuration_space_created_hook( datamanager, configuration_space) sp_string = pcs.write(configuration_space) backend.write_txt_file(configspace_path, sp_string, 'Configuration space') self._stopwatch.stop_task(task_name) return configuration_space, configspace_path def configuration_space_created_hook(self, datamanager, configuration_space): return configuration_space
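For reference, the rank computation used by cv_results_ above can be reproduced in isolation: scores are converted to losses (1 - score) and ranked ascending, so the best run gets rank 1 and ties share the minimum rank.

import numpy as np
import scipy.stats

mean_test_score = np.array([0.91, 0.87, 0.91, 0.78])
ranks = scipy.stats.rankdata(1 - mean_test_score, method='min')
print(ranks)  # [1. 3. 1. 4.]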