def make_mode_holdout_iterative_fit(data, seed, configuration, num_run): global evaluator evaluator = HoldoutEvaluator(data, configuration, seed=seed, num_run=num_run, **_get_base_dict()) evaluator.iterative_fit() signal.signal(15, empty_signal_handler) evaluator.finish_up() backend = Backend(None, os.getcwd()) if os.path.exists(backend.get_model_dir()): backend.save_model(evaluator.model, num_run, seed)
class AbstractEvaluator(object): __metaclass__ = abc.ABCMeta @abc.abstractmethod def __init__(self, Datamanager, output_dir, configuration=None, with_predictions=False, all_scoring_functions=False, seed=1, output_y_test=False, num_run=None): self.starttime = time.time() self.output_dir = output_dir self.configuration = configuration self.D = Datamanager self.X_valid = Datamanager.data.get('X_valid') self.X_test = Datamanager.data.get('X_test') self.metric = Datamanager.info['metric'] self.task_type = Datamanager.info['task'] self.seed = seed self.output_y_test = output_y_test self.with_predictions = with_predictions self.all_scoring_functions = all_scoring_functions if self.task_type in REGRESSION_TASKS: if self.configuration is None: self.model_class = MyDummyRegressor else: self.model_class = \ autosklearn.pipeline.regression.SimpleRegressionPipeline self.predict_function = self._predict_regression else: if self.configuration is None: self.model_class = MyDummyClassifier else: self.model_class = \ autosklearn.pipeline.classification.SimpleClassificationPipeline self.predict_function = self._predict_proba if num_run is None: num_run = get_new_run_num() self.num_run = num_run self.backend = Backend(None, self.output_dir) self.model = self.model_class(self.configuration, self.seed) def fit_predict_and_loss(self): """Fit model(s) according to resampling strategy, predict for the validation set and return the loss and predictions on the validation set. Provides a closed interface in which all steps of the target algorithm are performed without any communication with other processes. Useful for cross-validation because it allows to train a model, predict for the validation set and then forget the model in order to save main memory. """ raise NotImplementedError() def iterative_fit(self): """Fit a model iteratively. Fitting can be interrupted in order to use a partially trained model.""" raise NotImplementedError() def predict_and_loss(self): """Use current model to predict on the validation set and calculate loss. Should be used when using iterative fitting.""" raise NotImplementedError() def predict(self): """Use the current model to predict on the validation set. Should only be used to create dummy predictions.""" raise NotImplementedError() def _loss(self, y_true, y_hat): if self.configuration is None: if self.all_scoring_functions: return {self.metric: 1.0} else: return 1.0 score = calculate_score( y_true, y_hat, self.task_type, self.metric, self.D.info['label_num'], all_scoring_functions=self.all_scoring_functions) if hasattr(score, '__len__'): err = {key: 1 - score[key] for key in score} else: err = 1 - score return err def finish_up(self, loss=None, opt_pred=None, valid_pred=None, test_pred=None): """This function does everything necessary after the fitting is done: * predicting * saving the files for the ensembles_statistics * generate output for SMAC We use it as the signal handler so we can recycle the code for the normal usecase and when the runsolver kills us here :)""" try: self.duration = time.time() - self.starttime if loss is None: loss, opt_pred, valid_pred, test_pred = self.predict_and_loss() self.file_output(loss, opt_pred, valid_pred, test_pred) self.duration = time.time() - self.starttime num_run = str(self.num_run).zfill(5) if isinstance(loss, dict): loss_ = loss loss = loss_[self.D.info['metric']] else: loss_ = {} additional_run_info = ';'.join([ '%s: %s' % (METRIC_TO_STRING[metric] if metric in METRIC_TO_STRING else metric, value) for metric, value in loss_.items() ]) additional_run_info += ';' + 'duration: ' + str(self.duration) additional_run_info += ';' + 'num_run:' + num_run if self.configuration is not None: self._output_SMAC_string(self.duration, loss, self.seed, additional_run_info) except Exception as e: self.duration = time.time() - self.starttime print(traceback.format_exc()) self._output_SMAC_string( self.duration, 2.0, self.seed, 'No results were produced! Error is %s' % str(e)) def _output_SMAC_string(self, duration, loss, seed, additional_run_info): print( 'Result for ParamILS: %s, %f, 1, %f, %d, %s' % ('SAT', abs(self.duration), loss, self.seed, additional_run_info)) def file_output(self, loss, Y_optimization_pred, Y_valid_pred, Y_test_pred): seed = os.environ.get('AUTOSKLEARN_SEED') if self.Y_optimization.shape[0] != Y_optimization_pred.shape[0]: return 2, "Targets %s and prediction %s don't have the same " \ "length. Probably training didn't finish" % ( self.Y_optimization.shape, Y_optimization_pred.shape) num_run = str(self.num_run).zfill(5) if os.path.exists(self.backend.get_model_dir()): self.backend.save_model(self.model, self.num_run, seed) if self.output_y_test: try: os.makedirs(self.output_dir) except OSError: pass self.backend.save_targets_ensemble(self.Y_optimization) self.backend.save_predictions_as_npy(Y_optimization_pred, 'ensemble', seed, num_run) if Y_valid_pred is not None: self.backend.save_predictions_as_npy(Y_valid_pred, 'valid', seed, num_run) if Y_test_pred is not None: self.backend.save_predictions_as_npy(Y_test_pred, 'test', seed, num_run) def _predict_proba(self, X, model, task_type, Y_train): Y_pred = model.predict_proba(X, batch_size=1000) Y_pred = self._ensure_prediction_array_sizes(Y_pred, Y_train) return Y_pred def _predict_regression(self, X, model, task_type, Y_train=None): Y_pred = model.predict(X) if len(Y_pred.shape) == 1: Y_pred = Y_pred.reshape((-1, 1)) return Y_pred def _ensure_prediction_array_sizes(self, prediction, Y_train): num_classes = self.D.info['label_num'] if self.task_type == MULTICLASS_CLASSIFICATION and \ prediction.shape[1] < num_classes: if Y_train is None: raise ValueError('Y_train must not be None!') classes = list(np.unique(Y_train)) mapping = dict() for class_number in range(num_classes): if class_number in classes: index = classes.index(class_number) mapping[index] = class_number new_predictions = np.zeros((prediction.shape[0], num_classes), dtype=np.float32) for index in mapping: class_index = mapping[index] new_predictions[:, class_index] = prediction[:, index] return new_predictions return prediction
class AutoML(BaseEstimator, multiprocessing.Process): def __init__(self, tmp_dir, output_dir, time_left_for_this_task, per_run_time_limit, log_dir=None, initial_configurations_via_metalearning=25, ensemble_size=1, ensemble_nbest=1, seed=1, ml_memory_limit=3000, metadata_directory=None, queue=None, keep_models=True, debug_mode=False, include_estimators=None, include_preprocessors=None, resampling_strategy='holdout-iterative-fit', resampling_strategy_arguments=None, delete_tmp_folder_after_terminate=False, delete_output_folder_after_terminate=False, shared_mode=False, precision=32, max_iter_smac=None, acquisition_function='EI'): super(AutoML, self).__init__() self._tmp_dir = tmp_dir self._output_dir = output_dir self._time_for_task = time_left_for_this_task self._per_run_time_limit = per_run_time_limit self._log_dir = log_dir if log_dir is not None else self._tmp_dir self._initial_configurations_via_metalearning = \ initial_configurations_via_metalearning self._ensemble_size = ensemble_size self._ensemble_nbest = ensemble_nbest self._seed = seed self._ml_memory_limit = ml_memory_limit self._data_memory_limit = None self._metadata_directory = metadata_directory self._queue = queue self._keep_models = keep_models self._include_estimators = include_estimators self._include_preprocessors = include_preprocessors self._resampling_strategy = resampling_strategy self._resampling_strategy_arguments = resampling_strategy_arguments self._max_iter_smac = max_iter_smac self.delete_tmp_folder_after_terminate = \ delete_tmp_folder_after_terminate self.delete_output_folder_after_terminate = \ delete_output_folder_after_terminate self._shared_mode = shared_mode self.precision = precision self.acquisition_function = acquisition_function self._datamanager = None self._dataset_name = None self._stopwatch = StopWatch() self._logger = None self._task = None self._metric = None self._label_num = None self._parser = None self.models_ = None self.ensemble_ = None self._can_predict = False self._debug_mode = debug_mode if not isinstance(self._time_for_task, int): raise ValueError("time_left_for_this_task not of type integer, " "but %s" % str(type(self._time_for_task))) if not isinstance(self._per_run_time_limit, int): raise ValueError("per_run_time_limit not of type integer, but %s" % str(type(self._per_run_time_limit))) # After assignging and checking variables... self._backend = Backend(self._output_dir, self._tmp_dir) def start_automl(self, parser): self._parser = parser self.start() def start(self): if self._parser is None: raise ValueError('You must invoke start() only via start_automl()') super(AutoML, self).start() def run(self): if self._parser is None: raise ValueError('You must invoke run() only via start_automl()') self._backend.save_start_time(self._seed) self._stopwatch = StopWatch() datamanager = get_data_manager(namespace=self._parser) self._stopwatch.start_task(datamanager.name) self._logger = self._get_logger(datamanager.name) self._datamanager = datamanager self._dataset_name = datamanager.name self._fit(self._datamanager) def fit(self, X, y, task=MULTICLASS_CLASSIFICATION, metric='acc_metric', feat_type=None, dataset_name=None): if dataset_name is None: m = hashlib.md5() m.update(X.data) dataset_name = m.hexdigest() self._backend.save_start_time(self._seed) self._stopwatch = StopWatch() self._dataset_name = dataset_name self._stopwatch.start_task(self._dataset_name) self._logger = self._get_logger(dataset_name) if isinstance(metric, str): metric = STRING_TO_METRIC[metric] if feat_type is not None and len(feat_type) != X.shape[1]: raise ValueError('Array feat_type does not have same number of ' 'variables as X has features. %d vs %d.' % (len(feat_type), X.shape[1])) if feat_type is not None and not all( [isinstance(f, str) for f in feat_type]): raise ValueError('Array feat_type must only contain strings.') if feat_type is not None: for ft in feat_type: if ft.lower() not in ['categorical', 'numerical']: raise ValueError('Only `Categorical` and `Numerical` are ' 'valid feature types, you passed `%s`' % ft) self._data_memory_limit = None loaded_data_manager = XYDataManager(X, y, task=task, metric=metric, feat_type=feat_type, dataset_name=dataset_name, encode_labels=False) return self._fit(loaded_data_manager) def fit_automl_dataset(self, dataset): self._stopwatch = StopWatch() self._backend.save_start_time(self._seed) name = os.path.basename(dataset) self._stopwatch.start_task(name) self._start_task(self._stopwatch, name) self._dataset_name = name self._logger = self._get_logger(name) self._logger.debug('======== Reading and converting data ==========') # Encoding the labels will be done after the metafeature calculation! self._data_memory_limit = float(self._ml_memory_limit) / 3 loaded_data_manager = CompetitionDataManager( dataset, encode_labels=False, max_memory_in_mb=self._data_memory_limit) loaded_data_manager_str = str(loaded_data_manager).split('\n') for part in loaded_data_manager_str: self._logger.debug(part) return self._fit(loaded_data_manager) def _get_logger(self, name): logger_name = 'AutoML(%d):%s' % (self._seed, name) setup_logger(os.path.join(self._tmp_dir, '%s.log' % str(logger_name))) return get_logger(logger_name) @staticmethod def _start_task(watcher, task_name): watcher.start_task(task_name) @staticmethod def _stop_task(watcher, task_name): watcher.stop_task(task_name) @staticmethod def _print_load_time(basename, time_left_for_this_task, time_for_load_data, logger): time_left_after_reading = max( 0, time_left_for_this_task - time_for_load_data) logger.info('Remaining time after reading %s %5.2f sec' % (basename, time_left_after_reading)) return time_for_load_data def _do_dummy_prediction(self, datamanager, num_run): self._logger.info("Starting to create dummy predictions.") time_limit = int(self._time_for_task / 6.) memory_limit = int(self._ml_memory_limit) _info = eval_with_limits(datamanager, self._tmp_dir, 1, self._seed, num_run, self._resampling_strategy, self._resampling_strategy_arguments, memory_limit, time_limit) if _info[4] == StatusType.SUCCESS: self._logger.info("Finished creating dummy prediction 1/2.") else: self._logger.error('Error creating dummy prediction 1/2:%s ', _info[3]) num_run += 1 _info = eval_with_limits(datamanager, self._tmp_dir, 2, self._seed, num_run, self._resampling_strategy, self._resampling_strategy_arguments, memory_limit, time_limit) if _info[4] == StatusType.SUCCESS: self._logger.info("Finished creating dummy prediction 2/2.") else: self._logger.error('Error creating dummy prediction 2/2 %s', _info[3]) num_run += 1 return num_run def _fit(self, datamanager): # Reset learnt stuff self.models_ = None self.ensemble_ = None # Check arguments prior to doing anything! if self._resampling_strategy not in [ 'holdout', 'holdout-iterative-fit', 'cv', 'nested-cv', 'partial-cv' ]: raise ValueError('Illegal resampling strategy: %s' % self._resampling_strategy) if self._resampling_strategy == 'partial-cv' and \ self._ensemble_size != 0: raise ValueError("Resampling strategy partial-cv cannot be used " "together with ensembles.") acquisition_functions = ['EI', 'EIPS'] if self.acquisition_function not in acquisition_functions: raise ValueError( 'Illegal acquisition %s: Must be one of %s.' % (self.acquisition_function, acquisition_functions)) self._backend._make_internals_directory() if self._keep_models: try: os.mkdir(self._backend.get_model_dir()) except OSError: self._logger.warning("model directory already exists") if not self._shared_mode: raise self._metric = datamanager.info['metric'] self._task = datamanager.info['task'] self._label_num = datamanager.info['label_num'] # == Pickle the data manager to speed up loading data_manager_path = self._backend.save_datamanager(datamanager) self._save_ensemble_data(datamanager.data['X_train'], datamanager.data['Y_train']) time_for_load_data = self._stopwatch.wall_elapsed(self._dataset_name) if self._debug_mode: self._print_load_time(self._dataset_name, self._time_for_task, time_for_load_data, self._logger) # == Perform dummy predictions num_run = 1 #if self._resampling_strategy in ['holdout', 'holdout-iterative-fit']: num_run = self._do_dummy_prediction(datamanager, num_run) # = Create a searchspace # Do this before One Hot Encoding to make sure that it creates a # search space for a dense classifier even if one hot encoding would # make it sparse (tradeoff; if one hot encoding would make it sparse, # densifier and truncatedSVD would probably lead to a MemoryError, # like this we can't use some of the preprocessing methods in case # the data became sparse) self.configuration_space, configspace_path = self._create_search_space( self._tmp_dir, self._backend, datamanager, self._include_estimators, self._include_preprocessors) # == RUN ensemble builder # Do this before calculating the meta-features to make sure that the # dummy predictions are actually included in the ensemble even if # calculating the meta-features takes very long ensemble_task_name = 'runEnsemble' self._stopwatch.start_task(ensemble_task_name) time_left_for_ensembles = max(0,self._time_for_task \ - self._stopwatch.wall_elapsed(self._dataset_name)) if self._logger: self._logger.info('Start Ensemble with %5.2fsec time left' % time_left_for_ensembles) if time_left_for_ensembles <= 0: self._logger.warning("Not starting ensemble builder because there " "is no time left!") self._proc_ensemble = None else: self._proc_ensemble = self._get_ensemble_process( time_left_for_ensembles) self._proc_ensemble.start() self._stopwatch.stop_task(ensemble_task_name) # == RUN SMBO # kill the datamanager as it will be re-loaded anyways from sub processes try: del self._datamanager except Exception: pass # => RUN SMAC smac_task_name = 'runSMAC' self._stopwatch.start_task(smac_task_name) time_left_for_smac = max( 0, self._time_for_task - self._stopwatch.wall_elapsed(self._dataset_name)) if self._logger: self._logger.info('Start SMAC with %5.2fsec time left' % time_left_for_smac) if time_left_for_smac <= 0: self._logger.warning("Not starting SMAC because there is no time " "left.") self._procsmac = None else: self._proc_smac = AutoMLSMBO( config_space=self.configuration_space, dataset_name=self._dataset_name, tmp_dir=self._tmp_dir, output_dir=self._output_dir, total_walltime_limit=time_left_for_smac, func_eval_time_limit=self._per_run_time_limit, memory_limit=self._ml_memory_limit, data_memory_limit=self._data_memory_limit, watcher=self._stopwatch, start_num_run=num_run, num_metalearning_cfgs=self. _initial_configurations_via_metalearning, config_file=configspace_path, smac_iters=self._max_iter_smac, seed=self._seed, metadata_directory=self._metadata_directory, resampling_strategy=self._resampling_strategy, resampling_strategy_args=self._resampling_strategy_arguments, acquisition_function=self.acquisition_function, shared_mode=self._shared_mode) self._proc_smac.start() psutil_procs = [] procs = [] if self._proc_smac is not None: proc_smac = psutil.Process(self._proc_smac.pid) psutil_procs.append(proc_smac) procs.append(self._proc_smac) if self._proc_ensemble is not None: proc_ensemble = psutil.Process(self._proc_ensemble.pid) psutil_procs.append(proc_ensemble) procs.append(self._proc_ensemble) if self._queue is not None: self._queue.put( [time_for_load_data, data_manager_path, psutil_procs]) else: for proc in procs: proc.join() if self._queue is None: self._load_models() return self def refit(self, X, y): if self._keep_models is not True: raise ValueError( "Predict can only be called if 'keep_models==True'") if self.models_ is None or len(self.models_) == 0 or \ self.ensemble_ is None: self._load_models() for identifier in self.models_: if identifier in self.ensemble_.get_model_identifiers(): model = self.models_[identifier] # this updates the model inplace, it can then later be used in # predict method model.fit(X.copy(), y.copy()) self._can_predict = True def predict(self, X): return np.argmax(self.predict_proba(X), axis=1) def predict_proba(self, X): if self._keep_models is not True: raise ValueError( "Predict can only be called if 'keep_models==True'") if not self._can_predict and \ self._resampling_strategy not in \ ['holdout', 'holdout-iterative-fit']: raise NotImplementedError( 'Predict is currently only implemented for resampling ' 'strategy holdout.') if self.models_ is None or len(self.models_) == 0 or \ self.ensemble_ is None: self._load_models() all_predictions = [] for identifier in self.ensemble_.get_model_identifiers(): model = self.models_[identifier] X_ = X.copy() if self._task in REGRESSION_TASKS: prediction = model.predict(X_) else: prediction = model.predict_proba(X_) if len(prediction.shape) < 1 or len(X_.shape) < 1 or \ X_.shape[0] < 1 or prediction.shape[0] != X_.shape[0]: self._logger.warning( "Prediction shape for model %s is %s " "while X_.shape is %s" % (model, str(prediction.shape), str(X_.shape))) all_predictions.append(prediction) if len(all_predictions) == 0: raise ValueError( 'Something went wrong generating the predictions. ' 'The ensemble should consist of the following ' 'models: %s, the following models were loaded: ' '%s' % (str(list(self.ensemble_indices_.keys())), str(list(self.models_.keys())))) predictions = self.ensemble_.predict(all_predictions) return predictions def fit_ensemble(self, task=None, metric=None, precision='32', dataset_name=None, ensemble_nbest=None, ensemble_size=None): if self._logger is None: self._logger = self._get_logger(dataset_name) self._proc_ensemble = self._get_ensemble_process( 1, task, metric, precision, dataset_name, max_iterations=1, ensemble_nbest=ensemble_nbest, ensemble_size=ensemble_size) self._proc_ensemble.main() def _get_ensemble_process(self, time_left_for_ensembles, task=None, metric=None, precision=None, dataset_name=None, max_iterations=-1, ensemble_nbest=None, ensemble_size=None): if task is None: task = self._task if metric is None: metric = self._metric if precision is None: precision = self.precision if dataset_name is None: dataset_name = self._dataset_name if ensemble_nbest is None: ensemble_nbest = self._ensemble_nbest if ensemble_size is None: ensemble_size = self._ensemble_size return EnsembleBuilder(autosklearn_tmp_dir=self._tmp_dir, dataset_name=dataset_name, task_type=task, metric=metric, limit=time_left_for_ensembles, output_dir=self._output_dir, ensemble_size=ensemble_size, ensemble_nbest=ensemble_nbest, seed=self._seed, shared_mode=self._shared_mode, precision=precision, max_iterations=max_iterations) def _load_models(self): if self._shared_mode: seed = -1 else: seed = self._seed self.ensemble_ = self._backend.load_ensemble(seed) if self.ensemble_: identifiers = self.ensemble_.identifiers_ self.models_ = self._backend.load_models_by_identifiers( identifiers) else: self.models_ = self._backend.load_all_models(seed) if len(self.models_) == 0: raise ValueError('No models fitted!') def score(self, X, y): # fix: Consider only index 1 of second dimension # Don't know if the reshaping should be done there or in calculate_score prediction = self.predict_proba(X) return calculate_score(y, prediction, self._task, self._metric, self._label_num, logger=self._logger) def show_models(self): if self.models_ is None or len(self.models_) == 0 or \ self.ensemble_ is None: self._load_models() return self.ensemble_.pprint_ensemble_string(self.models_) def _save_ensemble_data(self, X, y): """Split dataset and store Data for the ensemble script. :param X: :param y: :return: """ task_name = 'LoadData' self._start_task(self._stopwatch, task_name) _, _, _, y_ensemble = resampling.split_data(X, y) self._backend.save_targets_ensemble(y_ensemble) self._stop_task(self._stopwatch, task_name) def _create_search_space(self, tmp_dir, backend, datamanager, include_estimators=None, include_preprocessors=None): task_name = 'CreateConfigSpace' self._stopwatch.start_task(task_name) configspace_path = os.path.join(tmp_dir, 'space.pcs') configuration_space = pipeline.get_configuration_space( datamanager.info, include_estimators=include_estimators, include_preprocessors=include_preprocessors) configuration_space = self.configuration_space_created_hook( datamanager, configuration_space) sp_string = pcs.write(configuration_space) backend.write_txt_file(configspace_path, sp_string, 'Configuration space') self._stopwatch.stop_task(task_name) return configuration_space, configspace_path def configuration_space_created_hook(self, datamanager, configuration_space): return configuration_space def get_params(self, deep=True): raise NotImplementedError('auto-sklearn does not implement ' 'get_params() because it is not intended to ' 'be optimized.') def set_params(self, deep=True): raise NotImplementedError('auto-sklearn does not implement ' 'set_params() because it is not intended to ' 'be optimized.') def __del__(self): self._delete_output_directories() def _delete_output_directories(self): if self.delete_output_folder_after_terminate: try: shutil.rmtree(self._output_dir) except Exception: if self._logger is not None: self._logger.warning("Could not delete output dir: %s" % self._output_dir) else: print("Could not delete output dir: %s" % self._output_dir) if self.delete_tmp_folder_after_terminate: try: shutil.rmtree(self._tmp_dir) except Exception: if self._logger is not None: self._logger.warning("Could not delete tmp dir: %s" % self._tmp_dir) pass else: print("Could not delete tmp dir: %s" % self._tmp_dir)
class AutoML(BaseEstimator, multiprocessing.Process): def __init__(self, tmp_dir, output_dir, time_left_for_this_task, per_run_time_limit, log_dir=None, initial_configurations_via_metalearning=25, ensemble_size=1, ensemble_nbest=1, seed=1, ml_memory_limit=3000, metadata_directory=None, queue=None, keep_models=True, debug_mode=False, include_estimators=None, include_preprocessors=None, resampling_strategy='holdout-iterative-fit', resampling_strategy_arguments=None, delete_tmp_folder_after_terminate=False, delete_output_folder_after_terminate=False, shared_mode=False, precision=32): super(AutoML, self).__init__() self._tmp_dir = tmp_dir self._output_dir = output_dir self._time_for_task = time_left_for_this_task self._per_run_time_limit = per_run_time_limit self._log_dir = log_dir if log_dir is not None else self._tmp_dir self._initial_configurations_via_metalearning = \ initial_configurations_via_metalearning self._ensemble_size = ensemble_size self._ensemble_nbest = ensemble_nbest self._seed = seed self._ml_memory_limit = ml_memory_limit self._metadata_directory = metadata_directory self._queue = queue self._keep_models = keep_models self._include_estimators = include_estimators self._include_preprocessors = include_preprocessors self._resampling_strategy = resampling_strategy self._resampling_strategy_arguments = resampling_strategy_arguments self.delete_tmp_folder_after_terminate = \ delete_tmp_folder_after_terminate self.delete_output_folder_after_terminate = \ delete_output_folder_after_terminate self._shared_mode = shared_mode self.precision = precision self._datamanager = None self._dataset_name = None self._stopwatch = StopWatch() self._logger = None self._task = None self._metric = None self._label_num = None self.models_ = None self.ensemble_indices_ = None self._debug_mode = debug_mode self._backend = Backend(self._output_dir, self._tmp_dir) def start_automl(self, parser): self._backend.save_start_time(self._seed) self._stopwatch = StopWatch() datamanager = get_data_manager(namespace=parser) self._stopwatch.start_task(datamanager.name) logger_name = 'AutoML(%d):%s' % (self._seed, datamanager.name) setup_logger(os.path.join(self._tmp_dir, '%s.log' % str(logger_name))) self._logger = get_logger(logger_name) self._datamanager = datamanager self._dataset_name = datamanager.name self.start() def start(self): if self._datamanager is None: raise ValueError('You must invoke start() only via start_automl()') super(AutoML, self).start() def run(self): if self._datamanager is None: raise ValueError('You must invoke run() only via start_automl()') self._fit(self._datamanager) def fit(self, X, y, task=MULTICLASS_CLASSIFICATION, metric='acc_metric', feat_type=None, dataset_name=None): if dataset_name is None: m = hashlib.md5() m.update(X.data) dataset_name = m.hexdigest() self._backend.save_start_time(self._seed) self._stopwatch = StopWatch() self._dataset_name = dataset_name self._stopwatch.start_task(self._dataset_name) logger_name = 'AutoML(%d):%s' % (self._seed, dataset_name) setup_logger(os.path.join(self._tmp_dir, '%s.log' % str(logger_name))) self._logger = get_logger(logger_name) if isinstance(metric, str): metric = STRING_TO_METRIC[metric] if feat_type is not None and len(feat_type) != X.shape[1]: raise ValueError('Array feat_type does not have same number of ' 'variables as X has features. %d vs %d.' % (len(feat_type), X.shape[1])) if feat_type is not None and not all([isinstance(f, bool) for f in feat_type]): raise ValueError('Array feat_type must only contain bools.') loaded_data_manager = XYDataManager(X, y, task=task, metric=metric, feat_type=feat_type, dataset_name=dataset_name, encode_labels=False) return self._fit(loaded_data_manager) def fit_automl_dataset(self, dataset): self._stopwatch = StopWatch() self._backend.save_start_time(self._seed) name = os.path.basename(dataset) self._stopwatch.start_task(name) self._start_task(self._stopwatch, name) self._dataset_name = name logger_name = 'AutoML(%d):%s' % (self._seed, name) setup_logger(os.path.join(self._tmp_dir, '%s.log' % str(logger_name))) self._logger = get_logger(logger_name) self._logger.debug('======== Reading and converting data ==========') # Encoding the labels will be done after the metafeature calculation! loaded_data_manager = CompetitionDataManager( dataset, encode_labels=False, max_memory_in_mb=float(self._ml_memory_limit) / 3) loaded_data_manager_str = str(loaded_data_manager).split('\n') for part in loaded_data_manager_str: self._logger.debug(part) return self._fit(loaded_data_manager) @staticmethod def _start_task(watcher, task_name): watcher.start_task(task_name) @staticmethod def _stop_task(watcher, task_name): watcher.stop_task(task_name) @staticmethod def _print_load_time(basename, time_left_for_this_task, time_for_load_data, logger): time_left_after_reading = max( 0, time_left_for_this_task - time_for_load_data) logger.info('Remaining time after reading %s %5.2f sec' % (basename, time_left_after_reading)) return time_for_load_data def _do_dummy_prediction(self, datamanager): autosklearn.cli.base_interface.main(datamanager, self._resampling_strategy, None, None, mode_args=self._resampling_strategy_arguments) def _fit(self, datamanager): # Reset learnt stuff self.models_ = None self.ensemble_indices_ = None # Check arguments prior to doing anything! if self._resampling_strategy not in ['holdout', 'holdout-iterative-fit', 'cv', 'nested-cv', 'partial-cv']: raise ValueError('Illegal resampling strategy: %s' % self._resampling_strategy) if self._resampling_strategy == 'partial-cv' and \ self._ensemble_size != 0: raise ValueError("Resampling strategy partial-cv cannot be used " "together with ensembles.") self._backend._make_internals_directory() if self._keep_models: try: os.mkdir(self._backend.get_model_dir()) except OSError: self._logger.warning("model directory already exists") if not self._shared_mode: raise self._metric = datamanager.info['metric'] self._task = datamanager.info['task'] self._label_num = datamanager.info['label_num'] set_auto_seed(self._seed) # == Pickle the data manager, here, because no more global # OneHotEncoding data_manager_path = self._backend.save_datamanager(datamanager) self._save_ensemble_data( datamanager.data['X_train'], datamanager.data['Y_train']) time_for_load_data = self._stopwatch.wall_elapsed(self._dataset_name) if self._debug_mode: self._print_load_time( self._dataset_name, self._time_for_task, time_for_load_data, self._logger) # == Perform dummy predictions self._do_dummy_prediction(datamanager) # = Create a searchspace # Do this before One Hot Encoding to make sure that it creates a # search space for a dense classifier even if one hot encoding would # make it sparse (tradeoff; if one hot encoding would make it sparse, # densifier and truncatedSVD would probably lead to a MemoryError, # like this we can't use some of the preprocessing methods in case # the data became sparse) self.configuration_space, configspace_path = _create_search_space( self._tmp_dir, datamanager.info, self._backend, self._stopwatch, self._logger, self._include_estimators, self._include_preprocessors) self.configuration_space_created_hook(datamanager) # == Calculate metafeatures meta_features = _calculate_metafeatures( data_feat_type=datamanager.feat_type, data_info_task=datamanager.info['task'], x_train=datamanager.data['X_train'], y_train=datamanager.data['Y_train'], basename=self._dataset_name, watcher=self._stopwatch, metalearning_cnt=self._initial_configurations_via_metalearning, logger=self._logger) self._stopwatch.start_task('OneHot') datamanager.perform1HotEncoding() self._stopwatch.stop_task('OneHot') if meta_features is None: initial_configurations = [] elif datamanager.info['task'] in [MULTICLASS_CLASSIFICATION, BINARY_CLASSIFICATION, MULTILABEL_CLASSIFICATION]: meta_features_encoded = _calculate_metafeatures_encoded( self._dataset_name, datamanager.data['X_train'], datamanager.data['Y_train'], self._stopwatch, self._logger) self._logger.debug(meta_features.__repr__(verbosity=2)) self._logger.debug(meta_features_encoded.__repr__(verbosity=2)) initial_configurations = _get_initial_configuration( meta_features, meta_features_encoded, self._dataset_name, self._metric, self.configuration_space, self._task, self._metadata_directory, self._initial_configurations_via_metalearning, datamanager.info[ 'is_sparse'], self._stopwatch, self._logger) _print_debug_info_of_init_configuration( initial_configurations, self._dataset_name, self._time_for_task, self._logger, self._stopwatch) else: initial_configurations = [] self._logger.warning('Metafeatures encoded not calculated') # == RUN SMAC if (datamanager.info["task"] == BINARY_CLASSIFICATION) or \ (datamanager.info["task"] == MULTICLASS_CLASSIFICATION): config = {'balancing:strategy': 'weighting', 'classifier:__choice__': 'sgd', 'classifier:sgd:loss': 'hinge', 'classifier:sgd:penalty': 'l2', 'classifier:sgd:alpha': 0.0001, 'classifier:sgd:fit_intercept': 'True', 'classifier:sgd:n_iter': 5, 'classifier:sgd:learning_rate': 'optimal', 'classifier:sgd:eta0': 0.01, 'classifier:sgd:average': 'True', 'imputation:strategy': 'mean', 'one_hot_encoding:use_minimum_fraction': 'True', 'one_hot_encoding:minimum_fraction': 0.1, 'preprocessor:__choice__': 'no_preprocessing', 'rescaling:__choice__': 'min/max'} elif datamanager.info["task"] == MULTILABEL_CLASSIFICATION: config = {'classifier:__choice__': 'adaboost', 'classifier:adaboost:algorithm': 'SAMME.R', 'classifier:adaboost:learning_rate': 1.0, 'classifier:adaboost:max_depth': 1, 'classifier:adaboost:n_estimators': 50, 'balancing:strategy': 'weighting', 'imputation:strategy': 'mean', 'one_hot_encoding:use_minimum_fraction': 'True', 'one_hot_encoding:minimum_fraction': 0.1, 'preprocessor:__choice__': 'no_preprocessing', 'rescaling:__choice__': 'none'} else: config = None self._logger.info("Tasktype unknown: %s" % TASK_TYPES_TO_STRING[datamanager.info["task"]]) if config is not None: configuration = Configuration(self.configuration_space, config) config_string = convert_conf2smac_string(configuration) initial_configurations = [config_string] + initial_configurations # == RUN SMAC proc_smac = run_smac(tmp_dir=self._tmp_dir, basename=self._dataset_name, time_for_task=self._time_for_task, ml_memory_limit=self._ml_memory_limit, data_manager_path=data_manager_path, configspace_path=configspace_path, initial_configurations=initial_configurations, per_run_time_limit=self._per_run_time_limit, watcher=self._stopwatch, backend=self._backend, seed=self._seed, resampling_strategy=self._resampling_strategy, resampling_strategy_arguments=self._resampling_strategy_arguments, shared_mode=self._shared_mode) # == RUN ensemble builder proc_ensembles = self.run_ensemble_builder() procs = [] if proc_smac is not None: procs.append(proc_smac) if proc_ensembles is not None: procs.append(proc_ensembles) if self._queue is not None: self._queue.put([time_for_load_data, data_manager_path, procs]) else: for proc in procs: proc.wait() # Delete AutoSklearn environment variable del_auto_seed() # In case try: del self._datamanager except Exception: pass if self._queue is None: self._load_models() return self def run_ensemble_builder(self, time_left_for_ensembles=None, max_iterations=None, ensemble_size=None): if self._ensemble_size > 0 or ensemble_size is not None: task_name = 'runEnsemble' self._stopwatch.start_task(task_name) if time_left_for_ensembles is None: time_left_for_ensembles = max(0, self._time_for_task - self._stopwatch.wall_elapsed( self._dataset_name)) if max_iterations is None: max_iterations = -1 if ensemble_size is None: ensemble_size = self._ensemble_size # It can happen that run_ensemble_builder is called without # calling fit. if self._logger: self._logger.info( 'Start Ensemble with %5.2fsec time left' % time_left_for_ensembles) proc_ensembles = submit_process.run_ensemble_builder( tmp_dir=self._tmp_dir, dataset_name=self._dataset_name, task_type=self._task, metric=self._metric, limit=time_left_for_ensembles, output_dir=self._output_dir, ensemble_size=ensemble_size, ensemble_nbest=self._ensemble_nbest, seed=self._seed, shared_mode=self._shared_mode, max_iterations=max_iterations, precision=self.precision ) self._stopwatch.stop_task(task_name) return proc_ensembles else: self._logger.info('Not starting ensemble script due to ensemble ' 'size 0.') return None def predict(self, X): if self._keep_models is not True: raise ValueError( "Predict can only be called if 'keep_models==True'") if self._resampling_strategy not in ['holdout', 'holdout-iterative-fit']: raise NotImplementedError( 'Predict is currently only implemented for resampling ' 'strategy holdout.') if self.models_ is None or len(self.models_) == 0 or len( self.ensemble_indices_) == 0: self._load_models() predictions = [] for identifier in self.models_: if identifier not in self.ensemble_indices_: continue weight = self.ensemble_indices_[identifier] model = self.models_[identifier] X_ = X.copy() if self._task in REGRESSION_TASKS: prediction = model.predict(X_) else: prediction = model.predict_proba(X_) if len(prediction.shape) < 1 or len(X_.shape) < 1 or \ X_.shape[0] < 1 or prediction.shape[0] != X_.shape[0]: self._logger.warning("Prediction shape for model %s is %s " "while X_.shape is %s" % (model, str(prediction.shape), str(X_.shape))) predictions.append(prediction * weight) if len(predictions) == 0: raise ValueError('Something went wrong generating the predictions. ' 'The ensemble should consist of the following ' 'models: %s, the following models were loaded: ' '%s' % (str(list(self.ensemble_indices_.keys())), str(list(self.models_.keys())))) predictions = np.sum(np.array(predictions), axis=0) return predictions def _load_models(self): if self._shared_mode: seed = -1 else: seed = self._seed self.models_ = self._backend.load_all_models(seed) if len(self.models_) == 0: raise ValueError('No models fitted!') self.ensemble_indices_ = self._backend.load_ensemble_indices_weights( seed) def score(self, X, y): # fix: Consider only index 1 of second dimension # Don't know if the reshaping should be done there or in calculate_score prediction = self.predict(X) if self._task == BINARY_CLASSIFICATION: prediction = prediction[:, 1].reshape((-1, 1)) return calculate_score(y, prediction, self._task, self._metric, self._label_num, logger=self._logger) def show_models(self): if self.models_ is None or len(self.models_) == 0 or len( self.ensemble_indices_) == 0: self._load_models() output = [] sio = six.StringIO() for identifier in self.models_: if identifier not in self.ensemble_indices_: continue weight = self.ensemble_indices_[identifier] model = self.models_[identifier] output.append((weight, model)) output.sort(reverse=True) sio.write("[") for weight, model in output: sio.write("(%f, %s),\n" % (weight, model)) sio.write("]") return sio.getvalue() def _save_ensemble_data(self, X, y): """Split dataset and store Data for the ensemble script. :param X: :param y: :return: """ task_name = 'LoadData' self._start_task(self._stopwatch, task_name) _, _, _, y_ensemble = resampling.split_data(X, y) self._backend.save_targets_ensemble(y_ensemble) self._stop_task(self._stopwatch, task_name) def configuration_space_created_hook(self, datamanager): pass def get_params(self, deep=True): raise NotImplementedError('auto-sklearn does not implement ' 'get_params() because it is not intended to ' 'be optimized.') def set_params(self, deep=True): raise NotImplementedError('auto-sklearn does not implement ' 'set_params() because it is not intended to ' 'be optimized.') def __del__(self): self._delete_output_directories() def _delete_output_directories(self): if self.delete_output_folder_after_terminate: try: shutil.rmtree(self._output_dir) except Exception: if self._logger is not None: self._logger.warning("Could not delete output dir: %s" % self._output_dir) else: print("Could not delete output dir: %s" % self._output_dir) if self.delete_tmp_folder_after_terminate: try: shutil.rmtree(self._tmp_dir) except Exception: if self._logger is not None: self._logger.warning("Could not delete tmp dir: %s" % self._tmp_dir) pass else: print("Could not delete tmp dir: %s" % self._tmp_dir)
class AutoML(BaseEstimator, multiprocessing.Process): def __init__(self, tmp_dir, output_dir, time_left_for_this_task, per_run_time_limit, log_dir=None, initial_configurations_via_metalearning=25, ensemble_size=1, ensemble_nbest=1, seed=1, ml_memory_limit=3000, metadata_directory=None, queue=None, keep_models=True, debug_mode=False, include_estimators=None, include_preprocessors=None, resampling_strategy='holdout-iterative-fit', resampling_strategy_arguments=None, delete_tmp_folder_after_terminate=False, delete_output_folder_after_terminate=False, shared_mode=False, precision=32): super(AutoML, self).__init__() self._tmp_dir = tmp_dir self._output_dir = output_dir self._time_for_task = time_left_for_this_task self._per_run_time_limit = per_run_time_limit self._log_dir = log_dir if log_dir is not None else self._tmp_dir self._initial_configurations_via_metalearning = \ initial_configurations_via_metalearning self._ensemble_size = ensemble_size self._ensemble_nbest = ensemble_nbest self._seed = seed self._ml_memory_limit = ml_memory_limit self._metadata_directory = metadata_directory self._queue = queue self._keep_models = keep_models self._include_estimators = include_estimators self._include_preprocessors = include_preprocessors self._resampling_strategy = resampling_strategy self._resampling_strategy_arguments = resampling_strategy_arguments self.delete_tmp_folder_after_terminate = \ delete_tmp_folder_after_terminate self.delete_output_folder_after_terminate = \ delete_output_folder_after_terminate self._shared_mode = shared_mode self.precision = precision self._datamanager = None self._dataset_name = None self._stopwatch = StopWatch() self._logger = None self._task = None self._metric = None self._label_num = None self.models_ = None self.ensemble_ = None self._can_predict = False self._debug_mode = debug_mode self._backend = Backend(self._output_dir, self._tmp_dir) def start_automl(self, parser): self._backend.save_start_time(self._seed) self._stopwatch = StopWatch() datamanager = get_data_manager(namespace=parser) self._stopwatch.start_task(datamanager.name) logger_name = 'AutoML(%d):%s' % (self._seed, datamanager.name) setup_logger(os.path.join(self._tmp_dir, '%s.log' % str(logger_name))) self._logger = get_logger(logger_name) self._datamanager = datamanager self._dataset_name = datamanager.name self.start() def start(self): if self._datamanager is None: raise ValueError('You must invoke start() only via start_automl()') super(AutoML, self).start() def run(self): if self._datamanager is None: raise ValueError('You must invoke run() only via start_automl()') self._fit(self._datamanager) def fit(self, X, y, task=MULTICLASS_CLASSIFICATION, metric='acc_metric', feat_type=None, dataset_name=None): if dataset_name is None: m = hashlib.md5() m.update(X.data) dataset_name = m.hexdigest() self._backend.save_start_time(self._seed) self._stopwatch = StopWatch() self._dataset_name = dataset_name self._stopwatch.start_task(self._dataset_name) logger_name = 'AutoML(%d):%s' % (self._seed, dataset_name) setup_logger(os.path.join(self._tmp_dir, '%s.log' % str(logger_name))) self._logger = get_logger(logger_name) if isinstance(metric, str): metric = STRING_TO_METRIC[metric] if feat_type is not None and len(feat_type) != X.shape[1]: raise ValueError('Array feat_type does not have same number of ' 'variables as X has features. %d vs %d.' % (len(feat_type), X.shape[1])) if feat_type is not None and not all([isinstance(f, str) for f in feat_type]): raise ValueError('Array feat_type must only contain strings.') if feat_type is not None: for ft in feat_type: if ft.lower() not in ['categorical', 'numerical']: raise ValueError('Only `Categorical` and `Numerical` are ' 'valid feature types, you passed `%s`' % ft) loaded_data_manager = XYDataManager(X, y, task=task, metric=metric, feat_type=feat_type, dataset_name=dataset_name, encode_labels=False) return self._fit(loaded_data_manager) def fit_automl_dataset(self, dataset): self._stopwatch = StopWatch() self._backend.save_start_time(self._seed) name = os.path.basename(dataset) self._stopwatch.start_task(name) self._start_task(self._stopwatch, name) self._dataset_name = name logger_name = 'AutoML(%d):%s' % (self._seed, name) setup_logger(os.path.join(self._tmp_dir, '%s.log' % str(logger_name))) self._logger = get_logger(logger_name) self._logger.debug('======== Reading and converting data ==========') # Encoding the labels will be done after the metafeature calculation! loaded_data_manager = CompetitionDataManager( dataset, encode_labels=False, max_memory_in_mb=float(self._ml_memory_limit) / 3) loaded_data_manager_str = str(loaded_data_manager).split('\n') for part in loaded_data_manager_str: self._logger.debug(part) return self._fit(loaded_data_manager) @staticmethod def _start_task(watcher, task_name): watcher.start_task(task_name) @staticmethod def _stop_task(watcher, task_name): watcher.stop_task(task_name) @staticmethod def _print_load_time(basename, time_left_for_this_task, time_for_load_data, logger): time_left_after_reading = max( 0, time_left_for_this_task - time_for_load_data) logger.info('Remaining time after reading %s %5.2f sec' % (basename, time_left_after_reading)) return time_for_load_data def _do_dummy_prediction(self, datamanager): self._logger.info("Starting to create dummy predictions.") autosklearn.cli.base_interface.main(datamanager, self._resampling_strategy, None, None, mode_args=self._resampling_strategy_arguments, output_dir=self._tmp_dir) self._logger.info("Finished creating dummy predictions.") def _fit(self, datamanager): # Reset learnt stuff self.models_ = None self.ensemble_ = None # Check arguments prior to doing anything! if self._resampling_strategy not in ['holdout', 'holdout-iterative-fit', 'cv', 'nested-cv', 'partial-cv']: raise ValueError('Illegal resampling strategy: %s' % self._resampling_strategy) if self._resampling_strategy == 'partial-cv' and \ self._ensemble_size != 0: raise ValueError("Resampling strategy partial-cv cannot be used " "together with ensembles.") self._backend._make_internals_directory() if self._keep_models: try: os.mkdir(self._backend.get_model_dir()) except OSError: self._logger.warning("model directory already exists") if not self._shared_mode: raise self._metric = datamanager.info['metric'] self._task = datamanager.info['task'] self._label_num = datamanager.info['label_num'] set_auto_seed(self._seed) # == Pickle the data manager, here, because no more global # OneHotEncoding data_manager_path = self._backend.save_datamanager(datamanager) self._save_ensemble_data( datamanager.data['X_train'], datamanager.data['Y_train']) time_for_load_data = self._stopwatch.wall_elapsed(self._dataset_name) if self._debug_mode: self._print_load_time( self._dataset_name, self._time_for_task, time_for_load_data, self._logger) # == Perform dummy predictions if self._resampling_strategy in ['holdout', 'holdout-iterative-fit']: self._do_dummy_prediction(datamanager) # = Create a searchspace # Do this before One Hot Encoding to make sure that it creates a # search space for a dense classifier even if one hot encoding would # make it sparse (tradeoff; if one hot encoding would make it sparse, # densifier and truncatedSVD would probably lead to a MemoryError, # like this we can't use some of the preprocessing methods in case # the data became sparse) self.configuration_space, configspace_path = _create_search_space( self._tmp_dir, datamanager.info, self._backend, self._stopwatch, self._logger, self._include_estimators, self._include_preprocessors) self.configuration_space_created_hook(datamanager) # == RUN ensemble builder # Do this before calculating the meta-features to make sure that the # dummy predictions are actually included in the ensemble even if # calculating the meta-features takes very long proc_ensembles = self.run_ensemble_builder() # == Calculate metafeatures meta_features = _calculate_metafeatures( data_feat_type=datamanager.feat_type, data_info_task=datamanager.info['task'], x_train=datamanager.data['X_train'], y_train=datamanager.data['Y_train'], basename=self._dataset_name, watcher=self._stopwatch, metalearning_cnt=self._initial_configurations_via_metalearning, logger=self._logger) self._stopwatch.start_task('OneHot') datamanager.perform1HotEncoding() self._stopwatch.stop_task('OneHot') if meta_features is None: initial_configurations = [] elif datamanager.info['task'] in [MULTICLASS_CLASSIFICATION, BINARY_CLASSIFICATION, MULTILABEL_CLASSIFICATION]: meta_features_encoded = _calculate_metafeatures_encoded( self._dataset_name, datamanager.data['X_train'], datamanager.data['Y_train'], self._stopwatch, self._logger) self._logger.debug(meta_features.__repr__(verbosity=2)) self._logger.debug(meta_features_encoded.__repr__(verbosity=2)) initial_configurations = _get_initial_configuration( meta_features, meta_features_encoded, self._dataset_name, self._metric, self.configuration_space, self._task, self._metadata_directory, self._initial_configurations_via_metalearning, datamanager.info[ 'is_sparse'], self._stopwatch, self._logger) _print_debug_info_of_init_configuration( initial_configurations, self._dataset_name, self._time_for_task, self._logger, self._stopwatch) else: initial_configurations = [] self._logger.warning('Metafeatures encoded not calculated') # == RUN SMAC if (datamanager.info["task"] == BINARY_CLASSIFICATION) or \ (datamanager.info["task"] == MULTICLASS_CLASSIFICATION): config = {'balancing:strategy': 'weighting', 'classifier:__choice__': 'sgd', 'classifier:sgd:loss': 'hinge', 'classifier:sgd:penalty': 'l2', 'classifier:sgd:alpha': 0.0001, 'classifier:sgd:fit_intercept': 'True', 'classifier:sgd:n_iter': 5, 'classifier:sgd:learning_rate': 'optimal', 'classifier:sgd:eta0': 0.01, 'classifier:sgd:average': 'True', 'imputation:strategy': 'mean', 'one_hot_encoding:use_minimum_fraction': 'True', 'one_hot_encoding:minimum_fraction': 0.1, 'preprocessor:__choice__': 'no_preprocessing', 'rescaling:__choice__': 'min/max'} elif datamanager.info["task"] == MULTILABEL_CLASSIFICATION: config = {'classifier:__choice__': 'adaboost', 'classifier:adaboost:algorithm': 'SAMME.R', 'classifier:adaboost:learning_rate': 1.0, 'classifier:adaboost:max_depth': 1, 'classifier:adaboost:n_estimators': 50, 'balancing:strategy': 'weighting', 'imputation:strategy': 'mean', 'one_hot_encoding:use_minimum_fraction': 'True', 'one_hot_encoding:minimum_fraction': 0.1, 'preprocessor:__choice__': 'no_preprocessing', 'rescaling:__choice__': 'none'} else: config = None self._logger.info("Tasktype unknown: %s" % TASK_TYPES_TO_STRING[datamanager.info["task"]]) if config is not None: try: configuration = Configuration(self.configuration_space, config) config_string = convert_conf2smac_string(configuration) initial_configurations = [config_string] + initial_configurations except ValueError: pass # == RUN SMAC proc_smac = run_smac(tmp_dir=self._tmp_dir, basename=self._dataset_name, time_for_task=self._time_for_task, ml_memory_limit=self._ml_memory_limit, data_manager_path=data_manager_path, configspace_path=configspace_path, initial_configurations=initial_configurations, per_run_time_limit=self._per_run_time_limit, watcher=self._stopwatch, backend=self._backend, seed=self._seed, resampling_strategy=self._resampling_strategy, resampling_strategy_arguments=self._resampling_strategy_arguments, shared_mode=self._shared_mode) procs = [] if proc_smac is not None: procs.append(proc_smac) if proc_ensembles is not None: procs.append(proc_ensembles) if self._queue is not None: self._queue.put([time_for_load_data, data_manager_path, procs]) else: for proc in procs: proc.wait() # Delete AutoSklearn environment variable del_auto_seed() # In case try: del self._datamanager except Exception: pass if self._queue is None: self._load_models() return self def run_ensemble_builder(self, time_left_for_ensembles=None, max_iterations=None, ensemble_size=None): if self._ensemble_size > 0 or ensemble_size is not None: task_name = 'runEnsemble' self._stopwatch.start_task(task_name) if time_left_for_ensembles is None: time_left_for_ensembles = max(0, self._time_for_task - self._stopwatch.wall_elapsed( self._dataset_name)) if max_iterations is None: max_iterations = -1 if ensemble_size is None: ensemble_size = self._ensemble_size # It can happen that run_ensemble_builder is called without # calling fit. if self._logger: self._logger.info( 'Start Ensemble with %5.2fsec time left' % time_left_for_ensembles) proc_ensembles = submit_process.run_ensemble_builder( tmp_dir=self._tmp_dir, dataset_name=self._dataset_name, task_type=self._task, metric=self._metric, limit=time_left_for_ensembles, output_dir=self._output_dir, ensemble_size=ensemble_size, ensemble_nbest=self._ensemble_nbest, seed=self._seed, shared_mode=self._shared_mode, max_iterations=max_iterations, precision=self.precision ) self._stopwatch.stop_task(task_name) return proc_ensembles else: self._logger.info('Not starting ensemble script due to ensemble ' 'size 0.') return None def refit(self, X, y): if self._keep_models is not True: raise ValueError( "Predict can only be called if 'keep_models==True'") if self.models_ is None or len(self.models_) == 0 or \ self.ensemble_ is None: self._load_models() for identifier in self.models_: if identifier in self.ensemble_.get_model_identifiers(): model = self.models_[identifier] # this updates the model inplace, it can then later be used in # predict method model.fit(X.copy(), y.copy()) self._can_predict = True def predict(self, X): return np.argmax(self.predict_proba(X), axis=1) def predict_proba(self, X): if self._keep_models is not True: raise ValueError( "Predict can only be called if 'keep_models==True'") if not self._can_predict and \ self._resampling_strategy not in \ ['holdout', 'holdout-iterative-fit']: raise NotImplementedError( 'Predict is currently only implemented for resampling ' 'strategy holdout.') if self.models_ is None or len(self.models_) == 0 or \ self.ensemble_ is None: self._load_models() all_predictions = [] for identifier in self.ensemble_.get_model_identifiers(): model = self.models_[identifier] X_ = X.copy() if self._task in REGRESSION_TASKS: prediction = model.predict(X_) else: prediction = model.predict_proba(X_) if len(prediction.shape) < 1 or len(X_.shape) < 1 or \ X_.shape[0] < 1 or prediction.shape[0] != X_.shape[0]: self._logger.warning("Prediction shape for model %s is %s " "while X_.shape is %s" % (model, str(prediction.shape), str(X_.shape))) all_predictions.append(prediction) if len(all_predictions) == 0: raise ValueError('Something went wrong generating the predictions. ' 'The ensemble should consist of the following ' 'models: %s, the following models were loaded: ' '%s' % (str(list(self.ensemble_indices_.keys())), str(list(self.models_.keys())))) predictions = self.ensemble_.predict(all_predictions) return predictions def _load_models(self): if self._shared_mode: seed = -1 else: seed = self._seed self.ensemble_ = self._backend.load_ensemble(seed) if self.ensemble_: identifiers = self.ensemble_.identifiers_ self.models_ = self._backend.load_models_by_identifiers(identifiers) else: self.models_ = self._backend.load_all_models(seed) if len(self.models_) == 0: raise ValueError('No models fitted!') def score(self, X, y): # fix: Consider only index 1 of second dimension # Don't know if the reshaping should be done there or in calculate_score prediction = self.predict_proba(X) return calculate_score(y, prediction, self._task, self._metric, self._label_num, logger=self._logger) def show_models(self): if self.models_ is None or len(self.models_) == 0 or \ self.ensemble_ is None: self._load_models() return self.ensemble_.pprint_ensemble_string(self.models_) def _save_ensemble_data(self, X, y): """Split dataset and store Data for the ensemble script. :param X: :param y: :return: """ task_name = 'LoadData' self._start_task(self._stopwatch, task_name) _, _, _, y_ensemble = resampling.split_data(X, y) self._backend.save_targets_ensemble(y_ensemble) self._stop_task(self._stopwatch, task_name) def configuration_space_created_hook(self, datamanager): pass def get_params(self, deep=True): raise NotImplementedError('auto-sklearn does not implement ' 'get_params() because it is not intended to ' 'be optimized.') def set_params(self, deep=True): raise NotImplementedError('auto-sklearn does not implement ' 'set_params() because it is not intended to ' 'be optimized.') def __del__(self): self._delete_output_directories() def _delete_output_directories(self): if self.delete_output_folder_after_terminate: try: shutil.rmtree(self._output_dir) except Exception: if self._logger is not None: self._logger.warning("Could not delete output dir: %s" % self._output_dir) else: print("Could not delete output dir: %s" % self._output_dir) if self.delete_tmp_folder_after_terminate: try: shutil.rmtree(self._tmp_dir) except Exception: if self._logger is not None: self._logger.warning("Could not delete tmp dir: %s" % self._tmp_dir) pass else: print("Could not delete tmp dir: %s" % self._tmp_dir)
class AbstractEvaluator(object): __metaclass__ = abc.ABCMeta @abc.abstractmethod def __init__(self, Datamanager, output_dir, configuration=None, with_predictions=False, all_scoring_functions=False, seed=1, output_y_test=False, num_run=None): self.starttime = time.time() self.output_dir = output_dir self.configuration = configuration self.D = Datamanager self.X_valid = Datamanager.data.get('X_valid') self.X_test = Datamanager.data.get('X_test') self.metric = Datamanager.info['metric'] self.task_type = Datamanager.info['task'] self.seed = seed self.output_y_test = output_y_test self.with_predictions = with_predictions self.all_scoring_functions = all_scoring_functions if self.task_type in REGRESSION_TASKS: if self.configuration is None: self.model_class = MyDummyRegressor else: self.model_class = \ autosklearn.pipeline.regression.SimpleRegressionPipeline self.predict_function = self._predict_regression else: if self.configuration is None: self.model_class = MyDummyClassifier else: self.model_class = \ autosklearn.pipeline.classification.SimpleClassificationPipeline self.predict_function = self._predict_proba if num_run is None: num_run = get_new_run_num() self.num_run = num_run self.backend = Backend(None, self.output_dir) self.model = self.model_class(self.configuration, self.seed) def fit_predict_and_loss(self): """Fit model(s) according to resampling strategy, predict for the validation set and return the loss and predictions on the validation set. Provides a closed interface in which all steps of the target algorithm are performed without any communication with other processes. Useful for cross-validation because it allows to train a model, predict for the validation set and then forget the model in order to save main memory. """ raise NotImplementedError() def iterative_fit(self): """Fit a model iteratively. Fitting can be interrupted in order to use a partially trained model.""" raise NotImplementedError() def predict_and_loss(self): """Use current model to predict on the validation set and calculate loss. Should be used when using iterative fitting.""" raise NotImplementedError() def predict(self): """Use the current model to predict on the validation set. Should only be used to create dummy predictions.""" raise NotImplementedError() def _loss(self, y_true, y_hat): if self.configuration is None: if self.all_scoring_functions: return {self.metric: 1.0} else: return 1.0 score = calculate_score( y_true, y_hat, self.task_type, self.metric, self.D.info['label_num'], all_scoring_functions=self.all_scoring_functions) if hasattr(score, '__len__'): err = {key: 1 - score[key] for key in score} else: err = 1 - score return err def finish_up(self, loss=None, opt_pred=None, valid_pred=None, test_pred=None): """This function does everything necessary after the fitting is done: * predicting * saving the files for the ensembles_statistics * generate output for SMAC We use it as the signal handler so we can recycle the code for the normal usecase and when the runsolver kills us here :)""" try: self.duration = time.time() - self.starttime if loss is None: loss, opt_pred, valid_pred, test_pred = self.predict_and_loss() self.file_output(loss, opt_pred, valid_pred, test_pred) self.duration = time.time() - self.starttime num_run = str(self.num_run).zfill(5) if isinstance(loss, dict): loss_ = loss loss = loss_[self.D.info['metric']] else: loss_ = {} additional_run_info = ';'.join(['%s: %s' % (METRIC_TO_STRING[ metric] if metric in METRIC_TO_STRING else metric, value) for metric, value in loss_.items()]) additional_run_info += ';' + 'duration: ' + str(self.duration) additional_run_info += ';' + 'num_run:' + num_run if self.configuration is not None: self._output_SMAC_string(self.duration, loss, self.seed, additional_run_info) except Exception as e: self.duration = time.time() - self.starttime print(traceback.format_exc()) self._output_SMAC_string(self.duration, 2.0, self.seed, 'No results were produced! Error is %s' % str(e)) def _output_SMAC_string(self, duration, loss, seed, additional_run_info): print('Result for ParamILS: %s, %f, 1, %f, %d, %s' % ('SAT', abs(self.duration), loss, self.seed, additional_run_info)) def file_output(self, loss, Y_optimization_pred, Y_valid_pred, Y_test_pred): seed = os.environ.get('AUTOSKLEARN_SEED') if self.Y_optimization.shape[0] != Y_optimization_pred.shape[0]: return 2, "Targets %s and prediction %s don't have the same " \ "length. Probably training didn't finish" % ( self.Y_optimization.shape, Y_optimization_pred.shape) num_run = str(self.num_run).zfill(5) if os.path.exists(self.backend.get_model_dir()): self.backend.save_model(self.model, self.num_run, seed) if self.output_y_test: try: os.makedirs(self.output_dir) except OSError: pass self.backend.save_targets_ensemble(self.Y_optimization) self.backend.save_predictions_as_npy(Y_optimization_pred, 'ensemble', seed, num_run) if Y_valid_pred is not None: self.backend.save_predictions_as_npy(Y_valid_pred, 'valid', seed, num_run) if Y_test_pred is not None: self.backend.save_predictions_as_npy(Y_test_pred, 'test', seed, num_run) def _predict_proba(self, X, model, task_type, Y_train): Y_pred = model.predict_proba(X, batch_size=1000) Y_pred = self._ensure_prediction_array_sizes(Y_pred, Y_train) return Y_pred def _predict_regression(self, X, model, task_type, Y_train=None): Y_pred = model.predict(X) if len(Y_pred.shape) == 1: Y_pred = Y_pred.reshape((-1, 1)) return Y_pred def _ensure_prediction_array_sizes(self, prediction, Y_train): num_classes = self.D.info['label_num'] if self.task_type == MULTICLASS_CLASSIFICATION and \ prediction.shape[1] < num_classes: if Y_train is None: raise ValueError('Y_train must not be None!') classes = list(np.unique(Y_train)) mapping = dict() for class_number in range(num_classes): if class_number in classes: index = classes.index(class_number) mapping[index] = class_number new_predictions = np.zeros((prediction.shape[0], num_classes), dtype=np.float32) for index in mapping: class_index = mapping[index] new_predictions[:, class_index] = prediction[:, index] return new_predictions return prediction
class AbstractEvaluator(object): __metaclass__ = abc.ABCMeta @abc.abstractmethod def __init__(self, Datamanager, configuration=None, with_predictions=False, all_scoring_functions=False, seed=1, output_dir=None, output_y_test=False, num_run=None): self.starttime = time.time() self.configuration = configuration self.D = Datamanager self.X_valid = Datamanager.data.get('X_valid') self.X_test = Datamanager.data.get('X_test') self.metric = Datamanager.info['metric'] self.task_type = Datamanager.info['task'] self.seed = seed if output_dir is None: self.output_dir = os.getcwd() else: self.output_dir = output_dir self.output_y_test = output_y_test self.with_predictions = with_predictions self.all_scoring_functions = all_scoring_functions if self.task_type in REGRESSION_TASKS: if self.configuration is None: self.model_class = MyDummyRegressor else: self.model_class = ParamSklearnRegressor self.predict_function = self.predict_regression else: if self.configuration is None: self.model_class = MyDummyClassifier else: self.model_class = ParamSklearnClassifier self.predict_function = self.predict_proba if num_run is None: num_run = get_new_run_num() self.num_run = num_run self.backend = Backend(None, self.output_dir) self.model = self.model_class(self.configuration, self.seed) @abc.abstractmethod def fit(self): pass @abc.abstractmethod def predict(self): pass # This function does everything necessary after the fitting is done: # predicting # saving the files for the ensembles_statistics # generate output for SMAC # We use it as the signal handler so we can recycle the code for the # normal usecase and when the runsolver kills us here :) def finish_up(self): try: self.duration = time.time() - self.starttime result, additional_run_info = self.file_output() if self.configuration is not None: print('Result for ParamILS: %s, %f, 1, %f, %d, %s' % ('SAT', abs(self.duration), result, self.seed, additional_run_info)) except Exception as e: self.duration = time.time() - self.starttime print(traceback.format_exc()) print('Result for ParamILS: %s, %f, 1, %f, %d, %s' % ('TIMEOUT', abs(self.duration), 1.0, self.seed, 'No results were produced! Error is %s' % str(e))) def file_output(self): seed = os.environ.get('AUTOSKLEARN_SEED') errs, Y_optimization_pred, Y_valid_pred, Y_test_pred = self.predict() if self.Y_optimization.shape[0] != Y_optimization_pred.shape[0]: return 2, "Targets %s and prediction %s don't have the same " \ "length. Probably training didn't finish" % ( self.Y_optimization.shape, Y_optimization_pred.shape) num_run = str(self.num_run).zfill(5) if os.path.exists(self.backend.get_model_dir()): self.backend.save_model(self.model, self.num_run, seed) if self.output_y_test: try: os.makedirs(self.output_dir) except OSError: pass self.backend.save_targets_ensemble(self.Y_optimization) self.backend.save_predictions_as_npy(Y_optimization_pred, 'ensemble', seed, num_run) if Y_valid_pred is not None: self.backend.save_predictions_as_npy(Y_valid_pred, 'valid', seed, num_run) if Y_test_pred is not None: self.backend.save_predictions_as_npy(Y_test_pred, 'test', seed, num_run) self.duration = time.time() - self.starttime err = errs[self.D.info['metric']] additional_run_info = ';'.join(['%s: %s' % (METRIC_TO_STRING[metric] if metric in METRIC_TO_STRING else metric, value) for metric, value in errs.items()]) additional_run_info += ';' + 'duration: ' + str(self.duration) additional_run_info += ';' + 'num_run:' + num_run return err, additional_run_info def predict_proba(self, X, model, task_type, Y_train=None): Y_pred = model.predict_proba(X, batch_size=1000) if task_type == MULTILABEL_CLASSIFICATION: Y_pred = np.hstack([Y_pred[i][:, -1].reshape((-1, 1)) for i in range(len(Y_pred))]) elif task_type == BINARY_CLASSIFICATION: if len(Y_pred.shape) != 1: Y_pred = Y_pred[:, 1].reshape(-1, 1) elif task_type == MULTICLASS_CLASSIFICATION: pass Y_pred = self._ensure_prediction_array_sizes(Y_pred, Y_train) return Y_pred def predict_regression(self, X, model, task_type, Y_train=None): Y_pred = model.predict(X) if len(Y_pred.shape) == 1: Y_pred = Y_pred.reshape((-1, 1)) return Y_pred def _ensure_prediction_array_sizes(self, prediction, Y_train): num_classes = self.D.info['label_num'] if self.task_type == MULTICLASS_CLASSIFICATION and \ prediction.shape[1] < num_classes: classes = list(np.unique(self.D.data['Y_train'])) if num_classes == prediction.shape[1]: return prediction if Y_train is not None: classes = list(np.unique(Y_train)) mapping = dict() for class_number in range(num_classes): if class_number in classes: index = classes.index(class_number) mapping[index] = class_number new_predictions = np.zeros((prediction.shape[0], num_classes)) for index in mapping: class_index = mapping[index] new_predictions[:, class_index] = prediction[:, index] return new_predictions return prediction
class AutoML(BaseEstimator, multiprocessing.Process): def __init__(self, tmp_dir, output_dir, time_left_for_this_task, per_run_time_limit, log_dir=None, initial_configurations_via_metalearning=25, ensemble_size=1, ensemble_nbest=1, seed=1, ml_memory_limit=3000, metadata_directory=None, queue=None, keep_models=True, debug_mode=False, include_estimators=None, include_preprocessors=None, resampling_strategy='holdout-iterative-fit', resampling_strategy_arguments=None, delete_tmp_folder_after_terminate=False, delete_output_folder_after_terminate=False, shared_mode=False, precision=32, max_iter_smac=None, acquisition_function='EI'): super(AutoML, self).__init__() self._tmp_dir = tmp_dir self._output_dir = output_dir self._time_for_task = time_left_for_this_task self._per_run_time_limit = per_run_time_limit self._log_dir = log_dir if log_dir is not None else self._tmp_dir self._initial_configurations_via_metalearning = \ initial_configurations_via_metalearning self._ensemble_size = ensemble_size self._ensemble_nbest = ensemble_nbest self._seed = seed self._ml_memory_limit = ml_memory_limit self._data_memory_limit = None self._metadata_directory = metadata_directory self._queue = queue self._keep_models = keep_models self._include_estimators = include_estimators self._include_preprocessors = include_preprocessors self._resampling_strategy = resampling_strategy self._resampling_strategy_arguments = resampling_strategy_arguments self._max_iter_smac = max_iter_smac self.delete_tmp_folder_after_terminate = \ delete_tmp_folder_after_terminate self.delete_output_folder_after_terminate = \ delete_output_folder_after_terminate self._shared_mode = shared_mode self.precision = precision self.acquisition_function = acquisition_function self._datamanager = None self._dataset_name = None self._stopwatch = StopWatch() self._logger = None self._task = None self._metric = None self._label_num = None self._parser = None self.models_ = None self.ensemble_ = None self._can_predict = False self._debug_mode = debug_mode if not isinstance(self._time_for_task, int): raise ValueError("time_left_for_this_task not of type integer, " "but %s" % str(type(self._time_for_task))) if not isinstance(self._per_run_time_limit, int): raise ValueError("per_run_time_limit not of type integer, but %s" % str(type(self._per_run_time_limit))) # After assignging and checking variables... self._backend = Backend(self._output_dir, self._tmp_dir) def start_automl(self, parser): self._parser = parser self.start() def start(self): if self._parser is None: raise ValueError('You must invoke start() only via start_automl()') super(AutoML, self).start() def run(self): if self._parser is None: raise ValueError('You must invoke run() only via start_automl()') self._backend.save_start_time(self._seed) self._stopwatch = StopWatch() datamanager = get_data_manager(namespace=self._parser) self._stopwatch.start_task(datamanager.name) self._logger = self._get_logger(datamanager.name) self._datamanager = datamanager self._dataset_name = datamanager.name self._fit(self._datamanager) def fit(self, X, y, task=MULTICLASS_CLASSIFICATION, metric='acc_metric', feat_type=None, dataset_name=None): if dataset_name is None: m = hashlib.md5() m.update(X.data) dataset_name = m.hexdigest() self._backend.save_start_time(self._seed) self._stopwatch = StopWatch() self._dataset_name = dataset_name self._stopwatch.start_task(self._dataset_name) self._logger = self._get_logger(dataset_name) if isinstance(metric, str): metric = STRING_TO_METRIC[metric] if feat_type is not None and len(feat_type) != X.shape[1]: raise ValueError('Array feat_type does not have same number of ' 'variables as X has features. %d vs %d.' % (len(feat_type), X.shape[1])) if feat_type is not None and not all([isinstance(f, str) for f in feat_type]): raise ValueError('Array feat_type must only contain strings.') if feat_type is not None: for ft in feat_type: if ft.lower() not in ['categorical', 'numerical']: raise ValueError('Only `Categorical` and `Numerical` are ' 'valid feature types, you passed `%s`' % ft) self._data_memory_limit = None loaded_data_manager = XYDataManager(X, y, task=task, metric=metric, feat_type=feat_type, dataset_name=dataset_name, encode_labels=False) return self._fit(loaded_data_manager) def fit_automl_dataset(self, dataset): self._stopwatch = StopWatch() self._backend.save_start_time(self._seed) name = os.path.basename(dataset) self._stopwatch.start_task(name) self._start_task(self._stopwatch, name) self._dataset_name = name self._logger = self._get_logger(name) self._logger.debug('======== Reading and converting data ==========') # Encoding the labels will be done after the metafeature calculation! self._data_memory_limit = float(self._ml_memory_limit) / 3 loaded_data_manager = CompetitionDataManager( dataset, encode_labels=False, max_memory_in_mb=self._data_memory_limit) loaded_data_manager_str = str(loaded_data_manager).split('\n') for part in loaded_data_manager_str: self._logger.debug(part) return self._fit(loaded_data_manager) def _get_logger(self, name): logger_name = 'AutoML(%d):%s' % (self._seed, name) setup_logger(os.path.join(self._tmp_dir, '%s.log' % str(logger_name))) return get_logger(logger_name) @staticmethod def _start_task(watcher, task_name): watcher.start_task(task_name) @staticmethod def _stop_task(watcher, task_name): watcher.stop_task(task_name) @staticmethod def _print_load_time(basename, time_left_for_this_task, time_for_load_data, logger): time_left_after_reading = max( 0, time_left_for_this_task - time_for_load_data) logger.info('Remaining time after reading %s %5.2f sec' % (basename, time_left_after_reading)) return time_for_load_data def _do_dummy_prediction(self, datamanager, num_run): self._logger.info("Starting to create dummy predictions.") time_limit = int(self._time_for_task / 6.) memory_limit = int(self._ml_memory_limit) _info = eval_with_limits(datamanager, self._tmp_dir, 1, self._seed, num_run, self._resampling_strategy, self._resampling_strategy_arguments, memory_limit, time_limit) if _info[4] == StatusType.SUCCESS: self._logger.info("Finished creating dummy prediction 1/2.") else: self._logger.error('Error creating dummy prediction 1/2:%s ', _info[3]) num_run += 1 _info = eval_with_limits(datamanager, self._tmp_dir, 2, self._seed, num_run, self._resampling_strategy, self._resampling_strategy_arguments, memory_limit, time_limit) if _info[4] == StatusType.SUCCESS: self._logger.info("Finished creating dummy prediction 2/2.") else: self._logger.error('Error creating dummy prediction 2/2 %s', _info[3]) num_run += 1 return num_run def _fit(self, datamanager): # Reset learnt stuff self.models_ = None self.ensemble_ = None # Check arguments prior to doing anything! if self._resampling_strategy not in ['holdout', 'holdout-iterative-fit', 'cv', 'nested-cv', 'partial-cv']: raise ValueError('Illegal resampling strategy: %s' % self._resampling_strategy) if self._resampling_strategy == 'partial-cv' and \ self._ensemble_size != 0: raise ValueError("Resampling strategy partial-cv cannot be used " "together with ensembles.") acquisition_functions = ['EI', 'EIPS'] if self.acquisition_function not in acquisition_functions: raise ValueError('Illegal acquisition %s: Must be one of %s.' % (self.acquisition_function, acquisition_functions)) self._backend._make_internals_directory() if self._keep_models: try: os.mkdir(self._backend.get_model_dir()) except OSError: self._logger.warning("model directory already exists") if not self._shared_mode: raise self._metric = datamanager.info['metric'] self._task = datamanager.info['task'] self._label_num = datamanager.info['label_num'] # == Pickle the data manager to speed up loading data_manager_path = self._backend.save_datamanager(datamanager) self._save_ensemble_data( datamanager.data['X_train'], datamanager.data['Y_train']) time_for_load_data = self._stopwatch.wall_elapsed(self._dataset_name) if self._debug_mode: self._print_load_time( self._dataset_name, self._time_for_task, time_for_load_data, self._logger) # == Perform dummy predictions num_run = 1 #if self._resampling_strategy in ['holdout', 'holdout-iterative-fit']: num_run = self._do_dummy_prediction(datamanager, num_run) # = Create a searchspace # Do this before One Hot Encoding to make sure that it creates a # search space for a dense classifier even if one hot encoding would # make it sparse (tradeoff; if one hot encoding would make it sparse, # densifier and truncatedSVD would probably lead to a MemoryError, # like this we can't use some of the preprocessing methods in case # the data became sparse) self.configuration_space, configspace_path = self._create_search_space( self._tmp_dir, self._backend, datamanager, self._include_estimators, self._include_preprocessors) # == RUN ensemble builder # Do this before calculating the meta-features to make sure that the # dummy predictions are actually included in the ensemble even if # calculating the meta-features takes very long ensemble_task_name = 'runEnsemble' self._stopwatch.start_task(ensemble_task_name) time_left_for_ensembles = max(0,self._time_for_task \ - self._stopwatch.wall_elapsed(self._dataset_name)) if self._logger: self._logger.info( 'Start Ensemble with %5.2fsec time left' % time_left_for_ensembles) if time_left_for_ensembles <= 0: self._logger.warning("Not starting ensemble builder because there " "is no time left!") self._proc_ensemble = None else: self._proc_ensemble = self._get_ensemble_process(time_left_for_ensembles) self._proc_ensemble.start() self._stopwatch.stop_task(ensemble_task_name) # == RUN SMBO # kill the datamanager as it will be re-loaded anyways from sub processes try: del self._datamanager except Exception: pass # => RUN SMAC smac_task_name = 'runSMAC' self._stopwatch.start_task(smac_task_name) time_left_for_smac = max(0, self._time_for_task - self._stopwatch.wall_elapsed( self._dataset_name)) if self._logger: self._logger.info( 'Start SMAC with %5.2fsec time left' % time_left_for_smac) if time_left_for_smac <= 0: self._logger.warning("Not starting SMAC because there is no time " "left.") self._procsmac = None else: self._proc_smac = AutoMLSMBO(config_space=self.configuration_space, dataset_name=self._dataset_name, tmp_dir=self._tmp_dir, output_dir=self._output_dir, total_walltime_limit=time_left_for_smac, func_eval_time_limit=self._per_run_time_limit, memory_limit=self._ml_memory_limit, data_memory_limit=self._data_memory_limit, watcher=self._stopwatch, start_num_run=num_run, num_metalearning_cfgs=self._initial_configurations_via_metalearning, config_file=configspace_path, smac_iters=self._max_iter_smac, seed=self._seed, metadata_directory=self._metadata_directory, resampling_strategy=self._resampling_strategy, resampling_strategy_args=self._resampling_strategy_arguments, acquisition_function=self.acquisition_function, shared_mode=self._shared_mode) self._proc_smac.start() psutil_procs = [] procs = [] if self._proc_smac is not None: proc_smac = psutil.Process(self._proc_smac.pid) psutil_procs.append(proc_smac) procs.append(self._proc_smac) if self._proc_ensemble is not None: proc_ensemble = psutil.Process(self._proc_ensemble.pid) psutil_procs.append(proc_ensemble) procs.append(self._proc_ensemble) if self._queue is not None: self._queue.put([time_for_load_data, data_manager_path, psutil_procs]) else: for proc in procs: proc.join() if self._queue is None: self._load_models() return self def refit(self, X, y): if self._keep_models is not True: raise ValueError( "Predict can only be called if 'keep_models==True'") if self.models_ is None or len(self.models_) == 0 or \ self.ensemble_ is None: self._load_models() for identifier in self.models_: if identifier in self.ensemble_.get_model_identifiers(): model = self.models_[identifier] # this updates the model inplace, it can then later be used in # predict method model.fit(X.copy(), y.copy()) self._can_predict = True def predict(self, X): return np.argmax(self.predict_proba(X), axis=1) def predict_proba(self, X): if self._keep_models is not True: raise ValueError( "Predict can only be called if 'keep_models==True'") if not self._can_predict and \ self._resampling_strategy not in \ ['holdout', 'holdout-iterative-fit']: raise NotImplementedError( 'Predict is currently only implemented for resampling ' 'strategy holdout.') if self.models_ is None or len(self.models_) == 0 or \ self.ensemble_ is None: self._load_models() all_predictions = [] for identifier in self.ensemble_.get_model_identifiers(): model = self.models_[identifier] X_ = X.copy() if self._task in REGRESSION_TASKS: prediction = model.predict(X_) else: prediction = model.predict_proba(X_) if len(prediction.shape) < 1 or len(X_.shape) < 1 or \ X_.shape[0] < 1 or prediction.shape[0] != X_.shape[0]: self._logger.warning("Prediction shape for model %s is %s " "while X_.shape is %s" % (model, str(prediction.shape), str(X_.shape))) all_predictions.append(prediction) if len(all_predictions) == 0: raise ValueError('Something went wrong generating the predictions. ' 'The ensemble should consist of the following ' 'models: %s, the following models were loaded: ' '%s' % (str(list(self.ensemble_indices_.keys())), str(list(self.models_.keys())))) predictions = self.ensemble_.predict(all_predictions) return predictions def fit_ensemble(self, task=None, metric=None, precision='32', dataset_name=None, ensemble_nbest=None, ensemble_size=None): if self._logger is None: self._logger = self._get_logger(dataset_name) self._proc_ensemble = self._get_ensemble_process( 1, task, metric, precision, dataset_name, max_iterations=1, ensemble_nbest=ensemble_nbest, ensemble_size=ensemble_size) self._proc_ensemble.main() def _get_ensemble_process(self, time_left_for_ensembles, task=None, metric=None, precision=None, dataset_name=None, max_iterations=-1, ensemble_nbest=None, ensemble_size=None): if task is None: task = self._task if metric is None: metric = self._metric if precision is None: precision = self.precision if dataset_name is None: dataset_name = self._dataset_name if ensemble_nbest is None: ensemble_nbest = self._ensemble_nbest if ensemble_size is None: ensemble_size = self._ensemble_size return EnsembleBuilder(autosklearn_tmp_dir=self._tmp_dir, dataset_name=dataset_name, task_type=task, metric=metric, limit=time_left_for_ensembles, output_dir=self._output_dir, ensemble_size=ensemble_size, ensemble_nbest=ensemble_nbest, seed=self._seed, shared_mode=self._shared_mode, precision=precision, max_iterations=max_iterations) def _load_models(self): if self._shared_mode: seed = -1 else: seed = self._seed self.ensemble_ = self._backend.load_ensemble(seed) if self.ensemble_: identifiers = self.ensemble_.identifiers_ self.models_ = self._backend.load_models_by_identifiers(identifiers) else: self.models_ = self._backend.load_all_models(seed) if len(self.models_) == 0: raise ValueError('No models fitted!') def score(self, X, y): # fix: Consider only index 1 of second dimension # Don't know if the reshaping should be done there or in calculate_score prediction = self.predict_proba(X) return calculate_score(y, prediction, self._task, self._metric, self._label_num, logger=self._logger) def show_models(self): if self.models_ is None or len(self.models_) == 0 or \ self.ensemble_ is None: self._load_models() return self.ensemble_.pprint_ensemble_string(self.models_) def _save_ensemble_data(self, X, y): """Split dataset and store Data for the ensemble script. :param X: :param y: :return: """ task_name = 'LoadData' self._start_task(self._stopwatch, task_name) _, _, _, y_ensemble = resampling.split_data(X, y) self._backend.save_targets_ensemble(y_ensemble) self._stop_task(self._stopwatch, task_name) def _create_search_space(self, tmp_dir, backend, datamanager, include_estimators=None, include_preprocessors=None): task_name = 'CreateConfigSpace' self._stopwatch.start_task(task_name) configspace_path = os.path.join(tmp_dir, 'space.pcs') configuration_space = pipeline.get_configuration_space( datamanager.info, include_estimators=include_estimators, include_preprocessors=include_preprocessors) configuration_space = self.configuration_space_created_hook( datamanager, configuration_space) sp_string = pcs.write(configuration_space) backend.write_txt_file(configspace_path, sp_string, 'Configuration space') self._stopwatch.stop_task(task_name) return configuration_space, configspace_path def configuration_space_created_hook(self, datamanager, configuration_space): return configuration_space def get_params(self, deep=True): raise NotImplementedError('auto-sklearn does not implement ' 'get_params() because it is not intended to ' 'be optimized.') def set_params(self, deep=True): raise NotImplementedError('auto-sklearn does not implement ' 'set_params() because it is not intended to ' 'be optimized.') def __del__(self): self._delete_output_directories() def _delete_output_directories(self): if self.delete_output_folder_after_terminate: try: shutil.rmtree(self._output_dir) except Exception: if self._logger is not None: self._logger.warning("Could not delete output dir: %s" % self._output_dir) else: print("Could not delete output dir: %s" % self._output_dir) if self.delete_tmp_folder_after_terminate: try: shutil.rmtree(self._tmp_dir) except Exception: if self._logger is not None: self._logger.warning("Could not delete tmp dir: %s" % self._tmp_dir) pass else: print("Could not delete tmp dir: %s" % self._tmp_dir)
class AbstractEvaluator(object): __metaclass__ = abc.ABCMeta @abc.abstractmethod def __init__(self, Datamanager, configuration=None, with_predictions=False, all_scoring_functions=False, seed=1, output_dir=None, output_y_test=False, num_run=None): self.starttime = time.time() self.configuration = configuration self.D = Datamanager self.X_valid = Datamanager.data.get('X_valid') self.X_test = Datamanager.data.get('X_test') self.metric = Datamanager.info['metric'] self.task_type = Datamanager.info['task'] self.seed = seed if output_dir is None: self.output_dir = os.getcwd() else: self.output_dir = output_dir self.output_y_test = output_y_test self.with_predictions = with_predictions self.all_scoring_functions = all_scoring_functions if self.task_type in REGRESSION_TASKS: if self.configuration is None: self.model_class = MyDummyRegressor else: self.model_class = SimpleRegressionPipeline self.predict_function = self.predict_regression else: if self.configuration is None: self.model_class = MyDummyClassifier else: self.model_class = SimpleClassificationPipeline self.predict_function = self.predict_proba if num_run is None: num_run = get_new_run_num() self.num_run = num_run self.backend = Backend(None, self.output_dir) self.model = self.model_class(self.configuration, self.seed) @abc.abstractmethod def fit(self): pass @abc.abstractmethod def predict(self): pass # This function does everything necessary after the fitting is done: # predicting # saving the files for the ensembles_statistics # generate output for SMAC # We use it as the signal handler so we can recycle the code for the # normal usecase and when the runsolver kills us here :) def finish_up(self): try: self.duration = time.time() - self.starttime result, additional_run_info = self.file_output() if self.configuration is not None: print('Result for ParamILS: %s, %f, 1, %f, %d, %s' % ('SAT', abs(self.duration), result, self.seed, additional_run_info)) except Exception as e: self.duration = time.time() - self.starttime print(traceback.format_exc()) print('Result for ParamILS: %s, %f, 1, %f, %d, %s' % ('TIMEOUT', abs(self.duration), 1.0, self.seed, 'No results were produced! Error is %s' % str(e))) def file_output(self): seed = os.environ.get('AUTOSKLEARN_SEED') errs, Y_optimization_pred, Y_valid_pred, Y_test_pred = self.predict() if self.Y_optimization.shape[0] != Y_optimization_pred.shape[0]: return 2, "Targets %s and prediction %s don't have the same " \ "length. Probably training didn't finish" % ( self.Y_optimization.shape, Y_optimization_pred.shape) num_run = str(self.num_run).zfill(5) if os.path.exists(self.backend.get_model_dir()): self.backend.save_model(self.model, self.num_run, seed) if self.output_y_test: try: os.makedirs(self.output_dir) except OSError: pass self.backend.save_targets_ensemble(self.Y_optimization) self.backend.save_predictions_as_npy(Y_optimization_pred, 'ensemble', seed, num_run) if Y_valid_pred is not None: self.backend.save_predictions_as_npy(Y_valid_pred, 'valid', seed, num_run) if Y_test_pred is not None: self.backend.save_predictions_as_npy(Y_test_pred, 'test', seed, num_run) self.duration = time.time() - self.starttime err = errs[self.D.info['metric']] additional_run_info = ';'.join([ '%s: %s' % (METRIC_TO_STRING[metric] if metric in METRIC_TO_STRING else metric, value) for metric, value in errs.items() ]) additional_run_info += ';' + 'duration: ' + str(self.duration) additional_run_info += ';' + 'num_run:' + num_run return err, additional_run_info def predict_proba(self, X, model, task_type, Y_train=None): Y_pred = model.predict_proba(X, batch_size=1000) if task_type == MULTILABEL_CLASSIFICATION: Y_pred = np.hstack([ Y_pred[i][:, -1].reshape((-1, 1)) for i in range(len(Y_pred)) ]) elif task_type == BINARY_CLASSIFICATION: if len(Y_pred.shape) != 1: Y_pred = Y_pred[:, 1].reshape(-1, 1) elif task_type == MULTICLASS_CLASSIFICATION: pass Y_pred = self._ensure_prediction_array_sizes(Y_pred, Y_train) return Y_pred def predict_regression(self, X, model, task_type, Y_train=None): Y_pred = model.predict(X) if len(Y_pred.shape) == 1: Y_pred = Y_pred.reshape((-1, 1)) return Y_pred def _ensure_prediction_array_sizes(self, prediction, Y_train): num_classes = self.D.info['label_num'] if self.task_type == MULTICLASS_CLASSIFICATION and \ prediction.shape[1] < num_classes: classes = list(np.unique(self.D.data['Y_train'])) if num_classes == prediction.shape[1]: return prediction if Y_train is not None: classes = list(np.unique(Y_train)) mapping = dict() for class_number in range(num_classes): if class_number in classes: index = classes.index(class_number) mapping[index] = class_number new_predictions = np.zeros((prediction.shape[0], num_classes)) for index in mapping: class_index = mapping[index] new_predictions[:, class_index] = prediction[:, index] return new_predictions return prediction