def __init__(self, Datamanager, output_dir, configuration=None,
             with_predictions=False, all_scoring_functions=False, seed=1,
             output_y_test=False, num_run=None, subsample=None):
    """Prepare an evaluator for one configuration on one dataset.

    A dummy model is used when *configuration* is not a ``Configuration``
    instance; otherwise the full auto-sklearn pipeline matching the
    dataset's task type (regression vs. classification) is selected.
    """
    self.starttime = time.time()

    self.output_dir = output_dir
    self.configuration = configuration
    self.D = Datamanager
    # Validation/test matrices are optional; a missing key yields None.
    self.X_valid = Datamanager.data.get('X_valid')
    self.X_test = Datamanager.data.get('X_test')
    self.metric = Datamanager.info['metric']
    self.task_type = Datamanager.info['task']
    self.seed = seed

    self.output_y_test = output_y_test
    self.with_predictions = with_predictions
    self.all_scoring_functions = all_scoring_functions

    is_real_configuration = isinstance(self.configuration, Configuration)
    if self.task_type in REGRESSION_TASKS:
        if is_real_configuration:
            self.model_class = \
                autosklearn.pipeline.regression.SimpleRegressionPipeline
        else:
            self.model_class = MyDummyRegressor
        self.predict_function = self._predict_regression
    else:
        if is_real_configuration:
            self.model_class = \
                autosklearn.pipeline.classification.SimpleClassificationPipeline
        else:
            self.model_class = MyDummyClassifier
        self.predict_function = self._predict_proba

    self.num_run = 0 if num_run is None else num_run
    self.subsample = subsample

    self.backend = Backend(None, self.output_dir)
    self.model = self.model_class(self.configuration, self.seed)
def __init__(self, Datamanager, configuration=None, with_predictions=False,
             all_scoring_functions=False, seed=1, output_dir=None,
             output_y_test=False, num_run=None):
    """Prepare an evaluator for one configuration on one dataset.

    Falls back to a dummy model when no *configuration* is given, and to
    the current working directory when no *output_dir* is given.
    """
    self.starttime = time.time()

    self.configuration = configuration
    self.D = Datamanager
    # Validation/test matrices are optional; a missing key yields None.
    self.X_valid = Datamanager.data.get('X_valid')
    self.X_test = Datamanager.data.get('X_test')
    self.metric = Datamanager.info['metric']
    self.task_type = Datamanager.info['task']
    self.seed = seed

    self.output_dir = os.getcwd() if output_dir is None else output_dir

    self.output_y_test = output_y_test
    self.with_predictions = with_predictions
    self.all_scoring_functions = all_scoring_functions

    have_configuration = self.configuration is not None
    if self.task_type in REGRESSION_TASKS:
        self.model_class = (SimpleRegressionPipeline if have_configuration
                            else MyDummyRegressor)
        self.predict_function = self.predict_regression
    else:
        self.model_class = (SimpleClassificationPipeline if have_configuration
                            else MyDummyClassifier)
        self.predict_function = self.predict_proba

    # Ask for a fresh run number unless one was supplied by the caller.
    self.num_run = get_new_run_num() if num_run is None else num_run

    self.backend = Backend(None, self.output_dir)
    self.model = self.model_class(self.configuration, self.seed)
def make_mode_holdout_iterative_fit(data, seed, configuration, num_run):
    """Iteratively fit a model on a holdout split and persist the result.

    Creates a module-global HoldoutEvaluator, runs its iterative fit,
    blocks SIGTERM while the evaluator writes its final output, and then
    saves the fitted model if the backend's model directory exists.
    """
    global evaluator
    evaluator = HoldoutEvaluator(data, configuration,
                                 seed=seed,
                                 num_run=num_run,
                                 **_get_base_dict())
    evaluator.iterative_fit()
    # Ignore SIGTERM from here on so finish_up() can complete.  Use the
    # named constant instead of the magic number 15 (= SIGTERM on POSIX).
    signal.signal(signal.SIGTERM, empty_signal_handler)
    evaluator.finish_up()

    backend = Backend(None, os.getcwd())
    if os.path.exists(backend.get_model_dir()):
        backend.save_model(evaluator.model, num_run, seed)
def store_and_or_load_data(dataset_info, outputdir):
    """Return the DataManager for *dataset_info*, creating and caching it on a miss."""
    backend = Backend(None, outputdir)
    try:
        datamanager = backend.load_datamanager()
    except IOError:
        # Datamanager probably doesn't exist
        datamanager = None

    if datamanager is None:
        datamanager = CompetitionDataManager(dataset_info, encode_labels=True)
        backend.save_datamanager(datamanager)
    return datamanager
def load_data(dataset_info, outputdir, tmp_dir=None, max_mem=None):
    """Return a DataManager for *dataset_info*.

    Tries the backend cache first; on a miss the competition data is
    parsed from disk, optionally capped at *max_mem* megabytes.  The
    freshly parsed manager is not written back to the cache here.
    """
    backend = Backend(outputdir, outputdir if tmp_dir is None else tmp_dir)
    try:
        datamanager = backend.load_datamanager()
    except IOError:
        # Datamanager probably doesn't exist
        datamanager = None

    if datamanager is not None:
        return datamanager

    if max_mem is None:
        return CompetitionDataManager(dataset_info, encode_labels=True)
    return CompetitionDataManager(dataset_info, encode_labels=True,
                                  max_memory_in_mb=max_mem)
def main(self):
    """Build ensembles from stored model predictions until the time or
    iteration budget is exhausted.

    Each iteration scores every prediction file in the ensemble
    directory, selects candidate models (the ``ensemble_nbest`` best
    ones, or every model better than random guessing), fits an
    EnsembleSelection on the training predictions and, whenever the
    ensemble output changed, persists the ensemble plus its predictions
    on the validation and test sets.

    Bug fixed relative to the original: string precision values were
    compared with ``is`` (identity) instead of ``==`` (equality), which
    relies on CPython string interning and is not guaranteed to work.
    """
    watch = StopWatch()
    watch.start_task('ensemble_builder')

    used_time = 0
    time_iter = 0
    index_run = 0
    num_iteration = 0
    current_num_models = 0
    last_hash = None
    current_hash = None

    backend = Backend(self.output_dir, self.autosklearn_tmp_dir)
    dir_ensemble = os.path.join(self.autosklearn_tmp_dir,
                                '.auto-sklearn',
                                'predictions_ensemble')
    dir_valid = os.path.join(self.autosklearn_tmp_dir,
                             '.auto-sklearn',
                             'predictions_valid')
    dir_test = os.path.join(self.autosklearn_tmp_dir,
                            '.auto-sklearn',
                            'predictions_test')
    paths_ = [dir_ensemble, dir_valid, dir_test]
    dir_ensemble_list_mtimes = []

    self.logger.debug('Starting main loop with %f seconds and %d iterations '
                      'left.' % (self.limit - used_time, num_iteration))
    while used_time < self.limit or (self.max_iterations > 0 and
                                     self.max_iterations >= num_iteration):
        num_iteration += 1
        self.logger.debug('Time left: %f', self.limit - used_time)
        self.logger.debug('Time last ensemble building: %f', time_iter)

        # Reload the ensemble targets every iteration, important, because cv
        # may update the ensemble targets while auto-sklearn is running.
        # TODO update cv in order to not need this any more!
        targets_ensemble = backend.load_targets_ensemble()

        # Load the predictions from the models.
        exists = [os.path.isdir(dir_) for dir_ in paths_]
        if not exists[0]:  # all(exists):
            self.logger.debug('Prediction directory %s does not exist!' %
                              dir_ensemble)
            time.sleep(2)
            used_time = watch.wall_elapsed('ensemble_builder')
            continue

        if self.shared_mode is False:
            # Non-shared mode: only pick up predictions made with our seed.
            dir_ensemble_list = sorted(glob.glob(os.path.join(
                dir_ensemble, 'predictions_ensemble_%s_*.npy' % self.seed)))
            if exists[1]:
                dir_valid_list = sorted(glob.glob(os.path.join(
                    dir_valid, 'predictions_valid_%s_*.npy' % self.seed)))
            else:
                dir_valid_list = []
            if exists[2]:
                dir_test_list = sorted(glob.glob(os.path.join(
                    dir_test, 'predictions_test_%s_*.npy' % self.seed)))
            else:
                dir_test_list = []
        else:
            dir_ensemble_list = sorted(os.listdir(dir_ensemble))
            dir_valid_list = sorted(os.listdir(dir_valid)) if exists[1] else []
            dir_test_list = sorted(os.listdir(dir_test)) if exists[2] else []

        # Check the modification times because predictions can be updated
        # over time!
        old_dir_ensemble_list_mtimes = dir_ensemble_list_mtimes
        dir_ensemble_list_mtimes = []
        for dir_ensemble_file in dir_ensemble_list:
            if dir_ensemble_file.endswith("/"):
                dir_ensemble_file = dir_ensemble_file[:-1]
            basename = os.path.basename(dir_ensemble_file)
            dir_ensemble_file = os.path.join(dir_ensemble, basename)
            mtime = os.path.getmtime(dir_ensemble_file)
            dir_ensemble_list_mtimes.append(mtime)

        if len(dir_ensemble_list) == 0:
            self.logger.debug('Directories are empty')
            time.sleep(2)
            used_time = watch.wall_elapsed('ensemble_builder')
            continue

        if len(dir_ensemble_list) <= current_num_models and \
                old_dir_ensemble_list_mtimes == dir_ensemble_list_mtimes:
            self.logger.debug('Nothing has changed since the last time')
            time.sleep(2)
            used_time = watch.wall_elapsed('ensemble_builder')
            continue

        watch.start_task('index_run' + str(index_run))
        watch.start_task('ensemble_iter_' + str(num_iteration))

        # List of num_runs (which are in the filename) which will be
        # included later.
        include_num_runs = []
        backup_num_runs = []
        model_and_automl_re = re.compile(r'_([0-9]*)_([0-9]*)\.npy$')
        if self.ensemble_nbest is not None:
            # Keeps track of the single scores of each model in our ensemble.
            scores_nbest = []
            # The indices of the models that are currently in our ensemble.
            indices_nbest = []
            # The names of the models.
            model_names = []

        model_names_to_scores = dict()
        model_idx = 0
        for model_name in dir_ensemble_list:
            if model_name.endswith("/"):
                model_name = model_name[:-1]
            basename = os.path.basename(model_name)

            try:
                # BUG FIX: the original compared the precision string with
                # 'is' (identity); '==' is the correct equality test.
                if self.precision == "16":
                    predictions = np.load(os.path.join(
                        dir_ensemble, basename)).astype(dtype=np.float16)
                elif self.precision == "32":
                    predictions = np.load(os.path.join(
                        dir_ensemble, basename)).astype(dtype=np.float32)
                elif self.precision == "64":
                    predictions = np.load(os.path.join(
                        dir_ensemble, basename)).astype(dtype=np.float64)
                else:
                    predictions = np.load(os.path.join(dir_ensemble,
                                                       basename))
                score = calculate_score(targets_ensemble, predictions,
                                        self.task_type, self.metric,
                                        predictions.shape[1])
            except Exception as e:
                self.logger.warning('Error loading %s: %s', basename, e)
                score = -1

            model_names_to_scores[model_name] = score
            match = model_and_automl_re.search(model_name)
            automl_seed = int(match.group(1))
            num_run = int(match.group(2))

            if self.ensemble_nbest is not None:
                if score <= 0.001:
                    self.logger.error('Model only predicts at random: ' +
                                      model_name + ' has score: ' +
                                      str(score))
                    backup_num_runs.append((automl_seed, num_run))
                # If we have less models in our ensemble than ensemble_nbest
                # add the current model if it is better than random.
                elif len(scores_nbest) < self.ensemble_nbest:
                    scores_nbest.append(score)
                    indices_nbest.append(model_idx)
                    include_num_runs.append((automl_seed, num_run))
                    model_names.append(model_name)
                else:
                    # Take the worst performing model in our ensemble so far.
                    idx = np.argmin(np.array([scores_nbest]))
                    # If the current model is better than the worst model in
                    # our ensemble replace it by the current model.
                    if scores_nbest[idx] < score:
                        self.logger.debug(
                            'Worst model in our ensemble: %s with '
                            'score %f will be replaced by model %s '
                            'with score %f',
                            model_names[idx], scores_nbest[idx],
                            model_name, score)
                        # Exclude the old model.
                        del scores_nbest[idx]
                        scores_nbest.append(score)
                        del include_num_runs[idx]
                        del indices_nbest[idx]
                        indices_nbest.append(model_idx)
                        include_num_runs.append((automl_seed, num_run))
                        del model_names[idx]
                        model_names.append(model_name)
                    # Otherwise exclude the current model from the ensemble.
                    else:
                        # include_num_runs.append(True)
                        pass
            else:
                # Load all predictions that are better than random.
                if score <= 0.001:
                    # include_num_runs.append(True)
                    self.logger.error('Model only predicts at random: ' +
                                      model_name + ' has score: ' +
                                      str(score))
                    backup_num_runs.append((automl_seed, num_run))
                else:
                    include_num_runs.append((automl_seed, num_run))

            model_idx += 1

        # If there is no model better than random guessing, we have to use
        # all models which do random guessing.
        if len(include_num_runs) == 0:
            include_num_runs = backup_num_runs

        indices_to_model_names = dict()
        indices_to_run_num = dict()
        for i, model_name in enumerate(dir_ensemble_list):
            match = model_and_automl_re.search(model_name)
            automl_seed = int(match.group(1))
            num_run = int(match.group(2))
            if (automl_seed, num_run) in include_num_runs:
                num_indices = len(indices_to_model_names)
                indices_to_model_names[num_indices] = model_name
                indices_to_run_num[num_indices] = (automl_seed, num_run)

        try:
            all_predictions_train, all_predictions_valid, all_predictions_test = \
                self.get_all_predictions(dir_ensemble, dir_ensemble_list,
                                         dir_valid, dir_valid_list,
                                         dir_test, dir_test_list,
                                         include_num_runs,
                                         model_and_automl_re,
                                         self.precision)
        except IOError:
            self.logger.error('Could not load the predictions.')
            continue

        if len(include_num_runs) == 0:
            self.logger.error('All models do just random guessing')
            time.sleep(2)
            continue
        else:
            ensemble = EnsembleSelection(ensemble_size=self.ensemble_size,
                                         task_type=self.task_type,
                                         metric=self.metric)
            try:
                ensemble.fit(all_predictions_train, targets_ensemble,
                             include_num_runs)
                self.logger.info(ensemble)
            except ValueError as e:
                self.logger.error('Caught ValueError: ' + str(e))
                used_time = watch.wall_elapsed('ensemble_builder')
                time.sleep(2)
                continue
            except IndexError as e:
                self.logger.error('Caught IndexError: ' + str(e))
                used_time = watch.wall_elapsed('ensemble_builder')
                time.sleep(2)
                continue
            except Exception as e:
                self.logger.error('Caught error! %s', str(e))
                used_time = watch.wall_elapsed('ensemble_builder')
                time.sleep(2)
                continue

        # Output the score.
        self.logger.info('Training performance: %f' % ensemble.train_score_)
        self.logger.info('Building the ensemble took %f seconds' %
                         watch.wall_elapsed('ensemble_iter_' +
                                            str(num_iteration)))

        # Set this variable here to avoid re-running the ensemble builder
        # every two seconds in case the ensemble did not change.
        current_num_models = len(dir_ensemble_list)

        ensemble_predictions = ensemble.predict(all_predictions_train)
        # On Python 2 a numpy array must be read-only before its buffer can
        # be hashed; Python 3 hashes the byte string instead.
        if sys.version_info[0] == 2:
            ensemble_predictions.flags.writeable = False
            current_hash = hash(ensemble_predictions.data)
        else:
            current_hash = hash(ensemble_predictions.data.tobytes())

        # Only output a new ensemble and new predictions if the output of
        # the ensemble would actually change!
        # TODO this is neither safe (collisions, tests only with the
        # ensemble prediction, but not the ensemble), implement a hash
        # function for each possible ensemble builder.
        if last_hash is not None:
            if current_hash == last_hash:
                self.logger.info('Ensemble output did not change.')
                time.sleep(2)
                continue
            else:
                last_hash = current_hash
        else:
            last_hash = current_hash

        # Save the ensemble for later use in the main auto-sklearn module!
        backend.save_ensemble(ensemble, index_run, self.seed)

        # Save predictions for valid and test data set.
        if len(dir_valid_list) == len(dir_ensemble_list):
            all_predictions_valid = np.array(all_predictions_valid)
            ensemble_predictions_valid = ensemble.predict(
                all_predictions_valid)
            if self.task_type == BINARY_CLASSIFICATION:
                ensemble_predictions_valid = ensemble_predictions_valid[:, 1]
            if self.low_precision:
                # Lossy shrinking of the output: zero out tiny probabilities
                # and binarize for threshold-based metrics.
                if self.task_type in [BINARY_CLASSIFICATION,
                                      MULTICLASS_CLASSIFICATION,
                                      MULTILABEL_CLASSIFICATION]:
                    ensemble_predictions_valid[
                        ensemble_predictions_valid < 1e-4] = 0.
                if self.metric in [BAC_METRIC, F1_METRIC]:
                    bin_array = np.zeros(ensemble_predictions_valid.shape,
                                         dtype=np.int32)
                    if (self.task_type != MULTICLASS_CLASSIFICATION) or (
                            ensemble_predictions_valid.shape[1] == 1):
                        bin_array[ensemble_predictions_valid >= 0.5] = 1
                    else:
                        sample_num = ensemble_predictions_valid.shape[0]
                        for i in range(sample_num):
                            j = np.argmax(ensemble_predictions_valid[i, :])
                            bin_array[i, j] = 1
                    ensemble_predictions_valid = bin_array
            # NOTE(review): decimal precision below only bounds output file
            # size; selecting it unconditionally guarantees 'precision' is
            # always defined before the save call — confirm against the
            # upstream ensemble builder.
            if self.task_type in CLASSIFICATION_TASKS:
                if ensemble_predictions_valid.size < (20000 * 20):
                    precision = 3
                else:
                    precision = 2
            else:
                if ensemble_predictions_valid.size > 1000000:
                    precision = 4
                else:
                    # File size maximally 2.1MB
                    precision = 6
            backend.save_predictions_as_txt(ensemble_predictions_valid,
                                            'valid', index_run,
                                            prefix=self.dataset_name,
                                            precision=precision)
        else:
            self.logger.info(
                'Could not find as many validation set predictions (%d)'
                'as ensemble predictions (%d)!.',
                len(dir_valid_list), len(dir_ensemble_list))

        del all_predictions_valid

        if len(dir_test_list) == len(dir_ensemble_list):
            all_predictions_test = np.array(all_predictions_test)
            ensemble_predictions_test = ensemble.predict(
                all_predictions_test)
            if self.task_type == BINARY_CLASSIFICATION:
                ensemble_predictions_test = ensemble_predictions_test[:, 1]
            if self.low_precision:
                if self.task_type in [BINARY_CLASSIFICATION,
                                      MULTICLASS_CLASSIFICATION,
                                      MULTILABEL_CLASSIFICATION]:
                    ensemble_predictions_test[
                        ensemble_predictions_test < 1e-4] = 0.
                if self.metric in [BAC_METRIC, F1_METRIC]:
                    bin_array = np.zeros(ensemble_predictions_test.shape,
                                         dtype=np.int32)
                    if (self.task_type != MULTICLASS_CLASSIFICATION) or (
                            ensemble_predictions_test.shape[1] == 1):
                        bin_array[ensemble_predictions_test >= 0.5] = 1
                    else:
                        sample_num = ensemble_predictions_test.shape[0]
                        for i in range(sample_num):
                            j = np.argmax(ensemble_predictions_test[i, :])
                            bin_array[i, j] = 1
                    ensemble_predictions_test = bin_array
            if self.task_type in CLASSIFICATION_TASKS:
                if ensemble_predictions_test.size < (20000 * 20):
                    precision = 3
                else:
                    precision = 2
            else:
                if ensemble_predictions_test.size > 1000000:
                    precision = 4
                else:
                    precision = 6
            backend.save_predictions_as_txt(ensemble_predictions_test,
                                            'test', index_run,
                                            prefix=self.dataset_name,
                                            precision=precision)
        else:
            self.logger.info(
                'Could not find as many test set predictions (%d) as '
                'ensemble predictions (%d)!',
                len(dir_test_list), len(dir_ensemble_list))

        del all_predictions_test

        current_num_models = len(dir_ensemble_list)

        watch.stop_task('index_run' + str(index_run))
        time_iter = watch.get_wall_dur('index_run' + str(index_run))
        used_time = watch.wall_elapsed('ensemble_builder')
        index_run += 1
    return
def __init__(self, tmp_dir, output_dir, time_left_for_this_task,
             per_run_time_limit, log_dir=None,
             initial_configurations_via_metalearning=25, ensemble_size=1,
             ensemble_nbest=1, seed=1, ml_memory_limit=3000,
             metadata_directory=None, queue=None, keep_models=True,
             debug_mode=False, include_estimators=None,
             include_preprocessors=None,
             resampling_strategy='holdout-iterative-fit',
             resampling_strategy_arguments=None,
             delete_tmp_folder_after_terminate=False,
             delete_output_folder_after_terminate=False, shared_mode=False,
             precision=32, max_iter_smac=None, acquisition_function='EI'):
    """Configure an AutoML instance.

    Stores every setting on the instance, validates the two time budgets
    and finally creates the Backend used to read and write run artifacts.

    Raises
    ------
    ValueError
        If ``time_left_for_this_task`` or ``per_run_time_limit`` is not
        an ``int``.
    """
    super(AutoML, self).__init__()

    # Directories and time budgets.
    self._tmp_dir = tmp_dir
    self._output_dir = output_dir
    self._time_for_task = time_left_for_this_task
    self._per_run_time_limit = per_run_time_limit
    if log_dir is None:
        self._log_dir = self._tmp_dir
    else:
        self._log_dir = log_dir

    # Search and ensemble settings.
    self._initial_configurations_via_metalearning = \
        initial_configurations_via_metalearning
    self._ensemble_size = ensemble_size
    self._ensemble_nbest = ensemble_nbest
    self._seed = seed
    self._ml_memory_limit = ml_memory_limit
    self._data_memory_limit = None
    self._metadata_directory = metadata_directory
    self._queue = queue
    self._keep_models = keep_models
    self._include_estimators = include_estimators
    self._include_preprocessors = include_preprocessors
    self._resampling_strategy = resampling_strategy
    self._resampling_strategy_arguments = resampling_strategy_arguments
    self._max_iter_smac = max_iter_smac
    self.delete_tmp_folder_after_terminate = \
        delete_tmp_folder_after_terminate
    self.delete_output_folder_after_terminate = \
        delete_output_folder_after_terminate
    self._shared_mode = shared_mode
    self.precision = precision
    self.acquisition_function = acquisition_function

    # State that is filled in later during fit()/predict().
    self._datamanager = None
    self._dataset_name = None
    self._stopwatch = StopWatch()
    self._logger = None
    self._task = None
    self._metric = None
    self._label_num = None
    self._parser = None
    self.models_ = None
    self.ensemble_ = None
    self._can_predict = False
    self._debug_mode = debug_mode

    if not isinstance(self._time_for_task, int):
        raise ValueError("time_left_for_this_task not of type integer, "
                         "but %s" % str(type(self._time_for_task)))
    if not isinstance(self._per_run_time_limit, int):
        raise ValueError("per_run_time_limit not of type integer, but %s" %
                         str(type(self._per_run_time_limit)))

    # After assigning and checking variables...
    self._backend = Backend(self._output_dir, self._tmp_dir)
def __init__(self, tmp_dir, output_dir, time_left_for_this_task,
             per_run_time_limit, log_dir=None,
             initial_configurations_via_metalearning=25, ensemble_size=1,
             ensemble_nbest=1, seed=1, ml_memory_limit=3000,
             metadata_directory=None, queue=None, keep_models=True,
             debug_mode=False, include_estimators=None,
             include_preprocessors=None,
             resampling_strategy='holdout-iterative-fit',
             resampling_strategy_arguments=None,
             delete_tmp_folder_after_terminate=False,
             delete_output_folder_after_terminate=False, shared_mode=False,
             precision=32):
    """Configure an AutoML instance and create its Backend.

    Stores every setting on the instance; mutable run state (datamanager,
    models, ensemble, ...) is initialised to None and filled in later.
    """
    super(AutoML, self).__init__()

    # Directories and time budgets.
    self._tmp_dir = tmp_dir
    self._output_dir = output_dir
    self._time_for_task = time_left_for_this_task
    self._per_run_time_limit = per_run_time_limit
    if log_dir is None:
        self._log_dir = self._tmp_dir
    else:
        self._log_dir = log_dir

    # Search and ensemble settings.
    self._initial_configurations_via_metalearning = \
        initial_configurations_via_metalearning
    self._ensemble_size = ensemble_size
    self._ensemble_nbest = ensemble_nbest
    self._seed = seed
    self._ml_memory_limit = ml_memory_limit
    self._metadata_directory = metadata_directory
    self._queue = queue
    self._keep_models = keep_models
    self._include_estimators = include_estimators
    self._include_preprocessors = include_preprocessors
    self._resampling_strategy = resampling_strategy
    self._resampling_strategy_arguments = resampling_strategy_arguments
    self.delete_tmp_folder_after_terminate = \
        delete_tmp_folder_after_terminate
    self.delete_output_folder_after_terminate = \
        delete_output_folder_after_terminate
    self._shared_mode = shared_mode
    self.precision = precision

    # State that is filled in later during fit()/predict().
    self._datamanager = None
    self._dataset_name = None
    self._stopwatch = StopWatch()
    self._logger = None
    self._task = None
    self._metric = None
    self._label_num = None
    self.models_ = None
    self.ensemble_ = None
    self._can_predict = False
    self._debug_mode = debug_mode

    self._backend = Backend(self._output_dir, self._tmp_dir)
def main(autosklearn_tmp_dir, dataset_name, task_type, metric, limit,
         output_dir, ensemble_size=None, ensemble_nbest=None, seed=1,
         shared_mode=False, max_iterations=-1, precision="32"):
    """Build ensembles from stored model predictions until the time or
    iteration budget is exhausted, saving each ensemble and its
    validation/test predictions via the backend.

    Bugs fixed relative to the original:
    - string precision values were compared with ``is`` (identity)
      instead of ``==`` (equality);
    - a bare ``except:`` around the scoring call also swallowed
      KeyboardInterrupt/SystemExit — narrowed to ``except Exception``;
    - ``e.message`` does not exist on Python 3 exceptions — replaced
      with ``str(e)``.
    """
    watch = StopWatch()
    watch.start_task('ensemble_builder')

    used_time = 0
    time_iter = 0
    index_run = 0
    num_iteration = 0
    current_num_models = 0

    backend = Backend(output_dir, autosklearn_tmp_dir)
    dir_ensemble = os.path.join(autosklearn_tmp_dir, '.auto-sklearn',
                                'predictions_ensemble')
    dir_valid = os.path.join(autosklearn_tmp_dir, '.auto-sklearn',
                             'predictions_valid')
    dir_test = os.path.join(autosklearn_tmp_dir, '.auto-sklearn',
                            'predictions_test')
    paths_ = [dir_ensemble, dir_valid, dir_test]
    dir_ensemble_list_mtimes = []

    while used_time < limit or (max_iterations > 0 and
                                max_iterations >= num_iteration):
        num_iteration += 1
        logger.debug('Time left: %f', limit - used_time)
        logger.debug('Time last iteration: %f', time_iter)

        # Reload the ensemble targets every iteration, important, because cv
        # may update the ensemble targets while auto-sklearn is running.
        # TODO update cv in order to not need this any more!
        targets_ensemble = backend.load_targets_ensemble()

        # Load the predictions from the models.
        exists = [os.path.isdir(dir_) for dir_ in paths_]
        if not exists[0]:  # all(exists):
            logger.debug('Prediction directory %s does not exist!' %
                         dir_ensemble)
            time.sleep(2)
            used_time = watch.wall_elapsed('ensemble_builder')
            continue

        if shared_mode is False:
            # Non-shared mode: only pick up predictions made with our seed.
            dir_ensemble_list = sorted(glob.glob(os.path.join(
                dir_ensemble, 'predictions_ensemble_%s_*.npy' % seed)))
            if exists[1]:
                dir_valid_list = sorted(glob.glob(os.path.join(
                    dir_valid, 'predictions_valid_%s_*.npy' % seed)))
            else:
                dir_valid_list = []
            if exists[2]:
                dir_test_list = sorted(glob.glob(os.path.join(
                    dir_test, 'predictions_test_%s_*.npy' % seed)))
            else:
                dir_test_list = []
        else:
            dir_ensemble_list = sorted(os.listdir(dir_ensemble))
            dir_valid_list = sorted(os.listdir(dir_valid)) if exists[1] else []
            dir_test_list = sorted(os.listdir(dir_test)) if exists[2] else []

        # Check the modification times because predictions can be updated
        # over time!
        old_dir_ensemble_list_mtimes = dir_ensemble_list_mtimes
        dir_ensemble_list_mtimes = []
        for dir_ensemble_file in dir_ensemble_list:
            if dir_ensemble_file.endswith("/"):
                dir_ensemble_file = dir_ensemble_file[:-1]
            basename = os.path.basename(dir_ensemble_file)
            dir_ensemble_file = os.path.join(dir_ensemble, basename)
            mtime = os.path.getmtime(dir_ensemble_file)
            dir_ensemble_list_mtimes.append(mtime)

        if len(dir_ensemble_list) == 0:
            logger.debug('Directories are empty')
            time.sleep(2)
            used_time = watch.wall_elapsed('ensemble_builder')
            continue

        if len(dir_ensemble_list) <= current_num_models and \
                old_dir_ensemble_list_mtimes == dir_ensemble_list_mtimes:
            logger.debug('Nothing has changed since the last time')
            time.sleep(2)
            used_time = watch.wall_elapsed('ensemble_builder')
            continue

        watch.start_task('ensemble_iter_' + str(index_run))

        # List of num_runs (which are in the filename) which will be
        # included later.
        include_num_runs = []
        backup_num_runs = []
        model_and_automl_re = re.compile(r'_([0-9]*)_([0-9]*)\.npy$')
        if ensemble_nbest is not None:
            # Keeps track of the single scores of each model in our ensemble.
            scores_nbest = []
            # The indices of the models that are currently in our ensemble.
            indices_nbest = []
            # The names of the models.
            model_names = []

        model_names_to_scores = dict()
        model_idx = 0
        for model_name in dir_ensemble_list:
            if model_name.endswith("/"):
                model_name = model_name[:-1]
            basename = os.path.basename(model_name)

            # BUG FIX: the original compared the precision string with 'is'
            # (identity); '==' is the correct equality test.
            if precision == "16":
                predictions = np.load(os.path.join(
                    dir_ensemble, basename)).astype(dtype=np.float16)
            elif precision == "32":
                predictions = np.load(os.path.join(
                    dir_ensemble, basename)).astype(dtype=np.float32)
            elif precision == "64":
                predictions = np.load(os.path.join(
                    dir_ensemble, basename)).astype(dtype=np.float64)
            else:
                predictions = np.load(os.path.join(dir_ensemble, basename))

            try:
                score = calculate_score(targets_ensemble, predictions,
                                        task_type, metric,
                                        predictions.shape[1])
            except Exception:
                # BUG FIX: was a bare 'except:'; keep the best-effort
                # fallback but no longer swallow KeyboardInterrupt etc.
                score = -1

            model_names_to_scores[model_name] = score
            match = model_and_automl_re.search(model_name)
            automl_seed = int(match.group(1))
            num_run = int(match.group(2))

            if ensemble_nbest is not None:
                if score <= 0.001:
                    logger.error('Model only predicts at random: ' +
                                 model_name + ' has score: ' + str(score))
                    backup_num_runs.append((automl_seed, num_run))
                # If we have less models in our ensemble than ensemble_nbest
                # add the current model if it is better than random.
                elif len(scores_nbest) < ensemble_nbest:
                    scores_nbest.append(score)
                    indices_nbest.append(model_idx)
                    include_num_runs.append((automl_seed, num_run))
                    model_names.append(model_name)
                else:
                    # Take the worst performing model in our ensemble so far.
                    idx = np.argmin(np.array([scores_nbest]))
                    # If the current model is better than the worst model in
                    # our ensemble replace it by the current model.
                    if scores_nbest[idx] < score:
                        logger.debug('Worst model in our ensemble: %s with '
                                     'score %f will be replaced by model %s '
                                     'with score %f',
                                     model_names[idx], scores_nbest[idx],
                                     model_name, score)
                        # Exclude the old model.
                        del scores_nbest[idx]
                        scores_nbest.append(score)
                        del include_num_runs[idx]
                        del indices_nbest[idx]
                        indices_nbest.append(model_idx)
                        include_num_runs.append((automl_seed, num_run))
                        del model_names[idx]
                        model_names.append(model_name)
                    # Otherwise exclude the current model from the ensemble.
                    else:
                        # include_num_runs.append(True)
                        pass
            else:
                # Load all predictions that are better than random.
                if score <= 0.001:
                    # include_num_runs.append(True)
                    logger.error('Model only predicts at random: ' +
                                 model_name + ' has score: ' + str(score))
                    backup_num_runs.append((automl_seed, num_run))
                else:
                    include_num_runs.append((automl_seed, num_run))

            model_idx += 1

        # If there is no model better than random guessing, we have to use
        # all models which do random guessing.
        if len(include_num_runs) == 0:
            include_num_runs = backup_num_runs

        indices_to_model_names = dict()
        indices_to_run_num = dict()
        for i, model_name in enumerate(dir_ensemble_list):
            match = model_and_automl_re.search(model_name)
            automl_seed = int(match.group(1))
            num_run = int(match.group(2))
            if (automl_seed, num_run) in include_num_runs:
                num_indices = len(indices_to_model_names)
                indices_to_model_names[num_indices] = model_name
                indices_to_run_num[num_indices] = (automl_seed, num_run)

        try:
            all_predictions_train, all_predictions_valid, all_predictions_test = \
                get_all_predictions(dir_ensemble, dir_ensemble_list,
                                    dir_valid, dir_valid_list,
                                    dir_test, dir_test_list,
                                    include_num_runs,
                                    model_and_automl_re,
                                    precision)
        except IOError:
            logger.error('Could not load the predictions.')
            continue

        if len(include_num_runs) == 0:
            logger.error('All models do just random guessing')
            time.sleep(2)
            continue
        else:
            ensemble = EnsembleSelection(ensemble_size=ensemble_size,
                                         task_type=task_type,
                                         metric=metric)
            try:
                ensemble.fit(all_predictions_train, targets_ensemble,
                             include_num_runs)
                logger.info(ensemble)
            except ValueError as e:
                logger.error('Caught ValueError: ' + str(e))
                used_time = watch.wall_elapsed('ensemble_builder')
                time.sleep(2)
                continue
            except IndexError as e:
                logger.error('Caught IndexError: ' + str(e))
                used_time = watch.wall_elapsed('ensemble_builder')
                time.sleep(2)
                continue
            except Exception as e:
                # BUG FIX: 'e.message' was removed in Python 3; use str(e).
                logger.error('Caught error! %s', str(e))
                used_time = watch.wall_elapsed('ensemble_builder')
                time.sleep(2)
                continue

        # Output the score.
        logger.info('Training performance: %f' % ensemble.train_score_)

        # Save the ensemble for later use in the main auto-sklearn module!
        backend.save_ensemble(ensemble, index_run, seed)

        # Save predictions for valid and test data set.
        if len(dir_valid_list) == len(dir_ensemble_list):
            all_predictions_valid = np.array(all_predictions_valid)
            ensemble_predictions_valid = ensemble.predict(
                all_predictions_valid)
            backend.save_predictions_as_txt(ensemble_predictions_valid,
                                            'valid', index_run,
                                            prefix=dataset_name)
        else:
            logger.info('Could not find as many validation set predictions (%d)'
                        'as ensemble predictions (%d)!.',
                        len(dir_valid_list), len(dir_ensemble_list))

        del all_predictions_valid

        if len(dir_test_list) == len(dir_ensemble_list):
            all_predictions_test = np.array(all_predictions_test)
            ensemble_predictions_test = ensemble.predict(
                all_predictions_test)
            backend.save_predictions_as_txt(ensemble_predictions_test,
                                            'test', index_run,
                                            prefix=dataset_name)
        else:
            logger.info('Could not find as many test set predictions (%d) as '
                        'ensemble predictions (%d)!',
                        len(dir_test_list), len(dir_ensemble_list))

        del all_predictions_test

        current_num_models = len(dir_ensemble_list)

        watch.stop_task('ensemble_iter_' + str(index_run))
        time_iter = watch.get_wall_dur('ensemble_iter_' + str(index_run))
        used_time = watch.wall_elapsed('ensemble_builder')
        index_run += 1
    return