Example no. 1
    def __init__(self,
                 tmp_dir,
                 output_dir,
                 time_left_for_this_task,
                 per_run_time_limit,
                 log_dir=None,
                 initial_configurations_via_metalearning=25,
                 ensemble_size=1,
                 ensemble_nbest=1,
                 seed=1,
                 ml_memory_limit=3000,
                 metadata_directory=None,
                 queue=None,
                 keep_models=True,
                 debug_mode=False,
                 include_estimators=None,
                 include_preprocessors=None,
                 resampling_strategy='holdout',
                 resampling_strategy_arguments=None,
                 delete_tmp_folder_after_terminate=False,
                 delete_output_folder_after_terminate=False,
                 shared_mode=False):
        super(AutoML, self).__init__()

        self._tmp_dir = tmp_dir
        self._output_dir = output_dir
        self._time_for_task = time_left_for_this_task
        self._per_run_time_limit = per_run_time_limit
        self._log_dir = log_dir if log_dir is not None else self._tmp_dir
        self._initial_configurations_via_metalearning = \
            initial_configurations_via_metalearning
        self._ensemble_size = ensemble_size
        self._ensemble_nbest = ensemble_nbest
        self._seed = seed
        self._ml_memory_limit = ml_memory_limit
        self._metadata_directory = metadata_directory
        self._queue = queue
        self._keep_models = keep_models
        self._include_estimators = include_estimators
        self._include_preprocessors = include_preprocessors
        self._resampling_strategy = resampling_strategy
        self._resampling_strategy_arguments = resampling_strategy_arguments
        self.delete_tmp_folder_after_terminate = \
            delete_tmp_folder_after_terminate
        self.delete_output_folder_after_terminate = \
            delete_output_folder_after_terminate
        self._shared_mode = shared_mode

        self._datamanager = None
        self._dataset_name = None
        self._stopwatch = StopWatch()
        self._logger = None
        self._task = None
        self._metric = None
        self._label_num = None
        self.models_ = None
        self.ensemble_indices_ = None

        self._debug_mode = debug_mode
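        # The Backend object is constructed last, once both directories are
        # known; elsewhere in these examples it handles all persistence
        # (start time, data manager, predictions, ensembles).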
        self._backend = Backend(self._output_dir, self._tmp_dir)
Example no. 2
    def fit(self, X, y,
            task=MULTICLASS_CLASSIFICATION,
            metric="acc_metric",
            feat_type=None,
            dataset_name=None):
        if dataset_name is None:
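            # No explicit name given: fall back to an MD5 digest of the raw
            # data buffer so the run gets a stable identifier for logging and
            # bookkeeping.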
            m = hashlib.md5()
            m.update(X.data)
            dataset_name = m.hexdigest()

        self._backend.save_start_time(self._seed)
        self._stopwatch = StopWatch()
        self._dataset_name = dataset_name
        self._stopwatch.start_task(self._dataset_name)

        logger_name = "AutoML(%d):%s" % (self._seed, dataset_name)
        setup_logger(os.path.join(self._tmp_dir, "%s.log" % str(logger_name)))
        self._logger = get_logger(logger_name)

        if isinstance(metric, str):
            metric = STRING_TO_METRIC[metric]

        if feat_type is not None and len(feat_type) != X.shape[1]:
            raise ValueError(
                "Array feat_type does not have same number of "
                "variables as X has features. %d vs %d." % (len(feat_type), X.shape[1])
            )
        if feat_type is not None and not all([isinstance(f, bool) for f in feat_type]):
            raise ValueError("Array feat_type must only contain bools.")

        loaded_data_manager = XYDataManager(
            X, y, task=task, metric=metric, feat_type=feat_type, dataset_name=dataset_name, encode_labels=False
        )

        return self._fit(loaded_data_manager)
Example no. 3
    def fit(self, X, y,
            task=MULTICLASS_CLASSIFICATION,
            metric='acc_metric',
            feat_type=None,
            dataset_name=None):
        if dataset_name is None:
            m = hashlib.md5()
            m.update(X.data)
            dataset_name = m.hexdigest()

        self._backend.save_start_time(self._seed)
        self._stopwatch = StopWatch()
        self._dataset_name = dataset_name
        self._stopwatch.start_task(self._dataset_name)

        logger_name = 'AutoML(%d):%s' % (self._seed, dataset_name)
        setup_logger(os.path.join(self._tmp_dir, '%s.log' % str(logger_name)))
        self._logger = get_logger(logger_name)

        if isinstance(metric, str):
            metric = STRING_TO_METRIC[metric]

        loaded_data_manager = XYDataManager(X, y,
                                            task=task,
                                            metric=metric,
                                            feat_type=feat_type,
                                            dataset_name=dataset_name,
                                            encode_labels=False)

        return self._fit(loaded_data_manager)
Example no. 4
    def fit_on_datamanager(self, datamanager, metric):
        self._stopwatch = StopWatch()
        self._backend.save_start_time(self._seed)

        name = os.path.basename(datamanager.name)
        self._stopwatch.start_task(name)
        self._start_task(self._stopwatch, name)
        self._dataset_name = name

        self._logger = self._get_logger(name)
        self._fit(datamanager, metric)
Example no. 5
    def fit(self, X, y,
            task=MULTICLASS_CLASSIFICATION,
            metric='acc_metric',
            feat_type=None,
            dataset_name=None):
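        # In non-shared mode, directories left over from a previous run are
        # deleted and recreated; in shared mode an already stored data manager
        # (and with it its dataset name) is reused if it can be loaded.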
        if not self._shared_mode:
            self._backend.context.delete_directories()
        else:
            # If this fails, it's likely that this is the first call to get
            # the data manager
            try:
                D = self._backend.load_datamanager()
                dataset_name = D.name
            except IOError:
                pass

        self._backend.context.create_directories()

        if dataset_name is None:
            dataset_name = hash_numpy_array(X)

        self._backend.save_start_time(self._seed)
        self._stopwatch = StopWatch()
        self._dataset_name = dataset_name
        self._stopwatch.start_task(self._dataset_name)

        self._logger = self._get_logger(dataset_name)

        if isinstance(metric, str):
            metric = STRING_TO_METRIC[metric]

        if feat_type is not None and len(feat_type) != X.shape[1]:
            raise ValueError('Array feat_type does not have same number of '
                             'variables as X has features. %d vs %d.' %
                             (len(feat_type), X.shape[1]))
        if feat_type is not None and not all([isinstance(f, str)
                                              for f in feat_type]):
            raise ValueError('Array feat_type must only contain strings.')
        if feat_type is not None:
            for ft in feat_type:
                if ft.lower() not in ['categorical', 'numerical']:
                    raise ValueError('Only `Categorical` and `Numerical` are '
                                     'valid feature types, you passed `%s`' % ft)

        self._data_memory_limit = None
        loaded_data_manager = XYDataManager(X, y,
                                            task=task,
                                            metric=metric,
                                            feat_type=feat_type,
                                            dataset_name=dataset_name,
                                            encode_labels=False)

        return self._fit(loaded_data_manager)
Example no. 6
    def run(self):
        if self._parser is None:
            raise ValueError('You must invoke run() only via start_automl()')
        self._backend.save_start_time(self._seed)
        self._stopwatch = StopWatch()
        datamanager = get_data_manager(namespace=self._parser)
        self._stopwatch.start_task(datamanager.name)

        self._logger = self._get_logger(datamanager.name)

        self._datamanager = datamanager
        self._dataset_name = datamanager.name
        self._fit(self._datamanager)
Example no. 7
    def start_automl(self, parser):
        self._backend.save_start_time(self._seed)
        self._stopwatch = StopWatch()
        datamanager = get_data_manager(namespace=parser)
        self._stopwatch.start_task(datamanager.name)

        logger_name = 'AutoML(%d):%s' % (self._seed, datamanager.name)
        setup_logger(os.path.join(self._tmp_dir, '%s.log' % str(logger_name)))
        self._logger = get_logger(logger_name)

        self._datamanager = datamanager
        self._dataset_name = datamanager.name
        self.start()
Example no. 8
    def test_stopwatch_overhead(self):
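        # Start and stop many trivial tasks and check that the bookkeeping
        # done by StopWatch adds less than one second of CPU and wall-clock
        # overhead.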

        # Wall Overhead
        start = time.time()
        cpu_start = time.process_time()
        watch = StopWatch()
        for i in range(1, 1000):
            watch.start_task('task_%d' % i)
            watch.stop_task('task_%d' % i)
        cpu_stop = time.process_time()
        stop = time.time()
        dur = stop - start
        cpu_dur = cpu_stop - cpu_start
        cpu_overhead = cpu_dur - watch.cpu_sum()
        wall_overhead = dur - watch.wall_sum()

        self.assertLess(cpu_overhead, 1)
        self.assertLess(wall_overhead, 1)
        self.assertLess(watch.cpu_sum(), 2 * watch.wall_sum())
Example no. 9
    def fit(self, X, y,
            task=MULTICLASS_CLASSIFICATION,
            metric='acc_metric',
            feat_type=None,
            dataset_name=None):
        if dataset_name is None:
            m = hashlib.md5()
            m.update(X.data)
            dataset_name = m.hexdigest()

        self._backend.save_start_time(self._seed)
        self._stopwatch = StopWatch()
        self._dataset_name = dataset_name
        self._stopwatch.start_task(self._dataset_name)

        logger_name = 'AutoML(%d):%s' % (self._seed, dataset_name)
        setup_logger(os.path.join(self._tmp_dir, '%s.log' % str(logger_name)))
        self._logger = get_logger(logger_name)

        if isinstance(metric, str):
            metric = STRING_TO_METRIC[metric]

        if feat_type is not None and len(feat_type) != X.shape[1]:
            raise ValueError('Array feat_type does not have same number of '
                             'variables as X has features. %d vs %d.' %
                             (len(feat_type), X.shape[1]))
        if feat_type is not None and not all([isinstance(f, str)
                                              for f in feat_type]):
            raise ValueError('Array feat_type must only contain strings.')
        if feat_type is not None:
            for ft in feat_type:
                if ft.lower() not in ['categorical', 'numerical']:
                    raise ValueError('Only `Categorical` and `Numerical` are '
                                     'valid feature types, you passed `%s`' % ft)

        loaded_data_manager = XYDataManager(X, y,
                                            task=task,
                                            metric=metric,
                                            feat_type=feat_type,
                                            dataset_name=dataset_name,
                                            encode_labels=False)

        return self._fit(loaded_data_manager)
Example no. 10
    def fit_automl_dataset(self, dataset, metric):
        self._stopwatch = StopWatch()
        self._backend.save_start_time(self._seed)

        name = os.path.basename(dataset)
        self._stopwatch.start_task(name)
        self._start_task(self._stopwatch, name)
        self._dataset_name = name

        self._logger = self._get_logger(name)
        self._logger.debug('======== Reading and converting data ==========')
        # Encoding the labels will be done after the metafeature calculation!
        self._data_memory_limit = float(self._ml_memory_limit) / 3
        loaded_data_manager = CompetitionDataManager(
            dataset, max_memory_in_mb=self._data_memory_limit)
        loaded_data_manager_str = str(loaded_data_manager).split('\n')
        for part in loaded_data_manager_str:
            self._logger.debug(part)

        return self._fit(loaded_data_manager, metric)
Example no. 11
    def fit_automl_dataset(self, basename, input_dir):
        # == Creating a data object with data and information about it
        self._basename = basename

        self._stopwatch = StopWatch()
        self._stopwatch.start_task(self._basename)

        self._logger = get_automl_logger(self._log_dir, self._basename,
                                         self._seed)

        self._debug('======== Reading and converting data ==========')
        # Encoding the labels will be done after the metafeature calculation!
        loaded_data_manager = CompetitionDataManager(self._basename, input_dir,
                                                     verbose=True,
                                                     encode_labels=False)
        loaded_data_manager_str = str(loaded_data_manager).split('\n')
        for part in loaded_data_manager_str:
            self._debug(part)

        return self._fit(loaded_data_manager)
Example no. 12
    def fit_automl_dataset(self, dataset):
        self._stopwatch = StopWatch()
        self._backend.save_start_time(self._seed)

        name = os.path.basename(dataset)
        self._stopwatch.start_task(name)
        self._start_task(self._stopwatch, name)
        self._dataset_name = name

        logger_name = 'AutoML(%d):%s' % (self._seed, name)
        setup_logger(os.path.join(self._tmp_dir, '%s.log' % str(logger_name)))
        self._logger = get_logger(logger_name)

        self._logger.debug('======== Reading and converting data ==========')
        # Encoding the labels will be done after the metafeature calculation!
        loaded_data_manager = CompetitionDataManager(dataset,
                                                     encode_labels=False)
        loaded_data_manager_str = str(loaded_data_manager).split('\n')
        for part in loaded_data_manager_str:
            self._logger.debug(part)

        return self._fit(loaded_data_manager)
Example no. 13
    def fit(self, data_x, y,
            task=MULTICLASS_CLASSIFICATION,
            metric='acc_metric',
            feat_type=None,
            dataset_name=None):
        if dataset_name is None:
            m = hashlib.md5()
            m.update(data_x.data)
            dataset_name = m.hexdigest()

        self._basename = dataset_name

        self._stopwatch = StopWatch()
        self._stopwatch.start_task(self._basename)

        loaded_data_manager = XYDataManager(data_x, y,
                                            task=task,
                                            metric=metric,
                                            feat_type=feat_type,
                                            dataset_name=dataset_name,
                                            encode_labels=False)

        return self._fit(loaded_data_manager)
Example no. 14
def main(logger,
         predictions_dir,
         basename,
         task_type,
         metric,
         limit,
         output_dir,
         ensemble_size=None,
         seed=1,
         indices_output_dir='.'):

    watch = StopWatch()
    watch.start_task('ensemble_builder')

    task_type = STRING_TO_TASK_TYPES[task_type]

    used_time = 0
    time_iter = 0
    index_run = 0
    current_num_models = 0

    dir_ensemble = join(predictions_dir, 'predictions_ensemble_%s/' % seed)
    dir_valid = join(predictions_dir, 'predictions_valid_%s/' % seed)
    dir_test = join(predictions_dir, 'predictions_test_%s/' % seed)
    paths_ = [dir_ensemble, dir_valid, dir_test]

    true_labels_path = join(predictions_dir, 'true_labels_ensemble.npy')

    while used_time < limit:
        logger.debug('Time left: %f', limit - used_time)
        logger.debug('Time last iteration: %f', time_iter)
        # Load the true labels of the validation data
        true_labels = np.load(true_labels_path)

        # Load the predictions from the models
        exists = [os.path.isdir(dir_) for dir_ in paths_]
        if not exists[0]:  # all(exists):
            logger.debug('Prediction directory %s does not exist!' %
                          dir_ensemble)
            time.sleep(2)
            used_time = watch.wall_elapsed('ensemble_builder')
            continue

        dir_ensemble_list = sorted(os.listdir(dir_ensemble))
        dir_valid_list = sorted(os.listdir(dir_valid)) if exists[1] else []
        dir_test_list = sorted(os.listdir(dir_test)) if exists[2] else []

        if check_data(logger, len(dir_ensemble_list), current_num_models):
            time.sleep(2)
            used_time = watch.wall_elapsed('ensemble_builder')
            continue

        watch.start_task('ensemble_iter_' + str(index_run))

        # List of num_runs (which are in the filename) which will be included
        #  later
        include_num_runs = []
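        # The trailing number in each prediction filename is the num_run of
        # the model that produced it; it is extracted with the regex below.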
        re_num_run = re.compile(r'_([0-9]*)\.npy$')
        if ensemble_size is not None:
            # Keeps track of the individual score of each model in our ensemble
            scores_nbest = []
            # The indices of the models that are currently in our ensemble
            indices_nbest = []
            # The names of the models
            model_names = []
            # The num run of the models
            num_runs = []

        model_names_to_scores = dict()

        model_idx = 0
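        # Score every candidate on the true ensemble labels and decide whether
        # its run number enters the pool used for ensemble building.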
        for model_name in dir_ensemble_list:
            predictions = np.load(os.path.join(dir_ensemble, model_name))
            score = calculate_score(true_labels, predictions,
                                    task_type, metric,
                                    predictions.shape[1])
            model_names_to_scores[model_name] = score
            num_run = int(re_num_run.search(model_name).group(1))

            if ensemble_size is not None:
                if score <= 0.001:
                    # include_num_runs.append(True)
                    logger.error('Model only predicts at random: ' +
                                  model_name + ' has score: ' + str(score))
                # If we have fewer models in our ensemble than ensemble_size,
                # add the current model if it is better than random
                elif len(scores_nbest) < ensemble_size:
                    scores_nbest.append(score)
                    indices_nbest.append(model_idx)
                    include_num_runs.append(num_run)
                    model_names.append(model_name)
                    num_runs.append(num_run)
                else:
                    # Take the worst performing model in our ensemble so far
                    idx = np.argmin(np.array([scores_nbest]))

                    # If the current model is better than the worst model in
                    # our ensemble replace it by the current model
                    if scores_nbest[idx] < score:
                        logger.debug('Worst model in our ensemble: %s with '
                                      'score %f will be replaced by model %s '
                                      'with score %f', model_names[idx],
                                      scores_nbest[idx], model_name, score)
                        # Exclude the old model
                        del scores_nbest[idx]
                        scores_nbest.append(score)
                        del include_num_runs[idx]
                        del indices_nbest[idx]
                        indices_nbest.append(model_idx)
                        include_num_runs.append(num_run)
                        del model_names[idx]
                        model_names.append(model_name)
                        del num_runs[idx]
                        num_runs.append(num_run)

                    # Otherwise exclude the current model from the ensemble
                    else:
                        # include_num_runs.append(True)
                        pass

            else:
                # Load all predictions that are better than random
                if score <= 0.001:
                    # include_num_runs.append(True)
                    logger.error('Model only predicts at random: ' +
                                  model_name + ' has score: ' + str(score))
                else:
                    include_num_runs.append(num_run)

            model_idx += 1

        indices_to_model_names = dict()
        indices_to_run_num = dict()
        for i, model_name in enumerate(dir_ensemble_list):
            num_run = int(re_num_run.search(model_name).group(1))
            if num_run in include_num_runs:
                num_indices = len(indices_to_model_names)
                indices_to_model_names[num_indices] = model_name
                indices_to_run_num[num_indices] = num_run

        # logging.info("Indices to model names:")
        # logging.info(indices_to_model_names)

        # for i, item in enumerate(sorted(model_names_to_scores.items(),
        #                                key=lambda t: t[1])):
        #    logging.info("%d: %s", i, item)

        include_num_runs = set(include_num_runs)

        all_predictions_train = get_predictions(dir_ensemble,
                                                dir_ensemble_list,
                                                include_num_runs,
                                                re_num_run)
        all_predictions_valid = get_predictions(dir_valid,
                                                dir_valid_list,
                                                include_num_runs,
                                                re_num_run)

        all_predictions_test = get_predictions(dir_test,
                                               dir_test_list,
                                               include_num_runs,
                                               re_num_run)

        if len(all_predictions_train) == len(all_predictions_test) == len(
                all_predictions_valid) == 0:
            logger.error('All models do just random guessing')
            time.sleep(2)
            continue

        elif len(all_predictions_train) == 1:
            logger.debug('Only one model so far we just copy its predictions')
            ensemble_members_run_numbers = {0: 1.0}

            # Output the score
            logger.info('Training performance: %f' %
                        max(model_names_to_scores.values()))
        else:
            try:
                indices, trajectory = ensemble_selection(
                    np.array(all_predictions_train), true_labels,
                    ensemble_size, task_type, metric)

                logger.info('Trajectory and indices!')
                logger.info(trajectory)
                logger.info(indices)

            except ValueError as e:
                logger.error('Caught ValueError: ' + str(e))
                used_time = watch.wall_elapsed('ensemble_builder')
                continue
            except Exception as e:
                logger.error('Caught error! %s', e)
                used_time = watch.wall_elapsed('ensemble_builder')
                continue

            # Output the score
            logger.info('Training performance: %f' % trajectory[-1])

            # Print the ensemble members:
            ensemble_members_run_numbers = dict()
            ensemble_members = Counter(indices).most_common()
            ensemble_members_string = 'Ensemble members:\n'
            logger.info(ensemble_members)
            for ensemble_member in ensemble_members:
                weight = float(ensemble_member[1]) / len(indices)
                ensemble_members_string += \
                    ('    %s; weight: %10f; performance: %10f\n' %
                     (indices_to_model_names[ensemble_member[0]],
                      weight,
                      model_names_to_scores[
                         indices_to_model_names[ensemble_member[0]]]))

                ensemble_members_run_numbers[
                    indices_to_run_num[
                        ensemble_member[0]]] = weight
            logger.info(ensemble_members_string)

        # Save the ensemble indices for later use!
        filename_indices = os.path.join(indices_output_dir,
                                        str(index_run).zfill(5) + '.indices')

        logger.info(ensemble_members_run_numbers)
        with open(filename_indices, 'wb') as fh:
            pickle.dump(ensemble_members_run_numbers, fh)

        # Save predictions for valid and test data set
        if len(dir_valid_list) == len(dir_ensemble_list):
            ensemble_predictions_valid = np.mean(
                all_predictions_valid[indices.astype(int)],
                axis=0)
            filename_valid = os.path.join(
                output_dir,
                basename + '_valid_' + str(index_run).zfill(3) + '.predict')
            save_predictions(
                os.path.join(predictions_dir, filename_valid),
                ensemble_predictions_valid)
        else:
            logger.info('Could not find as many validation set predictions '
                        'as ensemble predictions!')

        if len(dir_test_list) == len(dir_ensemble_list):
            ensemble_predictions_test = np.mean(
                all_predictions_test[indices.astype(int)],
                axis=0)
            filename_test = os.path.join(
                output_dir,
                basename + '_test_' + str(index_run).zfill(3) + '.predict')
            save_predictions(
                os.path.join(predictions_dir, filename_test),
                ensemble_predictions_test)
        else:
            logger.info('Could not find as many test set predictions as '
                         'ensemble predictions!')

        current_num_models = len(dir_ensemble_list)
        watch.stop_task('ensemble_iter_' + str(index_run))
        time_iter = watch.get_wall_dur('ensemble_iter_' + str(index_run))
        used_time = watch.wall_elapsed('ensemble_builder')
        index_run += 1
    return
Example no. 15
def main(autosklearn_tmp_dir,
         basename,
         task_type,
         metric,
         limit,
         output_dir,
         ensemble_size=None,
         ensemble_nbest=None,
         seed=1,
         shared_mode=False,
         max_iterations=-1,
         precision="32"):

    watch = StopWatch()
    watch.start_task('ensemble_builder')

    used_time = 0
    time_iter = 0
    index_run = 0
    num_iteration = 0
    current_num_models = 0

    backend = Backend(output_dir, autosklearn_tmp_dir)
    dir_ensemble = os.path.join(autosklearn_tmp_dir, '.auto-sklearn',
                                'predictions_ensemble')
    dir_valid = os.path.join(autosklearn_tmp_dir, '.auto-sklearn',
                             'predictions_valid')
    dir_test = os.path.join(autosklearn_tmp_dir, '.auto-sklearn',
                            'predictions_test')
    paths_ = [dir_ensemble, dir_valid, dir_test]

    targets_ensemble = backend.load_targets_ensemble()

    dir_ensemble_list_mtimes = []

    while used_time < limit or (max_iterations > 0 and max_iterations >= num_iteration):
        num_iteration += 1
        logger.debug('Time left: %f', limit - used_time)
        logger.debug('Time last iteration: %f', time_iter)

        # Load the predictions from the models
        exists = [os.path.isdir(dir_) for dir_ in paths_]
        if not exists[0]:  # all(exists):
            logger.debug('Prediction directory %s does not exist!' %
                          dir_ensemble)
            time.sleep(2)
            used_time = watch.wall_elapsed('ensemble_builder')
            continue

        if shared_mode is False:
            dir_ensemble_list = sorted(glob.glob(os.path.join(
                dir_ensemble, 'predictions_ensemble_%s_*.npy' % seed)))
            if exists[1]:
                dir_valid_list = sorted(glob.glob(os.path.join(
                    dir_valid, 'predictions_valid_%s_*.npy' % seed)))
            else:
                dir_valid_list = []
            if exists[2]:
                dir_test_list = sorted(glob.glob(os.path.join(
                    dir_test, 'predictions_test_%s_*.npy' % seed)))
            else:
                dir_test_list = []
        else:
            dir_ensemble_list = sorted(os.listdir(dir_ensemble))
            dir_valid_list = sorted(os.listdir(dir_valid)) if exists[1] else []
            dir_test_list = sorted(os.listdir(dir_test)) if exists[2] else []

        # Check the modification times because predictions can be updated
        # over time!
        old_dir_ensemble_list_mtimes = dir_ensemble_list_mtimes
        dir_ensemble_list_mtimes = []

        for dir_ensemble_file in dir_ensemble_list:
            dir_ensemble_file = os.path.join(dir_ensemble, dir_ensemble_file)
            mtime = os.path.getmtime(dir_ensemble_file)
            dir_ensemble_list_mtimes.append(mtime)

        if len(dir_ensemble_list) == 0:
            logger.debug('Directories are empty')
            time.sleep(2)
            used_time = watch.wall_elapsed('ensemble_builder')
            continue

        if len(dir_ensemble_list) <= current_num_models and \
                old_dir_ensemble_list_mtimes == dir_ensemble_list_mtimes:
            logger.debug('Nothing has changed since the last time')
            time.sleep(2)
            used_time = watch.wall_elapsed('ensemble_builder')
            continue

        watch.start_task('ensemble_iter_' + str(index_run))

        # List of num_runs (which are in the filename) which will be included
        #  later
        include_num_runs = []
        backup_num_runs = []
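        # backup_num_runs collects models that only predict at random; they
        # are used later only if no model at all scored better than random.
        # The filename regex captures both the automl seed and the num_run, so
        # models written by different auto-sklearn instances (shared mode) can
        # be told apart.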
        model_and_automl_re = re.compile(r'_([0-9]*)_([0-9]*)\.npy$')
        if ensemble_nbest is not None:
            # Keeps track of the individual score of each model in our ensemble
            scores_nbest = []
            # The indices of the models that are currently in our ensemble
            indices_nbest = []
            # The names of the models
            model_names = []

        model_names_to_scores = dict()

        model_idx = 0
        for model_name in dir_ensemble_list:
            if precision is "16":
                predictions = np.load(os.path.join(dir_ensemble, model_name)).astype(dtype=np.float16)
            elif precision is "32":
                predictions = np.load(os.path.join(dir_ensemble, model_name)).astype(dtype=np.float32)
            elif precision is "64":
                predictions = np.load(os.path.join(dir_ensemble, model_name)).astype(dtype=np.float64)
            else:
                predictions = np.load(os.path.join(dir_ensemble, model_name))
            score = calculate_score(targets_ensemble, predictions,
                                    task_type, metric,
                                    predictions.shape[1])
            model_names_to_scores[model_name] = score
            match = model_and_automl_re.search(model_name)
            automl_seed = int(match.group(1))
            num_run = int(match.group(2))

            if ensemble_nbest is not None:
                if score <= 0.001:
                    # include_num_runs.append(True)
                    logger.error('Model only predicts at random: ' +
                                  model_name + ' has score: ' + str(score))
                    backup_num_runs.append(num_run)
                # If we have fewer models in our ensemble than ensemble_nbest,
                # add the current model if it is better than random
                elif len(scores_nbest) < ensemble_nbest:
                    scores_nbest.append(score)
                    indices_nbest.append(model_idx)
                    include_num_runs.append((automl_seed, num_run))
                    model_names.append(model_name)
                else:
                    # Take the worst performing model in our ensemble so far
                    idx = np.argmin(np.array([scores_nbest]))

                    # If the current model is better than the worst model in
                    # our ensemble replace it by the current model
                    if scores_nbest[idx] < score:
                        logger.debug('Worst model in our ensemble: %s with '
                                      'score %f will be replaced by model %s '
                                      'with score %f', model_names[idx],
                                      scores_nbest[idx], model_name, score)
                        # Exclude the old model
                        del scores_nbest[idx]
                        scores_nbest.append(score)
                        del include_num_runs[idx]
                        del indices_nbest[idx]
                        indices_nbest.append(model_idx)
                        include_num_runs.append((automl_seed, num_run))
                        del model_names[idx]
                        model_names.append(model_name)

                    # Otherwise exclude the current model from the ensemble
                    else:
                        # include_num_runs.append(True)
                        pass

            else:
                # Load all predictions that are better than random
                if score <= 0.001:
                    # include_num_runs.append(True)
                    logger.error('Model only predicts at random: ' +
                                  model_name + ' has score: ' + str(score))
                    backup_num_runs.append((automl_seed, num_run))
                else:
                    include_num_runs.append((automl_seed, num_run))

            model_idx += 1

        # If there is no model better than random guessing, we have to use
        # all models which do random guessing
        if len(include_num_runs) == 0:
            include_num_runs = backup_num_runs

        indices_to_model_names = dict()
        indices_to_run_num = dict()
        for i, model_name in enumerate(dir_ensemble_list):
            match = model_and_automl_re.search(model_name)
            automl_seed = int(match.group(1))
            num_run = int(match.group(2))
            if (automl_seed, num_run) in include_num_runs:
                num_indices = len(indices_to_model_names)
                indices_to_model_names[num_indices] = model_name
                indices_to_run_num[num_indices] = (automl_seed, num_run)

        # logging.info("Indices to model names:")
        # logging.info(indices_to_model_names)

        # for i, item in enumerate(sorted(model_names_to_scores.items(),
        #                                key=lambda t: t[1])):
        #    logging.info("%d: %s", i, item)

        include_num_runs = set(include_num_runs)

        all_predictions_train = get_predictions(dir_ensemble,
                                                dir_ensemble_list,
                                                include_num_runs,
                                                model_and_automl_re,
                                                precision)

#        if len(all_predictions_train) == len(all_predictions_test) == len(
#                all_predictions_valid) == 0:
        if len(include_num_runs) == 0:
            logger.error('All models do just random guessing')
            time.sleep(2)
            continue

        else:
            try:
                indices, trajectory = ensemble_selection(
                    np.array(all_predictions_train), targets_ensemble,
                    ensemble_size, task_type, metric)

                logger.info('Trajectory and indices!')
                logger.info(trajectory)
                logger.info(indices)

            except ValueError as e:
                logger.error('Caught ValueError: ' + str(e))
                used_time = watch.wall_elapsed('ensemble_builder')
                time.sleep(2)
                continue
            except Exception as e:
                logger.error('Caught error! %s', e)
                used_time = watch.wall_elapsed('ensemble_builder')
                time.sleep(2)
                continue

            # Output the score
            logger.info('Training performance: %f' % trajectory[-1])

            # Print the ensemble members:
            ensemble_members_run_numbers = dict()
            ensemble_members = Counter(indices).most_common()
            ensemble_members_string = 'Ensemble members:\n'
            logger.info(ensemble_members)
            for ensemble_member in ensemble_members:
                weight = float(ensemble_member[1]) / len(indices)
                ensemble_members_string += \
                    ('    %s; weight: %10f; performance: %10f\n' %
                     (indices_to_model_names[ensemble_member[0]],
                      weight,
                      model_names_to_scores[
                         indices_to_model_names[ensemble_member[0]]]))

                ensemble_members_run_numbers[
                    indices_to_run_num[
                        ensemble_member[0]]] = weight
            logger.info(ensemble_members_string)

        # Save the ensemble indices for later use!
        backend.save_ensemble_indices_weights(ensemble_members_run_numbers,
                                              index_run, seed)

        all_predictions_valid = get_predictions(dir_valid,
                                                dir_valid_list,
                                                include_num_runs,
                                                model_and_automl_re,
                                                precision)

        # Save predictions for valid and test data set
        if len(dir_valid_list) == len(dir_ensemble_list):
            all_predictions_valid = np.array(all_predictions_valid)
            ensemble_predictions_valid = np.mean(
                all_predictions_valid[indices.astype(int)], axis=0)
            backend.save_predictions_as_txt(ensemble_predictions_valid,
                                            'valid', index_run, prefix=basename)
        else:
            logger.info('Could not find as many validation set predictions '
                        '(%d) as ensemble predictions (%d)!',
                        len(dir_valid_list), len(dir_ensemble_list))

        del all_predictions_valid
        all_predictions_test = get_predictions(dir_test,
                                               dir_test_list,
                                               include_num_runs,
                                               model_and_automl_re,
                                               precision)

        if len(dir_test_list) == len(dir_ensemble_list):
            all_predictions_test = np.array(all_predictions_test)
            ensemble_predictions_test = np.mean(
                all_predictions_test[indices.astype(int)], axis=0)
            backend.save_predictions_as_txt(ensemble_predictions_test,
                                            'test', index_run, prefix=basename)
        else:
            logger.info('Could not find as many test set predictions (%d) as '
                         'ensemble predictions (%d)!',
                        len(dir_test_list), len(dir_ensemble_list))

        del all_predictions_test

        current_num_models = len(dir_ensemble_list)
        watch.stop_task('ensemble_iter_' + str(index_run))
        time_iter = watch.get_wall_dur('ensemble_iter_' + str(index_run))
        used_time = watch.wall_elapsed('ensemble_builder')
        index_run += 1
    return
Example no. 16
    def test_stopwatch_overhead(self):
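        # Same idea as the smaller overhead test above, but with far more
        # tasks and with CPU and wall-clock overhead measured in separate
        # passes.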
        # CPU overhead
        start = time.process_time()
        watch = StopWatch()
        for i in range(1, 100000):
            watch.start_task("task_%d" % i)
            watch.stop_task("task_%d" % i)
        stop = time.process_time()
        dur = stop - start
        cpu_overhead = dur - watch.cpu_sum()
        self.assertLess(cpu_overhead, 1.5)

        # Wall Overhead
        start = time.time()
        watch = StopWatch()
        for i in range(1, 100000):
            watch.start_task("task_%d" % i)
            watch.stop_task("task_%d" % i)
        stop = time.time()
        dur = stop - start
        wall_overhead = dur - watch.wall_sum()

        self.assertLess(wall_overhead, 2)
        self.assertLess(cpu_overhead, wall_overhead)
Example no. 17
def main(autosklearn_tmp_dir,
         dataset_name,
         task_type,
         metric,
         limit,
         output_dir,
         ensemble_size=None,
         ensemble_nbest=None,
         seed=1,
         shared_mode=False,
         max_iterations=-1,
         precision="32"):

    watch = StopWatch()
    watch.start_task('ensemble_builder')

    used_time = 0
    time_iter = 0
    index_run = 0
    num_iteration = 0
    current_num_models = 0

    backend = Backend(output_dir, autosklearn_tmp_dir)
    dir_ensemble = os.path.join(autosklearn_tmp_dir, '.auto-sklearn',
                                'predictions_ensemble')
    dir_valid = os.path.join(autosklearn_tmp_dir, '.auto-sklearn',
                             'predictions_valid')
    dir_test = os.path.join(autosklearn_tmp_dir, '.auto-sklearn',
                            'predictions_test')
    paths_ = [dir_ensemble, dir_valid, dir_test]

    dir_ensemble_list_mtimes = []

    while used_time < limit or (max_iterations > 0 and max_iterations >= num_iteration):
        num_iteration += 1
        logger.debug('Time left: %f', limit - used_time)
        logger.debug('Time last iteration: %f', time_iter)

        # Reload the ensemble targets every iteration; this is important
        # because cv may update them in the course of running auto-sklearn
        # TODO update cv in order to not need this any more!
        targets_ensemble = backend.load_targets_ensemble()

        # Load the predictions from the models
        exists = [os.path.isdir(dir_) for dir_ in paths_]
        if not exists[0]:  # all(exists):
            logger.debug('Prediction directory %s does not exist!' %
                          dir_ensemble)
            time.sleep(2)
            used_time = watch.wall_elapsed('ensemble_builder')
            continue

        if shared_mode is False:
            dir_ensemble_list = sorted(glob.glob(os.path.join(
                dir_ensemble, 'predictions_ensemble_%s_*.npy' % seed)))
            if exists[1]:
                dir_valid_list = sorted(glob.glob(os.path.join(
                    dir_valid, 'predictions_valid_%s_*.npy' % seed)))
            else:
                dir_valid_list = []
            if exists[2]:
                dir_test_list = sorted(glob.glob(os.path.join(
                    dir_test, 'predictions_test_%s_*.npy' % seed)))
            else:
                dir_test_list = []
        else:
            dir_ensemble_list = sorted(os.listdir(dir_ensemble))
            dir_valid_list = sorted(os.listdir(dir_valid)) if exists[1] else []
            dir_test_list = sorted(os.listdir(dir_test)) if exists[2] else []

        # Check the modification times because predictions can be updated
        # over time!
        old_dir_ensemble_list_mtimes = dir_ensemble_list_mtimes
        dir_ensemble_list_mtimes = []

        for dir_ensemble_file in dir_ensemble_list:
            if dir_ensemble_file.endswith("/"):
                dir_ensemble_file = dir_ensemble_file[:-1]
            basename = os.path.basename(dir_ensemble_file)
            dir_ensemble_file = os.path.join(dir_ensemble, basename)
            mtime = os.path.getmtime(dir_ensemble_file)
            dir_ensemble_list_mtimes.append(mtime)

        if len(dir_ensemble_list) == 0:
            logger.debug('Directories are empty')
            time.sleep(2)
            used_time = watch.wall_elapsed('ensemble_builder')
            continue

        if len(dir_ensemble_list) <= current_num_models and \
                old_dir_ensemble_list_mtimes == dir_ensemble_list_mtimes:
            logger.debug('Nothing has changed since the last time')
            time.sleep(2)
            used_time = watch.wall_elapsed('ensemble_builder')
            continue

        watch.start_task('ensemble_iter_' + str(index_run))

        # List of num_runs (which are in the filename) which will be included
        #  later
        include_num_runs = []
        backup_num_runs = []
        model_and_automl_re = re.compile(r'_([0-9]*)_([0-9]*)\.npy$')
        if ensemble_nbest is not None:
            # Keeps track of the individual score of each model in our ensemble
            scores_nbest = []
            # The indices of the models that are currently in our ensemble
            indices_nbest = []
            # The names of the models
            model_names = []

        model_names_to_scores = dict()

        model_idx = 0
        for model_name in dir_ensemble_list:
            if model_name.endswith("/"):
                model_name = model_name[:-1]
            basename = os.path.basename(model_name)

            if precision is "16":
                predictions = np.load(os.path.join(dir_ensemble, basename)).astype(dtype=np.float16)
            elif precision is "32":
                predictions = np.load(os.path.join(dir_ensemble, basename)).astype(dtype=np.float32)
            elif precision is "64":
                predictions = np.load(os.path.join(dir_ensemble, basename)).astype(dtype=np.float64)
            else:
                predictions = np.load(os.path.join(dir_ensemble, basename))

            try:
                score = calculate_score(targets_ensemble, predictions,
                                        task_type, metric,
                                        predictions.shape[1])
            except Exception:
                # Scoring can fail for malformed predictions; treat such a
                # model as if it only predicted at random.
                score = -1

            model_names_to_scores[model_name] = score
            match = model_and_automl_re.search(model_name)
            automl_seed = int(match.group(1))
            num_run = int(match.group(2))

            if ensemble_nbest is not None:
                if score <= 0.001:
                    logger.error('Model only predicts at random: ' +
                                  model_name + ' has score: ' + str(score))
                    backup_num_runs.append((automl_seed, num_run))
                # If we have fewer models in our ensemble than ensemble_nbest,
                # add the current model if it is better than random
                elif len(scores_nbest) < ensemble_nbest:
                    scores_nbest.append(score)
                    indices_nbest.append(model_idx)
                    include_num_runs.append((automl_seed, num_run))
                    model_names.append(model_name)
                else:
                    # Take the worst performing model in our ensemble so far
                    idx = np.argmin(np.array([scores_nbest]))

                    # If the current model is better than the worst model in
                    # our ensemble replace it by the current model
                    if scores_nbest[idx] < score:
                        logger.debug('Worst model in our ensemble: %s with '
                                      'score %f will be replaced by model %s '
                                      'with score %f', model_names[idx],
                                      scores_nbest[idx], model_name, score)
                        # Exclude the old model
                        del scores_nbest[idx]
                        scores_nbest.append(score)
                        del include_num_runs[idx]
                        del indices_nbest[idx]
                        indices_nbest.append(model_idx)
                        include_num_runs.append((automl_seed, num_run))
                        del model_names[idx]
                        model_names.append(model_name)

                    # Otherwise exclude the current model from the ensemble
                    else:
                        # include_num_runs.append(True)
                        pass

            else:
                # Load all predictions that are better than random
                if score <= 0.001:
                    # include_num_runs.append(True)
                    logger.error('Model only predicts at random: ' +
                                  model_name + ' has score: ' + str(score))
                    backup_num_runs.append((automl_seed, num_run))
                else:
                    include_num_runs.append((automl_seed, num_run))

            model_idx += 1

        # If there is no model better than random guessing, we have to use
        # all models which do random guessing
        if len(include_num_runs) == 0:
            include_num_runs = backup_num_runs

        indices_to_model_names = dict()
        indices_to_run_num = dict()
        for i, model_name in enumerate(dir_ensemble_list):
            match = model_and_automl_re.search(model_name)
            automl_seed = int(match.group(1))
            num_run = int(match.group(2))
            if (automl_seed, num_run) in include_num_runs:
                num_indices = len(indices_to_model_names)
                indices_to_model_names[num_indices] = model_name
                indices_to_run_num[num_indices] = (automl_seed, num_run)

        try:
            all_predictions_train, all_predictions_valid, all_predictions_test =\
                get_all_predictions(dir_ensemble, dir_ensemble_list,
                                    dir_valid, dir_valid_list,
                                    dir_test, dir_test_list,
                                    include_num_runs,
                                    model_and_automl_re,
                                    precision)
        except IOError:
            logger.error('Could not load the predictions.')
            continue

        if len(include_num_runs) == 0:
            logger.error('All models do just random guessing')
            time.sleep(2)
            continue

        else:
            ensemble = EnsembleSelection(ensemble_size=ensemble_size,
                                         task_type=task_type,
                                         metric=metric)

            try:
                ensemble.fit(all_predictions_train, targets_ensemble,
                             include_num_runs)
                logger.info(ensemble)

            except ValueError as e:
                logger.error('Caught ValueError: ' + str(e))
                used_time = watch.wall_elapsed('ensemble_builder')
                time.sleep(2)
                continue
            except IndexError as e:
                logger.error('Caught IndexError: ' + str(e))
                used_time = watch.wall_elapsed('ensemble_builder')
                time.sleep(2)
                continue
            except Exception as e:
                logger.error('Caught error! %s', e)
                used_time = watch.wall_elapsed('ensemble_builder')
                time.sleep(2)
                continue

            # Output the score
            logger.info('Training performance: %f' % ensemble.train_score_)

        # Save the ensemble for later use in the main auto-sklearn module!
        backend.save_ensemble(ensemble, index_run, seed)

        # Save predictions for valid and test data set
        if len(dir_valid_list) == len(dir_ensemble_list):
            all_predictions_valid = np.array(all_predictions_valid)
            ensemble_predictions_valid = ensemble.predict(all_predictions_valid)
            backend.save_predictions_as_txt(ensemble_predictions_valid,
                                            'valid', index_run, prefix=dataset_name)
        else:
            logger.info('Could not find as many validation set predictions '
                        '(%d) as ensemble predictions (%d)!',
                        len(dir_valid_list), len(dir_ensemble_list))

        del all_predictions_valid

        if len(dir_test_list) == len(dir_ensemble_list):
            all_predictions_test = np.array(all_predictions_test)
            ensemble_predictions_test = ensemble.predict(all_predictions_test)
            backend.save_predictions_as_txt(ensemble_predictions_test,
                                            'test', index_run, prefix=dataset_name)
        else:
            logger.info('Could not find as many test set predictions (%d) as '
                         'ensemble predictions (%d)!',
                        len(dir_test_list), len(dir_ensemble_list))

        del all_predictions_test

        current_num_models = len(dir_ensemble_list)
        watch.stop_task('ensemble_iter_' + str(index_run))
        time_iter = watch.get_wall_dur('ensemble_iter_' + str(index_run))
        used_time = watch.wall_elapsed('ensemble_builder')
        index_run += 1
    return
Example no. 18
class AutoML(BaseEstimator):

    def __init__(self,
                 backend,
                 time_left_for_this_task,
                 per_run_time_limit,
                 initial_configurations_via_metalearning=25,
                 ensemble_size=1,
                 ensemble_nbest=1,
                 seed=1,
                 ml_memory_limit=3072,
                 metadata_directory=None,
                 keep_models=True,
                 debug_mode=False,
                 include_estimators=None,
                 exclude_estimators=None,
                 include_preprocessors=None,
                 exclude_preprocessors=None,
                 resampling_strategy='holdout-iterative-fit',
                 resampling_strategy_arguments=None,
                 shared_mode=False,
                 precision=32,
                 disable_evaluator_output=False,
                 get_smac_object_callback=None,
                 smac_scenario_args=None,
                 ):
        super(AutoML, self).__init__()
        self._backend = backend
        #self._tmp_dir = tmp_dir
        #self._output_dir = output_dir
        self._time_for_task = time_left_for_this_task
        self._per_run_time_limit = per_run_time_limit
        self._initial_configurations_via_metalearning = \
            initial_configurations_via_metalearning
        self._ensemble_size = ensemble_size
        self._ensemble_nbest = ensemble_nbest
        self._seed = seed
        self._ml_memory_limit = ml_memory_limit
        self._data_memory_limit = None
        self._metadata_directory = metadata_directory
        self._keep_models = keep_models
        self._include_estimators = include_estimators
        self._exclude_estimators = exclude_estimators
        self._include_preprocessors = include_preprocessors
        self._exclude_preprocessors = exclude_preprocessors
        self._resampling_strategy = resampling_strategy
        self._resampling_strategy_arguments = resampling_strategy_arguments \
            if resampling_strategy_arguments is not None else {}
        self._shared_mode = shared_mode
        self.precision = precision
        self._disable_evaluator_output = disable_evaluator_output
        self._get_smac_object_callback = get_smac_object_callback
        self._smac_scenario_args = smac_scenario_args

        self._datamanager = None
        self._dataset_name = None
        self._stopwatch = StopWatch()
        self._logger = None
        self._task = None
        self._metric = None
        self._label_num = None
        self._parser = None
        self.models_ = None
        self.ensemble_ = None
        self._can_predict = False

        self._debug_mode = debug_mode

        if not isinstance(self._time_for_task, int):
            raise ValueError("time_left_for_this_task not of type integer, "
                             "but %s" % str(type(self._time_for_task)))
        if not isinstance(self._per_run_time_limit, int):
            raise ValueError("per_run_time_limit not of type integer, but %s" %
                             str(type(self._per_run_time_limit)))

        # After assigning and checking variables...
        #self._backend = Backend(self._output_dir, self._tmp_dir)

    def fit(
        self, X, y,
        task,
        metric,
        X_test=None,
        y_test=None,
        feat_type=None,
        dataset_name=None,
        only_return_configuration_space=False,
    ):
        if self._shared_mode:
            # If this fails, it's likely that this is the first call to get
            # the data manager
            try:
                D = self._backend.load_datamanager()
                dataset_name = D.name
            except IOError:
                pass

        if dataset_name is None:
            dataset_name = hash_array_or_matrix(X)

        self._backend.save_start_time(self._seed)
        self._stopwatch = StopWatch()
        self._dataset_name = dataset_name
        self._stopwatch.start_task(self._dataset_name)

        self._logger = self._get_logger(dataset_name)

        if metric is None:
            raise ValueError('No metric given.')
        if not isinstance(metric, Scorer):
            raise ValueError('Metric must be instance of '
                             'autosklearn.metrics.Scorer.')

        if feat_type is not None and len(feat_type) != X.shape[1]:
            raise ValueError('Array feat_type does not have same number of '
                             'variables as X has features. %d vs %d.' %
                             (len(feat_type), X.shape[1]))
        if feat_type is not None and not all([isinstance(f, str)
                                              for f in feat_type]):
            raise ValueError('Array feat_type must only contain strings.')
        if feat_type is not None:
            for ft in feat_type:
                if ft.lower() not in ['categorical', 'numerical']:
                    raise ValueError('Only `Categorical` and `Numerical` are '
                                     'valid feature types, you passed `%s`' % ft)

        self._data_memory_limit = None
        loaded_data_manager = XYDataManager(
            X, y,
            X_test=X_test,
            y_test=y_test,
            task=task,
            feat_type=feat_type,
            dataset_name=dataset_name,
        )

        return self._fit(
            loaded_data_manager,
            metric,
            only_return_configuration_space,
        )

    # TODO this is very old code which can be dropped!
    def fit_automl_dataset(self, dataset, metric):
        self._stopwatch = StopWatch()
        self._backend.save_start_time(self._seed)

        name = os.path.basename(dataset)
        self._stopwatch.start_task(name)
        self._start_task(self._stopwatch, name)
        self._dataset_name = name

        self._logger = self._get_logger(name)
        self._logger.debug('======== Reading and converting data ==========')
        # Encoding the labels will be done after the metafeature calculation!
        self._data_memory_limit = float(self._ml_memory_limit) / 3
        loaded_data_manager = CompetitionDataManager(
            dataset, max_memory_in_mb=self._data_memory_limit)
        loaded_data_manager_str = str(loaded_data_manager).split('\n')
        for part in loaded_data_manager_str:
            self._logger.debug(part)

        return self._fit(loaded_data_manager, metric)

    def fit_on_datamanager(self, datamanager, metric):
        self._stopwatch = StopWatch()
        self._backend.save_start_time(self._seed)

        name = os.path.basename(datamanager.name)
        self._stopwatch.start_task(name)
        self._start_task(self._stopwatch, name)
        self._dataset_name = name

        self._logger = self._get_logger(name)
        self._fit(datamanager, metric)

    def _get_logger(self, name):
        logger_name = 'AutoML(%d):%s' % (self._seed, name)
        setup_logger(os.path.join(self._backend.temporary_directory, '%s.log' % str(logger_name)))
        return get_logger(logger_name)

    @staticmethod
    def _start_task(watcher, task_name):
        watcher.start_task(task_name)

    @staticmethod
    def _stop_task(watcher, task_name):
        watcher.stop_task(task_name)

    @staticmethod
    def _print_load_time(basename, time_left_for_this_task,
                         time_for_load_data, logger):

        time_left_after_reading = max(
            0, time_left_for_this_task - time_for_load_data)
        logger.info('Remaining time after reading %s %5.2f sec' %
                    (basename, time_left_after_reading))
        return time_for_load_data

    def _do_dummy_prediction(self, datamanager, num_run):

        # When using partial-cv it makes no sense to do dummy predictions
        if self._resampling_strategy in ['partial-cv',
                                         'partial-cv-iterative-fit']:
            return num_run

        self._logger.info("Starting to create dummy predictions.")
        memory_limit = int(self._ml_memory_limit)
        scenario_mock = unittest.mock.Mock()
        scenario_mock.wallclock_limit = self._time_for_task
        # This stats object is a hack - maybe the SMAC stats object should
        # already be generated here!
        stats = Stats(scenario_mock)
        stats.start_timing()
        ta = ExecuteTaFuncWithQueue(backend=self._backend,
                                    autosklearn_seed=self._seed,
                                    resampling_strategy=self._resampling_strategy,
                                    initial_num_run=num_run,
                                    logger=self._logger,
                                    stats=stats,
                                    metric=self._metric,
                                    memory_limit=memory_limit,
                                    disable_file_output=self._disable_evaluator_output,
                                    **self._resampling_strategy_arguments)

        status, cost, runtime, additional_info = \
            ta.run(1, cutoff=self._time_for_task)
        if status == StatusType.SUCCESS:
            self._logger.info("Finished creating dummy predictions.")
        else:
            self._logger.error('Error creating dummy predictions: %s ',
                               str(additional_info))

        return ta.num_run

    def _fit(self, datamanager, metric, only_return_configuration_space=False):
        # Reset learnt stuff
        self.models_ = None
        self.ensemble_ = None

        # Check arguments prior to doing anything!
        if not isinstance(self._disable_evaluator_output, (bool, list)):
            raise ValueError('disable_evaluator_output must be of type bool '
                             'or list.')
        if isinstance(self._disable_evaluator_output, list):
            allowed_elements = ['model', 'y_optimization']
            for element in self._disable_evaluator_output:
                if element not in allowed_elements:
                    raise ValueError("List member '%s' for argument "
                                     "'disable_evaluator_output' must be one "
                                     "of " + str(allowed_elements))
        if self._resampling_strategy not in [
             'holdout', 'holdout-iterative-fit',
             'cv', 'partial-cv',
             'partial-cv-iterative-fit'] \
             and not issubclass(self._resampling_strategy, BaseCrossValidator)\
             and not issubclass(self._resampling_strategy, _RepeatedSplits)\
             and not issubclass(self._resampling_strategy, BaseShuffleSplit):
            raise ValueError('Illegal resampling strategy: %s' %
                             self._resampling_strategy)
        if self._resampling_strategy in ['partial-cv', 'partial-cv-iterative-fit'] \
                and self._ensemble_size != 0:
            raise ValueError("Resampling strategy %s cannot be used "
                             "together with ensembles." % self._resampling_strategy)
        if self._resampling_strategy in ['partial-cv', 'cv',
                                         'partial-cv-iterative-fit'] and \
                'folds' not in self._resampling_strategy_arguments:
            self._resampling_strategy_arguments['folds'] = 5

        self._backend._make_internals_directory()
        if self._keep_models:
            try:
                os.makedirs(self._backend.get_model_dir())
            except (OSError, FileExistsError) as e:
                if not self._shared_mode:
                    raise

        self._metric = metric
        self._task = datamanager.info['task']
        self._label_num = datamanager.info['label_num']

        # == Pickle the data manager to speed up loading
        data_manager_path = self._backend.save_datamanager(datamanager)

        time_for_load_data = self._stopwatch.wall_elapsed(self._dataset_name)

        if self._debug_mode:
            self._print_load_time(
                self._dataset_name,
                self._time_for_task,
                time_for_load_data,
                self._logger)

        # == Perform dummy predictions
        num_run = 1
        #if self._resampling_strategy in ['holdout', 'holdout-iterative-fit']:
        num_run = self._do_dummy_prediction(datamanager, num_run)

        # = Create a searchspace
        # Do this before one-hot encoding to make sure that the search space
        # is created for a dense classifier even if one-hot encoding would
        # make the data sparse (trade-off: if one-hot encoding made the data
        # sparse, densifier and truncatedSVD would probably lead to a
        # MemoryError; this way we cannot use some of the preprocessing
        # methods in case the data becomes sparse).
        self.configuration_space, configspace_path = self._create_search_space(
            self._backend.temporary_directory,
            self._backend,
            datamanager,
            include_estimators=self._include_estimators,
            exclude_estimators=self._exclude_estimators,
            include_preprocessors=self._include_preprocessors,
            exclude_preprocessors=self._exclude_preprocessors)
        if only_return_configuration_space:
            return self.configuration_space

        # == RUN ensemble builder
        # Do this before calculating the meta-features to make sure that the
        # dummy predictions are actually included in the ensemble even if
        # calculating the meta-features takes very long
        ensemble_task_name = 'runEnsemble'
        self._stopwatch.start_task(ensemble_task_name)
        time_left_for_ensembles = max(
            0, self._time_for_task
            - self._stopwatch.wall_elapsed(self._dataset_name))
        if self._logger:
            self._logger.info(
                'Start Ensemble with %5.2fsec time left' % time_left_for_ensembles)
        if time_left_for_ensembles <= 0:
            self._proc_ensemble = None
            # Fit only raises error when ensemble_size is not zero but
            # time_left_for_ensembles is zero.
            if self._ensemble_size > 0:
                raise ValueError("Not starting ensemble builder because there "
                                 "is no time left. Try increasing the value "
                                 "of time_left_for_this_task.")
        else:
            self._proc_ensemble = self._get_ensemble_process(time_left_for_ensembles)
            if self._ensemble_size > 0:
                self._proc_ensemble.start()
            else:
                self._logger.info('Not starting ensemble builder because '
                                  'ensemble size is <= 0.')
        self._stopwatch.stop_task(ensemble_task_name)

        # kill the datamanager as it will be re-loaded anyways from sub processes
        try:
            del self._datamanager
        except Exception:
            pass

        # => RUN SMAC
        smac_task_name = 'runSMAC'
        self._stopwatch.start_task(smac_task_name)
        time_left_for_smac = max(
            0, self._time_for_task
            - self._stopwatch.wall_elapsed(self._dataset_name))

        if self._logger:
            self._logger.info(
                'Start SMAC with %5.2fsec time left' % time_left_for_smac)
        if time_left_for_smac <= 0:
            self._logger.warning("Not starting SMAC because there is no time "
                                 "left.")
            _proc_smac = None
        else:
            if self._per_run_time_limit is None or \
                    self._per_run_time_limit > time_left_for_smac:
                self._logger.warning(
                    'Time limit for a single run is higher than total time '
                    'limit. Capping the limit for a single run to the total '
                    'time given to SMAC (%f)' % time_left_for_smac)
                per_run_time_limit = time_left_for_smac
            else:
                per_run_time_limit = self._per_run_time_limit

            _proc_smac = AutoMLSMBO(
                config_space=self.configuration_space,
                dataset_name=self._dataset_name,
                backend=self._backend,
                total_walltime_limit=time_left_for_smac,
                func_eval_time_limit=per_run_time_limit,
                memory_limit=self._ml_memory_limit,
                data_memory_limit=self._data_memory_limit,
                watcher=self._stopwatch,
                start_num_run=num_run,
                num_metalearning_cfgs=self._initial_configurations_via_metalearning,
                config_file=configspace_path,
                seed=self._seed,
                metadata_directory=self._metadata_directory,
                metric=self._metric,
                resampling_strategy=self._resampling_strategy,
                resampling_strategy_args=self._resampling_strategy_arguments,
                shared_mode=self._shared_mode,
                include_estimators=self._include_estimators,
                exclude_estimators=self._exclude_estimators,
                include_preprocessors=self._include_preprocessors,
                exclude_preprocessors=self._exclude_preprocessors,
                disable_file_output=self._disable_evaluator_output,
                get_smac_object_callback=self._get_smac_object_callback,
                smac_scenario_args=self._smac_scenario_args,
            )
            self.runhistory_, self.trajectory_ = \
                _proc_smac.run_smbo()
            trajectory_filename = os.path.join(
                self._backend.get_smac_output_directory_for_run(self._seed),
                'trajectory.json')
            saveable_trajectory = \
                [list(entry[:2]) + [entry[2].get_dictionary()] + list(entry[3:])
                 for entry in self.trajectory_]
            with open(trajectory_filename, 'w') as fh:
                json.dump(saveable_trajectory, fh)

        # Wait until the ensemble process is finished to avoid shutting down
        # while the ensemble builder tries to access the data
        if self._proc_ensemble is not None and self._ensemble_size > 0:
            self._proc_ensemble.join()

        self._proc_ensemble = None
        self._load_models()

        return self

    def refit(self, X, y):
        def send_warnings_to_log(message, category, filename, lineno,
                                 file=None, line=None):
            self._logger.debug('%s:%s: %s:%s' %
                               (filename, lineno, category.__name__, message))
            return

        if self._keep_models is not True:
            raise ValueError(
                "Refit can only be called if 'keep_models==True'")
        if self.models_ is None or len(self.models_) == 0 or \
                self.ensemble_ is None:
            self._load_models()

        # Refit is not applicable when ensemble_size is set to zero.
        if self.ensemble_ is None:
            raise ValueError("Refit can only be called if 'ensemble_size != 0'")

        random_state = np.random.RandomState(self._seed)
        for identifier in self.models_:
            if identifier in self.ensemble_.get_selected_model_identifiers():
                model = self.models_[identifier]
                # this updates the model inplace, it can then later be used in
                # predict method

                # try to fit the model. If it fails, shuffle the data. This
                # could alleviate the problem in algorithms that depend on
                # the ordering of the data.
                for i in range(10):
                    try:
                        with warnings.catch_warnings():
                            warnings.showwarning = send_warnings_to_log
                            model.fit(X.copy(), y.copy())
                        break
                    except ValueError as e:
                        indices = list(range(X.shape[0]))
                        random_state.shuffle(indices)
                        X = X[indices]
                        y = y[indices]

                        if i == 9:
                            raise e

        self._can_predict = True
        return self

    def predict(self, X, batch_size=None, n_jobs=1):
        """predict.

        Parameters
        ----------
        X: array-like, shape = (n_samples, n_features)

        batch_size: int or None, defaults to None
            batch_size controls whether the pipelines will be
            called on small chunks of the data. Useful when calling the
            predict method on the whole array X results in a MemoryError.

        n_jobs: int, defaults to 1
            Parallelize the predictions across the models with n_jobs
            processes.
        """
        if self._keep_models is not True:
            raise ValueError(
                "Predict can only be called if 'keep_models==True'")
        if not self._can_predict and \
                self._resampling_strategy not in  \
                        ['holdout', 'holdout-iterative-fit']:
            raise NotImplementedError(
                'Predict is currently not implemented for resampling '
                'strategy %s, please call refit().' % self._resampling_strategy)

        if self.models_ is None or len(self.models_) == 0 or \
                self.ensemble_ is None:
            self._load_models()

        # If self.ensemble_ is None, it means that ensemble_size is set to zero.
        # In such cases, raise error because predict and predict_proba cannot
        # be called.
        if self.ensemble_ is None:
            raise ValueError("Predict and predict_proba can only be called "
                             "if 'ensemble_size != 0'")

        # Parallelize predictions across models with n_jobs processes.
        # Each process computes predictions in chunks of batch_size rows.
        all_predictions = joblib.Parallel(n_jobs=n_jobs)(
            joblib.delayed(_model_predict)(self, X, batch_size, identifier)
            for identifier in self.ensemble_.get_selected_model_identifiers())

        if len(all_predictions) == 0:
            raise ValueError('Something went wrong generating the predictions. '
                             'The ensemble should consist of the following '
                             'models: %s, the following models were loaded: '
                             '%s' % (str(self.ensemble_.get_selected_model_identifiers()),
                                     str(list(self.models_.keys()))))

        predictions = self.ensemble_.predict(all_predictions)
        return predictions

    def fit_ensemble(self, y, task=None, metric=None, precision='32',
                     dataset_name=None, ensemble_nbest=None,
                     ensemble_size=None):
        if self._resampling_strategy in ['partial-cv', 'partial-cv-iterative-fit']:
            raise ValueError('Cannot call fit_ensemble with resampling '
                             'strategy %s.' % self._resampling_strategy)

        if self._logger is None:
            self._logger = self._get_logger(dataset_name)

        self._proc_ensemble = self._get_ensemble_process(
            1, task, metric, precision, dataset_name, max_iterations=1,
            ensemble_nbest=ensemble_nbest, ensemble_size=ensemble_size)
        self._proc_ensemble.main()
        self._proc_ensemble = None
        self._load_models()
        return self

    def _get_ensemble_process(self, time_left_for_ensembles,
                              task=None, metric=None, precision=None,
                              dataset_name=None, max_iterations=-1,
                              ensemble_nbest=None, ensemble_size=None):

        if task is None:
            task = self._task
        else:
            self._task = task

        if metric is None:
            metric = self._metric
        else:
            self._metric = metric

        if precision is None:
            precision = self.precision
        else:
            self.precision = precision

        if dataset_name is None:
            dataset_name = self._dataset_name
        else:
            self._dataset_name = dataset_name

        if ensemble_nbest is None:
            ensemble_nbest = self._ensemble_nbest
        else:
            self._ensemble_nbest = ensemble_nbest

        if ensemble_size is None:
            ensemble_size = self._ensemble_size
        else:
            self._ensemble_size = ensemble_size

        return EnsembleBuilder(backend=self._backend,
                               dataset_name=dataset_name,
                               task_type=task,
                               metric=metric,
                               limit=time_left_for_ensembles,
                               ensemble_size=ensemble_size,
                               ensemble_nbest=ensemble_nbest,
                               seed=self._seed,
                               shared_mode=self._shared_mode,
                               precision=precision,
                               max_iterations=max_iterations,
                               read_at_most=np.inf)

    def _load_models(self):
        if self._shared_mode:
            seed = -1
        else:
            seed = self._seed

        self.ensemble_ = self._backend.load_ensemble(seed)
        if self.ensemble_:
            identifiers = self.ensemble_.identifiers_
            self.models_ = self._backend.load_models_by_identifiers(identifiers)
            if len(self.models_) == 0 and self._resampling_strategy not in \
                    ['partial-cv', 'partial-cv-iterative-fit']:
                raise ValueError('No models fitted!')

        elif self._disable_evaluator_output is False or \
                (isinstance(self._disable_evaluator_output, list) and
                 'model' not in self._disable_evaluator_output):
            model_names = self._backend.list_all_models(seed)

            if len(model_names) == 0 and self._resampling_strategy not in \
                    ['partial-cv', 'partial-cv-iterative-fit']:
                raise ValueError('No models fitted!')

            self.models_ = []

        else:
            self.models_ = []

    def score(self, X, y):
        # fix: Consider only index 1 of second dimension
        # Don't know if the reshaping should be done there or in calculate_score
        prediction = self.predict(X)
        return calculate_score(solution=y,
                               prediction=prediction,
                               task_type=self._task,
                               metric=self._metric,
                               all_scoring_functions=False)

    @property
    def cv_results_(self):
        results = dict()

        # Missing in contrast to scikit-learn
        # splitX_test_score - auto-sklearn does not store the scores on a split
        #                     basis
        # std_test_score - auto-sklearn does not store the scores on a split
        #                  basis
        # splitX_train_score - auto-sklearn does not compute train scores, add
        #                      flag to compute the train scores
        # mean_train_score - auto-sklearn does not store the train scores
        # std_train_score - auto-sklearn does not store the train scores
        # std_fit_time - auto-sklearn does not store the fit times per split
        # mean_score_time - auto-sklearn does not store the score time
        # std_score_time - auto-sklearn does not store the score time
        # TODO: add those arguments

        # TODO remove this restriction!
        if self._resampling_strategy in ['partial-cv', 'partial-cv-iterative-fit']:
            raise ValueError('Cannot call cv_results when using partial-cv!')

        parameter_dictionaries = dict()
        masks = dict()
        hp_names = []

        # Set up dictionary for parameter values
        for hp in self.configuration_space.get_hyperparameters():
            name = hp.name
            parameter_dictionaries[name] = []
            masks[name] = []
            hp_names.append(name)

        mean_test_score = []
        mean_fit_time = []
        params = []
        status = []
        for run_key in self.runhistory_.data:
            run_value = self.runhistory_.data[run_key]
            config_id = run_key.config_id
            config = self.runhistory_.ids_config[config_id]

            param_dict = config.get_dictionary()
            params.append(param_dict)
            mean_test_score.append(1 - run_value.cost)
            mean_fit_time.append(run_value.time)
            s = run_value.status
            if s == StatusType.SUCCESS:
                status.append('Success')
            elif s == StatusType.TIMEOUT:
                status.append('Timeout')
            elif s == StatusType.CRASHED:
                status.append('Crash')
            elif s == StatusType.ABORT:
                status.append('Abort')
            elif s == StatusType.MEMOUT:
                status.append('Memout')
            else:
                raise NotImplementedError(s)

            for hp_name in hp_names:
                if hp_name in param_dict:
                    hp_value = param_dict[hp_name]
                    mask_value = False
                else:
                    hp_value = np.NaN
                    mask_value = True

                parameter_dictionaries[hp_name].append(hp_value)
                masks[hp_name].append(mask_value)

        results['mean_test_score'] = np.array(mean_test_score)
        results['mean_fit_time'] = np.array(mean_fit_time)
        results['params'] = params
        results['rank_test_scores'] = scipy.stats.rankdata(1 - results['mean_test_score'],
                                                           method='min')
        results['status'] = status

        for hp_name in hp_names:
            masked_array = ma.MaskedArray(parameter_dictionaries[hp_name],
                                          masks[hp_name])
            results['param_%s' % hp_name] = masked_array

        return results

    def sprint_statistics(self):
        cv_results = self.cv_results_
        sio = io.StringIO()
        sio.write('auto-sklearn results:\n')
        sio.write('  Dataset name: %s\n' % self._dataset_name)
        sio.write('  Metric: %s\n' % self._metric)
        idx_best_run = np.argmax(cv_results['mean_test_score'])
        best_score = cv_results['mean_test_score'][idx_best_run]
        sio.write('  Best validation score: %f\n' % best_score)
        num_runs = len(cv_results['status'])
        sio.write('  Number of target algorithm runs: %d\n' % num_runs)
        num_success = sum([s == 'Success' for s in cv_results['status']])
        sio.write('  Number of successful target algorithm runs: %d\n' % num_success)
        num_crash = sum([s == 'Crash' for s in cv_results['status']])
        sio.write('  Number of crashed target algorithm runs: %d\n' % num_crash)
        num_timeout = sum([s == 'Timeout' for s in cv_results['status']])
        sio.write('  Number of target algorithms that exceeded the time '
                  'limit: %d\n' % num_timeout)
        num_memout = sum([s == 'Memout' for s in cv_results['status']])
        sio.write('  Number of target algorithms that exceeded the memory '
                  'limit: %d\n' % num_memout)
        return sio.getvalue()

    def get_models_with_weights(self):
        if self.models_ is None or len(self.models_) == 0 or \
                self.ensemble_ is None:
            self._load_models()

        return self.ensemble_.get_models_with_weights(self.models_)

    def show_models(self):
        models_with_weights = self.get_models_with_weights()

        with io.StringIO() as sio:
            sio.write("[")
            for weight, model in models_with_weights:
                sio.write("(%f, %s),\n" % (weight, model))
            sio.write("]")

            return sio.getvalue()

    def _create_search_space(self, tmp_dir, backend, datamanager,
                             include_estimators=None,
                             exclude_estimators=None,
                             include_preprocessors=None,
                             exclude_preprocessors=None):
        task_name = 'CreateConfigSpace'

        self._stopwatch.start_task(task_name)
        configspace_path = os.path.join(tmp_dir, 'space.pcs')
        configuration_space = pipeline.get_configuration_space(
            datamanager.info,
            include_estimators=include_estimators,
            exclude_estimators=exclude_estimators,
            include_preprocessors=include_preprocessors,
            exclude_preprocessors=exclude_preprocessors)
        configuration_space = self.configuration_space_created_hook(
            datamanager, configuration_space)
        sp_string = pcs.write(configuration_space)
        backend.write_txt_file(configspace_path, sp_string,
                               'Configuration space')
        self._stopwatch.stop_task(task_name)

        return configuration_space, configspace_path

    def configuration_space_created_hook(self, datamanager, configuration_space):
        return configuration_space
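
For orientation, the internal AutoML class above is normally driven through auto-sklearn's public estimator wrappers rather than instantiated directly. The snippet below is a minimal usage sketch added here for illustration; it is not taken from the example above, and available constructor arguments and defaults vary between auto-sklearn versions, so treat it as an assumption-laden sketch rather than a reference.

# Minimal sketch: the public wrapper builds the backend and delegates to AutoML.
import sklearn.datasets
import sklearn.model_selection
from autosklearn.classification import AutoSklearnClassifier

X, y = sklearn.datasets.load_digits(return_X_y=True)
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
    X, y, random_state=1)

automl = AutoSklearnClassifier(
    time_left_for_this_task=60,  # overall optimization budget in seconds
    per_run_time_limit=15,       # budget for a single pipeline evaluation
    ensemble_size=10,            # size of the post-hoc ensemble
)
automl.fit(X_train, y_train, dataset_name='digits')
print(automl.sprint_statistics())
y_hat = automl.predict(X_test)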
Example n. 19
def main(logger, predictions_dir, basename, task_type, metric, limit, output_dir,
         ensemble_size=None):
    watch = StopWatch()
    watch.start_task('ensemble_builder')


    used_time = 0
    time_iter = 0
    index_run = 0
    current_num_models = 0

    while used_time < limit:
        logger.debug('Time left: %f' % (limit - used_time))
        logger.debug('Time last iteration: %f' % time_iter)
        # Load the true labels of the validation data
        true_labels = np.load(os.path.join(predictions_dir,
                                           'true_labels_ensemble.npy'))

        # Load the predictions from the models
        all_predictions_train = []
        dir_ensemble = os.path.join(predictions_dir, 'predictions_ensemble/')
        dir_valid = os.path.join(predictions_dir, 'predictions_valid/')
        dir_test = os.path.join(predictions_dir, 'predictions_test/')

        if not os.path.isdir(dir_ensemble) or not os.path.isdir(dir_valid) or \
                not os.path.isdir(dir_test):
            logger.debug('Prediction directory does not exist')
            time.sleep(2)
            used_time = watch.wall_elapsed('ensemble_builder')
            continue

        dir_ensemble_list = sorted(os.listdir(dir_ensemble))
        dir_valid_list = sorted(os.listdir(dir_valid))
        dir_test_list = sorted(os.listdir(dir_test))

        if check_data(logger, len(dir_ensemble_list), len(dir_valid_list),
                      len(dir_test_list), current_num_models):
            time.sleep(2)
            used_time = watch.wall_elapsed('ensemble_builder')
            continue

        watch.start_task('ensemble_iter_' + str(index_run))

        # Binary mask where True indicates that the corresponding model will
        # be excluded from the ensemble
        exclude_mask = []
        if ensemble_size is not None:
            # Keeps track of the single scores of each model in our ensemble
            scores_nbest = []
            # The indices of the model that are currently in our ensemble
            indices_nbest = []

        model_idx = 0
        for f in dir_ensemble_list:
            predictions = np.load(os.path.join(dir_ensemble, f))
            score = calculate_score(true_labels, predictions,
                                    task_type, metric,
                                    predictions.shape[1])

            if ensemble_size is not None:
                if score <= 0.001:
                    exclude_mask.append(True)
                    logger.error('Model only predicts at random: ' + f +
                                  ' has score: ' + str(score))
                # If we have fewer models in our ensemble than ensemble_size,
                # add the current model if it is better than random
                elif len(scores_nbest) < ensemble_size:
                    scores_nbest.append(score)
                    indices_nbest.append(model_idx)
                    exclude_mask.append(False)
                else:
                    # Take the worst performing model in our ensemble so far
                    idx = np.argmin(np.array([scores_nbest]))

                    # If the current model is better than the worst model in
                    # our ensemble replace it by the current model
                    if scores_nbest[idx] < score:
                        logger.debug(
                            'Worst model in our ensemble: %d with score %f will be replaced by model %d with score %f'
                            % (idx, scores_nbest[idx], model_idx, score))
                        scores_nbest[idx] = score
                        # Exclude the old model
                        exclude_mask[int(indices_nbest[idx])] = True
                        indices_nbest[idx] = model_idx
                        exclude_mask.append(False)
                    # Otherwise exclude the current model from the ensemble
                    else:
                        exclude_mask.append(True)

            else:
                # Load all predictions that are better than random
                if score <= 0.001:
                    exclude_mask.append(True)
                    logger.error('Model only predicts at random: ' + f +
                                  ' has score: ' + str(score))
                else:
                    exclude_mask.append(False)
                    all_predictions_train.append(predictions)

            model_idx += 1
            logger.debug(exclude_mask)

        all_predictions_valid = get_predictions(dir_valid,
                                                dir_valid_list,
                                                exclude_mask)
        all_predictions_test = get_predictions(dir_test,
                                               dir_test_list,
                                               exclude_mask)

        if len(all_predictions_train) == len(all_predictions_test) == len(
                all_predictions_valid) == 0:
            logger.error('All models do just random guessing')
            time.sleep(2)
            continue

        if len(all_predictions_train) == 1:
            logger.debug('Only one model so far we just copy its predictions')
            Y_valid = all_predictions_valid[0]
            Y_test = all_predictions_test[0]
        else:
            try:
                # Compute the weights for the ensemble
                # Use equally initialized weights
                n_models = len(all_predictions_train)
                init_weights = np.ones([n_models]) / n_models

                weights = weighted_ensemble(logger.debug, np.array(all_predictions_train),
                                            true_labels, task_type, metric,
                                            init_weights)
            except ValueError:
                logger.error('Caught ValueError!')
                used_time = watch.wall_elapsed('ensemble_builder')
                continue
            except Exception:
                logger.error('Caught error!')
                used_time = watch.wall_elapsed('ensemble_builder')
                continue

            # Compute the ensemble predictions for the valid data
            Y_valid = ensemble_prediction(np.array(all_predictions_valid),
                                          weights)

            # Compute the ensemble predictions for the test data
            Y_test = ensemble_prediction(np.array(all_predictions_test),
                                         weights)

        # Save predictions for valid and test data set
        filename_valid = os.path.join(
            output_dir,
            basename + '_valid_' + str(index_run).zfill(3) + '.predict')
        save_predictions(os.path.join(predictions_dir, filename_valid),
                         Y_valid)

        filename_test = os.path.join(
            output_dir,
            basename + '_test_' + str(index_run).zfill(3) + '.predict')
        save_predictions(os.path.join(predictions_dir, filename_test),
                         Y_test)

        current_num_models = len(dir_ensemble_list)
        watch.stop_task('ensemble_iter_' + str(index_run))
        time_iter = watch.get_wall_dur('ensemble_iter_' + str(index_run))
        used_time = watch.wall_elapsed('ensemble_builder')
        index_run += 1
    return
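
The n-best bookkeeping inside the loop above can be hard to follow in isolation. The following self-contained sketch (added for illustration; the function name select_nbest and the 0.001 threshold constant are assumptions, not part of the original) reproduces just that selection logic: drop anything that scores no better than random, keep at most ensemble_size models, and mark everything else as excluded.

import numpy as np

def select_nbest(scores, ensemble_size):
    """Return a boolean mask where True means 'exclude this model'."""
    exclude_mask = []
    scores_nbest = []   # scores of the models currently kept
    indices_nbest = []  # positions of the kept models in `scores`
    for model_idx, score in enumerate(scores):
        if score <= 0.001:
            # Model predicts (roughly) at random -> never keep it
            exclude_mask.append(True)
        elif len(scores_nbest) < ensemble_size:
            # Still room in the ensemble -> keep it
            scores_nbest.append(score)
            indices_nbest.append(model_idx)
            exclude_mask.append(False)
        else:
            # Ensemble is full: replace the worst kept model if the new
            # model is better, otherwise exclude the new model
            idx = int(np.argmin(scores_nbest))
            if scores_nbest[idx] < score:
                exclude_mask[indices_nbest[idx]] = True
                scores_nbest[idx] = score
                indices_nbest[idx] = model_idx
                exclude_mask.append(False)
            else:
                exclude_mask.append(True)
    return exclude_mask

# Example: keep the two best of five models.
# select_nbest([0.0, 0.6, 0.4, 0.7, 0.5], ensemble_size=2)
# -> [True, False, True, False, True]  (the models scoring 0.6 and 0.7 are kept)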
Example n. 20
class AutoML(BaseEstimator):

    def __init__(self,
                 backend,
                 time_left_for_this_task,
                 per_run_time_limit,
                 log_dir=None,
                 initial_configurations_via_metalearning=25,
                 ensemble_size=1,
                 ensemble_nbest=1,
                 seed=1,
                 ml_memory_limit=3000,
                 metadata_directory=None,
                 keep_models=True,
                 debug_mode=False,
                 include_estimators=None,
                 include_preprocessors=None,
                 resampling_strategy='holdout-iterative-fit',
                 resampling_strategy_arguments=None,
                 delete_tmp_folder_after_terminate=False,
                 delete_output_folder_after_terminate=False,
                 shared_mode=False,
                 precision=32,
                 max_iter_smac=None,
                 acquisition_function='EI'):
        super(AutoML, self).__init__()
        self._backend = backend
        #self._tmp_dir = tmp_dir
        #self._output_dir = output_dir
        self._time_for_task = time_left_for_this_task
        self._per_run_time_limit = per_run_time_limit
        #self._log_dir = log_dir if log_dir is not None else self._tmp_dir
        self._initial_configurations_via_metalearning = \
            initial_configurations_via_metalearning
        self._ensemble_size = ensemble_size
        self._ensemble_nbest = ensemble_nbest
        self._seed = seed
        self._ml_memory_limit = ml_memory_limit
        self._data_memory_limit = None
        self._metadata_directory = metadata_directory
        self._keep_models = keep_models
        self._include_estimators = include_estimators
        self._include_preprocessors = include_preprocessors
        self._resampling_strategy = resampling_strategy
        self._resampling_strategy_arguments = resampling_strategy_arguments \
            if resampling_strategy_arguments is not None else {}
        self._max_iter_smac = max_iter_smac
        #self.delete_tmp_folder_after_terminate = \
        #    delete_tmp_folder_after_terminate
        #self.delete_output_folder_after_terminate = \
        #    delete_output_folder_after_terminate
        self._shared_mode = shared_mode
        self.precision = precision
        self.acquisition_function = acquisition_function

        self._datamanager = None
        self._dataset_name = None
        self._stopwatch = StopWatch()
        self._logger = None
        self._task = None
        self._metric = None
        self._label_num = None
        self._parser = None
        self.models_ = None
        self.ensemble_ = None
        self._can_predict = False

        self._debug_mode = debug_mode

        if not isinstance(self._time_for_task, int):
            raise ValueError("time_left_for_this_task not of type integer, "
                             "but %s" % str(type(self._time_for_task)))
        if not isinstance(self._per_run_time_limit, int):
            raise ValueError("per_run_time_limit not of type integer, but %s" %
                             str(type(self._per_run_time_limit)))

        # After assigning and checking variables...
        #self._backend = Backend(self._output_dir, self._tmp_dir)

    def start_automl(self, parser):
        self._parser = parser
        self.start()

    def start(self):
        if self._parser is None:
            raise ValueError('You must invoke start() only via start_automl()')
        super(AutoML, self).start()

    def run(self):
        if self._parser is None:
            raise ValueError('You must invoke run() only via start_automl()')
        self._backend.save_start_time(self._seed)
        self._stopwatch = StopWatch()
        datamanager = get_data_manager(namespace=self._parser)
        self._stopwatch.start_task(datamanager.name)

        self._logger = self._get_logger(datamanager.name)

        self._datamanager = datamanager
        self._dataset_name = datamanager.name
        self._fit(self._datamanager)

    def fit(self, X, y,
            task=MULTICLASS_CLASSIFICATION,
            metric='acc_metric',
            feat_type=None,
            dataset_name=None):
        if not self._shared_mode:
            self._backend.context.delete_directories()
        else:
            # If this fails, it's likely that this is the first call to get
            # the data manager
            try:
                D = self._backend.load_datamanager()
                dataset_name = D.name
            except IOError:
                pass

        self._backend.context.create_directories()

        if dataset_name is None:
            dataset_name = hash_numpy_array(X)

        self._backend.save_start_time(self._seed)
        self._stopwatch = StopWatch()
        self._dataset_name = dataset_name
        self._stopwatch.start_task(self._dataset_name)

        self._logger = self._get_logger(dataset_name)

        if isinstance(metric, str):
            metric = STRING_TO_METRIC[metric]

        if feat_type is not None and len(feat_type) != X.shape[1]:
            raise ValueError('Array feat_type does not have same number of '
                             'variables as X has features. %d vs %d.' %
                             (len(feat_type), X.shape[1]))
        if feat_type is not None and not all([isinstance(f, str)
                                              for f in feat_type]):
            raise ValueError('Array feat_type must only contain strings.')
        if feat_type is not None:
            for ft in feat_type:
                if ft.lower() not in ['categorical', 'numerical']:
                    raise ValueError('Only `Categorical` and `Numerical` are '
                                     'valid feature types, you passed `%s`' % ft)

        self._data_memory_limit = None
        loaded_data_manager = XYDataManager(X, y,
                                            task=task,
                                            metric=metric,
                                            feat_type=feat_type,
                                            dataset_name=dataset_name,
                                            encode_labels=False)

        return self._fit(loaded_data_manager)

    def fit_automl_dataset(self, dataset):
        self._stopwatch = StopWatch()
        self._backend.save_start_time(self._seed)

        name = os.path.basename(dataset)
        self._stopwatch.start_task(name)
        self._start_task(self._stopwatch, name)
        self._dataset_name = name

        self._logger = self._get_logger(name)
        self._logger.debug('======== Reading and converting data ==========')
        # Encoding the labels will be done after the metafeature calculation!
        self._data_memory_limit = float(self._ml_memory_limit) / 3
        loaded_data_manager = CompetitionDataManager(
            dataset, encode_labels=False,
            max_memory_in_mb=self._data_memory_limit)
        loaded_data_manager_str = str(loaded_data_manager).split('\n')
        for part in loaded_data_manager_str:
            self._logger.debug(part)

        return self._fit(loaded_data_manager)

    def _get_logger(self, name):
        logger_name = 'AutoML(%d):%s' % (self._seed, name)
        setup_logger(os.path.join(self._backend.temporary_directory, '%s.log' % str(logger_name)))
        return get_logger(logger_name)

    @staticmethod
    def _start_task(watcher, task_name):
        watcher.start_task(task_name)

    @staticmethod
    def _stop_task(watcher, task_name):
        watcher.stop_task(task_name)

    @staticmethod
    def _print_load_time(basename, time_left_for_this_task,
                         time_for_load_data, logger):

        time_left_after_reading = max(
            0, time_left_for_this_task - time_for_load_data)
        logger.info('Remaining time after reading %s %5.2f sec' %
                    (basename, time_left_after_reading))
        return time_for_load_data

    def _do_dummy_prediction(self, datamanager, num_run):

        self._logger.info("Starting to create dummy predictions.")
        # time_limit = int(self._time_for_task / 6.)
        memory_limit = int(self._ml_memory_limit)
        ta = ExecuteTaFuncWithQueue(backend=self._backend,
                                    autosklearn_seed=self._seed,
                                    resampling_strategy=self._resampling_strategy,
                                    initial_num_run=num_run,
                                    logger=self._logger,
                                    **self._resampling_strategy_arguments)

        status, cost, runtime, additional_info = \
            ta.run(1, cutoff=self._time_for_task, memory_limit=memory_limit)
        if status == StatusType.SUCCESS:
            self._logger.info("Finished creating dummy predictions.")
        else:
            self._logger.error('Error creating dummy predictions: %s',
                               additional_info)

        #status, cost, runtime, additional_info = \
        #    ta.run(2, cutoff=time_limit, memory_limit=memory_limit)
        #if status == StatusType.SUCCESS:
        #    self._logger.info("Finished creating dummy prediction 2/2.")
        #else:
        #    self._logger.error('Error creating dummy prediction 2/2 %s',
        #                       additional_info)

        return ta.num_run

    def _fit(self, datamanager):
        # Reset learnt stuff
        self.models_ = None
        self.ensemble_ = None

        # Check arguments prior to doing anything!
        if self._resampling_strategy not in ['holdout', 'holdout-iterative-fit',
                                             'cv', 'nested-cv', 'partial-cv']:
            raise ValueError('Illegal resampling strategy: %s' %
                             self._resampling_strategy)
        if self._resampling_strategy == 'partial-cv' and \
                self._ensemble_size != 0:
            raise ValueError("Resampling strategy partial-cv cannot be used "
                             "together with ensembles.")

        acquisition_functions = ['EI', 'EIPS']
        if self.acquisition_function not in acquisition_functions:
            raise ValueError('Illegal acquisition %s: Must be one of %s.' %
                             (self.acquisition_function, acquisition_functions))

        self._backend._make_internals_directory()
        if self._keep_models:
            try:
                os.mkdir(self._backend.get_model_dir())
            except OSError:
                if not self._shared_mode:
                    raise

        self._metric = datamanager.info['metric']
        self._task = datamanager.info['task']
        self._label_num = datamanager.info['label_num']

        # == Pickle the data manager to speed up loading
        data_manager_path = self._backend.save_datamanager(datamanager)

        self._save_ensemble_data(
            datamanager.data['X_train'],
            datamanager.data['Y_train'])

        time_for_load_data = self._stopwatch.wall_elapsed(self._dataset_name)

        if self._debug_mode:
            self._print_load_time(
                self._dataset_name,
                self._time_for_task,
                time_for_load_data,
                self._logger)

        # == Perform dummy predictions
        num_run = 1
        #if self._resampling_strategy in ['holdout', 'holdout-iterative-fit']:
        num_run = self._do_dummy_prediction(datamanager, num_run)

        # = Create a searchspace
        # Do this before one-hot encoding to make sure that the search space
        # is created for a dense classifier even if one-hot encoding would
        # make the data sparse (trade-off: if one-hot encoding made the data
        # sparse, densifier and truncatedSVD would probably lead to a
        # MemoryError; this way we cannot use some of the preprocessing
        # methods in case the data becomes sparse).
        self.configuration_space, configspace_path = self._create_search_space(
            self._backend.temporary_directory,
            self._backend,
            datamanager,
            self._include_estimators,
            self._include_preprocessors)

        # == RUN ensemble builder
        # Do this before calculating the meta-features to make sure that the
        # dummy predictions are actually included in the ensemble even if
        # calculating the meta-features takes very long
        ensemble_task_name = 'runEnsemble'
        self._stopwatch.start_task(ensemble_task_name)
        time_left_for_ensembles = max(
            0, self._time_for_task
            - self._stopwatch.wall_elapsed(self._dataset_name))
        if self._logger:
            self._logger.info(
                'Start Ensemble with %5.2fsec time left' % time_left_for_ensembles)
        if time_left_for_ensembles <= 0:
            self._logger.warning("Not starting ensemble builder because there "
                                 "is no time left!")
            self._proc_ensemble = None
        else:
            self._proc_ensemble = self._get_ensemble_process(time_left_for_ensembles)
            if self._ensemble_size > 0:
                self._proc_ensemble.start()
            else:
                self._logger.info('Not starting ensemble builder because '
                                  'ensemble size is <= 0.')
        self._stopwatch.stop_task(ensemble_task_name)

        # kill the datamanager as it will be re-loaded anyways from sub processes
        try:
            del self._datamanager
        except Exception:
            pass
            
        # => RUN SMAC
        smac_task_name = 'runSMAC'
        self._stopwatch.start_task(smac_task_name)
        time_left_for_smac = max(
            0, self._time_for_task
            - self._stopwatch.wall_elapsed(self._dataset_name))

        if self._logger:
            self._logger.info(
                'Start SMAC with %5.2fsec time left' % time_left_for_smac)
        if time_left_for_smac <= 0:
            self._logger.warning("Not starting SMAC because there is no time "
                                 "left.")
            _proc_smac = None
        else:
            if self._per_run_time_limit is None or \
                    self._per_run_time_limit > time_left_for_smac:
                self._logger.warning(
                    'Time limit for a single run is higher than total time '
                    'limit. Capping the limit for a single run to the total '
                    'time given to SMAC (%f)' % time_left_for_smac)
                per_run_time_limit = time_left_for_smac
            else:
                per_run_time_limit = self._per_run_time_limit

            _proc_smac = AutoMLSMBO(config_space=self.configuration_space,
                                    dataset_name=self._dataset_name,
                                    backend=self._backend,
                                    total_walltime_limit=time_left_for_smac,
                                    func_eval_time_limit=per_run_time_limit,
                                    memory_limit=self._ml_memory_limit,
                                    data_memory_limit=self._data_memory_limit,
                                    watcher=self._stopwatch,
                                    start_num_run=num_run,
                                    num_metalearning_cfgs=self._initial_configurations_via_metalearning,
                                    config_file=configspace_path,
                                    smac_iters=self._max_iter_smac,
                                    seed=self._seed,
                                    metadata_directory=self._metadata_directory,
                                    resampling_strategy=self._resampling_strategy,
                                    resampling_strategy_args=self._resampling_strategy_arguments,
                                    acquisition_function=self.acquisition_function,
                                    shared_mode=self._shared_mode)
            self.runhistory_ = _proc_smac.run_smbo()

        self._proc_ensemble = None
        self._load_models()

        return self

    def refit(self, X, y):
        if self._keep_models is not True:
            raise ValueError(
                "Predict can only be called if 'keep_models==True'")
        if self.models_ is None or len(self.models_) == 0 or \
                self.ensemble_ is None:
            self._load_models()

        random_state = np.random.RandomState(self._seed)
        for identifier in self.models_:
            if identifier in self.ensemble_.get_model_identifiers():
                model = self.models_[identifier]
                # this updates the model inplace, it can then later be used in
                # predict method

                # try to fit the model. If it fails, shuffle the data. This
                # could alleviate the problem in algorithms that depend on
                # the ordering of the data.
                for i in range(10):
                    try:
                        model.fit(X.copy(), y.copy())
                        break
                    except ValueError:
                        indices = list(range(X.shape[0]))
                        random_state.shuffle(indices)
                        X = X[indices]
                        y = y[indices]

        self._can_predict = True
        return self

    def predict(self, X):
        if self._keep_models is not True:
            raise ValueError(
                "Predict can only be called if 'keep_models==True'")
        if not self._can_predict and \
                self._resampling_strategy not in  \
                        ['holdout', 'holdout-iterative-fit']:
            raise NotImplementedError(
                'Predict is currently only implemented for resampling '
                'strategy holdout.')

        if self.models_ is None or len(self.models_) == 0 or \
                self.ensemble_ is None:
            self._load_models()

        all_predictions = []
        for identifier in self.ensemble_.get_model_identifiers():
            model = self.models_[identifier]

            X_ = X.copy()
            if self._task in REGRESSION_TASKS:
                prediction = model.predict(X_)
            else:
                prediction = model.predict_proba(X_)

            if len(prediction.shape) < 1 or len(X_.shape) < 1 or \
                    X_.shape[0] < 1 or prediction.shape[0] != X_.shape[0]:
                self._logger.warning("Prediction shape for model %s is %s "
                                     "while X_.shape is %s" %
                                     (model, str(prediction.shape),
                                      str(X_.shape)))
            all_predictions.append(prediction)

        if len(all_predictions) == 0:
            raise ValueError('Something went wrong generating the predictions. '
                             'The ensemble should consist of the following '
                             'models: %s, the following models were loaded: '
                             '%s' % (str(self.ensemble_.get_model_identifiers()),
                                     str(list(self.models_.keys()))))

        predictions = self.ensemble_.predict(all_predictions)
        return predictions

    def fit_ensemble(self, y, task=None, metric=None, precision='32',
                     dataset_name=None, ensemble_nbest=None,
                     ensemble_size=None):
        if self._logger is None:
            self._logger = self._get_logger(dataset_name)

        self._proc_ensemble = self._get_ensemble_process(
            1, task, metric, precision, dataset_name, max_iterations=1,
            ensemble_nbest=ensemble_nbest, ensemble_size=ensemble_size)
        self._proc_ensemble.main()
        return self

    def _get_ensemble_process(self, time_left_for_ensembles,
                              task=None, metric=None, precision=None,
                              dataset_name=None, max_iterations=-1,
                              ensemble_nbest=None, ensemble_size=None):

        if task is None:
            task = self._task
        else:
            self._task = task

        if metric is None:
            metric = self._metric
        else:
            self._metric = metric

        if precision is None:
            precision = self.precision
        else:
            self.precision = precision

        if dataset_name is None:
            dataset_name = self._dataset_name
        else:
            self._dataset_name = dataset_name

        if ensemble_nbest is None:
            ensemble_nbest = self._ensemble_nbest
        else:
            self._ensemble_nbest = ensemble_nbest

        if ensemble_size is None:
            ensemble_size = self._ensemble_size
        else:
            self._ensemble_size = ensemble_size

        return EnsembleBuilder(backend=self._backend,
                               dataset_name=dataset_name,
                               task_type=task,
                               metric=metric,
                               limit=time_left_for_ensembles,
                               ensemble_size=ensemble_size,
                               ensemble_nbest=ensemble_nbest,
                               seed=self._seed,
                               shared_mode=self._shared_mode,
                               precision=precision,
                               max_iterations=max_iterations)

    def _load_models(self):
        if self._shared_mode:
            seed = -1
        else:
            seed = self._seed

        self.ensemble_ = self._backend.load_ensemble(seed)
        if self.ensemble_:
            identifiers = self.ensemble_.identifiers_
            self.models_ = self._backend.load_models_by_identifiers(identifiers)
        else:
            self.models_ = self._backend.load_all_models(seed)

        if len(self.models_) == 0:
            raise ValueError('No models fitted!')

    def score(self, X, y):
        # FIXME: for binary classification, consider only index 1 of the
        # second dimension; it is unclear whether the reshaping should be
        # done here or in calculate_score.
        prediction = self.predict(X)
        return calculate_score(y, prediction, self._task,
                               self._metric, self._label_num,
                               logger=self._logger)

    @property
    def grid_scores_(self):
        grid_scores = list()

        scores_per_config = defaultdict(list)
        config_list = list()

        for run_key in self.runhistory_.data:
            run_value = self.runhistory_.data[run_key]

            config_id = run_key.config_id
            cost = run_value.cost

            if config_id not in config_list:
                config_list.append(config_id)

            scores_per_config[config_id].append(cost)

        for config_id in config_list:
            scores = [1 - score for score in scores_per_config[config_id]]
            mean_score = np.mean(scores)
            config = self.runhistory_.ids_config[config_id]

            grid_score = _CVScoreTuple(config.get_dictionary(), mean_score,
                                       scores)
            grid_scores.append(grid_score)

        return grid_scores

    @property
    def cv_results_(self):
        results = dict()

        # Missing in contrast to scikit-learn
        # splitX_test_score - auto-sklearn does not store the scores on a split
        #                     basis
        # std_test_score - auto-sklearn does not store the scores on a split
        #                  basis
        # splitX_train_score - auto-sklearn does not compute train scores, add
        #                      flag to compute the train scores
        # mean_train_score - auto-sklearn does not store the train scores
        # std_train_score - auto-sklearn does not store the train scores
        # std_fit_time - auto-sklearn does not store the fit times per split
        # mean_score_time - auto-sklearn does not store the score time
        # std_score_time - auto-sklearn does not store the score time
        # TODO: add those arguments

        parameter_dictionaries = dict()
        masks = dict()
        hp_names = []

        # Set up dictionary for parameter values
        for hp in self.configuration_space.get_hyperparameters():
            name = hp.name
            parameter_dictionaries[name] = []
            masks[name] = []
            hp_names.append(name)

        mean_test_score = []
        mean_fit_time = []
        params = []
        status = []
        for run_key in self.runhistory_.data:
            run_value = self.runhistory_.data[run_key]
            config_id = run_key.config_id
            config = self.runhistory_.ids_config[config_id]

            param_dict = config.get_dictionary()
            params.append(param_dict)
            mean_test_score.append(1 - run_value.cost)
            mean_fit_time.append(run_value.time)
            # Map SMAC status codes to human-readable strings
            s = run_value.status
            if s == 1:
                status.append('Success')
            elif s == 2:
                status.append('Timeout')
            elif s == 3:
                status.append('Crash')
            elif s == 4:
                status.append('Abort')
            elif s == 5:
                status.append('Memout')
            else:
                status.append('Unknown')

            for hp_name in hp_names:
                if hp_name in param_dict:
                    hp_value = param_dict[hp_name]
                    mask_value = False
                else:
                    hp_value = np.nan
                    mask_value = True

                parameter_dictionaries[hp_name].append(hp_value)
                masks[hp_name].append(mask_value)

        results['mean_test_score'] = np.array(mean_test_score)
        results['mean_fit_time'] = np.array(mean_fit_time)
        results['params'] = params
        results['rank_test_scores'] = scipy.stats.rankdata(1 - results['mean_test_score'],
                                                           method='min')
        results['status'] = status

        for hp_name in hp_names:
            masked_array = ma.MaskedArray(parameter_dictionaries[hp_name],
                                          masks[hp_name])
            results['param_%s' % hp_name] = masked_array

        return results

    def sprint_statistics(self):
        cv_results = self.cv_results_
        sio = io.StringIO()
        sio.write('auto-sklearn results:\n')
        sio.write('  Dataset name: %s\n' % self._dataset_name)
        sio.write('  Metric: %s\n' % METRIC_TO_STRING[self._metric])
        idx_best_run = np.argmax(cv_results['mean_test_score'])
        best_score = cv_results['mean_test_score'][idx_best_run]
        sio.write('  Best validation score: %f\n' % best_score)
        num_runs = len(cv_results['status'])
        sio.write('  Number of target algorithm runs: %d\n' % num_runs)
        num_success = sum([s == 'Success' for s in cv_results['status']])
        sio.write('  Number of successful target algorithm runs: %d\n' % num_success)
        num_crash = sum([s == 'Crash' for s in cv_results['status']])
        sio.write('  Number of crashed target algorithm runs: %d\n' % num_crash)
        num_timeout = sum([s == 'Timeout' for s in cv_results['status']])
        sio.write('  Number of target algorithms that exceeded the time '
                  'limit: %d\n' % num_timeout)
        num_memout = sum([s == 'Memout' for s in cv_results['status']])
        sio.write('  Number of target algorithms that exceeded the memory '
                  'limit: %d\n' % num_memout)
        return sio.getvalue()

    def show_models(self):
        if self.models_ is None or len(self.models_) == 0 or \
                self.ensemble_ is None:
            self._load_models()

        return self.ensemble_.pprint_ensemble_string(self.models_)

    def _save_ensemble_data(self, X, y):
        """Split dataset and store Data for the ensemble script.

        :param X:
        :param y:
        :return:

        """
        task_name = 'LoadData'
        self._start_task(self._stopwatch, task_name)
        _, _, _, y_ensemble = resampling.split_data(X, y)
        self._backend.save_targets_ensemble(y_ensemble)
        self._stop_task(self._stopwatch, task_name)

    def _create_search_space(self, tmp_dir, backend, datamanager,
                             include_estimators=None,
                             include_preprocessors=None):
        task_name = 'CreateConfigSpace'

        self._stopwatch.start_task(task_name)
        configspace_path = os.path.join(tmp_dir, 'space.pcs')
        configuration_space = pipeline.get_configuration_space(
            datamanager.info,
            include_estimators=include_estimators,
            include_preprocessors=include_preprocessors)
        configuration_space = self.configuration_space_created_hook(
            datamanager, configuration_space)
        sp_string = pcs.write(configuration_space)
        backend.write_txt_file(configspace_path, sp_string,
                               'Configuration space')
        self._stopwatch.stop_task(task_name)

        return configuration_space, configspace_path

    def configuration_space_created_hook(self, datamanager, configuration_space):
        return configuration_space
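
The methods above form the post-fit API of this AutoML variant: predict, fit_ensemble, cv_results_, sprint_statistics and show_models. The standalone sketch below only illustrates how such a fitted object might be inspected; the automl argument, the inspect_automl_run name and the pandas conversion are illustrative assumptions, not part of the source above.

import numpy as np
import pandas as pd


def inspect_automl_run(automl, X_test):
    """Hypothetical helper: summarise a fitted AutoML-like object."""
    # Human-readable summary (dataset name, metric, run counts).
    print(automl.sprint_statistics())
    # cv_results_ roughly follows the scikit-learn layout, so it converts
    # cleanly into a DataFrame for further analysis.
    results = pd.DataFrame(automl.cv_results_)
    best = results.sort_values('mean_test_score', ascending=False).head()
    print(best[['mean_test_score', 'mean_fit_time', 'status']])
    # Ensemble predictions on held-out data.
    return np.asarray(automl.predict(X_test))
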
Esempio n. 21
0
    def __init__(self,
                 backend,
                 time_left_for_this_task,
                 per_run_time_limit,
                 initial_configurations_via_metalearning=25,
                 ensemble_size=1,
                 ensemble_nbest=1,
                 seed=1,
                 ml_memory_limit=3072,
                 metadata_directory=None,
                 keep_models=True,
                 debug_mode=False,
                 include_estimators=None,
                 exclude_estimators=None,
                 include_preprocessors=None,
                 exclude_preprocessors=None,
                 resampling_strategy='holdout-iterative-fit',
                 resampling_strategy_arguments=None,
                 shared_mode=False,
                 precision=32,
                 disable_evaluator_output=False,
                 get_smac_object_callback=None,
                 smac_scenario_args=None,
                 ):
        super(AutoML, self).__init__()
        self._backend = backend
        #self._tmp_dir = tmp_dir
        #self._output_dir = output_dir
        self._time_for_task = time_left_for_this_task
        self._per_run_time_limit = per_run_time_limit
        self._initial_configurations_via_metalearning = \
            initial_configurations_via_metalearning
        self._ensemble_size = ensemble_size
        self._ensemble_nbest = ensemble_nbest
        self._seed = seed
        self._ml_memory_limit = ml_memory_limit
        self._data_memory_limit = None
        self._metadata_directory = metadata_directory
        self._keep_models = keep_models
        self._include_estimators = include_estimators
        self._exclude_estimators = exclude_estimators
        self._include_preprocessors = include_preprocessors
        self._exclude_preprocessors = exclude_preprocessors
        self._resampling_strategy = resampling_strategy
        self._resampling_strategy_arguments = resampling_strategy_arguments \
            if resampling_strategy_arguments is not None else {}
        self._shared_mode = shared_mode
        self.precision = precision
        self._disable_evaluator_output = disable_evaluator_output
        self._get_smac_object_callback = get_smac_object_callback
        self._smac_scenario_args = smac_scenario_args

        self._datamanager = None
        self._dataset_name = None
        self._stopwatch = StopWatch()
        self._logger = None
        self._task = None
        self._metric = None
        self._label_num = None
        self._parser = None
        self.models_ = None
        self.ensemble_ = None
        self._can_predict = False

        self._debug_mode = debug_mode

        if not isinstance(self._time_for_task, int):
            raise ValueError("time_left_for_this_task not of type integer, "
                             "but %s" % str(type(self._time_for_task)))
        if not isinstance(self._per_run_time_limit, int):
            raise ValueError("per_run_time_limit not of type integer, but %s" %
                             str(type(self._per_run_time_limit)))
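
The constructor above ends by checking that both time budgets are plain integers. A compact generalisation of that guard is sketched below; the check_integer_arguments helper is an illustrative assumption, not an auto-sklearn function.

def check_integer_arguments(**kwargs):
    """Raise a ValueError for every keyword argument that is not an int."""
    for name, value in kwargs.items():
        if not isinstance(value, int):
            raise ValueError('%s not of type integer, but %s'
                             % (name, str(type(value))))


# Example: both budgets given in seconds, mirroring the checks above.
check_integer_arguments(time_left_for_this_task=3600, per_run_time_limit=360)
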
Esempio n. 22
0
class AutoML(BaseEstimator, multiprocessing.Process):

    def __init__(self,
                 tmp_dir,
                 output_dir,
                 time_left_for_this_task,
                 per_run_time_limit,
                 log_dir=None,
                 initial_configurations_via_metalearning=25,
                 ensemble_size=1,
                 ensemble_nbest=1,
                 seed=1,
                 ml_memory_limit=3000,
                 metadata_directory=None,
                 queue=None,
                 keep_models=True,
                 debug_mode=False,
                 include_estimators=None,
                 include_preprocessors=None,
                 resampling_strategy='holdout-iterative-fit',
                 resampling_strategy_arguments=None,
                 delete_tmp_folder_after_terminate=False,
                 delete_output_folder_after_terminate=False,
                 shared_mode=False,
                 precision=32):
        super(AutoML, self).__init__()

        self._tmp_dir = tmp_dir
        self._output_dir = output_dir
        self._time_for_task = time_left_for_this_task
        self._per_run_time_limit = per_run_time_limit
        self._log_dir = log_dir if log_dir is not None else self._tmp_dir
        self._initial_configurations_via_metalearning = \
            initial_configurations_via_metalearning
        self._ensemble_size = ensemble_size
        self._ensemble_nbest = ensemble_nbest
        self._seed = seed
        self._ml_memory_limit = ml_memory_limit
        self._metadata_directory = metadata_directory
        self._queue = queue
        self._keep_models = keep_models
        self._include_estimators = include_estimators
        self._include_preprocessors = include_preprocessors
        self._resampling_strategy = resampling_strategy
        self._resampling_strategy_arguments = resampling_strategy_arguments
        self.delete_tmp_folder_after_terminate = \
            delete_tmp_folder_after_terminate
        self.delete_output_folder_after_terminate = \
            delete_output_folder_after_terminate
        self._shared_mode = shared_mode
        self.precision = precision

        self._datamanager = None
        self._dataset_name = None
        self._stopwatch = StopWatch()
        self._logger = None
        self._task = None
        self._metric = None
        self._label_num = None
        self.models_ = None
        self.ensemble_indices_ = None

        self._debug_mode = debug_mode
        self._backend = Backend(self._output_dir, self._tmp_dir)

    def start_automl(self, parser):
        self._backend.save_start_time(self._seed)
        self._stopwatch = StopWatch()
        datamanager = get_data_manager(namespace=parser)
        self._stopwatch.start_task(datamanager.name)

        logger_name = 'AutoML(%d):%s' % (self._seed, datamanager.name)
        setup_logger(os.path.join(self._tmp_dir, '%s.log' % str(logger_name)))
        self._logger = get_logger(logger_name)

        self._datamanager = datamanager
        self._dataset_name = datamanager.name
        self.start()

    def start(self):
        if self._datamanager is None:
            raise ValueError('You must invoke start() only via start_automl()')
        super(AutoML, self).start()

    def run(self):
        if self._datamanager is None:
            raise ValueError('You must invoke run() only via start_automl()')
        self._fit(self._datamanager)

    def fit(self, X, y,
            task=MULTICLASS_CLASSIFICATION,
            metric='acc_metric',
            feat_type=None,
            dataset_name=None):
        if dataset_name is None:
            m = hashlib.md5()
            m.update(X.data)
            dataset_name = m.hexdigest()

        self._backend.save_start_time(self._seed)
        self._stopwatch = StopWatch()
        self._dataset_name = dataset_name
        self._stopwatch.start_task(self._dataset_name)

        logger_name = 'AutoML(%d):%s' % (self._seed, dataset_name)
        setup_logger(os.path.join(self._tmp_dir, '%s.log' % str(logger_name)))
        self._logger = get_logger(logger_name)

        if isinstance(metric, str):
            metric = STRING_TO_METRIC[metric]

        if feat_type is not None and len(feat_type) != X.shape[1]:
            raise ValueError('Array feat_type does not have same number of '
                             'variables as X has features. %d vs %d.' %
                             (len(feat_type), X.shape[1]))
        if feat_type is not None and not all([isinstance(f, bool)
                                              for f in feat_type]):
            raise ValueError('Array feat_type must only contain bools.')

        loaded_data_manager = XYDataManager(X, y,
                                            task=task,
                                            metric=metric,
                                            feat_type=feat_type,
                                            dataset_name=dataset_name,
                                            encode_labels=False)

        return self._fit(loaded_data_manager)

    def fit_automl_dataset(self, dataset):
        self._stopwatch = StopWatch()
        self._backend.save_start_time(self._seed)

        name = os.path.basename(dataset)
        self._stopwatch.start_task(name)
        self._start_task(self._stopwatch, name)
        self._dataset_name = name

        logger_name = 'AutoML(%d):%s' % (self._seed, name)
        setup_logger(os.path.join(self._tmp_dir, '%s.log' % str(logger_name)))
        self._logger = get_logger(logger_name)

        self._logger.debug('======== Reading and converting data ==========')
        # Encoding the labels will be done after the metafeature calculation!
        loaded_data_manager = CompetitionDataManager(
            dataset, encode_labels=False,
            max_memory_in_mb=float(self._ml_memory_limit) / 3)
        loaded_data_manager_str = str(loaded_data_manager).split('\n')
        for part in loaded_data_manager_str:
            self._logger.debug(part)

        return self._fit(loaded_data_manager)

    @staticmethod
    def _start_task(watcher, task_name):
        watcher.start_task(task_name)

    @staticmethod
    def _stop_task(watcher, task_name):
        watcher.stop_task(task_name)

    @staticmethod
    def _print_load_time(basename, time_left_for_this_task,
                         time_for_load_data, logger):

        time_left_after_reading = max(
            0, time_left_for_this_task - time_for_load_data)
        logger.info('Remaining time after reading %s %5.2f sec' %
                    (basename, time_left_after_reading))
        return time_for_load_data

    def _do_dummy_prediction(self, datamanager):
        autosklearn.cli.base_interface.main(datamanager,
                                            self._resampling_strategy,
                                            None,
                                            None,
                                            mode_args=self._resampling_strategy_arguments)

    def _fit(self, datamanager):
        # Reset learnt stuff
        self.models_ = None
        self.ensemble_indices_ = None

        # Check arguments prior to doing anything!
        if self._resampling_strategy not in ['holdout', 'holdout-iterative-fit',
                                             'cv', 'nested-cv', 'partial-cv']:
            raise ValueError('Illegal resampling strategy: %s' %
                             self._resampling_strategy)
        if self._resampling_strategy == 'partial-cv' and \
                self._ensemble_size != 0:
            raise ValueError("Resampling strategy partial-cv cannot be used "
                             "together with ensembles.")

        self._backend._make_internals_directory()
        if self._keep_models:
            try:
                os.mkdir(self._backend.get_model_dir())
            except OSError:
                self._logger.warning("model directory already exists")
                if not self._shared_mode:
                    raise

        self._metric = datamanager.info['metric']
        self._task = datamanager.info['task']
        self._label_num = datamanager.info['label_num']

        set_auto_seed(self._seed)

        # == Pickle the data manager here, because there is no longer a
        # global OneHotEncoding step
        data_manager_path = self._backend.save_datamanager(datamanager)

        self._save_ensemble_data(
            datamanager.data['X_train'],
            datamanager.data['Y_train'])

        time_for_load_data = self._stopwatch.wall_elapsed(self._dataset_name)

        if self._debug_mode:
            self._print_load_time(
                self._dataset_name,
                self._time_for_task,
                time_for_load_data,
                self._logger)

        # == Perform dummy predictions
        self._do_dummy_prediction(datamanager)

        # = Create a searchspace
        # Do this before one-hot encoding to make sure that the search space
        # is created for a dense classifier even if one-hot encoding would
        # make the data sparse (tradeoff: if one-hot encoding makes the data
        # sparse, the densifier and truncatedSVD would probably lead to a
        # MemoryError; the downside is that some preprocessing methods cannot
        # be used if the data has become sparse).
        self.configuration_space, configspace_path = _create_search_space(
            self._tmp_dir,
            datamanager.info,
            self._backend,
            self._stopwatch,
            self._logger,
            self._include_estimators,
            self._include_preprocessors)
        self.configuration_space_created_hook(datamanager)

        # == Calculate metafeatures
        meta_features = _calculate_metafeatures(
            data_feat_type=datamanager.feat_type,
            data_info_task=datamanager.info['task'],
            x_train=datamanager.data['X_train'],
            y_train=datamanager.data['Y_train'],
            basename=self._dataset_name,
            watcher=self._stopwatch,
            metalearning_cnt=self._initial_configurations_via_metalearning,
            logger=self._logger)

        self._stopwatch.start_task('OneHot')
        datamanager.perform1HotEncoding()
        self._stopwatch.stop_task('OneHot')

        if meta_features is None:
            initial_configurations = []
        elif datamanager.info['task'] in [MULTICLASS_CLASSIFICATION,
                                          BINARY_CLASSIFICATION,
                                          MULTILABEL_CLASSIFICATION]:

            meta_features_encoded = _calculate_metafeatures_encoded(
                self._dataset_name,
                datamanager.data['X_train'],
                datamanager.data['Y_train'],
                self._stopwatch,
                self._logger)

            self._logger.debug(meta_features.__repr__(verbosity=2))
            self._logger.debug(meta_features_encoded.__repr__(verbosity=2))

            initial_configurations = _get_initial_configuration(
                meta_features,
                meta_features_encoded,
                self._dataset_name,
                self._metric,
                self.configuration_space,
                self._task,
                self._metadata_directory,
                self._initial_configurations_via_metalearning,
                datamanager.info['is_sparse'],
                self._stopwatch,
                self._logger)

            _print_debug_info_of_init_configuration(
                initial_configurations,
                self._dataset_name,
                self._time_for_task,
                self._logger,
                self._stopwatch)

        else:
            initial_configurations = []
            self._logger.warning('Metafeatures encoded not calculated')

        # == Construct a task-dependent default configuration
        if (datamanager.info["task"] == BINARY_CLASSIFICATION) or \
                (datamanager.info["task"] == MULTICLASS_CLASSIFICATION):
            config = {'balancing:strategy': 'weighting',
                      'classifier:__choice__': 'sgd',
                      'classifier:sgd:loss': 'hinge',
                      'classifier:sgd:penalty': 'l2',
                      'classifier:sgd:alpha': 0.0001,
                      'classifier:sgd:fit_intercept': 'True',
                      'classifier:sgd:n_iter': 5,
                      'classifier:sgd:learning_rate': 'optimal',
                      'classifier:sgd:eta0': 0.01,
                      'classifier:sgd:average': 'True',
                      'imputation:strategy': 'mean',
                      'one_hot_encoding:use_minimum_fraction': 'True',
                      'one_hot_encoding:minimum_fraction': 0.1,
                      'preprocessor:__choice__': 'no_preprocessing',
                      'rescaling:__choice__': 'min/max'}
        elif datamanager.info["task"] == MULTILABEL_CLASSIFICATION:
            config = {'classifier:__choice__': 'adaboost',
                      'classifier:adaboost:algorithm': 'SAMME.R',
                      'classifier:adaboost:learning_rate': 1.0,
                      'classifier:adaboost:max_depth': 1,
                      'classifier:adaboost:n_estimators': 50,
                      'balancing:strategy': 'weighting',
                      'imputation:strategy': 'mean',
                      'one_hot_encoding:use_minimum_fraction': 'True',
                      'one_hot_encoding:minimum_fraction': 0.1,
                      'preprocessor:__choice__': 'no_preprocessing',
                      'rescaling:__choice__': 'none'}
        else:
            config = None
            self._logger.info("Tasktype unknown: %s" %
                              TASK_TYPES_TO_STRING[datamanager.info["task"]])

        if config is not None:
            configuration = Configuration(self.configuration_space, config)
            config_string = convert_conf2smac_string(configuration)
            initial_configurations = [config_string] + initial_configurations

        # == RUN SMAC
        proc_smac = run_smac(tmp_dir=self._tmp_dir, basename=self._dataset_name,
                             time_for_task=self._time_for_task,
                             ml_memory_limit=self._ml_memory_limit,
                             data_manager_path=data_manager_path,
                             configspace_path=configspace_path,
                             initial_configurations=initial_configurations,
                             per_run_time_limit=self._per_run_time_limit,
                             watcher=self._stopwatch, backend=self._backend,
                             seed=self._seed,
                             resampling_strategy=self._resampling_strategy,
                             resampling_strategy_arguments=self._resampling_strategy_arguments,
                             shared_mode=self._shared_mode)

        # == RUN ensemble builder
        proc_ensembles = self.run_ensemble_builder()

        procs = []

        if proc_smac is not None:
            procs.append(proc_smac)
        if proc_ensembles is not None:
            procs.append(proc_ensembles)

        if self._queue is not None:
            self._queue.put([time_for_load_data, data_manager_path, procs])
        else:
            for proc in procs:
                proc.wait()

        # Delete AutoSklearn environment variable
        del_auto_seed()

        # In case the data manager is still attached, release it.
        try:
            del self._datamanager
        except Exception:
            pass

        if self._queue is None:
            self._load_models()

        return self

    def run_ensemble_builder(self,
                             time_left_for_ensembles=None,
                             max_iterations=None,
                             ensemble_size=None):
        if self._ensemble_size > 0 or ensemble_size is not None:
            task_name = 'runEnsemble'
            self._stopwatch.start_task(task_name)

            if time_left_for_ensembles is None:
                time_left_for_ensembles = max(0,
                    self._time_for_task - self._stopwatch.wall_elapsed(
                        self._dataset_name))
            if max_iterations is None:
                max_iterations = -1
            if ensemble_size is None:
                ensemble_size = self._ensemble_size

            # It can happen that run_ensemble_builder is called without
            # calling fit.
            if self._logger:
                self._logger.info(
                    'Start Ensemble with %5.2fsec time left' % time_left_for_ensembles)
            proc_ensembles = submit_process.run_ensemble_builder(
                tmp_dir=self._tmp_dir,
                dataset_name=self._dataset_name,
                task_type=self._task,
                metric=self._metric,
                limit=time_left_for_ensembles,
                output_dir=self._output_dir,
                ensemble_size=ensemble_size,
                ensemble_nbest=self._ensemble_nbest,
                seed=self._seed,
                shared_mode=self._shared_mode,
                max_iterations=max_iterations,
                precision=self.precision
            )
            self._stopwatch.stop_task(task_name)
            return proc_ensembles
        else:
            self._logger.info('Not starting ensemble script due to ensemble '
                             'size 0.')
            return None

    def predict(self, X):
        if self._keep_models is not True:
            raise ValueError(
                "Predict can only be called if 'keep_models==True'")
        if self._resampling_strategy not in ['holdout',
                                             'holdout-iterative-fit']:
            raise NotImplementedError(
                'Predict is currently only implemented for resampling '
                'strategy holdout.')

        if self.models_ is None or len(self.models_) == 0 or len(
                self.ensemble_indices_) == 0:
            self._load_models()

        predictions = []
        for identifier in self.models_:
            if identifier not in self.ensemble_indices_:
                continue

            weight = self.ensemble_indices_[identifier]
            model = self.models_[identifier]

            X_ = X.copy()
            if self._task in REGRESSION_TASKS:
                prediction = model.predict(X_)
            else:
                prediction = model.predict_proba(X_)

            if len(prediction.shape) < 1 or len(X_.shape) < 1 or \
                    X_.shape[0] < 1 or prediction.shape[0] != X_.shape[0]:
                self._logger.warning("Prediction shape for model %s is %s "
                                     "while X_.shape is %s" %
                                     (model, str(prediction.shape),
                                      str(X_.shape)))
            predictions.append(prediction * weight)

        if len(predictions) == 0:
            raise ValueError('Something went wrong generating the predictions. '
                             'The ensemble should consist of the following '
                             'models: %s, the following models were loaded: '
                             '%s' % (str(list(self.ensemble_indices_.keys())),
                                     str(list(self.models_.keys()))))

        predictions = np.sum(np.array(predictions), axis=0)
        return predictions

    def _load_models(self):
        if self._shared_mode:
            seed = -1
        else:
            seed = self._seed

        self.models_ = self._backend.load_all_models(seed)
        if len(self.models_) == 0:
            raise ValueError('No models fitted!')

        self.ensemble_indices_ = self._backend.load_ensemble_indices_weights(
            seed)

    def score(self, X, y):
        # fix: Consider only index 1 of second dimension
        # Don't know if the reshaping should be done there or in calculate_score
        prediction = self.predict(X)
        if self._task == BINARY_CLASSIFICATION:
            prediction = prediction[:, 1].reshape((-1, 1))
        return calculate_score(y, prediction, self._task,
                               self._metric, self._label_num,
                               logger=self._logger)

    def show_models(self):
        if self.models_ is None or len(self.models_) == 0 or len(
                self.ensemble_indices_) == 0:
            self._load_models()

        output = []
        sio = six.StringIO()
        for identifier in self.models_:
            if identifier not in self.ensemble_indices_:
                continue

            weight = self.ensemble_indices_[identifier]
            model = self.models_[identifier]
            output.append((weight, model))

        output.sort(reverse=True)

        sio.write("[")
        for weight, model in output:
            sio.write("(%f, %s),\n" % (weight, model))
        sio.write("]")

        return sio.getvalue()

    def _save_ensemble_data(self, X, y):
        """Split dataset and store Data for the ensemble script.

        :param X:
        :param y:
        :return:

        """
        task_name = 'LoadData'
        self._start_task(self._stopwatch, task_name)
        _, _, _, y_ensemble = resampling.split_data(X, y)
        self._backend.save_targets_ensemble(y_ensemble)
        self._stop_task(self._stopwatch, task_name)

    def configuration_space_created_hook(self, datamanager):
        pass

    def get_params(self, deep=True):
        raise NotImplementedError('auto-sklearn does not implement '
                                  'get_params() because it is not intended to '
                                  'be optimized.')

    def set_params(self, deep=True):
        raise NotImplementedError('auto-sklearn does not implement '
                                  'set_params() because it is not intended to '
                                  'be optimized.')

    def __del__(self):
        self._delete_output_directories()

    def _delete_output_directories(self):
        if self.delete_output_folder_after_terminate:
            try:
                shutil.rmtree(self._output_dir)
            except Exception:
                if self._logger is not None:
                    self._logger.warning("Could not delete output dir: %s" %
                                         self._output_dir)
                else:
                    print("Could not delete output dir: %s" %
                          self._output_dir)

        if self.delete_tmp_folder_after_terminate:
            try:
                shutil.rmtree(self._tmp_dir)
            except Exception:
                if self._logger is not None:
                    self._logger.warning("Could not delete tmp dir: %s" %
                                         self._tmp_dir)
                else:
                    print("Could not delete tmp dir: %s" %
                          self._tmp_dir)
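
In the predict method above, each ensemble member's predictions are scaled by the member's weight and the weighted arrays are summed, which amounts to a weighted average when the weights sum to one. The standalone NumPy sketch below reproduces that combination; the model names, weights and array shapes are made up for illustration.

import numpy as np

# Hypothetical per-model class probabilities, shape (n_samples, n_classes).
member_predictions = {
    'model_a': np.array([[0.9, 0.1], [0.2, 0.8]]),
    'model_b': np.array([[0.6, 0.4], [0.3, 0.7]]),
}
ensemble_weights = {'model_a': 0.75, 'model_b': 0.25}

weighted = [prediction * ensemble_weights[name]
            for name, prediction in member_predictions.items()
            if name in ensemble_weights]
ensemble_prediction = np.sum(np.array(weighted), axis=0)
print(ensemble_prediction)  # weighted average of the member predictions
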
Esempio n. 23
0
class AutoML(multiprocessing.Process, BaseEstimator):

    def __init__(self,
                 tmp_dir,
                 output_dir,
                 time_left_for_this_task,
                 per_run_time_limit,
                 log_dir=None,
                 initial_configurations_via_metalearning=25,
                 ensemble_size=1,
                 ensemble_nbest=1,
                 seed=1,
                 ml_memory_limit=3000,
                 metadata_directory=None,
                 queue=None,
                 keep_models=True,
                 debug_mode=False,
                 logger=None):
        super(AutoML, self).__init__()
        self._seed = seed
        self._tmp_dir = tmp_dir
        self._output_dir = output_dir
        self._model_dir = join(self._tmp_dir, 'models_%d' % self._seed)
        self._ensemble_indices_dir = join(self._tmp_dir,
                                          'ensemble_indices_%d' % self._seed)
        self._create_folder(self._tmp_dir, to_log=False)

        self._time_for_task = time_left_for_this_task
        self._per_run_time_limit = per_run_time_limit
        self._log_dir = log_dir if log_dir is not None else self._tmp_dir
        self._initial_configurations_via_metalearning = initial_configurations_via_metalearning
        self._ensemble_size = ensemble_size
        self._ensemble_nbest = ensemble_nbest

        self._ml_memory_limit = ml_memory_limit
        self._metadata_directory = metadata_directory
        self._queue = queue
        self._keep_models = keep_models

        self._basename = None
        self._stopwatch = None
        self._logger = logger if logger is not None else get_automl_logger(
            self._log_dir, self._basename, self._seed)
        self._ohe = None
        self._task = None
        self._metric = None
        self._target_num = None

        self._debug_mode = debug_mode
        self._create_folders()

    def _create_folder(self, folder, to_log=True):
        if to_log:
            self._debug("CREATE folder: %s" % folder)
        else:
            print("CREATE folder: %s" % folder)
        if os.path.isdir(folder):
            if not self._debug_mode:
                raise OSError("Folder '%s' exists" % folder)
        else:
            os.mkdir(folder)

    def _create_folders(self):
        # == Set up a directory where all the trained models will be pickled to

        self._create_folder(self._output_dir)
        if self._log_dir != self._tmp_dir:
            self._create_folder(self._log_dir)
        self._create_folder(self._model_dir)
        self._create_folder(self._ensemble_indices_dir)

    def run(self):
        raise NotImplementedError()

    def fit(self, data_x, y,
            task=MULTICLASS_CLASSIFICATION,
            metric='acc_metric',
            feat_type=None,
            dataset_name=None):
        if dataset_name is None:
            m = hashlib.md5()
            m.update(data_x.data)
            dataset_name = m.hexdigest()

        self._basename = dataset_name

        self._stopwatch = StopWatch()
        self._stopwatch.start_task(self._basename)

        loaded_data_manager = XYDataManager(data_x, y,
                                            task=task,
                                            metric=metric,
                                            feat_type=feat_type,
                                            dataset_name=dataset_name,
                                            encode_labels=False)

        return self._fit(loaded_data_manager)

    def _debug(self, text):
        self._logger.debug(text)

    def _critical(self, text):
        self._logger.critical(text)

    def _error(self, text):
        self._logger.error(text)

    def _info(self, text):
        self._logger.info(text)

    def fit_automl_dataset(self, basename, input_dir):
        # == Creating a data object with data and information about it
        self._basename = basename

        self._stopwatch = StopWatch()
        self._stopwatch.start_task(self._basename)

        self._logger = get_automl_logger(self._log_dir, self._basename,
                                         self._seed)

        self._debug('======== Reading and converting data ==========')
        # Encoding the labels will be done after the metafeature calculation!
        loaded_data_manager = CompetitionDataManager(self._basename, input_dir,
                                                     verbose=True,
                                                     encode_labels=False)
        loaded_data_manager_str = str(loaded_data_manager).split('\n')
        for part in loaded_data_manager_str:
            self._debug(part)

        return self._fit(loaded_data_manager)

    def _save_data_manager(self, data_d, tmp_dir, basename, watcher):
        task_name = 'StoreDatamanager'

        watcher.start_task(task_name)

        filepath = os.path.join(tmp_dir, basename + '_Manager.pkl')

        if _check_path_for_save(filepath, 'Data manager ', self._debug):
            with open(filepath, 'wb') as fh:
                pickle.dump(data_d, fh, protocol=-1)

        watcher.stop_task(task_name)
        return filepath

    def _fit(self, manager):
        # TODO: check that data and task definition fit together!

        self._metric = manager.info['metric']
        self._task = manager.info['task']
        self._target_num = manager.info['target_num']

        set_auto_seed(self._seed)

        # load data
        _save_ensemble_data(
            manager.data['X_train'],
            manager.data['Y_train'],
            self._tmp_dir,
            self._stopwatch)

        time_for_load_data = self._stopwatch.wall_elapsed(self._basename)

        if self._debug_mode:
            _print_load_time(
                self._basename,
                self._time_for_task,
                time_for_load_data,
                self._info)

        # == Calculate metafeatures
        meta_features = _calculate_meta_features(
            data_feat_type=manager.feat_type,
            data_info_task=manager.info['task'], basename=self._basename,
            metalearning_cnt=self._initial_configurations_via_metalearning,
            x_train=manager.data['X_train'], y_train=manager.data['Y_train'],
            watcher=self._stopwatch, log_function=self._debug)

        self._stopwatch.start_task('OneHot')
        manager.perform_hot_encoding()
        self._ohe = manager.encoder
        self._stopwatch.stop_task('OneHot')

        # == Pickle the data manager
        data_manager_path = self._save_data_manager(
            manager,
            self._tmp_dir,
            self._basename,
            watcher=self._stopwatch)

        # = Create a searchspace
        self._configuration_space, configspace_path = _create_search_space(
            self._tmp_dir,
            manager.info,
            self._stopwatch,
            self._debug)

        if meta_features is not None \
                and manager.info['task'] in [MULTICLASS_CLASSIFICATION,
                                             BINARY_CLASSIFICATION]:

            meta_features_encoded = _calculate_meta_features_encoded(
                self._basename, manager.data['X_train'],
                manager.data['Y_train'], self._stopwatch, self._debug)

            self._debug(meta_features)
            self._debug(meta_features_encoded)
            initial_configurations = _get_initial_configuration(
                meta_features,
                meta_features_encoded,
                self._basename,
                self._metric,
                self._configuration_space,
                self._task,
                self._metadata_directory,
                self._initial_configurations_via_metalearning,
                manager.info['is_sparse'],
                self._stopwatch,
                self._error)

            _print_debug_info_of_init_configuration(
                initial_configurations,
                self._basename,
                self._time_for_task,
                self._debug, self._info,
                self._stopwatch)

        else:
            initial_configurations = []
            self._critical('Metafeatures encoded not calculated')

        # == RUN SMAC
        proc_smac = _run_smac(self._tmp_dir, self._basename,
                              self._time_for_task, self._ml_memory_limit,
                              data_manager_path, configspace_path,
                              initial_configurations, self._per_run_time_limit,
                              self._stopwatch, self._debug)

        # == RUN ensemble builder
        proc_ensembles = _run_ensemble_builder(
            self._tmp_dir,
            self._output_dir,
            self._basename,
            self._time_for_task,
            self._task,
            self._metric,
            self._ensemble_size,
            self._ensemble_nbest,
            self._ensemble_indices_dir,
            self._stopwatch,
            self._debug
        )

        if self._queue is not None:
            self._queue.put([time_for_load_data, data_manager_path, proc_smac,
                             proc_ensembles])
        else:

            self._debug("Real run SMAC")
            proc_smac.wait()
            self._debug("Real run RUNSOLVER")
            proc_ensembles.wait()

        # Delete AutoSklearn environment variable
        del_auto_seed()
        return self

    def predict(self, data_x):
        if self._keep_models is not True:
            raise ValueError(
                "Predict can only be called if 'keep_models==True'")

        assert os.path.isdir(
            self._model_dir), "Not found model directory: %s" % self._model_dir

        model_files = os.listdir(self._model_dir)
        models = []
        for model_file in model_files:
            model_file = os.path.join(self._model_dir, model_file)
            with open(model_file, 'rb') as fh:
                models.append(pickle.load(fh))

        if len(models) == 0:
            raise ValueError('No models fitted!')

        if self._ohe is not None:
            data_x = self._ohe._transform(data_x)

        self._debug(self._ensemble_indices_dir)
        indices_files = os.listdir(self._ensemble_indices_dir)
        assert len(indices_files), "No ensemble indices found"
        indices_files = sorted(indices_files)
        indices_file = os.path.join(self._ensemble_indices_dir,
                                    indices_files[-1])
        with open(indices_file, 'rb') as fh:
            ensemble_members_run_numbers = pickle.load(fh)

        predictions = []
        for model, model_file in zip(models, model_files):
            num_run = int(model_file.split('.')[0])

            if num_run not in ensemble_members_run_numbers:
                continue

            weight = ensemble_members_run_numbers[num_run]

            X_ = data_x.copy()
            if self._task in REGRESSION_TASKS:
                prediction = model.predict(X_)
            else:
                prediction = model.predict_proba(X_)
            predictions.append(prediction * weight)

        predictions = np.sum(np.array(predictions), axis=0)
        return predictions

    def score(self, data_x, y):
        prediction = self.predict(data_x)
        return calculate_score(y, prediction, self._task,
                               self._metric, self._target_num)

    def configuration_space_created_hook(self):
        pass

    def get_params(self, deep=True):
        raise NotImplementedError('auto-sklearn does not implement '
                                  'get_params() because it is not intended to '
                                  'be optimized.')

    def set_params(self, deep=True):
        raise NotImplementedError('auto-sklearn does not implement '
                                  'set_params() because it is not intended to '
                                  'be optimized.')
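
The predict method above reads pickled models and ensemble indices back from disk. Under Python 3 such files must be opened in binary mode; the generic sketch below shows the pattern with a hypothetical load_pickled_models helper and directory layout, which are not part of auto-sklearn.

import os
import pickle


def load_pickled_models(model_dir):
    """Load every pickled model found in model_dir (binary mode)."""
    models = {}
    for filename in sorted(os.listdir(model_dir)):
        path = os.path.join(model_dir, filename)
        with open(path, 'rb') as fh:  # binary mode is required for pickle
            models[filename] = pickle.load(fh)
    return models
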
Esempio n. 24
0
class AutoML(BaseEstimator):
    def __init__(self,
                 backend,
                 time_left_for_this_task,
                 per_run_time_limit,
                 log_dir=None,
                 initial_configurations_via_metalearning=25,
                 ensemble_size=1,
                 ensemble_nbest=1,
                 seed=1,
                 ml_memory_limit=3072,
                 metadata_directory=None,
                 keep_models=True,
                 debug_mode=False,
                 include_estimators=None,
                 exclude_estimators=None,
                 include_preprocessors=None,
                 exclude_preprocessors=None,
                 resampling_strategy='holdout-iterative-fit',
                 resampling_strategy_arguments=None,
                 delete_tmp_folder_after_terminate=False,
                 delete_output_folder_after_terminate=False,
                 shared_mode=False,
                 precision=32,
                 max_iter_smac=None,
                 acquisition_function='EI',
                 disable_evaluator_output=False,
                 configuration_mode='SMAC'):
        super(AutoML, self).__init__()
        self._backend = backend
        #self._tmp_dir = tmp_dir
        #self._output_dir = output_dir
        self._time_for_task = time_left_for_this_task
        self._per_run_time_limit = per_run_time_limit
        #self._log_dir = log_dir if log_dir is not None else self._tmp_dir
        self._initial_configurations_via_metalearning = \
            initial_configurations_via_metalearning
        self._ensemble_size = ensemble_size
        self._ensemble_nbest = ensemble_nbest
        self._seed = seed
        self._ml_memory_limit = ml_memory_limit
        self._data_memory_limit = None
        self._metadata_directory = metadata_directory
        self._keep_models = keep_models
        self._include_estimators = include_estimators
        self._exclude_estimators = exclude_estimators
        self._include_preprocessors = include_preprocessors
        self._exclude_preprocessors = exclude_preprocessors
        self._resampling_strategy = resampling_strategy
        self._resampling_strategy_arguments = resampling_strategy_arguments \
            if resampling_strategy_arguments is not None else {}
        self._max_iter_smac = max_iter_smac
        #self.delete_tmp_folder_after_terminate = \
        #    delete_tmp_folder_after_terminate
        #self.delete_output_folder_after_terminate = \
        #    delete_output_folder_after_terminate
        self._shared_mode = shared_mode
        self.precision = precision
        self.acquisition_function = acquisition_function
        self._disable_evaluator_output = disable_evaluator_output
        self._configuration_mode = configuration_mode

        self._datamanager = None
        self._dataset_name = None
        self._stopwatch = StopWatch()
        self._logger = None
        self._task = None
        self._metric = None
        self._label_num = None
        self._parser = None
        self.models_ = None
        self.ensemble_ = None
        self._can_predict = False

        self._debug_mode = debug_mode

        if not isinstance(self._time_for_task, int):
            raise ValueError("time_left_for_this_task not of type integer, "
                             "but %s" % str(type(self._time_for_task)))
        if not isinstance(self._per_run_time_limit, int):
            raise ValueError("per_run_time_limit not of type integer, but %s" %
                             str(type(self._per_run_time_limit)))

        # After assigning and checking variables...
        #self._backend = Backend(self._output_dir, self._tmp_dir)

    def fit(self,
            X,
            y,
            task=MULTICLASS_CLASSIFICATION,
            metric='acc_metric',
            feat_type=None,
            dataset_name=None):
        if not self._shared_mode:
            self._backend.context.delete_directories()
        else:
            # If this fails, it's likely that this is the first call to get
            # the data manager
            try:
                D = self._backend.load_datamanager()
                dataset_name = D.name
            except IOError:
                pass

        self._backend.context.create_directories()

        if dataset_name is None:
            dataset_name = hash_numpy_array(X)

        self._backend.save_start_time(self._seed)
        self._stopwatch = StopWatch()
        self._dataset_name = dataset_name
        self._stopwatch.start_task(self._dataset_name)

        self._logger = self._get_logger(dataset_name)

        if isinstance(metric, str):
            metric = STRING_TO_METRIC[metric]

        if feat_type is not None and len(feat_type) != X.shape[1]:
            raise ValueError('Array feat_type does not have same number of '
                             'variables as X has features. %d vs %d.' %
                             (len(feat_type), X.shape[1]))
        if feat_type is not None and not all(
            [isinstance(f, str) for f in feat_type]):
            raise ValueError('Array feat_type must only contain strings.')
        if feat_type is not None:
            for ft in feat_type:
                if ft.lower() not in ['categorical', 'numerical']:
                    raise ValueError('Only `Categorical` and `Numerical` are '
                                     'valid feature types, you passed `%s`' %
                                     ft)

        self._data_memory_limit = None
        loaded_data_manager = XYDataManager(X,
                                            y,
                                            task=task,
                                            metric=metric,
                                            feat_type=feat_type,
                                            dataset_name=dataset_name,
                                            encode_labels=False)

        return self._fit(loaded_data_manager)

    def fit_automl_dataset(self, dataset):
        self._stopwatch = StopWatch()
        self._backend.save_start_time(self._seed)

        name = os.path.basename(dataset)
        self._stopwatch.start_task(name)
        self._start_task(self._stopwatch, name)
        self._dataset_name = name

        self._logger = self._get_logger(name)
        self._logger.debug('======== Reading and converting data ==========')
        # Encoding the labels will be done after the metafeature calculation!
        self._data_memory_limit = float(self._ml_memory_limit) / 3
        loaded_data_manager = CompetitionDataManager(
            dataset,
            encode_labels=False,
            max_memory_in_mb=self._data_memory_limit)
        loaded_data_manager_str = str(loaded_data_manager).split('\n')
        for part in loaded_data_manager_str:
            self._logger.debug(part)

        return self._fit(loaded_data_manager)

    def fit_on_datamanager(self, datamanager):
        self._stopwatch = StopWatch()
        self._backend.save_start_time(self._seed)

        name = os.path.basename(datamanager.name)
        self._stopwatch.start_task(name)
        self._start_task(self._stopwatch, name)
        self._dataset_name = name

        self._logger = self._get_logger(name)
        self._fit(datamanager)

    def _get_logger(self, name):
        logger_name = 'AutoML(%d):%s' % (self._seed, name)
        setup_logger(
            os.path.join(self._backend.temporary_directory,
                         '%s.log' % str(logger_name)))
        return get_logger(logger_name)

    @staticmethod
    def _start_task(watcher, task_name):
        watcher.start_task(task_name)

    @staticmethod
    def _stop_task(watcher, task_name):
        watcher.stop_task(task_name)

    @staticmethod
    def _print_load_time(basename, time_left_for_this_task, time_for_load_data,
                         logger):

        time_left_after_reading = max(
            0, time_left_for_this_task - time_for_load_data)
        logger.info('Remaining time after reading %s %5.2f sec' %
                    (basename, time_left_after_reading))
        return time_for_load_data

    def _do_dummy_prediction(self, datamanager, num_run):

        # When using partial-cv it makes no sense to do dummy predictions
        if self._resampling_strategy in [
                'partial-cv', 'partial-cv-iterative-fit'
        ]:
            return num_run

        self._logger.info("Starting to create dummy predictions.")
        memory_limit = int(self._ml_memory_limit)
        scenario_mock = unittest.mock.Mock()
        scenario_mock.wallclock_limit = self._time_for_task
        # This stats object is a hack - maybe the SMAC stats object should
        # already be generated here!
        stats = Stats(scenario_mock)
        stats.start_timing()
        ta = ExecuteTaFuncWithQueue(
            backend=self._backend,
            autosklearn_seed=self._seed,
            resampling_strategy=self._resampling_strategy,
            initial_num_run=num_run,
            logger=self._logger,
            stats=stats,
            memory_limit=memory_limit,
            disable_file_output=self._disable_evaluator_output,
            **self._resampling_strategy_arguments)

        status, cost, runtime, additional_info = \
            ta.run(1, cutoff=self._time_for_task)
        if status == StatusType.SUCCESS:
            self._logger.info("Finished creating dummy predictions.")
        else:
            self._logger.error('Error creating dummy predictions:%s ',
                               additional_info)

        return ta.num_run

    def _fit(self, datamanager):
        # Reset learnt stuff
        self.models_ = None
        self.ensemble_ = None

        # Check arguments prior to doing anything!
        if self._resampling_strategy not in [
                'holdout', 'holdout-iterative-fit', 'cv', 'partial-cv',
                'partial-cv-iterative-fit'
        ]:
            raise ValueError('Illegal resampling strategy: %s' %
                             self._resampling_strategy)
        if self._resampling_strategy in ['partial-cv', 'partial-cv-iterative-fit'] \
                and self._ensemble_size != 0:
            raise ValueError("Resampling strategy %s cannot be used "
                             "together with ensembles." %
                             self._resampling_strategy)
        if self._resampling_strategy in ['partial-cv', 'cv',
                                         'partial-cv-iterative-fit'] and \
                'folds' not in self._resampling_strategy_arguments:
            self._resampling_strategy_arguments['folds'] = 5

        acquisition_functions = ['EI', 'EIPS']
        if self.acquisition_function not in acquisition_functions:
            raise ValueError(
                'Illegal acquisition %s: Must be one of %s.' %
                (self.acquisition_function, acquisition_functions))

        self._backend._make_internals_directory()
        if self._keep_models:
            try:
                os.makedirs(self._backend.get_model_dir())
            except (OSError, FileExistsError):
                if not self._shared_mode:
                    raise

        self._metric = datamanager.info['metric']
        self._task = datamanager.info['task']
        self._label_num = datamanager.info['label_num']

        # == Pickle the data manager to speed up loading
        data_manager_path = self._backend.save_datamanager(datamanager)

        time_for_load_data = self._stopwatch.wall_elapsed(self._dataset_name)

        if self._debug_mode:
            self._print_load_time(self._dataset_name, self._time_for_task,
                                  time_for_load_data, self._logger)

        # == Perform dummy predictions
        num_run = 1
        #if self._resampling_strategy in ['holdout', 'holdout-iterative-fit']:
        num_run = self._do_dummy_prediction(datamanager, num_run)

        # == Create a search space
        # Do this before one-hot encoding so that the search space is built
        # for a dense classifier even if one-hot encoding would make the data
        # sparse. This is a trade-off: if the encoded data became sparse,
        # densifier and TruncatedSVD would probably lead to a MemoryError
        # anyway, so we accept not being able to use some of the
        # preprocessing methods that are only relevant for sparse data.
        self.configuration_space, configspace_path = self._create_search_space(
            self._backend.temporary_directory,
            self._backend,
            datamanager,
            include_estimators=self._include_estimators,
            exclude_estimators=self._exclude_estimators,
            include_preprocessors=self._include_preprocessors,
            exclude_preprocessors=self._exclude_preprocessors)

        # == RUN ensemble builder
        # Do this before calculating the meta-features to make sure that the
        # dummy predictions are actually included in the ensemble even if
        # calculating the meta-features takes very long
        ensemble_task_name = 'runEnsemble'
        self._stopwatch.start_task(ensemble_task_name)
        time_left_for_ensembles = max(
            0, self._time_for_task -
            self._stopwatch.wall_elapsed(self._dataset_name))
        if self._logger:
            self._logger.info('Start Ensemble with %5.2fsec time left' %
                              time_left_for_ensembles)
        if time_left_for_ensembles <= 0:
            self._logger.warning("Not starting ensemble builder because there "
                                 "is no time left!")
            self._proc_ensemble = None
        else:
            self._proc_ensemble = self._get_ensemble_process(
                time_left_for_ensembles)
            if self._ensemble_size > 0:
                self._proc_ensemble.start()
            else:
                self._logger.info('Not starting ensemble builder because '
                                  'ensemble size is <= 0.')
        self._stopwatch.stop_task(ensemble_task_name)

        # kill the datamanager as it will be re-loaded anyways from sub processes
        try:
            del self._datamanager
        except Exception:
            pass

        # => RUN SMAC
        smac_task_name = 'runSMAC'
        self._stopwatch.start_task(smac_task_name)
        time_left_for_smac = max(
            0, self._time_for_task -
            self._stopwatch.wall_elapsed(self._dataset_name))

        if self._logger:
            self._logger.info('Start SMAC with %5.2fsec time left' %
                              time_left_for_smac)
        if time_left_for_smac <= 0:
            self._logger.warning("Not starting SMAC because there is no time "
                                 "left.")
            _proc_smac = None
        else:
            if self._per_run_time_limit is None or \
                    self._per_run_time_limit > time_left_for_smac:
                self._logger.warning(
                    'Time limit for a single run is higher than the total '
                    'time limit. Capping the limit for a single run to the '
                    'total time given to SMAC (%f).', time_left_for_smac)
                per_run_time_limit = time_left_for_smac
            else:
                per_run_time_limit = self._per_run_time_limit

            _proc_smac = AutoMLSMBO(
                config_space=self.configuration_space,
                dataset_name=self._dataset_name,
                backend=self._backend,
                total_walltime_limit=time_left_for_smac,
                func_eval_time_limit=per_run_time_limit,
                memory_limit=self._ml_memory_limit,
                data_memory_limit=self._data_memory_limit,
                watcher=self._stopwatch,
                start_num_run=num_run,
                num_metalearning_cfgs=(
                    self._initial_configurations_via_metalearning),
                config_file=configspace_path,
                smac_iters=self._max_iter_smac,
                seed=self._seed,
                metadata_directory=self._metadata_directory,
                resampling_strategy=self._resampling_strategy,
                resampling_strategy_args=self._resampling_strategy_arguments,
                acquisition_function=self.acquisition_function,
                shared_mode=self._shared_mode,
                include_estimators=self._include_estimators,
                exclude_estimators=self._exclude_estimators,
                include_preprocessors=self._include_preprocessors,
                exclude_preprocessors=self._exclude_preprocessors,
                disable_file_output=self._disable_evaluator_output,
                configuration_mode=self._configuration_mode)
            self.runhistory_, self.trajectory_ = _proc_smac.run_smbo()
            runhistory_filename = os.path.join(
                self._backend.temporary_directory,
                'runhistory.json',
            )
            self.runhistory_.save_json(runhistory_filename)
            trajectory_filename = os.path.join(
                self._backend.temporary_directory, 'trajectory.json')
            saveable_trajectory = [
                entry[:2] + [entry[2].get_dictionary()] + entry[3:]
                for entry in self.trajectory_
            ]
            with open(trajectory_filename, 'w') as fh:
                json.dump(saveable_trajectory, fh)

        # Wait until the ensemble process is finished to avoid shutting down
        # while the ensemble builder tries to access the data
        if self._proc_ensemble is not None and self._ensemble_size > 0:
            self._proc_ensemble.join()

        self._proc_ensemble = None
        self._load_models()

        return self

    def refit(self, X, y):
        def send_warnings_to_log(message,
                                 category,
                                 filename,
                                 lineno,
                                 file=None):
            self._logger.debug('%s:%s: %s:%s' %
                               (filename, lineno, category.__name__, message))
            return

        if self._keep_models is not True:
            raise ValueError(
                "Predict can only be called if 'keep_models==True'")
        if self.models_ is None or len(self.models_) == 0 or \
                self.ensemble_ is None:
            self._load_models()

        random_state = np.random.RandomState(self._seed)
        for identifier in self.models_:
            if identifier in self.ensemble_.get_model_identifiers():
                model = self.models_[identifier]
                # this updates the model inplace, it can then later be used in
                # predict method

                # try to fit the model. If it fails, shuffle the data. This
                # could alleviate the problem in algorithms that depend on
                # the ordering of the data.
                for i in range(10):
                    try:
                        with warnings.catch_warnings():
                            warnings.showwarning = send_warnings_to_log
                            model.fit(X.copy(), y.copy())
                        break
                    except ValueError:
                        indices = list(range(X.shape[0]))
                        random_state.shuffle(indices)
                        X = X[indices]
                        y = y[indices]

        self._can_predict = True
        return self

    def predict(self, X):
        if self._keep_models is not True:
            raise ValueError(
                "Predict can only be called if 'keep_models==True'")
        if not self._can_predict and \
                self._resampling_strategy not in \
                ['holdout', 'holdout-iterative-fit']:
            raise NotImplementedError(
                'Predict is currently only implemented for resampling '
                'strategy %s.' % self._resampling_strategy)

        if self.models_ is None or len(self.models_) == 0 or \
                self.ensemble_ is None:
            self._load_models()

        def send_warnings_to_log(message,
                                 category,
                                 filename,
                                 lineno,
                                 file=None):
            self._logger.debug('%s:%s: %s:%s' %
                               (filename, lineno, category.__name__, message))
            return

        all_predictions = []
        for identifier in self.ensemble_.get_model_identifiers():
            model = self.models_[identifier]

            X_ = X.copy()
            with warnings.catch_warnings():
                warnings.showwarning = send_warnings_to_log
                if self._task in REGRESSION_TASKS:
                    prediction = model.predict(X_)
                else:
                    prediction = model.predict_proba(X_)

            if len(prediction.shape) < 1 or len(X_.shape) < 1 or \
                    X_.shape[0] < 1 or prediction.shape[0] != X_.shape[0]:
                self._logger.warning(
                    "Prediction shape for model %s is %s "
                    "while X_.shape is %s" %
                    (model, str(prediction.shape), str(X_.shape)))
            all_predictions.append(prediction)

        if len(all_predictions) == 0:
            raise ValueError(
                'Something went wrong generating the predictions. '
                'The ensemble should consist of the following '
                'models: %s, the following models were loaded: '
                '%s' % (str(list(self.ensemble_.get_model_identifiers())),
                        str(list(self.models_.keys()))))

        predictions = self.ensemble_.predict(all_predictions)
        return predictions
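        # Note (illustrative, not from the original source): for
        # classification tasks the array returned above holds the
        # ensemble-combined class probabilities; a caller that needs hard
        # labels would typically take np.argmax(predictions, axis=1)
        # afterwards.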

    def fit_ensemble(self,
                     y,
                     task=None,
                     metric=None,
                     precision='32',
                     dataset_name=None,
                     ensemble_nbest=None,
                     ensemble_size=None):
        if self._resampling_strategy in [
                'partial-cv', 'partial-cv-iterative-fit'
        ]:
            raise ValueError('Cannot call fit_ensemble with resampling '
                             'strategy %s.' % self._resampling_strategy)

        if self._logger is None:
            self._logger = self._get_logger(dataset_name)

        self._proc_ensemble = self._get_ensemble_process(
            1,
            task,
            metric,
            precision,
            dataset_name,
            max_iterations=1,
            ensemble_nbest=ensemble_nbest,
            ensemble_size=ensemble_size)
        self._proc_ensemble.main()
        self._proc_ensemble = None
        return self

    def _get_ensemble_process(self,
                              time_left_for_ensembles,
                              task=None,
                              metric=None,
                              precision=None,
                              dataset_name=None,
                              max_iterations=-1,
                              ensemble_nbest=None,
                              ensemble_size=None):

        if task is None:
            task = self._task
        else:
            self._task = task

        if metric is None:
            metric = self._metric
        else:
            self._metric = metric

        if precision is None:
            precision = self.precision
        else:
            self.precision = precision

        if dataset_name is None:
            dataset_name = self._dataset_name
        else:
            self._dataset_name = dataset_name

        if ensemble_nbest is None:
            ensemble_nbest = self._ensemble_nbest
        else:
            self._ensemble_nbest = ensemble_nbest

        if ensemble_size is None:
            ensemble_size = self._ensemble_size
        else:
            self._ensemble_size = ensemble_size

        return EnsembleBuilder(backend=self._backend,
                               dataset_name=dataset_name,
                               task_type=task,
                               metric=metric,
                               limit=time_left_for_ensembles,
                               ensemble_size=ensemble_size,
                               ensemble_nbest=ensemble_nbest,
                               seed=self._seed,
                               shared_mode=self._shared_mode,
                               precision=precision,
                               max_iterations=max_iterations)

    def _load_models(self):
        if self._shared_mode:
            seed = -1
        else:
            seed = self._seed

        self.ensemble_ = self._backend.load_ensemble(seed)
        if self.ensemble_:
            identifiers = self.ensemble_.identifiers_
            self.models_ = self._backend.load_models_by_identifiers(
                identifiers)
        else:
            self.models_ = self._backend.load_all_models(seed)

        if len(self.models_) == 0 and self._resampling_strategy not in \
                ['partial-cv', 'partial-cv-iterative-fit']:
            raise ValueError('No models fitted!')

    def score(self, X, y):
        # fix: Consider only index 1 of second dimension
        # Don't know if the reshaping should be done there or in calculate_score
        prediction = self.predict(X)
        return calculate_score(y,
                               prediction,
                               self._task,
                               self._metric,
                               self._label_num,
                               logger=self._logger)

    @property
    def grid_scores_(self):
        grid_scores = list()

        scores_per_config = defaultdict(list)
        config_list = list()

        for run_key in self.runhistory_.data:
            run_value = self.runhistory_.data[run_key]

            config_id = run_key.config_id
            cost = run_value.cost

            if config_id not in config_list:
                config_list.append(config_id)

            scores_per_config[config_id].append(cost)

        for config_id in config_list:
            scores = [1 - score for score in scores_per_config[config_id]]
            mean_score = np.mean(scores)
            config = self.runhistory_.ids_config[config_id]

            grid_score = _CVScoreTuple(config.get_dictionary(), mean_score,
                                       scores)
            grid_scores.append(grid_score)

        return grid_scores

    @property
    def cv_results_(self):
        results = dict()

        # Missing in contrast to scikit-learn
        # splitX_test_score - auto-sklearn does not store the scores on a split
        #                     basis
        # std_test_score - auto-sklearn does not store the scores on a split
        #                  basis
        # splitX_train_score - auto-sklearn does not compute train scores, add
        #                      flag to compute the train scores
        # mean_train_score - auto-sklearn does not store the train scores
        # std_train_score - auto-sklearn does not store the train scores
        # std_fit_time - auto-sklearn does not store the fit times per split
        # mean_score_time - auto-sklearn does not store the score time
        # std_score_time - auto-sklearn does not store the score time
        # TODO: add those arguments

        # TODO remove this restriction!
        if self._resampling_strategy in [
                'partial-cv', 'partial-cv-iterative-fit'
        ]:
            raise ValueError('Cannot call cv_results when using partial-cv!')

        parameter_dictionaries = dict()
        masks = dict()
        hp_names = []

        # Set up dictionary for parameter values
        for hp in self.configuration_space.get_hyperparameters():
            name = hp.name
            parameter_dictionaries[name] = []
            masks[name] = []
            hp_names.append(name)

        mean_test_score = []
        mean_fit_time = []
        params = []
        status = []
        for run_key in self.runhistory_.data:
            run_value = self.runhistory_.data[run_key]
            config_id = run_key.config_id
            config = self.runhistory_.ids_config[config_id]

            param_dict = config.get_dictionary()
            params.append(param_dict)
            mean_test_score.append(1 - run_value.cost)
            mean_fit_time.append(run_value.time)
            s = run_value.status
            if s == 1:
                status.append('Success')
            elif s == 2:
                status.append('Timeout')
            elif s == 3:
                status.append('Crash')
            elif s == 4:
                status.append('Abort')
            elif s == 5:
                status.append('Memout')
            else:
                status.append('Unknown')

            for hp_name in hp_names:
                if hp_name in param_dict:
                    hp_value = param_dict[hp_name]
                    mask_value = False
                else:
                    hp_value = np.NaN
                    mask_value = True

                parameter_dictionaries[hp_name].append(hp_value)
                masks[hp_name].append(mask_value)

        results['mean_test_score'] = np.array(mean_test_score)
        results['mean_fit_time'] = np.array(mean_fit_time)
        results['params'] = params
        results['rank_test_scores'] = scipy.stats.rankdata(
            1 - results['mean_test_score'], method='min')
        results['status'] = status

        for hp_name in hp_names:
            masked_array = ma.MaskedArray(parameter_dictionaries[hp_name],
                                          masks[hp_name])
            results['param_%s' % hp_name] = masked_array

        return results
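        # Usage sketch (hypothetical, not part of this class): the dictionary
        # built above mirrors the layout of scikit-learn's
        # GridSearchCV.cv_results_, so it can be inspected with pandas, e.g.
        #
        #     import pandas as pd
        #     df = pd.DataFrame(automl.cv_results_)
        #     df.sort_values('rank_test_scores').head()
        #
        # where automl is assumed to be an already fitted instance.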

    def sprint_statistics(self):
        cv_results = self.cv_results_
        sio = io.StringIO()
        sio.write('auto-sklearn results:\n')
        sio.write('  Dataset name: %s\n' % self._dataset_name)
        sio.write('  Metric: %s\n' % METRIC_TO_STRING[self._metric])
        idx_best_run = np.argmax(cv_results['mean_test_score'])
        best_score = cv_results['mean_test_score'][idx_best_run]
        sio.write('  Best validation score: %f\n' % best_score)
        num_runs = len(cv_results['status'])
        sio.write('  Number of target algorithm runs: %d\n' % num_runs)
        num_success = sum([s == 'Success' for s in cv_results['status']])
        sio.write('  Number of successful target algorithm runs: %d\n' %
                  num_success)
        num_crash = sum([s == 'Crash' for s in cv_results['status']])
        sio.write('  Number of crashed target algorithm runs: %d\n' %
                  num_crash)
        num_timeout = sum([s == 'Timeout' for s in cv_results['status']])
        sio.write('  Number of target algorithms that exceeded the time '
                  'limit: %d\n' % num_timeout)
        num_memout = sum([s == 'Memout' for s in cv_results['status']])
        sio.write('  Number of target algorithms that exceeded the memory '
                  'limit: %d\n' % num_memout)
        return sio.getvalue()

    def show_models(self):
        if self.models_ is None or len(self.models_) == 0 or \
                self.ensemble_ is None:
            self._load_models()

        return self.ensemble_.pprint_ensemble_string(self.models_)

    def _create_search_space(self,
                             tmp_dir,
                             backend,
                             datamanager,
                             include_estimators=None,
                             exclude_estimators=None,
                             include_preprocessors=None,
                             exclude_preprocessors=None):
        task_name = 'CreateConfigSpace'

        self._stopwatch.start_task(task_name)
        configspace_path = os.path.join(tmp_dir, 'space.pcs')
        configuration_space = pipeline.get_configuration_space(
            datamanager.info,
            include_estimators=include_estimators,
            exclude_estimators=exclude_estimators,
            include_preprocessors=include_preprocessors,
            exclude_preprocessors=exclude_preprocessors)
        configuration_space = self.configuration_space_created_hook(
            datamanager, configuration_space)
        sp_string = pcs.write(configuration_space)
        backend.write_txt_file(configspace_path, sp_string,
                               'Configuration space')
        self._stopwatch.stop_task(task_name)

        return configuration_space, configspace_path

    def configuration_space_created_hook(self, datamanager,
                                         configuration_space):
        return configuration_space
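
A note on the time budgeting in _fit above: the per-run time limit handed to SMAC is capped so that a single evaluation can never exceed the remaining overall budget. A minimal standalone sketch of that policy (the helper name is illustrative, not part of auto-sklearn):

def cap_per_run_time_limit(per_run_time_limit, time_left_for_smac):
    """Return the effective single-run limit, never exceeding the budget."""
    if per_run_time_limit is None or per_run_time_limit > time_left_for_smac:
        # A single run may not take longer than whatever time is left.
        return time_left_for_smac
    return per_run_time_limit

# Example: cap_per_run_time_limit(360, 120.0) returns 120.0
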
Esempio n. 25
0
    def main(self):

        watch = StopWatch()
        watch.start_task('ensemble_builder')

        used_time = 0
        time_iter = 0
        index_run = 0
        num_iteration = 0
        current_num_models = 0
        last_hash = None
        current_hash = None

        backend = Backend(self.output_dir, self.autosklearn_tmp_dir)
        dir_ensemble = os.path.join(self.autosklearn_tmp_dir,
                                    '.auto-sklearn',
                                    'predictions_ensemble')
        dir_valid = os.path.join(self.autosklearn_tmp_dir,
                                 '.auto-sklearn',
                                 'predictions_valid')
        dir_test = os.path.join(self.autosklearn_tmp_dir,
                                '.auto-sklearn',
                                'predictions_test')
        paths_ = [dir_ensemble, dir_valid, dir_test]

        dir_ensemble_list_mtimes = []

        self.logger.debug('Starting main loop with %f seconds and %d iterations '
                          'left.' % (self.limit - used_time, num_iteration))
        while used_time < self.limit or (self.max_iterations > 0 and
                                         self.max_iterations >= num_iteration):
            num_iteration += 1
            self.logger.debug('Time left: %f', self.limit - used_time)
            self.logger.debug('Time last ensemble building: %f', time_iter)

            # Reload the ensemble targets every iteration; this is important
            # because cv may update the ensemble targets in the course of
            # running auto-sklearn
            # TODO update cv in order to not need this any more!
            targets_ensemble = backend.load_targets_ensemble()

            # Load the predictions from the models
            exists = [os.path.isdir(dir_) for dir_ in paths_]
            if not exists[0]:  # all(exists):
                self.logger.debug(
                    'Prediction directory %s does not exist!' % dir_ensemble)
                time.sleep(2)
                used_time = watch.wall_elapsed('ensemble_builder')
                continue

            if self.shared_mode is False:
                dir_ensemble_list = sorted(glob.glob(os.path.join(
                    dir_ensemble, 'predictions_ensemble_%s_*.npy' % self.seed)))
                if exists[1]:
                    dir_valid_list = sorted(glob.glob(os.path.join(
                        dir_valid, 'predictions_valid_%s_*.npy' % self.seed)))
                else:
                    dir_valid_list = []
                if exists[2]:
                    dir_test_list = sorted(glob.glob(os.path.join(
                        dir_test, 'predictions_test_%s_*.npy' % self.seed)))
                else:
                    dir_test_list = []
            else:
                dir_ensemble_list = sorted(os.listdir(dir_ensemble))
                dir_valid_list = sorted(os.listdir(dir_valid)) if exists[1] else []
                dir_test_list = sorted(os.listdir(dir_test)) if exists[2] else []

            # Check the modification times because predictions can be updated
            # over time!
            old_dir_ensemble_list_mtimes = dir_ensemble_list_mtimes
            dir_ensemble_list_mtimes = []

            for dir_ensemble_file in dir_ensemble_list:
                if dir_ensemble_file.endswith("/"):
                    dir_ensemble_file = dir_ensemble_file[:-1]
                basename = os.path.basename(dir_ensemble_file)
                dir_ensemble_file = os.path.join(dir_ensemble, basename)
                mtime = os.path.getmtime(dir_ensemble_file)
                dir_ensemble_list_mtimes.append(mtime)

            if len(dir_ensemble_list) == 0:
                self.logger.debug('Directories are empty')
                time.sleep(2)
                used_time = watch.wall_elapsed('ensemble_builder')
                continue

            if len(dir_ensemble_list) <= current_num_models and \
                    old_dir_ensemble_list_mtimes == dir_ensemble_list_mtimes:
                self.logger.debug('Nothing has changed since the last time')
                time.sleep(2)
                used_time = watch.wall_elapsed('ensemble_builder')
                continue

            watch.start_task('index_run' + str(index_run))
            watch.start_task('ensemble_iter_' + str(num_iteration))

            # List of num_runs (which are in the filename) which will be included
            #  later
            include_num_runs = []
            backup_num_runs = []
            model_and_automl_re = re.compile(r'_([0-9]*)_([0-9]*)\.npy$')
            if self.ensemble_nbest is not None:
                # Keeps track of the single scores of each model in our ensemble
                scores_nbest = []
                # The indices of the model that are currently in our ensemble
                indices_nbest = []
                # The names of the models
                model_names = []

            model_names_to_scores = dict()

            model_idx = 0
            for model_name in dir_ensemble_list:
                if model_name.endswith("/"):
                    model_name = model_name[:-1]
                basename = os.path.basename(model_name)

                try:
                    if self.precision == "16":
                        predictions = np.load(os.path.join(dir_ensemble, basename)).astype(dtype=np.float16)
                    elif self.precision == "32":
                        predictions = np.load(os.path.join(dir_ensemble, basename)).astype(dtype=np.float32)
                    elif self.precision == "64":
                        predictions = np.load(os.path.join(dir_ensemble, basename)).astype(dtype=np.float64)
                    else:
                        predictions = np.load(os.path.join(dir_ensemble, basename))

                    score = calculate_score(targets_ensemble, predictions,
                                            self.task_type, self.metric,
                                            predictions.shape[1])

                except Exception as e:
                    self.logger.warning('Error loading %s: %s', basename, e)
                    score = -1

                model_names_to_scores[model_name] = score
                match = model_and_automl_re.search(model_name)
                automl_seed = int(match.group(1))
                num_run = int(match.group(2))

                if self.ensemble_nbest is not None:
                    if score <= 0.001:
                        self.logger.error(
                            'Model only predicts at random: ' + model_name +
                            ' has score: ' + str(score))
                        backup_num_runs.append((automl_seed, num_run))
                    # If we have less models in our ensemble than ensemble_nbest add
                    # the current model if it is better than random
                    elif len(scores_nbest) < self.ensemble_nbest:
                        scores_nbest.append(score)
                        indices_nbest.append(model_idx)
                        include_num_runs.append((automl_seed, num_run))
                        model_names.append(model_name)
                    else:
                        # Take the worst performing model in our ensemble so far
                        idx = np.argmin(np.array([scores_nbest]))

                        # If the current model is better than the worst model in
                        # our ensemble replace it by the current model
                        if scores_nbest[idx] < score:
                            self.logger.debug(
                                'Worst model in our ensemble: %s with score '
                                '%f will be replaced by model %s with score '
                                '%f', model_names[idx], scores_nbest[idx],
                                model_name, score)
                            # Exclude the old model
                            del scores_nbest[idx]
                            scores_nbest.append(score)
                            del include_num_runs[idx]
                            del indices_nbest[idx]
                            indices_nbest.append(model_idx)
                            include_num_runs.append((automl_seed, num_run))
                            del model_names[idx]
                            model_names.append(model_name)

                        # Otherwise exclude the current model from the ensemble
                        else:
                            # include_num_runs.append(True)
                            pass

                else:
                    # Load all predictions that are better than random
                    if score <= 0.001:
                        # include_num_runs.append(True)
                        self.logger.error(
                            'Model only predicts at random: ' + model_name +
                            ' has score: ' + str(score))
                        backup_num_runs.append((automl_seed, num_run))
                    else:
                        include_num_runs.append((automl_seed, num_run))

                model_idx += 1

            # If there is no model better than random guessing, we have to use
            # all models which do random guessing
            if len(include_num_runs) == 0:
                include_num_runs = backup_num_runs

            indices_to_model_names = dict()
            indices_to_run_num = dict()
            for i, model_name in enumerate(dir_ensemble_list):
                match = model_and_automl_re.search(model_name)
                automl_seed = int(match.group(1))
                num_run = int(match.group(2))
                if (automl_seed, num_run) in include_num_runs:
                    num_indices = len(indices_to_model_names)
                    indices_to_model_names[num_indices] = model_name
                    indices_to_run_num[num_indices] = (automl_seed, num_run)

            try:
                all_predictions_train, all_predictions_valid, all_predictions_test =\
                    self.get_all_predictions(dir_ensemble, dir_ensemble_list,
                                             dir_valid, dir_valid_list,
                                             dir_test, dir_test_list,
                                             include_num_runs,
                                             model_and_automl_re,
                                             self.precision)
            except IOError:
                self.logger.error('Could not load the predictions.')
                continue

            if len(include_num_runs) == 0:
                self.logger.error('All models do just random guessing')
                time.sleep(2)
                continue

            else:
                ensemble = EnsembleSelection(ensemble_size=self.ensemble_size,
                                             task_type=self.task_type,
                                             metric=self.metric)

                try:
                    ensemble.fit(all_predictions_train, targets_ensemble,
                                 include_num_runs)
                    self.logger.info(ensemble)

                except ValueError as e:
                    self.logger.error('Caught ValueError: ' + str(e))
                    used_time = watch.wall_elapsed('ensemble_builder')
                    time.sleep(2)
                    continue
                except IndexError as e:
                    self.logger.error('Caught IndexError: ' + str(e))
                    used_time = watch.wall_elapsed('ensemble_builder')
                    time.sleep(2)
                    continue
                except Exception as e:
                    self.logger.error('Caught error! %s', str(e))
                    used_time = watch.wall_elapsed('ensemble_builder')
                    time.sleep(2)
                    continue

                # Output the score
                self.logger.info('Training performance: %f' % ensemble.train_score_)

                self.logger.info(
                    'Building the ensemble took %f seconds' %
                    watch.wall_elapsed('ensemble_iter_' + str(num_iteration)))

            # Set this variable here to avoid re-running the ensemble builder
            # every two seconds in case the ensemble did not change
            current_num_models = len(dir_ensemble_list)

            ensemble_predictions = ensemble.predict(all_predictions_train)
            if sys.version_info[0] == 2:
                ensemble_predictions.flags.writeable = False
                current_hash = hash(ensemble_predictions.data)
            else:
                current_hash = hash(ensemble_predictions.data.tobytes())

            # Only output a new ensemble and new predictions if the output of the
            # ensemble would actually change!
            # TODO this is neither safe (collisions, tests only with the ensemble
            #  prediction, but not the ensemble), implement a hash function for
            # each possible ensemble builder.
            if last_hash is not None:
                if current_hash == last_hash:
                    self.logger.info('Ensemble output did not change.')
                    time.sleep(2)
                    continue
                else:
                    last_hash = current_hash
            else:
                last_hash = current_hash

            # Save the ensemble for later use in the main auto-sklearn module!
            backend.save_ensemble(ensemble, index_run, self.seed)

            # Save predictions for valid and test data set
            if len(dir_valid_list) == len(dir_ensemble_list):
                all_predictions_valid = np.array(all_predictions_valid)
                ensemble_predictions_valid = ensemble.predict(all_predictions_valid)
                if self.task_type == BINARY_CLASSIFICATION:
                    ensemble_predictions_valid = ensemble_predictions_valid[:, 1]
                if self.low_precision:
                    if self.task_type in [BINARY_CLASSIFICATION, MULTICLASS_CLASSIFICATION, MULTILABEL_CLASSIFICATION]:
                        ensemble_predictions_valid[ensemble_predictions_valid < 1e-4] = 0.
                    if self.metric in [BAC_METRIC, F1_METRIC]:
                        bin_array = np.zeros(ensemble_predictions_valid.shape, dtype=np.int32)
                        if (self.task_type != MULTICLASS_CLASSIFICATION) or (
                            ensemble_predictions_valid.shape[1] == 1):
                            bin_array[ensemble_predictions_valid >= 0.5] = 1
                        else:
                            sample_num = ensemble_predictions_valid.shape[0]
                            for i in range(sample_num):
                                j = np.argmax(ensemble_predictions_valid[i, :])
                                bin_array[i, j] = 1
                        ensemble_predictions_valid = bin_array
                    if self.task_type in CLASSIFICATION_TASKS:
                        if ensemble_predictions_valid.size < (20000 * 20):
                            precision = 3
                        else:
                            precision = 2
                    else:
                        if ensemble_predictions_valid.size > 1000000:
                            precision = 4
                        else:
                            # File size maximally 2.1MB
                            precision = 6

                backend.save_predictions_as_txt(ensemble_predictions_valid,
                                                'valid', index_run, prefix=self.dataset_name,
                                                precision=precision)
            else:
                self.logger.info(
                    'Could not find as many validation set predictions (%d) '
                    'as ensemble predictions (%d)!', len(dir_valid_list),
                    len(dir_ensemble_list))

            del all_predictions_valid

            if len(dir_test_list) == len(dir_ensemble_list):
                all_predictions_test = np.array(all_predictions_test)
                ensemble_predictions_test = ensemble.predict(all_predictions_test)
                if self.task_type == BINARY_CLASSIFICATION:
                    ensemble_predictions_test = ensemble_predictions_test[:, 1]
                if self.low_precision:
                    if self.task_type in [BINARY_CLASSIFICATION, MULTICLASS_CLASSIFICATION, MULTILABEL_CLASSIFICATION]:
                        ensemble_predictions_test[ensemble_predictions_test < 1e-4] = 0.
                    if self.metric in [BAC_METRIC, F1_METRIC]:
                        bin_array = np.zeros(ensemble_predictions_test.shape,
                                             dtype=np.int32)
                        if (self.task_type != MULTICLASS_CLASSIFICATION) or (
                                    ensemble_predictions_test.shape[1] == 1):
                            bin_array[ensemble_predictions_test >= 0.5] = 1
                        else:
                            sample_num = ensemble_predictions_test.shape[0]
                            for i in range(sample_num):
                                j = np.argmax(ensemble_predictions_test[i, :])
                                bin_array[i, j] = 1
                        ensemble_predictions_test = bin_array
                    if self.task_type in CLASSIFICATION_TASKS:
                        if ensemble_predictions_test.size < (20000 * 20):
                            precision = 3
                        else:
                            precision = 2
                    else:
                        if ensemble_predictions_test.size > 1000000:
                            precision = 4
                        else:
                            precision = 6

                backend.save_predictions_as_txt(ensemble_predictions_test,
                                                'test', index_run, prefix=self.dataset_name,
                                                precision=precision)
            else:
                self.logger.info(
                    'Could not find as many test set predictions (%d) as '
                    'ensemble predictions (%d)!', len(dir_test_list),
                    len(dir_ensemble_list))

            del all_predictions_test

            current_num_models = len(dir_ensemble_list)
            watch.stop_task('index_run' + str(index_run))
            time_iter = watch.get_wall_dur('index_run' + str(index_run))
            used_time = watch.wall_elapsed('ensemble_builder')
            index_run += 1
        return
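
The ensemble builder above identifies models purely from their prediction file names, which follow the pattern predictions_ensemble_<automl_seed>_<num_run>.npy used in the glob calls. A small standalone sketch of that parsing step (the helper name is illustrative, not from the original source):

import re

# Same regular expression as in the loop above.
model_and_automl_re = re.compile(r'_([0-9]*)_([0-9]*)\.npy$')

def parse_identifiers(model_name):
    """Return (automl_seed, num_run) parsed from a prediction file name."""
    match = model_and_automl_re.search(model_name)
    if match is None:
        raise ValueError('Not a prediction file: %s' % model_name)
    return int(match.group(1)), int(match.group(2))

# parse_identifiers('predictions_ensemble_1_7.npy') returns (1, 7)
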
Esempio n. 26
0
    def main(self):

        watch = StopWatch()
        watch.start_task('ensemble_builder')

        used_time = 0
        time_iter = 0
        index_run = 0
        num_iteration = 0
        current_num_models = 0
        last_hash = None
        current_hash = None

        dir_ensemble = os.path.join(self.backend.temporary_directory,
                                    '.auto-sklearn', 'predictions_ensemble')
        dir_valid = os.path.join(self.backend.temporary_directory,
                                 '.auto-sklearn', 'predictions_valid')
        dir_test = os.path.join(self.backend.temporary_directory,
                                '.auto-sklearn', 'predictions_test')
        paths_ = [dir_ensemble, dir_valid, dir_test]

        dir_ensemble_list_mtimes = []

        self.logger.debug(
            'Starting main loop with %f seconds and %d iterations '
            'left.' % (self.limit - used_time, num_iteration))
        while used_time < self.limit or (self.max_iterations > 0 and
                                         self.max_iterations >= num_iteration):
            num_iteration += 1
            self.logger.debug('Time left: %f', self.limit - used_time)
            self.logger.debug('Time last ensemble building: %f', time_iter)

            # Reload the ensemble targets every iteration; this is important
            # because cv may update the ensemble targets in the course of
            # running auto-sklearn
            # TODO update cv in order to not need this any more!
            targets_ensemble = self.backend.load_targets_ensemble()

            # Load the predictions from the models
            exists = [os.path.isdir(dir_) for dir_ in paths_]
            if not exists[0]:  # all(exists):
                self.logger.debug('Prediction directory %s does not exist!' %
                                  dir_ensemble)
                time.sleep(2)
                used_time = watch.wall_elapsed('ensemble_builder')
                continue

            if self.shared_mode is False:
                dir_ensemble_list = sorted(
                    glob.glob(
                        os.path.join(
                            dir_ensemble,
                            'predictions_ensemble_%s_*.npy' % self.seed)))
                if exists[1]:
                    dir_valid_list = sorted(
                        glob.glob(
                            os.path.join(
                                dir_valid,
                                'predictions_valid_%s_*.npy' % self.seed)))
                else:
                    dir_valid_list = []
                if exists[2]:
                    dir_test_list = sorted(
                        glob.glob(
                            os.path.join(
                                dir_test,
                                'predictions_test_%s_*.npy' % self.seed)))
                else:
                    dir_test_list = []
            else:
                dir_ensemble_list = sorted(os.listdir(dir_ensemble))
                dir_valid_list = sorted(
                    os.listdir(dir_valid)) if exists[1] else []
                dir_test_list = sorted(
                    os.listdir(dir_test)) if exists[2] else []

            # Check the modification times because predictions can be updated
            # over time!
            old_dir_ensemble_list_mtimes = dir_ensemble_list_mtimes
            dir_ensemble_list_mtimes = []
            # The ensemble dir can contain non-model files. We filter them and
            # use the following list instead
            dir_ensemble_model_files = []

            for dir_ensemble_file in dir_ensemble_list:
                if dir_ensemble_file.endswith("/"):
                    dir_ensemble_file = dir_ensemble_file[:-1]
                if not dir_ensemble_file.endswith(".npy"):
                    self.logger.info('Error loading file (not .npy): %s',
                                     dir_ensemble_file)
                    continue

                dir_ensemble_model_files.append(dir_ensemble_file)
                basename = os.path.basename(dir_ensemble_file)
                dir_ensemble_file = os.path.join(dir_ensemble, basename)
                mtime = os.path.getmtime(dir_ensemble_file)
                dir_ensemble_list_mtimes.append(mtime)

            if len(dir_ensemble_model_files) == 0:
                self.logger.debug('Directories are empty')
                time.sleep(2)
                used_time = watch.wall_elapsed('ensemble_builder')
                continue

            if len(dir_ensemble_model_files) <= current_num_models and \
                    old_dir_ensemble_list_mtimes == dir_ensemble_list_mtimes:
                self.logger.debug('Nothing has changed since the last time')
                time.sleep(2)
                used_time = watch.wall_elapsed('ensemble_builder')
                continue

            with warnings.catch_warnings():
                warnings.simplefilter('ignore')
                # TODO restructure time management in the ensemble builder,
                # what is the time of index_run actually needed for?
                watch.start_task('index_run' + str(index_run))
            watch.start_task('ensemble_iter_' + str(num_iteration))

            # List of num_runs (which are in the filename) which will be included
            #  later
            include_num_runs = []
            backup_num_runs = []
            model_and_automl_re = re.compile(r'_([0-9]*)_([0-9]*)\.npy')
            if self.ensemble_nbest is not None:
                # Keeps track of the single scores of each model in our ensemble
                scores_nbest = []
                # The indices of the model that are currently in our ensemble
                indices_nbest = []
                # The names of the models
                model_names = []

            model_names_to_scores = dict()

            model_idx = 0
            for model_name in dir_ensemble_model_files:
                if model_name.endswith("/"):
                    model_name = model_name[:-1]
                basename = os.path.basename(model_name)

                try:
                    with open(os.path.join(dir_ensemble, basename),
                              'rb') as fh:
                        if self.precision == "16":
                            predictions = np.load(fh).astype(dtype=np.float16)
                        elif self.precision == "32":
                            predictions = np.load(fh).astype(dtype=np.float32)
                        elif self.precision == "64":
                            predictions = np.load(fh).astype(dtype=np.float64)
                        else:
                            predictions = np.load(fh)

                    score = calculate_score(solution=targets_ensemble,
                                            prediction=predictions,
                                            task_type=self.task_type,
                                            metric=self.metric,
                                            all_scoring_functions=False)

                except Exception as e:
                    self.logger.warning('Error loading %s: %s - %s', basename,
                                        type(e), e)
                    score = -1

                model_names_to_scores[model_name] = score
                match = model_and_automl_re.search(model_name)
                automl_seed = int(match.group(1))
                num_run = int(match.group(2))

                if self.ensemble_nbest is not None:
                    if score <= 0.001 or num_run == 1:
                        self.logger.info('Model only predicts at random: ' +
                                         model_name + ' has score: ' +
                                         str(score))
                        backup_num_runs.append((automl_seed, num_run))
                    # If we have less models in our ensemble than ensemble_nbest add
                    # the current model if it is better than random
                    elif len(scores_nbest) < self.ensemble_nbest:
                        scores_nbest.append(score)
                        indices_nbest.append(model_idx)
                        include_num_runs.append((automl_seed, num_run))
                        model_names.append(model_name)
                    else:
                        # Take the worst performing model in our ensemble so far
                        idx = np.argmin(np.array([scores_nbest]))

                        # If the current model is better than the worst model in
                        # our ensemble replace it by the current model
                        if scores_nbest[idx] < score:
                            self.logger.info(
                                'Worst model in our ensemble: %s with score %f '
                                'will be replaced by model %s with score %f',
                                model_names[idx], scores_nbest[idx],
                                model_name, score)
                            # Exclude the old model
                            del scores_nbest[idx]
                            scores_nbest.append(score)
                            del include_num_runs[idx]
                            del indices_nbest[idx]
                            indices_nbest.append(model_idx)
                            include_num_runs.append((automl_seed, num_run))
                            del model_names[idx]
                            model_names.append(model_name)

                        # Otherwise exclude the current model from the ensemble
                        else:
                            # include_num_runs.append(True)
                            pass

                else:
                    # Load all predictions that are better than random
                    if score <= 0.001 or num_run == 1:
                        # include_num_runs.append(True)
                        self.logger.info('Model only predicts at random: ' +
                                         model_name + ' has score: ' +
                                         str(score))
                        backup_num_runs.append((automl_seed, num_run))
                    else:
                        include_num_runs.append((automl_seed, num_run))

                model_idx += 1

            # If there is no model better than random guessing, we have to use
            # all models which do random guessing
            if len(include_num_runs) == 0:
                include_num_runs = backup_num_runs

            indices_to_model_names = dict()
            indices_to_run_num = dict()
            for i, model_name in enumerate(dir_ensemble_model_files):
                match = model_and_automl_re.search(model_name)
                automl_seed = int(match.group(1))
                num_run = int(match.group(2))
                if (automl_seed, num_run) in include_num_runs:
                    num_indices = len(indices_to_model_names)
                    indices_to_model_names[num_indices] = model_name
                    indices_to_run_num[num_indices] = (automl_seed, num_run)

            try:
                all_predictions_train, all_predictions_valid, all_predictions_test =\
                    self.get_all_predictions(dir_ensemble,
                                             dir_ensemble_model_files,
                                             dir_valid, dir_valid_list,
                                             dir_test, dir_test_list,
                                             include_num_runs,
                                             model_and_automl_re,
                                             self.precision)
            except IOError as e:
                self.logger.error('Could not load the predictions: %s', e)
                continue

            if len(include_num_runs) == 0:
                self.logger.error('All models do just random guessing')
                time.sleep(2)
                continue

            else:
                ensemble = EnsembleSelection(ensemble_size=self.ensemble_size,
                                             task_type=self.task_type,
                                             metric=self.metric)

                try:
                    ensemble.fit(all_predictions_train, targets_ensemble,
                                 include_num_runs)
                    self.logger.info(ensemble)

                except ValueError as e:
                    self.logger.error('Caught ValueError: ' + str(e))
                    used_time = watch.wall_elapsed('ensemble_builder')
                    time.sleep(2)
                    continue
                except IndexError as e:
                    self.logger.error('Caught IndexError: ' + str(e))
                    used_time = watch.wall_elapsed('ensemble_builder')
                    time.sleep(2)
                    continue
                #except Exception as e:
                #    self.logger.error('Caught error! %s', str(e))
                #    used_time = watch.wall_elapsed('ensemble_builder')
                #    time.sleep(2)
                #    continue

                # Output the score
                self.logger.info('Training performance: %f' %
                                 ensemble.train_score_)

                self.logger.info(
                    'Building the ensemble took %f seconds' %
                    watch.wall_elapsed('ensemble_iter_' + str(num_iteration)))

            # Set this variable here to avoid re-running the ensemble builder
            # every two seconds in case the ensemble did not change
            current_num_models = len(dir_ensemble_model_files)

            ensemble_predictions = ensemble.predict(all_predictions_train)
            if sys.version_info[0] == 2:
                ensemble_predictions.flags.writeable = False
                current_hash = hash(ensemble_predictions.data)
            else:
                current_hash = hash(ensemble_predictions.data.tobytes())

            # Only output a new ensemble and new predictions if the output of the
            # ensemble would actually change!
            # TODO this is neither safe (collisions, tests only with the ensemble
            #  prediction, but not the ensemble), implement a hash function for
            # each possible ensemble builder.
            if last_hash is not None:
                if current_hash == last_hash:
                    self.logger.info('Ensemble output did not change.')
                    time.sleep(2)
                    continue
                else:
                    last_hash = current_hash
            else:
                last_hash = current_hash

            # Save the ensemble for later use in the main auto-sklearn module!
            self.backend.save_ensemble(ensemble, index_run, self.seed)

            # Save predictions for valid and test data set
            if len(dir_valid_list) == len(dir_ensemble_model_files):
                all_predictions_valid = np.array(all_predictions_valid)
                ensemble_predictions_valid = ensemble.predict(
                    all_predictions_valid)
                if self.task_type == BINARY_CLASSIFICATION:
                    ensemble_predictions_valid = ensemble_predictions_valid[:,
                                                                            1]

                self.backend.save_predictions_as_txt(
                    ensemble_predictions_valid,
                    'valid',
                    index_run,
                    prefix=self.dataset_name)
            else:
                self.logger.info(
                    'Could not find as many validation set predictions (%d) '
                    'as ensemble predictions (%d)!', len(dir_valid_list),
                    len(dir_ensemble_model_files))

            del all_predictions_valid

            if len(dir_test_list) == len(dir_ensemble_model_files):
                all_predictions_test = np.array(all_predictions_test)
                ensemble_predictions_test = ensemble.predict(
                    all_predictions_test)
                if self.task_type == BINARY_CLASSIFICATION:
                    ensemble_predictions_test = ensemble_predictions_test[:, 1]

                self.backend.save_predictions_as_txt(ensemble_predictions_test,
                                                     'test',
                                                     index_run,
                                                     prefix=self.dataset_name)
            else:
                self.logger.info(
                    'Could not find as many test set predictions (%d) as '
                    'ensemble predictions (%d)!', len(dir_test_list),
                    len(dir_ensemble_model_files))

            del all_predictions_test

            current_num_models = len(dir_ensemble_model_files)
            watch.stop_task('index_run' + str(index_run))
            time_iter = watch.get_wall_dur('index_run' + str(index_run))
            used_time = watch.wall_elapsed('ensemble_builder')
            index_run += 1
        return
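A side note on the change-detection logic above: a new ensemble is only written to disk when the hash of the ensemble's training predictions differs from the previous iteration. A minimal, standalone sketch of that trick (NumPy only; the array contents are made up):

import numpy as np

preds = np.array([[0.7, 0.3], [0.2, 0.8]])
last_hash = hash(preds.data.tobytes())  # Python 3 branch of the loop above

# Identical predictions -> identical bytes -> identical hash -> skip writing output.
assert hash(preds.copy().data.tobytes()) == last_hash

# Any change in the ensemble output yields a different hash -> a new ensemble is saved.
preds[0, 0] = 0.6
assert hash(preds.data.tobytes()) != last_hash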
Example No. 27
class AutoML(BaseEstimator, multiprocessing.Process):

    def __init__(self,
                 tmp_dir,
                 output_dir,
                 time_left_for_this_task,
                 per_run_time_limit,
                 log_dir=None,
                 initial_configurations_via_metalearning=25,
                 ensemble_size=1,
                 ensemble_nbest=1,
                 seed=1,
                 ml_memory_limit=3000,
                 metadata_directory=None,
                 queue=None,
                 keep_models=True,
                 debug_mode=False,
                 include_estimators=None,
                 include_preprocessors=None,
                 resampling_strategy='holdout-iterative-fit',
                 resampling_strategy_arguments=None,
                 delete_tmp_folder_after_terminate=False,
                 delete_output_folder_after_terminate=False,
                 shared_mode=False,
                 precision=32,
                 max_iter_smac=None,
                 acquisition_function='EI'):
        super(AutoML, self).__init__()

        self._tmp_dir = tmp_dir
        self._output_dir = output_dir
        self._time_for_task = time_left_for_this_task
        self._per_run_time_limit = per_run_time_limit
        self._log_dir = log_dir if log_dir is not None else self._tmp_dir
        self._initial_configurations_via_metalearning = \
            initial_configurations_via_metalearning
        self._ensemble_size = ensemble_size
        self._ensemble_nbest = ensemble_nbest
        self._seed = seed
        self._ml_memory_limit = ml_memory_limit
        self._data_memory_limit = None
        self._metadata_directory = metadata_directory
        self._queue = queue
        self._keep_models = keep_models
        self._include_estimators = include_estimators
        self._include_preprocessors = include_preprocessors
        self._resampling_strategy = resampling_strategy
        self._resampling_strategy_arguments = resampling_strategy_arguments
        self._max_iter_smac = max_iter_smac
        self.delete_tmp_folder_after_terminate = \
            delete_tmp_folder_after_terminate
        self.delete_output_folder_after_terminate = \
            delete_output_folder_after_terminate
        self._shared_mode = shared_mode
        self.precision = precision
        self.acquisition_function = acquisition_function

        self._datamanager = None
        self._dataset_name = None
        self._stopwatch = StopWatch()
        self._logger = None
        self._task = None
        self._metric = None
        self._label_num = None
        self._parser = None
        self.models_ = None
        self.ensemble_ = None
        self._can_predict = False

        self._debug_mode = debug_mode

        if not isinstance(self._time_for_task, int):
            raise ValueError("time_left_for_this_task not of type integer, "
                             "but %s" % str(type(self._time_for_task)))
        if not isinstance(self._per_run_time_limit, int):
            raise ValueError("per_run_time_limit not of type integer, but %s" %
                             str(type(self._per_run_time_limit)))

        # After assigning and checking variables...
        self._backend = Backend(self._output_dir, self._tmp_dir)

    def start_automl(self, parser):
        self._parser = parser
        self.start()

    def start(self):
        if self._parser is None:
            raise ValueError('You must invoke start() only via start_automl()')
        super(AutoML, self).start()

    def run(self):
        if self._parser is None:
            raise ValueError('You must invoke run() only via start_automl()')
        self._backend.save_start_time(self._seed)
        self._stopwatch = StopWatch()
        datamanager = get_data_manager(namespace=self._parser)
        self._stopwatch.start_task(datamanager.name)

        self._logger = self._get_logger(datamanager.name)

        self._datamanager = datamanager
        self._dataset_name = datamanager.name
        self._fit(self._datamanager)

    def fit(self, X, y,
            task=MULTICLASS_CLASSIFICATION,
            metric='acc_metric',
            feat_type=None,
            dataset_name=None):
        if dataset_name is None:
            m = hashlib.md5()
            m.update(X.data)
            dataset_name = m.hexdigest()

        self._backend.save_start_time(self._seed)
        self._stopwatch = StopWatch()
        self._dataset_name = dataset_name
        self._stopwatch.start_task(self._dataset_name)

        self._logger = self._get_logger(dataset_name)

        if isinstance(metric, str):
            metric = STRING_TO_METRIC[metric]

        if feat_type is not None and len(feat_type) != X.shape[1]:
            raise ValueError('Array feat_type does not have same number of '
                             'variables as X has features. %d vs %d.' %
                             (len(feat_type), X.shape[1]))
        if feat_type is not None and not all([isinstance(f, str)
                                              for f in feat_type]):
            raise ValueError('Array feat_type must only contain strings.')
        if feat_type is not None:
            for ft in feat_type:
                if ft.lower() not in ['categorical', 'numerical']:
                    raise ValueError('Only `Categorical` and `Numerical` are '
                                     'valid feature types, you passed `%s`' % ft)

        self._data_memory_limit = None
        loaded_data_manager = XYDataManager(X, y,
                                            task=task,
                                            metric=metric,
                                            feat_type=feat_type,
                                            dataset_name=dataset_name,
                                            encode_labels=False)

        return self._fit(loaded_data_manager)

    def fit_automl_dataset(self, dataset):
        self._stopwatch = StopWatch()
        self._backend.save_start_time(self._seed)

        name = os.path.basename(dataset)
        self._stopwatch.start_task(name)
        self._start_task(self._stopwatch, name)
        self._dataset_name = name

        self._logger = self._get_logger(name)
        self._logger.debug('======== Reading and converting data ==========')
        # Encoding the labels will be done after the metafeature calculation!
        self._data_memory_limit = float(self._ml_memory_limit) / 3
        loaded_data_manager = CompetitionDataManager(
            dataset, encode_labels=False,
            max_memory_in_mb=self._data_memory_limit)
        loaded_data_manager_str = str(loaded_data_manager).split('\n')
        for part in loaded_data_manager_str:
            self._logger.debug(part)

        return self._fit(loaded_data_manager)

    def _get_logger(self, name):
        logger_name = 'AutoML(%d):%s' % (self._seed, name)
        setup_logger(os.path.join(self._tmp_dir, '%s.log' % str(logger_name)))
        return get_logger(logger_name)

    @staticmethod
    def _start_task(watcher, task_name):
        watcher.start_task(task_name)

    @staticmethod
    def _stop_task(watcher, task_name):
        watcher.stop_task(task_name)

    @staticmethod
    def _print_load_time(basename, time_left_for_this_task,
                         time_for_load_data, logger):

        time_left_after_reading = max(
            0, time_left_for_this_task - time_for_load_data)
        logger.info('Remaining time after reading %s %5.2f sec' %
                    (basename, time_left_after_reading))
        return time_for_load_data

    def _do_dummy_prediction(self, datamanager, num_run):

        self._logger.info("Starting to create dummy predictions.")
        time_limit = int(self._time_for_task / 6.)
        memory_limit = int(self._ml_memory_limit)

        _info = eval_with_limits(datamanager, self._tmp_dir, 1,
                                 self._seed, num_run,
                                 self._resampling_strategy,
                                 self._resampling_strategy_arguments,
                                 memory_limit, time_limit)
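        # Judging from the checks below, _info is the evaluation result tuple
        # returned by eval_with_limits: index 4 holds the run status
        # (a StatusType) and index 3 the error message / additional run info.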
        if _info[4] == StatusType.SUCCESS:
            self._logger.info("Finished creating dummy prediction 1/2.")
        else:
            self._logger.error('Error creating dummy prediction 1/2: %s',
                               _info[3])

        num_run += 1

        _info = eval_with_limits(datamanager, self._tmp_dir, 2,
                                 self._seed, num_run,
                                 self._resampling_strategy,
                                 self._resampling_strategy_arguments,
                                 memory_limit, time_limit)
        if _info[4] == StatusType.SUCCESS:
            self._logger.info("Finished creating dummy prediction 2/2.")
        else:
            self._logger.error('Error creating dummy prediction 2/2: %s',
                               _info[3])

        num_run += 1
        return num_run

    def _fit(self, datamanager):
        # Reset learnt stuff
        self.models_ = None
        self.ensemble_ = None

        # Check arguments prior to doing anything!
        if self._resampling_strategy not in ['holdout', 'holdout-iterative-fit',
                                             'cv', 'nested-cv', 'partial-cv']:
            raise ValueError('Illegal resampling strategy: %s' %
                             self._resampling_strategy)
        if self._resampling_strategy == 'partial-cv' and \
                self._ensemble_size != 0:
            raise ValueError("Resampling strategy partial-cv cannot be used "
                             "together with ensembles.")

        acquisition_functions = ['EI', 'EIPS']
        if self.acquisition_function not in acquisition_functions:
            raise ValueError('Illegal acquisition %s: Must be one of %s.' %
                             (self.acquisition_function, acquisition_functions))

        self._backend._make_internals_directory()
        if self._keep_models:
            try:
                os.mkdir(self._backend.get_model_dir())
            except OSError:
                self._logger.warning("model directory already exists")
                if not self._shared_mode:
                    raise

        self._metric = datamanager.info['metric']
        self._task = datamanager.info['task']
        self._label_num = datamanager.info['label_num']

        # == Pickle the data manager to speed up loading
        data_manager_path = self._backend.save_datamanager(datamanager)

        self._save_ensemble_data(
            datamanager.data['X_train'],
            datamanager.data['Y_train'])

        time_for_load_data = self._stopwatch.wall_elapsed(self._dataset_name)

        if self._debug_mode:
            self._print_load_time(
                self._dataset_name,
                self._time_for_task,
                time_for_load_data,
                self._logger)

        # == Perform dummy predictions
        num_run = 1
        #if self._resampling_strategy in ['holdout', 'holdout-iterative-fit']:
        num_run = self._do_dummy_prediction(datamanager, num_run)

        # == Create a search space
        # Do this before one-hot encoding to make sure that the search space
        # is created for a dense classifier even if one-hot encoding would
        # make the data sparse. Trade-off: if the data does become sparse,
        # densifier and truncatedSVD would probably lead to a MemoryError,
        # so some of the preprocessing methods cannot be used in that case.
        self.configuration_space, configspace_path = self._create_search_space(
            self._tmp_dir,
            self._backend,
            datamanager,
            self._include_estimators,
            self._include_preprocessors)

        # == RUN ensemble builder
        # Do this before calculating the meta-features to make sure that the
        # dummy predictions are actually included in the ensemble even if
        # calculating the meta-features takes very long
        ensemble_task_name = 'runEnsemble'
        self._stopwatch.start_task(ensemble_task_name)
        time_left_for_ensembles = max(
            0, self._time_for_task -
            self._stopwatch.wall_elapsed(self._dataset_name))
        if self._logger:
            self._logger.info(
                'Start Ensemble with %5.2fsec time left' % time_left_for_ensembles)
        if time_left_for_ensembles <= 0:
            self._logger.warning("Not starting ensemble builder because there "
                                 "is no time left!")
            self._proc_ensemble = None
        else:
            self._proc_ensemble = self._get_ensemble_process(time_left_for_ensembles)
            self._proc_ensemble.start()
        self._stopwatch.stop_task(ensemble_task_name)

        # == RUN SMBO
        # Delete the datamanager; it will be re-loaded anyway from the subprocesses.
        try:
            del self._datamanager
        except Exception:
            pass
            
        # => RUN SMAC
        smac_task_name = 'runSMAC'
        self._stopwatch.start_task(smac_task_name)
        time_left_for_smac = max(0,
            self._time_for_task - self._stopwatch.wall_elapsed(
            self._dataset_name))

        if self._logger:
            self._logger.info(
                'Start SMAC with %5.2fsec time left' % time_left_for_smac)
        if time_left_for_smac <= 0:
            self._logger.warning("Not starting SMAC because there is no time "
                                 "left.")
            self._proc_smac = None
        else:
            self._proc_smac = AutoMLSMBO(config_space=self.configuration_space,
                                         dataset_name=self._dataset_name,
                                         tmp_dir=self._tmp_dir,
                                         output_dir=self._output_dir,
                                         total_walltime_limit=time_left_for_smac,
                                         func_eval_time_limit=self._per_run_time_limit,
                                         memory_limit=self._ml_memory_limit,
                                         data_memory_limit=self._data_memory_limit,
                                         watcher=self._stopwatch,
                                         start_num_run=num_run,
                                         num_metalearning_cfgs=self._initial_configurations_via_metalearning,
                                         config_file=configspace_path,
                                         smac_iters=self._max_iter_smac,
                                         seed=self._seed,
                                         metadata_directory=self._metadata_directory,
                                         resampling_strategy=self._resampling_strategy,
                                         resampling_strategy_args=self._resampling_strategy_arguments,
                                         acquisition_function=self.acquisition_function,
                                         shared_mode=self._shared_mode)
            self._proc_smac.start()

        psutil_procs = []
        procs = []
        if self._proc_smac is not None:
            proc_smac = psutil.Process(self._proc_smac.pid)
            psutil_procs.append(proc_smac)
            procs.append(self._proc_smac)
        if self._proc_ensemble is not None:
            proc_ensemble = psutil.Process(self._proc_ensemble.pid)
            psutil_procs.append(proc_ensemble)
            procs.append(self._proc_ensemble)

        if self._queue is not None:
            self._queue.put([time_for_load_data, data_manager_path, psutil_procs])
        else:
            for proc in procs:
                proc.join()

        if self._queue is None:
            self._load_models()

        return self

    def refit(self, X, y):
        if self._keep_models is not True:
            raise ValueError(
                "Predict can only be called if 'keep_models==True'")
        if self.models_ is None or len(self.models_) == 0 or \
                self.ensemble_ is None:
            self._load_models()

        for identifier in self.models_:
            if identifier in self.ensemble_.get_model_identifiers():
                model = self.models_[identifier]
                # This updates the model in place; it can then be used later
                # by the predict method.
                model.fit(X.copy(), y.copy())

        self._can_predict = True

    def predict(self, X):
        return np.argmax(self.predict_proba(X), axis=1)

    def predict_proba(self, X):
        if self._keep_models is not True:
            raise ValueError(
                "Predict can only be called if 'keep_models==True'")
        if not self._can_predict and \
                self._resampling_strategy not in \
                ['holdout', 'holdout-iterative-fit']:
            raise NotImplementedError(
                'Predict is currently only implemented for resampling '
                'strategy holdout.')

        if self.models_ is None or len(self.models_) == 0 or \
                self.ensemble_ is None:
            self._load_models()

        all_predictions = []
        for identifier in self.ensemble_.get_model_identifiers():
            model = self.models_[identifier]

            X_ = X.copy()
            if self._task in REGRESSION_TASKS:
                prediction = model.predict(X_)
            else:
                prediction = model.predict_proba(X_)

            if len(prediction.shape) < 1 or len(X_.shape) < 1 or \
                    X_.shape[0] < 1 or prediction.shape[0] != X_.shape[0]:
                self._logger.warning("Prediction shape for model %s is %s "
                                     "while X_.shape is %s" %
                                     (model, str(prediction.shape),
                                      str(X_.shape)))
            all_predictions.append(prediction)

        if len(all_predictions) == 0:
            raise ValueError('Something went wrong generating the predictions. '
                             'The ensemble should consist of the following '
                             'models: %s, the following models were loaded: '
                             '%s' % (str(list(self.ensemble_.get_model_identifiers())),
                                     str(list(self.models_.keys()))))

        predictions = self.ensemble_.predict(all_predictions)
        return predictions

    def fit_ensemble(self, task=None, metric=None, precision='32',
                     dataset_name=None, ensemble_nbest=None,
                     ensemble_size=None):
        if self._logger is None:
            self._logger = self._get_logger(dataset_name)

        self._proc_ensemble = self._get_ensemble_process(
            1, task, metric, precision, dataset_name, max_iterations=1,
            ensemble_nbest=ensemble_nbest, ensemble_size=ensemble_size)
        self._proc_ensemble.main()

    def _get_ensemble_process(self, time_left_for_ensembles,
                              task=None, metric=None, precision=None,
                              dataset_name=None, max_iterations=-1,
                              ensemble_nbest=None, ensemble_size=None):

        if task is None:
            task = self._task
        if metric is None:
            metric = self._metric
        if precision is None:
            precision = self.precision
        if dataset_name is None:
            dataset_name = self._dataset_name
        if ensemble_nbest is None:
            ensemble_nbest = self._ensemble_nbest
        if ensemble_size is None:
            ensemble_size = self._ensemble_size

        return EnsembleBuilder(autosklearn_tmp_dir=self._tmp_dir,
                               dataset_name=dataset_name,
                               task_type=task,
                               metric=metric,
                               limit=time_left_for_ensembles,
                               output_dir=self._output_dir,
                               ensemble_size=ensemble_size,
                               ensemble_nbest=ensemble_nbest,
                               seed=self._seed,
                               shared_mode=self._shared_mode,
                               precision=precision,
                               max_iterations=max_iterations)

    def _load_models(self):
        if self._shared_mode:
            seed = -1
        else:
            seed = self._seed
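        # In shared mode a seed of -1 is handed to the backend; as used here it
        # acts as a wildcard so that ensembles and models written under any
        # seed can be picked up.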

        self.ensemble_ = self._backend.load_ensemble(seed)
        if self.ensemble_:
            identifiers = self.ensemble_.identifiers_
            self.models_ = self._backend.load_models_by_identifiers(identifiers)
        else:
            self.models_ = self._backend.load_all_models(seed)

        if len(self.models_) == 0:
            raise ValueError('No models fitted!')


    def score(self, X, y):
        # fix: Consider only index 1 of second dimension
        # Don't know if the reshaping should be done there or in calculate_score
        prediction = self.predict_proba(X)
        return calculate_score(y, prediction, self._task,
                               self._metric, self._label_num,
                               logger=self._logger)

    def show_models(self):

        if self.models_ is None or len(self.models_) == 0 or \
                self.ensemble_ is None:
            self._load_models()

        return self.ensemble_.pprint_ensemble_string(self.models_)

    def _save_ensemble_data(self, X, y):
        """Split dataset and store Data for the ensemble script.

        :param X:
        :param y:
        :return:

        """
        task_name = 'LoadData'
        self._start_task(self._stopwatch, task_name)
        _, _, _, y_ensemble = resampling.split_data(X, y)
        self._backend.save_targets_ensemble(y_ensemble)
        self._stop_task(self._stopwatch, task_name)

    def _create_search_space(self, tmp_dir, backend, datamanager,
                             include_estimators=None,
                             include_preprocessors=None):
        task_name = 'CreateConfigSpace'

        self._stopwatch.start_task(task_name)
        configspace_path = os.path.join(tmp_dir, 'space.pcs')
        configuration_space = pipeline.get_configuration_space(
            datamanager.info,
            include_estimators=include_estimators,
            include_preprocessors=include_preprocessors)
        configuration_space = self.configuration_space_created_hook(
            datamanager, configuration_space)
        sp_string = pcs.write(configuration_space)
        backend.write_txt_file(configspace_path, sp_string,
                               'Configuration space')
        self._stopwatch.stop_task(task_name)

        return configuration_space, configspace_path

    def configuration_space_created_hook(self, datamanager, configuration_space):
        return configuration_space

    def get_params(self, deep=True):
        raise NotImplementedError('auto-sklearn does not implement '
                                  'get_params() because it is not intended to '
                                  'be optimized.')

    def set_params(self, deep=True):
        raise NotImplementedError('auto-sklearn does not implement '
                                  'set_params() because it is not intended to '
                                  'be optimized.')

    def __del__(self):
        self._delete_output_directories()

    def _delete_output_directories(self):
        if self.delete_output_folder_after_terminate:
            try:
                shutil.rmtree(self._output_dir)
            except Exception:
                if self._logger is not None:
                    self._logger.warning("Could not delete output dir: %s" %
                                         self._output_dir)
                else:
                    print("Could not delete output dir: %s" %
                          self._output_dir)

        if self.delete_tmp_folder_after_terminate:
            try:
                shutil.rmtree(self._tmp_dir)
            except Exception:
                if self._logger is not None:
                    self._logger.warning("Could not delete tmp dir: %s" %
                                         self._tmp_dir)
                else:
                    print("Could not delete tmp dir: %s" %
                          self._tmp_dir)
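For orientation, here is a minimal, hypothetical driver for the AutoML class shown in this example. The data, directory paths and time budgets are placeholders, and the import paths point at auto-sklearn's internal modules rather than a documented public API, so treat them as assumptions:

import numpy as np

from autosklearn.automl import AutoML
from autosklearn.constants import MULTICLASS_CLASSIFICATION

X = np.random.rand(100, 5)
y = np.random.randint(0, 3, size=100)

automl = AutoML(tmp_dir='/tmp/autosklearn_tmp',     # placeholder directories
                output_dir='/tmp/autosklearn_out',
                time_left_for_this_task=60,         # must be an int (see the type checks above)
                per_run_time_limit=15)

automl.fit(X, y, task=MULTICLASS_CLASSIFICATION, metric='acc_metric')
predictions = automl.predict(X)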
Example No. 28
    def __init__(self,
                 backend,
                 time_left_for_this_task,
                 per_run_time_limit,
                 log_dir=None,
                 initial_configurations_via_metalearning=25,
                 ensemble_size=1,
                 ensemble_nbest=1,
                 seed=1,
                 ml_memory_limit=3072,
                 metadata_directory=None,
                 keep_models=True,
                 debug_mode=False,
                 include_estimators=None,
                 exclude_estimators=None,
                 include_preprocessors=None,
                 exclude_preprocessors=None,
                 resampling_strategy='holdout-iterative-fit',
                 resampling_strategy_arguments=None,
                 delete_tmp_folder_after_terminate=False,
                 delete_output_folder_after_terminate=False,
                 shared_mode=False,
                 precision=32,
                 max_iter_smac=None,
                 acquisition_function='EI',
                 disable_evaluator_output=False,
                 configuration_mode='SMAC'):
        super(AutoML, self).__init__()
        self._backend = backend
        #self._tmp_dir = tmp_dir
        #self._output_dir = output_dir
        self._time_for_task = time_left_for_this_task
        self._per_run_time_limit = per_run_time_limit
        #self._log_dir = log_dir if log_dir is not None else self._tmp_dir
        self._initial_configurations_via_metalearning = \
            initial_configurations_via_metalearning
        self._ensemble_size = ensemble_size
        self._ensemble_nbest = ensemble_nbest
        self._seed = seed
        self._ml_memory_limit = ml_memory_limit
        self._data_memory_limit = None
        self._metadata_directory = metadata_directory
        self._keep_models = keep_models
        self._include_estimators = include_estimators
        self._exclude_estimators = exclude_estimators
        self._include_preprocessors = include_preprocessors
        self._exclude_preprocessors = exclude_preprocessors
        self._resampling_strategy = resampling_strategy
        self._resampling_strategy_arguments = resampling_strategy_arguments \
            if resampling_strategy_arguments is not None else {}
        self._max_iter_smac = max_iter_smac
        #self.delete_tmp_folder_after_terminate = \
        #    delete_tmp_folder_after_terminate
        #self.delete_output_folder_after_terminate = \
        #    delete_output_folder_after_terminate
        self._shared_mode = shared_mode
        self.precision = precision
        self.acquisition_function = acquisition_function
        self._disable_evaluator_output = disable_evaluator_output
        self._configuration_mode = configuration_mode

        self._datamanager = None
        self._dataset_name = None
        self._stopwatch = StopWatch()
        self._logger = None
        self._task = None
        self._metric = None
        self._label_num = None
        self._parser = None
        self.models_ = None
        self.ensemble_ = None
        self._can_predict = False

        self._debug_mode = debug_mode

        if not isinstance(self._time_for_task, int):
            raise ValueError("time_left_for_this_task not of type integer, "
                             "but %s" % str(type(self._time_for_task)))
        if not isinstance(self._per_run_time_limit, int):
            raise ValueError("per_run_time_limit not of type integer, but %s" %
                             str(type(self._per_run_time_limit)))
Example No. 29
    def __init__(self,
                 tmp_dir,
                 output_dir,
                 time_left_for_this_task,
                 per_run_time_limit,
                 log_dir=None,
                 initial_configurations_via_metalearning=25,
                 ensemble_size=1,
                 ensemble_nbest=1,
                 seed=1,
                 ml_memory_limit=3000,
                 metadata_directory=None,
                 queue=None,
                 keep_models=True,
                 debug_mode=False,
                 include_estimators=None,
                 include_preprocessors=None,
                 resampling_strategy='holdout-iterative-fit',
                 resampling_strategy_arguments=None,
                 delete_tmp_folder_after_terminate=False,
                 delete_output_folder_after_terminate=False,
                 shared_mode=False,
                 precision=32,
                 max_iter_smac=None,
                 acquisition_function='EI'):
        super(AutoML, self).__init__()

        self._tmp_dir = tmp_dir
        self._output_dir = output_dir
        self._time_for_task = time_left_for_this_task
        self._per_run_time_limit = per_run_time_limit
        self._log_dir = log_dir if log_dir is not None else self._tmp_dir
        self._initial_configurations_via_metalearning = \
            initial_configurations_via_metalearning
        self._ensemble_size = ensemble_size
        self._ensemble_nbest = ensemble_nbest
        self._seed = seed
        self._ml_memory_limit = ml_memory_limit
        self._data_memory_limit = None
        self._metadata_directory = metadata_directory
        self._queue = queue
        self._keep_models = keep_models
        self._include_estimators = include_estimators
        self._include_preprocessors = include_preprocessors
        self._resampling_strategy = resampling_strategy
        self._resampling_strategy_arguments = resampling_strategy_arguments
        self._max_iter_smac = max_iter_smac
        self.delete_tmp_folder_after_terminate = \
            delete_tmp_folder_after_terminate
        self.delete_output_folder_after_terminate = \
            delete_output_folder_after_terminate
        self._shared_mode = shared_mode
        self.precision = precision
        self.acquisition_function = acquisition_function

        self._datamanager = None
        self._dataset_name = None
        self._stopwatch = StopWatch()
        self._logger = None
        self._task = None
        self._metric = None
        self._label_num = None
        self._parser = None
        self.models_ = None
        self.ensemble_ = None
        self._can_predict = False

        self._debug_mode = debug_mode

        if not isinstance(self._time_for_task, int):
            raise ValueError("time_left_for_this_task not of type integer, "
                             "but %s" % str(type(self._time_for_task)))
        if not isinstance(self._per_run_time_limit, int):
            raise ValueError("per_run_time_limit not of type integer, but %s" %
                             str(type(self._per_run_time_limit)))

        # After assigning and checking variables...
        self._backend = Backend(self._output_dir, self._tmp_dir)
Example No. 30
    def fit(
        self, X, y,
        task,
        metric,
        X_test=None,
        y_test=None,
        feat_type=None,
        dataset_name=None,
        only_return_configuration_space=False,
    ):
        if self._shared_mode:
            # If this fails, it's likely that this is the first call to get
            # the data manager
            try:
                D = self._backend.load_datamanager()
                dataset_name = D.name
            except IOError:
                pass

        if dataset_name is None:
            dataset_name = hash_array_or_matrix(X)

        self._backend.save_start_time(self._seed)
        self._stopwatch = StopWatch()
        self._dataset_name = dataset_name
        self._stopwatch.start_task(self._dataset_name)

        self._logger = self._get_logger(dataset_name)

        if metric is None:
            raise ValueError('No metric given.')
        if not isinstance(metric, Scorer):
            raise ValueError('Metric must be instance of '
                             'autosklearn.metrics.Scorer.')

        if feat_type is not None and len(feat_type) != X.shape[1]:
            raise ValueError('Array feat_type does not have same number of '
                             'variables as X has features. %d vs %d.' %
                             (len(feat_type), X.shape[1]))
        if feat_type is not None and not all([isinstance(f, str)
                                              for f in feat_type]):
            raise ValueError('Array feat_type must only contain strings.')
        if feat_type is not None:
            for ft in feat_type:
                if ft.lower() not in ['categorical', 'numerical']:
                    raise ValueError('Only `Categorical` and `Numerical` are '
                                     'valid feature types, you passed `%s`' % ft)

        self._data_memory_limit = None
        loaded_data_manager = XYDataManager(
            X, y,
            X_test=X_test,
            y_test=y_test,
            task=task,
            feat_type=feat_type,
            dataset_name=dataset_name,
        )

        return self._fit(
            loaded_data_manager,
            metric,
            only_return_configuration_space,
        )
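Example No. 30 reflects a later API revision: metric is now required and must be a Scorer object instead of a string such as 'acc_metric'. A small sketch of that validation in isolation (the import path for Scorer and accuracy follows auto-sklearn's metrics module and should be treated as an assumption for this revision):

from autosklearn.metrics import Scorer, accuracy

def check_metric(metric):
    # Mirrors the checks at the top of fit() above.
    if metric is None:
        raise ValueError('No metric given.')
    if not isinstance(metric, Scorer):
        raise ValueError('Metric must be instance of autosklearn.metrics.Scorer.')

check_metric(accuracy)         # passes: accuracy is a Scorer instance
# check_metric('acc_metric')   # would raise ValueError: strings are no longer accepted here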