def test_predict_mocked(self, rf_mock):
        """Check that mocked per-objective predictions are stacked column-wise.

        ``rf_mock`` is a mock object injected from outside this method
        (presumably by a patch decorator on the forest's predict routine --
        TODO confirm the patched target). Its side effect returns the running
        call number, so the j-th objective must be predicted as ``j + 1``.
        """
        class CountingPredictor(object):
            """Callable that answers every call with its own call number."""

            def __init__(self):
                self.n_calls = 0

            def __call__(self, X):
                self.n_calls += 1
                # One constant vector doubles as mean and as variance.
                constant = np.array([self.n_calls] * X.shape[0])
                return constant, constant

        rf_mock.side_effect = CountingPredictor()

        rng = np.random.RandomState(1)
        features = rng.rand(20, 10)
        targets = rng.rand(10, 3)
        objective_names = ['cost', 'ln(runtime)', 'foo']
        feature_types = np.zeros((10, ), dtype=np.uint)
        model = UncorrelatedMultiObjectiveRandomForestWithInstances(
            objective_names, feature_types)

        # First half of the data is used for training, second half for
        # prediction.
        model.train(features[:10], targets[:10])
        means, variances = model.predict(features[10:])

        self.assertEqual(means.shape, (10, 3))
        self.assertEqual(variances.shape, (10, 3))
        # One call of the mocked routine per objective.
        self.assertEqual(rf_mock.call_count, 3)
        for mean_row, variance_row in zip(means, variances):
            for objective_idx in range(3):
                expected = objective_idx + 1
                self.assertEqual(mean_row[objective_idx], expected)
                self.assertEqual(variance_row[objective_idx], expected)
 def test_train_and_predict_with_rf(self):
     """Train a two-objective model on random data and check output shapes."""
     rng = np.random.RandomState(1)
     features = rng.rand(20, 10)
     targets = rng.rand(10, 2)

     objective_names = ['cost', 'ln(runtime)']
     feature_types = np.zeros((10, ), dtype=np.uint)
     model = UncorrelatedMultiObjectiveRandomForestWithInstances(
         objective_names, feature_types)

     # Train on the first ten rows, predict the remaining ten.
     model.train(features[:10], targets)
     means, variances = model.predict(features[10:])

     expected_shape = (10, 2)
     self.assertEqual(means.shape, expected_shape)
     self.assertEqual(variances.shape, expected_shape)
Esempio n. 3
0
 def test_train_and_predict_with_rf(self):
     """Check constructor pass-through arguments and prediction shapes.

     Builds a two-objective model with explicit bounds, forest keyword
     arguments and a PCA setting, verifies that these end up on the model
     and its estimators, then trains and predicts on random data.
     """
     rng = np.random.RandomState(1)
     features = rng.rand(20, 10)
     targets = rng.rand(10, 2)

     # The same (0, nan) bound pair for each of the ten features.
     feature_bounds = np.array([(0, np.nan)] * 10, dtype=object)
     model = UncorrelatedMultiObjectiveRandomForestWithInstances(
         ['cost', 'ln(runtime)'],
         types=np.zeros((10, ), dtype=np.uint),
         bounds=feature_bounds,
         rf_kwargs={'seed': 1},
         pca_components=5)

     # Both per-objective estimators must have received the seed.
     for estimator_idx in (0, 1):
         self.assertEqual(model.estimators[estimator_idx].seed, 1)
     self.assertEqual(model.pca_components, 5)

     model.train(features[:10], targets)
     means, variances = model.predict(features[10:])
     expected_shape = (10, 2)
     self.assertEqual(means.shape, expected_shape)
     self.assertEqual(variances.shape, expected_shape)
Esempio n. 4
0
 def test_init_EIPS_as_arguments(self):
     """Objects handed to ``SMAC`` must be the ones used by its solver.

     For both run objectives the model, the EIPS acquisition function and
     the run-history transformer are created by hand and passed in; the
     solver is then expected to reference exactly these instances.
     """
     for run_objective in ('runtime', 'quality'):
         self.scenario.run_obj = run_objective
         types, bounds = get_types(self.scenario.cs, None)

         model = UncorrelatedMultiObjectiveRandomForestWithInstances(
             ['cost', 'runtime'], types, bounds)
         acquisition = EIPS(model)
         transformer = RunHistory2EPM4EIPS(self.scenario, 2)

         facade = SMAC(
             self.scenario,
             model=model,
             acquisition_function=acquisition,
             runhistory2epm=transformer,
         )
         solver = facade.solver

         # Identity, not equality: no copies may have been made.
         self.assertIs(model, solver.model)
         self.assertIs(acquisition, solver.acquisition_func)
         self.assertIs(transformer, solver.rh2EPM)
Esempio n. 5
0
 def test_eips(self):
     """Run a short SMBO loop on Branin using the EIPS acquisition function.

     The optimizer is assembled from a two-objective ('cost', 'runtime')
     random forest, the EIPS acquisition function and the matching
     run-history transformer, and is then run for five iterations.
     """
     scenario = Scenario({'cs': test_helpers.get_branin_config_space(),
                          'run_obj': 'quality',
                          'deterministic': True,
                          'output_dir': ''})
     types = get_types(scenario.cs, None)
     umrfwi = UncorrelatedMultiObjectiveRandomForestWithInstances(
         ['cost', 'runtime'], types)
     eips = EIPS(umrfwi)
     rh2EPM = RunHistory2EPM4EIPS(scenario, 2)
     taf = ExecuteTAFunc(test_function)
     smbo = SMBO(scenario, model=umrfwi, acquisition_function=eips,
                 runhistory2epm=rh2EPM, tae_runner=taf,
                 random_configuration_chooser=ChooserNoCoolDown(2.0))
     smbo.run(5)
     # A completed optimization run must leave an incumbent behind. The
     # outcome is asserted instead of being printed and followed by an
     # unconditional exception, which made the test fail on every run.
     self.assertIsNotNone(smbo.incumbent)
def get_eips_object_callback(
        scenario_dict,
        seed,
        ta,
        backend,
        metalearning_configurations,
        runhistory,
):
    """Create a ``SMAC`` object that optimizes with the EIPS acquisition.

    ``scenario_dict`` is extended in place with the pSMAC input directories
    reported by ``backend`` before the scenario is built. ``seed`` is used
    both as the random number generator seed and as the run id.
    ``metalearning_configurations`` is accepted but not used here.

    Returns the configured ``SMAC`` instance.
    """
    scenario_dict['input_psmac_dirs'] = backend.get_smac_output_glob()
    scenario = Scenario(scenario_dict)

    # Run states that are fed to the model as valid observations.
    accepted_states = [
        StatusType.SUCCESS,
        StatusType.MEMOUT,
        StatusType.TIMEOUT,
        StatusType.CRASHED
    ]
    n_hyperparameters = len(scenario.cs.get_hyperparameters())
    runhistory_transformer = RunHistory2EPM4EIPS(
        num_params=n_hyperparameters,
        scenario=scenario,
        success_states=accepted_states,
        impute_censored_data=False,
        impute_state=None
    )

    types, bounds = get_types(scenario.cs, scenario.feature_array)
    # Two objectives: one forest per entry of the name list.
    surrogate = UncorrelatedMultiObjectiveRandomForestWithInstances(
        ['cost', 'runtime'],
        types=types,
        bounds=bounds,
        instance_features=scenario.feature_array,
        rf_kwargs={'seed': 1},
    )
    eips = EIPS(surrogate)

    smac_kwargs = dict(
        runhistory=runhistory,
        scenario=scenario,
        rng=seed,
        tae_runner=ta,
        runhistory2epm=runhistory_transformer,
        model=surrogate,
        acquisition_function=eips,
        run_id=seed,
    )
    return SMAC(**smac_kwargs)
Esempio n. 7
0
    def run_smbo(self, max_iters=1000):
        """Run the complete SMBO procedure for the currently loaded dataset.

        The method works through four stages:

        1. Evaluate the default configuration and additional subset
           defaults on decreasing subsets of the training data; the first
           configuration that succeeds becomes the default configuration.
        2. Compute meta-features, collect meta-learning suggestions and
           load the stored meta-runs into a separate run history.
        3. Evaluate the default and the meta-learning configurations on the
           full data and record them in the run history.
        4. Loop: let SMAC choose the next configurations, evaluate them and
           record the results.

        Parameters
        ----------
        max_iters : int or None
            Maximum number of SMAC-suggested configurations to evaluate in
            stage 4. With ``None`` the loop has no iteration limit and only
            ends when the process is stopped from outside.

        Raises
        ------
        ValueError
            If ``self.acquisition_function`` is neither 'EI' nor 'EIPS'.
        """
        # == first things first: load the datamanager
        self.reset_data_manager()

        # == Initialize SMBO stuff
        # first create a scenario
        seed = self.seed  # TODO
        num_params = len(self.config_space.get_hyperparameters())
        # allocate a run history
        run_history = RunHistory()
        meta_runhistory = RunHistory()
        # Maps a meta-dataset to the (start, exclusive end) row range of its
        # runs inside ``meta_runhistory``.
        meta_runs_dataset_indices = {}
        num_run = self.start_num_run
        instance_id = self.dataset_name + SENTINEL

        # == Train on subset
        #    before doing anything, let us run the default_cfg
        #    on a subset of the available data to ensure that
        #    we at least have some models
        #    we will try three different ratios of decreasing magnitude
        #    in the hope that at least on the last one we will be able
        #    to get a model
        n_data = self.datamanager.data['X_train'].shape[0]
        subset_ratio = 10000. / n_data
        if subset_ratio >= 0.5:
            subset_ratio = 0.33
            subset_ratios = [subset_ratio, subset_ratio * 0.10]
        else:
            subset_ratios = [subset_ratio, 500. / n_data]
        self.logger.info("Training default configurations on a subset of "
                         "%d/%d data points." %
                         (int(n_data * subset_ratio), n_data))

        # the time limit for these function evaluations is rigorously
        # set to only 1/2 of a full function evaluation
        subset_time_limit = max(5, int(self.func_eval_time_limit / 2))
        # the configs we want to run on the data subset are:
        # 1) the default configs
        # 2) a set of configs we selected for training on a subset
        subset_configs = [self.config_space.get_default_configuration()] \
                          + self.collect_additional_subset_defaults()
        subset_config_succesful = [False] * len(subset_configs)
        for subset_config_id, next_config in enumerate(subset_configs):
            for i, ratio in enumerate(subset_ratios):
                self.reset_data_manager()
                n_data_subsample = int(n_data * ratio)

                # run the config, but throw away the result afterwards
                # since this cfg was evaluated only on a subset
                # and we don't want  to confuse SMAC
                self.logger.info(
                    "Starting to evaluate %d on SUBSET "
                    "with size %d and time limit %ds.", num_run,
                    n_data_subsample, subset_time_limit)
                self.logger.info(next_config)
                _info = eval_with_limits(self.datamanager, self.tmp_dir,
                                         next_config, seed, num_run,
                                         self.resampling_strategy,
                                         self.resampling_strategy_args,
                                         self.memory_limit, subset_time_limit,
                                         n_data_subsample)
                (duration, result, _, additional_run_info, status) = _info
                self.logger.info(
                    "Finished evaluating %d. configuration on SUBSET. "
                    "Duration %f; loss %f; status %s; additional run "
                    "info: %s ", num_run, duration, result, str(status),
                    additional_run_info)

                num_run += 1
                if i < len(subset_ratios) - 1:
                    if status != StatusType.SUCCESS:
                        # num_run has already been increased above; the
                        # same configuration is retried with less data
                        self.logger.info(
                            "A CONFIG did not finish "
                            " for subset ratio %f -> going smaller", ratio)
                        continue
                    else:
                        self.logger.info(
                            "Finished SUBSET training sucessfully "
                            "with ratio %f", ratio)
                        subset_config_succesful[subset_config_id] = True
                        break
                else:
                    if status != StatusType.SUCCESS:
                        self.logger.info(
                            "A CONFIG did not finish "
                            " for subset ratio %f.", ratio)
                        continue
                    else:
                        self.logger.info(
                            "Finished SUBSET training sucessfully "
                            "with ratio %f", ratio)
                        subset_config_succesful[subset_config_id] = True
                        break

        # Use the first non-failing configuration from the subsets as the new
        #  default configuration -> this guards us against the random forest
        # failing on large, sparse datasets
        default_cfg = None
        for subset_config_id, next_config in enumerate(subset_configs):
            if subset_config_succesful[subset_config_id]:
                default_cfg = next_config
                break
        if default_cfg is None:
            default_cfg = self.config_space.get_default_configuration()

        # == METALEARNING suggestions
        # we start by evaluating the defaults on the full dataset again
        # and add the suggestions from metalearning behind it

        if self.metadata_directory is None:
            metalearning_directory = os.path.dirname(
                autosklearn.metalearning.__file__)
            # There is no multilabel data in OpenML
            if self.task == MULTILABEL_CLASSIFICATION:
                meta_task = BINARY_CLASSIFICATION
            else:
                meta_task = self.task
            metadata_directory = os.path.join(
                metalearning_directory, 'files', '%s_%s_%s' %
                (METRIC_TO_STRING[self.metric],
                 TASK_TYPES_TO_STRING[meta_task],
                 'sparse' if self.datamanager.info['is_sparse'] else 'dense'))
            self.metadata_directory = metadata_directory

        self.logger.info('Metadata directory: %s', self.metadata_directory)
        meta_base = MetaBase(self.config_space, self.metadata_directory)

        # A quarter of the total budget is reserved for meta-features; what
        # is left after the first calculation is handed to the second one.
        metafeature_calculation_time_limit = int(self.total_walltime_limit / 4)
        metafeature_calculation_start_time = time.time()
        meta_features = self._calculate_metafeatures_with_limits(
            metafeature_calculation_time_limit)
        metafeature_calculation_end_time = time.time()
        metafeature_calculation_time_limit = \
            metafeature_calculation_time_limit - (
            metafeature_calculation_end_time -
            metafeature_calculation_start_time)

        if metafeature_calculation_time_limit < 1:
            self.logger.warning(
                'Time limit for metafeature calculation less '
                'than 1 seconds (%f). Skipping calculation '
                'of metafeatures for encoded dataset.',
                metafeature_calculation_time_limit)
            meta_features_encoded = None
        else:
            self.datamanager.perform1HotEncoding()
            meta_features_encoded = \
                self._calculate_metafeatures_encoded_with_limits(
                    metafeature_calculation_time_limit)

        # In case there is a problem calculating the encoded meta-features
        if meta_features is None:
            if meta_features_encoded is not None:
                meta_features = meta_features_encoded
        else:
            if meta_features_encoded is not None:
                meta_features.metafeature_values.update(
                    meta_features_encoded.metafeature_values)

        if meta_features is not None:
            meta_base.add_dataset(instance_id, meta_features)
            # Do mean imputation of the meta-features - should be done specific
            # for each prediction model!
            all_metafeatures = meta_base.get_metafeatures(
                features=list(meta_features.keys()))
            all_metafeatures.fillna(all_metafeatures.mean(), inplace=True)

            metalearning_configurations = self.collect_metalearning_suggestions(
                meta_base)
            if metalearning_configurations is None:
                metalearning_configurations = []
            self.reset_data_manager()

            self.logger.info('%s', meta_features)

            # Convert meta-features into a dictionary because the scenario
            # expects a dictionary
            meta_features_dict = {}
            for dataset, series in all_metafeatures.iterrows():
                meta_features_dict[dataset] = series.values
            meta_features_list = []
            for meta_feature_name in all_metafeatures.columns:
                meta_features_list.append(
                    meta_features[meta_feature_name].value)
            meta_features_list = np.array(meta_features_list).reshape((1, -1))
            self.logger.info(list(meta_features_dict.keys()))

            meta_runs = meta_base.get_all_runs(METRIC_TO_STRING[self.metric])
            meta_runs_index = 0
            try:
                meta_durations = meta_base.get_all_runs('runtime')
                read_runtime_data = True
            except KeyError:
                read_runtime_data = False
                self.logger.critical('Cannot read runtime data.')
                if self.acquisition_function == 'EIPS':
                    self.logger.critical(
                        'Reverting to acquisition function EI!')
                    self.acquisition_function = 'EI'

            for meta_dataset in meta_runs.index:
                meta_dataset_start_index = meta_runs_index
                for meta_configuration in meta_runs.columns:
                    if np.isfinite(meta_runs.loc[meta_dataset,
                                                 meta_configuration]):
                        try:
                            config = meta_base.get_configuration_from_algorithm_index(
                                meta_configuration)
                            cost = meta_runs.loc[meta_dataset,
                                                 meta_configuration]
                            if read_runtime_data:
                                runtime = meta_durations.loc[
                                    meta_dataset, meta_configuration]
                            else:
                                runtime = 1
                            # TODO read out other status types!
                            meta_runhistory.add(config,
                                                cost,
                                                runtime,
                                                StatusType.SUCCESS,
                                                instance_id=meta_dataset)
                            meta_runs_index += 1
                        except Exception as e:
                            # Best effort: a meta-run that cannot be added
                            # is skipped. Only ``Exception`` is caught so
                            # that KeyboardInterrupt/SystemExit still
                            # propagate.
                            self.logger.debug(
                                'Skipping meta-run of configuration %s on '
                                'dataset %s: %s', meta_configuration,
                                meta_dataset, e)

                # meta_runs_index points one past the last run added for
                # this dataset, i.e. the range is half-open.
                meta_runs_dataset_indices[meta_dataset] = (
                    meta_dataset_start_index, meta_runs_index)
        else:
            if self.acquisition_function == 'EIPS':
                self.logger.critical('Reverting to acquisition function EI!')
                self.acquisition_function = 'EI'
            meta_features_list = []
            meta_features_dict = {}
            metalearning_configurations = []

        self.scenario = AutoMLScenario(self.config_space,
                                       self.total_walltime_limit,
                                       self.func_eval_time_limit,
                                       meta_features_dict, self.tmp_dir,
                                       self.shared_mode)

        types = get_types(self.config_space, self.scenario.feature_array)
        if self.acquisition_function == 'EI':
            rh2EPM = RunHistory2EPM4Cost(num_params=num_params,
                                         scenario=self.scenario,
                                         success_states=None,
                                         impute_censored_data=False,
                                         impute_state=None)
            model = RandomForestWithInstances(
                types,
                instance_features=meta_features_list,
                seed=1,
                num_trees=10)
            smac = SMBO(self.scenario, model=model, rng=seed)
        elif self.acquisition_function == 'EIPS':
            rh2EPM = RunHistory2EPM4EIPS(num_params=num_params,
                                         scenario=self.scenario,
                                         success_states=None,
                                         impute_censored_data=False,
                                         impute_state=None)
            model = UncorrelatedMultiObjectiveRandomForestWithInstances(
                ['cost', 'runtime'],
                types,
                num_trees=10,
                instance_features=meta_features_list,
                seed=1)
            acquisition_function = EIPS(model)
            smac = SMBO(self.scenario,
                        acquisition_function=acquisition_function,
                        model=model,
                        runhistory2epm=rh2EPM,
                        rng=seed)
        else:
            raise ValueError('Unknown acquisition function value %s!' %
                             self.acquisition_function)

        # Build a runtime model
        # runtime_rf = RandomForestWithInstances(types,
        #                                        instance_features=meta_features_list,
        #                                        seed=1, num_trees=10)
        # runtime_rh2EPM = RunHistory2EPM4EIPS(num_params=num_params,
        #                                      scenario=self.scenario,
        #                                      success_states=None,
        #                                      impute_censored_data=False,
        #                                      impute_state=None)
        # X_runtime, y_runtime = runtime_rh2EPM.transform(meta_runhistory)
        # runtime_rf.train(X_runtime, y_runtime[:, 1].flatten())
        X_meta, Y_meta = rh2EPM.transform(meta_runhistory)
        # Transform Y_meta on a per-dataset base
        for meta_dataset in meta_runs_dataset_indices:
            start_index, end_index = meta_runs_dataset_indices[meta_dataset]
            # The stored end index is already exclusive, so it is used as
            # the slice end unchanged; extending it by one would pull the
            # first run of the following dataset into this normalisation.
            if start_index >= end_index:
                # No run was recorded for this dataset; np.min would fail
                # on the empty slice.
                continue
            rows = slice(start_index, end_index)
            Y_meta[rows, 0][Y_meta[rows, 0] > 2.0] = 2.0
            dataset_minimum = np.min(Y_meta[rows, 0])
            Y_meta[rows, 0] = 1 - ((1. - Y_meta[rows, 0]) /
                                   (1. - dataset_minimum))
            Y_meta[rows, 0][Y_meta[rows, 0] > 2] = 2

        # == first, evaluate all metalearning and default configurations
        for i, next_config in enumerate(
            ([default_cfg] + metalearning_configurations)):
            # Do not evaluate default configurations more than once
            if i >= len([default_cfg]) and next_config in [default_cfg]:
                continue

            config_name = 'meta-learning' if i >= len([default_cfg]) \
                else 'default'

            self.logger.info(
                "Starting to evaluate %d. configuration "
                "(%s configuration) with time limit %ds.", num_run,
                config_name, self.func_eval_time_limit)
            self.logger.info(next_config)
            self.reset_data_manager()
            info = eval_with_limits(self.datamanager, self.tmp_dir,
                                    next_config, seed, num_run,
                                    self.resampling_strategy,
                                    self.resampling_strategy_args,
                                    self.memory_limit,
                                    self.func_eval_time_limit)
            (duration, result, _, additional_run_info, status) = info
            run_history.add(config=next_config,
                            cost=result,
                            time=duration,
                            status=status,
                            instance_id=instance_id,
                            seed=seed)
            run_history.update_cost(next_config, result)
            self.logger.info(
                "Finished evaluating %d. configuration. "
                "Duration %f; loss %f; status %s; additional run "
                "info: %s ", num_run, duration, result, str(status),
                additional_run_info)
            num_run += 1
            if smac.incumbent is None:
                smac.incumbent = next_config
            elif result < run_history.get_cost(smac.incumbent):
                smac.incumbent = next_config

            if self.scenario.shared_model:
                pSMAC.write(run_history=run_history,
                            output_directory=self.scenario.output_dir,
                            num_run=self.seed)

        # == after metalearning run SMAC loop
        smac.runhistory = run_history
        smac_iter = 0
        finished = False
        while not finished:
            if self.scenario.shared_model:
                pSMAC.read(run_history=run_history,
                           output_directory=self.scenario.output_dir,
                           configuration_space=self.config_space,
                           logger=self.logger)

            next_configs = []
            # Stays negative when choosing configurations failed.
            time_for_choose_next = -1
            try:
                X_cfg, Y_cfg = rh2EPM.transform(run_history)

                if not run_history.empty():
                    # Update costs by normalization
                    dataset_minimum = np.min(Y_cfg[:, 0])
                    Y_cfg[:, 0] = 1 - ((1. - Y_cfg[:, 0]) /
                                       (1. - dataset_minimum))
                    Y_cfg[:, 0][Y_cfg[:, 0] > 2] = 2

                if len(X_meta) > 0 and len(X_cfg) > 0:
                    pass
                    #X_cfg = np.concatenate((X_meta, X_cfg))
                    #Y_cfg = np.concatenate((Y_meta, Y_cfg))
                elif len(X_meta) > 0:
                    X_cfg = X_meta.copy()
                    Y_cfg = Y_meta.copy()
                elif len(X_cfg) > 0:
                    X_cfg = X_cfg.copy()
                    Y_cfg = Y_cfg.copy()
                else:
                    raise ValueError(
                        'No training data for SMAC random forest!')

                self.logger.info('Using %d training points for SMAC.' %
                                 X_cfg.shape[0])
                choose_next_start_time = time.time()
                next_configs_tmp = smac.choose_next(
                    X_cfg,
                    Y_cfg,
                    num_interleaved_random=110,
                    num_configurations_by_local_search=10,
                    num_configurations_by_random_search_sorted=100)
                time_for_choose_next = time.time() - choose_next_start_time
                self.logger.info('Used %g seconds to find next '
                                 'configurations' % (time_for_choose_next))
                next_configs.extend(next_configs_tmp)
            # TODO put Exception here!
            except Exception as e:
                self.logger.error(e)
                self.logger.error("Error in getting next configurations "
                                  "with SMAC. Using random configuration!")
                next_config = self.config_space.sample_configuration()
                next_configs.append(next_config)

            models_fitted_this_iteration = 0
            start_time_this_iteration = time.time()
            for next_config in next_configs:
                # Input for the (currently disabled) runtime prediction
                # below; the result is not used otherwise.
                x_runtime = impute_inactive_values(next_config)
                x_runtime = impute_inactive_values(x_runtime).get_array()
                # predicted_runtime = runtime_rf.predict_marginalized_over_instances(
                #     x_runtime.reshape((1, -1)))
                # predicted_runtime = np.exp(predicted_runtime[0][0][0]) - 1

                self.logger.info(
                    "Starting to evaluate %d. configuration (from "
                    "SMAC) with time limit %ds.", num_run,
                    self.func_eval_time_limit)
                self.logger.info(next_config)
                self.reset_data_manager()
                info = eval_with_limits(self.datamanager, self.tmp_dir,
                                        next_config, seed, num_run,
                                        self.resampling_strategy,
                                        self.resampling_strategy_args,
                                        self.memory_limit,
                                        self.func_eval_time_limit)
                (duration, result, _, additional_run_info, status) = info
                run_history.add(config=next_config,
                                cost=result,
                                time=duration,
                                status=status,
                                instance_id=instance_id,
                                seed=seed)
                run_history.update_cost(next_config, result)

                #self.logger.info('Predicted runtime %g, true runtime %g',
                #                 predicted_runtime, duration)

                # TODO add unittest to make sure everything works fine and
                # this does not get outdated!
                if smac.incumbent is None:
                    smac.incumbent = next_config
                elif result < run_history.get_cost(smac.incumbent):
                    smac.incumbent = next_config

                self.logger.info(
                    "Finished evaluating %d. configuration. "
                    "Duration: %f; loss: %f; status %s; additional "
                    "run info: %s ", num_run, duration, result, str(status),
                    additional_run_info)
                smac_iter += 1
                num_run += 1

                # The iteration budget is checked before the batching rules
                # below so that leaving the batch early can never skip it:
                # the loop ends once max_iters configurations were run.
                if max_iters is not None and smac_iter >= max_iters:
                    finished = True
                    break

                # Batching rules: keep evaluating configurations from the
                # current batch for roughly as long as it took to choose
                # them, but at least two and at most fifty; only one if
                # choosing failed and a random configuration is used.
                models_fitted_this_iteration += 1
                time_used_this_iteration = time.time(
                ) - start_time_this_iteration
                if models_fitted_this_iteration >= 2 and \
                        time_for_choose_next > 0 and \
                        time_used_this_iteration > time_for_choose_next:
                    break
                elif time_for_choose_next <= 0 and \
                        models_fitted_this_iteration >= 1:
                    break
                elif models_fitted_this_iteration >= 50:
                    break

            if self.scenario.shared_model:
                pSMAC.write(run_history=run_history,
                            output_directory=self.scenario.output_dir,
                            num_run=self.seed)
Esempio n. 8
0
    def run_smbo(self):

        self.watcher.start_task('SMBO')

        # == first things first: load the datamanager
        self.reset_data_manager()

        # == Initialize non-SMBO stuff
        # first create a scenario
        seed = self.seed
        self.config_space.seed(seed)
        num_params = len(self.config_space.get_hyperparameters())
        # allocate a run history
        num_run = self.start_num_run
        instance_id = self.dataset_name + SENTINEL

        # Initialize some SMAC dependencies
        runhistory = RunHistory(aggregate_func=average_cost)
        # meta_runhistory = RunHistory(aggregate_func=average_cost)
        # meta_runs_dataset_indices = {}

        # == METALEARNING suggestions
        # we start by evaluating the defaults on the full dataset again
        # and add the suggestions from metalearning behind it

        if self.num_metalearning_cfgs > 0:
            if self.metadata_directory is None:
                metalearning_directory = os.path.dirname(
                    autosklearn.metalearning.__file__)
                # There is no multilabel data in OpenML
                if self.task == MULTILABEL_CLASSIFICATION:
                    meta_task = BINARY_CLASSIFICATION
                else:
                    meta_task = self.task
                metadata_directory = os.path.join(
                    metalearning_directory, 'files', '%s_%s_%s' %
                    (METRIC_TO_STRING[self.metric],
                     TASK_TYPES_TO_STRING[meta_task], 'sparse'
                     if self.datamanager.info['is_sparse'] else 'dense'))
                self.metadata_directory = metadata_directory

            self.logger.info('Metadata directory: %s', self.metadata_directory)
            meta_base = MetaBase(self.config_space, self.metadata_directory)

            metafeature_calculation_time_limit = int(
                self.total_walltime_limit / 4)
            metafeature_calculation_start_time = time.time()
            meta_features = self._calculate_metafeatures_with_limits(
                metafeature_calculation_time_limit)
            metafeature_calculation_end_time = time.time()
            metafeature_calculation_time_limit = \
                metafeature_calculation_time_limit - (
                metafeature_calculation_end_time -
                metafeature_calculation_start_time)

            if metafeature_calculation_time_limit < 1:
                self.logger.warning(
                    'Time limit for metafeature calculation less '
                    'than 1 seconds (%f). Skipping calculation '
                    'of metafeatures for encoded dataset.',
                    metafeature_calculation_time_limit)
                meta_features_encoded = None
            else:
                with warnings.catch_warnings():
                    warnings.showwarning = self._send_warnings_to_log
                    self.datamanager.perform1HotEncoding()
                meta_features_encoded = \
                    self._calculate_metafeatures_encoded_with_limits(
                        metafeature_calculation_time_limit)

            # In case there is a problem calculating the encoded meta-features
            if meta_features is None:
                if meta_features_encoded is not None:
                    meta_features = meta_features_encoded
            else:
                if meta_features_encoded is not None:
                    meta_features.metafeature_values.update(
                        meta_features_encoded.metafeature_values)

            if meta_features is not None:
                meta_base.add_dataset(instance_id, meta_features)
                # Do mean imputation of the meta-features - should be done specific
                # for each prediction model!
                all_metafeatures = meta_base.get_metafeatures(
                    features=list(meta_features.keys()))
                all_metafeatures.fillna(all_metafeatures.mean(), inplace=True)

                with warnings.catch_warnings():
                    warnings.showwarning = self._send_warnings_to_log
                    metalearning_configurations = self.collect_metalearning_suggestions(
                        meta_base)
                if metalearning_configurations is None:
                    metalearning_configurations = []
                self.reset_data_manager()

                self.logger.info('%s', meta_features)

                # Convert meta-features into a dictionary because the scenario
                # expects a dictionary
                meta_features_dict = {}
                for dataset, series in all_metafeatures.iterrows():
                    meta_features_dict[dataset] = series.values
                meta_features_list = []
                for meta_feature_name in all_metafeatures.columns:
                    meta_features_list.append(
                        meta_features[meta_feature_name].value)
                meta_features_list = np.array(meta_features_list).reshape(
                    (1, -1))
                self.logger.info(list(meta_features_dict.keys()))

                # meta_runs = meta_base.get_all_runs(METRIC_TO_STRING[self.metric])
                # meta_runs_index = 0
                # try:
                #    meta_durations = meta_base.get_all_runs('runtime')
                #    read_runtime_data = True
                # except KeyError:
                #    read_runtime_data = False
                #    self.logger.critical('Cannot read runtime data.')
                #    if self.acquisition_function == 'EIPS':
                #        self.logger.critical('Reverting to acquisition function EI!')
                #        self.acquisition_function = 'EI'

                # for meta_dataset in meta_runs.index:
                #     meta_dataset_start_index = meta_runs_index
                #     for meta_configuration in meta_runs.columns:
                #         if np.isfinite(meta_runs.loc[meta_dataset, meta_configuration]):
                #             try:
                #                 config = meta_base.get_configuration_from_algorithm_index(
                #                     meta_configuration)
                #                 cost = meta_runs.loc[meta_dataset, meta_configuration]
                #                 if read_runtime_data:
                #                     runtime = meta_durations.loc[meta_dataset,
                #                                                  meta_configuration]
                #                 else:
                #                     runtime = 1
                #                 # TODO read out other status types!
                #                 meta_runhistory.add(config, cost, runtime,
                #                                     StatusType.SUCCESS,
                #                                     instance_id=meta_dataset)
                #                 meta_runs_index += 1
                #             except:
                #                 # TODO maybe add warning
                #                 pass
                #
                #     meta_runs_dataset_indices[meta_dataset] = (
                #         meta_dataset_start_index, meta_runs_index)

        else:
            meta_features = None

        if meta_features is None:
            if self.acquisition_function == 'EIPS':
                self.logger.critical('Reverting to acquisition function EI!')
                self.acquisition_function = 'EI'
            meta_features_list = []
            meta_features_dict = {}
            metalearning_configurations = []

        if self.resampling_strategy in [
                'partial-cv', 'partial-cv-iterative-fit'
        ]:
            num_folds = self.resampling_strategy_args['folds']
            instances = [[fold_number] for fold_number in range(num_folds)]
        else:
            instances = None

        startup_time = self.watcher.wall_elapsed(self.dataset_name)
        total_walltime_limit = self.total_walltime_limit - startup_time - 5
        scenario_dict = {
            'cs': self.config_space,
            'cutoff-time': self.func_eval_time_limit,
            'memory-limit': self.memory_limit,
            'wallclock-limit': total_walltime_limit,
            # 'instances': [[name] for name in meta_features_dict],
            'output-dir': self.backend.temporary_directory,
            'shared-model': self.shared_mode,
            'run-obj': 'quality',
            'deterministic': 'true',
            'instances': instances
        }

        if self.configuration_mode == 'RANDOM':
            scenario_dict['minR'] = len(
                instances) if instances is not None else 1
            scenario_dict['initial_incumbent'] = 'RANDOM'

        self.scenario = Scenario(scenario_dict)

        # TODO rebuild target algorithm to be it's own target algorithm
        # evaluator, which takes into account that a run can be killed prior
        # to the model being fully fitted; thus putting intermediate results
        # into a queue and querying them once the time is over
        exclude = dict()
        include = dict()
        if self.include_preprocessors is not None and \
                self.exclude_preprocessors is not None:
            raise ValueError('Cannot specify include_preprocessors and '
                             'exclude_preprocessors.')
        elif self.include_preprocessors is not None:
            include['preprocessor'] = self.include_preprocessors
        elif self.exclude_preprocessors is not None:
            exclude['preprocessor'] = self.exclude_preprocessors
        if self.include_estimators is not None and \
                self.exclude_preprocessors is not None:
            raise ValueError('Cannot specify include_estimators and '
                             'exclude_estimators.')
        elif self.include_estimators is not None:
            if self.task in CLASSIFICATION_TASKS:
                include['classifier'] = self.include_estimators
            elif self.task in REGRESSION_TASKS:
                include['regressor'] = self.include_estimators
            else:
                raise ValueError(self.task)
        elif self.exclude_estimators is not None:
            if self.task in CLASSIFICATION_TASKS:
                exclude['classifier'] = self.exclude_estimators
            elif self.task in REGRESSION_TASKS:
                exclude['regressor'] = self.exclude_estimators
            else:
                raise ValueError(self.task)

        ta = ExecuteTaFuncWithQueue(
            backend=self.backend,
            autosklearn_seed=seed,
            resampling_strategy=self.resampling_strategy,
            initial_num_run=num_run,
            logger=self.logger,
            include=include,
            exclude=exclude,
            memory_limit=self.memory_limit,
            disable_file_output=self.disable_file_output,
            **self.resampling_strategy_args)

        types = get_types(self.config_space, self.scenario.feature_array)

        # TODO extract generation of SMAC object into it's own function for
        # testing
        if self.acquisition_function == 'EI':
            model = RandomForestWithInstances(
                types,
                #instance_features=meta_features_list,
                seed=1,
                num_trees=10)
            rh2EPM = RunHistory2EPM4Cost(num_params=num_params,
                                         scenario=self.scenario,
                                         success_states=[
                                             StatusType.SUCCESS,
                                             StatusType.MEMOUT,
                                             StatusType.TIMEOUT
                                         ],
                                         impute_censored_data=False,
                                         impute_state=None)
            _smac_arguments = dict(scenario=self.scenario,
                                   model=model,
                                   rng=seed,
                                   runhistory2epm=rh2EPM,
                                   tae_runner=ta,
                                   runhistory=runhistory)
        elif self.acquisition_function == 'EIPS':
            rh2EPM = RunHistory2EPM4EIPS(num_params=num_params,
                                         scenario=self.scenario,
                                         success_states=[
                                             StatusType.SUCCESS,
                                             StatusType.MEMOUT,
                                             StatusType.TIMEOUT
                                         ],
                                         impute_censored_data=False,
                                         impute_state=None)
            model = UncorrelatedMultiObjectiveRandomForestWithInstances(
                ['cost', 'runtime'],
                types,
                num_trees=10,
                instance_features=meta_features_list,
                seed=1)
            acquisition_function = EIPS(model)
            _smac_arguments = dict(scenario=self.scenario,
                                   model=model,
                                   rng=seed,
                                   tae_runner=ta,
                                   runhistory2epm=rh2EPM,
                                   runhistory=runhistory,
                                   acquisition_function=acquisition_function)
        else:
            raise ValueError('Unknown acquisition function value %s!' %
                             self.acquisition_function)

        if self.configuration_mode == 'SMAC':
            smac = SMAC(**_smac_arguments)
        elif self.configuration_mode in ['ROAR', 'RANDOM']:
            for not_in_roar in ['runhistory2epm', 'model']:
                if not_in_roar in _smac_arguments:
                    del _smac_arguments[not_in_roar]
            smac = ROAR(**_smac_arguments)
        else:
            raise ValueError(self.configuration_mode)

        # Build a runtime model
        # runtime_rf = RandomForestWithInstances(types,
        #                                        instance_features=meta_features_list,
        #                                        seed=1, num_trees=10)
        # runtime_rh2EPM = RunHistory2EPM4EIPS(num_params=num_params,
        #                                      scenario=self.scenario,
        #                                      success_states=None,
        #                                      impute_censored_data=False,
        #                                      impute_state=None)
        # X_runtime, y_runtime = runtime_rh2EPM.transform(meta_runhistory)
        # runtime_rf.train(X_runtime, y_runtime[:, 1].flatten())
        # X_meta, Y_meta = rh2EPM.transform(meta_runhistory)
        # # Transform Y_meta on a per-dataset base
        # for meta_dataset in meta_runs_dataset_indices:
        #     start_index, end_index = meta_runs_dataset_indices[meta_dataset]
        #     end_index += 1  # Python indexing
        #     Y_meta[start_index:end_index, 0]\
        #         [Y_meta[start_index:end_index, 0] >2.0] =  2.0
        #     dataset_minimum = np.min(Y_meta[start_index:end_index, 0])
        #     Y_meta[start_index:end_index, 0] = 1 - (
        #         (1. - Y_meta[start_index:end_index, 0]) /
        #         (1. - dataset_minimum))
        #     Y_meta[start_index:end_index, 0]\
        #           [Y_meta[start_index:end_index, 0] > 2] = 2

        smac.solver.stats.start_timing()
        # == first, evaluate all metelearning and default configurations
        smac.solver.incumbent = smac.solver.initial_design.run()

        for challenger in metalearning_configurations:

            smac.solver.incumbent, inc_perf = smac.solver.intensifier.intensify(
                challengers=[challenger],
                incumbent=smac.solver.incumbent,
                run_history=smac.solver.runhistory,
                aggregate_func=smac.solver.aggregate_func,
                time_bound=self.total_walltime_limit)

            if smac.solver.scenario.shared_model:
                pSMAC.write(run_history=smac.solver.runhistory,
                            output_directory=smac.solver.scenario.output_dir,
                            num_run=self.seed)

            if smac.solver.stats.is_budget_exhausted():
                break

        # == after metalearning run SMAC loop
        while True:

            if smac.solver.scenario.shared_model:
                pSMAC.read(run_history=smac.solver.runhistory,
                           output_directory=self.scenario.output_dir,
                           configuration_space=self.config_space,
                           logger=self.logger)

            choose_next_start_time = time.time()
            try:
                challengers = self.choose_next(smac)
            except Exception as e:
                self.logger.error(e)
                self.logger.error("Error in getting next configurations "
                                  "with SMAC. Using random configuration!")
                next_config = self.config_space.sample_configuration()
                challengers = [next_config]
            time_for_choose_next = time.time() - choose_next_start_time
            self.logger.info('Used %g seconds to find next '
                             'configurations' % (time_for_choose_next))

            time_for_choose_next = max(time_for_choose_next, 1.0)
            smac.solver.incumbent, inc_perf = smac.solver.intensifier.intensify(
                challengers=challengers,
                incumbent=smac.solver.incumbent,
                run_history=smac.solver.runhistory,
                aggregate_func=smac.solver.aggregate_func,
                time_bound=time_for_choose_next)

            if smac.solver.scenario.shared_model:
                pSMAC.write(run_history=smac.solver.runhistory,
                            output_directory=smac.solver.scenario.output_dir,
                            num_run=self.seed)

            if smac.solver.stats.is_budget_exhausted():
                break

        self.runhistory = smac.solver.runhistory
        self.trajectory = smac.solver.intensifier.traj_logger.trajectory

        return self.runhistory, self.trajectory
Esempio n. 9
0
    def build_pc_smbo(self,
                      tae_runner,
                      stats,
                      scenario,
                      runhistory,
                      aggregate_func,
                      acq_func_name,
                      model_target_names,
                      logging_directory,
                      double_intensification=False,
                      constant_pipeline_steps=None,
                      variable_pipeline_steps=None,
                      cached_pipeline_steps=None,
                      seed=None,
                      intensification_instances=None,
                      num_marginalized_configurations_by_random_search=20,
                      num_configs_for_marginalization=40,
                      random_splitting_number=5,
                      random_splitting_enabled=False):

        # Build intensifier
        rng = np.random.RandomState(seed)
        traj_logger = TrajLogger(logging_directory, stats)
        intensifier = Intensifier(tae_runner=tae_runner,
                                  stats=stats,
                                  traj_logger=traj_logger,
                                  rng=rng,
                                  cutoff=scenario.cutoff,
                                  deterministic=scenario.deterministic,
                                  run_obj_time=scenario.run_obj == "runtime",
                                  run_limit=scenario.ta_run_limit,
                                  instances=intensification_instances,
                                  maxR=len(intensification_instances))

        # Build model
        types, bounds = get_types(scenario.cs, scenario.feature_array)
        #types = get_types(scenario.cs)
        if len(model_target_names) > 1:
            # model_target_names = ['cost','time']
            model = UncorrelatedMultiObjectiveRandomForestWithInstances(
                target_names=model_target_names, bounds=bounds, types=types)
            # UncorrelatedMultiObjectiveRandomForestWithInstances(target_names=model_target_names,
            #                                                    types=types)
        elif len(model_target_names) == 1:
            model = RandomForestWithInstances(types=types, bounds=bounds)
        else:
            model = RandomEPM(rng=rng)
            # model = RandomForestWithInstances(types=types)

        # Build acquisition function, runhistory2epm and local search
        num_params = len(scenario.cs.get_hyperparameters())
        if acq_func_name in ["ei", "pc-ei"]:
            acquisition_func = EI(model)
            acq_func_wrapper = PCAquisitionFunctionWrapper(
                acquisition_func=acquisition_func,
                config_space=scenario.cs,
                runhistory=runhistory,
                constant_pipeline_steps=constant_pipeline_steps,
                variable_pipeline_steps=variable_pipeline_steps)
            runhistory2epm = RunHistory2EPM4Cost(
                scenario, num_params, success_states=[StatusType.SUCCESS])
            local_search = LocalSearch(acquisition_function=acq_func_wrapper,
                                       config_space=scenario.cs)
            select_configuration = SelectConfigurations(
                scenario=scenario,
                stats=stats,
                runhistory=runhistory,
                model=model,
                acq_optimizer=local_search,
                acquisition_func=acq_func_wrapper,
                rng=rng,
                constant_pipeline_steps=constant_pipeline_steps,
                variable_pipeline_steps=variable_pipeline_steps)
        elif acq_func_name in ["m-ei", "pc-m-ei"]:
            #acquisition_func = MEI(model)
            acquisition_func = EI(model)
            acq_func_wrapper = PCAquisitionFunctionWrapper(
                acquisition_func=acquisition_func,
                config_space=scenario.cs,
                runhistory=runhistory,
                constant_pipeline_steps=constant_pipeline_steps,
                variable_pipeline_steps=variable_pipeline_steps)
            runhistory2epm = RunHistory2EPM4Cost(
                scenario, num_params, success_states=[StatusType.SUCCESS])
            local_search = LocalSearch(acquisition_function=acq_func_wrapper,
                                       config_space=scenario.cs)
            # TODO: num_configs_for_marginalization
            select_configuration = SelectConfigurationsWithMarginalization(
                scenario=scenario,
                stats=stats,
                runhistory=runhistory,
                model=model,
                acq_optimizer=local_search,
                acquisition_func=acq_func_wrapper,
                rng=rng,
                constant_pipeline_steps=constant_pipeline_steps,
                variable_pipeline_steps=variable_pipeline_steps,
                num_marginalized_configurations_by_random_search=
                num_marginalized_configurations_by_random_search,
                num_configs_for_marginalization=num_configs_for_marginalization
            )
        elif acq_func_name in ['eips', 'pc-eips']:
            acquisition_func = EIPS(model)
            acq_func_wrapper = PCAquisitionFunctionWrapper(
                acquisition_func=acquisition_func,
                config_space=scenario.cs,
                runhistory=runhistory,
                constant_pipeline_steps=constant_pipeline_steps,
                variable_pipeline_steps=variable_pipeline_steps)
            runhistory2epm = RunHistory2EPM4EIPS(
                scenario, num_params, success_states=[StatusType.SUCCESS])
            local_search = LocalSearch(acquisition_function=acq_func_wrapper,
                                       config_space=scenario.cs)
            select_configuration = SelectConfigurations(
                scenario=scenario,
                stats=stats,
                runhistory=runhistory,
                model=model,
                acq_optimizer=local_search,
                acquisition_func=acq_func_wrapper,
                rng=rng,
                constant_pipeline_steps=constant_pipeline_steps,
                variable_pipeline_steps=variable_pipeline_steps)
        elif acq_func_name in ["m-eips", "pc-m-eips"]:
            acquisition_func = EIPS(model)
            acq_func_wrapper = PCAquisitionFunctionWrapper(
                acquisition_func=acquisition_func,
                config_space=scenario.cs,
                runhistory=runhistory,
                constant_pipeline_steps=constant_pipeline_steps,
                variable_pipeline_steps=variable_pipeline_steps)
            runhistory2epm = RunHistory2EPM4EIPS(
                scenario, num_params, success_states=[StatusType.SUCCESS])
            local_search = LocalSearch(acquisition_function=acq_func_wrapper,
                                       config_space=scenario.cs)
            # TODO: num_configs_for_marginalization
            select_configuration = SelectConfigurationsWithMarginalization(
                scenario=scenario,
                stats=stats,
                runhistory=runhistory,
                model=model,
                acq_optimizer=local_search,
                acquisition_func=acq_func_wrapper,
                rng=rng,
                constant_pipeline_steps=constant_pipeline_steps,
                variable_pipeline_steps=variable_pipeline_steps,
                num_marginalized_configurations_by_random_search=
                num_marginalized_configurations_by_random_search,
                num_configs_for_marginalization=num_configs_for_marginalization
            )
        elif acq_func_name == 'pceips':
            acquisition_func = PCEIPS(model)
            acq_func_wrapper = PCAquisitionFunctionWrapperWithCachingReduction(
                acquisition_func=acquisition_func,
                config_space=scenario.cs,
                runhistory=runhistory,
                constant_pipeline_steps=constant_pipeline_steps,
                variable_pipeline_steps=variable_pipeline_steps,
                cached_pipeline_steps=cached_pipeline_steps)
            runhistory2epm = RunHistory2EPM4EIPS(
                scenario, num_params, success_states=[StatusType.SUCCESS])
            local_search = LocalSearch(acquisition_function=acq_func_wrapper,
                                       config_space=scenario.cs)
            if constant_pipeline_steps == None or variable_pipeline_steps == None or cached_pipeline_steps == None:
                raise ValueError(
                    "Constant_pipeline_steps and variable pipeline steps should not be none\
                                    when using PCEIPS")
            select_configuration = SelectConfigurations(
                scenario=scenario,
                stats=stats,
                runhistory=runhistory,
                model=model,
                acq_optimizer=local_search,
                acquisition_func=acq_func_wrapper,
                rng=rng,
                constant_pipeline_steps=constant_pipeline_steps,
                variable_pipeline_steps=variable_pipeline_steps)
        elif acq_func_name == 'pc-m-pceips':
            acquisition_func = PCEIPS(model)
            acq_func_wrapper = PCAquisitionFunctionWrapperWithCachingReduction(
                acquisition_func=acquisition_func,
                config_space=scenario.cs,
                runhistory=runhistory,
                constant_pipeline_steps=constant_pipeline_steps,
                variable_pipeline_steps=variable_pipeline_steps,
                cached_pipeline_steps=cached_pipeline_steps)
            runhistory2epm = RunHistory2EPM4EIPS(
                scenario, num_params, success_states=[StatusType.SUCCESS])
            local_search = LocalSearch(acquisition_function=acq_func_wrapper,
                                       config_space=scenario.cs)
            if constant_pipeline_steps == None or variable_pipeline_steps == None or cached_pipeline_steps == None:
                raise ValueError(
                    "Constant_pipeline_steps and variable pipeline steps should not be none\
                                    when using PCEIPS")
            select_configuration = SelectConfigurationsWithMarginalization(
                scenario=scenario,
                stats=stats,
                runhistory=runhistory,
                model=model,
                acq_optimizer=local_search,
                acquisition_func=acq_func_wrapper,
                rng=rng,
                constant_pipeline_steps=constant_pipeline_steps,
                variable_pipeline_steps=variable_pipeline_steps,
                num_marginalized_configurations_by_random_search=
                num_marginalized_configurations_by_random_search,
                num_configs_for_marginalization=num_configs_for_marginalization
            )
        elif acq_func_name == "roar":
            runhistory2epm = RunHistory2EPM4Cost(
                scenario, num_params, success_states=[StatusType.SUCCESS])
            select_configuration = SelectConfigurationsRandom(
                scenario=scenario)
        elif acq_func_name == "pc-roar-mrs":
            runhistory2epm = RunHistory2EPM4Cost(
                scenario, num_params, success_states=[StatusType.SUCCESS])
            select_configuration = SelectConfigurationsMRS(
                scenario=scenario,
                constant_pipeline_steps=constant_pipeline_steps,
                variable_pipeline_steps=variable_pipeline_steps,
                splitting_number=random_splitting_number,
                random_splitting_enabled=random_splitting_enabled)
        elif acq_func_name == "pc-roar-sigmoid-rs":
            runhistory2epm = RunHistory2EPM4Cost(
                scenario, num_params, success_states=[StatusType.SUCCESS])
            select_configuration = SelectConfigurationsSigmoidRS(
                scenario=scenario,
                constant_pipeline_steps=constant_pipeline_steps,
                variable_pipeline_steps=variable_pipeline_steps,
                fraction=random_splitting_number)
        else:
            # Not a valid acquisition function
            raise ValueError("The provided acquisition function is not valid")

        # Build initial design
        # initial_design = RandomConfiguration(tae_runner=tae_runner,
        #                                      scenario=scenario,
        #                                      stats=stats,
        #                                      traj_logger=traj_logger,
        #                                      rng=rng)
        initial_configs = scenario.cs.sample_configuration(size=2)
        for config in initial_configs:
            config._populate_values()
        initial_design = MultiConfigInitialDesign(
            tae_runner=tae_runner,
            scenario=scenario,
            stats=stats,
            traj_logger=traj_logger,
            runhistory=runhistory,
            rng=rng,
            configs=initial_configs,
            intensifier=intensifier,
            aggregate_func=aggregate_func)

        # run id
        num_run = rng.randint(1234567980)

        # Build pc_smbo
        if acq_func_name not in ['pc-roar-sigmoid-rs']:
            smbo = PCSMBO(scenario=scenario,
                          stats=stats,
                          initial_design=initial_design,
                          runhistory=runhistory,
                          runhistory2epm=runhistory2epm,
                          intensifier=intensifier,
                          aggregate_func=aggregate_func,
                          num_run=num_run,
                          model=model,
                          rng=rng,
                          select_configuration=select_configuration,
                          double_intensification=double_intensification)
        else:
            smbo = PCSMBOSigmoidRandomSearch(
                scenario=scenario,
                stats=stats,
                initial_design=initial_design,
                runhistory=runhistory,
                runhistory2epm=runhistory2epm,
                intensifier=intensifier,
                aggregate_func=aggregate_func,
                num_run=num_run,
                model=model,
                rng=rng,
                select_configuration=select_configuration)

        return smbo