    def model(self, model_short_name='urfi'):
        if model_short_name not in ['urfi', 'rfi']:
            raise ValueError(
                'Specified model %s does not exist or is not supported!' %
                model_short_name)
        elif model_short_name == 'rfi':
            self.types, self.bounds = get_types(self.scenario.cs,
                                                self.scenario.feature_array)
            self._model = RandomForestWithInstances(
                self.types,
                self.bounds,
                instance_features=self.scenario.feature_array,
                seed=12345)
        elif model_short_name == 'urfi':
            if not self._preprocessed:
                self.types, self.bounds = get_types(
                    self.scenario.cs, self.scenario.feature_array)
                self._model = UnloggedEPARXrfi(
                    self.types,
                    self.bounds,
                    instance_features=self.scenario.feature_array,
                    seed=12345,
                    cutoff=self.cutoff,
                    threshold=self.threshold)
            else:
                self.types, self.bounds = get_types(self.scenario.cs, None)
                self._model = Unloggedrfwi(self.types,
                                           self.bounds,
                                           instance_features=None,
                                           seed=12345)
        self._model.rf_opts.compute_oob_error = True
Example #2
def optimize(scenario, run, forest=False, seed=8, ratio=0.8):
    types, bounds = get_types(scenario.cs, scenario.feature_array)
    rfr = RandomForestWithInstances(types=types, bounds=bounds, instance_features=scenario.feature_array, seed=seed)
    ei = EI(model=rfr)
    if forest:
        optimizer = ForestSearch(ei, scenario.cs, ratio=ratio)
    else:
        optimizer = InterleavedLocalAndRandomSearch(ei, scenario.cs)

    scenario.output_dir = "%s_%s_%d_%f" % ("./logs/run_", "forest_" if forest else "random_", seed, time.time())
    smac = SMAC(
        scenario=scenario,
        rng=np.random.RandomState(seed),
        model=rfr,
        acquisition_function=ei,
        acquisition_function_optimizer=optimizer,
        tae_runner=run,
    )

    try:
        incumbent = smac.optimize()
    finally:
        incumbent = smac.solver.incumbent

    return smac.get_tae_runner().run(incumbent, 1)[1]
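
A minimal sketch of how the optimize helper above might be driven; the toy objective, the hyperparameter names and the scenario options are illustrative (quickstart-style) rather than part of the original example, exact option names can differ between SMAC versions, and imports are omitted as in the other snippets.

def rosenbrock_2d(cfg):
    # toy target function: receives a configuration, returns a cost to minimize
    x1, x2 = cfg["x1"], cfg["x2"]
    return 100. * (x2 - x1 ** 2.) ** 2. + (1. - x1) ** 2.

cs = ConfigurationSpace()
cs.add_hyperparameter(UniformFloatHyperparameter("x1", -5, 10, default_value=0))
cs.add_hyperparameter(UniformFloatHyperparameter("x2", -5, 10, default_value=0))

scenario = Scenario({"run_obj": "quality",     # optimize solution quality, not runtime
                     "runcount-limit": 20,     # budget: at most 20 function evaluations
                     "cs": cs,
                     "deterministic": "true"})

# run the helper defined above with the forest-based acquisition optimizer
cost = optimize(scenario, rosenbrock_2d, forest=True, seed=3)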
Example #3
    def setUp(self):
        logging.basicConfig(level=logging.DEBUG)
        self.cs = ConfigurationSpace()
        self.cs.add_hyperparameter(CategoricalHyperparameter(
                name="cat_a_b", choices=["a", "b"], default_value="a"))
        self.cs.add_hyperparameter(UniformFloatHyperparameter(
                name="float_0_1", lower=0, upper=1, default_value=0.5))
        self.cs.add_hyperparameter(UniformIntegerHyperparameter(
                name='integer_0_100', lower=-10, upper=10, default_value=0))

        self.rh = runhistory.RunHistory(aggregate_func=average_cost)
        rs = numpy.random.RandomState(1)
        to_count = 0
        cn_count = 0
        for i in range(500):
            config, seed, runtime, status, instance_id = \
                generate_config(cs=self.cs, rs=rs)
            if runtime == 40:
                to_count += 1
            if runtime < 40 and status == StatusType.TIMEOUT:
                cn_count += 1
            self.rh.add(config=config, cost=runtime, time=runtime,
                        status=status, instance_id=instance_id,
                        seed=seed, additional_info=None)
        print("%d TIMEOUTs, %d censored" % (to_count, cn_count))

        self.scen = Scen()
        self.scen.run_obj = "runtime"
        self.scen.overall_obj = "par10"
        self.scen.cutoff = 40

        types, bounds = get_types(self.cs, None)
        self.model = RandomForestWithInstances(
                types=types, bounds=bounds,
                instance_features=None, seed=1234567980)
Example #4
    def testRandomImputation(self):
        rs = numpy.random.RandomState(1)

        for i in range(0, 150, 15):
            # First random imputation sanity check
            num_samples = max(1, i * 10)
            num_feat = max(1, i)
            num_censored = int(num_samples * 0.1)
            X = rs.rand(num_samples, num_feat)
            y = numpy.sin(X[:, 0:1])

            cutoff = max(y) * 0.9
            y[y > cutoff] = cutoff

            # We have some cen data
            cen_X = X[:num_censored, :]
            cen_y = y[:num_censored]
            uncen_X = X[num_censored:, :]
            uncen_y = y[num_censored:]

            cen_y /= 2

            cs = ConfigurationSpace()
            for i in range(num_feat):
                cs.add_hyperparameter(
                    UniformFloatHyperparameter(name="a_%d" % i,
                                               lower=0,
                                               upper=1,
                                               default_value=0.5))

            types, bounds = get_types(cs, None)
            print(types)
            print(bounds)
            print('#' * 120)
            print(cen_X)
            print(uncen_X)
            print('~' * 120)
            self.model = RandomForestWithInstances(types=types,
                                                   bounds=bounds,
                                                   instance_features=None,
                                                   seed=1234567980)
            imputor = rfr_imputator.RFRImputator(rng=rs,
                                                 cutoff=cutoff,
                                                 threshold=cutoff * 10,
                                                 change_threshold=0.01,
                                                 max_iter=5,
                                                 model=self.model)

            imp_y = imputor.impute(censored_X=cen_X,
                                   censored_y=cen_y,
                                   uncensored_X=uncen_X,
                                   uncensored_y=uncen_y)

            if imp_y is None:
                continue

            for idx in range(cen_y.shape[0]):
                self.assertGreater(imp_y[idx], cen_y[idx])
            self.assertTrue(numpy.isfinite(imp_y).all())
Example #5
    def test_init_EIPS_as_arguments(self):
        for objective in ['runtime', 'quality']:
            self.scenario.run_obj = objective
            types, bounds = get_types(self.scenario.cs, None)
            umrfwi = UncorrelatedMultiObjectiveRandomForestWithInstances(
                ['cost', 'runtime'], types, bounds)
            eips = EIPS(umrfwi)
            rh2EPM = RunHistory2EPM4EIPS(self.scenario, 2)
            smbo = SMAC(self.scenario, model=umrfwi, acquisition_function=eips,
                        runhistory2epm=rh2EPM).solver
            self.assertIs(umrfwi, smbo.model)
            self.assertIs(eips, smbo.acquisition_func)
            self.assertIs(rh2EPM, smbo.rh2EPM)
Example #6
    def setUp(self):
        unittest.TestCase.setUp(self)

        self.rh = runhistory.RunHistory(aggregate_func=average_cost)
        self.cs = get_config_space()
        self.config1 = Configuration(self.cs,
                                     values={'a': 0, 'b': 100})
        self.config2 = Configuration(self.cs,
                                     values={'a': 100, 'b': 0})
        self.config3 = Configuration(self.cs,
                                     values={'a': 100, 'b': 100})

        self.scen = Scenario({"cutoff_time": 20, 'cs': self.cs})
        self.types, self.bounds = get_types(self.cs, None)
        self.scen = Scenario({"cutoff_time": 20, 'cs': self.cs,
                              'output_dir': ''})
Example #7
    def test_with_ordinal(self):
        cs = smac.configspace.ConfigurationSpace()
        cs.add_hyperparameter(CategoricalHyperparameter(
            'a', [0, 1], default_value=0))
        cs.add_hyperparameter(OrdinalHyperparameter(
            'b', [0, 1], default_value=1))
        cs.add_hyperparameter(UniformFloatHyperparameter(
            'c', lower=0., upper=1., default_value=1))
        cs.add_hyperparameter(UniformIntegerHyperparameter(
            'd', lower=0, upper=10, default_value=1))
        cs.seed(1)

        feat_array = np.array([0, 0, 0]).reshape(1, -1)
        types, bounds = get_types(cs, feat_array)
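        # For this space, get_types encodes the categorical 'a' with its number of
        # choices (type 2, bounds (2, nan)), while the ordinal, float and integer
        # hyperparameters all come out with (0, 1) bounds -- this is what the
        # assertions below check.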
        model = RandomForestWithInstances(types=types,
                                          bounds=bounds,
                                          instance_features=feat_array,
                                          seed=1,
                                          ratio_features=1.0,
                                          pca_components=9)
        self.assertEqual(bounds[0][0], 2)
        self.assertTrue(bounds[0][1] is np.nan)
        self.assertEqual(bounds[1][0], 0)
        self.assertEqual(bounds[1][1], 1)
        self.assertEqual(bounds[2][0], 0.)
        self.assertEqual(bounds[2][1], 1.)
        self.assertEqual(bounds[3][0], 0.)
        self.assertEqual(bounds[3][1], 1.)
        X = np.array(
            [[0., 0., 0., 0., 0., 0., 0.], [0., 0., 1., 0., 0., 0., 0.],
             [0., 1., 0., 9., 0., 0., 0.], [0., 1., 1., 4., 0., 0., 0.]],
            dtype=np.float64)
        y = np.array([0, 1, 2, 3], dtype=np.float64)

        X_train = np.vstack((X, X, X, X, X, X, X, X, X, X))
        y_train = np.vstack((y, y, y, y, y, y, y, y, y, y))

        model.train(X_train, y_train.reshape((-1, 1)))
        mean, _ = model.predict(X)
        for idx, m in enumerate(mean):
            self.assertAlmostEqual(y[idx], m, delta=0.05)
Example #8
def get_eips_object_callback(
        scenario_dict,
        seed,
        ta,
        backend,
        metalearning_configurations,
        runhistory,
):
    scenario_dict['input_psmac_dirs'] = backend.get_smac_output_glob()
    scenario = Scenario(scenario_dict)
    rh2EPM = RunHistory2EPM4EIPS(
        num_params=len(scenario.cs.get_hyperparameters()),
        scenario=scenario,
        success_states=[
            StatusType.SUCCESS,
            StatusType.MEMOUT,
            StatusType.TIMEOUT,
            StatusType.CRASHED
        ],
        impute_censored_data=False,
        impute_state=None
    )
    types, bounds = get_types(scenario.cs,
                              scenario.feature_array)
    model = UncorrelatedMultiObjectiveRandomForestWithInstances(
        ['cost', 'runtime'],
        types=types,
        bounds=bounds,
        instance_features=scenario.feature_array,
        rf_kwargs={'seed': 1,},
    )
    acquisition_function = EIPS(model)
    return SMAC(
        runhistory=runhistory,
        scenario=scenario,
        rng=seed,
        tae_runner=ta,
        runhistory2epm=rh2EPM,
        model=model,
        acquisition_function=acquisition_function,
        run_id=seed,
    )
Example #9
    def model(self, model_short_name='urfi'):
        self.types, self.bounds = get_types(self.scenario.cs,
                                            self.scenario.feature_array)
        if model_short_name not in ['urfi', 'rfi']:
            raise ValueError(
                'Specified model %s does not exist or is not supported!' %
                model_short_name)
        elif model_short_name == 'rfi':
            self._model = RandomForestWithInstances(
                self.types,
                self.bounds,
                instance_features=self.scenario.feature_array,
                seed=self.rng.randint(99999))
        elif model_short_name == 'urfi':
            self._model = UnloggedRandomForestWithInstances(
                self.types,
                self.bounds,
                self.scenario.feature_array,
                seed=self.rng.randint(99999),
                cutoff=self.cutoff,
                threshold=self.threshold)
        self._model.rf_opts.compute_oob_error = True
Example #10
    def _get_mean_var_time(self, validator, traj, pred, rh):
        # TODO kinda important: docstrings, what is this function doing?
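        # Collects (wallclock_time, incumbent) pairs from the trajectory and returns
        # mean/variance of the incumbent cost over time: predicted by an EPM when
        # pred is True (training one from the runhistory if the validator has none),
        # otherwise computed empirically from the runhistory.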
        validator.traj = traj  # set trajectory
        time, configs = [], []

        if pred:
            for entry in traj:
                time.append(entry["wallclock_time"])
                configs.append(entry["incumbent"])
                # self.logger.debug('Time: %d Runs: %d', time[-1],
                #                   len(rh.get_runs_for_config(configs[-1])))

            self.logger.debug(
                "Using %d samples (%d distinct) from trajectory.", len(time),
                len(set(configs)))

            if validator.epm:  # not log as validator epm is trained on cost, not log cost
                epm = validator.epm
            else:
                self.logger.debug(
                    "No EPM passed! Training new one from runhistory.")
                # Train random forest and transform training data (from given rh)
                # Not using validator because we want to plot uncertainties
                rh2epm = RunHistory2EPM4Cost(num_params=len(
                    self.scenario.cs.get_hyperparameters()),
                                             scenario=self.scenario)
                X, y = rh2epm.transform(rh)
                self.logger.debug(
                    "Training model with data of shape X: %s, y:%s",
                    str(X.shape), str(y.shape))

                types, bounds = get_types(self.scenario.cs,
                                          self.scenario.feature_array)
                epm = RandomForestWithInstances(
                    types=types,
                    bounds=bounds,
                    instance_features=self.scenario.feature_array,
                    # seed=self.rng.randint(MAXINT),
                    ratio_features=1.0)
                epm.train(X, y)
            config_array = convert_configurations_to_array(configs)
            mean, var = epm.predict_marginalized_over_instances(config_array)
            var = np.zeros(mean.shape)
            # We don't want to show the uncertainty of the model but uncertainty over multiple optimizer runs
            # This variance is computed in an outer loop.
        else:
            mean, var = [], []
            for entry in traj:
                time.append(entry["wallclock_time"])
                configs.append(entry["incumbent"])
                costs = _cost(configs[-1], rh,
                              rh.get_runs_for_config(configs[-1]))
                # self.logger.debug(len(costs), time[-1]
                if not costs:
                    time.pop()
                else:
                    mean.append(np.mean(costs))
                    var.append(0)  # No variance over instances
            mean, var = np.array(mean).reshape(-1, 1), np.array(var).reshape(
                -1, 1)
        return mean, var, time
Example #11
    def run_smbo(self):

        self.watcher.start_task('SMBO')

        # == first things first: load the datamanager
        self.reset_data_manager()

        # == Initialize non-SMBO stuff
        # first create a scenario
        seed = self.seed
        self.config_space.seed(seed)
        num_params = len(self.config_space.get_hyperparameters())
        # allocate a run history
        num_run = self.start_num_run
        instance_id = self.dataset_name + SENTINEL

        # Initialize some SMAC dependencies
        runhistory = RunHistory(aggregate_func=average_cost)
        # meta_runhistory = RunHistory(aggregate_func=average_cost)
        # meta_runs_dataset_indices = {}

        # == METALEARNING suggestions
        # we start by evaluating the defaults on the full dataset again
        # and add the suggestions from metalearning behind it

        if self.num_metalearning_cfgs > 0:
            if self.metadata_directory is None:
                metalearning_directory = os.path.dirname(
                    autosklearn.metalearning.__file__)
                # There is no multilabel data in OpenML
                if self.task == MULTILABEL_CLASSIFICATION:
                    meta_task = BINARY_CLASSIFICATION
                else:
                    meta_task = self.task
                metadata_directory = os.path.join(
                    metalearning_directory, 'files', '%s_%s_%s' %
                    (METRIC_TO_STRING[self.metric],
                     TASK_TYPES_TO_STRING[meta_task], 'sparse'
                     if self.datamanager.info['is_sparse'] else 'dense'))
                self.metadata_directory = metadata_directory

            self.logger.info('Metadata directory: %s', self.metadata_directory)
            meta_base = MetaBase(self.config_space, self.metadata_directory)

            metafeature_calculation_time_limit = int(
                self.total_walltime_limit / 4)
            metafeature_calculation_start_time = time.time()
            meta_features = self._calculate_metafeatures_with_limits(
                metafeature_calculation_time_limit)
            metafeature_calculation_end_time = time.time()
            metafeature_calculation_time_limit = \
                metafeature_calculation_time_limit - (
                metafeature_calculation_end_time -
                metafeature_calculation_start_time)

            if metafeature_calculation_time_limit < 1:
                self.logger.warning(
                    'Time limit for metafeature calculation less '
                    'than 1 second (%f). Skipping calculation '
                    'of metafeatures for encoded dataset.',
                    metafeature_calculation_time_limit)
                meta_features_encoded = None
            else:
                with warnings.catch_warnings():
                    warnings.showwarning = self._send_warnings_to_log
                    self.datamanager.perform1HotEncoding()
                meta_features_encoded = \
                    self._calculate_metafeatures_encoded_with_limits(
                        metafeature_calculation_time_limit)

            # In case there is a problem calculating the encoded meta-features
            if meta_features is None:
                if meta_features_encoded is not None:
                    meta_features = meta_features_encoded
            else:
                if meta_features_encoded is not None:
                    meta_features.metafeature_values.update(
                        meta_features_encoded.metafeature_values)

            if meta_features is not None:
                meta_base.add_dataset(instance_id, meta_features)
                # Do mean imputation of the meta-features - should be done
                # specifically for each prediction model!
                all_metafeatures = meta_base.get_metafeatures(
                    features=list(meta_features.keys()))
                all_metafeatures.fillna(all_metafeatures.mean(), inplace=True)

                with warnings.catch_warnings():
                    warnings.showwarning = self._send_warnings_to_log
                    metalearning_configurations = self.collect_metalearning_suggestions(
                        meta_base)
                if metalearning_configurations is None:
                    metalearning_configurations = []
                self.reset_data_manager()

                self.logger.info('%s', meta_features)

                # Convert meta-features into a dictionary because the scenario
                # expects a dictionary
                meta_features_dict = {}
                for dataset, series in all_metafeatures.iterrows():
                    meta_features_dict[dataset] = series.values
                meta_features_list = []
                for meta_feature_name in all_metafeatures.columns:
                    meta_features_list.append(
                        meta_features[meta_feature_name].value)
                meta_features_list = np.array(meta_features_list).reshape(
                    (1, -1))
                self.logger.info(list(meta_features_dict.keys()))

                # meta_runs = meta_base.get_all_runs(METRIC_TO_STRING[self.metric])
                # meta_runs_index = 0
                # try:
                #    meta_durations = meta_base.get_all_runs('runtime')
                #    read_runtime_data = True
                # except KeyError:
                #    read_runtime_data = False
                #    self.logger.critical('Cannot read runtime data.')
                #    if self.acquisition_function == 'EIPS':
                #        self.logger.critical('Reverting to acquisition function EI!')
                #        self.acquisition_function = 'EI'

                # for meta_dataset in meta_runs.index:
                #     meta_dataset_start_index = meta_runs_index
                #     for meta_configuration in meta_runs.columns:
                #         if np.isfinite(meta_runs.loc[meta_dataset, meta_configuration]):
                #             try:
                #                 config = meta_base.get_configuration_from_algorithm_index(
                #                     meta_configuration)
                #                 cost = meta_runs.loc[meta_dataset, meta_configuration]
                #                 if read_runtime_data:
                #                     runtime = meta_durations.loc[meta_dataset,
                #                                                  meta_configuration]
                #                 else:
                #                     runtime = 1
                #                 # TODO read out other status types!
                #                 meta_runhistory.add(config, cost, runtime,
                #                                     StatusType.SUCCESS,
                #                                     instance_id=meta_dataset)
                #                 meta_runs_index += 1
                #             except:
                #                 # TODO maybe add warning
                #                 pass
                #
                #     meta_runs_dataset_indices[meta_dataset] = (
                #         meta_dataset_start_index, meta_runs_index)

        else:
            meta_features = None

        if meta_features is None:
            if self.acquisition_function == 'EIPS':
                self.logger.critical('Reverting to acquisition function EI!')
                self.acquisition_function = 'EI'
            meta_features_list = []
            meta_features_dict = {}
            metalearning_configurations = []

        if self.resampling_strategy in [
                'partial-cv', 'partial-cv-iterative-fit'
        ]:
            num_folds = self.resampling_strategy_args['folds']
            instances = [[fold_number] for fold_number in range(num_folds)]
        else:
            instances = None

        startup_time = self.watcher.wall_elapsed(self.dataset_name)
        total_walltime_limit = self.total_walltime_limit - startup_time - 5
        scenario_dict = {
            'cs': self.config_space,
            'cutoff-time': self.func_eval_time_limit,
            'memory-limit': self.memory_limit,
            'wallclock-limit': total_walltime_limit,
            # 'instances': [[name] for name in meta_features_dict],
            'output-dir': self.backend.temporary_directory,
            'shared-model': self.shared_mode,
            'run-obj': 'quality',
            'deterministic': 'true',
            'instances': instances
        }

        if self.configuration_mode == 'RANDOM':
            scenario_dict['minR'] = len(
                instances) if instances is not None else 1
            scenario_dict['initial_incumbent'] = 'RANDOM'

        self.scenario = Scenario(scenario_dict)

        # TODO rebuild target algorithm to be its own target algorithm
        # evaluator, which takes into account that a run can be killed prior
        # to the model being fully fitted; thus putting intermediate results
        # into a queue and querying them once the time is over
        exclude = dict()
        include = dict()
        if self.include_preprocessors is not None and \
                self.exclude_preprocessors is not None:
            raise ValueError('Cannot specify include_preprocessors and '
                             'exclude_preprocessors.')
        elif self.include_preprocessors is not None:
            include['preprocessor'] = self.include_preprocessors
        elif self.exclude_preprocessors is not None:
            exclude['preprocessor'] = self.exclude_preprocessors
        if self.include_estimators is not None and \
                self.exclude_estimators is not None:
            raise ValueError('Cannot specify include_estimators and '
                             'exclude_estimators.')
        elif self.include_estimators is not None:
            if self.task in CLASSIFICATION_TASKS:
                include['classifier'] = self.include_estimators
            elif self.task in REGRESSION_TASKS:
                include['regressor'] = self.include_estimators
            else:
                raise ValueError(self.task)
        elif self.exclude_estimators is not None:
            if self.task in CLASSIFICATION_TASKS:
                exclude['classifier'] = self.exclude_estimators
            elif self.task in REGRESSION_TASKS:
                exclude['regressor'] = self.exclude_estimators
            else:
                raise ValueError(self.task)

        ta = ExecuteTaFuncWithQueue(
            backend=self.backend,
            autosklearn_seed=seed,
            resampling_strategy=self.resampling_strategy,
            initial_num_run=num_run,
            logger=self.logger,
            include=include,
            exclude=exclude,
            memory_limit=self.memory_limit,
            disable_file_output=self.disable_file_output,
            **self.resampling_strategy_args)

        types = get_types(self.config_space, self.scenario.feature_array)

        # TODO extract generation of SMAC object into its own function for
        # testing
        if self.acquisition_function == 'EI':
            model = RandomForestWithInstances(
                types,
                #instance_features=meta_features_list,
                seed=1,
                num_trees=10)
            rh2EPM = RunHistory2EPM4Cost(num_params=num_params,
                                         scenario=self.scenario,
                                         success_states=[
                                             StatusType.SUCCESS,
                                             StatusType.MEMOUT,
                                             StatusType.TIMEOUT
                                         ],
                                         impute_censored_data=False,
                                         impute_state=None)
            _smac_arguments = dict(scenario=self.scenario,
                                   model=model,
                                   rng=seed,
                                   runhistory2epm=rh2EPM,
                                   tae_runner=ta,
                                   runhistory=runhistory)
        elif self.acquisition_function == 'EIPS':
            rh2EPM = RunHistory2EPM4EIPS(num_params=num_params,
                                         scenario=self.scenario,
                                         success_states=[
                                             StatusType.SUCCESS,
                                             StatusType.MEMOUT,
                                             StatusType.TIMEOUT
                                         ],
                                         impute_censored_data=False,
                                         impute_state=None)
            model = UncorrelatedMultiObjectiveRandomForestWithInstances(
                ['cost', 'runtime'],
                types,
                num_trees=10,
                instance_features=meta_features_list,
                seed=1)
            acquisition_function = EIPS(model)
            _smac_arguments = dict(scenario=self.scenario,
                                   model=model,
                                   rng=seed,
                                   tae_runner=ta,
                                   runhistory2epm=rh2EPM,
                                   runhistory=runhistory,
                                   acquisition_function=acquisition_function)
        else:
            raise ValueError('Unknown acquisition function value %s!' %
                             self.acquisition_function)

        if self.configuration_mode == 'SMAC':
            smac = SMAC(**_smac_arguments)
        elif self.configuration_mode in ['ROAR', 'RANDOM']:
            for not_in_roar in ['runhistory2epm', 'model']:
                if not_in_roar in _smac_arguments:
                    del _smac_arguments[not_in_roar]
            smac = ROAR(**_smac_arguments)
        else:
            raise ValueError(self.configuration_mode)

        # Build a runtime model
        # runtime_rf = RandomForestWithInstances(types,
        #                                        instance_features=meta_features_list,
        #                                        seed=1, num_trees=10)
        # runtime_rh2EPM = RunHistory2EPM4EIPS(num_params=num_params,
        #                                      scenario=self.scenario,
        #                                      success_states=None,
        #                                      impute_censored_data=False,
        #                                      impute_state=None)
        # X_runtime, y_runtime = runtime_rh2EPM.transform(meta_runhistory)
        # runtime_rf.train(X_runtime, y_runtime[:, 1].flatten())
        # X_meta, Y_meta = rh2EPM.transform(meta_runhistory)
        # # Transform Y_meta on a per-dataset base
        # for meta_dataset in meta_runs_dataset_indices:
        #     start_index, end_index = meta_runs_dataset_indices[meta_dataset]
        #     end_index += 1  # Python indexing
        #     Y_meta[start_index:end_index, 0]\
        #         [Y_meta[start_index:end_index, 0] >2.0] =  2.0
        #     dataset_minimum = np.min(Y_meta[start_index:end_index, 0])
        #     Y_meta[start_index:end_index, 0] = 1 - (
        #         (1. - Y_meta[start_index:end_index, 0]) /
        #         (1. - dataset_minimum))
        #     Y_meta[start_index:end_index, 0]\
        #           [Y_meta[start_index:end_index, 0] > 2] = 2

        smac.solver.stats.start_timing()
        # == first, evaluate all metalearning and default configurations
        smac.solver.incumbent = smac.solver.initial_design.run()

        for challenger in metalearning_configurations:

            smac.solver.incumbent, inc_perf = smac.solver.intensifier.intensify(
                challengers=[challenger],
                incumbent=smac.solver.incumbent,
                run_history=smac.solver.runhistory,
                aggregate_func=smac.solver.aggregate_func,
                time_bound=self.total_walltime_limit)

            if smac.solver.scenario.shared_model:
                pSMAC.write(run_history=smac.solver.runhistory,
                            output_directory=smac.solver.scenario.output_dir,
                            num_run=self.seed)

            if smac.solver.stats.is_budget_exhausted():
                break

        # == after metalearning run SMAC loop
        while True:

            if smac.solver.scenario.shared_model:
                pSMAC.read(run_history=smac.solver.runhistory,
                           output_directory=self.scenario.output_dir,
                           configuration_space=self.config_space,
                           logger=self.logger)

            choose_next_start_time = time.time()
            try:
                challengers = self.choose_next(smac)
            except Exception as e:
                self.logger.error(e)
                self.logger.error("Error in getting next configurations "
                                  "with SMAC. Using random configuration!")
                next_config = self.config_space.sample_configuration()
                challengers = [next_config]
            time_for_choose_next = time.time() - choose_next_start_time
            self.logger.info('Used %g seconds to find next '
                             'configurations' % (time_for_choose_next))

            time_for_choose_next = max(time_for_choose_next, 1.0)
            smac.solver.incumbent, inc_perf = smac.solver.intensifier.intensify(
                challengers=challengers,
                incumbent=smac.solver.incumbent,
                run_history=smac.solver.runhistory,
                aggregate_func=smac.solver.aggregate_func,
                time_bound=time_for_choose_next)

            if smac.solver.scenario.shared_model:
                pSMAC.write(run_history=smac.solver.runhistory,
                            output_directory=smac.solver.scenario.output_dir,
                            num_run=self.seed)

            if smac.solver.stats.is_budget_exhausted():
                break

        self.runhistory = smac.solver.runhistory
        self.trajectory = smac.solver.intensifier.traj_logger.trajectory

        return self.runhistory, self.trajectory
Example #12
def convert_data_for_epm(scenario: Scenario,
                         runhistory: RunHistory,
                         logger=None):
    """
    converts data from runhistory into EPM format

    Parameters
    ----------
    scenario: Scenario
        smac.scenario.scenario.Scenario Object
    runhistory: RunHistory
        smac.runhistory.runhistory.RunHistory Object with all necessary data

    Returns
    -------
    X: np.array
        X matrix with configurations x features for all observed samples
    y: np.array
        y matrix with all observations
    types: np.array
        types of X cols -- necessary to train our RF implementation
    """
    types, bounds = get_types(scenario.cs, scenario.feature_array)
    model = RandomForestWithInstances(types, bounds)

    params = scenario.cs.get_hyperparameters()
    num_params = len(params)

    run_obj = scenario.run_obj

    if run_obj == "runtime":
        # if we log the performance data,
        # the RFRImputator will already get
        # log transform data from the runhistory
        cutoff = np.log10(scenario.cutoff)
        threshold = np.log10(scenario.cutoff * scenario.par_factor)

        imputor = RFRImputator(rng=np.random.RandomState(42),
                               cutoff=cutoff,
                               threshold=threshold,
                               model=model,
                               change_threshold=0.01,
                               max_iter=10)
        # TODO: Adapt runhistory2EPM object based on scenario
        rh2EPM = RunHistory2EPM4LogCost(scenario=scenario,
                                        num_params=num_params,
                                        success_states=[
                                            StatusType.SUCCESS,
                                        ],
                                        impute_censored_data=True,
                                        impute_state=[
                                            StatusType.TIMEOUT,
                                        ],
                                        imputor=imputor)
        X, Y = rh2EPM.transform(runhistory)
    else:
        rh2EPM = RunHistory2EPM4Cost(scenario=scenario,
                                     num_params=num_params,
                                     success_states=None,
                                     impute_censored_data=False,
                                     impute_state=None)
        X, Y = rh2EPM.transform(runhistory)

    return X, Y, types
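
A hedged usage sketch for convert_data_for_epm above; the scenario and runhistory objects are assumed to come from a finished SMAC run (how they are loaded is not shown here), so the variable names are illustrative.

# `scenario` and `runhistory` assumed to be loaded from a completed SMAC run
X, Y, types = convert_data_for_epm(scenario=scenario, runhistory=runhistory)
print("EPM training data:", X.shape, Y.shape)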
Example #13
    def __init__(
        self,
        scenario: Scenario,
        tae_runner: typing.Optional[typing.Union[ExecuteTARun,
                                                 typing.Callable]] = None,
        runhistory: typing.Optional[RunHistory] = None,
        intensifier: typing.Optional[Intensifier] = None,
        acquisition_function: typing.
        Optional[AbstractAcquisitionFunction] = None,
        acquisition_function_optimizer: typing.
        Optional[AcquisitionFunctionMaximizer] = None,
        model: typing.Optional[AbstractEPM] = None,
        runhistory2epm: typing.Optional[AbstractRunHistory2EPM] = None,
        initial_design: typing.Optional[InitialDesign] = None,
        initial_configurations: typing.Optional[
            typing.List[Configuration]] = None,
        stats: typing.Optional[Stats] = None,
        restore_incumbent: typing.Optional[Configuration] = None,
        rng: typing.Optional[typing.Union[np.random.RandomState, int]] = None,
        smbo_class: typing.Optional[SMBO] = None,
        run_id: typing.Optional[int] = None,
        random_configuration_chooser: typing.
        Optional[RandomConfigurationChooser] = None):
        """
        Constructor

        Parameters
        ----------
        scenario : ~smac.scenario.scenario.Scenario
            Scenario object
        tae_runner : ~smac.tae.execute_ta_run.ExecuteTARun or callable
            Callable or implementation of
            :class:`~smac.tae.execute_ta_run.ExecuteTARun`. In case a
            callable is passed it will be wrapped by
            :class:`~smac.tae.execute_func.ExecuteTAFuncDict`.
            If not set, it will be initialized with the
            :class:`~smac.tae.execute_ta_run_old.ExecuteTARunOld`.
        runhistory : RunHistory
            runhistory to store all algorithm runs
        intensifier : Intensifier
            intensification object to issue a racing to decide the current
            incumbent
        acquisition_function : ~smac.optimizer.acquisition.AbstractAcquisitionFunction
            Object that implements the :class:`~smac.optimizer.acquisition.AbstractAcquisitionFunction`.
            Will use :class:`~smac.optimizer.acquisition.EI` if not set.
        acquisition_function_optimizer : ~smac.optimizer.ei_optimization.AcquisitionFunctionMaximizer
            Object that implements the :class:`~smac.optimizer.ei_optimization.AcquisitionFunctionMaximizer`.
            Will use :class:`smac.optimizer.ei_optimization.InterleavedLocalAndRandomSearch` if not set.
        model : AbstractEPM
            Model that implements train() and predict(). Will use a
            :class:`~smac.epm.rf_with_instances.RandomForestWithInstances` if not set.
        runhistory2epm : ~smac.runhistory.runhistory2epm.RunHistory2EPM
            Object that implements the AbstractRunHistory2EPM. If None,
            will use :class:`~smac.runhistory.runhistory2epm.RunHistory2EPM4Cost`
            if objective is cost or
            :class:`~smac.runhistory.runhistory2epm.RunHistory2EPM4LogCost`
            if objective is runtime.
        initial_design : InitialDesign
            initial sampling design
        initial_configurations : typing.List[Configuration]
            list of initial configurations for initial design --
            cannot be used together with initial_design
        stats : Stats
            optional stats object
        rng : np.random.RandomState
            Random number generator
        restore_incumbent : Configuration
            incumbent used if restoring to previous state
        smbo_class : ~smac.optimizer.smbo.SMBO
            Class implementing the SMBO interface which will be used to
            instantiate the optimizer class.
        run_id : int (optional)
            Run ID will be used as subfolder for output_dir. If no ``run_id`` is given, a random ``run_id`` will be
            chosen.
        random_configuration_chooser : ~smac.optimizer.random_configuration_chooser.RandomConfigurationChooser
            How often to choose a random configuration during the intensification procedure.

        """
        self.logger = logging.getLogger(self.__module__ + "." +
                                        self.__class__.__name__)

        aggregate_func = average_cost

        self.scenario = scenario
        self.output_dir = ""
        if not restore_incumbent:
            # restore_incumbent is used by the CLI interface which provides a method for restoring a SMAC run given an
            # output directory. This is the default path.
            # initial random number generator
            run_id, rng = get_rng(rng=rng, run_id=run_id, logger=self.logger)
            self.output_dir = create_output_directory(scenario, run_id)
        elif scenario.output_dir is not None:
            run_id, rng = get_rng(rng=rng, run_id=run_id, logger=self.logger)
            # output-directory is created in CLI when restoring from a
            # folder. calling the function again in the facade results in two
            # folders being created: run_X and run_X.OLD. if we are
            # restoring, the output-folder exists already and we omit creating it,
            # but set the self-output_dir to the dir.
            # necessary because we want to write traj to new output-dir in CLI.
            self.output_dir = scenario.output_dir_for_this_run

        if (scenario.deterministic is True
                and getattr(scenario, 'tuner_timeout', None) is None
                and scenario.run_obj == 'quality'):
            self.logger.info('Optimizing a deterministic scenario for '
                             'quality without a tuner timeout - will make '
                             'SMAC deterministic!')
            scenario.intensification_percentage = 1e-10
        scenario.write()

        # initialize stats object
        if stats:
            self.stats = stats
        else:
            self.stats = Stats(scenario)

        if self.scenario.run_obj == "runtime" and not self.scenario.transform_y == "LOG":
            self.logger.warn(
                "Runtime as objective automatically activates log(y) transformation"
            )
            self.scenario.transform_y = "LOG"

        # initialize empty runhistory
        if runhistory is None:
            runhistory = RunHistory(aggregate_func=aggregate_func)
        # inject aggr_func if necessary
        if runhistory.aggregate_func is None:
            runhistory.aggregate_func = aggregate_func

        if not random_configuration_chooser:
            random_configuration_chooser = ChooserProb(prob=scenario.rand_prob,
                                                       rng=rng)

        # reset random number generator in config space to draw different
        # random configurations with each seed given to SMAC
        scenario.cs.seed(rng.randint(MAXINT))

        # initial Trajectory Logger
        traj_logger = TrajLogger(output_dir=self.output_dir, stats=self.stats)

        # initial EPM
        types, bounds = get_types(scenario.cs, scenario.feature_array)
        if model is None:
            model = RandomForestWithInstances(
                types=types,
                bounds=bounds,
                instance_features=scenario.feature_array,
                seed=rng.randint(MAXINT),
                pca_components=scenario.PCA_DIM,
                log_y=scenario.transform_y in ["LOG", "LOGS"],
                num_trees=scenario.rf_num_trees,
                do_bootstrapping=scenario.rf_do_bootstrapping,
                ratio_features=scenario.rf_ratio_features,
                min_samples_split=scenario.rf_min_samples_split,
                min_samples_leaf=scenario.rf_min_samples_leaf,
                max_depth=scenario.rf_max_depth)
        # initial acquisition function
        if acquisition_function is None:
            if scenario.transform_y in ["LOG", "LOGS"]:
                acquisition_function = LogEI(model=model)
            else:
                acquisition_function = EI(model=model)

        # inject model if necessary
        if acquisition_function.model is None:
            acquisition_function.model = model

        # initialize optimizer on acquisition function
        if acquisition_function_optimizer is None:
            acquisition_function_optimizer = InterleavedLocalAndRandomSearch(
                acquisition_function=acquisition_function,
                config_space=scenario.cs,
                rng=np.random.RandomState(seed=rng.randint(MAXINT)),
                max_steps=scenario.sls_max_steps,
                n_steps_plateau_walk=scenario.sls_n_steps_plateau_walk)
        elif not isinstance(
                acquisition_function_optimizer,
                AcquisitionFunctionMaximizer,
        ):
            raise ValueError(
                "Argument 'acquisition_function_optimizer' must be of type"
                "'AcquisitionFunctionMaximizer', but is '%s'" %
                type(acquisition_function_optimizer))

        # initialize tae_runner
        # First case, if tae_runner is None, the target algorithm is a call
        # string in the scenario file
        if tae_runner is None:
            tae_runner = ExecuteTARunOld(
                ta=scenario.ta,
                stats=self.stats,
                run_obj=scenario.run_obj,
                runhistory=runhistory,
                par_factor=scenario.par_factor,
                cost_for_crash=scenario.cost_for_crash,
                abort_on_first_run_crash=scenario.abort_on_first_run_crash)
        # Second case, the tae_runner is a function to be optimized
        elif callable(tae_runner):
            tae_runner = ExecuteTAFuncDict(
                ta=tae_runner,
                stats=self.stats,
                run_obj=scenario.run_obj,
                memory_limit=scenario.memory_limit,
                runhistory=runhistory,
                par_factor=scenario.par_factor,
                cost_for_crash=scenario.cost_for_crash,
                abort_on_first_run_crash=scenario.abort_on_first_run_crash)
        # Third case, if it is an ExecuteTaRun we can simply use the
        # instance. Otherwise, the next check raises an exception
        elif not isinstance(tae_runner, ExecuteTARun):
            raise TypeError("Argument 'tae_runner' is %s, but must be "
                            "either a callable or an instance of "
                            "ExecuteTaRun. Passing 'None' will result in the "
                            "creation of target algorithm runner based on the "
                            "call string in the scenario file." %
                            type(tae_runner))

        # Check that overall objective and tae objective are the same
        if tae_runner.run_obj != scenario.run_obj:
            raise ValueError("Objective for the target algorithm runner and "
                             "the scenario must be the same, but are '%s' and "
                             "'%s'" % (tae_runner.run_obj, scenario.run_obj))

        # inject stats if necessary
        if tae_runner.stats is None:
            tae_runner.stats = self.stats
        # inject runhistory if necessary
        if tae_runner.runhistory is None:
            tae_runner.runhistory = runhistory
        # inject cost_for_crash
        if tae_runner.crash_cost != scenario.cost_for_crash:
            tae_runner.crash_cost = scenario.cost_for_crash

        # initialize intensification
        if intensifier is None:
            intensifier = Intensifier(
                tae_runner=tae_runner,
                stats=self.stats,
                traj_logger=traj_logger,
                rng=rng,
                instances=scenario.train_insts,
                cutoff=scenario.cutoff,
                deterministic=scenario.deterministic,
                run_obj_time=scenario.run_obj == "runtime",
                always_race_against=scenario.cs.get_default_configuration()
                if scenario.always_race_default else None,
                use_ta_time_bound=scenario.use_ta_time,
                instance_specifics=scenario.instance_specific,
                minR=scenario.minR,
                maxR=scenario.maxR,
                adaptive_capping_slackfactor=scenario.
                intens_adaptive_capping_slackfactor,
                min_chall=scenario.intens_min_chall)
        # inject deps if necessary
        if intensifier.tae_runner is None:
            intensifier.tae_runner = tae_runner
        if intensifier.stats is None:
            intensifier.stats = self.stats
        if intensifier.traj_logger is None:
            intensifier.traj_logger = traj_logger

        # initial design
        if initial_design is not None and initial_configurations is not None:
            raise ValueError(
                "Either use initial_design or initial_configurations; but not both"
            )

        if initial_configurations is not None:
            initial_design = MultiConfigInitialDesign(
                tae_runner=tae_runner,
                scenario=scenario,
                stats=self.stats,
                traj_logger=traj_logger,
                runhistory=runhistory,
                rng=rng,
                configs=initial_configurations,
                intensifier=intensifier,
                aggregate_func=aggregate_func)
        elif initial_design is None:
            if scenario.initial_incumbent == "DEFAULT":
                initial_design = DefaultConfiguration(tae_runner=tae_runner,
                                                      scenario=scenario,
                                                      stats=self.stats,
                                                      traj_logger=traj_logger,
                                                      rng=rng)
            elif scenario.initial_incumbent == "RANDOM":
                initial_design = RandomConfiguration(tae_runner=tae_runner,
                                                     scenario=scenario,
                                                     stats=self.stats,
                                                     traj_logger=traj_logger,
                                                     rng=rng)
            elif scenario.initial_incumbent == "LHD":
                initial_design = LHDesign(runhistory=runhistory,
                                          intensifier=intensifier,
                                          aggregate_func=aggregate_func,
                                          tae_runner=tae_runner,
                                          scenario=scenario,
                                          stats=self.stats,
                                          traj_logger=traj_logger,
                                          rng=rng)
            elif scenario.initial_incumbent == "FACTORIAL":
                initial_design = FactorialInitialDesign(
                    runhistory=runhistory,
                    intensifier=intensifier,
                    aggregate_func=aggregate_func,
                    tae_runner=tae_runner,
                    scenario=scenario,
                    stats=self.stats,
                    traj_logger=traj_logger,
                    rng=rng)
            elif scenario.initial_incumbent == "SOBOL":
                initial_design = SobolDesign(runhistory=runhistory,
                                             intensifier=intensifier,
                                             aggregate_func=aggregate_func,
                                             tae_runner=tae_runner,
                                             scenario=scenario,
                                             stats=self.stats,
                                             traj_logger=traj_logger,
                                             rng=rng)
            else:
                raise ValueError("Don't know what kind of initial_incumbent "
                                 "'%s' is" % scenario.initial_incumbent)
        # inject deps if necessary
        if initial_design.tae_runner is None:
            initial_design.tae_runner = tae_runner
        if initial_design.scenario is None:
            initial_design.scenario = scenario
        if initial_design.stats is None:
            initial_design.stats = self.stats
        if initial_design.traj_logger is None:
            initial_design.traj_logger = traj_logger

        # initial conversion of runhistory into EPM data
        if runhistory2epm is None:

            num_params = len(scenario.cs.get_hyperparameters())
            if scenario.run_obj == 'runtime':

                # if we log the performance data,
                # the RFRImputator will already get
                # log transform data from the runhistory
                cutoff = np.log(scenario.cutoff)
                threshold = np.log(scenario.cutoff * scenario.par_factor)

                imputor = RFRImputator(rng=rng,
                                       cutoff=cutoff,
                                       threshold=threshold,
                                       model=model,
                                       change_threshold=0.01,
                                       max_iter=2)

                runhistory2epm = RunHistory2EPM4LogCost(
                    scenario=scenario,
                    num_params=num_params,
                    success_states=[
                        StatusType.SUCCESS,
                    ],
                    impute_censored_data=True,
                    impute_state=[
                        StatusType.CAPPED,
                    ],
                    imputor=imputor)

            elif scenario.run_obj == 'quality':
                if scenario.transform_y == "NONE":
                    runhistory2epm = RunHistory2EPM4Cost(
                        scenario=scenario,
                        num_params=num_params,
                        success_states=[
                            StatusType.SUCCESS, StatusType.CRASHED
                        ],
                        impute_censored_data=False,
                        impute_state=None)
                elif scenario.transform_y == "LOG":
                    runhistory2epm = RunHistory2EPM4LogCost(
                        scenario=scenario,
                        num_params=num_params,
                        success_states=[
                            StatusType.SUCCESS, StatusType.CRASHED
                        ],
                        impute_censored_data=False,
                        impute_state=None)
                elif scenario.transform_y == "LOGS":
                    runhistory2epm = RunHistory2EPM4LogScaledCost(
                        scenario=scenario,
                        num_params=num_params,
                        success_states=[
                            StatusType.SUCCESS, StatusType.CRASHED
                        ],
                        impute_censored_data=False,
                        impute_state=None)
                elif scenario.transform_y == "INVS":
                    runhistory2epm = RunHistory2EPM4InvScaledCost(
                        scenario=scenario,
                        num_params=num_params,
                        success_states=[
                            StatusType.SUCCESS, StatusType.CRASHED
                        ],
                        impute_censored_data=False,
                        impute_state=None)

            else:
                raise ValueError('Unknown run objective: %s. Should be either '
                                 'quality or runtime.' % self.scenario.run_obj)

        # inject scenario if necessary:
        if runhistory2epm.scenario is None:
            runhistory2epm.scenario = scenario

        smbo_args = {
            'scenario': scenario,
            'stats': self.stats,
            'initial_design': initial_design,
            'runhistory': runhistory,
            'runhistory2epm': runhistory2epm,
            'intensifier': intensifier,
            'aggregate_func': aggregate_func,
            'num_run': run_id,
            'model': model,
            'acq_optimizer': acquisition_function_optimizer,
            'acquisition_func': acquisition_function,
            'rng': rng,
            'restore_incumbent': restore_incumbent,
            'random_configuration_chooser': random_configuration_chooser
        }

        if smbo_class is None:
            self.solver = SMBO(**smbo_args)
        else:
            self.solver = smbo_class(**smbo_args)
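
Because the constructor above falls back to defaults for every component passed as None, a minimal call needs only a scenario and a target function. A quickstart-style sketch of that usage follows; the toy objective and option names are illustrative and may vary slightly between SMAC versions, and imports are omitted as in the other snippets.

def quadratic(cfg):
    # toy target function: a Configuration comes in, a cost comes out
    return (cfg["x"] - 2.) ** 2

cs = ConfigurationSpace()
cs.add_hyperparameter(UniformFloatHyperparameter("x", -5, 5, default_value=0))

scenario = Scenario({"run_obj": "quality",
                     "runcount-limit": 30,
                     "cs": cs,
                     "deterministic": "true"})

smac = SMAC(scenario=scenario,
            rng=np.random.RandomState(42),
            tae_runner=quadratic)  # model, acquisition function, etc. use the defaults wired up above
incumbent = smac.optimize()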
Example #14
    def __init__(
            self,
            scenario: Scenario,
            # TODO: once we drop python3.4 add type hint
            # typing.Union[ExecuteTARun, callable]
            tae_runner=None,
            runhistory: RunHistory = None,
            intensifier: Intensifier = None,
            acquisition_function: AbstractAcquisitionFunction = None,
            model: AbstractEPM = None,
            runhistory2epm: AbstractRunHistory2EPM = None,
            initial_design: InitialDesign = None,
            initial_configurations: typing.List[Configuration] = None,
            stats: Stats = None,
            rng: np.random.RandomState = None):
        '''
        Facade to use SMAC default mode

        Parameters
        ----------
        scenario: smac.scenario.scenario.Scenario
            Scenario object
        tae_runner: ExecuteTARun or callable
            Callable or implementation of :class:`ExecuteTARun`. In case a
            callable is passed it will be wrapped by
            :class:`ExecuteTAFuncDict`. If not set, tae_runner will be
            initialized with :class:`ExecuteTARunOld`.
        runhistory: RunHistory
            runhistory to store all algorithm runs
        intensifier: Intensifier
            intensification object to issue a racing to decide the current
            incumbent
        acquisition_function : AcquisitionFunction
            Object that implements the AbstractAcquisitionFunction. Will use
            EI if not set.
        model : AbstractEPM
            Model that implements train() and predict(). Will use a
            RandomForest if not set.
        runhistory2epm : AbstractRunHistory2EPM
            Object that implements the AbstractRunHistory2EPM. If None,
            will use RunHistory2EPM4Cost if objective is quality or
            RunHistory2EPM4LogCost if objective is runtime.
        initial_design: InitialDesign
            initial sampling design
        initial_configurations: typing.List[Configuration]
            list of initial configurations for initial design -- 
            cannot be used together with initial_design
        stats: Stats
            optional stats object
        rng: np.random.RandomState
            Random number generator
        '''
        self.logger = logging.getLogger("SMAC")

        aggregate_func = average_cost

        # initialize stats object
        if stats:
            self.stats = stats
        else:
            self.stats = Stats(scenario)

        # initialize empty runhistory
        if runhistory is None:
            runhistory = RunHistory(aggregate_func=aggregate_func)

        # initial random number generator
        num_run, rng = self._get_rng(rng=rng)

        # reset random number generator in config space to draw different
        # random configurations with each seed given to SMAC
        scenario.cs.seed(rng.randint(MAXINT))

        # initial Trajectory Logger
        traj_logger = TrajLogger(output_dir=scenario.output_dir,
                                 stats=self.stats)

        # initial EPM
        types = get_types(scenario.cs, scenario.feature_array)
        if model is None:
            model = RandomForestWithInstances(
                types=types,
                instance_features=scenario.feature_array,
                seed=rng.randint(MAXINT))
        # initial acquisition function
        if acquisition_function is None:
            acquisition_function = EI(model=model)

        # initialize optimizer on acquisition function
        local_search = LocalSearch(acquisition_function, scenario.cs)

        # initialize tae_runner
        # First case, if tae_runner is None, the target algorithm is a call
        # string in the scenario file
        if tae_runner is None:
            tae_runner = ExecuteTARunOld(ta=scenario.ta,
                                         stats=self.stats,
                                         run_obj=scenario.run_obj,
                                         runhistory=runhistory,
                                         par_factor=scenario.par_factor)
        # Second case, the tae_runner is a function to be optimized
        elif callable(tae_runner):
            tae_runner = ExecuteTAFuncDict(ta=tae_runner,
                                           stats=self.stats,
                                           run_obj=scenario.run_obj,
                                           memory_limit=scenario.memory_limit,
                                           runhistory=runhistory,
                                           par_factor=scenario.par_factor)
        # Third case, if it is an ExecuteTaRun we can simply use the
        # instance. Otherwise, the next check raises an exception
        elif not isinstance(tae_runner, ExecuteTARun):
            raise TypeError("Argument 'tae_runner' is %s, but must be "
                            "either a callable or an instance of "
                            "ExecuteTaRun. Passing 'None' will result in the "
                            "creation of target algorithm runner based on the "
                            "call string in the scenario file." %
                            type(tae_runner))

        # Check that overall objective and tae objective are the same
        if tae_runner.run_obj != scenario.run_obj:
            raise ValueError("Objective for the target algorithm runner and "
                             "the scenario must be the same, but are '%s' and "
                             "'%s'" % (tae_runner.run_obj, scenario.run_obj))

        # inject stats if necessary
        if tae_runner.stats is None:
            tae_runner.stats = self.stats
        # inject runhistory if necessary
        if tae_runner.runhistory is None:
            tae_runner.runhistory = runhistory

        # initial intensification
        if intensifier is None:
            intensifier = Intensifier(
                tae_runner=tae_runner,
                stats=self.stats,
                traj_logger=traj_logger,
                rng=rng,
                instances=scenario.train_insts,
                cutoff=scenario.cutoff,
                deterministic=scenario.deterministic,
                run_obj_time=scenario.run_obj == "runtime",
                instance_specifics=scenario.instance_specific,
                minR=scenario.minR,
                maxR=scenario.maxR)

        # initial design
        if initial_design is not None and initial_configurations is not None:
            raise ValueError(
                "Either use initial_design or initial_configurations; but not both"
            )

        if initial_configurations is not None:
            initial_design = MultiConfigInitialDesign(
                tae_runner=tae_runner,
                scenario=scenario,
                stats=self.stats,
                traj_logger=traj_logger,
                runhistory=runhistory,
                rng=rng,
                configs=initial_configurations,
                intensifier=intensifier,
                aggregate_func=aggregate_func)
        elif initial_design is None:
            if scenario.initial_incumbent == "DEFAULT":
                initial_design = DefaultConfiguration(tae_runner=tae_runner,
                                                      scenario=scenario,
                                                      stats=self.stats,
                                                      traj_logger=traj_logger,
                                                      rng=rng)
            elif scenario.initial_incumbent == "RANDOM":
                initial_design = RandomConfiguration(tae_runner=tae_runner,
                                                     scenario=scenario,
                                                     stats=self.stats,
                                                     traj_logger=traj_logger,
                                                     rng=rng)
            else:
                raise ValueError("Don't know what kind of initial_incumbent "
                                 "'%s' is" % scenario.initial_incumbent)

        # initial conversion of runhistory into EPM data
        if runhistory2epm is None:

            num_params = len(scenario.cs.get_hyperparameters())
            if scenario.run_obj == "runtime":

                # if we log the performance data,
                # the RFRImputator will already get
                # log-transformed data from the runhistory
                cutoff = np.log10(scenario.cutoff)
                threshold = np.log10(scenario.cutoff * scenario.par_factor)

                imputor = RFRImputator(rs=rng,
                                       cutoff=cutoff,
                                       threshold=threshold,
                                       model=model,
                                       change_threshold=0.01,
                                       max_iter=2)

                runhistory2epm = RunHistory2EPM4LogCost(
                    scenario=scenario,
                    num_params=num_params,
                    success_states=[
                        StatusType.SUCCESS,
                    ],
                    impute_censored_data=True,
                    impute_state=[
                        StatusType.TIMEOUT,
                    ],
                    imputor=imputor)

            elif scenario.run_obj == 'quality':
                runhistory2epm = RunHistory2EPM4Cost(
                    scenario=scenario,
                    num_params=num_params,
                    success_states=[StatusType.SUCCESS, ],
                    impute_censored_data=False,
                    impute_state=None)

            else:
                raise ValueError('Unknown run objective: %s. Should be either '
                                 'quality or runtime.' % self.scenario.run_obj)

        self.solver = SMBO(scenario=scenario,
                           stats=self.stats,
                           initial_design=initial_design,
                           runhistory=runhistory,
                           runhistory2epm=runhistory2epm,
                           intensifier=intensifier,
                           aggregate_func=aggregate_func,
                           num_run=num_run,
                           model=model,
                           acq_optimizer=local_search,
                           acquisition_func=acquisition_function,
                           rng=rng)
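
# Usage sketch: a minimal, hedged end-to-end call of the facade defined above,
# assuming a SMAC3 0.x-style installation.  The toy objective, hyperparameter
# and scenario values below are illustrative assumptions, not taken from the
# example itself.
import numpy as np
from ConfigSpace import ConfigurationSpace
from ConfigSpace.hyperparameters import UniformFloatHyperparameter
from smac.scenario.scenario import Scenario
from smac.facade.smac_facade import SMAC


def quadratic(config):
    # toy objective with its minimum at x = 2
    return (config['x'] - 2.0) ** 2


cs = ConfigurationSpace()
cs.add_hyperparameter(UniformFloatHyperparameter('x', lower=-5, upper=5,
                                                 default_value=0.0))

scenario = Scenario({'cs': cs,
                     'run_obj': 'quality',    # optimize solution quality
                     'runcount-limit': 20,    # number of target algorithm calls
                     'deterministic': 'true'})

# the callable is wrapped by ExecuteTAFuncDict inside the facade
smac = SMAC(scenario=scenario, tae_runner=quadratic,
            rng=np.random.RandomState(42))
incumbent = smac.optimize()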
Exemple #15
0
    def __init__(
            self,
            scenario: Scenario,
            tae_runner: typing.Union[ExecuteTARun, typing.Callable] = None,
            runhistory: RunHistory = None,
            intensifier: Intensifier = None,
            acquisition_function: AbstractAcquisitionFunction = None,
            acquisition_function_optimizer: AcquisitionFunctionMaximizer = None,
            model: AbstractEPM = None,
            runhistory2epm: AbstractRunHistory2EPM = None,
            initial_design: InitialDesign = None,
            initial_configurations: typing.List[Configuration] = None,
            stats: Stats = None,
            restore_incumbent: Configuration = None,
            rng: typing.Union[np.random.RandomState, int] = None,
            smbo_class: SMBO = None,
            run_id: int = 1):
        """Constructor

        Parameters
        ----------
        scenario : ~smac.scenario.scenario.Scenario
            Scenario object
        tae_runner : ~smac.tae.execute_ta_run.ExecuteTARun or callable
            Callable or implementation of
            :class:`~smac.tae.execute_ta_run.ExecuteTARun`. In case a
            callable is passed it will be wrapped by
            :class:`~smac.tae.execute_func.ExecuteTAFuncDict`.
            If not set, it will be initialized with the
            :class:`~smac.tae.execute_ta_run_old.ExecuteTARunOld`.
        runhistory : RunHistory
            runhistory to store all algorithm runs
        intensifier : Intensifier
            intensification object to issue a racing to decide the current
            incumbent
        acquisition_function : ~smac.optimizer.acquisition.AbstractAcquisitionFunction
            Object that implements the :class:`~smac.optimizer.acquisition.AbstractAcquisitionFunction`.
            Will use :class:`~smac.optimizer.acquisition.EI` if not set.
        acquisition_function_optimizer : ~smac.optimizer.ei_optimization.AcquisitionFunctionMaximizer
            Object that implements the :class:`~smac.optimizer.ei_optimization.AcquisitionFunctionMaximizer`.
            Will use :class:`smac.optimizer.ei_optimization.InterleavedLocalAndRandomSearch` if not set.
        model : AbstractEPM
            Model that implements train() and predict(). Will use a
            :class:`~smac.epm.rf_with_instances.RandomForestWithInstances` if not set.
        runhistory2epm : ~smac.runhistory.runhistory2epm.AbstractRunHistory2EPM
            Object that implements the AbstractRunHistory2EPM. If None,
            will use :class:`~smac.runhistory.runhistory2epm.RunHistory2EPM4Cost`
            if objective is quality or
            :class:`~smac.runhistory.runhistory2epm.RunHistory2EPM4LogCost`
            if objective is runtime.
        initial_design : InitialDesign
            initial sampling design
        initial_configurations : typing.List[Configuration]
            list of initial configurations for initial design --
            cannot be used together with initial_design
        stats : Stats
            optional stats object
        rng : np.random.RandomState
            Random number generator
        restore_incumbent : Configuration
            incumbent used if restoring to previous state
        smbo_class : ~smac.optimizer.smbo.SMBO
            Class implementing the SMBO interface which will be used to
            instantiate the optimizer class.
        run_id: int, (default: 1)
            Run ID will be used as subfolder for output_dir.
        """

        self.logger = logging.getLogger(self.__module__ + "." +
                                        self.__class__.__name__)

        aggregate_func = average_cost

        self.output_dir = create_output_directory(scenario, run_id)
        scenario.write()

        # initialize stats object
        if stats:
            self.stats = stats
        else:
            self.stats = Stats(scenario)

        # initialize empty runhistory
        if runhistory is None:
            runhistory = RunHistory(aggregate_func=aggregate_func)
        # inject aggr_func if necessary
        if runhistory.aggregate_func is None:
            runhistory.aggregate_func = aggregate_func

        # initial random number generator
        num_run, rng = self._get_rng(rng=rng)

        # reset random number generator in config space to draw different
        # random configurations with each seed given to SMAC
        scenario.cs.seed(rng.randint(MAXINT))

        # initial Trajectory Logger
        traj_logger = TrajLogger(output_dir=self.output_dir, stats=self.stats)

        # initial EPM
        types, bounds = get_types(scenario.cs, scenario.feature_array)
        if model is None:
            model = RandomForestWithInstances(
                types=types,
                bounds=bounds,
                instance_features=scenario.feature_array,
                seed=rng.randint(MAXINT),
                pca_components=scenario.PCA_DIM)
        # initial acquisition function
        if acquisition_function is None:
            if scenario.run_obj == "runtime":
                acquisition_function = LogEI(model=model)
            else:
                acquisition_function = EI(model=model)
        # inject model if necessary
        if acquisition_function.model is None:
            acquisition_function.model = model

        # initialize optimizer on acquisition function
        if acquisition_function_optimizer is None:
            acquisition_function_optimizer = InterleavedLocalAndRandomSearch(
                acquisition_function, scenario.cs,
                np.random.RandomState(seed=rng.randint(MAXINT)))
        elif not isinstance(
                acquisition_function_optimizer,
                AcquisitionFunctionMaximizer,
        ):
            raise ValueError(
                "Argument 'acquisition_function_optimizer' must be of type"
                "'AcquisitionFunctionMaximizer', but is '%s'" %
                type(acquisition_function_optimizer))

        # initialize tae_runner
        # First case, if tae_runner is None, the target algorithm is a call
        # string in the scenario file
        if tae_runner is None:
            tae_runner = ExecuteTARunOld(
                ta=scenario.ta,
                stats=self.stats,
                run_obj=scenario.run_obj,
                runhistory=runhistory,
                par_factor=scenario.par_factor,
                cost_for_crash=scenario.cost_for_crash)
        # Second case, the tae_runner is a function to be optimized
        elif callable(tae_runner):
            tae_runner = ExecuteTAFuncDict(
                ta=tae_runner,
                stats=self.stats,
                run_obj=scenario.run_obj,
                memory_limit=scenario.memory_limit,
                runhistory=runhistory,
                par_factor=scenario.par_factor,
                cost_for_crash=scenario.cost_for_crash)
        # Third case, if it is an ExecuteTaRun we can simply use the
        # instance. Otherwise, the next check raises an exception
        elif not isinstance(tae_runner, ExecuteTARun):
            raise TypeError("Argument 'tae_runner' is %s, but must be "
                            "either a callable or an instance of "
                            "ExecuteTaRun. Passing 'None' will result in the "
                            "creation of target algorithm runner based on the "
                            "call string in the scenario file." %
                            type(tae_runner))

        # Check that overall objective and tae objective are the same
        if tae_runner.run_obj != scenario.run_obj:
            raise ValueError("Objective for the target algorithm runner and "
                             "the scenario must be the same, but are '%s' and "
                             "'%s'" % (tae_runner.run_obj, scenario.run_obj))

        # inject stats if necessary
        if tae_runner.stats is None:
            tae_runner.stats = self.stats
        # inject runhistory if necessary
        if tae_runner.runhistory is None:
            tae_runner.runhistory = runhistory
        # inject cost_for_crash
        if tae_runner.crash_cost != scenario.cost_for_crash:
            tae_runner.crash_cost = scenario.cost_for_crash

        # initialize intensification
        if intensifier is None:
            intensifier = Intensifier(tae_runner=tae_runner,
                                      stats=self.stats,
                                      traj_logger=traj_logger,
                                      rng=rng,
                                      instances=scenario.train_insts,
                                      cutoff=scenario.cutoff,
                                      deterministic=scenario.deterministic,
                                      run_obj_time=scenario.run_obj == "runtime",
                                      always_race_against=scenario.cs.get_default_configuration() \
                                        if scenario.always_race_default else None,
                                      instance_specifics=scenario.instance_specific,
                                      minR=scenario.minR,
                                      maxR=scenario.maxR)
        # inject deps if necessary
        if intensifier.tae_runner is None:
            intensifier.tae_runner = tae_runner
        if intensifier.stats is None:
            intensifier.stats = self.stats
        if intensifier.traj_logger is None:
            intensifier.traj_logger = traj_logger

        # initial design
        if initial_design is not None and initial_configurations is not None:
            raise ValueError(
                "Either use initial_design or initial_configurations; but not both"
            )

        if initial_configurations is not None:
            initial_design = MultiConfigInitialDesign(
                tae_runner=tae_runner,
                scenario=scenario,
                stats=self.stats,
                traj_logger=traj_logger,
                runhistory=runhistory,
                rng=rng,
                configs=initial_configurations,
                intensifier=intensifier,
                aggregate_func=aggregate_func)
        elif initial_design is None:
            if scenario.initial_incumbent == "DEFAULT":
                initial_design = DefaultConfiguration(tae_runner=tae_runner,
                                                      scenario=scenario,
                                                      stats=self.stats,
                                                      traj_logger=traj_logger,
                                                      rng=rng)
            elif scenario.initial_incumbent == "RANDOM":
                initial_design = RandomConfiguration(tae_runner=tae_runner,
                                                     scenario=scenario,
                                                     stats=self.stats,
                                                     traj_logger=traj_logger,
                                                     rng=rng)
            else:
                raise ValueError("Don't know what kind of initial_incumbent "
                                 "'%s' is" % scenario.initial_incumbent)
        # inject deps if necessary
        if initial_design.tae_runner is None:
            initial_design.tae_runner = tae_runner
        if initial_design.scenario is None:
            initial_design.scenario = scenario
        if initial_design.stats is None:
            initial_design.stats = self.stats
        if initial_design.traj_logger is None:
            initial_design.traj_logger = traj_logger

        # initial conversion of runhistory into EPM data
        if runhistory2epm is None:

            num_params = len(scenario.cs.get_hyperparameters())
            if scenario.run_obj == "runtime":

                # if we log the performance data,
                # the RFRImputator will already get
                # log-transformed data from the runhistory
                cutoff = np.log10(scenario.cutoff)
                threshold = np.log10(scenario.cutoff * scenario.par_factor)

                imputor = RFRImputator(rng=rng,
                                       cutoff=cutoff,
                                       threshold=threshold,
                                       model=model,
                                       change_threshold=0.01,
                                       max_iter=2)

                runhistory2epm = RunHistory2EPM4LogCost(
                    scenario=scenario,
                    num_params=num_params,
                    success_states=[
                        StatusType.SUCCESS,
                    ],
                    impute_censored_data=True,
                    impute_state=[
                        StatusType.CAPPED,
                    ],
                    imputor=imputor)

            elif scenario.run_obj == 'quality':
                runhistory2epm = RunHistory2EPM4Cost(
                    scenario=scenario,
                    num_params=num_params,
                    success_states=[StatusType.SUCCESS, StatusType.CRASHED],
                    impute_censored_data=False,
                    impute_state=None)

            else:
                raise ValueError('Unknown run objective: %s. Should be either '
                                 'quality or runtime.' % self.scenario.run_obj)

        # inject scenario if necessary:
        if runhistory2epm.scenario is None:
            runhistory2epm.scenario = scenario

        smbo_args = {
            'scenario': scenario,
            'stats': self.stats,
            'initial_design': initial_design,
            'runhistory': runhistory,
            'runhistory2epm': runhistory2epm,
            'intensifier': intensifier,
            'aggregate_func': aggregate_func,
            'num_run': num_run,
            'model': model,
            'acq_optimizer': acquisition_function_optimizer,
            'acquisition_func': acquisition_function,
            'rng': rng,
            'restore_incumbent': restore_incumbent
        }
        if smbo_class is None:
            self.solver = SMBO(**smbo_args)
        else:
            self.solver = smbo_class(**smbo_args)
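
# Sketch under assumptions: because the constructor above forwards smbo_args to
# smbo_class when one is given, a custom optimization loop can be injected by
# subclassing SMBO.  LoggingSMBO is a hypothetical name used only for
# illustration; SMBO.run() is the standard SMAC3 0.x entry point.
from smac.optimizer.smbo import SMBO


class LoggingSMBO(SMBO):
    """SMBO variant that only logs before delegating to the normal loop."""

    def run(self):
        self.logger.info("starting custom SMBO loop")
        return super().run()

# With a scenario and an objective function set up as in the previous sketch:
# smac = SMAC(scenario=scenario, tae_runner=quadratic,
#             smbo_class=LoggingSMBO, rng=np.random.RandomState(42))
# incumbent = smac.optimize()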
Exemple #16
0
    def validate_epm(
        self,
        config_mode: Union[str, typing.List[Configuration]] = 'def',
        instance_mode: Union[str, typing.List[str]] = 'test',
        repetitions: int = 1,
        runhistory: RunHistory = None,
        output_fn="",
        reuse_epm=True,
    ) -> RunHistory:
        """
        Use EPM to predict costs/runtimes for unknown config/inst-pairs.

        side effect: if output is specified, saves runhistory to specified
        output directory.

        Parameters
        ----------
        output_fn: str
            path to the runhistory to be saved. If the suffix is not '.json',
            it will be interpreted as a directory and the filename will be
            'validated_runhistory_EPM.json'
        config_mode: str or list<Configuration>
            string or directly a list of Configuration; strings are one of
            [def, inc, def+inc, wallclock_time, cpu_time, all].
            The time modes evaluate at cpu- or wallclock-timesteps of:
            [max_time/2^0, max_time/2^1, max_time/2^2, ..., default]
            with max_time being the highest recorded time
        instance_mode: str or list<str>
            what instances to use for validation, either from
            [train, test, train+test] or directly a list of instances
        repetitions: int
            number of repetitions in nondeterministic algorithms
        runhistory: RunHistory
            optional, RunHistory-object to reuse runs
        reuse_epm: bool
            if true (and if `self.epm`), reuse epm to validate runs

        Returns
        -------
        runhistory: RunHistory
            runhistory with predicted runs
        """
        if not isinstance(runhistory, RunHistory) and (self.epm is None
                                                       or reuse_epm is False):
            raise ValueError(
                "No runhistory specified for validating with EPM!")
        elif reuse_epm is False or self.epm is None:
            # Create RandomForest
            types, bounds = get_types(self.scen.cs, self.scen.feature_array)
            self.epm = RandomForestWithInstances(
                types=types,
                bounds=bounds,
                instance_features=self.scen.feature_array,
                seed=self.rng.randint(MAXINT),
                ratio_features=1.0)
            # Use imputor if objective is runtime
            imputor = None
            impute_state = None
            impute_censored_data = False
            if self.scen.run_obj == 'runtime':
                threshold = self.scen.cutoff * self.scen.par_factor
                imputor = RFRImputator(rng=self.rng,
                                       cutoff=self.scen.cutoff,
                                       threshold=threshold,
                                       model=self.epm)
                impute_censored_data = True
                impute_state = [StatusType.CAPPED]
            # Transform training data (from given rh)
            rh2epm = RunHistory2EPM4Cost(
                num_params=len(self.scen.cs.get_hyperparameters()),
                scenario=self.scen,
                rng=self.rng,
                impute_censored_data=impute_censored_data,
                imputor=imputor,
                impute_state=impute_state)
            X, y = rh2epm.transform(runhistory)
            self.logger.debug("Training model with data of shape X: %s, y:%s",
                              str(X.shape), str(y.shape))
            # Train random forest
            self.epm.train(X, y)

        # Predict desired runs
        runs, rh_epm = self._get_runs(config_mode, instance_mode, repetitions,
                                      runhistory)

        feature_array_size = len(self.scen.cs.get_hyperparameters())
        if self.scen.feature_array is not None:
            feature_array_size += self.scen.feature_array.shape[1]

        X_pred = np.empty((len(runs), feature_array_size))
        for idx, run in enumerate(runs):
            if self.scen.feature_array is not None and run.inst is not None:
                X_pred[idx] = np.hstack([
                    convert_configurations_to_array([run.config])[0],
                    self.scen.feature_dict[run.inst]
                ])
            else:
                X_pred[idx] = convert_configurations_to_array([run.config])[0]
        self.logger.debug("Predicting desired %d runs, data has shape %s",
                          len(runs), str(X_pred.shape))

        y_pred = self.epm.predict(X_pred)

        # Add runs to runhistory
        for run, pred in zip(runs, y_pred[0]):
            rh_epm.add(
                config=run.config,
                cost=float(pred),
                time=float(pred),
                status=StatusType.SUCCESS,
                instance_id=run.inst,
                seed=-1,
                additional_info={"additional_info": "ESTIMATED USING EPM!"})

        if output_fn:
            self._save_results(rh_epm,
                               output_fn,
                               backup_fn="validated_runhistory_EPM.json")
        return rh_epm
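
# Usage sketch (hedged): calling the method above on an already constructed
# validator.  `validator` and `previous_rh` are assumed to exist; the argument
# values simply mirror the docstring, and the output path is illustrative.
predicted_rh = validator.validate_epm(
    config_mode='def+inc',       # default and incumbent configurations
    instance_mode='test',        # predict on the scenario's test instances
    repetitions=1,
    runhistory=previous_rh,      # recorded runs used to train the random forest
    output_fn='validated_runhistory_EPM.json',
    reuse_epm=False)             # force training a fresh EPM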
Exemple #17
0
    def build_pc_smbo(self,
                      tae_runner,
                      stats,
                      scenario,
                      runhistory,
                      aggregate_func,
                      acq_func_name,
                      model_target_names,
                      logging_directory,
                      double_intensification=False,
                      constant_pipeline_steps=None,
                      variable_pipeline_steps=None,
                      cached_pipeline_steps=None,
                      seed=None,
                      intensification_instances=None,
                      num_marginalized_configurations_by_random_search=20,
                      num_configs_for_marginalization=40,
                      random_splitting_number=5,
                      random_splitting_enabled=False):

        # Build intensifier
        rng = np.random.RandomState(seed)
        traj_logger = TrajLogger(logging_directory, stats)
        intensifier = Intensifier(tae_runner=tae_runner,
                                  stats=stats,
                                  traj_logger=traj_logger,
                                  rng=rng,
                                  cutoff=scenario.cutoff,
                                  deterministic=scenario.deterministic,
                                  run_obj_time=scenario.run_obj == "runtime",
                                  run_limit=scenario.ta_run_limit,
                                  instances=intensification_instances,
                                  maxR=len(intensification_instances))

        # Build model
        types, bounds = get_types(scenario.cs, scenario.feature_array)
        #types = get_types(scenario.cs)
        if len(model_target_names) > 1:
            # model_target_names = ['cost','time']
            model = UncorrelatedMultiObjectiveRandomForestWithInstances(
                target_names=model_target_names, bounds=bounds, types=types)
            # UncorrelatedMultiObjectiveRandomForestWithInstances(target_names=model_target_names,
            #                                                    types=types)
        elif len(model_target_names) == 1:
            model = RandomForestWithInstances(types=types, bounds=bounds)
        else:
            model = RandomEPM(rng=rng)
            # model = RandomForestWithInstances(types=types)

        # Build acquisition function, runhistory2epm and local search
        num_params = len(scenario.cs.get_hyperparameters())
        if acq_func_name in ["ei", "pc-ei"]:
            acquisition_func = EI(model)
            acq_func_wrapper = PCAquisitionFunctionWrapper(
                acquisition_func=acquisition_func,
                config_space=scenario.cs,
                runhistory=runhistory,
                constant_pipeline_steps=constant_pipeline_steps,
                variable_pipeline_steps=variable_pipeline_steps)
            runhistory2epm = RunHistory2EPM4Cost(
                scenario, num_params, success_states=[StatusType.SUCCESS])
            local_search = LocalSearch(acquisition_function=acq_func_wrapper,
                                       config_space=scenario.cs)
            select_configuration = SelectConfigurations(
                scenario=scenario,
                stats=stats,
                runhistory=runhistory,
                model=model,
                acq_optimizer=local_search,
                acquisition_func=acq_func_wrapper,
                rng=rng,
                constant_pipeline_steps=constant_pipeline_steps,
                variable_pipeline_steps=variable_pipeline_steps)
        elif acq_func_name in ["m-ei", "pc-m-ei"]:
            #acquisition_func = MEI(model)
            acquisition_func = EI(model)
            acq_func_wrapper = PCAquisitionFunctionWrapper(
                acquisition_func=acquisition_func,
                config_space=scenario.cs,
                runhistory=runhistory,
                constant_pipeline_steps=constant_pipeline_steps,
                variable_pipeline_steps=variable_pipeline_steps)
            runhistory2epm = RunHistory2EPM4Cost(
                scenario, num_params, success_states=[StatusType.SUCCESS])
            local_search = LocalSearch(acquisition_function=acq_func_wrapper,
                                       config_space=scenario.cs)
            # TODO: num_configs_for_marginalization
            select_configuration = SelectConfigurationsWithMarginalization(
                scenario=scenario,
                stats=stats,
                runhistory=runhistory,
                model=model,
                acq_optimizer=local_search,
                acquisition_func=acq_func_wrapper,
                rng=rng,
                constant_pipeline_steps=constant_pipeline_steps,
                variable_pipeline_steps=variable_pipeline_steps,
                num_marginalized_configurations_by_random_search=
                num_marginalized_configurations_by_random_search,
                num_configs_for_marginalization=num_configs_for_marginalization
            )
        elif acq_func_name in ['eips', 'pc-eips']:
            acquisition_func = EIPS(model)
            acq_func_wrapper = PCAquisitionFunctionWrapper(
                acquisition_func=acquisition_func,
                config_space=scenario.cs,
                runhistory=runhistory,
                constant_pipeline_steps=constant_pipeline_steps,
                variable_pipeline_steps=variable_pipeline_steps)
            runhistory2epm = RunHistory2EPM4EIPS(
                scenario, num_params, success_states=[StatusType.SUCCESS])
            local_search = LocalSearch(acquisition_function=acq_func_wrapper,
                                       config_space=scenario.cs)
            select_configuration = SelectConfigurations(
                scenario=scenario,
                stats=stats,
                runhistory=runhistory,
                model=model,
                acq_optimizer=local_search,
                acquisition_func=acq_func_wrapper,
                rng=rng,
                constant_pipeline_steps=constant_pipeline_steps,
                variable_pipeline_steps=variable_pipeline_steps)
        elif acq_func_name in ["m-eips", "pc-m-eips"]:
            acquisition_func = EIPS(model)
            acq_func_wrapper = PCAquisitionFunctionWrapper(
                acquisition_func=acquisition_func,
                config_space=scenario.cs,
                runhistory=runhistory,
                constant_pipeline_steps=constant_pipeline_steps,
                variable_pipeline_steps=variable_pipeline_steps)
            runhistory2epm = RunHistory2EPM4EIPS(
                scenario, num_params, success_states=[StatusType.SUCCESS])
            local_search = LocalSearch(acquisition_function=acq_func_wrapper,
                                       config_space=scenario.cs)
            # TODO: num_configs_for_marginalization
            select_configuration = SelectConfigurationsWithMarginalization(
                scenario=scenario,
                stats=stats,
                runhistory=runhistory,
                model=model,
                acq_optimizer=local_search,
                acquisition_func=acq_func_wrapper,
                rng=rng,
                constant_pipeline_steps=constant_pipeline_steps,
                variable_pipeline_steps=variable_pipeline_steps,
                num_marginalized_configurations_by_random_search=
                num_marginalized_configurations_by_random_search,
                num_configs_for_marginalization=num_configs_for_marginalization
            )
        elif acq_func_name == 'pceips':
            acquisition_func = PCEIPS(model)
            acq_func_wrapper = PCAquisitionFunctionWrapperWithCachingReduction(
                acquisition_func=acquisition_func,
                config_space=scenario.cs,
                runhistory=runhistory,
                constant_pipeline_steps=constant_pipeline_steps,
                variable_pipeline_steps=variable_pipeline_steps,
                cached_pipeline_steps=cached_pipeline_steps)
            runhistory2epm = RunHistory2EPM4EIPS(
                scenario, num_params, success_states=[StatusType.SUCCESS])
            local_search = LocalSearch(acquisition_function=acq_func_wrapper,
                                       config_space=scenario.cs)
            if (constant_pipeline_steps is None or variable_pipeline_steps is None
                    or cached_pipeline_steps is None):
                raise ValueError(
                    "constant_pipeline_steps, variable_pipeline_steps and "
                    "cached_pipeline_steps must not be None when using PCEIPS")
            select_configuration = SelectConfigurations(
                scenario=scenario,
                stats=stats,
                runhistory=runhistory,
                model=model,
                acq_optimizer=local_search,
                acquisition_func=acq_func_wrapper,
                rng=rng,
                constant_pipeline_steps=constant_pipeline_steps,
                variable_pipeline_steps=variable_pipeline_steps)
        elif acq_func_name == 'pc-m-pceips':
            acquisition_func = PCEIPS(model)
            acq_func_wrapper = PCAquisitionFunctionWrapperWithCachingReduction(
                acquisition_func=acquisition_func,
                config_space=scenario.cs,
                runhistory=runhistory,
                constant_pipeline_steps=constant_pipeline_steps,
                variable_pipeline_steps=variable_pipeline_steps,
                cached_pipeline_steps=cached_pipeline_steps)
            runhistory2epm = RunHistory2EPM4EIPS(
                scenario, num_params, success_states=[StatusType.SUCCESS])
            local_search = LocalSearch(acquisition_function=acq_func_wrapper,
                                       config_space=scenario.cs)
            if (constant_pipeline_steps is None or variable_pipeline_steps is None
                    or cached_pipeline_steps is None):
                raise ValueError(
                    "constant_pipeline_steps, variable_pipeline_steps and "
                    "cached_pipeline_steps must not be None when using PCEIPS")
            select_configuration = SelectConfigurationsWithMarginalization(
                scenario=scenario,
                stats=stats,
                runhistory=runhistory,
                model=model,
                acq_optimizer=local_search,
                acquisition_func=acq_func_wrapper,
                rng=rng,
                constant_pipeline_steps=constant_pipeline_steps,
                variable_pipeline_steps=variable_pipeline_steps,
                num_marginalized_configurations_by_random_search=
                num_marginalized_configurations_by_random_search,
                num_configs_for_marginalization=num_configs_for_marginalization
            )
        elif acq_func_name == "roar":
            runhistory2epm = RunHistory2EPM4Cost(
                scenario, num_params, success_states=[StatusType.SUCCESS])
            select_configuration = SelectConfigurationsRandom(
                scenario=scenario)
        elif acq_func_name == "pc-roar-mrs":
            runhistory2epm = RunHistory2EPM4Cost(
                scenario, num_params, success_states=[StatusType.SUCCESS])
            select_configuration = SelectConfigurationsMRS(
                scenario=scenario,
                constant_pipeline_steps=constant_pipeline_steps,
                variable_pipeline_steps=variable_pipeline_steps,
                splitting_number=random_splitting_number,
                random_splitting_enabled=random_splitting_enabled)
        elif acq_func_name == "pc-roar-sigmoid-rs":
            runhistory2epm = RunHistory2EPM4Cost(
                scenario, num_params, success_states=[StatusType.SUCCESS])
            select_configuration = SelectConfigurationsSigmoidRS(
                scenario=scenario,
                constant_pipeline_steps=constant_pipeline_steps,
                variable_pipeline_steps=variable_pipeline_steps,
                fraction=random_splitting_number)
        else:
            # Not a valid acquisition function
            raise ValueError("The provided acquisition function '%s' is not "
                             "valid" % acq_func_name)

        # Build initial design
        # initial_design = RandomConfiguration(tae_runner=tae_runner,
        #                                      scenario=scenario,
        #                                      stats=stats,
        #                                      traj_logger=traj_logger,
        #                                      rng=rng)
        initial_configs = scenario.cs.sample_configuration(size=2)
        for config in initial_configs:
            config._populate_values()
        initial_design = MultiConfigInitialDesign(
            tae_runner=tae_runner,
            scenario=scenario,
            stats=stats,
            traj_logger=traj_logger,
            runhistory=runhistory,
            rng=rng,
            configs=initial_configs,
            intensifier=intensifier,
            aggregate_func=aggregate_func)

        # run id
        num_run = rng.randint(1234567980)

        # Build pc_smbo
        if acq_func_name not in ['pc-roar-sigmoid-rs']:
            smbo = PCSMBO(scenario=scenario,
                          stats=stats,
                          initial_design=initial_design,
                          runhistory=runhistory,
                          runhistory2epm=runhistory2epm,
                          intensifier=intensifier,
                          aggregate_func=aggregate_func,
                          num_run=num_run,
                          model=model,
                          rng=rng,
                          select_configuration=select_configuration,
                          double_intensification=double_intensification)
        else:
            smbo = PCSMBOSigmoidRandomSearch(
                scenario=scenario,
                stats=stats,
                initial_design=initial_design,
                runhistory=runhistory,
                runhistory2epm=runhistory2epm,
                intensifier=intensifier,
                aggregate_func=aggregate_func,
                num_run=num_run,
                model=model,
                rng=rng,
                select_configuration=select_configuration)

        return smbo
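
# Hedged usage sketch: wiring build_pc_smbo with the plain "ei" strategy.  The
# builder, tae runner, stats, scenario, runhistory and the average_cost
# aggregation function are assumed to be set up elsewhere (as in the other
# examples); the pipeline step names and instance list are placeholders.
smbo = builder.build_pc_smbo(
    tae_runner=tae,
    stats=stats,
    scenario=scenario,
    runhistory=runhistory,
    aggregate_func=average_cost,
    acq_func_name='ei',                       # single-objective EI branch above
    model_target_names=['cost'],              # one target -> RandomForestWithInstances
    logging_directory='./logs',
    constant_pipeline_steps=['preprocessor'],     # placeholder step names
    variable_pipeline_steps=['classifier'],
    seed=3,
    intensification_instances=[1])
# incumbent = smbo.run()   # assuming PCSMBO keeps SMBO's run() entry point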
Exemple #18
0
    def run_smbo(self, max_iters=1000):
        global evaluator

        self.watcher.start_task('SMBO')

        # == first things first: load the datamanager
        self.reset_data_manager()
        
        # == Initialize non-SMBO stuff
        # first create a scenario
        seed = self.seed
        self.config_space.seed(seed)
        num_params = len(self.config_space.get_hyperparameters())
        # allocate a run history
        num_run = self.start_num_run
        instance_id = self.dataset_name + SENTINEL

        # Initialize some SMAC dependencies
        runhistory = RunHistory(aggregate_func=average_cost)
        # meta_runhistory = RunHistory(aggregate_func=average_cost)
        # meta_runs_dataset_indices = {}

        # == METALEARNING suggestions
        # we start by evaluating the defaults on the full dataset again
        # and add the suggestions from metalearning behind it

        if self.metadata_directory is None:
            metalearning_directory = os.path.dirname(
                autosklearn.metalearning.__file__)
            # There is no multilabel data in OpenML
            if self.task == MULTILABEL_CLASSIFICATION:
                meta_task = BINARY_CLASSIFICATION
            else:
                meta_task = self.task
            metadata_directory = os.path.join(
                metalearning_directory, 'files',
                '%s_%s_%s' % (METRIC_TO_STRING[self.metric],
                              TASK_TYPES_TO_STRING[meta_task],
                              'sparse' if self.datamanager.info['is_sparse']
                              else 'dense'))
            self.metadata_directory = metadata_directory

        self.logger.info('Metadata directory: %s', self.metadata_directory)
        meta_base = MetaBase(self.config_space, self.metadata_directory)

        metafeature_calculation_time_limit = int(
            self.total_walltime_limit / 4)
        metafeature_calculation_start_time = time.time()
        meta_features = self._calculate_metafeatures_with_limits(
            metafeature_calculation_time_limit)
        metafeature_calculation_end_time = time.time()
        metafeature_calculation_time_limit -= (
            metafeature_calculation_end_time -
            metafeature_calculation_start_time)

        if metafeature_calculation_time_limit < 1:
            self.logger.warning('Time limit for metafeature calculation less '
                                'than 1 second (%f). Skipping calculation '
                                'of metafeatures for encoded dataset.',
                                metafeature_calculation_time_limit)
            meta_features_encoded = None
        else:
            with warnings.catch_warnings():
                warnings.showwarning = self._send_warnings_to_log
                self.datamanager.perform1HotEncoding()
            meta_features_encoded = \
                self._calculate_metafeatures_encoded_with_limits(
                    metafeature_calculation_time_limit)

        # In case there is a problem calculating the encoded meta-features
        if meta_features is None:
            if meta_features_encoded is not None:
                meta_features = meta_features_encoded
        else:
            if meta_features_encoded is not None:
                meta_features.metafeature_values.update(
                    meta_features_encoded.metafeature_values)

        if meta_features is not None:
            meta_base.add_dataset(instance_id, meta_features)
            # Do mean imputation of the meta-features - should be done
            # specifically for each prediction model!
            all_metafeatures = meta_base.get_metafeatures(
                features=list(meta_features.keys()))
            all_metafeatures.fillna(all_metafeatures.mean(), inplace=True)

            with warnings.catch_warnings():
                warnings.showwarning = self._send_warnings_to_log
                metalearning_configurations = self.collect_metalearning_suggestions(
                    meta_base)
            if metalearning_configurations is None:
                metalearning_configurations = []
            self.reset_data_manager()

            self.logger.info('%s', meta_features)

            # Convert meta-features into a dictionary because the scenario
            # expects a dictionary
            meta_features_dict = {}
            for dataset, series in all_metafeatures.iterrows():
                meta_features_dict[dataset] = series.values
            meta_features_list = []
            for meta_feature_name in all_metafeatures.columns:
                meta_features_list.append(meta_features[meta_feature_name].value)
            meta_features_list = np.array(meta_features_list).reshape((1, -1))
            self.logger.info(list(meta_features_dict.keys()))

            #meta_runs = meta_base.get_all_runs(METRIC_TO_STRING[self.metric])
            #meta_runs_index = 0
            #try:
            #    meta_durations = meta_base.get_all_runs('runtime')
            #    read_runtime_data = True
            #except KeyError:
            #    read_runtime_data = False
            #    self.logger.critical('Cannot read runtime data.')
            #    if self.acquisition_function == 'EIPS':
            #        self.logger.critical('Reverting to acquisition function EI!')
            #        self.acquisition_function = 'EI'

            # for meta_dataset in meta_runs.index:
            #     meta_dataset_start_index = meta_runs_index
            #     for meta_configuration in meta_runs.columns:
            #         if np.isfinite(meta_runs.loc[meta_dataset, meta_configuration]):
            #             try:
            #                 config = meta_base.get_configuration_from_algorithm_index(
            #                     meta_configuration)
            #                 cost = meta_runs.loc[meta_dataset, meta_configuration]
            #                 if read_runtime_data:
            #                     runtime = meta_durations.loc[meta_dataset,
            #                                                  meta_configuration]
            #                 else:
            #                     runtime = 1
            #                 # TODO read out other status types!
            #                 meta_runhistory.add(config, cost, runtime,
            #                                     StatusType.SUCCESS,
            #                                     instance_id=meta_dataset)
            #                 meta_runs_index += 1
            #             except:
            #                 # TODO maybe add warning
            #                 pass
            #
            #     meta_runs_dataset_indices[meta_dataset] = (
            #         meta_dataset_start_index, meta_runs_index)
        else:
            if self.acquisition_function == 'EIPS':
                self.logger.critical('Reverting to acquisition function EI!')
                self.acquisition_function = 'EI'
            meta_features_list = []
            meta_features_dict = {}
            metalearning_configurations = []

        self.scenario = Scenario({'cs': self.config_space,
                                  'cutoff-time': self.func_eval_time_limit,
                                  'memory-limit': self.memory_limit,
                                  'wallclock-limit': self.total_walltime_limit,
                                  #'instances': [[name] for name in meta_features_dict],
                                  'output-dir': self.backend.temporary_directory,
                                  'shared-model': self.shared_mode,
                                  'run-obj': 'quality',
                                  'deterministic': 'true'})

        # TODO rebuild target algorithm to be its own target algorithm
        # evaluator, which takes into account that a run can be killed prior
        # to the model being fully fitted; thus putting intermediate results
        # into a queue and querying them once the time is over
        ta = ExecuteTaFuncWithQueue(backend=self.backend,
                                    autosklearn_seed=seed,
                                    resampling_strategy=self.resampling_strategy,
                                    initial_num_run=num_run,
                                    logger=self.logger,
                                    **self.resampling_strategy_args)

        types = get_types(self.config_space, self.scenario.feature_array)

        # TODO extract generation of SMAC object into its own function for
        # testing
        if self.acquisition_function == 'EI':
            model = RandomForestWithInstances(types,
                                              #instance_features=meta_features_list,
                                              seed=1, num_trees=10)
            smac = SMAC(scenario=self.scenario,
                        model=model,
                        rng=seed,
                        tae_runner=ta,
                        runhistory=runhistory)
        elif self.acquisition_function == 'EIPS':
            rh2EPM = RunHistory2EPM4EIPS(num_params=num_params,
                                         scenario=self.scenario,
                                         success_states=None,
                                         impute_censored_data=False,
                                         impute_state=None)
            model = UncorrelatedMultiObjectiveRandomForestWithInstances(
                ['cost', 'runtime'], types, num_trees=10,
                instance_features=meta_features_list, seed=1)
            acquisition_function = EIPS(model)
            smac = SMAC(scenario=self.scenario, tae_runner=ta,
                        acquisition_function=acquisition_function,
                        model=model, runhistory2epm=rh2EPM, rng=seed,
                        runhistory=runhistory)
        else:
            raise ValueError('Unknown acquisition function value %s!' %
                             self.acquisition_function)

        smac.solver.stats.start_timing()
        smac.solver.incumbent = smac.solver.initial_design.run()

        # Build a runtime model
        # runtime_rf = RandomForestWithInstances(types,
        #                                        instance_features=meta_features_list,
        #                                        seed=1, num_trees=10)
        # runtime_rh2EPM = RunHistory2EPM4EIPS(num_params=num_params,
        #                                      scenario=self.scenario,
        #                                      success_states=None,
        #                                      impute_censored_data=False,
        #                                      impute_state=None)
        # X_runtime, y_runtime = runtime_rh2EPM.transform(meta_runhistory)
        # runtime_rf.train(X_runtime, y_runtime[:, 1].flatten())
        # X_meta, Y_meta = rh2EPM.transform(meta_runhistory)
        # # Transform Y_meta on a per-dataset base
        # for meta_dataset in meta_runs_dataset_indices:
        #     start_index, end_index = meta_runs_dataset_indices[meta_dataset]
        #     end_index += 1  # Python indexing
        #     Y_meta[start_index:end_index, 0]\
        #         [Y_meta[start_index:end_index, 0] >2.0] =  2.0
        #     dataset_minimum = np.min(Y_meta[start_index:end_index, 0])
        #     Y_meta[start_index:end_index, 0] = 1 - (
        #         (1. - Y_meta[start_index:end_index, 0]) /
        #         (1. - dataset_minimum))
        #     Y_meta[start_index:end_index, 0]\
        #           [Y_meta[start_index:end_index, 0] > 2] = 2

        smac.solver.stats.start_timing()
        # == first, evaluate all metalearning and default configurations
        smac.solver.incumbent = smac.solver.initial_design.run()

        for challenger in metalearning_configurations:

            smac.solver.incumbent, inc_perf = smac.solver.intensifier.intensify(
                challengers=[challenger],
                incumbent=smac.solver.incumbent,
                run_history=smac.solver.runhistory,
                aggregate_func=smac.solver.aggregate_func,
                time_bound=self.total_walltime_limit)

            if smac.solver.scenario.shared_model:
                pSMAC.write(run_history=smac.solver.runhistory,
                            output_directory=smac.solver.scenario.output_dir,
                            num_run=self.seed)

            if smac.solver.stats.is_budget_exhausted():
                break

        # == after metalearning run SMAC loop
        while True:
            if smac.solver.scenario.shared_model:
                pSMAC.read(run_history=smac.solver.runhistory,
                           output_directory=self.scenario.output_dir,
                           configuration_space=self.config_space,
                           logger=self.logger)

            choose_next_start_time = time.time()
            try:
                challengers = self.choose_next(smac)
            except Exception as e:
                self.logger.error(e)
                self.logger.error("Error in getting next configurations "
                                  "with SMAC. Using random configuration!")
                next_config = self.config_space.sample_configuration()
                challengers = [next_config]
            time_for_choose_next = time.time() - choose_next_start_time
            self.logger.info('Used %g seconds to find next '
                             'configurations' % (time_for_choose_next))

            smac.solver.incumbent, inc_perf = smac.solver.intensifier.intensify(
                challengers=challengers,
                incumbent=smac.solver.incumbent,
                run_history=smac.solver.runhistory,
                aggregate_func=smac.solver.aggregate_func,
                time_bound=time_for_choose_next)

            if smac.solver.scenario.shared_model:
                pSMAC.write(run_history=smac.solver.runhistory,
                            output_directory=smac.solver.scenario.output_dir,
                            num_run=self.seed)

            if smac.solver.stats.is_budget_exhausted():
                break

        self.runhistory = smac.solver.runhistory
        return self.runhistory
Exemple #19
0
    def __init__(self, model_type='gp_mcmc', **kwargs):
        """
        Constructor
        see ~smac.facade.smac_facade for documentation
        """
        scenario = kwargs['scenario']
        if scenario.initial_incumbent not in ['LHD', 'FACTORIAL', 'SOBOL']:
            scenario.initial_incumbent = 'SOBOL'

        if scenario.transform_y == 'NONE':
            scenario.transform_y = "LOGS"

        if kwargs.get('model') is None:
            _, rng = get_rng(rng=kwargs.get("rng", None),
                             run_id=kwargs.get("run_id", None),
                             logger=None)

            cov_amp = 2
            types, bounds = get_types(kwargs['scenario'].cs,
                                      instance_features=None)
            n_dims = len(types)

            initial_ls = np.ones([n_dims])
            exp_kernel = george.kernels.Matern52Kernel(initial_ls, ndim=n_dims)
            kernel = cov_amp * exp_kernel

            prior = DefaultPrior(len(kernel) + 1, rng=rng)

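            # Number of hyperparameter samples for the MCMC-marginalized GP
            # below; rounded up to an even value (assumption: the underlying
            # ensemble sampler requires an even number of walkers).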
            n_hypers = 3 * len(kernel)
            if n_hypers % 2 == 1:
                n_hypers += 1

            if model_type == "gp":
                model = GaussianProcess(
                    types=types,
                    bounds=bounds,
                    kernel=kernel,
                    prior=prior,
                    rng=rng,
                    normalize_output=True,
                    normalize_input=True,
                )
            elif model_type == "gp_mcmc":
                model = GaussianProcessMCMC(
                    types=types,
                    bounds=bounds,
                    kernel=kernel,
                    prior=prior,
                    n_hypers=n_hypers,
                    chain_length=200,
                    burnin_steps=100,
                    normalize_input=True,
                    normalize_output=True,
                    rng=rng,
                )
            else:
                raise ValueError("Unknown model_type '%s'; expected 'gp' or "
                                 "'gp_mcmc'." % model_type)
            kwargs['model'] = model
        super().__init__(**kwargs)

        if self.solver.scenario.n_features > 0:
            raise NotImplementedError("BOGP cannot handle instances")

        self.logger.info(self.__class__)

        self.solver.random_configuration_chooser.prob = 0.0

        # only 1 configuration per SMBO iteration
        self.solver.scenario.intensification_percentage = 1e-10
        self.solver.intensifier.min_chall = 1

        # improve acquisition function optimization:
        # 1. increase number of sls iterations
        self.solver.acq_optimizer.n_sls_iterations = 100
        # 2. more randomly sampled configurations
        self.solver.scenario.acq_opt_challengers = 1000

        # activate predict incumbent
        self.solver.predict_incumbent = True
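
    # A minimal usage sketch for this GP-based facade (assumptions: the class
    # name `BOGP` and the toy objective below are illustrative only; the
    # scenario keys follow the standard smac.scenario.scenario.Scenario API):
    #
    #   from ConfigSpace import ConfigurationSpace
    #   from ConfigSpace.hyperparameters import UniformFloatHyperparameter
    #   from smac.scenario.scenario import Scenario
    #
    #   def toy_objective(config):
    #       return (config['x'] - 0.3) ** 2
    #
    #   cs = ConfigurationSpace()
    #   cs.add_hyperparameter(UniformFloatHyperparameter('x', 0.0, 1.0))
    #   scenario = Scenario({'cs': cs, 'run_obj': 'quality',
    #                        'runcount_limit': 20, 'deterministic': 'true'})
    #   bo = BOGP(model_type='gp_mcmc', scenario=scenario,
    #             tae_runner=toy_objective)
    #   incumbent = bo.optimize()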
Exemple #20
0
    def plot_cost_over_time(self,
                            rh,
                            traj,
                            output="performance_over_time.png",
                            validator=None):
        """ Plot performance over time, using all trajectory entries
            with max_time = wallclock_limit or (if inf) the highest
            recorded time

            Parameters
            ----------
            rh: RunHistory
                runhistory to use
            traj: List
                trajectory to take times/incumbents from
            output: str
                path to output-png
            epm: RandomForestWithInstances
                emperical performance model (expecting trained on all runs)
        """
        self.logger.debug("Estimating costs over time for best run.")
        validator.traj = traj  # set trajectory
        time, configs = [], []

        for entry in traj:
            time.append(entry["wallclock_time"])
            configs.append(entry["incumbent"])

        self.logger.debug("Using %d samples (%d distinct) from trajectory.",
                          len(time), len(set(configs)))

        if validator.epm:  # not log as validator epm is trained on cost, not log cost
            epm = validator.epm
        else:
            self.logger.debug(
                "No EPM passed! Training new one from runhistory.")
            # Train random forest and transform training data (from given rh)
            # Not using validator because we want to plot uncertainties
            rh2epm = RunHistory2EPM4Cost(
                num_params=len(self.scenario.cs.get_hyperparameters()),
                scenario=self.scenario)
            X, y = rh2epm.transform(rh)
            self.logger.debug("Training model with data of shape X: %s, y:%s",
                              str(X.shape), str(y.shape))

            types, bounds = get_types(self.scenario.cs,
                                      self.scenario.feature_array)
            epm = RandomForestWithInstances(
                types=types,
                bounds=bounds,
                instance_features=self.scenario.feature_array,
                #seed=self.rng.randint(MAXINT),
                ratio_features=1.0)
            epm.train(X, y)

        ## not necessary right now since the EPM only knows the features
        ## of the training instances
        # use only training instances
        #=======================================================================
        # if self.scenario.feature_dict:
        #     feat_array = []
        #     for inst in self.scenario.train_insts:
        #         feat_array.append(self.scenario.feature_dict[inst])
        #     backup_features_epm = epm.instance_features
        #     epm.instance_features = np.array(feat_array)
        #=======================================================================

        # predict performance for all configurations in trajectory
        config_array = convert_configurations_to_array(configs)
        mean, var = epm.predict_marginalized_over_instances(config_array)

        #=======================================================================
        # # restore feature array in epm
        # if self.scenario.feature_dict:
        #     epm.instance_features = backup_features_epm
        #=======================================================================

        mean = mean[:, 0]
        var = var[:, 0]
        uncertainty_upper = mean + np.sqrt(var)
        uncertainty_lower = mean - np.sqrt(var)
        if self.scenario.run_obj == 'runtime':  # We have to clip at 0 as we want to put y on the logscale
            uncertainty_lower[uncertainty_lower < 0] = 0
            uncertainty_upper[uncertainty_upper < 0] = 0

        # plot
        fig = plt.figure()
        ax = fig.add_subplot(111)

        ax.set_ylabel('performance')
        ax.set_xlabel('time [sec]')
        ax.plot(time, mean, 'r-', label="estimated performance")
        ax.fill_between(time,
                        uncertainty_upper,
                        uncertainty_lower,
                        alpha=0.8,
                        label="standard deviation")
        ax.set_xscale("log", nonposx='clip')
        if self.scenario.run_obj == 'runtime':
            ax.set_yscale('log')

        # ax.set_ylim(min(mean)*0.8, max(mean)*1.2)
        # start after 1% of the configuration budget
        ax.set_xlim(min(time) + (max(time) - min(time)) * 0.01, max(time))

        ax.legend()
        plt.tight_layout()
        fig.savefig(output)
        plt.close(fig)
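
    # A usage sketch for plot_cost_over_time (assumptions: the file names and
    # the Validator constructor call are illustrative; adjust to the installed
    # SMAC version). The validator's EPM is only used if it has already been
    # trained; otherwise the method fits a fresh RandomForestWithInstances
    # from the runhistory:
    #
    #   from smac.optimizer.objective import average_cost
    #   from smac.runhistory.runhistory import RunHistory
    #   from smac.utils.io.traj_logging import TrajLogger
    #   from smac.utils.validate import Validator
    #
    #   rh = RunHistory(aggregate_func=average_cost)
    #   rh.load_json('runhistory.json', self.scenario.cs)
    #   traj = TrajLogger.read_traj_aiclib_format(fn='traj_aclib2.json',
    #                                             cs=self.scenario.cs)
    #   validator = Validator(self.scenario, traj)  # assumed signature
    #   self.plot_cost_over_time(rh, traj, validator=validator)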
def run_experiment():
    pipeline_space = PipelineSpace()
    o_s = OneHotEncodingStep()
    i_s = ImputationStep()
    r_s = RescalingStep()
    b_s = BalancingStep()
    p_s = PreprocessingStep()
    c_s = ClassificationStep()
    pipeline_space.add_pipeline_steps([o_s, i_s, r_s, b_s, p_s, c_s])

    runhistory = PCRunHistory(average_cost)

    cs_builder = ConfigSpaceBuilder(pipeline_space)
    config_space = cs_builder.build_config_space()

    args = {
        'cs': config_space,
        'run_obj': "quality",
        'runcount_limit': 100,
        'wallclock_limit': 100,
        'memory_limit': 100,
        'cutoff_time': 100,
        'deterministic': "true"
    }
    scenario = Scenario(args)

    # Build stats
    stats = Stats(scenario, output_dir=None, stamp="")

    types, bounds = get_types(scenario.cs, scenario.feature_array)

    model = RandomForestWithInstances(types=types, bounds=bounds)

    constant_pipeline_steps = [
        "one_hot_encoder", "imputation", "rescaling", "balancing",
        "feature_preprocessor"
    ]
    variable_pipeline_steps = ["classifier"]
    rng = np.random.RandomState()
    num_params = len(scenario.cs.get_hyperparameters())

    acquisition_func = EI(model)
    acq_func_wrapper = PCAquisitionFunctionWrapper(
        acquisition_func=acquisition_func,
        config_space=scenario.cs,
        runhistory=runhistory,
        constant_pipeline_steps=constant_pipeline_steps,
        variable_pipeline_steps=variable_pipeline_steps)
    runhistory2epm = RunHistory2EPM4Cost(scenario,
                                         num_params,
                                         success_states=[StatusType.SUCCESS])
    local_search = LocalSearch(acquisition_function=acq_func_wrapper,
                               config_space=scenario.cs)
    select_configuration = SelectConfigurationsWithMarginalization(
        scenario=scenario,
        stats=stats,
        runhistory=runhistory,
        model=model,
        acq_optimizer=local_search,
        acquisition_func=acq_func_wrapper,
        rng=rng,
        constant_pipeline_steps=constant_pipeline_steps,
        variable_pipeline_steps=variable_pipeline_steps,
        num_marginalized_configurations_by_random_search=40,
        num_configs_for_marginalization=200)

    # sample configurations to fill runhistory
    sample_configs = config_space.sample_configuration(size=10)
    for config in sample_configs:
        runhistory.add(config, 1, 1, StatusType.SUCCESS)

    # test select_configurations procedure
    X, Y = runhistory2epm.transform(runhistory)
    challengers = select_configuration.run(
        X,
        Y,
        sample_configs[0],
        num_configurations_by_random_search_sorted=100,
        num_configurations_by_local_search=10,
        random_leaf_size=1)

    print(challengers[0])
Exemple #22
0
    def __init__(
            self,
            scenario: Scenario,
            # TODO: once we drop python3.4 add type hint
            # typing.Union[ExecuteTARun, callable]
            tae_runner=None,
            runhistory: RunHistory = None,
            intensifier: Intensifier = None,
            acquisition_function: AbstractAcquisitionFunction = None,
            model: AbstractEPM = None,
            runhistory2epm: AbstractRunHistory2EPM = None,
            initial_design: InitialDesign = None,
            initial_configurations: typing.List[Configuration] = None,
            stats: Stats = None,
            rng: np.random.RandomState = None,
            run_id: int = 1):
        """Constructor"""
        self.logger = logging.getLogger(self.__module__ + "." +
                                        self.__class__.__name__)

        aggregate_func = average_cost
        self.runhistory = None
        self.trajectory = None

        # initialize stats object
        if stats:
            self.stats = stats
        else:
            self.stats = Stats(scenario)

        self.output_dir = create_output_directory(scenario, run_id)
        scenario.write()

        # initialize empty runhistory
        if runhistory is None:
            runhistory = RunHistory(aggregate_func=aggregate_func)
        # inject aggr_func if necessary
        if runhistory.aggregate_func is None:
            runhistory.aggregate_func = aggregate_func

        # initial random number generator
        num_run, rng = self._get_rng(rng=rng)

        # reset random number generator in config space to draw different
        # random configurations with each seed given to SMAC
        scenario.cs.seed(rng.randint(MAXINT))

        # initial Trajectory Logger
        traj_logger = TrajLogger(output_dir=self.output_dir, stats=self.stats)

        # initial EPM
        types, bounds = get_types(scenario.cs, scenario.feature_array)
        if model is None:
            model = RandomForestWithInstances(
                types=types,
                bounds=bounds,
                instance_features=scenario.feature_array,
                seed=rng.randint(MAXINT),
                pca_components=scenario.PCA_DIM,
                num_trees=scenario.rf_num_trees,
                do_bootstrapping=scenario.rf_do_bootstrapping,
                ratio_features=scenario.rf_ratio_features,
                min_samples_split=scenario.rf_min_samples_split,
                min_samples_leaf=scenario.rf_min_samples_leaf,
                max_depth=scenario.rf_max_depth)
        # initial acquisition function
        if acquisition_function is None:
            if scenario.run_obj == "runtime":
                acquisition_function = LogEI(model=model)
            else:
                acquisition_function = EI(model=model)
        # inject model if necessary
        if acquisition_function.model is None:
            acquisition_function.model = model

        # initialize optimizer on acquisition function
        local_search = LocalSearch(
            acquisition_function,
            scenario.cs,
            max_steps=scenario.sls_max_steps,
            n_steps_plateau_walk=scenario.sls_n_steps_plateau_walk)

        # initialize tae_runner
        # First case, if tae_runner is None, the target algorithm is a call
        # string in the scenario file
        if tae_runner is None:
            tae_runner = ExecuteTARunOld(
                ta=scenario.ta,
                stats=self.stats,
                run_obj=scenario.run_obj,
                runhistory=runhistory,
                par_factor=scenario.par_factor,
                cost_for_crash=scenario.cost_for_crash)
        # Second case, the tae_runner is a function to be optimized
        elif callable(tae_runner):
            tae_runner = ExecuteTAFuncDict(
                ta=tae_runner,
                stats=self.stats,
                run_obj=scenario.run_obj,
                memory_limit=scenario.memory_limit,
                runhistory=runhistory,
                par_factor=scenario.par_factor,
                cost_for_crash=scenario.cost_for_crash)
        # Third case, if it is an ExecuteTaRun we can simply use the
        # instance. Otherwise, the next check raises an exception
        elif not isinstance(tae_runner, ExecuteTARun):
            raise TypeError("Argument 'tae_runner' is %s, but must be "
                            "either a callable or an instance of "
                            "ExecuteTaRun. Passing 'None' will result in the "
                            "creation of target algorithm runner based on the "
                            "call string in the scenario file." %
                            type(tae_runner))

        # Check that overall objective and tae objective are the same
        if tae_runner.run_obj != scenario.run_obj:
            raise ValueError("Objective for the target algorithm runner and "
                             "the scenario must be the same, but are '%s' and "
                             "'%s'" % (tae_runner.run_obj, scenario.run_obj))

        # inject stats if necessary
        if tae_runner.stats is None:
            tae_runner.stats = self.stats
        # inject runhistory if necessary
        if tae_runner.runhistory is None:
            tae_runner.runhistory = runhistory
        # inject cost_for_crash
        if tae_runner.crash_cost != scenario.cost_for_crash:
            tae_runner.crash_cost = scenario.cost_for_crash

        # initialize intensification
        if intensifier is None:
            intensifier = Intensifier(
                tae_runner=tae_runner,
                stats=self.stats,
                traj_logger=traj_logger,
                rng=rng,
                instances=scenario.train_insts,
                cutoff=scenario.cutoff,
                deterministic=scenario.deterministic,
                run_obj_time=scenario.run_obj == "runtime",
                always_race_against=scenario.cs.get_default_configuration()
                if scenario.always_race_default else None,
                instance_specifics=scenario.instance_specific,
                minR=scenario.minR,
                maxR=scenario.maxR,
                adaptive_capping_slackfactor=(
                    scenario.intens_adaptive_capping_slackfactor),
                min_chall=scenario.intens_min_chall)
        # inject deps if necessary
        if intensifier.tae_runner is None:
            intensifier.tae_runner = tae_runner
        if intensifier.stats is None:
            intensifier.stats = self.stats
        if intensifier.traj_logger is None:
            intensifier.traj_logger = traj_logger

        # initial design
        if initial_design is not None and initial_configurations is not None:
            raise ValueError(
                "Either use initial_design or initial_configurations; but not both"
            )

        if initial_configurations is not None:
            initial_design = MultiConfigInitialDesign(
                tae_runner=tae_runner,
                scenario=scenario,
                stats=self.stats,
                traj_logger=traj_logger,
                runhistory=runhistory,
                rng=rng,
                configs=initial_configurations,
                intensifier=intensifier,
                aggregate_func=aggregate_func)
        elif initial_design is None:
            if scenario.initial_incumbent == "DEFAULT":
                initial_design = DefaultConfiguration(tae_runner=tae_runner,
                                                      scenario=scenario,
                                                      stats=self.stats,
                                                      traj_logger=traj_logger,
                                                      rng=rng)
            elif scenario.initial_incumbent == "RANDOM":
                initial_design = RandomConfiguration(tae_runner=tae_runner,
                                                     scenario=scenario,
                                                     stats=self.stats,
                                                     traj_logger=traj_logger,
                                                     rng=rng)
            else:
                raise ValueError("Don't know what kind of initial_incumbent "
                                 "'%s' is" % scenario.initial_incumbent)
        # inject deps if necessary
        if initial_design.tae_runner is None:
            initial_design.tae_runner = tae_runner
        if initial_design.scenario is None:
            initial_design.scenario = scenario
        if initial_design.stats is None:
            initial_design.stats = self.stats
        if initial_design.traj_logger is None:
            initial_design.traj_logger = traj_logger

        # initial conversion of runhistory into EPM data
        if runhistory2epm is None:

            num_params = len(scenario.cs.get_hyperparameters())
            if scenario.run_obj == "runtime":

                # if we log the performance data,
                # the RFRImputator will already get
                # log transform data from the runhistory
                cutoff = np.log(scenario.cutoff)
                threshold = np.log(scenario.cutoff * scenario.par_factor)

                imputor = RFRImputator(rng=rng,
                                       cutoff=cutoff,
                                       threshold=threshold,
                                       model=model,
                                       change_threshold=0.01,
                                       max_iter=2)

                runhistory2epm = RunHistory2EPM4LogCost(
                    scenario=scenario,
                    num_params=num_params,
                    success_states=[
                        StatusType.SUCCESS,
                    ],
                    impute_censored_data=True,
                    impute_state=[
                        StatusType.CAPPED,
                    ],
                    imputor=imputor)

            elif scenario.run_obj == 'quality':
                runhistory2epm = RunHistory2EPM4Cost(
                    scenario=scenario,
                    num_params=num_params,
                    success_states=[
                        StatusType.SUCCESS,
                    ],
                    impute_censored_data=False,
                    impute_state=None)

            else:
                raise ValueError('Unknown run objective: %s. Should be either '
                                 'quality or runtime.' % scenario.run_obj)

        # inject scenario if necessary:
        if runhistory2epm.scenario is None:
            runhistory2epm.scenario = scenario

        self.solver = EPILS_Solver(scenario=scenario,
                                   stats=self.stats,
                                   initial_design=initial_design,
                                   runhistory=runhistory,
                                   runhistory2epm=runhistory2epm,
                                   intensifier=intensifier,
                                   aggregate_func=aggregate_func,
                                   num_run=num_run,
                                   model=model,
                                   acq_optimizer=local_search,
                                   acquisition_func=acquisition_function,
                                   rng=rng)
Exemple #23
0
    def _component_builder(self, conf:typing.Union[Configuration, dict]) \
        -> typing.Tuple[AbstractAcquisitionFunction, AbstractEPM]:
        """
            builds new Acquisition function object
            and EPM object and returns these
            
            Parameters
            ----------
            conf: typing.Union[Configuration, dict]
                configuration specificing "model" and "acq_func"
                
            Returns
            -------
            typing.Tuple[AbstractAcquisitionFunction, AbstractEPM]
            
        """
        types, bounds = get_types(self.config_space, instance_features=self.scenario.feature_array)
        
        if conf["model"] == "RF":
            model = RandomForestWithInstances(
                  types=types,
                  bounds=bounds,
                  instance_features=self.scenario.feature_array,
                  seed=self.rng.randint(MAXINT),
                  pca_components=conf.get("pca_dim", self.scenario.PCA_DIM),
                  log_y=conf.get("log_y", self.scenario.transform_y in ["LOG", "LOGS"]),
                  num_trees=conf.get("num_trees", self.scenario.rf_num_trees), 
                  do_bootstrapping=conf.get("do_bootstrapping", self.scenario.rf_do_bootstrapping),  
                  ratio_features=conf.get("ratio_features", self.scenario.rf_ratio_features),
                  min_samples_split=conf.get("min_samples_split", self.scenario.rf_min_samples_split),
                  min_samples_leaf=conf.get("min_samples_leaf", self.scenario.rf_min_samples_leaf),
                  max_depth=conf.get("max_depth", self.scenario.rf_max_depth))
            
        elif conf["model"] == "GP":
            cov_amp = 2
            n_dims = len(types)

            initial_ls = np.ones([n_dims])
            exp_kernel = george.kernels.Matern52Kernel(initial_ls, ndim=n_dims)
            kernel = cov_amp * exp_kernel

            prior = DefaultPrior(len(kernel) + 1, rng=self.rng)

            n_hypers = 3 * len(kernel)
            if n_hypers % 2 == 1:
                n_hypers += 1

            model = GaussianProcessMCMC(
                types=types,
                bounds=bounds,
                kernel=kernel,
                prior=prior,
                n_hypers=n_hypers,
                chain_length=200,
                burnin_steps=100,
                normalize_input=True,
                normalize_output=True,
                rng=self.rng,
            )
        else:
            raise ValueError("Unknown model: %s" % conf["model"])
            
        if conf["acq_func"] == "EI":
            acq = EI(model=model,
                     par=conf.get("par_ei", 0))
        elif conf["acq_func"] == "LCB":
            acq = LCB(model=model,
                par=conf.get("par_lcb", 0))
        elif conf["acq_func"] == "PI":
            acq = PI(model=model,
                     par=conf.get("par_pi", 0))
        elif conf["acq_func"] == "LogEI":
            # par value should be in log-space
            acq = LogEI(model=model,
                        par=conf.get("par_logei", 0))
        
        return acq, model
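
    # Illustrative input for _component_builder (values are assumptions;
    # keys missing from `conf` fall back to the scenario defaults via
    # conf.get above):
    #
    #   conf = {"model": "RF", "acq_func": "EI",
    #           "num_trees": 10, "ratio_features": 0.8}
    #   acq, model = self._component_builder(conf)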
    def create_optimizer(self):
        from smac.epm.rf_with_instances import RandomForestWithInstances
        from smac.initial_design.default_configuration_design import DefaultConfiguration
        from smac.intensification.intensification import Intensifier
        from smac.optimizer.smbo import SMBO
        from smac.optimizer.acquisition import EI
        from smac.optimizer.ei_optimization import InterleavedLocalAndRandomSearch
        from smac.optimizer.objective import average_cost
        from smac.runhistory.runhistory2epm import RunHistory2EPM4Cost
        from smac.tae.execute_ta_run import StatusType
        from smac.utils.constants import MAXINT
        from smac.utils.util_funcs import get_types

        TAE_RUNNER = self._priv_evaluator

        runhistory2epm = RunHistory2EPM4Cost(
            scenario=self.scenario,
            num_params=len(self.param_space),
            success_states=[StatusType.SUCCESS, StatusType.CRASHED],
            impute_censored_data=False,
            impute_state=None)

        intensifier = Intensifier(
            tae_runner=TAE_RUNNER,
            stats=self.stats,
            traj_logger=self.traj_logger,
            rng=self.rng,
            instances=self.scenario.train_insts,
            cutoff=self.scenario.cutoff,
            deterministic=self.scenario.deterministic,
            run_obj_time=self.scenario.run_obj == "runtime",
            always_race_against=(self.scenario.cs.get_default_configuration()
                                 if self.scenario.always_race_default else None),
            instance_specifics=self.scenario.instance_specific,
            minR=self.scenario.minR, maxR=self.scenario.maxR)

        types, bounds = get_types(self.scenario.cs,
                                  self.scenario.feature_array)
        model = RandomForestWithInstances(
            types=types,
            bounds=bounds,
            seed=self.rng.randint(MAXINT),
            instance_features=self.scenario.feature_array,
            pca_components=self.scenario.PCA_DIM)
        acq_func = EI(model=model)

        smbo_args = {
            'scenario': self.scenario,
            'stats': self.stats,
            'initial_design': DefaultConfiguration(tae_runner=TAE_RUNNER,
                                                   scenario=self.scenario,
                                                   stats=self.stats,
                                                   traj_logger=self.traj_logger,
                                                   rng=self.rng),
            'runhistory': self.runhistory,
            'runhistory2epm': runhistory2epm,
            'intensifier': intensifier,
            'aggregate_func': average_cost,
            'num_run': self.seed,
            'model': model,
            'acq_optimizer': InterleavedLocalAndRandomSearch(
                acq_func, self.scenario.cs,
                np.random.RandomState(seed=self.rng.randint(MAXINT))),
            'acquisition_func': acq_func,
            'rng': self.rng,
            'restore_incumbent': None,
        }

        self.smbo = SMBO(**smbo_args)
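
        # With the SMBO object assembled, the optimization loop would
        # typically be started with self.smbo.run(), which returns the
        # incumbent configuration (assumption: standard SMBO API).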
Exemple #25
0
    def run(self):
        """
        Implementation of the forward selection loop.
        Uses SMACs EPM (RF) wrt the feature space to minimize the OOB error.

        Returns
        -------
        feature_importance: OrderedDict
            dict_keys (first key -> most important) -> OOB error
        """
        parameters = [p.name for p in self.scenario.cs.get_hyperparameters()]
        self.logger.debug("Parameters: %s", parameters)

        rh2epm = RunHistory2EPM4Cost(scenario=self.scenario,
                                     num_params=len(parameters),
                                     success_states=[
                                         StatusType.SUCCESS, StatusType.CAPPED,
                                         StatusType.CRASHED
                                     ],
                                     impute_censored_data=False,
                                     impute_state=None)

        X, y = rh2epm.transform(self.rh)

        # reduce sample size to speed up computation
        if X.shape[0] > self.MAX_SAMPLES:
            idx = np.random.choice(X.shape[0],
                                   size=self.MAX_SAMPLES,
                                   replace=False)
            X = X[idx, :]
            y = y[idx]

        self.logger.debug(
            "Shape of X: %s, of y: %s, #parameters: %s, #feats: %s", X.shape,
            y.shape, len(parameters), len(self.scenario.feature_names))
        names = copy.deepcopy(self.scenario.feature_names)
        self.logger.debug("Features: %s", names)

        used = list(range(0, len(parameters)))
        feat_ids = {f: i for i, f in enumerate(names, len(used))}
        ids_feat = {i: f for f, i in feat_ids.items()}
        self.logger.debug("Used: %s", used)
        evaluated_feature_importance = OrderedDict()

        types, bounds = get_types(self.scenario.cs,
                                  self.scenario.feature_array)

        last_error = np.inf

        for _round in range(self.to_evaluate):  # Main Loop
            errors = []
            for f in names:
                i = feat_ids[f]
                self.logger.debug('Evaluating %s', f)
                used.append(i)
                self.logger.debug(
                    'Used features: %s',
                    str([ids_feat[j] for j in used[len(parameters):]]))

                start = time.time()
                self._refit_model(types[sorted(used)], bounds,
                                  X[:, sorted(used)], y)  # refit the model every round
                errors.append(self.model.rf.out_of_bag_error())
                used.pop()
                self.logger.debug('Refitted RF (sec %.2f; error: %.4f)' %
                                  (time.time() - start, errors[-1]))
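            # for-else: this branch runs after all remaining features have
            # been scored; it evaluates the option of adding no feature at all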
            else:
                self.logger.debug('Evaluating None')
                start = time.time()
                self._refit_model(types[sorted(used)], bounds,
                                  X[:, sorted(used)], y)  # refit the model every round
                errors.append(self.model.rf.out_of_bag_error())
                self.logger.debug('Refitted RF (sec %.2f; error: %.4f)' %
                                  (time.time() - start, errors[-1]))
                if _round == 0:
                    evaluated_feature_importance['None'] = errors[-1]
            best_idx = np.argmin(errors)
            lowest_error = errors[best_idx]

            if best_idx == len(errors) - 1:
                self.logger.info('Best thing to do is add nothing')
                best_feature = 'None'
                # evaluated_feature_importance[best_feature] = lowest_error
                break
            elif lowest_error >= last_error:
                break
            else:
                last_error = lowest_error
                best_feature = names.pop(best_idx)
                used.append(feat_ids[best_feature])

            self.logger.debug('%s: %.4f' % (best_feature, lowest_error))
            evaluated_feature_importance[best_feature] = lowest_error

        self.logger.debug(evaluated_feature_importance)
        self.evaluated_feature_importance = evaluated_feature_importance
        return evaluated_feature_importance
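
    # A sketch of consuming the result (assumption: `fw` is an instance of
    # this forward-selection class, constructed elsewhere):
    #
    #   importance = fw.run()
    #   for feature, oob_error in importance.items():
    #       print('%s: OOB error %.4f' % (feature, oob_error))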