Example #1
    def test_choose_next_2(self):
        def side_effect(X, derivative):
            return np.mean(X, axis=1).reshape((-1, 1))

        smbo = SMAC(self.scenario, rng=1).solver
        smbo.incumbent = self.scenario.cs.sample_configuration()
        smbo.runhistory = RunHistory(aggregate_func=average_cost)
        smbo.model = mock.Mock(spec=RandomForestWithInstances)
        smbo.acquisition_func._compute = mock.Mock(
            spec=RandomForestWithInstances)
        smbo.acquisition_func._compute.side_effect = side_effect

        X = smbo.rng.rand(10, 2)
        Y = smbo.rng.rand(10, 1)

        x = smbo.choose_next(X, Y)

        self.assertEqual(smbo.model.train.call_count, 1)
        self.assertEqual(len(x), 2020)
        num_random_search = 0
        num_local_search = 0
        for i in range(0, 2020, 2):
            self.assertIsInstance(x[i], Configuration)
            if 'Random Search (sorted)' in x[i].origin:
                num_random_search += 1
            elif 'Local Search' in x[i].origin:
                num_local_search += 1
        # the number of local search configs has to be at least 10,
        # since x can contain duplicates
        # which can be associated with the local search
        self.assertGreaterEqual(num_local_search, 10)
        for i in range(1, 2020, 2):
            self.assertIsInstance(x[i], Configuration)
            self.assertEqual(x[i].origin, 'Random Search')
Example #2
    def test_get_runhistory_and_trajectory(self):
        ta = ExecuteTAFuncDict(lambda x: x ** 2)
        smac = SMAC(tae_runner=ta, scenario=self.scenario)
        self.assertRaises(ValueError, smac.get_runhistory)
        self.assertRaises(ValueError, smac.get_trajectory)
        smac.trajectory = 'dummy'
        self.assertEqual(smac.get_trajectory(), 'dummy')
        smac.runhistory = 'dummy'
        self.assertEqual(smac.get_runhistory(), 'dummy')
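
This test relies on the fact that the SMAC facade only exposes a run history and trajectory after an optimization run has produced them. A minimal usage sketch of the intended call order (assuming the facade's `optimize()` method and a pre-built `scenario`; not part of the test above):

    # Hypothetical usage sketch: run the optimizer first, then query results.
    ta = ExecuteTAFuncDict(lambda x: x ** 2)
    smac = SMAC(tae_runner=ta, scenario=scenario)
    incumbent = smac.optimize()          # populates runhistory and trajectory
    runhistory = smac.get_runhistory()   # no longer raises ValueError
    trajectory = smac.get_trajectory()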
Example #3
    def test_choose_next(self):
        seed = 42
        smbo = SMAC(self.scenario, rng=seed).solver
        smbo.runhistory = RunHistory(aggregate_func=average_cost)
        X = self.scenario.cs.sample_configuration().get_array()[None, :]
        smbo.incumbent = self.scenario.cs.sample_configuration()

        Y = self.branin(X)
        x = smbo.choose_next(X, Y)[0].get_array()
        assert x.shape == (2, )
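
`self.branin` is a helper defined elsewhere in the test class. The Branin function is a standard two-dimensional optimization benchmark, so a vectorized version compatible with the `(n_samples, 2)` arrays used above might look like this (a sketch, not necessarily the exact helper from the test suite):

    import numpy as np

    def branin(X):
        # X has shape (n_samples, 2); returns shape (n_samples, 1).
        x1, x2 = X[:, 0], X[:, 1]
        a, b, c = 1.0, 5.1 / (4 * np.pi ** 2), 5.0 / np.pi
        r, s, t = 6.0, 10.0, 1.0 / (8 * np.pi)
        y = a * (x2 - b * x1 ** 2 + c * x1 - r) ** 2 \
            + s * (1 - t) * np.cos(x1) + s
        return y.reshape((-1, 1))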
Example #4
    def test_choose_next_w_empty_rh(self):
        seed = 42
        smbo = SMAC(self.scenario, rng=seed).solver
        smbo.runhistory = RunHistory(aggregate_func=average_cost)
        X = self.scenario.cs.sample_configuration().get_array()[None, :]

        Y = self.branin(X)
        self.assertRaises(ValueError, smbo.choose_next, **{"X": X, "Y": Y})

        x = next(smbo.choose_next(X, Y, incumbent_value=0.0)).get_array()
        assert x.shape == (2, )
Example #5
    def test_choose_next_3(self):
        # Test with ten configurations in the runhistory
        def side_effect(X):
            return np.mean(X, axis=1).reshape((-1, 1))

        def side_effect_predict(X):
            m, v = np.ones((X.shape[0], 1)), None
            return m, v

        smbo = SMAC(self.scenario, rng=1).solver
        smbo.incumbent = self.scenario.cs.sample_configuration()
        previous_configs = [smbo.incumbent] + [
            self.scenario.cs.sample_configuration() for _ in range(20)
        ]
        smbo.runhistory = RunHistory(aggregate_func=average_cost)
        for i, config in enumerate(previous_configs):
            smbo.runhistory.add(config, i, 10, 1)
        smbo.model = mock.Mock(spec=RandomForestWithInstances)
        smbo.model.predict_marginalized_over_instances.side_effect = side_effect_predict
        smbo.acquisition_func._compute = mock.Mock(
            spec=RandomForestWithInstances)
        smbo.acquisition_func._compute.side_effect = side_effect

        X = smbo.rng.rand(10, 2)
        Y = smbo.rng.rand(10, 1)

        challengers = smbo.choose_next(X, Y)
        # Convert the challenger generator into a real list
        challengers = list(challengers)

        self.assertEqual(smbo.model.train.call_count, 1)

        # For each configuration it is randomly decided whether to take it
        # from the list of challengers or to sample it completely at random.
        # Therefore, it is not guaranteed to obtain exactly twice the number
        # of configurations selected by EI.
        self.assertEqual(len(challengers), 9913)
        num_random_search_sorted = 0
        num_random_search = 0
        num_local_search = 0
        for c in challengers:
            self.assertIsInstance(c, Configuration)
            if 'Random Search (sorted)' == c.origin:
                num_random_search_sorted += 1
            elif 'Random Search' == c.origin:
                num_random_search += 1
            elif 'Local Search' == c.origin:
                num_local_search += 1
            else:
                raise ValueError(c.origin)

        self.assertEqual(num_local_search, 10)
        self.assertEqual(num_random_search_sorted, 4990)
        self.assertEqual(num_random_search, 4913)
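
The origin counts above reflect how SMBO interleaves challengers: configurations proposed by local search and acquisition-sorted random search are mixed with configurations sampled uniformly at random, and a coin flip per position decides which source is used. A simplified sketch of that interleaving (hypothetical names; the real logic lives inside SMAC's challenger list):

    def interleave(sorted_challengers, config_space, n, rng):
        # sorted_challengers: iterable ordered by acquisition value
        # config_space: ConfigurationSpace with sample_configuration()
        # rng: np.random.RandomState (or anything with .rand())
        it = iter(sorted_challengers)
        for _ in range(n):
            if rng.rand() < 0.5:
                # Take the next acquisition-sorted challenger, if any remain.
                challenger = next(it, None)
                if challenger is None:
                    break
                yield challenger
            else:
                # Otherwise sample a completely random configuration.
                yield config_space.sample_configuration()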
Example #6
    def run_smbo(self):

        self.watcher.start_task('SMBO')

        # == first things first: load the datamanager
        self.reset_data_manager()

        # == Initialize non-SMBO stuff
        # first create a scenario
        seed = self.seed
        self.config_space.seed(seed)
        num_params = len(self.config_space.get_hyperparameters())
        # allocate a run history
        num_run = self.start_num_run

        # Initialize some SMAC dependencies
        runhistory = RunHistory(aggregate_func=average_cost)
        # meta_runhistory = RunHistory(aggregate_func=average_cost)
        # meta_runs_dataset_indices = {}

        # == METALEARNING suggestions
        # we start by evaluating the defaults on the full dataset again
        # and add the suggestions from metalearning behind it

        if self.num_metalearning_cfgs > 0:
            if self.metadata_directory is None:
                metalearning_directory = os.path.dirname(
                    autosklearn.metalearning.__file__)
                # There is no multilabel data in OpenML
                if self.task == MULTILABEL_CLASSIFICATION:
                    meta_task = BINARY_CLASSIFICATION
                else:
                    meta_task = self.task
                metadata_directory = os.path.join(
                    metalearning_directory, 'files', '%s_%s_%s' %
                    (self.metric, TASK_TYPES_TO_STRING[meta_task], 'sparse'
                     if self.datamanager.info['is_sparse'] else 'dense'))
                self.metadata_directory = metadata_directory

            if os.path.exists(self.metadata_directory):

                self.logger.info('Metadata directory: %s',
                                 self.metadata_directory)
                meta_base = MetaBase(self.config_space,
                                     self.metadata_directory)

                try:
                    meta_base.remove_dataset(self.dataset_name)
                except Exception:
                    pass

                metafeature_calculation_time_limit = int(
                    self.total_walltime_limit / 4)
                metafeature_calculation_start_time = time.time()
                meta_features = self._calculate_metafeatures_with_limits(
                    metafeature_calculation_time_limit)
                metafeature_calculation_end_time = time.time()
                metafeature_calculation_time_limit -= (
                    metafeature_calculation_end_time -
                    metafeature_calculation_start_time)

                if metafeature_calculation_time_limit < 1:
                    self.logger.warning(
                        'Time limit for metafeature calculation less '
                        'than 1 second (%f). Skipping calculation '
                        'of metafeatures for encoded dataset.',
                        metafeature_calculation_time_limit)
                    meta_features_encoded = None
                else:
                    with warnings.catch_warnings():
                        warnings.showwarning = self._send_warnings_to_log
                        self.datamanager.perform1HotEncoding()
                    meta_features_encoded = \
                        self._calculate_metafeatures_encoded_with_limits(
                            metafeature_calculation_time_limit)

                # In case there is a problem calculating the encoded meta-features
                if meta_features is None:
                    if meta_features_encoded is not None:
                        meta_features = meta_features_encoded
                else:
                    if meta_features_encoded is not None:
                        meta_features.metafeature_values.update(
                            meta_features_encoded.metafeature_values)

                if meta_features is not None:
                    meta_base.add_dataset(self.dataset_name, meta_features)
                    # Do mean imputation of the meta-features - should be done specific
                    # for each prediction model!
                    all_metafeatures = meta_base.get_metafeatures(
                        features=list(meta_features.keys()))
                    all_metafeatures.fillna(all_metafeatures.mean(),
                                            inplace=True)

                    with warnings.catch_warnings():
                        warnings.showwarning = self._send_warnings_to_log
                        metalearning_configurations = self.collect_metalearning_suggestions(
                            meta_base)
                    if metalearning_configurations is None:
                        metalearning_configurations = []
                    self.reset_data_manager()

                    self.logger.info('%s', meta_features)

                    # Convert meta-features into a dictionary because the scenario
                    # expects a dictionary
                    meta_features_dict = {}
                    for dataset, series in all_metafeatures.iterrows():
                        meta_features_dict[dataset] = series.values
                    meta_features_list = []
                    for meta_feature_name in all_metafeatures.columns:
                        meta_features_list.append(
                            meta_features[meta_feature_name].value)
                    meta_features_list = np.array(meta_features_list).reshape(
                        (1, -1))
                    self.logger.info(list(meta_features_dict.keys()))

                    # meta_runs = meta_base.get_all_runs(METRIC_TO_STRING[self.metric])
                    # meta_runs_index = 0
                    # try:
                    #    meta_durations = meta_base.get_all_runs('runtime')
                    #    read_runtime_data = True
                    # except KeyError:
                    #    read_runtime_data = False
                    #    self.logger.critical('Cannot read runtime data.')
                    #    if self.acquisition_function == 'EIPS':
                    #        self.logger.critical('Reverting to acquisition function EI!')
                    #        self.acquisition_function = 'EI'

                    # for meta_dataset in meta_runs.index:
                    #     meta_dataset_start_index = meta_runs_index
                    #     for meta_configuration in meta_runs.columns:
                    #         if np.isfinite(meta_runs.loc[meta_dataset, meta_configuration]):
                    #             try:
                    #                 config = meta_base.get_configuration_from_algorithm_index(
                    #                     meta_configuration)
                    #                 cost = meta_runs.loc[meta_dataset, meta_configuration]
                    #                 if read_runtime_data:
                    #                     runtime = meta_durations.loc[meta_dataset,
                    #                                                  meta_configuration]
                    #                 else:
                    #                     runtime = 1
                    #                 # TODO read out other status types!
                    #                 meta_runhistory.add(config, cost, runtime,
                    #                                     StatusType.SUCCESS,
                    #                                     instance_id=meta_dataset)
                    #                 meta_runs_index += 1
                    #             except:
                    #                 # TODO maybe add warning
                    #                 pass
                    #
                    #     meta_runs_dataset_indices[meta_dataset] = (
                    #         meta_dataset_start_index, meta_runs_index)
            else:
                meta_features = None
                self.logger.warning('Could not find meta-data directory %s',
                                    self.metadata_directory)

        else:
            meta_features = None

        if meta_features is None:
            if self.acquisition_function == 'EIPS':
                self.logger.critical('Reverting to acquisition function EI!')
                self.acquisition_function = 'EI'
            meta_features_list = []
            meta_features_dict = {}
            metalearning_configurations = []

        if self.resampling_strategy in [
                'partial-cv', 'partial-cv-iterative-fit'
        ]:
            num_folds = self.resampling_strategy_args['folds']
            instances = [[
                json.dumps({
                    'task_id': self.dataset_name,
                    'fold': fold_number
                })
            ] for fold_number in range(num_folds)]
        else:
            instances = [[json.dumps({'task_id': self.dataset_name})]]

        startup_time = self.watcher.wall_elapsed(self.dataset_name)
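        # Subtract the startup time plus a small safety buffer (5 seconds)
        # from the overall budget handed to SMAC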
        total_walltime_limit = self.total_walltime_limit - startup_time - 5
        scenario_dict = {
            'cs': self.config_space,
            'cutoff-time': self.func_eval_time_limit,
            'memory-limit': self.memory_limit,
            'wallclock-limit': total_walltime_limit,
            'output-dir': self.backend.get_smac_output_directory(self.seed),
            'shared-model': self.shared_mode,
            'run-obj': 'quality',
            'deterministic': 'true',
            'instances': instances
        }

        if self.configuration_mode == 'RANDOM':
            scenario_dict['minR'] = len(
                instances) if instances is not None else 1
            scenario_dict['initial_incumbent'] = 'RANDOM'

        self.scenario = Scenario(scenario_dict)

        # TODO rebuild target algorithm to be its own target algorithm
        # evaluator, which takes into account that a run can be killed prior
        # to the model being fully fitted; thus putting intermediate results
        # into a queue and querying them once the time is over
        exclude = dict()
        include = dict()
        if self.include_preprocessors is not None and \
                self.exclude_preprocessors is not None:
            raise ValueError('Cannot specify include_preprocessors and '
                             'exclude_preprocessors.')
        elif self.include_preprocessors is not None:
            include['preprocessor'] = self.include_preprocessors
        elif self.exclude_preprocessors is not None:
            exclude['preprocessor'] = self.exclude_preprocessors
        if self.include_estimators is not None and \
                self.exclude_estimators is not None:
            raise ValueError('Cannot specify include_estimators and '
                             'exclude_estimators.')
        elif self.include_estimators is not None:
            if self.task in CLASSIFICATION_TASKS:
                include['classifier'] = self.include_estimators
            elif self.task in REGRESSION_TASKS:
                include['regressor'] = self.include_estimators
            else:
                raise ValueError(self.task)
        elif self.exclude_estimators is not None:
            if self.task in CLASSIFICATION_TASKS:
                exclude['classifier'] = self.exclude_estimators
            elif self.task in REGRESSION_TASKS:
                exclude['regressor'] = self.exclude_estimators
            else:
                raise ValueError(self.task)

        ta = ExecuteTaFuncWithQueue(
            backend=self.backend,
            autosklearn_seed=seed,
            resampling_strategy=self.resampling_strategy,
            initial_num_run=num_run,
            logger=self.logger,
            include=include,
            exclude=exclude,
            metric=self.metric,
            memory_limit=self.memory_limit,
            disable_file_output=self.disable_file_output,
            **self.resampling_strategy_args)

        types, bounds = get_types(self.config_space,
                                  self.scenario.feature_array)

        # TODO extract generation of SMAC object into its own function for
        # testing
        if self.acquisition_function == 'EI':
            model = RandomForestWithInstances(
                types=types,
                bounds=bounds,
                #instance_features=meta_features_list,
                seed=1,
                num_trees=10)
            rh2EPM = RunHistory2EPM4Cost(num_params=num_params,
                                         scenario=self.scenario,
                                         success_states=[
                                             StatusType.SUCCESS,
                                             StatusType.MEMOUT,
                                             StatusType.TIMEOUT
                                         ],
                                         impute_censored_data=False,
                                         impute_state=None)
            _smac_arguments = dict(scenario=self.scenario,
                                   model=model,
                                   rng=seed,
                                   runhistory2epm=rh2EPM,
                                   tae_runner=ta,
                                   runhistory=runhistory)
        elif self.acquisition_function == 'EIPS':
            rh2EPM = RunHistory2EPM4EIPS(num_params=num_params,
                                         scenario=self.scenario,
                                         success_states=[
                                             StatusType.SUCCESS,
                                             StatusType.MEMOUT,
                                             StatusType.TIMEOUT
                                         ],
                                         impute_censored_data=False,
                                         impute_state=None)
            model = UncorrelatedMultiObjectiveRandomForestWithInstances(
                ['cost', 'runtime'],
                types=types,
                bounds=bounds,
                num_trees=10,
                instance_features=meta_features_list,
                seed=1)
            acquisition_function = EIPS(model)
            _smac_arguments = dict(scenario=self.scenario,
                                   model=model,
                                   rng=seed,
                                   tae_runner=ta,
                                   runhistory2epm=rh2EPM,
                                   runhistory=runhistory,
                                   acquisition_function=acquisition_function)
        else:
            raise ValueError('Unknown acquisition function value %s!' %
                             self.acquisition_function)

        if self.configuration_mode == 'SMAC':
            smac = SMAC(**_smac_arguments)
        elif self.configuration_mode in ['ROAR', 'RANDOM']:
            for not_in_roar in ['runhistory2epm', 'model']:
                if not_in_roar in _smac_arguments:
                    del _smac_arguments[not_in_roar]
            smac = ROAR(**_smac_arguments)
        else:
            raise ValueError(self.configuration_mode)

        # Build a runtime model
        # runtime_rf = RandomForestWithInstances(types,
        #                                        instance_features=meta_features_list,
        #                                        seed=1, num_trees=10)
        # runtime_rh2EPM = RunHistory2EPM4EIPS(num_params=num_params,
        #                                      scenario=self.scenario,
        #                                      success_states=None,
        #                                      impute_censored_data=False,
        #                                      impute_state=None)
        # X_runtime, y_runtime = runtime_rh2EPM.transform(meta_runhistory)
        # runtime_rf.train(X_runtime, y_runtime[:, 1].flatten())
        # X_meta, Y_meta = rh2EPM.transform(meta_runhistory)
        # # Transform Y_meta on a per-dataset base
        # for meta_dataset in meta_runs_dataset_indices:
        #     start_index, end_index = meta_runs_dataset_indices[meta_dataset]
        #     end_index += 1  # Python indexing
        #     Y_meta[start_index:end_index, 0]\
        #         [Y_meta[start_index:end_index, 0] >2.0] =  2.0
        #     dataset_minimum = np.min(Y_meta[start_index:end_index, 0])
        #     Y_meta[start_index:end_index, 0] = 1 - (
        #         (1. - Y_meta[start_index:end_index, 0]) /
        #         (1. - dataset_minimum))
        #     Y_meta[start_index:end_index, 0]\
        #           [Y_meta[start_index:end_index, 0] > 2] = 2

        smac.solver.stats.start_timing()
        # == first, evaluate all metalearning and default configurations
        smac.solver.incumbent = smac.solver.initial_design.run()

        for challenger in metalearning_configurations:

            smac.solver.incumbent, inc_perf = smac.solver.intensifier.intensify(
                challengers=[challenger],
                incumbent=smac.solver.incumbent,
                run_history=smac.solver.runhistory,
                aggregate_func=smac.solver.aggregate_func,
                time_bound=self.total_walltime_limit)

            if smac.solver.scenario.shared_model:
                pSMAC.write(run_history=smac.solver.runhistory,
                            output_directory=smac.solver.scenario.output_dir,
                            num_run=self.seed)

            if smac.solver.stats.is_budget_exhausted():
                break

        # == after metalearning run SMAC loop
        while True:

            if smac.solver.scenario.shared_model:
                pSMAC.read(run_history=smac.solver.runhistory,
                           output_dirs=glob.glob(
                               self.backend.get_smac_output_glob()),
                           configuration_space=self.config_space,
                           logger=self.logger)

            choose_next_start_time = time.time()
            try:
                challengers = self.choose_next(smac)
            except Exception as e:
                self.logger.error(e)
                self.logger.error("Error in getting next configurations "
                                  "with SMAC. Using random configuration!")
                next_config = self.config_space.sample_configuration()
                challengers = [next_config]
            time_for_choose_next = time.time() - choose_next_start_time
            self.logger.info('Used %g seconds to find next '
                             'configurations', time_for_choose_next)

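            # Give the intensifier a time bound of at least one second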
            time_for_choose_next = max(time_for_choose_next, 1.0)
            smac.solver.incumbent, inc_perf = smac.solver.intensifier.intensify(
                challengers=challengers,
                incumbent=smac.solver.incumbent,
                run_history=smac.solver.runhistory,
                aggregate_func=smac.solver.aggregate_func,
                time_bound=time_for_choose_next)

            if smac.solver.scenario.shared_model:
                pSMAC.write(run_history=smac.solver.runhistory,
                            output_directory=smac.solver.scenario.output_dir,
                            num_run=self.seed)

            if smac.solver.stats.is_budget_exhausted():
                break

        self.runhistory = smac.solver.runhistory
        self.trajectory = smac.solver.intensifier.traj_logger.trajectory
        smac.runhistory = self.runhistory
        self.fANOVA_input = smac.get_X_y()

        return self.runhistory, self.trajectory, self.fANOVA_input
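
A caller would unpack the three return values directly, e.g. (assuming an instance `automl` of the class that defines `run_smbo`; a usage sketch, not taken from the source):

    # Sketch: consume the results of run_smbo()
    runhistory, trajectory, fanova_input = automl.run_smbo()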