Example #1
    def setUp(self):
        self.cwd = os.getcwd()
        data_dir = os.path.dirname(__file__)
        data_dir = os.path.join(data_dir, 'test_meta_base_data')
        os.chdir(data_dir)

        cs = autosklearn.pipeline.classification.SimpleClassificationPipeline()\
            .get_hyperparameter_search_space()

        self.base = MetaBase(cs, data_dir)
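
A standalone sketch of how a MetaBase built from this test data can be queried. The MetaBase import path, the 'test_meta_base_data' directory and the '38_acc' dataset id are assumptions lifted from the other snippets on this page (see Example #7), so treat this as an illustration rather than verified auto-sklearn API usage.

import os

import autosklearn.pipeline.classification
# Assumed import path for MetaBase, mirroring how the snippets above use it.
from autosklearn.metalearning.metalearning.meta_base import MetaBase

data_dir = os.path.join(os.path.dirname(__file__), 'test_meta_base_data')
cs = autosklearn.pipeline.classification.SimpleClassificationPipeline() \
    .get_hyperparameter_search_space()
base = MetaBase(cs, data_dir)

runs = base.get_runs('38_acc')        # pandas Series: cost per configuration
mf = base.get_metafeatures('38_acc')  # pandas Series: metafeature values
print(len(runs), mf.loc['NumberOfInstances'])
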
Example #2
def get_autosklearn_metalearning(X_train, y_train, cat, metric,
                                 num_initial_configurations):
    task_id = "new_task"
    is_sparse = scipy.sparse.issparse(X_train)

    dataset_properties = {
        'signed': True,
        'multiclass': False if len(np.unique(y_train)) == 2 else True,
        'task': 1 if len(np.unique(y_train)) == 2 else 2,
        'sparse': is_sparse,
        'is_sparse': is_sparse,
        'target_type': 'classification',
        'multilabel': False
    }

    config_space = pipeline.get_configuration_space(dataset_properties, None,
                                                    None, None, None)

    metalearning_dir = os.path.join(
        os.path.dirname(metalearning.__file__), "files",
        "balanced_accuracy_{0}.classification_{1}".format(
            "multiclass" if dataset_properties["multiclass"] else "binary",
            "sparse" if dataset_properties["sparse"] else "dense"))
    metabase = MetaBase(config_space, metalearning_dir)

    meta_features = None
    try:
        rvals, sparse = perform_one_hot_encoding(
            dataset_properties["sparse"], [c in ['categorical'] for c in cat],
            [X_train])
        meta_features = _calculate_metafeatures_encoded__(
            task_id, rvals[0], y_train)
        X_train = rvals
    except:
        meta_features = _calculate_metafeatures__(cat,
                                                  MULTICLASS_CLASSIFICATION,
                                                  task_id, X_train, y_train)

    if meta_features is None:
        raise Exception("Error calculating metafeatures")

    metabase.add_dataset(task_id, meta_features)

    configs, list_nn = (suggest_via_metalearning_(
        metabase, task_id, metric,
        2 if dataset_properties["multiclass"] else 1, False,
        num_initial_configurations))

    return configs, list_nn
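
A hypothetical call to the helper above, assuming its module-level imports (numpy, scipy, the auto-sklearn pipeline and metalearning packages) are available. The metric string and the number of initial configurations are placeholders; the function merely forwards them to suggest_via_metalearning_.

import numpy as np

# Toy binary-classification data; the third column is marked categorical so
# the one-hot-encoding branch of the helper gets exercised.
X_train = np.random.rand(200, 4)
y_train = np.random.randint(0, 2, size=200)
cat = ['numerical', 'numerical', 'categorical', 'numerical']

configs, nearest_neighbours = get_autosklearn_metalearning(
    X_train, y_train, cat, 'balanced_accuracy',
    num_initial_configurations=25)
for config in configs:
    print(config)
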
Example #3
    def setUp(self):
        self.cwd = os.getcwd()
        data_dir = os.path.dirname(__file__)
        data_dir = os.path.join(data_dir, 'test_meta_base_data')
        os.chdir(data_dir)

        cs = ParamSklearn.classification.ParamSklearnClassifier.get_hyperparameter_search_space()

        self.base = MetaBase(cs, data_dir)
Example #4
    def __init__(self,
                 dataset_name,
                 configuration_space,
                 aslib_directory,
                 distance='l1',
                 seed=None,
                 use_features='',
                 distance_kwargs=None,
                 subset='all'):
        """Metalearning optimizer.

        Parameters
        ----------
        dataset_name : str
            Name of the dataset

        configuration_space : HPOlibConfigSpace.configuration_space.ConfigurationSpace

        datasets_file : str
            Path to an aslib directory

        distance : str, "l1" or "l2" or "random"
            Distance function to be used by the kNearestDatasets algorithm.

        seed

        use_features

        metric_kwargs

        subset
        """
        self.dataset_name = dataset_name
        self.configuration_space = configuration_space
        self.aslib_dir = aslib_directory
        self.distance = distance
        self.seed = seed
        self.use_features = use_features
        self.distance_kwargs = distance_kwargs
        self.subset = subset
        self.kND = None  # For caching, makes things faster...

        self.meta_base = MetaBase(configuration_space, self.aslib_dir)
        self.logger = logging.getLogger(__name__)
Example #5
    def setUp(self):
        self.cwd = os.getcwd()
        data_dir = os.path.dirname(__file__)
        data_dir = os.path.join(data_dir, 'test_meta_base_data')
        os.chdir(data_dir)

        cs = autosklearn.pipeline.classification.SimpleClassificationPipeline\
            .get_hyperparameter_search_space()

        self.base = MetaBase(cs, data_dir)
Example #6
    def setUp(self):
        self.cwd = os.getcwd()
        data_dir = os.path.dirname(__file__)
        data_dir = os.path.join(data_dir, 'test_meta_base_data')
        os.chdir(data_dir)

        self.cs = autosklearn.pipeline.classification\
            .SimpleClassificationPipeline().get_hyperparameter_search_space()

        meta_base = MetaBase(self.cs, data_dir)
        self.meta_optimizer = metalearner.MetaLearningOptimizer(
            '233', self.cs, meta_base)
Example #7
class MetaBaseTest(unittest.TestCase):
    _multiprocess_can_split_ = True

    def setUp(self):
        self.cwd = os.getcwd()
        data_dir = os.path.dirname(__file__)
        data_dir = os.path.join(data_dir, 'test_meta_base_data')
        os.chdir(data_dir)

        cs = autosklearn.pipeline.classification.SimpleClassificationPipeline\
            .get_hyperparameter_search_space()

        self.base = MetaBase(cs, data_dir)

    def tearDown(self):
        os.chdir(self.cwd)

    def test_get_all_runs(self):
        runs = self.base.get_all_runs()
        self.assertIsInstance(runs, pd.DataFrame)
        # TODO update this ASAP
        self.assertEqual((134, 24), runs.shape)

    def test_get_runs(self):
        runs = self.base.get_runs('38_acc')
        # TODO update this ASAP
        self.assertEqual(24, len(runs))
        self.assertIsInstance(runs, pd.Series)

    def test_get_metafeatures_as_pandas(self):
        mf = self.base.get_metafeatures('38_acc')
        self.assertTrue(np.isfinite(mf).all())
        self.assertEqual(type(mf), pd.Series)
        self.assertEqual(mf.name, u'38_acc')
        self.assertEqual(mf.loc['NumberOfInstances'], 2527.0)

    def test_get_all_metafeatures_as_pandas(self):
        mf = self.base.get_all_metafeatures()
        self.assertIsInstance(mf, pd.DataFrame)
        self.assertEqual((140, 46), mf.shape)
Example #8
class MetaBaseTest(unittest.TestCase):
    _multiprocess_can_split_ = True

    def setUp(self):
        self.cwd = os.getcwd()
        data_dir = os.path.dirname(__file__)
        data_dir = os.path.join(data_dir, 'test_meta_base_data')
        os.chdir(data_dir)

        cs = autosklearn.pipeline.classification.SimpleClassificationPipeline()\
            .get_hyperparameter_search_space()

        self.logger = logging.getLogger()
        self.base = MetaBase(cs, data_dir, logger=self.logger)

    def tearDown(self):
        os.chdir(self.cwd)

    def test_get_all_runs(self):
        runs = self.base.get_all_runs()
        self.assertIsInstance(runs, pd.DataFrame)
        # TODO update this ASAP
        self.assertEqual((125, 125), runs.shape)

    def test_get_runs(self):
        runs = self.base.get_runs('233')
        # TODO update this ASAP
        self.assertEqual(125, len(runs))
        self.assertIsInstance(runs, pd.Series)

    def test_get_metafeatures_single_dataset(self):
        mf = self.base.get_metafeatures('233')
        self.assertIsInstance(mf, pd.Series)
        self.assertEqual(mf.name, '233')
        self.assertEqual(mf.loc['NumberOfInstances'], 2142.0)

    def test_get_metafeatures_single_feature(self):
        mf = self.base.get_metafeatures(features='NumberOfInstances')
        self.assertIsInstance(mf, pd.Series)
        self.assertEqual(mf.shape, (132, ))

    def test_get_metafeatures_single_dataset_and_single_feature(self):
        mf = self.base.get_metafeatures('233', features='NumberOfInstances')
        self.assertEqual(mf.shape, ())

    def test_get_metafeatures_multiple_datasets(self):
        mf = self.base.get_metafeatures(['233', '236'])
        self.assertIsInstance(mf, pd.DataFrame)
        self.assertEqual(mf.shape, (2, 46))

    def test_get_metafeatures_multiple_features(self):
        mf = self.base.get_metafeatures(
            features=['NumberOfInstances', 'NumberOfClasses'])
        self.assertIsInstance(mf, pd.DataFrame)
        self.assertEqual(mf.shape, (132, 2))
Example #9
    def __init__(self, dataset_name, configuration_space,
                 aslib_directory, distance='l1', seed=None, use_features='',
                 distance_kwargs=None, subset='all'):
        """Metalearning optimizer.

        Parameters
        ----------
        dataset_name : str
            Name of the dataset

        configuration_space : HPOlibConfigSpace.configuration_space.ConfigurationSpace

        datasets_file : str
            Path to an aslib directory

        distance : str, "l1" or "l2" or "random"
            Distance function to be used by the kNearestDatasets algorithm.

        seed

        use_features

        metric_kwargs

        subset
        """
        self.dataset_name = dataset_name
        self.configuration_space = configuration_space
        self.aslib_dir = aslib_directory
        self.distance = distance
        self.seed = seed
        self.use_features = use_features
        self.distance_kwargs = distance_kwargs
        self.subset = subset
        self.kND = None     # For caching, makes things faster...

        self.meta_base = MetaBase(configuration_space, self.aslib_dir)
        self.logger = logging.getLogger(__name__)
Example #10
    #    args['metalearning_directory'])

    with open(args["task_files_list"]) as fh:
        task_files_list = fh.readlines()
    with open(args["experiments_list"]) as fh:
        experiments_list = fh.readlines()

    if 'keep_configurations' in args:
        keep_configurations = args['keep_configurations']
        keep_configurations = keep_configurations.split(',')
        keep_configurations = tuple(
            [tuple(kc.split('=')) for kc in keep_configurations])
    else:
        keep_configurations = None

    meta_base = MetaBase(task_files_list, experiments_list,
                         keep_configurations)
    metafeatures = meta_base.get_all_train_metafeatures_as_pandas()
    runs = meta_base.get_all_runs()

    # This can print the best hyperparameters of every dataset
    # for dataset in runs:
    # print dataset, sorted(runs[dataset], key=lambda t: t.result)[0]

    rf = LearnedDistanceRF(**params)
    X, Y = rf._create_dataset(metafeatures, runs)
    import cPickle

    with open("test.pkl", "w") as fh:
        cPickle.dump((X, Y, metafeatures), fh, -1)

    print("Metafeatures", metafeatures.shape)
Example #11
    #    args['metalearning_directory'])

    with open(args["task_files_list"]) as fh:
        task_files_list = fh.readlines()
    with open(args["experiments_list"]) as fh:
        experiments_list = fh.readlines()

    if 'keep_configurations' in args:
        keep_configurations = args['keep_configurations']
        keep_configurations = keep_configurations.split(',')
        keep_configurations = tuple(
            [tuple(kc.split('=')) for kc in keep_configurations])
    else:
        keep_configurations = None

    meta_base = MetaBase(task_files_list, experiments_list, keep_configurations)
    metafeatures = meta_base.get_all_train_metafeatures_as_pandas()
    runs = meta_base.get_all_runs()

    # This can print the best hyperparameters of every dataset
    # for dataset in runs:
    # print dataset, sorted(runs[dataset], key=lambda t: t.result)[0]

    rf = LearnedDistanceRF(**params)
    X, Y = rf._create_dataset(metafeatures, runs)
    import cPickle

    with open("test.pkl", "w") as fh:
        cPickle.dump((X, Y, metafeatures), fh, -1)

    print("Metafeatures", metafeatures.shape)
Example #12
    def run_smbo(self):

        self.watcher.start_task('SMBO')

        # == first things first: load the datamanager
        self.reset_data_manager()

        # == Initialize non-SMBO stuff
        # first create a scenario
        seed = self.seed
        self.config_space.seed(seed)
        num_params = len(self.config_space.get_hyperparameters())
        # allocate a run history
        num_run = self.start_num_run
        instance_id = self.dataset_name + SENTINEL

        # Initialize some SMAC dependencies
        runhistory = RunHistory(aggregate_func=average_cost)
        # meta_runhistory = RunHistory(aggregate_func=average_cost)
        # meta_runs_dataset_indices = {}

        # == METALEARNING suggestions
        # we start by evaluating the defaults on the full dataset again
        # and add the suggestions from metalearning behind it

        if self.num_metalearning_cfgs > 0:
            if self.metadata_directory is None:
                metalearning_directory = os.path.dirname(
                    autosklearn.metalearning.__file__)
                # There is no multilabel data in OpenML
                if self.task == MULTILABEL_CLASSIFICATION:
                    meta_task = BINARY_CLASSIFICATION
                else:
                    meta_task = self.task
                metadata_directory = os.path.join(
                    metalearning_directory, 'files', '%s_%s_%s' %
                    (METRIC_TO_STRING[self.metric],
                     TASK_TYPES_TO_STRING[meta_task], 'sparse'
                     if self.datamanager.info['is_sparse'] else 'dense'))
                self.metadata_directory = metadata_directory

            self.logger.info('Metadata directory: %s', self.metadata_directory)
            meta_base = MetaBase(self.config_space, self.metadata_directory)

            metafeature_calculation_time_limit = int(
                self.total_walltime_limit / 4)
            metafeature_calculation_start_time = time.time()
            meta_features = self._calculate_metafeatures_with_limits(
                metafeature_calculation_time_limit)
            metafeature_calculation_end_time = time.time()
            metafeature_calculation_time_limit = \
                metafeature_calculation_time_limit - (
                metafeature_calculation_end_time -
                metafeature_calculation_start_time)

            if metafeature_calculation_time_limit < 1:
                self.logger.warning(
                    'Time limit for metafeature calculation less '
                    'than 1 seconds (%f). Skipping calculation '
                    'of metafeatures for encoded dataset.',
                    metafeature_calculation_time_limit)
                meta_features_encoded = None
            else:
                with warnings.catch_warnings():
                    warnings.showwarning = self._send_warnings_to_log
                    self.datamanager.perform1HotEncoding()
                meta_features_encoded = \
                    self._calculate_metafeatures_encoded_with_limits(
                        metafeature_calculation_time_limit)

            # In case there is a problem calculating the encoded meta-features
            if meta_features is None:
                if meta_features_encoded is not None:
                    meta_features = meta_features_encoded
            else:
                if meta_features_encoded is not None:
                    meta_features.metafeature_values.update(
                        meta_features_encoded.metafeature_values)

            if meta_features is not None:
                meta_base.add_dataset(instance_id, meta_features)
                # Do mean imputation of the meta-features - should be done specific
                # for each prediction model!
                all_metafeatures = meta_base.get_metafeatures(
                    features=list(meta_features.keys()))
                all_metafeatures.fillna(all_metafeatures.mean(), inplace=True)

                with warnings.catch_warnings():
                    warnings.showwarning = self._send_warnings_to_log
                    metalearning_configurations = self.collect_metalearning_suggestions(
                        meta_base)
                if metalearning_configurations is None:
                    metalearning_configurations = []
                self.reset_data_manager()

                self.logger.info('%s', meta_features)

                # Convert meta-features into a dictionary because the scenario
                # expects a dictionary
                meta_features_dict = {}
                for dataset, series in all_metafeatures.iterrows():
                    meta_features_dict[dataset] = series.values
                meta_features_list = []
                for meta_feature_name in all_metafeatures.columns:
                    meta_features_list.append(
                        meta_features[meta_feature_name].value)
                meta_features_list = np.array(meta_features_list).reshape(
                    (1, -1))
                self.logger.info(list(meta_features_dict.keys()))

                # meta_runs = meta_base.get_all_runs(METRIC_TO_STRING[self.metric])
                # meta_runs_index = 0
                # try:
                #    meta_durations = meta_base.get_all_runs('runtime')
                #    read_runtime_data = True
                # except KeyError:
                #    read_runtime_data = False
                #    self.logger.critical('Cannot read runtime data.')
                #    if self.acquisition_function == 'EIPS':
                #        self.logger.critical('Reverting to acquisition function EI!')
                #        self.acquisition_function = 'EI'

                # for meta_dataset in meta_runs.index:
                #     meta_dataset_start_index = meta_runs_index
                #     for meta_configuration in meta_runs.columns:
                #         if np.isfinite(meta_runs.loc[meta_dataset, meta_configuration]):
                #             try:
                #                 config = meta_base.get_configuration_from_algorithm_index(
                #                     meta_configuration)
                #                 cost = meta_runs.loc[meta_dataset, meta_configuration]
                #                 if read_runtime_data:
                #                     runtime = meta_durations.loc[meta_dataset,
                #                                                  meta_configuration]
                #                 else:
                #                     runtime = 1
                #                 # TODO read out other status types!
                #                 meta_runhistory.add(config, cost, runtime,
                #                                     StatusType.SUCCESS,
                #                                     instance_id=meta_dataset)
                #                 meta_runs_index += 1
                #             except:
                #                 # TODO maybe add warning
                #                 pass
                #
                #     meta_runs_dataset_indices[meta_dataset] = (
                #         meta_dataset_start_index, meta_runs_index)

        else:
            meta_features = None

        if meta_features is None:
            if self.acquisition_function == 'EIPS':
                self.logger.critical('Reverting to acquisition function EI!')
                self.acquisition_function = 'EI'
            meta_features_list = []
            meta_features_dict = {}
            metalearning_configurations = []

        if self.resampling_strategy in [
                'partial-cv', 'partial-cv-iterative-fit'
        ]:
            num_folds = self.resampling_strategy_args['folds']
            instances = [[fold_number] for fold_number in range(num_folds)]
        else:
            instances = None

        startup_time = self.watcher.wall_elapsed(self.dataset_name)
        total_walltime_limit = self.total_walltime_limit - startup_time - 5
        scenario_dict = {
            'cs': self.config_space,
            'cutoff-time': self.func_eval_time_limit,
            'memory-limit': self.memory_limit,
            'wallclock-limit': total_walltime_limit,
            # 'instances': [[name] for name in meta_features_dict],
            'output-dir': self.backend.temporary_directory,
            'shared-model': self.shared_mode,
            'run-obj': 'quality',
            'deterministic': 'true',
            'instances': instances
        }

        if self.configuration_mode == 'RANDOM':
            scenario_dict['minR'] = len(
                instances) if instances is not None else 1
            scenario_dict['initial_incumbent'] = 'RANDOM'

        self.scenario = Scenario(scenario_dict)

        # TODO rebuild target algorithm to be its own target algorithm
        # evaluator, which takes into account that a run can be killed prior
        # to the model being fully fitted; thus putting intermediate results
        # into a queue and querying them once the time is over
        exclude = dict()
        include = dict()
        if self.include_preprocessors is not None and \
                self.exclude_preprocessors is not None:
            raise ValueError('Cannot specify include_preprocessors and '
                             'exclude_preprocessors.')
        elif self.include_preprocessors is not None:
            include['preprocessor'] = self.include_preprocessors
        elif self.exclude_preprocessors is not None:
            exclude['preprocessor'] = self.exclude_preprocessors
        if self.include_estimators is not None and \
                self.exclude_preprocessors is not None:
            raise ValueError('Cannot specify include_estimators and '
                             'exclude_estimators.')
        elif self.include_estimators is not None:
            if self.task in CLASSIFICATION_TASKS:
                include['classifier'] = self.include_estimators
            elif self.task in REGRESSION_TASKS:
                include['regressor'] = self.include_estimators
            else:
                raise ValueError(self.task)
        elif self.exclude_estimators is not None:
            if self.task in CLASSIFICATION_TASKS:
                exclude['classifier'] = self.exclude_estimators
            elif self.task in REGRESSION_TASKS:
                exclude['regressor'] = self.exclude_estimators
            else:
                raise ValueError(self.task)

        ta = ExecuteTaFuncWithQueue(
            backend=self.backend,
            autosklearn_seed=seed,
            resampling_strategy=self.resampling_strategy,
            initial_num_run=num_run,
            logger=self.logger,
            include=include,
            exclude=exclude,
            memory_limit=self.memory_limit,
            disable_file_output=self.disable_file_output,
            **self.resampling_strategy_args)

        types = get_types(self.config_space, self.scenario.feature_array)

        # TODO extract generation of SMAC object into its own function for
        # testing
        if self.acquisition_function == 'EI':
            model = RandomForestWithInstances(
                types,
                #instance_features=meta_features_list,
                seed=1,
                num_trees=10)
            rh2EPM = RunHistory2EPM4Cost(num_params=num_params,
                                         scenario=self.scenario,
                                         success_states=[
                                             StatusType.SUCCESS,
                                             StatusType.MEMOUT,
                                             StatusType.TIMEOUT
                                         ],
                                         impute_censored_data=False,
                                         impute_state=None)
            _smac_arguments = dict(scenario=self.scenario,
                                   model=model,
                                   rng=seed,
                                   runhistory2epm=rh2EPM,
                                   tae_runner=ta,
                                   runhistory=runhistory)
        elif self.acquisition_function == 'EIPS':
            rh2EPM = RunHistory2EPM4EIPS(num_params=num_params,
                                         scenario=self.scenario,
                                         success_states=[
                                             StatusType.SUCCESS,
                                             StatusType.MEMOUT,
                                             StatusType.TIMEOUT
                                         ],
                                         impute_censored_data=False,
                                         impute_state=None)
            model = UncorrelatedMultiObjectiveRandomForestWithInstances(
                ['cost', 'runtime'],
                types,
                num_trees=10,
                instance_features=meta_features_list,
                seed=1)
            acquisition_function = EIPS(model)
            _smac_arguments = dict(scenario=self.scenario,
                                   model=model,
                                   rng=seed,
                                   tae_runner=ta,
                                   runhistory2epm=rh2EPM,
                                   runhistory=runhistory,
                                   acquisition_function=acquisition_function)
        else:
            raise ValueError('Unknown acquisition function value %s!' %
                             self.acquisition_function)

        if self.configuration_mode == 'SMAC':
            smac = SMAC(**_smac_arguments)
        elif self.configuration_mode in ['ROAR', 'RANDOM']:
            for not_in_roar in ['runhistory2epm', 'model']:
                if not_in_roar in _smac_arguments:
                    del _smac_arguments[not_in_roar]
            smac = ROAR(**_smac_arguments)
        else:
            raise ValueError(self.configuration_mode)

        # Build a runtime model
        # runtime_rf = RandomForestWithInstances(types,
        #                                        instance_features=meta_features_list,
        #                                        seed=1, num_trees=10)
        # runtime_rh2EPM = RunHistory2EPM4EIPS(num_params=num_params,
        #                                      scenario=self.scenario,
        #                                      success_states=None,
        #                                      impute_censored_data=False,
        #                                      impute_state=None)
        # X_runtime, y_runtime = runtime_rh2EPM.transform(meta_runhistory)
        # runtime_rf.train(X_runtime, y_runtime[:, 1].flatten())
        # X_meta, Y_meta = rh2EPM.transform(meta_runhistory)
        # # Transform Y_meta on a per-dataset base
        # for meta_dataset in meta_runs_dataset_indices:
        #     start_index, end_index = meta_runs_dataset_indices[meta_dataset]
        #     end_index += 1  # Python indexing
        #     Y_meta[start_index:end_index, 0]\
        #         [Y_meta[start_index:end_index, 0] >2.0] =  2.0
        #     dataset_minimum = np.min(Y_meta[start_index:end_index, 0])
        #     Y_meta[start_index:end_index, 0] = 1 - (
        #         (1. - Y_meta[start_index:end_index, 0]) /
        #         (1. - dataset_minimum))
        #     Y_meta[start_index:end_index, 0]\
        #           [Y_meta[start_index:end_index, 0] > 2] = 2

        smac.solver.stats.start_timing()
        # == first, evaluate all metalearning and default configurations
        smac.solver.incumbent = smac.solver.initial_design.run()

        for challenger in metalearning_configurations:

            smac.solver.incumbent, inc_perf = smac.solver.intensifier.intensify(
                challengers=[challenger],
                incumbent=smac.solver.incumbent,
                run_history=smac.solver.runhistory,
                aggregate_func=smac.solver.aggregate_func,
                time_bound=self.total_walltime_limit)

            if smac.solver.scenario.shared_model:
                pSMAC.write(run_history=smac.solver.runhistory,
                            output_directory=smac.solver.scenario.output_dir,
                            num_run=self.seed)

            if smac.solver.stats.is_budget_exhausted():
                break

        # == after metalearning run SMAC loop
        while True:

            if smac.solver.scenario.shared_model:
                pSMAC.read(run_history=smac.solver.runhistory,
                           output_directory=self.scenario.output_dir,
                           configuration_space=self.config_space,
                           logger=self.logger)

            choose_next_start_time = time.time()
            try:
                challengers = self.choose_next(smac)
            except Exception as e:
                self.logger.error(e)
                self.logger.error("Error in getting next configurations "
                                  "with SMAC. Using random configuration!")
                next_config = self.config_space.sample_configuration()
                challengers = [next_config]
            time_for_choose_next = time.time() - choose_next_start_time
            self.logger.info('Used %g seconds to find next '
                             'configurations' % (time_for_choose_next))

            time_for_choose_next = max(time_for_choose_next, 1.0)
            smac.solver.incumbent, inc_perf = smac.solver.intensifier.intensify(
                challengers=challengers,
                incumbent=smac.solver.incumbent,
                run_history=smac.solver.runhistory,
                aggregate_func=smac.solver.aggregate_func,
                time_bound=time_for_choose_next)

            if smac.solver.scenario.shared_model:
                pSMAC.write(run_history=smac.solver.runhistory,
                            output_directory=smac.solver.scenario.output_dir,
                            num_run=self.seed)

            if smac.solver.stats.is_budget_exhausted():
                break

        self.runhistory = smac.solver.runhistory
        self.trajectory = smac.solver.intensifier.traj_logger.trajectory

        return self.runhistory, self.trajectory
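
The mean imputation of meta-features inside run_smbo (the fillna call on all_metafeatures) can be shown in isolation with a small pandas sketch; the column names and dataset ids below are placeholders, not values from a real meta-data dump.

import numpy as np
import pandas as pd

# Three datasets with two missing values: fillna(mean) replaces each NaN with
# its column mean, exactly what run_smbo does to all_metafeatures above.
all_metafeatures = pd.DataFrame(
    {'NumberOfInstances': [2142.0, np.nan, 1000.0],
     'NumberOfClasses': [2.0, 10.0, np.nan]},
    index=['233', '236', 'new_task'])
all_metafeatures.fillna(all_metafeatures.mean(), inplace=True)
print(all_metafeatures)
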
Example #13
class MetaLearningOptimizer(object):
    def __init__(self,
                 dataset_name,
                 configuration_space,
                 aslib_directory,
                 distance='l1',
                 seed=None,
                 use_features='',
                 distance_kwargs=None,
                 subset='all'):
        """Metalearning optimizer.

        Parameters
        ----------
        dataset_name : str
            Name of the dataset

        configuration_space : HPOlibConfigSpace.configuration_space.ConfigurationSpace

        datasets_file : str
            Path to an aslib directory

        distance : str, "l1" or "l2" or "random"
            Distance function to be used by the kNearestDatasets algorithm.

        seed

        use_features

        metric_kwargs

        subset
        """
        self.dataset_name = dataset_name
        self.configuration_space = configuration_space
        self.aslib_dir = aslib_directory
        self.distance = distance
        self.seed = seed
        self.use_features = use_features
        self.distance_kwargs = distance_kwargs
        self.subset = subset
        self.kND = None  # For caching, makes things faster...

        self.meta_base = MetaBase(configuration_space, self.aslib_dir)
        self.logger = logging.getLogger(__name__)

    def perform_sequential_optimization(self,
                                        target_algorithm=test_function,
                                        time_budget=None,
                                        evaluation_budget=None):
        raise NotImplementedError("Right now this is not implemented due to "
                                  "timing issues.")
        time_taken = 0
        num_evaluations = 0
        history = []

        self.logger.info("Taking distance measure %s" % self.distance)
        while True:
            if time_budget is not None and time_taken >= time_budget:
                self.logger.info("Reached time budget. Exiting optimization.")
                break
            if evaluation_budget is not None and \
                    num_evaluations >= evaluation_budget:
                self.logger.info(
                    "Reached maximum number of evaluations. Exiting "
                    "optimization.")
                break

            params = self.metalearning_suggest(history)

            fixed_params = OrderedDict()
            # Hack to remove all trailing - from the params which are
            # accidently in the experiment pickle of the current HPOlib version
            for key in params:
                if key[0] == "-":
                    fixed_params[key[1:]] = params[key]
                else:
                    fixed_params[key] = params[key]

            self.logger.info(
                "%d/%d, parameters: %s" %
                (num_evaluations, evaluation_budget, str(fixed_params)))
            result = target_algorithm(fixed_params)
            history.append(Run(params, result))
            num_evaluations += 1

        return min([run.result for run in history])

    def metalearning_suggest_all(self, exclude_double_configurations=True):
        """Return a list of the best hyperparameters of neighboring datasets"""
        # TODO check if _learn was called before!
        neighbors = self._learn(exclude_double_configurations)
        hp_list = []
        for neighbor in neighbors:
            try:
                configuration = \
                    self.meta_base.get_configuration_from_algorithm_index(
                    neighbor[2])
                self.logger.info("%s %s %s" %
                                 (neighbor[0], neighbor[1], configuration))
            except (KeyError):
                self.logger.warning("Configuration %s not found" % neighbor[2])
                continue

            hp_list.append(configuration)
        return hp_list

    def metalearning_suggest(self, history):
        """Suggest the next most promising hyperparameters which were not yet evaluated"""
        # TODO test the object in the history!
        neighbors = self._learn()
        # Iterate over all datasets which are sorted ascending by distance

        history_with_indices = []
        for run in history:
            history_with_indices.append(\
                self.meta_base.get_algorithm_index_from_configuration(run))

        for idx, neighbor in enumerate(neighbors):
            already_evaluated = False
            # Check if that dataset was already evaluated
            for run in history_with_indices:
                # If so, return to the outer loop

                if neighbor[2] == run:
                    already_evaluated = True
                    break

            if not already_evaluated:
                self.logger.info(
                    "Nearest dataset with hyperparameters of best value "
                    "not evaluated yet is %s with a distance of %f" %
                    (neighbor[0], neighbor[1]))
                return self.meta_base.get_configuration_from_algorithm_index(
                    neighbor[2])
        raise StopIteration("No more values available.")

    def _learn(self, exclude_double_configurations=True):
        dataset_metafeatures, all_other_metafeatures = self._get_metafeatures()

        # Remove metafeatures which could not be calculated for the target
        # dataset
        keep = []
        for idx in dataset_metafeatures.index:
            if np.isfinite(dataset_metafeatures.loc[idx]):
                keep.append(idx)

        dataset_metafeatures = dataset_metafeatures.loc[keep]
        all_other_metafeatures = all_other_metafeatures.loc[:, keep]

        # Do mean imputation of all other metafeatures
        all_other_metafeatures = all_other_metafeatures.fillna(
            all_other_metafeatures.mean())

        if self.kND is None:
            # In case that we learn our distance function, get the parameters for
            #  the random forest
            if self.distance_kwargs:
                rf_params = ast.literal_eval(self.distance_kwargs)
            else:
                rf_params = None

            # To keep the distance the same in every iteration, we create a new
            # random state
            random_state = sklearn.utils.check_random_state(self.seed)
            kND = KNearestDatasets(metric=self.distance,
                                   random_state=random_state,
                                   metric_params=rf_params)

            runs = dict()
            # TODO move this code to the metabase
            for task_id in all_other_metafeatures.index:
                try:
                    runs[task_id] = self.meta_base.get_runs(task_id)
                except KeyError:
                    # TODO should I really except this?
                    self.logger.warning("Could not find runs for instance %s" %
                                        task_id)
                    runs[task_id] = pd.Series([], name=task_id)
            runs = pd.DataFrame(runs)

            kND.fit(all_other_metafeatures, runs)
            self.kND = kND
        return self.kND.kBestSuggestions(
            dataset_metafeatures,
            k=-1,
            exclude_double_configurations=exclude_double_configurations)

    def _get_metafeatures(self):
        """This is inside an extra function for testing purpose"""
        # Load the task

        self.logger.info("Going to use the metafeature subset: %s",
                         self.subset)
        all_metafeatures = self.meta_base.get_all_metafeatures()
        self.logger.info(" ".join(all_metafeatures.columns))

        # TODO: buggy and hacky, replace with a list separated by commas
        if self.use_features and \
                (type(self.use_features) != str or self.use_features != ''):
            #logger.warn("Going to keep the following features %s",
            #        str(self.use_features))
            if type(self.use_features) == str:
                use_features = self.use_features.split(",")
            elif type(self.use_features) in (list, np.ndarray):
                use_features = self.use_features
            else:
                raise NotImplementedError(type(self.use_features))

            if len(use_features) == 0:
                self.logger.info(
                    "You just tried to remove all metafeatures...")
            else:
                keep = [
                    col for col in all_metafeatures.columns
                    if col in use_features
                ]
                if len(keep) == 0:
                    self.logger.info(
                        "You just tried to remove all metafeatures...")
                else:
                    all_metafeatures = all_metafeatures.loc[:, keep]
                    self.logger.info(
                        "Going to keep the following metafeatures:")
                    self.logger.info(str(keep))

        return self._split_metafeature_array(self.dataset_name,
                                             all_metafeatures)

    def _split_metafeature_array(self, dataset_name, metafeatures):
        """Split the metafeature array into dataset metafeatures and all other.

        This is inside an extra function for testing purpose.
        """
        dataset_metafeatures = metafeatures.loc[dataset_name].copy()
        metafeatures = metafeatures[metafeatures.index != dataset_name]
        return dataset_metafeatures, metafeatures

    def read_task_list(self, fh):
        dataset_filenames = list()
        for line in fh:
            line = line.replace("\n", "")
            if line:
                dataset_filenames.append(line)
            else:
                raise ValueError("Blank lines in the task list are not "
                                 "supported.")
        return dataset_filenames

    def read_experiments_list(self, fh):
        experiments_list = list()
        for line in fh.readlines():
            experiments_list.append(line.split())
        return experiments_list
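
A usage sketch for the class above, following the constructor shown in this example (a configuration space plus an ASlib directory). The directory path and dataset name are placeholders, and the module's own imports (pandas, numpy, KNearestDatasets, MetaBase, ...) are assumed to be in place.

import logging

import autosklearn.pipeline.classification

logging.basicConfig(level=logging.INFO)

cs = autosklearn.pipeline.classification.SimpleClassificationPipeline() \
    .get_hyperparameter_search_space()

# Placeholder meta-data location and target dataset name.
optimizer = MetaLearningOptimizer(
    dataset_name='38_acc',
    configuration_space=cs,
    aslib_directory='test_meta_base_data',
    distance='l1',
    seed=1)

# Best hyperparameters of the neighbouring datasets, nearest first.
suggestions = optimizer.metalearning_suggest_all()
print(len(suggestions))
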
Example #14
                              'sparse'))

configuration_space = get_configuration_space(
    {
        'metric': metric,
        'task': task,
        'is_sparse': False
    },
    include_preprocessors=['no_preprocessing'])
               
print(metadata_directory)

dataset_name = 'diabetes'
X_train, Y_train, X_test, Y_test = get_dataset(dataset_name)
print(X_train)   
meta_base = MetaBase(configuration_space, metadata_directory)

print(meta_base)

dataset_metafeatures = meta_base.get_metafeatures('1030_a_metric', None)

print(dataset_metafeatures)

all_other_datasets = meta_base.get_all_dataset_names()
print(all_other_datasets)

res = suggest_via_metalearning(meta_base, '198_a_metric', metric, task, False, 1)


print(res)
print(type(res))
Example #15
File: smbo.py  Project: automl/auto-sklearn
    def run_smbo(self, max_iters=1000):
        global evaluator

        self.watcher.start_task('SMBO')

        # == first things first: load the datamanager
        self.reset_data_manager()
        
        # == Initialize non-SMBO stuff
        # first create a scenario
        seed = self.seed
        self.config_space.seed(seed)
        num_params = len(self.config_space.get_hyperparameters())
        # allocate a run history
        num_run = self.start_num_run
        instance_id = self.dataset_name + SENTINEL

        # Initialize some SMAC dependencies
        runhistory = RunHistory(aggregate_func=average_cost)
        # meta_runhistory = RunHistory(aggregate_func=average_cost)
        # meta_runs_dataset_indices = {}

        # == METALEARNING suggestions
        # we start by evaluating the defaults on the full dataset again
        # and add the suggestions from metalearning behind it

        if self.metadata_directory is None:
            metalearning_directory = os.path.dirname(
                autosklearn.metalearning.__file__)
            # There is no multilabel data in OpenML
            if self.task == MULTILABEL_CLASSIFICATION:
                meta_task = BINARY_CLASSIFICATION
            else:
                meta_task = self.task
            metadata_directory = os.path.join(
                metalearning_directory, 'files',
                '%s_%s_%s' % (METRIC_TO_STRING[self.metric],
                              TASK_TYPES_TO_STRING[meta_task],
                              'sparse' if self.datamanager.info['is_sparse']
                              else 'dense'))
            self.metadata_directory = metadata_directory

        self.logger.info('Metadata directory: %s', self.metadata_directory)
        meta_base = MetaBase(self.config_space, self.metadata_directory)

        metafeature_calculation_time_limit = int(
            self.total_walltime_limit / 4)
        metafeature_calculation_start_time = time.time()
        meta_features = self._calculate_metafeatures_with_limits(
            metafeature_calculation_time_limit)
        metafeature_calculation_end_time = time.time()
        metafeature_calculation_time_limit = \
            metafeature_calculation_time_limit - (
            metafeature_calculation_end_time -
            metafeature_calculation_start_time)

        if metafeature_calculation_time_limit < 1:
            self.logger.warning('Time limit for metafeature calculation less '
                                'than 1 seconds (%f). Skipping calculation '
                                'of metafeatures for encoded dataset.',
                                metafeature_calculation_time_limit)
            meta_features_encoded = None
        else:
            with warnings.catch_warnings():
                warnings.showwarning = self._send_warnings_to_log
                self.datamanager.perform1HotEncoding()
            meta_features_encoded = \
                self._calculate_metafeatures_encoded_with_limits(
                    metafeature_calculation_time_limit)

        # In case there is a problem calculating the encoded meta-features
        if meta_features is None:
            if meta_features_encoded is not None:
                meta_features = meta_features_encoded
        else:
            if meta_features_encoded is not None:
                meta_features.metafeature_values.update(
                    meta_features_encoded.metafeature_values)

        if meta_features is not None:
            meta_base.add_dataset(instance_id, meta_features)
            # Do mean imputation of the meta-features - should be done specific
            # for each prediction model!
            all_metafeatures = meta_base.get_metafeatures(
                features=list(meta_features.keys()))
            all_metafeatures.fillna(all_metafeatures.mean(), inplace=True)

            with warnings.catch_warnings():
                warnings.showwarning = self._send_warnings_to_log
                metalearning_configurations = self.collect_metalearning_suggestions(
                    meta_base)
            if metalearning_configurations is None:
                metalearning_configurations = []
            self.reset_data_manager()

            self.logger.info('%s', meta_features)

            # Convert meta-features into a dictionary because the scenario
            # expects a dictionary
            meta_features_dict = {}
            for dataset, series in all_metafeatures.iterrows():
                meta_features_dict[dataset] = series.values
            meta_features_list = []
            for meta_feature_name in all_metafeatures.columns:
                meta_features_list.append(meta_features[meta_feature_name].value)
            meta_features_list = np.array(meta_features_list).reshape((1, -1))
            self.logger.info(list(meta_features_dict.keys()))

            #meta_runs = meta_base.get_all_runs(METRIC_TO_STRING[self.metric])
            #meta_runs_index = 0
            #try:
            #    meta_durations = meta_base.get_all_runs('runtime')
            #    read_runtime_data = True
            #except KeyError:
            #    read_runtime_data = False
            #    self.logger.critical('Cannot read runtime data.')
            #    if self.acquisition_function == 'EIPS':
            #        self.logger.critical('Reverting to acquisition function EI!')
            #        self.acquisition_function = 'EI'

            # for meta_dataset in meta_runs.index:
            #     meta_dataset_start_index = meta_runs_index
            #     for meta_configuration in meta_runs.columns:
            #         if np.isfinite(meta_runs.loc[meta_dataset, meta_configuration]):
            #             try:
            #                 config = meta_base.get_configuration_from_algorithm_index(
            #                     meta_configuration)
            #                 cost = meta_runs.loc[meta_dataset, meta_configuration]
            #                 if read_runtime_data:
            #                     runtime = meta_durations.loc[meta_dataset,
            #                                                  meta_configuration]
            #                 else:
            #                     runtime = 1
            #                 # TODO read out other status types!
            #                 meta_runhistory.add(config, cost, runtime,
            #                                     StatusType.SUCCESS,
            #                                     instance_id=meta_dataset)
            #                 meta_runs_index += 1
            #             except:
            #                 # TODO maybe add warning
            #                 pass
            #
            #     meta_runs_dataset_indices[meta_dataset] = (
            #         meta_dataset_start_index, meta_runs_index)
        else:
            if self.acquisition_function == 'EIPS':
                self.logger.critical('Reverting to acquisition function EI!')
                self.acquisition_function = 'EI'
            meta_features_list = []
            meta_features_dict = {}
            metalearning_configurations = []

        self.scenario = Scenario({'cs': self.config_space,
                                  'cutoff-time': self.func_eval_time_limit,
                                  'memory-limit': self.memory_limit,
                                  'wallclock-limit': self.total_walltime_limit,
                                  #'instances': [[name] for name in meta_features_dict],
                                  'output-dir': self.backend.temporary_directory,
                                  'shared-model': self.shared_mode,
                                  'run-obj': 'quality',
                                  'deterministic': 'true'})

        # TODO rebuild target algorithm to be its own target algorithm
        # evaluator, which takes into account that a run can be killed prior
        # to the model being fully fitted; thus putting intermediate results
        # into a queue and querying them once the time is over
        ta = ExecuteTaFuncWithQueue(backend=self.backend,
                                    autosklearn_seed=seed,
                                    resampling_strategy=self.resampling_strategy,
                                    initial_num_run=num_run,
                                    logger=self.logger,
                                    **self.resampling_strategy_args)

        types = get_types(self.config_space, self.scenario.feature_array)

        # TODO extract generation of SMAC object into its own function for
        # testing
        if self.acquisition_function == 'EI':
            model = RandomForestWithInstances(types,
                                              #instance_features=meta_features_list,
                                              seed=1, num_trees=10)
            smac = SMAC(scenario=self.scenario,
                        model=model,
                        rng=seed,
                        tae_runner=ta,
                        runhistory=runhistory)
        elif self.acquisition_function == 'EIPS':
            rh2EPM = RunHistory2EPM4EIPS(num_params=num_params,
                                         scenario=self.scenario,
                                         success_states=None,
                                         impute_censored_data=False,
                                         impute_state=None)
            model = UncorrelatedMultiObjectiveRandomForestWithInstances(
                ['cost', 'runtime'], types, num_trees=10,
                instance_features=meta_features_list, seed=1)
            acquisition_function = EIPS(model)
            smac = SMAC(scenario=self.scenario, tae_runner=ta,
                        acquisition_function=acquisition_function,
                        model=model, runhistory2epm=rh2EPM, rng=seed,
                        runhistory=runhistory)
        else:
            raise ValueError('Unknown acquisition function value %s!' %
                             self.acquisition_function)

        smac.solver.stats.start_timing()
        smac.solver.incumbent = smac.solver.initial_design.run()

        # Build a runtime model
        # runtime_rf = RandomForestWithInstances(types,
        #                                        instance_features=meta_features_list,
        #                                        seed=1, num_trees=10)
        # runtime_rh2EPM = RunHistory2EPM4EIPS(num_params=num_params,
        #                                      scenario=self.scenario,
        #                                      success_states=None,
        #                                      impute_censored_data=False,
        #                                      impute_state=None)
        # X_runtime, y_runtime = runtime_rh2EPM.transform(meta_runhistory)
        # runtime_rf.train(X_runtime, y_runtime[:, 1].flatten())
        # X_meta, Y_meta = rh2EPM.transform(meta_runhistory)
        # # Transform Y_meta on a per-dataset base
        # for meta_dataset in meta_runs_dataset_indices:
        #     start_index, end_index = meta_runs_dataset_indices[meta_dataset]
        #     end_index += 1  # Python indexing
        #     Y_meta[start_index:end_index, 0]\
        #         [Y_meta[start_index:end_index, 0] >2.0] =  2.0
        #     dataset_minimum = np.min(Y_meta[start_index:end_index, 0])
        #     Y_meta[start_index:end_index, 0] = 1 - (
        #         (1. - Y_meta[start_index:end_index, 0]) /
        #         (1. - dataset_minimum))
        #     Y_meta[start_index:end_index, 0]\
        #           [Y_meta[start_index:end_index, 0] > 2] = 2

        smac.solver.stats.start_timing()
        # == first, evaluate all metalearning and default configurations
        smac.solver.incumbent = smac.solver.initial_design.run()

        for challenger in metalearning_configurations:

            smac.solver.incumbent, inc_perf = smac.solver.intensifier.intensify(
                challengers=[challenger],
                incumbent=smac.solver.incumbent,
                run_history=smac.solver.runhistory,
                aggregate_func=smac.solver.aggregate_func,
                time_bound=self.total_walltime_limit)

            if smac.solver.scenario.shared_model:
                pSMAC.write(run_history=smac.solver.runhistory,
                            output_directory=smac.solver.scenario.output_dir,
                            num_run=self.seed)

            if smac.solver.stats.is_budget_exhausted():
                break

        # == after metalearning run SMAC loop
        while True:
            if smac.solver.scenario.shared_model:
                pSMAC.read(run_history=smac.solver.runhistory,
                           output_directory=self.scenario.output_dir,
                           configuration_space=self.config_space,
                           logger=self.logger)

            choose_next_start_time = time.time()
            try:
                challengers = self.choose_next(smac)
            except Exception as e:
                self.logger.error(e)
                self.logger.error("Error in getting next configurations "
                                  "with SMAC. Using random configuration!")
                next_config = self.config_space.sample_configuration()
                challengers = [next_config]
            time_for_choose_next = time.time() - choose_next_start_time
            self.logger.info('Used %g seconds to find next '
                             'configurations' % (time_for_choose_next))

            smac.solver.incumbent, inc_perf = smac.solver.intensifier.intensify(
                challengers=challengers,
                incumbent=smac.solver.incumbent,
                run_history=smac.solver.runhistory,
                aggregate_func=smac.solver.aggregate_func,
                time_bound=time_for_choose_next)

            if smac.solver.scenario.shared_model:
                pSMAC.write(run_history=smac.solver.runhistory,
                            output_directory=smac.solver.scenario.output_dir,
                            num_run=self.seed)

            if smac.solver.stats.is_budget_exhausted():
                break

        self.runhistory = smac.solver.runhistory
        return runhistory
Example #16
    def get_metalearning_suggestions(self):
        # == METALEARNING suggestions
        # we start by evaluating the defaults on the full dataset again
        # and add the suggestions from metalearning behind it
        if self.num_metalearning_cfgs > 0:
            if self.metadata_directory is None:
                metalearning_directory = os.path.dirname(
                    autosklearn.metalearning.__file__)
                # There is no multilabel data in OpenML
                if self.task == MULTILABEL_CLASSIFICATION:
                    meta_task = BINARY_CLASSIFICATION
                else:
                    meta_task = self.task
                metadata_directory = os.path.join(
                    metalearning_directory, 'files',
                    '%s_%s_%s' % (self.metric, TASK_TYPES_TO_STRING[meta_task],
                                  'sparse' if self.datamanager.info['is_sparse']
                                  else 'dense'))
                self.metadata_directory = metadata_directory

            if os.path.exists(self.metadata_directory):

                self.logger.info('Metadata directory: %s',
                                 self.metadata_directory)
                meta_base = MetaBase(self.config_space, self.metadata_directory)

                try:
                    meta_base.remove_dataset(self.dataset_name)
                except:
                    pass

                metafeature_calculation_time_limit = int(
                    self.total_walltime_limit / 4)
                metafeature_calculation_start_time = time.time()
                meta_features = self._calculate_metafeatures_with_limits(
                    metafeature_calculation_time_limit)
                metafeature_calculation_end_time = time.time()
                metafeature_calculation_time_limit = \
                    metafeature_calculation_time_limit - (
                        metafeature_calculation_end_time -
                        metafeature_calculation_start_time)

                if metafeature_calculation_time_limit < 1:
                    self.logger.warning(
                        'Time limit for metafeature calculation less '
                        'than 1 second (%f). Skipping calculation '
                        'of metafeatures for encoded dataset.',
                        metafeature_calculation_time_limit)
                    meta_features_encoded = None
                else:
                    with warnings.catch_warnings():
                        warnings.showwarning = self._send_warnings_to_log
                        self.datamanager.perform1HotEncoding()
                    meta_features_encoded = \
                        self._calculate_metafeatures_encoded_with_limits(
                            metafeature_calculation_time_limit)

                # In case there is a problem calculating the encoded meta-features
                if meta_features is None:
                    if meta_features_encoded is not None:
                        meta_features = meta_features_encoded
                else:
                    if meta_features_encoded is not None:
                        meta_features.metafeature_values.update(
                            meta_features_encoded.metafeature_values)

                if meta_features is not None:
                    meta_base.add_dataset(self.dataset_name, meta_features)
                    # Do mean imputation of the meta-features - should be done specific
                    # for each prediction model!
                    all_metafeatures = meta_base.get_metafeatures(
                        features=list(meta_features.keys()))
                    all_metafeatures.fillna(all_metafeatures.mean(),
                                            inplace=True)

                    with warnings.catch_warnings():
                        warnings.showwarning = self._send_warnings_to_log
                        metalearning_configurations = self.collect_metalearning_suggestions(
                            meta_base)
                    if metalearning_configurations is None:
                        metalearning_configurations = []
                    self.reset_data_manager()

                    self.logger.info('%s', meta_features)

                    # Convert meta-features into a dictionary because the scenario
                    # expects a dictionary
                    meta_features_dict = {}
                    for dataset, series in all_metafeatures.iterrows():
                        meta_features_dict[dataset] = series.values
                    meta_features_list = []
                    for meta_feature_name in all_metafeatures.columns:
                        meta_features_list.append(
                            meta_features[meta_feature_name].value)
                    meta_features_list = np.array(meta_features_list).reshape(
                        (1, -1))
                    self.logger.info(list(meta_features_dict.keys()))

            else:
                meta_features = None
                self.logger.warning('Could not find meta-data directory %s' %
                                    self.metadata_directory)

        else:
            meta_features = None
        if meta_features is None:
            meta_features_list = []
            metalearning_configurations = []
        return metalearning_configurations
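
The method above reserves roughly a quarter of the total walltime for metafeature calculation and gives whatever is left of that slice to the metafeatures of the one-hot-encoded data. A minimal sketch of that budgeting follows; the walltime limit is a hypothetical value, not taken from the source.

import time

# Hedged sketch of the budget split used above; total_walltime_limit is a
# hypothetical value, not taken from the example.
total_walltime_limit = 3600                 # seconds, assumed
budget = int(total_walltime_limit / 4)      # slice reserved for metafeatures

start = time.time()
# ... metafeature calculation on the raw data would run here ...
budget -= time.time() - start               # remainder for the encoded pass

if budget < 1:
    encoded_metafeatures = None             # mirrors the warning above: skip the encoded pass
else:
    # calculate metafeatures on the one-hot-encoded data within `budget` seconds
    pass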
Example #17
class MetaBaseTest(unittest.TestCase):
    _multiprocess_can_split_ = True

    def setUp(self):
        self.cwd = os.getcwd()
        data_dir = os.path.dirname(__file__)
        data_dir = os.path.join(data_dir, 'test_meta_base_data')
        os.chdir(data_dir)

        cs = autosklearn.pipeline.classification.SimpleClassificationPipeline()\
            .get_hyperparameter_search_space()

        self.base = MetaBase(cs, data_dir)

    def tearDown(self):
        os.chdir(self.cwd)

    def test_get_all_runs(self):
        runs = self.base.get_all_runs()
        self.assertIsInstance(runs, pd.DataFrame)
        # TODO update this ASAP
        self.assertEqual((134, 24), runs.shape)

    def test_get_runs(self):
        runs = self.base.get_runs('38_acc')
        # TODO update this ASAP
        self.assertEqual(24, len(runs))
        self.assertIsInstance(runs, pd.Series)

    def test_get_metafeatures_single_dataset(self):
        mf = self.base.get_metafeatures('38_acc')
        self.assertIsInstance(mf, pd.Series)
        self.assertEqual(mf.name, u'38_acc')
        self.assertEqual(mf.loc['NumberOfInstances'], 2527.0)

    def test_get_metafeatures_single_feature(self):
        mf = self.base.get_metafeatures(features='NumberOfInstances')
        self.assertIsInstance(mf, pd.Series)
        self.assertEqual(mf.shape, (140, ))

    def test_get_metafeatures_single_dataset_and_single_feature(self):
        mf = self.base.get_metafeatures('38_acc', features='NumberOfInstances')
        self.assertEqual(mf.shape, ())

    def test_get_metafeatures_multiple_datasets(self):
        mf = self.base.get_metafeatures(['38_acc', '24_acc'])
        self.assertIsInstance(mf, pd.DataFrame)
        self.assertEqual(mf.shape, (2, 46))

    def test_get_metafeatures_multiple_features(self):
        mf = self.base.get_metafeatures(features=['NumberOfInstances',
                                                  'NumberOfClasses'])
        self.assertIsInstance(mf, pd.DataFrame)
        self.assertEqual(mf.shape, (140, 2))

    def test_remove_dataset(self):
        name = "1000_acc"
        for key in self.base.algorithm_runs:
            self.assertIn(name, self.base.algorithm_runs[key].index)
        self.assertIn(name, self.base.metafeatures.index)
        metafeatures_shape = self.base.metafeatures.shape
        self.base.remove_dataset(name)
        for key in self.base.algorithm_runs:
            self.assertNotIn(name, self.base.algorithm_runs[key].index)
        self.assertNotIn(name, self.base.metafeatures.index)
        # Check that only one thing was removed
        self.assertEqual(self.base.metafeatures.shape,
                         (metafeatures_shape[0] - 1, metafeatures_shape[1]))
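
The tests above document the MetaBase surface used throughout these examples. A small, hedged usage sketch follows; the metadata path and dataset identifier are placeholders, and the exact import path and method signatures vary between auto-sklearn versions.

# Hedged sketch; 'path/to/metadata' and '38_acc' are placeholders.
import autosklearn.pipeline.classification
from autosklearn.metalearning.metalearning.meta_base import MetaBase

cs = autosklearn.pipeline.classification.SimpleClassificationPipeline()\
    .get_hyperparameter_search_space()
base = MetaBase(cs, 'path/to/metadata')

runs = base.get_all_runs()                                 # DataFrame: datasets x configurations
single = base.get_runs('38_acc')                           # Series of losses for one dataset
mf = base.get_metafeatures(features='NumberOfInstances')   # Series over all datasets
base.remove_dataset('38_acc')                              # drop the dataset from runs and metafeatures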
Example #18
    def run_smbo(self, max_iters=1000):
        global evaluator

        # == first things first: load the datamanager
        self.reset_data_manager()

        # == Initialize SMBO stuff
        # first create a scenario
        seed = self.seed  # TODO
        num_params = len(self.config_space.get_hyperparameters())
        # allocate a run history
        run_history = RunHistory()
        meta_runhistory = RunHistory()
        meta_runs_dataset_indices = {}
        num_run = self.start_num_run
        instance_id = self.dataset_name + SENTINEL

        # == Train on subset
        #    before doing anything, let us run the default_cfg
        #    on a subset of the available data to ensure that
        #    we at least have some models
        #    we will try three different ratios of decreasing magnitude
        #    in the hope that at least on the last one we will be able
        #    to get a model
        n_data = self.datamanager.data['X_train'].shape[0]
        subset_ratio = 10000. / n_data
        if subset_ratio >= 0.5:
            subset_ratio = 0.33
            subset_ratios = [subset_ratio, subset_ratio * 0.10]
        else:
            subset_ratios = [subset_ratio, 500. / n_data]
        self.logger.info("Training default configurations on a subset of "
                         "%d/%d data points." %
                         (int(n_data * subset_ratio), n_data))

        # the time limit for these function evaluations is rigorously
        # set to only 1/2 of a full function evaluation
        subset_time_limit = max(5, int(self.func_eval_time_limit / 2))
        # the configs we want to run on the data subset are:
        # 1) the default configs
        # 2) a set of configs we selected for training on a subset
        subset_configs = [self.config_space.get_default_configuration()] \
                          + self.collect_additional_subset_defaults()
        subset_config_succesful = [False] * len(subset_configs)
        for subset_config_id, next_config in enumerate(subset_configs):
            for i, ratio in enumerate(subset_ratios):
                self.reset_data_manager()
                n_data_subsample = int(n_data * ratio)

                # run the config, but throw away the result afterwards
                # since this cfg was evaluated only on a subset
                # and we don't want to confuse SMAC
                self.logger.info(
                    "Starting to evaluate %d on SUBSET "
                    "with size %d and time limit %ds.", num_run,
                    n_data_subsample, subset_time_limit)
                self.logger.info(next_config)
                _info = eval_with_limits(self.datamanager, self.tmp_dir,
                                         next_config, seed, num_run,
                                         self.resampling_strategy,
                                         self.resampling_strategy_args,
                                         self.memory_limit, subset_time_limit,
                                         n_data_subsample)
                (duration, result, _, additional_run_info, status) = _info
                self.logger.info(
                    "Finished evaluating %d. configuration on SUBSET. "
                    "Duration %f; loss %f; status %s; additional run "
                    "info: %s ", num_run, duration, result, str(status),
                    additional_run_info)

                num_run += 1
                if i < len(subset_ratios) - 1:
                    if status != StatusType.SUCCESS:
                        # Do not increase num_run here, because we will try
                        # the same configuration with less data
                        self.logger.info(
                            "A CONFIG did not finish "
                            "for subset ratio %f -> going smaller", ratio)
                        continue
                    else:
                        self.logger.info(
                            "Finished SUBSET training successfully "
                            "with ratio %f", ratio)
                        subset_config_succesful[subset_config_id] = True
                        break
                else:
                    if status != StatusType.SUCCESS:
                        self.logger.info(
                            "A CONFIG did not finish "
                            "for subset ratio %f.", ratio)
                        continue
                    else:
                        self.logger.info(
                            "Finished SUBSET training successfully "
                            "with ratio %f", ratio)
                        subset_config_succesful[subset_config_id] = True
                        break

        # Use the first non-failing configuration from the subsets as the new
        #  default configuration -> this guards us against the random forest
        # failing on large, sparse datasets
        default_cfg = None
        for subset_config_id, next_config in enumerate(subset_configs):
            if subset_config_succesful[subset_config_id]:
                default_cfg = next_config
                break
        if default_cfg is None:
            default_cfg = self.config_space.get_default_configuration()

        # == METALEARNING suggestions
        # we start by evaluating the defaults on the full dataset again
        # and add the suggestions from metalearning behind it

        if self.metadata_directory is None:
            metalearning_directory = os.path.dirname(
                autosklearn.metalearning.__file__)
            # There is no multilabel data in OpenML
            if self.task == MULTILABEL_CLASSIFICATION:
                meta_task = BINARY_CLASSIFICATION
            else:
                meta_task = self.task
            metadata_directory = os.path.join(
                metalearning_directory, 'files', '%s_%s_%s' %
                (METRIC_TO_STRING[self.metric],
                 TASK_TYPES_TO_STRING[meta_task],
                 'sparse' if self.datamanager.info['is_sparse'] else 'dense'))
            self.metadata_directory = metadata_directory

        self.logger.info('Metadata directory: %s', self.metadata_directory)
        meta_base = MetaBase(self.config_space, self.metadata_directory)

        metafeature_calculation_time_limit = int(self.total_walltime_limit / 4)
        metafeature_calculation_start_time = time.time()
        meta_features = self._calculate_metafeatures_with_limits(
            metafeature_calculation_time_limit)
        metafeature_calculation_end_time = time.time()
        metafeature_calculation_time_limit = \
            metafeature_calculation_time_limit - (
            metafeature_calculation_end_time -
            metafeature_calculation_start_time)

        if metafeature_calculation_time_limit < 1:
            self.logger.warning(
                'Time limit for metafeature calculation less '
                'than 1 second (%f). Skipping calculation '
                'of metafeatures for encoded dataset.',
                metafeature_calculation_time_limit)
            meta_features_encoded = None
        else:
            self.datamanager.perform1HotEncoding()
            meta_features_encoded = \
                self._calculate_metafeatures_encoded_with_limits(
                    metafeature_calculation_time_limit)

        # In case there is a problem calculating the encoded meta-features
        if meta_features is None:
            if meta_features_encoded is not None:
                meta_features = meta_features_encoded
        else:
            if meta_features_encoded is not None:
                meta_features.metafeature_values.update(
                    meta_features_encoded.metafeature_values)

        if meta_features is not None:
            meta_base.add_dataset(instance_id, meta_features)
            # Do mean imputation of the meta-features - should be done specific
            # for each prediction model!
            all_metafeatures = meta_base.get_metafeatures(
                features=list(meta_features.keys()))
            all_metafeatures.fillna(all_metafeatures.mean(), inplace=True)

            metalearning_configurations = self.collect_metalearning_suggestions(
                meta_base)
            if metalearning_configurations is None:
                metalearning_configurations = []
            self.reset_data_manager()

            self.logger.info('%s', meta_features)

            # Convert meta-features into a dictionary because the scenario
            # expects a dictionary
            meta_features_dict = {}
            for dataset, series in all_metafeatures.iterrows():
                meta_features_dict[dataset] = series.values
            meta_features_list = []
            for meta_feature_name in all_metafeatures.columns:
                meta_features_list.append(
                    meta_features[meta_feature_name].value)
            meta_features_list = np.array(meta_features_list).reshape((1, -1))
            self.logger.info(list(meta_features_dict.keys()))

            meta_runs = meta_base.get_all_runs(METRIC_TO_STRING[self.metric])
            meta_runs_index = 0
            try:
                meta_durations = meta_base.get_all_runs('runtime')
                read_runtime_data = True
            except KeyError:
                read_runtime_data = False
                self.logger.critical('Cannot read runtime data.')
                if self.acquisition_function == 'EIPS':
                    self.logger.critical(
                        'Reverting to acquisition function EI!')
                    self.acquisition_function = 'EI'

            for meta_dataset in meta_runs.index:
                meta_dataset_start_index = meta_runs_index
                for meta_configuration in meta_runs.columns:
                    if np.isfinite(meta_runs.loc[meta_dataset,
                                                 meta_configuration]):
                        try:
                            config = meta_base.get_configuration_from_algorithm_index(
                                meta_configuration)
                            cost = meta_runs.loc[meta_dataset,
                                                 meta_configuration]
                            if read_runtime_data:
                                runtime = meta_durations.loc[
                                    meta_dataset, meta_configuration]
                            else:
                                runtime = 1
                            # TODO read out other status types!
                            meta_runhistory.add(config,
                                                cost,
                                                runtime,
                                                StatusType.SUCCESS,
                                                instance_id=meta_dataset)
                            meta_runs_index += 1
                        except Exception:
                            # TODO maybe add warning
                            pass

                meta_runs_dataset_indices[meta_dataset] = (
                    meta_dataset_start_index, meta_runs_index)
        else:
            if self.acquisition_function == 'EIPS':
                self.logger.critical('Reverting to acquisition function EI!')
                self.acquisition_function = 'EI'
            meta_features_list = []
            meta_features_dict = {}
            metalearning_configurations = []

        self.scenario = AutoMLScenario(self.config_space,
                                       self.total_walltime_limit,
                                       self.func_eval_time_limit,
                                       meta_features_dict, self.tmp_dir,
                                       self.shared_mode)

        types = get_types(self.config_space, self.scenario.feature_array)
        if self.acquisition_function == 'EI':
            rh2EPM = RunHistory2EPM4Cost(num_params=num_params,
                                         scenario=self.scenario,
                                         success_states=None,
                                         impute_censored_data=False,
                                         impute_state=None)
            model = RandomForestWithInstances(
                types,
                instance_features=meta_features_list,
                seed=1,
                num_trees=10)
            smac = SMBO(self.scenario, model=model, rng=seed)
        elif self.acquisition_function == 'EIPS':
            rh2EPM = RunHistory2EPM4EIPS(num_params=num_params,
                                         scenario=self.scenario,
                                         success_states=None,
                                         impute_censored_data=False,
                                         impute_state=None)
            model = UncorrelatedMultiObjectiveRandomForestWithInstances(
                ['cost', 'runtime'],
                types,
                num_trees=10,
                instance_features=meta_features_list,
                seed=1)
            acquisition_function = EIPS(model)
            smac = SMBO(self.scenario,
                        acquisition_function=acquisition_function,
                        model=model,
                        runhistory2epm=rh2EPM,
                        rng=seed)
        else:
            raise ValueError('Unknown acquisition function value %s!' %
                             self.acquisition_function)

        # Build a runtime model
        # runtime_rf = RandomForestWithInstances(types,
        #                                        instance_features=meta_features_list,
        #                                        seed=1, num_trees=10)
        # runtime_rh2EPM = RunHistory2EPM4EIPS(num_params=num_params,
        #                                      scenario=self.scenario,
        #                                      success_states=None,
        #                                      impute_censored_data=False,
        #                                      impute_state=None)
        # X_runtime, y_runtime = runtime_rh2EPM.transform(meta_runhistory)
        # runtime_rf.train(X_runtime, y_runtime[:, 1].flatten())
        X_meta, Y_meta = rh2EPM.transform(meta_runhistory)
        # Transform Y_meta on a per-dataset base
        for meta_dataset in meta_runs_dataset_indices:
            start_index, end_index = meta_runs_dataset_indices[meta_dataset]
            end_index += 1  # Python indexing
            Y_meta[start_index:end_index, 0][
                Y_meta[start_index:end_index, 0] > 2.0] = 2.0
            dataset_minimum = np.min(Y_meta[start_index:end_index, 0])
            Y_meta[start_index:end_index, 0] = \
                1 - ((1. - Y_meta[start_index:end_index, 0]) /
                     (1. - dataset_minimum))
            Y_meta[start_index:end_index, 0][
                Y_meta[start_index:end_index, 0] > 2] = 2

        # == first, evaluate all metalearning and default configurations
        for i, next_config in enumerate(
            ([default_cfg] + metalearning_configurations)):
            # Do not evaluate default configurations more than once
            if i >= len([default_cfg]) and next_config in [default_cfg]:
                continue

            config_name = 'meta-learning' if i >= len([default_cfg]) \
                else 'default'

            self.logger.info(
                "Starting to evaluate %d. configuration "
                "(%s configuration) with time limit %ds.", num_run,
                config_name, self.func_eval_time_limit)
            self.logger.info(next_config)
            self.reset_data_manager()
            info = eval_with_limits(self.datamanager, self.tmp_dir,
                                    next_config, seed, num_run,
                                    self.resampling_strategy,
                                    self.resampling_strategy_args,
                                    self.memory_limit,
                                    self.func_eval_time_limit)
            (duration, result, _, additional_run_info, status) = info
            run_history.add(config=next_config,
                            cost=result,
                            time=duration,
                            status=status,
                            instance_id=instance_id,
                            seed=seed)
            run_history.update_cost(next_config, result)
            self.logger.info(
                "Finished evaluating %d. configuration. "
                "Duration %f; loss %f; status %s; additional run "
                "info: %s ", num_run, duration, result, str(status),
                additional_run_info)
            num_run += 1
            if smac.incumbent is None:
                smac.incumbent = next_config
            elif result < run_history.get_cost(smac.incumbent):
                smac.incumbent = next_config

            if self.scenario.shared_model:
                pSMAC.write(run_history=run_history,
                            output_directory=self.scenario.output_dir,
                            num_run=self.seed)

        # == after metalearning run SMAC loop
        smac.runhistory = run_history
        smac_iter = 0
        finished = False
        while not finished:
            if self.scenario.shared_model:
                pSMAC.read(run_history=run_history,
                           output_directory=self.scenario.output_dir,
                           configuration_space=self.config_space,
                           logger=self.logger)

            next_configs = []
            time_for_choose_next = -1
            try:
                X_cfg, Y_cfg = rh2EPM.transform(run_history)

                if not run_history.empty():
                    # Update costs by normalization
                    dataset_minimum = np.min(Y_cfg[:, 0])
                    Y_cfg[:, 0] = 1 - ((1. - Y_cfg[:, 0]) /
                                       (1. - dataset_minimum))
                    Y_cfg[:, 0][Y_cfg[:, 0] > 2] = 2

                if len(X_meta) > 0 and len(X_cfg) > 0:
                    pass
                    #X_cfg = np.concatenate((X_meta, X_cfg))
                    #Y_cfg = np.concatenate((Y_meta, Y_cfg))
                elif len(X_meta) > 0:
                    X_cfg = X_meta.copy()
                    Y_cfg = Y_meta.copy()
                elif len(X_cfg) > 0:
                    X_cfg = X_cfg.copy()
                    Y_cfg = Y_cfg.copy()
                else:
                    raise ValueError(
                        'No training data for SMAC random forest!')

                self.logger.info('Using %d training points for SMAC.' %
                                 X_cfg.shape[0])
                choose_next_start_time = time.time()
                next_configs_tmp = smac.choose_next(
                    X_cfg,
                    Y_cfg,
                    num_interleaved_random=110,
                    num_configurations_by_local_search=10,
                    num_configurations_by_random_search_sorted=100)
                time_for_choose_next = time.time() - choose_next_start_time
                self.logger.info('Used %g seconds to find next '
                                 'configurations' % (time_for_choose_next))
                next_configs.extend(next_configs_tmp)
            # TODO put Exception here!
            except Exception as e:
                self.logger.error(e)
                self.logger.error("Error in getting next configurations "
                                  "with SMAC. Using random configuration!")
                next_config = self.config_space.sample_configuration()
                next_configs.append(next_config)

            models_fitted_this_iteration = 0
            start_time_this_iteration = time.time()
            for next_config in next_configs:
                x_runtime = impute_inactive_values(next_config)
                x_runtime = impute_inactive_values(x_runtime).get_array()
                # predicted_runtime = runtime_rf.predict_marginalized_over_instances(
                #     x_runtime.reshape((1, -1)))
                # predicted_runtime = np.exp(predicted_runtime[0][0][0]) - 1

                self.logger.info(
                    "Starting to evaluate %d. configuration (from "
                    "SMAC) with time limit %ds.", num_run,
                    self.func_eval_time_limit)
                self.logger.info(next_config)
                self.reset_data_manager()
                info = eval_with_limits(self.datamanager, self.tmp_dir,
                                        next_config, seed, num_run,
                                        self.resampling_strategy,
                                        self.resampling_strategy_args,
                                        self.memory_limit,
                                        self.func_eval_time_limit)
                (duration, result, _, additional_run_info, status) = info
                run_history.add(config=next_config,
                                cost=result,
                                time=duration,
                                status=status,
                                instance_id=instance_id,
                                seed=seed)
                run_history.update_cost(next_config, result)

                #self.logger.info('Predicted runtime %g, true runtime %g',
                #                 predicted_runtime, duration)

                # TODO add unittest to make sure everything works fine and
                # this does not get outdated!
                if smac.incumbent is None:
                    smac.incumbent = next_config
                elif result < run_history.get_cost(smac.incumbent):
                    smac.incumbent = next_config

                self.logger.info(
                    "Finished evaluating %d. configuration. "
                    "Duration: %f; loss: %f; status %s; additional "
                    "run info: %s ", num_run, duration, result, str(status),
                    additional_run_info)
                smac_iter += 1
                num_run += 1

                models_fitted_this_iteration += 1
                time_used_this_iteration = time.time(
                ) - start_time_this_iteration
                if models_fitted_this_iteration >= 2 and \
                        time_for_choose_next > 0 and \
                        time_used_this_iteration > time_for_choose_next:
                    break
                elif time_for_choose_next <= 0 and \
                        models_fitted_this_iteration >= 1:
                    break
                elif models_fitted_this_iteration >= 50:
                    break

                if max_iters is not None:
                    finished = (smac_iter >= max_iters)

            if self.scenario.shared_model:
                pSMAC.write(run_history=run_history,
                            output_directory=self.scenario.output_dir,
                            num_run=self.seed)
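
Both the meta-run targets (Y_meta) and the live run history (Y_cfg) in the example above are rescaled per dataset before being handed to the random-forest model: losses are capped at 2 and shifted so that the best observed run on each dataset maps to 0. A standalone sketch of that transform, assuming a plain NumPy array of losses for a single dataset:

import numpy as np

def normalize_losses(y):
    # Hedged sketch of the rescaling used above: cap at 2.0, map the per-dataset
    # minimum loss to 0.0, cap again. Assumes at least one loss strictly below 1.0.
    y = np.asarray(y, dtype=float).copy()
    y[y > 2.0] = 2.0
    y_min = y.min()
    y = 1 - ((1.0 - y) / (1.0 - y_min))
    y[y > 2.0] = 2.0
    return y

print(normalize_losses([0.1, 0.2, 0.5]))  # best loss maps to 0.0; worse losses approach 1.0 (capped at 2.0)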
Example #19
class MetaLearningOptimizer(object):
    def __init__(self, dataset_name, configuration_space,
                 aslib_directory, distance='l1', seed=None, use_features='',
                 distance_kwargs=None, subset='all'):
        """Metalearning optimizer.

        Parameters
        ----------
        dataset_name : str
            Name of the dataset

        configuration_space : HPOlibConfigSpace.configuration_space.ConfigurationSpace

        aslib_directory : str
            Path to an aslib directory

        distance : str, "l1" or "l2" or "random"
            Distance function to be used by the kNearestDatasets algorithm.

        seed

        use_features

        distance_kwargs

        subset
        """
        self.dataset_name = dataset_name
        self.configuration_space = configuration_space
        self.aslib_dir = aslib_directory
        self.distance = distance
        self.seed = seed
        self.use_features = use_features
        self.distance_kwargs = distance_kwargs
        self.subset = subset
        self.kND = None     # For caching, makes things faster...

        self.meta_base = MetaBase(configuration_space, self.aslib_dir)
        self.logger = logging.getLogger(__name__)

    def perform_sequential_optimization(self, target_algorithm=test_function,
                                        time_budget=None,
                                        evaluation_budget=None):
        raise NotImplementedError("Right now this is not implemented due to "
                                  "timing issues.")
        time_taken = 0
        num_evaluations = 0
        history = []

        self.logger.info("Taking distance measure %s" % self.distance)
        while True:
            if time_budget is not None and time_taken >= time_budget:
                self.logger.info("Reached time budget. Exiting optimization.")
                break
            if evaluation_budget is not None and \
                    num_evaluations >= evaluation_budget:
                self.logger.info("Reached maximum number of evaluations. Exiting "
                            "optimization.")
                break

            params = self.metalearning_suggest(history)

            fixed_params = OrderedDict()
            # Hack to remove the leading "-" from the params which are
            # accidentally in the experiment pickle of the current HPOlib version
            for key in params:
                if key[0] == "-":
                    fixed_params[key[1:]] = params[key]
                else:
                    fixed_params[key] = params[key]

            self.logger.info("%d/%d, parameters: %s" % (num_evaluations,
                                                   evaluation_budget,
                                                   str(fixed_params)))
            result = target_algorithm(fixed_params)
            history.append(Run(params, result))
            num_evaluations += 1

        return min([run.result for run in history])

    def metalearning_suggest_all(self, exclude_double_configurations=True):
        """Return a list of the best hyperparameters of neighboring datasets"""
        # TODO check if _learn was called before!
        neighbors = self._learn(exclude_double_configurations)
        hp_list = []
        for neighbor in neighbors:
            try:
                configuration = \
                    self.meta_base.get_configuration_from_algorithm_index(
                        neighbor[2])
                self.logger.info("%s %s %s" % (neighbor[0], neighbor[1], configuration))
            except KeyError:
                self.logger.warning("Configuration %s not found" % neighbor[2])
                continue

            hp_list.append(configuration)
        return hp_list

    def metalearning_suggest(self, history):
        """Suggest the next most promising hyperparameters which were not yet evaluated"""
        # TODO test the object in the history!
        neighbors = self._learn()
        # Iterate over all datasets which are sorted ascending by distance

        history_with_indices = []
        for run in history:
            history_with_indices.append(\
                self.meta_base.get_algorithm_index_from_configuration(run))

        for idx, neighbor in enumerate(neighbors):
            already_evaluated = False
            # Check if that dataset was already evaluated
            for run in history_with_indices:
                # If so, return to the outer loop

                if neighbor[2] == run:
                    already_evaluated = True
                    break

            if not already_evaluated:
                self.logger.info("Nearest dataset with hyperparameters of best value "
                            "not evaluated yet is %s with a distance of %f" %
                            (neighbor[0], neighbor[1]))
                return self.meta_base.get_configuration_from_algorithm_index(
                    neighbor[2])
        raise StopIteration("No more values available.")

    def _learn(self, exclude_double_configurations=True):
        dataset_metafeatures, all_other_metafeatures = self._get_metafeatures()

        # Remove metafeatures which could not be calculated for the target
        # dataset
        keep = []
        for idx in dataset_metafeatures.index:
            if np.isfinite(dataset_metafeatures.loc[idx]):
                keep.append(idx)

        dataset_metafeatures = dataset_metafeatures.loc[keep]
        all_other_metafeatures = all_other_metafeatures.loc[:, keep]

        # Do mean imputation of all other metafeatures
        all_other_metafeatures = all_other_metafeatures.fillna(
            all_other_metafeatures.mean())

        if self.kND is None:
            # In case we learn our distance function, get the parameters for
            # the random forest
            if self.distance_kwargs:
                rf_params = ast.literal_eval(self.distance_kwargs)
            else:
                rf_params = None

            # To keep the distance the same in every iteration, we create a new
            # random state
            random_state = sklearn.utils.check_random_state(self.seed)
            kND = KNearestDatasets(metric=self.distance,
                                   random_state=random_state,
                                   metric_params=rf_params)

            runs = dict()
            # TODO move this code to the metabase
            for task_id in all_other_metafeatures.index:
                try:
                    runs[task_id] = self.meta_base.get_runs(task_id)
                except KeyError:
                    # TODO should I really except this?
                    self.logger.warning("Could not find runs for instance %s" % task_id)
                    runs[task_id] = pd.Series([], name=task_id)
            runs = pd.DataFrame(runs)

            kND.fit(all_other_metafeatures, runs)
            self.kND = kND
        return self.kND.kBestSuggestions(dataset_metafeatures, k=-1,
            exclude_double_configurations=exclude_double_configurations)

    def _get_metafeatures(self):
        """This is inside an extra function for testing purpose"""
        # Load the task

        self.logger.info("Going to use the metafeature subset: %s", self.subset)
        all_metafeatures = self.meta_base.get_all_metafeatures()
        self.logger.info(" ".join(all_metafeatures.columns))

        # TODO: buggy and hacky, replace with a list separated by commas
        if self.use_features and \
                (type(self.use_features) != str or self.use_features != ''):
            # self.logger.warning("Going to keep the following features %s",
            #                     str(self.use_features))
            if type(self.use_features) == str:
                use_features = self.use_features.split(",")
            elif type(self.use_features) in (list, np.ndarray):
                use_features = self.use_features
            else:
                raise NotImplementedError(type(self.use_features))

            if len(use_features) == 0:
                self.logger.info("You just tried to remove all metafeatures...")
            else:
                keep = [col for col in all_metafeatures.columns if col in use_features]
                if len(keep) == 0:
                    self.logger.info("You just tried to remove all metafeatures...")
                else:
                    all_metafeatures = all_metafeatures.loc[:, keep]
                    self.logger.info("Going to keep the following metafeatures:")
                    self.logger.info(str(keep))

        return self._split_metafeature_array(self.dataset_name, all_metafeatures)

    def _split_metafeature_array(self, dataset_name, metafeatures):
        """Split the metafeature array into dataset metafeatures and all other.

        This is inside an extra function for testing purpose.
        """
        dataset_metafeatures = metafeatures.loc[dataset_name].copy()
        metafeatures = metafeatures[metafeatures.index != dataset_name]
        return dataset_metafeatures, metafeatures

    def read_task_list(self, fh):
        dataset_filenames = list()
        for line in fh:
            line = line.replace("\n", "")
            if line:
                dataset_filenames.append(line)
            else:
                raise ValueError("Blank lines in the task list are not "
                                 "supported.")
        return dataset_filenames

    def read_experiments_list(self, fh):
        experiments_list = list()
        for line in fh.readlines():
            experiments_list.append(line.split())
        return experiments_list
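
A minimal, hedged sketch of how the optimizer above might be driven end to end; it assumes the MetaLearningOptimizer class defined above is in scope, the aslib directory and dataset name are placeholders, and the dataset is assumed to be present in the meta-base's metafeature table.

# Hedged usage sketch; paths and identifiers are placeholders.
import autosklearn.pipeline.classification

cs = autosklearn.pipeline.classification.SimpleClassificationPipeline()\
    .get_hyperparameter_search_space()

optimizer = MetaLearningOptimizer(
    dataset_name='38_acc',             # must exist in the aslib directory's metafeatures
    configuration_space=cs,
    aslib_directory='path/to/aslib',   # directory consumed by MetaBase
    distance='l1',
    seed=1)

# Ranked configurations from the nearest datasets, best first, with
# duplicate configurations filtered out.
suggestions = optimizer.metalearning_suggest_all(exclude_double_configurations=True)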
Example #20
    def get_metalearning_suggestions(self):
        # == METALEARNING suggestions
        # we start by evaluating the defaults on the full dataset again
        # and add the suggestions from metalearning behind it
        if self.num_metalearning_cfgs > 0:
            # If metadata directory is None, use default
            if self.metadata_directory is None:
                metalearning_directory = os.path.dirname(
                    autosklearn.metalearning.__file__)
                # There is no multilabel data in OpenML
                if self.task == MULTILABEL_CLASSIFICATION:
                    meta_task = BINARY_CLASSIFICATION
                else:
                    meta_task = self.task
                metadata_directory = os.path.join(
                    metalearning_directory, 'files',
                    '%s_%s_%s' % (self.metric, TASK_TYPES_TO_STRING[meta_task],
                                  'sparse' if self.datamanager.info['is_sparse']
                                  else 'dense'))
                self.metadata_directory = metadata_directory

            # If metadata directory is specified by user,
            # then verify that it exists.
            else:
                if not os.path.exists(self.metadata_directory):
                    raise ValueError('The specified metadata directory \'%s\' '
                                     'does not exist!' % self.metadata_directory)

                else:
                    # There is no multilabel data in OpenML
                    if self.task == MULTILABEL_CLASSIFICATION:
                        meta_task = BINARY_CLASSIFICATION
                    else:
                        meta_task = self.task

                    metadata_directory = os.path.join(
                        self.metadata_directory,
                        '%s_%s_%s' % (self.metric, TASK_TYPES_TO_STRING[meta_task],
                                      'sparse' if self.datamanager.info['is_sparse']
                                      else 'dense'))
                    # Check that the metadata directory has the correct
                    # subdirectory needed for this dataset.
                    if os.path.basename(metadata_directory) not in \
                            os.listdir(self.metadata_directory):
                        raise ValueError('The specified metadata directory '
                                         '\'%s\' does not have the correct '
                                         'subdirectory \'%s\'' %
                                         (self.metadata_directory,
                                          os.path.basename(metadata_directory))
                                         )
                self.metadata_directory = metadata_directory

            if os.path.exists(self.metadata_directory):

                self.logger.info('Metadata directory: %s',
                                 self.metadata_directory)
                meta_base = MetaBase(self.config_space, self.metadata_directory)

                metafeature_calculation_time_limit = int(
                    self.total_walltime_limit / 4)
                metafeature_calculation_start_time = time.time()
                meta_features = self._calculate_metafeatures_with_limits(
                    metafeature_calculation_time_limit)
                metafeature_calculation_end_time = time.time()
                metafeature_calculation_time_limit = \
                    metafeature_calculation_time_limit - (
                        metafeature_calculation_end_time -
                        metafeature_calculation_start_time)

                if metafeature_calculation_time_limit < 1:
                    self.logger.warning(
                        'Time limit for metafeature calculation less '
                        'than 1 second (%f). Skipping calculation '
                        'of metafeatures for encoded dataset.',
                        metafeature_calculation_time_limit)
                    meta_features_encoded = None
                else:
                    with warnings.catch_warnings():
                        warnings.showwarning = self._send_warnings_to_log
                        self.datamanager.perform1HotEncoding()
                    meta_features_encoded = \
                        self._calculate_metafeatures_encoded_with_limits(
                            metafeature_calculation_time_limit)

                # In case there is a problem calculating the encoded meta-features
                if meta_features is None:
                    if meta_features_encoded is not None:
                        meta_features = meta_features_encoded
                else:
                    if meta_features_encoded is not None:
                        meta_features.metafeature_values.update(
                            meta_features_encoded.metafeature_values)

                if meta_features is not None:
                    meta_base.add_dataset(self.dataset_name, meta_features)
                    # Do mean imputation of the meta-features - should be done specific
                    # for each prediction model!
                    all_metafeatures = meta_base.get_metafeatures(
                        features=list(meta_features.keys()))
                    all_metafeatures.fillna(all_metafeatures.mean(),
                                            inplace=True)

                    with warnings.catch_warnings():
                        warnings.showwarning = self._send_warnings_to_log
                        metalearning_configurations = self.collect_metalearning_suggestions(
                            meta_base)
                    if metalearning_configurations is None:
                        metalearning_configurations = []
                    self.reset_data_manager()

                    self.logger.info('%s', meta_features)

                    # Convert meta-features into a dictionary because the scenario
                    # expects a dictionary
                    meta_features_dict = {}
                    for dataset, series in all_metafeatures.iterrows():
                        meta_features_dict[dataset] = series.values
                    meta_features_list = []
                    for meta_feature_name in all_metafeatures.columns:
                        meta_features_list.append(
                            meta_features[meta_feature_name].value)
                    meta_features_list = np.array(meta_features_list).reshape(
                        (1, -1))
                    self.logger.info(list(meta_features_dict.keys()))

            else:
                meta_features = None
                self.logger.warning('Could not find meta-data directory %s' %
                                    metadata_directory)

        else:
            meta_features = None
        if meta_features is None:
            meta_features_list = []
            metalearning_configurations = []
        return metalearning_configurations
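
Both variants of get_metalearning_suggestions shown here resolve the metadata subdirectory from the metric name, the (meta-)task type string and the sparsity of the data. A small sketch of that naming scheme follows, with hypothetical values standing in for the instance attributes.

import os

# Hedged sketch of the '%s_%s_%s' naming scheme above; the concrete values are
# hypothetical stand-ins for self.metric, TASK_TYPES_TO_STRING[meta_task] and
# self.datamanager.info['is_sparse'].
metric = 'balanced_accuracy'
task_string = 'multiclass.classification'
is_sparse = False

subdir = '%s_%s_%s' % (metric, task_string, 'sparse' if is_sparse else 'dense')
metadata_directory = os.path.join('path/to/metalearning', 'files', subdir)
print(metadata_directory)  # path/to/metalearning/files/balanced_accuracy_multiclass.classification_dense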
Example #21
File: smbo.py Project: Ayaro/auto-sklearn
    def run_smbo(self, max_iters=1000):
        global evaluator

        # == first things first: load the datamanager
        self.reset_data_manager()
        
        # == Initialize SMBO stuff
        # first create a scenario
        seed = self.seed # TODO
        num_params = len(self.config_space.get_hyperparameters())
        # allocate a run history
        run_history = RunHistory()
        meta_runhistory = RunHistory()
        meta_runs_dataset_indices = {}
        num_run = self.start_num_run
        instance_id = self.dataset_name + SENTINEL

        # == Train on subset
        #    before doing anything, let us run the default_cfg
        #    on a subset of the available data to ensure that
        #    we at least have some models
        #    we will try three different ratios of decreasing magnitude
        #    in the hope that at least on the last one we will be able
        #    to get a model
        n_data = self.datamanager.data['X_train'].shape[0]
        subset_ratio = 10000. / n_data
        if subset_ratio >= 0.5:
            subset_ratio = 0.33
            subset_ratios = [subset_ratio, subset_ratio * 0.10]
        else:
            subset_ratios = [subset_ratio, 500. / n_data]
        self.logger.info("Training default configurations on a subset of "
                         "%d/%d data points." %
                         (int(n_data * subset_ratio), n_data))

        # the time limit for these function evaluations is rigorously
        # set to only 1/2 of a full function evaluation
        subset_time_limit = max(5, int(self.func_eval_time_limit / 2))
        # the configs we want to run on the data subset are:
        # 1) the default configs
        # 2) a set of configs we selected for training on a subset
        subset_configs = [self.config_space.get_default_configuration()] \
                          + self.collect_additional_subset_defaults()
        subset_config_succesful = [False] * len(subset_configs)
        for subset_config_id, next_config in enumerate(subset_configs):
            for i, ratio in enumerate(subset_ratios):
                self.reset_data_manager()
                n_data_subsample = int(n_data * ratio)

                # run the config, but throw away the result afterwards
                # since this cfg was evaluated only on a subset
                # and we don't want to confuse SMAC
                self.logger.info("Starting to evaluate %d on SUBSET "
                                 "with size %d and time limit %ds.",
                                 num_run, n_data_subsample,
                                 subset_time_limit)
                self.logger.info(next_config)
                _info = eval_with_limits(
                    self.datamanager, self.tmp_dir, next_config,
                    seed, num_run,
                    self.resampling_strategy,
                    self.resampling_strategy_args,
                    self.memory_limit,
                    subset_time_limit, n_data_subsample)
                (duration, result, _, additional_run_info, status) = _info
                self.logger.info("Finished evaluating %d. configuration on SUBSET. "
                                 "Duration %f; loss %f; status %s; additional run "
                                 "info: %s ", num_run, duration, result,
                                 str(status), additional_run_info)

                num_run += 1
                if i < len(subset_ratios) - 1:
                    if status != StatusType.SUCCESS:
                        # Do not increase num_run here, because we will try
                        # the same configuration with less data
                        self.logger.info("A CONFIG did not finish "
                                         " for subset ratio %f -> going smaller",
                                         ratio)
                        continue
                    else:
                        self.logger.info("Finished SUBSET training sucessfully "
                                         "with ratio %f", ratio)
                        subset_config_succesful[subset_config_id] = True
                        break
                else:
                    if status != StatusType.SUCCESS:
                        self.logger.info("A CONFIG did not finish "
                                         " for subset ratio %f.",
                                         ratio)
                        continue
                    else:
                        self.logger.info("Finished SUBSET training sucessfully "
                                         "with ratio %f", ratio)
                        subset_config_succesful[subset_config_id] = True
                        break

        # Use the first non-failing configuration from the subsets as the new
        #  default configuration -> this guards us against the random forest
        # failing on large, sparse datasets
        default_cfg = None
        for subset_config_id, next_config in enumerate(subset_configs):
            if subset_config_succesful[subset_config_id]:
                default_cfg = next_config
                break
        if default_cfg is None:
            default_cfg = self.config_space.get_default_configuration()

        # == METALEARNING suggestions
        # we start by evaluating the defaults on the full dataset again
        # and add the suggestions from metalearning behind it

        if self.metadata_directory is None:
            metalearning_directory = os.path.dirname(
                autosklearn.metalearning.__file__)
            # There is no multilabel data in OpenML
            if self.task == MULTILABEL_CLASSIFICATION:
                meta_task = BINARY_CLASSIFICATION
            else:
                meta_task = self.task
            metadata_directory = os.path.join(
                metalearning_directory, 'files',
                '%s_%s_%s' % (METRIC_TO_STRING[self.metric],
                              TASK_TYPES_TO_STRING[meta_task],
                              'sparse' if self.datamanager.info['is_sparse']
                              else 'dense'))
            self.metadata_directory = metadata_directory

        self.logger.info('Metadata directory: %s', self.metadata_directory)
        meta_base = MetaBase(self.config_space, self.metadata_directory)

        metafeature_calculation_time_limit = int(
            self.total_walltime_limit / 4)
        metafeature_calculation_start_time = time.time()
        meta_features = self._calculate_metafeatures_with_limits(
            metafeature_calculation_time_limit)
        metafeature_calculation_end_time = time.time()
        metafeature_calculation_time_limit = \
            metafeature_calculation_time_limit - (
            metafeature_calculation_end_time -
            metafeature_calculation_start_time)

        if metafeature_calculation_time_limit < 1:
            self.logger.warning('Time limit for metafeature calculation less '
                                'than 1 second (%f). Skipping calculation '
                                'of metafeatures for encoded dataset.',
                                metafeature_calculation_time_limit)
            meta_features_encoded = None
        else:
            self.datamanager.perform1HotEncoding()
            meta_features_encoded = \
                self._calculate_metafeatures_encoded_with_limits(
                    metafeature_calculation_time_limit)

        # In case there is a problem calculating the encoded meta-features
        if meta_features is None:
            if meta_features_encoded is not None:
                meta_features = meta_features_encoded
        else:
            if meta_features_encoded is not None:
                meta_features.metafeature_values.update(
                    meta_features_encoded.metafeature_values)

        if meta_features is not None:
            meta_base.add_dataset(instance_id, meta_features)
            # Do mean imputation of the meta-features - should be done specific
            # for each prediction model!
            all_metafeatures = meta_base.get_metafeatures(
                features=list(meta_features.keys()))
            all_metafeatures.fillna(all_metafeatures.mean(), inplace=True)

            metalearning_configurations = self.collect_metalearning_suggestions(
                meta_base)
            if metalearning_configurations is None:
                metalearning_configurations = []
            self.reset_data_manager()

            self.logger.info('%s', meta_features)

            # Convert meta-features into a dictionary because the scenario
            # expects a dictionary
            meta_features_dict = {}
            for dataset, series in all_metafeatures.iterrows():
                meta_features_dict[dataset] = series.values
            meta_features_list = []
            for meta_feature_name in all_metafeatures.columns:
                meta_features_list.append(meta_features[meta_feature_name].value)
            meta_features_list = np.array(meta_features_list).reshape((1, -1))
            self.logger.info(list(meta_features_dict.keys()))

            meta_runs = meta_base.get_all_runs(METRIC_TO_STRING[self.metric])
            meta_runs_index = 0
            try:
                meta_durations = meta_base.get_all_runs('runtime')
                read_runtime_data = True
            except KeyError:
                read_runtime_data = False
                self.logger.critical('Cannot read runtime data.')
                if self.acquisition_function == 'EIPS':
                    self.logger.critical('Reverting to acquisition function EI!')
                    self.acquisition_function = 'EI'

            for meta_dataset in meta_runs.index:
                meta_dataset_start_index = meta_runs_index
                for meta_configuration in meta_runs.columns:
                    if np.isfinite(meta_runs.loc[meta_dataset, meta_configuration]):
                        try:
                            config = meta_base.get_configuration_from_algorithm_index(
                                meta_configuration)
                            cost = meta_runs.loc[meta_dataset, meta_configuration]
                            if read_runtime_data:
                                runtime = meta_durations.loc[meta_dataset,
                                                             meta_configuration]
                            else:
                                runtime = 1
                            # TODO read out other status types!
                            meta_runhistory.add(config, cost, runtime,
                                                StatusType.SUCCESS,
                                                instance_id=meta_dataset)
                            meta_runs_index += 1
                        except Exception:
                            # Configuration could not be reconstructed from
                            # the meta-base; skip this meta-run.
                            pass

                meta_runs_dataset_indices[meta_dataset] = (
                    meta_dataset_start_index, meta_runs_index)
        else:
            if self.acquisition_function == 'EIPS':
                self.logger.critical('Reverting to acquisition function EI!')
                self.acquisition_function = 'EI'
            meta_features_list = []
            meta_features_dict = {}
            metalearning_configurations = []

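        # Build the SMAC scenario; the meta-feature dictionary presumably ends
        # up as the instance features the random-forest EPM conditions on.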
        self.scenario = AutoMLScenario(self.config_space,
                                       self.total_walltime_limit,
                                       self.func_eval_time_limit,
                                       meta_features_dict,
                                       self.tmp_dir,
                                       self.shared_mode)

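        # get_types encodes for each hyperparameter whether it is continuous
        # or categorical (and how many choices it has) so the random forest
        # can split on it correctly.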
        types = get_types(self.config_space, self.scenario.feature_array)
        if self.acquisition_function == 'EI':
            rh2EPM = RunHistory2EPM4Cost(num_params=num_params,
                                         scenario=self.scenario,
                                         success_states=None,
                                         impute_censored_data=False,
                                         impute_state=None)
            model = RandomForestWithInstances(types,
                                              instance_features=meta_features_list,
                                              seed=1, num_trees=10)
            smac = SMBO(self.scenario, model=model,
                        rng=seed)
        elif self.acquisition_function == 'EIPS':
            rh2EPM = RunHistory2EPM4EIPS(num_params=num_params,
                                         scenario=self.scenario,
                                         success_states=None,
                                         impute_censored_data=False,
                                         impute_state=None)
            model = UncorrelatedMultiObjectiveRandomForestWithInstances(
                ['cost', 'runtime'], types, num_trees=10,
                instance_features=meta_features_list, seed=1)
            acquisition_function = EIPS(model)
            smac = SMBO(self.scenario,
                        acquisition_function=acquisition_function,
                        model=model, runhistory2epm=rh2EPM, rng=seed)
        else:
            raise ValueError('Unknown acquisition function value %s!' %
                             self.acquisition_function)

        # Build a runtime model
        # runtime_rf = RandomForestWithInstances(types,
        #                                        instance_features=meta_features_list,
        #                                        seed=1, num_trees=10)
        # runtime_rh2EPM = RunHistory2EPM4EIPS(num_params=num_params,
        #                                      scenario=self.scenario,
        #                                      success_states=None,
        #                                      impute_censored_data=False,
        #                                      impute_state=None)
        # X_runtime, y_runtime = runtime_rh2EPM.transform(meta_runhistory)
        # runtime_rf.train(X_runtime, y_runtime[:, 1].flatten())
        X_meta, Y_meta = rh2EPM.transform(meta_runhistory)
        # Transform Y_meta on a per-dataset base
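        # Per dataset: clip losses at 2, then rescale with
        # y' = 1 - (1 - y) / (1 - y_min) so the best loss observed on that
        # dataset maps to 0 while a loss of 1 stays at 1; finally clip at 2
        # again to bound pathological values.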
        for meta_dataset in meta_runs_dataset_indices:
            start_index, end_index = meta_runs_dataset_indices[meta_dataset]
            end_index += 1  # Python indexing
            Y_meta[start_index:end_index, 0]\
                [Y_meta[start_index:end_index, 0] > 2.0] = 2.0
            dataset_minimum = np.min(Y_meta[start_index:end_index, 0])
            Y_meta[start_index:end_index, 0] = 1 - (
                (1. - Y_meta[start_index:end_index, 0]) /
                (1. - dataset_minimum))
            Y_meta[start_index:end_index, 0]\
                [Y_meta[start_index:end_index, 0] > 2] = 2

        # == first, evaluate all metalearning and default configurations
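        # The default configuration is evaluated first, followed by the
        # metalearning suggestions; each result is added to run_history and
        # the incumbent is updated whenever a configuration improves on it.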
        for i, next_config in enumerate(([default_cfg] +
                                          metalearning_configurations)):
            # Do not evaluate default configurations more than once
            if i >= len([default_cfg]) and next_config in [default_cfg]:
                continue

            config_name = 'meta-learning' if i >= len([default_cfg]) \
                else 'default'

            self.logger.info("Starting to evaluate %d. configuration "
                             "(%s configuration) with time limit %ds.",
                             num_run, config_name, self.func_eval_time_limit)
            self.logger.info(next_config)
            self.reset_data_manager()
            info = eval_with_limits(self.datamanager, self.tmp_dir, next_config,
                                    seed, num_run,
                                    self.resampling_strategy,
                                    self.resampling_strategy_args,
                                    self.memory_limit,
                                    self.func_eval_time_limit)
            (duration, result, _, additional_run_info, status) = info
            run_history.add(config=next_config, cost=result,
                            time=duration, status=status,
                            instance_id=instance_id, seed=seed)
            run_history.update_cost(next_config, result)
            self.logger.info("Finished evaluating %d. configuration. "
                             "Duration %f; loss %f; status %s; additional run "
                             "info: %s ", num_run, duration, result,
                             str(status), additional_run_info)
            num_run += 1
            if smac.incumbent is None:
                smac.incumbent = next_config
            elif result < run_history.get_cost(smac.incumbent):
                smac.incumbent = next_config

            if self.scenario.shared_model:
                pSMAC.write(run_history=run_history,
                            output_directory=self.scenario.output_dir,
                            num_run=self.seed)

        # == after metalearning run SMAC loop
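        # In every iteration: optionally sync the shared run history, refit
        # the EPM on all observations so far, let SMAC propose candidates,
        # and evaluate them until the per-iteration time budget is used up.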
        smac.runhistory = run_history
        smac_iter = 0
        finished = False
        while not finished:
            if self.scenario.shared_model:
                pSMAC.read(run_history=run_history,
                           output_directory=self.scenario.output_dir,
                           configuration_space=self.config_space,
                           logger=self.logger)

            next_configs = []
            time_for_choose_next = -1
            try:
                X_cfg, Y_cfg = rh2EPM.transform(run_history)

                if not run_history.empty():
                    # Update costs by normalization
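                    # Same rescaling as for the meta-runs above: the best loss
                    # so far maps to 0, a loss of 1 stays at 1, and values are
                    # capped at 2.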
                    dataset_minimum = np.min(Y_cfg[:, 0])
                    Y_cfg[:, 0] = 1 - ((1. - Y_cfg[:, 0]) /
                                       (1. - dataset_minimum))
                    Y_cfg[:, 0][Y_cfg[:, 0] > 2] = 2

                if len(X_meta) > 0 and len(X_cfg) > 0:
                    pass
                    #X_cfg = np.concatenate((X_meta, X_cfg))
                    #Y_cfg = np.concatenate((Y_meta, Y_cfg))
                elif len(X_meta) > 0:
                    X_cfg = X_meta.copy()
                    Y_cfg = Y_meta.copy()
                elif len(X_cfg) > 0:
                    X_cfg = X_cfg.copy()
                    Y_cfg = Y_cfg.copy()
                else:
                    raise ValueError('No training data for SMAC random forest!')

                self.logger.info('Using %d training points for SMAC.',
                                 X_cfg.shape[0])
                choose_next_start_time = time.time()
                next_configs_tmp = smac.choose_next(X_cfg, Y_cfg,
                                                    num_interleaved_random=110,
                                                    num_configurations_by_local_search=10,
                                                    num_configurations_by_random_search_sorted=100)
                time_for_choose_next = time.time() - choose_next_start_time
                self.logger.info('Used %g seconds to find next '
                                 'configurations', time_for_choose_next)
                next_configs.extend(next_configs_tmp)
            # TODO catch more specific exception types here!
            except Exception as e:
                self.logger.error(e)
                self.logger.error("Error in getting next configurations "
                                  "with SMAC. Using random configuration!")
                next_config = self.config_space.sample_configuration()
                next_configs.append(next_config)

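            # Evaluate proposals until roughly as much time was spent on
            # evaluation as on choosing them: at least two models if
            # choose_next took measurable time, at least one otherwise, and
            # never more than 50 per iteration.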
            models_fitted_this_iteration = 0
            start_time_this_iteration = time.time()
            for next_config in next_configs:
                x_runtime = impute_inactive_values(next_config)
                x_runtime = impute_inactive_values(x_runtime).get_array()
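                # Inactive hyperparameters are imputed so the configuration
                # can be turned into a dense vector; this fed the runtime
                # prediction code that is currently commented out below.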
                # predicted_runtime = runtime_rf.predict_marginalized_over_instances(
                #     x_runtime.reshape((1, -1)))
                # predicted_runtime = np.exp(predicted_runtime[0][0][0]) - 1

                self.logger.info("Starting to evaluate %d. configuration (from "
                                 "SMAC) with time limit %ds.", num_run,
                                 self.func_eval_time_limit)
                self.logger.info(next_config)
                self.reset_data_manager()
                info = eval_with_limits(self.datamanager, self.tmp_dir, next_config,
                                        seed, num_run,
                                        self.resampling_strategy,
                                        self.resampling_strategy_args,
                                        self.memory_limit,
                                        self.func_eval_time_limit)
                (duration, result, _, additional_run_info, status) = info
                run_history.add(config=next_config, cost=result,
                                time=duration, status=status,
                                instance_id=instance_id, seed=seed)
                run_history.update_cost(next_config, result)

                #self.logger.info('Predicted runtime %g, true runtime %g',
                #                 predicted_runtime, duration)

                # TODO add unittest to make sure everything works fine and
                # this does not get outdated!
                if smac.incumbent is None:
                    smac.incumbent = next_config
                elif result < run_history.get_cost(smac.incumbent):
                    smac.incumbent = next_config

                self.logger.info("Finished evaluating %d. configuration. "
                                 "Duration: %f; loss: %f; status %s; additional "
                                 "run info: %s ", num_run, duration, result,
                                 str(status), additional_run_info)
                smac_iter += 1
                num_run += 1

                models_fitted_this_iteration += 1
                time_used_this_iteration = time.time() - start_time_this_iteration
                if models_fitted_this_iteration >= 2 and \
                        time_for_choose_next > 0 and \
                        time_used_this_iteration > time_for_choose_next:
                    break
                elif time_for_choose_next <= 0 and \
                        models_fitted_this_iteration >= 1:
                    break
                elif models_fitted_this_iteration >= 50:
                    break

                if max_iters is not None:
                    # Stop the SMAC loop once the iteration limit is reached
                    finished = (smac_iter >= max_iters)

            if self.scenario.shared_model:
                pSMAC.write(run_history=run_history,
                            output_directory=self.scenario.output_dir,
                            num_run=self.seed)