Example #1
    def fit(self,
            train_data,
            val_data=None,
            train_size=0.9,
            random_state=None,
            time_limit=None):
        """Fit auto estimator given the input data.

        Parameters
        ----------
        train_data : pd.DataFrame or iterator
            Training data.
        val_data : pd.DataFrame or iterator, optional
            Validation data. If `train_data` is a DataFrame and `val_data` is not provided,
            a validation set will be split off from `train_data` according to `train_size`.
        train_size : float
            The portion of the original `train_data` used for training if `val_data` is not provided.
        random_state : int
            Random state for the split, passed to `np.random.seed`.
        time_limit : int, default is None
            The wall clock time limit (in seconds) for the fit process; if `None`, no time limit is enforced.
            If `fit` takes longer than `time_limit`, the process will terminate early and return the
            model prematurely.
            Due to callbacks and additional validation functions, the `time_limit` may not be very precise
            (allow a few minutes of slack), but it can be used to safeguard a very long training session.
            If a `time_limits` key was set in the config passed to `__init__`, a non-`None` `time_limit`
            will overwrite that configuration value.

        Returns
        -------
        Estimator
            The estimator obtained by training on the specified dataset.

        """
        config = self._config.copy()
        if time_limit is None:
            if config.get('time_limits', None):
                time_limit = config['time_limits']
            else:
                time_limit = math.inf
        elif not isinstance(time_limit, int):
            raise TypeError(
                f'Invalid type `time_limit={time_limit}`, int or None expected'
            )
        self.scheduler_options['time_out'] = time_limit
        wall_clock_tick = time.time() + time_limit
        # split train/val before HPO to make fair comparisons
        if not isinstance(train_data, pd.DataFrame):
            assert val_data is not None, \
                "Please provide `val_data` as we do not know how to split " \
                "`train_data` of type: {}".format(type(train_data))

        if val_data is None:
            assert 0 <= train_size <= 1.0
            if random_state:
                np.random.seed(random_state)
            split_mask = np.random.rand(len(train_data)) < train_size
            train = train_data[split_mask]
            val = train_data[~split_mask]
            self._logger.info(
                'Randomly split train_data into train[%d]/validation[%d] splits.',
                len(train), len(val))
            train_data, val_data = train, val

        estimator = config.get('estimator', None)
        if estimator is None:
            estimator = [ImageClassificationEstimator]
        else:
            if isinstance(estimator, ag.Space):
                estimator = estimator.data
            elif isinstance(estimator, str):
                estimator = [estimator]
            for i, e in enumerate(estimator):
                if e == 'img_cls':
                    estimator[i] = ImageClassificationEstimator
            # drop any entries that could not be mapped to a known estimator class
            estimator = [e for e in estimator if e is ImageClassificationEstimator]
        if not estimator:
            raise ValueError(
                'Unable to determine the estimator for fit function.')
        if len(estimator) == 1:
            config['estimator'] = estimator[0]
        else:
            config['estimator'] = ag.Categorical(*estimator)

        # register args
        config['train_data'] = train_data
        config['val_data'] = val_data
        config['wall_clock_tick'] = wall_clock_tick
        config['log_dir'] = os.path.join(config.get('log_dir', os.getcwd()),
                                         str(uuid.uuid4())[:8])
        _train_image_classification.register_args(**config)

        start_time = time.time()
        self._fit_summary = {}
        self._results = {}
        if config.get('num_trials', 1) < 2:
            rand_config = RandomSearcher(
                _train_image_classification.cs).get_config()
            self._logger.info("Starting fit without HPO")
            results = _train_image_classification(
                _train_image_classification.args, rand_config)
            best_config = sample_config(_train_image_classification.args,
                                        rand_config)
            best_config.pop('train_data', None)
            best_config.pop('val_data', None)
            self._fit_summary.update({
                'train_acc': results.get('train_acc', -1),
                'valid_acc': results.get('valid_acc', -1),
                'total_time': results.get('time', time.time() - start_time),
                'best_config': best_config
            })
            self._results = self._fit_summary
        else:
            self._logger.info("Starting HPO experiments")
            results = self.run_fit(_train_image_classification,
                                   self.search_strategy,
                                   self.scheduler_options)
            if isinstance(results, dict):
                ks = ('best_reward', 'best_config', 'total_time',
                      'config_history', 'reward_attr')
                self._results.update(
                    {k: v for k, v in results.items() if k in ks})
        end_time = time.time()
        self._logger.info("Finished, total runtime is %.2f s",
                          end_time - start_time)
        if config.get('num_trials', 1) > 1:
            best_config = sample_config(_train_image_classification.args,
                                        results['best_config'])
            # convert best config to nested form
            best_config = config_to_nested(best_config)
            best_config.pop('train_data', None)
            best_config.pop('val_data', None)
            self._fit_summary.update({
                'train_acc': results.get('train_acc', -1),
                'valid_acc': results.get('valid_acc', results.get('best_reward', -1)),
                'total_time': results.get('total_time', time.time() - start_time),
                'best_config': best_config
            })
        self._logger.info(pprint.pformat(self._fit_summary, indent=2))

        if self._cleanup_disk:
            shutil.rmtree(config['log_dir'], ignore_errors=True)
        model_checkpoint = results.get('model_checkpoint', None)
        if model_checkpoint is None:
            if results.get('traceback', '') == 'timeout':
                raise TimeoutError(
                    f'Unable to fit a usable model given `time_limit={time_limit}`'
                )
            raise RuntimeError(
                f'Unexpected error happened during fit: {pprint.pformat(results, indent=2)}'
            )
        estimator = pickle.loads(model_checkpoint)
        return estimator
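
The fit method above carves a validation set out of a DataFrame with a random boolean mask. Below is a minimal standalone sketch of that splitting step on a small synthetic DataFrame (the column names are illustrative only, not required by the estimator):

import numpy as np
import pandas as pd

# Synthetic stand-in for `train_data`; real data would hold image paths and labels.
df = pd.DataFrame({'image': ['img_%d.jpg' % i for i in range(100)],
                   'label': np.random.randint(0, 5, size=100)})

np.random.seed(0)                           # mirrors the `random_state` handling in fit()
split_mask = np.random.rand(len(df)) < 0.9  # keep roughly 90% of the rows for training
train, val = df[split_mask], df[~split_mask]
print(len(train), len(val))                 # about 90 / 10 on average
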
Example #2
    def fit(self,
            train_data,
            val_data=None,
            train_size=0.9,
            random_state=None):
        """Fit auto estimator given the input data.

        Parameters
        ----------
        train_data : pd.DataFrame or iterator
            Training data.
        val_data : pd.DataFrame or iterator, optional
            Validation data. If `train_data` is a DataFrame and `val_data` is not provided,
            a validation set will be split off from `train_data` according to `train_size`.
        train_size : float
            The portion of the original `train_data` used for training if `val_data` is not provided.
        random_state : int
            Random state for the split, passed to `np.random.seed`.

        Returns
        -------
        Estimator
            The estimator obtained by training on the specified dataset.

        """
        # split train/val before HPO to make fair comparisons
        if not isinstance(train_data, pd.DataFrame):
            assert val_data is not None, \
                "Please provide `val_data` as we do not know how to split " \
                "`train_data` of type: {}".format(type(train_data))

        if val_data is None:
            assert 0 <= train_size <= 1.0
            if random_state:
                np.random.seed(random_state)
            split_mask = np.random.rand(len(train_data)) < train_size
            train = train_data[split_mask]
            val = train_data[~split_mask]
            self._logger.info(
                'Randomly split train_data into train[%d]/validation[%d] splits.',
                len(train), len(val))
            train_data, val_data = train, val

        # automatically suggest some hyperparameters based on the dataset statistics (experimental)
        estimator = self._config.get('estimator', None)
        if estimator is None:
            estimator = [ImageClassificationEstimator]
        self._config['estimator'] = ag.Categorical(*estimator)

        # register args
        config = self._config.copy()
        config['train_data'] = train_data
        config['val_data'] = val_data
        _train_image_classification.register_args(**config)

        start_time = time.time()
        self._fit_summary = {}
        if config.get('num_trials', 1) < 2:
            rand_config = RandomSearcher(
                _train_image_classification.cs).get_config()
            self._logger.info("Starting fit without HPO")
            results = _train_image_classification(
                _train_image_classification.args, rand_config)
            best_config = sample_config(_train_image_classification.args,
                                        rand_config)
            best_config.pop('train_data', None)
            best_config.pop('val_data', None)
            self._fit_summary.update({
                'train_acc': results.get('train_acc', -1),
                'valid_acc': results.get('valid_acc', -1),
                'total_time': results.get('time', time.time() - start_time),
                'best_config': best_config
            })
        else:
            self._logger.info("Starting HPO experiments")
            results = self.run_fit(_train_image_classification,
                                   self.search_strategy,
                                   self.scheduler_options)
        end_time = time.time()
        self._logger.info(
            ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> finish model fitting")
        self._logger.info("total runtime is %.2f s", end_time - start_time)
        if config.get('num_trials', 1) > 1:
            best_config = sample_config(_train_image_classification.args,
                                        results['best_config'])
            # convert best config to nested form
            best_config = config_to_nested(best_config)
            best_config.pop('train_data', None)
            best_config.pop('val_data', None)
            self._fit_summary.update({
                'train_acc': results.get('train_acc', -1),
                'valid_acc': results.get('valid_acc', results.get('best_reward', -1)),
                'total_time': results.get('total_time', time.time() - start_time),
                'best_config': best_config
            })
        self._logger.info(pprint.pformat(self._fit_summary, indent=2))

        # TODO: checkpointing needs to be done in a better way
        model_checkpoint = results.get('model_checkpoint', None)
        if model_checkpoint is None:
            raise RuntimeError(
                f'Unexpected error happened during fit: {pprint.pformat(results, indent=2)}'
            )
        estimator = pickle.loads(model_checkpoint)
        return estimator
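
Both fit variants above recover the trained estimator from `results['model_checkpoint']` with pickle. A minimal sketch of that round trip, using a dummy class as a hypothetical stand-in for ImageClassificationEstimator:

import pickle

class DummyEstimator:                      # hypothetical stand-in for the real estimator
    def __init__(self, acc):
        self.acc = acc

# The training function would serialize the fitted estimator into its results dict...
results = {'model_checkpoint': pickle.dumps(DummyEstimator(acc=0.93))}
# ...and fit() restores it exactly as the examples above do.
estimator = pickle.loads(results['model_checkpoint'])
print(estimator.acc)                       # 0.93
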
Example #3
def test_skoptsearcher():
    logger.debug('Start testing SKoptSearcher')
    random.seed(1)
    reward_attribute = 'accuracy'
    # Create configuration space:
    cs = CS.ConfigurationSpace()
    a = CSH.UniformFloatHyperparameter('a', lower=1e-4, upper=1e-1, log=True) # log-scale float
    b = CSH.UniformFloatHyperparameter('b', lower=-2, upper=0) # float with uniform prior
    c = CSH.UniformIntegerHyperparameter('c', lower=0, upper=1000) # integer
    d = CSH.CategoricalHyperparameter('d', choices=['good','neutral','bad']) # categorical
    cs.add_hyperparameters([a,b,c,d])
    # Determine reward of optimal config:
    optimal_config = cs.sample_configuration()
    optimal_config['a'] = 1e-1
    optimal_config['b'] = 0
    optimal_config['c'] = 1000
    optimal_config['d'] = 'good' 
    optimal_reward = toy_reward(optimal_config) # should ~= 7025.58
    # Compare skopt searchers VS random sampling searcher:
    num_configs_totry = 15
    skopt_searcher = SKoptSearcher(
        cs, reward_attribute=reward_attribute)
    skopt_config_list = [None]*num_configs_totry
    skopt_reward_list = [0.0]*num_configs_totry # stores rewards scaled between 0-1
    # Also try a skopt searcher that uses custom kwargs (random forest surrogate model, expected improvement acquisition):
    skrf_searcher = SKoptSearcher(
        cs, reward_attribute=reward_attribute, base_estimator='RF',
        acq_func='EI')
    skrf_config_list = [None]*num_configs_totry 
    skrf_reward_list = [0.0]*num_configs_totry # stores rewards scaled between 0-1
    # Benchmark against random searcher:
    rs_searcher = RandomSearcher(cs, reward_attribute=reward_attribute)
    random_config_list = [None]*num_configs_totry
    random_reward_list = [0.0]*num_configs_totry
    # Run search:
    reported_result = {reward_attribute: 0.0}
    for i in range(num_configs_totry):
        skopt_config = skopt_searcher.get_config()
        skopt_reward = toy_reward(skopt_config) / optimal_reward
        reported_result[reward_attribute] = skopt_reward
        skopt_searcher.update(skopt_config, **reported_result)
        skopt_config_list[i] = skopt_config
        skopt_reward_list[i] = skopt_reward
        skrf_config = skrf_searcher.get_config()
        skrf_reward = toy_reward(skrf_config) / optimal_reward
        reported_result[reward_attribute] = skrf_reward
        skrf_searcher.update(skrf_config, **reported_result)
        skrf_config_list[i] = skrf_config
        skrf_reward_list[i] = skrf_reward
        rs_config = rs_searcher.get_config()
        rs_reward = toy_reward(rs_config) / optimal_reward
        reported_result[reward_attribute] = rs_reward
        rs_searcher.update(rs_config, **reported_result)
        random_config_list[i] = rs_config
        random_reward_list[i] = rs_reward
        # print("Round %d: skopt best reward=%f" % (i,max(skopt_reward_list)))
    # Summarize results:
    logger.debug("best reward from SKopt: %f,  best reward from SKopt w/ RF: %f,  best reward from Random search: %f" % 
          (max(skopt_reward_list), max(skrf_reward_list), max(random_reward_list)))
    # Ensure skopt outperformed random search:
    assert (max(skopt_reward_list) + 0.05 >= max(random_reward_list)), "SKopt did significantly worse than Random Search"
    # Ensure skopt found reasonably good config within num_configs_totry:
    assert (max(skopt_reward_list) >= 0.6), "SKopt performed poorly"
    logger.debug('Test Finished.')
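
The test above exercises the common searcher protocol: `get_config()` proposes a candidate and `update()` reports its reward under `reward_attribute`. A minimal sketch of that loop on the same configuration space `cs`, with a hypothetical stand-in for `toy_reward` (the real function is defined elsewhere in the test module):

def fake_reward(config):                              # hypothetical scoring, for illustration only
    return config['a'] * 10 + config['c'] / 1000.0

searcher = RandomSearcher(cs, reward_attribute='accuracy')
for _ in range(5):
    cfg = searcher.get_config()                       # propose a configuration
    searcher.update(cfg, accuracy=fake_reward(cfg))   # report its reward back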