def fit(self, train_data, val_data=None, train_size=0.9, random_state=None, time_limit=None):
    """Fit auto estimator given the input data.

    Parameters
    ----------
    train_data : pd.DataFrame or iterator
        Training data.
    val_data : pd.DataFrame or iterator, optional
        Validation data, optional. If `train_data` is a DataFrame, `val_data` will be split from
        `train_data` given `train_size`.
    train_size : float
        The portion of train data split from the original `train_data` if `val_data` is not provided.
    random_state : int
        Random state for splitting, passed to `np.random.seed`.
    time_limit : int, default is None
        The wall clock time limit (in seconds) for the fit process. If `None`, no time limit is
        enforced. If `fit` takes longer than `time_limit`, the process terminates early and
        returns the model prematurely. Due to callbacks and additional validation functions,
        `time_limit` is not precise (allow a few minutes of slack), but it can be used to
        safeguard against a very long training session. If a `time_limits` key was set in the
        config passed to `__init__`, a non-`None` `time_limit` given here overrides it.

    Returns
    -------
    Estimator
        The estimator obtained by training on the specified dataset.

    """
    config = self._config.copy()
    if time_limit is None:
        if config.get('time_limits', None):
            time_limit = config['time_limits']
        else:
            time_limit = math.inf
    elif not isinstance(time_limit, int):
        raise TypeError(
            f'Invalid type `time_limit={time_limit}`, int or None expected')
    self.scheduler_options['time_out'] = time_limit
    wall_clock_tick = time.time() + time_limit

    # split train/val before HPO to make fair comparisons
    if not isinstance(train_data, pd.DataFrame):
        assert val_data is not None, \
            "Please provide `val_data` as we do not know how to split " \
            "`train_data` of type: {}".format(type(train_data))
    if val_data is None:
        assert 0 <= train_size <= 1.0
        if random_state:
            np.random.seed(random_state)
        split_mask = np.random.rand(len(train_data)) < train_size
        train = train_data[split_mask]
        val = train_data[~split_mask]
        self._logger.info(
            'Randomly split train_data into train[%d]/validation[%d] splits.',
            len(train), len(val))
        train_data, val_data = train, val

    estimator = config.get('estimator', None)
    if estimator is None:
        estimator = [ImageClassificationEstimator]
    else:
        if isinstance(estimator, ag.Space):
            estimator = estimator.data
        elif isinstance(estimator, str):
            estimator = [estimator]
        for i, e in enumerate(estimator):
            if e == 'img_cls':
                estimator[i] = ImageClassificationEstimator
        # keep only recognized estimator classes; popping entries by value while
        # iterating the list is unsafe and raises TypeError for non-int arguments
        estimator = [e for e in estimator if e == ImageClassificationEstimator]
        if not estimator:
            raise ValueError(
                'Unable to determine the estimator for fit function.')
    if len(estimator) == 1:
        config['estimator'] = estimator[0]
    else:
        config['estimator'] = ag.Categorical(*estimator)

    # register args
    config['train_data'] = train_data
    config['val_data'] = val_data
    config['wall_clock_tick'] = wall_clock_tick
    config['log_dir'] = os.path.join(config.get('log_dir', os.getcwd()), str(uuid.uuid4())[:8])
    _train_image_classification.register_args(**config)

    start_time = time.time()
    self._fit_summary = {}
    self._results = {}
    if config.get('num_trials', 1) < 2:
        rand_config = RandomSearcher(_train_image_classification.cs).get_config()
        self._logger.info("Starting fit without HPO")
        results = _train_image_classification(_train_image_classification.args, rand_config)
        best_config = sample_config(_train_image_classification.args, rand_config)
        best_config.pop('train_data', None)
        best_config.pop('val_data', None)
        self._fit_summary.update({
            'train_acc': results.get('train_acc', -1),
            'valid_acc': results.get('valid_acc', -1),
            'total_time': results.get('time', time.time() - start_time),
            'best_config': best_config})
        self._results = self._fit_summary
    else:
        self._logger.info("Starting HPO experiments")
        results = self.run_fit(_train_image_classification, self.search_strategy,
                               self.scheduler_options)
        if isinstance(results, dict):
            ks = ('best_reward', 'best_config', 'total_time', 'config_history', 'reward_attr')
            self._results.update({k: v for k, v in results.items() if k in ks})
    end_time = time.time()
    self._logger.info("Finished, total runtime is %.2f s", end_time - start_time)
    if config.get('num_trials', 1) > 1:
        best_config = sample_config(_train_image_classification.args, results['best_config'])
        # convert best config to nested form
        best_config = config_to_nested(best_config)
        best_config.pop('train_data', None)
        best_config.pop('val_data', None)
        self._fit_summary.update({
            'train_acc': results.get('train_acc', -1),
            'valid_acc': results.get('valid_acc', results.get('best_reward', -1)),
            'total_time': results.get('total_time', time.time() - start_time),
            'best_config': best_config})
    self._logger.info(pprint.pformat(self._fit_summary, indent=2))

    if self._cleanup_disk:
        shutil.rmtree(config['log_dir'], ignore_errors=True)
    model_checkpoint = results.get('model_checkpoint', None)
    if model_checkpoint is None:
        if results.get('traceback', '') == 'timeout':
            raise TimeoutError(
                f'Unable to fit a usable model given `time_limit={time_limit}`')
        raise RuntimeError(
            f'Unexpected error happened during fit: {pprint.pformat(results, indent=2)}')
    estimator = pickle.loads(results['model_checkpoint'])
    return estimator
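# --- Illustrative usage sketch (commented out so it does not run on import) ---
# A minimal example of how this `fit` could be called on a DataFrame. The task
# class `ImageClassification`, the config keys, and the file names below are
# assumptions for illustration; only the `fit` signature itself comes from the
# code above.
#
# import pandas as pd
# from gluoncv.auto.tasks import ImageClassification
#
# train_df = pd.DataFrame({'image': ['cat0.jpg', 'dog0.jpg'], 'label': [0, 1]})
# task = ImageClassification({'num_trials': 1})
# estimator = task.fit(train_df, train_size=0.8, random_state=0, time_limit=600)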
def fit(self, train_data, val_data=None, train_size=0.9, random_state=None):
    """Fit auto estimator given the input data.

    Parameters
    ----------
    train_data : pd.DataFrame or iterator
        Training data.
    val_data : pd.DataFrame or iterator, optional
        Validation data, optional. If `train_data` is a DataFrame, `val_data` will be split from
        `train_data` given `train_size`.
    train_size : float
        The portion of train data split from the original `train_data` if `val_data` is not provided.
    random_state : int
        Random state for splitting, passed to `np.random.seed`.

    Returns
    -------
    Estimator
        The estimator obtained by training on the specified dataset.

    """
    # split train/val before HPO to make fair comparisons
    if not isinstance(train_data, pd.DataFrame):
        assert val_data is not None, \
            "Please provide `val_data` as we do not know how to split " \
            "`train_data` of type: {}".format(type(train_data))
    if val_data is None:
        assert 0 <= train_size <= 1.0
        if random_state:
            np.random.seed(random_state)
        split_mask = np.random.rand(len(train_data)) < train_size
        train = train_data[split_mask]
        val = train_data[~split_mask]
        self._logger.info(
            'Randomly split train_data into train[%d]/validation[%d] splits.',
            len(train), len(val))
        train_data, val_data = train, val

    # automatically suggest some hyperparameters based on the dataset statistics (experimental)
    estimator = self._config.get('estimator', None)
    if estimator is None:
        estimator = [ImageClassificationEstimator]
        self._config['estimator'] = ag.Categorical(*estimator)

    # register args
    config = self._config.copy()
    config['train_data'] = train_data
    config['val_data'] = val_data
    _train_image_classification.register_args(**config)

    start_time = time.time()
    self._fit_summary = {}
    if config.get('num_trials', 1) < 2:
        rand_config = RandomSearcher(_train_image_classification.cs).get_config()
        self._logger.info("Starting fit without HPO")
        results = _train_image_classification(_train_image_classification.args, rand_config)
        best_config = sample_config(_train_image_classification.args, rand_config)
        best_config.pop('train_data', None)
        best_config.pop('val_data', None)
        self._fit_summary.update({
            'train_acc': results.get('train_acc', -1),
            'valid_acc': results.get('valid_acc', -1),
            'total_time': results.get('time', time.time() - start_time),
            'best_config': best_config})
    else:
        self._logger.info("Starting HPO experiments")
        results = self.run_fit(_train_image_classification, self.search_strategy,
                               self.scheduler_options)
    end_time = time.time()
    self._logger.info("Finished model fitting, total runtime is %.2f s", end_time - start_time)
    if config.get('num_trials', 1) > 1:
        best_config = sample_config(_train_image_classification.args, results['best_config'])
        # convert best config to nested form
        best_config = config_to_nested(best_config)
        best_config.pop('train_data', None)
        best_config.pop('val_data', None)
        self._fit_summary.update({
            'train_acc': results.get('train_acc', -1),
            'valid_acc': results.get('valid_acc', results.get('best_reward', -1)),
            'total_time': results.get('total_time', time.time() - start_time),
            'best_config': best_config})
    self._logger.info(pprint.pformat(self._fit_summary, indent=2))

    # TODO: checkpointing needs to be done in a better way
    model_checkpoint = results.get('model_checkpoint', None)
    if model_checkpoint is None:
        raise RuntimeError(
            f'Unexpected error happened during fit: {pprint.pformat(results, indent=2)}')
    estimator = pickle.loads(results['model_checkpoint'])
    return estimator
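def _demo_random_split():
    """Illustrative sketch (not called by `fit` above) of the random train/validation
    split performed when `val_data` is not provided: a uniform random mask keeps
    roughly `train_size` of the rows for training and the rest for validation.
    The toy DataFrame below is a hypothetical stand-in for `train_data`."""
    import numpy as np
    import pandas as pd

    df = pd.DataFrame({'image': ['img%d.jpg' % i for i in range(10)],
                       'label': [i % 2 for i in range(10)]})
    np.random.seed(0)                            # mirrors the `random_state` handling above
    split_mask = np.random.rand(len(df)) < 0.9   # the `train_size` fraction goes to train
    train, val = df[split_mask], df[~split_mask]
    return train, val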
def test_skoptsearcher():
    logger.debug('Start testing SKoptSearcher')
    random.seed(1)
    reward_attribute = 'accuracy'
    # Create configuration space:
    cs = CS.ConfigurationSpace()
    a = CSH.UniformFloatHyperparameter('a', lower=1e-4, upper=1e-1, log=True)  # log-scale float
    b = CSH.UniformFloatHyperparameter('b', lower=-2, upper=0)  # float with uniform prior
    c = CSH.UniformIntegerHyperparameter('c', lower=0, upper=1000)  # integer
    d = CSH.CategoricalHyperparameter('d', choices=['good', 'neutral', 'bad'])  # categorical
    cs.add_hyperparameters([a, b, c, d])
    # Determine reward of optimal config:
    optimal_config = cs.sample_configuration()
    optimal_config['a'] = 1e-1
    optimal_config['b'] = 0
    optimal_config['c'] = 1000
    optimal_config['d'] = 'good'
    optimal_reward = toy_reward(optimal_config)  # should ~= 7025.58
    # Compare skopt searchers VS random sampling searcher:
    num_configs_totry = 15
    skopt_searcher = SKoptSearcher(cs, reward_attribute=reward_attribute)
    skopt_config_list = [None] * num_configs_totry
    skopt_reward_list = [0.0] * num_configs_totry  # stores rewards scaled between 0-1
    # Also try skopt searcher with various kwargs (random forest surrogate model,
    # expected improvement acquisition):
    skrf_searcher = SKoptSearcher(cs, reward_attribute=reward_attribute,
                                  base_estimator='RF', acq_func='EI')
    skrf_config_list = [None] * num_configs_totry
    skrf_reward_list = [0.0] * num_configs_totry  # stores rewards scaled between 0-1
    # Benchmark against random searcher:
    rs_searcher = RandomSearcher(cs, reward_attribute=reward_attribute)
    random_config_list = [None] * num_configs_totry
    random_reward_list = [0.0] * num_configs_totry
    # Run search:
    reported_result = {reward_attribute: 0.0}
    for i in range(num_configs_totry):
        skopt_config = skopt_searcher.get_config()
        skopt_reward = toy_reward(skopt_config) / optimal_reward
        reported_result[reward_attribute] = skopt_reward
        skopt_searcher.update(skopt_config, **reported_result)
        skopt_config_list[i] = skopt_config
        skopt_reward_list[i] = skopt_reward
        skrf_config = skrf_searcher.get_config()
        skrf_reward = toy_reward(skrf_config) / optimal_reward
        reported_result[reward_attribute] = skrf_reward
        skrf_searcher.update(skrf_config, **reported_result)
        skrf_config_list[i] = skrf_config
        skrf_reward_list[i] = skrf_reward
        rs_config = rs_searcher.get_config()
        rs_reward = toy_reward(rs_config) / optimal_reward
        reported_result[reward_attribute] = rs_reward
        rs_searcher.update(rs_config, **reported_result)
        random_config_list[i] = rs_config
        random_reward_list[i] = rs_reward
        # print("Round %d: skopt best reward=%f" % (i, max(skopt_reward_list)))
    # Summarize results:
    logger.debug("best reward from SKopt: %f, best reward from SKopt w/ RF: %f, "
                 "best reward from Random search: %f" %
                 (max(skopt_reward_list), max(skrf_reward_list), max(random_reward_list)))
    # Ensure skopt did not do significantly worse than random search:
    assert (max(skopt_reward_list) + 0.05 >= max(random_reward_list)), \
        "SKopt did significantly worse than Random Search"
    # Ensure skopt found a reasonably good config within num_configs_totry trials:
    assert (max(skopt_reward_list) >= 0.6), "SKopt performed poorly"
    logger.debug('Test Finished.')
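def _demo_skopt_searcher_usage():
    """Minimal standalone sketch of the searcher protocol exercised by the test above:
    propose a config with `get_config()`, evaluate it, and report the reward back via
    `update()`. The one-dimensional objective below is a hypothetical stand-in (it is
    not the test's `toy_reward`), and the sketch reuses the module-level `CS`, `CSH`,
    and `SKoptSearcher` imports assumed by the test."""
    cs = CS.ConfigurationSpace()
    cs.add_hyperparameters([CSH.UniformFloatHyperparameter('x', lower=0.0, upper=1.0)])
    searcher = SKoptSearcher(cs, reward_attribute='accuracy')
    for _ in range(5):
        config = searcher.get_config()            # propose a new configuration
        reward = 1.0 - (config['x'] - 0.3) ** 2   # hypothetical objective, maximized at x=0.3
        searcher.update(config, accuracy=reward)  # report the observed reward back
    return searcher.get_config()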