def get_searchspace_regression_baseline():
    params = {
        'learning_rate': Real(lower=5e-3, upper=0.2, default=0.05, log=True),
        'feature_fraction': Real(lower=0.75, upper=1.0, default=1.0),
        'min_data_in_leaf': Int(lower=2, upper=60, default=20),
        'num_leaves': Int(lower=16, upper=96, default=31),
    }
    return params
def get_searchspace_multiclass_baseline():
    params = {
        'learning_rate': Real(lower=5e-3, upper=0.2, default=0.05, log=True),
        'feature_fraction': Real(lower=0.75, upper=1.0, default=1.0),
        # TODO: Use the size of the dataset to set upper; if the row count is small, upper should be small
        'min_data_in_leaf': Int(lower=2, upper=60, default=20),
        # TODO: Use the row count and feature count to set this; the higher the feature count, the higher the num_leaves upper
        'num_leaves': Int(lower=16, upper=96, default=31),
        # TODO: Bin size max increase
    }
    return params
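# Illustrative sketch (not part of the original module): one common way to consume these
# search-space dictionaries is to collapse each tunable entry to its default when no HPO is run.
# This assumes the Real/Int/Categorical objects share a `Space` base class and expose a `default`
# attribute, as suggested by the constructors above; adjust names to the actual space API.
def resolve_default_params(searchspace):
    resolved = {}
    for name, value in searchspace.items():
        if isinstance(value, Space):  # hyperparameter left as a tunable space
            resolved[name] = value.default
        else:  # already a fixed value, pass it through unchanged
            resolved[name] = value
    return resolved

# e.g. resolve_default_params(get_searchspace_regression_baseline()) would return a plain dict
# of default hyperparameter values, with any fixed entries passed through as-is.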
def get_searchspace_regression_baseline():
    params = {
        'objective': 'regression',
        'learning_rate': Real(lower=5e-3, upper=0.2, default=0.1, log=True),
        'feature_fraction': Real(lower=0.75, upper=1.0, default=1.0),
        'min_data_in_leaf': Int(lower=2, upper=30, default=20),
        'num_leaves': Int(lower=16, upper=96, default=31),
        'num_boost_round': DEFAULT_NUM_BOOST_ROUND,
        'boosting_type': 'gbdt',
        'verbose': -1,
        'two_round': True,
        'seed_value': None,
    }
    return params
def get_base_searchspace():
    base_params = {
        'n_estimators': DEFAULT_NUM_BOOST_ROUND,
        'booster': 'gbtree',
        # TODO: xgboost plans to accept -1 for compatibility with other packages; resolve this once that lands
        'n_jobs': os.cpu_count(),
        'learning_rate': Real(lower=5e-3, upper=0.2, default=0.1, log=True),
        'max_depth': Int(lower=3, upper=10, default=3),
        'min_child_weight': Int(lower=1, upper=5, default=1),
        'gamma': Real(lower=0, upper=5, default=0.01),
        'subsample': Real(lower=0.5, upper=1.0, default=1.0),
        'colsample_bytree': Real(lower=0.5, upper=1.0, default=1.0),
        'reg_alpha': Real(lower=0.0, upper=10.0, default=0.0),
        'reg_lambda': Real(lower=0.0, upper=10.0, default=1.0),
    }
    return base_params
def get_base_searchspace():
    base_params = {
        'n_estimators': DEFAULT_NUM_BOOST_ROUND,
        'booster': 'gbtree',
        'n_jobs': -1,
        'learning_rate': Real(lower=5e-3, upper=0.2, default=0.1, log=True),
        'max_depth': Int(lower=3, upper=10, default=6),
        'min_child_weight': Int(lower=1, upper=5, default=1),
        'gamma': Real(lower=0, upper=5, default=0.01),
        'subsample': Real(lower=0.5, upper=1.0, default=1.0),
        'colsample_bytree': Real(lower=0.5, upper=1.0, default=1.0),
        'reg_alpha': Real(lower=0.0, upper=10.0, default=0.0),
        'reg_lambda': Real(lower=0.0, upper=10.0, default=1.0),
    }
    return base_params
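# Illustrative sketch (hypothetical helper, not from the original code): a task-specific search
# space would typically start from get_base_searchspace() and layer task parameters on top.
# 'binary:logistic' and 'error' are standard XGBoost objective/metric names; the function name
# and the exact keys chosen here are assumptions.
def get_searchspace_binary_baseline():
    params = get_base_searchspace()
    params.update({
        'objective': 'binary:logistic',  # standard XGBoost objective for binary classification
        'eval_metric': 'error',          # classification error rate
    })
    return params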
def get_default_searchspace():
    params = {
        'lr': Real(5e-5, 5e-3, default=1e-3, log=True),
        'weight_decay': Real(1e-6, 5e-2, default=1e-6, log=True),
        'p_dropout': Categorical(0.1, 0, 0.2, 0.3, 0.4, 0.5),
        'n_heads': Categorical(8, 2, 4),
        'hidden_dim': Categorical(128, 32, 64, 256),
        'n_layers': Categorical(1, 2, 3, 4, 5),
        'feature_dim': Int(8, 128, default=64),
        'tab_readout': Categorical('none', 'readout_emb', 'mean', 'concat_pool', 'concat_pool_all',
                                   'concat_pool_add', 'all_feat_embs', 'mean_feat_embs'),
        'num_output_layers': Categorical(2, 1, 3),
    }
    return params.copy()
def get_searchspace_binary():
    spaces = {
        # See docs: https://docs.fast.ai/tabular.models.html
        'layers': Categorical(None, [200, 100], [200], [500], [1000], [500, 200], [50, 25],
                              [1000, 500], [200, 100, 50], [500, 200, 100], [1000, 500, 200]),
        'emb_drop': Real(0.0, 0.5, default=0.1),
        'ps': Real(0.0, 0.5, default=0.1),
        'bs': Categorical(256, 64, 128, 512, 1024, 2048, 4096),
        'lr': Real(5e-5, 1e-1, default=1e-2, log=True),
        'epochs': Int(lower=5, upper=30, default=30),
        'early.stopping.min_delta': 0.0001,
        'early.stopping.patience': 20,
        'smoothing': Real(0.0, 0.3, default=0.0, log=True),
    }
    return spaces
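# Illustrative sketch (names are assumptions, not part of the original code): a user-supplied
# value normally overrides the corresponding search-space entry, turning that hyperparameter
# into a fixed value while the rest remain tunable. A plain dict merge is enough for that.
def apply_user_overrides(default_spaces, user_hyperparameters):
    spaces = default_spaces.copy()
    spaces.update(user_hyperparameters)  # e.g. {'bs': 512} pins the batch size to a fixed value
    return spaces

# Example: apply_user_overrides(get_searchspace_binary(), {'epochs': 10, 'lr': 1e-3})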
def get_searchspace_regression_baseline():
    params = {
        'learning_rate': Real(lower=5e-3, upper=0.2, default=0.05, log=True),
        'depth': Int(lower=5, upper=8, default=6),
        'l2_leaf_reg': Real(lower=1, upper=5, default=3),
    }
    return params
def get_searchspace_multiclass_baseline(num_classes):
    params = {
        'learning_rate': Real(lower=5e-3, upper=0.2, default=0.1, log=True),
        'depth': Int(lower=5, upper=8, default=6),
        'l2_leaf_reg': Real(lower=1, upper=5, default=3),
    }
    return params
def get_searchspace_multiclass_baseline(num_classes):
    params = {
        'objective': 'multiclass',
        'num_classes': num_classes,
        'learning_rate': Real(lower=5e-3, upper=0.2, default=0.1, log=True),
        'feature_fraction': Real(lower=0.75, upper=1.0, default=1.0),
        # TODO: Use the size of the dataset to set upper; if the row count is small, upper should be small
        #  (a possible approach is sketched after this function)
        'min_data_in_leaf': Int(lower=2, upper=30, default=20),
        # TODO: Use the row count and feature count to set this; the higher the feature count, the higher the num_leaves upper
        'num_leaves': Int(lower=16, upper=96, default=31),
        'num_boost_round': DEFAULT_NUM_BOOST_ROUND,
        'boosting_type': 'gbdt',
        'verbose': -1,
        'two_round': True,
        'seed_value': None,
        # 'device': 'gpu'  # needs a GPU-enabled LightGBM build
        # TODO: Bin size max increase
    }
    return params
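# One possible way to act on the TODOs above (a sketch, not the implemented behaviour): cap
# 'min_data_in_leaf' by the row count and scale the 'num_leaves' upper bound with the feature
# count before handing the space to the tuner. The function name and scaling constants are
# arbitrary assumptions made for illustration.
def adjust_searchspace_for_data(params, num_rows, num_features):
    params = params.copy()
    if isinstance(params.get('min_data_in_leaf'), Int):
        # Keep the leaf-size upper bound well below the number of training rows.
        upper = min(params['min_data_in_leaf'].upper, max(2, num_rows // 5))
        params['min_data_in_leaf'] = Int(lower=2, upper=upper, default=min(20, upper))
    if isinstance(params.get('num_leaves'), Int):
        # Allow more leaves when there are more features, up to a hard cap.
        upper = max(params['num_leaves'].lower, min(256, 16 + 2 * num_features))
        params['num_leaves'] = Int(lower=16, upper=upper, default=min(31, upper))
    return params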
def get_default_searchspace():
    params = {
        'lr': Real(5e-5, 5e-3, default=1e-3, log=True),
        'weight_decay': Real(1e-6, 5e-2, default=1e-6, log=True),
        'p_dropout': Categorical(0.1, 0, 0.5),
        'n_heads': Categorical(8, 4),
        'hidden_dim': Categorical(128, 32, 64, 256),
        'n_layers': Categorical(2, 1, 3, 4, 5),
        'feature_dim': Int(8, 128, default=64),
        'num_output_layers': Categorical(1, 2),
    }
    return params.copy()
def sanitize_batch_size(batch_size, min_value=1, max_value=np.inf):
    if isinstance(batch_size, Categorical):
        # Keep only the candidate batch sizes that fit within [min_value, max_value].
        valid_bs = []
        bs_values = batch_size.data
        for bs_value in bs_values:
            if isinstance(bs_value, int) and min_value <= bs_value <= max_value:
                valid_bs.append(bs_value)
        if valid_bs != bs_values:
            warnings.warn(f'Pruning batch size from {batch_size} to {valid_bs} due to memory limit.')
        if len(valid_bs) == 1:
            new_bs = valid_bs[0]
        else:
            new_bs = Categorical(*valid_bs)
    elif isinstance(batch_size, Int):
        # Clamp the bounds of the Int space to [min_value, max_value].
        lower = batch_size.lower
        upper = batch_size.upper
        if not isinstance(lower, int) or not isinstance(upper, int):
            raise TypeError(f'Invalid lower {lower} or upper {upper} bound for Int space')
        lower = max(lower, min_value)
        upper = min(upper, max_value)
        new_bs = Int(lower=lower, upper=upper)
        if lower != batch_size.lower or upper != batch_size.upper:
            warnings.warn(f'Adjusting batch size range from {batch_size} to {new_bs} due to memory limit.')
    elif isinstance(batch_size, int):
        # Clamp a fixed batch size to [min_value, max_value].
        new_bs = max(min(batch_size, max_value), min_value)
        if new_bs != batch_size:
            warnings.warn(f'Adjusting batch size from {batch_size} to {new_bs} due to memory limit.')
    else:
        raise TypeError(f'Expecting batch size to be (Categorical/Int/int), given {type(batch_size)}.')
    return new_bs
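# Illustrative usage of sanitize_batch_size (not part of the original module); max_value would
# normally be derived from an estimate of available memory.
def _demo_sanitize_batch_size():
    sanitize_batch_size(Categorical(32, 64, 128, 256), max_value=128)  # prunes 256 -> Categorical(32, 64, 128)
    sanitize_batch_size(Int(lower=8, upper=1024), max_value=256)       # clamps to Int(lower=8, upper=256)
    sanitize_batch_size(512, max_value=256)                            # clamps the plain int to 256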
def hyperparameter_tune(self, X_train, y_train, X_val, y_val, scheduler_options, **kwargs):
    time_start = time.time()
    logger.log(15, "Beginning hyperparameter tuning for Gradient Boosting Model...")
    self._set_default_searchspace()
    params_copy = self.params.copy()
    if isinstance(params_copy['min_data_in_leaf'], Int):
        upper_minleaf = params_copy['min_data_in_leaf'].upper
        if upper_minleaf > X_train.shape[0]:  # TODO: this min_data_in_leaf adjustment based on sample size may not be necessary
            upper_minleaf = max(1, int(X_train.shape[0] / 5.0))
            lower_minleaf = params_copy['min_data_in_leaf'].lower
            if lower_minleaf > upper_minleaf:
                lower_minleaf = max(1, int(upper_minleaf / 3.0))
            params_copy['min_data_in_leaf'] = Int(lower=lower_minleaf, upper=upper_minleaf)

    directory = self.path  # also create model directory if it doesn't exist
    # TODO: This will break on S3! Use tabular/utils/savers for datasets, add new function
    os.makedirs(directory, exist_ok=True)

    scheduler_func, scheduler_options = scheduler_options  # Unpack tuple
    if scheduler_func is None or scheduler_options is None:
        raise ValueError("scheduler_func and scheduler_options cannot be None for hyperparameter tuning")

    num_threads = scheduler_options['resource'].get('num_cpus', -1)
    params_copy['num_threads'] = num_threads
    # num_gpus = scheduler_options['resource']['num_gpus']  # TODO: unused

    dataset_train, dataset_val = self.generate_datasets(X_train=X_train, y_train=y_train, params=params_copy, X_val=X_val, y_val=y_val)
    dataset_train_filename = "dataset_train.bin"
    train_file = self.path + dataset_train_filename
    if os.path.exists(train_file):  # clean up old files first
        os.remove(train_file)
    dataset_train.save_binary(train_file)
    dataset_val_filename = "dataset_val.bin"  # names without directory info
    val_file = self.path + dataset_val_filename
    if os.path.exists(val_file):  # clean up old files first
        os.remove(val_file)
    dataset_val.save_binary(val_file)
    dataset_val_pkl_filename = 'dataset_val.pkl'
    val_pkl_path = directory + dataset_val_pkl_filename
    save_pkl.save(path=val_pkl_path, object=(X_val, y_val))

    if not np.any([isinstance(params_copy[hyperparam], Space) for hyperparam in params_copy]):
        logger.warning("Attempting to do hyperparameter optimization without any search space (all hyperparameters are already fixed values)")
    else:
        logger.log(15, "Hyperparameter search space for Gradient Boosting Model: ")
        for hyperparam in params_copy:
            if isinstance(params_copy[hyperparam], Space):
                logger.log(15, f'{hyperparam}: {params_copy[hyperparam]}')

    util_args = dict(
        dataset_train_filename=dataset_train_filename,
        dataset_val_filename=dataset_val_filename,
        dataset_val_pkl_filename=dataset_val_pkl_filename,
        directory=directory,
        model=self,
        time_start=time_start,
        time_limit=scheduler_options['time_out'],
    )

    lgb_trial.register_args(util_args=util_args, **params_copy)
    scheduler = scheduler_func(lgb_trial, **scheduler_options)
    if ('dist_ip_addrs' in scheduler_options) and (len(scheduler_options['dist_ip_addrs']) > 0):
        # This is a multi-machine setting, so the dataset needs to be copied to the workers:
        logger.log(15, "Uploading data to remote workers...")
        scheduler.upload_files([train_file, val_file, val_pkl_path])  # TODO: currently does not work
        directory = self.path  # TODO: need to change to path to working directory used on every remote machine
        lgb_trial.update(directory=directory)
        logger.log(15, "uploaded")

    scheduler.run()
    scheduler.join_jobs()

    return self._get_hpo_results(scheduler=scheduler, scheduler_options=scheduler_options, time_start=time_start)
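# Sketch of how the time_start/time_limit values packed into util_args above could be consumed
# inside a trial to respect the overall HPO budget. The helper name and the early-stop behaviour
# are assumptions, not the actual lgb_trial implementation.
def _time_remaining(time_start, time_limit):
    if time_limit is None:
        return None  # no budget configured
    elapsed = time.time() - time_start
    return time_limit - elapsed

# A trial could skip or shorten training when _time_remaining(...) drops below a safety margin.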