Example #1
def get_searchspace_regression_baseline():
    params = {
        'learning_rate': Real(lower=5e-3, upper=0.2, default=0.05, log=True),
        'feature_fraction': Real(lower=0.75, upper=1.0, default=1.0),
        'min_data_in_leaf': Int(lower=2, upper=60, default=20),
        'num_leaves': Int(lower=16, upper=96, default=31),
    }
    return params
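
The Real and Int entries above are search-space dimensions rather than plain values. As a minimal, self-contained sketch of how such a dict can be collapsed to its default configuration (the Real and Int classes below are simplified stand-ins, not the real space implementation):

from dataclasses import dataclass

@dataclass
class Real:
    lower: float
    upper: float
    default: float = None
    log: bool = False

@dataclass
class Int:
    lower: int
    upper: int
    default: int = None

def defaults_from_searchspace(params):
    # Replace each tunable dimension with its default; pass fixed values through unchanged.
    return {k: (v.default if isinstance(v, (Real, Int)) else v) for k, v in params.items()}

print(defaults_from_searchspace(get_searchspace_regression_baseline()))
# {'learning_rate': 0.05, 'feature_fraction': 1.0, 'min_data_in_leaf': 20, 'num_leaves': 31}
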
Example #2
def get_searchspace_multiclass_baseline():
    params = {
        'learning_rate': Real(lower=5e-3, upper=0.2, default=0.05, log=True),
        'feature_fraction': Real(lower=0.75, upper=1.0, default=1.0),
        'min_data_in_leaf': Int(
            lower=2, upper=60, default=20
        ),  # TODO: Use the dataset size to set upper; if the row count is small, upper should be small
        'num_leaves': Int(
            lower=16, upper=96, default=31
        ),  # TODO: Use row count and feature count to set this; the higher the feature count, the higher the num_leaves upper bound
        # TODO: Bin size max increase
    }
    return params
Example #3
def get_searchspace_regression_baseline():
    params = {
        'objective': 'regression',
        'learning_rate': Real(lower=5e-3, upper=0.2, default=0.1, log=True),
        'feature_fraction': Real(lower=0.75, upper=1.0, default=1.0),
        'min_data_in_leaf': Int(lower=2, upper=30, default=20),
        'num_leaves': Int(lower=16, upper=96, default=31),
        'num_boost_round': DEFAULT_NUM_BOOST_ROUND,
        'boosting_type': 'gbdt',
        'verbose': -1,
        'two_round': True,
        'seed_value': None,
    }
    return params
Example #4
def get_base_searchspace():
    base_params = {
        'n_estimators': DEFAULT_NUM_BOOST_ROUND,
        'booster': 'gbtree',
        'n_jobs': os.cpu_count(),  # TODO: xgboost plans to accept -1 for compatibility with other packages; switch to -1 once that lands.
        'learning_rate': Real(lower=5e-3, upper=0.2, default=0.1, log=True),
        'max_depth': Int(lower=3, upper=10, default=3),
        'min_child_weight': Int(lower=1, upper=5, default=1),
        'gamma': Real(lower=0, upper=5, default=0.01),
        'subsample': Real(lower=0.5, upper=1.0, default=1.0),
        'colsample_bytree': Real(lower=0.5, upper=1.0, default=1.0),
        'reg_alpha': Real(lower=0.0, upper=10.0, default=0.0),
        'reg_lambda': Real(lower=0.0, upper=10.0, default=1.0),
    }
    return base_params
Example #5
def get_base_searchspace():
    base_params = {
        'n_estimators': DEFAULT_NUM_BOOST_ROUND,
        'booster': 'gbtree',
        'n_jobs': -1,
        'learning_rate': Real(lower=5e-3, upper=0.2, default=0.1, log=True),
        'max_depth': Int(lower=3, upper=10, default=6),
        'min_child_weight': Int(lower=1, upper=5, default=1),
        'gamma': Real(lower=0, upper=5, default=0.01),
        'subsample': Real(lower=0.5, upper=1.0, default=1.0),
        'colsample_bytree': Real(lower=0.5, upper=1.0, default=1.0),
        'reg_alpha': Real(lower=0.0, upper=10.0, default=0.0),
        'reg_lambda': Real(lower=0.0, upper=10.0, default=1.0),
    }
    return base_params
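
Once the tunable Real/Int dimensions in the space above have been resolved to concrete values, every remaining key maps directly onto XGBoost's scikit-learn estimator. A hedged sketch (not AutoGluon's actual training code; the value 300 is only a stand-in for DEFAULT_NUM_BOOST_ROUND, and the tunables are shown at their defaults):

from xgboost import XGBRegressor

resolved = {
    'n_estimators': 300,        # stand-in for DEFAULT_NUM_BOOST_ROUND
    'booster': 'gbtree',
    'n_jobs': -1,
    'learning_rate': 0.1,
    'max_depth': 6,
    'min_child_weight': 1,
    'gamma': 0.01,
    'subsample': 1.0,
    'colsample_bytree': 1.0,
    'reg_alpha': 0.0,
    'reg_lambda': 1.0,
}
model = XGBRegressor(**resolved)  # every key above is a valid XGBRegressor keyword argument
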
Example #6
def get_default_searchspace():
    params = {
        'lr': Real(5e-5, 5e-3, default=1e-3, log=True),
        'weight_decay': Real(1e-6, 5e-2, default=1e-6, log=True),
        'p_dropout': Categorical(0.1, 0, 0.2, 0.3, 0.4, 0.5),
        'n_heads': Categorical(8, 2, 4),
        'hidden_dim': Categorical(128, 32, 64, 256),
        'n_layers': Categorical(1, 2, 3, 4, 5),
        'feature_dim': Int(8, 128, default=64),
        'tab_readout': Categorical('none', 'readout_emb', 'mean', 'concat_pool',
                                   'concat_pool_all', 'concat_pool_add', 'all_feat_embs',
                                   'mean_feat_embs'),
        'num_output_layers': Categorical(2, 1, 3),
    }

    return params.copy()
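
A random configuration can be drawn from a space like the one above by sampling each dimension according to its type. This is a minimal sketch, not the real sampler: it reuses the Real/Int stand-ins from the sketch after Example #1 and adds an equally simplified Categorical stand-in (a container exposing its candidate values as `.data`).

import math
import random

class Categorical:
    # Simplified stand-in (assumption): stores its candidate values on `.data`,
    # with the first value conventionally treated as the default.
    def __init__(self, *data):
        self.data = list(data)

def sample_config(params, rng=None):
    rng = rng or random.Random(0)
    config = {}
    for name, space in params.items():
        if isinstance(space, Categorical):
            config[name] = rng.choice(space.data)
        elif isinstance(space, Int):
            config[name] = rng.randint(space.lower, space.upper)
        elif isinstance(space, Real):
            if space.log:
                # Sample uniformly in log space for log-scaled dimensions.
                config[name] = math.exp(rng.uniform(math.log(space.lower), math.log(space.upper)))
            else:
                config[name] = rng.uniform(space.lower, space.upper)
        else:
            config[name] = space  # fixed (non-tunable) value
    return config

print(sample_config(get_default_searchspace()))
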
Example #7
def get_searchspace_binary():
    spaces = {
        # See docs: https://docs.fast.ai/tabular.models.html
        'layers': Categorical(None, [200, 100], [200], [500], [1000], [500, 200],
                              [50, 25], [1000, 500], [200, 100, 50], [500, 200, 100],
                              [1000, 500, 200]),
        'emb_drop': Real(0.0, 0.5, default=0.1),
        'ps': Real(0.0, 0.5, default=0.1),
        'bs': Categorical(256, 64, 128, 512, 1024, 2048, 4096),
        'lr': Real(5e-5, 1e-1, default=1e-2, log=True),
        'epochs': Int(lower=5, upper=30, default=30),
        'early.stopping.min_delta': 0.0001,
        'early.stopping.patience': 20,
        'smoothing': Real(0.0, 0.3, default=0.0, log=True),
    }
    return spaces
Example #8
def get_searchspace_regression_baseline():
    params = {
        'learning_rate': Real(lower=5e-3, upper=0.2, default=0.05, log=True),
        'depth': Int(lower=5, upper=8, default=6),
        'l2_leaf_reg': Real(lower=1, upper=5, default=3),
    }
    return params
Example #9
def get_searchspace_multiclass_baseline(num_classes):
    params = {
        'learning_rate': Real(lower=5e-3, upper=0.2, default=0.1, log=True),
        'depth': Int(lower=5, upper=8, default=6),
        'l2_leaf_reg': Real(lower=1, upper=5, default=3),
    }
    return params
Example #10
def get_searchspace_multiclass_baseline(num_classes):
    params = {
        'objective': 'multiclass',
        'num_classes': num_classes,
        'learning_rate': Real(lower=5e-3, upper=0.2, default=0.1, log=True),
        'feature_fraction': Real(lower=0.75, upper=1.0, default=1.0),
        'min_data_in_leaf': Int(lower=2, upper=30, default=20),  # TODO: Use the dataset size to set upper; if the row count is small, upper should be small
        'num_leaves': Int(lower=16, upper=96, default=31),  # TODO: Use row count and feature count to set this; the higher the feature count, the higher the num_leaves upper bound
        'num_boost_round': DEFAULT_NUM_BOOST_ROUND,
        'boosting_type': 'gbdt',
        'verbose': -1,
        'two_round': True,
        'seed_value': None,
        # 'device': 'gpu'  # needs GPU-enabled lightGBM build
        # TODO: Bin size max increase
    }
    return params
Example #11
def get_default_searchspace():
    params = {
        'lr': Real(5e-5, 5e-3, default=1e-3, log=True),
        'weight_decay': Real(1e-6, 5e-2, default=1e-6, log=True),
        'p_dropout': Categorical(0.1, 0, 0.5),
        'n_heads': Categorical(8, 4),
        'hidden_dim': Categorical(128, 32, 64, 256),
        'n_layers': Categorical(2, 1, 3, 4, 5),
        'feature_dim': Int(8, 128, default=64),
        'num_output_layers': Categorical(1, 2),
    }

    return params.copy()
Example #12
def sanitize_batch_size(batch_size, min_value=1, max_value=np.inf):
    if isinstance(batch_size, Categorical):
        valid_bs = []
        bs_values = batch_size.data
        for bs_value in bs_values:
            if isinstance(bs_value, int) and min_value < bs_value < max_value:
                valid_bs.append(bs_value)
        if valid_bs != bs_values:
            warnings.warn(
                f'Pruning batch size from {batch_size} to {valid_bs} due to memory limit.'
            )
        if len(valid_bs) == 1:
            new_bs = valid_bs[0]
        else:
            new_bs = Categorical(*valid_bs)
    elif isinstance(batch_size, Int):
        lower = batch_size.lower
        upper = batch_size.upper
        if not isinstance(lower, int) or not isinstance(upper, int):
            raise TypeError(
                f'Invalid lower {lower} or upper {upper} bound for Int space')
        lower = max(lower, min_value)
        upper = min(upper, max_value)
        new_bs = Int(lower=lower, upper=upper)
        if lower != batch_size.lower or upper != batch_size.upper:
            warnings.warn(
                f'Adjusting batch size range from {batch_size} to {new_bs} due to memory limit.'
            )
    elif isinstance(batch_size, int):
        new_bs = max(min(batch_size, max_value), min_value)
        if new_bs != batch_size:
            warnings.warn(
                f'Adjusting batch size from {batch_size} to {new_bs} due to memory limit.'
            )
    else:
        raise TypeError(
            f'Expecting batch size to be (Categorical/Int/int), given {type(batch_size)}.'
        )
    return new_bs
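
A hedged usage sketch for sanitize_batch_size above, reusing the simplified Categorical stand-in from the sketch after Example #6 rather than the real space class (the function only reads `.data` from it, and the Int branch is never reached here). The cap of 512 is an illustrative memory limit:

import warnings   # used by sanitize_batch_size for its pruning warnings
import numpy as np  # provides the np.inf default for max_value

proposed = Categorical(64, 128, 256, 512, 1024)
safe = sanitize_batch_size(proposed, max_value=512)
# Candidates at or above the 512 cap fail the `min_value < bs_value < max_value`
# check and are pruned with a warning, leaving Categorical(64, 128, 256).
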
Example #13
    def hyperparameter_tune(self, X_train, y_train, X_val, y_val, scheduler_options, **kwargs):
        time_start = time.time()
        logger.log(15, "Beginning hyperparameter tuning for Gradient Boosting Model...")
        self._set_default_searchspace()
        params_copy = self.params.copy()
        if isinstance(params_copy['min_data_in_leaf'], Int):
            upper_minleaf = params_copy['min_data_in_leaf'].upper
            if upper_minleaf > X_train.shape[0]:  # TODO: this min_data_in_leaf adjustment based on sample size may not be necessary
                upper_minleaf = max(1, int(X_train.shape[0] / 5.0))
                lower_minleaf = params_copy['min_data_in_leaf'].lower
                if lower_minleaf > upper_minleaf:
                    lower_minleaf = max(1, int(upper_minleaf / 3.0))
                params_copy['min_data_in_leaf'] = Int(lower=lower_minleaf, upper=upper_minleaf)

        directory = self.path  # also create model directory if it doesn't exist
        # TODO: This will break on S3! Use tabular/utils/savers for datasets, add new function
        os.makedirs(directory, exist_ok=True)
        scheduler_func, scheduler_options = scheduler_options  # Unpack tuple
        if scheduler_func is None or scheduler_options is None:
            raise ValueError("scheduler_func and scheduler_options cannot be None for hyperparameter tuning")
        num_threads = scheduler_options['resource'].get('num_cpus', -1)
        params_copy['num_threads'] = num_threads
        # num_gpus = scheduler_options['resource']['num_gpus'] # TODO: unused

        dataset_train, dataset_val = self.generate_datasets(X_train=X_train, y_train=y_train, params=params_copy, X_val=X_val, y_val=y_val)
        dataset_train_filename = "dataset_train.bin"
        train_file = self.path + dataset_train_filename
        if os.path.exists(train_file):  # clean up old files first
            os.remove(train_file)
        dataset_train.save_binary(train_file)
        dataset_val_filename = "dataset_val.bin"  # names without directory info
        val_file = self.path + dataset_val_filename
        if os.path.exists(val_file):  # clean up old files first
            os.remove(val_file)
        dataset_val.save_binary(val_file)
        dataset_val_pkl_filename = 'dataset_val.pkl'
        val_pkl_path = directory + dataset_val_pkl_filename
        save_pkl.save(path=val_pkl_path, object=(X_val, y_val))

        if not np.any([isinstance(params_copy[hyperparam], Space) for hyperparam in params_copy]):
            logger.warning("Attempting to do hyperparameter optimization without any search space (all hyperparameters are already fixed values)")
        else:
            logger.log(15, "Hyperparameter search space for Gradient Boosting Model: ")
            for hyperparam in params_copy:
                if isinstance(params_copy[hyperparam], Space):
                    logger.log(15, f'{hyperparam}:   {params_copy[hyperparam]}')

        util_args = dict(
            dataset_train_filename=dataset_train_filename,
            dataset_val_filename=dataset_val_filename,
            dataset_val_pkl_filename=dataset_val_pkl_filename,
            directory=directory,
            model=self,
            time_start=time_start,
            time_limit=scheduler_options['time_out']
        )
        lgb_trial.register_args(util_args=util_args, **params_copy)
        scheduler = scheduler_func(lgb_trial, **scheduler_options)
        if ('dist_ip_addrs' in scheduler_options) and (len(scheduler_options['dist_ip_addrs']) > 0):
            # This is multi-machine setting, so need to copy dataset to workers:
            logger.log(15, "Uploading data to remote workers...")
            scheduler.upload_files([train_file, val_file, val_pkl_path])  # TODO: currently does not work.
            directory = self.path  # TODO: need to change to path to working directory used on every remote machine
            lgb_trial.update(directory=directory)
            logger.log(15, "uploaded")

        scheduler.run()
        scheduler.join_jobs()

        return self._get_hpo_results(scheduler=scheduler, scheduler_options=scheduler_options, time_start=time_start)