Example #1
    def fit(self, to_fit: Tuple[ExpressionPartitions, Dict], y=None):
        partitions, parameters = to_fit
        cat = partitions.categorical_index if partitions.features.has_categorical else "auto"
        lgb_train = lgb.Dataset(partitions.X,
                                partitions.Y,
                                categorical_feature=cat,
                                free_raw_data=False)

        num_boost_round = self.num_boost_round(parameters)
        iterations = parameters.get("num_boost_round") if parameters.get(
            "num_iterations") is None else parameters.get("num_iterations")
        stopping_callback = lgb.early_stopping(self.early_stopping_rounds)
        eval_hist = lgb.cv(
            parameters,
            lgb_train,
            folds=partitions.folds,
            metrics=["mae", "mse", "huber"],
            categorical_feature=cat,
            show_stdv=True,
            verbose_eval=num_boost_round,
            seed=partitions.seed,
            num_boost_round=num_boost_round,
            #early_stopping_rounds=self.early_stopping_rounds,
            callbacks=[stopping_callback])
        self.evaluation = ResultsCV(parameters, eval_hist)
        return self
Example #2
    def _train(
        self,
        params: Dict[str, Any],
        lgb_train: lgb.Dataset,
        eval_sets: List[lgb.Dataset],
        eval_names: List[str],
    ) -> lgb.Booster:
        """Trains a LightGBM model.

        Args:
            params: parameters for LightGBM
            lgb_train: LightGBM dataset for training
            eval_sets: LightGBM datasets for evaluation
            eval_names: names of the evaluation datasets

        Returns:
            LightGBM Booster model
        """
        gbm = lgb.train(
            params,
            lgb_train,
            num_boost_round=self.num_boost_round,
            valid_sets=eval_sets,
            valid_names=eval_names,
            feature_name=list(self.model.input_features.keys()),
            # NOTE: hummingbird does not support categorical features
            # categorical_feature=categorical_features,
            callbacks=[
                lgb.early_stopping(stopping_rounds=self.early_stop),
                lgb.log_evaluation(),
            ],
        )

        return gbm
Example #3
 def fit(
     self,
     dataset: DatasetH,
     num_boost_round=1000,
     early_stopping_rounds=50,
     verbose_eval=20,
     evals_result=None,
 ):
     if evals_result is None:
         evals_result = dict()
     dtrain, dvalid = self._prepare_data(dataset)
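     # these callbacks replace the deprecated early_stopping_rounds, verbose_eval
     # and evals_result keyword arguments of lgb.train()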
     early_stopping_callback = lgb.early_stopping(early_stopping_rounds)
     verbose_eval_callback = lgb.log_evaluation(period=verbose_eval)
     evals_result_callback = lgb.record_evaluation(evals_result)
     self.model = lgb.train(
         self.params,
         dtrain,
         num_boost_round=num_boost_round,
         valid_sets=[dtrain, dvalid],
         valid_names=["train", "valid"],
         callbacks=[
             early_stopping_callback, verbose_eval_callback,
             evals_result_callback
         ],
     )
     evals_result["train"] = list(evals_result["train"].values())[0]
     evals_result["valid"] = list(evals_result["valid"].values())[0]
Example #4
def test_early_stopping_callback_is_picklable(serializer):
    rounds = 5
    callback = lgb.early_stopping(stopping_rounds=rounds)
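    # the callback object keeps its stopping_rounds and ordering attributes,
    # which is what the pickle round-trip below verifies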
    callback_from_disk = pickle_and_unpickle_object(obj=callback,
                                                    serializer=serializer)
    assert callback_from_disk.order == 30
    assert callback_from_disk.before_iteration is False
    assert callback.stopping_rounds == callback_from_disk.stopping_rounds
    assert callback.stopping_rounds == rounds
Example #5
def run_gbt(x_train, y_train, x_test, y_test, feature_names, binarize=False):
    param = dict(device_type='cpu',
                 boosting='gbdt',
                 nthread=8,
                 objective='regression',
                 metric='rmse',
                 lambda_l1=1,
                 lambda_l2=1,
                 learning_rate=.01,
                 tree_learner='serial',
                 max_bin=63,
                 num_leaves=10,
                 max_depth=10,
                 feature_fraction=.5,
                 min_data_in_leaf=1,
                 min_gain_to_split=1,
                 verbose=-1)
    model_name = 'gbt'

    if binarize:
        param['objective'] = 'binary'
        param['metric'] = 'auc'
        y_train, y_test = convert_to_binary(y_train, y_test)
        model_name = 'gbt_binary'

    train_data = lgb.Dataset(x_train,
                             label=y_train,
                             feature_name=feature_names)
    validation_data = lgb.Dataset(x_test,
                                  label=y_test,
                                  feature_name=feature_names)
    num_round = 1000
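    # verbose=0 suppresses the early-stopping callback's log output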
    bst = lgb.train(
        param,
        train_data,
        num_round,
        valid_sets=validation_data,
        callbacks=[lgb.early_stopping(stopping_rounds=100, verbose=0)])
    table = bst.feature_importance()
    feats = pd.Series(table, index=feature_names)
    selected_feat = feats[feats > 0].index.values

    preds = bst.predict(x_test, num_iteration=bst.best_iteration)
    error, r2, pearson = score_all(y_test, preds)
    auc = np.nan
    if binarize:
        auc = metrics.average_precision_score(y_test, preds)
    return {
        'mse': error,
        'r2': r2,
        'pearsonr': pearsonr(y_test, preds)[0],
        'model': model_name,
        'feature_names': selected_feat,
        'auc': auc
    }
Example #6
 def fit(self, data):
     params = {
         'boosting_type': 'gbdt',
         'verbosity': 0}
     if data.tasktype == 'regression':
         params['objective'] = 'regression'
     else:
         if len(data.Xy_test[1].shape) > 1:
             params['objective'] = 'multiclass'
         else:
             params['objective'] = 'binary'
     if data.kfold > 1:
         cv_eval = {}
         for k, cv_fold in enumerate(data.Xy_train.keys()):
             [(X_train, y_train), (X_val, y_val)] = data.Xy_train[cv_fold]
             lgb_train = lgb.Dataset(X_train, y_train)
             lgb_eval = lgb.Dataset(X_val, y_val)
             gbm = lgb.train(params,
                             lgb_train, 
                             valid_sets=lgb_eval,
                             callbacks=[lgb.early_stopping(stopping_rounds=5)])
             eval_metrics = weareval.eval_output(gbm.predict(X_val, num_iteration=gbm.best_iteration), y_val, tasktype=data.tasktype)
             cv_eval[cv_fold] = {'model': gbm, 
                                 # 'data': [(X_train, y_train), (X_val, y_val)], # store just IDs?
                                 'metric': eval_metrics['mae'] if data.tasktype=='regression' else eval_metrics['balanced_acc_adj'],
                                 'metrics': eval_metrics}
         # retain only best model
         tmp = {cv_fold:cv_eval[cv_fold]['metric'] for cv_fold in cv_eval.keys()}
         bst_fold = min(tmp, key=tmp.get) if data.tasktype=='regression' else max(tmp, key=tmp.get)
         self.gbm = cv_eval[bst_fold]['model']
         return {'model': self.gbm, 'metrics': cv_eval[bst_fold]['metrics']}
     else:
         X_train, y_train = data.Xy_train
         X_val, y_val = data.Xy_val
         lgb_train = lgb.Dataset(X_train, y_train)
         lgb_eval = lgb.Dataset(X_val, y_val)
         self.gbm = lgb.train(params,
                         lgb_train, 
                         valid_sets=lgb_eval,
                         callbacks=[lgb.early_stopping(stopping_rounds=5)])
         eval_metrics = weareval.eval_output(self.gbm.predict(X_val, num_iteration=self.gbm.best_iteration), y_val, tasktype=data.tasktype)
         return {'model': self.gbm, 'metrics': eval_metrics}
Example #7
def test_lgb_autolog_logs_metrics_with_early_stopping(bst_params, train_set):
    mlflow.lightgbm.autolog()
    evals_result = {}
    params = {"metric": ["multi_error", "multi_logloss"]}
    params.update(bst_params)
    valid_sets = [train_set, lgb.Dataset(train_set.data)]
    valid_names = ["train", "valid"]
    if Version(lgb.__version__) <= Version("3.3.1"):
        model = lgb.train(
            params,
            train_set,
            num_boost_round=10,
            early_stopping_rounds=5,
            valid_sets=valid_sets,
            valid_names=valid_names,
            evals_result=evals_result,
        )
    else:
        model = lgb.train(
            params,
            train_set,
            num_boost_round=10,
            valid_sets=valid_sets,
            valid_names=valid_names,
            callbacks=[
                lgb.record_evaluation(evals_result),
                lgb.early_stopping(5),
            ],
        )
    run = get_latest_run()
    data = run.data
    client = mlflow.tracking.MlflowClient()
    assert "best_iteration" in data.metrics
    assert int(data.metrics["best_iteration"]) == model.best_iteration
    assert "stopped_iteration" in data.metrics
    assert int(data.metrics["stopped_iteration"]) == len(
        evals_result["train"]["multi_logloss"])

    for valid_name in valid_names:
        for metric_name in params["metric"]:
            metric_key = "{}-{}".format(valid_name, metric_name)
            metric_history = [
                x.value
                for x in client.get_metric_history(run.info.run_id, metric_key)
            ]
            assert metric_key in data.metrics

            best_metrics = evals_result[valid_name][metric_name][
                model.best_iteration - 1]
            assert metric_history == evals_result[valid_name][metric_name] + [
                best_metrics
            ]
Example #8
def train_model(X_train: pd.DataFrame, y_train: pd.DataFrame,
                X_test: pd.DataFrame, y_test: pd.DataFrame,
                parameters: Dict[str, Any]) -> Any:
    if parameters["model"]["name"] == "lightgbm":
        params = {
            "learning_rate": parameters["model"]["learning_rate"],
            "n_estimators": parameters["epochs"],
        }
        model = lgb.LGBMClassifier(**params)
        model.fit(
            X_train,
            y_train,
            eval_set=(X_test, y_test),
            eval_metric=["softmax"],
            callbacks=[lgb.early_stopping(10)],
        )
    else:
        raise NotImplementedError
    return model
Example #9
 def fit(
     self,
     dataset: DatasetH,
     num_boost_round=None,
     early_stopping_rounds=None,
     verbose_eval=20,
     evals_result=None,
     reweighter=None,
     **kwargs,
 ):
     if evals_result is None:
         evals_result = {}  # in case of unsafety of Python default values
     ds_l = self._prepare_data(dataset, reweighter)
     ds, names = list(zip(*ds_l))
     early_stopping_callback = lgb.early_stopping(
         self.early_stopping_rounds
         if early_stopping_rounds is None else early_stopping_rounds)
     # NOTE: if you encounter error here. Please upgrade your lightgbm
     verbose_eval_callback = lgb.log_evaluation(period=verbose_eval)
     evals_result_callback = lgb.record_evaluation(evals_result)
     self.model = lgb.train(
         self.params,
         ds[0],  # training dataset
         num_boost_round=self.num_boost_round
         if num_boost_round is None else num_boost_round,
         valid_sets=ds,
         valid_names=names,
         callbacks=[
             early_stopping_callback, verbose_eval_callback,
             evals_result_callback
         ],
         **kwargs,
     )
     for k in names:
         for key, val in evals_result[k].items():
             name = f"{key}.{k}"
             for epoch, m in enumerate(val):
                 R.log_metrics(**{name.replace("@", "_"): m}, step=epoch)
Example #10
    def _train(
        self,
        params: Dict[str, Any],
        lgb_train: "RayDMatrix",  # noqa: F821
        eval_sets: List["RayDMatrix"],  # noqa: F821
        eval_names: List[str],
    ) -> lgb.Booster:
        """Trains a LightGBM model using ray.

        Args:
            params: parameters for LightGBM
            lgb_train: RayDMatrix dataset for training
            eval_sets: RayDMatrix datasets for evaluation
            eval_names: names of the evaluation datasets

        Returns:
            LightGBM Booster model
        """
        from lightgbm_ray import train as lgb_ray_train

        gbm = lgb_ray_train(
            params,
            lgb_train,
            num_boost_round=self.num_boost_round,
            valid_sets=eval_sets,
            valid_names=eval_names,
            feature_name=list(self.model.input_features.keys()),
            # NOTE: hummingbird does not support categorical features
            # categorical_feature=categorical_features,
            callbacks=[
                lgb.early_stopping(stopping_rounds=self.early_stop),
                log_eval_distributed(10),
            ],
            ray_params=_map_to_lgb_ray_params(self.trainer_kwargs),
        )

        return gbm.booster_
Example #11
    def regression_model(self,
                         X_train,
                         X_test,
                         y_train,
                         y_test,
                         parameters: Dict,
                         categorical=None,
                         num_boost_round: int = 250,
                         seed: int = None) -> Booster:
        '''
        trains a regression model
        :param X_train:
        :param X_test:
        :param y_train:
        :param y_test:
        :param categorical:
        :param parameters:
        :return:
        '''
        cat = categorical if (categorical
                              is not None) and len(categorical) > 0 else "auto"
        lgb_train = lgb.Dataset(X_train, y_train, categorical_feature=cat)
        lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)
        evals_result = {}

        stopping_callback = lgb.early_stopping(self.early_stopping_rounds)
        if seed is not None:
            parameters["seed"] = seed
        gbm = lgb.train(parameters,
                        lgb_train,
                        num_boost_round=num_boost_round,
                        valid_sets=lgb_eval,
                        callbacks=[stopping_callback,
                                   lgb.record_evaluation(evals_result),
                                   lgb.log_evaluation(period=num_boost_round)])
        return gbm, BasicMetrics.parse_eval(evals_result)
Example #12
params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': ['rmse', 'l2', 'l1', 'huber'],
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbosity': -1
}
wandb.config.update(params)

# train
# add lightgbm callback
gbm = lgb.train(
    params,
    lgb_train,
    num_boost_round=20,
    valid_sets=lgb_eval,
    valid_names=['validation'],
    callbacks=[wandb_callback(),
               lgb.early_stopping(stopping_rounds=5)])

# predict
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)
# eval
print('The rmse of prediction is:', mean_squared_error(y_test, y_pred)**0.5)

# log feature importance and model checkpoint
log_summary(gbm, save_model_checkpoint=True)
Example #13
params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': {'l2', 'l1'},
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0
}

print('Starting training...')
# train
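# early stopping requires at least one validation set in valid_sets;
# the best round is available afterwards as gbm.best_iteration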
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=20,
                valid_sets=lgb_eval,
                callbacks=[lgb.early_stopping(stopping_rounds=5)])

print('Saving model...')
# save model to file
gbm.save_model('model.txt')

print('Starting predicting...')
# predict
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)
# eval
rmse_test = mean_squared_error(y_test, y_pred) ** 0.5
print(f'The RMSE of prediction is: {rmse_test}')
Example #14
def test_register_logger(tmp_path):
    logger = logging.getLogger("LightGBM")
    logger.setLevel(logging.DEBUG)
    formatter = logging.Formatter('%(levelname)s | %(message)s')
    log_filename = tmp_path / "LightGBM_test_logger.log"
    file_handler = logging.FileHandler(log_filename,
                                       mode="w",
                                       encoding="utf-8")
    file_handler.setLevel(logging.DEBUG)
    file_handler.setFormatter(formatter)
    logger.addHandler(file_handler)

    def dummy_metric(_, __):
        logger.debug('In dummy_metric')
        return 'dummy_metric', 1, True

    lgb.register_logger(logger)

    X = np.array([[1, 2, 3], [1, 2, 4], [1, 2, 4], [1, 2, 3]],
                 dtype=np.float32)
    y = np.array([0, 1, 1, 0])
    lgb_data = lgb.Dataset(X, y)

    eval_records = {}
    callbacks = [
        lgb.record_evaluation(eval_records),
        lgb.log_evaluation(2),
        lgb.early_stopping(4)
    ]
    lgb.train({
        'objective': 'binary',
        'metric': ['auc', 'binary_error']
    },
              lgb_data,
              num_boost_round=10,
              feval=dummy_metric,
              valid_sets=[lgb_data],
              categorical_feature=[1],
              callbacks=callbacks)

    lgb.plot_metric(eval_records)

    expected_log = r"""
INFO | [LightGBM] [Warning] There are no meaningful features, as all feature values are constant.
INFO | [LightGBM] [Info] Number of positive: 2, number of negative: 2
INFO | [LightGBM] [Info] Total Bins 0
INFO | [LightGBM] [Info] Number of data points in the train set: 4, number of used features: 0
INFO | [LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
INFO | [LightGBM] [Warning] Stopped training because there are no more leaves that meet the split requirements
DEBUG | In dummy_metric
INFO | Training until validation scores don't improve for 4 rounds
INFO | [LightGBM] [Warning] Stopped training because there are no more leaves that meet the split requirements
DEBUG | In dummy_metric
INFO | [2]	training's auc: 0.5	training's binary_error: 0.5	training's dummy_metric: 1
INFO | [LightGBM] [Warning] Stopped training because there are no more leaves that meet the split requirements
DEBUG | In dummy_metric
INFO | [LightGBM] [Warning] Stopped training because there are no more leaves that meet the split requirements
DEBUG | In dummy_metric
INFO | [4]	training's auc: 0.5	training's binary_error: 0.5	training's dummy_metric: 1
INFO | [LightGBM] [Warning] Stopped training because there are no more leaves that meet the split requirements
DEBUG | In dummy_metric
INFO | [LightGBM] [Warning] Stopped training because there are no more leaves that meet the split requirements
DEBUG | In dummy_metric
INFO | [6]	training's auc: 0.5	training's binary_error: 0.5	training's dummy_metric: 1
INFO | [LightGBM] [Warning] Stopped training because there are no more leaves that meet the split requirements
DEBUG | In dummy_metric
INFO | [LightGBM] [Warning] Stopped training because there are no more leaves that meet the split requirements
DEBUG | In dummy_metric
INFO | [8]	training's auc: 0.5	training's binary_error: 0.5	training's dummy_metric: 1
INFO | [LightGBM] [Warning] Stopped training because there are no more leaves that meet the split requirements
DEBUG | In dummy_metric
INFO | [LightGBM] [Warning] Stopped training because there are no more leaves that meet the split requirements
DEBUG | In dummy_metric
INFO | [10]	training's auc: 0.5	training's binary_error: 0.5	training's dummy_metric: 1
INFO | Did not meet early stopping. Best iteration is:
[1]	training's auc: 0.5	training's binary_error: 0.5	training's dummy_metric: 1
WARNING | More than one metric available, picking one to plot.
""".strip()

    gpu_lines = [
        "INFO | [LightGBM] [Info] This is the GPU trainer",
        "INFO | [LightGBM] [Info] Using GPU Device:",
        "INFO | [LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...",
        "INFO | [LightGBM] [Info] GPU programs have been built",
        "INFO | [LightGBM] [Warning] GPU acceleration is disabled because no non-trivial dense features can be found",
        "INFO | [LightGBM] [Warning] Using sparse features with CUDA is currently not supported.",
        "INFO | [LightGBM] [Warning] CUDA currently requires double precision calculations.",
        "INFO | [LightGBM] [Info] LightGBM using CUDA trainer with DP float!!"
    ]
    with open(log_filename, "rt", encoding="utf-8") as f:
        actual_log = f.read().strip()
        actual_log_wo_gpu_stuff = []
        for line in actual_log.split("\n"):
            if not any(line.startswith(gpu_line) for gpu_line in gpu_lines):
                actual_log_wo_gpu_stuff.append(line)

    assert "\n".join(actual_log_wo_gpu_stuff) == expected_log
Example #15
        def objective(trial):

            hyper_params = {
                "boosting": "gbdt",
                "eta": trial.suggest_loguniform("eta", params["eta"][0],
                                                params["eta"][1]),
                "max_depth": trial.suggest_int("max_depth",
                                               params["max_depth"][0],
                                               params["max_depth"][1]),
                "num_leaves": trial.suggest_int("num_leaves",
                                                params["num_leaves"][0],
                                                params["num_leaves"][1],
                                                step=16),
                "min_data_in_leaf": trial.suggest_int("min_data_in_leaf",
                                                      params["min_data_in_leaf"][0],
                                                      params["min_data_in_leaf"][1],
                                                      step=100),
                "lambda_l1": trial.suggest_int("lambda_l1",
                                               params["lambda_l1"][0],
                                               params["lambda_l1"][1],
                                               step=1),
                "lambda_l2": trial.suggest_int("lambda_l2",
                                               params["lambda_l2"][0],
                                               params["lambda_l2"][1],
                                               step=1),
                "min_gain_to_split": trial.suggest_loguniform(
                    "min_gain_to_split",
                    params["min_gain_to_split"][0],
                    params["min_gain_to_split"][1]),
                "min_sum_hessian_in_leaf": trial.suggest_int(
                    "min_sum_hessian_in_leaf",
                    params["min_sum_hessian_in_leaf"][0],
                    params["min_sum_hessian_in_leaf"][1]),
                "subsample": trial.suggest_float("subsample",
                                                 params["subsample"][0],
                                                 params["subsample"][1]),
                "feature_fraction": trial.suggest_float(
                    "feature_fraction",
                    params["feature_fraction"][0],
                    params["feature_fraction"][1]),
            }

            # Add pruning and early stopping
            pruning_callback = LightGBMPruningCallback(trial,
                                                       "NegLogLikelihood")
            early_stopping_callback = lgb.early_stopping(
                stopping_rounds=early_stopping_rounds, verbose=False)

            lgblss_param_tuning = lightgbmlss.cv(
                hyper_params,
                dtrain,
                dist,
                num_boost_round=num_boost_round,
                nfold=nfold,
                callbacks=[pruning_callback, early_stopping_callback])

            # Add opt_rounds as a trial attribute, accessible via study.trials_dataframe(). # https://github.com/optuna/optuna/issues/1169
            opt_rounds = np.argmin(
                np.array(lgblss_param_tuning["NegLogLikelihood-mean"])) + 1
            trial.set_user_attr("opt_round", int(opt_rounds))

            # Extract the best score
            best_score = np.min(
                np.array(lgblss_param_tuning["NegLogLikelihood-mean"]))

            return best_score
Example #16
lgb_params = dict(
    learning_rate=0.05,
    n_estimators=500,
)
model = lgb.LGBMClassifier(**lgb_params)

def mlflow_callback():
    def callback(env):
        for name, loss_name, loss_value, _ in env.evaluation_result_list:
            mlflow.log_metric(key=loss_name, value=loss_value, step=env.iteration)
    return callback

mlflow.set_tracking_uri(os.environ["MLFLOW_HOST"])
mlflow.set_experiment("MLMAN-1")
with mlflow.start_run():
    mlflow.log_params({**params, **lgb_params})
    model.fit(
        X_train,
        y_train,
        eval_set=(X_test, y_test),
        eval_metric=["softmax"],
        callbacks=[
            lgb.early_stopping(10),
            mlflow_callback(),
        ])

    # Log an artifact (output file)
    with open("output.txt", "w") as f:
        f.write("Hello world!")
    mlflow.log_artifact("output.txt")
Example #17
def run_lgb(sources, drug_name):
    if not isinstance(sources, str):
        out_name = '_'.join(sorted(sources))
    else:
        out_name = sources
    df_subset = data.get_trainable_data(sources, drug_name)

    cols = list(set(df_subset.columns.values))
    cols.remove(drug_name)

    features = df_subset[cols].copy()
    target = df_subset[drug_name].values.reshape(-1, 1).ravel()

    n_features_before = features.shape[1]

    features = features.loc[:, features.mean() > 0]
    features = features.loc[:, features.std() > 0]
    feature_names = list(set(features.columns.values))
    n_features = features.shape[1]
    print(f"Using {n_features} out of {n_features_before}"
          f" ({n_features_before - n_features} removed)")
    if features.shape[0] < 100:
        return {}

    X_train, X_test, y_train, y_test = train_test_split(
        features,
        target,
        test_size=0.2,
        shuffle=True,
        random_state=101,
    )

    scaler = preprocessing.StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    train_data = lgb.Dataset(X_train, label=y_train,
                             feature_name=feature_names)
    validation_data = lgb.Dataset(X_test, label=y_test,
                                  feature_name=feature_names, )

    param = dict(
        device_type='gpu',
        boosting='gbdt',
        nthread=1,
        objective='regression',
        metric='rmse',
        #         lambda_l1=.5,
        #         lambda_l2=.5,
        learning_rate=.01,
        tree_learner='serial',
        max_bin=63,
        num_leaves=6,
        max_depth=6,
        feature_fraction=.5,
        min_data_in_leaf=1,
        min_gain_to_split=1,
        verbose=-1

    )

    num_round = 1000
    bst = lgb.train(
        param,
        train_data,
        num_round,
        valid_sets=validation_data,
        callbacks=[lgb.early_stopping(stopping_rounds=100)]
    )
    # bst.save_model('model.txt', num_iteration=bst.best_iteration)
    #     lgb.plot_importance(bst, figsize =(4, 8))
    #     plt.show()

    # t_preds = bst.predict(X_train, num_iteration=bst.best_iteration)
    preds = bst.predict(X_test, num_iteration=bst.best_iteration)
    error = np.sqrt(mean_squared_error(y_test, preds))
    r2 = r2_score(y_test, preds)
    print(f"MSE: {error:0.3f} | $R^2$ {r2}")
    return {
        'data_sets': out_name,
        'drug_name': drug_name,
        'mse': error,
        'r2': r2
    }
Example #18
df_train = pd.read_csv(str(regression_example_dir / 'regression.train'), header=None, sep='\t')
df_test = pd.read_csv(str(regression_example_dir / 'regression.test'), header=None, sep='\t')

y_train = df_train[0]
y_test = df_test[0]
X_train = df_train.drop(0, axis=1)
X_test = df_test.drop(0, axis=1)

print('Starting training...')
# train
gbm = lgb.LGBMRegressor(num_leaves=31,
                        learning_rate=0.05,
                        n_estimators=20)
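# with the sklearn API, early stopping is configured via callbacks and the
# best round is exposed as gbm.best_iteration_ after fitting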
gbm.fit(X_train, y_train,
        eval_set=[(X_test, y_test)],
        eval_metric='l1',
        callbacks=[lgb.early_stopping(5)])

print('Starting predicting...')
# predict
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration_)
# eval
rmse_test = mean_squared_error(y_test, y_pred) ** 0.5
print(f'The RMSE of prediction is: {rmse_test}')

# feature importances
print(f'Feature importances: {list(gbm.feature_importances_)}')


# self-defined eval metric
# f(y_true: array, y_pred: array) -> name: str, eval_result: float, is_higher_better: bool
# Root Mean Squared Logarithmic Error (RMSLE)