def get_importances_from_model(X, y, features=None, verbose=50, early_stopping_rounds=200):
    """Train a binary LightGBM booster on a hold-out split of ``X`` / ``y``.

    The data is split with ``train_test_split`` (``random_state=2017``) and the
    booster is trained with both the training and the validation part as
    monitored sets.

    Args:
        X: Feature table. Must expose ``.columns`` when ``features`` is None.
        y: Target aligned with ``X`` (the objective is ``'binary'``).
        features: Feature names handed to LightGBM. Defaults to
            ``X.columns.tolist()``.
        verbose: Forwarded to ``train`` as ``verbose_eval``.
        early_stopping_rounds: Forwarded to ``train``.

    Returns:
        The booster returned by ``train`` (presumably the caller reads the
        feature importances from it -- confirm against the call sites).
    """
    lgb_params = {
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'learning_rate': 0.03,
        'metric': 'auc',
        'num_iterations': 10000,
        'colsample_bytree': 0.5,
        'subsample': 0.8,
        'reg_alpha': 0.3,
        'reg_lambda': 0.3,
        'max_depth': 8,
    }

    # Identity check: ``== None`` is evaluated element-wise on array-like
    # feature lists and then cannot be used as a condition.
    if features is None:
        features = X.columns.tolist()

    train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=2017)
    lgb_train = Dataset(data=train_X, label=train_y, feature_name=features)
    lgb_val = Dataset(data=val_X, label=val_y, feature_name=features)

    return train(
        params=lgb_params,
        train_set=lgb_train,
        valid_sets=[lgb_train, lgb_val],
        valid_names=["train", "validation"],
        verbose_eval=verbose,
        early_stopping_rounds=early_stopping_rounds,
    )
def _evaluate(self, scores: np.ndarray, clases: lgb.Dataset) -> Tuple[str, int, bool]:
    """Evaluate ``scores`` with the gain function at the configured cut-off.

    Reads labels and weights from ``clases`` and ``self.prob_corte``, then
    delegates to ``self._evaluar_funcion_ganancia``.

    Returns:
        ``(name, value, True)`` where name and value come from the gain
        function; the trailing ``True`` is always returned.
    """
    gain_args = (
        scores,
        clases.get_label(),
        clases.get_weight(),
        self.prob_corte,
    )
    metric_name, metric_value = self._evaluar_funcion_ganancia(*gain_args)
    return metric_name, metric_value, True
def fit_lgb(x_tr, y_tr, x_va, y_va, cat_feats, args):
    """Fit a LightGBM regressor and report clipped RMSE scores.

    Args:
        x_tr, y_tr: Training features and target.
        x_va, y_va: Validation features and target; only used when
            ``args.mode`` is neither ``'full'`` nor ``'fold'``.
        cat_feats: Passed to ``lgb.train`` as ``categorical_feature``.
        args: Options object; reads ``clip_target``, ``mode``, ``num_leaves``,
            ``verbose_eval``, ``num_boost_round`` and ``lr_decay``.

    Returns:
        ``(model, train_rmse, valid_rmse)``; ``valid_rmse`` is ``0.`` when no
        validation set is used.
    """
    from lightgbm import Dataset

    # -1 is the "do not clip" sentinel for the training target.
    if args.clip_target != -1:
        y_tr = y_tr.clip(upper=args.clip_target)

    # 'full' and 'fold' modes train without a hold-out set.
    has_holdout = args.mode not in ['full', 'fold']

    tr_ds = Dataset(x_tr, label=y_tr, free_raw_data=False)
    watch_list = [tr_ds]
    if has_holdout:
        va_ds = Dataset(x_va, label=y_va, free_raw_data=False)
        watch_list.append(va_ds)

    params = {
        'learning_rate': 0.02,
        'max_depth': -1,
        'boosting': 'gbdt',
        'objective': 'regression',
        'metric': 'rmse',
        'is_training_metric': True,
        'num_leaves': args.num_leaves,
        'feature_fraction': 0.9,
        'bagging_fraction': 0.7,
        'lambda_l2': 0.7,
        'bagging_freq': 5,
        'seed': 42
    }

    train_kwargs = dict(
        train_set=tr_ds,
        categorical_feature=cat_feats,
        verbose_eval=args.verbose_eval,
        num_boost_round=args.num_boost_round,
    )
    if has_holdout:
        train_kwargs['early_stopping_rounds'] = 200
    train_kwargs['valid_sets'] = watch_list
    if args.lr_decay:
        train_kwargs['callbacks'] = [
            lgb.reset_parameter(
                learning_rate=learning_rate_010_decay_power_0995)
        ]

    m = lgb.train(params, **train_kwargs)

    def clipped_rmse(ds):
        # Predictions are clipped to [0, 361] before scoring.
        clipped = np.clip(m.predict(ds.data), 0, 361)
        return np.sqrt(mean_squared_error(clipped, ds.label))

    tr_score = clipped_rmse(tr_ds)
    va_score = clipped_rmse(va_ds) if has_holdout else 0.
    return m, tr_score, va_score
def Dist_Objective(predt: np.ndarray, data: lgb.Dataset):
    """Custom objective training each Gaussian parameter via autograd.

    Gradients and hessians of the negative log-likelihood are obtained with
    ``auto_grad`` for the location and the scale parameter, multiplied by the
    observation weights and passed through ``stabilize_derivative``.

    Args:
        predt: Flat raw predictions; reshaped column-major to
            ``(n_obs, Gaussian.n_dist_param())``.
        data: Dataset providing labels and optional weights.

    Returns:
        ``(grad, hess)`` flattened column-major.
    """
    target = torch.tensor(data.get_label())

    # When num_class != 0, preds has shape (n_obs, n_dist_param). Each element
    # in a row is a raw prediction (leaf weight) that has not gone through the
    # response function yet.
    predt = predt.reshape(-1, Gaussian.n_dist_param(), order="F")
    preds_location = Gaussian.param_dict()["location"](predt[:, 0])
    preds_location = torch.tensor(preds_location, requires_grad=True)
    preds_scale = Gaussian.param_dict()["scale"](predt[:, 1])
    preds_scale = torch.tensor(preds_scale, requires_grad=True)

    # Weights: fetch once and test identity. ``== None`` on an array of
    # weights is element-wise and cannot be used as a condition.
    weights = data.get_weight()
    if weights is None:
        # Use 1 as weight if no weights are specified
        weights = np.ones_like(target, dtype=float)

    # Initialize gradient and hessian matrices
    n_obs = len(target)
    grad = np.zeros(shape=(n_obs, Gaussian.n_dist_param()))
    hess = np.zeros(shape=(n_obs, Gaussian.n_dist_param()))

    # Metric for automatic differentiation
    dGaussian = Normal(preds_location, preds_scale)
    autograd_metric = -dGaussian.log_prob(target).nansum()

    # Location
    grad[:, 0] = stabilize_derivative(
        auto_grad(metric=autograd_metric, parameter=preds_location, n=1) * weights,
        Gaussian.stabilize)
    hess[:, 0] = stabilize_derivative(
        auto_grad(metric=autograd_metric, parameter=preds_location, n=2) * weights,
        Gaussian.stabilize)

    # Scale
    grad[:, 1] = stabilize_derivative(
        auto_grad(metric=autograd_metric, parameter=preds_scale, n=1) * weights,
        Gaussian.stabilize)
    hess[:, 1] = stabilize_derivative(
        auto_grad(metric=autograd_metric, parameter=preds_scale, n=2) * weights,
        Gaussian.stabilize)

    # Flatten back to the column-major layout the predictions arrived in
    grad = grad.ravel(order="F")
    hess = hess.ravel(order="F")
    return grad, hess
def _make_validation_labels_purchase_only(valid_ds: lgb.Dataset):
    """Zero out every validation label that is not the purchase label.

    Constructs ``valid_ds``, logs how many labels are non-purchase (in total,
    and excluding ``_NOTHING_LABEL``), then writes the modified label array
    back with ``set_label``.
    """
    valid_ds.construct()
    labels = np.array(valid_ds.get_label())

    is_nothing = labels == _NOTHING_LABEL
    non_purchase = labels != _PURCHASE_LABEL
    # Non-purchase rows that still carry some other (non-"nothing") label.
    non_purchase_interaction = non_purchase & ~is_nothing

    logging.info(
        f"Number of non-purchase interactions in valid: {non_purchase_interaction.sum()}"
    )
    logging.info(
        f"Number of total non-purchases in valid: {non_purchase.sum()}")

    labels[non_purchase] = 0.0
    valid_ds.set_label(labels)
def Dist_Objective(predt: np.ndarray, data: lgb.Dataset):
    """Custom objective training each Gaussian parameter with the gradient
    and hessian functions provided by ``Gaussian``.

    Args:
        predt: Flat raw predictions; reshaped column-major to
            ``(n_obs, Gaussian.n_dist_param())``.
        data: Dataset providing labels and optional weights.

    Returns:
        ``(grad, hess)`` flattened column-major.
    """
    target = data.get_label()

    # When num_class != 0, preds has shape (n_obs, n_dist_param). Each element
    # in a row is a raw prediction (leaf weight) that has not gone through the
    # response function yet.
    predt = predt.reshape(-1, Gaussian.n_dist_param(), order="F")
    preds_location = Gaussian.param_dict()["location"](predt[:, 0])
    preds_scale = Gaussian.param_dict()["scale"](predt[:, 1])

    # Weights: fetch once and test identity. ``== None`` on an array of
    # weights is element-wise and cannot be used as a condition.
    weights = data.get_weight()
    if weights is None:
        # Use 1 as weight if no weights are specified
        weights = np.ones_like(target, dtype=float)

    # Initialize gradient and hessian matrices
    n_obs = len(target)
    grad = np.zeros(shape=(n_obs, Gaussian.n_dist_param()))
    hess = np.zeros(shape=(n_obs, Gaussian.n_dist_param()))

    # Location
    grad[:, 0] = Gaussian.gradient_location(y=target,
                                            location=preds_location,
                                            scale=preds_scale,
                                            weights=weights)
    hess[:, 0] = Gaussian.hessian_location(scale=preds_scale, weights=weights)

    # Scale
    grad[:, 1] = Gaussian.gradient_scale(y=target,
                                         location=preds_location,
                                         scale=preds_scale,
                                         weights=weights)
    hess[:, 1] = Gaussian.hessian_scale(scale=preds_scale, weights=weights)

    # Flatten back to the column-major layout the predictions arrived in
    grad = grad.ravel(order="F")
    hess = hess.ravel(order="F")
    return grad, hess
def lightgbm_trainer(training_data, label, model_params):
    """Train a LightGBM model on a labelled table.

    Args:
        training_data: Table holding both features and target. It must
            support ``drop(label, axis=1)`` and column lookup by name
            (presumably a pandas DataFrame -- confirm against callers).
        label (str): Name of the target column in ``training_data``.
        model_params (dict): Training parameters.

    Returns:
        lightgbm.Booster: Trained LightGBM model.
    """
    features = training_data.drop(label, axis=1)
    # Read the target from the same column that was just removed from the
    # features, so the two always agree with the ``label`` argument.
    dataset = Dataset(data=features, label=training_data[label])
    return train(train_set=dataset, params=model_params)
def lgb_compatible_f1_score(y_hat: np.ndarray,
                            data: lgb.Dataset) -> Tuple[str, float, bool]:
    """F1 score in the ``(name, value, is_higher_better)`` shape.

    Predictions are rounded to integers first because scikit's f1 doesn't
    work with probabilities.
    """
    labels = data.get_label().astype(int)
    hard_predictions = np.round(y_hat).astype(int)
    score = f1_score(labels, hard_predictions)
    return "f1_score", score, True
def get_importances_from_model(X, y, features=None):
    """Train a binary LightGBM booster on all of ``X`` / ``y``.

    Args:
        X: Feature table. Must expose ``.columns`` when ``features`` is None.
        y: Target aligned with ``X`` (the objective is ``'binary'``).
        features: Feature names handed to LightGBM. Defaults to
            ``X.columns.tolist()``.

    Returns:
        The booster returned by ``train`` (presumably the caller reads the
        feature importances from it -- confirm against the call sites).
    """
    lgb_params = {
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'learning_rate': 0.02,
        'metric': 'auc',
        'colsample_bytree': 0.75,
        'subsample': 0.75,
        'n_estimators': 1500,
    }
    # Disabled settings kept for reference:
    #   num_leaves=34, max_depth=8, reg_alpha=0.041545473,
    #   reg_lambda=0.0735294, min_split_gain=0.0735294,
    #   min_child_weight=0.0735294, silent=False

    # Identity check: ``== None`` is evaluated element-wise on array-like
    # feature lists and then cannot be used as a condition.
    if features is None:
        features = X.columns.tolist()

    lgb_train = Dataset(data=X, label=y, feature_name=features)
    return train(params=lgb_params,
                 train_set=lgb_train,
                 verbose_eval=50,
                 num_boost_round=1500)
def lgb_custom_eval(y_pred: np.ndarray,
                    data: lgb.Dataset,
                    func_loss,
                    func_name: str,
                    is_higher_better: bool,
                    is_lgbdataset: bool = True):
    """Shared wrapper for LightGBM customized evaluation functions.

    Params::
        y_pred: Predictions. In the multi-class case the length is
            n_sample * n_class, laid out as
            array([pred of sample 0 for label 0, ..., pred of sample N for
            label 0, pred of sample 0 for label 1, ...]).
        data: The value that was set as train_set.
        func_loss: Callable taking (y_pred, y_true).
        func_name: Name returned as the first tuple element.
        is_higher_better: Returned unchanged as the last tuple element.
        is_lgbdataset: When this equals False the two inputs arrive swapped
            (``y_pred`` holds the labels and ``data`` the predictions).

    Returns:
        ``(func_name, func_loss(y_pred, y_true), is_higher_better)``.
    """
    if is_lgbdataset == False:  # noqa: E712 -- equality semantics kept as-is
        # Inputs arrive in reversed roles.
        targets, scores = y_pred.copy(), data
    else:
        scores = y_pred
        targets = data.label
        if is_callable(data, "ndf_label"):
            targets = data.get_culstom_label(targets.astype(int))

    n_rows = targets.shape[0]
    if scores.shape[0] != n_rows:
        # Multi-class case: unfold the flat vector to (n_sample, n_class).
        scores = scores.reshape(-1, n_rows).T

    return func_name, func_loss(scores, targets), is_higher_better
def lgb_custom_objective(y_pred: np.ndarray,
                         data: lgb.Dataset,
                         func_loss,
                         is_lgbdataset: bool = True):
    """Shared wrapper for LightGBM customized objective functions.

    Params::
        y_pred: Predictions. In the multi-class case the length is
            n_sample * n_class, laid out as
            array([pred of sample 0 for label 0, ..., pred of sample N for
            label 0, pred of sample 0 for label 1, ...]).
        data: The value that was set as train_set.
        func_loss: Callable taking (y_pred, y_true) and returning
            (grad, hess) with the same shape as y_pred.
        is_lgbdataset: When this equals False the two inputs arrive swapped
            (``y_pred`` holds the labels and ``data`` the predictions).

    Returns:
        ``(grad, hess)``, each transposed and flattened to one dimension.
    """
    if is_lgbdataset == False:  # noqa: E712 -- equality semantics kept as-is
        # Inputs arrive in reversed roles.
        targets, scores = y_pred.copy(), data
    else:
        scores = y_pred
        targets = data.label
        if is_callable(data, "ndf_label"):
            targets = data.get_culstom_label(targets.astype(int))

    n_rows = targets.shape[0]
    if scores.shape[0] != n_rows:
        # Multi-class case: unfold the flat vector to (n_sample, n_class).
        scores = scores.reshape(-1, n_rows).T

    grad, hess = func_loss(scores, targets)
    # Transposing before flattening restores the label-major layout.
    return grad.T.reshape(-1), hess.T.reshape(-1)
def test_onnxrt_python_lightgbm_categorical_iris(self):
    """ONNX conversion of LightGBM models trained on integer iris data.

    Checks that the converted ``LGBMClassifier`` and a booster trained with
    two categorical features both contain a ZipMap node and reproduce the
    LightGBM probabilities to 5 decimals.
    """

    def onnx_probabilities(model):
        # Convert, check for ZipMap, run on X_test, return probabilities.
        onx = to_onnx(model,
                      initial_types=[('X',
                                      Int64TensorType([None,
                                                       X_train.shape[1]]))])
        self.assertIn('ZipMap', str(onx))
        got = OnnxInference(onx).run({'X': X_test})
        return pandas.DataFrame(got['output_probability']).values

    iris = load_iris()
    X = (iris.data * 10).astype(numpy.int32)
    y = iris.target
    X_train, X_test, y_train, _ = train_test_split(X, y, random_state=11)

    # Append 1500 random rows labelled 3, 4 and 5 (500 each).
    random_rows = numpy.random.randint(0, high=10,
                                       size=(1500, X_train.shape[1]))
    X_train = numpy.vstack([X_train, random_rows]).astype(dtype=numpy.int32)
    extra_labels = [numpy.zeros(500) + value for value in (3, 4, 5)]
    y_train = numpy.hstack([y_train] + extra_labels).astype(dtype=numpy.int32)
    self.assertEqual(y_train.shape, (X_train.shape[0], ))
    y_train = y_train % 2  # fold the labels into two classes

    # Classic
    gbm = LGBMClassifier()
    gbm.fit(X_train, y_train)
    exp = gbm.predict_proba(X_test)
    self.assertEqualArray(exp, onnx_probabilities(gbm), decimal=5)

    # categorical_feature=[0, 1]
    train_data = Dataset(X_train,
                         label=y_train,
                         feature_name=['c1', 'c2', 'c3', 'c4'],
                         categorical_feature=['c1', 'c2'])
    params = {
        "boosting_type": "gbdt",
        "learning_rate": 0.05,
        "n_estimators": 2,
        "objective": "binary",
        "max_bin": 5,
        "min_child_samples": 100,
        'verbose': -1,
    }
    booster = lgb_train(params, train_data)
    exp = booster.predict(X_test)
    values = onnx_probabilities(booster)
    # The booster prediction is compared with the second probability column.
    self.assertEqualArray(exp, values[:, 1], decimal=5)
def lgb_f1_loss_multiclass(preds: np.ndarray,
                           train_data: lgb.Dataset,
                           clip: float = 1e-5):
    """Custom multiclass loss (described by its authors as optimizing f1).

    Args:
        preds: Flat raw predictions, reshaped column-major to
            ``(n_rows, n_classes)``.
        train_data: lgb dataset supplying integer class labels.
        clip: Softmax outputs are clipped to ``[clip, 1 - clip]``.

    Returns:
        ``(grad, hess)`` flattened column-major.
    """
    y_true = train_data.get_label().astype(np.int32)
    n_rows = y_true.shape[0]

    # Softmax probabilities, kept away from exactly 0 and 1
    probs = np.clip(softmax_ax1(preds.reshape((n_rows, -1), order='F')),
                    clip, 1 - clip)

    # One-hot encoded labels
    onehot = np.zeros_like(probs)
    onehot[np.arange(n_rows), y_true] = 1

    grad = (probs - onehot) * probs
    # The last factor is floored at 1e-3
    hess = (1 - probs) * probs * np.clip(2 * probs - onehot, 1e-3, np.inf)

    # Back to the flat layout of the incoming predictions
    return grad.reshape((-1, ), order='F'), hess.reshape((-1, ), order='F')
def lgb_f1_loss_multiclass(
        preds: np.ndarray,
        train_data: lgb.Dataset,
        clip: float = 1e-5) -> Tuple[np.ndarray, np.ndarray]:
    """Custom multiclass loss (described by its authors as optimizing f1).

    Args:
        preds: Predictions, flat; reshaped column-major to
            ``(n_rows, n_classes)``.
        train_data: Dataset in LightGBM format.
        clip: Clipping constant; softmax outputs are limited to
            ``[clip, 1 - clip]``.

    Returns:
        Gradient, hessian (both flattened column-major).
    """
    y_true = train_data.get_label().astype(np.int32)
    scores = preds.reshape((y_true.shape[0], -1), order="F")

    # Clipped softmax probabilities
    proba = np.clip(softmax_ax1(scores), clip, 1 - clip)

    # One-hot labels: pick rows of an identity matrix by class index
    one_hot = np.eye(proba.shape[1], dtype=proba.dtype)[y_true]

    gradient = (proba - one_hot) * proba
    curvature = np.clip((2 * proba - one_hot), 1e-3, np.inf)
    hessian = (1 - proba) * proba * curvature

    flat_shape = (-1, )
    return (gradient.reshape(flat_shape, order="F"),
            hessian.reshape(flat_shape, order="F"))
def test_lightgbm_booster_multi_classifier(self):
    """A 3-class booster converted to ONNX must reproduce ``predict``."""
    X = numpy.array(
        [[0, 1], [1, 1], [2, 0], [1, 2], [-1, 2], [1, -2]],
        dtype=numpy.float32)
    y = [0, 1, 0, 1, 2, 2]
    train_params = {
        'boosting_type': 'gbdt',
        'objective': 'multiclass',
        'n_estimators': 3,
        'min_child_samples': 1,
        'num_class': 3
    }
    model = train(train_params, Dataset(X, label=y))

    # Both wrapper classes are registered with the same option set.
    wrappers = (
        (WrappedLightGbmBoosterClassifier,
         'WrappedLightGbmBoosterClassifier'),
        (WrappedBooster, 'WrappedBooster'),
    )
    for wrapper_cls, alias in wrappers:
        update_registered_converter(
            wrapper_cls, alias,
            calculate_lightgbm_output_shapes, convert_lightgbm,
            parser=lightgbm_parser,
            options={'zipmap': [False, True], 'nocl': [False, True]})
    update_registered_converter(
        Booster, 'LightGbmBooster',
        calculate_lightgbm_output_shapes, convert_lightgbm,
        parser=lightgbm_parser)

    model_onnx = to_onnx(
        model,
        initial_types=[('X', FloatTensorType([None, 2]))],
        options={WrappedLightGbmBoosterClassifier: {'zipmap': False}},
        target_opset={'': TARGET_OPSET, 'ai.onnx.ml': TARGET_OPSET_ML})

    try:
        sess = InferenceSession(model_onnx.SerializeToString())
    except InvalidArgument as e:
        raise AssertionError(
            "Cannot load model\n%r" % str(model_onnx)) from e

    expected = model.predict(X)
    outputs = sess.run(None, {'X': X})
    # The second output is the one compared with the booster prediction.
    assert_almost_equal(expected, outputs[1])
def Dist_Objective(predt: np.ndarray, data: lgb.Dataset):
    """Custom objective training one output per expectile with the gradient
    and hessian functions provided by ``Expectile``.

    Args:
        predt: Flat raw predictions; reshaped column-major to
            ``(n_obs, Expectile.n_dist_param())``.
        data: Dataset providing labels and optional weights.

    Returns:
        ``(grad, hess)`` flattened column-major.
    """
    target = data.get_label()

    # When num_class != 0, preds has shape (n_obs, n_dist_param). Each element
    # in a row is a raw prediction (leaf weight) that has not gone through the
    # response function yet.
    preds_expectile = predt.reshape(-1, Expectile.n_dist_param(), order="F")

    # Weights: fetch once and test identity. ``== None`` on an array of
    # weights is element-wise and cannot be used as a condition.
    weights = data.get_weight()
    if weights is None:
        # Use 1 as weight if no weights are specified
        weights = np.ones_like(target, dtype=float)

    # Initialize gradient and hessian matrices, one column per expectile
    shape = (len(target), len(Expectile.expectiles))
    grad = np.zeros(shape=shape)
    hess = np.zeros(shape=shape)

    for i, tau in enumerate(Expectile.expectiles):
        grad[:, i] = Expectile.gradient_expectile(
            y=target,
            expectile=preds_expectile[:, i],
            tau=tau,
            weights=weights)
        hess[:, i] = Expectile.hessian_expectile(
            y=target,
            expectile=preds_expectile[:, i],
            tau=tau,
            weights=weights)

    # Flatten back to the column-major layout the predictions arrived in
    grad = grad.ravel(order="F")
    hess = hess.ravel(order="F")
    return grad, hess
def __call__(self, pred: np.ndarray,
             dtrain: lgb.Dataset) -> Tuple[str, float, bool]:
    """Evaluate ``self.metric_func`` on LightGBM predictions.

    Args:
        pred: Predictions; reshaped column-major to ``(n_obs, -1)`` when its
            length differs from the number of labels.
        dtrain: Dataset providing labels and weights.

    Returns:
        ``('Opt metric', value, self.greater_is_better)``.
    """
    y_true = dtrain.get_label()
    sample_w = dtrain.get_weight()
    n_obs = y_true.shape[0]

    if pred.shape[0] != n_obs:
        # Flat multi-column predictions: unfold them; labels become int32.
        pred = pred.reshape((n_obs, -1), order='F')
        y_true = y_true.astype(np.int32)

    scores = self.bw_func(pred)

    # for weighted case: retry without weights when the call raises TypeError
    try:
        result = self.metric_func(y_true, scores, sample_weight=sample_w)
    except TypeError:
        result = self.metric_func(y_true, scores)

    # TODO: what if grouped case
    return 'Opt metric', result, self.greater_is_better
def test_lightgbm_booster_classifier(self):
    """A small binary random-forest booster converts to a non-empty ONNX model."""
    from lightgbm import Dataset, train as lgb_train

    features = numpy.array([[0, 1], [1, 1], [2, 0], [1, 2]],
                           dtype=numpy.float32)
    targets = [0, 1, 0, 1]
    booster_params = {
        'boosting_type': 'rf',
        'objective': 'binary',
        'n_estimators': 3,
        'min_child_samples': 1,
        'subsample_freq': 1,
        'bagging_fraction': 0.5,
        'feature_fraction': 0.5,
    }
    booster = lgb_train(booster_params, Dataset(features, label=targets))
    converted = to_onnx(booster, features, verbose=0, rewrite_ops=True,
                        target_opset=TARGET_OPSET)
    self.assertNotEmpty(converted)
def lgb_mape(preds: np.ndarray, lgb_train: Dataset) -> Tuple[str, float, bool]:
    """Mean absolute percentage error (MAPE) metric for evaluation in lightgbm.

    Rows whose label is exactly 0 are left out, since their relative error
    is undefined.

    Args:
        preds: Array of predictions
        lgb_train: LightGBM Dataset

    Returns:
        Tuple ``("mape", error, False)``; the trailing ``False`` marks the
        metric as lower-is-better. ``error`` is NaN when every label is 0.
    """
    labels = lgb_train.get_label()
    nonzero = labels != 0
    if not nonzero.any():
        # Nothing to average over.
        return "mape", float("nan"), False

    # Select the usable rows before dividing so zero labels never reach the
    # division, and divide by the magnitude so negative labels cannot make
    # the error negative.
    actual = labels[nonzero]
    predicted = np.asarray(preds)[nonzero]
    error = (np.fabs(actual - predicted) / np.fabs(actual)).mean()
    return "mape", error, False
def lgb_pr_auc(preds: np.ndarray, lgb_train: Dataset) -> Tuple[str, float, bool]:
    """Area under the precision-recall curve of the predictions, for lightgbm.

    Args:
        preds: Array of predictions
        lgb_train: LightGBM Dataset

    Returns:
        Tuple ``("pr_auc", area, True)``.
    """
    y_true = lgb_train.get_label()
    curve = precision_recall_curve(y_true, preds)
    precision_values, recall_values = curve[0], curve[1]
    # Recall is the x-axis and precision the y-axis of the integrated curve.
    area = auc(recall_values, precision_values)
    return "pr_auc", area, True
def top2_accuray_lgb(
    predt: np.ndarray,
    data: lgb.Dataset,
    threshold: float = 0.5,
    num_class: int = 31,
) -> Tuple[str, float, bool]:
    """Top-2 accuracy: share of rows whose label is among the two
    highest-scoring classes.

    Args:
        predt: Flat predictions of length ``num_class * n_samples``; reshaped
            to ``(num_class, n_samples)``.
        data: Dataset providing the labels.
        threshold: Not used by the computation; kept so existing callers
            keep working.
        num_class: Number of classes encoded in ``predt``. Defaults to 31,
            the value that used to be fixed.

    Returns:
        ``('top2_accuray', accuracy, True)`` -- eval_name, eval_result,
        is_higher_better.
    """
    n_samples = len(predt) // num_class
    scores = predt.reshape(num_class, n_samples)
    y = data.get_label()

    # Class indices per sample, best score first.
    ranked = scores.argsort(axis=0)[::-1, :]
    hit = (y == ranked[0, :]) | (y == ranked[1, :])
    return 'top2_accuray', float(hit.mean()), True
def Dist_Metric(predt: np.ndarray, data: lgb.Dataset):
    """Custom evaluation metric: Gaussian negative log-likelihood of the labels.

    Returns:
        ``("NegLogLikelihood", nll, False)``; NaN terms are skipped in the sum.
    """
    is_higher_better = False
    target = data.get_label()

    # With a custom objective the metric receives raw predictions, so each
    # column is passed through its response function first.
    raw = predt.reshape(-1, Gaussian.n_dist_param(), order="F")
    location = Gaussian.param_dict()["location"](raw[:, 0])
    scale = Gaussian.param_dict()["scale"](raw[:, 1])

    log_density = norm.logpdf(x=target, loc=location, scale=scale)
    return "NegLogLikelihood", -np.nansum(log_density), is_higher_better
def lgb_mape_exp(preds: np.ndarray, lgb_train: Dataset) -> Tuple[str, float, bool]:
    """Mean absolute percentage error (MAPE) metric for evaluation in lightgbm.

    NOTE: This exponentiates the predictions first, for the case where the
    model was fitted on a logged target. Rows whose label is exactly 0 are
    left out, since their relative error is undefined.

    Args:
        preds: Array of predictions
        lgb_train: LightGBM Dataset

    Returns:
        Tuple ``("mape_exp", error, False)``; the trailing ``False`` marks the
        metric as lower-is-better. ``error`` is NaN when every label is 0.
    """
    labels = lgb_train.get_label()
    nonzero = labels != 0
    if not nonzero.any():
        # Nothing to average over.
        return "mape_exp", float("nan"), False

    # Select the usable rows before exponentiating and dividing so zero
    # labels never reach the division, and divide by the magnitude so
    # negative labels cannot make the error negative.
    actual = labels[nonzero]
    predicted = np.exp(np.asarray(preds)[nonzero])
    error = (np.fabs(actual - predicted) / np.fabs(actual)).mean()
    return "mape_exp", error, False
def corr_sharpe_lgb(
    time_id_fold,
    y_pred: np.array,
    dtrain: lgb.Dataset,
) -> Tuple[str, float, bool]:
    """Pearson correlation coefficient metric (sharpe variant).

    Builds a frame with the time ids, predictions and labels and returns the
    first element of ``calculate_corr(..., sharpe=True)``.

    Returns:
        ``('pearson_corr_sharpe', value, True)``.
    """
    frame = pd.DataFrame({
        'time_id': time_id_fold,
        'y_pred': y_pred,
        'y_true': dtrain.get_label(),
    })
    corr_result = calculate_corr(frame, sharpe=True)
    return 'pearson_corr_sharpe', corr_result[0], True
def test_onnxrt_python_lightgbm_categorical_iris_booster3_real(self):
    """ONNX conversion of 3-class LightGBM boosters trained on float iris data.

    Checks that the booster of a fitted ``LGBMClassifier`` and a booster
    trained with two categorical features both contain a ZipMap node and
    reproduce the LightGBM probabilities to 5 decimals.
    """
    from lightgbm import LGBMClassifier, Dataset, train as lgb_train

    def onnx_probabilities(model):
        # Convert, check for ZipMap, run on X_test, return probabilities.
        onx = to_onnx(
            model,
            initial_types=[('X', FloatTensorType([None, X_train.shape[1]]))],
            target_opset=TARGET_OPSET)
        self.assertIn('ZipMap', str(onx))
        got = OnnxInference(onx).run({'X': X_test})
        return pandas.DataFrame(got['output_probability']).values

    iris = load_iris()
    X = (iris.data * 10).astype(numpy.float32)
    y = iris.target
    X_train, X_test, y_train, _ = train_test_split(X, y, random_state=11)

    # Classic
    gbm = LGBMClassifier()
    gbm.fit(X_train, y_train)
    exp = gbm.predict_proba(X_test)
    self.assertEqualArray(exp, onnx_probabilities(gbm.booster_), decimal=5)

    # categorical_feature=[0, 1]
    train_data = Dataset(
        X_train, label=y_train,
        feature_name=['c1', 'c2', 'c3', 'c4'],
        categorical_feature=['c1', 'c2'])
    params = {
        "boosting_type": "gbdt",
        "learning_rate": 0.05,
        "n_estimators": 2,
        "objective": "multiclass",
        "max_bin": 5,
        "min_child_samples": 100,
        'verbose': -1,
        'num_class': 3,
    }
    booster = lgb_train(params, train_data)
    exp = booster.predict(X_test)
    self.assertEqualArray(exp, onnx_probabilities(booster), decimal=5)
def objective(params, n_folds=self.n_folds):
    """Cross-validate one hyper-parameter sample and report ``1 - best AUC``.

    ``params`` is modified in place: the nested ``'boosting_type'`` entry is
    flattened, three entries are cast to int, and ``verbose`` /
    ``histogram_pool_size`` are set.

    Returns:
        Dict with ``loss``, ``params``, ``iteration``, ``estimators``,
        ``train_time`` and ``status``.
    """
    self.iteration += 1

    # 'boosting_type' arrives as a nested dict that may carry 'subsample'.
    boosting_choice = params['boosting_type']
    params['subsample'] = boosting_choice.get('subsample', 1.0)
    params['boosting_type'] = boosting_choice['boosting_type']
    params['verbose'] = -1

    for int_key in ('num_leaves', 'subsample_for_bin', 'min_child_samples'):
        params[int_key] = int(params[int_key])

    # NOTE: this parameter is introduced to reduce memory consumption
    params['histogram_pool_size'] = 1024

    self.logger.debug("Parameters: {}".format(params))

    started = timer()
    train_set = Dataset(x_train, label=y_train)
    # Perform n_folds cross validation
    cv_results = cv(params,
                    train_set,
                    num_boost_round=10000,
                    nfold=n_folds,
                    early_stopping_rounds=100,
                    metrics='auc',
                    seed=self.seed)
    run_time = timer() - started

    auc_by_round = cv_results['auc-mean']
    # Loss must be minimized, so the best AUC is turned into 1 - AUC
    loss = 1 - np.max(auc_by_round)
    # Boosting rounds that returned the highest cv score
    n_estimators = int(np.argmax(auc_by_round) + 1)

    return {
        'loss': loss,
        'params': params,
        'iteration': self.iteration,
        'estimators': n_estimators,
        'train_time': run_time,
        'status': STATUS_OK
    }
def fit_lightgbm(self, x, y, early_stopping_rounds):
    """Fit ``self.model`` as an ``LGBMModel`` built from the optimized params.

    Args:
        x: Training features.
        y: Training target; also used to stratify the validation split.
        early_stopping_rounds: When None the model is fitted on all of
            ``x`` / ``y``. Otherwise a stratified validation part of size
            ``self.test_size`` is held out, the model is fitted on the
            remaining rows and early stopping monitors the held-out part.
    """
    self.model = LGBMModel(**self.optimized_params)

    if early_stopping_rounds is None:
        self.model.fit(x, y)
        return

    # Split features and target together: with a single array
    # train_test_split returns (train, test) of that array, not (X, y).
    x_train, x_valid, y_train, y_valid = train_test_split(
        x, y,
        stratify=y,
        shuffle=True,
        test_size=self.test_size,
        random_state=self.random_state)

    # Fit on the training part only so the validation rows stay unseen, and
    # pass eval_set as a list of (X, y) pairs as the estimator API expects.
    self.model.fit(x_train, y_train,
                   eval_set=[(x_valid, y_valid)],
                   early_stopping_rounds=early_stopping_rounds,
                   verbose=self.verbose)
def Dist_Metric(predt: np.ndarray, data: lgb.Dataset):
    """Custom evaluation metric: Gaussian negative log-likelihood of the
    labels, computed with torch and rounded to 5 decimals.

    Returns:
        ``("NegLogLikelihood", nll, False)``; NaN terms are skipped in the sum.
    """
    is_higher_better = False
    target = torch.tensor(data.get_label())

    # With a custom objective the metric receives raw predictions, so each
    # column is passed through its response function first.
    raw = predt.reshape(-1, Gaussian.n_dist_param(), order="F")
    location = torch.tensor(Gaussian.param_dict()["location"](raw[:, 0]),
                            requires_grad=True)
    scale = torch.tensor(Gaussian.param_dict()["scale"](raw[:, 1]),
                         requires_grad=True)

    neg_log_lik = -Normal(location, scale).log_prob(target).nansum()
    # Detach from the autograd graph before converting to numpy.
    nll = np.round(neg_log_lik.detach().numpy(), 5)
    return "NegLogLikelihood", nll, is_higher_better
def Dist_Metric(predt: np.ndarray, data: lgb.Dataset):
    """Custom evaluation metric: NaN-ignoring mean of the expectile losses
    over all configured expectiles.

    The value is reported under the name ``"NegLogLikelihood"``.

    Returns:
        ``("NegLogLikelihood", value, False)``.
    """
    is_higher_better = False
    target = data.get_label()

    # With a custom objective the metric receives raw predictions; one
    # column per expectile after the column-major reshape.
    preds_expectile = predt.reshape(-1, Expectile.n_dist_param(), order="F")

    losses = [
        Expectile.expectile_loss(y=target,
                                 expectile=preds_expectile[:, column],
                                 tau=tau)
        for column, tau in enumerate(Expectile.expectiles)
    ]
    return "NegLogLikelihood", np.nanmean(losses), is_higher_better
def main(verbose=True, force=False, test=False):
    """Run the full pipeline: load, engineer features, train and predict.

    Steps, in order:

    1. Load the raw dataframe (cached under ``TRAIN_PATH``).
    2. Add leak statistics, hierarchical-cluster labels, non-zero / diff
       aggregates and an empirical distribution of each row's values.
    3. One-hot encode categorical features, quantile-transform the others and
       append t-SNE components.
    4. Train XGBoost and LightGBM with ``GroupKFold`` (groups are the
       euclidean clusters) and write the averaged test predictions.
    5. Grid-search an ``XGBRegressor`` and write its predictions.

    Parameters
    ----------
    verbose : bool
        Currently unused.
    force : bool
        When true, re-read the raw data even if the cached file exists.
    test : bool
        Currently unused.
    """
    import datetime

    def _elapsed(start):
        # Seconds elapsed since ``start``.
        return (datetime.datetime.now() - start).total_seconds()

    IGNORE_FEATURES = []
    os.makedirs(ANALYSIS_PATH, exist_ok=True)
    os.makedirs(TRAIN_PATH, exist_ok=True)

    # ------------------------------------------------------------------
    # Load the data (cached with joblib)
    # ------------------------------------------------------------------
    raw_df_name = os.path.join(TRAIN_PATH, 'data_raw.pyt')
    st_time = datetime.datetime.now()
    print('Loading the data...')
    if not os.path.isfile(raw_df_name) or force:
        df = read()
        df.set_index(ID, inplace=True)
        print('\tWriting \033[92m%s\033[0m' % (raw_df_name))
        with open(raw_df_name, 'wb') as pyt:
            joblib.dump(df, pyt)
    else:
        print('\tLoading data from \033[92m%s\033[0m' % (raw_df_name))
        df = joblib.load(raw_df_name)
    print('-- Took %i seconds.' % _elapsed(st_time))

    # log-scale the predictors & predictand
    # (the histogram bins are built on the raw target before it is logged)
    bins_target = np.logspace(np.log10(df[PREDICTAND].min()),
                              np.log10(df[PREDICTAND].max()), 20)
    df[PREDICTAND] = np.log1p(df[PREDICTAND])
    predictors = [c for c in df.columns if c not in ['isTrain', PREDICTAND]]

    # Counts of 0s or non-0s is very different between test and train sets !
    pstep = 5
    percs = np.arange(pstep, 100, pstep)
    calculated_cols = []
    columns_then = df.columns

    # ------------------------------------------------------------------
    # Add the info relative to the leak as it affects the training / test
    # processes
    # ------------------------------------------------------------------
    leak_file = os.path.join(TRAIN_PATH, "df_leaked_%s.pyt" % N_LAGS)
    if os.path.isfile(leak_file):
        df_leaked = joblib.load(leak_file)
    else:
        df_ = df[predictors].reset_index(level=0)
        # Assign positionally: df_ has a fresh integer index after
        # reset_index, so assigning the Series itself would align on
        # mismatching labels and fill the column with NaN.
        df_[PREDICTAND] = df[PREDICTAND].values
        df_ = df_[['ID', PREDICTAND] + predictors]
        df_leaked = get_all_leak(df_, COLUMNS_LEAK, N_LAGS)
        leak_cols = [c for c in df_leaked if c.startswith('leak')]
        df_leaked = df_leaked[leak_cols]
        with open(leak_file, 'wb') as pyt:
            joblib.dump(df_leaked, pyt)
    df_leaked.index = df.index
    leak_cols = df_leaked.columns
    df['nb_potential_leaks'] = df_leaked.notnull().sum(axis=1)
    df['leak_mean'] = df_leaked.mean(axis=1).fillna(0)
    df['leak_median'] = df_leaked.median(axis=1).fillna(0)
    df['leak_max'] = df_leaked.max(axis=1).fillna(0)
    df['leak_min'] = df_leaked.min(axis=1).fillna(0)

    # ------------------------------------------------------------------
    # Hierarchical clustering seems to have a predictive power
    # ------------------------------------------------------------------
    n_clusters = 12
    # "euclidean" must stay last: ``tree`` and ``ward_linkage`` from the final
    # iteration are reused for the Mojena and dendrogram plots below.
    for distance in ("hamming", "jaccard", "sokalmichener", "sokalsneath",
                     "euclidean"):
        st_time = datetime.datetime.now()
        print(
            'Finding \033[92m%i clusters\033[0m with \033[92m%s distance\033[0m'
            % (n_clusters, distance))
        dist_fname = os.path.join(TRAIN_PATH, "%s_dists.pyt" % distance)
        if os.path.isfile(dist_fname):
            dist = joblib.load(dist_fname)
            print('-- Pairwise distance loading took %i seconds.' %
                  _elapsed(st_time))
        else:
            if distance == "euclidean":
                # Clustering on the row-wise sorted values to detect similar
                # entries; sorted in one vectorised call and only when the
                # distances are not cached.
                sorted_rows = np.sort(df[predictors].values, axis=1)
                dist = ss.distance.pdist(sorted_rows, distance)
            else:
                # The boolean metrics work on the zero / non-zero pattern.
                dist = ss.distance.pdist(
                    df[predictors].values.astype(bool), distance)
            print('-- Pairwise distance computation took %i seconds.' %
                  _elapsed(st_time))
            with open(dist_fname, 'wb') as pyt:
                joblib.dump(dist, pyt)

        ward_linkage = hierarchy.ward(dist)
        tree = hierarchy.to_tree(ward_linkage)
        cluster_colname = 'cluster_%s' % distance
        df[cluster_colname] = hierarchy.fcluster(
            ward_linkage,
            _get_height_at(tree, n_clusters),
            criterion="distance")
        print('-- Took %i seconds.' % _elapsed(st_time))
        # Avoid piling up duplicates in the shared list on repeated calls.
        if cluster_colname not in CATEGORICAL_FEATURES:
            CATEGORICAL_FEATURES.append(cluster_colname)
        sns.catplot(x=cluster_colname,
                    y=PREDICTAND,
                    data=df.groupby('isTrain').get_group(True),
                    kind="violin")
        plt.savefig(os.path.join(ANALYSIS_PATH, '%s.png' % cluster_colname))
        plt.close()

    # Keep euclidean clusters as 'cluster_colname' for K-fold grouping
    cluster_colname = "cluster_euclidean"

    print('Mojena stopping rule')
    clusters_for_plot = np.arange(1, 101)
    heights = np.array(
        [_get_height_at(tree, k) for k in clusters_for_plot])
    plt.figure()
    plt.plot(clusters_for_plot, heights, 'ko--')
    plt.grid()
    plt.xlabel('Number of clusters')
    plt.ylabel('Dendrogram height')
    plt.savefig(os.path.join(ANALYSIS_PATH,
                             '%s_mojena.png' % cluster_colname))
    plt.close()

    print('Dendrogram for Euclidean distance')
    hierarchy.dendrogram(ward_linkage,
                         no_labels=True,
                         above_threshold_color='k')
    plt.ylabel('height')
    plt.xlabel('samples')
    plt.savefig(
        os.path.join(ANALYSIS_PATH, '%s_dendrogram.png' % cluster_colname))
    plt.close()

    # ------------------------------------------------------------------
    # Row-wise aggregates on the log-scaled predictors
    # ------------------------------------------------------------------
    df[predictors] = np.log1p(df[predictors])

    def _nonzero_aggregates(row):
        # Summary statistics and percentiles of the strictly positive values.
        r = row[row > 0]
        return np.append([
            (row > 0).sum(),
            r.mean(),
            (r**2).mean(),
            r.std(),
            r.max(),
            r.min(),
            r.skew(),
            r.kurtosis(),
        ], r.quantile(q=percs / 100))

    def _diff_aggregates(row):
        # Same statistics on the absolute successive differences of the
        # strictly positive values.
        r = row[row > 0].diff().abs()
        return np.append([
            r.mean(),
            (r**2).mean(),
            r.std(),
            r.max(),
            r.min(),
            r.skew(),
            r.kurtosis(),
        ], r.quantile(q=percs / 100))

    st_time = datetime.datetime.now()
    print('Computing non-zero aggregates...')
    df[[
        'count_nonzero',
        'mean_nonzero',
        'meansq_nonzero',
        'std_nonzero',
        'max_nonzero',
        'min_nonzero',
        'skew_nonzero',
        'kurt_nonzero',
    ] + ['p%i' % p for p in percs]] = df[predictors].apply(
        _nonzero_aggregates, axis=1, result_type="expand").fillna(0)
    print('-- Took %i seconds.' % _elapsed(st_time))

    st_time = datetime.datetime.now()
    print('Computing diff aggregates...')
    df[[
        'diff_mean_nonzero',
        'diff_meansq_nonzero',
        'diff_std_nonzero',
        'diff_max_nonzero',
        'diff_min_nonzero',
        'diff_skew_nonzero',
        'diff_kurtosis_nonzero',
    ] + ['diff_p%i' % p for p in percs]] = df[predictors].apply(
        _diff_aggregates, axis=1, result_type="expand").fillna(0)
    print('-- Took %i seconds.' % _elapsed(st_time))

    # add occurrences (will it help ?)
    print('Computing distributions...')

    def func_epd(row):
        # Fraction of the row's positive values falling in each target bin.
        # The removed ``normed=True`` option followed by a division by the sum
        # reduced to exactly this ratio (up to floating-point rounding), so
        # plain counts are used instead; rows without any value in range
        # yield NaN, which the caller replaces by 0.
        counts = np.histogram(np.exp(row[row > 0].values) - 1,
                              bins=bins_target)[0]
        return counts / np.sum(counts)

    df[['epd_%i' % b for b in bins_target[:-1]]] = df[predictors].apply(
        func_epd, axis=1, result_type="expand").fillna(0)

    columns_now = df.columns
    calculated_cols.extend([c for c in columns_now if c not in columns_then])

    # From here on only the engineered columns are used as predictors.
    df.drop(predictors, axis=1, inplace=True)
    predictors = [c for c in calculated_cols if c in df.columns]
    print('-- Took %i seconds.' % _elapsed(st_time))

    # ------------------------------------------------------------------
    # Feature transformation
    # ------------------------------------------------------------------
    st_time = datetime.datetime.now()
    print('Transforming the features')
    cols_to_remove = []
    cols_to_add = []
    for col in predictors:
        if col in CATEGORICAL_FEATURES:
            print('\tFeature %s is categorical -> OneHot' % col)
            res = OneHotEncoder().fit_transform(df[col].values.reshape(-1, 1))
            for i, ax in enumerate(res.transpose(), 1):
                onehot = '{}_{}'.format(col, i)
                df[onehot] = ax.toarray().squeeze()
                cols_to_add.append(onehot)
            cols_to_remove.append(col)
        else:
            print('\tFeature %s is numerical -> QuantileTransformer' % col)
            try:
                df[col] = QuantileTransformer().fit_transform(
                    df[col].values.reshape(-1, 1))
            except Exception:
                # Best effort: keep the untransformed column, but show why.
                print("\033[91mQuantileTransformer failed on %s\033[0m" % col)
                print_exc()
    print('-- Took %i seconds.' % _elapsed(st_time))

    # The original categorical columns stay in ``df`` (the euclidean cluster
    # label is needed for the grouped K-fold below) but are excluded from the
    # model inputs.
    IGNORE_FEATURES.extend(cols_to_remove)
    for col in cols_to_remove:
        predictors.remove(col)
        calculated_cols.remove(col)
    predictors.extend(cols_to_add)

    # ------------------------------------------------------------------
    # T-SNE
    # ------------------------------------------------------------------
    st_time = datetime.datetime.now()
    print('Running T-SNE...')
    fname = os.path.join(ANALYSIS_PATH, "tsne", "tsne.png")
    os.makedirs(os.path.dirname(fname), exist_ok=True)
    tsne_comps = tsne(
        df[predictors + [PREDICTAND, 'isTrain']],
        fname,
        nb=len(df),
        perplexity=40,
        title=None,
        visu_tsne=None,
        cmap='viridis',
        predictand=PREDICTAND,
        binary=False,
    )
    with open(
            os.path.join(
                TRAIN_PATH, "tsne_%s.pyt" %
                (datetime.datetime.now().strftime('%Y%m%d%H%M'))),
            'wb') as pyt:
        joblib.dump(tsne_comps, pyt)
    try:
        for i, tsne_ax in enumerate(tsne_comps.transpose(), 1):
            df['tsne%i' % i] = tsne_ax
            calculated_cols.append('tsne%i' % i)
    except Exception:
        # Best effort: the pipeline continues without the t-SNE features.
        print('\033[91mWARNING ! could not add t-sne values\033[0m')
        print_exc()
    print('-- Took %i seconds.' % _elapsed(st_time))

    analyze(df, calculated_cols, step='preprocessed')

    # ------------------------------------------------------------------
    # Train / test matrices
    # ------------------------------------------------------------------
    df_train = select_sample(df, "train")
    df_test = select_sample(df, "test")
    fname = os.path.join(TRAIN_PATH, 'df_train.pyt')
    print('Saving df_train to \033[92m%s\033[0m' % fname)
    with open(fname, 'wb') as pyt:
        joblib.dump(df_train, pyt)

    predictors = [c for c in df_train.columns if c not in IGNORE_FEATURES]
    predictors.remove(PREDICTAND)
    X_train = df_train[predictors].values
    y_train = df_train[PREDICTAND].values
    X_test = df_test[predictors].values
    test_rows = df_test.index

    # Load the "leaked" target
    leaked_target = df_leaked.loc[test_rows, leak_cols].median(axis=1)
    leaked_count = df_leaked.loc[test_rows, leak_cols].notnull().sum(axis=1)
    leak_inds = np.where(leaked_count > 0)[0]

    def save_model(model, name, y_pred=None, replace_leak=False):
        """Persist ``model`` (if any) and write the prediction CSV.

        ``y_pred`` is expected in log1p space; when omitted it is computed
        with ``model.predict(X_test)``. With ``replace_leak`` the predictions
        of the test rows that have at least one leak value are replaced by
        the median leak and the file gets a ``_leak`` suffix.
        """
        if model is not None:
            fname = os.path.join(TRAIN_PATH, "%s.pyt" % name)
            os.makedirs(TRAIN_PATH, exist_ok=True)
            with open(fname, "wb") as pyt:
                joblib.dump({'model': model}, pyt)
            print('\tSaved model to \033[92m%s\033[0m' % fname)

        # Make sure the output directory exists before writing the CSV.
        os.makedirs(OUTPUT_PATH, exist_ok=True)
        fname = os.path.join(OUTPUT_PATH, "%s.csv" % name)
        if y_pred is None:
            y_pred = model.predict(X_test)
        # Back to the original target scale (expm1 returns a new array, so
        # the caller's predictions are not modified by the leak replacement).
        y_pred = np.expm1(y_pred)
        if replace_leak:
            y_pred[leak_inds] = leaked_target.values[leak_inds]
            fname = fname.replace('.csv', '_leak.csv')
        df_result = pd.DataFrame({ID: df_test.index, PREDICTAND: y_pred})
        df_result.to_csv(fname, index=False)
        print('\tSaved prediction to \033[92m%s\033[0m' % fname)

    # ------------------------------------------------------------------
    # Grouped K-fold training of XGBoost and LightGBM
    # ------------------------------------------------------------------
    from lightgbm import Dataset
    from lightgbm import train as train_lgb

    nfolds = 10
    folds = GroupKFold(n_splits=nfolds)
    y_pred_xgb = np.zeros(len(X_test))
    y_pred_lgbm = np.zeros(len(X_test))
    lgb_params = {
        'task': 'train',
        'boosting_type': 'gbdt',
        'objective': 'regression',
        'metric': {'mse'},
        'num_leaves': 124,
        'learning_rate': 0.05,
        'feature_fraction': 0.8,
        'verbose': -1,
        'num_boost_round': 15000,
        'early_stopping_rounds': 100,
        'nthread': 26
    }

    def _rmse_func(predictions, ground_truth):
        return np.sqrt(mean_squared_error(predictions, ground_truth))

    def rmse(predictions, train_data):
        # Custom LightGBM evaluation function: (name, value, higher-is-better)
        labels = train_data.get_label()
        return 'RMSE', _rmse_func(predictions, labels), False

    for ifold, (trn_idx, val_idx) in enumerate(
            folds.split(X_train, y_train,
                        df_train[cluster_colname].values)):
        print("Fold nb. %i" % ifold)
        lgb_train = Dataset(data=X_train[trn_idx, :],
                            label=y_train[trn_idx],
                            feature_name=predictors)
        lgb_val = Dataset(data=X_train[val_idx, :],
                          label=y_train[val_idx],
                          feature_name=predictors)

        # XGBoost
        reg = XGBRegressor(n_estimators=600,
                           max_depth=5,
                           learning_rate=0.05,
                           random_state=42)
        reg.fit(X_train[trn_idx, :], y_train[trn_idx])
        pred_fold = reg.predict(X_train[val_idx, :])
        print('\t[XGBoost] oof RMSE is: \033[92m%.4f\033[0m' %
              np.sqrt(mean_squared_error(y_train[val_idx], pred_fold)))
        y_pred_xgb += reg.predict(X_test) / nfolds

        # LightGBM
        reg = train_lgb(lgb_params,
                        lgb_train,
                        num_boost_round=15000,
                        early_stopping_rounds=100,
                        verbose_eval=100,
                        valid_sets=[lgb_train, lgb_val],
                        feval=rmse)
        y_pred = reg.predict(X_train[val_idx, :],
                             num_iteration=reg.best_iteration)
        score = np.sqrt(mean_squared_error(y_train[val_idx], y_pred))
        print('\t[LGBM] Best iteration: \033[92m%i\033[0m' %
              reg.best_iteration)
        print('\t[LGBM] oof RMSE is: \033[92m%.4f\033[0m' % score)
        y_pred_lgbm += reg.predict(
            X_test, num_iteration=reg.best_iteration) / nfolds

    save_model(None, "LightGBM_folded", y_pred_lgbm, replace_leak=True)
    save_model(None, "XGBoost_folded", y_pred_xgb)
    save_model(None, "LightGBM_folded", y_pred_lgbm)
    save_model(None, "XGB-LGBM_folded", 0.5 * (y_pred_xgb + y_pred_lgbm))

    # ------------------------------------------------------------------
    # Grid search - gradient boosting (XGBoost)
    # ------------------------------------------------------------------
    gsDict = {}
    print('\033[1mGridSearch - XGBRegressor\033[0m')
    reg = XGBRegressor()
    xgb_param_grid = {
        'n_estimators': [600, 300, 900],
        'learning_rate': [0.1, 0.05, 0.01],
        'max_depth': [5, 4, 6],
        'missing': [None, 0.],
        'booster': ["gbtree", "gblinear", "dart"],
    }
    gsXGBoost = GridSearchCV(reg,
                             param_grid=xgb_param_grid,
                             cv=nfolds,
                             scoring="neg_mean_squared_error",
                             n_jobs=36,
                             verbose=1)
    gsXGBoost.fit(X_train, y_train)
    gbc_best = gsXGBoost.best_estimator_
    print('\tBest score: \033[92m%.4f\033[0m' % gsXGBoost.best_score_)
    for key in xgb_param_grid.keys():
        print('\t\t%s: \033[92m%s\033[0m' %
              (key, getattr(gbc_best, key, '-')))
    gbc_best.fit(X_train, y_train)
    save_model(gbc_best, "XGBoost")
    gsDict["XGBoost"] = gsXGBoost