def get_importances_from_model(X, y, features=None, verbose=50, early_stopping_rounds=200):
    """Train a binary-classification LightGBM booster on a holdout split.

    Args:
        X (pandas.DataFrame): Feature matrix.
        y: Binary target aligned with ``X``.
        features (list, optional): Feature names; defaults to all columns of X.
        verbose (int): Print interval forwarded to ``verbose_eval``.
        early_stopping_rounds (int): Stop when validation AUC has not
            improved for this many rounds.

    Returns:
        lightgbm.Booster: Trained booster (inspect ``feature_importance()``).
    """
    lgb_params = {
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'learning_rate': 0.03,
        'metric': 'auc',
        'num_iterations': 10000,  # upper bound; early stopping ends sooner
        'colsample_bytree': 0.5,
        'subsample': 0.8,
        'reg_alpha': 0.3,
        'reg_lambda': 0.3,
        'max_depth': 8,
    }
    # FIX: use `is None` instead of `== None` — equality can be overloaded
    # (pandas objects raise/return arrays); identity is the correct test.
    if features is None:
        features = X.columns.tolist()
    train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=2017)
    lgb_train = Dataset(data=train_X, label=train_y, feature_name=features)
    lgb_val = Dataset(data=val_X, label=val_y, feature_name=features)
    lgb_booster = train(params=lgb_params,
                        train_set=lgb_train,
                        valid_sets=[lgb_train, lgb_val],
                        valid_names=["train", "validation"],
                        verbose_eval=verbose,
                        early_stopping_rounds=early_stopping_rounds)
    return lgb_booster
def fit_lgb(x_tr, y_tr, x_va, y_va, cat_feats, args):
    """Train a LightGBM RMSE regressor and report train/validation scores.

    Args:
        x_tr, y_tr: Training features and target.
        x_va, y_va: Validation features and target; only used when
            ``args.mode`` is neither ``'full'`` nor ``'fold'``.
        cat_feats: Categorical feature names/indices for ``lgb.train``.
        args: Namespace providing ``clip_target``, ``mode``, ``num_leaves``,
            ``verbose_eval``, ``num_boost_round`` and ``lr_decay``.

    Returns:
        tuple: ``(booster, train_rmse, valid_rmse)`` where ``valid_rmse``
        is ``0.`` when no validation set is used.
    """
    from lightgbm import Dataset
    # -1 means "do not clip"; otherwise cap the target from above.
    if args.clip_target != -1:
        y_tr = y_tr.clip(upper=args.clip_target)
    tr_ds = Dataset(x_tr, label=y_tr, free_raw_data=False)
    # 'full'/'fold' modes train on everything: no holdout, hence no
    # early stopping further below.
    if args.mode not in ['full', 'fold']:
        va_ds = Dataset(x_va, label=y_va, free_raw_data=False)
        valid_sets = [tr_ds, va_ds]
    else:
        valid_sets = [tr_ds]
    params = {
        'learning_rate': 0.02,
        'max_depth': -1,  # unlimited depth; complexity bounded by num_leaves
        'boosting': 'gbdt',
        'objective': 'regression',
        'metric': 'rmse',
        'is_training_metric': True,
        'num_leaves': args.num_leaves,
        'feature_fraction': 0.9,
        'bagging_fraction': 0.7,
        'lambda_l2': 0.7,
        'bagging_freq': 5,
        'seed': 42
    }
    kwargs = {
        'train_set': tr_ds,
        'categorical_feature': cat_feats,
        'verbose_eval': args.verbose_eval,
        'num_boost_round': args.num_boost_round,
    }
    if args.mode not in ['full', 'fold']:
        kwargs['early_stopping_rounds'] = 200
        kwargs['valid_sets'] = valid_sets
    if args.lr_decay:
        # Per-iteration learning-rate schedule; the decay function is
        # defined elsewhere in this project.
        kwargs['callbacks'] = [
            lgb.reset_parameter(
                learning_rate=learning_rate_010_decay_power_0995)
        ]
    m = lgb.train(params, **kwargs)
    # Predictions are clipped to [0, 361] before scoring — presumably the
    # valid range of the target; TODO confirm against the data definition.
    tr_pred = np.clip(m.predict(tr_ds.data), 0, 361)
    tr_score = np.sqrt(mean_squared_error(tr_pred, tr_ds.label))
    if args.mode not in ['full', 'fold']:
        va_pred = np.clip(m.predict(va_ds.data), 0, 361)
        va_score = np.sqrt(mean_squared_error(va_pred, va_ds.label))
    else:
        va_score = 0.
    return m, tr_score, va_score
def get_importances_from_model(X, y, features=None):
    """Train a binary-classification LightGBM booster on the full data.

    Args:
        X (pandas.DataFrame): Feature matrix.
        y: Binary target aligned with ``X``.
        features (list, optional): Feature names; defaults to all columns of X.

    Returns:
        lightgbm.Booster: Trained booster (inspect ``feature_importance()``).
    """
    lgb_params = {
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'learning_rate': 0.02,
        'metric': 'auc',
        'colsample_bytree': 0.75,
        'subsample': 0.75,
        'n_estimators': 1500,
    }
    # Previously tried but disabled parameters: num_leaves=34, max_depth=8,
    # reg_alpha=0.041545473, reg_lambda=0.0735294, min_split_gain,
    # min_child_weight, silent.
    # FIX: use `is None` instead of `== None` — equality can be overloaded
    # (pandas objects raise/return arrays); identity is the correct test.
    if features is None:
        features = X.columns.tolist()
    lgb_train = Dataset(data=X, label=y, feature_name=features)
    lgb_booster = train(params=lgb_params,
                        train_set=lgb_train,
                        verbose_eval=50,
                        num_boost_round=1500)
    return lgb_booster
def test_onnxrt_python_lightgbm_categorical_iris(self):
    """ONNX conversion of LightGBM binary classifiers on integer iris data:
    once via the sklearn wrapper, once via a raw booster trained with
    categorical features. ONNX probabilities must match the originals."""
    iris = load_iris()
    X, y = iris.data, iris.target
    # Scale to integers so feature values can act as categories.
    X = (X * 10).astype(numpy.int32)
    X_train, X_test, y_train, _ = train_test_split(X, y, random_state=11)
    # Pad the training set with random rows / extra labels so every
    # category value is well represented.
    other_x = numpy.random.randint(0, high=10, size=(1500, X_train.shape[1]))
    X_train = numpy.vstack([X_train, other_x]).astype(dtype=numpy.int32)
    y_train = numpy.hstack([
        y_train,
        numpy.zeros(500) + 3,
        numpy.zeros(500) + 4,
        numpy.zeros(500) + 5
    ]).astype(dtype=numpy.int32)
    self.assertEqual(y_train.shape, (X_train.shape[0], ))
    # Collapse to a binary problem.
    y_train = y_train % 2
    # Classic sklearn-wrapper path.
    gbm = LGBMClassifier()
    gbm.fit(X_train, y_train)
    exp = gbm.predict_proba(X_test)
    onx = to_onnx(gbm,
                  initial_types=[('X',
                                  Int64TensorType([None, X_train.shape[1]]))])
    self.assertIn('ZipMap', str(onx))
    oif = OnnxInference(onx)
    got = oif.run({'X': X_test})
    values = pandas.DataFrame(got['output_probability']).values
    self.assertEqualArray(exp, values, decimal=5)
    # Raw booster path with categorical_feature=['c1', 'c2'].
    train_data = Dataset(X_train,
                         label=y_train,
                         feature_name=['c1', 'c2', 'c3', 'c4'],
                         categorical_feature=['c1', 'c2'])
    params = {
        "boosting_type": "gbdt",
        "learning_rate": 0.05,
        "n_estimators": 2,
        "objective": "binary",
        "max_bin": 5,
        "min_child_samples": 100,
        'verbose': -1,
    }
    booster = lgb_train(params, train_data)
    exp = booster.predict(X_test)
    onx = to_onnx(booster,
                  initial_types=[('X',
                                  Int64TensorType([None, X_train.shape[1]]))])
    self.assertIn('ZipMap', str(onx))
    oif = OnnxInference(onx)
    got = oif.run({'X': X_test})
    values = pandas.DataFrame(got['output_probability']).values
    # Booster.predict returns P(class 1) only, hence values[:, 1].
    self.assertEqualArray(exp, values[:, 1], decimal=5)
def test_lightgbm_booster_multi_classifier(self):
    """Convert a raw multiclass LightGBM booster to ONNX and compare the
    ONNX probabilities to the booster's own predictions."""
    X = [[0, 1], [1, 1], [2, 0], [1, 2], [-1, 2], [1, -2]]
    X = numpy.array(X, dtype=numpy.float32)
    y = [0, 1, 0, 1, 2, 2]
    data = Dataset(X, label=y)
    model = train(
        {
            'boosting_type': 'gbdt',
            'objective': 'multiclass',
            'n_estimators': 3,
            'min_child_samples': 1,
            'num_class': 3
        }, data)
    # Register a converter for every wrapper type the parser may emit.
    update_registered_converter(WrappedLightGbmBoosterClassifier,
                                'WrappedLightGbmBoosterClassifier',
                                calculate_lightgbm_output_shapes,
                                convert_lightgbm,
                                parser=lightgbm_parser,
                                options={
                                    'zipmap': [False, True],
                                    'nocl': [False, True]
                                })
    update_registered_converter(WrappedBooster,
                                'WrappedBooster',
                                calculate_lightgbm_output_shapes,
                                convert_lightgbm,
                                parser=lightgbm_parser,
                                options={
                                    'zipmap': [False, True],
                                    'nocl': [False, True]
                                })
    update_registered_converter(Booster,
                                'LightGbmBooster',
                                calculate_lightgbm_output_shapes,
                                convert_lightgbm,
                                parser=lightgbm_parser)
    # zipmap=False keeps the probabilities as a plain tensor output.
    model_onnx = to_onnx(
        model,
        initial_types=[('X', FloatTensorType([None, 2]))],
        options={WrappedLightGbmBoosterClassifier: {
            'zipmap': False
        }},
        target_opset={
            '': TARGET_OPSET,
            'ai.onnx.ml': TARGET_OPSET_ML
        })
    try:
        sess = InferenceSession(model_onnx.SerializeToString())
    except InvalidArgument as e:
        raise AssertionError("Cannot load model\n%r" % str(model_onnx)) from e
    expected = model.predict(X)
    res = sess.run(None, {'X': X})
    # res[0] is the label output; res[1] holds the class probabilities.
    assert_almost_equal(expected, res[1])
def lightgbm_trainer(training_data, label, model_params):
    """Train LightGBM model on training data.

    Args:
        training_data (pandas.DataFrame): Training data containing both the
            features and the target column.
        label (str): Target column in training data.
        model_params (dict): Training parameters.

    Returns:
        lightgbm.Booster: Trained LightGBM model.
    """
    # FIX: the label column was previously looked up via a module-level
    # ``LABEL`` constant instead of the ``label`` parameter, silently
    # ignoring the caller's choice of target column.
    training_data = Dataset(data=training_data.drop(label, axis=1),
                            label=training_data[label])
    return train(train_set=training_data, params=model_params)
def test_lightgbm_booster_classifier(self):
    """A random-forest-mode LightGBM booster converts to a non-empty
    ONNX model."""
    from lightgbm import Dataset, train as lgb_train
    features = numpy.array([[0, 1], [1, 1], [2, 0], [1, 2]],
                           dtype=numpy.float32)
    labels = [0, 1, 0, 1]
    # 'rf' boosting requires bagging to be configured explicitly.
    train_params = {
        'boosting_type': 'rf',
        'objective': 'binary',
        'n_estimators': 3,
        'min_child_samples': 1,
        'subsample_freq': 1,
        'bagging_fraction': 0.5,
        'feature_fraction': 0.5,
    }
    booster = lgb_train(train_params, Dataset(features, label=labels))
    onnx_model = to_onnx(booster, features, verbose=0,
                         rewrite_ops=True, target_opset=TARGET_OPSET)
    self.assertNotEmpty(onnx_model)
def test_onnxrt_python_lightgbm_categorical_iris_booster3_real(self):
    """ONNX conversion on float iris data: the sklearn wrapper's booster
    and a raw multiclass booster trained with categorical features must
    both reproduce their probabilities through ONNX inference."""
    from lightgbm import LGBMClassifier, Dataset, train as lgb_train
    iris = load_iris()
    X, y = iris.data, iris.target
    X = (X * 10).astype(numpy.float32)
    X_train, X_test, y_train, _ = train_test_split(
        X, y, random_state=11)
    # Classic sklearn-wrapper path (converting the underlying booster).
    gbm = LGBMClassifier()
    gbm.fit(X_train, y_train)
    exp = gbm.predict_proba(X_test)
    onx = to_onnx(gbm.booster_, initial_types=[
        ('X', FloatTensorType([None, X_train.shape[1]]))],
        target_opset=TARGET_OPSET)
    self.assertIn('ZipMap', str(onx))
    oif = OnnxInference(onx)
    got = oif.run({'X': X_test})
    values = pandas.DataFrame(got['output_probability']).values
    self.assertEqualArray(exp, values, decimal=5)
    # Raw booster path with categorical_feature=['c1', 'c2'].
    train_data = Dataset(
        X_train, label=y_train,
        feature_name=['c1', 'c2', 'c3', 'c4'],
        categorical_feature=['c1', 'c2'])
    params = {
        "boosting_type": "gbdt",
        "learning_rate": 0.05,
        "n_estimators": 2,
        "objective": "multiclass",
        "max_bin": 5,
        "min_child_samples": 100,
        'verbose': -1,
        'num_class': 3}
    booster = lgb_train(params, train_data)
    # Multiclass predict returns one probability column per class, so the
    # full matrix is compared (unlike the binary tests' values[:, 1]).
    exp = booster.predict(X_test)
    onx = to_onnx(booster, initial_types=[
        ('X', FloatTensorType([None, X_train.shape[1]]))],
        target_opset=TARGET_OPSET)
    self.assertIn('ZipMap', str(onx))
    oif = OnnxInference(onx)
    got = oif.run({'X': X_test})
    values = pandas.DataFrame(got['output_probability']).values
    self.assertEqualArray(exp, values, decimal=5)
def objective(params, n_folds=self.n_folds):
    """Hyperopt objective: cross-validated AUC for one LightGBM
    parameter sample; returns the minimised loss (1 - best CV AUC)
    plus bookkeeping fields in the format hyperopt expects."""
    self.iteration += 1
    # 'boosting_type' arrives as a nested dict (conditional search
    # space); flatten it and lift out its nested subsample value.
    boosting = params['boosting_type']
    params['boosting_type'] = boosting['boosting_type']
    params['subsample'] = boosting.get('subsample', 1.0)
    params['verbose'] = -1
    # Hyperopt samples floats; these parameters must be integers.
    for int_key in ('num_leaves', 'subsample_for_bin', 'min_child_samples'):
        params[int_key] = int(params[int_key])
    # NOTE: bounded histogram pool reduces memory consumption.
    params['histogram_pool_size'] = 1024
    self.logger.debug("Parameters: {}".format(params))
    t_start = timer()
    train_set = Dataset(x_train, label=y_train)
    # n-fold cross validation with early stopping on AUC.
    cv_results = cv(params,
                    train_set,
                    num_boost_round=10000,
                    nfold=n_folds,
                    early_stopping_rounds=100,
                    metrics='auc',
                    seed=self.seed)
    elapsed = timer() - t_start
    auc_history = cv_results['auc-mean']
    best_score = np.max(auc_history)
    # Boosting rounds that produced the highest CV score.
    best_rounds = int(np.argmax(auc_history) + 1)
    return {
        'loss': 1 - best_score,  # hyperopt minimises this
        'params': params,
        'iteration': self.iteration,
        'estimators': best_rounds,
        'train_time': elapsed,
        'status': STATUS_OK
    }
def fit_lightgbm(self, x, y, early_stopping_rounds):
    """Fit the LightGBM model, optionally with an early-stopping holdout.

    Args:
        x: Feature matrix.
        y: Target vector.
        early_stopping_rounds (int or None): When not None, a stratified
            holdout is carved out of (x, y) and used as the eval set.

    Side effects:
        Sets ``self.model`` to the fitted ``LGBMModel``.
    """
    self.model = LGBMModel(**self.optimized_params)
    if early_stopping_rounds is not None:
        # FIX: the old code called train_test_split(x, ...) with a single
        # array, which returns (x_train, x_test), and unpacked that pair
        # as (x_valid, y_valid) — so the eval "labels" were feature rows.
        # Split x and y together instead.
        _, x_valid, _, y_valid = train_test_split(
            x, y,
            stratify=y,
            shuffle=True,
            test_size=self.test_size,
            random_state=self.random_state)
        # FIX: the sklearn fit() API expects eval_set as a list of
        # (X, y) tuples, not a lightgbm.Dataset.
        # NOTE(review): as before, the model is still fit on all of (x, y),
        # so the holdout overlaps the training data — confirm intent.
        self.model.fit(x, y,
                       eval_set=[(x_valid, y_valid)],
                       early_stopping_rounds=early_stopping_rounds,
                       verbose=self.verbose)
    else:
        self.model.fit(x, y)
# seeds=[i for i in range(100)] seed = None datafilepath = './data/cleanData.csv' label_flag = 'categorical' test_size = 0.2 if __name__ == '__main__': x, y = loadXY(datafilepath, label_flag) x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=test_size, random_state=seed, shuffle=True) # x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=test_size, random_state=seed, shuffle=True) # ''' train_data = Dataset(x_train, label=y_train) # val_data = Dataset(x_val, label=y_val) test_data = Dataset(x_test, label=y_test) param = { 'num_leaves': 100, 'num_trees': 300, 'objective': 'binary', 'metric': ['auc', 'binary_logloss'] } num_round = 10 # bstcv = lgb.cv(param, train0_data, num_round, nfold=10) bst = lgb.train(param, train_data, num_round, valid_sets=[train_data],
def test_onnxrt_python_lightgbm_categorical_iris_dataframe(self):
    """ONNX conversion of a booster trained on a DataFrame whose columns
    all carry pandas 'category' dtype; checked against the python
    runtime, onnxruntime, and with the 'cast' conversion option."""
    iris = load_iris()
    X, y = iris.data, iris.target
    # Scale to integers so feature values can act as categories.
    X = (X * 10).astype(numpy.int32)
    X_train, X_test, y_train, _ = train_test_split(X, y, random_state=11)
    # Pad the training set with random rows / extra labels so every
    # category value is well represented.
    other_x = numpy.random.randint(0, high=10, size=(1500, X_train.shape[1]))
    X_train = numpy.vstack([X_train, other_x]).astype(dtype=numpy.int32)
    y_train = numpy.hstack([
        y_train,
        numpy.zeros(500) + 3,
        numpy.zeros(500) + 4,
        numpy.zeros(500) + 5
    ]).astype(dtype=numpy.int32)
    self.assertEqual(y_train.shape, (X_train.shape[0], ))
    # Collapse to a binary problem.
    y_train = y_train % 2
    # Build train/test frames with every column as pandas 'category'.
    df_train = pandas.DataFrame(X_train)
    df_train.columns = ['c1', 'c2', 'c3', 'c4']
    df_train['c1'] = df_train['c1'].astype('category')
    df_train['c2'] = df_train['c2'].astype('category')
    df_train['c3'] = df_train['c3'].astype('category')
    df_train['c4'] = df_train['c4'].astype('category')
    df_test = pandas.DataFrame(X_test)
    df_test.columns = ['c1', 'c2', 'c3', 'c4']
    df_test['c1'] = df_test['c1'].astype('category')
    df_test['c2'] = df_test['c2'].astype('category')
    df_test['c3'] = df_test['c3'].astype('category')
    df_test['c4'] = df_test['c4'].astype('category')
    # Categorical features are inferred from the DataFrame dtypes.
    train_data = Dataset(df_train, label=y_train)
    params = {
        "boosting_type": "gbdt",
        "learning_rate": 0.05,
        "n_estimators": 2,
        "objective": "binary",
        "max_bin": 5,
        "min_child_samples": 100,
        'verbose': -1,
    }
    booster = lgb_train(params, train_data)
    exp = booster.predict(X_test)
    onx = to_onnx(booster, df_train)
    self.assertIn('ZipMap', str(onx))
    # Python runtime.
    oif = OnnxInference(onx)
    got = oif.run(df_test)
    values = pandas.DataFrame(got['output_probability']).values
    # Booster.predict returns P(class 1) only, hence values[:, 1].
    self.assertEqualArray(exp, values[:, 1], decimal=5)
    # onnxruntime backend on the same model.
    onx.ir_version = get_ir_version_from_onnx()
    oif = OnnxInference(onx, runtime='onnxruntime1')
    got = oif.run(df_test)
    values = pandas.DataFrame(got['output_probability']).values
    self.assertEqualArray(exp, values[:, 1], decimal=5)
    # Conversion with the 'cast' option must insert a Cast node and
    # still produce the same probabilities.
    onx = to_onnx(booster, df_train, options={booster.__class__: {
        'cast': True
    }})
    self.assertIn('op_type: "Cast"',
                  str(onx))
    oif = OnnxInference(onx)
    got = oif.run(df_test)
    values = pandas.DataFrame(got['output_probability']).values
    self.assertEqualArray(exp, values[:, 1], decimal=5)
"""Score the held-out test split with a trained LightGBM model and write
the submission CSV."""
import torch
import pandas as pd
from lightgbm import Dataset
import numpy as np

# Preprocessed feature frame persisted with torch.save; keep the test split.
df = torch.load('prep/vanilla0.pt')
dtest = df[df.split == 'test']
ids = dtest.pop('id')

# Trained booster persisted with torch.save.
m = torch.load('model_full.pt')

# FIX: removed the unused `cols` list comprehension — the feature columns
# are taken from the model itself so order/selection always match training.
x_te = dtest[m.feature_name()]
te_ds = Dataset(x_te, free_raw_data=False)
te_pred = m.predict(te_ds.data)

sub = pd.read_csv('data/sample_submission.csv')
# FIX: explicit check instead of `assert`, which is stripped under -O.
if not np.all(sub.id.values == ids.values):
    raise ValueError("sample_submission ids do not match test split ids")
sub.ad_periods = te_pred
sub.to_csv('pred/nov12.csv', index=False)
# corr.to_csv("ModelCorr.csv") # # for col in corr.columns: # # print(corr[col][corr[col] < 0.9].index) # selected_columns = ["preds_01", "preds_03", "preds_07", "preds_10"] # df = preds[selected_columns] # print(df.head(10)) # df["TARGET"] = df.mean(axis=1) # df[["TARGET"]].to_csv("Ensemble_LowCorr.csv") # Convert data to DMatrix lgb_train = Dataset(X_train, y_train) lgb_test = Dataset(X_test) # Define estimator parameters params = { 'task': 'train', 'objective': 'binary', 'learning_rate': 0.1, 'num_leaves': 31, 'max_depth': 8, 'min_data_in_leaf': 20, 'min_sum_hessian_in_leaf': 0.001, 'lambda_l1': 0, 'lambda_l2': 0, 'scale_pos_weight': 1, 'metric': 'auc',
test_size = 0.4 if __name__ == '__main__': x, y = loadXY(datafilepath, label_flag) x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=test_size, random_state=seed, shuffle=True) x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=test_size, random_state=seed, shuffle=True) # ''' train_data = Dataset(x_train, label=y_train) val_data = Dataset(x_val, label=y_val) test_data = Dataset(x_test, label=y_test) param = { 'num_leaves': 31, 'num_trees': 100, 'objective': 'binary', 'metric': ['auc', 'binary_logloss'] } num_round = 10 bstcv = lgb.cv(param, train_data, num_round, nfold=10) bst = lgb.train(param, train_data, num_round, valid_sets=[val_data],
def train_lgb(model_output_path, valid_limit=500, thread_num=2, save_rounds=100, num_boost_round=2000, former_model_path=None, max_epochs=100, batch_size=100, nb_worker=4, mini_batch_size=3000, limit=2000, iteration_per_epoch=100):
    # NOTE: Python 2 code (range concatenation, xrange, print statements,
    # cPickle) — do not port piecemeal.
    # Trains an L2-regression LightGBM booster incrementally: each inner
    # iteration continues training (init_model) on a fresh chunk pulled
    # from gzip files, saving the model every `save_rounds` rounds.
    # NOTE(review): several parameters (valid_limit, batch_size, nb_worker,
    # mini_batch_size, limit) are unused in this body — confirm intent.
    train_file_numbers = range(1, 540) + range(750, 800) + range(
        870, 920) + range(970, 1020) + range(1100, 1200)
    valid_file_numbers = range(400, 440) + range(700, 750) + range(
        845, 870) + range(945, 970) + range(1045, 1100)
    test_file_numbers = range(540, 640) + range(800, 845) + range(
        920, 945) + range(1020, 1045) + range(1200, 1214)
    DATA_ROOT = '/media/user/Data0/hjw/datas/Quant_Datas_v4.0//gzip_datas_norm'
    train_filepath_list = [
        os.path.join(DATA_ROOT, '%s_trans_norm.gz' % fn)
        for fn in train_file_numbers
    ]
    valid_filepath_list = [
        os.path.join(DATA_ROOT, '%s_trans_norm.gz' % fn)
        for fn in valid_file_numbers
    ]
    # basic_model = build_model(feature_dim=4560, output_dim=1)
    # train_filepath_list = ['/Users/jayveehe/git_project/FundDataAnalysis/pipelines/datas/gzip_datas/993_trans.gz']
    # valid_filepath_list = ['/Users/jayveehe/git_project/FundDataAnalysis/pipelines/datas/gzip_datas/993_trans.gz']
    # train_generator = gzip_sample_generator(train_filepath_list, batch_size=1000, total_limit=1000000,
    #                                         per_file_limit=10000)
    valid_generator = gzip_sample_generator(valid_filepath_list,
                                            batch_size=50000,
                                            total_limit=100000,
                                            per_file_limit=10000)
    params = {
        'objective': 'regression_l2',
        'num_leaves': 128,
        'boosting': 'gbdt',
        'feature_fraction': 0.9,
        'bagging_fraction': 0.7,
        'bagging_freq': 100,
        'verbose': 0,
        'is_unbalance': False,
        'metric': 'l1,l2,huber',
        'num_threads': thread_num
    }
    # Optionally resume from a pickled booster.
    if former_model_path:
        former_model = cPickle.load(open(former_model_path, 'rb'))
    else:
        former_model = None
    tmp_model = former_model
    tmp_num = num_boost_round
    # A single fixed validation chunk is reused for every training round.
    valid_x, valid_y = next(valid_generator)
    valid_set = Dataset(valid_x, valid_y, free_raw_data=False)
    gbm = None
    eval_res = {}
    for epoch in xrange(max_epochs):
        # Fresh generator each epoch so training re-reads the files.
        train_generator = gzip_sample_generator(train_filepath_list,
                                                batch_size=50000,
                                                total_limit=1000000,
                                                per_file_limit=100000)
        for iter_n in xrange(iteration_per_epoch):
            train_x, train_y = next(train_generator)
            tmp_dataset = Dataset(train_x, train_y, free_raw_data=False)
            # if not gbm:
            # Continue training from the previous booster (init_model);
            # learning rate decays with the iteration number, floored at 0.008.
            gbm = lgb.train(params,
                            tmp_dataset,
                            num_boost_round=save_rounds,
                            early_stopping_rounds=30,
                            keep_training_booster=True,
                            learning_rates=lambda iter_num: max(
                                1 * (0.98**iter_num /
                                     (iteration_per_epoch * 0.05)), 0.008),
                            valid_sets=[valid_set],
                            init_model=tmp_model,
                            evals_result=eval_res)
            # else:
            #     gbm.update(train_set=tmp_dataset)
            tmp_model = gbm
            # print 'eval result: %s' % eval_res
            print 'saving model'
            # Checkpoint after every chunk so progress survives a crash.
            gbm.save_model(model_output_path)
def _train(params: dict, x_train: np.ndarray, y_train: np.ndarray, loss_func, *args,
           x_valid: np.ndarray = None, y_valid: np.ndarray = None,
           loss_func_grad: Callable[[float, float], float] = None,
           loss_func_eval: Callable[[float, float], float] = None,
           use_custom_dataset: bool = False, func_train=None, **kwargs):
    """Train via ``func_train`` (an lgb.train-compatible callable), wiring up
    custom objective/metric functions when provided.

    Params::
        loss_func: custom loss callable or string.
            string: follows the official lightgbm objectives.
                https://lightgbm.readthedocs.io/en/latest/Parameters.html#core-parameters
                binary, multiclass, regression_l1, huber, ...
            callable: treated as a custom loss.
        loss_func_grad: set a function returning (grad, hess) if available.
            If None and loss_func is custom, grad/hess are derived
            automatically (scipy-based) via calc_grad_hess.
            If None and loss_func is a string, lightgbm's built-in
            implementation is used.
        loss_func_eval: eval function or string.
            callable: embedded through func_embed with calc_type="mean".
            string: follows the official lightgbm metrics.
                https://lightgbm.readthedocs.io/en/latest/Parameters.html#metric-parameters
            None: if loss_func is a string it is reused as the metric;
            if loss_func is a callable it is embedded via func_embed.
    """
    logger.info("START")
    # Build the training dataset (custom label handling if requested).
    dataset = None
    if use_custom_dataset:
        dataset = KkLgbDataset(x_train)
        dataset.set_culstom_label(y_train)
    else:
        dataset = Dataset(x_train, label=y_train)
    # Normalise x_valid/y_valid to lists so multiple eval sets are supported.
    if not (isinstance(x_valid, list) or isinstance(x_valid, tuple)):
        x_valid = [] if x_valid is None else [x_valid]
        y_valid = [] if y_valid is None else [y_valid]
    # The training set itself is always the first validation set.
    list_dataset_valid = [dataset]
    for _x_valid, _y_valid in zip(x_valid, y_valid):
        if use_custom_dataset:
            list_dataset_valid.append(KkLgbDataset(_x_valid))
            list_dataset_valid[-1].set_culstom_label(_y_valid)
        else:
            list_dataset_valid.append(Dataset(_x_valid, label=_y_valid))
    # Resolve the custom objective (fobj), auto-deriving grad/hess when a
    # callable loss was given without an explicit gradient function.
    fobj = None
    if loss_func_grad is None and (not isinstance(loss_func, str)):
        loss_func_grad = partial(calc_grad_hess, loss_func=loss_func)
    if loss_func_grad is not None:
        fobj = lambda x, y: lgb_custom_objective(
            x, y, loss_func_grad, is_lgbdataset=True)
    # Resolve the eval metric (feval) per the docstring's precedence rules.
    feval = None
    if loss_func_eval is not None and (not isinstance(loss_func_eval, str)):
        feval = lambda x, y: lgb_custom_eval(x, y, func_embed(loss_func_eval,
                                                              calc_type="mean"), "myloss", is_higher_better=False, is_lgbdataset=True)
    elif loss_func_eval is None:
        if isinstance(loss_func, str):
            loss_func_eval = loss_func
        else:
            feval = lambda x, y: lgb_custom_eval(x, y, func_embed(loss_func,
                                                                  calc_type="mean"), "myloss", is_higher_better=False, is_lgbdataset=True)
    # String objectives/metrics go through params instead of fobj/feval.
    if fobj is None and isinstance(loss_func, str):
        params["objective"] = loss_func
    if feval is None and isinstance(loss_func_eval, str):
        params["metric"] = loss_func_eval
    evals_result = {}  # metric history filled in by training
    logger.info(
        f"params: {params}, dataset: {dataset}, fobj: {fobj}, feval: {feval}")
    obj = func_train(
        params, dataset,
        valid_sets=list_dataset_valid,
        valid_names=["train"] +
        ["valid" + str(i) for i in range(len(list_dataset_valid) - 1)],
        fobj=fobj, feval=feval,
        evals_result=evals_result,
        **kwargs)
    logger.info("END")
    return obj
def kfold_lightgbm(df, num_folds, stratified=False, epochs=1, corr_save=False, importance_save=False):
    """K-fold LightGBM for the Home-Credit-style task: rows with non-null
    TARGET are the training set, rows with null TARGET the test set.
    Repeats the whole CV `epochs` times (re-seeded folds after the first
    epoch) and averages predictions across epochs; writes the submission
    CSV and optionally correlation/importance CSVs as side effects.
    Relies on module-level `submission_file_name` and `display_importances`.
    """
    df = df.drop('Unnamed: 0', axis=1)
    # Divide in training/validation and test data
    train_df = df[df['TARGET'].notnull()]
    test_df = df[df['TARGET'].isnull()]
    # Correlation csv processing
    if corr_save == True:
        target_corr = train_df.corr()['TARGET'].sort_values()
        corr_df = pd.DataFrame()
        corr_df['feature'] = target_corr.index
        corr_df['corr'] = target_corr.values
        corr_df = corr_df[corr_df['feature'] != 'feature']
        corr_df.to_csv('../output/correlation.csv')
        del target_corr, corr_df
    # Delete variables from memory
    del df
    print("Starting LightGBM. Train shape: {}, test shape: {}".format(
        train_df.shape, test_df.shape))
    # Initialise predictions and importance dataframes and epoch weights
    sub_df = test_df[['SK_ID_CURR']].copy()
    sub_df['TARGET'] = 0
    ep_ave = 1 / epochs  # each epoch contributes equally to the average
    epv_preds = np.zeros(train_df.shape[0])
    epv_df = train_df[['SK_ID_CURR']].copy()
    epv_df['TARGET'] = 0
    feature_importance_df = pd.DataFrame()
    for n in range(epochs):
        print('Epoch number {} of {} starting'.format(n + 1, epochs))
        # Cross validation model: fixed seed for a single epoch (repro),
        # fresh random folds per epoch when averaging several epochs.
        if epochs == 1:
            if stratified:
                folds = StratifiedKFold(n_splits=num_folds,
                                        shuffle=True,
                                        random_state=1001)
            else:
                folds = KFold(n_splits=num_folds,
                              shuffle=True,
                              random_state=1001)
        else:
            if stratified:
                folds = StratifiedKFold(n_splits=num_folds, shuffle=True)
            else:
                folds = KFold(n_splits=num_folds, shuffle=True)
        # Create arrays and dataframes to store results
        oof_preds = np.zeros(train_df.shape[0])
        sub_preds = np.zeros(test_df.shape[0])
        feats = [
            f for f in train_df.columns
            if f not in ['TARGET', 'SK_ID_CURR', 'SK_ID_BUREAU', 'SK_ID_PREV', 'index']
        ]
        for n_fold, (train_idx, valid_idx) in enumerate(
                folds.split(train_df[feats], train_df['TARGET'])):
            dtrain = Dataset(data=train_df[feats].iloc[train_idx],
                             label=train_df['TARGET'].iloc[train_idx],
                             free_raw_data=False,
                             silent=True)
            dvalid = Dataset(data=train_df[feats].iloc[valid_idx],
                             label=train_df['TARGET'].iloc[valid_idx],
                             free_raw_data=False,
                             silent=True)
            # LightGBM parameters found by Bayesian optimization
            params = {
                'objective': 'binary',
                'boosting_type': 'gbdt',  # 'goss'
                'nthread': 4,
                'learning_rate': 0.02,  # 02,
                'num_leaves': 20,
                'colsample_bytree': 0.9497036,
                'subsample': 0.8715623,
                'subsample_freq': 1,
                'max_depth': 8,
                'reg_alpha': 0.041545473,
                'reg_lambda': 0.0735294,
                'min_split_gain': 0.0222415,
                'min_child_weight': 60,  #39.3259775
                'seed': 0,
                'verbose': -1,
                'metric': 'auc',
            }
            clf = train(params=params,
                        train_set=dtrain,
                        num_boost_round=10000,
                        valid_sets=[dtrain, dvalid],
                        early_stopping_rounds=200,
                        verbose_eval=100)
            # (an alternative, faster parameter set and train() call was
            # previously kept here commented out; removed for clarity)
            # Out-of-fold predictions fill the validation slice; test
            # predictions are averaged across folds.
            oof_preds[valid_idx] = clf.predict(dvalid.data)
            sub_preds += clf.predict(test_df[feats]) / folds.n_splits
            fold_importance_df = pd.DataFrame()
            fold_importance_df["feature"] = feats
            fold_importance_df["importance"] = clf.feature_importance(
                importance_type='gain')
            fold_importance_df["fold"] = n_fold + 1
            feature_importance_df = pd.concat(
                [feature_importance_df, fold_importance_df], axis=0)
            print('Fold %2d AUC : %.6f' %
                  (n_fold + 1,
                   roc_auc_score(dvalid.label, oof_preds[valid_idx])))
            del clf, dtrain, dvalid
        print('Full AUC score %.6f' %
              roc_auc_score(train_df['TARGET'], oof_preds))
        # Write submission file and plot feature importance
        sub_df['TARGET'] += ep_ave * sub_preds
        epv_preds += ep_ave * oof_preds
        # epv_df['TARGET'] += ep_ave*oof_preds
        print('Epoch number {} of {} ended'.format(n + 1, epochs))
    sub_df[['SK_ID_CURR', 'TARGET']].to_csv(submission_file_name, index=False)
    # epv_df[['SK_ID_CURR', 'TARGET']].to_csv('../data/lightgbm-fast-train.csv', index= False)
    print('Full AUC score over all epochs %.6f' %
          roc_auc_score(train_df['TARGET'], epv_preds))
    display_importances(feature_importance_df)
    # Save feature importance df as csv
    if importance_save == True:
        feature_importance_df = feature_importance_df.groupby('feature').agg(
            'mean').drop('fold', axis=1).sort_values('importance')
        feature_importance_df.to_csv('../output/importance_3.9.csv')
#tr_df = tr_df[cols] #te_df = te_df[cols] # svr_43_int is not fit at all .. ignored #cols = [c for c in tr_df.columns if c.startswith('lgb')] #tr_df = tr_df[cols] #te_df = te_df[cols] #tr_df.drop(cols_to_drop, axis=1, inplace=True) #te_df.drop(cols_to_drop, axis=1, inplace=True) x_tr, x_va, y_tr, y_va = train_test_split(tr_df, y, test_size=0.1, shuffle=True, random_state=42) tr_ds = Dataset(x_tr, label=y_tr, free_raw_data=False) va_ds = Dataset(x_va, label=y_va, free_raw_data=False) valid_sets = [tr_ds, va_ds] #hpsearch_lgb(x_tr, y_tr, x_va, y_va) params = { 'learning_rate': 0.001, 'max_depth': -1, 'boosting': 'gbdt', 'objective': 'regression', 'metric': 'rmse', 'is_training_metric': True, 'feature_fraction': 0.9, 'bagging_fraction': 0.7, 'lambda_l2': 0.2,
min_child_weight=0.001, min_split_gain=0.0, n_estimators=5, n_jobs=1, num_leaves=31, objective='mse', random_state=0, reg_alpha=0.0, reg_lambda=0.0, silent=True, subsample=1., subsample_for_bin=200000, subsample_freq=0) X, y = load_boston(return_X_y=True) dataset = Dataset(data=X, label=y, free_raw_data=False) train_index = np.arange(0, 100) test_index = np.arange(100, 200) train_set0 = Dataset(X[train_index], y[train_index], free_raw_data=False).construct() train_set1 = dataset.subset(used_indices=train_index).construct() assert_array_equal(train_set0.data, train_set1.data) assert_array_equal(train_set0.label, train_set1.label) booster0 = train(params=params, train_set=train_set0, num_boost_round=5) booster1 = train(params=params, train_set=train_set1, num_boost_round=5) pred0 = booster0.predict(X[test_index])
def lgbm_feat_selector(train_x, train_y, valid_x, valid_y, params=PARAMS, drop_size=1, keep_size=20):
    """Backward feature elimination driven by LightGBM gain importance.

    On each round: cross-validate on the surviving features, record the
    best CV AUC and its round count, refit a model with that many rounds,
    then drop the `drop_size` least-important features. Stops once at most
    `keep_size` features remain.

    :param train_x: training features (DataFrame)
    :param train_y: training labels (Series)
    :param valid_x: validation features (DataFrame)
    :param valid_y: validation labels (Series)
    :param params: lightgbm parameters
    :param drop_size: features removed per round
    :param keep_size: stop when this many (or fewer) features remain
    :return: list[dict({"round": select_round,
                        "features": left_features_this_round,
                        "train_round": best_iteration_of_model,
                        "auc": auc-score})]
    """
    def _banner(text):
        # Progress separator, identical output to the original prints.
        print("-" * 25 + text + "-" * 25)

    history = []
    selected = list(train_x.columns)
    n_round = 0
    while len(selected) > keep_size:
        _banner("selector round {}".format(n_round))
        tr_sub = train_x[selected]
        va_sub = valid_x[selected]
        ds_train = Dataset(tr_sub.values, train_y.values)
        ds_valid = Dataset(va_sub.values, valid_y.values)
        _banner("training")
        cv_auc = lgb.cv(params=params,
                        train_set=ds_train,
                        nfold=4,
                        stratified=True,
                        shuffle=True,
                        early_stopping_rounds=10,
                        num_boost_round=1000)["auc-mean"]
        _banner("saving result")
        history.append({
            "round": n_round,
            "features": selected,
            "train_round": len(cv_auc),  # best iteration found by CV
            "auc": max(cv_auc)
        })
        # Refit with the CV-selected round count to rank the features.
        model = lgb.train(params=params,
                          train_set=ds_train,
                          num_boost_round=len(cv_auc),
                          valid_sets=[ds_train, ds_valid])
        del ds_train, ds_valid
        _banner("dropping tail {} features".format(drop_size))
        importance = pd.Series(model.feature_importance(),
                               index=tr_sub.columns).sort_values()
        weakest = list(importance.head(drop_size).index)
        selected = list(set(selected) - set(weakest))
        n_round += 1
    return history
# Run the previously converted model with onnxruntime and print its output.
sess = rt.InferenceSession(onx.SerializeToString())
input_name = sess.get_inputs()[0].name
label_name = sess.get_outputs()[0].name
pred_onx = sess.run([label_name],
                    {input_name: X_test.astype(numpy.float32)})[0]
print(pred_onx)

###############################################
# With Dataset
# ++++++++++++
#
# Huge datasets cannot be handled with the scikit-learn API.
# A lightgbm.Dataset must be used. Let's see how to convert the
# trained model.

# Train a raw booster straight from a Dataset (no sklearn wrapper).
dtrain = Dataset(X_train, label=y_train)
param = {'objective': 'multiclass', 'num_class': 3}
bst = train_lgbm(param, dtrain, 10)

# Convert the booster; the input is declared as 4 float features.
initial_type = [('float_input', FloatTensorType([None, 4]))]
onx = convert_lightgbm(bst, initial_types=initial_type)

# Score the converted booster with onnxruntime and print its output.
sess = rt.InferenceSession(onx.SerializeToString())
input_name = sess.get_inputs()[0].name
label_name = sess.get_outputs()[0].name
pred_onx = sess.run([label_name],
                    {input_name: X_test.astype(numpy.float32)})[0]
print(pred_onx)

##################################
# Assemble stacking features: concatenate every per-model prediction CSV
# (train and test) column-wise into one frame each.
tr_dfs = []
for fn in baseroot.glob("*_tr.csv"):
    tr_dfs.append(pd.read_csv(fn))
tr_df = pd.concat(tr_dfs, axis=1)
# NOTE(review): `te_dfs` is never initialised in this chunk — unless it is
# defined earlier in the file this loop raises NameError; confirm.
for fn in baseroot.glob("*_te.csv"):
    te_dfs.append(pd.read_csv(fn))
te_df = pd.concat(te_dfs, axis=1)

# svr_43_int is not fit at all .. ignored
tr_df.drop('svr_43_int', axis=1, inplace=True)
te_df.drop('svr_43_int', axis=1, inplace=True)

tr_ds = Dataset(tr_df, label=y, free_raw_data=False)
te_ds = Dataset(te_df, free_raw_data=False)

#hpsearch_lgb(x_tr, y_tr, x_va, y_va)
#params = {
#    'learning_rate': 0.02,
#    'max_depth': 3,
#    'boosting': 'gbdt',
#    'objective': 'regression',
#    'metric': 'rmse',
#    'is_training_metric': True,
#    'feature_fraction': 0.9,
#    'bagging_fraction': 0.7,
#    'lambda_l2': 0.7,
#    'bagging_freq': 5,
#    'seed':42
def __call__(self, trial):
    """Optuna objective over k folds: for each fold, load LETOR data as
    libsvm files, build grouped LightGBM Datasets, run one inner-study
    trial that trains and saves a ranker, then score the saved model with
    nDCG@k on the validation fold. Returns the k-fold average."""
    # sample params
    model_id = self.model_parameter.model_id
    para_dict = self.model_parameter.grid_search(trial)
    self.setup_eval(data_dict=self.data_dict, eval_dict=self.eval_dict)
    k_flod_average = 0.
    for i in range(self.fold_num):
        # evaluation over k-fold data
        fold_k = i + 1
        study = self.k_studies[i]
        train_data, test_data, vali_data = self.load_data(
            self.eval_dict, self.data_dict, fold_k)
        data_id = self.data_dict['data_id']
        train_presort, validation_presort, test_presort = self.data_dict['train_presort'], self.data_dict['validation_presort'],\
            self.data_dict['test_presort']
        file_train, file_vali, file_test = self.determine_files(
            data_dict=self.data_dict, fold_k=fold_k)
        self.update_save_model_dir(data_dict=self.data_dict, fold_k=fold_k)
        # prepare training & testing datasets
        file_train_data, file_train_group = load_letor_data_as_libsvm_data(
            file_train,
            split_type=SPLIT_TYPE.Train,
            data_dict=self.data_dict,
            eval_dict=self.eval_dict,
            presort=train_presort)
        x_train, y_train = load_svmlight_file(file_train_data)
        # Query group sizes for the ranking task.
        group_train = np.loadtxt(file_train_group)
        train_set = Dataset(data=x_train, label=y_train, group=group_train)
        file_test_data, file_test_group = load_letor_data_as_libsvm_data(
            file_test,
            split_type=SPLIT_TYPE.Test,
            data_dict=self.data_dict,
            eval_dict=self.eval_dict,
            presort=test_presort)
        x_test, y_test = load_svmlight_file(file_test_data)
        group_test = np.loadtxt(file_test_group)
        file_vali_data, file_vali_group = load_letor_data_as_libsvm_data(
            file_vali,
            split_type=SPLIT_TYPE.Validation,
            data_dict=self.data_dict,
            eval_dict=self.eval_dict,
            presort=validation_presort)
        x_valid, y_valid = load_svmlight_file(file_vali_data)
        group_valid = np.loadtxt(file_vali_group)
        valid_set = Dataset(data=x_valid, label=y_valid, group=group_valid)
        # Either a sklearn-style LGBMRanker instance or the lgbm module
        # itself (the objective decides which API to call).
        if para_dict['custom_dict']['custom'] and para_dict['custom_dict'][
                'use_LGBMRanker']:
            lgbm_ranker = lgbm.LGBMRanker()
        else:
            lgbm_ranker = lgbm
        study.optimize(TreeLTRObjective(model_id=model_id,
                                        data_id=data_id,
                                        x_train=x_train,
                                        y_train=y_train,
                                        group_train=group_train,
                                        train_set=train_set,
                                        x_valid=x_valid,
                                        y_valid=y_valid,
                                        group_valid=group_valid,
                                        valid_set=valid_set,
                                        ranker=lgbm_ranker,
                                        fold_k=fold_k,
                                        para_dict=para_dict,
                                        data_dict=self.data_dict,
                                        eval_dict=self.eval_dict,
                                        save_model_dir=self.save_model_dir),
                       n_trials=1)  # ??? the meaning of n_trials
        # store loss: reload the model the trial saved and score it.
        if data_id in YAHOO_LTR:
            model_file = self.save_model_dir + 'model.txt'
        else:
            model_file = self.save_model_dir + '_'.join(
                ['fold', str(fold_k), 'model']) + '.txt'
        lgbm_ranker = lgbm.Booster(model_file=model_file)
        vali_eval_tmp = ndcg_at_k(ranker=lgbm_ranker,
                                  test_data=vali_data,
                                  k=self.vali_k,
                                  label_type=vali_data.label_type,
                                  gpu=self.gpu,
                                  device=self.device)
        vali_eval_v = vali_eval_tmp.data.numpy()
        k_flod_average += vali_eval_v
    # calculate loss todo average k-fold validation score
    k_flod_average /= self.fold_num
    return k_flod_average
def run(self, fold_k, file_train, file_vali, file_test, data_dict=None, eval_dict=None, save_model_dir=None):
    """Run lambdaMART model based on the specified datasets.

    Trains a LightGBM ranker on ``file_train`` (optionally early-stopped on
    ``file_vali``), saves the model under ``save_model_dir`` and predicts on
    ``file_test``.

    :param fold_k: 1-based fold index, used in the saved model file name.
    :param file_train: path of the training split (LETOR format).
    :param file_vali: path of the validation split.
    :param file_test: path of the test split.
    :param data_dict: dataset settings; ``data_dict['data_id']`` selects the
        model-file naming scheme.
    :param eval_dict: evaluation settings; uses ``'do_validation'`` and
        ``'epochs'``.
    :param save_model_dir: directory/prefix the trained model is written to.
    :return: tuple ``(y_test, group_test, y_pred)`` — test labels, test query
        group sizes, and fold-wise predictions.
    """
    data_id, do_validation = data_dict['data_id'], eval_dict['do_validation']
    # prepare training & testing datasets (libsvm data file + group-size file)
    file_train_data, file_train_group = \
        load_letor_data_as_libsvm_data(file_train, train=True, data_dict=data_dict, eval_dict=eval_dict)
    x_train, y_train = load_svmlight_file(file_train_data)
    group_train = np.loadtxt(file_train_group)
    train_set = Dataset(data=x_train, label=y_train, group=group_train)
    file_test_data, file_test_group = \
        load_letor_data_as_libsvm_data(file_test, data_dict=data_dict, eval_dict=eval_dict)
    x_test, y_test = load_svmlight_file(file_test_data)
    group_test = np.loadtxt(file_test_group)
    # test_set = Dataset(data=x_test, label=y_test, group=group_test)
    if do_validation:  # prepare validation dataset if needed
        file_vali_data, file_vali_group = \
            load_letor_data_as_libsvm_data(file_vali, data_dict=data_dict, eval_dict=eval_dict)
        x_valid, y_valid = load_svmlight_file(file_vali_data)
        group_valid = np.loadtxt(file_vali_group)
        valid_set = Dataset(data=x_valid, label=y_valid, group=group_valid)
        if self.custom_dict['custom'] and self.custom_dict['use_LGBMRanker']:
            # custom objective through the sklearn-style LGBMRanker API
            lgbm_ranker = lgbm.LGBMRanker()
            lgbm_ranker.set_params(**self.lightgbm_para_dict)
            '''
            objective : string, callable or None, optional (default=None)
            Specify the learning task and the corresponding learning objective or a custom objective function to be used (see note below).
            Default: 'regression' for LGBMRegressor, 'binary' or 'multiclass' for LGBMClassifier, 'lambdarank' for LGBMRanker.
            '''
            custom_obj_dict = dict(objective=self.get_custom_obj(
                custom_obj_id=self.custom_dict['custom_obj_id']))
            lgbm_ranker.set_params(**custom_obj_dict)
            '''
            eval_set (list or None, optional (default=None)) – A list of (X, y) tuple pairs to use as validation sets.
            cf. https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.LGBMRanker.html
            '''
            # NOTE(review): early_stopping_rounds is set to eval_dict['epochs'] —
            # i.e. the patience equals the intended epoch budget; confirm intended.
            lgbm_ranker.fit(x_train, y_train, group=group_train,
                            eval_set=[(x_valid, y_valid)], eval_group=[group_valid],
                            eval_at=[5], early_stopping_rounds=eval_dict['epochs'],
                            verbose=10)
        elif self.custom_dict['custom']:  # use the argument of fobj
            lgbm_ranker = lgbm.train(
                params=self.lightgbm_para_dict, verbose_eval=10,
                train_set=train_set, valid_sets=[valid_set],
                early_stopping_rounds=eval_dict['epochs'],
                fobj=self.get_custom_obj(custom_obj_id=self.custom_dict['custom_obj_id'], fobj=True))
        else:  # trained booster as ranker
            lgbm_ranker = lgbm.train(
                params=self.lightgbm_para_dict, verbose_eval=10,
                train_set=train_set, valid_sets=[valid_set],
                early_stopping_rounds=eval_dict['epochs'])
    else:  # without validation
        if self.custom_dict['custom'] and self.custom_dict['use_LGBMRanker']:
            lgbm_ranker = lgbm.LGBMRanker()
            lgbm_ranker.set_params(**self.lightgbm_para_dict)
            custom_obj_dict = dict(objective=self.get_custom_obj(
                custom_obj_id=self.custom_dict['custom_obj_id']))
            lgbm_ranker.set_params(**custom_obj_dict)
            # NOTE(review): early_stopping_rounds is requested here with NO
            # eval_set — LightGBM requires at least one validation set for
            # early stopping, so this call looks like it would raise; confirm
            # whether this branch is ever exercised.
            lgbm_ranker.fit(x_train, y_train, group=group_train, verbose=10,
                            eval_at=[5], early_stopping_rounds=eval_dict['epochs'])
        elif self.custom_dict['custom']:  # use the argument of fobj
            lgbm_ranker = lgbm.train(
                params=self.lightgbm_para_dict, verbose_eval=10,
                train_set=train_set, num_boost_round=eval_dict['epochs'],
                fobj=self.get_custom_obj(custom_obj_id=self.custom_dict['custom_obj_id'], fobj=True))
        else:  # trained booster as ranker
            lgbm_ranker = lgbm.train(params=self.lightgbm_para_dict,
                                     verbose_eval=10, train_set=train_set,
                                     num_boost_round=eval_dict['epochs'])
    # persist the trained model; Yahoo data uses a single fixed file name
    if data_id in YAHOO_LTR:
        model_file = save_model_dir + 'model.txt'
    else:
        model_file = save_model_dir + '_'.join(['fold', str(fold_k), 'model']) + '.txt'
    if self.custom_dict['custom'] and self.custom_dict['use_LGBMRanker']:
        # sklearn wrapper: the underlying booster holds save_model()
        lgbm_ranker.booster_.save_model(model_file)
    else:
        lgbm_ranker.save_model(model_file)
    y_pred = lgbm_ranker.predict(x_test)  # fold-wise prediction
    return y_test, group_test, y_pred
score = roc_auc_score(y_train[val_idx], y_pred) #print('\t[XGBoost ] best iteration: \033[92m%i\033[0m' % xgb_model.best_iteration) print('\t[XGBoost ] oof ROC-AUC is: \033[92m%.4f\033[0m' % score) y_train_xgb.append(xgb_model.predict(xgb_alltrain)) y_pred_xgb.append(xgb_model.predict(xgb_alltest)) xgb_scores.append(score) xgb_feature_importances.append( xgb_model.get_fscore()) # Then LightGBM lgbm_train = Dataset( data=X_train[trn_idx, :], label=y_train[trn_idx], feature_name=predictors, categorical_feature=categorical_features) lgbm_val = Dataset( data=X_train[val_idx, :], label=y_train[val_idx], feature_name=predictors, categorical_feature=categorical_features) print('\t[LightGBM] training...') lgbm_model = train_lgb( lgbm_params, lgbm_train, num_boost_round=15000, early_stopping_rounds=100,
def main(verbose=True, force=False, test=False):
    """End-to-end training pipeline (Kaggle-style regression on a log target).

    Steps: load/cache raw data -> "leak" features -> hierarchical clustering
    features -> row-wise aggregate features -> per-feature transforms
    (one-hot / quantile) -> t-SNE components -> grouped K-fold
    XGBoost + LightGBM ensemble -> XGBoost grid search. Predictions are
    written as CSVs via the nested ``save_model`` helper.

    :param verbose: accepted for API compatibility — not read in this body
        (TODO confirm against callers).
    :param force: rebuild the cached raw dataframe even if the cache exists.
    :param test: accepted but unused in this body (TODO confirm).
    """
    import datetime
    IGNORE_FEATURES = []
    os.makedirs(ANALYSIS_PATH, exist_ok=True)
    os.makedirs(TRAIN_PATH, exist_ok=True)
    raw_df_name = os.path.join(TRAIN_PATH, 'data_raw.pyt')
    scaled_df_name = os.path.join(TRAIN_PATH, 'data_scaled.pyt')  # NOTE(review): unused below
    st_time = datetime.datetime.now()
    print('Loading the data...')
    # Load the raw dataframe from the joblib cache unless forced to rebuild.
    if not os.path.isfile(raw_df_name) or force:
        df = read()
        df.set_index(ID, inplace=True)
        print('\tWriting \033[92m%s\033[0m' % (raw_df_name))
        with open(raw_df_name, 'wb') as pyt:
            joblib.dump(df, pyt)
    else:
        print('\tLoading data from \033[92m%s\033[0m' % (raw_df_name))
        df = joblib.load(raw_df_name)
    print('-- Took %i seconds.' % (datetime.datetime.now() - st_time).total_seconds())
    # log-scale the predictors & predictand; bins computed BEFORE log1p so
    # they live in the original target space
    bins_target = np.logspace(np.log10(df[PREDICTAND].min()),
                              np.log10(df[PREDICTAND].max()), 20)
    df[PREDICTAND] = np.log1p(df[PREDICTAND])
    predictors = [c for c in df.columns if c not in ['isTrain', PREDICTAND]]
    # Counts of 0s or non-0s is very different between test and train sets !
    pstep = 5
    percs = np.arange(pstep, 100, pstep)  # percentiles 5..95 used by the aggregates
    calculated_cols = []
    columns_then = df.columns  # snapshot to detect newly created feature columns
    # Add the info relative to the leak as it affects the training / test processes
    leak_file = os.path.join(TRAIN_PATH, "df_leaked_%s.pyt" % N_LAGS)
    if os.path.isfile(leak_file):
        df_leaked = joblib.load(leak_file)
    else:
        df_ = df[predictors].reset_index(level=0)
        df_[PREDICTAND] = df[PREDICTAND]
        df_ = df_[['ID', PREDICTAND] + predictors]
        df_leaked = get_all_leak(df_, COLUMNS_LEAK, N_LAGS)
        leak_cols = [c for c in df_leaked if c.startswith('leak')]
        df_leaked = df_leaked[leak_cols]
        with open(leak_file, 'wb') as pyt:
            joblib.dump(df_leaked, pyt)
    df_leaked.index = df.index
    leak_cols = df_leaked.columns
    # Summaries of the leak matrix become features on the main frame.
    df['nb_potential_leaks'] = df_leaked.notnull().sum(axis=1)
    df['leak_mean'] = df_leaked.mean(axis=1).fillna(0)
    df['leak_median'] = df_leaked.median(axis=1).fillna(0)
    df['leak_max'] = df_leaked.max(axis=1).fillna(0)
    df['leak_min'] = df_leaked.min(axis=1).fillna(0)
    # Clustering on sorted dataframe (row by row) to detect similar entries
    df_ = df[predictors].copy()
    for row in range(len(df_)):
        arr = df_.iloc[row, :]
        df_.iloc[row, :] = np.sort(arr)
    # Hierarchical clustering seems to have a predictive power
    #distance = "euclidean"
    n_clusters = 12
    for distance in [
            "hamming", "jaccard", "sokalmichener", "sokalsneath", "euclidean"
    ]:
        st_time = datetime.datetime.now()
        print('Finding \033[92m%i clusters\033[0m with \033[92m%s distance\033[0m'
              % (n_clusters, distance))
        dist_fname = os.path.join(TRAIN_PATH, "%s_dists.pyt" % distance)
        # Pairwise distances are expensive: cache them per metric.
        if os.path.isfile(dist_fname):
            dist = joblib.load(dist_fname)
            print('-- Pairwise distance loading took %i seconds.'
                  % (datetime.datetime.now() - st_time).total_seconds())
        else:
            if distance == "euclidean":
                # euclidean works on the row-sorted values; boolean metrics on presence masks
                dist = ss.distance.pdist(df_[predictors].values, distance)
            else:
                dist = ss.distance.pdist(df[predictors].values.astype(bool), distance)
            print('-- Pairwise distance computation took %i seconds.'
                  % (datetime.datetime.now() - st_time).total_seconds())
            with open(dist_fname, 'wb') as pyt:
                joblib.dump(dist, pyt)
        ward_linkage = hierarchy.ward(dist)
        tree = hierarchy.to_tree(ward_linkage)
        cluster_colname = 'cluster_%s' % distance
        # cut the dendrogram at the height that yields n_clusters clusters
        df[cluster_colname] = hierarchy.fcluster(ward_linkage,
                                                 _get_height_at(tree, n_clusters),
                                                 criterion="distance")
        print('-- Took %i seconds.' % (datetime.datetime.now() - st_time).total_seconds())
        CATEGORICAL_FEATURES.append(cluster_colname)
        sns.catplot(x=cluster_colname, y=PREDICTAND,
                    data=df.groupby('isTrain').get_group(True), kind="violin")
        plt.savefig(os.path.join(ANALYSIS_PATH, '%s.png' % cluster_colname))
        plt.close()
    # Keep euclidean clusters as 'cluster_colname' for K-fold grouping
    cluster_colname = "cluster_euclidean"
    # Mojena stopping-rule plot (dendrogram height vs number of clusters);
    # uses `tree`/`ward_linkage` from the LAST loop iteration (euclidean).
    print('Mojena stopping rule')
    clusters_for_plot = np.arange(1, 101)
    heights = np.array([_get_height_at(tree, n_clusters)
                        for n_clusters in clusters_for_plot])
    plt.figure()
    plt.plot(clusters_for_plot, heights, 'ko--')
    plt.grid()
    plt.xlabel('Number of clusters')
    plt.ylabel('Dendrogram height')
    plt.savefig(os.path.join(ANALYSIS_PATH, '%s_mojena.png' % cluster_colname))
    plt.close()
    print('Dendrogram for Euclidean distance')
    dn = hierarchy.dendrogram(ward_linkage, no_labels=True, above_threshold_color='k')
    plt.ylabel('height')
    plt.xlabel('samples')
    plt.savefig(os.path.join(ANALYSIS_PATH, '%s_dendrogram.png' % cluster_colname))
    plt.close()
    df[predictors] = np.log1p(df[predictors])
    st_time = datetime.datetime.now()

    def func_agg(row):
        # Row-wise aggregates over the non-zero entries of a row.
        r = row[row > 0]
        return np.append([
            (row > 0).sum(),
            r.mean(),
            (r**2).mean(),
            r.std(),
            r.max(),
            r.min(),
            r.skew(),
            r.kurtosis(),
        ], r.quantile(q=percs / 100))

    print('Computing non-zero aggregates...')
    df[[
        'count_nonzero', 'mean_nonzero', 'meansq_nonzero', 'std_nonzero',
        'max_nonzero', 'min_nonzero', 'skew_nonzero', 'kurt_nonzero',
    ] + ['p%i' % p for p in percs]] = df[predictors].apply(
        func_agg, axis=1, result_type="expand").fillna(0)
    print('-- Took %i seconds.' % (datetime.datetime.now() - st_time).total_seconds())
    st_time = datetime.datetime.now()

    def func_agg(row):  # NOTE(review): intentionally shadows the previous func_agg
        # Same aggregates, on absolute first differences of the non-zero entries.
        r = row[row > 0].diff().abs()
        return np.append([
            r.mean(),
            (r**2).mean(),
            r.std(),
            r.max(),
            r.min(),
            r.skew(),
            r.kurtosis(),
        ], r.quantile(q=percs / 100))

    print('Computing diff aggregates...')
    df[[
        'diff_mean_nonzero', 'diff_meansq_nonzero', 'diff_std_nonzero',
        'diff_max_nonzero', 'diff_min_nonzero', 'diff_skew_nonzero',
        'diff_kurtosis_nonzero',
    ] + ['diff_p%i' % p for p in percs]] = df[predictors].apply(
        func_agg, axis=1, result_type="expand").fillna(0)
    print('-- Took %i seconds.' % (datetime.datetime.now() - st_time).total_seconds())
    # add occurrences (will it help ?)
    print('Computing distributions...')

    def func_epd(row):
        # Empirical probability distribution of the (de-logged) non-zero row
        # values over the target-space bins.
        # NOTE(review): np.histogram's `normed` kwarg is deprecated/removed in
        # modern NumPy — this pins an old NumPy version; confirm.
        epd = np.histogram(np.exp(row[row > 0].values) - 1,
                           bins=bins_target, normed=True)[0]
        return epd / np.sum(epd)

    df[['epd_%i' % b for b in bins_target[:-1]
        ]] = df[predictors].apply(func_epd, axis=1, result_type="expand").fillna(0)
    columns_now = df.columns
    calculated_cols.extend([c for c in columns_now if c not in columns_then])
    ## Scale the features
    #st_time = datetime.datetime.now()
    #print('Scaling (log) the features')
    #for col in df.columns:
    #if col not in [PREDICTAND, ID, 'isTrain']:
    #df[col] = np.log(df[col] + 1)
    #print('-- Took %i seconds.' % (datetime.datetime.now() - st_time).total_seconds())
    # Drop the raw predictors: only engineered columns are used from here on.
    df.drop(predictors, axis=1, inplace=True)
    predictors = [c for c in calculated_cols if c in df.columns]
    print('-- Took %i seconds.' % (datetime.datetime.now() - st_time).total_seconds())
    #with open(os.path.join(TRAIN_PATH, 'predictors_%s.pyt' % datetime.datetime.now().strftime('%Y%m%d%H')),
    #'wb') as pyt:
    #joblib.dump(df, pyt)
    st_time = datetime.datetime.now()
    print('Transforming the features')
    cols_to_remove = []
    cols_to_add = []
    for col in predictors:
        if col in CATEGORICAL_FEATURES:
            # Categorical -> one column per one-hot level; original is removed.
            print('\tFeature %s is categorical -> OneHot' % col)
            transf = OneHotEncoder()
            transf.fit(df[col].values.reshape(-1, 1))
            res = transf.transform(df[col].values.reshape(-1, 1))
            for i, ax in enumerate(res.transpose(), 1):
                onehot = '{}_{}'.format(col, i)
                df[onehot] = ax.toarray().squeeze()
                cols_to_add.append(onehot)
            cols_to_remove.append(col)
        else:
            # Numerical -> rank-based quantile transform, in place.
            print('\tFeature %s is numerical -> QuantileTransformer' % col)
            try:
                df[col] = QuantileTransformer().fit_transform(
                    df[col].values.reshape(-1, 1))
            except:
                # best-effort: leave the column untransformed on failure
                print("\033[91mQuantileTransformer failed on %s\033[0m" % col)
    print('-- Took %i seconds.' % (datetime.datetime.now() - st_time).total_seconds())
    #df.drop(cols_to_remove, axis=1, inplace=True)
    # One-hot originals stay in df but are excluded from the predictor list.
    IGNORE_FEATURES.extend(cols_to_remove)
    for col in cols_to_remove:
        predictors.remove(col)
        calculated_cols.remove(col)
    predictors.extend(cols_to_add)
    # T-SNE
    st_time = datetime.datetime.now()
    print('Running T-SNE...')
    fname = os.path.join(ANALYSIS_PATH, "tsne", "tsne.png")
    os.makedirs(os.path.dirname(fname), exist_ok=True)
    tsne_comps = tsne(
        df[predictors + [PREDICTAND, 'isTrain']],
        fname,
        nb=len(df),
        perplexity=40,
        title=None,
        visu_tsne=None,
        cmap='viridis',
        predictand=PREDICTAND,
        binary=False,
        #do_not_plot=[c for c in predictors if not c in calculated_cols + ['isTrain', PREDICTAND]],
    )
    with open(
            os.path.join(
                TRAIN_PATH,
                "tsne_%s.pyt" % (datetime.datetime.now().strftime('%Y%m%d%H%M'))),
            'wb') as pyt:
        joblib.dump(tsne_comps, pyt)
    try:
        # attach each t-SNE component as a feature column
        for i, tsne_ax in enumerate(tsne_comps.transpose(), 1):
            df['tsne%i' % i] = tsne_ax
            calculated_cols.append('tsne%i' % i)
    except:
        # best-effort: t-SNE features are optional
        print('\033[91mWARNING ! could not add t-sne values\033[0m')
        print_exc()
        pass
    print('-- Took %i seconds.' % (datetime.datetime.now() - st_time).total_seconds())
    analyze(df, calculated_cols, step='preprocessed')
    #analyze_bivariate(df, cfgs, step='preprocessed')
    df_train = select_sample(df, "train")
    df_test = select_sample(df, "test")
    fname = os.path.join(TRAIN_PATH, 'df_train.pyt')
    print('Saving df_train to \033[92m%s\033[0m' % fname)
    with open(fname, 'wb') as pyt:
        joblib.dump(df_train, pyt)
    predictors = [c for c in df_train.columns if c not in IGNORE_FEATURES]
    predictors.remove(PREDICTAND)
    X_train = df_train[predictors].values
    y_train = df_train[PREDICTAND].values
    X_test = df_test[predictors].values
    test_rows = df_test.index
    # Load the "leaked" target: per-row median of the leak candidates, used to
    # overwrite predictions where at least one leak value exists.
    leaked_target = df_leaked.loc[test_rows, leak_cols].median(axis=1)
    leaked_count = df_leaked.loc[test_rows, leak_cols].notnull().sum(axis=1)
    leak_inds = np.where(leaked_count > 0)[0]
    # (disabled) earlier train_and_validate() multi-model run + per-model CSV
    # export — superseded by the fold loop and save_model() below.

    def save_model(model, name, y_pred=None, replace_leak=False):
        # Persist a fitted model (if given) and write a prediction CSV.
        # Predictions are de-logged with expm1; with replace_leak=True the
        # leaked targets overwrite predictions and the file gets a _leak suffix.
        if model is not None:
            fname = os.path.join(TRAIN_PATH, "%s.pyt" % name)
            os.makedirs(TRAIN_PATH, exist_ok=True)
            with open(fname, "wb") as pyt:
                joblib.dump({'model': model}, pyt)
            print('\tSaved model to \033[92m%s\033[0m' % fname)
        fname = os.path.join(OUTPUT_PATH, "%s.csv" % name)
        if y_pred is None:
            y_pred = model.predict(X_test)
        y_pred = np.expm1(y_pred)
        if replace_leak:
            y_pred[leak_inds] = leaked_target.values[leak_inds]
            fname = fname.replace('.csv', '_leak.csv')
        df_result = pd.DataFrame({ID: df_test.index, PREDICTAND: y_pred})
        df_result.to_csv(fname, index=False)
        print('\tSaved prediction to \033[92m%s\033[0m' % fname)

    from lightgbm import Dataset
    from lightgbm import train as train_lgb
    nfolds = 10
    #folds = KFold(n_splits=nfolds, shuffle=True, random_state=21)
    # group by euclidean cluster so similar rows don't straddle train/val
    folds = GroupKFold(n_splits=nfolds)
    y_pred_xgb = np.zeros(len(X_test))
    y_train_xgb = np.zeros(len(X_train))
    y_pred_lgbm = np.zeros(len(X_test))
    y_train_lgbm = np.zeros(len(X_train))
    lgb_params = {
        'task': 'train',
        'boosting_type': 'gbdt',
        'objective': 'regression',
        'metric': {'mse'},
        'num_leaves': 124,
        'learning_rate': 0.05,
        'feature_fraction': 0.8,
        'verbose': -1,
        # NOTE(review): also passed explicitly to train_lgb() below — redundant
        'num_boost_round': 15000,
        'early_stopping_rounds': 100,
        'nthread': 26
    }

    def _rmse_func(predictions, ground_truth):
        return np.sqrt(mean_squared_error(predictions, ground_truth))

    def rmse(predictions, train_data):
        # LightGBM feval signature: (name, value, is_higher_better)
        labels = train_data.get_label()
        return 'RMSE', _rmse_func(predictions, labels), False

    for ifold, (trn_idx, val_idx) in enumerate(
            folds.split(X_train, y_train, df_train[cluster_colname].values)):
        print("Fold nb. %i" % ifold)
        lgb_train = Dataset(data=X_train[trn_idx, :],
                            label=y_train[trn_idx],
                            feature_name=predictors)
        lgb_val = Dataset(data=X_train[val_idx, :],
                          label=y_train[val_idx],
                          feature_name=predictors)
        reg = XGBRegressor(n_estimators=600, max_depth=5, learning_rate=0.05,
                           random_state=42)
        reg.fit(df_train[predictors].iloc[trn_idx, :].values,
                df_train[[PREDICTAND]].iloc[trn_idx, :].values.squeeze())
        pred_fold = reg.predict(df_train[predictors].iloc[val_idx].values)
        print('\t[XGBoost] oof RMSE is: \033[92m%.4f\033[0m' % np.sqrt(
            mean_squared_error(
                df_train[[PREDICTAND]].iloc[val_idx].values.squeeze(),
                pred_fold)))
        # NOTE(review): averages predictions over the FULL train set (not just
        # out-of-fold rows), so y_train_xgb/y_train_lgbm include in-fold leakage.
        y_train_xgb += reg.predict(X_train) / nfolds
        y_pred_xgb += reg.predict(X_test) / nfolds
        reg = train_lgb(lgb_params,
                        lgb_train,
                        num_boost_round=15000,
                        early_stopping_rounds=100,
                        verbose_eval=100,
                        valid_sets=[lgb_train, lgb_val],
                        feval=rmse)
        y_pred = reg.predict(X_train[val_idx, :], num_iteration=reg.best_iteration)
        score = np.sqrt(mean_squared_error(y_train[val_idx], y_pred))
        print('\t[LGBM] Best iteration: \033[92m%i\033[0m' % reg.best_iteration)
        print('\t[LGBM] oof RMSE is: \033[92m%.4f\033[0m' % score)
        y_train_lgbm += reg.predict(X_train, num_iteration=reg.best_iteration) / nfolds
        y_pred_lgbm += reg.predict(X_test, num_iteration=reg.best_iteration) / nfolds
    # model=None: only write the prediction CSVs for the fold-averaged outputs
    save_model(None, "LightGBM_folded", y_pred_lgbm, replace_leak=True)
    save_model(None, "XGBoost_folded", y_pred_xgb)
    save_model(None, "LightGBM_folded", y_pred_lgbm)  # plain (non-leak) CSV variant
    save_model(None, "XGB-LGBM_folded", 0.5 * (y_pred_xgb + y_pred_lgbm))
    gsDict = {}
    # (disabled) GridSearchCV experiments for AdaBoostRegressor,
    # ExtraTreesRegressor, RandomForestRegressor and GradientBoostingRegressor
    # — kept only the XGBRegressor search below.
    # Gradient boosting
    print('\033[1mGridSearch - XGBRegressor\033[0m')
    reg = XGBRegressor()
    xgb_param_grid = {
        'n_estimators': [600, 300, 900],
        'learning_rate': [0.1, 0.05, 0.01],
        'max_depth': [5, 4, 6],
        'missing': [None, 0.],
        'booster': ["gbtree", "gblinear", "dart"],
    }
    gsXGBoost = GridSearchCV(reg,
                             param_grid=xgb_param_grid,
                             cv=nfolds,
                             scoring="neg_mean_squared_error",
                             n_jobs=36,
                             verbose=1)
    gsXGBoost.fit(X_train, y_train)
    gbc_best = gsXGBoost.best_estimator_
    print('\tBest score: \033[92m%.4f\033[0m' % gsXGBoost.best_score_)
    for key in xgb_param_grid.keys():
        print('\t\t%s: \033[92m%s\033[0m' % (key, getattr(gbc_best, key, '-')))
    gbc_best.fit(X_train, y_train)  # refit best params on the full train set
    save_model(gbc_best, "XGBoost")
    gsDict["XGBoost"] = gsXGBoost
from Estimators import LGBM from Utils import Profiler import pandas as pd from IPython.display import display import lightgbm as lgb import Gather_Data profile = Profiler() profile.Start() # Gather Data #train_X, test_X, train_Y = dataset.Load('AllData_v3') train_X, test_X, train_Y = Gather_Data.AllData_v4() # Convert data to DMatrix lgb_train = Dataset(train_X, train_Y) lgb_test = Dataset(test_X) # Define estimator parameters params = { 'task': 'train', 'objective': 'binary', 'learning_rate': 0.1, 'num_leaves': 31, 'max_depth': 8, 'min_data_in_leaf': 20, 'min_sum_hessian_in_leaf': 0.001, 'lambda_l1': 0, 'lambda_l2': 0, 'scale_pos_weight': 1, 'metric': 'auc',