def ctb_model(train_df, test_df, params):
    """Train a CatBoost regressor with 5-fold CV on the credit-score target.

    Args:
        train_df: training frame; must contain '用户编码' (user id) and
            '信用分' (credit score — the regression target).
        test_df: test frame; must contain '用户编码'.
        params: keyword arguments forwarded to ctb.CatBoostRegressor.

    Returns:
        tuple (val_pred_all, cv_pred, mae_score):
            val_pred_all — out-of-fold predictions with id/index/target/score,
            cv_pred      — test predictions averaged over the folds,
            mae_score    — 1 / (1 + mean fold MAE), higher is better.
    """
    NFOLDS = 5
    train_label = train_df['信用分']
    # BUG FIX: sklearn raises ValueError when random_state is set while
    # shuffle=False — the seed is only meaningful with shuffling enabled.
    kfold = KFold(n_splits=NFOLDS, shuffle=True, random_state=2019)
    kf = kfold.split(train_df, train_label)

    train = train_df.drop(['用户编码', '信用分'], axis=1)
    id_traget = pd.DataFrame()
    id_traget['id'] = train_df['用户编码']
    id_traget['target'] = train_df['信用分']
    test = test_df.drop(['用户编码'], axis=1)

    cv_pred = np.zeros(test.shape[0])
    valid_best_l2_all = 0
    valid_best_l2_all_list = []
    models = []
    val_pred_all = pd.DataFrame()

    for i, (train_fold, validate) in enumerate(kf):
        print("model: cgb_mae. fold: ", i, "training...")
        val_id_target = id_traget.iloc[validate]
        X_train, label_train = train.iloc[train_fold], train_label.iloc[train_fold]
        X_validate, label_validate = train.iloc[validate], train_label.iloc[validate]

        cat = ctb.CatBoostRegressor(**params)
        bst = cat.fit(X_train, label_train,
                      eval_set=[(X_train, label_train),
                                (X_validate, label_validate)],
                      early_stopping_rounds=2000,
                      verbose=1000)

        # Collect out-of-fold predictions for later stacking / error analysis.
        val_pred = pd.DataFrame()
        val_pred['id'] = val_id_target['id']
        val_pred['index'] = X_validate.index
        val_pred['target'] = label_validate
        val_pred['score'] = bst.predict(X_validate)
        val_pred_all = pd.concat([val_pred_all, val_pred], axis=0,
                                 ignore_index=True)

        cv_pred += bst.predict(test)
        # Compute the fold MAE once instead of twice as before.
        fold_mae = mean_absolute_error(y_true=val_pred['target'],
                                       y_pred=val_pred['score'])
        valid_best_l2_all += fold_mae
        valid_best_l2_all_list.append(fold_mae)
        models.append(bst)

    cv_pred /= NFOLDS
    valid_best_l2_all /= NFOLDS
    mae_score = 1 / (1 + valid_best_l2_all)
    print("cgb_mae cv score for valid is: ", mae_score)
    return val_pred_all, cv_pred, mae_score
# NOTE(review): fragment — `kf`, `train`, `target`, `feature_name`, `test`,
# `ITERATIONS`, `EARLY_STOP` and `VERBOSE` are defined outside this chunk, and
# the trailing xgb.XGBRegressor(...) call is cut off mid-argument-list.
i = 0
for train_index, valid_index in kf.split(train, train[target].astype(int).values):
    print("\nFold {}".format(i + 1))
    X_train, label_train = train.iloc[train_index][feature_name], train.iloc[
        train_index][target].astype(int).values
    X_valid, label_valid = train.iloc[valid_index][feature_name], train.iloc[
        valid_index][target].astype(int).values
    # GPU CatBoost with frequency-based CTR encodings for categorical columns.
    clf = cbt.CatBoostRegressor(
        iterations=ITERATIONS,
        learning_rate=0.1,
        depth=10,
        l2_leaf_reg=10,
        loss_function='RMSE',
        eval_metric="RMSE",
        task_type='GPU',
        devices="0:1",
        simple_ctr='FeatureFreq',
        combinations_ctr='FeatureFreq',
    )
    clf.fit(X_train, label_train,
            eval_set=[(X_valid, label_valid)],
            early_stopping_rounds=EARLY_STOP,
            verbose=VERBOSE * 10)
    x1 = clf.predict(X_valid)           # fold validation predictions
    y1 = clf.predict(test[feature_name])  # fold test predictions
    # Second-stage XGBoost model — call truncated at the chunk boundary.
    clf = xgb.XGBRegressor(learning_rate=0.1, max_depth=7,
def load_model(self):
    """Build the CatBoost regressor from the hyperparameters stored on self."""
    hyperparameters = self.set_hyperparameters
    self.clf = catboost.CatBoostRegressor(**hyperparameters)
# Train a CatBoost MAE regressor on delivery durations and write a submission
# CSV keyed by order_id.
order_id = pd.DataFrame(np.unique(test_data.order_id), columns=["order_id"])
train_data.fillna(0, inplace=True)
test_data.fillna(0, inplace=True)
labels = train_data[['delivery_duration']].values.astype(np.float32).flatten()
train_pool = Pool(train_data[features], label=labels)
test_pool = Pool(test_data[features])

print('Start training...')
# NOTE(review): learning_rate=100 is far outside CatBoost's usual (0, 1]
# range and looks like a typo — confirm intent before tuning further.
cat = cb.CatBoostRegressor(iterations=args.round,
                           learning_rate=100,
                           depth=12,
                           l2_leaf_reg=3,
                           rsm=1,
                           verbose=True,
                           eval_metric="MAE",
                           loss_function='MAE').fit(train_pool)
# BUG FIX: was a Python 2 print statement (`print cat.get_...`), a syntax
# error under Python 3; the rest of this script already uses print().
print(cat.get_feature_importance(train_pool))

test_data.loc[:, "delivery_duration"] = cat.predict(test_pool)
# Left-join onto the unique order ids so every order appears exactly once.
rs = order_id.merge(test_data[["order_id", "delivery_duration"]],
                    left_on="order_id", right_on="order_id", how="left")
rs.to_csv(args.out_path, header=['order_id', 'delivery_duration'], index=False)
    # NOTE(review): fragment of a larger "display name -> (estimator,
    # short_name, plot color)" mapping; the dict's braces, `cat_params`,
    # and the first/last entries live outside this chunk.
    'ridge',
    'limegreen',
),
'SVR RBF': (
    SVR(),
    'svr',
    'darkorange',
),
'SVR Linear': (
    LinearSVR(C=0.08, epsilon=0.06),  #LinearSVR(C=0.28, epsilon=0.06),
    'lin_svr',
    'purple',
),
'CatBoost': (
    cat.CatBoostRegressor(**cat_params),
    'catboost',
    'dimgrey',
),
'Lasso': (
    Lasso(),
    'lasso',
    'blue',
),
'RidgeCV': (
    # Sweep alphas 0.05 .. 4.95 in steps of 0.05.
    RidgeCV(alphas=[num / 100 for num in range(5, 500, 5)]),
    'ridgecv',
    'yellow',
),
'SVR Poly': (
    SVR(kernel='poly'),  # C=5.0),
    # NOTE(review): fragment — the `fc1 = ...(` call this argument closes, plus
    # the feature classes and ALL_CAPS constants, are defined outside this chunk.
    max_back_quarter=MAX_BACK_QUARTER)
fc2 = BaseCompanyFeatures(cat_columns=CAT_COLUMNS)
fc3 = QuarterlyDiffFeatures(columns=QUARTER_COLUMNS,
                            compare_quarter_idxs=COMPARE_QUARTER_IDXS,
                            max_back_quarter=MAX_BACK_QUARTER)
# Merge the three feature sources: company-level on ticker, quarterly diffs
# on (ticker, date).
feature = FeatureMerger(fc1, fc2, on='ticker')
feature = FeatureMerger(feature, fc3, on=['ticker', 'date'])
target = QuarterlyDiffTarget(col='marketcap')
# Bagged LightGBM + CatBoost ensemble, trained out-of-fold grouped by ticker
# so a company never appears in both train and validation of the same fold.
base_models = [
    lgbm.sklearn.LGBMRegressor(),
    ctb.CatBoostRegressor(verbose=False)
]
ensemble = EnsembleModel(base_models=base_models,
                         bagging_fraction=BAGGING_FRACTION,
                         model_cnt=MODEL_CNT)
model = GroupedOOFModel(ensemble, group_column='ticker', fold_cnt=FOLD_CNT)
pipeline = BasePipeline(feature=feature,
                        target=target,
                        model=model,
                        metric=median_absolute_relative_error,
                        out_name=OUT_NAME)
result = pipeline.fit(data_loader, ticker_list)
    # NOTE(review): fragment — the opening of the XGBoost wrapper call, the
    # surrounding if/elif on args.regressor, and the fold loop providing `k`
    # and the *_fold arrays are outside this chunk.
    objective='reg:linear', eval_metric='rmse', learning_rate=0.01),
    train_type=args.train_type,
    use_valid=True,
    debug=args.debug)
model.load_params("parames/xgboost_regressor_default.yml")
elif (args.regressor == "lightgbm"):
    model = LightGBMRegressor(model=lgb.LGBMRegressor(
        objective='regression', metric='rmse'),
                              train_type=args.train_type,
                              use_valid=True,
                              debug=args.debug)
elif (args.regressor == "catboost"):
    # Offset the seed by the fold index k so each fold trains a distinct model.
    model = CatBoostRegressor(
        model=catboost.CatBoostRegressor(random_seed=args.seed + k),
        use_valid=True,
        debug=args.debug)

# Set model parameters from file, if one was given
if not (args.params_file == ""):
    model.set_params(args.params_file)

#--------------------
# Train the model
#--------------------
model.fit(X_train_fold, y_train_fold, X_valid_fold, y_valid_fold)

#--------------------
# Model inference
#--------------------
def fit(self, train_X, train_y, valid_X=None, valid_y=None,
        auto_split_train_data=True,
        booster_params=None,
        num_boost_round=10,
        learning_rate=0.01,
        nthread=-1,
        eval_metric=None,
        # learning_rates=None, # callback...
        cat_features=None,
        pairs=None,
        pretrained_model=None,
        sample_weight=None,
        group_id=None,
        group_weight=None,
        subgroup_id=None,
        pairs_weight=None,
        baseline=None,
        use_best_model=None,
        verbose=None,  # verbose_eval=None, # Alias as verbose...
        logging_level=None,
        plot=False,
        column_description=None,
        metric_period=None,
        silent=None,
        random_seed=0,
        early_stopping_rounds=100,
        inplace_class_model=True,
        autosave_ckpt=True,
        ):
    """Train a CatBoost model (regressor or classifier per self.__application).

    When no validation data is supplied and auto_split_train_data is True,
    30% of the training data is held out for early stopping. valid_X/valid_y
    may each be a single dataset or a list of datasets (provided together).

    Returns the fitted CatBoost model; also saves a checkpoint artifact when
    autosave_ckpt is True. Re-raises any training exception after logging it.
    """
    if booster_params is not None:
        __booster_params = booster_params
    else:
        # NOTE(review): this aliases (does not copy) the instance dict, so
        # the updates below mutate self's stored params as a side effect.
        __booster_params = self.__booster_params

    # Fold the explicit keyword knobs into the booster parameter dict.
    __booster_params['iterations'] = num_boost_round
    __booster_params['eta'] = learning_rate  # Alias: learning_rate
    __booster_params['random_seed'] = random_seed  # Alias: random_state
    __booster_params['thread_count'] = nthread
    __booster_params['eval_metric'] = eval_metric
    __booster_params['logging_level'] = logging_level

    # Split training and validating data to prevent overfitting:
    if valid_X is None and valid_y is None:
        if auto_split_train_data:
            logging.info('Randomly split training data into 70% and 30%.')
            _train_X, _valid_X, _train_y, _valid_y = train_test_split(
                train_X, train_y, test_size=0.3, random_state=random_seed)
            logging.info('Training data size: {}, validation data size: {}'.format(
                _train_X.shape, _valid_X.shape))
            # Make CatBoost Pool:
            train_pool = MyCat.make_pool(data=_train_X, label=_train_y,
                                         cat_features=cat_features, pairs=pairs)
            valid_pool = MyCat.make_pool(data=_valid_X, label=_valid_y,
                                         cat_features=cat_features, pairs=pairs)
            data_for_eval = [valid_pool]  # [train_pool, valid_pool]
        else:
            train_pool = MyCat.make_pool(data=train_X, label=train_y,
                                         cat_features=cat_features, pairs=pairs)
            data_for_eval = None  # [train_pool]
    elif valid_X is not None and valid_y is not None:
        # BUG FIX: the original called isinstance() with a single argument,
        # which raises TypeError; the intent was a list check.
        if not isinstance(valid_X, list):
            valid_X = [valid_X]
        if not isinstance(valid_y, list):
            valid_y = [valid_y]
        assert len(valid_X) == len(valid_y), 'Input valid_X and valid_y should have same length.'
        logging.info('Training data size: {}'.format(train_X.shape[0]))
        train_pool = MyCat.make_pool(data=train_X, label=train_y,
                                     cat_features=cat_features, pairs=pairs)
        data_for_eval = []
        for i in range(len(valid_X)):
            _valid_pool = MyCat.make_pool(data=valid_X[i], label=valid_y[i],
                                          cat_features=cat_features, pairs=pairs)
            data_for_eval.append(_valid_pool)
            logging.info('Validation data {} size: {}'.format(i, valid_X[i].shape))
    else:
        # ROBUSTNESS: previously this case fell through silently and crashed
        # later with NameError on train_pool.
        raise ValueError('valid_X and valid_y must be provided together.')

    # Start training procedure...
    try:
        _training_execution_time = MyCat.tic()
        if self.__application == 'regression':
            if pretrained_model is None:
                _this_cat = cb.CatBoostRegressor(**__booster_params)
            else:
                print('Use pretrained.')
                _this_cat = cb.CatBoostRegressor()
                _this_cat = _this_cat.load_model(pretrained_model,
                                                 self.__model_artifact_format)
        elif self.__application == 'classification':
            if pretrained_model is None:
                _this_cat = cb.CatBoostClassifier(**__booster_params)
            else:
                print('Use pretrained.')
                _this_cat = cb.CatBoostClassifier()
                _this_cat = _this_cat.load_model(pretrained_model,
                                                 self.__model_artifact_format)
        else:
            raise ValueError('Unknown application type. Should be either classification or regression.')

        with MyCat.timer('Model training'):
            _this_cat.fit(
                X=train_pool,
                eval_set=data_for_eval,
                verbose=verbose,
                plot=plot,
                early_stopping_rounds=early_stopping_rounds,
                silent=silent
            )

        if autosave_ckpt:
            _ckpt_file = 'CatBoost_model_ckpt_{}'.format(_training_execution_time)
            _ckpt_file = os.path.join(self.__checkpoint_dir, _ckpt_file)
            _this_cat.save_model(_ckpt_file, self.__model_artifact_format)
            logging.info('Saved model artifact to {}'.format(_ckpt_file))
        return _this_cat
    except Exception as e:
        _error_msg = 'Failed in training CatBoost model. Error: {}'.format(e)
        print(_error_msg)
        logging.error(_error_msg)
        raise
# NOTE(review): fragment — the surrounding if/elif on method_flag, the
# variables (study, train_x, train_y, fraction_of_validation_samples,
# number_of_sub_models) and the rest of objective() are outside this chunk;
# indentation of the second GradientBoostingRegressor line is a guess.
else:
    # Refit with early stopping on a validation fraction to find the tree count.
    regression_model = GradientBoostingRegressor(
        **study.best_params, n_estimators=1000,
        validation_fraction=fraction_of_validation_samples,
        n_iter_no_change=100)
    regression_model.fit(train_x, train_y)
    best_n_estimators = len(regression_model.estimators_)
regression_model = GradientBoostingRegressor(**study.best_params)
elif method_flag == 14:  # catboost
    train_x_tmp, train_x_validation, train_y_tmp, train_y_validation = train_test_split(
        train_x, train_y, test_size=fraction_of_validation_samples, random_state=0)
    if fraction_of_validation_samples == 0:
        best_n_estimators_in_cv = number_of_sub_models
    else:
        # Use CatBoost's own early stopping to pick the iteration count.
        regression_model = cat.CatBoostRegressor(n_estimators=500,
                                                 logging_level='Silent')
        regression_model.fit(train_x_tmp, train_y_tmp,
                             eval_set=[(train_x_validation, train_y_validation)],
                             early_stopping_rounds=30)
        best_n_estimators_in_cv = regression_model.best_iteration_

    def objective(trial):
        # Optuna search space for CatBoost; function body truncated below.
        param = {
            'depth': trial.suggest_int('depth', 4, 10),
            'learning_rate': trial.suggest_loguniform('learning_rate', 1e-3, 1e0),
            'random_strength': trial.suggest_int('random_strength', 0, 100),
            'bagging_temperature': trial.suggest_loguniform('bagging_temperature', 0.01, 100),
            'od_type': trial.suggest_categorical('od_type', ['IncToDec', 'Iter']),
            'od_wait': trial.suggest_int('od_wait', 10, 50)
        }
# Build the feature table, then fit a CatBoost regressor through the shared
# cross-validation helper (10 folds, fixed seed for reproducibility).
data, features, cate_feat = data_process()

catboost_params = dict(
    loss_function="RMSE",
    eval_metric="MAE",
    task_type="CPU",
    learning_rate=0.01,
    iterations=10000,
    random_seed=202011,
    od_type="Iter",
    depth=6,
    early_stopping_rounds=400,
)
ctb_model = cb.CatBoostRegressor(**catboost_params)

data, predict_label = get_predict_w(
    ctb_model,
    data,
    label='label',
    feature=features,
    cate_feature=cate_feat,
    random_state=202011,
    n_splits=10,
    model_type='ctb',
)
    # NOTE(review): fragment — the xgbmodel constructor these arguments close,
    # plus Kfolder, user_train/user_test and features, are outside this chunk.
    subsample=0.67, colsample_bytree=0.054, colsample_bylevel=0.50)
print("-" * 20 + "User-Level XGBoost Training" + "-" * 20)
Kfolder.validate(user_train, user_test, features, xgbmodel,
                 name="xgbfinal", prepare_stacking=True)
print("-" * 20 + "Done Training" + "-" * 20)

# CatBoost
catmodel = cat.CatBoostRegressor(iterations=500,
                                 learning_rate=0.2,
                                 depth=5,
                                 random_seed=2019)
print("-" * 20 + "User-Level CatBoost Training" + "-" * 20)
Kfolder.validate(user_train, user_test, features, catmodel,
                 name="catfinal", prepare_stacking=True,
                 fit_params={
                     "use_best_model": True,
                     "verbose": 100
                 })
print("-" * 20 + "Done Training" + "-" * 20)

# Ensembling dragons
def reg_model(train, test, label_name, model_type, numerical_features, category_features, seed, cv=True):
    """Fit a regressor ('lgb' | 'xgb' | 'rf' | 'cat') and return predictions.

    Args:
        train, test: DataFrames; mutated in place (reset_index, rf fillna).
        label_name: target column in `train`.
        model_type: which model family to use inside the CV loop.
        numerical_features, category_features: column name lists.
        seed: KFold shuffle seed.
        cv: when True, run 2-fold CV; otherwise fit a single LightGBM model.

    Returns:
        (preds, oof): mean test predictions across folds and the out-of-fold
        train predictions (oof is a numpy array in the cv branch).

    NOTE(review): `lgb_params` / `xgb_params` come from module scope — TODO
    confirm they are defined wherever this is called. n_fold=2 is unusually
    small for CV. The non-cv branch ignores `model_type` and always fits
    LightGBM — presumably intentional, but worth confirming.
    """
    import lightgbm as lgb
    from xgboost import XGBRegressor
    from sklearn.ensemble import RandomForestRegressor
    train.reset_index(inplace=True, drop=True)
    test.reset_index(inplace=True, drop=True)
    if model_type == 'rf':
        # RandomForest cannot handle NaN — zero-fill only for this model.
        train.fillna(0, inplace=True)
    # (dropped commented-out label-encoding experiment that was here)
    features = category_features + numerical_features
    train_x = train[features]
    train_y = train[label_name]
    test_x = test[features]
    if cv:
        n_fold = 2
        count_fold = 0
        preds_list = list()
        oof = np.zeros(train_x.shape[0])
        kfolder = KFold(n_splits=n_fold, shuffle=True, random_state=seed)
        kfold = kfolder.split(train_x, train_y)
        for train_index, vali_index in kfold:
            print("training......fold", count_fold)
            count_fold = count_fold + 1
            k_x_train = train_x.loc[train_index]
            k_y_train = train_y.loc[train_index]
            k_x_vali = train_x.loc[vali_index]
            k_y_vali = train_y.loc[vali_index]
            if model_type == 'lgb':
                lgb_model = lgb.LGBMRegressor(**lgb_params)
                # Optional per-row sample weights, if the caller provided them.
                if 'sample_weight' in train.columns:
                    lgb_model = lgb_model.fit(
                        k_x_train, k_y_train,
                        eval_set=[(k_x_vali, k_y_vali)],
                        early_stopping_rounds=200,
                        verbose=False,
                        eval_metric="mae",
                        sample_weight=train.loc[train_index]['sample_weight'],
                        categorical_feature=category_features)
                else:
                    lgb_model = lgb_model.fit(
                        k_x_train, k_y_train,
                        eval_set=[(k_x_vali, k_y_vali)],
                        early_stopping_rounds=200,
                        verbose=False,
                        eval_metric="mae",
                        categorical_feature=category_features)
                k_pred = lgb_model.predict(
                    k_x_vali, num_iteration=lgb_model.best_iteration_)
                pred = lgb_model.predict(
                    test_x, num_iteration=lgb_model.best_iteration_)
            elif model_type == 'xgb':
                xgb_model = XGBRegressor(**xgb_params)
                xgb_model = xgb_model.fit(k_x_train, k_y_train,
                                          eval_set=[(k_x_train, k_y_train),
                                                    (k_x_vali, k_y_vali)],
                                          early_stopping_rounds=200,
                                          verbose=False)
                k_pred = xgb_model.predict(k_x_vali)
                pred = xgb_model.predict(test_x)
            elif model_type == 'rf':
                rf_model = RandomForestRegressor(n_estimators=100,
                                                 max_depth=3,
                                                 criterion="mae",
                                                 n_jobs=-1,
                                                 random_state=2019)
                model = rf_model.fit(k_x_train, k_y_train)
                k_pred = rf_model.predict(k_x_vali)
                pred = rf_model.predict(test_x)
            elif model_type == 'cat':
                ctb_params = {
                    'n_estimators': 1000,
                    'learning_rate': 0.02,
                    'random_seed': 4590,
                    'reg_lambda': 0.08,
                    'subsample': 0.7,
                    'bootstrap_type': 'Bernoulli',
                    'boosting_type': 'Plain',
                    'one_hot_max_size': 100,
                    'rsm': 0.5,
                    'leaf_estimation_iterations': 5,
                    'use_best_model': True,
                    'max_depth': 5,
                    'verbose': -1,
                    'thread_count': 4,
                    'cat_features': category_features
                }
                cat_model = cat.CatBoostRegressor(**ctb_params)
                cat_model.fit(k_x_train, k_y_train,
                              verbose=False,
                              use_best_model=True,
                              eval_set=[(k_x_vali, k_y_vali)])
                k_pred = cat_model.predict(k_x_vali)
                pred = cat_model.predict(test_x)
            preds_list.append(pred)
            oof[vali_index] = k_pred
        # (dropped commented-out feature-importance / SHAP plotting block)
        # Average the per-fold test predictions into a single submission vector.
        preds_columns = ['preds_{id}'.format(id=i) for i in range(n_fold)]
        preds_df = pd.DataFrame(data=preds_list)
        preds_df = preds_df.T
        preds_df.columns = preds_columns
        preds = list(preds_df.mean(axis=1))
        return preds, oof
    else:
        lgb_model = lgb.LGBMRegressor(**lgb_params)
        lgb_model = lgb_model.fit(train_x, train_y, eval_metric='mse')
        preds = lgb_model.predict(test_x)
        oof = lgb_model.predict(train_x)
        return preds, oof
def __init__(self, task_type, module_type, compute_task, **params):
    """Configure a CatBoost model wrapper.

    :param task_type: "cla" (classification) or "reg" (regression)
    :param module_type: performance profile — "balance", "debug" or
        "performance"; controls the thread count used for training
    :param compute_task: "GPU" or "CPU"
    :param params: CatBoost hyperparameters, looked up via params.get(...);
        keys not listed below are silently ignored
    """
    assert task_type in ["cla", "reg"]  # two task kinds
    assert module_type in ["balance", "debug", "performance"]  # three profiles
    assert compute_task in ["GPU", "CPU"]
    self.task_type = task_type
    self.module_type = module_type
    if self.module_type == "debug":
        params["thread_count"] = 1
    elif self.module_type == "performance":  # performance profile
        params["thread_count"] = cpu_count()  # number of CPU cores
    else:  # balanced profile
        params["thread_count"] = cpu_count() // 2
    # (The long commented-out CatBoost parameter reference that was here is
    # condensed: it enumerated the common options — learning_rate/eta, depth,
    # l2_leaf_reg, n_estimators, one_hot_max_size, loss_function,
    # custom_metric, eval_metric, nan_mode, leaf_estimation_method,
    # random_seed — plus performance knobs (thread_count, used_ram_limit,
    # gpu_ram_part) and device settings (task_type, devices).)
    self.compute_task = compute_task
    if self.compute_task == "gpu":
        # NOTE(review): the assert above only admits "GPU"/"CPU", so this
        # lowercase "gpu" check can never be true — the GPU branch is dead
        # and task_type is always forced to "CPU". Looks like a bug; confirm.
        params["task_type"] = "GPU"
    else:
        params["task_type"] = "CPU"
    if self.task_type == "reg":  # regression task
        """ # 使用相关的成本函数, RMSE, MultiRMSE, MAE, Quantile, LogLinQuantile, Poisson, MAPE, Lq or custom objective object" """
        self.model = cb.CatBoostRegressor(
            iterations=None,
            # NOTE(review): key "leaning_rate" is a typo — a caller passing
            # the correctly spelled "learning_rate" is silently ignored.
            learning_rate=params.get("leaning_rate", None),
            depth=params.get("depth", None),  # tree depth
            l2_leaf_reg=params.get("l2_leaf_reg", None),  # L2 regularization
            model_size_reg=params.get("model_size_reg", None),
            # NOTE(review): key "rms" is a typo for "rsm" — confirm.
            rsm=params.get("rms", None),
            loss_function=params.get("loss_function", 'RMSE'),
            border_count=params.get("border_count", None),
            feature_border_type=params.get("feature_border_type", None),
            per_float_feature_quantization=params.get(
                "per_float_feature_quantization", None),
            input_borders=params.get("input_borders", None),
            output_borders=params.get("output_borders", None),
            fold_permutation_block=params.get("fold_permutation_block", None),
            od_pval=params.get("od_pval", None),
            od_wait=params.get("od_wait", None),
            od_type=params.get("od_type", None),
            nan_mode=params.get("nan_mode", None),
            counter_calc_method=params.get("counter_calc_method", None),
            leaf_estimation_iterations=params.get(
                "leaf_estimation_iterations", None),
            leaf_estimation_method=params.get("leaf_estimation_method", None),
            thread_count=params.get("thread_count", None),
            random_seed=params.get("random_seed", None),
            use_best_model=params.get("use_best_model", None),
            best_model_min_trees=params.get("best_model_min_trees", None),
            verbose=params.get("verbose", None),
            silent=params.get("silent", None),
            logging_level=params.get("logging_level", None),
            metric_period=params.get("metric_period", None),
            ctr_leaf_count_limit=params.get("ctr_leaf_count_limit", None),
            store_all_simple_ctr=params.get("store_all_simple_ctr", None),
            max_ctr_complexity=params.get("max_ctr_complexity", None),
            has_time=params.get("has_time", None),
            allow_const_label=params.get("allow_const_label", None),
            one_hot_max_size=params.get("one_hot_max_size", None),
            random_strength=params.get("random_strength", None),
            name=params.get("name", None),
            ignored_features=params.get("ignored_features", None),
            train_dir=params.get("train_dir", None),
            custom_metric=params.get("custom_metric", None),
            eval_metric=params.get("eval_metric", None),
            bagging_temperature=params.get("bagging_temperature", None),
            save_snapshot=params.get("save_snapshot", None),
            snapshot_file=params.get("snapshot_file", None),
            snapshot_interval=params.get("snapshot_interval", None),
            fold_len_multiplier=params.get("fold_len_multiplier", None),
            used_ram_limit=params.get("used_ram_limit", None),
            gpu_ram_part=params.get("gpu_ram_part", None),
            pinned_memory_size=params.get("pinned_memory_size", None),
            allow_writing_files=params.get("allow_writing_files", None),
            final_ctr_computation_mode=params.get(
                "final_ctr_computation_mode", None),
            # NOTE(review): reads the "final_ctr_computation_mode" key again —
            # almost certainly meant "approx_on_full_history"; confirm.
            approx_on_full_history=params.get("final_ctr_computation_mode",
                                              None),
            boosting_type=params.get("boosting_type", None),
            simple_ctr=params.get("simple_ctr", None),
            combinations_ctr=params.get("combinations_ctr", None),
            per_feature_ctr=params.get("per_feature_ctr", None),
            ctr_target_border_count=params.get("ctr_target_border_count",
                                               None),
            task_type=params.get("task_type", None),  # CPU or GPU
            device_config=params.get("device_config", None),
            devices=params.get("devices", None),  # GPU device ids
            bootstrap_type=params.get("bootstrap_type", None),
            subsample=params.get("subsample", None),
            sampling_unit=params.get("sampling_unit", None),
            dev_score_calc_obj_block_size=params.get(
                "dev_score_calc_obj_block_size", None),
            max_depth=params.get("max_depth", None),  # alias of depth
            n_estimators=params.get("n_estimators", None),  # tree count
            # aliases of n_estimators:
            num_boost_round=params.get("num_boost_round", None),
            num_trees=params.get("num_trees", None),
            colsample_bylevel=params.get("colsample_bylevel", None),
            random_state=params.get("random_state", None),  # random seed alias
            reg_lambda=params.get("reg_lambda", None),  # L2 alias
            objective=params.get("objective", None),
            eta=params.get("eta", None),
            max_bin=params.get("max_bin", None),
            gpu_cat_features_storage=params.get("gpu_cat_features_storage",
                                                None),
            data_partition=params.get("data_partition", None),
            metadata=params.get("metadata", None),
            early_stopping_rounds=params.get("early_stopping_rounds", None),
            cat_features=params.get("cat_features", None),
            grow_policy=params.get("grow_policy", None),
            min_data_in_leaf=params.get("min_data_in_leaf", None),
            min_child_samples=params.get("min_child_samples", None),
            max_leaves=params.get("max_leaves", None),
            num_leaves=params.get("num_leaves", None),
            score_function=params.get("score_function", None),
            leaf_estimation_backtracking=params.get(
                "leaf_estimation_backtracking", None),
            ctr_history_unit=params.get("ctr_history_unit", None),
            monotone_constraints=params.get("monotone_constraints", None),
            feature_weights=params.get("feature_weights", None),
            penalties_coefficient=params.get("penalties_coefficient", None),
            first_feature_use_penalties=params.get(
                "first_feature_use_penalties", None),
            model_shrink_rate=params.get("model_shrink_rate", None),
            model_shrink_mode=params.get("model_shrink_mode", None),
            langevin=params.get("langevin", None),
            diffusion_temperature=params.get("diffusion_temperature", None),
            boost_from_average=params.get("boost_from_average", None))
    else:  # classification task
        self.model = cb.CatBoostClassifier(
            iterations=None,
            # NOTE(review): same "leaning_rate" typo as the regressor branch.
            learning_rate=params.get("leaning_rate", None),
            depth=params.get("depth", None),
            l2_leaf_reg=params.get("l2_leaf_reg", None),
            model_size_reg=params.get("model_size_reg", None),
            # NOTE(review): same "rms" typo as the regressor branch.
            rsm=params.get("rms", None),
            loss_function=params.get("loss_function", None),
            border_count=params.get("border_count", None),
            feature_border_type=params.get("feature_border_type", None),
            per_float_feature_quantization=params.get(
                "per_float_feature_quantization", None),
            input_borders=params.get("input_borders", None),
            output_borders=params.get("output_borders", None),
            fold_permutation_block=params.get("fold_permutation_block", None),
            od_pval=params.get("od_pval", None),
            od_wait=params.get("od_wait", None),
            od_type=params.get("od_type", None),
            nan_mode=params.get("nan_mode", None),
            counter_calc_method=params.get("counter_calc_method", None),
            leaf_estimation_iterations=params.get(
                "leaf_estimation_iterations", None),
            leaf_estimation_method=params.get("leaf_estimation_method", None),
            # -1 means use all CPU cores
            thread_count=params.get("thread_count", None),
            random_seed=params.get("random_seed", None),
            use_best_model=params.get("use_best_model", None),
            # best_model_min_trees=params.get("best_model_min_trees", None),
            verbose=params.get("verbose", None),
            # silent=params.get("silent", None),
            logging_level=params.get("logging_level", None),
            metric_period=params.get("metric_period", None),
            ctr_leaf_count_limit=params.get("ctr_leaf_count_limit", None),
            store_all_simple_ctr=params.get("store_all_simple_ctr", None),
            max_ctr_complexity=params.get("max_ctr_complexity", None),
            has_time=params.get("has_time", None),
            allow_const_label=params.get("allow_const_label", None),
            one_hot_max_size=params.get("one_hot_max_size", None),
            random_strength=params.get("random_strength", None),
            name=params.get("name", None),
            ignored_features=params.get("ignored_features", None),
            train_dir=params.get("train_dir", None),
            custom_loss=params.get("custom_loss", None),
            custom_metric=params.get("custom_metric", None),
            eval_metric=params.get("eval_metric", None),
            bagging_temperature=params.get("bagging_temperature", None),
            save_snapshot=params.get("save_snapshot", None),
            snapshot_file=params.get("snapshot_file", None),
            snapshot_interval=params.get("snapshot_interval", None),
            fold_len_multiplier=params.get("fold_len_multiplier", None),
            # memory cap for CTR computation (performance knob)
            used_ram_limit=params.get("used_ram_limit", None),
            # GPU memory cap (performance knob)
            gpu_ram_part=params.get("gpu_ram_part", None),
            # pinned_memory_size=params.get("pinned_memory_size", None),
            allow_writing_files=params.get("allow_writing_files", None),
            final_ctr_computation_mode=params.get(
                "final_ctr_computation_mode", None),
            # NOTE(review): same wrong-key bug as the regressor branch.
            approx_on_full_history=params.get("final_ctr_computation_mode",
                                              None),
            boosting_type=params.get("boosting_type", None),
            simple_ctr=params.get("simple_ctr", None),
            combinations_ctr=params.get("combinations_ctr", None),
            per_feature_ctr=params.get("per_feature_ctr", None),
            # ctr_target_border_count=params.get("ctr_target_border_count", None),
            task_type=params.get("task_type", None),
            device_config=params.get("device_config", None),
            devices=params.get("devices", None),
            bootstrap_type=params.get("bootstrap_type", None),
            subsample=params.get("subsample", None),
            sampling_unit=params.get("sampling_unit", None),
            dev_score_calc_obj_block_size=params.get(
                "dev_score_calc_obj_block_size", None),
            max_depth=params.get("max_depth", None),
            n_estimators=params.get("n_estimators", None),
            num_boost_round=params.get("num_boost_round", None),
            num_trees=params.get("num_trees", None),
            colsample_bylevel=params.get("colsample_bylevel", None),
            random_state=params.get("random_state", None),
            # L2 regularization alias (== l2_leaf_reg)
            reg_lambda=params.get("reg_lambda", None),
            objective=params.get("objective", None),
            eta=params.get("eta", None),  # learning-rate alias
            max_bin=params.get("max_bin", None),
            scale_pos_weight=params.get("scale_pos_weight", None),
            gpu_cat_features_storage=params.get("gpu_cat_features_storage",
                                                None),
            data_partition=params.get("data_partition", None),
            metadata=params.get("metadata", None),
            early_stopping_rounds=params.get("early_stopping_rounds", None),
            cat_features=params.get("cat_features", None),
            grow_policy=params.get("grow_policy", None),
            min_data_in_leaf=params.get("min_data_in_leaf", None),
            min_child_samples=params.get("min_child_samples", None),
            max_leaves=params.get("max_leaves", None),
            num_leaves=params.get("num_leaves", None),
            score_function=params.get("score_function", None),
            leaf_estimation_backtracking=params.get(
                "leaf_estimation_backtracking", None),
            ctr_history_unit=params.get("ctr_history_unit", None),
            monotone_constraints=params.get("monotone_constraints", None),
            feature_weights=params.get("feature_weights", None),
            penalties_coefficient=params.get("penalties_coefficient", None),
            first_feature_use_penalties=params.get(
                "first_feature_use_penalties", None),
            model_shrink_rate=params.get("model_shrink_rate", None),
            model_shrink_mode=params.get("model_shrink_mode", None),
            langevin=params.get("langevin", None),
            diffusion_temperature=params.get("diffusion_temperature", None),
            boost_from_average=params.get("boost_from_average", None),
            text_features=params.get("text_features", None),
            tokenizers=params.get("tokenizers", None),
            dictionaries=params.get("dictionaries", None),
            feature_calcers=params.get("feature_calcers", None),
            text_processing=params.get("text_processing", None))
def main(problem_type, data_type, data_sample, user_type, param_index):
    """Run a 4-fold CatBoost experiment and return its averaged metrics.

    Selects a train/test/val split by ``user_type``, a feature subset by
    ``data_type``, then trains either a classifier ('cls') or a pair of
    regressors ('reg') per fold and evaluates on the held-out validation set.

    :param problem_type: 'cls' (binary valence classification) or 'reg'
        (two regressors: positive and negative valence).
    :param data_type: 'fitbit' or 'all' — which feature columns to keep.
    :param data_sample: sampling strategy forwarded to ``data_process``
        (classification branch only).
    :param user_type: 'new' (group split) or 'cur' (stratified split).
    :param param_index: index into the module-level ``params`` list.
    :return: np.array of [problem_type, data_type, data_sample, user_type,
        param_index] followed by the fold-averaged evaluation scores.
    :raises ValueError: on an unrecognized ``user_type`` or ``data_type``.
    """
    # NOTE(review): this mutates the shared dict inside the module-level
    # `params` list (loss_function/eval_metric keys below) — confirm callers
    # do not rely on it being untouched between invocations.
    param = params[param_index]  # Hyper params
    results = []
    # Pick the pre-built split for the requested user cohort.
    # (X_group_* / X_stratified_* etc. are module-level globals.)
    if user_type == 'new':
        X_train = X_group_train
        y_train = y_group_train
        X_test = X_group_test
        y_test = y_group_test
        X_val = X_group_val
        y_val = y_group_val
    elif user_type == 'cur':
        X_train = X_stratified_train
        y_train = y_stratified_train
        X_test = X_stratified_test
        y_test = y_stratified_test
        X_val = X_stratified_val
        y_val = y_stratified_val
    else:
        raise ValueError('Invalid value for <user_type>')
    # Restrict to the requested feature subset.
    if data_type == 'fitbit':
        X_train = X_train[fitbit_features]
        X_test = X_test[fitbit_features]
        X_val = X_val[fitbit_features]
    elif data_type == 'all':
        X_train = X_train[all_features]
        X_test = X_test[all_features]
        X_val = X_val[all_features]
    else:
        raise ValueError('Invalid value for <data_type>')
    if problem_type == 'cls':
        param['loss_function'] = 'Logloss'
        param['eval_metric'] = 'F1'
        # Re-pool train+test; folds are re-cut below.
        X = pd.concat([X_train, X_test])
        y = pd.concat([y_train, y_test])
        group = y['subject'].values
        # Split K Fold: group-wise for unseen users, stratified by subject
        # for current users.
        if user_type == 'new':
            kfold = GroupKFold(n_splits=4).split(X, y, group)
        elif user_type == 'cur':
            kfold = StratifiedKFold(n_splits=4, random_state=27,
                                    shuffle=True).split(X, group)
        # Training
        for train_index, test_index in kfold:
            # Per-fold split; these names deliberately shadow the outer
            # X_train/X_test selected above.
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]
            y_train = y_train['valence_relative_bin']
            y_test = y_test['valence_relative_bin']
            # pdb.set_trace()
            # Data Sampling (over/under-sampling etc. per `data_sample`)
            X_train, y_train, param = data_process(data_sample, X_train,
                                                   y_train, param)
            # CatBoost
            cat = catboost.CatBoostClassifier(**param)
            cat.fit(X_train, y_train, eval_set = (X_test, y_test),
                    use_best_model = True, verbose = False)
            y_pred = cat.predict(X_val)
            # pdb.set_trace()
            cat_res = eval_class(y_val['valence_relative_bin'], y_pred)
            # Pad with two zeros so rows align with the regression branch,
            # which appends extra regression metrics.
            results.append(cat_res +[0,0])
            # feature_col = X_val.columns.values
            # fi_score = catboost_feature_importance(cat, feature_col)
            # fi_score.to_csv()
    elif problem_type == 'reg':
        # Classification-only option; harmless no-op if absent.
        param.pop('class_weights', None)
        param['loss_function'] = 'RMSE'
        param['eval_metric'] = 'RMSE'
        X = pd.concat([X_train, X_test])
        y = pd.concat([y_train, y_test])
        group = y['subject'].values
        # Split K Fold (same scheme as the classification branch)
        if user_type == 'new':
            kfold = GroupKFold(n_splits=4).split(X, y, group)
        elif user_type == 'cur':
            kfold = StratifiedKFold(n_splits=4, random_state=27,
                                    shuffle=True).split(X, group)
        # Training
        for train_index, test_index in kfold:
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]
            # Two model, one for positive valence, one for negative valence
            # NOTE(review): the four y_*_train/test locals below are unused —
            # the fit calls index y_train/y_test directly.
            y_p_train = y_train['valence_p']
            y_p_test = y_test['valence_p']
            y_n_train = y_train['valence_n']
            y_n_test = y_test['valence_n']
            # Regression problem no data sampling technique
            # CatBoost
            model_p = catboost.CatBoostRegressor(**param)
            model_p.fit(X_train, y_train['valence_p'],
                        eval_set = (X_test, y_test['valence_p']),
                        use_best_model = True, verbose = False)
            p_regr = model_p.predict(X_val)
            model_n = catboost.CatBoostRegressor(**param)
            model_n.fit(X_train, y_train['valence_n'],
                        eval_set = (X_test, y_test['valence_n']),
                        use_best_model = True, verbose = False)
            n_regr = model_n.predict(X_val)
            # Combine the two predictions relative to the subject's median
            # valence score.
            y_pred = n_regr - p_regr - y_val['valence_score_median']
            y_reg = y_val['valence_relative']
            # print(model_regr)
            # Binarize the combined regression output for classification-style
            # evaluation alongside the raw regression metrics.
            y_pred_cls = [1 if x >= 0 else 0 for x in y_pred]
            y_reg_cls = y_val['valence_relative_bin']
            # pdb.set_trace()
            cat_res = eval_class(y_reg_cls, y_pred_cls)
            reg_eval = eval_reg(y_reg, y_pred)
            results.append(cat_res + reg_eval)
    # Average metrics across the 4 folds and prepend the run configuration.
    results = np.mean(results, axis=0)
    training_params = [problem_type, data_type, data_sample, user_type,
                       param_index]
    return np.append(training_params, results)
def read_yaml(path): with open(path, "r") as f: return yaml.safe_load(f) MODEL_PARAMS = {"allow_writing_files": False, "iterations": 10} @pytest.fixture( scope="module", params=[ cb.CatBoost(MODEL_PARAMS), cb.CatBoostClassifier(**MODEL_PARAMS), cb.CatBoostRegressor(**MODEL_PARAMS), ], ids=["CatBoost", "CatBoostClassifier", "CatBoostRegressor"], ) def cb_model(request): model = request.param X, y = get_iris() return ModelWithData(model=model.fit(X, y), inference_dataframe=X) @pytest.fixture def reg_model(): model = cb.CatBoostRegressor(**MODEL_PARAMS) X, y = get_iris() return ModelWithData(model=model.fit(X, y), inference_dataframe=X)
'metric': {'mae'}, 'num_leaves': 256, 'min_sum_hessian_in_leaf': 20, 'max_depth': -12, 'learning_rate': 0.05, 'feature_fraction': 0.6, # 'bagging_fraction': 0.9, # 'bagging_freq': 3, 'verbose': 1 } print('Start training...') # train cat = cb.CatBoostRegressor(iterations=args.round, learning_rate=0.03, depth=8, l2_leaf_reg=3, rsm=1, loss_function='Logloss').fit( train_data[features], labels) test_data.loc[:, "delivery_duration_prd"] = cat.predict(test_data[features]) print mean_absolute_error(test_data["delivery_duration"], test_data["delivery_duration_prd"]) # rs = order_id.merge(test_data[["order_id", "delivery_duration"]], left_on="order_id", right_on="order_id", # how="left") # # rs.to_csv(args.out_path, header=['order_id', 'delivery_duration'], index=False)
def reg_model(): model = cb.CatBoostRegressor(**MODEL_PARAMS) X, y = get_iris() return ModelWithData(model=model.fit(X, y), inference_dataframe=X)
LabelBinarizer()]), ], df_out=True) Z_train = mapper.fit_transform(X_train) Z_test = mapper.transform(X_test) # # GridSearchCV to find best params for the pipe # params = { # 'iterations': [100,500], # 'learning_rate': [0.1,0.3,0.7], # 'depth': [4, 10], # } # grid = GridSearchCV(model, params, cv=3, n_jobs=-1, verbose=1) # grid.fit(Z_train, y_train) # print(grid.best_score_) # print(grid.best_params_) # CatBoostRegressor using the best params found above^ model = cb.CatBoostRegressor(depth=10, iterations=500, learning_rate=0.3) model.fit(Z_train, y_train) print(model.score(Z_train, y_train)) print(model.score(Z_test, y_test)) # pipe and pickle pipe = make_pipeline(mapper, model) pipe.fit(X_train, y_train) pipe.score(X_test, y_test) dill.dump(pipe, open('pipe.pkl', 'wb'))
silent=True, subsample=0.8, colsample_bytree=0.7, colsample_bylevel=0.5) Kfolder.validate(train, test, features, xgbmodel, name="xgbfinal", prepare_stacking=True) catmodel = cat.CatBoostRegressor(iterations=10000, learning_rate=0.01, depth=5, eval_metric='RMSE', colsample_bylevel=0.8, bagging_temperature=0.2, metric_period=None, early_stopping_rounds=200, random_seed=random_seed) Kfolder.validate(train, test, features, catmodel, name="catfinal", prepare_stacking=True, fit_params={ "use_best_model": True, "verbose": 100 })
reg_lambda=5, max_depth=7, n_estimators=10000, subsample=0.7, colsample_bytree=0.4, subsample_freq=2, min_child_samples=10, learning_rate=0.1, random_state=2019) cbt_attr_model = cbt.CatBoostRegressor( num_leaves=31, # reg_lambda=5, max_depth=7, n_estimators=10000, # subsample=0.7, # min_child_samples=10, learning_rate=0.1, random_state=2, eval_metric='MAE', task_type='GPU') ##组合需要的数据,方便训练 #gpr = GaussianProcessRegressor() tr_len = len(X_train) data1 = pd.concat((X_train, X_test), axis=0, ignore_index=True) features = list(data1.columns) ffff = [ 'Attribute1', 'Attribute10', 'Attribute2', 'Attribute3', 'Attribute4', 'Attribute5', 'Attribute6', 'Attribute7', 'Attribute8', 'Attribute9',
booster='gbtree', objective='reg:linear', eval_metric='rmse', learning_rate=0.01), train_type=args.train_type, use_valid=True, debug=args.debug) model.load_params("parames/xgboost_regressor_default.yml") elif (args.regressor == "lightgbm"): model = LightGBMRegressor(model=lgb.LGBMRegressor( objective='regression', metric='rmse'), train_type=args.train_type, use_valid=True, debug=args.debug) elif (args.regressor == "catboost"): model = CatBoostRegressor(model=catboost.CatBoostRegressor(), use_valid=True, debug=args.debug) # モデルのパラメータ設定 if not (args.params_file == ""): model.set_params(args.params_file) #-------------------- # モデルの学習処理 #-------------------- model.fit(X_train_fold, y_train_fold, X_valid_fold, y_valid_fold) #-------------------- # モデルの推論処理 #--------------------
import datetime import logging import emoji import json import os # Enable logging. logging.basicConfig(format='%(asctime)s - %(name)s - %(levelname)s - \ %(message)s', level=logging.INFO) logger = logging.getLogger(__name__) logger.setLevel(logging.DEBUG) weather = Weather(unit=Unit.CELSIUS) model = catboost.CatBoostRegressor().load_model('model_fire_pred.uu') FIRE = emoji.emojize("Fire :fire:", use_aliases=True) SHARING_LOCATION = "Would you mind sharing your location with me?" SEND_LOCATION = emoji.emojize("Send location :round_pushpin:", use_aliases=True) GREETINGS = 'Hi User! The purpose of this bot is to monitor fires around you.' NEAREST_FIRE = emoji.emojize("Where's nearest fire? :eyes:", use_aliases=True) SEE_FIRE = emoji.emojize("I see fire! :scream:", use_aliases=True) WHAT_NEXT = emoji.emojize("What you want to do next? :point_down:", use_aliases=True) VISUAL = "In visual range" FAR = "It's far from here" IMAGE_FILE = 'ferry.png' PHONENUMBER = '8 (800) 100-94-00' THANKYOU = 'Thank you for your contribution in firefighting!'
'seed': 66, # 'nthread':12 } params['silent'] = 1 watchlist = [(xgb_train, 'train'), (xgb_eval, 'eval')] xgb_model = xgb.train(params, xgb_train, 5000, watchlist, early_stopping_rounds=40,verbose_eval = 40) train_model_pred['xgb_pred'].iloc[test_index] += xgb_model.predict(xgb_eval) test_model_pred['xgb_pred'] += xgb_model.predict(xgb_test) print('开始cb训练...') train_pool = Pool(train_feat[predictors].iloc[train_index], train_feat['loan_sum'].iloc[train_index]) eval_pool = Pool(train_feat[predictors].iloc[test_index], train_feat['loan_sum'].iloc[test_index]) test_pool = Pool(test_feat[predictors]) cb_model = cb.CatBoostRegressor(iterations=400, depth=7, learning_rate=0.06, eval_metric='RMSE', od_type='Iter', od_wait=20, random_seed=42, thread_count=7, bagging_temperature=0.85, rsm=0.85, verbose=False) cb_model.fit(train_pool) train_model_pred['cb_pred'].iloc[test_index] += cb_model.predict(eval_pool) test_model_pred['cb_pred'] += cb_model.predict(test_pool) test_model_pred = test_model_pred/5 from sklearn.linear_model import LinearRegression lr = LinearRegression() lr.fit(train_model_pred, train_feat.loan_sum) # import pickle # pickle.dump(lr,open('lr.model','wb+'))
def ctb_reg(self, para): reg = ctb.CatBoostRegressor(**para['reg_params']) return self.train_reg(reg, para)
def rgr_boost(data_root, out_csv_file, delta=True): """ # 去除邮编 train_accuracy:0.457 test_accuracy:0.435 a)保留z1 train_accuracy:0.454 test_accuracy:0.436 # 增加标准差 train_accuracy:0.457 test_accuracy:0.436 # 去除userId,movie_id train_accuracy:0.455 test_accuracy:0.431 :param data_root: :param out_csv_file: :param delta: :return: """ x, y, test, origin_test = load_data(data_root) x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.1, random_state=10) cat_index = [ i for i, col in enumerate(x.columns) if col in [ 'useGender', 'useOccupation', 'z1', 'z2', 'z3', 'userId', 'movie_id', 'year', 'month', 'week', 'w_year', 'w_month', 'w_week' ] ] text_indx = [ i for i, col in enumerate(x.columns) if col in ['movie_title'] ] model = cb.CatBoostRegressor(iterations=1000, learning_rate=0.1, od_type="Iter", l2_leaf_reg=3, model_size_reg=3, depth=10, cat_features=cat_index) model.fit(x_train, y_train, eval_set=(x_test, y_test)) print("train_accuracy:{:.3f} \n" " test_accuracy:{:.3f}".format( accuracy(model, x_train, y_train, delta), accuracy(model, x_test, y_test, delta))) save_feature_importance(model, os.path.join(data_root, 'feature_rgr.png')) origin_test['score'] = predict(model, test, delta).round().astype('int32') origin_test[['userId', 'movie_id', 'time', 'score']].to_csv(os.path.join(data_root, out_csv_file), sep=',', index=False, header=None)
folds_item_ids = json.load(infile) fit_scores = {} val_scores = {} def rmse(y_true, y_pred): return metrics.mean_squared_error(y_true, y_pred)**0.5 model = catboost.CatBoostRegressor(iterations=2000, learning_rate=0.5, max_depth=6, use_best_model=True, loss_function='RMSE', eval_metric='RMSE', od_type='Iter', od_wait=20, logging_level='Verbose', random_seed=42, boosting_type='Plain', one_hot_max_size=50) for i in folds_item_ids.keys(): # Determine train and val folds fit_mask = X_train['item_id'].isin(folds_item_ids[i]['fit']) val_mask = X_train['item_id'].isin(folds_item_ids[i]['val']) X_fit = X_train[fit_mask].drop('item_id', axis='columns') y_fit = y_train[fit_mask] X_val = X_train[val_mask].drop('item_id', axis='columns') y_val = y_train[val_mask]
ab_id = [] # 设置样本权重 data['temp_label'] = data['score'] # 这里设置为None 而不是删除该数据,因为删除的话,线下一定是提升的,对于线上而言,异常数据依旧存在,所以应该关注在训练集无异常,而测试集有异常下的处理效果 data['sample_weight'] = data['temp_label'] + 200 data['sample_weight'] = data['sample_weight'] / data['sample_weight'].mean() # 方案1 ,不训练 data.loc[data.id.isin(ab_id), 'temp_label'] = None # 方案2,样本权重设置低一点 data.loc[data.id.isin(ab_id), 'sample_weight'] = 0.01 # 感谢大佬分享的参数 ctb_params = { 'n_estimators': 20000, 'learning_rate': 0.01, 'random_seed': 4590, 'reg_lambda': 0.08, 'subsample': 0.7, 'bootstrap_type': 'Bernoulli', 'boosting_type': 'Plain', 'one_hot_max_size': 10, 'rsm': 0.5, 'leaf_estimation_iterations': 5, 'use_best_model': True, 'max_depth': 6, 'verbose': -1, 'thread_count': 4 } ctb_model = ctb.CatBoostRegressor(**ctb_params)
train['客厅总面积'] = ws_s test_wss = list(copy.deepcopy(test['客厅均面积'])) test_ws = list(copy.deepcopy(test['厅的数量'])) ws_s = [] num = len(test_wss) for i in range(num): temp = test_wss[i] * test_ws[i] ws_s.append(temp) test['客厅总面积'] = ws_s del train_wss, train_ws, ws_s, num, i, temp, test_wss, test_ws test = test.sort_values(by=['id'], ascending=(True)) test_id = list(copy.deepcopy(test['id'])) test.drop('id', axis=1, inplace=True) train_label = list(copy.deepcopy(train['月租金'])) train.drop('月租金', axis=1, inplace=True) train_pool = Pool(train, train_label, cat_features=None) test_pool = Pool(test, cat_features=None) cb_model = cb.CatBoostRegressor(depth=11, learning_rate=0.11, iterations=2729, l2_leaf_reg=0.1, model_size_reg=2, loss_function='RMSE') cb_model.fit(train_pool, verbose=True) preds = cb_model.predict(test_pool) test_lgb = pd.DataFrame({'id': test_id, 'price': preds}) test_lgb.to_csv('./result/catboost.csv', index=False)
test, features, lgbmodel, name="lgbfinal", prepare_stacking=True, fit_params={ "early_stopping_rounds": 500, "eval_metric": "rmse" }) lgbmodel.FI.mean(axis=1).sort_values()[180:250].plot( kind="barh", title="Features Importance", figsize=(10, 10)) catmodel = cat.CatBoostRegressor(iterations=10000, learning_rate=0.01, depth=5, loss_function="RMSE", boost_from_average=True, colsample_bylevel=0.8, bagging_temperature=0.2, metric_period=None, random_seed=random_seed) Kfolder.validate(train, test, features, catmodel, name="catfinal", prepare_stacking=True, fit_params={ "early_stopping_rounds": 500, "use_best_model": True }) train['Revenue_lgb'] = train["lgbfinal"]
import numpy as np, shap, catboost from sklearn.model_selection import train_test_split from sklearn.datasets import load_boston X, y = shap.datasets.boston() X_train, X_test, y_train, y_test = train_test_split(X, y) reg = catboost.CatBoostRegressor(iterations=1000, learning_rate=0.25, silent=True) reg.fit(X_train, y_train, eval_set=(X_test, y_test)) explainer = shap.TreeExplainer(reg) shap_values = explainer.shap_values(X) print(load_boston().DESCR) shap.summary_plot(shap_values, X, plot_type="bar") n = np.random.randint(0, 506) print(y[n]) shap.force_plot(explainer.expected_value, shap_values[n], X.iloc[n], matplotlib=True)