Example #1
def ctb_model(train_df, test_df, params):
    NFOLDS = 5
    train_label = train_df['信用分']
    kfold = KFold(n_splits=NFOLDS, shuffle=True, random_state=2019)  # random_state only takes effect with shuffle=True
    kf = kfold.split(train_df, train_label)

    train = train_df.drop(['用户编码', '信用分'], axis=1)

    id_target = pd.DataFrame()
    id_target['id'] = train_df['用户编码']
    id_target['target'] = train_df['信用分']

    test = test_df.drop(['用户编码'], axis=1)

    cv_pred = np.zeros(test.shape[0])
    valid_best_l2_all = 0
    valid_best_l2_all_list = []
    models = []
    count = 0
    val_pred_all = pd.DataFrame()
    for i, (train_fold, validate) in enumerate(kf):
        print("model: cgb_mae. fold: ", i, "training...")

        val_id_target = id_target.iloc[validate]
        X_train, label_train = train.iloc[train_fold], train_label.iloc[
            train_fold]
        X_validate, label_validate = train.iloc[validate], train_label.iloc[
            validate]

        cat = ctb.CatBoostRegressor(**params)
        bst = cat.fit(X_train,
                      label_train,
                      eval_set=[(X_train, label_train),
                                (X_validate, label_validate)],
                      early_stopping_rounds=2000,
                      verbose=1000)

        val_pred = pd.DataFrame()
        val_pred['id'] = val_id_target['id']
        val_pred['index'] = X_validate.index
        val_pred['target'] = label_validate
        val_pred['score'] = bst.predict(X_validate)
        val_pred_all = pd.concat([val_pred_all, val_pred],
                                 axis=0,
                                 ignore_index=True)

        cv_pred += bst.predict(test)
        valid_best_l2_all += mean_absolute_error(y_true=val_pred['target'],
                                                 y_pred=val_pred['score'])
        valid_best_l2_all_list.append(
            mean_absolute_error(y_true=val_pred['target'],
                                y_pred=val_pred['score']))
        count += 1
        models.append(bst)

    cv_pred /= NFOLDS
    valid_best_l2_all /= NFOLDS
    mae_score = 1 / (1 + valid_best_l2_all)
    print("cgb_mae cv score for valid is: ", mae_score)

    # print("----------------------------------------")
    # print("----------------------------------------")
    # print("xgb_mae  feature importance:")
    #    fea_importances = pd.DataFrame({
    #         'column': train.columns,
    #         'importance': bst.feature_importance
    #     }).sort_values(by='importance', ascending=False)
    #    print(fea_importances)
    # print("----------------------------------------")
    # print("----------------------------------------")

    return val_pred_all, cv_pred, mae_score
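
A runnable sketch of how ctb_model might be driven. The imports match what the function body uses; the frames are synthetic stand-ins for the competition data (the column names '用户编码' and '信用分' come from the snippet, everything else is made up):

import numpy as np
import pandas as pd
import catboost as ctb
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error

# Synthetic placeholder data with the column layout ctb_model expects.
rng = np.random.RandomState(0)
train_df = pd.DataFrame({'用户编码': range(100),
                         '信用分': rng.randint(400, 700, 100),
                         'f0': rng.rand(100), 'f1': rng.rand(100)})
test_df = pd.DataFrame({'用户编码': range(100, 120),
                        'f0': rng.rand(20), 'f1': rng.rand(20)})

params = {'loss_function': 'MAE', 'iterations': 200, 'learning_rate': 0.1}
val_pred_all, cv_pred, mae_score = ctb_model(train_df, test_df, params)
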
Example #2
i = 0
for train_index, valid_index in kf.split(train,
                                         train[target].astype(int).values):
    print("\nFold {}".format(i + 1))
    X_train, label_train = train.iloc[train_index][feature_name], train.iloc[
        train_index][target].astype(int).values
    X_valid, label_valid = train.iloc[valid_index][feature_name], train.iloc[
        valid_index][target].astype(int).values

    clf = cbt.CatBoostRegressor(
        iterations=ITERATIONS,
        learning_rate=0.1,
        depth=10,
        l2_leaf_reg=10,
        loss_function='RMSE',
        eval_metric="RMSE",
        task_type='GPU',
        devices="0:1",
        simple_ctr='FeatureFreq',
        combinations_ctr='FeatureFreq',
    )
    clf.fit(X_train,
            label_train,
            eval_set=[(X_valid, label_valid)],
            early_stopping_rounds=EARLY_STOP,
            verbose=VERBOSE * 10)
    x1 = clf.predict(X_valid)
    y1 = clf.predict(test[feature_name])

    clf = xgb.XGBRegressor(learning_rate=0.1,
                           max_depth=7,
Example #3
def load_model(self):
    self.clf = catboost.CatBoostRegressor(**self.set_hyperparameters)
Example #4
    order_id = pd.DataFrame(np.unique(test_data.order_id), columns=["order_id"])

    train_data.fillna(0, inplace=True)
    test_data.fillna(0, inplace=True)

    labels = train_data[['delivery_duration']].values.astype(np.float32).flatten()

    train_pool = Pool(train_data[features], label=labels)
    test_pool = Pool(test_data[features])


    print('Start training...')
    # train
    cat = cb.CatBoostRegressor(iterations=args.round,
                               learning_rate=100,
                               depth=12,
                               l2_leaf_reg=3,
                               rsm=1,
                               verbose=True,
                               eval_metric="MAE",
                               loss_function='MAE').fit(train_pool)

    print(cat.get_feature_importance(train_pool))
    test_data.loc[:, "delivery_duration"] = cat.predict(test_pool)

    # print mean_absolute_error(test_data["delivery_duration"], test_data["delivery_duration_prd"])
    rs = order_id.merge(test_data[["order_id", "delivery_duration"]], left_on="order_id", right_on="order_id",
                        how="left")

    rs.to_csv(args.out_path, header=['order_id', 'delivery_duration'], index=False)
Example #5
     'ridge',
     'limegreen',
 ),
 'SVR RBF': (
     SVR(),
     'svr',
     'darkorange',
 ),
 'SVR Linear': (
     LinearSVR(C=0.08, epsilon=0.06),
     #LinearSVR(C=0.28, epsilon=0.06),
     'lin_svr',
     'purple',
 ),
 'CatBoost': (
     cat.CatBoostRegressor(**cat_params),
     'catboost',
     'dimgrey',
 ),
 'Lasso': (
     Lasso(),
     'lasso',
     'blue',
 ),
 'RidgeCV': (
     RidgeCV(alphas=[num / 100 for num in range(5, 500, 5)]),
     'ridgecv',
     'yellow',
 ),
 'SVR Poly': (
     SVR(kernel='poly'),  # C=5.0),
Example #6
                            max_back_quarter=MAX_BACK_QUARTER)

    fc2 = BaseCompanyFeatures(cat_columns=CAT_COLUMNS)

    fc3 = QuarterlyDiffFeatures(columns=QUARTER_COLUMNS,
                                compare_quarter_idxs=COMPARE_QUARTER_IDXS,
                                max_back_quarter=MAX_BACK_QUARTER)

    feature = FeatureMerger(fc1, fc2, on='ticker')
    feature = FeatureMerger(feature, fc3, on=['ticker', 'date'])

    target = QuarterlyDiffTarget(col='marketcap')

    base_models = [
        lgbm.sklearn.LGBMRegressor(),
        ctb.CatBoostRegressor(verbose=False)
    ]

    ensemble = EnsembleModel(base_models=base_models,
                             bagging_fraction=BAGGING_FRACTION,
                             model_cnt=MODEL_CNT)

    model = GroupedOOFModel(ensemble, group_column='ticker', fold_cnt=FOLD_CNT)

    pipeline = BasePipeline(feature=feature,
                            target=target,
                            model=model,
                            metric=median_absolute_relative_error,
                            out_name=OUT_NAME)

    result = pipeline.fit(data_loader, ticker_list)
Example #7
                objective='reg:linear',
                eval_metric='rmse',
                learning_rate=0.01),
                                     train_type=args.train_type,
                                     use_valid=True,
                                     debug=args.debug)
            model.load_params("parames/xgboost_regressor_default.yml")
        elif (args.regressor == "lightgbm"):
            model = LightGBMRegressor(model=lgb.LGBMRegressor(
                objective='regression', metric='rmse'),
                                      train_type=args.train_type,
                                      use_valid=True,
                                      debug=args.debug)
        elif (args.regressor == "catboost"):
            model = CatBoostRegressor(
                model=catboost.CatBoostRegressor(random_seed=args.seed + k),
                use_valid=True,
                debug=args.debug)

        # Set model parameters
        if not (args.params_file == ""):
            model.set_params(args.params_file)

        #--------------------
        # Train the model
        #--------------------
        model.fit(X_train_fold, y_train_fold, X_valid_fold, y_valid_fold)

        #--------------------
        # Run inference with the model
        #--------------------
Example #8
    def fit(self,
            train_X,
            train_y,
            valid_X=None,
            valid_y=None,
            auto_split_train_data=True,
            booster_params=None,
            num_boost_round=10,
            learning_rate=0.01,
            nthread=-1,
            eval_metric=None,
            # learning_rates=None, # callback...
            cat_features=None,
            pairs=None,
            pretrained_model=None,
            sample_weight=None,
            group_id=None,
            group_weight=None,
            subgroup_id=None,
            pairs_weight=None,
            baseline=None,
            use_best_model=None,
            verbose=None,
            # verbose_eval=None, # Alias as verbose...
            logging_level=None,
            plot=False,
            column_description=None,
            metric_period=None,
            silent=None,
            random_seed=0,
            early_stopping_rounds=100,
            # save_snapshot=None,
            # snapshot_file=None,
            # snapshot_interval=None,
            inplace_class_model=True,
            autosave_ckpt=True,
            ):

        if booster_params is not None:
            __booster_params = booster_params
        else:
            __booster_params = self.__booster_params

        # Update some training parameters:
        __booster_params['iterations'] = num_boost_round
        __booster_params['eta'] = learning_rate
        __booster_params['random_seed'] = random_seed # Alias: random_state
        __booster_params['thread_count'] = nthread
        __booster_params['eval_metric'] = eval_metric
        __booster_params['logging_level'] = logging_level

        # Split training and validating data to prevent overfitting:
        if valid_X is None and valid_y is None:
            if auto_split_train_data:
                logging.info('Randomly split training data into 70% and 30%.')
                _train_X, _valid_X, _train_y, _valid_y = train_test_split(train_X, train_y, test_size=0.3, random_state=random_seed)
                logging.info('Training data size: {}, validation data size: {}'.format(_train_X.shape, _valid_X.shape))

                # Make CatBoost Pool:
                train_pool = MyCat.make_pool(data=_train_X, label=_train_y, cat_features=cat_features, pairs=pairs)
                valid_pool = MyCat.make_pool(data=_valid_X, label=_valid_y, cat_features=cat_features, pairs=pairs)
                data_for_eval = [valid_pool] # [train_pool, valid_pool]
            else:
                train_pool = MyCat.make_pool(data=train_X, label=train_y, cat_features=cat_features, pairs=pairs)
                data_for_eval = None # [train_pool]
        elif valid_X is not None and valid_y is not None:
            if not isinstance(valid_X, list):
                valid_X = [valid_X]
            if not isinstance(valid_y, list):
                valid_y = [valid_y]

            assert len(valid_X) == len(valid_y), 'Input valid_X and valid_y should have same length.'

            logging.info('Training data size: {}'.format(train_X.shape[0]))
            train_pool = MyCat.make_pool(data=train_X, label=train_y, cat_features=cat_features, pairs=pairs)
            data_for_eval = []

            for i in range(len(valid_X)):
                _valid_pool = MyCat.make_pool(data=valid_X[i], label=valid_y[i], cat_features=cat_features, pairs=pairs)
                data_for_eval.append(_valid_pool)
                logging.info('Validation data {} size: {}'.format(i, valid_X[i].shape))

        # Start training procedure...
        try:
            _training_execution_time = MyCat.tic()
            if self.__application == 'regression':
                if pretrained_model is None:
                    _this_cat = cb.CatBoostRegressor(**__booster_params)
                else:
                    print('Use pretrained.')
                    _this_cat = cb.CatBoostRegressor()
                    _this_cat = _this_cat.load_model(pretrained_model, self.__model_artifact_format)
            elif self.__application == 'classification':
                if pretrained_model is None:
                    _this_cat = cb.CatBoostClassifier(**__booster_params)
                else:
                    print('Use pretrained.')
                    _this_cat = cb.CatBoostClassifier()
                    _this_cat = _this_cat.load_model(pretrained_model, self.__model_artifact_format)
            else:
                raise ValueError('Unknown application type. Should be either classification or regression.')

            with MyCat.timer('Model training'):
                _this_cat.fit(
                    X=train_pool,
                    eval_set=data_for_eval,
                    verbose=verbose,
                    plot=plot,
                    early_stopping_rounds=early_stopping_rounds,
                    silent=silent
                )

                if autosave_ckpt:
                    _ckpt_file = 'CatBoost_model_ckpt_{}'.format(_training_execution_time)
                    _ckpt_file = os.path.join(self.__checkpoint_dir, _ckpt_file)
                    _this_cat.save_model(_ckpt_file, self.__model_artifact_format)
                    logging.info('Saved model artifact to {}'.format(_ckpt_file))

                return _this_cat
        except Exception as e:
            _error_msg = 'Failed in training CatBoost model. Error: {}'.format(e)
            print(_error_msg)
            logging.error(_error_msg)
            raise
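
A hypothetical call of this fit wrapper, assuming booster is an instance of the enclosing class configured for the regression application; the data frames and column names below are placeholders, not from the original code:

model = booster.fit(
    train_X, train_y,
    num_boost_round=500,
    learning_rate=0.05,
    eval_metric='RMSE',
    cat_features=['city', 'category'],  # placeholder categorical columns
    early_stopping_rounds=100,
    verbose=100,
)
preds = model.predict(test_X)  # fit() returns the trained CatBoost model
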
Example #9
    else:
        regression_model = GradientBoostingRegressor(**study.best_params, n_estimators=1000,
                                          validation_fraction=fraction_of_validation_samples, n_iter_no_change=100)
        regression_model.fit(train_x, train_y)
        best_n_estimators = len(regression_model.estimators_)
    regression_model = GradientBoostingRegressor(**study.best_params)

elif method_flag == 14:  # catboost
    if fraction_of_validation_samples == 0:
        best_n_estimators_in_cv = number_of_sub_models
    else:
        # Split only when a validation fraction is requested;
        # train_test_split raises if test_size is 0.
        train_x_tmp, train_x_validation, train_y_tmp, train_y_validation = train_test_split(
            train_x, train_y, test_size=fraction_of_validation_samples, random_state=0)
        regression_model = cat.CatBoostRegressor(n_estimators=500, logging_level='Silent')
        regression_model.fit(train_x_tmp, train_y_tmp,
                             eval_set=[(train_x_validation, train_y_validation)],
                             early_stopping_rounds=30)
        best_n_estimators_in_cv = regression_model.best_iteration_

    def objective(trial):
        param = {
            'depth': trial.suggest_int('depth', 4, 10),
            'learning_rate': trial.suggest_loguniform('learning_rate', 1e-3, 1e0),
            'random_strength': trial.suggest_int('random_strength', 0, 100),
            'bagging_temperature': trial.suggest_loguniform('bagging_temperature', 0.01, 100),
            'od_type': trial.suggest_categorical('od_type', ['IncToDec', 'Iter']),
            'od_wait': trial.suggest_int('od_wait', 10, 50)
        }
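
The snippet cuts off inside objective, but a study built from it would be driven with the standard Optuna loop; study.best_params is what the earlier branches unpack into the final model. A sketch, assuming objective returns a validation error to minimize:

import optuna

study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=100)
print(study.best_params)  # consumed as cat.CatBoostRegressor(**study.best_params, ...)
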
Example #10
    data, features, cate_feat = data_process()
    # cb_model = cb.CatBoostRegressor()

    # lgb_model = lgb.LGBMRegressor(
    #     num_leaves=64, reg_alpha=0., reg_lambda=0.01, metric='rmse',
    #     max_depth=-1, learning_rate=0.05, min_child_samples=10, seed=202011,
    #     n_estimators=2000, subsample=0.7, colsample_bytree=0.7, subsample_freq=1,
    # )

    ctb_model = cb.CatBoostRegressor(
        loss_function="RMSE",
        eval_metric="MAE",
        task_type="CPU",
        learning_rate=0.01,
        iterations=10000,
        random_seed=202011,
        od_type="Iter",
        depth=6,
        early_stopping_rounds=400,
    )

    data, predict_label = get_predict_w(ctb_model,
                                        data,
                                        label='label',
                                        feature=features,
                                        cate_feature=cate_feat,
                                        random_state=202011,
                                        n_splits=10,
                                        model_type='ctb')
Example #11
    subsample=0.67,
    colsample_bytree=0.054,
    colsample_bylevel=0.50)

print("-" * 20 + "User-Level XGBoost Training" + "-" * 20)
Kfolder.validate(user_train,
                 user_test,
                 features,
                 xgbmodel,
                 name="xgbfinal",
                 prepare_stacking=True)
print("-" * 20 + "Done Training" + "-" * 20)

# CatBoost
catmodel = cat.CatBoostRegressor(iterations=500,
                                 learning_rate=0.2,
                                 depth=5,
                                 random_seed=2019)

print("-" * 20 + "User-Level CatBoost Training" + "-" * 20)
Kfolder.validate(user_train,
                 user_test,
                 features,
                 catmodel,
                 name="catfinal",
                 prepare_stacking=True,
                 fit_params={
                     "use_best_model": True,
                     "verbose": 100
                 })
print("-" * 20 + "Done Training" + "-" * 20)
# Ensembling dragons
Example #12
def reg_model(train,
              test,
              label_name,
              model_type,
              numerical_features,
              category_features,
              seed,
              cv=True):
    import lightgbm as lgb
    import catboost as cat  # used below when model_type == 'cat'
    from xgboost import XGBRegressor
    from sklearn.ensemble import RandomForestRegressor
    train.reset_index(inplace=True, drop=True)
    test.reset_index(inplace=True, drop=True)
    if model_type == 'rf':
        train.fillna(0, inplace=True)

    # combine = pd.concat([train, test], axis=0)
    # combine = multi_column_LabelEncoder(combine, category_features, rename=True)
    # combine[category_features] = combine[category_features].astype('category')
    # train = combine[:train.shape[0]]
    # test = combine[train.shape[0]:]

    features = category_features + numerical_features
    train_x = train[features]
    train_y = train[label_name]
    test_x = test[features]
    if cv:
        n_fold = 2
        count_fold = 0
        preds_list = list()
        oof = np.zeros(train_x.shape[0])
        kfolder = KFold(n_splits=n_fold, shuffle=True, random_state=seed)
        kfold = kfolder.split(train_x, train_y)
        for train_index, vali_index in kfold:
            print("training......fold", count_fold)
            count_fold = count_fold + 1
            k_x_train = train_x.loc[train_index]
            k_y_train = train_y.loc[train_index]
            k_x_vali = train_x.loc[vali_index]
            k_y_vali = train_y.loc[vali_index]
            if model_type == 'lgb':
                lgb_model = lgb.LGBMRegressor(**lgb_params)
                if 'sample_weight' in train.columns:
                    lgb_model = lgb_model.fit(
                        k_x_train,
                        k_y_train,
                        eval_set=[(k_x_vali, k_y_vali)],
                        early_stopping_rounds=200,
                        verbose=False,
                        eval_metric="mae",
                        sample_weight=train.loc[train_index]['sample_weight'],
                        categorical_feature=category_features)
                else:
                    lgb_model = lgb_model.fit(
                        k_x_train,
                        k_y_train,
                        eval_set=[(k_x_vali, k_y_vali)],
                        early_stopping_rounds=200,
                        verbose=False,
                        eval_metric="mae",
                        categorical_feature=category_features)
                k_pred = lgb_model.predict(
                    k_x_vali, num_iteration=lgb_model.best_iteration_)
                pred = lgb_model.predict(
                    test_x, num_iteration=lgb_model.best_iteration_)
            elif model_type == 'xgb':
                xgb_model = XGBRegressor(**xgb_params)
                xgb_model = xgb_model.fit(k_x_train,
                                          k_y_train,
                                          eval_set=[(k_x_train, k_y_train),
                                                    (k_x_vali, k_y_vali)],
                                          early_stopping_rounds=200,
                                          verbose=False)
                k_pred = xgb_model.predict(k_x_vali)
                pred = xgb_model.predict(test_x)
            elif model_type == 'rf':
                rf_model = RandomForestRegressor(n_estimators=100,
                                                 max_depth=3,
                                                 criterion="absolute_error",  # "mae" in scikit-learn < 1.0
                                                 n_jobs=-1,
                                                 random_state=2019)
                rf_model.fit(k_x_train, k_y_train)
                k_pred = rf_model.predict(k_x_vali)
                pred = rf_model.predict(test_x)
            elif model_type == 'cat':
                ctb_params = {
                    'n_estimators': 1000,
                    'learning_rate': 0.02,
                    'random_seed': 4590,
                    'reg_lambda': 0.08,
                    'subsample': 0.7,
                    'bootstrap_type': 'Bernoulli',
                    'boosting_type': 'Plain',
                    'one_hot_max_size': 100,
                    'rsm': 0.5,
                    'leaf_estimation_iterations': 5,
                    'use_best_model': True,
                    'max_depth': 5,
                    'verbose': False,  # CatBoost does not accept -1 here
                    'thread_count': 4,
                    'cat_features': category_features
                }

                cat_model = cat.CatBoostRegressor(**ctb_params)
                cat_model.fit(k_x_train,
                              k_y_train,
                              verbose=False,
                              use_best_model=True,
                              eval_set=[(k_x_vali, k_y_vali)])
                k_pred = cat_model.predict(k_x_vali)
                pred = cat_model.predict(test_x)
            preds_list.append(pred)
            oof[vali_index] = k_pred

        # if model_type == 'lgb':
        #     feature_importance_df = pd.DataFrame({
        #         'column': features,
        #         'importance': lgb_model.feature_importances_,
        #     }).sort_values(by='importance')
        #     feature_importance_df.to_csv('feature_importance.csv', index=False, )
        #print(feature_importance_df)

        # plt.figure(figsize=(15, 5))
        # plt.barh(range(len(features)), lgb_model.feature_importances_)
        # plt.bar(range(len(features)), lgb_model.feature_importances_)
        # plt.xticks(range(len(features)), features, rotation=-45, fontsize=14)
        # plt.title('Feature importance', fontsize=14)
        # plt.show()
        # import shap
        # explainer = shap.TreeExplainer(lgb_model)
        # shap_values = explainer.shap_values(train_x)
        # player_explainer = pd.DataFrame()
        # player_explainer['feature'] = features
        # player_explainer['feature_value'] = train_x.iloc[10].values
        # player_explainer['shap_value'] = shap_values[10]
        # print(player_explainer)
        # shap.initjs()
        # aa = shap.force_plot(explainer.expected_value, shap_values[10], train_x.iloc[10])
        # #bb = shap.summary_plot(shap_values, train_x)
        # cc = shap.summary_plot(shap_values, train_x, plot_type="bar")

        #shap.save_html('aa.html', bb)
        preds_columns = ['preds_{id}'.format(id=i) for i in range(n_fold)]
        preds_df = pd.DataFrame(data=preds_list)
        preds_df = preds_df.T
        preds_df.columns = preds_columns
        preds = list(preds_df.mean(axis=1))

        return preds, oof
    else:
        lgb_model = lgb.LGBMRegressor(**lgb_params)
        lgb_model = lgb_model.fit(train_x, train_y, eval_metric='mse')
        preds = lgb_model.predict(test_x)
        oof = lgb_model.predict(train_x)
        return preds, oof
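
Note that reg_model reads lgb_params and xgb_params from the enclosing module scope, so they must exist before calling it with model_type 'lgb' or 'xgb'. A call sketch; all data names and parameter values below are illustrative placeholders:

lgb_params = {'n_estimators': 2000, 'learning_rate': 0.05, 'num_leaves': 63}  # placeholder values
xgb_params = {'n_estimators': 2000, 'learning_rate': 0.05, 'max_depth': 7}    # placeholder values

preds, oof = reg_model(train, test,
                       label_name='label',
                       model_type='cat',
                       numerical_features=num_cols,
                       category_features=cat_cols,
                       seed=2019,
                       cv=True)
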
Example #13
    def __init__(self, task_type, module_type, compute_task, **params):
        """
        :param task_type:   # task type: "cla" (classification) or "reg" (regression)
        :param module_type: # resource mode: "balance", "debug" or "performance"
        :param compute_task: # "GPU" or "CPU"
        :param params:
        """
        assert task_type in ["cla", "reg"]  # two task types
        assert module_type in ["balance", "debug", "performance"]  # three resource modes
        assert compute_task in ["GPU", "CPU"]

        self.task_type = task_type  # "cla" or "reg"
        self.module_type = module_type  # resource mode
        if self.module_type == "debug":
            params["thread_count"] = 1
        elif self.module_type == "performance":  # performance mode
            params["thread_count"] = cpu_count()  # number of CPU cores
        else:  # balanced mode
            params["thread_count"] = cpu_count() // 2

        # Common parameters
        # learning_rate (eta) = automatically
        # depth (max_depth) = 6: tree depth
        # l2_leaf_reg (reg_lambda) = 3: L2 regularization coefficient
        # n_estimators (num_boost_round, num_trees) = 1000: maximum number of trees (base learners)
        # one_hot_max_size = 2: one-hot encode features with at most this many distinct values

        # loss_function = "Logloss"
        # loss_function in ["Logloss", "RMSE", "MAE", "CrossEntropy", "MultiClass", "MultiClassOneVsAll"] or a custom objective

        # custom_metric = None: custom metrics
        # custom_metric in ["RMSE", "Logloss", "MAE", "CrossEntropy", "Recall", "Precision", "F1", "Accuracy", "AUC", "R2"]

        # eval_metric = optimized objective
        # eval_metric in ["RMSE", "Logloss", "MAE", "CrossEntropy", "Recall", "Precision", "F1", "Accuracy", "AUC", "R2"]

        # nan_mode = None: how NaN values are handled
        # nan_mode in ["Forbidden", "Min", "Max"]

        # leaf_estimation_method = None: method for leaf value estimation, gradient descent or Newton
        # leaf_estimation_method in ["Newton", "Gradient"]

        # random_seed = None: random seed used for training

        # Performance parameters
        # thread_count = -1: number of CPU/GPU cores used for training
        # used_ram_limit = None: memory limit for CTR computation
        # gpu_ram_part = None: GPU memory limit
        # Processing unit settings
        # task_type = CPU: device used for training
        # devices = None: IDs of the GPU devices used for training
        # counter_calc_method = None,
        # leaf_estimation_iterations = None,
        # use_best_model = None,
        # verbose = None,
        # model_size_reg = None,
        # rsm = None,
        # logging_level = None,
        # metric_period = None,
        # ctr_leaf_count_limit = None,
        # store_all_simple_ctr = None,
        # max_ctr_complexity = None,
        # has_time = None,
        # classes_count = None,
        # class_weights = None,
        # random_strength = None,
        # name = None,
        # ignored_features = None,
        # train_dir = None,
        # custom_loss = None,
        # bagging_temperature = None
        # border_count = None
        # feature_border_type = None,
        # save_snapshot = None,
        # snapshot_file = None,
        # fold_len_multiplier = None,
        # allow_writing_files = None,
        # final_ctr_computation_mode = None,
        # approx_on_full_history = None,
        # boosting_type = None,
        # simple_ctr = None,
        # combinations_ctr = None,
        # per_feature_ctr = None,
        # device_config = None,
        # bootstrap_type = None,
        # subsample = None,
        # colsample_bylevel = None,
        # random_state = None,
        # objective = None,
        # max_bin = None,
        # scale_pos_weight = None,
        # gpu_cat_features_storage = None,
        # data_partition = None
        self.compute_task = compute_task

        if self.compute_task == "GPU":  # compute_task was asserted to be "GPU" or "CPU"
            params["task_type"] = "GPU"
        else:
            params["task_type"] = "CPU"

        if self.task_type == "reg":  #  做回归任务
            """  
            # 使用相关的成本函数, RMSE, MultiRMSE, MAE, Quantile, LogLinQuantile, Poisson, MAPE, Lq or custom objective object"
            """
            self.model = cb.CatBoostRegressor(
                iterations=None,
                learning_rate=params.get("leaning_rate", None),  #  学习率
                depth=params.get("depth", None),  # 深度
                l2_leaf_reg=params.get("l2_leaf_reg", None),  #l2 正则
                model_size_reg=params.get("model_size_reg", None),
                rsm=params.get("rms", None),  #
                loss_function=params.get("loss_function", 'RMSE'),  # 损失函数值
                border_count=params.get("border_count", None),  # 边界树
                feature_border_type=params.get("feature_border_type", None),
                per_float_feature_quantization=params.get(
                    "per_float_feature_quantization", None),
                input_borders=params.get("input_borders", None),
                output_borders=params.get("output_borders", None),
                fold_permutation_block=params.get("fold_permutation_block",
                                                  None),
                od_pval=params.get("od_pval", None),
                od_wait=params.get("od_wait", None),
                od_type=params.get("od_type", None),
                nan_mode=params.get("nan_mode", None),
                counter_calc_method=params.get("counter_calc_method", None),
                leaf_estimation_iterations=params.get(
                    "leaf_estimation_iterations", None),
                leaf_estimation_method=params.get("leaf_estimation_method",
                                                  None),  # leaf estimation method
                thread_count=params.get("thread_count", None),  # thread count
                random_seed=params.get("random_seed", None),  # random seed
                use_best_model=params.get("use_best_model", None),
                best_model_min_trees=params.get("best_model_min_trees",
                                                None),  # minimum number of trees in the best model
                verbose=params.get("verbose", None),
                silent=params.get("silent", None),
                logging_level=params.get("logging_level", None),
                metric_period=params.get("metric_period", None),
                ctr_leaf_count_limit=params.get("ctr_leaf_count_limit", None),
                store_all_simple_ctr=params.get("store_all_simple_ctr", None),
                max_ctr_complexity=params.get("max_ctr_complexity", None),
                has_time=params.get("has_time", None),
                allow_const_label=params.get("allow_const_label", None),
                one_hot_max_size=params.get("one_hot_max_size", None),
                random_strength=params.get("random_strength", None),
                name=params.get("name", None),
                ignored_features=params.get("ignored_features", None),
                train_dir=params.get("train_dir", None),
                custom_metric=params.get("custom_metric", None),
                eval_metric=params.get("eval_metric", None),
                bagging_temperature=params.get("bagging_temperature", None),
                save_snapshot=params.get("save_snapshot", None),
                snapshot_file=params.get("snapshot_file", None),
                snapshot_interval=params.get("snapshot_interval", None),
                fold_len_multiplier=params.get("fold_len_multiplier", None),
                used_ram_limit=params.get("used_ram_limit", None),
                gpu_ram_part=params.get("gpu_ram_part", None),
                pinned_memory_size=params.get("pinned_memory_size", None),
                allow_writing_files=params.get("allow_writing_files", None),
                final_ctr_computation_mode=params.get(
                    "final_ctr_computation_mode", None),
                approx_on_full_history=params.get("final_ctr_computation_mode",
                                                  None),
                boosting_type=params.get("boosting_type", None),
                simple_ctr=params.get("simple_ctr", None),
                combinations_ctr=params.get("combinations_ctr", None),
                per_feature_ctr=params.get("per_feature_ctr", None),
                ctr_target_border_count=params.get("ctr_target_border_count",
                                                   None),
                task_type=params.get("task_type", None),  # cpu 或GPU
                device_config=params.get("device_config", None),
                devices=params.get("devices", None),  # 训练的gpu设备ID
                bootstrap_type=params.get("bootstrap_type", None),
                subsample=params.get("subsample", None),
                sampling_unit=params.get("sampling_unit", None),
                dev_score_calc_obj_block_size=params.get(
                    "dev_score_calc_obj_block_size", None),
                max_depth=params.get("max_depth", None),  # 最大树的深度,默认为6 ==depth
                n_estimators=params.get("n_estimators", None),  # 基分类器的数量,
                # 决ml伪命题的树的最大数量,默认值为1000,==num_boost_round, ==num_trees=1000
                num_boost_round=params.get("num_boost_round", None),  # 提升轮数树
                num_trees=params.get("num_trees", None),  # 树数量
                colsample_bylevel=params.get("colsample_bylevel", None),
                random_state=params.get("random_state", None),  # 随机种子
                reg_lambda=params.get("reg_lambda", None),  # 正则化参数lambda
                objective=params.get("objective", None),  # 目标函数
                eta=params.get("eta", None),
                max_bin=params.get("max_bin", None),
                gpu_cat_features_storage=params.get("gpu_cat_features_storage",
                                                    None),
                data_partition=params.get("data_partition", None),
                metadata=params.get("metadata", None),
                early_stopping_rounds=params.get("early_stopping_rounds",
                                                 None),  # early-stopping rounds
                cat_features=params.get("cat_features", None),
                grow_policy=params.get("grow_policy", None),
                min_data_in_leaf=params.get("min_data_in_leaf",
                                            None),  # minimum number of samples in a leaf
                min_child_samples=params.get("min_child_samples",
                                             None),  # minimum child samples
                max_leaves=params.get("max_leaves", None),  # maximum number of leaves
                num_leaves=params.get("num_leaves", None),  # number of leaves
                score_function=params.get("score_function", None),  # score function
                leaf_estimation_backtracking=params.get(
                    "leaf_estimation_backtracking", None),
                ctr_history_unit=params.get("ctr_history_unit", None),
                monotone_constraints=params.get("monotone_constraints", None),
                feature_weights=params.get("feature_weights",
                                           None),  # per-feature weights
                penalties_coefficient=params.get("penalties_coefficient",
                                                 None),
                first_feature_use_penalties=params.get(
                    "first_feature_use_penalties", None),
                model_shrink_rate=params.get("model_shrink_rate", None),
                model_shrink_mode=params.get("model_shrink_mode", None),
                langevin=params.get("langevin", None),
                diffusion_temperature=params.get("diffusion_temperature",
                                                 None),
                boost_from_average=params.get("boost_from_average", None))

        else:  # classification task
            self.model = cb.CatBoostClassifier(
                iterations=None,  # number of iterations, common parameter
                learning_rate=params.get("learning_rate", None),  # learning rate, common parameter
                depth=params.get("depth", None),  # tree depth
                l2_leaf_reg=params.get("l2_leaf_reg", None),  # L2 regularization
                model_size_reg=params.get("model_size_reg", None),
                rsm=params.get("rsm", None),
                loss_function=params.get("loss_function", None),
                border_count=params.get("border_count", None),
                feature_border_type=params.get("feature_border_type", None),
                per_float_feature_quantization=params.get(
                    "per_float_feature_quantization", None),
                input_borders=params.get("input_borders", None),
                output_borders=params.get("output_borders", None),
                fold_permutation_block=params.get("fold_permutation_block",
                                                  None),
                od_pval=params.get("od_pval", None),
                od_wait=params.get("od_wait", None),
                od_type=params.get("od_type", None),
                nan_mode=params.get("nan_mode", None),
                counter_calc_method=params.get("counter_calc_method", None),
                leaf_estimation_iterations=params.get(
                    "leaf_estimation_iterations", None),
                leaf_estimation_method=params.get("leaf_estimation_method",
                                                  None),
                thread_count=params.get("thread_count", None),  # 性能参数,使用-1时,
                # 使用过最大的cpu核心数进行训练
                random_seed=params.get("random_seed", None),
                use_best_model=params.get("use_best_model", None),
                # best_model_min_trees=params.get("best_model_min_trees", None),
                verbose=params.get("verbose", None),
                # silent=params.get("silent", None),
                logging_level=params.get("logging_level", None),
                metric_period=params.get("metric_period", None),
                ctr_leaf_count_limit=params.get("ctr_leaf_count_limit", None),
                store_all_simple_ctr=params.get("store_all_simple_ctr", None),
                max_ctr_complexity=params.get("max_ctr_complexity", None),
                has_time=params.get("has_time", None),
                allow_const_label=params.get("allow_const_label", None),
                one_hot_max_size=params.get("one_hot_max_size",
                                            None),  # maximum cardinality for one-hot encoding
                random_strength=params.get("random_strength", None),
                name=params.get("name", None),
                ignored_features=params.get("ignored_features", None),
                train_dir=params.get("train_dir", None),
                custom_loss=params.get("custom_loss", None),
                custom_metric=params.get("custom_metric", None),
                eval_metric=params.get("eval_metric", None),
                bagging_temperature=params.get("bagging_temperature", None),
                save_snapshot=params.get("save_snapshot", None),
                snapshot_file=params.get("snapshot_file", None),
                snapshot_interval=params.get("snapshot_interval", None),
                fold_len_multiplier=params.get("fold_len_multiplier", None),
                used_ram_limit=params.get("used_ram_limit", None),  # memory limit
                # for CTR computation; performance parameter
                gpu_ram_part=params.get("gpu_ram_part", None),  # performance parameter, GPU memory limit
                # pinned_memory_size=params.get("pinned_memory_size", None),
                allow_writing_files=params.get("allow_writing_files", None),
                final_ctr_computation_mode=params.get(
                    "final_ctr_computation_mode", None),
                approx_on_full_history=params.get("final_ctr_computation_mode",
                                                  None),
                boosting_type=params.get("boosting_type", None),
                simple_ctr=params.get("simple_ctr", None),
                combinations_ctr=params.get("combinations_ctr", None),
                per_feature_ctr=params.get("per_feature_ctr", None),
                # ctr_target_border_count=params.get("ctr_target_border_count", None),
                task_type=params.get("task_type", None),
                device_config=params.get("device_config", None),
                devices=params.get("devices", None),
                bootstrap_type=params.get("bootstrap_type", None),
                subsample=params.get("subsample", None),
                sampling_unit=params.get("sampling_unit", None),
                dev_score_calc_obj_block_size=params.get(
                    "dev_score_calc_obj_block_size", None),
                max_depth=params.get("max_depth", None),
                n_estimators=params.get("n_estimators", None),
                num_boost_round=params.get("num_boost_round", None),
                num_trees=params.get("num_trees", None),
                colsample_bylevel=params.get("colsample_bylevel", None),
                random_state=params.get("random_state", None),
                reg_lambda=params.get("reg_lambda",
                                      None),  # 正则化参数, l2, ==l2_leaf_reg
                objective=params.get("objective", None),
                eta=params.get("eta", None),  # 使用自动的学习率 ==learning_rate
                max_bin=params.get("max_bin", None),
                scale_pos_weight=params.get("scale_pos_weight", None),
                gpu_cat_features_storage=params.get("gpu_cat_features_storage",
                                                    None),
                data_partition=params.get("data_partition", None),
                metadata=params.get("metadata", None),
                early_stopping_rounds=params.get("early_stopping_rounds",
                                                 None),
                cat_features=params.get("cat_features", None),
                grow_policy=params.get("grow_policy", None),
                min_data_in_leaf=params.get("min_data_in_leaf", None),
                min_child_samples=params.get("min_child_samples", None),
                max_leaves=params.get("max_leaves", None),
                num_leaves=params.get("num_leaves", None),
                score_function=params.get("score_function", None),
                leaf_estimation_backtracking=params.get(
                    "leaf_estimation_backtracking", None),
                ctr_history_unit=params.get("ctr_history_unit", None),
                monotone_constraints=params.get("monotone_constraints", None),
                feature_weights=params.get("feature_weights", None),
                penalties_coefficient=params.get("penalties_coefficient",
                                                 None),
                first_feature_use_penalties=params.get(
                    "first_feature_use_penalties", None),
                model_shrink_rate=params.get("model_shrink_rate", None),
                model_shrink_mode=params.get("model_shrink_mode", None),
                langevin=params.get("langevin", None),
                diffusion_temperature=params.get("diffusion_temperature",
                                                 None),
                boost_from_average=params.get("boost_from_average", None),
                text_features=params.get("text_features", None),
                tokenizers=params.get("tokenizers", None),
                dictionaries=params.get("dictionaries", None),
                feature_calcers=params.get("feature_calcers", None),
                text_processing=params.get("text_processing", None))
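
Since CatBoost falls back to its built-in default whenever a constructor keyword is left as None, the long forwarding lists above are behaviorally close to unpacking params directly. A condensed sketch of the same idea (not the original class):

import catboost as cb

def build_model(task_type, **params):
    # CatBoost treats keyword=None as "use the default", so forwarding the
    # caller's params directly covers the same ground as the explicit lists,
    # provided every key is a valid CatBoost parameter name.
    if task_type == "reg":
        return cb.CatBoostRegressor(**params)
    return cb.CatBoostClassifier(**params)
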
Example #14
def main(problem_type, data_type, data_sample, user_type, param_index):
    param = params[param_index] # Hyper params
    results = []

    if user_type == 'new':
        X_train = X_group_train
        y_train = y_group_train
        X_test = X_group_test
        y_test = y_group_test
        X_val = X_group_val
        y_val = y_group_val
    elif user_type == 'cur':
        X_train = X_stratified_train
        y_train = y_stratified_train
        X_test = X_stratified_test
        y_test = y_stratified_test
        X_val = X_stratified_val
        y_val = y_stratified_val
    else:
        raise ValueError('Invalid value for <user_type>')

    if data_type == 'fitbit':
        X_train = X_train[fitbit_features]
        X_test = X_test[fitbit_features]
        X_val = X_val[fitbit_features]
    elif data_type == 'all':
        X_train = X_train[all_features]
        X_test = X_test[all_features]
        X_val = X_val[all_features]
    else:   
        raise ValueError('Invalid value for <data_type>')

    if problem_type == 'cls':
        param['loss_function'] = 'Logloss'
        param['eval_metric'] = 'F1'
        X = pd.concat([X_train, X_test])
        y = pd.concat([y_train, y_test])
        group = y['subject'].values
        # Split K Fold
        if user_type == 'new':
            kfold = GroupKFold(n_splits=4).split(X, y, group)
        elif user_type == 'cur':
            kfold = StratifiedKFold(n_splits=4, random_state=27, shuffle=True).split(X, group)
        # Training
        for train_index, test_index in kfold:
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]
            y_train = y_train['valence_relative_bin']
            y_test = y_test['valence_relative_bin']
            # pdb.set_trace()
            # Data Sampling
            X_train, y_train, param = data_process(data_sample, X_train, y_train, param)
            # CatBoost
            cat = catboost.CatBoostClassifier(**param)
            cat.fit(X_train, y_train,
                     eval_set = (X_test, y_test),
                     use_best_model = True,
                     verbose = False)
            y_pred = cat.predict(X_val)
            # pdb.set_trace()
            cat_res = eval_class(y_val['valence_relative_bin'], y_pred)
            results.append(cat_res + [0, 0])

            # feature_col = X_val.columns.values
            # fi_score = catboost_feature_importance(cat, feature_col)
            # fi_score.to_csv()
    
    elif problem_type == 'reg':
        param.pop('class_weights', None)
        param['loss_function'] = 'RMSE'
        param['eval_metric'] = 'RMSE'
        X = pd.concat([X_train, X_test])
        y = pd.concat([y_train, y_test])
        group = y['subject'].values
        # Split K Fold
        if user_type == 'new':
            kfold = GroupKFold(n_splits=4).split(X, y, group)
        elif user_type == 'cur':
            kfold = StratifiedKFold(n_splits=4, random_state=27, shuffle=True).split(X, group)
        # Training
        for train_index, test_index in kfold:
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]
            # Two model, one for positive valence, one for negative valence
            y_p_train = y_train['valence_p']
            y_p_test = y_test['valence_p']
            y_n_train = y_train['valence_n']
            y_n_test = y_test['valence_n']
            # Regression problem no data sampling technique
            # CatBoost
            model_p = catboost.CatBoostRegressor(**param)
            model_p.fit(X_train, y_train['valence_p'],
                        eval_set = (X_test, y_test['valence_p']),
                        use_best_model = True,
                        verbose = False)
            p_regr = model_p.predict(X_val)

            model_n = catboost.CatBoostRegressor(**param)
            model_n.fit(X_train, y_train['valence_n'],
                        eval_set = (X_test, y_test['valence_n']),
                        use_best_model = True,
                        verbose = False)
            n_regr = model_n.predict(X_val)

            y_pred = n_regr - p_regr - y_val['valence_score_median']
            y_reg = y_val['valence_relative']
            # print(model_regr)
            y_pred_cls = [1 if x >= 0 else 0 for x in y_pred]
            y_reg_cls = y_val['valence_relative_bin']
            # pdb.set_trace()
            cat_res = eval_class(y_reg_cls, y_pred_cls)
            reg_eval = eval_reg(y_reg, y_pred)
            results.append(cat_res + reg_eval)

    results = np.mean(results, axis=0)
    training_params = [problem_type, data_type, data_sample, user_type, param_index]
    return np.append(training_params, results)

Example #15
def read_yaml(path):
    with open(path, "r") as f:
        return yaml.safe_load(f)


MODEL_PARAMS = {"allow_writing_files": False, "iterations": 10}


@pytest.fixture(
    scope="module",
    params=[
        cb.CatBoost(MODEL_PARAMS),
        cb.CatBoostClassifier(**MODEL_PARAMS),
        cb.CatBoostRegressor(**MODEL_PARAMS),
    ],
    ids=["CatBoost", "CatBoostClassifier", "CatBoostRegressor"],
)
def cb_model(request):
    model = request.param
    X, y = get_iris()
    return ModelWithData(model=model.fit(X, y), inference_dataframe=X)


@pytest.fixture
def reg_model():
    model = cb.CatBoostRegressor(**MODEL_PARAMS)
    X, y = get_iris()
    return ModelWithData(model=model.fit(X, y), inference_dataframe=X)
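
A test consuming these fixtures might look like the sketch below (a hypothetical test, not from the original suite):

def test_regressor_predicts_one_value_per_row(reg_model):
    # The fixture hands back the fitted model plus the frame it was fit on.
    preds = reg_model.model.predict(reg_model.inference_dataframe)
    assert len(preds) == len(reg_model.inference_dataframe)
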
Example #16
        'metric': {'mae'},
        'num_leaves': 256,
        'min_sum_hessian_in_leaf': 20,
        'max_depth': -12,
        'learning_rate': 0.05,
        'feature_fraction': 0.6,
        # 'bagging_fraction': 0.9,
        # 'bagging_freq': 3,
        'verbose': 1
    }

    print('Start training...')
    # train
    cat = cb.CatBoostRegressor(iterations=args.round,
                               learning_rate=0.03,
                               depth=8,
                               l2_leaf_reg=3,
                               rsm=1,
                               # 'Logloss' is a classification objective; CatBoostRegressor needs a regression loss
                               loss_function='MAE').fit(
                                   train_data[features], labels)

    test_data.loc[:,
                  "delivery_duration_prd"] = cat.predict(test_data[features])

    print(mean_absolute_error(test_data["delivery_duration"],
                              test_data["delivery_duration_prd"]))
    # rs = order_id.merge(test_data[["order_id", "delivery_duration"]], left_on="order_id", right_on="order_id",
    #                     how="left")
    #
    # rs.to_csv(args.out_path, header=['order_id', 'delivery_duration'], index=False)
Example #18
                       LabelBinarizer()]),
    ],
    df_out=True)

Z_train = mapper.fit_transform(X_train)
Z_test = mapper.transform(X_test)

# # GridSearchCV to find best params for the pipe
# params = {
#     'iterations': [100,500],
#     'learning_rate': [0.1,0.3,0.7],
#     'depth': [4, 10],
# }
# grid = GridSearchCV(model, params, cv=3, n_jobs=-1, verbose=1)
# grid.fit(Z_train, y_train)
# print(grid.best_score_)
# print(grid.best_params_)

# CatBoostRegressor using the best params found above^

model = cb.CatBoostRegressor(depth=10, iterations=500, learning_rate=0.3)
model.fit(Z_train, y_train)
print(model.score(Z_train, y_train))
print(model.score(Z_test, y_test))

# pipe and pickle
pipe = make_pipeline(mapper, model)
pipe.fit(X_train, y_train)
pipe.score(X_test, y_test)
dill.dump(pipe, open('pipe.pkl', 'wb'))
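
Loading the pickled pipeline back later is symmetric (a sketch; X_test and y_test as above):

import dill

with open('pipe.pkl', 'rb') as f:
    pipe = dill.load(f)
print(pipe.score(X_test, y_test))  # should match the score printed above
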
Example #19
                            silent=True,
                            subsample=0.8,
                            colsample_bytree=0.7,
                            colsample_bylevel=0.5)
Kfolder.validate(train,
                 test,
                 features,
                 xgbmodel,
                 name="xgbfinal",
                 prepare_stacking=True)

catmodel = cat.CatBoostRegressor(iterations=10000,
                                 learning_rate=0.01,
                                 depth=5,
                                 eval_metric='RMSE',
                                 colsample_bylevel=0.8,
                                 bagging_temperature=0.2,
                                 metric_period=None,
                                 early_stopping_rounds=200,
                                 random_seed=random_seed)
Kfolder.validate(train,
                 test,
                 features,
                 catmodel,
                 name="catfinal",
                 prepare_stacking=True,
                 fit_params={
                     "use_best_model": True,
                     "verbose": 100
                 })
Example #20
                                   reg_lambda=5,
                                   max_depth=7,
                                   n_estimators=10000,
                                   subsample=0.7,
                                   colsample_bytree=0.4,
                                   subsample_freq=2,
                                   min_child_samples=10,
                                   learning_rate=0.1,
                                   random_state=2019)

cbt_attr_model = cbt.CatBoostRegressor(
    num_leaves=31,
    #        reg_lambda=5,
    max_depth=7,
    n_estimators=10000,
    #        subsample=0.7,
    #        min_child_samples=10,
    learning_rate=0.1,
    random_state=2,
    eval_metric='MAE',
    task_type='GPU')

## Assemble the data needed for training
#gpr  = GaussianProcessRegressor()
tr_len = len(X_train)
data1 = pd.concat((X_train, X_test), axis=0, ignore_index=True)
features = list(data1.columns)

ffff = [
    'Attribute1', 'Attribute10', 'Attribute2', 'Attribute3', 'Attribute4',
    'Attribute5', 'Attribute6', 'Attribute7', 'Attribute8', 'Attribute9',
Example #21
                booster='gbtree',
                objective='reg:linear',
                eval_metric='rmse',
                learning_rate=0.01),
                                     train_type=args.train_type,
                                     use_valid=True,
                                     debug=args.debug)
            model.load_params("parames/xgboost_regressor_default.yml")
        elif (args.regressor == "lightgbm"):
            model = LightGBMRegressor(model=lgb.LGBMRegressor(
                objective='regression', metric='rmse'),
                                      train_type=args.train_type,
                                      use_valid=True,
                                      debug=args.debug)
        elif (args.regressor == "catboost"):
            model = CatBoostRegressor(model=catboost.CatBoostRegressor(),
                                      use_valid=True,
                                      debug=args.debug)

        # Set model parameters
        if not (args.params_file == ""):
            model.set_params(args.params_file)

        #--------------------
        # Train the model
        #--------------------
        model.fit(X_train_fold, y_train_fold, X_valid_fold, y_valid_fold)

        #--------------------
        # Run inference with the model
        #--------------------
Example #22
import datetime
import logging
import emoji
import json
import os

# Enable logging.
logging.basicConfig(format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
                    level=logging.INFO)

logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)

weather = Weather(unit=Unit.CELSIUS)
model = catboost.CatBoostRegressor().load_model('model_fire_pred.uu')

FIRE = emoji.emojize("Fire :fire:", use_aliases=True)
SHARING_LOCATION = "Would you mind sharing your location with me?"
SEND_LOCATION = emoji.emojize("Send location :round_pushpin:",
                              use_aliases=True)
GREETINGS = 'Hi User! The purpose of this bot is to monitor fires around you.'
NEAREST_FIRE = emoji.emojize("Where's nearest fire? :eyes:", use_aliases=True)
SEE_FIRE = emoji.emojize("I see fire! :scream:", use_aliases=True)
WHAT_NEXT = emoji.emojize("What you want to do next? :point_down:",
                          use_aliases=True)
VISUAL = "In visual range"
FAR = "It's far from here"
IMAGE_FILE = 'ferry.png'
PHONENUMBER = '8 (800) 100-94-00'
THANKYOU = 'Thank you for your contribution in firefighting!'
Example #23
              'seed': 66,
              # 'nthread':12
              }
    params['silent'] = 1
    watchlist = [(xgb_train, 'train'), (xgb_eval, 'eval')]
    xgb_model = xgb.train(params, xgb_train, 5000, watchlist, early_stopping_rounds=40, verbose_eval=40)
    train_model_pred['xgb_pred'].iloc[test_index] += xgb_model.predict(xgb_eval)
    test_model_pred['xgb_pred'] += xgb_model.predict(xgb_test)


    print('Starting CatBoost training...')
    train_pool = Pool(train_feat[predictors].iloc[train_index], train_feat['loan_sum'].iloc[train_index])
    eval_pool = Pool(train_feat[predictors].iloc[test_index], train_feat['loan_sum'].iloc[test_index])
    test_pool = Pool(test_feat[predictors])
    cb_model = cb.CatBoostRegressor(iterations=400, depth=7, learning_rate=0.06, eval_metric='RMSE',
                                 od_type='Iter', od_wait=20, random_seed=42, thread_count=7,
                                 bagging_temperature=0.85, rsm=0.85, verbose=False)
    cb_model.fit(train_pool)
    train_model_pred['cb_pred'].iloc[test_index] += cb_model.predict(eval_pool)
    test_model_pred['cb_pred'] += cb_model.predict(test_pool)

test_model_pred = test_model_pred / 5  # average over the 5 CV folds

from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(train_model_pred, train_feat.loan_sum)
# import pickle
# pickle.dump(lr,open('lr.model','wb+'))
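# A natural final step, not shown in the original: apply the level-2 linear
# model to the stacked test predictions to get the blended forecast.
final_pred = lr.predict(test_model_pred)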
Exemple #24
0
def ctb_reg(self, para):
    reg = ctb.CatBoostRegressor(**para['reg_params'])
    return self.train_reg(reg, para)
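
# A sketch of the `para` dict this method expects, following the common
# hyperopt-style wrapper this pattern comes from (keys/values below are
# illustrative, not from the original):
ctb_para = {
    'reg_params': {'iterations': 500, 'learning_rate': 0.05, 'depth': 6},
    'fit_params': {'early_stopping_rounds': 50, 'verbose': False},
}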
Exemple #25
0
def rgr_boost(data_root, out_csv_file, delta=True):
    """
    # 去除邮编
    train_accuracy:0.457
     test_accuracy:0.435
      a)保留z1
    train_accuracy:0.454
     test_accuracy:0.436

    # 增加标准差
    train_accuracy:0.457
     test_accuracy:0.436

    # 去除userId,movie_id
    train_accuracy:0.455
     test_accuracy:0.431


    :param data_root:
    :param out_csv_file:
    :param delta:
    :return:
    """
    x, y, test, origin_test = load_data(data_root)
    x_train, x_test, y_train, y_test = train_test_split(x,
                                                        y,
                                                        test_size=0.1,
                                                        random_state=10)

    cat_index = [
        i for i, col in enumerate(x.columns) if col in [
            'useGender', 'useOccupation', 'z1', 'z2', 'z3', 'userId',
            'movie_id', 'year', 'month', 'week', 'w_year', 'w_month', 'w_week'
        ]
    ]
    text_indx = [
        i for i, col in enumerate(x.columns) if col in ['movie_title']
    ]

    model = cb.CatBoostRegressor(iterations=1000,
                                 learning_rate=0.1,
                                 od_type="Iter",
                                 l2_leaf_reg=3,
                                 model_size_reg=3,
                                 depth=10,
                                 cat_features=cat_index)

    model.fit(x_train, y_train, eval_set=(x_test, y_test))

    print("train_accuracy:{:.3f} \n"
          " test_accuracy:{:.3f}".format(
              accuracy(model, x_train, y_train, delta),
              accuracy(model, x_test, y_test, delta)))
    save_feature_importance(model, os.path.join(data_root, 'feature_rgr.png'))

    origin_test['score'] = predict(model, test, delta).round().astype('int32')
    origin_test[['userId', 'movie_id', 'time',
                 'score']].to_csv(os.path.join(data_root, out_csv_file),
                                  sep=',',
                                  index=False,
                                  header=None)
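
# accuracy() and predict() are not shown in the original; a plausible minimal
# sketch (purely illustrative), assuming `delta` toggles clipping predictions
# to the 1-5 rating range and that numpy is available as np:
def predict(model, x, delta=True):
    pred = model.predict(x)
    return np.clip(pred, 1, 5) if delta else pred

def accuracy(model, x, y, delta=True):
    # Fraction of rounded predictions that hit the true rating exactly
    return float((predict(model, x, delta).round() == np.asarray(y)).mean())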
Exemple #26
0
    folds_item_ids = json.load(infile)

fit_scores = {}
val_scores = {}


def rmse(y_true, y_pred):
    return metrics.mean_squared_error(y_true, y_pred)**0.5


model = catboost.CatBoostRegressor(iterations=2000,
                                   learning_rate=0.5,
                                   max_depth=6,
                                   use_best_model=True,
                                   loss_function='RMSE',
                                   eval_metric='RMSE',
                                   od_type='Iter',
                                   od_wait=20,
                                   logging_level='Verbose',
                                   random_seed=42,
                                   boosting_type='Plain',
                                   one_hot_max_size=50)

for i in folds_item_ids.keys():

    # Determine train and val folds
    fit_mask = X_train['item_id'].isin(folds_item_ids[i]['fit'])
    val_mask = X_train['item_id'].isin(folds_item_ids[i]['val'])
    X_fit = X_train[fit_mask].drop('item_id', axis='columns')
    y_fit = y_train[fit_mask]
    X_val = X_train[val_mask].drop('item_id', axis='columns')
    y_val = y_train[val_mask]
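
    # The original snippet is truncated here; a minimal completion of the fold
    # loop, assuming the usual fit/score pattern (rmse() is defined above, and
    # use_best_model=True requires the eval_set):
    model.fit(X_fit, y_fit, eval_set=(X_val, y_val))
    fit_scores[i] = rmse(y_fit, model.predict(X_fit))
    val_scores[i] = rmse(y_val, model.predict(X_val))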
Exemple #27
0
ab_id = []

# Set sample weights
data['temp_label'] = data['score']
# Set these labels to None instead of dropping the rows: dropping them is
# guaranteed to improve the offline score, but the anomalies still exist
# online, so what matters is how a model trained without anomalies handles
# a test set that still contains them.
data['sample_weight'] = data['temp_label'] + 200
data['sample_weight'] = data['sample_weight'] / data['sample_weight'].mean()
# Option 1: do not train on these samples at all
data.loc[data.id.isin(ab_id), 'temp_label'] = None
# Option 2: give them a very low sample weight
data.loc[data.id.isin(ab_id), 'sample_weight'] = 0.01

# Parameters generously shared by a top competitor
ctb_params = {
    'n_estimators': 20000,
    'learning_rate': 0.01,
    'random_seed': 4590,
    'reg_lambda': 0.08,
    'subsample': 0.7,
    'bootstrap_type': 'Bernoulli',
    'boosting_type': 'Plain',
    'one_hot_max_size': 10,
    'rsm': 0.5,
    'leaf_estimation_iterations': 5,
    'use_best_model': True,
    'max_depth': 6,
    'verbose': False,  # CatBoost does not accept LightGBM-style verbose=-1
    'thread_count': 4
}
ctb_model = ctb.CatBoostRegressor(**ctb_params)
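
# A minimal sketch of how the labels/weights above feed into training, assuming
# `features` holds the feature columns and (X_val, y_val) is a held-out fold
# (all illustrative; use_best_model=True requires an eval_set):
mask = data['temp_label'].notnull()  # Option 1: skip the None-labeled rows
ctb_model.fit(data.loc[mask, features],
              data.loc[mask, 'temp_label'],
              sample_weight=data.loc[mask, 'sample_weight'],  # Option 2
              eval_set=(X_val, y_val))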
Exemple #28
0
train['客厅总面积'] = ws_s
# 客厅总面积 (total living-room area) = 客厅均面积 (average room area) * 厅的数量 (room count)
test['客厅总面积'] = test['客厅均面积'] * test['厅的数量']
del train_wss, train_ws, ws_s

test = test.sort_values(by=['id'], ascending=True)
test_id = list(copy.deepcopy(test['id']))
test.drop('id', axis=1, inplace=True)
train_label = list(copy.deepcopy(train['月租金']))  # 月租金 = monthly rent (target)
train.drop('月租金', axis=1, inplace=True)

train_pool = Pool(train, train_label, cat_features=None)
test_pool = Pool(test, cat_features=None)
cb_model = cb.CatBoostRegressor(depth=11,
                                learning_rate=0.11,
                                iterations=2729,
                                l2_leaf_reg=0.1,
                                model_size_reg=2,
                                loss_function='RMSE')
cb_model.fit(train_pool, verbose=True)
preds = cb_model.predict(test_pool)

test_lgb = pd.DataFrame({'id': test_id, 'price': preds})
test_lgb.to_csv('./result/catboost.csv', index=False)
Exemple #29
0
                 test,
                 features,
                 lgbmodel,
                 name="lgbfinal",
                 prepare_stacking=True,
                 fit_params={
                     "early_stopping_rounds": 500,
                     "eval_metric": "rmse"
                 })
lgbmodel.FI.mean(axis=1).sort_values()[180:250].plot(
    kind="barh", title="Features Importance", figsize=(10, 10))
catmodel = cat.CatBoostRegressor(iterations=10000,
                                 learning_rate=0.01,
                                 depth=5,
                                 loss_function="RMSE",
                                 boost_from_average=True,
                                 colsample_bylevel=0.8,
                                 bagging_temperature=0.2,
                                 metric_period=None,
                                 random_seed=random_seed)
Kfolder.validate(train,
                 test,
                 features,
                 catmodel,
                 name="catfinal",
                 prepare_stacking=True,
                 fit_params={
                     "early_stopping_rounds": 500,
                     "use_best_model": True
                 })
train['Revenue_lgb'] = train["lgbfinal"]
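# The snippet cuts off here; by symmetry with the LightGBM line above, the
# CatBoost counterpart presumably follows (an assumption, not in the original):
# train['Revenue_cat'] = train["catfinal"]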
Exemple #30
0
import numpy as np
import shap
import catboost
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_boston

X, y = shap.datasets.boston()
X_train, X_test, y_train, y_test = train_test_split(X, y)
reg = catboost.CatBoostRegressor(iterations=1000,
                                 learning_rate=0.25,
                                 silent=True)
reg.fit(X_train, y_train, eval_set=(X_test, y_test))
explainer = shap.TreeExplainer(reg)
shap_values = explainer.shap_values(X)
print(load_boston().DESCR)
shap.summary_plot(shap_values, X, plot_type="bar")
n = np.random.randint(0, 506)  # the Boston housing dataset has 506 rows
print(y[n])
shap.force_plot(explainer.expected_value,
                shap_values[n],
                X.iloc[n],
                matplotlib=True)
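
# Note (not in the original): shap.datasets.boston() and sklearn's load_boston
# were removed in newer releases, so this snippet needs older shap/scikit-learn
# versions. On current versions the same demo can use, e.g.:
# X, y = shap.datasets.california()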