def init(PROPERTIES_PATH, LOAD_FROM_DISK):

    # boost_params = {'n_estimators': 200,
    #                 'min_samples_split': 40,
    #                 'min_samples_leaf': 4,
    #                 'max_features': 'sqrt',
    #                 'max_depth': 20,
    #                 'learning_rate': 0.05}
    #
    # boost = GradientBoostingRegressor(**boost_params)

    boost = LGBMRegressor(learning_rate=0.05,
                          n_estimators=1127,
                          max_depth=-1,
                          min_child_weight=0,
                          num_leaves=68,
                          min_child_samples=5,
                          objective='regression',
                          subsample_for_bin=1000,
                          min_split_gain=0,
                          feature_fraction=0.5,
                          nthread=-1)
    train_data = load_all_data(get_connection(PROPERTIES_PATH),
                               TABLE_LIST,
                               is_train=True,
                               load_from_disk=LOAD_FROM_DISK)
    train_data = data_preprocessing(train_data)
    train_X, train_Y = train_data
    boost.fit(train_X, train_Y)
    np.save('col.npy', train_X.columns)

    print("training has been completed succesfully !!!!")
    print("--------------------------------------------")

    return boost
    def score_of_nonlinearmodel(self, model=None):
        """
        Tree-based model feature scoring.
        :param model:
        :return:
        """
        if model is None:
            if (self.numNull != 0) | (self.numInf != 0):
                print('Features contain NaN or Inf!')
                print('NaN:{}, Inf:{}'.format(self.numNull, self.numInf))
            model = LGBMRegressor(n_estimators=100)

        model_name = str(model).split('(')[0]
        model.fit(self.train_X, self.train_y)

        if self.showFig:
            sns.barplot(abs(model.feature_importances_), self.continuous_feature_names)
            plt.title('{} importances of features'.format(model_name))
            plt.show()

        sc = [abs(x) for x in model.feature_importances_]
        sum_sc = sum(sc)
        featureScore = [round(s / sum_sc, 4) for s in sc]
        print(model_name + ' is finished')

        return featureScore
def lightBGM_model(X, Y):
    model = LGBMRegressor(num_leaves=36,
                          n_estimators=100,
                          learning_rate=0.07,
                          random_state=0)
    model.fit(X, Y, verbose=True)
    return model
Example #4
def train_lightgbm(verbose=True):
    """Train a boosted tree with LightGBM."""
    if verbose: print("Training with LightGBM")
    df = pd.read_csv(STAGE1_LABELS)
    x = np.array([
        np.mean(np.load(os.path.join(FEATURE_FOLDER, '%s.npy' % str(id))),
                axis=0).flatten() for id in df['id'].tolist()
    ])
    y = df['cancer'].values  # .as_matrix() has been removed from pandas

    trn_x, val_x, trn_y, val_y = train_test_split(  # sklearn.model_selection.train_test_split
        x, y, random_state=42, stratify=y, test_size=0.20)
    '''
    params = {
        'task': 'train',
        'boosting_type': 'gbdt',
        'objective': 'regression',
        'metric': {'l2'},
        'num_leaves': 21,
        'learning_rate': 0.001,
        'nthread':24,
        'subsample':0.80,
        'colsample_bytree':0.80,
        'seed':42,
        'verbose': verbose,
    }
    '''

    skf = StratifiedKFold(n_splits=5, random_state=2048, shuffle=True)
    result = []
    clfs = []
    oof_preds = []
    for train_index, test_index in skf.split(x, y):
        trn_x, val_x = x[train_index, :], x[test_index, :]
        trn_y, val_y = y[train_index], y[test_index]

        val_ids = pd.DataFrame(df['id'].iloc[test_index].values, columns=['id'])

        clf = LGBMRegressor(max_depth=50,
                            num_leaves=21,
                            n_estimators=5000,
                            min_child_weight=1,
                            learning_rate=0.001,
                            nthread=24,
                            subsample=0.80,
                            colsample_bytree=0.80,
                            seed=42)

        clf.fit(trn_x,
                trn_y,
                eval_set=[(val_x, val_y)],
                verbose=verbose,
                eval_metric='l2',
                early_stopping_rounds=300)

        val_preds = pd.DataFrame(clf.predict(val_x), columns=["cancer"])
        oof_preds.append(pd.concat([val_ids, val_preds], axis=1))
        clfs.append(clf)

    return clfs, oof_preds
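A short follow-up sketch (an assumption, not part of the original snippet): it concatenates the per-fold out-of-fold frames returned above and scores them against the labels file.

# Hypothetical post-processing of the outputs of train_lightgbm() above.
import pandas as pd
from sklearn.metrics import mean_squared_error

clfs, oof_preds = train_lightgbm(verbose=False)
oof = pd.concat(oof_preds, axis=0).reset_index(drop=True)  # one row per training sample

# Join the out-of-fold predictions back to the labels to get an overall l2 (MSE) score.
labels = pd.read_csv(STAGE1_LABELS)                        # STAGE1_LABELS as used above
scored = oof.merge(labels[['id', 'cancer']], on='id', suffixes=('_pred', ''))
print('out-of-fold l2:', mean_squared_error(scored['cancer'], scored['cancer_pred']))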
Example #5
def get_useful_features_byLightBGM(X, Y):
    # Importance threshold: features scoring at or below it are dropped
    importance_filter = 6

    model_3 = LGBMRegressor(num_leaves=36,
                            n_estimators=100,
                            learning_rate=0.07,
                            random_state=0)
    Y_log = np.log1p(Y)
    model_3.fit(X, Y_log, verbose=True)

    feature_score = model_3.feature_importances_
    importance_feature_map = list(zip(feature_score, X.columns))

    useless_feature = []
    for i in importance_feature_map:
        if i[0] <= importance_filter:
            useless_feature.append(i[1])
    feature = [c for c in X.columns]
    useful_feature = [aa for aa in feature if aa not in useless_feature]
    print('Useful features:', len(useful_feature))
    print('Useless features:', len(useless_feature))
    print('Total features:', len(feature))

    return useful_feature
Example #6
class LGBMRegressorPrim(primitive):
    def __init__(self, random_state=0):
        super(LGBMRegressorPrim, self).__init__(name='LGBMRegressor')
        self.hyperparams = []
        self.type = 'Regressor'
        self.description = "LightGBM is a gradient boosting framework that uses tree based learning algorithms."
        self.hyperparams_run = {'default': True}
        self.random_state = random_state
        self.model = LGBMRegressor()
        self.accept_type = 'c_r'

    def can_accept(self, data):
        return self.can_accept_c(data, 'Regression')

    def is_needed(self, data):
        # data = handle_data(data)
        return True

    def fit(self, data):
        data = handle_data(data)
        self.model.fit(data['X'], data['Y'])

    def produce(self, data):
        output = handle_data(data)
        output['predictions'] = self.model.predict(output['X'])
        output['X'] = pd.DataFrame(output['predictions'], columns=[self.name+"Pred"])
        final_output = {0: output}
        return final_output
Example #7
def bulid_onetrain(train_data, test, pred=features, label='label', seed=1099, est=6000, is_shuffle=True):
    train_x, train_y = train_data[features].values, train_data[label].values
    clf = LGBMRegressor(learning_rate=0.01,
                        boosting_type='gbdt',
                        objective='regression',
                        n_estimators=est,
                        num_leaves=156,
                        subsample=0.8,
                        n_jobs=-1,          # was misspelled 'njobs'
                        max_depth=8,
                        reg_lambda=0,
                        colsample_bytree=0.8,
                        random_state=2019,  # 2019
                        metric=['mse'])

    clf.fit(
        train_x, train_y,
        eval_set=[(train_x, train_y)],
        eval_metric=['mse'],
        categorical_feature='auto',
        verbose=100)

    #train_pred= clf.predict(train_x, num_iteration=clf.best_iteration_)


    test_pred= clf.predict(test[pred], num_iteration=clf.best_iteration_)

    #print('mean_squared_error:',mean_squared_error(train_y,train_pred))
    test['label'] = test_pred
    return test[['loadingOrder', 'label']],clf
Example #8
def get_ntree():
    rmse_t_total, rmse_v_total = [], []
    for ntree in range(10, 500, 10):
        lgb_base = LGBMRegressor(n_estimators=ntree,
                                 objective='regression',
                                 random_state=1234,
                                 n_jobs=2,
                                 colsample_bytree=0.8,
                                 reg_alpha=1,
                                 max_depth=10,
                                 subsample=0.8)

        print('current ntree = %s' % ntree)
        lgb_base.fit(X_t, y_t)
        y_t_pre = lgb_base.predict(X_t)
        y_v_pre = lgb_base.predict(X_v)
        rmse_t_each = np.sqrt(mean_squared_error(y_t, y_t_pre))
        rmse_v_each = np.sqrt(mean_squared_error(y_v, y_v_pre))
        rmse_t_total.append(rmse_t_each)
        rmse_v_total.append(rmse_v_each)
        with open('D:\\workspace python\\statContest\\save\\' +
                  'lgbbase2_rmse_0412.txt',
                  'a',
                  encoding='utf-8') as myfile:
            print(rmse_t_each, ',', rmse_v_each, file=myfile)
    return rmse_t_total, rmse_v_total
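A small plotting sketch, assuming matplotlib, for the two RMSE curves returned above; the grid of tree counts mirrors the range used inside get_ntree().

# Hypothetical visualisation of the curves produced by get_ntree() above.
import matplotlib.pyplot as plt

rmse_t_total, rmse_v_total = get_ntree()
ntrees = list(range(10, 500, 10))            # same grid as in get_ntree()

plt.plot(ntrees, rmse_t_total, label='train RMSE')
plt.plot(ntrees, rmse_v_total, label='validation RMSE')
plt.xlabel('n_estimators')
plt.ylabel('RMSE')
plt.legend()
plt.show()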
Example #9
def lightGBM_train_nocross(j, param, x_train, x_test, y_train, y_test):
    gbm = LGBMRegressor(**param, num_leaves=31, learning_rate=0.01, objective='regression')
    gbm.fit(x_train, y_train)
    y_pred = gbm.predict(x_test)
    y_pred = DataFrame(y_pred)
    rmse_lightGBM.append(np.sqrt(mean_squared_error(y_pred, y_test)))
    r2_lightGBM.append(r2_score(y_test, y_pred))
    return rmse_lightGBM, r2_lightGBM, gbm
Example #11
def lgb(x_train, y_train, x_val, y_val):
    lgb = LGBMRegressor(n_estimators=1000,
                        max_depth=10,
                        subsample=0.8,
                        colsample_bytree=0.8,
                        learning_rate=0.01,
                        random_state=2020)
    lgb.fit(x_train, y_train)
    result = lgb.predict(x_val)
    score = mean_absolute_error(result, y_val)
    return score
Example #12
def train_lightgbm(trn_x, val_x, trn_y, val_y):
    clf = LGBMRegressor(max_depth=50,
                        num_leaves=21,
                        n_estimators=5000,
                        min_child_weight=9,
                        learning_rate=0.01,
                        nthread=24,
                        subsample=0.80,
                        colsample_bytree=0.80,
                        seed=42)
    clf.fit(trn_x, trn_y, eval_set=[(val_x, val_y)], verbose=True, eval_metric='l2', early_stopping_rounds=300)
    return clf
    def setUp(self):
        X_train, y_train, X_test, y_test = titanic_fare()
        self.test_len = len(X_test)

        train_names, test_names = titanic_names()
        _, self.names = titanic_names()

        model = LGBMRegressor()
        model.fit(X_train, y_train)
        self.explainer = RegressionExplainer(model, X_test, y_test, r2_score, 
                                        shap='tree', 
                                        cats=['Sex', 'Deck', 'Embarked'],
                                        idxs=test_names, units="$")
Example #14
def predict_lgb(X, y, df2, params, ind):

    X_train, y_train = X, y

    output = df2[(df2.index >= ind) & (df2.index <
                                       (ind + 28))]  # dataset for prediction
    X = output.iloc[:, 1:]  # this basically drops the "value" column

    lgb_model = LGBMRegressor(**params)
    lgb_reg = lgb_model.fit(X_train, y_train.value.ravel())
    preds = lgb_reg.predict(X)

    return preds
Example #15
def evaluate(params, X, y):
    
    # Initialize an instance of the estimator
    est = LGBMRegressor(boosting='gbdt', n_jobs=-1, random_state=2018)
        
    # Set params
    est.set_params(**params)
    
    # Calc CV score
    scores = cross_val_score(estimator=est, X=X, y=y, 
                             scoring='r2', cv=4)
    score = np.mean(scores)

    return score
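A minimal usage sketch for evaluate(); the make_regression data and the params dict are illustrative assumptions, not taken from the original code.

# Hypothetical call to evaluate() above on synthetic data.
import numpy as np
from sklearn.datasets import make_regression
from sklearn.model_selection import cross_val_score
from lightgbm import LGBMRegressor

X_demo, y_demo = make_regression(n_samples=1000, n_features=20, noise=0.2, random_state=2018)

params = {'num_leaves': 31, 'learning_rate': 0.05, 'n_estimators': 200}
print('mean CV r2:', evaluate(params, X_demo, y_demo))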
Example #16
 def train_LGBM(self, train, t_target, valid, v_target, parm, use_custom_loss=False, reg_alpha=0, reg_lambda=0):
     #entity_features_columns = ['total_floor','building_material','city_town', 'building_type', 'building_use', 'parking_way', 'I_index_50', 'I_index_500', 'I_index_1000', 'I_index_5000', 'I_index_10000', 'II_index_50', 'II_index_500', 'II_index_1000', 'II_index_5000', 'II_index_10000', 'III_index_50', 'III_index_500', 'III_index_1000', 'III_index_5000', 'III_index_10000', 'IV_index_50', 'IV_index_500', 'IV_index_1000', 'IV_index_5000', 'IV_index_10000', 'V_index_50', 'V_index_500', 'V_index_1000', 'V_index_5000', 'V_index_10000', 'VI_index_50', 'VI_index_500', 'VI_index_1000', 'VI_index_5000', 'VI_index_10000', 'VII_index_50', 'VII_index_500', 'VII_index_1000', 'VII_index_5000', 'VII_index_10000', 'VIII_index_50', 'VIII_index_500', 'VIII_index_1000', 'VIII_index_5000', 'VIII_index_10000', 'IX_index_50', 'IX_index_500', 'IX_index_1000', 'IX_index_5000', 'IX_index_10000', 'X_index_50', 'X_index_500', 'X_index_1000', 'X_index_5000', 'X_index_10000', 'XI_index_50', 'XI_index_500', 'XI_index_1000', 'XI_index_5000', 'XI_index_10000', 'XII_index_50', 'XII_index_500', 'XII_index_1000', 'XII_index_5000', 'XII_index_10000', 'XIII_index_50', 'XIII_index_500', 'XIII_index_1000', 'XIII_index_5000', 'XIII_index_10000', 'XIV_index_50', 'XIV_index_500', 'XIV_index_1000', 'XIV_index_5000', 'XIV_index_10000','parking_price_isna','txn_floor_isna']
     #entity_features_columns = ['building_material', 'city', 'town', 'village', 'building_type', 'building_use', 'parking_way','parking_price_isna','txn_floor_isna']
     if use_custom_loss:
         self.loss = custom_loss
     learning_rate = parm['learning_rate']
     n_estimators = parm['n_estimators']
     max_depth = parm['max_depth']
     num_leaves = parm['num_leaves']
     feature_fraction = parm['feature_fraction']
     flag = True
     good_depth = 0
     good_leaves = 0
     good_fraction = 0
     
     for depth in max_depth:
         for leaves in num_leaves:
             for fraction in feature_fraction:
                 rf = LGBMRegressor(learning_rate=learning_rate, 
                                    objective='regression', 
                                    n_estimators=n_estimators,
                                    max_depth=depth, 
                                    num_leaves=leaves, 
                                    reg_alpha=reg_alpha,
                                    reg_lambda = reg_lambda,
                                    feature_fraction=fraction, 
                                    bagging_freq=1,
                                    metric='rmse')           
                 rf.fit(train, t_target,  # should we drop features that are not correlated with the target?
                        eval_set=[(train, t_target), (valid, v_target)],
                        #early_stopping_rounds=100, 
                        verbose=5000,
                        eval_metric=self.loss,
                        categorical_feature=self.entity_features_columns
                        )
                 print("Finished.")
                 if flag:
                     self.model = rf
                     flag = False
                 y_predict ,y_true= self.predict(valid,v_target)
                 point = self.score(y_true,y_predict)
                 if point > self.max_point:
                     self.max_point = point
                     self.model = rf
                     good_depth = depth
                     good_leaves = leaves
                     good_fraction = fraction
     print(f"depth : {good_depth} leaves : {good_leaves} fraction :{good_fraction}")
     self.model.booster_.save_model(f'models/lightgbm{good_depth}_{good_leaves}_{good_fraction}.txt')
     return self
Example #17
 def __init__(self):
     # Parameters required when constructing the class
     self.model = LGBMRegressor(learning_rate=0.015,
                                objective="regression",
                                metric='mse',
                                num_leaves=12,
                                max_depth=9,
                                max_bin=130,
                                feature_fraction=0.9,
                                reg_lambda=50,
                                min_data=25,
                                min_child_weight=0.001,
                                num_boost_round=3000,
                                random_state=42)
Example #18
def get_model(brand_string, train_brand, test_brand):
    brand1 = pd.read_csv(brand_string)
    brand1 = brand1.iloc[90:, :].reset_index(drop=True)
    X_brand1 = brand1.drop(['brand', 'cnt'], axis=1)
    y_train = brand1['cnt'].values

    X_train = pd.concat([X_brand1, train_brand], axis=1)

    X_test = test.drop(['cnt'], axis=1)
    X_test = pd.concat([X_test, test_brand], axis=1)

    model = LGBMRegressor().fit(X_train, y_train)
    brand1_pre = model.predict(X_test)
    return brand1_pre
Example #19
def tune_params():
    rmse_t_total, rmse_v_total = [], []
    for max_depth in range(6, 11):
        for subsample in [0.6, 0.7, 0.8]:
            for colsample_bytree in [0.6, 0.7, 0.8]:
                for reg_alpha in [0.1, 1, 10]:
                    lgb_base = LGBMRegressor(n_estimators=150,
                                             objective='regression',
                                             random_state=1234,
                                             n_jobs=3,
                                             colsample_bytree=colsample_bytree,
                                             reg_alpha=reg_alpha,
                                             max_depth=max_depth,
                                             subsample=subsample)
                    _params = {
                        'max_depth': max_depth,
                        'subsample': subsample,
                        'colsample_bytree': colsample_bytree,
                        'reg_alpha': reg_alpha,
                    }
                    lgb_base.fit(X_t, y_t)
                    y_t_pre = lgb_base.predict(X_t)
                    y_v_pre = lgb_base.predict(X_v)
                    rmse_t_each = np.sqrt(mean_squared_error(y_t, y_t_pre))
                    rmse_v_each = np.sqrt(mean_squared_error(y_v, y_v_pre))
                    rmse_t_total.append(rmse_t_each)
                    rmse_v_total.append(rmse_v_each)
                    print(_params)
                    with open(
                            'D:\\workspace python\\statContest\\save\\' +
                            'lgbbase2_saveparams_rmse_0412.txt',
                            'a',
                            encoding='utf-8') as myfile1:
                        print(_params['max_depth'],
                              _params['subsample'],
                              _params['colsample_bytree'],
                              _params['reg_alpha'],
                              file=myfile1)

                    print(rmse_t_each, rmse_v_each)
                    with open('D:\\workspace python\\statContest\\save\\' +
                              'lgbbase2_tunparms_rmse_0412.txt',
                              'a',
                              encoding='utf-8') as myfile:
                        print(rmse_t_each, ',', rmse_v_each, file=myfile)
    return rmse_t_total, rmse_v_total
def do():
    train_data = pd.read_csv(
        'D:/testFiles/for_excute_folder/activity_blFreight_2017_5_train_input.csv'
    )
    test_data = pd.read_csv(
        'D:/testFiles/for_excute_folder/activity_blFreight_2017_5_test_input.csv'
    )

    # Filter the Timeused <= 1000s
    train_data = train_data[train_data["TIME_USED"] <= 1000]
    test_data = test_data[test_data["TIME_USED"] <= 1000]

    # convert second to minute
    train_data['TIME_USED'] = train_data['TIME_USED'] / 60
    test_data['TIME_USED'] = test_data['TIME_USED'] / 60

    train_data['TIME_USERD_MEDIAN_S2'] = train_data['TIME_USERD_MEDIAN']**2
    test_data['TIME_USERD_MEDIAN_S2'] = test_data['TIME_USERD_MEDIAN']**2

    # bkgOffice_median_by_task_type

    train_data['TIME_USERD_MEDIAN_S3'] = train_data[
        'TIME_USERD_MEDIAN'] * train_data['bkgOffice_median_by_task_type']
    test_data['TIME_USERD_MEDIAN_S3'] = test_data[
        'TIME_USERD_MEDIAN'] * test_data['bkgOffice_median_by_task_type']

    print(train_data.head())

    y_train = train_data['TIME_USED'].values.tolist()
    X_train = train_data.drop(['TIME_USED'], axis=1).values.tolist()

    # Pick a model

    # regressor = SGDRegressor(l1_ratio=0.1)
    # regressor = Ridge()
    # regressor = SVR()
    # regressor = RandomForestRegressor(n_estimators=100)
    # regressor = AdaBoostRegressor()
    # regressor = GradientBoostingRegressor()
    # regressor = BaggingRegressor()
    # regressor = XGBRegressor(n_estimators=400)    # did not work
    regressor = LGBMRegressor(n_estimators=400,
                              learning_rate=0.02,
                              seed=2017,
                              colsample_bytree=1)

    rfecv = RFECV(estimator=regressor, step=1, cv=5, scoring='r2', n_jobs=-1)
    rfecv.fit(X_train, y_train)

    print("Optimal number of features : %d" % rfecv.n_features_)

    # Plot number of features VS. cross-validation scores
    plt.figure()
    plt.xlabel("Number of features selected")
    plt.ylabel("Cross validation score (nb of correct classifications)")
    plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
    plt.show()

    print(rfecv.support_)
    print(rfecv.ranking_)
    def __init__(self, data, continuous_feature_names, label, score, t='R', showFig=False):
        self.data = data
        self.continuous_feature_names = continuous_feature_names
        self.label = label
        self.score = score
        self.K = len(continuous_feature_names)
        self.T = t
        self.showFig = showFig
        # Candidate features
        self.train_X = data[continuous_feature_names]
        self.train_y = data[label]
        self.numNull = self.train_X.isnull().sum().sum()
        self.numInf = np.isinf(self.train_X.values).sum()

        # Candidate models
        self.linearRegressionModel = [LinearRegression(), Ridge(), Lasso(), LinearSVR()]
        self.linearClassModel = [LogisticRegression(), LinearSVC(), RidgeClassifier()]

        self.treeRegressionModel = [ExtraTreesRegressor(),
                                    DecisionTreeRegressor(),
                                    RandomForestRegressor(),  # RF is relatively slow
                                    GradientBoostingRegressor(),
                                    XGBRegressor(n_estimators=100, objective='reg:squarederror'),
                                    LGBMRegressor(n_estimators=100)]
        self.treeClassModel = [ExtraTreesClassifier(),
                               DecisionTreeClassifier(),
                               RandomForestClassifier(),
                               GradientBoostingClassifier(),
                               XGBClassifier(n_estimators=100, objective="binary:logistic"),
                               LGBMClassifier(n_estimators=100)]

        self.nonlinearRegressionModel = self.treeRegressionModel + [SVR(), MLPRegressor(solver='lbfgs', max_iter=100),]
        self.nonlinearClassModel = self.treeClassModel + [SVC(), MLPClassifier(),]
Example #22
def lightGBM_CV():
    print('Memory usage: ' + str(psutil.virtual_memory().percent) + '%')
    samples_df, data_df = make_train_set(train_step=True)
    labels = samples_df['label'].values
    samples_df = None
    values = data_df.values
    data_df = None
    param_test = {
        'max_depth': range(5, 15, 2),
    }
    estimator = LGBMRegressor(
        num_leaves=50,  # 50 was found optimal via CV
        max_depth=13,
        learning_rate=0.1,
        n_estimators=140,
        objective='regression',
        min_child_weight=1,
        subsample=0.8,
        colsample_bytree=0.8,
        nthread=7,
    )
    gsearch = GridSearchCV(estimator,
                           param_grid=param_test,
                           scoring='roc_auc',
                           cv=5)
    gsearch.fit(values, labels)
    gsearch.grid_scores_, gsearch.best_params_, gsearch.best_score_
    print_best_score(gsearch, param_test)
Example #23
    def create_model(self):
        # TODO: if learning rates are identical throughout - create a regular Classifier
        self.model_params['n_estimators'] = self.best_n_iterations
        self.model_params["learning_rate"] = self.learning_rates[
            0]  # TODO change

        final_model = LGBMRegressor(**self.model_params)
        return final_model


# class LGBClassifierLR(ClassifierMixin):
#     def __init__(self, model_params=None, n_estimators=None, learning_rates=None):
#         self.model_params = model_params
#         self.n_estimators = n_estimators
#         self.learning_rates = learning_rates
#
#     def fit(self, X, y, sample_weight=None):
#         dtrain = lgb.Dataset(X, label=y)
#         model = lgb.train(self.model_params
#                           , dtrain
#                           , num_boost_round=self.n_estimators
#                           , learning_rates=self.learning_rates
#                           )
#         self.model = model
#
#     def predict(self, X):
#         return self.model.predict(X)
#         # TODO Fix
#
#     def predict_proba(self, X):
#         return self.model.predict(X)
#
#     def get_params(self):
#         return self.learning_rates
Example #24
    def tune(self, training_set, logger=None, saver=None):
        self.training_set = training_set
        objective = generate_objective(self.training_set, self.tuning_metric)
        best = space_eval(
            self.space,
            fmin(fn=objective,
                 space=self.space,
                 trials=self.trials,
                 algo=tpe.suggest,
                 max_evals=self.max_evals))

        print(f'Search space: {self.space}')
        print(f'Best hyperparams: {best}')

        self.model = LGBMRegressor()
        self.model.set_params(**best)
        self.model.fit(training_set.X, training_set.y)
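For context, a hypothetical shape for self.space under hyperopt's hp API; the actual search space is not shown in the snippet, and integer-valued parameters would need casting before set_params.

# Hypothetical hyperopt search space of the kind tune() above expects in self.space.
from hyperopt import hp

space = {
    # hp.quniform yields floats, so num_leaves / n_estimators would need int() before set_params
    'num_leaves': hp.quniform('num_leaves', 16, 128, 1),
    'n_estimators': hp.quniform('n_estimators', 100, 2000, 50),
    'learning_rate': hp.loguniform('learning_rate', -5, -1),
    'subsample': hp.uniform('subsample', 0.6, 1.0),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.6, 1.0),
}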
Example #25
def train_LightGBM(x_train, y_train):
    clf = LGBMRegressor(
        n_estimators=10000,
        learning_rate=0.02,
        boosting_type='gbdt',
        objective='regression_l1',
        max_depth=-1,
        num_leaves=31,
        min_child_samples=20,
        feature_fraction=0.8,
        bagging_freq=1,
        bagging_fraction=0.8,
        lambda_l2=2,
        random_state=2020,
    )
    clf.fit(x_train, y_train)
    return clf
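A minimal usage sketch on synthetic data, assuming scikit-learn's make_regression; real features would come from the surrounding pipeline.

# Hypothetical usage of train_LightGBM() above.
from sklearn.datasets import make_regression
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

X_demo, y_demo = make_regression(n_samples=2000, n_features=30, noise=0.3, random_state=2020)
x_tr, x_te, y_tr, y_te = train_test_split(X_demo, y_demo, test_size=0.2, random_state=2020)

clf = train_LightGBM(x_tr, y_tr)
print('MAE on the held-out split:', mean_absolute_error(y_te, clf.predict(x_te)))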
Example #26
def train_lgb_model(best_nodes, X_train_scaled, Y_train):

    rsg = LGBMRegressor(
        learning_rate=best_nodes["learning_rate"],
        n_estimators=int(best_nodes["n_estimators"]),
        max_depth=best_nodes["max_depth"],
        #eval_metric=best_nodes["eval_metric"],
        num_leaves=best_nodes["num_leaves"],
        subsample=best_nodes["subsample"],
        colsample_bytree=best_nodes["colsample_bytree"],
        min_child_samples=best_nodes["min_child_samples"],
        min_child_weight=best_nodes["min_child_weight"])

    rsg.fit(X_train_scaled, Y_train)
    Y_pred = rsg.predict(X_train_scaled)
    print("mse:", np.mean((Y_pred - Y_train)**2))
    print("rmse:", np.sqrt(np.mean((Y_pred - Y_train)**2)))
    return rsg
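For illustration, a hypothetical best_nodes dict with the keys train_lgb_model() reads; the actual values would come from whatever hyper-parameter search produced them.

# Hypothetical best_nodes dict consumed by train_lgb_model() above.
best_nodes = {
    'learning_rate': 0.05,
    'n_estimators': 300,
    'max_depth': 7,
    'num_leaves': 63,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'min_child_samples': 20,
    'min_child_weight': 1e-3,
}
# rsg = train_lgb_model(best_nodes, X_train_scaled, Y_train)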
def lightBGM_model_with_test(X, Y):
    model = LGBMRegressor(num_leaves=36,
                          n_estimators=100,
                          learning_rate=0.07,
                          random_state=0)

    useful_feature = get_useful_features_byLightBGM(X, Y)
    X_U = X[useful_feature]

    x1, x2, y1, y2 = train_test_split(X_U, Y, test_size=0.2)
    y1_log = np.log1p(y1)
    model.fit(x1, y1_log, verbose=True)

    predict_log = model.predict(x2)
    predict = np.expm1(predict_log)
    error = error_fun(predict, y2)[1]

    del x1, x2, y1, y2
    return error
def rf_cv(num_leaves, max_depth, subsample, min_child_samples):
    val = cross_val_score(
        LGBMRegressor(objective='regression_l1',
                      num_leaves=int(num_leaves),
                      max_depth=int(max_depth),
                      subsample=subsample,
                      min_child_samples=int(min_child_samples)),
        X=train_X, y=train_y_ln, verbose=0, cv=5,
        scoring=make_scorer(mean_absolute_error)
    ).mean()
    return 1 - val
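rf_cv() returns 1 - MAE so a maximiser can be applied to it; below is a sketch of driving it with the bayes_opt package (an assumption; train_X and train_y_ln are expected to be defined upstream).

# Hypothetical Bayesian-optimisation driver for rf_cv() above, using the bayes_opt package.
from bayes_opt import BayesianOptimization

rf_bo = BayesianOptimization(
    rf_cv,
    {
        'num_leaves': (2, 100),
        'max_depth': (3, 15),
        'subsample': (0.5, 1.0),
        'min_child_samples': (5, 100),
    },
)
rf_bo.maximize(init_points=5, n_iter=25)
print(rf_bo.max)    # best score found and the corresponding parameters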
    def select_by_nonlinearmodel(self, models=None):
        """
        Tree-based models for feature selection.
        :param models:
        :return:
        """
        if not models:
            if (self.numNull != 0) | (self.numInf != 0):
                print('Features contain NaN or Inf!')
                print('NaN:{}, Inf:{}'.format(self.numNull, self.numInf))
                models = [XGBRegressor(n_estimators=100, objective='reg:squarederror'),
                      LGBMRegressor(n_estimators=100)]
            else:
                models = [
                      DecisionTreeRegressor(),
                      # RF is relatively slow
                      RandomForestRegressor(),
                      GradientBoostingRegressor(),
                      MLPRegressor(solver='lbfgs', max_iter=100),
                      XGBRegressor(n_estimators=100, objective='reg:squarederror'),
                      LGBMRegressor(n_estimators=100)]

        # Fit each model once with SelectFromModel to select features
        for model in models:
            model_name = str(model).split('(')[0]
            selector = SelectFromModel(model, max_features=self.K, threshold=-np.inf)
            selector.fit_transform(X=self.train_X, y=self.train_y)
            mask = selector.get_support(True)
            feature_names = np.array(self.continuous_feature_names)[mask]
            print("{} selected feature:{}".format(model_name, feature_names))

        if self.showFig:
            for model in models:
                model_name = str(model).split('(')[0]
                model.fit(self.train_X, self.train_y)

                self.dict_features_score(model.feature_importances_)
                # print(sorted(dict(zip(self.continuous_feature_names, model.feature_importances_)).items(), key=lambda x: x[1], reverse=True))
                sns.barplot(abs(model.feature_importances_), self.continuous_feature_names)
                plt.title('{} importances of features'.format(model_name))
                plt.show()
def get_estimator(estimator):
    if estimator == 'ridge':
        clf = Ridge()
    elif estimator == 'rfr':
        clf = RandomForestRegressor()
    elif estimator == 'lasso':
        clf = Lasso()
    elif estimator == 'lgbm':
        clf = LGBMRegressor()
    else:
        raise Exception("Name of esimator is error.")
    return clf
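A brief usage sketch for get_estimator(); the synthetic data and the fit call are illustrative.

# Hypothetical usage of get_estimator() above.
from sklearn.datasets import make_regression

X_demo, y_demo = make_regression(n_samples=200, n_features=5, random_state=0)

clf = get_estimator('lgbm')          # returns an unfitted LGBMRegressor
clf.fit(X_demo, y_demo)
print(clf.predict(X_demo[:3]))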