Example no. 1
def get_useful_features_byLightBGM(X, Y):
    # special parameter settings (importance threshold)
    importance_filter = 6

    model_3 = LGBMRegressor(num_leaves=36,
                            n_estimators=100,
                            learning_rate=0.07,
                            random_state=0)
    Y_log = np.log1p(Y)
    model_3.fit(X, Y_log, verbose=True)

    feature_score = model_3.feature_importances_
    importance_feature_map = list(zip(feature_score, X.columns))

    useless_feature = []
    for i in importance_feature_map:
        if i[0] <= importance_filter:
            useless_feature.append(i[1])
    feature = [c for c in X.columns]
    useful_feature = [aa for aa in feature if aa not in useless_feature]
    print('useful:', len(useful_feature))
    print('useless:', len(useless_feature))
    print('total:', len(feature))

    return useful_feature
Example no. 2

    def score_of_nonlinearmodel(self, model=None):
        """
        树模型
        :param models:
        :return:
        """
        if not [model]:
            if (self.numNull != 0) | (self.numInf != 0):
                print('特征中有NaN或Inf!!!')
                print('NaN:{},Inf:{}'.format(self.numNull, self.numInf))
            model = LGBMRegressor(n_estimators=100)

        model_name = str(model).split('(')[0]
        model.fit(self.train_X, self.train_y)

        if self.showFig:
            sns.barplot(abs(model.feature_importances_), self.continuous_feature_names)
            plt.title('{} importances of features'.format(model_name))
            plt.show()

        sc = [abs(x) for x in model.feature_importances_]
        sum_sc = sum(sc)
        featureScore = [round(s / sum_sc, 4) for s in sc]
        print(model_name + ' is finished')

        return featureScore
Example no. 3

def lightBGM_model(X, Y):
    model = LGBMRegressor(num_leaves=36,
                          n_estimators=100,
                          learning_rate=0.07,
                          random_state=0)
    model.fit(X, Y, verbose=True)
    return model
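A quick end-to-end sketch of how the two helpers above compose. The synthetic data is illustrative, and it assumes a LightGBM version (< 4.0) whose sklearn fit() still accepts verbose:

import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
X = pd.DataFrame(rng.rand(200, 10), columns=['f%d' % i for i in range(10)])
Y = pd.Series(X['f0'] * 3 + rng.rand(200))

useful = get_useful_features_byLightBGM(X, Y)   # drop low-importance columns
model = lightBGM_model(X[useful], np.log1p(Y))  # refit on the reduced feature set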
Example no. 4
class LGBMRegressorPrim(primitive):
    def __init__(self, random_state=0):
        super(LGBMRegressorPrim, self).__init__(name='LGBMRegressor')
        self.hyperparams = []
        self.type = 'Regressor'
        self.description = "LightGBM is a gradient boosting framework that uses tree based learning algorithms."
        self.hyperparams_run = {'default': True}
        self.random_state = random_state
        self.model = LGBMRegressor()
        self.accept_type = 'c_r'

    def can_accept(self, data):
        return self.can_accept_c(data, 'Regression')

    def is_needed(self, data):
        # data = handle_data(data)
        return True

    def fit(self, data):
        data = handle_data(data)
        self.model.fit(data['X'], data['Y'])

    def produce(self, data):
        output = handle_data(data)
        output['predictions'] = self.model.predict(output['X'])
        output['X'] = pd.DataFrame(output['predictions'], columns=[self.name+"Pred"])
        final_output = {0: output}
        return final_output
Example no. 5

def init(PROPERTIES_PATH, LOAD_FROM_DISK):

    # boost_params = {'n_estimators': 200,
    #                 'min_samples_split': 40,
    #                 'min_samples_leaf': 4,
    #                 'max_features': 'sqrt',
    #                 'max_depth': 20,
    #                 'learning_rate': 0.05}
    #
    # boost = GradientBoostingRegressor(**boost_params)

    boost = LGBMRegressor(learning_rate=0.05,
                          n_estimators=1127,
                          max_depth=-1,
                          min_child_weight=0,
                          num_leaves=68,
                          min_child_samples=5,
                          objective='regression',
                          subsample_for_bin=1000,
                          min_split_gain=0,
                          feature_fraction=0.5,
                          nthread=-1)
    train_data = load_all_data(get_connection(PROPERTIES_PATH),
                               TABLE_LIST,
                               is_train=True,
                               load_from_disk=LOAD_FROM_DISK)
    train_data = data_preprocessing(train_data)
    train_X, train_Y = train_data
    boost.fit(train_X, train_Y)
    np.save('col.npy', train_X.columns)

    print("training has been completed succesfully !!!!")
    print("--------------------------------------------")

    return boost
Example no. 6
def bulid_onetrain(train_data, test, pred=features, label='label', seed=1099, est=6000, is_shuffle=True):
    train_x, train_y = train_data[features].values, train_data[label].values
    clf = LGBMRegressor(learning_rate=0.01,
                        boosting_type='gbdt',
                        objective='regression',
                        n_estimators=est,
                        num_leaves=156,
                        subsample=0.8,
                        n_jobs=-1,  # was misspelled 'njobs', which LightGBM silently ignores
                        max_depth=8,
                        reg_lambda=0,
                        colsample_bytree=0.8,
                        random_state=2019,  # 2019
                        metric=['mse'])

    clf.fit(train_x, train_y,
            eval_set=[(train_x, train_y)],
            eval_metric=['mse'],
            categorical_feature='auto',
            verbose=100)

    #train_pred= clf.predict(train_x, num_iteration=clf.best_iteration_)


    test_pred= clf.predict(test[pred], num_iteration=clf.best_iteration_)

    #print('mean_squared_error:',mean_squared_error(train_y,train_pred))
    test['label'] = test_pred
    return test[['loadingOrder', 'label']],clf
Example no. 7
def train_lightgbm(verbose=True):
    """Train a boosted tree with LightGBM."""
    if verbose: print("Training with LightGBM")
    df = pd.read_csv(STAGE1_LABELS)
    x = np.array([
        np.mean(np.load(os.path.join(FEATURE_FOLDER, '%s.npy' % str(id))),
                axis=0).flatten() for id in df['id'].tolist()
    ])
    y = df['cancer'].values  # .as_matrix() was removed in pandas 1.0

    # sklearn.cross_validation was removed in 0.20; use model_selection's train_test_split.
    # Note: this hold-out split is immediately superseded by the StratifiedKFold loop below.
    trn_x, val_x, trn_y, val_y = train_test_split(
        x, y, random_state=42, stratify=y, test_size=0.20)
    '''
    params = {
        'task': 'train',
        'boosting_type': 'gbdt',
        'objective': 'regression',
        'metric': {'l2'},
        'num_leaves': 21,
        'learning_rate': 0.001,
        'nthread':24,
        'subsample':0.80,
        'colsample_bytree':0.80,
        'seed':42,
        'verbose': verbose,
    }
    '''

    skf = StratifiedKFold(n_splits=5, random_state=2048, shuffle=True)
    result = []
    clfs = []
    oof_preds = []
    for train_index, test_index in skf.split(x, y):
        trn_x, val_x = x[train_index, :], x[test_index, :]
        trn_y, val_y = y[train_index], y[test_index]

        val_ids = pd.DataFrame(df['id'].iloc[test_index].values, columns=['id'])  # 'ids' was undefined; take ids from df

        clf = LGBMRegressor(max_depth=50,
                            num_leaves=21,
                            n_estimators=5000,
                            min_child_weight=1,
                            learning_rate=0.001,
                            nthread=24,
                            subsample=0.80,
                            colsample_bytree=0.80,
                            seed=42)

        clf.fit(trn_x,
                trn_y,
                eval_set=[(val_x, val_y)],
                verbose=verbose,
                eval_metric='l2',
                early_stopping_rounds=300)

        val_preds = pd.DataFrame(clf.predict(val_x), columns=["cancer"])
        oof_preds.append(pd.concat([val_ids, val_preds], axis=1))
        clfs.append(clf)

    return clfs, oof_preds
Example no. 8
def get_ntree():
    rmse_t_total, rmse_v_total = [], []
    for ntree in range(10, 500, 10):
        lgb_base = LGBMRegressor(n_estimators=ntree,
                                 objective='regression',
                                 random_state=1234,
                                 n_jobs=2,
                                 colsample_bytree=0.8,
                                 reg_alpha=1,
                                 max_depth=10,
                                 subsample=0.8)

        print('current ntree = %s' % ntree)
        lgb_base.fit(X_t, y_t)
        y_t_pre = lgb_base.predict(X_t)
        y_v_pre = lgb_base.predict(X_v)
        rmse_t_each = np.sqrt(mean_squared_error(y_t, y_t_pre))
        rmse_v_each = np.sqrt(mean_squared_error(y_v, y_v_pre))
        rmse_t_total.append(rmse_t_each)
        rmse_v_total.append(rmse_v_each)
        with open('D:\\workspace python\\statContest\\save\\'
                  'lgbbase2_rmse_0412.txt', 'a', encoding='utf-8') as myfile:
            print(rmse_t_each, ',', rmse_v_each, file=myfile)
    return rmse_t_total, rmse_v_total
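A follow-up sketch (not part of the original) that reuses the returned lists to pick the tree count with the lowest validation RMSE; assumes matplotlib is installed:

import numpy as np
import matplotlib.pyplot as plt

rmse_t_total, rmse_v_total = get_ntree()
ntrees = list(range(10, 500, 10))  # the same grid the loop iterates over
print('best n_estimators:', ntrees[int(np.argmin(rmse_v_total))])

plt.plot(ntrees, rmse_t_total, label='train RMSE')
plt.plot(ntrees, rmse_v_total, label='valid RMSE')
plt.xlabel('n_estimators')
plt.ylabel('RMSE')
plt.legend()
plt.show()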
Example no. 9
def lightGBM_train_nocross(j, param, x_train, x_test, y_train, y_test):
    # 'object' is not a valid keyword here; the intended parameter is 'objective'
    gbm = LGBMRegressor(**param, num_leaves=31, learning_rate=0.01, objective='regression')
    gbm.fit(x_train, y_train)
    y_pred = gbm.predict(x_test)
    y_pred = DataFrame(y_pred)  # pandas.DataFrame
    # rmse_lightGBM and r2_lightGBM are module-level lists accumulating scores per call
    rmse_lightGBM.append(np.sqrt(mean_squared_error(y_pred, y_test)))
    r2_lightGBM.append(r2_score(y_test, y_pred))
    return rmse_lightGBM, r2_lightGBM, gbm
Example no. 10
def lgb(x_train, y_train, x_val, y_val):
    lgb = LGBMRegressor(n_estimators=1000,
                        max_depth=10,
                        subsample=0.8,
                        colsample_bytree=0.8,
                        learning_rate=0.01,
                        random_state=2020)
    lgb.fit(x_train, y_train)
    result = lgb.predict(x_val)
    score = mean_absolute_error(result, y_val)
    return score
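A usage sketch for the scorer above; the synthetic frame and split are assumptions, and LGBMRegressor / mean_absolute_error are imported as in the function's own module:

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

rng = np.random.RandomState(2020)
X = pd.DataFrame(rng.rand(500, 8))
y = X[0] * 2 + rng.rand(500)

x_train, x_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=2020)
print('validation MAE:', lgb(x_train, y_train, x_val, y_val))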
Example no. 11
def train_lightgbm(trn_x, val_x, trn_y, val_y):
    clf = LGBMRegressor(max_depth=50,
                        num_leaves=21,
                        n_estimators=5000,
                        min_child_weight=9,
                        learning_rate=0.01,
                        nthread=24,
                        subsample=0.80,
                        colsample_bytree=0.80,
                        seed=42)
    clf.fit(trn_x, trn_y, eval_set=[(val_x, val_y)], verbose=True, eval_metric='l2', early_stopping_rounds=300)
    return clf
Example no. 12
def build_model(train_data, test, pred, label, seed=2099, is_shuffle=True):
    train_pred = np.zeros((train_data.shape[0], ))
    test_pred = np.zeros((test.shape[0], ))
    n_splits = 5
    # Kfold
    fold = KFold(n_splits=n_splits, shuffle=is_shuffle, random_state=seed)
    kf_way = fold.split(train_data[pred])
    # params
    #     test_x=np.concatenate([test[pred].values,geohash_test],axis=1)
    # train
    for n_fold, (train_idx, valid_idx) in enumerate(kf_way, start=1):
        train_x, train_y = train_data[pred].iloc[train_idx].values, train_data[
            label].iloc[train_idx]
        valid_x, valid_y = train_data[pred].iloc[valid_idx].values, train_data[
            label].iloc[valid_idx]
        #         geohash_tr_x,geohash_val_x=geohash_train[train_idx],geohash_train[valid_idx]
        #         train_x=np.concatenate([train_x,geohash_tr_x],axis=1)
        #         valid_x=np.concatenate([valid_x,geohash_val_x],axis=1)

        # build the model for this fold
        clf = LGBMRegressor(
            learning_rate=0.5,
            n_estimators=6000,
            boosting_type='gbdt',
            objective='regression',
            num_leaves=156,
            subsample=0.8,
            n_jobs=-1,  # was misspelled 'njobs', which LightGBM silently ignores
            max_depth=6,
            reg_lambda=0,
            colsample_bytree=0.8,
            random_state=2019,  # 2019
            metric=['mse'])

        clf.fit(train_x,
                train_y,
                eval_set=[(valid_x, valid_y)],
                eval_metric=['mse'],
                categorical_feature='auto',
                early_stopping_rounds=100,
                verbose=100)

        train_pred[valid_idx] = clf.predict(valid_x,
                                            num_iteration=clf.best_iteration_)

        test_pred += clf.predict(
            test[pred], num_iteration=clf.best_iteration_) / fold.n_splits

    print('mean_squared_error:',
          mean_squared_error(train_data[label].values, train_pred))
    test['label'] = test_pred
    return test[['loadingOrder', 'label']], clf
Example no. 13

    def setUp(self):
        X_train, y_train, X_test, y_test = titanic_fare()
        self.test_len = len(X_test)

        train_names, test_names = titanic_names()
        self.names = test_names  # reuse the result instead of calling titanic_names() twice

        model = LGBMRegressor()
        model.fit(X_train, y_train)
        self.explainer = RegressionExplainer(model, X_test, y_test, r2_score, 
                                        shap='tree', 
                                        cats=['Sex', 'Deck', 'Embarked'],
                                        idxs=test_names, units="$")
Example no. 14
 def train_LGBM(self, train, t_target, valid, v_target, parm, use_custom_loss=False, reg_alpha=0, reg_lambda=0):
     #entity_features_columns = ['total_floor','building_material','city_town', 'building_type', 'building_use', 'parking_way', 'I_index_50', 'I_index_500', 'I_index_1000', 'I_index_5000', 'I_index_10000', 'II_index_50', 'II_index_500', 'II_index_1000', 'II_index_5000', 'II_index_10000', 'III_index_50', 'III_index_500', 'III_index_1000', 'III_index_5000', 'III_index_10000', 'IV_index_50', 'IV_index_500', 'IV_index_1000', 'IV_index_5000', 'IV_index_10000', 'V_index_50', 'V_index_500', 'V_index_1000', 'V_index_5000', 'V_index_10000', 'VI_index_50', 'VI_index_500', 'VI_index_1000', 'VI_index_5000', 'VI_index_10000', 'VII_index_50', 'VII_index_500', 'VII_index_1000', 'VII_index_5000', 'VII_index_10000', 'VIII_index_50', 'VIII_index_500', 'VIII_index_1000', 'VIII_index_5000', 'VIII_index_10000', 'IX_index_50', 'IX_index_500', 'IX_index_1000', 'IX_index_5000', 'IX_index_10000', 'X_index_50', 'X_index_500', 'X_index_1000', 'X_index_5000', 'X_index_10000', 'XI_index_50', 'XI_index_500', 'XI_index_1000', 'XI_index_5000', 'XI_index_10000', 'XII_index_50', 'XII_index_500', 'XII_index_1000', 'XII_index_5000', 'XII_index_10000', 'XIII_index_50', 'XIII_index_500', 'XIII_index_1000', 'XIII_index_5000', 'XIII_index_10000', 'XIV_index_50', 'XIV_index_500', 'XIV_index_1000', 'XIV_index_5000', 'XIV_index_10000','parking_price_isna','txn_floor_isna']
     #entity_features_columns = ['building_material', 'city', 'town', 'village', 'building_type', 'building_use', 'parking_way','parking_price_isna','txn_floor_isna']
     if use_custom_loss:
         self.loss = custom_loss
     learning_rate = parm['learning_rate']
     n_estimators = parm['n_estimators']
     max_depth = parm['max_depth']
     num_leaves = parm['num_leaves']
     feature_fraction = parm['feature_fraction']
     flag = True
     good_depth = 0
     good_leaves = 0
     good_fraction = 0
     
     for depth in max_depth:
         for leaves in num_leaves:
             for fraction in feature_fraction:
                 rf = LGBMRegressor(learning_rate=learning_rate, 
                                    objective='regression', 
                                    n_estimators=n_estimators,
                                    max_depth=depth, 
                                    num_leaves=leaves, 
                                    reg_alpha=reg_alpha,
                                    reg_lambda = reg_lambda,
                                    feature_fraction=fraction, 
                                    bagging_freq=1,
                                    metric='rmse')           
                 rf.fit(train, t_target,  # should we drop features that do not correlate with the target?
                        eval_set=[(train, t_target), (valid, v_target)],
                        #early_stopping_rounds=100, 
                        verbose=5000,
                        eval_metric=self.loss,
                        categorical_feature=self.entity_features_columns
                        )
                 print("Finished.")
                 if flag:
                     self.model = rf
                     flag = False
                 y_predict ,y_true= self.predict(valid,v_target)
                 point = self.score(y_true,y_predict)
                 if point > self.max_point:
                     self.max_point = point
                     self.model = rf
                     good_depth = depth
                     good_leaves = leaves
                     good_fraction = fraction
     print(f"depth : {good_depth} leaves : {good_leaves} fraction :{good_fraction}")
     self.model.booster_.save_model(f'models/lightgbm{good_depth}_{good_leaves}_{good_fraction}.txt')
     return self
Example no. 15
def predict(X_train, Y_train, X_test):
    print("Y_train is 1:", Y_train.count(1))
    print("Y_train is 0:", Y_train.count(0))
    clfs = [
        LGBMRegressor(learning_rate=0.0475, max_depth=13, n_estimators=100, num_leaves=80),
        XGBRegressor(learning_rate=0.0475, max_depth=4, n_estimators=300)]
    X = np.array(X_train, dtype='float32')
    y = np.array(Y_train, dtype='float32')
    X_predict = np.array(X_test, dtype='float32')
    dataset_blend_train = np.zeros((X.shape[0], len(clfs)), dtype='float32')
    dataset_blend_test = np.zeros((X_predict.shape[0], len(clfs)), dtype='float32')

    '''5-fold stacking'''
    n_folds = 5
    skf = StratifiedKFold(n_splits=n_folds)
    for j, clf in enumerate(clfs):
        '''train each single model in turn'''
        print("clf", j)
        dataset_blend_test_j = np.zeros((X_predict.shape[0], n_folds), dtype='float32')
        for i, (train, test) in enumerate(skf.split(X, y)):
            '''use part i for prediction and train on the remaining parts; its predicted output becomes the new feature for part i.'''
            print("stacking Fold", i)
            X_train, y_train, X_test, y_test = X[train], y[train], X[test], y[test]
            # if j == 0:
            #     class_weights = class_weight.compute_class_weight('balanced', np.unique(y_train), y_train)
            #     clf.class_weight = dict(enumerate(class_weights))
            # else:
            #     class_weights = class_weight.compute_class_weight('balanced', np.unique(y_train), y_train)
            #     clf.scale_pos_weight = class_weights[1] / class_weights[0]
            #     print('scale_pos_weight:', clf.scale_pos_weight)
            clf.fit(X_train, y_train)
            y_submission = clf.predict(X_test)
            dataset_blend_train[test, j] = y_submission
            dataset_blend_test_j[:, i] = clf.predict(X_predict)
        '''for the test set, use the mean of the k models' predictions as the new feature'''
        dataset_blend_test[:, j] = dataset_blend_test_j.mean(1)
        del dataset_blend_test_j
        # print("val auc Score: %f" % roc_auc_score(y_predict, dataset_blend_test[:, j]))
    # clf = LogisticRegression()
    # clf = GradientBoostingRegressor(learning_rate=0.02, max_depth=6)
    clf = LGBMRegressor()
    class_weights = class_weight.compute_class_weight(
        class_weight='balanced', classes=np.unique(y), y=y)  # keyword args required in sklearn >= 1.0
    clf.class_weight = dict(enumerate(class_weights))
    dataset_blend_train = np.append(dataset_blend_train, X, axis=1)
    dataset_blend_test = np.append(dataset_blend_test, X_predict, axis=1)
    clf.fit(dataset_blend_train, y)
    y_submission = clf.predict(dataset_blend_test)
    return y_submission
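For comparison, the same out-of-fold blend can be expressed with sklearn's StackingRegressor; a sketch, not the original author's pipeline (X, y, X_predict are the arrays built above):

from sklearn.ensemble import StackingRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor

stack = StackingRegressor(
    estimators=[('lgbm', LGBMRegressor(learning_rate=0.0475, max_depth=13,
                                       n_estimators=100, num_leaves=80)),
                ('xgb', XGBRegressor(learning_rate=0.0475, max_depth=4, n_estimators=300))],
    final_estimator=LGBMRegressor(),
    cv=5,              # builds out-of-fold predictions, like the manual k-fold loop above
    passthrough=True)  # feed original features to the meta-model, like np.append above
y_submission = stack.fit(X, y).predict(X_predict)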
Example no. 16
def tune_params():
    rmse_t_total, rmse_v_total = [], []
    for max_depth in range(6, 11):
        for subsample in [0.6, 0.7, 0.8]:
            for colsample_bytree in [0.6, 0.7, 0.8]:
                for reg_alpha in [0.1, 1, 10]:
                    lgb_base = LGBMRegressor(n_estimators=150,
                                             objective='regression',
                                             random_state=1234,
                                             n_jobs=3,
                                             colsample_bytree=colsample_bytree,
                                             reg_alpha=reg_alpha,
                                             max_depth=max_depth,
                                             subsample=subsample)
                    _params = {
                        'max_depth': max_depth,
                        'subsample': subsample,
                        'colsample_bytree': colsample_bytree,
                        'reg_alpha': reg_alpha,
                    }
                    lgb_base.fit(X_t, y_t)
                    y_t_pre = lgb_base.predict(X_t)
                    y_v_pre = lgb_base.predict(X_v)
                    rmse_t_each = np.sqrt(mean_squared_error(y_t, y_t_pre))
                    rmse_v_each = np.sqrt(mean_squared_error(y_v, y_v_pre))
                    rmse_t_total.append(rmse_t_each)
                    rmse_v_total.append(rmse_v_each)
                    print(_params)
                    with open('D:\\workspace python\\statContest\\save\\'
                              'lgbbase2_saveparams_rmse_0412.txt',
                              'a', encoding='utf-8') as myfile1:
                        print(_params['max_depth'],
                              _params['subsample'],
                              _params['colsample_bytree'],
                              _params['reg_alpha'],
                              file=myfile1)
                    print(rmse_t_each, rmse_v_each)
                    with open('D:\\workspace python\\statContest\\save\\'
                              'lgbbase2_tunparms_rmse_0412.txt',
                              'a', encoding='utf-8') as myfile:
                        print(rmse_t_each, ',', rmse_v_each, file=myfile)
    return rmse_t_total, rmse_v_total
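The two returned lists follow the nesting order of the loops, so the best combination can be recovered afterwards; a small sketch assuming the same grid:

import itertools
import numpy as np

rmse_t_total, rmse_v_total = tune_params()
grid = list(itertools.product(range(6, 11), [0.6, 0.7, 0.8], [0.6, 0.7, 0.8], [0.1, 1, 10]))
best = grid[int(np.argmin(rmse_v_total))]
print('best (max_depth, subsample, colsample_bytree, reg_alpha):', best)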
Example no. 17

    def LGB_train(self, X_train, X_valid, labels_train, labels_valid, X_test, lgb_param_all):
        lgb_param_contrl = {'early_stopping_rounds': 100, 'categorical_feature': 'auto'}
        lgb_param = lgb_param_all.copy()
        objective_type = lgb_param['objective_type']
        lgb_param.pop('objective_type')

        for k in ['early_stopping_rounds', 'categorical_feature']:
            if k in lgb_param:
                lgb_param_contrl[k] = lgb_param[k]
                lgb_param.pop(k)

        if not self.config.retrain:
            # incremental training: continue from a previously saved model
            model_load = self.load_model()
            if not model_load:
                print('Model {} not found; training from scratch'.format(self.modelName))
                if objective_type == 'regressor':
                    clf = LGBMRegressor(**lgb_param)
                else:
                    clf = LGBMClassifier(**lgb_param)

                clf.fit(X_train, labels_train, eval_set=[(X_valid, labels_valid)], eval_metric='rmse',
                        early_stopping_rounds=lgb_param_contrl['early_stopping_rounds'],
                        categorical_feature=lgb_param_contrl['categorical_feature'])
            else:
                clf = model_load.fit(X_train, labels_train, eval_set=[(X_valid, labels_valid)], eval_metric='rmse',
                                     early_stopping_rounds=lgb_param_contrl['early_stopping_rounds'],
                                     categorical_feature=lgb_param_contrl['categorical_feature'])
        else:
            if objective_type == 'regressor':
                clf = LGBMRegressor(**lgb_param)
            else:
                clf = LGBMClassifier(**lgb_param)
            clf.fit(X_train, labels_train, eval_set=[(X_valid, labels_valid)], eval_metric='rmse',
                    early_stopping_rounds=lgb_param_contrl['early_stopping_rounds'],
                    categorical_feature=lgb_param_contrl['categorical_feature'])


        val_lgb_pre = clf.predict(X_valid.values, num_iteration=clf.best_iteration_)
        test_lgb_pre = clf.predict(X_test.values, num_iteration=clf.best_iteration_)

        metrics_name = self.config.metrics_name
        myMetrics = defindMetrics.MyMetrics(metrics_name)
        score_lgb = myMetrics.metricsFunc(val_lgb_pre, labels_valid)

        self.save_model(clf, self.config.saveModel)
        return val_lgb_pre, test_lgb_pre, score_lgb
Example no. 18
def train_LightGBM(x_train, y_train):
    clf = LGBMRegressor(
        n_estimators=10000,
        learning_rate=0.02,
        boosting_type='gbdt',
        objective='regression_l1',
        max_depth=-1,
        num_leaves=31,
        min_child_samples=20,
        feature_fraction=0.8,
        bagging_freq=1,
        bagging_fraction=0.8,
        lambda_l2=2,
        random_state=2020,
    )
    clf.fit(x_train, y_train)
    return clf
Example no. 19
def train_lgb_model(best_nodes, X_train_scaled, Y_train):

    rsg = LGBMRegressor(
        learning_rate=best_nodes["learning_rate"],
        n_estimators=int(best_nodes["n_estimators"]),
        max_depth=best_nodes["max_depth"],
        #eval_metric=best_nodes["eval_metric"],
        num_leaves=best_nodes["num_leaves"],
        subsample=best_nodes["subsample"],
        colsample_bytree=best_nodes["colsample_bytree"],
        min_child_samples=best_nodes["min_child_samples"],
        min_child_weight=best_nodes["min_child_weight"])

    rsg.fit(X_train_scaled, Y_train)
    Y_pred = rsg.predict(X_train_scaled)
    print("mse:", np.mean((Y_pred - Y_train)**2))
    print("rmse:", np.sqrt(np.mean((Y_pred - Y_train)**2)))
    return rsg
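For context, best_nodes is the kind of dict a hyperparameter search returns; a hypothetical example of the shape this function expects (all values illustrative):

best_nodes = {
    "learning_rate": 0.05,
    "n_estimators": 300.0,   # may arrive as a float; cast to int inside train_lgb_model
    "max_depth": 7,
    "num_leaves": 31,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "min_child_samples": 20,
    "min_child_weight": 1e-3,
}
rsg = train_lgb_model(best_nodes, X_train_scaled, Y_train)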
Example no. 20

def lightBGM_model_with_test(X, Y):
    model = LGBMRegressor(num_leaves=36,
                          n_estimators=100,
                          learning_rate=0.07,
                          random_state=0)

    useful_feature = get_useful_features_byLightBGM(X, Y)
    X_U = X[useful_feature]

    x1, x2, y1, y2 = train_test_split(X_U, Y, test_size=0.2)
    y1_log = np.log1p(y1)
    model.fit(x1, y1_log, verbose=True)

    predict_log = model.predict(x2)
    predict = np.expm1(predict_log)
    error = error_fun(predict, y2)[1]

    del x1, x2, y1, y2
    return error
Example no. 21
def train_lightgbm(verbose=True):
    """Train a boosted tree with LightGBM."""
    if verbose: print("Training with LightGBM")
    df = pd.read_csv(STAGE1_LABELS)
    x = np.array([np.mean(np.load(FEATURE_FOLDER+'%s.npy' % str(id)), axis=0).flatten() for id in df['id'].tolist()])
    y = df['cancer'].values  # .as_matrix() was removed in pandas 1.0

    # sklearn.cross_validation was removed in 0.20; use model_selection's train_test_split
    trn_x, val_x, trn_y, val_y = train_test_split(x, y, random_state=42, stratify=y,
                                                  test_size=0.20)
    clf = LGBMRegressor(max_depth=50,
                        num_leaves=21,
                        n_estimators=5000,
                        min_child_weight=1,
                        learning_rate=0.001,
                        nthread=24,
                        subsample=0.80,
                        colsample_bytree=0.80,
                        seed=42)
    clf.fit(trn_x, trn_y, eval_set=[(val_x, val_y)], verbose=verbose, eval_metric='l2', early_stopping_rounds=300)

    return clf
Example no. 22
    def fit(self):
        if self.First_change:  # Box-Cox transform
            act = boxcox(self.train_label + 0.1)[0]
            self.act_ = boxcox(self.train_label + 0.1)[1]
        else:
            act = self.train_label
        steps = self.steps
        actual = act
        n_samples = len(self.train_label)
        y_pred_train = np.zeros(n_samples, np.float32)
        n_estimators_list = self.n_estimators_list
        for i in range(1):
            num = np.random.randint(0, 5000)
            print("----training begin----")
            for step in range(steps):
                print(step)
                actual = actual - y_pred_train  # compute the residual
                if step > 0:  # squash the residual, then apply a Box-Cox transform
                    actual_ = sigmod(actual)
                    actual_box = boxcox(actual_)[0]
                    actual_box_val = boxcox(actual_)[1]
                    self.box_value.append(actual_box_val)
                    actual_used = actual_box
                else:
                    actual_used = actual
                # build the stage model
                model = LGBMRegressor(n_estimators=n_estimators_list[step],
                                      max_depth=3,
                                      learning_rate=0.02,
                                      subsample=1,
                                      colsample_bytree=1)
                model.fit(self.train.values, actual_used)  # fit the stage model
                y_pred_train_ = model.predict(self.train.values)  # stage prediction
                if step > 0:  # invert the transforms back to the original scale
                    y_pred_train = (y_pred_train_ * actual_box_val +
                                    1)**(1 / actual_box_val)
                    y_pred_train = sigmod_trans(y_pred_train)
                else:
                    y_pred_train = y_pred_train_
                self.model_list.append(model)  # store the stage model
Example no. 23
def modelingLGBM(hold_out_train,hold_out_test):
    from sklearn.linear_model import LassoCV as LaCV
    from sklearn.ensemble import RandomForestRegressor as RFR

    from sklearn.linear_model import Ridge
    from sklearn.linear_model import RANSACRegressor
    from sklearn.neural_network import MLPRegressor as MLP
    from xgboost.sklearn import XGBRegressor as XGBR
    from xgboost.sklearn import DMatrix
    from lightgbm.sklearn import LGBMRegressor as LGBM

    traindata=hold_out_train.copy()
    testdata=hold_out_test.copy()
    traindata=traindata.drop(['Store','Customers','Date','Open','PromoInterval','monthstr'],axis=1)
    testdata=testdata.drop(['Store','Customers','Date','Open','PromoInterval','monthstr'],axis=1)
    train_x=traindata.drop(['Sales'],axis=1)
    train_y=np.log1p(traindata['Sales'])
    test_x=testdata.drop(['Sales'],axis=1)

    # # normalization
    # min_max_scaler = MinMaxScaler()
    # train_x = min_max_scaler.fit_transform(train_x)
    # test_x = min_max_scaler.fit_transform(test_x)

    smalest_rmspe = 1000
    subsamples = np.arange(0.5, 0.6, 0.1)
    for subsample in subsamples:
        time1 = time.time()
        # pass the loop variable; the original hard-coded subsample=0.8,
        # which made this grid search a no-op
        lgbmModel = LGBM(n_estimators=8000, subsample=subsample)
        print(lgbmModel)
        lgbmModel.fit(train_x, train_y)
        sales_predict = lgbmModel.predict(test_x)
        rmspe = RMSPE(testdata['Sales'], np.expm1(sales_predict))
        print(rmspe)
        time2 = time.time()
        print('time elapsed:', (time2 - time1))
        if smalest_rmspe > rmspe:
            smalest_rmspe = rmspe
            best_model = lgbmModel
    return best_model
Example no. 24
def predict_lgb(X, y, df2, params, ind):

    X_train, y_train = X, y

    output = df2[(df2.index >= ind) & (df2.index <
                                       (ind + 28))]  # dataset for prediction
    X = output.iloc[:, 1:]  # this basically drops the "value" column

    lgb_model = LGBMRegressor(**params)
    lgb_reg = lgb_model.fit(X_train, y_train.value.ravel())
    preds = lgb_reg.predict(X)

    return preds
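A hypothetical call, assuming df2 is a feature frame indexed by time step whose first column is the target 'value' (names and parameters below are illustrative):

params = {'n_estimators': 500, 'learning_rate': 0.05, 'num_leaves': 63}
history = df2[df2.index < ind]
X_hist = history.iloc[:, 1:]  # features up to the forecast origin
y_hist = history[['value']]   # the function accesses y_train.value internally
preds = predict_lgb(X_hist, y_hist, df2, params, ind)  # predictions for the next 28 steps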
Example no. 25
 def get_model_result(self, params: dict) -> dict:
     X, y = self.X, self.Y
     X_test, y_test = self.X_test, self.Y_test
     # X, y = self.X.values, self.Y.values
     # X_test, y_test = self.X_test.values, self.Y_test.values
     if isinstance(self.estimator, lgb.Booster):
         params["metric"] = "auc"
         estimator = lgb.train(params, self.dataset_train)
         # Booster.predict takes raw feature data, not a lgb.Dataset
         pred_train = pd.Series(estimator.predict(self.X),
                                index=self.X.index)
         pred_test = pd.Series(estimator.predict(self.X_test),
                               index=self.X_test.index)
     elif isinstance(self.estimator, LGBMRegressor):
         estimator = LGBMRegressor(**params)
         estimator.fit(X, y, eval_metric="auc")
         pred_train = pd.Series(estimator.predict(X), index=self.X.index)
         pred_test = pd.Series(estimator.predict(X_test),
                               index=self.X_test.index)
     elif isinstance(self.estimator, LGBMClassifier):
         estimator = LGBMClassifier(**params)
         estimator.fit(X, y, eval_metric="auc")
         pred_train = pd.Series(estimator.predict_proba(X)[:, 1],
                                index=self.X.index)
         pred_test = pd.Series(estimator.predict_proba(X_test)[:, 1],
                               index=self.X_test.index)
     else:
         raise TypeError(
             "Input model should be a `lgb.Booster` or `LGBMClassifier`/`LGBMRegressor`!"
         )
     # blank out scores outside the hit indices
     pred_train.loc[~pred_train.index.isin(self.hit_indices)] = np.nan
     pred_test.loc[~pred_test.index.isin(self.hit_indices)] = np.nan
     # compute model evaluation metrics
     ks_train, ks_test = calc_ks(-pred_train,
                                 y), calc_ks(-pred_test, y_test)
     auc_train, auc_test = calc_auc(pred_train,
                                    y), calc_auc(pred_test, y_test)
     # return {'train': (ks_train, auc_train), 'test': (ks_test, auc_test)}
     return {"ks": (ks_train, ks_test), "auc": (auc_train, auc_test)}
Example no. 26
    def model_lgb(self, X, Y):
        # create dataset for lightgbm

        # specify your configurations as a dict
        # params = {
        #     'task': 'train',
        #     'boosting_type': 'gbdt',  # can also be 'rf' (random forest), 'dart', or 'goss'
        #     'objective': 'binary',
        #     'metric': {'cross_entropy'},  # cross_entropy
        #     'num_leaves': 80,  # 50
        #     # 'max_depth': 6,  # 6
        #     'learning_rate': 0.06,
        #     'bagging_fraction': 0.8,
        #     'bagging_freq': 5,
        #     'seed': 0,
        #     # 'min_data_in_leaf ': 100,
        # }  # f1 0.43
        # train
        # X, Y = SMOTE().fit_sample(X, Y)
        # print("Y is 1:", Y.count(1))
        # print("Y is 0:", Y.count(0))
        class_weights = class_weight.compute_class_weight(
            class_weight='balanced', classes=np.unique(Y), y=Y)  # keyword args required in sklearn >= 1.0
        class_weights = dict(enumerate(class_weights))
        print("class_weights", class_weights)
        lgb_model = LGBMRegressor(learning_rate=0.0475,
                                  max_depth=13,
                                  n_estimators=100,
                                  num_leaves=50,
                                  class_weight=class_weights)
        # lgb_model = LGBMRegressor(learning_rate=0.0475, max_depth=13, n_estimators=100, num_leaves=60,
        #                           class_weight=class_weights) # 0.552
        # lgb_model = LGBMRegressor(learning_rate=0.0475, max_depth=13, n_estimators=100, num_leaves=70,
        #                           class_weight=class_weights) # 0.542
        # {'learning_rate': 0.0475, 'max_depth': 13, 'n_estimators': 100, 'num_leaves': 70} 0.464
        print("Training lgb model....")
        gbm = lgb_model.fit(X, Y)
        print("feature_importances_ : ", gbm.feature_importances_)
        print("Save model to " + self.model_path)
        dump(gbm, self.model_path)
Example no. 27

for i in range(len(duration)):
    duration_hours.append(int(duration[i].split(sep = "h")[0]))    
    duration_mins.append(int(duration[i].split(sep = "m")[0].split(sep = "h")[-1]))

X["Duration_hours"] = duration_hours
X["Duration_mins"] = duration_mins
X.drop(["Duration"], axis = 1,inplace = True)

X.drop(["Dep_Time"], axis = 1,inplace = True)

X.replace({"non-stop": 0, "1 stop": 1, "2 stops": 2, "3 stops": 3, "4 stops": 4}, inplace = True)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

from lightgbm.sklearn import LGBMRegressor
reg = LGBMRegressor()
reg.fit(X_train,y_train)

y_pred=reg.predict(X_test)


from sklearn import metrics
import numpy as np  # needed for np.sqrt below; missing from the original snippet

print('MAE:', metrics.mean_absolute_error(y_test, y_pred))
print('MSE:', metrics.mean_squared_error(y_test, y_pred))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

import pickle  # missing from the original snippet

filename = 'flightfare.pkl'
pickle.dump(reg, open(filename, 'wb'))
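To reuse the saved model later, a minimal loading sketch (the frame passed to predict must carry the same columns as at fit time):

import pickle

with open('flightfare.pkl', 'rb') as f:
    reg = pickle.load(f)
fare_pred = reg.predict(X_test)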

Example no. 28
best_params = space_eval(hyper_space, best_vals)
print("BEST PARAMETERS: " + str(best_params))

# Print best CV score
scores = [-trial['result']['loss'] for trial in trials.trials]
print("BEST CV SCORE: " + str(np.max(scores)))

# Print execution time
tdiff = trials.trials[-1]['book_time'] - trials.trials[0]['book_time']
print("ELAPSED TIME: " + str(tdiff.total_seconds() / 60))    

# Set params
est.set_params(**best_params)

# Fit
est.fit(X_train, y_train)

# Predict and score
y_pred = est.predict(X_test)
score = r2_score(y_test, y_pred)
print("R2 SCORE ON TEST DATA: {}".format(score))

#==============================================================================
# Tree structure of hyperparameter space (Optional)
#============================================================================== 
# You must change the evaluate function in order to extract learning rate 
# and n_estimators from choices. Please add the following code to the start of 
# evaluate function
#    # Choices
#    if 'choices' in params.keys():
#        params['learning_rate'] = params['choices']['learning_rate']
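This snippet assumes a hyperopt search has already run; a hypothetical setup for the objects it references (hyper_space, trials, best_vals, est; X_train/y_train assumed in scope) might look like:

from hyperopt import fmin, tpe, hp, Trials
from lightgbm import LGBMRegressor
from sklearn.model_selection import cross_val_score

hyper_space = {
    'num_leaves': hp.choice('num_leaves', [31, 63, 127]),
    'learning_rate': hp.uniform('learning_rate', 0.01, 0.2),
}

def evaluate(params):
    model = LGBMRegressor(**params)
    # hyperopt minimizes the returned loss, so negate the CV R2
    return -cross_val_score(model, X_train, y_train, scoring='r2', cv=3).mean()

est = LGBMRegressor()
trials = Trials()
best_vals = fmin(evaluate, hyper_space, algo=tpe.suggest, max_evals=50, trials=trials)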
Example no. 29
class b_model:
    # class-level parameters can be defined here
    params = {
        'learning_rate': 0.015,
        'boosting_type': 'gbdt',
        'objective': 'regression',
        'metric': 'mse',
        'num_leaves': 12,
        'max_depth': 9,
        'max_bin': 130,
        'feature_fraction': 0.9,
        'reg_lambda': 50,
        'min_data': 25,
        'min_child_weight': 0.001,
        'verbose': -1,
    }
    no_use = [
        "血糖", "blood_sugar", "id", "blood_sugar_log", '体检日期',
        'feature_5_less_25', 'feature_4_less_60', '性别'
    ]

    def __init__(self):
        # parameters needed when constructing the class
        self.model = LGBMRegressor(learning_rate=0.015,
                                   objective="regression",
                                   metric='mse',
                                   num_leaves=12,
                                   max_depth=9,
                                   max_bin=130,
                                   feature_fraction=0.9,
                                   reg_lambda=50,
                                   min_data=25,
                                   min_child_weight=0.001,
                                   num_boost_round=3000,
                                   random_state=42)

    def __make_feature(self, train, test):
        # build features
        if train.empty:
            test['性别'] = test['性别'].map({'男': 1, '女': 0, '??': 1})
            return test
        if test.empty:
            train['性别'] = train['性别'].map({'男': 1, '女': 0, '??': 1})
            return train
        else:
            train_id = train.id.values.copy()
            test_id = test.id.values.copy()
            data = pd.concat([train, test])
            data['性别'] = data['性别'].map({'男': 1, '女': 0, '??': 1})
            train_feat = data[data.id.isin(train_id)]
            test_feat = data[data.id.isin(test_id)]
            return train_feat, test_feat

    def fit(self, X, y=None):
        X.drop(X[X["年龄"] >= 84].index, inplace=True)
        fea_train = pd.read_csv("./feature/fea_train.csv")
        fea_train1 = pd.read_csv("./feature/fea_train_1.csv")
        fea_train2 = pd.read_csv("./feature/fea_train_2.csv")
        X = pd.merge(X, fea_train, how="left", on="id")
        X = pd.merge(X, fea_train1, how="left", on="id")
        X = pd.merge(X, fea_train2, how="left", on="id")
        X = self.__make_feature(train=X, test=pd.DataFrame())
        if y is None:  # '== None' misbehaves on arrays; an identity check is intended
            y = X["血糖"].values
        predictors = [f for f in list(X.columns) if f not in self.no_use]
        X_train, X_test, y_train, y_test = train_test_split(X[predictors],
                                                            y,
                                                            test_size=0.1,
                                                            random_state=42)
        self.model.fit(X_train[predictors],
                       y_train,
                       eval_metric="mse",
                       early_stopping_rounds=100,
                       verbose=100,
                       eval_set=[(X_test[predictors], y_test)])  # eval_set is documented as a list of (X, y) tuples
        from sklearn.metrics import mean_squared_error
        print("offline error: {}".format(0.5 * mean_squared_error(
            y_test, self.model.predict(X_test[predictors]))))
        return self

    def predict(self, X):
        # predict on the test set with the trained model
        fea_test = pd.read_csv("./feature/fea_test.csv")
        fea_test1 = pd.read_csv("./feature/fea_test_1.csv")
        fea_test2 = pd.read_csv("./feature/fea_test_2.csv")
        X = pd.merge(X, fea_test, how="left", on="id")
        X = pd.merge(X, fea_test1, how="left", on="id")
        X = pd.merge(X, fea_test2, how="left", on="id")
        X = self.__make_feature(test=X, train=pd.DataFrame())
        predictors = [f for f in list(X.columns) if f not in self.no_use]
        test_pred = self.model.predict(X[predictors])
        print("最大值:{}".format(test_pred.max()))
        return test_pred

    def get_params(self):
        return self.params
Example no. 30
    X_val, Y_val = train.iloc[val_idx][feats], train.iloc[val_idx].label

    clf = LGBMRegressor(
        n_estimators=100000,
        learning_rate=0.1,
        num_leaves=255,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=2020,
        metric='RMSE',
        n_jobs=24,
    )
    clf.fit(
        X_trn,
        Y_trn,
        eval_set=[(X_val, Y_val)],
        early_stopping_rounds=200,
        verbose=1000,
    )
    oof[val_idx] = clf.predict(X_val)
    sub += clf.predict(X_test) / skf.n_splits

sub = pd.DataFrame({
    'queryid': test.query_id,
    'documentid': test.doc_id,
    'predict_label': sub,
})

oof = pd.DataFrame({
    'query_id': train.query_id,
    'doc_id': train.doc_id,