Example no. 1
    def LGBR_optimization(train, bindingEnergy_train, test, bindingEnergy_test, cpus):
        '''
        Parameter optimization for the Light Gradient Boosting Regressor.
        '''

        lr_list = [0.01, 0.05, 0.075, 0.1, 0.25, 0.5, 0.75, 1]
        R2_train = []
        R2_test = []
        best_test_R2 = None
        best_train_R2 = None
        best_lr = None

        train_light = train.rename(columns=lambda x: re.sub('[^A-Za-z0-9_]+', '', x))
        test_light = test.rename(columns=lambda x: re.sub('[^A-Za-z0-9_]+', '', x))

        for learning_rate in lr_list:
            lgb_reg = LGBMRegressor(
                n_estimators=10000, learning_rate=learning_rate, max_depth=15, random_state=0, n_jobs=cpus)
            lgb_reg.fit(train_light, bindingEnergy_train)

            R2_train.append(lgb_reg.score(train_light, bindingEnergy_train))
            R2_test.append(lgb_reg.score(test_light, bindingEnergy_test))

        best_test_R2 = max(R2_test)
        best_train_R2 = R2_train[R2_test.index(best_test_R2)]
        best_lr = lr_list[R2_test.index(best_test_R2)]

        return best_lr, best_train_R2, best_test_R2
Example no. 2
from lightgbm import LGBMRegressor  # imports assumed; not shown in the original snippet
import joblib


def model_lightgbm_regressor(X_train, X_test, y_train, y_test, count=0):
    # `count` was an undefined global in the original snippet; it is exposed here
    # as a parameter so the model name can be built.
    model_name = f'model_{count}_lightgbm_regressor'

    model = LGBMRegressor()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # score() on a regressor returns R², not classification accuracy
    score = model.score(X_test, y_test)

    print(f'{model_name} R²: {score}')
    joblib.dump(model, f'model/{model_name}.joblib')
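# A hedged usage sketch with synthetic data; make_regression, the split and the model/
# output directory are assumptions, not part of the original snippet.
import os
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split

os.makedirs('model', exist_ok=True)  # the helper above dumps into model/
X, y = make_regression(n_samples=500, n_features=10, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
model_lightgbm_regressor(X_train, X_test, y_train, y_test, count=1)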
Example no. 3
def lgbm_regressor(x_trn: pd.DataFrame, y_trn: np.ndarray, x_val: pd.DataFrame,
                   y_val: np.ndarray) -> tuple:
    x_trn, x_val = x_trn.copy(), x_val.copy()
    y_trn, y_val = y_trn.copy(), y_val.copy()

    # Note: num_iterations is a LightGBM alias of n_estimators, so passing both is
    # conflicting; LightGBM warns and keeps only one of the two values.
    model = LGBMRegressor(boosting_type='gbdt',
                          objective='regression',
                          metric='mse',
                          n_estimators=400,
                          learning_rate=0.05,
                          min_child_samples=3,
                          num_iterations=700,
                          n_jobs=-1,
                          random_state=7)
    _ = model.fit(x_trn, y_trn)

    training_score = model.score(x_trn, y_trn)
    validation_score = model.score(x_val, y_val)

    return model, training_score, validation_score
Example no. 4
    def LGBR(train, bindingEnergy_train, test, bindingEnergy_test, best_lr, cpus):
        '''
        LightGradientBoostingRegressor algorithm.
        '''

        train_light = train.rename(columns=lambda x: re.sub('[^A-Za-z0-9_]+', '', x))
        test_light = test.rename(columns=lambda x: re.sub('[^A-Za-z0-9_]+', '', x))

        lgb_reg = LGBMRegressor(n_estimators=10000, learning_rate=best_lr,
                                max_depth=15, random_state=0, n_jobs=cpus)

        lgb_reg.fit(train_light, bindingEnergy_train)

        predictions = lgb_reg.predict(test_light)
        R2_test = lgb_reg.score(test_light, bindingEnergy_test)
        MSE = mean_squared_error(bindingEnergy_test, predictions)
        MAE = mean_absolute_error(bindingEnergy_test, predictions)

        return predictions, R2_test, MSE, MAE
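
    # A hedged sketch of how this pairs with LGBR_optimization from Example no. 1
    # (hypothetical call: it assumes both helpers are exposed as staticmethods of the
    # same class and that the DataFrames/Series used above are available):
    # best_lr, r2_train, r2_test = LGBR_optimization(train, bindingEnergy_train,
    #                                                test, bindingEnergy_test, cpus=4)
    # predictions, r2_test, mse, mae = LGBR(train, bindingEnergy_train,
    #                                       test, bindingEnergy_test, best_lr, cpus=4)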
Example no. 5
def treinaML(df):
    x = df[[
        'feriado', 'dia', 'mes', 'ano', 'diaAno', 'diaSemana', 'diaUtil',
        'segDia5', 'segDia10', 'diaDeProducao', 'seg', 'diaUtil5', 'diaUtil10',
        'inicioSemana', 'semanaAno', 'inicioMes'
    ]]
    y = df['qtd']

    SEED = 5
    np.random.seed(SEED)
    x_treino, x_teste, y_treino, y_teste = train_test_split(x,
                                                            y,
                                                            test_size=0.30,
                                                            random_state=SEED)
    print("Treinaremos com %d elementos e testaremos com %d elementos" %
          (len(x_treino), len(x_teste)))

    lgb_model = LGBMRegressor()
    lgb_model.fit(x_treino, y_treino)
    print('R² = {}'.format(lgb_model.score(x_treino, y_treino).round(3)))
    y_previsto = lgb_model.predict(x_teste)
    print('R² = %s' % metrics.r2_score(y_teste, y_previsto).round(3))

    return lgb_model
Example no. 6
le = preprocessing.LabelEncoder()
X[cat_cols] = X[cat_cols].apply(lambda col: le.fit_transform(col.astype(str)))

cat_cols = df_test_external.dtypes == object
cat_cols = df_test_external.columns[cat_cols].tolist()

# Caution: fit_transform is called again on the external test set, so the integer
# codes are not guaranteed to match the ones learned on the training data.
le = preprocessing.LabelEncoder()
df_test_external[cat_cols] = df_test_external[cat_cols].apply(lambda col: le.fit_transform(col.astype(str)))

gbm = LGBMRegressor(objective='regression', learning_rate=0.05, n_estimators=300)
# train
gbm.fit(X, Y)

print('R² of gbm regression on training set: {:.2f}'
      .format(gbm.score(X, Y)))
Y_pred_gbm=gbm.predict(df_test_external)

gbmResult = {'Id':Test_T_ID_external, 'SalePrice':Y_pred_gbm}
df_gbmResult = pd.DataFrame(gbmResult)
df_gbmResult.head()  
df_gbmResult.shape

df_gbmResult.to_csv('submissionLgbm_external.csv',index=False)

"""Describe the dataset and whether this data helps with prediction.

The dataset is an extension of the Ames Housing dataset which was compiled by Dean De C**k. Refernce: http://jse.amstat.org/v19n3/decock.pdf

Source: http://www.amstat.org/publications/jse/v19n3/decock/AmesHousing.xls
Example no. 7
# Model evaluation
print('The rmse of prediction is:', mean_squared_error(y_test, y_pred)**0.5)

# Feature importances
print('Feature importances:', list(gbm.feature_importances_))

# Grid search for hyperparameter tuning
estimator = LGBMRegressor(num_leaves=31)
param_grid = {'learning_rate': [0.01, 0.1, 1], 'n_estimators': [20, 40]}
gbm = GridSearchCV(estimator, param_grid)
gbm.fit(X_train, y_train)
print('Best parameters found by grid search are:', gbm.best_params_)

# regressor = LGBMRegressor()
gbm_score = gbm.score(X_test, y_test)
print('R² score:', gbm_score)

# Plot labelled predictions against the test data
plt.figure()
plt.plot(range(len(y_pred)),
         y_pred,
         'red',
         linewidth=2.5,
         label="predict data")
plt.plot(range(len(y_test)), y_test, 'green', label="test data")
plt.show()

plt.figure()
y = y_pred - y_test
plt.plot(y)
Example no. 8
model = gridsearch.best_estimator_
score = gridsearch.best_score_
rmse_scores = -score  # best_score_ is negative under sklearn's neg_* scorers, so negate it
#for item in grid.grid_scores_:
    #print ("\t%s %s %s" % ('\tGRIDSCORES\t',  "R" , item))
#print ('%s\tHP\t%s\t%f' % ("R" , str(best_params) ,abs(score)))
print(best_params)
print(rmse_scores)
print(model)

from sklearn.metrics import mean_squared_error


lgbm = LGBMRegressor(random_state=0, learning_rate=0.1, max_depth=4,
                     n_estimators=100, num_leaves=30, min_data_in_leaf=10,
                     max_bin=100, lambda_l1=0.001, lambda_l2=0.001,
                     feature_fraction=0.8, bagging_fraction=0.6)
lgbm.fit(x_pp_train, y_pp_train)
print("R² on training set: {:.3f}".format(lgbm.score(x_pp_train, y_pp_train)))
print("R² on test set: {:.3f}".format(lgbm.score(x_pp_test, y_pp_test)))
y_pred = lgbm.predict(x_pp_test)
y_pred_train = lgbm.predict(x_pp_train)
print("RMSE on train :{:.3f}".format(mean_squared_error(y_pp_train, y_pred_train, squared=False)))
print("RMSE on test :{:.3f}".format(mean_squared_error(y_pp_test, y_pred, squared=False)))

#Seeing the Feature Importance by Mean Decrease in Impurity (MDI)
features = x_pp_train.columns
importances = lgbm.feature_importances_
indices = np.argsort(importances)
plt.figure(figsize=(20,100))
plt.title('Feature Importances')
plt.barh(range(len(indices)), importances[indices], color='b', align='center')
plt.yticks(range(len(indices)), [features[i] for i in indices])
plt.xlabel('Relative Importance')
Example no. 9
pca = PCA(n_components=1, whiten=True, random_state=60).fit(y_train)
y_train_pca = pca.transform(y_train)
y_test_pca = pca.transform(y_test)

y_train_pca = y_train_pca.reshape(-1)  # (8000,)
y_test_pca = y_test_pca.reshape(-1)    # (2000,)

model = LGBMRegressor(n_estimators=1000,
                      learning_rate=0.05,
                      max_depth=5,
                      colsample_bytree=0.7,
                      colsample_bylevel=0.7)  # note: colsample_bylevel is an XGBoost parameter, not a LightGBM one

model.fit(x_train, y_train_pca)

score = model.score(x_test, y_test_pca)

print("R2:", score)

# thresholds = np.sort(model.feature_importances_)  # sort feature_importances in ascending order
# print(thresholds)

# models=[]
# res = np.array([])
# for thresh in thresholds:
#     selection = SelectFromModel(model, threshold=thresh, prefit=True)
#     select_x_train = selection.transform(x_train)
#     select_x_test = selection.transform(x_test)

#     model2 = LGBMRegressor(n_estimators=500, learning_rate=0.1, n_jobs=-1)
#     model2.fit(select_x_train, y_train_pca, verbose=False, eval_metric=['logloss','rmse'],
Example no. 10
lgbm = LGBMRegressor(n_estimators=100, learning_rate=0.1, n_jobs=-1)

# Note: the verbose and early_stopping_rounds keywords were removed from fit() in
# LightGBM 4.0; this call targets older releases (a callback-based sketch follows below).
lgbm.fit(x_train,
         y_train,
         verbose=True,
         eval_metric=["logloss", "rmse"],
         eval_set=[(x_train, y_train), (x_test, y_test)],
         early_stopping_rounds=20)
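
# A hedged sketch of the equivalent training call for LightGBM >= 4.0, where logging
# and early stopping are configured through callbacks; the extra import and the
# lgbm_v4 name are assumptions, not part of the original snippet.
import lightgbm as lgb

lgbm_v4 = LGBMRegressor(n_estimators=100, learning_rate=0.1, n_jobs=-1)
lgbm_v4.fit(x_train,
            y_train,
            eval_metric="rmse",
            eval_set=[(x_train, y_train), (x_test, y_test)],
            callbacks=[lgb.early_stopping(stopping_rounds=20),
                       lgb.log_evaluation(period=1)])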

#rmse,mae,logloss,error,auc

y_pre = lgbm.predict(x_test)

r2 = r2_score(y_test, y_pre)
score = lgbm.score(x_test, y_test)
print(__file__)
print("r2")
print(r2)
print("score")
print(score)

#6)selectFromModel

thresholds = np.sort(lgbm.feature_importances_)

idx_max = -1
max = r2

for idx, thresh in enumerate(thresholds):
    # data preprocessing
Example no. 11
### Data ###
x, y = load_boston(return_X_y=True)
print(x.shape)  # (506, 13)
print(y.shape)  # (506, )
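
# Note: load_boston was removed in scikit-learn 1.2. On newer releases a similar demo
# could use the California housing data instead (an assumption, not the original data,
# so the R2 values quoted below would differ):
# from sklearn.datasets import fetch_california_housing
# x, y = fetch_california_housing(return_X_y=True)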

x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    train_size=0.8,
                                                    shuffle=True,
                                                    random_state=66)

### Baseline model ###
model = LGBMRegressor(n_estimators=300, learning_rate=0.1, n_jobs=-1)
model.fit(x_train, y_train)
score = model.score(x_test, y_test)
print('R2 :', score)

#== Default R2 : 0.9313126937746082 ==#

### feature engineering ###
thresholds = np.sort(model.feature_importances_)
print(thresholds)

models = []
res = np.array([])

for thresh in thresholds:
    selection = SelectFromModel(model, threshold=thresh, prefit=True)

    select_x_train = selection.transform(x_train)
Example no. 12
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import r2_score
import matplotlib.pyplot as plt
import pickle
dataset = load_boston()  # removed in scikit-learn 1.2; see the note in Example no. 11
x = dataset.data
y = dataset.target

x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    test_size=0.2,
                                                    shuffle=True,
                                                    random_state=66)
model = LGBMRegressor()
model.fit(x_train, y_train)
score = model.score(x_test, y_test)
# print(score)

thresholds = np.sort(model.feature_importances_)

# print(thresholds)
models = []  # empty list to collect the models
res = np.array([])  # empty array to collect the scores
for thres in thresholds:
    # Drop the least important columns one at a time and retrain
    # (threshold='median' is another common choice)
    selection = SelectFromModel(model, threshold=thres, prefit=True)
    selection_x_train = selection.transform(x_train)
    model2 = LGBMRegressor(n_estimators=1000)
    selection_x_test = selection.transform(x_test)
    model2.fit(selection_x_train,
Example no. 13
    'n_estimators': range(100, 300, 50),
    'eta': [0.1, 0.2],
    'max_depth': range(3, 10, 1),
    'subsample': [0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
}
rand_search_xgb = RandomizedSearchCV(estimator=xgbreg,
                                     param_distributions=rand_param_xgb,
                                     verbose=1,
                                     n_jobs=-1,
                                     n_iter=200,
                                     cv=8)
rand_search_xgb.fit(X_train, y_train)
best_param = rand_search_xgb.best_params_
best_param

xgbreg = XGBRegressor(subsample=0.5, n_estimators=150, max_depth=4, eta=0.1)
xgbreg.fit(X_train, y_train)
xgbreg.score(X_test, y_test)

# LightGBM Regressor
lgbm = LGBMRegressor()
lgbm.fit(X_train, y_train)
lgbm.score(X_test, y_test)
lgbm.score(X_train, y_train)

X_train.columns
#Saving models
joblib.dump(ranreg, 'RFReg_model.ml')
joblib.dump(xgbreg, 'XGBReg_model.ml')
joblib.dump(lgbm, 'LGBMReg_model.ml')
Example no. 14
#score_rfr = rfr.score(X_test, y_test)

gbr.fit(X_train, y_train)
preds = gbr.predict(X_test)
preds_test_gbr = gbr.predict(X_test_sub)
mae_gbr = mean_absolute_error(y_test, preds)
rmse_gbr = np.sqrt(mean_squared_error(y_test, preds))
score_gbr = gbr.score(X_test, y_test)
cv_gbr = mean_cross_val(gbr, X_train1, y)

lgbm.fit(X_train, y_train)
preds = lgbm.predict(X_test)
preds_test_lgbm = lgbm.predict(X_test_sub)
mae_lgbm = mean_absolute_error(y_test, preds)
rmse_lgbm = np.sqrt(mean_squared_error(y_test, preds))
score_lgbm = lgbm.score(X_test, y_test)
cv_lgbm = mean_cross_val(lgbm, X_train1, y)
"""
xgb.fit(X_train, y_train)   
preds = xgb.predict(X_test) 
preds_test_xgb = xgb.predict(X_test_sub)
mae_xgb = mean_absolute_error(y_test, preds)
rmse_xgb = np.sqrt(mean_squared_error(y_test, preds))
score_xgb = xgb.score(X_test, y_test)
cv_xgb = mean_cross_val(xgb, X_train1, y)
"""

cb.fit(X_train, y_train)
preds = cb.predict(X_test)
preds_test_cb = cb.predict(X_test_sub)
mae_cb = mean_absolute_error(y_test, preds)
Example no. 15
class LGBM:

    def __init__(self, params):
        self.name = "lgbm"
        
        learning_rate = params['learning_rate']
        n_estimators = params['n_estimators']
        min_data_in_leaf = params['min_data_in_leaf']  # was params['num_leaves'], flagged with "# Fix this"
        # key params
        num_leaves = params['num_leaves']
        min_gain_to_split = params['min_gain_to_split']
        max_depth = params['max_depth']

        # speed vs accuracy tradeoffs
        bagging_freq = params['bagging_freq']
        bagging_frac = params['bagging_fraction']
        feature_frac = params['feature_fraction']

        # Regularisation
        reg_alpha = params['reg_alpha']
        reg_lambda = params['reg_lambda']
        n_jobs = params['n_jobs']
        boosting_type = params['boosting_type']  # e.g. 'gbdt', 'dart', 'goss', 'rf'

        self.model = LGBMRegressor(learning_rate=learning_rate,
                                    n_estimators=n_estimators, 
                                    num_leaves=num_leaves,
                                    min_data_in_leaf=min_data_in_leaf,
                                    max_depth=max_depth,
                                    min_split_gain=min_gain_to_split,
                                    bagging_fraction=bagging_frac,
                                    bagging_freq=bagging_freq,
                                    feature_fraction=feature_frac,
                                    reg_alpha=reg_alpha,
                                    reg_lambda=reg_lambda,
                                    n_jobs=n_jobs,
                                    boosting_type=boosting_type
                                    )
        self.target_col = None

    def _split_data_maps(self, data_map, split_fraction):
        train = {}
        test = {}
        
        order = sorted(list(k for k in data_map.keys()))

        length = len(data_map[order[0]])
        splitpoint = int(length * (1 - split_fraction))  # keep the last split_fraction of rows for the second (validation) map
        for k in order:
            train[k] = data_map[k].iloc[:splitpoint]
            test[k] = data_map[k].iloc[splitpoint:]

        return train, test

    def _format_data(self, data_map):  
        if self.target_col is None:
            raise ValueError("Target col is None!")
        
        order = sorted(list(k for k in data_map.keys() if k != self.target_col))

        inputs = []
        num_stocks = data_map[order[0]].shape[-1]
        for i in range(num_stocks):
            stock_data = []
            for k in order:
                arr = data_map[k].iloc[:, i]
                stock_data.append(arr.values.reshape(-1, 1))
            inputs.append(np.concatenate(stock_data, axis=1))

        inputs = np.concatenate(inputs, axis=0)

        return inputs
    
    def _format_target(self, data_map):
        if self.target_col is None:
            raise ValueError("Target col is None!")

        target = data_map[self.target_col].values

        # Transpose before flattening so the targets follow the same stock-major
        # ordering that _format_data uses when stacking the inputs.
        return target.T.reshape(-1, 1)  # stacked targets

    def fit(self, data_map, target_col, valid_fraction=0.2, rs_iterations=-1):
        print("Formatting data")
        self.target_col = target_col

        print("Splitting data map")
        train_map, valid_map = self._split_data_maps(data_map, valid_fraction)

        y = self._format_target(train_map)
        X = self._format_data(train_map)

        print("Fitting", self.name)
        
        if rs_iterations > 0:
            
            param_dist = self.get_hyperparam_ranges()
            param_combinations = np.prod([len(param_dist[k]) for k in param_dist])
            rs_iterations = min(param_combinations, rs_iterations)
            
            print("Running {} iterations of random search".format(
                    rs_iterations))

            self.model = select.RandomizedSearchCV(self.model,
                                     param_distributions=param_dist,
                                     n_iter=rs_iterations,
                                     cv=3,
                                     n_jobs=2)
        self.model.fit(X, y)

        if valid_fraction != 0:
            y_valid = self._format_target(valid_map)
            X_valid = self._format_data(valid_map)

            print("Scoring on validation data")
            r2 = self.model.score(X_valid, y_valid)

            print("R2 for {}:".format(self.name.upper()), r2)
            return r2
        else:
            print("No validation data")
            return 0.0

    def predict(self, data_map):
        X = self._format_data(data_map)
        return self.model.predict(X)

    def get_save_name(self, model_folder):
        return os.path.join(model_folder, self.name+".joblib")

    def save(self, model_folder):
        name = self.get_save_name(model_folder)
        joblib.dump(self.model, name)

    def load(self, model_folder):
        name = self.get_save_name(model_folder)
        self.model = joblib.load(name)
    
    @classmethod
    def get_hyperparam_ranges(cls):
        param_grid = {  'max_depth': [-1],
                        'min_data_in_leaf': [20, 40, 80],
                        'num_leaves': [8, 16, 32, 64, 128],
                        'learning_rate': [1.0, 0.1, 0.05, 0.01],
                        'n_estimators': [50, 100, 200],
                        'feature_fraction': [0.2, 0.4, 0.6, 0.8],
                        'bagging_freq': [0],  # disables
                        'bagging_fraction': [1.0], # disables
                        'reg_alpha': [0.0, 1.0, 0.1,0.01],
                        'reg_lambda': [0.0,1.0, 0.1,0.01],
                        'min_gain_to_split':[0.001],
                        'n_jobs':[2],
                        'boosting_type':['gbdt', 'dart']
                    }

        return param_grid
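
# A hedged usage sketch for the LGBM wrapper above. The data_map layout (one DataFrame
# per feature, one column per stock) follows _format_data, and every name here
# ("close", "volume", "target", "stock_a") is made up for illustration only.
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
n_rows = 200
close = pd.DataFrame({'stock_a': rng.normal(size=n_rows)})
volume = pd.DataFrame({'stock_a': rng.normal(size=n_rows)})
target = pd.DataFrame({'stock_a': 2.0 * close['stock_a'] + 0.1 * rng.normal(size=n_rows)})
data_map = {'close': close, 'volume': volume, 'target': target}

# use the first value of every hyperparameter range as a default configuration
params = {k: v[0] for k, v in LGBM.get_hyperparam_ranges().items()}
wrapper = LGBM(params)
valid_r2 = wrapper.fit(data_map, target_col='target', valid_fraction=0.2)
predictions = wrapper.predict(data_map)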
Example no. 16
    'subsample': [0.7]
}]

settings = {
    'verbose': False,
    'eval_set': [(x_train, y_train), (x_test, y_test)]
}

kfold = KFold(n_splits=5, shuffle=True, random_state=66)
# Train the model four times, once per target column
for i in range(4):
    model = LGBMRegressor()
    settings['eval_set'] = [(x_train, y_train[:, i]), (x_test, y_test[:, i])]
    model.fit(x_train, y_train[:, i], **settings)
    y_test_pred = model.predict(x_test)
    score = model.score(x_test, y_test[:, i])
    mae = MAE(y_test[:, i], y_test_pred)
    print("r2 : ", score)
    print("mae :", mae)
    thresholds = np.sort(model.feature_importances_)[[
        i for i in range(0, len(model.feature_importances_), 20)
    ]]
    print("model.feature_importances_ : ", model.feature_importances_)
    print(thresholds)
    best_mae = mae
    best_model = model
    best_y_pred = model.predict(x_pred)
    best_y_test_pred = y_test_pred
    print(best_y_pred.shape)
    for thresh in thresholds:
        if (thresh == 0): continue
Example no. 17
    def train(self, model='lr'):
        """
        lr score 0.80048125

        :return:
        """
        train_x, train_y, test = self.load_train_x_train_y_test_x()
        train_xx, test_xx, train_yy, test_yy = train_test_split(train_x, train_y, train_size=0.8)
        if model == 'lr':
            lr = LogisticRegression(penalty='l2', solver='liblinear', C=1, verbose=1)
            lr.fit(train_xx, train_yy.reshape(-1, ))
            score = lr.score(test_xx, test_yy)
            joblib.dump(lr, os.path.join(daikuan_path, 'lr_model_time_{}_score_{}'.format(int(time.time()), score)))
            print('lr score', score)

            # lr = joblib.load(os.path.join(daikuan_path, 'lr_model_time_1599821345_score_0.6554'))
            #
            # r = lr.predict(test)
            # with open(os.path.join(daikuan_path, 'samples.csv'), mode='w') as f:
            #     f.write('id,isDefault\n')
            #     for idx, y in enumerate(r):
            #         print('{},{}'.format(idx + 800000, y))
            #         f.write('{},{}\n'.format(idx + 800000, y))

            # r = lr.predict(test_xx)
            # for x, y in zip(r, test_yy):
            #     print(x, y)
            # print(lr.score(test_xx, test_yy))

        elif model == 'svm':
            # linear
            svc = SVC(C=1, kernel='rbf', verbose=True, max_iter=100)
            svc.fit(train_xx, train_yy.reshape(-1, ))
            score = svc.score(test_xx, test_yy)
            joblib.dump(svc, os.path.join(daikuan_path, 'svc_model_time_{}_score_{}'.format(int(time.time()), score)))
            print('svm score', score)

            # svc = joblib.load(os.path.join(daikuan_path, 'svc_model_time_1599796746_score_0.5245'))
            # r = svc.predict(test_xx)
        elif model == 'ada':
            ada = AdaBoostClassifier(DecisionTreeClassifier(max_depth=4), n_estimators=400, random_state=7)
            ada.fit(train_xx, train_yy.reshape(-1, ))
            score = ada.score(test_xx, test_yy)
            joblib.dump(ada, os.path.join(daikuan_path, 'ada_model_time_{}_score_{}'.format(int(time.time()), score)))
            print('ada score', score)

        elif model == 'rf':
            pass
        elif model == 'gbm':
            lgbm = LGBMRegressor(num_leaves=30
                                 , max_depth=5
                                 , learning_rate=.02
                                 , n_estimators=1000
                                 , subsample_for_bin=5000
                                 , min_child_samples=200
                                 , colsample_bytree=.2
                                 , reg_alpha=.1
                                 , reg_lambda=.1)
            lgbm.fit(train_xx, train_yy)
            score = lgbm.score(test_xx, test_yy)
            print('lgbm score', score)
        elif model == 'gbdt':
            # Hyperparameter tuning reference: https://blog.csdn.net/weixin_40924580/article/details/85043801
            # param_test1 = {'n_estimators': range(128, 256, 32)}
            param_test2 = {'max_depth': range(3, 14, 2), 'min_samples_split': range(100, 801, 200)}
            # Note: the iid argument was removed in scikit-learn 0.24.
            g_search = GridSearchCV(
                estimator=GradientBoostingClassifier(learning_rate=0.1, min_samples_split=300,
                                                     min_samples_leaf=20, max_features='sqrt',
                                                     subsample=0.8, random_state=10, verbose=1,
                                                     n_estimators=192),
                param_grid=param_test2, scoring='roc_auc', iid=False, cv=5, verbose=1)
            # g_search.fit(train_xx, train_yy.reshape(-1, ))
            # print(g_search.best_params_)
            # print(g_search.best_score_)

            gbdt = GradientBoostingClassifier(n_estimators=192, learning_rate=0.1, min_samples_split=300,
                                              min_samples_leaf=20, verbose=1)
            gbdt.fit(train_xx, train_yy.reshape(-1, ))

            # gbdt = joblib.load(os.path.join(daikuan_path, 'gbdt_model_time_1599825531_score_0.6515666666666666'))

            y_pred = gbdt.predict(test_xx)
            y_predprob = gbdt.predict_proba(test_xx)[:, 1]
            print('accuracy', metrics.accuracy_score(test_yy, y_pred))
            print('AUC', metrics.roc_auc_score(test_yy, y_predprob))

            score = gbdt.score(test_xx, test_yy)
            joblib.dump(gbdt, os.path.join(daikuan_path, 'gbdt_model_time_{}_score_{}'.format(int(time.time()), score)))
            print('gbdt score', score)
Example no. 18
# score1 : 72.2789
# mae1 : 1.1567
# score2 : 22.8805
# mae2 : 0.6918
# score3 : 26.0999
# mae3 : 2.0683
# score4 : 17.5051
# mae4 : 1.3463

model.fit(x_train,
          y_train1,
          verbose=False,
          eval_metric=['logloss'],
          eval_set=[(x_test, y_test1)],
          early_stopping_rounds=20)
score1 = model.score(x_test, y_test1)
print("score1 : %.4f" % (score1 * 100.0))
# print(model.feature_importances_)
y_pred_1 = model.predict(x_test)
mae1 = mean_absolute_error(y_test1, y_pred_1)
print('mae1 : %.4f' % (mae1))
y_pred1 = model.predict(x_pred)

model.fit(x_train,
          y_train2,
          verbose=False,
          eval_metric=['logloss'],
          eval_set=[(x_test, y_test2)],
          early_stopping_rounds=20)
score2 = model.score(x_test, y_test2)
print("score2 : %.4f" % (score2 * 100.0))
Example no. 19
    'num_leaves': [32, 48, 64, 80],
    'learning_rate': [0.01, 0.05, 1]
}
#%%
lgbm_reg = LGBMRegressor()
grid_cv = GridSearchCV(lgbm_reg,
                       param_grid=params,
                       cv=5,
                       n_jobs=-1,
                       scoring='neg_mean_squared_error')
grid_cv.fit(X_train, y_train)
# MSE (mean squared error) is used as the evaluation metric; lower values are better
print('Best hyperparameters:', grid_cv.best_params_)
print('Best CV score (MSE):', -1 * grid_cv.best_score_)
#%%
# Refit the model with the best parameter values
# R² is used as the evaluation metric; values closer to 1 are better
# (np.expm1 is meant to invert the log-transformed target for a proper evaluation)
from lightgbm import LGBMRegressor
lgbm_reg1 = LGBMRegressor(n_estimators=1000,
                          learning_rate=0.01,
                          max_depth=18,
                          num_leaves=48)
lgbm_reg1.fit(X_train, y_train)
lgbm_reg1.score(X_test, y_test)

#%%
import pickle
lgbmFile = open('lgbm_reg1.pckl', 'wb')
pickle.dump(lgbm_reg1, lgbmFile)
lgbmFile.close()
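
#%%
# A hedged sketch of reloading the pickled model (mirrors the dump above; lgbm_loaded
# is a made-up name):
with open('lgbm_reg1.pckl', 'rb') as f:
    lgbm_loaded = pickle.load(f)
print(lgbm_loaded.score(X_test, y_test))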
Example no. 20
y_pred = best_model.predict(best_x_test)
r2 = r2_score(y_test, y_pred)
print('r2 :', r2)

end1 = time.time()

import joblib
joblib.dump(best_model, './model/xgb_Save/sfm1-' + str(best_score) + '.dat')
model2 = joblib.load('./model/xgb_Save/sfm1-' + str(best_score) + '.dat')

#### LGBM feature selection (SelectFromModel)

start2 = time.time()
model_LGBM = LGBMRegressor()
model_LGBM.fit(x_train, y_train)
score = model_LGBM.score(x_test, y_test)
print("r2 : ", score)

thresholds = np.sort(model_LGBM.feature_importances_)

print(thresholds)
print(x_train.shape)
print("========================")

best_x_train = x_train
best_x_test = x_test
best_score = score
best_model = model_LGBM

for thresh in thresholds:
    selection = SelectFromModel(model_LGBM, threshold=thresh, prefit=True)