# NOTE(review): this chunk is truncated at both ends. The first statement below is the
# tail of a constructor call that opens BEFORE this view — presumably an
# XGBRegressor(...) given the parameter names and the `xgb` variable; confirm upstream.
min_child_weight=6, n_estimators=1000, max_depth=7, colsample_bytree=0.6)
xgb.fit(X_train, y_train)
xgb_pred = xgb.predict(X_test)
# .score() on a regressor is R^2, not classification accuracy — the label below is misleading.
accuracy = xgb.score(X_test, y_test)
# Bare expressions (no print/assignment): these are notebook-cell outputs, no effect as a script.
'Accuracy: ' + str(np.round(accuracy * 100, 2)) + '%'
mean_absolute_error(y_test, xgb_pred)
mean_squared_error(y_test, xgb_pred)
# RMSE
np.sqrt(mean_squared_error(y_test, xgb_pred))
# Same evaluation repeated for a default LightGBM regressor.
lgb = LGBMRegressor(objective='regression')
lgb.fit(X_train, y_train)
lgb_pred = lgb.predict(X_test)
accuracy = lgb.score(X_test, y_test)
'Accuracy: ' + str(np.round(accuracy * 100, 2)) + '%'
mean_absolute_error(y_test, lgb_pred)
mean_squared_error(y_test, lgb_pred)
np.sqrt(mean_squared_error(y_test, lgb_pred))
# Residuals plot for the XGBoost model (yellowbrick renders on notebook display).
from yellowbrick.regressor import ResidualsPlot
visualizer = ResidualsPlot(xgb)
visualizer.fit(X_train, y_train)
visualizer.score(X_test, y_test)
visualizer
# NOTE(review): `rid` and `la` are not defined in this view — presumably Ridge/Lasso
# models fitted in earlier cells; verify before running as a script.
rid_pred_t = rid.predict(X_train)
la_pred_t = la.predict(X_train)
plt.scatter(la_pred_t, y_train, c="blue", marker="s", label="Training data")
# NOTE(review): the source is cut off mid-call here.
plt.scatter(la_pred,
def do(
    train_path='D:/testFiles/for_excute_folder/activity_blCodingFormatingESI_2017_5_train_input.csv',
    test_path='D:/testFiles/for_excute_folder/activity_blCodingFormatingESI_2017_5_test_input.csv',
    max_time_used=1000,
):
    """Train an LGBM regressor on the activity CSVs and print test-set metrics.

    Loads the train/test CSVs, drops the exported index column, filters
    TIME_USED outliers, converts the target to minutes, adds two engineered
    features, fits an LGBMRegressor, and prints R^2 / MAE / MSE plus a
    prediction-vs-real comparison and the feature importances.

    Parameters
    ----------
    train_path : str
        Path of the training CSV (defaults to the original hard-coded path).
    test_path : str
        Path of the test CSV (defaults to the original hard-coded path).
    max_time_used : int
        Rows with raw TIME_USED above this value are dropped as outliers.
    """
    train_data = pd.read_csv(train_path)
    test_data = pd.read_csv(test_path)

    # The CSV export carries its index as the first column — drop it.
    train_data = train_data.drop(train_data.columns[0], axis=1)
    test_data = test_data.drop(test_data.columns[0], axis=1)

    # Remove extreme TIME_USED outliers from both sets.
    train_data = train_data[train_data["TIME_USED"] <= max_time_used]
    test_data = test_data[test_data["TIME_USED"] <= max_time_used]

    # Identical feature engineering on both frames:
    # target seconds -> minutes, squared median, median x office-median interaction.
    for frame in (train_data, test_data):
        frame['TIME_USED'] = frame['TIME_USED'] / 60
        frame['TIME_USERD_MEDIAN_S2'] = frame['TIME_USERD_MEDIAN'] ** 2
        frame['TIME_USERD_MEDIAN_S3'] = (
            frame['TIME_USERD_MEDIAN'] * frame['bkgOffice_median_by_task_type']
        )

    print(test_data.head())
    print(train_data.describe())

    y_train = train_data['TIME_USED'].values.tolist()
    X_train = train_data.drop(['TIME_USED'], axis=1).values.tolist()
    y_test = test_data['TIME_USED'].values.tolist()
    X_test = test_data.drop(['TIME_USED'], axis=1).values.tolist()

    # random_state is the documented sklearn-API spelling of the seed
    # (the original passed the booster-level alias `seed`).
    regressor = LGBMRegressor(
        n_estimators=400, learning_rate=0.02, random_state=2017, colsample_bytree=1
    )
    regressor.fit(X_train, y_train)

    # Score on the held-out test set (R^2).
    print('测试集R方值:', regressor.score(X_test, y_test))

    # Side-by-side comparison of predictions and ground truth.
    y_predict = regressor.predict(X_test)
    comparison = DataFrame()
    comparison['predict'] = y_predict
    comparison['real'] = y_test
    comparison['diff'] = y_predict - y_test
    print(comparison.head(20))

    print('MAE = ', mean_absolute_error(y_test, y_predict))
    print('MSE = ', mean_squared_error(y_test, y_predict))
    print('R2 = ', r2_score(y_test, y_predict))

    print('feature_importances\n')
    print(regressor.feature_importances_)  # Only tree based model has this attribute