# Back-transform the log-scale predictions and fit a GradientBoostingRegressor for comparison.
XGB_new = XGB_preds.apply(lambda x: np.exp(x).astype('int64'))

GBR = GradientBoostingRegressor()
GBR.fit(X_train, Y_train)
GBR_preds = pd.DataFrame(GBR.predict(X_test), columns=['salePrice'], index=Y_test.index)
print(mean_absolute_error(Y_test, GBR_preds))
GBR_new = GBR_preds.apply(lambda x: np.exp(x).astype('int64'))
sns.swarmplot(x=GBR_preds['salePrice'], y=Y_test)

# 10-fold cross-validation; for regressors, cross_val_score reports R^2 by default.
from sklearn.model_selection import KFold, cross_val_score

kfold = KFold(n_splits=10, shuffle=True, random_state=7)
results = cross_val_score(XGB, X, y, cv=kfold)
print("R^2: %.2f%% (%.2f%%)" % (results.mean() * 100, results.std() * 100))
results2 = cross_val_score(RFR, X, y, cv=kfold)
print("R^2: %.2f%% (%.2f%%)" % (results2.mean() * 100, results2.std() * 100))
results3 = cross_val_score(GBR, X, y, cv=kfold)
print("R^2: %.2f%% (%.2f%%)" % (results3.mean() * 100, results3.std() * 100))

# Train/test R^2 for each fitted model.
print(XGB.score(X_train, Y_train))
print(RFR.score(X_train, Y_train))
print(GBR.score(X_train, Y_train))
print(XGB.score(X_test, Y_test))
print(RFR.score(X_test, Y_test))
print(GBR.score(X_test, Y_test))
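# A minimal sketch (assuming the XGB estimator and X, y above) of cross-validating
# with an explicit scorer: requesting neg_mean_absolute_error keeps the cross-validated
# metric consistent with the MAE printed for the hold-out set, instead of the default R^2.
mae_scores = -cross_val_score(XGB, X, y, cv=kfold, scoring='neg_mean_absolute_error')
print("MAE: %.4f (+/- %.4f)" % (mae_scores.mean(), mae_scores.std()))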
x_train, x_test, y_train, y_test = train_test_split(
    x, y, train_size=0.8, shuffle=True, random_state=77)

#2. Model
model = XGBRegressor(n_estimators=100, learning_rate=0.01, n_jobs=8)

#3. Train
model.fit(x_train, y_train, verbose=1,
          eval_metric=['rmse', 'logloss', 'mae'],
          eval_set=[(x_train, y_train), (x_test, y_test)])

aaa = model.score(x_test, y_test)
print('score : ', aaa)

y_pred = model.predict(x_test)
r2 = r2_score(y_test, y_pred)
print('r2 : ', r2)

# validation_0 = train set, validation_1 = test set
# print('==================================')
# results = model.evals_result()
# print(results)
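# A minimal sketch (assuming the fitted model above) of pulling the per-iteration
# metrics recorded for eval_set via evals_result(); the 'validation_0'/'validation_1'
# keys and metric names follow the xgboost sklearn-wrapper convention.
results = model.evals_result()
train_rmse = results['validation_0']['rmse']   # eval_set[0] = (x_train, y_train)
test_rmse = results['validation_1']['rmse']    # eval_set[1] = (x_test, y_test)

import matplotlib.pyplot as plt
plt.plot(train_rmse, label='train rmse')
plt.plot(test_rmse, label='test rmse')
plt.xlabel('boosting round')
plt.legend()
plt.show()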
# sklearn.cross_validation was removed; use sklearn.model_selection instead.
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor

train = pd.read_csv('../data/train_empty.csv')

features = ['store_nbr', 'item_nbr',
            # 'units',
            'station_nbr', 'tmax', 'tmin', 'tavg', 'depart', 'dewpoint', 'wetbulb',
            'heat', 'cool', 'snowfall', 'preciptotal', 'stnpressure', 'sealevel',
            'resultspeed', 'resultdir', 'avgspeed',
            'HZ', 'FU', 'UP', 'TSSN', 'VCTS', 'DZ', 'BR', 'FG', 'BCFG', 'DU', 'FZRA',
            'TS', 'RA', 'PL', 'GS', 'GR', 'FZDZ', 'VCFG', 'PRFG', 'FG+', 'TSRA',
            'FZFG', 'BLDU', 'MIFG', 'SQ', 'BLSN', 'SN', 'SG',
            # 'month',
            # 'day',
            'day_length']
            # 'sunset_hour',
            # 'sunset_minute',
            # 'sunrise_hour',
            # 'sunrise_minute'

# The sklearn wrapper expects plain arrays, not a DMatrix; keep X as a numpy array
# and let XGBRegressor handle the missing values itself.
X = train[features].values
y = train["units"].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

clf = XGBRegressor(missing=np.nan)
clf.fit(X_train, y_train)
print(clf.score(X_test, y_test))
# concat the two dataframes for better visualization
featureScores = pd.concat([dfcolumns, dfscores], axis=1)
featureScores.columns = ['Specs', 'Score']  # name the dataframe columns
'''print(featureScores.nlargest(10, 'Score'))  # print the 10 best features'''

# Using XGBoost as a model
no_survived = train.drop("Survived", axis=1)
survived = train["Survived"]
xgb = XGBRegressor()
xgb.fit(no_survived, survived, verbose=False)

# make the prediction
X_test = test.copy()
XGBPredict = np.round(xgb.predict(X_test), 0)
print("XGBoost")
# XGBRegressor.score reports R^2, not classification accuracy, hence the lower figure
print(round(xgb.score(no_survived, survived) * 100, 2))  # got a score of 60.82

# Random Forest 74.162%, 77.033% with titles
RF = RandomForestClassifier(n_estimators=100)
RF.fit(no_survived, survived)
RFPredict = RF.predict(X_test)
RFScore = round(RF.score(no_survived, survived) * 100, 2)
print("Random Forest")
print(RFScore)

# Decision Tree 75.598%, 75.119% with titles
decisionTree = DecisionTreeClassifier()
decisionTree.fit(no_survived, survived)
decisionTreePredict = decisionTree.predict(X_test)
decisionTree.score(no_survived, survived)
def rest(F, X, Y, X_train, y_train, X_test, y_test):
    best_ada_score = float('-inf')
    best_ada_rmse = float('inf')
    best_ada_score_f = -1
    best_ada_rmse_f = -1
    best_xg_score = float('-inf')
    best_xg_rmse = float('inf')
    best_xg_score_f = -1
    best_xg_rmse_f = -1
    best_svr_score = float('-inf')
    best_svr_rmse = float('inf')
    best_svr_score_f = -1
    best_svr_rmse_f = -1

    for f in F:
        print("\npca %d" % f)

        # PCA feature selection
        X_mu, X_Z = pca(f, X.values)
        X_pca = pca_proj(X.values, X_mu, X_Z)
        X_train_mu, X_train_Z = pca(f, X_train.values)
        X_train_pca = pca_proj(X_train.values, X_train_mu, X_train_Z)
        X_test_pca = pca_proj(X_test.values, X_train_mu, X_train_Z)

        # AdaBoost
        print("\nAdaBoost")
        from sklearn.ensemble import AdaBoostRegressor
        adaBoost = AdaBoostRegressor()
        k_z, k_mse, k_rmse, b_z, b_mse, b_rmse = evaluate_model(
            adaBoost, f, X.values, Y.values.ravel(), k=5, B=5)
        if k_rmse < best_ada_rmse:
            best_ada_rmse = k_rmse
            best_ada_rmse_f = f
        adaBoost.fit(X_train_pca, y_train.values.ravel())
        ada_score = adaBoost.score(X_test_pca, y_test.values.ravel())
        print(ada_score)
        if ada_score > best_ada_score:
            best_ada_score = ada_score
            best_ada_score_f = f
        # View predicted values
        predicted = adaBoost.predict(X_test_pca)
        ada_pred = y_test.copy()
        ada_pred['predicted'] = predicted
        ada_pred.head()

        # XGBoost regressor
        print("\nXGBoost")
        from xgboost import XGBRegressor
        xgb = XGBRegressor(max_depth=3, learning_rate=0.2, booster='gbtree',
                           n_estimators=70)
        k_z, k_mse, k_rmse, b_z, b_mse, b_rmse = evaluate_model(
            xgb, f, X.values, Y.values.ravel(), k=5, B=5)
        if k_rmse < best_xg_rmse:
            best_xg_rmse = k_rmse
            best_xg_rmse_f = f
        xgb.fit(X_train_pca, y_train)
        xgb_score = xgb.score(X_test_pca, y_test.values.ravel())
        print(xgb_score)
        if xgb_score > best_xg_score:
            best_xg_score = xgb_score
            best_xg_score_f = f
        predicted = xgb.predict(X_test_pca)
        xgb_pred = y_test.copy()
        xgb_pred['predicted'] = predicted
        xgb_pred.head()

        # SVM (SVR)
        print("\nSVR")
        from sklearn import svm
        svr_model = svm.SVR(kernel="poly", coef0=-3500, gamma='scale')
        # coef0 only works with the poly and sigmoid kernels;
        # it just puts that value instead of the column of 1's.
        # Without it, this model breaks for some reason.
        k_z, k_mse, k_rmse, b_z, b_mse, b_rmse = evaluate_model(
            svr_model, f, X.values, Y.values.ravel(), k=5, B=5)
        if k_rmse < best_svr_rmse:
            best_svr_rmse = k_rmse
            best_svr_rmse_f = f
        # epsilon, degree
        svr_model.fit(X_train_pca, y_train.values.ravel())
        svr_score = svr_model.score(X_test_pca, y_test.values.ravel())
        print(svr_score)
        if svr_score > best_svr_score:
            best_svr_score = svr_score
            best_svr_score_f = f
        svr_predicted = svr_model.predict(X_test_pca)
        svr_pred = y_test.copy()
        svr_pred["predicted"] = svr_predicted
        svr_pred.head()

    return ((best_ada_score, best_ada_score_f), (best_ada_rmse, best_ada_rmse_f),
            (best_xg_score, best_xg_score_f), (best_xg_rmse, best_xg_rmse_f),
            (best_svr_score, best_svr_score_f), (best_svr_rmse, best_svr_rmse_f))
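# The pca()/pca_proj() helpers used above are not defined in this snippet; the
# following is a hypothetical sketch (not the author's code) consistent with how they
# are called: pca(f, X) returns the feature means and the top-f principal directions,
# and pca_proj(X, mu, Z) projects centered data onto those directions.
import numpy as np

def pca(f, X):
    mu = X.mean(axis=0)                          # per-feature mean
    Xc = X - mu                                  # centered data
    _, _, Vt = np.linalg.svd(Xc, full_matrices=False)
    Z = Vt[:f].T                                 # top-f principal directions, shape (d, f)
    return mu, Z

def pca_proj(X, mu, Z):
    return (X - mu) @ Z                          # project onto the f components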
#-----------------------------------------------------------------------------#
# XGBoost
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# initialize model
xgb_clf = XGBRegressor()

# fit model
xgb_clf.fit(X_train, y_train)

# predictions: test data
y_pred = xgb_clf.predict(X_test)

print('\n\n\nXGBoost')

# Scores
print('Train score')
print(xgb_clf.score(X_train, y_train))
print('Test score')
print(xgb_clf.score(X_test, y_test))
print('-------------------------------------------------------')

# MAE
print('Mean absolute error')
print(mean_absolute_error(y_test, y_pred))
print('-------------------------------------------------------')

# MSE
print('Mean squared error')
print(mean_squared_error(y_test, y_pred))
print('-------------------------------------------------------')

# R-squared
print('R-squared')
print(r2_score(y_test, y_pred))
x_train, x_test, y_train, y_test = train_test_split(
    x, y, train_size=0.8, random_state=66, shuffle=True)

#2. Model
# model = DecisionTreeRegressor(max_depth=4)
# model = RandomForestRegressor(max_depth=4)
# model = GradientBoostingRegressor(max_depth=4)
model = XGBRegressor(n_jobs=-1)

#3. Train
model.fit(x_train, y_train)

#4. Evaluate, predict
acc = model.score(x_test, y_test)
print(model.feature_importances_)
print("acc : ", acc)

#5. Visualization
import matplotlib.pyplot as plt
import numpy as np

def plot_feature_importances_dataset(model):
    n_features = dataset.data.shape[1]
    plt.barh(np.arange(n_features), model.feature_importances_, align='center')
    plt.yticks(np.arange(n_features), dataset.feature_names)
    plt.xlabel("Feature Importances")
    plt.ylabel("Features")
    plt.ylim(-1, n_features)
model = XGBRegressor(n_estimators=550, learning_rate=0.05, max_depth=8,
                     colsample_bytree=0.7, reg_alpha=1, scale_pos_weight=1,
                     reg_lambda=1.1, n_jobs=6)

# First target
model.fit(x_train, y_train1, verbose=False,
          eval_metric=['logloss', 'mae'],
          eval_set=[(x_train, y_train1), (x_test, y_test1)],
          early_stopping_rounds=20)

score1 = model.score(x_test, y_test1)
print("score1 : %.4f" % (score1 * 100.0))
# print(model.feature_importances_)

y_pred_1 = model.predict(x_test)
mae1 = mean_absolute_error(y_test1, y_pred_1)
print('mae1 : %.4f' % (mae1))
y_pred1 = model.predict(x_pred)

# Second target, refitting the same estimator
model.fit(x_train, y_train2, verbose=False,
          eval_metric=['logloss', 'mae'],
          eval_set=[(x_train, y_train2), (x_test, y_test2)],
          early_stopping_rounds=20)

score2 = model.score(x_test, y_test2)
print("score2 : %.4f" % (score2 * 100.0))
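# A minimal sketch (assuming the fitted model above) of inspecting the early-stopping
# result: with early_stopping_rounds set, the sklearn wrapper records the best boosting
# round and its eval-set score, and recent xgboost versions use that round in predict().
print('best iteration  :', model.best_iteration)
print('best eval score :', model.best_score)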
# Shape of train and test data
print(X_train_temp.shape, X_val_temp.shape)

# --------------
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error

# Code starts here
dt = DecisionTreeRegressor(random_state=5)
dt.fit(X_train, y_train)
accuracy = dt.score(X_val, y_val)
y_pred = dt.predict(X_val)
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
print(accuracy)
print(y_pred)
print(rmse)

# --------------
from xgboost import XGBRegressor

xgb = XGBRegressor(max_depth=50, learning_rate=0.83, n_estimators=100)
xgb.fit(X_train, y_train)
accuracy = xgb.score(X_val, y_val)
y_pred = xgb.predict(X_val)
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
print(accuracy)
print(y_pred)
print(rmse)

# Code ends here
# Sweep n_estimators for the random forest and plot test MSE against it.
ploter = []
for i in range(1, 100):
    model = forest.RandomForestRegressor(max_depth=6, n_estimators=i)
    model.fit(x_train, y_train)
    ploter.append((i, mean_squared_error(y_test, model.predict(x_test))))
    print(i, '/', 99)

plt.plot([x for x, y in ploter], [y for x, y in ploter], c='b', linewidth=1.5)
plt.show()

pre_data = model_2.predict(test_data)
pre_data = pd.DataFrame(pre_data)
# simple = pd.read_csv('SampleSubmission.csv')
# simple['Item_Outlet_Sales'] = pre_data
# simple.to_csv('generated_SampleSubmission.csv')

model_3 = XGBRegressor(max_depth=4, n_estimators=12, learning_rate=.2)
model_3.fit(x_train, y_train)
print('model 3:', mean_squared_error(y_test, model_3.predict(x_test)))
print('score :', model_3.score(x_test, y_test))
# 8, 40 = 1324449
# 4, 34 = 1150544

# ploter = []
# for i in range(1, 80):
#     model = XGBRegressor(max_depth=4, n_estimators=i, learning_rate=.2)
#     model.fit(x_train, y_train)
#     ploter.append((i, mean_squared_error(y_test, model.predict(x_test))))
#     print(i, '/', 99)
# plt.plot([x for x, y in ploter], [y for x, y in ploter], c='r', marker='.')
# plt.show()
x_train, x_test, y_train, y_test = train_test_split(
    x, y, train_size=0.8, random_state=66, shuffle=True)

#2. Model
# model = DecisionTreeRegressor(max_depth=4)
# model = RandomForestRegressor(max_depth=4)
# model = GradientBoostingRegressor(max_depth=4)
model = XGBRegressor(n_jobs=-1)

#3. Train
model.fit(x_train, y_train)

#4. Evaluate, predict
acc = model.score(x_test, y_test)
print(model.feature_importances_)
print("acc : ", acc)

#5. Visualization
import matplotlib.pyplot as plt
import numpy as np

'''
def plot_feature_importances_dataset(model):
    n_features = dataset.data.shape[1]
    plt.barh(np.arange(n_features), model.feature_importances_, align='center')
    plt.yticks(np.arange(n_features), dataset.feature_names)
    plt.xlabel("Feature Importances")
    plt.ylabel("Features")
    plt.ylim(-1, n_features)
'''
# Random Forest
rf = RandomForestRegressor(n_estimators=100, random_state=0)
rf.fit(x_train, y_train)
y_pred = rf.predict(x_test)
plt.scatter(y_test, y_pred)
print(rf.score(x_test, y_test))
print('MAE :', " ", metrics.mean_absolute_error(y_test, y_pred))
print('MSE :', " ", metrics.mean_squared_error(y_test, y_pred))
print('RMSE :', " ", np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

# XGBoost
xgb = XGBRegressor(n_estimators=500, max_depth=4, learning_rate=0.1,
                   early_stopping_rounds=10)
xgb.fit(x_train, y_train, eval_set=[(x_test, y_test)], verbose=False)
y_pred = xgb.predict(x_test)
plt.scatter(y_test, y_pred)
print(xgb.score(x_test, y_test))
plot_importance(xgb)
print('MAE :', " ", metrics.mean_absolute_error(y_test, y_pred))
print('MSE :', " ", metrics.mean_squared_error(y_test, y_pred))
print('RMSE :', " ", np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
from xgboost import XGBRegressor as XGBR
from sklearn.model_selection import train_test_split as TTS
from sklearn.metrics import mean_squared_error as MSE

X = data.data
y = data.target
Xtrain, Xtest, Ytrain, Ytest = TTS(X, y, test_size=0.3, random_state=420)

# In[]:
# B. Weak-learner hyperparameters:
# 4. booster (choice of weak learner)
for booster in ["gbtree", "gblinear", "dart"]:
    reg = XGBR(n_estimators=260, learning_rate=0.25, random_state=420,
               booster=booster, silent=True).fit(Xtrain, Ytrain)
    print(booster)
    print(reg.score(Xtest, Ytest))
# The linear weak learner (gblinear) performs worst, which suggests the Boston dataset
# is not linear (the features X are not linearly related to the target y).

# In[]:
# 5. objective (loss function)
# sklearn-style XGB: objective defaults to reg:linear
reg = XGBR(n_estimators=270, subsample=0.75, learning_rate=0.13,
           random_state=420).fit(Xtrain, Ytrain)
print(reg.score(Xtest, Ytest))
print(MSE(Ytest, reg.predict(Xtest)))

# In[]:
# Native xgboost library: obj defaults to binary:logistic
# Read the data with the DMatrix class
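# A minimal sketch of the native xgboost API referred to above, reusing the same
# train/test split; the objective 'reg:squarederror' and the parameter values here
# are illustrative assumptions, not the original notebook's exact settings.
import xgboost as xgb

dtrain = xgb.DMatrix(Xtrain, label=Ytrain)
dtest = xgb.DMatrix(Xtest, label=Ytest)
params = {"objective": "reg:squarederror", "eta": 0.13, "seed": 420}
bst = xgb.train(params, dtrain, num_boost_round=270)
print(MSE(Ytest, bst.predict(dtest)))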
# AdaBoost
ada = AdaBoostClassifier()
ada.fit(X_train, y_train)
print(ada.score(X_test, y_test))
print('Parameters currently in use:\n')
pprint(ada.get_params())

# XGBoost
# XGBoost and AdaBoost are weak at multiclass classification; the reason can be found
# in the LightGBM literature review.
from xgboost import XGBRegressor
from sklearn.metrics import accuracy_score

xgb = XGBRegressor()
xgb.fit(X_train, y_train)
print(xgb.score(X_test, y_test))

# Keras baseline
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from keras import optimizers

def baseline_model():
    # create model
    model = Sequential()
    model.add(
        Dense(64, input_dim=25, kernel_initializer='normal', activation='relu'))