def XGB(self, x_train, y_train, x_test, y_test):
    x_train, y_train = shuffle(x_train, y_train)
    xgb = XGBRegressor(max_depth=4, subsample=0.9)
    xgb.fit(x_train, y_train)
    y_pred = xgb.predict(x_test).reshape(x_test.shape[0], 1)
    loss = mean_squared_error(y_test, y_pred)  # y_true comes first
    print(loss)
    return y_pred, loss
def Stacking(real_train_tar):
    predictions_train = pd.DataFrame([np.expm1(y_lasso_predict),
                                      np.expm1(y_ridge_predict),
                                      np.expm1(y_rf_predict),
                                      np.expm1(y_xgb_predict)]).T
    sns.pairplot(predictions_train)
    learning_rate = [round(float(x), 2) for x in np.linspace(start=.1, stop=.2, num=11)]
    # Minimum sum of instance weights needed in a child node
    min_child_weight = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
    # Maximum depth of each tree
    max_depth = [int(x) for x in np.linspace(1, 10, num=10)]
    n_estimators = [int(x) for x in np.linspace(start=100, stop=2000, num=20)]
    subsample = [0.3, 0.4, 0.5, 0.6, 0.7]
    stack_model = xgb.XGBRegressor()
    random_grid = {'learning_rate': learning_rate,
                   'max_depth': max_depth,
                   'min_child_weight': min_child_weight,
                   'subsample': subsample,
                   'n_estimators': n_estimators}
    # Make a RandomizedSearchCV object with the model and the specified hyperparameters
    xgb_stack = RandomizedSearchCV(estimator=stack_model, param_distributions=random_grid,
                                   n_iter=1000, cv=5, verbose=2, random_state=42, n_jobs=-1)
    start = time.time()
    # Fit models
    xgb_stack.fit(predictions_train, real_train_tar)
    write_pkl(xgb_stack.best_params_, '/Users/vickywinter/Documents/NYC/Machine Learning Proj/Pickle/stack_params.pkl')
    model_stacking = XGBRegressor(**xgb_stack.best_params_)
    #model_xgb = XGBRegressor(**best_params_)
    start = time.time()
    model_stacking.fit(predictions_train, real_train_tar)
    end = time.time()
    print("MSE for train data is: %f" % mean_squared_error(np.log1p(real_train_tar),
                                                           np.log1p(model_stacking.predict(predictions_train))))
    print('Time elapsed: %.4f seconds' % (end - start))
    y_stack_predict = model_stacking.predict(predictions_train)
    x_line = np.arange(700000)
    y_line = x_line
    plt.scatter(real_train_tar, y_stack_predict)
    plt.plot(x_line, y_line, color='r')
    plt.xlabel('Actual Sale Price')
    plt.ylabel('Predicted Sale Price')
class HousePricePredictor(BaseModel):

    def __init__(self):
        self.model = XGBRegressor()

    def predict(self, X):
        X = self._prepare_data(X)
        return self.model.predict(X)

    def _prepare_data(self, X):
        return pd.DataFrame(X, columns=FEATURES)

    def fit(self, X, y):
        model = XGBRegressor()
        clf = GridSearchCV(
            model,
            {
                'max_depth': [6],
                'learning_rate': [0.05],
                'n_estimators': [450, 470, 475, 480, 485],
            },
            n_jobs=4,
            cv=3,
            verbose=1,
        )
        clf.fit(X, y)
        logging.info("Best Score: {}".format(clf.best_score_))
        logging.info("Best Params: {}".format(clf.best_params_))
        self.model = clf.best_estimator_
        return self.model

    def dump(self, path):
        self.model.save_model(path)

    @classmethod
    def load(cls, path):
        house_model = cls()
        house_model.model.load_model(path)
        return house_model
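# A hedged round-trip sketch for the predictor above, assuming FEATURES lists the
# feature column names and BaseModel imposes no extra requirements; the data below
# is synthetic and purely illustrative, not part of the original project.
import numpy as np

FEATURES = ["sqft", "bedrooms"]  # hypothetical feature names
X_demo = np.random.rand(100, len(FEATURES))
y_demo = X_demo[:, 0] * 300000 + X_demo[:, 1] * 50000

predictor = HousePricePredictor()
predictor.fit(X_demo, y_demo)                 # grid-searches and keeps the best estimator
predictor.dump("house_model.json")            # save_model infers the format from the extension
restored = HousePricePredictor.load("house_model.json")
print(restored.predict(X_demo[:5]))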
test = test.drop(["单核细胞%"], axis=1)  # drop the monocyte-percentage column
# X = X.drop(["淋巴细胞%"], axis=1)
# X = X.drop(["乙肝e抗原"], axis=1)
# X = X.drop(["乙肝表面抗体"], axis=1)
Y = data["血糖"]  # target: blood glucose
Y = np.log1p(Y)
clf = XGBRegressor()
print("---111----")
# shuffle=True is required by recent scikit-learn when random_state is set
kfold = KFold(n_splits=5, shuffle=True, random_state=7)
test_score = np.sqrt(-cross_val_score(clf, X, Y, cv=kfold, scoring='neg_mean_squared_error'))
print("------test_score--------")
print(test_score)
print(np.mean(test_score))
print("---2----")
clf.fit(X, Y)
# clf.booster() was renamed get_booster() in recent xgboost releases
FeatureImportances = pd.Series(clf.get_booster().get_fscore()).sort_values(ascending=False)
print(FeatureImportances)
print("---3----")
pred = np.expm1(clf.predict(test))
pred_df = pd.DataFrame()
pred_df["pred"] = pred
pred_df.to_csv('/Users/jianjun.yue/PycharmGItHub/data/人工智能辅助糖尿病遗传风险预测/sub_0107_XG_去掉负效果特征.csv',
               header=False, index=False, float_format='%.3f')
print('RMSLE KNeighborsRegressor: ', RMSLE(np.log1p(train['visitors'].values), model2.predict(train[col])))
#test['visitors'] = (model1.predict(test[col]) + model2.predict(test[col])) / 2
test['visitors'] = model2.predict(test[col])
test['visitors'] = np.expm1(test['visitors']).clip(lower=0.)
sub1 = test[['id', 'visitors']].copy()
#del train; del data;
sub1[['id', 'visitors']].to_csv(os.path.join(path_kaggle, 'naive_forecast2.csv'), index=False)

from xgboost import XGBRegressor
model3 = XGBRegressor()
model3.fit(train[col], np.log1p(train['visitors'].values), verbose=False)
print('XGBRegressor: ', RMSLE(np.log1p(train['visitors'].values), model3.predict(train[col])))

## from hklee
## https://www.kaggle.com/zeemeen/weighted-mean-comparisons-lb-0-497-1st/code
#dfs = { re.search('/([^/\.]*)\.csv', fn).group(1):
#    pd.read_csv(fn) for fn in glob.glob('../input/*.csv')}
#
#for k, v in dfs.items(): locals()[k] = v
#
#wkend_holidays = date_info.apply(
#    (lambda x: (x.day_of_week == 'Sunday' or x.day_of_week == 'Saturday') and x.holiday_flg == 1), axis=1)
#date_info.loc[wkend_holidays, 'holiday_flg'] = 0
#date_info['weight'] = ((date_info.index + 1) / len(date_info)) ** 5
#
#visit_data = air_visit_data.merge(date_info, left_on='visit_date', right_on='calendar_date', how='left')
#visit_data.drop('calendar_date', axis=1, inplace=True)
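# The RMSLE helper used above is not defined in this excerpt. A minimal sketch of
# the usual definition is below; since both arguments here are already
# log1p-transformed, it reduces to a plain RMSE on the log targets. The body is
# an assumption, not the original author's code.
from sklearn import metrics

def RMSLE(y, pred):
    # inputs are already log1p-transformed, so RMSE here equals the RMSLE
    return metrics.mean_squared_error(y, pred) ** 0.5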
mea = getmea(max_leaf_nodes, train_x, val_x, train_y, val_y)
print("Max_leaf_nodes: %d, mae: %d" % (max_leaf_nodes, mea))
'''
# clf = XGBRegressor()                   17165
# XGBRegressor(n_estimators=400)         16330
'''
params = [.02, .03, .04, .05, .06, .07, .08, .09, .10]  # [1:1001:50] [100,200,300,400,500]
test_scores = []
for param in params:
    clf = XGBRegressor(n_estimators=400, learning_rate=param)
    test_score = np.sqrt(-cross_val_score(clf, train_X, train_y, cv=10, scoring='neg_mean_squared_error'))
    test_scores.append(np.mean(test_score))
plt.plot(params, test_scores)
plt.title("learning_rate vs CV Error " + str(params))
# plt.show() is required for the figure to appear on screen
plt.show()

my_model = XGBRegressor(n_estimators=400)
my_model.fit(train_X, train_y, verbose=False)
predictions = my_model.predict(test_X)
print("Mean Absolute Error : " + str(mean_absolute_error(test_y, predictions)))
#save model
#joblib.dump(melbourne_model, 'model.pickle')
#load model
#model = joblib.load('model.pickle')
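# getmea is called above but never defined in this excerpt. A plausible minimal
# sketch follows, assuming it fits a decision tree capped at max_leaf_nodes and
# returns the validation MAE; the name and signature come from the call above,
# the body is an assumption.
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error

def getmea(max_leaf_nodes, train_x, val_x, train_y, val_y):
    # fit a tree with the given leaf cap and score it on the validation split
    model = DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes, random_state=0)
    model.fit(train_x, train_y)
    preds = model.predict(val_x)
    return mean_absolute_error(val_y, preds)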
def ValidateTrainTestErrorsWithDifferentModels(cvX_train, cvX_test, cvy_train, cvy_test, X_train, y_train, X_test):
    clfs = list()
    cvClfs = list()

    print("Building RF1")
    rfShortCV = ensemble.RandomForestRegressor(min_samples_split=50, n_estimators=1000, max_depth=None,
                                               min_samples_leaf=50, max_features="auto", n_jobs=-1, random_state=0)
    rfShort = ensemble.RandomForestRegressor(min_samples_split=50, n_estimators=1000, max_depth=None,
                                             min_samples_leaf=50, max_features="auto", n_jobs=-1, random_state=0)
    rfShortCV.fit(cvX_train, cvy_train)
    print('RF1 CV Results :', mean_absolute_error(cvy_test, rfShortCV.predict(cvX_test)))
    pd.DataFrame({"Actual": cvy_test, "Predicted": rfShortCV.predict(cvX_test)}).to_csv("snehaRF.csv", index=False, header=True)
    rfShort.fit(X_train, y_train)
    cvClfs.append(rfShortCV)
    clfs.append(rfShort)
    pd.DataFrame({"ID": out_id, "Expected": rfShort.predict(X_test)}).to_csv("subRF1.csv", index=False, header=True)

    print("Building SVM")
    clfSVRCV = SVR(C=10.0)
    clfSVR = SVR(C=10.0)
    clfSVRCV.fit(cvX_train, cvy_train)
    print('SVM CV Results :', mean_absolute_error(cvy_test, clfSVRCV.predict(cvX_test)))
    pd.DataFrame({"Actual": cvy_test, "Predicted": clfSVRCV.predict(cvX_test)}).to_csv("snehaSVR.csv", index=False, header=True)

    print("Building RF2")
    rfLongCV = ensemble.RandomForestRegressor(min_samples_split=200, n_estimators=1000, max_depth=7,
                                              min_samples_leaf=200, max_features="auto", n_jobs=4, random_state=0)
    rfLong = ensemble.RandomForestRegressor(min_samples_split=200, n_estimators=1000, max_depth=7,
                                            min_samples_leaf=200, max_features="auto", n_jobs=4, random_state=0)
    rfLongCV.fit(cvX_train, cvy_train)
    print('RF2 CV Results :', mean_absolute_error(cvy_test, rfLongCV.predict(cvX_test)))
    rfLong.fit(X_train, y_train)
    cvClfs.append(rfLongCV)
    clfs.append(rfLong)
    pd.DataFrame({"ID": out_id, "Expected": rfLong.predict(X_test)}).to_csv("subRF2.csv", index=False, header=True)

    print("Building GB1")
    regGBCV1 = ensemble.GradientBoostingRegressor(min_samples_split=50, n_estimators=1000, max_depth=None,
                                                  min_samples_leaf=50, max_features="auto", subsample=0.6,
                                                  learning_rate=0.01, random_state=0, loss='lad')
    regGBCV1.fit(cvX_train, cvy_train)
    print('GB1 CV Results :', mean_absolute_error(cvy_test, regGBCV1.predict(cvX_test)))
    regGB1 = ensemble.GradientBoostingRegressor(min_samples_split=50, n_estimators=1000, max_depth=None,
                                                min_samples_leaf=50, max_features="auto", subsample=0.6,
                                                learning_rate=0.01, random_state=0, loss='lad')
    regGB1.fit(X_train, y_train)
    cvClfs.append(regGBCV1)
    clfs.append(regGB1)
    pd.DataFrame({"ID": out_id, "Expected": regGB1.predict(X_test)}).to_csv("subGB1.csv", index=False, header=True)

    print('Building GB2')
    regGBCV2 = ensemble.GradientBoostingRegressor(min_samples_split=50, n_estimators=1000, max_depth=7,
                                                  min_samples_leaf=200, max_features="auto", subsample=0.6,
                                                  learning_rate=0.01, random_state=0, loss='lad')
    regGBCV2.fit(cvX_train, cvy_train)
    print('GB2 CV Results :', mean_absolute_error(cvy_test, regGBCV2.predict(cvX_test)))
    regGB2 = ensemble.GradientBoostingRegressor(min_samples_split=50, n_estimators=1000, max_depth=7,
                                                min_samples_leaf=200, max_features="auto", subsample=0.6,
                                                learning_rate=0.01, random_state=0, loss='lad')
    regGB2.fit(X_train, y_train)
    cvClfs.append(regGBCV2)
    clfs.append(regGB2)
    pd.DataFrame({"ID": out_id, "Expected": regGB2.predict(X_test)}).to_csv("subGB2.csv", index=False, header=True)

    print('Feature Importances RF1:', sorted(zip(map(lambda x: round(x, 4), rfShort.feature_importances_), df_final.columns), reverse=True))
    print('Feature Importances GB1:', sorted(zip(map(lambda x: round(x, 4), regGB1.feature_importances_), df_final.columns), reverse=True))
    print('Feature Importances RF2:', sorted(zip(map(lambda x: round(x, 4), rfLong.feature_importances_), df_final.columns), reverse=True))
    print('Feature Importances GB2:', sorted(zip(map(lambda x: round(x, 4), regGB2.feature_importances_), df_final.columns), reverse=True))

    print("Building XGB1")
    xgbCV1 = xgb.XGBRegressor(n_estimators=3000, nthread=-1, max_depth=None, learning_rate=0.01,
                              silent=True, subsample=0.8, colsample_bytree=0.7)
    xgbCV1.fit(cvX_train, cvy_train)
    xgb1 = xgb.XGBRegressor(n_estimators=3000, nthread=-1, max_depth=None, learning_rate=0.01,
                            silent=True, subsample=0.8, colsample_bytree=0.7)
    xgb1.fit(X_train, y_train)
    print('XGB1 Model CV :', mean_absolute_error(cvy_test, xgbCV1.predict(cvX_test)))
    cvClfs.append(xgbCV1)
    clfs.append(xgb1)
    pd.DataFrame({"ID": out_id, "Expected": xgb1.predict(X_test)}).to_csv("subXGB1.csv", index=False, header=True)

    print("Building XGB2")
    params = {}
    params["objective"] = "reg:linear"
    params["learning_rate"] = 0.005
    params["min_child_weight"] = 6
    params["subsample"] = 0.7
    params["colsample_bytree"] = 0.75
    params["silent"] = 1
    params["max_depth"] = 7
    params["n_estimators"] = 3000
    params['gamma'] = 1.25
    params['nthread'] = -1
    print('XGBoost Training Process Started')
    xgbCV2 = XGBRegressor(**params)
    xgbCV2.fit(cvX_train, cvy_train)
    print('XGB Model CV :', mean_absolute_error(cvy_test, xgbCV2.predict(cvX_test)))
    xgb2 = XGBRegressor(**params)
    xgb2.fit(X_train, y_train)
    cvClfs.append(xgbCV2)
    clfs.append(xgb2)
    pd.DataFrame({"ID": out_id, "Expected": xgb2.predict(X_test)}).to_csv("subXGB2.csv", index=False, header=True)

    # Return the cross-validated models and the fully fitted models separately.
    return [clfs, cvClfs]
for param in params:
    clf = XGBRegressor(n_estimators=param)
    test_score = np.sqrt(-cross_val_score(clf, train_x, train_y, cv=10, scoring='neg_mean_squared_error'))
    test_scores.append(np.mean(test_score))
print(test_scores)
plt.plot(params, test_scores)
plt.title("n_estimators vs CV Error")
# plt.show() is required for the figure to appear on screen
plt.show()
# Save the current figure to result.png
#plt.savefig('./xgboostparams.png')

# 91 16889
xgb = XGBRegressor(max_depth=6, n_estimators=400)
xgb.fit(X, y)
print(mean_absolute_error(val_y, xgb.predict(val_x)))
print(mean_squared_error(val_y, xgb.predict(val_x)))

#gbdt
'''
print("GradientBoostingRegressor")
gbdt = GradientBoostingRegressor(n_estimators=1000, max_leaf_nodes=400)
gbdt.fit(X, y)  # 17083
#RandomForestRegressor      93 16938
#GradientBoostingRegressor  90 16866
#XGBRegressor              100 19939
print(mean_absolute_error(val_y, gbdt.predict(val_x)))
print(mean_squared_error(val_y, gbdt.predict(val_x)))
'''
# predict and save output
#print("The predictions are")
test_size = .3
X_train, X_test, y_train, y_test = train_test_split(X, log_loss, test_size=test_size, random_state=seed)
model = XGBRegressor(learning_rate=0.08,
                     max_depth=10,
                     objective='reg:linear',
                     nthread=3,
                     gamma=0.2,
                     subsample=0.9,
                     n_estimators=100)
model.fit(X_train, y_train)
print(model)
y_pred = model.predict(X_test)

def mae(predicted, actual, logscale=False):
    # if the model was trained on log targets, compare in the original scale
    if logscale:
        predexp = np.exp(predicted)
        actualexp = np.exp(actual)
        return np.mean(np.abs(predexp - actualexp))
    else:
        return np.mean(np.abs(predicted - actual))

print(mae(y_pred, y_test, True))

# #Plotting Variable Importance
# plt.bar(range(len(model.feature_importances_)), model.feature_importances_)
# plt.title('Variable Importance')
print(X.head())
#%%
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=0)
my_model = XGBRegressor()
#%%
my_model.fit(train_X, train_y, early_stopping_rounds=5,
             eval_set=[(val_X, val_y)], verbose=False)
#%%
print("Making predictions for the following 5 situations:")
print(X.head())
print("The predictions are")
print(my_model.predict(X.head()))
#%%
predictions = my_model.predict(val_X)
#print(predictions)
#%%
rmse = np.sqrt(mean_squared_error(val_y, predictions))
R = r2_score(val_y, predictions)
MAPE = np.mean(np.abs((val_y - predictions) / val_y)) * 100
print("RMSE: %f" % (rmse))
#                     alpha = 1,
#                     gamma = 2,
#                     min_child_weight = 1,
#                     base_score = 7.76
#                     nrounds = 5000,
#                     nfold = 5,
#                     early_stopping_rounds = 15,
#                     print_every_n = 10,
#                     verbose = 1,
#                     feval = xg_eval_mae,
#                     maximize = FALSE
#                     )
folds = KFold(n_splits=3, shuffle=False)
for k, (train_index, test_index) in enumerate(folds.split(train_xg_x)):
    xtr = train_xg_x[train_index]
    ytr = train_xg_y[train_index]
    xtest = train_xg_x[test_index]
    ytest = train_xg_y[test_index]
    print("Fitting on fold {}...".format(k))
    print("Checking xtest shape: ", xtest.shape)
    print("Checking ytest shape: ", ytest.shape)
    xgboosting.fit(xtr, ytr, verbose=True)
    np.savetxt('xgb_pred_fold_{}.txt'.format(k), np.exp(xgboosting.predict(xtest)))
    np.savetxt('xgb_test_fold_{}.txt'.format(k), ytest)

# Refit xgboost on the whole train set before predicting on the test set.
print("Fitting on test set...")
xgboosting.fit(train_xg_x, train_xg_y, verbose=True)
np.savetxt('xgb_pred_test.txt', np.exp(xgboosting.predict(test_xg_x)))
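# xgboosting is assumed to have been constructed earlier in the original script.
# A plausible sketch consistent with the commented-out parameter block above
# follows; the exact values are an assumption, and xg_eval_mae plus the np.exp
# calls on predictions suggest the targets were log-transformed.
from xgboost import XGBRegressor

xgboosting = XGBRegressor(n_estimators=5000,   # matches the commented nrounds
                          min_child_weight=1,
                          gamma=2,
                          reg_alpha=1,         # matches the commented alpha
                          base_score=7.76,
                          nthread=-1)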
ERR = 0
min_err = 1
for j in [15000]:
    # m = pd.read_csv('./data/result/6_0.0322.csv', names=['vid', '收缩压', '舒张压', '血清甘油三酯', '血清高密度脂蛋白', '血清低密度脂蛋白'])
    ############################################
    # clf = LGBMRegressor(n_estimators=100, subsample_for_bin=j, learning_rate=0.08, num_leaves=46, subsample=0.97, min_split_gain=3)
    clf = XGBRegressor(n_estimators=200, max_depth=7, min_child_weight=1, gamma=0)
    for i in y_names:
        clf.fit(train_x, train_y[i])
        pred_ = clf.predict(test_x)
        # mean squared log error for this target
        a = sum((np.log(pred_ + 1) - np.log(test_y[i] + 1)) ** 2) / len(pred_)
        joblib.dump(clf, './data/model/{0}.model_{1}'.format(i, a))
        print(a)
        ERR += a
    print(ERR / 5.0)  # average over the five targets
#################################################
# for i in y_names:
#     if i == '血清甘油三酯':
#         # clf = joblib.load('./data/model_select/血清甘油三酯.model_0.07747025561699392')
#         pass
#     elif i == '血清高密度脂蛋白':
#         clf = joblib.load('./data/model/血清高密度脂蛋白.model_0.011323631892292683')
#     elif i == '收缩压':
#         clf = joblib.load('./data/model/收缩压.model_0.014185246052119297')
#     elif i == '舒张压':
y = array[:, len(names) - 1]

# Split out test and validation sets
test_size = 0.4
# Keep the time-series order with shuffle=False
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, shuffle=False)

# Fit model (note: the keyword is n_estimators, not n_estimator)
model = XGBRegressor(n_estimators=200, max_depth=2, learning_rate=0.1, colsample_bylevel=0.8, n_jobs=-1)
model.fit(X_train, y_train)
print(model)

# Make predictions
predictions = model.predict(X_test)

# Evaluate predictions
evc = explained_variance_score(y_test, predictions)
print("Explained variance: %.2f%%" % (evc * 100.0))
mae = mean_absolute_error(y_test, predictions)
print("Mean absolute error: %.2f" % (mae))
rmse = sqrt(mean_squared_error(y_test, predictions))
print("RMSE: %.2f" % (rmse))
r2 = r2_score(y_test, predictions)
print("R2 coefficient of determination: %.2f" % (r2))

# For a sampled population, confidence interval = standard error * 1.96; https://en.wikipedia.org/wiki/Standard_error
# Here 2 * std error = std(predictions) * 2 is used instead
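# The trailing comment describes the interval but the computation itself is
# missing from this excerpt. A minimal sketch of the approximation it describes,
# using two standard deviations of the predictions as the half-width:
half_width = np.std(predictions) * 2  # rough ~95% band, per the comment above;
                                      # this is the spread of the predictions,
                                      # not a true standard error
lower = predictions - half_width
upper = predictions + half_width
print("Interval half-width: %.2f" % half_width)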
model = XGBRegressor(n_estimators=1000, learning_rate=0.1)
model.fit(x_train, y_train, verbose=True, eval_metric=["logloss", "rmse"],
          eval_set=[(x_train, y_train), (x_test, y_test)],
          early_stopping_rounds=20)
# Available metrics include rmse, mae, logloss, error, auc
# 'error' is a classification metric, not a regression one
# To use two or more eval metrics, pass them as a list

result = model.evals_result()
print("eval's results :", result)

y_predict = model.predict(x_test)

r2 = r2_score(y_test, y_predict)  # y_true comes first
print("r2 Score : %.2f%%" % (r2 * 100.0))
print("r2 :", r2)

# Stopping. Best iteration:
# [28] validation_0-rmse:0.06268   validation_1-rmse:0.28525
# Training stopped once the validation metric started rising; between training
# loss and validation score, validation is what matters.

import matplotlib.pyplot as plt

epochs = len(result['validation_0']['logloss'])  # number of boosting rounds actually run
x_axis = range(0, epochs)

thresholds = np.sort(model.feature_importances_)  # sorted ascending, lowest importance first
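# epochs and x_axis are set up above but the plot itself is missing from this
# excerpt. A minimal sketch of the usual learning-curve plot over evals_result();
# the axis labels are assumptions.
fig, ax = plt.subplots()
ax.plot(x_axis, result['validation_0']['rmse'], label='Train')
ax.plot(x_axis, result['validation_1']['rmse'], label='Validation')
ax.legend()
ax.set_xlabel('Boosting round')
ax.set_ylabel('RMSE')
ax.set_title('XGBoost learning curve')
plt.show()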
xgb1 = XGBRegressor(n_estimators=100, subsample=0.9, learning_rate=0.08, reg_alpha=1e-05)
xgb1.fit(X_train, y_train)
print("---2----")
predict_df = pd.read_excel('/Users/jianjun.yue/PycharmGItHub/data/智能制造质量预测/测试A.xlsx',
                           header=0, encoding='utf-8')
# predict_df = predict_df.drop(["ID"], axis=1)
predict_df = predict_df[quantity]
# predict_df = Imputer().fit_transform(predict_df)
print("---3----")
pred = xgb1.predict(predict_df)
pred_df = pd.DataFrame()
pred_df["pred"] = pred
pred_df.to_csv('/Users/jianjun.yue/PycharmGItHub/data/智能制造质量预测/测试A-答案模板_pred.csv',
               index=False, float_format='%.4f')
print("---4----")
submission_df = pd.DataFrame()
submission_iddf = pd.read_csv('/Users/jianjun.yue/PycharmGItHub/data/智能制造质量预测/测试A-答案模板.csv')
pred_df_TEMP = pd.read_csv('/Users/jianjun.yue/PycharmGItHub/data/智能制造质量预测/测试A-答案模板_pred.csv')
submission_df["id"] = submission_iddf["id"]
submission_df["pred"] = pred_df_TEMP["pred"]
test_X = data_test.copy()
numeric_cols = train_X.dtypes[train_X.dtypes != 'object'].index
train_X = train_X[numeric_cols]
test_X = test_X[numeric_cols]
train_X = train_X.fillna(train_X.mean())
test_X = test_X.fillna(test_X.mean())

from xgboost import XGBRegressor
XGBmodel = XGBRegressor()
XGBmodel.fit(train_X, train_y, verbose=False)
predictions = XGBmodel.predict(test_X)
XGB_Submission = pd.read_csv(r'I:\Data Science Fundamentals\house-prices-advanced-regression-techniques\sample_submission.csv',
                             index_col='Id')
XGB_Submission['SalePrice'] = XGBmodel.predict(test_X)
XGB_Submission.to_csv('XGB_Submission.csv')
XGB_Submission.head()

# In[97]:

# Model 5: XGB Regressor
train_X = data_train.loc[:, :'SaleCondition']  # 'SaleCondition' is the second-to-last column, just before 'SalePrice'
train_y = data_train.loc[:, 'SalePrice']
# This DataFrame will be used for the predictions
print(rmse_list)
hparam = vals[np.argmin(rmse_list)]
print('the best hyperparameter', hparam)
plt.plot(vals, rmse_list)  # 1963 was the optimal n_estimators
plt.xlabel('hyperparameter')
plt.ylabel('mean absolute error')
plt.title('hyperparameter tuning')
plt.show()
'''

# Model fit for final predictions
model = XGBRegressor(n_estimators=2300,
                     gamma=0.06868686868686869,
                     colsample_bytree=0.8383838384,
                     subsample=0.36969696969696975,
                     reg_lambda=0.4444444444444445,
                     max_depth=1,
                     eval_metric="rmse",
                     reg_alpha=0.4050707070707071)
model.fit(X, y, verbose=False)
predictions = pd.Series(model.predict(predict_these))
predictions.to_csv("Submission.csv")

# Shift the written row ids up by one to match the expected submission ids.
f = open("Submission.csv", "r")
f_out = open("Submission_out.csv", "w")
for line in f.readlines():
    line = line.split(',')
    line[0] = str(int(line[0]) + 1)
    line = ",".join(line)
    f_out.write(line)
f.close()
f_out.close()
# -*- coding: utf-8 -*-
from xgboost import XGBRegressor
import pandas as pd

train = pd.read_csv("C:\\Users\\jowet\\Downloads\\Santander\\train.csv")
test = pd.read_csv("C:\\Users\\jowet\\Downloads\\Santander\\test.csv")

train.drop('ID', axis=1, inplace=True)
y_train = train.pop('target')
pred_index = test.pop('ID')

reg = XGBRegressor()
reg.fit(train, y_train)
y_pred = reg.predict(test)

submit = pd.DataFrame()
submit['ID'] = pred_index
submit['target'] = y_pred
submit.to_csv('my_XGB_prediction.csv', index=False)
from sklearn.preprocessing import StandardScaler
standard_scaler = StandardScaler()

# %%
scaled_X_train = standard_scaler.fit_transform(X_train_xgb)
scaled_X_test = standard_scaler.transform(X_test_xgb)

# %%
from xgboost import XGBRegressor
modelXGB = XGBRegressor(n_estimators=2000, learning_rate=0.8)
modelXGB.fit(scaled_X_train, y_train_xgb,
             eval_set=[(scaled_X_train, y_train_xgb), (scaled_X_test, y_test_xgb)],
             verbose=True)
modelXGB_pred = modelXGB.predict(scaled_X_test)

# %%
from sklearn.metrics import mean_absolute_error
print('XGBOOST MAE = ', mean_absolute_error(y_test_xgb, modelXGB_pred))

# %%
XGBOOST_df = pd.DataFrame({'y': modelXGB_pred})
XGBOOST_df.index = y_test_xgb.index

# %%
def coversion(X, day_new):
    # derive calendar features from the day_new datetime index
    X['day'] = pd.DatetimeIndex(day_new).day
    X['month'] = pd.DatetimeIndex(day_new).month
    X['quarter'] = pd.DatetimeIndex(day_new).quarter
accuracies = cross_val_score(estimator=regressor, X=X_train, y=y_train, cv=5)
accuracies.mean()
accuracies.std()

# Applying Grid Search to find the best model and the best parameters
from sklearn.model_selection import GridSearchCV
#parameters = [{'C': [1, 10, 100, 1000], 'kernel': ['linear']},
#              {'C': [1, 10, 100, 1000], 'kernel': ['rbf'], 'gamma': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]}]
parameters = [{
    'learning_rate': [0.01, 0.1, 0.2],
    'gamma': [0.1, 0.2, 0.3, 0.4, 0.5]
}]
grid_search = GridSearchCV(estimator=regressor, param_grid=parameters, cv=5, n_jobs=-1)
grid_search = grid_search.fit(X_train, y_train)
best_accuracy = grid_search.best_score_
best_parameters = grid_search.best_params_

# Predicting new values with test data
y_pred = regressor.predict(X_test)

# Finding the RMSE value
import math
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
mae = mean_absolute_error(y_test, y_pred)
rmse = math.sqrt(mean_squared_error(y_test, y_pred))
x1 = joblib.load(data[1])
X = pd.concat((x0, x1), axis=1)
del x0, x1
X_train = X[:num_train]
X_test = X[num_train:]
del X
# X_train, X_val, y_train, y_val = train_test_split(X,
#                                                   y,
#                                                   test_size=0.1,
#                                                   random_state=42)
xgbr.fit(
    X_train,
    y_train,
    # eval_metric='rmse',
    # early_stopping_rounds=30,
    verbose=True,
    # eval_set=[(X_val, y_val)],
)
y_pred = xgbr.predict(X_test)
res.append(y_pred)

final_res = np.mean(res, axis=0)
# Clip the bagged relevance predictions into the valid [1, 3] range.
final_res[final_res < 1] = 1
final_res[final_res > 3] = 3
pd.DataFrame({"id": id_test, "relevance": final_res}).to_csv("xgbr_sub_20160423_bag.csv", index=False)
imputed_final_test[col + '_was_missing'] = imputed_final_test[col].isnull()

# Imputation (sklearn's old Imputer was removed; SimpleImputer is the replacement)
from sklearn.impute import SimpleImputer
imputer = SimpleImputer()
train_X = imputer.fit_transform(imputed_X_train_plus)
test_X = imputer.transform(imputed_X_test_plus)
output_test = imputer.transform(imputed_final_test)

# Train the model
model = XGBRegressor(n_estimators=1000, learning_rate=0.05)
model.fit(train_X, train_y, early_stopping_rounds=5,
          eval_set=[(test_X, test_y)], verbose=False)

# Test the accuracy
val_predictions = model.predict(test_X)
print("Mean Absolute Error : " + str(mean_absolute_error(test_y, val_predictions)))

# Output
output_predictions = model.predict(output_test)
print(output_predictions)
my_submission = pd.DataFrame({
    'Id': test_data.Id,
    'SalePrice': output_predictions
})
my_submission.to_csv('submission.csv', index=False)
    test_score = np.sqrt(-cross_val_score(clf, train_x, train_y, cv=10, scoring='neg_mean_squared_error'))
    test_scores.append(np.mean(test_score))
print(test_scores)
plt.plot(params, test_scores)
plt.title("n_estimators vs CV Error")
# plt.show() is required for the figure to appear on screen
plt.show()
# Save the current figure to result.png
#plt.savefig('./xgboostparams.png')

# XGBRegressor 91 16889
print("XGBRegressor")
xgb = XGBRegressor(max_depth=6, n_estimators=400)
xgb.fit(X, y)
print(mean_absolute_error(val_y, xgb.predict(val_x)))
print(mean_squared_error(val_y, xgb.predict(val_x)))

#gbdt
print("GradientBoostingRegressor")
gbdt = GradientBoostingRegressor(n_estimators=1000, max_leaf_nodes=400)
gbdt.fit(X, y)  # 17083
#RandomForestRegressor      93 16938
#GradientBoostingRegressor  90 16866
print(mean_absolute_error(val_y, gbdt.predict(val_x)))
print(mean_squared_error(val_y, gbdt.predict(val_x)))

#xgb & gbdt ensemble: average the two predictions
predicted = (xgb.predict(val_x) + gbdt.predict(val_x)) / 2
print(mean_absolute_error(val_y, predicted))
random_grid = {'n_estimators': n_estimators,
               'max_depth': max_depth,
               'learning_rate': learning_rate}
print(random_grid)
rand_cv = RandomizedSearchCV(estimator=xg_reg, param_distributions=random_grid,
                             scoring='neg_mean_squared_error', cv=10)
rand_cv.fit(x_train, y_train)
rand_cv.best_params_

xg_reg = XGBRegressor(max_depth=5, learning_rate=0.01, n_estimators=1000)
xg_reg.fit(x_train, y_train)
y_rg = xg_reg.predict(x_test)
sns.distplot(y_test - y_rg)
print('MAE:', metrics.mean_absolute_error(y_test, y_rg))
print('MSE:', metrics.mean_squared_error(y_test, y_rg))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, y_rg)))

from lightgbm import LGBMRegressor
lg = LGBMRegressor()

# Hyperparameter tuning
n_estimators = [int(x) for x in np.linspace(start=100, stop=1200, num=12)]
max_depth = [-5, -3, -2, -1, 1, 3, 5, 8, 15, 20, 25, 30]
learning_rate = [0.001, 0.002, 0.005, 0.01, 0.02, 0.05, 0.1, 0.2, 0.5]
from sklearn.model_selection import RandomizedSearchCV
typ = df_valid.dtypes
df_valid.to_csv('df_valid_cat.csv', header=None, index=False)
df_train.columns

RFR = RandomForestRegressor()
RFR.fit(X_train, Y_train)
RFR_preds = pd.DataFrame(RFR.predict(X_test), columns=['salePrice'], index=Y_test.index)
print(mean_absolute_error(Y_test, RFR_preds))
RFR_new = RFR_preds.apply(lambda x: np.power(np.e, x).astype('int64'))

XGB = XGBRegressor()
XGB.fit(X_train, Y_train, verbose=False)
XGB_preds = pd.DataFrame(XGB.predict(X_test), columns=['salePrice'], index=Y_test.index).astype(int)
print(mean_absolute_error(Y_test, XGB_preds))
XGB_new = XGB_preds.apply(lambda x: np.power(np.e, x).astype('int64'))

GBR = GradientBoostingRegressor()
GBR.fit(X_train, Y_train)
GBR_preds = pd.DataFrame(GBR.predict(X_test), columns=['salePrice'], index=Y_test.index)
print(mean_absolute_error(Y_test, GBR_preds))
GBR_new = GBR_preds.apply(lambda x: np.power(np.e, x).astype('int64'))

sns.swarmplot(x=GBR_preds['salePrice'], y=Y_test)

from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
import time
start = time.time()

for thresh in thresholds:  # one iteration per feature column
    selection = SelectFromModel(model, threshold=thresh, prefit=True)  # e.g. threshold='median'
    select_x_train = selection.transform(x_train)
    print(select_x_train.shape)

    selection_model = XGBRegressor()
    selection_model.fit(select_x_train, y_train)

    select_x_test = selection.transform(x_test)
    y_predict = selection_model.predict(select_x_test)

    score = r2_score(y_test, y_predict)
    print("Thresh=%.3f, n=%d, R2: %.2f%%" % (thresh, select_x_train.shape[1], score * 100.0))

end = time.time() - start
print(end)

import time
start2 = time.time()
for thresh in thresholds:  # one iteration per feature column
    selection = SelectFromModel(model, threshold=thresh, prefit=True)
class PrudentialRegressorCVO(BaseEstimator, RegressorMixin):
    def __init__(self,
                 objective='reg:linear',
                 learning_rate=0.045,
                 min_child_weight=50,
                 subsample=0.8,
                 colsample_bytree=0.7,
                 max_depth=7,
                 n_estimators=700,
                 nthread=-1,
                 seed=0,
                 n_buckets=8,
                 initial_params=[-1.5, -2.6, -3.6, -1.2, -0.8, 0.04, 0.7, 3.6,
                                 #1., 2., 3., 4., 5., 6., 7.
                                 ],
                 minimizer='BFGS',
                 scoring=NegQWKappaScorer):

        self.objective = objective
        self.learning_rate = learning_rate
        self.min_child_weight = min_child_weight
        self.subsample = subsample
        self.colsample_bytree = colsample_bytree
        self.max_depth = max_depth
        self.n_estimators = n_estimators
        self.nthread = nthread
        self.seed = seed
        self.n_buckets = n_buckets
        self.initial_params = initial_params
        self.minimizer = minimizer
        self.scoring = scoring
        return

    def fit(self, X, y):
        from xgboost import XGBRegressor
        if not KAGGLE:
            from OptimizedOffsetRegressor import DigitizedOptimizedOffsetRegressor
        #from OptimizedOffsetRegressor import FullDigitizedOptimizedOffsetRegressor
        #self.off = FullDigitizedOptimizedOffsetRegressor(n_buckets=self.n_buckets,
        #               basinhopping=True,

        """
        2 / 5
        grid scores: mean: 0.65531, std: 0.00333, params: {'n_estimators': 700, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 6, 'min_child_weight': 240}
        best score: 0.65531
        3 / 5
        grid scores: mean: 0.65474, std: 0.00308, params: {'n_estimators': 700, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 6, 'min_child_weight': 240}
        best score: 0.65474
        4 / 5
        grid scores: mean: 0.65490, std: 0.00302, params: {'n_estimators': 700, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 6, 'min_child_weight': 240}
        best score: 0.65490

        2 / 10
        grid scores: mean: 0.65688, std: 0.00725, params: {'n_estimators': 700, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 6, 'min_child_weight': 240}
        best score: 0.65688
        3 / 10
        grid scores: mean: 0.65705, std: 0.00714, params: {'n_estimators': 700, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 6, 'min_child_weight': 240}
        best score: 0.65705
        4 / 10
        grid scores: mean: 0.65643, std: 0.00715, params: {'n_estimators': 700, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 6, 'min_child_weight': 240}
        best score: 0.65643
        5 / 10
        grid scores: mean: 0.65630, std: 0.00699, params: {'n_estimators': 700, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 6, 'min_child_weight': 240}
        best score: 0.65630
        """
        from sklearn.cross_validation import StratifiedKFold
        kf = StratifiedKFold(y, n_folds=2)
        print(kf)
        params = []
        for itrain, itest in kf:
            ytrain = y[itrain]
            Xtrain = X.iloc[list(itrain)]
            ytest = y[itest]
            Xtest = X.iloc[list(itest)]

            self.xgb = XGBRegressor(
                objective=self.objective,
                learning_rate=self.learning_rate,
                min_child_weight=self.min_child_weight,
                subsample=self.subsample,
                colsample_bytree=self.colsample_bytree,
                max_depth=self.max_depth,
                n_estimators=self.n_estimators,
                nthread=self.nthread,
                missing=0.0,
                seed=self.seed)
            self.xgb.fit(Xtrain, ytrain)
            te_y_hat = self.xgb.predict(Xtest,
                                        ntree_limit=self.xgb.booster().best_iteration)
            print('XGB Test score is:', -self.scoring(te_y_hat, ytest))

            self.off = DigitizedOptimizedOffsetRegressor(n_buckets=self.n_buckets,
                                                         initial_params=self.initial_params,
                                                         minimizer=self.minimizer,
                                                         scoring=self.scoring)
            self.off.fit(te_y_hat, ytest)
            print("Offsets:", self.off.params)
            params += [list(self.off.params)]
            pass

        from numpy import array
        self.off.params = array(params).mean(axis=0)
        print("Mean Offsets:", self.off.params)
        self.xgb.fit(X, y)

        return self

    def predict(self, X):
        from numpy import clip
        te_y_hat = self.xgb.predict(X, ntree_limit=self.xgb.booster().best_iteration)
        return clip(self.off.predict(te_y_hat), 1, 8)

    pass
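# A hedged usage sketch for the estimator above, assuming NegQWKappaScorer,
# KAGGLE, and the OptimizedOffsetRegressor module are available as in the
# original project, and that X_train is a DataFrame with an ordinal target
# y_train in 1..8; variable names here are hypothetical.
reg = PrudentialRegressorCVO(n_estimators=700, max_depth=6, min_child_weight=240)
reg.fit(X_train, y_train)      # fits XGB per fold, then averages the learned offsets
y_hat = reg.predict(X_valid)   # offset-adjusted predictions clipped to [1, 8]
print(y_hat[:10])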
    options=[1, 2, 3, 4, 5, 6, 7, 8],
    index=2)

# CREATE TRAIN & TEST SETS: training size = 80%, test size = 20%
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)  #, random_state=42)

# TEST TWO MODELS:
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred1 = lr.predict(X_test)
score = r2_score(y_test, y_pred1)

xgb = XGBRegressor(n_jobs=-1, random_state=42)
xgb.fit(X_train, y_train)
y_pred = xgb.predict(X_test)
#pred = [round(value) for value in y_pred]
score2 = r2_score(y_test, y_pred)
# print("R squared score is %.2f%%" % (r2_score(y_test, y_pred) * 100.0))

disp_col.subheader('The average price of the house:')
disp_col.write(abs(lr.predict([[n_area, n_beds, n_baths]])))
disp_col.subheader('R squared score of the MLR model:')
disp_col.write(score)
disp_col.subheader('R squared score of the XGBRegressor model:')
disp_col.write(score2)
#disp_col.subheader('Mean squared error of the model:')
#disp_col.write(mean_squared_error(y_test, predictions))
#disp_col.subheader('Mean absolute error of the model:')
                          'subsample': [0.6, 0.7, 0.8],
                          'colsample_bytree': [0.6, 0.7, 0.8]},
                         verbose=1, n_jobs=2)
    regressor = gscv.fit(np.array(train), train[goal])
    print(regressor.best_score_)
    print(regressor.best_params_)
else:
    regressor.fit(np.array(train[features]), train[goal])
print(' -> Training time:', time.time() - start)

# Evaluation and export result
if sample:
    if not gridsearch:
        # Test results
        if logexp:
            print("RMSPE: " + str(rmspe([np.exp(x) - 1 for x in regressor.predict(np.array(test[features]))], test[goal].values)))
        else:
            print("RMSPE: " + str(rmspe(regressor.predict(np.array(test[features])), test[goal].values)))
else:
    csvfile = 'result/' + regressor.__class__.__name__ + '-submit.csv'
    with open(csvfile, 'w') as output:
        predictions = []
        for i in test[myid].tolist():
            # stores that haven't opened will have 0 sales
            if test[test[myid] == i]['Open'].item() == 0:
                predictions += [[i, 0]]
            else:
                # import pdb; pdb.set_trace()
                if logexp:
                    predictions += [[i, np.exp(regressor.predict(np.array(test[test[myid] == i][features]))[0]) - 1]]
                else:
class PrudentialRegressorFO(BaseEstimator, RegressorMixin):
    def __init__(self,
                 objective='reg:linear',
                 learning_rate=0.045,
                 min_child_weight=50,
                 subsample=0.8,
                 colsample_bytree=0.7,
                 max_depth=7,
                 n_estimators=700,
                 nthread=-1,
                 seed=0,
                 n_buckets=8,
                 initial_params=[-1.5, -2.6, -3.6, -1.2, -0.8, 0.04, 0.7, 3.6,
                                 #1., 2., 3., 4., 5., 6., 7.
                                 ],
                 minimizer='BFGS',
                 scoring=NegQWKappaScorer):

        self.objective = objective
        self.learning_rate = learning_rate
        self.min_child_weight = min_child_weight
        self.subsample = subsample
        self.colsample_bytree = colsample_bytree
        self.max_depth = max_depth
        self.n_estimators = n_estimators
        self.nthread = nthread
        self.seed = seed
        self.n_buckets = n_buckets
        self.initial_params = initial_params
        self.minimizer = minimizer
        self.scoring = scoring
        return

    def fit(self, X, y):
        from xgboost import XGBRegressor
        if not KAGGLE:
            from OptimizedOffsetRegressor import DigitizedOptimizedOffsetRegressor

        self.xgb = XGBRegressor(
            objective=self.objective,
            learning_rate=self.learning_rate,
            min_child_weight=self.min_child_weight,
            subsample=self.subsample,
            colsample_bytree=self.colsample_bytree,
            max_depth=self.max_depth,
            n_estimators=self.n_estimators,
            nthread=self.nthread,
            missing=0.0,
            seed=self.seed)
        from OptimizedOffsetRegressor import FullDigitizedOptimizedOffsetRegressor
        self.off = FullDigitizedOptimizedOffsetRegressor(n_buckets=self.n_buckets,
                                                         # basinhopping=True,
                                                         initial_params=self.initial_params,
                                                         minimizer=self.minimizer,
                                                         scoring=self.scoring)

        self.xgb.fit(X, y)
        tr_y_hat = self.xgb.predict(X,
                                    ntree_limit=self.xgb.booster().best_iteration)
        print('Train score is:', -self.scoring(tr_y_hat, y))
        self.off.fit(tr_y_hat, y)
        print("Offsets:", self.off.params)

        return self

    def predict(self, X):
        from numpy import clip
        te_y_hat = self.xgb.predict(X, ntree_limit=self.xgb.booster().best_iteration)
        return clip(self.off.predict(te_y_hat), 1, 8)

    pass
# In[210]:

result = grid_search.fit(train, y)
# summarize results
print("Best: %f using %s" % (result.best_score_, result.best_params_))

# In[211]:

model = XGBRegressor(learning_rate=0.3, n_estimators=100)
for traincv, testcv in kfold:
    # fit on the training indices; the original used y.iloc[testcv], which mismatched X and y
    model.fit(train.iloc[traincv], y.iloc[traincv])

# In[212]:

y_pred = model.predict(test)

# In[213]:

output2 = pd.DataFrame(data={"outlet_no": outlet, "total_sales_Actual": y_pred})
output2.to_csv("model.csv", index=False, quoting=3)

# In[ ]:
# Build the model
#model = ExtraTreesRegressor()
#model = RandomForestRegressor()
#params = {'n_estimators': 500, 'max_depth': 4, 'min_samples_split': 2,
#          'learning_rate': 0.01, 'loss': 'ls'}
params = {'n_estimators': 400, 'max_depth': 7}
#model = GradientBoostingRegressor(**params)
model = XGBRegressor(**params)
#model = GaussianNB()
#model = Ridge()
#model = KNeighborsRegressor()
#model = DecisionTreeRegressor()
model.fit(train_dataset, train_target)

# Predict with the model
predictions = model.predict(test_dataset)

# In[51]:

### Cross Validation ###
#cv = StratifiedKFold(train_dataset, n_folds=5)
### scoring
scores = cross_validation.cross_val_score(model, train_dataset, train_target, cv=5)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

### getting the predictions ###
#predicted = cross_validation.cross_val_predict(clf, train_dataset, train_target, cv=10)
#print(metrics.accuracy_score(train_target, predicted))
from xgboost import XGBRegressor

# Get data
Y_train = train_df['price_doc'].values
# .ix was removed from pandas; .loc is the replacement
X_train = train_df.loc[:, train_df.columns != 'price_doc'].values
X_test = test_df.values

# Init model
xgb = XGBRegressor(learning_rate=0.05, max_depth=6, subsample=0.8, colsample_bytree=0.7)

# Train model
model = xgb.fit(X_train, Y_train)
# model.feature_importances_

# Make predictions
predictions = xgb.predict(X_test)

# Make submission file
submission_df = pd.DataFrame({'id': test_full['id'], 'price_doc': predictions})
submission_df.to_csv('xgb-added_features.csv', index=False)

################################### SVM ############################################
# from sklearn.decomposition import PCA
# pca = PCA(n_components=0.8, whiten=True)
# train_x = pca.fit_transform(X_train)
# test_x = pca.transform(X_test)
# svm_dr = svm.SVC(kernel='rbf', C=10)
# svm_dr.fit(train_x, ravel(Y_train))
# predictions = svm_dr.predict(test_x)
def Model(train_linear, test_linear):
    train_linear_fea = train_linear.drop(columns=['SalePrice'])
    train_linear_tar = train_linear.SalePrice
    x_train, x_test, y_train, y_test = train_test_split(train_linear_fea, train_linear_tar,
                                                        test_size=0.2, random_state=0)

    def evaluate(model, test_features, test_labels, train_features, train_labels):
        predictions = model.predict(test_features)
        errors = abs(predictions - test_labels)
        mape = 100 * np.mean(errors / test_labels)
        accuracy = 100 - mape
        print('Model Performance')
        print('Average Error: {:0.4f} degrees.'.format(np.mean(errors)))
        print('Accuracy = {:0.2f}%.'.format(accuracy))
        print("MSE for train data is: %f" % mean_squared_error(y_train, model.predict(x_train)))
        print("MSE for validation data is: %f" % mean_squared_error(y_test, model.predict(x_test)))
        return accuracy

    real_train_tar = np.expm1(train_linear_tar)

    """
    . Lasso model
    """
    lassocv = LassoCV(alphas=np.logspace(-5, 4, 400))
    lassocv.fit(train_linear_fea, train_linear_tar)
    lassocv_score = lassocv.score(train_linear_fea, train_linear_tar)
    lassocv_alpha = lassocv.alpha_
    print("Best alpha : ", lassocv_alpha, "Score: ", lassocv_score)
    start = time.time()
    lasso = Lasso(normalize=True)
    lasso.set_params(alpha=lassocv_alpha, max_iter=10000)
    lasso.fit(x_train, y_train)
    end = time.time()
    mean_squared_error(y_test, lasso.predict(x_test))
    coef_lasso = pd.Series(lassocv.coef_, index=x_train.columns).sort_values(ascending=False)
    evaluate(lasso, x_test, y_test, x_train, y_train)
    print('Time elapsed: %.4f seconds' % (end - start))

    y_lasso_predict = lasso.predict(train_linear_fea)
    x_line = np.arange(700000)
    y_line = x_line
    plt.scatter(real_train_tar, np.expm1(y_lasso_predict))
    plt.plot(x_line, y_line, color='r')
    plt.xlabel('Actual Sale Price')
    plt.ylabel('Predicted Sale Price')

    test_prediction_lasso = np.expm1(lasso.predict(test_linear))

    """
    . Ridge model
    """
    ridgecv = RidgeCV(alphas=np.logspace(-5, 4, 400))
    ridgecv.fit(x_train, y_train)
    ridgecv_score = ridgecv.score(x_train, y_train)
    ridgecv_alpha = ridgecv.alpha_
    print("Best alpha : ", ridgecv_alpha, "Score: ", ridgecv_score)
    coef = pd.Series(ridgecv.coef_, index=x_train.columns).sort_values(ascending=False)
    start = time.time()
    ridge = Ridge(normalize=True)
    ridge.set_params(alpha=ridgecv_alpha, max_iter=10000)
    ridge.fit(x_train, y_train)
    end = time.time()
    mean_squared_error(y_test, ridge.predict(x_test))
    coef_ridge = pd.Series(ridgecv.coef_, index=x_train.columns).sort_values(ascending=False)
    evaluate(ridge, x_test, y_test, x_train, y_train)
    print('Time elapsed: %.4f seconds' % (end - start))

    y_ridge_predict = ridge.predict(train_linear_fea)
    x_line = np.arange(700000)
    y_line = x_line
    plt.scatter(real_train_tar, np.expm1(y_ridge_predict))
    plt.plot(x_line, y_line, color='r')
    plt.xlabel('Actual Sale Price')
    plt.ylabel('Predicted Sale Price')

    test_prediction_ridge = np.expm1(ridge.predict(test_linear))

    """
    . Random Forest
    """
    #train = train.drop(columns=['DateSold'])
    #test = test.drop(columns=['DateSold'])
    #X_train = train.drop(columns=['SalePrice'])
    #Y_train = train['SalePrice']
    X_train = train_linear_fea
    Y_train = train_linear_tar
    x_train_rf, x_test_rf, y_train_rf, y_test_rf = train_test_split(X_train, Y_train,
                                                                    test_size=0.2, random_state=0)
    n_estimators = [int(x) for x in np.linspace(start=100, stop=2000, num=20)]
    max_features = ['auto', 'sqrt']
    max_depth = [int(x) for x in np.linspace(10, 110, num=11)]
    min_samples_split = [2, 5, 10]
    min_samples_leaf = [1, 2, 4]
    bootstrap = [True, False]
    random_grid = {'n_estimators': n_estimators,
                   'max_features': max_features,
                   'max_depth': max_depth,
                   'min_samples_split': min_samples_split,
                   'min_samples_leaf': min_samples_leaf,
                   'bootstrap': bootstrap}
    rf = RandomForestRegressor()
    # Random search of parameters, using 3-fold cross validation;
    # search across 100 different combinations and use all available cores
    rf_random = RandomizedSearchCV(estimator=rf, param_distributions=random_grid, n_iter=100,
                                   cv=3, verbose=2, random_state=42, n_jobs=-1)
    rf_random.fit(X_train, Y_train)
    #rf_random.fit(x_train_rf, y_train_rf)
    rf_random.best_params_

    # Random search narrowed down the range for each hyperparameter. Now that we know
    # where to concentrate the search, we can explicitly try every combination of settings.
    param_grid = {
        'bootstrap': [False],
        'max_depth': [80, 90, 100, 110, 120, 130],
        'max_features': [2, 3],
        'min_samples_leaf': [1, 2, 3, 4],
        'min_samples_split': [2, 4, 6, 8, 10, 12],
        'n_estimators': [600, 700, 800, 900, 1000]
    }
    # Create a base model
    rf = RandomForestRegressor()
    # Instantiate the grid search model
    grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)
    #grid_search.fit(x_train, y_train)
    grid_search.fit(X_train, Y_train)
    grid_search.best_params_
    best_random = grid_search.best_estimator_
    start = time.time()
    best_random.fit(x_train_rf, y_train_rf)
    end = time.time()
    evaluate(best_random, x_test_rf, y_test_rf, x_train_rf, y_train_rf)
    print('Time elapsed: %.4f seconds' % (end - start))

    y_rf_predict = best_random.predict(train_linear_fea)
    x_line = np.arange(700000)
    y_line = x_line
    plt.scatter(real_train_tar, np.expm1(y_rf_predict))
    plt.plot(x_line, y_line, color='r')
    plt.xlabel('Actual Sale Price')
    plt.ylabel('Predicted Sale Price')

    importance_rf = pd.DataFrame({'features': train_linear_fea.columns,
                                  'imp': best_random.feature_importances_}).\
        sort_values('imp', ascending=False)
    importance_top20_rf = importance_rf.iloc[:20, ]
    plt.barh(importance_top20_rf.features, importance_top20_rf.imp)
    plt.xlabel('Feature Importance')

    test_prediction_rf = np.expm1(best_random.predict(test_linear))

    """
    . Xgboost
    """
    learning_rate = [round(float(x), 2) for x in np.linspace(start=.1, stop=.2, num=11)]
    # Minimum sum of instance weights needed in a child node
    min_child_weight = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
    # Maximum depth of each tree
    max_depth = [int(x) for x in np.linspace(1, 10, num=10)]
    n_estimators = [int(x) for x in np.linspace(start=100, stop=2000, num=20)]
    subsample = [0.3, 0.4, 0.5, 0.6, 0.7]
    model = xgb.XGBRegressor()
    random_grid = {'learning_rate': learning_rate,
                   'max_depth': max_depth,
                   'min_child_weight': min_child_weight,
                   'subsample': subsample,
                   'n_estimators': n_estimators}
    # Make a RandomizedSearchCV object with the model and the specified hyperparameters
    xgb_random = RandomizedSearchCV(estimator=model, param_distributions=random_grid,
                                    n_iter=1000, cv=5, verbose=2, random_state=42, n_jobs=-1)
    start = time.time()
    # Fit models
    xgb_random.fit(X_train, Y_train)
    xgb_random.best_params_
    """
    best_params_ = {'learning_rate': 0.1,
                    'max_depth': 2,
                    'min_child_weight': 4,
                    'n_estimators': 900,
                    'subsample': 0.5}
    """
    model_xgb = XGBRegressor(**xgb_random.best_params_)
    #model_xgb = XGBRegressor(**best_params_)
    start = time.time()
    model_xgb.fit(x_train_rf, y_train_rf)
    end = time.time()
    evaluate(model_xgb, x_test_rf, y_test_rf, x_train_rf, y_train_rf)
    print('Time elapsed: %.4f seconds' % (end - start))

    y_xgb_predict = model_xgb.predict(train_linear_fea)
    x_line = np.arange(700000)
    y_line = x_line
    plt.scatter(real_train_tar, np.expm1(y_xgb_predict))
    plt.plot(x_line, y_line, color='r')
    plt.xlabel('Actual Sale Price')
    plt.ylabel('Predicted Sale Price')

    importance_xgb = pd.DataFrame({'features': train_linear_fea.columns,
                                   'imp': model_xgb.feature_importances_}).\
        sort_values('imp', ascending=False)
    importance_top20_xgb = importance_xgb.iloc[:20, ]
    plt.barh(importance_top20_xgb.features, importance_top20_xgb.imp)
    plt.xlabel('Feature Importance')

    test_prediction_xgb = np.expm1(model_xgb.predict(test_linear))

    return (test_prediction_lasso, test_prediction_ridge, test_prediction_rf, test_prediction_xgb,
            y_lasso_predict, y_ridge_predict, y_rf_predict, y_xgb_predict)
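# A hedged sketch of how Model and the earlier Stacking function appear intended
# to fit together. Stacking reads the four y_*_predict arrays as globals, so this
# wiring is an assumption about the original notebook flow, not confirmed code.
(test_pred_lasso, test_pred_ridge, test_pred_rf, test_pred_xgb,
 y_lasso_predict, y_ridge_predict, y_rf_predict, y_xgb_predict) = Model(train_linear, test_linear)

real_train_tar = np.expm1(train_linear.SalePrice)
Stacking(real_train_tar)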
def XgBoost(train_linear, test_linear):
    learning_rate = [round(float(x), 2) for x in np.linspace(start=.1, stop=.2, num=11)]
    # Minimum sum of instance weights needed in a child node
    min_child_weight = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
    # Maximum depth of each tree
    max_depth = [int(x) for x in np.linspace(1, 10, num=10)]
    n_estimators = [int(x) for x in np.linspace(start=100, stop=2000, num=20)]
    subsample = [0.3, 0.4, 0.5, 0.6, 0.7]
    model = xgb.XGBRegressor()
    random_grid = {'learning_rate': learning_rate,
                   'max_depth': max_depth,
                   'min_child_weight': min_child_weight,
                   'subsample': subsample,
                   'n_estimators': n_estimators}
    # Make a RandomizedSearchCV object with the model and the specified hyperparameters
    xgb_random = RandomizedSearchCV(estimator=model, param_distributions=random_grid,
                                    n_iter=1000, cv=5, verbose=2, random_state=42, n_jobs=-1)
    start = time.time()
    # Fit models
    xgb_random.fit(X_train, Y_train)
    xgb_random.best_params_

    from xgboost import XGBRegressor
    """
    best_params_ = {'learning_rate': 0.1,
                    'max_depth': 2,
                    'min_child_weight': 4,
                    'n_estimators': 900,
                    'subsample': 0.5}
    """
    model_xgb = XGBRegressor(**xgb_random.best_params_)
    #model_xgb = XGBRegressor(**best_params_)
    start = time.time()
    model_xgb.fit(x_train_rf, y_train_rf)
    end = time.time()
    evaluate(model_xgb, x_test_rf, y_test_rf, x_train_rf, y_train_rf)
    print('Time elapsed: %.4f seconds' % (end - start))

    y_xgb_predict = model_xgb.predict(train_linear_fea)
    x_line = np.arange(700000)
    y_line = x_line
    plt.scatter(real_train_tar, np.expm1(y_xgb_predict))
    plt.plot(x_line, y_line, color='r')
    plt.xlabel('Actual Sale Price')
    plt.ylabel('Predicted Sale Price')

    importance_xgb = pd.DataFrame({'features': train_linear_fea.columns,
                                   'imp': model_xgb.feature_importances_}).\
        sort_values('imp', ascending=False)
    importance_xgb = importance_xgb[importance_xgb['features'] != 'Id']
    importance_top20_xgb = importance_xgb.iloc[:20, ]
    plt.barh(importance_top20_xgb.features, importance_top20_xgb.imp)
    plt.xlabel('Feature Importance')

    test_prediction_xgb = np.expm1(model_xgb.predict(test_linear))
    write_pkl(xgb_random.best_params_, '/Users/vickywinter/Documents/NYC/Machine Learning Proj/Pickle/xgb_params.pkl')
    return test_prediction_xgb
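# write_pkl is called here and in Stacking but never defined in this excerpt.
# A minimal sketch of what it presumably does; the name and signature come from
# the calls above, the body is an assumption.
import pickle

def write_pkl(obj, path):
    # serialize obj to path with pickle; used above to persist best_params_ dicts
    with open(path, 'wb') as f:
        pickle.dump(obj, f)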