def XGB(self, x_train, y_train, x_test, y_test):
    x_train, y_train = shuffle(x_train, y_train)
    xgb = XGBRegressor(max_depth=4, subsample=0.9)
    xgb.fit(x_train, y_train)
    y_pred = xgb.predict(x_test).reshape(x_test.shape[0], 1)
    loss = mean_squared_error(y_pred, y_test)
    print(loss)
    return y_pred, loss
def Stacking(real_train_tar):
    predictions_train = pd.DataFrame([np.expm1(y_lasso_predict), np.expm1(y_ridge_predict), np.expm1(y_rf_predict), np.expm1(y_xgb_predict)]).T
    sns.pairplot(predictions_train)

    learning_rate = [round(float(x), 2) for x in np.linspace(start=.1, stop=.2, num=11)]
    # Minimum for sum of weights for observations in a node
    min_child_weight = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
    # Maximum nodes in each tree
    max_depth = [int(x) for x in np.linspace(1, 10, num=10)]
    n_estimators = [int(x) for x in np.linspace(start=100, stop=2000, num=20)]
    subsample = [0.3, 0.4, 0.5, 0.6, 0.7]
    stack_model = xgb.XGBRegressor()
    random_grid = {'learning_rate': learning_rate,
                   'max_depth': max_depth,
                   'min_child_weight': min_child_weight,
                   'subsample': subsample,
                   'n_estimators': n_estimators}

    # Make a RandomizedSearchCV object with correct model and specified hyperparams
    xgb_stack = RandomizedSearchCV(estimator=stack_model, param_distributions=random_grid, n_iter=1000, cv=5, verbose=2, random_state=42, n_jobs=-1)
    start = time.time()
    # Fit models
    xgb_stack.fit(predictions_train, real_train_tar)
    xgb_stack.best_params_
    write_pkl(xgb_stack.best_params_, '/Users/vickywinter/Documents/NYC/Machine Learning Proj/Pickle/stack_params.pkl')

    model_stacking = XGBRegressor(**xgb_stack.best_params_)
    #model_xgb = XGBRegressor(**best_params_)
    start = time.time()
    model_stacking.fit(predictions_train, real_train_tar)
    end = time.time()
    print("MSE for train data is: %f" % mean_squared_error(np.log1p(real_train_tar), np.log1p(model_stacking.predict(predictions_train))))
    print('Time elapsed: %.4f seconds' % (end - start))

    y_stack_predict = model_stacking.predict(predictions_train)
    x_line = np.arange(700000)
    y_line = x_line
    plt.scatter(real_train_tar, y_stack_predict)
    plt.plot(x_line, y_line, color='r')
    plt.xlabel('Actual Sale Price')
    plt.ylabel('Predicted Sale Price')
# Main advantages are as follows: # 1. Easy to use # 2. Computational efficiency # 3. Model Accuracy # 4. Feasibility — easy to tune parameters and modify objectives. # In[86]: model=XGBRegressor(max_depth=5) # In[87]: model.fit(X_train,y_train) # In[88]: y_pred=model.predict(X_test) # In[89]: print('R2 score using XG Boost= ',r2_score(y_test, y_pred), '/ 1.0') print('MSE score using XG Boost= ',mean_squared_error(y_test, y_pred), '/ 0.0')
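# The code above illustrates points 1-3. As a hedged sketch of point 4 (ease of tuning),
# the same regressor can be wrapped in scikit-learn's GridSearchCV; X_train/y_train are
# assumed to be the arrays used above, and the grid values are illustrative only,
# not taken from the original notebook.
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor

param_grid = {'max_depth': [3, 5, 7], 'learning_rate': [0.05, 0.1, 0.3]}
search = GridSearchCV(XGBRegressor(), param_grid, scoring='r2', cv=3)
search.fit(X_train, y_train)
print('Best params:', search.best_params_, '/ CV R2:', search.best_score_)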
elastic_model_full_data = elasticnet.fit(X, y)
print(datetime.now(), 'Fitting the L1-norm lasso shrinkage model')
lasso_model_full_data = lasso.fit(X, y)
print(datetime.now(), 'Fitting the L2-norm ridge regression model')
ridge_model_full_data = ridge.fit(X, y)
print(datetime.now(), 'Fitting the SVR support vector machine model')
svr_model_full_data = svr.fit(X, y)
print(datetime.now(), 'Fitting the GradientBoosting model')
gbr_model_full_data = gbr.fit(X, y)
print(datetime.now(), 'Fitting the xgboost second-order gradient boosting model')
xgb_model_full_data = xgboost.fit(X, y)


def blend_models_predict(X):
    return ((0.1 * elastic_model_full_data.predict(X)) +
            (0.05 * lasso_model_full_data.predict(X)) +
            (0.1 * ridge_model_full_data.predict(X)) +
            (0.1 * svr_model_full_data.predict(X)) +
            (0.1 * gbr_model_full_data.predict(X)) +
            (0.15 * xgb_model_full_data.predict(X)) +
            (0.3 * stack_gen_model.predict(np.array(X))))


print('RMSLE score of the blended model on the train data:')
print(rmsle(y, blend_models_predict(X)))
X_test = lda.transform(X_test) submit_X = pca.transform(submit_X) from sklearn.linear_model import LinearRegression from sklearn.metrics import mean_absolute_error, mean_squared_error linear_reg = LinearRegression() linear_reg.fit(X_train, y_train) y_pred = linear_reg.predict(X_test) print(mean_absolute_error(y_test, y_pred)) print(mean_squared_error(y_test, y_pred)) # Xgboost (best accuracy) from xgboost import XGBRegressor cls_ = XGBRegressor() cls_.fit(X_train, y_train) y_pred = cls_.predict(X_test) print(mean_absolute_error(y_test, y_pred)) print(mean_squared_error(y_test, y_pred)) #support vector machines from sklearn.model_selection import GridSearchCV from sklearn.svm import SVR '''parameters = [{'C':[1,10,100,1000,10000,100000], 'kernel':['linear']}, {'C':[1,10,100,1000,10000,100000], 'kernel':['poly'],'degree':[1,2,3]} ]''' parameters = [{ 'C': [1, 10, 100, 1000, 10000, 100000], 'kernel': ['rbf', 'linear'] }] grid_search = GridSearchCV(
mea = getmea(max_leaf_nodes, train_x, val_x, train_y, val_y)
print("Max_leaf_nodes: %d ,mea: %d" % (max_leaf_nodes, mea))
'''
# clf = XGBRegressor()            17165
# XGBRegressor(n_estimators=400)  16330
'''
params = [.02, .03, .04, .05, .06, .07, .08, .09, .10]  # [1:1001:50][100,200,300,400,500]
test_scores = []
for param in params:
    clf = XGBRegressor(n_estimators=400, learning_rate=param)
    test_score = np.sqrt(-cross_val_score(clf, train_X, train_y, cv=10, scoring='neg_mean_squared_error'))
    test_scores.append(np.mean(test_score))
plt.plot(params, test_scores)
plt.title("learning_rate vs CV Error" + str(params))
# this call is required for the finished plot to actually show on screen
plt.show()
'''
my_model = XGBRegressor(n_estimators=400)
my_model.fit(train_X, train_y, verbose=False)
predictions = my_model.predict(test_X)
print("Mean Absolute Error : " + str(mean_absolute_error(predictions, test_y)))
# save model
#joblib.dump(melbourne_model, 'model.pickle')
# load model
#model = joblib.load('model.pickle')
print(x.shape)   # (506, 13)
print(y.shape)   # (506,)

x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.8, shuffle=True, random_state=66)

print(x_test.shape)    # (102, 13)
print(y_test.shape)    # (102,)
print(type(x_test))    # <class 'numpy.ndarray'>
print(type(y_test))    # <class 'numpy.ndarray'>

model = XGBRegressor()
model.fit(x_train, y_train)

score = model.score(x_test, y_test)
print("R2 :", score)   # R2 : 0.925782578365577

thresholds = np.sort(model.feature_importances_)   # sorted from lowest importance to highest
print(thresholds)

for thresh in thresholds:
    selection = SelectFromModel(model, threshold=thresh, prefit=True)   # median
    select_x_train = selection.transform(x_train)
    print(select_x_train.shape)

parameters = [{
    "n_estimators": [1000, 2000, 3000],
test_scores = []
for param in params:
    clf = XGBRegressor(n_estimators=param)
    test_score = np.sqrt(-cross_val_score(clf, train_x, train_y, cv=10, scoring='neg_mean_squared_error'))
    test_scores.append(np.mean(test_score))
print(test_scores)
plt.plot(params, test_scores)
plt.title("n_estimators vs CV Error")
# this call is required for the finished plot to actually show on screen
plt.show()
# save the current figure to a file such as result.png
#plt.savefig('./xgboostparams.png')

# 91 16889
xgb = XGBRegressor(max_depth=6, n_estimators=400)
xgb.fit(X, y)
print(mean_absolute_error(val_y, xgb.predict(val_x)))
print(mean_squared_error(val_y, xgb.predict(val_x)))

#gbdt
'''
print("GradientBoostingRegressor")
gbdt = GradientBoostingRegressor(n_estimators=1000, max_leaf_nodes=400)
gbdt.fit(X, y)  # 17083
#RandomForestRegressor      93 16938
#GradientBoostingRegressor  90 16866
#XGBRegressor              100 19939
print(mean_absolute_error(val_y, gbdt.predict(val_x)))
print(mean_squared_error(val_y, gbdt.predict(val_x)))

# predict and save output
x1 = joblib.load(data[1]) X = pd.concat((x0, x1), axis=1) del x0, x1 X_train = X[:num_train] X_test = X[num_train:] del X # X_train, X_val, y_train, y_val = train_test_split(X, # y, # test_size=0.1, # random_state=42) xgbr.fit( X_train, y_train, # eval_metric='rmse', # early_stopping_rounds=30, verbose=True, # eval_set=[(X_val, y_val)], ) y_pred = xgbr.predict(X_test) res.append(y_pred) final_res = np.mean(res, axis=0) final_res[final_res < 1] = 1 final_res[final_res > 3] = 3 pd.DataFrame({"id": id_test, "relevance": final_res}).to_csv("xgbr_sub_20160423_bag.csv", index=False)
# 1. dataset load
boston = load_boston()
x_names = boston.feature_names   # x variable names
# array(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT'], dtype='<U7')
X, y = load_boston(return_X_y=True)
X.shape   # (506, 13)
y         # ratio scale, not normalized

# 2. train_test_split
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# 3. create model
xgb = XGBRegressor()
model = xgb.fit(x_train, y_train)
model   # objective='reg:squarederror'

# 4. visualize important variables
fscore = model.get_booster().get_fscore()
fscore
'''
{'f5': 378, 'f12': 254, 'f0': 642, 'f4': 135, 'f7': 238, 'f11': 289, 'f8': 27, 'f1': 60, 'f3': 15,
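# The fscore dict in the snippet above is keyed by positional names ('f0', 'f1', ...).
# A minimal sketch (not from the original source) that maps those keys back to the
# Boston column names in x_names, assuming the fitted model above:
named_fscore = {x_names[int(k[1:])]: v for k, v in fscore.items()}
print(sorted(named_fscore.items(), key=lambda kv: kv[1], reverse=True))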
from xgboost import XGBRegressor from sklearn.datasets import load_boston from sklearn.metrics import r2_score boston = load_boston() x = boston.data y = boston.target x_train, x_test, y_train, y_test = tts(x, y, train_size=0.8, random_state=66) xgb = XGBRegressor(n_estimators=10, learning_rate=0.1) xgb.fit(x_train, y_train, verbose=True, eval_metric=["rmse", "logloss"], eval_set=[(x_train, y_train), (x_test, y_test)], early_stopping_rounds=20) #rmse,mae,logloss,error,auc y_pre = xgb.predict(x_test) r2 = r2_score(y_test, y_pre) score = xgb.score(x_test, y_test) result = xgb.evals_result() print(__file__) print(result) print("r2") print(r2) print("score") print(score)
print('all featrues data loaded!') # training model data_info = train_rdd.map(lambda x: combine(x)).collectAsMap() data = np.array(list(data_info.values())) x, y = data[:, 1:], data[:, 0].reshape(-1, 1) print('training dataset ready! ') model = XGBRegressor( n_estimators=200, learning_rate=0.025, reg_alpha=0.1, max_depth=5, ) model.fit(x, y) print('model fitted') mb_score_info = test_rdd.map(lambda x: recommend( x, rater_info, user_rating_info, user_avg_info, N=5, item_threshold=3 )).map(lambda x: ((x[0], x[1]), x[2])).collectAsMap() # making prediction pred_info = test_rdd.map(lambda x: combine_pred(x)) # res = test_rdd.map(lambda x: combine_pred(x)) \ # .map(lambda x: ((x[0][0], x[0][1]), # (model.predict(np.array(x[1]).reshape(1, -1))[0], # mb_score_info[(x[0][0], x[0][1])], # business_feats[x[0][1]][1] # ))) \ # .map(lambda x: (x[0][0], x[0][1], x[1][0] * (1 - 1 / x[1][2]) + x[1][1] * (1 / x[1][2]))).collect()
days = list(range(2, zones.loc[zones['Day'].idxmax()]['Day']))
test_days = [days[i] for i in random.sample(range(len(days)), int(len(days) * .3))]
train_days = list(set(days) - set(test_days))

query_1 = create_query(train_days)
query_2 = create_query(test_days)
train, test = zones.query(query_1), zones.query(query_2)
X_train, y_train = train.drop(drop + target, axis=1), train[target]
X_test, y_test = test.drop(drop + target, axis=1), test[target]

# Note: 'min_samples_split' and 'loss' are GradientBoostingRegressor parameters;
# XGBRegressor does not use them, so only the remaining keys take effect here.
params = {'n_estimators': 500, 'max_depth': 4, 'min_samples_split': 2,
          'learning_rate': .01, 'loss': 'ls'}
clf = XGBRegressor(**params)

clf.fit(X_train, y_train)
predictions = clf.predict(X_test)

mse = mean_squared_error(y_test, predictions)
print('MSE: {}'.format(mse))

steps = 10
predictions, y_test = predictions[0::steps], y_test.values[0::steps]
size = len(predictions)
for idx, (vals, name, color) in enumerate([(predictions, 'Prediction', '#d896ff'),
                                           (y_test, 'Actual', '#ffaaa5')]):
    ax = plt.subplot(2, 1, idx + 1)
    ax.set_title(name)
    plt.plot(range(size), vals, lw=.75, color=color)
    plt.yticks(np.arange(0, 90, 15))
plt.tight_layout()
plt.savefig('./zone/xgboost.png')
'learning_rate': 0.12, 'gamma': 0.0, 'max_depth': 12, 'min_child_weight': 1, 'max_delta_weight': 20, 'rate_drop': 0.0} ] xgboost_params = xgboost_params_list[pred_index] # In[115]: xgb_model = XGBRegressor(**xgboost_params) xgb_model.fit(x_train, y_train) y_pred = xgb_model.predict(x_test) predictions = [round(value) for value in y_pred] mse = metric.mean_squared_error(y_test, predictions) rmse = math.sqrt(mse) print(rmse) # # LGBM # In[69]: import lightgbm import lightgbm as lgb
print('RMSE GradientBoostingRegressor: ', RMSLE(np.log1p(train['visitors'].values), model1.predict(train[col]))) print('RMSE KNeighborsRegressor: ', RMSLE(np.log1p(train['visitors'].values), model2.predict(train[col]))) #test['visitors'] = (model1.predict(test[col]) + model2.predict(test[col])) / 2 test['visitors'] = model2.predict(test[col]) test['visitors'] = np.expm1(test['visitors']).clip(lower=0.) sub1 = test[['id','visitors']].copy() #del train; del data; sub1[['id', 'visitors']].to_csv(os.path.join(path_kaggle, 'naive_forecast2.csv'), index = False) from xgboost import XGBRegressor model3 = XGBRegressor() model3.fit(train[col], np.log1p(train['visitors'].values), verbose=False) print('XGBRegressor: ', RMSLE(np.log1p(train['visitors'].values), model3.predict(train[col]))) ## from hklee ## https://www.kaggle.com/zeemeen/weighted-mean-comparisons-lb-0-497-1st/code #dfs = { re.search('/([^/\.]*)\.csv', fn).group(1): # pd.read_csv(fn)for fn in glob.glob('../input/*.csv')} # #for k, v in dfs.items(): locals()[k] = v # #wkend_holidays = date_info.apply( # (lambda x:(x.day_of_week=='Sunday' or x.day_of_week=='Saturday') and x.holiday_flg==1), axis=1) #date_info.loc[wkend_holidays, 'holiday_flg'] = 0 #date_info['weight'] = ((date_info.index + 1) / len(date_info)) ** 5 #
else: return 0 # and the transformation is applied on the test data for later use. # The train data will be transformed while it is being fit. y_test_binary = pd.DataFrame(y_test["value"].apply(getBinary)) regressorLow = XGBRegressor(gamma=0.0, n_estimators=200, base_score=0.5, colsample_bytree=0.7, learning_rate=0.2, max_depth=5, objective="reg:linear") xgbModelLow = regressorLow.fit(X_train, y_train.value) xgboost.plot_importance(xgbModelLow) y_predicted = xgbModelLow.predict(X_test) y_predicted_binary = [1 if yp >= 0.5 else 0 for yp in y_predicted] print(accuracy_score(y_test_binary, y_predicted_binary)) fig = plt.figure(figsize=(8, 8)) plt.xticks(rotation='vertical') y_pos = np.arange(len(xgbModelLow.feature_importances_)) plt.barh([i for i in range(len(xgbModelLow.feature_importances_))], xgbModelLow.feature_importances_.tolist(), align='center', alpha=0.4) plt.yticks(y_pos, X_test.columns)
dataset = pd.concat([dataset, dataset_credits], axis=1) dataset = encode_json_column(dataset, 22, "name", 500, 1) y = dataset.iloc[:, 18].values #12 for revenue, 18 for rating X = dataset.iloc[:, 23:].values X_names = dataset.columns[23:].values from xgboost import XGBRegressor regressor = XGBRegressor(colsample_bytree=0.6, gamma=0.7, max_depth=4, min_child_weight=5, subsample=0.8, objective='reg:squarederror') regressor.fit(X, y) importances = {} count = 0 for feature_importance in regressor.feature_importances_: if feature_importance > 0.002: feature_name = X_names[count] importances[feature_name] = feature_importance count += 1 import operator sorted_importances = sorted(importances.items(), key=operator.itemgetter(1), reverse=True)
print('Feature:%0d -> %s, Score: %.5f' % (i, feature_names[i], v)) # plot feature importance pyplot.bar([x for x in range(len(rf_importance))], rf_importance) pyplot.show() print('Accuracy of Random Forest regressor on training set: {:.2f}'.format( rf_reg.score(X_train, y_train))) print('Accuracy of Random Forest regressor on test set: {:.2f}'.format( rf_reg.score(X_test, y_test))) """**XGBoost Regression Feature Importance**""" from xgboost import XGBRegressor xgb_reg = XGBRegressor() xgb_reg.fit(X_train, y_train) xgb_importance = xgb_reg.feature_importances_ for i, v in enumerate(xgb_importance): print('Feature:%0d -> %s, Score: %.5f' % (i, feature_names[i], v)) # plot feature importance pyplot.bar([x for x in range(len(xgb_importance))], xgb_importance) pyplot.show() print('Accuracy of Xgboost regressor on training set: {:.2f}'.format( xgb_reg.score(X_train, y_train))) print('Accuracy of Xgboost regressor on test set: {:.2f}'.format( xgb_reg.score(X_test, y_test))) """**Permutation Feature Importance for Regression**"""
seed=3 test_size=.3 X_train, X_test, y_train, y_test = train_test_split(X, log_loss, test_size=test_size, random_state=seed) model=XGBRegressor(learning_rate=0.08, max_depth=10, objective='reg:linear', nthread=3, gamma=0.2, subsample=0.9, n_estimators=100, ) model.fit(X_train, y_train) print(model) y_pred=model.predict(X_test) def mae(predicted, actual, logscale=False): if logscale == True: predexp=np.exp(predicted) actualexp=np.exp(actual) return np.mean(np.abs(predexp - actualexp)) else: return np.mean(np.abs(predicted - actual)) print(mae(y_pred, y_test, True)) # #Plotting Variable Importance
import pandas as pd from xgboost import XGBRegressor train_path = 'src/RL/outputs/train_data.csv' train_data = pd.read_csv(train_path) model = XGBRegressor() train_x = train_data.iloc[:, 0:-1] train_y = train_data[['rewards']] model.fit(train_x, train_y) print(model)
y = dataset['target']
print(y.shape)

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, shuffle=True, random_state=66)

# 2. model
model = XGBRegressor(n_estimators=1000, learning_rate=0.01, n_jobs=8)   # know how to tune like this

# 3. train
model.fit(x_train, y_train, verbose=1, eval_metric=['rmse'],
          eval_set=[(x_train, y_train), (x_test, y_test)],
          early_stopping_rounds=20)   # eval_metric='rmse' for regression

# evaluate
aaa = model.score(x_test, y_test)
print('score: ', aaa)

y_pred = model.predict(x_test)
r2 = r2_score(y_test, y_pred)   # when scoring, pass the original data (y_test) first
print('r2 : ', r2)

## eval set =============================
print('====================================')
result = model.evals_result()
# print('result: ', result)
import warnings warnings.filterwarnings('ignore') import pandas as pd from xgboost import XGBRegressor train = pd.read_csv('train.csv') test = pd.read_csv('test.csv') X = train.iloc[:, 1:-1].values y = train.iloc[:, -1].values model = XGBRegressor(learning_rate=0.03, max_depth=4, n_estimators=100) model.fit(X, y) test_feature = test.iloc[:, 1:].values prediction = model.predict(test_feature) result = pd.DataFrame({'ID': test.ID, 'medv': prediction}) result.to_csv("result.csv", index=False)
# using one hot for types of Object
X = pd.get_dummies(X)

# align the one-hot testing data according to the training data
# test_data = pd.read_csv('test.csv')
# test_X = test_data.drop(['Id'], axis=1)
#
# test_X = pd.get_dummies(test_X)
# test_X = X.align(test_X,join='left',axis=1)

# split the data into training and validation set
train_X, validation_X, train_y, validation_y = train_test_split(X.values, y.values, test_size=0.25)

my_imputer = Imputer()
train_X = my_imputer.fit_transform(train_X)
validation_X = my_imputer.transform(validation_X)

my_model = XGBRegressor(n_estimators=1000, learning_rate=0.01, n_jobs=4)
# Add silent=True to avoid printing out updates with each cycle.
# early_stopping_rounds needs an eval_set to have any effect, so it is passed
# to fit() together with the validation data rather than to the constructor.
my_model.fit(train_X, train_y,
             early_stopping_rounds=5,
             eval_set=[(validation_X, validation_y)],
             verbose=False)

# make predictions
predictions = my_model.predict(validation_X)
rms = np.sqrt(mean_squared_error(validation_y, predictions))
print(rms)
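# The commented-out alignment above glosses over the fact that DataFrame.align
# returns a pair of frames. A minimal sketch of that step, assuming a test.csv
# with the same raw columns as the training data (path and columns are
# assumptions, not from the original run):
test_data = pd.read_csv('test.csv')
test_X = pd.get_dummies(test_data.drop(['Id'], axis=1))
X, test_X = X.align(test_X, join='left', axis=1, fill_value=0)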
#results = {}
#for i in [50, 100, 150, 200, 250, 300, 350, 400]:
#    results[i] = get_score(i)

# the best n_estimator is 200
#print("Mean Absolute Error for RandomForestRegressor with Cross Validation: " + str(get_score()))

import matplotlib.pyplot as plt
#plt.plot(results.keys(), results.values())
#plt.show()

from xgboost import XGBRegressor

xgb_model = XGBRegressor(n_estimators=1000, learning_rate=0.05)
xgb_model.fit(X_train, y_train,
              early_stopping_rounds=5,
              eval_set=[(X_valid, y_valid)],
              verbose=False)
xgb_preds = xgb_model.predict(X_valid)
print("Mean Absolute Error for XGBoosting: " + str(mean_absolute_error(xgb_preds, y_valid)))

X.loc[:, 'Elevation':'Horizontal_Distance_To_Fire_Points'].hist(bins=50, figsize=(20, 15))
plt.show()
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, shuffle=True, random_state=66)

#2. model
model = XGBRegressor(n_estimators=1000, learning_rate=0.01, n_jobs=8, use_label_encoder=False)

#3. fit
model.fit(x_train, y_train, verbose=1,
          eval_metric=['rmse', 'logloss', 'mae'],
          eval_set=[(x_train, y_train), (x_test, y_test)],
          early_stopping_rounds=10)
# Only this part needs to be added; the order of the metrics in eval_metric matters too ->
# early stopping is applied based on the last metric in the list.

#4. score and predict
aaa = model.score(x_test, y_test)
print("aaa : ", aaa)

y_pred = model.predict(x_test)
r2 = r2_score(y_test, y_pred)
print("r2 : ", r2)

print("=====================")
results = model.evals_result()
print(results)
class VanillaModelRegression(Model):
    def __init__(self, configuration):
        self._configuration = configuration
        self._objects = {}
        self._annotation = 'Performance comparison of different MVA discriminants'
        if 'annotation' in self._configuration:
            self._annotation = self._configuration['annotation']
        self.my_model = None
        self.fit_results = None
        self.Initialize()

    @log_with()
    def Initialize(self):
        self.build_best_prediction()
        pass

    @log_with()
    def get(self, name):
        """ Factory method """
        if name in self._objects:
            return self._objects[name]
        else:
            return None
        #provide factory method implementation here
        return self._objects[name]

    @log_with()
    def get_data_provider(self, provider_name):
        """ Factory method for data providers """
        from dataprovider import PandasDataProviderFromCSV_original
        if provider_name in self._objects:
            return self._objects[provider_name]
        else:
            if '.csv' in self._configuration[provider_name]['input_file']:
                provider = PandasDataProviderFromCSV_original(
                    self._configuration[provider_name]['input_file'])
                self._objects[provider_name] = provider
            else:
                raise NotImplementedError
        return self._objects[provider_name]

    @log_with()
    def build_best_prediction(self):
        print("Dummy building vanilla model!")

        from matplotlib import pyplot
        from xgboost import XGBRegressor, plot_importance
        # from sklearn.metrics import explained_variance_score, max_error, mean_absolute_error, mean_squared_error
        from sklearn.metrics import explained_variance_score, mean_absolute_error, mean_squared_error

        target_variable_names = self._configuration['model']['target'][0]
        data_provider = self.get_data_provider(
            self._configuration['model']['data_provider'])

        input_features_names = self._configuration['model']['input_features']
        X_train = data_provider.train[input_features_names]
        y_train = data_provider.train[target_variable_names]
        X_test = data_provider.test[input_features_names]
        y_test = data_provider.test[target_variable_names]

        # print X_train.dtypes
        # print X_train.head()
        # print X_test.dtypes
        # print X_test.head()
        # print y_train.dtypes
        # print y_train.head()
        # print y_test.dtypes
        # print y_test.head()

        eval_set = [(X_train, y_train), (X_test, y_test)]
        self.my_model = XGBRegressor(
            n_estimators=self._configuration['model']['n_estimators'],
            max_depth=self._configuration['model']['max_depth'],
            learning_rate=self._configuration['model']['learning_rate'],
            verbosity=0)
        self.my_model.fit(X_train, y_train,
                          eval_metric=["rmse", "mae"],
                          eval_set=eval_set,
                          verbose=False)

        y_pred = self.my_model.predict(X_test)

        # print "Max error: ", max_error(y_test,y_pred)
        print("Explained variance score: ", explained_variance_score(y_test, y_pred))
        print("Mean absolute error: ", mean_absolute_error(y_test, y_pred))
        print("Mean squared error: ", mean_squared_error(y_test, y_pred))

        self.fit_results = self.my_model.evals_result()

        # print 'YO importance'
        # plot_importance(self.my_model)

        pickle.dump(
            self.my_model,
            open(self._configuration['model']['output_filename'], 'wb'))
        pass
dt = DecisionTreeRegressor(random_state=5) dt.fit(X_train, y_train) accuracy = dt.score(X_val, y_val) y_pred = dt.predict(X_val) rmse = np.sqrt(mean_squared_error(y_val, y_pred)) print('Accuracy', accuracy) print('rmse', rmse) # -------------- from xgboost import XGBRegressor # Code starts here xgb = XGBRegressor(max_depth=50, learning_rate=0.83, n_estimators=100) xgb.fit(X_train, y_train) accuracy = xgb.score(X_val, y_val) y_pred = xgb.predict(X_val) rmse = np.sqrt(mean_squared_error(y_val, y_pred)) print('Accuracy', accuracy) print('rmse', rmse) # Code ends here
'../../../../data/rossmann/intermediate/06_SalesModelingExtendedWeather/03_Store198/test_store198_X_3M.pkl' ) y_test_3M_198 = pd.read_pickle( '../../../../data/rossmann/intermediate/06_SalesModelingExtendedWeather/03_Store198/test_store198_y_3M.pkl' ) # Store 897 X_test_3M_897 = pd.read_pickle( '../../../../data/rossmann/intermediate/06_SalesModelingExtendedWeather/04_Store897/test_store897_X_3M.pkl' ) y_test_3M_897 = pd.read_pickle( '../../../../data/rossmann/intermediate/06_SalesModelingExtendedWeather/04_Store897/test_store897_y_3M.pkl' ) # Fit Model All Stores xgb_model_all = XGBRegressor(n_estimators=100, learning_rate=0.1) xgb_model_all.fit(X_train, y_train) # Save Model All Stores model_all_filename = "../../04_Evaluation/00_Models/xgb_model_all.pkl" with open(model_all_filename, 'wb') as file: pickle.dump(xgb_model_all, file) # Fit Model Store 708 xgb_model_708 = XGBRegressor(n_estimators=100, learning_rate=0.1) xgb_model_708.fit(X_train_708, y_train_708) # Save Model Store 708 model_708_filename = "../../04_Evaluation/00_Models/xgb_model_708.pkl" with open(model_708_filename, 'wb') as file: pickle.dump(xgb_model_708, file)
def main(): print("Loading data...") # The training data is used to train your model how to predict the targets. training_data = read_csv("numerai_training_data.csv") # The tournament data is the data that Numerai uses to evaluate your model. tournament_data = read_csv("numerai_tournament_data.csv") feature_names = [ f for f in training_data.columns if f.startswith("feature") ] print(f"Loaded {len(feature_names)} features") # This is the model that generates the included example predictions file. # Taking too long? Set learning_rate=0.1 and n_estimators=200 to make this run faster. # Remember to delete example_model.xgb if you change any of the parameters below. model = XGBRegressor(max_depth=5, learning_rate=0.01, n_estimators=2000, n_jobs=-1, colsample_bytree=0.1) if MODEL_FILE.is_file(): print("Loading pre-trained model...") model.load_model(MODEL_FILE) else: print("Training model...") model.fit(training_data[feature_names], training_data[TARGET_NAME]) model.save_model(MODEL_FILE) # Generate predictions on both training and tournament data print("Generating predictions...") try: training_data[PREDICTION_NAME] = model.predict( training_data[feature_names]) tournament_data[PREDICTION_NAME] = model.predict( tournament_data[feature_names]) except Exception as e: print(e) print( "If you received the error 'Floating point is not supported', this is likely due to using version >=1.4 of XGBoost" ) print( "Downgrade to XGBoost 1.3.3 by typing the following into your command line" ) print("pip install xgboost==1.3.3") print("\nAlternatively, change the lines that start with") print("training_data =...") print("tournament_data =...") print("\nTo the following") print( "training_data = pd.read_parquet(\"s3://numerai-public-datasets/latest_numerai_training_data.parquet\")" ) print( "training_data = pd.read_parquet(\"s3://numerai-public-datasets/latest_numerai_tournament_data.parquet\")" ) print("\nThis will require more RAM") # Check the per-era correlations on the training set (in sample) train_correlations = training_data.groupby("era").apply(score) print( f"On training the correlation has mean {train_correlations.mean()} and std {train_correlations.std(ddof=0)}" ) print( f"On training the average per-era payout is {payout(train_correlations).mean()}" ) """Validation Metrics""" # Check the per-era correlations on the validation set (out of sample) validation_data = tournament_data[tournament_data.data_type == "validation"] validation_correlations = validation_data.groupby("era").apply(score) print( f"On validation the correlation has mean {validation_correlations.mean()} and " f"std {validation_correlations.std(ddof=0)}") print( f"On validation the average per-era payout is {payout(validation_correlations).mean()}" ) # Check the "sharpe" ratio on the validation set validation_sharpe = validation_correlations.mean( ) / validation_correlations.std(ddof=0) print(f"Validation Sharpe: {validation_sharpe}") print("checking max drawdown...") rolling_max = (validation_correlations + 1).cumprod().rolling( window=100, min_periods=1).max() daily_value = (validation_correlations + 1).cumprod() max_drawdown = -((rolling_max - daily_value) / rolling_max).max() print(f"max drawdown: {max_drawdown}") # Check the feature exposure of your validation predictions feature_exposures = validation_data[feature_names].apply( lambda d: correlation(validation_data[PREDICTION_NAME], d), axis=0) max_per_era = validation_data.groupby("era").apply( lambda d: d[feature_names].corrwith(d[PREDICTION_NAME]).abs().max()) 
max_feature_exposure = max_per_era.mean() print(f"Max Feature Exposure: {max_feature_exposure}") # Check feature neutral mean print("Calculating feature neutral mean...") feature_neutral_mean = get_feature_neutral_mean(validation_data) print(f"Feature Neutral Mean is {feature_neutral_mean}") # Load example preds to get MMC metrics example_preds = pd.read_csv("example_predictions.csv").set_index( "id")["prediction"] validation_example_preds = example_preds.loc[validation_data.index] validation_data["ExamplePreds"] = validation_example_preds print("calculating MMC stats...") # MMC over validation mmc_scores = [] corr_scores = [] for _, x in validation_data.groupby("era"): series = neutralize_series(pd.Series(unif(x[PREDICTION_NAME])), pd.Series(unif(x["ExamplePreds"]))) mmc_scores.append(np.cov(series, x[TARGET_NAME])[0, 1] / (0.29**2)) corr_scores.append( correlation(unif(x[PREDICTION_NAME]), x[TARGET_NAME])) val_mmc_mean = np.mean(mmc_scores) val_mmc_std = np.std(mmc_scores) val_mmc_sharpe = val_mmc_mean / val_mmc_std corr_plus_mmcs = [c + m for c, m in zip(corr_scores, mmc_scores)] corr_plus_mmc_sharpe = np.mean(corr_plus_mmcs) / np.std(corr_plus_mmcs) corr_plus_mmc_mean = np.mean(corr_plus_mmcs) corr_plus_mmc_sharpe_diff = corr_plus_mmc_sharpe - validation_sharpe print(f"MMC Mean: {val_mmc_mean}\n" f"Corr Plus MMC Sharpe:{corr_plus_mmc_sharpe}\n" f"Corr Plus MMC Diff:{corr_plus_mmc_sharpe_diff}") # Check correlation with example predictions full_df = pd.concat([ validation_example_preds, validation_data[PREDICTION_NAME], validation_data["era"] ], axis=1) full_df.columns = ["example_preds", "prediction", "era"] per_era_corrs = full_df.groupby('era').apply( lambda d: correlation(unif(d["prediction"]), unif(d["example_preds"]))) corr_with_example_preds = per_era_corrs.mean() print(f"Corr with example preds: {corr_with_example_preds}") # Save predictions as a CSV and upload to https://numer.ai tournament_data[PREDICTION_NAME].to_csv("submission.csv", header=True)
from sklearn.preprocessing import StandardScaler from sklearn.preprocessing import MinMaxScaler svr = make_pipeline(StandardScaler(), SVR(kernel='rbf', C=1000)) svr.fit(X, y) -1 * cross_val_score(svr, X, y, cv=5, scoring='neg_mean_absolute_error').mean() # - # # Extreme Gradient Boosting # + from xgboost import XGBRegressor xgb = XGBRegressor(learning_rate=0.1) xgb.fit(X, y) -1 * cross_val_score(xgb, X, y, cv=5, scoring='neg_mean_absolute_error').mean() # - # # Light Gradient Boosting # + from lightgbm import LGBMRegressor lgbm = LGBMRegressor(learning_rate=0.1, n_estimators=1000) lgbm.fit(X, y) -1 * cross_val_score(lgbm, X, y, cv=5, scoring='neg_mean_absolute_error').mean() # -
import numpy as np import pandas as pd from sklearn.model_selection import train_test_split from xgboost import XGBRegressor # NOTE: Make sure that the class is labeled 'target' in the data file tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64) features = tpot_data.drop('target', axis=1).values training_features, testing_features, training_target, testing_target = \ train_test_split(features, tpot_data['target'].values, random_state=42) # Score on the training set was:-0.13400621878282912 exported_pipeline = XGBRegressor(booster="dart", learning_rate=0.1, max_depth=4, n_estimators=300, n_jobs=-1, objective="reg:linear") exported_pipeline.fit(training_features, training_target) results = exported_pipeline.predict(testing_features)
X = data.drop(["id", "血糖", "体检日期"], axis=1) Y = data["血糖"] Y = np.log1p(Y) for column in X.columns: X[column] = np.log1p(X[column]) for column in test.columns: test[column] = np.log1p(test[column]) clf = XGBRegressor() print("---111----") kfold = KFold(n_splits=5, random_state=7) test_score = np.sqrt( -cross_val_score(clf, X, Y, cv=kfold, scoring='neg_mean_squared_error')) print("------test_score--------") print(test_score) print(np.mean(test_score)) print("---2----") clf.fit(X, Y) print("---3----") pred = np.expm1(clf.predict(test)) pred_df = pd.DataFrame() pred_df["pred"] = pred pred_df.to_csv( '/Users/jianjun.yue/PycharmGItHub/data/人工智能辅助糖尿病遗传风险预测/sub_0107_XG_log1p.csv', header=False, index=False, float_format='%.3f')
y = data.SalePrice
X = data.drop(['SalePrice'], axis=1).select_dtypes(exclude=['object'])

X_train, X_test, y_train, y_test = train_test_split(X.as_matrix(), y.as_matrix(), test_size=0.25)

my_imputer = Imputer()
X_train_impute = my_imputer.fit_transform(X_train)
X_test_impute = my_imputer.transform(X_test)

#1st model (fit on the imputed matrices computed above)
my_model = XGBRegressor()
my_model.fit(X_train_impute, y_train, verbose=False)
predictions_1 = my_model.predict(X_test_impute)
print("Normal Error:" + str(mean_absolute_error(predictions_1, y_test)))

#2nd model
my_model = XGBRegressor(n_estimators=10000, learning_rate=0.05)
my_model.fit(X_train_impute, y_train,
             early_stopping_rounds=5,
             eval_set=[(X_test_impute, y_test)],
             verbose=False)
def float_col_objective(
        trial,
        X,
        y,
        random_state=22,
        n_splits=3,
        n_repeats=1,
        n_jobs=-1,  # SOURCE https://discuss.xgboost.ai/t/n-jobs-1-no-longer-uses-all-cores/1955/6
        early_stopping_rounds=50):
    # ANCHOR XGBoost parameters
    params = {
        "objective": "reg:squarederror",  # NOTE Learning task parameters - https://xgboost.readthedocs.io/en/latest/parameter.html#learning-task-parameters
        "n_estimators": trial.suggest_int("n_estimators", 50, 1000),
        "max_depth": trial.suggest_int("max_depth", 3, 20),
        "learning_rate": trial.suggest_loguniform("learning_rate", 0.005, 0.05),
        "colsample_bytree": trial.suggest_loguniform("colsample_bytree", 0.2, 0.8),
        "subsample": trial.suggest_loguniform("subsample", 0.4, 0.8),
        "alpha": trial.suggest_loguniform("alpha", 0.01, 10.0),
        "lambda": trial.suggest_loguniform("lambda", 1e-8, 10.0),
        "gamma": trial.suggest_loguniform(
            "gamma", 1e-8, 10.0
        ),  # Minimum loss reduction required to make a further partition on a leaf node of the tree.
        "min_child_weight": trial.suggest_loguniform(
            "min_child_weight", 10, 1000
        ),  # A smaller value is chosen because it is a highly imbalanced class problem and leaf nodes can have smaller size groups.
        # "scale_pos_weight": 1,  # because of high class imbalance
        "verbosity": 0,  # 0 (silent) - 3 (debug)
        "seed": random_state,
        "tree_method": 'gpu_hist'  # NOTE
    }

    xgb_model = XGBRegressor(**params)
    pruning_callback = optuna.integration.XGBoostPruningCallback(
        trial, "validation_0-rmse"
    )  # NOTE observation_keys - https://optuna.readthedocs.io/en/stable/reference/generated/optuna.integration.XGBoostPruningCallback.html
    # pruning_callback = optuna.integration.XGBoostPruningCallback(trial, "validation_0-auc")  # CONSTRUCTION

    # NOTE oof - https://www.kaggle.com/vinhnguyen/accelerating-xgboost-with-gpu
    rkf = RepeatedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=random_state)
    X_values = X.values
    y_values = y.values
    y_oof = np.zeros_like(y_values)
    res = 0

    for fold, (train_index, valid_index) in enumerate(rkf.split(X_values, y_values)):
        X_A, X_B = X_values[train_index, :], X_values[valid_index, :]
        y_A, y_B = y_values[train_index], y_values[valid_index]
        xgb_model.fit(
            X_A,
            y_A,
            eval_set=[(X_B, y_B)],
            # NOTE Learning task parameters - https://xgboost.readthedocs.io/en/latest/parameter.html#learning-task-parameters
            # This metric is used to check whether the pruning-callback event fires; it is not
            # used directly to pick the optuna hyperparameters.
            eval_metric="rmse",
            early_stopping_rounds=early_stopping_rounds,
            callbacks=[pruning_callback],
            verbose=0)
        y_pred = xgb_model.predict(X_B)
        y_oof[valid_index] += y_pred
        res += np.sqrt(mean_squared_error(y_pred, y_B)) / (n_splits * n_repeats)  # I added it.

    y_oof /= n_repeats  # Original

    trial.set_user_attr(
        key="best_booster",
        value=xgb_model)  # NOTE update the best model in the optuna's table.

    # return np.sqrt(mean_squared_error(y_values, y_pred))  # Originally, the author uses y_train. I think it's incorrect.
    return res  # I changed the last line to this one.
#                     alpha = 1,
#                     gamma = 2,
#                     min_child_weight = 1,
#                     base_score = 7.76
#                     nrounds = 5000,
#                     nfold = 5,
#                     early_stopping_rounds = 15,
#                     print_every_n = 10,
#                     verbose = 1,
#                     feval = xg_eval_mae,
#                     maximize = FALSE
#                     )

folds = KFold(n_splits=3, shuffle=False)
for k, (train_index, test_index) in enumerate(folds.split(train_xg_x)):
    xtr = train_xg_x[train_index]
    ytr = train_xg_y[train_index]
    xtest = train_xg_x[test_index]
    ytest = train_xg_y[test_index]

    print("Fitting on fold {}...".format(k))
    print("Checking xtest shape: ", xtest.shape)
    print("Checking ytest shape: ", ytest.shape)

    xgboosting.fit(xtr, ytr, verbose=True)
    np.savetxt('xgb_pred_fold_{}.txt'.format(k), np.exp(xgboosting.predict(xtest)))
    np.savetxt('xgb_test_fold_{}.txt'.format(k), ytest)

# Training xgboost on test set (i.e. whole train set).
xgboosting.fit(train_xg_x, train_xg_y, verbose=True)
print("Fitting on test set...")
np.savetxt('xgb_pred_test.txt', np.exp(xgboosting.predict(test_xg_x)))
def fitmodel(train, test, verbose=0, train_model=False, plot_graph=False): trainY = train['price'] testY = test['price'] trainX = train.drop(['price'], axis=1) testX = test.drop(['price'], axis=1) if train_model == True: print("Training model...") params = { "max_depth": st.randint(3, 40), "colsample_bytree": st.beta(10, 1), "subsample": st.beta(10, 1), "gamma": st.uniform(0, 10), "min_child_weight": st.expon(0, 50), } gboost = XGBRegressor(n_estimators=5, learning_rate=.2) tmp = RandomizedSearchCV(gboost, params, cv=10, n_jobs=-1, verbose=verbose, n_iter=25) tmp.fit(trainX, trainY) print("Optimised parameters: ") print(tmp.best_params_) gboost_opt = tmp.best_estimator_ gboost_opt.set_params(n_estimators=100, learning_rate=.1, n_jobs=-1) else: gboost_opt = XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1, colsample_bytree=0.87219466652443045, gamma=7.0610396795642156, learning_rate=0.1, max_delta_step=0, max_depth=23, min_child_weight=13.539302225736687, missing=None, n_estimators=100, n_jobs=-1, nthread=None, objective='reg:linear', random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None, silent=True, subsample=0.95498622807161138) print("Final model: ") print(gboost_opt) gboost_opt.fit(trainX, trainY) trainY_pred = gboost_opt.predict(trainX) testY_pred = gboost_opt.predict(testX) print("Performance metrics: \n") print("RMSE : %.4f (train) %.4f (test)" % (mean_squared_error( trainY, trainY_pred)**.5, mean_squared_error(testY, testY_pred)**.5)) print("MAE : %.4f (train) %.4f (test)" % (mean_absolute_error( trainY, trainY_pred), mean_absolute_error(testY, testY_pred))) print("MedianAE : %.4f (train) %.4f (test)" % (median_absolute_error( trainY, trainY_pred), median_absolute_error(testY, testY_pred))) train_err = np.absolute(trainY - trainY_pred) / trainY test_err = np.absolute(testY - testY_pred) / testY print("Mean Absolute Percentage Error: %.4f (train) %.4f (test)" % (np.mean(train_err), np.mean(test_err))) print("Median Absolute Percentage Error: %.4f (train) %.4f (test)" % (np.median(train_err), np.median(test_err))) th = [.01, .05, .1, .2, .3] train_err_vec = np.zeros(len(th)) test_err_vec = np.zeros(len(th)) for i in range(len(th)): train_err_vec[i] = np.sum((train_err < th[i] * 1)) / len(train_err) test_err_vec[i] = np.sum((test_err < th[i] * 1)) / len(test_err) print( "Absolute Percentage Error within %.2f: %.2f (train), %.2f (test)" % (th[i], train_err_vec[i], test_err_vec[i])) if plot_graph == True: feat_imp = pd.Series(gboost_opt.feature_importances_, list(trainX)).sort_values(ascending=False) feat_imp.plot(kind='bar', title='Feature Importances') plt.ylabel('Feature Importance Score') plt.show()
# -*- coding: utf-8 -*- from xgboost import XGBRegressor import pandas as pd train = pd.read_csv("C:\\Users\\jowet\\Downloads\\Santander\\train.csv") test = pd.read_csv("C:\\Users\\jowet\\Downloads\\Santander\\test.csv") train.drop('ID', axis=1, inplace=True) y_train = train.pop('target') pred_index = test.pop('ID') reg = XGBRegressor() reg.fit(train, y_train) y_pred = reg.predict(test) submit = pd.DataFrame() submit['ID'] = pred_index submit['target'] = y_pred submit.to_csv('my_XGB_prediction.csv', index=False)
# #split train and test print("split") train = df[df.index.get_level_values(1) <= '2016-10-23'] test = df[df.index.get_level_values(1) >= '2016-11-01'] train_X, train_y = train[features], train[target] test_X, test_y = test[features], test[target] print("model") from xgboost import XGBRegressor from smape import XGBsmape xgbr = XGBRegressor(max_depth=9, learning_rate=0.05, n_estimators=1000, silent=True, objective='reg:linear', nthread=-1, subsample=0.8, colsample_bytree=0.8) xgbr.fit(train_X, train_y, eval_set=[(train_X, train_y), (test_X, test_y)], eval_metric=XGBsmape, early_stopping_rounds=10, verbose=True)
df_valid = df_valid.set_index('key_0') typ = df_valid.dtypes df_valid.to_csv('df_valid_cat.csv', header=None, index=False) df_train.columns RFR = RandomForestRegressor() RFR.fit(X_train, Y_train) RFR_preds = pd.DataFrame(RFR.predict(X_test),columns=['salePrice'],index=Y_test.index) print(mean_absolute_error(Y_test, RFR_preds)) RFR_new = RFR_preds.apply(lambda x: np.power(np.e,x).astype('int64')) XGB = XGBRegressor() XGB.fit(X_train, Y_train, verbose=False) XGB_preds = pd.DataFrame(XGB.predict(X_test),columns=['salePrice'],index=Y_test.index).astype(int) print(mean_absolute_error(Y_test,XGB_preds)) XGB_new = XGB_preds.apply(lambda x: np.power(np.e,x).astype('int64')) GBR = GradientBoostingRegressor() GBR.fit(X_train, Y_train) GBR_preds = pd.DataFrame(GBR.predict(X_test),columns=['salePrice'],index=Y_test.index) print(mean_absolute_error(Y_test,GBR_preds)) GBR_new = GBR_preds.apply(lambda x: np.power(np.e,x).astype('int64')) sns.swarmplot(x=GBR_preds['salePrice'],y=Y_test) from sklearn.model_selection import KFold from sklearn.model_selection import cross_val_score
y_predict = model.predict(x_test) print("최종 정답률 : ", r2_score(y_test, y_predict)) model = model.best_estimator_ thresholds = np.sort(model.feature_importances_) print(thresholds) n = 0 r2 = 0 for thresh in thresholds: selection = SelectFromModel(model, threshold=thresh, prefit=True) select_x_train = selection.transform(x_train) selection_model = XGBRegressor(n_jobs=-1) selection_model.fit(select_x_train, y_train) select_x_test = selection.transform(x_test) y_predict = selection_model.predict(select_x_test) score = r2_score(y_test, y_predict) if score * 100.0 > r2: n = select_x_train.shape[1] r2 = score * 100.0 L_selection = selection print("Thresh=%.3f, n=%d, R2: %.2f%%" % (thresh, select_x_train.shape[1], score * 100.0)) x_train = L_selection.transform(x_train) x_test = L_selection.transform(x_test)
class PrudentialRegressorCVO(BaseEstimator, RegressorMixin): def __init__(self, objective='reg:linear', learning_rate=0.045, min_child_weight=50, subsample=0.8, colsample_bytree=0.7, max_depth=7, n_estimators=700, nthread=-1, seed=0, n_buckets=8, initial_params=[-1.5, -2.6, -3.6, -1.2, -0.8, 0.04, 0.7, 3.6, #1., 2., 3., 4., 5., 6., 7. ], minimizer='BFGS', scoring=NegQWKappaScorer): self.objective = objective self.learning_rate = learning_rate self.min_child_weight = min_child_weight self.subsample = subsample self.colsample_bytree = colsample_bytree self.max_depth = max_depth self.n_estimators = n_estimators self.nthread = nthread self.seed = seed self.n_buckets = n_buckets self.initial_params = initial_params self.minimizer = minimizer self.scoring = scoring return def fit(self, X, y): from xgboost import XGBRegressor if not KAGGLE: from OptimizedOffsetRegressor import DigitizedOptimizedOffsetRegressor #from OptimizedOffsetRegressor import FullDigitizedOptimizedOffsetRegressor #self.off = FullDigitizedOptimizedOffsetRegressor(n_buckets=self.n_buckets, # basinhopping=True, """ 2 / 5 grid scores: mean: 0.65531, std: 0.00333, params: {'n_estimators': 700, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 6, 'min_child_weight': 240} best score: 0.65531 3 / 5 grid scores: mean: 0.65474, std: 0.00308, params: {'n_estimators': 700, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 6, 'min_child_weight': 240} best score: 0.65474 4 / 5 grid scores: mean: 0.65490, std: 0.00302, params: {'n_estimators': 700, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 6, 'min_child_weight': 240} best score: 0.65490 2 / 10 grid scores: mean: 0.65688, std: 0.00725, params: {'n_estimators': 700, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 6, 'min_child_weight': 240} best score: 0.65688 3 / 10 grid scores: mean: 0.65705, std: 0.00714, params: {'n_estimators': 700, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 6, 'min_child_weight': 240} best score: 0.65705 4 / 10 grid scores: mean: 0.65643, std: 0.00715, params: {'n_estimators': 700, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 6, 'min_child_weight': 240} best score: 0.65643 5 / 10 grid scores: mean: 0.65630, std: 0.00699, params: {'n_estimators': 700, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 6, 'min_child_weight': 240} best score: 0.65630 """ from sklearn.cross_validation import StratifiedKFold kf = StratifiedKFold(y, n_folds=2) print(kf) params = [] for itrain, itest in kf: ytrain = y[itrain] Xtrain = X.iloc[list(itrain)] ytest = y[itest] Xtest = X.iloc[list(itest)] self.xgb = XGBRegressor( objective=self.objective, learning_rate=self.learning_rate, min_child_weight=self.min_child_weight, subsample=self.subsample, colsample_bytree=self.colsample_bytree, max_depth=self.max_depth, n_estimators=self.n_estimators, nthread=self.nthread, missing=0.0, seed=self.seed) self.xgb.fit(Xtrain, ytrain) te_y_hat = self.xgb.predict(Xtest, ntree_limit=self.xgb.booster().best_iteration) print('XGB Test score is:', -self.scoring(te_y_hat, ytest)) self.off = DigitizedOptimizedOffsetRegressor(n_buckets=self.n_buckets, initial_params=self.initial_params, minimizer=self.minimizer, scoring=self.scoring) self.off.fit(te_y_hat, ytest) print("Offsets:", self.off.params) params += [list(self.off.params)] pass from numpy import array self.off.params = array(params).mean(axis=0) print("Mean Offsets:", self.off.params) self.xgb.fit(X, y) return self def predict(self, X): from numpy import clip te_y_hat = 
self.xgb.predict(X, ntree_limit=self.xgb.booster().best_iteration) return clip(self.off.predict(te_y_hat), 1, 8) pass
def ValidateTrainTestErrorsWithDifferentModels(cvX_train, cvX_test, cvy_train, cvy_test, X_train, y_train, X_test):
    clfs = list()
    cvClfs = list()

    print("Building RF1")
    rfShortCV = ensemble.RandomForestRegressor(min_samples_split=50, n_estimators=1000, max_depth=None,
                                               min_samples_leaf=50, max_features="auto", n_jobs=-1, random_state=0)
    rfShort = ensemble.RandomForestRegressor(min_samples_split=50, n_estimators=1000, max_depth=None,
                                             min_samples_leaf=50, max_features="auto", n_jobs=-1, random_state=0)
    rfShortCV.fit(cvX_train, cvy_train)
    print('RF1 CV Results :', mean_absolute_error(cvy_test, rfShortCV.predict(cvX_test)))
    pd.DataFrame({"Actual": cvy_test, "Predicted": rfShortCV.predict(cvX_test)}).to_csv("snehaRF.csv", index=False, header=True)
    rfShort.fit(X_train, y_train)
    cvClfs.append(rfShortCV)
    clfs.append(rfShort)
    pd.DataFrame({"ID": out_id, "Expected": rfShort.predict(X_test)}).to_csv("subRF1.csv", index=False, header=True)

    print("Building SVM")
    clfSVRCV = SVR(C=10.0)
    clfSVR = SVR(C=10.0)
    clfSVRCV.fit(cvX_train, cvy_train)
    print('SVM CV Results :', mean_absolute_error(cvy_test, clfSVRCV.predict(cvX_test)))
    pd.DataFrame({"Actual": cvy_test, "Predicted": clfSVRCV.predict(cvX_test)}).to_csv("snehaSVR.csv", index=False, header=True)

    print("Building RF2")
    rfLongCV = ensemble.RandomForestRegressor(min_samples_split=200, n_estimators=1000, max_depth=7,
                                              min_samples_leaf=200, max_features="auto", n_jobs=4, random_state=0)
    rfLong = ensemble.RandomForestRegressor(min_samples_split=200, n_estimators=1000, max_depth=7,
                                            min_samples_leaf=200, max_features="auto", n_jobs=4, random_state=0)
    rfLongCV.fit(cvX_train, cvy_train)
    print('RF2 CV Results :', mean_absolute_error(cvy_test, rfLongCV.predict(cvX_test)))
    rfLong.fit(X_train, y_train)
    cvClfs.append(rfLongCV)
    clfs.append(rfLong)
    pd.DataFrame({"ID": out_id, "Expected": rfLong.predict(X_test)}).to_csv("subRF2.csv", index=False, header=True)

    print("Building GB1")
    regGBCV1 = ensemble.GradientBoostingRegressor(min_samples_split=50, n_estimators=1000, max_depth=None,
                                                  min_samples_leaf=50, max_features="auto", subsample=0.6,
                                                  learning_rate=0.01, random_state=0, loss='lad')
    regGBCV1.fit(cvX_train, cvy_train)
    print('GB1 CV Results :', mean_absolute_error(cvy_test, regGBCV1.predict(cvX_test)))
    regGB1 = ensemble.GradientBoostingRegressor(min_samples_split=50, n_estimators=1000, max_depth=None,
                                                min_samples_leaf=50, max_features="auto", subsample=0.6,
                                                learning_rate=0.01, random_state=0, loss='lad')
    regGB1.fit(X_train, y_train)
    cvClfs.append(regGBCV1)
    clfs.append(regGB1)
    pd.DataFrame({"ID": out_id, "Expected": regGB1.predict(X_test)}).to_csv("subGB1.csv", index=False, header=True)

    print('Building GB2')
    regGBCV2 = ensemble.GradientBoostingRegressor(min_samples_split=50, n_estimators=1000, max_depth=7,
                                                  min_samples_leaf=200, max_features="auto", subsample=0.6,
                                                  learning_rate=0.01, random_state=0, loss='lad')
    regGBCV2.fit(cvX_train, cvy_train)
    print('GB2 CV Results :', mean_absolute_error(cvy_test, regGBCV2.predict(cvX_test)))
    regGB2 = ensemble.GradientBoostingRegressor(min_samples_split=50, n_estimators=1000, max_depth=7,
                                                min_samples_leaf=200, max_features="auto", subsample=0.6,
                                                learning_rate=0.01, random_state=0, loss='lad')
    regGB2.fit(X_train, y_train)
    cvClfs.append(regGBCV2)
    clfs.append(regGB2)
    pd.DataFrame({"ID": out_id, "Expected": regGB2.predict(X_test)}).to_csv("subGB2.csv", index=False, header=True)

    print('Feature Importances RF1:', sorted(zip(map(lambda x: round(x, 4), rfShort.feature_importances_), df_final.columns), reverse=True))
    print('Feature Importances GB1:', sorted(zip(map(lambda x: round(x, 4), regGB1.feature_importances_), df_final.columns), reverse=True))
    print('Feature Importances RF2:', sorted(zip(map(lambda x: round(x, 4), rfLong.feature_importances_), df_final.columns), reverse=True))
    print('Feature Importances GB2:', sorted(zip(map(lambda x: round(x, 4), regGB2.feature_importances_), df_final.columns), reverse=True))

    print("Building XGB1")
    xgbCV1 = xgb.XGBRegressor(n_estimators=3000, nthread=-1, max_depth=None, learning_rate=0.01,
                              silent=True, subsample=0.8, colsample_bytree=0.7)
    xgbCV1.fit(cvX_train, cvy_train)
    xgb1 = xgb.XGBRegressor(n_estimators=3000, nthread=-1, max_depth=None, learning_rate=0.01,
                            silent=True, subsample=0.8, colsample_bytree=0.7)
    xgb1.fit(X_train, y_train)
    print('XGB1 Model CV :', mean_absolute_error(cvy_test, xgbCV1.predict(cvX_test)))
    cvClfs.append(xgbCV1)
    clfs.append(xgb1)
    pd.DataFrame({"ID": out_id, "Expected": xgb1.predict(X_test)}).to_csv("subXGB1.csv", index=False, header=True)

    print("Building XGB2")
    params = {}
    params["objective"] = "reg:linear"
    params["learning_rate"] = 0.005
    params["min_child_weight"] = 6
    params["subsample"] = 0.7
    params["colsample_bytree"] = 0.75
    params["silent"] = 1
    params["max_depth"] = 7
    params["n_estimators"] = 3000
    params['gamma'] = 1.25
    params['nthread'] = -1

    print('XGBoost Training Process Started')
    xgbCV2 = XGBRegressor(**params)
    xgbCV2.fit(cvX_train, cvy_train)
    print('XGB Model CV :', mean_absolute_error(cvy_test, xgbCV2.predict(cvX_test)))
    xgb2 = XGBRegressor(**params)
    xgb2.fit(X_train, y_train)
    cvClfs.append(xgbCV2)
    clfs.append(xgb2)
    pd.DataFrame({"ID": out_id, "Expected": xgb2.predict(X_test)}).to_csv("subXGB2.csv", index=False, header=True)

    # Return the cross validated models and the actual fitted models separately.
    return [clfs, cvClfs]
start = time.time()
if (gridsearch & sample):  # only do gridsearch if we run with sampled data.
    print("Attempting GridSearchCV for XGB model")
    gscv = GridSearchCV(regressor, {
        'max_depth': [3, 5, 7, 11, 13, 17, 23],
        'n_estimators': [32, 64, 128, 512, 1024, 2048, 4096],
        'learning_rate': [0.15],
        'subsample': [0.6, 0.7, 0.8],
        'colsample_bytree': [0.6, 0.7, 0.8]},
        verbose=1, n_jobs=2)
    regressor = gscv.fit(np.array(train), train[goal])
    print(regressor.best_score_)
    print(regressor.best_params_)
else:
    regressor.fit(np.array(train[features]), train[goal])
print(' -> Training time:', time.time() - start)

# Evaluation and export result
if sample:
    if not gridsearch:
        # Test results
        if logexp:
            print("RMSPE: " + str(rmspe([np.exp(x) - 1 for x in regressor.predict(np.array(test[features]))], test[goal].values)))
        else:
            print("RMSPE: " + str(rmspe(regressor.predict(np.array(test[features])), test[goal].values)))
else:
    csvfile = 'result/' + regressor.__class__.__name__ + '-submit.csv'
    with open(csvfile, 'w') as output:
        predictions = []
        for i in test[myid].tolist():
class PrudentialRegressorFO(BaseEstimator, RegressorMixin): def __init__(self, objective='reg:linear', learning_rate=0.045, min_child_weight=50, subsample=0.8, colsample_bytree=0.7, max_depth=7, n_estimators=700, nthread=-1, seed=0, n_buckets=8, initial_params=[-1.5, -2.6, -3.6, -1.2, -0.8, 0.04, 0.7, 3.6, #1., 2., 3., 4., 5., 6., 7. ], minimizer='BFGS', scoring=NegQWKappaScorer): self.objective = objective self.learning_rate = learning_rate self.min_child_weight = min_child_weight self.subsample = subsample self.colsample_bytree = colsample_bytree self.max_depth = max_depth self.n_estimators = n_estimators self.nthread = nthread self.seed = seed self.n_buckets = n_buckets self.initial_params = initial_params self.minimizer = minimizer self.scoring = scoring return def fit(self, X, y): from xgboost import XGBRegressor if not KAGGLE: from OptimizedOffsetRegressor import DigitizedOptimizedOffsetRegressor self.xgb = XGBRegressor( objective=self.objective, learning_rate=self.learning_rate, min_child_weight=self.min_child_weight, subsample=self.subsample, colsample_bytree=self.colsample_bytree, max_depth=self.max_depth, n_estimators=self.n_estimators, nthread=self.nthread, missing=0.0, seed=self.seed) from OptimizedOffsetRegressor import FullDigitizedOptimizedOffsetRegressor self.off = FullDigitizedOptimizedOffsetRegressor(n_buckets=self.n_buckets, # basinhopping=True, initial_params=self.initial_params, minimizer=self.minimizer, scoring=self.scoring) self.xgb.fit(X, y) tr_y_hat = self.xgb.predict(X, ntree_limit=self.xgb.booster().best_iteration) print('Train score is:', -self.scoring(tr_y_hat, y)) self.off.fit(tr_y_hat, y) print("Offsets:", self.off.params) return self def predict(self, X): from numpy import clip te_y_hat = self.xgb.predict(X, ntree_limit=self.xgb.booster().best_iteration) return clip(self.off.predict(te_y_hat), 1, 8) pass
kfold = StratifiedKFold(y, n_folds=3, shuffle=True, random_state=7)
grid_search = GridSearchCV(model, param_grid, scoring="mean_absolute_error", n_jobs=-1, cv=kfold)


# In[210]:

result = grid_search.fit(train, y)
# summarize results
print("Best: %f using %s" % (result.best_score_, result.best_params_))


# In[211]:

model = XGBRegressor(learning_rate=0.3, n_estimators=100)
for traincv, testcv in kfold:
    model.fit(train.iloc[traincv], y.iloc[traincv])


# In[212]:

y_pred = model.predict(test)


# In[213]:

output2 = pd.DataFrame(data={"outlet_no": outlet, "total_sales_Actual": y_pred})
output2.to_csv("model.csv", index=False, quoting=3)


# In[ ]:
# In[41]:

#Build the model
#model=ExtraTreesRegressor()
#model=RandomForestRegressor()
#params = {'n_estimators': 500, 'max_depth': 4, 'min_samples_split': 2,
#          'learning_rate': 0.01, 'loss': 'ls'}
params = {'n_estimators': 400, 'max_depth': 7}
#model=GradientBoostingRegressor(**params)
model = XGBRegressor(**params)
#model=GaussianNB()
#model=Ridge()
#model=KNeighborsRegressor()
#model=DecisionTreeRegressor()
model.fit(train_dataset, train_target)

#Predict with the model
predictions = model.predict(test_dataset)


# In[51]:

### Cross Validation ###
#cv = StratifiedKFold(train_dataset, n_folds=5)
###scoring
scores = cross_validation.cross_val_score(model, train_dataset, train_target, cv=5)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
X_train = train_df.ix[:, train_df.columns != 'price_doc'].values X_test = test_df.values ################################## XGBRegressor ############################### #Initialize Model xgb = XGBRegressor() #Create cross-validation cv = TimeSeriesSplit(n_splits=5) #Train & Test Model cross_val_results = cross_val_score(xgb, X_train, Y_train, cv=cv, scoring='neg_mean_squared_error') print(cross_val_results.mean()) model = xgb.fit(X_train, Y_train) # model.feature_importances_; from xgboost import XGBRegressor #Get Data Y_train = train_df['price_doc'].values X_train = train_df.ix[:, train_df.columns != 'price_doc'].values X_test = test_df.values #Init Model xgb = XGBRegressor(learning_rate=0.05, max_depth=6, subsample=0.8, colsample_bytree=0.7) #Train Model model = xgb.fit(X_train, Y_train) #Make Predictions predictions = xgb.predict(X_test)
def Model(train_linear, test_linear): train_linear_fea=train_linear.drop(columns=['SalePrice']) train_linear_tar=train_linear.SalePrice x_train, x_test, y_train, y_test = train_test_split(train_linear_fea, train_linear_tar,test_size=0.2, random_state=0) def evaluate(model, test_features, test_labels,train_features, train_labels): predictions = model.predict(test_features) errors = abs(predictions - test_labels) mape = 100 * np.mean(errors / test_labels) accuracy = 100 - mape print('Model Performance') print('Average Error: {:0.4f} degrees.'.format(np.mean(errors))) print('Accuracy = {:0.2f}%.'.format(accuracy)) print("MSE for train data is: %f" % mean_squared_error(y_train, model.predict(x_train))) print("MSE for validation data is: %f" % mean_squared_error(y_test, model.predict(x_test))) return accuracy real_train_tar=np.expm1(train_linear_tar) """ . Lasso model """ lassocv = LassoCV(alphas = np.logspace(-5, 4, 400), ) lassocv.fit(train_linear_fea, train_linear_tar) lassocv_score = lassocv.score(train_linear_fea, train_linear_tar) lassocv_alpha = lassocv.alpha_ print("Best alpha : ", lassocv_alpha, "Score: ",lassocv_score) start=time.time() lasso =Lasso(normalize = True) lasso.set_params(alpha=lassocv_alpha,max_iter = 10000) lasso.fit(x_train, y_train) end=time.time() mean_squared_error(y_test, lasso.predict(x_test)) coef_lasso=pd.Series(lassocv.coef_, index=x_train.columns).sort_values(ascending =False) evaluate(lasso,x_test,y_test,x_train,y_train) print('Time elapsed: %.4f seconds' % (end-start)) y_lasso_predict=lasso.predict(train_linear_fea) x_line = np.arange(700000) y_line=x_line plt.scatter(real_train_tar,np.expm1(y_lasso_predict)) plt.plot(x_line, y_line, color='r') plt.xlabel('Actual Sale Price') plt.ylabel('Predict Sle Price') test_prediction_lasso=np.expm1(lasso.predict(test_linear)) """ . Ridge model """ ridgecv = RidgeCV(alphas = np.logspace(-5, 4, 400)) ridgecv.fit(x_train, y_train) ridgecv_score = ridgecv.score(x_train, y_train) ridgecv_alpha = ridgecv.alpha_ print("Best alpha : ", ridgecv_alpha, "Score: ",ridgecv_score) coef=pd.Series(ridgecv.coef_, index=x_train.columns).sort_values(ascending =False) start=time.time() ridge =Ridge(normalize = True) ridge.set_params(alpha=ridgecv_alpha,max_iter = 10000) ridge.fit(x_train, y_train) end=time.time() mean_squared_error(y_test, ridge.predict(x_test)) coef_ridge=pd.Series(ridgecv.coef_, index=x_train.columns).sort_values(ascending =False) evaluate(ridge,x_test,y_test,x_train,y_train) print('Time elapsed: %.4f seconds' % (end-start)) y_ridge_predict=ridge.predict(train_linear_fea) x_line = np.arange(700000) y_line=x_line plt.scatter(real_train_tar,np.expm1(y_ridge_predict)) plt.plot(x_line, y_line, color='r') plt.xlabel('Actual Sale Price') plt.ylabel('Predict Sle Price') test_prediction_ridge=np.expm1(ridge.predict(test_linear)) """ . 
    # ----- Random Forest -----
    #train=train.drop(columns=['DateSold'])
    #test=test.drop(columns=['DateSold'])
    #X_train=train.drop(columns=['SalePrice'])
    #Y_train=train['SalePrice']
    X_train = train_linear_fea
    Y_train = train_linear_tar
    x_train_rf, x_test_rf, y_train_rf, y_test_rf = train_test_split(X_train, Y_train,
                                                                    test_size=0.2, random_state=0)
    n_estimators = [int(x) for x in np.linspace(start=100, stop=2000, num=20)]
    max_features = ['auto', 'sqrt']
    max_depth = [int(x) for x in np.linspace(10, 110, num=11)]
    min_samples_split = [2, 5, 10]
    min_samples_leaf = [1, 2, 4]
    bootstrap = [True, False]
    random_grid = {'n_estimators': n_estimators,
                   'max_features': max_features,
                   'max_depth': max_depth,
                   'min_samples_split': min_samples_split,
                   'min_samples_leaf': min_samples_leaf,
                   'bootstrap': bootstrap}
    rf = RandomForestRegressor()
    # Random search of parameters, using 3-fold cross-validation,
    # searching across 100 different combinations, and using all available cores
    rf_random = RandomizedSearchCV(estimator=rf, param_distributions=random_grid,
                                   n_iter=100, cv=3, verbose=2, random_state=42, n_jobs=-1)
    rf_random.fit(X_train, Y_train)
    #rf_random.fit(x_train_rf, y_train_rf)
    rf_random.best_params_
    # Random search allowed us to narrow down the range for each hyperparameter. Now that we
    # know where to concentrate the search, we can explicitly specify every combination of
    # settings to try.
    param_grid = {
        'bootstrap': [False],
        'max_depth': [80, 90, 100, 110, 120, 130],
        'max_features': [2, 3],
        'min_samples_leaf': [1, 2, 3, 4],
        'min_samples_split': [2, 4, 6, 8, 10, 12],
        'n_estimators': [600, 700, 800, 900, 1000]
    }
    # Create a base model
    rf = RandomForestRegressor()
    # Instantiate the grid search model
    grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)
    #grid_search.fit(x_train, y_train)
    grid_search.fit(X_train, Y_train)
    grid_search.best_params_
    best_random = grid_search.best_estimator_
    start = time.time()
    best_random.fit(x_train_rf, y_train_rf)
    end = time.time()
    evaluate(best_random, x_test_rf, y_test_rf, x_train_rf, y_train_rf)
    print('Time elapsed: %.4f seconds' % (end - start))
    y_rf_predict = best_random.predict(train_linear_fea)
    x_line = np.arange(700000)
    y_line = x_line
    plt.scatter(real_train_tar, np.expm1(y_rf_predict))
    plt.plot(x_line, y_line, color='r')
    plt.xlabel('Actual Sale Price')
    plt.ylabel('Predicted Sale Price')
    importance_rf = pd.DataFrame({'features': train_linear_fea.columns,
                                  'imp': best_random.feature_importances_}).\
        sort_values('imp', ascending=False)
    importance_top20_rf = importance_rf.iloc[:20, ]
    plt.barh(importance_top20_rf.features, importance_top20_rf.imp)
    plt.xlabel('Feature Importance')
    test_prediction_rf = np.expm1(best_random.predict(test_linear))
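    # The top-20 feature-importance bar chart is likewise repeated for each tree-based model.
    # A minimal helper sketch; the name plot_top_importances is hypothetical. It assumes a
    # fitted estimator exposing feature_importances_ and the feature names from train_linear_fea.
    def plot_top_importances(fitted_model, feature_names, top_n=20):
        importance = (pd.DataFrame({'features': feature_names,
                                    'imp': fitted_model.feature_importances_})
                      .sort_values('imp', ascending=False)
                      .head(top_n))
        plt.barh(importance.features, importance.imp)
        plt.xlabel('Feature Importance')
    # Usage, e.g. for the tuned random forest above:
    # plot_top_importances(best_random, train_linear_fea.columns)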
    # ----- Xgboost -----
    learning_rate = [round(float(x), 2) for x in np.linspace(start=.1, stop=.2, num=11)]
    # Minimum sum of instance weights needed in a child node
    min_child_weight = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
    # Maximum depth of each tree
    max_depth = [int(x) for x in np.linspace(1, 10, num=10)]
    n_estimators = [int(x) for x in np.linspace(start=100, stop=2000, num=20)]
    subsample = [0.3, 0.4, 0.5, 0.6, 0.7]
    model = xgb.XGBRegressor()
    random_grid = {'learning_rate': learning_rate,
                   'max_depth': max_depth,
                   'min_child_weight': min_child_weight,
                   'subsample': subsample,
                   'n_estimators': n_estimators
                   }
    # Make a RandomizedSearchCV object with the model and specified hyperparameters
    xgb_random = RandomizedSearchCV(estimator=model, param_distributions=random_grid,
                                    n_iter=1000, cv=5, verbose=2, random_state=42, n_jobs=-1)
    start = time.time()
    # Fit models
    xgb_random.fit(X_train, Y_train)
    xgb_random.best_params_
    """
    best_params_={'learning_rate': 0.1,
                  'max_depth': 2,
                  'min_child_weight': 4,
                  'n_estimators': 900,
                  'subsample': 0.5}
    """
    model_xgb = XGBRegressor(**xgb_random.best_params_)
    #model_xgb = XGBRegressor(**best_params_)
    start = time.time()
    model_xgb.fit(x_train_rf, y_train_rf)
    end = time.time()
    evaluate(model_xgb, x_test_rf, y_test_rf, x_train_rf, y_train_rf)
    print('Time elapsed: %.4f seconds' % (end - start))
    y_xgb_predict = model_xgb.predict(train_linear_fea)
    x_line = np.arange(700000)
    y_line = x_line
    plt.scatter(real_train_tar, np.expm1(y_xgb_predict))
    plt.plot(x_line, y_line, color='r')
    plt.xlabel('Actual Sale Price')
    plt.ylabel('Predicted Sale Price')
    importance_xgb = pd.DataFrame({'features': train_linear_fea.columns,
                                   'imp': model_xgb.feature_importances_}).\
        sort_values('imp', ascending=False)
    importance_top20_xgb = importance_xgb.iloc[:20, ]
    plt.barh(importance_top20_xgb.features, importance_top20_xgb.imp)
    plt.xlabel('Feature Importance')
    test_prediction_xgb = np.expm1(model_xgb.predict(test_linear))
    return (test_prediction_lasso, test_prediction_ridge, test_prediction_rf, test_prediction_xgb,
            y_lasso_predict, y_ridge_predict, y_rf_predict, y_xgb_predict)
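# A minimal sketch of consuming the eight arrays returned by Model(): the four test
# predictions can be combined with a simple average. The equal 0.25 weights are an
# illustrative assumption, not tuned values; train_linear and test_linear are assumed
# to be the prepared DataFrames passed to Model elsewhere in the script.
test_lasso, test_ridge, test_rf, test_xgb, \
    tr_lasso, tr_ridge, tr_rf, tr_xgb = Model(train_linear, test_linear)
blended_test_prediction = (0.25 * test_lasso + 0.25 * test_ridge +
                           0.25 * test_rf + 0.25 * test_xgb)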
def XgBoost(train_linear, test_linear):
    from xgboost import XGBRegressor
    learning_rate = [round(float(x), 2) for x in np.linspace(start=.1, stop=.2, num=11)]
    # Minimum sum of instance weights needed in a child node
    min_child_weight = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
    # Maximum depth of each tree
    max_depth = [int(x) for x in np.linspace(1, 10, num=10)]
    n_estimators = [int(x) for x in np.linspace(start=100, stop=2000, num=20)]
    subsample = [0.3, 0.4, 0.5, 0.6, 0.7]
    model = xgb.XGBRegressor()
    random_grid = {'learning_rate': learning_rate,
                   'max_depth': max_depth,
                   'min_child_weight': min_child_weight,
                   'subsample': subsample,
                   'n_estimators': n_estimators
                   }
    # Make a RandomizedSearchCV object with the model and specified hyperparameters
    xgb_random = RandomizedSearchCV(estimator=model, param_distributions=random_grid,
                                    n_iter=1000, cv=5, verbose=2, random_state=42, n_jobs=-1)
    start = time.time()
    # Fit models
    # Note: X_train, Y_train, x_train_rf, x_test_rf, y_train_rf, y_test_rf, train_linear_fea,
    # real_train_tar and evaluate are taken from the surrounding scope (as defined in Model
    # above), not from this function's arguments.
    xgb_random.fit(X_train, Y_train)
    xgb_random.best_params_
    """
    best_params_={'learning_rate': 0.1,
                  'max_depth': 2,
                  'min_child_weight': 4,
                  'n_estimators': 900,
                  'subsample': 0.5}
    """
    model_xgb = XGBRegressor(**xgb_random.best_params_)
    #model_xgb = XGBRegressor(**best_params_)
    start = time.time()
    model_xgb.fit(x_train_rf, y_train_rf)
    end = time.time()
    evaluate(model_xgb, x_test_rf, y_test_rf, x_train_rf, y_train_rf)
    print('Time elapsed: %.4f seconds' % (end - start))
    y_xgb_predict = model_xgb.predict(train_linear_fea)
    x_line = np.arange(700000)
    y_line = x_line
    plt.scatter(real_train_tar, np.expm1(y_xgb_predict))
    plt.plot(x_line, y_line, color='r')
    plt.xlabel('Actual Sale Price')
    plt.ylabel('Predicted Sale Price')
    importance_xgb = pd.DataFrame({'features': train_linear_fea.columns,
                                   'imp': model_xgb.feature_importances_}).\
        sort_values('imp', ascending=False)
    importance_xgb = importance_xgb[importance_xgb['features'] != 'Id']
    importance_top20_xgb = importance_xgb.iloc[:20, ]
    plt.barh(importance_top20_xgb.features, importance_top20_xgb.imp)
    plt.xlabel('Feature Importance')
    test_prediction_xgb = np.expm1(model_xgb.predict(test_linear))
    write_pkl(xgb_random.best_params_,
              '/Users/vickywinter/Documents/NYC/Machine Learning Proj/Pickle/xgb_params.pkl')
    return test_prediction_xgb
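# write_pkl is called above but not defined in this section. A minimal sketch of what such a
# helper might look like, assuming the standard-library pickle module; this is an assumption
# about the original helper, not its actual definition.
import pickle

def write_pkl(obj, path):
    # Serialize obj (e.g. a best_params_ dict) to a pickle file at the given path
    with open(path, 'wb') as f:
        pickle.dump(obj, f)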