validation_predict_dir = validation_predict_dir_prefix + str(week) + '\\'
test_predict_dir = test_predict_dir_prefix + str(week) + '\\'

validation_predicts = []
test_predicts = []

# Collect each base model's out-of-fold predictions for this week.
validation_paths = os.listdir(validation_predict_dir)
for p in validation_paths:
    validation_predicts.append(pd.read_csv(validation_predict_dir + p)['visitors_predict'].tolist())
    # Every validation file carries the same target column, so the last read wins.
    validation_predicts_y = np.log1p(np.array(pd.read_csv(validation_predict_dir + p)['visitors'].tolist()))

test_paths = os.listdir(test_predict_dir)
for p in test_paths:
    test = pd.read_csv(test_predict_dir + p)
    test_predicts.append(test['visitors'].tolist())

# Stack the base-model predictions column-wise to build the meta-features.
validation_predicts = [np.array(predicts).reshape(-1, 1) for predicts in validation_predicts]
test_predicts = [np.array(predicts).reshape(-1, 1) for predicts in test_predicts]
train_merge = np.concatenate(validation_predicts, axis=1)
test_merge = np.concatenate(test_predicts, axis=1)

# Meta-learner; alternatives left commented out for reference.
# lr = Ridge(random_state=1234, alpha=0.01, normalize=True)
# lr = RandomForestRegressor(n_estimators=200, n_jobs=-1, max_depth=7, random_state=1234)
lr = XGBRegressor(subsample=0.7, colsample_bytree=0.8, n_jobs=-1, random_state=1234,
                  reg_lambda=0.01, reg_alpha=0.01, n_estimators=200, max_depth=7)
# lr = SVR(C=10)

lr.fit(train_merge, validation_predicts_y)
predict = lr.predict(test_merge)
print('RMSLE stacking: ', RMSLE(validation_predicts_y, lr.predict(train_merge)))

test['visitors'] = np.expm1(predict)
test['visitors'] = test['visitors'].clip(lower=0.)
test[['id', 'visitors']].to_csv(data_dir + 'submission_use_stacking3_' + str(week) + '.csv', index=False)
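# The snippet above calls an RMSLE helper that is defined elsewhere. A minimal
# sketch of what it is assumed to compute: the targets are already
# log1p-transformed, so plain RMSE in that space equals RMSLE on the raw counts.
import numpy as np

def RMSLE(y_true_log, y_pred_log):
    """Root mean squared error on log1p-space targets (hypothetical helper)."""
    return np.sqrt(np.mean((np.asarray(y_true_log) - np.asarray(y_pred_log)) ** 2))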
def __init__(self):
    self.__xgb = XGBRegressor()
    # Silence XGBoost's own logger and route our messages to a file.
    logging.getLogger('XGBoost').setLevel(logging.ERROR)
    logging.basicConfig(filename='xgboost.log',
                        format='%(levelname)s %(asctime)s: %(message)s',
                        level=logging.DEBUG)
    'Id', 'groupId', 'matchId', 'assists', 'boosts', 'damageDealt', 'DBNOs',
    'headshotKills', 'heals', 'killPlace', 'killPoints', 'kills', 'killStreaks',
    'longestKill', 'matchDuration', 'maxPlace', 'numGroups', 'rankPoints',
    'revives', 'rideDistance', 'roadKills', 'swimDistance', 'teamKills',
    'vehicleDestroys', 'walkDistance', 'weaponsAcquired', 'winPoints',
    'matchType_duo', 'matchType_duo-fpp', 'matchType_solo', 'matchType_solo-fpp',
    'matchType_squad', 'matchType_squad-fpp', 'winPlacePerc'
]]

# generate attributes
X = clean_data.iloc[:, 3:33]
# generate regression target
Y = clean_data.iloc[:, 33]

# parameter grid to feed into the GridSearchCV model
param_test = {
    'learning_rate': [0.02, 0.03, 0.1],
    'min_child_weight': [4, 6, 8],
    'max_depth': [8, 10],
    'subsample': [0.6, 0.4],
    'n_estimators': [300, 500]
}

# define the target model; parameters not in the grid are left at their defaults
# (the original's n_estimators=-1 is invalid -- n_jobs=-1 was almost certainly intended)
model = XGBRegressor(n_jobs=-1)
grid = GridSearchCV(model, param_test, cv=5, scoring='neg_mean_absolute_error')

# fit the data into the grid search
print("begin to find parameters")
grid.fit(X, Y)

# print the statistics and choose the best parameters for our target model
print(grid.best_score_, grid.best_estimator_, grid.best_params_)
""" y_df = pd.DataFrame({'Id': test.Id, 'SalePrice': pred_y}) return y_df.to_csv(file_name, index = False) #Reading Data iowa_data = pd.read_csv("Iowa Housing Prices.csv") #Setting iv and dv X = iowa_data.drop(labels = ["Id", "SalePrice"], axis = 1) X_imputed = impute_extension(X) X_OHE = OHE(X) X = X_imputed.join(X_OHE) y = iowa_data.SalePrice #Model XGBR_model = XGBRegressor(learning_rate=0.05, n_estimators= 1000) #Cross Validation from sklearn.model_selection import cross_val_score scores = cross_val_score(XGBR_model, X, y, scoring = "neg_mean_squared_log_error", cv = 5) rmse_log_score = np.sqrt(scores.mean()*-1) #Partial Dependence Plot #plot_importance(XGBR_model.fit(X,y), importance_type = "gain", max_num_features = 10) #Applying on Test data #Data preprocessing test = pd.read_csv("test.csv") test_X_imputed = impute_extension(test) test_X_OHE = OHE(test) test_X = test_X_imputed.join(test_X_OHE)
X = clean_training_data.drop(columns=target_variable).values
y = clean_training_data[target_variable].values
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SPLIT_SEED)

# Remove outliers from the training set
X_train, y_train = remove_outliers(X_train, y_train)

''' 2. Create and train model(s) '''
models = [('GradientBoostingRegressor',
           GradientBoostingRegressor(n_estimators=400, max_depth=4,
                                     max_features=0.9, warm_start=True)),
          ('XGBRegressor',
           XGBRegressor(objective='reg:squarederror', n_estimators=400,
                        max_depth=4, subsample=0.9)),
          ('Deep Neural Network', create_nn_model(X_train.shape[1]))]

for name, model in models:
    # Train model on the training dataset
    start_time = time.time()
    print('--- Starting Training ({}) ---'.format(name))
    train_model(model, X_train, y_train)
    training_time = time.time() - start_time
    print('--- Training took {} sec ---'.format(training_time))

    # Test model
    print('--- Testing ({}) ---'.format(name))
    train_accuracy, train_pred = test_model(model, X_train, y_train)
    test_accuracy, test_pred = test_model(model, X_test, y_test)
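# train_model and test_model are defined elsewhere in this project; a minimal
# sketch under the assumption that they wrap fit() and a scored predict():
from sklearn.metrics import r2_score

def train_model(model, X, y):
    # Works for both the sklearn-style boosters and the compiled Keras model.
    model.fit(X, y)

def test_model(model, X, y):
    """Return (R^2 score, predictions) -- hypothetical reconstruction."""
    pred = model.predict(X)
    return r2_score(y, pred), pred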
# LightGBM params
lgb_params = {}
lgb_params['n_estimators'] = 120
# lgb_params['max_bin'] = 2
lgb_params['learning_rate'] = 0.01       # shrinkage_rate
lgb_params['metric'] = 'mae'
lgb_params['sub_feature'] = 0.6
lgb_params['subsample'] = 0.8            # sub_row
lgb_params['num_leaves'] = 11            # num_leaf
lgb_params['min_data'] = 5               # min_data_in_leaf
lgb_params['verbose'] = -1
lgb_params['feature_fraction_seed'] = 2
lgb_params['bagging_seed'] = 3

# XGB model
xgb_model = XGBRegressor(n_estimators=120)
gbdt_model = GradientBoostingRegressor()
# LightGBM model
lgb_model = LGBMRegressor(**lgb_params)
# RF model
rf_model = RandomForestRegressor(**rf_params)
# ET model
et_model = ExtraTreesRegressor()
# SVR model
# SVR is too slow on more than 10,000 samples
# svr_model = SVR(kernel='rbf', C=5.0, epsilon=0.005)
# Left-hand side of the split restored from TPOT's standard export template
# (these names are used by the fit/predict calls below):
training_features, testing_features, training_target, testing_target = \
    train_test_split(features, tpot_data['target'].values, random_state=42)

# Score on the training set was: -15.21590991057142
exported_pipeline = make_pipeline(
    SelectFwe(score_func=f_regression, alpha=0.026000000000000002),
    StackingEstimator(estimator=LinearSVR(C=0.1, dual=False, epsilon=0.001,
                                          loss="squared_epsilon_insensitive", tol=0.01)),
    StackingEstimator(
        estimator=GradientBoostingRegressor(alpha=0.75, learning_rate=0.001,
                                            loss="quantile", max_depth=2,
                                            max_features=0.35000000000000003,
                                            min_samples_leaf=15, min_samples_split=17,
                                            n_estimators=100,
                                            subsample=0.7500000000000001)),
    Normalizer(norm="l1"),
    XGBRegressor(learning_rate=0.01, max_depth=6, min_child_weight=8,
                 n_estimators=100, nthread=1, subsample=0.1))

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
x = x_data.values
y = y_data.values
x_pred = test.values

train_gap = np.load('./dacon/comp1/train_gap.npy')
test_gap = np.load('./dacon/comp1/test_gap.npy')

x = np.hstack((x, train_gap))
x_pred = np.hstack((x_pred, test_gap))

x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.8, random_state=33)

xgb = XGBRegressor()
model = MultiOutputRegressor(xgb)
model.fit(x_train, y_train)

y_pred1 = model.predict(x_test)
print('mae: ', mean_absolute_error(y_test, y_pred1))

## feature importances
def plot_feature_importances(model):
    plt.figure(figsize=(10, 40))
    n_features = x_data.shape[1]  # n_features = number of columns
    plt.barh(
featimp.index = mergedassetmacrolag.columns[0:len(mergedassetmacrolag.columns) - 1]
featimp = featimp.sort_values(by=['FeatImp'])
featimp.plot(kind='bar')

rf_validationpred = rfreg.predict(
    X_validation.iloc[:, 0:len(X_validation.columns)].values)
rf_validationact = yval
valpreddiff = (rf_validationpred - rf_validationact)
valpreddiffpct = ((rf_validationpred - rf_validationact) / rf_validationpred)
print('act:%.3f, pred:%.3f, diff:%.3f, diffpct:%.3f' %
      (rf_validationact, rf_validationpred, valpreddiff, valpreddiffpct))

######## Run XGBoost algo on assets with macro data ##########
xgbreg = XGBRegressor(objective='reg:squarederror', learning_rate=0.1, max_depth=4, seed=1)
xgbreg.fit(X_train, y_train)

# Predicting asset value with XGBoost regression
xgb_pred = xgbreg.predict(X_test)

# Check the importance of each feature
xgbimp = xgbreg.feature_importances_

xgbmse = mean_squared_error(y_test, xgb_pred)
xgbrmse = np.sqrt(xgbmse)
xgbmae = mean_absolute_error(y_test, xgb_pred)
xgbr2 = r2_score(y_test, xgb_pred)
print('MSE:%.4f, RMSE:%.4f, MAE:%.4f, R2:%.4f' % (xgbmse, xgbrmse, xgbmae, xgbr2))
select_x_train = selection.transform(x_train)
select_x_test = selection.transform(x_test)
select_x_pred = selection.transform(x_pred)
print(select_x_train.shape[1])

parameter = {
    'n_estimators': [100, 200, 400],
    'learning_rate': [0.05, 0.07, 0.1],
    'colsample_bytree': [0.7, 0.8, 0.9],
    'colsample_bylevel': [0.7, 0.8, 0.9],
    'max_depth': [4, 5, 6]
}

# search = RandomizedSearchCV(XGBRegressor(gpu_id=0, tree_method='gpu_hist'), parameter, cv=5)
search = RandomizedSearchCV(XGBRegressor(), parameter, cv=5)
multi_search = MultiOutputRegressor(search, n_jobs=-1)
multi_search.fit(select_x_train, y_train)

y_pred = multi_search.predict(select_x_test)
mae = mean_absolute_error(y_test, y_pred)
score = r2_score(y_test, y_pred)
print("Thresh=%.3f, n = %d, R2 : %.2f%%, MAE : %.3f" %
      (thresh, select_x_train.shape[1], score * 100.0, mae))

y_predict = multi_search.predict(select_x_pred)

# submission
a = np.arange(10000, 20000)
submission = pd.DataFrame(y_predict, a)
submission.to_csv('./dacon/comp1/sub/select_XGB03_%i_%.5f.csv' % (i, mae),
                  index=True, header=['hhb', 'hbo2', 'ca', 'na'], index_label='id')
Y_train = Y_train.clip(0, 20)
Y_valid = Y_valid.clip(0, 20)

# In[27]:
from xgboost import XGBRegressor
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 12, 4

# In[28]:
model = XGBRegressor(
    max_depth=10,
    n_estimators=1000,
    min_child_weight=0.5,
    colsample_bytree=0.8,
    subsample=0.8,
    eta=0.1,
    # tree_method='gpu_hist',
    seed=42)

model.fit(X_train, Y_train,
          eval_metric="rmse",
          eval_set=[(X_train, Y_train), (X_valid, Y_valid)],
          verbose=True,
          early_stopping_rounds=20)

pickle.dump(model, open('model.pkl', 'wb'))
model = pickle.load(open('model.pkl', 'rb'))
def get_model_from_name(model_name, training_params=None):
    # For Keras
    epochs = 250
    if os.environ.get('is_test_suite', 0) == 'True' and model_name[:12] == 'DeepLearning':
        print('Heard that this is the test suite. Limiting number of epochs, which will increase training speed dramatically at the expense of model accuracy')
        epochs = 30

    all_model_params = {
        'LogisticRegression': {'n_jobs': -2},
        'RandomForestClassifier': {'n_jobs': -2},
        'ExtraTreesClassifier': {'n_jobs': -1},
        'AdaBoostClassifier': {'n_estimators': 10},
        'SGDClassifier': {'n_jobs': -1},
        'Perceptron': {'n_jobs': -1},
        'LinearSVC': {'dual': False},
        'LinearRegression': {'n_jobs': -2},
        'RandomForestRegressor': {'n_jobs': -2},
        'LinearSVR': {'dual': False, 'loss': 'squared_epsilon_insensitive'},
        'ExtraTreesRegressor': {'n_jobs': -1},
        'MiniBatchKMeans': {'n_clusters': 8},
        'GradientBoostingRegressor': {'presort': False, 'learning_rate': 0.05, 'warm_start': True},
        'GradientBoostingClassifier': {'presort': False, 'learning_rate': 0.05, 'warm_start': True},
        'SGDRegressor': {'shuffle': False},
        'PassiveAggressiveRegressor': {'shuffle': False},
        'AdaBoostRegressor': {'n_estimators': 10},
        'XGBRegressor': {'nthread': -1, 'n_estimators': 200},
        'XGBClassifier': {'nthread': -1, 'n_estimators': 200},
        'LGBMRegressor': {},
        'LGBMClassifier': {},
        'DeepLearningRegressor': {'epochs': epochs, 'batch_size': 50, 'verbose': 2},
        'DeepLearningClassifier': {'epochs': epochs, 'batch_size': 50, 'verbose': 2}
    }

    model_params = all_model_params.get(model_name, None)
    if model_params is None:
        model_params = {}

    if training_params is not None:
        print('Now using the model training_params that you passed in:')
        print(training_params)
        # Overwrite our stock params with what the user passes in (i.e., if the user wants 10,000 trees, we will let them do it)
        model_params.update(training_params)
        print('After overwriting our defaults with your values, here are the final params that will be used to initialize the model:')
        print(model_params)

    model_map = {
        # Classifiers
        'LogisticRegression': LogisticRegression(),
        'RandomForestClassifier': RandomForestClassifier(),
        'RidgeClassifier': RidgeClassifier(),
        'GradientBoostingClassifier': GradientBoostingClassifier(),
        'ExtraTreesClassifier': ExtraTreesClassifier(),
        'AdaBoostClassifier': AdaBoostClassifier(),
        'SGDClassifier': SGDClassifier(),
        'Perceptron': Perceptron(),
        'PassiveAggressiveClassifier': PassiveAggressiveClassifier(),
        'LinearSVC': LinearSVC(),

        # Regressors
        'LinearRegression': LinearRegression(),
        'RandomForestRegressor': RandomForestRegressor(),
        'Ridge': Ridge(),
        'LinearSVR': LinearSVR(),
        'ExtraTreesRegressor': ExtraTreesRegressor(),
        'AdaBoostRegressor': AdaBoostRegressor(),
        'RANSACRegressor': RANSACRegressor(),
        'GradientBoostingRegressor': GradientBoostingRegressor(),
        'Lasso': Lasso(),
        'ElasticNet': ElasticNet(),
        'LassoLars': LassoLars(),
        'OrthogonalMatchingPursuit': OrthogonalMatchingPursuit(),
        'BayesianRidge': BayesianRidge(),
        'ARDRegression': ARDRegression(),
        'SGDRegressor': SGDRegressor(),
        'PassiveAggressiveRegressor': PassiveAggressiveRegressor(),

        # Clustering
        'MiniBatchKMeans': MiniBatchKMeans()
    }

    # Optional dependencies are registered only if their import succeeded.
    if xgb_installed:
        model_map['XGBClassifier'] = XGBClassifier()
        model_map['XGBRegressor'] = XGBRegressor()
    if lgb_installed:
        model_map['LGBMRegressor'] = LGBMRegressor()
        model_map['LGBMClassifier'] = LGBMClassifier()
    if keras_installed:
        model_map['DeepLearningClassifier'] = KerasClassifier(build_fn=make_deep_learning_classifier)
        model_map['DeepLearningRegressor'] = KerasRegressor(build_fn=make_deep_learning_model)

    try:
        model_without_params = model_map[model_name]
    except KeyError as e:
        print('It appears you are trying to use a library that is not available when we try to import it, or using a value for model_names that we do not recognize')
        raise(e)

    model_with_params = model_without_params.set_params(**model_params)

    return model_with_params
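# Minimal usage sketch for the factory above (the override values and the
# training data are illustrative, not from the source):
model = get_model_from_name('XGBRegressor', training_params={'n_estimators': 500})
# model.fit(X_train, y_train)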
# Shape of train and test data
print(X_train_temp.shape, X_val_temp.shape)

# --------------
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error

# Code starts here
dt = DecisionTreeRegressor(random_state=5)
dt.fit(X_train, y_train)
accuracy = dt.score(X_val, y_val)  # R^2 on the validation set
y_pred = dt.predict(X_val)
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
print(accuracy)
print(rmse)

# --------------
from xgboost import XGBRegressor

# Code starts here
xgb = XGBRegressor(max_depth=50, learning_rate=0.83, n_estimators=100)
xgb.fit(X_train, y_train)
accuracy = xgb.score(X_val, y_val)
y_pred = xgb.predict(X_val)
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
print(accuracy)
print(rmse)
# Code ends here
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import accuracy_score, r2_score

# dataset = load_breast_cancer()
# x = dataset.data
# y = dataset.target
x, y = load_breast_cancer(return_X_y=True)

x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.8,
                                                    shuffle=True, random_state=66)

# The number of trees (n_estimators) plays the role of epochs.
model = XGBRegressor(n_estimators=100, learning_rate=0.1)

model.fit(x_train, y_train, verbose=True,
          eval_metric=["logloss", "error"],
          eval_set=[(x_train, y_train), (x_test, y_test)],
          early_stopping_rounds=100)
# In eval_set, validation_0 is (x_train, y_train) and validation_1 is (x_test, y_test);
# the validation metrics are the ones that matter.
# Available metrics include rmse, mae, logloss, error (i.e. 1 - accuracy), and auc.

results = model.evals_result()
print("eval's results : ", results)
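# evals_result() returns nested dicts keyed by eval-set name and metric; a
# short sketch of how those learning curves are typically plotted:
import matplotlib.pyplot as plt

epochs = len(results['validation_0']['logloss'])
plt.plot(range(epochs), results['validation_0']['logloss'], label='train logloss')
plt.plot(range(epochs), results['validation_1']['logloss'], label='test logloss')
plt.xlabel('boosting round')
plt.legend()
plt.show()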
     'learning_rate': [0.1, 0.3, 0.001, 0.01],
     'max_depth': [4, 5, 6]},
    {'n_estimators': [400, 600],
     'learning_rate': [0.1, 0.001, 0.5],
     'max_depth': [4, 5, 6],
     'colsample_bytree': [0.6, 0.9, 1],
     'colsample_bylevel': [0.6, 0.7, 0.9]}]

# XGBRegressor: n_estimators is how many boosting rounds to run
grid_random = [RandomizedSearchCV]  # or GridSearchCV
kfold = KFold(n_splits=5, shuffle=True)

for i in grid_random:
    model = i(XGBRegressor(n_jobs=8, tree_method='gpu_hist', predictor='gpu_predictor'),
              parameter, cv=kfold)  # n_estimators == epochs
    model.fit(X_train, y_train, verbose=1, eval_metric=['rmse'],
              eval_set=[(X_train, y_train), (X_test, y_test)])

filename = '../data/h5/model_XGB_person.sav'
pickle.dump(model, open(filename, 'wb'))
# model = pickle.load(open(filename, 'rb'))

y_pred = model.predict(X_test)
acc = model.score(X_test, y_test)
param_grid = {
    'n_estimators': [150, 250, 350],
    'max_depth': [1, 2, 3],
    'min_samples_split': [5, 6, 7]
}
opt_models[model], cv_score, grid_results = train_model(opt_models[model],
                                                        param_grid=param_grid,
                                                        splits=splits, repeats=1)
cv_score.name = model
score_models = score_models.append(cv_score)

model = 'XGB'
opt_models[model] = XGBRegressor()

param_grid = {
    'n_estimators': [100, 200, 300, 400, 500],
    'max_depth': [1, 2, 3],
}
opt_models[model], cv_score, grid_results = train_model(opt_models[model],
                                                        param_grid=param_grid,
                                                        splits=splits, repeats=1)
cv_score.name = model
score_models = score_models.append(cv_score)

X, y = get_training_data()
#1. Data
dataset = load_diabetes()
x = dataset.data
y = dataset.target

x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.8,
                                                    random_state=66, shuffle=True)

#2. Model
# model = DecisionTreeRegressor(max_depth=4)
# model = RandomForestRegressor(max_depth=4)
# model = GradientBoostingRegressor(max_depth=4)
model = XGBRegressor(n_jobs=-1)  # the original had the typo n_job=-1

#3. Train
model.fit(x_train, y_train)

#4. Evaluate, predict
acc = model.score(x_test, y_test)

print(model.feature_importances_)
print("acc : ", acc)

#5. Visualization
import matplotlib.pyplot as plt
import numpy as np
'''
def plot_feature_importances_dataset(model):
    n_features = dataset.data.shape[1]
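# The commented-out plotting helper above is cut off; the usual version of this
# idiom (a hedged reconstruction following the common pattern) looks like:
def plot_feature_importances_dataset(model):
    n_features = dataset.data.shape[1]
    plt.barh(np.arange(n_features), model.feature_importances_, align='center')
    plt.yticks(np.arange(n_features), dataset.feature_names)
    plt.xlabel('Feature Importances')
    plt.ylabel('Features')
    plt.ylim(-1, n_features)

plot_feature_importances_dataset(model)
plt.show()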
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# Preprocessing for numerical data
numerical_transformer = SimpleImputer(strategy='constant')

# Preprocessing for categorical data
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols)])

from xgboost import XGBRegressor

# Define the model
my_model_1 = XGBRegressor(random_state=0)  # Your code here

clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('model', my_model_1)])
clf.fit(X_train, y_train)  # Your code here

from sklearn.metrics import mean_absolute_error

# Get predictions
predictions_1 = clf.predict(X_valid)  # Your code here

# Calculate MAE
mae_1 = mean_absolute_error(y_valid, predictions_1)  # Your code here

# Uncomment to print MAE
print("Mean Absolute Error:", mae_1)
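# Because the preprocessing lives inside the Pipeline, the same object can be
# cross-validated without leaking validation rows into the imputers/encoder;
# a short sketch using the standard sklearn scoring name:
from sklearn.model_selection import cross_val_score
cv_mae = -cross_val_score(clf, X_train, y_train, cv=5,
                          scoring='neg_mean_absolute_error').mean()
print("CV MAE:", cv_mae)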
# d = np.argmax(cumsum >= 0.95) + 1
# print('cumsum >= 0.95 : ', cumsum >= 0.95)
# print('d : ', d)

# import matplotlib.pyplot as plt
# plt.plot(cumsum)
# plt.grid()
# plt.show()

#2. Model
# model = RandomForestRegressor()
model = XGBRegressor(n_jobs=-1, use_label_encoder=False, eval_metric='mlogloss')

# cv expects an int or a splitter instance, not the KFold class itself
kfold = KFold(n_splits=5, shuffle=True)
score = cross_val_score(model, x_train, y_train, cv=kfold)

# model.fit(x_train, y_train, eval_metric='mlogloss')
# r2 = model.score(x_test, y_test)
# print(model.feature_importances_)
# print('r2 : ', r2)
print(score)

# (442, 7)
# (442,)
# [0.34307936 0.41395492 0.57884725 0.32314566 0.24108945]
data = train_df.append(test_df)
# data = autoclean(data)
train, test = data[0:len(train_df)], data[len(train_df):]

# Organize our data for training
X = train.drop(["y"], axis=1)
Y = train["y"]
x_test = test.drop(["y"], axis=1)
X, X_Val, Y, Y_Val = train_test_split(X, Y)

# A parameter grid for XGBoost
params = {'min_child_weight': [4, 5],
          'gamma': [i / 10.0 for i in range(3, 6)],
          'subsample': [i / 10.0 for i in range(6, 11)],
          'colsample_bytree': [i / 10.0 for i in range(6, 11)],
          'max_depth': [2, 3, 4, 5]}

# Initialize XGB and GridSearch
xgb = XGBRegressor(nthread=4)
grid = GridSearchCV(xgb, params)
grid.fit(X, Y)

# Print the R^2 score
print(r2_score(Y_Val, grid.best_estimator_.predict(X_Val)))

# Save the file
y_test = grid.best_estimator_.predict(x_test)
results_df = pd.DataFrame(data={'y': y_test})
ids = test_df["ID"]
joined = pd.DataFrame(ids).join(results_df)
joined.to_csv("mercedes.csv", index=False)
# This scored 0.5563 for me on the LB
parameters = [{
    'max_depth': [13, 14, 15, 16],
    'criterion': ['mse'],
    'n_estimators': [298, 299],
    'max_features': ['sqrt'],
}]
grid_search = GridSearchCV(estimator=rfr, param_grid=parameters, cv=10)
grid_search = grid_search.fit(X, y)
print("Best score:", grid_search.best_score_)
print("Best parameters:", grid_search.best_params_)

# =============================================================================
# STEP 7: BONUS Use XGBoost
# =============================================================================
# Fitting XGBoost to the dataset
xgbr = XGBRegressor()
xgbr.fit(X_train, y_train)

# Cross-validation (10-fold)
xgbr_score = cross_val_score(estimator=xgbr, X=X_train, y=y_train, cv=10)
print("----------------------------------------------")
print("Step 7:")
print("XGBoost score:", xgbr_score)
print("Mean score:", xgbr_score.mean())
print("Standard Deviation:", xgbr_score.std())
print("----------------------------------------------")

# Show feature scores in descending order
ftscrxgboost = sorted(zip(
    map(lambda x: round(x, 4), xgbr.feature_importances_),
    backupdataset.columns.values),
rf = RandomForestRegressor(n_estimators=100, random_state=0)
rf.fit(x_train, y_train)
y_pred = rf.predict(x_test)
plt.scatter(y_test, y_pred)
rf.score(x_test, y_test)
print('MAE :', " ", metrics.mean_absolute_error(y_test, y_pred))
print('MSE :', " ", metrics.mean_squared_error(y_test, y_pred))
print('RMSE :', " ", np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

# XGBoost
xgb = XGBRegressor(n_estimators=500, max_depth=4, learning_rate=0.1,
                   early_stopping_rounds=10)
xgb.fit(x_train, y_train, eval_set=[(x_test, y_test)], verbose=False)
y_pred = xgb.predict(x_test)
plt.scatter(y_test, y_pred)
xgb.score(x_test, y_test)
plot_importance(xgb)
print('MAE :', " ", metrics.mean_absolute_error(y_test, y_pred))
print('MSE :', " ", metrics.mean_squared_error(y_test, y_pred))
print('RMSE :', " ", np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
from sklearn.model_selection import train_test_split
xtrain, xtest, ytrain, ytest = train_test_split(X, Y, test_size=0.3, random_state=0)

# In[79]:
from sklearn.model_selection import train_test_split
xtrains, xtests, ytrains, ytests = train_test_split(Xs, Y, test_size=0.3, random_state=0)

# In[56]:
from xgboost import XGBRegressor
model = XGBRegressor()
model.fit(xtrain, ytrain)

# In[80]:
from xgboost import XGBRegressor
model = XGBRegressor()
model.fit(xtrains, ytrains)  # note: this rebinds `model` to the fit on the scaled data

# In[ ]:
ypred = model.predict(xtest)  # careful: `model` was last fit on xtrains, not xtrain
logging.info('2. RandomForestRegressor - start predict')
rf.predict(train[col])
score_rf = test_model(rf, X_test, y_test)
print(score_rf)
# train_error_rf = round(mean_squared_error(y_train, rf.predict(X_train)), 3)
# test_error_rf = round(mean_squared_error(y_test, rf.predict(X_test)), 3)

logging.info('2.1 Feature importances...')
importances = rf.feature_importances_
std = np.std([tree.feature_importances_ for tree in rf.estimators_], axis=0)
indices = np.argsort(importances)[::-1]

dtrain = xgboost.DMatrix(train[col], label=y)

logging.info('3. Quick&Dirty XGBoost...')
xgb = XGBRegressor()
xgb.fit(X_train, y_train)
# train_error_xgb = round(mean_squared_error(y_train, xgb.predict(X_train)), 3)
# test_error_xgb = round(mean_squared_error(y_test, xgb.predict(X_test)), 3)
# print('XGBoost train error: {}'.format(train_error_xgb))
# print('XGBoost test error: {}'.format(test_error_xgb))
score_xgb = test_model(xgb, X_test, y_test)
logging.info('3. End XGBoost...')

params = dict(max_depth=list(range(5, 10)),
              n_estimators=[100, 700],
              learning_rate=np.arange(0, 1, 0.05))

logging.info('4. Start Grid Search')
grid_search = GridSearchCV(xgb, param_grid=params, n_jobs=-1).fit(X_train, y_train)
logging.info('4. End Grid Search')

# Summarize the results of the grid search
print('Best Estimator: {}'.format(grid_search.best_estimator_))
print('Best Parameters: {}'.format(grid_search.best_params_))
print(x_train.shape)  # (8000, 71)
print(x_test.shape)   # (2000, 71)
print(y_train.shape)  # (8000, 4)
print(y_test.shape)   # (2000, 4)

# # y_train1 = y_train[:, 0]
# # y_train2 = y_train[:, 1]
# # y_train3 = y_train[:, 2]
# # y_train4 = y_train[:, 3]

# # y_test1 = y_test[:, 0]
# # y_test2 = y_test[:, 1]
# # y_test3 = y_test[:, 2]
# # y_test4 = y_test[:, 3]

xgbr = XGBRegressor()
# model.fit(x_train, y_train)
# score = model.score(x_test, y_test)
# print('R2 :', score)

model = MultiOutputRegressor(xgbr)
model.fit(x_train, y_train)
# print(len(model.estimators_))
# print(model.estimators_[0].feature_importances_)

# One feature-selection sweep per output: sort each estimator's importances and
# retrain on progressively smaller feature subsets.
for i in range(len(model.estimators_)):
    threshold = np.sort(model.estimators_[i].feature_importances_)
    for thresh in threshold:
        selection = SelectFromModel(model.estimators_[i], threshold=thresh, prefit=True)
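        # (The loop body is truncated in the source; the lines below are a
        # hedged reconstruction following the pattern of the sibling snippet above.)
        select_x_train = selection.transform(x_train)
        select_x_test = selection.transform(x_test)
        sub_model = XGBRegressor()
        sub_model.fit(select_x_train, y_train[:, i])
        y_pred_i = sub_model.predict(select_x_test)
        print("output %d, thresh=%.3f, n=%d, MAE: %.3f" %
              (i, thresh, select_x_train.shape[1],
               mean_absolute_error(y_test[:, i], y_pred_i)))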
y_test = y_test.iloc[1:]

# %%
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
import seaborn as sns
import matplotlib.pyplot as plt
import xgboost as xgb
from xgboost import plot_importance, plot_tree

plt.style.use('fivethirtyeight')

model = XGBRegressor(
    n_estimators=1000,
    # max_depth=8,
    # min_child_weight=300,
    # colsample_bytree=0.8,
    # subsample=0.8,
    # eta=0.3,
    # seed=42
)

model.fit(X_train, y_train,
          eval_metric="rmse",
          eval_set=[(X_train, y_train), (X_test, y_test)],
          verbose=False,
          early_stopping_rounds=100)

# %%
model.feature_importances_
def __init__(self, rfe_cv, *args, **kwargs):
    self.rfe = None
    self.rfe_cv = rfe_cv
    self.model = XGBRegressor(*args, **kwargs)
def setClf(self):
    self.clf = XGBRegressor(max_depth=7, learning_rate=0.01, n_estimators=100)
    return
def __init__(self):
    self.classifier_param_list = [
        {
            "model": [LogisticRegression(fit_intercept=False)],
            "model__C": [1, 5, 10],
        },
        {
            "model": [DecisionTreeClassifier()],
            "model__min_samples_split": [0.25, 0.5, 1.0],
            "model__max_depth": [5, 10, 15],
        },
        {
            "model": [RandomForestClassifier()],
            "model__min_samples_split": [0.25, 0.5, 1.0],
            "model__max_depth": [5, 10, 15],
        },
        {
            "model": [BaggingClassifier()],
            "model__n_estimators": [5, 10, 15],
            "model__max_features": [0.25, 0.5, 1.0],
        },
        {
            "model": [AdaBoostClassifier()],
            "model__n_estimators": [5, 10, 15],
            "model__learning_rate": [0.001, 0.01, 0.1],
        },
        {
            "model": [MLPClassifier()],
            "model__activation": ["identity", "logistic", "tanh", "relu"],
            "model__alpha": [0.001, 0.01, 0.1],
        },
        {
            "model": [XGBClassifier()],
            "model__n_estimators": [5, 10, 15],
            "model__learning_rate": [0.001, 0.01, 0.1],
        },
        {
            "model": [lgb.LGBMClassifier()],
            "model__learning_rate": [0.001, 0.01, 0.1],
            "model__n_estimators": [5, 10, 15],
            "model__num_leaves": [5, 10, 15],
        },
        {
            "model": [CatBoostClassifier()],
            "model__learning_rate": [0.001, 0.01, 0.1],
            "model__depth": [5, 10, 15],
            "model__l2_leaf_reg": [5, 10, 15],
        },
    ]
    self.regressor_param_list = [
        {
            "model": [ElasticNet(fit_intercept=False)],
            "model__alpha": [0.001, 0.01, 0.1],
            "model__l1_ratio": [0.25, 0.5, 1.0],
        },
        {
            "model": [DecisionTreeRegressor()],
            "model__min_samples_split": [0.25, 0.5, 1.0],
            "model__max_depth": [5, 10, 15],
        },
        {
            "model": [RandomForestRegressor()],
            "model__min_samples_split": [0.25, 0.5, 1.0],
            "model__max_depth": [5, 10, 15],
        },
        {
            "model": [BaggingRegressor()],
            "model__n_estimators": [5, 10, 15],
            "model__max_features": [0.25, 0.5, 1.0],
        },
        {
            "model": [AdaBoostRegressor()],
            "model__n_estimators": [5, 10, 15],
            "model__learning_rate": [0.001, 0.01, 0.1],
        },
        {
            "model": [MLPRegressor()],
            "model__activation": ["identity", "logistic", "tanh", "relu"],
            "model__alpha": [0.001, 0.01, 0.1],
        },
        {
            "model": [XGBRegressor()],
            "model__n_estimators": [5, 10, 15],
            "model__learning_rate": [0.001, 0.01, 0.1],
        },
        {
            "model": [lgb.LGBMRegressor()],
            "model__learning_rate": [0.001, 0.01, 0.1],
            "model__n_estimators": [5, 10, 15],
            "model__num_leaves": [5, 10, 15],
        },
        {
            "model": [CatBoostRegressor()],
            "model__learning_rate": [0.001, 0.01, 0.1],
            "model__depth": [5, 10, 15],
            "model__l2_leaf_reg": [5, 10, 15],
        },
    ]
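# The "model__" prefixes imply a Pipeline whose estimator step is named
# "model"; a minimal usage sketch under that assumption (pipeline layout,
# instance name `tuner`, and data are illustrative):
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import ElasticNet

pipe = Pipeline([("scale", StandardScaler()), ("model", ElasticNet())])
# Each dict swaps a different estimator into the "model" step, so one search
# covers every candidate family in the list.
search = GridSearchCV(pipe, param_grid=tuner.regressor_param_list, cv=5)
# search.fit(X, y)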
X = df.drop(columns=['price'], axis=1)
Y = df['price']
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.33)

# Encoding the regions
regions_df = np.asarray(X['region']).reshape(1, -1)
enc = OrdinalEncoder(encoding_method='ordered', variables=['region'])
enc.fit(X_train, y_train)
X_train_enc = enc.transform(X_train)
X_test_enc = enc.transform(X_test)

# Fit model on training data
regressor = XGBRegressor(n_estimators=100, reg_lambda=1, gamma=0, max_depth=3)
regressor.fit(X_train_enc, y_train)

# Make predictions for test data
y_pred = regressor.predict(X_test_enc)
predictions = [round(value) for value in y_pred]

# Re-normalize price by multiplying by 1,000,000
price_predictions = y_pred * 1000000
print(len(y_pred))
print(y_pred)
print(price_predictions)
price_pred_round = np.round(price_predictions, 2)
print(price_pred_round)

# Evaluate predictions
mse = mean_squared_error(y_test, predictions)
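# The MSE above is computed on the scaled target (price / 1e6, per the
# re-normalization comment); converting the error back to currency units is a
# one-liner (sketch, assuming that scaling):
rmse_price = np.sqrt(mse) * 1000000
print("RMSE in original price units:", round(rmse_price, 2))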