def test_xg_XGBRFRegressor():
    print("Testing xgboost, XGBRFRegressor...")
    mod = XGBRFRegressor()
    X, y = iris_data
    mod.fit(X, y)
    docs = {'name': "XGBRFRegressor test"}
    fv = X[0, :]
    upload(mod, fv, docs)

import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error
from xgboost import XGBRFRegressor


def fit_model(X, y):
    model = XGBRFRegressor(n_estimators=1000, max_depth=7, random_state=42)
    model.fit(X, y)
    y_pred = model.predict(X)
    err_mae = mean_absolute_error(y, y_pred)
    err_rmse = np.sqrt(mean_squared_error(y, y_pred))
    return model, y_pred, err_mae, err_rmse

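# A minimal usage sketch for fit_model above (illustrative only, not part of the
# original snippet); the synthetic dataset from sklearn.datasets.make_regression
# and the variable names below are assumptions.
from sklearn.datasets import make_regression

X_demo, y_demo = make_regression(n_samples=200, n_features=5, noise=0.1, random_state=0)
demo_model, demo_pred, demo_mae, demo_rmse = fit_model(X_demo, y_demo)
print("in-sample MAE:", demo_mae, "in-sample RMSE:", demo_rmse)
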
class XGBRFRegressorOptimizer(BaseOptimizer):

    def __init__(self, src_file_index, bounds):
        self.model = XGBRFRegressor()
        self.model_name = "XGBRFRegressor"
        self.src = util.get_src_file(src_file_index=src_file_index)
        self.lower_bounds = bounds["lower_bounds"]
        self.upper_bounds = bounds["upper_bounds"]
        self.with_rain = False
        self.optimization_methods = optimization_methods
        self.num_iterations = 200
        self.results = {}
        self.result_save_path = ('optimization_result/with_rain_' + str(self.with_rain)
                                 + '/' + self.src.split('.')[0].split('/')[-1] + '/')
        self.optimization()
        self.save_optimization_result()

    def objective_function(self, x):
        print("Optimizing XGBRegressor...")
        train_x, test_x, train_y, test_y = util.get_train_test_split(
            self.src, int(np.round(x[0])), int(np.round(x[1])), with_rain=self.with_rain)
        print(self.model_name)
        self.tune_params = ['offset', 'period', 'max_depth',
                            # 'learning_rate',
                            'n_estimators', 'gamma', 'min_child_weight',
                            'max_delta_step', 'subsample',
                            'colsample_bytree', 'colsample_bylevel',
                            'colsample_bynode', 'reg_alpha',
                            'reg_lambda', 'scale_pos_weight', 'base_score']
        self.model.max_depth = int(x[2])
        self.model.n_estimators = int(x[3])
        self.model.gamma = x[4]
        self.model.min_child_weight = int(x[5])
        self.model.max_delta_step = int(x[6])
        self.model.subsample = x[7]
        self.model.colsample_bytree = x[8]
        self.model.colsample_bylevel = x[9]
        self.model.colsample_bynode = x[10]
        self.model.reg_alpha = x[11]
        self.model.reg_lambda = x[12]
        self.model.scale_pos_weight = x[13]
        self.model.base_score = x[14]
        self.model.objective = 'reg:squarederror'
        self.model.learning_rate = 0.001
        self.model.fit(X=train_x, y=train_y)
        y_hat = self.model.predict(test_x)
        mse = mean_squared_error(test_y, y_hat)
        return mse

def _build_model(self):
    """
    Build the crucial components for model training
    """
    _config = {
        'n_estimators': self.n_estimators,
        'criterion': self.criterion,
        'max_depth': self.max_depth,
        'min_samples_split': self.min_samples_split,
        'min_samples_leaf': self.min_samples_leaf,
        'min_weight_fraction_leaf': self.min_weight_fraction_leaf,
        'max_features': self.max_features,
        'max_leaf_nodes': self.max_leaf_nodes,
        'min_impurity_split': self.min_impurity_split,
        'bootstrap': self.bootstrap,
        'oob_score': self.oob_score,
        'n_jobs': self.n_jobs,
        'random_state': self.random_state,
        'verbose': self.verbose,
        'warm_start': self.warm_start,
        'ccp_alpha': self.ccp_alpha,
        'max_samples': self.max_samples
    }
    if self.task_type == 'binaryclass':
        self.predictor = XGBClassifier(**_config, objective='binary:logistic')
    elif self.task_type == 'multiclass':
        self.predictor = XGBClassifier(**_config)
    elif self.task_type == 'multilabel':
        xgb_estimator = XGBClassifier(**_config, objective='binary:logistic')
        self.predictor = MultiOutputClassifier(xgb_estimator)
    elif self.task_type == 'regression':
        self.predictor = XGBRFRegressor(**_config)
    self._save_config(_config, 'predictor')
    _config = {'tasktype': self.task_type}
    self._save_config(_config, 'tasktype')

def xgrfboost(train, target, n_estimators=100, max_depth=8, random_state=17,
              learning_rate=0.1, colsample_bytree=0.9, colsample_bynode=0.9,
              colsample_bylevel=0.9, importance_type='split', reg_alpha=2, reg_lambda=2):
    '''XGRFBoost Regressor
    Params :-
    train - training set to fit on
    target - target values to predict
    n_estimators - number of trees (default set to 100)
    max_depth - maximum depth a tree can grow to (default set to 8)
    random_state - an arbitrary number so the same results are reproduced on different machines with the same params (default set to 17)
    learning_rate - size of the step taken towards the local minimum
    colsample_bytree, colsample_bynode, colsample_bylevel - fraction of all features to use by tree, by node, by level
    importance_type - metric used to split samples (default set to split)
    reg_alpha, reg_lambda - L1 regularisation and L2 regularisation respectively'''
    from xgboost import XGBRFRegressor
    model = XGBRFRegressor(n_estimators=n_estimators, max_depth=max_depth,
                           random_state=random_state, learning_rate=learning_rate,
                           colsample_bytree=colsample_bytree,
                           colsample_bynode=colsample_bynode,
                           colsample_bylevel=colsample_bylevel,
                           importance_type=importance_type,
                           reg_alpha=reg_alpha, reg_lambda=reg_lambda)
    model.fit(train, target)
    return model

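# A minimal usage sketch for xgrfboost above (illustrative only, not part of the
# original snippet); the synthetic data and the smaller n_estimators/max_depth
# values are assumptions.
from sklearn.datasets import make_regression

X_demo, y_demo = make_regression(n_samples=300, n_features=10, noise=0.5, random_state=17)
rf_model = xgrfboost(X_demo, y_demo, n_estimators=50, max_depth=4)
print("demo R^2:", rf_model.score(X_demo, y_demo))
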
def _set_surrogate(self, X, y=None):
    if not hasattr(self, "_surrogate"):
        target = type_of_target(y)
        if target == "continuous":
            self._surrogate = XGBRFRegressor(max_depth=7, n_estimators=150)
        elif target in ["binary", "multiclass"]:
            self._surrogate = XGBRFClassifier(max_depth=7, n_estimators=150)
        else:
            raise ValueError(
                "Multioutput and multilabel datasets are not supported.")

def train(self):
    self.config.logger.info("XGBoostOptimiser::train")
    model = XGBRFRegressor(verbosity=1, **(self.config.params))
    start = timer()
    inputs, exp_outputs = self.get_data_("train")
    end = timer()
    log_time(start, end, "for loading training data")
    log_memory_usage(
        ((inputs, "Input train data"), (exp_outputs, "Output train data")))
    log_total_memory_usage("Memory usage after loading data")
    if self.config.plot_train:
        inputs_val, outputs_val = self.get_data_("validation")
        log_memory_usage(((inputs_val, "Input val data"),
                          (outputs_val, "Output val data")))
        log_total_memory_usage("Memory usage after loading val data")
        self.plot_train_(model, inputs, exp_outputs, inputs_val, outputs_val)
    start = timer()
    model.fit(inputs, exp_outputs)
    end = timer()
    log_time(start, end, "actual train")
    self.save_model(model)

def train(self):
    """
    Train the optimizer.
    """
    self.config.logger.info("XGBoostOptimiser::train")
    if self.config.dim_output > 1:
        logger = get_logger()
        logger.fatal(
            "YOU CAN PREDICT ONLY 1 DISTORTION. dim_output is bigger than 1.")
    model = XGBRFRegressor(verbosity=1, **(self.config.params))
    start = timer()
    inputs, exp_outputs, *_ = self.__get_data("train")
    end = timer()
    log_time(start, end, "for loading training data")
    log_memory_usage(
        ((inputs, "Input train data"), (exp_outputs, "Output train data")))
    log_total_memory_usage("Memory usage after loading data")
    if self.config.plot_train:
        inputs_val, outputs_val, *_ = self.__get_data("validation")
        log_memory_usage(((inputs_val, "Input validation data"),
                          (outputs_val, "Output validation data")))
        log_total_memory_usage("Memory usage after loading validation data")
        self.__plot_train(model, inputs, exp_outputs, inputs_val, outputs_val)
    start = timer()
    model.fit(inputs, exp_outputs)
    end = timer()
    log_time(start, end, "actual train")
    model.get_booster().feature_names = get_input_names_oned_idc(
        self.config.opt_usederivative, self.config.num_fourier_coeffs_train)
    self.__plot_feature_importance(model)
    self.save_model(model)

def ml_train(df_train, target):
    X_train, y_train = get_x_y(df_train, target)
    # ML train
    X_train_train, X_train_test, y_train_train, y_train_test = train_test_split(
        X_train, y_train, test_size=0.20, random_state=7)
    if target == 'target':  # classification
        model = XGBClassifier()
        model.fit(X_train_train, y_train_train,
                  eval_metric='mlogloss',
                  eval_set=[(X_train_test, y_train_test)],
                  early_stopping_rounds=25,
                  verbose=False)
    else:  # regression
        model = XGBRFRegressor()
        model.fit(X_train_train, y_train_train,
                  eval_metric='rmse',
                  eval_set=[(X_train_test, y_train_test)],
                  early_stopping_rounds=25,
                  verbose=False)
    print('Training Set: {} to {}'.format(df_train['date'].min(),
                                          df_train['date'].max()))
    # ML score
    y_pred = model.predict(X_train)
    if target == 'target':
        accuracy = accuracy_score(y_train, y_pred)
        print("In-Sample Accuracy: %.2f%%" % (accuracy * 100.0))
    else:
        mse = mean_squared_error(y_train, y_pred)
        print("In-Sample RMSE: %.2f%%" % (sqrt(mse) * 100))
    return model, X_train

def __create_pipeline(self):
    if self.mode == "bypass_knnsr":
        pipeline = [("regression", KNNSRBypassRegression(column=KNNSR_BYPASS))]
        self.pipeline_params = {}
        self.n_jobs = 1
        self.training_cv_folds = 2
    if self.mode == "xgb":
        pipeline = [
            ("variance_treshold", VarianceThreshold()),
            ("scale", StandardScaler()),
            ("regression", XGBRFRegressor()),
        ]
        self.pipeline_params = {
            "regression__n_estimators": [100, 200, 400, 800],
            "regression__max_depth": [1, 3, 5, 7, 11],
            "regression__subsample": [0.5, 1],
            "regression__colsample_bylevel": [0.8, 1],
            "regression__random_state": [0],
            "regression__eval_metric": ["mae"],
            "regression__reg_lambda": [0, 1],
            "regression__reg_alpha": [0, 1],
            "regression__objective": ["reg:squarederror"],
        }
    if self.mode == "linear":
        pipeline = [
            ("filter",
             FilterColumns(columns=[BASELINE, AugmentedTADPOLEData.FORECAST_DIST])),
            ("scale", StandardScaler()),
            ("polynomial_features", PolynomialFeatures()),
            ("regression", LinearRegression()),
        ]
        self.pipeline_params = {}
        self.n_jobs = 1
    self.pipeline = Pipeline(pipeline)

# 1. Data
datasets = load_boston()
x = datasets.data
y = datasets.target
print("init x.shape:", x.shape)

# 1.1 Data preprocessing (train_test_split)
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=44,
                                                    shuffle=True, test_size=0.2)

# 2. Model (XGBRFRegressor)
model = XGBRFRegressor(max_depth=4)
model.fit(x_train, y_train)

# 4. Evaluation
acc = model.score(x_test, y_test)
print("acc:", acc)
print(model.feature_importances_)


# Function that cuts off features with low feature importance
def earseLowFI_index(fi_arr, low_value, input_arr):
    input_arr = input_arr.T
    temp = []
    for i in range(fi_arr.shape[0]):
        if fi_arr[i] >= low_value:
            temp.append(input_arr[i, :])

# 1. Data
dataset = load_boston()
x = dataset.data
y = dataset.target
df = pd.DataFrame(data=dataset.data, columns=dataset.feature_names)

x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.8,
                                                    random_state=77)

# 2. Model
# model = GradientBoostingClassifier(max_depth=4)
model = XGBRFRegressor(n_jobs=-1)

# 3. Training
model.fit(x_train, y_train)

# 4. Evaluation, prediction
acc = model.score(x_test, y_test)
print(model.feature_importances_)  # e.g. [0.0244404 0.01669101 0.00766884 0.95119975], sums to 1
print('acc : ', acc)

fi = model.feature_importances_
new_data = []
feature = []

parameters = {
    'n_estimators': [1, 50, 100],
    "max_depth": [2, 6, 8],
    'min_child_weight': [1, 0.1, 0.3],
    'eta': [0, 2, 10],
    'gamma': [0, 1, 2],
    'max_delta_step': [0, 1],
    'subsample': [0.5, 0.6],
    'colsample_bytree': [1, 0.5],
    'colsample_bylevel': [0, 1],
    'lambda': [1, 0.5, 1.5],
    'alpha': [0, 1],
    'scale_pos_weight': [1, 2],
    'L1': [0]
}

model = RandomizedSearchCV(XGBRFRegressor(n_estimators=1000, penalty=('l1', 'l2')),
                           parameters, cv=5, n_jobs=-1)
model = MultiOutputRegressor(model)

warnings.filterwarnings('ignore')

model.fit(x_train, y_train)
score = model.score(x_test, y_test)
print(score)

y4 = model.predict(test.values)

# The definition and the for loop below exist because GB and XGB only accept the target
# in scalar (single-column) form, so to run those two models the current dataset has to
# be split into its 4 columns and fed one column at a time; the for loop does exactly that.
# The remaining random forest and decision tree run fine either way.

}, {
    "anyway__n_estimators": [100, 200, 300],
    "anyway__learning_rate": [0.1, 0.09, 1],
    "anyway__colsample_bylevel": [0.6, 0.7, 0.8]
}]

# 1.1 Data preprocessing (train_test_split)
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=44,
                                                    shuffle=True, test_size=0.2)

kfold = KFold(n_splits=5, shuffle=True)

pipe = Pipeline([('scaler', StandardScaler()), ('anyway', XGBRFRegressor())])

# model = XGBRFRegressor(max_depth=max_depth, learning_rate=learning_rate,
#                        n_estimators=n_estimators, n_jobs=n_jobs,
#                        colsample_bylevel=colsample_bylevel,
#                        colsample_bytree=colsample_bytree)

# model = RandomizedSearchCV(XGBRFRegressor(),
#                            parameters,
#                            cv=kfold,
#                            verbose=2)  # kfold 5 splits x 20 candidates = 100 fits in total

model = RandomizedSearchCV(pipe, parameters, cv=5, verbose=2)

model.fit(x_train, y_train)

# 4. Evaluation, prediction
acc = model.score(x_test, y_test)

encoder = LabelEncoder()
encoder.fit(df[column])
encoders[column] = encoder

df_num = df.copy()
for column in encoders.keys():
    encoder = encoders[column]
    df_num[column] = encoder.transform(df[column])

# Set up features and target
train_num = df_num.sample(frac=1, random_state=0)
train_features = train_num.drop(['CSTMR_CNT', 'AMT', 'CNT'], axis=1)
train_target = np.log1p(train_num['AMT'])

# Training
model = XGBRFRegressor(n_jobs=-1)
model.fit(train_features, train_target)

# Build the prediction template
CARD_SIDO_NMs = df_num['CARD_SIDO_NM'].unique()
STD_CLSS_NMs = df_num['STD_CLSS_NM'].unique()
HOM_SIDO_NMs = df_num['HOM_SIDO_NM'].unique()
AGEs = df_num['AGE'].unique()
SEX_CTGO_CDs = df_num['SEX_CTGO_CD'].unique()
FLCs = df_num['FLC'].unique()
years = [2020]
months = [4, 7]

temp = []
for CARD_SIDO_NM in CARD_SIDO_NMs:
    for STD_CLSS_NM in STD_CLSS_NMs:

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.datasets import load_breast_cancer, load_diabetes
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier, XGBRFRegressor
import matplotlib.pyplot as plt
import numpy as np

x, y = load_diabetes(return_X_y=True)
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42,
                                                    shuffle=True, train_size=0.8)

model1 = XGBRFRegressor()
model1.fit(x_train, y_train)
default_score = model1.score(x_test, y_test)

model = XGBRFRegressor()
model.fit(x_train, y_train)
print(model.feature_importances_)

index7 = np.sort(model.feature_importances_)[::-1][int(
    0.7 * len(model.feature_importances_))]

delete_list = []
for i in model.feature_importances_:
    if i < index7:
        print(i, "removed")

from sklearn.pipeline import make_pipeline
from sklearn.metrics import r2_score, accuracy_score
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import MinMaxScaler
import numpy as np
import pandas as pd

x, y = load_boston(return_X_y=True)
x_train, x_test, y_train, y_test = train_test_split(x, y, shuffle=True,
                                                    train_size=0.8, random_state=66)

model = XGBRFRegressor(n_jobs=-1)
model.fit(x_train, y_train)
score = model.score(x_test, y_test)
print('R2', score)

thresholds = np.sort(model.feature_importances_)  # sort the feature importances
print(thresholds)

for thresh in thresholds:
    selection = SelectFromModel(model, threshold=thresh, prefit=True)  # drop features one at a time
    select_x_train = selection.transform(x_train)  # training set reduced to the selected features
    selection_model = XGBRFRegressor(n_jobs=-1)  # create the model
    selection_model.fit(select_x_train, y_train)  # fit the model

coef6 = pd.Series(alg6.feature_importances_, predictors).sort_values(ascending=False)
coef6.plot(kind='bar', title='Feature Importances')

# In[157]:

from xgboost import XGBRFRegressor

predictors = [x for x in train.columns if x not in [target] + IDcol]
alg7 = XGBRFRegressor()
modelfit(alg7, train, test, predictors, target, IDcol, 'alg7.csv')

from xgboost import XGBRFRegressor
# from sklearn.linear_model import LinearRegression

df = pd.read_csv("wheel_prediction_data.csv", encoding='ISO 8859-1', sep=";", decimal=",")
df.head()

# evaluate xgboost random forest ensemble for regression
y = df[['km_till_OMS']].values
X = df[["LeftWheelDiameter", "Littera", "VehicleOperatorname"]]

# define the model
model = XGBRFRegressor(n_estimators=100, subsample=0.9, colsample_bynode=0.2)
# define the model evaluation procedure
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
# evaluate the model and collect the scores
n_scores = cross_val_score(model, X, y, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1)
# report performance
print('MAE: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))

# def model(df):
#     # With Statsmodels, we need to add our intercept term, B0, manually

from lightgbm import LGBMRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor


def train_and_test(model):
    model.fit(X_train, y_train)
    y_hat = model.predict(X_test)
    report = metrics.mean_squared_error(y_test, y_hat)
    print(report)
    return y_hat


# XGBRF Regression
xgbrf_pred = train_and_test(XGBRFRegressor(n_estimators=400))

# kNN
knn_pred_4 = train_and_test(KNeighborsRegressor(n_neighbors=14))

# Random Forest
rf_pred = train_and_test(
    RandomForestRegressor(n_estimators=400, random_state=14))

# LGBM Regression
lgbm_pred = train_and_test(
    LGBMRegressor(boosting_type='gbdt', random_state=94, colsample_bytree=0.9,
                  max_depth=5, subsample=0.9, n_estimators=40))

parameters = {
    'learning_rate': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95],
    'max_depth': [5, 10, 20, 30, 50, 80, 100],
    'n_jobs': [-1]
}

fit_params = {
    'verbose': True,
    'eval_set': [(x_train, y_train), (x_test, y_test)],
    # 'early_stopping_rounds': 5
}

kfold = KFold(n_splits=5, shuffle=True, random_state=66)

# 2. Model
y_pred = []
y_test_pred = []
for i in range(4):
    model = RandomizedSearchCV(XGBRFRegressor(), parameters, cv=5, n_iter=50)
    model.fit(x_train, y_train[:, i])
    print("acc : ", model.score(x_test, y_test[:, i]))
    y_test_pred.append(model.predict(x_test))
    y_pred.append(model.predict(x_pred))

y_pred = np.array(y_pred).T
y_test_pred = np.array(y_test_pred).T
print(y_pred.shape)

mspe = kaeri_metric(y_test, y_test_pred)
print('mspe : ', mspe)

submissions = pd.DataFrame({

print(x.shape)  # (506, 13)
print(y.shape)

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, shuffle=True)

parameters = [{
    'n_estimators': [300, 500, 3300],
    'learning_rate': [0.01, 0.5, 1],
    'colsample_bytree': [0.6, 0.8, 0.9],  # typically 0.6 to 0.9 is used
    'colsample_bylevel': [0.6, 0.8, 0.9],
    'max_depth': [6, 7, 8]
}]

# no preprocessing such as removing missing values is required
model = GridSearchCV(XGBRFRegressor(), parameters, cv=5, n_jobs=-1)
model.fit(x_train, y_train)

print(model.best_estimator_)
print("==========================================")
print(model.best_params_)
print("==========================================")

score = model.score(x_test, y_test)
print('score: ', score)

# plot_importance(model)
# plt.show()

print("x_train 모양 : ", x_train.shape) #(8000, 71) print("x_test 모양 : ", x_test.shape) #(2000, 71) print("y_train 모양 : ", y_train.shape) #(8000, 4) print("y_test 모양 : ", y_test.shape) #(2000, 4) #트리구조 #MultiOutputRegressor(xgb.XGBRFRegressor()) ''' model = MultiOutputRegressor(XGBRegressor()) model.fit(x_train,y_train) score = model.score(x_test,y_test) print(score) y4 = model.predict(test.values) ''' model = MultiOutputRegressor(XGBRFRegressor()) model.fit(x_train, y_train) score = model.score(x_test, y_test) print(score) y4 = model.predict(test.values) n_features = x.data.shape[1] #30 def plot_feature_importances_x(model): n_features = x.data.shape[1] #30 plt.barh( np.arange(n_features), model.feature_importances_, #수평 가로 막대를 그린다. ( ) align='center') plt.yticks(np.arange(n_features),
    y, train_size=0.8, shuffle=True, random_state=66)

# These are roughly the only parameters that need adjusting
n_estimators = 1000      # the number of trees in the forest
learning_rate = 1        # learning rate
colsample_bytree = None  # feature sampling per tree; in practice 0.6-0.9 (or 1) is used
colsample_bylevel = 0.9  # [default: 1] subsample and colsample_bytree already control how many observations and variables go into each tree, so it is questionable whether additionally setting colsample_bylevel adds anything
max_depth = 29           # [default: 6] used to prevent overfitting; an appropriate value should be found with CV, usually between 3 and 10
n_jobs = -1              # use CV

# XGB is very fast and does not require removing missing values in preprocessing
model = XGBRFRegressor(max_depth=max_depth,
                       learning_rate=learning_rate,
                       n_estimators=n_estimators,
                       colsample_bylevel=colsample_bylevel,
                       colsample_bytree=colsample_bytree)

model.fit(x_train, y_train)

score = model.score(x_test, y_test)  # score acts as evaluate
print('score :', score)

# print(model.feature_importances_)
plot_importance(model)
# plt.show()

models = [
    LinearRegression(),
    LassoCV(alphas=np.logspace(-6, 6, 13)),
    ElasticNetCV(alphas=np.logspace(-6, 6, 13)),
    SGDRegressor(),
    PassiveAggressiveRegressor(),
    Ridge(),
    PassiveAggressiveRegressor(),
    RandomForestRegressor(max_depth=5),
    GradientBoostingRegressor(),
    AdaBoostRegressor(loss='exponential'),
    BaggingRegressor(),
    SVR(),
    NuSVR(),
    XGBRFRegressor(max_depth=5, objective="reg:squarederror"),
    XGBRegressor(max_depth=5, objective="reg:squarederror")
]


def show_score(x, y, estimator):
    """
    Returns MAE scores for specified models. Also returns r2 scores if applicable.

    Arguments:
        x {[array/DataFrame]} -- [Array or matrix of features. Can also be dataframe]
        y {[array]} -- [Target values]
        estimator {[str]} -- [The estimator being used]
    """
    # Instantiate models and predict values

class XGBoostText:

    def __init__(self,
                 expmodel_id='test.new',
                 n_estimators=100,
                 use_gpu=False,
                 criterion='gini',
                 max_depth=None,
                 min_samples_split=2,
                 min_samples_leaf=1,
                 min_weight_fraction_leaf=0.0,
                 max_features='auto',
                 max_leaf_nodes=None,
                 min_impurity_decrease=0.0,
                 min_impurity_split=None,
                 bootstrap=True,
                 oob_score=False,
                 n_jobs=None,
                 random_state=None,
                 verbose=0,
                 warm_start=False,
                 class_weight=None,
                 ccp_alpha=0.0,
                 max_samples=None):
        """
        XGBoost from the public XGBoostText lib.

        Parameters
        ----------
        """
        check_model_dir(expmodel_id=expmodel_id)
        self.checkout_dir = os.path.join('./experiments_records', expmodel_id,
                                         'checkouts')
        self.result_dir = os.path.join('./experiments_records', expmodel_id,
                                       'results')
        # make saving directory if needed
        if not os.path.isdir(self.checkout_dir):
            os.makedirs(self.checkout_dir)
        if not os.path.isdir(self.result_dir):
            os.makedirs(self.result_dir)

        self.expmodel_id = expmodel_id
        self.n_estimators = n_estimators
        self.use_gpu = use_gpu
        self.criterion = criterion
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.min_weight_fraction_leaf = min_weight_fraction_leaf
        self.max_features = max_features
        self.max_leaf_nodes = max_leaf_nodes
        self.min_impurity_decrease = min_impurity_decrease
        self.min_impurity_split = min_impurity_split
        self.bootstrap = bootstrap
        self.oob_score = oob_score
        self.n_jobs = n_jobs
        self.random_state = random_state
        self.verbose = verbose
        self.warm_start = warm_start
        self.class_weight = class_weight
        self.ccp_alpha = ccp_alpha
        self.max_samples = max_samples
        self.task_type = None
        # self._args_check()
        self.device = self._get_device()

    def _data_check(self, datalist):
        """
        Checks that 1) train_data/valid_data are valid, and gives tips about the data problem if not;
                    2) the loss function is valid, and recommends a proper loss function if not.

        Parameters
        ----------
        datalist = [data1 = {
                        'x': list[episode_file_path],
                        'y': list[label],
                        'l': list[seq_len],
                        'feat_n': n of feature space,
                        'label_n': n of label space
                    },
                    data2 = {...},  # same structure as data1
                    ...]

        Returns
        -------
        self : object
        """
        label_n_check = set([])
        task_type_check = set([])
        for each_data in datalist:
            for each_x_path in each_data['x']:
                if os.path.exists(each_x_path) is False:
                    raise Exception('episode file not exist')
            label_n_check.add(np.shape(np.array(each_data['y']))[1])
            task_type_check.add(
                label_check(each_data['y'], hat_y=None,
                            assign_task_type=self.task_type))
        if len(task_type_check) != 1:
            raise Exception('task_type is inconsistent in data')
        pre_task_type = list(task_type_check)[0]
        if self.task_type is None:
            self.task_type = pre_task_type
        elif self.task_type == pre_task_type:
            pass
        else:
            raise Exception(
                'predefined task-type {0}, but data supports task-type {1}'.
                format(self.task_type, pre_task_type))
        print('current task can be seen as {0}'.format(self.task_type))

    def _get_device(self):
        if self.use_gpu:
            if torch.cuda.is_available():
                device = torch.device("cuda")
                print('use GPU resource')
            else:
                device = torch.device("cpu")
                print('no efficient GPU found, use CPU resource')
        else:
            device = torch.device("cpu")
            print('use CPU resource')
        return device

    def _build_model(self):
        """
        Build the crucial components for model training
        """
        _config = {
            'n_estimators': self.n_estimators,
            'max_leaf_nodes': self.max_leaf_nodes,
            'min_impurity_split': self.min_impurity_split,
            'n_jobs': self.n_jobs,
            'random_state': self.random_state,
            'max_samples': self.max_samples
        }
        if self.task_type == 'binaryclass':
            self.predictor = XGBClassifier(**_config,
                                           objective='binary:logistic',
                                           eval_metric="logloss")
        elif self.task_type == 'multiclass':
            self.predictor = XGBClassifier(**_config)
        elif self.task_type == 'multilabel':
            xgb_estimator = XGBClassifier(**_config,
                                          objective='binary:logistic',
                                          eval_metric="logloss")
            self.predictor = MultiOutputClassifier(xgb_estimator)
        elif self.task_type == 'regression':
            self.predictor = XGBRFRegressor(**_config)
        self._save_config(_config, 'predictor')
        _config = {'tasktype': self.task_type}
        self._save_config(_config, 'tasktype')

    def fit(self, data_dict, X=None, y=None, assign_task_type=None):
        """
        Parameters
        ----------
        train_data : {
                      'x': list[episode_file_path],
                      'y': list[label],
                      'l': list[seq_len],
                      'feat_n': n of feature space,
                      'label_n': n of label space
                      }
            The input train samples dict.

        valid_data : {
                      'x': list[episode_file_path],
                      'y': list[label],
                      'l': list[seq_len],
                      'feat_n': n of feature space,
                      'label_n': n of label space
                      }
            The input valid samples dict.

        Returns
        -------
        self : object
            Fitted estimator.
""" self.task_type = assign_task_type if data_dict != None: self._data_check([data_dict]) data = ml_reader.DatasetReader( data_dict, device=self.device, task_type=self.task_type).get_data() _X = np.array(data['X']) _y = np.array(data['Y']) elif X != None and y != None: self._data_check([{'X': X, 'Y': Y}]) _X = X _y = Y else: raise Exception('fill in correct data for model train') print(np.shape(_X), np.shape(_y)) self._build_model() self.predictor.fit(_X, _y) model_path = os.path.join(self.checkout_dir, 'best.model') joblib.dump(self.predictor, model_path) def _save_config(self, config, config_type): temp_path = os.path.join(self.checkout_dir, "{0}_config.json".format(config_type)) if os.path.exists(temp_path): os.remove(temp_path) with open(temp_path, "w", encoding='utf-8') as f: f.write(json.dumps(config, indent=4)) def _load_config(self, config_type): temp_path = os.path.join(self.checkout_dir, '{0}_config.json'.format(config_type)) assert os.path.exists( temp_path ), 'cannot find {0}_config.json, please it in dir {1}'.format( config_type, self.checkout_dir) with open(temp_path, 'r') as f: config = json.load(f) return config def load_model(self): """ Parameters ---------- loaded_epoch : str, loaded model name we save the model by <epoch_count>.epoch, latest.epoch, best.epoch Returns ------- self : object loaded estimator. """ model_path = os.path.join(self.checkout_dir, 'best.model') self.task_type = self._load_config('tasktype')['tasktype'] self.predictor = joblib.load(model_path) def inference(self, data_dict, X=None, y=None): """ Parameters ---------- test_data : { 'x':list[episode_file_path], 'y':list[label], 'l':list[seq_len], 'feat_n': n of feature space, 'label_n': n of label space } The input test samples dict. """ if data_dict != None: self._data_check([data_dict]) data = ml_reader.DatasetReader( data_dict, device=self.device, task_type=self.task_type).get_data() _X = data['X'] _y = data['Y'] elif X != None and y != None: self._data_check({'X': X, 'Y': y}) _X = X _y = y else: raise Exception('fill in correct data for model inference') if self.task_type in ['binaryclass', 'regression']: real_v = _y.reshape(-1, 1) prob_v = self.predictor.predict_proba(_X)[:, 1].reshape(-1, 1) elif self.task_type in ['multiclass']: real_v = np.array(_y) prob_v = self.predictor.predict_proba(_X).reshape( -1, np.shape(real_v)[1]) elif self.task_type in ['multilabel']: real_v = np.array(_y) prob_v = [] _prob_v = self.predictor.predict_proba(_X) for each_class in _prob_v: if len(each_class) == 1: each_class = np.array([each_class]) if np.shape(each_class)[1] == 2: v = each_class[:, 1].reshape((-1, 1)) else: v = each_class prob_v.append(v) prob_v = np.concatenate(prob_v, 1) pickle.dump(prob_v, open(os.path.join(self.result_dir, 'hat_y'), 'wb')) pickle.dump(real_v, open(os.path.join(self.result_dir, 'y'), 'wb')) def get_results(self): """ Load saved prediction results in current ExpID truth_value: proj_root/experiments_records/*****(exp_id)/results/y predict_value: proj_root/experiments_records/*****(exp_id)/results/hat_y xxx represents the loaded model """ try: hat_y = pickle.load( open(os.path.join(self.result_dir, 'hat_y'), 'rb')) except IOError: print('Error: cannot find file {0} or load failed'.format( os.path.join(self.result_dir, 'hat_y'))) try: y = pickle.load(open(os.path.join(self.result_dir, 'y'), 'rb')) except IOError: print('Error: cannot find file {0} or load failed'.format( os.path.join(self.result_dir, 'y'))) results = {'hat_y': hat_y, 'y': y} return results
                                                    random_state=44,
                                                    shuffle=True,
                                                    test_size=0.2)

from xgboost import XGBClassifier, XGBRFRegressor, plot_importance
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import r2_score

kfold = KFold(n_splits=5, shuffle=True)

# model = XGBRFRegressor(max_depth=max_depth, learning_rate=learning_rate,
#                        n_estimators=n_estimators, n_jobs=n_jobs,
#                        colsample_bylevel=colsample_bylevel,
#                        colsample_bytree=colsample_bytree)

model = RandomizedSearchCV(XGBRFRegressor(), parameters, cv=kfold,
                           verbose=2)  # kfold 5 splits x 20 candidates = 100 fits in total

# Compare performance against the model run with default settings
model.fit(x_train, y_train)

acc = model.score(x_test, y_test)
print("acc:", acc)

print("best estimator:", model.best_estimator_)
print("best parameters:", model.best_params_)

y_predict = model.predict(x_test)
print('final R2 score:', r2_score(y_test, y_predict))
'''

encoder = LabelEncoder()
encoder.fit(df[column])
encoders[column] = encoder

df_num = df.copy()
for column in encoders.keys():
    encoder = encoders[column]
    df_num[column] = encoder.transform(df[column])

# Set up features and target
train_num = df_num.sample(frac=1, random_state=0)
train_features = train_num.drop(['CSTMR_CNT', 'AMT', 'CNT'], axis=1)
train_target = np.log1p(train_num['AMT'])

# Training
model = XGBRFRegressor(learning_rate=0.1)
model.fit(train_features, train_target)

CARD_SIDO_NMs = df_num['CARD_SIDO_NM'].unique()
STD_CLSS_NMs = df_num['STD_CLSS_NM'].unique()
HOM_SIDO_NMs = df_num['HOM_SIDO_NM'].unique()
AGEs = df_num['AGE'].unique()
SEX_CTGO_CDs = df_num['SEX_CTGO_CD'].unique()
FLCs = df_num['FLC'].unique()
years = [2020]
months = [4, 7]

temp = []
for CARD_SIDO_NM in CARD_SIDO_NMs:
    for STD_CLSS_NM in STD_CLSS_NMs:
        for HOM_SIDO_NM in HOM_SIDO_NMs:
