Example #1
from xgboost import XGBRFRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np

def fit_model(X, y):
    model = XGBRFRegressor(n_estimators=1000, max_depth=7, random_state=42)
    model.fit(X, y)
    y_pred = model.predict(X)
    # in-sample errors: MAE and RMSE
    err_mae = mean_absolute_error(y, y_pred)
    err_rmse = np.sqrt(mean_squared_error(y, y_pred))
    return model, y_pred, err_mae, err_rmse
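
A minimal usage sketch (the California housing dataset is an illustrative assumption, not part of the original):

from sklearn.datasets import fetch_california_housing

X, y = fetch_california_housing(return_X_y=True)
model, y_pred, mae, rmse = fit_model(X, y)
print("MAE: %.3f, RMSE: %.3f" % (mae, rmse))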
Example #2
def test_xg_XGBRFRegressor():
    print("Testing xgboost, XGBRFRegressor...")
    mod = XGBRFRegressor()
    X, y = iris_data
    mod.fit(X, y)
    docs = {'name': "XGBRFRegressor test"}
    fv = X[0, :]
    upload(mod, fv, docs)
Example #3
class XGBRFRegressorOptimizer(BaseOptimizer):
	def __init__(self,src_file_index,bounds):
		self.model = XGBRFRegressor()
		self.model_name = "XGBRFRegressor"
		self.src = util.get_src_file(src_file_index=src_file_index)
		self.lower_bounds = bounds["lower_bounds"]
		self.upper_bounds = bounds["upper_bounds"]
		self.with_rain = False
		self.optimization_methods = optimization_methods
		self.num_iterations = 200
		self.results = {}
		self.result_save_path = 'optimization_result/with_rain_'+str(self.with_rain)+'/'+self.src.split('.')[0].split('/')[-1]+'/'
		self.optimization()
		self.save_optimization_result()

	def objective_function(self,x):
		print("XGBRegressor优化中...")
		train_x, test_x, train_y, test_y = util.get_train_test_split(self.src,int(np.round(x[0])),int(np.round(x[1])),with_rain=self.with_rain)
		print(self.model_name)
		self.tune_params = ['offset', 'period', 'max_depth',
							# 'learning_rate',
							'n_estimators',
							'gamma',
							'min_child_weight', 'max_delta_step', 'subsample',
							'colsample_bytree', 'colsample_bylevel', 'colsample_bynode', 'reg_alpha',
							'reg_lambda', 'scale_pos_weight', 'base_score'
							]
		self.model.max_depth = int(x[2])
		self.model.n_estimators = int(x[3])
		self.model.gamma = x[4]
		self.model.min_child_weight = int(x[5])
		self.model.max_delta_step = int(x[6])
		self.model.subsample = x[7]
		self.model.colsample_bytree = x[8]
		self.model.colsample_bylevel = x[9]
		self.model.colsample_bynode = x[10]
		self.model.reg_alpha = x[11]
		self.model.reg_lambda = x[12]
		self.model.scale_pos_weight = x[13]
		self.model.base_score = x[14]
		self.model.objective = 'reg:squarederror'
		self.model.learning_rate = 0.001
		self.model.fit(X=train_x,y=train_y)
		y_hat = self.model.predict(test_x)
		mse = mean_squared_error(test_y, y_hat)
		return mse
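
A hedged sketch of how this optimizer might be invoked; the bounds below are illustrative assumptions (one lower/upper pair per tuned dimension, in the order objective_function unpacks x), not values from the original project:

bounds = {
    # offset, period, max_depth, n_estimators, gamma, min_child_weight,
    # max_delta_step, subsample, colsample_bytree, colsample_bylevel,
    # colsample_bynode, reg_alpha, reg_lambda, scale_pos_weight, base_score
    "lower_bounds": [0, 1, 2, 50, 0.0, 1, 0, 0.5, 0.5, 0.5, 0.5, 0.0, 0.0, 0.5, 0.1],
    "upper_bounds": [24, 48, 12, 1000, 5.0, 10, 10, 1.0, 1.0, 1.0, 1.0, 1.0, 10.0, 2.0, 0.9],
}
optimizer = XGBRFRegressorOptimizer(src_file_index=0, bounds=bounds)
print(optimizer.results)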
Example #4
def xgrfboost(train, target, n_estimators=100, max_depth=8, random_state=17,
              learning_rate=0.1, colsample_bytree=0.9, colsample_bynode=0.9,
              colsample_bylevel=0.9, importance_type='gain', reg_alpha=2, reg_lambda=2):
    '''XGRFBoost Regressor
       Params :-
       train - training set to fit on
       target - target values to predict
       n_estimators - number of trees (default 100)
       max_depth - maximum depth a tree can grow to (default 8)
       random_state - an arbitrary seed so runs are reproducible across machines (default 17)
       learning_rate - step size taken toward the local minimum
       colsample_bytree, colsample_bynode, colsample_bylevel - fraction of the features used per tree, node, and level
       importance_type - metric used for feature_importances_ (xgboost expects 'gain', 'weight',
                         'cover', 'total_gain' or 'total_cover'; the original default 'split' is a LightGBM value)
       reg_alpha, reg_lambda - L1 and L2 regularisation respectively'''

    from xgboost import XGBRFRegressor
    model = XGBRFRegressor(n_estimators=n_estimators, max_depth=max_depth, random_state=random_state,
                           learning_rate=learning_rate, colsample_bytree=colsample_bytree,
                           colsample_bynode=colsample_bynode, colsample_bylevel=colsample_bylevel,
                           importance_type=importance_type, reg_alpha=reg_alpha, reg_lambda=reg_lambda)
    model.fit(train, target)

    return model
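
A quick usage sketch (the diabetes dataset is an illustrative assumption):

from sklearn.datasets import load_diabetes

X, y = load_diabetes(return_X_y=True)
model = xgrfboost(X, y, n_estimators=200, max_depth=6)
print(model.feature_importances_)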
Example #5
def train(self):
    self.config.logger.info("XGBoostOptimiser::train")
    model = XGBRFRegressor(verbosity=1, **(self.config.params))
    start = timer()
    inputs, exp_outputs = self.get_data_("train")
    end = timer()
    log_time(start, end, "for loading training data")
    log_memory_usage(
        ((inputs, "Input train data"), (exp_outputs, "Output train data")))
    log_total_memory_usage("Memory usage after loading data")
    if self.config.plot_train:
        inputs_val, outputs_val = self.get_data_("validation")
        log_memory_usage(((inputs_val, "Input val data"),
                          (outputs_val, "Output val data")))
        log_total_memory_usage("Memory usage after loading val data")
        self.plot_train_(model, inputs, exp_outputs, inputs_val,
                         outputs_val)
    start = timer()
    model.fit(inputs, exp_outputs)
    end = timer()
    log_time(start, end, "actual train")
    self.save_model(model)
Example #6
    def train(self):
        """
        Train the optimizer.
        """
        self.config.logger.info("XGBoostOptimiser::train")
        if self.config.dim_output > 1:
            logger = get_logger()
            logger.fatal(
                "YOU CAN PREDICT ONLY 1 DISTORTION. dim_output is bigger than 1."
            )

        model = XGBRFRegressor(verbosity=1, **(self.config.params))
        start = timer()
        inputs, exp_outputs, *_ = self.__get_data("train")
        end = timer()
        log_time(start, end, "for loading training data")
        log_memory_usage(
            ((inputs, "Input train data"), (exp_outputs, "Output train data")))
        log_total_memory_usage("Memory usage after loading data")
        if self.config.plot_train:
            inputs_val, outputs_val, *_ = self.__get_data("validation")
            log_memory_usage(((inputs_val, "Input validation data"),
                              (outputs_val, "Output validation data")))
            log_total_memory_usage(
                "Memory usage after loading validation data")
            self.__plot_train(model, inputs, exp_outputs, inputs_val,
                              outputs_val)
        start = timer()
        model.fit(inputs, exp_outputs)
        end = timer()
        log_time(start, end, "actual train")
        model.get_booster().feature_names = get_input_names_oned_idc(
            self.config.opt_usederivative,
            self.config.num_fourier_coeffs_train)
        self.__plot_feature_importance(model)
        self.save_model(model)
Example #7
from sklearn.datasets import load_boston  # removed in scikit-learn 1.2; needs an older version
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.feature_selection import SelectFromModel
from xgboost import XGBRFRegressor
import numpy as np

x, y = load_boston(return_X_y=True)

x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    shuffle=True,
                                                    train_size=0.8,
                                                    random_state=66)

model = XGBRFRegressor(n_jobs=-1)
model.fit(x_train, y_train)
score = model.score(x_test, y_test)
print('R2', score)

thresholds = np.sort(model.feature_importances_)  # sort the features by importance
print(thresholds)

for thresh in thresholds:
    selection = SelectFromModel(model, threshold=thresh,
                                prefit=True)  # drop one more feature each pass

    select_x_train = selection.transform(x_train)  # train set with the reduced feature set

    selection_model = XGBRFRegressor(n_jobs=-1)  # build a fresh model
    selection_model.fit(select_x_train, y_train)  # and fit it
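
    # Assumed continuation (the original snippet stops after fitting):
    # score the reduced model on an equally-reduced test set.
    select_x_test = selection.transform(x_test)
    score = selection_model.score(select_x_test, y_test)
    print("thresh=%.4f, n_features=%d, R2=%.4f" % (thresh, select_x_train.shape[1], score))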
Example #8
        encoder.fit(df[column])
        encoders[column] = encoder
        
df_num = df.copy()        
for column in encoders.keys():
    encoder = encoders[column]
    df_num[column] = encoder.transform(df[column])

# set up the features and target
train_num = df_num.sample(frac=1, random_state=0)
train_features = train_num.drop(['CSTMR_CNT', 'AMT', 'CNT'], axis=1)
train_target = np.log1p(train_num['AMT'])

# training
model = XGBRFRegressor(n_jobs=-1)
model.fit(train_features, train_target)

# build the prediction template
CARD_SIDO_NMs = df_num['CARD_SIDO_NM'].unique()
STD_CLSS_NMs  = df_num['STD_CLSS_NM'].unique()
HOM_SIDO_NMs  = df_num['HOM_SIDO_NM'].unique()
AGEs          = df_num['AGE'].unique()
SEX_CTGO_CDs  = df_num['SEX_CTGO_CD'].unique()
FLCs          = df_num['FLC'].unique()
years         = [2020]
months        = [4, 7]

temp = []
for CARD_SIDO_NM in CARD_SIDO_NMs:
    for STD_CLSS_NM in STD_CLSS_NMs:
        for HOM_SIDO_NM in HOM_SIDO_NMs:
Example #9
class XGBoostOptimiser(Optimiser):
    name = "xgboost"

    def __init__(self, config):
        super().__init__(config)
        self.config.logger.info("XGBoostOptimiser::Init")
        self.model = XGBRFRegressor(verbosity=1, **(self.config.params))

    def train(self):
        self.config.logger.info("XGBoostOptimiser::train")
        inputs, exp_outputs = self.get_data_("train")
        self.model.fit(inputs, exp_outputs)
        self.save_model(self.model)

    def apply(self):
        self.config.logger.info("XGBoostOptimiser::apply, input size: %d",
                                self.config.dim_input)
        self.load_model()
        inputs, exp_outputs = self.get_data_("apply")
        pred_outputs = self.model.predict(inputs)
        self.plot_apply_(exp_outputs, pred_outputs)
        self.config.logger.info("Done apply")

    def search_grid(self):
        raise NotImplementedError("Search grid method not implemented yet")

    def save_model(self, model):
        # Snapshot - can be used for further training
        # (note: the model is pickled, despite the .json extension)
        out_filename = "%s/xgbmodel_%s_nEv%d.json" %\
                (self.config.dirmodel, self.config.suffix, self.config.train_events)
        pickle.dump(model, open(out_filename, "wb"), protocol=4)

    def load_model(self):
        # Loading a snapshot
        filename = "%s/xgbmodel_%s_nEv%d.json" %\
                (self.config.dirmodel, self.config.suffix, self.config.train_events)
        self.model = pickle.load(open(filename, "rb"))

    def get_data_(self, partition):
        inputs = []
        exp_outputs = []
        for indexev in self.config.partition[partition]:
            inputs_single, exp_outputs_single = load_event_idc(
                self.config.dirinput_train, indexev, self.config.input_z_range,
                self.config.output_z_range, self.config.opt_predout)
            inputs.append(inputs_single)
            exp_outputs.append(exp_outputs_single)
        inputs = np.concatenate(inputs)
        exp_outputs = np.concatenate(exp_outputs)
        return inputs, exp_outputs

    def plot_apply_(self, exp_outputs, pred_outputs):
        myfile = TFile.Open("%s/output_%s_nEv%d.root" % \
                            (self.config.dirval, self.config.suffix, self.config.train_events),
                            "recreate")
        h_dist_all_events, h_deltas_all_events, h_deltas_vs_dist_all_events =\
                plot_utils.create_apply_histos(self.config, self.config.suffix, infix="all_events_")
        distortion_numeric_flat_m, distortion_predict_flat_m, deltas_flat_a, deltas_flat_m =\
            plot_utils.get_apply_results_single_event(pred_outputs, exp_outputs)
        plot_utils.fill_apply_tree(h_dist_all_events, h_deltas_all_events,
                                   h_deltas_vs_dist_all_events,
                                   distortion_numeric_flat_m,
                                   distortion_predict_flat_m, deltas_flat_a,
                                   deltas_flat_m)

        for hist in (h_dist_all_events, h_deltas_all_events,
                     h_deltas_vs_dist_all_events):
            hist.Write()
        plot_utils.fill_profile_apply_hist(h_deltas_vs_dist_all_events,
                                           self.config.profile_name,
                                           self.config.suffix)
        plot_utils.fill_std_dev_apply_hist(h_deltas_vs_dist_all_events,
                                           self.config.h_std_dev_name,
                                           self.config.suffix, "all_events_")

        myfile.Close()
Example #10
from sklearn.datasets import load_boston  # removed in scikit-learn 1.2
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.feature_selection import SelectFromModel
from xgboost import XGBRFRegressor
import numpy as np

ds = load_boston()

x = ds.data
y = ds.target

x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    train_size=0.8,
                                                    shuffle=True,
                                                    random_state=66)

model = XGBRFRegressor(n_jobs=-1)
model.fit(x_train, y_train)
score = model.score(x_test, y_test)
print("R2: ", score)

thresholds = np.sort(model.feature_importances_)
print(thresholds)

for thresh in thresholds:
    # one iteration per feature-count threshold
    selection = SelectFromModel(model, threshold=thresh, prefit=True)

    selection_x_train = selection.transform(x_train)
    # print(selection_x_train.shape)

    selection_model = XGBRFRegressor()
    selection_model.fit(selection_x_train, y_train)
Example #11
# assumed imports for this notebook snippet
import math
from sklearn.metrics import mean_squared_error
from xgboost import XGBRFRegressor
from lightgbm import LGBMRegressor

xgb = XGBRFRegressor(colsample_bynode=1,
                     colsample_bytree=0.6,
                     learning_rate=0.01,
                     max_delta_step=4,  # the original 'max_delta' is not an xgboost parameter
                     min_child_weight=1.5,
                     n_estimators=2400,
                     reg_alpha=0.6,
                     reg_lambda=0.6)
lgbm = LGBMRegressor(objective='regression',
                     num_leaves=4,
                     learning_rate=0.01,
                     n_estimators=12000)

# In[158]:

xgb.fit(X_train, y_train)
lgbm.fit(X_train, y_train, eval_metric='rmse')

# In[162]:

predict1 = xgb.predict(X_test)
predict2 = lgbm.predict(X_test)

# In[164]:

print('Root Mean Square Error test (XGB) = ' +
      str(math.sqrt(mean_squared_error(y_test, predict1))))
print('Root Mean Square Error test (LGBM) = ' +
      str(math.sqrt(mean_squared_error(y_test, predict2))))

# In[165]:
Example #12
from sklearn.datasets import load_boston
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from xgboost import XGBRFRegressor

#
x, y = load_boston(return_X_y=True)

x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.8)

model = XGBRFRegressor(n_estimators=1000, learning_rate=0.1)

model.fit(x_train,
          y_train,
          verbose=True,
          eval_metric="rmse",
          eval_set=[(x_train, y_train), (x_test, y_test)])

#rmse,mae,logloss,error,auc

results = model.evals_result()
print("eval:", results)

y_pred = model.predict(x_test)
r2 = r2_score(y_test, y_pred)
print("r2:", r2)
print("r2: %.2f%%", (r2 * 100.0))
Example #13
from sklearn.datasets import load_boston
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from xgboost import XGBRFRegressor


x, y = load_boston(return_X_y=True)

x_train, x_test, y_train, y_test = train_test_split(x,y, train_size=0.8, shuffle=True, random_state=66)

model = XGBRFRegressor(n_estimators=1000, learning_rate=0.1)

# note: XGBRFRegressor trains its forest in a single boosting round,
# so early_stopping_rounds has essentially no effect here
model.fit(x_train, y_train, verbose=True, eval_metric="rmse",
          eval_set=[(x_train, y_train), (x_test, y_test)],
          early_stopping_rounds=20)

#rmse,mae,logloss,error,auc

results = model.evals_result()
print("eval:", results)\

y_pred = model.predict(x_test)
r2 = r2_score(y_test, y_pred) 
print("r2:", r2)
print("r2: %.2f%%" %(r2*100.0))
Example #14
#     'colsample_bylevel': [0.6, 0.8, 0.9],
#     'max_depth' : [6,7,8]}
# ]

# model1 = XGBRFRegressor(n_estimators= 300,learning_rate=1,colsample_bytree=0.99,colsample_bylevel=0.99,max_depth=50,nrounds=1000,scale_pos_weight=1.5)
#model2 = XGBRFRegressor(n_estimators= 400,learning_rate=1,colsample_bytree=0.99,colsample_bylevel=0.99,max_depth=50,nrounds=1000,scale_pos_weight=1.5)
model3 = XGBRFRegressor(n_estimators=400, learning_rate=1, colsample_bytree=0.99,
                        colsample_bylevel=0.99, max_depth=10,
                        nrounds=1000,  # 'nrounds' is the R-API name, not a Python-API parameter
                        scale_pos_weight=1.5)
# model4 = XGBRFRegressor(n_estimators= 100,learning_rate=1,colsample_bytree=0.99,colsample_bylevel=0.99,max_depth=50,nrounds=1000,scale_pos_weight=1.5)

# model = GridSearchCV(model, parameters, cv =5)
# model = MultiOutputRegressor(model)

warnings.filterwarnings('ignore')
# model1.fit(x_train, y1_train)
#model2.fit(x_train, y2_train)
model3.fit(x_train, y3_train)
# model4.fit(x_train, y4_train)

# y1_pred = model1.predict(test)
# print(y1_pred)
# print(y1_pred.shape)

# y2_pred = model2.predict(test)
# print(y2_pred)
# print(y2_pred.shape)

# y3_pred = model3.predict(test)
# print(y3_pred)
# print(y3_pred.shape)

# y4_pred = model4.predict(test)
Example #15
class XGBoostText:
    def __init__(self,
                 expmodel_id='test.new',
                 n_estimators=100,
                 use_gpu=False,
                 criterion='gini',
                 max_depth=None,
                 min_samples_split=2,
                 min_samples_leaf=1,
                 min_weight_fraction_leaf=0.0,
                 max_features='auto',
                 max_leaf_nodes=None,
                 min_impurity_decrease=0.0,
                 min_impurity_split=None,
                 bootstrap=True,
                 oob_score=False,
                 n_jobs=None,
                 random_state=None,
                 verbose=0,
                 warm_start=False,
                 class_weight=None,
                 ccp_alpha=0.0,
                 max_samples=None):
        """
        XGBoost model from the public XGBoostText library.


        Parameters

        ----------

        """
        check_model_dir(expmodel_id=expmodel_id)
        self.checkout_dir = os.path.join('./experiments_records', expmodel_id,
                                         'checkouts')
        self.result_dir = os.path.join('./experiments_records', expmodel_id,
                                       'results')
        # make saving directory if needed
        if not os.path.isdir(self.checkout_dir):
            os.makedirs(self.checkout_dir)

        if not os.path.isdir(self.result_dir):
            os.makedirs(self.result_dir)

        self.expmodel_id = expmodel_id
        self.n_estimators = n_estimators
        self.use_gpu = use_gpu
        self.criterion = criterion
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.min_weight_fraction_leaf = min_weight_fraction_leaf
        self.max_features = max_features
        self.max_leaf_nodes = max_leaf_nodes
        self.min_impurity_decrease = min_impurity_decrease
        self.min_impurity_split = min_impurity_split
        self.bootstrap = bootstrap
        self.oob_score = oob_score
        self.n_jobs = n_jobs
        self.random_state = random_state
        self.verbose = verbose
        self.warm_start = warm_start
        self.class_weight = class_weight
        self.ccp_alpha = ccp_alpha
        self.max_samples = max_samples
        self.task_type = None
        # self._args_check()
        self.device = self._get_device()

    def _data_check(self, datalist):
        """
        
        Checks that 1) train_data/valid_data are valid, giving hints about data problems if not,
        and 2) the loss function is valid, recommending a proper loss function if not.
        
        Parameters

        ----------

        datalist = [data1 = {
                      'x':list[episode_file_path], 
                      'y':list[label], 
                      'l':list[seq_len], 
                      'feat_n': n of feature space, 
                      'label_n': n of label space
                    },
                    data2 = {
                      'x':list[episode_file_path], 
                      'y':list[label], 
                      'l':list[seq_len], 
                      'feat_n': n of feature space, 
                      'label_n': n of label space
                    }, ...
                    ]
        Returns

        -------

        self : object


        """

        label_n_check = set([])
        task_type_check = set([])
        for each_data in datalist:
            for each_x_path in each_data['x']:
                if os.path.exists(each_x_path) is False:
                    raise Exception('episode file does not exist')
            label_n_check.add(np.shape(np.array(each_data['y']))[1])
            task_type_check.add(
                label_check(each_data['y'],
                            hat_y=None,
                            assign_task_type=self.task_type))

        if len(task_type_check) != 1:
            raise Exception('task_type is inconsistent across the data')

        pre_task_type = list(task_type_check)[0]
        if self.task_type is None:
            self.task_type = pre_task_type
        elif self.task_type == pre_task_type:
            pass
        else:
            raise Exception(
                'predefined task-type {0}, but data supports task-type {1}'.
                format(self.task_type, pre_task_type))
        print('current task can be seen as {0}'.format(self.task_type))

    def _get_device(self):
        if self.use_gpu:
            if torch.cuda.is_available():
                device = torch.device("cuda")
                print('using GPU resource')
            else:
                device = torch.device("cpu")
                print('no usable GPU found, using CPU resource')
        else:
            device = torch.device("cpu")
            print('using CPU resource')
        return device

    def _build_model(self):
        """
        
        Build the crucial components for model training 
 
        
        """

        _config = {
            'n_estimators': self.n_estimators,
            'max_leaf_nodes': self.max_leaf_nodes,
            'min_impurity_split': self.min_impurity_split,
            'n_jobs': self.n_jobs,
            'random_state': self.random_state,
            'max_samples': self.max_samples
        }
        if self.task_type == 'binaryclass':
            self.predictor = XGBClassifier(**_config,
                                           objective='binary:logistic',
                                           eval_metric="logloss")
        elif self.task_type == 'multiclass':
            self.predictor = XGBClassifier(**_config)
        elif self.task_type == 'multilabel':
            xgb_estimator = XGBClassifier(**_config,
                                          objective='binary:logistic',
                                          eval_metric="logloss")
            self.predictor = MultiOutputClassifier(xgb_estimator)
        elif self.task_type == 'regression':
            self.predictor = XGBRFRegressor(**_config)
        self._save_config(_config, 'predictor')
        _config = {'tasktype': self.task_type}
        self._save_config(_config, 'tasktype')

    def fit(self, data_dict, X=None, y=None, assign_task_type=None):
        """
        Parameters

        ----------

        train_data : {
                      'x':list[episode_file_path], 
                      'y':list[label], 
                      'l':list[seq_len], 
                      'feat_n': n of feature space, 
                      'label_n': n of label space
                      }

            The input train samples dict.
 
        valid_data : {
                      'x':list[episode_file_path], 
                      'y':list[label], 
                      'l':list[seq_len], 
                      'feat_n': n of feature space, 
                      'label_n': n of label space
                      }

            The input valid samples dict.


        Returns

        -------

        self : object

            Fitted estimator.

        """
        self.task_type = assign_task_type
        if data_dict is not None:
            self._data_check([data_dict])
            data = ml_reader.DatasetReader(
                data_dict, device=self.device,
                task_type=self.task_type).get_data()
            _X = np.array(data['X'])
            _y = np.array(data['Y'])
        elif X is not None and y is not None:
            self._data_check([{'X': X, 'Y': y}])
            _X = X
            _y = y
        else:
            raise Exception('fill in correct data for model training')

        print(np.shape(_X), np.shape(_y))
        self._build_model()
        self.predictor.fit(_X, _y)
        model_path = os.path.join(self.checkout_dir, 'best.model')
        joblib.dump(self.predictor, model_path)

    def _save_config(self, config, config_type):
        temp_path = os.path.join(self.checkout_dir,
                                 "{0}_config.json".format(config_type))
        if os.path.exists(temp_path):
            os.remove(temp_path)
        with open(temp_path, "w", encoding='utf-8') as f:
            f.write(json.dumps(config, indent=4))

    def _load_config(self, config_type):
        temp_path = os.path.join(self.checkout_dir,
                                 '{0}_config.json'.format(config_type))
        assert os.path.exists(
            temp_path
        ), 'cannot find {0}_config.json, please place it in dir {1}'.format(
            config_type, self.checkout_dir)
        with open(temp_path, 'r') as f:
            config = json.load(f)
        return config

    def load_model(self):
        """
        
        Parameters

        ----------

        loaded_epoch : str, loaded model name 
        
            we save the model by <epoch_count>.epoch, latest.epoch, best.epoch

        Returns

        -------

        self : object

            loaded estimator.

        """
        model_path = os.path.join(self.checkout_dir, 'best.model')
        self.task_type = self._load_config('tasktype')['tasktype']
        self.predictor = joblib.load(model_path)

    def inference(self, data_dict, X=None, y=None):
        """

        Parameters

        ----------

        test_data : {
                      'x':list[episode_file_path], 
                      'y':list[label], 
                      'l':list[seq_len], 
                      'feat_n': n of feature space, 
                      'label_n': n of label space
                      }

            The input test samples dict.
  
        """

        if data_dict is not None:
            self._data_check([data_dict])
            data = ml_reader.DatasetReader(
                data_dict, device=self.device,
                task_type=self.task_type).get_data()
            _X = data['X']
            _y = data['Y']
        elif X is not None and y is not None:
            self._data_check([{'X': X, 'Y': y}])
            _X = X
            _y = y
        else:
            raise Exception('fill in correct data for model inference')

        if self.task_type == 'regression':
            real_v = _y.reshape(-1, 1)
            # regressors have no predict_proba; use predict instead
            prob_v = self.predictor.predict(_X).reshape(-1, 1)
        elif self.task_type == 'binaryclass':
            real_v = _y.reshape(-1, 1)
            prob_v = self.predictor.predict_proba(_X)[:, 1].reshape(-1, 1)
        elif self.task_type in ['multiclass']:
            real_v = np.array(_y)
            prob_v = self.predictor.predict_proba(_X).reshape(
                -1,
                np.shape(real_v)[1])
        elif self.task_type in ['multilabel']:
            real_v = np.array(_y)
            prob_v = []
            _prob_v = self.predictor.predict_proba(_X)
            for each_class in _prob_v:
                if len(each_class) == 1:
                    each_class = np.array([each_class])
                if np.shape(each_class)[1] == 2:
                    v = each_class[:, 1].reshape((-1, 1))
                else:
                    v = each_class
                prob_v.append(v)
            prob_v = np.concatenate(prob_v, 1)

        pickle.dump(prob_v, open(os.path.join(self.result_dir, 'hat_y'), 'wb'))
        pickle.dump(real_v, open(os.path.join(self.result_dir, 'y'), 'wb'))

    def get_results(self):
        """
        
        Load saved prediction results in current ExpID
            truth_value: proj_root/experiments_records/*****(exp_id)/results/y
            predict_value: proj_root/experiments_records/*****(exp_id)/results/hat_y
            xxx represents the loaded model
        
        """
        hat_y, y = None, None  # defaults in case loading fails below
        try:
            hat_y = pickle.load(
                open(os.path.join(self.result_dir, 'hat_y'), 'rb'))
        except IOError:
            print('Error: cannot find file {0} or load failed'.format(
                os.path.join(self.result_dir, 'hat_y')))
        try:
            y = pickle.load(open(os.path.join(self.result_dir, 'y'), 'rb'))
        except IOError:
            print('Error: cannot find file {0} or load failed'.format(
                os.path.join(self.result_dir, 'y')))

        results = {'hat_y': hat_y, 'y': y}

        return results
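
A minimal usage sketch (train_data and test_data are hypothetical dicts following the layout documented in the docstrings above):

clf = XGBoostText(expmodel_id='test.new', n_estimators=200)
clf.fit(train_data, assign_task_type='regression')
clf.load_model()
clf.inference(test_data)
results = clf.get_results()
print(results['y'][:5], results['hat_y'][:5])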
Example #16
# 1. Data
datasets = load_boston()
x = datasets.data
y = datasets.target
print("init x.shape:", x.shape)

# 1.1 Preprocessing (train_test_split)
x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    random_state=44,
                                                    shuffle=True,
                                                    test_size=0.2)

# 2. Model (XGBRFRegressor)
model = XGBRFRegressor(max_depth=4)
model.fit(x_train, y_train)

# 4. Evaluation (score() returns R2 for a regressor, not accuracy)
acc = model.score(x_test, y_test)
print("R2:", acc)
print(model.feature_importances_)


# helper that drops features whose importance falls below a cutoff
def earseLowFI_index(fi_arr, low_value, input_arr):
    input_arr = input_arr.T
    temp = []
    for i in range(fi_arr.shape[0]):
        if fi_arr[i] >= low_value:
            temp.append(input_arr[i, :])
    temp = np.array(temp)
    return temp  # feature-major result: shape (n_kept_features, n_samples)
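
A hedged usage sketch (the 0.02 cutoff is an illustrative assumption; the return statement above is an assumed completion, and the result is transposed back to sample-major layout):

x_trimmed = earseLowFI_index(model.feature_importances_, 0.02, x).T
print("trimmed x.shape:", x_trimmed.shape)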
Example #17
from sklearn.datasets import load_boston  # removed in scikit-learn 1.2
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_squared_error as MSE
from xgboost import XGBRegressor as XGBR  # assumed alias, matching the usage below
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from function import plot_learning_curve

boston = load_boston()
X, y = boston.data, boston.target

x_train, x_test,  y_train, y_test =  train_test_split(X, y, test_size=0.3)

# build the gradient boosted tree model
xgbr = XGBR(n_estimators=100)
xgbr.fit(x_train, y_train)
# predict on the test set
predict = xgbr.predict(x_test)


# compute the mean squared error
print(MSE(y_test, xgbr.predict(x_test)))

# plot the learning curve
cv = KFold(n_splits=5, shuffle=True, random_state=32)
plot_learning_curve(XGBR(n_estimators=100, random_state=30), 'XGBR', X, y, ax=None, cv=cv)
plt.show()

# The plot shows the model overfits when data is scarce; as the amount of data grows, its generalisation ability keeps improving.

Example #18
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_diabetes
from xgboost import XGBRFRegressor
import numpy as np

x, y = load_diabetes(return_X_y=True)
x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    random_state=42,
                                                    shuffle=True,
                                                    train_size=0.8)

model1 = XGBRFRegressor()
model1.fit(x_train, y_train)

default_score = model1.score(x_test, y_test)

model = XGBRFRegressor()
model.fit(x_train, y_train)
print(model.feature_importances_)

index7 = np.sort(model.feature_importances_)[::-1][int(
    0.7 * len(model.feature_importances_))]

delete_list = []
for i in model.feature_importances_:
    if i < index7:
        print(i, "removed")
        delete_list.append(model.feature_importances_.tolist().index(i))
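
A possible continuation (assumed; the original snippet ends here): drop the collected columns and refit to compare against default_score.

x_train2 = np.delete(x_train, delete_list, axis=1)
x_test2 = np.delete(x_test, delete_list, axis=1)

model2 = XGBRFRegressor()
model2.fit(x_train2, y_train)
print("default R2:", default_score)
print("reduced R2:", model2.score(x_test2, y_test))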
Example #19
# gpu_id=0, tree_method='gpu_hist'
# model1 = XGBRFRegressor(n_estimators= 300,learning_rate=1,colsample_bytree=1,colsample_bylevel=1,max_depth=50,subsample=0.8, n_jobs=-1)
model2 = XGBRFRegressor(n_estimators=400,
                        learning_rate=1,
                        colsample_bytree=1,
                        colsample_bylevel=1,
                        max_depth=50)
# model3 = XGBRFRegressor(n_estimators= 350,learning_rate=1,colsample_bytree=1,colsample_bylevel=1,max_depth=40,subsample=1,n_jobs=-1)
# model4 = XGBRFRegressor(n_estimators= 100,learning_rate=1,colsample_bytree=1,colsample_bylevel=0.7,max_depth=30,n_jobs=-1)

# model = GridSearchCV(model, parameters, cv =5)
# model = MultiOutputRegressor(model2)

warnings.filterwarnings('ignore')
# model1.fit(x_train, y1_train)
# the original passed a bare list [x1_train, x2_train]; the two feature
# blocks presumably need to be column-stacked into one 2-D matrix
model2.fit(np.hstack([x1_train, x2_train]), y2_train)
# model3.fit(x_train, y3_train)
# model4.fit(x_train, y4_train)

# y1_pred = model1.predict(x_test)
# print(y1_pred)
# print(y1_pred.shape)

y2_pred = model2.predict(np.hstack([x1_test, x2_test]))
# print(y2_pred)
# print(y2_pred.shape)

# y3_pred = model3.predict(x_test)
# print(y3_pred)
# print(y3_pred.shape)