Example #1

# Imports assumed by this snippet (not shown in the original excerpt);
# PreProcessor is a project-local helper class.
import numpy as np
import xgboost as xgb
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

def main():
    pre_process = PreProcessor()
    X, y = pre_process.get_train_data()

    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        test_size=0.3,
                                                        shuffle=True,
                                                        random_state=42,
                                                        stratify=y)

    dtrain = xgb.DMatrix(X_train, label=y_train)
    dtest = xgb.DMatrix(X_test, label=y_test)

    xgb_params = {
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',
    }

    # Validation data to monitor during training
    evals = [(dtrain, 'train'), (dtest, 'eval')]
    # Dictionary in which to record the training history
    evals_result = {}
    bst = xgb.train(xgb_params,
                    dtrain,
                    num_boost_round=1000,  # use a generous number of boosting rounds
                    evals=evals,
                    evals_result=evals_result,
                    )

    y_pred_proba = bst.predict(dtest)
    # Binarize the predicted probabilities at the 0.5 threshold
    y_pred = np.where(y_pred_proba > 0.5, 1, 0)
    acc = accuracy_score(y_test, y_pred)
    print('Accuracy:', acc)

    # Plot the training history as a line chart
    train_metric = evals_result['train']['logloss']
    plt.plot(train_metric, label='train logloss')
    eval_metric = evals_result['eval']['logloss']
    plt.plot(eval_metric, label='eval logloss')
    plt.grid()
    plt.legend()
    plt.xlabel('rounds')
    plt.ylabel('logloss')
    plt.show()
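
Since an eval set is already wired up, the same call can stop training automatically once the eval logloss stops improving. A minimal sketch of that variant (the early_stopping_rounds value of 50 is an illustrative choice, not taken from the original):

    bst = xgb.train(xgb_params,
                    dtrain,
                    num_boost_round=1000,
                    evals=evals,
                    evals_result=evals_result,
                    early_stopping_rounds=50)  # stop 50 rounds after the best eval score
    # best_iteration is the round with the lowest eval logloss
    print('best iteration:', bst.best_iteration)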
Example #2

# Imports assumed by this snippet (not shown in the original excerpt);
# PreProcessor is a project-local helper class.
import numpy as np
from matplotlib import pyplot as plt
from keras import models, layers  # or tensorflow.keras, depending on the setup

class Trainer(object):
    """
    Train Class
    """
    def __init__(self):
        """
        Constructor
        """
        # preprocessor instance
        self.__pre_process = PreProcessor()
        self.__train_data, self.__train_targets = self.__pre_process.get_train_data()
        # print(self.__train_data)

        # Tuning Parameters
        self.__n_folds = 5  # Cross-validation with k-folds
        self.__num_epochs = 400

    def build_model(self):
        """
        Build the model
        :return:
        """
        # NN model
        model = models.Sequential()
        model.add(
            layers.Dense(256,
                         activation='relu',
                         kernel_initializer='normal',
                         input_shape=(self.__train_data.shape[1], )))
        model.add(
            layers.Dense(256, activation='relu', kernel_initializer='normal'))
        model.add(
            layers.Dense(256, activation='relu', kernel_initializer='normal'))
        model.add(
            layers.Dense(1, kernel_initializer='normal', activation='linear'))
        model.compile(optimizer='adam', loss='mse', metrics=['mape'])
        model.summary()
        return model

    def fit_model(self):
        """
        Fit the model
        :return:
        """
        # Build the compiled Keras model
        model = self.build_model()

        # Fit the model in silent mode (verbose=0)
        model.fit(self.__train_data,
                  self.__train_targets,
                  epochs=self.__num_epochs,
                  batch_size=16,
                  verbose=0)

        return model

    def evaluate_cross(self):
        """
        Cross-validation evaluation
        :return:
        """
        all_scores = []
        # Fold size; any samples left over by the integer division are dropped
        num_val_samples = len(self.__train_data) // self.__n_folds

        for i in range(self.__n_folds):
            print('processing fold # {}'.format(i))

            # Prepare the validation data
            val_data = self.__train_data[i * num_val_samples:(i + 1) *
                                         num_val_samples]
            val_targets = self.__train_targets[i * num_val_samples:(i + 1) *
                                               num_val_samples]

            # Prepare the training data
            partial_train_data = np.concatenate(
                [self.__train_data[:i * num_val_samples],
                 self.__train_data[(i + 1) * num_val_samples:]],
                axis=0)
            partial_targets_data = np.concatenate(
                [self.__train_targets[:i * num_val_samples],
                 self.__train_targets[(i + 1) * num_val_samples:]],
                axis=0)

            # Build the compiled Keras model
            model = self.build_model()

            # Fit the model in silent mode (verbose=0)
            model.fit(partial_train_data,
                      partial_targets_data,
                      epochs=self.__num_epochs,
                      batch_size=16,
                      verbose=0)

            # Evaluate the model on the validation data
            val_mse, val_mape = model.evaluate(val_data,
                                               val_targets,
                                               verbose=0)
            all_scores.append(val_mape)

        print(all_scores)

        return np.mean(all_scores)

    def visualize_k_folds(self):
        """
        Visualize the k-fold cross-validation training curves
        :return:
        """
        all_mape_histories = []
        num_val_samples = len(self.__train_data) // self.__n_folds

        for i in range(self.__n_folds):
            print('processing fold # {}'.format(i))

            # Prepare the validation data
            val_data = self.__train_data[i * num_val_samples:(i + 1) *
                                         num_val_samples]
            val_targets = self.__train_targets[i * num_val_samples:(i + 1) *
                                               num_val_samples]

            # Prepare the training data
            partial_train_data = np.concatenate(
                [self.__train_data[:i * num_val_samples],
                 self.__train_data[(i + 1) * num_val_samples:]],
                axis=0)
            partial_targets_data = np.concatenate(
                [self.__train_targets[:i * num_val_samples],
                 self.__train_targets[(i + 1) * num_val_samples:]],
                axis=0)

            # Build the compiled Keras model
            model = self.build_model()

            # Fit the model in silent mode (verbose=0)
            history = model.fit(partial_train_data,
                                partial_targets_data,
                                validation_data=(val_data, val_targets),
                                epochs=self.__num_epochs,
                                batch_size=16,
                                verbose=0)

            # Record the per-epoch validation MAPE
            # (newer tf.keras versions log this under the key 'val_mape')
            mape_history = history.history[
                'val_mean_absolute_percentage_error']
            all_mape_histories.append(mape_history)

        print(all_mape_histories)

        average_mape_history = [
            np.mean([x[i] for x in all_mape_histories])
            for i in range(self.__num_epochs)
        ]

        plt.plot(range(1, len(average_mape_history) + 1), average_mape_history)
        plt.xlabel('Epochs')
        plt.ylabel('Validation MAPE')
        plt.show()
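
The manual fold slicing above silently drops any samples left over when the data length is not divisible by the fold count. A minimal sketch of evaluate_cross rewritten as a Trainer method around sklearn's KFold, which handles that bookkeeping (same attribute names assumed):

    def evaluate_cross_kfold(self):
        """evaluate_cross using sklearn's KFold instead of manual slicing."""
        from sklearn.model_selection import KFold
        all_scores = []
        kf = KFold(n_splits=self.__n_folds)
        for i, (train_idx, val_idx) in enumerate(kf.split(self.__train_data)):
            print('processing fold # {}'.format(i))
            model = self.build_model()
            model.fit(self.__train_data[train_idx],
                      self.__train_targets[train_idx],
                      epochs=self.__num_epochs,
                      batch_size=16,
                      verbose=0)
            val_mse, val_mape = model.evaluate(self.__train_data[val_idx],
                                               self.__train_targets[val_idx],
                                               verbose=0)
            all_scores.append(val_mape)
        return np.mean(all_scores)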
Example #3

# Imports assumed by this snippet (not shown in the original excerpt);
# PreProcessor is a project-local helper class.
import numpy as np
import xgboost as xgb
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.linear_model import Lasso, ElasticNet, LassoCV
from sklearn.kernel_ridge import KernelRidge
from sklearn.ensemble import GradientBoostingRegressor, AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import (train_test_split, GridSearchCV,
                                     ShuffleSplit, KFold, cross_val_score)
from sklearn.metrics import make_scorer, r2_score

class Trainer(object):
    """
    Train Class
    """

    def __init__(self):
        """
        Constructor
        """
        # preprocessor instance
        self.__pre_process = PreProcessor()
        self.__train, self.__y_train = self.__pre_process.get_train_data()

        # Tuning Parameters
        self.__n_folds = 3  # Cross-validation with k-folds

        # Models
        self.__lasso = make_pipeline(
            RobustScaler(), Lasso(alpha=0.0005, random_state=1))
        self.__ENet = make_pipeline(RobustScaler(), ElasticNet(
            alpha=0.0005, l1_ratio=.9, random_state=3))
        self.__KRR = KernelRidge(
            alpha=0.6, kernel='polynomial', degree=2, coef0=2.5)
        self.__GBoost = GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05,
                                                  max_depth=4, max_features='sqrt',
                                                  min_samples_leaf=15, min_samples_split=10,
                                                  loss='huber', random_state=5)
        self.__model_xgb = xgb.XGBRegressor(colsample_bytree=0.2, gamma=0.0,
                                            learning_rate=0.05, max_depth=6,
                                            min_child_weight=1.5, n_estimators=7200,
                                            reg_alpha=0.9, reg_lambda=0.6,
                                            subsample=0.2, seed=42,
                                            silent=1,  # deprecated in newer xgboost; use verbosity=0
                                            random_state=7)
        # self.__model_lgb = lgb.LGBMRegressor(objective='regression', num_leaves=5,
        #                                      learning_rate=0.05, n_estimators=720,
        #                                      max_bin=55, bagging_fraction=0.8,
        #                                      bagging_freq=5, feature_fraction=0.2319,
        #                                      feature_fraction_seed=9, bagging_seed=9,
        #                                      min_data_in_leaf=6, min_sum_hessian_in_leaf=11)

    def get_scores(self):
        """
        Score each base model with cross-validated RMSE
        :return:
        """
        score = self.rmsle_cv(self.__lasso)
        print("\nLasso score: {:.4f} ({:.4f})\n".format(
            score.mean(), score.std()))
        score = self.rmsle_cv(self.__ENet)
        print("ElasticNet score: {:.4f} ({:.4f})\n".format(
            score.mean(), score.std()))
        score = self.rmsle_cv(self.__KRR)
        print("Kernel Ridge score: {:.4f} ({:.4f})\n".format(
            score.mean(), score.std()))
        score = self.rmsle_cv(self.__GBoost)
        print("Gradient Boosting score: {:.4f} ({:.4f})\n".format(
            score.mean(), score.std()))
        score = self.rmsle_cv(self.__model_xgb)
        print("Xgboost score: {:.4f} ({:.4f})\n".format(
            score.mean(), score.std()))
        # score = self.rmsle_cv(self.__model_lgb)
        # print("LGBM score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

    def mean_absolute_percentage_error(self, y_true, y_pred):
        """MAPE in percent; note the division by y_true, so zero targets will break it."""
        y_true, y_pred = np.array(y_true), np.array(y_pred)
        return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

    def adaboost(self):
        """Fit an AdaBoost regressor and report its training-set R^2."""
        regr = AdaBoostRegressor(random_state=0, n_estimators=100)
        regr.fit(self.__train, self.__y_train)
        # Scored on the training data, so this is an optimistic estimate
        score = regr.score(self.__train, self.__y_train)
        print(score)
        return regr

    def fit_model(self):
        """
        Fit the model
        :return:
        """
        # model = self.train_model(self.__train, self.__y_train)
        test_size = 1 / self.__n_folds
        # Split the training data into an extra held-out test set
        x_train_split, x_test_split, y_train_split, y_test_split = train_test_split(
            self.__train, self.__y_train, test_size=test_size, random_state=0)
        print(np.shape(x_train_split), np.shape(x_test_split), np.shape(y_train_split), np.shape(y_test_split))
        lasso = LassoCV(alphas=[0.0001, 0.0003, 0.0006, 0.001, 0.003, 0.006, 0.01, 0.03, 0.06, 0.1,
                                0.3, 0.6, 1],
                        max_iter=50000, cv=10)
        # lasso = RidgeCV(alphas=[0.0001, 0.0003, 0.0006, 0.001, 0.003, 0.006, 0.01, 0.03, 0.06, 0.1,
        #                         0.3, 0.6, 1], cv=10)

        # lasso = ElasticNetCV(cv=10, random_state=0)

        # lasso.fit(x_train_split, y_train_split)
        # y_predicted = lasso.predict(X=x_test_split)
        # mape = self.mean_absolute_percentage_error(y_test_split, y_predicted)
        # print(mape)

        # Build the xgboost model
        reg = xgb.XGBRegressor()

        # Hyperparameter grid search
        reg_cv = GridSearchCV(reg, {'max_depth': [2, 4, 6], 'n_estimators': [50, 100, 200]}, verbose=1)
        reg_cv.fit(x_train_split, y_train_split)
        print(reg_cv.best_params_, reg_cv.best_score_)
        # Retrain with the best parameters
        reg = xgb.XGBRegressor(**reg_cv.best_params_)
        reg.fit(x_train_split, y_train_split)


        # Save / load the trained model
        # import pickle
        # pickle.dump(reg, open("model.pkl", "wb"))
        # reg = pickle.load(open("model.pkl", "rb"))

        # Evaluate the trained model
        pred_train = reg.predict(x_train_split)
        pred_test = reg.predict(x_test_split)
        # print(self.mean_absolute_percentage_error(y_train_split, pred_train))
        print(self.mean_absolute_percentage_error(y_test_split, pred_test))

        # import pandas as pd
        # import matplotlib.pyplot as plt
        # # Note: 'boston' is not defined here; substitute this dataset's feature names
        # importances = pd.Series(reg.feature_importances_, index=boston.feature_names)
        # importances = importances.sort_values()
        # importances.plot(kind="barh")
        # plt.title("importance in the xgboost model")
        # plt.show()
        return reg

    def train_model(self, X, y):
        """ Performs grid search over the 'max_depth' parameter for a
            decision tree regressor trained on the input data [X, y]. """

        # Create cross-validation sets from the training data
        cv_sets = ShuffleSplit(n_splits=10, test_size=0.20, random_state=0)

        # Create a decision tree regressor object
        regressor = DecisionTreeRegressor()

        # Create a dictionary for the parameter 'max_depth' with a range from 1 to 10
        params = {'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]}

        # Transform 'performance_metric' into a scoring function using 'make_scorer'
        scoring_fnc = make_scorer(self.r2_score)

        # Create the grid search cv object --> GridSearchCV()
        grid = GridSearchCV(estimator=regressor, param_grid=params, scoring=scoring_fnc, cv=cv_sets)

        # Fit the grid search object to the data to compute the optimal model
        grid = grid.fit(X, y)

        # Return the optimal model after fitting the data
        return grid.best_estimator_

    def rmsle_cv(self, model):
        """
        Calculate cross-validated RMSE
        :return:
        """
        # Pass the KFold object itself; the original called get_n_splits() here,
        # which reduces cv to a plain integer and drops shuffle/random_state
        kf = KFold(self.__n_folds, shuffle=True, random_state=42)
        rmse = np.sqrt(-cross_val_score(model, self.__train.values, self.__y_train,
                                        scoring="neg_mean_squared_error", cv=kf))
        return rmse

    @staticmethod
    def r2_score(y_true, y_predict):
        """ Calculates and returns the performance score between
                true (y_true) and predicted (y_predict) values based on the metric chosen. """

        # Delegate to sklearn.metrics.r2_score (the module-level function)
        score = r2_score(y_true, y_predict)

        # Return the score
        return score
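
A minimal usage sketch, assuming PreProcessor and the imports above are available:

if __name__ == '__main__':
    trainer = Trainer()
    trainer.get_scores()         # cross-validated RMSE for each base model
    model = trainer.fit_model()  # grid-searched xgboost on a held-out split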