Example #1
    def fit(self, X, y):
        from xgboost import XGBRegressor
        if not KAGGLE:
            from OptimizedOffsetRegressor import DigitizedOptimizedOffsetRegressor

        self.xgb = XGBRegressor(objective=self.objective,
                                learning_rate=self.learning_rate,
                                min_child_weight=self.min_child_weight,
                                subsample=self.subsample,
                                colsample_bytree=self.colsample_bytree,
                                max_depth=self.max_depth,
                                n_estimators=self.n_estimators,
                                nthread=self.nthread,
                                missing=0.0,
                                seed=self.seed)
        #from OptimizedOffsetRegressor import FullDigitizedOptimizedOffsetRegressor
        #self.off = FullDigitizedOptimizedOffsetRegressor(n_buckets=self.n_buckets,
        #               basinhopping=True,
        self.off = DigitizedOptimizedOffsetRegressor(
            n_buckets=self.n_buckets,
            initial_params=self.initial_params,
            minimizer=self.minimizer,
            scoring=self.scoring)

        # stage 1: fit the gradient-boosted regressor on the full training set
        self.xgb.fit(X, y)

        tr_y_hat = self.xgb.predict(
            X, ntree_limit=self.xgb.booster().best_iteration)
        print('Train score is:', -self.scoring(tr_y_hat, y))
        # stage 2: learn per-bucket offsets on the in-sample predictions
        self.off.fit(tr_y_hat, y)
        print("Offsets:", self.off.params)

        return self
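The fit above is a two-stage procedure: an XGBRegressor produces continuous predictions, and a DigitizedOptimizedOffsetRegressor then learns one additive offset per prediction bucket so that the kappa-based scorer improves. The OptimizedOffsetRegressor module itself is not part of this excerpt, so the following is only a minimal sketch of the bucket-and-offset idea; the class name, the equally spaced bucket edges, and the toy squared-error loss are assumptions, not the author's implementation.

import numpy as np
from scipy.optimize import minimize

class SimpleDigitizedOffsetRegressor:
    # Hypothetical stand-in for DigitizedOptimizedOffsetRegressor:
    # digitize raw predictions into n_buckets and learn one additive offset
    # per bucket by minimizing scoring(y_hat, y) (lower is better).
    def __init__(self, n_buckets=8, initial_params=None,
                 minimizer='BFGS', scoring=None):
        self.n_buckets = n_buckets
        self.initial_params = initial_params
        self.minimizer = minimizer
        self.scoring = scoring

    def _shift(self, params, y_hat):
        # assumed bucketing: equally spaced edges over the prediction range
        idx = np.digitize(y_hat, self.edges_)
        return y_hat + np.asarray(params)[idx]

    def fit(self, y_hat, y):
        y_hat = np.asarray(y_hat, dtype=float)
        self.edges_ = np.linspace(y_hat.min(), y_hat.max(),
                                  self.n_buckets + 1)[1:-1]
        x0 = (np.zeros(self.n_buckets) if self.initial_params is None
              else np.asarray(self.initial_params, dtype=float))
        res = minimize(lambda p: self.scoring(self._shift(p, y_hat), y),
                       x0, method=self.minimizer)
        self.params = res.x
        return self

    def predict(self, y_hat):
        return self._shift(self.params, np.asarray(y_hat, dtype=float))

# toy usage with a squared-error loss standing in for the kappa-based scorer
rng = np.random.RandomState(0)
y = rng.randint(1, 9, size=200).astype(float)
y_hat = y + rng.normal(0, 0.7, size=200) - 0.5   # biased raw predictions
off = SimpleDigitizedOffsetRegressor(scoring=lambda p, t: np.mean((p - t) ** 2))
print(off.fit(y_hat, y).predict(y_hat)[:5])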
    def fit(self, X, y):
        from xgboost import XGBRegressor
        if not KAGGLE:
            from OptimizedOffsetRegressor import DigitizedOptimizedOffsetRegressor

        self.xgb = XGBRegressor(
                       objective=self.objective,
                       learning_rate=self.learning_rate,
                       min_child_weight=self.min_child_weight,
                       subsample=self.subsample,
                       colsample_bytree=self.colsample_bytree,
                       max_depth=self.max_depth,
                       n_estimators=self.n_estimators,
                       nthread=self.nthread,
                       missing=0.0,
                       seed=self.seed)
        #from OptimizedOffsetRegressor import FullDigitizedOptimizedOffsetRegressor
        #self.off = FullDigitizedOptimizedOffsetRegressor(n_buckets=self.n_buckets,
        #               basinhopping=True,
        self.off = DigitizedOptimizedOffsetRegressor(n_buckets=self.n_buckets,
                       initial_params=self.initial_params,
                       minimizer=self.minimizer,
                       scoring=self.scoring)

        self.xgb.fit(X, y)

        tr_y_hat = self.xgb.predict(X,
                                    ntree_limit=self.xgb.booster().best_iteration)
        print('Train score is:', -self.scoring(tr_y_hat, y))
        self.off.fit(tr_y_hat, y)
        print("Offsets:", self.off.params)

        return self
    def fit(self, X, y):
        from xgboost import XGBRegressor
        if not KAGGLE:
            from OptimizedOffsetRegressor import DigitizedOptimizedOffsetRegressor

        #from OptimizedOffsetRegressor import FullDigitizedOptimizedOffsetRegressor
        #self.off = FullDigitizedOptimizedOffsetRegressor(n_buckets=self.n_buckets,
        #               basinhopping=True,

        """
2 / 5
grid scores:
  mean: 0.65531, std: 0.00333, params: {'n_estimators': 700, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 6, 'min_child_weight': 240}
best score: 0.65531

3 / 5
grid scores:
  mean: 0.65474, std: 0.00308, params: {'n_estimators': 700, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 6, 'min_child_weight': 240}
best score: 0.65474

4 / 5
grid scores:
  mean: 0.65490, std: 0.00302, params: {'n_estimators': 700, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 6, 'min_child_weight': 240}
best score: 0.65490


2 / 10
grid scores:
  mean: 0.65688, std: 0.00725, params: {'n_estimators': 700, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 6, 'min_child_weight': 240}
best score: 0.65688

3 / 10
grid scores:
  mean: 0.65705, std: 0.00714, params: {'n_estimators': 700, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 6, 'min_child_weight': 240}
best score: 0.65705

4 / 10
grid scores:
  mean: 0.65643, std: 0.00715, params: {'n_estimators': 700, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 6, 'min_child_weight': 240}
best score: 0.65643

5 / 10
grid scores:
  mean: 0.65630, std: 0.00699, params: {'n_estimators': 700, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 6, 'min_child_weight': 240}
best score: 0.65630

        """
        from sklearn.cross_validation import StratifiedKFold
        kf = StratifiedKFold(y, n_folds=2)
        print(kf)
        params = []
        for itrain, itest in kf:
            ytrain = y[itrain]
            Xtrain = X.iloc[list(itrain)]
            ytest = y[itest]
            Xtest = X.iloc[list(itest)]

            self.xgb = XGBRegressor(
                           objective=self.objective,
                           learning_rate=self.learning_rate,
                           min_child_weight=self.min_child_weight,
                           subsample=self.subsample,
                           colsample_bytree=self.colsample_bytree,
                           max_depth=self.max_depth,
                           n_estimators=self.n_estimators,
                           nthread=self.nthread,
                           missing=0.0,
                           seed=self.seed)
            self.xgb.fit(Xtrain, ytrain)
            te_y_hat = self.xgb.predict(Xtest,
                                        ntree_limit=self.xgb.booster().best_iteration)
            print('XGB Test score is:', -self.scoring(te_y_hat, ytest))

            self.off = DigitizedOptimizedOffsetRegressor(n_buckets=self.n_buckets,
                           initial_params=self.initial_params,
                           minimizer=self.minimizer,
                           scoring=self.scoring)
            self.off.fit(te_y_hat, ytest)
            print("Offsets:", self.off.params)
            params += [list(self.off.params)]

            pass

        from numpy import array
        self.off.params = array(params).mean(axis=0)
        print("Mean Offsets:", self.off.params)
        self.xgb.fit(X, y)

        return self
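The fit above is the cross-validated variant: each StratifiedKFold split gets its own XGB model and its own offsets, fitted on that fold's held-out predictions, and the final offset vector is the column-wise mean over folds, after which the XGB model is refitted on all of the data. A minimal illustration of that averaging step, with made-up offset values:

import numpy as np

fold_offsets = [
    [-1.4, -2.5, -3.5, -1.1, -0.9, 0.05, 0.6, 3.5],   # fold 1 (made-up values)
    [-1.6, -2.7, -3.7, -1.3, -0.7, 0.03, 0.8, 3.7],   # fold 2 (made-up values)
]
mean_offsets = np.array(fold_offsets).mean(axis=0)    # one offset per bucket
print(mean_offsets)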
class PrudentialRegressorCVO(BaseEstimator, RegressorMixin):
    def __init__(self,
                objective='reg:linear',
                learning_rate=0.045,
                min_child_weight=50,
                subsample=0.8,
                colsample_bytree=0.7,
                max_depth=7,
                n_estimators=700,
                nthread=-1,
                seed=0,
                n_buckets=8,
                initial_params=[-1.5, -2.6, -3.6, -1.2, -0.8, 0.04, 0.7, 3.6,
                                #1., 2., 3., 4., 5., 6., 7.
                                ],
                minimizer='BFGS',
                scoring=NegQWKappaScorer):

        self.objective = objective
        self.learning_rate = learning_rate
        self.min_child_weight = min_child_weight
        self.subsample = subsample
        self.colsample_bytree = colsample_bytree
        self.max_depth = max_depth
        self.n_estimators = n_estimators
        self.nthread = nthread
        self.seed = seed
        self.n_buckets = n_buckets
        self.initial_params = initial_params
        self.minimizer = minimizer
        self.scoring = scoring

        return


    def fit(self, X, y):
        from xgboost import XGBRegressor
        if not KAGGLE:
            from OptimizedOffsetRegressor import DigitizedOptimizedOffsetRegressor

        #from OptimizedOffsetRegressor import FullDigitizedOptimizedOffsetRegressor
        #self.off = FullDigitizedOptimizedOffsetRegressor(n_buckets=self.n_buckets,
        #               basinhopping=True,

        """
2 / 5
grid scores:
  mean: 0.65531, std: 0.00333, params: {'n_estimators': 700, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 6, 'min_child_weight': 240}
best score: 0.65531

3 / 5
grid scores:
  mean: 0.65474, std: 0.00308, params: {'n_estimators': 700, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 6, 'min_child_weight': 240}
best score: 0.65474

4 / 5
grid scores:
  mean: 0.65490, std: 0.00302, params: {'n_estimators': 700, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 6, 'min_child_weight': 240}
best score: 0.65490


2 / 10
grid scores:
  mean: 0.65688, std: 0.00725, params: {'n_estimators': 700, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 6, 'min_child_weight': 240}
best score: 0.65688

3 / 10
grid scores:
  mean: 0.65705, std: 0.00714, params: {'n_estimators': 700, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 6, 'min_child_weight': 240}
best score: 0.65705

4 / 10
grid scores:
  mean: 0.65643, std: 0.00715, params: {'n_estimators': 700, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 6, 'min_child_weight': 240}
best score: 0.65643

5 / 10
grid scores:
  mean: 0.65630, std: 0.00699, params: {'n_estimators': 700, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 6, 'min_child_weight': 240}
best score: 0.65630

        """
        from sklearn.cross_validation import StratifiedKFold
        kf = StratifiedKFold(y, n_folds=2)
        print(kf)
        params = []
        for itrain, itest in kf:
            ytrain = y[itrain]
            Xtrain = X.iloc[list(itrain)]
            ytest = y[itest]
            Xtest = X.iloc[list(itest)]

            self.xgb = XGBRegressor(
                           objective=self.objective,
                           learning_rate=self.learning_rate,
                           min_child_weight=self.min_child_weight,
                           subsample=self.subsample,
                           colsample_bytree=self.colsample_bytree,
                           max_depth=self.max_depth,
                           n_estimators=self.n_estimators,
                           nthread=self.nthread,
                           missing=0.0,
                           seed=self.seed)
            self.xgb.fit(Xtrain, ytrain)
            te_y_hat = self.xgb.predict(Xtest,
                                        ntree_limit=self.xgb.booster().best_iteration)
            print('XGB Test score is:', -self.scoring(te_y_hat, ytest))

            self.off = DigitizedOptimizedOffsetRegressor(n_buckets=self.n_buckets,
                           initial_params=self.initial_params,
                           minimizer=self.minimizer,
                           scoring=self.scoring)
            self.off.fit(te_y_hat, ytest)
            print("Offsets:", self.off.params)
            params += [list(self.off.params)]

            pass

        from numpy import array
        self.off.params = array(params).mean(axis=0)
        print("Mean Offsets:", self.off.params)
        self.xgb.fit(X, y)

        return self


    def predict(self, X):
        from numpy import clip
        te_y_hat = self.xgb.predict(X, ntree_limit=self.xgb.booster().best_iteration)
        return clip(self.off.predict(te_y_hat), 1, 8)

    pass
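These estimators default to scoring=NegQWKappaScorer, the competition metric with its sign flipped so that it can be minimized directly (which is why the snippets print -self.scoring(...)). Its definition is not included in this excerpt; a plausible minimal stand-in, assuming sklearn's cohen_kappa_score and the 1..8 Response scale, could look like:

import numpy as np
from sklearn.metrics import cohen_kappa_score

def neg_qwk_scorer(y_hat, y, min_rating=1, max_rating=8):
    # Hypothetical stand-in for NegQWKappaScorer: round and clip continuous
    # predictions onto the 1..8 label scale, then return the negative
    # quadratic weighted kappa (lower is better, suitable for minimizers).
    y_hat = np.clip(np.rint(np.asarray(y_hat)), min_rating, max_rating).astype(int)
    return -cohen_kappa_score(np.asarray(y), y_hat, weights='quadratic')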
class PrudentialRegressor(BaseEstimator, RegressorMixin):
    def __init__(self,
                objective='reg:linear',
                learning_rate=0.045,
                min_child_weight=50,
                subsample=0.8,
                colsample_bytree=0.7,
                max_depth=7,
                n_estimators=700,
                nthread=-1,
                seed=0,
                n_buckets=8,
                initial_params=[-1.5, -2.6, -3.6, -1.2, -0.8, 0.04, 0.7, 3.6,
                                #1., 2., 3., 4., 5., 6., 7.
                                ],
                minimizer='BFGS',
                scoring=NegQWKappaScorer):

        self.objective = objective
        self.learning_rate = learning_rate
        self.min_child_weight = min_child_weight
        self.subsample = subsample
        self.colsample_bytree = colsample_bytree
        self.max_depth = max_depth
        self.n_estimators = n_estimators
        self.nthread = nthread
        self.seed = seed
        self.n_buckets = n_buckets
        self.initial_params = initial_params
        self.minimizer = minimizer
        self.scoring = scoring

        return


    def fit(self, X, y):
        from xgboost import XGBRegressor
        if not KAGGLE:
            from OptimizedOffsetRegressor import DigitizedOptimizedOffsetRegressor

        self.xgb = XGBRegressor(
                       objective=self.objective,
                       learning_rate=self.learning_rate,
                       min_child_weight=self.min_child_weight,
                       subsample=self.subsample,
                       colsample_bytree=self.colsample_bytree,
                       max_depth=self.max_depth,
                       n_estimators=self.n_estimators,
                       nthread=self.nthread,
                       missing=0.0,
                       seed=self.seed)
        #from OptimizedOffsetRegressor import FullDigitizedOptimizedOffsetRegressor
        #self.off = FullDigitizedOptimizedOffsetRegressor(n_buckets=self.n_buckets,
        #               basinhopping=True,
        self.off = DigitizedOptimizedOffsetRegressor(n_buckets=self.n_buckets,
                       initial_params=self.initial_params,
                       minimizer=self.minimizer,
                       scoring=self.scoring)

        self.xgb.fit(X, y)

        tr_y_hat = self.xgb.predict(X,
                                    ntree_limit=self.xgb.booster().best_iteration)
        print('Train score is:', -self.scoring(tr_y_hat, y))
        self.off.fit(tr_y_hat, y)
        print("Offsets:", self.off.params)

        return self


    def predict(self, X):
        from numpy import clip
        te_y_hat = self.xgb.predict(X, ntree_limit=self.xgb.booster().best_iteration)
        return clip(self.off.predict(te_y_hat), 1, 8)

    pass
Example #6
    def fit(self, X, y):
        if not KAGGLE:
            from OptimizedOffsetRegressor import DigitizedOptimizedOffsetRegressor

        #from OptimizedOffsetRegressor import FullDigitizedOptimizedOffsetRegressor
        #self.off = FullDigitizedOptimizedOffsetRegressor(n_buckets=self.n_buckets,
        #               basinhopping=True,
        """
5-fold Stratified CV
grid scores:
  mean: 0.64475, std: 0.00483, params: {'colsample_bytree': 0.67, 'min_child_weight': 240, 'n_estimators': 700, 'subsample': 0.9, 'int_fold': 2, 'max_depth': 6}
  mean: 0.64926, std: 0.00401, params: {'colsample_bytree': 0.67, 'min_child_weight': 240, 'n_estimators': 700, 'subsample': 0.9, 'int_fold': 3, 'max_depth': 6}
  mean: 0.65281, std: 0.00384, params: {'colsample_bytree': 0.67, 'min_child_weight': 240, 'n_estimators': 700, 'subsample': 0.9, 'int_fold': 4, 'max_depth': 6}
  mean: 0.65471, std: 0.00422, params: {'colsample_bytree': 0.67, 'min_child_weight': 240, 'n_estimators': 700, 'subsample': 0.9, 'int_fold': 5, 'max_depth': 6}
  mean: 0.65563, std: 0.00440, params: {'colsample_bytree': 0.67, 'min_child_weight': 240, 'n_estimators': 700, 'subsample': 0.9, 'int_fold': 6, 'max_depth': 6}
  mean: 0.65635, std: 0.00433, params: {'colsample_bytree': 0.67, 'min_child_weight': 240, 'n_estimators': 700, 'subsample': 0.9, 'int_fold': 7, 'max_depth': 6}
  mean: 0.65600, std: 0.00471, params: {'colsample_bytree': 0.67, 'min_child_weight': 240, 'n_estimators': 700, 'subsample': 0.9, 'int_fold': 8, 'max_depth': 6}
best score: 0.65635
best params: {'colsample_bytree': 0.67, 'min_child_weight': 240, 'n_estimators': 700, 'subsample': 0.9, 'int_fold': 7, 'max_depth': 6}


reversed params [8 bins]:
  mean: 0.65588, std: 0.00417, params: {'colsample_bytree': 0.67, 'min_child_weight': 240, 'n_estimators': 700, 'subsample': 0.9, 'int_fold': 6, 'max_depth': 6}
  mean: 0.65640, std: 0.00438, params: {'colsample_bytree': 0.67, 'min_child_weight': 240, 'n_estimators': 700, 'subsample': 0.9, 'int_fold': 7, 'max_depth': 6}


with Scirpus obj
grid scores:
  mean: 0.65775, std: 0.00429, params: {'colsample_bytree': 0.67, 'min_child_weight': 240, 'n_estimators': 700, 'subsample': 0.9, 'int_fold': 7, 'max_depth': 6}
best score: 0.65775

+1 on the three Product_info_2* features
  mean: 0.65555, std: 0.00462, params: {'colsample_bytree': 0.67, 'min_child_weight': 240, 'n_estimators': 700, 'subsample': 0.9, 'int_fold': 6, 'max_depth': 6}
  mean: 0.65613, std: 0.00438, params: {'colsample_bytree': 0.67, 'min_child_weight': 240, 'n_estimators': 700, 'subsample': 0.9, 'int_fold': 7, 'max_depth': 6}

DISCRETE: NaN=most_common, +Medical_History_10,24 (24 is significant)
  mean: 0.65589, std: 0.00490, params: {'colsample_bytree': 0.67, 'min_child_weight': 240, 'n_estimators': 700, 'subsample': 0.9, 'int_fold': 7, 'max_depth': 6}


PROPER DATA + Scirpus + reversed params + no-drops
  mean: 0.65783, std: 0.00444, params: {'colsample_bytree': 0.67, 'min_child_weight': 240, 'n_estimators': 700, 'subsample': 0.9, 'int_fold': 7, 'max_depth': 6}

PROPER DATA + Scirpus + reversed params + no-drops, [email protected]
  mean: 0.65790, std: 0.00421, params: {'colsample_bytree': 0.67, 'min_child_weight': 240, 'n_estimators': 700, 'subsample': 0.9, 'int_fold': 7, 'max_depth': 6}

as above, max_depth=7
  mean: 0.65802, std: 0.00420, params: {'colsample_bytree': 0.67, 'min_child_weight': 240, 'n_estimators': 700, 'subsample': 0.9, 'int_fold': 7, 'max_depth': 7}

as above, max_depth=10
  mean: 0.65833, std: 0.00387, params: {'colsample_bytree': 0.67, 'min_child_weight': 240, 'n_estimators': 700, 'subsample': 0.9, 'int_fold': 7, 'max_depth': 10}

as above, max_depth=10, eta=0.03
  mean: 0.65888, std: 0.00391, params: {'colsample_bytree': 0.67, 'learning_rate': 0.03, 'min_child_weight': 240, 'n_estimators': 700, 'subsample': 0.9, 'int_fold': 7, 'max_depth': 10}

as above, max_depth=30, eta=0.02
  mean: 0.65798, std: 0.00340, params: {'colsample_bytree': 0.67, 'learning_rate': 0.02, 'min_child_weight': 240, 'n_estimators': 700, 'subsample': 0.9, 'int_fold': 7, 'max_depth': 30}

as above, max_depth=10, eta=0.03, eval_metric=Scirpus
  mean: 0.65891, std: 0.00395, params: {'colsample_bytree': 0.67, 'learning_rate': 0.03, 'min_child_weight': 240, 'n_estimators': 700, 'subsample': 0.9, 'int_fold': 7, 'max_depth': 10}

as above, max_depth=10, eta=0.03, eval_metric=QWKappa
  mean: 0.65827, std: 0.00368, params: {'colsample_bytree': 0.67, 'learning_rate': 0.03, 'min_child_weight': 240, 'n_estimators': 700, 'subsample': 0.9, 'int_fold': 7, 'max_depth': 10}


as above, max_depth=10, eta=0.03, eval_metric=Scirpus, GMM6,GMM17
  mean: 0.65862, std: 0.00423, params: {'colsample_bytree': 0.67, 'learning_rate': 0.03, 'min_child_weight': 240, 'n_estimators': 700, 'subsample': 0.9, 'int_fold': 7, 'max_depth': 10}
as above, max_depth=10, eta=0.03, eval_metric=Scirpus, Gvector
  mean: 0.65864, std: 0.00384, params: {'colsample_bytree': 0.67, 'learning_rate': 0.03, 'min_child_weight': 240, 'n_estimators': 700, 'subsample': 0.9, 'int_fold': 7, 'max_depth': 10}

as above, max_depth=10, eta=0.03, eval_metric=Scirpus, learning_rates=[0.03] * 200 + [0.02] * 500,
  mean: 0.65910, std: 0.00384, params: {'colsample_bytree': 0.67, 'learning_rate': 0.03, 'min_child_weight': 240, 'n_estimators': 700, 'subsample': 0.9, 'int_fold': 7, 'max_depth': 10}

        """

        from sklearn.cross_validation import StratifiedKFold
        kf = StratifiedKFold(y, n_folds=self.int_fold)
        print(kf)
        self.xgb = []
        self.off = []
        # one (XGB model, offset regressor) pair is trained per CV fold
        for i, (itrain, itest) in enumerate(kf):
            ytrain = y[itrain]
            Xtrain = X.iloc[list(itrain)]
            ytest = y[itest]
            Xtest = X.iloc[list(itest)]

            self.xgb += [None]

            from xgb_sklearn import XGBRegressor
            #from xgboost import XGBRegressor
            self.xgb[i] = XGBRegressor(objective=self.objective,
                                       learning_rate=self.learning_rate,
                                       min_child_weight=self.min_child_weight,
                                       subsample=self.subsample,
                                       colsample_bytree=self.colsample_bytree,
                                       max_depth=self.max_depth,
                                       n_estimators=self.n_estimators,
                                       nthread=self.nthread,
                                       missing=0.0,
                                       seed=self.seed)
            self.xgb[i].fit(
                Xtrain,
                ytrain,
                eval_set=[(Xtest, ytest)],
                #eval_metric=self.scoring,
                #eval_metric='rmse',
                eval_metric=scirpus_error,
                #eval_metric=qwkappa_error,
                verbose=False,
                early_stopping_rounds=30,
                #learning_rates=[self.learning_rate] * 200 + [0.02] * 500,
                obj=scirpus_regobj
                #obj=qwkappa_regobj
            )
            print("best iteration:", self.xgb[i].booster().best_iteration)
            te_y_hat = self.xgb[i].predict(
                Xtest, ntree_limit=self.xgb[i].booster().best_iteration)
            print('XGB Test score is:', -self.scoring(te_y_hat, ytest))

            self.off += [None]
            self.off[i] = DigitizedOptimizedOffsetRegressor(
                n_buckets=self.n_buckets,
                initial_params=self.initial_params,
                minimizer=self.minimizer,
                scoring=self.scoring)
            self.off[i].fit(te_y_hat, ytest)
            print("Offsets:", self.off[i].params)
            pass

        return self
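The fit in Example #6 swaps xgboost's stock XGBRegressor for a patched wrapper (xgb_sklearn) so that a custom training objective (obj=scirpus_regobj) and a custom eval_metric (scirpus_error) can be forwarded to the booster; those two functions are not shown in this excerpt. For reference, an xgboost custom objective returns per-row gradient and hessian, and a custom eval metric returns a (name, value) pair. The squared-error stand-ins below only illustrate the expected signatures, not the Scirpus functions themselves:

import numpy as np

def squared_error_regobj(preds, dtrain):
    # custom objective: gradient and hessian of 0.5 * (pred - label)^2
    labels = dtrain.get_label()
    grad = preds - labels
    hess = np.ones_like(preds)
    return grad, hess

def rmse_eval(preds, dtrain):
    # custom eval metric: (metric name, value); lower is treated as better
    labels = dtrain.get_label()
    return 'rmse', float(np.sqrt(np.mean((preds - labels) ** 2)))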
Example #7
    def fit(self, X, y):
        from xgboost import XGBRegressor
        if not KAGGLE:
            from OptimizedOffsetRegressor import DigitizedOptimizedOffsetRegressor

        #from OptimizedOffsetRegressor import FullDigitizedOptimizedOffsetRegressor
        #self.off = FullDigitizedOptimizedOffsetRegressor(n_buckets=self.n_buckets,
        #               basinhopping=True,
        """
2 / 5
grid scores:
  mean: 0.65531, std: 0.00333, params: {'n_estimators': 700, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 6, 'min_child_weight': 240}
best score: 0.65531

3 / 5
grid scores:
  mean: 0.65474, std: 0.00308, params: {'n_estimators': 700, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 6, 'min_child_weight': 240}
best score: 0.65474

4 / 5
grid scores:
  mean: 0.65490, std: 0.00302, params: {'n_estimators': 700, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 6, 'min_child_weight': 240}
best score: 0.65490


2 / 10
grid scores:
  mean: 0.65688, std: 0.00725, params: {'n_estimators': 700, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 6, 'min_child_weight': 240}
best score: 0.65688

3 / 10
grid scores:
  mean: 0.65705, std: 0.00714, params: {'n_estimators': 700, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 6, 'min_child_weight': 240}
best score: 0.65705

4 / 10
grid scores:
  mean: 0.65643, std: 0.00715, params: {'n_estimators': 700, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 6, 'min_child_weight': 240}
best score: 0.65643

5 / 10
grid scores:
  mean: 0.65630, std: 0.00699, params: {'n_estimators': 700, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 6, 'min_child_weight': 240}
best score: 0.65630

        """
        from sklearn.cross_validation import StratifiedKFold
        kf = StratifiedKFold(y, n_folds=2)
        print(kf)
        params = []
        for itrain, itest in kf:
            ytrain = y[itrain]
            Xtrain = X.iloc[list(itrain)]
            ytest = y[itest]
            Xtest = X.iloc[list(itest)]

            self.xgb = XGBRegressor(objective=self.objective,
                                    learning_rate=self.learning_rate,
                                    min_child_weight=self.min_child_weight,
                                    subsample=self.subsample,
                                    colsample_bytree=self.colsample_bytree,
                                    max_depth=self.max_depth,
                                    n_estimators=self.n_estimators,
                                    nthread=self.nthread,
                                    missing=0.0,
                                    seed=self.seed)
            self.xgb.fit(Xtrain, ytrain)
            te_y_hat = self.xgb.predict(
                Xtest, ntree_limit=self.xgb.booster().best_iteration)
            print('XGB Test score is:', -self.scoring(te_y_hat, ytest))

            self.off = DigitizedOptimizedOffsetRegressor(
                n_buckets=self.n_buckets,
                initial_params=self.initial_params,
                minimizer=self.minimizer,
                scoring=self.scoring)
            self.off.fit(te_y_hat, ytest)
            print("Offsets:", self.off.params)
            params += [list(self.off.params)]

            pass

        from numpy import array
        self.off.params = array(params).mean(axis=0)
        print("Mean Offsets:", self.off.params)
        self.xgb.fit(X, y)

        return self
Example #8
class PrudentialRegressorCVO(BaseEstimator, RegressorMixin):
    def __init__(
            self,
            objective='reg:linear',
            learning_rate=0.045,
            min_child_weight=50,
            subsample=0.8,
            colsample_bytree=0.7,
            max_depth=7,
            n_estimators=700,
            nthread=-1,
            seed=0,
            n_buckets=8,
            initial_params=[
                -1.5,
                -2.6,
                -3.6,
                -1.2,
                -0.8,
                0.04,
                0.7,
                3.6,
                #1., 2., 3., 4., 5., 6., 7.
            ],
            minimizer='BFGS',
            scoring=NegQWKappaScorer):

        self.objective = objective
        self.learning_rate = learning_rate
        self.min_child_weight = min_child_weight
        self.subsample = subsample
        self.colsample_bytree = colsample_bytree
        self.max_depth = max_depth
        self.n_estimators = n_estimators
        self.nthread = nthread
        self.seed = seed
        self.n_buckets = n_buckets
        self.initial_params = initial_params
        self.minimizer = minimizer
        self.scoring = scoring

        return

    def fit(self, X, y):
        from xgboost import XGBRegressor
        if not KAGGLE:
            from OptimizedOffsetRegressor import DigitizedOptimizedOffsetRegressor

        #from OptimizedOffsetRegressor import FullDigitizedOptimizedOffsetRegressor
        #self.off = FullDigitizedOptimizedOffsetRegressor(n_buckets=self.n_buckets,
        #               basinhopping=True,
        """
2 / 5
grid scores:
  mean: 0.65531, std: 0.00333, params: {'n_estimators': 700, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 6, 'min_child_weight': 240}
best score: 0.65531

3 / 5
grid scores:
  mean: 0.65474, std: 0.00308, params: {'n_estimators': 700, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 6, 'min_child_weight': 240}
best score: 0.65474

4 / 5
grid scores:
  mean: 0.65490, std: 0.00302, params: {'n_estimators': 700, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 6, 'min_child_weight': 240}
best score: 0.65490


2 / 10
grid scores:
  mean: 0.65688, std: 0.00725, params: {'n_estimators': 700, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 6, 'min_child_weight': 240}
best score: 0.65688

3 / 10
grid scores:
  mean: 0.65705, std: 0.00714, params: {'n_estimators': 700, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 6, 'min_child_weight': 240}
best score: 0.65705

4 / 10
grid scores:
  mean: 0.65643, std: 0.00715, params: {'n_estimators': 700, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 6, 'min_child_weight': 240}
best score: 0.65643

5 / 10
grid scores:
  mean: 0.65630, std: 0.00699, params: {'n_estimators': 700, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 6, 'min_child_weight': 240}
best score: 0.65630

        """
        from sklearn.cross_validation import StratifiedKFold
        kf = StratifiedKFold(y, n_folds=2)
        print(kf)
        params = []
        for itrain, itest in kf:
            ytrain = y[itrain]
            Xtrain = X.iloc[list(itrain)]
            ytest = y[itest]
            Xtest = X.iloc[list(itest)]

            self.xgb = XGBRegressor(objective=self.objective,
                                    learning_rate=self.learning_rate,
                                    min_child_weight=self.min_child_weight,
                                    subsample=self.subsample,
                                    colsample_bytree=self.colsample_bytree,
                                    max_depth=self.max_depth,
                                    n_estimators=self.n_estimators,
                                    nthread=self.nthread,
                                    missing=0.0,
                                    seed=self.seed)
            self.xgb.fit(Xtrain, ytrain)
            te_y_hat = self.xgb.predict(
                Xtest, ntree_limit=self.xgb.booster().best_iteration)
            print('XGB Test score is:', -self.scoring(te_y_hat, ytest))

            self.off = DigitizedOptimizedOffsetRegressor(
                n_buckets=self.n_buckets,
                initial_params=self.initial_params,
                minimizer=self.minimizer,
                scoring=self.scoring)
            self.off.fit(te_y_hat, ytest)
            print("Offsets:", self.off.params)
            params += [list(self.off.params)]

            pass

        from numpy import array
        self.off.params = array(params).mean(axis=0)
        print("Mean Offsets:", self.off.params)
        self.xgb.fit(X, y)

        return self

    def predict(self, X):
        from numpy import clip
        te_y_hat = self.xgb.predict(
            X, ntree_limit=self.xgb.booster().best_iteration)
        return clip(self.off.predict(te_y_hat), 1, 8)

    pass
Example #9
class PrudentialRegressor(BaseEstimator, RegressorMixin):
    def __init__(
            self,
            objective='reg:linear',
            learning_rate=0.045,
            min_child_weight=50,
            subsample=0.8,
            colsample_bytree=0.7,
            max_depth=7,
            n_estimators=700,
            nthread=-1,
            seed=0,
            n_buckets=8,
            initial_params=[
                -1.5,
                -2.6,
                -3.6,
                -1.2,
                -0.8,
                0.04,
                0.7,
                3.6,
                #1., 2., 3., 4., 5., 6., 7.
            ],
            minimizer='BFGS',
            scoring=NegQWKappaScorer):

        self.objective = objective
        self.learning_rate = learning_rate
        self.min_child_weight = min_child_weight
        self.subsample = subsample
        self.colsample_bytree = colsample_bytree
        self.max_depth = max_depth
        self.n_estimators = n_estimators
        self.nthread = nthread
        self.seed = seed
        self.n_buckets = n_buckets
        self.initial_params = initial_params
        self.minimizer = minimizer
        self.scoring = scoring

        return

    def fit(self, X, y):
        from xgboost import XGBRegressor
        if not KAGGLE:
            from OptimizedOffsetRegressor import DigitizedOptimizedOffsetRegressor

        self.xgb = XGBRegressor(objective=self.objective,
                                learning_rate=self.learning_rate,
                                min_child_weight=self.min_child_weight,
                                subsample=self.subsample,
                                colsample_bytree=self.colsample_bytree,
                                max_depth=self.max_depth,
                                n_estimators=self.n_estimators,
                                nthread=self.nthread,
                                missing=0.0,
                                seed=self.seed)
        #from OptimizedOffsetRegressor import FullDigitizedOptimizedOffsetRegressor
        #self.off = FullDigitizedOptimizedOffsetRegressor(n_buckets=self.n_buckets,
        #               basinhopping=True,
        self.off = DigitizedOptimizedOffsetRegressor(
            n_buckets=self.n_buckets,
            initial_params=self.initial_params,
            minimizer=self.minimizer,
            scoring=self.scoring)

        self.xgb.fit(X, y)

        tr_y_hat = self.xgb.predict(
            X, ntree_limit=self.xgb.booster().best_iteration)
        print('Train score is:', -self.scoring(tr_y_hat, y))
        self.off.fit(tr_y_hat, y)
        print("Offsets:", self.off.params)

        return self

    def predict(self, X):
        from numpy import clip
        te_y_hat = self.xgb.predict(
            X, ntree_limit=self.xgb.booster().best_iteration)
        return clip(self.off.predict(te_y_hat), 1, 8)

    pass