Example #1
A bagged XGBoost model: a fixed binary-logistic parameter set run through Cross_Validate.bagging.
def xgb_bagging(x_train,
                y_train,
                x_test,
                folds,
                max_round,
                n_splits=5,
                bags=5):
    params = {}
    params['max_depth'] = 5
    params['objective'] = "binary:logistic"
    params['eta'] = 0.04  # learning rate
    params['subsample'] = 0.8
    params['min_child_weight'] = 8
    params['colsample_bytree'] = 0.8
    params['gamma'] = 0.60
    params['n_jobs'] = -1
    params['reg_alpha'] = 10.4
    params['reg_lambda'] = 5
    params['silent'] = 1

    # Additional processing of data
    x_train, x_test = feature_engineering_3(x_train, x_test, y_train)

    # Cross Validate
    cv = Cross_Validate(None, n_splits, x_train.shape[0], x_test.shape[0], -1,
                        params, max_round, bags)
    cv.bagging(x_train, y_train, x_test, idx=1, verbose_eval=100)

    return cv.trn_gini, cv.y_trn, cv.y_tst, cv.y_tst_mrank, cv.fscore
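Note that `folds` goes unused in this variant: `cv.bagging` is called instead of the fold-based cross-validate methods, so the argument seems to exist only to keep the signature uniform with the other model functions, and the extra trailing argument to Cross_Validate is the number of bags.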
Example #2
An L2-regularized LogisticRegression (sag solver, balanced class weights) run through the shared Cross_Validate helper.
from sklearn.linear_model import LogisticRegression


def log06(x_train, y_train, x_test, folds, max_round, n_splits=5):
    clf = LogisticRegression(
        penalty='l2',
        dual=False,
        tol=0.0001,
        C=0.005,
        fit_intercept=True,
        intercept_scaling=1,
        class_weight='balanced',
        random_state=None,
        solver='sag',
        max_iter=200,
        multi_class='ovr',
        verbose=0,
        warm_start=False,
        n_jobs=4,
    )
    # Additional processing of data
    x_train, x_test = feature_engineering_6(x_train, x_test, y_train)

    # Cross Validate
    cv = Cross_Validate(log06.__name__, n_splits, x_train.shape[0],
                        x_test.shape[0], clf, -1, -1)
    cv.cross_validate(x_train, y_train, x_test, folds, verbose_eval=True)

    return cv.trn_gini, cv.y_trn, cv.y_tst, cv.fscore
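A minimal call-site sketch for the shared pattern. Cross_Validate and the feature_engineering_* helpers are project-internal (not shown here); `folds` is assumed to be a scikit-learn splitter such as StratifiedKFold, consistent with how it is threaded through to cv.cross_validate.

# Hypothetical call site (x_train/x_test as pandas DataFrames, y_train a
# binary target); max_round is unused by log06 but is part of the shared
# signature, so a placeholder value is fine.
from sklearn.model_selection import StratifiedKFold

folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
trn_gini, y_trn, y_tst, fscore = log06(x_train, y_train, x_test,
                                       folds, max_round=-1)
print("Out-of-fold gini: %.5f" % trn_gini)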
Example #3
A Regularized Greedy Forest classifier (RGFClassifier) with log loss, evaluated the same way.
from rgf.sklearn import RGFClassifier


def rgf04(x_train, y_train, x_test, folds, max_round, n_splits=5):
    clf = RGFClassifier(
        max_leaf=1000,
        algorithm="RGF",
        loss="Log",
        l2=0.01,
        sl2=0.01,
        normalize=False,
        min_samples_leaf=7,  # 10,
        n_iter=None,
        opt_interval=100,
        learning_rate=.45,  # .3,
        calc_prob="sigmoid",
        n_jobs=-2,
        memory_policy="generous",
        verbose=0)

    # Additional processing of data
    x_train, x_test = feature_engineering_4(x_train, x_test, y_train)

    # Cross Validate
    cv = Cross_Validate(rgf04.__name__, n_splits, x_train.shape[0],
                        x_test.shape[0], clf, -1, -1)
    cv.cross_validate(x_train, y_train, x_test, folds, verbose_eval=True)

    return cv.trn_gini, cv.y_trn, cv.y_tst, cv.fscore
Example #4
An ExtraTreesClassifier with shallow trees and large minimum leaf sizes.
from sklearn.ensemble import ExtraTreesClassifier


def etc07(x_train, y_train, x_test, folds, max_round, n_splits=5):
    clf = ExtraTreesClassifier(
        n_estimators=800,
        criterion='gini',
        max_depth=5,
        min_samples_split=100,
        min_samples_leaf=100,
        max_features='auto',
        min_impurity_decrease=0.0,
        n_jobs=4,
        verbose=0,
    )
    # Additional processing of data
    x_train, x_test = feature_engineering_7(x_train, x_test, y_train)

    # Cross Validate
    cv = Cross_Validate(etc07.__name__, n_splits, x_train.shape[0],
                        x_test.shape[0], clf, -1, -1)
    cv.cross_validate(x_train, y_train, x_test, folds, verbose_eval=True)

    return cv.trn_gini, cv.y_trn, cv.y_tst, cv.fscore
Example #5
A single (non-bagged) XGBoost model, evaluated through cross_validate_xgb.
def xgb03(x_train, y_train, x_test, folds, max_round, n_splits=5):
    params = {}
    params['max_depth'] = 4
    params['objective'] = "binary:logistic"
    params['eta'] = 0.025  # learning rate
    params['subsample'] = 0.9
    params['min_child_weight'] = 100
    params['colsample_bytree'] = 0.7
    params['gamma'] = 0.60
    params['n_jobs'] = -1
    params['reg_alpha'] = 4
    # params['reg_lambda'] = 5
    params['silent'] = 1

    # Additional processing of data
    x_train, x_test = feature_engineering_3(x_train, x_test, y_train)

    # Cross Validate
    cv = Cross_Validate(xgb03.__name__, n_splits, x_train.shape[0],
                        x_test.shape[0], -1, params, max_round)
    cv.cross_validate_xgb(x_train, y_train, x_test, folds, verbose_eval=100)

    return cv.trn_gini, cv.y_trn, cv.y_tst, cv.fscore
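Note the split in the Cross_Validate interface: the XGBoost variants pass -1 in the classifier slot, supply a raw parameter dict plus max_round, and call cross_validate_xgb, while the scikit-learn-style estimators pass a clf object with -1 placeholders for params and max_round and call cross_validate.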
Example #6
A CatBoostClassifier with Newton leaf estimation and log loss.
from catboost import CatBoostClassifier


def cat05(x_train, y_train, x_test, folds, max_round, n_splits=5):
    clf = CatBoostClassifier(
        iterations=900,
        learning_rate=0.057,
        depth=5,
        l2_leaf_reg=23,
        leaf_estimation_method='Newton',
        loss_function='Logloss',
        thread_count=7,
        random_seed=177,
        one_hot_max_size=10,
        allow_writing_files=False,
    )
    # Additional processing of data
    x_train, x_test = feature_engineering_5(x_train, x_test, y_train)

    # Cross Validate
    cv = Cross_Validate(cat05.__name__, n_splits, x_train.shape[0],
                        x_test.shape[0], clf, -1, -1)
    cv.cross_validate(x_train, y_train, x_test, folds, verbose_eval=True)

    return cv.trn_gini, cv.y_trn, cv.y_tst, cv.fscore
Example #7
Two methods from a tuning/feature-selection class: tune_seq tunes XGBoost parameters one key at a time; forward_selection greedily adds features.
    def tune_seq(self, x_train, y_train, x_test, folds, verbose_eval=False):
        """ Tune parameters sequentially

        :return:
        """
        print("\ntuning starts...")

        for key in self.params_dict.keys():
            for item in self.params_dict[key]:
                print('Tuning for parameter %s with value %s' % (key, item))
                # Work on a copy so that trying a value never mutates the
                # current best parameter set in self.params
                self.params_temp = self.params.copy()
                self.params_temp.update({key: item})
                cv = Cross_Validate(None,
                                    n_splits=self.n_splits,
                                    len_trn=x_train.shape[0],
                                    len_tst=x_test.shape[0],
                                    clf=-1,
                                    params=self.params_temp,
                                    max_round=self.max_round)
                cv.cross_validate_xgb(x_train, y_train, x_test, folds,
                                      verbose_eval)
                self.params_temp.update({'score': cv.trn_gini})
                # Record this trial (parameters plus score) as one column
                self.sframe = pd.concat(
                    [self.sframe, pd.Series(self.params_temp)], axis=1)

                if cv.trn_gini > self.max_score:
                    self.max_item = item
                    self.max_score = cv.trn_gini

            self.params.update({key: self.max_item})

            self.max_item = 0
            self.max_score = 0
        self.sframe = self.sframe.transpose().reset_index()

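    # The surrounding class is not shown in this excerpt. A hypothetical
    # __init__ consistent with the attributes tune_seq relies on (names
    # inferred from the method body, not confirmed by the source):
    #
    #     def __init__(self, params, params_dict, n_splits, max_round):
    #         self.params = params            # current best XGBoost params
    #         self.params_dict = params_dict  # candidate values per key
    #         self.n_splits = n_splits
    #         self.max_round = max_round
    #         self.sframe = pd.DataFrame()    # one column per trial + score
    #         self.max_item, self.max_score = 0, 0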
    def forward_selection(self, x_train, y_train, x_test, folds, cols):
        """Greedy forward selection over the candidate columns in `cols`."""
        cv = Cross_Validate(None,
                            n_splits=self.n_splits,
                            len_trn=x_train.shape[0],
                            len_tst=x_test.shape[0],
                            clf=-1,
                            params=self.params,
                            max_round=self.max_round)

        x_train_cols = x_train[cols]
        x_test_cols = x_test[cols]

        x_train.drop(cols, axis=1, inplace=True)
        x_test.drop(cols, axis=1, inplace=True)

        cv.cross_validate_xgb(x_train, y_train, x_test, folds)
        self.current_best = cv.trn_gini
        self.scores.append(self.current_best)

        for i in range(len(cols)):
            print("Round %i" % (i + 1))
            print("Shape of train"),
            print x_train.shape

            for col in cols:

                x_train = pd.concat([x_train, x_train_cols[col]], axis=1)
                x_test = pd.concat([x_test, x_test_cols[col]], axis=1)

                cv.cross_validate_xgb(x_train, y_train, x_test, folds)

                if cv.trn_gini > self.current_best:
                    self.current_best = cv.trn_gini
                    self.col_temp = col

                # Remove the trial column again before testing the next one
                x_train.drop(col, axis=1, inplace=True)
                x_test.drop(col, axis=1, inplace=True)

            # Some column improved the score this pass: adopt it for good
            if self.col_temp != 0:
                cols.remove(self.col_temp)
                x_train = pd.concat([x_train, x_train_cols[self.col_temp]],
                                    axis=1)
                x_test = pd.concat([x_test, x_test_cols[self.col_temp]],
                                   axis=1)

                self.cols.append(self.col_temp)
                self.scores.append(self.current_best)
                self.col_temp = 0
            else:
                break
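In words: each pass tries every remaining candidate column on top of the currently kept set, permanently adopts the column that raises the cross-validated gini the most, and stops as soon as a full pass yields no improvement (self.col_temp stays 0). With n candidate columns this costs up to O(n^2) cross-validation runs, so it is only practical for modest candidate lists.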