Example #1
import numpy as np
import xgboost
import lightgbm
from sklearn.ensemble import (AdaBoostRegressor, ExtraTreesRegressor,
                              GradientBoostingRegressor, RandomForestRegressor)
from sklearn.linear_model import LinearRegression

def stacking_pred(x_train,
                  y_train,
                  x_valid,
                  kf,
                  clf_list,
                  label_split=None,
                  clf_fin='lgb',
                  if_concat_origin=True):
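    """Stack base-learner predictions and fit a final model on them.

    Each element of clf_list must be a callable with the signature
    clf(x_train, y_train, x_valid, kf, label_split=None) returning
    (train_predictions, test_predictions, clf_name). The function returns
    the final model's predictions for x_valid as an (n_samples, 1) array.
    """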
    for k, base_clf in enumerate(clf_list):
        # Each pass wraps a single base learner; because every branch below
        # ends in a return, only clf_list[0] is actually consumed.
        single_clf_list = [base_clf]
        column_list = []
        train_data_list = []
        test_data_list = []

        for clf in single_clf_list:
            train_data, test_data, clf_name = clf(x_train,
                                                  y_train,
                                                  x_valid,
                                                  kf,
                                                  label_split=label_split)
            train_data_list.append(train_data)

            test_data_list.append(test_data)
            column_list.append("clf_%s" % (clf_name))

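        # Column-stack the base learner's predictions into the
        # second-stage feature matrices.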
        train = np.concatenate(train_data_list, axis=1)
        test = np.concatenate(test_data_list, axis=1)

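        # Optionally prepend the original features to the stacked predictions.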
        if if_concat_origin:
            train = np.concatenate([x_train, train], axis=1)
            test = np.concatenate([x_valid, test], axis=1)

        # Debug output: shapes of the original and stacked feature matrices.
        print(x_train.shape)
        print(train.shape)
        print(clf_name)
        print(clf_name in ['lgb'])

        # 'lsvc' and 'knn' have no branch below and would leave clf unbound,
        # so they are not accepted here.
        if clf_fin in ['rf', 'ada', 'gb', 'et', 'lr']:
            if clf_fin in ['rf']:
                clf = RandomForestRegressor(n_estimators=600,
                                            max_depth=20,
                                            n_jobs=-1,
                                            random_state=2017,
                                            max_features=1.0,  # 'auto' (all features) was removed in scikit-learn 1.3
                                            verbose=1)
            elif clf_fin in ['ada']:
                clf = AdaBoostRegressor(n_estimators=30,
                                        random_state=2017,
                                        learning_rate=0.01)
            elif clf_fin in ['gb']:
                clf = GradientBoostingRegressor(learning_rate=0.04,
                                                n_estimators=100,
                                                subsample=0.8,
                                                random_state=2017,
                                                max_depth=5,
                                                verbose=1)
            elif clf_fin in ['et']:
                clf = ExtraTreesRegressor(n_estimators=600,
                                          max_depth=35,
                                          max_features=1.0,  # 'auto' (all features) was removed in scikit-learn 1.3
                                          n_jobs=-1,
                                          random_state=2017,
                                          verbose=-1)
            elif clf_fin in ['lr']:
                clf = LinearRegression(n_jobs=-1)

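            # Fit the selected scikit-learn model on the stacked features
            # and predict the validation rows.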
            clf.fit(train, y_train)
            pre = clf.predict(test).reshape(-1, 1)
            return pre
        elif clf_fin in ['xgb']:
            clf = xgboost
            train_matrix = clf.DMatrix(train, label=y_train, missing=-1)
            # x_valid has no labels, so the 'eval' entry in the watchlist
            # reuses the training data; early stopping therefore monitors
            # training error only.
            test_matrix = clf.DMatrix(train, label=y_train, missing=-1)

            params = {
                'booster': 'gbtree',
                'eval_metric': 'rmse',
                'gamma': 1,
                'min_child_weight': 1.5,
                'max_depth': 5,
                'lambda': 10,
                'subsample': 0.7,
                'colsample_bytree': 0.7,
                'colsample_bylevel': 0.7,
                'eta': 0.03,
                'tree_method': 'exact',  # 'extra' is not a valid value; 'exact' appears intended
                'seed': 2017,
                'nthread': 12
            }

            num_round = 10000
            early_stopping_rounds = 100
            watchlist = [(train_matrix, 'train'), (test_matrix, 'eval')]

            model = clf.train(params,
                              train_matrix,
                              num_boost_round=num_round,
                              evals=watchlist,
                              early_stopping_rounds=early_stopping_rounds)

            # Booster.predict expects a DMatrix, not a raw array, so wrap test.
            # (XGBoost >= 2.0 replaces ntree_limit with iteration_range.)
            pre = model.predict(clf.DMatrix(test, missing=-1),
                                ntree_limit=model.best_ntree_limit).reshape(
                                    -1, 1)

            return pre

        elif clf_fin in ['lgb']:
            print(clf_name)  # debug output
            clf = lightgbm
            train_matrix = clf.Dataset(train, label=y_train)
            # As in the XGBoost branch, the validation set reuses the
            # training data because x_valid has no labels.
            test_matrix = clf.Dataset(train, label=y_train)

            # 'tree_method' and 'colsample_bylevel' are XGBoost parameters
            # that LightGBM does not recognize, so they are dropped; the
            # deprecated 'silent' flag is replaced by 'verbose'.
            params = {
                'boosting_type': 'gbdt',
                'objective': 'regression_l2',
                'metric': 'mse',
                'min_child_weight': 1.5,
                'num_leaves': 2**5,
                'lambda_l2': 10,
                'subsample': 0.7,
                'colsample_bytree': 0.7,
                'learning_rate': 0.03,
                'seed': 2017,
                'nthread': 12,
                'verbose': -1
            }

            num_round = 10000
            early_stopping_rounds = 100
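            # Note: LightGBM >= 4.0 removes the early_stopping_rounds
            # argument from train(); pass
            # callbacks=[lightgbm.early_stopping(early_stopping_rounds)]
            # there instead.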
            model = clf.train(params,
                              train_matrix,
                              num_round,
                              valid_sets=test_matrix,
                              early_stopping_rounds=early_stopping_rounds)

            pre = model.predict(test,
                                num_iteration=model.best_iteration).reshape(
                                    -1, 1)

            return pre
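
A minimal usage sketch follows. Everything in it is illustrative: rf_reg is a hypothetical base-learner wrapper showing the clf(x_train, y_train, x_valid, kf, label_split=None) contract that stacking_pred expects, and the data is synthetic.

from sklearn.model_selection import KFold

def rf_reg(x_train, y_train, x_valid, kf, label_split=None):
    # Hypothetical base learner: out-of-fold predictions for x_train,
    # fold-averaged predictions for x_valid.
    train_pred = np.zeros((x_train.shape[0], 1))
    test_pred = np.zeros((x_valid.shape[0], 1))
    for train_idx, val_idx in kf.split(x_train, y_train):
        model = RandomForestRegressor(n_estimators=100,
                                      random_state=2017,
                                      n_jobs=-1)
        model.fit(x_train[train_idx], y_train[train_idx])
        train_pred[val_idx, 0] = model.predict(x_train[val_idx])
        test_pred[:, 0] += model.predict(x_valid) / kf.get_n_splits()
    return train_pred, test_pred, 'rf'

x_train = np.random.rand(200, 10)
y_train = np.random.rand(200)
x_valid = np.random.rand(50, 10)
kf = KFold(n_splits=5, shuffle=True, random_state=2017)

pred = stacking_pred(x_train, y_train, x_valid, kf, [rf_reg], clf_fin='lr')
print(pred.shape)  # (50, 1)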