Example #1
    def data_prepare(self):
        self.__train_feature_before = pd.read_csv(
            os.path.join(self.__input_path, "train_feature_before_df.csv"))
        self.__train_feature_after = pd.read_csv(
            os.path.join(self.__input_path, "train_feature_after_df.csv"))
        self.__train = pd.concat(
            [self.__train_feature_before, self.__train_feature_after])
        self.__test = pd.read_csv(
            os.path.join(self.__input_path, "test_feature_df.csv"))

        self.__train_label = self.__train["TARGET"].copy()
        self.__train_feature = (self.__train.drop(["TARGET"] + [
            col for col in self.__train.columns.tolist()
            if re.search(r"SK_ID", col)
        ],
                                                  axis=1)).copy()
        self.__test_feature = self.__test[
            self.__train_feature.columns.tolist()].copy()
        self.__categorical_columns = self.__train_feature.select_dtypes(
            include="object").columns.tolist()

        encoder = TargetEncoder()
        encoder.fit(self.__train_feature[self.__categorical_columns],
                    self.__train_label)
        self.__train_feature[self.__categorical_columns] = encoder.transform(
            self.__train_feature[self.__categorical_columns])
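The snippet above stops after encoding the training side; a one-line sketch of the step that would typically follow inside the same method (reusing the fitted encoder and the test frame defined above):

        # apply the encoder fitted on the training categories to the test features as well
        self.__test_feature[self.__categorical_columns] = encoder.transform(
            self.__test_feature[self.__categorical_columns])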
Example #2
def target_encoding(train,
                    target,
                    test=None,
                    feat_to_encode=None,
                    smooth=0.2,
                    random_state=9527):
    print('Target encoding...')
    train.sort_index(inplace=True)
    target = train.pop(target)
    if feat_to_encode is None:
        feat_to_encode = train.columns.tolist()
    smoothing = smooth
    oof = pd.DataFrame([])
    for tr_idx, oof_idx in StratifiedKFold(n_splits=5,
                                           random_state=random_state,
                                           shuffle=True).split(train, target):
        ce_target_encoder = TargetEncoder(cols=feat_to_encode,
                                          smoothing=smoothing)
        ce_target_encoder.fit(train.iloc[tr_idx, :], target.iloc[tr_idx])
        # DataFrame.append was removed in pandas 2.x; pd.concat keeps the original index
        oof = pd.concat(
            [oof, ce_target_encoder.transform(train.iloc[oof_idx, :])])
    ce_target_encoder = TargetEncoder(cols=feat_to_encode, smoothing=smoothing)
    ce_target_encoder.fit(train, target)
    train = oof.sort_index()
    if test is not None:
        test = ce_target_encoder.transform(test)
    features = list(train)
    print('Target encoding done!')
    return train, test, features, target
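A minimal, self-contained usage sketch for the helper above; the frame and column names are made up for illustration:

import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
df = pd.DataFrame({
    "cat1": rng.choice(list("abc"), size=100),
    "cat2": rng.choice(list("xy"), size=100),
    "label": rng.randint(0, 2, size=100),
})

# out-of-fold encode the training frame; a copy of the categorical columns stands in for "test"
train_enc, test_enc, feats, y = target_encoding(
    df, target="label", test=df[["cat1", "cat2"]].copy(),
    feat_to_encode=["cat1", "cat2"])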
Example #3
class DFMeanEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, columns=None, **kwargs):
        self.columns = columns
        self.model = TargetEncoder(**kwargs)
        self.transform_cols = None

    def fit(self, X, y):
        self.columns = X.columns if self.columns is None else self.columns
        self.transform_cols = [x for x in X.columns if x in self.columns]
        self.model.fit(X[self.transform_cols], y)

        return self

    def transform(self, X):
        if self.transform_cols is None:
            raise NotFittedError(
                f"This {self.__class__.__name__} instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator."
            )

        new_X = X.drop(columns=self.transform_cols)
        new_X = pd.concat(
            [new_X, self.model.transform(X[self.transform_cols])], axis=1)

        return new_X

    def fit_transform(self, X, y):
        return self.fit(X, y).transform(X)
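A small usage sketch for the wrapper above, on a made-up frame (column names hypothetical):

import pandas as pd

X = pd.DataFrame({"city": ["a", "a", "b", "b"], "amount": [1.0, 2.0, 3.0, 4.0]})
y = pd.Series([1, 0, 1, 1])

# only "city" is mean-encoded; "amount" passes through unchanged
enc = DFMeanEncoder(columns=["city"])
X_encoded = enc.fit_transform(X, y)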
Example #4
class CategoricalPreprocessing(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.mode_imputer = SimpleImputer(strategy="most_frequent")
        self.cat_cols = [
            'home_ownership', 'purpose', 'addr_state', 'initial_list_status'
        ]
        self.target_encoder = TargetEncoder(handle_missing='return_nan',
                                            handle_unknown='return_nan')

    def fit(self, X, y=None):
        self.mode_imputer.fit(X[self.cat_cols])
        self.target_encoder.fit(X["zip_code"], y)
        return self

    def transform(self, X, y=None):
        Xc = X.copy()

        # encode emp_length
        lookup = {
            '< 1 year': 0,
            '1 year': 1,
            '2 years': 2,
            '3 years': 3,
            '4 years': 4,
            '5 years': 5,
            '6 years': 6,
            '7 years': 7,
            '8 years': 8,
            '9 years': 9,
            '10+ years': 10
        }
        Xc["emp_length"] = Xc["emp_length"].replace(lookup)

        # issue date
        Xc["issue_d"] = pd.to_datetime(Xc["issue_d"])
        # keep a copy of the raw issue date for when we transform earliest credit line
        tmp = Xc["issue_d"].values
        Xc["issue_d"] = (
            Xc["issue_d"] -
            datetime.datetime(2000, 1, 1)).astype('timedelta64[M]')

        # earliest credit line
        Xc["earliest_cr_line"] = pd.to_datetime(Xc["earliest_cr_line"])
        Xc["earliest_cr_line"] = (
            tmp - Xc["earliest_cr_line"]).astype('timedelta64[M]')

        # imputation for home_ownership, purpose, addr_state, and initial_list_status
        Xc[self.cat_cols] = self.mode_imputer.transform(Xc[self.cat_cols])

        # encode zip code
        Xc["zip_code"] = self.target_encoder.transform(Xc["zip_code"])

        return Xc

    def fit_transform(self, X, y=None):
        return self.fit(X, y).transform(X)
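A brief usage sketch, assuming `loans` is a LendingClub-style DataFrame containing the columns referenced above and `defaults` is the matching binary target Series (both names hypothetical):

prep = CategoricalPreprocessing()
prep.fit(loans, defaults)            # fits the mode imputer and the zip_code target encoder
loans_clean = prep.transform(loans)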
Example #5
def categorical_encoding(df_X, y, cat_vars, id_train, method=None):
    if method is None:
        return df_X.values, df_X.columns

    target_enc = TargetEncoder(cols=cat_vars,
                               drop_invariant=False,
                               return_df=True,
                               impute_missing=False,
                               handle_unknown='error')
    target_enc.fit(df_X.iloc[id_train], pd.Series(y).iloc[id_train])
    df_X = target_enc.transform(df_X)

    return df_X.values, df_X.columns
Example #6
def frontend_preproc(df, y):
    '''
    Preprocess the DataFrame before applying the model on the front end.
    :df: concat of df_input by the user and X features of the model
    :y: target
    '''
    ### Feature Engineering
    ohe_cols = ['gearbox', 'fuel_type', 'warranty', 'dealer', 'doors']

    # OHE
    ohe = OneHotEncoder(categories='auto')
    feature_arr = ohe.fit_transform(df[ohe_cols]).toarray()
    feature_labels = ohe.categories_

    # Using a dictionary to produce all the new OHE columns
    feature_cols = []
    for k, v in dict(zip(ohe_cols, feature_labels)).items():
        for i in v:
            el = k + '_' + str(i)
            feature_cols.append(el)

    ohe_features = pd.DataFrame(feature_arr, columns=feature_cols)
    df = pd.concat([df, ohe_features], axis=1)
    df = df.drop(ohe_cols, axis=1)

    # Target Encoding
    cat_cols = df.select_dtypes(exclude=["number"]).columns
    cols_encoded = list(map(lambda c: c + '_encoded', cat_cols))

    t_encoder = TargetEncoder()
    t_encoder.fit(df[1:][cat_cols], y)
    df[cols_encoded] = t_encoder.transform(df[cat_cols])
    df = df.drop(cat_cols, axis=1)

    # Column Transformation: QuantileTransformer
    qt = QuantileTransformer(n_quantiles=500,
                             output_distribution='normal',
                             random_state=33)

    data = qt.fit_transform(df)
    df = pd.DataFrame(data, columns=df.columns)
    
    return df
Example #7
def target_encoding(X_train, y_train, X_test, cols, cv_id):
    cols = list(cols)
    train_new = X_train.copy()
    test_new = X_test.copy()
    test_new[:] = 0
    cv = PredefinedSplit(cv_id)
    X_train.index = X_train.index.astype(int)
    for trn_idx, val_idx in tqdm(cv.split(X_train), total=cv.get_n_splits()):
        enc = TargetEncoder(cols=cols)
        enc.fit(X_train.iloc[trn_idx], y_train[trn_idx])
        train_new.iloc[val_idx] = enc.transform(X_train.iloc[val_idx])
        test_new += enc.transform(X_test)
    test_new /= cv.get_n_splits()
    train_new = train_new[cols]
    test_new = test_new[cols]
    train_new.columns = train_new.columns + '_target'
    test_new.columns = test_new.columns + '_target'
    print(list(train_new.columns))
    return train_new, test_new
def clean_train_data_target_encoded(data):
    # uses a target encoder instead
    data = data.reset_index(drop=True)
    train_y = data.iloc[:,-1]
    train_y = train_y.reset_index(drop=True)
    train_X = data.iloc[:,:-1]
    
    train_X = process_features(train_X)
    
    
    encoder = TargetEncoder(cols = ["Hair Color",
         "Wears Glasses","University Degree","Gender","Country","Profession", 
         "Housing Situation", "Satisfation with employer"], smoothing = 300)

    encoder.fit(train_X,train_y)
    data2 =  pd.concat([encoder.transform(train_X,train_y).reset_index(drop=True),train_y.reset_index(drop=True)],axis=1)
    #data2 = data2.fillna(method="ffill")
    
    return (data2,encoder)
Example #9
class ScatterPlot(object):
    def __init__(self, *, input_path, output_path):
        self.__input_path, self.__output_path = input_path, output_path

        self.__train = None
        self.__train_feature, self.__train_label = [None for _ in range(2)]

        self.__encoder = None
        self.__pca, self.__t_sne = [None for _ in range(2)]

    def data_read(self):
        self.__train = pd.read_csv(os.path.join(self.__input_path,
                                                "train.csv"))
        self.__train = self.__train.drop(["id"], axis=1)
        self.__train_feature, self.__train_label = (self.__train.drop(
            ["target"],
            axis=1).copy(deep=True), self.__train["target"].copy(deep=True))
        self.__train_feature = self.__train_feature.astype(str)

    def data_prepare(self):
        self.__encoder = TargetEncoder()
        self.__encoder.fit(self.__train_feature, self.__train_label)
        self.__train_feature = self.__encoder.transform(self.__train_feature)

        self.__pca = PCA(n_components=2, random_state=7)
        self.__train_feature = self.__pca.fit_transform(self.__train_feature)
        self.__train_feature = pd.DataFrame(self.__train_feature,
                                            columns=["col_1", "col_2"])

        # self.__t_sne = TSNE(verbose=True, random_state=7)
        # self.__train_feature = self.__t_sne.fit_transform(self.__train_feature)
        # self.__train_feature = pd.DataFrame(self.__train_feature, columns=["col_1", "col_2"])

    def scatter_plot(self):
        _, ax = plt.subplots(figsize=(16, 9))
        ax = sns.scatterplot(x="col_1",
                             y="col_2",
                             hue=self.__train_label,
                             data=self.__train_feature,
                             ax=ax)
        ax.get_figure().savefig(os.path.join(self.__output_path, "PCA.png"))
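A driver sketch for the class above (paths hypothetical):

plot = ScatterPlot(input_path="./input", output_path="./output")
plot.data_read()
plot.data_prepare()
plot.scatter_plot()   # saves PCA.png under the output path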
Example #10
def fit_model(X_train, y_train, X_val, y_val, **params):

    if args.model == "catboost":

        if args.gpu:
            model = CatBoostRegressor(**params, loss_function="RMSE", random_state=42, use_best_model=True,
                                      task_type="GPU")
        else:
            model = CatBoostRegressor(**params, loss_function="RMSE", random_state=42, use_best_model=True,
                                      task_type="CPU")
        model.fit(X_train, y_train,
                  cat_features=cat_cols,
                  early_stopping_rounds=config.EARLY_STOPPING_ROUNDS,
                  eval_set=(X_val, y_val),
                  plot=False)
        return model, None

    elif args.model == "xgboost":

        te = TargetEncoder(cols=cat_cols, smoothing=300)
        te.fit(X_train, y_train)
        X_train = te.transform(X_train)
        X_val = te.transform(X_val)
        if args.gpu:
            model = XGBRegressor(**params, random_state=42, verbosity=1,
                                 tree_method='gpu_hist', gpu_id=0, predictor="cpu_predictor")
        else:
            model = XGBRegressor(**params, random_state=42, verbosity=1)
        model.fit(X_train, y_train,
                  eval_set=[(X_train, y_train),
                            (X_val, y_val)],
                  eval_metric="rmse",
                  early_stopping_rounds=config.EARLY_STOPPING_ROUNDS,
                  verbose=True)
        return model, te

    else:

        raise ValueError("Invalid value passed to model. Has to be either 'catboost' or 'xgboost'.")
train_myVolts_Null_cbf_parser['cbf_parser']=train_myVolts_Null_cbf_parser.query_char_count.apply(lambda x: cbf_parser_estimator(x))


train_data_1=pd.concat([train_myVolts_Null_item_type, train_myVolts_Not_Null_item_type], axis=0)
train_data_2=pd.concat([train_myVolts_Null_cbf_parser, train_myVolts_Not_Null_cbf_parser], axis=0)

train_myVolts['item_type']=train_data_1['item_type']
train_myVolts['cbf_parser']=train_data_2['cbf_parser']
train_myVolts['country_by_ip']=train_myVolts['country_by_ip'].fillna('missing')
print('Values with NANs Train',train_myVolts[feature_cols].isnull().sum())

y = train_myVolts.set_clicked
X = train_myVolts[feature_cols]
from category_encoders import TargetEncoder
t1 = TargetEncoder()
t1.fit(X, y)
X = t1.transform(X)

from sklearn.model_selection import train_test_split 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,random_state=1234)

##check
X_train.to_csv('X_train4.csv',index=False)
y_train.to_csv('y_train4.csv',index=False)


from sklearn.linear_model import LogisticRegression
#from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
logreg1 = LogisticRegression()
# logreg1 = RandomForestClassifier(n_estimators=500)
Example #12
# TARGET ENCODING (scikit-learn-style API, via category_encoders)

#!pip install category_encoders
from category_encoders import TargetEncoder

us_adults = pd.read_csv("./adult.csv", na_values="?")

us_adults.head()

features_original = [f for f in us_adults.columns if f != "income"]

features_original

# target_mapping is not defined in this snippet; an assumed binary mapping for the
# adult dataset's income labels is used here
target_mapping = {"<=50K": 0, ">50K": 1}

target_mapping

#Remap outcome variable
us_adults.loc[:, "income"] = us_adults.income.map(target_mapping)

us_adults.income.value_counts()

te = TargetEncoder(return_df=True, smoothing=0)

te.fit(X=us_adults[features_original], y=us_adults.income)

encoded_df_sk = te.transform(X=us_adults[features_original])

encoded_df_sk.shape

encoded_df_sk.head()
Example #13
class CatBoostKfold(object):

    def __init__(self, *, input_path_1, input_path_2, output_path):
        self.__input_path_1 = input_path_1
        self.__input_path_2 = input_path_2
        self.__output_path = output_path

        self.__sample_submission = None
        self.__train, self.__test = [None for _ in range(2)]
        self.__train_res, self.__test_res = [None for _ in range(2)]

        self.__train_feature, self.__train_label = [None for _ in range(2)]
        self.__test_feature = None
        self.__categorical_index = None
        self.__encoder = None
        self.__numeric_index = None

        self.__folds = None
        self.__oof_preds = None
        self.__sub_preds = None
        self.__cat = None

    def data_prepare(self):
        self.__sample_submission = pd.read_csv(os.path.join(self.__input_path_1, "sample_submission.csv"))
        self.__train = pd.read_csv(os.path.join(self.__input_path_1, "train_feature_df.csv"))
        self.__test = pd.read_csv(os.path.join(self.__input_path_1, "test_feature_df.csv"))
        self.__train_res = pd.read_csv(os.path.join(self.__input_path_2, "feature_train_res.csv"))
        self.__test_res = pd.read_csv(os.path.join(self.__input_path_2, "feature_test_res.csv"))

        self.__train_label = self.__train["TARGET"]
        self.__train_feature = self.__train.drop(["SK_ID_CURR", "TARGET"], axis=1)
        self.__test_feature = self.__test[self.__train_feature.columns]

        self.__train_res = self.__train_res.drop(["EXT_SOURCE_1", "EXT_SOURCE_2", "EXT_SOURCE_3"], axis=1)
        self.__test_res = self.__test_res.drop(["EXT_SOURCE_1", "EXT_SOURCE_2", "EXT_SOURCE_3"], axis=1)

        self.__train_feature = pd.concat([self.__train_feature, self.__train_res], axis=1)
        self.__test_feature = pd.concat([self.__test_feature, self.__test_res], axis=1)

        self.__categorical_index = np.where(self.__train_feature.dtypes == "object")[0]
        self.__train_feature.iloc[:, self.__categorical_index] = (
            self.__train_feature.iloc[:, self.__categorical_index].fillna("missing")
        )
        self.__test_feature.iloc[:, self.__categorical_index] = (
            self.__test_feature.iloc[:, self.__categorical_index].fillna("missing")
        )

        self.__encoder = TargetEncoder()
        self.__encoder.fit(self.__train_feature.iloc[:, self.__categorical_index], self.__train_label)
        self.__train_feature.iloc[:, self.__categorical_index] = (
            self.__encoder.transform(self.__train_feature.iloc[:, self.__categorical_index])
        )
        self.__test_feature.iloc[:, self.__categorical_index] = (
            self.__encoder.transform(self.__test_feature.iloc[:, self.__categorical_index])
        )

        # There are NaNs in the test dataset (feature number 77) but there were none in the learn dataset
        self.__numeric_index = np.where(self.__train_feature.dtypes != "object")[0]
        self.__train_feature.iloc[:, self.__numeric_index] = (
            self.__train_feature.iloc[:, self.__numeric_index].apply(
                lambda x: x.fillna(-999999.0) if x.median() > 0 else x.fillna(999999.0)
            )
        )
        self.__test_feature.iloc[:, self.__numeric_index] = (
            self.__test_feature.iloc[:, self.__numeric_index].apply(
                lambda x: x.fillna(-999999.0) if x.median() > 0 else x.fillna(999999.0)
            )
        )

        # shuffling is required before blending; it is not strictly needed here because StratifiedKFold shuffles later
        self.__train_feature, self.__train_label = shuffle(self.__train_feature, self.__train_label)

    def model_fit(self):
        self.__folds = StratifiedKFold(n_splits=5, shuffle=True)
        self.__oof_preds = np.zeros(shape=self.__train_feature.shape[0])
        self.__sub_preds = np.zeros(shape=self.__test_feature.shape[0])

        for n_fold, (trn_idx, val_idx) in enumerate(self.__folds.split(self.__train_feature, self.__train_label)):
            trn_x, trn_y = self.__train_feature.iloc[trn_idx], self.__train_label.iloc[trn_idx]
            val_x, val_y = self.__train_feature.iloc[val_idx], self.__train_label.iloc[val_idx]

            self.__cat = CatBoostClassifier(
                iterations=6000,
                od_wait=200,
                od_type="Iter",
                eval_metric="AUC"
            )
            self.__cat.fit(
                trn_x,
                trn_y,
                eval_set=[(val_x, val_y)],
                use_best_model=True
            )
            pred_val = self.__cat.predict_proba(val_x)[:, 1]
            pred_test = self.__cat.predict_proba(self.__test_feature)[:, 1]

            self.__oof_preds[val_idx] = pred_val
            self.__sub_preds += pred_test / self.__folds.n_splits
            print("Fold %2d AUC : %.6f" % (n_fold + 1, roc_auc_score(val_y, self.__oof_preds[val_idx])))
        print("Full AUC score %.6f" % roc_auc_score(self.__train_label, self.__oof_preds))

    def model_predict(self):
        self.__sample_submission["TARGET"] = self.__sub_preds
        self.__sample_submission.to_csv(os.path.join(self.__output_path, "sample_submission.csv"), index=False)
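A driver sketch for the K-fold trainer above (paths hypothetical):

kfold = CatBoostKfold(input_path_1="./input", input_path_2="./input_res",
                      output_path="./output")
kfold.data_prepare()
kfold.model_fit()
kfold.model_predict()   # writes sample_submission.csv with the fold-averaged predictions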
class BayesianOptimizationGoss(object):
    def __init__(self, *, input_path):
        self.__input_path = input_path

        # data prepare
        self.__train = None
        self.__train_label = None
        self.__train_feature = None
        self.__train_feature_stacking_tree = None
        self.__train_feature_stacking_linear = None
        self.__train_feature_stacking_network = None
        self.__train_feature_gp = None
        self.__encoder = None
        self.__categorical_columns = None

        # parameter tuning
        self.__gbm_bo = None
        self.__gbm_params = None
        self.__gp_params = {"alpha": 1e-4}

    def data_prepare(self):
        self.__train = pd.read_csv(
            os.path.join(self.__input_path, "train_select_feature_df.csv"))
        self.__train_feature_stacking_tree = pd.read_csv(
            os.path.join(self.__input_path, "first_layer_tree_train.csv"))
        self.__train_feature_stacking_linear = pd.read_csv(
            os.path.join(self.__input_path, "first_layer_linear_train.csv"))
        self.__train_feature_stacking_network = pd.read_csv(
            os.path.join(self.__input_path, "first_layer_network_train.csv"))
        self.__train_feature_gp = pd.read_csv(
            os.path.join(self.__input_path, "genetic_train_feature.csv"))
        self.__train_label = self.__train["TARGET"]
        self.__train_feature = self.__train.drop(["TARGET"] + [
            col for col in self.__train.columns.tolist()
            if re.search(r"SK_ID", col)
        ],
                                                 axis=1)

        self.__encoder = TargetEncoder()
        self.__categorical_columns = self.__train_feature.select_dtypes(
            "object").columns.tolist()
        self.__encoder.fit(self.__train_feature[self.__categorical_columns],
                           self.__train_label)
        self.__train_feature[self.__categorical_columns] = (
            self.__encoder.transform(
                self.__train_feature[self.__categorical_columns]))

        self.__train_feature = pd.concat([
            self.__train_feature, self.__train_feature_stacking_tree,
            self.__train_feature_stacking_linear,
            self.__train_feature_stacking_network
        ],
                                         axis=1)

    def parameter_tuning(self):
        def __cv(drop_rate, max_drop, skip_drop, n_estimators, learning_rate,
                 max_depth, num_leaves, min_split_gain, min_child_weight,
                 colsample_bytree, subsample, reg_alpha, reg_lambda):
            val = cross_val_score(
                LGBMClassifier(
                    boosting_type="dart",
                    drop_rate=max(min(drop_rate, 1.0), 0),
                    max_drop=max(round(max_drop), 1),
                    skip_drop=max(min(skip_drop, 1.0), 0),
                    n_estimators=max(round(n_estimators), 1),
                    learning_rate=max(min(learning_rate, 1.0), 0),
                    max_depth=max(round(max_depth), 1),
                    # cap num_leaves at 2 ** max_depth (^ would be XOR in Python)
                    num_leaves=max(
                        round(2 ** round(max_depth) if num_leaves >
                              2 ** round(max_depth) else round(num_leaves)), 1),
                    min_split_gain=max(min_split_gain, 0),
                    min_child_weight=max(min_child_weight, 0),
                    colsample_bytree=max(min(colsample_bytree, 1.0), 0),
                    subsample=max(min(subsample, 1.0), 0),
                    reg_alpha=max(reg_alpha, 0),
                    reg_lambda=max(reg_lambda, 0),
                    n_jobs=-1,
                    verbose=-1),
                self.__train_feature,
                self.__train_label,
                scoring="roc_auc",
                # must match the CV used by the lightgbm models in the blending
                cv=StratifiedKFold(n_splits=5, shuffle=True,
                                   random_state=8)).mean()

            return val

        self.__gbm_params = {
            # dart parameter
            "drop_rate": (0, 1.0),
            "max_drop": (10, 200),
            "skip_drop": (0, 1.0),
            # Gradient boosting parameter
            "n_estimators": (500, 3000),
            "learning_rate": (0.001, 0.1),
            # tree parameter
            "max_depth": (4, 10),
            "num_leaves": (10, 200),
            "min_split_gain": (0.00001, 0.1),
            "min_child_weight": (1, 100),
            # bagging parameter
            "colsample_bytree": (0.5, 1.0),
            "subsample": (0.5, 1.0),
            # reg parameter
            "reg_alpha": (0, 10),
            "reg_lambda": (0, 10)
        }
        self.__gbm_bo = BayesianOptimization(__cv, self.__gbm_params)
        self.__gbm_bo.maximize(init_points=30, n_iter=130, **self.__gp_params)
Example #15
M = pd.read_csv('prediction_data.csv')
M['Year of Record'] = simpleimputermedian.fit_transform(
    M['Year of Record'].values.reshape(-1, 1))
M['Age'] = simpleimputermedian.fit_transform(M['Age'].values.reshape(-1, 1))
M['Body Height [cm]'] = simpleimputermedian.fit_transform(
    M['Body Height [cm]'].values.reshape(-1, 1))
Mnoncateg = M.drop(
    ['Instance', 'Hair Color', 'Wears Glasses', 'Income'],
    axis=1)

X = datasetnoncateg.drop('Income in EUR', axis=1).values
Y = datasetnoncateg['Income in EUR'].values
#target encoding
t1 = TargetEncoder()
t1.fit(X, Y)
X = t1.transform(X)

Xtrain, Xtest, Ytrain, Ytest = train_test_split(X,
                                                Y,
                                                test_size=0.33,
                                                random_state=0)
# regressor = BayesianRidge()
regressor = RandomForestRegressor()
#regressor = AdaBoostRegressor()
#regressor = linear_model.SGDRegressor(max_iter=1000, tol=1e-3)

fitResult = regressor.fit(Xtrain, Ytrain)
YPredTest = regressor.predict(Xtest)
#learningTest = pd.DataFrame({'Predicted': YPredTest, 'Actual': Ytest })
np.sqrt(metrics.mean_squared_error(Ytest, YPredTest))
Example #16
class BayesianOptimizationGbdt(object):
    def __init__(self, *, input_path):
        self.__input_path = input_path

        # data prepare
        self.__train = None
        self.__train_label = None
        self.__train_feature = None
        self.__encoder = None
        self.__categorical_columns = None

        # parameter tuning
        self.__gbm_bo = None
        self.__gbm_params = None
        self.__gp_params = {"alpha": 1e-4}

    def data_prepare(self):
        self.__train = pd.read_csv(os.path.join(self.__input_path, "train_select_feature_df.csv"))
        self.__train_label = self.__train["TARGET"]
        self.__train_feature = self.__train.drop(
            ["TARGET"] + [col for col in self.__train.columns.tolist() if re.search(r"SK_ID", col)], axis=1)

        self.__encoder = TargetEncoder()
        self.__categorical_columns = self.__train_feature.select_dtypes("object").columns.tolist()
        self.__encoder.fit(self.__train_feature[self.__categorical_columns], self.__train_label)
        self.__train_feature[self.__categorical_columns] = (
            self.__encoder.transform(self.__train_feature[self.__categorical_columns])
        )

    def parameter_tuning(self):
        def __cv(
                n_estimators, learning_rate,
                max_depth, num_leaves, min_split_gain, min_child_weight,
                colsample_bytree, subsample, reg_alpha, reg_lambda):
            val = cross_val_score(
                LGBMClassifier(
                    n_estimators=max(int(round(n_estimators)), 1),
                    learning_rate=max(min(learning_rate, 1.0), 0),
                    max_depth=max(int(round(max_depth)), 1),
                    # if num_leaves > 2 ** max_depth, the leaf-wise tree grows too deep and overfits
                    num_leaves=max(2 ** int(round(max_depth)) if num_leaves > 2 ** int(round(max_depth)) else int(round(num_leaves)), 1),
                    min_split_gain=max(min_split_gain, 0),
                    min_child_weight=max(min_child_weight, 0),
                    colsample_bytree=max(min(colsample_bytree, 1.0), 0),
                    subsample=max(min(subsample, 1.0), 0),
                    reg_alpha=max(reg_alpha, 0),
                    reg_lambda=max(reg_lambda, 0),
                    n_jobs=-1,
                    verbose=-1
                ),
                self.__train_feature,
                self.__train_label,
                scoring="roc_auc",
                cv=StratifiedKFold(n_splits=3, shuffle=True, random_state=7)
            ).mean()

            return val

        self.__gbm_params = {
            # Gradient boosting parameter
            "n_estimators": (5500, 6500),
            "learning_rate": (0.001, 0.03),
            # tree parameter
            "max_depth": (4, 10),
            "num_leaves": (10, 200),
            "min_split_gain": (0.00001, 0.1),
            "min_child_weight": (1, 100),
            # bagging parameter
            "colsample_bytree": (0.5, 1.0),
            "subsample": (0.5, 1.0),
            # reg parameter
            "reg_alpha": (0, 10),
            "reg_lambda": (0, 10)
        }
        self.__gbm_bo = BayesianOptimization(__cv, self.__gbm_params)
        self.__gbm_bo.maximize(**self.__gp_params)
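A driver sketch for the tuner above (path hypothetical):

tuner = BayesianOptimizationGbdt(input_path="./input")
tuner.data_prepare()
tuner.parameter_tuning()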
Example #17
def train(MODEL="GNB"):

    # load voter data and merge with Census data
    df = pd.read_csv(DIR + "/data/nc_voter_geocoded_census_block_trigrams.csv")

    df = prep_data(df)

    tes = {}
    #tes = joblib.load(DIR + "/data/models/transformers_binary.joblib")

    models = {}

    # Loop through each race class, create model for each
    for race in ["W", "B", "A", "I", "HL"]:

        X = df.copy()

        # If hispanic, use ethnic_code instead of race code
        if race == "HL":
            X["ethnic_code"] = np.where(X["ethnic_code"] == race, True, False)
            y = X["ethnic_code"]

        # other wise race code
        else:
            X["race_code"] = np.where(X["race_code"] == race, True, False)
            y = X["race_code"]

        # target encode names, save target encoder
        for col in ["first_name", "last_name", "middle_name"]:

            #te = tes[race][col]
            te = TargetEncoder()
            te.fit(X[col], y)

            X[col] = te.transform(X[col])

        # remove target variables and fill in any nas with 0
        #sample_weights = X["sample_weights"]
        #X = X.drop(["race_code", "ethnic_code", "zip", "sample_weights"], axis=1)
        X = X.fillna(0)

        sm = SMOTE(n_jobs=-1)
        X, y = sm.fit_resample(X, y)
        sample_weights = X["sample_weights"]
        X = X.drop(["zip", "sample_weights"], axis=1)

        # train model
        if MODEL == "LGBM":
            from lightgbm import LGBMClassifier
            model = LGBMClassifier(n_jobs=-1)
        elif MODEL == "GNB":
            from sklearn.naive_bayes import GaussianNB
            model = GaussianNB()
        elif MODEL == "XGB":
            from xgboost import XGBClassifier
            model = XGBClassifier(n_jobs=-1)
        elif MODEL == "SGD":
            model = SGDClassifier(alpha=0.0,
                                  eta0=0.1,
                                  fit_intercept=True,
                                  l1_ratio=1.0,
                                  learning_rate="constant",
                                  loss="modified_huber",
                                  penalty="elasticnet",
                                  power_t=0.0)
        elif MODEL == "RF":
            from sklearn.ensemble import RandomForestClassifier
            model = RandomForestClassifier(n_jobs=-1, max_depth=10)

        model.fit(X[MODEL_COLS], y, sample_weight=sample_weights)

        # save model
        models[race] = model

        # score model
        print(race, model.score(X[MODEL_COLS], y))

    # Save the models and encoders
    handle = MODEL.lower()

    #joblib.dump(tes, DIR + "/data/models/transformers_binary.joblib", compress=True)
    joblib.dump(models,
                DIR + "/data/models/models_binary_%s.joblib" % handle,
                compress=True)
    #joblib.dump(scalers, DIR + "/data/models/scalers_binary.joblib", compress=True)

    print("Trained model saved to ./data/models/")
class FeatureSelectionUseVariance(object):
    def __init__(self, *, input_path, output_path):
        self.__input_path = input_path
        self.__output_path = output_path

        self.__train, self.__test = [None for _ in range(2)]
        self.__train_label = None
        self.__train_feature, self.__test_feature = [None for _ in range(2)]

        self.__categorical_columns = None
        self.__encoder = None

        self.__remove_feature = []

    def data_prepare(self):
        self.__train = pd.read_csv(
            os.path.join(self.__input_path, "train_feature_df.csv"))
        self.__test = pd.read_csv(
            os.path.join(self.__input_path, "test_feature_df.csv"))
        self.__train_label = self.__train["TARGET"]
        self.__train_feature = self.__train.drop(["SK_ID_CURR", "TARGET"],
                                                 axis=1)
        self.__test_feature = self.__test[
            self.__train_feature.columns.tolist()]

        # this approach does not work reliably here; an alternative is to drop features whose similarity equals 1
        # drop duplicate column
        # self.__train_feature = self.__train_feature.T.drop_duplicates().T
        # self.__test_feature = self.__test_feature[self.__train_feature.columns.tolist()]

        # encoder
        self.__categorical_columns = (self.__train_feature.select_dtypes(
            include="object").columns.tolist())
        self.__train_feature[self.__categorical_columns] = (
            self.__train_feature[self.__categorical_columns].fillna("missing"))
        self.__encoder = TargetEncoder()
        self.__encoder.fit(self.__train_feature[self.__categorical_columns],
                           self.__train_label)
        self.__train_feature[self.__categorical_columns] = (
            self.__encoder.transform(
                self.__train_feature[self.__categorical_columns]))

        for col in self.__train_feature.columns.tolist():
            if self.__train_feature[col].std() == 0.:
                print(col)
                self.__remove_feature.append(col)

    def data_output(self):
        self.__train[[
            col for col in self.__train.columns.tolist()
            if col not in self.__remove_feature
        ]].to_csv(os.path.join(self.__output_path,
                               "train_select_feature_df.csv"),
                  index=False)

        self.__test[[
            col for col in self.__test.columns.tolist()
            if col not in self.__remove_feature
        ]].to_csv(os.path.join(self.__output_path,
                               "test_select_feature_df.csv"),
                  index=False)
class LightGbmOneFold(object):
    def __init__(self, *, input_path, output_path):
        self.__input_path, self.__output_path = input_path, output_path

        # data prepare
        self.__sample_submission = None
        self.__train, self.__test = [None for _ in range(2)]
        self.__train_feature, self.__test_feature = [None for _ in range(2)]
        self.__train_label = None
        self.__categorical_columns = None
        self.__encoder = None

        # model fit
        self.__folds = None
        self.__train_preds = None
        self.__test_preds = None
        self.__gbm = None

    def data_prepare(self):
        self.__sample_submission = pd.read_csv(os.path.join(self.__input_path, "sample_submission.csv"))

        # selected feature
        self.__train = pd.read_csv(
            os.path.join(self.__input_path, "train_select_feature_df.csv"))
        self.__test = pd.read_csv(
            os.path.join(self.__input_path, "test_select_feature_df.csv"))

        self.__train_label = self.__train["TARGET"]
        self.__train_feature = self.__train.drop(
            ["TARGET"] + [col for col in self.__train.columns.tolist() if re.search(r"SK_ID", col)], axis=1)
        self.__test_feature = self.__test[self.__train_feature.columns.tolist()]

        self.__categorical_columns = self.__train_feature.select_dtypes("object").columns.tolist()
        self.__encoder = TargetEncoder()
        self.__encoder.fit(self.__train_feature.loc[:, self.__categorical_columns], self.__train_label)
        self.__train_feature.loc[:, self.__categorical_columns] = (
            self.__encoder.transform(self.__train_feature.loc[:, self.__categorical_columns])
        )
        self.__test_feature.loc[:, self.__categorical_columns] = (
            self.__encoder.transform(self.__test_feature.loc[:, self.__categorical_columns])
        )

    def model_fit(self):
        feature_importance_df = pd.DataFrame()

        self.__gbm = LGBMClassifier(
            n_estimators=5000,
            learning_rate=0.0128,
            max_depth=8,
            num_leaves=11,
            min_split_gain=0.0018,
            min_child_weight=2.6880,
            colsample_bytree=0.5672,
            subsample=0.6406,
            reg_alpha=3.5025,
            reg_lambda=0.9549,
            n_jobs=-1
        )

        self.__gbm.fit(self.__train_feature, self.__train_label, verbose=True)
        self.__train_preds = self.__gbm.predict_proba(self.__train_feature)[:, 1]
        self.__test_preds = self.__gbm.predict_proba(self.__test_feature)[:, 1]

        feature_importance_df["feature"] = pd.Series(self.__train_feature.columns)
        feature_importance_df["importance"] = self.__gbm.feature_importances_
        feature_importance_df.to_csv(os.path.join(self.__output_path, "feature_importance.csv"), index=False)
        print("Train AUC score %.6f" % roc_auc_score(self.__train_label, self.__train_preds))

    def model_predict(self):
        self.__sample_submission["TARGET"] = self.__test_preds
        self.__sample_submission.to_csv(os.path.join(self.__output_path, "sample_submission.csv"), index=False)
Example #20
class ROHE(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.__columns = None
        self.__missing = None
        self.__categories = None
        self.__lab_encoder = None
        self.__tar_encoder = None
        self.__ohe_encoder = None

    def fit(self, X, y=None):
        feature, label = X.copy(deep=True), y.copy(deep=True)
        del X, y
        gc.collect()

        self.__columns = list()
        self.__missing = dict()
        self.__categories = dict()
        self.__lab_encoder = dict()

        for column in feature.columns:
            num_unique = feature[column].nunique()

            if num_unique == 1:
                continue
            else:

                self.__columns.append(column)

                if feature[column].isna().sum():
                    # fill NaNs before fitting the label encoder so the
                    # "missing" filler is a known class at transform time
                    self.__missing[column] = "missing"
                    feature[column] = feature[column].fillna("missing")
                    self.__categories[column] = feature[column].unique()
                else:
                    # use the least frequent category as the filler for unseen values
                    self.__missing[column] = feature[column].value_counts(
                        ascending=True).index[0]
                    self.__categories[column] = feature[column].unique()

                encoder = LabelEncoder()
                encoder.fit(feature[column])
                feature[column] = encoder.transform(feature[column])
                self.__lab_encoder[column] = encoder

        feature = feature[self.__columns].copy(deep=True)

        self.__tar_encoder = TargetEncoder()
        self.__tar_encoder.fit(feature.astype(str), label)

        self.__ohe_encoder = OneHotEncoder(categories="auto",
                                           sparse=True)  # drop="first" bad
        self.__ohe_encoder.fit(
            self.__tar_encoder.transform(feature.astype(str)))

        return self

    def transform(self, X):
        feature = X.copy(deep=True)
        del X
        gc.collect()

        feature = feature[self.__columns].copy(deep=True)

        for column in feature.columns:
            feature[column] = feature[column].fillna(self.__missing[column])
            feature[column] = feature[column].apply(
                lambda element: element if element in self.__categories[
                    column] else self.__missing[column])
            feature[column] = self.__lab_encoder[column].transform(
                feature[column])

        return self.__ohe_encoder.transform(
            self.__tar_encoder.transform(feature.astype(str)))

    def fit_transform(self, X, y=None, **fit_params):
        feature, label = X.copy(deep=True), y.copy(deep=True)
        del X, y
        gc.collect()

        self.fit(feature, label)
        return self.transform(feature)
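A short usage sketch for the encoder above on a made-up categorical frame (names hypothetical; assumes library versions compatible with the class itself):

import pandas as pd

X_cat = pd.DataFrame({"color": ["red", "blue", None, "red"],
                      "size": ["s", "m", "m", "l"]})
y = pd.Series([1, 0, 1, 0])

rohe = ROHE()
X_sparse = rohe.fit_transform(X_cat, y)   # sparse one-hot matrix of target-encoded levels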
Example #21
class LightGbmKfold(object):
    def __init__(self, *, input_path, output_path):
        self.__input_path, self.__output_path = input_path, output_path

        # data prepare
        self.__sample_submission = None
        self.__train, self.__test = [None for _ in range(2)]
        self.__train_feature, self.__test_feature = [None for _ in range(2)]
        self.__train_label = None
        self.__categorical_columns = None
        self.__encoder = None

        # model fit
        self.__folds = None
        self.__oof_preds = None
        self.__sub_preds = None
        self.__gbm = None

    def data_prepare(self):
        self.__sample_submission = pd.read_csv(os.path.join(self.__input_path, "sample_submission.csv"))
        self.__train = pd.read_csv(os.path.join(self.__input_path, "train_feature_df.csv"))
        self.__test = pd.read_csv(os.path.join(self.__input_path, "test_feature_df.csv"))

        self.__train_label = self.__train["TARGET"]
        self.__train_feature = self.__train.drop(
            ["TARGET"] + [col for col in self.__train.columns.tolist() if re.search(r"SK_ID", col)], axis=1)
        self.__test_feature = self.__test[self.__train_feature.columns.tolist()]

        self.__categorical_columns = self.__train_feature.select_dtypes("object").columns.tolist()
        self.__encoder = TargetEncoder()
        self.__encoder.fit(self.__train_feature.loc[:, self.__categorical_columns], self.__train_label)
        self.__train_feature.loc[:, self.__categorical_columns] = (
            self.__encoder.transform(self.__train_feature.loc[:, self.__categorical_columns])
        )
        self.__test_feature.loc[:, self.__categorical_columns] = (
            self.__encoder.transform(self.__test_feature.loc[:, self.__categorical_columns])
        )

    def model_fit(self):
        self.__folds = StratifiedKFold(n_splits=5, shuffle=True)
        self.__oof_preds = np.zeros(shape=self.__train_feature.shape[0])
        self.__sub_preds = np.zeros(shape=self.__test_feature.shape[0])

        feature_importance_df = pd.DataFrame()
        for n_fold, (trn_idx, val_idx) in enumerate(self.__folds.split(self.__train_feature, self.__train_label)):
            trn_x, trn_y = self.__train_feature.iloc[trn_idx], self.__train_label.iloc[trn_idx]
            val_x, val_y = self.__train_feature.iloc[val_idx], self.__train_label.iloc[val_idx]

            self.__gbm = LGBMClassifier(
                n_estimators=10000,
                learning_rate=0.02,
                num_leaves=34,
                colsample_bytree=0.9497036,
                subsample=0.8715623,
                max_depth=8,
                reg_alpha=0.041545473,
                reg_lambda=0.0735294,
                min_split_gain=0.0222415,
                min_child_weight=39.3259775
            )

            self.__gbm.fit(
                trn_x,
                trn_y,
                eval_set=[(trn_x, trn_y), (val_x, val_y)],
                eval_metric="auc",
                verbose=True,
                early_stopping_rounds=200
            )
            pred_val = self.__gbm.predict_proba(val_x, num_iteration=self.__gbm.best_iteration_)[:, 1]
            pred_test = self.__gbm.predict_proba(self.__test_feature, num_iteration=self.__gbm.best_iteration_)[:, 1]

            self.__oof_preds[val_idx] = pred_val
            self.__sub_preds += pred_test / self.__folds.n_splits

            fold_importance_df = pd.DataFrame()
            fold_importance_df["feature"] = pd.Series(self.__train_feature.columns)
            fold_importance_df["importance"] = self.__gbm.feature_importances_
            fold_importance_df["fold"] = n_fold + 1
            feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
            print("Fold %2d AUC : %.6f" % (n_fold + 1, roc_auc_score(val_y, self.__oof_preds[val_idx])))

        feature_importance_df.to_csv(os.path.join(self.__output_path, "feature_importance.csv"), index=False)
        print("Full AUC score %.6f" % roc_auc_score(self.__train_label, self.__oof_preds))

    def model_predict(self):
        self.__sample_submission["TARGET"] = self.__sub_preds
        self.__sample_submission.to_csv(os.path.join(self.__output_path, "sample_submission.csv"), index=False)
Example #22
import pandas as pd
from category_encoders import TargetEncoder
import joblib

data = pd.read_csv('./京东万象数据填充2.csv', encoding='GBK')
data = data.dropna(subset=['价格'])
data = data.dropna(subset=['数据标签'])
data = data.dropna(subset=['数据名称'])
data = data.dropna(subset=['店铺'])

enc = TargetEncoder(cols=['数据名称', '店铺', '数据标签'])
# print(type(enc))
dataframe = data[['数据名称', '店铺', '数据标签', '数据大小', '浏览量', '价格']]
enc.fit(dataframe, dataframe['价格'])

data1 = enc.transform(dataframe)
# print(type(data1))
# dataframe = pd.DataFrame({'数据名称': data1['数据名称'], '店铺': data1['店铺'],
#                           '数据标签': data1['数据标签'], '数据大小': data1['数据大小'],
#                           '浏览量': data1['浏览量'], '价格': data1['价格']})
joblib.dump(enc, 'encoding.joblib')
data1.to_csv('final_data.csv', encoding='GBK', sep=',')
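A sketch of how the persisted encoder could be reloaded and applied to new rows with the same columns (file name taken from the code above; new_data is hypothetical):

loaded_enc = joblib.load('encoding.joblib')
new_encoded = loaded_enc.transform(new_data)   # new_data: a frame with the same columns as dataframe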
Example #23
#training['Hair Color'] = training['Hair Color'].replace( np.nan ,'Nan_data')
#training['Hair Color'] = training['Hair Color'].replace( '0' ,'Nan_data')
#training['Hair Color'] = training['Hair Color'].replace( 'Unknown' ,'Nan_data')

#test['Hair Color'] = test['Hair Color'].replace( np.nan ,'Nan_data')
#test['Hair Color'] = test['Hair Color'].replace( '0' ,'Nan_data')
#test['Hair Color'] = test['Hair Color'].replace( 'Unknown' ,'Nan_data')

X = training.iloc[:, :-1]
y = training.iloc[:, -1]

#Target encoding for categorical features.

te = TargetEncoder()
te.fit(X, y)

X = te.transform(X)

predict_dataset = te.transform(test)

from sklearn.model_selection import train_test_split
x_train, x_val, y_train, y_val = train_test_split(X,
                                                  y,
                                                  test_size=0.3,
                                                  random_state=42)

#from catboost import CatBoostRegressor

# Using CatBoost
#cat_model3 = CatBoostRegressor(iterations=125000)
class BayesianOptimizationGoss(object):
    def __init__(self, *, input_path):
        self.__input_path = input_path

        # data prepare
        self.__train = None
        self.__train_label = None
        self.__train_feature = None
        self.__encoder = None
        self.__categorical_columns = None

        # parameter tuning
        self.__gbm_bo = None
        self.__gbm_params = None
        self.__gp_params = {"alpha": 1e-3}

    def data_prepare(self):
        self.__train = pd.read_csv(
            os.path.join(self.__input_path, "train_select_feature_df.csv"))
        self.__train_label = self.__train["TARGET"]
        self.__train_feature = self.__train.drop(["TARGET"] + [
            col for col in self.__train.columns.tolist()
            if re.search(r"SK_ID", col)
        ],
                                                 axis=1)

        self.__encoder = TargetEncoder()
        self.__categorical_columns = self.__train_feature.select_dtypes(
            "object").columns.tolist()
        self.__encoder.fit(self.__train_feature[self.__categorical_columns],
                           self.__train_label)
        self.__train_feature[self.__categorical_columns] = (
            self.__encoder.transform(
                self.__train_feature[self.__categorical_columns]))

        self.__train_feature, self.__train_label = shuffle(
            self.__train_feature, self.__train_label)

    def parameter_tuning(self):
        def __cv(top_rate, other_rate, n_estimators, learning_rate, max_depth,
                 num_leaves, min_split_gain, min_child_weight,
                 colsample_bytree, reg_alpha, reg_lambda):
            val = cross_val_score(
                LGBMClassifier(
                    boosting_type="goss",
                    top_rate=max(min(top_rate, 1.0), 0),
                    other_rate=max(min(1.0 - top_rate, 1.0), 0),
                    n_estimators=max(int(n_estimators), 1),
                    learning_rate=max(min(learning_rate, 1.0), 0),
                    max_depth=max(int(max_depth), 1),
                    # cap num_leaves at 2 ** max_depth to keep the leaf-wise tree from overfitting
                    num_leaves=max(
                        int(2 ** int(max_depth) if num_leaves >
                            2 ** int(max_depth) else int(num_leaves)), 1),
                    min_split_gain=max(min_split_gain, 0),
                    min_child_weight=max(min_child_weight, 0),
                    colsample_bytree=max(min(colsample_bytree, 1.0), 0),
                    # subsample=max(min(subsample, 1.0), 0),
                    reg_alpha=max(reg_alpha, 0),
                    reg_lambda=max(reg_lambda, 0),
                    n_jobs=4,
                    verbose=-1),
                self.__train_feature,
                self.__train_label,
                scoring="roc_auc",
                cv=StratifiedKFold(n_splits=5, shuffle=True,
                                   random_state=7)).mean()

            return val

        self.__gbm_params = {
            # GOSS top_rate + other_rate = 1
            "top_rate": (0.01, 1),
            "other_rate": (0.01, 1),
            # Gradient boosting parameter
            "n_estimators": (1000, 4000),
            "learning_rate": (0.001, 0.1),
            # tree parameter
            "max_depth": (4, 10),
            "num_leaves": (10, 200),
            "min_split_gain": (0.00001, 0.1),
            "min_child_weight": (1, 100),
            # bagging parameter
            "colsample_bytree": (0, 0.999),
            # "subsample": (0, 0.999),
            # reg parameter
            "reg_alpha": (0, 10),
            "reg_lambda": (0, 10)
        }
        self.__gbm_bo = BayesianOptimization(__cv, self.__gbm_params)
        self.__gbm_bo.maximize(init_points=10,
                               n_iter=50,
                               kappa=2.576 * 2,
                               **self.__gp_params)

        print(self.__gbm_bo.res["max"]["max_val"])
        print(self.__gbm_bo.res["max"]["max_params"]["top_rate"])
        print(self.__gbm_bo.res["max"]["max_params"]["other_rate"])
        print(self.__gbm_bo.res["max"]["max_params"]["n_estimators"])
        print(self.__gbm_bo.res["max"]["max_params"]["learning_rate"])
        print(self.__gbm_bo.res["max"]["max_params"]["max_depth"])
        print(self.__gbm_bo.res["max"]["max_params"]["num_leaves"])
        print(self.__gbm_bo.res["max"]["max_params"]["min_split_gain"])
        print(self.__gbm_bo.res["max"]["max_params"]["min_child_weight"])
        print(self.__gbm_bo.res["max"]["max_params"]["colsample_bytree"])
        # print(self.__gbm_bo.res["max"]["max_params"]["subsample"])
        print(self.__gbm_bo.res["max"]["max_params"]["reg_alpha"])
        print(self.__gbm_bo.res["max"]["max_params"]["reg_lambda"])
def ProcessRawData(df, schemaCols=None):

    medianSimpleImputer = SimpleImputer(strategy='median')
    standardScaler = preprocessing.StandardScaler()

    # Adding extra features AgeLog and HeightLog
    df['AgeLog'] = np.log(df['Age'].values)
    df['HeightLog'] = np.log(df['Body Height [cm]'].values)

    # Fill missing values
    df[['Year of Record', 'Age', 'AgeLog', 'HeightLog']] = medianSimpleImputer.fit_transform(df[['Year of Record', 'Age', 'AgeLog', 'HeightLog']].values)

    # Scale numeric columns 1
    df[['Year of Record', 'Size of City', 'Body Height [cm]', 'Age', 'AgeLog']] = standardScaler.fit_transform(df[['Year of Record', 'Size of City', 'Body Height [cm]', 'Age', 'AgeLog']].values)

    # Scale numeric columns 2
    if 'Income in EUR' in df.columns:
        global YScaler
        YScaler = preprocessing.StandardScaler()
        df[['Income in EUR']] = YScaler.fit_transform(df[['Income in EUR']].values)

    # Reducing complexity of features
    df.Profession = list(df.Profession.map(S2))

    # To be used while writing results to CSV
    instances = df['Instance'].values
    df = df.drop(['Instance'], axis=1)

    print('Columns available 1 - ', df.columns)

    # Target encoding the data - could've been done with a single encoder object, will try later,
    if (schemaCols is None): # condition to skip fitting on Prediction dataset and only transform then
        global t1, t2, t3, t4, t5
        t1 = TargetEncoder()
        t2 = TargetEncoder()
        t3 = TargetEncoder()
        t4 = TargetEncoder()
        t5 = TargetEncoder()
        t1.fit(df.Country.values, df['Income in EUR'].values)
        t2.fit(df.Profession.values, df['Income in EUR'].values)
        t3.fit(df.Gender.values, df['Income in EUR'].values)
        t4.fit(df['University Degree'].values, df['Income in EUR'].values)
        t5.fit(df['Hair Color'].values, df['Income in EUR'].values)

    df.Country = t1.transform(df.Country.values)
    df.Profession = t2.transform(df.Profession.values)
    df.Gender = t3.transform(df.Gender.values)
    df['University Degree'] = t4.transform(df['University Degree'].values)
    df['Hair Color'] = t5.transform(df['Hair Color'].values)

    if (schemaCols is not None):
        newdf = pd.DataFrame()
        for columnName in schemaCols:
            if columnName not in df.columns:
                newdf[columnName] = 0
            else:
                newdf[columnName] = df[columnName].values
        df = newdf

    df = df.sort_index(axis=1)

    # standardize datasets prediction and training to use the same code from there on
    if 'Income in EUR' not in df.columns:
        df['Income in EUR'] = np.zeros(df.values.shape[0])

    if 'Income' in df.columns:
        df = df.drop('Income', axis=1)

    X = df.drop('Income in EUR', axis=1).values
    Y = df['Income in EUR'].values

    print('Shape - ', df.shape)

    global featSel
    if featSel is None:
        print('k = ? ')
        featSel = SelectKBest(f_regression, k=10)
        featSel.fit(X, Y)

    X = featSel.transform(X)
    print('Shape after feature selection - ', X.shape)
    return instances, X, Y, df.columns
class StackingFirstLayerLinear(object):
    def __init__(self, input_path, output_path):
        self.__input_path, self.__output_path = input_path, output_path
        self.__skwp = importlib.import_module("SklWrapper")

        # data prepare
        self.__train, self.__test = [None for _ in range(2)]
        self.__train_feature, self.__train_label = [None for _ in range(2)]
        self.__test_feature = None

        self.__categorical_index = None
        self.__numeric_index = None

        # filler encoder scaler
        self.__filler, self.__encoder, self.__scaler = [None for _ in range(3)]
        self.__oof_train, self.__oof_test = [None for _ in range(2)]
        self.__first_layer_train, self.__first_layer_test = [
            None for _ in range(2)
        ]

        # model fit

    def data_prepare(self):
        self.__train = pd.read_csv(
            os.path.join(self.__input_path, "train_select_feature_df.csv"))
        self.__test = pd.read_csv(
            os.path.join(self.__input_path, "test_select_feature_df.csv"))

        self.__train_label = self.__train["TARGET"]
        self.__train_feature = self.__train.drop(["TARGET"] + [
            col for col in self.__train.columns.tolist()
            if re.search(r"SK_ID", col)
        ],
                                                 axis=1)
        self.__test_feature = self.__test[
            self.__train_feature.columns.tolist()]

        # drop columns with a missing-value ratio of 0.2 or more
        na_ratio = (self.__train_feature.isna().sum() /
                    self.__train_feature.isna().count())
        self.__train_feature = self.__train_feature[list(
            na_ratio[na_ratio < 0.2].index)]
        self.__test_feature = self.__test_feature[
            self.__train_feature.columns.tolist()]

        # these are column names rather than positional indices
        self.__categorical_index = self.__train_feature.select_dtypes(
            include="object").columns.tolist()
        self.__numeric_index = self.__train_feature.select_dtypes(
            exclude="object").columns.tolist()

        # filler: the Imputer drops columns that are entirely np.nan
        self.__filler = Imputer(strategy="median")
        self.__filler.fit(self.__train_feature[self.__numeric_index])
        self.__train_feature[self.__numeric_index] = self.__filler.transform(
            self.__train_feature[self.__numeric_index])
        self.__test_feature[self.__numeric_index] = self.__filler.transform(
            self.__test_feature[self.__numeric_index])

        # encoder
        self.__encoder = TargetEncoder()
        self.__encoder.fit(self.__train_feature[self.__categorical_index],
                           self.__train_label)
        self.__train_feature[
            self.__categorical_index] = self.__encoder.transform(
                self.__train_feature[self.__categorical_index])
        self.__test_feature[
            self.__categorical_index] = self.__encoder.transform(
                self.__test_feature[self.__categorical_index])

        # scaler takes a DataFrame in and returns a numpy array, so wrap it back into a DataFrame
        self.__scaler = MinMaxScaler()
        self.__scaler.fit(self.__train_feature)
        self.__train_feature = pd.DataFrame(
            self.__scaler.transform(self.__train_feature),
            columns=self.__train_feature.columns)
        self.__test_feature = pd.DataFrame(self.__scaler.transform(
            self.__test_feature),
                                           columns=self.__test_feature.columns)

    def model_fit(self):
        def __get_oof(clf, train_feature, train_label, test_feature):
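            # Out-of-fold (OOF) stacking: each fold's model predicts only its
            # held-out rows (filling oof_train) and the full test set; the test
            # predictions are averaged over the 5 folds into oof_test.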
            folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=7)
            oof_train = np.zeros(shape=train_feature.shape[0])
            oof_test = np.zeros(shape=test_feature.shape[0])

            for n_fold, (trn_idx, val_idx) in enumerate(
                    folds.split(train_feature, train_label)):
                trn_x, trn_y = train_feature.iloc[trn_idx], train_label.iloc[
                    trn_idx]
                val_x, val_y = train_feature.iloc[val_idx], train_label.iloc[
                    val_idx]

                clf.train(trn_x, trn_y)
                pred_val = clf.predict(val_x)
                pred_test = clf.predict(test_feature)

                oof_train[val_idx] = pred_val
                oof_test += pred_test / folds.n_splits

            return oof_train.reshape((-1, 1)), oof_test.reshape((-1, 1))

        lr_p1 = self.__skwp.SklWrapper(clf=LogisticRegression,
                                       init_params={"penalty": "l1"})
        lr_p2 = self.__skwp.SklWrapper(clf=LogisticRegression,
                                       init_params={"penalty": "l2"})
        mlp_unit_100 = self.__skwp.SklWrapper(
            clf=MLPClassifier, init_params={"hidden_layer_sizes": (100, )})
        mlp_unit_200 = self.__skwp.SklWrapper(
            clf=MLPClassifier, init_params={"hidden_layer_sizes": (200, )})
        mlp_unit_300 = self.__skwp.SklWrapper(
            clf=MLPClassifier, init_params={"hidden_layer_sizes": (300, )})
        # mlp_unit_5_100 = self.__skwp.SklWrapper(
        #     clf=MLPClassifier,
        #     init_params={
        #         "hidden_layer_sizes": (100, 100, 100, 100, 100)
        #     }
        # )
        # mlp_unit_5_300 = self.__skwp.SklWrapper(
        #     clf=MLPClassifier,
        #     init_params={
        #         "hidden_layer_sizes": (300, 300, 300, 300, 300)
        #     }
        # )
        # mlp_unit_5_900 = self.__skwp.SklWrapper(
        #     clf=MLPClassifier,
        #     init_params={
        #         "hidden_layer_sizes": (900, 900, 900, 900, 900)
        #     }
        # )

        lr_p1_oof_train, lr_p1_oof_test = __get_oof(lr_p1,
                                                    self.__train_feature,
                                                    self.__train_label,
                                                    self.__test_feature)
        print("lr l1 oof complete !")
        lr_p2_oof_train, lr_p2_oof_test = __get_oof(lr_p2,
                                                    self.__train_feature,
                                                    self.__train_label,
                                                    self.__test_feature)
        print("lr l2 oof complete !")
        mlp_unit_100_oof_train, mlp_unit_100_oof_test = __get_oof(
            mlp_unit_100, self.__train_feature, self.__train_label,
            self.__test_feature)
        print("mlp 100 oof complete !")
        mlp_unit_200_oof_train, mlp_unit_200_oof_test = __get_oof(
            mlp_unit_200, self.__train_feature, self.__train_label,
            self.__test_feature)
        print("mlp 200 oof complete !")
        mlp_unit_300_oof_train, mlp_unit_300_oof_test = __get_oof(
            mlp_unit_300, self.__train_feature, self.__train_label,
            self.__test_feature)
        print("mlp 300 oof complete !")
        # mlp_unit_5_100_oof_train, mlp_unit_5_100_oof_test = __get_oof(
        #     mlp_unit_5_100,
        #     self.__train_feature,
        #     self.__train_label,
        #     self.__test_feature
        # )
        # print("mlp 5 100 oof complete !")
        # mlp_unit_5_300_oof_train, mlp_unit_5_300_oof_test = __get_oof(
        #     mlp_unit_5_300,
        #     self.__train_feature,
        #     self.__train_label,
        #     self.__test_feature
        # )
        # print("mlp 5 300 oof complete !")
        # mlp_unit_5_900_oof_train, mlp_unit_5_900_oof_test = __get_oof(
        #     mlp_unit_5_900,
        #     self.__train_feature,
        #     self.__train_label,
        #     self.__test_feature
        # )
        # print("mlp 5 900 oof complete !")

        self.__oof_train = np.hstack(
            (lr_p1_oof_train, lr_p2_oof_train, mlp_unit_100_oof_train,
             mlp_unit_200_oof_train, mlp_unit_300_oof_train))
        self.__oof_test = np.hstack(
            (lr_p1_oof_test, lr_p2_oof_test, mlp_unit_100_oof_test,
             mlp_unit_200_oof_test, mlp_unit_300_oof_test))

    def model_predict(self):
        self.__oof_train = pd.DataFrame(self.__oof_train,
                                        columns=[
                                            "lr_p1", "lr_p2", "mlp_unit_100",
                                            "mlp_unit_200", "mlp_unit_300"
                                        ])
        self.__oof_test = pd.DataFrame(self.__oof_test,
                                       columns=[
                                           "lr_p1", "lr_p2", "mlp_unit_100",
                                           "mlp_unit_200", "mlp_unit_300"
                                       ])
        self.__first_layer_train = self.__oof_train
        self.__first_layer_test = self.__oof_test
        self.__first_layer_train.to_csv(os.path.join(
            self.__output_path, "first_layer_linear_train.csv"),
                                        index=False)
        self.__first_layer_test.to_csv(os.path.join(
            self.__output_path, "first_layer_linear_test.csv"),
                                       index=False)
# Splitting Training Dataset into Test and Train
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,train_size=0.7, random_state=100)

# Installing category_encoders to import Target Encoder
!pip install category_encoders

# Importing Target Encoder
from category_encoders import TargetEncoder

# creating an object "te" for Target Encoder
te=TargetEncoder()

# Fitting Target Encoder on X_train and y_train (Training Data)
te.fit(X_train,y_train)

#Transforming X_train (Training Data)
X_train=te.transform(X_train)

# Transforming X_test (Test Data)
X_test=te.transform(X_test)

#Importing Logistic Regression from sklearn
from sklearn.linear_model import LogisticRegression

# Creating object for Logistic Regression
lr=LogisticRegression()

# Fitting Logistic Regression on X_train and y_train (Training Data)
lr.fit(X_train,y_train)
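
# A follow-up sketch, not part of the original snippet: it reuses lr, X_test and
# y_test from above and assumes a binary target, just to show how the fitted
# model could be evaluated.
from sklearn.metrics import accuracy_score, roc_auc_score

y_pred = lr.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
# roc_auc_score expects probabilities for the positive class
print("AUC:", roc_auc_score(y_test, lr.predict_proba(X_test)[:, 1]))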
def preprocessing(data):

	#--- Drop columns where all values are missing. Do this first to try and save space ---#
	data.dropna(how='all', axis=1, inplace=True)

	#########################################################################################################################
	# Creating Some Additional Variables
	#########################################################################################################################

	print("Generating some domain knowledge features...")

	# Loan to Income Ratio
	data['LOAN_INCOME_RATIO'] = data['AMT_CREDIT'] / data['AMT_INCOME_TOTAL']

	# Annuity to Income Ratio
	data['ANNUITY_INCOME_RATIO'] = data['AMT_ANNUITY'] / data['AMT_INCOME_TOTAL']

	# Collateral to Income Ratio
	data['COLLATERAL_INCOME_RATIO'] =  data['AMT_GOODS_PRICE'] / data['AMT_INCOME_TOTAL']

	# LTV
	data['LOAN_TO_VALUE_RATIO'] = data['AMT_CREDIT'] / data['AMT_GOODS_PRICE']

	# Stats on the external scores
	data['EXT_SOURCE_MEAN'] = data[['EXT_SOURCE_1','EXT_SOURCE_2','EXT_SOURCE_3']].mean(axis=1)
	data['EXT_SOURCE_MIN'] = data[['EXT_SOURCE_1','EXT_SOURCE_2','EXT_SOURCE_3']].min(axis=1)
	data['EXT_SOURCE_MAX'] = data[['EXT_SOURCE_1','EXT_SOURCE_2','EXT_SOURCE_3']].max(axis=1)
	data['EXT_SOURCE_STD'] = data[['EXT_SOURCE_1','EXT_SOURCE_2','EXT_SOURCE_3']].std(axis=1)
	data['EXT_SOURCE_SKEW'] = data[['EXT_SOURCE_1','EXT_SOURCE_2','EXT_SOURCE_3']].skew(axis=1)

	# Income per child (children count + 1 to avoid division by zero)
	data['INC_TO_KIDS'] = data['AMT_INCOME_TOTAL'] / (data['CNT_CHILDREN'] + 1)

	# Fraction of family members who are children
	data['PERC_KIDS'] = data['CNT_CHILDREN'] / data['CNT_FAM_MEMBERS']

	# indebtedness * number of children
	data['KIDS_AMT_ANNUITY_PRODUCT'] = data['CNT_CHILDREN']*data['AMT_ANNUITY']

	#Fraction of life worked
	data['WORK_FRAC'] = data['DAYS_EMPLOYED'] / data['DAYS_BIRTH']

	# Days old when the customer got their first car
	data['FIRST_CAR_DAYS'] = -data['DAYS_BIRTH'] - 365*data['OWN_CAR_AGE']

	# Total ways in which customer can be contacted
	data['SUM_CONTACT'] = data['FLAG_MOBIL'] + data['FLAG_EMP_PHONE'] + data['FLAG_WORK_PHONE'] + data['FLAG_CONT_MOBILE'] + data['FLAG_PHONE'] + data['FLAG_EMAIL']

	# Age income product
	data['AGE_INCOME_PROD'] = data['DAYS_BIRTH']*data['AMT_INCOME_TOTAL']

	# Working Age income product
	data['EMPLOYED_INCOME_PROD'] = data['DAYS_EMPLOYED']*data['AMT_INCOME_TOTAL']

	#########################################################################################################################
	# Dealing with categorical data (columns with string or object values) - filling in missings and creating dummy variables
	#########################################################################################################################

	print("filling in missing categorical data...")

	cat_data = data.select_dtypes(['object'])
	cat_col = list(data.select_dtypes(['object']).columns.values)
	cat_col.remove('Source')


	#--- Creating a list of categorical variables with missing rows and filling in with string 'missing_' ---#
	cat_miss_col = cat_data.columns[cat_data.isna().any()].tolist()
	for item in cat_miss_col:
		data['%s' %(item)].fillna('missing_', inplace=True)

	encoder = TargetEncoder(verbose=0, impute_missing=True, return_df=False, smoothing=1)

	encoder.fit(X=data.loc[data['Source']=='Train',cat_col].values,
		y=data.loc[data['Source']=='Train',['TARGET']].values.reshape(-1,))

	X = encoder.transform(X=data[cat_col].values)

	data.loc[:,cat_col] = pd.DataFrame(X, columns=cat_col, index=list(data.index))
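
	# Note: the target encoder is fitted on the 'Train' rows only and then applied
	# to the full dataset, so the test rows never influence the encoding values.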


	################################################################################################################################
	# Dealing with floating values - imputing and then normalising
	################################################################################################################################

	print("Imputing, normalising and scaling...")

	# Initialising preprocessing to normalise, scale data and impute missing data
	normaliser = Normalizer()
	scaler = StandardScaler(copy=True, with_mean=True, with_std=True)
	imputer = Imputer(axis=0, strategy="median", missing_values="NaN")

	# Impute missing values, then standardise and normalise the numerical columns
	column_list = list(data.columns)
	column_list.remove('TARGET')
	column_list.remove('Source')

	print("Imputing....")
	imputer.fit(data.loc[data['Source']=='Train',column_list])
	data.loc[:,column_list] = imputer.transform(data[column_list])

	print("Scaling....")
	scaler.fit(data.loc[data['Source']=='Train',column_list])
	data.loc[:,column_list] = scaler.transform(data[column_list])

	print("Normalising....")
	normaliser.fit(data.loc[data['Source']=='Train',column_list])
	data.loc[:,column_list] = normaliser.transform(data[column_list])


	#--- Deleting all zero columns ---#
	data = data.loc[:, (data != 0).any(axis=0)]

	#--- Getting list of variables in the dataset and correlations ---#
	
	correlations = []
	index = []

	for col in list(data.columns):
		if col!='TARGET' and col!='Source':
			correlations.append(round(data['TARGET'].corr(data[col]),3))
			index.append(col)

	correlations = pd.DataFrame(data=correlations, index=index, columns=['Correlation'])
	correlations['abs_Correlation'] = abs(correlations['Correlation'])
	correlations.to_csv("correlations.csv")

	################################################################################################################################
	# For highly correlated variables, create new interaction variables
	################################################################################################################################

	# Creating product and ratio features for the numerical variables with |correlation| > 0.05
	correlations = correlations[correlations['abs_Correlation']>0.05]
	num_vars = list(correlations.index.values)

	combinations = itertools.combinations(num_vars,2)
	
	print("Generating %s new features...." %(3*len(list(combinations))))

	start_time_2 = time()

	iteration = 0

	for i, j in itertools.combinations(num_vars,2):

		iteration+=1

		data['PROD_%s__%s' %(i,j)] = data[i]*data[j]
		data['DIV_%s__%s' %(i,j)] = data[i]/data[j]
		data['DIV2_%s__%s' %(i,j)] = data[j]/data[i]

		print("iteration: %s, time: %s" % (iteration, time() - start_time_2))

	correlations = []
	index = []

	for col in list(data.columns):
		if col!='TARGET' and col!='Source':
			correlations.append(round(data['TARGET'].corr(data[col]),3))
			index.append(col)

	correlations = pd.DataFrame(data=correlations, index=index, columns=['Correlation'])
	correlations['abs_Correlation'] = abs(correlations['Correlation'])
	correlations.to_csv("correlations_2.csv")
	correlations.to_csv("C:\\Users\\Cemlyn\\OneDrive\\Python_Code_Repository\\correlations_2.csv")
	'''
	print("Creating non-linear versions of numeric variables...")

	#--- If variable if float or number and non-binary then check for non-linear relationships ---#
	dtypes = list(set(data.dtypes))'''

	#--- Convert all int numbers to float - this will be memory intensive ---#
	'''
	power_list = [1.0,2.0,3.0]

	for col in data:

		#--- if column is numeric and non-binary then create new versions
		if data[col].dtype!='object' and len(data[col].unique()) > 2:

			corr_list={}

			data["%s_sqrt" %(col)] = np.sqrt(np.abs(data[col]))
			corr_list['sqrt'] = data['TARGET'].corr(data["%s_sqrt" %(col)])

			for power in power_list:
				data["%s_%s" %(col,power)] = np.power(data[col],power)

				corr = data['TARGET'].corr(data["%s_%s" %(col,power)])

				corr_list[power] = corr

			#--- if non-linearised variable has higher correlation keep the non-linear form which has the highest correlation
			if data['TARGET'].corr(data[col])<max(corr_list.values()):

				data.drop("%s" %(col),axis=1,inplace=True)

				for x in corr_list.keys():
					if x!=max(corr_list.values()):
						

						if ("%s_%s" %(col,power)) in data.columns:
							data.drop("%s_%s" %(col,power),axis=1,inplace=True)
		
	print(data.info())
	'''
	# Converting all int64 to int32 to save space. Might do the same with float64
	'''
	dtypes = list(set(data.dtypes))
	for types in dtypes:
		df_type = list(data.select_dtypes(types).columns)

		if 'TARGET' in df_type:
			df_type.remove('TARGET')

		#Convert 64bit values to 32 to save space
		if types=='int64':
			for col in df_type:
				data[col] = data[col].astype('int32')
		if types=='float64':
			for col in df_type:
				data[col] = data[col].astype('float32')

		df_type = data.select_dtypes(types).columns
	'''

	print(data.info())

	data.to_pickle("Processed_DFS_Data_v03.pkl")
	data.to_pickle("C:\\Users\\Cemlyn\\OneDrive\\Python_Code_Repository\\Processed_DFS_Data_v03.pkl")
	#data[:1000].to_csv("Processed_DFS_Data_sampled.csv")

	return 0
예제 #29
0
test = pd.read_csv(
    r"D:\PythonProjects\ML_Group_Data/tcd-ml-comp-201920-income-pred-group/test.csv"
)

train_data = preprocessing(train)
test_data = preprocessing(test)

y = train_data[target]
train_data.drop(target, axis=1, inplace=True)
test_data.drop(target, axis=1, inplace=True)

enc = TargetEncoder(cols=[
    'Gender', 'Country', 'Profession', 'University Degree',
    'Housing Situation', 'Satisfation with employer'
])
enc.fit(train_data, y)
train_data = enc.transform(train_data)
test_data = enc.transform(test_data)
train_data.head()
test_data.head()

#X_Train, X_Test, y_train, y_test = train_test_split(train_data, y, test_size=0.3, random_state=1)
X_Train = train_data
y_train = y

y_train_log = np.log(y_train)

training = lgb.Dataset(X_Train, y_train_log)
params = {}
params['learning_rate'] = 0.003
params['boosting_type'] = 'gbdt'
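
# A hedged continuation sketch (the original snippet stops at the params above):
# the objective/metric values and num_boost_round are illustrative assumptions,
# and `training` / `test_data` are the objects defined earlier in this example.
params['objective'] = 'regression'
params['metric'] = 'rmse'

model = lgb.train(params, training, num_boost_round=1000)
pred = np.exp(model.predict(test_data))  # undo the log transform applied to y_train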
class LightGbmKfold(object):
    def __init__(self, *, input_path, output_path):
        self.__input_path, self.__output_path = input_path, output_path

        # data prepare
        self.__sample_submission = None
        self.__train, self.__test = [None for _ in range(2)]
        self.__train_feature, self.__test_feature = [None for _ in range(2)]
        self.__train_feature_stacking_tree, self.__test_feature_stacking_tree = [
            None for _ in range(2)
        ]
        self.__train_feature_stacking_linear, self.__test_feature_stacking_linear = [
            None for _ in range(2)
        ]
        self.__train_feature_stacking_network, self.__test_feature_stacking_network = [
            None for _ in range(2)
        ]
        self.__train_feature_stacking_gp, self.__test_feature_stacking_gp = [
            None for _ in range(2)
        ]
        self.__train_label = None
        self.__categorical_columns = None
        self.__encoder = None

        # model fit
        self.__folds = None
        self.__oof_preds = None
        self.__sub_preds = None
        self.__gbm = None
        # self.__metric_weight = []

    def data_prepare(self):
        self.__sample_submission = pd.read_csv(
            os.path.join(self.__input_path, "sample_submission.csv"))

        # selected feature
        self.__train = pd.read_csv(
            os.path.join(self.__input_path, "train_select_feature_df.csv"))
        self.__test = pd.read_csv(
            os.path.join(self.__input_path, "test_select_feature_df.csv"))
        # stacking tree
        self.__train_feature_stacking_tree = pd.read_csv(
            os.path.join(self.__input_path, "first_layer_tree_train.csv"))
        self.__test_feature_stacking_tree = pd.read_csv(
            os.path.join(self.__input_path, "first_layer_tree_test.csv"))
        # stacking linear
        self.__train_feature_stacking_linear = pd.read_csv(
            os.path.join(self.__input_path, "first_layer_linear_train.csv"))
        self.__test_feature_stacking_linear = pd.read_csv(
            os.path.join(self.__input_path, "first_layer_linear_test.csv"))
        # stacking network
        self.__train_feature_stacking_network = pd.read_csv(
            os.path.join(self.__input_path, "first_layer_network_train.csv"))
        self.__test_feature_stacking_network = pd.read_csv(
            os.path.join(self.__input_path, "first_layer_network_test.csv"))
        # gp
        self.__train_feature_stacking_gp = pd.read_csv(
            os.path.join(self.__input_path, "genetic_train_feature.csv"))
        self.__test_feature_stacking_gp = pd.read_csv(
            os.path.join(self.__input_path, "genetic_test_feature.csv"))

        self.__train_label = self.__train["TARGET"]
        self.__train_feature = self.__train.drop(["TARGET"] + [
            col for col in self.__train.columns.tolist()
            if re.search(r"SK_ID", col)
        ],
                                                 axis=1)
        self.__test_feature = self.__test[
            self.__train_feature.columns.tolist()]

        self.__categorical_columns = self.__train_feature.select_dtypes(
            "object").columns.tolist()
        self.__encoder = TargetEncoder()
        self.__encoder.fit(
            self.__train_feature.loc[:, self.__categorical_columns],
            self.__train_label)
        self.__train_feature.loc[:, self.__categorical_columns] = (
            self.__encoder.transform(
                self.__train_feature.loc[:, self.__categorical_columns]))
        self.__test_feature.loc[:, self.__categorical_columns] = (
            self.__encoder.transform(
                self.__test_feature.loc[:, self.__categorical_columns]))

        self.__train_feature = pd.concat([
            self.__train_feature, self.__train_feature_stacking_tree,
            self.__train_feature_stacking_linear,
            self.__train_feature_stacking_network,
            self.__train_feature_stacking_gp
        ],
                                         axis=1)
        self.__test_feature = pd.concat([
            self.__test_feature, self.__test_feature_stacking_tree,
            self.__test_feature_stacking_linear,
            self.__test_feature_stacking_network,
            self.__test_feature_stacking_gp
        ],
                                        axis=1)
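        # __train_feature / __test_feature now hold the selected base features plus
        # the first-layer OOF predictions (tree / linear / network) and the GP features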

    def model_fit(self):
        self.__folds = StratifiedKFold(n_splits=4,
                                       shuffle=True,
                                       random_state=8)
        self.__oof_preds = np.zeros(shape=self.__train_feature.shape[0])
        self.__sub_preds = np.zeros(shape=self.__test_feature.shape[0])
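        # __oof_preds collects each fold's validation predictions;
        # __sub_preds accumulates the average of the fold models' test-set predictions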
        # self.__sub_preds = np.zeros(shape=(self.__test_feature.shape[0], 5))

        feature_importance_df = pd.DataFrame()
        for n_fold, (trn_idx, val_idx) in enumerate(
                self.__folds.split(self.__train_feature, self.__train_label)):
            trn_x, trn_y = self.__train_feature.iloc[
                trn_idx], self.__train_label.iloc[trn_idx]
            val_x, val_y = self.__train_feature.iloc[
                val_idx], self.__train_label.iloc[val_idx]

            self.__gbm = LGBMClassifier(colsample_bytree=0.6659,
                                        learning_rate=0.0197,
                                        max_depth=8,
                                        min_child_weight=1.0652,
                                        min_split_gain=0.058,
                                        n_estimators=501,
                                        num_leaves=11,
                                        reg_alpha=2.2487,
                                        reg_lambda=6.2587,
                                        subsample=0.9401)

            self.__gbm.fit(trn_x,
                           trn_y,
                           eval_set=[(trn_x, trn_y), (val_x, val_y)],
                           eval_metric="auc",
                           verbose=True,
                           early_stopping_rounds=5)
            pred_val = self.__gbm.predict_proba(
                val_x, num_iteration=self.__gbm.best_iteration_)[:, 1]
            pred_test = self.__gbm.predict_proba(
                self.__test_feature,
                num_iteration=self.__gbm.best_iteration_)[:, 1]

            self.__oof_preds[val_idx] = pred_val
            self.__sub_preds += pred_test / self.__folds.n_splits
            # self.__sub_preds[:, n_fold] = pred_test

            fold_importance_df = pd.DataFrame()
            fold_importance_df["feature"] = pd.Series(
                self.__train_feature.columns)
            fold_importance_df["importance"] = self.__gbm.feature_importances_
            fold_importance_df["fold"] = n_fold + 1
            feature_importance_df = pd.concat(
                [feature_importance_df, fold_importance_df], axis=0)
            # save the per-fold validation AUC as a weight
            # self.__metric_weight.append(roc_auc_score(val_y, self.__oof_preds[val_idx]))
            print(
                "Fold %2d AUC : %.6f" %
                (n_fold + 1, roc_auc_score(val_y, self.__oof_preds[val_idx])))

        feature_importance_df.to_csv(os.path.join(self.__output_path,
                                                  "feature_importance.csv"),
                                     index=False)
        print("Full AUC score %.6f" %
              roc_auc_score(self.__train_label, self.__oof_preds))

    def model_predict(self):
        # weight sum
        # self.__metric_weight = pd.Series(self.__metric_weight).rank()
        # self.__metric_weight = self.__metric_weight / self.__metric_weight.sum()
        # self.__metric_weight = self.__metric_weight.values.reshape((5, 1))
        # self.__sub_preds = np.dot(self.__sub_preds, self.__metric_weight)
        self.__sample_submission["TARGET"] = self.__sub_preds
        self.__sample_submission.to_csv(os.path.join(self.__output_path,
                                                     "sample_submission.csv"),
                                        index=False)
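
# A minimal usage sketch (paths are hypothetical, not from the original source):
# the class above is driven in three steps after instantiation.
if __name__ == "__main__":
    lgbk = LightGbmKfold(input_path="data/", output_path="output/")
    lgbk.data_prepare()
    lgbk.model_fit()
    lgbk.model_predict()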