def PreprocessOriginalData(train, test, filename):
    """Preprocess the whole training/testing sets and save them as CSV files.

    The function works on deep copies of ``train`` and ``test``. Every
    transformation is fitted on the training copy, and the parameters it
    returns are then passed to the same transformation on the testing copy.

    Args:
        train: Object exposing ``X`` (features) and ``y`` (target) attributes.
            ``X`` must provide ``columns``, ``shape`` and ``to_csv``
            (presumably a pandas DataFrame -- confirm against the caller).
        test: Object with the same layout as ``train``.
        filename: String prefix prepended to the output file names
            ``train_X.csv``, ``train_y.csv``, ``test_X.csv`` and
            ``test_y.csv``.

    Returns:
        A tuple ``(mydummylist, myheader, encoding_lst)``: the values obtained
        from ``PP.dummylist`` and ``PP.makehead`` on the concatenated
        train/test features, and the list of columns that were passed to
        ``PP.encoding``.
    """
    # preprocessing training/testing
    print("preprocessing on whole training/testing set")
    # deepcopy the train/test set to preprocess
    from copy import deepcopy
    train = deepcopy(train)
    test = deepcopy(test)
    test.X = pd.DataFrame.copy(test.X)
    # feature engineering
    train.X = firststeps(train.X)
    test.X = firststeps(test.X)
    # fillNA: parameters come from the training set and are reused on test
    train.X, NAmethod = PP.fillNA(train.X)
    test.X, _ = PP.fillNA(test.X, NAmethod)
    # Empirical Bayesian Encoding
    # Select the '*cat' columns with at least 13 distinct values over
    # train+test. The concatenation does not depend on the column, so it is
    # built once instead of once per candidate column.
    combined = pd.concat([train.X, test.X])
    encoding_lst = [col for col in train.X.columns
                    if col.endswith('cat')
                    and len(combined[col].unique()) >= 13]
    train.X, Encodingmethod = PP.encoding(train, encoding_lst)
    test.X, _ = PP.encoding(test, encoding_lst, param=Encodingmethod)
    # set down which list need 2b dummied
    # (a fresh concatenation is handed to each helper, as before, because the
    # encoding step above has replaced train.X / test.X)
    mydummylist = PP.dummylist(
        pd.concat([train.X, test.X], axis=0))
    myheader = PP.makehead(
        pd.concat([train.X, test.X], axis=0), mydummylist)
    # Both sets go through addhead -> dummy -> rmhead with the same
    # header and dummy list.
    train.X = PP.dummy(
        PP.addhead(train.X, myheader), mydummylist)
    train.X = PP.rmhead(train.X)
    test.X = PP.dummy(
        PP.addhead(test.X, myheader), mydummylist)
    test.X = PP.rmhead(test.X)
    # standardize: parameters come from the training set and are reused on test
    train.X, STDmethod = PP.standardize(train.X)
    test.X, _ = PP.standardize(test.X, STDmethod)
    print("saving %strain_X.csv (%d row * %d col)"%(filename, train.X.shape[0], train.X.shape[1]))
    train.X.to_csv(filename + "train_X.csv",
                   index=False, float_format="%.5f")
    train.y.to_csv(filename + "train_y.csv",
                   index=False, float_format="%.5f")
    print("saving %stest_X.csv (%d row * %d col)"%(filename, test.X.shape[0], test.X.shape[1]))
    test.X.to_csv(filename + "test_X.csv",
                  index=False, float_format="%.5f")
    test.y.to_csv(filename + "test_y.csv",
                  index=False, float_format="%.5f")
    return mydummylist, myheader, encoding_lst
def KFoldsPreprocess(train, test, mydummylist, myheader, encoding_lst, KFOLDS, filename):
    """Preprocess each stratified CV fold separately and save it as CSV files.

    ``train`` is split into ``KFOLDS`` shuffled stratified folds (fixed
    ``random_state``). In every fold the transformations are fitted on the
    training part, and the returned parameters are passed to the same
    transformations on the validation part.

    Args:
        train: Object exposing ``X`` and ``y`` attributes that support
            ``.iloc`` indexing.
        test: Not used by this function.
        mydummylist: Dummy list forwarded to ``PP.dummy``.
        myheader: Header forwarded to ``PP.addhead``.
        encoding_lst: Column list forwarded to ``PP.encoding``.
        KFOLDS: Number of splits given to ``StratifiedKFold``.
        filename: String prefix prepended to the output file names
            ``train_X_<i>.csv``, ``train_y_<i>.csv``, ``valid_X_<i>.csv``
            and ``valid_y_<i>.csv`` where ``<i>`` is the fold index.

    Returns:
        None. The results are written to disk.
    """
    splitter = StratifiedKFold(n_splits=KFOLDS, shuffle=True,
                               random_state=10086)
    folds = list(splitter.split(train.X, train.y))
    # options shared by every CSV written below
    csv_opts = dict(index=False, float_format="%.5f")

    for i, (train_idx, valid_idx) in enumerate(folds):
        print("preprocessing on cv #%d" % i)
        fit_part = dataset(
            train.X.iloc[train_idx], train.y.iloc[train_idx])
        held_out = dataset(
            train.X.iloc[valid_idx], train.y.iloc[valid_idx])

        # feature engineering on both parts
        fit_part.X = firststeps(fit_part.X)
        held_out.X = firststeps(held_out.X)

        # training part: each step also yields the parameters reused below
        fit_part.X, na_params = PP.fillNA(fit_part.X)
        fit_part.X, enc_params = PP.encoding(fit_part, encoding_lst)
        fit_headed = PP.addhead(fit_part.X, myheader)
        fit_dummied = PP.dummy(fit_headed, mydummylist)
        fit_part.X = PP.rmhead(fit_dummied)
        fit_part.X, std_params = PP.standardize(fit_part.X)

        # validation part: same steps, driven by the training parameters
        held_out.X, _ = PP.fillNA(held_out.X, na_params)
        held_out.X, _ = PP.encoding(held_out, encoding_lst, param=enc_params)
        valid_headed = PP.addhead(held_out.X, myheader)
        valid_dummied = PP.dummy(valid_headed, mydummylist)
        held_out.X = PP.rmhead(valid_dummied)
        held_out.X, _ = PP.standardize(held_out.X, std_params)

        # write the training part of this fold
        fit_rows, fit_cols = fit_part.X.shape[0], fit_part.X.shape[1]
        print("saving %strain_X_%d.csv (%d row * %d col)"%(filename, i, fit_rows, fit_cols))
        train_X_path = filename + "train_X_%d.csv" % i
        train_y_path = filename + "train_y_%d.csv" % i
        fit_part.X.to_csv(train_X_path, **csv_opts)
        fit_part.y.to_csv(train_y_path, **csv_opts)

        # write the validation part of this fold
        valid_rows, valid_cols = held_out.X.shape[0], held_out.X.shape[1]
        print("saving %svalid_X_%d.csv (%d row * %d col)"%(filename, i, valid_rows, valid_cols))
        valid_X_path = filename + "valid_X_%d.csv" % i
        valid_y_path = filename + "valid_y_%d.csv" % i
        held_out.X.to_csv(valid_X_path, **csv_opts)
        held_out.y.to_csv(valid_y_path, **csv_opts)