def clean_myvolts(df):
    dropped_cols = DEFAULTS['MyVoltsDroppedCols'].split(
        ',') + DEFAULTS['MyVoltsIgnoredCols'].split(',')
    new_df = df.drop(dropped_cols, axis=1)

    # new_df.dropna(inplace=True)

    for col in DEFAULTS['MyVoltsNumberCols'].split(','):
        # mean-impute numeric columns; plain assignment avoids the
        # chained-assignment pitfall of Series.fillna(inplace=True)
        new_df[col] = new_df[col].fillna(new_df[col].mean())
    new_df.fillna('unknown', inplace=True)

    encode_cols = DEFAULTS['MyVoltsEncodeCols'].split(',')

    cbe = CatBoostEncoder(cols=encode_cols,
                          return_df=True,
                          drop_invariant=True,
                          handle_missing='return_nan')
    cbe.fit(X=new_df, y=new_df['set_clicked'])
    new_df = cbe.transform(new_df)

    # one_hot_encode_cols = DEFAULTS['MyVoltsOneHotEncodeCols'].split(',')
    # new_df = oh_encode(new_df, one_hot_encode_cols)

    # label_encode_cols = DEFAULTS['MyVoltsLabelEncodeCols'].split(',')
    # new_df = label_encode(new_df, label_encode_cols)

    return new_df
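
A minimal usage sketch for clean_myvolts, assuming a dict-like DEFAULTS config; the column names below are hypothetical stand-ins for the real MyVolts schema:

import pandas as pd
from category_encoders import CatBoostEncoder

DEFAULTS = {  # hypothetical config values, for illustration only
    'MyVoltsDroppedCols': 'session_id',
    'MyVoltsIgnoredCols': 'debug_flag',
    'MyVoltsNumberCols': 'hour_of_day',
    'MyVoltsEncodeCols': 'country',
}

df = pd.DataFrame({
    'session_id': [1, 2, 3, 4],
    'debug_flag': [0, 0, 0, 0],
    'hour_of_day': [9, None, 17, 23],
    'country': ['de', 'de', 'ie', None],
    'set_clicked': [0, 1, 0, 1],
})
# numeric NaN -> column mean, other NaN -> 'unknown', 'country' target-encoded
print(clean_myvolts(df))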
Example #2
class DFCatBoostEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, columns=None, **kwargs):
        self.columns = columns
        self.model = CatBoostEncoder(**kwargs)
        self.transform_cols = None

    def fit(self, X, y):
        self.columns = X.columns if self.columns is None else self.columns
        self.transform_cols = [x for x in X.columns if x in self.columns]
        self.model.fit(X[self.transform_cols], y)

        return self

    def transform(self, X):
        return self.__transform(X)

    def __transform(self, X, y=None):
        if self.transform_cols is None:
            raise NotFittedError(
                f"This {self.__class__.__name__} instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator."
            )

        new_X = X.drop(columns=self.transform_cols)
        new_X = pd.concat([
            new_X,
            self.model.transform(X[self.transform_cols]) if y is None else
            self.model.fit_transform(X[self.transform_cols], y)
        ],
                          axis=1)

        return new_X

    def fit_transform(self, X, y):
        # NOTE: fit_transform() differs from fit() + transform(): it encodes
        # the training data with running (ordered) target statistics via
        # model.fit_transform(X, y), rather than with the final fitted mapping.
        return self.fit(X, y).__transform(X, y)
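
A quick sanity check for DFCatBoostEncoder on a toy frame (the imports are the ones the class itself needs; the column names are illustrative). Note that the encoded columns are re-appended after the untouched ones:

import pandas as pd
from category_encoders import CatBoostEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.exceptions import NotFittedError

X = pd.DataFrame({'city': ['a', 'b', 'a', 'c'], 'n': [1.0, 2.0, 3.0, 4.0]})
y = pd.Series([0, 1, 0, 1])
enc = DFCatBoostEncoder(columns=['city'])
print(enc.fit_transform(X, y))  # 'city' encoded with running (ordered) target statistics
print(enc.transform(X))         # re-uses the statistics fitted above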
Example #3
 def encode_cat_features(self, X, y, cat_features, train_mask, val_mask, test_mask):
     from category_encoders import CatBoostEncoder
     enc = CatBoostEncoder()
     A = X.to_numpy(copy=True)
     b = y.to_numpy(copy=True)
     A[np.ix_(train_mask, cat_features)] = enc.fit_transform(A[np.ix_(train_mask, cat_features)], b[train_mask])
     A[np.ix_(val_mask + test_mask, cat_features)] = enc.transform(A[np.ix_(val_mask + test_mask, cat_features)])
     A = A.astype(float)
     return pd.DataFrame(A, columns=X.columns)
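
The same pattern outside a class, as a hedged sketch. It assumes the masks are lists of integer row indices, so that val_mask + test_mask is list concatenation (with boolean NumPy masks, + would act as an elementwise OR instead):

import numpy as np
import pandas as pd
from category_encoders import CatBoostEncoder

X = pd.DataFrame({'cat': list('aabbc'), 'num': [1., 2., 3., 4., 5.]})
y = pd.Series([0, 1, 0, 1, 1])
cat_features = [0]                                     # positional index of 'cat'
train_mask, val_mask, test_mask = [0, 1, 2], [3], [4]  # integer row indices

enc = CatBoostEncoder()
A = X.to_numpy(copy=True)
b = y.to_numpy(copy=True)
A[np.ix_(train_mask, cat_features)] = enc.fit_transform(A[np.ix_(train_mask, cat_features)], b[train_mask])
A[np.ix_(val_mask + test_mask, cat_features)] = enc.transform(A[np.ix_(val_mask + test_mask, cat_features)])
print(pd.DataFrame(A.astype(float), columns=X.columns))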
Example #4
 def CatBoost_Encoding(self, sigma: float = None, a: float = 1):
     """
     CatBoost is a tree-based gradient boosting model that performs particularly well on datasets with many categorical features.
     Before applying the CatBoost encoder, the training data must be randomly shuffled, because the encoding is based on a notion of "time", i.e. the order of the observations in the dataset.
     :param sigma:
     :param a:
     :return:
     """
     self.encoder = CatBoostEncoder(cols=self.cols, a=a, sigma=sigma)
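
The shuffling requirement from the docstring, as a minimal sketch:

import numpy as np
import pandas as pd
from category_encoders import CatBoostEncoder

df = pd.DataFrame({'cat': list('aabbab'), 'target': [1, 0, 1, 0, 1, 1]})
perm = np.random.permutation(len(df))  # shuffle first: the encoding depends on row order
df = df.iloc[perm].reset_index(drop=True)
enc = CatBoostEncoder(cols=['cat'], a=1)
print(enc.fit_transform(df[['cat']], df['target']))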
Example #5
 def models_to_compare(self) -> Dict[ModelName, Dict]:
     lightgbm_step_categorical_features_params = f"{ModelName.LIGHTGBM.value}__{CATEGORICAL_FEATURE}"
     return {
         ModelName.CATBOOST: {
             TaskName.CLASSIFICATION:
             Pipeline([(ModelName.CATBOOST.value,
                        CatBoostClassifier(
                            cat_features=self.categorical_features_indices,
                            verbose=0))]),
             TaskName.REGRESSION:
             Pipeline([(ModelName.CATBOOST.value,
                        CatBoostRegressor(
                            cat_features=self.categorical_features_indices,
                            verbose=0))])
         },
         ModelName.LIGHTGBM: {
             TaskName.CLASSIFICATION:
             Pipeline([(ModelName.ORDINAL_ENCODER.value, OrdinalEncoder()),
                       (ModelName.LIGHTGBM.value, LGBMClassifier())]),
             TaskName.REGRESSION:
             Pipeline([(ModelName.ORDINAL_ENCODER.value, OrdinalEncoder()),
                       (ModelName.LIGHTGBM.value, LGBMRegressor())]),
             FIT_PARAMS: {
                 lightgbm_step_categorical_features_params:
                 self.categorical_features
             }
         },
         ModelName.LIGHTGBM_WITH_CATBOOST_ENCODER: {
             TaskName.CLASSIFICATION:
             Pipeline([(ModelName.CATBOOST_ENCODER.value,
                        CatBoostEncoder()),
                       (ModelName.LIGHTGBM.value, LGBMClassifier())]),
             TaskName.REGRESSION:
             Pipeline([(ModelName.CATBOOST_ENCODER.value,
                        CatBoostEncoder()),
                       (ModelName.LIGHTGBM.value, LGBMRegressor())])
         },
         ModelName.XGBOOST_WITH_CATBOOST_ENCODER: {
             TaskName.CLASSIFICATION:
             Pipeline([(ModelName.CATBOOST_ENCODER.value,
                        CatBoostEncoder()),
                       (ModelName.XGBOOST.value, XGBClassifier())]),
             TaskName.REGRESSION:
             Pipeline([(ModelName.CATBOOST_ENCODER.value,
                        CatBoostEncoder()),
                       (ModelName.XGBOOST.value, XGBRegressor())])
         },
         ModelName.XGBOOST: {
             TaskName.CLASSIFICATION:
             Pipeline([(ModelName.ORDINAL_ENCODER.value, OrdinalEncoder()),
                       (ModelName.XGBOOST.value, XGBClassifier())]),
             TaskName.REGRESSION:
             Pipeline([(ModelName.ORDINAL_ENCODER.value, OrdinalEncoder()),
                       (ModelName.XGBOOST.value, XGBRegressor())])
         }
     }
Example #6
 def LeaveOneOut_Encoding(self, sigma: float = 0.05):
     """
     Leave-one-out encoding
     :param sigma:
     :return:
     """
     self.encoder = LeaveOneOutEncoder(cols=self.cols, sigma=sigma)
Example #7
 def Hashing_Encoding(self, n_components: int = 8):
     """
     Hashing encoding maps any number of variables to a given number of output variables according to a fixed rule. Feature hashing may cause collisions between features; the size and complexity of the hashing encoder do not grow with the number of categories in the data.
     :param n_components: number of bits used to represent the features
     :return:
     """
     self.encoder = HashingEncoder(cols=self.cols,
                                   n_components=n_components)
Example #8
 def OneHot_Encoding(self,
                     handle_missing='indicator',
                     handle_unknown='indicator'):
     """
     one-hot encoding converts a categorical feature with n_categories possible values into n_categories binary features, exactly one of which is 1 and all the others 0
     :param handle_missing: default 'value' replaces missing values with all zeros; 'indicator' adds an extra column for missing values
     :param handle_unknown: default 'value' replaces unknown values with all zeros; 'indicator' adds an extra column for unknown values
     :return:
     """
     self.encoder = OneHotEncoder(cols=self.cols,
                                  handle_missing=handle_missing,
                                  handle_unknown=handle_unknown)
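
What the 'indicator' options do, on a tiny frame:

import pandas as pd
from category_encoders import OneHotEncoder

X = pd.DataFrame({'cat': ['a', 'b', None]})
enc = OneHotEncoder(cols=['cat'], handle_missing='indicator', handle_unknown='indicator')
# one binary column per category, plus an extra indicator column for missing values
print(enc.fit_transform(X))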
Example #9
 def Helmert_Encoding(self,
                      handle_missing='indicator',
                      handle_unknown='indicator'):
     """
     Helmert encoding: each value of the categorical feature corresponds to a row of the Helmert matrix
     :param handle_missing: default 'value' replaces missing values with all zeros; 'indicator' adds an extra column for missing values
     :param handle_unknown: default 'value' replaces unknown values with all zeros; 'indicator' adds an extra column for unknown values
     :return:
     """
     self.encoder = HelmertEncoder(cols=self.cols,
                                   handle_unknown=handle_unknown,
                                   handle_missing=handle_missing)
Example #10
 def Devaition_Encoding(self,
                        handle_missing='indicator',
                        handle_unknown='indicator'):
     """
     Deviation (sum) encoding. After deviation encoding, the coefficients of a linear model reflect the difference between the mean of the dependent variable for a given level of the categorical variable and the global mean of the dependent variable
     :param handle_missing: default 'value' replaces missing values with all zeros; 'indicator' adds an extra column for missing values
     :param handle_unknown: default 'value' replaces unknown values with all zeros; 'indicator' adds an extra column for unknown values
     :return:
     """
     self.encoder = SumEncoder(cols=self.cols,
                               handle_missing=handle_missing,
                               handle_unknown=handle_unknown)
Example #11
    def fit_transform(self, X: pd.DataFrame, y) -> pd.DataFrame:
        self.cbe_ = []
        cv = check_cv(self.cv)
        cbe = CatBoostEncoder(cols=X.columns.tolist(),
                              return_df=False,
                              **self.cbe_params)

        X_transformed = np.zeros_like(X, dtype=np.float64)
        for train_idx, valid_idx in cv.split(X, y):
            # cv.split yields positional indices, so index X positionally
            self.cbe_.append(clone(cbe).fit(X.iloc[train_idx], y[train_idx]))
            X_transformed[valid_idx] = self.cbe_[-1].transform(
                X.iloc[valid_idx])
        return pd.DataFrame(X_transformed, columns=X.columns)
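
A self-contained restatement of the method above, under a hypothetical CVCatBoostEncoder wrapper, to show the out-of-fold idea end to end (cv.split yields positional indices, hence .iloc):

import numpy as np
import pandas as pd
from category_encoders import CatBoostEncoder
from sklearn.base import clone
from sklearn.model_selection import check_cv

class CVCatBoostEncoder:
    # each row is encoded by an encoder fitted on the other folds,
    # which avoids target leakage on the training data
    def __init__(self, cv=3, **cbe_params):
        self.cv = cv
        self.cbe_params = cbe_params

    def fit_transform(self, X: pd.DataFrame, y) -> pd.DataFrame:
        self.cbe_ = []
        cv = check_cv(self.cv)
        cbe = CatBoostEncoder(cols=X.columns.tolist(), return_df=False,
                              **self.cbe_params)
        X_transformed = np.zeros_like(X, dtype=np.float64)
        for train_idx, valid_idx in cv.split(X, y):
            self.cbe_.append(clone(cbe).fit(X.iloc[train_idx], y[train_idx]))
            X_transformed[valid_idx] = self.cbe_[-1].transform(X.iloc[valid_idx])
        return pd.DataFrame(X_transformed, columns=X.columns)

X = pd.DataFrame({'cat': list('aabbccab')})
y = np.array([0, 1, 0, 1, 1, 0, 1, 0])
print(CVCatBoostEncoder(cv=2).fit_transform(X, y))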
Example #12
def get_training_and_test_dfs():
    """
    Read the training and test CSV files, split them by organization, and
    return the cleaned MyVolts train/test dataframes plus an output skeleton
    :return:
    """
    missing_values = DEFAULTS['MissingValues'].split(',')
    df_train = pd.read_csv(DEFAULTS['TrainingFile'], na_values=missing_values)
    df_test = pd.read_csv(DEFAULTS['TestFile'], na_values=missing_values)
    output_df = pd.DataFrame(columns=['recommendation_set_id', 'set_clicked'])

    jabref_train, myvolts_train, homepage_train = split_data(df_train)
    jabref_test, myvolts_test, homepage_test = split_data(df_test)

    output_df['recommendation_set_id'] = myvolts_test[
        'recommendation_set_id'].copy()

    dropped_cols = DEFAULTS['MyVoltsDroppedCols'].split(
        ',') + DEFAULTS['MyVoltsIgnoredCols'].split(',')
    myvolts_train = myvolts_train.drop(dropped_cols, axis=1)
    myvolts_test = myvolts_test.drop(dropped_cols, axis=1)

    for col in DEFAULTS['MyVoltsNumberCols'].split(','):
        # mean-impute each split with its own column mean
        # (plain assignment avoids the chained-assignment fillna pitfall)
        myvolts_train[col] = myvolts_train[col].fillna(myvolts_train[col].mean())
        myvolts_test[col] = myvolts_test[col].fillna(myvolts_test[col].mean())

    myvolts_train.fillna('unknown', inplace=True)
    myvolts_test.fillna('unknown', inplace=True)

    # myvolts_train['train'] = 1
    # myvolts_test['train'] = 0

    encode_cols = DEFAULTS['MyVoltsEncodeCols'].split(',')

    cbe = CatBoostEncoder(cols=encode_cols,
                          return_df=True,
                          drop_invariant=True,
                          handle_missing='return_nan')
    cbe.fit(X=myvolts_train, y=myvolts_train['set_clicked'])
    myvolts_train = cbe.transform(myvolts_train)
    myvolts_test = cbe.transform(myvolts_test)

    # combined = pd.concat([myvolts_train, myvolts_test])
    # combined = oh_encode(combined, encode_cols)

    # label_encode_cols = DEFAULTS['MyVoltsLabelEncodeCols'].split(',')
    # combined = label_encode(combined, label_encode_cols)

    # myvolts_train = combined[combined['train'] == 1]
    # myvolts_test = combined[combined['train'] == 0]
    # myvolts_train = myvolts_train.drop(['train'], axis=1)
    # myvolts_test = myvolts_test.drop(['train'], axis=1)

    return myvolts_train, myvolts_test, output_df
Example #13
 def MEstimate_Encoding(self,
                        m: float = 1.0,
                        sigma: float = 0.05,
                        randomized: bool = False):
     """
     M-estimate encoding is a simplified version of target encoding
     :param m:
     :param sigma:
     :param randomized:
     :return:
     """
     self.encoder = MEstimateEncoder(cols=self.cols,
                                     m=m,
                                     sigma=sigma,
                                     randomized=randomized)
Example #14
 def Target_Encoding(self,
                     min_samples_leaf: int = 1,
                     smoothing: float = 1.0):
     """
     Target encoding is a categorical encoding method based not only on the feature values themselves but also on the corresponding dependent variable.
     For classification problems: the categorical feature is replaced with a blend of the posterior probability of the target given a particular category value and the prior probability of the target over all the training data.
     For continuous targets: the categorical feature is replaced with a blend of the expected value of the target given a particular category value and the expected value of the target over all the training data.
     The method depends heavily on the distribution of the dependent variable, but it greatly reduces the number of encoded features produced.
     :param min_samples_leaf:
     :param smoothing:
     :return:
     """
     self.encoder = TargetEncoder(cols=self.cols,
                                  min_samples_leaf=min_samples_leaf,
                                  smoothing=smoothing)
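
A tiny worked example of the posterior/prior blend described above:

import pandas as pd
from category_encoders import TargetEncoder

X = pd.DataFrame({'cat': ['a', 'a', 'b', 'b', 'b']})
y = pd.Series([1, 0, 1, 1, 0])
# prior = mean(y) = 0.6; posteriors: 'a' -> 0.5, 'b' -> 2/3;
# smoothing controls how strongly each category is pulled toward the prior
enc = TargetEncoder(cols=['cat'], min_samples_leaf=1, smoothing=1.0)
print(enc.fit_transform(X, y))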
Example #15
 def WOE_Encoding(self,
                  regularization: float = 1.0,
                  sigma: float = 0.05,
                  randomized: bool = False):
     """
     Weight-of-evidence (WOE) encoding
     :param regularization:
     :param sigma:
     :param randomized:
     :return:
     """
     self.encoder = WOEEncoder(cols=self.cols,
                               regularization=regularization,
                               randomized=randomized,
                               sigma=sigma)
Example #16
def CatEncoder(X, cat_cols, tags, estimator_name, objective_type, trial, n_classes, random_state):
    if tags["handles categorical"] == False:
        large_threshold = 6
        #TODO: handle numpy arrays with categorical?
        #TODO: handle multiclass / Regression
        if isinstance(X, pd.DataFrame) and isinstance(cat_cols[0], str):
            large_cardinal_cats = [col for col in X[cat_cols].columns if X[col].nunique() > large_threshold]
            small_cardinal_cats = [col for col in X[cat_cols].columns if X[col].nunique() <= large_threshold]
        elif isinstance(X, pd.DataFrame):
            large_cardinal_cats = [col for col in cat_cols if len(np.unique(X.iloc[:,col])) > large_threshold]
            small_cardinal_cats = [col for col in cat_cols if len(np.unique(X.iloc[:,col])) <= large_threshold]
        else:
            large_cardinal_cats = [col for col in cat_cols if len(np.unique(X[:,col])) > large_threshold]
            small_cardinal_cats = [col for col in cat_cols if len(np.unique(X[:,col])) <= large_threshold]

        enc_pipe = None
        cat_enc_types = ["target", "binary", "catboost"]

        if len(small_cardinal_cats) > 0:
            enc_pipe = add_to_pipe(enc_pipe, "ohe", OneHotEncoder(cols=small_cardinal_cats, drop_invariant=True))

        if len(large_cardinal_cats) > 0:
            if (objective_type == "classification" and n_classes == 1):
                cat_enc_types.append("woe")

            cat_enc_type = trial.suggest_categorical(estimator_name + " cat_enc_type", cat_enc_types)

            if cat_enc_type == "binary":
                # mapping = get_mapping(X, large_cardinal_cats)
                enc = BinaryEncoder(cols=large_cardinal_cats,
                                    # mapping=mapping
                                    )

            elif cat_enc_type == "woe":
                enc = WOEEncoder(cols=large_cardinal_cats, drop_invariant=True)

            elif cat_enc_type == "target":
                min_samples_leaf = 6  # TODO: calculate percentage or something else
                enc = TargetEncoder(min_samples_leaf=min_samples_leaf,
                                    cols=large_cardinal_cats)

            else: # catboost
                enc = CatBoostEncoder(cols=large_cardinal_cats,
                                      random_state=random_state)  # TODO: replace SEED
                # TODO: permute to the dataset beforehand

            enc_pipe = add_to_pipe(enc_pipe, cat_enc_type + "_encoder", enc)
    return enc_pipe
Example #17
 def JamesStein_Encoding(self,
                         model: str = 'independent',
                         sigma: float = 0.05,
                         randomized: bool = False):
     """
     James-Stein encoding is another target-based encoding method; it likewise tries to balance the prior probability against the observed conditional probability via a parameter B.
     Unlike target encoding and M-estimate encoding, however, the James-Stein encoder balances the two probabilities using the variance ratio rather than the sample size.
     :param model:
     :param sigma:
     :param randomized:
     :return:
     """
     self.encoder = JamesSteinEncoder(cols=self.cols,
                                      model=model,
                                      sigma=sigma,
                                      randomized=randomized)
Example #18
def CatEncoder(X, cat_cols, tags, objective_type, trial, n_classes, random_state):
    if tags["handles categorical"] == False:
        large_threshold = 6
        #TODO: handle numpy arrays with categorical?
        large_cardinal_cats = [col for col in X[cat_cols].columns if X[col].nunique() > large_threshold]
        small_cardinal_cats = [col for col in X[cat_cols].columns if X[col].nunique() <= large_threshold]

        enc_pipe = None
        cat_enc_types = ["binary", "catboost", "woe", "target"]

        if len(small_cardinal_cats) > 0:
            enc_pipe = add_to_pipe(enc_pipe, "ohe", OneHotEncoder(cols=small_cardinal_cats, drop_invariant=True))

        if len(large_cardinal_cats) > 0:
            if (objective_type == "classification" and n_classes > 2): #multiclass
                cat_enc_types = ["binary"]

            cat_enc_type = trial.suggest_categorical("cat_enc_type", cat_enc_types)

            if cat_enc_type == "binary":
                # mapping = get_mapping(X, large_cardinal_cats)
                enc = BinaryEncoder(cols=large_cardinal_cats,
                                    drop_invariant=True,
                                    # mapping=mapping
                                    )

            elif cat_enc_type == "woe":
                enc = WOEEncoder(cols=large_cardinal_cats, drop_invariant=True)

            elif cat_enc_type == "target":
                min_samples_leaf = 10  # TODO: calculate percentage or something else
                enc = TargetEncoder(min_samples_leaf=min_samples_leaf,
                                    cols=large_cardinal_cats,
                                    drop_invariant=True)

            else: # catboost
                enc = CatBoostEncoder(cols=large_cardinal_cats,
                                      drop_invariant=True,
                                      random_state=random_state)  # TODO: replace SEED
                # TODO: permute to the dataset beforehand

            enc_pipe = add_to_pipe(enc_pipe, cat_enc_type + "_encoder", enc)
    return enc_pipe
Example #19
def make_pipeline(df):
    x = df
    col_dtypes = get_types(x)

    encoder = ColumnTransformer(
        [('categorical', CatBoostEncoder(), col_dtypes['object']),
         # could use passthrough=remainder, but this way makes column ordering more obvious
         ('numeric', FunctionTransformer(), col_dtypes['int64'] + col_dtypes['float64'])
         ]
    )

    # boolean mask over the columns produced by the encoder step
    all_columns_idx = np.full(x.shape[1], True, dtype=bool)
    imputer = ColumnTransformer(
        [('knn_imputer', KNNImputer(), all_columns_idx)]
    )

    pipeline = Pipeline(steps=[
        ('encoder', encoder),
        ('imputer', imputer),
    ])

    return pipeline, col_dtypes['object'] + col_dtypes['int64'] + col_dtypes['float64']
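
A usage sketch for make_pipeline; get_types is assumed to map dtype names to column-name lists, so a minimal stand-in is included (the other imports are the ones the function above relies on):

import numpy as np
import pandas as pd
from category_encoders import CatBoostEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import KNNImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer

def get_types(df):  # minimal stand-in for the project's helper
    return {dt: df.select_dtypes(dt).columns.tolist()
            for dt in ('object', 'int64', 'float64')}

df = pd.DataFrame({'cat': ['a', 'b', 'a'],
                   'n': [1, 2, 3],
                   'f': [0.1, np.nan, 0.3]})
pipe, ordered_cols = make_pipeline(df)
encoded = pipe.fit_transform(df, df['n'])  # a target is needed for the CatBoost step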
Example #20
 def __init__(self, columns=None, **kwargs):
     self.columns = columns
     self.model = CatBoostEncoder(**kwargs)
     self.transform_cols = None
Example #21
 def Ordinal_Encoding(self):
     """
     Ordinal encoding converts a categorical variable into a single column of ordinal values, containing integers from 1 to the number of categories
     :return:
     """
     self.encoder = OrdinalEncoder(cols=self.cols)
Example #22
def reg_model(labelled_data, unlabelled_data):
    """ Parameters: training dataframe, unknown dataframe
        Returns: results dataframe (Instance, Income)

        ffills NaN in the training data,
        ffills NaN in the test data,
        cat-encodes non-numeric fields,
        scales values,
        80/20 splits the data to help verify the model,
        uses LightGBM
    """

    # print("throwing away rows to speed up model")
    # speed up testing by throwing away some data
    # clean_labelled = labelled_data.sample(frac=0.2)
    clean_labelled = labelled_data.copy()
    clean_unlabelled = unlabelled_data.copy()

    print("cleaning data...")
    # get rid of weird value
    clean_labelled.loc[:,
                       "Work Experience in Current Job [years]"] = pandas.to_numeric(
                           labelled_data[
                               "Work Experience in Current Job [years]"],
                           errors="coerce")
    clean_unlabelled.loc[:,
                         "Work Experience in Current Job [years]"] = pandas.to_numeric(
                             unlabelled_data[
                                 "Work Experience in Current Job [years]"],
                             errors="coerce")
    print("mixed type issue fixed..")

    # fix additional income field
    clean_labelled.loc[:, "Yearly Income in addition to Salary (e.g. Rental Income)"] = pandas.to_numeric(
        np.fromiter(map(
            lambda s: s.replace(" EUR", ""),
            clean_labelled[
                "Yearly Income in addition to Salary (e.g. Rental Income)"],
        ),
                    dtype=float),
        errors="coerce")
    clean_unlabelled.loc[:, "Yearly Income in addition to Salary (e.g. Rental Income)"] = pandas.to_numeric(
        np.fromiter(map(
            lambda s: s.replace(" EUR", ""),
            clean_unlabelled[
                "Yearly Income in addition to Salary (e.g. Rental Income)"],
        ),
                    dtype=float),
        errors="coerce")

    # dropping useless columns
    drop_columns(clean_unlabelled)
    drop_columns(clean_labelled)

    # removing NaN values
    clean_labelled = clean_labelled.ffill()
    clean_unlabelled = clean_unlabelled[all_columns].ffill()

    # input data for final predictions
    unknown_data = clean_unlabelled.drop(["Instance"], axis=1)

    print("splitting data into train and test...")
    # 80/20 split, and separating targets
    split = split_data(clean_labelled)
    train_data, train_target, test_data, test_target = split

    print("encoding categorical data...")
    # categorical encoding
    cat = CatBoostEncoder()
    train_data = cat.fit_transform(train_data, train_target)
    test_data = cat.transform(test_data)
    unknown_data = cat.transform(unknown_data)

    # separate additional income
    train_add_income = train_data[
        "Yearly Income in addition to Salary (e.g. Rental Income)"].values
    test_add_income = test_data[
        "Yearly Income in addition to Salary (e.g. Rental Income)"].values
    unknown_add_income = unknown_data[
        "Yearly Income in addition to Salary (e.g. Rental Income)"].values

    train_data = train_data[no_income_columns]
    test_data = test_data[no_income_columns]
    unknown_data = unknown_data[no_income_columns]

    train_target = train_target[
        "Total Yearly Income [EUR]"].values - train_add_income
    test_target = test_target["Total Yearly Income [EUR]"].values

    print("scaling values...")
    # scaling values
    scaler = StandardScaler()
    train_data = scaler.fit_transform(train_data)
    test_data = scaler.transform(test_data)
    unknown_data = scaler.transform(unknown_data)

    print("fitting model...")
    # fit model
    reg = LGBMRegressor()
    # reg = TransformedTargetRegressor(
    #     regressor=mod,
    #     transformer=scaler
    # )
    reg.fit(train_data, train_target)

    print("predicting test data...")
    test_result = reg.predict(test_data, num_iteration=15000)  # LightGBM's kwarg is 'num_iteration'
    # add additional income
    test_result = test_result + test_add_income

    print("analysing test results...")
    # validate test
    error = mean_absolute_error(test_target, test_result)
    score = explained_variance_score(test_target, test_result)
    print("Mean absolute error of test data: ", error)
    print("Score: ", score)

    print("predicting unknown data...")
    # predict and format
    values = reg.predict(unknown_data)
    values = values + unknown_add_income

    results = pandas.DataFrame({
        "Instance": clean_unlabelled["Instance"].values,
        "Total Yearly Income [EUR]": values
    })
    print("Finished.")
    return results
Example #23
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

if __name__ == '__main__':
    df = pd.read_csv('dataset/data.csv')
    y = df['target']
    df.drop('target', axis=1, inplace=True)
    perm = np.random.permutation(len(df))
    train = df.iloc[perm].reset_index(drop=True)
    y = y.iloc[perm].reset_index(drop=True)

    # drop columns
    train.drop(['track_name', 'track_id'], axis=1, inplace=True)

    # Categorical Encoding
    cbe = CatBoostEncoder(cols=['artist', 'album'])
    cbe = cbe.fit(train, y)
    pickle.dump(cbe, open('saved_models/catboostencoder.pkl', 'wb'))
    train = cbe.transform(train)

    # Feature Scaling
    scaler = MinMaxScaler().fit(train)
    train = scaler.transform(train)

    # Model
    rf_model = pickle.load(
        open('saved_models/random_forest_grid_model.pkl', 'rb'))
    new_model = RandomForestClassifier(
        criterion=rf_model.best_params_['criterion'],
        min_impurity_decrease=rf_model.best_params_['min_impurity_decrease'],
        min_samples_leaf=rf_model.best_params_['min_samples_leaf'],
Example #24
class FeatureEncoding(TransformerMixin):
    def __init__(self, cols: List = None):
        """
        Constructor
        :param cols: list of columns to encode
        """
        self.cols = cols
        self.encoder = None

    def Ordinal_Encoding(self):
        """
        Ordinal encoding converts a categorical variable into a single column of ordinal values, containing integers from 1 to the number of categories
        :return:
        """
        self.encoder = OrdinalEncoder(cols=self.cols)

    def OneHot_Encoding(self,
                        handle_missing='indicator',
                        handle_unknown='indicator'):
        """
        one-hot encoding converts a categorical feature with n_categories possible values into n_categories binary features, exactly one of which is 1 and all the others 0
        :param handle_missing: default 'value' replaces missing values with all zeros; 'indicator' adds an extra column for missing values
        :param handle_unknown: default 'value' replaces unknown values with all zeros; 'indicator' adds an extra column for unknown values
        :return:
        """
        self.encoder = OneHotEncoder(cols=self.cols,
                                     handle_missing=handle_missing,
                                     handle_unknown=handle_unknown)

    def Hashing_Encoding(self, n_components: int = 8):
        """
        Hashing encoding maps any number of variables to a given number of output variables according to a fixed rule. Feature hashing may cause collisions between features; the size and complexity of the hashing encoder do not grow with the number of categories in the data.
        :param n_components: number of bits used to represent the features
        :return:
        """
        self.encoder = HashingEncoder(cols=self.cols,
                                      n_components=n_components)

    def Helmert_Encoding(self,
                         handle_missing='indicator',
                         handle_unknown='indicator'):
        """
        Helmert encoding: each value of the categorical feature corresponds to a row of the Helmert matrix
        :param handle_missing: default 'value' replaces missing values with all zeros; 'indicator' adds an extra column for missing values
        :param handle_unknown: default 'value' replaces unknown values with all zeros; 'indicator' adds an extra column for unknown values
        :return:
        """
        self.encoder = HelmertEncoder(cols=self.cols,
                                      handle_unknown=handle_unknown,
                                      handle_missing=handle_missing)

    def Devaition_Encoding(self,
                           handle_missing='indicator',
                           handle_unknown='indicator'):
        """
        Deviation (sum) encoding. After deviation encoding, the coefficients of a linear model reflect the difference between the mean of the dependent variable for a given level of the categorical variable and the global mean of the dependent variable
        :param handle_missing: default 'value' replaces missing values with all zeros; 'indicator' adds an extra column for missing values
        :param handle_unknown: default 'value' replaces unknown values with all zeros; 'indicator' adds an extra column for unknown values
        :return:
        """
        self.encoder = SumEncoder(cols=self.cols,
                                  handle_missing=handle_missing,
                                  handle_unknown=handle_unknown)

    def Target_Encoding(self,
                        min_samples_leaf: int = 1,
                        smoothing: float = 1.0):
        """
        Target encoding is a categorical encoding method based not only on the feature values themselves but also on the corresponding dependent variable.
        For classification problems: the categorical feature is replaced with a blend of the posterior probability of the target given a particular category value and the prior probability of the target over all the training data.
        For continuous targets: the categorical feature is replaced with a blend of the expected value of the target given a particular category value and the expected value of the target over all the training data.
        The method depends heavily on the distribution of the dependent variable, but it greatly reduces the number of encoded features produced.
        :param min_samples_leaf:
        :param smoothing:
        :return:
        """
        self.encoder = TargetEncoder(cols=self.cols,
                                     min_samples_leaf=min_samples_leaf,
                                     smoothing=smoothing)

    def MEstimate_Encoding(self,
                           m: float = 1.0,
                           sigma: float = 0.05,
                           randomized: bool = False):
        """
        M-estimate encoding is a simplified version of target encoding
        :param m:
        :param sigma:
        :param randomized:
        :return:
        """
        self.encoder = MEstimateEncoder(cols=self.cols,
                                        m=m,
                                        sigma=sigma,
                                        randomized=randomized)

    def JamesStein_Encoding(self,
                            model: str = 'independent',
                            sigma: float = 0.05,
                            randomized: bool = False):
        """
        James-Stein encoding is another target-based encoding method; it likewise tries to balance the prior probability against the observed conditional probability via a parameter B.
        Unlike target encoding and M-estimate encoding, however, the James-Stein encoder balances the two probabilities using the variance ratio rather than the sample size.
        :param model:
        :param sigma:
        :param randomized:
        :return:
        """
        self.encoder = JamesSteinEncoder(cols=self.cols,
                                         model=model,
                                         sigma=sigma,
                                         randomized=randomized)

    def WOE_Encoding(self,
                     regularization: float = 1.0,
                     sigma: float = 0.05,
                     randomized: bool = False):
        """
        Weight-of-evidence (WOE) encoding
        :param regularization:
        :param sigma:
        :param randomized:
        :return:
        """
        self.encoder = WOEEncoder(cols=self.cols,
                                  regularization=regularization,
                                  randomized=randomized,
                                  sigma=sigma)

    def LeaveOneOut_Encoding(self, sigma: float = 0.05):
        """
        Leave-one-out encoding
        :param sigma:
        :return:
        """
        self.encoder = LeaveOneOutEncoder(cols=self.cols, sigma=sigma)

    def CatBoost_Encoding(self, sigma: float = None, a: float = 1):
        """
        CatBoost is a tree-based gradient boosting model that performs particularly well on datasets with many categorical features.
        Before applying the CatBoost encoder, the training data must be randomly shuffled, because the encoding is based on a notion of "time", i.e. the order of the observations in the dataset.
        :param sigma:
        :param a:
        :return:
        """
        self.encoder = CatBoostEncoder(cols=self.cols, a=a, sigma=sigma)

    def fit(self, X: DataFrame, y: Series = None):
        """
        Fit the configured encoder
        :param X:
        :param y:
        :return:
        """
        if y is None:
            self.encoder.fit(X)
        else:
            self.encoder.fit(X, y)

    def transform(self, X: DataFrame):
        """
        Transform X with the fitted encoder
        :param X:
        :return:
        """
        res = self.encoder.transform(X)

        return res
Example #25
 def encoders(self):
     ohe = OneHotEncoder()
     cbe = CatBoostEncoder()
     return ohe, cbe
Example #26
    def _ml_data_prep(self):
        """Prepares datasets for ML

        This does one hot encoding, cat boost encoding, and train test
        split (if necessary).
        """

        df_post = copy.deepcopy(self.df_post)
        train_prior = copy.deepcopy(self.df_prior)

        # create test data if not provided
        if self.test_data is None:

            logger.info(
                "No test data was provided. Test data will be created with "
                "a {}-{} shuffle split from the post data set.".format(
                    self.train_size * 100, (1 - self.train_size) * 100)
            )

            df_post = shuffle(df_post)
            n_split = int(len(df_post)*self.train_size)

            train_post = df_post.iloc[:n_split]
            test = df_post.iloc[n_split:]

        else:
            test = copy.deepcopy(self.test_data)
            train_post = df_post

        # determine columns for OHE & CatBoost
        OHE_columns = [col for col in self.OHE_columns if
                       col != self.target_column]
        high_cardinality_columns = [col for col in self.high_cardinality_columns
                                    if col != self.target_column]

        if len(OHE_columns) > 0:
            logger.info("One hot encoded columns: ", OHE_columns)
        if len(high_cardinality_columns) > 0:
            logger.info("Cat boost encoded columns: ", high_cardinality_columns)

        # concat and then OHE to ensure columns match
        train_prior['source'] = "Train Prior"
        test['source'] = "Test"
        train_post['source'] = "Train Post"

        df = pd.concat([train_prior, test, train_post])
        df = pd.get_dummies(data=df, columns=OHE_columns)

        train_prior = df[df.source == 'Train Prior'].drop('source', axis=1)
        test = df[df.source == 'Test'].drop('source', axis=1)
        train_post = df[df.source == 'Train Post'].drop('source', axis=1)

        # CatBoostEncoder for high cardinality columns
        test_prior = copy.deepcopy(test)
        test_post = copy.deepcopy(test)

        tf_prior = CatBoostEncoder(cols=high_cardinality_columns,
                                   random_state=self.random_state)
        tf_post = CatBoostEncoder(cols=high_cardinality_columns,
                                  random_state=self.random_state)

        train_prior[high_cardinality_columns] = (
            tf_prior.fit_transform(train_prior[high_cardinality_columns],
                                   train_prior[self.target_column])
        )
        test_prior[high_cardinality_columns] = (
            tf_prior.transform(test_prior[high_cardinality_columns],
                               test_prior[self.target_column])
        )
        train_post[high_cardinality_columns] = (
            tf_post.fit_transform(train_post[high_cardinality_columns],
                                  train_post[self.target_column])
        )
        test_post[high_cardinality_columns] = (
            tf_post.transform(test_post[high_cardinality_columns],
                              test_post[self.target_column])
        )

        X_train_prior = train_prior.drop(self.target_column, axis=1).astype(float)
        y_train_prior = train_prior[self.target_column].astype(float)
        X_test_prior = test_prior.drop(self.target_column, axis=1).astype(float)
        y_test = test[self.target_column].astype(float)

        X_train_post = train_post.drop(self.target_column, axis=1).astype(float)
        y_train_post = train_post[self.target_column].astype(float)
        X_test_post = test_post.drop(self.target_column, axis=1).astype(float)

        self.X_train_prior = X_train_prior
        self.y_train_prior = y_train_prior
        self.X_test_prior = X_test_prior
        self.y_test = y_test
        self.X_train_post = X_train_post
        self.y_train_post = y_train_post
        self.X_test_post = X_test_post