Ejemplo n.º 1
0
    def gen_groupby_cat_encode_features(self, data, cat_columns, num_column,
                                        cat_encoder_name='JamesSteinEncoder'):
        """
        Encode categorical columns using a numeric column as the target.

        When ``cat_encoder_name`` is a string, a new ``JamesSteinEncoder`` is
        fitted on ``data[cat_columns]`` with ``data[num_column]`` as target.
        Otherwise ``cat_encoder_name`` is treated as an encoder object: it is
        deep-copied and used for ``transform`` without being fitted here, so
        it is presumably expected to be fitted already (confirm with callers).

        Args:
            data (pd.DataFrame): dataset
            cat_columns (list): names of the categorical columns to encode
            num_column (str): name of the numeric column used as the target
            cat_encoder_name (str or encoder object): a name that must appear
                in ``self.cat_encoder_names_list``, or an encoder instance.

        Returns:
            tuple: ``(encoded, encoder)`` where ``encoded`` is the transformed
            frame with every column prefixed by
            ``'GroupEncoded_<num_column>_'`` and ``encoder`` is the encoder
            that produced it. If ``cat_encoder_name`` is a string that is not
            supported, a message is printed and ``('', '')`` is returned.

        """
        if isinstance(cat_encoder_name, str):
            if cat_encoder_name not in self.cat_encoder_names_list:
                print(f"{cat_encoder_name} is not supported!")
                # Sentinel kept for backward compatibility with callers.
                return ('', '')
            # NOTE(review): every supported name currently builds a
            # JamesSteinEncoder; the name only gates whether encoding happens.
            # The encoder is configured with the columns it is actually
            # fitted on (cat_columns), not the instance-wide feature list,
            # so it never refers to columns absent from data[cat_columns].
            encoder = JamesSteinEncoder(cols=cat_columns, model='beta',
                                        return_df=True, drop_invariant=True)
            encoder.fit(X=data[cat_columns], y=data[num_column].values)
        else:
            # Work on a copy so the caller's encoder object is left untouched.
            encoder = copy.deepcopy(cat_encoder_name)

        encoded = encoder.transform(X=data[cat_columns],
                                    y=data[num_column].values)
        encoded = encoded.add_prefix('GroupEncoded_' + num_column + '_')

        return (encoded, encoder)
Ejemplo n.º 2
0
 def JamesStein_Encoding(self,
                         model: str = 'independent',
                         sigma: float = 0.05,
                         randomized: bool = False):
     """
     Configure a James-Stein encoder and store it on ``self.encoder``.

     James-Stein encoding is a target-based encoding method that uses a
     parameter B to balance the prior probability against the observed
     conditional probability. Unlike target encoding and M-estimate
     encoding, it balances the two probabilities by a variance ratio
     rather than by sample size.

     :param model: forwarded as ``model`` to ``JamesSteinEncoder``.
     :param sigma: forwarded as ``sigma`` to ``JamesSteinEncoder``.
     :param randomized: forwarded as ``randomized`` to ``JamesSteinEncoder``.
     :return: None; the new encoder is stored on ``self.encoder``.
     """
     # The encoder operates on the columns held in self.cols.
     encoder_options = {
         'cols': self.cols,
         'model': model,
         'sigma': sigma,
         'randomized': randomized,
     }
     self.encoder = JamesSteinEncoder(**encoder_options)
Ejemplo n.º 3
0
    TargetClassifierEncoderCV,
    TargetRegressorEncoder,
    TargetRegressorEncoderCV,
)
from category_encoders import JamesSteinEncoder
from sklearn.preprocessing import LabelEncoder

# Dataset directory, given relative to the current working directory.
DATA_PATH = Path("data")


@pytest.mark.parametrize(
    "encoder",
    [
        TargetClassifierEncoder(),
        TargetClassifierEncoderCV(),
        JamesSteinEncoder(),
    ],
)
def test_adult(encoder):
    """Smoke test for adult dataset."""

    adult_path = DATA_PATH / "adult.csv"
    adult_df = pd.read_csv(adult_path)

    X = adult_df.drop("class", axis=1)
    y = LabelEncoder().fit_transform(adult_df["class"])

    prep = ColumnTransformer([
        (
            "cat",
            encoder,
Ejemplo n.º 4
0
}

# Encoders under benchmark, keyed by the name that run_single_benchmark
# receives as ``encoder_str``.
ENCODERS = {
    # The literal string is stored as the value; presumably it is consumed
    # as a "drop the column" directive downstream -- confirm at the use site.
    "drop": "drop",
    # Fill missing values with a constant marker, then ordinal-encode;
    # unknown categories are encoded as -1.
    "SKOrdinalEncoder": make_pipeline(
        SimpleImputer(strategy="constant", fill_value="sk_missing"),
        OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1),
    ),
    "SKTargetEncoder": TargetRegressorEncoder(),
    "SKTargetEncoderCV": TargetRegressorEncoderCV(),
    "JamesSteinEncoder": JamesSteinEncoder(),
    # Same James-Stein encoder, wrapped in NestedEncoderCV.
    "JamesSteinEncoderCV": NestedEncoderCV(JamesSteinEncoder()),
}


def run_single_benchmark(data_str, encoder_str, cv, n_jobs, write_result,
                         force):
    print(f"running benchmark for {data_str} and {encoder_str}")
    data_info = DATA_INFOS[data_str]
    encoder = ENCODERS[encoder_str]

    results_path = get_results_path(RESULTS_DIR, data_info, encoder_str)
    if results_path.exists() and not force:
        print(
            f"benchmark for {data_str} and {encoder_str} exists pass --force to rerun"