def helmert_encoding(df, col):
    """Return *df* with Helmert-contrast columns for *col* appended."""
    import pandas as pd
    import category_encoders as ce
    # drop_invariant=True removes output columns that are constant.
    enc = ce.HelmertEncoder(cols=col, drop_invariant=True)
    encoded = enc.fit_transform(df[col])
    return pd.concat([df, encoded], axis=1)
Example #2
0
def helmert():
    """Smoke-test Helmert encoding on the mushroom dataset and print schemas."""
    X, _, _ = get_mushroom_data()
    print(X.info())
    encoder = ce.HelmertEncoder()
    encoder.fit(X, None)
    transformed = encoder.transform(X)
    print(transformed.info())
    # Free the (potentially large) frames explicitly.
    del encoder, _, X, transformed
Example #3
0
def apply_helmert_encoding(df, categorical_columns):
    """Helmert-encode *categorical_columns* of *df*.

    :param df: input pandas DataFrame (validated below)
    :param categorical_columns: column names to encode
    :returns: transformed DataFrame with the 'intercept' column removed
    :raises DataFrameTypeError: if *df* is not a DataFrame
    """
    if not isinstance(df, pd.DataFrame):
        raise DataFrameTypeError('df', df)
    import category_encoders as ce
    # Bug fix: fit on the DataFrame itself — fitting on df.values discards
    # the column names, so cols=categorical_columns could not match them.
    encoder = ce.HelmertEncoder(cols=categorical_columns).fit(df)
    X_transformed = encoder.transform(df)
    # Bug fix: the original dropped 'intercept' from an undefined name `X`
    # (NameError); drop it from the transformed frame that is returned.
    X_transformed.drop(['intercept'], inplace=True, axis=1)
    return X_transformed
 def create_features(self, df_train, df_test):
     """Fit a Helmert encoder on the training columns (supervised by the
     target) and append the encoded features to self.train / self.test."""
     enc = ce.HelmertEncoder(cols=self.columns)
     target = df_train[self.target_column].values.tolist()
     enc.fit(df_train[self.columns], target)
     train_encoded = enc.transform(df_train[self.columns])
     test_encoded = enc.transform(df_test[self.columns])
     for name in train_encoded.columns:
         feature = name + '_HelmertEncoder'
         self.train[feature] = train_encoded[name]
         self.test[feature] = test_encoded[name]
    def test_HandleMissingIndicator_NanInTrain_ExpectAsColumn(self):
        """A NaN seen in training with handle_missing='indicator' is encoded
        as its own category level."""
        train = ['A', 'B', np.nan]

        enc = encoders.HelmertEncoder(handle_missing='indicator',
                                      handle_unknown='value')
        encoded = enc.fit_transform(train)

        expected = [[1, -1, -1], [1, 1, -1], [1, 0, 2]]
        self.assertTrue(np.array_equal(encoded.values.tolist(), expected))
Example #6
0
    def test_helmert_preserve_dimension_4(self):
        """Unknown ('D') and missing (None) test values map to the baseline
        row, so the transformed output keeps the fitted width."""
        train = ['A', 'B', 'C']
        test = ['D', 'B', 'C', None]

        enc = encoders.HelmertEncoder(handle_unknown='value',
                                      handle_missing='value')
        enc.fit(train)
        transformed = enc.transform(test)

        expected = [[1, 0, 0], [1, 1, -1], [1, 0, 2], [1, 0, 0]]
        self.assertEqual(transformed.values.tolist(), expected)
Example #7
0
    def test_HandleUnknown_HaveNoUnknownInTrain_ExpectIndicatorInTest(self):
        """handle_unknown='indicator' reserves a contrast level at fit time,
        so a category unseen in training still gets a distinct encoding."""
        train = ['A', 'B']
        test = ['A', 'B', 'C']

        enc = encoders.HelmertEncoder(handle_unknown='indicator',
                                      handle_missing='value')
        enc.fit(train)
        transformed = enc.transform(test)

        expected = [[1, -1, -1], [1, 1, -1], [1, 0, 2]]
        self.assertEqual(transformed.values.tolist(), expected)
Example #8
0
def main(params, inputs, outputs):
    """Read a pickled DataFrame, Helmert-encode the configured columns, and
    pickle the encoded result to the output path."""
    # params.columns is a comma-separated list of column names.
    columns = [name for name in params.columns.split(",")]

    frame = pd.read_pickle(inputs.data)

    encoder = ce.HelmertEncoder(cols=columns)
    encoded = encoder.fit_transform(frame)

    encoded.to_pickle(outputs.data_new)
    def __init__(self, encoder_type, columns_name=None):
        """
        :param encoder_type: name of the category_encoders scheme to use
        :param columns_name: list of feature-column names to encode
        """
        # Name -> encoder class; replaces a long if/elif chain.
        encoder_classes = {
            "BackwardDe": ce.BackwardDifferenceEncoder,  # backward-difference coding
            "BaseN": ce.BaseNEncoder,
            "Binary": ce.BinaryEncoder,  # binary coding
            "Catboost": ce.CatBoostEncoder,
            "Hash": ce.HashingEncoder,
            "Helmert": ce.HelmertEncoder,
            "JamesStein": ce.JamesSteinEncoder,
            "LOO": ce.LeaveOneOutEncoder,  # leave-one-out coding
            "ME": ce.MEstimateEncoder,  # M-estimate encoder
            "OneHot": ce.OneHotEncoder,
            # key spelling kept as-is for caller compatibility
            "OridinalEncoder": ce.OrdinalEncoder,  # ordinal coding
            "Sum": ce.SumEncoder,  # sum coding
            "Polynomial": ce.PolynomialEncoder,  # polynomial coding
            "Target": ce.TargetEncoder,  # target coding
            "WOE": ce.WOEEncoder,  # weight-of-evidence encoder
        }

        if encoder_type not in encoder_classes:
            raise ValueError("请选择正确的编码方式")

        self.encoder = encoder_classes[encoder_type](cols=columns_name)
Example #10
0
def get_encoder_dict():
    """Return a mapping of encoder name -> freshly constructed encoder."""
    return {
        'OneHotEncoder': ce.OneHotEncoder(),
        'BinaryEncoder': ce.BinaryEncoder(),
        'HashingEncoder': ce.HashingEncoder(),
        'LabelEncoder': le.MultiColumnLabelEncoder(),
        'FrequencyEncoder': fe.FrequencyEncoder(),
        'TargetEncoder': ce.TargetEncoder(),
        'HelmertEncoder': ce.HelmertEncoder(),
        'JamesSteinEncoder': ce.JamesSteinEncoder(),
        'BaseNEncoder': ce.BaseNEncoder(),
        'SumEncoder': ce.SumEncoder(),
    }
Example #11
0
def encode(data):
    """Helmert-encode *data* and return it as a list of row lists.

    HelmertEncoder was kept as the most promising of the contrast-coding
    alternatives tried (Binary, Ordinal, Polynomial, OneHot, Sum, ...).

    :param data: tabular input accepted by category_encoders (e.g. DataFrame)
    :returns: list of encoded rows (``values.tolist()`` of the result)
    """
    # Bug fix: `verbose` is a constructor parameter of category_encoders
    # estimators; the original passed it to fit(), which does not accept it.
    encoder = ce.HelmertEncoder(verbose=1)
    encoder.fit(data)
    encoded = encoder.transform(data)
    return encoded.values.tolist()
Example #12
0
    def test_helmert(self):
        """Helmert encoding of the fixture dataset yields only numeric dtypes.

        :return:
        """

        cols = ['C1', 'D', 'E', 'F']
        encoder = encoders.HelmertEncoder(verbose=1, cols=cols)
        data = self.create_dataset(n_rows=1000)

        transformed = encoder.fit_transform(data, None)

        for dtype in transformed.dtypes:
            is_numeric = dtype == int or dtype == float
            self.assertTrue(is_numeric)
Example #13
0
 def fit(self, X, y=None):
     """Build the encoder selected by ``self.type`` and fit it on X (and y).

     :param X: training features
     :param y: optional target, forwarded to the underlying encoder
     :returns: self, so the call can be chained sklearn-style
     :raises ValueError: if ``self.type`` names no known encoder
     """
     # NOTE(review): 'impute' / 'ignore' are legacy handle_unknown values in
     # category_encoders; kept verbatim for the pinned library version.
     factories = {
         'backdiff': lambda: ce.BackwardDifferenceEncoder(handle_unknown='ignore'),
         'binenc': lambda: ce.BinaryEncoder(handle_unknown='impute'),
         'hashenc': lambda: ce.HashingEncoder(),
         'helmenc': lambda: ce.HelmertEncoder(handle_unknown='impute'),
         'onehot': lambda: ce.OneHotEncoder(handle_unknown='ignore'),
         'ordenc': lambda: ce.OrdinalEncoder(handle_unknown='impute'),
         'sumenc': lambda: ce.SumEncoder(handle_unknown='ignore'),
         'polyenc': lambda: ce.PolynomialEncoder(handle_unknown='impute'),
     }
     if self.type not in factories:
         # Bug fix: an unrecognized type previously left self.encoder unset
         # and crashed below with AttributeError; fail fast with a clear error.
         raise ValueError("unknown encoder type: %r" % self.type)
     self.encoder = factories[self.type]()
     self.encoder.fit(X, y)
     return self
    def test_helmert_2StringCols_ExpectCorrectOrder(self):
        """String columns expand in place, so the transformed column order is
        intercept first, then each input column in its original position."""
        frame = pd.DataFrame(
            {
                'col1': [1, 2, 3, 4],
                'col2': ['A', 'B', 'C', 'D'],
                'col3': [1, 2, 3, 4],
                'col4': ['A', 'B', 'C', 'A']
            },
            columns=['col1', 'col2', 'col3', 'col4'])
        expected_columns = [
            'intercept', 'col1', 'col2_0', 'col2_1', 'col2_2', 'col3',
            'col4_0', 'col4_1'
        ]
        enc = encoders.HelmertEncoder(handle_unknown='value',
                                      handle_missing='value')

        enc.fit(frame)
        actual_columns = enc.transform(frame).columns.values

        self.assertTrue(np.array_equal(expected_columns, actual_columns))
Example #15
0
 def __init__(
     self,
     encoder_name,
     reduction_method=None,
     ngram_range=(2, 4),
     categories="auto",
     dtype=np.float64,
     handle_unknown="ignore",
     clf_type=None,
     n_components=None,
 ):
     """Store configuration and build the name -> encoder lookup table.

     :param encoder_name: key into ``self.encoders_dict`` naming the encoder
     :param reduction_method: optional dimensionality-reduction method name
     :param ngram_range: n-gram span used by the character/word vectorizers
     :param categories: category handling mode, stored as-is ("auto" default)
     :param dtype: stored output dtype for encoded features
     :param handle_unknown: stored policy for unseen categories
     :param clf_type: task type forwarded to the target-based encoders
     :param n_components: output dimensionality for factorization/embedding
         methods

     NOTE(review): every encoder in the table below is constructed eagerly
     here, even though only the one named by ``encoder_name`` is used.
     """
     self.ngram_range = ngram_range
     self.encoder_name = encoder_name
     self.categories = categories
     self.dtype = dtype
     self.clf_type = clf_type
     self.handle_unknown = handle_unknown
     self.reduction_method = reduction_method
     self.n_components = n_components
     # Name -> ready-made estimator/pipeline. The None key maps to a
     # pass-through FunctionTransformer.
     self.encoders_dict = {
         "OneHotEncoder":
         OneHotEncoder(handle_unknown="ignore"),
         "OneHotEncoder-1":
         OneHotEncoderRemoveOne(handle_unknown="ignore"),
         "Categorical":
         None,
         "OneHotEncoderDense":
         OneHotEncoder(handle_unknown="ignore", sparse=False),
         "OneHotEncoderDense-1":
         OneHotEncoderRemoveOne(handle_unknown="ignore", sparse=False),
         "SimilarityEncoder":
         SimilarityEncoder(ngram_range=self.ngram_range, random_state=10),
         "NgramNaiveFisherKernel":
         NgramNaiveFisherKernel(ngram_range=self.ngram_range,
                                random_state=10),
         "ngrams_hot_vectorizer": [],
         "NgramsCountVectorizer":
         CountVectorizer(analyzer="char", ngram_range=self.ngram_range),
         "NgramsTfIdfVectorizer":
         TfidfVectorizer(analyzer="char",
                         ngram_range=self.ngram_range,
                         smooth_idf=False),
         "WordNgramsTfIdfVectorizer":
         TfidfVectorizer(analyzer="word",
                         ngram_range=(1, 1),
                         smooth_idf=False),
         "TargetEncoder":
         TargetEncoder(clf_type=self.clf_type, handle_unknown="ignore"),
         "MDVEncoder":
         MDVEncoder(self.clf_type),
         "BackwardDifferenceEncoder":
         cat_enc.BackwardDifferenceEncoder(),
         "BinaryEncoder":
         cat_enc.BinaryEncoder(),
         "HashingEncoder":
         cat_enc.HashingEncoder(),
         "HelmertEncoder":
         cat_enc.HelmertEncoder(),
         "SumEncoder":
         cat_enc.SumEncoder(),
         "PolynomialEncoder":
         cat_enc.PolynomialEncoder(),
         "BaseNEncoder":
         cat_enc.BaseNEncoder(),
         "LeaveOneOutEncoder":
         cat_enc.LeaveOneOutEncoder(),
         "NgramsLDA":
         Pipeline([
             (
                 "ngrams_count",
                 CountVectorizer(analyzer="char",
                                 ngram_range=self.ngram_range),
             ),
             (
                 "LDA",
                 LatentDirichletAllocation(n_components=self.n_components,
                                           learning_method="batch"),
             ),
         ]),
         "NMF":
         Pipeline([
             (
                 "ngrams_count",
                 CountVectorizer(analyzer="char",
                                 ngram_range=self.ngram_range),
             ),
             ("NMF", NMF(n_components=self.n_components)),
         ]),
         "WordNMF":
         Pipeline([
             ("ngrams_count",
              CountVectorizer(analyzer="word", ngram_range=(1, 1))),
             ("NMF", NMF(n_components=self.n_components)),
         ]),
         "NgramsMultinomialMixture":
         NgramsMultinomialMixture(n_topics=self.n_components, max_iters=10),
         "AdHocNgramsMultinomialMixture":
         AdHocNgramsMultinomialMixture(n_iters=0),
         "AdHocIndependentPDF":
         AdHocIndependentPDF(),
         "OnlineGammaPoissonFactorization":
         gamma_poisson_factorization.OnlineGammaPoissonFactorization(
             n_topics=self.n_components,
             rho=0.99,
             r=None,
             tol=1e-4,
             random_state=18,
             init="k-means++",
             ngram_range=self.ngram_range,
             rescale_W=True,
             max_iter_e_step=10,
         ),
         "OnlineGammaPoissonFactorization2":
         gamma_poisson_factorization.OnlineGammaPoissonFactorization(
             n_topics=self.n_components,
             r=0.3,
             rho=None,
             batch_size=256,
             tol=1e-4,
             random_state=18,
             init="k-means++",
             ngram_range=self.ngram_range,
             rescale_W=True,
             max_iter_e_step=20,
         ),
         "OnlineGammaPoissonFactorization3":
         gamma_poisson_factorization.OnlineGammaPoissonFactorization(
             n_topics=self.n_components,
             r=0.3,
             rho=None,
             batch_size=256,
             tol=1e-4,
             random_state=18,
             init="k-means",
             ngram_range=self.ngram_range,
             rescale_W=True,
             max_iter_e_step=20,
         ),
         "OnlineGammaPoissonFactorization4":
         gamma_poisson_factorization.OnlineGammaPoissonFactorization(
             n_topics=self.n_components,
             r=None,
             rho=0.95,
             batch_size=256,
             tol=1e-4,
             random_state=18,
             init="k-means",
             ngram_range=self.ngram_range,
             rescale_W=True,
             max_iter_e_step=20,
         ),
         "WordOnlineGammaPoissonFactorization":
         gamma_poisson_factorization.OnlineGammaPoissonFactorization(
             n_topics=self.n_components,
             r=0.3,
             tol=1e-4,
             random_state=18,
             init="k-means++",
             ngram_range=(1, 1),
             # NOTE(review): "analizer" looks like a typo for "analyzer" —
             # verify against OnlineGammaPoissonFactorization's signature.
             analizer="word",
             rescale_W=True,
             max_iter_e_step=10,
         ),
         "OnlineGammaPoissonFactorization_fast":
         gamma_poisson_factorization.OnlineGammaPoissonFactorization(
             n_topics=self.n_components,
             r=0.3,
             ngram_range=(3, 3),
             max_iter=1,
             min_iter=1,
             tol=1e-4,
             random_state=18,
             init="k-means++",
             rescale_W=False,
         ),
         "MinHashEncoder":
         MinHashEncoder(n_components=self.n_components),
         "PretrainedFastText":
         PretrainedFastText(n_components=self.n_components),
         "PretrainedFastText_fr":
         PretrainedFastText(n_components=self.n_components,
                            language="french"),
         "PretrainedFastText_hu":
         PretrainedFastText(n_components=self.n_components,
                            language="hungarian"),
         None:
         FunctionTransformer(None, validate=True),
         "Passthrough":
         PasstroughEncoder(),
     }
     # Encoder names in this list take a 1-D array of raw strings rather
     # than a 2-D table (per how the names are used here; semantics of the
     # listed methods live elsewhere in the project).
     self.list_1D_array_methods = [
         "NgramsCountVectorizer",
         "NgramsTfIdfVectorizer",
         "WordNgramsTfIdfVectorizer",
         "ngrams_hot_vectorizer",
         "NgramsLDA",
         "NMF",
         "WordNMF",
         "NgramsMultinomialMixture",
         "NgramsMultinomialMixtureKMeans2",
         "AdHocNgramsMultinomialMixture",
         "AdHocIndependentPDF",
         "GammaPoissonFactorization",
         "OnlineGammaPoissonFactorization",
         "WordOnlineGammaPoissonFactorization",
         "OnlineGammaPoissonFactorization2",
         "OnlineGammaPoissonFactorization3",
         "OnlineGammaPoissonFactorization4",
         "OnlineGammaPoissonFactorization_fast",
         "MinHashEncoder",
         "MinMeanMinHashEncoder",
     ]