def helmert_encoding(df, col):
    """Append Helmert-contrast columns for ``col`` onto ``df``.

    Fits a HelmertEncoder (invariant columns dropped) on ``df[col]`` and
    returns a new frame with the encoded columns concatenated on axis 1.
    """
    import pandas as pd
    import category_encoders as ce

    helmert = ce.HelmertEncoder(cols=col, drop_invariant=True)
    encoded = helmert.fit_transform(df[col])
    return pd.concat([df, encoded], axis=1)
def helmert():
    """Smoke-run HelmertEncoder on the mushroom dataset, printing frame info
    before and after encoding."""
    X, _, _ = get_mushroom_data()
    print(X.info())
    helmert_enc = ce.HelmertEncoder()
    helmert_enc.fit(X, None)
    transformed = helmert_enc.transform(X)
    print(transformed.info())
    # explicit cleanup of the large intermediates
    del helmert_enc, _, X, transformed
def apply_helmert_encoding(df, categorical_columns):
    """Helmert-encode ``categorical_columns`` of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; anything else raises ``DataFrameTypeError``.
    categorical_columns : list
        Names of the columns to encode.

    Returns
    -------
    pandas.DataFrame
        Encoded frame with the encoder's 'intercept' column removed.

    Raises
    ------
    DataFrameTypeError
        If ``df`` is not a DataFrame.
    """
    if not isinstance(df, pd.DataFrame):
        raise DataFrameTypeError('df', df)
    import category_encoders as ce

    # BUG FIX: fit on the DataFrame itself, not ``df.values`` — the bare
    # ndarray loses the column names that ``cols=categorical_columns``
    # refers to.
    encoder = ce.HelmertEncoder(cols=categorical_columns).fit(df)
    X_transformed = encoder.transform(df)
    # BUG FIX: the original dropped from an undefined name ``X`` (NameError);
    # the intercept column lives on ``X_transformed``.
    X_transformed.drop(['intercept'], inplace=True, axis=1)
    return X_transformed
def create_features(self, df_train, df_test):
    """Fit a Helmert encoder on the training fold and copy the encoded
    columns (suffixed ``'_HelmertEncoder'``) into ``self.train``/``self.test``."""
    target = df_train[self.target_column].values.tolist()
    encoder = ce.HelmertEncoder(cols=self.columns)
    encoder.fit(df_train[self.columns], target)
    train_enc = encoder.transform(df_train[self.columns])
    test_enc = encoder.transform(df_test[self.columns])
    for name in train_enc.columns:
        suffixed = name + '_HelmertEncoder'
        self.train[suffixed] = train_enc[name]
        self.test[suffixed] = test_enc[name]
def test_HandleMissingIndicator_NanInTrain_ExpectAsColumn(self):
    """A NaN seen during fit with handle_missing='indicator' gets its own
    contrast level in the encoded output."""
    train = ['A', 'B', np.nan]
    encoder = encoders.HelmertEncoder(handle_missing='indicator',
                                      handle_unknown='value')
    result = encoder.fit_transform(train)
    expected = [
        [1, -1, -1],
        [1, 1, -1],
        [1, 0, 2],
    ]
    self.assertTrue(np.array_equal(result.values.tolist(), expected))
def test_helmert_preserve_dimension_4(self):
    """Unknown ('D') and missing (None) test values map to all-zero
    contrasts, so the output width matches the training encoding."""
    train = ['A', 'B', 'C']
    test = ['D', 'B', 'C', None]
    encoder = encoders.HelmertEncoder(handle_unknown='value',
                                      handle_missing='value')
    encoder.fit(train)
    transformed = encoder.transform(test)
    expected = [
        [1, 0, 0],
        [1, 1, -1],
        [1, 0, 2],
        [1, 0, 0],
    ]
    self.assertEqual(transformed.values.tolist(), expected)
def test_HandleUnknown_HaveNoUnknownInTrain_ExpectIndicatorInTest(self):
    """With handle_unknown='indicator', a category first seen at transform
    time ('C') receives the indicator contrast level."""
    train = ['A', 'B']
    test = ['A', 'B', 'C']
    encoder = encoders.HelmertEncoder(handle_unknown='indicator',
                                      handle_missing='value')
    encoder.fit(train)
    result = encoder.transform(test)
    expected = [
        [1, -1, -1],
        [1, 1, -1],
        [1, 0, 2],
    ]
    self.assertEqual(result.values.tolist(), expected)
def main(params, inputs, outputs):
    """Pipeline node: Helmert-encode the columns named in ``params.columns``
    (a comma-separated string) and re-pickle the result.

    Reads a pickled DataFrame from ``inputs.data`` and writes the encoded
    frame to ``outputs.data_new``.
    """
    # IDIOM FIX: str.split already returns a list — the original wrapped it
    # in a redundant identity list comprehension.
    columns = params.columns.split(",")
    frame = pd.read_pickle(inputs.data)
    encoder = ce.HelmertEncoder(cols=columns)
    encoded = encoder.fit_transform(frame)
    encoded.to_pickle(outputs.data_new)
def __init__(self, encoder_type, columns_name=None):
    """
    :param encoder_type: key selecting one of the category_encoders classes
        (e.g. "Helmert", "OneHot", "Target", ...).
    :param columns_name: list of feature-column names to encode; forwarded
        to the encoder as ``cols``.
    :raises ValueError: if ``encoder_type`` is not a recognized key.
    """
    # IDIOM FIX: a dispatch table replaces the original 15-branch if/elif
    # chain; behavior for every valid key is unchanged.
    encoder_classes = {
        "BackwardDe": ce.BackwardDifferenceEncoder,  # backward difference coding
        "BaseN": ce.BaseNEncoder,
        "Binary": ce.BinaryEncoder,
        "Catboost": ce.CatBoostEncoder,
        "Hash": ce.HashingEncoder,
        "Helmert": ce.HelmertEncoder,
        "JamesStein": ce.JamesSteinEncoder,
        "LOO": ce.LeaveOneOutEncoder,       # leave-one-out coding
        "ME": ce.MEstimateEncoder,          # M-estimate encoder
        "OneHot": ce.OneHotEncoder,
        "OridinalEncoder": ce.OrdinalEncoder,  # ordinal coding (key typo kept for compatibility)
        "Sum": ce.SumEncoder,               # sum (deviation) coding
        "Polynomial": ce.PolynomialEncoder,
        "Target": ce.TargetEncoder,
        "WOE": ce.WOEEncoder,               # weight-of-evidence encoder
    }
    if encoder_type not in encoder_classes:
        raise ValueError("请选择正确的编码方式")
    self.encoder = encoder_classes[encoder_type](cols=columns_name)
def get_encoder_dict():
    """Return a name -> unfitted-encoder-instance mapping for benchmarking."""
    factories = [
        ('OneHotEncoder', ce.OneHotEncoder),
        ('BinaryEncoder', ce.BinaryEncoder),
        ('HashingEncoder', ce.HashingEncoder),
        ('LabelEncoder', le.MultiColumnLabelEncoder),
        ('FrequencyEncoder', fe.FrequencyEncoder),
        ('TargetEncoder', ce.TargetEncoder),
        ('HelmertEncoder', ce.HelmertEncoder),
        ('JamesSteinEncoder', ce.JamesSteinEncoder),
        ('BaseNEncoder', ce.BaseNEncoder),
        ('SumEncoder', ce.SumEncoder),
    ]
    # instantiate lazily here so each call returns fresh encoder objects
    return {name: factory() for name, factory in factories}
def encode(data):
    """Helmert-encode ``data`` and return the result as a list of rows.

    Parameters
    ----------
    data : array-like / DataFrame
        Input passed straight to the encoder.

    Returns
    -------
    list
        ``transform`` output converted via ``values.tolist()``.
    """
    # BUG FIX: ``verbose`` is a HelmertEncoder constructor argument, not a
    # ``fit`` argument — fit's signature is fit(X, y=None), so the original
    # ``encoder.fit(data, verbose=1)`` raises TypeError on current
    # category_encoders releases. (Dead commented-out encoder alternatives
    # removed.)
    encoder = ce.HelmertEncoder(verbose=1)
    encoder.fit(data)
    encoded = encoder.transform(data)
    return encoded.values.tolist()
def test_helmert(self):
    """Every column of the Helmert-encoded frame should be numeric."""
    cols = ['C1', 'D', 'E', 'F']
    enc = encoders.HelmertEncoder(verbose=1, cols=cols)
    X = self.create_dataset(n_rows=1000)
    X_test = enc.fit_transform(X, None)
    # same check as the original flag-and-assert loop, collapsed: each
    # dtype must compare equal to int or float
    for dtype in X_test.dtypes:
        self.assertTrue(dtype == int or dtype == float)
def fit(self, X, y=None):
    """Instantiate the encoder selected by ``self.type`` and fit it.

    Parameters
    ----------
    X : array-like / DataFrame
        Training features.
    y : array-like, optional
        Target, forwarded to the underlying encoder's ``fit``.

    Returns
    -------
    self

    Raises
    ------
    ValueError
        If ``self.type`` names no known encoder (the original silently
        refit a stale ``self.encoder`` or died with AttributeError).
    """
    # NOTE(review): 'impute'/'ignore' are legacy handle_unknown values;
    # recent category_encoders releases expect 'value'/'error'/
    # 'return_nan'/'indicator' — confirm against the pinned library version.
    factories = {
        'backdiff': lambda: ce.BackwardDifferenceEncoder(handle_unknown='ignore'),
        'binenc': lambda: ce.BinaryEncoder(handle_unknown='impute'),
        'hashenc': lambda: ce.HashingEncoder(),
        'helmenc': lambda: ce.HelmertEncoder(handle_unknown='impute'),
        'onehot': lambda: ce.OneHotEncoder(handle_unknown='ignore'),
        'ordenc': lambda: ce.OrdinalEncoder(handle_unknown='impute'),
        'sumenc': lambda: ce.SumEncoder(handle_unknown='ignore'),
        'polyenc': lambda: ce.PolynomialEncoder(handle_unknown='impute'),
    }
    try:
        self.encoder = factories[self.type]()
    except KeyError:
        raise ValueError('unknown encoder type: %r' % self.type) from None
    self.encoder.fit(X, y)
    return self
def test_helmert_2StringCols_ExpectCorrectOrder(self):
    """Encoded output preserves input column order, expanding each string
    column into its contrast columns in place (after the intercept)."""
    train = pd.DataFrame(
        {
            'col1': [1, 2, 3, 4],
            'col2': ['A', 'B', 'C', 'D'],
            'col3': [1, 2, 3, 4],
            'col4': ['A', 'B', 'C', 'A'],
        },
        columns=['col1', 'col2', 'col3', 'col4'])
    expected_columns = ['intercept', 'col1', 'col2_0', 'col2_1', 'col2_2',
                        'col3', 'col4_0', 'col4_1']
    encoder = encoders.HelmertEncoder(handle_unknown='value',
                                      handle_missing='value')
    encoder.fit(train)
    actual = encoder.transform(train).columns.values
    self.assertTrue(np.array_equal(expected_columns, actual))
def __init__(
    self,
    encoder_name,
    reduction_method=None,
    ngram_range=(2, 4),
    categories="auto",
    dtype=np.float64,
    handle_unknown="ignore",
    clf_type=None,
    n_components=None,
):
    # Build the registry of every candidate categorical-encoding strategy,
    # keyed by name; ``encoder_name`` selects one of them elsewhere.
    #
    # :param encoder_name: key into ``self.encoders_dict`` below.
    # :param reduction_method: stored as-is; not used in this constructor.
    # :param ngram_range: char/word n-gram span forwarded to the
    #     n-gram-based encoders and vectorizers.
    # :param categories: stored as-is; not used in this constructor.
    # :param dtype: stored as-is; not used in this constructor.
    # :param handle_unknown: stored as-is; note the entries below hard-code
    #     their own handle_unknown="ignore" rather than using this value.
    # :param clf_type: forwarded to TargetEncoder and MDVEncoder.
    # :param n_components: output dimensionality for the factorization /
    #     topic-model / hashing encoders.
    self.ngram_range = ngram_range
    self.encoder_name = encoder_name
    self.categories = categories
    self.dtype = dtype
    self.clf_type = clf_type
    self.handle_unknown = handle_unknown
    self.reduction_method = reduction_method
    self.n_components = n_components
    # Name -> unfitted transformer (or Pipeline, or None / [] sentinels for
    # the "Categorical" and "ngrams_hot_vectorizer" special cases).
    self.encoders_dict = {
        "OneHotEncoder": OneHotEncoder(handle_unknown="ignore"),
        "OneHotEncoder-1": OneHotEncoderRemoveOne(handle_unknown="ignore"),
        "Categorical": None,
        "OneHotEncoderDense": OneHotEncoder(handle_unknown="ignore",
                                            sparse=False),
        "OneHotEncoderDense-1": OneHotEncoderRemoveOne(
            handle_unknown="ignore", sparse=False),
        "SimilarityEncoder": SimilarityEncoder(ngram_range=self.ngram_range,
                                               random_state=10),
        "NgramNaiveFisherKernel": NgramNaiveFisherKernel(
            ngram_range=self.ngram_range, random_state=10),
        "ngrams_hot_vectorizer": [],
        "NgramsCountVectorizer": CountVectorizer(
            analyzer="char", ngram_range=self.ngram_range),
        "NgramsTfIdfVectorizer": TfidfVectorizer(
            analyzer="char", ngram_range=self.ngram_range, smooth_idf=False),
        "WordNgramsTfIdfVectorizer": TfidfVectorizer(
            analyzer="word", ngram_range=(1, 1), smooth_idf=False),
        "TargetEncoder": TargetEncoder(clf_type=self.clf_type,
                                       handle_unknown="ignore"),
        "MDVEncoder": MDVEncoder(self.clf_type),
        # contrast / binary codings from the category_encoders package
        "BackwardDifferenceEncoder": cat_enc.BackwardDifferenceEncoder(),
        "BinaryEncoder": cat_enc.BinaryEncoder(),
        "HashingEncoder": cat_enc.HashingEncoder(),
        "HelmertEncoder": cat_enc.HelmertEncoder(),
        "SumEncoder": cat_enc.SumEncoder(),
        "PolynomialEncoder": cat_enc.PolynomialEncoder(),
        "BaseNEncoder": cat_enc.BaseNEncoder(),
        "LeaveOneOutEncoder": cat_enc.LeaveOneOutEncoder(),
        # topic-model pipelines: char/word n-gram counts -> LDA or NMF
        "NgramsLDA": Pipeline([
            (
                "ngrams_count",
                CountVectorizer(analyzer="char",
                                ngram_range=self.ngram_range),
            ),
            (
                "LDA",
                LatentDirichletAllocation(n_components=self.n_components,
                                          learning_method="batch"),
            ),
        ]),
        "NMF": Pipeline([
            (
                "ngrams_count",
                CountVectorizer(analyzer="char",
                                ngram_range=self.ngram_range),
            ),
            ("NMF", NMF(n_components=self.n_components)),
        ]),
        "WordNMF": Pipeline([
            ("ngrams_count", CountVectorizer(analyzer="word",
                                             ngram_range=(1, 1))),
            ("NMF", NMF(n_components=self.n_components)),
        ]),
        "NgramsMultinomialMixture": NgramsMultinomialMixture(
            n_topics=self.n_components, max_iters=10),
        "AdHocNgramsMultinomialMixture": AdHocNgramsMultinomialMixture(
            n_iters=0),
        "AdHocIndependentPDF": AdHocIndependentPDF(),
        # Gamma-Poisson factorization variants — they differ only in their
        # hyper-parameters (rho vs r, batch size, init scheme, analyzer).
        "OnlineGammaPoissonFactorization":
        gamma_poisson_factorization.OnlineGammaPoissonFactorization(
            n_topics=self.n_components,
            rho=0.99,
            r=None,
            tol=1e-4,
            random_state=18,
            init="k-means++",
            ngram_range=self.ngram_range,
            rescale_W=True,
            max_iter_e_step=10,
        ),
        "OnlineGammaPoissonFactorization2":
        gamma_poisson_factorization.OnlineGammaPoissonFactorization(
            n_topics=self.n_components,
            r=0.3,
            rho=None,
            batch_size=256,
            tol=1e-4,
            random_state=18,
            init="k-means++",
            ngram_range=self.ngram_range,
            rescale_W=True,
            max_iter_e_step=20,
        ),
        "OnlineGammaPoissonFactorization3":
        gamma_poisson_factorization.OnlineGammaPoissonFactorization(
            n_topics=self.n_components,
            r=0.3,
            rho=None,
            batch_size=256,
            tol=1e-4,
            random_state=18,
            init="k-means",
            ngram_range=self.ngram_range,
            rescale_W=True,
            max_iter_e_step=20,
        ),
        "OnlineGammaPoissonFactorization4":
        gamma_poisson_factorization.OnlineGammaPoissonFactorization(
            n_topics=self.n_components,
            r=None,
            rho=0.95,
            batch_size=256,
            tol=1e-4,
            random_state=18,
            init="k-means",
            ngram_range=self.ngram_range,
            rescale_W=True,
            max_iter_e_step=20,
        ),
        "WordOnlineGammaPoissonFactorization":
        gamma_poisson_factorization.OnlineGammaPoissonFactorization(
            n_topics=self.n_components,
            r=0.3,
            tol=1e-4,
            random_state=18,
            init="k-means++",
            ngram_range=(1, 1),
            # NOTE(review): "analizer" looks like a misspelling of
            # "analyzer" — confirm the keyword the target class expects.
            analizer="word",
            rescale_W=True,
            max_iter_e_step=10,
        ),
        "OnlineGammaPoissonFactorization_fast":
        gamma_poisson_factorization.OnlineGammaPoissonFactorization(
            n_topics=self.n_components,
            r=0.3,
            ngram_range=(3, 3),
            max_iter=1,
            min_iter=1,
            tol=1e-4,
            random_state=18,
            init="k-means++",
            rescale_W=False,
        ),
        "MinHashEncoder": MinHashEncoder(n_components=self.n_components),
        "PretrainedFastText": PretrainedFastText(
            n_components=self.n_components),
        "PretrainedFastText_fr": PretrainedFastText(
            n_components=self.n_components, language="french"),
        "PretrainedFastText_hu": PretrainedFastText(
            n_components=self.n_components, language="hungarian"),
        # encoder_name=None means identity pass-through
        None: FunctionTransformer(None, validate=True),
        "Passthrough": PasstroughEncoder(),
    }
    # Encoders that expect a 1-D array of raw strings rather than a 2-D
    # column matrix. NOTE(review): some names here (e.g.
    # "NgramsMultinomialMixtureKMeans2", "MinMeanMinHashEncoder") have no
    # entry in encoders_dict above — presumably kept for other configs;
    # verify against the callers.
    self.list_1D_array_methods = [
        "NgramsCountVectorizer",
        "NgramsTfIdfVectorizer",
        "WordNgramsTfIdfVectorizer",
        "ngrams_hot_vectorizer",
        "NgramsLDA",
        "NMF",
        "WordNMF",
        "NgramsMultinomialMixture",
        "NgramsMultinomialMixtureKMeans2",
        "AdHocNgramsMultinomialMixture",
        "AdHocIndependentPDF",
        "GammaPoissonFactorization",
        "OnlineGammaPoissonFactorization",
        "WordOnlineGammaPoissonFactorization",
        "OnlineGammaPoissonFactorization2",
        "OnlineGammaPoissonFactorization3",
        "OnlineGammaPoissonFactorization4",
        "OnlineGammaPoissonFactorization_fast",
        "MinHashEncoder",
        "MinMeanMinHashEncoder",
    ]