Example #1
    def test_polynomial(self):
        """Fit PolynomialEncoder under several configurations and verify the output."""

        cols = ['C1', 'D', 'E', 'F']
        X = self.create_dataset(n_rows=1000)
        X_t = self.create_dataset(n_rows=100)

        enc = encoders.PolynomialEncoder(verbose=1, cols=cols)
        enc.fit(X, None)
        self.verify_numeric(enc.transform(X_t))

        enc = encoders.PolynomialEncoder(verbose=1)
        enc.fit(X, None)
        self.verify_numeric(enc.transform(X_t))

        enc = encoders.PolynomialEncoder(verbose=1, drop_invariant=True)
        enc.fit(X, None)
        self.verify_numeric(enc.transform(X_t))

        enc = encoders.PolynomialEncoder(verbose=1, return_df=False)
        enc.fit(X, None)
        self.assertTrue(isinstance(enc.transform(X_t), np.ndarray))
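For orientation, a minimal sketch of the output shape (column names assume the library's default naming, consistent with the column-order test in Example #15 below): each k-level categorical column becomes an intercept column plus k-1 polynomial contrast columns.

import pandas as pd
import category_encoders as encoders

X_demo = pd.DataFrame({'grade': ['low', 'mid', 'high']})
demo_enc = encoders.PolynomialEncoder(cols=['grade'])
print(demo_enc.fit_transform(X_demo).columns.tolist())
# expected, per the naming in Example #15: ['intercept', 'grade_0', 'grade_1']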
Example #2
    def _fit_polynomial(self, df, y, target, parameter):
        poly_encoder = ce.PolynomialEncoder()

        poly_encoder.fit(df[target].map(to_str), df[y])
        name = ['continuous_' + remove_continuous_discrete_prefix(x) + '_poly' for x in
                poly_encoder.get_feature_names()]
        self.trans_ls.append(('polynomial', name, target, poly_encoder))
Example #3
def polynomial():
    X, _, _ = get_mushroom_data()
    print(X.info())
    enc = ce.PolynomialEncoder()
    enc.fit(X, None)
    out = enc.transform(X)
    print(out.info())
    del enc, _, X, out
Example #4
    def test_HandleUnknown_HaveOnlyKnown_ExpectSecondColumn(self):
        train = ['A', 'B']

        encoder = encoders.PolynomialEncoder(handle_unknown='indicator')
        result = encoder.fit_transform(train)

        expected = [a_encoding, b_encoding]
        self.assertEqual(deep_round(result.values.tolist()),
                         deep_round(expected))
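The `a_encoding`, `b_encoding`, and `c_encoding` constants are defined elsewhere in the test module and are not shown in these excerpts. As a sketch, assuming standard orthogonal polynomial contrasts for a three-level factor (intercept, linear, quadratic), they would be approximately:

# assumed values: orthogonal polynomial coding for levels A < B < C
a_encoding = [1, -0.7071, 0.4082]
b_encoding = [1, 0.0, -0.8165]
c_encoding = [1, 0.7071, 0.4082]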
Example #5
    def create_features(self, df_train, df_test):
        encoder = ce.PolynomialEncoder(cols=self.columns)
        encoder.fit(df_train[self.columns],
                    df_train[self.target_column].values.tolist())
        encoded_train = encoder.transform(df_train[self.columns])
        encoded_test = encoder.transform(df_test[self.columns])
        for column in encoded_train.columns:
            self.train[column + '_PolynomialEncoder'] = encoded_train[column]
            self.test[column + '_PolynomialEncoder'] = encoded_test[column]
Example #6
    def test_HandleMissingIndicator_NanInTrain_ExpectAsColumn(self):
        train = ['A', 'B', np.nan]

        encoder = encoders.PolynomialEncoder(handle_missing='indicator', handle_unknown='value')
        result = encoder.fit_transform(train)

        expected = [a_encoding,
                    b_encoding,
                    c_encoding]
        self.assertEqual(deep_round(result.values.tolist()), deep_round(expected))
Example #7
def encode_categorical(df, encoder_name='binary'):
    encoder_dic = {"one_hot": ce.OneHotEncoder(),
                   "feature_hashing": ce.HashingEncoder(n_components=32),
                   "binary": ce.BinaryEncoder(),
                   "ordinal": ce.OrdinalEncoder(),
                   "polynomial": ce.PolynomialEncoder()}
    encoder = encoder_dic.get(encoder_name)
    encoder.fit(df)  # verbose is a constructor argument, not a fit() argument
    df = encoder.transform(df)
    return df
Example #8
    def test_polynomial_encoder_2cols(self):
        train = [['A', 'A'], ['B', 'B'], ['C', 'C']]

        encoder = encoders.PolynomialEncoder(handle_unknown='value', handle_missing='value')
        encoder.fit(train)
        obtained = encoder.transform(train)

        expected = [[1, a_encoding[1], a_encoding[2], a_encoding[1], a_encoding[2]],
                    [1, b_encoding[1], b_encoding[2], b_encoding[1], b_encoding[2]],
                    [1, c_encoding[1], c_encoding[2], c_encoding[1], c_encoding[2]]]
        self.assertEqual(deep_round(obtained.values.tolist()), deep_round(expected))
Example #9
    def test_polynomial_encoder_preserve_dimension_4(self):
        train = ['A', 'B', 'C']
        test = ['D', 'B', 'C', None]

        encoder = encoders.PolynomialEncoder()
        encoder.fit(train)
        test_t = encoder.transform(test)

        expected = [[1, 0, 0], b_encoding, c_encoding, [1, 0, 0]]
        self.assertEqual(deep_round(test_t.values.tolist()),
                         deep_round(expected))
Example #10
    def test_HandleUnknown_HaveNoUnknownInTrain_ExpectIndicatorInTest(self):
        train = ['A', 'B']
        test = ['A', 'B', 'C']

        encoder = encoders.PolynomialEncoder(handle_unknown='indicator')
        encoder.fit(train)
        result = encoder.transform(test)

        expected = [a_encoding, b_encoding, c_encoding]
        self.assertEqual(deep_round(result.values.tolist()),
                         deep_round(expected))
Example #11
    def test_polynomial_encoder_preserve_dimension_3(self):
        train = ['A', 'B', 'C']
        test = ['A', 'B', 'C', None]

        encoder = encoders.PolynomialEncoder(handle_unknown='value',
                                             handle_missing='value')
        encoder.fit(train)
        test_t = encoder.transform(test)

        expected = [a_encoding, b_encoding, c_encoding, [1, 0, 0]]
        self.assertEqual(deep_round(test_t.values.tolist()),
                         deep_round(expected))
Example #12
def main(params, inputs, outputs):
    columns_param = params.columns
    data = inputs.data
    data_new = outputs.data_new

    data_0 = pd.read_pickle(data)

    encoder = ce.PolynomialEncoder(cols=columns_param.split(","))
    data_1 = encoder.fit_transform(data_0)

    data_1.to_pickle(data_new)
Example #13
    def test_polynomial_np(self):
        """Fit on numpy-array input and verify the transformed output is numeric."""

        X = self.create_array(n_rows=1000)
        X_t = self.create_array(n_rows=100)

        enc = encoders.PolynomialEncoder(verbose=1)
        enc.fit(X, None)
        self.verify_numeric(enc.transform(X_t))
Example #14
    def __init__(self, encoder_type, columns_name=None):
        """
        :param encoder_type: name of the encoder to construct
        :param columns_name: list of feature-column names
        """
        if encoder_type == "BackwardDe":  # backward difference encoding
            self.encoder = ce.BackwardDifferenceEncoder(cols=columns_name)

        elif encoder_type == "BaseN":  # BaseN encoding
            self.encoder = ce.BaseNEncoder(cols=columns_name)

        elif encoder_type == "Binary":  # binary encoding
            self.encoder = ce.BinaryEncoder(cols=columns_name)

        elif encoder_type == "Catboost":
            self.encoder = ce.CatBoostEncoder(cols=columns_name)

        elif encoder_type == "Hash":
            self.encoder = ce.HashingEncoder(cols=columns_name)

        elif encoder_type == "Helmert":
            self.encoder = ce.HelmertEncoder(cols=columns_name)

        elif encoder_type == "JamesStein":
            self.encoder = ce.JamesSteinEncoder(cols=columns_name)

        elif encoder_type == "LOO":  # leave-one-out encoding
            self.encoder = ce.LeaveOneOutEncoder(cols=columns_name)

        elif encoder_type == "ME":
            self.encoder = ce.MEstimateEncoder(cols=columns_name)  # M-estimate encoder

        elif encoder_type == "Ordinal":  # ordinal encoding
            self.encoder = ce.OrdinalEncoder(cols=columns_name)

        elif encoder_type == "Sum":  # sum encoding
            self.encoder = ce.SumEncoder(cols=columns_name)

        elif encoder_type == "Polynomial":  # polynomial encoding
            self.encoder = ce.PolynomialEncoder(cols=columns_name)

        elif encoder_type == "Target":  # target encoding
            self.encoder = ce.TargetEncoder(cols=columns_name)

        elif encoder_type == "WOE":  # weight-of-evidence encoder
            self.encoder = ce.WOEEncoder(cols=columns_name)

        else:
            raise ValueError("Please pass a valid encoder type")
Example #15
    def test_polynomial_encoder_2StringCols_ExpectCorrectOrder(self):
        train = pd.DataFrame({'col1': [1, 2, 3, 4],
                              'col2': ['A', 'B', 'C', 'D'],
                              'col3': [1, 2, 3, 4],
                              'col4': ['A', 'B', 'C', 'A']
                              },
                             columns=['col1', 'col2', 'col3', 'col4'])
        expected_columns = ['intercept', 'col1', 'col2_0', 'col2_1', 'col2_2', 'col3', 'col4_0', 'col4_1']
        encoder = encoders.PolynomialEncoder(handle_unknown='value', handle_missing='value')

        encoder.fit(train)
        columns = encoder.transform(train).columns.values

        self.assertItemsEqual(expected_columns, columns)
Example #16
    def test_polynomial(self):
        """Fit-transform a dataset and verify every resulting dtype is numeric."""

        cols = ['C1', 'D', 'E', 'F']
        enc = encoders.PolynomialEncoder(verbose=1, cols=cols)
        X = self.create_dataset(n_rows=1000)

        X_test = enc.fit_transform(X, None)

        for dt in X_test.dtypes:
            self.assertTrue(dt == int or dt == float)
Example #17
    def fit(self, X, y=None):
        if self.type == 'backdiff':
            self.encoder = ce.BackwardDifferenceEncoder(
                handle_unknown='ignore')
        elif self.type == 'binenc':
            self.encoder = ce.BinaryEncoder(handle_unknown='impute')
        elif self.type == 'hashenc':
            self.encoder = ce.HashingEncoder()
        elif self.type == 'helmenc':
            self.encoder = ce.HelmertEncoder(handle_unknown='impute')
        elif self.type == 'onehot':
            self.encoder = ce.OneHotEncoder(handle_unknown='ignore')
        elif self.type == 'ordenc':
            self.encoder = ce.OrdinalEncoder(handle_unknown='impute')
        elif self.type == 'sumenc':
            self.encoder = ce.SumEncoder(handle_unknown='ignore')
        elif self.type == 'polyenc':
            self.encoder = ce.PolynomialEncoder(handle_unknown='impute')
        self.encoder.fit(X, y)
        return self
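Note that `handle_unknown='impute'` and `'ignore'` are option names from category_encoders 1.x; version 2.x accepts 'error', 'return_nan', 'value', and 'indicator' instead (as the tests above use). A minimal sketch of the rough 2.x equivalent, assuming 'value' as the replacement for 'impute':

enc = ce.PolynomialEncoder(handle_unknown='value')  # 2.x counterpart of the legacy 'impute'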
Example #18
r_cols = [
    'r1_hero_name', 'r2_hero_name', 'r3_hero_name', 'r4_hero_name',
    'r5_hero_name'
]
r_id_cols = [
    'r1_account_id_hash', 'r2_account_id_hash', 'r3_account_id_hash',
    'r4_account_id_hash', 'r5_account_id_hash'
]

d_cols = [
    'd1_hero_name', 'd2_hero_name', 'd3_hero_name', 'd4_hero_name',
    'd5_hero_name'
]
d_id_cols = [
    'd1_account_id_hash', 'd2_account_id_hash', 'd3_account_id_hash',
    'd4_account_id_hash', 'd5_account_id_hash'
]

r_ce_poly = ce.PolynomialEncoder(cols=r_cols)
d_ce_poly = ce.PolynomialEncoder(cols=d_cols)

r_id_ce_hash = ce.OrdinalEncoder(cols=r_id_cols)
d_id_ce_hash = ce.OrdinalEncoder(cols=d_id_cols)

r_ce_poly.fit(r_train_hero_name[r_cols], y)
d_ce_poly.fit(d_train_hero_name[d_cols], y)
r_id_ce_hash.fit(r_train_account[r_id_cols], y)
d_id_ce_hash.fit(d_train_account[d_id_cols], y)

df_new_features.drop([
    'r1_hero_name', 'r2_hero_name', 'r3_hero_name', 'r4_hero_name',
    'r5_hero_name', 'd1_hero_name', 'd2_hero_name', 'd3_hero_name',
    'd4_hero_name', 'd5_hero_name'
],
Example #19
 def __init__(
     self,
     encoder_name,
     reduction_method=None,
     ngram_range=(2, 4),
     categories="auto",
     dtype=np.float64,
     handle_unknown="ignore",
     clf_type=None,
     n_components=None,
 ):
     self.ngram_range = ngram_range
     self.encoder_name = encoder_name
     self.categories = categories
     self.dtype = dtype
     self.clf_type = clf_type
     self.handle_unknown = handle_unknown
     self.reduction_method = reduction_method
     self.n_components = n_components
     self.encoders_dict = {
         "OneHotEncoder":
         OneHotEncoder(handle_unknown="ignore"),
         "OneHotEncoder-1":
         OneHotEncoderRemoveOne(handle_unknown="ignore"),
         "Categorical":
         None,
         "OneHotEncoderDense":
         OneHotEncoder(handle_unknown="ignore", sparse=False),
         "OneHotEncoderDense-1":
         OneHotEncoderRemoveOne(handle_unknown="ignore", sparse=False),
         "SimilarityEncoder":
         SimilarityEncoder(ngram_range=self.ngram_range, random_state=10),
         "NgramNaiveFisherKernel":
         NgramNaiveFisherKernel(ngram_range=self.ngram_range,
                                random_state=10),
         "ngrams_hot_vectorizer": [],
         "NgramsCountVectorizer":
         CountVectorizer(analyzer="char", ngram_range=self.ngram_range),
         "NgramsTfIdfVectorizer":
         TfidfVectorizer(analyzer="char",
                         ngram_range=self.ngram_range,
                         smooth_idf=False),
         "WordNgramsTfIdfVectorizer":
         TfidfVectorizer(analyzer="word",
                         ngram_range=(1, 1),
                         smooth_idf=False),
         "TargetEncoder":
         TargetEncoder(clf_type=self.clf_type, handle_unknown="ignore"),
         "MDVEncoder":
         MDVEncoder(self.clf_type),
         "BackwardDifferenceEncoder":
         cat_enc.BackwardDifferenceEncoder(),
         "BinaryEncoder":
         cat_enc.BinaryEncoder(),
         "HashingEncoder":
         cat_enc.HashingEncoder(),
         "HelmertEncoder":
         cat_enc.HelmertEncoder(),
         "SumEncoder":
         cat_enc.SumEncoder(),
         "PolynomialEncoder":
         cat_enc.PolynomialEncoder(),
         "BaseNEncoder":
         cat_enc.BaseNEncoder(),
         "LeaveOneOutEncoder":
         cat_enc.LeaveOneOutEncoder(),
         "NgramsLDA":
         Pipeline([
             (
                 "ngrams_count",
                 CountVectorizer(analyzer="char",
                                 ngram_range=self.ngram_range),
             ),
             (
                 "LDA",
                 LatentDirichletAllocation(n_components=self.n_components,
                                           learning_method="batch"),
             ),
         ]),
         "NMF":
         Pipeline([
             (
                 "ngrams_count",
                 CountVectorizer(analyzer="char",
                                 ngram_range=self.ngram_range),
             ),
             ("NMF", NMF(n_components=self.n_components)),
         ]),
         "WordNMF":
         Pipeline([
             ("ngrams_count",
              CountVectorizer(analyzer="word", ngram_range=(1, 1))),
             ("NMF", NMF(n_components=self.n_components)),
         ]),
         "NgramsMultinomialMixture":
         NgramsMultinomialMixture(n_topics=self.n_components, max_iters=10),
         "AdHocNgramsMultinomialMixture":
         AdHocNgramsMultinomialMixture(n_iters=0),
         "AdHocIndependentPDF":
         AdHocIndependentPDF(),
         "OnlineGammaPoissonFactorization":
         gamma_poisson_factorization.OnlineGammaPoissonFactorization(
             n_topics=self.n_components,
             rho=0.99,
             r=None,
             tol=1e-4,
             random_state=18,
             init="k-means++",
             ngram_range=self.ngram_range,
             rescale_W=True,
             max_iter_e_step=10,
         ),
         "OnlineGammaPoissonFactorization2":
         gamma_poisson_factorization.OnlineGammaPoissonFactorization(
             n_topics=self.n_components,
             r=0.3,
             rho=None,
             batch_size=256,
             tol=1e-4,
             random_state=18,
             init="k-means++",
             ngram_range=self.ngram_range,
             rescale_W=True,
             max_iter_e_step=20,
         ),
         "OnlineGammaPoissonFactorization3":
         gamma_poisson_factorization.OnlineGammaPoissonFactorization(
             n_topics=self.n_components,
             r=0.3,
             rho=None,
             batch_size=256,
             tol=1e-4,
             random_state=18,
             init="k-means",
             ngram_range=self.ngram_range,
             rescale_W=True,
             max_iter_e_step=20,
         ),
         "OnlineGammaPoissonFactorization4":
         gamma_poisson_factorization.OnlineGammaPoissonFactorization(
             n_topics=self.n_components,
             r=None,
             rho=0.95,
             batch_size=256,
             tol=1e-4,
             random_state=18,
             init="k-means",
             ngram_range=self.ngram_range,
             rescale_W=True,
             max_iter_e_step=20,
         ),
         "WordOnlineGammaPoissonFactorization":
         gamma_poisson_factorization.OnlineGammaPoissonFactorization(
             n_topics=self.n_components,
             r=0.3,
             tol=1e-4,
             random_state=18,
             init="k-means++",
             ngram_range=(1, 1),
             analizer="word",
             rescale_W=True,
             max_iter_e_step=10,
         ),
         "OnlineGammaPoissonFactorization_fast":
         gamma_poisson_factorization.OnlineGammaPoissonFactorization(
             n_topics=self.n_components,
             r=0.3,
             ngram_range=(3, 3),
             max_iter=1,
             min_iter=1,
             tol=1e-4,
             random_state=18,
             init="k-means++",
             rescale_W=False,
         ),
         "MinHashEncoder":
         MinHashEncoder(n_components=self.n_components),
         "PretrainedFastText":
         PretrainedFastText(n_components=self.n_components),
         "PretrainedFastText_fr":
         PretrainedFastText(n_components=self.n_components,
                            language="french"),
         "PretrainedFastText_hu":
         PretrainedFastText(n_components=self.n_components,
                            language="hungarian"),
         None:
         FunctionTransformer(None, validate=True),
         "Passthrough":
         PasstroughEncoder(),
     }
     self.list_1D_array_methods = [
         "NgramsCountVectorizer",
         "NgramsTfIdfVectorizer",
         "WordNgramsTfIdfVectorizer",
         "ngrams_hot_vectorizer",
         "NgramsLDA",
         "NMF",
         "WordNMF",
         "NgramsMultinomialMixture",
         "NgramsMultinomialMixtureKMeans2",
         "AdHocNgramsMultinomialMixture",
         "AdHocIndependentPDF",
         "GammaPoissonFactorization",
         "OnlineGammaPoissonFactorization",
         "WordOnlineGammaPoissonFactorization",
         "OnlineGammaPoissonFactorization2",
         "OnlineGammaPoissonFactorization3",
         "OnlineGammaPoissonFactorization4",
         "OnlineGammaPoissonFactorization_fast",
         "MinHashEncoder",
         "MinMeanMinHashEncoder",
     ]
Example #20
def pipeline(df, target, cat_columns, models):
    n_rows, n_cols = df.shape
    metrics = {
        "n_rows": [],
        "n_cols": [],
        "cardinality": [],
        "model": [],
        "column": [],
        "encoder": [],
        "rmse": [],
        "mae": [],
        "fit_time": [],
        "rmse_change": [],
        "mae_change": [],
        "fit_time_change": [],
    }
    columns = cat_columns

    for model_name in models:

        base_rmse, base_mae, base_fit_time = model(
            df=df,
            target=target,
            encoder=np.nan,
            col=np.nan,
            model_name=model_name,
            encoder_type="basic",
            encoder_name=[],
        )

        _append_metric(
            row_list=metrics,
            n_rows=n_rows,
            n_cols=n_cols,
            cardinality=np.nan,
            model_name=model_name,
            column=np.nan,
            name="basic",
            rmse=base_rmse,
            mae=base_mae,
            fit_time=base_fit_time,
            base_rmse=base_rmse,
            base_mae=base_mae,
            base_fit_time=base_fit_time,
        )

        for column in columns:
            print()
            print(column)
            cardinality = df[column].nunique()

            print("ohe")
            rmse, mae, fit_time = model(
                df=df,
                target=target,
                encoder=np.nan,
                col=column,
                model_name=model_name,
                encoder_type="basic",
                encoder_name="One Hot Encoder (pd.dummies)",
            )
            _append_metric(
                row_list=metrics,
                n_rows=n_rows,
                n_cols=n_cols,
                cardinality=cardinality,
                model_name=model_name,
                column=column,
                name="One Hot Encoder (pd.dummies)",
                rmse=rmse,
                mae=mae,
                fit_time=fit_time,
                base_rmse=base_rmse,
                base_mae=base_mae,
                base_fit_time=base_fit_time,
            )

            encoders = [
                ("Sum Encoder(sleepmind)", SumEncoder()),
                ("BinaryEncoder", ce.BinaryEncoder(cols=[column])),
                ("HashingEncoder", ce.HashingEncoder(cols=[column])),
                ("OneHotEncoder", ce.OneHotEncoder(cols=[column])),
                ("OrdinalEncoder", ce.OrdinalEncoder(cols=[column])),
                ("BaseNEncoder", ce.BaseNEncoder(cols=[column])),
                (
                    "BackwardDifferenceEncoder",
                    ce.BackwardDifferenceEncoder(cols=[column]),
                ),
                ("HelmertEncoder", ce.HelmertEncoder(cols=[column])),
                ("SumEncoder", ce.SumEncoder(cols=[column])),
                ("PolynomialEncoder", ce.PolynomialEncoder(cols=[column])),
                ("TargetEncoder", ce.TargetEncoder(cols=[column])),
                ("LeaveOneOutEncoder", ce.LeaveOneOutEncoder(cols=[column])),
                (
                    "XAM_bayesian_targetEncoder",
                    BayesianTargetEncoder(columns=[column],
                                          prior_weight=3,
                                          suffix=""),
                ),
            ]

            for name, encoder in encoders:
                print(name)
                rmse, mae, fit_time = model(
                    df=df,
                    target=target,
                    encoder=encoder,
                    col=column,
                    model_name=model_name,
                    encoder_type="sklearn_encoding",
                    encoder_name=name,
                )
                _append_metric(
                    row_list=metrics,
                    n_rows=n_rows,
                    n_cols=n_cols,
                    cardinality=cardinality,
                    model_name=model_name,
                    column=column,
                    name=name,
                    rmse=rmse,
                    mae=mae,
                    fit_time=fit_time,
                    base_rmse=base_rmse,
                    base_mae=base_mae,
                    base_fit_time=base_fit_time,
                )

            bayes_encoders = [
                ("hcc_BayesEncoding", BayesEncoding),
                ("hcc_BayesEncodingKfold", BayesEncodingKfold),
                ("LOOEncoding", LOOEncoding),
                ("LOOEncodingKfold", LOOEncodingKfold),
            ]
            for name, bayes_encoder in bayes_encoders:
                print(name)
                rmse, mae, fit_time = model(
                    df=df,
                    target=target,
                    encoder=bayes_encoder,
                    col=column,
                    model_name=model_name,
                    encoder_name=name,
                    encoder_type="basic",
                    hcc_ind=1,
                )
                _append_metric(
                    row_list=metrics,
                    n_rows=n_rows,
                    n_cols=n_cols,
                    cardinality=cardinality,
                    model_name=model_name,
                    column=column,
                    name=name,
                    rmse=rmse,
                    mae=mae,
                    fit_time=fit_time,
                    base_rmse=base_rmse,
                    base_mae=base_mae,
                    base_fit_time=base_fit_time,
                )
    results = pd.DataFrame(metrics)
    return results
Example #21
def main(dataSetName, X, y):

    scores = []
    raw_scores_ds = {}

    # Loading logistic regression classifier
    clf = linear_model.LogisticRegression()

    # try every encoding method available
    #encoders = ce.__all__
    encoders = [
        "BackwardDifferenceEncoder", "BinaryEncoder", "HashingEncoder",
        "HelmertEncoder", "OneHotEncoder", "OrdinalEncoder", "SumEncoder",
        "PolynomialEncoder", "BaseNEncoder", "LeaveOneOutEncoder"
    ]
    print(encoders)

    for encoder_name in encoders:
        print(encoder_name)
        if (encoder_name == "BackwardDifferenceEncoder"):
            encoder = ce.BackwardDifferenceEncoder(cols=columnsToEncode)
        if (encoder_name == "BinaryEncoder"):
            encoder = ce.BinaryEncoder(cols=columnsToEncode)
        if (encoder_name == "HashingEncoder"):
            encoder = ce.HashingEncoder(cols=columnsToEncode)
        if (encoder_name == "HelmertEncoder"):
            encoder = ce.HelmertEncoder(cols=columnsToEncode)
        if (encoder_name == "OneHotEncoder"):
            encoder = ce.OneHotEncoder(cols=columnsToEncode)
        if (encoder_name == "OrdinalEncoder"):
            encoder = ce.OrdinalEncoder(cols=columnsToEncode)
        if (encoder_name == "SumEncoder"):
            encoder = ce.SumEncoder(cols=columnsToEncode)
        if (encoder_name == "PolynomialEncoder"):
            encoder = ce.PolynomialEncoder(cols=columnsToEncode)
        if (encoder_name == "BaseNEncoder"):
            encoder = ce.BaseNEncoder(cols=columnsToEncode)
        if (encoder_name == "LeaveOneOutEncoder"):
            encoder = ce.LeaveOneOutEncoder(cols=columnsToEncode)
        #encoder = getattr(category_encoders, encoder_name)
        print(encoder)
        start_time = time.time()
        score, stds, raw_scores, dim = score_models(clf, X, y, encoder,
                                                    encoder_name, dataSetName)
        scores.append([
            encoder_name, dataSetName[0], dim, score, stds,
            time.time() - start_time
        ])
        raw_scores_ds[encoder_name] = raw_scores
        gc.collect()

    results = pd.DataFrame(scores,
                           columns=[
                               'Encoding', 'Dataset', 'Dimensionality',
                               'Avg. Score', 'Score StDev', 'Elapsed Time'
                           ])

    #print(raw_scores_ds)
    #raw = pd.DataFrame.from_dict(raw_scores_ds)
    #print(raw)
    #ax = raw.plot(kind='box', return_type='axes')
    #plt.title('Scores for Encodings on %s Dataset' % (name, ))
    #plt.ylabel('Score (higher better)')
    #for tick in ax.get_xticklabels():
    #tick.set_rotation(90)
    #plt.grid()
    #plt.tight_layout()
    #plt.show()

    #return results, raw
    return results
Example #22
def get_factors(model, df, fnum, fname, nvalues, dtype, encoder, rounding,
                sentinel):
    r"""Convert the original feature to a factor.

    Parameters
    ----------
    model : alphapy.Model
        Model object with the feature specifications.
    df : pandas.DataFrame
        Dataframe containing the column ``fname``.
    fnum : int
        Feature number, strictly for logging purposes.
    fname : str
        Name of the text column in the dataframe ``df``.
    nvalues : int
        The number of unique values.
    dtype : str
        The values ``'float64'``, ``'int64'``, or ``'bool'``.
    encoder : alphapy.features.Encoders
        Type of encoder to apply.
    rounding : int
        Number of places to round.
    sentinel : float
        The number to be imputed for NaN values.

    Returns
    -------
    all_features : numpy array
        The features that have been transformed to factors.

    """

    logger.info("Feature %d: %s is a factor of type %s with %d unique values",
                fnum, fname, dtype, nvalues)
    logger.info("Encoding: %s", encoder)

    # Extract model data

    feature_map = model.feature_map
    model_type = model.specs['model_type']
    target_value = model.specs['target_value']

    # get feature
    feature = df[fname]
    # convert float to factor
    if dtype == 'float64':
        logger.info("Rounding: %d", rounding)
        feature = feature.apply(float_factor, args=[rounding])
    # encoders
    pd_features = pd.DataFrame()
    enc = None
    ef = pd.DataFrame(feature)
    if encoder == Encoders.factorize:
        pd_factors = pd.factorize(feature)[0]
        pd_features = pd.DataFrame(pd_factors)
    elif encoder == Encoders.onehot:
        pd_features = pd.get_dummies(feature)
    elif encoder == Encoders.ordinal:
        enc = ce.OrdinalEncoder(cols=[fname])
    elif encoder == Encoders.binary:
        enc = ce.BinaryEncoder(cols=[fname])
    elif encoder == Encoders.helmert:
        enc = ce.HelmertEncoder(cols=[fname])
    elif encoder == Encoders.sumcont:
        enc = ce.SumEncoder(cols=[fname])
    elif encoder == Encoders.polynomial:
        enc = ce.PolynomialEncoder(cols=[fname])
    elif encoder == Encoders.backdiff:
        enc = ce.BackwardDifferenceEncoder(cols=[fname])
    else:
        raise ValueError("Unknown Encoder %s" % encoder)
    # If encoding worked, calculate target percentages for classifiers.
    pd_exists = not pd_features.empty
    enc_exists = enc is not None
    all_features = None
    if pd_exists or enc_exists:
        if pd_exists:
            all_features = pd_features
        elif enc_exists:
            all_features = enc.fit_transform(ef, None)
        # Calculate target percentages for factors
        if (model_type == ModelType.classification
                and fname in feature_map['crosstabs']):
            # Get the crosstab for this feature
            ct = feature_map['crosstabs'][fname]
            # map target percentages to the new feature
            ct_map = ct.to_dict()[target_value]
            ct_feature = df[[fname]].applymap(ct_map.get)
            # impute sentinel for any values that could not be mapped
            ct_feature.fillna(value=sentinel, inplace=True)
            # concatenate all generated features
            all_features = np.column_stack((all_features, ct_feature))
            logger.info("Applied target percentages for %s", fname)
    else:
        raise RuntimeError("Encoding for feature %s failed" % fname)
    return all_features
Example #23
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
"""
Created on Sun Jan  7 23:17:31 2018

@author: tgadfort
"""

#conda install -c conda-forge category_encoders
#https://github.com/scikit-learn-contrib/categorical-encoding

import category_encoders as ce

encoder = ce.BackwardDifferenceEncoder(cols=[...])
encoder = ce.BinaryEncoder(cols=[...])
encoder = ce.HashingEncoder(cols=[...])
encoder = ce.HelmertEncoder(cols=[...])
encoder = ce.OneHotEncoder(cols=[...])
encoder = ce.OrdinalEncoder(cols=[...])
encoder = ce.SumEncoder(cols=[...])
encoder = ce.PolynomialEncoder(cols=[...])
encoder = ce.BaseNEncoder(cols=[...])
encoder = ce.LeaveOneOutEncoder(cols=[...])
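A minimal, self-contained round trip with one of these constructors (the DataFrame and column name are illustrative, not from the original script):

import pandas as pd
import category_encoders as ce

df_demo = pd.DataFrame({'color': ['red', 'green', 'blue', 'green']})
encoder = ce.PolynomialEncoder(cols=['color'])
encoded = encoder.fit_transform(df_demo)  # intercept plus contrast columns
print(encoded.head())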
Example #24
def get_model(PARAMS):
    """return model for provided params

    :param PARAMS: dictionary with model params
    :type PARAMS: dict
    :return: model pipeline
    :rtype: sklearn pipeline
    """

    try:
        te_dict = {
            'CatBoostEncoder': ce.CatBoostEncoder(),
            'HashingEncoder': ce.HashingEncoder(),
            'HelmertEncoder': ce.HelmertEncoder(),
            'LeaveOneOutEncoder': ce.LeaveOneOutEncoder(),
            'OneHotEncoder': ce.OneHotEncoder(),
            'TargetEncoder': ce.TargetEncoder(),
            'WOEEncoder': ce.WOEEncoder(),
            'BackwardDifferenceEncoder': ce.BackwardDifferenceEncoder(),
            'BaseNEncoder': ce.BaseNEncoder(),
            'BinaryEncoder': ce.BinaryEncoder(),
            'CountEncoder': ce.CountEncoder(),
            'JamesSteinEncoder': ce.JamesSteinEncoder(),
            'MEstimateEncoder': ce.MEstimateEncoder(),
            'PolynomialEncoder': ce.PolynomialEncoder(),
            'SumEncoder': ce.SumEncoder()
        }

        pipe = make_pipeline(
            helpers.PrepareData(extraxt_year=True, unicode_text=True),
            ColumnTransformer([
                ('num', helpers.PassThroughOrReplace(), [
                    'flat_size', 'rooms', 'floor', 'number_of_floors',
                    'year_of_building', 'GC_latitude', 'GC_longitude'
                ]),
                ('te_producer', te_dict.get(PARAMS['te_producer']),
                 'producer_name'),
                ('te_road', te_dict.get(PARAMS['te_road']), 'GC_addr_road'),
                ('te_neighbourhood', te_dict.get(PARAMS['te_neighbourhood']),
                 'GC_addr_neighbourhood'),
                ('te_suburb', te_dict.get(PARAMS['te_suburb']),
                 'GC_addr_suburb'),
                ('te_postcode', te_dict.get(PARAMS['te_postcode']),
                 'GC_addr_postcode'),
                ('txt_name',
                 TfidfVectorizer(lowercase=True,
                                 ngram_range=(1,
                                              PARAMS['txt_name__ngram_range']),
                                 max_features=PARAMS['txt_name__max_features'],
                                 dtype=np.float32,
                                 binary=PARAMS['txt_name__binary'],
                                 use_idf=PARAMS['txt_name__use_idf']), 'name'),
                ('txt_dscr',
                 TfidfVectorizer(lowercase=True,
                                 ngram_range=(1,
                                              PARAMS['txt_dscr__ngram_range']),
                                 max_features=PARAMS['txt_dscr__max_features'],
                                 dtype=np.float32,
                                 binary=PARAMS['txt_dscr__binary'],
                                 use_idf=PARAMS['txt_dscr__use_idf']),
                 'description'),
            ]),
            TransformedTargetRegressor(
                regressor=lgb.LGBMRegressor(**PARAMS, random_state=seed),
                func=np.log1p,
                inverse_func=np.expm1))

        return pipe

    except BaseException as e:
        LOG.error(e)
        return None
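A hedged usage sketch of `get_model` (the keys mirror those read inside the function; the concrete values are illustrative assumptions):

PARAMS = {
    'te_producer': 'PolynomialEncoder',
    'te_road': 'TargetEncoder',
    'te_neighbourhood': 'CatBoostEncoder',
    'te_suburb': 'OneHotEncoder',
    'te_postcode': 'BinaryEncoder',
    'txt_name__ngram_range': 2, 'txt_name__max_features': 500,
    'txt_name__binary': False, 'txt_name__use_idf': True,
    'txt_dscr__ngram_range': 2, 'txt_dscr__max_features': 500,
    'txt_dscr__binary': False, 'txt_dscr__use_idf': True,
}
pipe = get_model(PARAMS)  # note: PARAMS is also forwarded to LGBMRegressor(**PARAMS)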
Example #25
 def __init__(self, name: str):
     super().__init__(name, '_poly_', ce.PolynomialEncoder(cols=[name]),
                      -1.0, None)
Example #26
df['binary_hashmap'] = binary_hashmap(
    df[df.columns.difference(NOT_BINARY_COLS)], low_count=4)

# Let's encode the non-binary cols to be used in base models.
print("Recoding object values")
for col in NOT_BINARY_COLS + ['binary_hashmap']:
    df[col] = pd.factorize(df[col])[0]

# Let's do PCA on the binary cols.
binary_cols = list(
    set(list(train_df)) - set(NOT_BINARY_COLS + ['binary_hashmap']))
pca_train, pca_test = decomp_features(train_df[binary_cols],
                                      test_df[binary_cols],
                                      n_comp=50)

encoder = ce.PolynomialEncoder(cols=NOT_BINARY_COLS + ['binary_hashmap'])
poly = encoder.fit_transform(df[NOT_BINARY_COLS + ['binary_hashmap']])

df = pd.concat([df[df.columns.difference(binary_cols)], poly], axis=1)

# Now split into train and test and save the output of the processed dataset.
xtrain = df[:ntrain].copy()
xtest = df[ntrain:].copy()

xtrain = pd.concat([xtrain, pca_train], axis=1)
xtest = pd.concat([xtest, pca_test], axis=1)

xtrain['ID'] = id_train
xtrain['y'] = y_train
xtest['ID'] = id_test
Example #27
            'postoperative.patient.data.arff', 'primary.tumor.arff', 'sick.arff', 'solar.flare1.arff', 'solar.flare2.arff',
            'soybean.arff', 'spectrometer.arff', 'sponge.arff', 'tic-tac-toe.arff', 'trains.arff', 'vote.arff', 'vowel.arff']

# We painstakingly initialize each encoder here because that gives us the
# freedom to initialize each one with any setting we want (see the note after
# the list).
encoders = [category_encoders.BackwardDifferenceEncoder(),
            category_encoders.BaseNEncoder(),
            category_encoders.BinaryEncoder(),
            category_encoders.HashingEncoder(),
            category_encoders.HelmertEncoder(),
            category_encoders.JamesSteinEncoder(),
            category_encoders.LeaveOneOutEncoder(),
            category_encoders.MEstimateEncoder(),
            category_encoders.OneHotEncoder(),
            category_encoders.OrdinalEncoder(),
            category_encoders.PolynomialEncoder(),
            category_encoders.SumEncoder(),
            category_encoders.TargetEncoder(),
            category_encoders.WOEEncoder()]
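# Any entry above could be given a non-default configuration instead, e.g.
# category_encoders.PolynomialEncoder(handle_unknown='indicator') -- a
# hypothetical setting, not part of the original benchmark run.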

# Initialization
if os.path.isfile('./output/result.csv'):
    os.remove('./output/result.csv')

# Loop over datasets, then over encoders, and finally, over the models
for dataset_name in datasets:
    X, y, fold_count = arff_loader.load(dataset_name)

    # Random permutation (needed for CatBoost encoder)
    perm = np.random.permutation(len(X))
    X = X.iloc[perm].reset_index(drop=True)
Example #28
 def __init__(self,
              encoder_name,
              reduction_method=None,
              ngram_range=(2, 4),
              categories='auto',
              dtype=np.float64,
              handle_unknown='ignore',
              clf_type=None,
              n_components=None):
     self.ngram_range = ngram_range
     self.encoder_name = encoder_name
     self.categories = categories
     self.dtype = dtype
     self.clf_type = clf_type
     self.handle_unknown = handle_unknown
     self.reduction_method = reduction_method
     self.n_components = n_components
     self.encoders_dict = {
         'OneHotEncoder':
         OneHotEncoder(handle_unknown='ignore'),
         'OneHotEncoder-1':
         OneHotEncoderRemoveOne(handle_unknown='ignore'),
         'Categorical':
         None,
         'OneHotEncoderDense':
         OneHotEncoder(handle_unknown='ignore', sparse=False),
         'OneHotEncoderDense-1':
         OneHotEncoderRemoveOne(handle_unknown='ignore', sparse=False),
         'SimilarityEncoder':
         SimilarityEncoder(ngram_range=self.ngram_range, random_state=10),
         'NgramNaiveFisherKernel':
         NgramNaiveFisherKernel(ngram_range=self.ngram_range,
                                random_state=10),
         'ngrams_hot_vectorizer': [],
         'NgramsCountVectorizer':
         CountVectorizer(analyzer='char', ngram_range=self.ngram_range),
         'NgramsTfIdfVectorizer':
         TfidfVectorizer(analyzer='char',
                         ngram_range=self.ngram_range,
                         smooth_idf=False),
         'WordNgramsTfIdfVectorizer':
         TfidfVectorizer(analyzer='word',
                         ngram_range=(1, 1),
                         smooth_idf=False),
         'TargetEncoder':
         TargetEncoder(clf_type=self.clf_type, handle_unknown='ignore'),
         'MDVEncoder':
         MDVEncoder(self.clf_type),
         'BackwardDifferenceEncoder':
         cat_enc.BackwardDifferenceEncoder(),
         'BinaryEncoder':
         cat_enc.BinaryEncoder(),
         'HashingEncoder':
         cat_enc.HashingEncoder(),
         'HelmertEncoder':
         cat_enc.HelmertEncoder(),
         'SumEncoder':
         cat_enc.SumEncoder(),
         'PolynomialEncoder':
         cat_enc.PolynomialEncoder(),
         'BaseNEncoder':
         cat_enc.BaseNEncoder(),
         'LeaveOneOutEncoder':
         cat_enc.LeaveOneOutEncoder(),
         'NgramsLDA':
         Pipeline([
             ('ngrams_count',
              CountVectorizer(analyzer='char',
                              ngram_range=self.ngram_range)),
             (
                 'LDA',
                 LatentDirichletAllocation(n_components=self.n_components,
                                           learning_method='batch'),
             )
         ]),
         'NMF':
         Pipeline([('ngrams_count',
                    CountVectorizer(analyzer='char',
                                    ngram_range=self.ngram_range)),
                   ('NMF', NMF(n_components=self.n_components))]),
         'WordNMF':
         Pipeline([('ngrams_count',
                    CountVectorizer(analyzer='word', ngram_range=(1, 1))),
                   ('NMF', NMF(n_components=self.n_components))]),
         'NgramsMultinomialMixture':
         NgramsMultinomialMixture(n_topics=self.n_components, max_iters=10),
         'AdHocNgramsMultinomialMixture':
         AdHocNgramsMultinomialMixture(n_iters=0),
         'AdHocIndependentPDF':
         AdHocIndependentPDF(),
         'OnlineGammaPoissonFactorization':
         gamma_poisson_factorization.OnlineGammaPoissonFactorization(
             n_topics=self.n_components,
             rho=.99,
             r=None,
             tol=1e-4,
             random_state=18,
             init='k-means++',
             ngram_range=self.ngram_range,
             rescale_W=True,
             max_iter_e_step=10),
         'OnlineGammaPoissonFactorization2':
         gamma_poisson_factorization.OnlineGammaPoissonFactorization(
             n_topics=self.n_components,
             r=.3,
             rho=None,
             batch_size=256,
             tol=1e-4,
             random_state=18,
             init='k-means++',
             ngram_range=self.ngram_range,
             rescale_W=True,
             max_iter_e_step=20),
         'OnlineGammaPoissonFactorization3':
         gamma_poisson_factorization.OnlineGammaPoissonFactorization(
             n_topics=self.n_components,
             r=.3,
             rho=None,
             batch_size=256,
             tol=1e-4,
             random_state=18,
             init='k-means',
             ngram_range=self.ngram_range,
             rescale_W=True,
             max_iter_e_step=20),
         'OnlineGammaPoissonFactorization4':
         gamma_poisson_factorization.OnlineGammaPoissonFactorization(
             n_topics=self.n_components,
             r=None,
             rho=.95,
             batch_size=256,
             tol=1e-4,
             random_state=18,
             init='k-means',
             ngram_range=self.ngram_range,
             rescale_W=True,
             max_iter_e_step=20),
         'WordOnlineGammaPoissonFactorization':
         gamma_poisson_factorization.OnlineGammaPoissonFactorization(
             n_topics=self.n_components,
             r=.3,
             tol=1e-4,
             random_state=18,
             init='k-means++',
             ngram_range=(1, 1),
             analizer='word',
             rescale_W=True,
             max_iter_e_step=10),
         'OnlineGammaPoissonFactorization_fast':
         gamma_poisson_factorization.OnlineGammaPoissonFactorization(
             n_topics=self.n_components,
             r=.3,
             ngram_range=(3, 3),
             max_iter=1,
             min_iter=1,
             tol=1e-4,
             random_state=18,
             init='k-means++',
             rescale_W=False),
         'MinHashEncoder':
         MinHashEncoder(n_components=self.n_components),
         'PretrainedFastText':
         PretrainedFastText(n_components=self.n_components),
         'PretrainedFastText_fr':
         PretrainedFastText(n_components=self.n_components,
                            language='french'),
         'PretrainedFastText_hu':
         PretrainedFastText(n_components=self.n_components,
                            language='hungarian'),
         None:
         FunctionTransformer(None, validate=True),
         'Passthrough':
         PasstroughEncoder(),
     }
     self.list_1D_array_methods = [
         'NgramsCountVectorizer',
         'NgramsTfIdfVectorizer',
         'WordNgramsTfIdfVectorizer',
         'ngrams_hot_vectorizer',
         'NgramsLDA',
         'NMF',
         'WordNMF',
         'NgramsMultinomialMixture',
         'NgramsMultinomialMixtureKMeans2',
         'AdHocNgramsMultinomialMixture',
         'AdHocIndependentPDF',
         'GammaPoissonFactorization',
         'OnlineGammaPoissonFactorization',
         'WordOnlineGammaPoissonFactorization',
         'OnlineGammaPoissonFactorization2',
         'OnlineGammaPoissonFactorization3',
         'OnlineGammaPoissonFactorization4',
         'OnlineGammaPoissonFactorization_fast',
         'MinHashEncoder',
         'MinMeanMinHashEncoder',
     ]
Example #29
    target = dataset.iloc[:, -1]
else:
    features = dataset
    target = _
"""START: Import encoders"""
import category_encoders as ce
import sys
sys.path.append('../encoders/')
from ceng import CENGEncoder
from pattern_preserving import SimplePPEncoder, AgingPPEncoder, GeneticPPEncoder
from entity_embedding import EntityEmbeddingEncoder
from cesamo import CESAMOEncoder

Encoders = {
    'Ordinal': ce.OrdinalEncoder(),
    'Polynomial': ce.PolynomialEncoder(),
    'OneHot': ce.OneHotEncoder(),
    'BackwardDifference': ce.BackwardDifferenceEncoder(),
    'Helmert': ce.HelmertEncoder(),
    'EntityEmbedding': EntityEmbeddingEncoder(),
    'TargetEnc': ce.TargetEncoder(),
    'WOE': ce.WOEEncoder(),
    'CENG': CENGEncoder(verbose=0),
    'GeneticPP': GeneticPPEncoder(num_predictors=2),
    'AgingPP': AgingPPEncoder(num_predictors=2),
    'SimplePP': SimplePPEncoder(num_predictors=2),
    'CESAMOEncoder': CESAMOEncoder()
}

if target_flag == 0:
    del Encoders['EntityEmbedding']
Example #30
def encode_all(df,dfv,dfk,encoder_to_use,handle_missing='return_nan'):
    
    encoders_used = {}
    
    for col in encoder_to_use:

        if encoder_to_use[col] == 'ColumnDropper':
            df = df.drop(columns = col)
            dfv = dfv.drop(columns = col)
            dfk = dfk.drop(columns = col)
            encoders_used[col] = 'ColumnDropper'    
                
        if encoder_to_use[col]=='BackwardDifferenceEncoder':
            encoder=ce.BackwardDifferenceEncoder(cols=[col],return_df=1,drop_invariant=1,handle_missing=handle_missing)
            encoder.fit(X=df,y=df['set_clicked'])
            df=encoder.transform(df)
            dfv=encoder.transform(dfv)
            dfk=encoder.transform(dfk)
            encoders_used[col]=encoder

        if encoder_to_use[col]=='BaseNEncoder':
            encoder=ce.BaseNEncoder(cols=[col],return_df=1,drop_invariant=1,handle_missing=handle_missing,base=3) 
            encoder.fit(X=df,y=df['set_clicked'])
            df=encoder.transform(df)
            dfv=encoder.transform(dfv)
            dfk=encoder.transform(dfk)
            encoders_used[col]=encoder

        if encoder_to_use[col]=='BinaryEncoder':
            encoder=ce.BinaryEncoder(cols=[col],return_df=1,drop_invariant=1,handle_missing=handle_missing)
            encoder.fit(X=df,y=df['set_clicked'])
            df=encoder.transform(df)
            dfv=encoder.transform(dfv)
            dfk=encoder.transform(dfk)
            encoders_used[col]=encoder

        if encoder_to_use[col]=='CatBoostEncoder':
            encoder=ce.CatBoostEncoder(cols=[col],return_df=1,drop_invariant=1,handle_missing=handle_missing,sigma=None,a=2)
            encoder.fit(X=df,y=df['set_clicked'])
            df=encoder.transform(df)
            dfv=encoder.transform(dfv)
            dfk=encoder.transform(dfk)
            encoders_used[col]=encoder

    #     if encoder_to_use[col]=='HashingEncoder':
    #         encoder=ce.HashingEncoder(cols=[col],return_df=1,drop_invariant=1,handle_missing=handle_missing)
    #         encoder.fit(X=df,y=df['set_clicked'])
    #         df=encoder.transform(df)
    #         encoders_used[col]=encoder

        if encoder_to_use[col]=='HelmertEncoder':
            encoder=ce.HelmertEncoder(cols=[col],return_df=1,drop_invariant=1,handle_missing=handle_missing)
            encoder.fit(X=df,y=df['set_clicked'])
            df=encoder.transform(df)
            dfv=encoder.transform(dfv)
            dfk=encoder.transform(dfk)
            encoders_used[col]=encoder

        if encoder_to_use[col]=='JamesSteinEncoder':
            encoder=ce.JamesSteinEncoder(cols=[col],return_df=1,drop_invariant=1,handle_missing=handle_missing, model='binary')
            encoder.fit(X=df,y=df['set_clicked'])
            df=encoder.transform(df)
            dfv=encoder.transform(dfv)
            dfk=encoder.transform(dfk)
            encoders_used[col]=encoder

        if encoder_to_use[col]=='LeaveOneOutEncoder':
            encoder=ce.LeaveOneOutEncoder(cols=[col],return_df=1,drop_invariant=1,handle_missing=handle_missing,sigma=None)
            encoder.fit(X=df,y=df['set_clicked'])
            df=encoder.transform(df)
            dfv=encoder.transform(dfv)
            dfk=encoder.transform(dfk)
            encoders_used[col]=encoder

        if encoder_to_use[col]=='MEstimateEncoder':
            encoder=ce.MEstimateEncoder(cols=[col],return_df=1,drop_invariant=1,handle_missing=handle_missing,randomized=True,sigma=None,m=2)
            encoder.fit(X=df,y=df['set_clicked'])
            df=encoder.transform(df)
            dfv=encoder.transform(dfv)
            dfk=encoder.transform(dfk)  # was missing; every other branch transforms dfk as well
            encoders_used[col]=encoder

        if encoder_to_use[col]=='OneHotEncoder':
            encoder=ce.OneHotEncoder(cols=[col],return_df=1,drop_invariant=1,handle_missing=handle_missing,use_cat_names=True)
            encoder.fit(X=df,y=df['set_clicked'])
            df=encoder.transform(df)
            dfv=encoder.transform(dfv)
            dfk=encoder.transform(dfk)
            encoders_used[col]=encoder

        if encoder_to_use[col]=='OrdinalEncoder':
            encoder=ce.OrdinalEncoder(cols=[col],return_df=1,drop_invariant=1,handle_missing=handle_missing)
            encoder.fit(X=df,y=df['set_clicked'])
            df=encoder.transform(df)
            dfv=encoder.transform(dfv)
            dfk=encoder.transform(dfk)
            encoders_used[col]=encoder

        if encoder_to_use[col]=='SumEncoder':
            encoder=ce.SumEncoder(cols=[col],return_df=1,drop_invariant=1,handle_missing=handle_missing)
            encoder.fit(X=df,y=df['set_clicked'])
            df=encoder.transform(df)
            dfv=encoder.transform(dfv)
            dfk=encoder.transform(dfk)
            encoders_used[col]=encoder

        if encoder_to_use[col]=='PolynomialEncoder':
            encoder=ce.PolynomialEncoder(cols=[col],return_df=1,drop_invariant=1,handle_missing=handle_missing)
            encoder.fit(X=df,y=df['set_clicked'])
            df=encoder.transform(df)
            dfv=encoder.transform(dfv)
            dfk=encoder.transform(dfk)
            encoders_used[col]=encoder

        if encoder_to_use[col]=='TargetEncoder':
            encoder=ce.TargetEncoder(cols=[col],return_df=1,drop_invariant=1,handle_missing=handle_missing,min_samples_leaf=10, smoothing=5)
            encoder.fit(X=df,y=df['set_clicked'])
            df=encoder.transform(df)
            dfv=encoder.transform(dfv)
            dfk=encoder.transform(dfk)
            encoders_used[col]=encoder


        if encoder_to_use[col]=='WOEEncoder':
            encoder=ce.WOEEncoder(cols=[col],return_df=1,drop_invariant=1,handle_missing=handle_missing,randomized=True,sigma=None)
            encoder.fit(X=df,y=df['set_clicked'])
            df=encoder.transform(df)
            dfv=encoder.transform(dfv)
            dfk=encoder.transform(dfk)
            encoders_used[col]=encoder
            
#         print("Encoding done for - ",col)
    
    print("Completed encoder - ",datetime.datetime.now())
    
    return df, dfv, dfk, encoders_used
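The chain of `if` blocks in `encode_all` repeats identical fit/transform plumbing for every encoder. As a design note, a table-driven version collapses the repetition; this is a sketch under the assumption that the per-encoder keyword arguments are factored into a constructor table (the names `ENCODER_FACTORIES` and `encode_all_compact` are illustrative, not from the original):

import category_encoders as ce

ENCODER_FACTORIES = {
    'BinaryEncoder': lambda col, hm: ce.BinaryEncoder(cols=[col], return_df=1, drop_invariant=1, handle_missing=hm),
    'SumEncoder': lambda col, hm: ce.SumEncoder(cols=[col], return_df=1, drop_invariant=1, handle_missing=hm),
    'PolynomialEncoder': lambda col, hm: ce.PolynomialEncoder(cols=[col], return_df=1, drop_invariant=1, handle_missing=hm),
    # ... the remaining encoders follow the same pattern
}

def encode_all_compact(df, dfv, dfk, encoder_to_use, handle_missing='return_nan'):
    encoders_used = {}
    for col, name in encoder_to_use.items():
        factory = ENCODER_FACTORIES.get(name)
        if factory is None:
            continue  # e.g. 'ColumnDropper' would be handled separately
        encoder = factory(col, handle_missing).fit(X=df, y=df['set_clicked'])
        df, dfv, dfk = (encoder.transform(d) for d in (df, dfv, dfk))
        encoders_used[col] = encoder
    return df, dfv, dfk, encoders_used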