Example #1
    def _fit_js(self, df, y, target, parameter):
        js_encoder = ce.JamesSteinEncoder()

        js_encoder.fit(df[target].map(to_str), df[y])
        name = ['continuous_' + remove_continuous_discrete_prefix(x) + '_js' for x in
                js_encoder.get_feature_names()]
        # Tag the fitted encoder so it can be replayed at transform time.
        self.trans_ls.append(('james_stein', name, target, js_encoder))
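For context, a sketch of how an entry appended to trans_ls might later be replayed at transform time. This is a hypothetical helper, not shown in the original; to_str is assumed to be a plain string cast.

def apply_trans_ls(trans_ls, df):
    out = df.copy()
    for tag, names, target, encoder in trans_ls:
        # Re-apply each fitted encoder and attach the renamed columns.
        encoded = encoder.transform(df[target].map(str))  # to_str assumed equivalent to str
        encoded.columns = names
        out = out.join(encoded)
    return out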
    def test_continuous_target_beta(self):
        X = np.array(['a', 'b', 'b', 'c'])
        y = np.array([-10, 0, 0, 10])
        out = encoders.JamesSteinEncoder(return_df=False,
                                         model='beta').fit_transform(X, y)
        self.assertEqual(
            [-2, 0, 0, 2], list(out),
            'The model assumes normal distribution -> we support real numbers')
def target_encoder_jamesstein(df, train_df, cols, target):
    ce_jse = ce.JamesSteinEncoder(cols=cols, drop_invariant=True)
    ce_jse.fit(X=train_df[cols], y=train_df[target])
    _df = ce_jse.transform(df[cols])
    # Rename the encoded columns
    for col in cols:
        _df = _df.rename({col: f'{col}_targetenc_ce_jse'}, axis=1)
    return pd.concat([df, _df], axis=1)
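A minimal usage sketch with toy data, assuming the function above and its pandas/category_encoders imports are in scope:

import pandas as pd

train_df = pd.DataFrame({'city': ['a', 'a', 'b', 'b', 'c'],
                         'price': [10, 12, 20, 22, 30]})
test_df = pd.DataFrame({'city': ['a', 'b', 'c']})
# Adds a 'city_targetenc_ce_jse' column next to the original 'city'.
encoded = target_encoder_jamesstein(test_df, train_df, cols=['city'], target='price')
print(encoded)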
    def test_zero_variance(self):
        X = np.array(['a', 'b', 'c', 'd', 'd'])
        y = np.array([0, 1, 1, 1, 1])
        out = encoders.JamesSteinEncoder(return_df=False,
                                         model='independent').fit_transform(
                                             X, y)
        self.assertEqual([0, 1, 1, 1, 1], list(out),
                         'Should not result in division by zero')
    def test_ids_large_pooled(self):
        X = np.array(['a', 'b', 'c', 'd', 'e'])
        y = np.array([1, 0, 1, 0, 1])
        out = encoders.JamesSteinEncoder(model='pooled').fit_transform(X, y)
        self.assertTrue(
            all(np.var(out) == 0),
            'This is not standard behaviour of the James-Stein estimator. But it helps a lot if we treat id-like attributes as non-predictive.'
        )
    def test_large_samples_binary(self):
        X = np.array(['a', 'b', 'b', 'c', 'd'])
        y = np.array([1, 0, 1, 0, 0])
        out = encoders.JamesSteinEncoder(return_df=False,
                                         model='binary').fit_transform(X, y)
        self.assertNotEqual(
            [1, 0.5, 0.5, 0, 0], list(out),
            'Shrinkage should kick in with 4 or more unique values')
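For intuition about these shrinkage tests, here is a hand-rolled sketch of pooled-style James-Stein shrinkage. It is illustrative only (the library's variance estimates differ): each category mean is pulled toward the global mean, and the classic shrinkage factor B = (k − 3)·σ² / Σᵢ(ȳᵢ − ȳ)² is positive only for k ≥ 4 groups, which is why shrinkage "kicks in with 4 or more unique values".

import numpy as np
import pandas as pd

def js_pooled_sketch(x, y):
    groups = pd.Series(y).groupby(pd.Series(x))
    means, counts = groups.mean(), groups.size()
    k, grand = len(means), np.mean(y)
    sigma2 = np.var(y) / counts.mean()   # crude per-group noise estimate (assumption)
    spread = ((means - grand) ** 2).sum()
    # Classic James-Stein factor, clipped to [0, 1]; zero (no shrinkage) for k < 4.
    B = float(np.clip((k - 3) * sigma2 / spread, 0, 1)) if spread > 0 else 1.0
    shrunk = (1 - B) * means + B * grand
    return pd.Series(x).map(shrunk).to_numpy()

print(js_pooled_sketch(['a', 'b', 'b', 'c', 'd'], [1, 0, 1, 0, 0]))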
Example #7
    def fit(self, input_df, y=None):

        train_df = input_df[~(input_df.likes.isnull())]

        self.cbe = ce.JamesSteinEncoder(cols=self.column, drop_invariant=True)
        self.cbe.fit(train_df[self.column], train_df['likes'])

        return self
Example #8
def category_encode(dataframe):
    global category_columns
    global category_target
    x = dataframe[category_columns]
    y = dataframe[category_target]
    ce_ord = ce.JamesSteinEncoder(cols=category_columns)
    dataframe[category_columns] = ce_ord.fit_transform(x, y)
    return dataframe
    def test_large_samples_beta(self):
        X = np.array(['a', 'b', 'b', 'c', 'd'])
        y = np.array([1, 0, 1, 0, 0])
        out = encoders.JamesSteinEncoder(return_df=False,
                                         model='beta').fit_transform(X, y)
        self.assertNotEqual(
            [1, 0.5, 0.5, 0, 0], list(out),
            'Shrinkage should kick in with 4 or more unique values')
        self.assertTrue(np.max(out) <= 1, 'This should still be a probability')
        self.assertTrue(np.min(out) >= 0, 'This should still be a probability')
    def test_small_samples_independent(self):
        X = np.array(['a', 'b', 'b'])
        y = np.array([1, 0, 1])
        out = encoders.JamesSteinEncoder(return_df=False,
                                         model='independent').fit_transform(
                                             X, y)
        self.assertEqual(
            [1, 0.5, 0.5], list(out),
            'When the count of unique values in the column is <4 (here it is 2), James-Stein estimator returns (unbiased) sample means')
Example #11
def encode_df(X, y, cat_features, cat_encoding):
    ENCODERS = {
        'leave_one_out':
        ce.LeaveOneOutEncoder(cols=cat_features, handle_missing='return_nan'),
        'james_stein':
        ce.JamesSteinEncoder(cols=cat_features, handle_missing='return_nan'),
        'target':
        ce.TargetEncoder(cols=cat_features, handle_missing='return_nan')
    }
    X = ENCODERS[cat_encoding].fit_transform(X, y)
    return X
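A usage sketch for the dispatcher above (toy frame; assumes encode_df and its imports are in scope):

import pandas as pd

X = pd.DataFrame({'color': ['red', 'blue', 'red', 'green'],
                  'size': [1, 2, 3, 4]})
y = pd.Series([1, 0, 1, 0])
# 'color' is replaced in place by its James-Stein encoding; 'size' passes through.
print(encode_df(X, y, cat_features=['color'], cat_encoding='james_stein'))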
    def __init__(self, encoder_type, columns_name=None):
        """
        :param encoder_type: name of the encoder to use
        :param columns_name: list of feature (column) names to encode
        """
        if encoder_type == "BackwardDe":  # backward difference encoding
            self.encoder = ce.BackwardDifferenceEncoder(cols=columns_name)

        elif encoder_type == "BaseN":  # BaseN encoding
            self.encoder = ce.BaseNEncoder(cols=columns_name)

        elif encoder_type == "Binary":  # binary encoding
            self.encoder = ce.BinaryEncoder(cols=columns_name)

        elif encoder_type == "Catboost":
            self.encoder = ce.CatBoostEncoder(cols=columns_name)

        elif encoder_type == "Hash":
            self.encoder = ce.HashingEncoder(cols=columns_name)

        elif encoder_type == "Helmert":
            self.encoder = ce.HelmertEncoder(cols=columns_name)

        elif encoder_type == "JamesStein":
            self.encoder = ce.JamesSteinEncoder(cols=columns_name)

        elif encoder_type == "LOO":  # leave-one-out encoding
            self.encoder = ce.LeaveOneOutEncoder(cols=columns_name)

        elif encoder_type == "ME":  # M-estimate encoding
            self.encoder = ce.MEstimateEncoder(cols=columns_name)

        elif encoder_type == "OrdinalEncoder":  # ordinal encoding
            self.encoder = ce.OrdinalEncoder(cols=columns_name)

        elif encoder_type == "Sum":  # sum encoding
            self.encoder = ce.SumEncoder(cols=columns_name)

        elif encoder_type == "Polynomial":  # polynomial encoding
            self.encoder = ce.PolynomialEncoder(cols=columns_name)

        elif encoder_type == "Target":  # target encoding
            self.encoder = ce.TargetEncoder(cols=columns_name)

        elif encoder_type == "WOE":  # weight-of-evidence encoding
            self.encoder = ce.WOEEncoder(cols=columns_name)

        else:
            raise ValueError("Please choose a valid encoder type")
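The if/elif chain can also be written as a lookup table. A compact equivalent sketch (the helper name is hypothetical; behaviour matches the branch logic above):

import category_encoders as ce

_ENCODERS = {
    "BackwardDe": ce.BackwardDifferenceEncoder,
    "BaseN": ce.BaseNEncoder,
    "Binary": ce.BinaryEncoder,
    "Catboost": ce.CatBoostEncoder,
    "Hash": ce.HashingEncoder,
    "Helmert": ce.HelmertEncoder,
    "JamesStein": ce.JamesSteinEncoder,
    "LOO": ce.LeaveOneOutEncoder,
    "ME": ce.MEstimateEncoder,
    "OneHot": ce.OneHotEncoder,
    "OrdinalEncoder": ce.OrdinalEncoder,
    "Sum": ce.SumEncoder,
    "Polynomial": ce.PolynomialEncoder,
    "Target": ce.TargetEncoder,
    "WOE": ce.WOEEncoder,
}

def make_encoder(encoder_type, columns_name=None):
    # Look up the encoder class and instantiate it on the requested columns.
    try:
        return _ENCODERS[encoder_type](cols=columns_name)
    except KeyError:
        raise ValueError("Please choose a valid encoder type") from None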
Example #13
def get_encoder_dict():
    encoder_dict = {
        'OneHotEncoder': ce.OneHotEncoder(),
        'BinaryEncoder': ce.BinaryEncoder(),
        'HashingEncoder': ce.HashingEncoder(),
        'LabelEncoder': le.MultiColumnLabelEncoder(),
        'FrequencyEncoder': fe.FrequencyEncoder(),
        'TargetEncoder': ce.TargetEncoder(),
        'HelmertEncoder': ce.HelmertEncoder(),
        'JamesSteinEncoder': ce.JamesSteinEncoder(),
        'BaseNEncoder': ce.BaseNEncoder(),
        'SumEncoder': ce.SumEncoder(),
    }
    return encoder_dict
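A usage sketch. It assumes the custom le/fe modules imported above are available (building the dict instantiates them); here only the category_encoders entry is exercised:

import pandas as pd

X = pd.DataFrame({'cat': ['a', 'b', 'a', 'c']})
y = pd.Series([1, 0, 1, 0])
enc = get_encoder_dict()['JamesSteinEncoder']
print(enc.fit_transform(X, y))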
Example #14
    def test_ids_large_pooled(self):
        X = np.array(['a', 'b', 'c', 'd', 'e'])
        y = np.array([1, 0, 1, 0, 1])
        out = encoders.JamesSteinEncoder(model='pooled').fit_transform(X, y)
        self.assertTrue(
            all(np.var(out) == 0),
            'This is not standard behaviour of the James-Stein estimator. But it helps a lot if we treat id-like attributes as non-predictive.'
        )

    ####### Beta
    def test_continuous_target_beta(self):
        X = np.array(['a', 'b', 'b', 'c'])
        y = np.array([-10, 0, 0, 10])
        out = encoders.JamesSteinEncoder(return_df=False,
                                         model='beta').fit_transform(X, y)
        self.assertEqual(
            [-10, 0, 0, 10], list(out),
            'The model assumes normal distribution -> we support real numbers')

    def test_large_samples_beta(self):
        X = np.array(['a', 'b', 'b', 'c', 'd'])
        y = np.array([1, 0, 1, 0, 0])
        out = encoders.JamesSteinEncoder(return_df=False,
                                         model='beta').fit_transform(X, y)
        self.assertNotEqual(
            [1, 0.5, 0.5, 0, 0], list(out),
            'Shrinkage should kick in with 4 or more unique values')
        self.assertTrue(
            np.max(out) <= 1, 'This should still be a probability')
        self.assertTrue(
            np.min(out) >= 0, 'This should still be a probability')

    def test_ids_small_beta(self):
        X = np.array(['a', 'b', 'c'])
        y = np.array([1, 0, 1])
        out = encoders.JamesSteinEncoder(model='beta').fit_transform(X, y)
        self.assertTrue(
            all(np.var(out) == 0),
            'This is not standard behaviour of the James-Stein estimator. But it helps a lot if we treat id-like attributes as non-predictive.'
        )

    def test_ids_large_beta(self):
        X = np.array(['a', 'b', 'c', 'd', 'e'])
        y = np.array([1, 0, 1, 0, 1])
        out = encoders.JamesSteinEncoder(model='beta').fit_transform(X, y)
        self.assertTrue(
            all(np.var(out) == 0),
            'This is not standard behaviour of the James-Stein estimator. But it helps a lot if we treat id-like attributes as non-predictive.'
        )
    def test_small_samples_binary(self):
        X = np.array(['a', 'b', 'b'])
        y = np.array([1, 0, 1])
        out = encoders.JamesSteinEncoder(return_df=False,
                                         model='binary').fit_transform(X, y)
        self.assertTrue(
            np.sum(
                np.abs([
                    np.log((1.5 * 1.5) / (0.5 * 1.5)),
                    np.log((0.5 * 1.5) / (1.5 * 1.5)),
                    np.log((0.5 * 1.5) / (1.5 * 1.5))
                ] - np.transpose(out))) < 0.001,
            'When the count of unique values in the column is <4 (here it is 2), James-Stein estimator returns (unbiased) sample means')
Example #16
def js_encoding(X_fit, y_fit, cols, X_test=None, model='independent'):
    """
    For continuous targets only.
    X_fit: df used to compute the encoding; must contain cols
    y_fit: target used for the encoding
    X_test: object to transform
    cols: columns to encode
    model: 'pooled' or 'independent'; 'pooled' assumes all groups share the
        same variance, matching the definition in the CASI book
    """
    if X_test is None:
        X_test = X_fit
    encoder = ce.JamesSteinEncoder(cols=cols, model=model)
    encoder.fit(X_fit, y_fit)
    result = encoder.transform(X_test)
    return result
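A usage sketch with a continuous target (assumes js_encoding above is in scope):

import pandas as pd

X_hist = pd.DataFrame({'shop': ['a', 'a', 'b', 'b']})
y_hist = pd.Series([1.0, 3.0, 10.0, 12.0])
X_new = pd.DataFrame({'shop': ['a', 'b']})
# Each shop is replaced by its shrunken mean of y_hist.
print(js_encoding(X_hist, y_hist, cols=['shop'], X_test=X_new, model='pooled'))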
Example #17
def mean_absolute_percentage_error(y_true, y_pred):
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100


df = pd.read_csv("BigDataV5.csv")

y = df['price']
x = df.drop('price', axis=1)

X_train, X_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=0)

encoder = ce.JamesSteinEncoder(
    cols=['make', 'model', 'city', 'color', 'trans'])

X_train_tran = encoder.fit_transform(X_train, y_train)
X_test_tran = encoder.transform(X_test)  # never pass the test target to transform: that leaks the target

# Create the model
model = ensemble.GradientBoostingRegressor()

# Parameters we want to try
param_grid = {
    'n_estimators': np.arange(1000, 5000, 100),
    'max_depth': np.arange(5, 10),
    'min_samples_leaf': np.arange(3, 20),
    # give arange an explicit step: the default step of 1 would yield a
    # single value for these float ranges
    'learning_rate': np.arange(0.01, 0.1, 0.01),
    'max_features': np.arange(0.1, 1, 0.1),
    'loss': ['ls', 'lad', 'huber']
}
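The snippet ends before the search is actually run. A plausible continuation, as a sketch rather than the original code: the grid is far too large for an exhaustive GridSearchCV, so a randomized search is assumed (and the legacy 'ls'/'lad' loss names assume an older scikit-learn).

from sklearn.model_selection import RandomizedSearchCV

search = RandomizedSearchCV(model, param_grid, n_iter=25, cv=3,
                            scoring='neg_mean_absolute_error', n_jobs=-1,
                            random_state=0)
search.fit(X_train_tran, y_train)
print(search.best_params_)
print(mean_absolute_percentage_error(y_test,
                                     search.best_estimator_.predict(X_test_tran)))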
Example #18
    def get_encoder(self) -> BaseEstimator:
        return ce.JamesSteinEncoder(cols=self.target_columns)
Example #19
datasets = ['audiology.arff', 'autos.arff', 'breast.cancer.arff', 'bridges.version1.arff', 'bridges.version2.arff', 'car.arff',
            'colic.arff', 'credit.a.arff', 'credit.g.arff', 'cylinder.bands.arff', 'flags.arff', 'heart.c.arff', 'heart.h.arff',
            'hepatitis.arff', 'hypothyroid.arff', 'kr.vs.kp.arff', 'labor.arff', 'lymph.arff', 'mushroom.arff', 'nursery.arff',
            'postoperative.patient.data.arff', 'primary.tumor.arff', 'sick.arff', 'solar.flare1.arff', 'solar.flare2.arff',
            'soybean.arff', 'spectrometer.arff', 'sponge.arff', 'tic-tac-toe.arff', 'trains.arff', 'vote.arff', 'vowel.arff']

# datasets = ['carvana.csv', 'erasmus.csv', 'internetusage.csv', 'ipumsla97small.csv', 'kobe.csv', 'pbcseq.csv', 'phpvcoG8S.csv', 'westnile.csv'] # amazon is too large...


# We painstakingly initialize each encoder here because that gives us the freedom to initialize the
# encoders with any setting we want.
encoders = [ #category_encoders.BackwardDifferenceEncoder(),
             category_encoders.BaseNEncoder(),
             category_encoders.BinaryEncoder(),
             category_encoders.HashingEncoder(),
             # category_encoders.HelmertEncoder(),
             category_encoders.JamesSteinEncoder(),
             category_encoders.LeaveOneOutEncoder(),
             category_encoders.MEstimateEncoder(),
             category_encoders.OneHotEncoder(),
             category_encoders.OrdinalEncoder(),
             # category_encoders.PolynomialEncoder(),
             # category_encoders.SumEncoder(),
             category_encoders.TargetEncoder(),
             category_encoders.WOEEncoder()]

encoders = [ #category_encoders.BackwardDifferenceEncoder(),
             category_encoders.BaseNEncoder(handle_missing='value'),
             category_encoders.BaseNEncoder(handle_missing='indicator'),
             category_encoders.BinaryEncoder(handle_missing='value'),
category_encoders.BinaryEncoder(handle_missing='indicator'),
#              category_encoders.HashingEncoder(handle_missing='value'),
Example #20
def encode_all(df,dfv,dfk,encoder_to_use,handle_missing='return_nan'):
    
    encoders_used = {}
    
    for col in encoder_to_use:

        if encoder_to_use[col] == 'ColumnDropper':
            df = df.drop(columns = col)
            dfv = dfv.drop(columns = col)
            dfk = dfk.drop(columns = col)
            encoders_used[col] = 'ColumnDropper'    
                
        if encoder_to_use[col]=='BackwardDifferenceEncoder':
            encoder=ce.BackwardDifferenceEncoder(cols=[col],return_df=1,drop_invariant=1,handle_missing=handle_missing)
            encoder.fit(X=df,y=df['set_clicked'])
            df=encoder.transform(df)
            dfv=encoder.transform(dfv)
            dfk=encoder.transform(dfk)
            encoders_used[col]=encoder

        if encoder_to_use[col]=='BaseNEncoder':
            encoder=ce.BaseNEncoder(cols=[col],return_df=1,drop_invariant=1,handle_missing=handle_missing,base=3) 
            encoder.fit(X=df,y=df['set_clicked'])
            df=encoder.transform(df)
            dfv=encoder.transform(dfv)
            dfk=encoder.transform(dfk)
            encoders_used[col]=encoder

        if encoder_to_use[col]=='BinaryEncoder':
            encoder=ce.BinaryEncoder(cols=[col],return_df=1,drop_invariant=1,handle_missing=handle_missing)
            encoder.fit(X=df,y=df['set_clicked'])
            df=encoder.transform(df)
            dfv=encoder.transform(dfv)
            dfk=encoder.transform(dfk)
            encoders_used[col]=encoder

        if encoder_to_use[col]=='CatBoostEncoder':
            encoder=ce.CatBoostEncoder(cols=[col],return_df=1,drop_invariant=1,handle_missing=handle_missing,sigma=None,a=2)
            encoder.fit(X=df,y=df['set_clicked'])
            df=encoder.transform(df)
            dfv=encoder.transform(dfv)
            dfk=encoder.transform(dfk)
            encoders_used[col]=encoder

    #     if encoder_to_use[col]=='HashingEncoder':
    #         encoder=ce.HashingEncoder(cols=[col],return_df=1,drop_invariant=1,handle_missing=handle_missing)
    #         encoder.fit(X=df,y=df['set_clicked'])
    #         df=encoder.transform(df)
    #         encoders_used[col]=encoder

        if encoder_to_use[col]=='HelmertEncoder':
            encoder=ce.HelmertEncoder(cols=[col],return_df=1,drop_invariant=1,handle_missing=handle_missing)
            encoder.fit(X=df,y=df['set_clicked'])
            df=encoder.transform(df)
            dfv=encoder.transform(dfv)
            dfk=encoder.transform(dfk)
            encoders_used[col]=encoder

        if encoder_to_use[col]=='JamesSteinEncoder':
            encoder=ce.JamesSteinEncoder(cols=[col],return_df=1,drop_invariant=1,handle_missing=handle_missing, model='binary')
            encoder.fit(X=df,y=df['set_clicked'])
            df=encoder.transform(df)
            dfv=encoder.transform(dfv)
            dfk=encoder.transform(dfk)
            encoders_used[col]=encoder

        if encoder_to_use[col]=='LeaveOneOutEncoder':
            encoder=ce.LeaveOneOutEncoder(cols=[col],return_df=1,drop_invariant=1,handle_missing=handle_missing,sigma=None)
            encoder.fit(X=df,y=df['set_clicked'])
            df=encoder.transform(df)
            dfv=encoder.transform(dfv)
            dfk=encoder.transform(dfk)
            encoders_used[col]=encoder

        if encoder_to_use[col]=='MEstimateEncoder':
            encoder=ce.MEstimateEncoder(cols=[col],return_df=1,drop_invariant=1,handle_missing=handle_missing,randomized=True,sigma=None,m=2)
            encoder.fit(X=df,y=df['set_clicked'])
            df=encoder.transform(df)
            dfv=encoder.transform(dfv)
            dfk=encoder.transform(dfk)
            encoders_used[col]=encoder

        if encoder_to_use[col]=='OneHotEncoder':
            encoder=ce.OneHotEncoder(cols=[col],return_df=1,drop_invariant=1,handle_missing=handle_missing,use_cat_names=True)
            encoder.fit(X=df,y=df['set_clicked'])
            df=encoder.transform(df)
            dfv=encoder.transform(dfv)
            dfk=encoder.transform(dfk)
            encoders_used[col]=encoder

        if encoder_to_use[col]=='OrdinalEncoder':
            encoder=ce.OrdinalEncoder(cols=[col],return_df=1,drop_invariant=1,handle_missing=handle_missing)
            encoder.fit(X=df,y=df['set_clicked'])
            df=encoder.transform(df)
            dfv=encoder.transform(dfv)
            dfk=encoder.transform(dfk)
            encoders_used[col]=encoder

        if encoder_to_use[col]=='SumEncoder':
            encoder=ce.SumEncoder(cols=[col],return_df=1,drop_invariant=1,handle_missing=handle_missing)
            encoder.fit(X=df,y=df['set_clicked'])
            df=encoder.transform(df)
            dfv=encoder.transform(dfv)
            dfk=encoder.transform(dfk)
            encoders_used[col]=encoder

        if encoder_to_use[col]=='PolynomialEncoder':
            encoder=ce.PolynomialEncoder(cols=[col],return_df=1,drop_invariant=1,handle_missing=handle_missing)
            encoder.fit(X=df,y=df['set_clicked'])
            df=encoder.transform(df)
            dfv=encoder.transform(dfv)
            dfk=encoder.transform(dfk)
            encoders_used[col]=encoder

        if encoder_to_use[col]=='TargetEncoder':
            encoder=ce.TargetEncoder(cols=[col],return_df=1,drop_invariant=1,handle_missing=handle_missing,min_samples_leaf=10, smoothing=5)
            encoder.fit(X=df,y=df['set_clicked'])
            df=encoder.transform(df)
            dfv=encoder.transform(dfv)
            dfk=encoder.transform(dfk)
            encoders_used[col]=encoder


        if encoder_to_use[col]=='WOEEncoder':
            encoder=ce.WOEEncoder(cols=[col],return_df=1,drop_invariant=1,handle_missing=handle_missing,randomized=True,sigma=None)
            encoder.fit(X=df,y=df['set_clicked'])
            df=encoder.transform(df)
            dfv=encoder.transform(dfv)
            dfk=encoder.transform(dfk)
            encoders_used[col]=encoder
            
#         print("Encoding done for - ",col)
    
    print("Completed encoder - ",datetime.datetime.now())
    
    return df, dfv, dfk, encoders_used
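A usage sketch with toy frames. It assumes encode_all above and its ce/datetime imports are in scope, and that all three frames carry the 'set_clicked' column the encoders are fitted against:

import numpy as np
import pandas as pd

train = pd.DataFrame({'publisher': ['a', 'b', 'a', 'c', 'b'],
                      'set_clicked': [1, 0, 1, 0, 0]})
valid = train.copy()
test = train.copy()
test['set_clicked'] = np.nan  # unknown at prediction time; kept so column sets match

encoder_to_use = {'publisher': 'JamesSteinEncoder'}
train, valid, test, used = encode_all(train, valid, test, encoder_to_use)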
#     tic-tac-toe.arff
#     trains.arff                       Medium impact   (tiny dataset -> with high variance)
datasets = ['audiology.arff', 'autos.arff', 'breast.cancer.arff', 'bridges.version1.arff', 'bridges.version2.arff', 'car.arff',
            'colic.arff', 'credit.a.arff', 'credit.g.arff', 'cylinder.bands.arff', 'flags.arff', 'heart.c.arff', 'heart.h.arff',
            'hepatitis.arff', 'hypothyroid.arff', 'kr.vs.kp.arff', 'labor.arff', 'lymph.arff', 'mushroom.arff', 'nursery.arff',
            'postoperative.patient.data.arff', 'primary.tumor.arff', 'sick.arff', 'solar.flare1.arff', 'solar.flare2.arff',
            'soybean.arff', 'spectrometer.arff', 'sponge.arff', 'tic-tac-toe.arff', 'trains.arff', 'vote.arff', 'vowel.arff']

# We painstakingly initialize each encoder here because that gives us the freedom to initialize the
# encoders with any setting we want.
encoders = [category_encoders.BackwardDifferenceEncoder(),
            category_encoders.BaseNEncoder(),
            category_encoders.BinaryEncoder(),
            category_encoders.HashingEncoder(),
            category_encoders.HelmertEncoder(),
            category_encoders.JamesSteinEncoder(),
            category_encoders.LeaveOneOutEncoder(),
            category_encoders.MEstimateEncoder(),
            category_encoders.OneHotEncoder(),
            category_encoders.OrdinalEncoder(),
            category_encoders.PolynomialEncoder(),
            category_encoders.SumEncoder(),
            category_encoders.TargetEncoder(),
            category_encoders.WOEEncoder()]

# Initialization
if os.path.isfile('./output/result.csv'):
    os.remove('./output/result.csv')

# Loop over datasets, then over encoders, and finally, over the models
for dataset_name in datasets:
Example #22
def get_model(PARAMS):
    """Return a model pipeline for the provided params.

    :param PARAMS: dictionary with model params
    :type PARAMS: dict
    :return: model pipeline
    :rtype: sklearn pipeline
    """

    try:
        te_dict = {
            'CatBoostEncoder': ce.CatBoostEncoder(),
            'HashingEncoder': ce.HashingEncoder(),
            'HelmertEncoder': ce.HelmertEncoder(),
            'LeaveOneOutEncoder': ce.LeaveOneOutEncoder(),
            'OneHotEncoder': ce.OneHotEncoder(),
            'TargetEncoder': ce.TargetEncoder(),
            'WOEEncoder': ce.WOEEncoder(),
            'BackwardDifferenceEncoder': ce.BackwardDifferenceEncoder(),
            'BaseNEncoder': ce.BaseNEncoder(),
            'BinaryEncoder': ce.BinaryEncoder(),
            'CountEncoder': ce.CountEncoder(),
            'JamesSteinEncoder': ce.JamesSteinEncoder(),
            'MEstimateEncoder': ce.MEstimateEncoder(),
            'PolynomialEncoder': ce.PolynomialEncoder(),
            'SumEncoder': ce.SumEncoder()
        }

        pipe = make_pipeline(
            helpers.PrepareData(extraxt_year=True, unicode_text=True),
            ColumnTransformer([
                ('num', helpers.PassThroughOrReplace(), [
                    'flat_size', 'rooms', 'floor', 'number_of_floors',
                    'year_of_building', 'GC_latitude', 'GC_longitude'
                ]),
                ('te_producer', te_dict.get(PARAMS['te_producer']),
                 'producer_name'),
                ('te_road', te_dict.get(PARAMS['te_road']), 'GC_addr_road'),
                ('te_neighbourhood', te_dict.get(PARAMS['te_neighbourhood']),
                 'GC_addr_neighbourhood'),
                ('te_suburb', te_dict.get(PARAMS['te_suburb']),
                 'GC_addr_suburb'),
                ('te_postcode', te_dict.get(PARAMS['te_postcode']),
                 'GC_addr_postcode'),
                ('txt_name',
                 TfidfVectorizer(lowercase=True,
                                 ngram_range=(1,
                                              PARAMS['txt_name__ngram_range']),
                                 max_features=PARAMS['txt_name__max_features'],
                                 dtype=np.float32,
                                 binary=PARAMS['txt_name__binary'],
                                 use_idf=PARAMS['txt_name__use_idf']), 'name'),
                ('txt_dscr',
                 TfidfVectorizer(lowercase=True,
                                 ngram_range=(1,
                                              PARAMS['txt_dscr__ngram_range']),
                                 max_features=PARAMS['txt_dscr__max_features'],
                                 dtype=np.float32,
                                 binary=PARAMS['txt_dscr__binary'],
                                 use_idf=PARAMS['txt_dscr__use_idf']),
                 'description'),
            ]),
            TransformedTargetRegressor(
                regressor=lgb.LGBMRegressor(**PARAMS, random_state=seed),
                func=np.log1p,
                inverse_func=np.expm1))

        return pipe

    except BaseException as e:
        LOG.error(e)
        return None
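A sketch of the PARAMS dictionary the pipeline expects, with keys inferred from the function body; the values are illustrative, not from the original. It assumes the surrounding module's imports (helpers, lgb, seed) are available, and note the function also forwards all of PARAMS to LGBMRegressor.

PARAMS = {
    'te_producer': 'TargetEncoder',
    'te_road': 'JamesSteinEncoder',
    'te_neighbourhood': 'CatBoostEncoder',
    'te_suburb': 'JamesSteinEncoder',
    'te_postcode': 'TargetEncoder',
    'txt_name__ngram_range': 2, 'txt_name__max_features': 500,
    'txt_name__binary': False, 'txt_name__use_idf': True,
    'txt_dscr__ngram_range': 2, 'txt_dscr__max_features': 1000,
    'txt_dscr__binary': False, 'txt_dscr__use_idf': True,
    'n_estimators': 300, 'learning_rate': 0.05,
}
pipe = get_model(PARAMS)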
datasets = ['audiology.arff', 'autos.arff', 'breast.cancer.arff', 'bridges.version1.arff', 'bridges.version2.arff', 'car.arff',
            'colic.arff', 'credit.a.arff', 'credit.g.arff', 'cylinder.bands.arff', 'flags.arff', 'heart.c.arff', 'heart.h.arff',
            'hepatitis.arff', 'hypothyroid.arff', 'kr.vs.kp.arff', 'labor.arff', 'lymph.arff', 'mushroom.arff', 'nursery.arff',
            'postoperative.patient.data.arff', 'primary.tumor.arff', 'sick.arff', 'solar.flare1.arff', 'solar.flare2.arff',
            'soybean.arff', 'spectrometer.arff', 'sponge.arff', 'tic-tac-toe.arff', 'trains.arff', 'vote.arff', 'vowel.arff']

datasets = ['carvana.csv', 'erasmus.csv', 'internetusage.csv', 'ipumsla97small.csv', 'kobe.csv', 'pbcseq.csv', 'phpvcoG8S.csv', 'westnile.csv'] # amazon is too large...


# We painstakingly initialize each encoder here because that gives us the freedom to initialize the
# encoders with any setting we want.
encoders = [ #category_encoders.BackwardDifferenceEncoder(),
             category_encoders.BaseNEncoder(),
             category_encoders.BinaryEncoder(),
             category_encoders.HashingEncoder(),
             # category_encoders.HelmertEncoder(),
             category_encoders.JamesSteinEncoder(),
             category_encoders.LeaveOneOutEncoder(),
             category_encoders.MEstimateEncoder(),
             category_encoders.OneHotEncoder(),
             category_encoders.OrdinalEncoder(),
             # category_encoders.PolynomialEncoder(),
             # category_encoders.SumEncoder(),
             category_encoders.TargetEncoder(),
             category_encoders.WOEEncoder()]

encoders = [category_encoders.TargetEncoder(), category_encoders.JamesSteinEncoder(), category_encoders.WOEEncoder()]

# Initialization
if os.path.isfile('./output/result.csv'):
    os.remove('./output/result.csv')
Example #24
    def __init__(self, variables=None):
        if not isinstance(variables, list):
            self.variables = [variables]
        else:
            self.variables = variables
        self.encoder = ce.JamesSteinEncoder(cols=self.variables)
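Example #24 shows only the constructor. A typical fit/transform pair for such a wrapper could look like the sketch below, under the assumption that the class follows the usual sklearn transformer protocol; the class name is hypothetical, as the original is not shown:

import category_encoders as ce

class JamesSteinWrapper:  # hypothetical name; the original class name is not shown
    def __init__(self, variables=None):
        if not isinstance(variables, list):
            self.variables = [variables]
        else:
            self.variables = variables
        self.encoder = ce.JamesSteinEncoder(cols=self.variables)

    def fit(self, X, y):
        self.encoder.fit(X[self.variables], y)
        return self

    def transform(self, X):
        # Replace the wrapped columns with their encoded values.
        X = X.copy()
        X[self.variables] = self.encoder.transform(X[self.variables])
        return X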
Example #25
def frc_runner(df_eng_all, base_product_number_std,
  categoricalVars, numerical_vars,
  num_neighbours, validation_test_size,
  num_iterations, learning_rate, depth,
  feat_importance_keyword = 'feature_importances_',
  experiment_label = 'grocery',
  responseVar = 'wk1_sales_all_stores',
  identifierVar = 'promo_identifier_latex',
  doVisualisation=True,
  doSaveExcel = True):

  # Create an identifier
  df_eng_all['promo_identifier'] = df_eng_all.base_product_number_std + \
    ' (' + df_eng_all.offer_description + ')'

  # for latex
  #all_bpns = df_eng_all.base_product_number_std.unique().tolist()
  all_bpns = [*set(df_eng_all.base_product_number_std)-set(base_product_number_std)]
  all_bpns_as_ids = ['product_' + str(idx) for idx in range(0, len(all_bpns))]
  dict_bpns = dict(zip(all_bpns, all_bpns_as_ids))
  dict_bpns.update({base_product_number_std: 'new_product'})

  df_eng_all['promo_identifier_latex'] = df_eng_all.base_product_number_std.map(dict_bpns)


  prefix = experiment_label + '_' + base_product_number_std + '_' + dt.datetime.today().strftime('%d_%m_%Y_%HH')

  idx_product = df_eng_all.base_product_number_std == base_product_number_std
  df_test = df_eng_all[idx_product].copy()
  df_test.reset_index(inplace=True)

  print(df_test.promo_identifier.iloc[0])

  # From the product, find the PSGC and exclude the product itself
  this_psgc = df_test.product_sub_group_code.iloc[0]
  idx_psgc = df_eng_all.product_sub_group_code.str.contains(this_psgc)
  df = df_eng_all[idx_psgc & ~ idx_product].copy()
  df.reset_index(inplace=True)

  # NUMERICAL:
  # Make sure the numericals are encoded as such.
  vTypes = pt.infer_variable_type(df)
  # Numerical vars
  #numerical_vars = [*set(inputVars) - set(categoricalVars)]
  # Make sure all of them are coded as numerical ones
  missing_vars = [*set(numerical_vars)-set(vTypes['numerical'])]
  if missing_vars:
    print(f'List of missing vars {missing_vars}')

  # CATEGORICAL: Save a copy of the categorical variables as 
  # this encoder overwrites them
  enc_postfix = '_encoded'
  enc_categoricalVars = []
  for varName in categoricalVars:
          currentVarName = varName + enc_postfix
          enc_categoricalVars.append(currentVarName)
          df[currentVarName] = df[varName]
          df_test[currentVarName] = df_test[varName]

  # For quick remapping
  catVarsMapping = dict(zip(enc_categoricalVars, categoricalVars))

  # get the index of the categorical variables for CatBoost
  inputVars = numerical_vars + enc_categoricalVars



  # JamesSteinEncoder
  inputVars_encoder = inputVars + categoricalVars
  encoder_js  = ce.JamesSteinEncoder(cols=enc_categoricalVars, verbose=1)

  # fit training
  df_A_enc = encoder_js.fit_transform(df[inputVars_encoder], df[responseVar])
  df_A_enc[responseVar] = df[responseVar]
  df_A_enc[identifierVar] = df[identifierVar]
  # fit test
  df_test_enc = encoder_js.transform(df_test[inputVars_encoder])
  df_test_enc[responseVar] = df_test[responseVar]
  df_test_enc[identifierVar] = df_test[identifierVar]


  '''
    Train, val and test
  '''
  num_inputVars = len(inputVars)

  X_train = df_A_enc[inputVars].values
  y_train = df_A_enc[responseVar].values
  id_train = df_A_enc[identifierVar].values

  X_test =  df_test_enc[inputVars].values
  y_test = df_test_enc[responseVar].values


  '''
    Model. Using CatBoost here
  '''
  # Create the forecaster
  contrastiveReg = contrastiveRegressor(num_neighbours = num_neighbours, 
    validation_test_size = validation_test_size)

  # CatBoost
  cb_model = CatBoostRegressor(iterations=num_iterations, learning_rate=learning_rate,
  depth=depth, loss_function='RMSE', cat_features=None, silent=True)
  # Set the regressor
  contrastiveReg.set_regressor(cb_model, feat_importance_keyword, inputVars)
  # fit the regressor
  contrastiveReg.fit(X_train, y_train)
  # eval results
  contrastiveReg.predict_eval_test()
  eval_results = contrastiveReg.get_results()

  # Predict
  contrastiveReg.predict(X_test, categorical_mapping = catVarsMapping)
  cold_start_results = contrastiveReg.get_results()


  # Sort by importance
  df_feature_importances = cold_start_results.get('df_feat_importances', None)
  print(df_feature_importances)


  # Arrange the results in a DF so we can easily plot them
  df_frc = df_test.copy()
  df_frc['y_hat'] = cold_start_results['y_hat_weighted']


  # review the cold-start forecast
  all_cold_forecast = []
  all_frc_latex = []

  model_vars = ['y_actual', 'y_forecast', \
      'y_train', 'delta_y_train', \
      'y_train_plus_delta', 'y_train_distances']

  vars_latex = ['y_train', 'delta_y_train', \
      'y_train_plus_delta', 'y_train_distances']


  # Annonymise
  dict_feature_importances = df_feature_importances.to_dict(orient='dict').get(0, None)
  vars_model = numerical_vars + categoricalVars
  inputObfuscated = []
  for idx, iVar in enumerate(vars_model,1):
    str_feat_weight = f' (vi:{dict_feature_importances.get(iVar, 0):3.2f})'
    inputObfuscated.append('v_' + str(idx) + str_feat_weight)

  mapObfuscatedVars = dict(zip(vars_model, inputObfuscated))
  mapObfuscatedVars[responseVar] = 'response'

  list_vars = [iVar for iVar in df_feature_importances.index.tolist() if 'ref_' not in iVar]

  for idx_review in range(df_test.shape[0]):
    print(f'Running {idx_review}...')
    df_forecast_ext = contrastiveReg.arrange_regressor_results(idx_review, df_A_enc, \
    y_train, id_train, list_vars, \
    identifierVar, df_test_enc, y_test, num_inputVars)
    df_forecast_ext.reset_index(inplace=True)
    
    df_forecast_ext['y_train'].iloc[-2] = \
      df_forecast_ext['y_weighted_forecast'].iloc[-2]
    
    print(df_forecast_ext)
    all_cold_forecast.append(df_forecast_ext)

    
    y_actual = df_forecast_ext['y_actual'].iloc[-2]
    y_forecast = df_forecast_ext['y_weighted_forecast'].iloc[-2]
    print(f'(actual: {y_actual:3.2f}, forecast: {y_forecast:3.2f})')
    all_frc_latex.append(df_forecast_ext[0:-1])

  # Append them all
  df_all_cold_forecast = pd.concat(all_cold_forecast)
  df_all_latex = pd.concat(all_frc_latex)


  top_n_features = 4 
  if top_n_features < num_inputVars:
    list_vars_LaTeX = list_vars[0:top_n_features]
  else:
    list_vars_LaTeX = list_vars

  if doSaveExcel:

      # Created an obfuscated LaTeX version
      df_latex = df_all_latex[[identifierVar] + list_vars_LaTeX + vars_latex].copy()
      # Obfuscate the TSR so we don't get into problems
      tsr_col = 'total_store_revenue' 
      if tsr_col in df_latex.columns.tolist(): 
        df_latex[tsr_col] = df_latex[tsr_col].apply(lambda x: np.log10(x))
        df_latex.rename(columns=mapObfuscatedVars, inplace=True)

      str_latex = fhelp.prepareTableLaTeX(df_latex)
      tex_file_name = _p.join('tex', prefix + '_table.tex')
      fhelp.writeTextFile(str_latex, tex_file_name)

      # Excel
      list_vars.extend(model_vars)
      vars_xls = list_vars + [identifierVar]
      xlsx_file_name = _p.join('results', prefix + '_table.xlsx')
      fhelp.to_excel_file(df_all_cold_forecast[vars_xls], xlsx_file_name)

  '''
    Visualise
  '''
  if doVisualisation:
    varX = list_vars[0]
    varY = list_vars[1]
    varZ = responseVar

    if varX in categoricalVars:
      df[varX] = df[varX].astype(int)
      df_frc[varX] = df_frc[varX].astype(int)
    if varY in categoricalVars:
      df[varY] = df[varY].astype(int)
      df_frc[varY] = df_frc[varY].astype(int)

    _alpha = 0.75
    fig = plt.figure()
    ax = plt.axes(projection='3d')

    # All the subgroup
    ax.scatter(df[varX], df[varY], df[varZ], alpha=0.15)
    # Also plot the test points
    ax.scatter(df_frc[varX], df_frc[varY], df_frc['y_hat'], alpha=1.0, label='cold-forecast', color='red', s=75)
    # plot the selected products

    for idx_forecast in range(0, len(cold_start_results['y_hat'])):
      idx_closest_promos = cold_start_results['y_idx_closest_promos'][idx_forecast]
      df_A = df.iloc[idx_closest_promos].copy()
      ax.scatter(df_A[varX], df_A[varY], df_A[varZ], alpha=_alpha, label='neighbours_' + str(idx_forecast), s=50)


    for idx, row in df_frc.iterrows():
        #point_name = f'F{(1+idx)} ({row[varX]}, {row[varY]:2.0f}, {row.y_hat:3.2f})'
        point_name = f'Frc_{idx}'
        ax.text(row[varX]+2.5,row[varY],row['y_hat'], point_name, color='black', fontsize=9)

    ax.set_xlabel(mapObfuscatedVars[varX])
    ax.set_ylabel(mapObfuscatedVars[varY])
    ax.set_zlabel(mapObfuscatedVars[varZ])

    ax.view_init(elev=32, azim=-50)
    ax.legend()
    ax.grid(True)

    plt.tight_layout()
    pfg_file_name = _p.join('figs', prefix + '_plot_3D.png')
    plt.savefig(pfg_file_name)  # save before show(), or the saved figure may be blank
    plt.show(block=True)


  d = {'eval_results': eval_results, \
    'cold_start_results': cold_start_results, \
    'contrastiveRegressor': contrastiveReg,
    'X_test': X_test, 'y_test': y_test, 
    'df_train': df, 'df_test': df_test}
  return d
Example #26
def generate_candidates(adjusted_cols):
    return [
        (
            "bsplitz_method",
            Pipeline([
                ("cate", category_encoders.OrdinalEncoder(cols=adjusted_cols)),
                (
                    "ordinal_encoder",
                    BsplitZClassifier(adjusted_cols,
                                      random_state=10,
                                      num_samples=100),
                ),
            ]),
        ),
        (
            "target_encoder",
            Pipeline([
                (
                    "target_encoder",
                    category_encoders.TargetEncoder(cols=adjusted_cols),
                ),
                ("clf", BsplitZClassifier()),
            ]),
        ),
        #            ('clf', DecisionTreeClassifier(max_depth=1))])),
        (
            "m_encoder",
            Pipeline([
                (
                    "m_encoder",
                    category_encoders.MEstimateEncoder(cols=adjusted_cols),
                ),
                ("clf", BsplitZClassifier()),
            ]),
        ),
        #        ('clf', DecisionTreeClassifier(max_depth=1))])),
        (
            "cat_encoder",
            Pipeline([
                (
                    "m_encoder",
                    category_encoders.CatBoostEncoder(cols=adjusted_cols),
                ),
                ("clf", BsplitZClassifier()),
            ]),
        ),
        # ('clf', DecisionTreeClassifier(max_depth=1))])),
        # ('backward_encoder', Pipeline([
        #     ('backward_encoder', category_encoders.BackwardDifferenceEncoder(
        #         cols=adjusted_cols)),
        #     ('clf', BsplitZClassifier())])), #skip because of too slow
        (
            "basen_encoder",
            Pipeline([
                (
                    "basen_encoder",
                    category_encoders.BaseNEncoder(cols=adjusted_cols),
                ),
                ("clf", BsplitZClassifier()),
            ]),
        ),
        # ('clf', DecisionTreeClassifier(max_depth=1))])),
        (
            "binary_encoder",
            Pipeline([
                (
                    "basen_encoder",
                    category_encoders.BinaryEncoder(cols=adjusted_cols),
                ),
                ("clf", BsplitZClassifier()),
            ]),
        ),
        # ('clf', DecisionTreeClassifier(max_depth=1))])),
        (
            "count_encoder",
            Pipeline([
                (
                    "basen_encoder",
                    category_encoders.CountEncoder(cols=adjusted_cols),
                ),
                ("clf", BsplitZClassifier()),
            ]),
        ),
        # ('clf', DecisionTreeClassifier(max_depth=1))])),
        # ('hashing_encoder', Pipeline([
        #    ('basen_encoder', category_encoders.HashingEncoder(
        #        cols=adjusted_cols)),
        #    ('clf', BsplitZClassifier())])), #skip because of too slow
        # ('woe_encoder', Pipeline([
        #     ('woe_encoder', category_encoders.WOEEncoder(
        #         cols=adjusted_cols)),
        #     ('clf', BsplitZClassifier())])), #skip because of binary target only
        (
            "jamesstein_encoder",
            Pipeline([
                (
                    "js_encoder",
                    category_encoders.JamesSteinEncoder(cols=adjusted_cols),
                ),
                ("clf", BsplitZClassifier()),
            ]),
        ),
        # ('clf', DecisionTreeClassifier(max_depth=1))])),
        # ('helmert_encoder', Pipeline([
        #    ('helmert_encoder', category_encoders.HelmertEncoder(
        #        cols=adjusted_cols)),
        #    ('clf', BsplitZClassifier())])), #skip because of too slow
    ]
Example #27
def encode_categorical_features(dataframe,
                                strategy,
                                list_of_features,
                                list_of_features_to_skip=None):
    """
    This function will take a dataframe as input and perform the following:

    Encode the features passed as a list using the strategy selected. This
    function uses category_encoder library.

    For ordinal features, the functional will use ordinal_encoder
    For non-ordinal features, the function will use JamesStein Encoder

    :param dataframe: dataframe object
    :param strategy: this is the parameter that holds how the categorical features should be encoded
        strategy types available: 'ordinal', 'non_ordinal'
    :param list_of_features: pass a list object containing features to be encoded using strategy selected
    :param list_of_features_to_skip: pass a list object containing features to be omitted from encoding
    :return: dataframe with categorical features encoded.
    """

    if not isinstance(dataframe, pd.DataFrame):
        raise ValueError("Object passed is not a dataframe")

    if strategy is None:
        raise ValueError("Please select a strategy to use")

    if list_of_features is None:
        raise ValueError("Please pass a list of features to be encoded")

    if not isinstance(list_of_features, list):
        raise ValueError("Object passed is not a list")

    encoder_type = "james_stein"

    # split dataframe into features and target variable
    y = dataframe['loan_status']
    x = dataframe.drop("loan_status", axis=1)

    if strategy == 'ordinal':
        # create an ordinal encoder object
        ordinal_encoder = ce.OrdinalEncoder(cols=list_of_features)
        # transform the dataframe - returns a df with categorical features encoded
        dataframe = ordinal_encoder.fit_transform(x, y)

        # merge back the dataframe with the y target variable
        dataframe = dataframe.merge(y, on=y.index)
        # drop the index feature key_0
        dataframe.drop("key_0", axis=1, inplace=True)

        # convert the categorical features back to the category data type
        dataframe[list_of_features] = dataframe[list_of_features].astype(
            'category')
    elif strategy == 'non_ordinal':
        # select all the features in the dataset
        non_ordinal = dataframe.select_dtypes(
            include='category').columns.tolist()
        # filter out non-ordinal features using ordinal features passed
        non_ordinal_features = [
            value for value in non_ordinal if value not in list_of_features
        ]

        # create encoder object
        encoder = ce.JamesSteinEncoder(cols=non_ordinal_features)

        # transform the dataframe - returns a df with categorical features encoded
        dataframe = encoder.fit_transform(x, y)

        # merge back the target variable to the dataframe(df)
        dataframe = dataframe.merge(y, on=y.index)

        # drop the index feature key_0
        dataframe.drop("key_0", axis=1, inplace=True)

        # convert non_ordinal_features back to category data type
        dataframe[non_ordinal_features] = dataframe[
            non_ordinal_features].astype('category')

    return dataframe
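A usage sketch for the 'ordinal' path (toy frame; assumes the function above and its pandas/category_encoders imports are in scope, and the 'loan_status' target hard-coded in the function):

import pandas as pd

df = pd.DataFrame({'grade': pd.Categorical(['A', 'B', 'A', 'C']),
                   'home_ownership': pd.Categorical(['rent', 'own', 'rent', 'own']),
                   'loan_status': [1, 0, 1, 0]})
# 'grade' is encoded ordinally; 'home_ownership' would be handled by the
# 'non_ordinal' (James-Stein) strategy instead.
encoded = encode_categorical_features(df, strategy='ordinal',
                                      list_of_features=['grade'])
print(encoded.dtypes)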