def _fit_js(self, df, y, target, parameter):
    js_encoder = ce.JamesSteinEncoder()
    js_encoder.fit(df[target].map(to_str), df[y])
    name = ['continuous_' + remove_continuous_discrete_prefix(x) + '_js'
            for x in js_encoder.get_feature_names()]
    self.trans_ls.append(('james_stein', name, target, js_encoder))
def test_continuous_target_beta(self):
    X = np.array(['a', 'b', 'b', 'c'])
    y = np.array([-10, 0, 0, 10])
    out = encoders.JamesSteinEncoder(return_df=False, model='beta').fit_transform(X, y)
    self.assertEqual([-2, 0, 0, 2], list(out),
                     'The model assumes normal distribution -> we support real numbers')
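# A minimal sketch of the shrinkage the beta test above exercises: each
# category mean is pulled toward the grand mean by a weight B. The fixed
# B below is illustrative only; it is NOT the exact estimate that
# category_encoders computes internally.
import pandas as pd

X = pd.Series(['a', 'b', 'b', 'c'])
y = pd.Series([-10, 0, 0, 10])

grand_mean = y.mean()                 # 0.0 for this toy data
category_means = y.groupby(X).mean()  # a: -10, b: 0, c: 10
B = 0.8                               # illustrative shrinkage weight
shrunk = (1 - B) * category_means + B * grand_mean
print(X.map(shrunk).tolist())         # [-2.0, 0.0, 0.0, 2.0] -> the test's expectation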
def target_encoder_jamesstein(df, train_df, cols, target):
    ce_jse = ce.JamesSteinEncoder(cols=cols, drop_invariant=True)
    ce_jse.fit(X=train_df[cols], y=train_df[target])
    _df = ce_jse.transform(df[cols])
    # Rename the encoded columns
    for col in cols:
        _df = _df.rename({col: f'{col}_targetenc_ce_jse'}, axis=1)
    return pd.concat([df, _df], axis=1)
def test_zero_variance(self):
    X = np.array(['a', 'b', 'c', 'd', 'd'])
    y = np.array([0, 1, 1, 1, 1])
    out = encoders.JamesSteinEncoder(return_df=False, model='independent').fit_transform(X, y)
    self.assertEqual([0, 1, 1, 1, 1], list(out),
                     'Should not result in division by zero')
def test_ids_large_pooled(self):
    X = np.array(['a', 'b', 'c', 'd', 'e'])
    y = np.array([1, 0, 1, 0, 1])
    out = encoders.JamesSteinEncoder(model='pooled').fit_transform(X, y)
    self.assertTrue(all(np.var(out) == 0),
                    'This is not a standard behaviour of James-Stein estimator. But it helps '
                    'a lot if we treat id-like attributes as non-predictive.')
def test_large_samples_binary(self):
    X = np.array(['a', 'b', 'b', 'c', 'd'])
    y = np.array([1, 0, 1, 0, 0])
    out = encoders.JamesSteinEncoder(return_df=False, model='binary').fit_transform(X, y)
    self.assertNotEqual([1, 0.5, 0.5, 0, 0], list(out),
                        'Shrinkage should kick in with 4 or more unique values')
def fit(self, input_df, y=None):
    # Fit only on rows where the target ('likes') is present
    train_df = input_df[~input_df.likes.isnull()]
    self.cbe = ce.JamesSteinEncoder(cols=self.column, drop_invariant=True)
    self.cbe.fit(train_df[self.column], train_df['likes'])
    return self
def category_encode(dataframe):
    global category_columns
    global category_target
    x = dataframe[category_columns]
    y = dataframe[category_target]
    ce_js = ce.JamesSteinEncoder(cols=category_columns)
    dataframe[category_columns] = ce_js.fit_transform(x, y)
    return dataframe
def test_large_samples_beta(self):
    X = np.array(['a', 'b', 'b', 'c', 'd'])
    y = np.array([1, 0, 1, 0, 0])
    out = encoders.JamesSteinEncoder(return_df=False, model='beta').fit_transform(X, y)
    self.assertNotEqual([1, 0.5, 0.5, 0, 0], list(out),
                        'Shrinkage should kick in with 4 or more unique values')
    self.assertTrue(np.max(out) <= 1, 'This should still be a probability')
    self.assertTrue(np.min(out) >= 0, 'This should still be a probability')
def test_small_samples_independent(self):
    X = np.array(['a', 'b', 'b'])
    y = np.array([1, 0, 1])
    out = encoders.JamesSteinEncoder(return_df=False, model='independent').fit_transform(X, y)
    self.assertEqual([1, 0.5, 0.5], list(out),
                     'When the count of unique values in the column is <4 (here it is 2), '
                     'James-Stein estimator returns (unbiased) sample means')
def encode_df(X, y, cat_features, cat_encoding):
    ENCODERS = {
        'leave_one_out': ce.LeaveOneOutEncoder(cols=cat_features, handle_missing='return_nan'),
        'james_stein': ce.JamesSteinEncoder(cols=cat_features, handle_missing='return_nan'),
        'target': ce.TargetEncoder(cols=cat_features, handle_missing='return_nan'),
    }
    X = ENCODERS[cat_encoding].fit_transform(X, y)
    return X
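# A hedged usage sketch for encode_df above; the DataFrame and column
# names are invented for illustration.
import pandas as pd

df = pd.DataFrame({'city': ['NY', 'LA', 'NY', 'SF'],
                   'price': [1.0, 2.0, 1.5, 3.0]})
encoded = encode_df(df[['city']], df['price'],
                    cat_features=['city'], cat_encoding='james_stein')
print(encoded.head())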
def __init__(self, encoder_type, columns_name=None):
    """
    :param encoder_type: name of the encoder to use
    :param columns_name: list of feature (column) names to encode
    """
    if encoder_type == "BackwardDe":  # backward difference encoding
        self.encoder = ce.BackwardDifferenceEncoder(cols=columns_name)
    elif encoder_type == "BaseN":  # base-N encoding
        self.encoder = ce.BaseNEncoder(cols=columns_name)
    elif encoder_type == "Binary":  # binary encoding
        self.encoder = ce.BinaryEncoder(cols=columns_name)
    elif encoder_type == "Catboost":
        self.encoder = ce.CatBoostEncoder(cols=columns_name)
    elif encoder_type == "Hash":
        self.encoder = ce.HashingEncoder(cols=columns_name)
    elif encoder_type == "Helmert":
        self.encoder = ce.HelmertEncoder(cols=columns_name)
    elif encoder_type == "JamesStein":
        self.encoder = ce.JamesSteinEncoder(cols=columns_name)
    elif encoder_type == "LOO":  # leave-one-out encoding
        self.encoder = ce.LeaveOneOutEncoder(cols=columns_name)
    elif encoder_type == "ME":  # M-estimate encoding
        self.encoder = ce.MEstimateEncoder(cols=columns_name)
    elif encoder_type == "OneHot":
        self.encoder = ce.OneHotEncoder(cols=columns_name)
    elif encoder_type == "OrdinalEncoder":  # ordinal encoding
        self.encoder = ce.OrdinalEncoder(cols=columns_name)
    elif encoder_type == "Sum":  # sum encoding
        self.encoder = ce.SumEncoder(cols=columns_name)
    elif encoder_type == "Polynomial":  # polynomial encoding
        self.encoder = ce.PolynomialEncoder(cols=columns_name)
    elif encoder_type == "Target":  # target encoding
        self.encoder = ce.TargetEncoder(cols=columns_name)
    elif encoder_type == "WOE":  # weight-of-evidence encoding
        self.encoder = ce.WOEEncoder(cols=columns_name)
    else:
        raise ValueError("Please choose a valid encoder type")
def get_encoder_dict():
    encoder_dict = {
        'OneHotEncoder': ce.OneHotEncoder(),
        'BinaryEncoder': ce.BinaryEncoder(),
        'HashingEncoder': ce.HashingEncoder(),
        'LabelEncoder': le.MultiColumnLabelEncoder(),
        'FrequencyEncoder': fe.FrequencyEncoder(),
        'TargetEncoder': ce.TargetEncoder(),
        'HelmertEncoder': ce.HelmertEncoder(),
        'JamesSteinEncoder': ce.JamesSteinEncoder(),
        'BaseNEncoder': ce.BaseNEncoder(),
        'SumEncoder': ce.SumEncoder(),
    }
    return encoder_dict
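# A hedged sketch of how the dictionary above might be consumed, assuming
# the custom le/fe modules it references are importable; the toy data is
# invented for illustration.
import pandas as pd

X = pd.DataFrame({'color': ['red', 'blue', 'red', 'green']})
y = pd.Series([1, 0, 1, 0])
enc = get_encoder_dict()['JamesSteinEncoder']
print(enc.fit_transform(X, y))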
def test_ids_large_pooled(self):
    X = np.array(['a', 'b', 'c', 'd', 'e'])
    y = np.array([1, 0, 1, 0, 1])
    out = encoders.JamesSteinEncoder(model='pooled').fit_transform(X, y)
    self.assertTrue(all(np.var(out) == 0),
                    'This is not a standard behaviour of James-Stein estimator. But it helps '
                    'a lot if we treat id-like attributes as non-predictive.')

####### Beta

def test_continuous_target_beta(self):
    X = np.array(['a', 'b', 'b', 'c'])
    y = np.array([-10, 0, 0, 10])
    out = encoders.JamesSteinEncoder(return_df=False, model='beta').fit_transform(X, y)
    self.assertEqual([-10, 0, 0, 10], list(out),
                     'The model assumes normal distribution -> we support real numbers')

def test_large_samples_beta(self):
    X = np.array(['a', 'b', 'b', 'c', 'd'])
    y = np.array([1, 0, 1, 0, 0])
    out = encoders.JamesSteinEncoder(return_df=False, model='beta').fit_transform(X, y)
    self.assertNotEqual([1, 0.5, 0.5, 0, 0], list(out),
                        'Shrinkage should kick in with 4 or more unique values')
    self.assertTrue(np.max(out) <= 1, 'This should still be a probability')
    self.assertTrue(np.min(out) >= 0, 'This should still be a probability')

def test_ids_small_beta(self):
    X = np.array(['a', 'b', 'c'])
    y = np.array([1, 0, 1])
    out = encoders.JamesSteinEncoder(model='beta').fit_transform(X, y)
    self.assertTrue(all(np.var(out) == 0),
                    'This is not a standard behaviour of James-Stein estimator. But it helps '
                    'a lot if we treat id-like attributes as non-predictive.')

def test_ids_large_beta(self):
    X = np.array(['a', 'b', 'c', 'd', 'e'])
    y = np.array([1, 0, 1, 0, 1])
    out = encoders.JamesSteinEncoder(model='beta').fit_transform(X, y)
    self.assertTrue(all(np.var(out) == 0),
                    'This is not a standard behaviour of James-Stein estimator. But it helps '
                    'a lot if we treat id-like attributes as non-predictive.')
def test_small_samples_binary(self):
    X = np.array(['a', 'b', 'b'])
    y = np.array([1, 0, 1])
    out = encoders.JamesSteinEncoder(return_df=False, model='binary').fit_transform(X, y)
    expected = [np.log((1.5 * 1.5) / (0.5 * 1.5)),
                np.log((0.5 * 1.5) / (1.5 * 1.5)),
                np.log((0.5 * 1.5) / (1.5 * 1.5))]
    self.assertTrue(np.sum(np.abs(expected - np.transpose(out))) < 0.001,
                    'When the count of unique values in the column is <4 (here it is 2), '
                    'the James-Stein estimator applies no shrinkage and returns plain log-odds')
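# Worked arithmetic for the expected values in the test above. The
# expressions appear to be log-odds ratios with 0.5 added to each count
# (category 'a' has 1 positive / 0 negatives; the remaining rows have
# 1 positive / 1 negative):
import numpy as np

pos_a, neg_a = 1, 0        # outcomes observed for category 'a'
pos_rest, neg_rest = 1, 1  # outcomes observed for the remaining rows
value_a = np.log(((pos_a + 0.5) * (neg_rest + 0.5)) /
                 ((neg_a + 0.5) * (pos_rest + 0.5)))
print(value_a)             # log((1.5 * 1.5) / (0.5 * 1.5)) = log(3) ~ 1.0986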
def js_encoding(X_fit, y_fit, cols, X_test=None, model='independent'):
    """
    For continuous targets only.

    X_fit: DataFrame used to fit the encoding; must contain `cols`
    y_fit: target used for the encoding
    X_test: object to transform (defaults to X_fit)
    cols: columns to encode
    model: 'pooled' or 'independent'; 'pooled' assumes all groups share
        the same variance, matching the definition in the CASI book
    """
    if X_test is None:
        X_test = X_fit
    encoder = ce.JamesSteinEncoder(cols=cols, model=model)
    encoder.fit(X_fit, y_fit)
    result = encoder.transform(X_test)
    return result
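# A hedged usage sketch for js_encoding above; the toy sales DataFrame is
# invented for illustration.
import pandas as pd
import category_encoders as ce

train = pd.DataFrame({'shop': ['a', 'b', 'b', 'c'],
                      'sales': [10.0, 20.0, 22.0, 30.0]})
test = pd.DataFrame({'shop': ['a', 'c']})

encoded_test = js_encoding(train[['shop']], train['sales'],
                           cols=['shop'], X_test=test, model='pooled')
print(encoded_test)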
def mean_absolute_percentage_error(y_true, y_pred):
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

df = pd.read_csv("BigDataV5.csv")
y = df['price']
x = df.drop('price', axis=1)
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

encoder = ce.JamesSteinEncoder(cols=['make', 'model', 'city', 'color', 'trans'])
X_train_tran = encoder.fit_transform(X_train, y_train)
# Encode the test set using only the statistics fitted on the training data
X_test_tran = encoder.transform(X_test)

# Create the model
model = ensemble.GradientBoostingRegressor()

# Parameters we want to try
param_grid = {
    'n_estimators': np.arange(1000, 5000, 100),
    'max_depth': np.arange(5, 10),
    'min_samples_leaf': np.arange(3, 20),
    'learning_rate': np.arange(0.01, 0.1, 0.01),
    'max_features': np.arange(0.1, 1, 0.1),
    'loss': ['ls', 'lad', 'huber'],
}
def get_encoder(self) -> BaseEstimator:
    return ce.JamesSteinEncoder(cols=self.target_columns)
            'colic.arff', 'credit.a.arff', 'credit.g.arff', 'cylinder.bands.arff', 'flags.arff',
            'heart.c.arff', 'heart.h.arff', 'hepatitis.arff', 'hypothyroid.arff', 'kr.vs.kp.arff',
            'labor.arff', 'lymph.arff', 'mushroom.arff', 'nursery.arff',
            'postoperative.patient.data.arff', 'primary.tumor.arff', 'sick.arff',
            'solar.flare1.arff', 'solar.flare2.arff', 'soybean.arff', 'spectrometer.arff',
            'sponge.arff', 'tic-tac-toe.arff', 'trains.arff', 'vote.arff', 'vowel.arff']
# datasets = ['carvana.csv', 'erasmus.csv', 'internetusage.csv', 'ipumsla97small.csv',
#             'kobe.csv', 'pbcseq.csv', 'phpvcoG8S.csv', 'westnile.csv']  # amazon is too large...

# We painstakingly initialize each encoder here because that gives us the freedom to initialize the
# encoders with any setting we want.
encoders = [
    # category_encoders.BackwardDifferenceEncoder(),
    category_encoders.BaseNEncoder(),
    category_encoders.BinaryEncoder(),
    category_encoders.HashingEncoder(),
    # category_encoders.HelmertEncoder(),
    category_encoders.JamesSteinEncoder(),
    category_encoders.LeaveOneOutEncoder(),
    category_encoders.MEstimateEncoder(),
    category_encoders.OneHotEncoder(),
    category_encoders.OrdinalEncoder(),
    # category_encoders.PolynomialEncoder(),
    # category_encoders.SumEncoder(),
    category_encoders.TargetEncoder(),
    category_encoders.WOEEncoder()]

encoders = [
    # category_encoders.BackwardDifferenceEncoder(),
    category_encoders.BaseNEncoder(handle_missing='value'),
    category_encoders.BaseNEncoder(handle_missing='indicator'),
    category_encoders.BinaryEncoder(handle_missing='value'),
    category_encoders.BinaryEncoder(handle_missing='indicator'),
    # category_encoders.HashingEncoder(handle_missing='value'),
def encode_all(df, dfv, dfk, encoder_to_use, handle_missing='return_nan'):
    encoders_used = {}
    for col in encoder_to_use:
        if encoder_to_use[col] == 'ColumnDropper':
            df = df.drop(columns=col)
            dfv = dfv.drop(columns=col)
            dfk = dfk.drop(columns=col)
            encoders_used[col] = 'ColumnDropper'

        if encoder_to_use[col] == 'BackwardDifferenceEncoder':
            encoder = ce.BackwardDifferenceEncoder(cols=[col], return_df=1, drop_invariant=1,
                                                   handle_missing=handle_missing)
            encoder.fit(X=df, y=df['set_clicked'])
            df = encoder.transform(df)
            dfv = encoder.transform(dfv)
            dfk = encoder.transform(dfk)
            encoders_used[col] = encoder

        if encoder_to_use[col] == 'BaseNEncoder':
            encoder = ce.BaseNEncoder(cols=[col], return_df=1, drop_invariant=1,
                                      handle_missing=handle_missing, base=3)
            encoder.fit(X=df, y=df['set_clicked'])
            df = encoder.transform(df)
            dfv = encoder.transform(dfv)
            dfk = encoder.transform(dfk)
            encoders_used[col] = encoder

        if encoder_to_use[col] == 'BinaryEncoder':
            encoder = ce.BinaryEncoder(cols=[col], return_df=1, drop_invariant=1,
                                       handle_missing=handle_missing)
            encoder.fit(X=df, y=df['set_clicked'])
            df = encoder.transform(df)
            dfv = encoder.transform(dfv)
            dfk = encoder.transform(dfk)
            encoders_used[col] = encoder

        if encoder_to_use[col] == 'CatBoostEncoder':
            encoder = ce.CatBoostEncoder(cols=[col], return_df=1, drop_invariant=1,
                                         handle_missing=handle_missing, sigma=None, a=2)
            encoder.fit(X=df, y=df['set_clicked'])
            df = encoder.transform(df)
            dfv = encoder.transform(dfv)
            dfk = encoder.transform(dfk)
            encoders_used[col] = encoder

        # if encoder_to_use[col] == 'HashingEncoder':
        #     encoder = ce.HashingEncoder(cols=[col], return_df=1, drop_invariant=1,
        #                                 handle_missing=handle_missing)
        #     encoder.fit(X=df, y=df['set_clicked'])
        #     df = encoder.transform(df)
        #     encoders_used[col] = encoder

        if encoder_to_use[col] == 'HelmertEncoder':
            encoder = ce.HelmertEncoder(cols=[col], return_df=1, drop_invariant=1,
                                        handle_missing=handle_missing)
            encoder.fit(X=df, y=df['set_clicked'])
            df = encoder.transform(df)
            dfv = encoder.transform(dfv)
            dfk = encoder.transform(dfk)
            encoders_used[col] = encoder

        if encoder_to_use[col] == 'JamesSteinEncoder':
            encoder = ce.JamesSteinEncoder(cols=[col], return_df=1, drop_invariant=1,
                                           handle_missing=handle_missing, model='binary')
            encoder.fit(X=df, y=df['set_clicked'])
            df = encoder.transform(df)
            dfv = encoder.transform(dfv)
            dfk = encoder.transform(dfk)
            encoders_used[col] = encoder

        if encoder_to_use[col] == 'LeaveOneOutEncoder':
            encoder = ce.LeaveOneOutEncoder(cols=[col], return_df=1, drop_invariant=1,
                                            handle_missing=handle_missing, sigma=None)
            encoder.fit(X=df, y=df['set_clicked'])
            df = encoder.transform(df)
            dfv = encoder.transform(dfv)
            dfk = encoder.transform(dfk)
            encoders_used[col] = encoder

        if encoder_to_use[col] == 'MEstimateEncoder':
            encoder = ce.MEstimateEncoder(cols=[col], return_df=1, drop_invariant=1,
                                          handle_missing=handle_missing, randomized=True,
                                          sigma=None, m=2)
            encoder.fit(X=df, y=df['set_clicked'])
            df = encoder.transform(df)
            dfv = encoder.transform(dfv)
            dfk = encoder.transform(dfk)  # keep all three frames in sync
            encoders_used[col] = encoder

        if encoder_to_use[col] == 'OneHotEncoder':
            encoder = ce.OneHotEncoder(cols=[col], return_df=1, drop_invariant=1,
                                       handle_missing=handle_missing, use_cat_names=True)
            encoder.fit(X=df, y=df['set_clicked'])
            df = encoder.transform(df)
            dfv = encoder.transform(dfv)
            dfk = encoder.transform(dfk)
            encoders_used[col] = encoder

        if encoder_to_use[col] == 'OrdinalEncoder':
            encoder = ce.OrdinalEncoder(cols=[col], return_df=1, drop_invariant=1,
                                        handle_missing=handle_missing)
            encoder.fit(X=df, y=df['set_clicked'])
            df = encoder.transform(df)
            dfv = encoder.transform(dfv)
            dfk = encoder.transform(dfk)
            encoders_used[col] = encoder

        if encoder_to_use[col] == 'SumEncoder':
            encoder = ce.SumEncoder(cols=[col], return_df=1, drop_invariant=1,
                                    handle_missing=handle_missing)
            encoder.fit(X=df, y=df['set_clicked'])
            df = encoder.transform(df)
            dfv = encoder.transform(dfv)
            dfk = encoder.transform(dfk)
            encoders_used[col] = encoder

        if encoder_to_use[col] == 'PolynomialEncoder':
            encoder = ce.PolynomialEncoder(cols=[col], return_df=1, drop_invariant=1,
                                           handle_missing=handle_missing)
            encoder.fit(X=df, y=df['set_clicked'])
            df = encoder.transform(df)
            dfv = encoder.transform(dfv)
            dfk = encoder.transform(dfk)
            encoders_used[col] = encoder

        if encoder_to_use[col] == 'TargetEncoder':
            encoder = ce.TargetEncoder(cols=[col], return_df=1, drop_invariant=1,
                                       handle_missing=handle_missing,
                                       min_samples_leaf=10, smoothing=5)
            encoder.fit(X=df, y=df['set_clicked'])
            df = encoder.transform(df)
            dfv = encoder.transform(dfv)
            dfk = encoder.transform(dfk)
            encoders_used[col] = encoder

        if encoder_to_use[col] == 'WOEEncoder':
            encoder = ce.WOEEncoder(cols=[col], return_df=1, drop_invariant=1,
                                    handle_missing=handle_missing, randomized=True, sigma=None)
            encoder.fit(X=df, y=df['set_clicked'])
            df = encoder.transform(df)
            dfv = encoder.transform(dfv)
            dfk = encoder.transform(dfk)
            encoders_used[col] = encoder

        # print("Encoding done for - ", col)
    print("Completed encoder - ", datetime.datetime.now())
    return df, dfv, dfk, encoders_used
# tic-tac-toe.arff
# trains.arff

# Medium impact (tiny dataset -> high variance)
datasets = ['audiology.arff', 'autos.arff', 'breast.cancer.arff', 'bridges.version1.arff',
            'bridges.version2.arff', 'car.arff', 'colic.arff', 'credit.a.arff', 'credit.g.arff',
            'cylinder.bands.arff', 'flags.arff', 'heart.c.arff', 'heart.h.arff', 'hepatitis.arff',
            'hypothyroid.arff', 'kr.vs.kp.arff', 'labor.arff', 'lymph.arff', 'mushroom.arff',
            'nursery.arff', 'postoperative.patient.data.arff', 'primary.tumor.arff', 'sick.arff',
            'solar.flare1.arff', 'solar.flare2.arff', 'soybean.arff', 'spectrometer.arff',
            'sponge.arff', 'tic-tac-toe.arff', 'trains.arff', 'vote.arff', 'vowel.arff']

# We painstakingly initialize each encoder here because that gives us the freedom to initialize the
# encoders with any setting we want.
encoders = [category_encoders.BackwardDifferenceEncoder(),
            category_encoders.BaseNEncoder(),
            category_encoders.BinaryEncoder(),
            category_encoders.HashingEncoder(),
            category_encoders.HelmertEncoder(),
            category_encoders.JamesSteinEncoder(),
            category_encoders.LeaveOneOutEncoder(),
            category_encoders.MEstimateEncoder(),
            category_encoders.OneHotEncoder(),
            category_encoders.OrdinalEncoder(),
            category_encoders.PolynomialEncoder(),
            category_encoders.SumEncoder(),
            category_encoders.TargetEncoder(),
            category_encoders.WOEEncoder()]

# Initialization
if os.path.isfile('./output/result.csv'):
    os.remove('./output/result.csv')

# Loop over datasets, then over encoders, and finally, over the models
for dataset_name in datasets:
def get_model(PARAMS):
    """Return a model pipeline for the provided params.

    :param PARAMS: dictionary with model params
    :type PARAMS: dict
    :return: model pipeline
    :rtype: sklearn pipeline
    """
    try:
        te_dict = {
            'CatBoostEncoder': ce.CatBoostEncoder(),
            'HashingEncoder': ce.HashingEncoder(),
            'HelmertEncoder': ce.HelmertEncoder(),
            'LeaveOneOutEncoder': ce.LeaveOneOutEncoder(),
            'OneHotEncoder': ce.OneHotEncoder(),
            'TargetEncoder': ce.TargetEncoder(),
            'WOEEncoder': ce.WOEEncoder(),
            'BackwardDifferenceEncoder': ce.BackwardDifferenceEncoder(),
            'BaseNEncoder': ce.BaseNEncoder(),
            'BinaryEncoder': ce.BinaryEncoder(),
            'CountEncoder': ce.CountEncoder(),
            'JamesSteinEncoder': ce.JamesSteinEncoder(),
            'MEstimateEncoder': ce.MEstimateEncoder(),
            'PolynomialEncoder': ce.PolynomialEncoder(),
            'SumEncoder': ce.SumEncoder(),
        }

        pipe = make_pipeline(
            helpers.PrepareData(extraxt_year=True, unicode_text=True),
            ColumnTransformer([
                ('num', helpers.PassThroughOrReplace(),
                 ['flat_size', 'rooms', 'floor', 'number_of_floors',
                  'year_of_building', 'GC_latitude', 'GC_longitude']),
                ('te_producer', te_dict.get(PARAMS['te_producer']), 'producer_name'),
                ('te_road', te_dict.get(PARAMS['te_road']), 'GC_addr_road'),
                ('te_neighbourhood', te_dict.get(PARAMS['te_neighbourhood']), 'GC_addr_neighbourhood'),
                ('te_suburb', te_dict.get(PARAMS['te_suburb']), 'GC_addr_suburb'),
                ('te_postcode', te_dict.get(PARAMS['te_postcode']), 'GC_addr_postcode'),
                ('txt_name',
                 TfidfVectorizer(lowercase=True,
                                 ngram_range=(1, PARAMS['txt_name__ngram_range']),
                                 max_features=PARAMS['txt_name__max_features'],
                                 dtype=np.float32,
                                 binary=PARAMS['txt_name__binary'],
                                 use_idf=PARAMS['txt_name__use_idf']),
                 'name'),
                ('txt_dscr',
                 TfidfVectorizer(lowercase=True,
                                 ngram_range=(1, PARAMS['txt_dscr__ngram_range']),
                                 max_features=PARAMS['txt_dscr__max_features'],
                                 dtype=np.float32,
                                 binary=PARAMS['txt_dscr__binary'],
                                 use_idf=PARAMS['txt_dscr__use_idf']),
                 'description'),
            ]),
            TransformedTargetRegressor(
                regressor=lgb.LGBMRegressor(**PARAMS, random_state=seed),
                func=np.log1p,
                inverse_func=np.expm1))

        return pipe

    except BaseException as e:
        LOG.error(e)
        return None
            'colic.arff', 'credit.a.arff', 'credit.g.arff', 'cylinder.bands.arff', 'flags.arff',
            'heart.c.arff', 'heart.h.arff', 'hepatitis.arff', 'hypothyroid.arff', 'kr.vs.kp.arff',
            'labor.arff', 'lymph.arff', 'mushroom.arff', 'nursery.arff',
            'postoperative.patient.data.arff', 'primary.tumor.arff', 'sick.arff',
            'solar.flare1.arff', 'solar.flare2.arff', 'soybean.arff', 'spectrometer.arff',
            'sponge.arff', 'tic-tac-toe.arff', 'trains.arff', 'vote.arff', 'vowel.arff']

datasets = ['carvana.csv', 'erasmus.csv', 'internetusage.csv', 'ipumsla97small.csv',
            'kobe.csv', 'pbcseq.csv', 'phpvcoG8S.csv', 'westnile.csv']  # amazon is too large...

# We painstakingly initialize each encoder here because that gives us the freedom to initialize the
# encoders with any setting we want.
encoders = [
    # category_encoders.BackwardDifferenceEncoder(),
    category_encoders.BaseNEncoder(),
    category_encoders.BinaryEncoder(),
    category_encoders.HashingEncoder(),
    # category_encoders.HelmertEncoder(),
    category_encoders.JamesSteinEncoder(),
    category_encoders.LeaveOneOutEncoder(),
    category_encoders.MEstimateEncoder(),
    category_encoders.OneHotEncoder(),
    category_encoders.OrdinalEncoder(),
    # category_encoders.PolynomialEncoder(),
    # category_encoders.SumEncoder(),
    category_encoders.TargetEncoder(),
    category_encoders.WOEEncoder()]

encoders = [category_encoders.TargetEncoder(),
            category_encoders.JamesSteinEncoder(),
            category_encoders.WOEEncoder()]

# Initialization
if os.path.isfile('./output/result.csv'):
    os.remove('./output/result.csv')
def __init__(self, variables=None):
    if not isinstance(variables, list):
        self.variables = [variables]
    else:
        self.variables = variables
    self.encoder = ce.JamesSteinEncoder(cols=self.variables)
def frc_runner(df_eng_all, base_product_number_std, categoricalVars, numerical_vars,
               num_neighbours, validation_test_size, num_iterations, learning_rate, depth,
               feat_importance_keyword='feature_importances_', experiment_label='grocery',
               responseVar='wk1_sales_all_stores', identifierVar='promo_identifier_latex',
               doVisualisation=True, doSaveExcel=True):
    # Create an identifier
    df_eng_all['promo_identifier'] = df_eng_all.base_product_number_std + \
        ' (' + df_eng_all.offer_description + ')'

    # For LaTeX
    # all_bpns = df_eng_all.base_product_number_std.unique().tolist()
    # Every product except the one being forecast
    all_bpns = [*set(df_eng_all.base_product_number_std) - {base_product_number_std}]
    all_bpns_as_ids = ['product_' + str(idx) for idx in range(0, len(all_bpns))]
    dict_bpns = dict(zip(all_bpns, all_bpns_as_ids))
    dict_bpns.update({base_product_number_std: 'new_product'})
    df_eng_all['promo_identifier_latex'] = df_eng_all.base_product_number_std.map(dict_bpns)

    prefix = experiment_label + '_' + base_product_number_std + '_' + \
        dt.datetime.today().strftime('%d_%m_%Y_%HH')

    idx_product = df_eng_all.base_product_number_std == base_product_number_std
    df_test = df_eng_all[idx_product].copy()
    df_test.reset_index(inplace=True)
    print(df_test.promo_identifier.iloc[0])

    # From the product, find the PSGC and exclude the product itself
    this_psgc = df_test.product_sub_group_code.iloc[0]
    idx_psgc = df_eng_all.product_sub_group_code.str.contains(this_psgc)
    df = df_eng_all[idx_psgc & ~idx_product].copy()
    df.reset_index(inplace=True)

    # NUMERICAL: make sure the numericals are encoded as such
    vTypes = pt.infer_variable_type(df)
    # Numerical vars
    # numerical_vars = [*set(inputVars) - set(categoricalVars)]
    # Make sure all of them are coded as numerical ones
    missing_vars = [*set(numerical_vars) - set(vTypes['numerical'])]
    if missing_vars:
        print(f'List of missing vars {missing_vars}')

    # CATEGORICAL: save a copy of the categorical variables, as this encoder overwrites them
    enc_postfix = '_encoded'
    enc_categoricalVars = []
    for varName in categoricalVars:
        currentVarName = varName + enc_postfix
        enc_categoricalVars.append(currentVarName)
        df[currentVarName] = df[varName]
        df_test[currentVarName] = df_test[varName]

    # For quick remapping
    catVarsMapping = dict(zip(enc_categoricalVars, categoricalVars))

    # Get the index of the categorical variables for CatBoost
    inputVars = numerical_vars + enc_categoricalVars

    # JamesSteinEncoder
    inputVars_encoder = inputVars + categoricalVars
    encoder_js = ce.JamesSteinEncoder(cols=enc_categoricalVars, verbose=1)
    # Fit on the training data
    df_A_enc = encoder_js.fit_transform(df[inputVars_encoder], df[responseVar])
    df_A_enc[responseVar] = df[responseVar]
    df_A_enc[identifierVar] = df[identifierVar]
    # Transform the test data
    df_test_enc = encoder_js.transform(df_test[inputVars_encoder])
    df_test_enc[responseVar] = df_test[responseVar]
    df_test_enc[identifierVar] = df_test[identifierVar]

    ''' Train, val and test '''
    num_inputVars = len(inputVars)
    X_train = df_A_enc[inputVars].values
    y_train = df_A_enc[responseVar].values
    id_train = df_A_enc[identifierVar].values
    X_test = df_test_enc[inputVars].values
    y_test = df_test_enc[responseVar].values

    ''' Model.
    Using CatBoost here '''
    # Create the forecaster
    contrastiveReg = contrastiveRegressor(num_neighbours=num_neighbours,
                                          validation_test_size=validation_test_size)

    # CatBoost
    cb_model = CatBoostRegressor(iterations=num_iterations, learning_rate=learning_rate,
                                 depth=depth, loss_function='RMSE',
                                 cat_features=None, silent=True)

    # Set the regressor
    contrastiveReg.set_regressor(cb_model, feat_importance_keyword, inputVars)
    # Fit the regressor
    contrastiveReg.fit(X_train, y_train)
    # Eval results
    contrastiveReg.predict_eval_test()
    eval_results = contrastiveReg.get_results()

    # Predict
    contrastiveReg.predict(X_test, categorical_mapping=catVarsMapping)
    cold_start_results = contrastiveReg.get_results()

    # Sort by importance
    df_feature_importances = cold_start_results.get('df_feat_importances', None)
    print(df_feature_importances)

    # Arrange the results in a DF so we can easily plot them
    df_frc = df_test.copy()
    df_frc['y_hat'] = cold_start_results['y_hat_weighted']

    # Review the cold-start forecast
    all_cold_forecast = []
    all_frc_latex = []
    model_vars = ['y_actual', 'y_forecast',
                  'y_train', 'delta_y_train',
                  'y_train_plus_delta', 'y_train_distances']
    vars_latex = ['y_train', 'delta_y_train',
                  'y_train_plus_delta', 'y_train_distances']

    # Anonymise
    dict_feature_importances = df_feature_importances.to_dict(orient='dict').get(0, None)
    vars_model = numerical_vars + categoricalVars
    inputObfuscated = []
    for idx, iVar in enumerate(vars_model, 1):
        str_feat_weight = f' (vi:{dict_feature_importances.get(iVar, 0):3.2f})'
        inputObfuscated.append('v_' + str(idx) + str_feat_weight)
    mapObfuscatedVars = dict(zip(vars_model, inputObfuscated))
    mapObfuscatedVars[responseVar] = 'response'

    list_vars = [iVar for iVar in df_feature_importances.index.tolist() if 'ref_' not in iVar]

    for idx_review in range(df_test.shape[0]):
        print(f'Running {idx_review}...')
        df_forecast_ext = contrastiveReg.arrange_regressor_results(
            idx_review, df_A_enc, y_train, id_train, list_vars,
            identifierVar, df_test_enc, y_test, num_inputVars)
        df_forecast_ext.reset_index(inplace=True)
        df_forecast_ext['y_train'].iloc[-2] = df_forecast_ext['y_weighted_forecast'].iloc[-2]
        print(df_forecast_ext)
        all_cold_forecast.append(df_forecast_ext)
        y_actual = df_forecast_ext['y_actual'].iloc[-2]
        y_forecast = df_forecast_ext['y_weighted_forecast'].iloc[-2]
        print(f'(actual: {y_actual:3.2f}, forecast: {y_forecast:3.2f})')
        all_frc_latex.append(df_forecast_ext[0:-1])

    # Append them all
    df_all_cold_forecast = pd.concat(all_cold_forecast)
    df_all_latex = pd.concat(all_frc_latex)

    top_n_features = 4
    if top_n_features < num_inputVars:
        list_vars_LaTeX = list_vars[0:top_n_features]
    else:
        list_vars_LaTeX = list_vars

    if doSaveExcel:
        # Create an obfuscated LaTeX version
        df_latex = df_all_latex[[identifierVar] + list_vars_LaTeX + vars_latex].copy()
        # Obfuscate the TSR so we don't get into problems
        tsr_col = 'total_store_revenue'
        if tsr_col in df_latex.columns.tolist():
            df_latex[tsr_col] = df_latex[tsr_col].apply(lambda x: np.log10(x))
        df_latex.rename(columns=mapObfuscatedVars, inplace=True)
        str_latex = fhelp.prepareTableLaTeX(df_latex)
        tex_file_name = _p.join('tex', prefix + '_table.tex')
        fhelp.writeTextFile(str_latex, tex_file_name)

        # Excel
        list_vars.extend(model_vars)
        vars_xls = list_vars + [identifierVar]
        xlsx_file_name = _p.join('results', prefix + '_table.xlsx')
        fhelp.to_excel_file(df_all_cold_forecast[vars_xls], xlsx_file_name)

    ''' Visualise '''
    if doVisualisation:
        varX = list_vars[0]
        varY = list_vars[1]
        varZ = responseVar
        if varX in categoricalVars:
            df[varX] = df[varX].astype(int)
            df_frc[varX] = df_frc[varX].astype(int)
        if varY in categoricalVars:
            df[varY] = df[varY].astype(int)
            df_frc[varY] = df_frc[varY].astype(int)

        _alpha = 0.75
        fig = plt.figure()
        ax = plt.axes(projection='3d')
        # All the subgroup
        ax.scatter(df[varX], df[varY], df[varZ], alpha=0.15)
        # Also plot the test points
        ax.scatter(df_frc[varX], df_frc[varY], df_frc['y_hat'],
                   alpha=1.0, label='cold-forecast', color='red', s=75)
        # Plot the selected products
        for idx_forecast in range(0, len(cold_start_results['y_hat'])):
            idx_closest_promos = cold_start_results['y_idx_closest_promos'][idx_forecast]
            df_A = df.iloc[idx_closest_promos].copy()
            ax.scatter(df_A[varX], df_A[varY], df_A[varZ], alpha=_alpha,
                       label='neighbours_' + str(idx_forecast), s=50)
        for idx, row in df_frc.iterrows():
            # point_name = f'F{(1+idx)} ({row[varX]}, {row[varY]:2.0f}, {row.y_hat:3.2f})'
            point_name = f'Frc_{idx}'
            ax.text(row[varX] + 2.5, row[varY], row['y_hat'], point_name,
                    color='black', fontsize=9)
        ax.set_xlabel(mapObfuscatedVars[varX])
        ax.set_ylabel(mapObfuscatedVars[varY])
        ax.set_zlabel(mapObfuscatedVars[varZ])
        ax.view_init(elev=32, azim=-50)
        ax.legend()
        ax.grid(True)
        plt.tight_layout()
        # Save before showing; a figure shown first is blank when saved
        pfg_file_name = _p.join('figs', prefix + '_plot_3D.png')
        plt.savefig(pfg_file_name)
        plt.show(block=True)

    d = {'eval_results': eval_results,
         'cold_start_results': cold_start_results,
         'contrastiveRegressor': contrastiveReg,
         'X_test': X_test,
         'y_test': y_test,
         'df_train': df,
         'df_test': df_test}
    return d
def generate_candidates(adjusted_cols):
    return [
        ("bsplitz_method", Pipeline([
            ("ordinal_encoder", category_encoders.OrdinalEncoder(cols=adjusted_cols)),
            ("clf", BsplitZClassifier(adjusted_cols, random_state=10, num_samples=100)),
        ])),
        ("target_encoder", Pipeline([
            ("target_encoder", category_encoders.TargetEncoder(cols=adjusted_cols)),
            ("clf", BsplitZClassifier()),
        ])),  # ('clf', DecisionTreeClassifier(max_depth=1))])),
        ("m_encoder", Pipeline([
            ("m_encoder", category_encoders.MEstimateEncoder(cols=adjusted_cols)),
            ("clf", BsplitZClassifier()),
        ])),  # ('clf', DecisionTreeClassifier(max_depth=1))])),
        ("cat_encoder", Pipeline([
            ("cat_encoder", category_encoders.CatBoostEncoder(cols=adjusted_cols)),
            ("clf", BsplitZClassifier()),
        ])),  # ('clf', DecisionTreeClassifier(max_depth=1))])),
        # ('backward_encoder', Pipeline([
        #     ('backward_encoder', category_encoders.BackwardDifferenceEncoder(cols=adjusted_cols)),
        #     ('clf', BsplitZClassifier())])),  # skipped because it is too slow
        ("basen_encoder", Pipeline([
            ("basen_encoder", category_encoders.BaseNEncoder(cols=adjusted_cols)),
            ("clf", BsplitZClassifier()),
        ])),  # ('clf', DecisionTreeClassifier(max_depth=1))])),
        ("binary_encoder", Pipeline([
            ("binary_encoder", category_encoders.BinaryEncoder(cols=adjusted_cols)),
            ("clf", BsplitZClassifier()),
        ])),  # ('clf', DecisionTreeClassifier(max_depth=1))])),
        ("count_encoder", Pipeline([
            ("count_encoder", category_encoders.CountEncoder(cols=adjusted_cols)),
            ("clf", BsplitZClassifier()),
        ])),  # ('clf', DecisionTreeClassifier(max_depth=1))])),
        # ('hashing_encoder', Pipeline([
        #     ('hashing_encoder', category_encoders.HashingEncoder(cols=adjusted_cols)),
        #     ('clf', BsplitZClassifier())])),  # skipped because it is too slow
        # ('woe_encoder', Pipeline([
        #     ('woe_encoder', category_encoders.WOEEncoder(cols=adjusted_cols)),
        #     ('clf', BsplitZClassifier())])),  # skipped: supports binary targets only
        ("jamesstein_encoder", Pipeline([
            ("js_encoder", category_encoders.JamesSteinEncoder(cols=adjusted_cols)),
            ("clf", BsplitZClassifier()),
        ])),  # ('clf', DecisionTreeClassifier(max_depth=1))])),
        # ('helmert_encoder', Pipeline([
        #     ('helmert_encoder', category_encoders.HelmertEncoder(cols=adjusted_cols)),
        #     ('clf', BsplitZClassifier())])),  # skipped because it is too slow
    ]
def encode_categorical_features(dataframe, strategy, list_of_features, list_of_features_to_skip=None):
    """
    Encode the features passed as a list using the selected strategy.
    This function uses the category_encoders library:
    ordinal features are encoded with OrdinalEncoder,
    non-ordinal features with JamesSteinEncoder.

    :param dataframe: dataframe object
    :param strategy: how the categorical features should be encoded;
        available strategies: 'ordinal', 'non_ordinal'
    :param list_of_features: list object containing features to be encoded
        using the selected strategy
    :param list_of_features_to_skip: list object containing features to be
        omitted from encoding
    :return: dataframe with categorical features encoded
    """
    if not isinstance(dataframe, pd.DataFrame):
        raise ValueError("Object passed is not a dataframe")
    if strategy is None:
        raise ValueError("Please select a strategy to use")
    if list_of_features is None:
        raise ValueError("Please pass a list of features to be encoded")
    if not isinstance(list_of_features, list):
        raise ValueError("Object passed is not a list")

    # Split dataframe into features and target variable
    y = dataframe['loan_status']
    x = dataframe.drop("loan_status", axis=1)

    if strategy == 'ordinal':
        # Create an ordinal encoder object
        ordinal_encoder = ce.OrdinalEncoder(cols=list_of_features)
        # Transform the dataframe - returns a df with categorical features encoded
        dataframe = ordinal_encoder.fit_transform(x, y)
        # Merge back the dataframe with the y target variable
        dataframe = dataframe.merge(y, on=y.index)
        # Drop the index feature key_0
        dataframe.drop("key_0", axis=1, inplace=True)
        # Convert the categorical features back to the category data type
        dataframe[list_of_features] = dataframe[list_of_features].astype('category')
    elif strategy == 'non_ordinal':
        # Select all the categorical features in the dataset
        non_ordinal = dataframe.select_dtypes(include='category').columns.tolist()
        # Filter out the ordinal features that were passed in
        non_ordinal_features = [value for value in non_ordinal if value not in list_of_features]
        # Create encoder object
        encoder = ce.JamesSteinEncoder(cols=non_ordinal_features)
        # Transform the dataframe - returns a df with categorical features encoded
        dataframe = encoder.fit_transform(x, y)
        # Merge back the target variable to the dataframe
        dataframe = dataframe.merge(y, on=y.index)
        # Drop the index feature key_0
        dataframe.drop("key_0", axis=1, inplace=True)
        # Convert non_ordinal_features back to the category data type
        dataframe[non_ordinal_features] = dataframe[non_ordinal_features].astype('category')
    return dataframe
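# A hedged usage sketch for encode_categorical_features above; the toy
# loan DataFrame (including the 'loan_status' target that the function
# hard-codes) is invented for illustration.
import pandas as pd

df = pd.DataFrame({
    'grade': pd.Categorical(['A', 'B', 'B', 'C']),  # treated as ordinal
    'home_ownership': pd.Categorical(['RENT', 'OWN', 'RENT', 'MORTGAGE']),
    'loan_status': [0, 1, 0, 1],
})

df_enc = encode_categorical_features(df, strategy='ordinal', list_of_features=['grade'])
print(df_enc.head())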