def _fit_catboost(self, df, y, target, parameter):
    """Fit a CatBoost encoder on the `target` columns (stringified) against
    the label column `y`, and register it in ``self.trans_ls``."""
    encoder = ce.CatBoostEncoder()
    encoder.fit(df[target].map(to_str), df[y])
    # Output names follow the 'continuous_<base>_catboost' convention.
    encoded_names = [
        'continuous_' + remove_continuous_discrete_prefix(feature) + '_catboost'
        for feature in encoder.get_feature_names()
    ]
    self.trans_ls.append(('catboost', encoded_names, target, encoder))
def test_catBoost_reference2(self):
    # The reference is from:
    # https://www.youtube.com/watch?v=hqYQ8Yj9vB0
    # time: 35:03, as obtained on 21 Aug 2019.
    # Note: they have an error at line [smooth 6 4.3 4.1].
    # It should be [smooth 6 4 4.1 3.9]
    X = pd.DataFrame({
        'col1': [
            'fuzzy', 'soft', 'smooth', 'fuzzy', 'smooth', 'soft', 'smooth',
            'smooth'
        ]
    })
    y = pd.Series([4, 1, 4, 3, 6, 0, 7, 5])
    enc = encoders.CatBoostEncoder()
    obtained = enc.fit_transform(X, y)
    prior = 30. / 8
    # Expected ordered target statistics: (running sum + prior) / (count + 1).
    expected = [
        prior, prior, prior, (4 + prior) / 2, (4 + prior) / 2,
        (1 + prior) / 2, (10 + prior) / 3, (17 + prior) / 4
    ]
    # Removed a leftover debug print() that duplicated this list verbatim.
    self.assertEqual(list(obtained['col1']), expected)
def catboost_encoder(X_train, Y_train, X_val, Y_val, target_col: str,
                     cat_features=None, features=None):
    """
    CatBoost target encoding: encodes each categorical column using ordered
    target statistics (only preceding rows contribute).
    https://www.kaggle.com/matleonard/categorical-encodings
    """
    train_df = pd.DataFrame(X_train, columns=features).join(
        pd.DataFrame(Y_train, columns=[target_col]))
    valid_df = pd.DataFrame(X_val, columns=features).join(
        pd.DataFrame(Y_val, columns=[target_col]))

    cb_enc = ce.CatBoostEncoder(cols=cat_features, random_state=7)
    # Fit on the training split only — fitting on validation/test leaks the target.
    cb_enc.fit(train_df[cat_features], train_df[target_col])

    train_encoded = train_df.join(
        cb_enc.transform(train_df[cat_features]).add_suffix("_cb"))
    valid_encoded = valid_df.join(
        cb_enc.transform(valid_df[cat_features]).add_suffix("_cb"))

    feature_names = train_encoded.drop(target_col, axis=1).columns.to_list()
    return (train_encoded.drop(target_col, axis=1),
            valid_encoded.drop(target_col, axis=1),
            feature_names)
def impute_column(data, column, target_col='diabetes_mellitus'):
    # Impute the missing values of `column` by training a LightGBM regressor
    # on the rows where `column` is present, then filling the missing rows
    # with the (fold-averaged) predictions.

    class WidsColumnImputerCVTrainer(CVTrainer):
        # Return 'Score' - MSE between 'ytrue' and 'ypred'
        def score(self, typ: str):
            ypred, ytrue = self.predict(typ).align(self.ds.labeled.y,
                                                   join='right')
            return mean_squared_error(ytrue, ypred)

    # Dividing Data into 'X' and 'y'
    dt = data.drop(columns=[target_col]).copy()
    # Rows with a known value of `column` form the training split.
    dt[f'is_train_{column}'] = (~dt[column].isnull()).astype(int).values
    X = dt.drop(columns=[column])
    y = dt[column].values
    # Defining the 'Categorial' columns (of types 'object' & 'category') & Encoder type (CatBoostEncoder)
    cat_columns = dt.select_dtypes(['object', 'category']).columns.tolist()
    encoder = ce.CatBoostEncoder(cols=cat_columns)
    # NOTE(review): `y` contains NaN for the rows being imputed — confirm the
    # encoder handles a partially-missing target as intended.
    encoder.fit(X, y)
    dt = encoder.transform(X)
    dt[column] = y

    # Data Block & Data Manager
    data_block = WidsDataBlock(dt)
    ds = create_data_manager(data_block=data_block,
                             cv_column='cv_fold',
                             train_split_column=f'is_train_{column}',
                             label_columns=[column],
                             cv_object=KFold(n_splits=5,
                                             shuffle=True,
                                             random_state=42))

    # Defining Lgbm (light Gradient Boosting Machine) - Params & Trainer Model
    lgbm_params = dict(init_params=dict(metric='rmse',
                                        max_depth=-1,
                                        num_leaves=31,
                                        min_data_per_group=10,
                                        learning_rate=0.1),
                       fit_params=dict(verbose=-1,
                                       early_stopping_rounds=100))
    lgbm_trainer = WidsColumnImputerCVTrainer(
        fold_trainer_cls=LightGBMRegressorFoldTrainer,
        ds=ds,
        model_name=f'{column}_impute',
        params=lgbm_params,
        save_path=config.outputs_path)
    lgbm_trainer.fit()
    # Average predictions per original index, then fill: known values keep
    # their own value (prediction column is NaN there → fillna(0)+value).
    dt = (data.join(
        lgbm_trainer.predict('tst').iloc[:, 0].to_frame(f'tmp_{column}').
        groupby(level=0).mean()).assign(
            **{
                column: lambda dx: dx[column].fillna(0) + dx[f'tmp_{column}'].fillna(0)
            }).drop(columns=[f'tmp_{column}']))
    return dt
def get_catboost_encoder(df, cols, target):
    """CatBoost-encode `cols` against `target`, appending one encoded column
    per input column to `df` and returning it.

    The original assigned the whole encoded DataFrame to a single column
    named with the list repr, which raises ValueError whenever `cols` has
    more than one entry; one output column per input column is added instead.
    """
    encoded = ce.CatBoostEncoder(cols=cols,
                                 random_state=42).fit_transform(X=df[cols],
                                                                y=df[target])
    for col in cols:
        df[f'catboost_encode_{col}_{target}'] = encoded[col]
    return df
def kFold_encoder_features(train, test, features, seed_seed=2019):
    # Out-of-fold CatBoost encoding: `test` is encoded with an encoder fitted
    # on all of `train`; each train fold is encoded with an encoder fitted on
    # the other folds, so no row sees its own target.
    print("k-fold ce encoder ...")
    train = train.copy()
    test = test.copy()
    kfold = StratifiedKFold(n_splits=5, random_state=seed_seed, shuffle=True)
    encoder = ce.CatBoostEncoder(cols=features)
    # category_encoders fills NaN with the label mean by default.
    train[features] = train[features].astype(str)
    test[features] = test[features].astype(str)
    # test: single encoder fitted on the full training set.
    encoder.fit(train, train['label'])
    test = encoder.transform(test)
    test[features] = test[features].astype(np.float32)
    # train: refit per fold and encode only the held-out rows.
    feat_encoder = train.copy()
    for n_fold, (train_idx, valid_idx) in enumerate(
            kfold.split(train, train['label'])):
        print('processing fold: ', n_fold + 1)
        encoder.fit(train.loc[train_idx], train.loc[train_idx, 'label'])
        v_df = encoder.transform(train.loc[valid_idx])  # df
        feat_encoder.loc[valid_idx, features] = v_df[features].values
    train[features] = feat_encoder[features].astype(np.float32)
    print('ce encoder done.')
    # Stacked result: encoded train rows followed by encoded test rows.
    return pd.concat([train, test], ignore_index=True)
def dataset_numerical_train():
    """Build ./csv/preprocess/numerical_train.csv: CatBoost-encode the
    categorical columns of dataset.csv against 'label' and numerify FIELD_7."""
    train = pd.read_csv('./csv/dataset.csv', low_memory=False)
    # Categorical columns to target-encode.
    cat_features = [
        'province', 'district', 'maCv', 'FIELD_8', 'FIELD_9', 'FIELD_10',
        'FIELD_12', 'FIELD_13', 'FIELD_17', 'FIELD_18', 'FIELD_19',
        'FIELD_20', 'FIELD_22', 'FIELD_23', 'FIELD_24', 'FIELD_25',
        'FIELD_26', 'FIELD_27', 'FIELD_28', 'FIELD_29', 'FIELD_30',
        'FIELD_31', 'FIELD_35', 'FIELD_36', 'FIELD_37', 'FIELD_38',
        'FIELD_39', 'FIELD_40', 'FIELD_41', 'FIELD_42', 'FIELD_43',
        'FIELD_44', 'FIELD_47', 'FIELD_48', 'FIELD_49'
    ]
    # Raw categorical columns plus the id are dropped once encoded copies exist.
    cat_features_remove = cat_features + ['id']
    target_enc = ce.CatBoostEncoder(cols=cat_features)
    target_enc.fit(train[cat_features], train['label'])
    # Encoded columns are appended with a '_process' suffix.
    train = train.join(
        target_enc.transform(train[cat_features]).add_suffix('_process'))
    train = train.drop(columns=cat_features_remove)
    # FIELD_7 holds a stringified list; NaN (x != x) becomes '[]' and the
    # feature becomes the parsed list's length.
    f7_array = train['FIELD_7'].apply(lambda x: '[]'
                                     if x != x else x).apply(literal_eval)
    train['FIELD_7'] = f7_array.apply(len)
    # Literal 'None' strings become real NaN.
    train = train.replace(to_replace='None', value=np.nan)
    train.to_csv('./csv/preprocess/numerical_train.csv')
    return True
def fit(X, y, output_dir, **kwargs): """ This hook defines how DataRobot will train this task. Even transform tasks need to be trained to learn/store information from training data DataRobot runs this hook when the task is being trained inside a blueprint. As an output, this hook is expected to create an artifact containg a trained object [in this example - a pre-fit CatBoost encoder], that is then used to transform new data. The input parameters are passed by DataRobot based on project and blueprint configuration. Parameters ------- X: pd.DataFrame Training data that DataRobot passes when this task is being trained. y: pd.Series Project's target column (None is passed for unsupervised projects). output_dir: str A path to the output folder; the artifact [a pickle file containing a pre-fit CatBoost Encoder] must be saved into this folder to be re-used in transform(). Returns ------- None fit() doesn't return anything, but must output an artifact (typically containing a trained object) into output_dir so that the trained object can be used during scoring inside transform() """ # Transform categorical columns into a numeric transformation using Weight of Evidence encoder_catboost = ce.CatBoostEncoder(cols=X.columns) encoder_catboost.fit(X, y) # dump the trained object # into an artifact [in this example - woe.pkl] # and save it into output_dir so that it can be used later to impute on new data output_dir_path = Path(output_dir) if output_dir_path.exists() and output_dir_path.is_dir(): with open("{}/catboost.pkl".format(output_dir), "wb") as fp: pickle.dump(encoder_catboost, fp)
def Train_test_random_forest(hdf, ord_feat_num, ord_feat_cat, nom_feat, cont_feat):
    """Split `hdf` into encoded train/test sets for a random-forest model.

    Ordinal features are ordinal-encoded, nominal features CatBoost-encoded
    (both fitted on the training rows only); the target is log1p(SalePrice).
    Returns (X_train, y_train, X_test).
    """
    ord_feat = ord_feat_cat.union(ord_feat_num)
    X_train = hdf.loc[~hdf['SalePrice'].isnull(), :].copy()
    # Drop outlier rows FIRST, then derive the target from the surviving
    # rows. The original computed y_train before these drops, leaving
    # X_train and y_train with different lengths and misaligned rows.
    X_train.drop((X_train.loc[X_train['GrLivArea'] > 4400, :]).index,
                 inplace=True)
    X_train.drop((X_train.loc[X_train['LotArea'] > 100000, :]).index,
                 inplace=True)
    X_train.drop((X_train.loc[X_train['LotFrontage'] > 250, :]).index,
                 inplace=True)
    y_train = np.log1p(X_train['SalePrice'])
    X_train.drop(columns=['SalePrice'], inplace=True)
    X_test = hdf.loc[hdf['SalePrice'].isnull(), :].drop(columns=['SalePrice'])

    ord_enc = ce.OrdinalEncoder(cols=ord_feat).fit(X_train, y_train)
    X_train = ord_enc.transform(X_train)
    X_test = ord_enc.transform(X_test)

    # Shuffle train rows (X and y with the same permutation).
    perm = np.random.permutation(len(X_train))
    X_train = X_train.iloc[perm].reset_index(drop=True)
    y_train = y_train.iloc[perm].reset_index(drop=True)

    nom_enc = ce.CatBoostEncoder(cols=nom_feat).fit(X_train, y_train)
    X_train = nom_enc.transform(X_train)
    X_test = nom_enc.transform(X_test)
    X_train.drop(columns=['Id'], inplace=True)
    return X_train, y_train, X_test
def catboost_encoder(train_df, valid_df, target_col: str, cat_features=None):
    """
    CatBoost target encoding: converts categorical columns using ordered
    target statistics (only preceding rows' targets contribute).
    https://www.kaggle.com/matleonard/categorical-encodings
    """
    # conda install -c conda-forge category_encoders
    import category_encoders as ce

    if cat_features is None:
        cat_features = train_df.select_dtypes(
            include=["object", "category", "bool"]).columns.to_list()

    encoder = ce.CatBoostEncoder(cols=cat_features, random_state=7)
    # Fit on the training frame only; fitting on validation/test leaks the target.
    encoder.fit(train_df[cat_features], train_df[target_col])

    encoded = [
        frame.join(encoder.transform(frame[cat_features]).add_suffix("_cb"))
        for frame in (train_df, valid_df)
    ]
    return encoded[0], encoded[1]
def catboost_enc(df_norm):
    """CatBoost-encode the categorical features against 'precio', append the
    encoded columns with a '_cb' suffix, drop the raw categorical columns,
    and return (encoded frame, fitted encoder)."""
    encoder = ce.CatBoostEncoder(cols=cat_features)
    encoder.fit(df_norm[cat_features], df_norm['precio'])
    encoded = df_norm.join(
        encoder.transform(df_norm[cat_features]).add_suffix('_cb'))
    encoded = encoded.drop(['tipodepropiedad', 'provincia', 'ciudad'], axis=1)
    return encoded, encoder
def target_encoder_catboost(df, train_df, cols, target):
    """CatBoost-encode `cols` of `df` (encoder fitted on `train_df` only) and
    append the encoded columns with a '_targetenc_ce_cbe' suffix."""
    encoder = ce.CatBoostEncoder(cols=cols, random_state=42)
    encoder.fit(X=train_df[cols], y=train_df[target])
    # Rename all encoded columns in one pass instead of per-column renames.
    encoded = encoder.transform(df[cols]).rename(
        columns={col: f'{col}_targetenc_ce_cbe' for col in cols})
    return pd.concat([df, encoded], axis=1)
def preprocess(self, df, train=True):
    '''
    Process data for training model
    :param df: pandas Dataframe
           train: boolean
    :return processed pandas dataframe and pd.Series with target
    '''
    print('Creating dataframe for data manipulation')
    # Per-column summary: name, missing percentage and dtype.
    cons = pd.DataFrame({
        'column': df.columns,
        'missing_perc': (df.isna().sum() / df.shape[0]) * 100,
        'dtype': df.dtypes
    })
    print('Droping columns with missing values')
    # Keep only fully-populated columns.
    cons = cons[cons['missing_perc'] == 0]
    print('Dropping column with id')
    cons = cons[cons['column'] != 'Id']
    print('Creating list with numeric features')
    numeric_features = list(
        cons[(cons['dtype'] == 'int64') | (cons['dtype'] == 'float')
             | (cons['dtype'] == 'bool')]['column'])
    print('Creating list with categorical features')
    categoric_features = list(cons[(cons['dtype'] == 'object')]['column'])
    self.categoric_features = categoric_features
    print('removing target')
    # The target 'y' exists only at train time and must not be a feature.
    if train == True:
        numeric_features.remove('y')
    else:
        pass
    print(self.categoric_features)
    print('feature encoder')
    print('feature normalization and encoding')
    std_scaler = StandardScaler()
    if train == True:
        y = df['y']
        df = df.drop(columns={'y'})
        # Persist the fitted artefacts for later inference-time transforms.
        self.numeric_features = numeric_features
        self.categoric_features = categoric_features
        self.feature_names = self.numeric_features + self.categoric_features
        self.scaler = std_scaler
        self.catb = ce.CatBoostEncoder(cols=self.categoric_features)
        df[self.numeric_features] = self.scaler.fit_transform(
            df[self.numeric_features])
        df[self.categoric_features] = self.catb.fit_transform(
            df[self.categoric_features], y=y)
        self.train_features = self.numeric_features + self.categoric_features
        return df[self.categoric_features + self.numeric_features], y
    else:
        # Inference path: reuse the scaler/encoder fitted during training.
        df[self.numeric_features] = self.scaler.transform(
            df[self.numeric_features])
        df[self.categoric_features] = self.catb.transform(
            df[self.categoric_features])
        # NOTE(review): fills NaN with the current frame's own column means —
        # confirm this is intended rather than the training-set means.
        for column in df[self.categoric_features +
                         self.numeric_features].columns:
            df[column] = df[column].fillna(df[column].mean())
        return df[self.categoric_features + self.numeric_features]
def catboost_multiple(df, cols):
    """CatBoost-encode `cols` of `df` against the 'set_clicked' target and
    return the transformed frame."""
    # BUG FIX: CatBoostEncoder's first positional parameter is `verbose`,
    # not `cols`. The original passed the column list positionally, so it
    # was consumed as the verbosity flag and the column restriction was
    # silently ignored — pass it by keyword.
    encoder = ce.CatBoostEncoder(cols=cols,
                                 return_df=1,
                                 drop_invariant=1,
                                 handle_missing='return_nan',
                                 sigma=None,
                                 a=2)
    encoder.fit(X=df, y=df['set_clicked'])
    df = encoder.transform(df)
    return df
def test_catBoost(self):
    # Ordered target statistics on a tiny nominal column, computed by hand.
    X = pd.DataFrame({'col1': ['A', 'B', 'B', 'C', 'A']})
    y = pd.Series([1, 0, 1, 0, 1])
    enc = encoders.CatBoostEncoder()
    result = enc.fit_transform(X, y)
    expected_train = [0.6, 0.6, 0.6/2, 0.6, 1.6/2]
    self.assertEqual(list(result['col1']), expected_train,
                     'The nominator is incremented by the prior. The denominator by 1.')
    # Unseen data is scored with statistics from the full training set.
    result = enc.transform(pd.DataFrame({'col1': ['B', 'B', 'A']}))
    self.assertEqual(list(result['col1']), [1.6/3, 1.6/3, 2.6/3])
def test_catBoost_missing(self):
    # With handle_missing='value', None rows form their own category whose
    # running statistics evolve like any other level.
    X = pd.DataFrame({'col1': ['A', 'B', 'B', 'C', 'A', None, None, None]})
    y = pd.Series([1, 0, 1, 0, 1, 0, 1, 0])
    enc = encoders.CatBoostEncoder(handle_missing='value')
    result = enc.fit_transform(X, y)
    expected_train = [0.5, 0.5, 0.5/2, 0.5, 1.5/2, 0.5, 0.5/2, 1.5/3]
    self.assertEqual(list(result['col1']), expected_train,
                     'We treat None as another category.')
    result = enc.transform(pd.DataFrame({'col1': ['B', 'B', 'A', None]}))
    self.assertEqual(list(result['col1']), [1.5/3, 1.5/3, 2.5/3, 1.5/4])
def categoryEncoder(dt, col):
    """Append a CatBoost target-encoded copy of `col` to `dt` as '<col>_ctr'."""
    cache = dt[[col, 'target']]
    encoder = ce.CatBoostEncoder(cols=col)
    # Fit only on rows whose target is known (the labelled/train portion).
    labelled = cache[cache['target'].isna() == False]
    encoder.fit(labelled, labelled['target'])
    encoded = encoder.transform(cache)
    dt[col + '_ctr'] = encoded[col]
    del labelled, encoded
    gc.collect()
    return dt
def process(self, df, etapa_treino=True):
    '''
    Process data for training the model.
    :param df: Pandas DataFrame
    :param etapa_treino: Boolean
    :return: processed Pandas Data Frame
    '''
    print('Creating DataFrame for Data Manipulation')
    # Per-column summary: name, missing percentage and dtype.
    cons = pd.DataFrame({
        'column': df.columns,
        'missing_perc': (df.isna().sum() / df.shape[0]) * 100,
        'dtype': df.dtypes
    })
    print('Droping columns with missing values')
    # Keep only fully-populated columns.
    cons = cons[cons['missing_perc'] == 0]
    print('Dropping column with Id')
    cons = cons[cons['column'] != 'Id']
    print('Creating list with numeric features')
    numeric_features = list(cons[(cons['dtype'] == 'int64')
                                 | (cons['dtype'] == 'float')]['column'])
    print('Creating list with categoric features')
    categoric_features = list(cons[(cons['dtype'] == 'object')]['column'])
    print('Removing target')
    # The target 'SalePrice' exists only at train time and must not be a feature.
    if etapa_treino == True:
        numeric_features.remove('SalePrice')
    else:
        pass
    print('Feature encoder')
    print('Feature Normalization and Encoding')
    std_scaler = StandardScaler()
    if etapa_treino == True:
        y = df['SalePrice']
        df = df.drop(columns={'SalePrice'})
        # Persist fitted artefacts so the inference path can reuse them.
        self.numeric_features = numeric_features
        self.categoric_features = categoric_features
        self.feature_names = self.numeric_features + self.categoric_features
        self.scaler = std_scaler
        self.catb = ce.CatBoostEncoder(cols=self.categoric_features)
        df[self.numeric_features] = self.scaler.fit_transform(
            df[self.numeric_features])
        df[self.categoric_features] = self.catb.fit_transform(
            df[self.categoric_features], y=y)
        self.train_features = self.numeric_features + self.categoric_features
        return df[self.categoric_features + self.numeric_features], y
    else:
        # Inference path: reuse the scaler/encoder fitted during training.
        df[self.numeric_features] = self.scaler.transform(
            df[self.numeric_features])
        df[self.categoric_features] = self.catb.transform(
            df[self.categoric_features])
        # NOTE(review): fills NaN with the current frame's own column means —
        # confirm this is intended rather than the training-set means.
        for column in df[self.categoric_features +
                         self.numeric_features].columns:
            df[column] = df[column].fillna(df[column].mean())
        return df[self.categoric_features + self.numeric_features]
def __init__(self, encoder_type, columns_name=None):
    """
    :param encoder_type: name of the category_encoders encoder to build
    :param columns_name: list of feature (column) names to encode
    """
    # Dispatch table replacing the original if/elif chain; behavior is
    # identical, including the error raised for unknown names.
    encoder_classes = {
        "BackwardDe": ce.BackwardDifferenceEncoder,  # backward difference encoding
        "BaseN": ce.BaseNEncoder,                    # base-N encoding
        "Binary": ce.BinaryEncoder,                  # binary encoding
        "Catboost": ce.CatBoostEncoder,
        "Hash": ce.HashingEncoder,
        "Helmert": ce.HelmertEncoder,
        "JamesStein": ce.JamesSteinEncoder,
        "LOO": ce.LeaveOneOutEncoder,                # leave-one-out encoding
        "ME": ce.MEstimateEncoder,                   # M-estimate encoder
        "OneHot": ce.OneHotEncoder,
        "OridinalEncoder": ce.OrdinalEncoder,        # ordinal encoding
        "Sum": ce.SumEncoder,                        # sum encoding
        "Polynomial": ce.PolynomialEncoder,          # polynomial encoding
        "Target": ce.TargetEncoder,                  # target encoding
        "WOE": ce.WOEEncoder,                        # weight-of-evidence encoder
    }
    if encoder_type not in encoder_classes:
        raise ValueError("请选择正确的编码方式")
    self.encoder = encoder_classes[encoder_type](cols=columns_name)
def preprocessing(train, test): cat_features = [ 'province', 'district', 'maCv', 'FIELD_8', 'FIELD_9', 'FIELD_10', 'FIELD_12', 'FIELD_13', 'FIELD_17', 'FIELD_18', 'FIELD_19', 'FIELD_20', 'FIELD_22', 'FIELD_23', 'FIELD_24', 'FIELD_25', 'FIELD_26', 'FIELD_27', 'FIELD_28', 'FIELD_29', 'FIELD_30', 'FIELD_31', 'FIELD_35', 'FIELD_36', 'FIELD_37', 'FIELD_38', 'FIELD_39', 'FIELD_40', 'FIELD_41', 'FIELD_42', 'FIELD_43', 'FIELD_44', 'FIELD_47', 'FIELD_48', 'FIELD_49' ] # End catBoostEncoder We must remove columns and label from training data and test data cat_features_remove = cat_features target_enc = ce.CatBoostEncoder(cols=cat_features) target_enc.fit(train[cat_features], train['label']) train = train.join( target_enc.transform(train[cat_features]).add_suffix('_process')) test = test.join( target_enc.transform(test[cat_features]).add_suffix('_process')) train = train.drop(columns=cat_features_remove) test = test.drop(columns=cat_features_remove) train = train.replace(to_replace='None', value=np.nan) test = test.replace(to_replace='None', value=np.nan) my_imputer = SimpleImputer(missing_values=np.nan, strategy='mean') convert_tool = my_imputer.fit(train) train = pd.DataFrame(convert_tool.transform(train), columns=train.columns) test = pd.DataFrame(convert_tool.transform(test), columns=test.columns) special_column = "FIELD_55".split(" ") index_outlier_data = detection_outlier(train, special_column).astype(int) train = train.drop(index_outlier_data, axis=0).reset_index(drop=True) train_label = train['label'] # print(train[index_outlier_data.astype(int)]) train = train.drop(columns=['label']) test = test.drop(columns=['label']) print(len(index_outlier_data) / len(train)) scaler = StandardScaler() scaler.fit(train) train = pd.DataFrame(scaler.transform(train), columns=train.columns) test = pd.DataFrame(scaler.transform(test), columns=test.columns) return train, train_label, test
def test_catBoost(self):
    frame = pd.DataFrame({'col1': ['A', 'B', 'B', 'C', 'A']})
    target = pd.Series([1, 0, 1, 0, 1])
    encoder = encoders.CatBoostEncoder()
    # Training rows are encoded with running (ordered) statistics.
    encoded = encoder.fit_transform(frame, target)
    self.assertEqual(
        list(encoded['col1']), [0.6, 0.6, 0.6 / 2, 0.6, 1.6 / 2],
        'The nominator is incremented by the prior. The denominator by 1.')
    # For testing set, use statistics calculated on all the training data.
    # See: CatBoost: unbiased boosting with categorical features, page 4.
    encoded = encoder.transform(pd.DataFrame({'col1': ['B', 'B', 'A']}))
    self.assertEqual(list(encoded['col1']), [1.6 / 3, 1.6 / 3, 2.6 / 3])
def cb_encodings_solution():
    """Fit a CatBoost encoder on the training split and append '_cb' features
    to the train and validation sets."""
    categorical_cols = ['app', 'device', 'os', 'channel']
    encoder = ce.CatBoostEncoder(cols=categorical_cols, random_state=7)
    train, valid, _ = get_data_splits()

    # Learn the encoding from the training split only (no target leakage).
    encoder.fit(train[categorical_cols], train["is_attributed"])

    # Append the encoded copies with a '_cb' suffix to each split.
    encoded_train = train.join(
        encoder.transform(train[categorical_cols]).add_suffix('_cb'))
    encoded_valid = valid.join(
        encoder.transform(valid[categorical_cols]).add_suffix('_cb'))
    return encoded_train, encoded_valid
def categoricalEncoding(self):
    '''
    Numerical encoding for categorical data with Catboost.

    Returns
    -------
    None.

    '''
    encoder = ce.CatBoostEncoder(drop_invariant=True,
                                 return_df=True,
                                 random_state=2020)
    # Fit on the training features/target, then transform both splits.
    encoder.fit(self.trainX, self.trainY)
    for attr in ('trainX', 'testX'):
        setattr(self, attr, encoder.transform(getattr(self, attr)))
def categorical_encoding(self):
    '''
    Encode X_train and X_test categorical features using Catboost encoder.

    Returns
    -------
    None.

    '''
    print('Encoding...')
    encoder = ce.CatBoostEncoder(drop_invariant=True,
                                 return_df=True,
                                 random_state=2020)
    # Fit on the training split only, then transform train and test alike.
    encoder.fit(self.X_train, self.y_train)
    for attr in ('X_train', 'X_test'):
        setattr(self, attr, encoder.transform(getattr(self, attr)))
def treat_categorical_features(self, categorical_features, categorical_method,
                               X_train, y_train, X_test):
    """ Deal with categorical features: fill missing values with a 'MISSING'
    sentinel category and CatBoost-encode them (encoder fitted on train only).

    Returns the transformed (X_train, X_test); inputs are returned unchanged
    when either `categorical_features` or `categorical_method` is None.
    """
    if categorical_features is None or categorical_method is None:
        return X_train, X_test
    # Work on copies: the original wrote the fillna result back into the
    # caller's frames, silently mutating shared input data.
    X_train = X_train.copy()
    X_test = X_test.copy()
    X_train[categorical_features] = X_train[categorical_features].fillna('MISSING')
    X_test[categorical_features] = X_test[categorical_features].fillna('MISSING')
    enc = ce.CatBoostEncoder(cols=categorical_features,
                             drop_invariant=False).fit(X_train, y_train)
    X_train = enc.transform(X_train)
    X_test = enc.transform(X_test)
    return X_train, X_test
def non_to_num(option=None):
    """Convert non-numeric columns of the train/test CSVs to numeric features
    and write ./csv/preprocess/numerical_train.csv and numerical_test.csv.

    :param option: 2 reads ./csv/testset.csv as the test set; anything else
        reads ./csv/dataset.csv.
    """
    train = pd.read_csv('./csv/base/train.csv', low_memory=False)
    if option == 2:
        test = pd.read_csv('./csv/testset.csv', low_memory=False)
    else:
        test = pd.read_csv('./csv/dataset.csv', low_memory=False)
    # Categorical columns to CatBoost-encode.
    cat_features = [
        'province', 'district', 'maCv', 'FIELD_8', 'FIELD_9', 'FIELD_10',
        'FIELD_12', 'FIELD_13', 'FIELD_17', 'FIELD_18', 'FIELD_19',
        'FIELD_20', 'FIELD_22', 'FIELD_23', 'FIELD_24', 'FIELD_25',
        'FIELD_26', 'FIELD_27', 'FIELD_28', 'FIELD_29', 'FIELD_30',
        'FIELD_31', 'FIELD_35', 'FIELD_36', 'FIELD_37', 'FIELD_38',
        'FIELD_39', 'FIELD_40', 'FIELD_41', 'FIELD_42', 'FIELD_43',
        'FIELD_44', 'FIELD_47', 'FIELD_48', 'FIELD_49'
    ]
    # End catBoostEncoder We must remove columns and label from training data and test data
    cat_features_remove = cat_features + ['id']
    target_enc = ce.CatBoostEncoder(cols=cat_features)
    # Fit on train only; encoded columns are appended with '_process'.
    target_enc.fit(train[cat_features], train['label'])
    train = train.join(
        target_enc.transform(train[cat_features]).add_suffix('_process'))
    test = test.join(
        target_enc.transform(test[cat_features]).add_suffix('_process'))
    # The test file may or may not carry an 'id' column.
    if 'id' in test.columns:
        test = test.drop(columns=cat_features_remove)
    else:
        test = test.drop(columns=cat_features)
    train = train.drop(columns=cat_features_remove)
    # FIELD_7 holds a stringified list; NaN (x != x) becomes '[]' and the
    # feature becomes the parsed list's length.
    f7_array = train['FIELD_7'].apply(lambda x: '[]'
                                     if x != x else x).apply(literal_eval)
    train['FIELD_7'] = f7_array.apply(len)
    f7_array = test['FIELD_7'].apply(lambda x: '[]'
                                    if x != x else x).apply(literal_eval)
    test['FIELD_7'] = f7_array.apply(len)
    # Literal 'None' strings become real NaN.
    train = train.replace(to_replace='None', value=np.nan)
    test = test.replace(to_replace='None', value=np.nan)
    test.to_csv('./csv/preprocess/numerical_test.csv')
    train.to_csv('./csv/preprocess/numerical_train.csv')
    return True
def test_catBoost_reference(self):
    # Reference:
    # https://catboost.ai/docs/concepts/algorithm-main-stages_cat-to-numberic.html
    # paragraph: "Transforming categorical features to numerical features
    # in classification", as obtained on 17 Aug 2019.
    X = pd.DataFrame(
        {'col1': ['rock', 'indie', 'rock', 'rock', 'pop', 'indie', 'rock']})
    y = pd.Series([0, 0, 1, 1, 1, 0, 0])
    enc = encoders.CatBoostEncoder()
    result = enc.fit_transform(X, y)
    # Prior passing is unsupported, so the reference's 0.05 prior is
    # replaced with the sample prior 3/7.
    prior = 3. / 7
    expected = [
        prior, prior, prior / 2, (1 + prior) / 3,
        prior, prior / 2, (2 + prior) / 4
    ]
    self.assertEqual(list(result['col1']), expected)
def __handle_cat_features(self, test_size):
    # Treat missing categoricals as their own 'MISSING' category.
    self.df[self.categorical_feat] = self.df[self.categorical_feat].fillna(
        'MISSING')
    # Keep an untouched copy of the pre-encoding frame.
    self.original_df = self.df.copy()
    X_train, X_test = train_test_split(self.df,
                                       test_size=test_size,
                                       random_state=9999)
    y_train = X_train.pop(self.target)
    y_test = X_test.pop(self.target)
    # Fit the encoder on the train split only to avoid target leakage.
    enc = ce.CatBoostEncoder(cols=self.categorical_feat,
                             drop_invariant=False).fit(X_train, y_train)
    X_train = enc.transform(X_train)
    X_test = enc.transform(X_test)
    # NOTE(review): self.df is replaced by the *test* split only (features
    # rejoined with the target) — confirm discarding the train rows here is
    # intentional.
    self.df = X_test.join(y_test)
def __init__(self, model_name, df, teams_df):
    # Model identity and raw data frames.
    self.model_name = model_name
    self.df = df
    self.teams_df = teams_df
    # NOTE(review): `ca` is presumably an alias for category_encoders
    # (elsewhere imported as `ce`) — confirm against the file's imports.
    self.cat_encoding = ca.CatBoostEncoder()
    self.scaler = StandardScaler()
    self.clf = None
    self.save_folder = 'saved_models'
    # Feature columns are all df columns not listed in NOT_FEATURES.
    self.features = list(
        filter(lambda x: x not in NOT_FEATURES, df.columns))
    self.category_columns = df[self.features].select_dtypes(
        'category').columns
    self.to_standard_features = df[self.features].select_dtypes(
        [int, float]).columns
    # CatBoost-encode all feature columns, standard-scale the numeric ones,
    # and pass any remaining columns through untouched.
    self.column_transformer = make_column_transformer(
        (self.cat_encoding, self.features),
        (self.scaler, self.to_standard_features),
        remainder='passthrough')
def categoricalEncoding(self):
    '''
    Fits a catBoostEncoder using the train data and transforms train, test
    and external categorical features data using the encoder.

    Returns
    -------
    None.

    '''
    print('Encoding categorical variable')
    # Label-encode the target first so the CatBoost encoder sees integers.
    label_enc = LabelEncoder()
    encoded_target = label_enc.fit_transform(self.y_train)
    cat_enc = ce.CatBoostEncoder(drop_invariant=True,
                                 return_df=True,
                                 random_state=2020)
    cat_enc.fit(self.X_train, encoded_target)
    for attr in ('X_train', 'X_test', 'external_data'):
        setattr(self, attr, cat_enc.transform(getattr(self, attr)))