def test_transform_only_selected(self):
    """LeaveOneOut wrapped in MultiClassWrapper should encode only the selected column."""
    frame = pd.DataFrame(
        [
            ['a', 'b', 'c'],
            ['a', 'a', 'c'],
            ['b', 'a', 'c'],
            ['b', 'c', 'b'],
            ['b', 'b', 'b'],
            ['a', 'b', 'a'],
        ],
        columns=['f1', 'f2', 'f3'],
    )
    labels = ['bee', 'cat', 'dog', 'dog', 'dog', 'dog']

    # First path: separate fit() followed by transform().
    wrapper = MultiClassWrapper(encoders.LeaveOneOutEncoder(cols=['f2']))
    wrapper.fit(frame, labels)
    result = wrapper.transform(frame)
    print(result)
    self.assertEqual(
        len(result.columns), 4,
        'We expect 2 untouched features + f2 target encoded into 2 features'
    )

    # Second path: a single fit_transform() call.
    wrapper = MultiClassWrapper(encoders.LeaveOneOutEncoder(cols=['f2']))
    result2 = wrapper.fit_transform(frame, labels)
    print(result2)
    self.assertEqual(
        len(result2.columns), 4,
        'We expect 2 untouched features + f2 target encoded into 2 features'
    )

    # in the case of leave-one-out, we expect different results, because leave-one-out principle
    # is applied only on the training data (to decrease overfitting) while the testing data
    # use the whole statistics (to be as accurate as possible).
    self.assertFalse(result.iloc[0, 3] == result2.iloc[0, 3])
def leave_one_out_encoding(df, cols, handle_nan=True, target=False):
    """Leave-one-out encode ``cols`` of ``df``.

    When ``handle_nan`` is true, unknown/missing categories map to the target
    mean; otherwise they map to NaN. ``target`` names the target column; when
    falsy, the last column of ``df`` is used as the target.
    """
    # Unknown and missing categories share the same policy.
    policy = 'value' if handle_nan else 'return_nan'
    encoder = ce.LeaveOneOutEncoder(cols=cols, handle_unknown=policy, handle_missing=policy)
    # Explicit target column name (passed as a one-column frame, matching the
    # original call shape) vs. fallback to the last column (as a Series).
    y = df[[target]] if target else df[df.columns[-1]]
    return encoder.fit_transform(df, y=y)
def fit(X, y, output_dir, **kwargs):
    """This hook defines how DataRobot will train this task. Even transform tasks need to be
    trained to learn/store information from training data.

    DataRobot runs this hook when the task is being trained inside a blueprint.
    As an output, this hook is expected to create an artifact containing a trained object
    [in this example - a fitted leave-one-out encoder], that is then used to transform new data.
    The input parameters are passed by DataRobot based on project and blueprint configuration.

    Parameters
    -------
    X: pd.DataFrame
        Training data that DataRobot passes when this task is being trained.
    y: pd.Series
        Project's target column (None is passed for unsupervised projects).
    output_dir: str
        A path to the output folder; the artifact [in this example - the fitted
        leave-one-out encoder] must be saved into this folder to be re-used in transform().

    Returns
    -------
    None
        fit() doesn't return anything, but must output an artifact (typically containing a
        trained object) into output_dir so that the trained object can be used during scoring
        inside transform()
    """
    # Transform categorical columns into a numeric representation using leave-one-out
    # target encoding. (The original comment said "Weight of Evidence", but the encoder
    # actually used here is LeaveOneOutEncoder, and the artifact is named loo.pkl.)
    encoder_loo = ce.LeaveOneOutEncoder(cols=X.columns)
    encoder_loo.fit(X, y)

    # Dump the trained encoder into an artifact [loo.pkl] and save it into output_dir
    # so that it can be used later to transform new data.
    output_dir_path = Path(output_dir)
    if output_dir_path.exists() and output_dir_path.is_dir():
        # pathlib join instead of manual string formatting.
        with open(output_dir_path / "loo.pkl", "wb") as fp:
            pickle.dump(encoder_loo, fp)
def _fit_leave_one_out(self, df, y, target, parameter):
    """Fit a leave-one-out encoder on the ``target`` columns against column ``y``
    and register it (with its output names) in ``self.trans_ls``."""
    encoder = ce.LeaveOneOutEncoder()
    encoder.fit(df[target].map(to_str), df[y])
    # Output names follow the 'continuous_<feature>_leave_one_out' convention.
    name = [
        'continuous_' + remove_continuous_discrete_prefix(feature) + '_leave_one_out'
        for feature in encoder.get_feature_names()
    ]
    self.trans_ls.append(('leave_one_out', name, target, encoder))
def leave_one_out_encode(self, feature, verbose, drop_invariant, return_df, handle_unknown, handle_missing, sigma):
    """Leave-one-out encode one categorical column (or all of them) in ``self.data`` in place.

    Parameters mirror ``ce.LeaveOneOutEncoder``; ``feature`` is a single column
    name or the literal string 'all' to encode every column in
    ``self.category_list`` (except the target itself).
    """
    # Label-encode the target first, restricted to rows where it is not null.
    le = preprocessing.LabelEncoder()
    self.data.loc[self.data[self.target_name].notnull(), self.target_name] = le.fit_transform(
        self.data.loc[self.data[self.target_name].notnull(), self.target_name])
    loue = ce.LeaveOneOutEncoder(verbose=verbose, drop_invariant=drop_invariant, return_df=return_df,
                                 handle_unknown=handle_unknown, handle_missing=handle_missing, sigma=sigma)
    if feature == 'all':
        # Refresh self.category_list before iterating over every categorical column.
        self.getFeatureType()
        for i in self.category_list:
            if i != self.target_name:
                # Restrict X to rows where the feature is present, and align y
                # on the same index.
                temp = self.data.loc[self.data[i].notnull(), i].index
                self.data.loc[self.data[i].notnull(), i] = \
                    loue.fit_transform(
                        self.data.loc[self.data[i].notnull(), i],
                        self.data.loc[temp, self.target_name]
                    )
    else:
        # NOTE(review): here X is filtered by feature.notnull() while y is
        # filtered by target_name.notnull() — unlike the 'all' branch above the
        # two masks may select different rows, so the indexes may not align;
        # confirm this is intended.
        self.data.loc[
            self.data[feature].notnull(), feature] = loue.fit_transform(
            self.data.loc[self.data[feature].notnull(), feature],
            self.data.loc[self.data[self.target_name].notnull(), self.target_name])
def test_leave_one_out(self):
    """Randomized LOO encoding must yield numeric output for the test data,
    both with and without the test target supplied."""
    encoder = encoders.LeaveOneOutEncoder(verbose=1, randomized=True, sigma=0.1)
    encoder.fit(X, y)
    tu.verify_numeric(encoder.transform(X_t))
    tu.verify_numeric(encoder.transform(X_t, y_t))
def leaveoneout():
    """Smoke-test LeaveOneOutEncoder on the mushroom data with no target supplied."""
    X, _, _ = get_mushroom_data()
    print(X.info())
    encoder = ce.LeaveOneOutEncoder()
    encoder.fit(X, None)
    transformed = encoder.transform(X)
    print(transformed.info())
    # Drop the large frames explicitly before returning.
    del encoder, _, X, transformed
def target_encoder_loo_include_self(train_df, test_df, cols, target):
    """Encode ``cols`` with the per-category mean of ``target`` computed over
    all rows — i.e. each row's own target value is included in its encoding
    (no leave-one-out on transform)."""
    fitted = ce.LeaveOneOutEncoder(cols=cols).fit(X=train_df[cols], y=train_df[target])
    tmp_train = fitted.transform(train_df[cols])
    # Only transform the test frame when one was provided.
    tmp_test = fitted.transform(test_df[cols]) if test_df is not None else None
    return tmp_train, tmp_test
def create_features(self, df_train, df_test):
    """Leave-one-out encode ``self.columns`` against ``self.target_column`` and
    append the encoded columns (suffixed ``_LeaveOneOutEncoder``) to
    ``self.train`` / ``self.test``."""
    loo = ce.LeaveOneOutEncoder(cols=self.columns)
    loo.fit(df_train[self.columns], df_train[self.target_column].values.tolist())
    encoded_train = loo.transform(df_train[self.columns])
    encoded_test = loo.transform(df_test[self.columns])
    for name in encoded_train.columns:
        suffixed = name + '_LeaveOneOutEncoder'
        self.train[suffixed] = encoded_train[name]
        self.test[suffixed] = encoded_test[name]
def target_encoder_loo(df, train_df, cols, target):
    """Genuine leave-one-out target encoding: fit on ``train_df`` only, then
    transform ``df`` and append the encoded columns with a
    ``_targetenc_ce_loo`` suffix."""
    loo = ce.LeaveOneOutEncoder(cols=cols)
    loo.fit(X=train_df[cols], y=train_df[target])
    encoded = loo.transform(df[cols])
    # Rename all encoded columns in a single pass.
    encoded = encoded.rename({col: f'{col}_targetenc_ce_loo' for col in cols}, axis=1)
    return pd.concat([df, encoded], axis=1)
def leave_one_out_encoding(trainData, predictionData):
    """Leave-one-out encode country/profession/degree against total_income and
    return the transformed (train, prediction) frames."""
    encoder = ce.LeaveOneOutEncoder(cols=['country', 'profession', 'degree'])
    encoder.fit(trainData, trainData['total_income'])
    trainDataFrame = encoder.transform(trainData)
    predictionDataFrame = encoder.transform(predictionData)
    return trainDataFrame, predictionDataFrame
def test_leave_one_out_unique(self):
    """Categories seen only once must fall back to the global target mean."""
    X = pd.DataFrame(data=['1', '2', '2', '2', '3'], columns=['col'])
    y = np.array([1, 0, 1, 0, 1])

    result = encoders.LeaveOneOutEncoder(handle_unknown='value').fit(X, y).transform(X, y)

    self.assertFalse(result.isnull().any().any(), 'There should not be any missing value')
    # Singletons ('1', '3') take the global mean; the '2' rows leave one out.
    expected = pd.DataFrame(data=[y.mean(), 0.5, 0, 0.5, y.mean()], columns=['col'])
    pd.testing.assert_frame_equal(expected, result)
def apply_leave_one_out_encoding(df, categorical_columns, label='y'):
    """Leave-one-out encode ``categorical_columns`` of ``df`` against the
    ``label`` column and return the transformed frame with the label re-attached."""
    if not isinstance(df, pd.DataFrame):
        raise DataFrameTypeError('df', df)
    import category_encoders as ce

    features = df.drop([label], axis=1)
    encoder = ce.LeaveOneOutEncoder(return_df=True, cols=categorical_columns).fit(
        features, df[label])
    X_transformed = encoder.transform(features)
    # Put the untouched label column back onto the encoded features.
    X_transformed[label] = df[label]
    return X_transformed
def test_HandleUnknownValue_HaveUnknownInTest_ExpectMean(self):
    """A category unseen during fit must map to the global target mean."""
    train = pd.Series(["a", "a", "a", "b", "b", "b"], name='color')
    target = pd.Series([1.6, 0, 0, 1, 0, 1], name='target')
    test = pd.Series(['b', 'c'], name='color')
    test_target = pd.Series([0, 0])

    encoder = encoders.LeaveOneOutEncoder(cols=['color'], handle_unknown='value')
    encoder.fit(train, target)
    obtained = encoder.transform(test, test_target)

    # 'b' -> its category statistic; unknown 'c' -> overall mean 0.6.
    self.assertEqual([1.0, .6], list(obtained['color']))
def encode_df(X, y, cat_features, cat_encoding):
    """Fit-transform ``X`` with the configured categorical encoding.

    Parameters
    ----------
    X, y : features and target passed straight to ``fit_transform``.
    cat_features : columns to encode.
    cat_encoding : one of 'leave_one_out', 'james_stein', 'target'.
        An unrecognised name raises KeyError, as before.

    Missing categories are mapped to NaN (``handle_missing='return_nan'``)
    for every encoder.
    """
    # Map names to encoder classes and instantiate lazily: the original code
    # constructed all three encoders up front even though only one is used.
    ENCODER_CLASSES = {
        'leave_one_out': ce.LeaveOneOutEncoder,
        'james_stein': ce.JamesSteinEncoder,
        'target': ce.TargetEncoder,
    }
    encoder = ENCODER_CLASSES[cat_encoding](cols=cat_features, handle_missing='return_nan')
    X = encoder.fit_transform(X, y)
    return X
def cal_loe(df_tr, col):
    """Build a per-category leave-one-out encoding lookup for ``col``.

    Returns a tuple of (Series mapping each category of ``col`` to the mean of
    its LOO-encoded values, name of the new feature ``'loe_<col>'``).

    NOTE(review): relies on a module-level ``feature_col`` for the columns
    passed to fit/transform — confirm it is defined and includes ``col``.
    """
    # 'isDefault' is the target column used for the encoding.
    enc = ce.LeaveOneOutEncoder(cols=[col]).fit(df_tr.loc[::, feature_col], df_tr.loc[::, 'isDefault'])
    tmp = pd.DataFrame({
        f'{col}': df_tr.loc[::, col],
        # Passing y to transform() enables the leave-one-out adjustment per row.
        f'loe_{col}': enc.transform(df_tr.loc[::, feature_col], df_tr.loc[::, 'isDefault'])[col]
    })
    # Average the per-row encodings within each category to get the lookup table.
    return tmp.groupby([col])[f'loe_{col}'].mean(), f'loe_{col}'
def test_leave_one_out_fit_callTwiceOnDifferentData_ExpectRefit(self):
    """A second fit() must fully replace the mapping learned by the first."""
    x_a = pd.DataFrame(data=['1', '2', '2', '2', '2', '2'], columns=['col_a'])
    x_b = pd.DataFrame(data=['1', '1', '1', '2', '2', '2'], columns=['col_b'])  # different values and name
    y_dummy = [True, False, True, False, True, False]

    encoder = encoders.LeaveOneOutEncoder()
    encoder.fit(x_a, y_dummy)
    encoder.fit(x_b, y_dummy)

    learned = encoder.mapping
    self.assertEqual(1, len(learned))
    self.assertIn('col_b', learned)  # only the most recent fit survives
    expected = pd.DataFrame({'sum': [2.0, 1.0], 'count': [3, 3]},
                            index=['1', '2'], columns=['sum', 'count'])
    np.testing.assert_equal(expected.values, learned['col_b'].values)
def test_leave_one_out_values(self):
    """Verify the exact leave-one-out values for a small two-category frame."""
    df = pd.DataFrame({
        'color': ["a", "a", "a", "b", "b", "b"],
        'outcome': [1, 0, 0, 1, 0, 1]})

    X = df.drop('outcome', axis=1)
    y = df.drop('color', axis=1)

    ce_leave = encoders.LeaveOneOutEncoder(cols=['color'], randomized=False)
    obtained = ce_leave.fit_transform(X, y['outcome'])

    # assertEquals is a deprecated alias removed in Python 3.12 — use assertEqual.
    self.assertEqual([0.0, 0.5, 0.5, 0.5, 1.0, 0.5], list(obtained['color']))
def test_HandleMissingIsValueAndNanInTrain_ExpectAtValueSet(self):
    """NaN categories in training data are encoded like any regular category."""
    df = pd.DataFrame({
        'color': [np.nan, np.nan, np.nan, "b", "b", "b"],
        'outcome': [2, 2, 0, 1, 0, 1]})
    X, y = df.drop('outcome', axis=1), df.drop('color', axis=1)

    encoder = encoders.LeaveOneOutEncoder(cols=['color'], handle_missing='value')
    obtained = encoder.fit_transform(X, y['outcome'])

    # Each NaN row gets the leave-one-out mean over the remaining NaN rows.
    self.assertEqual([1, 1, 2, 0.5, 1.0, 0.5], list(obtained['color']))
def __init__(self, encoder_type, columns_name=None):
    """
    :param encoder_type: name of the category_encoders encoder to build
    :param columns_name: list of feature names to encode
    """
    # Dispatch table of encoder name -> encoder class; the selected class is
    # instantiated lazily, replacing the original 15-branch if/elif chain.
    encoder_classes = {
        "BackwardDe": ce.BackwardDifferenceEncoder,  # backward difference encoding
        "BaseN": ce.BaseNEncoder,                    # BaseN encoding
        "Binary": ce.BinaryEncoder,                  # binary encoding
        "Catboost": ce.CatBoostEncoder,
        "Hash": ce.HashingEncoder,
        "Helmert": ce.HelmertEncoder,
        "JamesStein": ce.JamesSteinEncoder,
        "LOO": ce.LeaveOneOutEncoder,                # leave-one-out encoding
        "ME": ce.MEstimateEncoder,                   # M-estimate encoder
        "OneHot": ce.OneHotEncoder,
        "OridinalEncoder": ce.OrdinalEncoder,        # ordinal encoding (key typo kept for compatibility)
        "Sum": ce.SumEncoder,                        # sum encoding
        "Polynomial": ce.PolynomialEncoder,          # polynomial encoding
        "Target": ce.TargetEncoder,                  # target encoding
        "WOE": ce.WOEEncoder,                        # weight-of-evidence encoder
    }
    if encoder_type not in encoder_classes:
        raise ValueError("请选择正确的编码方式")
    self.encoder = encoder_classes[encoder_type](cols=columns_name)
def test_leave_one_out_fit_callTwiceOnDifferentData_ExpectRefit(self):
    """Refitting on a new frame must fully replace the previously learned mapping."""
    x_a = pd.DataFrame(data=['1', '2', '2', '2', '2', '2'], columns=['col_a'])
    x_b = pd.DataFrame(data=['1', '1', '1', '2', '2', '2'], columns=['col_b'])  # different values and name
    y_dummy = [True, False, True, False, True, False]

    encoder = encoders.LeaveOneOutEncoder()
    encoder.fit(x_a, y_dummy)
    encoder.fit(x_b, y_dummy)

    learned = encoder.mapping
    self.assertEqual(1, len(learned))
    refreshed = learned[0]
    self.assertEqual('col_b', refreshed['col'])  # the model must get updated
    # Per-category statistics reflect only the second fit's data.
    self.assertEqual({'sum': 2.0, 'count': 3, 'mean': 2.0 / 3.0}, refreshed['mapping']['1'])
    self.assertEqual({'sum': 1.0, 'count': 3, 'mean': 1.0 / 3.0}, refreshed['mapping']['2'])
def test_leave_one_out(self):
    """Exercise LeaveOneOutEncoder under several configurations.

    For each configuration the transformed test data must be numeric; with
    return_df=False the output must be a numpy array instead of a frame.
    """
    cols = ['C1', 'D', 'E', 'F']
    X = self.create_dataset(n_rows=1000)
    X_t = self.create_dataset(n_rows=100)
    y = np.random.randn(X.shape[0])
    y_t = np.random.randn(X_t.shape[0])

    # Configurations whose output should simply be numeric.
    for kwargs in ({'cols': cols}, {}, {'drop_invariant': True}):
        enc = encoders.LeaveOneOutEncoder(verbose=1, **kwargs)
        enc.fit(X, y)
        self.verify_numeric(enc.transform(X_t))
        self.verify_numeric(enc.transform(X_t, y_t))

    # return_df=False switches the output type to a numpy array.
    enc = encoders.LeaveOneOutEncoder(verbose=1, return_df=False)
    enc.fit(X, y)
    self.assertTrue(isinstance(enc.transform(X_t), np.ndarray))
    self.assertTrue(isinstance(enc.transform(X_t, y_t), np.ndarray))

    # Randomized encoding with noise still yields numeric output.
    enc = encoders.LeaveOneOutEncoder(verbose=1, randomized=True, sigma=0.1)
    enc.fit(X, y)
    self.verify_numeric(enc.transform(X_t))
    self.verify_numeric(enc.transform(X_t, y_t))
def test_HandleMissingIsValueAndNanInTestAndNoTestTarget_ExpectMean(self):
    """With handle_missing='value' and no test target, NaN maps to the global mean."""
    df = pd.DataFrame({
        'color': ["a", "a", "a", "b", "b", "b"],
        'outcome': [1, 0, 0, 1, 0, 1]})
    train, target = df.drop('outcome', axis=1), df.drop('color', axis=1)
    test = pd.Series([np.nan, 'b'], name='color')

    encoder = encoders.LeaveOneOutEncoder(cols=['color'], handle_missing='value')
    encoder.fit(train, target['outcome'])
    obtained = encoder.transform(test)

    # NaN -> overall mean (0.5); 'b' -> its category mean (2/3).
    self.assertEqual([.5, 2 / 3.0], list(obtained['color']))
def encode_category_variables(df, model_params):
    """Encode each configured categorical column of ``df`` with the encoder
    named for it in ``model_params['CATEGORICAL_FEATURES_DICT']``.

    NOTE(review): handle_unknown='impute' is the legacy category_encoders
    option name (renamed to 'value' in category_encoders >= 2.0) — confirm the
    pinned library version still accepts it.
    """
    for key, value in model_params['CATEGORICAL_FEATURES_DICT'].items():
        # Skip configured columns absent from this particular frame.
        if key not in df.columns:
            continue
        if value == 'OrdinalEncoder':
            ce_oe = ce.OrdinalEncoder(cols=key, handle_unknown='impute')
            df = ce_oe.fit_transform(df)
        elif value == 'OneHotEncoder':
            ce_ohe = ce.OneHotEncoder(cols=key, handle_unknown='impute')
            df = ce_ohe.fit_transform(df)
        elif value == 'LeaveOneOutEncoder':
            ce_looe = ce.LeaveOneOutEncoder(cols=key, handle_unknown='impute')
            # NOTE(review): the target column name is looked up inside
            # CATEGORICAL_FEATURES_DICT under the key 'target_y' — verify that
            # entry exists and is not itself iterated as a column to encode.
            df = ce_looe.fit_transform(df, y=df[model_params['CATEGORICAL_FEATURES_DICT']['target_y']])
    return df
def test_leave_one_out_np(self):
    """Numpy-array input should be accepted and produce numeric output."""
    X = self.create_array(n_rows=1000)
    X_t = self.create_array(n_rows=100)
    y = np.random.randn(X.shape[0])
    y_t = np.random.randn(X_t.shape[0])

    encoder = encoders.LeaveOneOutEncoder(verbose=1)
    encoder.fit(X, y)
    self.verify_numeric(encoder.transform(X_t))
    self.verify_numeric(encoder.transform(X_t, y_t))
def LeaveOneOutEncoderMethod(self, configFile, data):
    """Leave-one-out encode every feature column of the train/test split (and of
    the optional 'test_out' hold-out split) and return the encoded data dict."""
    import category_encoders as ce

    encoder = ce.LeaveOneOutEncoder(cols=data['train']['x'].columns)
    encoder.fit(data['train']['x'], data['train']['y'])

    data_dict = {'train': {}, 'test': {}}
    for split in ('train', 'test'):
        data_dict[split]['x'] = pd.DataFrame(encoder.transform(data[split]['x']))
        data_dict[split]['y'] = data[split]['y']

    # Optional extra hold-out split is encoded the same way.
    if 'test_out' in data:
        data_dict['test_out'] = {}
        data_dict['test_out']['x'] = pd.DataFrame(encoder.transform(data['test_out']['x']))
        data_dict['test_out']['y'] = data['test_out']['y']
    return data_dict
'nursery.arff', 'postoperative.patient.data.arff', 'primary.tumor.arff', 'sick.arff', 'solar.flare1.arff', 'solar.flare2.arff', 'soybean.arff', 'spectrometer.arff', 'sponge.arff', 'tic-tac-toe.arff', 'trains.arff', 'vote.arff', 'vowel.arff' ] # We ignore encoders {BackwardDifferenceEncoder, HelmertEncoder, PolynomialEncoder and SumEncoder} because of: # https://github.com/scikit-learn-contrib/categorical-encoding/issues/91 encoders = [ category_encoders.BaseNEncoder(), category_encoders.OneHotEncoder(), category_encoders.BinaryEncoder(), category_encoders.HashingEncoder(), category_encoders.OrdinalEncoder(), category_encoders.TargetEncoder(), category_encoders.LeaveOneOutEncoder(), category_encoders.WOEEncoder() ] # Initialization if os.path.isfile('./output/result.csv'): os.remove('./output/result.csv') # Ok... warnings.filterwarnings('ignore') # Loop over datasets, then over encoders, and finally, over the models for dataset_name in datasets: X, y, fold_count = arff_loader.load(dataset_name) non_numeric = list(X.select_dtypes(exclude=[np.number]).columns.values) for encoder in encoders:
def pipeline(df, target, cat_columns, models):
    """Benchmark every (model, categorical column, encoder) combination on ``df``.

    For each model a baseline run (no encoding) is scored first; every later run
    is recorded along with its change relative to that baseline. Returns one
    metrics row per run as a DataFrame.

    NOTE(review): relies on module-level helpers ``model`` and ``_append_metric``
    plus several encoder classes imported elsewhere in this module.
    """
    n_rows, n_cols = df.shape
    # One list per metric; _append_metric pushes one value into each list per run.
    metrics = {
        "n_rows": [],
        "n_cols": [],
        "cardinality": [],
        "model": [],
        "column": [],
        "encoder": [],
        "rmse": [],
        "mae": [],
        "fit_time": [],
        "rmse_change": [],
        "mae_change": [],
        "fit_time_change": [],
    }
    columns = cat_columns
    for model_name in models:
        # Baseline: no encoder, no column — the reference for the *_change metrics.
        base_rmse, base_mae, base_fit_time = model(
            df=df,
            target=target,
            encoder=np.nan,
            col=np.nan,
            model_name=model_name,
            encoder_type="basic",
            encoder_name=[],
        )
        _append_metric(
            row_list=metrics,
            n_rows=n_rows,
            n_cols=n_cols,
            cardinality=np.nan,
            model_name=model_name,
            column=np.nan,
            name="basic",
            rmse=base_rmse,
            mae=base_mae,
            fit_time=base_fit_time,
            base_rmse=base_rmse,
            base_mae=base_mae,
            base_fit_time=base_fit_time,
        )
        for column in columns:
            print()
            print(column)
            cardinality = df[column].nunique()
            # pandas get_dummies-based one-hot run for this column.
            print("ohe")
            rmse, mae, fit_time = model(
                df=df,
                target=target,
                encoder=np.nan,
                col=column,
                model_name=model_name,
                encoder_type="basic",
                encoder_name="One Hot Encoder (pd.dummies)",
            )
            _append_metric(
                row_list=metrics,
                n_rows=n_rows,
                n_cols=n_cols,
                cardinality=cardinality,
                model_name=model_name,
                column=column,
                name="One Hot Encoder (pd.dummies)",
                rmse=rmse,
                mae=mae,
                fit_time=fit_time,
                base_rmse=base_rmse,
                base_mae=base_mae,
                base_fit_time=base_fit_time,
            )
            # sklearn-style encoder instances, one fresh instance per column.
            encoders = [
                ("Sum Encoder(sleepmind)", SumEncoder()),
                ("BinaryEncoder", ce.BinaryEncoder(cols=[column])),
                ("HashingEncoder", ce.HashingEncoder(cols=[column])),
                ("OneHotEncoder", ce.OneHotEncoder(cols=[column])),
                ("OrdinalEncoder", ce.OrdinalEncoder(cols=[column])),
                ("BaseNEncoder", ce.BaseNEncoder(cols=[column])),
                (
                    "BackwardDifferenceEncoder",
                    ce.BackwardDifferenceEncoder(cols=[column]),
                ),
                ("HelmertEncoder", ce.HelmertEncoder(cols=[column])),
                ("SumEncoder", ce.SumEncoder(cols=[column])),
                ("PolynomialEncoder", ce.PolynomialEncoder(cols=[column])),
                ("TargetEncoder", ce.TargetEncoder(cols=[column])),
                ("LeaveOneOutEncoder", ce.LeaveOneOutEncoder(cols=[column])),
                (
                    "XAM_bayesian_targetEncoder",
                    BayesianTargetEncoder(columns=[column], prior_weight=3, suffix=""),
                ),
            ]
            for name, encoder in encoders:
                print(name)
                rmse, mae, fit_time = model(
                    df=df,
                    target=target,
                    encoder=encoder,
                    col=column,
                    model_name=model_name,
                    encoder_type="sklearn_encoding",
                    encoder_name=name,
                )
                _append_metric(
                    row_list=metrics,
                    n_rows=n_rows,
                    n_cols=n_cols,
                    cardinality=cardinality,
                    model_name=model_name,
                    column=column,
                    name=name,
                    rmse=rmse,
                    mae=mae,
                    fit_time=fit_time,
                    base_rmse=base_rmse,
                    base_mae=base_mae,
                    base_fit_time=base_fit_time,
                )
            # Hand-rolled bayesian / leave-one-out encoding callables
            # (passed as functions, flagged to model() via hcc_ind=1).
            bayes_encoders = [
                ("hcc_BayesEncoding", BayesEncoding),
                ("hcc_BayesEncodingKfold", BayesEncodingKfold),
                ("LOOEncoding", LOOEncoding),
                ("LOOEncodingKfold", LOOEncodingKfold),
            ]
            for name, bayes_encoder in bayes_encoders:
                print(name)
                rmse, mae, fit_time = model(
                    df=df,
                    target=target,
                    encoder=bayes_encoder,
                    col=column,
                    model_name=model_name,
                    encoder_name=name,
                    encoder_type="basic",
                    hcc_ind=1,
                )
                _append_metric(
                    row_list=metrics,
                    n_rows=n_rows,
                    n_cols=n_cols,
                    cardinality=cardinality,
                    model_name=model_name,
                    column=column,
                    name=name,
                    rmse=rmse,
                    mae=mae,
                    fit_time=fit_time,
                    base_rmse=base_rmse,
                    base_mae=base_mae,
                    base_fit_time=base_fit_time,
                )
    results = pd.DataFrame(metrics)
    return results
'hepatitis.arff', 'hypothyroid.arff', 'kr.vs.kp.arff', 'labor.arff', 'lymph.arff', 'mushroom.arff', 'nursery.arff', 'postoperative.patient.data.arff', 'primary.tumor.arff', 'sick.arff', 'solar.flare1.arff', 'solar.flare2.arff', 'soybean.arff', 'spectrometer.arff', 'sponge.arff', 'tic-tac-toe.arff', 'trains.arff', 'vote.arff', 'vowel.arff'] # datasets = ['carvana.csv', 'erasmus.csv', 'internetusage.csv', 'ipumsla97small.csv', 'kobe.csv', 'pbcseq.csv', 'phpvcoG8S.csv', 'westnile.csv'] # amazon is too large... # We painstakingly initialize each encoder here because that gives us the freedom to initialize the # encoders with any setting we want. encoders = [ #category_encoders.BackwardDifferenceEncoder(), category_encoders.BaseNEncoder(), category_encoders.BinaryEncoder(), category_encoders.HashingEncoder(), # category_encoders.HelmertEncoder(), category_encoders.JamesSteinEncoder(), category_encoders.LeaveOneOutEncoder(), category_encoders.MEstimateEncoder(), category_encoders.OneHotEncoder(), category_encoders.OrdinalEncoder(), # category_encoders.PolynomialEncoder(), # category_encoders.SumEncoder(), category_encoders.TargetEncoder(), category_encoders.WOEEncoder()] encoders = [ #category_encoders.BackwardDifferenceEncoder(), category_encoders.BaseNEncoder(handle_missing='value'), category_encoders.BaseNEncoder(handle_missing='indicator'), category_encoders.BinaryEncoder(handle_missing='value'), category_encoders.BinaryEncoder(handle_missing='indicator'), # category_encoders.HashingEncoder(handle_missing='value'), # category_encoders.HashingEncoder(handle_missing='indicator'),
def preprocess_data(self, data: pd.DataFrame, stage: str = "inference") -> Tuple[pd.DataFrame, list]:
    """The preprocessing, like Categorical Encoding, Normalization, etc. which any dataframe
    should undergo before feeding into the dataloader.

    Args:
        data (pd.DataFrame): A dataframe with the features and target
        stage (str, optional): Internal parameter. Used to distinguish between fit and
            inference. Defaults to "inference". During "fit" the encoders/scalers are
            fitted and stored on self; otherwise the previously fitted ones are applied.

    Returns:
        tuple[pd.DataFrame, list]: Returns the processed dataframe and the added features(list) as a tuple
    """
    logger.info(f"Preprocessing data: Stage: {stage}...")
    added_features = None
    # Optional expansion of date columns into derived (categorical) parts.
    if self.config.encode_date_columns:
        for field_name, freq in self.config.date_columns:
            data = self.make_date(data, field_name)
            data, added_features = self.add_datepart(data, field_name, frequency=freq, prefix=None, drop=True)
    # The only features that are added are the date features extracted
    # from the date which are categorical in nature
    if (added_features is not None) and (stage == "fit"):
        logger.debug(
            f"Added {added_features} features after encoding the date_columns"
        )
        self.config.categorical_cols += added_features
        self.config.categorical_dim = (len(self.config.categorical_cols)
                                       if self.config.categorical_cols is not None else 0)
    # Encoding Categorical Columns
    if len(self.config.categorical_cols) > 0:
        if stage == "fit":
            if self.do_leave_one_out_encoder():
                logger.debug(
                    "Encoding Categorical Columns using LeavOneOutEncoder")
                self.categorical_encoder = ce.LeaveOneOutEncoder(
                    cols=self.config.categorical_cols, random_state=42)
                # Multi-Target Regression uses the first target to encode the categorical columns
                if len(self.config.target) > 1:
                    logger.warning(
                        f"Multi-Target Regression: using the first target({self.config.target[0]}) to encode the categorical columns"
                    )
                # LOO encoding needs the target, unlike the ordinal branch below.
                data = self.categorical_encoder.fit_transform(
                    data, data[self.config.target[0]])
            else:
                logger.debug(
                    "Encoding Categorical Columns using OrdinalEncoder")
                self.categorical_encoder = OrdinalEncoder(
                    cols=self.config.categorical_cols)
                data = self.categorical_encoder.fit_transform(data)
        else:
            # Inference: reuse whichever encoder was fitted during "fit".
            data = self.categorical_encoder.transform(data)
    # Transforming Continuous Columns
    if (self.config.continuous_feature_transform is not None) and (len(self.config.continuous_cols) > 0):
        if stage == "fit":
            # Look up the configured transform factory and instantiate it.
            transform = self.CONTINUOUS_TRANSFORMS[
                self.config.continuous_feature_transform]
            self.continuous_transform = transform["callable"](
                **transform["params"])
            # TODO implement quantile noise
            data.loc[:, self.config.
                     continuous_cols] = self.continuous_transform.fit_transform(
                         data.loc[:, self.config.continuous_cols])
        else:
            data.loc[:, self.config.
                     continuous_cols] = self.continuous_transform.transform(
                         data.loc[:, self.config.continuous_cols])
    # Normalizing Continuous Columns
    if (self.config.normalize_continuous_features) and (len(
            self.config.continuous_cols) > 0):
        if stage == "fit":
            self.scaler = StandardScaler()
            data.loc[:, self.config.
                     continuous_cols] = self.scaler.fit_transform(
                         data.loc[:, self.config.continuous_cols])
        else:
            data.loc[:, self.config.continuous_cols] = self.scaler.transform(
                data.loc[:, self.config.continuous_cols])
    # Converting target labels to a 0 indexed label
    if self.config.task == "classification":
        if stage == "fit":
            self.label_encoder = LabelEncoder()
            data[self.config.target[0]] = self.label_encoder.fit_transform(
                data[self.config.target[0]])
        else:
            # At inference the target may be absent; only transform when present.
            if self.config.target[0] in data.columns:
                data[self.config.target[0]] = self.label_encoder.transform(
                    data[self.config.target[0]])
    # Target Transforms
    # Applied only when every target column is present in the frame.
    if all([col in data.columns for col in self.config.target]):
        if self.do_target_transform:
            target_transforms = []
            for col in self.config.target:
                # One independent (deep-copied) transform per target column.
                _target_transform = copy.deepcopy(
                    self.target_transform_template)
                data[col] = _target_transform.fit_transform(
                    data[col].values.reshape(-1, 1))
                target_transforms.append(_target_transform)
            self.target_transforms = target_transforms
    return data, added_features