def test_basen(self):
    """Smoke-test BaseNEncoder on generated datasets.

    Fits on a 1000-row dataset and transforms a 100-row dataset for several
    option sets, checking the output with ``verify_numeric``; finally checks
    that ``return_df=False`` yields a NumPy array.
    """
    cols = ['C1', 'D', 'E', 'F']
    X = self.create_dataset(n_rows=1000)
    X_t = self.create_dataset(n_rows=100)

    # Explicit columns, default columns, then dropping invariant columns.
    option_sets = ({'cols': cols}, {}, {'drop_invariant': True})
    for options in option_sets:
        enc = encoders.BaseNEncoder(verbose=1, **options)
        enc.fit(X, None)
        self.verify_numeric(enc.transform(X_t))

    array_enc = encoders.BaseNEncoder(verbose=1, return_df=False)
    array_enc.fit(X, None)
    self.assertTrue(isinstance(array_enc.transform(X_t), np.ndarray))
def num_cols(nvals, base):
    """Return the number of columns output for ``nvals`` distinct input values.

    A single-column frame holding the string forms of ``0 .. nvals - 1`` is
    encoded with a ``BaseNEncoder`` of the given ``base``; the result is the
    number of items obtained by iterating the transformed output.
    """
    distinct_values = list(map(str, range(nvals)))
    frame = pd.DataFrame({'vals': distinct_values})

    basen = encoders.BaseNEncoder(base=base)
    basen.fit(frame)
    encoded = basen.transform(frame)
    return len(list(encoded))
def test_inv_transform_ct_11(self):
    """Check inverse_transform on a ColumnTransformer using BaseNEncoder with passthrough.

    'city' and 'state' go through the encoder, 'other' is passed through;
    the inverse of the transformed test frame must equal ``expected``.
    """
    target = pd.DataFrame(data=[0, 1], columns=['y'])
    train_frame = pd.DataFrame({
        'city': ['chicago', 'paris'],
        'state': ['US', 'FR'],
        'other': ['A', 'B'],
    })

    transformer = ColumnTransformer(
        transformers=[('basen', ce.BaseNEncoder(), ['city', 'state'])],
        remainder='passthrough',
    )
    transformer.fit(train_frame, target)

    test_frame = pd.DataFrame({
        'city': ['chicago', 'chicago', 'paris'],
        'state': ['US', 'FR', 'FR'],
        'other': ['A', 'B', 'C'],
    })
    expected = pd.DataFrame({
        'basen_city': ['chicago', 'chicago', 'paris'],
        'basen_state': ['US', 'FR', 'FR'],
        'other': ['A', 'B', 'C'],
    })

    encoded = pd.DataFrame(transformer.transform(test_frame))
    encoded.columns = ['col1_0', 'col1_1', 'col2_0', 'col2_1', 'other']

    restored = inverse_transform(encoded, transformer)
    pd.testing.assert_frame_equal(restored, expected)
def Base_N_Coder(self, path, data, target):
    """Base-N encode the categorical columns of a CSV file and save the result.

    Reads ``path + data`` as CSV. If any column has object dtype, the frame
    is split by ``self.refining`` into the categorical part and the rest.
    When the target is among the categorical columns it is removed from that
    part and label-encoded in place. The remaining categorical columns are
    BaseN-encoded, joined to the rest, and written via ``self.convert_to_csv``.
    Progress is reported through ``self.log``.
    """
    self.log.writeToLog('Performing Base N Encoding...')
    encodetype = 'base_ncoder'
    df = pd.read_csv(path + data)

    # Nothing to do unless at least one column has object dtype.
    if not any(dtype == 'O' for dtype in df.dtypes):
        self.log.writeToLog(
            'No categorical columns found in the dataset to be encoded !')
        return

    category, droped_data = self.refining(df)
    self.log.writeToLog(f'No. of columns before encoding : {len(df.columns)}')
    le = LabelEncoder()
    bne = ce.BaseNEncoder()

    # The target is label-encoded rather than BaseN-encoded.
    if target in list(category.columns):
        category = category.drop(target, axis=1)
        df[target] = le.fit_transform(df[target])
        self.log.writeToLog('Target column has been encoded !')

    if category.empty:
        self.log.writeToLog(
            'Dependant variables has no categories to be encoded !')
        return

    self.log.writeToLog(
        f'Dependant variables encoded is/are: {list(category.columns)}')
    bne_data = bne.fit_transform(category)
    dataset = droped_data.join(bne_data)
    dataset[target] = df[target]
    self.convert_to_csv(data, dataset, encodetype)
    self.log.writeToLog(f'No. of columns after encoding : {len(dataset.columns)}')
def apply_baseN_encoding(df, categorical_columns):
    """Encode the given columns of ``df`` with a base-3 ``BaseNEncoder``.

    Args:
        df: pandas DataFrame to encode.
        categorical_columns: columns to encode, passed to the encoder as
            ``cols``.

    Returns:
        The result of the fitted encoder's ``transform`` on ``df``.

    Raises:
        DataFrameTypeError: if ``df`` is not a pandas DataFrame.
    """
    if not isinstance(df, pd.DataFrame):
        raise DataFrameTypeError('df', df)

    import category_encoders as ce

    # Fit on the DataFrame itself rather than ``df.values``: the bare array
    # carries no column labels, so the names in ``categorical_columns`` could
    # not be matched at fit time and fit/transform would see different inputs.
    encoder = ce.BaseNEncoder(base=3, cols=categorical_columns)
    encoder.fit(df)
    return encoder.transform(df)
def nominalToNumeric(keyArr, valueArr):
    """BaseN-encode nominal data given as parallel key and value sequences.

    ``keyArr[i]`` becomes a column name whose data is ``valueArr[i]``; every
    column named in ``keyArr`` is then encoded with ``BaseNEncoder`` and the
    fitted-and-transformed result is returned.
    """
    # Index into valueArr by position so a too-short valueArr still raises.
    columns = {key: valueArr[position] for position, key in enumerate(keyArr)}
    frame = pd.DataFrame(columns)
    return ce.BaseNEncoder(cols=keyArr).fit_transform(frame)
def test_inverse_transform_HaveData_ExpectResultReturned(self):
    """Encoding then inverse-transforming a base-2 BaseNEncoder returns the input frame."""
    letters = pd.Series(list('abcd'))
    train = letters.to_frame('letter')

    enc = encoders.BaseNEncoder(base=2)
    encoded = enc.fit_transform(train)
    round_tripped = enc.inverse_transform(encoded)

    pd.testing.assert_frame_equal(train, round_tripped)
def test_inverse_transform_HaveNanInTrainAndHandleMissingReturnNan_ExpectReturnedWithNan(self):
    """With NaN in train and handle_missing='return_nan', the round trip keeps the NaN."""
    train = pd.DataFrame({'city': ['chicago', np.nan]})
    enc = encoders.BaseNEncoder(
        handle_missing='return_nan',
        handle_unknown='value',
    )

    encoded = enc.fit_transform(train)
    restored = enc.inverse_transform(encoded)

    pd.testing.assert_frame_equal(train, restored)
def create_features(self, df_train, df_test):
    """Fit a BaseNEncoder on the training columns and store the encoded features.

    The encoder is fitted on ``df_train[self.columns]`` together with the
    target values from ``df_train[self.target_column]``. Each encoded column
    is written to ``self.train`` and ``self.test`` under its name suffixed
    with ``'_BaseNEncoder'``.
    """
    feature_cols = self.columns
    encoder = ce.BaseNEncoder(cols=feature_cols)
    targets = df_train[self.target_column].values.tolist()
    encoder.fit(df_train[feature_cols], targets)

    encoded_train = encoder.transform(df_train[feature_cols])
    encoded_test = encoder.transform(df_test[feature_cols])

    for column in encoded_train.columns:
        feature_name = column + '_BaseNEncoder'
        self.train[feature_name] = encoded_train[column]
        self.test[feature_name] = encoded_test[column]
def test_HandleMissingIndicator_HaveNoNan_ExpectThirdColumn(self):
    """handle_missing='indicator' without NaN in train still yields three-column rows."""
    train = pd.Series(['a', 'b', 'c'])
    encoder = encoders.BaseNEncoder(handle_missing='indicator', base=2)
    result = encoder.fit_transform(train)

    expected_rows = [
        [0, 0, 1],
        [0, 1, 0],
        [0, 1, 1],
    ]
    self.assertEqual(3, result.shape[0])
    for position, expected in enumerate(expected_rows):
        self.assertListEqual(expected, result.iloc[position, :].tolist())
def test_HandleUnknown_HaveOnlyKnown_ExpectSecondColumn(self):
    """handle_unknown='indicator' on two known values yields two-column rows."""
    train = ['A', 'B']
    encoder = encoders.BaseNEncoder(handle_unknown='indicator')
    result = encoder.fit_transform(train)

    expected_rows = [
        [0, 1],
        [1, 0],
    ]
    self.assertEqual(2, result.shape[0])
    for position, expected in enumerate(expected_rows):
        self.assertListEqual(expected, result.iloc[position, :].tolist())
def test_inverse_transform_13(self):
    """
    Test BaseN encoding round trip with handle_missing='value' and NaN in train
    """
    train = pd.DataFrame({'city': ['chicago', np.nan]})
    encoder = ce.BaseNEncoder(
        handle_missing='value',
        handle_unknown='value',
    )

    encoded = encoder.fit_transform(train)
    restored = inverse_transform(encoded, encoder)

    pd.testing.assert_frame_equal(train, restored)
def test_inverse_transform_12(self):
    """
    Test that inverse_transform of base-2 BaseN encoded data returns the original frame
    """
    letters = pd.Series(list('abcd'))
    train = letters.to_frame('letter')

    encoder = ce.BaseNEncoder(base=2)
    encoded = encoder.fit_transform(train)
    round_tripped = inverse_transform(encoded, encoder)

    pd.testing.assert_frame_equal(train, round_tripped)
def __init__(self, cols=None, drop_invariant=False, return_df=True,
             handle_unknown='value', handle_missing='value'):
    """Store the options and build the wrapped base-2 ``BaseNEncoder``.

    Each argument is kept as an attribute of the same name and forwarded to
    ``ce.BaseNEncoder`` together with ``base=2``; the encoder is stored as
    ``self.base_n_encoder``.
    """
    self.cols = cols
    self.drop_invariant = drop_invariant
    self.return_df = return_df
    self.handle_unknown = handle_unknown
    self.handle_missing = handle_missing

    encoder_options = {
        'base': 2,
        'cols': self.cols,
        'drop_invariant': self.drop_invariant,
        'return_df': self.return_df,
        'handle_unknown': self.handle_unknown,
        'handle_missing': self.handle_missing,
    }
    self.base_n_encoder = ce.BaseNEncoder(**encoder_options)
def test_inverse_transform_ce_basen(self):
    """
    Unit test: inverse transform of a base-3 BaseNEncoder on 'Age' and 'Sex'
    restores the clean titanic dataset
    """
    encoder = ce.BaseNEncoder(cols=['Age', 'Sex'], return_df=True, base=3)
    encoded = encoder.fit_transform(self.ds_titanic_clean)

    restored = inverse_transform(encoded, encoder)

    pd.testing.assert_frame_equal(restored, self.ds_titanic_clean)
def test_HaveIndicatorAndNanValue_ExpectNewColumn(self):
    """With handle_missing='indicator', a NaN in train gets its own code row."""
    train = pd.Series(['a', 'b', 'c', np.nan])
    encoder = encoders.BaseNEncoder(handle_missing='indicator', base=2)
    result = encoder.fit_transform(train)

    expected_rows = [
        [0, 0, 1],
        [0, 1, 0],
        [0, 1, 1],
        [1, 0, 0],  # the NaN row
    ]
    self.assertEqual(4, result.shape[0])
    for position, expected in enumerate(expected_rows):
        self.assertListEqual(expected, result.iloc[position, :].tolist())
def test_inverse_transform_HaveMissingAndNoUnknown_ExpectInversed(self):
    """Inverse of the encoded test frame is expected to equal the train frame.

    The encoder is fitted on a train frame containing NaN, with
    handle_missing='value' and handle_unknown='return_nan'; the test frame
    holds a value that was not present in train.
    """
    train = pd.DataFrame({'city': ['chicago', np.nan]})
    test = pd.DataFrame({'city': ['chicago', 'los angeles']})

    enc = encoders.BaseNEncoder(
        handle_missing='value',
        handle_unknown='return_nan',
    )
    enc.fit(train)
    encoded = enc.transform(test)
    restored = enc.inverse_transform(encoded)

    pd.testing.assert_frame_equal(train, restored)
def test_fit_transform_have_base_2_expect_Correct_Encoding(self):
    """Base-2 encoding of four distinct values yields the expected digit rows."""
    train = pd.Series(['a', 'b', 'c', 'd'])
    encoder = encoders.BaseNEncoder(base=2)
    result = encoder.fit_transform(train)

    expected_rows = [
        [0, 0, 1],
        [0, 1, 0],
        [0, 1, 1],
        [1, 0, 0],
    ]
    self.assertEqual(4, result.shape[0])
    for position, expected in enumerate(expected_rows):
        self.assertListEqual(expected, result.iloc[position, :].tolist())
def test_inverse_transform_14(self):
    """
    Test inverse_transform with NaN in train and handle_missing='return_nan',
    expecting the NaN to be present in the restored frame
    """
    train = pd.DataFrame({'city': ['chicago', np.nan]})
    encoder = ce.BaseNEncoder(
        handle_missing='return_nan',
        handle_unknown='value',
    )

    encoded = encoder.fit_transform(train)
    restored = inverse_transform(encoded, encoder)

    pd.testing.assert_frame_equal(train, restored)
def test_basen_np(self):
    """Smoke-test BaseNEncoder on inputs produced by ``create_array``.

    Fits on a 1000-row input and checks the transform of a 100-row input
    with ``verify_numeric``.
    """
    fit_data = self.create_array(n_rows=1000)
    transform_data = self.create_array(n_rows=100)

    encoder = encoders.BaseNEncoder(verbose=1)
    encoder.fit(fit_data, None)
    encoded = encoder.transform(transform_data)

    self.verify_numeric(encoded)
def __init__(self, encoder_type, columns_name=None):
    """Create the category encoder selected by ``encoder_type``.

    :param encoder_type: short name of the encoding scheme; one of the keys
        of the lookup table below. "OridinalEncoder" (historical spelling)
        and "OrdinalEncoder" both select the ordinal encoder.
    :param columns_name: list of feature (column) names, passed to the
        encoder as ``cols``.
    :raises ValueError: if ``encoder_type`` is not a supported name.
    """
    # Short name -> class name in the ``ce`` module. Class names are resolved
    # lazily with getattr so only the requested encoder class is looked up.
    encoder_class_names = {
        "BackwardDe": "BackwardDifferenceEncoder",  # backward difference coding
        "BaseN": "BaseNEncoder",                    # base-N coding
        "Binary": "BinaryEncoder",                  # binary coding
        "Catboost": "CatBoostEncoder",
        "Hash": "HashingEncoder",
        "Helmert": "HelmertEncoder",
        "JamesStein": "JamesSteinEncoder",
        "LOO": "LeaveOneOutEncoder",                # leave-one-out coding
        "ME": "MEstimateEncoder",                   # M-estimate encoder
        "OneHot": "OneHotEncoder",
        "OridinalEncoder": "OrdinalEncoder",        # ordinal coding (original, misspelled key)
        "OrdinalEncoder": "OrdinalEncoder",         # correctly spelled alias
        "Sum": "SumEncoder",                        # sum coding
        "Polynomial": "PolynomialEncoder",          # polynomial coding
        "Target": "TargetEncoder",                  # target coding
        "WOE": "WOEEncoder",                        # weight-of-evidence encoder
    }

    try:
        class_name = encoder_class_names[encoder_type]
    except (KeyError, TypeError):
        # TypeError covers unhashable values, which the previous equality
        # chain also rejected with ValueError. The message is Chinese for
        # "Please choose a valid encoding method".
        raise ValueError("请选择正确的编码方式") from None

    self.encoder = getattr(ce, class_name)(cols=columns_name)
def test_inverse_transform_16(self):
    """
    Test inverse_transform with handle_missing='value' and
    handle_unknown='return_nan' on a test frame holding both a NaN and a
    value absent from train
    """
    train = pd.DataFrame({'city': ['chicago', np.nan]})
    test = pd.DataFrame({'city': ['chicago', np.nan, 'los angeles']})
    expected = pd.DataFrame({'city': ['chicago', np.nan, np.nan]})

    encoder = ce.BaseNEncoder(
        handle_missing='value',
        handle_unknown='return_nan',
    )
    encoder.fit(train)
    encoded = encoder.transform(test)
    restored = inverse_transform(encoded, encoder)

    pd.testing.assert_frame_equal(expected, restored)
def test_HandleUnknown_HaveUnknown_ExpectIndicatorInTest(self):
    """With handle_unknown='indicator', a value unseen in train gets its own code row."""
    train = ['A', 'B', 'C']
    test = ['A', 'B', 'C', 'D']

    encoder = encoders.BaseNEncoder(handle_unknown='indicator')
    encoder.fit(train)
    result = encoder.transform(test)

    expected_rows = [
        [0, 0, 1],
        [0, 1, 0],
        [0, 1, 1],
        [1, 0, 0],  # 'D', not present in train
    ]
    self.assertEqual(4, result.shape[0])
    for position, expected in enumerate(expected_rows):
        self.assertListEqual(expected, result.iloc[position, :].tolist())
def test_HandleMissingIndicator_NanNoNanInTrain_ExpectAsNanColumn(self):
    """With handle_missing='indicator', a NaN seen only at transform time gets its own code row."""
    train = pd.Series(['a', 'b', 'c'])
    test = pd.Series(['a', 'b', 'c', np.nan])

    encoder = encoders.BaseNEncoder(handle_missing='indicator')
    encoder.fit(train)
    result = encoder.transform(test)

    expected_rows = [
        [0, 0, 1],
        [0, 1, 0],
        [0, 1, 1],
        [1, 0, 0],  # the NaN row
    ]
    self.assertEqual(4, result.shape[0])
    for position, expected in enumerate(expected_rows):
        self.assertListEqual(expected, result.iloc[position, :].tolist())
def Base_N_Coder(self, data, target, filename):
    """Base-N encode the categorical columns and write the combined dataset.

    ``self.refining(data, target)`` supplies the categorical frame, the
    remaining feature frame and the full frame. The categorical frame is
    BaseN-encoded, joined to the remaining features, the target column is
    joined last, and the result is handed to ``self.convert_to_csv``.
    Progress is reported through ``self.log``.
    """
    self.log.writeToLog('Performing Base N Encoding...')
    encodetype = 'base_ncoder'

    category, x, df = self.refining(data, target)
    self.log.writeToLog(
        f'Categorical columns to be encoded: {list(category.columns)}')

    numeric_features = ce.BaseNEncoder().fit_transform(category)
    self.log.writeToLog(f'Encoded as: {list(numeric_features.columns)}')

    dataset = x.join(numeric_features).join(df[target])
    self.convert_to_csv(data, dataset, encodetype, filename)
def test_inverse_transform_BothFieldsAreReturnNanWithNan_ExpectValueError(self):
    """inverse_transform warns when both handle_missing and handle_unknown are 'return_nan'.

    Despite the test name, the code under test is expected to emit a
    ``UserWarning`` (not raise); both the category and the text of the
    warning are checked.
    """
    train = pd.DataFrame({'city': ['chicago', np.nan]})
    test = pd.DataFrame({'city': ['chicago', 'los angeles']})

    enc = encoders.BaseNEncoder(handle_missing='return_nan', handle_unknown='return_nan')
    enc.fit(train)
    result = enc.transform(test)

    message = ('inverse_transform is not supported because transform impute '
               'the unknown category nan when encode city')

    # ``msg=`` on assertWarns is only the failure message and does not match
    # the warning text, so the text is asserted explicitly on the caught
    # warning instead.
    with self.assertWarns(UserWarning) as caught:
        enc.inverse_transform(result)
    self.assertEqual(message, str(caught.warning))
def test_inverse_transform_BothFieldsAreReturnNanWithNan_ExpectValueError(self):
    """inverse_transform warns when both handle_missing and handle_unknown are 'return_nan'.

    Despite the test name, the code under test is expected to emit exactly
    one recorded warning (not raise), whose text is checked.
    """
    train = pd.DataFrame({'city': ['chicago', np.nan]})
    test = pd.DataFrame({'city': ['chicago', 'los angeles']})

    enc = encoders.BaseNEncoder(handle_missing='return_nan', handle_unknown='return_nan')
    enc.fit(train)
    result = enc.transform(test)

    with warnings.catch_warnings(record=True) as w:
        # Force UserWarning to be recorded regardless of the ambient filters
        # (e.g. an "ignore" filter installed by the runner). Other categories
        # keep their existing filters so the count below is not inflated.
        warnings.simplefilter('always', UserWarning)
        enc.inverse_transform(result)

        self.assertEqual(1, len(w))
        self.assertEqual('inverse_transform is not supported because transform impute '
                         'the unknown category nan when encode city',
                         str(w[0].message))
def get_encoder_dict():
    """Return a dict mapping each encoder name to a newly constructed encoder instance."""
    named_factories = (
        ('OneHotEncoder', ce.OneHotEncoder),
        ('BinaryEncoder', ce.BinaryEncoder),
        ('HashingEncoder', ce.HashingEncoder),
        ('LabelEncoder', le.MultiColumnLabelEncoder),
        ('FrequencyEncoder', fe.FrequencyEncoder),
        ('TargetEncoder', ce.TargetEncoder),
        ('HelmertEncoder', ce.HelmertEncoder),
        ('JamesSteinEncoder', ce.JamesSteinEncoder),
        ('BaseNEncoder', ce.BaseNEncoder),
        ('SumEncoder', ce.SumEncoder),
    )
    # Instantiate in listing order so the dict keeps the same key order.
    return {name: factory() for name, factory in named_factories}
def __init__(self, verbose=0, cols=None, mapping=None, drop_invariant=False,
             return_df=True, handle_unknown='value', handle_missing='value'):
    """Build the wrapped base-2 ``BaseNEncoder`` and keep it as ``self.base_n_encoder``.

    All arguments are forwarded unchanged to ``ce.BaseNEncoder`` together
    with ``base=2``.
    """
    encoder_options = {
        'base': 2,
        'verbose': verbose,
        'cols': cols,
        'mapping': mapping,
        'drop_invariant': drop_invariant,
        'return_df': return_df,
        'handle_unknown': handle_unknown,
        'handle_missing': handle_missing,
    }
    self.base_n_encoder = ce.BaseNEncoder(**encoder_options)
def test_inverse_transform_contributions_ce_basen(self):
    """
    Unit test: inverse transform of contributions for a base-3 BaseNEncoder.

    Random contributions shaped like the encoded dataset are mapped back;
    the output must be a DataFrame with the shape of the clean titanic
    dataset and the same per-row sums as the input contributions.
    """
    encoder = ce.BaseNEncoder(cols=['Age', 'Sex'], return_df=True, base=3)
    encoded = encoder.fit_transform(self.ds_titanic_clean)

    n_rows, n_cols = encoded.shape[0], encoded.shape[1]
    contributions = pd.DataFrame(
        data=np.random.rand(n_rows, n_cols),
        columns=encoded.columns,
        index=self.ds_titanic_clean.index,
    )

    output = inverse_transform_contributions(contributions, encoder)

    assert isinstance(output, pd.DataFrame)
    assert self.ds_titanic_clean.shape == output.shape
    np.testing.assert_almost_equal(
        contributions.values.sum(axis=1),
        output.values.sum(axis=1),
    )