def test_inv_transform_ct_4(self):
    """
    Check inverse_transform on a ColumnTransformer that wraps a single
    ce.TargetEncoder and uses remainder='passthrough'.

    The encoded frame is given placeholder column names; the restored frame
    is expected to expose 'target_'-prefixed names for the encoded columns
    and the 'other' column with its test values.
    """
    target = pd.DataFrame(data=[0, 1, 1, 1], columns=['y'])
    fit_frame = pd.DataFrame({
        'city': ['chicago', 'paris', 'paris', 'chicago'],
        'state': ['US', 'FR', 'FR', 'US'],
        'other': ['A', 'B', 'B', 'B'],
    })
    column_transformer = ColumnTransformer(
        transformers=[('target', ce.TargetEncoder(), ['city', 'state'])],
        remainder='passthrough',
    )
    column_transformer.fit(fit_frame, target)

    raw = pd.DataFrame({
        'city': ['chicago', 'chicago', 'paris'],
        'state': ['US', 'FR', 'FR'],
        'other': ['A', 'B', 'C'],
    })
    encoded = pd.DataFrame(column_transformer.transform(raw))
    encoded.columns = ['col1', 'col2', 'other']

    wanted = pd.DataFrame(
        data={
            'target_city': ['chicago', 'chicago', 'paris'],
            'target_state': ['US', 'FR', 'FR'],
            'other': ['A', 'B', 'C'],
        },
        dtype=object,
    )
    restored = inverse_transform(encoded, column_transformer)
    pd.testing.assert_frame_equal(restored, wanted)
def test_inv_transform_ct_17(self):
    """
    Check inverse_transform on a ColumnTransformer that wraps
    skp.OrdinalEncoder and uses remainder='passthrough'.

    Restored columns are expected under 'ordinal_'-prefixed names, next to
    the 'other' column.
    """
    target = pd.DataFrame(data=[0, 1], columns=['y'])
    fit_frame = pd.DataFrame({
        'city': ['chicago', 'paris'],
        'state': ['US', 'FR'],
        'other': ['A', 'B'],
    })
    column_transformer = ColumnTransformer(
        transformers=[('ordinal', skp.OrdinalEncoder(), ['city', 'state'])],
        remainder='passthrough',
    )
    column_transformer.fit(fit_frame, target)

    raw = pd.DataFrame({
        'city': ['chicago', 'chicago', 'paris'],
        'state': ['US', 'FR', 'FR'],
        'other': ['A', 'B', 'C'],
    })
    wanted = pd.DataFrame({
        'ordinal_city': ['chicago', 'chicago', 'paris'],
        'ordinal_state': ['US', 'FR', 'FR'],
        'other': ['A', 'B', 'C'],
    })

    encoded = pd.DataFrame(column_transformer.transform(raw))
    encoded.columns = ['col1', 'col2', 'other']
    restored = inverse_transform(encoded, column_transformer)
    pd.testing.assert_frame_equal(restored, wanted)
def test_inv_transform_ct_12(self):
    """
    Check inverse_transform on a ColumnTransformer that wraps a single
    ce.OneHotEncoder and uses remainder='drop'.

    Only the two encoded features are expected back, under 'onehot_'-prefixed
    names; the 'other' column is absent from the expected frame.
    """
    target = pd.DataFrame(data=[0, 1], columns=['y'])
    fit_frame = pd.DataFrame({
        'city': ['chicago', 'paris'],
        'state': ['US', 'FR'],
        'other': ['A', 'B'],
    })
    column_transformer = ColumnTransformer(
        transformers=[('onehot', ce.OneHotEncoder(), ['city', 'state'])],
        remainder='drop',
    )
    column_transformer.fit(fit_frame, target)

    raw = pd.DataFrame({
        'city': ['chicago', 'chicago', 'paris'],
        'state': ['US', 'FR', 'FR'],
        'other': ['A', 'B', 'C'],
    })
    wanted = pd.DataFrame({
        'onehot_city': ['chicago', 'chicago', 'paris'],
        'onehot_state': ['US', 'FR', 'FR'],
    })

    encoded = pd.DataFrame(column_transformer.transform(raw))
    # Two placeholder groups (col1, col2), each with two one-hot columns.
    encoded.columns = [f'col{group}_{bit}' for group in (1, 2) for bit in (0, 1)]
    restored = inverse_transform(encoded, column_transformer)
    pd.testing.assert_frame_equal(restored, wanted)
def test_inverse_transform_11(self):
    """
    Binary encoding: fit ce.BinaryEncoder on 'city' and 'state', transform a
    test frame holding a city value absent from the training frame
    ('monaco'), and check the inverse transform against an expected frame
    where that value is NaN.
    """
    encoded_cols = ['city', 'state']
    fit_frame = pd.DataFrame({
        'city': ['chicago', 'paris'],
        'state': ['US', 'FR'],
        'other': ['A', np.nan],
    })
    raw = pd.DataFrame({
        'city': ['chicago', 'paris', 'monaco'],
        'state': ['US', 'FR', 'FR'],
        'other': ['A', np.nan, 'B'],
    })
    wanted = pd.DataFrame({
        'city': ['chicago', 'paris', np.nan],
        'state': ['US', 'FR', 'FR'],
        'other': ['A', np.nan, 'B'],
    })

    encoder = ce.BinaryEncoder(cols=encoded_cols)
    encoder = encoder.fit(fit_frame)
    encoded = encoder.transform(raw)
    restored = inverse_transform(encoded, encoder)
    pd.testing.assert_frame_equal(restored, wanted)
def test_inv_transform_ct_2(self):
    """
    Check inverse_transform on a ColumnTransformer combining ce.OneHotEncoder
    and skp.OneHotEncoder on the same columns, with remainder='passthrough'.

    The frames carry a string index; the expected frame keeps that index and
    exposes one restored column per (transformer name, feature) pair.
    """
    row_labels = ['index1', 'index2', 'index3']
    fit_frame = pd.DataFrame(
        {'city': ['chicago', 'paris'], 'state': ['US', 'FR'], 'other': ['A', 'B']},
        index=['index1', 'index2'],
    )
    column_transformer = ColumnTransformer(
        transformers=[
            ('onehot_ce', ce.OneHotEncoder(), ['city', 'state']),
            ('onehot_skp', skp.OneHotEncoder(), ['city', 'state']),
        ],
        remainder='passthrough',
    )
    column_transformer.fit(fit_frame)

    raw = pd.DataFrame(
        {
            'city': ['chicago', 'chicago', 'paris'],
            'state': ['US', 'FR', 'FR'],
            'other': ['A', 'B', 'C'],
        },
        index=row_labels,
    )
    wanted = pd.DataFrame(
        {
            'onehot_ce_city': ['chicago', 'chicago', 'paris'],
            'onehot_ce_state': ['US', 'FR', 'FR'],
            'onehot_skp_city': ['chicago', 'chicago', 'paris'],
            'onehot_skp_state': ['US', 'FR', 'FR'],
            'other': ['A', 'B', 'C'],
        },
        index=row_labels,
    )

    encoded = pd.DataFrame(column_transformer.transform(raw))
    # col1_0, col1_1, ..., col4_0, col4_1 followed by the passthrough column.
    encoded.columns = [
        f'col{group}_{bit}' for group in range(1, 5) for bit in range(2)
    ] + ['other']
    encoded.index = row_labels
    restored = inverse_transform(encoded, column_transformer)
    pd.testing.assert_frame_equal(restored, wanted)
def test_inv_transform_ct_22(self):
    """
    Check inverse_transform on a ColumnTransformer that wraps
    skp.PowerTransformer on two numeric columns, with remainder='passthrough'.
    """
    target = pd.DataFrame(data=[0, 1], columns=['y'])
    fit_frame = pd.DataFrame({
        'num1': [0, 1],
        'num2': [0, 2],
        'other': ['A', 'B'],
    })
    column_transformer = ColumnTransformer(
        transformers=[('power', skp.PowerTransformer(), ['num1', 'num2'])],
        remainder='passthrough',
    )
    column_transformer.fit(fit_frame, target)

    raw = pd.DataFrame({
        'num1': [0, 1, 1],
        'num2': [0, 2, 3],
        'other': ['A', 'B', 'C'],
    })
    wanted = pd.DataFrame({
        'power_num1': [0.0, 1.0, 1.0],
        'power_num2': [0.0, 1.9999999997665876, 3.000000000169985],
        'other': ['A', 'B', 'C'],
    })

    encoded = pd.DataFrame(column_transformer.transform(raw))
    encoded.columns = ['col1', 'col2', 'other']
    restored = inverse_transform(encoded, column_transformer)
    pd.testing.assert_frame_equal(restored, wanted)
def test_inv_transform_ct_18(self):
    """
    Check inverse_transform on a ColumnTransformer that wraps
    skp.StandardScaler on two numeric columns, with remainder='passthrough'.

    The expected frame is built with dtype=object.
    """
    target = pd.DataFrame(data=[0, 1], columns=['y'])
    fit_frame = pd.DataFrame({
        'num1': [0, 1],
        'num2': [0, 2],
        'other': ['A', 'B'],
    })
    column_transformer = ColumnTransformer(
        transformers=[('std', skp.StandardScaler(), ['num1', 'num2'])],
        remainder='passthrough',
    )
    column_transformer.fit(fit_frame, target)

    raw = pd.DataFrame({
        'num1': [0, 1, 1],
        'num2': [0, 2, 3],
        'other': ['A', 'B', 'C'],
    })
    wanted = pd.DataFrame(
        {
            'std_num1': [0.0, 1.0, 1.0],
            'std_num2': [0.0, 2.0, 3.0],
            'other': ['A', 'B', 'C'],
        },
        dtype=object,
    )

    encoded = pd.DataFrame(column_transformer.transform(raw))
    encoded.columns = ['col1', 'col2', 'other']
    restored = inverse_transform(encoded, column_transformer)
    pd.testing.assert_frame_equal(restored, wanted)
def test_inv_transform_ct_23(self):
    """
    Check inverse_transform on a ColumnTransformer that wraps
    skp.QuantileTransformer(n_quantiles=2) with remainder='drop'.

    The transformer step is registered under the name 'power', which is why
    the expected columns are 'power_num1' and 'power_num2'.
    """
    target = pd.DataFrame(data=[0, 1], columns=['y'])
    fit_frame = pd.DataFrame({
        'num1': [0, 1],
        'num2': [0, 2],
        'other': ['A', 'B'],
    })
    quantile_step = ('power', skp.QuantileTransformer(n_quantiles=2), ['num1', 'num2'])
    column_transformer = ColumnTransformer(
        transformers=[quantile_step],
        remainder='drop',
    )
    column_transformer.fit(fit_frame, target)

    raw = pd.DataFrame({
        'num1': [0, 1, 1],
        'num2': [0, 2, 3],
        'other': ['A', 'B', 'C'],
    })
    wanted = pd.DataFrame({
        'power_num1': [0.0, 1.0, 1.0],
        'power_num2': [0.0, 2.0, 2.0],
    })

    encoded = pd.DataFrame(column_transformer.transform(raw))
    encoded.columns = ['col1', 'col2']
    restored = inverse_transform(encoded, column_transformer)
    pd.testing.assert_frame_equal(restored, wanted)
def test_inverse_transform_3(self):
    """
    Target encoding: fit ce.TargetEncoder on 'city' and 'state' with a binary
    target, transform a test frame and check that inverse_transform yields a
    frame equal to the test values.
    """
    fit_frame = pd.DataFrame({
        'city': ['chicago', 'paris', 'paris', 'chicago', 'chicago'],
        'state': ['US', 'FR', 'FR', 'US', 'US'],
        'other': ['A', 'A', np.nan, 'B', 'B'],
    })
    target = pd.DataFrame(data=[0, 1, 1, 0, 1], columns=['y'])

    # The test frame and the expected frame hold the same values but are
    # built as two independent objects.
    rows = {
        'city': ['chicago', 'paris', 'paris'],
        'state': ['US', 'FR', 'FR'],
        'other': ['A', np.nan, np.nan],
    }
    raw = pd.DataFrame(rows)
    wanted = pd.DataFrame(rows)

    encoder = ce.TargetEncoder(cols=['city', 'state'])
    encoder = encoder.fit(fit_frame, target)
    encoded = encoder.transform(raw)
    restored = inverse_transform(encoded, encoder)
    pd.testing.assert_frame_equal(wanted, restored)
def __init__(self,
             explainer: SmartExplainer,
             project_info_file: str,
             x_train: Optional[pd.DataFrame] = None,
             y_train: Optional[pd.DataFrame] = None,
             y_test: Optional[pd.DataFrame] = None,
             config: Optional[dict] = None):
    """
    Collect everything the report needs from the explainer, the project
    information file and the optional datasets, then validate the config.

    Parameters
    ----------
    explainer : SmartExplainer
        Explainer whose ``preprocessing``, ``postprocessing``, ``x_pred``,
        ``x_init``, ``columns_dict``, ``model`` and ``title_story``
        attributes are read here.
    project_info_file : str
        Path handed to ``load_yml``; the result is stored in ``metadata``.
    x_train : pandas.DataFrame, optional
        Training set. When given, it is passed through ``inverse_transform``
        with the explainer preprocessing, then through
        ``apply_postprocessing`` if the explainer has a postprocessing.
    y_train : pandas.DataFrame, optional
        Training target, forwarded to ``_get_values_and_name``.
    y_test : pandas.DataFrame, optional
        Test target, forwarded to ``_get_values_and_name``.
    config : dict, optional
        Report configuration. Keys read here: ``'title_story'``,
        ``'title_description'`` and ``'metrics'``.

    Raises
    ------
    ValueError
        If ``config['metrics']`` is not a non-empty list of dict, if a
        metric holds a key other than 'path', 'name' or 'use_proba_values',
        or if 'use_proba_values' is not a boolean.
    """
    self.explainer = explainer
    self.metadata = load_yml(path=project_info_file)
    self.x_train_init = x_train
    if x_train is not None:
        self.x_train_pre = inverse_transform(x_train, self.explainer.preprocessing)
        if self.explainer.postprocessing:
            self.x_train_pre = apply_postprocessing(
                self.x_train_pre, self.explainer.postprocessing)
    else:
        self.x_train_pre = None
    self.x_pred = self.explainer.x_pred
    self.config = config if config is not None else dict()
    self.col_names = list(self.explainer.columns_dict.values())
    self.df_train_test = self._create_train_test_df(test=self.x_pred, train=self.x_train_pre)
    self.y_pred = self.explainer.model.predict(self.explainer.x_init)
    self.y_test, target_name_test = self._get_values_and_name(y_test, 'target')
    self.y_train, target_name_train = self._get_values_and_name(y_train, 'target')
    self.target_name = target_name_train or target_name_test

    # Title priority: explicit config entry, then the explainer's own
    # title, then a generic default.
    if 'title_story' in self.config:
        self.title_story = self.config['title_story']
    elif self.explainer.title_story != '':
        self.title_story = self.explainer.title_story
    else:
        self.title_story = 'Shapash report'
    self.title_description = self.config.get('title_description', '')

    print_css_style()
    print_javascript_misc()

    if 'metrics' in self.config:
        metrics = self.config['metrics']
        # Validate every entry, not only the first one, and reject an empty
        # list with the documented ValueError instead of an IndexError.
        if (not isinstance(metrics, list)
                or not metrics
                or not all(isinstance(metric, dict) for metric in metrics)):
            raise ValueError("The metrics parameter expects a list of dict.")
        for metric in metrics:
            for key in metric:
                if key not in ['path', 'name', 'use_proba_values']:
                    raise ValueError(
                        f"Unknown key : {key}. Key should be in ['path', 'name', 'use_proba_values']"
                    )
                if key == 'use_proba_values' and not isinstance(
                        metric['use_proba_values'], bool):
                    raise ValueError(
                        '"use_proba_values" metric key expects a boolean value.'
                    )
def test_inverse_transform_ce_ordinal(self):
    """
    Round trip with ce.OrdinalEncoder on 'Age' and 'Sex': encoding
    self.ds_titanic_clean and inverting it must give back the same frame.
    """
    reference = self.ds_titanic_clean
    encoder = ce.OrdinalEncoder(cols=['Age', 'Sex'], return_df=True)
    encoded = encoder.fit_transform(reference)
    restored = inverse_transform(encoded, encoder)
    pd.testing.assert_frame_equal(restored, self.ds_titanic_clean)
def test_inverse_transform_13(self):
    """
    BaseN round trip: a frame containing NaN, encoded by ce.BaseNEncoder with
    handle_missing='value' and handle_unknown='value', must be restored
    unchanged by inverse_transform.
    """
    source = pd.DataFrame({'city': ['chicago', np.nan]})
    options = {'handle_missing': 'value', 'handle_unknown': 'value'}
    encoder = ce.BaseNEncoder(**options)
    encoded = encoder.fit_transform(source)
    restored = inverse_transform(encoded, encoder)
    pd.testing.assert_frame_equal(source, restored)
def test_inverse_transform_12(self):
    """
    BaseN round trip with base=2: a single 'letter' column holding
    'a'..'d' must be restored unchanged by inverse_transform.
    """
    letters = pd.Series(list('abcd'))
    source = letters.to_frame('letter')
    encoder = ce.BaseNEncoder(base=2)
    encoded = encoder.fit_transform(source)
    restored = inverse_transform(encoded, encoder)
    pd.testing.assert_frame_equal(source, restored)
def test_inverse_transform_5(self):
    """
    Ordinal round trip: a frame containing NaN, encoded by ce.OrdinalEncoder
    with handle_missing='value' and handle_unknown='value', must be restored
    unchanged by inverse_transform.
    """
    source = pd.DataFrame({'city': ['chicago', np.nan]})
    options = {'handle_missing': 'value', 'handle_unknown': 'value'}
    encoder = ce.OrdinalEncoder(**options)
    encoded = encoder.fit_transform(source)
    restored = inverse_transform(encoded, encoder)
    pd.testing.assert_frame_equal(source, restored)
def test_inverse_transform_20(self):
    """
    One-hot round trip: a frame containing NaN, encoded by ce.OneHotEncoder
    with handle_missing='value' and handle_unknown='value', must be restored
    unchanged by inverse_transform.
    """
    source = pd.DataFrame({'city': ['chicago', np.nan]})
    options = {'handle_missing': 'value', 'handle_unknown': 'value'}
    encoder = ce.OneHotEncoder(**options)
    encoded = encoder.fit_transform(source)
    restored = inverse_transform(encoded, encoder)
    pd.testing.assert_frame_equal(source, restored)
def test_inverse_transform_1(self):
    """
    Without a preprocessing argument, inverse_transform must return a frame
    equal to its input.
    """
    source = pd.DataFrame({
        'city': ['chicago', 'paris'],
        'state': ['US', 'FR'],
    })
    restored = inverse_transform(source)
    pd.testing.assert_frame_equal(restored, source)
def test_inverse_transform_4(self):
    """
    Ordinal encoding with a category absent from the training frame: the
    value 'los angeles' is expected to come back as NaN after the inverse
    transform.
    """
    fit_frame = pd.DataFrame({'city': ['chicago', 'st louis']})
    raw = pd.DataFrame({'city': ['chicago', 'los angeles']})
    wanted = pd.DataFrame({'city': ['chicago', np.nan]})

    encoder = ce.OrdinalEncoder(handle_missing='value', handle_unknown='value')
    encoder.fit(fit_frame)
    encoded = encoder.transform(raw)
    restored = inverse_transform(encoded, encoder)
    pd.testing.assert_frame_equal(wanted, restored)
def test_inverse_transform_7(self):
    """
    Ordinal encoding with handle_missing='return_nan' and
    handle_unknown='return_nan': the test frame holds a category absent from
    the training frame, and the inverse transform is compared with the
    training frame (['chicago', NaN]).
    """
    fit_frame = pd.DataFrame({'city': ['chicago', np.nan]})
    raw = pd.DataFrame({'city': ['chicago', 'los angeles']})

    encoder = ce.OrdinalEncoder(handle_missing='return_nan',
                                handle_unknown='return_nan')
    encoder.fit(fit_frame)
    encoded = encoder.transform(raw)
    restored = inverse_transform(encoded, encoder)
    pd.testing.assert_frame_equal(fit_frame, restored)
def test_inverse_transform_14(self):
    """
    BaseN round trip with handle_missing='return_nan' and
    handle_unknown='value': a frame containing NaN must be restored unchanged
    by inverse_transform.
    """
    source = pd.DataFrame({'city': ['chicago', np.nan]})
    options = {'handle_missing': 'return_nan', 'handle_unknown': 'value'}
    encoder = ce.BaseNEncoder(**options)
    encoded = encoder.fit_transform(source)
    restored = inverse_transform(encoded, encoder)
    pd.testing.assert_frame_equal(source, restored)
def test_inverse_transform_23(self):
    """
    One-hot encoding with handle_missing='value' and
    handle_unknown='return_nan': the test frame holds a category absent from
    the training frame, and the inverse transform is compared with the
    training frame (['chicago', NaN]).
    """
    fit_frame = pd.DataFrame({'city': ['chicago', np.nan]})
    raw = pd.DataFrame({'city': ['chicago', 'los angeles']})

    encoder = ce.OneHotEncoder(handle_missing='value',
                               handle_unknown='return_nan')
    encoder.fit(fit_frame)
    encoded = encoder.transform(raw)
    restored = inverse_transform(encoded, encoder)
    pd.testing.assert_frame_equal(fit_frame, restored)
def test_inverse_transform_16(self):
    """
    BaseN encoding with handle_missing='value' and
    handle_unknown='return_nan': a test frame with a known value, a NaN and
    an unseen category is expected to come back as ['chicago', NaN, NaN].
    """
    fit_frame = pd.DataFrame({'city': ['chicago', np.nan]})
    raw = pd.DataFrame({'city': ['chicago', np.nan, 'los angeles']})
    wanted = pd.DataFrame({'city': ['chicago', np.nan, np.nan]})

    encoder = ce.BaseNEncoder(handle_missing='value',
                              handle_unknown='return_nan')
    encoder.fit(fit_frame)
    encoded = encoder.transform(raw)
    restored = inverse_transform(encoded, encoder)
    pd.testing.assert_frame_equal(wanted, restored)
def test_inverse_transform_19(self):
    """
    One-hot round trip with use_cat_names=False on two columns, 'match' and
    'match_box': the single-row frame must be restored unchanged by
    inverse_transform.
    """
    source = pd.DataFrame({
        'match': pd.Series('box_-1'),
        'match_box': pd.Series(-1),
    })
    one_hot = ce.OneHotEncoder(cols=['match', 'match_box'], use_cat_names=False)
    encoded = one_hot.fit_transform(source)
    restored = inverse_transform(encoded, one_hot)
    pd.testing.assert_frame_equal(source, restored)
def test_inverse_transform_22(self):
    """
    One-hot encoding with handle_missing='return_nan' and
    handle_unknown='return_nan': a category absent from the training frame
    is expected to come back as NaN.
    """
    fit_frame = pd.DataFrame({'city': ['chicago', np.nan]})
    raw = pd.DataFrame({'city': ['chicago', 'los angeles']})
    wanted = pd.DataFrame({'city': ['chicago', np.nan]})

    encoder = ce.OneHotEncoder(handle_missing='return_nan',
                               handle_unknown='return_nan')
    encoder.fit(fit_frame)
    encoded = encoder.transform(raw)
    restored = inverse_transform(encoded, encoder)
    pd.testing.assert_frame_equal(restored, wanted)
def test_inv_transform_ct_3(self):
    """
    Check inverse_transform with a list preprocessing made of a
    ColumnTransformer (ce.OneHotEncoder + skp.OneHotEncoder, passthrough),
    one mapping dictionary and a list of two mapping dictionaries.

    The dictionaries target 'onehot_ce_city', 'other' and 'onehot_ce_state';
    the expected frame holds the mapping index labels for those columns.
    """
    row_labels = ['index1', 'index2', 'index3']
    fit_frame = pd.DataFrame(
        {'city': ['chicago', 'paris'], 'state': ['US', 'FR'], 'other': ['A', 'B']},
        index=['index1', 'index2'],
    )
    column_transformer = ColumnTransformer(
        transformers=[
            ('onehot_ce', ce.OneHotEncoder(), ['city', 'state']),
            ('onehot_skp', skp.OneHotEncoder(), ['city', 'state']),
        ],
        remainder='passthrough',
    )
    column_transformer.fit(fit_frame)

    raw = pd.DataFrame(
        {
            'city': ['chicago', 'chicago', 'paris'],
            'state': ['US', 'FR', 'FR'],
            'other': ['A', 'B', 'C'],
        },
        index=row_labels,
    )
    wanted = pd.DataFrame(
        {
            'onehot_ce_city': ['CH', 'CH', 'PR'],
            'onehot_ce_state': ['US-FR', 'US-FR', 'US-FR'],
            'onehot_skp_city': ['chicago', 'chicago', 'paris'],
            'onehot_skp_state': ['US', 'FR', 'FR'],
            'other': ['A-B', 'A-B', 'C'],
        },
        index=row_labels,
    )

    encoded = pd.DataFrame(column_transformer.transform(raw))
    # col1_0, col1_1, ..., col4_0, col4_1 followed by the passthrough column.
    encoded.columns = [
        f'col{group}_{bit}' for group in range(1, 5) for bit in range(2)
    ] + ['other']
    encoded.index = row_labels

    city_mapping = {
        'col': 'onehot_ce_city',
        'mapping': pd.Series(data=['chicago', 'paris'], index=['CH', 'PR']),
        'data_type': 'object',
    }
    other_mapping = {
        'col': 'other',
        'mapping': pd.Series(data=['A', 'B', 'C'], index=['A-B', 'A-B', 'C']),
        'data_type': 'object',
    }
    state_mapping = {
        'col': 'onehot_ce_state',
        'mapping': pd.Series(data=['US', 'FR'], index=['US-FR', 'US-FR']),
        'data_type': 'object',
    }
    grouped_mappings = [other_mapping, state_mapping]

    restored = inverse_transform(
        encoded, [column_transformer, city_mapping, grouped_mappings])
    pd.testing.assert_frame_equal(restored, wanted)
def test_inverse_transform_17(self):
    """
    BaseN encoding on two columns: a hand-written encoded frame
    (city_0/city_1/state_0/state_1) must be inverted back to the training
    frame.
    """
    fit_frame = pd.DataFrame({
        'city': ['chicago', 'paris'],
        'state': ['US', 'FR'],
    })
    already_encoded = pd.DataFrame({
        'city_0': [0, 1],
        'city_1': [1, 0],
        'state_0': [0, 1],
        'state_1': [1, 0],
    })

    encoder = ce.BaseNEncoder(
        cols=['city', 'state'],
        handle_missing='value',
        handle_unknown='return_nan',
    )
    encoder.fit(fit_frame)
    restored = inverse_transform(already_encoded, encoder)
    pd.testing.assert_frame_equal(restored, fit_frame)
def test_inverse_transform_25(self):
    """
    Dict preprocessing: a single mapping dictionary on the 'state' column is
    passed to inverse_transform; 'FR-1' and 'FR-2' are both expected to
    become 'FR' while the other columns stay as they are.
    """
    source = pd.DataFrame({
        'city': ['chicago', 'paris-1', 'paris-2'],
        'state': ['US', 'FR-1', 'FR-2'],
        'other': ['A', 'B', np.nan],
    })
    wanted = pd.DataFrame({
        'city': ['chicago', 'paris-1', 'paris-2'],
        'state': ['US', 'FR', 'FR'],
        'other': ['A', 'B', np.nan],
    })
    state_mapping = {
        'col': 'state',
        'mapping': pd.Series(data=['US', 'FR-1', 'FR-2'], index=['US', 'FR', 'FR']),
        'data_type': 'object',
    }
    restored = inverse_transform(source, state_mapping)
    pd.testing.assert_frame_equal(restored, wanted)
def test_inverse_transform_10(self):
    """
    Ordinal encoding on two columns: a hand-written encoded frame (codes 1
    and 2 for 'city' and 'state') must be inverted back to the category
    labels, leaving the 'other' column as it is.
    """
    fit_frame = pd.DataFrame({
        'city': ['chicago', 'paris'],
        'state': ['US', 'FR'],
        'other': ['a', 'b'],
    })
    already_encoded = pd.DataFrame({
        'city': [1, 2, 2],
        'state': [1, 2, 2],
        'other': ['a', 'b', 'a'],
    })
    wanted = pd.DataFrame({
        'city': ['chicago', 'paris', 'paris'],
        'state': ['US', 'FR', 'FR'],
        'other': ['a', 'b', 'a'],
    })

    encoder = ce.OrdinalEncoder(cols=['city', 'state'])
    encoder.fit(fit_frame)
    restored = inverse_transform(already_encoded, encoder)
    pd.testing.assert_frame_equal(restored, wanted)
def compile(self, x, model, explainer=None, contributions=None, y_pred=None, preprocessing=None, postprocessing=None, title_story: str = None):
    """
    The compile method is the first step to understand model and prediction.
    It performs the sorting of contributions, the reverse preprocessing steps
    and all the calculations necessary for a quick display of plots and an
    efficient display of the summary of explanation.
    Most of the parameters are optional, but all of them help to display
    results that can be understood.

    This step can last a few moments with large datasets.

    Parameters
    ----------
    x : pandas.DataFrame
        Prediction set, in the preprocessed form the model can be applied
        to. It is stored as ``x_init``; ``x_pred`` is derived from it by
        calling ``inverse_transform`` with ``preprocessing``.
    model : model object
        Model used for the consistency checks. The model object can also be
        used by some methods to compute predict and predict_proba values.
    explainer : explainer object
        Explainer must be a shap object. Cannot be combined with
        ``contributions``.
    contributions : pandas.DataFrame, np.ndarray or list
        Single or multiple contributions (multi-class) to handle.
        If pandas.DataFrame, the index and columns should be shared with the
        prediction set.
        If np.ndarray, index and columns will be generated according to the
        x dataset.
        Cannot be combined with ``explainer``. When omitted, contributions
        are obtained from ``shap_contributions``.
    y_pred : pandas.Series or pandas.DataFrame, optional (default: None)
        Prediction values (1 column only).
        The index must be identical to the index of x_pred.
        This is an interesting parameter for more explicit outputs.
        Shapash lets users define their own predict, as they may wish to set
        their own threshold (classification).
    preprocessing : category_encoders, ColumnTransformer, list, dict, optional (default: None)
        Different types of preprocessing are available:

        - A single category_encoders
          (OrdinalEncoder/OnehotEncoder/BaseNEncoder/BinaryEncoder/TargetEncoder)
        - A single ColumnTransformer with scikit-learn encoding or
          category_encoders transformers
        - A list with multiple category_encoders with optional
          (dict, list of dict)
        - A list with a single ColumnTransformer with optional
          (dict, list of dict)
        - A dict
        - A list of dict
    postprocessing : dict, optional (default: None)
        Dictionary of postprocessing modifications to apply to the x_pred
        dataframe.
        Dictionary with feature names as keys (or numbers, or labels
        referencing feature names), which modifies the dataset feature by
        feature.

        Different types of postprocessing are available, but the syntax is
        always the same: one key per feature, 5 different types of
        modifications::

            {
                'feature1': {'type': 'prefix', 'rule': 'age: '},
                'feature2': {'type': 'suffix', 'rule': '$/week '},
                'feature3': {'type': 'transcoding',
                             'rule': {'code1': 'single', 'code2': 'married'}},
                'feature4': {'type': 'regex',
                             'rule': {'in': 'AND', 'out': ' & '}},
                'feature5': {'type': 'case', 'rule': 'lower'}
            }

        Only one transformation per feature is possible.
    title_story : str (default: None)
        The default title is empty. You can specify a custom title which can
        be used in the webapp, or by other methods.

    Raises
    ------
    ValueError
        If both ``explainer`` and ``contributions`` are specified.

    Example
    --------
    >>> xpl.compile(x=xtest_df,model=my_model)
    """
    # x is kept untouched for the model; x_pred is its inverse-transformed
    # counterpart.
    self.x_init = x
    self.x_pred = inverse_transform(self.x_init, preprocessing)
    self.preprocessing = preprocessing
    self.model = model
    self._case, self._classes = self.check_model()
    self.check_label_dict()
    if self.label_dict:
        self.inv_label_dict = {v: k for k, v in self.label_dict.items()}
    # explainer and contributions are mutually exclusive inputs.
    if explainer is not None and contributions is not None:
        raise ValueError("You have to specify just one of these arguments: explainer, contributions")
    # No contributions supplied: obtain them (and the explainer actually
    # used) from shap_contributions.
    if contributions is None:
        contributions, explainer = shap_contributions(model, self.x_init, self.check_explainer(explainer))
    adapt_contrib = self.adapt_contributions(contributions)
    self.state = self.choose_state(adapt_contrib)
    self.contributions = self.apply_preprocessing(self.validate_contributions(adapt_contrib), preprocessing)
    self.check_contributions()
    self.explainer = explainer
    self.y_pred = self.check_y_pred(y_pred)
    # Position -> column name lookup and its inverse, built from x_pred.
    self.columns_dict = {i: col for i, col in enumerate(self.x_pred.columns)}
    self.inv_columns_dict = {v: k for k, v in self.columns_dict.items()}
    self.check_features_dict()
    self.inv_features_dict = {v: k for k, v in self.features_dict.items()}
    postprocessing = self.modify_postprocessing(postprocessing)
    self.check_postprocessing(postprocessing)
    self.postprocessing_modifications = self.check_postprocessing_modif_strings(postprocessing)
    self.postprocessing = postprocessing
    # Keep a copy of x_pred taken before postprocessing overwrites it.
    if self.postprocessing_modifications:
        self.x_contrib_plot = copy.deepcopy(self.x_pred)
    self.x_pred = self.apply_postprocessing(postprocessing)
    self.data = self.state.assign_contributions(
        self.state.rank_contributions(
            self.contributions,
            self.x_pred
        )
    )
    self.features_imp = None
    self.features_desc = self.check_features_desc()
    # title_story is only overwritten when a value is explicitly provided.
    if title_story is not None:
        self.title_story = title_story
def test_inverse_transform_26(self):
    """
    Chain five category_encoders (one-hot, binary, ordinal, baseN, target),
    each fitted on the output of the previous one, then invert the fully
    encoded test frame with a list preprocessing holding the five encoders,
    one mapping dictionary and a list of two mapping dictionaries.
    """
    def missing_mapping(column, first, second):
        # Mapping dictionary whose third entry maps the label 'missing' to NaN.
        return {
            'col': column,
            'mapping': pd.Series(data=[first, second, np.nan],
                                 index=[first, second, 'missing']),
            'data_type': 'object',
        }

    row_labels = ['index1', 'index2', 'index3']
    fit_frame = pd.DataFrame({
        'Onehot1': ['A', 'B', 'A', 'B'],
        'Onehot2': ['C', 'D', 'C', 'D'],
        'Binary1': ['E', 'F', 'E', 'F'],
        'Binary2': ['G', 'H', 'G', 'H'],
        'Ordinal1': ['I', 'J', 'I', 'J'],
        'Ordinal2': ['K', 'L', 'K', 'L'],
        'BaseN1': ['M', 'N', 'M', 'N'],
        'BaseN2': ['O', 'P', 'O', 'P'],
        'Target1': ['Q', 'R', 'Q', 'R'],
        'Target2': ['S', 'T', 'S', 'T'],
        'other': ['other', np.nan, 'other', 'other'],
    })
    raw = pd.DataFrame(
        {
            'Onehot1': ['A', 'B', 'A'],
            'Onehot2': ['C', 'D', 'ZZ'],
            'Binary1': ['E', 'F', 'F'],
            'Binary2': ['G', 'H', 'ZZ'],
            'Ordinal1': ['I', 'J', 'J'],
            'Ordinal2': ['K', 'L', 'ZZ'],
            'BaseN1': ['M', 'N', 'N'],
            'BaseN2': ['O', 'P', 'ZZ'],
            'Target1': ['Q', 'R', 'R'],
            'Target2': ['S', 'T', 'ZZ'],
            'other': ['other', '123', np.nan],
        },
        index=row_labels,
    )
    wanted = pd.DataFrame(
        {
            'Onehot1': ['A', 'B', 'A'],
            'Onehot2': ['C', 'D', 'missing'],
            'Binary1': ['E', 'F', 'F'],
            'Binary2': ['G', 'H', 'missing'],
            'Ordinal1': ['I', 'J', 'J'],
            'Ordinal2': ['K', 'L', 'missing'],
            'BaseN1': ['M', 'N', 'N'],
            'BaseN2': ['O', 'P', np.nan],
            'Target1': ['Q', 'R', 'R'],
            'Target2': ['S', 'T', 'NaN'],
            'other': ['other', '123', np.nan],
        },
        index=row_labels,
    )
    target = pd.DataFrame(data=[0, 1, 0, 0], columns=['y'])

    # Each encoder is fitted on the training frame as transformed by all the
    # encoders before it.
    enc_onehot = ce.OneHotEncoder(cols=['Onehot1', 'Onehot2']).fit(fit_frame)
    after_onehot = enc_onehot.transform(fit_frame)
    enc_binary = ce.BinaryEncoder(cols=['Binary1', 'Binary2']).fit(after_onehot)
    after_binary = enc_binary.transform(after_onehot)
    enc_ordinal = ce.OrdinalEncoder(cols=['Ordinal1', 'Ordinal2']).fit(after_binary)
    after_ordinal = enc_ordinal.transform(after_binary)
    enc_basen = ce.BaseNEncoder(cols=['BaseN1', 'BaseN2']).fit(after_ordinal)
    after_basen = enc_basen.transform(after_ordinal)
    enc_target = ce.TargetEncoder(cols=['Target1', 'Target2']).fit(after_basen, target)

    onehot_mapping = missing_mapping('Onehot2', 'C', 'D')
    binary_mapping = missing_mapping('Binary2', 'G', 'H')
    ordinal_mapping = missing_mapping('Ordinal2', 'K', 'L')
    grouped_mappings = [binary_mapping, ordinal_mapping]

    # Push the test frame through the encoders in fitting order.
    encoded = raw
    for fitted_encoder in (enc_onehot, enc_binary, enc_ordinal, enc_basen, enc_target):
        encoded = fitted_encoder.transform(encoded)

    restored = inverse_transform(encoded, [
        enc_onehot, enc_binary, enc_ordinal, enc_basen, enc_target,
        onehot_mapping, grouped_mappings,
    ])
    pd.testing.assert_frame_equal(wanted, restored)