def test_already_index(self):
    """Fit and apply ``StringIndexer`` to a column that is already numeric.

    The ``cat`` column holds the values 0, 1, 2 plus one ``NaN``.  The test
    asserts that the transformed column equals ``[1, 2, 1, 1, 3, 0, 2]``,
    i.e. 0 -> 1, 1 -> 2, 2 -> 3 and the missing entry -> 0.
    """
    frame = pd.DataFrame({
        "cat": [0, 1, 0, 0, 2, np.nan, 1],
        "num": [1, 2, 3, 4, 5, 6, 7],
    })

    indexer = utils.StringIndexer(cols=["cat"])
    indexer.fit(frame)

    actual = indexer.transform(frame)["cat"]
    expected = pd.Series([1, 2, 1, 1, 3, 0, 2])
    # Only the values/dtype/index are compared; the series name is ignored.
    pd.testing.assert_series_equal(actual, expected, check_names=False)
def test_topk(self):
    """Fit and apply ``StringIndexer`` with ``topk=2`` on a string column.

    The ``cat`` column contains "a" three times, "b" twice, and "c" and "d"
    once each.  The test asserts that the transformed column equals
    ``[1, 2, 1, 1, 3, 3, 2]``: "a" -> 1, "b" -> 2, and both "c" and "d"
    share the code 3.
    """
    frame = pd.DataFrame({
        "cat": ["a", "b", "a", "a", "d", "c", "b"],
        "num": [1, 2, 3, 4, 5, 6, 7],
    })

    indexer = utils.StringIndexer(cols=["cat"], topk=2)
    indexer.fit(frame)

    actual = indexer.transform(frame)["cat"]
    expected = pd.Series([1, 2, 1, 1, 3, 3, 2])
    # Only the values/dtype/index are compared; the series name is ignored.
    pd.testing.assert_series_equal(actual, expected, check_names=False)
def test_realdata(self):
    """Fit and apply ``StringIndexer`` to two columns of the Boston dataset.

    The expected codes for ``CHAS`` and ``RAD`` are spelled out as explicit
    lookup tables and applied to copies of the raw columns; the transformed
    columns must match those expectations.
    """
    boston = load_boston()
    frame = pd.DataFrame(boston.data, columns=boston.feature_names)

    indexer = utils.StringIndexer(cols=["CHAS", "RAD"])
    indexer.fit(frame)

    # Raw value -> expected code.  NOTE(review): the ordering looks like
    # descending frequency of the raw values -- confirm against StringIndexer.
    chas_codes = {0.0: 1, 1.0: 2}
    rad_codes = {
        24.0: 1,
        5.0: 2,
        4.0: 3,
        3.0: 4,
        6.0: 5,
        8.0: 6,
        2.0: 7,
        1.0: 8,
        7.0: 9,
    }

    # ``dict.get`` is passed as a plain callable so that a value missing from
    # the table yields ``None``, exactly like falling off an if/elif chain.
    expected_chas = copy.deepcopy(frame["CHAS"]).map(chas_codes.get)
    expected_rad = copy.deepcopy(frame["RAD"]).map(rad_codes.get)

    result = indexer.transform(frame)

    # Series names are ignored; values, dtype and index must agree.
    pd.testing.assert_series_equal(result["CHAS"], expected_chas,
                                   check_names=False)
    pd.testing.assert_series_equal(result["RAD"], expected_rad,
                                   check_names=False)