def test_NumericalEncoder_dummy_output_dtype(): np.random.seed(123) df = get_sample_df(100, seed=123) ind = np.arange(len(df)) df.index = ind df["cat_col_1"] = df["text_col"].apply(lambda s: s[0:3]) df["cat_col_2"] = df["text_col"].apply(lambda s: s[3:6]) encoder = NumericalEncoder(encoding_type="dummy") encoder.fit(df) res = encoder.transform(df) assert (res.dtypes[res.columns.str.startswith("cat_col_")] == "int32" ).all() # check default encoding type = int32
def test_NumericalEncoder_num_output_dtype(): np.random.seed(123) df = get_sample_df(100, seed=123) ind = np.arange(len(df)) df.index = ind np.random.shuffle(ind) df["cat_col_1"] = df["text_col"].apply(lambda s: s[0:3]) df["cat_col_2"] = df["text_col"].apply(lambda s: s[3:6]) encoder = NumericalEncoder(encoding_type="num") encoder.fit(df) res = encoder.transform(df) assert res.dtypes["cat_col_1"] == "int32" assert res.dtypes["cat_col_2"] == "int32"
def test_NumericalEncoder_num(): ###################### ### Numerical Mode ### ###################### np.random.seed(123) df = get_sample_df(100, seed=123) ind = np.arange(len(df)) df.index = ind np.random.shuffle(ind) df["cat_col_1"] = df["text_col"].apply(lambda s: s[0:3]) df["cat_col_2"] = df["text_col"].apply(lambda s: s[3:6]) encoder = NumericalEncoder(encoding_type="num") encoder.fit(df) res = encoder.transform(df) assert res.shape == df.shape assert (res.index == df.index).all() assert encoder.get_feature_names() == encoder.model._feature_names assert encoder.get_feature_names() == list(res.columns) df2 = df.copy() df2.loc[0, "cat_col_1"] = "something-new" df2.loc[1, "cat_col_2"] = None # Something None res2 = encoder.transform(df2) assert res2.loc[0, "cat_col_1"] == -1 assert res2.loc[1, "cat_col_2"] == -1 df_with_none = df.copy() df_with_none["cat_col_3"] = df_with_none["cat_col_1"] df_with_none.loc[list(range(25)), "cat_col_3"] = None encoder2 = NumericalEncoder(encoding_type="num") res2 = encoder2.fit_transform(df_with_none) assert (df_with_none["cat_col_3"].isnull() == ( res2["cat_col_3"] == 0)).all()
def test_NumericalEncoder_columns_to_encode_object(): np.random.seed(123) Xnum = np.random.randn(1000, 10) dfX = pd.DataFrame(Xnum, columns=["col_%d" % i for i in range(10)]) dfX["object_column"] = ["string_%2.4f" % x for x in dfX["col_0"]] # with --object-- encoder = NumericalEncoder(columns_to_encode="--object--") dfX_enc = encoder.fit_transform(dfX) assert not (dfX_enc.dtypes == "object").any() # with default behavior encoder = NumericalEncoder() dfX_enc = encoder.fit_transform(dfX) assert "object_column" in dfX_enc assert (dfX_enc["object_column"] == dfX["object_column"]).all()
def test_param_from_sklearn_model(): # simple RandomForest model = RandomForestClassifier(n_estimators=250) assert RandomForestClassifier().get_params()["n_estimators"] != 250 assert param_from_sklearn_model( model, simplify_default=True) == ('RandomForestClassifier', { 'n_estimators': 250 }) param = param_from_sklearn_model(model, simplify_default=False) assert isinstance(param, tuple) assert len(param) == 2 assert param[0] == "RandomForestClassifier" assert isinstance( sklearn_model_from_param(param_from_sklearn_model(model)), model.__class__) s = json.dumps(param) # check that it can be json serialized assert isinstance(s, str) assert isinstance( sklearn_model_from_param(param_from_sklearn_model(model)), model.__class__) # Composition model : BoxCoxTargetTransformer of RandomForestClassifier model = BoxCoxTargetTransformer(RandomForestClassifier(n_estimators=250), ll=0) param = param_from_sklearn_model(model, simplify_default=True) assert param == ('BoxCoxTargetTransformer', { 'model': ('RandomForestClassifier', { 'n_estimators': 250 }) }) assert isinstance( sklearn_model_from_param(param_from_sklearn_model(model)), model.__class__) s = json.dumps(param) # check that it can be json serialized assert isinstance(s, str) # Composition model : BoxCoxTargetTransformer of RandomForestClassifier model = BoxCoxTargetTransformer(RandomForestClassifier(n_estimators=250), ll=1) param = param_from_sklearn_model(model, simplify_default=True) assert param == ('BoxCoxTargetTransformer', { 'll': 1, 'model': ('RandomForestClassifier', { 'n_estimators': 250 }) }) s = json.dumps(param) # check that it can be json serialized assert isinstance(s, str) assert isinstance( sklearn_model_from_param(param_from_sklearn_model(model)), model.__class__) # Pipeline model = Pipeline([("enc", NumericalEncoder()), ("forest", RandomForestClassifier(n_estimators=250))]) param = param_from_sklearn_model(model, simplify_default=True) assert param == ('Pipeline', { 'steps': [('enc', ('NumericalEncoder', {})), ('forest', ('RandomForestClassifier', { 'n_estimators': 250 }))] }) assert isinstance( sklearn_model_from_param(param_from_sklearn_model(model)), model.__class__) s = json.dumps(param) # check that it can be json serialized assert isinstance(s, str) # GraphPipeline model = GraphPipeline(models={ "enc": NumericalEncoder(), "forest": RandomForestClassifier(n_estimators=250) }, edges=[("enc", "forest")]) param = param_from_sklearn_model(model, simplify_default=True) assert param == ('GraphPipeline', { 'models': { 'enc': ('NumericalEncoder', {}), 'forest': ('RandomForestClassifier', { 'n_estimators': 250 }) }, 'edges': [('enc', 'forest')] }) assert isinstance( sklearn_model_from_param(param_from_sklearn_model(model)), model.__class__) # GraphPipeline with verbose = True model = GraphPipeline(models={ "enc": NumericalEncoder(), "forest": RandomForestClassifier(n_estimators=250) }, edges=[("enc", "forest")], verbose=True) param = param_from_sklearn_model(model, simplify_default=True) assert param == ('GraphPipeline', { 'models': { 'enc': ('NumericalEncoder', {}), 'forest': ('RandomForestClassifier', { 'n_estimators': 250 }) }, 'edges': [('enc', 'forest')], 'verbose': True }) s = json.dumps(param) # check that it can be json serialized assert isinstance(s, str) model2 = sklearn_model_from_param(param_from_sklearn_model(model)) assert model2.verbose is True assert isinstance(model2, model.__class__) # GraphPipeline + composition model = GraphPipeline(models={ "enc": NumericalEncoder(), "forest": BoxCoxTargetTransformer(RandomForestClassifier(n_estimators=250), ll=1) }, edges=[("enc", "forest")]) param = param_from_sklearn_model(model, simplify_default=True) assert param == ('GraphPipeline', { 'edges': [('enc', 'forest')], 'models': { 'enc': ('NumericalEncoder', {}), 'forest': ('BoxCoxTargetTransformer', { 'll': 1, 'model': ('RandomForestClassifier', { 'n_estimators': 250 }) }) } }) assert isinstance( sklearn_model_from_param(param_from_sklearn_model(model)), model.__class__) s = json.dumps(param) # check that it can be json serialized assert isinstance(s, str)
def test_NumericalEncoder_dummy(): #################### ### One Hot Mode ### #################### np.random.seed(123) df = get_sample_df(100, seed=123) ind = np.arange(len(df)) df.index = ind df["cat_col_1"] = df["text_col"].apply(lambda s: s[0:3]) df["cat_col_2"] = df["text_col"].apply(lambda s: s[3:6]) encoder = NumericalEncoder(encoding_type="dummy") encoder.fit(df) res = encoder.transform(df) assert encoder.model._dummy_size == len(encoder.model._dummy_feature_names) assert encoder.model._dummy_size == sum( len(v) for k, v in encoder.model.variable_modality_mapping.items()) assert res.shape[0] == df.shape[0] assert res.shape[1] == len(df["cat_col_1"].value_counts()) + len( df["cat_col_2"].value_counts()) + 3 assert (res.index == df.index).all() col = ["float_col", "int_col", "text_col"] col1 = [ "cat_col_1__%s" % c for c in list(df["cat_col_1"].value_counts().index) ] col2 = [ "cat_col_2__%s" % c for c in list(df["cat_col_2"].value_counts().index) ] assert col1 == encoder.columns_mapping["cat_col_1"] assert col2 == encoder.columns_mapping["cat_col_2"] assert encoder.get_feature_names() == encoder.model._feature_names assert encoder.get_feature_names() == col + col1 + col2 assert (res.loc[:, col1 + col2]).isnull().sum().sum() == 0 assert (res.loc[:, col1 + col2]).max().max() == 1 assert (res.loc[:, col1 + col2]).min().min() == 0 assert ((df["cat_col_1"] == "aaa") == (res["cat_col_1__aaa"] == 1)).all() df2 = df.copy() df2.loc[0, "cat_col_1"] = "something-new" df2.loc[1, "cat_col_2"] = None # Something None res2 = encoder.transform(df2) assert res2.loc[0, col1].sum() == 0 # no dummy activated assert res2.loc[ 0, "cat_col_2__" + df2.loc[0, "cat_col_2"]] == 1 #activated in the right position assert res2.loc[0, col2].sum() == 1 # only one dummy activate assert res2.loc[1, col2].sum() == 0 # no dummy activated assert res2.loc[ 1, "cat_col_1__" + df2.loc[1, "cat_col_1"]] == 1 # activated in the right position assert res2.loc[1, col1].sum() == 1 df_with_none = df.copy() df_with_none["cat_col_3"] = df_with_none["cat_col_1"] df_with_none.loc[0:25, "cat_col_3"] = None encoder2 = NumericalEncoder(encoding_type="dummy") res2 = encoder2.fit_transform(df_with_none) col3b = [c for c in res2.columns if c.startswith("cat_col_3")] assert col3b[0] == "cat_col_3____null__" assert list(res2.columns) == col + col1 + col2 + col3b assert list(res2.columns) == encoder2.get_feature_names() assert (res2.loc[:, col1 + col2 + col3b]).isnull().sum().sum() == 0 assert (res2.loc[:, col1 + col2 + col3b]).max().max() == 1 assert (res2.loc[:, col1 + col2 + col3b]).min().min() == 0 assert (df_with_none["cat_col_3"].isnull() == ( res2["cat_col_3____null__"] == 1)).all() df3 = df.copy() df3["cat_col_many"] = [ "m_%d" % x for x in np.ceil(np.minimum(np.exp(np.random.rand(100) * 5), 50)).astype(np.int32) ] encoder3 = NumericalEncoder(encoding_type="dummy") res3 = encoder3.fit_transform(df3) colm = [c for c in res3.columns if c.startswith("cat_col_many")] vc = df3["cat_col_many"].value_counts() colmb = [ "cat_col_many__" + c for c in list(vc.index[vc >= encoder3.min_nb_observations]) + ["__default__"] ] assert colm == colmb
def test_NumericalEncoder_num_fit_parameters(): np.random.seed(123) df = get_sample_df(100, seed=123) ind = np.arange(len(df)) df.index = ind df["cat_col_1"] = df["text_col"].apply(lambda s: s[0:3]) df["cat_col_2"] = df["text_col"].apply(lambda s: s[4:7]) df["cat_col_3"] = df["text_col"].apply(lambda s: s[8:11]) df.loc[0:10, "cat_col_3"] = None # All modalities are kept, __null__ category is created encoder = NumericalEncoder(encoding_type="num", min_modalities_number=10, max_modalities_number=100, max_na_percentage=0, min_nb_observations=1, max_cum_proba=1) res = encoder.fit_transform(df) assert len(encoder.model.variable_modality_mapping['cat_col_1']) == 7 assert len(encoder.model.variable_modality_mapping['cat_col_3']) == 8 # We filter on max_cum_proba, __null__ category is created encoder = NumericalEncoder(encoding_type="num", min_modalities_number=1, max_modalities_number=100, max_na_percentage=0, min_nb_observations=1, max_cum_proba=0.6) res = encoder.fit_transform(df) map1 = encoder.model.variable_modality_mapping['cat_col_1'] assert len(map1) == 5 assert np.all( [v in map1 for v in ['eee', 'bbb', 'ddd', 'jjj', '__default__']]) map3 = encoder.model.variable_modality_mapping['cat_col_3'] assert len(map3) == 6 assert np.all([ v in map3 for v in ['bbb', 'ddd', 'ccc', 'aaa', 'jjj', '__default__'] ]) # No __null__ category encoder = NumericalEncoder(encoding_type="num", min_modalities_number=1, max_modalities_number=100, max_na_percentage=0.2, min_nb_observations=1, max_cum_proba=1) res = encoder.fit_transform(df) assert len(encoder.model.variable_modality_mapping['cat_col_3']) == 7 # Max modalities encoder = NumericalEncoder(encoding_type="num", min_modalities_number=1, max_modalities_number=3, max_na_percentage=0.2, min_nb_observations=1, max_cum_proba=1) res = encoder.fit_transform(df) assert len(encoder.model.variable_modality_mapping['cat_col_1']) == 4 assert len(encoder.model.variable_modality_mapping['cat_col_2']) == 4 assert len(encoder.model.variable_modality_mapping['cat_col_3']) == 4