def test_encode_features_matches_calculate_feature_matrix():
    """Encoded matrix must equal the one recomputed from the encoded feature defs.

    Runs ``dfs`` on a single categorical column, encodes that column with
    ``encode_features``, then recalculates the matrix from the returned
    feature definitions and compares both the frames and their ``ww`` schemas.
    """
    raw = pd.DataFrame({"category": ["b", "c", "d", "e"]})
    categorical = raw.astype({"category": "category"})

    pd_es = EntitySet("test")
    pd_es.add_dataframe(
        dataframe_name="a",
        dataframe=categorical,
        index="index",
        make_index=True,
    )

    matrix, defs = dfs(entityset=pd_es, target_dataframe_name="a")
    encoded_matrix, encoded_defs = encode_features(
        matrix,
        defs,
        to_encode=["category"],
    )
    recalculated = calculate_feature_matrix(encoded_defs, entityset=pd_es)

    pd.testing.assert_frame_equal(encoded_matrix, recalculated)
    assert recalculated.ww._schema == encoded_matrix.ww._schema
def test_encode_features_drop_first():
    """Check the encoded column count when ``drop_first=True`` is used.

    With five distinct category values and no unknown column, the test expects
    four encoded columns; restricting to ``top_n=3`` it expects two.
    """
    values = ["ao", "b", "c", "d", "e"]
    frame = pd.DataFrame({"category": values}).astype({"category": "category"})

    pd_es = EntitySet("test")
    pd_es.add_dataframe(
        dataframe_name="a",
        dataframe=frame,
        index="index",
        make_index=True,
    )
    matrix, defs = dfs(entityset=pd_es, target_dataframe_name="a")

    # Options shared by both encode_features calls below.
    shared_options = {"drop_first": True, "include_unknown": False}

    encoded_all, _ = encode_features(matrix, defs, **shared_options)
    assert len(encoded_all.columns) == 4

    encoded_top, _ = encode_features(matrix, defs, top_n=3, **shared_options)
    assert len(encoded_top.columns) == 2
def test_encode_features_matches_calculate_feature_matrix():
    """Verify encode_features agrees with calculate_feature_matrix.

    The matrix returned by ``encode_features`` is compared against a matrix
    recalculated from the encoded feature definitions: the frames must be
    equal and the ``ww`` schemas must match.
    """
    categories = ['b', 'c', 'd', 'e']
    source_df = pd.DataFrame({'category': categories})
    source_df = source_df.astype({'category': 'category'})

    pd_es = EntitySet('test')
    pd_es.add_dataframe(
        dataframe_name='a',
        dataframe=source_df,
        index='index',
        make_index=True,
    )

    fm, fdefs = dfs(entityset=pd_es, target_dataframe_name='a')
    fm_encoded, fdefs_encoded = encode_features(fm, fdefs, to_encode=['category'])
    fm_recomputed = calculate_feature_matrix(fdefs_encoded, entityset=pd_es)

    pd.testing.assert_frame_equal(fm_encoded, fm_recomputed)
    assert fm_recomputed.ww._schema == fm_encoded.ww._schema
def test_encode_unknown_features():
    """Encode a categorical column that itself contains the value "unknown".

    With ``include_unknown=True`` the expected output has a
    ``category = unknown`` column for the literal value alongside a separate
    ``category is unknown`` column.
    """
    # One of the category values is the literal string "unknown".
    values = ['unknown', 'b', 'c', 'd', 'e']
    frame = pd.DataFrame({'category': values}).astype({'category': 'category'})

    pd_es = EntitySet('test')
    pd_es.add_dataframe(
        dataframe_name='a',
        dataframe=frame,
        index='index',
        make_index=True,
    )
    matrix, defs = dfs(entityset=pd_es, target_dataframe_name='a')

    encoded, _ = encode_features(matrix, defs, include_unknown=True)

    expected_columns = [
        'category = unknown',
        'category = e',
        'category = d',
        'category = c',
        'category = b',
        'category is unknown',
    ]
    assert list(encoded.columns) == expected_columns
def test_encode_unknown_features():
    """Check column names when a category value is the string "unknown".

    The encoded matrix is expected to list ``category = unknown`` first, the
    remaining values next, and ``category is unknown`` last.
    """
    # The data deliberately includes "unknown" as an ordinary category value.
    raw = pd.DataFrame({"category": ["unknown", "b", "c", "d", "e"]})
    categorical = raw.astype({"category": "category"})

    pd_es = EntitySet("test")
    pd_es.add_dataframe(
        dataframe_name="a",
        dataframe=categorical,
        index="index",
        make_index=True,
    )

    fm, fdefs = dfs(entityset=pd_es, target_dataframe_name="a")
    fm_encoded, _ = encode_features(fm, fdefs, include_unknown=True)

    actual_columns = list(fm_encoded.columns)
    assert actual_columns == [
        "category = unknown",
        "category = e",
        "category = d",
        "category = c",
        "category = b",
        "category is unknown",
    ]
def test_encode_features_drop_first():
    """Count encoded columns produced with ``drop_first=True``.

    Five category values with ``include_unknown=False`` are expected to give
    four columns; adding ``top_n=3`` is expected to give two.
    """
    source_df = pd.DataFrame({'category': ['ao', 'b', 'c', 'd', 'e']})
    source_df = source_df.astype({'category': 'category'})

    pd_es = EntitySet('test')
    pd_es.add_dataframe(
        dataframe_name='a',
        dataframe=source_df,
        index='index',
        make_index=True,
    )
    fm, fdefs = dfs(entityset=pd_es, target_dataframe_name='a')

    # No top_n restriction.
    fm_encoded, _ = encode_features(
        fm,
        fdefs,
        drop_first=True,
        include_unknown=False,
    )
    column_count = len(fm_encoded.columns)
    assert column_count == 4

    # Same options, restricted with top_n=3.
    fm_encoded, _ = encode_features(
        fm,
        fdefs,
        top_n=3,
        drop_first=True,
        include_unknown=False,
    )
    column_count = len(fm_encoded.columns)
    assert column_count == 2