def test_encode_features_matches_calculate_feature_matrix():
    """Encoded matrix must equal a recalculation from the encoded definitions.

    Runs ``dfs`` on a one-column categorical dataframe, encodes the
    ``category`` feature, then recomputes the matrix from the encoded feature
    definitions with ``calculate_feature_matrix``. Both the frame contents
    and the schema exposed through the ``ww`` accessor have to agree.
    """
    # NOTE(review): a second function with this exact name is defined later
    # in this file; that later definition replaces this one at import time.
    raw = pd.DataFrame({"category": ["b", "c", "d", "e"]})
    frame = raw.astype({"category": "category"})

    es = EntitySet("test")
    es.add_dataframe(
        dataframe=frame,
        dataframe_name="a",
        make_index=True,
        index="index",
    )
    matrix, definitions = dfs(target_dataframe_name="a", entityset=es)

    encoded = encode_features(matrix, definitions, to_encode=["category"])
    encoded_matrix, encoded_definitions = encoded

    # Recompute from the encoded definitions against the same entityset.
    recalculated = calculate_feature_matrix(encoded_definitions, entityset=es)

    pd.testing.assert_frame_equal(encoded_matrix, recalculated)
    assert recalculated.ww._schema == encoded_matrix.ww._schema
def test_encode_features_drop_first():
    """Check the encoded column count when ``drop_first=True``.

    The dataframe has five distinct categories. With ``drop_first=True`` and
    ``include_unknown=False`` the test expects 4 encoded columns; adding
    ``top_n=3`` is expected to leave 2.
    """
    # NOTE(review): a second function with this exact name is defined later
    # in this file; that later definition replaces this one at import time.
    raw = pd.DataFrame({"category": ["ao", "b", "c", "d", "e"]})
    frame = raw.astype({"category": "category"})

    es = EntitySet("test")
    es.add_dataframe(
        dataframe=frame,
        dataframe_name="a",
        make_index=True,
        index="index",
    )
    matrix, definitions = dfs(target_dataframe_name="a", entityset=es)

    # Options shared by both encode_features calls below.
    shared_options = {"drop_first": True, "include_unknown": False}

    encoded, _ = encode_features(matrix, definitions, **shared_options)
    column_count = len(encoded.columns)
    assert column_count == 4

    # Same un-encoded inputs again, now limited to the top 3 values.
    encoded, _ = encode_features(matrix, definitions, top_n=3, **shared_options)
    column_count = len(encoded.columns)
    assert column_count == 2
def test_encode_features_matches_calculate_feature_matrix():
    """Verify encode_features output against calculate_feature_matrix.

    A categorical column is run through ``dfs`` and encoded; the encoded
    feature definitions are then handed to ``calculate_feature_matrix``.
    The resulting frame and its ``ww`` schema must match what
    ``encode_features`` returned.
    """
    # NOTE(review): an earlier function with this exact name exists in this
    # file; this definition replaces it at import time.
    categories = ["b", "c", "d", "e"]
    source = pd.DataFrame({"category": categories}).astype(
        {"category": "category"}
    )

    entityset = EntitySet("test")
    entityset.add_dataframe(
        dataframe_name="a",
        dataframe=source,
        index="index",
        make_index=True,
    )
    base_matrix, base_defs = dfs(entityset=entityset, target_dataframe_name="a")

    enc_matrix, enc_defs = encode_features(
        base_matrix,
        base_defs,
        to_encode=["category"],
    )

    calc_matrix = calculate_feature_matrix(enc_defs, entityset=entityset)

    # Values first, then the schema attached to each frame.
    pd.testing.assert_frame_equal(enc_matrix, calc_matrix)
    assert calc_matrix.ww._schema == enc_matrix.ww._schema
def test_encode_unknown_features():
    """Encode a categorical column that contains the literal value "unknown".

    With ``include_unknown=True`` the test expects one column per category
    value (including ``category = unknown``) followed by a separate
    ``category is unknown`` column, in the exact order asserted below.
    """
    # NOTE(review): a second function with this exact name is defined later
    # in this file; that later definition replaces this one at import time.
    raw = pd.DataFrame({"category": ["unknown", "b", "c", "d", "e"]})
    frame = raw.astype({"category": "category"})

    es = EntitySet("test")
    es.add_dataframe(
        dataframe=frame,
        dataframe_name="a",
        make_index=True,
        index="index",
    )
    matrix, definitions = dfs(target_dataframe_name="a", entityset=es)

    expected_columns = [
        "category = unknown",
        "category = e",
        "category = d",
        "category = c",
        "category = b",
        "category is unknown",
    ]

    encoded, _ = encode_features(matrix, definitions, include_unknown=True)
    assert list(encoded.columns) == expected_columns
def test_encode_unknown_features():
    """Check column naming when a category value is itself "unknown".

    The dataframe's categorical column includes the string ``"unknown"``.
    Encoding with ``include_unknown=True`` must produce both a
    ``category = unknown`` column for that value and a distinct
    ``category is unknown`` column, in the asserted order.
    """
    # NOTE(review): an earlier function with this exact name exists in this
    # file; this definition replaces it at import time.
    values = ["unknown", "b", "c", "d", "e"]
    source = pd.DataFrame({"category": values}).astype({"category": "category"})

    entityset = EntitySet("test")
    entityset.add_dataframe(
        dataframe_name="a",
        dataframe=source,
        index="index",
        make_index=True,
    )
    base_matrix, base_defs = dfs(entityset=entityset, target_dataframe_name="a")

    enc_matrix, _ = encode_features(base_matrix, base_defs, include_unknown=True)

    actual_columns = list(enc_matrix.columns)
    per_value_columns = [
        "category = unknown",
        "category = e",
        "category = d",
        "category = c",
        "category = b",
    ]
    assert actual_columns == per_value_columns + ["category is unknown"]
def test_encode_features_drop_first():
    """Count encoded columns produced with ``drop_first=True``.

    Five distinct category values are encoded twice with
    ``include_unknown=False``: once without ``top_n`` (4 columns expected)
    and once with ``top_n=3`` (2 columns expected).
    """
    # NOTE(review): an earlier function with this exact name exists in this
    # file; this definition replaces it at import time.
    values = ["ao", "b", "c", "d", "e"]
    source = pd.DataFrame({"category": values}).astype({"category": "category"})

    entityset = EntitySet("test")
    entityset.add_dataframe(
        dataframe_name="a",
        dataframe=source,
        index="index",
        make_index=True,
    )
    base_matrix, base_defs = dfs(entityset=entityset, target_dataframe_name="a")

    without_top_n, _ = encode_features(
        base_matrix,
        base_defs,
        drop_first=True,
        include_unknown=False,
    )
    assert len(without_top_n.columns) == 4

    # Second pass starts again from the un-encoded matrix and definitions.
    with_top_n, _ = encode_features(
        base_matrix,
        base_defs,
        top_n=3,
        drop_first=True,
        include_unknown=False,
    )
    assert len(with_top_n.columns) == 2